For this project we will be doing the Bike Sharing Demand Kaggle challenge!
You can download the data or just use the supplied csv in the repository. The data has the following features:
# Preview the first six rows of the data set.
head(bike, n = 6)
## datetime season holiday workingday weather temp atemp humidity
## 1 2011-01-01 00:00:00 1 0 0 1 9.84 14.395 81
## 2 2011-01-01 01:00:00 1 0 0 1 9.02 13.635 80
## 3 2011-01-01 02:00:00 1 0 0 1 9.02 13.635 80
## 4 2011-01-01 03:00:00 1 0 0 1 9.84 14.395 75
## 5 2011-01-01 04:00:00 1 0 0 1 9.84 14.395 75
## 6 2011-01-01 05:00:00 1 0 0 2 9.84 12.880 75
## windspeed casual registered count
## 1 0.0000 3 13 16
## 2 0.0000 8 32 40
## 3 0.0000 5 27 32
## 4 0.0000 3 10 13
## 5 0.0000 0 1 1
## 6 6.0032 0 1 1
Create a scatter plot of count vs temp.
library(ggplot2)
# Rental count against temperature; the colour gradient repeats the x variable.
ggplot(bike, aes(x = temp, y = count, color = temp)) +
  geom_point(alpha = 1, size = 2)

# Parse the timestamps with an explicit format and time zone. Without them the
# result depends on the session's local time zone, where daylight-saving gaps
# can turn valid-looking strings into NA or shifted hours.
bike$datetime <- as.POSIXct(
  bike$datetime,
  format = "%Y-%m-%d %H:%M:%S",
  tz = "UTC"
)

# Rental count over time, coloured by temperature.
ggplot(bike, aes(x = datetime, y = count, color = temp)) +
  geom_point(alpha = 1, size = 2)

# Correlation matrix of temperature and rental count.
cor(bike[, c("temp", "count")])
## temp count
## temp 1.0000000 0.3944536
## count 0.3944536 1.0000000
# Distribution of rental counts per season, one box (and colour) per season.
ggplot(
  bike,
  aes(x = factor(season), y = count, color = factor(season))
) +
  geom_boxplot() +
  theme_bw()
We often need to use domain knowledge and experience to engineer new features. Let's go ahead and engineer some new features from the datetime column. Create an "hour" column that takes the hour from the datetime column. We will probably need to apply a function to the entire datetime column and reassign the result.
# Hour of day as a two-character string ("00".."23"). format() is vectorized
# over date-times, so no element-wise sapply() loop is needed.
bike$hour <- format(bike$datetime, "%H")
library(dplyr)
# Build the hourly scatter plot shared by the working-day and non-working-day
# views: rental count per hour, jittered horizontally and coloured by
# temperature.
#
# data: a data frame with `hour`, `count` and `temp` columns.
# Returns the ggplot object (not printed).
plot_hourly_count <- function(data) {
  # NOTE(review): " orange" carries a leading space in the original palette;
  # kept unchanged here -- confirm whether plain "orange" was intended.
  temp_palette <- c("blue", "red", "green", " orange", "yellow")

  ggplot(data, aes(x = hour, y = count, color = temp)) +
    geom_point(position = position_jitter(width = 1, height = 0)) +
    scale_color_gradientn(colors = temp_palette)
}

# Working days.
fig1 <- plot_hourly_count(filter(bike, workingday == 1))
fig1

# Non-working days.
fig1 <- plot_hourly_count(filter(bike, workingday == 0))
fig1
Use lm() to build a model that predicts count based solely on the temp feature, and name it temp.model.
# Simple linear regression of rental count on temperature (see ?lm for help).
temp.model <- lm(formula = count ~ temp, data = bike)

# Coefficients, residual spread and fit statistics.
summary(temp.model)
##
## Call:
## lm(formula = count ~ temp, data = bike)
##
## Residuals:
## Min 1Q Median 3Q Max
## -293.32 -112.36 -33.36 78.98 741.44
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.0462 4.4394 1.362 0.173
## temp 9.1705 0.2048 44.783 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 166.5 on 10884 degrees of freedom
## Multiple R-squared: 0.1556, Adjusted R-squared: 0.1555
## F-statistic: 2006 on 1 and 10884 DF, p-value: < 2.2e-16
# Method 1: evaluate the fitted line by hand, beta0 + beta1 * 25, with the
# coefficients taken straight from the model. Typing in the rounded values that
# summary() prints (6.0462 and 9.1705) gives 235.3087, which drifts from the
# predict() result.
sum(coef(temp.model) * c(1, 25))
## [1] 235.3097
# Method 2: let predict() evaluate the fitted model at temp = 25.
predict(temp.model, newdata = data.frame(temp = 25))
## 1
## 235.3097
# Convert the hour strings to numbers so the model treats hour as a numeric
# predictor. as.numeric() is vectorized; sapply() over a character vector would
# loop per element and attach the strings as names on the result.
bike$hour <- as.numeric(bike$hour)
# Regress count on every remaining column except casual, registered, datetime
# and atemp.
model_2 <- lm(
  formula = count ~ . - casual - registered - datetime - atemp,
  data = bike
)

summary(model_2)
##
## Call:
## lm(formula = count ~ . - casual - registered - datetime - atemp,
## data = bike)
##
## Residuals:
## Min 1Q Median 3Q Max
## -324.61 -96.88 -31.01 55.27 688.83
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 46.91369 8.45147 5.551 2.91e-08 ***
## season 21.70333 1.35409 16.028 < 2e-16 ***
## holiday -10.29914 8.79069 -1.172 0.241
## workingday -0.71781 3.14463 -0.228 0.819
## weather -3.20909 2.49731 -1.285 0.199
## temp 7.01953 0.19135 36.684 < 2e-16 ***
## humidity -2.21174 0.09083 -24.349 < 2e-16 ***
## windspeed 0.20271 0.18639 1.088 0.277
## hour 7.61283 0.21688 35.102 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 147.8 on 10877 degrees of freedom
## Multiple R-squared: 0.3344, Adjusted R-squared: 0.3339
## F-statistic: 683 on 8 and 10877 DF, p-value: < 2.2e-16