library(tidyr)
library(dplyr)
library(caTools)
library(ggplot2)
# Read the data set from a CSV file chosen interactively, then preview the
# first rows.
bike <- read.csv(file = file.choose())
head(bike)

# Scatter plot of count against temp; points are semi-transparent and shaded
# by temp.
ggplot(data = bike, mapping = aes(x = temp, y = count)) +
  geom_point(mapping = aes(colour = temp), alpha = 0.2) +
  theme_bw()
# Parse the timestamp column into POSIXct. The time zone is pinned to UTC so
# the result does not depend on the machine running the script: with the
# default (local) zone, wall-clock times that fall inside a daylight-saving
# gap can be shifted or come back as NA.
bike$datetime <- as.POSIXct(bike$datetime, tz = "UTC")

# Count over time, coloured by temp on a two-colour gradient.
ggplot(bike, aes(datetime, count)) +
  geom_point(aes(color = temp), alpha = 0.5) +
  scale_color_continuous(low = "#55D8CE", high = "#FF6E2E") +
  theme_bw()

# Correlation matrix of temp and count.
cor(bike[, c("temp", "count")])
# Let's explore the season data. Create a boxplot with count on the y axis and the x axis being one box for each season.
# Distribution of count within each season: one box per season, each box
# outlined in its own colour.
ggplot(bike, aes(factor(season), count)) +
  geom_boxplot(aes(color = factor(season))) +
  theme_bw()

# Hour of day extracted from the timestamp as a two-digit string.
# format() is vectorised over date-times, so the whole column is converted in
# one call instead of element by element through sapply().
bike$hour <- format(bike$datetime, "%H")
# Create a scatterplot of count versus hour, with the color scale based on temp. Only use bike data where workingday == 1.
# Count by hour for rows with workingday == 1. Points are jittered
# horizontally only and coloured by temp on a multi-stop gradient. `pl` keeps
# the plot without the theme; the themed version is what gets printed.
pl <- bike %>%
  filter(workingday == 1) %>%
  ggplot(mapping = aes(x = hour, y = count)) +
  geom_point(
    mapping = aes(colour = temp),
    position = position_jitter(width = 1, height = 0),
    alpha = 0.5
  ) +
  scale_color_gradientn(
    colours = c(
      "dark blue", "blue", "light blue", "light green",
      "yellow", "orange", "red"
    )
  )
pl + theme_bw()

# Same plot for rows with workingday == 0, drawn with more opaque points.
pl <- bike %>%
  filter(workingday == 0) %>%
  ggplot(mapping = aes(x = hour, y = count)) +
  geom_point(
    mapping = aes(colour = temp),
    position = position_jitter(width = 1, height = 0),
    alpha = 0.8
  ) +
  scale_color_gradientn(
    colours = c(
      "dark blue", "blue", "light blue", "light green",
      "yellow", "orange", "red"
    )
  )
pl + theme_bw()
# Simple linear regression of count on temp.
temp.model <- lm(count ~ temp, data = bike)
summary(temp.model)

# Expected count at temp = 25, worked out by hand as intercept + slope * 25.
# The coefficients are read from the fitted model rather than copied as
# rounded numbers out of the summary (6.0462 and 9.17), so this line stays
# correct if the data or the model change.
temp.coefs <- coef(temp.model)
unname(temp.coefs["(Intercept)"] + temp.coefs["temp"] * 25)

# The same prediction obtained through predict(), for comparison.
temp.test <- data.frame(temp = c(25))
predict(temp.model, temp.test)
# Convert the hour strings to numbers. as.numeric() is already vectorised;
# routing it through sapply() over a character vector additionally attaches
# every input string as an element name, which is not wanted on a data-frame
# column.
bike$hour <- as.numeric(bike$hour)

# Regress count on every column of bike except casual, registered, datetime
# and atemp.
model <- lm(count ~ . - casual - registered - datetime - atemp, data = bike)
summary(model)
# A linear model like the one we chose, which is fit by OLS, won't be able to
# take the seasonality of our data into account. It will also get thrown off
# by the growth in our dataset, accidentally attributing it to the winter
# season instead of realizing it's just overall demand growing! Later on,
# we'll see if other models may be a better fit for this sort of data.
# Fix the RNG seed so the random split, and every number derived from it, is
# reproducible from one run to the next.
set.seed(101)

# Logical mask from caTools::sample.split() with SplitRatio = 0.6. Rows
# flagged TRUE form the training set (`previous`); the rest are held out
# (`future`).
# NOTE(review): the variable name `sample` shadows base::sample(); it is kept
# because code outside this section may refer to it.
sample <- sample.split(bike$count, SplitRatio = 0.6)
previous <- subset(bike, sample == TRUE)
future <- subset(bike, sample == FALSE)

# Fit on the training rows only, so that the hold-out rows in `future` are
# genuinely unseen when they are predicted below. The formula drops the same
# columns as the `model` fit on the full data.
# NOTE(review): casual and registered look like components of count, which
# would leak the target into the predictors if left in -- confirm.
model1 <- lm(
  count ~ . - casual - registered - datetime - atemp,
  data = previous
)
summary(model1)

# Standard lm diagnostic plots.
plot(model1)

# Predict the hold-out rows and put predictions next to the observed counts.
count.predict <- predict(model1, newdata = future)
results <- data.frame(predicted = count.predict, actual = future$count)
results