Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- library(dplyr)
- library(ggplot2)
- library(car)
- library(xts)
- library(scales)
- library(gridExtra)
# Load the hourly (hour.csv) and daily (day.csv) datasets.
# The directory is defined once so that only this line has to be edited when
# the script is run from another machine.
data_dir <- "C:\\Users\\migue\\Desktop\\THEFUTURE\\DM1\\Assignment"
data_h <- read.csv(file.path(data_dir, "hour.csv"), header = TRUE)
# Preview only the first rows; printing the full data frame floods the console.
head(data_h)
data_d <- read.csv(file.path(data_dir, "day.csv"), header = TRUE)
# Recode the numerically coded categorical columns as labelled factors.
# The daily and hourly tables receive exactly the same recoding, so it lives in
# a single helper to keep the two data frames in sync.
#
# season: represented by the numbers 1-4 in the original dataset; relabelled
#   "spring", "summer", "fall", "winter".
# weathersit: describes the "quality" of the weather with numbers:
#   1: Clear, Few clouds, Partly cloudy
#   2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
#   3: Light Snow, Light Rain + Thunderstorm + Scattered clouds,
#      Light Rain + Scattered clouds
#   4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog
#   Converted to a factor of four levels (labelled "1".."4") to make the
#   plotting of the data easier.
#   TODO: should we change this to a nominal variable?
# holiday: was 0 and 1; now nominal ("work-day" / "holiday") for clarity.
#   NOTE(review): every holiday == 0 row is labelled "work-day"; confirm that
#   this label is also intended for non-holiday weekend days.
# temp (not recoded here): normalized temperature in Celsius.
#
# df:      a data frame with numeric `season`, `weathersit` and `holiday` columns.
# returns: `df` with those three columns replaced by factors.
recode_categoricals <- function(df) {
  df$season <- factor(
    df$season,
    levels = 1:4,
    labels = c("spring", "summer", "fall", "winter")
  )
  df$weathersit <- factor(
    df$weathersit,
    levels = 1:4,
    labels = as.character(1:4)
  )
  df$holiday <- factor(
    df$holiday,
    levels = c(0, 1),
    labels = c("work-day", "holiday")
  )
  df
}

data_d <- recode_categoricals(data_d)
data_h <- recode_categoricals(data_h)
# Conversion of the dates to a time series (currently disabled):
# data_d$dteday <- xts(data_d$dteday, order.by = as.POSIXct(data_d$dteday))
# data_h$dteday <- xts(data_h$dteday, order.by = as.POSIXct(data_h$dteday))

# TODO: check incomplete cases
# TODO: check outliers
# TODO: check further variables that need some kind of preprocessing

# Per-column summaries of the hourly and the daily table.
summary(object = data_h)
summary(object = data_d)

# Check whether the total number of users per day is normally distributed.
# It does not seem to follow a normal distribution because of the extreme
# values.
qqPlot(
  x = data_d$cnt,
  main = "QQ plot of total number of users per day."
)
# The total number of users does not look random, so the plots that follow
# study possible predictors.
# Histograms of the daily total number of users (cnt), split by season,
# holiday flag and weather situation.
# The shared base plot is built once so that bin width, colours and axis
# labels stay identical across the three facetted views.
cnt_hist <- ggplot(data_d, aes(x = cnt)) +
  geom_histogram(binwidth = 500, col = "black", fill = "blue") +
  xlab("Nº users") +
  ylab("Nº occurrences")

# Total users by season
cnt_hist +
  facet_wrap(~ season) +
  ggtitle("Distribution of total users by season.")

# Total users on holidays vs. the remaining days
cnt_hist +
  facet_wrap(~ holiday) +
  ggtitle("Distribution of total users on holidays and work-days.")

# Total users by weather situation
cnt_hist +
  facet_wrap(~ weathersit) +
  ggtitle("Distribution of total users by weather conditions.")
# 2D density "heat map" of windspeed against the daily number of users.
# Filled polygons are coloured by the estimated density level (blue = low,
# red = high) and black contour lines are drawn on top.
# after_stat(level) replaces the deprecated ..level.. notation.
# Open question: can we add the medians of each variable?
ggplot(data_d, aes(x = windspeed, y = cnt)) +
  stat_density_2d(aes(fill = after_stat(level)), geom = "polygon") +
  scale_fill_gradient(low = "blue", high = "red") +
  geom_density_2d(color = "black") +
  ggtitle("Density heat map of windspeed and number of users.")
# Scatter plots of the daily number of users against humidity, temperature and
# date. In all three, point shape and colour encode the weather situation.

# Humidity vs. users
ggplot(
  data_d,
  aes(x = hum, y = cnt, shape = weathersit, colour = weathersit)
) +
  geom_point() +
  ggtitle("Scatter plot between humidity levels and number of users.") +
  xlab("Humidity level") +
  ylab("Nº users")

# Temperature vs. users. The temp column is normalized, so the axis is not
# labelled in degrees Celsius.
ggplot(
  data_d,
  aes(x = temp, y = cnt, shape = weathersit, colour = weathersit)
) +
  geom_point() +
  ggtitle("Scatter plot between temperature measures and number of users.") +
  xlab("Normalized temperature") +
  ylab("Nº users")

# Users over time; dteday is converted to Date so that a date scale applies.
ggplot(
  data_d,
  aes(x = as.Date(dteday), y = cnt, shape = weathersit, colour = weathersit)
) +
  geom_point() +
  ggtitle("Scatter plot of the total number of users throughout the days.") +
  xlab("Date (dd-mm-yy)") +
  ylab("Nº users") +
  scale_x_date(labels = date_format("%d-%m-%y"))
# data_y: daily number of users over time, coloured by calendar year, with a
# smoothed trend line per year.
# The year is computed from the plot's own dteday column; referring to
# data_d$dteday inside aes() would bypass the data passed to ggplot() and
# produces an unreadable legend title.
# Exactly two colours are supplied, so this assumes the data spans two
# calendar years (TODO confirm).
ggplot(
  data_d,
  aes(
    x = as.Date(dteday),
    y = cnt,
    colour = format(as.Date(dteday), "%Y")
  )
) +
  geom_point(alpha = 0.5) +
  geom_smooth(span = 0.2) +
  scale_color_manual(name = "Year", values = c("#999999", "#E69F00")) +
  # Anchor the legend inside the panel, in the top-left corner.
  theme(legend.position = c(0, 1), legend.justification = c(0, 1))
- ##TODO study the fact that the total number of users was overall higher in 2012. Temperature, weathersit, windspeed, humidity? Popularity?
- ##TODO at what hours are there more users? Knowing this could improve the prediction of the reliability of the system.
- ##TODO multiple boxplots of temp, weather, windsp, hum, etc...
- ##Can we split the bins in the histograms in two colors, one for the casual users, other for the registered?
- ##Check http://r-statistics.co/Top50-Ggplot2-Visualizations-MasterList-R-Code.html for insight
- ##IMPORTANT plot hours and dates https://stackoverflow.com/questions/7160565/how-to-create-a-time-scatterplot-with-r
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement