Advertisement
Guest User

Untitled

a guest
Dec 12th, 2017
63
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.84 KB | None | 0 0
  1. library(dplyr)
  2. library(ggplot2)
  3. library(car)
  4. library(xts)
  5. library(scales)
  6. library(gridExtra)
  7.  
  ## Load hour.csv into data_h and day.csv into data_d.
  ## The directory is held in a single variable so the script can be pointed
  ## at another copy of the files by editing one line.
  data_dir <- "C:\\Users\\migue\\Desktop\\THEFUTURE\\DM1\\Assignment"

  data_h <- read.csv(file.path(data_dir, "hour.csv"), header = TRUE)
  ## Quick sanity check: show only the first rows instead of dumping the
  ## whole table to the console.
  head(data_h)

  data_d <- read.csv(file.path(data_dir, "day.csv"), header = TRUE)
  11.  
  12.  
  ## Recode the integer-coded categorical columns as labelled factors.
  ## The same recoding is applied to the daily and to the hourly table.
  ##
  ## season:     1..4 in the original dataset -> spring, summer, fall, winter.
  ## weathersit: describes the "quality" of the weather with numbers:
  ##   - 1: Clear, Few clouds, Partly cloudy, Partly cloudy
  ##   - 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
  ##   - 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds,
  ##        Light Rain + Scattered clouds
  ##   - 4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog
  ##   It becomes a four-level factor (labels "1".."4") to make plotting easier.
  ##   TODO: should we change this to a nominal variable?
  ##         (presumably refers to weathersit -- confirm)
  ## holiday:    was 0 and 1; now nominal for clarity.
  ##   NOTE(review): holiday == 0 is labelled "work-day"; confirm that rows
  ##   with holiday == 0 are all working days, otherwise the label misleads.
  ##
  ## Side note kept from the dataset description: temp is the normalized
  ## temperature in Celsius.
  recode_categoricals <- function(df) {
    df$season <- factor(
      df$season,
      levels = c(1, 2, 3, 4),
      labels = c("spring", "summer", "fall", "winter")
    )
    df$weathersit <- factor(
      df$weathersit,
      levels = c(1, 2, 3, 4),
      labels = c("1", "2", "3", "4")
    )
    df$holiday <- factor(
      df$holiday,
      levels = c(0, 1),
      labels = c("work-day", "holiday")
    )
    df
  }

  data_d <- recode_categoricals(data_d)
  data_h <- recode_categoricals(data_h)
  31.  
  ## Converting the dates to a time series (currently disabled).
  ## data_d$dteday <- xts(data_d$dteday, order.by=as.POSIXct(data_d$dteday))
  ## data_h$dteday <- xts(data_h$dteday, order.by=as.POSIXct(data_h$dteday))
  ## TODO: check incomplete cases
  ## TODO: check outliers
  ## TODO: check further variables that need some kind of preprocessing

  ## Column-by-column summaries of the hourly and the daily table.
  summary(object = data_h)
  summary(object = data_d)

  ## Check whether the total number of users per day is normally distributed.
  ## It doesn't seem to follow a Normal distribution because of the extreme
  ## values.
  ## The first argument is left as `data_d$cnt` on purpose: car::qqPlot uses
  ## the argument expression as the default axis label.
  qqPlot(
    data_d$cnt,
    main = "QQ plot of total number of users per day."
  )
  46.  
  ## The daily total does not look normally distributed, so study how it
  ## varies with possible predictors.

  ## Shared histogram of the daily user count (bins of 500 users). Each plot
  ## below only adds its own facetting and title, so bin width, colours and
  ## axis labels stay consistent across the three figures.
  cnt_histogram <- ggplot(data_d, aes(x = cnt)) +
    geom_histogram(binwidth = 500, col = "black", fill = "blue") +
    xlab("Nº users") +
    ylab("Nº occurrences")

  ## Total users by season
  cnt_histogram +
    facet_wrap(~ season) +
    ggtitle("Distribution of total users by season.")

  ## Total users on holidays vs. the remaining days
  cnt_histogram +
    facet_wrap(~ holiday) +
    ggtitle("Distribution of total users by holiday.")

  ## Total users by weather condition (weathersit)
  cnt_histogram +
    facet_wrap(~ weathersit) +
    ggtitle("Distribution of total users by weather conditions.")
  54.  
  ## "Temperature graph": filled 2D density of windspeed against the number
  ## of users, with black density contour lines drawn on top.
  ## Open question: can we add the medians of each variable?
  ## NOTE(review): `..level..` is the older ggplot2 spelling of
  ## after_stat(level); kept so the script still runs on older ggplot2
  ## versions -- confirm which version is targeted before changing it.
  wind_vs_users <- ggplot(data_d, aes(x = windspeed, y = cnt))

  wind_vs_users +
    stat_density_2d(aes(fill = ..level..), geom = "polygon") +
    scale_fill_gradient(low = "blue", high = "red") +
    geom_density_2d(color = "black") +
    ggtitle("Temperature graph between windspeed and number of users.")
  58.  
  59.  
  ## Scatter plots of the daily user count against humidity, temperature and
  ## date. Point shape and colour both encode the weather situation
  ## (weathersit) in every plot.

  ## Humidity vs. users
  ggplot(data_d, aes(x = hum, y = cnt, shape = weathersit, colour = weathersit)) +
    geom_point() +
    ggtitle("Scatter plot between humidity levels and number of users.") +
    xlab("Humidity level") +
    ylab("Nº users")

  ## Temperature vs. users. temp is a normalized value, not degrees Celsius,
  ## so the axis label must not claim a ºC unit.
  ggplot(data_d, aes(x = temp, y = cnt, shape = weathersit, colour = weathersit)) +
    geom_point() +
    ggtitle("Scatter plot between temperature measures and number of users.") +
    xlab("Normalized temperature") +
    ylab("Nº users")

  ## Date vs. users; dteday is converted to Date so a date axis can be used.
  ggplot(
    data_d,
    aes(x = as.Date(dteday), y = cnt, shape = weathersit, colour = weathersit)
  ) +
    geom_point() +
    ggtitle("Scatter plot of the total number of users throughout the days.") +
    xlab("Date (dd-mm-yy)") +
    ylab("Nº users") +
    scale_x_date(labels = date_format("%d-%m-%y"))
  64.  
  65.  
  66.  
  67. ##data_y
  68.  
  ## Daily users over time, coloured by calendar year, with a smoothed trend
  ## (span = 0.2) drawn per year.
  ## The year is derived from the dteday column inside aes() rather than via
  ## data_d$dteday, so the mapping always follows the data given to ggplot().
  ## Only two colours are supplied, so the data must not contain more than
  ## two distinct years.
  ggplot(
    data_d,
    aes(
      x = as.Date(dteday),
      y = cnt,
      colour = format(as.Date(dteday), "%Y")
    )
  ) +
    geom_point(alpha = 0.5) +
    geom_smooth(span = 0.2) +
    scale_color_manual(values = c('#999999', '#E69F00')) +
    labs(x = "Date", y = "Nº users", colour = "Year") +
    ## Legend anchored in the top-left corner of the plotting area.
    theme(legend.position = c(0, 1), legend.justification = c(0, 1))
  72.  
  73. ##TODO study the fact that the total number of users was overall higher in 2012. Temperature, weathersit, windspeed, humidity? Popularity?
  74. ##TODO at what hours are there more users? This can make the prediction of the reliability of the system better.
  75. ##TODO multiple boxplots of temp, weather, windsp, hum, etc...
  76.  
  77. ##Can we split the bins in the histograms in two colors, one for the casual users, other for the registered?
  78. ##Check http://r-statistics.co/Top50-Ggplot2-Visualizations-MasterList-R-Code.html for insight
  79.  
  80. ##IMPORTANT plot hours and dates https://stackoverflow.com/questions/7160565/how-to-create-a-time-scatterplot-with-r
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement