Advertisement
Guest User

Untitled

a guest
Apr 26th, 2017
68
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 14.22 KB | None | 0 0
  1. # Read libraries
  2. library(readr)
  3. library(dplyr)
  4. library(ggplot2)
  5. library(RODBC)
  6. library(qualityTools)
  7. library(moments)
  8. library(exptest)
  9. library(MASS)
  10. library(rms)
  11. library(stats)
  12. library(foreign)
  13. library(Hmisc)
  14. library(reshape2)
  15.  
  16.  
  17. # Load data from database
  18. query <- "SELECT Mn.PartitionKey, Mn.RoomKey, Mn.Value, Mp.CloseDateTime, Mn.MeasureDateTime,
  19. CASE WHEN DATEPART(hour, Mn.MeasureDateTime) < 12 THEN 'ochtend' ELSE (CASE WHEN DATEPART(hour, Mn.MeasureDateTime) < 18 THEN 'middag' ELSE 'avond' END) end AS DayPart,
  20. DATEDIFF(second,Mn.MeasureDateTime,Mn.CloseDateTime) AS CleaningDuration,
  21. SUM(ZM.Count) AS SumCount, MAX(ZM.Count)/10 AS MaxValue, COUNT(ZM.Count) AS MeasuresCount,
  22. (SUM(ZM.Count)/10)/COUNT(ZM.COUNT) AS MeanWithZeros,
  23. CASE WHEN SUM(ZM.Count)=0 THEN 0 ELSE (SUM(ZM.Count)/10)/COUNT(CASE WHEN ZM.Count>1 THEN 1 ELSE NULL END) end AS MeanWithoutZeros,
  24. R.Name, R.Type, R.FlooringType, R.Surface, R.Capacity,
  25. ROUND((CASE WHEN SUM(ZM.Count)=0 THEN 0 ELSE (SUM(ZM.Count)/10)/COUNT(CASE WHEN ZM.Count>1 THEN 1 ELSE NULL END) end)*1.0/R.Capacity, 2) AS MeanOccupancy,
  26. ROUND((MAX(ZM.Count)*1.0/10)/R.Capacity, 2) AS MaxOccupancy
  27. FROM dbo.Measure Mn OUTER APPLY
  28. (SELECT TOP 1 CloseDateTime FROM Measure Mi WHERE Mi.RoomId = Mn.RoomId AND Mi.CloseDateTime < Mn.MeasureDateTime ORDER BY CloseDateTime DESC) Mp
  29. JOIN Room R ON Mn.RoomId = R.Id
  30. JOIN ZoneMeasure ZM ON R.ZoneId = ZM.ZoneId AND ZM.MeasureDateTime BETWEEN Mp.CloseDateTime AND Mn.MeasureDateTime
  31. WHERE (Mn.RoomKey LIKE '%004' OR Mn.RoomKey LIKE '%009' OR Mn.RoomKey LIKE '%106' OR Mn.RoomKey LIKE '%107' OR Mn.RoomKey LIKE '%211' OR Mn.RoomKey LIKE '%213' OR Mn.RoomKey LIKE '%214' OR Mn.RoomKey LIKE '%217' OR Mn.RoomKey LIKE '%218' OR Mn.RoomKey LIKE '%221' OR Mn.RoomKey LIKE '%226' OR Mn.RoomKey LIKE '%229' OR Mn.RoomKey LIKE '%309' OR Mn.RoomKey LIKE '%313' OR Mn.RoomKey LIKE '%314' OR Mn.RoomKey LIKE '%317' OR Mn.RoomKey LIKE '%318' OR Mn.RoomKey LIKE '%321' OR Mn.RoomKey LIKE '%326' OR Mn.RoomKey LIKE '%329' OR Mn.RoomKey LIKE '%222' OR Mn.RoomKey LIKE '%404' OR Mn.RoomKey LIKE '%406' OR Mn.RoomKey LIKE '%408' OR Mn.RoomKey LIKE '%409' OR Mn.RoomKey LIKE '%411' OR Mn.RoomKey LIKE '%413' OR Mn.RoomKey LIKE '%417' OR Mn.RoomKey LIKE '%421' OR Mn.RoomKey LIKE '%425' OR Mn.RoomKey LIKE '%430' OR Mn.RoomKey LIKE '%432' OR Mn.RoomKey LIKE '%435' OR Mn.RoomKey LIKE '%437' OR Mn.RoomKey LIKE '%503' OR Mn.RoomKey LIKE '%504' OR Mn.RoomKey LIKE '%508' OR Mn.RoomKey LIKE '%512' OR Mn.RoomKey LIKE '%516' OR Mn.RoomKey LIKE '%521' OR Mn.RoomKey LIKE '%525' OR Mn.RoomKey LIKE '%529' OR Mn.RoomKey LIKE '%531' OR Mn.RoomKey LIKE '%535' OR Mn.RoomKey LIKE '%551' OR Mn.RoomKey LIKE '%556' OR Mn.RoomKey LIKE '%561' OR Mn.RoomKey LIKE '%563' OR Mn.RoomKey LIKE '%566' OR Mn.RoomKey LIKE '%569' OR Mn.RoomKey LIKE '%602' OR Mn.RoomKey LIKE '%606' OR Mn.RoomKey LIKE '%610' OR Mn.RoomKey LIKE '%612' OR Mn.RoomKey LIKE '%616' OR Mn.RoomKey LIKE '%621' OR Mn.RoomKey LIKE '%625' OR Mn.RoomKey LIKE '%629' OR Mn.RoomKey LIKE '%631' OR Mn.RoomKey LIKE '%635' OR Mn.RoomKey LIKE '%651' OR Mn.RoomKey LIKE '%|662' OR Mn.RoomKey LIKE '%666' OR Mn.RoomKey LIKE '%703' OR Mn.RoomKey LIKE '%707' OR Mn.RoomKey LIKE '%713' OR Mn.RoomKey LIKE '%717' OR Mn.RoomKey LIKE '%720' OR Mn.RoomKey LIKE '%725' OR Mn.RoomKey LIKE '%732' OR Mn.RoomKey LIKE '%735' OR Mn.RoomKey LIKE '%751' OR Mn.RoomKey LIKE '%759' OR Mn.RoomKey LIKE '%763' OR Mn.RoomKey LIKE '%766' OR Mn.RoomKey LIKE '%767' OR Mn.RoomKey LIKE '%771' OR Mn.RoomKey LIKE '%773' OR Mn.RoomKey LIKE '%803' OR Mn.RoomKey LIKE '%806' OR Mn.RoomKey LIKE '%812' OR Mn.RoomKey LIKE '%836' OR Mn.RoomKey LIKE '%832' OR Mn.RoomKey LIKE '%826' OR Mn.RoomKey LIKE '%852' OR Mn.RoomKey LIKE '%858' OR Mn.RoomKey LIKE '%865')
  32. AND Mn.MeasureDateTime > '2017-03-28 12:53:45.000'
  33. --AND DATEDIFF(second,Mn.MeasureDateTime,Mn.CloseDateTime) > 10
  34. AND Mn.Value > 0
  35. GROUP BY Mn.PartitionKey, Mn.RoomKey, Mn.Value, Mp.CloseDateTime, Mn.MeasureDateTime, R.Name, R.Type, R.FlooringType, R.Surface, R.Capacity, Mn.CloseDateTime"
  36.  
  37. dbhandle <- odbcDriverConnect('driver={SQL Server};server=rooftopsqlserveras.database.windows.net;database=Rooftop;uid=rooftopreader@rooftopsqlserveras;pwd=!one#0oftp;')
  38. res <- sqlQuery(dbhandle, query)
  39. odbcClose(dbhandle)
  40.  
  41.  
  42.  
  43. #Statistical checks
  44. #Checks on normality
  45. qqnorm(res$SumCount, main = "SumCount Normal Q-Q Plot")
  46. qqline(res$SumCount)
  47. shapiro.test(res$SumCount)
  48.  
  49. qqnorm(res$MeanWithoutZeros, main = "MeanWithoutZeros Normal Q-Q Plot")
  50. qqline(res$MeanWithoutZeros)
  51. shapiro.test(res$MeanWithoutZeros)
  52.  
  53. #Alternative Check on normality
  54. qqPlot(res$SumCount, "normal")
  55. qqPlot(res$MeanWithoutZeros, "normal")
  56.  
  57.  
  58. qqPlot(res$Surface, "normal")
  59. qqPlot(res$MaxValue, "normal")
  60. qqPlot(res$Capacity, "normal")
  61. qqPlot(res$MeanOccupancy, "normal") #Lijkt normaal verdeeld
  62. qqPlot(res$MaxOccupancy, "normal") #Lijkt normaal verdeeld
  63.  
  64.  
  65. #Checks on Exponential Distribution
  66. qqPlot(res$SumCount, "exponential", DB = TRUE)
  67. qqPlot(res$MeanWithoutZeros, "exponential", DB = TRUE)
  68.  
  69. #Checks on Poisson Distribution
  70.  
  71.  
  72. #Gini test on Exponential distribution
  73. gini.exp.test(res$SumCount)
  74. gini.exp.test(res$MeanWithoutZeros)
  75. zSumCount <- (gini.exp.test(res$SumCount)$statistic-mean(res$SumCount))/sqrt(var(res$SumCount))
  76. zMeanWithoutZeros <- (gini.exp.test(res$MeanWithoutZeros)$statistic-mean(res$MeanWithoutZeros))/sqrt(var(res$MeanWithoutZeros))
  77. print(zSumCount)
  78. print(zMeanWithoutZeros)
  79. #Significantieniveau 5%: als -1.96 < Z < 1.96 is niet bewezen dat verdeling van metingen afwijkt van exponentiele verdeling
  80. #Significantieniveau 1%: als -2.575 < Z < 2.575 is niet bewezen dat verdeling van metingen afwijkt van exponentiele verdeling
  81.  
  82.  
  83. #Mean, Variance, Skewness and Kurtosis
  84. mean(res$SumCount)
  85. var(res$SumCount)
  86. skewness(res$SumCount)
  87. kurtosis(res$SumCount)
  88. mean(res$MeanWithoutZeros)
  89. var(res$MeanWithoutZeros)
  90. skewness(res$MeanWithoutZeros)
  91. kurtosis(res$MeanWithoutZeros)
  92.  
  93. #Boxplot to determine outliers
  94. boxplot(res$SumCount, main = "SumCountData Boxplot")
  95. boxplot(res$MeanWithoutZeros, main = "MeanWithoutZeros Boxplot")
  96.  
  97. #Scatterplots
  98. ggplot(data = res, aes(y = res$MeanWithoutZeros, x = res$Value)) +geom_point()
  99. ggplot(data = res, aes(y = res$MeanWithoutZeros, x = cut(res$Value, seq(from = -1.5, to = 5.5, by = 1)))) +geom_boxplot()
  100.  
  101.  
  102. #Correlation Coefficients
  103. #cor(res$Value, res$MeanWithoutZeros, use = "everything", method = c("pearson"))
  104. #cor(res$Value, res$MeanWithoutZeros, use = "everything", method = c("kendall"))
  105. cor(res$Value, res$SumCount, use = "everything", method = c("spearman"))
  106. cor.test(res$Value, res$SumCount, method ="spearman")
  107. cor(res$Value, res$MeanWithoutZeros, use = "everything", method = c("spearman"))
  108. cor.test(res$Value, res$MeanWithoutZeros, method ="spearman")
  109. cor(res$Value, res$Surface, use = "everything", method = c("spearman"))
  110. cor.test(res$Value, res$Surface, method ="spearman")
  111. cor(res$Value, res$MaxValue, use = "everything", method = c("spearman"))
  112. cor.test(res$Value, res$MaxValue, method ="spearman")
  113. cor(res$Value, res$Capacity, use = "everything", method = c("spearman"))
  114. cor.test(res$Value, res$Capacity, method ="spearman")
  115. cor(res$Value, res$MeanOccupancy, use = "everything", method = c("spearman"))
  116. cor.test(res$Value, res$MeanOccupancy, method ="spearman")
  117. cor(res$Value, res$MaxOccupancy, use = "everything", method = c("spearman"))
  118. cor.test(res$Value, res$MaxOccupancy, method ="spearman")
  119.  
  120.  
  121. #Barplots
  122. Eerste <- res
  123. Eerste$Value <- as.factor(Eerste$Value)
  124.  
  125. # Plot barplot value vs Gem sumcount
  126. nieuweData <- Eerste %>% group_by(Value) %>% summarise(gembezoekers = mean(MeanWithoutZeros, na.rm=TRUE))
  127. ggplot(nieuweData, aes(x=Value, y=gembezoekers)) + geom_bar(stat = "identity")
  128.  
  129. # Plot barplot value vs Gem aantal bezoekers
  130. nieuweData <- Eerste %>% group_by(Value) %>% summarise(gemsumcount = mean(SumCount, na.rm=TRUE))
  131. ggplot(nieuweData, aes(x=Value, y=gemsumcount)) + geom_bar(stat = "identity")
  132.  
  133.  
  134. #Remove outliers for experimental use
  135. remove_outliers <- function(x, na.rm = TRUE, ...) {
  136. qnt <- quantile(x, probs=c(.25, .75), na.rm = na.rm, ...)
  137. H <- 1.5 * IQR(x, na.rm = na.rm)
  138. y <- x
  139. y[x < (qnt[1] - H)] <- NA
  140. y[x > (qnt[2] + H)] <- NA
  141. y
  142. }
  143. #SumCount with removed outliers
  144. SumCountMinOutliers <- data.frame(remove_outliers(res$SumCount), res$Value, na.rm=TRUE)
  145. boxplot(SumCountMinOutliers)
  146. SumCountMinOutliers <- SumCountMinOutliers[!is.na(SumCountMinOutliers$remove_outliers.res.SumCount.),]
  147.  
  148. minoutlierdat <- SumCountMinOutliers %>% group_by(res.Value) %>% summarise(gemsumcount = mean(remove_outliers.res.SumCount., na.rm=TRUE))
  149. ggplot(minoutlierdat, aes(x=res.Value, y=gemsumcount)) + geom_bar(stat = "identity")
  150.  
  151. cor(SumCountMinOutliers$res.Value, SumCountMinOutliers$remove_outliers.res.SumCount., use = "everything", method = c("spearman"))
  152.  
  153. #MeanWithoutZeros with removed outliers
  154. MWZMinOutliers <- data.frame(remove_outliers(res$MeanWithoutZeros), res$Value, na.rm=TRUE)
  155. boxplot(MWZMinOutliers)
  156. MWZMinOutliers <- MWZMinOutliers[!is.na(MWZMinOutliers$remove_outliers.res.MeanWithoutZeros.),]
  157.  
  158. minoutlierdat <- MWZMinOutliers %>% group_by(res.Value) %>% summarise(gemsumcount = mean(remove_outliers.res.MeanWithoutZeros., na.rm=TRUE))
  159. ggplot(minoutlierdat, aes(x=res.Value, y=gemsumcount)) + geom_bar(stat = "identity")
  160.  
  161. cor(MWZMinOutliers$res.Value, MWZMinOutliers$remove_outliers.res.MeanWithoutZeros., use = "everything", method = c("spearman"))
  162.  
  163.  
  164. ###############################################################################################################################################
  165.  
  166. dd <- datadist(res)
  167. options(datadist="dd")
  168.  
  169. #polr method for ordinal logistic regression
  170. model1 <- polr(as.factor(Value) ~ MeanWithoutZeros + MaxValue + Surface, data = res, Hess=TRUE)
  171. model2 <- polr(as.ordered(Value) ~ MeanWithoutZeros + MaxValue + Surface + Capacity + MeanOccupancy + MaxOccupancy, data = res, Hess=TRUE)
  172.  
  173.  
  174. #summary, tables and figures
  175. summary(model2)
  176. ctable <- coef(summary(model2))
  177. p <- pnorm(abs(ctable[, "t value"]), lower.tail = FALSE) * 2
  178. (ctable <- cbind(ctable, "p value" = p))
  179. (ci <- confint(model2)) #At level = 0,90 (alpha 90%) significance changes.
  180. confint.default(model2)
  181. ## odds ratios
  182. exp(coef(model2))
  183. ## OR and CI
  184. exp(cbind(OR = coef(model2), ci))
  185.  
  186.  
  187. #Predict probabilities on a testSet
  188. trainSet <- res[res$MeasureDateTime < '2017-04-12 12:00:00',]
  189. testSet <- res[res$MeasureDateTime > '2017-04-12 12:00:00',]
  190. #Create a polr model using a training set as data, do not use complete res data.
  191. testModel <- polr(as.factor(Value) ~ MeanWithoutZeros+Surface+MeanOccupancy+MaxValue+MaxOccupancy+SumCount+MeasuresCount, data = trainSet, Hess=TRUE)
  192. #Create a frame with the usefull variables and bind the model predictions.
  193. testFrame <- data.frame(SumCount <- testSet$SumCount, MeanWithoutZeros <- testSet$MeanWithoutZeros, MaxValue <- testSet$MaxValue, Surface <- testSet$Surface,
  194. Capacity <- testSet$Capacity, MeanOccupancy <- testSet$MeanOccupancy, MaxOccupancy <- testSet$MaxOccupancy,
  195. FlooringType <- testSet$FlooringType)
  196. testFrame <- cbind(testFrame, predict(testModel, testFrame, type = "probs"))
  197. #Create a new frame which only contains the predicted value and the true value from the testSet.
  198. probs <- c("1","2","3","4","5")
  199. probabilitiesFrame <- testFrame[probs]
  200. highestProbability <- as.numeric(colnames(probabilitiesFrame)[apply(probabilitiesFrame,1,which.max)])
  201. highestProbability <- as.data.frame(cbind(highestProbability, testSet$Value))
  202. #Check how many times the model got the prediction right.
  203. outcome <- highestProbability[highestProbability$highestProbability == highestProbability$V2,]
  204. accuracy <- nrow(outcome)/nrow(highestProbability)
  205. accuracy
  206.  
  207.  
  208.  
  209. #lrm method for ordinal logistic regression
  210. lrm1 <- lrm(as.factor(Value) ~ MeanWithoutZeros + MaxValue + Surface + Capacity + MeanOccupancy + MaxOccupancy, data = res, x=TRUE, y=TRUE)
  211. summary(lrm1)
  212. print(lrm1)
  213. lrm.valid <- validate(lrm1, method="boot", B=100) #Geeft errors, vaker proberen te runnen tot het lukt. Wsl veroorzaakt door weinig Value=1
  214. lrm.valid
  215.  
  216. lrm.calib <- calibrate(lrm1, method="boot", B=100) #Geeft errors, vaker proberen te runnen tot het lukt. Wsl veroorzaakt door weinig Value=1
  217. par(bg="white", las=1)
  218. plot(lrm.calib, las=1)
  219.  
  220.  
  221. ###############################################################################################################################################
  222. #Prediction on a 3 scale basis
  223. #Frist instantiate original trainSet and testSet
  224. trainSet3scale <- trainSet
  225. testSet3scale <- testSet
  226. trainSet3scale$Value[trainSet3scale$Value == 2] <- 1
  227. trainSet3scale$Value[trainSet3scale$Value == 3] <- 2
  228. trainSet3scale$Value[trainSet3scale$Value == 4] <- 3
  229. trainSet3scale$Value[trainSet3scale$Value == 5] <- 3
  230. testSet3scale$Value[testSet3scale$Value == 2] <- 1
  231. testSet3scale$Value[testSet3scale$Value == 3] <- 2
  232. testSet3scale$Value[testSet3scale$Value == 4] <- 3
  233. testSet3scale$Value[testSet3scale$Value == 5] <- 3
  234.  
  235. #Create a polr model using a training set as data, do not use complete res data.
  236. testModel3scale <- polr(as.factor(Value) ~ Surface+MaxValue+Capacity+MeanOccupancy+as.factor(FlooringType), data = trainSet3scale, Hess=TRUE)
  237. #Create a frame with the usefull variables and bind the model predictions.
  238. testFrame3scale <- data.frame(MeanWithoutZeros <- testSet3scale$MeanWithoutZeros, MaxValue <- testSet3scale$MaxValue, Surface <- testSet3scale$Surface,
  239. Capacity <- testSet3scale$Capacity, MeanOccupancy <- testSet3scale$MeanOccupancy, MaxOccupancy <- testSet3scale$MaxOccupancy,
  240. FlooringType <- testSet3scale$FlooringType)
  241. testFrame3scale <- cbind(testFrame3scale, predict(testModel3scale, testFrame3scale, type = "probs"))
  242. #Create a new frame which only contains the predicted value and the true value from the testSet.
  243. probs <- c("1","2","3")
  244. probabilitiesFrame3scale <- testFrame3scale[probs]
  245. highestProbability3scale <- as.numeric(colnames(probabilitiesFrame3scale)[apply(probabilitiesFrame3scale,1,which.max)])
  246. highestProbability3scale <- as.data.frame(cbind(highestProbability3scale, testSet3scale$Value))
  247. #Check how many times the model got the prediction right.
  248. outcome3scale <- highestProbability3scale[highestProbability3scale$highestProbability3scale == highestProbability3scale$V2,]
  249. accuracy3scale <- nrow(outcome3scale)/nrow(highestProbability3scale)
  250. accuracy3scale
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement