Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # Read libraries
- library(readr)
- library(dplyr)
- library(ggplot2)
- library(RODBC)
- library(qualityTools)
- library(moments)
- library(exptest)
- library(MASS)
- library(rms)
- library(stats)
- library(foreign)
- library(Hmisc)
- library(reshape2)
# Load data from database ----
# The query aggregates ZoneMeasure counts per cleaning interval per room:
# for each Measure row, OUTER APPLY finds the previous CloseDateTime, and
# counts between the two timestamps are summarised (sum/max/mean, with and
# without zero measurements), plus room metadata and occupancy ratios.
# FIX: the RoomKey pattern '%|662' contained a stray '|' and could never
# match; corrected to '%662'.
query <- "SELECT Mn.PartitionKey, Mn.RoomKey, Mn.Value, Mp.CloseDateTime, Mn.MeasureDateTime,
CASE WHEN DATEPART(hour, Mn.MeasureDateTime) < 12 THEN 'ochtend' ELSE (CASE WHEN DATEPART(hour, Mn.MeasureDateTime) < 18 THEN 'middag' ELSE 'avond' END) end AS DayPart,
DATEDIFF(second,Mn.MeasureDateTime,Mn.CloseDateTime) AS CleaningDuration,
SUM(ZM.Count) AS SumCount, MAX(ZM.Count)/10 AS MaxValue, COUNT(ZM.Count) AS MeasuresCount,
(SUM(ZM.Count)/10)/COUNT(ZM.COUNT) AS MeanWithZeros,
CASE WHEN SUM(ZM.Count)=0 THEN 0 ELSE (SUM(ZM.Count)/10)/COUNT(CASE WHEN ZM.Count>1 THEN 1 ELSE NULL END) end AS MeanWithoutZeros,
R.Name, R.Type, R.FlooringType, R.Surface, R.Capacity,
ROUND((CASE WHEN SUM(ZM.Count)=0 THEN 0 ELSE (SUM(ZM.Count)/10)/COUNT(CASE WHEN ZM.Count>1 THEN 1 ELSE NULL END) end)*1.0/R.Capacity, 2) AS MeanOccupancy,
ROUND((MAX(ZM.Count)*1.0/10)/R.Capacity, 2) AS MaxOccupancy
FROM dbo.Measure Mn OUTER APPLY
(SELECT TOP 1 CloseDateTime FROM Measure Mi WHERE Mi.RoomId = Mn.RoomId AND Mi.CloseDateTime < Mn.MeasureDateTime ORDER BY CloseDateTime DESC) Mp
JOIN Room R ON Mn.RoomId = R.Id
JOIN ZoneMeasure ZM ON R.ZoneId = ZM.ZoneId AND ZM.MeasureDateTime BETWEEN Mp.CloseDateTime AND Mn.MeasureDateTime
WHERE (Mn.RoomKey LIKE '%004' OR Mn.RoomKey LIKE '%009' OR Mn.RoomKey LIKE '%106' OR Mn.RoomKey LIKE '%107' OR Mn.RoomKey LIKE '%211' OR Mn.RoomKey LIKE '%213' OR Mn.RoomKey LIKE '%214' OR Mn.RoomKey LIKE '%217' OR Mn.RoomKey LIKE '%218' OR Mn.RoomKey LIKE '%221' OR Mn.RoomKey LIKE '%226' OR Mn.RoomKey LIKE '%229' OR Mn.RoomKey LIKE '%309' OR Mn.RoomKey LIKE '%313' OR Mn.RoomKey LIKE '%314' OR Mn.RoomKey LIKE '%317' OR Mn.RoomKey LIKE '%318' OR Mn.RoomKey LIKE '%321' OR Mn.RoomKey LIKE '%326' OR Mn.RoomKey LIKE '%329' OR Mn.RoomKey LIKE '%222' OR Mn.RoomKey LIKE '%404' OR Mn.RoomKey LIKE '%406' OR Mn.RoomKey LIKE '%408' OR Mn.RoomKey LIKE '%409' OR Mn.RoomKey LIKE '%411' OR Mn.RoomKey LIKE '%413' OR Mn.RoomKey LIKE '%417' OR Mn.RoomKey LIKE '%421' OR Mn.RoomKey LIKE '%425' OR Mn.RoomKey LIKE '%430' OR Mn.RoomKey LIKE '%432' OR Mn.RoomKey LIKE '%435' OR Mn.RoomKey LIKE '%437' OR Mn.RoomKey LIKE '%503' OR Mn.RoomKey LIKE '%504' OR Mn.RoomKey LIKE '%508' OR Mn.RoomKey LIKE '%512' OR Mn.RoomKey LIKE '%516' OR Mn.RoomKey LIKE '%521' OR Mn.RoomKey LIKE '%525' OR Mn.RoomKey LIKE '%529' OR Mn.RoomKey LIKE '%531' OR Mn.RoomKey LIKE '%535' OR Mn.RoomKey LIKE '%551' OR Mn.RoomKey LIKE '%556' OR Mn.RoomKey LIKE '%561' OR Mn.RoomKey LIKE '%563' OR Mn.RoomKey LIKE '%566' OR Mn.RoomKey LIKE '%569' OR Mn.RoomKey LIKE '%602' OR Mn.RoomKey LIKE '%606' OR Mn.RoomKey LIKE '%610' OR Mn.RoomKey LIKE '%612' OR Mn.RoomKey LIKE '%616' OR Mn.RoomKey LIKE '%621' OR Mn.RoomKey LIKE '%625' OR Mn.RoomKey LIKE '%629' OR Mn.RoomKey LIKE '%631' OR Mn.RoomKey LIKE '%635' OR Mn.RoomKey LIKE '%651' OR Mn.RoomKey LIKE '%662' OR Mn.RoomKey LIKE '%666' OR Mn.RoomKey LIKE '%703' OR Mn.RoomKey LIKE '%707' OR Mn.RoomKey LIKE '%713' OR Mn.RoomKey LIKE '%717' OR Mn.RoomKey LIKE '%720' OR Mn.RoomKey LIKE '%725' OR Mn.RoomKey LIKE '%732' OR Mn.RoomKey LIKE '%735' OR Mn.RoomKey LIKE '%751' OR Mn.RoomKey LIKE '%759' OR Mn.RoomKey LIKE '%763' OR Mn.RoomKey LIKE '%766' OR Mn.RoomKey LIKE '%767' OR Mn.RoomKey
LIKE '%771' OR Mn.RoomKey LIKE '%773' OR Mn.RoomKey LIKE '%803' OR Mn.RoomKey LIKE '%806' OR Mn.RoomKey LIKE '%812' OR Mn.RoomKey LIKE '%836' OR Mn.RoomKey LIKE '%832' OR Mn.RoomKey LIKE '%826' OR Mn.RoomKey LIKE '%852' OR Mn.RoomKey LIKE '%858' OR Mn.RoomKey LIKE '%865')
AND Mn.MeasureDateTime > '2017-03-28 12:53:45.000'
--AND DATEDIFF(second,Mn.MeasureDateTime,Mn.CloseDateTime) > 10
AND Mn.Value > 0
GROUP BY Mn.PartitionKey, Mn.RoomKey, Mn.Value, Mp.CloseDateTime, Mn.MeasureDateTime, R.Name, R.Type, R.FlooringType, R.Surface, R.Capacity, Mn.CloseDateTime"
# NOTE(review): database credentials are hardcoded in source. Move them to
# environment variables or a config file kept out of version control.
dbhandle <- odbcDriverConnect('driver={SQL Server};server=rooftopsqlserveras.database.windows.net;database=Rooftop;uid=rooftopreader@rooftopsqlserveras;pwd=!one#0oftp;')
res <- sqlQuery(dbhandle, query)
odbcClose(dbhandle)
# Statistical checks ----
# Normality checks: base-R normal Q-Q plot plus Shapiro-Wilk test
# (H0: the sample comes from a normal distribution).
qqnorm(res$SumCount, main = "SumCount Normal Q-Q Plot")
qqline(res$SumCount)
shapiro.test(res$SumCount)
qqnorm(res$MeanWithoutZeros, main = "MeanWithoutZeros Normal Q-Q Plot")
qqline(res$MeanWithoutZeros)
shapiro.test(res$MeanWithoutZeros)
# Alternative normality check via qualityTools::qqPlot (second argument
# is the reference distribution name)
qqPlot(res$SumCount, "normal")
qqPlot(res$MeanWithoutZeros, "normal")
qqPlot(res$Surface, "normal")
qqPlot(res$MaxValue, "normal")
qqPlot(res$Capacity, "normal")
qqPlot(res$MeanOccupancy, "normal") # Looks normally distributed
qqPlot(res$MaxOccupancy, "normal") # Looks normally distributed
# Checks on exponential distribution (qualityTools Q-Q plots)
qqPlot(res$SumCount, "exponential", DB = TRUE)
qqPlot(res$MeanWithoutZeros, "exponential", DB = TRUE)
# Checks on Poisson distribution
# Gini test for exponentiality (exptest package); prints statistic/p-value
gini.exp.test(res$SumCount)
gini.exp.test(res$MeanWithoutZeros)
# Standardise the Gini statistic to a Z-score using the sample mean/variance
zSumCount <- (gini.exp.test(res$SumCount)$statistic-mean(res$SumCount))/sqrt(var(res$SumCount))
zMeanWithoutZeros <- (gini.exp.test(res$MeanWithoutZeros)$statistic-mean(res$MeanWithoutZeros))/sqrt(var(res$MeanWithoutZeros))
print(zSumCount)
print(zMeanWithoutZeros)
# Significance level 5%: if -1.96 < Z < 1.96 there is no evidence that the
# distribution of the measurements deviates from an exponential distribution
# Significance level 1%: if -2.575 < Z < 2.575 there is no evidence that the
# distribution of the measurements deviates from an exponential distribution
# Mean, variance, skewness and kurtosis (moments package) for the two
# candidate response summaries
mean(res$SumCount)
var(res$SumCount)
skewness(res$SumCount)
kurtosis(res$SumCount)
mean(res$MeanWithoutZeros)
var(res$MeanWithoutZeros)
skewness(res$MeanWithoutZeros)
kurtosis(res$MeanWithoutZeros)
# Boxplots to eyeball outliers before the formal removal further below
boxplot(res$SumCount, main = "SumCountData Boxplot")
boxplot(res$MeanWithoutZeros, main = "MeanWithoutZeros Boxplot")
# Scatterplots ----
# FIX: use bare column names inside aes(). Writing res$col inside aes()
# bypasses the `data` argument and breaks grouping/faceting in ggplot2.
ggplot(data = res, aes(y = MeanWithoutZeros, x = Value)) + geom_point()
# Bin Value into unit-wide intervals and show one boxplot per bin
ggplot(data = res, aes(y = MeanWithoutZeros, x = cut(Value, seq(from = -1.5, to = 5.5, by = 1)))) + geom_boxplot()
# Correlation coefficients ----
# Spearman rank correlation is used because Value is ordinal and the
# predictors are not normally distributed (see checks above).
#cor(res$Value, res$MeanWithoutZeros, use = "everything", method = c("pearson"))
#cor(res$Value, res$MeanWithoutZeros, use = "everything", method = c("kendall"))
cor(res$Value, res$SumCount, use = "everything", method = c("spearman"))
cor.test(res$Value, res$SumCount, method ="spearman")
cor(res$Value, res$MeanWithoutZeros, use = "everything", method = c("spearman"))
cor.test(res$Value, res$MeanWithoutZeros, method ="spearman")
cor(res$Value, res$Surface, use = "everything", method = c("spearman"))
cor.test(res$Value, res$Surface, method ="spearman")
cor(res$Value, res$MaxValue, use = "everything", method = c("spearman"))
cor.test(res$Value, res$MaxValue, method ="spearman")
cor(res$Value, res$Capacity, use = "everything", method = c("spearman"))
cor.test(res$Value, res$Capacity, method ="spearman")
cor(res$Value, res$MeanOccupancy, use = "everything", method = c("spearman"))
cor.test(res$Value, res$MeanOccupancy, method ="spearman")
cor(res$Value, res$MaxOccupancy, use = "everything", method = c("spearman"))
cor.test(res$Value, res$MaxOccupancy, method ="spearman")
# Barplots ----
# Work on a copy so res keeps Value numeric for the correlations above
Eerste <- res
Eerste$Value <- as.factor(Eerste$Value)
# Barplot: Value vs mean MeanWithoutZeros per Value level
nieuweData <- Eerste %>% group_by(Value) %>% summarise(gembezoekers = mean(MeanWithoutZeros, na.rm=TRUE))
ggplot(nieuweData, aes(x=Value, y=gembezoekers)) + geom_bar(stat = "identity")
# Barplot: Value vs mean SumCount per Value level
nieuweData <- Eerste %>% group_by(Value) %>% summarise(gemsumcount = mean(SumCount, na.rm=TRUE))
ggplot(nieuweData, aes(x=Value, y=gemsumcount)) + geom_bar(stat = "identity")
# Remove outliers for experimental use ----
# Replace values outside the Tukey fences [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
# with NA; all other values (including pre-existing NAs) pass through.
# Extra arguments in ... are forwarded to quantile().
remove_outliers <- function(x, na.rm = TRUE, ...) {
  fences <- quantile(x, probs = c(0.25, 0.75), na.rm = na.rm, ...)
  margin <- 1.5 * IQR(x, na.rm = na.rm)
  cleaned <- x
  is_outlier <- x < (fences[1] - margin) | x > (fences[2] + margin)
  cleaned[which(is_outlier)] <- NA
  cleaned
}
# SumCount with outliers removed ----
# FIX: the original passed na.rm = TRUE to data.frame(), which is not an
# argument of data.frame() and silently became a constant logical column.
# Name the columns explicitly instead of relying on deparsed expressions.
SumCountMinOutliers <- data.frame(
  SumCount = remove_outliers(res$SumCount),
  Value = res$Value
)
boxplot(SumCountMinOutliers)
# Drop the rows that remove_outliers() flagged as NA
SumCountMinOutliers <- SumCountMinOutliers[!is.na(SumCountMinOutliers$SumCount), ]
minoutlierdat <- SumCountMinOutliers %>%
  group_by(Value) %>%
  summarise(gemsumcount = mean(SumCount, na.rm = TRUE))
ggplot(minoutlierdat, aes(x = Value, y = gemsumcount)) + geom_bar(stat = "identity")
cor(SumCountMinOutliers$Value, SumCountMinOutliers$SumCount, use = "everything", method = c("spearman"))
# MeanWithoutZeros with outliers removed ----
# FIX: same defect as the SumCount version — na.rm = TRUE is not a
# data.frame() argument and became a bogus logical column. Use explicit
# column names.
MWZMinOutliers <- data.frame(
  MeanWithoutZeros = remove_outliers(res$MeanWithoutZeros),
  Value = res$Value
)
boxplot(MWZMinOutliers)
# Drop the rows that remove_outliers() flagged as NA
MWZMinOutliers <- MWZMinOutliers[!is.na(MWZMinOutliers$MeanWithoutZeros), ]
minoutlierdat <- MWZMinOutliers %>%
  group_by(Value) %>%
  summarise(gemsumcount = mean(MeanWithoutZeros, na.rm = TRUE))
ggplot(minoutlierdat, aes(x = Value, y = gemsumcount)) + geom_bar(stat = "identity")
cor(MWZMinOutliers$Value, MWZMinOutliers$MeanWithoutZeros, use = "everything", method = c("spearman"))
###############################################################################################################################################
# Ordinal logistic regression ----
# rms requires a datadist object registered via options() for summaries/plots
dd <- datadist(res)
options(datadist="dd")
# polr (MASS) fits proportional-odds models; Hess=TRUE keeps the Hessian
# so summary() can compute standard errors
model1 <- polr(as.factor(Value) ~ MeanWithoutZeros + MaxValue + Surface, data = res, Hess=TRUE)
model2 <- polr(as.ordered(Value) ~ MeanWithoutZeros + MaxValue + Surface + Capacity + MeanOccupancy + MaxOccupancy, data = res, Hess=TRUE)
# Summary, tables and figures
summary(model2)
ctable <- coef(summary(model2))
# Two-sided p-values from the t statistics (normal approximation)
p <- pnorm(abs(ctable[, "t value"]), lower.tail = FALSE) * 2
(ctable <- cbind(ctable, "p value" = p))
# Profile-likelihood confidence intervals
(ci <- confint(model2)) # At level = 0.90 (alpha 90%) significance changes.
confint.default(model2)
## Odds ratios
exp(coef(model2))
## OR and CI
exp(cbind(OR = coef(model2), ci))
# Predict probabilities on a test set ----
# Time-based split; rows at exactly 12:00:00 fall in neither set (original
# strict </>  comparisons kept).
trainSet <- res[res$MeasureDateTime < '2017-04-12 12:00:00',]
testSet <- res[res$MeasureDateTime > '2017-04-12 12:00:00',]
# Fit the polr model on the training set only, never on the full res data.
testModel <- polr(as.factor(Value) ~ MeanWithoutZeros+Surface+MeanOccupancy+MaxValue+MaxOccupancy+SumCount+MeasuresCount, data = trainSet, Hess=TRUE)
# FIX: the original used `<-` inside data.frame(), which leaks global
# variables and produces mangled column names; predict() only found the
# predictors via those leaked globals. Use `=` with the exact names the
# model formula expects, and include MeasuresCount, which the formula
# requires but the original frame omitted.
testFrame <- data.frame(
  SumCount = testSet$SumCount,
  MeanWithoutZeros = testSet$MeanWithoutZeros,
  MaxValue = testSet$MaxValue,
  Surface = testSet$Surface,
  Capacity = testSet$Capacity,
  MeanOccupancy = testSet$MeanOccupancy,
  MaxOccupancy = testSet$MaxOccupancy,
  MeasuresCount = testSet$MeasuresCount,
  FlooringType = testSet$FlooringType
)
# Append one probability column per class ("1".."5")
testFrame <- cbind(testFrame, predict(testModel, testFrame, type = "probs"))
# Keep only the class-probability columns and the true value from testSet.
probs <- c("1","2","3","4","5")
probabilitiesFrame <- testFrame[probs]
# Predicted class = column with the highest probability per row
highestProbability <- as.numeric(colnames(probabilitiesFrame)[apply(probabilitiesFrame, 1, which.max)])
highestProbability <- as.data.frame(cbind(highestProbability, testSet$Value))
# Accuracy: fraction of rows where the predicted class equals the truth.
outcome <- highestProbability[highestProbability$highestProbability == highestProbability$V2,]
accuracy <- nrow(outcome)/nrow(highestProbability)
accuracy
# lrm (rms) method for ordinal logistic regression; x=TRUE/y=TRUE keep the
# design matrix and response so validate()/calibrate() can resample
lrm1 <- lrm(as.factor(Value) ~ MeanWithoutZeros + MaxValue + Surface + Capacity + MeanOccupancy + MaxOccupancy, data = res, x=TRUE, y=TRUE)
summary(lrm1)
print(lrm1)
lrm.valid <- validate(lrm1, method="boot", B=100) # Throws errors sometimes; rerun until it succeeds. Probably caused by too few Value=1 rows in a bootstrap sample
lrm.valid
lrm.calib <- calibrate(lrm1, method="boot", B=100) # Throws errors sometimes; rerun until it succeeds. Probably caused by too few Value=1 rows in a bootstrap sample
par(bg="white", las=1)
plot(lrm.calib, las=1)
###############################################################################################################################################
# Prediction on a 3-point scale ----
# First copy the original train/test split, then collapse the 5-point Value
# scale to 3 points: {1,2} -> 1, {3} -> 2, {4,5} -> 3.
# NOTE: the recode steps run in order on the already-updated column, which
# is safe here because each target value is below the values still to come.
trainSet3scale <- trainSet
testSet3scale <- testSet
trainSet3scale$Value[trainSet3scale$Value == 2] <- 1
trainSet3scale$Value[trainSet3scale$Value == 3] <- 2
trainSet3scale$Value[trainSet3scale$Value == 4] <- 3
trainSet3scale$Value[trainSet3scale$Value == 5] <- 3
testSet3scale$Value[testSet3scale$Value == 2] <- 1
testSet3scale$Value[testSet3scale$Value == 3] <- 2
testSet3scale$Value[testSet3scale$Value == 4] <- 3
testSet3scale$Value[testSet3scale$Value == 5] <- 3
# Fit the polr model on the training set only, never on the full res data.
testModel3scale <- polr(as.factor(Value) ~ Surface+MaxValue+Capacity+MeanOccupancy+as.factor(FlooringType), data = trainSet3scale, Hess=TRUE)
# FIX: use `=` (not `<-`) inside data.frame() so the columns get the names
# predict() expects and no global variables are leaked.
testFrame3scale <- data.frame(
  MeanWithoutZeros = testSet3scale$MeanWithoutZeros,
  MaxValue = testSet3scale$MaxValue,
  Surface = testSet3scale$Surface,
  Capacity = testSet3scale$Capacity,
  MeanOccupancy = testSet3scale$MeanOccupancy,
  MaxOccupancy = testSet3scale$MaxOccupancy,
  FlooringType = testSet3scale$FlooringType
)
# Append one probability column per collapsed class ("1".."3")
testFrame3scale <- cbind(testFrame3scale, predict(testModel3scale, testFrame3scale, type = "probs"))
# Keep only the class-probability columns and the true value from the test set.
probs <- c("1","2","3")
probabilitiesFrame3scale <- testFrame3scale[probs]
# Predicted class = column with the highest probability per row
highestProbability3scale <- as.numeric(colnames(probabilitiesFrame3scale)[apply(probabilitiesFrame3scale, 1, which.max)])
highestProbability3scale <- as.data.frame(cbind(highestProbability3scale, testSet3scale$Value))
# Accuracy: share of test rows whose top-probability class matches the truth.
outcome3scale <- highestProbability3scale[highestProbability3scale$highestProbability3scale == highestProbability3scale$V2,]
accuracy3scale <- nrow(outcome3scale)/nrow(highestProbability3scale)
accuracy3scale
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement