Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # Read libraries
- library(readr)
- library(dplyr)
- library(ggplot2)
- library(RODBC)
- library(qualityTools)
- library(moments)
- library(exptest)
- library(MASS)
- library(rms)
- library(stats)
- library(foreign)
- library(Hmisc)
- library(reshape2)
# Load data from database ----
# The query aggregates ZoneMeasure counts per cleaning interval per room:
# for each Measure row, OUTER APPLY finds the previous CloseDateTime, and
# counts between the two timestamps are summarised (sum/max/mean, with and
# without zero measurements), plus room metadata and occupancy ratios.
# FIX: the RoomKey pattern '%|662' contained a stray '|' and could never
# match; corrected to '%662'.
query <- "SELECT Mn.PartitionKey, Mn.RoomKey, Mn.Value, Mp.CloseDateTime, Mn.MeasureDateTime,
CASE WHEN DATEPART(hour, Mn.MeasureDateTime) < 12 THEN 'ochtend' ELSE (CASE WHEN DATEPART(hour, Mn.MeasureDateTime) < 18 THEN 'middag' ELSE 'avond' END) end AS DayPart,
DATEDIFF(second,Mn.MeasureDateTime,Mn.CloseDateTime) AS CleaningDuration,
SUM(ZM.Count) AS SumCount, MAX(ZM.Count)/10 AS MaxValue, COUNT(ZM.Count) AS MeasuresCount,
(SUM(ZM.Count)/10)/COUNT(ZM.COUNT) AS MeanWithZeros,
CASE WHEN SUM(ZM.Count)=0 THEN 0 ELSE (SUM(ZM.Count)/10)/COUNT(CASE WHEN ZM.Count>1 THEN 1 ELSE NULL END) end AS MeanWithoutZeros,
R.Name, R.Type, R.FlooringType, R.Surface, R.Capacity,
ROUND((CASE WHEN SUM(ZM.Count)=0 THEN 0 ELSE (SUM(ZM.Count)/10)/COUNT(CASE WHEN ZM.Count>1 THEN 1 ELSE NULL END) end)*1.0/R.Capacity, 2) AS MeanOccupancy,
ROUND((MAX(ZM.Count)*1.0/10)/R.Capacity, 2) AS MaxOccupancy
FROM dbo.Measure Mn OUTER APPLY
(SELECT TOP 1 CloseDateTime FROM Measure Mi WHERE Mi.RoomId = Mn.RoomId AND Mi.CloseDateTime < Mn.MeasureDateTime ORDER BY CloseDateTime DESC) Mp
JOIN Room R ON Mn.RoomId = R.Id
JOIN ZoneMeasure ZM ON R.ZoneId = ZM.ZoneId AND ZM.MeasureDateTime BETWEEN Mp.CloseDateTime AND Mn.MeasureDateTime
WHERE (Mn.RoomKey LIKE '%004' OR Mn.RoomKey LIKE '%009' OR Mn.RoomKey LIKE '%106' OR Mn.RoomKey LIKE '%107' OR Mn.RoomKey LIKE '%211' OR Mn.RoomKey LIKE '%213' OR Mn.RoomKey LIKE '%214' OR Mn.RoomKey LIKE '%217' OR Mn.RoomKey LIKE '%218' OR Mn.RoomKey LIKE '%221' OR Mn.RoomKey LIKE '%226' OR Mn.RoomKey LIKE '%229' OR Mn.RoomKey LIKE '%309' OR Mn.RoomKey LIKE '%313' OR Mn.RoomKey LIKE '%314' OR Mn.RoomKey LIKE '%317' OR Mn.RoomKey LIKE '%318' OR Mn.RoomKey LIKE '%321' OR Mn.RoomKey LIKE '%326' OR Mn.RoomKey LIKE '%329' OR Mn.RoomKey LIKE '%222' OR Mn.RoomKey LIKE '%404' OR Mn.RoomKey LIKE '%406' OR Mn.RoomKey LIKE '%408' OR Mn.RoomKey LIKE '%409' OR Mn.RoomKey LIKE '%411' OR Mn.RoomKey LIKE '%413' OR Mn.RoomKey LIKE '%417' OR Mn.RoomKey LIKE '%421' OR Mn.RoomKey LIKE '%425' OR Mn.RoomKey LIKE '%430' OR Mn.RoomKey LIKE '%432' OR Mn.RoomKey LIKE '%435' OR Mn.RoomKey LIKE '%437' OR Mn.RoomKey LIKE '%503' OR Mn.RoomKey LIKE '%504' OR Mn.RoomKey LIKE '%508' OR Mn.RoomKey LIKE '%512' OR Mn.RoomKey LIKE '%516' OR Mn.RoomKey LIKE '%521' OR Mn.RoomKey LIKE '%525' OR Mn.RoomKey LIKE '%529' OR Mn.RoomKey LIKE '%531' OR Mn.RoomKey LIKE '%535' OR Mn.RoomKey LIKE '%551' OR Mn.RoomKey LIKE '%556' OR Mn.RoomKey LIKE '%561' OR Mn.RoomKey LIKE '%563' OR Mn.RoomKey LIKE '%566' OR Mn.RoomKey LIKE '%569' OR Mn.RoomKey LIKE '%602' OR Mn.RoomKey LIKE '%606' OR Mn.RoomKey LIKE '%610' OR Mn.RoomKey LIKE '%612' OR Mn.RoomKey LIKE '%616' OR Mn.RoomKey LIKE '%621' OR Mn.RoomKey LIKE '%625' OR Mn.RoomKey LIKE '%629' OR Mn.RoomKey LIKE '%631' OR Mn.RoomKey LIKE '%635' OR Mn.RoomKey LIKE '%651' OR Mn.RoomKey LIKE '%662' OR Mn.RoomKey LIKE '%666' OR Mn.RoomKey LIKE '%703' OR Mn.RoomKey LIKE '%707' OR Mn.RoomKey LIKE '%713' OR Mn.RoomKey LIKE '%717' OR Mn.RoomKey LIKE '%720' OR Mn.RoomKey LIKE '%725' OR Mn.RoomKey LIKE '%732' OR Mn.RoomKey LIKE '%735' OR Mn.RoomKey LIKE '%751' OR Mn.RoomKey LIKE '%759' OR Mn.RoomKey LIKE '%763' OR Mn.RoomKey LIKE '%766' OR Mn.RoomKey LIKE '%767' OR Mn.RoomKey
LIKE '%771' OR Mn.RoomKey LIKE '%773' OR Mn.RoomKey LIKE '%803' OR Mn.RoomKey LIKE '%806' OR Mn.RoomKey LIKE '%812' OR Mn.RoomKey LIKE '%836' OR Mn.RoomKey LIKE '%832' OR Mn.RoomKey LIKE '%826' OR Mn.RoomKey LIKE '%852' OR Mn.RoomKey LIKE '%858' OR Mn.RoomKey LIKE '%865')
AND Mn.MeasureDateTime > '2017-03-28 12:53:45.000'
--AND DATEDIFF(second,Mn.MeasureDateTime,Mn.CloseDateTime) > 10
AND Mn.Value > 0
GROUP BY Mn.PartitionKey, Mn.RoomKey, Mn.Value, Mp.CloseDateTime, Mn.MeasureDateTime, R.Name, R.Type, R.FlooringType, R.Surface, R.Capacity, Mn.CloseDateTime"
# NOTE(review): database credentials are hardcoded in source. Move them to
# environment variables or a config file kept out of version control.
dbhandle <- odbcDriverConnect('driver={SQL Server};server=rooftopsqlserveras.database.windows.net;database=Rooftop;uid=rooftopreader@rooftopsqlserveras;pwd=!one#0oftp;')
res <- sqlQuery(dbhandle, query)
odbcClose(dbhandle)
# Statistical checks ----
# Normality checks: base-R normal Q-Q plot plus Shapiro-Wilk test
# (H0: the sample comes from a normal distribution).
qqnorm(res$SumCount, main = "SumCount Normal Q-Q Plot")
qqline(res$SumCount)
shapiro.test(res$SumCount)
qqnorm(res$MeanWithoutZeros, main = "MeanWithoutZeros Normal Q-Q Plot")
qqline(res$MeanWithoutZeros)
shapiro.test(res$MeanWithoutZeros)
# Alternative normality check via qualityTools::qqPlot (second argument
# is the reference distribution name)
qqPlot(res$SumCount, "normal")
qqPlot(res$MeanWithoutZeros, "normal")
qqPlot(res$Surface, "normal")
qqPlot(res$MaxValue, "normal")
qqPlot(res$Capacity, "normal")
qqPlot(res$MeanOccupancy, "normal") # Looks normally distributed
qqPlot(res$MaxOccupancy, "normal") # Looks normally distributed
# Checks on exponential distribution (qualityTools Q-Q plots)
qqPlot(res$SumCount, "exponential", DB = TRUE)
qqPlot(res$MeanWithoutZeros, "exponential", DB = TRUE)
# Checks on Poisson distribution
# Gini test for exponentiality (exptest package); prints statistic/p-value
gini.exp.test(res$SumCount)
gini.exp.test(res$MeanWithoutZeros)
# Standardise the Gini statistic to a Z-score using the sample mean/variance
zSumCount <- (gini.exp.test(res$SumCount)$statistic-mean(res$SumCount))/sqrt(var(res$SumCount))
zMeanWithoutZeros <- (gini.exp.test(res$MeanWithoutZeros)$statistic-mean(res$MeanWithoutZeros))/sqrt(var(res$MeanWithoutZeros))
print(zSumCount)
print(zMeanWithoutZeros)
# Significance level 5%: if -1.96 < Z < 1.96 there is no evidence that the
# distribution of the measurements deviates from an exponential distribution
# Significance level 1%: if -2.575 < Z < 2.575 there is no evidence that the
# distribution of the measurements deviates from an exponential distribution
# Mean, variance, skewness and kurtosis (moments package) for the two
# candidate response summaries
mean(res$SumCount)
var(res$SumCount)
skewness(res$SumCount)
kurtosis(res$SumCount)
mean(res$MeanWithoutZeros)
var(res$MeanWithoutZeros)
skewness(res$MeanWithoutZeros)
kurtosis(res$MeanWithoutZeros)
# Boxplots to eyeball outliers before the formal removal further below
boxplot(res$SumCount, main = "SumCountData Boxplot")
boxplot(res$MeanWithoutZeros, main = "MeanWithoutZeros Boxplot")
# Scatterplots ----
# FIX: use bare column names inside aes(). Writing res$col inside aes()
# bypasses the `data` argument and breaks grouping/faceting in ggplot2.
ggplot(data = res, aes(y = MeanWithoutZeros, x = Value)) + geom_point()
# Bin Value into unit-wide intervals and show one boxplot per bin
ggplot(data = res, aes(y = MeanWithoutZeros, x = cut(Value, seq(from = -1.5, to = 5.5, by = 1)))) + geom_boxplot()
# Correlation coefficients ----
# Spearman rank correlation is used because Value is ordinal and the
# predictors are not normally distributed (see checks above).
#cor(res$Value, res$MeanWithoutZeros, use = "everything", method = c("pearson"))
#cor(res$Value, res$MeanWithoutZeros, use = "everything", method = c("kendall"))
cor(res$Value, res$SumCount, use = "everything", method = c("spearman"))
cor.test(res$Value, res$SumCount, method ="spearman")
cor(res$Value, res$MeanWithoutZeros, use = "everything", method = c("spearman"))
cor.test(res$Value, res$MeanWithoutZeros, method ="spearman")
cor(res$Value, res$Surface, use = "everything", method = c("spearman"))
cor.test(res$Value, res$Surface, method ="spearman")
cor(res$Value, res$MaxValue, use = "everything", method = c("spearman"))
cor.test(res$Value, res$MaxValue, method ="spearman")
cor(res$Value, res$Capacity, use = "everything", method = c("spearman"))
cor.test(res$Value, res$Capacity, method ="spearman")
cor(res$Value, res$MeanOccupancy, use = "everything", method = c("spearman"))
cor.test(res$Value, res$MeanOccupancy, method ="spearman")
cor(res$Value, res$MaxOccupancy, use = "everything", method = c("spearman"))
cor.test(res$Value, res$MaxOccupancy, method ="spearman")
# Barplots ----
# Work on a copy so res keeps Value numeric for the correlations above
Eerste <- res
Eerste$Value <- as.factor(Eerste$Value)
# Barplot: Value vs mean MeanWithoutZeros per Value level
nieuweData <- Eerste %>% group_by(Value) %>% summarise(gembezoekers = mean(MeanWithoutZeros, na.rm=TRUE))
ggplot(nieuweData, aes(x=Value, y=gembezoekers)) + geom_bar(stat = "identity")
# Barplot: Value vs mean SumCount per Value level
nieuweData <- Eerste %>% group_by(Value) %>% summarise(gemsumcount = mean(SumCount, na.rm=TRUE))
ggplot(nieuweData, aes(x=Value, y=gemsumcount)) + geom_bar(stat = "identity")
# Remove outliers for experimental use ----
# Replace values outside the Tukey fences [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
# with NA; all other values (including pre-existing NAs) pass through.
# Extra arguments in ... are forwarded to quantile().
remove_outliers <- function(x, na.rm = TRUE, ...) {
  fences <- quantile(x, probs = c(0.25, 0.75), na.rm = na.rm, ...)
  margin <- 1.5 * IQR(x, na.rm = na.rm)
  cleaned <- x
  is_outlier <- x < (fences[1] - margin) | x > (fences[2] + margin)
  cleaned[which(is_outlier)] <- NA
  cleaned
}
# SumCount with outliers removed ----
# FIX: the original passed na.rm = TRUE to data.frame(), which is not an
# argument of data.frame() and silently became a constant logical column.
# Name the columns explicitly instead of relying on deparsed expressions.
SumCountMinOutliers <- data.frame(
  SumCount = remove_outliers(res$SumCount),
  Value = res$Value
)
boxplot(SumCountMinOutliers)
# Drop the rows that remove_outliers() flagged as NA
SumCountMinOutliers <- SumCountMinOutliers[!is.na(SumCountMinOutliers$SumCount), ]
minoutlierdat <- SumCountMinOutliers %>%
  group_by(Value) %>%
  summarise(gemsumcount = mean(SumCount, na.rm = TRUE))
ggplot(minoutlierdat, aes(x = Value, y = gemsumcount)) + geom_bar(stat = "identity")
cor(SumCountMinOutliers$Value, SumCountMinOutliers$SumCount, use = "everything", method = c("spearman"))
# MeanWithoutZeros with outliers removed ----
# FIX: same defect as the SumCount version — na.rm = TRUE is not a
# data.frame() argument and became a bogus logical column. Use explicit
# column names.
MWZMinOutliers <- data.frame(
  MeanWithoutZeros = remove_outliers(res$MeanWithoutZeros),
  Value = res$Value
)
boxplot(MWZMinOutliers)
# Drop the rows that remove_outliers() flagged as NA
MWZMinOutliers <- MWZMinOutliers[!is.na(MWZMinOutliers$MeanWithoutZeros), ]
minoutlierdat <- MWZMinOutliers %>%
  group_by(Value) %>%
  summarise(gemsumcount = mean(MeanWithoutZeros, na.rm = TRUE))
ggplot(minoutlierdat, aes(x = Value, y = gemsumcount)) + geom_bar(stat = "identity")
cor(MWZMinOutliers$Value, MWZMinOutliers$MeanWithoutZeros, use = "everything", method = c("spearman"))
###############################################################################################################################################
# Ordinal logistic regression ----
# rms requires a datadist object registered via options() for summaries/plots
dd <- datadist(res)
options(datadist="dd")
# polr (MASS) fits proportional-odds models; Hess=TRUE keeps the Hessian
# so summary() can compute standard errors
model1 <- polr(as.factor(Value) ~ MeanWithoutZeros + MaxValue + Surface, data = res, Hess=TRUE)
model2 <- polr(as.ordered(Value) ~ MeanWithoutZeros + MaxValue + Surface + Capacity + MeanOccupancy + MaxOccupancy, data = res, Hess=TRUE)
# Summary, tables and figures
summary(model2)
ctable <- coef(summary(model2))
# Two-sided p-values from the t statistics (normal approximation)
p <- pnorm(abs(ctable[, "t value"]), lower.tail = FALSE) * 2
(ctable <- cbind(ctable, "p value" = p))
# Profile-likelihood confidence intervals
(ci <- confint(model2)) # At level = 0.90 (alpha 90%) significance changes.
confint.default(model2)
## Odds ratios
exp(coef(model2))
## OR and CI
exp(cbind(OR = coef(model2), ci))
# Predict probabilities on a test set ----
# Time-based split; rows at exactly 12:00:00 fall in neither set (original
# strict </>  comparisons kept).
trainSet <- res[res$MeasureDateTime < '2017-04-12 12:00:00',]
testSet <- res[res$MeasureDateTime > '2017-04-12 12:00:00',]
# Fit the polr model on the training set only, never on the full res data.
testModel <- polr(as.factor(Value) ~ MeanWithoutZeros+Surface+MeanOccupancy+MaxValue+MaxOccupancy+SumCount+MeasuresCount, data = trainSet, Hess=TRUE)
# FIX: the original used `<-` inside data.frame(), which leaks global
# variables and produces mangled column names; predict() only found the
# predictors via those leaked globals. Use `=` with the exact names the
# model formula expects, and include MeasuresCount, which the formula
# requires but the original frame omitted.
testFrame <- data.frame(
  SumCount = testSet$SumCount,
  MeanWithoutZeros = testSet$MeanWithoutZeros,
  MaxValue = testSet$MaxValue,
  Surface = testSet$Surface,
  Capacity = testSet$Capacity,
  MeanOccupancy = testSet$MeanOccupancy,
  MaxOccupancy = testSet$MaxOccupancy,
  MeasuresCount = testSet$MeasuresCount,
  FlooringType = testSet$FlooringType
)
# Append one probability column per class ("1".."5")
testFrame <- cbind(testFrame, predict(testModel, testFrame, type = "probs"))
# Keep only the class-probability columns and the true value from testSet.
probs <- c("1","2","3","4","5")
probabilitiesFrame <- testFrame[probs]
# Predicted class = column with the highest probability per row
highestProbability <- as.numeric(colnames(probabilitiesFrame)[apply(probabilitiesFrame, 1, which.max)])
highestProbability <- as.data.frame(cbind(highestProbability, testSet$Value))
# Accuracy: fraction of rows where the predicted class equals the truth.
outcome <- highestProbability[highestProbability$highestProbability == highestProbability$V2,]
accuracy <- nrow(outcome)/nrow(highestProbability)
accuracy
# lrm (rms) method for ordinal logistic regression; x=TRUE/y=TRUE keep the
# design matrix and response so validate()/calibrate() can resample
lrm1 <- lrm(as.factor(Value) ~ MeanWithoutZeros + MaxValue + Surface + Capacity + MeanOccupancy + MaxOccupancy, data = res, x=TRUE, y=TRUE)
summary(lrm1)
print(lrm1)
lrm.valid <- validate(lrm1, method="boot", B=100) # Throws errors sometimes; rerun until it succeeds. Probably caused by too few Value=1 rows in a bootstrap sample
lrm.valid
lrm.calib <- calibrate(lrm1, method="boot", B=100) # Throws errors sometimes; rerun until it succeeds. Probably caused by too few Value=1 rows in a bootstrap sample
par(bg="white", las=1)
plot(lrm.calib, las=1)
###############################################################################################################################################
# Prediction on a 3-point scale ----
# First copy the original train/test split, then collapse the 5-point Value
# scale to 3 points: {1,2} -> 1, {3} -> 2, {4,5} -> 3.
# NOTE: the recode steps run in order on the already-updated column, which
# is safe here because each target value is below the values still to come.
trainSet3scale <- trainSet
testSet3scale <- testSet
trainSet3scale$Value[trainSet3scale$Value == 2] <- 1
trainSet3scale$Value[trainSet3scale$Value == 3] <- 2
trainSet3scale$Value[trainSet3scale$Value == 4] <- 3
trainSet3scale$Value[trainSet3scale$Value == 5] <- 3
testSet3scale$Value[testSet3scale$Value == 2] <- 1
testSet3scale$Value[testSet3scale$Value == 3] <- 2
testSet3scale$Value[testSet3scale$Value == 4] <- 3
testSet3scale$Value[testSet3scale$Value == 5] <- 3
# Fit the polr model on the training set only, never on the full res data.
testModel3scale <- polr(as.factor(Value) ~ Surface+MaxValue+Capacity+MeanOccupancy+as.factor(FlooringType), data = trainSet3scale, Hess=TRUE)
# FIX: use `=` (not `<-`) inside data.frame() so the columns get the names
# predict() expects and no global variables are leaked.
testFrame3scale <- data.frame(
  MeanWithoutZeros = testSet3scale$MeanWithoutZeros,
  MaxValue = testSet3scale$MaxValue,
  Surface = testSet3scale$Surface,
  Capacity = testSet3scale$Capacity,
  MeanOccupancy = testSet3scale$MeanOccupancy,
  MaxOccupancy = testSet3scale$MaxOccupancy,
  FlooringType = testSet3scale$FlooringType
)
# Append one probability column per collapsed class ("1".."3")
testFrame3scale <- cbind(testFrame3scale, predict(testModel3scale, testFrame3scale, type = "probs"))
# Keep only the class-probability columns and the true value from the test set.
probs <- c("1","2","3")
probabilitiesFrame3scale <- testFrame3scale[probs]
# Predicted class = column with the highest probability per row
highestProbability3scale <- as.numeric(colnames(probabilitiesFrame3scale)[apply(probabilitiesFrame3scale, 1, which.max)])
highestProbability3scale <- as.data.frame(cbind(highestProbability3scale, testSet3scale$Value))
# Accuracy: share of test rows whose top-probability class matches the truth.
outcome3scale <- highestProbability3scale[highestProbability3scale$highestProbability3scale == highestProbability3scale$V2,]
accuracy3scale <- nrow(outcome3scale)/nrow(highestProbability3scale)
accuracy3scale
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement