Advertisement
Guest User

Untitled

a guest
Jul 13th, 2017
177
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 4.01 KB | None | 0 0
  1. setwd("/Users/gemenenarcis/Documents/MATLAB/Football-data-challenge/R/")
  2. library(gbm)
  3. traincsv <- read.csv("../trainSet/train.csv",header = TRUE,sep=",")
  4. testcsv <- read.csv("../testSet/test.csv",header = TRUE,sep=",")
  5. dates <- as.Date(traincsv$Date, "%Y-%m-%d")
  6. years <- as.numeric(format(dates, "%Y"))
  7. months <- format(dates, "%d")
  8. uniqueYears <- sort.int(unique(years))
  9. indexes <- matrix(0,2,length(uniqueYears) - 1)
  10. for (i in 1:(length(uniqueYears) - 1))
  11. {
  12.   year <- uniqueYears[i]
  13.   nextYear <- uniqueYears[i+1]
  14.   indx <- as.numeric(format(dates, "%Y")) == year & as.numeric(format(dates, "%m")) >= 8
  15.   indx <- indx | (as.numeric(format(dates, "%Y")) == nextYear & as.numeric(format(dates, "%m")) <= 6)
  16.   indx <- which(indx == TRUE)
  17.  
  18.   #attention!!the dates in train have to be sorted
  19.   if(length(indx) > 0)
  20.   {
  21.     indexes[1,i] <- min(indx);
  22.     indexes[2,i] <- max(indx);
  23.     stopifnot(length(indx)  == indexes[2,i] - indexes[1,i] + 1)
  24.   }
  25. }
  26. matches <- matrix(0, length(traincsv$ID), 2);
  27. winners <- matrix(0, length(traincsv$ID), 1);
  28.  
  29. for (i in 1:length(traincsv$ID))
  30. {
  31.   array <- unlist(train[i,], use.names = FALSE)
  32.   array <- array[!is.na(array)]
  33.   matches[i,] <- array[3:4]
  34.   winners[i] <- array[5]
  35.   if(winners[i] == 3)
  36.     winners[i] <- 1
  37.   else if(winners[i] == 1)
  38.     winners[i] <- 3
  39. }
  40. #data.frame("Actual" = train$HomeTeam,
  41.  #          "PredictedProbability" = train$AwayTeam)
  42.  
  43. LogLossBinary = function(actual, predicted, eps = 1e-15) {  
  44.   predicted = pmin(pmax(predicted, eps), 1-eps)  
  45.   - (sum(actual * log(predicted) + (1 - actual) * log(1 - predicted))) / length(actual)
  46. }
  47.  
# Train a GBM per season and report holdout / non-holdout log loss.
# Loop deliberately limited to the first season while debugging.
for (i in 1:1)#ncol(indexes))
{
  dataSubsetProportion = .2;
  # Row range of season i in traincsv (computed earlier from the dates).
  rows = indexes[1,i]:indexes[2,i]
  # Training set = every row OUTSIDE season i; columns 3:4 only
  # (presumably the home/away team columns — TODO confirm).
  trainingNonHoldoutSet = traincsv[!(1:nrow(traincsv) %in% rows), 3:4];#to train
  # NOTE(review): `trainingHoldoutSet` is never defined in this script (only
  # in the commented-out template below) — this print and the predict() call
  # further down will fail. It should presumably be traincsv[rows, 3:4].
  print(nrow(trainingHoldoutSet))
  print(nrow(trainingNonHoldoutSet))


  # Multinomial GBM: response is the full-time result (FTR) of the
  # non-holdout rows, predictors are the two columns of the training set.
  gbmWithCrossValidation = gbm(formula = traincsv$FTR[!(1:nrow(traincsv) %in% rows)] ~ .,
                               distribution = "multinomial",
                               data = trainingNonHoldoutSet,
                               n.trees = 2000,
                               shrinkage = .1,
                               n.minobsinnode = 200,
                               cv.folds = 5,
                               n.cores = 1)
  # Best iteration count according to the cross-validation error curve.
  bestTreeForPrediction = gbm.perf(gbmWithCrossValidation)

  # Class probabilities on the (undefined, see NOTE above) holdout rows.
  gbmHoldoutPredictions = predict(object = gbmWithCrossValidation,
                                  newdata = trainingHoldoutSet,
                                  n.trees = bestTreeForPrediction,
                                  type = "response")

  # Class probabilities on the rows the model was fit on.
  gbmNonHoldoutPredictions = predict(object = gbmWithCrossValidation,
                                     newdata = trainingNonHoldoutSet,
                                     n.trees = bestTreeForPrediction,
                                     type = "response")
  # NOTE(review): `train`, `train$Response` and `randomRows` are never
  # defined in this script — leftovers from the tutorial template commented
  # out below. Also, LogLossBinary is a *binary* loss while the predictions
  # here are multinomial (a 3-column probability array); a multiclass log
  # loss over `winners` is presumably what was intended — verify.
  print(paste(LogLossBinary(train$Response[randomRows], gbmHoldoutPredictions),
              "Holdout Log Loss"))
  print(paste(LogLossBinary(train$Response[!(1:nrow(train) %in% randomRows)], gbmNonHoldoutPredictions),
              "Non-Holdout Log Loss"))
}
  81.  
  82.  
  83. #dataSubsetProportion = .2;
  84. #randomRows = sample(1:nrow(train), floor(nrow(train) * dataSubsetProportion));#
  85. #trainingHoldoutSet = train[randomRows, ];#to test
  86. #trainingNonHoldoutSet = train[!(1:nrow(train) %in% randomRows), ];#to train
  87.  
  88. #gbmWithCrossValidation = gbm(formula = Response ~ .,
  89. #                             distribution = "bernoulli",
  90. #                             data = trainingNonHoldoutSet,
  91. #                             n.trees = 2000,
  92. #                             shrinkage = .1,
  93. #                             n.minobsinnode = 200,
  94. #                             cv.folds = 5,
  95. #                             n.cores = 1)
  96.  
  97. #best  TreeForPrediction = gbm.perf(gbmWithCrossValidation)
  98. #"%y-%d-%d"
  99. #for(i in 1:length(train$HomeTeam))
  100. #{
  101.   #array = unlist(train[i,], use.names = FALSE);
  102.   #array = array[!is.na(array)];
  103.   #print(array);
  104. #}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement