Advertisement
Guest User

Untitled

a guest
Oct 23rd, 2016
83
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.22 KB | None | 0 0
  # Script setup.
  # The workspace is deliberately NOT wiped here: rm(list = ls()) removes every
  # object in the global environment of whoever sources this file. Run the
  # script in a fresh R session instead.

  # load libraries
  library(data.table)
  library(ggplot2)
  library(xgboost)
  library(caret)

  # Divisor applied to every transaction amount further down.
  rate <- pi^exp(1) # 22.4591577184
  11.  
  # Load the input tables (paths are relative to the working directory).
  cust    <- fread("customers_gender_train.csv")
  tran    <- fread("transactions.csv")
  tr_type <- fread("tr_types.csv")
  mcc     <- fread("tr_mcc_codes.csv")
  17.  
  # Add day-of-week features and the number of days each person has been a
  # customer (dur).
  mcc$mcc_code <- paste('mcc', as.character(mcc$mcc_code), sep = '_')
  # Rescale amounts by `rate` and round to two decimals.
  tran[, amount := round(amount / rate, 2)]
  # tr_datetime is split on a single space into `day` and `time`.
  # NOTE(review): presumably the format is "<day index> <HH:MM:SS>" - confirm.
  tran[, c('day', 'time') := tstrsplit(tr_datetime, ' ', fixed = TRUE, type.convert = TRUE)]
  tran[, tr_datetime := NULL]
  # dw: 1-based position of the day within a 7-day cycle (from the 0-based day).
  tran[, dw := day %% 7 + 1]
  # Shift day to be 1-based as well.
  tran[, day := day + 1]
  # dur: span between a customer's first and last transaction day. It is used
  # as a divisor below, so it is clamped to at least 1: a customer whose
  # transactions all fall on one day would otherwise get dur = 0 and every
  # per-day rate would become Inf or NaN.
  tran[, dur := max(max(day) - min(day), 1), by = customer_id]
  27.  
  # dw: average number of a customer's transactions on each day of the week,
  # averaged over the customer's "lifetime" (dur).
  # dur is constant within a customer, so one value per group is enough.
  tmp <- tran[, .(N = .N / dur[1L]), by = .(customer_id, dw)]
  # One row per customer, one column per weekday; missing combinations become 0.
  dw <- dcast(tmp, customer_id ~ dw, fill = 0, value.var = "N")
  # Prefix every weekday column (everything but customer_id) with "dw_".
  colnames(dw)[-1] <- paste0("dw_", colnames(dw)[-1])
  32.  
  # money: same idea as the weekday block - per-day totals, with positive and
  # negative amounts kept in separate variables.
  # dur is constant within a customer, so dur[1L] gives one value per group.
  m_plus <- tran[amount > 0, .(money_plus = sum(amount) / dur[1L]), by = customer_id]
  m_minus <- tran[amount < 0, .(money_minus = sum(amount) / dur[1L]), by = customer_id]
  # Start from the full customer list so that customers lacking positive or
  # negative transactions are still present after the left joins.
  money <- tran[, .(customer_id = unique(customer_id))]
  money <- merge(money, m_plus, by = 'customer_id', all.x = TRUE)
  money <- merge(money, m_minus, by = 'customer_id', all.x = TRUE)
  # Diagnostic: number of cells that have no value before the zero fill.
  message('NA cells in money before zero-fill: ', sum(is.na(money)))
  # A missing side means "no such transactions", i.e. a total of 0.
  money[is.na(money_plus), money_plus := 0]
  money[is.na(money_minus), money_minus := 0]
  rm(list = c('m_plus', 'm_minus'))
  43.  
  # Features for each (mcc_code, tr_type) combination per customer:
  # number of such transactions divided by the customer's dur.
  tmp <- tran[, .(mean_val = .N / dur[1L]), by = .(customer_id, mcc_code, tr_type)]
  # Wide table: one row per customer, one column per mcc_code/tr_type pair,
  # 0 where the customer has no such transactions.
  pred <- dcast(tmp, customer_id ~ mcc_code + tr_type, fill = 0, value.var = "mean_val")
  # The raw transactions and the long table are not needed past this point.
  rm(tmp, tran)
  48.  
  # Merge everything together.
  # Prefix the mcc/tr_type columns (everything but customer_id) with "mcc_tr_".
  colnames(pred)[-1] <- paste('mcc_tr', colnames(pred)[-1], sep = '_')
  pred <- merge(pred, money, by = 'customer_id', all.x = TRUE)
  pred <- merge(pred, cust, by = 'customer_id', all.x = TRUE)
  pred <- merge(pred, dw, by = 'customer_id', all.x = TRUE)

  # Remove feature columns whose total is (almost) zero.
  # Only real features are examined: customer_id is an identifier, and gender
  # is NA for the customers to be predicted, which would turn its column sum
  # into NA and leak an NA name into the list of columns to delete.
  tmp <- colSums(
    pred[, setdiff(colnames(pred), c('customer_id', 'gender')), with = FALSE],
    na.rm = TRUE
  )
  n_col <- names(tmp)[abs(tmp) < 0.01]
  # Nothing to delete is a valid outcome; skip the by-reference removal then.
  if (length(n_col) > 0L) {
    pred[, (n_col) := NULL]
  }
  59.  
  # Build the train and prediction sets.
  # Rows with a known gender form the training data, the rest are to be scored.
  y <- pred[!is.na(gender), gender]
  c_id <- pred[is.na(gender), customer_id]
  # Feature tables: every column except the identifier and the target.
  X <- pred[!is.na(gender), !c("customer_id", "gender"), with = FALSE]
  X_pred <- pred[is.na(gender), !c("customer_id", "gender"), with = FALSE]
  # Free the intermediate tables that are no longer used.
  rm(cust, pred, money)
  69.  
  70. # scaling (kept for reference): it makes the result worse
  71. #preProc <- preProcess(X, method=c("center", "scale"))
  72. #X <- predict(preProc, X)
  73. #X_pred <- predict(preProc, X_pred)
  74.  
  # xgboost tuning through caret; the grid already holds the final parameters.
  xgbGrid <- expand.grid(
    nrounds = 200, #OK
    max_depth = 6, #OK
    eta = 0.2,
    gamma = 6, #OK
    colsample_bytree = 0.1, #OK
    min_child_weight = 12 #OK
  )
  # Some caret releases list `subsample` among the xgbTree tuning parameters
  # and reject a grid without it. Add it only when required.
  # NOTE(review): 1 is assumed to match xgboost's default, i.e. what a caret
  # version without this parameter would effectively use - confirm.
  if ('subsample' %in% modelLookup('xgbTree')$parameter) {
    xgbGrid$subsample <- 1
  }

  # caret's default summary reports only Accuracy/Kappa, so asking for "auc"
  # silently falls back to Accuracy. The area under the ROC curve is produced
  # by twoClassSummary (as "ROC") and needs class probabilities.
  fitControl <- trainControl(
    method = "cv",
    number = 3,
    classProbs = TRUE,
    summaryFunction = twoClassSummary
  )

  # classProbs = TRUE requires factor levels that are valid R names, hence the
  # "g" prefix on the raw labels.
  m1 <- train(X, factor(paste0('g', y)),
    method = 'xgbTree',
    trControl = fitControl,
    metric = "ROC",
    tuneGrid = xgbGrid
  )
  m1$bestTune
  93.  
  # Fit the final model.
  # k trades learning rate for rounds: eta is divided by k and the number of
  # boosting rounds is multiplied by k. Increasing k further does not improve
  # the model.
  k <- 256
  param <- list(
    objective = 'binary:logistic',
    eval_metric = "auc",
    eta = 0.2 / k,
    max_depth = 6,
    gamma = 6,
    min_child_weight = 12,
    subsample = 0.7,
    colsample_bytree = 0.1
  )

  model <- xgboost(
    params = param,
    data = as.matrix(X),
    label = y,
    nrounds = 200 * k,
    print_every_n = 500
  )
  108.  
  # Feature importance of the final model; only features with Gain > 0.01 are
  # plotted. Column names are read from X directly, with no matrix copy.
  f_imp <- xgb.importance(feature_names = colnames(X), model = model)
  xgb.plot.importance(f_imp[Gain > 0.01])

  # Score the customers without a known gender and write the result file
  # with the columns customer_id and gender (the model's prediction).
  res <- predict(model, as.matrix(X_pred))
  ans <- data.frame(customer_id = c_id, gender = res)
  # TRUE/FALSE are spelled out: T and F are ordinary variables that can be
  # reassigned.
  write.csv(ans, 'r_xgb_mcc_tr_dw_pm.csv', quote = FALSE, row.names = FALSE)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement