Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- rm(list = ls())
- gc()
- # загрузка библиотек
- library(data.table)
- library(ggplot2)
- library(xgboost)
- library(caret)
- rate <- pi^exp(1) #22.4591577184
- # загрузка данных
- tran <- fread('transactions.csv')
- cust <- fread('customers_gender_train.csv')
- mcc <- fread('tr_mcc_codes.csv')
- tr_type <- fread('tr_types.csv')
- # добавляем признаки по дням недели и продолжительности дней, сколько человек являлся клиентом (dur)
- mcc$mcc_code <- paste('mcc', as.character(mcc$mcc_code), sep = '_')
- tran[, amount := round(amount / rate, 2)]
- tran[, c('day', 'time') := tstrsplit(tr_datetime, ' ', fixed = TRUE, type.convert = TRUE)]
- tran[, tr_datetime := NULL]
- tran[, dw := day %% 7]
- tran[, day := day + 1]
- tran[, dw := dw + 1]
- tran[, dur := max(day) - min(day), by = customer_id]
- # dw, пишем среднее кол-во транзакций клиента в определенный день недели, усреднение по времени "жизни" клиента
- tmp <- unique(tran[, .(N = (.N / dur)), by = c('customer_id', 'dw')])
- dw <- dcast(tmp, customer_id ~ dw, value.var = 'N', fill = 0)
- colnames(dw)[2:length(colnames(dw))] <- paste('dw', colnames(dw)[2:length(colnames(dw))], sep = '_')
- # money, аналогично предыдущему куску, выносим в отдельные переменные положительные суммы и отрицательные
- money <- tran[, .(rich = sum(amount)), by = customer_id]
- m_plus <- unique(tran[amount > 0, .(money_plus = sum(amount) / dur), by = customer_id])
- m_minus <- unique(tran[amount < 0, .(money_minus = sum(amount) / dur), by = customer_id])
- money <- merge(money, m_plus, by = 'customer_id', all.x = T)
- money <- merge(money, m_minus, by = 'customer_id', all.x = T)
- sum(is.na(money))
- money[is.na(money)] = 0
- money[, rich := NULL]
- rm(list = c('m_plus', 'm_minus'))
- # фичи по комбинации customer_id, mcc_code, tr_type
- tmp <- unique(tran[, .(mean_val = .N / dur), by = .(customer_id, mcc_code, tr_type)])
- pred <- dcast(tmp, customer_id ~ mcc_code + tr_type, value.var = 'mean_val', fill = 0)
- rm(list = c('tmp', 'tran'))
- # сливаем все вместе
- colnames(pred)[2:length(colnames(pred))] <- paste('mcc_tr', colnames(pred)[2:length(colnames(pred))], sep = '_')
- pred <- merge(pred, money, by = 'customer_id', all.x = T)
- pred <- merge(pred, cust, by = 'customer_id', all.x = T)
- pred <- merge(pred, dw, by = 'customer_id', all.x = T)
- # удаление столбцов с маленькой суммой
- tmp <- colSums(pred)
- n_col <- names(tmp[abs(tmp) < 0.01])
- pred[, (n_col) := NULL]
- # делаем трейн и тест
- X <- pred[!is.na(gender)]
- y <- X$gender
- X[, gender := NULL]
- X_pred <- pred[is.na(gender), -c('gender'), with = FALSE]
- c_id <- X_pred$customer_id
- X[, customer_id := NULL]
- X_pred[, customer_id := NULL]
- rm(list = c('cust', 'pred', 'money'))
- # scale, ухудшает результат
- #preProc <- preProcess(X, method=c("center", "scale"))
- #X <- predict(preProc, X)
- #X_pred <- predict(preProc, X_pred)
- # тюнинг xgboost, тут уже итоговые параметры
- xgbGrid <- expand.grid(
- nrounds = 200, #OK
- max_depth = 6, #OK
- eta = 0.2,
- gamma = 6, #OK
- colsample_bytree = 0.1, #OK
- min_child_weight = 12 #OK
- )
- fitControl <- trainControl(method = "cv", number = 3)
- m1 <- train(X, as.factor(y),
- method = 'xgbTree',
- trControl = fitControl,
- metric = "auc",
- tuneGrid = xgbGrid
- )
- m1$bestTune
- # строим итоговую модель
- k <- 256 # дальнейшее увеличение не улучшает модель
- param <- list(
- max_depth = 6,
- eta = 0.2/k,
- gamma = 6,
- colsample_bytree = 0.1,
- min_child_weight = 12,
- subsample = 0.7,
- objective = 'binary:logistic',
- eval_metric = "auc"
- )
- model <- xgboost(data = as.matrix(X), label = y, params = param, nrounds = 200*k, print_every_n = 500)
- f_imp <- xgb.importance(feature_names = colnames(as.matrix(X)), model = model)
- xgb.plot.importance(f_imp[Gain > 0.01])
- res <- predict(model, as.matrix(X_pred))
- ans <- data.frame(c_id, res)
- colnames(ans) <- c('customer_id', 'gender')
- write.csv(ans, 'r_xgb_mcc_tr_dw_pm.csv', quote = F, row.names = F)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement