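# Early-termination prediction pipeline for a loan/contract dataset.
# Column names are Croatian; as far as the code shows: PRIJEVREMENI_RASKID =
# early termination (the target, mapped Y/N -> 1/0), PRODULJIVANJE = extension,
# PLANIRANO_TRAJANJE(m) = planned duration in months, OSTALI_UGOVORI /
# OSTALI_RASKIDI = the client's other contracts / other early terminations.
# Steps: join external monthly features, engineer interaction features, impute
# missing durations with LightGBM, then fit XGBoost classifiers.
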
import pandas as pd
import numpy as np
import random as rnd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression, Lasso, ElasticNet
from sklearn.svm import SVC, LinearSVC, SVR
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold, KFold

# import torch
# import torch.nn as nn
# import torch.nn.functional as F
# import torch.optim as optim

from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMRegressor
import xgboost as xgb
import xgbfir

import shap
shap.initjs()

import category_encoders as ce

from catboost import CatBoostClassifier

from hyperopt import hp, tpe, STATUS_OK, Trials, fmin, space_eval

pd.set_option('display.max_columns', None)

'''
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()

        self.dr0 = nn.Dropout(p=0.1)

        self.fc1 = nn.Linear(254, 150)
        self.dr1 = nn.Dropout(p=0.35)

        self.fc2 = nn.Linear(150, 50)
        self.dr2 = nn.Dropout(p=0.1)

        self.fc3 = nn.Linear(50, 2)

        self.i1 = nn.Parameter(torch.tensor(0.4806), requires_grad=True)
        self.e1 = nn.Parameter(torch.tensor(0.3389), requires_grad=True)
        self.t1 = nn.Parameter(torch.tensor(-0.5508), requires_grad=True)

        self.i2 = nn.Parameter(torch.tensor(0.5739), requires_grad=True)
        self.e2 = nn.Parameter(torch.tensor(0.4860), requires_grad=True)
        self.t2 = nn.Parameter(torch.tensor(-0.2934), requires_grad=True)

    def forward(self, x):
        x = x.view(-1, 254)

        x = self.dr0(x)

        x = self.fc1(x)
        # learned mix of identity, SELU and tanh activations
        x = self.i1 * x + self.e1 * F.selu(x) + self.t1 * torch.tanh(x)
        x = self.dr1(x)

        x = self.fc2(x)
        x = self.i2 * x + self.e2 * F.selu(x) + self.t2 * torch.tanh(x)
        x = self.dr2(x)

        x = self.fc3(x)

        return F.log_softmax(x, dim=1)


class NN:
    def __init__(self, epochs, batch_size=32):
        self.model = Net()
        self.epochs = epochs

        self.batch_size = batch_size
        self.optimizer = optim.SGD(self.model.parameters(), lr=0.001, momentum=0.7)

    def train_batch(self, data, target):
        self.model.train()
        self.optimizer.zero_grad()
        output = self.model(data)

        loss = F.nll_loss(output, target.type(torch.int64))
        loss.backward()
        self.optimizer.step()

    def test(self, X, y, epoch):
        self.model.eval()
        loss = 0.0
        correct = 0

        weight = torch.Tensor([1, 3])
        with torch.no_grad():
            for data, target in zip(X, y):
                data = torch.Tensor(data)
                target = torch.Tensor(target)
                output = self.model(data.unsqueeze(1))
                target = target.type(torch.int64)
                loss += F.nll_loss(output, target, reduction='sum', weight=weight).item()

                pred = output.argmax(1, keepdim=True)
                correct += pred.eq(target.view_as(pred)).sum().item()

        print('Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)'.format(
            loss / len(X), correct, len(X),
            100. * correct / len(y)))

    def fit(self, X, y, X_val, y_val):
        X = (X - X.mean(axis=0)) / X.std(axis=0)

        batches = len(X) // self.batch_size + 1
        for epoch in range(1, self.epochs + 1):
            for batch in range(batches):
                lo, hi = self.batch_size * batch, self.batch_size * (batch + 1)
                X_tens = torch.Tensor(X[lo:hi])
                y_tens = torch.Tensor(y[lo:hi])

                self.train_batch(X_tens, y_tens)
                # scheduler.batch_step()
                # print(optimizer.state_dict()['param_groups'][0]['lr'])

            X_val = (X_val - X_val.mean(axis=0)) / X_val.std(axis=0)
            X_test = torch.Tensor(X_val)
            y_test = torch.Tensor(y_val).unsqueeze(1)
            self.test(X_test, y_test, epoch)

    def pred(self, y):
        return self.model(y.unsqueeze(1))
'''

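# Convert the per-currency amount columns to a common scale using the VALUTA_*
# rate columns (assumption: VALUTA_2..VALUTA_5 hold the EUR/USD/CHF/GBP exchange
# rates and HRK is the base currency, which is why its line stays commented out).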
def set_currency(df):
    # df['HRK'] = df['HRK'] * df['VALUTA_1']
    df['EUR'] = df['EUR'] * df['VALUTA_2']
    df['USD'] = df['USD'] * df['VALUTA_3']
    df['CHF'] = df['CHF'] * df['VALUTA_4']
    df['GBP'] = df['GBP'] * df['VALUTA_5']
    df.drop(columns=['delta_EUR', 'delta_USD', 'delta_CHF', 'delta_GBP'], inplace=True)

def bucket_planirano_trajanje(df):
    # PLANIRANO_TRAJANJE(3m): planned duration in 3-month units, bucketed into
    # coarse ordinal codes -1..-5; the identical transform is applied to train and test.
    df.loc[np.isnan(df['PLANIRANO_TRAJANJE(m)']), 'PLANIRANO_TRAJANJE(m)'] = -3
    df['PLANIRANO_TRAJANJE(3m)'] = df['PLANIRANO_TRAJANJE(m)'] // 3
    df.loc[(df['PLANIRANO_TRAJANJE(3m)'] >= -1) & (df['PLANIRANO_TRAJANJE(3m)'] < 2), 'PLANIRANO_TRAJANJE(m)'] = -1
    df.loc[(df['PLANIRANO_TRAJANJE(3m)'] >= 2) & (df['PLANIRANO_TRAJANJE(3m)'] <= 6), 'PLANIRANO_TRAJANJE(m)'] = -2
    df.loc[(df['PLANIRANO_TRAJANJE(3m)'] >= 7) & (df['PLANIRANO_TRAJANJE(3m)'] <= 21), 'PLANIRANO_TRAJANJE(m)'] = -3
    df.loc[(df['PLANIRANO_TRAJANJE(3m)'] >= 22) & (df['PLANIRANO_TRAJANJE(3m)'] < 36), 'PLANIRANO_TRAJANJE(m)'] = -4
    df.loc[(df['PLANIRANO_TRAJANJE(3m)'] >= 36) & (df['PLANIRANO_TRAJANJE(3m)'] < 200), 'PLANIRANO_TRAJANJE(m)'] = -5
    df.drop(columns=['PLANIRANO_TRAJANJE(3m)'], inplace=True)

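# The two helpers below carve out a validation set with a chosen class balance
# (dist_yes), then additionally drop "overflow" negatives from the remaining
# training rows so the train class distribution hits wanted_train_distribution.
# The overflow expression is algebraically equivalent to
#   overflow = yes_rest + no_rest - yes_rest / wanted_train_distribution,
# where yes_rest / no_rest are the positives / negatives left after the split.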
def create_validation_from_df(df, validation_size, dist_yes=0.55, wanted_train_distribution=0.265673):
    dist_no = 1 - dist_yes

    len_yes = int(validation_size * dist_yes)
    len_no = int(validation_size * dist_no)

    yes = df[df['PRIJEVREMENI_RASKID'] == 1].index.tolist()
    no = df[df['PRIJEVREMENI_RASKID'] == 0].index.tolist()

    np.random.shuffle(yes)
    np.random.shuffle(no)

    validationN = df.loc[no[:len_no]]
    validationY = df.loc[yes[:len_yes]]

    current_dist = (len(yes) - len_yes) / (len(no) - len_no + len(yes) - len_yes)
    overflow = int((len(no) - len_no) * (1 - current_dist * (1 - wanted_train_distribution) / (1 - current_dist) / wanted_train_distribution))

    X_validation = pd.concat([validationN, validationY])
    y_validation = X_validation['PRIJEVREMENI_RASKID']
    X_validation.drop(columns=['PRIJEVREMENI_RASKID'], inplace=True)

    df.drop(no[:len_no + overflow], inplace=True)
    df.drop(yes[:len_yes], inplace=True)

    return df, X_validation, y_validation

def create_validation_xy(X, y, validation_size, dist_yes=0.55, wanted_train_distribution=0.265673):
    from sklearn.utils import shuffle
    dist_no = 1 - dist_yes

    X, y = shuffle(X, y)
    yes = np.where(y == 1)[0]
    no = np.where(y == 0)[0]

    len_yes = int(validation_size * dist_yes)
    len_no = int(validation_size * dist_no)

    X_val = np.concatenate([X[yes[:len_yes]], X[no[:len_no]]])
    y_val = np.concatenate([y[yes[:len_yes]], y[no[:len_no]]])
    X_val, y_val = shuffle(X_val, y_val)

    current_dist = (len(yes) - len_yes) / (len(yes) - len_yes + len(no) - len_no)
    overflow = int((len(no) - len_no) * (1 - current_dist * (1 - wanted_train_distribution) / (1 - current_dist) / wanted_train_distribution))

    X_train = np.concatenate([X[yes[len_yes:]], X[no[len_no + overflow:]]])
    y_train = np.concatenate([y[yes[len_yes:]], y[no[len_no + overflow:]]])
    X_train, y_train = shuffle(X_train, y_train)
    # print('distribution in train:', y_train.sum() / len(y_train))

    return X_train, y_train, X_val, y_val

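# Note: cross_val_acc is not k-fold CV. Each iteration draws a fresh random
# rebalanced split via create_validation_xy, so the n scores come from
# overlapping samples (repeated random subsampling validation).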
def cross_val_acc(clf, X, y, n=5, dist_yes=0.55, wanted_train_distribution=0.265673):
    from sklearn.metrics import f1_score, accuracy_score
    from sklearn.utils import shuffle
    dist_no = 1 - dist_yes

    X, y = shuffle(X, y)

    results = []
    f1s = []
    batch_size = (len(X) // n + 1) // 3
    for i in range(n):
        X_train, y_train, X_val, y_val = create_validation_xy(X, y, batch_size, dist_yes, wanted_train_distribution)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_val)

        # print(y_val.sum() / len(y_val), y_pred.sum() / len(y_pred))
        results.append(accuracy_score(y_val, y_pred))
        f1s.append(f1_score(y_val, y_pred))
        print('Done: {} / {}'.format(i + 1, n))
    return results, f1s

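# Load the competition data and join the external monthly features on
# GODINA_MJESEC (year-month key): each external column becomes a per-row
# feature via a {year-month: value} lookup.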
ft = pd.read_csv('../featuri/BitniVanjskiFeaturi7.csv')
train = pd.read_csv('train.csv')
train['PRIJEVREMENI_RASKID'] = train['PRIJEVREMENI_RASKID'].map({'Y': 1, 'N': 0}).astype(int)

test = pd.read_csv('test.csv')
test.drop(columns=['PRIJEVREMENI_RASKID'], inplace=True)

dates = ft['GODINA_MJESEC'].tolist()
for col in ft:
    feat = {}
    if col == 'GODINA_MJESEC':
        continue
    for i, value in enumerate(ft[col]):
        feat[dates[i]] = value

    train[col] = train['GODINA_MJESEC'].map(feat)
    test[col] = test['GODINA_MJESEC'].map(feat)

print(train.shape)

def g(x):
    x = (x - 1995) / 23
    x -= 0.02
    return (x / 2.2 - 0.5) * np.cos(x * 5.6 * np.pi) - (x / 5)

# x = np.arange(1995, 2019, 1)
# y = g(x)
# plt.subplot(3, 1, 1)
# sns.distplot(train['GODINA'])
# plt.subplot(3, 1, 2)
# sns.barplot(x='GODINA', y='PRIJEVREMENI_RASKID', data=train)
# plt.subplot(3, 1, 3)
# plt.plot(x, y, linewidth=2.0)
#
# plt.show()

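# f_RASKID: hand-crafted risk score from the client's other contracts, i.e.
# (termination rate - 0.27) scaled by the cube root of the contract count, so
# the signal grows with the number of contracts it is based on. Multiplicative
# Gaussian noise is applied on train only, presumably as regularization against
# overfitting this feature.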
train['f_RASKID'] = 0
train.loc[train['OSTALI_UGOVORI'] > 0, 'f_RASKID'] = \
    (train[train['OSTALI_UGOVORI'] > 0]['OSTALI_RASKIDI'] /
     train[train['OSTALI_UGOVORI'] > 0]['OSTALI_UGOVORI'] - 0.27) * \
    np.power(train[train['OSTALI_UGOVORI'] > 0]['OSTALI_UGOVORI'], 1 / 3)

test['f_RASKID'] = 0
test.loc[test['OSTALI_UGOVORI'] > 0, 'f_RASKID'] = \
    (test[test['OSTALI_UGOVORI'] > 0]['OSTALI_RASKIDI'] /
     test[test['OSTALI_UGOVORI'] > 0]['OSTALI_UGOVORI'] - 0.27) * \
    np.power(test[test['OSTALI_UGOVORI'] > 0]['OSTALI_UGOVORI'], 1 / 3)

from scipy.stats import boxcox_normmax, boxcox  # only used by commented-out experiments

# train['noise'] = np.random.normal(1, 0.12, size=len(train))
train['f_RASKID'] *= np.random.normal(1, 0.12, size=len(train))

train.loc[train['OSTALI_UGOVORI'] >= 10, 'OSTALI_UGOVORI'] = 10

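# Pairwise categorical interactions (e.g. 'TIP_KAMATE+VRSTA_KLIJENTA'), then
# one-hot encoding of both the interaction columns and the base categoricals.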
combinations = [('TIP_KAMATE', 'VRSTA_KLIJENTA'),
                ('TIP_KAMATE', 'VRSTA_PROIZVODA'),
                ('VALUTA', 'VRSTA_KLIJENTA'),
                ('VALUTA', 'VRSTA_PROIZVODA'),
                ]

comb_columns = []
for pair in combinations:
    train[pair[0] + '+' + pair[1]] = train[pair[0]].astype(str) + ', ' + train[pair[1]].astype(str)
    test[pair[0] + '+' + pair[1]] = test[pair[0]].astype(str) + ', ' + test[pair[1]].astype(str)
    comb_columns.append(pair[0] + '+' + pair[1])

train = pd.get_dummies(train, columns=comb_columns, drop_first=False)
test = pd.get_dummies(test, columns=comb_columns, drop_first=False)

train = pd.get_dummies(train, columns=['UGOVORENI_IZNOS', 'VALUTA', 'VRSTA_KLIJENTA', 'PROIZVOD', 'VRSTA_PROIZVODA',
                                       'VISINA_KAMATE', 'TIP_KAMATE', 'STAROST'], drop_first=False)
test = pd.get_dummies(test, columns=['UGOVORENI_IZNOS', 'VALUTA', 'VRSTA_KLIJENTA', 'PROIZVOD', 'VRSTA_PROIZVODA',
                                     'VISINA_KAMATE', 'TIP_KAMATE', 'STAROST'], drop_first=False)

set_currency(train)
set_currency(test)

# Flag columns that are non-zero in fewer than 3000 training rows (rare dummies).
drop_from_train = []
drop_from_test = []
for col in train:
    if (train[col] != 0).sum() < 3000:
        drop_from_train.append(col)
        if col in test.columns.values:
            drop_from_test.append(col)

# train.drop(columns=['VALUTA_2', 'VALUTA_3', 'VALUTA_4', 'VALUTA_5', 'VRSTA_KLIJENTA_1110', 'VRSTA_KLIJENTA_1120',
#                     'VRSTA_KLIJENTA_1420', 'VRSTA_KLIJENTA_1610', 'VRSTA_PROIZVODA_L', 'GBP'],  # 'VRSTA_KLIJENTA_1320',
#            inplace=True)
# test.drop(columns=['VALUTA_2', 'VALUTA_3', 'VALUTA_4', 'VALUTA_5', 'VRSTA_KLIJENTA_1110', 'VRSTA_KLIJENTA_1120',
#                    'VRSTA_KLIJENTA_1420', 'VRSTA_KLIJENTA_1610', 'VRSTA_PROIZVODA_L', 'GBP'],  # 'VRSTA_KLIJENTA_1320',
#           inplace=True)

train.drop(columns=drop_from_train, inplace=True)
test.drop(columns=drop_from_test, inplace=True)
train.drop(columns=['KLIJENT_ID', 'OZNAKA_PARTIJE', 'GODINA_MJESEC'], inplace=True)
test.drop(columns=['KLIJENT_ID', 'OZNAKA_PARTIJE', 'GODINA_MJESEC'], inplace=True)

# FREQUENCY ENCODING
# for col in comb_columns:
#     freq = {}
#     for val in train[col].unique():
#         freq[val] = (train[col] == val).sum() / len(train)
#     train[col] = train[col].map(freq)
#     test[col] = test[col].map(freq)
#
# for col in ['UGOVORENI_IZNOS', 'VALUTA', 'VRSTA_KLIJENTA', 'PROIZVOD', 'VRSTA_PROIZVODA', 'VISINA_KAMATE', 'TIP_KAMATE',
#             'GODINA', 'STAROST']:
#     freq = {}
#     for val in train[col].unique():
#         freq[val] = (train[col] == val).sum() / len(train)
#     train[col] = train[col].map(freq)
#     test[col] = test[col].map(freq)

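# Align train/test one-hot columns: dummy columns that only appeared in train
# are added to test as all zeros, and test-only columns are dropped.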
tr = set(train.columns.values)
tst = set(test.columns.values)

for x in tr.difference(tst):
    if x != 'PRIJEVREMENI_RASKID' and x != 'PRODULJIVANJE':
        test[x] = 0

tr = set(train.columns.values)
tst = set(test.columns.values)

print(tst.difference(tr))
print(tr.difference(tst))
test.drop(columns=list(tst.difference(tr)), inplace=True)

# y = train['PRIJEVREMENI_RASKID']
# produljivanje = train['PRODULJIVANJE']
# train.drop(columns=['PRIJEVREMENI_RASKID', 'PRODULJIVANJE'], inplace=True)
#
# encoder = ce.target_encoder.TargetEncoder(cols=comb_columns, smoothing=2, min_samples_leaf=4000)
# encoder.fit(train, y)
#
# train = encoder.transform(train)
# test = encoder.transform(test)
#
# train['PRODULJIVANJE'] = produljivanje
# train['PRIJEVREMENI_RASKID'] = y

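# Impute missing PLANIRANO_TRAJANJE(m) in train with a LightGBM regressor fitted
# on all rows (train + test) where the value is present; the classification
# targets are excluded from the regressor's inputs, so no label leaks in here.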
test = test[train.drop(columns=['PRODULJIVANJE', 'PRIJEVREMENI_RASKID']).columns.values]

test1 = train.drop(columns=['PRIJEVREMENI_RASKID', 'PRODULJIVANJE', 'PLANIRANO_TRAJANJE(m)']).loc[np.isnan(train['PLANIRANO_TRAJANJE(m)'])].values
test2 = test.drop(columns='PLANIRANO_TRAJANJE(m)').loc[np.isnan(test['PLANIRANO_TRAJANJE(m)'])].values

train1 = train.drop(columns=['PRIJEVREMENI_RASKID', 'PRODULJIVANJE']).loc[~np.isnan(train['PLANIRANO_TRAJANJE(m)'])]
train2 = test.loc[~np.isnan(test['PLANIRANO_TRAJANJE(m)'])]

df = pd.concat([train1, train2], sort=False)

X_train = df.drop(columns='PLANIRANO_TRAJANJE(m)').values
y_train = df['PLANIRANO_TRAJANJE(m)'].values

lgbm = LGBMRegressor(n_estimators=200, n_jobs=6)

kfolds = KFold(shuffle=True, n_splits=5)
# mse = -cross_val_score(lgbm, X_train, y_train, scoring="neg_mean_squared_error", cv=kfolds, n_jobs=3, verbose=3)
# print(np.mean(mse), np.std(mse))

lgbm.fit(X_train, y_train)
print(X_train.shape, test1.shape)
pred_train = lgbm.predict(test1)
train.loc[np.isnan(train['PLANIRANO_TRAJANJE(m)']), 'PLANIRANO_TRAJANJE(m)'] = pred_train

train['PLANIRANO_TRAJANJE(m)'] = train['PLANIRANO_TRAJANJE(m)'].astype(int)

# test has 0 missing values in PLANIRANO_TRAJANJE, so no imputation is needed there
# pred_test = lgbm.predict(test2)
# test.loc[np.isnan(test['PLANIRANO_TRAJANJE(m)']), 'PLANIRANO_TRAJANJE(m)'] = pred_test

bucket_planirano_trajanje(train)
bucket_planirano_trajanje(test)

train['PLANIRANO_TRAJANJE(m)'] *= -1
test['PLANIRANO_TRAJANJE(m)'] *= -1

train['f2'] = train['f_RASKID'] + train['PLANIRANO_TRAJANJE(m)'] / 3
test['f2'] = test['f_RASKID'] + test['PLANIRANO_TRAJANJE(m)'] / 3

train['f3'] = train['f2'] + (2018 - train['GODINA']) / 20
test['f3'] = test['f2'] + (2018 - test['GODINA']) / 20

test = test[train.drop(columns=['PRODULJIVANJE', 'PRIJEVREMENI_RASKID']).columns.values]
X_test = test.values

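# Stage 1: train a separate XGBoost model for PRODULJIVANJE (extension) and
# predict it for test, so PRODULJIVANJE can be used as an input feature (f4 and
# the interaction features below) when predicting the main target.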
X = train.drop(columns=['PRODULJIVANJE', 'PRIJEVREMENI_RASKID']).values
y = train['PRODULJIVANJE'].values

clf1 = XGBClassifier(n_jobs=8, n_estimators=200, scale_pos_weight=1.3)
# print(cross_val_acc(clf1, X, y, 5, dist_yes=0.151, wanted_train_distribution=0.151))

clf1.fit(X, y)

y_pred1 = clf1.predict(X_test)
test['PRODULJIVANJE'] = y_pred1
print(test['PRODULJIVANJE'].value_counts() / len(test))

train['f4'] = train['f3'] + train['PRODULJIVANJE'] - 0.5
test['f4'] = test['f3'] + test['PRODULJIVANJE'] - 0.5

train['f2+f4'] = train['f2'] + train['f4']
test['f2+f4'] = test['f2'] + test['f4']

train['f2*f4'] = train['f2'] * train['f4']
test['f2*f4'] = test['f2'] * test['f4']

# train['f2-f4'] = train['f2'] - train['f4']
# test['f2-f4'] = test['f2'] - test['f4']

train['f4+mj'] = train['f4'] + train['MJESEC'] / 12
test['f4+mj'] = test['f4'] + test['MJESEC'] / 12

train['f4*mj'] = train['f4'] * train['MJESEC'] / 12
test['f4*mj'] = test['f4'] * test['MJESEC'] / 12

train['f4+dan'] = train['f4'] + train['dan'] / 20
test['f4+dan'] = test['f4'] + test['dan'] / 20

train['f4*dan'] = train['f4'] * train['dan'] / 20
test['f4*dan'] = test['f4'] * test['dan'] / 20

# train['f2+f_raskid'] = train['f2'] + train['f_RASKID']
# test['f2+f_raskid'] = test['f2'] + test['f_RASKID']

# train['f2+f4|f4+dan'] = train['f2+f4'] + train['f4+dan']
# test['f2+f4|f4+dan'] = test['f2+f4'] + test['f4+dan']
#
# train['f2+f4||f4+dan'] = train['f2+f4'] * train['f4+dan']
# test['f2+f4||f4+dan'] = test['f2+f4'] * test['f4+dan']

train['OSTALI_RASKIDI+f4'] = train['OSTALI_RASKIDI'] / 8 + train['f4']
test['OSTALI_RASKIDI+f4'] = test['OSTALI_RASKIDI'] / 8 + test['f4']

train['OSTALI_UGOVORI+f2+f4'] = train['OSTALI_UGOVORI'] / 70 + train['f2+f4']
test['OSTALI_UGOVORI+f2+f4'] = test['OSTALI_UGOVORI'] / 70 + test['f2+f4']

train['OSTALI_UGOVORI * f2+f4'] = train['OSTALI_UGOVORI'] / 70 * train['f2+f4']
test['OSTALI_UGOVORI * f2+f4'] = test['OSTALI_UGOVORI'] / 70 * test['f2+f4']

# train['f2+f_raskid|f4+dan'] = train['f2+f_raskid'] + train['f4+dan']
# test['f2+f_raskid|f4+dan'] = test['f2+f_raskid'] + test['f4+dan']
#
# train['f2+f_raskid||f4+dan'] = train['f2+f_raskid'] * train['f4+dan']
# test['f2+f_raskid||f4+dan'] = test['f2+f_raskid'] * test['f4+dan']

funkcije = ['f_RASKID', 'f2', 'f3', 'f4', 'f2*f4', 'f2+f4', 'f4+mj', 'f4+dan', 'f4*mj', 'f4*dan',
            'OSTALI_RASKIDI+f4', 'OSTALI_UGOVORI+f2+f4', 'OSTALI_UGOVORI * f2+f4']

# for fj in funkcije:
#     raskid = train[train['PRIJEVREMENI_RASKID'] == 1][fj]
#     neraskid = train[train['PRIJEVREMENI_RASKID'] == 0][fj]
#
#     sns.distplot(raskid, color='red', kde=True, hist=False, kde_kws={'shade': True, 'linewidth': 1.2}, label='raskid')
#     sns.distplot(neraskid, color='blue', kde=True, hist=False, kde_kws={'shade': True, 'linewidth': 1.2}, label='neraskid')
#     plt.show()

order = train.columns.values.tolist()
order.remove('PRIJEVREMENI_RASKID')
test = test[order]

# train.drop(columns='f_RASKID', inplace=True)
# test.drop(columns='f_RASKID', inplace=True)

X_test = test.values

X = train.drop(columns=['PRIJEVREMENI_RASKID']).values
y = train['PRIJEVREMENI_RASKID'].values

# neural = NN(epochs=100)
# X_train, y_train, X_val, y_val = create_validation_xy(X, y, 20000)
# neural.fit(X, y, X_val, y_val)

clf = XGBClassifier(n_jobs=8, n_estimators=200, scale_pos_weight=3.1, learning_rate=0.05, max_depth=10)
# clf2 = XGBClassifier(n_jobs=8, n_estimators=150, scale_pos_weight=3.1, learning_rate=0.014, max_depth=6)
# clf3 = XGBClassifier(n_jobs=8, n_estimators=100, scale_pos_weight=3.1)
#
# acc, f1 = cross_val_acc(clf, X, y, 3)
# baseline_acc, baseline_f1 = np.mean(acc), np.mean(f1)
# print(baseline_acc, baseline_f1)
#
# acc, f1 = cross_val_acc(clf2, X, y, 3)
# baseline_acc, baseline_f1 = np.mean(acc), np.mean(f1)
# print(baseline_acc, baseline_f1)
#
# acc, f1 = cross_val_acc(clf3, X, y, 3)
# baseline_acc, baseline_f1 = np.mean(acc), np.mean(f1)
# print(baseline_acc, baseline_f1)

# text = open('rezultati.txt', 'a+')
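# Grid search over max_depth, log-spaced learning rates exp(-4)..exp(-1) and
# subsample, scored with the custom rebalanced validation in cross_val_acc.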
for n in [8, 10, 12]:
    for e in range(-4, 0):
        for sub in [0.5, 0.7, 0.9, 1]:
            lr = np.exp(e)
            X = train.drop(columns=['PRIJEVREMENI_RASKID']).values
            y = train['PRIJEVREMENI_RASKID'].values

            clf = XGBClassifier(n_jobs=9, n_estimators=230, scale_pos_weight=2.9, learning_rate=lr, max_depth=n, subsample=sub)
            acc, f1 = cross_val_acc(clf, X, y, 5)
            baseline_acc, baseline_f1 = np.mean(acc), np.mean(f1)
            print('max depth: {}, lr: {}, subsample: {}'.format(n, lr, sub), baseline_acc, baseline_f1)

exit()  # NOTE: everything below is skipped while this exit() is in place
# text.close()

# exit()
# perm = pd.DataFrame(columns=['Feature', 'Accuracy', 'F1 score'])
# for i, col in enumerate(X.transpose()):
#     temp = np.random.permutation(col)
#     X_perm = X.transpose().copy()
#     X_perm[i] = temp
#     X_perm = X_perm.transpose()
#
#     acc, f1 = cross_val_acc(clf, X_perm, y, 4)
#     accuracy = round(baseline_acc - np.mean(acc), 7)
#     f1score = round(baseline_f1 - np.mean(f1), 7)
#
#     acc, f1 = cross_val_acc(clf, X_perm, y, 4)
#     accuracy += round(baseline_acc - np.mean(acc), 7)
#     f1score += round(baseline_f1 - np.mean(f1), 7)
#
#     accuracy = round(accuracy / 2, 5)
#     f1score = round(f1score / 2, 5)
#
#     perm = perm.append({'Feature': l[i], 'Accuracy': accuracy, 'F1 score': f1score}, ignore_index=True)
#     print("{} / {}, {}: {}, {}".format(i + 1, 157, l[i], accuracy, f1score))
#
# perm.to_excel('Permutation_importance.xlsx', index=False)
# print(perm.head())

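# Final fit plus two explainability exports: xgbfir writes ranked feature
# interactions to xgbi.xlsx, and mean |SHAP| per feature goes to shap.xlsx.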
print('started fitting')
clf.fit(X, y)
xgbfir.saveXgbFI(clf, OutputXlsxFile='xgbi.xlsx', feature_names=train.drop(columns='PRIJEVREMENI_RASKID').columns.values)

# clf2.fit(X, y)
# clf3.fit(X, y)

print('computing SHAP values')
explainer = shap.TreeExplainer(clf)
shap_values = explainer.shap_values(X)  # shape: (n_samples, n_features)

# mean |SHAP| per feature: index columns of shap_values, not rows
rows = [{'feature': el, 'SHAP value': np.mean(np.abs(shap_values[:, i]))}
        for i, el in enumerate(train.drop(columns='PRIJEVREMENI_RASKID').columns.values)]
df = pd.DataFrame(rows, columns=['feature', 'SHAP value'])

df.to_excel('shap.xlsx', index=False)

# shap.summary_plot(shap_values, X, plot_type="bar", feature_names=train.drop(columns=['PRIJEVREMENI_RASKID']).columns.values)

# clf2.fit(X, y)
#
# plt.bar(range(len(clf.feature_importances_)), clf.feature_importances_)
# plt.show()

# d = {}
# zero_importance_features = []
# for i, el in enumerate(train.drop(columns='PRIJEVREMENI_RASKID').columns.values):
#     if clf.feature_importances_[i] == 0:  # and clf2.feature_importances_[i] == 0:
#         zero_importance_features.append(el)
#     # if clf.feature_importances_[i] != 0:
#     #     d[clf.feature_importances_[i]] = el
#     # else:
#     #     zero_importance_features.append(el)
#
# print(zero_importance_features)

# fi = np.sort(clf.feature_importances_)

# for el in zero_importance:
#     print(el, 0.0)
#
# for el in fi:
#     if el == 0:
#         continue
#     s = ' '
#     k = d[el] + s
#     k = k[:34]
#     print(k, round(el, 5))

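# Predict on the evaluation set and write the submission; predictions are
# mapped back to the original 'Y'/'N' labels.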
y_pred = clf.predict(X_test)
# y_pred2 = clf2.predict(X_test)
# y_pred3 = clf3.predict(X_test)

test = pd.read_excel('eval_dataset_nan.xlsx')

# test['PRIJEVREMENI_RASKID'] = np.round((y_pred + y_pred2 + y_pred3) / 3)
test['PRIJEVREMENI_RASKID'] = y_pred
test['PRIJEVREMENI_RASKID'] = test['PRIJEVREMENI_RASKID'].map({1: 'Y', 0: 'N'})

test.to_csv('student.csv')

print(test['PRIJEVREMENI_RASKID'].value_counts() / len(test))

# def hyperopt_train_test(params):
#     clf = XGBClassifier(**params)
#     acc, f1 = cross_val_acc(clf, X, y, 5)
#     acc = np.mean(acc)
#     return -acc
#
# space = {
#     'n_estimators': 500,
#     'max_depth': hp.choice('max_depth', range(2, 30)),
#     # 'learning_rate': hp.uniform('learning_rate', 0.1, 2),
#     # 'subsample': hp.uniform('subsample', 0.3, 1),
#     'scale_pos_weight': 2.762,
#     'n_jobs': -1
# }
#
#
# def f(params):
#     acc = hyperopt_train_test(params)
#     return {'loss': acc, 'status': STATUS_OK}
#
#
# trials = Trials()
# best = fmin(f, space, algo=tpe.suggest, max_evals=20, trials=trials)
#
# print(space_eval(space, best))
# exit()