Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# --- Imports: modelling stack (sklearn / xgboost / lightgbm / catboost),
# --- interpretation tools (shap, xgbfir) and hyper-parameter search (hyperopt).
import pandas as pd
import numpy as np
import random as rnd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC, SVR
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import Lasso, ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold, KFold
# Disabled PyTorch experiment (see the string-literal block below).
#import torch
#import torch.nn as nn
#import torch.nn.functional as F
#import torch.optim as optim
import pandas as pd  # NOTE(review): duplicate import of pandas (already imported above)
from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMRegressor
import xgboost as xgb
import xgbfir
import shap
shap.initjs()  # enable SHAP's JS visualisations (only has an effect in a notebook)
import category_encoders as ce
from catboost import CatBoostClassifier
# NOTE(review): tpe and hp are each imported twice in this line (harmless).
from hyperopt import hp, tpe, STATUS_OK, Trials, fmin, tpe, hp, space_eval
pd.set_option('display.max_columns', None)  # show all columns when printing frames
- '''
- class Net(nn.Module):
- def __init__(self):
- super(Net, self).__init__()
- self.dr0 = nn.Dropout(p=0.1)
- self.fc1 = nn.Linear(254, 150)
- self.dr1 = nn.Dropout(p=0.35)
- self.fc2 = nn.Linear(150, 50)
- self.dr2 = nn.Dropout(p=0.1)
- self.fc3 = nn.Linear(50, 2)
- self.i1 = nn.Parameter(torch.tensor(0.4806), requires_grad=True)
- self.e1 = nn.Parameter(torch.tensor(0.3389), requires_grad=True)
- self.t1 = nn.Parameter(torch.tensor(-0.5508), requires_grad=True)
- self.i2 = nn.Parameter(torch.tensor(0.5739), requires_grad=True)
- self.e2 = nn.Parameter(torch.tensor(0.4860), requires_grad=True)
- self.t2 = nn.Parameter(torch.tensor(-0.2934), requires_grad=True)
- def forward(self, x):
- x = x.view(-1, 254)
- x = self.dr0(x)
- x = self.fc1(x)
- x = self.i1 * x + self.e1 * F.selu(x) + self.t1 * torch.tanh(x)
- x = self.dr1(x)
- x = self.fc2(x)
- x = self.i2 * x + self.e2 * F.selu(x) + self.t2 * torch.tanh(x)
- x = self.dr2(x)
- x = self.fc3(x)
- return F.log_softmax(x, dim=1)
- class NN:
- def __init__(self, epochs, batch_size=32):
- self.model = Net()
- self.epochs = epochs
- self.batch_size = batch_size
- self.optimizer = optim.SGD(self.model.parameters(), lr=0.001, momentum=0.7)
- def train_batch(self, data, target):
- self.model.train()
- self.optimizer.zero_grad()
- output = self.model(data)
- loss = F.nll_loss(output, target.type(torch.int64))
- loss.backward()
- self.optimizer.step()
- def test(self, X, y, epoch):
- self.model.eval()
- loss = 0.0
- correct = 0
- weight = torch.Tensor([1, 3])
- with torch.no_grad():
- for data, target in zip(X, y):
- data = torch.Tensor(data)
- target = torch.Tensor(target)
- output = self.model(data.unsqueeze(1))
- target = target.type(torch.int64)
- loss += F.nll_loss(output, target, reduction='sum', weight=weight).item()
- pred = output.argmax(1, keepdim=True)
- correct += pred.eq(target.view_as(pred)).sum().item()
- print('Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)'.format(
- loss, correct, len(X),
- 100. * correct / len(y)))
- def fit(self, X, y, X_val, y_val):
- X = (X - X.mean(axis=0)) / X.std(axis=0)
- batches = (len(X)) // self.batch_size + 1
- for epoch in range(1, self.epochs + 1):
- for batch in range(batches):
- lo, hi = self.batch_size * batch, self.batch_size * (batch + 1)
- X_tens = torch.Tensor(X[lo: hi])
- y_tens = torch.Tensor(y[lo: hi])
- self.train_batch(X_tens, y_tens)
- # scheduler.batch_step()
- # print(optimizer.state_dict()['param_groups'][0]['lr'])
- X_val = (X_val - X_val.mean(axis=0)) / X_val.std(axis=0)
- X_test = torch.Tensor(X_val)
- y_test = torch.Tensor(y_val).unsqueeze(1)
- self.test(X_test, y_test, epoch)
- def pred(self, y):
- return self.model(y.unsqueeze(1))
- '''
def set_currency(df):
    """Multiply each foreign-currency amount column by its VALUTA_* rate, in place.

    Presumably converts the per-currency amounts into a common currency via the
    monthly exchange-rate columns — TODO confirm against the data dictionary.
    Also drops the delta_* helper columns afterwards.
    """
    # HRK would use VALUTA_1, but is deliberately left unconverted.
    # df['HRK'] = df['HRK'] * df['VALUTA_1']
    for amount_col, rate_col in (('EUR', 'VALUTA_2'),
                                 ('USD', 'VALUTA_3'),
                                 ('CHF', 'VALUTA_4'),
                                 ('GBP', 'VALUTA_5')):
        df[amount_col] *= df[rate_col]
    # The delta_* columns are discarded once the amounts are converted.
    df.drop(columns=['delta_EUR', 'delta_USD', 'delta_CHF', 'delta_GBP'], inplace=True)
def planirano_trajanje_train(train):
    """Coarsely bin PLANIRANO_TRAJANJE(m) (planned duration in months) in place.

    Missing durations are first filled with -3, then the duration is mapped to
    one of five negative ordinal codes based on its quarter count (months // 3).
    """
    col = 'PLANIRANO_TRAJANJE(m)'
    train.loc[np.isnan(train[col]), col] = -3
    # Quarter count; floor division always yields integer-valued results, so the
    # half-open ranges below match the original inclusive boundaries exactly.
    quarters = train[col] // 3
    for lo, hi, code in ((-1, 2, -1),
                         (2, 7, -2),
                         (7, 22, -3),
                         (22, 36, -4),
                         (36, 200, -5)):
        train.loc[(quarters >= lo) & (quarters < hi), col] = code
def planirano_trajanje_test(test):
    """Coarsely bin PLANIRANO_TRAJANJE(m) in place (same binning as the train helper).

    Missing durations are first filled with -3, then the duration is mapped to
    one of five negative ordinal codes based on its quarter count (months // 3).
    """
    col = 'PLANIRANO_TRAJANJE(m)'
    test.loc[np.isnan(test[col]), col] = -3
    # Quarter count; floor division always yields integer-valued results, so the
    # half-open ranges below match the original inclusive boundaries exactly.
    quarters = test[col] // 3
    for lo, hi, code in ((-1, 2, -1),
                         (2, 7, -2),
                         (7, 22, -3),
                         (22, 36, -4),
                         (36, 200, -5)):
        test.loc[(quarters >= lo) & (quarters < hi), col] = code
- def create_validation_from_df(df, validation_size, dist_yes=0.55, wanted_train_distribution=0.265673):
- dist_no = 1 - dist_yes
- len_yes = int(validation_size * dist_yes)
- len_no = int(validation_size * dist_no)
- yes = df[df['PRIJEVREMENI_RASKID'] == 1].index.tolist()
- no = df[df['PRIJEVREMENI_RASKID'] == 0].index.tolist()
- np.random.shuffle(yes)
- np.random.shuffle(no)
- validationN = df.loc[no[:len_no]]
- validationY = df.loc[yes[:len_yes]]
- current_dist = (len(yes) - len_yes) / (len(no) - len_no + len(yes) - len_yes)
- overflow = int((len(no) - len_no) * (1 - current_dist * (1 - wanted_train_distribution) / (1 - current_dist) / wanted_train_distribution))
- X_validation = pd.concat([validationN, validationY])
- y_validation = X_validation['PRIJEVREMENI_RASKID']
- X_validation.drop(columns=['PRIJEVREMENI_RASKID'], inplace=True)
- df.drop(no[:len_no + overflow], inplace=True)
- df.drop(yes[:len_yes], inplace=True)
- return df, X_validation, y_validation
def create_validation_xy(X, y, validation_size, dist_yes=0.55, wanted_train_distribution=0.265673):
    """Split arrays X, y into train/validation with controlled class mixes.

    The validation part gets roughly ``dist_yes`` positives; extra negatives are
    excluded from the train part so it approaches ``wanted_train_distribution``.
    Returns (X_train, y_train, X_val, y_val), each shuffled.
    """
    from sklearn.utils import shuffle
    # Three shuffle calls in this exact order keep seeded runs reproducible.
    X, y = shuffle(X, y)
    pos = np.where(y == 1)[0]
    neg = np.where(y == 0)[0]
    n_pos = int(validation_size * dist_yes)
    n_neg = int(validation_size * (1 - dist_yes))
    X_val = np.concatenate([X[pos[:n_pos]], X[neg[:n_neg]]])
    y_val = np.concatenate([y[pos[:n_pos]], y[neg[:n_neg]]])
    X_val, y_val = shuffle(X_val, y_val)
    # Positive fraction of what remains, and extra negatives to leave out so the
    # train part hits the wanted distribution.
    current_dist = (len(pos) - n_pos) / (len(pos) - n_pos + len(neg) - n_neg)
    overflow = int((len(neg) - n_neg) * (1 - current_dist * (1 - wanted_train_distribution) / (1 - current_dist) / wanted_train_distribution))
    X_train = np.concatenate([X[pos[n_pos:]], X[neg[n_neg + overflow:]]])
    y_train = np.concatenate([y[pos[n_pos:]], y[neg[n_neg + overflow:]]])
    X_train, y_train = shuffle(X_train, y_train)
    # print('dist u trainu:', y_train.sum() / len(y_train))
    return X_train, y_train, X_val, y_val
def cross_val_acc(clf, X, y, n=5, dist_yes=0.55, wanted_train_distribution=0.265673):
    """Custom n-fold CV using create_validation_xy for the split.

    Refits ``clf`` on each fold and collects accuracy and F1 on the fold's
    validation part. Returns (accuracies, f1_scores) as two lists of length n.
    """
    from sklearn.metrics import f1_score, accuracy_score
    from sklearn.utils import shuffle
    X, y = shuffle(X, y)
    accs, f1s = [], []
    # Each fold's validation size: roughly a third of an n-th of the data.
    fold_size = (len(X) // n + 1) // 3
    for fold in range(n):
        X_tr, y_tr, X_va, y_va = create_validation_xy(X, y, fold_size, dist_yes, wanted_train_distribution)
        clf.fit(X_tr, y_tr)
        preds = clf.predict(X_va)
        # print(y_val.sum() / len(y_val), y_pred.sum() / len(y_pred))
        accs.append(accuracy_score(y_va, preds))
        f1s.append(f1_score(y_va, preds))
        print('Done: {} / {}'.format(fold + 1, n))
    return accs, f1s
# --- Load data and join the external monthly features onto train/test by GODINA_MJESEC ---
ft = pd.read_csv('../featuri/BitniVanjskiFeaturi7.csv')
train = pd.read_csv('train.csv')
# Target arrives as 'Y'/'N'; encode to 1/0 for the classifiers.
train['PRIJEVREMENI_RASKID'] = train['PRIJEVREMENI_RASKID'].map({'Y': 1, 'N': 0}).astype(int)
test = pd.read_csv('test.csv')
test.drop(columns=['PRIJEVREMENI_RASKID'], inplace=True)
dates = ft['GODINA_MJESEC'].tolist()
# Map each external feature column to train/test rows via the month key.
for col in ft:
    feat = {}
    if col == 'GODINA_MJESEC':
        continue
    for i, value in enumerate(ft[col]):
        feat[dates[i]] = value
    train[col] = train['GODINA_MJESEC'].map(feat)
    test[col] = test['GODINA_MJESEC'].map(feat)
print(train.shape)
- def g(x):
- x = (x - 1995) / 23
- x -= 0.02
- return (x / 2.2 - 0.5) * np.cos(x * 5.6 * np.pi) - (x / 5)
# (disabled) Exploratory plots comparing g(x) against the yearly target rate.
#
# x = np.arange(1995, 2019, 1)
# y = g(x)
# plt.subplot(3, 1, 1)
# sns.distplot(train['GODINA'])
# plt.subplot(3, 1, 2)
# sns.barplot(x='GODINA', y='PRIJEVREMENI_RASKID', data=train)
# plt.subplot(3, 1, 3)
# plt.plot(x, y, linewidth=2.0)
#
# plt.show()

# f_RASKID: per-client early-termination tendency — (share of the client's other
# contracts terminated early, minus a 0.27 base rate) scaled by the cube root of
# the number of other contracts. Rows with no other contracts stay at 0.
train['f_RASKID'] = 0
train.loc[
    train['OSTALI_UGOVORI'] > 0, 'f_RASKID'] = (train[train['OSTALI_UGOVORI'] > 0]['OSTALI_RASKIDI'] /
                                                train[train['OSTALI_UGOVORI'] > 0]['OSTALI_UGOVORI'] - 0.27) * \
                                               np.power(train[train['OSTALI_UGOVORI'] > 0]['OSTALI_UGOVORI'], 1 / 3)
test['f_RASKID'] = 0
test.loc[
    test['OSTALI_UGOVORI'] > 0, 'f_RASKID'] = (test[test['OSTALI_UGOVORI'] > 0]['OSTALI_RASKIDI'] /
                                               test[test['OSTALI_UGOVORI'] > 0]['OSTALI_UGOVORI'] - 0.27) * \
                                              np.power(test[test['OSTALI_UGOVORI'] > 0]['OSTALI_UGOVORI'], 1 / 3)
from scipy.stats import boxcox_normmax, boxcox
# Multiplicative jitter applied to the TRAIN feature only — presumably a
# regularizer against overfitting this engineered feature; TODO confirm intent.
# train['noise'] = np.random.normal(1, 0.12, size=len(train))
train['f_RASKID'] *= np.random.normal(1, 0.12, size=len(train))
# Cap the contract count at 10. NOTE(review): applied to train only, not test.
train.loc[train['OSTALI_UGOVORI'] >= 10, 'OSTALI_UGOVORI'] = 10

# Pairwise categorical interaction columns, one-hot encoded below.
combinations = [('TIP_KAMATE', 'VRSTA_KLIJENTA'),
                ('TIP_KAMATE', 'VRSTA_PROIZVODA'),
                ('VALUTA', 'VRSTA_KLIJENTA'),
                ('VALUTA', 'VRSTA_PROIZVODA'),
                ]
comb_columns = []
for pair in combinations:
    train[pair[0] + '+' + pair[1]] = train[pair[0]].astype(str) + ', ' + train[pair[1]].astype(str)
    test[pair[0] + '+' + pair[1]] = test[pair[0]].astype(str) + ', ' + test[pair[1]].astype(str)
    comb_columns.append(pair[0] + '+' + pair[1])
# One-hot encode the interaction columns and the base categoricals.
train = pd.get_dummies(train, columns=comb_columns, drop_first=False)
test = pd.get_dummies(test, columns=comb_columns, drop_first=False)
train = pd.get_dummies(train, columns=['UGOVORENI_IZNOS', 'VALUTA', 'VRSTA_KLIJENTA', 'PROIZVOD', 'VRSTA_PROIZVODA',
                                       'VISINA_KAMATE', 'TIP_KAMATE', 'STAROST'], drop_first=False)
test = pd.get_dummies(test, columns=['UGOVORENI_IZNOS', 'VALUTA', 'VRSTA_KLIJENTA', 'PROIZVOD', 'VRSTA_PROIZVODA',
                                     'VISINA_KAMATE', 'TIP_KAMATE', 'STAROST'], drop_first=False)
# Convert the per-currency amount columns via the VALUTA_* rates (in place).
set_currency(train)
set_currency(test)
# Drop near-constant columns: anything nonzero in fewer than 3000 train rows.
# Only columns also present in test are dropped there too.
drop_from_train = []
drop_from_test = []
for col in train:
    if (train[col] != 0).sum() < 3000:
        drop_from_train.append(col)
        if col in test.columns.values:
            drop_from_test.append(col)
# (superseded by the loop above) manual drop lists:
# train.drop(columns=['VALUTA_2', 'VALUTA_3', 'VALUTA_4', 'VALUTA_5', 'VRSTA_KLIJENTA_1110', 'VRSTA_KLIJENTA_1120',
#                     'VRSTA_KLIJENTA_1420', 'VRSTA_KLIJENTA_1610', 'VRSTA_PROIZVODA_L', 'GBP'],  # 'VRSTA_KLIJENTA_1320',
#            inplace=True)
# test.drop(columns=['VALUTA_2', 'VALUTA_3', 'VALUTA_4', 'VALUTA_5', 'VRSTA_KLIJENTA_1110', 'VRSTA_KLIJENTA_1120',
#                    'VRSTA_KLIJENTA_1420', 'VRSTA_KLIJENTA_1610', 'VRSTA_PROIZVODA_L', 'GBP'],  # 'VRSTA_KLIJENTA_1320',
#           inplace=True)
train.drop(columns=drop_from_train, inplace=True)
test.drop(columns=drop_from_test, inplace=True)
# Identifiers and the month join key carry no predictive signal.
train.drop(columns=['KLIJENT_ID', 'OZNAKA_PARTIJE', 'GODINA_MJESEC'], inplace=True)
test.drop(columns=['KLIJENT_ID', 'OZNAKA_PARTIJE', 'GODINA_MJESEC'], inplace=True)
# FREQUENCY ENCODING (disabled alternative to the one-hot encoding above)
# for col in comb_columns:
#     freq = {}
#     for val in train[col].unique():
#         freq[val] = (train[col] == val).sum() / len(train)
#     train[col] = train[col].map(freq)
#     test[col] = test[col].map(freq)
#
# for col in ['UGOVORENI_IZNOS', 'VALUTA', 'VRSTA_KLIJENTA', 'PROIZVOD', 'VRSTA_PROIZVODA', 'VISINA_KAMATE', 'TIP_KAMATE',
#             'GODINA', 'STAROST']:
#     freq = {}
#     for val in train[col].unique():
#         freq[val] = (train[col] == val).sum() / len(train)
#     train[col] = train[col].map(freq)
#     test[col] = test[col].map(freq)
# Align one-hot columns: add train-only dummies to test as zeros (except the two
# label columns), then drop any test-only leftovers.
tr = set(train.columns.values)
tst = set(test.columns.values)
for x in tr.difference(tst):
    if x != 'PRIJEVREMENI_RASKID' and x != 'PRODULJIVANJE':
        test[x] = 0
tr = set(train.columns.values)
tst = set(test.columns.values)
print(tst.difference(tr))
print(tr.difference(tst))
test.drop(columns=list(tst.difference(tr)), inplace=True)
# (disabled) target encoding of the interaction columns
# y = train['PRIJEVREMENI_RASKID']
# produljivanje = train['PRODULJIVANJE']
# train.drop(columns=['PRIJEVREMENI_RASKID', 'PRODULJIVANJE'], inplace=True)
#
# encoder = ce.target_encoder.TargetEncoder(cols=comb_columns, smoothing=2, min_samples_leaf=4000)
# encoder.fit(train, y)
#
# train = encoder.transform(train)
# test = encoder.transform(test)
#
# train['PRODULJIVANJE'] = produljivanje
# train['PRIJEVREMENI_RASKID'] = y
# Reorder test columns to match train (minus the two label columns).
test = test[train.drop(columns=['PRODULJIVANJE', 'PRIJEVREMENI_RASKID']).columns.values]
# --- Impute missing PLANIRANO_TRAJANJE(m) in train with a LightGBM regressor
# --- trained on all rows (train + test) where the duration is known.
test1 = train.drop(columns=['PRIJEVREMENI_RASKID', 'PRODULJIVANJE', 'PLANIRANO_TRAJANJE(m)']).loc[np.isnan(train['PLANIRANO_TRAJANJE(m)'])].values
test2 = test.drop(columns='PLANIRANO_TRAJANJE(m)').loc[np.isnan(test['PLANIRANO_TRAJANJE(m)'])].values
train1 = train.drop(columns=['PRIJEVREMENI_RASKID', 'PRODULJIVANJE']).loc[~np.isnan(train['PLANIRANO_TRAJANJE(m)'])]
train2 = test.loc[~np.isnan(test['PLANIRANO_TRAJANJE(m)'])]
df = pd.concat([train1, train2], sort=False)
X_train = df.drop(columns='PLANIRANO_TRAJANJE(m)').values
y_train = df['PLANIRANO_TRAJANJE(m)'].values
lgbm = LGBMRegressor(n_estimators=200, n_jobs=6)
kfolds = KFold(shuffle=True, n_splits=5)
# mse = -cross_val_score(lgbm, X_train, y_train, scoring="neg_mean_squared_error", cv=kfolds, n_jobs=3, verbose=3)
# print(np.mean(mse), np.std(mse))
lgbm.fit(X_train, y_train)
print(X_train.shape, test1.shape)
pred_train = lgbm.predict(test1)
train.loc[np.isnan(train['PLANIRANO_TRAJANJE(m)']), 'PLANIRANO_TRAJANJE(m)'] = pred_train
train['PLANIRANO_TRAJANJE(m)'] = train['PLANIRANO_TRAJANJE(m)'].astype(int)
# test has 0 missing values in PLANIRANO TRAJANJE, so no imputation needed there:
# pred_test = reg.predict(test2)
# test.loc[np.isnan(train['PLANIRANO_TRAJANJE(m)']), 'PLANIRANO_TRAJANJE(m)'] = pred_test
# Bin the planned duration into ordinal codes (identical logic for both frames),
# then flip the negative codes to positive 1..5 values.
planirano_trajanje_train(train)
planirano_trajanje_test(test)
train['PLANIRANO_TRAJANJE(m)'] *= -1
test['PLANIRANO_TRAJANJE(m)'] *= -1
# Hand-built additive features stacking termination tendency, duration and year.
train['f2'] = train['f_RASKID'] + train['PLANIRANO_TRAJANJE(m)'] / 3
test['f2'] = test['f_RASKID'] + test['PLANIRANO_TRAJANJE(m)'] / 3
train['f3'] = train['f2'] + (2018 - train['GODINA']) / 20
test['f3'] = test['f2'] + (2018 - test['GODINA']) / 20
test = test[train.drop(columns=['PRODULJIVANJE', 'PRIJEVREMENI_RASKID']).columns.values]
X_test = test.values
# --- Stage 1: predict PRODULJIVANJE (contract extension) for the test set with
# --- a separate XGBoost model, then feed it back in as a feature.
X = train.drop(columns=['PRODULJIVANJE', 'PRIJEVREMENI_RASKID']).values
y = train['PRODULJIVANJE'].values
clf1 = XGBClassifier(n_jobs=8, n_estimators=200, scale_pos_weight=1.3)
# print(cross_val_acc(clf1, X, y, 5, dist_yes=0.151, wanted_train_distribution=0.151))
clf1.fit(X, y)
y_pred1 = clf1.predict(X_test)
test['PRODULJIVANJE'] = y_pred1
print(test['PRODULJIVANJE'].value_counts() / len(test))
# f4 and derived combinations built on top of the stage-1 prediction.
train['f4'] = train['f3'] + train['PRODULJIVANJE'] - 0.5
test['f4'] = test['f3'] + test['PRODULJIVANJE'] - 0.5
train['f2+f4'] = train['f2'] + train['f4']
test['f2+f4'] = test['f2'] + test['f4']
train['f2*f4'] = train['f2'] * train['f4']
test['f2*f4'] = test['f2'] * test['f4']
# train['f2-f4'] = train['f2'] - train['f4']
# test['f2-f4'] = test['f2'] - test['f4']
train['f4+mj'] = train['f4'] + train['MJESEC'] / 12
test['f4+mj'] = test['f4'] + test['MJESEC'] / 12
train['f4*mj'] = train['f4'] * train['MJESEC'] / 12
test['f4*mj'] = test['f4'] * test['MJESEC'] / 12
train['f4+dan'] = train['f4'] + train['dan'] / 20
test['f4+dan'] = test['f4'] + test['dan'] / 20
train['f4*dan'] = train['f4'] * train['dan'] / 20
test['f4*dan'] = test['f4'] * test['dan'] / 20
# (disabled) further additive/multiplicative feature variants:
# train['f2+f_raskid'] = train['f2'] + train['f_RASKID']
# test['f2+f_raskid'] = test['f2'] + test['f_RASKID']
# train['f2+f4|f4+dan'] = train['f2+f4'] + train['f4+dan']
# test['f2+f4|f4+dan'] = test['f2+f4'] + test['f4+dan']
#
# train['f2+f4||f4+dan'] = train['f2+f4'] * train['f4+dan']
# test['f2+f4||f4+dan'] = test['f2+f4'] * test['f4+dan']
# Combine the contract-history counters with the engineered features.
train['OSTALI_RASKIDI+f4'] = train['OSTALI_RASKIDI'] / 8 + train['f4']
test['OSTALI_RASKIDI+f4'] = test['OSTALI_RASKIDI'] / 8 + test['f4']
train['OSTALI_UGOVORI+f2+f4'] = train['OSTALI_UGOVORI'] / 70 + train['f2+f4']
test['OSTALI_UGOVORI+f2+f4'] = test['OSTALI_UGOVORI'] / 70 + test['f2+f4']
train['OSTALI_UGOVORI * f2+f4'] = train['OSTALI_UGOVORI'] / 70 * train['f2+f4']
test['OSTALI_UGOVORI * f2+f4'] = test['OSTALI_UGOVORI'] / 70 * test['f2+f4']
# train['f2+f_raskid|f4+dan'] = train['f2+f_raskid'] + train['f4+dan']
# test['f2+f_raskid|f4+dan'] = test['f2+f_raskid'] + test['f4+dan']
#
# train['f2+f_raskid||f4+dan'] = train['f2+f_raskid'] * train['f4+dan']
# test['f2+f_raskid||f4+dan'] = test['f2+f_raskid'] * test['f4+dan']
# List of engineered features, used only by the disabled KDE plots below.
# NOTE(review): 'f4+dan' appears twice — the second one likely meant 'f4*dan'.
funkcije = ['f_RASKID', 'f2', 'f3', 'f4', 'f2*f4', 'f2+f4', 'f4+mj', 'f4+dan', 'f4*mj', 'f4+dan',
            'OSTALI_RASKIDI+f4', 'OSTALI_UGOVORI+f2+f4', 'OSTALI_UGOVORI * f2+f4']
# (disabled) per-feature class-conditional density plots:
# for fj in funkcije:
#     raskid = train[train['PRIJEVREMENI_RASKID'] == 1][fj]
#     neraskid = train[train['PRIJEVREMENI_RASKID'] == 0][fj]
#
#     sns.distplot(raskid, color='red', kde=True, hist=False, kde_kws={'shade': True, 'linewidth': 1.2}, label='raskid')
#     sns.distplot(neraskid, color='blue', kde=True, hist=False, kde_kws={'shade': True, 'linewidth': 1.2}, label='neraskid')
#     plt.show()
# Final column alignment and the design matrices for the main model.
order = train.columns.values.tolist()
order.remove('PRIJEVREMENI_RASKID')
test = test[order]
# train.drop(columns='f_RASKID', inplace=True)
# test.drop(columns='f_RASKID', inplace=True)
X_test = test.values
X = train.drop(columns=['PRIJEVREMENI_RASKID']).values
y = train['PRIJEVREMENI_RASKID'].values
#
#
# neural = NN(epochs=100)
# X_train, y_train, X_val, y_val = create_validation_xy(X, y, 20000)
# neural.fit(X, y, X_val, y_val)
# Main early-termination classifier (refit later, after the grid search below).
clf = XGBClassifier(n_jobs=8, n_estimators=200, scale_pos_weight=3.1, learning_rate=0.05, max_depth=10)
# clf2 = XGBClassifier(n_jobs=8, n_estimators=150, scale_pos_weight=3.1, learning_rate=0.014, max_depth=6)
# clf3 = XGBClassifier(n_jobs=8, n_estimators=100, scale_pos_weight=3.1)
#
# acc, f1 = cross_val_acc(clf, X, y, 3)
# baseline_acc, baseline_f1 = np.mean(acc), np.mean(f1)
# print(baseline_acc, baseline_f1)
# acc, f1 = cross_val_acc(clf2, X, y, 3)
# baseline_acc, baseline_f1 = np.mean(acc), np.mean(f1)
# print(baseline_acc, baseline_f1)
#
# acc, f1 = cross_val_acc(clf3, X, y, 3)
# baseline_acc, baseline_f1 = np.mean(acc), np.mean(f1)
# print(baseline_acc, baseline_f1)
# acc, f1 = cross_val_acc(clf2, X, y, 3)
# baseline_acc, baseline_f1 = np.mean(acc), np.mean(f1)
# print(baseline_acc, baseline_f1)
# text = open('rezultati.txt', 'a+')
# Grid search over max_depth / learning rate / subsample using the custom CV.
# NOTE(review): `sub` is not included in the printed results, so subsample runs
# are indistinguishable in the output.
for n in [8, 10, 12]:
    for e in range(-4, 0):
        for sub in [0.5, 0.7, 0.9, 1]:
            lr = np.exp(e)
            X = train.drop(columns=['PRIJEVREMENI_RASKID']).values
            y = train['PRIJEVREMENI_RASKID'].values
            clf = XGBClassifier(n_jobs=9, n_estimators=230, scale_pos_weight=2.9, learning_rate=lr, max_depth=n, subsample=sub)
            acc, f1 = cross_val_acc(clf, X, y, 5)
            baseline_acc, baseline_f1 = np.mean(acc), np.mean(f1)
            print('max depth: {}, lr: {}'.format(n, lr), baseline_acc, baseline_f1)
# NOTE(review): this exit() makes everything below unreachable — presumably the
# script alternates between "search" and "fit/submit" modes; confirm placement.
exit()
# text.close()
# exit()
# (disabled) permutation importance: shuffle one column at a time and measure the
# CV accuracy / F1 drop relative to the baseline, averaged over two runs.
# perm = pd.DataFrame(columns=['Feature', 'Accuracy', 'F1 score'])
# for i, col in enumerate(X.transpose()):
#     temp = np.random.permutation(col)
#     X_perm = X.transpose().copy()
#     X_perm[i] = temp
#     X_perm = X_perm.transpose()
#
#     acc, f1 = cross_val_acc(clf, X_perm, y, 4)
#     accuracy = round(baseline_acc - np.mean(acc), 7)
#     f1score = round(baseline_f1 - np.mean(f1), 7)
#
#     acc, f1 = cross_val_acc(clf, X_perm, y, 4)
#     accuracy += round(baseline_acc - np.mean(acc), 7)
#     f1score += round(baseline_f1 - np.mean(f1), 7)
#
#     accuracy = round(accuracy / 2, 5)
#     f1score = round(f1score / 2, 5)
#
#     perm = perm.append({'Feature': l[i], 'Accuracy': accuracy, 'F1 score': f1score}, ignore_index=True)
#     print("{} / {}, {}: {}, {}".format(i + 1, 157, l[i], accuracy, f1score))
#
# perm.to_excel('Permutation_importance.xlsx', index=False)
# print(perm.head())
# NOTE(review): this whole section is unreachable because of the exit() above.
print('poceo fitati')  # "started fitting"
clf.fit(X, y)
# Export pairwise feature-interaction strengths from the fitted booster.
xgbfir.saveXgbFI(clf, OutputXlsxFile='xgbi.xlsx', feature_names=train.drop(columns='PRIJEVREMENI_RASKID').columns.values)
# clf2.fit(X, y)
# clf3.fit(X, y)
df = pd.DataFrame(columns=['feature', 'SHAP value'])
print('racunam shap')  # "computing shap"
explainer = shap.TreeExplainer(clf)
shap_values = explainer.shap_values(X)
# NOTE(review): shap_values is indexed [sample, feature]; shap_values[i] selects
# sample i, not feature i — a per-feature mean likely needs shap_values[:, i]. Confirm.
for i, el in enumerate(train.drop(columns='PRIJEVREMENI_RASKID').columns.values):
    df = df.append({'feature': el, 'SHAP value': np.mean(np.abs(shap_values[i]))}, ignore_index=True)
df.to_excel('shap.xlsx', index=False)
# shap.summary_plot(shap_values, X, plot_type="bar", feature_names=train.drop(columns=['PRIJEVREMENI_RASKID']).columns.values)
# clf2.fit(X, y)
#
# plt.bar(range(len(clf.feature_importances_)), clf.feature_importances_)
# plt.show()
# (disabled) zero-importance feature inspection:
# d = {}
# zero_importance_features = []
# for i, el in enumerate(train.drop(columns='PRIJEVREMENI_RASKID').columns.values):
#     if clf.feature_importances_[i] == 0:# and clf2.feature_importances_[i] == 0:
#         zero_importance_features.append(el)
#     # if clf.feature_importances_[i] != 0:
#     #     d[clf.feature_importances_[i]] = el
#     # else:
#     #     zero_importance_features.append(el)
#
# print(zero_importance_features)
# fi = np.sort(clf.feature_importances_)
# for el in zero_importance:
#     print(el, 0.0)
#
# for el in fi:
#     if el == 0:
#         continue
#     s = ' '
#     k = d[el] + s
#     k = k[:34]
#     print(k, round(el, 5))
# Final predictions; write the submission with the original evaluation rows.
y_pred = clf.predict(X_test)
# y_pred2 = clf2.predict(X_test)
# y_pred3 = clf3.predict(X_test)
test = pd.read_excel('eval_dataset_nan.xlsx')
# test['PRIJEVREMENI_RASKID'] = np.round((y_pred + y_pred2 + y_pred3) / 3)
test['PRIJEVREMENI_RASKID'] = y_pred
# Map the 0/1 predictions back to the competition's 'Y'/'N' labels.
test['PRIJEVREMENI_RASKID'] = test['PRIJEVREMENI_RASKID'].map({1: 'Y', 0: 'N'})
test.to_csv('student.csv')
print(test['PRIJEVREMENI_RASKID'].value_counts() / len(test))
# (disabled) hyperopt Bayesian search over XGBoost hyper-parameters:
# def hyperopt_train_test(params):
#     clf = XGBClassifier(**params)
#     acc, f1 = cross_val_acc(clf, X, y, 5)
#     acc = np.mean(acc)
#     return -acc
#
# space = {
#     'n_estimators': 500,
#     'max_depth': hp.choice('max_depth ', range(2, 30)),
#     # 'learning_rate ': hp.uniform('learning_rate ', 0.1, 2),
#     # 'subsample': hp.uniform('subsample', 0.3, 1),
#     'scale_pos_weight ': 2.762,
#     'n_jobs': -1
# }
#
#
# def f(params):
#     acc = hyperopt_train_test(params)
#     return {'loss': acc, 'status': STATUS_OK}
#
#
# trials = Trials()
# best = fmin(f, space, algo=tpe.suggest, max_evals=20, trials=trials)
#
# print(space_eval(space, best))
# exit()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement