Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- import numpy as np
- import random as rnd
- import seaborn as sns
- import matplotlib.pyplot as plt
- from sklearn.linear_model import LogisticRegression
- from sklearn.svm import SVC, LinearSVC
- from sklearn.ensemble import RandomForestClassifier
- from sklearn.neighbors import KNeighborsClassifier
- from sklearn.naive_bayes import GaussianNB
- from sklearn.linear_model import Perceptron
- from sklearn.linear_model import SGDClassifier
- from sklearn.linear_model import LassoLars
- from sklearn.tree import DecisionTreeClassifier
- from sklearn.model_selection import train_test_split
- from sklearn.model_selection import cross_val_score
- import xgboost as xgb
- from xgboost import XGBClassifier
- from hyperopt import hp, tpe, STATUS_OK, Trials, fmin, tpe, hp, space_eval
- pd.set_option('display.max_columns', None)
def create_validation_set(df, validation_size, dist_y=0.55, wanted_dist=0.265673):
    """Carve a validation set out of ``df`` and rebalance the rows left behind.

    Rows are chosen at random using the global NumPy RNG.  ``df`` is modified
    in place: the validation rows are dropped from it, together with enough
    additional negative rows that positives make up roughly ``wanted_dist`` of
    what remains.  If positives already make up at least ``wanted_dist`` of
    the remaining rows, no additional rows are dropped.

    Selection and dropping are label-based (``df.loc`` / ``df.drop``), so an
    index with duplicate labels selects/drops every row sharing a label.

    Args:
        df: DataFrame with a ``PRIJEVREMENI_RASKID`` column holding 1
            (positive) or 0 (negative).
        validation_size: Approximate number of rows in the validation set.
        dist_y: Fraction of the validation set drawn from positive rows.
        wanted_dist: Target fraction of positives among the remaining rows.

    Returns:
        Tuple ``(df, X_validation, y_validation)`` where ``df`` is the same
        (mutated) frame, ``X_validation`` holds the validation rows without
        the target column and ``y_validation`` holds their target values.
    """
    dist_n = 1 - dist_y
    len_yes = int(validation_size * dist_y)
    len_no = int(validation_size * dist_n)

    labels = df['PRIJEVREMENI_RASKID']
    yes = df.index[(labels == 1).to_numpy()].tolist()
    no = df.index[(labels == 0).to_numpy()].tolist()
    np.random.shuffle(yes)
    np.random.shuffle(no)

    X_validation = pd.concat([df.loc[no[:len_no]], df.loc[yes[:len_yes]]])
    y_validation = X_validation['PRIJEVREMENI_RASKID']
    X_validation = X_validation.drop(columns=['PRIJEVREMENI_RASKID'])

    remaining_yes = max(len(yes) - len_yes, 0)
    remaining_no = len(no) - len_no

    # Number of surplus negatives to discard so that the positive share of the
    # remaining rows becomes ``wanted_dist``.
    overflow = 0
    if remaining_no > 0:  # otherwise current_dist == 1 and the formula divides by zero
        current_dist = remaining_yes / (remaining_no + remaining_yes)
        overflow = int(
            remaining_no
            * (1 - current_dist * (1 - wanted_dist) / (1 - current_dist) / wanted_dist)
        )
        # A negative value means positives are already over-represented.  It
        # must not shorten the slice below, or validation rows would stay in
        # the training frame.
        overflow = max(overflow, 0)

    df.drop(no[:len_no + overflow], inplace=True)
    df.drop(yes[:len_yes], inplace=True)
    return df, X_validation, y_validation
def create_validation_Xy(X, y, validation_size, dist_y=0.55, wanted_dist=0.265673):
    """Randomly split arrays into a validation set and a rebalanced training set.

    The validation set takes ``int(validation_size * dist_y)`` positive samples
    (label 1) and ``int(validation_size * (1 - dist_y))`` negative samples
    (label 0).  The training set keeps all remaining positives and only as
    many of the remaining negatives as needed for positives to make up roughly
    ``wanted_dist`` of it.  If positives already make up at least
    ``wanted_dist`` of the remaining samples, every remaining negative is kept.

    Randomness comes from the global NumPy RNG.  Only index arrays are
    shuffled; ``X`` and ``y`` themselves are never copied wholesale.

    Args:
        X: Array-like of samples, indexed along the first axis.
        y: Array-like of 0/1 labels aligned with ``X``.
        validation_size: Approximate number of validation samples.
        dist_y: Fraction of the validation set made up of positives.
        wanted_dist: Target fraction of positives in the training set.

    Returns:
        Tuple ``(X_train, y_train, X_val, y_val)``, each in shuffled order.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    dist_n = 1 - dist_y

    yes = np.random.permutation(np.where(y == 1)[0])
    no = np.random.permutation(np.where(y == 0)[0])
    len_yes = int(validation_size * dist_y)
    len_no = int(validation_size * dist_n)

    val_idx = np.random.permutation(np.concatenate([yes[:len_yes], no[:len_no]]))

    train_yes = yes[len_yes:]
    train_no = no[len_no:]

    # Number of surplus negatives to discard so that the positive share of the
    # training set becomes ``wanted_dist``.
    overflow = 0
    if len(train_no) > 0:  # otherwise current_dist == 1 and the formula divides by zero
        current_dist = len(train_yes) / (len(train_yes) + len(train_no))
        overflow = int(
            len(train_no)
            * (1 - current_dist * (1 - wanted_dist) / (1 - current_dist) / wanted_dist)
        )
        # A negative value means positives are already over-represented.  It
        # must not move the slice start backwards, or validation samples
        # would leak into the training set.
        overflow = max(overflow, 0)

    train_idx = np.random.permutation(np.concatenate([train_yes, train_no[overflow:]]))
    return X[train_idx], y[train_idx], X[val_idx], y[val_idx]
def cross_val_acc(clf, X, y, n=5, dist_y=0.55, wanted_dist=0.265673):
    """Score ``clf`` on ``n`` independent random train/validation splits.

    Each round draws a fresh split with ``create_validation_Xy`` (this is
    repeated random hold-out, not a k-fold partition), refits ``clf`` on the
    training part and scores its predictions on the validation part.  Progress
    is printed after every round.

    Args:
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X: Sample array.
        y: Label array aligned with ``X``.
        n: Number of rounds.
        dist_y: Forwarded to ``create_validation_Xy``.
        wanted_dist: Forwarded to ``create_validation_Xy``.

    Returns:
        Tuple ``(accuracies, f1_scores)``: two lists with one entry per round.
    """
    from sklearn.metrics import accuracy_score, f1_score
    from sklearn.utils import shuffle

    X, y = shuffle(X, y)

    # Hold-out size: a third of what a single one-of-n fold would contain.
    holdout_size = (len(X) // n + 1) // 3

    accuracies, f1_scores = [], []
    for round_no in range(1, n + 1):
        X_train, y_train, X_val, y_val = create_validation_Xy(
            X, y, holdout_size, dist_y, wanted_dist
        )
        clf.fit(X_train, y_train)
        predicted = clf.predict(X_val)
        accuracies.append(accuracy_score(y_val, predicted))
        f1_scores.append(f1_score(y_val, predicted))
        print('Done: {} / {}'.format(round_no, n))
    return accuracies, f1_scores
def _add_year_month(frame):
    """Index ``frame`` by DATUM_OTVARANJA and add GODINA, MJESEC, GODINA_MJESEC columns."""
    frame['DATUM_OTVARANJA'] = pd.to_datetime(frame['DATUM_OTVARANJA'])
    frame = frame.set_index('DATUM_OTVARANJA')
    frame['GODINA'] = frame.index.year
    frame['MJESEC'] = frame.index.month
    # Zero-pad the month so the combined key reads e.g. '2015-03'.
    frame['MJESEC'] = frame['MJESEC'].astype(str).str.zfill(2)
    frame['GODINA_MJESEC'] = frame['GODINA'].astype(str) + '-' + frame['MJESEC']
    return frame


# ---------------------------------------------------------------------------
# Load the data sets and the monthly feature tables.
# ---------------------------------------------------------------------------
train = pd.read_csv('ProbraniFeaturi2.csv')
test = pd.read_csv('ProbraniTest2.csv')
ft1 = pd.read_csv('ivanovi_featuri.csv')
ft2 = pd.read_csv('bitni_featuri.csv')
ft3 = pd.read_csv('ALL.csv')

train = _add_year_month(train)
test = _add_year_month(test)

# Column-name lists kept from the original script; neither is referenced below.
li = [
    'BDP (u mil. HRK, tekuće cijene) b',
    'BDP po stanovniku (u EUR) ',
    'Prosječna godišnja stopa inflacije potrošačkih cijena ',
    'Tekući račun platne bilance (u mil. EUR) c',
    'Inozemni dug (u mil. EUR, na kraju razdoblja) c',
    'Bruto međunarodne pričuve (u mil. EUR, na kraju razdoblja)',
    'Stopa nezaposlenosti (prema definiciji ILO-a, stanovništvo starije od 15 god.) f',
]
zarez = [
    'BDP (u mil. HRK, tekuće cijene) b',
    'BDP po stanovniku (u EUR) ',
    'Inozemni dug (u mil. EUR, na kraju razdoblja) c',
    'Bruto međunarodne pričuve (u mil. EUR, na kraju razdoblja)',
    'Tekući račun platne bilance (u mil. EUR) c',
]

# ---------------------------------------------------------------------------
# Join the monthly features from ft2 onto train/test via GODINA_MJESEC.
# (ft1 and ft3 are loaded above, but only ft2 is joined.)
# ---------------------------------------------------------------------------
# The month keys must come from the same table as the values they label;
# ft1's keys are used only as a fallback when ft2 has no key column of its own.
dates_source = ft2 if 'GODINA_MJESEC' in ft2.columns else ft1
dates = dates_source['GODINA_MJESEC'].tolist()

for col in ft2:
    if col == 'GODINA_MJESEC':
        continue
    feat = dict(zip(dates, ft2[col]))
    # Cast to int64 exactly as before; this raises if a month present in
    # train/test has no entry in the feature table.
    train_values = train['GODINA_MJESEC'].map(feat).astype('int64')
    test_values = test['GODINA_MJESEC'].map(feat).astype('int64')
    # Standardise BOTH sets with the training statistics so that one raw value
    # maps to the same scaled value in train and test.
    col_mean = train_values.mean()
    col_std = train_values.std()
    train[col] = (train_values - col_mean) / col_std
    test[col] = (test_values - col_mean) / col_std

# ---------------------------------------------------------------------------
# Drop identifier/helper columns and encode the target.
# ---------------------------------------------------------------------------
helper_columns = ['OZNAKA_PARTIJE', 'KLIJENT_ID', 'STAROST', 'GODINA', 'MJESEC', 'GODINA_MJESEC']
test = test.drop(columns=['PRIJEVREMENI_RASKID'])
train = train.drop(columns=helper_columns)
test = test.drop(columns=helper_columns)
print(train.head(),test.head())

train['PRIJEVREMENI_RASKID'] = train['PRIJEVREMENI_RASKID'].map({'Y': 1, 'N': 0}).astype(int)
print(train.columns.values.tolist())
print(test.columns.values.tolist())

features = train.drop(columns=['PRIJEVREMENI_RASKID'])
# Select test columns in the training feature order so the positional arrays
# line up; a column missing from test raises KeyError instead of silently
# shifting features.
X_test = test[features.columns.tolist()].values
X = features.values
y = train['PRIJEVREMENI_RASKID'].values

# ---------------------------------------------------------------------------
# Evaluate, fit on all training data, and write the predictions.
# ---------------------------------------------------------------------------
clf = XGBClassifier(scale_pos_weight=3.7, n_jobs=-1, n_estimators=500)
acc, f1 = cross_val_acc(clf, X, y, 5)
print(np.mean(acc), np.mean(f1), sep='\n')

clf.fit(X, y)
print(clf.feature_importances_)
# Bar chart of the per-feature importances.
plt.bar(range(len(clf.feature_importances_)), clf.feature_importances_)
plt.show()

y_pred = clf.predict(X_test)
sol = pd.read_excel('eval_dataset_nan.xlsx')
sol['PRIJEVREMENI_RASKID'] = y_pred
sol['PRIJEVREMENI_RASKID'] = sol['PRIJEVREMENI_RASKID'].map({1: 'Y', 0: 'N'})
print(sol['PRIJEVREMENI_RASKID'].value_counts())
sol.to_csv('student.csv')
print("done")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement