Advertisement
Guest User

Untitled

a guest
May 21st, 2019
89
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 8.21 KB | None | 0 0
  1. import pandas as pd
  2. import numpy as np
  3. import random as rnd
  4. import seaborn as sns
  5. import matplotlib.pyplot as plt
  6. from sklearn.linear_model import LogisticRegression
  7. from sklearn.svm import SVC, LinearSVC
  8. from sklearn.ensemble import RandomForestClassifier
  9. from sklearn.neighbors import KNeighborsClassifier
  10. from sklearn.naive_bayes import GaussianNB
  11. from sklearn.linear_model import Perceptron
  12. from sklearn.linear_model import SGDClassifier
  13. from sklearn.linear_model import LassoLars
  14. from sklearn.tree import DecisionTreeClassifier
  15. from sklearn.model_selection import train_test_split
  16. from sklearn.model_selection import cross_val_score
  17. import xgboost as xgb
  18. from xgboost import XGBClassifier
  19.  
  20. from hyperopt import hp, tpe, STATUS_OK, Trials, fmin, tpe, hp, space_eval
  21.  
  22. pd.set_option('display.max_columns', None)
  23.  
  24. def create_validation_set(df, validation_size, dist_y=0.55, wanted_dist=0.265673):
  25. dist_n = 1 - dist_y
  26.  
  27. len_yes = int(validation_size * dist_y)
  28. len_no = int(validation_size * dist_n)
  29.  
  30. yes = df[df['PRIJEVREMENI_RASKID'] == 1].index.tolist()
  31. no = df[df['PRIJEVREMENI_RASKID'] == 0].index.tolist()
  32.  
  33. np.random.shuffle(yes)
  34. np.random.shuffle(no)
  35.  
  36. validationN = df.loc[no[:len_no]]
  37. validationY = df.loc[yes[:len_yes]]
  38.  
  39. current_dist = (len(yes) - len_yes) / (len(no) - len_no + len(yes) - len_yes)
  40. overflow = int((len(no) - len_no) * (1 - current_dist * (1 - wanted_dist) / (1 - current_dist) / wanted_dist))
  41.  
  42.  
  43. X_validation = pd.concat([validationN, validationY])
  44. y_validation = X_validation['PRIJEVREMENI_RASKID']
  45. X_validation.drop(columns=['PRIJEVREMENI_RASKID'], inplace=True)
  46.  
  47. df.drop(no[:len_no + overflow], inplace=True)
  48. df.drop(yes[:len_yes], inplace=True)
  49.  
  50. return df, X_validation, y_validation
  51.  
  52.  
  53. def create_validation_Xy(X, y, validation_size, dist_y=0.55, wanted_dist=0.265673):
  54. from sklearn.utils import shuffle
  55. dist_n = 1 - dist_y
  56.  
  57. X, y = shuffle(X, y)
  58. yes = np.where(y == 1)[0]
  59. no = np.where(y == 0)[0]
  60.  
  61. len_yes = int(validation_size * dist_y)
  62. len_no = int(validation_size * dist_n)
  63.  
  64. X_val = np.concatenate([X[yes[:len_yes]], X[no[:len_no]]])
  65. y_val = np.concatenate([y[yes[:len_yes]], y[no[:len_no]]])
  66. X_val, y_val = shuffle(X_val, y_val)
  67.  
  68. current_dist = (len(yes) - len_yes) / (len(yes) - len_yes + len(no) - len_no)
  69. overflow = int((len(no) - len_no) * (1 - current_dist * (1 - wanted_dist) / (1 - current_dist) / wanted_dist))
  70.  
  71. X_train = np.concatenate([X[yes[len_yes:]], X[no[len_no + overflow:]]])
  72. y_train = np.concatenate([y[yes[len_yes:]], y[no[len_no + overflow:]]])
  73. X_train, y_train = shuffle(X_train, y_train)
  74. # print('dist u trainu:', y_train.sum() / len(y_train))
  75.  
  76. return X_train, y_train, X_val, y_val
  77.  
  78.  
  79. def cross_val_acc(clf, X, y, n=5, dist_y=0.55, wanted_dist=0.265673):
  80. from sklearn.metrics import f1_score, accuracy_score
  81. from sklearn.utils import shuffle
  82. dist_n = 1 - dist_y
  83.  
  84. X, y = shuffle(X, y)
  85.  
  86. results = []
  87. f1s = []
  88. batch_size = (len(X) // n + 1) // 3
  89. for i in range(n):
  90. X_train, y_train, X_val, y_val = create_validation_Xy(X, y, batch_size, dist_y, wanted_dist)
  91. clf.fit(X_train, y_train)
  92. y_pred = clf.predict(X_val)
  93.  
  94. # print(y_val.sum() / len(y_val), y_pred.sum() / len(y_pred))
  95. results.append(accuracy_score(y_val, y_pred))
  96. f1s.append(f1_score(y_val, y_pred))
  97. print('Done: {} / {}'.format(i + 1, n))
  98. return results, f1s
  99.  
  100. train = pd.read_csv('ProbraniFeaturi2.csv')
  101. test = pd.read_csv('ProbraniTest2.csv')
  102. ft1 = pd.read_csv('ivanovi_featuri.csv')
  103. ft2 = pd.read_csv('bitni_featuri.csv')
  104. ft3 = pd.read_csv('ALL.csv')
  105.  
  106. train['DATUM_OTVARANJA'] = pd.to_datetime(train['DATUM_OTVARANJA'])
  107. train = train.set_index('DATUM_OTVARANJA')
  108.  
  109. train['GODINA'] = train.index.year
  110. train['MJESEC'] = train.index.month
  111.  
  112. train['MJESEC'] = '0' + train['MJESEC'].astype(str)
  113. train['MJESEC'] = train['MJESEC'].transform(lambda x: x[-2:])
  114.  
  115. train['GODINA_MJESEC'] = train['GODINA'].astype(str) + '-' + train['MJESEC'].astype(str)
  116.  
  117. test['DATUM_OTVARANJA'] = pd.to_datetime(test['DATUM_OTVARANJA'])
  118. test = test.set_index('DATUM_OTVARANJA')
  119.  
  120. test['GODINA'] = test.index.year
  121. test['MJESEC'] = test.index.month
  122.  
  123. test['MJESEC'] = '0' + test['MJESEC'].astype(str)
  124. test['MJESEC'] = test['MJESEC'].transform(lambda x: x[-2:])
  125.  
  126. test['GODINA_MJESEC'] = test['GODINA'].astype(str) + '-' + test['MJESEC'].astype(str)
  127.  
  128. dates = ft1['GODINA_MJESEC'].tolist()
  129.  
  130. li = ['BDP (u mil. HRK, tekuće cijene) b', 'BDP po stanovniku (u EUR) ', 'Prosječna godišnja stopa inflacije potrošačkih cijena ', 'Tekući račun platne bilance (u mil. EUR) c', 'Inozemni dug (u mil. EUR, na kraju razdoblja) c', 'Bruto međunarodne pričuve (u mil. EUR, na kraju razdoblja)', 'Stopa nezaposlenosti (prema definiciji ILO-a, stanovništvo starije od 15 god.) f']
  131. zarez = ['BDP (u mil. HRK, tekuće cijene) b', 'BDP po stanovniku (u EUR) ', 'Inozemni dug (u mil. EUR, na kraju razdoblja) c', 'Bruto međunarodne pričuve (u mil. EUR, na kraju razdoblja)', 'Tekući račun platne bilance (u mil. EUR) c']
  132. '''
  133. for col in ft1:
  134. if col == 'GODINA_MJESEC':
  135. continue
  136. feat = {}
  137. #if col in zarez:
  138. #print(col)
  139. #ft[col] = ft[col].str.replace(',','')
  140. for i,value in enumerate(ft1[col]):
  141. feat[dates[i]] = value
  142.  
  143. train[col] = train['GODINA_MJESEC'].map(feat)
  144. train[col] = (train[col].astype('int64') - train[col].astype('int64').mean())/(train[col].astype('int64').std())
  145. test[col] = test['GODINA_MJESEC'].map(feat)
  146. test[col] = (test[col].astype('int64') - test[col].astype('int64').mean())/(test[col].astype('int64').std())
  147. '''
  148. for col in ft2:
  149. if col == 'GODINA_MJESEC':
  150. continue
  151. feat = {}
  152. #if col in zarez:
  153. #print(col)
  154. #ft[col] = ft[col].str.replace(',','')
  155. for i,value in enumerate(ft2[col]):
  156. feat[dates[i]] = value
  157.  
  158. train[col] = train['GODINA_MJESEC'].map(feat)
  159. train[col] = (train[col].astype('int64') - train[col].astype('int64').mean())/(train[col].astype('int64').std())
  160. test[col] = test['GODINA_MJESEC'].map(feat)
  161. test[col] = (test[col].astype('int64') - test[col].astype('int64').mean())/(test[col].astype('int64').std())
  162. '''
  163. for col in ft3:
  164. if col == 'GODINA_MJESEC':
  165. continue
  166. feat = {}
  167. #if col in zarez:
  168. #print(col)
  169. #ft[col] = ft[col].str.replace(',','')
  170. for i,value in enumerate(ft3[col]):
  171. feat[dates[i]] = value
  172.  
  173. train[col] = train['GODINA_MJESEC'].map(feat)
  174. train[col] = (train[col].astype('int64') - train[col].astype('int64').mean())/(train[col].astype('int64').std())
  175. test[col] = test['GODINA_MJESEC'].map(feat)
  176. test[col] = (test[col].astype('int64') - test[col].astype('int64').mean())/(test[col].astype('int64').std())
  177. '''
  178.  
  179. test = test.drop(columns = ['PRIJEVREMENI_RASKID'])
  180. train = train.drop(columns = ['OZNAKA_PARTIJE', 'KLIJENT_ID', 'STAROST', 'GODINA', 'MJESEC', 'GODINA_MJESEC'])
  181. test = test.drop(columns = ['OZNAKA_PARTIJE', 'KLIJENT_ID', 'STAROST', 'GODINA', 'MJESEC', 'GODINA_MJESEC'])
  182.  
  183. print(train.head(),test.head())
  184.  
  185. #train = train.drop(columns = ['BROJ_KREDITA_2_3', 'BROJ_KREDITA_9'])
  186. #test = test.drop(columns = ['BROJ_KREDITA_2_3', 'BROJ_KREDITA_9'])
  187.  
  188. train['PRIJEVREMENI_RASKID'] = train['PRIJEVREMENI_RASKID'].map( {'Y': 1, 'N': 0} ).astype(int)
  189.  
  190. print(train.columns.values.tolist())
  191. print(test.columns.values.tolist())
  192. X_test = test.values
  193. X = train.drop(columns = ['PRIJEVREMENI_RASKID']).values
  194. y = train['PRIJEVREMENI_RASKID'].values
  195.  
  196. clf = XGBClassifier(scale_pos_weight=3.7, n_jobs=-1, n_estimators=500)
  197. acc, f1 = cross_val_acc(clf, X, y, 5)
  198.  
  199. print(np.mean(acc), np.mean(f1), sep='\n')
  200.  
  201. clf.fit(X, y)
  202. print(clf.feature_importances_)
  203. # plot
  204. plt.bar(range(len(clf.feature_importances_)), clf.feature_importances_)
  205. plt.show()
  206.  
  207. y_pred = clf.predict(X_test)
  208.  
  209. sol = pd.read_excel('eval_dataset_nan.xlsx')
  210. sol['PRIJEVREMENI_RASKID'] = y_pred
  211. sol['PRIJEVREMENI_RASKID'] = sol['PRIJEVREMENI_RASKID'].map( {1: 'Y', 0: 'N'} )
  212. print(sol['PRIJEVREMENI_RASKID'].value_counts())
  213. sol.to_csv('student.csv')
  214. print("done")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement