Advertisement
Guest User

Untitled

a guest
Mar 20th, 2018
72
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.71 KB | None | 0 0
  1. import pandas as pd
  2. import numpy as np
  3. from sklearn import metrics
  4. import matplotlib as plt
  5. from xgboost.sklearn import XGBClassifier
  6. from matplotlib import pyplot as plt
  7.  
  # Seed NumPy's global random number generator with a fixed value.
  np.random.seed(0)

  # Load the two semicolon-separated input tables.
  values = pd.read_csv(
      "/home/pierluigi/Scaricati/dataset_tf_idf_start.csv",
      sep=";",
  )
  second = pd.read_csv(
      "/home/pierluigi/Scaricati/index_tf.csv",
      sep=";",
  )
  14.  
  15.  
  def create_index_tf(row):
      """Return the integer id contained in ``row['index']``.

      Every occurrence of the markers ``"_u"``, ``"_t"`` and ``"_j"`` is removed
      from the label and the remainder is converted with ``int``.

      Args:
          row: Mapping-like object (e.g. a DataFrame row) with an ``'index'`` entry.

      Returns:
          The parsed integer, or ``None`` when the label is empty or is not a
          string (for example a NaN produced by an empty CSV cell).

      Raises:
          ValueError: If the label, once the markers are removed, is not a valid
              integer literal.
      """
      label = row['index']
      # NaN is truthy, so a plain truthiness test would let it through and the
      # string methods below would fail on a float.
      if not isinstance(label, str) or not label:
          return None
      for marker in ("_u", "_t", "_j"):
          label = label.replace(marker, "")
      return int(label)
  19.  
  # Derive the integer id of every row of `second` from its 'index' label.
  second['index_tf'] = second.apply(create_index_tf, axis=1)
  21.  
  def replace_index(row):
      """Map the label in ``row['index']`` to a binary class.

      Returns 1 only when the label contains ``"_t"`` and contains neither
      ``"_u"`` nor ``"_j"``; every other label yields 0.
      """
      label = row['index']
      # The checks run in the order "_u", "_j", "_t" and stop at the first
      # one that decides the outcome.
      is_positive = "_u" not in label and "_j" not in label and "_t" in label
      return 1 if is_positive else 0
  31.  
  32.  
  33.  
  # Attach the columns of `second` to `values`, matching rows on 'index_tf'.
  df_last = values.join(second.set_index('index_tf'), on='index_tf')

  # Replace the raw string label with the 0/1 class computed by replace_index.
  df_last['index'] = df_last.apply(replace_index, axis=1)

  from sklearn.model_selection import train_test_split

  # Hold out 20% of the rows as the test set.
  train, test = train_test_split(df_last, test_size=0.2)

  # Features: every column except the label and the join key.
  x = df_last.drop(['index', 'index_tf'], axis=1)

  # Label, kept as a single-column DataFrame.
  y = df_last[['index']]

  # reset_index() returns a new frame, so the result has to be assigned for the
  # call to have any effect; drop=True keeps the old index from being inserted
  # as an extra column.
  x = x.reset_index(drop=True)
  y = y.reset_index(drop=True)

  import xgboost as xgb

  np.random.seed(0)
  target = "index"
  57.  
  58.  
  59.  
  def modelfit_TRTE(alg, dtrain, dtest, predictors, useTrainCV=True, cv_folds=5,
                    early_stopping_rounds=50, target='index'):
      """Fit ``alg`` on ``dtrain``, print train/test scores and plot feature importances.

      Args:
          alg: XGBoost scikit-learn style classifier. It is fitted in place and,
              when ``useTrainCV`` is true, its ``n_estimators`` is overwritten.
          dtrain: DataFrame with the training rows (predictor columns plus
              the ``target`` column).
          dtest: DataFrame with the held-out rows (same columns). It is not
              modified.
          predictors: List of column names used as features.
          useTrainCV: If true, run ``xgb.cv`` first and use the number of
              rounds it returns as ``n_estimators``.
          cv_folds: Number of folds passed to ``xgb.cv``.
          early_stopping_rounds: Early-stopping patience passed to ``xgb.cv``.
          target: Name of the label column in ``dtrain`` and ``dtest``.

      Returns:
          None. Scores are printed and a bar chart is shown.
      """
      if useTrainCV:
          xgb_param = alg.get_xgb_params()
          xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
          cvresult = xgb.cv(
              xgb_param,
              xgtrain,
              num_boost_round=alg.get_params()['n_estimators'],
              nfold=cv_folds,
              metrics='auc',
              early_stopping_rounds=early_stopping_rounds,
          )
          # The CV result has one row per boosting round that was kept.
          alg.set_params(n_estimators=cvresult.shape[0])

      # Fit the algorithm on the data.
      # eval_metric is not passed to fit(): no eval_set is supplied, so nothing
      # would be evaluated with it, and newer xgboost releases deprecate that
      # argument of fit().
      alg.fit(dtrain[predictors], dtrain[target])

      # Predict on the training set.
      dtrain_predictions = alg.predict(dtrain[predictors])
      dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1]

      # Print model report.
      print("\nModel Report")
      print("Accuracy : %.4g" % metrics.accuracy_score(dtrain[target].values, dtrain_predictions))
      print("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain[target], dtrain_predprob))

      # Score the held-out rows directly against their own labels. This avoids
      # writing a column into the caller's frame and avoids a self-merge on
      # 'index_tf', which would duplicate rows whenever that key is not unique.
      dtest_predprob = alg.predict_proba(dtest[predictors])[:, 1]
      print('AUC Score (Test): %f' % metrics.roc_auc_score(dtest[target], dtest_predprob))

      # Older xgboost releases expose the fitted booster as booster(); newer
      # ones renamed it to get_booster(). Support both.
      get_booster = getattr(alg, 'get_booster', None)
      booster = get_booster() if callable(get_booster) else alg.booster()

      feat_imp = pd.Series(booster.get_fscore()).sort_values(ascending=False)
      feat_imp.plot(kind='bar', title='Feature Importances')
      plt.ylabel('Feature Importance Score')
      plt.show()
  93.  
  94.  
  if __name__ == '__main__':
      # Names of the columns used as model features.
      predictors = [
          "euro", "cambio", "fondo", "migliaio", "entita_geografica",
          "dicembre", "rischio", "copertura", "fair_value", "extra_ue",
          "prodotto", "verso", "strumento", "credito", "mercato",
          "valuta", "effetto", "milione", "europa", "derivato",
          "utile", "controllato", "incremento", "asia", "vendita",
          "accantonamento", "cambio_realizzare", "spa", "principalmente",
          "svalutazione", "totale", "materia", "acconto",
          "impresa_sottoporre_controllo", "commerciale", "utilizzo",
          "marchio", "circa", "tasso_cambio", "eur",
      ]

      xgb1 = XGBClassifier(
          learning_rate=0.01,  # previously 0.1
          n_estimators=5000,  # previously 1000
          max_depth=3,
          min_child_weight=1,
          gamma=0,
          subsample=0.8,
          colsample_bytree=0.8,
          objective='binary:logistic',
          nthread=1,
          scale_pos_weight=1,
          reg_alpha=1,
          seed=27,
      )

      # Train on the `train` split and report scores on the `test` split.
      modelfit_TRTE(xgb1, train, test, predictors)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement