Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- import numpy as np
- from sklearn import metrics
- import matplotlib as plt
- from xgboost.sklearn import XGBClassifier
- from matplotlib import pyplot as plt
- np.random.seed(0)
- values = pd.read_csv("/home/pierluigi/Scaricati/dataset_tf_idf_start.csv", sep=";")
- second = pd.read_csv("/home/pierluigi/Scaricati/index_tf.csv", sep=";")
def create_index_tf(row):
    """Return the numeric part of row['index'] with any _u/_t/_j marker stripped.

    Returns None (implicit fall-through preserved as an explicit return)
    when row['index'] is falsy, e.g. an empty string.
    """
    raw = row['index']
    if not raw:
        return None
    for marker in ("_u", "_t", "_j"):
        raw = raw.replace(marker, "")
    return int(raw)
- second['index_tf'] = second.apply (lambda row: create_index_tf (row),axis=1)
def replace_index(row):
    """Binarize the label: 1 when row['index'] carries a '_t' marker, else 0.

    The original code had four branches, three of which returned 0
    ('_u', '_j', and the default), so only the '_t' test is meaningful.
    The '_u'/'_j'-first precedence is preserved: an index containing one
    of those markers is 0 even if it also contains '_t'.
    """
    idx = row['index']
    if "_u" in idx or "_j" in idx:
        return 0
    return 1 if "_t" in idx else 0
# Attach the label table to the feature rows via the numeric join key,
# then overwrite 'index' with the binary target derived from its suffix.
df_last = values.join(second.set_index('index_tf'), on='index_tf')
df_last['index'] = df_last.apply(replace_index, axis=1)
from sklearn.model_selection import train_test_split

# 80/20 split; reproducible because np.random.seed(0) was set above
# (sklearn falls back to NumPy's global RNG when random_state is unset).
train, test = train_test_split(df_last, test_size=0.2)

# Feature matrix / target frame over the FULL dataset.
# NOTE(review): x and y are not used further in this script — the model
# below trains on `train`/`test` — but they are kept for interface
# compatibility with any external code importing this module.
x = df_last.drop('index', axis=1)
x = x.drop('index_tf', axis=1)
y = df_last[['index']]
# BUG FIX: reset_index returns a NEW frame; the original calls discarded
# the result, making them no-ops. drop=True avoids inserting the old
# index as a column (which would have collided with the 'index' label name).
x = x.reset_index(drop=True)
y = y.reset_index(drop=True)
import xgboost as xgb

# Re-seed before model fitting so the CV/fit phase is reproducible too.
np.random.seed(0)
# Name of the binary label column, read as a global by modelfit_TRTE below.
target="index"
def modelfit_TRTE(alg, dtrain, dtest, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Optionally tune n_estimators via xgb.cv, fit `alg`, and report train/test AUC.

    Args:
        alg: an XGBClassifier; mutated in place by set_params() and fit().
        dtrain: DataFrame with the predictor columns plus the label column.
        dtest: DataFrame with the predictor columns; gains a 'predprob'
            column as a side effect.
        predictors: list of feature column names to train on.
        useTrainCV: when True, cross-validate to pick the boosting-round count.
        cv_folds: number of CV folds.
        early_stopping_rounds: patience for xgb.cv early stopping.

    NOTE(review): reads the module-level globals `target`, `test`, `xgb`,
    `metrics`, `pd` and `plt`; the label column name is 'index' in some
    places and `target` in others — presumably always 'index', verify.
    """
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        xgtest = xgb.DMatrix(dtest[predictors].values)
        # Cross-validate up to the configured n_estimators rounds, stopping
        # early on AUC; the number of rows in cvresult is the selected
        # round count.
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='auc', early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])
    # Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain['index'], eval_metric='auc')
    # Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1]
    # Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(dtrain['index'].values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain['index'], dtrain_predprob))
    #Predict on testing data:
    #dtest.loc[:, 'predprob'] = alg.predict_proba(dtest[predictors])[:, 1]
    # Suppress pandas' SettingWithCopy machinery before mutating dtest.
    # NOTE(review): `is_copy` is deprecated/removed in newer pandas —
    # prefer .loc assignment (see commented line above) going forward.
    dtest.is_copy = False
    dtest['predprob'] = alg.predict_proba(dtest[predictors])[:, 1]
    # Merge against the GLOBAL `test`. In __main__, dtest IS test (same
    # object), so after the line above both sides carry 'predprob' and the
    # merge suffixes them _x/_y — which is why 'predprob_x' is read below.
    # TODO(review): confirm this still holds when dtest is not `test`.
    results = test.merge(dtest[['index_tf', 'predprob']], on='index_tf')
    print('AUC Score (Test): %f' % metrics.roc_auc_score(results['index'], results['predprob_x']))
    # Feature-importance bar chart. NOTE(review): alg.booster() is the
    # pre-0.90 xgboost API; newer releases use get_booster() — confirm the
    # installed version before upgrading.
    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    plt.show()
if __name__ == '__main__':
    # Hand-picked TF-IDF feature columns (Italian financial-report vocabulary).
    predictors = ["euro","cambio","fondo","migliaio","entita_geografica","dicembre","rischio","copertura","fair_value","extra_ue","prodotto","verso","strumento","credito","mercato","valuta","effetto","milione","europa","derivato","utile","controllato","incremento","asia","vendita","accantonamento","cambio_realizzare","spa","principalmente","svalutazione","totale","materia","acconto","impresa_sottoporre_controllo","commerciale","utilizzo","marchio","circa","tasso_cambio","eur"]
    # Baseline classifier. n_estimators is deliberately generous because
    # modelfit_TRTE's CV step shrinks it to the best round count.
    xgb1 = XGBClassifier(
        learning_rate =0.01,  # was 0.1
        n_estimators=5000,  # was 1000
        max_depth=3,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective= 'binary:logistic',  # binary target produced by replace_index
        nthread=1,
        scale_pos_weight=1,
        reg_alpha=1,  # L1 regularization
        seed=27)
    # Tune, fit and report using the module-level train/test split.
    modelfit_TRTE(xgb1, train, test, predictors)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement