Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- import numpy as np
- from sklearn import metrics
- import matplotlib as plt
- from xgboost.sklearn import XGBClassifier
- from matplotlib import pyplot as plt
- np.random.seed(0)
- values = pd.read_csv("/home/pierluigi/Scaricati/dataset_tf_idf_start.csv", sep=";")
- second = pd.read_csv("/home/pierluigi/Scaricati/index_tf.csv", sep=";")
def create_index_tf(row):
    """Return the numeric part of row['index'] with any _u/_t/_j marker stripped.

    Returns None (implicit fall-through preserved as an explicit return)
    when row['index'] is falsy, e.g. an empty string.
    """
    raw = row['index']
    if not raw:
        return None
    for marker in ("_u", "_t", "_j"):
        raw = raw.replace(marker, "")
    return int(raw)
- second['index_tf'] = second.apply (lambda row: create_index_tf (row),axis=1)
def replace_index(row):
    """Binarize the label: 1 when row['index'] carries a '_t' marker, else 0.

    The original code had four branches, three of which returned 0
    ('_u', '_j', and the default), so only the '_t' test is meaningful.
    The '_u'/'_j'-first precedence is preserved: an index containing one
    of those markers is 0 even if it also contains '_t'.
    """
    idx = row['index']
    if "_u" in idx or "_j" in idx:
        return 0
    return 1 if "_t" in idx else 0
# Attach the label table to the feature rows via the numeric join key,
# then overwrite 'index' with the binary target derived from its suffix.
df_last = values.join(second.set_index('index_tf'), on='index_tf')
df_last['index'] = df_last.apply(replace_index, axis=1)
from sklearn.model_selection import train_test_split

# 80/20 split; reproducible because np.random.seed(0) was set above
# (sklearn falls back to NumPy's global RNG when random_state is unset).
train, test = train_test_split(df_last, test_size=0.2)

# Feature matrix / target frame over the FULL dataset.
# NOTE(review): x and y are not used further in this script — the model
# below trains on `train`/`test` — but they are kept for interface
# compatibility with any external code importing this module.
x = df_last.drop('index', axis=1)
x = x.drop('index_tf', axis=1)
y = df_last[['index']]
# BUG FIX: reset_index returns a NEW frame; the original calls discarded
# the result, making them no-ops. drop=True avoids inserting the old
# index as a column (which would have collided with the 'index' label name).
x = x.reset_index(drop=True)
y = y.reset_index(drop=True)
import xgboost as xgb

# Re-seed before model fitting so the CV/fit phase is reproducible too.
np.random.seed(0)
# Name of the binary label column, read as a global by modelfit_TRTE below.
target="index"
def modelfit_TRTE(alg, dtrain, dtest, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Optionally tune n_estimators via xgb.cv, fit `alg`, and report train/test AUC.

    Args:
        alg: an XGBClassifier; mutated in place by set_params() and fit().
        dtrain: DataFrame with the predictor columns plus the label column.
        dtest: DataFrame with the predictor columns; gains a 'predprob'
            column as a side effect.
        predictors: list of feature column names to train on.
        useTrainCV: when True, cross-validate to pick the boosting-round count.
        cv_folds: number of CV folds.
        early_stopping_rounds: patience for xgb.cv early stopping.

    NOTE(review): reads the module-level globals `target`, `test`, `xgb`,
    `metrics`, `pd` and `plt`; the label column name is 'index' in some
    places and `target` in others — presumably always 'index', verify.
    """
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        xgtest = xgb.DMatrix(dtest[predictors].values)
        # Cross-validate up to the configured n_estimators rounds, stopping
        # early on AUC; the number of rows in cvresult is the selected
        # round count.
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='auc', early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])
    # Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain['index'], eval_metric='auc')
    # Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1]
    # Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(dtrain['index'].values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain['index'], dtrain_predprob))
    #Predict on testing data:
    #dtest.loc[:, 'predprob'] = alg.predict_proba(dtest[predictors])[:, 1]
    # Suppress pandas' SettingWithCopy machinery before mutating dtest.
    # NOTE(review): `is_copy` is deprecated/removed in newer pandas —
    # prefer .loc assignment (see commented line above) going forward.
    dtest.is_copy = False
    dtest['predprob'] = alg.predict_proba(dtest[predictors])[:, 1]
    # Merge against the GLOBAL `test`. In __main__, dtest IS test (same
    # object), so after the line above both sides carry 'predprob' and the
    # merge suffixes them _x/_y — which is why 'predprob_x' is read below.
    # TODO(review): confirm this still holds when dtest is not `test`.
    results = test.merge(dtest[['index_tf', 'predprob']], on='index_tf')
    print('AUC Score (Test): %f' % metrics.roc_auc_score(results['index'], results['predprob_x']))
    # Feature-importance bar chart. NOTE(review): alg.booster() is the
    # pre-0.90 xgboost API; newer releases use get_booster() — confirm the
    # installed version before upgrading.
    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    plt.show()
if __name__ == '__main__':
    # Hand-picked TF-IDF feature columns (Italian financial-report vocabulary).
    predictors = ["euro","cambio","fondo","migliaio","entita_geografica","dicembre","rischio","copertura","fair_value","extra_ue","prodotto","verso","strumento","credito","mercato","valuta","effetto","milione","europa","derivato","utile","controllato","incremento","asia","vendita","accantonamento","cambio_realizzare","spa","principalmente","svalutazione","totale","materia","acconto","impresa_sottoporre_controllo","commerciale","utilizzo","marchio","circa","tasso_cambio","eur"]
    # Baseline classifier. n_estimators is deliberately generous because
    # modelfit_TRTE's CV step shrinks it to the best round count.
    xgb1 = XGBClassifier(
        learning_rate =0.01,  # was 0.1
        n_estimators=5000,  # was 1000
        max_depth=3,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective= 'binary:logistic',  # binary target produced by replace_index
        nthread=1,
        scale_pos_weight=1,
        reg_alpha=1,  # L1 regularization
        seed=27)
    # Tune, fit and report using the module-level train/test split.
    modelfit_TRTE(xgb1, train, test, predictors)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement