Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Standard library
from itertools import combinations

# Scientific-Python basics
import numpy
import pandas
from numpy import mean
from numpy import std

# Gradient-boosting libraries
import xgboost as xgb
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

# scikit-learn models, model selection and preprocessing
from sklearn.cluster import KMeans
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier  # was imported twice in the original
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
# --- Data loading and preprocessing -----------------------------------------
# Expects 'train.nmv.csv' in the working directory. All columns except the
# last are features; the last column is the class label.
original_df = pandas.read_csv('train.nmv.csv')
train_df = original_df.iloc[:, :-1]                      # feature matrix
classes = original_df[original_df.columns[-1]].tolist()  # label vector

# Scale every feature into [0, 1]; some models trained below (e.g.
# MultinomialNB in the disabled experiments) require non-negative inputs.
scaler = MinMaxScaler()
train_data = scaler.fit_transform(train_df)

# Each entry of `sets` is one (x_train, x_test, y_train, y_test) split.
sets = []
sets.append(train_test_split(train_data, classes))
# --- (disabled) correlation-based feature pruning ----------------------------
# Drops every feature whose absolute correlation with an earlier feature
# exceeds 0.6, then appends a second train/test split built from the reduced
# feature set. Originally parked in a module-level '''...''' string (which is
# evaluated and discarded at runtime); kept here as true comments instead.
# NOTE(review): assumes feature columns are named like 'a0', 'a1', ... so that
# stripping the 'a' prefix yields a positional index — confirm before enabling.
# cor_matrix = train_df.corr().abs()
# upper_tri = cor_matrix.where(numpy.triu(numpy.ones(cor_matrix.shape), k=1).astype(bool))
# to_drop = [int(column.replace('a', '')) for column in upper_tri.columns if any(upper_tri[column] > 0.6)]
# df1 = train_df.drop(train_df.columns[to_drop], axis=1)
# scaler = MinMaxScaler()
# train_data = scaler.fit_transform(df1)
# sets.append(train_test_split(train_data, classes))
# --- Model comparison --------------------------------------------------------
# For every train/test split in `sets`, fit a collection of classifiers and
# print their train/test accuracies. The large commented sections are earlier
# experiments that were disabled; they were '''...''' strings re-evaluated on
# every loop iteration in the original and are kept as comments here.
iteration = 0
for x_train, x_test, y_train, y_test in sets:
    # --- (disabled) baseline models: NB, LR, KNN, linear SVM, tree, bagging,
    # --- AdaBoost, k-NN with k=3 ---------------------------------------------
    # mnb = MultinomialNB().fit(x_train, y_train)
    # print("(" + str(iteration) + ") score on MNB test: " + str(mnb.score(x_test, y_test)))
    # print("(" + str(iteration) + ") score on MNB train: " + str(mnb.score(x_train, y_train)))
    # print("---")
    # lr = LogisticRegression(max_iter=1000).fit(x_train, y_train)
    # print("(" + str(iteration) + ") score on LR test: " + str(lr.score(x_test, y_test)))
    # print("(" + str(iteration) + ") score on LR train: " + str(lr.score(x_train, y_train)))
    # print("---")
    # cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    # n_scores = cross_val_score(lr, x_test, y_test, scoring='accuracy', cv=cv, n_jobs=-1)
    # print('K-fold accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))
    # print("---")
    # knn = KNeighborsClassifier(algorithm='brute', n_jobs=-1).fit(x_train, y_train)
    # print("(" + str(iteration) + ") score on KNN test: " + str(knn.score(x_test, y_test)))
    # print("(" + str(iteration) + ") score on KNN train: " + str(knn.score(x_train, y_train)))
    # print("---")
    # svm = LinearSVC(C=0.0001).fit(x_train, y_train)
    # print("(" + str(iteration) + ") score on SVM test: " + str(svm.score(x_test, y_test)))
    # print("(" + str(iteration) + ") score on SVM train: " + str(svm.score(x_train, y_train)))
    # print("---")
    # clf = DecisionTreeClassifier().fit(x_train, y_train)
    # print("(" + str(iteration) + ") score on CLF test: " + str(clf.score(x_test, y_test)))
    # print("(" + str(iteration) + ") score on CLF train: " + str(clf.score(x_train, y_train)))
    # print("---")
    # bg = BaggingClassifier(DecisionTreeClassifier(), max_samples=0.5, max_features=1.0, n_estimators=10).fit(x_train, y_train)
    # print("(" + str(iteration) + ") score on BG test: " + str(bg.score(x_test, y_test)))
    # print("(" + str(iteration) + ") score on BG train: " + str(bg.score(x_train, y_train)))
    # print("---")
    # adb = AdaBoostClassifier(DecisionTreeClassifier(min_samples_split=10, max_depth=4), n_estimators=10, learning_rate=0.6).fit(x_train, y_train)
    # print("(" + str(iteration) + ") score on ADB test: " + str(adb.score(x_test, y_test)))
    # print("(" + str(iteration) + ") score on ADB train: " + str(adb.score(x_train, y_train)))
    # print("---")
    # clusters = 3
    # KNNClassifier = KNeighborsClassifier(n_neighbors=clusters).fit(x_train, y_train)
    # print("(" + str(iteration) + ") score on " + str(clusters) + "-NN test: " + str(KNNClassifier.score(x_test, y_test)))
    # print("(" + str(iteration) + ") score on " + str(clusters) + "-NN train: " + str(KNNClassifier.score(x_train, y_train)))  # label fixed: said "test" for the train score
    # print("---")

    # Random forest: few, shallow trees to limit overfitting.
    rf = RandomForestClassifier(n_estimators=30, max_depth=9).fit(x_train, y_train)
    print("(" + str(iteration) + ") score on RF test: " + str(rf.score(x_test, y_test)))
    print("(" + str(iteration) + ") score on RF train: " + str(rf.score(x_train, y_train)))
    print("---")

    # XGBoost in regression mode; predictions are rounded to the nearest
    # integer before accuracy scoring. NOTE(review): only sensible while the
    # class labels are consecutive integers — XGBClassifier would be the
    # cleaner choice; not changed here to preserve behavior.
    xg_reg = xgb.XGBRegressor().fit(x_train, y_train)
    y_pred = xg_reg.predict(x_test)
    predictions = [round(value) for value in y_pred]
    accuracy = accuracy_score(y_test, predictions)
    print("(" + str(iteration) + ") score on XGB test: " + str(accuracy))
    y_pred = xg_reg.predict(x_train)
    predictions = [round(value) for value in y_pred]
    accuracy = accuracy_score(y_train, predictions)
    print("(" + str(iteration) + ") score on XGB train: " + str(accuracy))
    print("---")

    # LightGBM scored via repeated stratified 10-fold CV on each partition.
    model = LGBMClassifier().fit(x_train, y_train)
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    n_scores = cross_val_score(model, x_test, y_test, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    n_scores_2 = cross_val_score(model, x_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    print("(" + str(iteration) + ") score on LGBM test: " + str(mean(n_scores)))
    # BUG FIX: the original printed "LGBM test" for the train-partition score.
    print("(" + str(iteration) + ") score on LGBM train: " + str(mean(n_scores_2)))
    print("---")

    # --- (disabled) CatBoost experiments -------------------------------------
    # model = CatBoostClassifier(verbose=0, n_estimators=100).fit(x_train, y_train)
    # model = CatBoostClassifier().fit(x_train, y_train)
    # cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    # n_scores = cross_val_score(model, x_test, y_test, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    # model = CatBoostClassifier()
    # cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    # n_scores_2 = cross_val_score(model, x_test, y_test, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    # print("(" + str(iteration) + ") score on Cat Boost test: " + str(mean(n_scores)))
    # print("(" + str(iteration) + ") score on Cat Boost train: " + str(mean(n_scores_2)))  # label fixed: said "test" twice
    # print("---")

    # --- (disabled) randomized hyper-parameter search for the random forest --
    # n_estimators = [int(x) for x in numpy.linspace(start=200, stop=2000, num=10)]
    # max_features = ['auto', 'sqrt']
    # max_depth = [int(x) for x in numpy.linspace(10, 110, num=11)]
    # max_depth.append(None)
    # min_samples_split = [2, 5, 10]
    # min_samples_leaf = [1, 2, 4]
    # bootstrap = [True, False]
    # random_grid = {'n_estimators': n_estimators,
    #                'max_features': max_features,
    #                'max_depth': max_depth,
    #                'min_samples_split': min_samples_split,
    #                'min_samples_leaf': min_samples_leaf,
    #                'bootstrap': bootstrap}
    # rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1).fit(x_train, y_train)
    # print("(" + str(iteration) + ") score on RF test random: " + str(rf.score(x_test, y_test)))
    # print("(" + str(iteration) + ") score on RF train random: " + str(rf.score(x_train, y_train)))
    # print("Best params: " + str(rf_random.best_params_))
    # print("---")

    iteration += 1
# --- (disabled) correlation inspection helpers -------------------------------
# One-off utilities that printed the 100 most strongly correlated feature
# pairs of `train_df`. Originally parked in a module-level '''...''' string;
# kept as true comments instead.
# def get_redundant_pairs(df):
#     """Return the diagonal and lower-triangular index pairs of df.corr()."""
#     pairs_to_drop = set()
#     cols = df.columns
#     for i in range(0, df.shape[1]):
#         for j in range(0, i + 1):
#             pairs_to_drop.add((cols[i], cols[j]))
#     return pairs_to_drop
#
# def get_top_abs_correlations(df, n=5):
#     """Return the n largest absolute pairwise correlations in df."""
#     au_corr = df.corr().abs().unstack()
#     labels_to_drop = get_redundant_pairs(df)
#     au_corr = au_corr.drop(labels=labels_to_drop).sort_values(ascending=False)
#     return au_corr[0:n]
#
# print("Top Absolute Correlations")
# print(get_top_abs_correlations(train_df, 100))
# --- (disabled) exhaustive voting-ensemble search ----------------------------
# Tries every combination (size >= 2) of the fitted base models in a
# hard-voting ensemble and reports the best test accuracy. Requires the
# disabled baseline models (mnb, lr, knn, ...) to have been fitted first, so
# it cannot run as-is. Originally a '''...''' string; kept as comments.
# algorithms = [('mnb', mnb), ('lr', lr), ('knn', knn), ('svm', svm), ('clf', clf), ('bg', bg), ('adb', adb), ('rf', rf)]
# highest_test = 0
# highest_train = 0
# highest_alg = []
# for i in range(2, len(algorithms) + 1):
#     combs = combinations(algorithms, i)
#     for comb in combs:
#         labels = [alg[0] for alg in comb]
#         evc = VotingClassifier(estimators=list(comb), voting='hard').fit(x_train, y_train)
#         test_score = evc.score(x_test, y_test)
#         train_score = evc.score(x_train, y_train)
#         print("score on EVC test " + str(labels) + ": " + str(test_score))
#         # BUG FIX: the original printed str() — the empty string — here
#         # instead of the train score.
#         print("score on EVC train " + str(labels) + ": " + str(train_score))
#         print("---")
#         if test_score > highest_test:
#             highest_test = test_score
#             highest_train = train_score
#             highest_alg = labels
# print('The highest EVC combination was ' + str(highest_alg) + ' with ' + str(highest_test) + ' test accuracy and ' + str(highest_train) + ' train accuracy')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement