Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Standard library
from itertools import combinations

# Scientific-Python basics
import numpy
import pandas
from numpy import mean
from numpy import std

# Gradient-boosting libraries
import xgboost as xgb
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

# scikit-learn models, model selection and preprocessing
from sklearn.cluster import KMeans
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier  # was imported twice in the original
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
# --- Data loading and preprocessing -----------------------------------------
# Expects 'train.nmv.csv' in the working directory. All columns except the
# last are features; the last column is the class label.
original_df = pandas.read_csv('train.nmv.csv')
train_df = original_df.iloc[:, :-1]                      # feature matrix
classes = original_df[original_df.columns[-1]].tolist()  # label vector

# Scale every feature into [0, 1]; some models trained below (e.g.
# MultinomialNB in the disabled experiments) require non-negative inputs.
scaler = MinMaxScaler()
train_data = scaler.fit_transform(train_df)

# Each entry of `sets` is one (x_train, x_test, y_train, y_test) split.
sets = []
sets.append(train_test_split(train_data, classes))
# --- (disabled) correlation-based feature pruning ----------------------------
# Drops every feature whose absolute correlation with an earlier feature
# exceeds 0.6, then appends a second train/test split built from the reduced
# feature set. Originally parked in a module-level '''...''' string (which is
# evaluated and discarded at runtime); kept here as true comments instead.
# NOTE(review): assumes feature columns are named like 'a0', 'a1', ... so that
# stripping the 'a' prefix yields a positional index — confirm before enabling.
# cor_matrix = train_df.corr().abs()
# upper_tri = cor_matrix.where(numpy.triu(numpy.ones(cor_matrix.shape), k=1).astype(bool))
# to_drop = [int(column.replace('a', '')) for column in upper_tri.columns if any(upper_tri[column] > 0.6)]
# df1 = train_df.drop(train_df.columns[to_drop], axis=1)
# scaler = MinMaxScaler()
# train_data = scaler.fit_transform(df1)
# sets.append(train_test_split(train_data, classes))
# --- Model comparison --------------------------------------------------------
# For every train/test split in `sets`, fit a collection of classifiers and
# print their train/test accuracies. The large commented sections are earlier
# experiments that were disabled; they were '''...''' strings re-evaluated on
# every loop iteration in the original and are kept as comments here.
iteration = 0
for x_train, x_test, y_train, y_test in sets:
    # --- (disabled) baseline models: NB, LR, KNN, linear SVM, tree, bagging,
    # --- AdaBoost, k-NN with k=3 ---------------------------------------------
    # mnb = MultinomialNB().fit(x_train, y_train)
    # print("(" + str(iteration) + ") score on MNB test: " + str(mnb.score(x_test, y_test)))
    # print("(" + str(iteration) + ") score on MNB train: " + str(mnb.score(x_train, y_train)))
    # print("---")
    # lr = LogisticRegression(max_iter=1000).fit(x_train, y_train)
    # print("(" + str(iteration) + ") score on LR test: " + str(lr.score(x_test, y_test)))
    # print("(" + str(iteration) + ") score on LR train: " + str(lr.score(x_train, y_train)))
    # print("---")
    # cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    # n_scores = cross_val_score(lr, x_test, y_test, scoring='accuracy', cv=cv, n_jobs=-1)
    # print('K-fold accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))
    # print("---")
    # knn = KNeighborsClassifier(algorithm='brute', n_jobs=-1).fit(x_train, y_train)
    # print("(" + str(iteration) + ") score on KNN test: " + str(knn.score(x_test, y_test)))
    # print("(" + str(iteration) + ") score on KNN train: " + str(knn.score(x_train, y_train)))
    # print("---")
    # svm = LinearSVC(C=0.0001).fit(x_train, y_train)
    # print("(" + str(iteration) + ") score on SVM test: " + str(svm.score(x_test, y_test)))
    # print("(" + str(iteration) + ") score on SVM train: " + str(svm.score(x_train, y_train)))
    # print("---")
    # clf = DecisionTreeClassifier().fit(x_train, y_train)
    # print("(" + str(iteration) + ") score on CLF test: " + str(clf.score(x_test, y_test)))
    # print("(" + str(iteration) + ") score on CLF train: " + str(clf.score(x_train, y_train)))
    # print("---")
    # bg = BaggingClassifier(DecisionTreeClassifier(), max_samples=0.5, max_features=1.0, n_estimators=10).fit(x_train, y_train)
    # print("(" + str(iteration) + ") score on BG test: " + str(bg.score(x_test, y_test)))
    # print("(" + str(iteration) + ") score on BG train: " + str(bg.score(x_train, y_train)))
    # print("---")
    # adb = AdaBoostClassifier(DecisionTreeClassifier(min_samples_split=10, max_depth=4), n_estimators=10, learning_rate=0.6).fit(x_train, y_train)
    # print("(" + str(iteration) + ") score on ADB test: " + str(adb.score(x_test, y_test)))
    # print("(" + str(iteration) + ") score on ADB train: " + str(adb.score(x_train, y_train)))
    # print("---")
    # clusters = 3
    # KNNClassifier = KNeighborsClassifier(n_neighbors=clusters).fit(x_train, y_train)
    # print("(" + str(iteration) + ") score on " + str(clusters) + "-NN test: " + str(KNNClassifier.score(x_test, y_test)))
    # print("(" + str(iteration) + ") score on " + str(clusters) + "-NN train: " + str(KNNClassifier.score(x_train, y_train)))  # label fixed: said "test" for the train score
    # print("---")

    # Random forest: few, shallow trees to limit overfitting.
    rf = RandomForestClassifier(n_estimators=30, max_depth=9).fit(x_train, y_train)
    print("(" + str(iteration) + ") score on RF test: " + str(rf.score(x_test, y_test)))
    print("(" + str(iteration) + ") score on RF train: " + str(rf.score(x_train, y_train)))
    print("---")

    # XGBoost in regression mode; predictions are rounded to the nearest
    # integer before accuracy scoring. NOTE(review): only sensible while the
    # class labels are consecutive integers — XGBClassifier would be the
    # cleaner choice; not changed here to preserve behavior.
    xg_reg = xgb.XGBRegressor().fit(x_train, y_train)
    y_pred = xg_reg.predict(x_test)
    predictions = [round(value) for value in y_pred]
    accuracy = accuracy_score(y_test, predictions)
    print("(" + str(iteration) + ") score on XGB test: " + str(accuracy))
    y_pred = xg_reg.predict(x_train)
    predictions = [round(value) for value in y_pred]
    accuracy = accuracy_score(y_train, predictions)
    print("(" + str(iteration) + ") score on XGB train: " + str(accuracy))
    print("---")

    # LightGBM scored via repeated stratified 10-fold CV on each partition.
    model = LGBMClassifier().fit(x_train, y_train)
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    n_scores = cross_val_score(model, x_test, y_test, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    n_scores_2 = cross_val_score(model, x_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    print("(" + str(iteration) + ") score on LGBM test: " + str(mean(n_scores)))
    # BUG FIX: the original printed "LGBM test" for the train-partition score.
    print("(" + str(iteration) + ") score on LGBM train: " + str(mean(n_scores_2)))
    print("---")

    # --- (disabled) CatBoost experiments -------------------------------------
    # model = CatBoostClassifier(verbose=0, n_estimators=100).fit(x_train, y_train)
    # model = CatBoostClassifier().fit(x_train, y_train)
    # cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    # n_scores = cross_val_score(model, x_test, y_test, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    # model = CatBoostClassifier()
    # cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    # n_scores_2 = cross_val_score(model, x_test, y_test, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    # print("(" + str(iteration) + ") score on Cat Boost test: " + str(mean(n_scores)))
    # print("(" + str(iteration) + ") score on Cat Boost train: " + str(mean(n_scores_2)))  # label fixed: said "test" twice
    # print("---")

    # --- (disabled) randomized hyper-parameter search for the random forest --
    # n_estimators = [int(x) for x in numpy.linspace(start=200, stop=2000, num=10)]
    # max_features = ['auto', 'sqrt']
    # max_depth = [int(x) for x in numpy.linspace(10, 110, num=11)]
    # max_depth.append(None)
    # min_samples_split = [2, 5, 10]
    # min_samples_leaf = [1, 2, 4]
    # bootstrap = [True, False]
    # random_grid = {'n_estimators': n_estimators,
    #                'max_features': max_features,
    #                'max_depth': max_depth,
    #                'min_samples_split': min_samples_split,
    #                'min_samples_leaf': min_samples_leaf,
    #                'bootstrap': bootstrap}
    # rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1).fit(x_train, y_train)
    # print("(" + str(iteration) + ") score on RF test random: " + str(rf.score(x_test, y_test)))
    # print("(" + str(iteration) + ") score on RF train random: " + str(rf.score(x_train, y_train)))
    # print("Best params: " + str(rf_random.best_params_))
    # print("---")

    iteration += 1
# --- (disabled) correlation inspection helpers -------------------------------
# One-off utilities that printed the 100 most strongly correlated feature
# pairs of `train_df`. Originally parked in a module-level '''...''' string;
# kept as true comments instead.
# def get_redundant_pairs(df):
#     """Return the diagonal and lower-triangular index pairs of df.corr()."""
#     pairs_to_drop = set()
#     cols = df.columns
#     for i in range(0, df.shape[1]):
#         for j in range(0, i + 1):
#             pairs_to_drop.add((cols[i], cols[j]))
#     return pairs_to_drop
#
# def get_top_abs_correlations(df, n=5):
#     """Return the n largest absolute pairwise correlations in df."""
#     au_corr = df.corr().abs().unstack()
#     labels_to_drop = get_redundant_pairs(df)
#     au_corr = au_corr.drop(labels=labels_to_drop).sort_values(ascending=False)
#     return au_corr[0:n]
#
# print("Top Absolute Correlations")
# print(get_top_abs_correlations(train_df, 100))
# --- (disabled) exhaustive voting-ensemble search ----------------------------
# Tries every combination (size >= 2) of the fitted base models in a
# hard-voting ensemble and reports the best test accuracy. Requires the
# disabled baseline models (mnb, lr, knn, ...) to have been fitted first, so
# it cannot run as-is. Originally a '''...''' string; kept as comments.
# algorithms = [('mnb', mnb), ('lr', lr), ('knn', knn), ('svm', svm), ('clf', clf), ('bg', bg), ('adb', adb), ('rf', rf)]
# highest_test = 0
# highest_train = 0
# highest_alg = []
# for i in range(2, len(algorithms) + 1):
#     combs = combinations(algorithms, i)
#     for comb in combs:
#         labels = [alg[0] for alg in comb]
#         evc = VotingClassifier(estimators=list(comb), voting='hard').fit(x_train, y_train)
#         test_score = evc.score(x_test, y_test)
#         train_score = evc.score(x_train, y_train)
#         print("score on EVC test " + str(labels) + ": " + str(test_score))
#         # BUG FIX: the original printed str() — the empty string — here
#         # instead of the train score.
#         print("score on EVC train " + str(labels) + ": " + str(train_score))
#         print("---")
#         if test_score > highest_test:
#             highest_test = test_score
#             highest_train = train_score
#             highest_alg = labels
# print('The highest EVC combination was ' + str(highest_alg) + ' with ' + str(highest_test) + ' test accuracy and ' + str(highest_train) + ' train accuracy')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement