Arham-4

Untitled

Nov 25th, 2021
674
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. from catboost import CatBoostClassifier
  2. from lightgbm import LGBMClassifier
  3. import xgboost as xgb
  4. from numpy import mean
  5. from numpy import std
  6. import pandas
  7. from sklearn.model_selection import train_test_split
  8. from sklearn.naive_bayes import MultinomialNB
  9. from sklearn.linear_model import LogisticRegression
  10. from sklearn.preprocessing import MinMaxScaler
  11. from sklearn.neighbors import KNeighborsClassifier
  12. from sklearn.svm import LinearSVC
  13. from sklearn.ensemble import BaggingClassifier
  14. from sklearn.ensemble import AdaBoostClassifier
  15. from sklearn.ensemble import RandomForestClassifier
  16. from sklearn.tree import DecisionTreeClassifier
  17. from sklearn.ensemble import VotingClassifier
  18. from sklearn.cluster import KMeans
  19. from sklearn.neighbors import KNeighborsClassifier
  20. from sklearn.model_selection import cross_val_score
  21. from sklearn.model_selection import RepeatedStratifiedKFold
  22. from sklearn.model_selection import RandomizedSearchCV
  23. from sklearn.metrics import accuracy_score
  24. import numpy
  25. from itertools import combinations
  26.  
  27. original_df = pandas.read_csv('train.nmv.csv')
  28. train_df = original_df.iloc[: , :-1]
  29. classes = original_df[original_df.columns[-1]].tolist()
  30.  
  31. scaler = MinMaxScaler()
  32. train_data = scaler.fit_transform(train_df)
  33.  
  34. sets = []
  35. sets.append(train_test_split(train_data, classes))
  36.  
# Disabled experiment: the bare triple-quoted string below is never executed.
# If re-enabled it would:
#   1. build the absolute correlation matrix of train_df and keep only its
#      strict upper triangle,
#   2. collect every column whose correlation with some other column in that
#      triangle exceeds 0.6,
#   3. drop those columns, rescale, and append a second split to `sets`.
# assumes column names look like 'a<N>' where N is the positional index of
# the column (the 'a' is stripped and the rest used as an index) - TODO confirm
# NOTE(review): the scaler here is fitted on all rows before the split.
'''
cor_matrix = train_df.corr().abs()
upper_tri = cor_matrix.where(numpy.triu(numpy.ones(cor_matrix.shape),k=1).astype(bool))
 
to_drop = [int(column.replace('a', '')) for column in upper_tri.columns if any(upper_tri[column] > 0.6)]
 
df1 = train_df.drop(train_df.columns[to_drop], axis=1)
scaler = MinMaxScaler()
train_data = scaler.fit_transform(df1)
 
sets.append(train_test_split(train_data, classes))
'''
  49.  
  50. iteration = 0
  51. for x_train, x_test, y_train, y_test in sets:
  52.     '''
  53.    mnb = MultinomialNB().fit(x_train, y_train)
  54.    print("(" + str(iteration) + ") score on MNB test: " + str(mnb.score(x_test, y_test)))
  55.    print("(" + str(iteration) + ") score on MNB train: "+ str(mnb.score(x_train, y_train)))
  56.    print("---")
  57.  
  58.    lr = LogisticRegression(max_iter=1000).fit(x_train, y_train)
  59.    print("(" + str(iteration) + ") score on LR test: " + str(lr.score(x_test, y_test)))
  60.    print("(" + str(iteration) + ") score on LR train: "+ str(lr.score(x_train, y_train)))
  61.    print("---")
  62.  
  63.    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
  64.    n_scores = cross_val_score(lr, x_test, y_test, scoring='accuracy', cv=cv, n_jobs=-1)
  65.    print('K-fold accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))
  66.    print("---")
  67.  
  68.    knn = KNeighborsClassifier(algorithm = 'brute', n_jobs=-1).fit(x_train, y_train)
  69.    print("(" + str(iteration) + ") score on KNN test: " + str(knn.score(x_test, y_test)))
  70.    print("(" + str(iteration) + ") score on KNN train: "+ str(knn.score(x_train, y_train)))
  71.    print("---")
  72.  
  73.    svm=  LinearSVC(C=0.0001).fit(x_train, y_train)
  74.    print("(" + str(iteration) + ") score on SVM test: " + str(svm.score(x_test, y_test)))
  75.    print("(" + str(iteration) + ") score on SVM train: "+ str(svm.score(x_train, y_train)))
  76.    print("---")
  77.  
  78.    clf = DecisionTreeClassifier().fit(x_train, y_train)
  79.    print("(" + str(iteration) + ") score on CLF test: " + str(clf.score(x_test, y_test)))
  80.    print("(" + str(iteration) + ") score on CLF train: "+ str(clf.score(x_train, y_train)))
  81.    print("---")
  82.  
  83.    bg = BaggingClassifier(DecisionTreeClassifier(),max_samples=0.5,max_features=1.0,n_estimators=10).fit(x_train, y_train)
  84.    print("(" + str(iteration) + ") score on BG test: " + str(bg.score(x_test, y_test)))
  85.    print("(" + str(iteration) + ") score on BG train: "+ str(bg.score(x_train, y_train)))
  86.    print("---")
  87.  
  88.    adb = AdaBoostClassifier(DecisionTreeClassifier(min_samples_split=10,max_depth=4),n_estimators=10,learning_rate=0.6).fit(x_train, y_train)
  89.    print("(" + str(iteration) + ") score on ADB test: " + str(adb.score(x_test, y_test)))
  90.    print("(" + str(iteration) + ") score on ADB train: "+ str(adb.score(x_train, y_train)))
  91.    print("---")
  92.    
  93.    clusters = 3
  94.    KNNClassifier = KNeighborsClassifier(n_neighbors = clusters).fit(x_train, y_train)
  95.    print("(" + str(iteration) + ") score on " + str(clusters) + "-NN test: " + str(KNNClassifier.score(x_test, y_test)))
  96.    print("(" + str(iteration) + ") score on " + str(clusters) + "-NN test: " + str(KNNClassifier.score(x_train, y_train)))
  97.    print("---")
  98.    '''
  99.  
  100.     rf = RandomForestClassifier(n_estimators=30, max_depth=9).fit(x_train, y_train)
  101.     print("(" + str(iteration) + ") score on RF test: " + str(rf.score(x_test, y_test)))
  102.     print("(" + str(iteration) + ") score on RF train: "+ str(rf.score(x_train, y_train)))
  103.     print("---")
  104.    
  105.     xg_reg = xgb.XGBRegressor().fit(x_train, y_train)
  106.    
  107.     y_pred = xg_reg.predict(x_test)
  108.     predictions = [round(value) for value in y_pred]
  109.     accuracy = accuracy_score(y_test, predictions)
  110.     print("(" + str(iteration) + ") score on XGB test: " + str(accuracy))
  111.    
  112.     y_pred = xg_reg.predict(x_train)
  113.     predictions = [round(value) for value in y_pred]
  114.     accuracy = accuracy_score(y_train, predictions)
  115.     print("(" + str(iteration) + ") score on XGB train: "+ str(accuracy))
  116.     print("---")
  117.    
  118.     model = LGBMClassifier().fit(x_train, y_train)
  119.     cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
  120.     n_scores = cross_val_score(model, x_test, y_test, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
  121.     n_scores_2 = cross_val_score(model, x_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
  122.     print("(" + str(iteration) + ") score on LGBM test: " + str(mean(n_scores)))
  123.     print("(" + str(iteration) + ") score on LGBM test: " + str(mean(n_scores_2)))
  124.     print("---")
  125.    
  126.     '''
  127.    #model = CatBoostClassifier(verbose=0, n_estimators=100).fit(x_train, y_train)
  128.    model = CatBoostClassifier().fit(x_train, y_train)
  129.    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
  130.    n_scores = cross_val_score(model, x_test, y_test, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
  131.    
  132.    model = CatBoostClassifier()
  133.    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
  134.    n_scores_2 = cross_val_score(model, x_test, y_test, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
  135.    print("(" + str(iteration) + ") score on Cat Boost test: " + str(mean(n_scores)))
  136.    print("(" + str(iteration) + ") score on Cat Boost test: " + str(mean(n_scores_2)))
  137.    print("---")
  138.    '''
  139.    
  140.     '''
  141.    # Number of trees in random forest
  142.    n_estimators = [int(x) for x in numpy.linspace(start = 200, stop = 2000, num = 10)]
  143.    # Number of features to consider at every split
  144.    max_features = ['auto', 'sqrt']
  145.    # Maximum number of levels in tree
  146.    max_depth = [int(x) for x in numpy.linspace(10, 110, num = 11)]
  147.    max_depth.append(None)
  148.    # Minimum number of samples required to split a node
  149.    min_samples_split = [2, 5, 10]
  150.    # Minimum number of samples required at each leaf node
  151.    min_samples_leaf = [1, 2, 4]
  152.    # Method of selecting samples for training each tree
  153.    bootstrap = [True, False]# Create the random grid
  154.    random_grid = {'n_estimators': n_estimators,
  155.                   'max_features': max_features,
  156.                   'max_depth': max_depth,
  157.                   'min_samples_split': min_samples_split,
  158.                   'min_samples_leaf': min_samples_leaf,
  159.                   'bootstrap': bootstrap}
  160.    
  161.    rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1).fit(x_train, y_train)
  162.    print("(" + str(iteration) + ") score on RF test random: " + str(rf.score(x_test, y_test)))
  163.    print("(" + str(iteration) + ") score on RF train random: "+ str(rf.score(x_train, y_train)))
  164.    print("Best params: " + str(rf_random.best_params_))
  165.    print("---")
  166.    '''
  167.    
  168.     iteration += 1
  169.  
# Disabled report: the bare triple-quoted string below is never executed.
# If re-enabled it would print the 100 largest absolute pairwise correlations
# between columns of train_df.
#   get_redundant_pairs      -- collects the diagonal and lower-triangle
#                               (column, column) pairs so that each unordered
#                               pair is reported only once.
#   get_top_abs_correlations -- unstacks the absolute correlation matrix,
#                               drops those redundant pairs, sorts descending
#                               and returns the first n entries.
'''
def get_redundant_pairs(df):
   # Get diagonal and lower triangular pairs of correlation matrix
   pairs_to_drop = set()
   cols = df.columns
   for i in range(0, df.shape[1]):
       for j in range(0, i+1):
           pairs_to_drop.add((cols[i], cols[j]))
   return pairs_to_drop
 
def get_top_abs_correlations(df, n=5):
   au_corr = df.corr().abs().unstack()
   labels_to_drop = get_redundant_pairs(df)
   au_corr = au_corr.drop(labels=labels_to_drop).sort_values(ascending=False)
   return au_corr[0:n]
 
print("Top Absolute Correlations")
print(get_top_abs_correlations(train_df, 100))
'''
  189.  
  190. '''
  191. algorithms = [('mnb',mnb), ('lr', lr), ('knn', knn), ('svm', svm), ('clf', clf), ('bg', bg), ('adb', adb), ('rf', rf)]
  192. highest_test = 0
  193. highest_train = 0
  194. highest_alg = []
  195. for i in range(2, len(algorithms) + 1):
  196.    combs = combinations(algorithms, i)
  197.    for comb in combs:
  198.        labels = []
  199.        for alg in comb:
  200.            labels.append(alg[0])
  201.        evc=VotingClassifier(estimators=list(comb),voting='hard').fit(x_train, y_train)
  202.        test_score = evc.score(x_test, y_test)
  203.        train_score = evc.score(x_train, y_train)
  204.        print("score on EVC test " + str(labels) + ": " + str(test_score))
  205.        print("score on EVC train " + str(labels) + ": "+ str())
  206.        print("---")
  207.        
  208.        if test_score > highest_test:
  209.            highest_test = test_score
  210.            highest_train = train_score
  211.            highest_alg = labels
  212.  
  213. print('The highest EVC combination was ' + str(highest_alg) + ' with ' + str(highest_test) + ' test accuracy and ' + str(highest_train) + ' train accuracy')
  214. '''
RAW Paste Data