Dataset_dataset

import pandas
import numpy as np
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import KFold
from sklearn.feature_selection import SelectPercentile, chi2
import warnings

# Suppress library warnings so the comma-separated output below stays clean.
warnings.filterwarnings('ignore')


def split_input_output(dataset):
    # The last column is the class label; everything before it is the feature matrix.
    return (dataset[:, :-1], dataset[:, -1])


def getAUC(model, testX, testY):
    # Multiclass: binarize the true and predicted labels, then score them directly.
    if len(set(testY)) > 2:
        lb = LabelBinarizer()
        lb.fit(testY)
        y_test = lb.transform(testY)
        y_pred = lb.transform(model.predict(testX))
        return round(metrics.roc_auc_score(y_test, y_pred) * 100, 2)
    # Binary: use the positive-class probability for a proper ROC AUC.
    return round(metrics.roc_auc_score(testY, model.predict_proba(testX)[:, 1]) * 100, 2)


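# Note (added): getAUC above is defined but never called in the script below.
# A minimal sketch of how it could be wired into the k-fold loop, reusing the
# fitted `clf` and the fold's testX/testY; this usage is an assumption, not
# part of the original paste:
#
#     auc_sum = 0
#     ...
#     auc_sum += getAUC(clf, testX, testY)   # after clf.fit(trainX, trainY)
#     ...
#     print(auc_sum / number_of_folds)       # after the fold loop

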
def scale(x):
    # Rescale features into [0, 1]; chi2 feature selection below requires
    # non-negative inputs, and SVC benefits from scaled features.
    X, y = split_input_output(x)
    X = MinMaxScaler(feature_range=(0, 1)).fit_transform(X)
    return np.concatenate((X, y[:, None]), axis=1)


def split(dataset):
    # Build sub-datasets that keep the top i% of features by chi2 score,
    # for i = step, 2*step, ..., 100.
    sub_datasets = list()
    X, y = split_input_output(dataset)
    number_of_features = len(X[0])
    # Use a coarser percentile step when there are few features to choose from.
    if number_of_features < 10:
        step = 20
    elif number_of_features < 20:
        step = 10
    else:
        step = 5
    for i in range(step, 100 + 1, step):
        temp = SelectPercentile(chi2, percentile=i).fit_transform(X, y)
        sub_datasets.append(np.concatenate((temp, y[:, None]), axis=1))
    return sub_datasets


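# Worked example (added note): with 9 features, step is 20, so the loop above
# builds sub-datasets at percentiles 20, 40, 60, 80, 100 (5 subsets); with 25
# features, step is 5, giving 20 subsets. The percentile-100 subset keeps the
# full feature set.

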
def ratio_of_types(dataset, number_of_features):
    # Heuristic column typing: exactly 2 unique values -> binary, otherwise
    # fewer than 15 -> categorical, else numeric. Ratios are taken over the
    # full feature count of the original dataset so they remain comparable
    # across sub-datasets.
    binary = categorical = numeric = 0
    for i in range(len(dataset[0])):
        col = dataset[:, i]
        unique_value_number = len(set(col))
        if unique_value_number == 2:
            binary += 1
        elif unique_value_number < 15:
            categorical += 1
        else:
            numeric += 1
    return (binary / number_of_features, categorical / number_of_features, numeric / number_of_features)


def find_Classification_type(y):
    # 'binary' for two-class targets, 'micro' (micro-averaged) for multiclass.
    if len(set(y)) == 2:
        return 'binary'
    return 'micro'


dataset_paths = ['occupancy_data/occupancy.csv', "UCI HAR Dataset/train/HAR.csv", "bupa.csv", "diabetes.csv",
                 "hepatitis.csv", "mushroom.csv", "new-thyroid.data", "parkinsons.data", "phishing.csv",
                 "winconcin_breast_cancer.csv"]
dataset_paths = ["Datasets/" + x for x in dataset_paths]

for dataset_path in dataset_paths:
    # Print the dataset name, then one meta-feature row per (sub-dataset, classifier) pair.
    print(dataset_path.split("/")[-1].split(".")[0])
    number_of_folds = 10
    dataset = scale(pandas.read_csv(dataset_path, header=None).values)
    number_of_features = len(dataset[0]) - 1
    dataset_size = len(dataset)
    binary_in_dataset, categorical_in_dataset, numeric_in_dataset = ratio_of_types(dataset[:, :-1], number_of_features)
    sub_datasets = split(dataset)
    for sub_dataset in sub_datasets:  # renamed from `dataset` to avoid shadowing the full dataset
        X, Y = split_input_output(sub_dataset)
        classification_type = find_Classification_type(Y)  # computed but not used below
        for classifier_type in range(4):
            kf = KFold(n_splits=number_of_folds, shuffle=True)
            accuracy_sum = 0  # renamed from `sum`, which shadowed the builtin
            for train_index, test_index in kf.split(X):
                trainX, testX = X[train_index], X[test_index]
                trainY, testY = Y[train_index], Y[test_index]
                if classifier_type == 0:
                    clf = DecisionTreeClassifier(criterion="entropy")
                elif classifier_type == 1:
                    clf = RandomForestClassifier(n_estimators=100)
                elif classifier_type == 2:
                    clf = GaussianNB()
                else:
                    clf = SVC(probability=True, gamma='auto')
                clf = clf.fit(trainX, trainY)
                y_predict = clf.predict(testX)
                accuracy_sum += round(metrics.accuracy_score(testY, y_predict), 2)
            # One comma-separated row: dataset size, type ratios of the full dataset,
            # selected-feature count and ratio, type ratios of the sub-dataset,
            # classifier id, mean fold accuracy.
            print(dataset_size, end=',')
            print(binary_in_dataset, end=',')
            print(categorical_in_dataset, end=',')
            print(numeric_in_dataset, end=',')
            print(len(X[0]), end=',')
            print(len(X[0]) / number_of_features, end=',')
            for x in ratio_of_types(X, number_of_features):
                print(x, end=',')
            print(classifier_type, end=',')
            print(accuracy_sum / number_of_folds)
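
# Note (added): each row printed above is one example for a meta-dataset (hence
# the paste title "Dataset_dataset"): dataset size, the three type ratios of the
# full dataset, the selected-feature count and ratio, the three type ratios of
# the sub-dataset, the classifier id (0-3), and the mean 10-fold accuracy.
# A hypothetical header line, with column names of my own choosing, could be
# printed once before the outer loop:
#
#     print("size,binary_ratio,categorical_ratio,numeric_ratio,"
#           "n_selected,selected_ratio,sub_binary_ratio,sub_categorical_ratio,"
#           "sub_numeric_ratio,classifier_id,mean_accuracy")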