#!/usr/bin/env python
# coding: utf-8

# # Imports

#[2]:
from tqdm import tqdm

from numba import jit
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
import seaborn as sns
import csv

from sklearn import svm
from sklearn import metrics
import pprint
from tqdm import tqdm_notebook
from textwrap import wrap
from sklearn.metrics import confusion_matrix
from matplotlib import pyplot
from queue import Queue

import random
import math
from itertools import chain

import os
import re

# Silence library warnings by replacing warnings.warn with a no-op.
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn


import ast
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_curve, auc
import scipy.stats as st
import multiprocessing
import multiprocess


from joblib import Parallel, delayed
from collections import OrderedDict, defaultdict

# Project-local helper modules.
from helpers.Plotter import *
from helpers.DataSaver import *
from helpers.BackwardsFeatureSelector import *
from helpers.ForwardFeatureSelector import *
from helpers.CorrelationFeatureSelector import *
from helpers.GeneticFeatureSelector import *
from helpers.VarianceFeatureSelector import *
from helpers.InfoGainFeatureSelector import *
from helpers.GiniImportanceFeatureSelector import *
from helpers.ParamExpand import *
from helpers.ParallelDataSaver import *
from DataPreprocess import DataPreprocess

from ModelUtils import *
from Utils import *


VersionCheck()




#### GLOBALS ####
STATS = ["tpr", "tnr", "ppv", "npv", "fnr", "fpr", "fdr", "for", "ts", "acc",
         "ba", "f1", "mcc", "bm", "mk", "tn", "fp", "fn", "tp"]


FEATURE_SELECTOR, FILENAME = ParseArgs()
print(FEATURE_SELECTOR, FILENAME)

# Columns holding the per-model statistics in the result csvs (same order as STATS).
stat_dict = [
        "tpr",
        "tnr",
        "ppv",
        "npv",
        "fnr",
        "fpr",
        "fdr",
        "for",
        "ts",
        "acc",
        "ba",
        "f1",
        "mcc",
        "bm",
        "mk",
        "tn",
        "fp",
        "fn",
        "tp"
]
# Slices that drop the 4-character ".csv" suffix from result filenames.
y_label = {
    "DT_Simple": np.s_[:-4],
    "SGD": np.s_[:-4],
    "SVC": np.s_[:-4],
    "RF": np.s_[:-4]
}
# Map classifier names (as used in the result folders) to their sklearn classes.
model_dict = {
    "DT_Simple": DecisionTreeClassifier,
    "SGD": SGDClassifier,
    "SVC": svm.SVC,
    "RF": RandomForestClassifier
}
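
# Illustrative only (not part of the original pipeline): model_dict lets a
# classifier be rebuilt from its name string plus a params dict, which is the
# pattern model_evaluate() uses further down. The hyperparameter values here
# are hypothetical placeholders, not values taken from the result files.
def _example_model_from_dict():
    dt = model_dict["DT_Simple"](max_depth=3)   # hypothetical hyperparameters
    sgd = model_dict["SGD"](alpha=1e-4)         # hypothetical hyperparameters
    return dt, sgd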


X, y, metadata = DataPreprocess()

def pathtodata():
    """Read every result csv file in the SVC results folder."""
    y_label_list = []
    data_array = []
    folder_name = "SVC"
    for filename in os.listdir("./Latest Results/SVC"):
        # load each result file and keep its name (without ".csv") as the label
        data_array.append(pd.read_csv(os.path.join("./Latest Results/SVC", filename)))
        y_label_list.append(filename[:-4])
    return data_array, y_label_list, folder_name

def extract(data_array):
    """Extract params, features and stats, sorted by highest accuracy."""
    global stat_dict
    params = []
    features = []
    status = []
    # sort every result csv by accuracy, best first
    for i, d in enumerate(data_array):
        d = d.sort_values(["accuracy"], ascending=False).reset_index()
        params.append(d["params"])
        features.append(d["features"])
        status.append(d[stat_dict])
    return params, features, status


def modify(params, features, status, y_label_list, folder_name):
    """Convert params, features and stats into the format needed for training."""
    featureslist = []
    paramlist = []
    statuslist = []
    # parse the feature-mask strings and keep only the top-ranked (highest-accuracy) row
    for fi in features:
        for i, f in enumerate(fi):
            # strings look like "[0. 1. 0. ...]" -> list of floats
            f = f[1:-1]
            f = f.replace(".", "").replace(",", "")
            f = f.split()
            f = [float(x) for x in f]
            fi[i] = f
        for i, f in enumerate(fi):
            if i < 1:
                featureslist.append(np.asarray(f))
            else:
                break

    # keep only the top-ranked parameter set of every feature selection
    for p in params:
        for i, ps in enumerate(p):
            if i < 1:
                paramlist.append(ast.literal_eval(ps))
            else:
                break

    # keep only the top-ranked stats row, tagged with its feature selector and classifier
    for j, si in enumerate(status):
        for i in range(0, 1):
            t = si.iloc[i, :].to_dict()
            t["feature selector"] = y_label_list[j]
            t["classifier"] = folder_name
            statuslist.append(t)

    return featureslist, paramlist, statuslist

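# Illustrative only (not part of the original pipeline): how a feature-mask
# string stored in the result csvs is expected to round-trip through the
# parsing in modify() above. The mask below is a made-up example.
def _example_parse_feature_mask():
    raw = "[1. 0. 1. 1. 0.]"                     # hypothetical csv cell content
    s = raw[1:-1].replace(".", "").replace(",", "")
    mask = [float(x) for x in s.split()]         # -> [1.0, 0.0, 1.0, 1.0, 0.0]
    return np.asarray(mask)
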
def model_evaluate(X, k, params, model):
    """Average ROC-AUC of model(**params) over RUNS repetitions of a k-fold split."""
    global STATS
    RUNS = 20
    runs_auc = np.zeros((RUNS,))

    for run in range(0, RUNS):
        aucroc = np.zeros((k,))
        # iterate over all k groups of videos
        for i, split_test_indices, split_train_indices in get_split(run):
            # fit the model on the training split
            clf = model(**params)
            clf.fit(X[split_train_indices, :], y[split_train_indices])

            # note: predict_proba requires the SVC params to include probability=True
            ypredict = clf.predict_proba(X[split_test_indices, :])[:, 1]
            fpr, tpr, thres = roc_curve(y[split_test_indices], ypredict)
            aucroc[i] = auc(fpr, tpr)

        # mean AUC of the run'th run over its k folds
        runs_auc[run] = np.mean(aucroc)

    return np.mean(runs_auc)

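# Assumption: get_split is provided by ModelUtils/Utils and is not shown here.
# model_evaluate() above expects it to yield, for a given run, k tuples of
# (fold_index, test_indices, train_indices). A minimal stand-in with that
# contract could look like the sketch below; the real helper may group by
# video rather than split purely at random.
def _example_get_split(run, n_samples=100, k=5):
    rng = np.random.RandomState(run)              # different shuffle per run
    order = rng.permutation(n_samples)
    folds = np.array_split(order, k)
    for i in range(k):
        test_idx = folds[i]
        train_idx = np.concatenate([folds[j] for j in range(k) if j != i])
        yield i, test_idx, train_idx
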
def generate_X(X, f):
    """Keep only the columns of X whose entry in the binary mask f is 1."""
    newX = X[:, [i for i in range(0, len(f)) if f[i] == 1]]
    return newX
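
# Illustrative usage of generate_X() with made-up data (not part of the
# original pipeline): a 1 in the mask keeps a column, a 0 drops it.
def _example_generate_X():
    X_demo = np.arange(12).reshape(3, 4)          # 3 samples, 4 features
    mask = np.asarray([1.0, 0.0, 1.0, 0.0])       # keep features 0 and 2
    return generate_X(X_demo, mask)               # shape (3, 2)
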
datasaver = DataSaver("SVC1_auc.csv")

def model_creation(X, paramlist, featureslist, statuslist):
    """Re-evaluate every selected model by ROC-AUC and save the results."""
    modelauc = []
    for i in tqdm(range(0, len(paramlist))):
        # avoid shadowing sklearn's auc() imported above
        auc_score = model_evaluate(generate_X(X, featureslist[i]), 5, paramlist[i],
                                   model_dict[statuslist[i]["classifier"]])
        result = {"auc": auc_score, "params": paramlist[i], "features": featureslist[i]}
        data = {**result, **statuslist[i]}
        datasaver.add_data([data])
        modelauc.append(auc_score)

    datasaver.save_data()
    return modelauc


data_array, y_label_list, folder_name = pathtodata()
params, features, stats = extract(data_array)
featureslist, paramlist, statuslist = modify(params, features, stats, y_label_list, folder_name)
print(statuslist[0]["feature selector"])
print(statuslist[0]["classifier"])
print(len(statuslist))
modelauc = model_creation(X, paramlist, featureslist, statuslist)
print(modelauc)