Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- # coding: utf-8
- # # Imports
- #[2]:
- from tqdm import tqdm
- from numba import jit
- import numpy as np # linear algebra
- import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
- from sklearn.model_selection import train_test_split
- from sklearn.tree import DecisionTreeClassifier
- from sklearn.linear_model import LogisticRegression
- from sklearn.ensemble import RandomForestClassifier
- from sklearn.neighbors import KNeighborsClassifier
- from sklearn.metrics import precision_recall_curve
- import matplotlib.pyplot as plt
- import seaborn as sns
- import csv
- from sklearn import svm
- from sklearn import metrics
- import pprint
- from tqdm import tqdm_notebook
- from textwrap import wrap
- from sklearn.metrics import confusion_matrix
- from matplotlib import pyplot
- from queue import Queue
- import random
- import math
- from itertools import chain
- import pandas as pd
- import os
- import re
- import numpy as np
- import matplotlib.pyplot as plt
- def warn(*args, **kwargs):
- pass
- import warnings
- warnings.warn = warn
- import ast
- from sklearn.linear_model import SGDClassifier
- from sklearn.metrics import roc_curve, auc
- import matplotlib.pyplot as plt
- import scipy.stats as st
- import multiprocessing
- import multiprocess
- from joblib import Parallel, delayed
- from collections import OrderedDict, defaultdict
- from helpers.Plotter import *
- from helpers.DataSaver import *
- from helpers.BackwardsFeatureSelector import *
- from helpers.ForwardFeatureSelector import *
- from helpers.CorrelationFeatureSelector import *
- from helpers.GeneticFeatureSelector import *
- from helpers.VarianceFeatureSelector import *
- from helpers.InfoGainFeatureSelector import *
- from helpers.GiniImportanceFeatureSelector import *
- from helpers.ParamExpand import *
- from helpers.ParallelDataSaver import *
- from DataPreprocess import DataPreprocess
- from ModelUtils import *
- from Utils import *
- VersionCheck()
####GLOBALS#####
# Confusion-matrix-derived statistic column names present in the result CSVs.
STATS=["tpr","tnr","ppv","npv","fnr","fpr","fdr","for","ts","acc","ba","f1","mcc","bm","mk","tn","fp","fn","tp"]
# Command-line arguments: feature-selector name and input file (project helper from Utils).
FEATURE_SELECTOR,FILENAME=ParseArgs()
print(FEATURE_SELECTOR, FILENAME)
# NOTE(review): despite the name, this is a *list* of column labels, not a dict,
# and "acc" appears twice (first and tenth entry), so selecting these columns
# from a DataFrame duplicates the accuracy column. Confirm whether the duplicate
# is intentional before renaming/deduplicating.
stat_dict=[
    "acc",
    "tpr",
    "tnr",
    "ppv",
    "npv",
    "fnr",
    "fpr",
    "fdr",
    "for",
    "ts",
    "acc",
    "ba",
    "f1",
    "mcc",
    "bm",
    "mk",
    "tn",
    "fp",
    "fn",
    "tp"
]
# Slice used to strip the ".csv" extension from result file names, per classifier.
y_label= {
    "DT_Simple":np.s_[:-4],
    "SGD":np.s_[:-4],
    "SVC":np.s_[:-4],
    "RF":np.s_[:-4]
}
# Maps classifier short names (as used in the result folders) to sklearn estimator classes.
model_dict={
    "DT_Simple": DecisionTreeClassifier,
    "SGD":SGDClassifier,
    "SVC":svm.SVC,
    "RF": RandomForestClassifier
}
# Feature matrix X, label vector y and metadata from the project's preprocessing step.
X,y,metadata=DataPreprocess()
def pathtodata():
    """Read every CSV in the SVC results folder.

    Returns:
        tuple: (data_array, y_label_list, folder_name) where data_array is a
        list of DataFrames (one per CSV), y_label_list holds the file names
        with the ".csv" suffix stripped, and folder_name is the classifier
        short name ("SVC").
    """
    folder_name = "SVC"
    results_dir = os.path.join(".", "Latest Results", folder_name)
    data_array = []
    y_label_list = []
    for filename in os.listdir(results_dir):
        # BUG FIX: the original read the same hard-coded file
        # ("noFEATURESELECTION_svc.csv") for every entry in the directory;
        # read the actual file being iterated instead.
        data_array.append(pd.read_csv(os.path.join(results_dir, filename)))
        y_label_list.append(filename[:-4])  # strip ".csv"
    return data_array, y_label_list, folder_name
def extract(data_array, stat_cols=None):
    """Sort each result DataFrame by accuracy (descending) and pull out columns.

    Args:
        data_array: list of DataFrames, each with "accuracy", "params",
            "features" columns plus the statistic columns.
        stat_cols: column labels to extract as the statistics frame; defaults
            to the module-level ``stat_dict`` list (backward compatible).

    Returns:
        tuple: (params, features, status) — parallel lists of Series/DataFrames,
        one entry per input frame, each ordered best-accuracy-first.
    """
    # BUG FIX: the original declared `global stats_dict` — a typo for
    # `stat_dict` — which was both wrong and unnecessary (the global is only
    # read, never assigned).
    if stat_cols is None:
        stat_cols = stat_dict
    params = []
    features = []
    status = []
    for d in data_array:
        ranked = d.sort_values(["accuracy"], ascending=False).reset_index()
        params.append(ranked["params"])
        features.append(ranked["features"])
        status.append(ranked[stat_cols])
    return params, features, status
def modify(params, features, status, y_label_list, folder_name, top_n=1):
    """Convert the top-ranked result rows into a trainable format.

    Args:
        params: list of sequences of stringified parameter dicts (best first).
        features: list of sequences of stringified 0/1 feature masks such as
            "[1. 0. 1.]" (numpy array repr); parsed in place into float lists.
        status: list of DataFrames of statistic columns (best row first).
        y_label_list: feature-selector name for each result set.
        folder_name: classifier short name, stamped into each stats dict.
        top_n: how many of the best rows to keep per result set (default 1,
            matching the original behavior; requires >= top_n rows/entries).

    Returns:
        tuple: (featureslist, paramlist, statuslist) — parallel lists with
        top_n entries per input result set.
    """
    # BUG FIX (comments): the original comments claimed "extract 100 highest"
    # while the code kept exactly 1; the duplicated `if i < 1 ... else: break`
    # guards were also dead code and have been collapsed.
    featureslist = []
    paramlist = []
    statuslist = []
    for fi in features:
        # Parse every mask string into a list of floats, mutating in place
        # (the original also mutated the caller's sequences).
        for i, f in enumerate(fi):
            # "[1. 0. 1.]" -> strip brackets, drop '.'/',' then split:
            # "1." becomes "1", so float() yields the intended 1.0/0.0.
            cleaned = f[1:-1].replace(".", "").replace(",", "")
            fi[i] = [float(x) for x in cleaned.split()]
        for i, f in enumerate(fi):
            if i >= top_n:
                break
            featureslist.append(np.asarray(f))
    for p in params:
        for i, ps in enumerate(p):
            if i >= top_n:
                break
            # Params are stored as stringified dicts; literal_eval is the
            # safe way to parse them (no arbitrary code execution).
            paramlist.append(ast.literal_eval(ps))
    for j, si in enumerate(status):
        for i in range(top_n):
            row = si.iloc[i, :].to_dict()
            row["feature selector"] = y_label_list[j]
            row["classifier"] = folder_name
            statuslist.append(row)
    return featureslist, paramlist, statuslist
def model_evaluate(X, k, params, model):
    """Mean AUC-ROC of ``model(**params)`` over repeated k-group splits.

    Args:
        X: feature matrix (rows indexed by the split index arrays).
        k: number of test groups per run (size of the per-run AUC buffer).
        params: keyword arguments for the estimator constructor.
        model: sklearn-style estimator class with fit/predict_proba.

    Returns:
        float: AUC-ROC averaged first over the k groups of each run, then
        over RUNS independent runs.

    Relies on module globals: ``y`` (labels) and ``get_split`` (yields
    (group_index, test_indices, train_indices) for a given run).
    NOTE(review): for svm.SVC, predict_proba requires the estimator to be
    constructed with probability=True — confirm the saved params include it.
    """
    # BUG FIX: removed the dead `global STATS` declaration (STATS was never
    # used in this function).
    RUNS = 20
    runs_auc = np.zeros((RUNS,))
    for run in range(RUNS):
        aucroc = np.zeros((k,))
        # Iterate over all k groups of videos for this run.
        for i, split_test_indices, split_train_indices in get_split(run):
            clf = model(**params)  # fresh estimator per split
            clf.fit(X[split_train_indices, :], y[split_train_indices])
            # Probability of the positive class for the held-out rows.
            ypredict = clf.predict_proba(X[split_test_indices, :])[:, 1]
            fpr, tpr, _ = roc_curve(y[split_test_indices], ypredict)
            aucroc[i] = auc(fpr, tpr)
        # Mean AUC across the k groups of this run.
        runs_auc[run] = np.mean(aucroc)
    return np.mean(runs_auc)
def generate_X(X, f):
    """Return the columns of X whose entry in the binary mask f equals 1.

    Args:
        X: 2-D array-like supporting numpy column indexing.
        f: sequence of 0/1 (int or float) flags; only the first len(f)
           columns of X are considered, as in the original implementation.

    Returns:
        ndarray: X restricted to the selected columns.
    """
    # Vectorized replacement for the original index-list comprehension:
    # flatnonzero yields exactly the indices i < len(f) with f[i] == 1.
    selected = np.flatnonzero(np.asarray(f) == 1)
    return X[:, selected]
# Accumulates evaluation rows and writes them to SVC1_auc.csv (project helper class).
datasaver=DataSaver("SVC1_auc.csv")
def model_creation(X, paramlist, featureslist, statuslist):
    """Re-evaluate each saved top model on its feature subset and persist results.

    Args:
        X: full feature matrix.
        paramlist: estimator constructor kwargs, one dict per model.
        featureslist: binary feature masks, one per model.
        statuslist: stats dicts (must contain "classifier" keying model_dict).

    Returns:
        list[float]: the mean AUC of each evaluated model, in input order.

    Side effects: appends a row per model to the module-level ``datasaver``
    and saves the CSV after every model.
    """
    modelauc = []
    for i in tqdm(range(len(paramlist))):
        # Renamed local from `auc` — the original shadowed the imported
        # sklearn `auc` function inside this scope.
        mean_auc = model_evaluate(
            generate_X(X, featureslist[i]),
            5,
            paramlist[i],
            model_dict[statuslist[i]["classifier"]],
        )
        # BUG FIX: the original never appended to modelauc, so the function
        # always returned an empty list (and the script printed "[]").
        modelauc.append(mean_auc)
        row = {"auc": mean_auc, "params": paramlist[i], "features": featureslist[i]}
        row.update(statuslist[i])
        datasaver.add_data([row])
        datasaver.save_data()
    return modelauc
# --- Driver: load the saved SVC result CSVs, keep the best row per feature
# --- selector, and re-evaluate each best model with repeated k-fold AUC-ROC.
data_array,y_label_list,folder_name= pathtodata()
params,features, stats= extract(data_array)
featureslist,paramlist,statuslist = modify(params,features, stats,y_label_list,folder_name)
# Sanity-print the first model's provenance and the number of models to evaluate.
print(statuslist[0]["feature selector"])
print(statuslist[0]["classifier"])
print(len(statuslist))
modelauc= model_creation(X,paramlist,featureslist,statuslist)
print(modelauc)
Add Comment
Please, Sign In to add comment