Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # Functions
def get_dataset_count_of_missing_values(dataset):
    """Return a DataFrame of the columns of `dataset` that contain missing values.

    The result has one row per affected column, with the absolute NaN count
    ('Missing Values') and the percentage of rows affected
    ('% of Total Values'), sorted by percentage descending and rounded to
    one decimal. Also prints a short summary.
    """
    # Count NaNs once and derive the percentage from that result (the
    # original called dataset.isnull().sum() twice, scanning the frame again).
    mis_val = dataset.isnull().sum()
    mis_val_percent = 100 * mis_val / len(dataset)
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    mis_val_table_ren_columns = mis_val_table.rename(
        columns={0: 'Missing Values', 1: '% of Total Values'})
    # Keep only the columns that actually have missing values.
    mis_val_table_ren_columns = mis_val_table_ren_columns[
        mis_val_table_ren_columns.iloc[:, 1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
    print("Your selected dataframe has " + str(dataset.shape[1]) + " columns.\n"
          "There are " + str(mis_val_table_ren_columns.shape[0]) +
          " columns that have missing values.")
    return mis_val_table_ren_columns
def remove_columns_with_missing_values_inplace(dataset, missing_values_table, threshold):
    """Drop, in place, every column of `dataset` whose missing-value count
    exceeds `threshold`.

    `missing_values_table` is expected to be the frame returned by
    get_dataset_count_of_missing_values: indexed by column name, with a
    'Missing Values' count column. Prints each dropped column name.
    """
    # Collect the offending columns first, then drop them in a single call
    # (the original mutated the dataset once per column inside the loop).
    to_drop = [row.name for _, row in missing_values_table.iterrows()
               if row['Missing Values'] > threshold]
    for name in to_drop:
        print(name)  # keep the original per-column trace output
    dataset.drop(columns=to_drop, inplace=True)
# Libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Widen pandas display limits so large frames print without truncation.
pd.set_option('display.max_rows', 2500)
pd.set_option('display.max_columns', 2500)
pd.set_option('display.width', 2000)

# Importing the dataset
dataset = pd.read_csv('patients_without_empty_rows.csv')
# Drop the first column -- presumably the index column written by a previous
# to_csv() call (TODO confirm against the CSV).
dataset.drop(dataset.columns[[0]], axis=1, inplace=True)
#dataset.to_csv("patients_without_empty_rows.csv")
#print(dataset.isna().sum())

# Handle missing data -- one-off preprocessing steps, kept commented out
# after the cleaned CSVs were written.
#missing_values_table = get_dataset_count_of_missing_values(dataset)
#remove_columns_with_missing_values_inplace(dataset, missing_values_table, threshold = 160000)
#print(missing_values_table)
#dataset.to_csv("patients_without_nan.csv")
#dataset.dropna(inplace=True)
#dataset.to_csv("patients_without_empty_rows.csv")
# Build the feature matrix (every column except the target) and the target
# vector. NOTE(review): y is taken by position (column 11), which is assumed
# to be "disposition" -- confirm against the CSV layout.
X = dataset.iloc[:, dataset.columns != "disposition"]
y = dataset.iloc[:, 11]

# Encoding categorical data
# Encoding the Independent Variable
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Categorical column positions (column 0 plus columns 3..12, as in the
# original's separate col-0 step and range(3, 13) loop).
categorical_idx = [0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
for i in categorical_idx:
    labelencoder_X = LabelEncoder()
    X.iloc[:, i] = labelencoder_X.fit_transform(X.iloc[:, i])

# The original passed categorical_features= to OneHotEncoder and called
# .toarray() on the result. categorical_features was removed from
# scikit-learn (0.22+), and with sparse output disabled fit_transform already
# returns a dense ndarray, so .toarray() raises AttributeError. Use a
# ColumnTransformer to one-hot only the categorical columns instead.
# (onehotencoder.feature_indices_ was also removed; that print is dropped.)
onehotencoder = ColumnTransformer(
    [('onehot', OneHotEncoder(), categorical_idx)],
    remainder='passthrough')
X_fitted = onehotencoder.fit_transform(X)
if hasattr(X_fitted, 'toarray'):  # densify only if a sparse matrix came back
    X_fitted = X_fitted.toarray()
print(X_fitted)

# Encoding the Dependent Variable
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)
# One-hot encode the categorical columns with pandas (an alternative to the
# sklearn encoders above). The original defined this list and X_encoded twice,
# back to back, with identical contents -- do it once.
cat_columns = [
    "dep_name",
    "gender",
    "ethnicity",
    "race",
    "lang",
    "religion",
    "maritalstatus",
    "employstatus",
    "insurance_status",
    "arrivalmode",
    "previousdispo"]  # , "disposition"]

X_encoded = pd.get_dummies(X, prefix_sep="_", columns=cat_columns)

# Encoded copy of the full dataset (target column included) for the
# correlation analysis below.
dataset_encoded = pd.get_dummies(dataset, prefix_sep="_", columns=cat_columns)
# Univariate feature selection: score every encoded feature against the
# target with the chi-squared statistic via SelectKBest.
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
bestfeatures = SelectKBest(score_func=chi2, k=75)
fit = bestfeatures.fit(X_encoded,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X_encoded.columns)
# Concatenate names and scores into one frame for easier viewing.
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Feature','Score']  # naming the dataframe columns
print(featureScores.nlargest(75,'Score'))  # print the 75 best-scoring features

# Tree-based feature importance as a second opinion on feature relevance.
from sklearn.ensemble import ExtraTreesClassifier
model = ExtraTreesClassifier()
model.fit(X_encoded,y)
print(model.feature_importances_)  # built-in feature_importances_ of tree-based classifiers
# Plot the 75 most important features as a horizontal bar chart.
feat_importances = pd.Series(model.feature_importances_, index=X_encoded.columns)
feat_importances.nlargest(75).plot(kind='barh')
plt.show()
# Correlation matrix
import seaborn as sns
# Get pairwise correlations of the encoded dataset's features.
corrmat = dataset_encoded.corr()
top_corr_features = corrmat.index
# NOTE(review): this prints the first 75 column names in frame order, not the
# 75 most strongly correlated features -- confirm that is what was intended.
for i in range(0, 75):
    print(str(i + 1) + ". " + top_corr_features[i])
plt.figure(figsize=(5, 5))
# Plot heat map. The original indexed dataset[top_corr_features], but the
# dummy columns only exist in dataset_encoded, so that raises KeyError; it
# also recomputed .corr(). Reuse the correlation matrix computed above
# (top_corr_features covers every column, so corrmat is the same matrix).
g = sns.heatmap(corrmat, annot=True, cmap="RdYlGn")
# Pearson's Correlation
from numpy.random import randn
from numpy.random import seed
from scipy.stats import pearsonr
# Correlation between one feature ("bph") and the target; the p-value from
# pearsonr is discarded here.
corr, _ = pearsonr(X_encoded["bph"], y)
# Candidate feature names -- presumably pasted from the feature-importance /
# chi2 rankings printed above (TODO confirm). Not referenced again below;
# columns_pval is used for the modelling subsets instead.
columns = [
    "esi",
    "meds_cardiovascular",
    "age",
    "meds_gastrointestinal",
    "previousdispo_Admit",
    "previousdispo_Discharge",
    "meds_vitamins",
    "arrivalmode_ambulance",
    "employstatus_Retired",
    "meds_psychotherapeuticdrugs",
    "meds_thyroidpreps",
    "meds_antihyperglycemics",
    "meds_analgesics",
    "meds_antiplateletdrugs",
    "meds_antiasthmatics",
    "htn",
    "arrivalmode_Car",
    "n_edvisits",
    "n_surgeries",
    "meds_antiarthritics",
    "meds_elect/caloric/h2o",
    "race_White or Caucasian",
    "meds_diuretics",
    "insurance_status_Medicare",
    "meds_cardiacdrugs",
    "maritalstatus_Single",
    "dep_name_C",
    "meds_cnsdrugs",
    "dep_name_A",
    "arrivalmode_Walk-in",
    "meds_unclassifieddrugproducts",
    "diabmelnoc",
    "employstatus_Full Time",
    "cc_shortnessofbreath",
    "n_admissions",
    "ekg_count",
    "hyperlipidem",
    "religion_None",
    "cxr_count",
    "cc_abdominalpain",
    "insurance_status_Other",
    "dep_name_B",
    "employstatus_Not Employed",
    "unclassified",
    "religion_Catholic",
    "insurance_status_Self pay",
    "insurance_status_Medicaid",
    "cc_chestpain",
    "otherxr_count",
    "gender_Male",
    "gender_Female",
    "previousdispo_No previous dispo",
    "esophgealdx",
    "mooddisorders",
    "coronathero",
    "maritalstatus_Married",
    "insurance_status_Commercial",
    "anxietydisorders",
    "cc_other",
    "asthma",
    "otherct_count",
    "meds_anticoagulants",
    "meds_antibiotics",
    "meds_hormones",
    "meds_eentpreps",
    "otherus_count",
    "cc_alcoholintoxication",
    "headct_count",
    "religion_Christian",
    "meds_antihistamines",
    "proteinua_count",
    "proteinua_last",
    "ketonesua_count",
    "bloodua_count",
    "leukocytesua_count",
    "glucoseua_count"
]
# Features retained after p-value-based screening -- presumably the output of
# backwardElimination pasted back in (TODO confirm). Used below to build the
# modelling subsets (new_dataset, X_encoded2).
columns_pval = [
    "esi",
    "age",
    "adltrespfl",
    "alcoholrelateddisorders",
    "anemia",
    "asthma",
    "backproblem",
    "bonectcncr",
    "bph",
    "breastcancr",
    "breastdx",
    "brnchlngca",
    "cardiacanom",
    "dysrhythmia",
    "hyperlipidem",
    "intobstruct",
    "intracrninj",
    "kidnyrnlca",
    "leukemias",
    "maligneopls",
    "mycoses",
    "nephritis",
    "osteoarthros",
    "otdxkidney",
    "othliverdx",
    "otjointdx",
    "ovarycancer",
    "pancreasdx",
    "personalitydisorders",
    "septicemia",
    "substancerelateddisorders",
    "syncope",
    "thyroiddsor",
    "ulceratcol",
    "urinyorgca",
    "n_edvisits",
    "n_admissions",
    "ketonesua_last",
    "pregtestur_count",
    "bloodculture,routine_count",
    "cxr_count",
    "ekg_count",
    "meds_analgesics",
    "meds_anti-obesitydrugs",
    "meds_antiarthritics",
    "meds_antiasthmatics",
    "meds_antibiotics",
    "meds_anticoagulants",
    "meds_antihistamines",
    "meds_antihyperglycemics",
    "meds_antineoplastics",
    "meds_antiparkinsondrugs",
    "meds_antiplateletdrugs",
    "meds_antivirals",
    "meds_autonomicdrugs",
    "meds_blood",
    "meds_cardiacdrugs",
    "meds_cardiovascular",
    "meds_cnsdrugs",
    "meds_contraceptives",
    "meds_diuretics",
    "meds_hormones",
    "meds_immunosuppressants",
    "meds_pre-natalvitamins",
    "meds_psychotherapeuticdrugs",
    "meds_thyroidpreps",
    "meds_unclassifieddrugproducts",
    "meds_vitamins",
    "cc_abdominalpain",
    "cc_abdominalpainpregnant",
    "cc_abnormallab",
    "cc_alcoholintoxication",
    "cc_alcoholproblem",
    "cc_allergicreaction",
    "cc_alteredmentalstatus",
    "cc_anxiety",
    "cc_assaultvictim",
    "cc_bleeding/bruising",
    "cc_blurredvision",
    "cc_bodyfluidexposure",
    "cc_breathingdifficulty",
    "cc_chestpain",
    "cc_confusion",
    "cc_constipation",
    "cc_dehydration",
    "cc_drug/alcoholassessment",
    "cc_drugproblem",
    "cc_dyspnea",
    "cc_edema",
    "cc_elevatedbloodsugar-symptomatic",
    "cc_emesis",
    "cc_epistaxis",
    "cc_extremityweakness",
    "cc_fall",
    "cc_fatigue",
    "cc_fever",
    "cc_fever-75yearsorolder",
    "cc_fever-9weeksto74years",
    "cc_feverimmunocompromised",
    "cc_follow-upcellulitis",
    "cc_fulltrauma",
    "cc_gibleeding",
    "cc_giproblem",
    "cc_headache-newonsetornewsymptoms",
    "cc_hypertension",
    "cc_hypotension",
    "cc_legswelling",
    "cc_lethargy",
    "cc_lossofconsciousness",
    "cc_maleguproblem",
    "cc_modifiedtrauma",
    "cc_motorvehiclecrash",
    "cc_multiplefalls",
    "cc_neurologicproblem",
    "cc_other",
    "cc_psychiatricevaluation",
    "cc_psychoticsymptoms",
    "cc_seizure-newonset",
    "cc_shortnessofbreath",
    "cc_sicklecellpain",
    "cc_strokealert",
    "cc_suture/stapleremoval",
    "cc_syncope",
    "cc_unresponsive",
    "cc_urinaryretention",
    "cc_vaginalbleeding",
    "cc_weakness",
    "cc_withdrawal-alcohol",
    "cc_woundinfection",
    "dep_name_A",
    "dep_name_C",
    "gender_Female",
    "race_American Indian or Alaska Native",
    "lang_English",
    "lang_Other",
    "religion_Other",
    "religion_Unknown",
    "employstatus_Full Time",
    "employstatus_Part Time",
    "insurance_status_Commercial",
    "insurance_status_Medicaid",
    "insurance_status_Medicare",
    "insurance_status_Self pay",
    "arrivalmode_Car",
    "arrivalmode_Walk-in",
    "previousdispo_Admit",
    "previousdispo_Discharge",
]
# Subset of the encoded features selected by p-value screening.
# NOTE(review): new_dataset is built here but the split below uses the full
# X_encoded -- confirm which matrix was intended for training.
new_dataset = X_encoded[columns_pval].copy()  # [:100000]
# (The original's no-op `y = y` line -- a leftover from a commented-out
# subsampling slice -- has been removed.)

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y,
                                                    test_size=0.2)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)  # reuse training statistics; never refit on test

# Fitting Classifier to the Training set
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=8, metric="minkowski", p=2, n_jobs=-1)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix (function)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
from sklearn.metrics import accuracy_score
# The original computed the accuracy but discarded the result (a leftover
# notebook cell); print it so the script actually reports the score.
print(accuracy_score(y_test, y_pred))
# Feature Scaling (WHEN ALGORITHM IS BASED ON EUCLIDEAN DISTANCE!!!!)
# NOTE(review): if this runs after the K-NN section above, it refits the
# scaler on already-standardized data. Standardizing twice is close to a
# no-op, but confirm these sections are meant to run as independent "cells".
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

# Fitting Random Forest Classifier to the Training set
from sklearn.ensemble import RandomForestClassifier
# Hyperparameter values presumably taken from the randomized search below --
# TODO confirm.
classifier = RandomForestClassifier(n_estimators = 138, bootstrap = False, max_depth = 72, max_features = "sqrt", min_samples_leaf = 2, min_samples_split = 32, criterion = "entropy", n_jobs=-1)
classifier.fit(X_train, y_train)
# Predicting the Test set results
y_pred = classifier.predict(X_test)
# Making the Confusion Matrix (function)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
# Randomized hyperparameter search for the Random Forest.
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=50, stop=2000, num=35)]
# Number of features to consider at every split
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 250, num=20)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [5, 10, 15, 20, 25]
# Minimum number of samples required at each leaf node
min_samples_leaf = [5, 10, 15, 20, 25, 30, 50]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)
# (The original pasted an example grid dict here as a bare, no-op expression
# -- removed as dead code.)

# Use the random grid to search for best hyperparameters.
# First create the base model to tune.
rf = RandomForestClassifier()
# Random search of parameters, using 3-fold cross validation; sample 5
# combinations and use all available cores.
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                               n_iter=5, cv=3, verbose=2, random_state=42,
                               n_jobs=-1)
# Fit the random search model
rf_random.fit(X_train, y_train)
# Subsample for the (slow) OLS-based backward elimination.
X_encoded2 = X_encoded[columns_pval].copy()[:56000]
y2 = y[:56000]

# BACKWARD ELIMINATION
import statsmodels.api as sm
SL = 0.005  # significance level: features with p-value above this are dropped


def backwardElimination(x, sl):
    """Iteratively drop (in place) the feature with the highest OLS p-value
    until every remaining p-value is <= sl, then return x.

    Fits sm.OLS against the module-level target y2 on each pass and prints
    each dropped column name and its p-value.
    """
    numVars = len(x.columns)
    for i in range(0, numVars):
        classifier_OLS = sm.OLS(y2, x).fit()
        pvalues = classifier_OLS.pvalues
        maxVar = pvalues.max()
        # Once every p-value is within the threshold there is nothing left
        # to drop; stop refitting (the original kept looping to numVars).
        if maxVar <= sl:
            break
        # pvalues is a Series indexed by column name; the original indexed
        # it with a positional integer (pvalues[j]), whose positional
        # fallback was removed in pandas 2.0. idxmax() gives the label of
        # the worst feature directly.
        worst = pvalues.idxmax()
        print(worst)
        print(maxVar)
        print("---------------------")
        x.drop(worst, axis=1, inplace=True)
    classifier_OLS.summary()
    return x
- """
- def backwardElimination(x, SL):
- numVars = len(x.columns)
- temp = np.zeros((56000,644)).astype(int)
- for i in range(0, numVars):
- classifier_OLS = sm.OLS(y2, x).fit()
- maxVar = max(classifier_OLS.pvalues)
- adjR_before = classifier_OLS.rsquared_adj.astype(float)
- if maxVar > SL:
- for j in range(0, numVars - i):
- if (classifier_OLS.pvalues[j].astype(float) == maxVar):
- temp = x
- x.drop(x.columns[j], axis=1, inplace=True)
- # y2 = np.delete(y2, j)
- tmp_classifier = sm.OLS(y2, x).fit()
- adjR_after = tmp_classifier.rsquared_adj.astype(float)
- print(len(temp.columns))
- print(len(x.columns))
- print(str(adjR_before) + " : " + str(adjR_after))
- if (adjR_before >= adjR_after):
- x_rollback = np.hstack((x, temp.iloc[:, 0: j]))
- x_rollback = np.delete(x_rollback, j, 1)
- return x_rollback
- else:
- print("-----")
- continue
- classifier_OLS.summary()
- return x
- """
- backwardElimination(X_encoded2, SL)
- for row in X_encoded2.columns:
- print("\"" + row + "\",")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement