# Functions
def get_dataset_count_of_missing_values(dataset):
    # Build a table of per-column missing-value counts and percentages,
    # keep only columns that actually have missing values, sorted descending.
    mis_val = dataset.isnull().sum()
    mis_val_percent = 100 * dataset.isnull().sum() / len(dataset)
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    mis_val_table_ren_columns = mis_val_table.rename(
        columns={0: 'Missing Values', 1: '% of Total Values'})
    mis_val_table_ren_columns = mis_val_table_ren_columns[
        mis_val_table_ren_columns.iloc[:, 1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
    print("Your selected dataframe has " + str(dataset.shape[1]) + " columns.\n"
          "There are " + str(mis_val_table_ren_columns.shape[0]) +
          " columns that have missing values.")
    return mis_val_table_ren_columns

def remove_columns_with_missing_values_inplace(dataset, missing_values_table, threshold):
    # Drop every column whose absolute missing-value count exceeds the threshold.
    for index, row in missing_values_table.iterrows():
        if row['Missing Values'] > threshold:
            print(row.name)
            dataset.drop(row.name, axis=1, inplace=True)

# Libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

pd.set_option('display.max_rows', 2500)
pd.set_option('display.max_columns', 2500)
pd.set_option('display.width', 2000)

# Importing the dataset
dataset = pd.read_csv('patients_without_empty_rows.csv')

# Drop the first column (the index written by a previous to_csv)
dataset.drop(dataset.columns[[0]], axis=1, inplace=True)
#dataset.to_csv("patients_without_empty_rows.csv")
#print(dataset.isna().sum())

# Handle missing data (already applied when producing the CSV above)
#missing_values_table = get_dataset_count_of_missing_values(dataset)
#remove_columns_with_missing_values_inplace(dataset, missing_values_table, threshold = 160000)
#print(missing_values_table)
#dataset.to_csv("patients_without_nan.csv")
#dataset.dropna(inplace=True)
#dataset.to_csv("patients_without_empty_rows.csv")

X = dataset.iloc[:, dataset.columns != "disposition"]
y = dataset.iloc[:, 11]  # the "disposition" column in this layout

# Encoding categorical data
# Encoding the Independent Variable
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X = LabelEncoder()
X.iloc[:, 0] = labelencoder_X.fit_transform(X.iloc[:, 0])

# Integer-encode the remaining categorical columns (positions 3 to 12)
for i in range(3, 13):
    labelencoder_X = LabelEncoder()
    X.iloc[:, i] = labelencoder_X.fit_transform(X.iloc[:, i])

# Note: categorical_features and feature_indices_ need scikit-learn < 0.22;
# sparse=False was dropped here because a dense array has no .toarray()
onehotencoder = OneHotEncoder(categorical_features=[0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
X_fitted = onehotencoder.fit_transform(X).toarray()
print(X_fitted)
print(onehotencoder.feature_indices_)

# Encoding the Dependent Variable
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)
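
# A minimal sketch of the modern equivalent (assumes scikit-learn >= 0.22,
# where categorical_features no longer exists): ColumnTransformer routes the
# categorical column positions through OneHotEncoder and passes the rest through.
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer(
    [("onehot", OneHotEncoder(handle_unknown="ignore"),
      [0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])],
    remainder="passthrough")
#X_fitted_ct = ct.fit_transform(X)  # sparse by default; densify only if needed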

cat_columns = [
    "dep_name",
    "gender",
    "ethnicity",
    "race",
    "lang",
    "religion",
    "maritalstatus",
    "employstatus",
    "insurance_status",
    "arrivalmode",
    "previousdispo"]  #, "disposition"]

X_encoded = pd.get_dummies(X, prefix_sep="_",
                           columns=cat_columns)

dataset_encoded = pd.get_dummies(dataset, prefix_sep="_",
                                 columns=cat_columns)

# Apply the SelectKBest class to rank features with the chi-squared test
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

bestfeatures = SelectKBest(score_func=chi2, k=75)
fit = bestfeatures.fit(X_encoded, y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X_encoded.columns)
# Concatenate the two dataframes for better visualization
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Feature', 'Score']  # naming the dataframe columns
print(featureScores.nlargest(75, 'Score'))  # print the 75 best features
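
# A minimal sketch of actually keeping only the k selected columns, via the
# boolean support mask that SelectKBest exposes:
selected_mask = bestfeatures.get_support()
X_kbest = X_encoded.loc[:, selected_mask]
print(X_kbest.columns.tolist())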

# Tree-based feature importance with ExtraTreesClassifier
from sklearn.ensemble import ExtraTreesClassifier
model = ExtraTreesClassifier()
model.fit(X_encoded, y)
print(model.feature_importances_)  # built-in feature_importances_ of tree-based classifiers
# Plot the feature importances for better visualization
feat_importances = pd.Series(model.feature_importances_, index=X_encoded.columns)
feat_importances.nlargest(75).plot(kind='barh')
plt.show()

# Correlation matrix
import seaborn as sns
# Get the correlations of each feature in the encoded dataset
corrmat = dataset_encoded.corr()

# Note: corrmat.index is just the encoded column order, not a ranking
top_corr_features = corrmat.index
for i in range(0, 75):
    print(str(i + 1) + ". " + top_corr_features[i])

plt.figure(figsize=(5, 5))  # a larger figsize may be needed for ~75 features
# Plot heat map (use dataset_encoded: the dummy columns do not exist in dataset)
g = sns.heatmap(dataset_encoded[top_corr_features].corr(), annot=True, cmap="RdYlGn")

# Pearson's correlation between a single feature and the target
from scipy.stats import pearsonr
corr, _ = pearsonr(X_encoded["bph"], y)
print(corr)

# Feature subset from the rankings above (defined here but not used below)
columns = [
    "esi",
    "meds_cardiovascular",
    "age",
    "meds_gastrointestinal",
    "previousdispo_Admit",
    "previousdispo_Discharge",
    "meds_vitamins",
    "arrivalmode_ambulance",
    "employstatus_Retired",
    "meds_psychotherapeuticdrugs",
    "meds_thyroidpreps",
    "meds_antihyperglycemics",
    "meds_analgesics",
    "meds_antiplateletdrugs",
    "meds_antiasthmatics",
    "htn",
    "arrivalmode_Car",
    "n_edvisits",
    "n_surgeries",
    "meds_antiarthritics",
    "meds_elect/caloric/h2o",
    "race_White or Caucasian",
    "meds_diuretics",
    "insurance_status_Medicare",
    "meds_cardiacdrugs",
    "maritalstatus_Single",
    "dep_name_C",
    "meds_cnsdrugs",
    "dep_name_A",
    "arrivalmode_Walk-in",
    "meds_unclassifieddrugproducts",
    "diabmelnoc",
    "employstatus_Full Time",
    "cc_shortnessofbreath",
    "n_admissions",
    "ekg_count",
    "hyperlipidem",
    "religion_None",
    "cxr_count",
    "cc_abdominalpain",
    "insurance_status_Other",
    "dep_name_B",
    "employstatus_Not Employed",
    "unclassified",
    "religion_Catholic",
    "insurance_status_Self pay",
    "insurance_status_Medicaid",
    "cc_chestpain",
    "otherxr_count",
    "gender_Male",
    "gender_Female",
    "previousdispo_No previous dispo",
    "esophgealdx",
    "mooddisorders",
    "coronathero",
    "maritalstatus_Married",
    "insurance_status_Commercial",
    "anxietydisorders",
    "cc_other",
    "asthma",
    "otherct_count",
    "meds_anticoagulants",
    "meds_antibiotics",
    "meds_hormones",
    "meds_eentpreps",
    "otherus_count",
    "cc_alcoholintoxication",
    "headct_count",
    "religion_Christian",
    "meds_antihistamines",
    "proteinua_count",
    "proteinua_last",
    "ketonesua_count",
    "bloodua_count",
    "leukocytesua_count",
    "glucoseua_count"
]

# Feature subset used below (per its name, screened by p-value)
columns_pval = [
    "esi",
    "age",
    "adltrespfl",
    "alcoholrelateddisorders",
    "anemia",
    "asthma",
    "backproblem",
    "bonectcncr",
    "bph",
    "breastcancr",
    "breastdx",
    "brnchlngca",
    "cardiacanom",
    "dysrhythmia",
    "hyperlipidem",
    "intobstruct",
    "intracrninj",
    "kidnyrnlca",
    "leukemias",
    "maligneopls",
    "mycoses",
    "nephritis",
    "osteoarthros",
    "otdxkidney",
    "othliverdx",
    "otjointdx",
    "ovarycancer",
    "pancreasdx",
    "personalitydisorders",
    "septicemia",
    "substancerelateddisorders",
    "syncope",
    "thyroiddsor",
    "ulceratcol",
    "urinyorgca",
    "n_edvisits",
    "n_admissions",
    "ketonesua_last",
    "pregtestur_count",
    "bloodculture,routine_count",
    "cxr_count",
    "ekg_count",
    "meds_analgesics",
    "meds_anti-obesitydrugs",
    "meds_antiarthritics",
    "meds_antiasthmatics",
    "meds_antibiotics",
    "meds_anticoagulants",
    "meds_antihistamines",
    "meds_antihyperglycemics",
    "meds_antineoplastics",
    "meds_antiparkinsondrugs",
    "meds_antiplateletdrugs",
    "meds_antivirals",
    "meds_autonomicdrugs",
    "meds_blood",
    "meds_cardiacdrugs",
    "meds_cardiovascular",
    "meds_cnsdrugs",
    "meds_contraceptives",
    "meds_diuretics",
    "meds_hormones",
    "meds_immunosuppressants",
    "meds_pre-natalvitamins",
    "meds_psychotherapeuticdrugs",
    "meds_thyroidpreps",
    "meds_unclassifieddrugproducts",
    "meds_vitamins",
    "cc_abdominalpain",
    "cc_abdominalpainpregnant",
    "cc_abnormallab",
    "cc_alcoholintoxication",
    "cc_alcoholproblem",
    "cc_allergicreaction",
    "cc_alteredmentalstatus",
    "cc_anxiety",
    "cc_assaultvictim",
    "cc_bleeding/bruising",
    "cc_blurredvision",
    "cc_bodyfluidexposure",
    "cc_breathingdifficulty",
    "cc_chestpain",
    "cc_confusion",
    "cc_constipation",
    "cc_dehydration",
    "cc_drug/alcoholassessment",
    "cc_drugproblem",
    "cc_dyspnea",
    "cc_edema",
    "cc_elevatedbloodsugar-symptomatic",
    "cc_emesis",
    "cc_epistaxis",
    "cc_extremityweakness",
    "cc_fall",
    "cc_fatigue",
    "cc_fever",
    "cc_fever-75yearsorolder",
    "cc_fever-9weeksto74years",
    "cc_feverimmunocompromised",
    "cc_follow-upcellulitis",
    "cc_fulltrauma",
    "cc_gibleeding",
    "cc_giproblem",
    "cc_headache-newonsetornewsymptoms",
    "cc_hypertension",
    "cc_hypotension",
    "cc_legswelling",
    "cc_lethargy",
    "cc_lossofconsciousness",
    "cc_maleguproblem",
    "cc_modifiedtrauma",
    "cc_motorvehiclecrash",
    "cc_multiplefalls",
    "cc_neurologicproblem",
    "cc_other",
    "cc_psychiatricevaluation",
    "cc_psychoticsymptoms",
    "cc_seizure-newonset",
    "cc_shortnessofbreath",
    "cc_sicklecellpain",
    "cc_strokealert",
    "cc_suture/stapleremoval",
    "cc_syncope",
    "cc_unresponsive",
    "cc_urinaryretention",
    "cc_vaginalbleeding",
    "cc_weakness",
    "cc_withdrawal-alcohol",
    "cc_woundinfection",
    "dep_name_A",
    "dep_name_C",
    "gender_Female",
    "race_American Indian or Alaska Native",
    "lang_English",
    "lang_Other",
    "religion_Other",
    "religion_Unknown",
    "employstatus_Full Time",
    "employstatus_Part Time",
    "insurance_status_Commercial",
    "insurance_status_Medicaid",
    "insurance_status_Medicare",
    "insurance_status_Self pay",
    "arrivalmode_Car",
    "arrivalmode_Walk-in",
    "previousdispo_Admit",
    "previousdispo_Discharge",
]

new_dataset = X_encoded[columns_pval].copy()  # append [:100000] (and slice y to match) to subsample
# Splitting the dataset into the Training set and Test set
# (note: the split uses the full X_encoded, not new_dataset)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y,
                                                    test_size=0.2)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

# Fitting the K-NN classifier to the Training set
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=8, metric="minkowski", p=2, n_jobs=-1)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))
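
# A quick sketch going beyond raw accuracy: per-class precision and recall,
# which matter if the disposition classes are imbalanced (class names are
# recovered from labelencoder_y fitted above).
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred,
                            target_names=[str(c) for c in labelencoder_y.classes_]))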

# Feature Scaling (needed when the algorithm is based on Euclidean distance;
# tree ensembles like Random Forest do not strictly need it)
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

# Fitting the Random Forest classifier to the Training set
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=138, bootstrap=False, max_depth=72,
                                    max_features="sqrt", min_samples_leaf=2,
                                    min_samples_split=32, criterion="entropy", n_jobs=-1)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Randomized hyperparameter search for the Random Forest
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in the random forest
n_estimators = [int(x) for x in np.linspace(start=50, stop=2000, num=35)]
# Number of features to consider at every split
max_features = ['sqrt', 'log2']
# Maximum number of levels in a tree
max_depth = [int(x) for x in np.linspace(10, 250, num=20)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [5, 10, 15, 20, 25]
# Minimum number of samples required at each leaf node
min_samples_leaf = [5, 10, 15, 20, 25, 30, 50]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

# Use the random grid to search for the best hyperparameters.
# First create the base model to tune.
rf = RandomForestClassifier()
# Random search of parameters: 3-fold cross-validation over 5 sampled
# combinations, using all available cores
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                               n_iter=5, cv=3, verbose=2, random_state=42, n_jobs=-1)
# Fit the random search model
rf_random.fit(X_train, y_train)
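
# A short follow-up sketch: inspect the winning configuration and score the
# refitted best estimator on the held-out test set.
print(rf_random.best_params_)
best_rf = rf_random.best_estimator_
print(best_rf.score(X_test, y_test))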

X_encoded2 = X_encoded[columns_pval].copy()[:56000]
y2 = y[:56000]

# BACKWARD ELIMINATION
import statsmodels.api as sm
SL = 0.005  # significance level: drop features whose p-value exceeds this

def backwardElimination(x, sl):
    # Repeatedly fit an OLS model (a linear-probability approximation, since
    # y2 is a binary label) and drop the feature with the highest p-value
    # whenever that p-value exceeds sl.
    numVars = len(x.columns)
    for i in range(0, numVars):
        classifier_OLS = sm.OLS(y2, x).fit()
        maxVar = max(classifier_OLS.pvalues)
        if maxVar > sl:
            for j in range(0, numVars - i):
                if classifier_OLS.pvalues.iloc[j] == maxVar:
                    print(x.columns[j])
                    print(classifier_OLS.pvalues.iloc[j])
                    print("---------------------")
                    x.drop(x.columns[j], axis=1, inplace=True)
    print(classifier_OLS.summary())
    return x
  536. """
  537. def backwardElimination(x, SL):
  538. numVars = len(x.columns)
  539. temp = np.zeros((56000,644)).astype(int)
  540. for i in range(0, numVars):
  541. classifier_OLS = sm.OLS(y2, x).fit()
  542. maxVar = max(classifier_OLS.pvalues)
  543. adjR_before = classifier_OLS.rsquared_adj.astype(float)
  544. if maxVar > SL:
  545. for j in range(0, numVars - i):
  546. if (classifier_OLS.pvalues[j].astype(float) == maxVar):
  547. temp = x
  548. x.drop(x.columns[j], axis=1, inplace=True)
  549. # y2 = np.delete(y2, j)
  550. tmp_classifier = sm.OLS(y2, x).fit()
  551. adjR_after = tmp_classifier.rsquared_adj.astype(float)
  552. print(len(temp.columns))
  553. print(len(x.columns))
  554. print(str(adjR_before) + " : " + str(adjR_after))
  555. if (adjR_before >= adjR_after):
  556. x_rollback = np.hstack((x, temp.iloc[:, 0: j]))
  557. x_rollback = np.delete(x_rollback, j, 1)
  558. return x_rollback
  559. else:
  560. print("-----")
  561. continue
  562. classifier_OLS.summary()
  563. return x
  564. """

backwardElimination(X_encoded2, SL)

# Print the surviving columns as a quoted, comma-separated list
for row in X_encoded2.columns:
    print("\"" + row + "\",")
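
# Since the target is binary, a logistic model arguably suits this elimination
# loop better than OLS. A minimal sketch of the same p-value check with
# statsmodels' Logit (assumes y2 is 0/1 and the fit converges on these features):
logit_fit = sm.Logit(y2, X_encoded2).fit(disp=0)
print(logit_fit.pvalues.sort_values(ascending=False).head())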