Advertisement
Guest User

Untitled

a guest
Jul 18th, 2019
352
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 6.31 KB | None | 0 0
  1. [38]:
  2.  
  3. import pandas as pd
  4. import numpy as np
  5. import matplotlib.pyplot as plt
  6. import seaborn as sns
  7. from sklearn.preprocessing import StandardScaler
  8. from sklearn.model_selection import train_test_split
  9. from sklearn.model_selection import cross_val_score
  10. from sklearn.ensemble import RandomForestClassifier
  11. from sklearn.naive_bayes import GaussianNB
  12. from sklearn.tree import DecisionTreeClassifier
  13. from sklearn.svm import SVC
  14. from sklearn.preprocessing import LabelEncoder
  15. import warnings
  16. warnings.simplefilter(action='ignore', category=FutureWarning)
  17. ####Read the stroke datafile####
  18.  
  19. [39]:
  20.  
  21. stroke_pred=pd.read_csv('/Users/lakuu2/Downloads/stroke_data/train_2.csv')  # NOTE: absolute, machine-specific path; point it at your local copy of train_2.csv
  22. [40]:
  23.  
  24. stroke_pred.head()
  25. [40]:
  26. id gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_level bmi smoking_status stroke
  27. 0 30669 Male 3.0 0 0 No children Rural 95.12 18.0 NaN 0
  28. 1 30468 Male 58.0 1 0 Yes Private Urban 87.96 39.2 never smoked 0
  29. 2 16523 Female 8.0 0 0 No Private Urban 110.89 17.6 NaN 0
  30. 3 56543 Female 70.0 0 0 Yes Private Rural 69.04 35.9 formerly smoked 0
  31. 4 46136 Male 14.0 0 0 No Never_worked Rural 161.28 19.1 NaN 0
  32. ####Checking for missing values####
  33.  
  34. [41]:
  35.  
  36. stroke_pred.isna().sum()
  37. [41]:
  38. id 0
  39. gender 0
  40. age 0
  41. hypertension 0
  42. heart_disease 0
  43. ever_married 0
  44. work_type 0
  45. Residence_type 0
  46. avg_glucose_level 0
  47. bmi 1462
  48. smoking_status 13292
  49. stroke 0
  50. dtype: int64
  51. ####Replace missing "bmi" values with the column mean####
  52.  
  53. [42]:
  54.  
  55. stroke_pred['bmi'].fillna(stroke_pred['bmi'].mean(), inplace=True)
  56. [43]:
  57.  
  58. stroke_pred.isna().sum()
  59. [43]:
  60. id 0
  61. gender 0
  62. age 0
  63. hypertension 0
  64. heart_disease 0
  65. ever_married 0
  66. work_type 0
  67. Residence_type 0
  68. avg_glucose_level 0
  69. bmi 0
  70. smoking_status 13292
  71. stroke 0
  72. dtype: int64
  73. ####Drop the rows that still contain missing values (the remaining NaNs are all in "smoking_status")####
  74.  
  75. [44]:
  76.  
  77. stroke_pred.dropna(inplace=True)
  78. [45]:
  79.  
  80. stroke_pred.shape
  81. [45]:
  82. (30108, 12)
  83. [46]:
  84.  
  85. stroke_pred.isna().sum()
  86. [46]:
  87. id 0
  88. gender 0
  89. age 0
  90. hypertension 0
  91. heart_disease 0
  92. ever_married 0
  93. work_type 0
  94. Residence_type 0
  95. avg_glucose_level 0
  96. bmi 0
  97. smoking_status 0
  98. stroke 0
  99. dtype: int64
  100. [47]:
  101.  
  102. stroke_pred.info()
  103. <class 'pandas.core.frame.DataFrame'>
  104. Int64Index: 30108 entries, 1 to 43399
  105. Data columns (total 12 columns):
  106. id 30108 non-null int64
  107. gender 30108 non-null object
  108. age 30108 non-null float64
  109. hypertension 30108 non-null int64
  110. heart_disease 30108 non-null int64
  111. ever_married 30108 non-null object
  112. work_type 30108 non-null object
  113. Residence_type 30108 non-null object
  114. avg_glucose_level 30108 non-null float64
  115. bmi 30108 non-null float64
  116. smoking_status 30108 non-null object
  117. stroke 30108 non-null int64
  118. dtypes: float64(3), int64(4), object(5)
  119. memory usage: 3.0+ MB
  120. [48]:
  121.  
  122. stroke_pred.head(6)
  123. [48]:
  124. id gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_level bmi smoking_status stroke
  125. 1 30468 Male 58.0 1 0 Yes Private Urban 87.96 39.2 never smoked 0
  126. 3 56543 Female 70.0 0 0 Yes Private Rural 69.04 35.9 formerly smoked 0
  127. 6 52800 Female 52.0 0 0 Yes Private Urban 77.59 17.7 formerly smoked 0
  128. 7 41413 Female 75.0 0 1 Yes Self-employed Rural 243.53 27.0 never smoked 0
  129. 8 15266 Female 32.0 0 0 Yes Private Rural 77.67 32.3 smokes 0
  130. 9 28674 Female 74.0 1 0 Yes Self-employed Urban 205.84 54.6 never smoked 0
  131. ####Perform label encoding to convert text values to numerical values####
  132.  
  133. [49]:
  134.  
  135. lb_make = LabelEncoder()  # a single encoder object is reused; each fit_transform call below fits it again on that column
  136. stroke_pred["gender"] = lb_make.fit_transform(stroke_pred["gender"])
  137. stroke_pred["ever_married"] = lb_make.fit_transform(stroke_pred["ever_married"])
  138. stroke_pred["work_type"] = lb_make.fit_transform(stroke_pred["work_type"])
  139. stroke_pred["Residence_type"] = lb_make.fit_transform(stroke_pred["Residence_type"])
  140. stroke_pred["smoking_status"] = lb_make.fit_transform(stroke_pred["smoking_status"])  # NOTE(review): the per-column mappings are not kept; presumably lb_make only reflects this last column afterwards
  141. [50]:
  142.  
  143. stroke_pred.head()
  144. [50]:
  145. id gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_level bmi smoking_status stroke
  146. 1 30468 1 58.0 1 0 1 2 1 87.96 39.2 1 0
  147. 3 56543 0 70.0 0 0 1 2 0 69.04 35.9 0 0
  148. 6 52800 0 52.0 0 0 1 2 1 77.59 17.7 0 0
  149. 7 41413 0 75.0 0 1 1 3 0 243.53 27.0 1 0
  150. 8 15266 0 32.0 0 0 1 2 0 77.67 32.3 2 0
  151. ####Cast the label-encoded columns to integer type####
  152.  
  153. [51]:
  154.  
  155. stroke_pred["gender"]= stroke_pred["gender"].astype(int)  # the five columns cast here are exactly the ones label-encoded in the previous cell
  156. stroke_pred["ever_married"]= stroke_pred["ever_married"].astype(int)
  157. stroke_pred["work_type"]= stroke_pred["work_type"].astype(int)
  158. stroke_pred["Residence_type"]= stroke_pred["Residence_type"].astype(int)
  159. stroke_pred["smoking_status"]= stroke_pred["smoking_status"].astype(int)
  160. ####Standardize the features with StandardScaler and use "stroke" as the target variable####
  161.  
  162. [52]:
  163.  
  164. X = stroke_pred.drop(['stroke','id'], axis=1)  # features: every column except the target and the id column
  165. X=StandardScaler().fit_transform(X)  # NOTE(review): the scaler is fit on all rows before the train/test split, so test rows influence the scaling statistics
  166. y = stroke_pred.stroke  # target column
  167. /anaconda3/lib/python3.7/site-packages/sklearn/preprocessing/data.py:625: DataConversionWarning: Data with input dtype int64, float64 were all converted to float64 by StandardScaler.
  168. return self.partial_fit(X, y)
  169. /anaconda3/lib/python3.7/site-packages/sklearn/base.py:462: DataConversionWarning: Data with input dtype int64, float64 were all converted to float64 by StandardScaler.
  170. return self.fit(X, **fit_params).transform(X)
  171. ####Split the dataset into train and test data and set the random state####
  172.  
  173. [53]:
  174.  
  175. X_train, X_test, Y_train, Y_test = train_test_split(X,y, test_size=0.3, random_state=21)  # 30% of the rows go to the test split; fixed random_state makes the split repeatable
  176. [54]:
  177.  
  178. NB = GaussianNB()  # the four candidate classifiers, all with default settings except where noted
  179. DT = DecisionTreeClassifier()  # no random_state is passed
  180. RF = RandomForestClassifier()  # no random_state is passed
  181. SV = SVC(probability=True)  # probability=True is set, though only cross_val_score is called on the models in the cell that follows
  182. ####Cross-validate each model and rank the models by mean accuracy####
  183.  
  184. [55]:
  185.  
  186. Model_list=[NB, DT, RF, SV]
  187. Accuracy=[]
  188. for model in Model_list:
  189. score = cross_val_score(model, X_test, Y_test, cv=2)
  190. avg_score = np.mean(score)
  191. Accuracy.append(avg_score)
  192. Model_name= ['NB', 'DT', 'RF', 'SV']
  193. Model_performace= pd.DataFrame({'Model': Model_name, 'Accuracy': Accuracy})
  194. Model_performace = Model_performace.sort_values(by='Accuracy', ascending=False)
  195. Model_performace
  196. [55]:
  197. Model Accuracy
  198. 3 SV 0.979741
  199. 2 RF 0.979520
  200. 1 DT 0.953061
  201. 0 NB 0.919185
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement