Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- [38]:
- import pandas as pd
- import numpy as np
- import matplotlib.pyplot as plt
- import seaborn as sns
- from sklearn.preprocessing import StandardScaler
- from sklearn.model_selection import train_test_split
- from sklearn.model_selection import cross_val_score
- from sklearn.ensemble import RandomForestClassifier
- from sklearn.naive_bayes import GaussianNB
- from sklearn.tree import DecisionTreeClassifier
- from sklearn.svm import SVC
- from sklearn.preprocessing import LabelEncoder
-
- import warnings
- warnings.simplefilter(action='ignore', category=FutureWarning)
- ####Read the stroke datafile####
- [39]:
- stroke_pred=pd.read_csv('/Users/lakuu2/Downloads/stroke_data/train_2.csv')
- [40]:
- stroke_pred.head()
- [40]:
- id gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_level bmi smoking_status stroke
- 0 30669 Male 3.0 0 0 No children Rural 95.12 18.0 NaN 0
- 1 30468 Male 58.0 1 0 Yes Private Urban 87.96 39.2 never smoked 0
- 2 16523 Female 8.0 0 0 No Private Urban 110.89 17.6 NaN 0
- 3 56543 Female 70.0 0 0 Yes Private Rural 69.04 35.9 formerly smoked 0
- 4 46136 Male 14.0 0 0 No Never_worked Rural 161.28 19.1 NaN 0
- ####Checking for missing values####
- [41]:
- stroke_pred.isna().sum()
- [41]:
- id 0
- gender 0
- age 0
- hypertension 0
- heart_disease 0
- ever_married 0
- work_type 0
- Residence_type 0
- avg_glucose_level 0
- bmi 1462
- smoking_status 13292
- stroke 0
- dtype: int64
- ####Replace missing "bmi" values with the column mean####
- [42]:
- stroke_pred['bmi'].fillna(stroke_pred['bmi'].mean(), inplace=True)
- [43]:
- stroke_pred.isna().sum()
- [43]:
- id 0
- gender 0
- age 0
- hypertension 0
- heart_disease 0
- ever_married 0
- work_type 0
- Residence_type 0
- avg_glucose_level 0
- bmi 0
- smoking_status 13292
- stroke 0
- dtype: int64
- ####Drop the remaining rows that still contain missing values####
- [44]:
- stroke_pred.dropna(inplace=True)
- [45]:
- stroke_pred.shape
- [45]:
- (30108, 12)
- [46]:
- stroke_pred.isna().sum()
- [46]:
- id 0
- gender 0
- age 0
- hypertension 0
- heart_disease 0
- ever_married 0
- work_type 0
- Residence_type 0
- avg_glucose_level 0
- bmi 0
- smoking_status 0
- stroke 0
- dtype: int64
- [47]:
- stroke_pred.info()
- <class 'pandas.core.frame.DataFrame'>
- Int64Index: 30108 entries, 1 to 43399
- Data columns (total 12 columns):
- id 30108 non-null int64
- gender 30108 non-null object
- age 30108 non-null float64
- hypertension 30108 non-null int64
- heart_disease 30108 non-null int64
- ever_married 30108 non-null object
- work_type 30108 non-null object
- Residence_type 30108 non-null object
- avg_glucose_level 30108 non-null float64
- bmi 30108 non-null float64
- smoking_status 30108 non-null object
- stroke 30108 non-null int64
- dtypes: float64(3), int64(4), object(5)
- memory usage: 3.0+ MB
- [48]:
- stroke_pred.head(6)
- [48]:
- id gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_level bmi smoking_status stroke
- 1 30468 Male 58.0 1 0 Yes Private Urban 87.96 39.2 never smoked 0
- 3 56543 Female 70.0 0 0 Yes Private Rural 69.04 35.9 formerly smoked 0
- 6 52800 Female 52.0 0 0 Yes Private Urban 77.59 17.7 formerly smoked 0
- 7 41413 Female 75.0 0 1 Yes Self-employed Rural 243.53 27.0 never smoked 0
- 8 15266 Female 32.0 0 0 Yes Private Rural 77.67 32.3 smokes 0
- 9 28674 Female 74.0 1 0 Yes Self-employed Urban 205.84 54.6 never smoked 0
- ####Perform label encoding to convert text values to numerical values####
- [49]:
- lb_make = LabelEncoder()
- stroke_pred["gender"] = lb_make.fit_transform(stroke_pred["gender"])
- stroke_pred["ever_married"] = lb_make.fit_transform(stroke_pred["ever_married"])
- stroke_pred["work_type"] = lb_make.fit_transform(stroke_pred["work_type"])
- stroke_pred["Residence_type"] = lb_make.fit_transform(stroke_pred["Residence_type"])
- stroke_pred["smoking_status"] = lb_make.fit_transform(stroke_pred["smoking_status"])
- [50]:
- stroke_pred.head()
- [50]:
- id gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_level bmi smoking_status stroke
- 1 30468 1 58.0 1 0 1 2 1 87.96 39.2 1 0
- 3 56543 0 70.0 0 0 1 2 0 69.04 35.9 0 0
- 6 52800 0 52.0 0 0 1 2 1 77.59 17.7 0 0
- 7 41413 0 75.0 0 1 1 3 0 243.53 27.0 1 0
- 8 15266 0 32.0 0 0 1 2 0 77.67 32.3 2 0
- ####Convert data type to integer or float####
- [51]:
- stroke_pred["gender"]= stroke_pred["gender"].astype(int)
- stroke_pred["ever_married"]= stroke_pred["ever_married"].astype(int)
- stroke_pred["work_type"]= stroke_pred["work_type"].astype(int)
- stroke_pred["Residence_type"]= stroke_pred["Residence_type"].astype(int)
- stroke_pred["smoking_status"]= stroke_pred["smoking_status"].astype(int)
- ####Standardize the features with StandardScaler and use "stroke" as the target variable####
- [52]:
- # Feature matrix: every column except the target ('stroke') and the row 'id'.
- X = stroke_pred.drop(['stroke','id'], axis=1)
- # Standardize each feature column; X is rebound to the scaler's output.
- # NOTE(review): the scaler is fitted on all rows before train_test_split is
- # called, so statistics from the future test rows influence the scaling.
- # Consider fitting the scaler on the training split only.
- X=StandardScaler().fit_transform(X)
-
- # Target vector: the 'stroke' column.
- y = stroke_pred.stroke
- /anaconda3/lib/python3.7/site-packages/sklearn/preprocessing/data.py:625: DataConversionWarning: Data with input dtype int64, float64 were all converted to float64 by StandardScaler.
- return self.partial_fit(X, y)
- /anaconda3/lib/python3.7/site-packages/sklearn/base.py:462: DataConversionWarning: Data with input dtype int64, float64 were all converted to float64 by StandardScaler.
- return self.fit(X, **fit_params).transform(X)
- ####Split dataset into train and test data and set random state####
- [53]:
- X_train, X_test, Y_train, Y_test = train_test_split(X,y, test_size=0.3, random_state=21)
- [54]:
- NB = GaussianNB()
- DT = DecisionTreeClassifier()
- RF = RandomForestClassifier()
- SV = SVC(probability=True)
- ####Iterate through the models####
- [55]:
- Model_list=[NB, DT, RF, SV]
-
- Accuracy=[]
-
- for model in Model_list:
- score = cross_val_score(model, X_test, Y_test, cv=2)
- avg_score = np.mean(score)
- Accuracy.append(avg_score)
-
- Model_name= ['NB', 'DT', 'RF', 'SV']
- Model_performace= pd.DataFrame({'Model': Model_name, 'Accuracy': Accuracy})
- Model_performace = Model_performace.sort_values(by='Accuracy', ascending=False)
- Model_performace
- [55]:
- Model Accuracy
- 3 SV 0.979741
- 2 RF 0.979520
- 1 DT 0.953061
- 0 NB 0.919185
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement