Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Author: Abu Kaisar Jamil, University of Asia Pacific
# SVM algorithm to predict who survives or not
# Platform: Google Colaboratory
# Data libraries used throughout the script.
import numpy as np
import pandas as pd

# Load the training and test splits from the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
print(train.head())

# Dimensions (rows, columns) of each split.
print(train.shape)
print(test.shape)

# Number of missing values per column, first for train and then for test.
print(train.isnull().sum())
print(test.isnull().sum())
# Plotting libraries used for the exploratory charts.
import matplotlib.pyplot as plt
import seaborn as sns

# `%matplotlib inline` is IPython-only syntax and a syntax error in a plain
# .py file. Issue the same magic through the IPython API when a notebook
# kernel is present, and skip it otherwise.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

# Apply seaborn's default theme to all subsequent plots.
sns.set()
# Bar-chart helper.
def bar_chart(feature):
    """Draw a stacked bar chart of ``feature`` counts split by survival.

    Reads the module-level ``train`` DataFrame, counts the values of
    ``feature`` separately for rows where ``Survived`` is 1 and where it is 0,
    and plots the two rows of counts ('Survived', 'Dead') as stacked bars.

    Args:
        feature: Name of the column in ``train`` to break down.
    """
    survived = train[train['Survived'] == 1][feature].value_counts()
    dead = train[train['Survived'] == 0][feature].value_counts()
    df = pd.DataFrame([survived, dead])
    df.index = ['Survived', 'Dead']
    df.plot(kind='bar', stacked=True, figsize=(10, 5))
# Survival counts broken down by sex.
bar_chart('Sex')

# Keep both splits in one list so every transformation is applied to each.
train_test_data = [train, test]

# The title is taken as the first run of letters followed by a period in
# the Name column. A raw string keeps `\.` as a regex escape instead of an
# invalid string escape.
for dataset in train_test_data:
    dataset['Title'] = dataset['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

# Frequency of each extracted title in the training split.
print(train['Title'].value_counts())

# Mr, Miss and Mrs get their own codes; every other title shares code 3.
other_title_code = 3
title_mapping = {
    "Mr": 0, "Miss": 1, "Mrs": 2, "Master": 3, "Dr": 3, "Rev": 3, "Col": 3,
    "Major": 3, "Mlle": 3, "Countess": 3, "Ms": 3, "Lady": 3, "Jonkheer": 3,
    "Don": 3, "Mme": 3, "Capt": 3, "Sir": 3,
}
for dataset in train_test_data:
    # A title missing from the mapping (or a name with no title) would map
    # to NaN; fold those rows into the shared "other" code so no NaN is
    # carried forward in this column.
    dataset['Title'] = (
        dataset['Title'].map(title_mapping).fillna(other_title_code).astype(int)
    )

# Inspect both splits after encoding.
print(train.head())
print(test.head())
# Remove the Name column from both splits.
train.drop('Name', axis=1, inplace=True)
test.drop('Name', axis=1, inplace=True)

print(train.head())
print(test.head())

# Encode sex numerically: male -> 0, female -> 1.
sex_mapping = {"male": 0, "female": 1}
for dataset in train_test_data:
    dataset['Sex'] = dataset['Sex'].map(sex_mapping)

# Survival counts by the encoded sex value.
bar_chart('Sex')
# Fill missing ages with the median age of the rows sharing the same Title
# value, computed separately within each split. The result is assigned back
# to the column: calling fillna(inplace=True) on `train["Age"]` operates on
# an intermediate object and is not guaranteed to update the DataFrame.
train["Age"] = train["Age"].fillna(train.groupby("Title")["Age"].transform("median"))
test["Age"] = test["Age"].fillna(test.groupby("Title")["Age"].transform("median"))

# Per-row median age of each training row's Title group, for inspection.
print(train.groupby("Title")["Age"].transform("median"))

print(train.head())

# Kernel density of Age in the training split, one curve per Survived value.
facet = sns.FacetGrid(train, hue="Survived", aspect=4)
facet.map(sns.kdeplot, 'Age', shade=True)
facet.set(xlim=(0, train['Age'].max()))
facet.add_legend()

plt.show()
# Replace each age with an ordinal bucket code:
#   0: age <= 16
#   1: 16 < age <= 26
#   2: 26 < age <= 36
#   3: 36 < age <= 62
#   4: age > 62
# pd.cut uses right-closed intervals by default, matching the `<=` upper
# bounds above. Missing ages stay NaN. The original statements ended in
# trailing commas, which assigned one-element tuples rather than scalars.
age_bin_edges = [-np.inf, 16, 26, 36, 62, np.inf]
for dataset in train_test_data:
    # labels=False yields the integer bin index; cast to float so the column
    # keeps a float dtype whether or not NaN is present.
    dataset['Age'] = pd.cut(dataset['Age'], bins=age_bin_edges, labels=False).astype(float)

print(train.head())
# Embarkation-port counts per passenger class in the training split,
# plotted as one stacked bar per class.
Pclass1 = train[train['Pclass'] == 1]['Embarked'].value_counts()
Pclass2 = train[train['Pclass'] == 2]['Embarked'].value_counts()
Pclass3 = train[train['Pclass'] == 3]['Embarked'].value_counts()
df = pd.DataFrame([Pclass1, Pclass2, Pclass3])
df.index = ['1st class', '2nd class', '3rd class']
df.plot(kind='bar', stacked=True, figsize=(10, 5))

# Missing embarkation ports are filled with 'S' in both splits.
for dataset in train_test_data:
    dataset['Embarked'] = dataset['Embarked'].fillna('S')

print(train.head())

# Encode each port as a number: S -> 0, C -> 1, Q -> 2.
embarked_mapping = {"S": 0, "C": 1, "Q": 2}
for dataset in train_test_data:
    dataset['Embarked'] = dataset['Embarked'].map(embarked_mapping)
# Fill missing fares with the median fare of the same Pclass, computed
# separately within each split. The result is assigned back to the column
# rather than relying on fillna(inplace=True) through a column selection.
train["Fare"] = train["Fare"].fillna(train.groupby("Pclass")["Fare"].transform("median"))
test["Fare"] = test["Fare"].fillna(test.groupby("Pclass")["Fare"].transform("median"))

print(train.head(5))

# Replace each fare with an ordinal bucket code:
#   0: fare <= 17
#   1: 17 < fare <= 30
#   2: 30 < fare <= 100
#   3: fare > 100
# pd.cut's default right-closed intervals match the `<=` upper bounds. The
# original statements ended in trailing commas, which assigned one-element
# tuples rather than scalars.
fare_bin_edges = [-np.inf, 17, 30, 100, np.inf]
for dataset in train_test_data:
    # Cast to float so the column dtype does not depend on NaN being present.
    dataset['Fare'] = pd.cut(dataset['Fare'], bins=fare_bin_edges, labels=False).astype(float)

print(train.head())
# Frequency of each raw Cabin value in the training split.
print(train.Cabin.value_counts())

# Keep only the first character of each Cabin string.
for dataset in train_test_data:
    dataset['Cabin'] = dataset['Cabin'].str[:1]

# Cabin-letter counts per passenger class, one stacked bar per class.
Pclass1 = train[train['Pclass'] == 1]['Cabin'].value_counts()
Pclass2 = train[train['Pclass'] == 2]['Cabin'].value_counts()
Pclass3 = train[train['Pclass'] == 3]['Cabin'].value_counts()
df = pd.DataFrame([Pclass1, Pclass2, Pclass3])
df.index = ['1st class', '2nd class', '3rd class']
df.plot(kind='bar', stacked=True, figsize=(10, 5))

# Map each cabin letter to a numeric code in steps of 0.4.
cabin_mapping = {"A": 0, "B": 0.4, "C": 0.8, "D": 1.2, "E": 1.6, "F": 2, "G": 2.4, "T": 2.8}
for dataset in train_test_data:
    dataset['Cabin'] = dataset['Cabin'].map(cabin_mapping)

# Fill missing Cabin codes with the median Cabin code of the same Pclass,
# computed separately within each split. The result is assigned back to the
# column rather than relying on fillna(inplace=True) through a selection.
train["Cabin"] = train["Cabin"].fillna(train.groupby("Pclass")["Cabin"].transform("median"))
test["Cabin"] = test["Cabin"].fillna(test.groupby("Pclass")["Cabin"].transform("median"))
# Family size = siblings/spouses + parents/children + 1.
train["FamilySize"] = train["SibSp"] + train["Parch"] + 1
test["FamilySize"] = test["SibSp"] + test["Parch"] + 1

# Kernel density of FamilySize in the training split, one curve per
# Survived value.
facet = sns.FacetGrid(train, hue="Survived", aspect=4)
facet.map(sns.kdeplot, 'FamilySize', shade=True)
facet.set(xlim=(0, train['FamilySize'].max()))
facet.add_legend()
plt.xlim(0)

# Map family sizes 1..11 to codes 0..4 in steps of 0.4. A size outside this
# table maps to NaN.
family_mapping = {
    1: 0, 2: 0.4, 3: 0.8, 4: 1.2, 5: 1.6, 6: 2,
    7: 2.4, 8: 2.8, 9: 3.2, 10: 3.6, 11: 4,
}
for dataset in train_test_data:
    dataset['FamilySize'] = dataset['FamilySize'].map(family_mapping)

print(train.head())
# Drop the columns that are not used as model features.
features_drop = ['Ticket', 'SibSp', 'Parch']
# NOTE: drop() returns new frames here, so `train` and `test` are rebound
# and `train_test_data` still holds the earlier objects.
train = train.drop(features_drop, axis=1)
test = test.drop(features_drop, axis=1)
# PassengerId is removed from train only; test keeps it.
train = train.drop(['PassengerId'], axis=1)

# Split the training frame into the feature matrix and the label column.
train_data = train.drop('Survived', axis=1)
target = train['Survived']

print(train_data.shape, target.shape)
# Model and evaluation utilities.
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# 10-fold cross-validation with shuffling and a fixed seed.
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)

print(train_data.head(50))
print(test.head())

# Cross-validated accuracy of a decision tree on the training features.
clf = DecisionTreeClassifier()
scoring = 'accuracy'
score = cross_val_score(clf, train_data, target, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)

# Mean fold accuracy as a percentage, rounded to two decimals.
print(round(np.mean(score) * 100, 2))
# Fit an SVM on the full training set. `clf` is rebound here, replacing any
# earlier classifier held under that name.
clf = SVC(gamma="auto")
clf.fit(train_data, target)

# PassengerId is kept in `test` for the submission but removed from the
# feature matrix passed to the model.
test_data = test.drop("PassengerId", axis=1).copy()
prediction = clf.predict(test_data)

# Pair each test passenger with its predicted label.
submission = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Survived": prediction,
})

# Save the predictions as submission.csv, then read the file back and show
# its first rows.
submission.to_csv('submission.csv', index=False)
submission = pd.read_csv('submission.csv')
print(submission.head())
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement