Advertisement
Guest User

Untitled

a guest
Oct 13th, 2019
146
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 6.54 KB | None | 0 0
  1. # Author: Abu Kaisar Jamil , University of Asia Pacific
  2. # SVM algorithm to predict who survives or not
  3.  
  4. # Platform: Google Colaboratory
  5.  
  6. # import numpy and pandas
  7. import numpy as np
  8. import pandas as pd
  9.  
  10. #load train and test file
  11. train = pd.read_csv('train.csv')
  12. test = pd.read_csv('test.csv')
  13. train.head()
  14.  
  15. #print the formate
  16. train.shape
  17. test.shape
  18.  
  19. #missing train data
  20. train.isnull().sum()
  21.  
  22. #missing test data
  23. test.isnull().sum()
  24.  
  25. #Graphical libaray for visualization
  26. import matplotlib.pyplot as plt
  27. %matplotlib inline
  28. import seaborn as sns
  29. sns.set()
  30.  
  31. #barchart function
  32. def bar_chart(feature):
  33. survived = train[train['Survived']==1][feature].value_counts()
  34. dead = train[train['Survived']==0][feature].value_counts()
  35. df = pd.DataFrame([survived,dead])
  36. df.index = ['Survived','Dead']
  37. df.plot(kind='bar',stacked=True, figsize=(10,5))
  38. #show gender
  39. bar_chart('Sex')
  40.  
  41. #summing train and test dataset
  42. train_test_data = [train,test]
  43.  
  44. for dataset in train_test_data:
  45. dataset['Title'] = dataset['Name'].str.extract('([A-Za-z]+)\.',expand=False)
  46.  
  47. #find number of train and test dataset title
  48. train['Title'].value_counts()
  49.  
  50. #others titles are defined by 3
  51. title_mapping = {"Mr": 0, "Miss": 1, "Mrs": 2, "Master": 3, "Dr": 3, "Rev": 3, "Col": 3, "Major": 3, "Mlle": 3,
  52. "Countess": 3, "Ms": 3, "Lady": 3, "Jonkheer": 3, "Don": 3, "Mme": 3, "Capt": 3, "Sir": 3, }
  53. for dataset in train_test_data:
  54. dataset['Title'] = dataset['Title'].map(title_mapping)
  55. #print train and test data
  56. train.head()
  57. test.head()
  58.  
  59. # delete unnecessary feature from dataset
  60. train.drop('Name', axis=1, inplace=True)
  61. test.drop('Name', axis=1, inplace=True)
  62. # now print
  63. train.head()
  64. test.head()
  65.  
  66. #maping sex
  67. sex_mapping = {"male": 0, "female": 1}
  68. for dataset in train_test_data:
  69. dataset['Sex'] = dataset['Sex'].map(sex_mapping)
  70. #print barchart
  71. bar_chart('Sex')
  72.  
  73. # fill missing age with median age for each title (Mr, Mrs, Miss, Others)
  74. train["Age"].fillna(train.groupby("Title")["Age"].transform("median"), inplace=True)
  75. test["Age"].fillna(test.groupby("Title")["Age"].transform("median"), inplace=True)
  76. train.groupby("Title")["Age"].transform("median")
  77. # now print
  78. train.head()
  79.  
  80. # showing in graph
  81. facet = sns.FacetGrid(train, hue="Survived",aspect=4)
  82. facet.map(sns.kdeplot,'Age',shade= True)
  83. facet.set(xlim=(0, train['Age'].max()))
  84. facet.add_legend()
  85. # show
  86. plt.show()
  87.  
  88. #differentionate age
  89.  
  90. for dataset in train_test_data:
  91. dataset.loc[ dataset['Age'] <= 16, 'Age'] = 0,
  92. dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 26), 'Age'] = 1,
  93. dataset.loc[(dataset['Age'] > 26) & (dataset['Age'] <= 36), 'Age'] = 2,
  94. dataset.loc[(dataset['Age'] > 36) & (dataset['Age'] <= 62), 'Age'] = 3,
  95. dataset.loc[ dataset['Age'] > 62, 'Age'] = 4
  96. # now print
  97. train.head()
  98.  
  99. #filling missing values
  100. Pclass1 = train[train['Pclass']==1]['Embarked'].value_counts()
  101. Pclass2 = train[train['Pclass']==2]['Embarked'].value_counts()
  102. Pclass3 = train[train['Pclass']==3]['Embarked'].value_counts()
  103. df = pd.DataFrame([Pclass1, Pclass2, Pclass3])
  104. df.index = ['1st class','2nd class', '3rd class']
  105. df.plot(kind='bar',stacked=True, figsize=(10,5))
  106.  
  107. for dataset in train_test_data:
  108. dataset['Embarked'] = dataset['Embarked'].fillna('S')
  109. train.head()
  110.  
  111. #each city defined by each number
  112. embarked_mapping = {"S": 0, "C": 1, "Q": 2}
  113. for dataset in train_test_data:
  114. dataset['Embarked'] = dataset['Embarked'].map(embarked_mapping)
  115.  
  116. # fill missing Fare with median fare for each Pclass
  117. train["Fare"].fillna(train.groupby("Pclass")["Fare"].transform("median"), inplace=True)
  118. test["Fare"].fillna(test.groupby("Pclass")["Fare"].transform("median"), inplace=True)
  119. train.head(5)
  120.  
  121. #dividing tickets price into different begs
  122. for dataset in train_test_data:
  123. dataset.loc[ dataset['Fare'] <= 17, 'Fare'] = 0,
  124. dataset.loc[(dataset['Fare'] > 17) & (dataset['Fare'] <= 30), 'Fare'] = 1,
  125. dataset.loc[(dataset['Fare'] > 30) & (dataset['Fare'] <= 100), 'Fare'] = 2,
  126. dataset.loc[ dataset['Fare'] > 100, 'Fare'] = 3
  127. train.head()
  128. train.Cabin.value_counts()
  129.  
  130. #define which cabin is it ?
  131. for dataset in train_test_data:
  132. dataset['Cabin'] = dataset['Cabin'].str[:1]
  133.  
  134. Pclass1 = train[train['Pclass']==1]['Cabin'].value_counts()
  135. Pclass2 = train[train['Pclass']==2]['Cabin'].value_counts()
  136. Pclass3 = train[train['Pclass']==3]['Cabin'].value_counts()
  137. df = pd.DataFrame([Pclass1, Pclass2, Pclass3])
  138. df.index = ['1st class','2nd class', '3rd class']
  139. df.plot(kind='bar',stacked=True, figsize=(10,5))
  140.  
  141. # cabin mapping
  142. cabin_mapping = {"A": 0, "B": 0.4, "C": 0.8, "D": 1.2, "E": 1.6, "F": 2, "G": 2.4, "T": 2.8}
  143. for dataset in train_test_data:
  144. dataset['Cabin'] = dataset['Cabin'].map(cabin_mapping)
  145.  
  146. # fill missing Fare with median fare for each Pclass
  147. train["Cabin"].fillna(train.groupby("Pclass")["Cabin"].transform("median"), inplace=True)
  148. test["Cabin"].fillna(test.groupby("Pclass")["Cabin"].transform("median"), inplace=True)
  149. train["FamilySize"] = train["SibSp"] + train["Parch"] + 1
  150. test["FamilySize"] = test["SibSp"] + test["Parch"] + 1
  151. facet = sns.FacetGrid(train, hue="Survived",aspect=4)
  152. facet.map(sns.kdeplot,'FamilySize',shade= True)
  153. facet.set(xlim=(0, train['FamilySize'].max()))
  154. facet.add_legend()
  155. plt.xlim(0)
  156.  
  157. #family mapping
  158. family_mapping = {1: 0, 2: 0.4, 3: 0.8, 4: 1.2, 5: 1.6, 6: 2, 7: 2.4, 8: 2.8, 9: 3.2, 10: 3.6, 11: 4}
  159. for dataset in train_test_data:
  160. dataset['FamilySize'] = dataset['FamilySize'].map(family_mapping)
  161. train.head()
  162.  
  163. #droping features
  164. features_drop = ['Ticket', 'SibSp', 'Parch']
  165. train = train.drop(features_drop, axis=1)
  166. test = test.drop(features_drop, axis=1)
  167. train = train.drop(['PassengerId'], axis=1)
  168. train_data = train.drop('Survived', axis=1)
  169. target = train['Survived']
  170. #now print
  171. train_data.shape, target.shape
  172.  
  173. #import libraries
  174. from sklearn.tree import DecisionTreeClassifier
  175. from sklearn.model_selection import KFold
  176. from sklearn.model_selection import cross_val_score
  177. from sklearn.svm import SVC
  178. k_fold = KFold(n_splits=10, shuffle=True, random_state=0)
  179. train_data.head(50)
  180. test.head()
  181.  
  182. #using Decision Tree
  183. clf = DecisionTreeClassifier()
  184. scoring = 'accuracy'
  185. score = cross_val_score(clf, train_data, target, cv=k_fold, n_jobs=1, scoring=scoring)
  186. print(score)
  187.  
  188. # decision tree Score
  189. round(np.mean(score)*100, 2)
  190.  
  191. #Testing by SVM
  192. clf = SVC(gamma="auto")
  193. clf.fit(train_data, target)
  194. test_data = test.drop("PassengerId", axis=1).copy()
  195. prediction = clf.predict(test_data)
  196.  
  197. submission = pd.DataFrame({
  198. "PassengerId": test["PassengerId"],
  199. "Survived": prediction
  200. })
  201. # servived result will be saved as submission.csv
  202. submission.to_csv('submission.csv', index=False)
  203. submission = pd.read_csv('submission.csv')
  204. submission.head()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement