import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
# Jupyter magic: render plots inline in the notebook
%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier


train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
combine = [train_df, test_df]
train_df.head()


train_df.describe()
train_df.describe(include=['O'])  # include=['O'] summarizes the object (string) columns

# Which columns have missing values?
missing_values = {}

columns = list(train_df)
for column in columns:
    missing = train_df[column].isnull().sum()
    missing_values[column] = missing

print(missing_values)
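
# The loop above can be collapsed into a single idiomatic call (an optional
# alternative, same information):
print(train_df.isnull().sum())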


# Averaging the 1s and 0s (1 = survived, 0 = died) gives the survival rate for each
# group. Run these four lines one at a time to see the results; a compact loop
# version follows below.
train_df[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean().sort_values(by='Survived', ascending=False)
train_df[['Sex', 'Survived']].groupby(['Sex'], as_index=False).mean().sort_values(by='Survived', ascending=False)
train_df[['SibSp', 'Survived']].groupby(['SibSp'], as_index=False).mean().sort_values(by='Survived', ascending=False)
train_df[['Parch', 'Survived']].groupby(['Parch'], as_index=False).mean().sort_values(by='Survived', ascending=False)
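
# The same four comparisons as one loop (a sketch; prints plain text instead of
# displaying notebook tables):
for col in ['Pclass', 'Sex', 'SibSp', 'Parch']:
    print(train_df[[col, 'Survived']].groupby([col], as_index=False).mean()
          .sort_values(by='Survived', ascending=False))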


# Correlating numerical features
g = sns.FacetGrid(train_df, col='Survived')
g.map(plt.hist, 'Age', bins=20)


# Note: in seaborn >= 0.9 the FacetGrid `size` argument is named `height`.
grid = sns.FacetGrid(train_df, col='Survived', row='Pclass', size=2.0, aspect=1.6)
grid.map(plt.hist, 'Age', bins=20)
grid.add_legend()


grid = sns.FacetGrid(train_df, row='Embarked', size=2.0, aspect=1.6)
grid.map(sns.pointplot, 'Pclass', 'Survived', 'Sex', palette='deep')
grid.add_legend()


grid = sns.FacetGrid(train_df, col='Survived', row='Embarked', size=2.0, aspect=1.6)
grid.map(sns.barplot, 'Sex', 'Fare', ci=None)
grid.add_legend()
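
# A quick numeric complement to the plots (a sketch): a correlation heatmap over
# the numeric columns only, so the remaining string columns don't get in the way.
numeric_cols = train_df.select_dtypes(include=[np.number])
sns.heatmap(numeric_cols.corr(), annot=True, cmap='coolwarm')
plt.show()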


# Drop 'Cabin' and 'Ticket'
train_df = train_df.drop(['Ticket', 'Cabin'], axis=1)
test_df = test_df.drop(['Ticket', 'Cabin'], axis=1)
combine = [train_df, test_df]


# Engineer a 'Title' feature with a regex (regular expression): capture the word
# that immediately precedes a period in each name.
for dataframe in combine:
    dataframe['Title'] = dataframe.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)

pd.crosstab(train_df['Title'], train_df['Sex'])
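
# Quick sanity check of the pattern on a single name (illustrative; this name is the
# first row of the standard Titanic train.csv):
import re
re.search(r' ([A-Za-z]+)\.', 'Braund, Mr. Owen Harris').group(1)  # -> 'Mr'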


# Group rare titles together
for dataset in combine:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess', 'Capt', 'Col',
                                                 'Don', 'Dr', 'Major', 'Rev', 'Sir',
                                                 'Jonkheer', 'Dona'], 'Rare')
    dataset['Title'] = dataset['Title'].replace(['Mlle', 'Ms'], 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')

train_df[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()
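
# Optional check (a sketch): confirm only the five expected titles remain.
print(train_df['Title'].value_counts())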


# Convert the categorical titles to numeric codes
title_mapping = {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Rare': 5}

for dataset in combine:
    dataset['Title'] = dataset['Title'].map(title_mapping)

train_df.head()
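
# Defensive note (an addition, not part of the original flow): map() turns any title
# missing from title_mapping into NaN, so a fill keeps the column integer-valued.
for dataset in combine:
    dataset['Title'] = dataset['Title'].fillna(0).astype(int)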

# Drop the Name column; also drop PassengerId from the training set (the test set
# keeps it for the Kaggle submission file)
train_df = train_df.drop(['Name', 'PassengerId'], axis=1)
test_df = test_df.drop(['Name'], axis=1)
combine = [train_df, test_df]
train_df.head()

# Convert Sex to numeric codes
for dataset in combine:
    dataset['Sex'] = dataset['Sex'].map({'female': 1, 'male': 0})

train_df.head()

#################### New stuff: please copy this into your code for next class. We will discuss it then. ####################
# What should we do about missing Age values? Age will likely be a strong predictor
# of survival, so we estimate missing ages from other correlated features: Sex and Pclass.

# Start with an empty 2x3 array of guessed ages, one cell per Sex x Pclass combination
guess_ages = np.zeros((2, 3))
guess_ages

for dataset in combine:
    for i in range(0, 2):
        for j in range(0, 3):
            guess_df = dataset[(dataset['Sex'] == i) &
                               (dataset['Pclass'] == j + 1)]['Age'].dropna()

            age_guess = guess_df.median()

            # Round the guessed median age to the nearest 0.5
            guess_ages[i, j] = int(age_guess / 0.5 + 0.5) * 0.5

    for i in range(0, 2):
        for j in range(0, 3):
            dataset.loc[(dataset.Age.isnull()) & (dataset.Sex == i) &
                        (dataset.Pclass == j + 1), 'Age'] = guess_ages[i, j]

    dataset['Age'] = dataset['Age'].astype(int)

train_df.head()
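
# An equivalent, more concise alternative (a sketch of the same idea): fill each
# missing age with the median age of that passenger's Sex x Pclass group. Running
# it here is a no-op, since the loop above has already filled the gaps.
for dataset in combine:
    group_median = dataset.groupby(['Sex', 'Pclass'])['Age'].transform('median')
    dataset['Age'] = dataset['Age'].fillna(group_median).astype(int)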


# 'Embarked' is missing a few values. Fill them with the mode, then map the port
# letters to integer codes.
freq_port = train_df.Embarked.dropna().mode()[0]

for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].fillna(freq_port)

train_df[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean().sort_values(by='Survived', ascending=False)

for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)

train_df.head()
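
# Caveat (an aside, not part of the original flow): the 0/1/2 codes impose an
# arbitrary order on the ports. One-hot encoding avoids that; a sketch of the idea:
pd.get_dummies(train_df['Embarked'], prefix='Embarked').head()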


# Use the median to fill the missing 'Fare' values in the test set.
test_df['Fare'].fillna(test_df['Fare'].dropna().median(), inplace=True)


# How does family size relate to survival?
for dataset in combine:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1  # +1 counts the passenger

train_df[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=False).mean().sort_values(by='Survived', ascending=False)

# Let's make a new feature called IsAlone
for dataset in combine:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1

train_df[['IsAlone', 'Survived']].groupby(['IsAlone'], as_index=False).mean()
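
# The two-step assignment above can also be written in one line (an equivalent
# alternative):
for dataset in combine:
    dataset['IsAlone'] = (dataset['FamilySize'] == 1).astype(int)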

# Now we can drop the Parch, SibSp, and FamilySize features in favor of IsAlone
train_df = train_df.drop(['Parch', 'SibSp', 'FamilySize'], axis=1)
test_df = test_df.drop(['Parch', 'SibSp', 'FamilySize'], axis=1)
combine = [train_df, test_df]

train_df.head()


########################## Next up: making models! (We will do this live in class next week) ##########################
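
# A minimal preview (a sketch, not the in-class version) using the classifiers
# imported at the top: fit LogisticRegression on the engineered features and report
# training accuracy.
X_train = train_df.drop('Survived', axis=1)
y_train = train_df['Survived']

logreg = LogisticRegression()
logreg.fit(X_train, y_train)
print(round(logreg.score(X_train, y_train) * 100, 2))  # training accuracy, in percent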