Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- import numpy as np
- import seaborn as sns
- import matplotlib.pyplot as plt
- %matplotlib inline
- from sklearn.linear_model import LogisticRegression
- from sklearn.neighbors import KNeighborsClassifier
# Load the Titanic training and test sets; `combine` lets later steps apply
# the same preprocessing to both frames with a single loop.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
combine = [train_df, test_df]

# Quick look at the data: first rows, numeric summary, categorical summary.
train_df.head()
train_df.describe()
train_df.describe(include=['O'])

# Which columns have missing values? isnull().sum() counts NaNs per column
# in one vectorized pass instead of a hand-rolled Python loop.
missing_values = train_df.isnull().sum().to_dict()
print(missing_values)
# Averaging Survived (1 = lived, 0 = died) within each category shows how
# the survival rate varies by feature. In a notebook, evaluate one feature
# at a time to see each table rendered.
for feature in ('Pclass', 'Sex', 'SibSp', 'Parch'):
    train_df[[feature, 'Survived']].groupby([feature], as_index=False).mean().sort_values(by='Survived', ascending=False)
# Correlating numerical features with survival.
# NOTE: seaborn renamed FacetGrid's `size` argument to `height` in v0.9 and
# removed `size` entirely in later releases, so `height=` is used throughout.

# Age distribution for survivors vs. non-survivors.
g = sns.FacetGrid(train_df, col='Survived')
g.map(plt.hist, 'Age', bins=20)

# Age distribution split by both survival and passenger class.
grid = sns.FacetGrid(train_df, col='Survived', row='Pclass', height=2.0, aspect=1.6)
grid.map(plt.hist, 'Age', bins=20)
grid.add_legend()

# Survival rate by class and sex, one panel per port of embarkation.
# (The third positional arg to pointplot is the hue variable.)
grid = sns.FacetGrid(train_df, row='Embarked', height=2.0, aspect=1.6)
grid.map(sns.pointplot, 'Pclass', 'Survived', 'Sex', palette='deep')
grid.add_legend()

# Mean fare by sex, split by embarkation port and survival.
grid = sns.FacetGrid(train_df, col='Survived', row='Embarked', height=2.0, aspect=1.6)
grid.map(sns.barplot, 'Sex', 'Fare', ci=None)
grid.add_legend()
# Ticket and Cabin carry little usable signal (Cabin is mostly missing in
# this dataset), so remove both columns from each frame and rebuild `combine`.
train_df = train_df.drop(columns=['Ticket', 'Cabin'])
test_df = test_df.drop(columns=['Ticket', 'Cabin'])
combine = [train_df, test_df]
# Engineer a 'Title' feature: capture the word immediately preceding a
# period in the Name column (e.g. "Braund, Mr. Owen" -> "Mr").
# The pattern is a raw string so that `\.` is a regex escape, not a
# (deprecated, SyntaxWarning-raising) Python string escape.
for dataframe in combine:
    dataframe['Title'] = dataframe.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)
# Cross-tabulate titles against sex to spot rare or odd titles.
pd.crosstab(train_df['Title'], train_df['Sex'])
# Collapse infrequent titles into one 'Rare' bucket and fold French /
# abbreviated variants onto their common English equivalents.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
               'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
for dataset in combine:
    dataset['Title'] = dataset['Title'].replace(rare_titles, 'Rare')
    dataset['Title'] = dataset['Title'].replace(['Mlle', 'Ms'], 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
# Survival rate per consolidated title.
train_df[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()
# Convert the categorical titles to small integer codes so models can
# consume them. Names whose title the regex did not match produce NaN in
# the Title column; map those to 0 instead of letting NaN flow into the
# feature (unmapped titles would otherwise also become NaN).
title_mapping = {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Rare': 5}
for dataset in combine:
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)
train_df.head()
# Name has served its purpose (Title extraction) and PassengerId is a row
# identifier, not a feature: drop both from the training frame. Only Name
# is dropped from the test frame, so its PassengerId column is retained.
train_df = train_df.drop(columns=['Name', 'PassengerId'])
test_df = test_df.drop(columns=['Name'])
combine = [train_df, test_df]
train_df.head()
# Encode Sex numerically: female -> 1, male -> 0.
sex_codes = {'female': 1, 'male': 0}
for dataset in combine:
    dataset['Sex'] = dataset['Sex'].map(sex_codes)
train_df.head()
#################### New Stuff- please copy this in your code for next class. We will discuss it then ##########################
# What should we do about missing age values? Age will likely be a strong predictor for survivorship.
# We will estimate age values based on other correlated data: Sex, and Pclass.
# Start by making an empty 2x3 array: one guessed age per combination of
# Sex code (0/1, rows) x Pclass (1/2/3, columns).
guess_ages = np.zeros((2,3))
guess_ages
for dataset in combine:
    # First pass: median of the KNOWN ages in each Sex x Pclass cell.
    # NOTE(review): assumes every cell has at least one non-null Age —
    # an empty cell would make median() NaN and int(NaN) would raise.
    for i in range(0, 2):
        for j in range(0, 3):
            guess_df = dataset[(dataset['Sex'] == i) & (dataset['Pclass'] == j+1)]['Age'].dropna()
            age_guess = guess_df.median()
            # Convert random age float to nearest .5 age
            guess_ages[i,j] = int( age_guess/0.5 + 0.5 ) * 0.5
    # Second pass: fill each missing Age with the guess for its cell.
    for i in range(0, 2):
        for j in range(0, 3):
            dataset.loc[(dataset.Age.isnull()) & (dataset.Sex == i) & (dataset.Pclass == j+1), 'Age'] = guess_ages[i,j]
    # With no NaNs left, Age can be stored as an integer column.
    dataset['Age'] = dataset['Age'].astype(int)
train_df.head()
# 'Embarked' is missing a few values: fill them with the most frequent
# port, inspect survival per port, then encode the letters as ordinals.
freq_port = train_df.Embarked.dropna().mode()[0]
for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].fillna(freq_port)
# Survival rate per embarkation port.
train_df[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean().sort_values(by='Survived', ascending=False)
port_codes = {'S': 0, 'C': 1, 'Q': 2}
for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].map(port_codes).astype(int)
train_df.head()
# Fill the missing 'Fare' values in the test set with the median fare.
# Assign back instead of calling fillna(..., inplace=True) on the column:
# chained inplace fills are deprecated in pandas >= 2.1 and may not write
# through to the parent frame. median() already skips NaN, so the previous
# .dropna() was redundant.
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].median())
# How does family contribute to survivorship? FamilySize counts the
# passenger themselves plus siblings/spouses (SibSp) and parents/children
# (Parch) aboard.
for frame in combine:
    frame['FamilySize'] = 1 + frame['SibSp'] + frame['Parch']
train_df[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=False).mean().sort_values(by='Survived', ascending=False)
# Derive a binary IsAlone flag: 1 when the passenger has no family aboard
# (FamilySize == 1), else 0. The boolean mask cast to int replaces the
# set-zero-then-loc-assign two-step with one vectorized expression.
for frame in combine:
    frame['IsAlone'] = (frame['FamilySize'] == 1).astype(int)
train_df[['IsAlone', 'Survived']].groupby(['IsAlone'], as_index=False).mean()
# IsAlone subsumes the raw family counts, so drop Parch, SibSp and
# FamilySize from both frames and rebuild `combine`.
train_df = train_df.drop(columns=['Parch', 'SibSp', 'FamilySize'])
test_df = test_df.drop(columns=['Parch', 'SibSp', 'FamilySize'])
combine = [train_df, test_df]
train_df.head()
########################## Next up, making models!!! (We will do this live in class next week) #####################################
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement