Advertisement
Guest User

Untitled

a guest
Mar 24th, 2017
79
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.55 KB | None | 0 0
  1. import re
  2.  
  3. import numpy as np
  4. import pandas as pd
  5. from keras.layers import Dense, Dropout
  6. from keras.models import Sequential
  7. from keras.wrappers.scikit_learn import KerasClassifier
  8. from sklearn.metrics import accuracy_score
  9. from sklearn.model_selection import StratifiedKFold, cross_val_score
  10.  
  11.  
  def normalize(series):
      """Rescale ``series`` linearly so its minimum maps to 0 and its maximum to 1.

      Args:
          series: Numeric pandas Series (anything exposing ``min``, ``max``,
              ``astype`` and element-wise arithmetic works).

      Returns:
          A new object of the same shape holding ``(x - min) / (max - min)``.
          When every value is identical the range is zero, and zeros are
          returned instead of the NaNs that a 0/0 division would produce.
      """
      lowest = series.min()
      spread = series.max() - lowest
      if spread == 0:
          # Constant input: avoid 0/0 -> NaN. The float cast keeps the result
          # dtype in line with what the division branch returns.
          return (series - lowest).astype(float)
      return (series - lowest) / spread
  14.  
  15.  
  def encode_one_hot(df, column, axis=1):
      """Replace ``column`` of ``df`` with one indicator column per distinct value.

      Args:
          df: Source DataFrame; it is not modified.
          column: Name of the categorical column to expand. The new columns
              are named ``<column>_<value>``.
          axis: Axis handed to ``DataFrame.drop`` when removing the original
              column (the default ``1`` drops it as a column).

      Returns:
          A new DataFrame with the indicator columns appended and ``column``
          removed.
      """
      # The default dtype of get_dummies differs between pandas versions
      # (bool on recent releases). Casting to uint8 keeps the indicators
      # numeric, so a frame mixing them with float columns still converts to
      # a numeric array rather than an object array.
      indicators = pd.get_dummies(df[column], prefix=column).astype(np.uint8)
      return df.join(indicators).drop(column, axis=axis)
  18.  
  19.  
  # Rare titles folded into a common representative; titles not listed here
  # are returned unchanged.
  _TITLE_ALIASES = {
      'Dona': 'Lady', 'Lady': 'Lady', 'the Countess': 'Lady',
      'Mme': 'Mme', 'Mlle': 'Mme',
      'Capt': 'Sir', 'Don': 'Sir', 'Major': 'Sir', 'Sir': 'Sir',
      'Jonkheer': 'Sir', 'Col': 'Sir',
  }


  def extract_title(name):
      """Return the (grouped) honorific found in a ``"Surname, Title. Given"`` name.

      The title is the text between the first comma and the next period,
      stripped of surrounding whitespace. A handful of rare titles are mapped
      onto 'Lady', 'Mme' or 'Sir'; any other title is returned as found.

      Args:
          name: Passenger name string.

      Returns:
          The grouped title, or 'Unknown' when ``name`` contains no
          ``", <title>."`` segment (previously this raised ``IndexError``).

      Raises:
          TypeError: If ``name`` is not a string (propagated from ``re``).
      """
      found = re.search(r',(.*?)\.', name)
      if found is None:
          # Same sentinel that extract_deck uses for missing information.
          return 'Unknown'
      title = found.group(1).strip()
      return _TITLE_ALIASES.get(title, title)
  31.  
  32.  
  def extract_deck(cabin):
      """Return the first character of a cabin identifier as the deck label.

      Args:
          cabin: Cabin string, or a missing value (``None`` / NaN).

      Returns:
          The first character of ``cabin``, or 'Unknown' when the cabin is
          missing or an empty string (an empty string previously produced an
          empty deck label).
      """
      if pd.isnull(cabin):
          return 'Unknown'
      # Slicing, not indexing, so an empty string yields '' without raising;
      # the `or` then maps that to the sentinel.
      return cabin[0:1] or 'Unknown'
  35.  
  36.  
  def build_model(input_dim=None):
      """Create and compile the feed-forward binary classifier.

      Args:
          input_dim: Number of input features. When omitted, the module-level
              ``feature_count`` is read at call time, so existing zero-argument
              callers (including ``KerasClassifier(build_fn=build_model)``)
              behave as before.

      Returns:
          A compiled ``Sequential`` model: two ReLU layers of 30 units followed
          by a single sigmoid unit, using the 'rmsprop' optimizer and
          'binary_crossentropy' loss, and reporting accuracy.
      """
      if input_dim is None:
          input_dim = feature_count
      m = Sequential([
          # Author's note: layer size = feature size + 1. The width is
          # hard-coded to 30 and is not derived from input_dim.
          Dense(30, activation='relu', input_dim=input_dim),
          Dense(30, activation='relu'),
          Dense(1, activation='sigmoid'),  # TODO 1x sigmoid vs 2x softmax?
      ])
      m.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
      return m
  45.  
  46.  
  # Training / evaluation settings.
  EPOCHS = 30
  BATCH_SIZE = 32
  RANDOM_STATE = 1337
  NUM_FOLDS = 10

  # Import data. Train and test rows are stacked so that scaling and one-hot
  # encoding produce identical columns for both sets.
  df_train = pd.read_csv('train.csv')
  df_test = pd.read_csv('test.csv')
  df = pd.concat([df_train.drop('Survived', axis=1), df_test], ignore_index=True)  # type: pd.DataFrame
  df = df.drop(['PassengerId', 'Ticket'], axis=1)  # type: pd.DataFrame

  # Derived categorical features.
  df['Title'] = df['Name'].map(extract_title)
  df['Deck'] = df['Cabin'].map(extract_deck)

  # Numeric features scaled via normalize(). Missing Age and Fare values are
  # imputed with the column median first; an un-imputed NaN would otherwise
  # pass straight through normalize() into the feature matrix.
  df['Age'] = normalize(df['Age'].fillna(df['Age'].median()))
  df['Parch'] = normalize(df['Parch'])
  df['SibSp'] = normalize(df['SibSp'])
  df['Fare'] = normalize(df['Fare'].fillna(df['Fare'].median()))
  # NOTE(review): this is the product of the already-normalised SibSp and
  # Parch columns, so it is 0 whenever either one is 0. If a head count was
  # intended, a sum of the raw columns may be what was meant - confirm.
  df['FamSize'] = normalize(df['SibSp'] * df['Parch'])

  # One-hot encode the categorical columns.
  for categorical in ('Sex', 'Embarked', 'Deck', 'Title'):
      df = encode_one_hot(df, categorical)

  # DataFrame.as_matrix() is gone from current pandas; .values exists in old
  # and new releases. The explicit float cast guarantees a numeric array even
  # if the indicator columns come back as booleans.
  x_all = df.drop(['Name', 'Cabin'], axis=1).values.astype(np.float64)

  train_count = len(df_train)
  feature_count = x_all.shape[1]
  print('Number of features:', feature_count)

  # The first train_count rows came from train.csv, the rest from test.csv.
  x_submit = x_all[train_count:]
  x_train = x_all[:train_count]
  y_train = df_train['Survived']
  80.  
  # NOTE(review): `nb_epoch` and `predict_classes` are kept exactly as written.
  # Newer Keras releases are understood to have renamed / removed them -
  # confirm against the installed version before upgrading.

  # Estimate generalisation accuracy with stratified k-fold cross-validation.
  cv = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=RANDOM_STATE)
  model = KerasClassifier(
      build_fn=build_model,
      nb_epoch=EPOCHS,
      batch_size=BATCH_SIZE,
      verbose=False,
  )
  results = cross_val_score(model, x_train, y_train, cv=cv, n_jobs=-1)
  print('Mean accuracy in %i-fold CV:\t' % NUM_FOLDS, results.mean())

  # Fit a fresh model on the complete training set.
  model = build_model()
  model.fit(x_train, y_train, nb_epoch=EPOCHS, batch_size=BATCH_SIZE, verbose=False)

  # Score that model on the very data it was trained on and show the
  # confusion matrix.
  y_pred = model.predict_classes(x_train, verbose=False).flatten()
  train_accuracy = accuracy_score(y_train, y_pred)
  print('Final accuracy on training data:', train_accuracy, '\n')
  confusion = pd.crosstab(
      y_train,
      y_pred,
      rownames=['Real'],
      colnames=['Predicted'],
      margins=True,
  )
  print(confusion)

  # Dump the misclassified training rows, together with the predicted label.
  row_filter = y_train.values != y_pred
  df_wrong = df_train.assign(SurvivedPrediction=y_pred)[row_filter]
  df_wrong.to_csv('wrong.csv', index=False)
  print('\nWrote', len(df_wrong), 'rows to wrong.csv')

  # Predict the held-out rows and write the submission file, indexed by
  # PassengerId.
  y_submit = model.predict_classes(x_submit, verbose=False).flatten()
  passenger_ids = df_test['PassengerId']
  df_submit = pd.DataFrame(y_submit, index=passenger_ids, columns=['Survived'])
  df_submit.to_csv('submission.csv')
  print('Wrote', len(df_submit), 'rows to submission.csv')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement