Advertisement
makispaiktis

Kaggle 6 - Binary Classification

Jul 13th, 2023
673
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.83 KB | None | 0 0
  1. import matplotlib.pyplot as plt
  2. # Set Matplotlib defaults
  3. plt.style.use('seaborn-whitegrid')
  4. plt.rc('figure', autolayout=True)
  5. plt.rc('axes', labelweight='bold', labelsize='large', titleweight='bold', titlesize=18, titlepad=10)
  6. plt.rc('animation', html='html5')
  7.  
  8. import pandas as pd
  9. from sklearn.model_selection import train_test_split
  10. from sklearn.preprocessing import StandardScaler, OneHotEncoder
  11. from sklearn.impute import SimpleImputer
  12. from sklearn.pipeline import make_pipeline
  13. from sklearn.compose import make_column_transformer
  14. from tensorflow import keras
  15. from tensorflow.keras import layers
  16. from keras.callbacks import EarlyStopping
  17.  
  18.  
  19.  
  20.  
  21. # 1a. Read dataset 1 and separate the target
  22. hotel = pd.read_csv('../input/dl-course-data/hotel.csv')
  23. X = hotel.copy()
  24. y = X.pop('is_canceled')
  25.  
  26. # 1b. A little transformation
  27. X['arrival_date_month'] = \
  28.     X['arrival_date_month'].map(
  29.         {'January':1, 'February': 2, 'March':3,
  30.          'April':4, 'May':5, 'June':6, 'July':7,
  31.          'August':8, 'September':9, 'October':10,
  32.          'November':11, 'December':12}
  33.     )
  34.  
  35. # 1c. Features: numerical and categorical
  36. features_num = [
  37.     "lead_time", "arrival_date_week_number",
  38.     "arrival_date_day_of_month", "stays_in_weekend_nights",
  39.     "stays_in_week_nights", "adults", "children", "babies",
  40.     "is_repeated_guest", "previous_cancellations",
  41.     "previous_bookings_not_canceled", "required_car_parking_spaces",
  42.     "total_of_special_requests", "adr",
  43. ]
  44. features_cat = [
  45.     "hotel", "arrival_date_month", "meal",
  46.     "market_segment", "distribution_channel",
  47.     "reserved_room_type", "deposit_type", "customer_type",
  48. ]
  49.  
  50. # 1d. Transformers and preprocessors
  51. transformer_num = make_pipeline(
  52.     SimpleImputer(strategy="constant"), # there are a few missing values
  53.     StandardScaler(),
  54. )
  55. transformer_cat = make_pipeline(
  56.     SimpleImputer(strategy="constant", fill_value="NA"),
  57.     OneHotEncoder(handle_unknown='ignore'),
  58. )
  59.  
  60. preprocessor = make_column_transformer(
  61.     (transformer_num, features_num),
  62.     (transformer_cat, features_cat),
  63. )
  64.  
  65. # stratify - make sure classes are evenlly represented across splits
  66. X_train, X_valid, y_train, y_valid = train_test_split(X, y, stratify=y, train_size=0.75)
  67. X_train = preprocessor.fit_transform(X_train)
  68. X_valid = preprocessor.transform(X_valid)
  69. input_shape = [X_train.shape[1]]
  70.  
  71.  
  72.  
  73. # 2. Model for classification - Sigmoid activation function - Dropout + batch normalization
  74. model = keras.Sequential([ layers.BatchNormalization(),
  75.                            layers.Dense(256, activation='relu', input_shape=input_shape),
  76.                            layers.BatchNormalization(),
  77.                            layers.Dropout(0.3),
  78.                            layers.Dense(256, activation='relu'),
  79.                            layers.BatchNormalization(),
  80.                            layers.Dropout(0.3),
  81.                            layers.Dense(1, activation='sigmoid') ])
  82.  
  83. model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['binary_accuracy'])
  84.  
  85. early_stopping = EarlyStopping(patience=5, min_delta=0.001, restore_best_weights=True)
  86.  
  87. history = model.fit(X_train, y_train,
  88.                     validation_data=(X_valid, y_valid),
  89.                     batch_size=512,
  90.                     epochs=200,
  91.                     callbacks=[early_stopping] )
  92.  
  93. history_df = pd.DataFrame(history.history)
  94. history_df.loc[:, ['loss', 'val_loss']].plot(title="Cross-entropy")
  95. history_df.loc[:, ['binary_accuracy', 'val_binary_accuracy']].plot(title="Accuracy")
  96. print("Though we can see the training loss continuing to fall, the early stopping callback prevented any overfitting. Moreover, the accuracy rose at the same rate as the cross-entropy fell, so it appears that minimizing cross-entropy was a good stand-in. All in all, it looks like this training was a success!", end="\n\n\n")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement