Advertisement
Guest User

Untitled

a guest
Mar 26th, 2019
75
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.55 KB | None | 0 0
  1. import numpy as np
  2. import pandas as pd
  3. import matplotlib.pyplot as plt
  4. from sklearn.utils import shuffle
  5. import seaborn as sns
  6. from scipy.stats import norm
  7.  
  8.  
  9. from scipy.stats.distributions import *
  10. from sklearn.ensemble import RandomForestClassifier
  11. from sklearn.metrics import log_loss
  12. from sklearn.model_selection import RandomizedSearchCV
  13.  
  14.  
  15. def get_data():
  16.     train = pd.read_csv('data/raw/train.csv')
  17.     test = pd.read_csv('data/raw/test.csv')
  18.     train = shuffle(train)
  19.  
  20.  
  21.     # #train
  22.     # train.SibSp = train.SibSp.map({0: 7, 1: 9, 2: 8, 3: 6, 4: 5, 5: 4, 6: 3, 7: 2, 8: 4, 9: 0})
  23.     # train.SibSp -= 4
  24.     # print(train.corr(method='pearson')['SibSp'])
  25.  
  26.     target_data = train['Survived'].to_numpy()
  27.     target_data = np.expand_dims(target_data, axis=1)
  28.  
  29.  
  30.     # sns.catplot(x='Cabin', y='Survived', data=train
  31.     #train['SibSp'] = train['SibSp'].map({0: 2, 1: 0, 2: 1, 3: 3, 4: 4})
  32.  
  33.     train["Age"] = train.groupby("Pclass")["Age"].transform(lambda x: x.fillna(x.median()))
  34.     test["Age"] = test.groupby("Pclass")["Age"].transform(lambda x: x.fillna(x.median()))
  35.     test["Fare"] = train.groupby("Pclass")["Fare"].transform(lambda x: x.fillna(x.median()))
  36.  
  37.     train.drop(columns='Survived', inplace=True)
  38.  
  39.     data = pd.concat([train, test])
  40.  
  41.     data["Name"] = data["Name"].transform(lambda x: x.split(', ')[1].split()[0])
  42.  
  43.     data['Name'] = data['Name'].replace('the', 'Mrs.')
  44.     data['Name'] = data['Name'].replace('Lady.', 'Mrs.')
  45.     data['Name'] = data['Name'].replace('Mme.', 'Mrs.')
  46.     data['Name'] = data['Name'].replace('Dona.', 'Mrs.')
  47.     data['Name'] = data['Name'].replace('Ms.', 'Miss.')
  48.     data['Name'] = data['Name'].replace('Don.', 'Mr.')
  49.     data['Name'] = data['Name'].replace('Sir.', 'Mr.')
  50.     data['Name'] = data['Name'].replace('Capt.', 'Rev.')
  51.     data['Name'] = data['Name'].replace('Major.', 'Pers.')
  52.     data['Name'] = data['Name'].replace('Col.', 'Pers.')
  53.     data['Name'] = data['Name'].replace('Dr.', 'Pers.')
  54.     data['Name'] = data['Name'].replace('Jonkheer.', 'Rev.')
  55.     data['Name'] = data['Name'].replace('Mlle.', 'Miss.')
  56.  
  57.     data['Embarked'] = data['Embarked'].fillna('S')
  58.  
  59.     # data['Alone'] = ~((data['SibSp'] > 0) | (data['Parch'] > 0))
  60.     # data['SibSp'] = data['SibSp'].map({0: 2, 1: 0, 2: 1, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9})
  61.     # data['Par'] = ((data['Age'] < 16) * data['Parch']).map({0: 0, 1: 2, 2: 1})
  62.     # data['Ch'] = ((data['Age'] > 16) * data['Parch']).map({4: 0, 6: 0, 5: 1, 0: 2, 1: 3, 2: 4, 3: 5})
  63.     #
  64.     # data['NoCabin'] = (~data['Cabin'].isna()).astype(int)
  65.  
  66.     data = pd.get_dummies(data, columns=["Sex", "Name", "Embarked"], drop_first=True)
  67.     data.drop(columns=['Ticket', 'Cabin'], inplace=True)
  68.  
  69.     #  BOX COX TRANSFORMATION
  70.  
  71.     from scipy.special import boxcox
  72.     from scipy.stats import boxcox_normmax
  73.     data['Fare'] -= data['Fare'].min() - 1.
  74.     data['Age'] -= data['Age'].min() - 1.
  75.  
  76.     data['Fare'] = boxcox(data['Fare'], boxcox_normmax(data['Fare']))
  77.     data['Age'] = boxcox(data['Age'], boxcox_normmax(data['Age']))
  78.  
  79.     train = data[0:891].copy()
  80.     test = data[891:1310].copy()
  81.  
  82.     train = train - train.mean(axis=0)
  83.     train = train / train.std(axis=0)
  84.     test = test - test.mean(axis=0)
  85.     test = test / test.std(axis=0)
  86.  
  87.     train_data = train.to_numpy()
  88.     test_data = test.to_numpy()
  89.  
  90.     print(train_data.shape)
  91.     return train_data, target_data.ravel(), test_data
  92.  
  93.  
  94. def main():
  95.     X_train, y_train, X_pred = get_data()
  96.  
  97.     X = X_train
  98.     X_val = X_train[791:891]
  99.     X_train = X_train[0:791]
  100.  
  101.     y = y_train
  102.     y_val = y_train[791:891]
  103.     y_train = y_train[0:791]
  104.  
  105.     # n_estimators: 1000
  106.     # min_samples_split: 0.008
  107.     # min_samples_leaf: 0.0005
  108.     # max_features: sqrt
  109.     # max_depth: None
  110.     # bootstrap: True
  111.     # 84 / 100
  112.     #
  113.     # n_estimators: 1050
  114.     # min_samples_split: 0.01
  115.     # min_samples_leaf: 0.0008
  116.     # max_features: auto
  117.     # max_depth: None
  118.     # bootstrap: True
  119.     # 86 / 100
  120.     #
  121.     # n_estimators: 1050
  122.     # min_samples_split: 0.01105
  123.     # min_samples_leaf: 0.0008
  124.     # max_features: sqrt
  125.     # max_depth: None
  126.     # bootstrap: True
  127.     # 84 / 100
  128.     #
  129.     # n_estimators: 1100
  130.     # min_samples_split: 0.011
  131.     # min_samples_leaf: 0.00075
  132.     # max_features: auto
  133.     # max_depth: None
  134.     # bootstrap: True
  135.     # # 81 / 100
  136.  
  137.     # params = {'bootstrap': [True],
  138.     #           'max_depth': [None],
  139.     #           'max_features': ['auto', 'sqrt'],
  140.     #           'min_samples_leaf': [0.0003, 0.0004, 0.0005, 0.0007, 0.00075, 0.0008, 0.00085, 0.0009],
  141.     #           'min_samples_split': [0.01, 0.0105, 0.1107, 0.011, 0.01103, 0.01105, 0.01107, 0.012],
  142.     #           'n_estimators': [950, 1000, 1050, 1075, 1100]
  143.     #           }
  144.  
  145.     rfc = RandomForestClassifier(n_estimators=1050, min_samples_split=0.01105, min_samples_leaf=0.0008,
  146.                                  max_features='sqrt', max_depth=None, bootstrap=True)
  147.  
  148.     # rscv = RandomizedSearchCV(rfc, param_distributions=params, verbose=2, n_jobs=10, n_iter=700)
  149.     # rscv.fit(X_train, y_train)
  150.  
  151.     # for k, v in rscv.best_params_.items():
  152.     #     print(k, ': ', v, sep='')
  153.     rfc.fit(X_train, y_train)
  154.     out = rfc.predict(X_val)
  155.  
  156.     print(len(out) - np.abs(out - y_val).sum(), '/', len(out))
  157.  
  158.     # rfc.set_params(**rfc.best_params_)
  159.     rfc.fit(X, y)
  160.     answer = rfc.predict(X_pred)
  161.  
  162.     indx = [i for i in range(892, 1310, 1)]
  163.     df = pd.DataFrame.from_dict({'PassengerId': indx, 'Survived': answer})
  164.     df.to_csv('data/out.csv', index=False)
  165.  
  166.  
  167. main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement