SHARE
TWEET

Untitled

a guest Mar 26th, 2019 70 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import norm, randint, uniform
from scipy.stats.distributions import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils import shuffle


  15. def get_data():
  16.     train = pd.read_csv('data/raw/train.csv')
  17.     test = pd.read_csv('data/raw/test.csv')
  18.     train = shuffle(train)
  19.  
  20.  
  21.     # #train
  22.     # train.SibSp = train.SibSp.map({0: 7, 1: 9, 2: 8, 3: 6, 4: 5, 5: 4, 6: 3, 7: 2, 8: 4, 9: 0})
  23.     # train.SibSp -= 4
  24.     # print(train.corr(method='pearson')['SibSp'])
  25.  
  26.     target_data = train['Survived'].to_numpy()
  27.     target_data = np.expand_dims(target_data, axis=1)
  28.  
  29.  
  30.     # sns.catplot(x='Cabin', y='Survived', data=train
  31.     #train['SibSp'] = train['SibSp'].map({0: 2, 1: 0, 2: 1, 3: 3, 4: 4})
  32.  
  33.     train["Age"] = train.groupby("Pclass")["Age"].transform(lambda x: x.fillna(x.median()))
  34.     test["Age"] = test.groupby("Pclass")["Age"].transform(lambda x: x.fillna(x.median()))
  35.     test["Fare"] = train.groupby("Pclass")["Fare"].transform(lambda x: x.fillna(x.median()))
  36.  
  37.     train.drop(columns='Survived', inplace=True)
  38.  
  39.     data = pd.concat([train, test])
  40.  
  41.     data["Name"] = data["Name"].transform(lambda x: x.split(', ')[1].split()[0])
  42.  
  43.     data['Name'] = data['Name'].replace('the', 'Mrs.')
  44.     data['Name'] = data['Name'].replace('Lady.', 'Mrs.')
  45.     data['Name'] = data['Name'].replace('Mme.', 'Mrs.')
  46.     data['Name'] = data['Name'].replace('Dona.', 'Mrs.')
  47.     data['Name'] = data['Name'].replace('Ms.', 'Miss.')
  48.     data['Name'] = data['Name'].replace('Don.', 'Mr.')
  49.     data['Name'] = data['Name'].replace('Sir.', 'Mr.')
  50.     data['Name'] = data['Name'].replace('Capt.', 'Rev.')
  51.     data['Name'] = data['Name'].replace('Major.', 'Pers.')
  52.     data['Name'] = data['Name'].replace('Col.', 'Pers.')
  53.     data['Name'] = data['Name'].replace('Dr.', 'Pers.')
  54.     data['Name'] = data['Name'].replace('Jonkheer.', 'Rev.')
  55.     data['Name'] = data['Name'].replace('Mlle.', 'Miss.')
  56.  
  57.     data['Embarked'] = data['Embarked'].fillna('S')
  58.  
  59.     # data['Alone'] = ~((data['SibSp'] > 0) | (data['Parch'] > 0))
  60.     # data['SibSp'] = data['SibSp'].map({0: 2, 1: 0, 2: 1, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9})
  61.     # data['Par'] = ((data['Age'] < 16) * data['Parch']).map({0: 0, 1: 2, 2: 1})
  62.     # data['Ch'] = ((data['Age'] > 16) * data['Parch']).map({4: 0, 6: 0, 5: 1, 0: 2, 1: 3, 2: 4, 3: 5})
  63.     #
  64.     # data['NoCabin'] = (~data['Cabin'].isna()).astype(int)
  65.  
  66.     data = pd.get_dummies(data, columns=["Sex", "Name", "Embarked"], drop_first=True)
  67.     data.drop(columns=['Ticket', 'Cabin'], inplace=True)
  68.  
  69.     #  BOX COX TRANSFORMATION
  70.  
  71.     from scipy.special import boxcox
  72.     from scipy.stats import boxcox_normmax
  73.     data['Fare'] -= data['Fare'].min() - 1.
  74.     data['Age'] -= data['Age'].min() - 1.
  75.  
  76.     data['Fare'] = boxcox(data['Fare'], boxcox_normmax(data['Fare']))
  77.     data['Age'] = boxcox(data['Age'], boxcox_normmax(data['Age']))
  78.  
  79.     train = data[0:891].copy()
  80.     test = data[891:1310].copy()
  81.  
  82.     train = train - train.mean(axis=0)
  83.     train = train / train.std(axis=0)
  84.     test = test - test.mean(axis=0)
  85.     test = test / test.std(axis=0)
  86.  
  87.     train_data = train.to_numpy()
  88.     test_data = test.to_numpy()
  89.  
  90.     print(train_data.shape)
  91.     return train_data, target_data.ravel(), test_data
  92.  
  93.  
  94. def main():
  95.     dtrain, dtarget, dtask = get_data()
  96.  
  97.     all_train = dtrain
  98.     dtest = dtrain[791:891]
  99.     dtrain = dtrain[0:791]
  100.  
  101.     all_target = dtarget
  102.     dtestT = dtarget[791:891]
  103.     dtarget = dtarget[0:791]
  104.  
  105.     params = {
  106.         "n_estimators": rv_discrete(a=1000, b=3000),
  107.         "max_depth": rv_discrete(a=1, b=5),
  108.         "min_samples_split": rv_continuous(a=0.2, b=0.8),
  109.         "min_samples_leaf": rv_continuous(a=0.2, b=0.8),
  110.         "max_features": "string"
  111.     }
  112.  
  113.     # n_estimators: 1000
  114.     # min_samples_split: 0.008
  115.     # min_samples_leaf: 0.0005
  116.     # max_features: sqrt
  117.     # max_depth: None
  118.     # bootstrap: True
  119.     # 84 / 100
  120.     #
  121.     # n_estimators: 1050
  122.     # min_samples_split: 0.01
  123.     # min_samples_leaf: 0.0008
  124.     # max_features: auto
  125.     # max_depth: None
  126.     # bootstrap: True
  127.     # 86 / 100
  128.     #
  129.     # n_estimators: 1050
  130.     # min_samples_split: 0.01105
  131.     # min_samples_leaf: 0.0008
  132.     # max_features: sqrt
  133.     # max_depth: None
  134.     # bootstrap: True
  135.     # 84 / 100
  136.     #
  137.     # n_estimators: 1100
  138.     # min_samples_split: 0.011
  139.     # min_samples_leaf: 0.00075
  140.     # max_features: auto
  141.     # max_depth: None
  142.     # bootstrap: True
  143.     # # 81 / 100
  144.  
  145.     # params = {'bootstrap': [True],
  146.     #           'max_depth': [None],
  147.     #           'max_features': ['auto', 'sqrt'],
  148.     #           'min_samples_leaf': [0.0003, 0.0004, 0.0005, 0.0007, 0.00075, 0.0008, 0.00085, 0.0009],
  149.     #           'min_samples_split': [0.01, 0.0105, 0.1107, 0.011, 0.01103, 0.01105, 0.01107, 0.012],
  150.     #           'n_estimators': [950, 1000, 1050, 1075, 1100]
  151.     #           }
  152.  
  153.     rfc = RandomForestClassifier(n_estimators=1050, min_samples_split=0.01105, min_samples_leaf=0.0008,
  154.                                  max_features='sqrt', max_depth=None, bootstrap=True)
  155.  
  156.     # rscv = RandomizedSearchCV(rfc, param_distributions=params, verbose=2, n_jobs=10, n_iter=700)
  157.     # rscv.fit(dtrain, dtarget)
  158.  
  159.     # for k, v in rscv.best_params_.items():
  160.     #     print(k, ': ', v, sep='')
  161.     rfc.fit(dtrain, dtarget)
  162.     out = rfc.predict(dtest)
  163.  
  164.     print(len(out) - np.abs(out - dtestT).sum(), '/', len(out))
  165.  
  166.     # rfc.set_params(**rfc.best_params_)
  167.     rfc.fit(all_train, all_target)
  168.     answer = rfc.predict(dtask)
  169.  
  170.     indx = [i for i in range(892, 1310, 1)]
  171.     df = pd.DataFrame.from_dict({'PassengerId': indx, 'Survived': answer})
  172.     df.to_csv('data/out.csv', index=False)
  173.  
  174.  
  175. main()
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top