Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import numpy as np
- import pandas as pd
- import matplotlib.pyplot as plt
- from sklearn.utils import shuffle
- import seaborn as sns
- from scipy.stats import norm
- from scipy.stats.distributions import *
- from sklearn.ensemble import RandomForestClassifier
- from sklearn.metrics import log_loss
- from sklearn.model_selection import RandomizedSearchCV
def get_data():
    """Load the Titanic train/test CSVs and return model-ready feature arrays.

    Reads ``data/raw/train.csv`` and ``data/raw/test.csv``, imputes missing
    Age/Fare per passenger class, buckets name honorifics, one-hot encodes
    categoricals, Box-Cox transforms Fare/Age, and standardizes each split.

    Returns:
        tuple: ``(train_features, train_labels, test_features)`` where the
        features are 2-D ``np.ndarray`` and the labels a 1-D ``np.ndarray``.
    """
    train = pd.read_csv('data/raw/train.csv')
    test = pd.read_csv('data/raw/test.csv')
    train = shuffle(train)

    target_data = train['Survived'].to_numpy()
    target_data = np.expand_dims(target_data, axis=1)

    # Impute missing Age/Fare with the median of the passenger's class.
    train["Age"] = train.groupby("Pclass")["Age"].transform(lambda x: x.fillna(x.median()))
    test["Age"] = test.groupby("Pclass")["Age"].transform(lambda x: x.fillna(x.median()))
    # BUG FIX: the original grouped `train` here, producing a train-indexed
    # series that does not align with `test` rows; impute from `test` itself.
    test["Fare"] = test.groupby("Pclass")["Fare"].transform(lambda x: x.fillna(x.median()))

    train.drop(columns='Survived', inplace=True)
    data = pd.concat([train, test])

    # Reduce full names to the honorific ("Mr.", "Mrs.", ...) and merge rare
    # titles into broader buckets so get_dummies yields stable columns.
    data["Name"] = data["Name"].transform(lambda x: x.split(', ')[1].split()[0])
    title_map = {
        'the': 'Mrs.', 'Lady.': 'Mrs.', 'Mme.': 'Mrs.', 'Dona.': 'Mrs.',
        'Ms.': 'Miss.', 'Mlle.': 'Miss.',
        'Don.': 'Mr.', 'Sir.': 'Mr.',
        'Capt.': 'Rev.', 'Jonkheer.': 'Rev.',
        'Major.': 'Pers.', 'Col.': 'Pers.', 'Dr.': 'Pers.',
    }
    data['Name'] = data['Name'].replace(title_map)
    data['Embarked'] = data['Embarked'].fillna('S')

    data = pd.get_dummies(data, columns=["Sex", "Name", "Embarked"], drop_first=True)
    data.drop(columns=['Ticket', 'Cabin'], inplace=True)

    # Box-Cox transform; shift both columns so every value is >= 1 (Box-Cox
    # requires strictly positive input).
    from scipy.special import boxcox
    from scipy.stats import boxcox_normmax
    data['Fare'] -= data['Fare'].min() - 1.
    data['Age'] -= data['Age'].min() - 1.
    data['Fare'] = boxcox(data['Fare'], boxcox_normmax(data['Fare']))
    data['Age'] = boxcox(data['Age'], boxcox_normmax(data['Age']))

    # Split back into the original 891 train / 418 test rows.
    train = data[0:891].copy()
    test = data[891:1310].copy()
    # NOTE(review): each split is standardized with its OWN mean/std; usually
    # the test set should reuse the train statistics. Kept as-is to preserve
    # the original behavior — worth revisiting.
    train = (train - train.mean(axis=0)) / train.std(axis=0)
    test = (test - test.mean(axis=0)) / test.std(axis=0)

    train_data = train.to_numpy()
    test_data = test.to_numpy()
    print(train_data.shape)
    return train_data, target_data.ravel(), test_data
def main():
    """Train a random forest on the Titanic data and write a submission CSV.

    Fits on a 791-row subset, reports hold-out accuracy on the remaining 100
    rows, refits on all labelled data, and writes ``data/out.csv``.
    """
    X_train, y_train, X_pred = get_data()

    # Hold out the last 100 rows of the (already shuffled) training set.
    X, y = X_train, y_train
    X_val, y_val = X_train[791:891], y_train[791:891]
    X_train, y_train = X_train[0:791], y_train[0:791]

    # Hyperparameters selected by an earlier RandomizedSearchCV sweep
    # (best observed hold-out score: 86/100).
    rfc = RandomForestClassifier(n_estimators=1050, min_samples_split=0.01105,
                                 min_samples_leaf=0.0008, max_features='sqrt',
                                 max_depth=None, bootstrap=True)

    # Fit on the reduced split and report hold-out accuracy as "hits / total".
    rfc.fit(X_train, y_train)
    out = rfc.predict(X_val)
    print(len(out) - np.abs(out - y_val).sum(), '/', len(out))

    # Refit on all labelled rows before predicting the submission set.
    rfc.fit(X, y)
    answer = rfc.predict(X_pred)

    # Kaggle test-set PassengerIds run 892..1309 inclusive.
    indx = list(range(892, 1310))
    df = pd.DataFrame({'PassengerId': indx, 'Survived': answer})
    df.to_csv('data/out.csv', index=False)
- main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement