• API
• FAQ
• Tools
• Archive
daily pastebin goal
30%
SHARE
TWEET

# Untitled

a guest Mar 26th, 2019 67 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
1. import numpy as np
2. import pandas as pd
3. import matplotlib.pyplot as plt
4. from sklearn.utils import shuffle
5. import seaborn as sns
6. from scipy.stats import norm
7.
8.
9. from scipy.stats.distributions import *
10. from sklearn.ensemble import RandomForestClassifier
11. from sklearn.metrics import log_loss
12. from sklearn.model_selection import RandomizedSearchCV
13.
14.
def get_data():
    """Load the Titanic train/test CSVs and return model-ready numpy arrays.

    Returns:
        (train_data, target, test_data):
            train_data -- standardized training feature matrix
            target     -- 1-D 0/1 'Survived' labels aligned with train_data rows
            test_data  -- standardized test feature matrix
    """
    # NOTE(review): the original paste lost the two loading lines (paste lines
    # 16-17); the standard Kaggle Titanic file names are assumed here -- confirm.
    train = pd.read_csv('train.csv')
    test = pd.read_csv('test.csv')

    # Shuffle rows so the caller's positional train/validation split is random.
    train = shuffle(train)

    target_data = train['Survived'].to_numpy()
    target_data = np.expand_dims(target_data, axis=1)

    # Impute missing Age with the per-class median (age varies by Pclass).
    train["Age"] = train.groupby("Pclass")["Age"].transform(lambda x: x.fillna(x.median()))
    test["Age"] = test.groupby("Pclass")["Age"].transform(lambda x: x.fillna(x.median()))
    # BUG FIX: the original imputed test['Fare'] from *train* groups, which
    # misaligns indices and leaves the test NaN unfilled; use the test frame.
    test["Fare"] = test.groupby("Pclass")["Fare"].transform(lambda x: x.fillna(x.median()))

    train.drop(columns='Survived', inplace=True)

    # Record the split point instead of hard-coding 891/1310 row counts.
    n_train = len(train)
    data = pd.concat([train, test])

    # "Surname, Title. Given names" -> keep the honorific title only.
    data["Name"] = data["Name"].transform(lambda x: x.split(', ')[1].split()[0])

    # Collapse rare titles into broader buckets (single-pass dict replace is
    # equivalent to the original chain: no key maps to another key).
    data['Name'] = data['Name'].replace({
        'the': 'Mrs.',        # 'the Countess.' splits to 'the'
        'Mme.': 'Mrs.',
        'Dona.': 'Mrs.',
        'Ms.': 'Miss.',
        'Mlle.': 'Miss.',
        'Don.': 'Mr.',
        'Sir.': 'Mr.',
        'Capt.': 'Rev.',
        'Jonkheer.': 'Rev.',
        'Major.': 'Pers.',
        'Col.': 'Pers.',
        'Dr.': 'Pers.',
    })

    # 'S' (Southampton) is the most frequent port; use it for missing values.
    data['Embarked'] = data['Embarked'].fillna('S')

    data = pd.get_dummies(data, columns=["Sex", "Name", "Embarked"], drop_first=True)
    data.drop(columns=['Ticket', 'Cabin'], inplace=True)

    # BOX-COX TRANSFORMATION -- requires strictly positive input, so shift
    # each column so its minimum becomes exactly 1 before transforming.
    from scipy.special import boxcox
    from scipy.stats import boxcox_normmax
    for col in ('Fare', 'Age'):
        data[col] -= data[col].min() - 1.
        data[col] = boxcox(data[col], boxcox_normmax(data[col]))

    train = data[:n_train].copy()
    test = data[n_train:].copy()

    # Standardize each split with its own mean/std (kept from the original;
    # NOTE(review): using train statistics for both would be more orthodox).
    train = (train - train.mean(axis=0)) / train.std(axis=0)
    test = (test - test.mean(axis=0)) / test.std(axis=0)

    train_data = train.to_numpy()
    test_data = test.to_numpy()

    print(train_data.shape)
    return train_data, target_data.ravel(), test_data
92.
93.
# Train/evaluate a RandomForest on get_data()'s arrays: hold out the last 100
# rows for a quick accuracy check, then refit on the full training set.
# NOTE(review): the paste appears truncated after the final fit -- presumably
# a prediction on `dtask` and a submission write followed.
94. def main():
95.     dtrain, dtarget, dtask = get_data()
96.
    # Manual hold-out split: first 791 rows train, last 100 validate.
97.     all_train = dtrain
98.     dtest = dtrain[791:891]
99.     dtrain = dtrain[0:791]
100.
101.     all_target = dtarget
102.     dtestT = dtarget[791:891]
103.     dtarget = dtarget[0:791]
104.
    # NOTE(review): rv_discrete / rv_continuous are scipy *base classes*, not
    # ready-made samplers -- RandomizedSearchCV needs distributions exposing
    # .rvs(), e.g. scipy.stats.randint / scipy.stats.uniform -- and
    # "max_features": "string" looks like a placeholder. Harmless only while
    # the rscv lines below stay commented out; fix before re-enabling them.
105.     params = {
106.         "n_estimators": rv_discrete(a=1000, b=3000),
107.         "max_depth": rv_discrete(a=1, b=5),
108.         "min_samples_split": rv_continuous(a=0.2, b=0.8),
109.         "min_samples_leaf": rv_continuous(a=0.2, b=0.8),
110.         "max_features": "string"
111.     }
112.
    # Hand-recorded search results (hyperparameters -> hold-out score /100).
113.     # n_estimators: 1000
114.     # min_samples_split: 0.008
115.     # min_samples_leaf: 0.0005
116.     # max_features: sqrt
117.     # max_depth: None
118.     # bootstrap: True
119.     # 84 / 100
120.     #
121.     # n_estimators: 1050
122.     # min_samples_split: 0.01
123.     # min_samples_leaf: 0.0008
124.     # max_features: auto
125.     # max_depth: None
126.     # bootstrap: True
127.     # 86 / 100
128.     #
129.     # n_estimators: 1050
130.     # min_samples_split: 0.01105
131.     # min_samples_leaf: 0.0008
132.     # max_features: sqrt
133.     # max_depth: None
134.     # bootstrap: True
135.     # 84 / 100
136.     #
137.     # n_estimators: 1100
138.     # min_samples_split: 0.011
139.     # min_samples_leaf: 0.00075
140.     # max_features: auto
141.     # max_depth: None
142.     # bootstrap: True
143.     # # 81 / 100
144.
145.     # params = {'bootstrap': [True],
146.     #           'max_depth': [None],
147.     #           'max_features': ['auto', 'sqrt'],
148.     #           'min_samples_leaf': [0.0003, 0.0004, 0.0005, 0.0007, 0.00075, 0.0008, 0.00085, 0.0009],
149.     #           'min_samples_split': [0.01, 0.0105, 0.1107, 0.011, 0.01103, 0.01105, 0.01107, 0.012],
150.     #           'n_estimators': [950, 1000, 1050, 1075, 1100]
151.     #           }
152.
    # Best hand-picked configuration from the search notes above.
153.     rfc = RandomForestClassifier(n_estimators=1050, min_samples_split=0.01105, min_samples_leaf=0.0008,
154.                                  max_features='sqrt', max_depth=None, bootstrap=True)
155.
156.     # rscv = RandomizedSearchCV(rfc, param_distributions=params, verbose=2, n_jobs=10, n_iter=700)
157.     # rscv.fit(dtrain, dtarget)
158.
159.     # for k, v in rscv.best_params_.items():
160.     #     print(k, ': ', v, sep='')
161.     rfc.fit(dtrain, dtarget)
162.     out = rfc.predict(dtest)
163.
    # Hold-out accuracy as "correct / total"; |out - truth| counts the
    # mismatches (assumes 0/1 labels -- consistent with the Survived target).
164.     print(len(out) - np.abs(out - dtestT).sum(), '/', len(out))
165.
166.     # rfc.set_params(**rfc.best_params_)
    # Refit on the entire training set before (presumably) predicting dtask.
167.     rfc.fit(all_train, all_target)