Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import numpy as np
- import pandas as pd
- from sklearn.utils import shuffle
- from sklearn.model_selection import GridSearchCV
- from sklearn.decomposition import PCA
- from sklearn.preprocessing import LabelEncoder , StandardScaler
- from sklearn.metrics import brier_score_loss, make_scorer
- from sklearn.decomposition import PCA
- from sklearn.preprocessing import StandardScaler
- from sklearn.ensemble import RandomForestClassifier
- import datetime as datetime
- from sklearn.svm import SVC
# Scorer object built from the Brier loss so it can be handed to
# scikit-learn model-selection utilities. It is configured to score on
# predicted probabilities, and `greater_is_better=False` marks the metric as
# a loss (lower is better).
# NOTE(review): `needs_proba` is deprecated in newer scikit-learn releases in
# favour of `response_method`; confirm against the installed version.
brier = make_scorer(
    brier_score_loss,
    needs_proba=True,
    greater_is_better=False,
)
def test(year):
    """Train a random forest on one year's data, score it, and save the results.

    Reads ``yty_train_<year>.csv`` and ``yty_test_<year>.csv`` from the
    working directory, fits a ``RandomForestClassifier`` on the feature
    columns the two files have in common, prints the fit and prediction
    timings plus the Brier score on the test file, and writes
    ``<year>results_full_rf.csv`` with the columns ``predicted_prob``,
    ``true_prob`` and ``adjusted``.

    Both input files must contain the label column ``y_test``.
    NOTE(review): the label is presumably binary 0/1 (the code sums it to
    count positives and takes probability column 1) -- confirm against the
    data files.

    Args:
        year: Identifier inserted into the input and output file names.

    Returns:
        None. Results are printed and written to disk.
    """
    print("-------------------------------------------------------")
    print(year)
    print(datetime.datetime.now())
    print()
    print()

    full_training_df = pd.read_csv("yty_train_" + str(year) + ".csv")
    full_testing_df = pd.read_csv("yty_test_" + str(year) + ".csv")

    X_test = full_testing_df.drop("y_test", axis=1)
    y_test = full_testing_df["y_test"]
    X_train = full_training_df.drop("y_test", axis=1)
    y_train = full_training_df["y_test"]

    # Keep only the features present in both files, in one shared order, so
    # the model is fitted and evaluated on identical columns.
    # `Index.intersection` replaces the deprecated `Index & Index` set form.
    shared_columns = X_train.columns.intersection(X_test.columns)
    X_train = X_train[shared_columns]
    X_test = X_test[shared_columns]

    rf_clf = RandomForestClassifier(n_estimators=5000, max_depth=35, n_jobs=-1)

    t1 = datetime.datetime.now()
    rf_clf.fit(X_train, y_train)
    t2 = datetime.datetime.now()
    print("time to fit:", t2 - t1)

    t1 = datetime.datetime.now()
    # Column 1 of predict_proba is the probability of the second entry of
    # `rf_clf.classes_`, i.e. the positive class when labels are 0/1.
    predicted_probs = rf_clf.predict_proba(X_test)[:, 1]
    t2 = datetime.datetime.now()
    print("time to test:", t2 - t1)

    print("test score:", brier_score_loss(y_test, predicted_probs))

    # Pair predictions with labels positionally (predict_proba returns rows
    # in the same order as X_test), using explicit column names so later
    # lookups cannot miss.
    joined_df = pd.DataFrame(
        {
            "predicted_prob": predicted_probs,
            "true_prob": y_test.values,
        }
    )

    # Ratio of positives to negatives in the test labels.
    # NOTE(review): if the test file contains no negatives, n_minus is 0 and
    # beta (hence `adjusted`) is not finite.
    n_plus = joined_df["true_prob"].sum()
    n_minus = joined_df.shape[0] - n_plus
    beta = n_plus / n_minus

    # Rescale the raw probabilities with beta.
    # NOTE(review): this matches the usual prior-shift correction for a model
    # trained on under-sampled negatives -- confirm that is the intent.
    raw = joined_df["predicted_prob"]
    joined_df["adjusted"] = (beta * raw) / (beta * raw - raw + 1)

    joined_df.to_csv(str(year) + "results_full_rf.csv", index=False)
# Run the train / evaluate / export pipeline once per data year, in order.
for data_year in (15, 16, 17):
    test(data_year)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement