Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
"""Load the bout data, one-hot encode it and keep the 20 best features.

Reads ``bouts_out_new.csv``, converts non-numeric columns to dummy
columns, splits the rows into train and test sets, and uses
``SelectKBest`` to pick the 20 feature columns that score highest against
the bout result.
"""

import matplotlib.pyplot as plt
import pandas as pd
import sklearn.feature_selection
from sklearn.model_selection import train_test_split

try:
    from sklearn.preprocessing import Imputer
except ImportError:
    # scikit-learn 0.22 removed ``sklearn.preprocessing.Imputer``; keep the
    # name importable on current releases.
    # NOTE(review): SimpleImputer has no ``axis`` argument -- check any
    # later use of ``Imputer`` against the SimpleImputer signature.
    from sklearn.impute import SimpleImputer as Imputer

# Dummy columns that encode the outcome of a bout (the label, not features).
RESULT_COLUMNS = ['result_win_B', 'result_win_A', 'result_draw']

df = pd.read_csv("bouts_out_new.csv")

# PRE-PROCESSING AND CLEAN-UP
# Models can only handle numeric features, so the non-numeric columns are
# converted into numeric dummy (one-hot) columns.
clean_df = pd.get_dummies(df)
# Author's note: this results in 39 features (depends on the CSV contents).
print(clean_df.columns)
print(clean_df.shape)

# TODO: test both imputed values as well as a completely clean dataset.
# NOTE(review): if the CSV contains missing values, SelectKBest's default
# score function will presumably reject them -- impute or drop first.

# Split the dataset. The features are everything except the result
# columns; the target is the result itself, collapsed from the three
# one-hot columns into a single class label per row (the name of the
# column that is set).
# NOTE(review): a row with no result column set would be labelled with the
# first entry of RESULT_COLUMNS -- confirm every bout has a result.
features = clean_df.drop(RESULT_COLUMNS, axis=1)
target = clean_df[RESULT_COLUMNS].idxmax(axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    features, target, random_state=0)

# Score each feature against the result on the training rows only and keep
# the 20 highest-scoring ones.
selection = sklearn.feature_selection.SelectKBest(k=20)
selected_features = selection.fit(X_train, y_train)
indices_selected = selected_features.get_support(indices=True)
# The indices are positions within X_train's columns (the frame the selector
# was fitted on), so the names must be looked up there -- not in clean_df,
# which still contains the result columns.
colnames_selected = [X_train.columns[i] for i in indices_selected]
X_train_selected = X_train[colnames_selected]
X_test_selected = X_test[colnames_selected]
print(colnames_selected)
Add Comment
Please, Sign In to add comment