# Untitled

import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import Imputer

# Load the dataset from CSV.
df = pd.read_csv("bouts_out_new.csv")


# PRE-PROCESSING AND CLEAN-UP

# One-hot encode the non-numeric columns: models can only handle numeric
# features, so each category value is turned into its own dummy column.
clean_df = pd.get_dummies(df)

# Report the encoded column names followed by the frame's shape
# (the original author noted 39 features at this point).
print(clean_df.columns, clean_df.shape, sep="\n")


# Test both imputed values as well as a completely clean dataset

# Split the dataset into features and label, then into train/test partitions.

# These are the indicator columns that encode the bout result; they must not
# remain among the features, otherwise the label leaks into the inputs.
result_columns = ['result_win_B', 'result_win_A', 'result_draw']

features = clean_df.drop(result_columns, axis=1)

# Collapse the indicators back into one class label per row (the name of the
# column holding the 1).  The previous code assigned the feature matrix itself
# to `target`, so y_train/y_test contained no outcome information at all.
# NOTE(review): a row whose three indicators are all 0 (e.g. a missing result)
# is labelled with the first column listed -- confirm no such rows exist.
target = clean_df[result_columns].idxmax(axis=1)

# A fixed random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, random_state=0)


import sklearn.feature_selection


# Keep the 20 features that score best against the training labels, using
# SelectKBest with its default scoring function.
# NOTE(review): the scorer may reject NaN values -- confirm X_train has been
# imputed or cleaned before this point.
selection = sklearn.feature_selection.SelectKBest(k=20)
selected_features = selection.fit(X_train, y_train)

# get_support(indices=True) returns positions within the columns of the matrix
# passed to fit(), i.e. X_train.  The names are therefore looked up in
# X_train.columns: clean_df still contains the result columns, so the same
# positions in clean_df.columns can point at different (or dropped) columns.
indices_selected = selected_features.get_support(indices=True)
colnames_selected = [X_train.columns[i] for i in indices_selected]

# Restrict both partitions to the selected columns.
X_train_selected = X_train[colnames_selected]
X_test_selected = X_test[colnames_selected]

# Bare expression: shows the list when this is the last line of a notebook
# cell; it has no effect when the file is run as a plain script.
colnames_selected