# Brute-force script

import pandas as pd
from sklearn.linear_model import LinearRegression

# Three groups of columns that are removed from the data set right after it
# is loaded.
stupid = ['OPEID', 'name', 'city', 'state', 'region']
categorical = ['highest_degree', 'ownership', 'locale', 'hbcu', 'online_only']
not_consider = ['enrollment', 'net_price', 'avg_cost']

# Load the raw table, then discard every column listed above in one call.
df = pd.read_csv('four_year_colleges.csv')
df = df.drop(columns=[*stupid, *categorical, *not_consider])

# 80/20 train/test split. The fixed seed makes the split -- and therefore
# every score computed from it -- reproducible from run to run.
train = df.sample(n=int(len(df) * 0.8), random_state=0)
# Rows whose index labels were not sampled into `train` form the test set.
test = df.drop(train.index)

target = 'default_rate'


def _with_squared_terms(features):
    """Return *features* plus a squared copy of every column, suffixed '^2'."""
    squared = (features ** 2).add_suffix('^2')
    return pd.concat([features, squared], axis=1)


# Both design matrices go through the same helper so the train and test
# transforms cannot drift apart.
X_train = train.drop(columns=[target])
# Captured before the squared columns are appended, so this lists only the
# original (degree-1) feature names.
og_feats = list(X_train.columns)
X_train = _with_squared_terms(X_train)
X_test = _with_squared_terms(test.drop(columns=[target]))

y_train = train[target]
y_test = test[target]

# Exhaustive search over feature sets: every non-empty subset of the original
# features, combined with every subset of those features' squared terms.
# That is 3**n - 1 model fits for n original features, so this is only
# practical for a small number of columns.
# NOTE(review): candidates are ranked by R^2 on the test split, so the winning
# score is an optimistic estimate -- consider a separate validation split.

# Start below any attainable score: R^2 on held-out data can be negative, and
# a floor of 0 would silently discard every model in that case.
best_r2 = float('-inf')
best_things = None  # stays None only if there are no features to search
for ss in range(1, 1 << len(og_feats)):
    # Bit v of `ss` selects og_feats[v].
    use = [f for v, f in enumerate(og_feats) if ss & (1 << v)]
    for deg2 in range(1 << len(use)):
        # Bit i of `deg2` adds the squared term of use[i].
        squares = [f + '^2' for i, f in enumerate(use) if deg2 & (1 << i)]
        this_use = use + squares

        regressor = LinearRegression()
        regressor.fit(X_train[this_use], y_train)

        r2 = regressor.score(X_test[this_use], y_test)
        # Strict comparison keeps the first feature set found on ties.
        if r2 > best_r2:
            best_r2 = r2
            best_things = this_use

print(best_r2)
print(best_things)