Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import pandas as pd
from sklearn.linear_model import LinearRegression

# Exhaustive search for the subset of features (optionally with their squared
# terms) whose ordinary-least-squares fit scores the highest R^2 on a held-out
# split of 'four_year_colleges.csv'.

# Columns removed before modelling.  The three groups are kept separate as in
# the original script; presumably the first group is identifier/location text
# -- verify against the CSV.
dropped_misc = ['OPEID', 'name', 'city', 'state', 'region']
categorical = ['highest_degree', 'ownership', 'locale', 'hbcu', 'online_only']
not_consider = ['enrollment', 'net_price', 'avg_cost']

df = pd.read_csv('four_year_colleges.csv').drop(
    columns=dropped_misc + categorical + not_consider
)

# Random 80/20 split: the test set is every row that was not sampled.
# NOTE(review): no random_state is passed, so each run draws a different
# split and may print a different result.
train = df.sample(int(len(df) * 0.8))
test = df.drop(train.index)

target = 'default_rate'


def with_squares(features):
    """Return *features* plus a squared copy of every column.

    The squared columns are named '<column>^2' and appended to the right of
    the original columns.
    """
    return pd.concat([features, (features ** 2).add_suffix('^2')], axis=1)


X_train = train.drop(columns=[target])
og_feats = list(X_train.columns)  # linear-term names only, before squaring
X_train = with_squares(X_train)
X_test = with_squares(test.drop(columns=[target]))
y_train = train[target]
y_test = test[target]

# R^2 from LinearRegression.score can be negative, so the running best must
# start below every possible score; starting at 0 would leave the result
# unset whenever every candidate scores below zero.
best_r2 = float('-inf')
best_things = None

# `ss` is a bitmask over og_feats choosing the linear terms (0 is skipped so
# at least one feature is always used).  `deg2` is a bitmask over the chosen
# terms selecting which of them also contribute a squared column, so a
# squared term is only ever tried together with its linear term.
# The total number of fits is 3 ** len(og_feats) - 2 ** len(og_feats).
# NOTE(review): the winner is picked by its score on the test split, so the
# printed R^2 is an optimistic estimate of out-of-sample performance.
for ss in range(1, 1 << len(og_feats)):
    use = [f for v, f in enumerate(og_feats) if ss & (1 << v)]
    for deg2 in range(1 << len(use)):
        this_use = use + [
            f + '^2' for i, f in enumerate(use) if deg2 & (1 << i)
        ]
        regressor = LinearRegression()
        regressor.fit(X_train[this_use], y_train)
        r2 = regressor.score(X_test[this_use], y_test)
        if r2 > best_r2:
            best_r2 = r2
            best_things = this_use

print(best_r2)
print(best_things)
Add Comment
Please, Sign In to add comment