Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# %%
import lightgbm as lgb
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz
# %%
# Labelled training rows; the `car_or_bus` column is the prediction target.
df = pd.read_csv('data/cars-train.csv')
# %%
def get(d, **kwargs):
    """Return the rows of ``d`` whose columns equal the given values.

    Each keyword argument is a ``column=value`` equality filter. A row is
    kept only when it satisfies every filter; with no filters all rows are
    returned.

    Args:
        d: DataFrame to filter.
        **kwargs: Mapping of column name to the value that column must equal.

    Returns:
        The matching rows of ``d``, in their original order.

    Raises:
        KeyError: If a keyword names a column that ``d`` does not have.
    """
    selector = np.ones(d.shape[0], dtype=bool)
    for name, value in kwargs.items():
        # Combine on plain arrays so the mask stays positional and is never
        # re-aligned against the frame's index.
        selector &= np.asarray(d[name] == value)
    return d[selector]
def car(d):
    """Return only the rows of ``d`` whose ``car_or_bus`` column is ``'car'``."""
    return get(d, car_or_bus='car')
def bus(d):
    """Return only the rows of ``d`` whose ``car_or_bus`` column is ``'bus'``."""
    return get(d, car_or_bus='bus')
# %%
def _plot_class_histograms(frame, feature, title):
    """Overlay the car (red) and bus (blue) histograms of one feature.

    Args:
        frame: Rows to plot; must still carry the string ``car_or_bus``
            labels that ``car`` and ``bus`` filter on.
        feature: Either a column name, or a callable that maps a class
            subset of ``frame`` to the Series to plot (for derived features).
        title: Title of the figure.
    """
    for select, colour, label in ((car, 'r', 'car'), (bus, 'b', 'bus')):
        subset = select(frame)
        values = feature(subset) if callable(feature) else subset[feature]
        values.plot.hist(color=colour, alpha=0.5, label=label)
    plt.title(title)
    plt.legend()
    plt.show()
# %%
_plot_class_histograms(df, 'rating', "Rating")
# %%
_plot_class_histograms(df, 'v', "Velocity")
# %%
_plot_class_histograms(df, 't', "Time")
# %%
_plot_class_histograms(df, 'distance', "Distance")
# %%
_plot_class_histograms(df, lambda d: d['v'] * d['distance'], "Velocity * Distance")
# %%
_plot_class_histograms(df, lambda d: d['t'] * d['distance'], "Time * Distance")
# %%
_plot_class_histograms(df, lambda d: d['distance'] * d['distance'], "Distance * Distance")
# %%
# The squared features are stored as columns (not just plotted) because the
# model is later fitted on every column of `df` except the target.
# Each plot is restricted to values below a cutoff, presumably to zoom in on
# the bulk of the distribution.
df['distance_sq'] = df['distance'] * df['distance']
_plot_class_histograms(df[df['distance_sq'] < 70], 'distance_sq', "Distance * Distance")
# %%
df['t_sq'] = df['t'] * df['t']
_plot_class_histograms(df[df['t_sq'] < 0.5], 't_sq', "Time * Time")
# %%
df['v_sq'] = df['v'] * df['v']
_plot_class_histograms(df[df['v_sq'] < 200], 'v_sq', "Velocity * Velocity")
# %% Drop outliers
df.drop(index=[133], inplace=True)  # Car with rating=1
# %%
# Encode the target as integers: car -> 0, bus -> 1.
df['car_or_bus'] = df['car_or_bus'].map({'car': 0, 'bus': 1})
# %%
# Hold out 20% of the rows for validation. The fixed seed keeps the split,
# and therefore any threshold tuned on it, reproducible between runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    df.drop('car_or_bus', axis=1),
    df['car_or_bus'],
    test_size=.2,
    random_state=6741,
)
# %%
def acc(pred, true_data):
    """Accuracy metric shaped like a LightGBM custom evaluation callback.

    Args:
        pred: Array of predicted scores; values above 0.5 count as the
            positive class.
        true_data: Dataset-like object whose ``get_label()`` returns the
            true labels (e.g. an ``lgb.Dataset``).

    Returns:
        A ``(name, value, is_higher_better)`` tuple: the metric name
        ``"accuracy"``, the accuracy of the thresholded predictions, and
        ``True`` because a higher accuracy is better.
    """
    # get_label() is the documented accessor and also works when the raw
    # `.label` attribute has not been populated yet.
    return "accuracy", accuracy_score(true_data.get_label(), pred > 0.5), True
# %%
# Disabled LightGBM experiment, kept for reference:
# params = {
#     'objective': 'binary',
#     'num_iterations': 100,
#     'learning_rate': 0.01,
#     'num_leaves': 16,
#     'metric': 'auc',
#     'early_stopping_rounds': 5,
#     'seed': 6741
# }
# # train = lgb.Dataset(df.drop('car_or_bus', axis=1), df['car_or_bus'])
# train = lgb.Dataset(X_train[['rating']], y_train)
# valid = lgb.Dataset(X_valid[['rating']], y_valid, reference=train)
# booster = lgb.train(params, train_set=train, valid_sets=valid)
# %%
# Fit on the training split only, so the threshold search below is scored on
# rows the tree has never seen (fitting on all of `df` would leak the
# validation rows into the model and inflate every accuracy figure).
tree = DecisionTreeClassifier(min_samples_leaf=10)
tree.fit(X_train, y_train)
# %%
# Column 1 of predict_proba is the probability of class 1 (bus). It does not
# depend on the candidate threshold, so compute it once instead of per step.
valid_bus_proba = tree.predict_proba(X_valid)[:, 1]
split_data = {
    s: accuracy_score(y_valid, valid_bus_proba > s)
    for s in np.linspace(0, 1, 10000)
}
# Distinct accuracies reached, then the (threshold, accuracy) pair that wins.
print(np.unique(list(split_data.values())))
print(max(split_data.items(), key=lambda x: x[1]))
# %%
# Refit on every labelled row so that later cells predict with a model that
# has been trained on all available data.
tree.fit(df.drop('car_or_bus', axis=1), df['car_or_bus'])
# %%
test = pd.read_csv('data/cars-test.csv')
# %%
# Engineer the same squared features as the training frame, in the same order.
test['distance_sq'] = test['distance'] * test['distance']
test['t_sq'] = test['t'] * test['t']
test['v_sq'] = test['v'] * test['v']
# %%
# Hand the tree exactly the columns it was fitted on, in the fitted order:
# a different order is rejected by recent scikit-learn and silently
# mis-assigns features on older versions.
feature_columns = df.drop('car_or_bus', axis=1).columns
# Decision threshold on P(bus); presumably copied from the threshold search
# output -- TODO confirm and re-tune whenever the model changes.
pred = tree.predict_proba(test[feature_columns])[:, 1] > 0.36363636363636365
pred = pd.Series(pred).map({False: 'car', True: 'bus'})
pred.to_csv('subm.csv', index=False)
Advertisement
Add Comment
Please, Sign In to add comment