gt22

Untitled

Dec 9th, 2018
275
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.31 KB | None | 0 0
  1. # %%
  2. import numpy as np
  3. import pandas as pd
  4. from matplotlib import pyplot as plt
  5. import lightgbm as lgb
  6. from sklearn.model_selection import train_test_split
  7. from sklearn.metrics import accuracy_score
  8. from sklearn.tree import DecisionTreeClassifier, export_graphviz
  9.  
  10. # %%
  11. df = pd.read_csv('data/cars-train.csv')
  12. # %%
  13.  
  14.  
  15. def get(d, **kwargs):
  16.     selector = np.ones(d.shape[0], dtype=bool)
  17.     for name, value in kwargs.items():
  18.         selector &= (d[name] == value)
  19.     return d[selector]
  20.  
  21.  
  22. def car(d):
  23.     return get(d, car_or_bus='car')
  24.  
  25.  
  26. def bus(d):
  27.     return get(d, car_or_bus='bus')
  28.  
  29.  
  30. # %%
  31. car(df)['rating'].plot.hist(color='r', alpha=0.5, label='car')
  32. bus(df)['rating'].plot.hist(color='b', alpha=0.5, label='bus')
  33. plt.title("Rating")
  34. plt.legend()
  35. plt.show()
  36. # %%
  37. car(df)['v'].plot.hist(color='r', alpha=0.5, label='car')
  38. bus(df)['v'].plot.hist(color='b', alpha=0.5, label='bus')
  39. plt.title("Velocity")
  40. plt.legend()
  41. plt.show()
  42. # %%
  43. car(df)['t'].plot.hist(color='r', alpha=0.5, label='car')
  44. bus(df)['t'].plot.hist(color='b', alpha=0.5, label='bus')
  45. plt.title("Time")
  46. plt.legend()
  47. plt.show()
  48. # %%
  49. car(df)['distance'].plot.hist(color='r', alpha=0.5, label='car')
  50. bus(df)['distance'].plot.hist(color='b', alpha=0.5, label='bus')
  51. plt.title("Distance")
  52. plt.legend()
  53. plt.show()
  54. # %%
  55. (car(df)['v'] * car(df)['distance']).plot.hist(color='r', alpha=0.5, label='car')
  56. (bus(df)['v'] * bus(df)['distance']).plot.hist(color='b', alpha=0.5, label='bus')
  57. plt.title("Velocity * Distance")
  58. plt.legend()
  59. plt.show()
  60. # %%
  61. (car(df)['t'] * car(df)['distance']).plot.hist(color='r', alpha=0.5, label='car')
  62. (bus(df)['t'] * bus(df)['distance']).plot.hist(color='b', alpha=0.5, label='bus')
  63. plt.title("Time * Distance")
  64. plt.legend()
  65. plt.show()
  66. # %%
  67. (car(df)['distance'] * car(df)['distance']).plot.hist(color='r', alpha=0.5, label='car')
  68. (bus(df)['distance'] * bus(df)['distance']).plot.hist(color='b', alpha=0.5, label='bus')
  69. plt.title("Distance * Distance")
  70. plt.legend()
  71. plt.show()
  72. # %%
  73. df['distance_sq'] = df['distance'] * df['distance']
  74. car(df[df['distance_sq'] < 70])['distance_sq'].plot.hist(color='r', alpha=0.5, label='car')
  75. bus(df[df['distance_sq'] < 70])['distance_sq'].plot.hist(color='b', alpha=0.5, label='bus')
  76. plt.title("Distance * Distance")
  77. plt.legend()
  78. plt.show()
  79. # %%
  80. df['t_sq'] = df['t'] * df['t']
  81. car(df[df['t_sq'] < 0.5])['t_sq'].plot.hist(color='r', alpha=0.5, label='car')
  82. bus(df[df['t_sq'] < 0.5])['t_sq'].plot.hist(color='b', alpha=0.5, label='bus')
  83. plt.title("Time * Time")
  84. plt.legend()
  85. plt.show()
  86. # %%
  87. df['v_sq'] = df['v'] * df['v']
  88. car(df[df['v_sq'] < 200])['v_sq'].plot.hist(color='r', alpha=0.5, label='car')
  89. bus(df[df['v_sq'] < 200])['v_sq'].plot.hist(color='b', alpha=0.5, label='bus')
  90. plt.title("Velocity * Velocity")
  91. plt.legend()
  92. plt.show()
  93. # %% Drop outliers
  94. df.drop([133], inplace=True)  # Car with rating=1
  95. # %%
  96. df['car_or_bus'] = df['car_or_bus'].map({'car': 0, 'bus': 1})
  97. # %%
  98. X_train, X_valid, y_train, y_valid = train_test_split(df.drop('car_or_bus', axis=1), df['car_or_bus'], test_size=.2)
  99. # %%
  100.  
  101.  
  102. def acc(pred, true_data):
  103.     return "accuracy", accuracy_score(true_data.label, pred > 0.5), True
  104.  
  105.  
  106. # %%
  107. # params = {
  108. #     'objective': 'binary',
  109. #     'num_iterations': 100,
  110. #     'learning_rate': 0.01,
  111. #     'num_leaves': 16,
  112. #     'metric': 'auc',
  113. #     'early_stopping_rounds': 5,
  114. #     'seed': 6741
  115. # }
  116. # # train = lgb.Dataset(df.drop('car_or_bus', axis=1), df['car_or_bus'])
  117. # train = lgb.Dataset(X_train[['rating']], y_train)
  118. # valid = lgb.Dataset(X_valid[['rating']], y_valid, reference=train)
  119. # booster = lgb.train(params, train_set=train, valid_sets=valid)
  120. # %%
  121. tree = DecisionTreeClassifier(min_samples_leaf=10)
  122. tree.fit(df.drop('car_or_bus', axis=1), df['car_or_bus'])
  123. # %%
  124. split_data = {}
  125. for s in np.linspace(0, 1, 10000):
  126.     split_data[s] = accuracy_score(y_valid, (tree.predict_proba(X_valid) > s)[:, 1])
  127. print(np.unique(list(split_data.values())))
  128. print(max(split_data.items(), key=lambda x: x[1]))
  129. # %%
  130. test = pd.read_csv('data/cars-test.csv')
  131. # %%
  132. test['distance_sq'] = test['distance'] * test['distance']
  133. test['v_sq'] = test['v'] * test['v']
  134. test['t_sq'] = test['t'] * test['t']
  135. # %%
  136. pred = tree.predict_proba(test)[:, 1] > 0.36363636363636365
  137. pred = pd.Series(pred).map({False: 'car', True: 'bus'})
  138. pred.to_csv('subm.csv', index=False)
Advertisement
Add Comment
Please, Sign In to add comment