import os.path
from collections import Counter

import numpy as np
import pandas as pd

from sklearn.ensemble import IsolationForest, RandomForestRegressor

# [L, R] - variance bounds; only features whose variance lies in this range are kept
L = 0.5
R = 10
# pos - indices of the features (columns) whose variance does NOT lie in [L, R]
# pos = [0, 1, 2, 4, 8, 13, 21, 22, 23, 27, 28]
# pos = [5, 20, 29]
pos = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28]

# all = {i for i in range(30)}
# s = {18, 10, 9, 8, 16, 0, 4}
# pos = list(all.difference(s))

class StandardScaler:
    def __init__(self):
        self.mean = []
        self.var = []  # note: stores the standard deviation of each column

    def fit(self, X):
        for j in range(len(X[0])):
            # column mean
            total = 0.0
            for i in range(len(X)):
                total += X[i][j]
            self.mean.append(total / len(X))
            # sample variance, then its square root
            total = 0.0
            for i in range(len(X)):
                total += (X[i][j] - self.mean[j]) * (X[i][j] - self.mean[j])
            total /= (len(X) - 1)
            self.var.append(np.sqrt(total))
        # delete_features_outlier(X, self.var)
        return self

    def transform(self, X):
        for i in range(len(X)):
            for j in range(len(X[i])):
                X[i][j] = (X[i][j] - self.mean[j]) / self.var[j]
        return X
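
# A minimal usage sketch of the scaler above on made-up data (defined only as
# an illustration, never called): every column ends up with mean 0 and sample
# standard deviation 1.
def _demo_standard_scaler():
    X = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
    scaled = StandardScaler().fit(X).transform(X)
    print(scaled)  # both columns become [-1, 0, 1]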

# With very little data, a feature with a very large (or very small)
# variance only hurts accuracy, so such features are removed.
def delete_features(X, var):
    ind = []
    for i in range(len(var)):
        if not (L < var[i] < R):
            ind.append(i)
    return np.delete(X, ind, 1)
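
# A quick sketch of delete_features on made-up data (never called): with
# L = 0.5 and R = 10, any column whose spread falls outside (0.5, 10) is dropped.
def _demo_delete_features():
    X = np.array([[1.0, 2.0], [3.0, 4.0]])
    print(delete_features(X, [0.1, 1.0]))  # keeps only the second column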

def delete_features_outlier(X, var):
    # treat the per-column spreads as a 1-D sample and drop the columns
    # that IsolationForest flags as outliers
    values = np.asarray(var).reshape((-1, 1))
    clf = IsolationForest(random_state=42)
    clf.fit(values)
    y_pred = clf.predict(values).reshape((1, -1)).tolist()[0]
    ind = []
    for i, v in enumerate(y_pred):
        if v == -1:
            ind.append(i)
    print(ind)
    return np.delete(X, ind, 1)

class MinMaxScaler:
    def __init__(self):
        self.data_min = []
        self.data_max = []

    def fit(self, X):
        for j in range(len(X[0])):
            self.data_min.append(X[:, j].min())
            self.data_max.append(X[:, j].max())
        return self

    def transform(self, X):
        for i in range(len(X)):
            for j in range(len(X[i])):
                X[i][j] = (X[i][j] - self.data_min[j]) / \
                          (self.data_max[j] - self.data_min[j])
        return X
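
# A minimal sketch of MinMaxScaler on made-up data (never called): every
# column is mapped linearly onto [0, 1].
def _demo_min_max_scaler():
    X = np.array([[0.0, 5.0], [5.0, 7.0], [10.0, 9.0]])
    scaled = MinMaxScaler().fit(X).transform(X)
    print(scaled)  # both columns become [0, 0.5, 1]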

def cross_features(X):
    # append every pairwise product of features to each row
    n = len(X[0])
    A = [[] for _ in range(len(X))]
    for i in range(len(X)):
        for j in range(n):
            for k in range(j, n):
                A[i].append(X[i][j] * X[i][k])
    return np.asarray(A)
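
# A quick sketch of cross_features (never called): n input features yield
# n * (n + 1) / 2 products per row.
def _demo_cross_features():
    X = np.array([[2.0, 3.0]])
    print(cross_features(X))  # [[4. 6. 9.]] - the products x0*x0, x0*x1, x1*x1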

class LinearRegression:
    def __init__(self):
        self.beta = None
        self.beta0 = None

    def fit(self, X, y):
        # with standardized (zero-mean) features the intercept is simply mean(y)
        self.beta0 = y.sum() / len(y)
        # normal equation: beta = (X^T X)^(-1) X^T y
        T = X.transpose()
        TX = np.matmul(T, X)
        ITX = np.linalg.inv(TX)
        ITXT = np.matmul(ITX, T)
        self.beta = np.matmul(ITXT, np.transpose(y))
        return self

    def predict(self, X):
        p = self.beta
        res = [0] * len(X)
        for k in range(len(res)):
            for i in range(len(p)):
                res[k] += p[i] * X[k][i]
            # beta0 is used as the free (intercept) term
            res[k] += self.beta0
        return res
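
# A sanity-check sketch for the normal-equation fit above (never called): on a
# zero-mean feature with an exactly linear target, the slope and intercept are
# recovered exactly.
def _demo_linear_regression():
    X = np.array([[-1.0], [0.0], [1.0]])  # already centered, as fit assumes
    y = np.array([1.0, 3.0, 5.0])         # y = 2 * x + 3
    lr = LinearRegression().fit(X, y)
    print(lr.beta, lr.beta0)              # [2.0] and 3.0
    print(lr.predict(X))                  # [1.0, 3.0, 5.0]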

def rmse(y_true, y_pred):
    total = 0.0
    for i in range(len(y_true)):
        total += (y_pred[i] - y_true[i]) * (y_pred[i] - y_true[i])
    return np.sqrt(total / len(y_true))
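
# A tiny numeric check of rmse (never called).
def _demo_rmse():
    print(rmse([0.0, 0.0], [3.0, 4.0]))  # sqrt((9 + 16) / 2) ~ 3.54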

def learn_test_split(X, y):
    # random 50/50 split; note that np.random.randint's upper bound is
    # exclusive, so (0, 2) is required to actually draw both 0 and 1
    XL = []
    XT = []
    yL = []
    yT = []
    for i in range(len(X)):
        a = np.random.randint(0, 2)
        if a == 0 and len(XL) < len(X) / 2:
            XL.append(X[i])
            yL.append(y[i])
        else:
            XT.append(X[i])
            yT.append(y[i])
    X_learn = np.asarray(XL)
    X_test = np.asarray(XT)
    y_learn = np.asarray(yL)
    y_test = np.asarray(yT)
    return X_learn, X_test, y_learn, y_test

def cv_linear_regression(X, y, *, n_iter=40):
    # Monte Carlo cross-validation: average the RMSE over random 50/50 splits
    sum_rmse = 0.0
    for _ in range(n_iter):
        X_learn, X_test, y_learn, y_test = learn_test_split(X, y)
        pl = Pipeline().fit(X_learn, y_learn)
        y_pred = pl.predict(X_test)
        sum_rmse += rmse(y_test, y_pred)
    return sum_rmse / n_iter

def feature_importance(X, Y, feature_names):
    # the target here is continuous, so a regressor (not a classifier) is needed
    forest = RandomForestRegressor(random_state=42, n_estimators=10)
    forest.fit(X, Y)
    importances = forest.feature_importances_

    indices = np.argsort(importances)[::-1]

    # print the feature ranking
    print("Feature ranking:")

    for f in range(X.shape[1]):
        print("%d. feature %s (%f)" % (f + 1, feature_names[indices[f]], importances[indices[f]]))

class Pipeline:
    def __init__(self):
        # self.scaler = MinMaxScaler()
        self.scaler = StandardScaler()
        self.LR = LinearRegression()

    def fit(self, X, y):
        self.scaler = self.scaler.fit(X)
        X = self.scaler.transform(X)
        self.LR = self.LR.fit(X, y)
        return self

    def predict(self, X):
        X = self.scaler.transform(X)
        y = self.LR.predict(X)
        return y

class CatEncoder:
    def __init__(self):
        self.feature_values = []
        self.rows = []

    def fit(self, X):
        for j in range(len(X[0])):
            if isinstance(X[0][j], str):
                self.rows.append(j)
                values = []
                for i in range(len(X)):
                    values.append((X[i][j], i))
                values.sort()
                self.feature_values.append(values)
        return self

    def transform(self, X):
        # replace every categorical value with its rank in the sorted column
        for k, j in enumerate(self.rows):
            for i, cur in enumerate(self.feature_values[k]):
                X[cur[1]][j] = i
        return X
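
# A minimal sketch of CatEncoder on made-up data (never called): string values
# are replaced by their rank in the sorted column; numeric columns are untouched.
def _demo_cat_encoder():
    X = np.array([["b", 1.0], ["a", 2.0], ["c", 3.0]], dtype=object)
    print(CatEncoder().fit(X).transform(X))  # first column becomes [1, 0, 2]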

def clean_data(X):
    # fill the missing cells of every column with that column's most common value
    for j in range(len(X[0])):
        cnt = Counter()
        empty = []
        for i in range(len(X)):
            if str(X[i][j]) != 'nan':
                cnt[X[i][j]] += 1
            else:
                empty.append(i)
        for v in empty:
            X[v][j] = cnt.most_common(1)[0][0]
    return X
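
# A minimal sketch of clean_data on made-up data (never called): the NaN cell
# is filled with the column's most frequent value.
def _demo_clean_data():
    X = np.array([[1.0], [1.0], [np.nan]])
    print(clean_data(X))  # the NaN becomes 1.0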

def into(X):
    # impute missing values, encode the categorical columns, cast to float
    X = clean_data(X)
    CE = CatEncoder().fit(X)
    X = CE.transform(X)
    X = X.astype('float64')
    return X

def main():
    df_learn = pd.read_csv("learn.csv")
    X_learn = df_learn.iloc[:, :-1].to_numpy()
    X_learn = into(X_learn)
    y_learn = df_learn.iloc[:, -1].to_numpy()
    X_learn = np.delete(X_learn, pos, 1)

    names = []
    for i in range(len(X_learn[1])):
        names.append(i)
    feature_importance(X_learn, y_learn, names)
    rmse_cv = cv_linear_regression(X_learn, y_learn)
    Pl = Pipeline().fit(X_learn, y_learn)

    df_test = pd.read_csv("test.csv")
    # X_test = pd.get_dummies(df_test.iloc[:, 1:], columns=cat_columns).to_numpy()
    X_test = df_test.iloc[:, 1:].to_numpy()
    X_test = into(X_test)
    X_test = np.delete(X_test, pos, 1)
    y_pred = Pl.predict(X_test)
    solution_name, _ext = os.path.splitext(os.path.basename(__file__))
    pd.DataFrame({"y": y_pred, "id": df_test["id"]}) \
        .to_csv(solution_name + ".csv", index=False, header=True)
    print("rmse:", rmse_cv)


if __name__ == "__main__":
    main()