import numpy as np
np.random.seed(1024)

import pandas as pd

import chainer
from chainer import serializers
from chainer.optimizers import Adam
import chainer.functions as F
import chainer.links as L
from chainer import reporter
from chainer.datasets import TupleDataset
from chainer import training
from chainer.training import extensions
from chainer.dataset import concat_examples

import base
import argparse
from collections import OrderedDict
import re
import six


mu = 4.4126225
sigma = 2.4928892
eps = 1e-5


class MLP(chainer.Chain):
    def __init__(self, in_size, hidden_size, out_size, large=False):
        super(MLP, self).__init__()
        self.large = large
        with self.init_scope():
            if large:
                self.l1 = L.Linear(in_size, hidden_size)
                self.bn1 = L.BatchNormalization(hidden_size)
                self.l2 = L.Linear(hidden_size, hidden_size)
                self.bn2 = L.BatchNormalization(hidden_size)
                self.l3 = L.Linear(hidden_size, hidden_size)
                self.bn3 = L.BatchNormalization(hidden_size)
                self.l4 = L.Linear(hidden_size, out_size)
            else:
                self.l1 = L.Linear(in_size, hidden_size)
                self.l2 = L.Linear(hidden_size, hidden_size)
                self.l3 = L.Linear(hidden_size, out_size)

    def predict(self, x):
        if self.large:
            x = self.l1(x)
            x = self.bn1(x)
            x = F.relu(x)

            x = self.l2(x)
            x = self.bn2(x)
            x = F.relu(x)

            x = self.l3(x)
            x = self.bn3(x)
            x = F.relu(x)

            x = self.l4(x)
        else:
            x = self.l1(x)
            x = F.relu(x)
            x = self.l2(x)
            x = F.relu(x)
            x = self.l3(x)

        return F.softmax(x)

    def __call__(self, x, d):
        y_pred = self.predict(x)
        approx_smape = F.sum(y_pred * d, axis=1)
        loss = F.mean(approx_smape)
        reporter.report({'loss': loss, 'approx_smape': approx_smape}, self)
        return loss

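# --- Illustrative sketch (added; not part of the original script) ---------------
# The MLP does not regress page views directly: its softmax output weights a fixed
# set of candidate predictions (the "date feats" built by PageViewDataset below),
# and `d` holds the SMAPE each candidate would incur, so the reported loss is the
# expected SMAPE under those weights. A minimal numpy version of the same idea,
# using hypothetical toy values:
def _example_expected_smape():
    weights = np.array([[0.7, 0.2, 0.1]])        # softmax output for one sample
    cand_smape = np.array([[0.05, 0.30, 0.60]])  # SMAPE of each candidate prediction
    return (weights * cand_smape).sum(axis=1)    # -> array([0.155])
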

class PageViewDataset:

    def __init__(self, purpose='valid', data_type='train', gap=0):
        gap = 2  # note: the constructor argument is overridden here
        need_y = (purpose == 'valid' or data_type == 'train')

        if purpose == 'valid':
            data = pd.read_hdf(base.WORKING_DIR + f'train_1.h5', 'tables')
            y_start = 440
        elif purpose == 'test':
            data = pd.read_hdf(base.WORKING_DIR + f'train_2.h5', 'tables')
            y_start = 805
        else:
            raise NotImplementedError

        if data_type == 'train':
            y_start -= 62 + gap

        self.date_str = pd.Series(data.columns[1:])
        self.date = pd.to_datetime(pd.Series(data.columns[1:]))
        self.page = data['Page']
        data = data.drop('Page', axis=1)
        data = data.values.astype(np.float32)

        if need_y:
            task_feats, date_feats, y = self.get_features(data, purpose, y_start, gap, True)

            def get_smape_each_feat(y_pred, y_true):
                y_true = y_true[..., None]
                smape = 2 * np.abs(y_pred - y_true) / (np.abs(y_pred) + y_true + eps)
                return smape

            y = y.reshape(-1)
            self.y = y
            date_feats = date_feats.reshape(-1, date_feats.shape[2])
            smape = get_smape_each_feat(date_feats, y)

        else:
            task_feats, date_feats = self.get_features(data, purpose, y_start, gap, False)
            date_feats = date_feats.reshape(-1, date_feats.shape[2])

        self.date_feats = date_feats

        task_feats = np.tile(task_feats[:, None, :], (1, 62, 1))
        task_date_feats = np.zeros((task_feats.shape[0], 62, 2))
        task_date_feats[:, :, 0] = self.week_of_y / 6
        task_date_feats[:, :, 1] = np.arange(62) / 61
        total_feats = np.concatenate([task_feats, task_date_feats], axis=2)
        total_feats = total_feats.reshape(-1, total_feats.shape[2])
        total_feats = total_feats.astype(np.float32)

        if need_y:
            self._datasets = (total_feats, smape)
        else:
            self._datasets = (total_feats,)
        self._length = len(self._datasets[0])

    def get_features(self, data, purpose, y_start, gap, return_y=True):
        n = data.shape[0]
        x_stop = y_start - gap

        nan_count = np.mean(~np.isfinite(data[:, x_stop - 7:x_stop]), axis=1)
        data[~np.isfinite(data)] = 0  # destructive assignment !!!!
        zero_count = np.mean(data[:, x_stop - 7:x_stop] == 0, axis=1)

        if return_y:
            y = data[:, y_start:y_start + 62]

        date_feat_num = 17
        date_feats = np.empty((n, 62, date_feat_num), dtype=np.float32)
        # weekly median
        def weekly_median(week_num):
            term = data[:, x_stop - (7 * week_num):x_stop]
            med = np.median(term.reshape(n, week_num, 7), axis=1)
            return np.tile(med, 10)[:, gap % 7:gap % 7 + 62]
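        # Note (added for clarity, not in the original): `med` has shape (n, 7), one
        # median per weekday position over the last `week_num` weeks; tiling it 10 times
        # gives 70 columns, and the `gap % 7` offset slices out 62 columns that follow
        # the same weekly cycle as the 62 target days (y_start is `gap` days after x_stop).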
        date_feats[:, :, 0] = weekly_median(1)
        date_feats[:, :, 1] = weekly_median(2)
        date_feats[:, :, 2] = weekly_median(4)
        date_feats[:, :, 3] = weekly_median(8)
        # median of weekly medians
        date_feats[:, :, 4] = np.median(date_feats[:, :, 0:2], axis=2)
        date_feats[:, :, 5] = np.median(date_feats[:, :, 0:4], axis=2)
        # # auto reg
        # date_feats[:, :, 4] = date_feats[:, :, 0] - date_feats[:, :, 1]
        # date_feats[:, :, 5] = date_feats[:, :, 0] - date_feats[:, :, 3]
        # last year
        one_year_back = 366 if purpose == 'valid' else 365
        date_feats[:, :, 6] = data[:, y_start - one_year_back:y_start - one_year_back + 62]

        # dayofweek of self.date[y_start - 77:y_start + 62 - 77] equals
        # that of self.date[y_start:y_start + 62]
        self.week_of_y = self.date[y_start - 77:y_start + 62 - 77].dt.dayofweek

        # weekend or weekday
        def assign_weekend_or_weekday(i, term_length):
            term = data[:, x_stop - term_length:x_stop]
            week_of_term = self.date[x_stop - term_length:x_stop].dt.dayofweek

            date_feats[:, self.week_of_y >= 5, i] = np.median(term[:, np.where(week_of_term >= 5)[0]], axis=1)[:, None]
            date_feats[:, self.week_of_y < 5, i] = np.median(term[:, np.where(week_of_term < 5)[0]], axis=1)[:, None]
        # define the windows according to Ehsan's kernel
        r = 1.61803398875
        windows = np.round(r ** np.arange(0, 9) * 7).astype(int)
        for i, w in enumerate(windows):
            assign_weekend_or_weekday(i + 7, w)
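        # Note (added, not in the original): with r the golden ratio, the rounded window
        # lengths come out to roughly 7, 11, 18, 30, 48, 78, 126, 203 and 329 days, i.e.
        # geometrically growing look-back windows as in the referenced kernel.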
        # median of medians
        date_feats[:, :, 16] = np.median(date_feats[:, :, 7:16], axis=2)

        # standardize for task feats
        data = (np.log1p(data) - mu) / sigma

        task_feat_num = 16
        task_feats = np.empty((n, task_feat_num), dtype=np.float32)

        # count feats
        task_feats[:, 0] = nan_count
        task_feats[:, 1] = zero_count
        # short term volatility
        task_feats[:, 2] = np.std(data[:, x_stop - 7:x_stop], axis=1)
        # latest diff
        task_feats[:, 3] = data[:, x_stop - 1] - data[:, x_stop - 2]
        # median
        task_feats[:, 4] = np.median(data[:, x_stop - 7:x_stop], axis=1)
        task_feats[:, 5] = np.median(data[:, x_stop - 30:x_stop], axis=1)
        task_feats[:, 6] = np.median(data[:, x_stop - 60:x_stop], axis=1)
        # 90th percentile
        task_feats[:, 7] = np.percentile(data[:, x_stop - 7:x_stop], 90, axis=1)
        task_feats[:, 8] = np.percentile(data[:, x_stop - 30:x_stop], 90, axis=1)
        task_feats[:, 9] = np.percentile(data[:, x_stop - 60:x_stop], 90, axis=1)
        # auto reg
        task_feats[:, 10] = task_feats[:, 4] - task_feats[:, 5]
        task_feats[:, 11] = task_feats[:, 4] - task_feats[:, 6]
        # argmax pos
        task_feats[:, 12] = np.argmax(data[:, x_stop - 30:x_stop], axis=1) / 29
        task_feats[:, 13] = np.argmax(data[:, x_stop - 60:x_stop], axis=1) / 59
        # diff between max and latest
        task_feats[:, 14] = data[:, x_stop - 1] - np.max(data[:, x_stop - 30:x_stop], axis=1)
        task_feats[:, 15] = data[:, x_stop - 1] - np.max(data[:, x_stop - 60:x_stop], axis=1)

        task_dummy_feats = {}
        pat = re.compile(r'(.*)_([^.]+)\.[^.]+.org_(.*)_(.*)')
        splits = self.page.map(lambda x: pat.match(x).groups()).tolist()
        splits = pd.DataFrame(splits, columns=['name', 'country', 'access', 'agent'])
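        # Note (added for illustration): a page name such as
        # '2NE1_zh.wikipedia.org_all-access_spider' is split by the pattern above into
        # ('2NE1', 'zh', 'all-access', 'spider'), i.e. name, country, access, agent.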
        def add_dummies(prefix):
            df = pd.get_dummies(splits[prefix], prefix=prefix)
            for col in df.columns:
                task_dummy_feats[col] = df[col]
        add_dummies('country')
        add_dummies('access')
        add_dummies('agent')

        task_feats = np.concatenate([task_feats, pd.DataFrame(task_dummy_feats).values], axis=1)

        if return_y:
            return task_feats, date_feats, y
        else:
            return task_feats, date_feats

    def __getitem__(self, index):
        batches = [dataset[index] for dataset in self._datasets]
        if isinstance(index, slice):
            length = len(batches[0])
            return [tuple([batch[i] for batch in batches])
                    for i in six.moves.range(length)]
        else:
            return tuple(batches)

    def __len__(self):
        return self._length


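# Illustrative sketch (added). `base.SMAPE` used below comes from the author's `base`
# module, which is not shown here; it presumably implements the competition metric,
# the symmetric mean absolute percentage error. A minimal reference version under
# that assumption:
def _smape_reference(y_pred, y_true):
    denom = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    diff = np.abs(y_pred - y_true) / np.where(denom == 0, 1.0, denom)
    diff = np.where(denom == 0, 0.0, diff)  # convention: both zero -> zero error
    return np.mean(diff)
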
if __name__ == '__main__':
    chainer.set_debug(True)
    chainer.config.meta_train = True

    # TODO: write argparse description
    parser = argparse.ArgumentParser()
    parser.add_argument('-batch_size', default=256, type=int)
    parser.add_argument('-n_iter', default=100, type=int)
    parser.add_argument('-valid_interval', default=1, type=int)
    parser.add_argument('-valid_batch_size', default=1024, type=int)
    parser.add_argument('-save_interval', default=1, type=int)
    parser.add_argument('-gpu', default=-1, type=int)
    parser.add_argument('-large', action='store_true')
    parser.add_argument('-description', default='no description')
    parser.add_argument('-purpose', default='valid')
    args = parser.parse_args()
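    # Example invocation (added; the file name is hypothetical, flags as defined above):
    #   python this_script.py -purpose valid -batch_size 256 -n_iter 100 -gpu 0 -large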
    om = base.OutputManager(vars(args))

    train = PageViewDataset(args.purpose, 'train')
    valid = PageViewDataset(args.purpose, 'test')
    model = MLP(train._datasets[0].shape[1], 256, train._datasets[1].shape[1], args.large)

    # transfer model to GPU
    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu(args.gpu)
        # chainer.cuda.to_gpu(train._datasets[0], args.gpu)
        # chainer.cuda.to_gpu(train._datasets[1], args.gpu)

    optimizer = Adam()
    optimizer.setup(model)

    train_iter = chainer.iterators.SerialIterator(train, args.batch_size, repeat=True, shuffle=True)
    valid_iter = chainer.iterators.SerialIterator(valid, args.valid_batch_size, repeat=False, shuffle=False)

    updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu)
    trainer = training.Trainer(updater, (args.n_iter, 'epoch'), out=om.get_path())

    trainer.extend(extensions.dump_graph('main/loss'))
    trainer.extend(extensions.snapshot(), trigger=(args.save_interval, 'epoch'))
    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.ProgressBar())

    if args.purpose == 'valid':
        trainer.extend(extensions.Evaluator(valid_iter, model, device=args.gpu),
                       trigger=(args.valid_interval, 'epoch'))
        trainer.extend(extensions.PrintReport(
            ['epoch', 'main/loss', 'validation/main/loss', 'elapsed_time']))

        @training.make_extension(trigger=training.triggers.MinValueTrigger(
            'validation/main/loss', trigger=(args.valid_interval, 'epoch')), priority=-100)
        def save_base_model(trainer):
            print('save best')
            serializers.save_npz(om.get_path() + 'best.model', model)

        trainer.extend(save_base_model)
    else:
        trainer.extend(extensions.PrintReport(
            ['epoch', 'main/loss', 'elapsed_time']))

    trainer.run()

    if args.purpose == 'valid':
        valid_iter.reset()
        pred_valid = []
        with chainer.no_backprop_mode():
            for batch in valid_iter:
                x, _ = concat_examples(batch, args.gpu)
                pred_valid.append(chainer.cuda.to_cpu(model.predict(x).data))
        pred_valid = np.concatenate(pred_valid, axis=0)

        y = valid.y
        date_feats = valid.date_feats
        pred1 = (date_feats * pred_valid).sum(axis=1)
        pred2 = np.round(pred1)
        pred3 = np.zeros(pred_valid.shape[0])
        for i in range(pred_valid.shape[0]):
            pred3[i] = date_feats[i, np.argmax(pred_valid[i])]

        print(1, base.SMAPE(pred1, y))
        print(2, base.SMAPE(pred2, y))
        print(3, base.SMAPE(pred3, y))

        serializers.load_npz(om.get_path() + 'best.model', model)

        valid_iter.reset()
        pred_valid = []
        with chainer.no_backprop_mode():
            for batch in valid_iter:
                x, _ = concat_examples(batch, args.gpu)
                pred_valid.append(chainer.cuda.to_cpu(model.predict(x).data))
        pred_valid = np.concatenate(pred_valid, axis=0)

        y = valid.y
        date_feats = valid.date_feats
        pred1 = (date_feats * pred_valid).sum(axis=1)
        pred2 = np.round(pred1)
        pred3 = np.zeros(pred_valid.shape[0])
        for i in range(pred_valid.shape[0]):
            pred3[i] = date_feats[i, np.argmax(pred_valid[i])]

        print(1, base.SMAPE(pred1, y))
        print(2, base.SMAPE(pred2, y))
        print(3, base.SMAPE(pred3, y))
    else:
        valid_iter.reset()
        pred_valid = []
        with chainer.no_backprop_mode():
            for batch in valid_iter:
                # test-time dataset yields one-element tuples (no targets),
                # so unpack a single array here
                x, = concat_examples(batch, args.gpu)
                pred_valid.append(chainer.cuda.to_cpu(model.predict(x).data))
        pred_valid = np.concatenate(pred_valid, axis=0)

        date_feats = valid.date_feats
        pred = (date_feats * pred_valid).sum(axis=1)
        pred = np.round(pred)
        # pred = np.zeros(pred_valid.shape[0])
        # for i in range(pred_valid.shape[0]):
        #     pred[i] = date_feats[i, np.argmax(pred_valid[i])]

        pred = pred.reshape((-1, 62))
        assert len(pred) == len(valid.page)
        pred_df = pd.DataFrame(pred, columns=pd.date_range('2017-09-13', '2017-11-13'), index=valid.page)
        pred_df = pred_df.reset_index()
        pred_df.to_hdf(om.get_path() + 'pred_df.h5', 'tables', complevel=9, complib='blosc')