import numpy as np
np.random.seed(1024)
import pandas as pd
import chainer
from chainer import serializers
from chainer.optimizers import Adam
import chainer.functions as F
import chainer.links as L
from chainer import reporter
from chainer.datasets import TupleDataset
from chainer import training
from chainer.training import extensions
from chainer.dataset import concat_examples
import base
import argparse
from collections import OrderedDict
import re
import six

# statistics of log1p(page views) used to standardize the task features,
# and a small epsilon to keep the SMAPE denominator away from zero
mu = 4.4126225
sigma = 2.4928892
eps = 1e-5
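
# MLP is effectively a meta-model: for each (page, target date) pair it
# outputs a softmax weighting over the candidate median-based forecasts built
# by PageViewDataset, and it is trained to minimize the weighted average of
# each candidate's SMAPE (the `d` argument of __call__).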
class MLP(chainer.Chain):
    def __init__(self, in_size, hidden_size, out_size, large=False):
        super(MLP, self).__init__()
        self.large = large
        with self.init_scope():
            if large:
                self.l1 = L.Linear(in_size, hidden_size)
                self.bn1 = L.BatchNormalization(hidden_size)
                self.l2 = L.Linear(hidden_size, hidden_size)
                self.bn2 = L.BatchNormalization(hidden_size)
                self.l3 = L.Linear(hidden_size, hidden_size)
                self.bn3 = L.BatchNormalization(hidden_size)
                self.l4 = L.Linear(hidden_size, out_size)
            else:
                self.l1 = L.Linear(in_size, hidden_size)
                self.l2 = L.Linear(hidden_size, hidden_size)
                self.l3 = L.Linear(hidden_size, out_size)

    def predict(self, x):
        if self.large:
            x = self.l1(x)
            x = self.bn1(x)
            x = F.relu(x)
            x = self.l2(x)
            x = self.bn2(x)
            x = F.relu(x)
            x = self.l3(x)
            x = self.bn3(x)
            x = F.relu(x)
            x = self.l4(x)
        else:
            x = self.l1(x)
            x = F.relu(x)
            x = self.l2(x)
            x = F.relu(x)
            x = self.l3(x)
        return F.softmax(x)

    def __call__(self, x, d):
        # d holds the SMAPE of each candidate forecast; weighting the
        # candidates by the predicted softmax gives a differentiable
        # surrogate of the final SMAPE.
        y_pred = self.predict(x)
        approx_smape = F.sum(y_pred * d, axis=1)
        loss = F.mean(approx_smape)
        reporter.report({'loss': loss, 'approx_smape': approx_smape}, self)
        return loss
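

# Dataset of (features, per-candidate SMAPE) pairs, one example per
# (page, target date). When no targets are available (purpose='test' with
# data_type='test') each example is just the feature vector.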
class PageViewDataset:
    def __init__(self, purpose='valid', data_type='train', gap=0):
        # NOTE: the gap argument is overridden; the gap between the end of the
        # input window and the start of the target window is fixed to 2 days.
        gap = 2
        need_y = (purpose == 'valid' or data_type == 'train')
        if purpose == 'valid':
            data = pd.read_hdf(base.WORKING_DIR + 'train_1.h5', 'tables')
            y_start = 440
        elif purpose == 'test':
            data = pd.read_hdf(base.WORKING_DIR + 'train_2.h5', 'tables')
            y_start = 805
        else:
            raise NotImplementedError
        if data_type == 'train':
            y_start -= 62 + gap
        self.date_str = pd.Series(data.columns[1:])
        self.date = pd.to_datetime(pd.Series(data.columns[1:]))
        self.page = data['Page']
        data = data.drop('Page', axis=1)
        data = data.values.astype(np.float32)
        if need_y:
            task_feats, date_feats, y = self.get_features(data, purpose, y_start, gap, True)

            def get_smape_each_feat(y_pred, y_true):
                y_true = y_true[..., None]
                smape = 2 * np.abs(y_pred - y_true) / (np.abs(y_pred) + y_true + eps)
                return smape

            y = y.reshape(-1)
            self.y = y
            date_feats = date_feats.reshape(-1, date_feats.shape[2])
            smape = get_smape_each_feat(date_feats, y)
        else:
            task_feats, date_feats = self.get_features(data, purpose, y_start, gap, False)
            date_feats = date_feats.reshape(-1, date_feats.shape[2])
        self.date_feats = date_feats
        task_feats = np.tile(task_feats[:, None, :], (1, 62, 1))
        task_date_feats = np.zeros((task_feats.shape[0], 62, 2))
        task_date_feats[:, :, 0] = self.week_of_y / 6
        task_date_feats[:, :, 1] = np.arange(62) / 61
        total_feats = np.concatenate([task_feats, task_date_feats], axis=2)
        total_feats = total_feats.reshape(-1, total_feats.shape[2])
        total_feats = total_feats.astype(np.float32)
        if need_y:
            self._datasets = (total_feats, smape)
        else:
            self._datasets = (total_feats,)
        self._length = len(self._datasets[0])
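
    # Builds the candidate per-date forecasts (date_feats), the per-page
    # summary features (task_feats) and, when return_y is True, the true
    # targets for the 62-day horizon.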
    def get_features(self, data, purpose, y_start, gap, return_y=True):
        n = data.shape[0]
        x_stop = y_start - gap
        nan_count = np.mean(~np.isfinite(data[:, x_stop - 7:x_stop]), axis=1)
        data[~np.isfinite(data)] = 0  # NOTE: destructive in-place assignment
        zero_count = np.mean(data[:, x_stop - 7:x_stop] == 0, axis=1)
        if return_y:
            y = data[:, y_start:y_start + 62]
        date_feat_num = 17
        date_feats = np.empty((n, 62, date_feat_num), dtype=np.float32)

        # weekly median
        def weekly_median(week_num):
            term = data[:, x_stop - (7 * week_num):x_stop]
            med = np.median(term.reshape(n, week_num, 7), axis=1)
            return np.tile(med, 10)[:, gap % 7:gap % 7 + 62]

        date_feats[:, :, 0] = weekly_median(1)
        date_feats[:, :, 1] = weekly_median(2)
        date_feats[:, :, 2] = weekly_median(4)
        date_feats[:, :, 3] = weekly_median(8)
        # median of weekly medians
        date_feats[:, :, 4] = np.median(date_feats[:, :, 0:2], axis=2)
        date_feats[:, :, 5] = np.median(date_feats[:, :, 0:4], axis=2)
        # # auto reg
        # date_feats[:, :, 4] = date_feats[:, :, 0] - date_feats[:, :, 1]
        # date_feats[:, :, 5] = date_feats[:, :, 0] - date_feats[:, :, 3]
        # last year
        one_year_back = 366 if purpose == 'valid' else 365
        date_feats[:, :, 6] = data[:, y_start - one_year_back:y_start - one_year_back + 62]
        # dayofweek of self.date[y_start - 77:y_start + 62 - 77] equals
        # that of self.date[y_start:y_start + 62]
        self.week_of_y = self.date[y_start - 77:y_start + 62 - 77].dt.dayofweek

        # weekend or weekday
        def assign_weekend_or_weekday(i, term_length):
            term = data[:, x_stop - term_length:x_stop]
            week_of_term = self.date[x_stop - term_length:x_stop].dt.dayofweek
            date_feats[:, self.week_of_y >= 5, i] = np.median(term[:, np.where(week_of_term >= 5)[0]], axis=1)[:, None]
            date_feats[:, self.week_of_y < 5, i] = np.median(term[:, np.where(week_of_term < 5)[0]], axis=1)[:, None]

        # define the windows according to Ehsan's kernel (powers of the golden ratio)
        r = 1.61803398875
        windows = np.round(r ** np.arange(0, 9) * 7).astype(int)
        for i, w in enumerate(windows):
            assign_weekend_or_weekday(i + 7, w)
        # median of medians
        date_feats[:, :, 16] = np.median(date_feats[:, :, 7:16], axis=2)

        # standardize the data for the task feats
        data = (np.log1p(data) - mu) / sigma
        task_feat_num = 16
        task_feats = np.empty((n, task_feat_num), dtype=np.float32)
        # count feats
        task_feats[:, 0] = nan_count
        task_feats[:, 1] = zero_count
        # short term volatility
        task_feats[:, 2] = np.std(data[:, x_stop - 7:x_stop], axis=1)
        # latest diff
        task_feats[:, 3] = data[:, x_stop - 1] - data[:, x_stop - 2]
        # median
        task_feats[:, 4] = np.median(data[:, x_stop - 7:x_stop], axis=1)
        task_feats[:, 5] = np.median(data[:, x_stop - 30:x_stop], axis=1)
        task_feats[:, 6] = np.median(data[:, x_stop - 60:x_stop], axis=1)
        # 90th percentile
        task_feats[:, 7] = np.percentile(data[:, x_stop - 7:x_stop], 90, axis=1)
        task_feats[:, 8] = np.percentile(data[:, x_stop - 30:x_stop], 90, axis=1)
        task_feats[:, 9] = np.percentile(data[:, x_stop - 60:x_stop], 90, axis=1)
        # auto reg
        task_feats[:, 10] = task_feats[:, 4] - task_feats[:, 5]
        task_feats[:, 11] = task_feats[:, 4] - task_feats[:, 6]
        # argmax pos
        task_feats[:, 12] = np.argmax(data[:, x_stop - 30:x_stop], axis=1) / 29
        task_feats[:, 13] = np.argmax(data[:, x_stop - 60:x_stop], axis=1) / 59
        # diff between max and latest
        task_feats[:, 14] = data[:, x_stop - 1] - np.max(data[:, x_stop - 30:x_stop], axis=1)
        task_feats[:, 15] = data[:, x_stop - 1] - np.max(data[:, x_stop - 60:x_stop], axis=1)

        # dummy features parsed from the Page string (country, access, agent)
        task_dummy_feats = {}
        pat = re.compile(r'(.*)_([^.]+)\.[^.]+.org_(.*)_(.*)')
        splits = self.page.map(lambda x: pat.match(x).groups()).tolist()
        splits = pd.DataFrame(splits, columns=['name', 'country', 'access', 'agent'])

        def add_dummies(prefix):
            df = pd.get_dummies(splits[prefix], prefix=prefix)
            for col in df.columns:
                task_dummy_feats[col] = df[col]

        add_dummies('country')
        add_dummies('access')
        add_dummies('agent')
        task_feats = np.concatenate([task_feats, pd.DataFrame(task_dummy_feats).values], axis=1)
        if return_y:
            return task_feats, date_feats, y
        else:
            return task_feats, date_feats

    def __getitem__(self, index):
        batches = [dataset[index] for dataset in self._datasets]
        if isinstance(index, slice):
            length = len(batches[0])
            return [tuple([batch[i] for batch in batches])
                    for i in six.moves.range(length)]
        else:
            return tuple(batches)

    def __len__(self):
        return self._length
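

# Training script: fits the meta-model on the per-candidate SMAPE targets,
# evaluates on a held-out window when -purpose is 'valid' (keeping the
# snapshot with the best validation loss), and for -purpose 'test' writes the
# blended 62-day forecast (2017-09-13 to 2017-11-13) to pred_df.h5.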
if __name__ == '__main__':
    chainer.set_debug(True)
    chainer.config.meta_train = True
    # TODO : write argparse description
    parser = argparse.ArgumentParser()
    parser.add_argument('-batch_size', default=256, type=int)
    parser.add_argument('-n_iter', default=100, type=int)
    parser.add_argument('-valid_interval', default=1, type=int)
    parser.add_argument('-valid_batch_size', default=1024, type=int)
    parser.add_argument('-save_interval', default=1, type=int)
    parser.add_argument('-gpu', default=-1, type=int)
    parser.add_argument('-large', action='store_true')
    parser.add_argument('-description', default='no description')
    parser.add_argument('-purpose', default='valid')
    args = parser.parse_args()
    om = base.OutputManager(vars(args))
    train = PageViewDataset(args.purpose, 'train')
    valid = PageViewDataset(args.purpose, 'test')
    model = MLP(train._datasets[0].shape[1], 256, train._datasets[1].shape[1], args.large)
    # transfer model to GPU
    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu(args.gpu)
        # chainer.cuda.to_gpu(train._datasets[0], args.gpu)
        # chainer.cuda.to_gpu(train._datasets[1], args.gpu)
    optimizer = Adam()
    optimizer.setup(model)
    train_iter = chainer.iterators.SerialIterator(train, args.batch_size, repeat=True, shuffle=True)
    valid_iter = chainer.iterators.SerialIterator(valid, args.valid_batch_size, repeat=False, shuffle=False)
    updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu)
    trainer = training.Trainer(updater, (args.n_iter, 'epoch'), out=om.get_path())
    trainer.extend(extensions.dump_graph('main/loss'))
    trainer.extend(extensions.snapshot(), trigger=(args.save_interval, 'epoch'))
    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.ProgressBar())
    if args.purpose == 'valid':
        trainer.extend(extensions.Evaluator(valid_iter, model, device=args.gpu),
                       trigger=(args.valid_interval, 'epoch'))
        trainer.extend(extensions.PrintReport(
            ['epoch', 'main/loss', 'validation/main/loss', 'elapsed_time']))

        @training.make_extension(trigger=training.triggers.MinValueTrigger(
            'validation/main/loss', trigger=(args.valid_interval, 'epoch')), priority=-100)
        def save_base_model(trainer):
            print('save best')
            serializers.save_npz(om.get_path() + 'best.model', model)

        trainer.extend(save_base_model)
    else:
        trainer.extend(extensions.PrintReport(
            ['epoch', 'main/loss', 'elapsed_time']))
    trainer.run()
    if args.purpose == 'valid':
        # score the model from the final epoch; using_config('train', False)
        # only matters for batch normalization in the -large model
        valid_iter.reset()
        pred_valid = []
        with chainer.using_config('train', False), chainer.no_backprop_mode():
            for batch in valid_iter:
                x, _ = concat_examples(batch, args.gpu)
                pred_valid.append(chainer.cuda.to_cpu(model.predict(x).data))
        pred_valid = np.concatenate(pred_valid, axis=0)
        y = valid.y
        date_feats = valid.date_feats
        pred1 = (date_feats * pred_valid).sum(axis=1)
        pred2 = np.round(pred1)
        pred3 = np.zeros(pred_valid.shape[0])
        for i in range(pred_valid.shape[0]):
            pred3[i] = date_feats[i, np.argmax(pred_valid[i])]
        print(1, base.SMAPE(pred1, y))
        print(2, base.SMAPE(pred2, y))
        print(3, base.SMAPE(pred3, y))

        # score the snapshot with the best validation loss
        serializers.load_npz(om.get_path() + 'best.model', model)
        valid_iter.reset()
        pred_valid = []
        with chainer.using_config('train', False), chainer.no_backprop_mode():
            for batch in valid_iter:
                x, _ = concat_examples(batch, args.gpu)
                pred_valid.append(chainer.cuda.to_cpu(model.predict(x).data))
        pred_valid = np.concatenate(pred_valid, axis=0)
        y = valid.y
        date_feats = valid.date_feats
        pred1 = (date_feats * pred_valid).sum(axis=1)
        pred2 = np.round(pred1)
        pred3 = np.zeros(pred_valid.shape[0])
        for i in range(pred_valid.shape[0]):
            pred3[i] = date_feats[i, np.argmax(pred_valid[i])]
        print(1, base.SMAPE(pred1, y))
        print(2, base.SMAPE(pred2, y))
        print(3, base.SMAPE(pred3, y))
    else:
        valid_iter.reset()
        pred_valid = []
        with chainer.using_config('train', False), chainer.no_backprop_mode():
            for batch in valid_iter:
                # the test dataset has no targets, so each example is a 1-tuple
                x, = concat_examples(batch, args.gpu)
                pred_valid.append(chainer.cuda.to_cpu(model.predict(x).data))
        pred_valid = np.concatenate(pred_valid, axis=0)
        date_feats = valid.date_feats
        pred = (date_feats * pred_valid).sum(axis=1)
        pred = np.round(pred)
        # pred = np.zeros(pred_valid.shape[0])
        # for i in range(pred_valid.shape[0]):
        #     pred[i] = date_feats[i, np.argmax(pred_valid[i])]
        pred = pred.reshape((-1, 62))
        assert len(pred) == len(valid.page)
        pred_df = pd.DataFrame(pred, columns=pd.date_range('2017-09-13', '2017-11-13'), index=valid.page)
        pred_df = pred_df.reset_index()
        pred_df.to_hdf(om.get_path() + 'pred_df.h5', 'tables', complevel=9, complib='blosc')