Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import tensorflow as tf
- import numpy as np
- import pickle
def _load_pickle(path):
    """Return the object stored in the pickle file at *path*.

    The file is opened in a ``with`` block so the handle is closed as soon
    as loading finishes instead of being left to the garbage collector.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as fh:
        return pickle.load(fh)


def subtract(y):
    """Convert a label to ``int`` and shift it down by one."""
    return int(y) - 1


train_x = _load_pickle('data/train_x.pkl')
train_y = _load_pickle('data/train_y.pkl')
val_x = _load_pickle('data/val_x.pkl')
val_y = _load_pickle('data/val_y.pkl')
test_x = _load_pickle('data/test_x.pkl')

# Per the original author's note, the pickle files store every value as a
# string, so each token id is converted to int here.
train_x = [[int(x) for x in t] for t in train_x]
val_x = [[int(x) for x in v] for v in val_x]
test_x = [[int(x) for x in t] for t in test_x]

# Labels are shifted down by one because (per the original note) the
# network's output classes start at 0; predictions on the test set must
# therefore have 1 added back.
train_y = [subtract(y) for y in train_y]
val_y = [subtract(y) for y in val_y]

# Id used for padding. Per the original note, the ids seen in the train and
# valid sets span 0..1279999, so 1280000 does not collide with them.
# NOTE(review): the note does not mention the test set -- confirm that its
# ids fall inside the same range.
PAD = 1280000

# Every document is truncated / padded to this many tokens. (The original
# comment said "first 800 words", which did not match the value 1000.)
padding_length = 1000
def _pad_with_mask(ids, length, pad_id):
    """Truncate or right-pad *ids* to exactly *length* entries.

    Args:
        ids: sequence of integer token ids (may be empty).
        length: number of entries in the returned arrays.
        pad_id: value written into the positions after the real tokens.

    Returns:
        A ``(padded_ids, mask)`` pair of int64 arrays of size *length*;
        ``mask`` is 1 where ``padded_ids`` holds a real token and 0 where it
        holds ``pad_id``.
    """
    # The explicit int64 dtype matters for an empty document: NumPy would
    # otherwise infer float64, and Int64List rejects float values.
    kept = np.asarray(ids[:length], dtype=np.int64)
    padded = np.full(length, pad_id, dtype=np.int64)
    padded[:len(kept)] = kept
    mask = np.zeros(length, dtype=np.int64)
    mask[:len(kept)] = 1
    return padded, mask


def _write_tfrecord(path, sequences, labels, length, pad_id):
    """Serialize every (sequence, label) pair into a TFRecord file at *path*.

    Each record is a ``tf.train.Example`` with three int64 features:
    ``'x'`` (padded token ids), ``'mask'`` (1 for real tokens, 0 for
    padding) and ``'y'`` (a single label).
    """
    with tf.python_io.TFRecordWriter(path) as writer:
        for ids, label in zip(sequences, labels):
            x, mask = _pad_with_mask(ids, length, pad_id)
            example = tf.train.Example(features=tf.train.Features(feature={
                'x': tf.train.Feature(int64_list=tf.train.Int64List(value=x)),
                'mask': tf.train.Feature(int64_list=tf.train.Int64List(value=mask)),
                'y': tf.train.Feature(int64_list=tf.train.Int64List(value=[label])),
            }))
            writer.write(example.SerializeToString())


_write_tfrecord('./data/train.tfrecord', train_x, train_y, padding_length, PAD)
_write_tfrecord('./data/valid.tfrecord', val_x, val_y, padding_length, PAD)
# No labels are loaded for the test set, so every record gets the constant
# placeholder 1 in 'y', which keeps the record schema identical to the
# train / valid files.
_write_tfrecord('./data/test.tfrecord', test_x, [1] * len(test_x), padding_length, PAD)
Add Comment
Please, Sign In to add comment