Not a member of Pastebin yet? Sign up — it unlocks many cool features!
# Load the comment corpus (one row per rev_id) and the per-annotator scores.
# BUG FIX: TSV files need sep='\t'; the original sep='t' split each line on the
# letter "t", mangling every column.
comments = pd.read_csv('set1.tsv', sep='\t', index_col=0)
annotations = pd.read_csv('set2.tsv', sep='\t')

# Average all annotators' scores for each revision to get one label per comment.
labels = annotations.groupby('rev_id')['score'].mean()
labels = pd.DataFrame(data=labels.values, columns=['score'])

# Rescale the mean scores into [0, 1], then invert so a high label corresponds
# to a low raw score.
scaler = MinMaxScaler(feature_range=(0, 1))
labels = 1 - (scaler.fit_transform(labels))

# NOTE(review): this assigns by position, assuming the row order of `comments`
# matches the groupby('rev_id') order (sorted rev_id) — confirm the two files
# list the same rev_ids in the same sorted order.
comments['score'] = labels

# The corpus escapes newlines and tabs as literal tokens; restore them as spaces.
comments['comment'] = comments['comment'].apply(lambda x: x.replace("NEWLINE_TOKEN", " "))
comments['comment'] = comments['comment'].apply(lambda x: x.replace("TAB_TOKEN", " "))
comments = comments.reset_index()
# Vocabulary / sequence-length hyperparameters.
vocab_size = 20000
max_length = 300  # max length (in token ids) of a comment fed to the model

# Fit the Keras tokenizer on the full comment corpus.
tok = Tokenizer(num_words=vocab_size)
tok.fit_on_texts(comments['comment'])

# The corpus ships with a predefined train/test split column.
train = comments.query("split=='train'")
test = comments.query("split=='test'")

# BUG FIX: the original called tok.texts_to_matrix(train[['comment']]).
# Iterating a DataFrame yields its column NAMES, so the tokenizer saw the
# literal string 'comment' instead of the texts. Additionally,
# texts_to_matrix returns a (n_samples, num_words) bag-of-words matrix,
# which pad_sequences(maxlen=300) then truncated to the first 300 vocabulary
# columns, silently discarding most of the vocabulary. texts_to_sequences on
# the Series yields per-comment token-id lists, which is what pad_sequences
# expects.
x_train = pad_sequences(tok.texts_to_sequences(train['comment']), maxlen=max_length, padding='post')
y_train = np.array([[float(d)] for d in train[['score']].pop('score')]).T
x_test = pad_sequences(tok.texts_to_sequences(test['comment']), maxlen=max_length, padding='post')
# NOTE(review): the trailing .T makes y_train/y_test shape (1, n); Keras
# models usually expect (n, 1) — confirm against the model's fit() call.
y_test = np.array([[float(d)] for d in test[['score']].pop('score')]).T
Add Comment
Please sign in to add a comment.