Not a member of Pastebin yet? Sign up — it unlocks many cool features!
# Load the comment corpus (one row per rev_id) and the per-annotator scores.
# BUG FIX: TSV files need sep='\t'; the original sep='t' split each line on the
# letter "t", mangling every column.
comments = pd.read_csv('set1.tsv', sep='\t', index_col=0)
annotations = pd.read_csv('set2.tsv', sep='\t')

# Average all annotators' scores for each revision to get one label per comment.
labels = annotations.groupby('rev_id')['score'].mean()
labels = pd.DataFrame(data=labels.values, columns=['score'])

# Rescale the mean scores into [0, 1], then invert so a high label corresponds
# to a low raw score.
scaler = MinMaxScaler(feature_range=(0, 1))
labels = 1 - (scaler.fit_transform(labels))

# NOTE(review): this assigns by position, assuming the row order of `comments`
# matches the groupby('rev_id') order (sorted rev_id) — confirm the two files
# list the same rev_ids in the same sorted order.
comments['score'] = labels

# The corpus escapes newlines and tabs as literal tokens; restore them as spaces.
comments['comment'] = comments['comment'].apply(lambda x: x.replace("NEWLINE_TOKEN", " "))
comments['comment'] = comments['comment'].apply(lambda x: x.replace("TAB_TOKEN", " "))
comments = comments.reset_index()
# Vocabulary / sequence-length hyperparameters.
vocab_size = 20000
max_length = 300  # max length (in token ids) of a comment fed to the model

# Fit the Keras tokenizer on the full comment corpus.
tok = Tokenizer(num_words=vocab_size)
tok.fit_on_texts(comments['comment'])

# The corpus ships with a predefined train/test split column.
train = comments.query("split=='train'")
test = comments.query("split=='test'")

# BUG FIX: the original called tok.texts_to_matrix(train[['comment']]).
# Iterating a DataFrame yields its column NAMES, so the tokenizer saw the
# literal string 'comment' instead of the texts. Additionally,
# texts_to_matrix returns a (n_samples, num_words) bag-of-words matrix,
# which pad_sequences(maxlen=300) then truncated to the first 300 vocabulary
# columns, silently discarding most of the vocabulary. texts_to_sequences on
# the Series yields per-comment token-id lists, which is what pad_sequences
# expects.
x_train = pad_sequences(tok.texts_to_sequences(train['comment']), maxlen=max_length, padding='post')
y_train = np.array([[float(d)] for d in train[['score']].pop('score')]).T
x_test = pad_sequences(tok.texts_to_sequences(test['comment']), maxlen=max_length, padding='post')
# NOTE(review): the trailing .T makes y_train/y_test shape (1, n); Keras
# models usually expect (n, 1) — confirm against the model's fit() call.
y_test = np.array([[float(d)] for d in test[['score']].pop('score')]).T
Add Comment
Please sign in to add a comment.