Guest User

Untitled

a guest
Mar 18th, 2018
110
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.18 KB | None | 0 0
  1. comments = pd.read_csv('set1.tsv', sep = 't', index_col = 0)
  2. annotations = pd.read_csv('set2.tsv', sep = 't')
  3. labels = annotations.groupby('rev_id')['score'].mean()
  4. labels = pd.DataFrame(data=labels.values, columns=['score'])
  5. scaler = MinMaxScaler(feature_range=(0, 1))
  6. labels = 1-(scaler.fit_transform(labels))
  7. comments['score'] = labels
  8. comments['comment'] = comments['comment'].apply(lambda x: x.replace("NEWLINE_TOKEN", " "))
  9. comments['comment'] = comments['comment'].apply(lambda x: x.replace("TAB_TOKEN", " "))
  10. comments = comments.reset_index()
  11. vocab_size = 20000
  12. max_length = 300 # max length of the comments in the model
  13. tok = Tokenizer(num_words=vocab_size) # make the Keras tokenizer
  14. tok.fit_on_texts(comments['comment']) # fit it to comments
  15. train = comments.query("split=='train'")
  16. test = comments.query("split=='test'")
  17. x_train = pad_sequences(tok.texts_to_matrix(train[['comment']], mode='count'), maxlen=max_length, padding='post')
  18. y_train = np.array([[float(d)] for d in train[['score']].pop('score')]).T
  19. x_test = pad_sequences(tok.texts_to_matrix(test[['comment']], mode='count'), maxlen=max_length, padding='post')
  20. y_test = np.array([[float(d)] for d in test[['score']].pop('score')]).T
Add Comment
Please, Sign In to add comment