Guest User

Untitled

a guest
Nov 18th, 2017
81
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.64 KB | None | 0 0
  1. import numpy as np
  2. import pandas as pd
  3. from sklearn.feature_extraction.text import CountVectorizer
  4.  
  5. data = {
  6. 'label': [0, 1, 0, 1],
  7. 'text': ['first bit of text', 'second bit of text', 'third text', 'text number four']
  8. }
  9. data = pd.DataFrame.from_dict(data)
  10.  
  11. # Form vocab dictionary
  12. vectorizer = CountVectorizer()
  13. vectorizer.fit_transform(data['text'].tolist())
  14. vocab_text = vectorizer.vocabulary_
  15.  
  16. # Convert text
  17. def convert_text(text):
  18. text_list = text.split(' ')
  19. return [vocab_text[t]+1 for t in text_list]
  20.  
  21. data['text'] = data['text'].apply(convert_text)
  22.  
  23. # Get X and y matrices
  24. y = np.array(data['label'])
  25. X = np.array(data['text'])`
Add Comment
Please, Sign In to add comment