Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: four short texts, each paired with a 0/1 label.
data = {
    'label': [0, 1, 0, 1],
    'text': ['first bit of text', 'second bit of text', 'third text', 'text number four'],
}
data = pd.DataFrame.from_dict(data)

# Form vocab dictionary
vectorizer = CountVectorizer()
# Only the learned vocabulary is needed, so fit() is enough; the count
# matrix that fit_transform() would also build was never used.
vectorizer.fit(data['text'].tolist())
vocab_text = vectorizer.vocabulary_
# Reuse the vectorizer's own preprocessing + tokenisation so the tokens
# looked up below are exactly the ones the vocabulary was built from.
_analyze = vectorizer.build_analyzer()


# Convert text
def convert_text(text):
    """Encode *text* as a list of 1-based vocabulary indices.

    The text is tokenised with the fitted vectorizer's analyzer rather
    than a plain ``split(' ')``, so differences in case, punctuation,
    repeated spaces, or tokens the vectorizer drops no longer raise
    ``KeyError``. Tokens that are not in ``vocab_text`` are skipped.

    Each index is shifted by one, so 0 is never produced (presumably
    reserved for padding -- confirm against the downstream model).

    Args:
        text: The raw string to encode.

    Returns:
        A list of ints, one per in-vocabulary token, in token order.
    """
    return [vocab_text[token] + 1 for token in _analyze(text) if token in vocab_text]


data['text'] = data['text'].apply(convert_text)

# Get X and y matrices
y = np.array(data['label'])
# NOTE(review): the encoded rows have different lengths, so X is expected
# to be a 1-D object array of Python lists, not a 2-D numeric matrix --
# pad the sequences first if a rectangular array is required.
X = np.array(data['text'])
Add Comment
Please, Sign In to add comment