Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Build a bag-of-words vocabulary from the training corpus and save it as JSON.
import json

import numpy
import pandas
from sklearn.feature_extraction.text import CountVectorizer

corpus_path = 'data/training/training-data.csv'

# Prepare training data for the bag-of-words model (corpus).
# The CSV is read without a header row; column 0 supplies the documents.
dataframe = pandas.read_csv(corpus_path, header=None)
X_training = dataframe[0].tolist()
sentences = numpy.array(X_training)

# Create the bag-of-words vocabulary. Only the fitted vocabulary is needed,
# so fit() is used instead of building (and discarding) a dense
# document-term matrix.
vectorizer = CountVectorizer()
vectorizer.fit(sentences)

# Save the vocabulary (term -> column index) to a JSON file. Indices are cast
# to plain int because some scikit-learn versions store them as NumPy
# integers, which the json module cannot serialize.
vocabulary = {term: int(index) for term, index in vectorizer.vocabulary_.items()}
with open('vocabulary.json', 'w') as vocabFile:
    json.dump(vocabulary, vocabFile)

# Parenthesized single-argument form works on both Python 2 and Python 3.
print("vocabulary is saved")
Add Comment
Please, Sign In to add comment