Guest User

Untitled

a guest
Feb 21st, 2018
81
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.62 KB | None | 0 0
  from sklearn.feature_extraction.text import CountVectorizer
  import json
  import pandas
  import numpy

  corpus_path = 'data/training/training-data.csv'

  # Prepare the training data for the bag-of-words model (the corpus).
  # The CSV is read with header=None, so the first row is treated as data and
  # the sentences are taken from the column labelled 0.
  dataframe = pandas.read_csv(corpus_path, header=None)
  X_training = dataframe[0].tolist()
  sentences = numpy.array(X_training)

  # Create the bag-of-words vocabulary. Only the fitted vocabulary is used
  # below, so the document-term matrix is not computed into a dense array
  # just to be thrown away.
  vectorizer = CountVectorizer()
  vectorizer.fit(sentences)

  # Save the vocabulary to a JSON file. The indices are converted to plain
  # ints first because they may be NumPy integers, which the json module
  # cannot serialise on Python 3.
  vocabulary = {
      term: int(index) for term, index in vectorizer.vocabulary_.items()
  }
  with open('vocabulary.json', 'w') as vocabFile:
      json.dump(vocabulary, vocabFile)

  # Parenthesised single-argument form prints the same text on Python 2 and 3.
  print("vocabulary is saved")
Add Comment
Please, Sign In to add comment