Advertisement
Guest User

Untitled

a guest
Feb 26th, 2017
70
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.62 KB | None | 0 0
  1. import numpy as np
  2. import pandas as pd
  3. import seaborn as sn
  4. from scipy import stats
  5. #import missingno as msno
  6. from datetime import datetime
  7. import matplotlib.pyplot as plt
  8. import os
  9. import sys
  10. import codecs
  11. import nltk
  12. from nltk.corpus import stopwords
  13. from sklearn.feature_extraction.text import CountVectorizer
  14.  
  15. default_stopwords = set(nltk.corpus.stopwords.words('english'))
  16.  
  17. def get_words(df):
  18. return (' '.join(df['description']))
  19.  
  20. data_df = pd.read_json("../data/train.json")
  21. wordbag = get_words(data_df)
  22.  
  23. words = nltk.word_tokenize(
  24.  
  25.  
  26. vec = CountVectorizer(analyzer=u'word', stop_words="english").fit([wordbag])
  27. print vec
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement