Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import numpy as np
- import pandas as pd
- import seaborn as sn
- from scipy import stats
- #import missingno as msno
- from datetime import datetime
- import matplotlib.pyplot as plt
- import os
- import sys
- import codecs
- import nltk
- from nltk.corpus import stopwords
- from sklearn.feature_extraction.text import CountVectorizer
# English stop words from the NLTK stopwords corpus, held as a set.
default_stopwords = set(stopwords.words('english'))
def get_words(df):
    """Join every entry of the ``description`` column into one string.

    Args:
        df: A DataFrame (or any mapping) whose ``'description'`` entry is an
            iterable of text values.

    Returns:
        All non-null descriptions separated by single spaces, or an empty
        string when there are none.
    """
    # Null entries (None/NaN) would make str.join raise TypeError, so skip them.
    return ' '.join(text for text in df['description'] if pd.notnull(text))
# Load the training records; the path is resolved relative to the current
# working directory.
data_df = pd.read_json("../data/train.json")

# Collapse every description into one space-separated document.
wordbag = get_words(data_df)

# Split the combined text into tokens with NLTK's tokenizer.
words = nltk.word_tokenize(wordbag)

# Fit a word-level vectorizer on the single combined document, using
# scikit-learn's built-in English stop-word list.
vec = CountVectorizer(analyzer=u'word', stop_words="english").fit([wordbag])

# The parenthesized form is valid under both Python 2 and Python 3.
print(vec)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement