Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- sample_train_data = ['Dispute delays National Assembly formation process',
- 'Country is looking to encourage entrepreneurs and startup process',
- 'Airline fuel surcharges to go up from Tuesday']
- from sklearn.feature_extraction.text import CountVectorizer
- # Instantiate the vectorizer with its default settings (no arguments are passed).
- vec = CountVectorizer()
- # Learn the vocabulary from the three training sentences; only fit() is called here, so nothing is transformed yet.
- vec.fit(sample_train_data)
- # Output of the fit() call above (repr of the fitted CountVectorizer):
- # CountVectorizer(analyzer='word', binary=False, decode_error='strict',
- # dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
- # lowercase=True, max_df=1.0, max_features=None, min_df=1,
- # ngram_range=(1, 1), preprocessor=None, stop_words=None,
- # strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
- # tokenizer=None, vocabulary=None)
Add Comment
Please, Sign In to add comment