Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- >>> from itertools import chain
- >>> import pandas as pd
- >>> from nltk import word_tokenize
- >>> from nltk import FreqDist
- >>> from nltk import everygrams
- >>> df = pd.read_csv('x')
- >>> df['Description']
- 0 Here is a sentence.
- 1 This is a foo bar sentence.
- Name: Description, dtype: object
- >>> df['Description'].map(word_tokenize)
- 0 [Here, is, a, sentence, .]
- 1 [This, is, a, foo, bar, sentence, .]
- Name: Description, dtype: object
- >>> sents = df['Description'].map(word_tokenize).tolist()
- >>> FreqDist(list(chain(*[everygrams(sent, 1, 3) for sent in sents])))
- FreqDist({('sentence',): 2, ('is', 'a'): 2, ('sentence', '.'): 2, ('is',): 2, ('.',): 2, ('a',): 2, ('Here', 'is', 'a'): 1, ('a', 'foo'): 1, ('a', 'sentence'): 1, ('bar', 'sentence', '.'): 1, ...})
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement