Not a member of Pastebin yet?
Sign up — it unlocks many cool features!
from sussex_nltk.corpus_readers import MedlineCorpusReader, ReutersCorpusReader, TwitterCorpusReader


def _count_stop_vs_other(token_lists, stopset):
    """Return ``(stop_count, other_count)`` over all tokens in *token_lists*.

    *token_lists* is any iterable of token sequences; *stopset* is the
    collection of stop words tested with ``in`` (a set gives O(1) lookups).
    """
    stops = 0
    others = 0
    for tokens in token_lists:
        for token in tokens:
            if token in stopset:
                stops += 1
            else:
                others += 1
    return stops, others


# NOTE(review): `word_tokenize` and `stop` are not defined in this chunk —
# presumably `from nltk import word_tokenize` and an NLTK stop-word set are
# provided earlier in the file; confirm before running standalone.
readers = [MedlineCorpusReader(), ReutersCorpusReader(), TwitterCorpusReader()]
sample_size = 10000  # raw sentences sampled from each corpus

for reader in readers:
    # Lowercase each sampled sentence, then tokenize it lazily.
    sentences = (word_tokenize(s.lower()) for s in reader.sample_raw_sents(sample_size))
    stops, others = _count_stop_vs_other(sentences, stop)
    total = stops + others
    # Guard an empty sample: the original divided by zero when total == 0.
    if total:
        print(f"{round(stops / total * 100, 2)}% were stop words with a total of {stops}/{total}")
    else:
        print("No tokens sampled; cannot compute stop-word ratio")
Advertisement
Add Comment
Please sign in to add a comment.