Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- print c[25:50]
- ['aluminum co of america', 'aluminum co of america', 'aluminum co of america', 'aluminum company of america', 'aluminum company of america', 'aluminum co of america', 'aluminum company of america', 'aluminum company of america', 'asset acceptance capital corp.', 'asset acceptance capital corp.', 'asset acceptance capital corp.', 'asset acceptance capital corp.', 'asset acceptance capital corp.', 'asset acceptance capital corp.', 'asset acceptance capital corp.', 'asset acceptance capital corp.', 'ace cash express, inc.', 'ace cash express, inc.', 'airtran holdings, inc.', 'airtran holdings, inc.', 'airtran holdings, inc.', 'airtran holdings, inc.', 'airtran holdings, inc.', 'airtran holdings, inc.', 'airtran holdings, inc.']
- from collections import Counter
- r=[]
- for e in c:
- r.extend(e.split())
- count=Counter(r)
- {'inc.': 18670, 'corporation': 9255, 'company': 2632, 'group,': 1190, '&': 1158, 'financial': 1025}
- # Using Generator Expression with `Counter` to speed it up a little bit
- from collections import Counter
- count = Counter(item for e in c for item in e.split())
- # Get most frequently used words
- words = {item for item, cnt in count.most_common(6)}
- # filter the `words` in `c` and reconstruct the sentences in `c`
- [" ".join([item for item in e.split() if item not in words]) for e in c]
- import re
- p = re.compile(' |'.join(word for word in count))
- cleaned = [p.sub('', item) for item in c]
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement