Advertisement
Guest User

Untitled

a guest
Apr 23rd, 2014
53
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.41 KB | None | 0 0
  import re
  from collections import Counter

  # NOTE(review): `c` is not defined anywhere in this snippet; it must already
  # exist before this code runs. Judging by the sample output below it is
  # presumably a list of lower-cased company-name strings -- confirm.

  # Number of most frequent tokens that are treated as noise and stripped.
  TOP_N = 6

  # Peek at a slice of the input. The parenthesised form prints the same thing
  # on Python 2 and Python 3.
  print(c[25:50])
  # Output recorded with the original snippet:
  # ['aluminum co of america', 'aluminum co of america',
  #  'aluminum co of america', 'aluminum company of america',
  #  'aluminum company of america', 'aluminum co of america',
  #  'aluminum company of america', 'aluminum company of america',
  #  'asset acceptance capital corp.', 'asset acceptance capital corp.',
  #  'asset acceptance capital corp.', 'asset acceptance capital corp.',
  #  'asset acceptance capital corp.', 'asset acceptance capital corp.',
  #  'asset acceptance capital corp.', 'asset acceptance capital corp.',
  #  'ace cash express, inc.', 'ace cash express, inc.',
  #  'airtran holdings, inc.', 'airtran holdings, inc.',
  #  'airtran holdings, inc.', 'airtran holdings, inc.',
  #  'airtran holdings, inc.', 'airtran holdings, inc.',
  #  'airtran holdings, inc.']

  # First approach: flatten every name into whitespace-separated tokens, then
  # tally them.
  r = [token for e in c for token in e.split()]
  count = Counter(r)
  # Excerpt of `count` recorded with the original snippet:
  # {'inc.': 18670, 'corporation': 9255, 'company': 2632, 'group,': 1190,
  #  '&': 1158, 'financial': 1025}

  # Same tally, fed from a generator expression so the intermediate list is
  # not needed.
  count = Counter(item for e in c for item in e.split())

  # The TOP_N most frequently used tokens.
  words = {item for item, _ in count.most_common(TOP_N)}

  # Token-based clean-up: drop the frequent tokens from each entry of `c` and
  # re-join the remainder with single spaces. The result is bound to a name so
  # it is not thrown away when this runs as a script.
  filtered = [
      " ".join(item for item in e.split() if item not in words) for e in c
  ]

  # Regex-based clean-up producing the same result as `filtered`.
  #  * Only the frequent `words` are removed (not every key of `count`).
  #  * Each token is escaped, so the '.' in 'inc.' is matched literally.
  #  * Whitespace lookarounds are used instead of \b because tokens such as
  #    '&' and 'group,' start or end with non-word characters; they also stop
  #    a token from matching inside a longer one.
  p = re.compile(
      r"(?<!\S)(?:" + "|".join(re.escape(word) for word in words) + r")(?!\S)"
  )
  # Removing a token leaves its surrounding spaces behind; split/join collapses
  # them and trims the ends.
  cleaned = [" ".join(p.sub("", item).split()) for item in c]
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement