Guest User

Untitled

a guest
Jul 22nd, 2018
68
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.74 KB | None | 0 0
  1. #!/usr/bin/env python3
  2.  
  3. import re
  4.  
  5. with open("input.html") as inp_file:
  6. content = inp_file.read()
  7.  
  8. pattern = r'<p.*?tweet-text.*?>((.|\s)*?)<\/p>';
  9.  
  10. tweets = [m[0] for m in re.findall(pattern, content)]
  11. tweets_wo_links = [re.sub(r"<a.*?(</a>)", '', tweet) for tweet in tweets]
  12. tweets_wo_tags = [re.sub(r"<.*?>", '', tweet) for tweet in tweets_wo_links]
  13.  
  14. print("tweets found:", len(tweets))
  15.  
  16. brain_lists = [re.findall(r"([a-zA-Z0-9]*?)\W+?[bB][rR][aA][iI][nN]", tweet) for tweet in tweets_wo_tags]
  17.  
  18. brains = [brain for brain_list in brain_lists for brain in brain_list]
  19. brains_lcase = [brain.lower() for brain in brains]
  20.  
  21. from collections import Counter
  22. c = Counter(brains_lcase)
  23. for (key,count) in c.most_common():
  24. print(key, ": ", count, sep='')
Add Comment
Please, Sign In to add comment