Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/env python3
"""Count which words directly precede "brain" in tweets found in an HTML file.

The script reads ``input.html``, extracts the inner HTML of every ``<p>``
element whose opening tag mentions ``tweet-text``, drops links and remaining
markup, and then tallies (case-insensitively) the alphanumeric word that
appears immediately before each occurrence of "brain".
"""
import re
from collections import Counter

# Inner HTML of a <p ... tweet-text ...> element.  ``[\s\S]*?`` lets the body
# span several lines without the slow per-character ``(.|\s)`` alternation,
# and leaves a single capture group so ``findall`` returns plain strings.
_TWEET_RE = re.compile(r'<p.*?tweet-text.*?>([\s\S]*?)</p>')

# A complete <a>...</a> element, including its text.  DOTALL is needed so that
# anchors broken across lines are removed too; ``\b`` keeps tags such as
# <abbr> from being treated as links.
_LINK_RE = re.compile(r'<a\b.*?</a>', re.DOTALL)

# Any remaining tag; ``[^>]*`` (unlike ``.*?``) also matches across newlines.
_TAG_RE = re.compile(r'<[^>]*>')

# One alphanumeric word, at least one non-word character, then "brain" in any
# letter case.  The word must be non-empty so that a "brain" preceded only by
# punctuation or whitespace does not add an empty key to the tally.
_BRAIN_RE = re.compile(r'([a-zA-Z0-9]+)\W+[bB][rR][aA][iI][nN]')


def _extract_tweets(html):
    """Return the inner HTML of every tweet-text paragraph in *html*."""
    return _TWEET_RE.findall(html)


def _strip_markup(tweet):
    """Return *tweet* with whole link elements and all other tags removed."""
    without_links = _LINK_RE.sub('', tweet)
    return _TAG_RE.sub('', without_links)


def _words_before_brain(text):
    """Return the lower-cased words that directly precede "brain" in *text*."""
    return [word.lower() for word in _BRAIN_RE.findall(text)]


def count_brain_words(html):
    """Tally the words preceding "brain" across all tweets in *html*.

    Args:
        html: The HTML document text to scan.

    Returns:
        A ``(tweet_count, counts)`` tuple: the number of tweet paragraphs
        found, and a ``Counter`` mapping each lower-cased preceding word to
        the number of times it occurred.
    """
    tweets = _extract_tweets(html)
    counts = Counter(
        word
        for tweet in tweets
        for word in _words_before_brain(_strip_markup(tweet))
    )
    return len(tweets), counts


def main(path="input.html"):
    """Read *path* and print the tweet count followed by the word tally.

    The tally is printed most frequent first, one ``word: count`` per line.
    """
    # Explicit encoding so decoding does not depend on the platform locale.
    with open(path, encoding="utf-8") as inp_file:
        content = inp_file.read()

    tweet_count, counts = count_brain_words(content)
    print("tweets found:", tweet_count)
    for key, count in counts.most_common():
        print(key, ": ", count, sep='')


if __name__ == "__main__":
    main()
Add Comment
Please, Sign In to add comment