# Untitled

#!/usr/bin/env python3

import html
import re
import sys
from collections import Counter
from typing import Iterable, List

# File read when no path is given on the command line.
DEFAULT_INPUT = "input.html"

# Inner HTML of every <p ... tweet-text ...> element.  `[\s\S]*?` lets the
# body span newlines with a single character class; an alternation such as
# `(.|\s)` overlaps on spaces/tabs and can backtrack exponentially when a
# paragraph is never closed.
TWEET_RE = re.compile(r'<p.*?tweet-text.*?>([\s\S]*?)</p>')

# Whole anchor elements (mentions, hashtags, URLs).  `\b` keeps other tags
# that merely start with "a" (e.g. <abbr>) from being treated as links, and
# DOTALL lets an anchor that wraps across lines be removed too.
LINK_RE = re.compile(r"<a\b.*?</a>", re.DOTALL)

# Any remaining tag, including ones that wrap across lines.
TAG_RE = re.compile(r"<.*?>", re.DOTALL)

# Run of ASCII letters/digits (possibly empty) followed by at least one
# non-word character and then "brain" in any letter case.
BRAIN_PREFIX_RE = re.compile(r"([a-zA-Z0-9]*?)\W+?[bB][rR][aA][iI][nN]")


def extract_tweets(content: str) -> List[str]:
    """Return the inner HTML of every ``tweet-text`` paragraph in *content*."""
    return TWEET_RE.findall(content)


def strip_markup(tweet: str) -> str:
    """Return *tweet* as plain text.

    Anchor elements are dropped together with their text, every other tag is
    removed while its text is kept, and HTML entities are decoded last so
    that escaped text such as ``&amp;`` is not mistaken for a word.
    """
    without_links = LINK_RE.sub("", tweet)
    without_tags = TAG_RE.sub("", without_links)
    return html.unescape(without_tags)


def find_brain_prefixes(text: str) -> List[str]:
    """Return, lower-cased, each word that directly precedes "brain" in *text*.

    Occurrences of "brain" that are preceded only by punctuation or
    whitespace produce an empty capture; those are skipped so they do not
    show up as a blank entry in the tally.
    """
    return [word.lower() for word in BRAIN_PREFIX_RE.findall(text) if word]


def count_brain_prefixes(tweets: Iterable[str]) -> Counter:
    """Tally the words preceding "brain" across raw tweet HTML fragments."""
    counts = Counter()
    for tweet in tweets:
        counts.update(find_brain_prefixes(strip_markup(tweet)))
    return counts


def main(path: str = DEFAULT_INPUT) -> None:
    """Read the HTML file at *path* and print the tweet count and the tally.

    Output is one ``tweets found: N`` line followed by one ``word: count``
    line per distinct word, most frequent first.
    """
    # Explicit encoding so decoding does not depend on the platform locale.
    with open(path, encoding="utf-8") as inp_file:
        content = inp_file.read()

    tweets = extract_tweets(content)
    print("tweets found:", len(tweets))

    for key, count in count_brain_prefixes(tweets).most_common():
        print(key, ": ", count, sep='')


if __name__ == "__main__":
    main(sys.argv[1] if len(sys.argv) > 1 else DEFAULT_INPUT)