[hackerrank] ML text processing warmup

import sys, re, collections

# Full English month names joined with "|" (usable as a regex alternation).
months_str = "|".join((
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
))
# The same months, each cut down to its first three letters.
months_short = "|".join(month[:3] for month in months_str.split("|"))

def find_dates(test):
    """Count the date expressions found in ``test``.

    Three layouts are recognised, ignoring case:

    * day-first textual dates: ``15 March 2011``, ``2nd of Mar, 11``
    * numeric dates: ``10/10/10``, ``2011-03-15``, ``15.03.2011``
    * month-first textual dates: ``March 15th, 2011``, ``Jan 5 2011``

    Month names may be spelled out or cut to three letters, days may carry
    an ordinal suffix, and years have two or four digits.

    Args:
        test: The text to scan.

    Returns:
        The number of non-overlapping date matches in ``test``.
    """
    month_names = ("January|February|March|April|May|June|July|August|"
                   "September|October|November|December")
    abbreviations = "|".join(name[:3] for name in month_names.split("|"))

    fields = {
        'mon': r"(?:%s|%s)" % (month_names, abbreviations),
        # The ordinal suffix is optional; "nd" covers 2nd / 22nd.
        'day': r"\d{1,2}(?:st|nd|rd|th)?",
        'year': r"(?:\d{4}|\d{2})",
    }

    # e.g. "15 March 2011", "2nd of Mar, 11"
    day_first = r"\b%(day)s(?: of)? %(mon)s,? ?%(year)s\b" % fields
    # e.g. "10/10/10", "2011-03-15", "15.03.2011"
    numeric = r"\b(?:\d{4}|\d{2})[-./]\d{2}[-./](?:\d{4}|\d{2})\b"
    # e.g. "March 15th, 2011".  A comma and/or space is required between the
    # day and the year; without it "March 2011" would be read as day 20,
    # year 11.
    month_first = r"\b%(mon)s %(day)s(?:, ?| )%(year)s\b" % fields

    # The layouts are merged into one alternation and scanned in a single
    # pass, so a stretch of text can never be counted by two layouts.
    any_date = re.compile("|".join((day_first, numeric, month_first)),
                          re.IGNORECASE)
    return sum(1 for _ in any_date.finditer(test))

def main(Testcases):
    """Print article and date counts for every test case.

    For each string in ``Testcases`` four lines are printed, in this order:
    the number of occurrences of the words "a", "an" and "the" (whole words
    only, any letter case) followed by the number of dates reported by
    ``find_dates``.

    Args:
        Testcases: Iterable of text strings to analyse.
    """
    article_re = re.compile(r'\b(?:the|an|a)\b', re.IGNORECASE)

    for testcase in Testcases:
        # Fold case before tallying so "The" / "THE" land under "the";
        # the pattern itself matches case-insensitively.
        article_counts = collections.Counter(
            word.lower() for word in article_re.findall(testcase))

        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(article_counts['a'])
        print(article_counts['an'])
        print(article_counts['the'])
        print(find_dates(testcase))


def simplify_string(S):
    """Return ``S`` with each space replaced by a dot and a dot added at both ends."""
    dotted = ".".join(S.split(" "))
    return "".join((".", dotted, "."))

if __name__ == '__main__':
    # Input layout: the first line holds the number of test cases T, then
    # one test case per line.  The context manager closes the file even if
    # parsing T fails.
    with open("text_warmup.txt") as fp:
        T = int(fp.readline())
        Testcases = [fp.readline() for i in range(T)]
    # Inline sample for quick manual runs:
    #Testcases = ["an the THE,tHethE.The An, anan 10/10/10"]

    main(Testcases)