Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import sys, re, collections
# Full English month names, pipe-separated so the string can be dropped
# straight into a regular-expression alternation.
months_str = "January|February|March|April|May|June|July|August|September|October|November|December"
# Three-letter abbreviation of every month above, in the same pipe-separated
# layout ("Jan|Feb|...|Dec").
months_short = "|".join(full_name[:3] for full_name in months_str.split("|"))
def _build_date_regex():
    """Compile a single case-insensitive pattern for every supported date layout."""
    months_str = "January|February|March|April|May|June|July|August|September|October|November|December"
    months_short = "|".join(name[:3] for name in months_str.split("|"))
    parts = {
        # One or two digits with an optional ordinal suffix ("2", "2nd", "15th").
        "day": r"\d{1,2}(?:st|nd|rd|th)?",
        # Full month name or its three-letter abbreviation.
        "mon": "(?:%s|%s)" % (months_str, months_short),
        # Two- or four-digit year; the trailing \b in each layout rejects
        # three-digit or longer runs.
        "year": r"(?:\d{2}|\d{4})",
    }
    # "15th March 1999", "15 Mar 99", "20th of March, 1999"
    day_month_year = r"\b%(day)s(?: of | )%(mon)s,? ?%(year)s\b" % parts
    # "March 15, 1999", "Mar 15 99".  A comma or space is required between
    # day and year; otherwise "January 2020" would be read as day 20, year 20.
    month_day_year = r"\b%(mon)s %(day)s(?:, ?| )%(year)s\b" % parts
    # "15/11/2012", "15-11-12", "2012.11.15"
    numeric = r"\b(?:\d{2}|\d{4})[-./]\d{2}[-./](?:\d{2}|\d{4})\b"
    layouts = (day_month_year, month_day_year, numeric)
    return re.compile("|".join("(?:%s)" % layout for layout in layouts),
                      re.IGNORECASE)


# Compiled once at import time instead of on every find_dates() call.
_DATE_RE = _build_date_regex()


def find_dates(test):
    """Return the number of dates written in the string ``test``.

    Three layouts are recognised, case-insensitively:

    * day, month name, year  -- "15th March 1999", "20th of March, 1999"
    * month name, day, year  -- "March 15, 1999"
    * purely numeric         -- "15/11/2012", "15-11-12", "15.11.2012"

    All layouts are tried as one alternation and only non-overlapping matches
    are counted, so a given stretch of text contributes at most one date.
    Nothing is printed.

    Args:
        test: the text to scan.

    Returns:
        The count of date occurrences as an int (0 when none are found).
    """
    return len(_DATE_RE.findall(test))
def main(Testcases):
    """Print article and date counts for every test case.

    For each string in ``Testcases`` four lines are written to stdout, in
    this order: the number of occurrences of the word "a", of "an", of
    "the" (all matched as whole words, ignoring case), and the number of
    dates reported by ``find_dates``.

    Args:
        Testcases: iterable of text strings to analyse.
    """
    compiled_regex = re.compile(r'(\bthe\b|\ban\b|\ba\b)', re.IGNORECASE)
    for testcase in Testcases:
        # The pattern ignores case, so fold every hit to lower case before
        # counting; otherwise "The" and "THE" would land under their own
        # keys and never be included in the 'the' total printed below.
        article_counts = collections.Counter(
            m.lower() for m in compiled_regex.findall(testcase)
        )
        # print(x) with a single argument behaves the same on Python 2 and 3.
        print(article_counts['a'])
        print(article_counts['an'])
        print(article_counts['the'])
        print(find_dates(testcase))
def simplify_string(S):
    """Return ``S`` with every space turned into a dot, wrapped in dots.

    Example: "a b" becomes ".a.b.".
    """
    # Splitting on a literal space and re-joining with dots swaps each
    # individual space for a dot, including consecutive ones.
    dotted = ".".join(S.split(" "))
    return "." + dotted + "."
if __name__ == '__main__':
    # Input layout: the first line holds the number of test cases T, and each
    # of the next T lines is one test case.
    # The with-block closes the file even if parsing T raises.
    with open("text_warmup.txt") as fp:
        T = int(fp.readline())
        Testcases = [fp.readline() for _ in range(T)]
    main(Testcases)
Add Comment
Please, Sign In to add comment