runnig

[hackerrank] ML text processing warmup

Jun 26th, 2013
162
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.48 KB | None | 0 0
  1. import sys, re, collections
  2.  
  3. months_str = "January|February|March|April|May|June|July|August|September|October|November|December"
  4. months_short = '|'.join(s[:3] for s in months_str.split("|"))
  5.  
  6. def find_dates(test):
  7.     months_str = "January|February|March|April|May|June|July|August|September|October|November|December"
  8.     months_short = '|'.join(s[:3] for s in months_str.split("|"))
  9.  
  10.     day = "(\d|\d\d)(th|st|rd)?"
  11.     of = "(\ |\ of\ )"
  12.     mon_fmt = "(%s|%s)" % (months_str, months_short)
  13.     year = "(\d{2}|\d{4})"
  14.     date1_re = r"(\b%(day)s%(of)s%(mon_fmt)s\,?\ ?%(year)s\b)" % {'day':day, 'of':of, 'mon_fmt':mon_fmt, 'year':year}
  15.     date1_compiled = re.compile(date1_re, re.IGNORECASE)
  16.  
  17.     date2_re = r"(\b(\d{2}|\d{4})[-./]\d{2}[-./](\d{2}|\d{4})\b)"
  18.     date2_compiled = re.compile(date2_re, re.IGNORECASE)
  19.  
  20.     date3_re = r"(\b%(day)s\ %(mon_fmt)s,?\ ?%(year)s\b)" % {'day':day, 'of':of, 'mon_fmt':mon_fmt, 'year':year}
  21.     date3_compiled = re.compile(date3_re, re.IGNORECASE)
  22.  
  23.     date4_re = r"(\b%(mon_fmt)s %(day)s,?\ ?%(year)s\b)" % {'day':day, 'of':of, 'mon_fmt':mon_fmt, 'year':year}
  24.     date4_compiled = re.compile(date4_re, re.IGNORECASE)
  25.  
  26.     num_dates = 0
  27.     print date1_re
  28.     for m in re.findall(date1_compiled, test):
  29.         num_dates += 1
  30.        
  31.     for m in re.findall(date2_compiled, test):
  32.         num_dates += 1
  33.        
  34.     for m in re.findall(date3_compiled, test):
  35.         num_dates += 1
  36.        
  37.     for m in re.findall(date4_compiled, test):
  38.         num_dates += 1        
  39.        
  40.     return num_dates
  41.        
  42. def main(Testcases):
  43.    
  44.     compiled_regex = re.compile(r'(\bthe\b|\ban\b|\ba\b)', re.IGNORECASE)
  45.    
  46.    
  47.     date_regex = re.compile(r'(\b\d{2}/\d{2}/\d{2}\b)', re.IGNORECASE)
  48.    
  49.  
  50.     for testcase in Testcases:
  51.         article_counts = collections.defaultdict(int)
  52.         num_dates = 0
  53.        
  54.         for m in re.findall(compiled_regex, testcase):        
  55.             article_counts[m] += 1
  56.             print m
  57.         num_dates += find_dates(testcase)
  58.         print article_counts['a']
  59.         print article_counts['an']
  60.         print article_counts['the']
  61.         print num_dates
  62.    
  63.            
  64.  
  65. def simplify_string(S):
  66.     return "." + S.replace(" ", ".") + "."
  67.        
  68. if __name__ == '__main__':
  69.     fp = open("text_warmup.txt")
  70.     T = int(fp.readline())
  71.     Testcases = [fp.readline() for i in range(T)]
  72.     #Testcases = ["an the THE,tHethE.The An, anan 10/10/10"]
  73.    
  74.     main(Testcases)
Add Comment
Please, Sign In to add comment