SHARE
TWEET

Naturalisation decree parser

a guest Mar 24th, 2019 162 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. from sys import argv
  2. from os import listdir
  3. from os.path import isfile, join
  4. from tika import parser
  5. import re
  6.  
  7. mentioned_decrees = set()
  8. decrees_dict = {}
  9.  
  10. parse_dir = argv[1]
  11. min_year = 0 if len(argv) < 3 else argv[2]
  12. files = sorted([f for f in listdir(parse_dir) if f.endswith('.pdf') and isfile(join(parse_dir, f))])
  13.  
  14. for f in files:
  15.     decree_date = f.split('_')[1]
  16.     decree_date = f'{decree_date[:4]}/{decree_date[4:6]}/{decree_date[6:]}'
  17.     text = parser\
  18.         .from_file(join(parse_dir, f))['content']\
  19.         .replace(' ', '')\
  20.         .replace('\n', '')\
  21.         .replace('\r', '')\
  22.         .replace('\t', '')
  23.     decree_number_regex = re.compile(r'20\d\dX\d{6}')
  24.     decrees = set([(decree[0:4], decree[5:8], decree[8:]) for decree in decree_number_regex.findall(text)])
  25.     new_decrees = decrees.difference(mentioned_decrees)
  26.     mentioned_decrees = mentioned_decrees.union(new_decrees)
  27.     decrees_dict[decree_date] = new_decrees
  28.  
  29. series = sorted(set((y, s) for (y, s, n) in mentioned_decrees if y >= min_year))
  30. print(';'.join([''] + sorted(decrees_dict.keys())))
  31.  
  32. for (y, s) in series:
  33.     def belongs_cur_series(x):
  34.         cur_y, cur_s, _ = x
  35.         return y == cur_y and s == cur_s
  36.  
  37.     def cur_series_count_str(x):
  38.         result = len(list(filter(belongs_cur_series, x)))
  39.         return str(result) if result else ''
  40.  
  41.     print(';'.join([f'{y}X{s}'] + [cur_series_count_str(v) for v in decrees_dict.values()]))
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
Top