Advertisement
Guest User

Naturalisation decree parser

a guest
Mar 24th, 2019
260
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.45 KB | None | 0 0
  1. from sys import argv
  2. from os import listdir
  3. from os.path import isfile, join
  4. from tika import parser
  5. import re
  6.  
  7. mentioned_decrees = set()
  8. decrees_dict = {}
  9.  
  10. parse_dir = argv[1]
  11. min_year = 0 if len(argv) < 3 else argv[2]
  12. files = sorted([f for f in listdir(parse_dir) if f.endswith('.pdf') and isfile(join(parse_dir, f))])
  13.  
  14. for f in files:
  15.     decree_date = f.split('_')[1]
  16.     decree_date = f'{decree_date[:4]}/{decree_date[4:6]}/{decree_date[6:]}'
  17.     text = parser\
  18.         .from_file(join(parse_dir, f))['content']\
  19.         .replace(' ', '')\
  20.         .replace('\n', '')\
  21.         .replace('\r', '')\
  22.         .replace('\t', '')
  23.     decree_number_regex = re.compile(r'20\d\dX\d{6}')
  24.     decrees = set([(decree[0:4], decree[5:8], decree[8:]) for decree in decree_number_regex.findall(text)])
  25.     new_decrees = decrees.difference(mentioned_decrees)
  26.     mentioned_decrees = mentioned_decrees.union(new_decrees)
  27.     decrees_dict[decree_date] = new_decrees
  28.  
  29. series = sorted(set((y, s) for (y, s, n) in mentioned_decrees if y >= min_year))
  30. print(';'.join([''] + sorted(decrees_dict.keys())))
  31.  
  32. for (y, s) in series:
  33.     def belongs_cur_series(x):
  34.         cur_y, cur_s, _ = x
  35.         return y == cur_y and s == cur_s
  36.  
  37.     def cur_series_count_str(x):
  38.         result = len(list(filter(belongs_cur_series, x)))
  39.         return str(result) if result else ''
  40.  
  41.     print(';'.join([f'{y}X{s}'] + [cur_series_count_str(v) for v in decrees_dict.values()]))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement