Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from sys import argv
- from os import listdir
- from os.path import isfile, join
- from tika import parser
- import re
# Decree identifiers look like "2019X123456": a year starting with "20",
# a literal "X", then six digits (3-digit series followed by a 3-digit number).
DECREE_NUMBER_REGEX = re.compile(r'20\d\dX\d{6}')

# Extracted PDF text may break an identifier with spaces or line breaks, so
# these four characters are deleted before the regex is applied.
_WHITESPACE_TABLE = str.maketrans('', '', ' \n\r\t')


def date_from_filename(filename):
    """Return the date embedded in *filename* formatted as 'YYYY/MM/DD'.

    The date is read from the second '_'-separated field of the name
    (e.g. 'dec_20190305_x.pdf' -> '2019/03/05').

    Raises:
        ValueError: if the name contains no '_' and so has no date field.
    """
    fields = filename.split('_')
    if len(fields) < 2:
        raise ValueError(
            f'cannot find a date field in file name {filename!r} '
            "(expected '<prefix>_<YYYYMMDD>...')"
        )
    raw = fields[1]
    return f'{raw[:4]}/{raw[4:6]}/{raw[6:]}'


def extract_decrees(text):
    """Return the set of (year, series, number) string triples found in *text*.

    *text* may be None or empty (a PDF without extractable text); that
    yields an empty set instead of an error.
    """
    if not text:
        return set()
    compact = text.translate(_WHITESPACE_TABLE)
    # Index 4 is the literal 'X' separator, hence the jump from [0:4] to [5:8].
    return {(d[0:4], d[5:8], d[8:]) for d in DECREE_NUMBER_REGEX.findall(compact)}


def collect_new_decrees(dated_texts):
    """Map each date to the decrees that are mentioned for the first time on it.

    Args:
        dated_texts: iterable of (date, text) pairs, consumed in order; a
            decree is attributed to the first pair whose text mentions it.

    Returns:
        dict mapping date -> set of (year, series, number) triples. Pairs
        sharing a date are merged rather than overwriting one another.
    """
    seen = set()
    by_date = {}
    for date, text in dated_texts:
        fresh = extract_decrees(text) - seen
        seen |= fresh
        by_date.setdefault(date, set()).update(fresh)
    return by_date


def format_table(decrees_by_date, min_year=0):
    """Build the ';'-separated report rows.

    The first row is the header: an empty cell followed by every date in
    sorted order. Each further row is one '<year>X<series>' label followed
    by, per date (same order as the header), the number of decrees of that
    series first mentioned on that date, or an empty cell when there are none.

    Args:
        decrees_by_date: mapping as returned by collect_new_decrees().
        min_year: series whose year is numerically below this are omitted.

    Returns:
        list of row strings, header first.
    """
    # One shared, sorted date list keeps header and data columns aligned.
    dates = sorted(decrees_by_date)
    series = sorted({
        (year, serie)
        for decrees in decrees_by_date.values()
        for (year, serie, _) in decrees
        if int(year) >= min_year
    })

    rows = [';'.join([''] + dates)]
    for year, serie in series:
        cells = []
        for date in dates:
            count = sum(
                1 for (y, s, _) in decrees_by_date[date] if (y, s) == (year, serie)
            )
            cells.append(str(count) if count else '')
        rows.append(';'.join([f'{year}X{serie}'] + cells))
    return rows


def main(args):
    """Scan the PDFs of a directory and print the first-mention table.

    Args:
        args: command-line vector; args[1] is the directory to scan and the
            optional args[2] is the minimum decree year to report.
    """
    if len(args) < 2:
        raise SystemExit(f'usage: {args[0] if args else "script"} PDF_DIR [MIN_YEAR]')
    parse_dir = args[1]
    # Always an int, so the year comparison never mixes str and int.
    min_year = int(args[2]) if len(args) >= 3 else 0

    files = sorted(
        f for f in listdir(parse_dir)
        if f.endswith('.pdf') and isfile(join(parse_dir, f))
    )
    # `parser` is tika's parser module imported at the top of the file;
    # its 'content' entry can be None, which extract_decrees() tolerates.
    dated_texts = (
        (date_from_filename(f), parser.from_file(join(parse_dir, f))['content'])
        for f in files
    )
    for row in format_table(collect_new_decrees(dated_texts), min_year):
        print(row)


if __name__ == '__main__':
    main(argv)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement