Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # coding=utf-8
- import json
- import io
- import re
def process_kanji(dfn):
    """Split a raw definition on ■一■-style section markers.

    The text is split wherever a marker of the form ■<kanji numeral>■
    occurs (only the numerals 一 through 九 are recognised). When the split
    yields more than two pieces, the first piece (the text preceding the
    first marker) is treated as a shared prefix and prepended to each of
    the remaining pieces.

    Args:
        dfn: Raw definition string.

    Returns:
        A list of strings. With no marker the result is ``[dfn]``; with
        exactly one marker the two pieces are returned unchanged (no
        prefixing), because the prefix step only runs for more than two
        pieces.
    """
    # The flag has to be passed by keyword: the third positional parameter
    # of re.split is ``maxsplit``, so a positional re.UNICODE (numeric
    # value 32) would cap the number of splits at 32 instead of acting as
    # a flag.
    pieces = re.split(r'■[一二三四五六七八九]■', dfn, flags=re.UNICODE)
    if len(pieces) > 2:
        common = pieces.pop(0)
        # Build a real list so both return paths yield the same type.
        return [common + piece for piece in pieces]
    return pieces
def process_numbers(dfn_lst):
    """Split every definition in *dfn_lst* on the digits 1-9.

    Each string is split with a pattern whose parentheses form a capture
    group, so the matched digits themselves are kept as separate pieces in
    the split result. When a string contains at least one match, the text
    before the first match is treated as a shared prefix and prepended to
    every following piece (including the digit pieces).

    Args:
        dfn_lst: Iterable of definition strings.

    Returns:
        A flat list containing the pieces of every input string, in order.
        Strings without a match are passed through unchanged.
    """
    # NOTE(review): the parentheses in the pattern are a regex group, not
    # literal "(" and ")". If the intent was to split on markers such as
    # "(1)", the pattern needs escaped parentheses -- confirm against the
    # data before changing it. The pattern is kept as-is here.
    final_lst = []
    for s in dfn_lst:
        # flags must be given by keyword; passed positionally, re.UNICODE
        # (numeric value 32) is interpreted as ``maxsplit`` and limits the
        # split to 32 cuts.
        pieces = re.split(r'([123456789])', s, flags=re.UNICODE)
        if len(pieces) > 2:
            common = pieces.pop(0)
            pieces = [common + piece for piece in pieces]
        final_lst.extend(pieces)
    return final_lst
def process_dfn(dfn):
    """Break one raw definition string into a list of definition strings.

    The string is first split on "■一■"-style markers by process_kanji,
    and the resulting pieces are then split further by process_numbers.

    Args:
        dfn: Raw definition string.

    Returns:
        The list produced by process_numbers, one string per definition.
    """
    return process_numbers(process_kanji(dfn))
# Rewrite term_bank_1.json .. term_bank_33.json in place so that every entry
# carries exactly one definition: each input entry is duplicated once per
# string returned by process_dfn for the items of its sixth field.
# NOTE(review): the entry layout (8 fields, field 5 a list of definition
# strings) is inferred from the indexing below -- confirm against the data.
names = ['term_bank_' + str(number) + '.json' for number in range(1, 34)]

for file_name in names:
    print(file_name)

    # Read explicitly as UTF-8 to match the encoding used when the file is
    # written back below; without it the platform's locale encoding is used
    # and non-ASCII text may fail to decode or be decoded incorrectly.
    with io.open(file_name, encoding='utf-8') as json_file:
        data = json.load(json_file)

    glob = []
    for elem in data:
        lst_dfn = []
        for raw_dfn in elem[5]:
            lst_dfn.extend(process_dfn(raw_dfn))
        for dfn in lst_dfn:
            # Copy the entry, replacing field 5 with a one-item list
            # holding just this definition. Fields are indexed one by one
            # so a short entry still raises IndexError.
            glob.append([
                elem[0],
                elem[1],
                elem[2],
                elem[3],
                elem[4],
                [dfn],
                elem[6],
                elem[7],
            ])

    # The input file is closed by now, so overwriting it is safe.
    with io.open(str(file_name), 'w', encoding='utf-8') as f:
        json.dump(glob, f, ensure_ascii=False)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement