Advertisement
Guest User

Untitled

a guest
Dec 28th, 2019
136
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.12 KB | None | 0 0
  1. # coding=utf-8
  2. import json
  3. import io
  4. import re
  5.  
  6. def process_kanji (dfn):
  7.     # gets string and returns list of ■一■-level dfns processed
  8.     #re_obj = re.split(ur"■[一二三四五六七八九十]■", dfn)
  9.     re_obj = re.split(r'■[一二三四五六七八九]■', dfn, re.UNICODE)
  10.     if(len(re_obj) > 2):
  11.         common = re_obj.pop(0)
  12.         re_obj = map (lambda x: common + x, re_obj)
  13.  
  14.     return re_obj
  15.  
  16. def process_numbers (dfn_lst):
  17.     # gets lst of dfns and returns (possibly bigger) final lst of dfns
  18.     final_lst = []
  19.     for s in dfn_lst:
  20.         re_obj = re.split(r'([123456789])', s, re.UNICODE)
  21.         if (len(re_obj) > 2):
  22.             common = re_obj.pop(0)
  23.             re_obj = map( lambda x: common + x, re_obj)
  24.         final_lst.extend(re_obj)
  25.     return final_lst
  26.  
  27.  
  28.  
  29. def process_dfn (dfn):
  30.     # receives raw string and returns LIST of strings (one for each definition)
  31.  
  32.     # process "■一■" and similar
  33.     kanji_lst = process_kanji (dfn) # returns list of strings
  34.    
  35.     # process "(1)" and similar
  36.     lst = process_numbers (kanji_lst)
  37.  
  38.     return lst
  39.  
  40.  
  41. number = 1
  42.  
  43. names = []
  44.  
  45. for number in range(1,34):
  46.         name = 'term_bank_' + str(number) + '.json'
  47.         names.append(name)
  48.  
  49.  
  50. for file_name in names:
  51.     print(file_name)
  52.  
  53.     glob = []
  54.  
  55.     with open(file_name) as json_file:
  56.         data = json.load(json_file)
  57.  
  58.     for elem in data:
  59.         lst = elem[5]
  60.  
  61.         lst_dfn = []
  62.  
  63.         for raw_dfn in lst:
  64.             lst_dfn.extend(process_dfn(raw_dfn))
  65.  
  66.         for dfn in lst_dfn:
  67.             current_elem = []
  68.  
  69.             current_elem.append( elem[0] )
  70.             current_elem.append( elem[1] )
  71.             current_elem.append( elem[2] )
  72.             current_elem.append( elem[3] )
  73.             current_elem.append( elem[4] )
  74.             current_elem.append( [dfn] )
  75.             current_elem.append( elem[6] )
  76.             current_elem.append( elem[7] )
  77.  
  78.             glob.append( current_elem )
  79.  
  80.  
  81.     with io.open(str(file_name), 'w', encoding='utf-8') as f:
  82.         json.dump(glob, f, ensure_ascii=False)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement