Advertisement
Guest User

EPWING JSON cleaner for Nazeka

a guest
Apr 4th, 2020
386
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.22 KB | None | 0 0
  1. #!python
  2. # -*- coding: utf-8 -*-
  3. from collections import OrderedDict
  4. import json
  5. import sys
  6. from io import open
  7. import re
  8.  
  9. #works with shinjirin and shinmeikai
  10.  
  11. entries = json.load(open("shinjirin.json", encoding="utf-8"), object_pairs_hook=OrderedDict)
  12.  
  13. jj = []
  14.  
  15. def iskana(s):
  16.     for c in s:
  17.         c = ord(c)
  18.         if c == " " or c == " ":
  19.             continue
  20.         # hiragana and katakana
  21.         elif c >= 0x3040 and c < 0x3100:
  22.             continue
  23.         # halfwidth etc
  24.         elif c >= 0xFF65 and c < 0xFFA0:
  25.             continue
  26.         # ainu extensions
  27.         elif c >= 0x31F0 and c < 0x3200:
  28.             continue
  29.         # hentaigana A
  30.         elif c >= 0x1B000 and c < 0x1B100:
  31.             continue
  32.         # hentaigana B
  33.         elif c >= 0x1B100 and c < 0x1B130:
  34.             continue
  35.         else:
  36.             return False
  37.     return True
  38.  
  39. i = 0
  40. for entry in entries["subbooks"][0]["entries"]:
  41.     #if i > 10000: break
  42.     i += 1
  43.     try:
  44.         # some entries are broken (zero-epwing gives no text, ebwin4 shows trash)
  45.         heading = entry["heading"]
  46.         text = entry["text"]
  47.     except:
  48.         continue
  49.    
  50.     if "【" in heading:
  51.         reading = heading.split("【")[0]
  52.         spelling = heading.split("【")[1].split("】")[0]
  53.     elif "[" in heading:
  54.         reading = heading.split("[")[0]
  55.         spelling = heading.split("[")[1].split("]")[0]
  56.     else:
  57.         reading = heading
  58.         spelling = ""
  59.     reading = reading.replace("-", "")
  60.     reading = reading.replace("・", "")
  61.     spellings = spelling.split("・")
  62.     lines = text.split("\n")[1:-1]
  63.     if lines[0] != "" and lines[0][0] == "(" and lines[0][-1] == ")" and ")" not in lines[0][1:-1]:
  64.         lines = lines[1:]
  65.    
  66.    
  67.     if len(reading) == 1 and spellings[0] == "" and len(lines) > 10:
  68.         continue
  69.    
  70.     if len(reading) == 0:
  71.         continue
  72.    
  73.     if not iskana(reading):
  74.         continue
  75.    
  76.     jj += [OrderedDict([("r",reading),("s",spellings),("l",lines)])]
  77.  
  78.  
  79. f = open("nazeka_shinjirin.json", "w", newline="\n", encoding="utf-8")
  80. f.write(json.dumps(jj, ensure_ascii=False, indent=4, separators=(',',':')))#, separators=(',',':')))
  81. f.close()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement