Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!python
- # -*- coding: utf-8 -*-
- from collections import OrderedDict
- import json
- import sys
- from io import open
- import re
# Works with shinjirin and shinmeikai dumps; change the file name to switch.
# Key order of the source JSON is preserved by loading into OrderedDicts.
# The handle is opened in a `with` block so it is closed as soon as parsing
# finishes instead of lingering until interpreter exit.
with open("shinjirin.json", encoding="utf-8") as source_file:
    entries = json.load(source_file, object_pairs_hook=OrderedDict)

# Accumulates the converted records that are written out at the end.
jj = []
# Characters that may separate kana inside a reading. The original check
# listed two space characters; the second is assumed to be the ideographic
# (full-width) space U+3000 -- TODO confirm against the source data.
_KANA_SPACES = (" ", "\u3000")

# Half-open code point ranges [start, stop) that count as kana.
_KANA_RANGES = (
    range(0x3040, 0x3100),    # hiragana and katakana
    range(0xFF65, 0xFFA0),    # halfwidth etc
    range(0x31F0, 0x3200),    # ainu extensions
    range(0x1B000, 0x1B100),  # hentaigana A
    range(0x1B100, 0x1B130),  # hentaigana B
)


def iskana(s):
    """Return True if every character of *s* is kana or a space.

    An empty string yields True because there is no offending character.

    Args:
        s: The string (typically a dictionary reading) to inspect.

    Returns:
        False as soon as a character outside the kana ranges is found,
        True otherwise.
    """
    for c in s:
        # Spaces must be tested on the character itself: comparing the
        # result of ord() with a string is always False, which made every
        # reading containing a space fail the check.
        if c in _KANA_SPACES:
            continue
        code = ord(c)
        # Membership in a range object is a constant-time bounds check.
        if not any(code in kana_range for kana_range in _KANA_RANGES):
            return False
    return True
# Convert every usable entry of the first subbook into a record holding the
# kana reading ("r"), the list of spellings ("s") and the definition lines
# ("l"), then write all records to nazeka_shinjirin.json.
i = 0  # count of raw entries visited; useful for capping a debug run
for entry in entries["subbooks"][0]["entries"]:
    # if i > 10000: break
    i += 1
    try:
        # some entries are broken (zero-epwing gives no text, ebwin4 shows trash)
        heading = entry["heading"]
        text = entry["text"]
    except (KeyError, TypeError):
        # Only a missing/unusable field is skipped; anything else (e.g.
        # KeyboardInterrupt) is allowed to propagate.
        continue

    # The heading is "reading【spelling】" or "reading[spelling]"; a heading
    # without brackets is a reading with no spelling.
    if "【" in heading:
        reading = heading.split("【")[0]
        spelling = heading.split("【")[1].split("】")[0]
    elif "[" in heading:
        reading = heading.split("[")[0]
        spelling = heading.split("[")[1].split("]")[0]
    else:
        reading = heading
        spelling = ""

    # Strip the separator marks from the reading; alternative spellings are
    # separated by the middle dot.
    reading = reading.replace("-", "")
    reading = reading.replace("・", "")
    spellings = spelling.split("・")

    # Drop the first and last line of the text and keep the lines between.
    lines = text.split("\n")[1:-1]

    # Drop a leading line that is a single parenthesised note. Entries whose
    # text is too short leave `lines` empty, so guard before indexing.
    first_line = lines[0] if lines else ""
    if (
        first_line.startswith("(")
        and first_line.endswith(")")
        and ")" not in first_line[1:-1]
    ):
        lines = lines[1:]

    # Skip long single-character entries that have no spelling.
    if len(reading) == 1 and spellings[0] == "" and len(lines) > 10:
        continue
    if not reading:
        continue
    if not iskana(reading):
        continue

    jj.append(OrderedDict([("r", reading), ("s", spellings), ("l", lines)]))

# The context manager guarantees the output is flushed and closed even if
# serialisation fails part-way through.
with open("nazeka_shinjirin.json", "w", newline="\n", encoding="utf-8") as f:
    f.write(json.dumps(jj, ensure_ascii=False, indent=4, separators=(',', ':')))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement