SHARE
TWEET
dereadnator.py
a guest
Aug 31st, 2015
10
Never
- # -*- coding: utf-8 -*-
- import pandas as pd
- import re
- from core import Words, ankiFurigana
def hiraToKataCh(ch):
    """Return the katakana counterpart of a single hiragana character.

    Characters in the range U+3041..U+3096 are shifted by 0x60 code points
    (U+30A1..U+30F6); any other character is returned unchanged.
    """
    # ``unichr`` only exists on Python 2; on Python 3 ``chr`` already
    # handles the full Unicode range. Resolve the right one at call time so
    # the function works on both interpreters.
    try:
        to_char = unichr
    except NameError:
        to_char = chr

    code = ord(ch)
    if 0x3041 <= code <= 0x3096:
        return to_char(code + 0x60)
    return ch
def hiraToKata(s):
    """Return ``s`` with each character passed through ``hiraToKataCh``."""
    converted = map(hiraToKataCh, s)
    return "".join(converted)
- # Receives a tab-separated .csv file with three columns (the third will be
- # filled/overwritten). Replaces the pronunciation of the kanji between
- # parentheses with the kanji that is in the first column.
def main():
    """Rewrite the keyword column of ``jkeywords.csv`` into ``processed.csv``.

    The input is read as a tab-separated file with the columns ``kanji``,
    ``oldkey`` and ``newkey``. For every row with a non-empty ``oldkey``:

    * ``oldkey`` is split on the Japanese comma into words;
    * in each word, a parenthesised run of kana is replaced by the row's
      kanji, and the resulting word is looked up in the dictionary;
    * the readings found are formatted with ``ankiFurigana`` and the kanji
      is masked with a circle character;
    * the joined result is stored in ``newkey``.

    Words that cannot be resolved are marked ``NOTFOUND``; words with more
    than one candidate reading get a ``CHECK(n)`` suffix for manual review.
    The whole table is then written, tab-separated and without a header, to
    ``processed.csv``.
    """
    # Hand pandas the path instead of an already opened handle so that the
    # ``encoding`` argument is what actually decodes the file.
    df = pd.read_csv('jkeywords.csv', sep='\t',
                     names=["kanji", "oldkey", "newkey"], encoding='utf8')
    df.fillna('', inplace=True)  # make empty fields strings too, not floats.

    # Load the dictionary.
    # NOTE(review): presumably maps word -> {reading: ...}; verify against
    # core.Words.
    dic = Words({"pathDicFile": "jmdict_freqs.txt"})._dic

    # Select an ASCII or full-width (Japanese) parenthesis pair and its
    # contents, but only if there is nothing but kana in between. The
    # full-width parentheses are written as escapes so they cannot be
    # silently normalised to ASCII, and the stray "|" (a literal inside a
    # character class, not an alternation) is gone.
    regex = re.compile(u"[(\uff08][ぁ-ー]+[)\uff09]", re.UNICODE)

    for index, row in df.iterrows():
        oldkey = row['oldkey']
        if oldkey == "":
            continue
        kanji = row['kanji']

        questions = []
        for word in oldkey.split(u"、"):  # japanese comma
            kanjiword, n = regex.subn(kanji, word)
            if n == 0:
                # The reading is for the whole kanji alone.
                kanjiword = kanji
                readings = [word]
            elif kanjiword in dic:
                readings = list(dic[kanjiword].keys())
                if len(readings) > 1:
                    # Keep only the dictionary readings that contain the
                    # reading given between the parentheses (compared in
                    # katakana so the script of either side is irrelevant).
                    targetReadings = [hiraToKata(t)
                                      for t in regex.findall(word)]
                    # TODO: only the first parenthesised reading is checked;
                    # words with several may need a regex that checks they
                    # are all present and in the correct order.
                    wanted = targetReadings[0][1:-1]  # strip the parentheses
                    readings = [r for r in readings
                                if wanted in hiraToKata(r)]
            else:
                readings = []

            answers = u"、".join([ankiFurigana(kanjiword, r)
                                 for r in readings])
            if len(readings) < 1:
                answers = u"NOTFOUND"
            if len(readings) > 1:
                answers += u"、CHECK(" + str(len(readings)) + u")"

            # Could put the answers in the deck too, like in the plugin.
            questions.append(answers.replace(kanji, u"〇"))

        # ``row`` may be a copy of the underlying data, so write the result
        # through the DataFrame itself or it can be lost before ``to_csv``.
        df.at[index, 'newkey'] = u"、".join(questions)

    df.to_csv('processed.csv', sep='\t', header=False, index=False,
              encoding='utf8')
# Run the conversion only when executed as a script, so that importing this
# module (e.g. to reuse hiraToKata) does not read and write the CSV files.
if __name__ == "__main__":
    main()
RAW Paste Data
