SHARE
TWEET

dereadnator.py

a guest Aug 31st, 2015 10 Never
  1. # -*- coding: utf-8 -*-
  2.  
  3. import pandas as pd
  4. import re
  5. from core import Words, ankiFurigana
  6.  
  7. def hiraToKataCh(ch):
  8.     if ord(ch) >= 0x3041 and ord(ch) <= 0x3096:
  9.         return unichr(ord(ch) + 0x60)
  10.     else:
  11.         return ch
  12.  
  13. def hiraToKata(s):
  14.     return "".join([hiraToKataCh(ch) for ch in s])
  15.  
  16. # Receives a .csv file with three columns (the third will be filled/overwritten).
  17. # Replaces the pronunciation of the kanji between parentheses with the kanji that
  18. # is in the first column.
  19.  
  20. def main():
  21.   with open('jkeywords.csv') as f:
  22.     df = pd.read_csv(f, sep='\t', names=["kanji", "oldkey", "newkey"],
  23.                      encoding='utf8')
  24.   df.fillna('', inplace=True) # make empty fields strings too, not floats.
  25.  
  26.   #load dictionary
  27.   dic = Words({"pathDicFile":"jmdict_freqs.txt"})._dic
  28.  
  29.   # select the normal or japanese parenthesis and it's contents only if it only
  30.   # have kanas in between them.
  31.   regex = re.compile(u"[(|(][ぁ-ー]+[)|)]", re.UNICODE)
  32.   for index, row in df.iterrows():
  33.     ok = row['oldkey']
  34.     if ok == "":
  35.       continue
  36.     kanji = row['kanji']
  37.     s = ""
  38.     for word in ok.split(u"、"):  # japanese comma
  39.       kanjiword, n = regex.subn(kanji, word)
  40.       readings = []
  41.       if n == 0:
  42.         kanjiword = kanji  # when the reading is for the whole kanji alone
  43.         readings = [word]
  44.       elif kanjiword in dic:
  45.         readings = dic[kanjiword].keys()
  46.         if len(readings) > 1:
  47.           targetReadings = [hiraToKata(t) for t in regex.findall(word)]
  48.           #are there any with len(targetReadings) > 1?
  49.           #if so, maybe build another regex to check they are all there and in the correct order
  50.           readings = [r for r in readings if targetReadings[0][1:-1] in hiraToKata(r)]
  51.       answers = u"、".join([ankiFurigana(kanjiword, r) for r in readings])
  52.       if len(readings) < 1:
  53.         answers = u"NOTFOUND"
  54.       if len(readings) > 1:
  55.         answers += u"、CHECK(" + str(len(readings)) + ")"
  56.       questions = answers.replace(kanji, u"〇")
  57.       s += questions + u"、"
  58.       #could put the answers in the deck too, like in the plugin
  59.     row['newkey'] = s[:-1]  # remove the trailing comma
  60.  
  61.   df.to_csv('processed.csv', sep='\t', header=False, index=False, encoding='utf8')
  62.  
  63. main()
RAW Paste Data
Top