Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
def isKanji( c ):
    """Return True if the character is Kanji (or actually any CJK Unified Ideograph).

    ``c`` must be a single character, because it is passed to ``ord``.
    The test covers the CJK Unified Ideographs block U+4E00..U+9FFF with
    both end points included.
    """
    # Inclusive upper bound: U+9FFF is itself part of the block.
    return 0x4E00 <= ord(c) <= 0x9FFF
def containsRisukasa( line, ref ):
    """Return True if at least one element of ``line`` is not in ``ref``.

    ``line`` is any iterable of characters and ``ref`` any container that
    supports ``in`` (the caller in this file passes a set of characters).
    Always returns a real bool: False when every element is found in
    ``ref``, including when ``line`` is empty.
    """
    return any(kanji not in ref for kanji in line)
def filterKanji( dict_file, ref_file, good_out, bad_out ):
    """Split the lines of ``dict_file`` into two files based on their kanji.

    Every character of ``ref_file`` (decoded as UTF-8) forms the reference
    set. Each line of ``dict_file`` is decoded as UTF-8 and reduced to the
    characters accepted by ``isKanji``; if any of them is missing from the
    reference set the original line is written to ``bad_out``, otherwise
    to ``good_out``. Lines are copied as raw bytes, unchanged.

    Both output files are created (or truncated) before ``dict_file`` is
    opened, and all files are closed when the function returns or raises.
    """
    with open(ref_file, "rb") as ref_handle:
        ref = set(ref_handle.read().decode("utf-8"))

    # Open order (bad, good, then the input) matches the original script.
    with open(bad_out, "wb") as bad, \
            open(good_out, "wb") as good, \
            open(dict_file, "rb") as source:
        for line in source:
            # A list (not filter()) so the result is the same on Python 2 and 3.
            kanjiList = [c for c in line.decode("utf-8") if isKanji(c)]
            if containsRisukasa(kanjiList, ref):
                bad.write(line)
            else:
                good.write(line)
# Script entry: sort the lines of 10000_jouyou.txt into good.txt / bad.txt,
# using the characters of kyouiku_kanji.utf8.txt as the reference set.
filterKanji( "10000_jouyou.txt", "kyouiku_kanji.utf8.txt", "good.txt", "bad.txt")
Add Comment
Please sign in to add a comment.