Guest User

Untitled

a guest
Jul 18th, 2018
71
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.68 KB | None | 0 0
  1. def isKanji( c ):
  2. """ Return true if the character is Kanji (or actually any CJK Unified Ideograph) """
  3. return 0x4E00 <= ord(c) < 0x9FFF
  4.  
  5. def containsRisukasa( line, ref ):
  6. for kanji in line:
  7. if not kanji in ref:
  8. return True
  9.  
  10. def filterKanji( dict_file, ref_file, good_out, bad_out ):
  11. ref = set( open( ref_file, "rb" ).read().decode("utf-8") )
  12. bad = open( bad_out, "wb" )
  13. good = open( good_out, "wb" )
  14. for line in open(dict_file, "rb"):
  15. kanjiList = filter(isKanji, line.decode("utf-8"))
  16. if containsRisukasa( kanjiList, ref ):
  17. bad.write( line )
  18. else:
  19. good.write( line )
  20.  
  21. filterKanji( "10000_jouyou.txt", "kyouiku_kanji.utf8.txt", "good.txt", "bad.txt")
Add Comment
Please, Sign In to add comment