SHARE
TWEET

nplus1.py

a guest Jun 15th, 2011 60 Never
  1. #! /usr/bin/env python
  2.  
  3. from sys import argv
  4. import sqlite3
  5. import os
  6. import shutil
  7. import glob
  8. import MeCab
  9. import lxml.html
  10. import sets
  11.  
  12. ANKIFOLDER = os.environ["HOME"] + "/.anki/decks/"
  13. MATURE_THRESHOLD = 21
  14. TEMPFILE = "/tmp/nplus1.anki"
  15.  
  16. class Result:
  17.     def __init__(self, str, set):
  18.         self.str = str
  19.         self.set = set
  20.  
  21.     def __str__(self):
  22.         ss = ''
  23.         for s in self.set:
  24.            ss += s + ", "
  25.         ss = ss[:-2]
  26.         return "%.2d [%s] [%s]" % (len(self.set), self.str, ss)
  27.  
  28.     def __lt__(self, other):
  29.         return len(self.set) < len(other.set)
  30.  
  31. def striphtml(s):
  32.     t = lxml.html.fromstring(s)
  33.     return t.text_content()
  34.  
  35. def mecabme(s):
  36.     out = sets.Set()
  37.     m = MeCab.Tagger()
  38.     n = m.parseToNode(s)
  39.     n = n.next
  40.     while n:
  41.         out.add(n.surface)
  42.         n = n.next
  43.     return out
  44.  
  45. def generatedb():
  46.     db = sets.Set()
  47.     g = glob.glob(ANKIFOLDER + "/*anki")
  48.     for d in g:
  49.         shutil.copy(d, TEMPFILE)
  50.        
  51.         con = sqlite3.Connection(TEMPFILE)
  52.         cur = con.cursor()
  53.  
  54.         e = cur.execute("SELECT * FROM cards;")
  55.         for c in e:
  56.             if (c[10] > MATURE_THRESHOLD):
  57.                 db |= mecabme(striphtml(c[7]).encode('utf-8'))
  58.         os.remove(TEMPFILE)
  59.     return db
  60.  
  61. def getfile(f):
  62.     return open(f).read().split('\n')
  63.  
  64. def main(filepath):
  65.     db = generatedb()
  66.     data = getfile(filepath)
  67.  
  68.     out = []
  69.  
  70.     for d in data:
  71.         if d == '':
  72.             continue
  73.         m = mecabme(d)
  74.         out.append(Result(d, m - db))
  75.  
  76.     out.sort()
  77.  
  78.     for o in out:
  79.         print o
  80.  
  81. if __name__ == '__main__':
  82.     try:
  83.         main(argv[1])
  84.     except:
  85.         print 'no arg'
RAW Paste Data
Top