SHARE
TWEET
nplus1.py
a guest
Jun 15th, 2011
60
Never
- #! /usr/bin/env python
- from sys import argv
- import sqlite3
- import os
- import shutil
- import glob
- import MeCab
- import lxml.html
- import sets
# Directory that is globbed for deck files by generatedb().
# expanduser("~") yields the same value as $HOME when that variable is set,
# but does not raise KeyError when it is missing.
ANKIFOLDER = os.path.expanduser("~") + "/.anki/decks/"
# generatedb() only counts a card as known when the value in column 10 of its
# row is strictly greater than this.
# NOTE(review): presumably the card interval in days -- confirm against the
# deck schema.
MATURE_THRESHOLD = 21
# Scratch path each deck is copied to before it is opened with sqlite3.
TEMPFILE = "/tmp/nplus1.anki"
class Result:
    """A line of text paired with a set of words extracted from it.

    In this script the set holds the tokens of the line that are absent from
    the known-word database. Instances order themselves by the size of that
    set, so sorting a list of results puts the lines with the fewest such
    words first.
    """

    def __init__(self, str, set):
        """Store the line (*str*) and its word collection (*set*).

        The parameter names shadow builtins; they are kept unchanged so that
        existing keyword-argument callers keep working.
        """
        self.str = str
        self.set = set

    def __str__(self):
        """Return ``"NN [line] [word, word, ...]"``.

        ``NN`` is the zero-padded word count. Words are listed in sorted
        order so the output is stable from run to run.
        """
        words = ", ".join(sorted(self.set))
        return "%.2d [%s] [%s]" % (len(self.set), self.str, words)

    def __lt__(self, other):
        """Order results by the number of words they hold."""
        return len(self.set) < len(other.set)
def striphtml(s):
    """Return the text content of the HTML fragment *s* with markup removed.

    Empty or whitespace-only input (and None) yields an empty string without
    invoking the parser: lxml.html.fromstring() rejects an empty document
    with ParserError, which would otherwise abort processing on a single
    blank field.
    """
    if not s or not s.strip():
        return u""
    return lxml.html.fromstring(s).text_content()
# Shared tagger instance; constructing a MeCab.Tagger per call is wasteful
# because mecabme() is invoked once per card and once per input line.
_TAGGER = None


def _get_tagger():
    """Return the shared MeCab tagger, creating it on first use."""
    global _TAGGER
    if _TAGGER is None:
        _TAGGER = MeCab.Tagger()
    return _TAGGER


def mecabme(s):
    """Tokenise *s* with MeCab and return the set of distinct token surfaces.

    Returns a ``sets.Set`` so the result keeps working with the ``|=`` and
    ``-`` operators applied to it elsewhere in this file.
    """
    out = sets.Set()
    node = _get_tagger().parseToNode(s)
    # The first node is skipped, as before (MeCab starts the list with a
    # beginning-of-sentence marker).
    node = node.next
    while node:
        # Nodes with an empty surface (such as the end-of-sentence marker
        # that closes the list) are not words and must not be counted.
        if node.surface:
            out.add(node.surface)
        node = node.next
    return out
def generatedb():
    """Collect the words found on qualifying cards of every deck.

    Each file matching ``ANKIFOLDER + "/*anki"`` is copied to TEMPFILE and
    opened as an SQLite database. For every row of its ``cards`` table whose
    column 10 exceeds MATURE_THRESHOLD, column 7 is stripped of HTML,
    encoded as UTF-8, tokenised, and merged into the result.

    Returns:
        A ``sets.Set`` of token surfaces (empty when no deck is found).
    """
    db = sets.Set()
    for deck in glob.glob(ANKIFOLDER + "/*anki"):
        # Work on a copy -- presumably so the live deck file is never opened
        # (and possibly locked) by this script.
        shutil.copy(deck, TEMPFILE)
        try:
            con = sqlite3.connect(TEMPFILE)
            try:
                # Columns are addressed by position because of SELECT *.
                # NOTE(review): index 7 / index 10 look like the card's
                # question text and interval -- confirm against the schema.
                for c in con.execute("SELECT * FROM cards;"):
                    if c[10] > MATURE_THRESHOLD:
                        db |= mecabme(striphtml(c[7]).encode('utf-8'))
            finally:
                # Release the database handle before deleting the file.
                con.close()
        finally:
            # Always remove the scratch copy, even when reading it failed.
            os.remove(TEMPFILE)
    return db
def getfile(f):
    """Return the contents of the file at path *f* split on newline characters.

    The split is on the line-feed character only, so a file ending in a
    newline produces a trailing empty string and blank lines appear as empty
    strings. The file is closed before returning.
    """
    with open(f) as fh:
        return fh.read().split('\n')
def main(filepath):
    """Print each non-empty line of *filepath* with its words missing from the database.

    The known-word set is built first with generatedb(). Every non-empty
    line is then tokenised, the known words are subtracted, and the
    resulting Result objects are printed in ascending order of how many
    words remain.
    """
    known = generatedb()
    results = [
        Result(line, mecabme(line) - known)
        for line in getfile(filepath)
        if line != ''
    ]
    for result in sorted(results):
        print(result)
if __name__ == '__main__':
    # Only a missing command-line argument is reported as 'no arg'. Any other
    # failure (unreadable file, database error, Ctrl-C) now propagates with
    # its own traceback instead of being swallowed and mislabelled.
    if len(argv) < 2:
        print('no arg')
    else:
        main(argv[1])
RAW Paste Data
