Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #! /usr/bin/env python
- from sys import argv
- import sqlite3
- import os
- import shutil
- import glob
- import MeCab
- import lxml.html
- import sets
- import re
- #import cProfile
# All per-user paths hang off the home directory taken from $HOME.
_HOME = os.environ["HOME"]

# Anki collection database that generatedb() copies and queries.
ANKIFILE = _HOME + "/Anki/jake/collection.anki2"
# Fallback input file used by the script entry point.
SCRATCHFILE = _HOME + "/docs/scratch.txt"
# Text file of space-separated words merged into the set built by generatedb().
IGNORELIST = _HOME + "/.config/nplus1/ignore"
# Location of the temporary working copy of the Anki database.
TEMPFILE = "/tmp/nplus1.anki2"
# When true, main() prints only results whose word set is non-empty.
REMOVEZERO = True
class Result:
    """One line of input text paired with a set of words found in it.

    Attributes (stored exactly as passed to the constructor):
      line -- line number of the text
      str  -- the text of the line
      set  -- collection of word strings associated with the line
    """

    def __init__(self, line, str, set):
        self.line = line
        self.str = str
        self.set = set

    def __str__(self):
        # Layout: "<set size> <line number> [<word>, <word>, ...] [<text>]"
        words = ", ".join(self.set)
        return "%.2d %.3d [%s] [%s]" % (len(self.set), self.line, words, self.str)

    def __lt__(self, other):
        # Order by set size first; ties are broken by line number.
        return (len(self.set), self.line) < (len(other.set), other.line)
def striphtml(s):
    """Parse *s* as HTML with lxml and return the document's text content."""
    return lxml.html.fromstring(s).text_content()
def mecabme(s):
    """Tokenize *s* with MeCab and return the set of words it contains.

    For every node, field 6 of the comma-separated feature string is used
    when it is present and not '*'; otherwise the node's surface text is
    used.  (Field 6 is the base form under the IPA dictionary layout --
    confirm if a different dictionary is installed.)

    Nodes with an empty surface and no usable field 6 (such as the
    end-of-sentence marker) contribute nothing, so the empty string is
    never added to the result.

    Returns a sets.Set of strings.
    """
    # Building a Tagger loads the MeCab dictionary, which is expensive; this
    # function is called once per note and once per input line, so keep a
    # single instance on the function object and reuse it.
    tagger = getattr(mecabme, "_tagger", None)
    if tagger is None:
        tagger = mecabme._tagger = MeCab.Tagger()

    out = sets.Set()
    node = tagger.parseToNode(s)
    # The first node is MeCab's beginning-of-sentence marker; skip it.
    node = node.next
    while node:
        fields = node.feature.split(',')
        # Guard the index: not every feature string has seven fields.
        if len(fields) > 6 and fields[6] != '*':
            out.add(fields[6])
        elif node.surface:
            out.add(node.surface)
        node = node.next
    return out
def generatedb():
    """Build the set of words taken from the Anki collection and ignore list.

    ANKIFILE is copied to TEMPFILE and the copy is queried, so the original
    collection file is never opened directly.  For each row of the ``notes``
    table, the first '\\x1f'-separated field of ``flds`` is encoded as UTF-8,
    cloze markers of the form ``{{cN::text}}`` are replaced by ``text``, and
    the result is tokenized with mecabme().  Every space-separated word on
    every line of IGNORELIST is then added as well.

    The database connection and the ignore-list file are always closed, and
    TEMPFILE is removed even when reading the database fails.

    Returns a sets.Set of strings.
    Raises whatever the underlying file, sqlite3 or MeCab calls raise.
    """
    db = sets.Set()
    # Group 1 is the whole cloze marker, group 2 the text inside it.
    cloze = re.compile(r"({{c\d+::(.*?)}})")

    shutil.copy(ANKIFILE, TEMPFILE)
    try:
        con = sqlite3.connect(TEMPFILE)
        try:
            for row in con.execute("SELECT flds FROM notes;"):
                s = row[0].split('\x1f')[0].encode('utf-8')
                for marker, inner in cloze.findall(s):
                    s = s.replace(marker, inner)
                # update() accepts any iterable of words.
                db.update(mecabme(s))
        finally:
            con.close()
    finally:
        # Clean up the working copy whether or not the query succeeded.
        os.remove(TEMPFILE)

    with open(IGNORELIST, "r") as handle:
        ignore_lines = handle.read().split('\n')
    for entry in ignore_lines:
        db.update(entry.split(' '))
    return db
def getfile(f):
    """Return the contents of the file at path *f* as a list of lines.

    The text is split on '\\n' only, so a trailing newline yields a final
    empty string and blank lines are kept as empty strings; callers rely on
    list positions as line numbers.  The file is closed before returning.
    """
    with open(f) as handle:
        return handle.read().split('\n')
def main(filepath):
    """Print, for each non-empty line of *filepath*, its words missing from the db.

    The word set from generatedb() is subtracted from the words mecabme()
    finds on each line.  Results are printed sorted by number of missing
    words and then by 1-based line number.  When REMOVEZERO is true, lines
    with no missing words are not printed.
    """
    known = generatedb()
    results = []
    for lineno, text in enumerate(getfile(filepath), 1):
        if text != '':
            results.append(Result(lineno, text, mecabme(text) - known))

    for res in sorted(results):
        # Empty results are shown only when REMOVEZERO is switched off.
        if res.set or not REMOVEZERO:
            print(res)
if __name__ == '__main__':
    # Analyse the file named on the command line; fall back to SCRATCHFILE
    # only when no argument was given.  Errors raised by main() propagate
    # instead of silently triggering a second full run on the scratch file.
    if len(argv) > 1:
        main(argv[1])
    else:
        main(SCRATCHFILE)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement