Advertisement
Guest User

Untitled

a guest
Mar 16th, 2014
197
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.43 KB | None | 0 0
  1. #! /usr/bin/env python
  2.  
  3. from sys import argv
  4. import sqlite3
  5. import os
  6. import shutil
  7. import glob
  8. import MeCab
  9. import lxml.html
  10. import sets
  11. import re
  12. #import cProfile
  13.  
  14. ANKIFILE = os.environ["HOME"] + "/Anki/jake/collection.anki2"
  15. SCRATCHFILE = os.environ["HOME"] + "/docs/scratch.txt"
  16. IGNORELIST = os.environ["HOME"] + "/.config/nplus1/ignore"
  17. TEMPFILE = "/tmp/nplus1.anki2"
  18. REMOVEZERO = True
  19.  
  20. class Result:
  21.     def __init__(self, line, str, set):
  22.         self.line = line
  23.         self.str = str
  24.         self.set = set
  25.  
  26.     def __str__(self):
  27.         ss = ''
  28.         for s in self.set:
  29.            ss += s + ", "
  30.         ss = ss[:-2]
  31.         return "%.2d %.3d [%s]     [%s]" % (len(self.set), self.line, ss, self.str)
  32.  
  33.     def __lt__(self, other):
  34.         if len(self.set) != len(other.set):
  35.             return len(self.set) < len(other.set)
  36.         else:
  37.             return self.line < other.line
  38.  
  39. def striphtml(s):
  40.     t = lxml.html.fromstring(s)
  41.     return t.text_content()
  42.  
  43. def mecabme(s):
  44.     out = sets.Set()
  45.     m = MeCab.Tagger()
  46.     n = m.parseToNode(s)
  47.     n = n.next
  48.     while n:
  49.         n2 = n.feature.split(',')
  50.         if n2[6] != '*':
  51.             out.add(n2[6])
  52.         else:
  53.             out.add(n.surface)
  54.         n = n.next
  55.     return out
  56.  
  57. def generatedb():
  58.     db = sets.Set()
  59.     shutil.copy(ANKIFILE, TEMPFILE)
  60.    
  61.     con = sqlite3.Connection(TEMPFILE)
  62.     cur = con.cursor()
  63.  
  64.     e = cur.execute("SELECT flds FROM notes;")
  65.     for c in e:
  66.         s = c[0].split('\x1f')[0].encode('utf-8')
  67.         r = re.findall("({{c\d+::(.*?)}})", s)
  68.         for i in r:
  69.             s = s.replace(i[0], i[1])
  70.         db |= mecabme(s)
  71.  
  72.     ignorelist = open(IGNORELIST, "r").read().split('\n')
  73.     for i in ignorelist:
  74.         db |= sets.Set(i.split(' '))
  75.    
  76.     os.remove(TEMPFILE)
  77.     return db
  78.  
  79. def getfile(f):
  80.     return open(f).read().split('\n')
  81.  
  82. def main(filepath):
  83.     db = generatedb()
  84.     data = getfile(filepath)
  85.  
  86.     out = []
  87.  
  88.     for i in range(len(data)):
  89.         d = data[i]
  90.         if d == '':
  91.             continue
  92.         m = mecabme(d)
  93.         out.append(Result(i+1, d, m - db))
  94.  
  95.     out.sort()
  96.  
  97.     for o in out:
  98.         if REMOVEZERO:
  99.             if o.set:
  100.                 print o
  101.         else:
  102.             print o
  103.  
  104. if __name__ == '__main__':
  105.     try:
  106.         #cProfile.run('main(argv[1])')
  107.         main(argv[1])
  108.     except:
  109.         main(SCRATCHFILE)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement