Advertisement
Guest User

fb2info.py

a guest
Mar 19th, 2012
617
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 11.97 KB | None | 0 0
  1. #!/usr/bin/python -Ou
  2. # -*- coding: utf-8 -*-
  3.  
  4. import sys, os, locale, getopt, zipfile, exceptions, time
  5.  
  6. from lxml import etree # this one may fail on UNICODE :( # see http://lxml.de/
  7. # import BeautifulSoup # see http://www.crummy.com/software/BeautifulSoup/bs4/doc/ # may use http://code.google.com/p/html5lib/
  8.  
  9. import sqlite3
  10.  
  11. # CREATE TABLE archive (
  12. #   id integer primary key
  13. # , stamp datetime default(datetime('now'))
  14. # , name text not null unique
  15. # );
  16. # CREATE TABLE file (
  17. #   id integer primary key
  18. # , stamp datetime default(datetime('now'))
  19. # , name text not null
  20. # , archive integer references archive(id) on delete cascade
  21. # , lreid integer
  22. # );
  23.  
  # DDL for the tag dictionary table; db_open() executes it when a probe
  # SELECT against tag_dict fails.
  24. __tag_dict_sql = '''
  25. CREATE TABLE tag_dict (
  26.  id integer primary key
  27. , name text not null unique
  28. );
  29. '''
  30.  
  # DDL for the per-file metadata table; db_open() executes it when a probe
  # SELECT against file_meta fails.  Rows reference file(id) and tag_dict(id).
  31. __file_metas_sql = '''
  32. CREATE TABLE file_meta (
  33.  id integer primary key
  34. , file integer references file(id) on delete cascade
  35. , section integer references tag_dict(id)
  36. , tag integer references tag_dict(id)
  37. , value text
  38. );
  39. '''
  40.  
  # Module-wide sqlite3 connection: set by db_open(), reset to None by db_close().
  41. _db = None
  # Default database path; main() replaces it when -d/--db/--database is given.
  42. _dbfile = 'index.db'
  43.  
  # Opened (and truncated) at import time; run_sql() appends every statement,
  # its arguments, SELECT results and errors to this file.
  44. _save_sql = open('save.sql','w')
  45.  
  def db_open(fname=_dbfile):
    """Open the SQLite database `fname` and make sure the metadata tables exist.

    Sets the module-level connection `_db`; it is left as None when the
    connection cannot be established or a table cannot be created.  The
    file's starting size is stored in `_db_size` (0 for a database that
    does not exist yet) and `_dbfile` is set to `fname`, so that
    db_close() reports on the file that was actually opened.

    `tag_dict` and `file_meta` are created when a probe SELECT against
    them fails; after creating either one the connection is closed and
    opened again.
    """
    global _db, _db_size, _dbfile
    _db = None
    _dbfile = fname
    print('Using database %r' % (fname,))
    try:
      _db_size = os.path.getsize(fname)
    except OSError:
      # A brand-new database file only appears once sqlite3.connect() creates it.
      _db_size = 0
    created = False
    try:
      _db = sqlite3.connect(fname,isolation_level='DEFERRED')
      for table, ddl in (('tag_dict',__tag_dict_sql),('file_meta',__file_metas_sql)):
        # A failing probe is taken to mean that the table is missing.
        ok, rs = run_sql('select count(*) from '+table)
        if ok :
          continue
        db_begin()
        ok, rs = run_sql(ddl)
        if not ok :
          sys.stderr.write('Cannot create table %s\n' % (table,))
          db_close()
          return
        created = True
        print('%s - created: %r' % (table, _db.commit()))

      if created :
        db_close()
        _db = sqlite3.connect(fname,isolation_level='DEFERRED')
        print('Database has been reopen')

      _db.commit()
    except sqlite3.Error as why:
      sys.stderr.write('Cannot connect: %s\n' % (why,))
      _db = None
  86.  
  def db_close():
    """Report how the database changed and close the module connection.

    Writes to stderr the difference between the current size of `_dbfile`
    and the size recorded in `_db_size`, when both are available, followed
    by the connection's total_changes counter.  `_db` is always reset to
    None afterwards; a failure to close is reported instead of being
    silently ignored.  Calling this with no open connection is harmless.
    """
    global _db
    # _db_size only exists once db_open() has run, and the file may be missing.
    size_before = globals().get('_db_size')
    try:
      size_now = os.path.getsize(_dbfile)
    except OSError:
      size_now = None
    if size_before is not None and size_now is not None :
      sys.stderr.write("# Database %r size change: %d bytes\n" % (_dbfile, size_now - size_before))
    if _db is not None :
      sys.stderr.write('# Total changes: %r\n' % (_db.total_changes,))
      try:
        _db.close()
      except sqlite3.Error as why:
        sys.stderr.write('# Cannot close database: %s\n' % (why,))
      finally:
        _db = None
  96.  
  def run_sql(sql,*args):
    """Execute one SQL statement on the module connection `_db`.

    Positional `args` are bound to the statement's ``?`` placeholders.
    Returns a pair ``(ok, rows)``.  ``ok`` is False when sqlite3 raised an
    error, which is reported on stderr.  ``rows`` is the fetchall() result
    for statements that start with SELECT and None for everything else.
    While `_save_sql` is set, the statement, its arguments, SELECT results
    and errors are appended to that log as well.
    """
    log_on = _save_sql is not None
    if log_on :
      _save_sql.write('%r %r\n' % (sql, args))
    cursor = None
    rows = None
    try:
      cursor = _db.cursor()
      if args :
        cursor.execute(sql,args)
      else:
        cursor.execute(sql)
      if sql.lstrip().upper().startswith('SELECT') :
        rows = cursor.fetchall()
        if log_on :
          _save_sql.write('/* %r */\n' % (rows,))
      cursor.close()
    except sqlite3.Error as why:
      sys.stderr.write('SQL %r args %r error: %s\n' % (sql, args, why))
      if log_on :
        _save_sql.write('# error: %s\n' % (why,))
      if cursor is not None :
        # Best effort only: the statement has already failed.
        try:
          cursor.close()
        except:
          pass
      return (False,None)
    return (True,rows)
  125.  
  def db_begin():
    """Start a fresh unit of work by committing whatever is pending on `_db`.

    No explicit BEGIN is issued: db_open() connects with
    isolation_level='DEFERRED', a mode in which the sqlite3 module opens
    transactions implicitly before data-modifying statements.  The
    explicit ``begin transaction`` fallback that used to follow the return
    statement could never run and has been removed.

    Returns the result of the connection's commit().
    """
    return _db.commit()
  132.  
  def db_commit():
    """Commit the current transaction on the module connection `_db`.

    The explicit ``commit transaction`` fallback that used to follow the
    return statement could never run and has been removed.

    Returns the result of the connection's commit().
    """
    return _db.commit()
  139.  
  def db_rollback():
    """Announce a rollback on stderr, then roll back the connection `_db`.

    Returns whatever the connection's rollback() returns.
    """
    sys.stderr.write('### ROLLBACK ###\n')
    outcome = _db.rollback()
    return outcome
  143.  
  def db_clear_file(archname,filename):
    """Delete every file_meta row recorded for `filename` inside `archname`.

    The file id is looked up through db_resolve_file().  Returns the
    success flag reported by run_sql() for the DELETE statement.
    """
    target = db_resolve_file(filename,archname)
    outcome = run_sql("delete from file_meta where file=?",target)
    return outcome[0]
  150.  
  # Under the key None: the (name, id) pair of the archive currently being
  # processed.  Under each archive name: a dict mapping file name -> file id.
  __file_id_cache = {None:(None,None)}
  def db_resolve_file(filename,archname=None):
    """Return the `file.id` of `filename` in the current archive, or None.

    When `archname` is given it becomes the current archive for this and
    later calls.  If it cannot be resolved the current archive is cleared
    rather than silently keeping the previous one, so files are never
    matched against the wrong archive.  Without `archname` the archive
    selected by an earlier call is used.

    None is returned when no archive is selected, when the query fails,
    or when the file is not present in the `file` table.  Successful
    lookups are cached per archive.
    """
    if archname is not None :
      arch_id = db_resolve_archive(archname)
      if arch_id is not None :
        __file_id_cache[None] = (archname,arch_id)
      else:
        __file_id_cache[None] = (None,None)
    arch, arch_id = __file_id_cache[None]
    if arch is None :
      # The None key holds the archive slot itself, not a per-archive cache.
      return None
    cache = __file_id_cache.setdefault(arch,{})
    if filename in cache :
      return cache[filename]
    ok, rows = run_sql("select id from file where name=? and archive=?",filename,arch_id)
    if ok and rows :
      cache[filename] = rows[0][0]
      return cache[filename]
    return None
  171.  
  # Archive name -> archive id, filled in by db_resolve_archive().
  __arch_id_cache = {}
  def db_resolve_archive(filename):
    """Return the `archive.id` whose name is `filename`, or None.

    None is returned both when the query fails and when no such archive
    is recorded.  An empty result used to raise IndexError although
    callers test the result against None.  Successful lookups are cached.
    """
    if filename in __arch_id_cache :
      return __arch_id_cache[filename]
    ok, rows = run_sql("select id from archive where name=?",filename)
    if ok and rows :
      __arch_id_cache[filename] = rows[0][0]
      return rows[0][0]
    return None
  182.  
  def db_add_tag(tag):
    """Insert `tag` into tag_dict and return run_sql()'s success flag."""
    return run_sql("insert into tag_dict(name) values(?)",tag)[0]
  187.  
  # Tag name -> tag_dict id, filled in by db_resolve_tag().
  __tag_id_cache = {}
  def db_resolve_tag(tag):
    """Return the tag_dict id of `tag`, adding the tag when it is unknown.

    Ids are cached after the first successful lookup.  None is returned
    when the lookup query fails or the tag cannot be inserted.
    """
    try:
      return __tag_id_cache[tag]
    except KeyError:
      pass
    ok, rows = run_sql("select id from tag_dict where name=?",tag)
    if not ok :
      return None
    if rows :
      tag_id = rows[0][0]
      __tag_id_cache[tag] = tag_id
      return tag_id
    if not db_add_tag(tag) :
      return None
    # The tag has just been inserted: look it up again to learn its id.
    return db_resolve_tag(tag)
  202.  
  def db_update_meta(file_id,top_id,tag_id,value):
    """Insert one (file, section, tag, value) row into file_meta.

    Returns the success flag reported by run_sql().
    """
    outcome = run_sql(
      "insert into file_meta(file,section,tag,value) values (?,?,?,?)",
      file_id, top_id, tag_id, value
    )
    return outcome[0]
  207.  
  # Set to True by main() for -u/--update; handle() then skips an archive
  # when every one of its members already has rows in file_meta.
  208. doUpdate=False
  209.  
  # Binary (IEC) unit sizes.
  Ki1 = 1024
  Mi1 = Ki1 * Ki1
  Gi1 = Mi1 * Ki1

  # Decimal (SI) unit sizes.
  K1 = 1000
  M1 = K1 * K1
  G1 = M1 * K1

  def bKMGi(n):
    """Format `n` as a whole number of binary units: '3Gi', '5Mi', '1Ki'.

    `n` is truncated to an integer first; values below 1024 are returned
    as plain digits.  The result is rounded down to the chosen unit.
    """
    n = int(n)
    for limit, shift, suffix in ((Gi1,30,'Gi'),(Mi1,20,'Mi'),(Ki1,10,'Ki')):
      if n >= limit :
        return str(n>>shift)+suffix
    return str(n)

  def bKMG(n):
    """Format `n` as a whole number of decimal units: '7G', '2M', '1k'.

    `n` is truncated to an integer first; values below 1000 are returned
    as plain digits.  Floor division keeps the result a whole number
    regardless of how the interpreter defines ``/`` for integers.
    """
    n = int(n)
    for unit, suffix in ((G1,'G'),(M1,'M'),(K1,'k')):
      if n >= unit :
        return str(n//unit)+suffix
    return str(n)
  231.  
  def mangle_tag(tag): # some "{///url///}name" form
    """Return the local part of an element tag, without its namespace.

    A tag of the form ``{namespace}name`` yields ``name`` (everything
    after the last ``}``); a tag that does not start with ``{`` is
    returned unchanged.  Anything that is not a `str` is reported on
    stderr and mapped to ''.
    """
    if not isinstance(tag,str) :
      sys.stderr.write('mangle_tag(%r): non-text tag\n' % (tag,))
      return ''
    if tag.startswith('{') :
      return tag.split('}')[-1]
    return tag
  245.  
  def mangle_text(text):
    """Collapse every run of whitespace in `text` into a single space.

    Leading and trailing whitespace is dropped.  A false value (None or
    the empty string) yields ''.
    """
    if text :
      words = text.strip().split()
      return ' '.join(words)
    return ''
  249.  
  def handle_tag(filename,top,this,value):
    """Record one metadata value for `filename`.

    `top` and `this` are the section and tag names; both are resolved to
    tag_dict ids.  An empty-string `value` is stored as NULL.  Returns the
    success flag of db_update_meta().
    """
    ids = (db_resolve_file(filename), db_resolve_tag(top), db_resolve_tag(this))
    stored = None if value == '' else value
    return db_update_meta(ids[0],ids[1],ids[2],stored)
  257.  
  def dig(filename,fb2):
    """Parse one FB2 document and record the metadata of its description.

    `fb2` is the raw document.  For every grandchild of the root's
    ``description`` element, handle_tag() is called with the child's tag
    as the section, the grandchild's tag as the tag name and the
    grandchild's whitespace-normalised text as the value.  Progress goes
    to stdout and problems to stderr.

    Returns ``len(fb2)``, also when the XML is malformed or has no
    description.  Any parse error other than XMLSyntaxError is reported
    and re-raised.
    """
    fb2sz = len(fb2)
    # Python 2 print statement: the trailing comma keeps the verdict on this line.
    print('+ %d bytes to dig. %r' % (fb2sz, fb2[:256].split('>')[0] + '>')),
    try:
      root = etree.fromstring(fb2)
    except etree.XMLSyntaxError:
      print('- bad XML')
      sys.stderr.write('%r - bad XML\n' % (filename,))
      return fb2sz
    except Exception as why:
      print('- Shit happened: %r' % (why,))
      sys.stderr.write('%r - shit happened: %r\n' % (filename, why))
      raise
    # Compare namespace-stripped names.  The tag is passed to mangle_tag()
    # as it is because comment and processing-instruction nodes do not have
    # a string tag and must not be sliced.
    description = None
    for child in root:
      if mangle_tag( child.tag ) == 'description' :
        description = child
        break
    if description is None :
      print('- crap, no description?')
      sys.stderr.write('%r - crap, no description?\n' % (filename,))
      return fb2sz
    print('ok,'),
    for section in description:
      section_tag = mangle_tag( section.tag )
      for item in section:
        item_tag = mangle_tag( item.tag )
        tag_v = mangle_text(item.text)
        if not handle_tag(filename,section_tag,item_tag,tag_v):
          sys.stderr.write("\tCannot update %r for %r to %r\n" % (section_tag+'/'+item_tag, filename, tag_v))
    print('changes: %r' % (_db.total_changes,))

    return fb2sz
  301.  
  def handle( fname ):
    """Index every member of the ZIP archive `fname`.

    In update mode (`doUpdate`) the archive is skipped when the number of
    its members equals the number of its files that already have rows in
    file_meta.  Otherwise, for each member, the old metadata is cleared
    and the member is passed to dig(), with a commit after each step.

    Returns ``(seconds, bytes, books)``.  ``(0.0, 0, 0)`` is returned when
    the archive is skipped or cannot be processed as a ZIP file, so
    callers can always unpack the result.  Any other exception is
    reported on stderr and re-raised.
    """
    print(fname)
    try:
      with zipfile.ZipFile(fname,'r',allowZip64=True) as zf :
        members = zf.namelist()
        if doUpdate :
          print('check for update...')
          have_files = len(members)
          print('have_files=%r' % (have_files,))
          arc = db_resolve_archive(fname)
          print('arc=%r' % (arc,))
          if arc is not None :
            ok, done_files = run_sql("""
             select count(*)
               from (
                 select distinct file
                   from file_meta
                   where file in (select id from file where archive=?)
               )""",arc)
            print('ok=%r done_files=%r' % (ok, done_files))
            if ok : done_files = done_files[0][0]
            if ok and (have_files == done_files):
              sys.stderr.write('%r : no update needed (%d/%d)\n' % (fname, have_files, done_files))
              return (0.0,0,0)
            print('Replacing %r' % (fname,))
        total_size = 0
        total_count = 0
        t0 = time.time()
        for zn in members:
          # Python 2 print statement: dig() continues on the same output line.
          print(repr(zn)),
          fb = zf.read(zn)
          db_begin()
          db_clear_file(fname,zn)
          db_commit() ; db_begin()
          total_size += dig(zn,fb)
          db_commit()
          total_count += 1
        dt = time.time() - t0
      # An empty or very quickly processed archive can take 0 seconds on a
      # coarse clock; use 1 second for the rate instead of dividing by zero.
      rate_dt = dt
      if rate_dt <= 0.00000001 : rate_dt = 1.0
      sys.stderr.write('%r : %d bytes (%s) in %d books were processed in %.0f seconds (at %sBPS)\n' % (
        fname, total_size, bKMGi(total_size), total_count, dt, bKMG(float(total_size)/rate_dt),
      ))
      return (dt,total_size,total_count)
    except zipfile.BadZipfile:
      sys.stderr.write('Bad ZIP file %r\n' % (fname,))
    except zipfile.LargeZipFile:
      sys.stderr.write('Large ZIP file %r cannot be processed here.\n' % (fname,))
    except Exception as why:
      sys.stderr.write('Shit happened on %r : %r\n' % (fname, why))
      raise
    return (0.0,0,0)
  352.  
  def main():
    """Command-line entry point: index the FB2 ZIP archives given as arguments.

    Options: -h/-?/--help prints usage; -d/--db/--database selects the
    database file (stored in `_dbfile`); -u/--update sets `doUpdate`.

    Returns the process exit status: 0 on success or after printing the
    usage, 1 for a bad command line or when the database cannot be
    opened.  On the way out any connection that is still open is
    committed and closed.
    """
    global _dbfile, doUpdate
    try:
      try:
        opts, args = getopt.getopt(
          sys.argv[1:],
          '?hd:u',
          ('help','db=','database=','update')
        )
      except getopt.error as why:
        sys.stderr.write('%s : %s\n' % (sys.argv[0], why))
        return 1
      for o,v in opts :
        if o in ('-h','-?','--help'):
          print('%s [-u|--update] [-d|--db=|--database=<database>] <fb2-zip-files...>' % (sys.argv[0],))
          return 0
        elif o in ('-d','--db','--database'):
          _dbfile = v[:]
        elif o in ('-u','--update'):
          doUpdate = True
      total_time, total_size, total_count = 0.0, 0, 0
      db_open(_dbfile)
      if _db is None :
        # db_open() has already reported why; nothing can be indexed without it.
        return 1
      for arg in args :
        result = handle( arg )
        if result is None :
          # Nothing was processed for an archive that could not be read.
          continue
        t, s, c = result
        total_time += t
        total_size += s
        total_count+= c
      db_close()
      if total_time <= 0.00000001 : total_time = 1.0
      sys.stderr.write('TOTAL: %s bytes in %d books (%d archives) were processed for %.0f seconds (%sBps, %.0f books/s).\n'%(
        bKMGi(total_size), total_count, len(args), total_time,
        bKMG(float(total_size)/total_time), float(total_count)/total_time,
      ))
      return 0
    finally:
      # Only reached with an open connection when something above failed
      # part-way: keep what was indexed so far and release the database.
      if _db is not None :
        try:
          _db.commit()
          db_close()
        except Exception as why:
          sys.stderr.write('Cannot close database cleanly: %r\n' % (why,))
  397.  
  def __fix_io_encoding(last_resort_default='UTF-8'):
    """Re-execute the script with PYTHONIOENCODING set if a stream lacks an encoding.

    Nothing happens when stdin, stdout and stderr all report an encoding.
    Otherwise an encoding is picked from, in order: the locale's preferred
    encoding, the filesystem encoding, stdin's encoding, and finally
    `last_resort_default`.  An already set PYTHONIOENCODING is kept as it
    is.  The current process is then replaced via os.execvpe() with
    sys.argv, so this call does not return in that case.
    """
    import sys
    unset = [stream for stream in (sys.stdin,sys.stdout,sys.stderr) if stream.encoding is None]
    if not unset :
      return

    import os

    def preferred():
      import locale
      return locale.getpreferredencoding()

    chosen = None
    probes = (
      preferred,
      lambda: sys.getfilesystemencoding(),
      lambda: sys.stdin.encoding,
    )
    for probe in probes :
      # Any failure of a probe just means "no answer from this source".
      try:
        chosen = probe()
      except:
        chosen = None
      if chosen is not None :
        break
    if chosen is None :
      chosen = last_resort_default
    os.environ['PYTHONIOENCODING'] = os.environ.get("PYTHONIOENCODING",chosen)
    os.execvpe(sys.argv[0],sys.argv,os.environ)
  417.  
  # Script entry point.  First make sure the standard streams have an
  # encoding (the helper re-executes the script with PYTHONIOENCODING set
  # when one of them does not), drop the helper from the module namespace,
  # then exit with main()'s return value as the process status.
  418. if __name__=='__main__' :
  419.   __fix_io_encoding() ; del __fix_io_encoding
  420.   sys.exit( main() )
  421. # vim:ai:sts=2:et
  422. # EOF #
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement