#!/usr/bin/python -Ou
# -*- coding: utf-8 -*-

import sys, os, locale, getopt, zipfile, exceptions, time

from lxml import etree  # this one may fail on UNICODE :(
# see http://lxml.de/
# import BeautifulSoup
# see http://www.crummy.com/software/BeautifulSoup/bs4/doc/
# may use http://code.google.com/p/html5lib/

import sqlite3

# The two tables below are expected to exist already; they are referenced by
# the foreign keys in file_meta:
#
# CREATE TABLE archive (
#     id integer primary key
#   , stamp datetime default(datetime('now'))
#   , name text not null unique
# );
# CREATE TABLE file (
#     id integer primary key
#   , stamp datetime default(datetime('now'))
#   , name text not null
#   , archive integer references archive(id) on delete cascade
#   , lreid integer
# );

__tag_dict_sql = '''
CREATE TABLE tag_dict (
    id integer primary key
  , name text not null unique
);
'''

__file_metas_sql = '''
CREATE TABLE file_meta (
    id integer primary key
  , file integer references file(id) on delete cascade
  , section integer references tag_dict(id)
  , tag integer references tag_dict(id)
  , value text
);
'''

_db = None
_dbfile = 'index.db'
_save_sql = open('save.sql','w')  # every executed statement is mirrored here for debugging

def db_open(fname=_dbfile):
  global _db, _db_size
  _db = None
  reopen = False
  try:
    print 'Using database',`fname`
    # the file may not exist yet on the first run
    _db_size = os.path.getsize(fname) if os.path.exists(fname) else 0
    _db = sqlite3.connect(fname,isolation_level='DEFERRED')
    ok, rs = run_sql('select count(*) from tag_dict')
    if not ok :
      db_begin()
      ok, rs = run_sql(__tag_dict_sql)
      if not ok :
        print >>sys.stderr,'Cannot create table tag_dict'
        db_close()
        return
      reopen = True
      print 'tag_dict - created:',`_db.commit()`
    ok, rs = run_sql('select count(*) from file_meta')
    if not ok :
      db_begin()
      ok, rs = run_sql(__file_metas_sql)
      if not ok :
        print >>sys.stderr,'Cannot create table file_meta'
        db_close()
        return
      reopen = True
      print 'file_meta - created:',`_db.commit()`
    if reopen :
      db_close()
      _db = sqlite3.connect(fname,isolation_level='DEFERRED')
      print 'Database has been reopened'
    _db.commit()
  except sqlite3.Error,why:
    print >>sys.stderr, 'Cannot connect:', why
    _db = None

def db_close():
  global _db, _db_size
  db_diff = os.path.getsize(_dbfile) - _db_size
  print >>sys.stderr,"# Database",`_dbfile`,'size change:',db_diff,'bytes'
  if _db is not None :
    print >>sys.stderr,'# Total changes:', `_db.total_changes`
    try:
      _db.close()
    except:
      pass
  _db = None

def run_sql(sql,*args):
  c = None
  Result = None
  # print 'run_sql('+`sql`+','+`args`+'):',
  if _save_sql is not None :
    print >>_save_sql,`sql`,`args`
  try:
    c = _db.cursor()
    if not args :
      c.execute(sql)
    else:
      c.execute(sql,args)
    if sql.lstrip().upper().startswith('SELECT') :
      Result = c.fetchall()
      if _save_sql is not None :
        print >>_save_sql,'/*',`Result`,'*/'
      # print `Result`
    c.close()
    # print 'run_sql('+`sql`+','+`args`+'):',`(True,Result)` # ,'isolation=',`_db.isolation_level`
    return (True,Result)
  except sqlite3.Error,why:
    print >>sys.stderr, 'SQL',`sql`,'args',`args`,'error:', why
    if _save_sql is not None :
      print >>_save_sql,'# error:',why
    if c is not None :
      try:
        c.close()
      except:
        pass
    return (False,None)

def db_begin():
  # with isolation_level='DEFERRED' sqlite3 issues BEGIN itself; the explicit
  # transaction code below is unreachable and kept for reference only.
  return _db.commit()
  if not run_sql('begin transaction')[0] :
    print >>sys.stderr,'Cannot BEGIN transaction'
    raise exceptions.RuntimeError('no BEGIN')

def db_commit():
  return _db.commit()
  if not run_sql('commit transaction')[0] :
    print >>sys.stderr,'Cannot COMMIT transaction'
    raise exceptions.RuntimeError('no COMMIT')

def db_rollback():
  print >>sys.stderr,'### ROLLBACK ###'
  return _db.rollback()

def db_clear_file(archname,filename):
  file_id = db_resolve_file(filename,archname)
  # print 'Clearing file',`filename`,'id',file_id
  sql = "delete from file_meta where file=?"
  ok, rs = run_sql(sql,file_id)
  return ok
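
# Name -> id lookup caches.  db_resolve_file() is called once per metadata row,
# so the resolvers below memoise their results in module-level dicts:
#   __arch_id_cache : archive name -> archive id
#   __tag_id_cache  : tag name     -> tag id
#   __file_id_cache : archive name -> {file name: file id}; the special key None
#                     holds the (name, id) pair of the archive currently being
#                     processed, so db_resolve_file() can later be called with
#                     a file name only.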

__file_id_cache = {None:(None,None)}
def db_resolve_file(filename,archname=None):
  if archname is not None :
    ai = db_resolve_archive(archname)
    if ai is not None :
      __file_id_cache[None] = (archname,ai)
  arch, ai = __file_id_cache[None]
  cache = __file_id_cache.get(arch,None)
  if cache is None :
    cache = {}
    __file_id_cache[arch] = {}
  else:
    if filename in cache:
      return cache[filename]
  sql = "select id from file where name=? and archive=?"
  ok, rs = run_sql(sql,filename,ai)
  if ok :
    cache[filename] = rs = rs[0][0]
    __file_id_cache[arch].update(cache)
  return rs

__arch_id_cache = {}
def db_resolve_archive(filename):
  if filename in __arch_id_cache :
    return __arch_id_cache[filename]
  sql = "select id from archive where name=?"
  ok, rs = run_sql(sql,filename)
  if ok :
    rs = rs[0][0]
    __arch_id_cache[filename] = rs
  return rs

def db_add_tag(tag):
  sql = "insert into tag_dict(name) values(?)"
  ok, rs = run_sql(sql,tag)
  return ok

__tag_id_cache = {}
def db_resolve_tag(tag):
  if tag in __tag_id_cache:
    return __tag_id_cache[tag]
  sql = "select id from tag_dict where name=?"
  ok, rs = run_sql(sql,tag)
  if ok :
    if rs :
      rs = rs[0][0]
      __tag_id_cache[tag] = rs
      return rs
    else:
      if db_add_tag(tag) :
        return db_resolve_tag(tag)

def db_update_meta(file_id,top_id,tag_id,value):
  sql = "insert into file_meta(file,section,tag,value) values (?,?,?,?)"
  ok, rs = run_sql(sql,file_id,top_id,tag_id,value)
  return ok

doUpdate = False

Ki1 = 1024
Mi1 = Ki1 * Ki1
Gi1 = Mi1 * Ki1
K1 = 1000
M1 = K1 * K1
G1 = M1 * K1

def bKMGi(n):
  n = int(n)
  if n >= Gi1 : return str(n>>30)+'Gi'
  if n >= Mi1 : return str(n>>20)+'Mi'
  if n >= Ki1 : return str(n>>10)+'ki'
  return str(n)

def bKMG(n):
  n = int(n)
  if n >= G1 : return str(n/G1)+'G'
  if n >= M1 : return str(n/M1)+'M'
  if n >= K1 : return str(n/K1)+'k'
  return str(n)

def mangle_tag(tag): # some "{///url///}name" form
  if type(tag) != type('x') :
    print >>sys.stderr,'mangle_tag('+`tag`+'): non-text tag'
    return ''
  try:
    if not tag :
      return ''
    if tag.startswith('{') :
      return tag.split('}')[-1]
    return tag
  except exceptions.Exception, why:
    print >>sys.stderr,'mangle_tag('+`tag`+'):',`why`
    raise

def mangle_text(text):
  if not text :
    return ''
  return ' '.join(text.strip().split())

def handle_tag(filename,top,this,value):
  # print '\t*\t',`top+'/'+this`+'=<'+value+'>'
  file_id = db_resolve_file(filename)
  top_id = db_resolve_tag(top)
  this_id = db_resolve_tag(this)
  if value == '' :
    value = None
  return db_update_meta(file_id,top_id,this_id,value)
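
# dig() below assumes the usual FB2 layout (sketched here for orientation; the
# element names follow the FictionBook 2.x schema and are illustrative only):
#
#   <FictionBook xmlns="http://www.gribuser.ru/xml/fictionbook/2.0">
#     <description>
#       <title-info>
#         <book-title>...</book-title>
#         <lang>...</lang>
#       </title-info>
#       <document-info>...</document-info>
#       <publish-info>...</publish-info>
#     </description>
#     <body>...</body>
#   </FictionBook>
#
# Every section/tag pair under <description> (e.g. title-info/book-title) ends
# up as one row in file_meta via handle_tag().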

def dig(filename,fb2):
  fb2sz = len(fb2)
  print '+',fb2sz,'bytes to dig.', `fb2[:256].split('>')[0] + '>'`,
  try:
    e = etree.fromstring(fb2)
  except exceptions.Exception, why:
    if isinstance(why,etree.XMLSyntaxError) :
      print '- bad XML'
      print >>sys.stderr,`filename`,'- bad XML'
    else:
      print '- Shit happened:',`why`
      print >>sys.stderr,`filename`,'- shit happened:',`why`
      raise
    return fb2sz
  d = e.find('description')
  # print 'Top level elements:',len(e), 'e='+`e`, 'd='+`d`
  for i in e:
    x_tag = mangle_tag( i.tag[:] )
    if x_tag == 'description' :
      d = i
      break
  else:
    print '- crap, no description?'
    print >>sys.stderr,`filename`,'- crap, no description?'
    return fb2sz
  print 'ok,',
  n = e = 0
  for i in d:
    i_tag = mangle_tag( i.tag )
    # print '\t*',`i_tag`+'=<'+mangle_text(i.text)+'>'
    for j in i:
      j_tag = mangle_tag( j.tag )
      x_tag = i_tag+'/'+j_tag
      tag_v = mangle_text(j.text)
      if handle_tag(filename,i_tag,j_tag,tag_v):
        n += 1
      else:
        e += 1
        print >>sys.stderr,"\tCannot update",`x_tag`,'for',`filename`,'to',`tag_v`
  # print "\tTags handled:",n,'Errors:',e
  print 'changes:', `_db.total_changes`
  return fb2sz

def handle( fname ):
  print fname
  try:
    with zipfile.ZipFile(fname,'r',allowZip64=True) as zf :
      if doUpdate :
        print 'check for update...'
        have_files = len(zf.namelist())
        print 'have_files='+`have_files`
        arc = db_resolve_archive(fname)
        print 'arc='+`arc`
        if arc is not None :
          ok, done_files = run_sql("""
            select count(*) from (
              select distinct file from file_meta
              where file in (select id from file where archive=?)
            )""",arc)
          print 'ok='+`ok`,'done_files='+`done_files`
          if ok :
            done_files = done_files[0][0]
          if ok and (have_files == done_files):
            print >>sys.stderr,`fname`,': no update needed (%d/%d)'%(have_files,done_files)
            return (0.0,0,0)
        print 'Replacing',`fname`
      total_size = 0
      total_count = 0
      t0 = time.time()
      for zn in zf.namelist():
        print `zn`,
        fb = zf.read(zn)
        db_begin()
        db_clear_file(fname,zn)
        db_commit() ; db_begin()
        total_size += dig(zn,fb)
        db_commit()
        total_count += 1
        # if total_count >= 10 : break
      t1 = time.time()
      zf.close()
    dt = t1 - t0
    if dt <= 0.0 :
      dt = 1.0  # avoid division by zero on sub-resolution runs
    print >>sys.stderr,`fname`,':',total_size,'bytes', '('+bKMGi(total_size)+')', 'in', total_count, 'books were processed in %.0f seconds (at %sBPS)'%(dt,bKMG(float(total_size)/dt))
    return (dt,total_size,total_count)
  except exceptions.Exception, why:
    if isinstance(why,zipfile.BadZipfile) :
      print >>sys.stderr,'Bad ZIP file', `fname`
    elif isinstance(why,zipfile.LargeZipFile) :
      print >>sys.stderr,'Large ZIP file', `fname`, 'cannot be processed here.'
    else:
      print >>sys.stderr,'Shit happened on', `fname`,':',`why`
      raise
    # report skipped archives as empty so main() can still unpack the result
    return (0.0,0,0)
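
# Command-line interface (a sketch; invoke the script under whatever name this
# file is saved as, the archive name below is only an example):
#
#   python fb2index.py [-u|--update] [-d DB|--db=DB|--database=DB] archive.zip ...
#
#   -d / --db / --database   path of the SQLite index (default: index.db)
#   -u / --update            skip archives whose books are already fully indexed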

def main():
  try:
    try:
      opts, args = getopt.getopt( sys.argv[1:], '?hd:u', ('help','db=','database=','update') )
    except getopt.error, why:
      print >>sys.stderr, sys.argv[0],':',why
      return 1
    else:
      for o,v in opts :
        if o in ('-h','-?','--help'):
          print sys.argv[0],'[-u|--update] [-d DB|--db=DB|--database=DB] archive.zip ...'
          return 0
        elif o in ('-d','--db','--database'):
          global _dbfile
          _dbfile = v[:]
        elif o in ('-u','--update'):
          global doUpdate
          doUpdate = True

    total_time, total_size, total_count = 0.0, 0L, 0
    db_open(_dbfile)
    for arg in args :
      t, s, c = handle( arg )
      total_time += t
      total_size += s
      total_count += c
    db_close()
    if total_time <= 0.00000001 :
      total_time = 1.0
    print >>sys.stderr,'TOTAL: %s bytes in %d books (%d archives) were processed for %.0f seconds (%sBps, %.0f books/s).'%(
        bKMGi(total_size), total_count, len(args), total_time,
        bKMG(float(total_size)/total_time), float(total_count)/total_time,
      )
    return 0
  finally:
    try:
      # db_rollback()
      _db.commit()
      db_close()
    except:
      pass

def __fix_io_encoding(last_resort_default='UTF-8'):
  # Re-exec the interpreter with PYTHONIOENCODING set whenever one of the
  # standard streams has no encoding (e.g. when output is piped).
  import sys
  if [x for x in (sys.stdin,sys.stdout,sys.stderr) if x.encoding is None] :
    import os
    defEnc = None
    if defEnc is None :
      try:
        import locale
        defEnc = locale.getpreferredencoding()
      except:
        defEnc = None
    if defEnc is None :
      try:
        defEnc = sys.getfilesystemencoding()
      except:
        defEnc = None
    if defEnc is None :
      try:
        defEnc = sys.stdin.encoding
      except:
        defEnc = None
    if defEnc is None :
      defEnc = last_resort_default
    os.environ['PYTHONIOENCODING'] = os.environ.get("PYTHONIOENCODING",defEnc)
    os.execvpe(sys.argv[0],sys.argv,os.environ)

if __name__=='__main__' :
  __fix_io_encoding() ; del __fix_io_encoding
  sys.exit( main() )

# vim:ai:sts=2:et
# EOF #