Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/python -Ou
- # -*- coding: utf-8 -*-
- import sys, os, locale, getopt, zipfile, exceptions, time
- from lxml import etree # this one may fail on UNICODE :( # see http://lxml.de/
- # import BeautifulSoup # see http://www.crummy.com/software/BeautifulSoup/bs4/doc/ # may use http://code.google.com/p/html5lib/
- import sqlite3
- # CREATE TABLE archive (
- # id integer primary key
- # , stamp datetime default(datetime('now'))
- # , name text not null unique
- # );
- # CREATE TABLE file (
- # id integer primary key
- # , stamp datetime default(datetime('now'))
- # , name text not null
- # , archive integer references archive(id) on delete cascade
- # , lreid integer
- # );
# DDL for the tag-name dictionary: one row per distinct section/tag name.
__tag_dict_sql = '''
CREATE TABLE tag_dict (
id integer primary key
, name text not null unique
);
'''
# DDL for per-file metadata: one row per (section, tag, value) triple
# extracted from a book's <description> element.
__file_metas_sql = '''
CREATE TABLE file_meta (
id integer primary key
, file integer references file(id) on delete cascade
, section integer references tag_dict(id)
, tag integer references tag_dict(id)
, value text
);
'''
_db = None            # module-wide sqlite3 connection; set by db_open(), cleared by db_close()
_dbfile = 'index.db'  # default database path; overridden by -d/--db in main()
# SQL trace log, opened at import time and kept open for the process
# lifetime; run_sql() mirrors every statement/result/error into it.
_save_sql = open('save.sql','w')
def db_open(fname=_dbfile):
    """Open (and lazily initialize) the sqlite index database.

    Connects to *fname*, creates the tag_dict and file_meta tables on
    first use, and reopens the connection after DDL so the fresh schema
    is re-read.  Sets the module globals ``_db`` (connection, or None on
    failure) and ``_db_size`` (file size at open time, used by
    db_close() for the size-diff report).
    """
    global _db, _db_size
    _db = None
    reopen = False
    try:
        sys.stdout.write('Using database %r\n' % (fname,))
        # BUGFIX: measure the file actually being opened (was _dbfile),
        # and tolerate a database file that does not exist yet.
        try:
            _db_size = os.path.getsize(fname)
        except OSError:
            _db_size = 0
        _db = sqlite3.connect(fname, isolation_level='DEFERRED')
        ok, rs = run_sql('select count(*) from tag_dict')
        if not ok:
            db_begin()
            ok, rs = run_sql(__tag_dict_sql)
            if not ok:
                sys.stderr.write('Cannot create table tag_dict\n')
                db_close()
                return
            reopen = True
            sys.stdout.write('tag_dict - created: %r\n' % (_db.commit(),))
        ok, rs = run_sql('select count(*) from file_meta')
        if not ok:
            db_begin()
            ok, rs = run_sql(__file_metas_sql)
            if not ok:
                sys.stderr.write('Cannot create table file_meta\n')
                db_close()
                return
            reopen = True
            sys.stdout.write('file_meta - created: %r\n' % (_db.commit(),))
        if reopen:
            # reconnect so the freshly created schema is visible
            db_close()
            _db = sqlite3.connect(fname, isolation_level='DEFERRED')
            sys.stdout.write('Database has been reopen\n')
        _db.commit()
    except sqlite3.Error as why:
        sys.stderr.write('Cannot connect: %s\n' % (why,))
        _db = None
def db_close():
    """Report database growth and total changes, then close the connection.

    Best-effort shutdown helper: sizing or closing errors are swallowed
    so teardown never fails.  Resets the global ``_db`` to None.
    """
    global _db, _db_size
    try:
        # BUGFIX: don't crash when the db file is missing or when
        # db_open() never got far enough to set _db_size.
        db_diff = os.path.getsize(_dbfile) - _db_size
    except (OSError, NameError):
        db_diff = 0
    sys.stderr.write("# Database %r size change: %s bytes\n" % (_dbfile, db_diff))
    if _db is not None:
        sys.stderr.write('# Total changes: %r\n' % (_db.total_changes,))
        try:
            _db.close()
        except sqlite3.Error:
            pass
        _db = None
def run_sql(sql, *args):
    """Execute one SQL statement against the global connection ``_db``.

    Positional *args* are bound as SQL parameters.  SELECT statements
    fetch all rows.  Every statement (and its result or error) is
    mirrored to the _save_sql trace file when enabled.

    Returns (True, rows_or_None) on success, (False, None) on any
    sqlite3 error.
    """
    cur = None
    result = None
    if _save_sql is not None:
        _save_sql.write('%r %r\n' % (sql, args))
    try:
        cur = _db.cursor()
        if args:
            cur.execute(sql, args)
        else:
            cur.execute(sql)
        if sql.lstrip().upper().startswith('SELECT'):
            result = cur.fetchall()
            if _save_sql is not None:
                _save_sql.write('/* %r */\n' % (result,))
        return (True, result)
    except sqlite3.Error as why:
        sys.stderr.write('SQL %r args %r error: %s\n' % (sql, args, why))
        if _save_sql is not None:
            _save_sql.write('# error: %s\n' % (why,))
        return (False, None)
    finally:
        # BUGFIX: close the cursor on every exit path, including
        # non-sqlite3 exceptions which previously leaked it.
        if cur is not None:
            try:
                cur.close()
            except sqlite3.Error:
                pass
def db_begin():
    """Start a transaction boundary.

    With isolation_level='DEFERRED', sqlite3 begins a transaction
    implicitly on the first statement, so this only needs to flush any
    previous pending work; commit() serves as that synchronisation
    point.  (An explicit 'begin transaction' variant below the return
    was unreachable dead code and has been removed.)
    """
    return _db.commit()
def db_commit():
    """Commit the pending transaction on the global connection.

    (An explicit 'commit transaction' SQL variant below the return was
    unreachable dead code and has been removed.)
    """
    return _db.commit()
def db_rollback():
    """Roll back the pending transaction, loudly announcing it on stderr."""
    sys.stderr.write('### ROLLBACK ###\n')
    return _db.rollback()
def db_clear_file(archname, filename):
    """Delete every stored metadata row for *filename* inside *archname*."""
    fid = db_resolve_file(filename, archname)
    ok, _unused = run_sql("delete from file_meta where file=?", fid)
    return ok
# Cache of file ids, keyed by archive name; the special None key holds the
# (name, id) of the archive currently in effect.
__file_id_cache = {None: (None, None)}
def db_resolve_file(filename, archname=None):
    """Return the file.id for *filename*, with per-archive caching.

    When *archname* is given (and resolvable) it becomes the current
    archive; otherwise the archive remembered from the previous call is
    reused.  Returns None when the row does not exist or no archive is
    in effect.
    """
    if archname is not None:
        ai = db_resolve_archive(archname)
        if ai is not None:
            __file_id_cache[None] = (archname, ai)
    arch, ai = __file_id_cache[None]
    if arch is None:
        # BUGFIX: no archive ever resolved -- the old code fetched the
        # (None, None) sentinel tuple as "cache" and crashed on insert.
        return None
    cache = __file_id_cache.setdefault(arch, {})
    if filename in cache:
        return cache[filename]
    ok, rs = run_sql("select id from file where name=? and archive=?",
                     filename, ai)
    if ok:
        # BUGFIX: an empty result set used to raise IndexError here.
        rs = rs[0][0] if rs else None
        if rs is not None:
            cache[filename] = rs
    return rs
__arch_id_cache = {}  # archive name -> archive.id
def db_resolve_archive(filename):
    """Return archive.id for *filename* (memoised), or None when absent.

    handle() relies on a None return for unknown archives; the original
    raised IndexError on an empty result set instead.
    """
    if filename in __arch_id_cache:
        return __arch_id_cache[filename]
    ok, rs = run_sql("select id from archive where name=?", filename)
    if ok:
        # BUGFIX: guard the empty result set (unknown archive).
        rs = rs[0][0] if rs else None
        if rs is not None:
            __arch_id_cache[filename] = rs
    return rs
def db_add_tag(tag):
    """Insert *tag* into tag_dict; return True on success."""
    ok, _unused = run_sql("insert into tag_dict(name) values(?)", tag)
    return ok
__tag_id_cache = {}  # tag name -> tag_dict.id
def db_resolve_tag(tag):
    """Return tag_dict.id for *tag*, inserting it on first sight.

    Resolved ids are memoised in __tag_id_cache.  Returns None when
    both the lookup and the insert fail.
    """
    try:
        return __tag_id_cache[tag]
    except KeyError:
        pass
    ok, rows = run_sql("select id from tag_dict where name=?", tag)
    if not ok:
        return rows  # lookup failed outright -> None
    if rows:
        tag_id = rows[0][0]
        __tag_id_cache[tag] = tag_id
        return tag_id
    # unknown tag: add it, then resolve again (implicitly None if add fails)
    if db_add_tag(tag):
        return db_resolve_tag(tag)
def db_update_meta(file_id, top_id, tag_id, value):
    """Record one (section, tag, value) triple for a file; True on success."""
    ok, _unused = run_sql(
        "insert into file_meta(file,section,tag,value) values (?,?,?,?)",
        file_id, top_id, tag_id, value)
    return ok
doUpdate=False  # -u/--update: skip archives whose files are already all indexed
# binary (1024-based) and decimal (1000-based) size multipliers
Ki1 = 1024
Mi1 = Ki1 * Ki1
Gi1 = Mi1 * Ki1
K1 = 1000
M1 = K1 * K1
G1 = M1 * K1
def bKMGi(n):
    """Format a byte count with binary (IEC) prefixes Ki/Mi/Gi.

    Truncates rather than rounds (2047 -> '1Ki'); values below 1024
    are returned as plain digits.
    """
    n = int(n)
    if n >= 1 << 30:
        return str(n >> 30) + 'Gi'
    if n >= 1 << 20:
        return str(n >> 20) + 'Mi'
    if n >= 1 << 10:
        # BUGFIX: IEC symbol is 'Ki' (was lower-case 'ki', inconsistent
        # with 'Mi'/'Gi' above)
        return str(n >> 10) + 'Ki'
    return str(n)
def bKMG(n):
    """Format a count with decimal prefixes k/M/G, integer-truncated.

    Uses floor division: under Python 3 the original '/' would be true
    division and print e.g. '5.0k' instead of the intended '5k'.
    """
    n = int(n)
    if n >= 10 ** 9:
        return str(n // 10 ** 9) + 'G'
    if n >= 10 ** 6:
        return str(n // 10 ** 6) + 'M'
    if n >= 1000:
        return str(n // 1000) + 'k'
    return str(n)
def mangle_tag(tag):  # some "{///url///}name" form
    """Strip a Clark-notation namespace ('{uri}name') from an element tag.

    Non-string tags (e.g. lxml comment/PI sentinel "tags") produce a
    stderr warning and an empty string; empty input yields ''.
    """
    if not isinstance(tag, str):
        sys.stderr.write('mangle_tag(%r): non-text tag\n' % (tag,))
        return ''
    try:
        if not tag:
            return ''
        if tag.startswith('{'):
            return tag.split('}')[-1]
        return tag
    except Exception as why:  # was exceptions.Exception (Python-2-only module)
        sys.stderr.write('mangle_tag(%r): %r\n' % (tag, why))
        raise
def mangle_text(text):
    """Collapse all whitespace runs in *text* to single spaces ('' for falsy input)."""
    return ' '.join(text.split()) if text else ''
def handle_tag(filename, top, this, value):
    """Store one description entry: resolve file/section/tag ids, insert value."""
    file_id = db_resolve_file(filename)
    section_id = db_resolve_tag(top)
    tag_id = db_resolve_tag(this)
    # empty text is stored as SQL NULL
    return db_update_meta(file_id, section_id, tag_id,
                          value if value != '' else None)
def dig(filename,fb2):
    # Parse one FB2 (XML) book and store every <description> sub-tag in
    # the database.  Returns the input's byte size on every path, so the
    # caller can keep throughput statistics even for unparsable files.
    fb2sz = len(fb2)
    print '+',fb2sz,'bytes to dig.', `fb2[:256].split('>')[0] + '>'`,
    try:
        e = etree.fromstring(fb2)
    except exceptions.Exception, why:
        if isinstance(why,etree.XMLSyntaxError) :
            print '- bad XML'
            print >>sys.stderr,`filename`,'- bad XML'
        else:
            print '- Shit happened:',`why`
            print >>sys.stderr,`filename`,'- shit happened:',`why`
            raise
        return fb2sz
    d = e.find('description')
    # print 'Top level elements:',len(e), 'e='+`e`, 'd='+`d`
    # find('description') misses namespaced tags, so also scan the
    # children comparing namespace-stripped names.
    for i in e:
        x_tag = mangle_tag( i.tag[:] )
        if x_tag == 'description' :
            d = i
            break
    else:
        print '- crap, no description?'
        print >>sys.stderr,`filename`,'- crap, no description?'
        return fb2sz
    print 'ok,',
    # NOTE(review): 'e' is reused here as the error counter, clobbering
    # the parsed root element -- confusing, but the root is no longer
    # needed at this point.
    n = e = 0
    for i in d:
        i_tag = mangle_tag( i.tag )
        # print '\t*',`i_tag`+'=<'+mangle_text(i.text)+'>'
        for j in i:
            j_tag = mangle_tag( j.tag )
            x_tag = i_tag+'/'+j_tag
            tag_v = mangle_text(j.text)
            if handle_tag(filename,i_tag,j_tag,tag_v):
                n += 1
            else:
                e += 1
                print >>sys.stderr,"\tCannot update",`x_tag`,'for',`filename`,'to',`tag_v`
    # print "\tTags handled:",n,'Errors:',e
    print 'changes:', `_db.total_changes`
    return fb2sz
- def handle( fname ):
- print fname
- try:
- with zipfile.ZipFile(fname,'r',allowZip64=True) as zf :
- if doUpdate :
- print 'check for update...'
- have_files = len(zf.namelist())
- print 'have_files='+`have_files`
- arc = db_resolve_archive(fname)
- print 'arc='+`arc`
- if arc is not None :
- ok, done_files = run_sql("""
- select count(*)
- from (
- select distinct file
- from file_meta
- where file in (select id from file where archive=?)
- )""",arc)
- print 'ok='+`ok`,'done_files='+`done_files`
- if ok : done_files = done_files[0][0]
- if ok and (have_files == done_files):
- print >>sys.stderr,`fname`,': no update needed (%d/%d)'%(have_files,done_files)
- return (0.0,0,0)
- print 'Replacing',`fname`
- total_size = 0
- total_count = 0
- t0 = time.time()
- for zn in zf.namelist():
- print `zn`,
- fb = zf.read(zn)
- db_begin()
- db_clear_file(fname,zn)
- db_commit() ; db_begin()
- total_size += dig(zn,fb)
- db_commit()
- total_count += 1
- # if total_count >= 10 : break
- t1 = time.time()
- zf.close()
- dt = t1 - t0
- print >>sys.stderr,`fname`,':',total_size,'bytes', '('+bKMGi(total_size)+')', 'in', total_count, 'books were processed in %.0f seconds (at %sBPS)'%(dt,bKMG(float(total_size)/dt))
- return (dt,total_size,total_count)
- except exceptions.Exception, why:
- if isinstance(why,zipfile.BadZipfile) :
- print >>sys.stderr,'Bad ZIP file', `fname`
- elif isinstance(why,zipfile.LargeZipFile) :
- print >>sys.stderr,'Large ZIP file', `fname`, 'cannot be processed here.'
- else:
- print >>sys.stderr,'Shit happened on', `fname`,':',`why`
- raise
def main():
    # Command-line driver: parse options, open the database, process every
    # archive given on the command line, then print an overall throughput
    # summary.  Returns a process exit code (0 ok, 1 bad options).
    try:
        try:
            opts, args = getopt.getopt(
                sys.argv[1:],
                '?hd:u',
                ('help','db=','database=','update')
            )
        except getopt.error, why:
            print >>sys.stderr, sys.argv[0],':',why
            return 1
        else:
            for o,v in opts :
                if o in ('-h','-?','--help'):
                    print sys.argv[0],'[-u|--update] [-d|--db=|--database=<database>] <fb2-zip-files...>'
                    return 0
                elif o in ('-d','--db','--database'):
                    # override the module-level database path
                    global _dbfile
                    _dbfile = v[:]
                elif o in ('-u','--update'):
                    # only (re)index archives that are incomplete in the db
                    global doUpdate
                    doUpdate = True
                pass
        total_time, total_size, total_count = 0.0, 0L, 0
        db_open(_dbfile)
        for arg in args :
            t, s, c = handle( arg )
            total_time += t
            total_size += s
            total_count+= c
        db_close()
        # avoid division by zero in the rate report below
        if total_time <= 0.00000001 : total_time = 1.0
        print >>sys.stderr,'TOTAL: %s bytes in %d books (%d archives) were processed for %.0f seconds (%sBps, %.0f books/s).'%(
            bKMGi(total_size), total_count, len(args), total_time,
            bKMG(float(total_size)/total_time), float(total_count)/total_time,
        )
        return 0
    finally:
        # best-effort flush and close, even on unexpected exceptions
        try:
            # db_rollback()
            _db.commit()
            db_close()
        except:
            pass
def __fix_io_encoding(last_resort_default='UTF-8'):
    """Re-exec the interpreter with PYTHONIOENCODING set when any standard
    stream reports no encoding (e.g. output piped under Python 2).

    No-op when stdin/stdout/stderr all have an encoding; otherwise the
    process image is replaced via os.execvpe and this call never returns.
    """
    import sys
    if all(s.encoding is not None for s in (sys.stdin, sys.stdout, sys.stderr)):
        return
    import os
    # probe candidate encodings from most to least specific
    encoding = None
    try:
        import locale
        encoding = locale.getpreferredencoding()
    except:
        encoding = None
    if encoding is None:
        try:
            encoding = sys.getfilesystemencoding()
        except:
            encoding = None
    if encoding is None:
        try:
            encoding = sys.stdin.encoding
        except:
            encoding = None
    if encoding is None:
        encoding = last_resort_default
    # respect a pre-set PYTHONIOENCODING, then restart in place
    os.environ['PYTHONIOENCODING'] = os.environ.get("PYTHONIOENCODING", encoding)
    os.execvpe(sys.argv[0], sys.argv, os.environ)
if __name__=='__main__' :
    # Fix stream encodings first (may re-exec the whole process), then run.
    __fix_io_encoding() ; del __fix_io_encoding
    sys.exit( main() )
# vim:ai:sts=2:et
# EOF #
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement