#!/usr/bin/python -Ou
# -*- coding: utf-8 -*-
import sys, os, locale, getopt, zipfile, exceptions, time
from lxml import etree # this one may fail on UNICODE :( # see http://lxml.de/
# import BeautifulSoup # see http://www.crummy.com/software/BeautifulSoup/bs4/doc/ # may use http://code.google.com/p/html5lib/
import sqlite3
# CREATE TABLE archive (
# id integer primary key
# , stamp datetime default(datetime('now'))
# , name text not null unique
# );
# CREATE TABLE file (
# id integer primary key
# , stamp datetime default(datetime('now'))
# , name text not null
# , archive integer references archive(id) on delete cascade
# , lreid integer
# );
# DDL for the tag name dictionary (maps id <-> section/tag name).
__tag_dict_sql = '''
CREATE TABLE tag_dict (
id integer primary key
, name text not null unique
);
'''
# DDL for the per-file metadata table; each row is one
# section/tag/value triple extracted from a book's <description>.
__file_metas_sql = '''
CREATE TABLE file_meta (
id integer primary key
, file integer references file(id) on delete cascade
, section integer references tag_dict(id)
, tag integer references tag_dict(id)
, value text
);
'''
# Module-wide sqlite3 connection (None while closed) and default db file name.
_db = None
_dbfile = 'index.db'
# NOTE(review): opened at import time as a side effect; run_sql() mirrors
# every statement (plus SELECT results and errors) into this trace file.
_save_sql = open('save.sql','w')
def db_open(fname=_dbfile):
global _db, _db_size
_db = None
reopen = False
try:
print 'Using database',`fname`
_db_size = os.path.getsize(_dbfile)
_db = sqlite3.connect(fname,isolation_level='DEFERRED')
ok, rs = run_sql('select count(*) from tag_dict')
if not ok :
db_begin()
ok, rs = run_sql(__tag_dict_sql)
if not ok :
print >>sys.stderr,'Cannot create table tag_dict'
db_close()
return
reopen = True
print 'tag_dict - created:',`_db.commit()`
ok, rs = run_sql('select count(*) from file_meta')
if not ok :
db_begin()
ok, rs = run_sql(__file_metas_sql)
if not ok :
print >>sys.stderr,'Cannot create table file_meta'
db_close()
return
reopen = True
print 'file_meta - created:',`_db.commit()`
if reopen :
db_close()
_db = sqlite3.connect(fname,isolation_level='DEFERRED')
print 'Database has been reopen'
_db.commit()
except sqlite3.Error,why:
print >>sys.stderr, 'Cannot connect:', why
_db = None
def db_close():
global _db, _db_size
db_diff = os.path.getsize(_dbfile) - _db_size
print >>sys.stderr,"# Database",`_dbfile`,'size change:',db_diff,'bytes'
if _db is not None :
print >>sys.stderr,'# Total changes:', `_db.total_changes`
try: _db.close()
except: pass
_db = None
def run_sql(sql,*args):
  """Execute one SQL statement on the module connection _db.

  Returns (True, rows) for a successful SELECT, (True, None) for any
  other successful statement, or (False, None) on sqlite3.Error.
  Every statement -- plus SELECT results and errors -- is also appended
  to the save.sql trace file when _save_sql is open.
  """
  c = None
  Result = None
  # print 'run_sql('+`sql`+','+`args`+'):',
  if _save_sql is not None :
    print >>_save_sql,`sql`,`args`
  try:
    c = _db.cursor()
    if not args :
      c.execute(sql)
    else:
      c.execute(sql,args)
    # only SELECT statements produce a result set worth fetching
    if sql.lstrip().upper().startswith('SELECT') :
      Result = c.fetchall()
      if _save_sql is not None :
        print >>_save_sql,'/*',`Result`,'*/'
      # print `Result`
    c.close()
    # print 'run_sql('+`sql`+','+`args`+'):',`(True,Result)`# ,'isolation=',`_db.isolation_level`
    return (True,Result)
  except sqlite3.Error,why:
    print >>sys.stderr, 'SQL',`sql`,'args',`args`,'error:', why
    if _save_sql is not None :
      print >>_save_sql,'# error:',why
    # the cursor may or may not exist / already be closed -- close quietly
    if c is not None :
      try: c.close()
      except: pass
    return (False,None)
def db_begin():
  """Flush any pending transaction; sqlite3 then begins a new one lazily.

  With isolation_level='DEFERRED' the sqlite3 module issues BEGIN
  implicitly before the next data-modifying statement.
  """
  # NOTE(review): an explicit "begin transaction" used to sit below the
  # return as unreachable dead code -- removed.
  return _db.commit()
def db_commit():
  """Commit the current transaction through the connection object."""
  # NOTE(review): an explicit "commit transaction" used to sit below the
  # return as unreachable dead code -- removed.
  return _db.commit()
def db_rollback():
  """Abort the current transaction, announcing it loudly on stderr."""
  sys.stderr.write('### ROLLBACK ###\n')
  return _db.rollback()
def db_clear_file(archname,filename):
  """Delete every stored metadata row for one file inside one archive."""
  fid = db_resolve_file(filename,archname)
  ok, _ = run_sql("delete from file_meta where file=?",fid)
  return ok
# Memo: {archive-name: {file-name: file-id}}.  The None slot remembers the
# most recently resolved (archive-name, archive-id) pair so the archive
# argument may be omitted on subsequent calls for the same archive.
__file_id_cache = {None:(None,None)}
def db_resolve_file(filename,archname=None):
  """Return file.id for filename within archname, or None if unknown.

  Results are memoized per archive; with archname=None the archive from
  the previous call is reused.
  """
  if archname is not None :
    ai = db_resolve_archive(archname)
    if ai is not None :
      __file_id_cache[None] = (archname,ai)
  arch, ai = __file_id_cache[None]
  cache = __file_id_cache.get(arch,None)
  if cache is None :
    # BUGFIX(review): the per-archive dict used to be created twice (a
    # local and a stored copy, merged later via update) -- store one dict.
    cache = {}
    __file_id_cache[arch] = cache
  elif filename in cache:
    return cache[filename]
  ok, rs = run_sql("select id from file where name=? and archive=?",filename,ai)
  # BUGFIX: an empty result set used to raise IndexError on rs[0][0];
  # an unknown file now resolves to None (and is not cached).
  if ok and rs :
    fid = rs[0][0]
    cache[filename] = fid
    return fid
  return None
# Memo: archive name -> archive.id
__arch_id_cache = {}
def db_resolve_archive(filename):
  """Return archive.id for the archive named filename, or None if absent."""
  if filename in __arch_id_cache :
    return __arch_id_cache[filename]
  ok, rs = run_sql("select id from archive where name=?",filename)
  # BUGFIX: rs[0][0] used to raise IndexError when the archive was not in
  # the db yet; handle() relies on receiving None in that case.
  if ok and rs :
    aid = rs[0][0]
    __arch_id_cache[filename] = aid
    return aid
  return None
def db_add_tag(tag):
  """Insert a new name into tag_dict; True on success."""
  ok, _ = run_sql("insert into tag_dict(name) values(?)",tag)
  return ok
# Memo: tag name -> tag_dict.id
__tag_id_cache = {}
def db_resolve_tag(tag):
  """Return tag_dict.id for tag, inserting the tag first when unknown."""
  try:
    return __tag_id_cache[tag]
  except KeyError:
    pass
  ok, rows = run_sql("select id from tag_dict where name=?",tag)
  if not ok :
    return None
  if rows :
    tag_id = rows[0][0]
    __tag_id_cache[tag] = tag_id
    return tag_id
  # unknown tag: add it, then resolve again (now hits the SELECT branch)
  if db_add_tag(tag) :
    return db_resolve_tag(tag)
def db_update_meta(file_id,top_id,tag_id,value):
  """Store one section/tag/value triple for a file; True on success."""
  ok, _ = run_sql(
    "insert into file_meta(file,section,tag,value) values (?,?,?,?)",
    file_id,top_id,tag_id,value)
  return ok
# --update mode: skip archives whose files are already all indexed.
doUpdate=False
# Binary (1024-based) and decimal (1000-based) size multipliers for the
# bKMGi()/bKMG() pretty-printers.
Ki1 = 1024
Mi1 = Ki1 * Ki1
Gi1 = Mi1 * Ki1
K1 = 1000
M1 = K1 * K1
G1 = M1 * K1
def bKMGi(n):
  """Render n with a binary prefix: 2048 -> '2ki', 3*2**20 -> '3Mi'."""
  n = int(n)
  for shift, suffix in ((30,'Gi'), (20,'Mi'), (10,'ki')):
    if n >= 1 << shift :
      return str(n >> shift) + suffix
  return str(n)
def bKMG(n):
  """Render n with a decimal prefix: 1500 -> '1k', 2*10**6 -> '2M'."""
  n = int(n)
  # BUGFIX/portability: '//' (explicit floor division) instead of '/' --
  # the old form relied on Python 2 integer division and would produce
  # '1.5k'-style float strings under Python 3 or
  # 'from __future__ import division'.
  for div, suffix in ((10**9,'G'), (10**6,'M'), (10**3,'k')):
    if n >= div :
      return str(n//div) + suffix
  return str(n)
def mangle_tag(tag):
  """Strip a Clark-notation namespace from tag: '{uri}name' -> 'name'.

  Non-string tags (e.g. lxml comment/PI tag callables) are reported on
  stderr and yield ''.
  """
  if type(tag) != type('x') :
    sys.stderr.write('mangle_tag('+repr(tag)+'): non-text tag\n')
    return ''
  try:
    if not tag :
      return ''
    if tag.startswith('{') :
      return tag.split('}')[-1]
    return tag
  except Exception as why:
    sys.stderr.write('mangle_tag('+repr(tag)+'): '+repr(why)+'\n')
    raise
def mangle_text(text):
  """Collapse all whitespace runs in text to single spaces; '' for falsy."""
  if not text :
    return ''
  return ' '.join(text.split())
def handle_tag(filename,top,this,value):
  """Resolve ids for file/section/tag and record one metadata row."""
  fid = db_resolve_file(filename)
  sec_id = db_resolve_tag(top)
  tag_id = db_resolve_tag(this)
  # empty strings are stored as SQL NULL
  if value == '' :
    value = None
  return db_update_meta(fid,sec_id,tag_id,value)
def dig(filename,fb2):
fb2sz = len(fb2)
print '+',fb2sz,'bytes to dig.', `fb2[:256].split('>')[0] + '>'`,
try:
e = etree.fromstring(fb2)
except exceptions.Exception, why:
if isinstance(why,etree.XMLSyntaxError) :
print '- bad XML'
print >>sys.stderr,`filename`,'- bad XML'
else:
print '- Shit happened:',`why`
print >>sys.stderr,`filename`,'- shit happened:',`why`
raise
return fb2sz
d = e.find('description')
# print 'Top level elements:',len(e), 'e='+`e`, 'd='+`d`
for i in e:
x_tag = mangle_tag( i.tag[:] )
if x_tag == 'description' :
d = i
break
else:
print '- crap, no description?'
print >>sys.stderr,`filename`,'- crap, no description?'
return fb2sz
print 'ok,',
n = e = 0
for i in d:
i_tag = mangle_tag( i.tag )
# print '\t*',`i_tag`+'=<'+mangle_text(i.text)+'>'
for j in i:
j_tag = mangle_tag( j.tag )
x_tag = i_tag+'/'+j_tag
tag_v = mangle_text(j.text)
if handle_tag(filename,i_tag,j_tag,tag_v):
n += 1
else:
e += 1
print >>sys.stderr,"\tCannot update",`x_tag`,'for',`filename`,'to',`tag_v`
# print "\tTags handled:",n,'Errors:',e
print 'changes:', `_db.total_changes`
return fb2sz
def handle( fname ):
print fname
try:
with zipfile.ZipFile(fname,'r',allowZip64=True) as zf :
if doUpdate :
print 'check for update...'
have_files = len(zf.namelist())
print 'have_files='+`have_files`
arc = db_resolve_archive(fname)
print 'arc='+`arc`
if arc is not None :
ok, done_files = run_sql("""
select count(*)
from (
select distinct file
from file_meta
where file in (select id from file where archive=?)
)""",arc)
print 'ok='+`ok`,'done_files='+`done_files`
if ok : done_files = done_files[0][0]
if ok and (have_files == done_files):
print >>sys.stderr,`fname`,': no update needed (%d/%d)'%(have_files,done_files)
return (0.0,0,0)
print 'Replacing',`fname`
total_size = 0
total_count = 0
t0 = time.time()
for zn in zf.namelist():
print `zn`,
fb = zf.read(zn)
db_begin()
db_clear_file(fname,zn)
db_commit() ; db_begin()
total_size += dig(zn,fb)
db_commit()
total_count += 1
# if total_count >= 10 : break
t1 = time.time()
zf.close()
dt = t1 - t0
print >>sys.stderr,`fname`,':',total_size,'bytes', '('+bKMGi(total_size)+')', 'in', total_count, 'books were processed in %.0f seconds (at %sBPS)'%(dt,bKMG(float(total_size)/dt))
return (dt,total_size,total_count)
except exceptions.Exception, why:
if isinstance(why,zipfile.BadZipfile) :
print >>sys.stderr,'Bad ZIP file', `fname`
elif isinstance(why,zipfile.LargeZipFile) :
print >>sys.stderr,'Large ZIP file', `fname`, 'cannot be processed here.'
else:
print >>sys.stderr,'Shit happened on', `fname`,':',`why`
raise
def main():
try:
try:
opts, args = getopt.getopt(
sys.argv[1:],
'?hd:u',
('help','db=','database=','update')
)
except getopt.error, why:
print >>sys.stderr, sys.argv[0],':',why
return 1
else:
for o,v in opts :
if o in ('-h','-?','--help'):
print sys.argv[0],'[-u|--update] [-d|--db=|--database=<database>] <fb2-zip-files...>'
return 0
elif o in ('-d','--db','--database'):
global _dbfile
_dbfile = v[:]
elif o in ('-u','--update'):
global doUpdate
doUpdate = True
pass
total_time, total_size, total_count = 0.0, 0L, 0
db_open(_dbfile)
for arg in args :
t, s, c = handle( arg )
total_time += t
total_size += s
total_count+= c
db_close()
if total_time <= 0.00000001 : total_time = 1.0
print >>sys.stderr,'TOTAL: %s bytes in %d books (%d archives) were processed for %.0f seconds (%sBps, %.0f books/s).'%(
bKMGi(total_size), total_count, len(args), total_time,
bKMG(float(total_size)/total_time), float(total_count)/total_time,
)
return 0
finally:
try:
# db_rollback()
_db.commit()
db_close()
except:
pass
def __fix_io_encoding(last_resort_default='UTF-8'):
  """Re-exec the interpreter with PYTHONIOENCODING set when any std stream
  lacks an encoding (e.g. output piped to a file), so unicode printing
  does not die.  Returns normally when all three streams are fine."""
  import sys
  if [x for x in (sys.stdin,sys.stdout,sys.stderr) if x.encoding is None] :
    import os
    defEnc = None
    # try the locale's preferred encoding first...
    if defEnc is None :
      try:
        import locale
        defEnc = locale.getpreferredencoding()
      except: defEnc = None
    # ...then the filesystem encoding...
    if defEnc is None :
      try: defEnc = sys.getfilesystemencoding()
      except: defEnc = None
    # ...then whatever stdin reports...
    if defEnc is None :
      try: defEnc = sys.stdin.encoding
      except: defEnc = None
    # ...and finally the hard-coded fallback.
    if defEnc is None : defEnc = last_resort_default
    # an already-set PYTHONIOENCODING wins, preventing an exec loop
    os.environ['PYTHONIOENCODING'] = os.environ.get("PYTHONIOENCODING",defEnc)
    # replace the current process; execution does not return here
    os.execvpe(sys.argv[0],sys.argv,os.environ)
if __name__=='__main__' :
  # fix stream encodings first (may re-exec the whole process), then run
  __fix_io_encoding() ; del __fix_io_encoding
  sys.exit( main() )
# vim:ai:sts=2:et
# EOF #