Advertisement
gabalese

parseinfo.py (py3 + lxml)

Aug 17th, 2012
163
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.40 KB | None | 0 0
  1. #! /usr/bin/env python3
  2. # Parses info about the current epub
  3. import zipfile as ZIP
  4. import sys
  5. import os
  6. try:
  7.     from lxml import etree as ET
  8. except:
  9.     print("ERROR: lxml library must be installed.")
  10.     sys.exit(1)
  11.  
  12. namespaces = {"opf":"http://www.idpf.org/2007/opf","dc":"http://purl.org/dc/elements/1.1/"}
  13.  
  14. def parseInfo(file):
  15.     info = {}
  16.     try:
  17.         f = ZIP.ZipFile(file).read("META-INF/container.xml")
  18.     except KeyError:
  19.         print( "The %s file is not a valid OCF." % str(file) )
  20.     try:
  21.         f = ET.fromstring(f)
  22.         info["path_to_opf"] = f[0][0].get("full-path")
  23.         root_folder = os.path.dirname(info["path_to_opf"])
  24.     except:
  25.         pass
  26.     opf = ET.fromstring(ZIP.ZipFile(file).read(info["path_to_opf"]))
  27.    
  28.     id = opf.xpath("//opf:spine",namespaces=namespaces)[0].get("toc")
  29.     expr = "//*[@id='%s']" % id
  30.     info["ncx_name"] = opf.xpath(expr)[0].get("href")
  31.     info["path_to_ncx"] = root_folder + "/" + info["ncx_name"]
  32.     info.pop("ncx_name")
  33.  
  34.     return info
  35.    
  36. def parseOPF(file):
  37.     meta = {}
  38.     opf = ET.fromstring(ZIP.ZipFile(file).read(parseInfo(file)["path_to_opf"]))
  39.    
  40.     return opf
  41.    
  42. def parseNCX(file):
  43.  
  44.     ncx = {}
  45.     ncx = ET.fromstring(ZIP.ZipFile(file).read(parseInfo(file)["path_to_ncx"]))
  46.    
  47.     return ncx
  48.    
  49. def showMeta(list, attrib=""):
  50.     for i in list:
  51.         print( attrib,"\t", i.text )
  52.  
  53. class Metadata:
  54.     def __init__(self, file):
  55.         opf = parseOPF(file)
  56.        
  57.         self.title = opf.xpath("//dc:title",namespaces=namespaces)
  58.         self.author = opf.xpath("//dc:creator",namespaces=namespaces)
  59.         self.isbn = opf.xpath("//dc:identifier",namespaces=namespaces)
  60.         self.language = opf.xpath("//dc:language",namespaces=namespaces)
  61.         self.publisher = opf.xpath("//dc:publisher",namespaces=namespaces)
  62.         self.pubdate = opf.xpath("//dc:date[@opf:event='publication']",namespaces=namespaces)
  63.  
  64.  
  65. if __name__ == "__main__":
  66.     for file in sys.argv[1:]:
  67.         try:
  68.             m = Metadata(file)
  69.         except:
  70.             print("Invalid file: ",file)
  71.             continue
  72.        
  73.         print( "File: ","\t",file)
  74.         showMeta(m.title, "Title")
  75.         showMeta(m.author, "Author")
  76.         showMeta(m.language, "Language")
  77.         showMeta(m.publisher, "Publisher")
  78.         showMeta(m.isbn,"ISBN")
  79.         showMeta(m.pubdate,"PubDate")
  80.         print( "")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement