Advertisement
ludoo

Kindle/Mobipocket conversion helper

Nov 10th, 2011
332
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 8.71 KB | None | 0 0
  1. #!/usr/bin/env python
  2. """Kindleprep.py, generate OPF and NCX files from an HTML ebook for Kindle
  3. conversion.
  4.  
  5. Typical usage would be
  6.  
  7. ./kindleprep.py [options] file
  8.  
  9. where file points to the HTML ebook to prepare. Once done, just run
  10. kindlegen on the produced OPF file.
  11.  
  12. In order to simplify dependencies, the HTML file needs to be parsable
  13. by an XML parser. This basically means quoting of attribute values,
  14. closed tags, and escaping of a few reserved characters.
  15.  
  16. A few conventions are used to set defaults for the ebook metadata,
  17. and the generated file names:
  18.  
  19. * the default HTML filename format is "[author name] - [ebook
  20.  title].html" and it is used to extract author and title info, you can
  21.  override this behaviour with the --author [author] and --title [title]
  22.  options
  23.  
  24. * the language default is 'en-US', you can override it with the --lang
  25.  [language] option
  26.  
  27. * the default cover location is '[html filename].jpg', you can override
  28.  this default with the --cover [filename] option
  29.  
  30. * the OPF and NCX files are written by default in the same folder and
  31.  with the same HTML ebook filename, with different extensions, eg
  32.  'Spam - Ham.html' will produce a 'Spam - Ham.opf' and a
  33.  'Spam - Ham.ncx' file, you can override these defaults with the
  34.  --opf [filename] and --ncx [filename] options
  35.  
  36. In the HTML ebook, a convention is used to identify ebook sections used
  37. to build the navigation:
  38.  
  39. * the table of contents inside the ebook must be an HTML element with
  40.  an id of 'chapters'
  41.  
  42. * every anchor element inside the table of contents is used as a
  43.  chapter, if it has an href attribute and non-empty text
  44.  
  45. A sample table of contents might be something like this::
  46.  
  47.    <h2><a name="toc"></a>Table of Contents</h2>
  48.    <div id="chapters">
  49.    <p><a href="#chapter-1">Chapter 1</a></p>
  50.    <p><a href="#chapter-2">Chapter 2</a></p>
  51.    </div>
  52.    
  53. Copyright (c) 2011 Ludovico Magnocavallo
  54.  
  55. This program is free software; you can redistribute it and/or
  56. modify it under the terms of the GNU General Public License
  57. as published by the Free Software Foundation; either version 2
  58. of the License, or (at your option) any later version.
  59.  
  60. This program is distributed in the hope that it will be useful,
  61. but WITHOUT ANY WARRANTY; without even the implied warranty of
  62. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  63. GNU General Public License for more details.
  64.  
  65. You should have received a copy of the GNU General Public License
  66. along with this program; if not, write to the Free Software
  67. Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  68. 02110-1301, USA.
  69. """
  70. import os
  71. import sys
  72.  
  73. from optparse import OptionParser
  74. try:
  75.     from hashlib import md5
  76. except ImportError:
  77.     from md5 import new as md5
  78. try:
  79.     from xml.etree import cElementTree as et
  80. except ImportError:
  81.     from xml.etree import ElementTree as et
  82.  
  83.  
# Script version; main() appends it to "%prog " for the --version option.
__version__ = '1.1'
# Author name and contact address.
__author__ = "Ludovico Magnocavallo <ludo@qix.it>"
  86.  
  87.  
  88. CSS = """\
  89.    blockquote { margin: 12px; }
  90.    blockquote, blockquote p { font-style: italic; }
  91. """
  92.  
  93. OPF = """<?xml version="1.0" encoding="UTF-8"?>
  94. <package xmlns="http://www.idpf.org/2007/opf" version="2.0" unique-identifier="%(uid)s">
  95. <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">
  96.    <dc:language>%(lang)s</dc:language>
  97.    <dc:title>%(title)s</dc:title>
  98.    <dc:creator opf:role="aut">%(author)s</dc:creator>
  99.    <!-- TODO: identifier -->
  100.    <dc:identifier id="BookId">%(uid)s</dc:identifier>
  101.    <meta name="cover" content="cover-image"/>
  102. </metadata>
  103. <manifest>
  104.    <item id="css" media-type="text/css" href="%(basefile)s.css" />
  105.    <item id="cover-image" media-type="image/jpeg" href="%(cover)s" />
  106.    <item id="content" media-type="application/xhtml+xml" href="%(html)s" />
  107.    <item id="ncx" media-type="application/x-dtbncx+xml" href="%(ncx)s" />
  108. </manifest>
  109. <spine toc="ncx">
  110.    <itemref idref="content" />
  111. </spine>
  112. <guide>
  113.    <reference type="cover" title="Cover" href="%(html)s#cover"></reference>
  114.    <reference type="toc" title="Table of Contents" href="%(html)s#toc"></reference>
  115.    <reference type="start" title="Start Reading" href="%(html)s#start"></reference>
  116. </guide>
  117. </package>
  118. """
  119.  
  120. NCX = """<?xml version="1.0" encoding="UTF-8"?>
  121. <ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="%(lang)s">
  122. <head>
  123.  <meta name="dtb:uid" content="%(uid)s" />
  124.  <meta name="dtb:depth" content="1" />
  125.  <meta name="dtb:totalPageCount" content="0" />
  126.  <meta name="dtb:maxPageNumber" content="0" />
  127. </head>
  128. <docTitle><text>%(title)s</text></docTitle>
  129. <docAuthor><text>%(author)s</text></docAuthor>
  130. <navMap>
  131.    <navPoint id="cover" playOrder="0"><navlabel><text>Cover</text></navLabel><content src="%(html)s#cover"/></navPoint>
  132.    <navPoint id="toc" playOrder="1"><navlabel><text>Table of Contents</text></navLabel><content src="%(html)s#toc"/></navPoint>
  133. %(navpoints)s
  134. </navMap>
  135. </ncx>
  136. """
  137.  
  138. NAVPOINT = '<navPoint id="%(id)s" playOrder="%(i)s"><navlabel><text>%(text)s</text></navLabel><content src="%(html)s%(href)s"/></navPoint>'
  139.  
  140.  
  141. def main():
  142.    
  143.     parser = OptionParser(usage=__doc__, version="%prog " + __version__)
  144.     parser.add_option("--ncx", type="str", dest="ncx", default=None, help="write ncx to file, defaults to the input file with .ncx extension")
  145.     parser.add_option("--opf", type="str", dest="opf", default=None, help="write opf to file, defaults to the input file with .opf extension")
  146.     parser.add_option("--cover", dest="cover", default=None, help="cover file, defaults to the input file with .jpg extension")
  147.     parser.add_option("--author", type="str", dest="author", default=None, help="author name, defaults to anything before the first '-'")
  148.     parser.add_option("--title", type="str", dest="title", default=None, help="title, defaults to anything after the last '-'")
  149.     parser.add_option("--lang", type="str", dest="lang", default="en-US", help="language, defaults to 'en-US'")
  150.  
  151.     (opts, args) = parser.parse_args()
  152.  
  153.     if not args:
  154.         raise SystemExit("No file to process")
  155.        
  156.     html = args[0]
  157.     _html = os.path.splitext(html)[0]
  158.  
  159.     cover = opts.cover or _html + '.jpg'
  160.     opf = opts.opf or _html + '.opf'
  161.     ncx = opts.ncx or _html + '.ncx'
  162.    
  163.     sys.stderr.write("Input file '%s'\n" % html)
  164.     if not os.path.isfile(html):
  165.         raise SystemExit("Input file not found")
  166.     sys.stderr.write("Cover file '%s'\n" % cover)
  167.     if not os.path.isfile(cover):
  168.         raise SystemExit("Cover not found")
  169.     sys.stderr.write("OPF file '%s'\n" % opf)
  170.     sys.stderr.write("NCX file '%s'\n" % ncx)
  171.    
  172.     author = opts.author or _html.split(' - ')[0].strip()
  173.     title = opts.title or _html.split(' - ')[-1].strip()
  174.    
  175.     sys.stderr.write("Author '%s'\n" % author)
  176.     sys.stderr.write("Title '%s'\n" % title)
  177.    
  178.     try:
  179.         tree = et.XML(file(html).read())
  180.     except et.ParseError, e:
  181.         raise SystemExit("Error parsing source file: %s" % e)
  182.        
  183.     toc = None
  184.     for el in tree.getiterator():
  185.         if el.attrib.get('id') == 'chapters':
  186.             toc = el
  187.             break
  188.  
  189.     if toc is None:
  190.         raise SystemExit("No table of contents element with id 'chapters' found")
  191.  
  192.     chapter_tag = 'a'
  193.     if toc.tag[0] == '{':
  194.         chapter_tag = toc.tag[:toc.tag.find('}') + 1] + 'a'
  195.  
  196.     chapters = list()
  197.     for el in toc.getiterator():
  198.         if el.tag == chapter_tag and 'href' in el.attrib and el.text:
  199.             chapters.append(el)
  200.        
  201.     if not chapters:
  202.         raise SystemExit("No chapters found")
  203.        
  204.     sys.stderr.write("%s chapters found\n" % len(chapters))
  205.    
  206.     opf_buffer = []
  207.     ncx_buffer = []
  208.     for i, chapter in enumerate(chapters):
  209.         _id = chapter.attrib['href'].split('#')[-1]
  210.         d = dict(text=chapter.text.strip(), i=i+2, href=chapter.attrib['href'], id=_id, html=html)
  211.        
  212.         opf_buffer.append('<itemref idref="%s" linear="yes"/>' % _id)
  213.         ncx_buffer.append(NAVPOINT % d)
  214.    
  215.     uid = md5(title+author).hexdigest()
  216.        
  217.     sys.stderr.write("writing ncx file\n")
  218.    
  219.     file(ncx, 'w').write(NCX % dict(
  220.         uid=uid,
  221.         lang=opts.lang, author=author, title=title,
  222.         html=html,
  223.         navpoints=("\n".join(ncx_buffer)).encode('utf-8')
  224.     ))
  225.    
  226.     sys.stderr.write("writing opf file\n")
  227.    
  228.     file(opf, 'w').write(OPF % dict(
  229.         uid=md5(title+author).hexdigest(),
  230.         lang=opts.lang, author=author, title=title,
  231.         html=html, basefile=_html, cover=cover, ncx=ncx,
  232.         #spine_items="\n    ".join(opf_buffer)
  233.     ))
  234.    
  235.     sys.stderr.write("writing css file\n")
  236.    
  237.     file(_html + '.css', 'w').write(CSS)
  238.    
  239.  
# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement