# Kindle/Mobipocket conversion helper

#!/usr/bin/env python
"""Kindleprep.py, generate OPF and NCX files from an HTML ebook for Kindle
conversion.

Typical usage would be

./kindleprep.py [options] file

where file points to the HTML ebook to prepare. Once done, just run
kindlegen on the produced OPF file.

In order to simplify dependencies, the HTML file needs to be parsable
by an XML parser. This basically means quoting of attribute values,
closed tags, and escaping of a few reserved characters.

A few conventions are used to set defaults for the ebook metadata,
and the generated file names:

* the default HTML filename format is "[author name] - [ebook
  title].html" and it is used to extract author and title info, you can
  override this behaviour with the --author [author] and --title [title]
  options

* the language default is 'en-US', you can override it with the --lang
  [language] option

* the default cover location is '[html filename].jpg', you can override
  this default with the --cover [filename] option

* the OPF and NCX files are written by default in the same folder and
  with the same HTML ebook filename, with different extensions, eg
  'Spam - Ham.html' will produce a 'Spam - Ham.opf' and a
  'Spam - Ham.ncx' file, you can override these defaults with the
  --opf [filename] and --ncx [filename] options

In the HTML ebook, a convention is used to identify ebook sections used
to build the navigation:

* the table of contents inside the ebook must be an HTML element with
  an id of 'chapters'

* every anchor element inside the table of contents is used as a
  chapter, if it has an href attribute and non-empty text

A sample table of contents might be something like this::

    <h2><a name="toc"></a>Table of Contents</h2>
    <div id="chapters">
    <p><a href="#chapter-1">Chapter 1</a></p>
    <p><a href="#chapter-2">Chapter 2</a></p>
    </div>

Copyright (c) 2011 Ludovico Magnocavallo

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.
"""
import io
import os
import sys

from optparse import OptionParser
from xml.sax.saxutils import escape
try:
    from hashlib import md5
except ImportError:
    from md5 import new as md5
try:
    from xml.etree import cElementTree as et
except ImportError:
    from xml.etree import ElementTree as et


# Release number, shown by the command line --version flag.
__version__ = '1.1'
__author__ = "Ludovico Magnocavallo <ludo@qix.it>"


# Minimal stylesheet written next to the ebook as '<html basename>.css' and
# referenced from the OPF manifest.
CSS = (
    "    blockquote { margin: 12px; }\n"
    "    blockquote, blockquote p { font-style: italic; }\n"
)

# Template for the OPF package file, filled in with the '%' operator.
# Placeholders: uid, lang, title, author, basefile (HTML path without its
# extension, used for the CSS href), cover, html and ncx.
# The package 'unique-identifier' attribute must name the id of the
# dc:identifier element ("BookId"), not repeat the identifier value itself.
# The guide entries point at '#cover', '#toc' and '#start' anchors in the
# HTML file.
OPF = """<?xml version="1.0" encoding="UTF-8"?>
<package xmlns="http://www.idpf.org/2007/opf" version="2.0" unique-identifier="BookId">
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">
    <dc:language>%(lang)s</dc:language>
    <dc:title>%(title)s</dc:title>
    <dc:creator opf:role="aut">%(author)s</dc:creator>
    <!-- TODO: identifier -->
    <dc:identifier id="BookId">%(uid)s</dc:identifier>
    <meta name="cover" content="cover-image"/>
</metadata>
<manifest>
    <item id="css" media-type="text/css" href="%(basefile)s.css" />
    <item id="cover-image" media-type="image/jpeg" href="%(cover)s" />
    <item id="content" media-type="application/xhtml+xml" href="%(html)s" />
    <item id="ncx" media-type="application/x-dtbncx+xml" href="%(ncx)s" />
</manifest>
<spine toc="ncx">
    <itemref idref="content" />
</spine>
<guide>
    <reference type="cover" title="Cover" href="%(html)s#cover"></reference>
    <reference type="toc" title="Table of Contents" href="%(html)s#toc"></reference>
    <reference type="start" title="Start Reading" href="%(html)s#start"></reference>
</guide>
</package>
"""

# Template for the NCX navigation file, filled in with the '%' operator.
# Placeholders: uid, lang, title, author, html and navpoints (the chapter
# entries, one NAVPOINT per chapter, joined by newlines).
# XML is case sensitive: the label element is spelled 'navLabel' in both its
# opening and closing tag so that the document is well-formed.
NCX = """<?xml version="1.0" encoding="UTF-8"?>
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="%(lang)s">
<head>
  <meta name="dtb:uid" content="%(uid)s" />
  <meta name="dtb:depth" content="1" />
  <meta name="dtb:totalPageCount" content="0" />
  <meta name="dtb:maxPageNumber" content="0" />
</head>
<docTitle><text>%(title)s</text></docTitle>
<docAuthor><text>%(author)s</text></docAuthor>
<navMap>
    <navPoint id="cover" playOrder="0"><navLabel><text>Cover</text></navLabel><content src="%(html)s#cover"/></navPoint>
    <navPoint id="toc" playOrder="1"><navLabel><text>Table of Contents</text></navLabel><content src="%(html)s#toc"/></navPoint>
%(navpoints)s
</navMap>
</ncx>
"""

# Template for a single chapter entry inside the NCX navMap.
# Placeholders: id, i (the playOrder value), text (the label), html and href
# (concatenated to build the content src).
NAVPOINT = '<navPoint id="%(id)s" playOrder="%(i)s"><navLabel><text>%(text)s</text></navLabel><content src="%(html)s%(href)s"/></navPoint>'


def _as_text(value):
    """Return ``value`` as text, decoding byte strings when needed.

    On Python 2 command line arguments and ASCII-only parsed XML text are
    byte strings; everything is converted to text before being merged into
    the output templates so that byte and text strings are never mixed.
    """
    if isinstance(value, bytes):
        return value.decode(sys.getfilesystemencoding() or 'utf-8', 'replace')
    return value


def _as_bytes(value):
    """Return ``value`` as bytes (UTF-8 encoded if it is text), for hashing."""
    if isinstance(value, bytes):
        return value
    return value.encode('utf-8')


def _xml_escape(value):
    """Escape ``value`` for XML text or a double-quoted attribute value."""
    return escape(_as_text(value), {'"': '&quot;'})


def _href(target, referrer):
    """Return ``target`` as a '/'-separated path relative to ``referrer``'s folder."""
    start = os.path.dirname(referrer) or os.curdir
    try:
        relative = os.path.relpath(target, start)
    except ValueError:
        # No relative path exists (e.g. different drives on Windows).
        relative = target
    return relative.replace(os.sep, '/')


def _write_text(path, content):
    """Write ``content`` to ``path`` as UTF-8, closing the file afterwards."""
    with io.open(path, 'w', encoding='utf-8') as handle:
        handle.write(_as_text(content))


def _find_chapters(tree):
    """Return the ``(href, label)`` pairs of the chapter links in ``tree``.

    The table of contents is the first element with an id of 'chapters';
    every anchor inside it that has an href attribute and non-blank text is
    a chapter. Raises SystemExit if the element or the chapters are missing.
    """
    toc = None
    for el in tree.iter():
        if el.attrib.get('id') == 'chapters':
            toc = el
            break

    if toc is None:
        raise SystemExit("No table of contents element with id 'chapters' found")

    # Namespaced documents report tags as '{namespace}name': look for
    # anchors in the same namespace as the table of contents element.
    chapter_tag = 'a'
    if toc.tag[0] == '{':
        chapter_tag = toc.tag[:toc.tag.find('}') + 1] + 'a'

    chapters = []
    for el in toc.iter(chapter_tag):
        # Collect the text of nested inline markup too, e.g. <a><em>..</em></a>.
        label = ''.join(el.itertext()).strip()
        if 'href' in el.attrib and label:
            chapters.append((el.attrib['href'], label))

    if not chapters:
        raise SystemExit("No chapters found")

    return chapters


def main():
    """Generate the OPF, NCX and CSS files for the HTML ebook named in argv.

    Parses the command line, checks that the HTML and cover files exist,
    extracts the chapter links from the element with an id of 'chapters'
    and writes the three files. Progress messages go to stderr.

    Raises SystemExit with an explanatory message when no input file is
    given, the HTML or cover file is missing, the HTML cannot be parsed as
    XML, or no table of contents / chapters are found.
    """

    parser = OptionParser(usage=__doc__, version="%prog " + __version__)
    parser.add_option("--ncx", type="str", dest="ncx", default=None, help="write ncx to file, defaults to the input file with .ncx extension")
    parser.add_option("--opf", type="str", dest="opf", default=None, help="write opf to file, defaults to the input file with .opf extension")
    parser.add_option("--cover", dest="cover", default=None, help="cover file, defaults to the input file with .jpg extension")
    parser.add_option("--author", type="str", dest="author", default=None, help="author name, defaults to anything before the first '-'")
    parser.add_option("--title", type="str", dest="title", default=None, help="title, defaults to anything after the last '-'")
    parser.add_option("--lang", type="str", dest="lang", default="en-US", help="language, defaults to 'en-US'")

    (opts, args) = parser.parse_args()

    if not args:
        raise SystemExit("No file to process")

    html = args[0]
    _html = os.path.splitext(html)[0]

    cover = opts.cover or _html + '.jpg'
    opf = opts.opf or _html + '.opf'
    ncx = opts.ncx or _html + '.ncx'

    sys.stderr.write("Input file '%s'\n" % html)
    if not os.path.isfile(html):
        raise SystemExit("Input file not found")
    sys.stderr.write("Cover file '%s'\n" % cover)
    if not os.path.isfile(cover):
        raise SystemExit("Cover not found")
    sys.stderr.write("OPF file '%s'\n" % opf)
    sys.stderr.write("NCX file '%s'\n" % ncx)

    # Only the file name follows the "[author] - [title]" convention: any
    # leading directory must not leak into the author or the title.
    name = os.path.basename(_html)
    author = opts.author or name.split(' - ')[0].strip()
    title = opts.title or name.split(' - ')[-1].strip()

    sys.stderr.write("Author '%s'\n" % author)
    sys.stderr.write("Title '%s'\n" % title)

    try:
        # Read bytes so the XML parser honours the document's own encoding.
        with open(html, 'rb') as source:
            tree = et.XML(source.read())
    except et.ParseError as e:
        raise SystemExit("Error parsing source file: %s" % e)

    chapters = _find_chapters(tree)

    sys.stderr.write("%s chapters found\n" % len(chapters))

    # Values shared by both templates, escaped once for XML.
    meta = dict(
        uid=md5(_as_bytes(title + author)).hexdigest(),
        lang=_xml_escape(opts.lang),
        author=_xml_escape(author),
        title=_xml_escape(title),
    )

    # Links inside each generated file are resolved relative to that file.
    ncx_html = _xml_escape(_href(html, ncx))

    # playOrder 0 and 1 are taken by the cover and the table of contents.
    navpoints = [
        NAVPOINT % dict(
            id=_xml_escape(href.split('#')[-1]),
            i=order,
            text=_xml_escape(label),
            html=ncx_html,
            href=_xml_escape(href),
        )
        for order, (href, label) in enumerate(chapters, 2)
    ]

    sys.stderr.write("writing ncx file\n")

    _write_text(ncx, NCX % dict(
        meta,
        html=ncx_html,
        navpoints="\n".join(navpoints),
    ))

    sys.stderr.write("writing opf file\n")

    _write_text(opf, OPF % dict(
        meta,
        html=_xml_escape(_href(html, opf)),
        basefile=_xml_escape(_href(_html, opf)),
        cover=_xml_escape(_href(cover, opf)),
        ncx=_xml_escape(_href(ncx, opf)),
    ))

    sys.stderr.write("writing css file\n")

    _write_text(_html + '.css', CSS)


# Run the conversion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()