Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- """Kindleprep.py, generate OPF and NCX files from an HTML ebook for Kindle
- conversion.
- Typical usage would be
- ./kindleprep.py [options] file
- where file points to the HTML ebook to prepare. Once done, just run
- kindlegen on the produced OPF file.
- In order to simplify dependencies, the HTML file needs to be parsable
- by an XML parser. This basically means quoting of attribute values,
- closed tags, and escaping of a few reserved characters.
- A few conventions are used to set defaults for the ebook metadata,
- and the generated file names:
- * the default HTML filename format is "[author name] - [ebook
- title].html" and it is used to extract author and title info, you can
- override this behaviour with the --author [author] and --title [title]
- options
- * the language default is 'en-US', you can override it with the --lang
- [language] option
- * the default cover location is '[html filename].jpg', you can override
- this default with the --cover [filename] option
- * the OPF and NCX files are written by default in the same folder and
- with the same HTML ebook filename, with different extensions, eg
- 'Spam - Ham.html' will produce a 'Spam - Ham.opf' and a
- 'Spam - Ham.ncx' file, you can override these defaults with the
- --opf [filename] and --ncx [filename] options
- In the HTML ebook, a convention is used to identify ebook sections used
- to build the navigation:
- * the table of contents inside the ebook must be an HTML element with
- an id of 'chapters'
- * every anchor element inside the table of contents is used as a
- chapter, if it has an href attribute and non-empty text
- A sample table of contents might be something like this::
- <h2><a name="toc"></a>Table of Contents</h2>
- <div id="chapters">
- <p><a href="#chapter-1">Chapter 1</a></p>
- <p><a href="#chapter-2">Chapter 2</a></p>
- </div>
- Copyright (c) 2011 Ludovico Magnocavallo
- This program is free software; you can redistribute it and/or
- modify it under the terms of the GNU General Public License
- as published by the Free Software Foundation; either version 2
- of the License, or (at your option) any later version.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- 02110-1301, USA.
- """
import io
import os
import sys
from optparse import OptionParser
from xml.sax.saxutils import escape

try:
    from hashlib import md5
except ImportError:
    from md5 import new as md5

try:
    from xml.etree import cElementTree as et
except ImportError:
    from xml.etree import ElementTree as et
# Script metadata; the version string is shown by the --version option.
__version__ = "1.1"
__author__ = 'Ludovico Magnocavallo <ludo@qix.it>'

# Minimal stylesheet written next to the ebook and referenced by the OPF
# manifest.
CSS = (
    "blockquote { margin: 12px; }\n"
    "blockquote, blockquote p { font-style: italic; }\n"
)
# Template for the OPF package file. It is filled in with the '%' operator
# and a mapping providing these keys: uid, lang, title, author, basefile
# (path of the stylesheet without the '.css' extension), cover, html, ncx.
# Values are inserted verbatim, so they must already be XML-escaped.
#
# The package 'unique-identifier' attribute names the id of the
# dc:identifier element ("BookId"); it is not the identifier value itself.
OPF = """<?xml version="1.0" encoding="UTF-8"?>
<package xmlns="http://www.idpf.org/2007/opf" version="2.0" unique-identifier="BookId">
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">
<dc:language>%(lang)s</dc:language>
<dc:title>%(title)s</dc:title>
<dc:creator opf:role="aut">%(author)s</dc:creator>
<!-- TODO: identifier -->
<dc:identifier id="BookId">%(uid)s</dc:identifier>
<meta name="cover" content="cover-image"/>
</metadata>
<manifest>
<item id="css" media-type="text/css" href="%(basefile)s.css" />
<item id="cover-image" media-type="image/jpeg" href="%(cover)s" />
<item id="content" media-type="application/xhtml+xml" href="%(html)s" />
<item id="ncx" media-type="application/x-dtbncx+xml" href="%(ncx)s" />
</manifest>
<spine toc="ncx">
<itemref idref="content" />
</spine>
<guide>
<reference type="cover" title="Cover" href="%(html)s#cover"></reference>
<reference type="toc" title="Table of Contents" href="%(html)s#toc"></reference>
<reference type="start" title="Start Reading" href="%(html)s#start"></reference>
</guide>
</package>
"""
# Template for the NCX navigation file. It is filled in with the '%'
# operator and a mapping providing these keys: uid, lang, title, author,
# html and navpoints (the already rendered chapter entries, one NAVPOINT
# per chapter). Values are inserted verbatim, so they must already be
# XML-escaped.
#
# XML element names are case sensitive: the label element is 'navLabel',
# and its opening and closing tags have to match for the file to parse.
NCX = """<?xml version="1.0" encoding="UTF-8"?>
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="%(lang)s">
<head>
<meta name="dtb:uid" content="%(uid)s" />
<meta name="dtb:depth" content="1" />
<meta name="dtb:totalPageCount" content="0" />
<meta name="dtb:maxPageNumber" content="0" />
</head>
<docTitle><text>%(title)s</text></docTitle>
<docAuthor><text>%(author)s</text></docAuthor>
<navMap>
<navPoint id="cover" playOrder="0"><navLabel><text>Cover</text></navLabel><content src="%(html)s#cover"/></navPoint>
<navPoint id="toc" playOrder="1"><navLabel><text>Table of Contents</text></navLabel><content src="%(html)s#toc"/></navPoint>
%(navpoints)s
</navMap>
</ncx>
"""

# Template for a single chapter entry inside the NCX navMap. Keys: id, i
# (the play order), text (the chapter label), html and href (joined to
# build the content source).
NAVPOINT = '<navPoint id="%(id)s" playOrder="%(i)s"><navLabel><text>%(text)s</text></navLabel><content src="%(html)s%(href)s"/></navPoint>'
def _to_text(value):
    """Return value as a text (unicode) string.

    Byte strings, which is what Python 2 provides for command line
    arguments and string literals, are decoded with the file system
    encoding; text strings are returned unchanged.
    """
    if isinstance(value, bytes):
        return value.decode(sys.getfilesystemencoding() or 'utf-8', 'replace')
    return value


def _xml(value):
    """Escape value for use as XML text or inside a double-quoted attribute."""
    return escape(_to_text(value), {'"': '&quot;'})


def _href(target, referrer):
    """Return target as a '/'-separated path relative to referrer's folder.

    Hrefs are resolved relative to the file that contains them, so they
    must not simply repeat the path typed on the command line.
    """
    base = os.path.dirname(os.path.abspath(referrer))
    try:
        relative = os.path.relpath(os.path.abspath(target), base)
    except ValueError:
        # No relative path exists (eg different drives on Windows).
        return target
    return relative.replace(os.sep, '/')


def _find_chapters(tree):
    """Return the chapter anchors of the element with id 'chapters'.

    An anchor counts as a chapter if it has an href attribute and
    non-blank text. Raises SystemExit if the table of contents element or
    its chapters cannot be found.
    """
    toc = next((el for el in tree.iter() if el.get('id') == 'chapters'), None)
    if toc is None:
        raise SystemExit("No table of contents element with id 'chapters' found")
    # Namespaced documents report tags as '{namespace}name'; anchors are
    # looked up in the same namespace as the table of contents element.
    chapter_tag = 'a'
    if toc.tag.startswith('{'):
        chapter_tag = toc.tag[:toc.tag.find('}') + 1] + 'a'
    chapters = [
        el for el in toc.iter()
        if el.tag == chapter_tag and 'href' in el.attrib
        and el.text and el.text.strip()
    ]
    if not chapters:
        raise SystemExit("No chapters found")
    return chapters


def main():
    """Generate the NCX, OPF and CSS files for the HTML ebook given in argv.

    Command line options override the defaults derived from the HTML file
    name (output file names, cover, author and title). Progress messages
    are written to stderr.

    Raises SystemExit if no input file is given, if the input or the cover
    file does not exist, if the HTML cannot be parsed as XML, or if no
    table of contents or chapters are found.
    """
    parser = OptionParser(usage=__doc__, version="%prog " + __version__)
    parser.add_option("--ncx", type="str", dest="ncx", default=None, help="write ncx to file, defaults to the input file with .ncx extension")
    parser.add_option("--opf", type="str", dest="opf", default=None, help="write opf to file, defaults to the input file with .opf extension")
    parser.add_option("--cover", dest="cover", default=None, help="cover file, defaults to the input file with .jpg extension")
    parser.add_option("--author", type="str", dest="author", default=None, help="author name, defaults to anything before the first '-'")
    parser.add_option("--title", type="str", dest="title", default=None, help="title, defaults to anything after the last '-'")
    parser.add_option("--lang", type="str", dest="lang", default="en-US", help="language, defaults to 'en-US'")
    opts, args = parser.parse_args()
    if not args:
        raise SystemExit("No file to process")

    html = args[0]
    stem = os.path.splitext(html)[0]
    cover = opts.cover or stem + '.jpg'
    opf = opts.opf or stem + '.opf'
    ncx = opts.ncx or stem + '.ncx'
    css = stem + '.css'

    log = sys.stderr.write
    log("Input file '%s'\n" % html)
    if not os.path.isfile(html):
        raise SystemExit("Input file not found")
    log("Cover file '%s'\n" % cover)
    if not os.path.isfile(cover):
        raise SystemExit("Cover not found")
    log("OPF file '%s'\n" % opf)
    log("NCX file '%s'\n" % ncx)

    # Only the file name carries the "[author] - [title]" convention; using
    # the whole path would leak the folder name into the author.
    name = os.path.basename(stem)
    author = opts.author or name.split(' - ')[0].strip()
    title = opts.title or name.split(' - ')[-1].strip()
    log("Author '%s'\n" % author)
    log("Title '%s'\n" % title)

    try:
        # Read bytes so that the parser honours the document's own
        # encoding declaration.
        with open(html, 'rb') as source:
            tree = et.XML(source.read())
    except et.ParseError as e:
        raise SystemExit("Error parsing source file: %s" % e)

    chapters = _find_chapters(tree)
    log("%s chapters found\n" % len(chapters))

    # Play orders 0 and 1 are taken by the cover and the table of contents
    # entries hardcoded in the NCX template.
    ncx_html = _xml(_href(html, ncx))
    navpoints = []
    for order, chapter in enumerate(chapters, 2):
        href = chapter.attrib['href']
        navpoints.append(_to_text(NAVPOINT) % dict(
            id=_xml(href.split('#')[-1]),
            i=order,
            text=_xml(chapter.text.strip()),
            html=ncx_html,
            href=_xml(href),
        ))

    # md5 needs bytes; the digest only serves as a stable book identifier.
    uid = md5((_to_text(title) + _to_text(author)).encode('utf-8')).hexdigest()
    metadata = dict(
        uid=uid,
        lang=_xml(opts.lang), author=_xml(author), title=_xml(title),
    )

    log("writing ncx file\n")
    with io.open(ncx, 'w', encoding='utf-8') as out:
        out.write(_to_text(NCX) % dict(
            metadata,
            html=ncx_html,
            navpoints="\n".join(navpoints),
        ))

    log("writing opf file\n")
    with io.open(opf, 'w', encoding='utf-8') as out:
        out.write(_to_text(OPF) % dict(
            metadata,
            html=_xml(_href(html, opf)),
            basefile=_xml(_href(stem, opf)),
            cover=_xml(_href(cover, opf)),
            ncx=_xml(_href(ncx, opf)),
        ))

    log("writing css file\n")
    with io.open(css, 'w', encoding='utf-8') as out:
        out.write(_to_text(CSS))
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement