Advertisement
Guest User

Untitled

a guest
Jan 28th, 2018
580
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.33 KB | None | 0 0
  1. import requests
  2. from bs4 import BeautifulSoup
  3. import os
  4. import zipfile
  5.  
  6.  
  7. def find_between(file):
  8.     f = open(file, "r", encoding = "utf8")
  9.     soup = BeautifulSoup(f, 'html.parser')
  10.     return soup.title
  11.  
  12.  
  13. def download(link, file_name):
  14.     page = requests.get(link).text
  15.     file = open(file_name, "w", encoding="utf8")
  16.     file.write(page)
  17.     file.close()
  18.  
  19.  
  20. def clean(file_name_in, file_name_out):
  21.     raw = open(file_name_in, "r", encoding="utf8")
  22.     soup = BeautifulSoup(raw, "html.parser")
  23.     soup = soup.find(itemprop="articleBody")
  24.     text = soup.text
  25.     text = text.replace("Previous Chapter", "").replace("Next Chapter", "")
  26.     text = text.lstrip().rstrip()
  27.     chapter_title = text.split('\n', 1)[0]
  28.     text = text.replace(chapter_title, "")
  29.     text = text.lstrip().rstrip()
  30.     text = text.split("\n\r")[0]
  31.     text = text.replace("\n", "</p>\n<p>")
  32.     raw.close()
  33.     file = open(file_name_out, "w", encoding="utf8")
  34.     file.write('<html xmlns="http://www.w3.org/1999/xhtml">')
  35.     file.write("\n<head>")
  36.     file.write("\n<title>" + chapter_title + "</title>")
  37.     file.write("\n</head>")
  38.     file.write("\n<body>")
  39.     file.write("\n<strong>" + chapter_title + "</strong>" + "\n<p>")
  40.     file.write(text)
  41.     file.write("</p>")
  42.     file.write("\n</body>")
  43.     file.write("\n</html>")
  44.     os.remove(file_name_in)
  45.  
  46.  
  47. def generate(html_files, novelname, author, chapter_s, chapter_e):
  48.     epub = zipfile.ZipFile(novelname + "_" + chapter_s + "-" + chapter_e + ".epub", "w")
  49.     epub.writestr("META-INF/container.xml", '''<container version="1.0"
  50.    xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
  51.      <rootfiles>
  52.        <rootfile full-path="OEBPS/Content.opf" media-type="application/oebps-package+xml"/>
  53.      </rootfiles>
  54.    </container>''')
  55.  
  56.     index_tpl = '''<package version="3.1"
  57.    xmlns="http://www.idpf.org/2007/opf">
  58.      <metadata>
  59.        %(metadata)s
  60.          </metadata>
  61.            <manifest>
  62.              %(manifest)s2
  63.            </manifest>
  64.            <spine>
  65.              <itemref idref="toc" linear="no"/>
  66.              %(spine)s
  67.            </spine>
  68.    </package>'''
  69.  
  70.     manifest = ""
  71.     spine = ""
  72.     metadata = '''<dc:title xmlns:dc="http://purl.org/dc/elements/1.1/">%(novelname)s</dc:title>
  73.      <dc:creator xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:ns0="http://www.idpf.org/2007/opf" ns0:role="aut" ns0:file-as="NaN">%(author)s</dc:creator>
  74.        <meta xmlns:dc="http://purl.org/dc/elements/1.1/" name="calibre:series" content="%(series)s"/>''' \
  75.     % {
  76.           "novelname": novelname + ": " + chapter_s + "-" + chapter_e, "author": author, "series": novelname}
  77.     toc_manifest = '<item href="toc.xhtml" id="toc" properties="nav" media-type="application/xhtml+xml"/>'
  78.  
  79.     for i, html in enumerate(html_files):
  80.         basename = os.path.basename(html)
  81.         manifest += '<item id="file_%s" href="%s" media-type="application/xhtml+xml"/>' % (
  82.             i + 1, basename)
  83.         spine += '<itemref idref="file_%s" />' % (i + 1)
  84.         epub.write(html, "OEBPS/" + basename)
  85.  
  86.     epub.writestr("OEBPS/Content.opf", index_tpl % {
  87.                   "metadata": metadata,
  88.                   "manifest": manifest + toc_manifest,
  89.                   "spine": spine, })
  90.  
  91.     toc_start = '''<?xml version='1.0' encoding='utf-8'?>
  92.    <!DOCTYPE html>
  93.    <html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
  94.        <head>
  95.            <title>%(novelname)s</title>
  96.        </head>
  97.            <body>
  98.                <section class="frontmatter TableOfContents">
  99.            <header>
  100.                <h1>Contents</h1>
  101.            </header>
  102.                <nav id="toc" role="doc-toc" epub:type="toc">
  103.                    <ol>
  104.                        %(toc_mid)s
  105.                        %(toc_end)s'''
  106.     toc_mid = ""
  107.     toc_end = '''</ol></nav></section></body></html>'''
  108.  
  109.     for i, y in enumerate(html_files):
  110.         chapter = find_between(html_files[i])
  111.         chapter = str(chapter)
  112.         toc_mid += '''<li class="toc-Chapter-rw" id="num_%s">
  113.        <a href="%s">%s</a>
  114.        </li>''' % (i, html_files[i], chapter)
  115.  
  116.     epub.writestr("OEBPS/toc.xhtml", toc_start % {"novelname": novelname, "toc_mid": toc_mid, "toc_end": toc_end})
  117.     epub.close()
  118.  
  119.     for x in html_files:
  120.         os.remove(x)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement