Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
## Scrape and combine WordPress chapters into an ebook [ EPUB file ]
## EXAMPLE OF USAGE:
###   rainofsnow_to_epub('https://rainofsnow.com/the-tragedy-of-the-villainess')
####  output at https://drive.google.com/file/d/1Ob3ZSZTvjvpOmBYKSfgfzuxn613p2Dwf
## REQUIREMENTS:
###   linkToSoup [ scraperFn ] at https://pastebin.com/rBTr06vy
###   confParse at https://pastebin.com/c0TjDmNE
# ! pip install ebooklib
# from ebooklib import epub
# import time
# Selector spec for a novel's landing page; used as the default `selRef_b`
# of rainofsnow_to_epub, which hands it to confParse.  The spec format itself
# is defined by confParse (not visible in this file).
bSel = {
    'title': 'div.text>h2',
    'synopsis': 'div#synop',
    # (selector, attribute) pair -- presumably yields the href of the link to
    # the first chapter; verify against confParse.
    'c1_link': ('span.chapter>a[href]', 'href'),
    'details': {
        # NOTE(review): looks like a (label, value) selector pair applied to
        # each item matched by 'listSel' -- confirm against confParse.
        'tSel': ('span.vt1', 'span.vt1+small.vt2'),
        'listSel': 'ul>li:has(span.vt1+small.vt2)',
        # rainofsnow_to_epub calls .get() / .items() on the parsed 'details',
        # so the result set is expected to end up as a dict.
        '__apply2resultSet__': dict,
    },
}
# Selector spec for a single chapter page; used as the default `selRef_c`
# of rainofsnow_to_epub, which hands it to confParse.
chSel = {
    # '{{chapCt}}' is a placeholder that rainofsnow_to_epub replaces with the
    # running chapter number before each chapter is parsed.
    'title': 'div#item{{chapCt}}.bb-item>div.content>div.scroller>h2',
    # 5-tuple: selector, three unused (None) slots, then 'prettify'.
    # NOTE(review): the meaning of each slot is defined by confParse -- confirm.
    'html': ('div.scroller>div.zoomdesc-cont', None, None, None, 'prettify'),
    # (selector, attribute) pair -- presumably the href of the next-chapter
    # link; scraping stops once the parsed value is falsy.
    'next_link': ('nav>ul>li:has(>a)+li>a:not(.disable)', 'href'),
}
def rainofsnow_to_epub(tl_url, scraperFn=linkToSoup, selRef_b=bSel, selRef_c=chSel):
    """Scrape a novel chapter by chapter and save it as ``<title>.epub``.

    The landing page at ``tl_url`` is fetched and parsed with ``selRef_b`` to
    obtain the title, synopsis, detail fields and the first-chapter link.
    Chapters are then fetched one at a time, following each page's
    ``next_link`` until it is falsy, and bundled into one EPUB.

    Args:
        tl_url: URL of the novel's landing page.
        scraperFn: Callable invoked as ``scraperFn(url, isv=True)`` for the
            landing page and ``scraperFn(url, isv=False, returnErr=True)``
            for chapters.  A ``str`` returned for a chapter is treated as an
            error message: the same chapter is retried after 5 seconds, with
            no upper bound on the number of retries.
        selRef_b: Selector spec passed to ``confParse`` for the landing page.
            The parsed result must provide ``'title'``, ``'synopsis'``,
            ``'c1_link'`` and a dict under ``'details'``.
        selRef_c: Selector spec passed to ``confParse`` for each chapter.
            Its ``'title'`` entry is a template in which ``'{{chapCt}}'`` is
            replaced by the 1-based chapter number.  The mapping is copied,
            never modified, so it can be reused across calls.

    Returns:
        None.  The book is written to ``f"{title}.epub"`` and progress is
        printed to stdout.
    """
    from urllib.parse import parse_qs, urlsplit

    book_dets = confParse(scraperFn(tl_url, isv=True), selRef_b)
    chapterLink = book_dets['c1_link']

    # Identifier: timestamp-based, suffixed with the site's novel id when the
    # first-chapter URL carries one (``...?novelid=123``).
    nid = f'rainofsnow_{int(time.time())}_nid'
    novel_ids = parse_qs(urlsplit(chapterLink or '').query).get('novelid')
    if novel_ids:
        nid += novel_ids[0]

    book = epub.EpubBook()
    book.set_identifier(nid)
    book.set_title(book_dets['title'])
    book.set_language('en')
    book.add_author(book_dets['details'].get('Author', 'UNKNOWN'))
    book.add_metadata('DC', 'description', book_dets['synopsis'])
    for mk, mv in book_dets['details'].items():
        if mk == 'Author':
            continue  # already recorded through add_author
        book.add_metadata(None, 'meta', '', {'name': mk, 'content': mv})

    # Work on a private copy: the default selRef_c is the shared module-level
    # dict, and writing the substituted title into it would destroy the
    # '{{chapCt}}' placeholder for every later call.
    sel_c = dict(selRef_c)
    title_tmpl = sel_c['title']

    chapCt, chList = 0, []
    while chapterLink:
        ch_no = chapCt + 1
        sel_c['title'] = title_tmpl.replace('{{chapCt}}', f'{ch_no}')
        cSoup = scraperFn(chapterLink, isv=False, returnErr=True)
        if isinstance(cSoup, str):
            # Fetch failed; chapCt is not advanced, so the same chapter is
            # requested again after the pause.
            print('', end=f'\rWaiting 5s before retrying ch{ch_no}: !{cSoup}')
            time.sleep(5)
            continue
        chapCt = ch_no

        ch_data = confParse(cSoup, sel_c)
        ch_fn = f'ch{chapCt}.xhtml'
        ch_name = f'[Ch{chapCt}] {ch_data["title"]}'
        ch_body = f'<h2>{ch_name}</h2>\n{ch_data["html"]}'
        ch_cur = epub.EpubHtml(title=ch_name, file_name=ch_fn, lang='en')
        ch_cur.set_content(f'<html><body>{ch_body}</body></html>')
        book.add_item(ch_cur)
        chList.append(ch_cur)
        print('', end=f'\rAdded as {repr(ch_fn)}: {repr(ch_name)}')
        chapterLink = ch_data['next_link']

    style = 'body { font-family: Times, Times New Roman, serif; }'
    nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css",
                            media_type="text/css", content=style)
    book.add_item(nav_css)

    book.toc, book.spine = chList, ['nav', *chList]
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    out_path = f"{book_dets['title']}.epub"
    epub.write_epub(out_path, book)
    print(f'\rSaved {len(chList)} chapters to', repr(out_path))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement