Advertisement
Guest User

Untitled

a guest
Feb 21st, 2016
70
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. from bs4 import BeautifulSoup
  2. from ebooklib import epub
  3. import sys
  4.  
  5. import urllib2
  6.  
  7. BLOG_HOST='https://blog.balhau.net/'
  8. EBOOK_NAME="Gamma Dreams Book"
  9.  
  10. def getDoc(pageNum):
  11.     try:
  12.         pageUrl='{}?paged={}'.format(BLOG_HOST,str(pageNum))
  13.         page=urllib2.urlopen(pageUrl)
  14.         return BeautifulSoup(page, 'html.parser')
  15.     except:
  16.         return []
  17.  
  18. def parseArticles(articles):
  19.     return 1
  20.     #Build ebook entries here
  21.  
  22. #Function to create ebook headers
  23. def createEbook(idEbook,ebookTitle,lang,authors):
  24.  
  25.     book = epub.EpubBook()
  26.     book.set_identifier(idEbook)
  27.     book.set_title(ebookTitle)
  28.     book.set_language(lang)
  29.  
  30.     for author in authors:
  31.         book.add_author(author)
  32.  
  33.     return book
  34.  
  35. def checkIfIsValidPage(soupObject):
  36.     try:
  37.         len(soupObject.find_all("article"))
  38.         return True
  39.     except:
  40.         return False
  41.  
  42. #articles=doc.find_all("article")
  43.  
  44. #Title
  45. #print articles[0].header.h1.a.string
  46. #Post
  47. #print articles[0].find_all('div')[1].get_text()
  48.  
  49.  
  50. ebook=createEbook("gammaDreamsBook",'Gamma Dreams Blog','pt',['Balhau'])
  51.  
  52. spine=['nav']
  53.  
  54. ##Main loop where we exctract all post info and create ebook entries
  55.  
  56. #dataPublished = entry['published']['$t']
  57.  
  58. UNTITLED=1
  59. pages=[]
  60.  
  61. pageNum=1
  62. soupObject=getDoc(pageNum)
  63.  
  64. print "Extracting data from blog"
  65. while checkIfIsValidPage(soupObject):
  66.     pages.append(soupObject.find_all('article'))
  67.     pageNum+=1
  68.     soupObject=getDoc(pageNum)
  69.  
  70.  
  71. print "Converting into epub"
  72. for page in reversed(pages):
  73.     #Flatten article
  74.     for article in reversed(page):
  75.         #Parse data if article valid
  76.         if article != None:
  77.             title=article.header.h1.get_text()
  78.             if title == None or title.strip() == '':
  79.                 title = "Entry: "+str(UNTITLED)
  80.                 UNTITLED+=1
  81.             title=title.strip()
  82.             sys.stdout.flush()
  83.             content=str(article).decode('utf-8')
  84.             c1 = epub.EpubHtml(title=title, file_name=title+'.xhtml', lang='pt')
  85.             c1.content=content
  86.             ebook.add_item(c1)
  87.             ebook.toc = ebook.toc + [epub.Link(title+'.xhtml', title, title)]
  88.             spine.append(c1)
  89.  
  90. ebook.add_item(epub.EpubNcx())
  91. ebook.add_item(epub.EpubNav())
  92. style = 'BODY {color: white;}'
  93. nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style)
  94. # add CSS file
  95. ebook.add_item(nav_css)
  96. # basic spine
  97. ebook.spine = spine
  98. # write to the file
  99. epub.write_epub(EBOOK_NAME+'.epub', ebook, {})
  100.  
  101. #articles=doc.find_all("article")
  102. #for article in articles:
Advertisement
RAW Paste Data Copied
Advertisement