Advertisement
Guest User

Untitled

a guest
Feb 21st, 2016
95
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.59 KB | None | 0 0
  1. from bs4 import BeautifulSoup
  2. from ebooklib import epub
  3. import sys
  4.  
  5. import urllib2
  6.  
  7. BLOG_HOST='https://blog.balhau.net/'
  8. EBOOK_NAME="Gamma Dreams Book"
  9.  
  10. def getDoc(pageNum):
  11.     try:
  12.         pageUrl='{}?paged={}'.format(BLOG_HOST,str(pageNum))
  13.         page=urllib2.urlopen(pageUrl)
  14.         return BeautifulSoup(page, 'html.parser')
  15.     except:
  16.         return []
  17.  
  18. def parseArticles(articles):
  19.     return 1
  20.     #Build ebook entries here
  21.  
  22. #Function to create ebook headers
  23. def createEbook(idEbook,ebookTitle,lang,authors):
  24.  
  25.     book = epub.EpubBook()
  26.     book.set_identifier(idEbook)
  27.     book.set_title(ebookTitle)
  28.     book.set_language(lang)
  29.  
  30.     for author in authors:
  31.         book.add_author(author)
  32.  
  33.     return book
  34.  
  35. def checkIfIsValidPage(soupObject):
  36.     try:
  37.         len(soupObject.find_all("article"))
  38.         return True
  39.     except:
  40.         return False
  41.  
  42. #articles=doc.find_all("article")
  43.  
  44. #Title
  45. #print articles[0].header.h1.a.string
  46. #Post
  47. #print articles[0].find_all('div')[1].get_text()
  48.  
  49.  
  50. ebook=createEbook("gammaDreamsBook",'Gamma Dreams Blog','pt',['Balhau'])
  51.  
  52. spine=['nav']
  53.  
  54. ##Main loop where we exctract all post info and create ebook entries
  55.  
  56. #dataPublished = entry['published']['$t']
  57.  
  58. UNTITLED=1
  59. pages=[]
  60.  
  61. pageNum=1
  62. soupObject=getDoc(pageNum)
  63.  
  64. print "Extracting data from blog"
  65. while checkIfIsValidPage(soupObject):
  66.     pages.append(soupObject.find_all('article'))
  67.     pageNum+=1
  68.     soupObject=getDoc(pageNum)
  69.  
  70.  
  71. print "Converting into epub"
  72. for page in reversed(pages):
  73.     #Flatten article
  74.     for article in reversed(page):
  75.         #Parse data if article valid
  76.         if article != None:
  77.             title=article.header.h1.get_text()
  78.             if title == None or title.strip() == '':
  79.                 title = "Entry: "+str(UNTITLED)
  80.                 UNTITLED+=1
  81.             title=title.strip()
  82.             sys.stdout.flush()
  83.             content=str(article).decode('utf-8')
  84.             c1 = epub.EpubHtml(title=title, file_name=title+'.xhtml', lang='pt')
  85.             c1.content=content
  86.             ebook.add_item(c1)
  87.             ebook.toc = ebook.toc + [epub.Link(title+'.xhtml', title, title)]
  88.             spine.append(c1)
  89.  
  90. ebook.add_item(epub.EpubNcx())
  91. ebook.add_item(epub.EpubNav())
  92. style = 'BODY {color: white;}'
  93. nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style)
  94. # add CSS file
  95. ebook.add_item(nav_css)
  96. # basic spine
  97. ebook.spine = spine
  98. # write to the file
  99. epub.write_epub(EBOOK_NAME+'.epub', ebook, {})
  100.  
  101. #articles=doc.find_all("article")
  102. #for article in articles:
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement