Advertisement
lazyfai

appledaily-to-kindle-mailgun

Jul 6th, 2016
265
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 8.21 KB | None | 0 0
  1. #!/usr/bin/python
  2. # -*- coding: utf-8 -*-
  3.  
  4. from __future__ import print_function
  5. import boto3
  6. import urllib
  7. import sys
  8. import json
  9. import datetime
  10. import time
  11. import os
  12. import io
  13. import requests
  14. import SwaggerPetstore
  15. from collections import OrderedDict
  16. from bs4 import BeautifulSoup
  17. from ebooklib import epub
  18. from email import encoders
  19. from email.mime.base import MIMEBase
  20. from email.mime.multipart import MIMEMultipart
  21. from email.mime.application import MIMEApplication
  22.  
  23. def convertbook(infile, outfile):
  24.     print ("converting %s to %s" % (infile, outfile))
  25.         myapikey = '### My online-convert.com API key ###'
  26.     converturl = 'http://api2.online-convert.com/jobs'
  27.     payload = '{ "conversion": [ { "target":"mobi" } ] }'
  28.     headers = {
  29.         'X-Oc-Api-Key': myapikey,
  30.         'Conten-Type': "application/json",
  31.         'Cache-Control': "no-cache"
  32.     }
  33.     resp = requests.post(converturl, data=payload, headers=headers)
  34.     #print (resp.text)
  35.     response = json.loads(resp.text)
  36.     print (response)
  37.     if (response['status']['code'] == 'incomplete'):
  38.         uploadurl = response['server']
  39.         uploadtoken = response['token']
  40.         #uploadid = response['conversion'][0]['id']
  41.         uploadid = response['id']
  42.     else:
  43.         print ("convert job init failure")
  44.         sys.exit()
  45.     # upload file now
  46.     headers = {
  47.         'x-oc-api-key': myapikey,
  48.         'X-Oc-Token': uploadtoken
  49.     }
  50.     payload = {'file': ('input.epub', open(infile, 'rb'), 'application/epub+zip', {'Expires': '0'})}
  51.     uploadurl = uploadurl.replace('/dl', '/dl/upload-file')
  52.     print ("preparing upload job to %s with token %s" % (uploadurl+'/'+uploadid, uploadtoken))
  53.     resp = requests.post(uploadurl+'/'+uploadid, files=payload, headers=headers)
  54.     print (len(resp.request.body))
  55.     response = json.loads(resp.text)
  56.     print (response)
  57.     if (resp.status_code == 200):
  58.         # wait for conversion finish
  59.         print ("convert job initialized")
  60.     else:
  61.         print ("status code = %s" % str(resp.status_code) )
  62.         sys.exit()
  63.     jobstatus = "started"
  64.     while (jobstatus != "completed"):
  65.         time.sleep(3)
  66.         print ("checking status by URL %s" % (converturl+'/'+uploadid))
  67.         resp = requests.get(converturl+'/'+uploadid, headers=headers)
  68.         response = json.loads(resp.text)
  69.         print (response)
  70.         jobstatus = response['status']['code']
  71.     # get back the file
  72.     downloaduri = response['output'][0]['uri']
  73.     print ("Getting back the file from %s" % downloaduri)
  74.     response = urllib.urlopen(downloaduri).read()
  75.     outputfile = open(outfile,'wb')
  76.     outputfile.write(response)
  77.     outputfile.close()
  78.  
  79. def handler(event, context):
  80.     today = datetime.datetime.now()
  81.         print("Buffer size = " + str(io.DEFAULT_BUFFER_SIZE))
  82.     print("Got event")
  83.     print(json.dumps(event))
  84.  
  85.     book = epub.EpubBook()
  86.    
  87.     # set metadata
  88.     todaystr = '%d-%d-%d' % (today.year, today.month, today.day)
  89.     book.set_identifier('Apply Daily')
  90.     book.set_title(u'蘋果日報 即時新聞 '+todaystr)
  91.     book.set_language('zh')
  92.     book.add_author(u'蘋果日報')
  93.     logoimg = urllib.urlopen('http://staticlayout.apple.nextmedia.com/web_images/header/ad_logo.jpg').read()
  94.         book.set_cover("cover.jpg", logoimg)
  95.     del(logoimg)
  96.  
  97.     # add default NCX and Nav file
  98.     myncx = epub.EpubNcx()
  99.     book.add_item(myncx)
  100.     mynav = epub.EpubNav()
  101.     book.add_item(mynav)
  102.    
  103.     # define CSS style
  104.     style = '''
  105.     @namespace epub "http://www.idpf.org/2007/ops";
  106.     body {
  107.         font-family: Cambria, Liberation Serif, Bitstream Vera Serif, Georgia, Times, Times New Roman, serif;
  108.     }
  109.     '''
  110.     nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style)
  111.     book.add_item(nav_css)
  112.    
  113.     # basic spine
  114.     book.spine = []
  115.     book.spine.insert(0, 'nav')
  116.     book.spine.insert(0, 'cover')
  117.    
  118.     desired = u'即時新聞'
  119.     desired = desired.encode('utf-8')
  120.    
  121.     baseurl = "http://hkm.appledaily.com/"
  122.     html = urllib.urlopen(baseurl).read()
  123.     soup = BeautifulSoup(html, "lxml")
  124.    
  125.     # kill all script and style elements
  126.     for script in soup(["script", "style"]):
  127.         script.extract()    # rip it out
  128.    
  129.     categories = {}
  130.     # get links
  131.     menu = soup.find(attrs={'class':'menu'})
  132.     for link in menu.find_all('li'):
  133.         ahref = link.find('a', href=True)
  134.         if ahref is not None:
  135.             href = ahref['href']
  136.             if (str(href).startswith('list.php') and ahref.getText().encode('utf-8') == desired):
  137.                 categories[ahref.getText()] = href
  138.     #print(categories)
  139.     #sys.exit()
  140.    
  141.     #newslist = {}
  142.     newsid = 0
  143.     #for chapter, category in categories.iteritems():
  144.     for chapter, category in categories.items():
  145.             newslist = OrderedDict()
  146.         #print (chapter)
  147.         #print (category)
  148.         mycat = urllib.urlopen(baseurl+category).read()
  149.         mysoup = BeautifulSoup(mycat, "lxml")
  150.         ul = mysoup.find(attrs={'class':'list'})
  151.         for li in ul.findAll('li'):
  152.             a = li.find('a', href=True)
  153.             title = li.find('p', text=True)
  154.             if a is not None:
  155.                 title = title.getText().strip()
  156.                 newsurl = baseurl + a.get('href', False)
  157.                 newslist[title] = newsurl
  158.                 #print (title + '\t' + newsurl)
  159.         for title, news in newslist.iteritems():
  160.                     if (newsid > 20):
  161.                         break
  162.                     newscontent = ""
  163.                     newsid = newsid + 1
  164.                     newsname = 'news-{0:04d}'.format(newsid)
  165.                     chapter = epub.EpubHtml(title=title, file_name=newsname+'.xhtml', media_type='application/xhtml+xml', lang='zh')
  166.                     chapter.content = u'<b>'+title+u'</b><hr/>'
  167.             print('Title: ' + title)
  168.             print('Link: ' + news)
  169.             mynews = urllib.urlopen(news).read()
  170.             mysoup = BeautifulSoup(mynews, "lxml")
  171.    
  172.             images = mysoup.find(attrs={'class':'news-img'})
  173.             if images is not None:
  174.                 for image in images.findAll('img'):
  175.                     imagesrc = image['src']
  176.                     #print('Image: ' + imagesrc)
  177.                                     # download the image
  178.                                     response = urllib.urlopen(imagesrc).read()
  179.                                     imgfile = open('/tmp/'+newsname+'.jpg','wb')
  180.                                     chapterimg = epub.EpubImage()
  181.                                     chapterimg.file_name = newsname+'.jpg'
  182.                                     chapterimg.content = response
  183.                                     book.add_item(chapterimg)
  184.                                     #book.spine.append(chapterimg)
  185.                                     imgfile.write(response)
  186.                                     imgfile.close()
  187.                                     del(chapterimg)
  188.                                     del(response)
  189.    
  190.             article = mysoup.find(attrs={'class':'content-article'})
  191.             chapter.content += u'<center><img src="'+newsname+'.jpg" width="80%" /></center><hr/>'
  192.             if article is not None:
  193.                 for paragraph in article.findAll('p'):
  194.                     if paragraph.get('class') is None and len(paragraph.getText()) > 0:
  195.                         chapter.content += u'<p>'+(paragraph.getText())+u'</p>'
  196.                         #print(paragraph.getText())
  197.                     chapter.content += u'<hr/><font size=-2><i>'+news+u'</i></font>'
  198.                     #print (chapter.content)
  199.                         book.toc.append(epub.Link(newsname+'.xhtml', title, newsname))
  200.                     book.add_item(chapter)
  201.                     book.spine.append(chapter)
  202.                     del (chapter)
  203.         #newslist = {}
  204.             del (newslist)
  205.     epub.write_epub('/tmp/output.epub', book, {})
  206.         epubsize = os.path.getsize('/tmp/output.epub')
  207.     partfilename = 'appledaily-%d-%d-%d.mobi' % (today.year, today.month, today.day)
  208.     #if (event == "DryRun"):
  209.     #   sys.exit()
  210.     convertbook('/tmp/output.epub', '/tmp/'+partfilename)
  211.         mobisize = os.path.getsize('/tmp/'+partfilename)
  212.         print ("Epub size = %d, Mobi size = %d" % (epubsize, mobisize))
  213.         msg = MIMEMultipart()
  214.         msg['Subject'] = 'convert'
  215.         msg['From'] = '### My from email address ###'
  216.         msg['To'] = '### My send to Kindle email address ###'
  217.         attachment = open('/tmp/'+partfilename,'rb')
  218.         requests.post(
  219.             "https://api.mailgun.net/v3/### my mailgun domain ###/messages",
  220.             auth=("api", "### my mailgun API key ###"),
  221.             files=[("attachment", attachment)],
  222.             data={
  223.                 "from": msg['From'],
  224.                 "to": msg['To'],
  225.                 "subject": "convert",
  226.                 "text": "Hello world"
  227.             }
  228.         )
  229.     return "Finished"
  230.  
  231. if __name__ == "__main__":
  232.     handler("DryRun", None)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement