lazyfai

appledaily-to-kindle.py

Jul 5th, 2016

#!/usr/bin/python
# -*- coding: utf-8 -*-

from __future__ import print_function
import boto3
import urllib
import sys
import json
import datetime
import time
import os
import io
import requests
#import SwaggerPetstore  # unused import, not referenced below
from collections import OrderedDict
from bs4 import BeautifulSoup
from ebooklib import epub
from email import encoders
from email.mime.base import MIMEBase
from email.mime.multipart import MIMEMultipart
from email.mime.application import MIMEApplication

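# Scrape Apple Daily (蘋果日報) instant news, bundle the articles into an EPUB,
# convert it to MOBI through the online-convert.com API, then mail the result
# to a Kindle address via Amazon SES.
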
def convertbook(infile, outfile):
    print("converting %s to %s" % (infile, outfile))
    myapikey = '[YOUR ONLINE-CONVERT.COM API KEY HERE]'
    converturl = 'http://api2.online-convert.com/jobs'
    payload = '{ "conversion": [ { "target":"mobi" } ] }'
    headers = {
        'X-Oc-Api-Key': myapikey,
        'Content-Type': "application/json",
        'Cache-Control': "no-cache"
    }
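    # The POST to /jobs creates the conversion job; the response (used below)
    # carries the upload server URL, an upload token and the job id.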
    resp = requests.post(converturl, data=payload, headers=headers)
    #print (resp.text)
    response = json.loads(resp.text)
    print(response)
    if response['status']['code'] == 'incomplete':
        uploadurl = response['server']
        uploadtoken = response['token']
        #uploadid = response['conversion'][0]['id']
        uploadid = response['id']
    else:
        print("convert job init failure")
        sys.exit()
    # upload file now
    headers = {
        'x-oc-api-key': myapikey,
        'X-Oc-Token': uploadtoken
    }
    payload = {'file': ('input.epub', open(infile, 'rb'), 'application/epub+zip', {'Expires': '0'})}
    uploadurl = uploadurl.replace('/dl', '/dl/upload-file')
    print("preparing upload job to %s with token %s" % (uploadurl+'/'+uploadid, uploadtoken))
    resp = requests.post(uploadurl+'/'+uploadid, files=payload, headers=headers)
    print(len(resp.request.body))
    response = json.loads(resp.text)
    print(response)

    if resp.status_code == 200:
        # wait for conversion finish
        print("convert job initialized")
    else:
        print("status code = %s" % str(resp.status_code))
        sys.exit()

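    # poll the job every 3 seconds until online-convert reports it completed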
    jobstatus = "started"
    while jobstatus != "completed":
        time.sleep(3)
        print("checking status by URL %s" % (converturl+'/'+uploadid))
        resp = requests.get(converturl+'/'+uploadid, headers=headers)
        response = json.loads(resp.text)
        print(response)
        jobstatus = response['status']['code']
    # get back the file
    downloaduri = response['output'][0]['uri']
    print("Getting back the file from %s" % downloaduri)
    response = urllib.urlopen(downloaduri).read()
    outputfile = open(outfile, 'wb')
    outputfile.write(response)
    outputfile.close()
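
# Entry point, written in the shape of an AWS Lambda handler (event, context);
# it can also be run directly via the __main__ block at the bottom.
# Intermediate files are written to /tmp.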
def handler(event, context):
    today = datetime.datetime.now()
    print("Buffer size = " + str(io.DEFAULT_BUFFER_SIZE))
    print("Got event")
    print(json.dumps(event))

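    # Build the day's edition as an EPUB with ebooklib: cover, NCX/nav, CSS,
    # then one XHTML chapter per scraped article.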
    book = epub.EpubBook()

    # set metadata
    todaystr = '%d-%d-%d' % (today.year, today.month, today.day)
    book.set_identifier('Apple Daily')
    book.set_title(u'蘋果日報 即時新聞 '+todaystr)
    book.set_language('zh')
    book.add_author(u'蘋果日報')
    logoimg = urllib.urlopen('http://staticlayout.apple.nextmedia.com/web_images/header/ad_logo.jpg').read()
    book.set_cover("cover.jpg", logoimg)
    del(logoimg)

    # add default NCX and Nav file
    myncx = epub.EpubNcx()
    book.add_item(myncx)
    mynav = epub.EpubNav()
    book.add_item(mynav)

    # define CSS style
    style = '''
    @namespace epub "http://www.idpf.org/2007/ops";
    body {
        font-family: Cambria, Liberation Serif, Bitstream Vera Serif, Georgia, Times, Times New Roman, serif;
    }
    '''
    nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style)
    book.add_item(nav_css)

    # basic spine
    book.spine = []
    book.spine.insert(0, 'nav')
    book.spine.insert(0, 'cover')

    # keep only the 即時新聞 ("instant news") section of the site menu
    desired = u'即時新聞'
    desired = desired.encode('utf-8')

    baseurl = "http://hkm.appledaily.com/"
    html = urllib.urlopen(baseurl).read()
    soup = BeautifulSoup(html, "lxml")

    # kill all script and style elements
    for script in soup(["script", "style"]):
        script.extract()    # rip it out

    categories = {}
    # get links
    menu = soup.find(attrs={'class':'menu'})
    for link in menu.find_all('li'):
        ahref = link.find('a', href=True)
        if ahref is not None:
            href = ahref['href']
            if str(href).startswith('list.php') and ahref.getText().encode('utf-8') == desired:
                categories[ahref.getText()] = href
    #print(categories)
    #sys.exit()
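
    # For each matching category, scrape its article list and turn every
    # article into an EPUB chapter (capped at about 20 articles in total).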
    #newslist = {}
    newsid = 0
    #for chapter, category in categories.iteritems():
    for chapter, category in categories.items():
        newslist = OrderedDict()
        #print (chapter)
        #print (category)
        mycat = urllib.urlopen(baseurl+category).read()
        mysoup = BeautifulSoup(mycat, "lxml")
        ul = mysoup.find(attrs={'class':'list'})
        for li in ul.findAll('li'):
            a = li.find('a', href=True)
            title = li.find('p', text=True)
            if a is not None:
                title = title.getText().strip()
                newsurl = baseurl + a.get('href', False)
                newslist[title] = newsurl
                #print (title + '\t' + newsurl)
        for title, news in newslist.iteritems():
            if newsid > 20:
                break
            newscontent = ""
            newsid = newsid + 1
            newsname = 'news-{0:04d}'.format(newsid)
            chapter = epub.EpubHtml(title=title, file_name=newsname+'.xhtml', media_type='application/xhtml+xml', lang='zh')
            chapter.content = u'<b>'+title+u'</b><hr/>'
            print('Title: ' + title)
            print('Link: ' + news)
            mynews = urllib.urlopen(news).read()
            mysoup = BeautifulSoup(mynews, "lxml")

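            # Attach any article images to the book (a copy is also cached
            # under /tmp); note every image in an article reuses the same
            # file name, newsname + '.jpg'.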
            images = mysoup.find(attrs={'class':'news-img'})
            if images is not None:
                for image in images.findAll('img'):
                    imagesrc = image['src']
                    #print('Image: ' + imagesrc)
                    # download the image
                    response = urllib.urlopen(imagesrc).read()
                    imgfile = open('/tmp/'+newsname+'.jpg', 'wb')
                    chapterimg = epub.EpubImage()
                    chapterimg.file_name = newsname+'.jpg'
                    chapterimg.content = response
                    book.add_item(chapterimg)
                    #book.spine.append(chapterimg)
                    imgfile.write(response)
                    imgfile.close()
                    del(chapterimg)
                    del(response)

            article = mysoup.find(attrs={'class':'content-article'})
            chapter.content += u'<center><img src="'+newsname+'.jpg" /></center><hr/>'
            if article is not None:
                for paragraph in article.findAll('p'):
                    if paragraph.get('class') is None and len(paragraph.getText()) > 0:
                        chapter.content += u'<p>'+(paragraph.getText())+u'</p>'
                        #print(paragraph.getText())
            # append the source URL once per chapter, then register the chapter
            chapter.content += u'<hr/><font size=-2><i>'+news+u'</i></font>'
            #print (chapter.content)
            book.toc.append(epub.Link(newsname+'.xhtml', title, newsname))
            book.add_item(chapter)
            book.spine.append(chapter)
            del(chapter)
        #newslist = {}
        del(newslist)
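
    # All chapters collected: write the EPUB, convert it to MOBI and e-mail
    # the result to the Kindle address as an attachment.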
    epub.write_epub('/tmp/output.epub', book, {})
    epubsize = os.path.getsize('/tmp/output.epub')
    partfilename = 'appledaily-%d-%d-%d.mobi' % (today.year, today.month, today.day)
    convertbook('/tmp/output.epub', '/tmp/'+partfilename)
    mobisize = os.path.getsize('/tmp/'+partfilename)
    print("Epub size = %d, Mobi size = %d" % (epubsize, mobisize))
    msg = MIMEMultipart()
    msg['Subject'] = 'convert'
    msg['From'] = 'lazyfai@gmail.com'
    msg['To'] = 'lazyfai@kindle.com'
    partimg = open('/tmp/'+partfilename, 'rb')
    #partimg = open('/tmp/output.epub','rb')
    part = MIMEBase('application', 'octet-stream')
    part.set_payload(partimg.read())
    part.add_header('Content-Disposition', 'attachment', filename=partfilename)
    encoders.encode_base64(part)
    msg.attach(part)
    # I use SES; any other email sending method would work too
    client = boto3.client('ses', region_name='us-east-1')
    #conn = boto3.ses.connect_to_region('us-east-1')
    emailbody = msg.as_string()
    response = client.send_raw_email(
        Source=msg['From'],
        Destinations=[msg['To']],
        RawMessage={
            'Data': emailbody
        }
    )
    return "Finished"

if __name__ == "__main__":
    handler("DryRun", None)