lazyfai

appledaily-to-kindle.py

Jul 5th, 2016

#!/usr/bin/python
# -*- coding: utf-8 -*-

from __future__ import print_function
import boto3
import urllib
import sys
import json
import datetime
import time
import os
import io
import requests
#import SwaggerPetstore  # unused import, not referenced below
from collections import OrderedDict
from bs4 import BeautifulSoup
from ebooklib import epub
from email import encoders
from email.mime.base import MIMEBase
from email.mime.multipart import MIMEMultipart
from email.mime.application import MIMEApplication

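# Scrape Apple Daily (蘋果日報) instant news, bundle the articles into an EPUB,
# convert it to MOBI through the online-convert.com API, then mail the result
# to a Kindle address via Amazon SES.
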
def convertbook(infile, outfile):
    print("converting %s to %s" % (infile, outfile))
    myapikey = '[YOUR ONLINE-CONVERT.COM API KEY HERE]'
    converturl = 'http://api2.online-convert.com/jobs'
    payload = '{ "conversion": [ { "target":"mobi" } ] }'
    headers = {
        'X-Oc-Api-Key': myapikey,
        'Content-Type': "application/json",
        'Cache-Control': "no-cache"
    }
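    # The POST to /jobs creates the conversion job; the response (used below)
    # carries the upload server URL, an upload token and the job id.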
    resp = requests.post(converturl, data=payload, headers=headers)
    #print (resp.text)
    response = json.loads(resp.text)
    print(response)
    if response['status']['code'] == 'incomplete':
        uploadurl = response['server']
        uploadtoken = response['token']
        #uploadid = response['conversion'][0]['id']
        uploadid = response['id']
    else:
        print("convert job init failure")
        sys.exit()
    # upload file now
    headers = {
        'x-oc-api-key': myapikey,
        'X-Oc-Token': uploadtoken
    }
    payload = {'file': ('input.epub', open(infile, 'rb'), 'application/epub+zip', {'Expires': '0'})}
    uploadurl = uploadurl.replace('/dl', '/dl/upload-file')
    print("preparing upload job to %s with token %s" % (uploadurl+'/'+uploadid, uploadtoken))
    resp = requests.post(uploadurl+'/'+uploadid, files=payload, headers=headers)
    print(len(resp.request.body))
    response = json.loads(resp.text)
    print(response)

    if resp.status_code == 200:
        # wait for conversion finish
        print("convert job initialized")
    else:
        print("status code = %s" % str(resp.status_code))
        sys.exit()

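    # poll the job every 3 seconds until online-convert reports it completed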
    jobstatus = "started"
    while jobstatus != "completed":
        time.sleep(3)
        print("checking status by URL %s" % (converturl+'/'+uploadid))
        resp = requests.get(converturl+'/'+uploadid, headers=headers)
        response = json.loads(resp.text)
        print(response)
        jobstatus = response['status']['code']
    # get back the file
    downloaduri = response['output'][0]['uri']
    print("Getting back the file from %s" % downloaduri)
    response = urllib.urlopen(downloaduri).read()
    outputfile = open(outfile, 'wb')
    outputfile.write(response)
    outputfile.close()
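
# Entry point, written in the shape of an AWS Lambda handler (event, context);
# it can also be run directly via the __main__ block at the bottom.
# Intermediate files are written to /tmp.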
def handler(event, context):
    today = datetime.datetime.now()
    print("Buffer size = " + str(io.DEFAULT_BUFFER_SIZE))
    print("Got event")
    print(json.dumps(event))

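    # Build the day's edition as an EPUB with ebooklib: cover, NCX/nav, CSS,
    # then one XHTML chapter per scraped article.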
    book = epub.EpubBook()

    # set metadata
    todaystr = '%d-%d-%d' % (today.year, today.month, today.day)
    book.set_identifier('Apple Daily')
    book.set_title(u'蘋果日報 即時新聞 '+todaystr)
    book.set_language('zh')
    book.add_author(u'蘋果日報')
    logoimg = urllib.urlopen('http://staticlayout.apple.nextmedia.com/web_images/header/ad_logo.jpg').read()
    book.set_cover("cover.jpg", logoimg)
    del(logoimg)

    # add default NCX and Nav file
    myncx = epub.EpubNcx()
    book.add_item(myncx)
    mynav = epub.EpubNav()
    book.add_item(mynav)

    # define CSS style
    style = '''
    @namespace epub "http://www.idpf.org/2007/ops";
    body {
        font-family: Cambria, Liberation Serif, Bitstream Vera Serif, Georgia, Times, Times New Roman, serif;
    }
    '''
    nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style)
    book.add_item(nav_css)

    # basic spine
    book.spine = []
    book.spine.insert(0, 'nav')
    book.spine.insert(0, 'cover')

    # keep only the 即時新聞 ("instant news") section of the site menu
    desired = u'即時新聞'
    desired = desired.encode('utf-8')

    baseurl = "http://hkm.appledaily.com/"
    html = urllib.urlopen(baseurl).read()
    soup = BeautifulSoup(html, "lxml")

    # kill all script and style elements
    for script in soup(["script", "style"]):
        script.extract()    # rip it out

    categories = {}
    # get links
    menu = soup.find(attrs={'class':'menu'})
    for link in menu.find_all('li'):
        ahref = link.find('a', href=True)
        if ahref is not None:
            href = ahref['href']
            if str(href).startswith('list.php') and ahref.getText().encode('utf-8') == desired:
                categories[ahref.getText()] = href
    #print(categories)
    #sys.exit()
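
    # For each matching category, scrape its article list and turn every
    # article into an EPUB chapter (capped at about 20 articles in total).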
    #newslist = {}
    newsid = 0
    #for chapter, category in categories.iteritems():
    for chapter, category in categories.items():
        newslist = OrderedDict()
        #print (chapter)
        #print (category)
        mycat = urllib.urlopen(baseurl+category).read()
        mysoup = BeautifulSoup(mycat, "lxml")
        ul = mysoup.find(attrs={'class':'list'})
        for li in ul.findAll('li'):
            a = li.find('a', href=True)
            title = li.find('p', text=True)
            if a is not None:
                title = title.getText().strip()
                newsurl = baseurl + a.get('href', False)
                newslist[title] = newsurl
                #print (title + '\t' + newsurl)
        for title, news in newslist.iteritems():
            if newsid > 20:
                break
            newscontent = ""
            newsid = newsid + 1
            newsname = 'news-{0:04d}'.format(newsid)
            chapter = epub.EpubHtml(title=title, file_name=newsname+'.xhtml', media_type='application/xhtml+xml', lang='zh')
            chapter.content = u'<b>'+title+u'</b><hr/>'
            print('Title: ' + title)
            print('Link: ' + news)
            mynews = urllib.urlopen(news).read()
            mysoup = BeautifulSoup(mynews, "lxml")

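            # Attach any article images to the book (a copy is also cached
            # under /tmp); note every image in an article reuses the same
            # file name, newsname + '.jpg'.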
            images = mysoup.find(attrs={'class':'news-img'})
            if images is not None:
                for image in images.findAll('img'):
                    imagesrc = image['src']
                    #print('Image: ' + imagesrc)
                    # download the image
                    response = urllib.urlopen(imagesrc).read()
                    imgfile = open('/tmp/'+newsname+'.jpg', 'wb')
                    chapterimg = epub.EpubImage()
                    chapterimg.file_name = newsname+'.jpg'
                    chapterimg.content = response
                    book.add_item(chapterimg)
                    #book.spine.append(chapterimg)
                    imgfile.write(response)
                    imgfile.close()
                    del(chapterimg)
                    del(response)

            article = mysoup.find(attrs={'class':'content-article'})
            chapter.content += u'<center><img src="'+newsname+'.jpg" /></center><hr/>'
            if article is not None:
                for paragraph in article.findAll('p'):
                    if paragraph.get('class') is None and len(paragraph.getText()) > 0:
                        chapter.content += u'<p>'+(paragraph.getText())+u'</p>'
                        #print(paragraph.getText())
            # append the source URL once per chapter, then register the chapter
            chapter.content += u'<hr/><font size=-2><i>'+news+u'</i></font>'
            #print (chapter.content)
            book.toc.append(epub.Link(newsname+'.xhtml', title, newsname))
            book.add_item(chapter)
            book.spine.append(chapter)
            del(chapter)
        #newslist = {}
        del(newslist)
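
    # All chapters collected: write the EPUB, convert it to MOBI and e-mail
    # the result to the Kindle address as an attachment.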
    epub.write_epub('/tmp/output.epub', book, {})
    epubsize = os.path.getsize('/tmp/output.epub')
    partfilename = 'appledaily-%d-%d-%d.mobi' % (today.year, today.month, today.day)
    convertbook('/tmp/output.epub', '/tmp/'+partfilename)
    mobisize = os.path.getsize('/tmp/'+partfilename)
    print("Epub size = %d, Mobi size = %d" % (epubsize, mobisize))
    msg = MIMEMultipart()
    msg['Subject'] = 'convert'
    msg['From'] = 'lazyfai@gmail.com'
    msg['To'] = 'lazyfai@kindle.com'
    partimg = open('/tmp/'+partfilename, 'rb')
    #partimg = open('/tmp/output.epub','rb')
    part = MIMEBase('application', 'octet-stream')
    part.set_payload(partimg.read())
    part.add_header('Content-Disposition', 'attachment', filename=partfilename)
    encoders.encode_base64(part)
    msg.attach(part)
    # I use SES; any other email sending method would work too
    client = boto3.client('ses', region_name='us-east-1')
    #conn = boto3.ses.connect_to_region('us-east-1')
    emailbody = msg.as_string()
    response = client.send_raw_email(
        Source=msg['From'],
        Destinations=[msg['To']],
        RawMessage={
            'Data': emailbody
        }
    )
    return "Finished"

if __name__ == "__main__":
    handler("DryRun", None)