Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/python
- # -*- coding: utf-8 -*-
- from __future__ import print_function
- import boto3
- import urllib
- import sys
- import json
- import datetime
- import time
- import os
- import io
- import requests
- import SwaggerPetstore
- from collections import OrderedDict
- from bs4 import BeautifulSoup
- from ebooklib import epub
- from email import encoders
- from email.mime.base import MIMEBase
- from email.mime.multipart import MIMEMultipart
- from email.mime.application import MIMEApplication
def convertbook(infile, outfile):
    """Convert an EPUB file to MOBI via the online-convert.com REST API.

    Workflow: create a conversion job, upload *infile* to the job-specific
    upload server, poll every 3 seconds until the job reports
    ``completed``, then download the first output file to *outfile*.

    Parameters:
        infile:  path to the source .epub file.
        outfile: path where the converted .mobi file is written.

    Exits the process (sys.exit) if the job cannot be created or the
    upload is rejected.
    """
    print("converting %s to %s" % (infile, outfile))
    myapikey = '### My online-convert.com API key ###'
    converturl = 'http://api2.online-convert.com/jobs'
    payload = '{ "conversion": [ { "target":"mobi" } ] }'
    # BUG FIX: header name was misspelled 'Conten-Type', so the JSON body
    # was never declared to the API correctly.
    headers = {
        'X-Oc-Api-Key': myapikey,
        'Content-Type': "application/json",
        'Cache-Control': "no-cache"
    }
    resp = requests.post(converturl, data=payload, headers=headers)
    response = json.loads(resp.text)
    print(response)
    if response['status']['code'] == 'incomplete':
        # Job created; remember where to upload and the auth token.
        uploadurl = response['server']
        uploadtoken = response['token']
        uploadid = response['id']
    else:
        print("convert job init failure")
        sys.exit()
    # Upload the source file to the job's upload endpoint.
    headers = {
        'x-oc-api-key': myapikey,
        'X-Oc-Token': uploadtoken
    }
    uploadurl = uploadurl.replace('/dl', '/dl/upload-file')
    print("preparing upload job to %s with token %s" % (uploadurl + '/' + uploadid, uploadtoken))
    # BUG FIX: the input file handle was opened inline and never closed;
    # the context manager guarantees it is released.
    with open(infile, 'rb') as epubfh:
        payload = {'file': ('input.epub', epubfh, 'application/epub+zip', {'Expires': '0'})}
        resp = requests.post(uploadurl + '/' + uploadid, files=payload, headers=headers)
    print(len(resp.request.body))
    response = json.loads(resp.text)
    print(response)
    if resp.status_code == 200:
        print("convert job initialized")
    else:
        print("status code = %s" % str(resp.status_code))
        sys.exit()
    # Poll until the conversion finishes.
    jobstatus = "started"
    while jobstatus != "completed":
        time.sleep(3)
        print("checking status by URL %s" % (converturl + '/' + uploadid))
        resp = requests.get(converturl + '/' + uploadid, headers=headers)
        response = json.loads(resp.text)
        print(response)
        jobstatus = response['status']['code']
    # Fetch the converted file. requests (already imported) replaces
    # urllib.urlopen, which only exists on Python 2.
    downloaduri = response['output'][0]['uri']
    print("Getting back the file from %s" % downloaduri)
    with open(outfile, 'wb') as outputfile:
        outputfile.write(requests.get(downloaduri).content)
def handler(event, context):
    """AWS Lambda entry point: build a MOBI digest of Apple Daily news.

    Scrapes the '即時新聞' (breaking news) category of hkm.appledaily.com,
    assembles up to 21 articles (with one image each) into an EPUB,
    converts it to MOBI with convertbook(), and mails the result to a
    Kindle address via the Mailgun API.

    Parameters:
        event:   Lambda invocation payload; only logged here.
        context: Lambda context object; unused.

    Returns:
        The string "Finished" on success.
    """
    today = datetime.datetime.now()
    print("Buffer size = " + str(io.DEFAULT_BUFFER_SIZE))
    print("Got event")
    print(json.dumps(event))

    # ---- book shell and metadata --------------------------------------
    book = epub.EpubBook()
    todaystr = '%d-%d-%d' % (today.year, today.month, today.day)
    book.set_identifier('Apply Daily')
    book.set_title(u'蘋果日報 即時新聞 ' + todaystr)
    book.set_language('zh')
    book.add_author(u'蘋果日報')
    # requests (already imported) replaces urllib.urlopen, which only
    # exists on Python 2.
    logoimg = requests.get('http://staticlayout.apple.nextmedia.com/web_images/header/ad_logo.jpg').content
    book.set_cover("cover.jpg", logoimg)
    del logoimg
    # Default NCX and Nav plus a minimal stylesheet.
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())
    style = '''
    @namespace epub "http://www.idpf.org/2007/ops";
    body {
        font-family: Cambria, Liberation Serif, Bitstream Vera Serif, Georgia, Times, Times New Roman, serif;
    }
    '''
    nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style)
    book.add_item(nav_css)
    # Basic spine: cover first, then the navigation page.
    book.spine = ['cover', 'nav']

    # ---- locate the desired category on the front page ----------------
    desired = u'即時新聞'.encode('utf-8')
    baseurl = "http://hkm.appledaily.com/"
    soup = BeautifulSoup(requests.get(baseurl).content, "lxml")
    # Strip <script>/<style> so getText() returns clean link labels.
    for script in soup(["script", "style"]):
        script.extract()
    categories = {}
    menu = soup.find(attrs={'class': 'menu'})
    for link in menu.find_all('li'):
        ahref = link.find('a', href=True)
        if ahref is not None:
            href = ahref['href']
            if str(href).startswith('list.php') and ahref.getText().encode('utf-8') == desired:
                categories[ahref.getText()] = href

    # ---- collect article links and build one chapter per article ------
    newsid = 0
    for catname, category in categories.items():
        newslist = OrderedDict()
        mysoup = BeautifulSoup(requests.get(baseurl + category).content, "lxml")
        ul = mysoup.find(attrs={'class': 'list'})
        for li in ul.findAll('li'):
            a = li.find('a', href=True)
            title = li.find('p', text=True)
            if a is not None:
                newslist[title.getText().strip()] = baseurl + a['href']
        # BUG FIX: this loop used dict.iteritems() (Python 2 only) while
        # the outer loop already used .items(); use .items() consistently.
        for title, news in newslist.items():
            if newsid > 20:  # cap the digest at 21 articles
                break
            newsid = newsid + 1
            newsname = 'news-{0:04d}'.format(newsid)
            chapter = epub.EpubHtml(title=title, file_name=newsname + '.xhtml', media_type='application/xhtml+xml', lang='zh')
            chapter.content = u'<b>' + title + u'</b><hr/>'
            print('Title: ' + title)
            print('Link: ' + news)
            mysoup = BeautifulSoup(requests.get(news).content, "lxml")
            images = mysoup.find(attrs={'class': 'news-img'})
            hasimage = False
            if images is not None:
                for image in images.findAll('img'):
                    response = requests.get(image['src']).content
                    chapterimg = epub.EpubImage()
                    # NOTE: all images of an article share one filename,
                    # so only the last one survives (original behavior).
                    chapterimg.file_name = newsname + '.jpg'
                    chapterimg.content = response
                    book.add_item(chapterimg)
                    # BUG FIX: close the temp image handle (was leaked).
                    with open('/tmp/' + newsname + '.jpg', 'wb') as imgfile:
                        imgfile.write(response)
                    hasimage = True
                    del chapterimg
                    del response
            # BUG FIX: the <img> tag was emitted unconditionally, leaving a
            # broken image reference for articles with no pictures.
            if hasimage:
                chapter.content += u'<center><img src="' + newsname + '.jpg" width="80%" /></center><hr/>'
            article = mysoup.find(attrs={'class': 'content-article'})
            if article is not None:
                # Keep only plain (class-less), non-empty paragraphs.
                for paragraph in article.findAll('p'):
                    if paragraph.get('class') is None and len(paragraph.getText()) > 0:
                        chapter.content += u'<p>' + paragraph.getText() + u'</p>'
            # Footer: source link of the article.
            chapter.content += u'<hr/><font size=-2><i>' + news + u'</i></font>'
            book.toc.append(epub.Link(newsname + '.xhtml', title, newsname))
            book.add_item(chapter)
            book.spine.append(chapter)
            del chapter
        del newslist

    # ---- write EPUB, convert to MOBI, email via Mailgun ---------------
    epub.write_epub('/tmp/output.epub', book, {})
    epubsize = os.path.getsize('/tmp/output.epub')
    partfilename = 'appledaily-%d-%d-%d.mobi' % (today.year, today.month, today.day)
    convertbook('/tmp/output.epub', '/tmp/' + partfilename)
    mobisize = os.path.getsize('/tmp/' + partfilename)
    print("Epub size = %d, Mobi size = %d" % (epubsize, mobisize))
    msg = MIMEMultipart()
    msg['Subject'] = 'convert'
    msg['From'] = '### My from email address ###'
    msg['To'] = '### My send to Kindle email address ###'
    # BUG FIX: close the attachment handle after the request (was leaked).
    with open('/tmp/' + partfilename, 'rb') as attachment:
        requests.post(
            "https://api.mailgun.net/v3/### my mailgun domain ###/messages",
            auth=("api", "### my mailgun API key ###"),
            files=[("attachment", attachment)],
            data={
                "from": msg['From'],
                "to": msg['To'],
                "subject": "convert",
                "text": "Hello world"
            }
        )
    return "Finished"
if __name__ == "__main__":
    # Local dry-run entry point: invoke the Lambda handler directly with a
    # placeholder event and no context.
    handler("DryRun", None)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement