Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- import pycurl, time, io, json, os, shutil
- from bs4 import BeautifulSoup
def pycurl_get_f(url, data_out=None):
    """Perform an HTTP GET of *url* using pycurl.

    Args:
        url: Address to fetch.
        data_out: Optional writable binary object (for example an
            ``io.BytesIO`` or a file opened in ``"wb"`` mode) that receives
            the response body.  When ``None``, no write target is configured
            and pycurl's default output handling applies.

    Raises:
        pycurl.error: Propagated from pycurl if the transfer fails.
    """
    c = pycurl.Curl()
    try:
        c.setopt(pycurl.URL, url)
        if data_out is not None:
            c.setopt(c.WRITEDATA, data_out)
        c.perform()
    finally:
        # Always release the handle, even when setopt()/perform() raises.
        c.close()
def get_news_list_f():
    """Download the NHK Easy News index and return its first JSON element.

    The current time in milliseconds is appended to the URL as the ``_``
    query parameter (presumably as a cache buster).  The response body is
    parsed as JSON and element ``[0]`` of the top-level value is returned;
    the script's entry point iterates it with ``.items()``, so it is
    expected to be a mapping.
    """
    timestamp_ms = str(int(round(time.time() * 1000)))
    response = io.BytesIO()
    pycurl_get_f(
        'http://www3.nhk.or.jp/news/easy/news-list.json?_=' + timestamp_ms,
        response,
    )
    payload = json.loads(response.getvalue())
    return payload[0]
def get_audio_f(subdir, news_id):
    """Download the MP3 for *news_id* into ``<subdir>/audios/<news_id>.mp3``.

    Args:
        subdir: Directory for the news batch; its ``audios`` subdirectory
            must already exist.
        news_id: Identifier used both in the remote URL and the local
            file name.
    """
    url = 'http://www3.nhk.or.jp/news/easy/{}/{}.mp3'.format(news_id, news_id)
    # The context manager closes the file even if the download raises.
    with open("{}/audios/{}.mp3".format(subdir, news_id), "wb") as f:
        pycurl_get_f(url, f)
def get_news_f(subdir, news_id):
    """Download one article and save a simplified local HTML copy.

    The article page is fetched, the ``div#newstitle`` and
    ``div#newsarticle`` elements are extracted, every ``<a>`` inside the
    article is replaced by its own contents (so link text is kept but the
    links are dropped), and the result is written to
    ``<subdir>/<news_id>.html`` together with an ``<audio>`` player that
    points at ``audios/<news_id>.mp3``.

    Args:
        subdir: Existing directory that receives the HTML file.
        news_id: Identifier used both in the remote URL and the local
            file names.

    Raises:
        AttributeError: If the page has no ``div#newsarticle`` element
            (``soup.find`` returns ``None`` in that case).
    """
    template_html = """<html><head>
<link rel="stylesheet" type="text/css" href="../style.css"></head><body>
<div id="newstitle">{}</div>
<audio controls><source src="audios/{}.mp3" type="audio/mpeg"></audio>
<div id="newsarticle">{}</div></body></html>"""
    url = 'http://www3.nhk.or.jp/news/easy/{}/{}.html'.format(news_id, news_id)
    buffer = io.BytesIO()
    pycurl_get_f(url, buffer)
    soup = BeautifulSoup(buffer.getvalue(), 'html.parser')
    title = soup.find("div", {"id": "newstitle"})
    news = soup.find("div", {"id": "newsarticle"})
    for a in news.select('a'):
        # unwrap() is the current name of the deprecated replaceWithChildren().
        a.unwrap()
    # The context manager closes the file even if formatting/writing raises.
    with open('{}/{}.html'.format(subdir, news_id), 'w',
              encoding='UTF-8') as out_file:
        out_file.write(template_html.format(str(title), news_id, str(news)))
if __name__ == "__main__":
    # Mirror every batch listed in the news index into news/<key without '-'>/.
    news_list = get_news_list_f()
    for key, value in news_list.items():
        subdir = 'news/' + key.replace('-', '')
        audio_dir = subdir + '/audios'
        try:
            # makedirs also creates the parent 'news' directory on first run;
            # it still raises FileExistsError when subdir itself exists.
            os.makedirs(subdir)
        except FileExistsError:
            # A batch is complete when it holds one HTML file per entry (the
            # "- 1" discounts the 'audios' directory) and one audio file per
            # entry.  The isdir() guard covers a directory left behind by an
            # interrupted run that never got its 'audios' subdirectory.
            if (os.path.isdir(audio_dir)
                    and len(os.listdir(subdir)) - 1 == len(value)
                    and len(os.listdir(audio_dir)) == len(value)):
                continue
            # Incomplete batch: wipe it and download everything again.
            shutil.rmtree(subdir)
            os.makedirs(subdir)
        os.mkdir(audio_dir)
        for news in value:
            news_id = news['news_id']
            get_audio_f(subdir, news_id)
            get_news_f(subdir, news_id)
Advertisement
Add Comment
Please, Sign In to add comment