celestialgod

Crawl NHK News Easy

Jan 28th, 2018
337
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.12 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2. import pycurl, time, io, json, os, shutil
  3. from bs4 import BeautifulSoup
  4.  
  5. def pycurl_get_f(url, data_out=None):
  6.   c = pycurl.Curl()
  7.   c.setopt(pycurl.URL, url)
  8.   if data_out is not None:
  9.     c.setopt(c.WRITEDATA, data_out)
  10.   c.perform()
  11.   c.close()
  12.  
  13. def get_news_list_f():
  14.   millis = int(round(time.time() * 1000))
  15.   url = 'http://www3.nhk.or.jp/news/easy/news-list.json?_=' + str(millis)
  16.   buffer = io.BytesIO()
  17.   pycurl_get_f(url, buffer)
  18.   return json.loads(buffer.getvalue())[0]
  19.  
  20. def get_audio_f(subdir, news_id):
  21.   url = 'http://www3.nhk.or.jp/news/easy/{}/{}.mp3'.format(news_id, news_id)
  22.   f = open("{}/audios/{}.mp3".format(subdir, news_id), "wb")
  23.   pycurl_get_f(url, f)
  24.   f.close()
  25.  
  26. def get_news_f(subdir, news_id):
  27.   template_html = """<html><head>
  28. <link rel="stylesheet" type="text/css" href="../style.css"></head><body>
  29. <div id="newstitle">{}</div>
  30. <audio controls><source src="audios/{}.mp3" type="audio/mpeg"></audio>
  31. <div id="newsarticle">{}</div></body></html>"""
  32.   url = 'http://www3.nhk.or.jp/news/easy/{}/{}.html'.format(news_id, news_id)
  33.   buffer = io.BytesIO()
  34.   pycurl_get_f(url, buffer)
  35.   soup = BeautifulSoup(buffer.getvalue(), 'html.parser')
  36.   title = soup.find("div", {"id": "newstitle"})
  37.   news = soup.find("div", {"id": "newsarticle"})
  38.   for a in news.select('a'):
  39.     a.replaceWithChildren()
  40.   file = open('{}/{}.html'.format(subdir, news_id), 'w', encoding = 'UTF-8')
  41.   file.write(template_html.format(str(title), news_id, str(news)))
  42.   file.close()
  43.  
  44.  
  45. if __name__ == "__main__":
  46.   news_list = get_news_list_f()
  47.   for key,value in news_list.items():
  48.     subdir = 'news/' + key.replace('-', '')
  49.     try:
  50.       os.mkdir(subdir)
  51.       os.mkdir(subdir + '/audios')
  52.     except FileExistsError:
  53.       if len(os.listdir(subdir)) - 1 == len(value) and \
  54.         len(os.listdir(subdir + '/audios')) == len(value):
  55.         continue
  56.       else:
  57.         shutil.rmtree(subdir)
  58.         os.mkdir(subdir)
  59.         os.mkdir(subdir + '/audios')
  60.  
  61.     for news in value:
  62.       news_id = news['news_id']
  63.       get_audio_f(subdir, news_id)
  64.       get_news_f(subdir, news_id)
Advertisement
Add Comment
Please, Sign In to add comment