Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- import json
- import urllib.request
- import lxml.html
- import html2text
- from urllib.parse import urlparse, urljoin
- import sys
# Maps target .ipynb filename -> absolute URL of the page it is generated
# from; filled by convert() as it discovers links, consumed by crawl().
crawl_url = {}
# URLs already fetched (url -> True) so each page is crawled only once.
visit = {}
# Skeleton nbformat-4 notebook document; crawl() replaces 'cells' and
# serializes the whole structure to disk.  The kernelspec advertises a
# bash kernel, so generated code cells are treated as shell commands.
ipynb = {
    "cells": [
        {
            "cell_type": "code",
            "execution_count": None,
            "metadata": {},
            "outputs": [],
            "source": []
        }
    ],
    "metadata": {
        "kernelspec": {
            "display_name": "Bash",
            "language": "bash",
            "name": "bash"
        },
        "language_info": {
            "codemirror_mode": "shell",
            "file_extension": ".sh",
            "mimetype": "text/x-sh",
            "name": "bash"
        }
    },
    "nbformat": 4,
    "nbformat_minor": 2
}
def markdown(source):
    """Build a markdown notebook cell from html2text output lines.

    Mutates *source* in place: drops one leading and one trailing bare
    newline element and strips the final newline of the last line.

    :param source: list of strings (lines, usually keeping their '\\n').
    :return: an nbformat markdown-cell dict, or None if nothing remains.
    """
    # Guard every indexed access: the original crashed on an empty list
    # and on an empty last string.
    if source and source[0] == '\n':
        del source[0]
    if source and source[-1] == '\n':
        source.pop()
    if not source:
        return None
    if source[-1].endswith('\n'):
        source[-1] = source[-1][:-1]
    return {
        "cell_type": "markdown",
        "metadata": {},
        "source": source
    }
def code(source):
    """Build a code notebook cell from html2text output lines.

    Mutates *source* in place: drops one leading bare newline element,
    pops ALL trailing bare newline elements (unlike markdown(), which
    pops only one), and strips the final newline of the last line.

    :param source: list of strings (lines, usually keeping their '\\n').
    :return: an nbformat code-cell dict, or None if nothing remains.
    """
    if source and source[0] == '\n':
        del source[0]
    # Guard the pop loop: the original raised IndexError once the list
    # was emptied of trailing '\n' elements.
    while source and source[-1] == '\n':
        source.pop()
    if not source:
        return None
    if source[-1].endswith('\n'):
        source[-1] = source[-1][:-1]
    return {
        "cell_type": "code",
        "execution_count": None,
        "metadata": {},
        "outputs": [],
        "source": source
    }
def convert(url, cells, element):
    """Recursively translate the HTML children of *element* into notebook
    cells, appending them to *cells*.

    <div>/<blockquote> recurse; <ul> items become markdown cells with
    cross-page ``.html`` links rewritten to the ``.ipynb`` files this
    crawler will produce (recording the original URL in ``crawl_url``);
    <pre> blocks become code cells; everything else becomes a markdown
    cell with any trailing Sphinx permalink pilcrow stripped.

    :param url: absolute URL of the page being converted (base for links).
    :param cells: output list of nbformat cell dicts (mutated in place).
    :param element: lxml element (or list of elements) to walk.
    """
    for child in list(element):
        if child.tag == 'div':
            convert(url, cells, child)
        elif child.tag == 'ul':
            for item in list(child):
                href = None
                for e in list(item):
                    if e.tag == 'a':
                        href = e.get('href')
                        # e.get() returns None for an <a> without href;
                        # the original slicing crashed on that.
                        if href and href.endswith('.html'):
                            # Rewrite foo.html -> foo.ipynb and remember
                            # the absolute URL so crawl() fetches it later.
                            file = href[:-5] + '.ipynb'
                            crawl_url[file] = urljoin(url, href)
                            href = file
                            e.set('href', href)
                content = lxml.html.tostring(item).decode().rstrip()
                # Only rewritten links have a crawl_url entry; the
                # original indexed crawl_url[href] unconditionally and
                # raised KeyError for non-.html hrefs.
                if href and href in crawl_url:
                    # Append a '*' link back to the original HTML page.
                    if content.endswith('</li>'):
                        content = content[:-5] + '<a href="{}">*</a>'.format(crawl_url[href]) + '</li>'
                    else:
                        content += '<a href="{}">*</a>'.format(crawl_url[href])
                source = html2text.html2text(content).splitlines(keepends=True)
                cell = markdown(source)
                if cell:
                    cells.append(cell)
        elif child.tag == 'pre':
            content = lxml.html.tostring(child)
            source = []
            # html2text indents <pre> content by four spaces; strip that
            # so the code cell holds the plain source line.
            for item in html2text.html2text(content.decode()).splitlines(keepends=True):
                if item[0:4] == '    ':
                    item = item[4:]
                source.append(item)
            cell = code(source)
            if cell:
                cells.append(cell)
        elif child.tag == 'blockquote':
            for qc in list(child):
                convert(url, cells, qc)
        else:
            content = lxml.html.tostring(child)
            source = html2text.html2text(content.decode()).splitlines(keepends=True)
            # Strip the trailing Sphinx heading-permalink pilcrow.
            # NOTE(review): this literal was mojibake in the paste; the
            # two-character slice confirms it is '¶' + newline -- verify
            # against the original source.
            if source and source[0].endswith('¶\n'):
                source[0] = source[0][:-2] + '\n'
            cell = markdown(source)
            if cell:
                cells.append(cell)
def crawl(url, file):
    """Fetch *url*, convert its page body into notebook cells, write them
    to *file* as an nbformat-4 .ipynb, then recursively crawl every page
    that convert() discovered through rewritten links.

    :param url: absolute URL of the HTML page to fetch.
    :param file: output .ipynb filename.
    """
    if visit.get(url):
        return
    print('crawl({}, {})'.format(url, file))
    visit[url] = True
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(url) as response:
        page = response.read()
    tree = lxml.html.fromstring(page)
    body = tree.xpath("//div[@id='body']")
    cells = []
    convert(url, cells, body)
    # A code cell directly following this marker cell holds example
    # output, not runnable code -- demote it to markdown.
    # NOTE(review): the marker literal is mojibake from the paste
    # (apparently Japanese); kept byte-for-byte -- confirm the original.
    for i, cell in enumerate(cells):
        if cell["cell_type"] == "markdown" and \
           cell["source"][0] == "η΅ζ(δΎ):" and \
           i + 1 < len(cells) and \
           cells[i + 1]["cell_type"] == "code":
            cells[i + 1] = markdown(cells[i + 1]["source"])
    ipynb['cells'] = cells
    # Explicit utf-8: ensure_ascii=False emits raw non-ASCII text, which
    # would fail under a non-utf-8 platform default encoding.
    with open(file, "w", encoding="utf-8") as fh:
        fh.write(json.dumps(ipynb, sort_keys=True, ensure_ascii=False, indent=1))
    # Snapshot the dict: the recursive crawl() -> convert() calls add new
    # entries, and iterating the live dict would raise RuntimeError.
    # Pages discovered deeper are picked up by the recursive calls' own
    # snapshots, and visit() guards against duplicates.
    for file, url in list(crawl_url.items()):
        crawl(url, file)
if __name__ == '__main__':
    # Each CLI argument is a page URL; the output notebook is named after
    # the last path component with its extension replaced by .ipynb.
    for url in sys.argv[1:]:
        print('get {}'.format(url))
        page_name = urlparse(url).path.rpartition('/')[-1]
        stem = page_name.rpartition('.')[0]
        crawl(url, stem + '.ipynb')
Add Comment
Please, Sign In to add comment