Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- import json
- import urllib.request
- import lxml.html
- import html2text
- from urllib.parse import urlparse, urljoin
- import sys
# Maps target .ipynb filename -> absolute URL of the page it is generated
# from; filled by convert() as it discovers links, consumed by crawl().
crawl_url = {}
# URLs already fetched (url -> True) so each page is crawled only once.
visit = {}
# Skeleton nbformat-4 notebook document; crawl() replaces 'cells' and
# serializes the whole structure to disk.  The kernelspec advertises a
# bash kernel, so generated code cells are treated as shell commands.
ipynb = {
    "cells": [
        {
            "cell_type": "code",
            "execution_count": None,
            "metadata": {},
            "outputs": [],
            "source": []
        }
    ],
    "metadata": {
        "kernelspec": {
            "display_name": "Bash",
            "language": "bash",
            "name": "bash"
        },
        "language_info": {
            "codemirror_mode": "shell",
            "file_extension": ".sh",
            "mimetype": "text/x-sh",
            "name": "bash"
        }
    },
    "nbformat": 4,
    "nbformat_minor": 2
}
def markdown(source):
    """Build a markdown notebook cell from html2text output lines.

    Mutates *source* in place: drops one leading and one trailing bare
    newline element and strips the final newline of the last line.

    :param source: list of strings (lines, usually keeping their '\\n').
    :return: an nbformat markdown-cell dict, or None if nothing remains.
    """
    # Guard every indexed access: the original crashed on an empty list
    # and on an empty last string.
    if source and source[0] == '\n':
        del source[0]
    if source and source[-1] == '\n':
        source.pop()
    if not source:
        return None
    if source[-1].endswith('\n'):
        source[-1] = source[-1][:-1]
    return {
        "cell_type": "markdown",
        "metadata": {},
        "source": source
    }
def code(source):
    """Build a code notebook cell from html2text output lines.

    Mutates *source* in place: drops one leading bare newline element,
    pops ALL trailing bare newline elements (unlike markdown(), which
    pops only one), and strips the final newline of the last line.

    :param source: list of strings (lines, usually keeping their '\\n').
    :return: an nbformat code-cell dict, or None if nothing remains.
    """
    if source and source[0] == '\n':
        del source[0]
    # Guard the pop loop: the original raised IndexError once the list
    # was emptied of trailing '\n' elements.
    while source and source[-1] == '\n':
        source.pop()
    if not source:
        return None
    if source[-1].endswith('\n'):
        source[-1] = source[-1][:-1]
    return {
        "cell_type": "code",
        "execution_count": None,
        "metadata": {},
        "outputs": [],
        "source": source
    }
def convert(url, cells, element):
    """Recursively translate the HTML children of *element* into notebook
    cells, appending them to *cells*.

    <div>/<blockquote> recurse; <ul> items become markdown cells with
    cross-page ``.html`` links rewritten to the ``.ipynb`` files this
    crawler will produce (recording the original URL in ``crawl_url``);
    <pre> blocks become code cells; everything else becomes a markdown
    cell with any trailing Sphinx permalink pilcrow stripped.

    :param url: absolute URL of the page being converted (base for links).
    :param cells: output list of nbformat cell dicts (mutated in place).
    :param element: lxml element (or list of elements) to walk.
    """
    for child in list(element):
        if child.tag == 'div':
            convert(url, cells, child)
        elif child.tag == 'ul':
            for item in list(child):
                href = None
                for e in list(item):
                    if e.tag == 'a':
                        href = e.get('href')
                        # e.get() returns None for an <a> without href;
                        # the original slicing crashed on that.
                        if href and href.endswith('.html'):
                            # Rewrite foo.html -> foo.ipynb and remember
                            # the absolute URL so crawl() fetches it later.
                            file = href[:-5] + '.ipynb'
                            crawl_url[file] = urljoin(url, href)
                            href = file
                            e.set('href', href)
                content = lxml.html.tostring(item).decode().rstrip()
                # Only rewritten links have a crawl_url entry; the
                # original indexed crawl_url[href] unconditionally and
                # raised KeyError for non-.html hrefs.
                if href and href in crawl_url:
                    # Append a '*' link back to the original HTML page.
                    if content.endswith('</li>'):
                        content = content[:-5] + '<a href="{}">*</a>'.format(crawl_url[href]) + '</li>'
                    else:
                        content += '<a href="{}">*</a>'.format(crawl_url[href])
                source = html2text.html2text(content).splitlines(keepends=True)
                cell = markdown(source)
                if cell:
                    cells.append(cell)
        elif child.tag == 'pre':
            content = lxml.html.tostring(child)
            source = []
            # html2text indents <pre> content by four spaces; strip that
            # so the code cell holds the plain source line.
            for item in html2text.html2text(content.decode()).splitlines(keepends=True):
                if item[0:4] == '    ':
                    item = item[4:]
                source.append(item)
            cell = code(source)
            if cell:
                cells.append(cell)
        elif child.tag == 'blockquote':
            for qc in list(child):
                convert(url, cells, qc)
        else:
            content = lxml.html.tostring(child)
            source = html2text.html2text(content.decode()).splitlines(keepends=True)
            # Strip the trailing Sphinx heading-permalink pilcrow.
            # NOTE(review): this literal was mojibake in the paste; the
            # two-character slice confirms it is '¶' + newline -- verify
            # against the original source.
            if source and source[0].endswith('¶\n'):
                source[0] = source[0][:-2] + '\n'
            cell = markdown(source)
            if cell:
                cells.append(cell)
def crawl(url, file):
    """Fetch *url*, convert its page body into notebook cells, write them
    to *file* as an nbformat-4 .ipynb, then recursively crawl every page
    that convert() discovered through rewritten links.

    :param url: absolute URL of the HTML page to fetch.
    :param file: output .ipynb filename.
    """
    if visit.get(url):
        return
    print('crawl({}, {})'.format(url, file))
    visit[url] = True
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(url) as response:
        page = response.read()
    tree = lxml.html.fromstring(page)
    body = tree.xpath("//div[@id='body']")
    cells = []
    convert(url, cells, body)
    # A code cell directly following this marker cell holds example
    # output, not runnable code -- demote it to markdown.
    # NOTE(review): the marker literal is mojibake from the paste
    # (apparently Japanese); kept byte-for-byte -- confirm the original.
    for i, cell in enumerate(cells):
        if cell["cell_type"] == "markdown" and \
           cell["source"][0] == "η΅ζ(δΎ):" and \
           i + 1 < len(cells) and \
           cells[i + 1]["cell_type"] == "code":
            cells[i + 1] = markdown(cells[i + 1]["source"])
    ipynb['cells'] = cells
    # Explicit utf-8: ensure_ascii=False emits raw non-ASCII text, which
    # would fail under a non-utf-8 platform default encoding.
    with open(file, "w", encoding="utf-8") as fh:
        fh.write(json.dumps(ipynb, sort_keys=True, ensure_ascii=False, indent=1))
    # Snapshot the dict: the recursive crawl() -> convert() calls add new
    # entries, and iterating the live dict would raise RuntimeError.
    # Pages discovered deeper are picked up by the recursive calls' own
    # snapshots, and visit() guards against duplicates.
    for file, url in list(crawl_url.items()):
        crawl(url, file)
if __name__ == '__main__':
    # Each CLI argument is a page URL; the output notebook is named after
    # the last path component with its extension replaced by .ipynb.
    for url in sys.argv[1:]:
        print('get {}'.format(url))
        page_name = urlparse(url).path.rpartition('/')[-1]
        stem = page_name.rpartition('.')[0]
        crawl(url, stem + '.ipynb')
Add Comment
Please, Sign In to add comment