#!/usr/bin/env python

import json
import sys
import urllib.request
from urllib.parse import urljoin, urlparse

import html2text
import lxml.html
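
# A small crawler that converts HTML tutorial pages into Jupyter notebooks
# with a Bash kernel: <pre> blocks become code cells, other markup becomes
# markdown cells, and internal *.html links are rewritten to point at the
# generated *.ipynb files, each of which is then crawled in turn.
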
crawl_url = {}  # notebook file name -> source page URL, queued by convert()
visit = {}      # pages already fetched, so each URL is crawled only once

# Notebook skeleton (nbformat 4) with a Bash kernelspec; the placeholder
# cell list is replaced with the converted cells of each page in crawl().
ipynb = {
    "cells": [
        {
            "cell_type": "code",
            "execution_count": None,
            "metadata": {},
            "outputs": [],
            "source": []
        }
    ],
    "metadata": {
        "kernelspec": {
            "display_name": "Bash",
            "language": "bash",
            "name": "bash"
        },
        "language_info": {
            "codemirror_mode": "shell",
            "file_extension": ".sh",
            "mimetype": "text/x-sh",
            "name": "bash"
        }
    },
    "nbformat": 4,
    "nbformat_minor": 2
}


def markdown(source):
    # Wrap a list of text lines in a markdown cell, trimming one leading and
    # one trailing blank line plus the final newline; None if nothing is left.
    if source and source[0] == '\n': del source[0]
    if source and source[-1] == '\n': source.pop()
    if len(source) == 0: return None
    if source[-1][-1] == '\n': source[-1] = source[-1][:-1]
    return {
        "cell_type": "markdown",
        "metadata": {},
        "source": source
    }


def code(source):
    # Wrap a list of text lines in a code cell, trimming leading and trailing
    # blank lines plus the final newline; None if nothing is left.
    if source and source[0] == '\n': del source[0]
    while source and source[-1] == '\n': source.pop()
    if len(source) == 0: return None
    if source[-1][-1] == '\n': source[-1] = source[-1][:-1]
    return {
        "cell_type": "code",
        "execution_count": None,
        "metadata": {},
        "outputs": [],
        "source": source
    }


def convert(url, cells, element):
    # Walk the page body and translate each child element into notebook
    # cells: <pre> becomes a code cell, <ul> link lists and everything else
    # become markdown cells; <div> and <blockquote> are descended into.
    for child in list(element):
        if child.tag == 'div':
            convert(url, cells, child)
        elif child.tag == 'ul':
            for item in list(child):
                href = None
                for e in list(item):
                    if e.tag == 'a':
                        href = e.get('href')
                        if href and href.endswith('.html'):
                            # Rewrite the link to the notebook that will be
                            # generated for that page, and queue the page.
                            file = href[:-5] + '.ipynb'
                            crawl_url[file] = urljoin(url, href)
                            href = file
                            e.set('href', href)
                content = lxml.html.tostring(item).decode().rstrip()
                if href in crawl_url:
                    # Append a '*' link back to the original HTML page.
                    link = '<a href="{}">*</a>'.format(crawl_url[href])
                    if content.endswith('</li>'):
                        content = content[:-5] + link + '</li>'
                    else:
                        content += link
                source = html2text.html2text(content).splitlines(keepends=True)
                cell = markdown(source)
                if cell:
                    cells.append(cell)
        elif child.tag == 'pre':
            content = lxml.html.tostring(child)
            source = []
            for item in html2text.html2text(content.decode()).splitlines(keepends=True):
                # html2text indents code blocks by four spaces; strip that
                # so the code cell starts at column zero.
                if item.startswith('    '): item = item[4:]
                source.append(item)
            cell = code(source)
            if cell:
                cells.append(cell)
        elif child.tag == 'blockquote':
            for qc in list(child):
                convert(url, cells, qc)
        else:
            content = lxml.html.tostring(child)
            source = html2text.html2text(content.decode()).splitlines(keepends=True)
            # Strip the pilcrow (¶) header link appended to headings.
            if source and source[0][-2:] == '¶\n':
                source[0] = source[0][:-2] + '\n'
            cell = markdown(source)
            if cell:
                cells.append(cell)


def crawl(url, file):
    # Fetch one page, convert it into notebook cells, write the .ipynb file,
    # then crawl every page discovered through its links.
    if visit.get(url): return
    print('crawl({}, {})'.format(url, file))
    visit[url] = True
    page = urllib.request.urlopen(url).read()
    tree = lxml.html.fromstring(page)
    # Take the (single) matching content element of the page.
    body = tree.xpath("//div[@id='body']")[0]

    cells = []
    convert(url, cells, body)

    # Demote example output: a code cell that directly follows a "結果(例):"
    # ("Result (example):") marker is sample output, not runnable code.
    for i, cell in enumerate(cells):
        if cell["cell_type"] == "markdown" and \
           cell["source"][0] == "結果(例):" and \
           i + 1 < len(cells) and cells[i + 1]["cell_type"] == "code":
            cells[i + 1] = markdown(cells[i + 1]["source"])

    ipynb['cells'] = cells

    with open(file, "w") as fh:
        fh.write(json.dumps(ipynb, sort_keys=True, ensure_ascii=False, indent=1))

    # Iterate over a snapshot: nested crawl() calls add entries to crawl_url
    # while this loop is running.
    for file, url in list(crawl_url.items()):
        crawl(url, file)


if __name__ == '__main__':
    for url in sys.argv[1:]:
        print('get {}'.format(url))
        # Name the notebook after the last path component of the URL.
        file = urlparse(url).path.rpartition('/')[-1].rpartition('.')[0] + '.ipynb'
        crawl(url, file)
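
# Usage sketch (URL and script name here are hypothetical; the script expects
# pages whose main content sits in <div id="body">, and needs lxml and
# html2text installed, e.g. `pip install lxml html2text`):
#
#   python crawl.py https://example.com/tutorial/index.html
#
# This writes index.ipynb, then follows each rewritten *.html link and emits
# one notebook per discovered page.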