Advertisement
pineapplemachine

wiki.libsdl.org offline copy script & instructions

Jun 22nd, 2020
1,556
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.52 KB | None
#!/usr/local/bin/python

"""
Use this script on the output of an httrack download of
wiki.libsdl.org to clean up links and such.

Link to httrack: https://www.httrack.com/

This script expects to reside in a directory created by
httrack. "./wiki.libsdl.org/" should be a valid directory
path to a directory containing lots of SDL2 HTML files.

When run, the script writes a "munged" version of the
downloaded website to "./wiki.libsdl.org.munged/", with
links fixed to refer to local resources instead of remote
resources and with some additional resources downloaded
which httrack doesn't itself retrieve.

The script may create other additional directories within
the same place where it is run, besides the aforementioned
"./wiki.libsdl.org.munged/", to contain other resources,
for example "./ajax.googleapis.com/" to contain a JQuery
script dependency.

If creating a fresh new download of the wiki:

I recommend running httrack with this rate-limiting command,
since as of writing (2020-06-22) the wiki will boot httrack
out if it tries to download pages too rapidly:

> httrack wiki.libsdl.org -c1 -%c0.1
"""
  35.  
  36. import os
  37. import pathlib
  38. import re
  39. import requests
  40. import shutil
  41.  
  42. re_sdl_version = re.compile(
  43.     r'/SDL_VERSION(["?#])'
  44. )
  45. re_wiki_href = re.compile(
  46.     r'<(link|a)([^>]*?) href="http://wiki.libsdl.org\/([^"?]*?)(["?#])'
  47. )
  48. re_wiki_href_ext = re.compile(
  49.     r'<(link|a)([^>]*?) href="http://wiki.libsdl.org\/([^"?]*?)\.([a-z]+)(["?#])'
  50. )
  51. re_wiki_src = re.compile(
  52.     r'<(script|img)([^>]*?) src="http://wiki.libsdl.org\/(.*?)"'
  53. )
  54. re_google_libs = re.compile(
  55.     r'<script([^>]*?) src="(https://)(ajax.googleapis.com)/(.*?)"'
  56. )
  57.  
  58. def munge_all_files():
  59.     # Munge all the files
  60.     download_links = set()
  61.     for root, dirs, files in os.walk("./wiki.libsdl.org"):
  62.         for file_name in files:
  63.             # Handle SDL_VERSION/SDL_version name collision issue
  64.             dest_name = file_name.replace(
  65.                 "SDL_VERSION", "SDL_VERSION_MACRO"
  66.             ).replace(
  67.                 "SDL_version-2", "SDL_version"
  68.             )
  69.             # Notify unresolved name collisions
  70.             if os.path.splitext(dest_name)[0][-2] == "-":
  71.                 raise Exception("Apparent file name collision: %s" % file_name)
  72.             # Determine destination for munged file
  73.             file_path = os.path.join(root, file_name)
  74.             dest_path = os.path.join(root, dest_name).replace(
  75.                 "wiki.libsdl.org", "wiki.libsdl.org.munged", 1
  76.             )
  77.             dest_dir_path = os.path.dirname(dest_path)
  78.             pathlib.Path(dest_dir_path).mkdir(parents=True, exist_ok=True)
  79.             # Munge HTML files and copy others
  80.             if os.path.splitext(file_name)[1] == ".html":
  81.                 print("Munging:", file_path)
  82.                 munge_file(file_path, dest_path, download_links)
  83.             else:
  84.                 print("Copying:", file_name)
  85.                 shutil.copyfile(file_path, dest_path)
  86.     # Download JS libraries hosted via google
  87.     for link in download_links:
  88.         print("Downloading additional file:", link)
  89.         path = "./" + link[3 + link.index("://"):]
  90.         dir_path = os.path.dirname(path)
  91.         pathlib.Path(dir_path).mkdir(parents=True, exist_ok=True)
  92.         response = requests.get(link)
  93.         with open(path, "wb") as download_file:
  94.             download_file.write(response.content)
  95.  
  96. def munge_file(file_path, dest_path, download_links):
  97.     file_depth = file_path.count("/") - 2
  98.     if file_depth <= 0:
  99.         rel_prefix = "./"
  100.     else:
  101.         rel_prefix = "../" * file_depth
  102.     with open(file_path, "r") as html_file:
  103.         content = html_file.read()
  104.         google_lib_links = re_google_libs.findall(content)
  105.         for match in google_lib_links:
  106.             download_links.add(match[1] + match[2] + "/" + match[3])
  107.         content = re_google_libs.sub(
  108.             '<script\\1 src="%s../\\3/\\4"' % rel_prefix, content
  109.         )
  110.         content = re_sdl_version.sub(
  111.             '/SDL_VERSION_MACRO\\1', content
  112.         )
  113.         content = re_wiki_href_ext.sub(
  114.             '<\\1\\2 href="%s\\3.\\4\\5' % rel_prefix, content
  115.         )
  116.         content = re_wiki_href.sub(
  117.             '<\\1\\2 href="%s\\3.html\\4' % rel_prefix, content
  118.         )
  119.         content = re_wiki_src.sub(
  120.             '<\\1\\2 src="%s\\3"' % rel_prefix, content
  121.         )
  122.     with open(dest_path, "w") as new_file:
  123.         new_file.write(content)
  124.    
  125. def __main__():
  126.     munge_all_files()
  127.  
  128. if __name__ == "__main__":
  129.     __main__()
Advertisement
RAW Paste Data Copied
Advertisement