Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/local/bin/python
- """
- Use this script on the output of an httrack download of
- wiki.libsdl.org to clean up links and such.
- Link to httrack: https://www.httrack.com/
- This script expects to reside in a directory created by
- httrack. "./wiki.libsdl.org/" should be a valid directory
- path to a directory containing lots of SDL2 HTML files.
- When run, the script writes a "munged" version of the
- downloaded website to "./wiki.libsdl.org.munged/", with
- links fixed to refer to local resources instead of remote
- resources and with some additional resources downloaded
- which httrack doesn't itself retrieve.
- The script may create other additional directories within
- the same place where it is run, besides the aforementioned
- "./wiki.libsdl.org.munged/", to contain other resources,
- for example "./ajax.googleapis.com/" to contain a JQuery
- script dependency.
- If creating a fresh new download of the wiki:
- I recommend running httrack with this rate-limiting command,
- since as of writing (2020-06-22) the wiki will boot httrack
- out if it tries to download pages too rapidly:
- > httrack wiki.libsdl.org -c1 -%c0.1
- """
- import os
- import pathlib
- import re
- import requests
- import shutil
# Patterns used to rewrite absolute wiki URLs into local, relative links.
# Dots in host names are escaped so they match only a literal ".", not
# any character.

# "/SDL_VERSION" immediately followed by the character that ends a page
# name (closing quote, query string or fragment).
# Group 1: the terminating character.
re_sdl_version = re.compile(
    r'/SDL_VERSION(["?#])'
)

# <link>/<a> tag whose href points at a page on the wiki.
# Groups: 1 tag name, 2 attributes before href, 3 page path,
# 4 terminating character.
re_wiki_href = re.compile(
    r'<(link|a)([^>]*?) href="http://wiki\.libsdl\.org/([^"?]*?)(["?#])'
)

# Same as re_wiki_href, but only for paths that end in a lowercase file
# extension.
# Groups: 1 tag name, 2 attributes before href, 3 path without the
# extension, 4 extension, 5 terminating character.
re_wiki_href_ext = re.compile(
    r'<(link|a)([^>]*?) href="http://wiki\.libsdl\.org/'
    r'([^"?]*?)\.([a-z]+)(["?#])'
)

# <script>/<img> tag whose src points at the wiki.
# Groups: 1 tag name, 2 attributes before src, 3 resource path.
re_wiki_src = re.compile(
    r'<(script|img)([^>]*?) src="http://wiki\.libsdl\.org/(.*?)"'
)

# <script> tag whose src points at Google's hosted libraries.
# Groups: 1 attributes before src, 2 scheme, 3 host, 4 resource path.
re_google_libs = re.compile(
    r'<script([^>]*?) src="(https://)(ajax\.googleapis\.com)/(.*?)"'
)
def munge_all_files():
    """Build "./wiki.libsdl.org.munged/" from "./wiki.libsdl.org/".

    Walks the downloaded tree, writing every file to the matching place
    in the munged tree: ".html" files go through munge_file(), all other
    files are copied unchanged. Afterwards every remote script URL that
    munge_file() collected is downloaded to "./<host>/<path>".

    Raises:
        Exception: if a destination file name still has "-" as the
            second-to-last character of its stem after the known
            renames, which is treated as an unresolved name collision.
        requests.HTTPError: if downloading an additional file returns an
            HTTP error status.
    """
    # Remote script URLs discovered while munging; filled by munge_file().
    download_links = set()
    for root, _dirs, files in os.walk("./wiki.libsdl.org"):
        for file_name in files:
            # Handle SDL_VERSION/SDL_version name collision issue
            dest_name = file_name.replace(
                "SDL_VERSION", "SDL_VERSION_MACRO"
            ).replace(
                "SDL_version-2", "SDL_version"
            )
            # Notify unresolved name collisions. Slicing (rather than
            # indexing [-2]) keeps one-character stems such as "a.css"
            # from raising IndexError.
            stem = os.path.splitext(dest_name)[0]
            if stem[-2:-1] == "-":
                raise Exception(
                    "Apparent file name collision: %s" % file_name
                )
            # Determine destination for munged file
            file_path = os.path.join(root, file_name)
            dest_path = os.path.join(root, dest_name).replace(
                "wiki.libsdl.org", "wiki.libsdl.org.munged", 1
            )
            dest_dir_path = os.path.dirname(dest_path)
            pathlib.Path(dest_dir_path).mkdir(parents=True, exist_ok=True)
            # Munge HTML files and copy others
            if os.path.splitext(file_name)[1] == ".html":
                print("Munging:", file_path)
                munge_file(file_path, dest_path, download_links)
            else:
                print("Copying:", file_name)
                shutil.copyfile(file_path, dest_path)
    # Download JS libraries hosted via google (sorted for a stable order)
    for link in sorted(download_links):
        print("Downloading additional file:", link)
        # Fail loudly instead of hanging forever or saving an HTTP error
        # page as if it were the requested script.
        response = requests.get(link, timeout=30)
        response.raise_for_status()
        # "https://host/a/b.js" -> "./host/a/b.js"
        path = "./" + link[3 + link.index("://"):]
        dir_path = os.path.dirname(path)
        pathlib.Path(dir_path).mkdir(parents=True, exist_ok=True)
        with open(path, "wb") as download_file:
            download_file.write(response.content)
def munge_file(file_path, dest_path, download_links):
    """Rewrite one downloaded HTML page so its links refer to local files.

    Args:
        file_path: Path of the source HTML file; expected to start with
            "./wiki.libsdl.org/", as produced by munge_all_files().
        dest_path: Path the rewritten HTML is written to.
        download_links: Set that receives the URL of every
            ajax.googleapis.com script referenced by the page.
    """
    # Depth below the mirror root: "./wiki.libsdl.org/x.html" contains two
    # separators and has depth 0. Separators are normalised first because
    # os.path.join() uses os.sep, which is not "/" on every platform.
    file_depth = file_path.replace(os.sep, "/").count("/") - 2
    rel_prefix = "../" * file_depth if file_depth > 0 else "./"

    # Decode explicitly instead of relying on the platform default
    # encoding. Assumes the pages are UTF-8 (TODO confirm);
    # "surrogateescape" makes any other bytes round-trip unchanged
    # rather than raising.
    with open(
        file_path, "r", encoding="utf-8", errors="surrogateescape"
    ) as html_file:
        content = html_file.read()

    # Remember each Google-hosted script so the caller can download it.
    for match in re_google_libs.findall(content):
        download_links.add(match[1] + match[2] + "/" + match[3])

    # The downloaded copy lives in "./<host>/", a sibling of the munged
    # tree, hence the extra "../".
    content = re_google_libs.sub(
        '<script\\1 src="%s../\\3/\\4"' % rel_prefix, content
    )
    content = re_sdl_version.sub(
        '/SDL_VERSION_MACRO\\1', content
    )
    # Links that already carry a file extension keep it ...
    content = re_wiki_href_ext.sub(
        '<\\1\\2 href="%s\\3.\\4\\5' % rel_prefix, content
    )

    # ... and the remaining wiki links are page names that need ".html".
    def local_page_href(match):
        tag, attrs, page, delimiter = match.groups()
        # A link to the bare site root has an empty page name; without
        # this it would be rewritten to the broken "./.html".
        # NOTE(review): assumes httrack saved the root page as
        # "index.html" - confirm against the download.
        return '<%s%s href="%s%s.html%s' % (
            tag, attrs, rel_prefix, page or "index", delimiter
        )

    content = re_wiki_href.sub(local_page_href, content)
    content = re_wiki_src.sub(
        '<\\1\\2 src="%s\\3"' % rel_prefix, content
    )

    with open(
        dest_path, "w", encoding="utf-8", errors="surrogateescape"
    ) as new_file:
        new_file.write(content)
def __main__():
    """Script entry point: munge the httrack download next to this script."""
    munge_all_files()


# Only run when executed as a script, not when imported as a module.
if __name__ == "__main__":
    __main__()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement