wiki.libsdl.org offline copy script & instructions

#!/usr/local/bin/python

"""

Use this script on the output of an httrack download of
wiki.libsdl.org to clean up links and such.

Link to httrack: https://www.httrack.com/

This script expects to reside in a directory created by
httrack. "./wiki.libsdl.org/" should be a valid directory
path to a directory containing lots of SDL2 HTML files.

When run, the script writes a "munged" version of the
downloaded website to "./wiki.libsdl.org.munged/", with
links fixed to refer to local resources instead of remote
resources and with some additional resources downloaded
which httrack doesn't itself retrieve.

The script may create other additional directories within
the same place where it is run, besides the aforementioned
"./wiki.libsdl.org.munged/", to contain other resources,
for example "./ajax.googleapis.com/" to contain a jQuery
script dependency.

If creating a fresh new download of the wiki:

I recommend running httrack with this rate-limiting command,
since as of writing (2020-06-22) the wiki will boot httrack
out if it tries to download pages too rapidly:

> httrack wiki.libsdl.org -c1 -%c0.1

"""

import os
import pathlib
import re
import requests
import shutil

# Patterns used to rewrite links inside the downloaded wiki pages.
# Dots in host names are escaped so that they match a literal "." only;
# an unescaped "." would also accept look-alike hosts.

# A link path ending in exactly "/SDL_VERSION".
# Groups: 1 = terminating character (quote, "?" or "#").
re_sdl_version = re.compile(
    r'/SDL_VERSION(["?#])'
)
# An absolute href to a wiki page on a <link> or <a> tag.
# Groups: 1 = tag name, 2 = attributes before href, 3 = path,
# 4 = terminating character.
re_wiki_href = re.compile(
    r'<(link|a)([^>]*?) href="http://wiki\.libsdl\.org/([^"?]*?)(["?#])'
)
# Same as above, but only for paths that end in a lowercase extension.
# Groups: 1 = tag name, 2 = attributes before href, 3 = path without
# extension, 4 = extension, 5 = terminating character.
re_wiki_href_ext = re.compile(
    r'<(link|a)([^>]*?) href="http://wiki\.libsdl\.org/'
    r'([^"?]*?)\.([a-z]+)(["?#])'
)
# An absolute src pointing at the wiki on a <script> or <img> tag.
# Groups: 1 = tag name, 2 = attributes before src, 3 = path.
re_wiki_src = re.compile(
    r'<(script|img)([^>]*?) src="http://wiki\.libsdl\.org/(.*?)"'
)
# A <script> loaded from Google's hosted libraries.
# Groups: 1 = attributes before src, 2 = scheme, 3 = host, 4 = path.
re_google_libs = re.compile(
    r'<script([^>]*?) src="(https://)(ajax\.googleapis\.com)/(.*?)"'
)

def munge_all_files():
    """Build "./wiki.libsdl.org.munged/" from "./wiki.libsdl.org/".

    Every file under "./wiki.libsdl.org" is mirrored into the munged
    tree: ".html" files are passed through munge_file(), everything else
    is copied unchanged. Script URLs collected by munge_file() are then
    downloaded into a directory named after their host, relative to the
    current working directory.

    Raises:
        Exception: if a destination file name has "-" as the
            second-to-last character of its stem, which is treated as
            an unresolved file name collision.
    """
    download_links = set()
    for root, _dirs, files in os.walk("./wiki.libsdl.org"):
        for file_name in files:
            # Handle the SDL_VERSION/SDL_version name collision. Only the
            # page named exactly "SDL_VERSION" is renamed, matching the
            # link rewrite in munge_file(); a plain substring replace
            # would also rename pages such as SDL_VERSIONNUM and leave
            # the links to them dangling.
            stem, ext = os.path.splitext(file_name)
            if stem == "SDL_VERSION":
                stem = "SDL_VERSION_MACRO"
            dest_name = (stem + ext).replace("SDL_version-2", "SDL_version")
            # Notify unresolved name collisions. Slicing (rather than
            # indexing) keeps one-character stems from raising IndexError.
            dest_stem = os.path.splitext(dest_name)[0]
            if dest_stem[-2:-1] == "-":
                raise Exception("Apparent file name collision: %s" % file_name)
            # Determine destination for munged file; only the first
            # occurrence (the top-level directory) is renamed.
            file_path = os.path.join(root, file_name)
            dest_path = os.path.join(root, dest_name).replace(
                "wiki.libsdl.org", "wiki.libsdl.org.munged", 1
            )
            dest_dir_path = os.path.dirname(dest_path)
            pathlib.Path(dest_dir_path).mkdir(parents=True, exist_ok=True)
            # Munge HTML files and copy others
            if os.path.splitext(file_name)[1] == ".html":
                print("Munging:", file_path)
                munge_file(file_path, dest_path, download_links)
            else:
                print("Copying:", file_name)
                shutil.copyfile(file_path, dest_path)
    # Download JS libraries hosted via google (sorted for a stable order)
    for link in sorted(download_links):
        print("Downloading additional file:", link)
        # A timeout keeps an unresponsive server from hanging the script.
        response = requests.get(link, timeout=30)
        if not response.ok:
            # Do not save an HTTP error page as if it were the script.
            print(
                "Download failed with HTTP status %d:" % response.status_code,
                link,
            )
            continue
        # Strip the "scheme://" prefix to get a path relative to "./".
        path = "./" + link[3 + link.index("://"):]
        dir_path = os.path.dirname(path)
        pathlib.Path(dir_path).mkdir(parents=True, exist_ok=True)
        with open(path, "wb") as download_file:
            download_file.write(response.content)

def munge_file(file_path, dest_path, download_links):
    """Rewrite the links of one HTML page and write the result.

    Args:
        file_path: Path of the source HTML file. The relative-link
            prefix is derived from it on the assumption that it looks
            like "./wiki.libsdl.org/<sub dirs>/<name>".
        dest_path: Path the rewritten page is written to.
        download_links: Set that is extended in place with the URL of
            every Google-hosted script referenced by the page.
    """
    # Number of directory levels between the file and the wiki root.
    # Separators are normalized first because os.path.join produces "\\"
    # on Windows, which a plain count of "/" would miss.
    file_depth = file_path.replace(os.sep, "/").count("/") - 2
    rel_prefix = "../" * file_depth if file_depth > 0 else "./"
    # An explicit encoding avoids depending on the locale default;
    # surrogateescape lets bytes that are not valid UTF-8 pass through
    # unchanged instead of raising.
    with open(
        file_path, "r", encoding="utf-8", errors="surrogateescape"
    ) as html_file:
        content = html_file.read()
    # Remember the original script URLs before they are rewritten.
    for match in re_google_libs.findall(content):
        download_links.add(match[1] + match[2] + "/" + match[3])
    # Google-hosted scripts live one level above the wiki root.
    content = re_google_libs.sub(
        '<script\\1 src="%s../\\3/\\4"' % rel_prefix, content
    )
    content = re_sdl_version.sub(
        '/SDL_VERSION_MACRO\\1', content
    )
    # Links that already carry an extension are handled first so the next
    # substitution only sees extension-less page links, which get ".html".
    content = re_wiki_href_ext.sub(
        '<\\1\\2 href="%s\\3.\\4\\5' % rel_prefix, content
    )
    content = re_wiki_href.sub(
        '<\\1\\2 href="%s\\3.html\\4' % rel_prefix, content
    )
    content = re_wiki_src.sub(
        '<\\1\\2 src="%s\\3"' % rel_prefix, content
    )
    with open(
        dest_path, "w", encoding="utf-8", errors="surrogateescape"
    ) as new_file:
        new_file.write(content)

def __main__():
    """Script entry point: munge the whole downloaded wiki."""
    munge_all_files()


# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    __main__()