DazzlingThreadArchiver.py

  1. """
  2. How to use:
  3. 1: Don't.
  4. 2: If you really want to though - just run it. There's not really anything required on your behalf.
  5. How it works is - it scrapes the pastebin file I maintain that's full of desu archive links(https://pastebin.com/raw/J9Q3r785) and grabs all the *numbered* Dazzling threads.
  6.  
  7. It ignores the (at the time of writing this, four) miscellanious/unnumbered ones. Those you'll have to just scrape manually by calling Monolith and then fixing the quote links(just make them relative, i.e [https://desuarchive.org/mlp/thread/35953696/#35953777] -> #35953777]);
  8.  
  9. After it scrapes all the links from the paste, it invokes a nifty utility called Monolith with each thread link two at a time by way of asyncio(you can change how many instances of Monolith run at a time by changing the N variable, but I recommend leaving it at 2 because CloudFlare will smite you... also it ain't nice to spam Desu with requests.)
  10.  
  11. After monolith downloads a thread, the script then opens up the .html file and fixes the quote links in the same way mentioned above.
  12. """
# Requires beautifulsoup4 and requests as dependencies.
# Also requires github.com/Y2Z/monolith to be installed.
# NOTE: This is *extremely* hacky. There's barely any error handling, and it was made on an
#    'if it works it works' thought-process.
import os
import pathlib
import asyncio

from bs4 import BeautifulSoup
from requests import get


class DesuThread:
    def __init__(self, thread_number, thread_link):
        self.thread_number = thread_number
        self.thread_link = thread_link

    def __repr__(self):
        return f'Dazzling Thread #{self.thread_number} ~ {self.thread_link}'


async def download(archive_dir, thread):
    thread_file_path = f'{archive_dir}/{thread.thread_number}.html'
    if not os.path.exists(thread_file_path):
        print(f"Downloading thread number {thread.thread_number}...")

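        # monolith flags: -I isolates the saved page from the network,
        # -j strips JavaScript; -o is the output file.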
        proc = await asyncio.create_subprocess_exec(
            'monolith', '-I', '-j', thread.thread_link, '-o', thread_file_path)
        await proc.wait()

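        # Rewrite the thread's absolute quote links to relative ones,
        # i.e. '<thread_link>/#12345' -> '#12345', as described in the docstring.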
        thread_file = pathlib.Path(thread_file_path)
        thread_file.write_text(thread_file.read_text().replace(f'{thread.thread_link}/#', '#'))
        print(f"Thread number {thread.thread_number} downloaded.")
    else:
        print(f"Thread number {thread.thread_number} already exists in the archive directory, skipping...")


async def main():
    threads = []
    tasks = asyncio.Queue()
    N = 2  # How many workers

    archive_link = "https://pastebin.com/raw/J9Q3r785"
    archive_request = get(archive_link)

    if archive_request.status_code != 200:
        print(f"Error: the archive paste {archive_link} returned HTTP {archive_request.status_code} for some reason.")
        print("Whatever the case, script can't continue.")
        return

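    # The raw paste is plain text; round-tripping it through BeautifulSoup's
    # html.parser and prettify() is just a lazy way to get clean lines out of it.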
    archive_parsed = BeautifulSoup(archive_request.content, features="html.parser")
    archive_lines = archive_parsed.prettify().split('\n')
    print("Archive list has been parsed. Harvesting desu links...")

    for line in archive_lines:
        if 'desuarchive.org' in line:
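            # Numbered entries look roughly like
            # "12. https://desuarchive.org/mlp/thread/...", so splitting on the
            # first '.' gives (thread number, link).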
            if '. ' in line:
                line_split = line.replace(' ', '').replace('\r', '').split('.', 1)
                thread_number = line_split[0]
                thread_link = line_split[1]
                thread_obj = DesuThread(thread_number, thread_link)
                threads.append(thread_obj)
            # This ignores misc/unnumbered threads, but there's only four of them so...
            # I will just download them all manually myself and put them in the archive.
    print(f"Operation complete: {len(threads)} total threads.")
    print("Downloading...")

    current_dir = os.getcwd()
    archive_dir = f'{current_dir}/DZG Thread Archive'
    if not os.path.exists(archive_dir):
        os.mkdir(archive_dir)

    # Loop over the thread list, and create a task for each thread.
    # Then, run N tasks at a time asynchronously.
    for thread in threads:
        tasks.put_nowait(download(archive_dir, thread))

    async def worker():
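        # empty() + get_nowait() without a try/except is fine here: there's no
        # await between the check and the get, so workers can't race each other.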
        while not tasks.empty():
            await tasks.get_nowait()

    await asyncio.gather(*[worker() for _ in range(N)])
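    # (An asyncio.Semaphore around download() would work just as well; the
    # queue-plus-N-workers approach just keeps it dead simple.)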


if __name__ == '__main__':
    asyncio.run(main())