Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- """
- How to use:
- 1: Don't.
- 2: If you really want to though - just run it. There's not really anything required on your behalf.
- How it works: it scrapes the pastebin file I maintain that's full of desu archive links (https://pastebin.com/raw/J9Q3r785) and grabs all the *numbered* Dazzling threads.
- It ignores the (at the time of writing this, four) miscellaneous/unnumbered ones. Those you'll have to scrape manually by calling Monolith and then fixing the quote links (just make them relative, i.e. [https://desuarchive.org/mlp/thread/35953696/#35953777] -> [#35953777]).
- After it scrapes all the links from the paste, it invokes a nifty utility called Monolith on each thread link, two at a time, by way of asyncio. (You can change how many instances of Monolith run at a time by changing the N variable, but I recommend leaving it at 2 because CloudFlare will smite you... also it ain't nice to spam Desu with requests.)
- After monolith downloads a thread, the script then opens up the .html file and fixes the quote links in the same way mentioned above.
- """
- # Requires beautifulsoup4, requests as dependencies
- # Also requires github.com/Y2Z/monolith to be installed.
- # NOTE: This is *extremely* hacky. There's barely any error handling, and it was made on a
- # 'if it works it works' thought-process.
- import os
- import pathlib
- import asyncio
- import subprocess
- from bs4 import BeautifulSoup
- from requests import get
class DesuThread:
    """A numbered thread entry: its number and its archive link."""

    def __init__(self, thread_number, thread_link):
        # Both values are stored exactly as given; no validation is done here.
        self.thread_link = thread_link
        self.thread_number = thread_number

    def __repr__(self):
        # The summary is itself passed through repr(), so the result carries
        # the surrounding quote characters of a string literal.
        summary = f'Dazzling Thread #{self.thread_number} ~ {self.thread_link}'
        return f'{summary!r}'
async def download(archive_dir, thread):
    """Save one thread with monolith, then make its quote links relative.

    Args:
        archive_dir: Directory in which ``<thread_number>.html`` is stored.
        thread: Object exposing ``thread_number`` and ``thread_link``.

    Returns:
        None. Progress and failures are reported with ``print``.

    The thread is skipped when its .html file already exists. If monolith
    exits with a non-zero status or leaves no output file, the failure is
    reported and the link fix-up is skipped instead of crashing on a missing
    file. Any OSError raised while launching monolith (for example when it is
    not installed) propagates to the caller.
    """
    thread_file = pathlib.Path(f"{archive_dir}/{thread.thread_number}.html")
    if thread_file.exists():
        print(f"Thread number {thread.thread_number} already exists in the archive directory, skipping...")
        return

    print(f"Downloading thread number {thread.thread_number}...")
    proc = await asyncio.create_subprocess_exec(
        'monolith', '-I', '-j', thread.thread_link, '-o', str(thread_file)
    )
    return_code = await proc.wait()
    if return_code != 0 or not thread_file.is_file():
        print(f"Error: monolith failed on thread number {thread.thread_number} (exit code {return_code}), skipping...")
        return

    # Turn absolute quote links ("<thread link>/#<post>") into relative
    # anchors ("#<post>"). The trailing slash is normalised first so a link
    # written with or without it in the paste is handled the same way.
    absolute_prefix = f"{thread.thread_link.rstrip('/')}/#".encode('utf-8')
    # Work on raw bytes: the saved page is rewritten without depending on the
    # locale's default text encoding or on newline translation.
    thread_file.write_bytes(thread_file.read_bytes().replace(absolute_prefix, b'#'))
    print(f"Thread number {thread.thread_number} downloaded.")
async def main():
    """Harvest the numbered thread links from the archive paste and download them.

    Steps:
      1. Fetch the paste at ``archive_link``; stop with a printed error if the
         request fails or returns a non-success status.
      2. Keep every line that mentions ``desuarchive.org`` and contains
         ``". "``; the text before the first ``.`` becomes the thread number
         and the rest the thread link (spaces and ``\\r`` removed).
      3. Create the ``DZG Thread Archive`` directory under the current
         working directory if needed.
      4. Run ``N`` workers concurrently, each pulling threads from a shared
         queue and awaiting ``download`` for them.
    """
    # Imported locally: only ``get`` is imported from requests at file level.
    from requests import RequestException

    N = 2  # How many workers
    archive_link = "https://pastebin.com/raw/J9Q3r785"

    try:
        # A timeout keeps the script from hanging forever on a dead connection.
        archive_request = get(archive_link, timeout=30)
    except RequestException as error:
        print(f"Error: Could not fetch the archive paste {archive_link}: {error}")
        print("Whatever the case, script can't continue.")
        return
    if archive_request.status_code == 404:
        print(f"Error: The link to the archive paste {archive_link} 404'd for some reason.")
        print("Whatever the case, script can't continue.")
        return
    if not archive_request.ok:
        # Any other 4xx/5xx body is an error page, not the list of links.
        print(f"Error: The archive paste {archive_link} returned HTTP {archive_request.status_code}.")
        print("Whatever the case, script can't continue.")
        return

    archive_parsed = BeautifulSoup(archive_request.content, features="html.parser")
    archive_lines = archive_parsed.prettify().split('\n')
    print("Archive list has been parsed. Harvesting desu links...")

    threads = []
    for line in archive_lines:
        # Lines without ". " are the misc/unnumbered threads and are ignored;
        # those are meant to be downloaded manually.
        if 'desuarchive.org' not in line or '. ' not in line:
            continue
        compact = line.replace(' ', '').replace('\r', '')
        thread_number, _, thread_link = compact.partition('.')
        threads.append(DesuThread(thread_number, thread_link))

    print(f"Operation complete: {len(threads)} total threads.")
    print("Downloading...")
    archive_dir = f'{os.getcwd()}/DZG Thread Archive'
    os.makedirs(archive_dir, exist_ok=True)

    # Queue the threads themselves rather than pre-built coroutine objects, so
    # nothing is left un-awaited if a worker stops early.
    pending = asyncio.Queue()
    for thread in threads:
        pending.put_nowait(thread)

    async def worker():
        # No await sits between the emptiness check and get_nowait(), so
        # another worker cannot drain the queue in between.
        while not pending.empty():
            await download(archive_dir, pending.get_nowait())

    # Run N workers at a time asynchronously.
    await asyncio.gather(*[worker() for _ in range(N)])
# Start the downloader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    asyncio.run(main())
Add Comment
Please, Sign In to add comment