DazzlingThreadArchiver.py

  1. """
  2. How to use:
  3. 1: Don't.
  4. 2: If you really want to though - just run it. There's not really anything required on your behalf.
  5. How it works is - it scrapes the pastebin file I maintain that's full of desu archive links(https://pastebin.com/raw/J9Q3r785) and grabs all the *numbered* Dazzling threads.
  6.  
  7. It ignores the (at the time of writing this, four) miscellanious/unnumbered ones. Those you'll have to just scrape manually by calling Monolith and then fixing the quote links(just make them relative, i.e [https://desuarchive.org/mlp/thread/35953696/#35953777] -> #35953777]);
  8.  
  9. After it scrapes all the links from the paste, it invokes a nifty utility called Monolith with each thread link two at a time by way of asyncio(you can change how many instances of Monolith run at a time by changing the N variable, but I recommend leaving it at 2 because CloudFlare will smite you... also it ain't nice to spam Desu with requests.)
  10.  
  11. After monolith downloads a thread, the script then opens up the .html file and fixes the quote links in the same way mentioned above.
  12. """
# Requires beautifulsoup4 and requests as dependencies.
# Also requires github.com/Y2Z/monolith to be installed.
# NOTE: This is *extremely* hacky. There's barely any error handling, and it was made on an
#    'if it works it works' thought-process.
import os
import pathlib
import asyncio

from bs4 import BeautifulSoup
from requests import get


class DesuThread:
    def __init__(self, thread_number, thread_link):
        self.thread_number = thread_number
        self.thread_link = thread_link

    def __repr__(self):
        return f'Dazzling Thread #{self.thread_number} ~ {self.thread_link}'


async def download(archive_dir, thread):
    thread_file_path = f'{archive_dir}/{thread.thread_number}.html'
    if not os.path.exists(thread_file_path):
        print(f"Downloading thread number {thread.thread_number}...")

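        # monolith flags: -I isolates the saved page from the network,
        # -j strips JavaScript; -o is the output file.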
        proc = await asyncio.create_subprocess_exec(
            'monolith', '-I', '-j', thread.thread_link, '-o', thread_file_path)
        await proc.wait()

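        # Rewrite the thread's absolute quote links to relative ones,
        # i.e. '<thread_link>/#12345' -> '#12345', as described in the docstring.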
        thread_file = pathlib.Path(thread_file_path)
        thread_file.write_text(thread_file.read_text().replace(f'{thread.thread_link}/#', '#'))
        print(f"Thread number {thread.thread_number} downloaded.")
    else:
        print(f"Thread number {thread.thread_number} already exists in the archive directory, skipping...")


async def main():
    threads = []
    tasks = asyncio.Queue()
    N = 2  # How many workers

    archive_link = "https://pastebin.com/raw/J9Q3r785"
    archive_request = get(archive_link)

    if archive_request.status_code != 200:
        print(f"Error: the archive paste {archive_link} returned HTTP {archive_request.status_code} for some reason.")
        print("Whatever the case, script can't continue.")
        return

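    # The raw paste is plain text; round-tripping it through BeautifulSoup's
    # html.parser and prettify() is just a lazy way to get clean lines out of it.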
    archive_parsed = BeautifulSoup(archive_request.content, features="html.parser")
    archive_lines = archive_parsed.prettify().split('\n')
    print("Archive list has been parsed. Harvesting desu links...")

    for line in archive_lines:
        if 'desuarchive.org' in line:
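            # Numbered entries look roughly like
            # "12. https://desuarchive.org/mlp/thread/...", so splitting on the
            # first '.' gives (thread number, link).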
            if '. ' in line:
                line_split = line.replace(' ', '').replace('\r', '').split('.', 1)
                thread_number = line_split[0]
                thread_link = line_split[1]
                thread_obj = DesuThread(thread_number, thread_link)
                threads.append(thread_obj)
            # This ignores misc/unnumbered threads, but there's only four of them so...
            # I will just download them all manually myself and put them in the archive.
    print(f"Operation complete: {len(threads)} total threads.")
    print("Downloading...")

    current_dir = os.getcwd()
    archive_dir = f'{current_dir}/DZG Thread Archive'
    if not os.path.exists(archive_dir):
        os.mkdir(archive_dir)

    # Loop over the thread list, and create a task for each thread.
    # Then, run N tasks at a time asynchronously.
    for thread in threads:
        tasks.put_nowait(download(archive_dir, thread))

    async def worker():
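        # empty() + get_nowait() without a try/except is fine here: there's no
        # await between the check and the get, so workers can't race each other.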
        while not tasks.empty():
            await tasks.get_nowait()

    await asyncio.gather(*[worker() for _ in range(N)])
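    # (An asyncio.Semaphore around download() would work just as well; the
    # queue-plus-N-workers approach just keeps it dead simple.)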


if __name__ == '__main__':
    asyncio.run(main())