Advertisement
Guest User

Untitled

a guest
Oct 3rd, 2023
160
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.68 KB | None | 0 0
  1. #!/usr/bin/env python3
  2.  
  3. import argparse
  4. import os
  5. import sys
  6. import re
  7. import urllib.request
  8. try:
  9.     import lxml.html
  10. except ModuleNotFoundError as e:
  11.     _ = e
  12.     print("The lxml module is not installed.\n",
  13.           "To install it, plase enter this command:\n",
  14.           "python -m pip install lxml")
  15.     sys.exit(1)
  16.  
  17. regex = r"^https:\/\/boards.4chan(nel)?.org\/[a-z]+\/thread\/[0-9]+$"
  18.  
  19.  
  20. def apply_parser():
  21.     def URL4chan(arg):
  22.         if not re.compile(regex).match(arg):
  23.             raise argparse.ArgumentTypeError(
  24.                 f"URL has to match '{regex}'")
  25.         return arg
  26.  
  27.     parser = argparse.ArgumentParser(
  28.         description="Downloads all images from a 4chan thread"
  29.     )
  30.     parser.add_argument("URL", help="4chan thread URL", type=URL4chan)
  31.     parser.add_argument("-o", "--output", default=os.getcwd(),
  32.                         help="Destination folder", type=str, nargs=1)
  33.     parser.add_argument("--no-output-folder", action="store_true",
  34.                         help="If set, saves images directly on output foler")
  35.     parser.add_argument("-f", "--no-interactive", action="store_true",
  36.                         help="If set, run the script w/o prompt")
  37.     return parser.parse_args()
  38.  
  39.  
  40. def run(args):
  41.     request = urllib.request.Request(args.URL,
  42.                                      headers={'User-Agent': 'Mozilla/5.0'})
  43.     thread_board = args.URL.split('/')[-3]
  44.     thread_number = args.URL.split('/')[-1]
  45.     dest_folder = args.output \
  46.         if args.no_output_folder \
  47.         else os.path.join(
  48.             args.output, f"4ch-{thread_board}-{thread_number}"
  49.         )
  50.  
  51.     content = urllib.request.urlopen(request).read()
  52.     img_links = lxml.html.fromstring(content).xpath(
  53.         '//a[@class="fileThumb"]/@href'
  54.     )
  55.     nb_imgs = len(img_links)
  56.  
  57.     if not args.no_interactive:
  58.         print(
  59.             f"The script will download {nb_imgs}"
  60.             + f" images in the folder '{dest_folder}'")
  61.         choice = input("Continue ? [y/N] ")
  62.         if choice != "y":
  63.             sys.exit(0)
  64.  
  65.     if not os.path.exists(dest_folder):
  66.         try:
  67.             os.makedirs(dest_folder)
  68.             print(f"Folder {dest_folder} created")
  69.         except OSError as e:
  70.             print(e)
  71.             sys.exit(1)
  72.  
  73.     counter = 0
  74.     for link in img_links:
  75.         filepath = os.path.join(dest_folder, link.split('/')[-1])
  76.         if not os.path.exists(filepath):
  77.             urllib.request.urlretrieve("http:"+link, filepath)
  78.             counter += 1
  79.         print(f"Images downloaded: {counter}/{nb_imgs}", end='\r')
  80.     print("\nAll images downloaded!")
  81.  
  82.  
  83. if __name__ == "__main__":
  84.     run(apply_parser())
  85.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement