Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python3
- import argparse
- import os
- import sys
- import re
- import urllib.request
# lxml is the script's only third-party dependency; when it is missing,
# stop with installation instructions instead of a raw traceback.
try:
    import lxml.html
except ModuleNotFoundError:
    # sys.exit() with a string writes it to stderr and exits with status 1.
    # The lines are joined explicitly so none of them gets the leading
    # space that print()'s default separator would insert.
    sys.exit(
        "The lxml module is not installed.\n"
        "To install it, please enter this command:\n"
        "python -m pip install lxml"
    )
# Accepted thread URLs look like https://boards.4chan.org/<board>/thread/<id>
# (the boards.4channel.org host is accepted as well). Dots are escaped so
# they only match a literal ".", and board names may contain digits.
regex = r"^https://boards\.4chan(nel)?\.org/[a-z0-9]+/thread/[0-9]+$"


def apply_parser():
    """Parse the command line and return the resulting namespace.

    Returns:
        argparse.Namespace with the attributes ``URL`` (thread URL that
        matched ``regex``), ``output`` (destination folder as a string,
        the current working directory by default), ``no_output_folder``
        and ``no_interactive`` (booleans, False by default).

    argparse terminates the program with a usage message when the
    arguments are missing or invalid.
    """
    def URL4chan(arg):
        """argparse ``type`` callback: accept only URLs matching ``regex``."""
        if not re.match(regex, arg):
            raise argparse.ArgumentTypeError(
                f"URL has to match '{regex}'")
        return arg

    parser = argparse.ArgumentParser(
        description="Downloads all images from a 4chan thread"
    )
    parser.add_argument("URL", help="4chan thread URL", type=URL4chan)
    # No ``nargs=1`` here: it would turn a user-supplied value into a
    # one-element list while the default stays a plain string.
    parser.add_argument("-o", "--output", default=os.getcwd(),
                        help="Destination folder")
    parser.add_argument("--no-output-folder", action="store_true",
                        help="If set, saves images directly on output folder")
    parser.add_argument("-f", "--no-interactive", action="store_true",
                        help="If set, run the script w/o prompt")
    return parser.parse_args()
def run(args):
    """Download every image linked from the thread described by *args*.

    Args:
        args: Namespace with the attributes ``URL``, ``output``,
            ``no_output_folder`` and ``no_interactive``.

    The process exits with status 0 when the user declines the
    confirmation prompt and with status 1 when the destination folder
    cannot be created. Errors raised by ``urllib`` while fetching the
    thread or an image propagate to the caller.
    """
    from urllib.parse import urljoin

    # An argparse option declared with ``nargs=1`` yields a one-element
    # list rather than a string; accept either form so os.path.join
    # always receives a str.
    output = args.output[0] if isinstance(args.output, list) else args.output

    request = urllib.request.Request(args.URL,
                                     headers={'User-Agent': 'Mozilla/5.0'})
    # .../<board>/thread/<number>
    *_, thread_board, _, thread_number = args.URL.split('/')
    if args.no_output_folder:
        dest_folder = output
    else:
        dest_folder = os.path.join(
            output, f"4ch-{thread_board}-{thread_number}"
        )

    # Close the HTTP response as soon as the page has been read.
    with urllib.request.urlopen(request) as response:
        content = response.read()
    img_links = lxml.html.fromstring(content).xpath(
        '//a[@class="fileThumb"]/@href'
    )
    nb_imgs = len(img_links)

    if not args.no_interactive:
        print(
            f"The script will download {nb_imgs}"
            + f" images in the folder '{dest_folder}'")
        choice = input("Continue ? [y/N] ")
        if choice.strip().lower() != "y":
            sys.exit(0)

    # isdir() rather than exists(): when the path is an existing regular
    # file, makedirs() raises and the problem is reported right here.
    if not os.path.isdir(dest_folder):
        try:
            os.makedirs(dest_folder)
        except OSError as e:
            print(e)
            sys.exit(1)
        print(f"Folder {dest_folder} created")

    counter = 0
    for link in img_links:
        filepath = os.path.join(dest_folder, link.split('/')[-1])
        if not os.path.exists(filepath):
            # Resolve the href against the thread URL so scheme-relative
            # ("//host/file") and absolute links both work and the
            # thread's https scheme is reused.
            try:
                urllib.request.urlretrieve(urljoin(args.URL, link), filepath)
            except BaseException:
                # Remove a truncated file so that a later run downloads
                # it again instead of skipping it as already present.
                if os.path.exists(filepath):
                    os.remove(filepath)
                raise
        # Files already on disk count as done, so the progress line
        # reaches N/N on re-runs too.
        counter += 1
        print(f"Images downloaded: {counter}/{nb_imgs}", end='\r')
    print("\nAll images downloaded!")
# Script entry point: parse the command line, then hand the parsed
# arguments to the downloader.
if __name__ == "__main__":
    cli_args = apply_parser()
    run(cli_args)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement