Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # 4chan thread image collector
- #
- #
- #
- # Author:
- # Date: 9/Aug/2013
- # License: GNU General Public License
- #
- # Purpose:
- # Supply a valid 4chan thread URL, and optionally a download directory.
- # Will then find every image in the specified thread and download it.
- #
- # Usage:
- # 4search.py <thread url> [-d download location] [-h{elp}]
from os import chdir, getcwd, mkdir, remove
from os.path import isdir, isfile
from platform import system
from re import finditer, match
from subprocess import call, check_call, check_output
from sys import argv

try:
    from urllib import urlretrieve  # Python 2 location
except ImportError:
    from urllib.request import urlretrieve  # Python 3 location
class imgboard_info(object):
    """Abstract description of a thread on an imageboard.

    Subclasses override ``__init__`` to parse a thread URL and override
    ``download_from_match`` to fetch one image from a regex match.
    """

    def __init__(self, thread_url):
        """Parse *thread_url*; every subclass must override this.

        Raises:
            NotImplementedError: always, when called on the base class.
        """
        # NotImplementedError is still an Exception subclass, so existing
        # ``except Exception`` handlers keep working.
        raise NotImplementedError("imgboard_info.__init__ not overridden")

    def download_from_match(self, image_match):
        """Download the image for *image_match*; subclasses must override.

        Raises:
            NotImplementedError: always, when called on the base class.
        """
        raise NotImplementedError(
            "imgboard_info.download_from_match not overridden")
# Class holding the board name and thread ID number of a 4chan thread
class _4chan_thread_info(imgboard_info):
    """Board name, thread ID and image regex for one 4chan thread.

    Attributes set by the constructor:
        site: the literal string '4chan'.
        thread_pattern: regex used to recognise a 4chan thread URL.
        image_pattern: regex matching one image link in the thread HTML.
        board: board name taken from the URL (group 2 of thread_pattern).
        thread: thread ID taken from the URL (group 3 of thread_pattern).
    """

    def __init__(self, thread_url):
        """Parse *thread_url* as a 4chan thread.

        Raises:
            NameError: carrying the URL, if it is not a 4chan thread URL.
        """
        self.site = '4chan'
        print("Checking URL against 4chan format...")

        # The scheme is optional; https is accepted as well as http so this
        # class agrees with the mlpchan one.  Raw strings keep the regex
        # escapes (\.) away from Python's own string-escape processing.
        self.thread_pattern = (
            r'(https?://)?boards\.4chan\.org/([^/]*)/res/(.*)'
        )
        thread_match = match(self.thread_pattern, thread_url)

        # If the URL is not a 4chan thread, say so and raise
        if thread_match is None:
            print("\tNot a 4chan thread")
            raise NameError(thread_url)

        print("\tURL is a 4chan thread")
        self.image_pattern = (
            r'a class="fileThumb'      # All images start with this
            r'( imgspoiler)?" '        # Catches spoilered images
            # Optional target attribute ahead of href.  The closing quote
            # is optional so the previously accepted (unterminated)
            # spelling still matches.
            r'(target="_blank"? )?'
            r'href="//'                # HTML attribute for link location
            r'(images\.4chan\.org/'    # The start of the image URL
            r'[^/]*'                   # The board
            r'/src/)'                  # All images live in this directory
            r'([^\.]*\.[^"]*)'         # The filename is the final part
        )
        self.board = thread_match.group(2)
        self.thread = thread_match.group(3)

    def download_from_match(self, image_match):
        """Download the image described by one match of image_pattern.

        Group 3 of the match is the host and directory part of the image
        URL and group 4 is the file name.  The image is saved under its
        file name in the current directory; the download is skipped when
        a file of that name already exists.

        Returns:
            The image file name.
        """
        image_name = image_match.group(4)
        image_url = image_match.group(3) + image_name
        if not isfile(image_name):
            urlretrieve('http://' + image_url, image_name)
        return image_name
# Class holding the board name and thread ID number of an mlpchan thread
class mlpchan_thread_info(imgboard_info):
    """Board name, thread ID and image regex for one mlpchan thread.

    Attributes set by the constructor:
        site: the literal string 'mlpchan'.
        thread_pattern: regex used to recognise an mlpchan thread URL.
        image_pattern: regex matching one image link in the thread HTML.
        board: board name taken from the URL (group 2 of thread_pattern).
        thread: thread ID taken from the URL (group 3 of thread_pattern).
    """

    def __init__(self, thread_url):
        """Parse *thread_url* as an mlpchan thread.

        Raises:
            NameError: carrying the URL, if it is not an mlpchan thread URL.
        """
        self.site = 'mlpchan'
        print("Checking URL against mlpchan format...")

        # Raw strings keep the regex escapes (\.) away from Python's own
        # string-escape processing.
        self.thread_pattern = (
            r'(http://|https://)?mlpchan\.net/([^/]*)/res/(.*)'
        )
        thread_match = match(self.thread_pattern, thread_url)

        # If the URL is not an mlpchan thread, say so and raise
        if thread_match is None:
            print("\tNot a mlpchan thread")
            raise NameError(thread_url)

        print("\tURL is an mlpchan thread")
        # Group 1 is the site-relative directory, group 2 the file name
        self.image_pattern = r'a href="(/[^/]*/src/)([^"]*)"'
        self.board = thread_match.group(2)
        self.thread = thread_match.group(3)

    def download_from_match(self, image_match):
        """Download the image described by one match of image_pattern.

        The image is saved under its file name in the current directory;
        the download is skipped when a file of that name already exists.

        Returns:
            The image file name.
        """
        image_name = image_match.group(2)
        image_url = image_match.group(1) + image_name
        if not isfile(image_name):
            urlretrieve('http://mlpchan.net' + image_url, image_name)
        return image_name
- # Class used to download all images from a 4chan thread
- class _4chan_thread_collector:
- # Constructor
- def __init__(self,argv):
- self.info_types = {_4chan_thread_info, mlpchan_thread_info}
- # Record the command line arguments
- self.args = argv
- # end of __init__
- # Run the downloader
- def run_downloader(self):
- print "Starting"
- # Record the starting directory
- self.starting_dir = getcwd()
- # Process command line arguments to figure out the correct thread URL
- # and download location
- thread_url = self.process_arguments()
- # Loop through each of the possible imageboards to fine if any
- # match the given URL
- for next_type in self.info_types:
- try:
- self.thread_info = next_type(thread_url)
- break;
- except NameError:
- self.thread_info = None
- if self.thread_info == None:
- print "URL matches no known imageboard, exiting"
- quit()
- # Create the directory for the board and thread,
- # and move into that directory
- self.create_directories(self.thread_info)
- # Download the HTML source for the thread
- urlretrieve(thread_url,self.thread_info.thread)
- # Search the HTML source for every image in the thread and download them
- self.download_images(thread_url,self.thread_info)
- # Delete the HTML source file and navigate back to where we started
- remove(self.thread_info.thread)
- chdir(self.starting_dir)
- # End of run_downloader
- # Print a help message
- def print_help(self):
- print 'Usage:'
- print '4search.py thread_url [-d download_location] [-h to display help]'
- # End of print_help
- # Process the incoming command line arguments
- def process_arguments(self):
- print self.args
- # If there are no argument, print the help message and exit
- if len(self.args)==1:
- self.print_help()
- quit()
- # If the -h flag is present, print the help message
- if "-h" in self.args:
- hswitch = self.args.index("-h")
- self.args.pop(hswitch)
- self.print_help()
- # if the -d flag is present, navigate to the given directory
- # create it if necessary
- if "-d" in self.args:
- dswitch = self.args.index("-d")
- self.args.pop(dswitch)
- directory = self.args.pop(dswitch)
- if not isdir(directory):
- mkdir(directory)
- chdir(directory)
- # After evaluating any flags, if there are 2 arguments,
- # the 2nd is the thread URL
- if len(self.args)==2:
- return self.args[1]
- # If there are any other number of arguments, exit
- else:
- print 'Bad arg list'
- quit()
- # End of process_arguments
- # Create directories for the board and thread
- def create_directories(self,thread_info):
- site = thread_info.site
- board = thread_info.board
- thread = thread_info.thread
- if not isdir(site):
- mkdir(site)
- chdir(site)
- if not isdir(board):
- mkdir(board)
- chdir(board)
- if not isdir(thread):
- mkdir(thread)
- chdir(thread)
- # End of create_directories
- # Parse a downloaded HTML source for thread images and download each
- def download_images(self,thread_url,thread_info):
- # Open the HTML source
- f = open(thread_info.thread, 'r')
- # Read each line of the source
- while True:
- line = f.readline()
- if line=="":
- break
- # Create an iterator for each match of the pattern on that line
- images = finditer(thread_info.image_pattern, line)
- # Iterate through each match, pull the image URL from the pattern
- # and download that image
- try:
- while True:
- print thread_info.download_from_match(images.next())+' Successfully Retrieved'
- except StopIteration:
- print 'Finished'
- pass
- # End of download_images
- # End of class _4chan_thread_collector
# Actual script:
# Build a collector from the command line arguments
# and run its download method straight away
_4chan_thread_collector(argv).run_downloader()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement