# 4search.py

#   4chan thread image collector
#
#
#
#   Author:
#   Date:   9/Aug/2013
#   License: GNU General Public License
#
#   Purpose:
#       Supply a valid 4chan thread URL, and optionally a download directory.
#       Will then find every image in the specified thread and download it.
#
#   Usage:
#       4search.py <thread url> [-d download location] [-h{elp}]


from os         import chdir,getcwd,mkdir,remove
from os.path    import isdir,isfile
from platform   import system
from re         import finditer,match
from subprocess import call,check_call,check_output
from sys        import argv
from urllib     import urlretrieve


class imgboard_info:
    """Abstract interface for the description of one imageboard thread.

    The subclasses in this file override both methods: ``__init__``
    validates a thread URL and ``download_from_match`` fetches the image
    referenced by one regex match.
    """

    def __init__(self,thread_url):
        """Placeholder constructor; subclasses must override it.

        Args:
            thread_url: the thread URL a subclass would parse (unused here).

        Raises:
            NotImplementedError: always.
        """
        # NotImplementedError is the conventional signal for an abstract
        #   method; it is still an Exception, so existing handlers work
        raise NotImplementedError("imgboard_info.__init__ not overridden")

    def download_from_match(self,image_match):
        """Placeholder download hook; subclasses must override it.

        Args:
            image_match: the regex match a subclass would download from
                (unused here).

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("imgboard_info.download_from_match not overridden")

# Class holding the site name, board name and thread ID of a 4chan thread,
#   plus the regex used to find image links in that thread's HTML
class _4chan_thread_info(imgboard_info):

    # Constructor
    def __init__(self,thread_url):
        """Validate a 4chan thread URL and record what was parsed from it.

        Always sets ``site`` and ``thread_pattern``; on a successful match
        also sets ``image_pattern``, ``board`` and ``thread``.

        Args:
            thread_url: the URL to check; the scheme is optional.

        Raises:
            NameError: carrying ``thread_url`` when it does not match the
                4chan thread format.
        """

        self.site = '4chan'

        print("Checking URL against 4chan format...")

        # Attempt to pull the board name and thread ID from a URL using regex.
        #   The scheme is optional and may be http or https.
        self.thread_pattern = r'(https?://)?boards\.4chan\.org/([^/]*)/res/(.*)'
        thread_match = match(self.thread_pattern, thread_url)

        # If the thread is invalid, print so and throw an exception
        if thread_match is None:
            print("\tNot a 4chan thread")
            raise NameError(thread_url)

        # Otherwise, record the image pattern and pull the correct groups
        print("\tURL is a 4chan thread")
        self.image_pattern = (
            r'a class="fileThumb'     # All images start with this
            r'( imgspoiler)?" '       # Catches spoilered images
            r'(target="_blank"? )?'   # Optional target attribute before href;
                                      #   its closing quote is optional so the
                                      #   formerly accepted form still matches
            r'href="//'               # HTML attribute for link location
            r'(images\.4chan\.org/'   # The start of the image URL
            r'[^/]*'                  # The board
            r'/src/)'                 # All images are stored in this directory
            r'([^\.]*\.[^"]*)'        # The filename is the final part of the URL
        )
        self.board = thread_match.group(2)
        self.thread = thread_match.group(3)
    # End of __init__

    def download_from_match(self, image_match):
        """Download the image described by one ``image_pattern`` match.

        Args:
            image_match: a match of ``self.image_pattern``; group 3 is the
                host and directory part of the image URL, group 4 the
                filename.

        Returns:
            The image filename, which is also the local path written to.
        """
        image_name = image_match.group(4)
        image_url = image_match.group(3)+image_name
        # Skip the download when a file of that name already exists locally
        if not isfile(image_name):
            urlretrieve('http://'+image_url, image_name)
        return image_name

    # End of download_from_match


# End of class _4chan_thread_info


class mlpchan_thread_info(imgboard_info):
    """Site name, board name, thread ID and image regex of an mlpchan thread."""

    # Constructor
    def __init__(self,thread_url):
        """Check ``thread_url`` against the mlpchan thread format.

        Always sets ``site`` and ``thread_pattern``; on a successful match
        also sets ``image_pattern``, ``board`` and ``thread``.

        Raises:
            NameError: carrying ``thread_url`` when the URL does not match.
        """

        self.site = 'mlpchan'

        print("Checking URL against mlpchan format...")

        # Group 2 of the URL regex is the board name, group 3 the thread ID
        self.thread_pattern = r'(http://|https://)?mlpchan\.net/([^/]*)/res/(.*)'
        parsed = match(self.thread_pattern, thread_url)

        # Reject anything that is not an mlpchan thread
        if parsed is None:
            print("\tNot a mlpchan thread")
            raise NameError(thread_url)

        print("\tURL is an mlpchan thread")
        # Group 1 is the server-relative image directory, group 2 the filename
        self.image_pattern = r'a href="(/[^/]*/src/)([^"]*)"'
        self.board, self.thread = parsed.group(2, 3)

    # End of __init__


    def download_from_match(self, image_match):
        """Fetch the image named by one ``image_pattern`` match.

        Returns the image filename, which is also the local path used.
        """
        directory, filename = image_match.group(1, 2)
        # Nothing to fetch when the file is already present locally
        if isfile(filename):
            return filename
        urlretrieve('http://mlpchan.net'+directory+filename, filename)
        return filename

    # End of download_from_match


# End of class mlpchan_thread_info


# Class used to download all images from an imageboard thread
class _4chan_thread_collector:
    """Command-line driver that downloads every image of one thread.

    ``info_types`` lists the imageboard descriptions that are tried against
    the thread URL; ``args`` holds the command line being processed.
    """

    # Constructor
    def __init__(self,argv):
        """Record the supported imageboards and the command line.

        Args:
            argv: the argument list, program name first.
        """

        self.info_types = {_4chan_thread_info, mlpchan_thread_info}

        # Record a copy of the command line arguments: process_arguments
        #   pops flags off this list and must not alter the caller's list
        self.args = list(argv)

    # end of __init__


    # Run the downloader
    def run_downloader(self):
        """Download every image of the thread named on the command line.

        Images are stored under ``<site>/<board>/<thread>/`` below the
        download directory. The working directory is restored before
        returning, including when a step fails.

        Raises:
            SystemExit: with status 1 when the URL matches no known
                imageboard, or as raised by ``process_arguments``.
        """
        print("Starting")
        # Record the starting directory
        self.starting_dir = getcwd()

        try:
            # Process command line arguments to figure out the correct
            #   thread URL and download location
            thread_url = self.process_arguments()

            # Find out which of the possible imageboards matches the URL
            self.thread_info = self._match_imageboard(thread_url)
            if self.thread_info is None:
                print("URL matches no known imageboard, exiting")
                raise SystemExit(1)

            # The URL patterns accept a URL without a scheme, but such a
            #   URL cannot be fetched over the network, so add one
            if '://' not in thread_url:
                thread_url = 'http://'+thread_url

            # Create the directory for the board and thread,
            #   and move into that directory
            self.create_directories(self.thread_info)

            # Download the HTML source for the thread
            urlretrieve(thread_url,self.thread_info.thread)
            try:
                # Search the HTML source for every image and download them
                self.download_images(thread_url,self.thread_info)
            finally:
                # Delete the HTML source file even if a download failed
                remove(self.thread_info.thread)
        finally:
            # Navigate back to where we started
            chdir(self.starting_dir)

    # End of run_downloader


    # Find the imageboard description that accepts the given URL
    def _match_imageboard(self,thread_url):
        """Return an info object built by the first entry of
        ``self.info_types`` that accepts ``thread_url``, or None."""
        for next_type in self.info_types:
            try:
                return next_type(thread_url)
            except NameError:
                # This imageboard rejected the URL; try the next one
                continue
        return None

    # End of _match_imageboard


    # Print a help message
    def print_help(self):
        """Print the usage message."""
        print('Usage:')
        print('4search.py thread_url [-d download_location] [-h to display help]')

    # End of print_help


    # Process the incoming command line arguments
    def process_arguments(self):
        """Consume the flags in ``self.args`` and return the thread URL.

        ``-h`` prints the help message. ``-d <dir>`` creates ``<dir>`` if
        necessary and makes it the working directory, but only once the
        whole argument list has been found valid.

        Returns:
            The thread URL, i.e. the single remaining argument.

        Raises:
            SystemExit: with status 0 when only help was requested (no
                arguments, or ``-h`` alone); with status 1 when the
                argument list is malformed.
        """
        print(self.args)
        # If there are no arguments, print the help message and exit
        if len(self.args)==1:
            self.print_help()
            raise SystemExit(0)

        # If the -h flag is present, print the help message
        if "-h" in self.args:
            self.args.remove("-h")
            self.print_help()
            # Help was all that was asked for, so there is nothing to do
            if len(self.args)==1:
                raise SystemExit(0)

        # If the -d flag is present, pull out the directory that follows it
        directory = None
        if "-d" in self.args:
            dswitch = self.args.index("-d")
            self.args.pop(dswitch)
            # -d as the last argument has no directory to go with it
            if dswitch>=len(self.args):
                print('Bad arg list')
                raise SystemExit(1)
            directory = self.args.pop(dswitch)

        # After evaluating any flags exactly 2 arguments must remain:
        #   the program name and the thread URL
        if len(self.args)!=2:
            print('Bad arg list')
            raise SystemExit(1)

        # Navigate to the download directory, creating it if necessary
        if directory is not None:
            if not isdir(directory):
                mkdir(directory)
            chdir(directory)

        return self.args[1]

    # End of process_arguments


    # Create directories for the board and thread
    def create_directories(self,thread_info):
        """Create ``<site>/<board>/<thread>`` as needed and move into it.

        Args:
            thread_info: object providing ``site``, ``board`` and
                ``thread`` attributes.
        """
        # Descend one level at a time, creating each missing directory
        for name in (thread_info.site, thread_info.board, thread_info.thread):
            if not isdir(name):
                mkdir(name)
            chdir(name)

    # End of create_directories


    # Parse a downloaded HTML source for thread images and download each
    def download_images(self,thread_url,thread_info):
        """Download every image referenced in the saved thread HTML.

        Args:
            thread_url: unused; kept so existing callers keep working.
            thread_info: object providing ``thread`` (path of the saved
                HTML file), ``image_pattern`` and ``download_from_match``.
        """
        # Open the HTML source; the with block closes it again
        with open(thread_info.thread, 'r') as source:
            # Read each line of the source
            for line in source:
                # Iterate through each match of the pattern on that line
                #   and download that image
                for image_match in finditer(thread_info.image_pattern, line):
                    print(thread_info.download_from_match(image_match)+' Successfully Retrieved')
        # Report completion once, after the whole file has been scanned
        print('Finished')

    # End of download_images


# End of class _4chan_thread_collector

# Actual Script:
# Create an instance of the collector
# Pass the command line arguments to its constructor
# Call the download method
# Guarded so the download only starts when the file is run as a program,
#   not when its classes are loaded from elsewhere
if __name__ == "__main__":
    _4chan_thread_collector(argv).run_downloader()