Advertisement
ChallengerAppeared

4search.py

Sep 12th, 2013
125
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 7.46 KB | None | 0 0
  1. #   4chan thread image collector
  2. #
  3. #
  4. #
  5. #   Author:
  6. #   Date:   9/Aug/2013
  7. #   License: GNU General Public License
  8. #
  9. #   Purpose:
  10. #       Supply a valid 4chan thread URL, and optionally a download directory.
  11. #       Will then find every image in the specified thread and download it.
  12. #
  13. #   Usage:
  14. #       4search.py <thread url> [-d download location] [-h{elp}]
  15.  
  16.  
  17.  
  18.  
  19. from os         import chdir,getcwd,mkdir,remove
  20. from os.path    import isdir,isfile
  21. from platform   import system
  22. from re         import finditer,match
  23. from subprocess import call,check_call,check_output
  24. from sys        import argv
  25. from urllib     import urlretrieve
  26.  
  27.  
  28. class imgboard_info:
  29.     def __init__(self,thread_url):
  30.         raise Exception("imgboard_info.__init__ not overridden")
  31.     def download_from_match(self,image_match):
  32.         raise Exception("imgboard_info.download_from_match not overridden")
  33.  
  34. # Class holding the board name and thread ID number of a thread
  35. class _4chan_thread_info(imgboard_info):
  36.    
  37.     # Constructor
  38.     def __init__(self,thread_url):
  39.        
  40.         self.site = '4chan'
  41.        
  42.         print "Checking URL against 4chan format..."
  43.        
  44.         # Attempt to pull the board name and thread ID from a URL using regex
  45.         self.thread_pattern = '(http://)?boards\.4chan\.org/([^/]*)/res/(.*)'
  46.         thread_match = match(self.thread_pattern, thread_url)
  47.        
  48.         # If the thread is invalid, print so and throw an exception
  49.         if thread_match==None:
  50.             print "\tNot a 4chan thread"
  51.             raise NameError(thread_url)
  52.            
  53.         # Otherwise, pull the correct groups and return
  54.         else:
  55.             print "\tURL is a 4chan thread"
  56.             self.image_pattern = (
  57.                 'a class=\"fileThumb'   # All images start with this
  58.                 '( imgspoiler)?\" '     # Catches spoillered images
  59.                 '(target="_blank )?'    # This can come in random places
  60.                 'href=\"//'             # HTML tag for link location
  61.                 '(images\.4chan\.org/'  # The start of the image URL
  62.                 '[^/]*'                 # The board
  63.                 '/src/)'                # All images are stored in this directory
  64.                 '([^\.]*\.[^\"]*)'      # The filename is the final part of the URL
  65.             )
  66.             self.board = thread_match.group(2)
  67.             self.thread = thread_match.group(3)
  68.     # End of __init__
  69.    
  70.     def download_from_match(self, image_match):
  71.         image_name = image_match.group(4)
  72.         image_url = image_match.group(3)+image_name
  73.         if not isfile(image_name):
  74.                         urlretrieve('http://'+image_url, image_name)
  75.         return image_name
  76.        
  77.     # End of download_from_match
  78.    
  79.    
  80. # End of class _4chan_thread_info
  81.  
  82.  
  83. class mlpchan_thread_info(imgboard_info):
  84.  
  85.     # Constructor
  86.     def __init__(self,thread_url):
  87.        
  88.         self.site = 'mlpchan'
  89.        
  90.         print "Checking URL against mlpchan format..."
  91.    
  92.         # Attempt to pull the board name and thread ID from a URL using regex
  93.         self.thread_pattern = '(http://|https://)?mlpchan\.net/([^/]*)/res/(.*)'
  94.         thread_match = match(self.thread_pattern, thread_url)
  95.  
  96.         # If the thread is invalid, print so and throw an exception
  97.         if thread_match==None:
  98.             print "\tNot a mlpchan thread"
  99.             raise NameError(thread_url)
  100.  
  101.         # Otherwise, pull the correct groups and return
  102.         else:
  103.             print "\tURL is an mlpchan thread"
  104.             self.image_pattern = (
  105.             'a href=\"(/[^/]*/src/)([^\"]*)\"'
  106.             )
  107.             self.board = thread_match.group(2)
  108.             self.thread = thread_match.group(3)
  109.            
  110.     # End of __init__
  111.    
  112.    
  113.     def download_from_match(self, image_match):
  114.         image_name = image_match.group(2)
  115.         image_url = image_match.group(1)+image_name
  116.         if not isfile(image_name):
  117.                         urlretrieve('http://mlpchan.net'+image_url, image_name)
  118.         return image_name
  119.    
  120.     # End of download_from_match
  121.    
  122.  
  123. # End of class mlpchan_thread_info
  124.  
  125.  
  126.  
  127.  
  128. # Class used to download all images from a 4chan thread
  129. class _4chan_thread_collector:
  130.  
  131.     # Constructor
  132.     def __init__(self,argv):
  133.  
  134.         self.info_types = {_4chan_thread_info, mlpchan_thread_info}
  135.        
  136.         # Record the command line arguments
  137.         self.args = argv
  138.        
  139.     # end of __init__
  140.    
  141.    
  142.     # Run the downloader
  143.     def run_downloader(self):
  144.         print "Starting"
  145.         # Record the starting directory
  146.         self.starting_dir = getcwd()
  147.        
  148.         # Process command line arguments to figure out the correct thread URL
  149.         #   and download location
  150.         thread_url = self.process_arguments()
  151.        
  152.         # Loop through each of the possible imageboards to fine if any
  153.         #   match the given URL
  154.         for next_type in self.info_types:
  155.             try:
  156.                 self.thread_info = next_type(thread_url)
  157.                 break;
  158.             except NameError:
  159.                 self.thread_info = None
  160.                
  161.        
  162.         if self.thread_info == None:
  163.             print "URL matches no known imageboard, exiting"
  164.             quit()
  165.        
  166.         # Create the directory for the board and thread,
  167.         #   and move into that directory
  168.         self.create_directories(self.thread_info)
  169.        
  170.         # Download the HTML source for the thread
  171.         urlretrieve(thread_url,self.thread_info.thread)
  172.        
  173.         # Search the HTML source for every image in the thread and download them
  174.         self.download_images(thread_url,self.thread_info)
  175.        
  176.         # Delete the HTML source file and navigate back to where we started
  177.         remove(self.thread_info.thread)
  178.         chdir(self.starting_dir)
  179.        
  180.     # End of run_downloader
  181.    
  182.    
  183.     # Print a help message
  184.     def print_help(self):
  185.         print 'Usage:'
  186.         print '4search.py thread_url [-d download_location] [-h to display help]'
  187.        
  188.     # End of print_help
  189.    
  190.    
  191.     # Process the incoming command line arguments
  192.     def process_arguments(self):
  193.         print self.args
  194.         # If there are no argument, print the help message and exit
  195.         if len(self.args)==1:
  196.             self.print_help()
  197.             quit()
  198.            
  199.         # If the -h flag is present, print the help message
  200.         if "-h" in self.args:
  201.             hswitch = self.args.index("-h")
  202.             self.args.pop(hswitch)
  203.             self.print_help()
  204.            
  205.         # if the -d flag is present, navigate to the given directory
  206.         #   create it if necessary
  207.         if "-d" in self.args:
  208.             dswitch = self.args.index("-d")
  209.             self.args.pop(dswitch)
  210.             directory = self.args.pop(dswitch)
  211.             if not isdir(directory):
  212.                 mkdir(directory)
  213.             chdir(directory)
  214.         # After evaluating any flags, if there are 2 arguments,
  215.         #   the 2nd is the thread URL
  216.         if len(self.args)==2:
  217.             return self.args[1]
  218.            
  219.         # If there are any other number of arguments, exit
  220.         else:
  221.                         print 'Bad arg list'
  222.             quit()
  223.        
  224.     # End of process_arguments
  225.    
  226.    
  227.     # Create directories for the board and thread
  228.     def create_directories(self,thread_info):
  229.         site = thread_info.site
  230.         board = thread_info.board
  231.         thread = thread_info.thread
  232.         if not isdir(site):
  233.             mkdir(site)
  234.         chdir(site)
  235.         if not isdir(board):
  236.             mkdir(board)
  237.         chdir(board)
  238.         if not isdir(thread):
  239.             mkdir(thread)
  240.         chdir(thread)
  241.        
  242.     # End of create_directories
  243.    
  244.    
  245.     # Parse a downloaded HTML source for thread images and download each
  246.     def download_images(self,thread_url,thread_info):
  247.        
  248.         # Open the HTML source
  249.         f = open(thread_info.thread, 'r')
  250.        
  251.         # Read each line of the source
  252.         while True:
  253.             line = f.readline()
  254.             if line=="":
  255.                 break
  256.            
  257.             # Create an iterator for each match of the pattern on that line
  258.             images = finditer(thread_info.image_pattern, line)
  259.             # Iterate through each match, pull the image URL from the pattern
  260.             #   and download that image
  261.             try:
  262.                 while True:
  263.                     print thread_info.download_from_match(images.next())+' Successfully Retrieved'
  264.             except StopIteration:
  265.                                 print 'Finished'
  266.                 pass
  267.        
  268.     # End of download_images
  269.  
  270.    
  271. # End of class _4chan_thread_collector
  272.  
  273. # Actual Scipt:
  274. # Create an instance of the collector
  275. # Pass the command line arguments to its constructor
  276. # Call the download method
  277. _4chan_thread_collector(argv).run_downloader()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement