Advertisement
Guest User

anon-dl.py

a guest
Nov 7th, 2014
2,133
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.21 KB | None | 0 0
  1. # anon-dl.py
  2. # november 2014
  3. # from minnesota with love
  4.  
  5. # About:
  6. # This is a program written in Python that will download all images
  7. # in a given category or sub-forum of anon-ib.com
  8.  
  9. # Requires:
  10. # Python 2.7-ish
  11. # Beautiful Soup 4
  12.  
  13. # Known to work on:
  14. # Ubuntu 14.04 x64
  15. # But in theory, with a minor tweak here or there, could work on Windows or OSX
  16.  
  17. # How to:
  18. # Save this text as anon-dl.py
  19. #
  20. # In the folder where you saved this, create another folder named "images"
  21. #
  22. # Open a terminal into the folder where this file lives and run:
  23. # python anon-dl.py <CATEGORY>
  24. # Of course, replace <CATEGORY> with the URL name of the category.
  25. #
  26. # For Minnesota, for example:
  27. # python anon-dl.py ms
  28. #
  29. # Or ebony?:
  30. # python anon-dl.py eb
  31. #
  32. # If you run it without a category URL name, it will default to Australia. ;)
  33.  
  34. # Free for all to use or modify. -ke
  35.  
  36.  
  37. from bs4 import BeautifulSoup
  38. import sys
  39. import urllib
  40. import urllib2
  41.  
# global settings
savefolder = 'images/'             # download target; must exist before running (see How-to above)
rootdomain = 'http://anon-ib.com'  # site all relative links are resolved against
DEFAULT_CAT = 'au'                 # category used when none is supplied on the command line
  46.  
  47.  
  48. # Flatten a list of lists
  49. # From: http://stackoverflow.com/questions/406121/flattening-a-shallow-list-in-python
  50. def flatten(x):
  51.     result = []
  52.     for el in x:
  53.         if hasattr(el, "__iter__") and not isinstance(el, basestring):
  54.             result.extend(flatten(el))
  55.         else:
  56.             result.append(el)
  57.            
  58.     return result
  59.  
  60. # Find and return all page links from a given starting page
  61. def getPageLinks(soup, stateabbr):
  62.     pageLinks = []
  63.     for linky in soup.find_all('a'):
  64.         # Is this a link?
  65.         if linky.has_attr('href'):
  66.             # Make sure it's a local (and proper) reference
  67.             if not linky['href'].startswith('/' + stateabbr + '/res/') and not linky['href'].startswith('/recent.html'):
  68.                 # Is it a link to a page?
  69.                 if linky['href'].endswith('html'):
  70.                     # Passes checks, add it to our list
  71.                     pageLinks.append(linky['href'])
  72.     return pageLinks
  73.  
  74. # Find and return all page links from the sub-category's starting page
  75. def getPages(stateabbr):
  76.     # Build URL
  77.     builtlink = rootdomain + '/' + stateabbr + '/index.html'
  78.    
  79.     # Get HTML
  80.     response = urllib2.urlopen(builtlink)
  81.     html = response.read()
  82.    
  83.     # Make soup
  84.     soup = BeautifulSoup(html, "html.parser")
  85.    
  86.     # Return list of links to pages
  87.     return getPageLinks(soup, stateabbr)
  88.  
  89. # Find and return all thread links from a given page
  90. def getThreadLinks(soup, stateabbr):
  91.     threadLinks = []
  92.     for linky in soup.find_all('a'):
  93.         # Is this a link?
  94.         if linky.has_attr('href'):
  95.             # Make sure it's a local (and proper) reference
  96.             if linky['href'].startswith('/' + stateabbr + '/res/') and not linky['href'].startswith('/recent.html'):
  97.                 # Is it a link to a page?
  98.                 if linky['href'].endswith('html'):
  99.                     # Passes checks, add it to our list
  100.                     threadLinks.append(linky['href'])
  101.    
  102.     return threadLinks
  103.  
  104. # Find and return all thread links from each page in a list of pages
  105. def getThreads(pageLinkList, stateabbr):
  106.     allThreadList = []
  107.    
  108.     for page in pageLinkList:
  109.         # Build URL
  110.         builtlink = rootdomain + page
  111.        
  112.         # Get HTML
  113.         response = urllib2.urlopen(builtlink)
  114.         html = response.read()
  115.        
  116.         # Make soup
  117.         soup = BeautifulSoup(html, "html.parser")
  118.        
  119.         # Add to big list
  120.         allThreadList.append(getThreadLinks(soup, stateabbr))
  121.    
  122.     return flatten(allThreadList)
  123.  
  124. # Find and return all image links from a given thread page
  125. def getImgLinks(soup):
  126.     imageLinks = []
  127.     for linky in soup.find_all('a'):
  128.         # Is this a link?
  129.         if linky.has_attr('href'):
  130.             # Make sure it's a local reference
  131.             if linky['href'].startswith('/'):
  132.                 # Is it a link to an image?
  133.                 if linky['href'].endswith('jpg') or linky['href'].endswith('jpeg') or linky['href'].endswith('png'):
  134.                     # Passes checks, add it to our list
  135.                     imageLinks.append(linky['href'])
  136.    
  137.     return imageLinks
  138.  
  139. # Find and return all image links from each thread in a list of threads
  140. def getAllImgLinks(threadlist):
  141.     allImageList = []
  142.    
  143.     for thread in threadlist:
  144.         # Build URL
  145.         builtlink = rootdomain + thread
  146.        
  147.         # Get HTML
  148.         response = urllib2.urlopen(builtlink)
  149.         html = response.read()
  150.        
  151.         # Make soup
  152.         soup = BeautifulSoup(html, "html.parser")
  153.        
  154.         # Add to big list
  155.         allImageList.append(getImgLinks(soup))
  156.    
  157.     return flatten(allImageList)
  158.  
  159. # Download and save the images
  160. def getFiles(links):
  161.     for img in links:
  162.         imgurl = rootdomain + img
  163.         print imgurl
  164.        
  165.         #Fix filename to make sense to our local filesystem
  166.         fname = str(img).split('/')[-1:][0]
  167.         localname = savefolder + fname
  168.        
  169.         # Download the image!
  170.         urllib.urlretrieve(imgurl, localname)
  171.    
  172.     return True
  173.  
  174. # Main function
  175. def doTheThing(stateabbr):
  176.     # Debug
  177.     #print getPages(stateabbr)
  178.     #print getThreads(getPages(stateabbr))
  179.     #print getAllImgLinks(getThreads(getPages(stateabbr)))
  180.    
  181.     # Do everything
  182.     getFiles(getAllImgLinks(getThreads(getPages(stateabbr), stateabbr)))
  183.    
  184.     print 'Complete.'
  185.  
  186.  
  187. if len(sys.argv) >= 2:
  188.     inputcat = str(sys.argv[1])
  189. else:
  190.     inputcat = DEFAULT_CAT
  191.  
  192. print 'Executing: ' + str(sys.argv)
  193. print 'Please wait as we gather some data...'
  194. print 'Downloading will begin when filenames start to scroll down your screen.'
  195.  
  196. # Start the main function
  197. doTheThing(inputcat)
  198.  
  199.  
  200. # Debug
  201.  
  202. #print getPageLinks(soup)
  203. #print getThreadLinks(soup)
  204. #getFiles(getImgLinks(soup))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement