Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # anon-dl.py
- # november 2014
- # from minnesota with love
- # About:
- # This is a program written in Python that will download all images
- # in a given category or sub-forum of anon-ib.com
- # Requires:
- # Python 2.7-ish
- # Beautiful Soup 4
- # Known to work on:
- # Ubuntu 14.04 x64
- # But in theory, with a minor tweak here or there, could work on Windows or OSX
- # How to:
- # Save this text as anon-dl.py
- #
- # In the folder where you saved this, create another folder named "images"
- #
- # Open a terminal into the folder where this file lives and run:
- # python anon-dl.py <CATEGORY>
- # Of course, replace <CATEGORY> with the URL name of the category.
- #
- # For Minnesota, for example:
- # python anon-dl.py ms
- #
- # Or ebony?:
- # python anon-dl.py eb
- #
- # If you run it without a category URL name, it will default to Austrailia. ;)
- # Free for all to use or modify. -ke
- from bs4 import BeautifulSoup
- import sys
- import urllib
- import urllib2
- # global settings
- savefolder = 'images/'
- rootdomain = 'http://anon-ib.com'
- DEFAULT_CAT = 'au'
- # Flatten a list of lists
- # From: http://stackoverflow.com/questions/406121/flattening-a-shallow-list-in-python
- def flatten(x):
- result = []
- for el in x:
- if hasattr(el, "__iter__") and not isinstance(el, basestring):
- result.extend(flatten(el))
- else:
- result.append(el)
- return result
- # Find and return all page links from a given starting page
- def getPageLinks(soup, stateabbr):
- pageLinks = []
- for linky in soup.find_all('a'):
- # Is this a link?
- if linky.has_attr('href'):
- # Make sure it's a local (and proper) reference
- if not linky['href'].startswith('/' + stateabbr + '/res/') and not linky['href'].startswith('/recent.html'):
- # Is it a link to a page?
- if linky['href'].endswith('html'):
- # Passes checks, add it to our list
- pageLinks.append(linky['href'])
- return pageLinks
- # Find and return all page links from the sab-category's starting page
- def getPages(stateabbr):
- # Build URL
- builtlink = rootdomain + '/' + stateabbr + '/index.html'
- # Get HTML
- response = urllib2.urlopen(builtlink)
- html = response.read()
- # Make soup
- soup = BeautifulSoup(html, "html.parser")
- # Return list of links to pages
- return getPageLinks(soup, stateabbr)
- # Find and return all thread links from a given page
- def getThreadLinks(soup, stateabbr):
- threadLinks = []
- for linky in soup.find_all('a'):
- # Is this a link?
- if linky.has_attr('href'):
- # Make sure it's a local (and proper) reference
- if linky['href'].startswith('/' + stateabbr + '/res/') and not linky['href'].startswith('/recent.html'):
- # Is it a link to a page?
- if linky['href'].endswith('html'):
- # Passes checks, add it to our list
- threadLinks.append(linky['href'])
- return threadLinks
- # Find and return all thread links from each page in a list of pages
- def getThreads(pageLinkList, stateabbr):
- allThreadList = []
- for page in pageLinkList:
- # Build URL
- builtlink = rootdomain + page
- # Get HTML
- response = urllib2.urlopen(builtlink)
- html = response.read()
- # Make soup
- soup = BeautifulSoup(html, "html.parser")
- # Add to big list
- allThreadList.append(getThreadLinks(soup, stateabbr))
- return flatten(allThreadList)
- # Find and return all image links from a given thread page
- def getImgLinks(soup):
- imageLinks = []
- for linky in soup.find_all('a'):
- # Is this a link?
- if linky.has_attr('href'):
- # Make sure it's a local reference
- if linky['href'].startswith('/'):
- # Is it a link to an image?
- if linky['href'].endswith('jpg') or linky['href'].endswith('jpeg') or linky['href'].endswith('png'):
- # Passes checks, add it to our list
- imageLinks.append(linky['href'])
- return imageLinks
- # Find and return all image links from each thread in a list of threads
- def getAllImgLinks(threadlist):
- allImageList = []
- for thread in threadlist:
- # Build URL
- builtlink = rootdomain + thread
- # Get HTML
- response = urllib2.urlopen(builtlink)
- html = response.read()
- # Make soup
- soup = BeautifulSoup(html, "html.parser")
- # Add to big list
- allImageList.append(getImgLinks(soup))
- return flatten(allImageList)
- # Download and save the images
- def getFiles(links):
- for img in links:
- imgurl = rootdomain + img
- print imgurl
- #Fix filename to make sense to our local filesystem
- fname = str(img).split('/')[-1:][0]
- localname = savefolder + fname
- # Download the image!
- urllib.urlretrieve(imgurl, localname)
- return True
- # Main function
- def doTheThing(stateabbr):
- # Debug
- #print getPages(stateabbr)
- #print getThreads(getPages(stateabbr))
- #print getAllImgLinks(getThreads(getPages(stateabbr)))
- # Do everything
- getFiles(getAllImgLinks(getThreads(getPages(stateabbr), stateabbr)))
- print 'Complete.'
- if len(sys.argv) >= 2:
- inputcat = str(sys.argv[1])
- else:
- inputcat = DEFAULT_CAT
- print 'Executing: ' + str(sys.argv)
- print 'Please wait as we gather some data...'
- print 'Downloading will begin when filenames start to scroll down your screen.'
- # Start the main function
- doTheThing(inputcat)
- # Debug
- #print getPageLinks(soup)
- #print getThreadLinks(soup)
- #getFiles(getImgLinks(soup))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement