Advertisement
Guest User

anon-dl.py

a guest
Nov 7th, 2014
2,133
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.21 KB | None | 0 0
  1. # anon-dl.py
  2. # november 2014
  3. # from minnesota with love
  4.  
  5. # About:
  6. # This is a program written in Python that will download all images
  7. # in a given category or sub-forum of anon-ib.com
  8.  
  9. # Requires:
  10. # Python 2.7-ish
  11. # Beautiful Soup 4
  12.  
  13. # Known to work on:
  14. # Ubuntu 14.04 x64
  15. # But in theory, with a minor tweak here or there, could work on Windows or OSX
  16.  
  17. # How to:
  18. # Save this text as anon-dl.py
  19. #
  20. # In the folder where you saved this, create another folder named "images"
  21. #
  22. # Open a terminal into the folder where this file lives and run:
  23. # python anon-dl.py <CATEGORY>
  24. # Of course, replace <CATEGORY> with the URL name of the category.
  25. #
  26. # For Minnesota, for example:
  27. # python anon-dl.py ms
  28. #
  29. # Or ebony?:
  30. # python anon-dl.py eb
  31. #
  32. # If you run it without a category URL name, it will default to Australia. ;)
  33.  
  34. # Free for all to use or modify. -ke
  35.  
  36.  
  37. from bs4 import BeautifulSoup
  38. import sys
  39. import urllib
  40. import urllib2
  41.  
# global settings
savefolder = 'images/'             # download target; must exist before running (see How-to above)
rootdomain = 'http://anon-ib.com'  # site all relative links are resolved against
DEFAULT_CAT = 'au'                 # category used when none is supplied on the command line
  46.  
  47.  
  48. # Flatten a list of lists
  49. # From: http://stackoverflow.com/questions/406121/flattening-a-shallow-list-in-python
  50. def flatten(x):
  51.     result = []
  52.     for el in x:
  53.         if hasattr(el, "__iter__") and not isinstance(el, basestring):
  54.             result.extend(flatten(el))
  55.         else:
  56.             result.append(el)
  57.            
  58.     return result
  59.  
  60. # Find and return all page links from a given starting page
  61. def getPageLinks(soup, stateabbr):
  62.     pageLinks = []
  63.     for linky in soup.find_all('a'):
  64.         # Is this a link?
  65.         if linky.has_attr('href'):
  66.             # Make sure it's a local (and proper) reference
  67.             if not linky['href'].startswith('/' + stateabbr + '/res/') and not linky['href'].startswith('/recent.html'):
  68.                 # Is it a link to a page?
  69.                 if linky['href'].endswith('html'):
  70.                     # Passes checks, add it to our list
  71.                     pageLinks.append(linky['href'])
  72.     return pageLinks
  73.  
  74. # Find and return all page links from the sub-category's starting page
  75. def getPages(stateabbr):
  76.     # Build URL
  77.     builtlink = rootdomain + '/' + stateabbr + '/index.html'
  78.    
  79.     # Get HTML
  80.     response = urllib2.urlopen(builtlink)
  81.     html = response.read()
  82.    
  83.     # Make soup
  84.     soup = BeautifulSoup(html, "html.parser")
  85.    
  86.     # Return list of links to pages
  87.     return getPageLinks(soup, stateabbr)
  88.  
  89. # Find and return all thread links from a given page
  90. def getThreadLinks(soup, stateabbr):
  91.     threadLinks = []
  92.     for linky in soup.find_all('a'):
  93.         # Is this a link?
  94.         if linky.has_attr('href'):
  95.             # Make sure it's a local (and proper) reference
  96.             if linky['href'].startswith('/' + stateabbr + '/res/') and not linky['href'].startswith('/recent.html'):
  97.                 # Is it a link to a page?
  98.                 if linky['href'].endswith('html'):
  99.                     # Passes checks, add it to our list
  100.                     threadLinks.append(linky['href'])
  101.    
  102.     return threadLinks
  103.  
  104. # Find and return all thread links from each page in a list of pages
  105. def getThreads(pageLinkList, stateabbr):
  106.     allThreadList = []
  107.    
  108.     for page in pageLinkList:
  109.         # Build URL
  110.         builtlink = rootdomain + page
  111.        
  112.         # Get HTML
  113.         response = urllib2.urlopen(builtlink)
  114.         html = response.read()
  115.        
  116.         # Make soup
  117.         soup = BeautifulSoup(html, "html.parser")
  118.        
  119.         # Add to big list
  120.         allThreadList.append(getThreadLinks(soup, stateabbr))
  121.    
  122.     return flatten(allThreadList)
  123.  
  124. # Find and return all image links from a given thread page
  125. def getImgLinks(soup):
  126.     imageLinks = []
  127.     for linky in soup.find_all('a'):
  128.         # Is this a link?
  129.         if linky.has_attr('href'):
  130.             # Make sure it's a local reference
  131.             if linky['href'].startswith('/'):
  132.                 # Is it a link to an image?
  133.                 if linky['href'].endswith('jpg') or linky['href'].endswith('jpeg') or linky['href'].endswith('png'):
  134.                     # Passes checks, add it to our list
  135.                     imageLinks.append(linky['href'])
  136.    
  137.     return imageLinks
  138.  
  139. # Find and return all image links from each thread in a list of threads
  140. def getAllImgLinks(threadlist):
  141.     allImageList = []
  142.    
  143.     for thread in threadlist:
  144.         # Build URL
  145.         builtlink = rootdomain + thread
  146.        
  147.         # Get HTML
  148.         response = urllib2.urlopen(builtlink)
  149.         html = response.read()
  150.        
  151.         # Make soup
  152.         soup = BeautifulSoup(html, "html.parser")
  153.        
  154.         # Add to big list
  155.         allImageList.append(getImgLinks(soup))
  156.    
  157.     return flatten(allImageList)
  158.  
  159. # Download and save the images
  160. def getFiles(links):
  161.     for img in links:
  162.         imgurl = rootdomain + img
  163.         print imgurl
  164.        
  165.         #Fix filename to make sense to our local filesystem
  166.         fname = str(img).split('/')[-1:][0]
  167.         localname = savefolder + fname
  168.        
  169.         # Download the image!
  170.         urllib.urlretrieve(imgurl, localname)
  171.    
  172.     return True
  173.  
  174. # Main function
  175. def doTheThing(stateabbr):
  176.     # Debug
  177.     #print getPages(stateabbr)
  178.     #print getThreads(getPages(stateabbr))
  179.     #print getAllImgLinks(getThreads(getPages(stateabbr)))
  180.    
  181.     # Do everything
  182.     getFiles(getAllImgLinks(getThreads(getPages(stateabbr), stateabbr)))
  183.    
  184.     print 'Complete.'
  185.  
  186.  
  187. if len(sys.argv) >= 2:
  188.     inputcat = str(sys.argv[1])
  189. else:
  190.     inputcat = DEFAULT_CAT
  191.  
  192. print 'Executing: ' + str(sys.argv)
  193. print 'Please wait as we gather some data...'
  194. print 'Downloading will begin when filenames start to scroll down your screen.'
  195.  
  196. # Start the main function
  197. doTheThing(inputcat)
  198.  
  199.  
  200. # Debug
  201.  
  202. #print getPageLinks(soup)
  203. #print getThreadLinks(soup)
  204. #getFiles(getImgLinks(soup))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement