reddit-images.py

Masoko | Apr 8th, 2016 | Python
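A small script that downloads image submissions from a subreddit's weekly top listing: it fetches the listing through PRAW, follows Imgur album, page, and direct links, and saves each image to the current directory under a reddit_<subreddit>_<submission id>_... filename so that reruns can skip anything already downloaded.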
import re, praw, requests, glob, sys
from bs4 import BeautifulSoup

MIN_SCORE = 20 # the default minimum score a submission needs before it is downloaded

if len(sys.argv) < 2:
    # no command line arguments were given:
    print('Usage:')
    print('  python %s subreddit [minimum score]' % (sys.argv[0]))
    sys.exit()
else:
    # the subreddit was specified:
    targetSubreddit = sys.argv[1]
    if len(sys.argv) >= 3:
        # the desired minimum score was also specified:
        MIN_SCORE = int(sys.argv[2])


# Currently unused; kept for the commented-out regex branch further below.
imgurUrlPattern = re.compile(r'(http://i.imgur.com/(.*))(\?.*)?')

count = 0
def downloadImage(imageUrl, localFileName):
    # Stream the image to disk, counting each successful download.
    response = requests.get(imageUrl)
    if response.status_code == 200:
        print('Downloading %s...' % (localFileName))
        global count
        count += 1
        with open(localFileName, 'wb') as fo:
            for chunk in response.iter_content(4096):
                fo.write(chunk)

# Connect to reddit and fetch the subreddit's top submissions of the week.
r = praw.Reddit(user_agent='Masoko Mozilla 2015') # Note: Be sure to change the user-agent to something unique.
submissions = r.get_subreddit(targetSubreddit).get_top_from_week(limit=200)
# Or use one of these listing methods instead:
#     .get_hot(limit=50)
#     .get_top_from_year(limit=25)
#     .get_top_from_month(limit=25)
#     .get_top_from_week(limit=25)
#     .get_top_from_day(limit=25)
#     .get_top_from_hour(limit=25)
#     .get_top_from_all(limit=25)

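# NOTE: get_subreddit()/get_top_from_week() are the PRAW 3-era API and were
# removed in PRAW 4. A rough modern equivalent (a sketch, not a drop-in patch;
# the client_id/client_secret values are placeholders you must register at
# reddit.com/prefs/apps):
#     reddit = praw.Reddit(client_id='YOUR_ID', client_secret='YOUR_SECRET',
#                          user_agent='Masoko Mozilla 2015')
#     submissions = reddit.subreddit(targetSubreddit).top(time_filter='week', limit=200)
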
# Process each submission in the listing.
for submission in submissions:
    print(submission.url)
    # Check for all the cases where we will skip a submission:
#   if "imgur.com/" not in submission.url:
#       print("  skip non-imgur submissions")
#       continue # skip non-imgur submissions
    if submission.score < MIN_SCORE:
        continue # skip submissions below the minimum score (this should be rare when collecting "top" submissions)
    if len(glob.glob('reddit_%s_%s_*' % (targetSubreddit, submission.id))) > 0:
        print("we've already downloaded files for this reddit submission")
        continue # a file matching this submission's id already exists on disk
    if 'http://imgur.com/a/' in submission.url:
        # This is an album submission: scrape every full-size image link
        # (the <a class="zoom"> anchors) from the album page.
        albumId = submission.url[len('http://imgur.com/a/'):]
        htmlSource = requests.get(submission.url).text

        soup = BeautifulSoup(htmlSource, 'html.parser')
        matches = soup.select('a.zoom')

        for match in matches:
            imageUrl = match['href']
            if imageUrl.startswith('//'):
                # schema-relative link, e.g. //i.imgur.com/XzMLcMx.jpg
                imageUrl = 'http:' + imageUrl
            if '?' in imageUrl:
                imageFile = imageUrl[imageUrl.rfind('/') + 1:imageUrl.rfind('?')]
            else:
                imageFile = imageUrl[imageUrl.rfind('/') + 1:]
            localFileName = 'reddit_%s_%s_album_%s_imgur_%s' % (targetSubreddit, submission.id, albumId, imageFile)
            downloadImage(imageUrl, localFileName)
    elif submission.url.endswith(('.jpg', '.jpeg', '.png')):
        # The submission links directly to an image file; keep its original filename.
        imgurFilename = submission.url[submission.url.rfind('/') + 1:]
        localFileName = 'reddit_%s_%s_album_None_imgur_%s' % (targetSubreddit, submission.id, imgurFilename)
        downloadImage(submission.url, localFileName)
    elif 'http://imgur.com/gallery/' in submission.url:
        # This is a gallery submission. Handling is disabled for now; the
        # commented-out code below is the same scrape used for albums above.
        print("--------------- /gallery/ ---------------")
        #albumId = submission.url[len('http://imgur.com/gallery/'):]
        #htmlSource = requests.get(submission.url).text
        #soup = BeautifulSoup(htmlSource, 'html.parser')
        #matches = soup.select("a.zoom")
        #for match in matches:
        #    imageUrl = match['href']
        #    if '?' in imageUrl:
        #        imageFile = imageUrl[imageUrl.rfind('/') + 1:imageUrl.rfind('?')]
        #    else:
        #        imageFile = imageUrl[imageUrl.rfind('/') + 1:]
        #    localFileName = 'reddit_%s_%s_album_%s_imgur_%s' % (targetSubreddit, submission.id, albumId, imageFile)
        #    downloadImage('http:' + match['href'], localFileName)
    elif 'http://i.imgur.com/' in submission.url:
        # The URL is a direct i.imgur.com link; keep the image's original filename.
#       mo = imgurUrlPattern.search(submission.url) # regex here instead of BeautifulSoup because we are parsing a URL, not HTML
        imgurFilename = submission.url[submission.url.rfind('/') + 1:]
        if '?' in imgurFilename:
            imgurFilename = imgurFilename[:imgurFilename.find('?')]
        localFileName = 'reddit_%s_%s_album_None_imgur_%s' % (targetSubreddit, submission.id, imgurFilename)
        downloadImage(submission.url, localFileName)

    elif '://imgur.com/' in submission.url:
        # This is an Imgur page with a single image: scrape the full-size link.
        htmlSource = requests.get(submission.url).text # download the image's page
        soup = BeautifulSoup(htmlSource, 'html.parser')
        imageUrl = soup.select('a.zoom')[0]['href'] # e.g. <a href="//i.imgur.com/XzMLcMx.jpg" class="zoom">
        if imageUrl.startswith('//'):
            # if no schema is supplied in the URL, prepend 'http:' to it
            imageUrl = 'http:' + imageUrl

        if '?' in imageUrl:
            imageFile = imageUrl[imageUrl.rfind('/') + 1:imageUrl.rfind('?')]
        else:
            imageFile = imageUrl[imageUrl.rfind('/') + 1:]

        localFileName = 'reddit_%s_%s_album_None_imgur_%s' % (targetSubreddit, submission.id, imageFile)
        downloadImage(imageUrl, localFileName)
    else:
        print("============================= UNKNOWN =====================")

print('%d wallpapers downloaded' % count)
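A quick usage sketch (the subreddit name and score threshold below are just example values; a PRAW 3.x install is assumed, since the get_subreddit()/get_top_from_week() calls above were removed in PRAW 4):

  pip install "praw<4" requests beautifulsoup4
  python reddit-images.py wallpapers 50

Images land in the current directory under the reddit_<subreddit>_<submission id>_album_<album id or None>_imgur_<image file> pattern, which is also what the glob check uses to skip already-downloaded submissions on a rerun.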