reddit-images.py

Masoko | Apr 8th, 2016 | Python
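A small script that downloads image submissions from a subreddit's weekly top listing: it fetches the listing through PRAW, follows Imgur album, page, and direct links, and saves each image to the current directory under a reddit_<subreddit>_<submission id>_... filename so that reruns can skip anything already downloaded.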
import re, praw, requests, glob, sys
from bs4 import BeautifulSoup

MIN_SCORE = 20 # the default minimum score a submission needs before it is downloaded

if len(sys.argv) < 2:
    # no command line arguments were given:
    print('Usage:')
    print('  python %s subreddit [minimum score]' % (sys.argv[0]))
    sys.exit()
else:
    # the subreddit was specified:
    targetSubreddit = sys.argv[1]
    if len(sys.argv) >= 3:
        # the desired minimum score was also specified:
        MIN_SCORE = int(sys.argv[2])


# Currently unused; kept for the commented-out regex branch further below.
imgurUrlPattern = re.compile(r'(http://i.imgur.com/(.*))(\?.*)?')

count = 0
def downloadImage(imageUrl, localFileName):
    # Stream the image to disk, counting each successful download.
    response = requests.get(imageUrl)
    if response.status_code == 200:
        print('Downloading %s...' % (localFileName))
        global count
        count += 1
        with open(localFileName, 'wb') as fo:
            for chunk in response.iter_content(4096):
                fo.write(chunk)

# Connect to reddit and fetch the subreddit's top submissions of the week.
r = praw.Reddit(user_agent='Masoko Mozilla 2015') # Note: Be sure to change the user-agent to something unique.
submissions = r.get_subreddit(targetSubreddit).get_top_from_week(limit=200)
# Or use one of these listing methods instead:
#     .get_hot(limit=50)
#     .get_top_from_year(limit=25)
#     .get_top_from_month(limit=25)
#     .get_top_from_week(limit=25)
#     .get_top_from_day(limit=25)
#     .get_top_from_hour(limit=25)
#     .get_top_from_all(limit=25)

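# NOTE: get_subreddit()/get_top_from_week() are the PRAW 3-era API and were
# removed in PRAW 4. A rough modern equivalent (a sketch, not a drop-in patch;
# the client_id/client_secret values are placeholders you must register at
# reddit.com/prefs/apps):
#     reddit = praw.Reddit(client_id='YOUR_ID', client_secret='YOUR_SECRET',
#                          user_agent='Masoko Mozilla 2015')
#     submissions = reddit.subreddit(targetSubreddit).top(time_filter='week', limit=200)
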
# Process each submission in the listing.
for submission in submissions:
    print(submission.url)
    # Check for all the cases where we will skip a submission:
#   if "imgur.com/" not in submission.url:
#       print("  skip non-imgur submissions")
#       continue # skip non-imgur submissions
    if submission.score < MIN_SCORE:
        continue # skip submissions below the minimum score (this should be rare when collecting "top" submissions)
    if len(glob.glob('reddit_%s_%s_*' % (targetSubreddit, submission.id))) > 0:
        print("we've already downloaded files for this reddit submission")
        continue # a file matching this submission's id already exists on disk
    if 'http://imgur.com/a/' in submission.url:
        # This is an album submission: scrape every full-size image link
        # (the <a class="zoom"> anchors) from the album page.
        albumId = submission.url[len('http://imgur.com/a/'):]
        htmlSource = requests.get(submission.url).text

        soup = BeautifulSoup(htmlSource, 'html.parser')
        matches = soup.select('a.zoom')

        for match in matches:
            imageUrl = match['href']
            if imageUrl.startswith('//'):
                # schema-relative link, e.g. //i.imgur.com/XzMLcMx.jpg
                imageUrl = 'http:' + imageUrl
            if '?' in imageUrl:
                imageFile = imageUrl[imageUrl.rfind('/') + 1:imageUrl.rfind('?')]
            else:
                imageFile = imageUrl[imageUrl.rfind('/') + 1:]
            localFileName = 'reddit_%s_%s_album_%s_imgur_%s' % (targetSubreddit, submission.id, albumId, imageFile)
            downloadImage(imageUrl, localFileName)
    elif submission.url.endswith(('.jpg', '.jpeg', '.png')):
        # The submission links directly to an image file; keep its original filename.
        imgurFilename = submission.url[submission.url.rfind('/') + 1:]
        localFileName = 'reddit_%s_%s_album_None_imgur_%s' % (targetSubreddit, submission.id, imgurFilename)
        downloadImage(submission.url, localFileName)
    elif 'http://imgur.com/gallery/' in submission.url:
        # This is a gallery submission. Handling is disabled for now; the
        # commented-out code below is the same scrape used for albums above.
        print("--------------- /gallery/ ---------------")
        #albumId = submission.url[len('http://imgur.com/gallery/'):]
        #htmlSource = requests.get(submission.url).text
        #soup = BeautifulSoup(htmlSource, 'html.parser')
        #matches = soup.select("a.zoom")
        #for match in matches:
        #    imageUrl = match['href']
        #    if '?' in imageUrl:
        #        imageFile = imageUrl[imageUrl.rfind('/') + 1:imageUrl.rfind('?')]
        #    else:
        #        imageFile = imageUrl[imageUrl.rfind('/') + 1:]
        #    localFileName = 'reddit_%s_%s_album_%s_imgur_%s' % (targetSubreddit, submission.id, albumId, imageFile)
        #    downloadImage('http:' + match['href'], localFileName)
    elif 'http://i.imgur.com/' in submission.url:
        # The URL is a direct i.imgur.com link; keep the image's original filename.
#       mo = imgurUrlPattern.search(submission.url) # regex here instead of BeautifulSoup because we are parsing a URL, not HTML
        imgurFilename = submission.url[submission.url.rfind('/') + 1:]
        if '?' in imgurFilename:
            imgurFilename = imgurFilename[:imgurFilename.find('?')]
        localFileName = 'reddit_%s_%s_album_None_imgur_%s' % (targetSubreddit, submission.id, imgurFilename)
        downloadImage(submission.url, localFileName)

    elif '://imgur.com/' in submission.url:
        # This is an Imgur page with a single image: scrape the full-size link.
        htmlSource = requests.get(submission.url).text # download the image's page
        soup = BeautifulSoup(htmlSource, 'html.parser')
        imageUrl = soup.select('a.zoom')[0]['href'] # e.g. <a href="//i.imgur.com/XzMLcMx.jpg" class="zoom">
        if imageUrl.startswith('//'):
            # if no schema is supplied in the URL, prepend 'http:' to it
            imageUrl = 'http:' + imageUrl

        if '?' in imageUrl:
            imageFile = imageUrl[imageUrl.rfind('/') + 1:imageUrl.rfind('?')]
        else:
            imageFile = imageUrl[imageUrl.rfind('/') + 1:]

        localFileName = 'reddit_%s_%s_album_None_imgur_%s' % (targetSubreddit, submission.id, imageFile)
        downloadImage(imageUrl, localFileName)
    else:
        print("============================= UNKNOWN =====================")

print('%d wallpapers downloaded' % count)
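A quick usage sketch (the subreddit name and score threshold below are just example values; a PRAW 3.x install is assumed, since the get_subreddit()/get_top_from_week() calls above were removed in PRAW 4):

  pip install "praw<4" requests beautifulsoup4
  python reddit-images.py wallpapers 50

Images land in the current directory under the reddit_<subreddit>_<submission id>_album_<album id or None>_imgur_<image file> pattern, which is also what the glob check uses to skip already-downloaded submissions on a rerun.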