Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import re, praw, requests, os, glob, sys
- from bs4 import BeautifulSoup
# --- Configuration from the command line -----------------------------------
MIN_SCORE = 20  # default minimum score before a submission is downloaded

if len(sys.argv) < 2:
    # No subreddit given: show usage and quit.
    print('Usage:')
    print(' python %s subreddit [minimum score]' % (sys.argv[0]))
    sys.exit()

# The subreddit was specified (the redundant elif guard was removed:
# execution only reaches here when len(sys.argv) >= 2).
targetSubreddit = sys.argv[1]
if len(sys.argv) >= 3:
    # The desired minimum score was also specified:
    MIN_SCORE = int(sys.argv[2])

# Matches a direct imgur image link, capturing the URL without its query
# string.  FIX: the original pattern used a greedy (.*) that always consumed
# the query string, so the trailing (\?.*)? group could never match; [^?]*
# corrects that.  Dots are now escaped and https is accepted as well.
imgurUrlPattern = re.compile(r'(https?://i\.imgur\.com/([^?]*))(\?.*)?')

count = 0  # number of images downloaded so far (incremented by downloadImage)
def downloadImage(imageUrl, localFileName):
    """Download imageUrl and save it to disk as localFileName.

    Streams the response body to disk in 4 KB chunks so large images are
    never held fully in memory, and increments the global download counter
    on success.  Non-200 responses are silently skipped.
    """
    global count
    # stream=True defers fetching the body until iter_content() is called;
    # the original downloaded the whole file into memory first.  A timeout
    # keeps the script from hanging forever on a dead connection.
    response = requests.get(imageUrl, stream=True, timeout=30)
    try:
        if response.status_code == 200:
            print('Downloading %s...' % (localFileName))
            count += 1
            with open(localFileName, 'wb') as fo:
                for chunk in response.iter_content(4096):
                    fo.write(chunk)
    finally:
        # Ensure the connection is released even when the status is not 200
        # (the original leaked the unconsumed response in that case).
        response.close()
# --- Connect to reddit and fetch the subreddit listing ---------------------
# Note: be sure to change the user-agent to something unique.
reddit = praw.Reddit(user_agent='Masoko Mozilla 2015')
submissions = reddit.get_subreddit(targetSubreddit).get_top_from_week(limit=200)
# Other listing helpers that could be used instead of get_top_from_week:
#   .get_hot(limit=50)
#   .get_top_from_year(limit=25)
#   .get_top_from_month(limit=25)
#   .get_top_from_week(limit=25)
#   .get_top_from_day(limit=25)
#   .get_top_from_hour(limit=25)
#   .get_top_from_all(limit=25)
def imageFileFromUrl(imageUrl):
    """Return the filename portion of an image URL, minus any query string."""
    if '?' in imageUrl:
        return imageUrl[imageUrl.rfind('/') + 1:imageUrl.rfind('?')]
    return imageUrl[imageUrl.rfind('/') + 1:]


# --- Process all the submissions from the listing --------------------------
for submission in submissions:
    url = submission.url
    print(url)

    # Skip submissions that haven't reached the minimum score.
    if submission.score < MIN_SCORE:
        continue
    # Skip submissions whose files we have already downloaded.
    if glob.glob('reddit_%s_%s_*' % (targetSubreddit, submission.id)):
        print("we've already downloaded files for this reddit submission")
        continue

    # The '://' substring checks accept both http and https imgur URLs
    # (the original only matched http).
    if '://imgur.com/a/' in url:
        # Album submission: scrape every image link off the album page.
        albumId = url.split('/a/', 1)[1]
        htmlSource = requests.get(url).text
        soup = BeautifulSoup(htmlSource, 'html.parser')
        for match in soup.select('a.zoom'):
            imageUrl = match['href']
            if imageUrl.startswith('//'):
                # Schema-relative link: prepend a schema.  (The original
                # unconditionally prepended 'http:', which would corrupt an
                # already-absolute href; this matches the single-image
                # branch's guard.)
                imageUrl = 'http:' + imageUrl
            localFileName = 'reddit_%s_%s_album_%s_imgur_%s' % (
                targetSubreddit, submission.id, albumId,
                imageFileFromUrl(imageUrl))
            downloadImage(imageUrl, localFileName)
    elif url.endswith(('.jpg', '.png', '.jpeg')) or '://i.imgur.com/' in url:
        # Direct link to an image file.  The two previously duplicated
        # branches (extension match and i.imgur.com host match) are merged.
        # Keep the real filename so the extension matches the content (the
        # original always saved as 'wimage.jpg', even for PNGs).
        imgurFilename = imageFileFromUrl(url) or 'wimage.jpg'
        localFileName = 'reddit_%s_%s_album_None_imgur_%s' % (
            targetSubreddit, submission.id, imgurFilename)
        downloadImage(url, localFileName)
    elif '://imgur.com/gallery/' in url:
        # Gallery submissions are not supported yet.
        print("--------------- /gallery/ ---------------")
    elif '://imgur.com/' in url:
        # Imgur page with a single image: scrape the zoom link, e.g.
        # <a href="//i.imgur.com/XzMLcMx.jpg" class="zoom">
        htmlSource = requests.get(url).text
        soup = BeautifulSoup(htmlSource, 'html.parser')
        matches = soup.select('a.zoom')
        if not matches:
            # Page layout changed or the image was removed; don't crash on
            # the unguarded [0] index the original used.
            continue
        imageUrl = matches[0]['href']
        if imageUrl.startswith('//'):
            # If no schema is supplied in the url, prepend 'http:' to it.
            imageUrl = 'http:' + imageUrl
        localFileName = 'reddit_%s_%s_album_None_imgur_%s' % (
            targetSubreddit, submission.id, imageFileFromUrl(imageUrl))
        downloadImage(imageUrl, localFileName)
    else:
        print("============================= UNKNOWN=====================")

print('%s Wallpapers Downloaded' % count)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement