reddit_top_x_of_x.py
Oct 26th, 2017
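# Downloads media from a subreddit's "top" listing for a given timeframe,
# saving the files and the raw listing JSON under <sub>/<YYYY-MM-DD>/.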
import requests
import json
import sys
import os
import threading
import re
from datetime import datetime

# Positional arguments: subreddit name, timeframe (hour/day/week/month/year/all),
# and the number of posts to request.
sub = sys.argv[1]
timeframe = sys.argv[2]
limit = sys.argv[3]
headers = {'User-Agent': 'Periodical Image Downloader'}
imgnumber = 0
extensions = ['.jpg', '.png', '.jpeg', '.gif', '.mp4', '.webm', '.gifv']
links = []
finished_links = []
threads = []
today = datetime.now().strftime('%Y-%m-%d')

p = requests.get('https://www.reddit.com/r/{}/top.json?sort=top&t={}&limit={}'.format(sub, timeframe, limit), headers=headers)
p_json = json.loads(p.text)

# Create <sub>/<date>/json_file even when part of the path already exists
# (the old isdir check skipped makedirs if only the date directory existed).
os.makedirs(sub + '/' + today + '/json_file', exist_ok=True)

# Keep a copy of the raw listing JSON alongside the downloads.
with open(sub + '/' + today + '/json_file/r_{}_top_{}_of_{}.json'.format(sub, limit, timeframe), 'w') as f:
    f.write(p.text)

# Collect candidate URLs: known image hosts, or direct links with a media extension.
for a in p_json['data']['children']:
    link = a['data']['url']
    if 'gfycat' in link or 'imgur' in link or 'i.redd.it' in link or link.endswith(tuple(extensions)):
        links.append(link)

def download(url, file_name):
    """Fetch url and write it to <sub>/<date>/<file_name>."""
    print(file_name)
    with open(sub + '/' + today + '/' + file_name, 'wb') as file:
        response = requests.get(url, headers=headers)
        file.write(response.content)

print('Processing links')

for c in links:
    if "imgur.com" in c:
        # Albums and galleries are expanded later, one file at a time.
        if '/a/' in c or '/gallery/' in c:
            finished_links.append(c)

        elif c.endswith(tuple(extensions)):
            # imgur serves .gifv as an HTML wrapper; the real file is the .mp4.
            if c.endswith('.gifv'):
                newurl = c.replace('.gifv', '.mp4')
                finished_links.append(newurl)

            else:
                finished_links.append(c)

        else:
            # Bare imgur page: scrape the direct i.imgur.com URL out of the HTML.
            html_page = requests.get(c)
            if html_page.status_code == 404:
                print('404: skipping')
            else:
                print(c)
                imgur_id = c.split('/')[-1]
                try:
                    link = re.findall(r'(?:href|src)="(?:https?:)?(\/\/i\.imgur\.com\/{}\.\S+?)"'.format(imgur_id), html_page.text)[0]
                    link = 'https:' + link
                    finished_links.append(link)
                except IndexError:
                    # No direct link found; log the URL stripped of query params and move on.
                    print('IndexError on link {}'.format(c))
                    fixedlink = c.split('?')[0]
                    print(fixedlink)

  72.     elif "i.redd.it" in c or "i.reddituploads.com" in c:
  73.         finished_links.append(c)
  74.  
  75.     elif "gfycat.com" in c and not c.endswith('.webm'):
  76.         gfycat_id = c.split('/')[-1]
  77.         link = 'http://giant.gfycat.com/{}.webm'.format(gfycat_id)
  78.         finished_links.append(link)
  79.  
  80.     elif c.endswith(tuple(extensions)):
  81.         finished_links.append(c)
  82.  
print('Downloading images')
try:
    for d in finished_links:
        imgnumber += 1
        a_imgnumber = 0
        a_threads = []
        donelinks = []
        if '/a/' in d or '/gallery/' in d:
            # Albums get their own numbered subdirectory under <sub>/<date>/,
            # matching the path the download() threads write to (the old code
            # created <sub>/<imgnumber> without the date and the writes failed).
            if not os.path.isdir(sub + '/' + today + '/' + str(imgnumber)):
                os.makedirs(sub + '/' + today + '/' + str(imgnumber))
            # The /layout/blog view embeds each image's hash and extension.
            html_page = requests.get(d + '/layout/blog')
            if html_page.status_code == 404:
                print('404: skipping')
            else:
                imglinks = re.findall(r'\.*?{"hash":"([a-zA-Z0-9]+)".*?"ext":"(\.[a-zA-Z0-9]+)".*?', html_page.text)
                for i in imglinks:
                    try:
                        if i[0] + i[1] not in donelinks:
                            a_imgnumber += 1
                            # Prefer the .mp4 version of album .gifs.
                            if i[1] == '.gif':
                                ext = '.mp4'
                            else:
                                ext = i[1]
                            g = threading.Thread(target=download, args=('https://i.imgur.com/' + i[0] + ext, str(imgnumber) + '/' + str(a_imgnumber) + ext))
                            a_threads.append(g)
                            g.start()
                            donelinks.append(i[0] + i[1])
                    except KeyboardInterrupt:
                        print('\nCtrl-C Pressed; Finishing current threads then stopping...')
                        for f in a_threads:
                            f.join()
                        sys.exit()
                for f in a_threads:
                    f.join()
        else:
            # Single file: download in its own thread, keeping the original name.
            t = threading.Thread(target=download, args=(d, os.path.basename(d)))
            t.start()
            threads.append(t)

    for e in threads:
        e.join()

except KeyboardInterrupt:
    print('\nCtrl-C Pressed; Finishing current threads then stopping...')
    for e in threads:
        e.join()
    sys.exit()
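
Example usage, a sketch assuming Python 3 with the requests package installed; "wallpapers", "week", and "25" are placeholder values for the subreddit, Reddit's t= timeframe (hour/day/week/month/year/all), and the post limit:

    python reddit_top_x_of_x.py wallpapers week 25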