Advertisement
Guest User

gettum.py

a guest
Sep 2nd, 2014
261
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.33 KB | None | 0 0
  1. import re
  2. import json
  3. import shutil
  4. import os
  5. from requests import get
  6. from bs4 import BeautifulSoup
  7. from datetime import date as dt
  8.  
  9. def check_date(date):
  10.  
  11.     if type(date) is dt:
  12.         return date
  13.     else:
  14.         year = int(str(date)[:4])
  15.         month = int(str(date)[4:])
  16.         return dt(year, month, 1)
  17.  
  18.  
  19. def month_list(date1, date2):
  20.  
  21.     d1, d2 = check_date(date1), check_date(date2)
  22.  
  23.     if d1 > d2:
  24.         d2, d1 = d1, d2
  25.  
  26.     months = [d1]
  27.     while months[-1] < d2:
  28.         last = months[-1]
  29.         if last.month == 12:
  30.             months.append(dt(last.year + 1, 1, 1))
  31.         else:
  32.             months.append(dt(last.year, last.month + 1, 1))
  33.  
  34.     return months
  35.  
  36.  
  37. def get_posts(blog, archive_date):
  38.  
  39.     date = check_date(archive_date)
  40.     response = get("http://%s.tumblr.com/archive/%s/%s" % (blog, date.year, date.month))
  41.     soup = BeautifulSoup(response.text)
  42.     return [int(json.loads(tag['data-json'])['post-id'])
  43.                 for tag in soup.select("div.post.is_photo")]
  44.  
  45.  
  46. def get_images(blog, post_id):
  47.  
  48.     post_url = "http://%s.tumblr.com/post/%s" % (blog, post_id)
  49.     image_url_re = "(http://\d+\.media\.tumblr\.com/\w+/tumblr_\w+_)\d+\.(jpg|gif|png)"
  50.     image_sizes = (1280, 500, 400, 250, 100)
  51.  
  52.     image_urls = re.findall(image_url_re, get(post_url).text)
  53.  
  54.     if not os.path.exists(blog):
  55.         os.makedirs(blog)
  56.  
  57.     downloaded_images = []
  58.     for image in image_urls:
  59.         for size in image_sizes:
  60.             resp = get("%s%s.%s" % (image[0], size, image[1]), stream = True)
  61.             if resp.status_code == 200:
  62.                 file_name = re.search("(tumblr_\w+\.\w+)", resp.url).group(1)
  63.                 with open(os.path.join(blog, file_name), 'wb') as f:
  64.                     for chunk in resp.iter_content(30000):
  65.                         f.write(chunk)
  66.                 downloaded_images.append(resp.url)
  67.                 break
  68.  
  69.     return downloaded_images
  70.  
  71.  
  72. def download(blog, date1, date2, progress = False):
  73.  
  74.     downloaded_images = []
  75.     months = month_list(date1, date2)
  76.     for date in months:
  77.         if progress: print date.year, date.month
  78.         posts = get_posts(blog, date)
  79.         for post in posts:
  80.             if progress: print "    " + str(post)
  81.             downloaded_images += get_images(blog, post)
  82.  
  83.     return downloaded_images
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement