Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import re
- import json
- import shutil
- import os
- from requests import get
- from bs4 import BeautifulSoup
- from datetime import date as dt
def check_date(date):
    """Coerce *date* into a plain ``datetime.date``.

    Args:
        date: A date object, a date subclass such as ``datetime.datetime``,
            or a ``YYYYMM`` value given as an int or string (for example
            ``201301`` or ``"201301"``).

    Returns:
        A plain date is returned unchanged. A date subclass is reduced to a
        plain date with the same year, month and day. Any other value is
        read as ``YYYYMM`` and mapped to the first day of that month.

    Raises:
        ValueError: If a non-date value cannot be read as ``YYYYMM``.
    """
    if isinstance(date, dt):
        if type(date) is dt:
            return date
        # Subclasses such as datetime.datetime cannot be ordered against
        # plain dates, so reduce them to a plain date before returning.
        return dt(date.year, date.month, date.day)
    text = str(date)
    # The first four characters are the year; the remainder is the month.
    return dt(int(text[:4]), int(text[4:]), 1)
def month_list(date1, date2):
    """List the months spanned by two dates, inclusive, in ascending order.

    Args:
        date1: One end of the range; anything ``check_date`` accepts.
        date2: The other end of the range; the two may be given in either
            order.

    Returns:
        A list of dates. The first entry is the earlier of the two checked
        dates; each following entry is the first day of the next month, up
        to and including the month of the later date.
    """
    start, end = sorted((check_date(date1), check_date(date2)))
    months = [start]
    year, month = start.year, start.month
    # Compare on (year, month) rather than on full dates: comparing full
    # dates would add one month too many whenever the end date falls after
    # the first day of its month.
    while (year, month) < (end.year, end.month):
        if month == 12:
            year, month = year + 1, 1
        else:
            month += 1
        months.append(dt(year, month, 1))
    return months
def get_posts(blog, archive_date):
    """Return the ids of the photo posts listed on one archive page.

    Args:
        blog: Blog name, used as the subdomain of ``tumblr.com``.
        archive_date: The month to look up; anything ``check_date`` accepts.

    Returns:
        A list of ints, one per ``div.post.is_photo`` element on the page,
        read from the ``post-id`` key of the element's ``data-json``
        attribute.
    """
    date = check_date(archive_date)
    archive_url = "http://%s.tumblr.com/archive/%s/%s" % (blog, date.year, date.month)
    response = get(archive_url)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (bs4 warns when it has to guess).
    soup = BeautifulSoup(response.text, "html.parser")
    return [int(json.loads(tag['data-json'])['post-id'])
            for tag in soup.select("div.post.is_photo")]
def get_images(blog, post_id):
    """Download the images referenced on a post page into a folder named *blog*.

    For every distinct image found on the page, the sizes in ``image_sizes``
    are requested from largest to smallest and the first one answered with
    HTTP 200 is saved.

    Args:
        blog: Blog name; used as the subdomain of ``tumblr.com`` and as the
            name of the output directory (created if missing).
        post_id: Id of the post whose page is scanned for image URLs.

    Returns:
        A list with the final URL of every image that was saved. Each
        distinct image appears at most once, even if the page references it
        several times.

    Raises:
        OSError: If the output directory cannot be created.
    """
    post_url = "http://%s.tumblr.com/post/%s" % (blog, post_id)
    # Raw strings keep the regex escapes (\d, \w, \.) out of Python's own
    # string-escape processing.
    image_url_re = r"(http://\d+\.media\.tumblr\.com/\w+/tumblr_\w+_)\d+\.(jpg|gif|png)"
    file_name_re = r"(tumblr_\w+\.\w+)"
    image_sizes = (1280, 500, 400, 250, 100)

    # The pattern captures the URL without its size suffix, so one image
    # embedded at several sizes yields identical matches. Keep the first
    # occurrence of each so the same file is not fetched repeatedly.
    image_urls = []
    seen = set()
    for match in re.findall(image_url_re, get(post_url).text):
        if match not in seen:
            seen.add(match)
            image_urls.append(match)

    # Create the directory and tolerate it already existing; this avoids the
    # gap between a separate existence check and the creation call.
    try:
        os.makedirs(blog)
    except OSError:
        if not os.path.isdir(blog):
            raise

    downloaded_images = []
    for prefix, extension in image_urls:
        for size in image_sizes:
            request_url = "%s%s.%s" % (prefix, size, extension)
            resp = get(request_url, stream=True)
            try:
                if resp.status_code != 200:
                    continue
                # Prefer the final URL; fall back to the requested one, which
                # always contains a tumblr_* file name, if the final URL does
                # not match.
                name_match = (re.search(file_name_re, resp.url)
                              or re.search(file_name_re, request_url))
                file_name = name_match.group(1)
                with open(os.path.join(blog, file_name), 'wb') as f:
                    for chunk in resp.iter_content(30000):
                        f.write(chunk)
                downloaded_images.append(resp.url)
            finally:
                # Streamed responses hold their connection until closed.
                resp.close()
            # Reached only after a successful save: skip the smaller sizes.
            break
    return downloaded_images
def download(blog, date1, date2, progress=False):
    """Download the images of every photo post of *blog* between two months.

    Args:
        blog: Blog name, passed on to ``get_posts`` and ``get_images``.
        date1: One end of the month range; anything ``month_list`` accepts.
        date2: The other end of the month range.
        progress: When true, print each month (year and month number) and
            each post id as it is processed.

    Returns:
        A list combining, in processing order, the lists returned by
        ``get_images`` for every post.
    """
    downloaded_images = []
    for month in month_list(date1, date2):
        if progress:
            # A single pre-formatted string prints identically whether print
            # is a statement (Python 2) or a function (Python 3).
            print("%s %s" % (month.year, month.month))
        for post in get_posts(blog, month):
            if progress:
                print(" " + str(post))
            downloaded_images.extend(get_images(blog, post))
    return downloaded_images
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement