# -*- coding: utf-8 -*-
# based on https://github.com/yxw19870806/yxw19870806/blob/master/googlePlus/googlePlus.py
# license: WTFPL

import argparse
import json
import os
import re
import zlib
from multiprocessing.pool import ThreadPool

import requests
import slugify
import wget

URL = {
    'archive': 'https://get.google.com/albumarchive/{}',
    'album': 'https://get.google.com/albumarchive/{}/album/{}',
    'data': 'https://get.google.com/_/AlbumArchiveUi/data',
    'post_photos': 'https://get.google.com/albumarchive/{}/albums/photos-from-posts',
}

def find_between(text, first_string, last_string):
    # Return the slice of text between the last occurrence of first_string and
    # the last occurrence of last_string after it (to the end of text when
    # last_string is None); return "" if either marker is missing.
    try:
        start = text.rindex(first_string) + len(first_string)
        end = text.rindex(last_string, start) if last_string else None
        return text[start:end]
    except ValueError:
        return ""

def check_sub_key(needles, haystack):
    # Check that every needle is a key of the haystack dict.
    if not isinstance(needles, tuple):
        needles = (needles,)  # wrap a single key; tuple() would iterate it
    if isinstance(haystack, dict):
        for needle in needles:
            if needle not in haystack:
                return False
        return True
    return False

def parse_url(url):
    # Extract the user id and, if present, the album id from an archive URL.
    pattern = r'https://get\.google\.com/albumarchive/([^/]*)(?:/album/([^/#]*))?'
    match = re.search(pattern, url)
    if match:
        return match.group(1), match.group(2)
    return None, None

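# A quick illustration of parse_url (the ids below are made-up placeholders):
#   parse_url('https://get.google.com/albumarchive/1234567890/album/ABCDEF')
#     -> ('1234567890', 'ABCDEF')
#   parse_url('https://get.google.com/albumarchive/1234567890')
#     -> ('1234567890', None)
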
def get_list_of_albums(user_id):
    # Scrape the archive overview page and map album id -> album title.
    pattern = r'<h2\s*class="?[^">]*"\s*jsname="?[^">]*"\s*aria-label="?([^">]*)">\s*</h2>\s*<a\s*href="?([^">]*)"\s*class="?[^">]*">'
    url = URL['archive'].format(user_id)
    albums_list = {}

    matches = re.findall(pattern, requests.get(url).text)
    for match in matches:
        if '/album/' in match[1]:  # skip posts and profile albums
            albums_list[match[1].split('/album/')[1]] = match[0]

    return albums_list

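# Shape of the returned mapping (keys are album ids from the scraped hrefs,
# values are the aria-label titles; the values shown here are invented):
#   {'<album_id>': 'Holiday 2016', '<album_id>': 'Screenshots'}
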
def get_list_for_album(user_id, album_id):
    # Collect every image URL in an album, following the continuation token the
    # archive uses for paging. Returns (image_url_list, album_title); the title
    # falls back to the album id when it cannot be scraped from the page.
    album_page_url = URL['album'].format(user_id, album_id)
    image_url_list = []
    album_title = album_id

    album_page_response = requests.get(album_page_url)
    if album_page_response.status_code != 200:
        return image_url_list, album_title

    output = album_page_response.text
    title_match = re.findall(r'<div\s*class="?[^"]*">?([^<]*)</div><div\s*class="?[^"]*"><span\s*class="?[^"]*">?[^<]*</span>', output)
    if title_match:
        album_title = title_match[0]
    script_data = find_between(output, "AF_initDataCallback({key: 'ds:0'", "</script>")
    script_data = find_between(script_data, "return ", "}});")
    try:
        script_data = json.loads(script_data)
        user_key = script_data[4][0]
        continue_token = script_data[3]
        for data in script_data[4][1]:
            image_url_list.append(str(data[1]))
    except (ValueError, IndexError):
        return image_url_list, album_title

    while continue_token:
        post_data = {"f.req": '[[[113305010,[{"113305010":["%s",null,24,"%s"]}],null,null,0]]]' % (user_key, continue_token)}
        continue_image_page_response = requests.post(URL['data'], data=post_data)
        if continue_image_page_response.status_code != 200:
            break  # give up on paging when a continuation request fails
        continue_data = find_between(continue_image_page_response.text, ")]}'", None).strip()
        try:
            continue_data = json.loads(continue_data)
            continue_token = continue_data[0][2]["113305010"][3]
            for data in continue_data[0][2]["113305010"][4][1]:
                image_url_list.append(str(data[1]))
        except ValueError:
            image_url_list = []
            continue_token = ""
    return image_url_list, album_title

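# Hypothetical standalone use (ids are placeholders; needs network access):
#   urls, title = get_list_for_album('1234567890', 'ABCDEF')
#   print('{}: {} images'.format(title, len(urls)))
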
def get_photos_from_posts(user_id):
    # Fetch the "photos from posts" pseudo-album and extract per-post metadata.
    post_photos_url = URL['post_photos'].format(user_id)
    extra_info = {
        "is_error": True,
        "blog_info_list": [],
        "key": None,
        "json_data": None,
    }
    script_data = []  # keep defined even when the request or parse fails

    post_photos_response = requests.get(post_photos_url)
    if post_photos_response.status_code == 200:
        script_data = find_between(post_photos_response.text, "AF_initDataCallback({key: 'ds:0'", "</script>")
        script_data = find_between(script_data, "return ", "}});")
        try:
            script_data = json.loads(script_data)
        except ValueError:
            script_data = []
        else:
            extra_info["json_data"] = script_data
    if len(script_data) == 3:
        if script_data[1] is not None:
            extra_info["is_error"] = False
            for data in script_data[1]:
                extra_blog_info = {
                    "blog_id": None,
                    "blog_time": None,
                    "json_data": data,
                }
                if len(data) >= 2 and check_sub_key(("113305016",), data[1]) and len(data[1]["113305016"]) == 1 and len(data[1]["113305016"][0]) >= 5:
                    extra_blog_info["blog_id"] = str(data[1]["113305016"][0][0])
                    if isinstance(data[1]["113305016"][0][4], int):
                        extra_blog_info["blog_time"] = int(data[1]["113305016"][0][4] / 1000)  # ms -> s
                extra_info["blog_info_list"].append(extra_blog_info)
        extra_info["key"] = str(script_data[2])
    return extra_info

def download_file(params):
    # Worker for the thread pool: download a single URL into out_dir.
    url, silent, out_dir = params
    try:
        filename = wget.download(url, out=out_dir, bar=None)
        if not silent:
            print(filename)
    except Exception as error:
        print('ERROR {} ({})'.format(url, error))

def download_pool(list_of_files, out_dir, threads=8, silent=False):
    # Download every URL concurrently; each worker gets (url, silent, out_dir).
    with ThreadPool(processes=threads) as pool:
        pool.map(download_file, [(url, silent, out_dir) for url in list_of_files])

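# Minimal sketch of calling the pool directly (URL and directory invented;
# the target directory must exist first, as the main block does with makedirs):
#   os.makedirs('my_album', exist_ok=True)
#   download_pool(['https://lh3.googleusercontent.com/...'], 'my_album', threads=4)
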
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Google archive - download images from album.")
    parser.add_argument("-u", "--url", help="album url", nargs='+', default=[])
    parser.add_argument("-i", "--input-file", help="file with urls separated by new lines", default=None)
    parser.add_argument("-t", "--threads", default=8, help="number of download threads (default: 8)", type=int)
    parser.add_argument("-s", "--silent", action="store_true", help="no output to stdout")
    parser.add_argument("--assume-yes", action="store_true", help="assume yes for redownloading albums")
    parser.add_argument("--assume-no", action="store_true", help="assume no for redownloading albums")
    args = parser.parse_args()

    if args.input_file:
        with open(args.input_file) as file:
            for line in file:
                args.url.append(line.strip())

    for album_url in args.url:
        user_id, album_id = parse_url(album_url)
        if not album_id:
            print('Automatic download of all albums is not supported yet!')
            continue

        print('Gathering started')
        image_url_list, album_title = get_list_for_album(user_id, album_id)
        album_name = slugify.slugify(album_title, separator='_')
        album_dir = user_id + '-' + hex(zlib.crc32(album_id.encode('ascii')))[2:] + '-' + album_name
        if os.path.exists(album_dir):
            print('{} already exists!'.format(album_dir))
            if args.assume_no:
                continue
            elif args.assume_yes:
                user_input = 'y'
            else:
                user_input = input('Should I process anyway? [y/n]: ')

            if user_input in ['y', 'yes']:
                download_pool(image_url_list, album_dir, args.threads, args.silent)
            else:
                continue
        else:
            print('{} - download started'.format(album_dir))
            os.makedirs(album_dir)
            download_pool(image_url_list, album_dir, args.threads, args.silent)

    print('done.')
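
# Example invocations (the album URL is a placeholder, and the script name is
# whatever you saved this file as):
#   python google_album_dl.py -u 'https://get.google.com/albumarchive/1234567890/album/ABCDEF' -t 4 -s
#   python google_album_dl.py -i urls.txt --assume-no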