Advertisement
Sixem

instaripper.py

Jul 18th, 2016
353
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 9.18 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2. """
  3. instaripper.py
  4.  
  5. usage instaripper.py [-h] instagram directory
  6.  
  7. @tobe81cwb
  8. """
  9.  
  10. import argparse
  11. import json
  12. import logging
  13.  
  14. import threading
  15.  
  16. try:
  17.     import queue
  18. except ImportError:
  19.     import Queue as queue
  20.  
  21. try:
  22.     from urllib import parse
  23.     from urllib.request import urlopen, urlretrieve
  24.     from urllib.error import HTTPError
  25.     from urllib.parse import urlparse
  26.  
  27. except ImportError:
  28.     import urlparse as parse
  29.     from urllib import urlretrieve
  30.     from urllib2 import urlopen
  31.     from urllib2 import HTTPError
  32.     from urlparse import urlparse
  33.  
  34. import os
  35. import re
  36.  
  37. logger = logging.getLogger(__name__)
  38.  
  39.  
  40. class InstaRipper(object):
  41.  
  42.     user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/' \
  43.                  '537.36 (KHTML, like Gecko) Chrome/' \
  44.                  '51.0.2704.103 Safari/537.36'
  45.  
  46.     profile_info = None
  47.  
  48.     def __init__(self, username, directory):
  49.         self.username = username
  50.         self.directory = directory
  51.  
  52.         # capture profile information of the instagram
  53.         self.capture_profile_info()
  54.  
  55.         # download queue
  56.         self.q = queue.Queue()
  57.  
  58.         # downloaded media (for progress)
  59.         self.downloaded = 0
  60.  
  61.     def check_dir(self):
  62.         """
  63.        Check if directory exist, and creates if necessary
  64.        """
  65.         if not os.path.isdir(self.directory):
  66.             os.makedirs(self.directory)
  67.  
  68.     def get_profile_url(self):
  69.         """
  70.        Get the full URL of instagram
  71.        """
  72.         return parse.urljoin('https://instagram.com', self.username)
  73.  
  74.     def capture_shared_data(self, cursor=None):
  75.         """
  76.        Capture the shared data (json object) from instagram page
  77.        :param cursor: Instagram cursor / page
  78.        :return: Shared data as object
  79.        """
  80.         url = self.get_profile_url() if not cursor else self.get_profile_url() + '/?max_id={0}'.format(cursor)
  81.         logger.debug(u'Capturing shared data of {0}'.format(url))
  82.         try:
  83.             if not cursor:
  84.                 url = self.get_profile_url()
  85.  
  86.             # compatible with python 2 and 3
  87.             response = urlopen(url)
  88.             html = response.read()
  89.             response.close()
  90.  
  91.         except HTTPError:
  92.             raise InstaRipperProfileNotFoundError
  93.  
  94.         p = re.compile(r'window\._sharedData\s?=\s?({.*});')
  95.         shared_data = re.search(p, html.decode('utf-8')).group(1)
  96.  
  97.         return json.loads(shared_data)
  98.  
  99.     def capture_profile_info(self):
  100.         """
  101.        Capture the profile information of instagram account
  102.        :return: Object with Instagram Profile
  103.        """
  104.         logger.info(u'Capturing profile info')
  105.         self.profile_info = self.capture_shared_data()['entry_data']['ProfilePage'][0]['user']
  106.  
  107.     def get_media_info(self, media_code):
  108.         """
  109.        Get JSON from media
  110.        :param media_code: Code of the instagram media
  111.        :return: Object with media info
  112.        """
  113.         url = 'https://www.instagram.com/p/{code}/?taken-by={username}&__a=1'.format(
  114.             code=media_code, username=self.username)
  115.  
  116.         # compatible with python 2 and 3
  117.         response = urlopen(url)
  118.         html = response.read()
  119.         response.close()
  120.  
  121.         return json.loads(html.decode('utf-8'))
  122.  
  123.     def download_media(self, media_code, overwrite=False):
  124.         """
  125.        Download a photo or video from instagram
  126.        :param media_code: Instagram media code
  127.        :param overwrite: Should overwrite local files?
  128.        """
  129.         info = self.get_media_info(media_code)
  130.  
  131.         url = info['media']['display_src'] if not info['media']['is_video'] else info['media']['video_url']
  132.         filename = os.path.join(
  133.             self.directory,
  134.             os.path.basename(urlparse(url).path)
  135.         )
  136.  
  137.         if overwrite or not os.path.isfile(filename):
  138.             # compatible with python 2 and 3
  139.             urlretrieve(url, filename)
  140.  
  141.     def capture_media_codes(self, cursor=None):
  142.         """
  143.        Capture all media codes from instagram page
  144.        :param cursor: Instagram cursor / page
  145.        :return: List of media codes
  146.        """
  147.         logger.debug(u'Capturing page on {0} position'.format(cursor))
  148.         media_info = self.capture_shared_data(cursor)['entry_data']['ProfilePage'][0]['user']['media']
  149.         nodes = media_info['nodes']
  150.         page_info = media_info['page_info']
  151.  
  152.         media_code = map(lambda node: node['code'], nodes)
  153.         next_cursor = page_info['end_cursor'] if page_info['has_next_page'] else None
  154.  
  155.         return media_code, next_cursor
  156.  
  157.     def get_full_name(self):
  158.         """
  159.        Get full name of instagram user
  160.        :return: Full name of instagram user
  161.        """
  162.         return self.profile_info['full_name']
  163.  
  164.     def get_posts_count(self):
  165.         """
  166.        Get total posts of instagram user
  167.        :return: Total posts
  168.        """
  169.         return self.profile_info['media']['count']
  170.  
  171.     def download_thread_producer(self, download_threads=10, overwrite=False):
  172.         """
  173.        Initiate the download threads
  174.        :param download_threads: Number of download threads
  175.        :param overwrite: Should overwrite local files?
  176.        """
  177.         logger.info(u'Capturing media files')
  178.         logger.info(u'User {user} ({name}) has {total_posts} posts on Instagram.'
  179.                     .format(user=self.username,
  180.                             name=self.get_full_name(),
  181.                             total_posts=self.get_posts_count()))
  182.  
  183.         # create the threads
  184.         threads = []
  185.         for i in range(download_threads):
  186.             t = threading.Thread(target=self.download_thread_consumer, args=(overwrite,))
  187.             t.start()
  188.             threads.append(t)
  189.  
  190.         # get media codes and put on queue
  191.         cursor = None
  192.         while True:
  193.             page_medias = self.capture_media_codes(cursor)
  194.             if page_medias[0]:
  195.                 for media in page_medias[0]:
  196.                     self.q.put(media)
  197.  
  198.             cursor = page_medias[1]
  199.  
  200.             if not cursor:
  201.                 break
  202.  
  203.         # wait for all threads to finish
  204.         self.q.join()
  205.         for i in range(download_threads):
  206.             self.q.put(None)
  207.  
  208.         for t in threads:
  209.             t.join()
  210.  
  211.     def download_thread_consumer(self, overwrite=False):
  212.         """
  213.        Work on download queue
  214.        :param overwrite: Should overwrite local files?
  215.        """
  216.         while True:
  217.             media_code = self.q.get()
  218.             if not media_code:
  219.                 break
  220.  
  221.             logger.debug(u'Downloading media {0}'.format(media_code))
  222.             self.download_media(media_code, overwrite)
  223.             self.downloaded += 1
  224.             logger.info(u'Downloaded media {0}/{1}'.format(
  225.                 self.downloaded, self.get_posts_count()
  226.             ))
  227.  
  228.             self.q.task_done()
  229.  
  230.     def download_all_medias(self, download_threads=10, overwrite=False):
  231.         """
  232.        Download all media files from instagram account
  233.        :param download_threads: Number of download threads
  234.        :param overwrite: Should overwrite local files?
  235.        """
  236.         if self.profile_info['blocked_by_viewer'] or self.profile_info['country_block']:
  237.             raise InstaRipperBlockedAccountError
  238.  
  239.         elif self.profile_info['is_private']:
  240.             raise InstaRipperPrivateAccountError
  241.  
  242.         else:
  243.             if not os.path.isdir(self.directory):
  244.                 os.makedirs(self.directory)
  245.  
  246.             self.download_thread_producer(download_threads, overwrite)
  247.  
  248.  
  249. class InstaRipperProfileNotFoundError(Exception):
  250.     """Raised if the profile is a invalid profile"""
  251.  
  252.  
  253. class InstaRipperBlockedAccountError(Exception):
  254.     """Raised if the profile account is blocked for user"""
  255.  
  256.  
  257. class InstaRipperPrivateAccountError(Exception):
  258.     """Raised if the profile is private"""
  259.  
  260.  
  261. def init_logger():
  262.     global logger
  263.  
  264.     logger.setLevel(logging.INFO)
  265.     formatter = logging.Formatter('[%(asctime)s][%(levelname)s] - %(message)s')
  266.  
  267.     # console logger
  268.     ch = logging.StreamHandler()
  269.     ch.setFormatter(formatter)
  270.     logger.addHandler(ch)
  271.  
  272.  
  273. def main():
  274.     # parse arguments
  275.     parser = argparse.ArgumentParser(description='InstaRipper')
  276.     parser.add_argument('username', help='Instagram username')
  277.     parser.add_argument('directory', help='Where to save the images')
  278.     parser.add_argument('-t', '--threads', type=int, default=10, help='Download threads')
  279.     parser.add_argument('--overwrite', action='store_true', help='Overwrite existing files')
  280.  
  281.     args = parser.parse_args()
  282.     init_logger()
  283.  
  284.     try:
  285.         ripper = InstaRipper(args.username, args.directory)
  286.         ripper.download_all_medias(download_threads=args.threads, overwrite=args.overwrite)
  287.  
  288.     except InstaRipperProfileNotFoundError:
  289.         logger.error(u'Invalid profile. Check the profile name and try again.')
  290.  
  291.     except InstaRipperPrivateAccountError:
  292.         logger.error(u'This account is private.')
  293.  
  294.     except InstaRipperBlockedAccountError:
  295.         logger.error(u'This account is blocked for you.')
  296.  
  297.  
  298. if __name__ == '__main__':
  299.     main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement