Guest User

get_links_for_tag() UPDATE 2*

a guest
Feb 12th, 2018
612
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.42 KB | None | 0 0
  1. def get_links_for_tag(browser,
  2.                       tag,
  3.                       amount,
  4.                       logger,
  5.                       media=None,
  6.                       skip_top_posts=True):
  7.     """Fetches the number of links specified
  8.    by amount and returns a list of links"""
  9.     if media is None:
  10.         # All known media types
  11.         media = ['', 'Post', 'Video']
  12.     elif media == 'Photo':
  13.         # Include posts with multiple images in it
  14.         media = ['', 'Post']
  15.     else:
  16.         # Make it an array to use it in the following part
  17.         media = [media]
  18.  
  19.     browser.get('https://www.instagram.com/explore/tags/'
  20.                 + (tag[1:] if tag[:1] == '#' else tag))
  21.     # update server calls
  22.     update_activity()
  23.     sleep(2)
  24.  
  25.     # clicking load more
  26.     body_elem = browser.find_element_by_tag_name('body')
  27.     sleep(2)
  28.  
  29.     abort = True
  30.  
  31.     # Get links
  32.     if skip_top_posts:
  33.         main_elem = browser.find_element_by_xpath('//main/article/div[2]')
  34.     else:
  35.         main_elem = browser.find_element_by_tag_name('main')
  36.     total_links = 0
  37.     links = []
  38.     filtered_links = 0
  39.     try_again = 0
  40.     default_load = 21 if not skip_top_posts else 12
  41.  
  42.     while filtered_links < amount:
  43.         if amount >= default_load:
  44.             if filtered_links >= default_load:
  45.                 for i in range(3):
  46.                     browser.execute_script(
  47.                         "window.scrollTo(0, document.body.scrollHeight);")
  48.                     update_activity()
  49.                     sleep(1.5)
  50.         link_elems = main_elem.find_elements_by_tag_name('a')
  51.         if not link_elems:
  52.             main_elem2 = browser.find_element_by_xpath('//main/article/div[1]')
  53.             link_elems = main_elem2.find_elements_by_tag_name('a')
  54.         total_links += len(link_elems)
  55.        
  56.         try:
  57.             if link_elems:
  58.                 new_links = [link_elem.get_attribute('href') for link_elem in link_elems
  59.                          if link_elem and link_elem.text in media]
  60.                 for new_link in new_links:
  61.                     links.append(new_link)
  62.                
  63.                 links_all = links
  64.                 s = set()
  65.                 links = []
  66.                 for i in links_all:
  67.                     if i not in s:
  68.                         s.add(i)
  69.                         links.append(i)
  70.                
  71.                 if len(links) == filtered_links:
  72.                     try_again += 1
  73.                     if try_again > 1 :
  74.                         logger.info("This tag has less pictures than intended..")
  75.                         break
  76.                 else:
  77.                     filtered_links = len(links)
  78.                     try_again = 0
  79.                 if filtered_links < default_load and amount > filtered_links:
  80.                     logger.info("This tag has so less pictures than expected...")
  81.                     break
  82.             else:
  83.                 logger.warning("This tag does not contain a picture")
  84.                 break
  85.  
  86.         except BaseException as e:
  87.             logger.error("link_elems error {}".format(str(e)))
  88.             break
  89.  
  90.            
  91.     while (filtered_links < amount) and not abort:
  92.         amount_left = amount - filtered_links
  93.         # Average items of the right media per page loaded
  94.         new_per_page = ceil(12 * filtered_links / total_links)
  95.         if new_per_page == 0:
  96.             # Avoid division by zero
  97.             new_per_page = 1. / 12.
  98.         # Number of page load needed
  99.         new_needed = int(ceil(amount_left / new_per_page))
  100.  
  101.         if new_needed > 12:
  102.             # Don't go bananas trying to get all of instagram!
  103.             new_needed = 12
  104.  
  105.         for i in range(new_needed):  # add images x * 12
  106.             # Keep the latest window active while loading more posts
  107.             before_load = total_links
  108.             body_elem.send_keys(Keys.END)
  109.             # update server calls
  110.             update_activity()
  111.             sleep(1)
  112.             body_elem.send_keys(Keys.HOME)
  113.             sleep(1)
  114.             link_elems = main_elem.find_elements_by_tag_name('a')
  115.             total_links = len(link_elems)
  116.             abort = (before_load == total_links)
  117.             if abort:
  118.                 break
  119.  
  120.         links = [link_elem.get_attribute('href') for link_elem in link_elems
  121.                  if link_elem.text in media]
  122.         filtered_links = len(links)
  123.    
  124.     return links[:amount]
Advertisement
Add Comment
Please, Sign In to add comment