Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
def get_links_for_tag(browser,
                      tag,
                      amount,
                      logger,
                      media=None,
                      skip_top_posts=True):
    """Collect up to ``amount`` unique post links from a hashtag page.

    Opens ``https://www.instagram.com/explore/tags/<tag>``, reads the ``href``
    of every anchor inside the posts container whose text matches one of the
    wanted media labels, and keeps scrolling to the bottom of the page until
    enough links were gathered or no new ones show up.

    Args:
        browser: Selenium WebDriver-like object. Only ``get``,
            ``find_element_by_xpath``, ``find_element_by_tag_name`` and
            ``execute_script`` are used.
        tag: Hashtag to open, with or without the leading ``#``.
        amount: Maximum number of links to return.
        logger: Object providing ``info``, ``warning`` and ``error``.
        media: Label filter applied to the anchor text. ``None`` accepts
            ``''``, ``'Post'`` and ``'Video'``; ``'Photo'`` accepts ``''``
            and ``'Post'``; any other value is matched literally.
        skip_top_posts: When true, only anchors under
            ``//main/article/div[2]`` are read (presumably the section below
            the "top posts" grid -- verify against the current page layout);
            otherwise the whole ``<main>`` element is searched.

    Returns:
        A list of at most ``amount`` hrefs, without duplicates, in the order
        they were first seen.
    """
    if media is None:
        # All known media types
        media = ['', 'Post', 'Video']
    elif media == 'Photo':
        # Include posts with multiple images in it
        media = ['', 'Post']
    else:
        # Make it a list so the membership test below works uniformly
        media = [media]

    browser.get('https://www.instagram.com/explore/tags/'
                + (tag[1:] if tag[:1] == '#' else tag))
    # update server calls
    update_activity()
    # Give the page time to render before querying it (the original paused
    # 2s + 2s around a <body> lookup that was only used by dead code).
    sleep(4)

    if skip_top_posts:
        main_elem = browser.find_element_by_xpath('//main/article/div[2]')
    else:
        main_elem = browser.find_element_by_tag_name('main')

    links = []
    seen = set()          # hrefs already stored in ``links``
    filtered_links = 0    # len(links) after the last pass that found new ones
    try_again = 0         # consecutive passes without any new link
    default_load = 21 if not skip_top_posts else 12

    while filtered_links < amount:
        # Once the initially loaded batch is exhausted, scroll to trigger
        # lazy loading. (filtered_links < amount holds here, so this also
        # implies amount > default_load.)
        if filtered_links >= default_load:
            for _ in range(3):
                browser.execute_script(
                    "window.scrollTo(0, document.body.scrollHeight);")
                update_activity()
                sleep(1.5)

        link_elems = main_elem.find_elements_by_tag_name('a')
        if not link_elems:
            # Fall back to the first section when the preferred one is empty
            fallback_elem = browser.find_element_by_xpath(
                '//main/article/div[1]')
            link_elems = fallback_elem.find_elements_by_tag_name('a')

        if not link_elems:
            logger.warning("This tag does not contain a picture")
            break

        try:
            # Reading text/href talks to the browser and may fail (e.g. when
            # an element went stale after scrolling). Build the full list
            # first so a failure leaves ``links`` untouched.
            new_links = [link_elem.get_attribute('href')
                         for link_elem in link_elems
                         if link_elem and link_elem.text in media]
        except Exception as e:
            logger.error("link_elems error {}".format(str(e)))
            break

        for new_link in new_links:
            if new_link not in seen:
                seen.add(new_link)
                links.append(new_link)

        if len(links) == filtered_links:
            # Nothing new this pass; allow one retry before giving up
            try_again += 1
            if try_again > 1:
                logger.info("This tag has less pictures than intended..")
                break
        else:
            filtered_links = len(links)
            try_again = 0

        if filtered_links < default_load and amount > filtered_links:
            # Fewer links than one initial page load: scrolling will not help
            logger.info("This tag has so less pictures than expected...")
            break

    return links[:amount]
Advertisement
Add Comment
Please, Sign In to add comment