Guest User

entry_data ( Facebook | Error )

a guest
Feb 16th, 2018
104
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 21.98 KB | None | 0 0
  1. import re
  2. import random
  3.  
  4. """Module that handles the like features"""
  5. from math import ceil
  6. from re import findall
  7. from selenium.webdriver.common.keys import Keys
  8. from selenium.common.exceptions import WebDriverException
  9.  
  10. from .time_util import sleep
  11. from .util import update_activity
  12. from .util import add_user_to_blacklist
  13. from .util import click_element
  14.  
  15.  
  16. def get_links_from_feed(browser, amount, num_of_search, logger):
  17.     """Fetches random number of links from feed and returns a list of links"""
  18.  
  19.     browser.get('https://www.instagram.com')
  20.     # update server calls
  21.     update_activity()
  22.     sleep(2)
  23.  
  24.     for i in range(num_of_search + 1):
  25.         browser.execute_script(
  26.             "window.scrollTo(0, document.body.scrollHeight);")
  27.         sleep(2)
  28.  
  29.     # get links
  30.     link_elems = browser.find_elements_by_xpath(
  31.         "//article/div[2]/div[2]/a")
  32.  
  33.     total_links = len(link_elems)
  34.     logger.info("Total of links feched for analysis: {}".format(total_links))
  35.     links = []
  36.     try:
  37.         if link_elems:
  38.             links = [link_elem.get_attribute('href') for link_elem in link_elems]
  39.             logger.info("~~~~~~~~~~~~~~~~~~~~~~~~~~~")
  40.             for i, link in enumerate(links):
  41.                 print(i, link)
  42.             logger.info("~~~~~~~~~~~~~~~~~~~~~~~~~~~")
  43.  
  44.     except BaseException as e:
  45.         logger.error("link_elems error {}".format(str(e)))
  46.  
  47.     return links
  48.  
  49.  
  50. def get_links_for_location(browser,
  51.                            location,
  52.                            amount,
  53.                            logger,
  54.                            media=None,
  55.                            skip_top_posts=True):
  56.  
  57.     """Fetches the number of links specified
  58.    by amount and returns a list of links"""
  59.     if media is None:
  60.         # All known media types
  61.         media = ['', 'Post', 'Video']
  62.     elif media == 'Photo':
  63.         # Include posts with multiple images in it
  64.         media = ['', 'Post']
  65.     else:
  66.         # Make it an array to use it in the following part
  67.         media = [media]
  68.  
  69.     browser.get('https://www.instagram.com/explore/locations/' + location)
  70.     # update server calls
  71.     update_activity()
  72.     sleep(2)
  73.  
  74.     # clicking load more
  75.     body_elem = browser.find_element_by_tag_name('body')
  76.     sleep(2)
  77.  
  78.     abort = True
  79.     try:
  80.         load_button = body_elem.find_element_by_xpath(
  81.             '//a[contains(@class, "_1cr2e _epyes")]')
  82.     except:
  83.         try:
  84.             # scroll down to load posts
  85.             for i in range(int(ceil(amount/12))):
  86.                 browser.execute_script(
  87.                     "window.scrollTo(0, document.body.scrollHeight);")
  88.                 sleep(2)
  89.         except:
  90.             logger.warning(
  91.                 'Load button not found, working with current images!')
  92.         else:
  93.             abort = False
  94.             body_elem.send_keys(Keys.END)
  95.             sleep(2)
  96.             # update server calls
  97.             update_activity()
  98.     else:
  99.         abort = False
  100.         body_elem.send_keys(Keys.END)
  101.         sleep(2)
  102.         click_element(browser, load_button) # load_button.click()
  103.         # update server calls
  104.         update_activity()
  105.  
  106.     body_elem.send_keys(Keys.HOME)
  107.     sleep(1)
  108.  
  109.     # Get links
  110.     if skip_top_posts:
  111.         main_elem = browser.find_element_by_xpath('//main/article/div[2]')
  112.     else:
  113.         main_elem = browser.find_element_by_tag_name('main')
  114.  
  115.     link_elems = main_elem.find_elements_by_tag_name('a')
  116.     total_links = len(link_elems)
  117.     links = [link_elem.get_attribute('href') for link_elem in link_elems
  118.              if link_elem.text in media]
  119.     filtered_links = len(links)
  120.  
  121.     while (filtered_links < amount) and not abort:
  122.         amount_left = amount - filtered_links
  123.         # Average items of the right media per page loaded
  124.         new_per_page = ceil(12 * filtered_links / total_links)
  125.         if new_per_page == 0:
  126.             # Avoid division by zero
  127.             new_per_page = 1. / 12.
  128.         # Number of page load needed
  129.         new_needed = int(ceil(amount_left / new_per_page))
  130.  
  131.         if new_needed > 12:
  132.             # Don't go bananas trying to get all of instagram!
  133.             new_needed = 12
  134.  
  135.         for i in range(new_needed):  # add images x * 12
  136.             # Keep the latest window active while loading more posts
  137.             before_load = total_links
  138.             body_elem.send_keys(Keys.END)
  139.             # update server calls
  140.             update_activity()
  141.             sleep(1)
  142.             body_elem.send_keys(Keys.HOME)
  143.             sleep(1)
  144.             link_elems = main_elem.find_elements_by_tag_name('a')
  145.             total_links = len(link_elems)
  146.             abort = (before_load == total_links)
  147.             if abort:
  148.                 break
  149.  
  150.         links = [link_elem.get_attribute('href') for link_elem in link_elems
  151.                  if link_elem.text in media]
  152.         filtered_links = len(links)
  153.  
  154.     return links[:amount]
  155.  
  156.  
  157. def get_links_for_tag(browser,
  158.                       tag,
  159.                       amount,
  160.                       logger,
  161.                       media=None,
  162.                       skip_top_posts=True):
  163.     """Fetches the number of links specified
  164.    by amount and returns a list of links"""
  165.     if media is None:
  166.         # All known media types
  167.         media = ['', 'Post', 'Video']
  168.     elif media == 'Photo':
  169.         # Include posts with multiple images in it
  170.         media = ['', 'Post']
  171.     else:
  172.         # Make it an array to use it in the following part
  173.         media = [media]
  174.    
  175.     browser.get('https://www.instagram.com/explore/tags/'
  176.                 + (tag[1:] if tag[:1] == '#' else tag))
  177.     # update server calls
  178.     update_activity()
  179.     sleep(2)
  180.  
  181.     # clicking load more
  182.     body_elem = browser.find_element_by_tag_name('body')
  183.     sleep(2)
  184.    
  185.     abort = True
  186.    
  187.     # Get links
  188.     if skip_top_posts:
  189.         main_elem = browser.find_element_by_xpath('//main/article/div[2]')
  190.     else:
  191.         main_elem = browser.find_element_by_tag_name('main')
  192.     total_links = 0
  193.     links = []
  194.     filtered_links = 0
  195.     try_again = 0
  196.     default_load = 21 if not skip_top_posts else 12
  197.    
  198.     while filtered_links < amount:
  199.         if amount >= default_load:
  200.             if filtered_links >= default_load:
  201.                 for i in range(3):
  202.                     browser.execute_script(
  203.                         "window.scrollTo(0, document.body.scrollHeight);")
  204.                     update_activity()
  205.                     sleep(1.5)
  206.         link_elems = main_elem.find_elements_by_tag_name('a')
  207.         if not link_elems:
  208.             main_elem2 = browser.find_element_by_xpath('//main/article/div[1]')
  209.             link_elems = main_elem2.find_elements_by_tag_name('a')
  210.         total_links += len(link_elems)
  211.        
  212.         try:
  213.             if link_elems:
  214.                 new_links = [link_elem.get_attribute('href') for link_elem in link_elems
  215.                          if link_elem and link_elem.text in media]
  216.                 for new_link in new_links:
  217.                     links.append(new_link)
  218.                              
  219.                 links_all = links
  220.                 s = set()
  221.                 links = []
  222.                 for i in links_all:
  223.                     if i not in s:
  224.                         s.add(i)
  225.                         links.append(i)
  226.            
  227.                 if len(links) == filtered_links:
  228.                     try_again += 1
  229.                     if try_again > 1 :
  230.                         logger.info("This tag has less pictures than intended..")
  231.                         break
  232.                 else:
  233.                     filtered_links = len(links)
  234.                     try_again = 0
  235.                 if filtered_links < default_load and amount > filtered_links:
  236.                     logger.info("This tag has so less pictures than expected...")
  237.                     break
  238.             else:
  239.                 logger.warning("This tag does not contain a picture")
  240.                 break
  241.                    
  242.         except BaseException as e:
  243.             logger.error("link_elems error {}".format(str(e)))
  244.             break
  245.                            
  246.                            
  247.     while (filtered_links < amount) and not abort:
  248.         amount_left = amount - filtered_links
  249.         # Average items of the right media per page loaded
  250.         new_per_page = ceil(12 * filtered_links / total_links)
  251.         if new_per_page == 0:
  252.             # Avoid division by zero
  253.             new_per_page = 1. / 12.
  254.         # Number of page load needed
  255.         new_needed = int(ceil(amount_left / new_per_page))
  256.  
  257.         if new_needed > 12:
  258.             # Don't go bananas trying to get all of instagram!
  259.             new_needed = 12
  260.                                                    
  261.         for i in range(new_needed):  # add images x * 12
  262.             # Keep the latest window active while loading more posts
  263.             before_load = total_links
  264.             body_elem.send_keys(Keys.END)
  265.             # update server calls
  266.             update_activity()
  267.             sleep(1)
  268.             body_elem.send_keys(Keys.HOME)
  269.             sleep(1)
  270.             link_elems = main_elem.find_elements_by_tag_name('a')
  271.             total_links = len(link_elems)
  272.             abort = (before_load == total_links)
  273.             if abort:
  274.                 break
  275.                                                                        
  276.         links = [link_elem.get_attribute('href') for link_elem in link_elems
  277.                 if link_elem.text in media]
  278.         filtered_links = len(links)
  279.                                                                            
  280.     return links[:amount]
  281.  
  282. def get_links_for_username(browser,
  283.                            username,
  284.                            amount,
  285.                            logger,
  286.                            randomize=False,
  287.                            media=None):
  288.  
  289.     """Fetches the number of links specified
  290.    by amount and returns a list of links"""
  291.     if media is None:
  292.         # All known media types
  293.         media = ['', 'Post', 'Video']
  294.     elif media == 'Photo':
  295.         # Include posts with multiple images in it
  296.         media = ['', 'Post']
  297.     else:
  298.         # Make it an array to use it in the following part
  299.         media = [media]
  300.  
  301.     logger.info('Getting {} image list...'.format(username))
  302.  
  303.     # Get  user profile page
  304.     browser.get('https://www.instagram.com/' + username)
  305.     # update server calls
  306.     update_activity()
  307.  
  308.     body_elem = browser.find_element_by_tag_name('body')
  309.  
  310.     try:
  311.         is_private = body_elem.find_element_by_xpath(
  312.             '//h2[@class="_kcrwx"]')
  313.     except:
  314.         logger.info('Interaction begin...')
  315.     else:
  316.         if is_private:
  317.             logger.warning('This user is private...')
  318.             return False
  319.  
  320.     abort = True
  321.  
  322.     try:
  323.         load_button = body_elem.find_element_by_xpath(
  324.             '//a[contains(@class, "_1cr2e _epyes")]')
  325.     except:
  326.         try:
  327.             # scroll down to load posts
  328.             for i in range(int(ceil(amount/12))):
  329.                 browser.execute_script(
  330.                     "window.scrollTo(0, document.body.scrollHeight);")
  331.                 sleep(2)
  332.         except:
  333.             logger.warning(
  334.                 'Load button not found, working with current images!')
  335.         else:
  336.             abort = False
  337.             body_elem.send_keys(Keys.END)
  338.             sleep(2)
  339.             # update server calls
  340.             update_activity()
  341.     else:
  342.         abort = False
  343.         body_elem.send_keys(Keys.END)
  344.         sleep(2)
  345.         click_element(browser, load_button) # load_button.click()
  346.         # update server calls
  347.         update_activity()
  348.  
  349.     body_elem.send_keys(Keys.HOME)
  350.     sleep(2)
  351.  
  352.     # Get Links
  353.     main_elem = browser.find_element_by_tag_name('main')
  354.     link_elems = main_elem.find_elements_by_tag_name('a')
  355.     total_links = len(link_elems)
  356.     links = []
  357.     filtered_links = 0
  358.     try:
  359.         if link_elems:
  360.             links = [link_elem.get_attribute('href') for link_elem in link_elems
  361.                      if link_elem and link_elem.text in media]
  362.             filtered_links = len(links)
  363.  
  364.     except BaseException as e:
  365.         logger.error("link_elems error {}}".format(str(e)))
  366.  
  367.     if randomize:
  368.         # Expanding the pooulation for better random distribution
  369.         amount = amount * 5
  370.  
  371.     while (filtered_links < amount) and not abort:
  372.         amount_left = amount - filtered_links
  373.         # Average items of the right media per page loaded
  374.         new_per_page = ceil(12 * filtered_links / total_links)
  375.         if new_per_page == 0:
  376.             # Avoid division by zero
  377.             new_per_page = 1. / 12.
  378.         # Number of page load needed
  379.         new_needed = int(ceil(amount_left / new_per_page))
  380.  
  381.         if new_needed > 12:
  382.             # Don't go bananas trying to get all of instagram!
  383.             new_needed = 12
  384.  
  385.         for i in range(new_needed):  # add images x * 12
  386.             # Keep the latest window active while loading more posts
  387.             before_load = total_links
  388.             body_elem.send_keys(Keys.END)
  389.             # update server calls
  390.             update_activity()
  391.             sleep(1)
  392.             body_elem.send_keys(Keys.HOME)
  393.             sleep(1)
  394.             link_elems = main_elem.find_elements_by_tag_name('a')
  395.             total_links = len(link_elems)
  396.             abort = (before_load == total_links)
  397.             if abort:
  398.                 break
  399.  
  400.         links = [link_elem.get_attribute('href') for link_elem in link_elems
  401.                  if link_elem.text in media]
  402.         filtered_links = len(links)
  403.  
  404.     if randomize:
  405.         # Shuffle the population index
  406.         links = random.sample(links, filtered_links)
  407.  
  408.     return links[:amount]
  409.  
  410.  
  411. def check_link(browser,
  412.                link,
  413.                dont_like,
  414.                ignore_if_contains,
  415.                ignore_users,
  416.                username,
  417.                like_by_followers_upper_limit,
  418.                like_by_followers_lower_limit,
  419.                logger):
  420.  
  421.     browser.get(link)
  422.     # update server calls
  423.     update_activity()
  424.     sleep(2)
  425.  
  426.     """Check if the Post is Valid/Exists"""
  427.     try:
  428.         post_page = browser.execute_script(
  429.             "return window._sharedData.entry_data.PostPage")
  430.     except WebDriverException:   #selenium Exception
  431.         try:
  432.             #refresh page (you would refresh twice (or more), too)
  433.             #browser.get(link)  #method 1, when page is not loaded properly, it is not expected to reload. must be navigated to first
  434.             browser.execute_script("location.reload()")   #mehod 2, page loaded properly, can be reloaded
  435.             post_page = browser.execute_script(
  436.                 "return window._sharedData.entry_data.PostPage")
  437.         except WebDriverException:
  438.             post_page = None
  439.     if post_page is None:
  440.         logger.warning('Unavailable Page: {}'.format(link.encode('utf-8')))
  441.         return True, None, None, 'Unavailable Page'
  442.  
  443.     """Gets the description of the link and checks for the dont_like tags"""
  444.     graphql = 'graphql' in post_page[0]
  445.     if graphql:
  446.         media = post_page[0]['graphql']['shortcode_media']
  447.         is_video = media['is_video']
  448.         user_name = media['owner']['username']
  449.         image_text = media['edge_media_to_caption']['edges']
  450.         image_text = image_text[0]['node']['text'] if image_text else None
  451.         owner_comments = browser.execute_script('''
  452.      latest_comments = window._sharedData.entry_data.PostPage[0].graphql.shortcode_media.edge_media_to_comment.edges;
  453.      if (latest_comments === undefined) latest_comments = Array();
  454.      owner_comments = latest_comments
  455.        .filter(item => item.node.owner.username == '{}')
  456.        .map(item => item.node.text)
  457.        .reduce((item, total) => item + '\\n' + total, '');
  458.      return owner_comments;
  459.    '''.format(user_name))
  460.     else:
  461.         media = post_page[0]['media']
  462.         is_video = media['is_video']
  463.         user_name = media['owner']['username']
  464.         image_text = media['caption']
  465.         owner_comments = browser.execute_script('''
  466.      latest_comments = window._sharedData.entry_data.PostPage[0].media.comments.nodes;
  467.      if (latest_comments === undefined) latest_comments = Array();
  468.      owner_comments = latest_comments
  469.        .filter(item => item.user.username == '{}')
  470.        .map(item => item.text)
  471.        .reduce((item, total) => item + '\\n' + total, '');
  472.      return owner_comments;
  473.    '''.format(user_name))
  474.  
  475.     if owner_comments == '':
  476.         owner_comments = None
  477.  
  478.     """Append owner comments to description as it might contain further tags"""
  479.     if image_text is None:
  480.         image_text = owner_comments
  481.     elif owner_comments:
  482.         image_text = image_text + '\n' + owner_comments
  483.  
  484.     """If the image still has no description gets the first comment"""
  485.     if image_text is None:
  486.         if graphql:
  487.             image_text = media['edge_media_to_comment']['edges']
  488.             image_text = image_text[0]['node']['text'] if image_text else None
  489.         else:
  490.             image_text = media['comments']['nodes']
  491.             image_text = image_text[0]['text'] if image_text else None
  492.     if image_text is None:
  493.         image_text = "No description"
  494.  
  495.     logger.info('Image from: {}'.format(user_name.encode('utf-8')))
  496.  
  497.     """Find the number of followes the user has"""
  498.     if like_by_followers_upper_limit or like_by_followers_lower_limit:
  499.         userlink = 'https://www.instagram.com/' + user_name
  500.         browser.get(userlink)
  501.         # update server calls
  502.         update_activity()
  503.         sleep(1)
  504.         try:
  505.             num_followers = browser.execute_script(
  506.                 "return window._sharedData.entry_data."
  507.                 "ProfilePage[0].user.followed_by.count")
  508.         except WebDriverException:
  509.             try:
  510.                 browser.execute_script("location.reload()")
  511.                 num_followers = browser.execute_script(
  512.                     "return window._sharedData.entry_data."
  513.                     "ProfilePage[0].user.followed_by.count")
  514.             except WebDriverException:
  515.                 num_followers = 'undefined'
  516.                 like_by_followers_lower_limit = None
  517.                 like_by_followers_upper_limit = None
  518.         browser.get(link)
  519.         # update server calls
  520.         update_activity()
  521.         sleep(1)
  522.         logger.info('Number of Followers: {}'.format(num_followers))
  523.  
  524.         if like_by_followers_upper_limit and \
  525.            num_followers > like_by_followers_upper_limit:
  526.                 return True, user_name, is_video, \
  527.                     'Number of followers exceeds limit'
  528.  
  529.         if like_by_followers_lower_limit and \
  530.            num_followers < like_by_followers_lower_limit:
  531.                 return True, user_name, is_video, \
  532.                     'Number of followers does not reach minimum'
  533.  
  534.     logger.info('Link: {}'.format(link.encode('utf-8')))
  535.     logger.info('Description: {}'.format(image_text.encode('utf-8')))
  536.  
  537.     """Check if the user_name is in the ignore_users list"""
  538.     if (user_name in ignore_users) or (user_name == username):
  539.         return True, user_name, is_video, 'Username'
  540.  
  541.     if any((word in image_text for word in ignore_if_contains)):
  542.         return True, user_name, is_video, 'None'
  543.  
  544.     dont_like_regex = []
  545.  
  546.     for dont_likes in dont_like:
  547.         if dont_likes.startswith("#"):
  548.             dont_like_regex.append(dont_likes + "([^\d\w]|$)")
  549.         elif dont_likes.startswith("["):
  550.             dont_like_regex.append("#" + dont_likes[1:] + "[\d\w]+([^\d\w]|$)")
  551.         elif dont_likes.startswith("]"):
  552.             dont_like_regex.append("#[\d\w]+" + dont_likes[1:] + "([^\d\w]|$)")
  553.         else:
  554.             dont_like_regex.append(
  555.                 "#[\d\w]*" + dont_likes + "[\d\w]*([^\d\w]|$)")
  556.  
  557.     for dont_likes_regex in dont_like_regex:
  558.         quash = re.search(dont_likes_regex, image_text, re.IGNORECASE)
  559.         if quash:
  560.             quashed = (quash.group(0)).split('#')[1]
  561.             iffy = (re.split(r'\W+', dont_likes_regex))[3]
  562.             inapp_unit = ('Inappropriate! ~ contains \'{}\''.format(quashed) if quashed == iffy else
  563.                               'Inappropriate! ~ contains \'{}\' in \'{}\''.format(iffy, quashed))
  564.             return True, user_name, is_video, inapp_unit
  565.  
  566.     return False, user_name, is_video, 'None'
  567.  
  568.  
  569. def like_image(browser, username, blacklist, logger):
  570.     """Likes the browser opened image"""
  571.     like_elem = browser.find_elements_by_xpath(
  572.         "//a[@role='button']/span[text()='Like']/..")
  573.     liked_elem = browser.find_elements_by_xpath(
  574.         "//a[@role='button']/span[text()='Unlike']")
  575.  
  576.     if len(like_elem) == 1:
  577.         # sleep real quick right before clicking the element
  578.         sleep(2)
  579.         click_element(browser, like_elem[0])
  580.  
  581.         logger.info('--> Image Liked!')
  582.         update_activity('likes')
  583.         if blacklist['enabled'] is True:
  584.             action = 'liked'
  585.             add_user_to_blacklist(
  586.                 browser, username, blacklist['campaign'], action, logger
  587.             )
  588.         sleep(2)
  589.         return True
  590.     elif len(liked_elem) == 1:
  591.         logger.info('--> Already Liked!')
  592.         return False
  593.     else:
  594.         logger.info('--> Invalid Like Element!')
  595.         return False
  596.  
  597.  
  598. def get_tags(browser, url):
  599.     """Gets all the tags of the given description in the url"""
  600.     browser.get(url)
  601.     # update server calls
  602.     update_activity()
  603.     sleep(1)
  604.  
  605.     graphql = browser.execute_script(
  606.         "return ('graphql' in window._sharedData.entry_data.PostPage[0])")
  607.     if graphql:
  608.         image_text = browser.execute_script(
  609.             "return window._sharedData.entry_data.PostPage[0].graphql."
  610.             "shortcode_media.edge_media_to_caption.edges[0].node.text")
  611.     else:
  612.         image_text = browser.execute_script(
  613.             "return window._sharedData.entry_data."
  614.             "PostPage[0].media.caption.text")
  615.  
  616.     tags = findall(r'#\w*', image_text)
  617.     return tags
Advertisement
Add Comment
Please, Sign In to add comment