SHARE
TWEET

serp.py

a guest Dec 5th, 2019 97 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. import time
  2. import sys
  3. from selenium import webdriver
  4. from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  5. from selenium.webdriver.common.keys import Keys
  6. from selenium.webdriver.firefox.options import Options
  7. from selenium.webdriver.support import wait as Wait
  8. from selenium.webdriver.support.ui import WebDriverWait as Wait
  9. from selenium.webdriver.support import expected_conditions as EC
  10. from selenium.webdriver.common.by import By
  11. from selenium.common.exceptions import TimeoutException
  12. import pandas as pd
  13.  
  14. from bs4 import BeautifulSoup
  15.  
  16.  
  17. def magenta(text):
  18.     print('\033[35m', text, '\033[0m', sep='')
  19.  
  20.  
  21. class SeleniumCtrl:
  22.  
  23.     def __init__(self):
  24.  
  25.         options = Options()
  26.         options.add_argument('-headless')
  27.         cap = DesiredCapabilities().FIREFOX
  28.         cap["marionette"] = True
  29.         cap['loggingPrefs'] = {'browser': 'ALL'}
  30.  
  31.         '''INSERT YOUR PROFILE PATH!!'''
  32.         # profile = webdriver.FirefoxProfile("your path to/profile") # here it goes a firefox profile, setting up a saved profile is the best option
  33.  
  34.         profile = webdriver.FirefoxProfile(
  35.             "/home/frank/PycharmProjects/Seo/venv/")  # here it goes a firefox profile, setting up a saved profile is the best option
  36.         self.profile_zero = profile
  37.  
  38.         browser = webdriver.Firefox(executable_path=r"/home/frank/PycharmProjects/Seo/venv/geckodriver")
  39.         self.browser = browser
  40.  
  41.     def go_to_page(self, my_url):
  42.         driver = self.browser
  43.         driver.get(my_url)
  44.  
  45.     # it starts the program in Google page for search and perform search for keywords or address
  46.     def search_with_google(self, my_search_query):
  47.         driver = self.browser
  48.         driver.get("https://www.google.com")
  49.         google_bar = driver.find_element_by_css_selector(".gLFyf")
  50.         google_bar.send_keys(str(my_search_query).replace("b'", "").replace("'", ""))
  51.         google_bar.send_keys(Keys.ENTER)
  52.  
  53.         # get rid of privacy contract with google (it would block navigation)
  54.  
  55.     def get_rid_of_contract(self):
  56.         time.sleep(5)
  57.         driver = self.browser
  58.         driver_list = [driver]
  59.         for driver in driver_list:
  60.             driver.get(
  61.                 "https://consent.google.com/ui/?continue=https://www.google.com/&origin=https://www.google.com&if=1&gl=IT&hl=it&pc=s")
  62.             not_loaded = True
  63.             # print("check before while")
  64.  
  65.             while not_loaded:
  66.                 try:
  67.                     Wait(driver, 1).until(
  68.                         lambda browsed: browsed.find_element_by_css_selector('#yDmH0d').is_displayed())
  69.                     if driver.find_element_by_css_selector('#yDmH0d'):
  70.                         # print("page loaded")
  71.                         not_loaded = False
  72.                     else:
  73.                         print("page not loaded")
  74.                         not_loaded = True
  75.                 except:
  76.                     print("into except for loading page_5")
  77.                     not_loaded = True
  78.  
  79.             my_magic_button = driver.find_element_by_css_selector("#agreeButton")
  80.             my_page_body = driver.find_element_by_css_selector("body")
  81.             my_page_body.send_keys(Keys.END)
  82.             time.sleep(2)
  83.             my_magic_button.click()
  84.             time.sleep(2)
  85.  
  86.         return False
  87.  
  88.     def get_source(self):
  89.         driver = self.browser
  90.         my_raw_source = driver.page_source
  91.         return my_raw_source
  92.  
  93.     def wait_for_page_loaded(self):
  94.         try:
  95.             Wait(self.browser, 10).until(
  96.                 EC.presence_of_element_located((By.ID, "brs"))
  97.             )
  98.         except TimeoutException as e:
  99.             print(e, "connection takes too long", file=sys.stderr)
  100.             driver.quit_driver()
  101.  
  102.     def go_to_next_serp_page(self, parsed_html):
  103.         next_a = parsed_html.find("a", id="pnnext")
  104.         self.browser.get(f"https://www.google.com{next_a['href']}")
  105.         self.wait_for_page_loaded()
  106.         parsed_html = BeautifulSoup(self.browser.page_source, features="lxml")
  107.         h3_list = parsed_html.find_all("h3")
  108.         return parsed_html, h3_list
  109.  
  110.     def quit_driver(self):
  111.         print("closing application...")
  112.         self.browser.quit()
  113.  
  114.  
  115. if __name__ == "__main__":
  116.  
  117.     df = pd.DataFrame(None, columns=['page', 'title', 'link'])
  118.     print(df)
  119.  
  120.     driver = SeleniumCtrl()
  121.     driver.get_rid_of_contract()
  122.     #print("please insert a search keyword(s) or an url")
  123.     keys = pd.read_csv('keys.csv')
  124.     #print("please insert the specific string you're looking for into the results")
  125.     my_key = 'benessereevita.it'
  126.     for my_url in keys:
  127.         driver.search_with_google(my_url)
  128.  
  129.         # now it waits for page loading using selenium proper method
  130.         driver.wait_for_page_loaded()
  131.  
  132.         # here it starts processing the results
  133.         page = driver.get_source()
  134.  
  135.         # parse in beautiful soup
  136.         soup = BeautifulSoup(page, features="lxml")
  137.         my_h3 = soup.find_all("h3")
  138.  
  139.         '''ATM it processes still only ONE SERP page: the first'''
  140.         result = []
  141.  
  142.         page_to_parse = 10 # it must replaced with param got from command line or from config.cfg
  143.         page_number = 1
  144.         absolute_position = 1
  145.  
  146.         while page_to_parse >= 1:
  147.  
  148.             for item in my_h3:
  149.                 if item.parent.name == "a":
  150.                     if item and item.get_text():
  151.                         a_tag = item.parent
  152.                         absolute_position += 1
  153.                         if my_key in str(a_tag['href']):
  154.                             # create Pandas Series
  155.                             serie = pd.Series({'page': page_number, 'Pos': absolute_position, 'title': item.get_text(), 'link': a_tag['href']}) # at this point it still lacks of absolute index.
  156.  
  157.                             # inject Series into Pandas df
  158.                             df = df.append(serie, ignore_index=True)
  159.                 else:
  160.                     if item and item.get_text():
  161.                         absolute_position += 1
  162.  
  163.  
  164.             if page_to_parse <= 1:
  165.                 with pd.option_context('display.max_columns', None):
  166.                     df.to_csv('keypos.csv')
  167.  
  168.             page_number += 1
  169.             page_to_parse -= 1
  170.  
  171.             # here switch to next SERP page
  172.             soup, my_h3 = driver.go_to_next_serp_page(soup)
  173.  
  174.         # quit driver as job is done - eventually prompt for further researches
  175.         driver.quit_driver()
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top