Advertisement
Guest User

Untitled

a guest
Dec 27th, 2019
328
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.01 KB | None | 0 0
import csv
from random import randint
from time import sleep

from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

  13. #todo
  14. # https://selenium-python.readthedocs.io/waits.html
  15. # Also try looking at webdriverwait, which helps with waiting until an element is found, and throws an Exception if the element is not found
  16. # Why is this script giving double links?
  17.  
  18.  
  19.  
  20. f = csv.writer(open('ebay_watches.csv', 'w'))
  21. f.writerow(['title', 'price', 'numSold'])
  22.  
  23.  
  24. pages = []
  25.  
  26. for i in range(0, 999):
  27.     options = Options()
  28.     ua = UserAgent()
  29.     a = ua.random
  30.     user_agent = ua.random
  31.     print(user_agent)
  32.     options.add_argument(f'user-agent={user_agent}')
  33.     driver = webdriver.Chrome('/Users/kenny/Dropbox/Python/WebScrapping/Others/chromedriver')
  34.     driver.get('https://www.ebay.com/sch/i.html?_from=R40&_nkw=watches&_sacat=0&_pgn=' + str(i))
  35.     soup = BeautifulSoup(driver.page_source, 'lxml')
  36.     driver.maximize_window()
  37.  
  38.     tempList = []
  39.  
  40.     for link in soup.find_all('a', href=True):
  41.         if 'itm' in link['href']:
  42.             print(link['href'])
  43.             tempList.append(link['href'])
  44.  
  45.     array_length = len(tempList)
  46.  
  47.     for i in range(array_length):
  48.         driver.get(tempList[i])
  49.         timeout = 5
  50.  
  51.         try:
  52.             element_present = EC.presence_of_element_located((By.XPATH, '//*[@id="itemTitle"]'))
  53.             WebDriverWait(driver, timeout).until(element_present)
  54.         except TimeoutException:
  55.             print("Timed out waiting for page to load")
  56.         try:
  57.             title = driver.find_element_by_xpath('//*[@id="itemTitle"]').text
  58.         except Exception as e:
  59.             title = ""
  60.         try:
  61.             price = driver.find_element_by_xpath('//*[@id="prcIsum"]').text.strip().split()
  62.         except Exception as e:
  63.             print(e)
  64.             try:
  65.                 price = driver.find_element_by_xpath('//*[@id="mm-saleDscPrc"]').text
  66.             except Exception as e:
  67.                 print(e)
  68.                 price = ""
  69.         soup = BeautifulSoup(driver.page_source, 'lxml')
  70.         try:
  71.             total_sold_price = soup.find('span', {'class': 'vi-qtyS-hot-red'}).text
  72.         except Exception as e:
  73.             try:
  74.                 total_sold_price = soup.find('a', {'class': 'vi-txt-underline'}).text
  75.             except Exception as e:
  76.                 print(e)
  77.                 total_sold_price = ""
  78.  
  79.  
  80.         print("title: ", title)
  81.         print("price: ", price)
  82.         print("total_sold_price: ", total_sold_price)
  83.         print(tempList[i])
  84.         print("\n")
  85.  
  86.         f.writerow([title, price, total_sold_price, tempList[i]])
  87.         i += 1
  88.         sleep(randint(1, 3))
  89.  
  90.     i+=1
  91.  
  92.  
  93.  
  94. driver.close()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement