# Source: Pastebin paste "Untitled", posted by a guest on Apr 5th, 2020.
# Paste metadata: Python, 3.13 KB.
import json
from time import sleep

from parsel import Selector
# import web driver
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

  8. def validate_field(field):
  9.     if not field:
  10.         field = 'No results'
  11.     return field
  12.  
  13.  
  14. driver = webdriver.Chrome('C:/chromedriver_win32/chromedriver')
  15.  
  16. driver.maximize_window()
  17. driver.get('https:www.google.com')
  18. sleep(3)
  19. country = "russian"
  20. potential_title = "developer"
  21. search_query = driver.find_element_by_name('q')
  22. search_query.send_keys('site:doyoubuzz.com "'+potential_title+'" AND "'+country+'"')
  23.  
  24. sleep(0.5)
  25.  
  26. search_query.send_keys(Keys.RETURN)
  27. sleep(10)
  28.  
  29.  
  30. pages=driver.find_elements_by_xpath("//*[@id='nav']/tbody/tr/td/a")
  31. youbuzz_urls = []
  32. for page in pages:
  33.     href = driver.find_elements_by_xpath('//a[starts-with(@href, "https://www.doyoubuzz.com/")]')
  34.     for i in href:
  35.         youbuzz_urls.append(i.get_attribute('href'))
  36.     try:
  37.         driver.find_element_by_xpath("//span[text()='Suivant']").click()
  38.     except:
  39.         pass
  40. sleep(0.5)
  41. for youbuzz_url in youbuzz_urls:
  42.     driver.get(youbuzz_url)
  43.  
  44.     # add a 5 second pause loading each URL
  45.     sleep(5)
  46.  
  47.     sel = Selector(text=driver.page_source)
  48.  
  49.     firstName = sel.xpath('//*[starts-with(@class,"userName__firstName")]/text()').extract_first()
  50.    
  51.     if firstName:
  52.         firstName = firstName.strip()
  53.        
  54.     lastName = sel.xpath('//*[starts-with(@class,"userName__lastName")]/text()').extract_first()
  55.  
  56.     if lastName:
  57.         lastName = lastName.strip()
  58.  
  59.  
  60.     current_title = sel.xpath('//*[@class="cvTitle"]/text()').extract_first()
  61.     if current_title:
  62.         current_title = current_title.strip()
  63.  
  64.  
  65.  
  66.     lives_in = sel.xpath('//*[starts-with(@class,"widgetUserInfo__item widgetUserInfo__item_location")]/text()').extract_first()
  67.     if lives_in:
  68.         lives_in = lives_in.strip()
  69.  
  70.     age = sel.xpath('//*[starts-with(@class,"widgetUserInfo__item widgetUserInfo__item_age")]/text()').extract_first()
  71.     if age:
  72.         age = age.strip()    
  73.    
  74.     youbuzz_url = driver.current_url
  75.  
  76.     firstName = validate_field(firstName)
  77.     lastName = validate_field(lastName)
  78.     current_title = validate_field(current_title)
  79.     lives_in = validate_field(lives_in)
  80.     youbuzz_url = validate_field(youbuzz_url)
  81.  
  82.     if lives_in != 'No Results':
  83.         lives_in = ' '.join(lives_in.split())
  84.     try:
  85.         # printing the output to the terminal
  86.         print('\n')
  87.         print('First Name: ' + firstName)
  88.         print('last Name: ' + lastName)
  89.         print('current_title: ' + current_title)
  90.         print('lives_in: ' + lives_in)
  91.         print('youbuzz_url: ' + youbuzz_url)
  92.         print('\n')
  93.     except:
  94.         pass
  95.     with open('data.json',mode='a', encoding='utf-8') as outfile:
  96.         res = {        
  97.             'currentPosition' : current_title,
  98.             'livesIn' : lives_in,
  99.             'country' : country,
  100.             'profile' : youbuzz_url,
  101.             'firstName': firstName,
  102.             'lastName' : lastName,
  103.             'age' : age
  104.         }
  105.         json.dump(res, outfile, indent=2)
  106.  
  107.  
  108.  
  109. driver.quit()
# End of paste.