Try95th

asda_jobs_selenium+bs4 for_so_q_75031138

Jan 6th, 2023 (edited)
## for https://stackoverflow.com/q/75031138/6146136
## output at https://docs.google.com/spreadsheets/d/1v-6lQzVPBBAZ1PV0dtKQZ7G4iC4nHWCZJt1i7BqRL7o

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from urllib.parse import urljoin
from bs4 import BeautifulSoup
import pandas as pd  ## only needed for saving as csv
import os  ## only needed for printing csv path at end

###########################################################################
### FIRST PASTE [or DOWNLOAD&IMPORT] FROM https://pastebin.com/ZnZ7xM6u ###
###########################################################################
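
## NOTE: the original selectForList helper lives in the pastebin linked above.
## The version below is only a minimal sketch of what that helper appears to do,
## inferred from how it is called in this script (dict or list of selectors,
## optional (selector, attribute) pairs, '"staticVal"' marking a literal value,
## printList to print joined values instead of returning them).
## If the original is available, paste it in instead; it may differ.
def selectForList(soup, selectors, printList=False):
    isDict = isinstance(selectors, dict)
    results = {} if isDict else []
    for key, sel in (selectors.items() if isDict else enumerate(selectors)):
        sel, attr = sel if isinstance(sel, tuple) else (sel, None)
        if attr == '"staticVal"':
            val = sel  # literal value, no lookup in the soup
        else:
            tag = soup.select_one(sel)
            if tag is None:
                val = None
            elif attr:
                val = tag.get(attr)  # attribute value, e.g. href / title / value
            else:
                val = tag.get_text(' ', strip=True)
        if isDict:
            results[key] = val
        else:
            results.append(val)
    if printList:
        ## print the joined non-empty values instead of returning them
        print(printList.join(str(v) for v in (
            results.values() if isDict else results) if v))
        return
    return results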

maxPages = 50  # there are only about 25 pages, but rows repeat across pages
max_wait = 1  # 5 ## 30 ## as you need
csv_path = 'asda_jobs.csv'  # path to csv file to save to

row_sel = "div.ListGridContainer>div.rowContainerHolder"
btn_sel = "a.scroller_movenext"  # .buttonEnabled"
url = "https://www.asda.jobs/vacancy/find/results/"
ecv = EC.visibility_of_all_elements_located

options = webdriver.ChromeOptions()
options.add_argument('--headless')
browser = webdriver.Chrome(options=options)  # 'chrome_options=' was renamed to 'options=' in newer Selenium
wait = WebDriverWait(browser, max_wait).until
browser.get(url)

addedRows, listings = [], []
for pgi in range(maxPages):
    wait(ecv((By.CSS_SELECTOR, row_sel)))  # wait until the listing rows are visible

    soup = BeautifulSoup(browser.page_source, 'html5lib')
    ## one dict per row: field name -> selector, (selector, attribute) or (literal, '"staticVal"')
    pgListing = [selectForList(rowSoup, {
        'pageNum': (pgi+1, '"staticVal"'),
        'rowId': ('input.rowId[value]', 'value'),
        'jobRef': ('UNKNOWN', '"staticVal"'),
        'title': ('a[title]', 'title'),
        'about': 'div.rowContentContainer',
        'link': ('a[href]', 'href')
    }) for rowSoup in soup.select(row_sel)]

    ## resolve relative links and pull the numeric job reference out of the vacancy URL
    for li, pgl in enumerate(pgListing):
        if not pgl['link']:
            continue
        pgl['link'] = lLink = urljoin(url, pgl['link'])
        if '/vacancy/' in lLink:
            jobRef = lLink.split('/vacancy/', 1)[-1].split('/')[0]
            jobRef = [w for w in jobRef.split('-') if w.isdigit()]
            if jobRef:
                pgListing[li]['jobRef'] = jobRef[-1]

    newCt = len(set([
        l['rowId'] for l in pgListing if l['rowId']
        and l['rowId'] not in addedRows]))
    print(f'{str([pgi]):>5}', 'scraped', newCt, 'new from ', end='')
    selectForList(soup, [
        'div.pagingText', 'span.filtersSummaryTextInnerContainer'
    ], printList=' : ')

    # listings += pgListing # allows duplicates
    ## only keep rows whose rowId hasn't been seen on an earlier page
    listings += [l for l in pgListing if not (
        l['rowId'] and l['rowId'] in addedRows)]
    addedRows += [l['rowId'] for l in pgListing]

    nextBtn = browser.find_elements(By.CSS_SELECTOR, btn_sel)
    if nextBtn:
        browser.execute_script(
            "arguments[0].scrollIntoView(false);", nextBtn[-1])
        try:
            nextBtn[-1].click()
        except Exception:
            nextBtn[-1].click()  # retry once if the first click is intercepted
    else:
        break  # no "next" button left, so stop paging

browser.quit()
del browser

pd.DataFrame(listings).to_csv(csv_path, index=False)

print('\n\nRemoved 1 duplicate each for rows with ID\n', ', '.join(
    f'{ri}' for ri in set(addedRows) if addedRows.count(ri) == 2
), '\nOther repeated rows:\n\t', '\n\t '.join([
    f'rowID_{ri} : skipped {addedRows.count(ri)-1} duplicates'
    for ri in set(addedRows) if addedRows.count(ri) > 2
]), f'\n\nsaved {len(listings)} to "{os.path.abspath(csv_path)}"')