## for https://stackoverflow.com/q/75031138/6146136
## output at https://docs.google.com/spreadsheets/d/1v-6lQzVPBBAZ1PV0dtKQZ7G4iC4nHWCZJt1i7BqRL7o
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import pandas as pd  ## only needed for saving as csv
import os  ## only needed for printing csv path at end
###########################################################################
### FIRST PASTE [or DOWNLOAD&IMPORT] FROM https://pastebin.com/ZnZ7xM6u ###
###########################################################################
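
## [ASSUMED STAND-IN] selectForList is defined in the pastebin linked above and is
## not reproduced here; this minimal sketch only mimics the behavior inferred from
## how it is called below, so the script can run standalone. Prefer the original.
def selectForList(tagSoup, selectors, printList=False):
    ## a spec can be a bare CSS selector (-> tag text), a (selector, attribute)
    ## pair (-> attribute value), or (literalValue, '"staticVal"') (-> stored as-is)
    def _extract(spec):
        if isinstance(spec, tuple):
            sel, attr = spec
            if attr == '"staticVal"':
                return sel  # literal value, no lookup
            tag = tagSoup.select_one(sel)
            return tag.get(attr) if tag else None
        tag = tagSoup.select_one(spec)
        return tag.get_text(' ', strip=True) if tag else None

    if isinstance(selectors, dict):
        return {k: _extract(v) for k, v in selectors.items()}
    extracted = [_extract(s) for s in selectors]
    if printList:  # join the extracted texts and print them on one line
        print(*[e for e in extracted if e], sep=printList)
    return extracted
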
maxPages = 50  # the site only has ~25 result pages; extra passes just re-yield duplicates
max_wait = 1  # increase (e.g. 5 or 30) as needed
csv_path = 'asda_jobs.csv'  # path of the csv file to save to
row_sel = "div.ListGridContainer>div.rowContainerHolder"
btn_sel = "a.scroller_movenext"  # or "a.scroller_movenext.buttonEnabled"
url = "https://www.asda.jobs/vacancy/find/results/"
ecv = EC.visibility_of_all_elements_located
options = webdriver.ChromeOptions()
options.add_argument('--headless')
browser = webdriver.Chrome(options=options)  # "chrome_options=" is deprecated and removed in Selenium 4
wait = WebDriverWait(browser, max_wait).until
browser.get(url)
addedRows, listings = [], []
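
## paginate through the result pages, collecting one dict per listing row
## and skipping rows whose rowId has already been seen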
for pgi in range(maxPages):
    wait(ecv((By.CSS_SELECTOR, row_sel)))  # wait for the listing rows to render
    soup = BeautifulSoup(browser.page_source, 'html5lib')

    ## one dict per row: (selector, attribute) pairs, bare selectors (-> text),
    ## or (value, '"staticVal"') for literals
    pgListing = [selectForList(rowSoup, {
        'pageNum': (pgi+1, '"staticVal"'),
        'rowId': ('input.rowId[value]', 'value'),
        'jobRef': ('UNKNOWN', '"staticVal"'),
        'title': ('a[title]', 'title'),
        'about': 'div.rowContentContainer',
        'link': ('a[href]', 'href')
    }) for rowSoup in soup.select(row_sel)]

    ## absolutize links and pull the numeric job reference out of the URL
    for li, pgl in enumerate(pgListing):
        if not pgl['link']:
            continue
        pgl['link'] = lLink = urljoin(url, pgl['link'])
        if '/vacancy/' in lLink:
            jobRef = lLink.split('/vacancy/', 1)[-1].split('/')[0]
            jobRef = [w for w in jobRef.split('-') if w.isdigit()]
            if jobRef:
                pgListing[li]['jobRef'] = jobRef[-1]

    newCt = len(set([
        l['rowId'] for l in pgListing if l['rowId']
        and l['rowId'] not in addedRows]))
    print(f'{str([pgi]):>5}', 'scraped', newCt, 'new from ', end='')
    selectForList(soup, [
        'div.pagingText', 'span.filtersSummaryTextInnerContainer'
    ], printList=' : ')

    # listings += pgListing # allows duplicates
    listings += [l for l in pgListing if not (
        l['rowId'] and l['rowId'] in addedRows)]
    addedRows += [l['rowId'] for l in pgListing]

    ## click "next" if the button is present; otherwise this is the last page
    nextBtn = browser.find_elements(By.CSS_SELECTOR, btn_sel)
    if nextBtn:
        browser.execute_script(
            "arguments[0].scrollIntoView(false);", nextBtn[-1])
        try:
            nextBtn[-1].click()
        except Exception:
            nextBtn[-1].click()  # retry once, e.g. if the first click was intercepted
    else:
        break

browser.quit()
del browser

pd.DataFrame(listings).to_csv(csv_path, index=False)
print('\n\nRemoved 1 duplicate each for rows with ID\n', ', '.join(
    f'{ri}' for ri in set(addedRows) if addedRows.count(ri) == 2
), '\nOther repeated rows:\n\t', '\n\t '.join([
    f'rowID_{ri} : skipped {addedRows.count(ri)-1} duplicates'
    for ri in set(addedRows) if addedRows.count(ri) > 2
]), f'\n\nsaved {len(listings)} to "{os.path.abspath(csv_path)}"')
 