## for https://stackoverflow.com/q/75031138/6146136
## output at https://docs.google.com/spreadsheets/d/1v-6lQzVPBBAZ1PV0dtKQZ7G4iC4nHWCZJt1i7BqRL7o

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from urllib.parse import urljoin
from bs4 import BeautifulSoup  ## the 'html5lib' parser used below must also be installed
import pandas as pd  ## only needed for saving as csv
import os  ## only needed for printing csv path at end

###########################################################################
### FIRST PASTE [or DOWNLOAD&IMPORT] selectForList                      ###
### FROM https://pastebin.com/ZnZ7xM6u                                  ###
###########################################################################

maxPages = 50  # the site only has ~25 pages, but some rows repeat across pages
max_wait = 1   # seconds; raise to 5 or 30 if pages load slowly for you
csv_path = 'asda_jobs.csv'  # path of csv file to save to

row_sel = "div.ListGridContainer>div.rowContainerHolder"  # one job listing row
btn_sel = "a.scroller_movenext"  # append ".buttonEnabled" to match only when clickable
url = "https://www.asda.jobs/vacancy/find/results/"
ecv = EC.visibility_of_all_elements_located  # wait condition: listing rows visible

options = webdriver.ChromeOptions()
options.add_argument('--headless')
browser = webdriver.Chrome(options=options)  # "chrome_options=" is deprecated; use "options="
wait = WebDriverWait(browser, max_wait).until
browser.get(url)

addedRows, listings = [], []
for pgi in range(maxPages):
    wait(ecv((By.CSS_SELECTOR, row_sel)))
    soup = BeautifulSoup(browser.page_source, 'html5lib')

    ## extract one dict per listing row on the current page
    pgListing = [selectForList(rowSoup, {
        'pageNum': (pgi + 1, '"staticVal"'),       # literal: current page number
        'rowId': ('input.rowId[value]', 'value'),  # site's internal row id
        'jobRef': ('UNKNOWN', '"staticVal"'),      # placeholder; filled in from link below
        'title': ('a[title]', 'title'),
        'about': 'div.rowContentContainer',
        'link': ('a[href]', 'href')
    }) for rowSoup in soup.select(row_sel)]

    ## absolutize links and pull the numeric job reference out of each URL
    for li, pgl in enumerate(pgListing):
        if not pgl['link']:
            continue
        pgl['link'] = lLink = urljoin(url, pgl['link'])
        if '/vacancy/' in lLink:
            jobRef = lLink.split('/vacancy/', 1)[-1].split('/')[0]
            jobRef = [w for w in jobRef.split('-') if w.isdigit()]
            if jobRef:
                pgListing[li]['jobRef'] = jobRef[-1]

    ## report how many rows on this page haven't been seen yet
    newCt = len(set(l['rowId'] for l in pgListing
                    if l['rowId'] and l['rowId'] not in addedRows))
    print(f'{str([pgi]):>5}', 'scraped', newCt, 'new from ', end='')
    selectForList(soup, [
        'div.pagingText', 'span.filtersSummaryTextInnerContainer'
    ], printList=' : ')

    # listings += pgListing  ## would allow duplicates
    listings += [l for l in pgListing if not (
        l['rowId'] and l['rowId'] in addedRows)]
    addedRows += [l['rowId'] for l in pgListing]

    ## click "next" if there is one, otherwise stop paging
    nextBtn = browser.find_elements(By.CSS_SELECTOR, btn_sel)
    if nextBtn:
        browser.execute_script(
            "arguments[0].scrollIntoView(false);", nextBtn[-1])
        try:
            nextBtn[-1].click()
        except Exception:  # retry once if the first click was intercepted mid-scroll
            nextBtn[-1].click()
    else:
        break

browser.quit()
del browser

pd.DataFrame(listings).to_csv(csv_path, index=False)
print('\n\nRemoved 1 duplicate each for rows with ID\n', ', '.join(
    f'{ri}' for ri in set(addedRows) if addedRows.count(ri) == 2
), '\nOther repeated rows:\n\t', '\n\t '.join([
    f'rowID_{ri} : skipped {addedRows.count(ri)-1} duplicates'
    for ri in set(addedRows) if addedRows.count(ri) > 2
]), f'\n\nsaved {len(listings)} to "{os.path.abspath(csv_path)}"')
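

###########################################################################
## APPENDIX (assumption): the script relies on `selectForList` from the ##
## pastebin helper above, whose source is not shown here. The sketch    ##
## below is only inferred from the call sites in this script -- a       ##
## minimal hypothetical stand-in, NOT the actual helper. If you use it, ##
## paste it ABOVE the scraping loop (where the pastebin banner is).     ##
###########################################################################
def selectForList(soup, selectors, printList=False):
    ## spec formats inferred from usage above:
    ##   'css selector'                  -> text of first match
    ##   ('css selector', 'attr')        -> that attribute of first match
    ##   (literal_value, '"staticVal"')  -> literal_value returned as-is
    def getVal(spec):
        if isinstance(spec, tuple):
            sel, attr = spec
            if attr == '"staticVal"':
                return sel  ## not a selector -- return the literal value
            el = soup.select_one(sel)
            return el.get(attr) if el else None
        el = soup.select_one(spec)
        return el.get_text(strip=True) if el else None

    if isinstance(selectors, dict):
        return {k: getVal(v) for k, v in selectors.items()}
    vals = [getVal(s) for s in selectors]
    if printList:  ## join and print, as used for the paging summary line
        print(printList.join(str(v) for v in vals if v is not None))
    return vals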