Try95th

asda_jobs_selenium+bs4 for_so_q_75031138

Jan 6th, 2023 (edited)
## for https://stackoverflow.com/q/75031138/6146136
## output at https://docs.google.com/spreadsheets/d/1v-6lQzVPBBAZ1PV0dtKQZ7G4iC4nHWCZJt1i7BqRL7o

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from urllib.parse import urljoin
from bs4 import BeautifulSoup
import pandas as pd  ## only needed for saving as csv
import os  ## only needed for printing csv path at end

###########################################################################
### FIRST PASTE [or DOWNLOAD&IMPORT] FROM https://pastebin.com/ZnZ7xM6u ###
###########################################################################
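
## NOTE: the original selectForList helper lives in the pastebin linked above.
## The version below is only a minimal sketch of what that helper appears to do,
## inferred from how it is called in this script (dict or list of selectors,
## optional (selector, attribute) pairs, '"staticVal"' marking a literal value,
## printList to print joined values instead of returning them).
## If the original is available, paste it in instead; it may differ.
def selectForList(soup, selectors, printList=False):
    isDict = isinstance(selectors, dict)
    results = {} if isDict else []
    for key, sel in (selectors.items() if isDict else enumerate(selectors)):
        sel, attr = sel if isinstance(sel, tuple) else (sel, None)
        if attr == '"staticVal"':
            val = sel  # literal value, no lookup in the soup
        else:
            tag = soup.select_one(sel)
            if tag is None:
                val = None
            elif attr:
                val = tag.get(attr)  # attribute value, e.g. href / title / value
            else:
                val = tag.get_text(' ', strip=True)
        if isDict:
            results[key] = val
        else:
            results.append(val)
    if printList:
        ## print the joined non-empty values instead of returning them
        print(printList.join(str(v) for v in (
            results.values() if isDict else results) if v))
        return
    return results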

maxPages = 50  # there are only about 25 pages, but rows repeat across pages
max_wait = 1  # 5 ## 30 ## as you need
csv_path = 'asda_jobs.csv'  # path to csv file to save to

row_sel = "div.ListGridContainer>div.rowContainerHolder"
btn_sel = "a.scroller_movenext"  # .buttonEnabled"
url = "https://www.asda.jobs/vacancy/find/results/"
ecv = EC.visibility_of_all_elements_located

options = webdriver.ChromeOptions()
options.add_argument('--headless')
browser = webdriver.Chrome(options=options)  # 'chrome_options=' was renamed to 'options=' in newer Selenium
wait = WebDriverWait(browser, max_wait).until
browser.get(url)

addedRows, listings = [], []
for pgi in range(maxPages):
    wait(ecv((By.CSS_SELECTOR, row_sel)))  # wait until the listing rows are visible

    soup = BeautifulSoup(browser.page_source, 'html5lib')
    ## one dict per row: field name -> selector, (selector, attribute) or (literal, '"staticVal"')
    pgListing = [selectForList(rowSoup, {
        'pageNum': (pgi+1, '"staticVal"'),
        'rowId': ('input.rowId[value]', 'value'),
        'jobRef': ('UNKNOWN', '"staticVal"'),
        'title': ('a[title]', 'title'),
        'about': 'div.rowContentContainer',
        'link': ('a[href]', 'href')
    }) for rowSoup in soup.select(row_sel)]

    ## resolve relative links and pull the numeric job reference out of the vacancy URL
    for li, pgl in enumerate(pgListing):
        if not pgl['link']:
            continue
        pgl['link'] = lLink = urljoin(url, pgl['link'])
        if '/vacancy/' in lLink:
            jobRef = lLink.split('/vacancy/', 1)[-1].split('/')[0]
            jobRef = [w for w in jobRef.split('-') if w.isdigit()]
            if jobRef:
                pgListing[li]['jobRef'] = jobRef[-1]

    newCt = len(set([
        l['rowId'] for l in pgListing if l['rowId']
        and l['rowId'] not in addedRows]))
    print(f'{str([pgi]):>5}', 'scraped', newCt, 'new from ', end='')
    selectForList(soup, [
        'div.pagingText', 'span.filtersSummaryTextInnerContainer'
    ], printList=' : ')

    # listings += pgListing # allows duplicates
    ## only keep rows whose rowId hasn't been seen on an earlier page
    listings += [l for l in pgListing if not (
        l['rowId'] and l['rowId'] in addedRows)]
    addedRows += [l['rowId'] for l in pgListing]

    nextBtn = browser.find_elements(By.CSS_SELECTOR, btn_sel)
    if nextBtn:
        browser.execute_script(
            "arguments[0].scrollIntoView(false);", nextBtn[-1])
        try:
            nextBtn[-1].click()
        except Exception:
            nextBtn[-1].click()  # retry once if the first click is intercepted
    else:
        break  # no "next" button left, so stop paging

browser.quit()
del browser

pd.DataFrame(listings).to_csv(csv_path, index=False)

print('\n\nRemoved 1 duplicate each for rows with ID\n', ', '.join(
    f'{ri}' for ri in set(addedRows) if addedRows.count(ri) == 2
), '\nOther repeated rows:\n\t', '\n\t '.join([
    f'rowID_{ri} : skipped {addedRows.count(ri)-1} duplicates'
    for ri in set(addedRows) if addedRows.count(ri) > 2
]), f'\n\nsaved {len(listings)} to "{os.path.abspath(csv_path)}"')