Try95th

asda_jobs_requests+bs4 for_so_q_75031138

Jan 6th, 2023 (edited)
## for https://stackoverflow.com/q/75031138/6146136
## based on https://stackoverflow.com/a/75032654/6146136
## output at https://docs.google.com/spreadsheets/d/1v-6lQzVPBBAZ1PV0dtKQZ7G4iC4nHWCZJt1i7BqRL7o

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd  ## only needed for saving as csv
import os  ## only needed for printing csv path at end

###########################################################################
### FIRST PASTE [or DOWNLOAD&IMPORT] FROM https://pastebin.com/ZnZ7xM6u ###
###########################################################################

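###########################################################################
## NOTE: the real selectForList helper lives in the paste linked above.  ##
## The stand-in below is only a minimal sketch, inferred from how this   ##
## script calls it - paste/import the original instead when available.   ##
###########################################################################
def selectForList(tagSoup, selRef, printList=False):
    ## selRef as dict {key: spec} -> dict of extracted values, where spec is
    ##   'css selector'            -> stripped text of first match (or None)
    ##   ('css selector', 'attr')  -> that attribute of first match (or None)
    ##   (value, '"staticVal"')    -> value passed through unchanged
    ## selRef as list of selectors -> list of stripped texts
    def _extract(spec):
        if isinstance(spec, tuple):
            sel, attr = spec
            if attr == '"staticVal"': return sel  ## literal value, no lookup
            el = tagSoup.select_one(sel)
            return el.get(attr) if el is not None else None
        el = tagSoup.select_one(spec)
        return el.get_text(' ', strip=True) if el is not None else None

    if isinstance(selRef, dict):
        extracted = {k: _extract(v) for k, v in selRef.items()}
        vals = list(extracted.values())
    else:
        extracted = [_extract(v) for v in selRef]
        vals = extracted
    if printList:  ## join and print the extracted values [progress output]
        print(printList.join(str(v) for v in vals if v is not None))
    return extracted
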
maxPages = 70  # [there should be only 53, but just in case]
csv_path = 'asda_jobs__requests.csv'  # path of the csv file to save to

row_sel = "div.ListGridContainer>div.rowContainerHolder"
url = "https://www.asda.jobs/vacancy/find/results/ajaxaction/posbrowser_gridhandler/?"
s = requests.Session()
s.get("https://www.asda.jobs/vacancy/find/results/")  # initial visit just sets the session cookie
pagestamp = s.cookies['earcusession'][5:-8]  # pagestamp token is embedded in the session cookie
url = url + f"pagestamp={pagestamp}"

addedRows, listings = [], []
for pgi in range(maxPages):
    page = s.get(url + f"&movejump={pgi}&movejump_page={pgi+1}")
    if page.status_code != 200: break
    soup = BeautifulSoup(page.content, "lxml")
    pgListing = [selectForList(rowSoup, {
        'pageNum': (pgi+1, '"staticVal"'),
        'rowId': ('input.rowId[value]', 'value'),
        'jobRef': ('UNKNOWN', '"staticVal"'),  # placeholder - filled in from the link below
        'title': ('a[title]', 'title'),
        'about': 'div.rowContentContainer',
        'link': ('a[href]', 'href')
    }) for rowSoup in soup.select(row_sel)]

    # resolve relative links and pull the numeric job reference out of the vacancy URL
    for li, pgl in enumerate(pgListing):
        if not pgl['link']: continue
        pgl['link'] = lLink = urljoin(url, pgl['link'])
        if '/vacancy/' in lLink:
            jobRef = lLink.split('/vacancy/', 1)[-1].split('/')[0]
            jobRef = [w for w in jobRef.split('-') if w.isdigit()]
            if jobRef: pgListing[li]['jobRef'] = jobRef[-1]

    newCt = len(set(l['rowId'] for l in pgListing if l['rowId'] and l['rowId'] not in addedRows))
    print(page.status_code, page.reason, 'scraped', newCt, 'new from ', end='')
    selectForList(soup, ['div.pagingText', 'span.filtersSummaryTextInnerContainer'], printList=' : ')
    if not newCt: break  # no unseen rows on this page -> reached the end of the results

    # listings += pgListing  # allows duplicates [probably fine in THIS method]
    listings += [l for l in pgListing if not (l['rowId'] and l['rowId'] in addedRows)]
    addedRows += [l['rowId'] for l in pgListing]

pd.DataFrame(listings).to_csv(csv_path, index=False)

print('\n\nRemoved 1 duplicate each for rows with ID\n', ', '.join(
    f'{ri}' for ri in set(addedRows) if addedRows.count(ri) == 2
), '\nOther repeated rows:\n\t', '\n\t '.join([
    f'rowID_{ri} : skipped {addedRows.count(ri)-1} duplicates'
    for ri in set(addedRows) if addedRows.count(ri) > 2
]), f'\n\nsaved {len(listings)} to "{os.path.abspath(csv_path)}"')