## for https://stackoverflow.com/q/75031138/6146136
## based on https://stackoverflow.com/a/75032654/6146136
## output at https://docs.google.com/spreadsheets/d/1v-6lQzVPBBAZ1PV0dtKQZ7G4iC4nHWCZJt1i7BqRL7o
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd  ## only needed for saving as csv
import os  ## only needed for printing csv path at end

###########################################################################
### FIRST PASTE [or DOWNLOAD&IMPORT] FROM https://pastebin.com/ZnZ7xM6u ###
###########################################################################
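## [optional stand-in, an assumption] the real selectForList is the helper in
## the paste linked above; the minimal sketch below is only inferred from how
## it is called in this script (a dict of CSS-selector specs per row, plus a
## list form with printList) and is NOT the original helper
def selectForList(tagSoup, selectors, printList=False):
    ## each spec is a CSS selector string (-> text), a (selector, attribute)
    ## pair (-> attribute value), or a (value, '"staticVal"') pair (-> constant)
    def getVal(spec):
        if isinstance(spec, tuple) and len(spec) == 2 and spec[1] == '"staticVal"':
            return spec[0]  ## constant value, no selection needed
        sel, attr = spec if isinstance(spec, tuple) else (spec, None)
        el = tagSoup.select_one(sel)
        if el is None: return None
        return el.get(attr) if attr else el.get_text(' ', strip=True)
    if isinstance(selectors, dict):
        return {k: getVal(v) for k, v in selectors.items()}
    vals = [getVal(s) for s in selectors]
    if printList: print(*[v for v in vals if v], sep=printList)
    return vals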
maxPages = 70  # [there should be only 53, but just in case]
csv_path = 'asda_jobs__requests.csv'  # path to csv file to save to
row_sel = "div.ListGridContainer>div.rowContainerHolder"  # one container per listing
url = "https://www.asda.jobs/vacancy/find/results/ajaxaction/posbrowser_gridhandler/?"

s = requests.Session()
s.get("https://www.asda.jobs/vacancy/find/results/")  # sets the session cookie
pagestamp = s.cookies['earcusession'][5:-8]  # pagestamp is sliced out of the cookie value
url = url + f"pagestamp={pagestamp}"

addedRows, listings = [], []
for pgi in range(maxPages):
    page = s.get(url + f"&movejump={pgi}&movejump_page={pgi+1}")
    if page.status_code != 200: break
    soup = BeautifulSoup(page.content, "lxml")

    ## extract one dict per row container on this page
    pgListing = [selectForList(rowSoup, {
        'pageNum': (pgi+1, '"staticVal"'),
        'rowId': ('input.rowId[value]', 'value'),
        'jobRef': ('UNKNOWN', '"staticVal"'),
        'title': ('a[title]', 'title'),
        'about': 'div.rowContentContainer',
        'link': ('a[href]', 'href')
    }) for rowSoup in soup.select(row_sel)]

    ## absolutize links and pull the numeric job reference out of the url slug
    for li, pgl in enumerate(pgListing):
        if not pgl['link']: continue
        pgl['link'] = lLink = urljoin(url, pgl['link'])
        if '/vacancy/' in lLink:
            jobRef = lLink.split('/vacancy/', 1)[-1].split('/')[0]
            jobRef = [w for w in jobRef.split('-') if w.isdigit()]
            if jobRef: pgListing[li]['jobRef'] = jobRef[-1]

    newCt = len(set([l['rowId'] for l in pgListing if l['rowId'] and l['rowId'] not in addedRows]))
    print(page.status_code, page.reason, 'scraped', newCt, 'new from ', end='')
    selectForList(soup, ['div.pagingText', 'span.filtersSummaryTextInnerContainer'], printList=' : ')
    if not newCt: break  # nothing new on this page -> we've gone past the last page

    # listings += pgListing # allows duplicates [probably fine in THIS method]
    listings += [l for l in pgListing if not (l['rowId'] and l['rowId'] in addedRows)]
    addedRows += [l['rowId'] for l in pgListing]
## save everything and summarize how many duplicate rows were skipped
pd.DataFrame(listings).to_csv(csv_path, index=False)
print('\n\nRemoved 1 duplicate each for rows with ID\n', ', '.join(
    f'{ri}' for ri in set(addedRows) if addedRows.count(ri) == 2
), '\nOther repeated rows:\n\t', '\n\t '.join([
    f'rowID_{ri} : skipped {addedRows.count(ri)-1} duplicates'
    for ri in set(addedRows) if addedRows.count(ri) > 2
]), f'\n\nsaved {len(listings)} to "{os.path.abspath(csv_path)}"')