Try95th

asda_jobs_requests+bs4 for_so_q_75031138

Jan 6th, 2023 (edited)
## for https://stackoverflow.com/q/75031138/6146136
## based on https://stackoverflow.com/a/75032654/6146136
## output at https://docs.google.com/spreadsheets/d/1v-6lQzVPBBAZ1PV0dtKQZ7G4iC4nHWCZJt1i7BqRL7o

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd  ## only needed for saving as csv
import os  ## only needed for printing csv path at end

###########################################################################
### FIRST PASTE [or DOWNLOAD&IMPORT] FROM https://pastebin.com/ZnZ7xM6u ###
###########################################################################

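###########################################################################
## NOTE: the real selectForList helper lives in the paste linked above.  ##
## The stand-in below is only a minimal sketch, inferred from how this   ##
## script calls it - paste/import the original instead when available.   ##
###########################################################################
def selectForList(tagSoup, selRef, printList=False):
    ## selRef as dict {key: spec} -> dict of extracted values, where spec is
    ##   'css selector'            -> stripped text of first match (or None)
    ##   ('css selector', 'attr')  -> that attribute of first match (or None)
    ##   (value, '"staticVal"')    -> value passed through unchanged
    ## selRef as list of selectors -> list of stripped texts
    def _extract(spec):
        if isinstance(spec, tuple):
            sel, attr = spec
            if attr == '"staticVal"': return sel  ## literal value, no lookup
            el = tagSoup.select_one(sel)
            return el.get(attr) if el is not None else None
        el = tagSoup.select_one(spec)
        return el.get_text(' ', strip=True) if el is not None else None

    if isinstance(selRef, dict):
        extracted = {k: _extract(v) for k, v in selRef.items()}
        vals = list(extracted.values())
    else:
        extracted = [_extract(v) for v in selRef]
        vals = extracted
    if printList:  ## join and print the extracted values [progress output]
        print(printList.join(str(v) for v in vals if v is not None))
    return extracted
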
maxPages = 70  # [there should be only 53, but just in case]
csv_path = 'asda_jobs__requests.csv'  # path of the csv file to save to

row_sel = "div.ListGridContainer>div.rowContainerHolder"
url = "https://www.asda.jobs/vacancy/find/results/ajaxaction/posbrowser_gridhandler/?"
s = requests.Session()
s.get("https://www.asda.jobs/vacancy/find/results/")  # initial visit just sets the session cookie
pagestamp = s.cookies['earcusession'][5:-8]  # pagestamp token is embedded in the session cookie
url = url + f"pagestamp={pagestamp}"

addedRows, listings = [], []
for pgi in range(maxPages):
    page = s.get(url + f"&movejump={pgi}&movejump_page={pgi+1}")
    if page.status_code != 200: break
    soup = BeautifulSoup(page.content, "lxml")
    pgListing = [selectForList(rowSoup, {
        'pageNum': (pgi+1, '"staticVal"'),
        'rowId': ('input.rowId[value]', 'value'),
        'jobRef': ('UNKNOWN', '"staticVal"'),  # placeholder - filled in from the link below
        'title': ('a[title]', 'title'),
        'about': 'div.rowContentContainer',
        'link': ('a[href]', 'href')
    }) for rowSoup in soup.select(row_sel)]

    # resolve relative links and pull the numeric job reference out of the vacancy URL
    for li, pgl in enumerate(pgListing):
        if not pgl['link']: continue
        pgl['link'] = lLink = urljoin(url, pgl['link'])
        if '/vacancy/' in lLink:
            jobRef = lLink.split('/vacancy/', 1)[-1].split('/')[0]
            jobRef = [w for w in jobRef.split('-') if w.isdigit()]
            if jobRef: pgListing[li]['jobRef'] = jobRef[-1]

    newCt = len(set(l['rowId'] for l in pgListing if l['rowId'] and l['rowId'] not in addedRows))
    print(page.status_code, page.reason, 'scraped', newCt, 'new from ', end='')
    selectForList(soup, ['div.pagingText', 'span.filtersSummaryTextInnerContainer'], printList=' : ')
    if not newCt: break  # no unseen rows on this page -> reached the end of the results

    # listings += pgListing  # allows duplicates [probably fine in THIS method]
    listings += [l for l in pgListing if not (l['rowId'] and l['rowId'] in addedRows)]
    addedRows += [l['rowId'] for l in pgListing]

pd.DataFrame(listings).to_csv(csv_path, index=False)

print('\n\nRemoved 1 duplicate each for rows with ID\n', ', '.join(
    f'{ri}' for ri in set(addedRows) if addedRows.count(ri) == 2
), '\nOther repeated rows:\n\t', '\n\t '.join([
    f'rowID_{ri} : skipped {addedRows.count(ri)-1} duplicates'
    for ri in set(addedRows) if addedRows.count(ri) > 2
]), f'\n\nsaved {len(listings)} to "{os.path.abspath(csv_path)}"')