Try95th

paginate_scrape

Mar 19th, 2023 (edited)
## for scraping multiple pages of listings ########
## [ inputs/outputs explained at bottom ] #########
## example at https://pastebin.com/5AjaUne0 #######
###################################################

##### REQUIREMENTS #####
import pandas as pd  # for saving results as CSV [for spreadsheet]
import requests  # or cloudscraper  # [use preferred library]  # for fetching HTML
from bs4 import BeautifulSoup  # for parsing HTML
'''def confParse(...): ...'''  # [ COPY from https://pastebin.com/c0TjDmNE ]

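## [ the stand-in below is only a hypothetical sketch of the confParse contract, inferred
##   from how paginate_scrape calls it: it takes (soup, tSel, default) and returns a
##   page-level dict (ideally with a 'next_link' key) plus a list of row dicts;
##   the rel="next" selector and the CSS-string assumption for tSel are placeholders -
##   copy the real confParse from the link above instead ]
# def confParse(soup, tSel, defVal=None):
#     nxt = soup.select_one('a[rel="next"]')  # assumed: next page is linked via rel="next"
#     curPg = {'next_link': nxt.get('href') if nxt else None}
#     rows = soup.select(tSel)  # assumed: tSel is a CSS selector string for listing containers
#     return curPg, [{'text': r.get_text(strip=True)} for r in rows]
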
fetchPg = requests.get  # or cloudscraper.create_scraper().get  # use preferred function
def paginate_scrape(pg1Url, pgSel, maxPgs=None, fpBase='', reqFn=fetchPg, dReturn=False):
    pg_logs_fp = f'{fpBase}page_logs.csv'
    list_fp = f'{fpBase}listings.csv'

    pgUrl, pgNo, listings, pgLogs, lCt = pg1Url, 0, [], [], 0
    while pgUrl and (maxPgs is None or pgNo < maxPgs):
        pgNo += 1  # count the page about to be scraped
        pgReq = reqFn(pgUrl)  # fetch the current page with the supplied request function
        pSoup = BeautifulSoup(pgReq.content, 'html.parser')
        curPg, rList = confParse(pSoup, pgSel, None if callable(pgSel) else '')
        curUrl, pgUrl = pgUrl, curPg.get('next_link')  # remember current link, move on to next
        lCt += len(rList)
        rList = [{'from_page': pgNo, **r} for r in rList]  # tag listings with their page number
        curPg = {
            'page_no': pgNo, 'page_link': curUrl, **curPg,
            'request_status': f'{pgReq.status_code} {pgReq.reason}',
            'listings_extracted': len(rList)
        }

        if dReturn: listings, pgLogs = listings + rList, pgLogs + [curPg]
        print(f"\rpg{pgNo}: saving {len(rList)} listings [total:{lCt}]", end='')
        # overwrite with header on the first page, then append without repeating the header
        for d, f in [(rList, list_fp), ([curPg], pg_logs_fp)]:
            m, h = ('w', True) if pgNo == 1 else ('a', False)
            pd.DataFrame(d).to_csv(f, mode=m, index=False, header=h)
    print(f'\nSaved to {pg_logs_fp!r} and {list_fp!r}')
    if dReturn: return pgLogs, listings

######## INPUTS ############
#-> REQUIRED: link to first page (pg1Url) and selectors (pgSel)
#### [ selectors (pgSel) should be prepared as the tSel parameter of confParse ]
#### [ pgSel must be designed so that ALL rows of curPg/rList have the same columns ]
#### [ variations in column names and/or order can lead to mis-aligned CSV rows ]
#-> maxPgs: page limit
#### [ if set to None, scraping continues until a next-page link can no longer be found ]
#-> fpBase: prefix to add to 'page_logs.csv'/'listings.csv'
#### [ can include the path to a folder ]
#-> reqFn: a function that takes a URL and returns a requests.Response object
#### [ can write a wrapper to set headers etc., or just use cloudscraper/HTMLSession/etc.; see the sketch after this section ]
#-> dReturn: if set to True, will collect data and return 2 lists of dictionaries

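######## EXAMPLE reqFn WRAPPER [hypothetical] ########
## [ a minimal sketch, assuming only custom headers are needed; the session name,
##   function name and header values below are placeholders, not required settings ]
hdrSess = requests.Session()
hdrSess.headers.update({'User-Agent': 'Mozilla/5.0', 'Accept-Language': 'en-US,en;q=0.9'})
def fetchPg_hdrs(url): return hdrSess.get(url, timeout=30)  # pass as reqFn=fetchPg_hdrs
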
######## OUTPUTS ###########
## output: two CSV files for the pages scraped and the listings extracted #######
## if dReturn is set, two lists of dictionaries (pgLogs, listings) will also be returned
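
######## EXAMPLE USAGE [hypothetical] ########
## [ a minimal sketch - the URL, selector variable and file prefix below are made up,
##   and pgSel must follow the tSel format expected by the confParse you copied in ]
# demoSel = ...  # prepare as confParse's tSel parameter
# pgLogs, listings = paginate_scrape(
#     'https://example.com/listings?page=1', demoSel,
#     maxPgs=3, fpBase='demo_', dReturn=True)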