Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
## for scraping multiple pages of listings ########
## [ inputs/outputs explained at the bottom ] ########
## example at https://pastebin.com/5AjaUne0 #######
###################################################
##### REQUIREMENTS #####
import pandas as pd  # for saving as CSV [for spreadsheet]
import requests  # cloudscraper # [use preferred library] # for fetching HTML
from bs4 import BeautifulSoup  # for parsing HTML
'''def confParse(...): ...'''  # [ COPY from https://pastebin.com/c0TjDmNE ]
# default page-fetching function; any callable that takes a URL and returns a
# requests.Response-like object works (e.g. cloudscraper.create_scraper().get)
fetchPg = requests.get  # cloudscraper.create_scraper().get # use preferred function
def paginate_scrape(pg1Url, pgSel, maxPgs=None, fpBase='', reqFn=fetchPg, dReturn=False):
    """Scrape a paginated listings site, saving one CSV of pages and one of listings.

    Parameters:
        pg1Url:  URL of the first page to scrape.
        pgSel:   selectors, prepared as the tSel parameter of confParse
                 [see https://pastebin.com/c0TjDmNE]; every row must yield
                 the same columns or CSV rows can mis-align.
        maxPgs:  page limit; None means continue until no next-page link is found.
        fpBase:  prefix (may include a folder path) for 'page_logs.csv'/'listings.csv'.
        reqFn:   function that takes a URL and returns a requests.Response object.
        dReturn: if True, also collect and return (pgLogs, listings) as two
                 lists of dictionaries.
    """
    pg_logs_fp = f'{fpBase}page_logs.csv'
    list_fp = f'{fpBase}listings.csv'
    pgUrl, pgNo, listings, pgLogs, lCt = pg1Url, 0, [], [], 0
    while pgUrl and (maxPgs is None or pgNo < maxPgs):
        # BUG FIX: pgNo is now incremented unconditionally. Previously the
        # increment lived in a walrus inside `(pgNo:=pgNo+1)<=maxPgs`, which
        # was short-circuited away when maxPgs was None — pgNo stayed 0, so
        # 'from_page' was always 0 and the pgNo==1 header-write never fired.
        pgNo += 1
        curUrl = pgUrl  # keep this page's own URL before pgUrl becomes the next link
        # BUG FIX: use the reqFn parameter (was hard-coded to fetchPg, so
        # passing a custom fetcher had no effect).
        pgReq = reqFn(curUrl)
        # explicit parser avoids bs4's GuessedAtParserWarning and keeps
        # behavior consistent across environments
        pSoup = BeautifulSoup(pgReq.content, 'html.parser')
        curPg, rList = confParse(pSoup, pgSel, None if callable(pgSel) else '')
        pgUrl, lCt = curPg.get('next_link'), lCt + len(rList)
        rList = [{'from_page': pgNo, **r} for r in rList]
        curPg = {
            # BUG FIX: log the page's own URL (curUrl); previously pgUrl was
            # logged after it had already been overwritten with the NEXT link.
            'page_no': pgNo, 'page_link': curUrl, **curPg,
            'request_status': f'{pgReq.status_code} {pgReq.reason}',
            'listings_extracted': len(rList)
        }
        if dReturn: listings, pgLogs = listings + rList, pgLogs + [curPg]
        print('', end=f"\rpg{pgNo}: saving {len(rList)} listings [total:{lCt}]")
        # write mode + header on the first page, append without header afterwards
        for d, f in [(rList, list_fp), ([curPg], pg_logs_fp)]:
            m, h = ('w', True) if pgNo == 1 else ('a', False)
            pd.DataFrame(d).to_csv(f, mode=m, index=False, header=h)
    print(f'\nSaved to {pg_logs_fp!r} and {list_fp!r}')
    if dReturn: return pgLogs, listings
######## INPUTS ############
#-> REQUIRED: link to the first page (pg1Url) and selectors (pgSel)
#### [ selectors (pgSel) should be prepared as the tSel parameter of confParse ]
#### [ pgSel must be designed such that ALL rows of curPg/rList have the same columns ]
#### [ variations in column names and/or order can lead to mis-aligned CSV rows ]
#-> maxPgs: page limit
#### [ if set to None, scraping continues until a next-page link cannot be found ]
#-> fpBase: prefix to add to 'page_logs.csv'/'listings.csv'
#### [ can include the path to a folder ]
#-> reqFn: a function that takes a URL and returns a requests.Response object
#### [ can be a wrapper that sets headers etc., or just cloudscraper/HTMLSession/etc. ]
#-> dReturn: if set to True, collects the data and returns 2 lists of dictionaries
######## OUTPUTS ###########
## output: two CSV files — one for the pages scraped, one for the listings extracted
## if dReturn is set, two lists of dictionaries (pgLogs, listings) are also returned
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement