Try95th

paginate_scrape

Mar 19th, 2023 (edited)
## for scraping multiple pages of listings ########
## [ inputs/outputs explained at bottom ] #########
## example at https://pastebin.com/5AjaUne0 #######
###################################################

##### REQUIREMENTS #####
import pandas as pd  # for saving results as CSV [for spreadsheet]
import requests  # or cloudscraper  # [use preferred library]  # for fetching HTML
from bs4 import BeautifulSoup  # for parsing HTML
'''def confParse(...): ...'''  # [ COPY from https://pastebin.com/c0TjDmNE ]

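## [ the stand-in below is only a hypothetical sketch of the confParse contract, inferred
##   from how paginate_scrape calls it: it takes (soup, tSel, default) and returns a
##   page-level dict (ideally with a 'next_link' key) plus a list of row dicts;
##   the rel="next" selector and the CSS-string assumption for tSel are placeholders -
##   copy the real confParse from the link above instead ]
# def confParse(soup, tSel, defVal=None):
#     nxt = soup.select_one('a[rel="next"]')  # assumed: next page is linked via rel="next"
#     curPg = {'next_link': nxt.get('href') if nxt else None}
#     rows = soup.select(tSel)  # assumed: tSel is a CSS selector string for listing containers
#     return curPg, [{'text': r.get_text(strip=True)} for r in rows]
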
fetchPg = requests.get  # or cloudscraper.create_scraper().get  # use preferred function
def paginate_scrape(pg1Url, pgSel, maxPgs=None, fpBase='', reqFn=fetchPg, dReturn=False):
    pg_logs_fp = f'{fpBase}page_logs.csv'
    list_fp = f'{fpBase}listings.csv'

    pgUrl, pgNo, listings, pgLogs, lCt = pg1Url, 0, [], [], 0
    while pgUrl and (maxPgs is None or pgNo < maxPgs):
        pgNo += 1  # count the page about to be scraped
        pgReq = reqFn(pgUrl)  # fetch the current page with the supplied request function
        pSoup = BeautifulSoup(pgReq.content, 'html.parser')
        curPg, rList = confParse(pSoup, pgSel, None if callable(pgSel) else '')
        curUrl, pgUrl = pgUrl, curPg.get('next_link')  # remember current link, move on to next
        lCt += len(rList)
        rList = [{'from_page': pgNo, **r} for r in rList]  # tag listings with their page number
        curPg = {
            'page_no': pgNo, 'page_link': curUrl, **curPg,
            'request_status': f'{pgReq.status_code} {pgReq.reason}',
            'listings_extracted': len(rList)
        }

        if dReturn: listings, pgLogs = listings + rList, pgLogs + [curPg]
        print(f"\rpg{pgNo}: saving {len(rList)} listings [total:{lCt}]", end='')
        # overwrite with header on the first page, then append without repeating the header
        for d, f in [(rList, list_fp), ([curPg], pg_logs_fp)]:
            m, h = ('w', True) if pgNo == 1 else ('a', False)
            pd.DataFrame(d).to_csv(f, mode=m, index=False, header=h)
    print(f'\nSaved to {pg_logs_fp!r} and {list_fp!r}')
    if dReturn: return pgLogs, listings

######## INPUTS ############
#-> REQUIRED: link to first page (pg1Url) and selectors (pgSel)
#### [ selectors (pgSel) should be prepared as the tSel parameter of confParse ]
#### [ pgSel must be designed so that ALL rows of curPg/rList have the same columns ]
#### [ variations in column names and/or order can lead to mis-aligned CSV rows ]
#-> maxPgs: page limit
#### [ if set to None, scraping continues until a next-page link can no longer be found ]
#-> fpBase: prefix to add to 'page_logs.csv'/'listings.csv'
#### [ can include the path to a folder ]
#-> reqFn: a function that takes a URL and returns a requests.Response object
#### [ can write a wrapper to set headers etc., or just use cloudscraper/HTMLSession/etc.; see the sketch after this section ]
#-> dReturn: if set to True, will collect data and return 2 lists of dictionaries

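######## EXAMPLE reqFn WRAPPER [hypothetical] ########
## [ a minimal sketch, assuming only custom headers are needed; the session name,
##   function name and header values below are placeholders, not required settings ]
hdrSess = requests.Session()
hdrSess.headers.update({'User-Agent': 'Mozilla/5.0', 'Accept-Language': 'en-US,en;q=0.9'})
def fetchPg_hdrs(url): return hdrSess.get(url, timeout=30)  # pass as reqFn=fetchPg_hdrs
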
######## OUTPUTS ###########
## output: two CSV files for the pages scraped and the listings extracted #######
## if dReturn is set, two lists of dictionaries (pgLogs, listings) will also be returned
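
######## EXAMPLE USAGE [hypothetical] ########
## [ a minimal sketch - the URL, selector variable and file prefix below are made up,
##   and pgSel must follow the tSel format expected by the confParse you copied in ]
# demoSel = ...  # prepare as confParse's tSel parameter
# pgLogs, listings = paginate_scrape(
#     'https://example.com/listings?page=1', demoSel,
#     maxPgs=3, fpBase='demo_', dReturn=True)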