# Pagination example - with TripAdvisor restaurants

## scraping paginated lists: tripadvisor restaurants example ##
## output at https://docs.google.com/spreadsheets/d/14-rVFWU0QYCqk2tPX33JEUoWt8NQS-5E-wYso5QuqY4

### REQUIRED FOR BOTH EXAMPLES:
from requests_html import HTMLSession # [required for paginate_scrape INPUT reqFn]
from bs4 import BeautifulSoup # [required for paginate_scrape]
import pandas as pd # [required for paginate_scrape]
# confParse requirements # PASTE FROM https://pastebin.com/c0TjDmNE
# def confParse... # [required for paginate_scrape] # ALSO FROM https://pastebin.com/c0TjDmNE
# def paginate_scrape... # PASTE FROM https://pastebin.com/d21q6mZ2

# Shape of a TripAdvisor restaurant-list URL (fill in the blanks for another region):
# url = 'https://www.tripadvisor.com/Restaurants-g_______-______.html'
# First results page; passed as pg1Url to paginate_scrape in both examples.
url = (
    'https://www.tripadvisor.com'
    '/Restaurants-g3676471-South_Region.html'
)
###############################################################################################


############################### EXAMPLE 1:  EXTRACT FROM SCRIPT ###############################
### Only Required for THIS example:
import json # required for jsonload_from_script
# def jsonload_from_script... # PASTE FROM https://pastebin.com/ewAn58Zt
# def {yield,get}_by_nested_keys # PASTE FROM https://pastebin.com/LsZUiRW2

'''NOTE: confParse is rather superfluous here; you could edit paginate_scrape
   to use tripadvisor_script_data directly instead.'''
def tripadvisor_script_data(pgSoup):
    """Collect page info and restaurant listings from one results page.

    The data is read from the ``window.__WEB_CONTEXT__`` script variable
    (loaded through ``jsonload_from_script``); only the next-page link is
    taken from the HTML itself.

    Args:
        pgSoup: parsed page that supports ``select_one`` (presumably a
            BeautifulSoup object -- confirm against paginate_scrape).

    Returns:
        A ``(pgInfo, listings)`` tuple.
        pgInfo is a dict with the keys 'header', 'results_count',
        'currency', one 'filter.<key>' entry per item of the page's
        restaurant filter dict (none if that value is not a dict), and
        'next_link' (absolute URL of the next page, or None when the page
        has no next link).
        listings is whatever ``get_by_nested_keys`` finds under
        data -> restaurants with ``dType=list, find_all=True``
        (presumably the restaurant records -- confirm against that helper).
    """
    # Local import keeps this example self-contained.
    from urllib.parse import urljoin

    def quote_manifest_key(text):
        # Quote only the first 'pageManifest'; presumably that key is
        # unquoted in the script and would break JSON parsing -- confirm.
        return text.replace('pageManifest', '"pageManifest"', 1)

    var1n = 'window.__WEB_CONTEXT__'
    wwcData = jsonload_from_script(pgSoup, var1n, prepFn=quote_manifest_key)

    lKeys = ['data', 'restaurants']
    listings = get_by_nested_keys(wwcData, *lKeys, dType=list, find_all=True)

    # Fall back to an empty dict so no 'filter.*' keys are produced when the
    # lookup yields anything other than a dict.
    pgFilter = get_by_nested_keys(wwcData, 'travelerInfo', 'restaurants')
    if not isinstance(pgFilter, dict):
        pgFilter = {}

    nl = pgSoup.select_one('a.next[data-page-number][href]')
    # urljoin (rather than plain concatenation) also copes with hrefs that
    # are already absolute or that lack a leading slash.
    next_link = urljoin('https://www.tripadvisor.com', nl['href']) if nl else None

    pgInfo = {
        'header': get_by_nested_keys(wwcData, 'descriptiveHeaderText'),
        'results_count': get_by_nested_keys(wwcData, 'listResultCount'),
        'currency': get_by_nested_keys(wwcData, 'preferences', 'currency'),
        **{f'filter.{k}': v for k, v in pgFilter.items()},
        'next_link': next_link,
    }
    return pgInfo, listings
###########################################################################


# Run example 1: start at `url` and let tripadvisor_script_data pull the data
# out of each page; 'tripadvisor_from_script-' is the prefix of the output
# file names (see the printed output below).
paginate_scrape(
    pg1Url=url,
    pgSel=tripadvisor_script_data,
    reqFn=HTMLSession().get,
    fpBase='tripadvisor_from_script-',
)
# printed output:
'''
pg5: saving 22 listings [total:142]
Saved to 'tripadvisor_from_script-page_logs.csv' and 'tripadvisor_from_script-listings.csv'
'''
###############################################################################################


############################ EXAMPLE 2: EXTRACT WITH CSS SELECTORS ############################
### Only Required for THIS example:
from urllib.parse import urljoin # required for href2url
def href2url(href, baseUrl='https://www.tripadvisor.com'):
    """Return href as an absolute URL, resolved against baseUrl.

    Thin wrapper over urllib.parse.urljoin with the TripAdvisor site root
    as the default base.
    """
    absolute_url = urljoin(baseUrl, href)
    return absolute_url

# Container selector shared by the three 'filter.*' entries below.
spSel = 'div.restaurant_availability_search'

# Page-level selector reference handed to paginate_scrape (example 2).
# Values are either a bare CSS selector or a tuple starting with
# (selector, attribute, ...); the exact tuple layout is defined by confParse
# -- TODO confirm there. href2url turns the relative next-page href into an
# absolute URL.
pg_sel_ref = {
    'next_link': (
        'a.next[data-page-number][href]', 'href', None, '', href2url,
    ),
    'page_number': 'span[data-page-number].pageNum.current',
    'filter.date': spSel + ' div:has(>div.date)+div.outer',
    'filter.time': (spSel + ' input[value][name="rsv_time"]', 'value'),
    'filter.guests': spSel + ' div.ppl_dropdown span.drop_down_value',
    '__selref__': True,
}

# Selector matching each restaurant entry inside the results list.
lSel = 'div[data-test-target="restaurants-list"] div[data-test$="_list_item"]'

# Per-listing selector reference handed to paginate_scrape (example 2).
# The key names ('tSel', 'listSel', 'tAttr', 'mFunc', '__apply2resultSet__')
# are interpreted by confParse -- see that helper for their exact semantics.
l_sel_ref = {
    'tSel': {
        # Part of the data-test attribute before the first underscore.
        'li_no': {
            'tAttr': 'data-test',
            'mFunc': (lambda attr: attr.split('_', 1)[0]),
        },
        # Drops everything up to the first '. ' in the text, if present.
        'name': {
            'tSel': ':scope>div>div+div span>a',
            'mFunc': (lambda text: text.split('. ', 1)[-1]),
        },
        # Joins the collected values with a bullet, leaving out 'Menu'.
        'details': {
            'listSel': (
                ':scope>div>div+div>div:first-child'
                '+div span>span:not(span>span>span)'
            ),
            '__apply2resultSet__': (
                lambda results: ' • '.join(
                    item for item in results if item != 'Menu')
            ),
        },
        # aria-label without its trailing ' bubbles'.
        'rating': {
            'tSel': 'svg[viewBox][aria-label$=" bubbles"]',
            'tAttr': 'aria-label',
            'mFunc': (lambda label: label[:-len(' bubbles')].strip()),
        },
        # aria-label without its trailing ' Winner'.
        'awards': {
            'listSel': 'div[aria-label$=" Winner"]',
            'tAttr': 'aria-label',
            'mFunc': (lambda label: label[:-len(' Winner')].strip()),
        },
        'review_snippets': {
            'listSel': ':scope>div>div+div>div:not([class]) span:not(span span)',
        },
        # Relative href made absolute by href2url.
        'restaurant_link': ('span>a[href]', 'href', None, '', href2url),
    },
    'listSel': lSel,
}
###########################################################################


# Run example 2: same start URL, but extraction is driven by the CSS selector
# references above; 'tripadvisor_by_selectors-' is the prefix of the output
# file names (see the printed output below).
paginate_scrape(
    pg1Url=url,
    pgSel={'__selref__': (pg_sel_ref, l_sel_ref)},
    reqFn=HTMLSession().get,
    fpBase='tripadvisor_by_selectors-',
)
# printed output:
'''
pg5: saving 22 listings [total:142]
Saved to 'tripadvisor_by_selectors-page_logs.csv' and 'tripadvisor_by_selectors-listings.csv'
'''
###############################################################################################