Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ## scraping paginated lists: tripadvisor restaurants example ##
- ## output at https://docs.google.com/spreadsheets/d/14-rVFWU0QYCqk2tPX33JEUoWt8NQS-5E-wYso5QuqY4
- ### REQUIRED FOR BOTH EXAMPLES:
- from requests_html import HTMLSession # [required for paginate_scrape INPUT reqFn]
- from bs4 import BeautifulSoup # [required for paginate_scrape]
- import pandas as pd # [required for paginate_scrape]
- # confParse requirements # PASTE FROM https://pastebin.com/c0TjDmNE
- # def confParse... # [required for paginate_scrape] # ALSO FROM https://pastebin.com/c0TjDmNE
- # def paginate_scrape... # PASTE FROM https://pastebin.com/d21q6mZ2
- # url = 'https://www.tripadvisor.com/Restaurants-g_______-______.html'
- url = 'https://www.tripadvisor.com/Restaurants-g3676471-South_Region.html'
- ###############################################################################################
- ############################### EXAMPLE 1: EXTRACT FROM SCRIPT ###############################
- ### Only Required for THIS example:
- import json # required for jsonload_from_script
- # def jsonload_from_script... # PASTE FROM https://pastebin.com/ewAn58Zt
- # def {yield,get}_by_nested_keys # PASTE FROM https://pastebin.com/LsZUiRW2
- '''NOTE: confParse is rather superfluous here; you could edit paginate_scrape
- to use tripadvisor_script_data directly instead.'''
def tripadvisor_script_data(pgSoup):
    """Pull page info and restaurant listings out of a results page's script data.

    The data is read from the ``window.__WEB_CONTEXT__`` script variable via
    ``jsonload_from_script`` and queried with ``get_by_nested_keys`` (both
    helpers are pasted in from elsewhere; their exact semantics are not
    visible here).

    Args:
        pgSoup: Parsed page; must support ``select_one`` with a CSS selector
            (presumably a BeautifulSoup object - confirm against caller).

    Returns:
        A ``(pgInfo, listings)`` pair. ``pgInfo`` is a dict with the keys
        ``header``, ``results_count``, ``currency``, one ``filter.<key>``
        entry per item of the page's restaurant filter, and ``next_link``
        (absolute URL of the next page, or ``None`` when there is no "next"
        anchor). ``listings`` is whatever ``get_by_nested_keys`` finds under
        ``data`` -> ``restaurants``.
    """
    def quote_manifest_key(script_text):
        # Quote the first bare ``pageManifest`` before loading; presumably
        # needed for the script text to parse as JSON - TODO confirm.
        return script_text.replace('pageManifest', '"pageManifest"', 1)

    web_context = jsonload_from_script(
        pgSoup, 'window.__WEB_CONTEXT__', prepFn=quote_manifest_key)

    listings = get_by_nested_keys(
        web_context, 'data', 'restaurants', dType=list, find_all=True)

    # Anything other than a dict is treated as "no filters".
    active_filters = get_by_nested_keys(
        web_context, 'travelerInfo', 'restaurants')
    if not isinstance(active_filters, dict):
        active_filters = {}

    next_anchor = pgSoup.select_one('a.next[data-page-number][href]')
    if next_anchor:
        next_link = 'https://www.tripadvisor.com' + next_anchor['href']
    else:
        next_link = None

    pgInfo = {
        'header': get_by_nested_keys(web_context, 'descriptiveHeaderText'),
        'results_count': get_by_nested_keys(web_context, 'listResultCount'),
        'currency': get_by_nested_keys(web_context, 'preferences', 'currency'),
    }
    for filter_name, filter_value in active_filters.items():
        pgInfo[f'filter.{filter_name}'] = filter_value
    pgInfo['next_link'] = next_link

    return pgInfo, listings
- ###########################################################################
# Example 1 run: start at `url` and let tripadvisor_script_data extract each
# page's data; fpBase is the prefix of the output file names.
paginate_scrape(
    pg1Url=url,
    pgSel=tripadvisor_script_data,
    reqFn=HTMLSession().get,
    fpBase='tripadvisor_from_script-'
)
- # printed output:
- '''
- pg5: saving 22 listings [total:142]
- Saved to 'tripadvisor_from_script-page_logs.csv' and 'tripadvisor_from_script-listings.csv'
- '''
- ###############################################################################################
- ############################ EXAMPLE 2: EXTRACT WITH CSS SELECTORS ############################
- ### Only Required for THIS example:
- from urllib.parse import urljoin # required for href2url
def href2url(href, baseUrl='https://www.tripadvisor.com'):
    """Resolve a (possibly relative) ``href`` against ``baseUrl``.

    Args:
        href: Link target as found in the page; may be relative, absolute,
            or missing (``None`` / empty string).
        baseUrl: URL that relative links are resolved against.

    Returns:
        The absolute URL, or ``None`` when ``href`` is missing or empty.
    """
    # urljoin() returns baseUrl itself for an empty href, which would make a
    # missing link look like a link to the site root; report "no link" instead.
    if not href:
        return None
    return urljoin(baseUrl, href)
# Selector prefix shared by the three filter.* entries below.
spSel = 'div.restaurant_availability_search'

# Page-level selector reference handed to paginate_scrape (parsed by
# confParse, which is defined elsewhere). Plain strings are CSS selectors;
# the tuple entries presumably carry (selector, attribute, ..., post-processing
# function) - TODO confirm against confParse.
pg_sel_ref = {
    'next_link': (
        'a.next[data-page-number][href]', 'href', None, '', href2url
    ),
    'page_number': 'span[data-page-number].pageNum.current',
    'filter.date': spSel + ' div:has(>div.date)+div.outer',
    'filter.time': (spSel + ' input[value][name="rsv_time"]', 'value'),
    'filter.guests': spSel + ' div.ppl_dropdown span.drop_down_value',
    '__selref__': True,
}
# Selector matching one element per restaurant listing on a results page.
lSel = 'div[data-test-target="restaurants-list"] div[data-test$="_list_item"]'

# Per-listing selector reference (parsed by confParse, defined elsewhere).
# Each entry under 'tSel' names an output field; the meaning of the
# 'tSel' / 'listSel' / 'tAttr' / 'mFunc' / '__apply2resultSet__' keys is
# defined by confParse - TODO confirm there.
l_sel_ref = {
    'tSel': {
        'li_no': {
            'tAttr': 'data-test',
            # keep only the part before the first underscore
            'mFunc': (lambda attr_val: attr_val.split('_')[0]),
        },
        'name': {
            'tSel': ':scope>div>div+div span>a',
            # drop everything up to and including the first '. '
            'mFunc': (lambda text: text.split('. ', 1)[-1]),
        },
        'details': {
            'listSel': (
                ':scope>div>div+div>div:first-child'
                '+div span>span:not(span>span>span)'
            ),
            # join all results except the literal 'Menu' entry
            '__apply2resultSet__': (
                lambda results: ' • '.join(
                    item for item in results if item != 'Menu')
            ),
        },
        'rating': {
            'tSel': 'svg[viewBox][aria-label$=" bubbles"]',
            'tAttr': 'aria-label',
            # cut the trailing ' bubbles' (8 characters) off the label
            'mFunc': (lambda label: label[:-len(' bubbles')].strip()),
        },
        'awards': {
            'listSel': 'div[aria-label$=" Winner"]',
            'tAttr': 'aria-label',
            # cut the trailing ' Winner' (7 characters) off the label
            'mFunc': (lambda label: label[:-len(' Winner')].strip()),
        },
        'review_snippets': {
            'listSel': (
                ':scope>div>div+div>div:not([class]) span:not(span span)'
            ),
        },
        'restaurant_link': ('span>a[href]', 'href', None, '', href2url),
    },
    'listSel': lSel,
}
- ###########################################################################
# Example 2 run: start at `url` and extract each page's data with the CSS
# selector references above; fpBase is the prefix of the output file names.
paginate_scrape(
    pg1Url=url,
    pgSel={'__selref__': (pg_sel_ref, l_sel_ref)},
    reqFn=HTMLSession().get,
    fpBase='tripadvisor_by_selectors-'
)
- # printed output:
- '''
- pg5: saving 22 listings [total:142]
- Saved to 'tripadvisor_by_selectors-page_logs.csv' and 'tripadvisor_by_selectors-listings.csv'
- '''
- ###############################################################################################
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement