Advertisement
Try95th

pagination example - with tripadvisor restaurants

Mar 19th, 2023 (edited)
186
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.90 KB | None | 0 0
  1. ## scraping paginated lists: tripadvisor restaurants example ##
  2. ## output at https://docs.google.com/spreadsheets/d/14-rVFWU0QYCqk2tPX33JEUoWt8NQS-5E-wYso5QuqY4
  3.  
  4. ### REQUIRED FOR BOTH EXAMPLES:
  5. from requests_html import HTMLSession # [required for paginate_scrape INPUT reqFn]
  6. from bs4 import BeautifulSoup # [required for paginate_scrape]
  7. import pandas as pd # [required for paginate_scrape]
  8. # confParse requirements # PASTE FROM https://pastebin.com/c0TjDmNE
  9. # def confParse... # [required for paginate_scrape] # ALSO FROM https://pastebin.com/c0TjDmNE
  10. # def paginate_scrape... # PASTE FROM https://pastebin.com/d21q6mZ2
  11.  
  12. # url = 'https://www.tripadvisor.com/Restaurants-g_______-______.html'
  13. url = 'https://www.tripadvisor.com/Restaurants-g3676471-South_Region.html'
  14. ###############################################################################################
  15.  
  16.  
  17. ############################### EXAMPLE 1:  EXTRACT FROM SCRIPT ###############################
  18. ### Only Required for THIS example:
  19. import json # required for jsonload_from_script
  20. # def jsonload_from_script... # PASTE FROM https://pastebin.com/ewAn58Zt
  21. # def {yield,get}_by_nested_keys # PASTE FROM https://pastebin.com/LsZUiRW2
  22.  
  23. '''NOTE: confParse is rather superfluous here; you could edit paginate_scrape
  24.   to use tripadvisor_script_data directly instead. '''
  25. def tripadvisor_script_data(pgSoup):
  26.     var1n = 'window.__WEB_CONTEXT__'
  27.     rFn = lambda x: x.replace('pageManifest','"pageManifest"',1)
  28.     wwcData = jsonload_from_script(pgSoup, var1n, prepFn=rFn)
  29.  
  30.     lKeys = ['data', 'restaurants']
  31.     listings = get_by_nested_keys(wwcData, *lKeys, dType=list, find_all=True)
  32.    
  33.     pgFilter = get_by_nested_keys(wwcData,'travelerInfo','restaurants')
  34.     pgFilter = pgFilter if isinstance(pgFilter, dict) else {}
  35.     nl = pgSoup.select_one('a.next[data-page-number][href]')
  36.     pgInfo = {
  37.         'header': get_by_nested_keys(wwcData, 'descriptiveHeaderText'),
  38.         'results_count': get_by_nested_keys(wwcData, 'listResultCount'),
  39.         'currency':  get_by_nested_keys(wwcData, 'preferences', 'currency'),
  40.         **{f'filter.{k}': v for k, v in pgFilter.items()},
  41.         'next_link': ('https://www.tripadvisor.com'+nl['href']) if nl else None
  42.     }
  43.     return pgInfo, listings
  44. ###########################################################################
  45.  
  46.  
  47. paginate_scrape(pg1Url=url, pgSel=tripadvisor_script_data,
  48.                 reqFn=HTMLSession().get, fpBase='tripadvisor_from_script-')
  49. # printed output:
  50. '''
  51. pg5: saving 22 listings [total:142]
  52. Saved to 'tripadvisor_from_script-page_logs.csv' and 'tripadvisor_from_script-listings.csv'
  53. '''
  54. ###############################################################################################
  55.  
  56.  
  57. ############################ EXAMPLE 2: EXTRACT WITH CSS SELECTORS ############################
  58. ### Only Required for THIS example:
  59. from urllib.parse import urljoin # required for href2url
  60. def href2url(href, baseUrl='https://www.tripadvisor.com'):
  61.     return urljoin(baseUrl, href)
  62.  
  63. spSel = 'div.restaurant_availability_search'
  64. pg_sel_ref = {
  65.     'next_link': ('a.next[data-page-number][href]', 'href', None,'', href2url),
  66.     'page_number': 'span[data-page-number].pageNum.current',
  67.     'filter.date': f'{spSel} div:has(>div.date)+div.outer',
  68.     'filter.time': (f'{spSel} input[value][name="rsv_time"]', 'value'),
  69.     'filter.guests': f'{spSel} div.ppl_dropdown span.drop_down_value',
  70.     '__selref__': True
  71. }
  72.  
  73. lSel = 'div[data-test-target="restaurants-list"] div[data-test$="_list_item"]'
  74. l_sel_ref = {'tSel': {
  75.     'li_no': {'tAttr':'data-test', 'mFunc':(lambda dt: dt.split('_')[0])},
  76.     'name': {
  77.         'tSel': ':scope>div>div+div span>a',
  78.         'mFunc':(lambda t: t.split('. ',1)[-1])
  79.     }, 'details': {
  80.         'listSel': ':scope>div>div+div>div:first-child'
  81.                   +'+div span>span:not(span>span>span)',
  82.         '__apply2resultSet__': (
  83.             lambda rs: ' • '.join(r for r in rs if r!='Menu'))
  84.     }, 'rating': {
  85.         'tSel': 'svg[viewBox][aria-label$=" bubbles"]',
  86.         'tAttr': 'aria-label', 'mFunc': (lambda a: a[:-8].strip())
  87.     }, 'awards': {
  88.         'listSel': 'div[aria-label$=" Winner"]',
  89.         'tAttr': 'aria-label', 'mFunc': (lambda a: a[:-7].strip())
  90.     }, 'review_snippets': {
  91.         'listSel':':scope>div>div+div>div:not([class]) span:not(span span)'
  92.     }, 'restaurant_link': ('span>a[href]', 'href', None,'', href2url)
  93. }, 'listSel': lSel}
  94. ###########################################################################
  95.  
  96.  
  97. paginate_scrape(pg1Url=url, pgSel={'__selref__': (pg_sel_ref, l_sel_ref)},
  98.                 reqFn=HTMLSession().get, fpBase='tripadvisor_by_selectors-')
  99. # printed output:
  100. '''
  101. pg5: saving 22 listings [total:142]
  102. Saved to 'tripadvisor_by_selectors-page_logs.csv' and 'tripadvisor_by_selectors-listings.csv'
  103. '''
  104. ###############################################################################################
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement