Try95th

scrape_untappd_menu for so_q_75077179

Jan 10th, 2023 (edited)
157
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.77 KB | None | 0 0
  1. ## for https://stackoverflow.com/q/75077179/6146136
  2. ## and https://stackoverflow.com/q/75004108/6146136
  3.  
  4. ## IF YOU ARE A MEMBER, login and copy cookies to pass to scrape_untappd_menu  ##
  5. ########## (use https://curlconverter.com/ to get cookies dictionary) ###########
  6.  
  7.  
  8. import requests
  9. from bs4 import BeautifulSoup
  10.  
  11. ################### extract a single value from a bs4 Tag ###################
  12. def extract_from_bs4tag(tagSoup, selector, targetAttr=''):
  13.     sel, ta = str(selector).strip(), str(targetAttr).strip()
  14.     el = tagSoup.select_one(sel) if sel else tagSoup
  15.     if el: return el.get(ta) if ta else el.get_text(' ').strip()
  16.     # return None # <-- happens by default when function end is reached
  17.  
  18. ################### extract multiple values from a bs4Tag ###################  
  19. def selectForList(tagSoup, selectors, printList=False):
  20.     if isinstance(selectors, dict):
  21.         return dict(zip(selectors.keys(), selectForList(
  22.             tagSoup, selectors.values(), printList)))
  23.    
  24.     returnList, isv = [], printList
  25.     for s in selectors:
  26.         sel, ta = s[:2] if isinstance(s,tuple) and len(s)>1 else (s, '')
  27.         if ta == '"staticVal"': returnList.append(sel)
  28.         else: returnList.append(extract_from_bs4tag(tagSoup, sel, ta))
  29.            
  30.     returnList = [' '.join(r) if type(r) == list else r for r in returnList]
  31.     if isv and not isinstance(isv,str): print(returnList)
  32.     if isinstance(isv,str): print(*returnList, sep=isv)
  33.     return returnList
  34.  
  35. ############ just for  returning and printing a message with one statement ############
  36. def vRet(toPrint, toReturn=[]):
  37.     print(toPrint)
  38.     return toReturn
  39.  
  40. #################################### MAIN FUNCTION ####################################
  41. def scrape_untappd_menu(umLink, cookies={}, includeVenue=False):
  42.     rootUrl,addSlash = 'https://untappd.com','' if umLink[:1]=='/' else '/'
  43.     if not umLink.startswith(rootUrl):umLink=f'{rootUrl}{addSlash}{umLink}'
  44.     selRef = {
  45.       'name': 'div.beer-details>h5>a',
  46.       'description': 'div.beer-details>h5>em',
  47.       'rating': ('div[data-rating]', 'data-rating'),
  48.       'label': 'div.beer-label',
  49.       'label_updated':('div.beer-label>span[data-update-at]','data-update-at'),
  50.       'about': 'div.beer-details>h6>span',
  51.       'link': ('a[href*="/b/"]', 'href'),
  52.       'brewery': 'a[data-href=":brewery"]',
  53.       'brewery_link': ('a[data-href=":brewery"][href]', 'href'),
  54.       'logo_src': ('img[src*="/beer_logos/"]', 'src')
  55.     }
  56.  
  57.     ## fetch and parse page html ##
  58.     pgResp = requests.get(umLink, headers={
  59.         'user-agent': 'Mozilla/5.0'}, cookies=cookies)
  60.     try: pgResp.raise_for_status()
  61.     except Exception as e: return vRet(f'failed2scrape:{type(e)} - {e}')
  62.     pgSoup = BeautifulSoup(pgResp.content)
  63.  
  64.     ## get some venue name and id ##
  65.     vName, vn_h2, venuId = selectForList(pgSoup, [
  66.         'div.venue-name>h1', 'div.venue-name>h2',
  67.         ('*[data-venue-id]', 'data-venue-id')])
  68.     if not vName: vName =  umLink.split('/v/', 1)[-1].replace('-', ' ')
  69.     if vn_h2: vName += f' [{vn_h2}]'
  70.     if not venuId: print(f"could not find '*[data-venue-id]'")
  71.    
  72.     ## get menu items from page ##
  73.     mSel = 'ul.menu-section-list>li>div.beer-info'    
  74.     mList = [selectForList(li, selRef) for li in pgSoup.select(mSel)]
  75.  
  76.     ## find moremenu button ##
  77.     fetchMore, lmmCt = True, 1
  78.     mmBtn_sel = [
  79.         'a[data-href=":moremenu"][data-section-id]',
  80.         'a[data-href=":moremenusection"][data-menu-id]'
  81.     ]
  82.     mmBtn, msBtn = pgSoup.select_one(mmBtn_sel[0]), False
  83.     if not mmBtn:
  84.         mmBtn, msBtn = pgSoup.select_one(mmBtn_sel[1]), True
  85.         if not mmBtn:
  86.             fetchMore = False
  87.             print(f"could not find '{', '.join(mmBtn_sel)}'")
  88.    
  89.     ## load more ##
  90.     mSel = 'li>div.beer-info'  
  91.     sectCt = len(pgSoup.select('.section-area .menu-section'))
  92.     sectId = mmBtn.get('data-section-id') if mmBtn else None
  93.     menuId = mmBtn.get('data-menu-id') if mmBtn else None
  94.     while fetchMore:
  95.         lmmUrl = f'/venue/more_menu/{venuId}/{len(mList)}?section_id={sectId}'
  96.         if msBtn:
  97.             lmmUrl = f'/venue/more_menu_section/{venuId}/{sectCt}'
  98.             lmmUrl += f'?menu_id={menuId}'
  99.         print(f'[{lmmCt}] loading more from {rootUrl+lmmUrl}', end='')
  100.         lmReq = requests.get(f'{rootUrl}{lmmUrl}', headers={
  101.             'accept': 'application/json',
  102.             'user-agent': 'Mozilla/5.0',
  103.             'x-requested-with': 'XMLHttpRequest'
  104.         }, cookies=cookies)
  105.         try:
  106.             lmReq.raise_for_status()
  107.             jData = lmReq.json()
  108.             fetchMore = jData['count']
  109.             lmSoup = BeautifulSoup(jData['view'])
  110.             if lmmUrl: sectCt += fetchMore
  111.             print(f'\r[{lmmCt}] loaded {fetchMore} more from {rootUrl+lmmUrl}')
  112.         except Exception as e:
  113.             return vRet(f'\n{type(e)} - {e}', mList)
  114.        
  115.         ## get more menu items from html string instide json response ##
  116.         mList += [selectForList(li, selRef) for li in lmSoup.select(mSel)]
  117.         lmmCt += 1
  118.    
  119.     ## some cleanup [and maybe add venue name,id,link] ##
  120.     for mi, m in enumerate(mList):
  121.         m['about'] = m['about'].replace(m['brewery'], '').strip(' \u2022')
  122.         for k in ['link', 'brewery_link', 'logo_src']:
  123.             if str(m.get(k))[:1] == '/': mList[mi][k] = f'{rootUrl}{m[k]}'
  124.        
  125.         mDets = {'venueId': venuId, 'venue': vName} if includeVenue else {}
  126.         for k, v in mList[mi].items():
  127.             if k=='about' and v: v = v.strip('\u2022').strip().strip('\u2022')
  128.             mDets[k] = v.strip() if isinstance(v, str) else v
  129.         if includeVenue: mDets['venue_link'] = umLink
  130.         mList[mi] = mDets
  131.    
  132.     return vRet(f'{len(mList)} menu items from {umLink}', mList)
Advertisement
Add Comment
Please, Sign In to add comment