Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ## for https://stackoverflow.com/q/75077179/6146136
- ## and https://stackoverflow.com/q/75004108/6146136
- ## IF YOU ARE A MEMBER, login and copy cookies to pass to scrape_untappd_menu ##
- ########## (use https://curlconverter.com/ to get cookies dictionary) ###########
- import requests
- from bs4 import BeautifulSoup
- ################### extract a single value from a bs4 Tag ###################
def extract_from_bs4tag(tagSoup, selector, targetAttr=''):
    """Pull a single value out of a bs4 Tag.

    With a non-empty *selector*, look up the first matching descendant;
    with an empty one, use *tagSoup* itself.  Return the *targetAttr*
    attribute when given, otherwise the node's stripped text (children
    joined with spaces).  Returns None when nothing matches.
    """
    css = str(selector).strip()
    attr = str(targetAttr).strip()
    node = tagSoup.select_one(css) if css else tagSoup
    if not node:
        return None
    if attr:
        return node.get(attr)
    return node.get_text(' ').strip()
- ################### extract multiple values from a bs4Tag ###################
def selectForList(tagSoup, selectors, printList=False):
    """Extract several values from *tagSoup* with one call.

    selectors: an iterable whose items are each one of
      - a CSS selector string (extract stripped text),
      - a (selector, attribute) tuple (extract that attribute), or
      - a (value, '"staticVal"') tuple (pass *value* through untouched);
      or a dict mapping output keys to any of the above, in which case a
      dict with the same keys is returned instead of a list.
    printList: truthy -> also print the result list; if it is a string,
      print the bare values using it as the separator instead.

    Returns the list (or dict) of extracted values; list-valued results
    (e.g. multi-valued attributes) are flattened to space-joined strings.
    """
    if isinstance(selectors, dict):
        # Recurse on the values, then re-attach the original keys.
        return dict(zip(selectors.keys(), selectForList(
            tagSoup, selectors.values(), printList)))
    returnList = []
    for s in selectors:
        if isinstance(s, tuple) and len(s) > 1:
            sel, ta = s[:2]
        else:
            sel, ta = s, ''
        if ta == '"staticVal"':
            returnList.append(sel)  # literal pass-through, no lookup
        else:
            returnList.append(extract_from_bs4tag(tagSoup, sel, ta))
    # isinstance (not type(...) == list) so list subclasses flatten too
    returnList = [' '.join(r) if isinstance(r, list) else r for r in returnList]
    if isinstance(printList, str):
        print(*returnList, sep=printList)
    elif printList:
        print(returnList)
    return returnList
- ############ just for returning and printing a message with one statement ############
def vRet(toPrint, toReturn=None):
    """Print *toPrint* and return *toReturn* (an empty list by default).

    Lets callers report a status message and return a value in a single
    statement, e.g. ``return vRet('failed...', mList)``.
    """
    print(toPrint)
    # None replaces the old mutable default [] so every call gets a
    # fresh list rather than one shared module-level object.
    return [] if toReturn is None else toReturn
- #################################### MAIN FUNCTION ####################################
def scrape_untappd_menu(umLink, cookies=None, includeVenue=False):
    """Scrape every menu item from an untappd.com venue page.

    Fetches the venue page, then follows the "show more" pagination
    (either the per-section ':moremenu' endpoint or the whole-menu
    ':moremenusection' endpoint) until the server reports no more items.

    Args:
        umLink: full venue URL or site-relative path ('/v/...'); the
            'https://untappd.com' prefix is prepended when missing.
        cookies: optional cookies dict for a logged-in session (build one
            via https://curlconverter.com/); some menus need it.
        includeVenue: when True, add venueId/venue/venue_link to each item.

    Returns:
        list[dict]: one dict per menu item, keyed as in selRef below.
        An empty list on a failed initial fetch; the items collected so
        far if a pagination request fails. Progress is printed throughout.
    """
    cookies = {} if cookies is None else cookies  # no shared mutable default
    ## normalize to an absolute untappd.com URL ##
    rootUrl, addSlash = 'https://untappd.com', '' if umLink[:1] == '/' else '/'
    if not umLink.startswith(rootUrl): umLink = f'{rootUrl}{addSlash}{umLink}'
    ## output field -> CSS selector [, attribute] map (see selectForList) ##
    selRef = {
        'name': 'div.beer-details>h5>a',
        'description': 'div.beer-details>h5>em',
        'rating': ('div[data-rating]', 'data-rating'),
        'label': 'div.beer-label',
        'label_updated': ('div.beer-label>span[data-update-at]', 'data-update-at'),
        'about': 'div.beer-details>h6>span',
        'link': ('a[href*="/b/"]', 'href'),
        'brewery': 'a[data-href=":brewery"]',
        'brewery_link': ('a[data-href=":brewery"][href]', 'href'),
        'logo_src': ('img[src*="/beer_logos/"]', 'src')
    }
    ## fetch and parse page html ##
    pgResp = requests.get(umLink, headers={
        'user-agent': 'Mozilla/5.0'}, cookies=cookies)
    try: pgResp.raise_for_status()
    except Exception as e: return vRet(f'failed2scrape:{type(e)} - {e}')
    # explicit parser: silences bs4's GuessedAtParserWarning and keeps
    # parsing identical regardless of which optional parsers are installed
    pgSoup = BeautifulSoup(pgResp.content, 'html.parser')
    ## get venue name and id ##
    vName, vn_h2, venuId = selectForList(pgSoup, [
        'div.venue-name>h1', 'div.venue-name>h2',
        ('*[data-venue-id]', 'data-venue-id')])
    if not vName: vName = umLink.split('/v/', 1)[-1].replace('-', ' ')
    if vn_h2: vName += f' [{vn_h2}]'
    if not venuId: print("could not find '*[data-venue-id]'")
    ## get menu items already rendered on the page ##
    mSel = 'ul.menu-section-list>li>div.beer-info'
    mList = [selectForList(li, selRef) for li in pgSoup.select(mSel)]
    ## find the "show more" button to decide which pagination API to use ##
    fetchMore, lmmCt = True, 1
    mmBtn_sel = [
        'a[data-href=":moremenu"][data-section-id]',
        'a[data-href=":moremenusection"][data-menu-id]'
    ]
    mmBtn, msBtn = pgSoup.select_one(mmBtn_sel[0]), False
    if not mmBtn:
        mmBtn, msBtn = pgSoup.select_one(mmBtn_sel[1]), True
    if not mmBtn:
        fetchMore = False
        print(f"could not find '{', '.join(mmBtn_sel)}'")
    ## paginate ##
    mSel = 'li>div.beer-info'  # items inside the returned html fragments
    sectCt = len(pgSoup.select('.section-area .menu-section'))
    sectId = mmBtn.get('data-section-id') if mmBtn else None
    menuId = mmBtn.get('data-menu-id') if mmBtn else None
    while fetchMore:
        lmmUrl = f'/venue/more_menu/{venuId}/{len(mList)}?section_id={sectId}'
        if msBtn:
            lmmUrl = f'/venue/more_menu_section/{venuId}/{sectCt}'
            lmmUrl += f'?menu_id={menuId}'
        print(f'[{lmmCt}] loading more from {rootUrl+lmmUrl}', end='')
        lmReq = requests.get(f'{rootUrl}{lmmUrl}', headers={
            'accept': 'application/json',
            'user-agent': 'Mozilla/5.0',
            'x-requested-with': 'XMLHttpRequest'
        }, cookies=cookies)
        try:
            lmReq.raise_for_status()
            jData = lmReq.json()
            fetchMore = jData['count']  # a 0 count ends the loop
            lmSoup = BeautifulSoup(jData['view'], 'html.parser')
            # sectCt is the section offset used only by the
            # ':moremenusection' URL; the original gated this on
            # `if lmmUrl:`, which is always true
            if msBtn: sectCt += fetchMore
            print(f'\r[{lmmCt}] loaded {fetchMore} more from {rootUrl+lmmUrl}')
        except Exception as e:
            return vRet(f'\n{type(e)} - {e}', mList)
        ## collect items from the html fragment inside the json response ##
        mList += [selectForList(li, selRef) for li in lmSoup.select(mSel)]
        lmmCt += 1
    ## cleanup [and maybe add venue name, id and link] ##
    for mi, m in enumerate(mList):
        # 'about' repeats the brewery name - strip it plus bullet marks;
        # guarded so missing/None fields no longer raise mid-cleanup
        if m.get('about') and m.get('brewery'):
            m['about'] = m['about'].replace(m['brewery'], '').strip(' \u2022')
        for k in ['link', 'brewery_link', 'logo_src']:
            # site-relative hrefs/srcs -> absolute URLs
            if str(m.get(k))[:1] == '/': mList[mi][k] = f'{rootUrl}{m[k]}'
        mDets = {'venueId': venuId, 'venue': vName} if includeVenue else {}
        for k, v in mList[mi].items():
            if k == 'about' and v: v = v.strip('\u2022').strip().strip('\u2022')
            mDets[k] = v.strip() if isinstance(v, str) else v
        if includeVenue: mDets['venue_link'] = umLink
        mList[mi] = mDets
    return vRet(f'{len(mList)} menu items from {umLink}', mList)
Advertisement
Add Comment
Please, Sign In to add comment