Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ## for https://stackoverflow.com/q/75077179/6146136
- ## and https://stackoverflow.com/q/75004108/6146136
- ## IF YOU ARE A MEMBER, login and copy cookies to pass to scrape_untappd_menu ##
- ########## (use https://curlconverter.com/ to get cookies dictionary) ###########
- import requests
- from bs4 import BeautifulSoup
- ################### extract a single value from a bs4 Tag ###################
def extract_from_bs4tag(tagSoup, selector, targetAttr=''):
    """Pull a single value out of a bs4 Tag.

    With a non-empty *selector*, look up the first matching descendant;
    with an empty one, use *tagSoup* itself.  Return the *targetAttr*
    attribute when given, otherwise the node's stripped text (children
    joined with spaces).  Returns None when nothing matches.
    """
    css = str(selector).strip()
    attr = str(targetAttr).strip()
    node = tagSoup.select_one(css) if css else tagSoup
    if not node:
        return None
    if attr:
        return node.get(attr)
    return node.get_text(' ').strip()
- ################### extract multiple values from a bs4Tag ###################
def selectForList(tagSoup, selectors, printList=False):
    """Extract several values from *tagSoup* with one call.

    selectors: an iterable whose items are each one of
      - a CSS selector string (extract stripped text),
      - a (selector, attribute) tuple (extract that attribute), or
      - a (value, '"staticVal"') tuple (pass *value* through untouched);
      or a dict mapping output keys to any of the above, in which case a
      dict with the same keys is returned instead of a list.
    printList: truthy -> also print the result list; if it is a string,
      print the bare values using it as the separator instead.

    Returns the list (or dict) of extracted values; list-valued results
    (e.g. multi-valued attributes) are flattened to space-joined strings.
    """
    if isinstance(selectors, dict):
        # Recurse on the values, then re-attach the original keys.
        return dict(zip(selectors.keys(), selectForList(
            tagSoup, selectors.values(), printList)))
    returnList = []
    for s in selectors:
        if isinstance(s, tuple) and len(s) > 1:
            sel, ta = s[:2]
        else:
            sel, ta = s, ''
        if ta == '"staticVal"':
            returnList.append(sel)  # literal pass-through, no lookup
        else:
            returnList.append(extract_from_bs4tag(tagSoup, sel, ta))
    # isinstance (not type(...) == list) so list subclasses flatten too
    returnList = [' '.join(r) if isinstance(r, list) else r for r in returnList]
    if isinstance(printList, str):
        print(*returnList, sep=printList)
    elif printList:
        print(returnList)
    return returnList
- ############ just for returning and printing a message with one statement ############
def vRet(toPrint, toReturn=None):
    """Print *toPrint* and return *toReturn* (an empty list by default).

    Lets callers report a status message and return a value in a single
    statement, e.g. ``return vRet('failed...', mList)``.
    """
    print(toPrint)
    # None replaces the old mutable default [] so every call gets a
    # fresh list rather than one shared module-level object.
    return [] if toReturn is None else toReturn
- #################################### MAIN FUNCTION ####################################
def scrape_untappd_menu(umLink, cookies=None, includeVenue=False):
    """Scrape every menu item from an untappd.com venue page.

    Fetches the venue page, then follows the "show more" pagination
    (either the per-section ':moremenu' endpoint or the whole-menu
    ':moremenusection' endpoint) until the server reports no more items.

    Args:
        umLink: full venue URL or site-relative path ('/v/...'); the
            'https://untappd.com' prefix is prepended when missing.
        cookies: optional cookies dict for a logged-in session (build one
            via https://curlconverter.com/); some menus need it.
        includeVenue: when True, add venueId/venue/venue_link to each item.

    Returns:
        list[dict]: one dict per menu item, keyed as in selRef below.
        An empty list on a failed initial fetch; the items collected so
        far if a pagination request fails. Progress is printed throughout.
    """
    cookies = {} if cookies is None else cookies  # no shared mutable default
    ## normalize to an absolute untappd.com URL ##
    rootUrl, addSlash = 'https://untappd.com', '' if umLink[:1] == '/' else '/'
    if not umLink.startswith(rootUrl): umLink = f'{rootUrl}{addSlash}{umLink}'
    ## output field -> CSS selector [, attribute] map (see selectForList) ##
    selRef = {
        'name': 'div.beer-details>h5>a',
        'description': 'div.beer-details>h5>em',
        'rating': ('div[data-rating]', 'data-rating'),
        'label': 'div.beer-label',
        'label_updated': ('div.beer-label>span[data-update-at]', 'data-update-at'),
        'about': 'div.beer-details>h6>span',
        'link': ('a[href*="/b/"]', 'href'),
        'brewery': 'a[data-href=":brewery"]',
        'brewery_link': ('a[data-href=":brewery"][href]', 'href'),
        'logo_src': ('img[src*="/beer_logos/"]', 'src')
    }
    ## fetch and parse page html ##
    pgResp = requests.get(umLink, headers={
        'user-agent': 'Mozilla/5.0'}, cookies=cookies)
    try: pgResp.raise_for_status()
    except Exception as e: return vRet(f'failed2scrape:{type(e)} - {e}')
    # explicit parser: silences bs4's GuessedAtParserWarning and keeps
    # parsing identical regardless of which optional parsers are installed
    pgSoup = BeautifulSoup(pgResp.content, 'html.parser')
    ## get venue name and id ##
    vName, vn_h2, venuId = selectForList(pgSoup, [
        'div.venue-name>h1', 'div.venue-name>h2',
        ('*[data-venue-id]', 'data-venue-id')])
    if not vName: vName = umLink.split('/v/', 1)[-1].replace('-', ' ')
    if vn_h2: vName += f' [{vn_h2}]'
    if not venuId: print("could not find '*[data-venue-id]'")
    ## get menu items already rendered on the page ##
    mSel = 'ul.menu-section-list>li>div.beer-info'
    mList = [selectForList(li, selRef) for li in pgSoup.select(mSel)]
    ## find the "show more" button to decide which pagination API to use ##
    fetchMore, lmmCt = True, 1
    mmBtn_sel = [
        'a[data-href=":moremenu"][data-section-id]',
        'a[data-href=":moremenusection"][data-menu-id]'
    ]
    mmBtn, msBtn = pgSoup.select_one(mmBtn_sel[0]), False
    if not mmBtn:
        mmBtn, msBtn = pgSoup.select_one(mmBtn_sel[1]), True
    if not mmBtn:
        fetchMore = False
        print(f"could not find '{', '.join(mmBtn_sel)}'")
    ## paginate ##
    mSel = 'li>div.beer-info'  # items inside the returned html fragments
    sectCt = len(pgSoup.select('.section-area .menu-section'))
    sectId = mmBtn.get('data-section-id') if mmBtn else None
    menuId = mmBtn.get('data-menu-id') if mmBtn else None
    while fetchMore:
        lmmUrl = f'/venue/more_menu/{venuId}/{len(mList)}?section_id={sectId}'
        if msBtn:
            lmmUrl = f'/venue/more_menu_section/{venuId}/{sectCt}'
            lmmUrl += f'?menu_id={menuId}'
        print(f'[{lmmCt}] loading more from {rootUrl+lmmUrl}', end='')
        lmReq = requests.get(f'{rootUrl}{lmmUrl}', headers={
            'accept': 'application/json',
            'user-agent': 'Mozilla/5.0',
            'x-requested-with': 'XMLHttpRequest'
        }, cookies=cookies)
        try:
            lmReq.raise_for_status()
            jData = lmReq.json()
            fetchMore = jData['count']  # a 0 count ends the loop
            lmSoup = BeautifulSoup(jData['view'], 'html.parser')
            # sectCt is the section offset used only by the
            # ':moremenusection' URL; the original gated this on
            # `if lmmUrl:`, which is always true
            if msBtn: sectCt += fetchMore
            print(f'\r[{lmmCt}] loaded {fetchMore} more from {rootUrl+lmmUrl}')
        except Exception as e:
            return vRet(f'\n{type(e)} - {e}', mList)
        ## collect items from the html fragment inside the json response ##
        mList += [selectForList(li, selRef) for li in lmSoup.select(mSel)]
        lmmCt += 1
    ## cleanup [and maybe add venue name, id and link] ##
    for mi, m in enumerate(mList):
        # 'about' repeats the brewery name - strip it plus bullet marks;
        # guarded so missing/None fields no longer raise mid-cleanup
        if m.get('about') and m.get('brewery'):
            m['about'] = m['about'].replace(m['brewery'], '').strip(' \u2022')
        for k in ['link', 'brewery_link', 'logo_src']:
            # site-relative hrefs/srcs -> absolute URLs
            if str(m.get(k))[:1] == '/': mList[mi][k] = f'{rootUrl}{m[k]}'
        mDets = {'venueId': venuId, 'venue': vName} if includeVenue else {}
        for k, v in mList[mi].items():
            if k == 'about' and v: v = v.strip('\u2022').strip().strip('\u2022')
            mDets[k] = v.strip() if isinstance(v, str) else v
        if includeVenue: mDets['venue_link'] = umLink
        mList[mi] = mDets
    return vRet(f'{len(mList)} menu items from {umLink}', mList)
Advertisement
Add Comment
Please, Sign In to add comment