Not a member of Pastebin yet?
Sign up —
it unlocks many cool features!
- ## for https://stackoverflow.com/q/74951298/6146136
- #### collected output at
- #### https://docs.google.com/spreadsheets/d/1SFfwYPr1uoLn-6W6mNhCwcNxBmUk3xESWNpLy1Cn14g
import json
import os

import bs4
import pandas
import requests
# from Essentials import Static
# !pip install slimit
import slimit
from slimit.visitors import nodevisitor
def findObj_inJS(jsStr, objName, findAll=False, jStrict=True, printErr=False):
    """Parse JavaScript source and extract the value(s) assigned to named keys.

    Walks the slimit AST of *jsStr* looking for assignment-like nodes whose
    left side's ECMA text matches a requested name, and JSON-decodes the
    right side.

    Args:
        jsStr: JavaScript source code to parse.
        objName: a single name, or a list of names, to search for (matched
            against the left-hand side's ``to_ecma()`` text).
        findAll: if True, collect every match per name; otherwise stop
            searching a name after its first match.
        jStrict: if True, keep only values that parse as JSON; if False,
            fall back to the raw ECMA source text when ``json.loads`` fails.
        printErr: if True, print JSON-decode errors and names never found.

    Returns:
        If *objName* is a list: a dict mapping each name to a list of matches
        (``findAll=True``) or to the first match / ``None`` (``findAll=False``).
        If *objName* is a single name: just that name's result.
    """
    tree = slimit.parser.Parser().parse(jsStr)
    # Copy the list: the original aliased the caller's list and then
    # remove()'d from it, mutating the caller's argument as a side effect.
    pending = list(objName) if isinstance(objName, list) else [objName]
    found = {k: [] for k in pending}
    for node in nodevisitor.visit(tree):
        if not pending:
            break  # every requested name already satisfied
        for child in (node.children() if hasattr(node, 'children') else [node]):
            if not pending:
                break
            # only assignment-like nodes carry both .left and .right
            if not (hasattr(child, 'left') and hasattr(child, 'right')):
                continue
            cName, cVal = child.left.to_ecma(), child.right.to_ecma()
            if cName in pending:
                try:
                    found[cName].append(json.loads(cVal))
                except Exception as e:
                    if printErr:
                        print(type(e), e)
                    if not jStrict:
                        # non-strict mode keeps the raw JS text instead
                        found[cName].append(cVal)
                if found[cName] and not findAll:
                    pending.remove(cName)
    for k, v in found.items():
        if printErr and not v:
            print('not found: ', k)
        if not findAll:
            found[k] = v[0] if v else None
    return found if isinstance(objName, list) else found[objName]
class CmsIDs:
    """Scrape Facebook's help page and extract CMS id/name records."""

    @staticmethod
    def GetIDs():
        """Fetch the help page, pull ``"__bbox"`` JSON blobs out of its
        script tags, and return a flat list of dicts holding any of the
        keys ``cms_object_id`` / ``cmsID`` / ``name``.

        Returns an empty list when the page fetch fails (the original
        returned None, which crashed ``len(cmsList)`` at the call site).
        """
        # cont=requests.get('https://www.facebook.com:443/help',headers=Static.headers)
        cont = requests.get('https://www.facebook.com:443/help', headers={
            'accept': ';'.join(
                ['text/html,application/xhtml+xml,application/xml',
                 'q=0.9,image/avif,image/webp,image/apng,*/*',
                 'q=0.8,application/signed-exchange',
                 'v=b3', 'q=0.9'])})
        ## in case of request errors ##
        try:
            cont.raise_for_status()
        except Exception as e:
            print('failed to fetch page HTML -', type(e), e)
            return []  # safe empty result instead of None
        print('fetched', cont.url, 'with', cont.status_code, cont.reason)
        # explicit parser avoids bs4's GuessedAtParserWarning and keeps the
        # parse independent of which parsers happen to be installed
        soup = bs4.BeautifulSoup(cont.content, 'html.parser')
        scrCond = lambda t: t.name == 'script' and '"__bbox"' in t.get_text()
        jScripts = [s.get_text() for s in soup.find_all(scrCond)]
        print(f'Found {len(jScripts)} script tags containing {{"__bbox"}}')
        data = [findObj_inJS(s, '"__bbox"') for s in jScripts]
        # print('--- search ---')
        return CmsIDs.search(data)

    @staticmethod
    def search(data):
        """Recursively walk nested dicts/lists and collect, in traversal
        order, a dict of whichever of the keys cms_object_id / cmsID / name
        each visited dict contains."""
        rList, dKeys = [], ['cms_object_id', 'cmsID', 'name']
        if isinstance(data, dict):
            dObj = {k: data[k] for k in dKeys if k in data}
            rList += [dObj] if dObj else []
            ## IF YOU WANT TO PRINT WHILE EXTRACTING ##
            # for k, v in dObj.items(): print(k, v)
            # if dObj: print('---')
            for val in data.values():
                rList += CmsIDs.search(val)
        if isinstance(data, list):
            for val in data:
                rList += CmsIDs.search(val)
        return rList
- #####################################################################################
- ####################################### USAGE #######################################
#####################################################################################
####################################### USAGE #######################################
fileName = 'CmsIDs_GetIDs.csv'
dKeys = ['cms_object_id', 'cmsID', 'name']
# GetIDs may return None on a failed fetch; normalize so len()/iteration are safe
cmsList = CmsIDs.GetIDs() or []
print('retrieved', len(cmsList), 'rows')  # fixed: closing quote was missing
# print(pandas.DataFrame(cmsList, dtype=str).fillna('').to_markdown())
# pandas.DataFrame(cmsList).to_csv(fileName, index=False)

# Load previously saved rows (if any) so only genuinely new rows are appended.
newFile, old_tuples = True, []
if os.path.isfile(fileName):
    try:
        newFile, old_tuples = False, [tuple(
            [r[k] for k in dKeys]
        ) for r in pandas.read_csv(fileName).fillna('').to_dict('records')]
        print(f'{len(old_tuples)} rows loaded from ',
              f'"{os.path.abspath(fileName)}"')
    except Exception as e:
        print(f'Could not retrieve old data {type(e)} {e}')
else:
    print(f'No previous data ["{fileName}" not found in current directory]')

# Normalize scraped rows to tuples (None -> '') so they compare like CSV rows.
new_tuples = [tuple(
    ['' if v is None else v for v in [r.get(k) for k in dKeys]]
) for r in cmsList]
old_set = set(old_tuples)  # built once: O(1) membership vs O(n) list scans
new_rows = [dict(zip(dKeys, r)) for r in new_tuples if r not in old_set]
# gone_rows = [dict(zip(dKeys,r)) for r in old_tuples if r not in new_tuples]

if new_rows:
    new_df = pandas.DataFrame(new_rows)[dKeys]
    print(new_df.to_markdown(index=False, tablefmt='simple'))
    lnt, lnr = len(cmsList), len(new_rows)
    fp_abs = f'"{os.path.abspath(fileName)}"'
    # append mode; write the header only when creating the file fresh
    new_df.to_csv(fileName, mode='a', index=False, header=newFile)
    print('\n\n\n---- saved', lnr, f'new rows [of {lnt}] to', fp_abs, ' ----')
    print('total rows:', len(pandas.read_csv(fileName)))
else:
    print(f'No new rows [{len(cmsList)} scraped]')
#####################################################################################
Advertisement
Add Comment
Please sign in to add a comment.