Not a member of Pastebin yet?
Sign up —
it unlocks many cool features!
- ## for https://stackoverflow.com/q/74951298/6146136
- #### collected output at
- #### https://docs.google.com/spreadsheets/d/1SFfwYPr1uoLn-6W6mNhCwcNxBmUk3xESWNpLy1Cn14g
import json
import os

import bs4
import pandas
import requests
# from Essentials import Static
# !pip install slimit
import slimit
from slimit.visitors import nodevisitor
def findObj_inJS(jsStr, objName, findAll=False, jStrict=True, printErr=False):
    """Parse JavaScript source and extract the value(s) assigned to named keys.

    Walks the slimit AST of *jsStr* looking for assignment-like nodes whose
    left side's ECMA text matches a requested name, and JSON-decodes the
    right side.

    Args:
        jsStr: JavaScript source code to parse.
        objName: a single name, or a list of names, to search for (matched
            against the left-hand side's ``to_ecma()`` text).
        findAll: if True, collect every match per name; otherwise stop
            searching a name after its first match.
        jStrict: if True, keep only values that parse as JSON; if False,
            fall back to the raw ECMA source text when ``json.loads`` fails.
        printErr: if True, print JSON-decode errors and names never found.

    Returns:
        If *objName* is a list: a dict mapping each name to a list of matches
        (``findAll=True``) or to the first match / ``None`` (``findAll=False``).
        If *objName* is a single name: just that name's result.
    """
    tree = slimit.parser.Parser().parse(jsStr)
    # Copy the list: the original aliased the caller's list and then
    # remove()'d from it, mutating the caller's argument as a side effect.
    pending = list(objName) if isinstance(objName, list) else [objName]
    found = {k: [] for k in pending}
    for node in nodevisitor.visit(tree):
        if not pending:
            break  # every requested name already satisfied
        for child in (node.children() if hasattr(node, 'children') else [node]):
            if not pending:
                break
            # only assignment-like nodes carry both .left and .right
            if not (hasattr(child, 'left') and hasattr(child, 'right')):
                continue
            cName, cVal = child.left.to_ecma(), child.right.to_ecma()
            if cName in pending:
                try:
                    found[cName].append(json.loads(cVal))
                except Exception as e:
                    if printErr:
                        print(type(e), e)
                    if not jStrict:
                        # non-strict mode keeps the raw JS text instead
                        found[cName].append(cVal)
                if found[cName] and not findAll:
                    pending.remove(cName)
    for k, v in found.items():
        if printErr and not v:
            print('not found: ', k)
        if not findAll:
            found[k] = v[0] if v else None
    return found if isinstance(objName, list) else found[objName]
class CmsIDs:
    """Scrape Facebook's help page and extract CMS id/name records."""

    @staticmethod
    def GetIDs():
        """Fetch the help page, pull ``"__bbox"`` JSON blobs out of its
        script tags, and return a flat list of dicts holding any of the
        keys ``cms_object_id`` / ``cmsID`` / ``name``.

        Returns an empty list when the page fetch fails (the original
        returned None, which crashed ``len(cmsList)`` at the call site).
        """
        # cont=requests.get('https://www.facebook.com:443/help',headers=Static.headers)
        cont = requests.get('https://www.facebook.com:443/help', headers={
            'accept': ';'.join(
                ['text/html,application/xhtml+xml,application/xml',
                 'q=0.9,image/avif,image/webp,image/apng,*/*',
                 'q=0.8,application/signed-exchange',
                 'v=b3', 'q=0.9'])})
        ## in case of request errors ##
        try:
            cont.raise_for_status()
        except Exception as e:
            print('failed to fetch page HTML -', type(e), e)
            return []  # safe empty result instead of None
        print('fetched', cont.url, 'with', cont.status_code, cont.reason)
        # explicit parser avoids bs4's GuessedAtParserWarning and keeps the
        # parse independent of which parsers happen to be installed
        soup = bs4.BeautifulSoup(cont.content, 'html.parser')
        scrCond = lambda t: t.name == 'script' and '"__bbox"' in t.get_text()
        jScripts = [s.get_text() for s in soup.find_all(scrCond)]
        print(f'Found {len(jScripts)} script tags containing {{"__bbox"}}')
        data = [findObj_inJS(s, '"__bbox"') for s in jScripts]
        # print('--- search ---')
        return CmsIDs.search(data)

    @staticmethod
    def search(data):
        """Recursively walk nested dicts/lists and collect, in traversal
        order, a dict of whichever of the keys cms_object_id / cmsID / name
        each visited dict contains."""
        rList, dKeys = [], ['cms_object_id', 'cmsID', 'name']
        if isinstance(data, dict):
            dObj = {k: data[k] for k in dKeys if k in data}
            rList += [dObj] if dObj else []
            ## IF YOU WANT TO PRINT WHILE EXTRACTING ##
            # for k, v in dObj.items(): print(k, v)
            # if dObj: print('---')
            for val in data.values():
                rList += CmsIDs.search(val)
        if isinstance(data, list):
            for val in data:
                rList += CmsIDs.search(val)
        return rList
- #####################################################################################
- ####################################### USAGE #######################################
#####################################################################################
####################################### USAGE #######################################
fileName = 'CmsIDs_GetIDs.csv'
dKeys = ['cms_object_id', 'cmsID', 'name']
# GetIDs may return None on a failed fetch; normalize so len()/iteration are safe
cmsList = CmsIDs.GetIDs() or []
print('retrieved', len(cmsList), 'rows')  # fixed: closing quote was missing
# print(pandas.DataFrame(cmsList, dtype=str).fillna('').to_markdown())
# pandas.DataFrame(cmsList).to_csv(fileName, index=False)

# Load previously saved rows (if any) so only genuinely new rows are appended.
newFile, old_tuples = True, []
if os.path.isfile(fileName):
    try:
        newFile, old_tuples = False, [tuple(
            [r[k] for k in dKeys]
        ) for r in pandas.read_csv(fileName).fillna('').to_dict('records')]
        print(f'{len(old_tuples)} rows loaded from ',
              f'"{os.path.abspath(fileName)}"')
    except Exception as e:
        print(f'Could not retrieve old data {type(e)} {e}')
else:
    print(f'No previous data ["{fileName}" not found in current directory]')

# Normalize scraped rows to tuples (None -> '') so they compare like CSV rows.
new_tuples = [tuple(
    ['' if v is None else v for v in [r.get(k) for k in dKeys]]
) for r in cmsList]
old_set = set(old_tuples)  # built once: O(1) membership vs O(n) list scans
new_rows = [dict(zip(dKeys, r)) for r in new_tuples if r not in old_set]
# gone_rows = [dict(zip(dKeys,r)) for r in old_tuples if r not in new_tuples]

if new_rows:
    new_df = pandas.DataFrame(new_rows)[dKeys]
    print(new_df.to_markdown(index=False, tablefmt='simple'))
    lnt, lnr = len(cmsList), len(new_rows)
    fp_abs = f'"{os.path.abspath(fileName)}"'
    # append mode; write the header only when creating the file fresh
    new_df.to_csv(fileName, mode='a', index=False, header=newFile)
    print('\n\n\n---- saved', lnr, f'new rows [of {lnt}] to', fp_abs, ' ----')
    print('total rows:', len(pandas.read_csv(fileName)))
else:
    print(f'No new rows [{len(cmsList)} scraped]')
#####################################################################################
Advertisement
Add Comment
Please sign in to add a comment.