fbHelp__bbox for so_q_74951298

## for https://stackoverflow.com/q/74951298/6146136
#### collected output at
#### https://docs.google.com/spreadsheets/d/1SFfwYPr1uoLn-6W6mNhCwcNxBmUk3xESWNpLy1Cn14g

import os
import json
import requests
import bs4
# from Essentials import Static

#  !pip install slimit
import pandas
import slimit
from slimit.visitors import nodevisitor

def findObj_inJS(jsStr, objName, findAll=False, jStrict=True, printErr=False):
    ## parse jsStr with slimit and collect the value(s) assigned to objName
    ## (json.loads-ed when possible; raw ECMA source is kept only if jStrict is False)
    tree = slimit.parser.Parser().parse(jsStr)
    objList = objName if isinstance(objName, list) else [objName]
    toRet = {k: [] for k in objList}

    for n in nodevisitor.visit(tree):
        if not objList: break
        for c in (n.children() if hasattr(n, 'children') else [n]):
            if not objList: break
            if not (hasattr(c, 'left') and hasattr(c, 'right')): continue
            cName, cVal = c.left.to_ecma(), c.right.to_ecma()

            if cName in objList:
                try: toRet[cName].append(json.loads(cVal))
                except Exception as e:
                    if printErr: print(type(e), e)
                    if not jStrict: toRet[cName].append(cVal)
                if toRet[cName] and not findAll: objList.remove(cName)

    for k, v in toRet.items():
        if printErr and not v: print('not found: ', k)
        if not findAll: toRet[k] = v[0] if v else None
    return toRet if isinstance(objName, list) else toRet[objName]


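## a quick standalone check of findObj_inJS [a minimal sketch - the JS snippet, key names and
## values below are made-up placeholders, NOT from facebook's scripts]:
## note that the quotes are part of the key name, just like '"__bbox"' is passed below
# findObj_inJS('var cfg = {"answer": {"x": 42}, "other": 1};', '"answer"')
# # --> {'x': 42}
# findObj_inJS('var cfg = {"answer": {"x": 42}};', ['"answer"', '"missing"'])
# # --> {'"answer"': {'x': 42}, '"missing"': None}

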
class CmsIDs:
    def GetIDs():
        # cont=requests.get('https://www.facebook.com:443/help',headers=Static.headers)
        cont = requests.get('https://www.facebook.com:443/help', headers={
            'accept': ';'.join(
                [ 'text/html,application/xhtml+xml,application/xml',
                  'q=0.9,image/avif,image/webp,image/apng,*/*',
                  'q=0.8,application/signed-exchange',
                  'v=b3', 'q=0.9'  ])})

        ## in case of request errors ##
        try: cont.raise_for_status()
        except Exception as e:
            print('failed to fetch page HTML -', type(e), e)
            return
        print('fetched', cont.url, 'with', cont.status_code, cont.reason)

        soup = bs4.BeautifulSoup(cont.content, 'html.parser')
        scrCond = lambda t: t.name == 'script' and '"__bbox"' in t.get_text()
        jScripts = [s.get_text() for s in soup.find_all(scrCond)]
        print(f'Found {len(jScripts)} script tags containing {{"__bbox"}}')

        data = [findObj_inJS(s, '"__bbox"') for s in jScripts]
        # print('--- search ---')
        return CmsIDs.search(data)


    def search(data):
        rList, dKeys = [], ['cms_object_id', 'cmsID', 'name']
        if isinstance(data, dict):
            dObj = {k: data[k] for k in dKeys if k in data}
            rList += [dObj] if dObj else []

            ## IF YOU WANT TO PRINT WHILE EXTRACTING ##
            # for k, v in dObj.items(): print(k, v)
            # if dObj: print('---')

            for val in data.values(): rList += CmsIDs.search(val)
        if isinstance(data, list):
            for val in data: rList += CmsIDs.search(val)
        return rList
#####################################################################################
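
## each dict collected by CmsIDs.search holds whichever of ['cms_object_id', 'cmsID', 'name']
## it finds in a nested dict; a minimal sketch on a made-up structure (placeholder values only):
# CmsIDs.search({'a': [{'cmsID': '111', 'name': 'Topic A', 'extra': 1},
#                      {'nested': {'cms_object_id': '222', 'name': 'Topic B'}}]})
# # --> [{'cmsID': '111', 'name': 'Topic A'}, {'cms_object_id': '222', 'name': 'Topic B'}]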


####################################### USAGE #######################################
fileName = 'CmsIDs_GetIDs.csv'
dKeys = ['cms_object_id', 'cmsID', 'name']

cmsList = CmsIDs.GetIDs() or []  ## GetIDs returns None if the page fetch failed
print('retrieved', len(cmsList), 'rows')
# print(pandas.DataFrame(cmsList, dtype=str).fillna('').to_markdown())
# pandas.DataFrame(cmsList).to_csv(fileName, index=False)
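
## the block below appends only rows that aren't already in the CSV; a sketch of the
## tuple-based comparison it relies on (placeholder values only):
# old = [('111', '', 'Topic A')]
# new = [('111', '', 'Topic A'), ('222', '', 'Topic B')]
# [dict(zip(dKeys, r)) for r in new if r not in old]
# # --> [{'cms_object_id': '222', 'cmsID': '', 'name': 'Topic B'}]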

newFile, old_tuples = True, []
if os.path.isfile(fileName):
    try:
        newFile, old_tuples = False, [tuple(
            [r[k] for k in dKeys]
        ) for r in pandas.read_csv(fileName).fillna('').to_dict('records')]
        print(f'{len(old_tuples)} rows loaded from ',
              f'"{os.path.abspath(fileName)}"')
    except Exception as e: print(f'Could not retrieve old data {type(e)} {e}')
else: print(f'No previous data ["{fileName}" not found in current directory]')
new_tuples = [tuple(
    ['' if v is None else v for v in [r.get(k) for k in dKeys]]
) for r in cmsList]

new_rows = [dict(zip(dKeys, r)) for r in new_tuples if r not in old_tuples]
# gone_rows = [dict(zip(dKeys,r)) for r in old_tuples if r not in new_tuples]
if new_rows:
    new_df = pandas.DataFrame(new_rows)[dKeys]
    print(new_df.to_markdown(index=False, tablefmt='simple'))

    lnt, lnr = len(cmsList), len(new_rows)
    fp_abs = f'"{os.path.abspath(fileName)}"'
    new_df.to_csv(fileName, mode='a', index=False, header=newFile)
    print('\n\n\n---- saved', lnr, f'new rows [of {lnt}] to', fp_abs, ' ----')
    print( 'total rows:', len(pandas.read_csv(fileName)) )
else: print(f'No new rows [{len(cmsList)} scraped]')
#####################################################################################