Try95th

reusable configurable parser

Nov 13th, 2022 (edited)
151
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.96 KB | None | 0 0
  1. ## for an example of usage, see https://stackoverflow.com/a/74104518/6146136
  2.  
  3. def getSoupData(mSoup, dataStruct, maxDepth=None, curDepth=0):
  4.     if type(dataStruct) != dict:
  5.         # so selector/targetAttr can also be sent as a single string
  6.         if str(dataStruct).startswith('"ta":'):
  7.             dKey = 'targetAttr'
  8.         else: dKey = 'cssSelector'
  9.         dataStruct = str(dataStruct).replace('"ta":', '', 1)
  10.         dataStruct = {dKey: dataStruct}
  11.  
  12.     # default values: isList=False, items={}
  13.     isList = dataStruct['isList'] if 'isList' in dataStruct else False
  14.     if 'items' in dataStruct and type(dataStruct['items']) == dict:
  15.         items = dataStruct['items']
  16.     else: items = {}
  17.  
  18.     # no selector -> just use the input directly
  19.     if 'cssSelector' not in dataStruct:
  20.         soup = mSoup if type(mSoup) == list else [mSoup]
  21.     else:
  22.         soup = mSoup.select(dataStruct['cssSelector'])
  23.         # so that unneeded parts are not processed:
  24.         if not isList: soup = soup[:1]
  25.  
  26.     # return empty nothing was selected
  27.     if not soup: return [] if isList else None
  28.  
  29.     # return text or attribute values - no more recursion
  30.     if items == {}:
  31.         if 'targetAttr' in dataStruct:
  32.             targetAttr = dataStruct['targetAttr']
  33.         else: targetAttr = '"text"'  # default
  34.  
  35.         if targetAttr == '"text"':
  36.             sData = [s.get_text(strip=True) for s in soup]
  37.         # can put in more options with elif
  38.         else: sData = [s.get(targetAttr) for s in soup]
  39.  
  40.         return sData if isList else sData[0]
  41.  
  42.     # return error - recursion limited
  43.     if maxDepth is not None and curDepth > maxDepth:
  44.         return {'errorMsg': f'Maximum [{maxDepth}] exceeded at depth={curDepth}'}
  45.  
  46.     # recursively get items
  47.     sData = [dict([(i, getSoupData(
  48.         s, items[i], maxDepth, curDepth + 1
  49.     )) for i in items]) for s in soup]
  50.  
  51.     return sData if isList else sData[0]
  52.     # return list only if isList is set
Advertisement
Add Comment
Please, Sign In to add comment