Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ## for an example of usage, see https://stackoverflow.com/a/74104518/6146136
def getSoupData(mSoup, dataStruct, maxDepth=None, curDepth=0):
    """Extract text or attribute values from parsed markup following a spec.

    Args:
        mSoup: A node, or a list of nodes, to read from. Nodes must offer
            ``select(css)`` when a selector is used, ``get_text(strip=True)``
            for text extraction and ``get(attr)`` for attribute extraction
            (presumably BeautifulSoup tags -- confirm against callers).
            ``None`` is treated as "nothing selected".
        dataStruct: Either a dict or a shorthand string.
            Dict keys (all optional):
              * ``'cssSelector'`` -- selector passed to ``mSoup.select``;
                when absent, ``mSoup`` itself is used.
              * ``'targetAttr'`` -- attribute to read; the special value
                ``'"text"'`` (the default) reads the node text.
              * ``'isList'`` -- return a list of all matches instead of
                only the first (default ``False``).
              * ``'items'`` -- dict mapping output keys to nested specs that
                are applied to every selected node.
            A string starting with ``"ta":`` is shorthand for
            ``{'targetAttr': <rest>}``; any other string is shorthand for
            ``{'cssSelector': <string>}``.
        maxDepth: Optional limit on nesting; ``None`` means unlimited.
        curDepth: Current nesting level (used by the recursive calls).

    Returns:
        * ``[]`` (``isList``) or ``None`` when nothing is selected.
        * A string/attribute value, or a list of them when ``isList``.
        * A dict per node (or list of dicts when ``isList``) when ``items``
          is given.
        * ``{'errorMsg': ...}`` when ``curDepth`` exceeds ``maxDepth``.
    """
    taPrefix = '"ta":'

    # Allow selector/targetAttr to be sent as a single string.
    if not isinstance(dataStruct, dict):
        spec = str(dataStruct)
        if spec.startswith(taPrefix):
            # Strip only the leading marker; a plain selector that merely
            # contains the marker somewhere must be passed through intact.
            dataStruct = {'targetAttr': spec[len(taPrefix):]}
        else:
            dataStruct = {'cssSelector': spec}

    # Default values: isList=False, items={}.
    isList = dataStruct.get('isList', False)
    items = dataStruct.get('items')
    if not isinstance(items, dict):
        items = {}

    # A missing input node (e.g. an earlier lookup found nothing) yields the
    # same "nothing selected" result instead of an AttributeError.
    if mSoup is None:
        return [] if isList else None

    if 'cssSelector' in dataStruct:
        soup = mSoup.select(dataStruct['cssSelector'])
    else:
        # No selector -> use the input directly. isinstance (not an exact
        # type check) so list subclasses are treated as lists of nodes too.
        soup = mSoup if isinstance(mSoup, list) else [mSoup]

    # Only the first match is needed unless a list was requested.
    if not isList:
        soup = soup[:1]

    # Return empty if nothing was selected.
    if not soup:
        return [] if isList else None

    # Leaf spec: return text or attribute values - no more recursion.
    if not items:
        targetAttr = dataStruct.get('targetAttr', '"text"')
        if targetAttr == '"text"':
            sData = [s.get_text(strip=True) for s in soup]
        # More options can be added here with elif.
        else:
            sData = [s.get(targetAttr) for s in soup]
        return sData if isList else sData[0]

    # Return an error marker instead of recursing past the depth limit.
    if maxDepth is not None and curDepth > maxDepth:
        return {'errorMsg': f'Maximum [{maxDepth}] exceeded at depth={curDepth}'}

    # Recursively resolve every nested item for every selected node.
    sData = [
        {
            key: getSoupData(s, subSpec, maxDepth, curDepth + 1)
            for key, subSpec in items.items()
        }
        for s in soup
    ]
    # Return a list only if isList is set.
    return sData if isList else sData[0]
Advertisement
Add Comment
Please, Sign In to add comment