Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ## for extracting specific details from HTML ###########
- ## an improvement on #####
- #### getSoupData [ https://pastebin.com/DULqFBgp ] #####
- #### fillDict_fromTag [ https://pastebin.com/hKXYetmj ] ###
- ###########################################################
- #### customized tripadvisor scraper [2 examples] at https://pastebin.com/5AjaUne0
- ###### example 1: tSel as a function [confParse is quite superfluous here]
- ###### example 2: tSel as a [reference] data structure containing selectors
- #### example 3: generalized example of tSel at https://pastebin.com/8rebux2i
- ############################# REQUIRED FUNCTIONS #############################
def call_w_params(cFunc, xList=[],xMain={}, xDef={},xUp={},fKey=None):
    """Call ``cFunc`` with arguments assembled according to the type of ``xMain``.

    ``xList`` always supplies the leading positional arguments. ``xMain`` is
    then interpreted in one of three ways:

    * tuple: unpacked as additional positional arguments; only ``xUp`` is
      passed as keywords (``xDef`` is not used in this case).
    * dict that is not flagged (``fKey`` is falsy, or ``xMain`` holds no truthy
      value under ``fKey``): treated as keyword arguments, layered so that
      ``xMain`` overrides ``xDef`` and ``xUp`` overrides both.
    * anything else, including a dict flagged through ``fKey``: passed as one
      positional argument, with ``xDef`` overridden by ``xUp`` as keywords.

    Returns whatever ``cFunc`` returns; exceptions are not intercepted.
    The mutable defaults are only read, never modified.
    """
    if isinstance(xMain, tuple):
        return cFunc(*xList, *xMain, **xUp)

    as_keywords = isinstance(xMain, dict) and not (fKey and xMain.get(fKey))
    if as_keywords:
        keywords = {**xDef, **xMain, **xUp}
        return cFunc(*xList, **keywords)

    keywords = {**xDef, **xUp}
    return cFunc(*xList, xMain, **keywords)
def try_call(toCall, *cArgs, vDef=None):
    """Call ``toCall(*cArgs)`` and return its result, or ``vDef`` on failure.

    Args:
        toCall: the callable to invoke.
        *cArgs: positional arguments forwarded to ``toCall``.
        vDef: keyword-only fallback returned when the call raises.

    Only ``Exception`` subclasses are treated as failures, so
    ``KeyboardInterrupt`` and ``SystemExit`` still propagate to the caller.
    """
    try:
        return toCall(*cArgs)
    except Exception:
        return vDef
def try_apply(obj, toCall, lArgs=[], nArgs={}, **dArgs):
    """Apply ``toCall`` to ``obj``, returning ``obj`` unchanged if that fails.

    Args:
        obj: the value to transform.
        toCall: either the name of a method to look up on ``obj`` (string), or
            a callable that receives ``obj`` as its first positional argument.
        lArgs: extra positional arguments for the call.
        nArgs: keyword arguments for the call.
        **dArgs: further keyword arguments; these override ``nArgs``.

    Returns:
        The result of the call, or ``obj`` itself when ``toCall`` does not
        resolve to a callable or the call raises an ``Exception``.
        ``KeyboardInterrupt`` and ``SystemExit`` are not swallowed.
    """
    if isinstance(toCall, str):
        # Method name: resolve on obj; a missing attribute becomes None.
        toCall = getattr(obj, toCall, None)
    else:
        # Free function: obj becomes the leading positional argument.
        # This rebinds lArgs, so the shared default list is never mutated.
        lArgs = [obj, *lArgs]

    if not callable(toCall):
        return obj

    try:
        return toCall(*lArgs, **{**nArgs, **dArgs})
    except Exception:
        return obj
- ###############################################################################
- ################################ MAIN FUNCTION ################################
def confParse(bsTag, tSel='', tAttr='', defVal=None, listSel='',
              mFunc=None, *mArgs, curDepth=0, maxDepth=950, **mnArgs):
    """Extract a value, or a nested structure of values, from ``bsTag``.

    Args:
        bsTag: the element to extract from. It must provide ``select`` /
            ``select_one`` when string selectors are used (presumably a
            BeautifulSoup tag -- confirm against callers).
        tSel: what to extract.
            * falsy: ``bsTag`` itself is the target element.
            * callable: called with ``bsTag``; its result is the target.
            * dict / list / tuple / set: every member is resolved through a
              recursive ``confParse`` call (via ``call_w_params``) and the
              results are returned in the same shape -- a dict keeps its keys
              (except ``'__selref__'``), other containers keep their type.
              A dict whose ``'__selref__'`` value is itself a container is
              replaced by that container first.
            * anything else: stringified, stripped and passed to
              ``bsTag.select_one``.
        tAttr: what to read from the target element.
            * one of ``'"'``, ``'"placeholder"'``, ``'"_"'``: ``tSel`` is
              returned as-is, without touching ``bsTag``.
            * empty string: the element's text, via ``get_text(' ', strip=True)``.
            * non-empty string: ``el.get(tAttr, defVal)``; a list value of
              the ``'class'`` attribute is joined with spaces.
            * non-string: the element itself is returned.
            For a container ``tSel``, ``tAttr`` is instead applied to the
            assembled container with ``try_apply``.
        defVal: returned when the target element is falsy, the attribute is
            missing, or the depth limit is hit.
        listSel: optional multi-element selector. A non-empty string goes to
            ``bsTag.select``; a callable is called with ``bsTag``. When the
            result is a list, the extraction described above is run once per
            item and a list of results is returned.
        mFunc, *mArgs, **mnArgs: optional post-processing applied to the
            extracted value through ``try_apply(value, mFunc, mArgs, mnArgs)``.
            ``mnArgs['__apply2resultSet__']``, if present, is applied to the
            whole result list in the ``listSel`` case and is not forwarded to
            the per-item calls.
        curDepth, maxDepth: recursion guard. Once ``curDepth`` is no longer
            below ``maxDepth`` a diagnostic dict is printed and ``defVal``
            is returned.

    Returns:
        The extracted (and optionally post-processed) value, a list of such
        values, or a container mirroring the shape of ``tSel``.
    """
    # A quoted marker in tAttr means tSel is handed back untouched.
    if tAttr in ['"', '"placeholder"', '"_"']:
        return tSel

    if not curDepth < maxDepth:
        print({'depth':curDepth,'limit':maxDepth, 'tSel':tSel,'return':defVal})
        return defVal

    depthArgs = {'curDepth': curDepth+1, 'maxDepth': maxDepth}

    ### MULTI-ELEMENT SELECTION ###
    if listSel and isinstance(listSel, str):
        selected = bsTag.select(listSel)
    elif callable(listSel):
        # vDef is keyword-only in try_call. Passing [] positionally would hand
        # it to listSel as a second argument, so a one-argument listSel would
        # always fail and be silently ignored.
        selected = try_call(listSel, bsTag, vDef=[])
    else:
        selected = None

    if isinstance(selected, list):
        # The result-set hook applies to the list as a whole, so it is
        # withheld from the per-item calls.
        itemArgs = {
            k: v for k, v in mnArgs.items() if k != '__apply2resultSet__'
        }
        itemArgs.update(depthArgs)
        results = [
            confParse(t, tSel, tAttr, defVal, '', mFunc, *mArgs, **itemArgs)
            for t in selected
        ]
        return try_apply(results, mnArgs.get('__apply2resultSet__'))

    ### NESTED SELECTOR STRUCTURES ###
    selRef = tSel.get('__selref__') if isinstance(tSel, dict) else None
    if isinstance(selRef, (dict, list, set, tuple)):
        tSel = selRef

    if isinstance(tSel, (dict, tuple, list, set)):
        cnArgs, kl = {'defVal': defVal, 'mFunc': mFunc, **mnArgs}, None
        if isinstance(tSel, dict):
            # Keep keys and values as parallel lists so the results can be
            # zipped back into a dict afterwards.
            kl = [k for k in tSel if k != '__selref__']
            tSel = [v for k, v in tSel.items() if k != '__selref__']
        tConv = type(tSel)
        sd = [
            call_w_params(
                confParse, [bsTag], sParams, cnArgs, depthArgs, '__selref__'
            )
            for sParams in tSel
        ]
        shaped = tConv(sd) if kl is None else dict(zip(kl, sd))
        return try_apply(shaped, tAttr)

    ### ACTUAL EXTRACTION ###
    if callable(tSel):
        el = try_call(tSel, bsTag)
    else:
        el = bsTag.select_one(str(tSel).strip()) if tSel else bsTag
    if not el:
        return defVal

    if not isinstance(tAttr, str):
        rVal = el
    else:
        rVal = el.get(tAttr, defVal) if tAttr else el.get_text(' ', strip=True)
    if tAttr == 'class' and isinstance(rVal, list):
        rVal = ' '.join(rVal)
    #########################

    return try_apply(rVal, mFunc, mArgs, mnArgs) if mFunc else rVal
- ###############################################################################
Advertisement
Add Comment
Please, Sign In to add comment