Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ## for extracting specific details from html ##
- ## A simpler version of htreeToDict [ https://pastebin.com/BpjZSQPi ] ##
- ## Sample usage at https://stackoverflow.com/a/74631249/6146136 ##
def extractTagData(tagSoup, selector, targetAttr='', defVal=None, link1=False):
    """Extract the text or an attribute value from a tag or one of its descendants.

    Args:
        tagSoup: Object exposing ``select_one``, ``get_text`` and ``get``
            (presumably a BeautifulSoup tag/soup -- confirm against callers).
        selector: CSS selector passed to ``tagSoup.select_one``. When falsy,
            ``tagSoup`` itself is used as the target element.
        targetAttr: Name of the attribute to read with ``.get``. The empty
            string (default) means "return the element's text", joined with
            single spaces and stripped.
        defVal: Fallback value. Returned when no element matches, or when the
            requested attribute is missing. In text mode it replaces an empty
            text only if it is a string.
        link1: When truthy, return a ``(value, href)`` tuple instead of the
            bare value; ``href`` comes from the first ``a[href]`` found inside
            the target element, or is ``None`` when there is none.

    Returns:
        The extracted value, or ``(value, href_or_None)`` when ``link1`` is
        truthy.
    """
    target = tagSoup.select_one(selector) if selector else tagSoup
    if target is None:
        return (defVal, None) if link1 else defVal

    if targetAttr == '':
        text = target.get_text(' ').strip()
        # isinstance (rather than an exact type match) so that str subclasses
        # are also honoured as string defaults.
        value = defVal if isinstance(defVal, str) and not text else text
    else:
        value = target.get(targetAttr, defVal)

    if not link1:
        return value

    # Keep the flag and the located anchor in separate names.
    anchor = target.select_one('a[href]')
    return (value, anchor.get('href') if anchor else None)
def fillDict_fromTag(mSoup, selectorsDict, initDict=None, rootUrl=''):
    """Populate a dictionary with values pulled from ``mSoup`` via selectors.

    Each ``key -> reference`` pair in ``selectorsDict`` is handled according
    to the shape of the reference:

    * ``str``: a CSS selector; ``result[key]`` is the matched element's text.
    * 2-item ``tuple`` ``(selector, attribute)``: ``result[key]`` is that
      attribute of the matched element.
    * ``dict`` with ``'k'`` and ``'v'``: ``key`` is used as a CSS selector for
      container elements; for every container matching
      ``{key}:has({k}):has({v})`` the text under ``k`` becomes a result key
      and the text under ``v`` its value. If the ``v`` element contains an
      ``a[href]``, the href is stored under ``'<key text> [link]'``, prefixed
      with ``rootUrl`` when it starts with ``/`` and ``rootUrl`` is non-empty.
    * ``dict`` with ``'sel'`` and ``'sep'``: for every element matching
      ``{key} {sel}`` whose stripped text contains ``sep``, the text is split
      once on ``sep`` into a result key and value.
    * anything else: a message is printed and the entry is skipped.

    Args:
        mSoup: Object exposing ``select`` (and whatever ``extractTagData``
            needs); presumably a BeautifulSoup tag/soup -- confirm with callers.
        selectorsDict: Mapping of output keys to selector references.
        initDict: Dictionary to fill in place. A new empty dictionary is
            created for each call when omitted, so results never leak between
            calls.
        rootUrl: Prefix applied to root-relative links (those starting with
            ``/``).

    Returns:
        The filled dictionary (``initDict`` itself when one was supplied).
    """
    # A mutable default would be shared by every call; create a fresh dict.
    if initDict is None:
        initDict = {}

    for k, sel in selectorsDict.items():
        if isinstance(sel, str):
            initDict[k] = extractTagData(mSoup, sel)
        elif isinstance(sel, tuple) and len(sel) == 2:
            initDict[k] = extractTagData(mSoup, sel[0], sel[1])
        elif isinstance(sel, dict) and 'k' in sel and 'v' in sel:
            sk, sv = sel['k'], sel['v']
            for ot in mSoup.select(f'{k}:has({sk}):has({sv})'):
                kVal = extractTagData(ot, sk)
                sVal, sLink = extractTagData(ot, sv, link1=True)
                initDict[kVal] = sVal
                if sLink:
                    if rootUrl and sLink.startswith('/'):
                        sLink = rootUrl + sLink
                    initDict[f'{kVal} [link]'] = sLink
        elif isinstance(sel, dict) and 'sel' in sel and 'sep' in sel:
            sep = sel['sep']
            for item in mSoup.select(f'{k} {sel["sel"]}'):
                itemText = item.get_text(' ').strip()
                if sep in itemText:
                    # Split only on the first separator; the rest is the value.
                    kk, vv = itemText.split(sep, 1)
                    initDict[kk] = vv
        # elif... ## add more options
        else:
            print(f'Unfamiliar reference format: {k} --> {sel}')
    return initDict
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement