Not a member of Pastebin yet?
                        Sign Up,
                        it unlocks many cool features!                    
                - ## for extracting specific details from html ##
 - ## A simpler version of htreeToDict [ https://pastebin.com/BpjZSQPi ] ##
 - ## Sample usage at https://stackoverflow.com/a/74631249/6146136 ##
 - def extractTagData(tagSoup, selector, targetAttr='', defVal=None, link1=False):
 - s = tagSoup.select_one(selector) if selector else tagSoup
 - if s is None: return (defVal, None) if link1 else defVal
 - if targetAttr == '':
 - sTxt = s.get_text(' ').strip()
 - sVal = defVal if type(defVal) == str and not sTxt else sTxt
 - else: sVal = s.get(targetAttr, defVal)
 - if not link1: return sVal
 - link1 = s.select_one('a[href]')
 - return (sVal, link1.get('href') if link1 else None)
 - def fillDict_fromTag(mSoup, selectorsDict, initDict={}, rootUrl=''):
 - for k, sel in selectorsDict.items():
 - if type(sel) == str:
 - initDict[k] = extractTagData(mSoup, sel)
 - elif type(sel) == tuple and len(sel) == 2:
 - initDict[k] = extractTagData(mSoup, sel[0], sel[1])
 - elif type(sel) == dict and 'k' in sel and 'v' in sel:
 - sk, sv = sel['k'], sel['v']
 - for ot in mSoup.select(f'{k}:has({sk}):has({sv})'):
 - kVal = extractTagData(ot, sk)
 - sVal, sLink = extractTagData(ot, sv, link1=True)
 - initDict[kVal] = sVal
 - if sLink:
 - if rootUrl and sLink[0] == '/': sLink = rootUrl + sLink
 - initDict[f'{kVal} [link]'] = sLink
 - elif type(sel) == dict and 'sel' in sel and 'sep' in sel:
 - for l in mSoup.select(f'{k} {sel["sel"]}'):
 - ltxt = l.get_text(' ').strip()
 - if sel['sep'] in ltxt:
 - kk, vv = ltxt.split(sel['sep'], 1)
 - initDict[kk] = vv
 - # elif... ## add more options
 - else: print(f'Unfamiliar reference format: {k} --> {sel}')
 - return initDict
 
Advertisement
 
                    Add Comment                
                
                        Please, Sign In to add comment