Advertisement
Try95th

extractTagData + fillDict_fromTag

Nov 30th, 2022 (edited)
281
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.91 KB | None | 0 0
## for extracting specific details from html ##
## A simpler version of htreeToDict [ https://pastebin.com/BpjZSQPi ] ##
## Sample usage at https://stackoverflow.com/a/74631249/6146136 ##
  4.  
  5. def extractTagData(tagSoup, selector, targetAttr='', defVal=None, link1=False):
  6.     s = tagSoup.select_one(selector) if selector else tagSoup
  7.     if s is None: return (defVal, None) if link1 else defVal
  8.     if targetAttr == '':
  9.         sTxt = s.get_text(' ').strip()
  10.         sVal = defVal if type(defVal) == str and not sTxt else sTxt
  11.     else: sVal = s.get(targetAttr, defVal)
  12.     if not link1: return sVal
  13.     link1 = s.select_one('a[href]')
  14.     return (sVal, link1.get('href') if link1 else None)
  15.  
  16.  
  17. def fillDict_fromTag(mSoup, selectorsDict, initDict={}, rootUrl=''):
  18.     for k, sel in selectorsDict.items():
  19.         if type(sel) == str:
  20.             initDict[k] = extractTagData(mSoup, sel)
  21.         elif type(sel) == tuple and len(sel) == 2:
  22.             initDict[k] = extractTagData(mSoup, sel[0], sel[1])
  23.         elif type(sel) == dict and 'k' in sel and 'v' in sel:
  24.             sk, sv = sel['k'], sel['v']
  25.             for ot in mSoup.select(f'{k}:has({sk}):has({sv})'):
  26.                 kVal = extractTagData(ot, sk)
  27.                 sVal, sLink = extractTagData(ot, sv, link1=True)
  28.                 initDict[kVal] = sVal
  29.                 if sLink:
  30.                     if rootUrl and sLink[0] == '/': sLink = rootUrl + sLink
  31.                     initDict[f'{kVal} [link]'] = sLink
  32.         elif type(sel) == dict and 'sel' in sel and 'sep' in sel:
  33.             for l in mSoup.select(f'{k} {sel["sel"]}'):
  34.                 ltxt = l.get_text(' ').strip()
  35.                 if sel['sep'] in ltxt:
  36.                     kk, vv = ltxt.split(sel['sep'], 1)
  37.                     initDict[kk] = vv
  38.         # elif... ## add more options
  39.         else: print(f'Unfamiliar reference format: {k} --> {sel}')    
  40.    
  41.     return initDict
  42.    
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement