Try95th

htreeToDict

Nov 14th, 2022 (edited)
383
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.80 KB | None | 0 0
  1. ## for extracting specific details from html (ip/op description at bottom) ##
  2. #### [ simpler version: getSoupData(...) at https://pastebin.com/DULqFBgp (no value-modifiers) ] ####
  3. #### [ or fillDict_fromTag(...) at https://pastebin.com/hKXYetmj (no nesting - just flat dictionary) ] ####
  4. #### [ or selectForList(...) at https://pastebin.com/ZnZ7xM6u (no mods, no nesting - just flat list) ] ####
  5. ## sample usage at https://pastebin.com/SGQN5pYd ##
  6. ## also at https://stackoverflow.com/a/74368223/6146136 [older version] ##
  7.  
  8. ## alterations for selenium are in commented lines ending with # if selenium
  9. ## [just in getElementAttr and htreeToDict; for selectElement there are 2 entirely separate versions ]
  10. ## if you uncomment those lines, comment out or remove lines ending with # not if selenium  
  11.  
  12.  
  13. ### NECESSARY IMPORTS ###
  14.  
  15. # from selenium.webdriver.common.by import By # if selenium
  16.  
  17. # def alterTextVals... # from https://pastebin.com/DSXcgQyC
  18.  
  def getElementAttr(elem, attrib, defVal=None):
      """Read one value from elem, as chosen by attrib.

      attrib == 'localName'    -> the element's tag name (elem.name)
      attrib == '"allAttrs"'   -> all attributes of the element (elem.attrs)
      any other non-blank spec -> elem.get(str(attrib))
      blank/whitespace spec    -> elem.get_text(...), with str(attrib) passed
                                  as the separator and strip=True

      If the value found is None or '' and defVal is not None, defVal is
      returned instead; otherwise the value found is returned unchanged.
      """
      if attrib == 'localName':
          found = elem.name # if not selenium

      elif attrib == '"allAttrs"':
          found = elem.attrs # if not selenium
      # if attrib == '"allAttrs"': found = elem.get_property('attributes') # if selenium
      ## (NOTE: the selenium version will return far more data, because it includes node properties)

      elif str(attrib).strip():
          found = elem.get(str(attrib)) # if not selenium
          # found = elem.get_attribute(str(attrib)) # if selenium

      else:
          found = elem.get_text(str(attrib), strip=True) # if not selenium
      # else: found = elem.get_attribute('innerText').strip() # if selenium

      # only fall back to defVal when nothing useful was found AND a default was given
      nothingFound = found in [None, '']
      if nothingFound and defVal is not None:
          return defVal
      return found
  34.  
  def selectElement_bs4(elem, selector, conf):
      """Select inside elem with a css selector (BeautifulSoup version).

      conf may carry an 'isList' entry: when truthy, every match is returned
      (elem.select); otherwise only the first match (elem.select_one).
      A blank selector selects elem itself ([elem] in list mode).
      """
      cssSel = str(selector) # just in case

      wantAll = False
      if 'isList' in conf:
          wantAll = conf['isList']

      # can add more complexity and options with conf but simplest for now:
      if not cssSel.strip():
          return [elem] if wantAll else elem
      if wantAll:
          return elem.select(cssSel)
      return elem.select_one(cssSel)
  42.  
  def selectElement_selenium(elem, selector, conf):
      """Select inside elem with a css selector (selenium version).

      conf may carry an 'isList' entry: when truthy, the list of all matches
      is returned; otherwise the first match, or None if nothing matched.
      A blank selector selects elem itself ([elem] in list mode).
      """
      selector = str(selector) # just in case (same as in selectElement_bs4)
      isList = conf['isList'] if 'isList' in conf else False
      if selector.strip() == '': return [elem] if isList else elem

      # can add more complexity and options with conf but simplest for now:
      if hasattr(elem, 'find_elements'):
          # 'css selector' is the string value of By.CSS_SELECTOR, so no import
          # is needed; the find_elements_by_* helpers no longer exist in
          # recent selenium releases
          resElems = elem.find_elements('css selector', selector)
      else:
          # objects that only expose the legacy helper
          resElems = elem.find_elements_by_css_selector(selector)

      # conf is a dictionary, so list mode has to be decided by isList
      # (comparing conf itself to "list" could never be true)
      return resElems if isList else (resElems[0] if resElems else None)
  52.  
  53.  
  54. ### htreeToDict function definition ###
  55.    
  def htreeToDict(lSoup, refDict): # if not selenium
  # def htreeToDict(tagetElem, refDict):
      """Build a dictionary of values extracted from lSoup as described by refDict.

      refDict maps each output key to a 3-item tuple (sel, attr, altr):
        sel  - css selector handed to selectElement_bs4
               (a blank selector selects lSoup itself)
        attr - what to extract from the selection:
                 * a single spec for getElementAttr, or
                 * a list/tuple/set of such specs (the extracted values are
                   returned in the same kind of container), or
                 * a dict of nested (sel, attr, altr) tuples, processed by a
                   recursive call (entries that are not 3-item tuples are
                   dropped)
        altr - either 'list', or a list whose first item may be 'list' or a
               config dict (optionally with an 'isList' key); any 2-item
               tuples (aType, aVal) in that list are applied in order through
               alterTextVals

      In list mode every match is processed and the value is a list.
      When nothing is selected, the value is {} if attr is a dict, [] if attr
      is a list, and otherwise the empty selection result itself.

      alterTextVals is not defined in this snippet; it has to be available in
      scope whenever modifier tuples are used.

      Returns a dict with one entry per key of refDict. refDict itself is not
      modified.
      """
      detList = {}
      for k, (sel, attr, altr) in refDict.items():
          isList, confA = bool(altr == 'list'), {}
          if type(altr) == list and altr:
              # work on a copy: 'isList' may be written into confA below, and
              # that must not leak into the caller's refDict
              if type(altr[0]) == dict: confA = dict(altr[0])
              if altr[0] == 'list': isList = True
          # an explicit 'isList' in the config dict wins over the 'list' marker
          if 'isList' in confA: isList = confA['isList']
          else: confA['isList'] = isList

          s = selectElement_bs4(lSoup, sel, confA) # if not selenium
          # s = selectElement_selenium(tagetElem, sel, confA) # if selenium

          # nothing selected: keep the empty result, or an empty container
          # matching the kind of attr
          if not s and type(attr) not in [dict, list]: detVal = s
          elif not s: detVal = [] if type(attr) == list else {}
          elif type(attr) == dict:
              # nested reference dictionary: keep only well-formed 3-item tuples
              attrs = [(ik, iv, type(iv) == tuple) for ik, iv in attr.items()]
              attr = {ik: iv for ik, iv, tiv in attrs if tiv and len(iv) == 3}
              if isList: detVal = [htreeToDict(a, attr) for a in s]
              else: detVal = htreeToDict(s, attr)
          elif type(attr) in [list, tuple, set]:
              # several specs for the same selection; lists are the default
              # container, tuples/sets are converted back afterwards
              prevT = None if type(attr) == list else type(attr)
              if isList:
                  detVal = [[getElementAttr(ss, a) for a in attr] for ss in s]
                  if prevT: detVal = [
                      set(dv) if prevT == set else tuple(dv) for dv in detVal]
              else:
                  detVal = [getElementAttr(s, a) for a in attr]
                  if prevT: detVal = set(detVal) if prevT==set else tuple(detVal)
          # elif # can add more options for attr # but the last 2 must be:
          elif isList: detVal = [getElementAttr(ss, attr) for ss in s]
          else: detVal = getElementAttr(s, attr)

          # modifiers are skipped when there is no value, or altr is not a list
          if detVal is None or type(altr) != list: altr = []
          altr = [(str(a[0]),a[1]) for a in altr if type(a)==tuple and len(a)==2]
          for aType, aVal in altr: detVal = alterTextVals(detVal, aType, aVal)

          detList[k] = detVal
      return detList
  96.  
  97.  
  98. ## returns a python dictionary ("detList") after processing expected inputs:
  99. #### a bs4 Tag ("lSoup") [or a selenium WebElement ("listingEl")]
  100. #### a dictionary of selectors and attributes ("refDict")
  101. ###### to build detList, a corresponding "detVal" will be set to each key in refDict
  102. ###### refDict values should be 3-item tuples ("sel","attr","altr")
  103. ######   sel should be a css selector to search inside lSoup/listingEl
  104. ######   attr specifies which attribute value to extract from the selected element
  105. ######     if sel is empty, then lSoup/listingEl attribute is extracted
  106. ######     if attr is a python dictionary, detVal is built correspondingly with recursive call/s
  107. ######     if attr is a dict and altr is set to "list", then detVal will be LIST of dicts
  108. ######       (instead of a single python dictionary)
  109. ######   altr should be a list of 2-item tuples (unless attr is a dict)
  110. ######     the first item in the tuple (a string) would specify some type of modification for detVal
  111. ######     the second item in the tuple should contain any data necessary for that modification
  112. ######     the first item in altr can also be "list" or a dictionary with 'isList' key
  113. ######     (operations based on other key-value pairs can/might be added into the function as needed)
Advertisement
Add Comment
Please, Sign In to add comment