Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ## for extracting specific details from html into a (nested) dictionary (input/output description at bottom) ##
- #### [ simpler version: getSoupData(...) at https://pastebin.com/DULqFBgp (no value-modifiers) ] ####
- #### [ or fillDict_fromTag(...) at https://pastebin.com/hKXYetmj (no nesting - just flat dictionary) ] ####
- #### [ or selectForList(...) at https://pastebin.com/ZnZ7xM6u (no mods, no nesting - just flat list) ] ####
- ## sample usage at https://pastebin.com/SGQN5pYd ##
- ## also at https://stackoverflow.com/a/74368223/6146136 [older version] ##
- ## alterations for selenium are in commented lines ending with # if selenium
- ## [just in getElementAttr and htreeToDict; for selectElement there are 2 entirely separate versions ]
- ## if you uncomment those lines, comment out or remove lines ending with # not if selenium
- ### NECESSARY IMPORTS ###
- # from selenium.webdriver.common.by import By # if selenium
- # def alterTextVals... # from https://pastebin.com/DSXcgQyC
def getElementAttr(elem, attrib, defVal=None):
    """Read a single value from ``elem`` as directed by ``attrib``.

    ``attrib`` is interpreted as follows:
      * ``'localName'``  -> ``elem.name``
      * ``'"allAttrs"'`` -> ``elem.attrs`` (the inner double quotes are part
        of the key, so it cannot collide with a real attribute name)
      * anything else whose ``str()`` is not blank -> ``elem.get(str(attrib))``
      * a blank (empty / whitespace-only) value ->
        ``elem.get_text(str(attrib), strip=True)``, i.e. the text of the
        element with ``attrib`` itself used as the separator

    ``defVal`` is returned in place of a result that is ``None`` or ``''``,
    but only when ``defVal`` itself is not ``None``.
    """
    if attrib == 'localName':
        extracted = elem.name  # if not selenium
    elif attrib == '"allAttrs"':
        extracted = elem.attrs  # if not selenium
        # extracted = elem.get_property('attributes')  # if selenium
        ## (NOTE: the selenium version will return far more data,
        ##  because it includes node properties)
    else:
        attrName = str(attrib)
        if attrName.strip():
            extracted = elem.get(attrName)  # if not selenium
            # extracted = elem.get_attribute(attrName)  # if selenium
        else:
            extracted = elem.get_text(attrName, strip=True)  # if not selenium
            # extracted = elem.get_attribute('innerText').strip()  # if selenium

    # Only substitute the default for "nothing found" results.
    isBlank = extracted in [None, '']
    if isBlank and defVal is not None:
        return defVal
    return extracted
def selectElement_bs4(elem, selector, conf):
    """Select inside ``elem`` with a CSS selector (BeautifulSoup flavour).

    ``conf['isList']`` (default ``False``) chooses between returning every
    match (``elem.select``) and only the first one (``elem.select_one``).
    A blank selector means "the element itself": ``elem`` is returned,
    wrapped in a one-item list when a list was requested.
    """
    css = str(selector)  # just in case a non-string selector was passed

    wantMany = False
    if 'isList' in conf:
        wantMany = conf['isList']

    # can add more complexity and options with conf but simplest for now:
    if not css.strip():
        return [elem] if wantMany else elem
    if wantMany:
        return elem.select(css)
    return elem.select_one(css)
def selectElement_selenium(elem, selector, conf):
    """Select inside ``elem`` with a CSS selector (selenium flavour).

    Mirrors ``selectElement_bs4``: ``conf['isList']`` (default ``False``)
    chooses between returning every match and only the first one (``None``
    when nothing matches). A blank selector means "the element itself":
    ``elem`` is returned, wrapped in a one-item list when a list was
    requested. For backward compatibility ``conf`` may also be the literal
    string ``"list"``, which is treated as a request for a list.
    """
    selector = str(selector)  # just in case, same as the bs4 version

    if conf == 'list':  # legacy spelling of "give me a list"
        isList = True
    else:
        isList = conf['isList'] if 'isList' in conf else False

    if selector.strip() == '':
        return [elem] if isList else elem

    # can add more complexity and options with conf but simplest for now:
    # Prefer the generic locator API; 'css selector' is the string value of
    # By.CSS_SELECTOR, so no selenium import is needed here. Fall back to the
    # legacy helper only for objects that do not offer find_elements.
    findElements = getattr(elem, 'find_elements', None)
    if callable(findElements):
        resElems = findElements('css selector', selector)
    else:
        resElems = elem.find_elements_by_css_selector(selector)

    # The original compared conf (a dict) to "list" here, so list mode
    # always collapsed to the first match; use the computed flag instead.
    if isList:
        return resElems
    return resElems[0] if resElems else None
- ### htreeToDict function definition ###
def htreeToDict(lSoup, refDict):  # if not selenium
    # def htreeToDict(tagetElem, refDict):  # if selenium
    """Extract values from an element into a (possibly nested) dictionary.

    Parameters:
        lSoup: the element to search inside (a bs4 Tag, or a selenium
            WebElement when the selenium lines are enabled).
        refDict: maps each output key to a 3-item tuple
            ``(sel, attr, altr)``:
              * ``sel``  - CSS selector handed to the selectElement helper
                (blank selects ``lSoup`` itself).
              * ``attr`` - what to extract from the selection:
                  - a dict -> recursive call with that dict as ``refDict``
                    (entries that are not 3-item tuples are dropped);
                  - a list / tuple / set -> one ``getElementAttr`` result
                    per item, returned in the same container type;
                  - anything else -> passed to ``getElementAttr`` as is.
              * ``altr`` - either the string ``"list"`` (select every match
                instead of the first), or a list whose first item may be
                ``"list"`` or a config dict (``'isList'`` key) and whose
                2-item tuples ``(kind, data)`` are applied in order through
                ``alterTextVals(detVal, str(kind), data)``.

    Returns:
        dict with the same keys as ``refDict``. When nothing is selected the
        value is ``{}`` for a dict ``attr``, ``[]`` for a list ``attr``, and
        otherwise whatever the selector returned (``None`` or ``[]``).
    """
    detList = {}
    for k, (sel, attr, altr) in refDict.items():
        # --- work out the selection config for this key ---
        isList, confA = bool(altr == 'list'), {}
        if isinstance(altr, list) and altr:
            if isinstance(altr[0], dict):
                # Copy, so the 'isList' default written below never leaks
                # back into the caller's refDict.
                confA = dict(altr[0])
            if altr[0] == 'list':
                isList = True
        if 'isList' in confA:
            isList = confA['isList']
        else:
            confA['isList'] = isList

        s = selectElement_bs4(lSoup, sel, confA)  # if not selenium
        # s = selectElement_selenium(tagetElem, sel, confA)  # if selenium

        # --- turn the selection into detVal ---
        if not s:
            # Nothing selected: keep the container shape for dict/list attr,
            # otherwise pass the empty selection (None or []) through.
            if isinstance(attr, dict):
                detVal = {}
            elif isinstance(attr, list):
                detVal = []
            else:
                detVal = s
        elif isinstance(attr, dict):
            # Only well-formed (sel, attr, altr) entries are recursed into.
            subRef = {
                ik: iv for ik, iv in attr.items()
                if isinstance(iv, tuple) and len(iv) == 3
            }
            if isList:
                detVal = [htreeToDict(a, subRef) for a in s]
            else:
                detVal = htreeToDict(s, subRef)
        elif isinstance(attr, (list, tuple, set)):
            # Results come back in the same container type as attr.
            if isinstance(attr, list):
                recast = list
            elif isinstance(attr, set):
                recast = set
            else:
                recast = tuple
            if isList:
                detVal = [
                    recast([getElementAttr(ss, a) for a in attr]) for ss in s
                ]
            else:
                detVal = recast([getElementAttr(s, a) for a in attr])
        # elif # can add more options for attr # but the last 2 must be:
        elif isList:
            detVal = [getElementAttr(ss, attr) for ss in s]
        else:
            detVal = getElementAttr(s, attr)

        # --- apply value modifiers (skipped when there is no value) ---
        if detVal is None or not isinstance(altr, list):
            altr = []
        modifiers = [
            (str(a[0]), a[1]) for a in altr
            if isinstance(a, tuple) and len(a) == 2
        ]
        for aType, aVal in modifiers:
            detVal = alterTextVals(detVal, aType, aVal)

        detList[k] = detVal
    return detList
- ## returns a python dictionary ("detList") after processing the expected inputs:
- #### a bs4 Tag ("lSoup") [or a selenium WebElement ("tagetElem")]
- #### a dictionary of selectors and attributes ("refDict")
- ###### to build detList, a corresponding "detVal" will be set for each key in refDict
- ###### refDict values should be 3-item tuples ("sel","attr","altr")
- ###### sel should be a css selector to search for inside lSoup/tagetElem
- ###### attr specifies which attribute value to extract from the selected element
- ###### if sel is empty, then the attribute is extracted from lSoup/tagetElem itself
- ###### if attr is a python dictionary, detVal is built correspondingly with recursive call/s
- ###### if attr is a dict and altr is set to "list", then detVal will be a LIST of dicts
- ###### (instead of a single python dictionary)
- ###### altr should be a list of 2-item tuples (unless attr is a dict)
- ###### the first item in each tuple (a string) specifies some type of modification for detVal
- ###### the second item in each tuple should contain any data necessary for that modification
- ###### the first item in altr can also be "list" or a dictionary with an 'isList' key
- ###### (operations based on other key-value pairs can/might be added into the function as needed)
Advertisement
Add Comment
Please, Sign In to add comment