Not a member of Pastebin yet?
                        Sign Up,
                        it unlocks many cool features!                    
## takes a bs4 Tag and returns a string
## simpler version (without table markdown) at https://pastebin.com/fh4P45iE
## sample usage at https://stackoverflow.com/a/74257514/12652373
 - # when you want an attribute value or something else rather than just innerText
 - not_text = {
 - 'img': ['"["', 'alt', '"]("', 'src', '")"'], # get image alts and links
 - 'a': ['"["', '', '"]("', 'href', '")"'], # get hyperlink texts and links
 - 'br': [f'"\n"'], # include line breaks (normally skipped)
 - 'table': ['>html_to_hrmdTable'],
 - # 'ul': ['>_sep_ - '], 'ol': ['>_sep_ - '], # reduce lists
 - # 'script': ['""'], # skip script tags
 - # 'style': ['""'], # skip style tags
 - }
 - # for elements with indent
 - indentsRef = {'li': 4}
 - # "Block-level Elements" from https://www.w3schools.com/html/html_blocks.asp
 - bTags = ['address', 'article', 'aside', 'blockquote', 'canvas', 'dd', 'div', 'dl', 'dt', 'fieldset', 'figcaption', 'figure', 'footer', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hr', 'li', 'main', 'nav', 'noscript', 'ol', 'p', 'pre', 'section', 'table', 'tfoot', 'ul', 'video']
 - def html_to_hrmdTable(tSoup, colWidth_max=None, minUniq=2, minFilld=2):
 - # returns str
 - if len(tSoup.select('thead tr')) == 1:
 - simpleHeader = True
 - elif tSoup.find('tr') and tSoup.find('tr').find('th'):
 - simpleHeader = True
 - else:
 - simpleHeader = False
 - tData = [[
 - ' '.join([ch for ch in c.get_text(' ').split() if ch])
 - for c in r.select('th, td')
 - ] for r in tSoup.select('tr')]
 - colCt = max([len(r) for r in tData])
 - if not simpleHeader:
 - tData = [[
 - f'col_{i}' for i in range(1, colCt+1)
 - ]] + tData
 - if minFilld > 0 or minUniq > 0:
 - colData = [[
 - r[i] for r in tData[1:] if len(r) > i
 - ] for i in range(colCt)]
 - colData = [(
 - list(set(cd)), len([d for d in cd if d])
 - ) for cd in colData]
 - dropCols = [i for i, cu in enumerate(colData) if
 - len(cu[0]) < minUniq or cu[1] < minFilld]
 - tData = [[
 - c for i, c in enumerate(r) if i not in dropCols
 - ] for r in tData]
 - colCt = max([len(r) for r in tData])
 - if type(colWidth_max) == int and colWidth_max > 5:
 - cw = [colWidth_max]*colCt
 - else:
 - cw = [max([
 - len(r[i]) for r in tData if len(r) > i
 - ]) for i in range(colCt)]
 - tData_f = [[
 - f'{{:{w}}}'.format(c[:w]) for c, w in zip(r, cw)
 - ] for r in tData]
 - tData_f = [tData_f[0]] + [['-'*w for w in cw]] + tData_f[1:]
 - tData_f = [' | '.join(r) for r in tData_f]
 - return '\n'.join([f'| {r} |' for r in tData_f])
 - def html_to_text(hNode, ble=bTags, indents=indentsRef, specialTags=not_text):
 - # returns (isBlockElement: bool, innerText: str)
 - blePlus = ble + list(specialTags.keys())
 - if hNode.name is None:
 - return (False, hNode.get_text(' ', strip=True))
 - if hNode.name not in blePlus and not hNode.find(blePlus):
 - return (False, hNode.get_text(' ', strip=True))
 - cnIndent = ' '*sum([
 - indents[t.name] for t in (list(hNode.parents) + [hNode])
 - if t.name in indents
 - ])
 - if hNode.name in specialTags:
 - attribs = []
 - for attrib in specialTags[hNode.name]:
 - if attrib == '':
 - attribs.append(hNode.get_text(' ', strip=True))
 - elif attrib[0] == attrib[-1] == '"':
 - attribs.append(attrib[1:-1])
 - elif attrib.startswith('>_sep_'):
 - sep = attrib.replace('>_sep_', '', 1)
 - attribs.append(hNode.get_text(sep, strip=True))
 - elif attrib == '>html_to_hrmdTable':
 - attribs.append(html_to_hrmdTable(hNode))
 - # elif attrib == '>__': # can add more
 - else: attribs.append(hNode.get(attrib, ''))
 - joiner = ' ' # ?customize for multi-attribute
 - return (hNode.name in ble, joiner.join(attribs))
 - if not hNode.find(blePlus):
 - nodeText = [
 - (cnIndent + l) for l in hNode.get_text(' ', strip=True).split('\n')
 - ]
 - return (True, '\n'.join(nodeText))
 - nodeText = []
 - prevChild = None
 - for c in hNode.children:
 - c_h2t = html_to_text(c)
 - space_before = cnIndent
 - if prevChild and (not prevChild[0]) and c_h2t[0]:
 - space_before = '\n' + space_before
 - space_after = '\n' if c_h2t[0] else ' '
 - nodeText.append(space_before + c_h2t[1] + space_after)
 - prevChild = c_h2t
 - return (hNode.name in ble, ''.join(nodeText))
 - def prettify_htmlText(hNode, ble=bTags, indents=indentsRef, specialTags=not_text):
 - return '\n'.join([l.strip() for l in html_to_text(
 - hNode, ble, indents, specialTags
 - )[1].split('\n') if l.strip()])
 
Advertisement
 
                    Add Comment                
                
                        Please, Sign In to add comment