## prettify_htmlText

## takes a bs4 Tag and returns a string
## simpler version (without table markdown) at https://pastebin.com/fh4P45iE
## sample usage at https://stackoverflow.com/a/74257514/12652373

# Rendering recipes for tags where you want an attribute value or something
# else rather than just innerText. html_to_text joins the parts of a recipe
# with a single space; each part is one of:
#   '"..."'               literal text (the outer double quotes are dropped)
#   ''                    the tag's own text
#   '>_sep_<sep>'         the tag's text, with <sep> between its strings
#   '>html_to_hrmdTable'  the tag rendered by html_to_hrmdTable
#   anything else         the value of the attribute with that name
not_text = {
    # get image alts and links
    'img': ['"["', 'alt', '"]("', 'src', '")"'],
    # get hyperlink texts and links
    'a': ['"["', '', '"]("', 'href', '")"'],
    # include line breaks (normally skipped)
    'br': ['"\n"'],

    'table': ['>html_to_hrmdTable'],

    # optional recipes, disabled by default:
    # 'ul': ['>_sep_ - '], 'ol': ['>_sep_ - '], # reduce lists
    # 'script': ['""'], # skip script tags
    # 'style': ['""'], # skip style tags
}

# for elements with indent: tag name -> number of spaces of indent
indentsRef = dict(li=4)

# "Block-level Elements" from https://www.w3schools.com/html/html_blocks.asp
bTags = [
    'address', 'article', 'aside', 'blockquote', 'canvas',
    'dd', 'div', 'dl', 'dt',
    'fieldset', 'figcaption', 'figure', 'footer', 'form',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hr',
    'li', 'main', 'nav', 'noscript', 'ol', 'p', 'pre',
    'section', 'table', 'tfoot', 'ul', 'video',
]

def html_to_hrmdTable(tSoup, colWidth_max=None, minUniq=2, minFilld=2):
    """Render an HTML table node as a pipe-delimited, markdown-style table.

    Args:
        tSoup: the table node (a bs4 Tag per the file header). Only its
            ``select``/``find`` methods, and ``get_text`` on its cells,
            are used.
        colWidth_max: when an int greater than 5, every column is rendered
            at exactly this width and longer cell text is truncated;
            otherwise each column is as wide as its longest cell.
        minUniq: columns whose body cells (all rows after the header) hold
            fewer distinct values than this are dropped. An empty string
            counts as a value.
        minFilld: columns with fewer non-empty body cells than this are
            dropped. Column filtering is skipped entirely when both
            minUniq and minFilld are <= 0.

    Returns:
        One ``| a | b |`` line per row, joined with newlines, with a dashed
        separator line after the header row. Returns '' when the table has
        no rows, or when no columns are left after filtering.
    """
    # A header row exists when <thead> holds exactly one row, or when the
    # first row of the table contains a <th> cell.
    if len(tSoup.select('thead tr')) == 1:
        simpleHeader = True
    else:
        firstRow = tSoup.find('tr')
        simpleHeader = firstRow is not None and firstRow.find('th') is not None

    # Cell texts with every run of whitespace collapsed to a single space.
    tData = [
        [' '.join(c.get_text(' ').split()) for c in r.select('th, td')]
        for r in tSoup.select('tr')
    ]
    if not tData:
        # No rows at all: max() below would raise on the empty sequence.
        return ''

    colCt = max(len(r) for r in tData)
    if not simpleHeader:
        # Synthesize a header row so the first data row is not consumed.
        tData = [[f'col_{i}' for i in range(1, colCt + 1)]] + tData

    if minFilld > 0 or minUniq > 0:
        dropCols = set()
        for i in range(colCt):
            # Body values only (header excluded); short rows contribute
            # nothing to columns they do not reach.
            vals = [r[i] for r in tData[1:] if len(r) > i]
            filled = sum(1 for v in vals if v)
            if len(set(vals)) < minUniq or filled < minFilld:
                dropCols.add(i)
        tData = [
            [c for i, c in enumerate(r) if i not in dropCols] for r in tData
        ]
        colCt = max(len(r) for r in tData)

    if colCt == 0:
        # Nothing left to show; avoid emitting lines of bare pipes.
        return ''

    # Pad short rows (after filtering, so the fill/uniqueness counts above
    # are unaffected) so that every output line has the same column count.
    tData = [r + [''] * (colCt - len(r)) for r in tData]

    if isinstance(colWidth_max, int) and colWidth_max > 5:
        cw = [colWidth_max] * colCt
    else:
        cw = [max(len(r[i]) for r in tData) for i in range(colCt)]

    # Truncate to the column width, then left-justify. ljust pads the same
    # way a '{:w}' string format does, without building a format spec from
    # the width (so a zero width is fine too).
    rows = [[c[:w].ljust(w) for c, w in zip(r, cw)] for r in tData]
    rows.insert(1, ['-' * w for w in cw])

    return '\n'.join('| ' + ' | '.join(r) + ' |' for r in rows)


def html_to_text(hNode, ble=bTags, indents=indentsRef, specialTags=not_text):
    """Convert a parsed HTML node to text, reporting whether it is a block.

    Args:
        hNode: node to convert (a bs4 Tag per the file header, or one of
            its string children, whose ``name`` is None).
        ble: names of the tags treated as block-level elements.
        indents: maps tag names to a number of spaces; the indent applied
            to a node is the sum over the node and all of its ancestors.
        specialTags: maps tag names to rendering recipes, i.e. lists of
            parts that are resolved one by one and joined with a space:
            '"..."' is literal text, '' is the tag's own text,
            '>_sep_<sep>' is the tag's text with <sep> as separator,
            '>html_to_hrmdTable' renders the tag with html_to_hrmdTable,
            and anything else is read as an attribute name.

    Returns:
        (isBlockElement: bool, innerText: str)
    """
    blePlus = list(ble) + list(specialTags)

    # Nodes without a tag name (plain strings) are always inline text.
    if hNode.name is None:
        return (False, hNode.get_text(' ', strip=True))

    if hNode.name in specialTags:
        attribs = []
        for attrib in specialTags[hNode.name]:
            if attrib == '':
                attribs.append(hNode.get_text(' ', strip=True))
            elif attrib[0] == attrib[-1] == '"':
                attribs.append(attrib[1:-1])
            elif attrib.startswith('>_sep_'):
                sep = attrib.replace('>_sep_', '', 1)
                attribs.append(hNode.get_text(sep, strip=True))
            elif attrib == '>html_to_hrmdTable':
                attribs.append(html_to_hrmdTable(hNode))
            #  elif attrib == '>__': # can add more
            else:
                value = hNode.get(attrib, '')
                # Multi-valued attributes (e.g. class) come back as lists,
                # which the join below cannot handle; flatten them first.
                if isinstance(value, (list, tuple)):
                    value = ' '.join(value)
                attribs.append(value)

        joiner = ' ' # ?customize for multi-attribute
        return (hNode.name in ble, joiner.join(attribs))

    # Looked up once; decides between the flat and the recursive paths.
    hasNested = hNode.find(blePlus) is not None

    # Inline element with nothing block-level or special inside it.
    if hNode.name not in blePlus and not hasNested:
        return (False, hNode.get_text(' ', strip=True))

    cnIndent = ' ' * sum(
        indents[t.name] for t in [*hNode.parents, hNode]
        if t.name in indents
    )

    if not hasNested:
        # Block element holding only inline content: indent each line.
        flat = hNode.get_text(' ', strip=True)
        return (True, '\n'.join(cnIndent + l for l in flat.split('\n')))

    nodeText = []
    prevChild = None
    for c in hNode.children:
        # Pass the caller's settings down; without this, descendants would
        # silently fall back to the module-level defaults.
        c_h2t = html_to_text(c, ble, indents, specialTags)

        space_before = cnIndent
        # Start a block on its own line when it follows inline content.
        if prevChild is not None and not prevChild[0] and c_h2t[0]:
            space_before = '\n' + space_before
        space_after = '\n' if c_h2t[0] else ' '

        nodeText.append(space_before + c_h2t[1] + space_after)
        prevChild = c_h2t
    return (hNode.name in ble, ''.join(nodeText))

def prettify_htmlText(hNode, ble=bTags, indents=indentsRef, specialTags=not_text):
    """Return the text of hNode as produced by html_to_text, tidied up.

    The arguments are handed to html_to_text unchanged. Every line of the
    resulting text is stripped of surrounding whitespace, and lines that
    are empty after stripping are left out.
    """
    _, rawText = html_to_text(hNode, ble, indents, specialTags)

    keptLines = []
    for rawLine in rawText.split('\n'):
        trimmed = rawLine.strip()
        if trimmed:
            keptLines.append(trimmed)
    return '\n'.join(keptLines)