# html_to_text.py

## takes a bs4 Tag and returns a 2-item tuple (bool, str)
## sample usage at https://stackoverflow.com/a/74257514/12652373

# Per-tag extraction recipes, for when you want an attribute value or
# something else rather than just innerText. Each recipe is a list of
# directives; the directive results are joined with a single space:
#   ''            -> the tag's innerText
#   '"literal"'   -> the text between the double quotes, verbatim
#   '>_sep_XYZ'   -> innerText with the descendant strings separated by 'XYZ'
#   anything else -> the value of the attribute with that name ('' if absent)
not_text = {
    'img': ['alt'], # get image alts
    'br': ['"\n"'], # include line breaks (normally skipped)

    'tr': ['"\n| "', '>_sep_ | ', '" |"'], # table rows in separate lines
    # elements inside will be separated like | el1 | el2 |.... |
    # (problematic if rows have elements other than columns)
    # for better table-handling, see https://gist.github.com/Driftr95/89df5f5814339af7cbdd5a60cc3468a3

    'ul': ['>_sep_ - '], 'ol': ['>_sep_ - '], 'footer': ['>_sep_ - '],
    'script': ['""'], # skip script tags
    'style': ['""'] # skip style tags
}

# for elements with indent: tag name -> number of spaces
# (summed over a node and all of its ancestors)
indentsRef = {'li': 4}

# "Block-level Elements" from https://www.w3schools.com/html/html_blocks.asp
bTags = ['address', 'article', 'aside', 'blockquote', 'canvas', 'dd', 'div', 'dl', 'dt', 'fieldset', 'figcaption', 'figure', 'footer', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hr', 'li', 'main', 'nav', 'noscript', 'ol', 'p', 'pre', 'section', 'table', 'tfoot', 'ul', 'video']


def _special_tag_text(hNode, directives):
    """Return the text for a special tag, built from its recipe directives."""
    pieces = []
    for directive in directives:
        if directive == '':
            pieces.append(hNode.get_text(' ', strip=True))
        elif directive[0] == directive[-1] == '"':
            pieces.append(directive[1:-1])
        elif directive.startswith('>_sep_'):
            sep = directive.replace('>_sep_', '', 1)
            pieces.append(hNode.get_text(sep, strip=True))
        else:
            value = hNode.get(directive, '')
            # Multi-valued attributes (e.g. class) come back as a list,
            # which str.join below would reject.
            if isinstance(value, (list, tuple)):
                value = ' '.join(map(str, value))
            pieces.append(value)

    return ' '.join(pieces)


def html_to_text(hNode, ble=bTags, indents=indentsRef, specialTags=not_text):
    """Convert a parsed HTML node to text.

    Args:
        hNode: the node to convert; it must provide ``name``, ``get_text``,
            ``find``, ``get``, ``parents`` and ``children`` (e.g. a bs4 Tag
            or NavigableString).
        ble: iterable of tag names treated as block-level elements.
        indents: mapping of tag name -> number of spaces of indentation
            contributed by that tag (summed over the node and its ancestors).
        specialTags: mapping of tag name -> list of directives describing
            what to emit for that tag (see the comment above ``not_text``).

    Returns:
        A 2-item tuple ``(isBlockElement: bool, innerText: str)``.
    """
    # list(...) so that ble may be any iterable, not only a list
    blePlus = list(ble) + list(specialTags)

    # text nodes have no tag name
    if hNode.name is None:
        return (False, hNode.get_text(' ', strip=True))

    if hNode.name in specialTags:
        return (hNode.name in ble, _special_tag_text(hNode, specialTags[hNode.name]))

    cnIndent = ' ' * sum(
        indents[t.name] for t in (*hNode.parents, hNode) if t.name in indents
    )

    # No block-level/special descendants: the whole subtree is one run of text.
    if hNode.find(blePlus) is None:
        flatText = hNode.get_text(' ', strip=True)
        if hNode.name not in blePlus:
            return (False, flatText)
        return (True, '\n'.join(cnIndent + l for l in flatText.split('\n')))

    nodeText = []
    prevIsBlock = None
    for c in hNode.children:
        # Pass the configuration down so that custom settings apply to the
        # whole subtree, not only to the top-level node.
        isBlock, childText = html_to_text(c, ble, indents, specialTags)

        space_before = cnIndent
        # start a new line when a block element follows inline content
        if prevIsBlock is False and isBlock:
            space_before = '\n' + space_before
        space_after = '\n' if isBlock else ' '

        nodeText.append(space_before + childText + space_after)
        prevIsBlock = isBlock
    return (hNode.name in ble, ''.join(nodeText))