Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
## takes a bs4 Tag and returns a string
## simpler version (without table markdown) at https://pastebin.com/fh4P45iE
## sample usage at https://stackoverflow.com/a/74257514/12652373

# when you want an attribute value or something else rather than just innerText
#
# Each key is a tag name; each value is a list of "recipe" items that
# html_to_text renders in order and joins with a single space:
#   ''                    -> the tag's inner text
#   '"literal"'           -> the text between the double quotes, verbatim
#   '>_sep_<separator>'   -> inner text with <separator> between text pieces
#   '>html_to_hrmdTable'  -> the tag rendered by html_to_hrmdTable
#   anything else         -> value of the attribute with that name ('' if absent)
not_text = {
    'img': ['"["', 'alt', '"]("', 'src', '")"'],  # get image alts and links
    'a': ['"["', '', '"]("', 'href', '")"'],  # get hyperlink texts and links
    'br': ['"\n"'],  # include line breaks (normally skipped)
    'table': ['>html_to_hrmdTable'],
    # 'ul': ['>_sep_ - '], 'ol': ['>_sep_ - '],  # reduce lists
    # 'script': ['""'],  # skip script tags
    # 'style': ['""'],  # skip style tags
}

# for elements with indent (tag name -> number of spaces)
indentsRef = {'li': 4}

# "Block-level Elements" from https://www.w3schools.com/html/html_blocks.asp
# (kept as a list: html_to_text concatenates it with another list)
bTags = [
    'address', 'article', 'aside', 'blockquote', 'canvas', 'dd', 'div',
    'dl', 'dt', 'fieldset', 'figcaption', 'figure', 'footer', 'form',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hr', 'li', 'main',
    'nav', 'noscript', 'ol', 'p', 'pre', 'section', 'table', 'tfoot',
    'ul', 'video',
]
def html_to_hrmdTable(tSoup, colWidth_max=None, minUniq=2, minFilld=2):
    """Render an HTML table as a human-readable markdown table string.

    Args:
        tSoup: a bs4 Tag; every ``tr`` found under it becomes one row and
            every ``th``/``td`` in that row becomes one cell.
        colWidth_max: when an int greater than 5, every column is rendered
            at exactly this width (longer cells are truncated, shorter ones
            padded). Otherwise each column is as wide as its longest cell.
        minUniq: columns with fewer distinct values than this (header row
            excluded) are dropped.
        minFilld: columns with fewer non-empty cells than this (header row
            excluded) are dropped.

    Returns:
        The table as a string, one ``| a | b |`` line per row with a dashed
        separator line after the header; ``''`` when no rows are found.

    Notes:
        * Column filtering is skipped only when both ``minUniq`` and
          ``minFilld`` are <= 0. With the defaults, a table that has a
          single data row loses all of its columns.
        * Rows shorter than the widest row are not padded.
    """
    # A header is "simple" when there is exactly one <thead> row, or when
    # the first row contains at least one <th>.
    firstRow = tSoup.find('tr')
    simpleHeader = (
        len(tSoup.select('thead tr')) == 1
        or (firstRow is not None and firstRow.find('th') is not None)
    )

    # Cell text with all whitespace runs collapsed to single spaces.
    tData = [
        [' '.join(c.get_text(' ').split()) for c in r.select('th, td')]
        for r in tSoup.select('tr')
    ]
    if not tData:
        # No rows at all: max() below would raise ValueError on an empty list.
        return ''

    colCt = max(len(r) for r in tData)
    if not simpleHeader:
        # Synthesize a header row: col_1, col_2, ...
        tData = [[f'col_{i}' for i in range(1, colCt + 1)]] + tData

    if minFilld > 0 or minUniq > 0:
        dropCols = set()
        for i in range(colCt):
            # Statistics are taken over data rows only (header excluded).
            vals = [r[i] for r in tData[1:] if len(r) > i]
            filled = sum(1 for v in vals if v)
            if len(set(vals)) < minUniq or filled < minFilld:
                dropCols.add(i)
        tData = [
            [c for i, c in enumerate(r) if i not in dropCols] for r in tData
        ]
        colCt = max(len(r) for r in tData)

    if isinstance(colWidth_max, int) and colWidth_max > 5:
        cw = [colWidth_max] * colCt
    else:
        cw = [
            max(len(r[i]) for r in tData if len(r) > i) for i in range(colCt)
        ]

    # ljust is used instead of a '{:w}' format spec so a zero-width column
    # (w == 0) cannot be misread as the '0' padding flag.
    tData_f = [[c[:w].ljust(w) for c, w in zip(r, cw)] for r in tData]
    tData_f = [tData_f[0], ['-' * w for w in cw]] + tData_f[1:]
    return '\n'.join('| ' + ' | '.join(r) + ' |' for r in tData_f)
def _special_tag_text(hNode, recipe):
    """Render one special tag by evaluating its recipe items in order."""
    parts = []
    for item in recipe:
        if item == '':
            # Empty item: the tag's own inner text.
            parts.append(hNode.get_text(' ', strip=True))
        elif item[0] == item[-1] == '"':
            # Double-quoted item: literal text, quotes removed.
            parts.append(item[1:-1])
        elif item.startswith('>_sep_'):
            # Inner text with a custom separator between text pieces.
            sep = item.replace('>_sep_', '', 1)
            parts.append(hNode.get_text(sep, strip=True))
        elif item == '>html_to_hrmdTable':
            parts.append(html_to_hrmdTable(hNode))
        # elif item == '>__': # can add more
        else:
            value = hNode.get(item, '')
            if isinstance(value, (list, tuple)):
                # bs4 returns multi-valued attributes (class, rel, ...) as
                # lists; flatten so the join below does not raise TypeError.
                value = ' '.join(value)
            parts.append(value)
    joiner = ' '  # ?customize for multi-attribute
    return joiner.join(parts)


def html_to_text(hNode, ble=bTags, indents=indentsRef, specialTags=not_text):
    """Convert a bs4 node to text, keeping block elements on their own lines.

    Args:
        hNode: a bs4 Tag or string node (string nodes have ``name`` None).
        ble: list of tag names treated as block-level elements.
        indents: dict mapping tag name -> number of spaces of indent; the
            indents of the node and all of its ancestors are summed.
        specialTags: dict mapping tag name -> list of recipe items that
            replace the plain inner text for that tag.

    Returns:
        (isBlockElement: bool, innerText: str)
    """
    blePlus = ble + list(specialTags.keys())

    # Plain string node: nothing to structure.
    if hNode.name is None:
        return (False, hNode.get_text(' ', strip=True))

    # Special tags are rendered from their recipe and never recursed into.
    if hNode.name in specialTags:
        text = _special_tag_text(hNode, specialTags[hNode.name])
        return (hNode.name in ble, text)

    hasNested = hNode.find(blePlus) is not None

    # Inline tag with nothing block-level/special inside: flat text.
    if hNode.name not in blePlus and not hasNested:
        return (False, hNode.get_text(' ', strip=True))

    cnIndent = ' ' * sum(
        indents[t.name]
        for t in (list(hNode.parents) + [hNode])
        if t.name in indents
    )

    # Block tag with only inline content: indent each line of its text.
    if not hasNested:
        lines = hNode.get_text(' ', strip=True).split('\n')
        return (True, '\n'.join(cnIndent + line for line in lines))

    nodeText = []
    prevChild = None
    for c in hNode.children:
        # Pass the caller's settings down; otherwise custom ble / indents /
        # specialTags would only apply to the root node.
        c_h2t = html_to_text(c, ble, indents, specialTags)
        space_before = cnIndent
        if prevChild and (not prevChild[0]) and c_h2t[0]:
            # A block child right after inline content starts a new line.
            space_before = '\n' + space_before
        space_after = '\n' if c_h2t[0] else ' '
        nodeText.append(space_before + c_h2t[1] + space_after)
        prevChild = c_h2t
    return (hNode.name in ble, ''.join(nodeText))
def prettify_htmlText(hNode, ble=bTags, indents=indentsRef, specialTags=not_text):
    """Return the html_to_text output for hNode with tidy lines.

    Every line is stripped of surrounding whitespace and lines that end up
    empty are removed; the remaining lines are joined with newlines.
    """
    _, rawText = html_to_text(hNode, ble, indents, specialTags)
    trimmed = (line.strip() for line in rawText.split('\n'))
    return '\n'.join(line for line in trimmed if line)
Advertisement
Add Comment
Please, Sign In to add comment