Try95th

prettify_htmlText

Nov 14th, 2022 (edited)
125
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.84 KB | None | 0 0
  1. ## takes a bs4 Tag and returns a string
  2. ## simpler version (without table markdown) at https://pastebin.com/fh4P45iE
  3. ## sample usage at https://stackoverflow.com/a/74257514/12652373
  4.  
  5. # when you want an attribute value or something else rather than just innerText
  6. not_text = {
  7.     'img': ['"["', 'alt', '"]("', 'src', '")"'], # get image alts and links
  8.     'a': ['"["', '', '"]("', 'href', '")"'], # get hyperlink texts and links
  9.     'br': [f'"\n"'], # include line breaks (normally skipped)
  10.  
  11.     'table': ['>html_to_hrmdTable'],
  12.  
  13.     # 'ul': ['>_sep_ - '], 'ol': ['>_sep_ - '], # reduce lists
  14.     # 'script': ['""'], # skip script tags
  15.     # 'style': ['""'], # skip style tags
  16. }
  17.  
  18. # for elements with indent
  19. indentsRef = {'li': 4}
  20.  
  21. # "Block-level Elements" from https://www.w3schools.com/html/html_blocks.asp
  22. bTags = ['address', 'article', 'aside', 'blockquote', 'canvas', 'dd', 'div', 'dl', 'dt', 'fieldset', 'figcaption', 'figure', 'footer', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hr', 'li', 'main', 'nav', 'noscript', 'ol', 'p', 'pre', 'section', 'table', 'tfoot', 'ul', 'video']
  23.  
  24. def html_to_hrmdTable(tSoup, colWidth_max=None, minUniq=2, minFilld=2):
  25.     # returns str
  26.     if len(tSoup.select('thead tr')) == 1:
  27.         simpleHeader = True
  28.     elif tSoup.find('tr') and tSoup.find('tr').find('th'):
  29.         simpleHeader = True
  30.     else:
  31.         simpleHeader = False
  32.  
  33.     tData = [[
  34.         ' '.join([ch for ch in c.get_text(' ').split() if ch])
  35.         for c in r.select('th, td')
  36.     ] for r in tSoup.select('tr')]
  37.  
  38.     colCt = max([len(r) for r in tData])
  39.     if not simpleHeader:
  40.         tData = [[
  41.             f'col_{i}' for i in range(1, colCt+1)
  42.         ]] + tData
  43.     if minFilld > 0 or minUniq > 0:
  44.         colData = [[
  45.             r[i] for r in tData[1:] if len(r) > i
  46.         ] for i in range(colCt)]
  47.         colData = [(
  48.             list(set(cd)), len([d for d in cd if d])
  49.         ) for cd in colData]
  50.         dropCols = [i for i, cu in enumerate(colData) if
  51.             len(cu[0]) < minUniq or cu[1] < minFilld]
  52.         tData = [[
  53.             c for i, c in enumerate(r) if i not in dropCols
  54.         ] for r in tData]
  55.         colCt = max([len(r) for r in tData])
  56.  
  57.     if type(colWidth_max) == int and colWidth_max > 5:
  58.         cw = [colWidth_max]*colCt
  59.     else:
  60.         cw = [max([
  61.             len(r[i]) for r in tData if len(r) > i
  62.         ]) for i in range(colCt)]
  63.  
  64.     tData_f = [[
  65.         f'{{:{w}}}'.format(c[:w]) for c, w in zip(r, cw)
  66.     ] for r in tData]
  67.  
  68.     tData_f = [tData_f[0]] + [['-'*w for w in cw]] + tData_f[1:]
  69.     tData_f = [' | '.join(r) for r in tData_f]
  70.  
  71.     return '\n'.join([f'| {r} |' for r in tData_f])
  72.  
  73.  
  74. def html_to_text(hNode, ble=bTags, indents=indentsRef, specialTags=not_text):
  75.     # returns (isBlockElement: bool, innerText: str)
  76.  
  77.     blePlus = ble + list(specialTags.keys())
  78.  
  79.     if hNode.name is None:
  80.         return (False, hNode.get_text(' ', strip=True))
  81.     if hNode.name not in blePlus and not hNode.find(blePlus):
  82.         return (False, hNode.get_text(' ', strip=True))
  83.    
  84.     cnIndent = ' '*sum([
  85.         indents[t.name] for t in (list(hNode.parents) + [hNode])
  86.         if t.name in indents
  87.     ])
  88.  
  89.     if hNode.name in specialTags:
  90.         attribs = []
  91.         for attrib in specialTags[hNode.name]:
  92.             if attrib == '':
  93.                 attribs.append(hNode.get_text(' ', strip=True))
  94.             elif attrib[0] == attrib[-1] == '"':
  95.                 attribs.append(attrib[1:-1])
  96.             elif attrib.startswith('>_sep_'):
  97.                 sep = attrib.replace('>_sep_', '', 1)
  98.                 attribs.append(hNode.get_text(sep, strip=True))
  99.             elif attrib == '>html_to_hrmdTable':
  100.                 attribs.append(html_to_hrmdTable(hNode))
  101.             #  elif attrib == '>__': # can add more
  102.             else: attribs.append(hNode.get(attrib, ''))
  103.            
  104.         joiner = ' ' # ?customize for multi-attribute
  105.         return (hNode.name in ble, joiner.join(attribs))
  106.  
  107.     if not hNode.find(blePlus):
  108.         nodeText = [
  109.             (cnIndent + l) for l in hNode.get_text(' ', strip=True).split('\n')
  110.         ]
  111.         return (True, '\n'.join(nodeText))
  112.  
  113.     nodeText = []
  114.     prevChild = None
  115.     for c in hNode.children:
  116.         c_h2t = html_to_text(c)
  117.  
  118.         space_before = cnIndent
  119.         if prevChild and (not prevChild[0]) and c_h2t[0]:
  120.             space_before = '\n' + space_before
  121.         space_after = '\n' if c_h2t[0] else ' '
  122.  
  123.         nodeText.append(space_before + c_h2t[1] + space_after)
  124.         prevChild = c_h2t
  125.     return (hNode.name in ble, ''.join(nodeText))
  126.  
  127. def prettify_htmlText(hNode, ble=bTags, indents=indentsRef, specialTags=not_text):
  128.     return '\n'.join([l.strip() for l in html_to_text(
  129.         hNode, ble, indents, specialTags
  130.     )[1].split('\n') if l.strip()])
  131.  
  132.  
  133.  
Advertisement
Add Comment
Please, Sign In to add comment