Try95th

html_to_text.py

Nov 14th, 2022 (edited)
163
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.03 KB | None | 0 0
  1. ## takes a bs4 Tag and returns a 2-item tuple (bool, str)
  2. ## sample usage at https://stackoverflow.com/a/74257514/12652373
  3.  
  4. # when you want an attribute value or something else rather than just innerText
  5. not_text = {
  6.     'img': ['alt'], # get image alts
  7.     'br': [f'"\n"'], # include line breaks (normally skipped)
  8.    
  9.     'tr': ['"\n| "', '>_sep_ | ', '" |"'], # table rows in separate lines
  10.     # elements inside will be separated like | el1 | el2 |.... |
  11.     # (problematic if rows have elements other that columns)
  12.     # for better table-handling, see https://gist.github.com/Driftr95/89df5f5814339af7cbdd5a60cc3468a3
  13.  
  14.     'ul': ['>_sep_ - '], 'ol': ['>_sep_ - '], 'footer': ['>_sep_ - '],
  15.     'script': ['""'], # skip script tags
  16.     'style': ['""'] # skip style tags
  17. }
  18.  
  19. # for elements with indent
  20. indentsRef = {'li': 4}
  21.  
  22. # "Block-level Elements" from https://www.w3schools.com/html/html_blocks.asp
  23. bTags = ['address', 'article', 'aside', 'blockquote', 'canvas', 'dd', 'div', 'dl', 'dt', 'fieldset', 'figcaption', 'figure', 'footer', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hr', 'li', 'main', 'nav', 'noscript', 'ol', 'p', 'pre', 'section', 'table', 'tfoot', 'ul', 'video']
  24.  
  25. def html_to_text(hNode, ble=bTags, indents=indentsRef, specialTags=not_text):
  26.     # returns (isBlockElement: bool, innerText: str)
  27.  
  28.     blePlus = ble + list(specialTags.keys())
  29.  
  30.     if hNode.name is None:
  31.         return (False, hNode.get_text(' ', strip=True))
  32.     if hNode.name not in blePlus and not hNode.find(blePlus):
  33.         return (False, hNode.get_text(' ', strip=True))
  34.    
  35.     cnIndent = ' '*sum([
  36.         indents[t.name] for t in (list(hNode.parents) + [hNode])
  37.         if t.name in indents
  38.     ])
  39.  
  40.     if hNode.name in specialTags:
  41.         attribs = []
  42.         for attrib in specialTags[hNode.name]:
  43.             if attrib == '':
  44.                 attribs.append(hNode.get_text(' ', strip=True))
  45.             elif attrib[0] == attrib[-1] == '"':
  46.                 attribs.append(attrib[1:-1])
  47.             elif attrib.startswith('>_sep_'):
  48.                 sep = attrib.replace('>_sep_', '', 1)
  49.                 attribs.append(hNode.get_text(sep, strip=True))
  50.             #  elif attrib == '>__': # can add more
  51.             else: attribs.append(hNode.get(attrib, ''))
  52.            
  53.         joiner = ' ' # ?customize for multi-attribute
  54.         return (hNode.name in ble, joiner.join(attribs))
  55.  
  56.     if not hNode.find(blePlus):
  57.         nodeText = [
  58.             (cnIndent + l) for l in hNode.get_text(' ', strip=True).split('\n')
  59.         ]
  60.         return (True, '\n'.join(nodeText))
  61.  
  62.     nodeText = []
  63.     prevChild = None
  64.     for c in hNode.children:
  65.         c_h2t = html_to_text(c)
  66.  
  67.         space_before = cnIndent
  68.         if prevChild and (not prevChild[0]) and c_h2t[0]:
  69.             space_before = '\n' + space_before
  70.         space_after = '\n' if c_h2t[0] else ' '
  71.  
  72.         nodeText.append(space_before + c_h2t[1] + space_after)
  73.         prevChild = c_h2t
  74.     return (hNode.name in ble, ''.join(nodeText))
Advertisement
Add Comment
Please, Sign In to add comment