SHARE
TWEET

not_my_homework

DeaD_EyE Jul 29th, 2019 64 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/usr/bin/env python3
  2. """
  3. Specialized Html2Json converter.
  4. Does only work with the given format.
  5.  
  6. Html2Json reads from stdin by default.
  7. If stdin is a terminal, the input file given
  8. with the -f option is read instead.
  9. """
  10.  
  11. import json
  12. import sys
  13. from argparse import ArgumentParser
  14. from collections import deque
  15. from pathlib import Path
  16. from typing import Union
  17.  
  18. from bs4 import BeautifulSoup
  19.  
  20.  
# Sample table markup; main() passes it to html2json() when its
# `example` argument is true (the -e command line flag).
example_html = """<table>
 <tbody>
   <tr></tr>
   <tr>
     <th>1 abc</th>
     <td>good</td>
     <td><a href="/good">John (Nick)</a></td>
     <td>Lincoln</td>
   </tr>
   <tr>
       <th>20 xyz</th>
       <td>bad</td>
       <td><a href="/bad">Emma</a></td>
       <td>Smith</td>
     </tr>
     <tr></tr>
     ...
 </tbody>
</table>"""
  40.  
  41.  
  42. def html2json(html: str, debug: bool = False):
  43.     """
  44.    Converts html to a json str.
  45.    """
  46.     collections = {'collections': []}
  47.     result = collections['collections']
  48.     # result has the same reference of the list object
  49.     # inside the list
  50.     fields = 'size identity state name nick'.split()
  51.     skip_fields = 'state nick'.split()
  52.     bs = BeautifulSoup(html, features='html.parser')
  53.     for tr in bs.find_all('tr'):
  54.         state = deque()
  55.         for th_td in tr.children:
  56.             if hasattr(th_td, 'text'):
  57.                 state.append(th_td.text)
  58.         if not state:
  59.             continue
  60.         if debug:
  61.             print(state, file=sys.stderr)
  62.         size, identity = state.popleft().strip().split()
  63.         size = int(size)
  64.         state.extendleft((size, identity))
  65.         dataset = {
  66.             field: value for (field, value) in
  67.             zip(fields, state)
  68.             if field not in skip_fields
  69.         }
  70.         result.append(dataset)
  71.     return json.dumps(collections, indent=4)
  72.  
  73.  
  74. def main(
  75.     inputfile: Union[None, Path],
  76.     debug: bool, *,
  77.     example: bool
  78.     ) -> str:
  79.     """
  80.    Convert the html inputfile to a json string
  81.    and return it.
  82.  
  83.    If sys.stdin is a pipe, then prefering this as source.
  84.    """
  85.     if example:
  86.         return html2json(example_html, debug)
  87.     if inputfile and sys.stdin.isatty():
  88.         html_source = inputfile.read_text()
  89.     elif not inputfile and not sys.stdin.isatty():
  90.         html_source = sys.stdin.read()
  91.         # reads until the pipe is closed by
  92.         # the previous process: cat for example
  93.     else:
  94.         # should be impossible
  95.         # prefering in this case the stdin
  96.         html_source = sys.stdin.read()
  97.     return html2json(html_source, debug)
  98.  
  99.  
  100. if __name__ == '__main__':
  101.     parser = ArgumentParser(description=__doc__)
  102.     parser.add_argument('-f', dest='inputfile', default=None, type=Path, help='A path to the inputfile, if stdin is not used.')
  103.     parser.add_argument('-d', dest='debug', action='store_true', help='Debug')
  104.     parser.add_argument('-e', dest='example', action='store_true', help='Example with example data')
  105.     args = parser.parse_args()
  106.     json_str = main(**vars(args))
  107.     print(json_str)
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
Not a member of Pastebin yet?
Sign Up, it unlocks many cool features!
 
Top