SHARE
TWEET

html-parser-example.py

isendrak Feb 20th, 2019 (edited) 66 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/usr/bin/python
  2. import re
  3.  
  4. class HtmlElement:
  5.     def __init__(self, tag, attributes={}, children=[], parent=None):
  6.         if type(tag) != str: raise TypeError("\"tag\" must be a string.")
  7.         if type(attributes) != dict: raise TypeError("\"attributes\" must be a dictionary.")
  8.         if type(children) != list: raise TypeError("\"children\" must be a list.")
  9.         if type(parent) != HtmlElement and parent != None: raise TypeError("\"parent\" must be a HtmlElement or None.")
  10.         self._tag = tag
  11.         self._attributes = attributes
  12.         self._children = children
  13.         self._parent = parent
  14.     def get_parent(self):
  15.         return self._parent
  16.     def get_children(self):
  17.         return self._children
  18.     def get_tag(self):
  19.         return self._tag
  20.     def get_attributes(self):
  21.         return self._attributes
  22.     def get_attribute(self, name):
  23.         for attribute in self._attributes:
  24.             if attribute.lower() == name.lower(): return self._attributes[attribute]
  25.         return False
  26.     def set_attribute(self, name, value):
  27.         for attribute in self._attributes:
  28.             if attribute.lower() == name.lower():
  29.                 self._attributes[attribute] = value
  30.                 return
  31.         self._attributes[name] = value
  32.     def append_child(self, child):
  33.         if type(child) not in (str, HtmlElement):
  34.             raise TypeError("\"child\" must be either a string or HtmlElement.")
  35.         self._children.append(child)
  36.     def get_element_by_id(self, id):
  37.         for child in self._children:
  38.             if type(child) != HtmlElement: continue
  39.             for attribute in child._attributes:
  40.                 if attribute.lower() == "id" and child._attributes[attribute] == id: return child
  41.             element = child.get_element_by_id(id)
  42.             if element: return element
  43.         return None
  44.     def get_elements_by_name(self, name):
  45.         elements = []
  46.         for child in self._children:
  47.             if type(child) != HtmlElement: continue
  48.             for attribute in child._attributes:
  49.                 if attribute.lower() == "name" and child._attributes[attribute].lower() == name.lower(): elements.append(child)
  50.             elements += child.get_elements_by_name(name)
  51.         return elements
  52.     def get_elements_by_class_name(self, class_name):
  53.         elements = []
  54.         for child in self._children:
  55.             if type(child) != HtmlElement: continue
  56.             for attribute in child._attributes:
  57.                 if attribute.lower() == "class" and class_name.lower() in [name.lower() for name in re.split("[ \t]+", child._attributes[attribute])]: elements.append(child)
  58.             elements += child.get_elements_by_class_name(class_name)
  59.         return elements
  60.     def get_elements_by_tag_name(self, tag_name):
  61.         elements = []
  62.         for child in self._children:
  63.             if type(child) != HtmlElement: continue
  64.             if child._tag.lower() == tag_name.lower(): elements.append(child)
  65.             elements += child.get_elements_by_tag_name(tag_name)
  66.         return elements
  67.     def __iter__(self):
  68.         for child in self._children:
  69.             yield child
  70.     def __str__(self):
  71.         if self._tag != "":
  72.             html = "<%s"%self._tag
  73.             if len(self._attributes) > 0:
  74.                 html += " " + " ".join(["%s=\"%s\""%(name, self._attributes[name]) if self._attributes[name] else name for name in self._attributes])
  75.             if len(self._children) == 0:
  76.                 if self._tag[0] != "!":
  77.                     return html + "/>"
  78.                 else:
  79.                     return html + ">"
  80.             html += ">"
  81.         else:
  82.             html = ""
  83.         for child in self._children:
  84.             html += str(child)
  85.         if self._tag != "":
  86.             return html + "</%s>"%self._tag
  87.         else:
  88.             return html
  89.  
  90. def parse_html(html, close_implicit=["meta","br","img","link"]):
  91.     if type(html) != str: raise TypeError("\"html\" must be a string.")
  92.     if type(close_implicit) not in (list, tuple): raise TypeError("\"close_implicit\" must be a list or tuple.")
  93.     close_implicit = [tag.lower() for tag in close_implicit]
  94.     root = HtmlElement("")
  95.     tags = []
  96.     in_tag = False
  97.     tag = ""
  98.     text = ""
  99.     current = root
  100.     for c in html:
  101.         if c == "<" and not in_tag:
  102.             if len(text) > 0:
  103.                 current.append_child(text)
  104.                 text = ""
  105.             in_tag = True
  106.         elif c == ">" and in_tag:
  107.             in_tag = False
  108.             if len(tag) == 0: pass
  109.             elif len(tag) >= 3 and (tag[:3] == "!--" and tag[-2::] == "--"):
  110.                 pass
  111.             elif tag[0] == "/":
  112.                 tag = tag[1:]
  113.                 if tags[-1].lower() == tag.lower():
  114.                     current = current.get_parent()
  115.                     tags.pop()
  116.                 else:
  117.                     raise ValueError("invalid closing tag \"%s\" for \"%s\""%(tag, tags[-1]))
  118.             elif tag[-1] == "/" or tag[0] == "!":
  119.                 tag = re.split("[ \t]+", tag.rstrip("/ "))
  120.                 attributes = {}
  121.                 for attribute in tag[1:]:
  122.                     attribute = re.findall("([^=]+)(?:|=[\"]{0,1}[^\"]+[\"]{0,1})", attribute)
  123.                     if len(attribute) <= 0: continue
  124.                     if len(attribute) == 1:
  125.                         attributes[attribute[0]] = None
  126.                     else:
  127.                         attributes[attribute[0]] = attribute[1].strip("\"")
  128.                 current.append_child(HtmlElement(tag[0],attributes,[],current))
  129.             else:
  130.                 tag = re.split("[ \t]+", tag)
  131.                 attributes = {}
  132.                 for attribute in tag[1:]:
  133.                     attribute = re.findall("([^=]+)(?:|=[\"]{0,1}[^\"]+[\"]{0,1})", attribute)
  134.                     if len(attribute) <= 0: continue
  135.                     if len(attribute) == 1:
  136.                         attribute = attribute[0], None
  137.                     else:
  138.                         attribute = attribute[0], attribute[1].strip("\"")
  139.                     attributes[attribute[0]] = attribute[1]
  140.                 if tag[0] in close_implicit:
  141.                     current.append_child(HtmlElement(tag[0],attributes,[],current))
  142.                 else:
  143.                     new = HtmlElement(tag[0],attributes,[],current)
  144.                     current.append_child(new)
  145.                     current = new
  146.                     tags.append(tag[0])
  147.             tag = ""
  148.         elif in_tag:
  149.             tag += c
  150.         else:
  151.             text += c
  152.     if len(tags) > 0:
  153.         raise ValueError("missing close tag for \"%s\""%tags[-1])
  154.     if len(text) > 0:
  155.         current.append_child(text)
  156.     return root
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
Not a member of Pastebin yet?
Sign Up, it unlocks many cool features!
 
Top