Advertisement
isendrak

html-parser-example.py

Feb 20th, 2019
138
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.46 KB | None | 0 0
  1. #!/usr/bin/python
  2. import re
  3.  
  4. class HtmlElement:
  5.     def __init__(self, tag, attributes={}, children=[], parent=None):
  6.         if type(tag) != str: raise TypeError("\"tag\" must be a string.")
  7.         if type(attributes) != dict: raise TypeError("\"attributes\" must be a dictionary.")
  8.         if type(children) != list: raise TypeError("\"children\" must be a list.")
  9.         if type(parent) != HtmlElement and parent != None: raise TypeError("\"parent\" must be a HtmlElement or None.")
  10.         self._tag = tag
  11.         self._attributes = attributes
  12.         self._children = children
  13.         self._parent = parent
  14.     def get_parent(self):
  15.         return self._parent
  16.     def get_children(self):
  17.         return self._children
  18.     def get_tag(self):
  19.         return self._tag
  20.     def get_attributes(self):
  21.         return self._attributes
  22.     def get_attribute(self, name):
  23.         for attribute in self._attributes:
  24.             if attribute.lower() == name.lower(): return self._attributes[attribute]
  25.         return False
  26.     def set_attribute(self, name, value):
  27.         for attribute in self._attributes:
  28.             if attribute.lower() == name.lower():
  29.                 self._attributes[attribute] = value
  30.                 return
  31.         self._attributes[name] = value
  32.     def append_child(self, child):
  33.         if type(child) not in (str, HtmlElement):
  34.             raise TypeError("\"child\" must be either a string or HtmlElement.")
  35.         self._children.append(child)
  36.     def get_element_by_id(self, id):
  37.         for child in self._children:
  38.             if type(child) != HtmlElement: continue
  39.             for attribute in child._attributes:
  40.                 if attribute.lower() == "id" and child._attributes[attribute] == id: return child
  41.             element = child.get_element_by_id(id)
  42.             if element: return element
  43.         return None
  44.     def get_elements_by_name(self, name):
  45.         elements = []
  46.         for child in self._children:
  47.             if type(child) != HtmlElement: continue
  48.             for attribute in child._attributes:
  49.                 if attribute.lower() == "name" and child._attributes[attribute].lower() == name.lower(): elements.append(child)
  50.             elements += child.get_elements_by_name(name)
  51.         return elements
  52.     def get_elements_by_class_name(self, class_name):
  53.         elements = []
  54.         for child in self._children:
  55.             if type(child) != HtmlElement: continue
  56.             for attribute in child._attributes:
  57.                 if attribute.lower() == "class" and class_name.lower() in [name.lower() for name in re.split("[ \t]+", child._attributes[attribute])]: elements.append(child)
  58.             elements += child.get_elements_by_class_name(class_name)
  59.         return elements
  60.     def get_elements_by_tag_name(self, tag_name):
  61.         elements = []
  62.         for child in self._children:
  63.             if type(child) != HtmlElement: continue
  64.             if child._tag.lower() == tag_name.lower(): elements.append(child)
  65.             elements += child.get_elements_by_tag_name(tag_name)
  66.         return elements
  67.     def __iter__(self):
  68.         for child in self._children:
  69.             yield child
  70.     def __str__(self):
  71.         if self._tag != "":
  72.             html = "<%s"%self._tag
  73.             if len(self._attributes) > 0:
  74.                 html += " " + " ".join(["%s=\"%s\""%(name, self._attributes[name]) if self._attributes[name] else name for name in self._attributes])
  75.             if len(self._children) == 0:
  76.                 if self._tag[0] != "!":
  77.                     return html + "/>"
  78.                 else:
  79.                     return html + ">"
  80.             html += ">"
  81.         else:
  82.             html = ""
  83.         for child in self._children:
  84.             html += str(child)
  85.         if self._tag != "":
  86.             return html + "</%s>"%self._tag
  87.         else:
  88.             return html
  89.  
  90. def parse_html(html, close_implicit=["meta","br","img","link"]):
  91.     if type(html) != str: raise TypeError("\"html\" must be a string.")
  92.     if type(close_implicit) not in (list, tuple): raise TypeError("\"close_implicit\" must be a list or tuple.")
  93.     close_implicit = [tag.lower() for tag in close_implicit]
  94.     root = HtmlElement("")
  95.     tags = []
  96.     in_tag = False
  97.     tag = ""
  98.     text = ""
  99.     current = root
  100.     for c in html:
  101.         if c == "<" and not in_tag:
  102.             if len(text) > 0:
  103.                 current.append_child(text)
  104.                 text = ""
  105.             in_tag = True
  106.         elif c == ">" and in_tag:
  107.             in_tag = False
  108.             if len(tag) == 0: pass
  109.             elif len(tag) >= 3 and (tag[:3] == "!--" and tag[-2::] == "--"):
  110.                 pass
  111.             elif tag[0] == "/":
  112.                 tag = tag[1:]
  113.                 if tags[-1].lower() == tag.lower():
  114.                     current = current.get_parent()
  115.                     tags.pop()
  116.                 else:
  117.                     raise ValueError("invalid closing tag \"%s\" for \"%s\""%(tag, tags[-1]))
  118.             elif tag[-1] == "/" or tag[0] == "!":
  119.                 tag = re.split("[ \t]+", tag.rstrip("/ "))
  120.                 attributes = {}
  121.                 for attribute in tag[1:]:
  122.                     attribute = re.findall("([^=]+)(?:|=[\"]{0,1}[^\"]+[\"]{0,1})", attribute)
  123.                     if len(attribute) <= 0: continue
  124.                     if len(attribute) == 1:
  125.                         attributes[attribute[0]] = None
  126.                     else:
  127.                         attributes[attribute[0]] = attribute[1].strip("\"")
  128.                 current.append_child(HtmlElement(tag[0],attributes,[],current))
  129.             else:
  130.                 tag = re.split("[ \t]+", tag)
  131.                 attributes = {}
  132.                 for attribute in tag[1:]:
  133.                     attribute = re.findall("([^=]+)(?:|=[\"]{0,1}[^\"]+[\"]{0,1})", attribute)
  134.                     if len(attribute) <= 0: continue
  135.                     if len(attribute) == 1:
  136.                         attribute = attribute[0], None
  137.                     else:
  138.                         attribute = attribute[0], attribute[1].strip("\"")
  139.                     attributes[attribute[0]] = attribute[1]
  140.                 if tag[0] in close_implicit:
  141.                     current.append_child(HtmlElement(tag[0],attributes,[],current))
  142.                 else:
  143.                     new = HtmlElement(tag[0],attributes,[],current)
  144.                     current.append_child(new)
  145.                     current = new
  146.                     tags.append(tag[0])
  147.             tag = ""
  148.         elif in_tag:
  149.             tag += c
  150.         else:
  151.             text += c
  152.     if len(tags) > 0:
  153.         raise ValueError("missing close tag for \"%s\""%tags[-1])
  154.     if len(text) > 0:
  155.         current.append_child(text)
  156.     return root
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement