Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/python
- import re
class HtmlElement:
    """A node in a minimal HTML tree.

    An element has a tag name, a dictionary of attributes, an ordered list
    of children (each either a text string or another ``HtmlElement``) and
    an optional parent.  An element whose tag is the empty string acts as a
    transparent container: ``str()`` renders only its children.

    Attribute names are matched case-insensitively by all lookup methods.
    An attribute whose value is ``None`` is a valueless attribute
    (e.g. ``disabled``).
    """

    def __init__(self, tag, attributes=None, children=None, parent=None):
        """Create an element.

        tag        -- tag name (string); "" makes a transparent container.
        attributes -- dict of attribute name -> value; a fresh empty dict
                      is created when omitted.
        children   -- list of child nodes; a fresh empty list is created
                      when omitted.
        parent     -- the enclosing HtmlElement, or None for a root.

        The containers passed in are stored by reference, not copied.
        Raises TypeError if an argument has the wrong type.
        """
        # Defaults are created per instance: a literal {} / [] in the
        # signature would be shared by every element built without
        # explicit arguments.
        if attributes is None:
            attributes = {}
        if children is None:
            children = []
        if not isinstance(tag, str):
            raise TypeError("\"tag\" must be a string.")
        if not isinstance(attributes, dict):
            raise TypeError("\"attributes\" must be a dictionary.")
        if not isinstance(children, list):
            raise TypeError("\"children\" must be a list.")
        if parent is not None and not isinstance(parent, HtmlElement):
            raise TypeError("\"parent\" must be a HtmlElement or None.")
        self._tag = tag
        self._attributes = attributes
        self._children = children
        self._parent = parent

    def get_parent(self):
        """Return the parent element, or None for a root."""
        return self._parent

    def get_children(self):
        """Return the live list of children (strings and elements)."""
        return self._children

    def get_tag(self):
        """Return the tag name as given at construction."""
        return self._tag

    def get_attributes(self):
        """Return the live attribute dictionary."""
        return self._attributes

    def _attribute_values(self, name):
        """Yield the value of every attribute whose key matches name,
        ignoring case."""
        wanted = name.lower()
        for key, value in self._attributes.items():
            if key.lower() == wanted:
                yield value

    def _descendants(self):
        """Yield every descendant element in document (pre-order) order,
        skipping text nodes."""
        for child in self._children:
            if isinstance(child, HtmlElement):
                yield child
                for descendant in child._descendants():
                    yield descendant

    def get_attribute(self, name):
        """Return the value of attribute name (case-insensitive).

        Returns False when the attribute is absent; a present but
        valueless attribute returns None.
        """
        for value in self._attribute_values(name):
            return value
        return False

    def set_attribute(self, name, value):
        """Set attribute name to value.

        If a key matching name case-insensitively already exists, that
        key is updated; otherwise name is added as a new key.
        """
        wanted = name.lower()
        for key in self._attributes:
            if key.lower() == wanted:
                self._attributes[key] = value
                return
        self._attributes[name] = value

    def append_child(self, child):
        """Append a text string or HtmlElement to the children.

        Raises TypeError for any other type.  The child's parent is not
        modified.
        """
        if not isinstance(child, (str, HtmlElement)):
            raise TypeError("\"child\" must be either a string or HtmlElement.")
        self._children.append(child)

    def get_element_by_id(self, id):
        """Return the first descendant whose "id" attribute equals id
        (exact, case-sensitive value match), or None if there is none."""
        for element in self._descendants():
            for value in element._attribute_values("id"):
                if value == id:
                    return element
        return None

    def get_elements_by_name(self, name):
        """Return all descendants whose "name" attribute equals name,
        compared case-insensitively, in document order."""
        wanted = name.lower()
        matches = []
        for element in self._descendants():
            for value in element._attribute_values("name"):
                # Valueless attributes (None) cannot match and must not
                # be lower-cased.
                if isinstance(value, str) and value.lower() == wanted:
                    matches.append(element)
                    break
        return matches

    def get_elements_by_class_name(self, class_name):
        """Return all descendants whose "class" attribute lists
        class_name (case-insensitive), in document order."""
        wanted = class_name.lower()
        matches = []
        for element in self._descendants():
            for value in element._attribute_values("class"):
                if not isinstance(value, str):
                    continue  # valueless "class" attribute
                classes = [item.lower() for item in re.split("[ \t]+", value)]
                if wanted in classes:
                    matches.append(element)
                    break
        return matches

    def get_elements_by_tag_name(self, tag_name):
        """Return all descendants with the given tag name
        (case-insensitive), in document order."""
        wanted = tag_name.lower()
        return [element for element in self._descendants()
                if element._tag.lower() == wanted]

    def __iter__(self):
        """Iterate over the direct children."""
        return iter(self._children)

    def __str__(self):
        """Serialize the element and its subtree to markup.

        Attributes with a false value (None or "") are written as a bare
        name.  A childless element is written self-closed ("<br/>"),
        except declarations starting with "!" which end with ">".
        No escaping is applied to text or attribute values.
        """
        inner = "".join([str(child) for child in self._children])
        if self._tag == "":
            return inner
        html = "<%s" % self._tag
        if self._attributes:
            rendered = []
            for name, value in self._attributes.items():
                rendered.append("%s=\"%s\"" % (name, value) if value else name)
            html += " " + " ".join(rendered)
        if not self._children:
            return html + (">" if self._tag[0] == "!" else "/>")
        return html + ">" + inner + "</%s>" % self._tag
# First run of non-whitespace characters inside a tag: the tag name.
_TAG_NAME_RE = re.compile(r"\s*(\S+)")

# One attribute: a name optionally followed by =value, where the value is
# double-quoted, single-quoted or bare.
_ATTRIBUTE_RE = re.compile(
    r"([^\s=]+)"
    r"(?:\s*=\s*"
    r"(?:\"([^\"]*)\""
    r"|'([^']*)'"
    r"|(\S+)))?"
)


def _parse_tag(tag_text):
    """Split the inside of an opening tag into (name, attributes)."""
    name_match = _TAG_NAME_RE.match(tag_text)
    if name_match is None:
        return "", {}
    attributes = {}
    for match in _ATTRIBUTE_RE.finditer(tag_text, name_match.end()):
        key, double_quoted, single_quoted, bare = match.groups()
        if double_quoted is not None:
            value = double_quoted
        elif single_quoted is not None:
            value = single_quoted
        elif bare is not None:
            # A bare value may carry a stray quote from an unterminated
            # quoted value; drop it.
            value = bare.strip("\"")
        else:
            value = None  # valueless attribute such as "disabled"
        attributes[key] = value
    return name_match.group(1), attributes


def parse_html(html, close_implicit=["meta","br","img","link"]):
    """Parse an HTML string into a tree of HtmlElement nodes.

    html           -- the markup to parse (string).
    close_implicit -- list or tuple of tag names that never take a closing
                      tag (matched case-insensitively); such elements are
                      added as childless nodes.  The argument is only read,
                      never modified.

    Returns a root HtmlElement with an empty tag whose children are the
    top-level nodes.  Comments ("<!-- ... -->") and empty tags ("<>") are
    discarded.  Tags ending in "/" and declarations starting with "!" become
    childless elements.  A tag left unterminated at the end of the input is
    dropped.

    Raises TypeError for arguments of the wrong type and ValueError for a
    closing tag that does not match the innermost open element, a closing
    tag with no open element, or an element left unclosed at the end.
    """
    if not isinstance(html, str):
        raise TypeError("\"html\" must be a string.")
    if not isinstance(close_implicit, (list, tuple)):
        raise TypeError("\"close_implicit\" must be a list or tuple.")
    implicit = set([name.lower() for name in close_implicit])

    # Explicit fresh containers so the root never shares state with any
    # other element.
    root = HtmlElement("", {}, [])
    current = root
    open_tags = []      # names of the currently open elements, outermost first
    in_tag = False
    tag_chars = []      # characters collected between "<" and ">"
    text_chars = []     # characters collected outside of tags

    for c in html:
        if c == "<" and not in_tag:
            if text_chars:
                current.append_child("".join(text_chars))
                text_chars = []
            in_tag = True
        elif c == ">" and in_tag:
            in_tag = False
            tag = "".join(tag_chars)
            tag_chars = []
            if not tag:
                continue
            if tag.startswith("!--") and tag.endswith("--"):
                continue  # comment
            if tag[0] == "/":
                name = tag[1:].strip()
                if not open_tags:
                    raise ValueError("invalid closing tag \"%s\": no element is open" % name)
                if open_tags[-1].lower() != name.lower():
                    raise ValueError("invalid closing tag \"%s\" for \"%s\""%(name, open_tags[-1]))
                current = current.get_parent()
                open_tags.pop()
            elif tag[-1] == "/" or tag[0] == "!":
                # Self-closing tag or declaration: never becomes "current".
                name, attributes = _parse_tag(tag.rstrip("/ "))
                current.append_child(HtmlElement(name, attributes, [], current))
            else:
                name, attributes = _parse_tag(tag)
                element = HtmlElement(name, attributes, [], current)
                current.append_child(element)
                if name.lower() not in implicit:
                    current = element
                    open_tags.append(name)
        elif in_tag:
            tag_chars.append(c)
        else:
            text_chars.append(c)

    if open_tags:
        raise ValueError("missing close tag for \"%s\""%open_tags[-1])
    if text_chars:
        current.append_child("".join(text_chars))
    return root
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement