#!/usr/bin/env python3

import pprint
import re
import sys

# read in the HTML via stdin:
html = sys.stdin.read()

# define the set of tokens which our lexer knows about. the order matters:
# "singletag" must be tried before "opentag", or '<br/>' would be classified
# as an opening tag. the patterns use [^<>] instead of . so that a tag token
# can never run past its own closing '>':
symbol_table = (
    ("singletag", re.compile(r'<[a-zA-Z][^<>]*/>')),
    ("closetag", re.compile(r'</[a-zA-Z][^<>]*>')),
    ("opentag", re.compile(r'<[a-zA-Z][^<>]*>')),
    ("other", re.compile(r'[^<>]+')),
)
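# a quick illustrative sanity check of the ordering claim above: the lexer
# takes the first pattern that matches, so singletag has to win on '<br/>':
assert symbol_table[0][1].match("<br/>") is not None  # singletag matches it
assert symbol_table[2][1].match("<br/>") is not None  # ...but opentag would too
assert symbol_table[2][1].match("</p>") is None       # opentag skips close tags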
# a representation of a token:
class Token(object):
    def __init__(self, name, text):
        self.name = name
        self.text = text

    def __repr__(self):
        return "%s(%s)" % (self.name, self.text)
# consumes enough chars from the front of the text to create the next token,
# returning (token, chars_consumed), or (None, None) if the pattern doesn't
# match at the front:
def consume(symbol_pair, text):
    (token_name, pattern) = symbol_pair
    m = pattern.match(text)
    if m is not None:
        matched_text = m.group()
        (start_index, end_index) = m.span()
        token = Token(token_name, matched_text)
        result = (token, end_index)
    else:
        result = (None, None)
    return result
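# e.g. (illustrative) consume(("other", re.compile(r'[^<>]+')), "hi<br/>")
# returns (other(hi), 2): the recognized token plus how far to advance.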
# the lexer: turns a string of characters into a list of recognized tokens:
def tokenize(symbol_table, text):
    tokens = []
    while len(text) > 0:
        for symbol_pair in symbol_table:
            (token, consumed_count) = consume(symbol_pair, text)
            if token is not None:
                tokens.append(token)
                text = text[consumed_count:]
                break
        else:
            # no pattern matched at the front of the remaining text:
            raise Exception("bad input: '%s'" % text)
    return tokens
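# e.g. (illustrative) tokenize(symbol_table, "<p>hi<br/></p>") yields
# [opentag(<p>), other(hi), singletag(<br/>), closetag(</p>)].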
# lex our input into tokens:
tokens = tokenize(symbol_table, html)

# print out the tokens as a sanity check:
print("tokens:")
pprint.pprint(tokens)
# a representation of a node in a parse tree:
class Node(object):
    def __init__(self, token):
        self.token = token      # the opening (or only) token for this node
        self.closetoken = None  # the matching close tag's token, if any
        self.subnodes = []      # child nodes, in document order

    def __repr__(self):
        return "%s(%s)" % (self.token, self.subnodes)
# the parser: turns a linear stream of tokens into a parse tree. it trusts
# the input to be well-formed: any close tag ends the current node, without
# checking that the tag names actually match:
def parse(tokens, node):
    while len(tokens) > 0:
        token = tokens.pop(0)
        if token.name == "opentag":
            # an open tag starts a child node; recurse to fill it in:
            subnode = Node(token)
            parse(tokens, subnode)
            node.subnodes.append(subnode)
        elif token.name == "closetag":
            # a close tag ends the node we are currently filling in:
            node.closetoken = token
            break
        elif token.name in ["singletag", "other"]:
            # self-closing tags and text become leaf nodes:
            subnode = Node(token)
            node.subnodes.append(subnode)
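# e.g. (illustrative) parsing the token list from "<p>hi<br/></p>" under a
# fresh root produces one <p> node with two subnodes: the text "hi" and the
# self-closing <br/>.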
# parse the tokens into a tree. start by creating a root node (no token):
parse_tree = Node(None)
parse(tokens, parse_tree)
# prints out the parse tree, with indentation to indicate tree structure:
def print_parsetree(node, indent=0):
    print("%s%s" % (" " * indent, node.token if node.token else "(root)"))
    for subnode in node.subnodes:
        print_parsetree(subnode, indent + 1)
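# e.g. (illustrative) for "<p>hi<br/></p>" this prints:
#   (root)
#    opentag(<p>)
#     other(hi)
#     singletag(<br/>)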
# print out the parse tree as a sanity check:
print()
print("parse tree:")
print_parsetree(parse_tree)
# finds any video embed iframes and replaces them with "ios-video" divs:
def replace_iframe(node):
    for i in range(len(node.subnodes)):
        subnode = node.subnodes[i]
        name = subnode.token.name
        text = subnode.token.text
        if (name == "opentag" and text.startswith("<iframe")
                and 'src="/embed/' in text and "flo-video-embed" in text):
            # swap the iframe node out for a div node at the same position:
            del node.subnodes[i]
            replacement = Node(Token("opentag", '<div class="ios-video">'))
            replacement.closetoken = Token("closetag", "</div>")
            node.subnodes.insert(i, replacement)
        else:
            # recurse into this child so its own subnodes get checked too:
            replace_iframe(subnode)

# search-and-replace the video embed iframes:
replace_iframe(parse_tree)
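# e.g. (hypothetical markup, matching the conditions above) an embed like
#   <iframe class="flo-video-embed" src="/embed/1234"></iframe>
# comes out the other side as <div class="ios-video"></div>.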
# print out the modified parse tree as a sanity check:
print()
print("modified parse tree:")
print_parsetree(parse_tree)
# serializes a parse tree back into HTML. printing with end="" keeps the
# tokens from being padded with extra whitespace:
def dump_html(node):
    if node.token:
        print(node.token.text, end="")
    for subnode in node.subnodes:
        dump_html(subnode)
    if node.closetoken:
        print(node.closetoken.text, end="")

# print out the HTML of our modified parse tree:
print()
print("modified HTML:")
dump_html(parse_tree)
print()
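# usage (from a shell; the filename here is just an example):
#   python3 replace_embeds.py < page.html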