Guest User

Untitled

a guest
Jul 20th, 2018
103
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.02 KB | None | 0 0
  1. #!/usr/bin/env python
  2.  
  3. # read in the HTML via stdin:
  4. import sys
# Slurp everything on stdin into a single string.
# NOTE: this name shadows the builtin input(); it is kept as-is because the
# tokenize() call further down the script reads it under this name.
input = sys.stdin.read()
  6.  
  7. # define the set of toknes which our lexer knows about:
  8. import re
  9. symbol_table = (
  10. ("opentag", re.compile(r'<[a-zA-Z].*?>')),
  11. ("closetag", re.compile(r'</[a-zA-Z].*?>')),
  12. ("singletag", re.compile(r'<[a-zA-Z].*?/>')),
  13. ("other", re.compile(r'[^<>]+')),
  14. )
  15.  
  16. # a representation of a token:
  17. class Token(object):
  18. def __init__(self, name, text):
  19. self.name = name
  20. self.text = text
  21. def __repr__(self):
  22. return "%s(%s)" % (self.name, self.text)
  23.  
  24. # consumes enough chars from input to create the next token:
  25. def consume(symbol_pair, input):
  26. (token_name, pattern) = symbol_pair
  27. m = pattern.match(input)
  28. if m is not None:
  29. matched_text = m.group()
  30. (start_index, end_index) = m.span()
  31. token = Token(token_name, matched_text)
  32. result = (token, end_index)
  33. else:
  34. result = (None, None)
  35. return result
  36.  
  37. # the lexer: turns a list of characters into a list of recognized tokens:
  38. def tokenize(symbol_table, input):
  39. tokens = []
  40. while len(input) > 0:
  41. for symbol_pair in symbol_table:
  42. (token, consumed_count) = consume(symbol_pair, input)
  43. if token is not None:
  44. tokens.append(token)
  45. input = input[consumed_count:]
  46. break
  47. else:
  48. raise Exception("bad input: '%s'" % input)
  49. return tokens
  50.  
  51. # lex our input into tokens:
  52. tokens = tokenize(symbol_table, input)
  53.  
  54. # print out the tokens as a sanity check:
  55. import pprint
  56. print "tokens:"
  57. pprint.pprint(tokens)
  58.  
  59. # a representation of a node in a parse tree:
  60. class Node(object):
  61. def __init__(self, token):
  62. self.token = token
  63. self.closetoken = None
  64. self.subnodes = []
  65. def __repr__(self):
  66. return "%s(%s)" % (self.token, self.subnodes)
  67.  
  68. # the parser: turns a linear stream of tokens into a parse tree:
  69. def parse(tokens, node):
  70. while len(tokens) > 0:
  71. token = tokens.pop(0)
  72. if token.name == "opentag":
  73. subnode = Node(token)
  74. parse(tokens, subnode)
  75. node.subnodes.append(subnode)
  76. elif token.name == "closetag":
  77. node.closetoken = token
  78. break
  79. elif token.name in ["singletag", "other"]:
  80. subnode = Node(token)
  81. node.subnodes.append(subnode)
  82.  
# parse the tokens into a tree. start by creating a root node whose token is
# None; parse() hangs everything it builds beneath that node.
# NOTE: parse() pops tokens off the front of `tokens` as it consumes them, so
# the list is modified by this call.
parse_tree = Node(None)
parse(tokens, parse_tree)
  86.  
  87. # prints out the parse tree, with indentation to indicate tree structure.
  88. def print_parsetree(node, indent=0):
  89. print "%s%s" % (" " * indent, node.token if node.token else "(root)")
  90. for subnode in node.subnodes:
  91. print_parsetree(subnode, indent+1)
  92.  
  93. # print out the parse tree as a sanity check:
  94. print
  95. print "parse tree:"
  96. print_parsetree(parse_tree)
  97.  
  98. # finds any video embed iframes and replaces them with "ios-video" divs:
  99. def replace_iframe(node):
  100. for i in range(len(node.subnodes)):
  101. subnode = node.subnodes[i]
  102. name = subnode.token.name
  103. text = subnode.token.text
  104. if name == "opentag" and text.startswith("<iframe") and "src=\"/embed/" in text and "flo-video-embed" in text:
  105. del node.subnodes[i]
  106. replacement = Node(Token("opentag", "<div class=\"ios-video\">"))
  107. replacement.closetoken = Token("closetag", "</div>")
  108. node.subnodes.insert(i, replacement)
  109. else:
  110. for subsubnode in subnode.subnodes:
  111. replace_iframe(subsubnode)
  112.  
  113. # search-and-replace the video embed iframes:
  114. replace_iframe(parse_tree)
  115.  
  116. # print out the modified parse tree as a sanity check:
  117. print
  118. print "modified parse tree:"
  119. print_parsetree(parse_tree)
  120.  
  121. # serializes a parse tree back into HTML:
  122. def dump_html(node):
  123. if node.token:
  124. print node.token.text,
  125. for subnode in node.subnodes:
  126. dump_html(subnode)
  127. if node.closetoken:
  128. print node.closetoken.text,
  129.  
  130. # print out the HTML of our modified parse tree:
  131. print
  132. print "modified HTML:"
  133. print dump_html(parse_tree)
Add Comment
Please, Sign In to add comment