Advertisement
Guest User

Python Test

a guest
Oct 8th, 2012
63
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 8.26 KB | None | 0 0
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3.  
  4. '''
  5. Context-independent xHTML pair matcher
  6. Use method <code>match(html, start_ix)</code> to find matching pair.
  7. If pair was found, this function returns a list of indexes where tag pair
  8. starts and ends. If pair wasn't found, <code>None</code> will be returned.
  9.  
  10. The last matched (or unmatched) result is saved in <code>last_match</code>
  11. dictionary for later use.
  12.  
  13. @author: Sergey Chikuyonok (serge.che@gmail.com)
  14. '''
  15. import re
  16.  
# Opening (or self-closed) tag.
# Groups: 1 = tag name, 2 = raw attribute string,
#         3 = '/' when the tag is written self-closed (e.g. <br />), else ''.
start_tag = r'<([\w\:\-]+)((?:\s+[\w\-:]+(?:\s*=\s*(?:(?:"[^"]*")|(?:\'[^\']*\')|[^>\s]+))?)*)\s*(\/?)>'
# Closing tag. Group 1 = tag name; anything between the name and '>' is tolerated.
end_tag = r'<\/([\w\:\-]+)[^>]*>'
# Single attribute.
# Groups: 1 = name, 2 = double-quoted value, 3 = single-quoted value,
#         4 = unquoted value. Backslash escapes are accepted inside quoted values.
attr = r'([\w\-:]+)(?:\s*=\s*(?:(?:"((?:\\.|[^"])*)")|(?:\'((?:\\.|[^\'])*)\')|([^>\s]+)))?'
  20.  
"Last matched HTML pair"
# Updated by save_match(); both index fields are -1 while no pair is stored.
last_match = {
    'opening_tag': None, # Tag() or Comment() object
    'closing_tag': None, # Tag() or Comment() object
    'start_ix': -1,
    'end_ix': -1
}

cur_mode = 'xhtml'
"Current matching mode"
# Holds either 'xhtml' or 'html'; changed through set_mode().
  31.  
  32. def set_mode(new_mode):
  33.     global cur_mode
  34.     if new_mode != 'html': new_mode = 'xhtml'
  35.     cur_mode = new_mode
  36.  
  37. def make_map(elems):
  38.     """
  39.     Create dictionary of elements for faster searching
  40.     @param elems: Elements, separated by comma
  41.     @type elems: str
  42.     """
  43.     obj = {}
  44.     for elem in elems.split(','):
  45.             obj[elem] = True
  46.  
  47.     return obj
  48.  
  49. # Empty Elements - HTML 4.01
  50. empty = make_map("area,base,basefont,br,col,frame,hr,img,input,isindex,link,meta,param,embed");
  51.  
  52. # Block Elements - HTML 4.01
  53. block = make_map("address,applet,blockquote,button,center,dd,dir,div,dl,dt,fieldset,form,frameset,hr,iframe,isindex,li,map,menu,noframes,noscript,object,ol,p,pre,script,table,tbody,td,tfoot,th,thead,tr,ul");
  54.  
  55. # Inline Elements - HTML 4.01
  56. inline = make_map("a,abbr,acronym,applet,b,basefont,bdo,big,br,button,cite,code,del,dfn,em,font,i,iframe,img,input,ins,kbd,label,map,object,q,s,samp,select,small,span,strike,strong,sub,sup,textarea,tt,u,var");
  57.  
  58. # Elements that you can, intentionally, leave open
  59. # (and which close themselves)
  60. close_self = make_map("colgroup,dd,dt,li,options,p,td,tfoot,th,thead,tr");
  61.  
  62. # Attributes that have their values filled in disabled="disabled"
  63. fill_attrs = make_map("checked,compact,declare,defer,disabled,ismap,multiple,nohref,noresize,noshade,nowrap,readonly,selected");
  64.  
  65. #Special Elements (can contain anything)
  66. # serge.che: parsing data inside <scipt> elements is a "feature"
  67. special = make_map("style");
  68.  
  69. class Tag():
  70.     """Matched tag"""
  71.     def __init__(self, match, ix):
  72.         """
  73.         @type match: MatchObject
  74.         @param match: Matched HTML tag
  75.         @type ix: int
  76.         @param ix: Tag's position
  77.         """
  78.         global cur_mode
  79.        
  80.         name = match.group(1).lower()
  81.         self.name = name
  82.         self.full_tag = match.group(0)
  83.         self.start = ix
  84.         self.end = ix + len(self.full_tag)
  85.         self.unary = ( len(match.groups()) > 2 and bool(match.group(3)) ) or (name in empty and cur_mode == 'html')
  86.         self.type = 'tag'
  87.         self.close_self = (name in close_self and cur_mode == 'html')
  88.  
  89. class Comment():
  90.     "Matched comment"
  91.     def __init__(self, start, end):
  92.         self.start = start
  93.         self.end = end
  94.         self.type = 'comment'
  95.  
  96. def make_range(opening_tag=None, closing_tag=None, ix=0):
  97.     """
  98.     Makes selection ranges for matched tag pair
  99.     @type opening_tag: Tag
  100.    @type closing_tag: Tag
  101.    @type ix: int
  102.    @return list
  103.     """
  104.     start_ix, end_ix = -1, -1
  105.    
  106.     if opening_tag and not closing_tag: # unary element
  107.         start_ix = opening_tag.start
  108.         end_ix = opening_tag.end
  109.     elif opening_tag and closing_tag: # complete element
  110.         if (opening_tag.start < ix and opening_tag.end > ix) or (closing_tag.start <= ix and closing_tag.end > ix):
  111.             start_ix = opening_tag.start
  112.             end_ix = closing_tag.end;
  113.         else:
  114.             start_ix = opening_tag.end
  115.             end_ix = closing_tag.start
  116.    
  117.     return start_ix, end_ix
  118.  
  119. def save_match(opening_tag=None, closing_tag=None, ix=0):
  120.     """
  121.     Save matched tag for later use and return found indexes
  122.    @type opening_tag: Tag
  123.    @type closing_tag: Tag
  124.    @type ix: int
  125.    @return list
  126.     """
  127.     last_match['opening_tag'] = opening_tag;
  128.     last_match['closing_tag'] = closing_tag;
  129.    
  130.     last_match['start_ix'], last_match['end_ix'] = make_range(opening_tag, closing_tag, ix)
  131.    
  132.     return last_match['start_ix'] != -1 and (last_match['start_ix'], last_match['end_ix']) or (None, None)
  133.  
  134. def match(html, start_ix, mode='xhtml'):
  135.     """
  136.     Search for matching tags in <code>html</code>, starting from
  137.     <code>start_ix</code> position. The result is automatically saved
  138.     in <code>last_match</code> property
  139.     """
  140.     return _find_pair(html, start_ix, mode, save_match)
  141.  
  142. def find(html, start_ix, mode='xhtml'):
  143.     """
  144.     Search for matching tags in <code>html</code>, starting from
  145.     <code>start_ix</code> position.
  146.     """
  147.     return _find_pair(html, start_ix, mode)
  148.  
  149. def get_tags(html, start_ix, mode='xhtml'):
  150.     """
  151.     Search for matching tags in <code>html</code>, starting from
  152.     <code>start_ix</code> position. The difference between
  153.     <code>match</code> function itself is that <code>get_tags</code>
  154.     method doesn't save matched result in <code>last_match</code> property
  155.     and returns array of opening and closing tags
  156.     This method is generally used for lookups
  157.     """
  158.     return _find_pair(html, start_ix, mode, lambda op, cl=None, ix=0: (op, cl) if op and op.type == 'tag' else None)
  159.  
  160.  
  161. def _find_pair(html, start_ix, mode='xhtml', action=make_range):
  162.     """
  163.     Search for matching tags in <code>html</code>, starting from
  164.     <code>start_ix</code> position
  165.    
  166.     @param html: Code to search
  167.     @type html: str
  168.    
  169.     @param start_ix: Character index where to start searching pair
  170.     (commonly, current caret position)
  171.     @type start_ix: int
  172.    
  173.     @param action: Function that creates selection range
  174.     @type action: function
  175.    
  176.     @return: list
  177.     """
  178.  
  179.     forward_stack = []
  180.     backward_stack = []
  181.     opening_tag = None
  182.     closing_tag = None
  183.     html_len = len(html)
  184.    
  185.     set_mode(mode)
  186.  
  187.     def has_match(substr, start=None):
  188.         if start is None:
  189.             start = ix
  190.  
  191.         return html.find(substr, start) == start
  192.  
  193.  
  194.     def find_comment_start(start_pos):
  195.         while start_pos:
  196.             if html[start_pos] == '<' and has_match('<!--', start_pos):
  197.                 break
  198.  
  199.             start_pos -= 1
  200.  
  201.         return start_pos
  202.  
  203. #    find opening tag
  204.     ix = start_ix - 1
  205.     while ix >= 0:
  206.         ch = html[ix]
  207.         if ch == '<':
  208.             check_str = html[ix:]
  209.             m = re.match(end_tag, check_str)
  210.             if m:  # found closing tag
  211.                 tmp_tag = Tag(m, ix)
  212.                 if tmp_tag.start < start_ix and tmp_tag.end > start_ix: # direct hit on searched closing tag
  213.                     closing_tag = tmp_tag
  214.                 else:
  215.                     backward_stack.append(tmp_tag)
  216.             else:
  217.                 m = re.match(start_tag, check_str)
  218.                 if m: # found opening tag
  219.                     tmp_tag = Tag(m, ix);
  220.                     if tmp_tag.unary:
  221.                         if tmp_tag.start < start_ix and tmp_tag.end > start_ix: # exact match
  222.                             return action(tmp_tag, None, start_ix)
  223.                     elif backward_stack and backward_stack[-1].name == tmp_tag.name:
  224.                         backward_stack.pop()
  225.                     else: # found nearest unclosed tag
  226.                         opening_tag = tmp_tag
  227.                         break
  228.                 elif check_str.startswith('<!--'): # found comment start
  229.                     end_ix = check_str.find('-->') + ix + 3;
  230.                     if ix < start_ix and end_ix >= start_ix:
  231.                         return action(Comment(ix, end_ix))
  232.         elif ch == '-' and has_match('-->'): # found comment end
  233.             # search left until comment start is reached
  234.             ix = find_comment_start(ix)
  235.  
  236.         ix -= 1
  237.        
  238.     if not opening_tag:
  239.         return action(None)
  240.    
  241.     # find closing tag
  242.     if not closing_tag:
  243.         ix = start_ix
  244.         while ix < html_len:
  245.             ch = html[ix]
  246.             if ch == '<':
  247.                 check_str = html[ix:]
  248.                 m = re.match(start_tag, check_str)
  249.                 if m: # found opening tag
  250.                     tmp_tag = Tag(m, ix);
  251.                     if not tmp_tag.unary:
  252.                         forward_stack.append(tmp_tag)
  253.                 else:
  254.                     m = re.match(end_tag, check_str)
  255.                     if m:   #found closing tag
  256.                         tmp_tag = Tag(m, ix);
  257.                         if forward_stack and forward_stack[-1].name == tmp_tag.name:
  258.                             forward_stack.pop()
  259.                         else:  # found matched closing tag
  260.                             closing_tag = tmp_tag;
  261.                             break
  262.                     elif has_match('<!--'): # found comment
  263.                         ix += check_str.find('-->') + 3
  264.                         continue
  265.             elif ch == '-' and has_match('-->'):
  266.                 # looks like cursor was inside comment with invalid HTML
  267.                 if not forward_stack or forward_stack[-1].type != 'comment':
  268.                     end_ix = ix + 3
  269.                     return action(Comment( find_comment_start(ix), end_ix ))
  270.                
  271.             ix += 1
  272.    
  273.     return action(opening_tag, closing_tag, start_ix)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement