SHARE
TWEET

regexp

a guest Jul 1st, 2011 78 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. from functools import *
  2.  
  3. WORDCHAR = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890_'
  4.  
  5.  
  6. def match_nothing(txt, pos):
  7.   return True, pos
  8.  
  9. def match_character(c, txt, pos):
  10.   return pos < len(txt) and txt[pos] == c, pos + 1
  11.  
  12. def match_space(txt, pos):
  13.   return pos < len(txt) and txt[pos].isspace(), pos + 1
  14.  
  15. def match_word(txt, pos):
  16.   return pos < len(txt) and txt[pos] in WORDCHAR, pos + 1
  17.  
  18. def match_nonword(txt, pos):
  19.   return pos < len(txt) and txt[pos] not in WORDCHAR, pos + 1
  20.  
  21. def match_dot(txt, pos):
  22.   return pos < len(txt), pos + 1
  23.  
  24. def match_start(txt, pos):
  25.   return pos == 0, pos
  26.  
  27. def match_end(txt, pos):
  28.   return pos == len(txt), pos
  29.  
  30.  
  31. def create_state(states, match=None, last=None, next=None, name=None):
  32.   if next is None: next = []
  33.   if match is None: match = match_nothing
  34.  
  35.   state = len(states)
  36.   states[state] = (match, next, name)
  37.   if last is not None:
  38.     states[last][1].append(state)
  39.  
  40.   return state
  41.  
  42.  
  43. def compile_or(states, last, regexp, pos):
  44.   mfirst = create_state(states, last=last, name='or_first')
  45.   mlast = create_state(states, name='or_last')
  46.  
  47.   while True:
  48.     pos, first, last = compile_seq(states, mfirst, regexp, pos)
  49.     states[last][1].append(mlast)
  50.     if pos != len(regexp) and regexp[pos] == '|':
  51.       pos += 1
  52.     else:
  53.       assert pos == len(regexp) or regexp[pos] == ')'
  54.       break
  55.  
  56.   return pos, mfirst, mlast
  57.  
  58.  
  59. def compile_paren(states, last, regexp, pos):
  60.   states.setdefault(-2, [])   # stores indexes
  61.   states.setdefault(-1, [])   # stores text
  62.  
  63.   group = len(states[-1])
  64.   states[-2].append(None)
  65.   states[-1].append(None)
  66.  
  67.   def match_pfirst(txt, pos):
  68.     states[-2][group] = pos
  69.     return True, pos
  70.  
  71.   def match_plast(txt, pos):
  72.     old = states[-2][group]
  73.     states[-1][group] = txt[old:pos]
  74.     return True, pos
  75.  
  76.   mfirst = create_state(states, match=match_pfirst, last=last, name='paren_first')
  77.   mlast = create_state(states, match=match_plast, name='paren_last')
  78.  
  79.   pos, first, last = compile_or(states, mfirst, regexp, pos)
  80.   assert regexp[pos] == ')'
  81.  
  82.   states[last][1].append(mlast)
  83.   return pos + 1, mfirst, mlast
  84.  
  85.  
  86. def compile_seq(states, last, regexp, pos):
  87.   first = create_state(states, last=last, name='seq')
  88.   last = first
  89.  
  90.   while pos < len(regexp):
  91.     p = regexp[pos]
  92.     if p == '\\':
  93.       pos += 1
  94.       p += regexp[pos]
  95.  
  96.     if p in '|)':
  97.       break
  98.  
  99.     elif p == '(':
  100.       pos, first, last = compile_paren(states, last, regexp, pos + 1)
  101.  
  102.     elif p in '+*':
  103.       # first -> u ->...-> last -> v -> t
  104.       # v -> first (matches at least once)
  105.       # first -> t (skip on *)
  106.       # u becomes new first
  107.       # first is inserted before u
  108.  
  109.       u = create_state(states)
  110.       v = create_state(states, next=[first])
  111.       t = create_state(states, last=v)
  112.      
  113.       states[last][1].append(v)
  114.       states[u] = states[first]
  115.       states[first] = (match_nothing, [[u], [u, t]][p == '*'])
  116.    
  117.       last = t
  118.       pos += 1
  119.  
  120.     else:  # simple states
  121.       if p == '^':
  122.         state = create_state(states, match=match_start, last=last, name='begin')
  123.       elif p == '$':
  124.         state = create_state(states, match=match_end, last=last, name='end')
  125.       elif p == '.':
  126.         state = create_state(states, match=match_dot, last=last, name='dot')
  127.       elif p == '\\.':
  128.         state = create_state(states, match=partial(match_character, '.'), last=last, name='dot')
  129.       elif p == '\\s':
  130.         state = create_state(states, match=match_space, last=last, name='space')
  131.       elif p == '\\w':
  132.         state = create_state(states, match=match_word, last=last, name='word')
  133.       elif p == '\\W':
  134.         state = create_state(states, match=match_nonword, last=last, name='nonword')
  135.       elif p.isalnum() or p in '_@':
  136.         state = create_state(states, match=partial(match_character, p), last=last, name='char_' + p)
  137.       else:
  138.         assert False
  139.  
  140.       first, last = state, state
  141.       pos += 1
  142.  
  143.   return pos, first, last
  144.  
  145.  
  146. def compile(regexp):
  147.   states = {}
  148.   pos, first, last = compile_or(states, create_state(states, name='root'), regexp, 0)
  149.   assert pos == len(regexp)
  150.   return states, last
  151.  
  152.  
  153. def backtrack(states, last, string, start=None):
  154.   if start is None:
  155.     for i in range(len(string)):
  156.       if backtrack(states, last, string, i):
  157.         return True
  158.     return False
  159.  
  160.   stack = [[0, 0, start]]   # state, pos in next, pos in text
  161.   while stack:
  162.     state = stack[-1][0]
  163.     pos = stack[-1][2]
  164.     #print 'in state', state, states[state]
  165.  
  166.     if state == last:
  167.       print 'Matches: ', string[start:pos]
  168.       for i in xrange(len(states[-1])):
  169.         print 'Group', i + 1, states[-1][i]
  170.       return True
  171.  
  172.     while stack[-1][1] < len(states[state][1]):
  173.       nstate = states[state][1][stack[-1][1]]
  174.       stack[-1][1] += 1
  175.  
  176.       ok, npos = states[nstate][0](string, pos)
  177.       if ok:
  178.         stack.append([nstate, 0, npos])
  179.         break
  180.       else:
  181.         pass
  182.         #print 'not matched', states[nstate][2]
  183.     else:
  184.       stack.pop()
  185.  
  186.   return False
  187.  
  188.  
  189.  
  190. # regexp = '(\\w+)@(\\w+)(\\.com|\\.net)'
  191. # string = 'sam@test.net'
  192. regexp = raw_input()
  193. string = raw_input()
  194.  
  195. states, last = compile(regexp)
  196. backtrack(states, last, string)
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
Top