Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from functools import *
- WORDCHAR = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890_'
- def match_nothing(txt, pos):
- return True, pos
- def match_character(c, txt, pos):
- return pos < len(txt) and txt[pos] == c, pos + 1
- def match_space(txt, pos):
- return pos < len(txt) and txt[pos].isspace(), pos + 1
- def match_word(txt, pos):
- return pos < len(txt) and txt[pos] in WORDCHAR, pos + 1
- def match_nonword(txt, pos):
- return pos < len(txt) and txt[pos] not in WORDCHAR, pos + 1
- def match_dot(txt, pos):
- return pos < len(txt), pos + 1
- def match_start(txt, pos):
- return pos == 0, pos
- def match_end(txt, pos):
- return pos == len(txt), pos
- def create_state(states, match=None, last=None, next=None, name=None):
- if next is None: next = []
- if match is None: match = match_nothing
- state = len(states)
- states[state] = (match, next, name)
- if last is not None:
- states[last][1].append(state)
- return state
- def compile_or(states, last, regexp, pos):
- mfirst = create_state(states, last=last, name='or_first')
- mlast = create_state(states, name='or_last')
- while True:
- pos, first, last = compile_seq(states, mfirst, regexp, pos)
- states[last][1].append(mlast)
- if pos != len(regexp) and regexp[pos] == '|':
- pos += 1
- else:
- assert pos == len(regexp) or regexp[pos] == ')'
- break
- return pos, mfirst, mlast
- def compile_paren(states, last, regexp, pos):
- states.setdefault(-2, []) # stores indexes
- states.setdefault(-1, []) # stores text
- group = len(states[-1])
- states[-2].append(None)
- states[-1].append(None)
- def match_pfirst(txt, pos):
- states[-2][group] = pos
- return True, pos
- def match_plast(txt, pos):
- old = states[-2][group]
- states[-1][group] = txt[old:pos]
- return True, pos
- mfirst = create_state(states, match=match_pfirst, last=last, name='paren_first')
- mlast = create_state(states, match=match_plast, name='paren_last')
- pos, first, last = compile_or(states, mfirst, regexp, pos)
- assert regexp[pos] == ')'
- states[last][1].append(mlast)
- return pos + 1, mfirst, mlast
- def compile_seq(states, last, regexp, pos):
- first = create_state(states, last=last, name='seq')
- last = first
- while pos < len(regexp):
- p = regexp[pos]
- if p == '\\':
- pos += 1
- p += regexp[pos]
- if p in '|)':
- break
- elif p == '(':
- pos, first, last = compile_paren(states, last, regexp, pos + 1)
- elif p in '+*':
- # first -> u ->...-> last -> v -> t
- # v -> first (matches at least once)
- # first -> t (skip on *)
- # u becomes new first
- # first is inserted before u
- u = create_state(states)
- v = create_state(states, next=[first])
- t = create_state(states, last=v)
- states[last][1].append(v)
- states[u] = states[first]
- states[first] = (match_nothing, [[u], [u, t]][p == '*'])
- last = t
- pos += 1
- else: # simple states
- if p == '^':
- state = create_state(states, match=match_start, last=last, name='begin')
- elif p == '$':
- state = create_state(states, match=match_end, last=last, name='end')
- elif p == '.':
- state = create_state(states, match=match_dot, last=last, name='dot')
- elif p == '\\.':
- state = create_state(states, match=partial(match_character, '.'), last=last, name='dot')
- elif p == '\\s':
- state = create_state(states, match=match_space, last=last, name='space')
- elif p == '\\w':
- state = create_state(states, match=match_word, last=last, name='word')
- elif p == '\\W':
- state = create_state(states, match=match_nonword, last=last, name='nonword')
- elif p.isalnum() or p in '_@':
- state = create_state(states, match=partial(match_character, p), last=last, name='char_' + p)
- else:
- assert False
- first, last = state, state
- pos += 1
- return pos, first, last
- def compile(regexp):
- states = {}
- pos, first, last = compile_or(states, create_state(states, name='root'), regexp, 0)
- assert pos == len(regexp)
- return states, last
- def backtrack(states, last, string, start=None):
- if start is None:
- for i in range(len(string)):
- if backtrack(states, last, string, i):
- return True
- return False
- stack = [[0, 0, start]] # state, pos in next, pos in text
- while stack:
- state = stack[-1][0]
- pos = stack[-1][2]
- #print 'in state', state, states[state]
- if state == last:
- print 'Matches: ', string[start:pos]
- for i in xrange(len(states[-1])):
- print 'Group', i + 1, states[-1][i]
- return True
- while stack[-1][1] < len(states[state][1]):
- nstate = states[state][1][stack[-1][1]]
- stack[-1][1] += 1
- ok, npos = states[nstate][0](string, pos)
- if ok:
- stack.append([nstate, 0, npos])
- break
- else:
- pass
- #print 'not matched', states[nstate][2]
- else:
- stack.pop()
- return False
- # regexp = '(\\w+)@(\\w+)(\\.com|\\.net)'
- # string = 'sam@test.net'
- regexp = raw_input()
- string = raw_input()
- states, last = compile(regexp)
- backtrack(states, last, string)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement