Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # regex matcher
- import types
- import sys
- # define a couple constants for regex operators
- # *,?,+,^,$,|, and concatenation
- KLEENE = 0
- OPTIONAL = 1
- PLUS = 2
- LANCHOR = 3
- RANCHOR = 4
- SEQ = 5
- OR = 6
- # regex syntax is as expected, except that I was lazy and made | a
- # postfix operator like the rest
- # so (a)(b)| matches 'a' or 'b'
- def parse_regex(reg):
- stack = []
- operator_map = {
- '*': KLEENE,
- '?': OPTIONAL,
- '+': PLUS,
- '^': LANCHOR,
- '$': RANCHOR,
- '|': OR}
- # characters push themselves on the stack
- # operators pop operands, create expression nodes e.g. [SEQ,'a','b']),
- # and push those onto the stack
- for char in reg:
- if char == '$':
- stack.append(RANCHOR)
- elif char == '^':
- stack.append(LANCHOR)
- elif char == '|':
- op1 = stack.pop()
- op2 = stack.pop()
- stack.append([operator_map[char], op1, op2])
- elif char in operator_map:
- operand = stack.pop()
- stack.append([operator_map[char], operand])
- elif char == ')':
- op = stack.pop()
- seq = []
- while op != '(':
- seq.append(op)
- op = stack.pop()
- seq.append(SEQ)
- stack.append(seq[::-1])
- else:
- stack.append(char)
- # prepend SEQ operator at the end of the process
- return [SEQ] + stack
- ### code generation stuff here
- cntr = 0
- def gen_label():
- global cntr
- ret = cntr
- cntr += 1
- return "label_{}".format(ret)
- def compile_regex(expr):
- if type(expr) == types.StringType:
- lbl = gen_label()
- return ("if (cur_char == '{}') add_next_state(&&{}, &&swap_states); \nNEXT;\n{}:\n"
- .format(expr, lbl, lbl))
- elif expr == LANCHOR:
- return "if (!left_anchor) NEXT;\n"
- elif expr == RANCHOR:
- return "if (!right_anchor) NEXT;\n"
- else:
- operator = expr[0]
- if operator == SEQ:
- operands = expr[1:]
- code = ""
- for op in operands:
- code = code + compile_regex(op)
- return code
- else:
- operand = expr[1]
- compiled_operand = compile_regex(operand)
- if operator == KLEENE:
- top_lbl = gen_label()
- skip_lbl = gen_label()
- code = "{}:\n".format(top_lbl)
- code += "add_cur_state(&&{}, &&swap_states);\n".format(skip_lbl)
- code += compiled_operand
- code += "goto {};\n".format(top_lbl)
- code += "{}:\n".format(skip_lbl)
- return code
- elif operator == PLUS:
- return compile_regex([SEQ, operand, [KLEENE, operand]])
- elif operator == OPTIONAL:
- skip_lbl = gen_label()
- code = "add_cur_state(&&{}, &&swap_states);\n".format(skip_lbl)
- code += compiled_operand
- code += "{}:\n".format(skip_lbl)
- return code
- elif operator == OR:
- alt_lbl = gen_label()
- end_lbl = gen_label()
- alternative_operand = expr[2]
- compiled_alternative = compile_regex(alternative_operand)
- code = "add_cur_state(&&{}, &&swap_states);\n".format(alt_lbl)
- code += compiled_operand
- code += "goto {};\n".format(end_lbl)
- code += "{}:\n".format(alt_lbl)
- code += compiled_alternative
- code += "{}:\n".format(end_lbl)
- return code
- else:
- raise Exception("Can't compile {}".format(operator))
- #### c support code
- #### needs labels-as-values support
- #### generated c code only handles checking against a single string at a time,
- #### comparing the compiled in pattern vs the first argument
- #### it prints matching strings, and otherwise prints nothing
- #### e.g.
- #### regex.py "(ab)*$" > test.c
- #### gcc test.c -o test
- #### > ./test 'a'
- #### >
- #### > ./test 'ab'
- #### > ab
- #### > ./test 'abc'
- #### >
- #### > ./test 'abab'
- #### > abab
- headers = """
- #include <stdio.h>
- #include <stdlib.h>
- #include <string.h>
- """
- c_start = """
- void** cur_states = NULL;
- int num_cur_states = 0;
- void** next_states = NULL;
- int num_next_states = 0;
- void add_cur_state(void* label, void* end_label) {
- // this loop here is shit but i didn't feel like implementing a bitset for this toy
- for(int i = 0; i < num_cur_states; i++) {
- if(cur_states[i] == label) {
- return;
- }
- }
- cur_states[num_cur_states-1] = label;
- cur_states[num_cur_states++] = end_label;
- }
- void add_next_state(void* label, void* end_label) {
- // same as above
- for(int i = 0; i < num_next_states; i++) {
- if(next_states[i] == label) {
- return;
- }
- }
- next_states[num_next_states++] = label;
- }
- int main(int argc, char** argv) {
- char* str = argv[1];
- if(argc != 2) {
- printf("Please provide a string to match against\\n");
- return 0;
- }
- int len = strlen(str);
- cur_states = malloc(sizeof(void*) * reglen);
- next_states = malloc(sizeof(void*) * reglen);
- cur_states[num_cur_states++] = &&start_label;
- int i = 0;
- top_of_loop:
- if(i >= len+1) {
- return 0;
- }
- int cur_state_ptr = 0;
- int cur_char = (i < len) ? str[i] : EOF;
- int left_anchor = (i == 0);
- int right_anchor = (i == len);
- cur_states[num_cur_states++] = &&swap_states;
- #define NEXT goto *(cur_states[cur_state_ptr++]);
- NEXT;
- start_label:
- """
- c_end = """
- swap_states:
- if(num_next_states == 0) {
- return 0;
- }
- num_cur_states = num_next_states;
- num_next_states = 0;
- void* tmp = cur_states;
- cur_states = next_states;
- next_states = tmp;
- i++;
- goto top_of_loop;
- }
- """
- def wrap_c_code(code, regex_size):
- code = " " + code.replace("\n", "\n ")
- code += 'printf("%s\\n", str);\n'
- code += "return 0;\n"
- reglen = "int reglen = {};\n".format(regex_size+1)
- return headers + reglen + c_start + code + c_end
- def gen_c_code(reg):
- return wrap_c_code(compile_regex(parse_regex(reg)),
- len(reg))
- if __name__ == '__main__':
- print gen_c_code(sys.argv[1])
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement