Advertisement
Guest User

regular expression compiler

a guest
Oct 5th, 2018
229
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 6.50 KB | None | 0 0
  1. # regex matcher
  2.  
  3. import types
  4. import sys
  5.  
  6.  
  7. # define a couple constants for regex operators
  8.  
  9.  
  10. # *,?,+,^,$,|, and concatenation
  11.  
  12. KLEENE = 0
  13. OPTIONAL = 1
  14. PLUS = 2
  15. LANCHOR = 3
  16. RANCHOR = 4
  17. SEQ = 5
  18. OR = 6
  19.  
  20.  
  21. # regex syntax is as expected, except that I was lazy and made | a
  22. # postfix operator like the rest
  23.  
  24. # so (a)(b)|  matches 'a' or 'b'
  25.  
  26. def parse_regex(reg):
  27.     stack = []
  28.  
  29.     operator_map = {
  30.         '*': KLEENE,
  31.         '?': OPTIONAL,
  32.         '+': PLUS,
  33.         '^': LANCHOR,
  34.         '$': RANCHOR,
  35.         '|': OR}
  36.  
  37.     # characters push themselves on the stack
  38.     # operators pop operands, create expression nodes e.g. [SEQ,'a','b']),
  39.     # and push those onto the stack
  40.     for char in reg:
  41.         if char == '$':
  42.             stack.append(RANCHOR)
  43.         elif char == '^':
  44.             stack.append(LANCHOR)
  45.  
  46.         elif char == '|':
  47.             op1 = stack.pop()
  48.             op2 = stack.pop()
  49.             stack.append([operator_map[char], op1, op2])
  50.            
  51.         elif char in operator_map:
  52.             operand = stack.pop()
  53.             stack.append([operator_map[char], operand])
  54.         elif char == ')':
  55.            
  56.             op = stack.pop()
  57.             seq = []
  58.             while op != '(':
  59.                 seq.append(op)
  60.                 op = stack.pop()
  61.             seq.append(SEQ)
  62.             stack.append(seq[::-1])
  63.         else:
  64.             stack.append(char)
  65.  
  66.     # prepend SEQ operator at the end of the process
  67.     return [SEQ] + stack
  68.  
  69.  
  70.  
  71. ### code generation stuff here
  72.  
  73. cntr = 0
  74. def gen_label():
  75.     global cntr
  76.     ret = cntr
  77.     cntr += 1
  78.     return "label_{}".format(ret)
  79.  
  80.    
  81. def compile_regex(expr):
  82.     if type(expr) == types.StringType:
  83.         lbl = gen_label()
  84.         return ("if (cur_char == '{}') add_next_state(&&{}, &&swap_states); \nNEXT;\n{}:\n"
  85.                 .format(expr, lbl, lbl))
  86.    
  87.     elif expr == LANCHOR:
  88.         return "if (!left_anchor) NEXT;\n"
  89.  
  90.     elif expr == RANCHOR:
  91.         return "if (!right_anchor) NEXT;\n"
  92.  
  93.     else:
  94.         operator = expr[0]
  95.        
  96.         if operator == SEQ:
  97.  
  98.             operands = expr[1:]
  99.             code = ""
  100.             for op in operands:
  101.                 code = code + compile_regex(op)
  102.            
  103.             return code
  104.         else:
  105.             operand = expr[1]
  106.             compiled_operand = compile_regex(operand)
  107.  
  108.             if operator == KLEENE:
  109.                 top_lbl  = gen_label()
  110.                 skip_lbl = gen_label()
  111.                 code = "{}:\n".format(top_lbl)
  112.                 code += "add_cur_state(&&{}, &&swap_states);\n".format(skip_lbl)
  113.                 code += compiled_operand
  114.                 code += "goto {};\n".format(top_lbl)
  115.                 code += "{}:\n".format(skip_lbl)
  116.                 return code
  117.  
  118.             elif operator == PLUS:
  119.                 return compile_regex([SEQ, operand, [KLEENE, operand]])
  120.  
  121.             elif operator == OPTIONAL:
  122.                 skip_lbl = gen_label()
  123.                 code = "add_cur_state(&&{}, &&swap_states);\n".format(skip_lbl)
  124.                 code += compiled_operand
  125.                 code += "{}:\n".format(skip_lbl)
  126.                 return code
  127.  
  128.             elif operator == OR:
  129.                 alt_lbl = gen_label()
  130.                 end_lbl = gen_label()
  131.                 alternative_operand = expr[2]
  132.                 compiled_alternative = compile_regex(alternative_operand)
  133.                 code = "add_cur_state(&&{}, &&swap_states);\n".format(alt_lbl)
  134.                 code += compiled_operand
  135.                 code += "goto {};\n".format(end_lbl)
  136.                 code += "{}:\n".format(alt_lbl)
  137.                 code += compiled_alternative
  138.                 code += "{}:\n".format(end_lbl)
  139.                 return code
  140.                
  141.             else:
  142.                 raise Exception("Can't compile {}".format(operator))
  143.            
  144.        
  145.  
  146. #### c support code
  147.  
  148. #### needs labels-as-values support
  149.  
  150. #### generated c code only handles checking against a single string at a time,
  151. #### comparing the compiled in pattern vs the first argument
  152.  
  153. #### it prints matching strings, and otherwise prints nothing
  154.  
  155. #### e.g.
  156.  
  157. #### regex.py "(ab)*$" > test.c
  158. #### gcc test.c -o test
  159. #### > ./test 'a'
  160. #### >
  161. #### > ./test 'ab'
  162. #### > ab
  163. #### > ./test 'abc'
  164. #### >
  165. #### > ./test 'abab'
  166. #### > abab
  167.  
  168.  
  169.  
  170. headers = """
  171. #include <stdio.h>
  172. #include <stdlib.h>
  173. #include <string.h>
  174. """
  175.  
  176. c_start = """
  177. void** cur_states = NULL;
  178.  
  179. int num_cur_states = 0;
  180.  
  181. void** next_states = NULL;
  182. int num_next_states = 0;
  183.  
  184. void add_cur_state(void* label, void* end_label) {
  185.    
  186.  // this loop here is shit but i didn't feel like implementing a bitset for this toy
  187.  for(int i = 0; i < num_cur_states; i++) {
  188.    if(cur_states[i] == label) {
  189.      return;
  190.    }
  191.  }
  192.  cur_states[num_cur_states-1] = label;
  193.  cur_states[num_cur_states++] = end_label;
  194. }
  195.  
  196. void add_next_state(void* label, void* end_label) {
  197.  // same as above
  198.  for(int i = 0; i < num_next_states; i++) {
  199.    if(next_states[i] == label) {
  200.      return;
  201.    }
  202.  }
  203.  next_states[num_next_states++] = label;
  204. }
  205.  
  206. int main(int argc, char** argv) {
  207.  char* str = argv[1];
  208.  
  209.  if(argc != 2) {
  210.    printf("Please provide a string to match against\\n");
  211.    return 0;
  212.  }
  213.  
  214.  int len = strlen(str);
  215.  
  216.  cur_states  = malloc(sizeof(void*) * reglen);
  217.  next_states = malloc(sizeof(void*) * reglen);
  218.  
  219.  cur_states[num_cur_states++] = &&start_label;
  220.    
  221.  int i = 0;
  222.  
  223. top_of_loop:
  224.  if(i >= len+1) {
  225.    return 0;
  226.  }
  227.  
  228.  int cur_state_ptr = 0;
  229.  int cur_char = (i < len) ? str[i] : EOF;
  230.  int left_anchor = (i == 0);
  231.  int right_anchor = (i == len);
  232.  
  233.  cur_states[num_cur_states++] = &&swap_states;
  234.    
  235. #define NEXT goto *(cur_states[cur_state_ptr++]);
  236.  
  237.  NEXT;
  238.  
  239. start_label:
  240.  
  241. """
  242.    
  243. c_end = """
  244.  swap_states:
  245.  if(num_next_states == 0) {
  246.    return 0;
  247.  }
  248.  num_cur_states = num_next_states;
  249.  num_next_states = 0;
  250.  void* tmp = cur_states;
  251.  cur_states = next_states;
  252.  next_states = tmp;
  253.  
  254.  i++;
  255.  
  256.  goto top_of_loop;
  257.  
  258. }
  259. """
  260.  
  261.    
  262. def wrap_c_code(code, regex_size):
  263.     code = "  " + code.replace("\n", "\n  ")
  264.     code += 'printf("%s\\n", str);\n'
  265.     code += "return 0;\n"
  266.     reglen = "int reglen = {};\n".format(regex_size+1)
  267.  
  268.     return headers + reglen + c_start + code + c_end
  269.  
  270.    
  271.  
  272. def gen_c_code(reg):
  273.     return wrap_c_code(compile_regex(parse_regex(reg)),
  274.                        len(reg))
  275.    
  276.  
  277.  
  278. if __name__ == '__main__':
  279.     print gen_c_code(sys.argv[1])
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement