Advertisement
Guest User

Untitled

a guest
Aug 23rd, 2014
372
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C 8.08 KB | None | 0 0
  1. /*
  2.  * Regular expression implementation.
  3.  * Supports only ( | ) * + ?.  No escapes.
  4.  * Compiles to NFA and then simulates NFA
  5.  * using Thompson's algorithm.
  6.  *
  7.  * See also http://swtch.com/~rsc/regexp/ and
  8.  * Thompson, Ken.  Regular Expression Search Algorithm,
  9.  * Communications of the ACM 11(6) (June 1968), pp. 419-422.
  10.  *
  11.  * Copyright (c) 2007 Russ Cox.
  12.  * Can be distributed under the MIT license, see bottom of file.
  13.  */
  14. #include <stdio.h>
  15. #include <stdlib.h>
  16. #include <string.h>
  17. #include <unistd.h>
  18.  
  19. /*
  20.  * Convert infix regexp re to postfix notation.
  21.  * Insert . as explicit concatenation operator.
  22.  * Cheesy parser, return static buffer.
  23.  */
  24. char*
  25. re2post(char *re)
  26. {
  27.     int nalt, natom;
  28.     static char buf[8000];
  29.     char *dst;
  30.     struct {
  31.         int nalt;
  32.         int natom;
  33.     } paren[100], *p;
  34.    
  35.     p = paren;
  36.     dst = buf;
  37.     nalt = 0;
  38.     natom = 0;
  39.     if(strlen(re) >= sizeof buf/2)
  40.         return NULL;
  41.     for(; *re; re++){
  42.         switch(*re){
  43.         case '(':
  44.             if(natom > 1){
  45.                 --natom;
  46.                 *dst++ = '.';
  47.             }
  48.             if(p >= paren+100)
  49.                 return NULL;
  50.             p->nalt = nalt;
  51.             p->natom = natom;
  52.             p++;
  53.             nalt = 0;
  54.             natom = 0;
  55.             break;
  56.         case '|':
  57.             if(natom == 0)
  58.                 return NULL;
  59.             while(--natom > 0)
  60.                 *dst++ = '.';
  61.             nalt++;
  62.             break;
  63.         case ')':
  64.             if(p == paren)
  65.                 return NULL;
  66.             if(natom == 0)
  67.                 return NULL;
  68.             while(--natom > 0)
  69.                 *dst++ = '.';
  70.             for(; nalt > 0; nalt--)
  71.                 *dst++ = '|';
  72.             --p;
  73.             nalt = p->nalt;
  74.             natom = p->natom;
  75.             natom++;
  76.             break;
  77.         case '*':
  78.         case '+':
  79.         case '?':
  80.             if(natom == 0)
  81.                 return NULL;
  82.             *dst++ = *re;
  83.             break;
  84.         default:
  85.             if(natom > 1){
  86.                 --natom;
  87.                 *dst++ = '.';
  88.             }
  89.             *dst++ = *re;
  90.             natom++;
  91.             break;
  92.         }
  93.     }
  94.     if(p != paren)
  95.         return NULL;
  96.     while(--natom > 0)
  97.         *dst++ = '.';
  98.     for(; nalt > 0; nalt--)
  99.         *dst++ = '|';
  100.     *dst = 0;
  101.     return buf;
  102. }
  103.  
  104. /*
  105.  * Represents an NFA state plus zero or one or two arrows exiting.
  106.  * if c == Match, no arrows out; matching state.
  107.  * If c == Split, unlabeled arrows to out and out1 (if != NULL).
  108.  * If c < 256, labeled arrow with character c to out.
  109.  */
  110. enum
  111. {
  112.     Match = 256,
  113.     Split = 257
  114. };
  115. typedef struct State State;
  116. struct State
  117. {
  118.     int c;
  119.     State *out;
  120.     State *out1;
  121.     int lastlist;
  122. };
  123. State matchstate = { Match };   /* matching state */
  124. int nstate;
  125.  
  126. /* Allocate and initialize State */
  127. State*
  128. state(int c, State *out, State *out1)
  129. {
  130.     State *s;
  131.    
  132.     nstate++;
  133.     s = malloc(sizeof *s);
  134.     s->lastlist = 0;
  135.     s->c = c;
  136.     s->out = out;
  137.     s->out1 = out1;
  138.     return s;
  139. }
  140.  
  141. /*
  142.  * A partially built NFA without the matching state filled in.
  143.  * Frag.start points at the start state.
  144.  * Frag.out is a list of places that need to be set to the
  145.  * next state for this fragment.
  146.  */
  147. typedef struct Frag Frag;
  148. typedef union Ptrlist Ptrlist;
  149. struct Frag
  150. {
  151.     State *start;
  152.     Ptrlist *out;
  153. };
  154.  
  155. /* Initialize Frag struct. */
  156. Frag
  157. frag(State *start, Ptrlist *out)
  158. {
  159.     Frag n = { start, out };
  160.     return n;
  161. }
  162.  
  163. /*
  164.  * Since the out pointers in the list are always
  165.  * uninitialized, we use the pointers themselves
  166.  * as storage for the Ptrlists.
  167.  */
  168. union Ptrlist
  169. {
  170.     Ptrlist *next;
  171.     State *s;
  172. };
  173.  
  174. /* Create singleton list containing just outp. */
  175. Ptrlist*
  176. list1(State **outp)
  177. {
  178.     Ptrlist *l;
  179.    
  180.     l = (Ptrlist*)outp;
  181.     l->next = NULL;
  182.     return l;
  183. }
  184.  
  185. /* Patch the list of states at out to point to start. */
  186. void
  187. patch(Ptrlist *l, State *s)
  188. {
  189.     Ptrlist *next;
  190.    
  191.     for(; l; l=next){
  192.         next = l->next;
  193.         l->s = s;
  194.     }
  195. }
  196.  
  197. /* Join the two lists l1 and l2, returning the combination. */
  198. Ptrlist*
  199. append(Ptrlist *l1, Ptrlist *l2)
  200. {
  201.     Ptrlist *oldl1;
  202.    
  203.     oldl1 = l1;
  204.     while(l1->next)
  205.         l1 = l1->next;
  206.     l1->next = l2;
  207.     return oldl1;
  208. }
  209.  
  210. /*
  211.  * Convert postfix regular expression to NFA.
  212.  * Return start state.
  213.  */
  214. State*
  215. post2nfa(char *postfix)
  216. {
  217.     char *p;
  218.     Frag stack[1000], *stackp, e1, e2, e;
  219.     State *s;
  220.    
  221.     // fprintf(stderr, "postfix: %s\n", postfix);
  222.  
  223.     if(postfix == NULL)
  224.         return NULL;
  225.  
  226.     #define push(s) *stackp++ = s
  227.     #define pop() *--stackp
  228.  
  229.     stackp = stack;
  230.     for(p=postfix; *p; p++){
  231.         switch(*p){
  232.         default:
  233.             s = state(*p, NULL, NULL);
  234.             push(frag(s, list1(&s->out)));
  235.             break;
  236.         case '.':   /* catenate */
  237.             e2 = pop();
  238.             e1 = pop();
  239.             patch(e1.out, e2.start);
  240.             push(frag(e1.start, e2.out));
  241.             break;
  242.         case '|':   /* alternate */
  243.             e2 = pop();
  244.             e1 = pop();
  245.             s = state(Split, e1.start, e2.start);
  246.             push(frag(s, append(e1.out, e2.out)));
  247.             break;
  248.         case '?':   /* zero or one */
  249.             e = pop();
  250.             s = state(Split, e.start, NULL);
  251.             push(frag(s, append(e.out, list1(&s->out1))));
  252.             break;
  253.         case '*':   /* zero or more */
  254.             e = pop();
  255.             s = state(Split, e.start, NULL);
  256.             patch(e.out, s);
  257.             push(frag(s, list1(&s->out1)));
  258.             break;
  259.         case '+':   /* one or more */
  260.             e = pop();
  261.             s = state(Split, e.start, NULL);
  262.             patch(e.out, s);
  263.             push(frag(e.start, list1(&s->out1)));
  264.             break;
  265.         }
  266.     }
  267.  
  268.     e = pop();
  269.     if(stackp != stack)
  270.         return NULL;
  271.  
  272.     patch(e.out, &matchstate);
  273.     return e.start;
  274. #undef pop
  275. #undef push
  276. }
  277.  
  278. typedef struct List List;
  279. struct List
  280. {
  281.     State **s;
  282.     int n;
  283. };
  284. List l1, l2;
  285. static int listid;
  286.  
  287. void addstate(List*, State*);
  288. void step(List*, int, List*);
  289.  
  290. /* Compute initial state list */
  291. List*
  292. startlist(State *start, List *l)
  293. {
  294.     l->n = 0;
  295.     listid++;
  296.     addstate(l, start);
  297.     return l;
  298. }
  299.  
  300. /* Check whether state list contains a match. */
  301. int
  302. ismatch(List *l)
  303. {
  304.     int i;
  305.  
  306.     for(i=0; i<l->n; i++)
  307.         if(l->s[i] == &matchstate)
  308.             return 1;
  309.     return 0;
  310. }
  311.  
  312. /* Add s to l, following unlabeled arrows. */
  313. void
  314. addstate(List *l, State *s)
  315. {
  316.     if(s == NULL || s->lastlist == listid)
  317.         return;
  318.     s->lastlist = listid;
  319.     if(s->c == Split){
  320.         /* follow unlabeled arrows */
  321.         addstate(l, s->out);
  322.         addstate(l, s->out1);
  323.         return;
  324.     }
  325.     l->s[l->n++] = s;
  326. }
  327.  
  328. /*
  329.  * Step the NFA from the states in clist
  330.  * past the character c,
  331.  * to create next NFA state set nlist.
  332.  */
  333. void
  334. step(List *clist, int c, List *nlist)
  335. {
  336.     int i;
  337.     State *s;
  338.  
  339.     listid++;
  340.     nlist->n = 0;
  341.     for(i=0; i<clist->n; i++){
  342.         s = clist->s[i];
  343.         if(s->c == c)
  344.             addstate(nlist, s->out);
  345.     }
  346. }
  347.  
  348. /* Run NFA to determine whether it matches s. */
  349. int
  350. match(State *start, char *s)
  351. {
  352.     int i, c;
  353.     List *clist, *nlist, *t;
  354.  
  355.     clist = startlist(start, &l1);
  356.     nlist = &l2;
  357.     for(; *s; s++){
  358.         c = *s & 0xFF;
  359.         step(clist, c, nlist);
  360.         t = clist; clist = nlist; nlist = t;    /* swap clist, nlist */
  361.     }
  362.     return ismatch(clist);
  363. }
  364.  
  365. int
  366. main(int argc, char **argv)
  367. {
  368.     int i;
  369.     char *post;
  370.     State *start;
  371.  
  372.     if(argc < 3){
  373.         fprintf(stderr, "usage: nfa regexp string...\n");
  374.         return 1;
  375.     }
  376.    
  377.     post = re2post(argv[1]);
  378.     if(post == NULL){
  379.         fprintf(stderr, "bad regexp %s\n", argv[1]);
  380.         return 1;
  381.     }
  382.  
  383.     start = post2nfa(post);
  384.     if(start == NULL){
  385.         fprintf(stderr, "error in post2nfa %s\n", post);
  386.         return 1;
  387.     }
  388.    
  389.     l1.s = malloc(nstate*sizeof l1.s[0]);
  390.     l2.s = malloc(nstate*sizeof l2.s[0]);
  391.     for(i=2; i<argc; i++)
  392.         if(match(start, argv[i]))
  393.             printf("%s\n", argv[i]);
  394.     return 0;
  395. }
  396.  
  397. /*
  398.  * Permission is hereby granted, free of charge, to any person
  399.  * obtaining a copy of this software and associated
  400.  * documentation files (the "Software"), to deal in the
  401.  * Software without restriction, including without limitation
  402.  * the rights to use, copy, modify, merge, publish, distribute,
  403.  * sublicense, and/or sell copies of the Software, and to
  404.  * permit persons to whom the Software is furnished to do so,
  405.  * subject to the following conditions:
  406.  *
  407.  * The above copyright notice and this permission notice shall
  408.  * be included in all copies or substantial portions of the
  409.  * Software.
  410.  *
  411.  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
  412.  * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
  413.  * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
  414.  * PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS
  415.  * OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
  416.  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  417.  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  418.  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  419.  */
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement