Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <stdio.h>
- #include <stdlib.h>
- #include <string.h>
- /**
- * POOL ALLOCATOR
- */
- #define POOL_CAP 1024
- typedef struct mem_pool
- {
- struct mem_pool* next;
- size_t pos;
- char data[];
- } mem_pool;
- /**
- * Creates a new memory pool
- */
- mem_pool*
- new_mem_pool ()
- {
- mem_pool* pool = malloc(sizeof(mem_pool) + POOL_CAP);
- if (pool == NULL)
- {
- fprintf(stderr, "failed to allocate memory pool\n");
- exit(-1);
- }
- pool->next = NULL;
- pool->pos = 0;
- return pool;
- }
- /**
- * Free memory pool (and all sub-pools)
- */
- void
- free_mem_pool (mem_pool* pool)
- {
- mem_pool* next;
- while (pool != NULL)
- {
- next = pool->next;
- free(pool);
- pool = next;
- }
- }
- /**
- * Allocate a block from `pool'
- * Returns `NULL' if memory could not be allocated, or a
- * new pointer to atleast `siz' bytes
- */
- void*
- mem_pool_alloc (mem_pool* pool, size_t siz)
- {
- /* keep aligned to word boundary */
- if (siz & 3)
- siz = 1 + (siz | 3);
- /* impossible to alloc */
- if (siz > POOL_CAP || pool == NULL)
- return NULL;
- while (pool->pos + siz > POOL_CAP)
- {
- if (pool->next == NULL)
- pool->next = new_mem_pool();
- pool = pool->next;
- }
- void* ptr = pool->data + pool->pos;
- pool->pos += siz;
- return ptr;
- }
- #define RE_CHAR 0 // [c]
- #define RE_CHARS 1 // [abc]
- #define RE_WILD 2 // .
- #define RE_AND 3 // xy
- #define RE_OR 4 // x|y
- #define RE_MANY 5 // x*
- #define RE_MORE 6 // x+
- #define RE_MAYBE 7 // x?
- #define RE_TRANS_EPS 0
- #define RE_TRANS_SINGLE 1
- #define RE_TRANS_WILD 2
- #define RE_TRANS_MAP 3
- /**
- * Structure for AST node in the regular expression
- * .kind: What form of expression
- * RE_CHAR <char>
- * RE_CHARS [<chars>]
- * RE_WILD .
- * RE_AND (e[0] e[1])
- * RE_OR (e[0] | e[1])
- * RE_MANY (e[0])*
- * RE_MORE (e[0])+
- * RE_MAYBE (e[0])?
- * .ch which character for RE_CHAR kind
- * .e sub-expressions
- */
- typedef struct rexpr
- {
- int kind;
- union {
- char ch;
- int* char_map;
- };
- struct rexpr* e[];
- } rexpr;
- #define CHAR_MAP_SIZE (256 / sizeof(int))
- #define char_map_idx(c) (((unsigned char) (c)) / sizeof(int))
- #define char_map_mask(c) (1 << ((unsigned char) (c)) % sizeof(int))
- static rexpr
- static_wildcard_expr =
- { .kind = RE_WILD };
- static int char_map_space[] = {
- 13824,0,1,0,0,0,0,0,
- 0,0,0,0,0,0,0,0, };
- static int char_map_digit[] = {
- 0,0,0,1023,0,0,0,0,
- 0,0,0,0,0,0,0,0, };
- static int char_map_alpha[] = {
- 0,0,0,0,65534,2047,65534,2047,
- 0,0,0,0,0,0,0,0, };
- static int char_map_i_space[] = {
- -13825,-1,-2,-1,-1,-1,-1,-1,
- -1,-1,-1,-1,-1,-1,-1,-1, };
- static int char_map_i_digit[] = {
- -1,-1,-1,-1024,-1,-1,-1,-1,
- -1,-1,-1,-1,-1,-1,-1,-1, };
- static int char_map_i_alpha[] = {
- -1,-1,-1,-1,-65535,-2048,-65535,-2048,
- -1,-1,-1,-1,-1,-1,-1,-1, };
- static int single_char_maps[CHAR_MAP_SIZE * 256] = {0,};
- static int* single_char_map (char c)
- {
- int* map = single_char_maps(CHAR_MAP_SIZE * (unsigned char) c);
- map[char_map_idx(c)] = char_map_mask(c);
- return map;
- }
- #if 0
- void generate_mask (const char* name, int invert, const char* chars)
- {
- int mask[256 / 16];
- memset(mask, invert ? -1 : 0, sizeof(mask));
- int i;
- for (i = 0; chars[i]; i++)
- {
- unsigned char c = chars[i];
- if (invert) mask[c / 16] &= ~(1 << (c % 16));
- else mask[c / 16] |= (1 << (c % 16));
- }
- printf("static int %s[] = {", name);
- for (i = 0; i < sizeof(mask) / sizeof(*mask); i++)
- {
- printf("%s%d,",
- (i % 8) ? "" : "\n\t",
- mask[i]);
- }
- printf(" };\n");
- }
- int main (void)
- {
- generate_mask("char_map_space", 0, " \n\r\t\f");
- generate_mask("char_map_digit", 0, "0123456789");
- generate_mask("char_map_alpha", 0, "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ");
- generate_mask("char_map_i_space", 1, " \n\r\t\f");
- generate_mask("char_map_i_digit", 1, "0123456789");
- generate_mask("char_map_i_alpha", 1, "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ");
- return 0;
- }
- #endif
- /* error handling */
- int re_error;
- static char re_error_string[256];
- #define re_errorf(...) \
- re_error = 1, \
- snprintf(re_error_string, sizeof(re_error_string), \
- __VA_ARGS__)
- /* public interface to internal error string */
- const char*
- re_get_error () { return re_error_string; }
- /* memory allocation */
- mem_pool* re_pool;
- #define re_pool_ALLOC_EXPR(sub_exprs) \
- (rexpr*) mem_pool_alloc(re_pool, \
- sizeof(rexpr) + (sub_exprs) * sizeof(rexpr*))
- #define re_pool_ALLOC_CHARS() (char*) mem_pool_alloc(re_pool, CHAR_MAP_SIZE)
- /**
- * RegExp AST PARSER
- */
- static rexpr*
- re_parse_expr (const char* rstr, const char** strp);
- static rexpr*
- re_parse_term (const char* rstr, const char** strp);
- static int
- re_parse_suffix (const char* rstr, const char** strp, rexpr** exprp);
- static int
- re_parse_and_or (const char* rstr, const char** strp, rexpr** exprp);
- /**
- * Parse a full RegExp expression
- * rstr: input string
- * [optional] strp: final string position
- */
- static rexpr*
- re_parse_expr (const char* rstr, const char** strp)
- {
- rexpr* expr = re_parse_term(rstr, &rstr);
- while (re_parse_and_or(rstr, &rstr, &expr))
- {
- if (re_error)
- break;
- }
- if (strp) *strp = rstr;
- return expr;
- }
- /**
- * Parse a simple `term'
- * <char> or `.' or [<range>]
- * followed by `*', `+' or `?'
- */
- static rexpr*
- re_parse_term (const char* rstr, const char** strp)
- {
- char ch;
- rexpr* expr;
- switch (ch = *rstr++)
- {
- case ')':
- case '\0':
- re_errorf("expression missing");
- expr = NULL;
- break;
- case '|':
- case '*':
- case '+':
- case '?':
- re_errorf("invalid '%c' before expression", ch);
- expr = NULL;
- break;
- case '(':
- expr = re_parse_expr(rstr, &rstr);
- if (!re_error && *rstr++ != ')')
- {
- re_errorf("missing closing ')'");
- expr = NULL;
- }
- break;
- case '.':
- expr = &static_wildcard_expr;
- break;
- case '[':
- /* TODO: char map */
- default:
- expr = re_pool_ALLOC_EXPR(0);
- expr->kind = RE_CHAR;
- expr->ch = ch;
- break;
- }
- /* e* e? e+ */
- while (re_parse_suffix(rstr, &rstr, &expr))
- {
- if (re_error)
- break;
- }
- if (strp) *strp = rstr;
- return expr;
- }
- /**
- * Parse a suffix: `*', `+', or `?'
- * Returns 0 if none more to parse
- */
- static int
- re_parse_suffix (const char* rstr, const char** strp, rexpr** exprp)
- {
- rexpr* expr;
- char ch;
- switch (ch = *rstr)
- {
- case '*':
- case '+':
- case '?':
- rstr++;
- expr = re_pool_ALLOC_EXPR(1);
- expr->kind =
- ch == '*' ? RE_MANY :
- ch == '+' ? RE_MORE :
- /* == '?' */ RE_MAYBE;
- expr->e[0] = *exprp;
- break;
- default:
- return 0;
- }
- *exprp = expr;
- if (strp) *strp = rstr;
- return 1;
- }
- /**
- * Parse and/or terms
- * ab and a|b
- * Returns 0 if none more to parse
- */
- static int
- re_parse_and_or (const char* rstr, const char** strp, rexpr** exprp)
- {
- int retval = 1;
- char ch;
- rexpr* expr = NULL, *expr2;
- switch (ch = *rstr)
- {
- case '\0':
- case ')':
- expr = *exprp;
- retval = 0;
- break;
- case '|':
- expr2 = re_parse_expr(rstr + 1, &rstr);
- if (re_error)
- break;
- expr = re_pool_ALLOC_EXPR(2);
- expr->kind = RE_OR;
- expr->e[0] = *exprp;
- expr->e[1] = expr2;
- retval = 0;
- break;
- default:
- expr2 = re_parse_term(rstr, &rstr);
- if (re_error)
- break;
- expr = re_pool_ALLOC_EXPR(2);
- expr->kind = RE_AND;
- expr->e[0] = *exprp;
- expr->e[1] = expr2;
- break;
- }
- *exprp = expr;
- if (strp) *strp = rstr;
- return retval;
- }
- /**
- * State table, contains states and their transitions
- * .list: Array of states
- * .siz/.cap: Size/capacity for resizable array
- */
- typedef struct re_st_table
- {
- int siz, cap;
- struct re_state* list;
- } re_st_table;
- /**
- * A single state, contains list of transitions
- * .trans: Array of transitions
- * .siz/.cap: Size/capacity for resizable array
- */
- typedef struct re_state
- {
- int siz, cap;
- struct re_st_trans* trans;
- } re_state;
- /**
- * A state transition
- * .kind What kind of transition
- * RE_TRANS_EPS epsilon transition (automatic)
- * RE_TRANS_SINGLE single character
- * RE_TRANS_WILD wildcard (any character)
- * RE_TRANS_MAP character map (any of a group a chars)
- * .dest Destination state after the transition
- * .ch Which character (for RE_TRANS_SINGLE)
- * .chars Character map (for RE_TRANS_MAP)
- */
- typedef struct re_st_trans
- {
- int kind;
- int dest;
- union {
- char ch;
- int* char_map;
- };
- } re_st_trans;
- /**
- * Compiled regular expression
- */
- typedef struct re_regexp
- {
- re_st_table tbl;
- int init_state;
- int accept_state;
- } re_regexp;
- /**
- * Create a new state table
- */
- static re_st_table
- new_re_st_table ()
- {
- const int CAP = 16;
- re_state* list = malloc(CAP * sizeof(re_state));
- return (re_st_table) {
- .siz = 0,
- .cap = CAP,
- .list = list,
- };
- }
- /**
- * Free resources from state table (and
- * all sub-states / transitions)
- */
- static void
- re_st_table_free (re_st_table tbl)
- {
- int i, j;
- for (i = 0; i < tbl.siz; i++)
- {
- for (j = 0; j < tbl.list[i].siz; j++)
- {
- if (tbl.list[i].trans[j].kind == RE_TRANS_MAP)
- free(tbl.list[i].trans[j].d.chars);
- }
- free(tbl.list[i].trans);
- }
- free(tbl.list);
- }
- /**
- * Create a new state in table `tbl'
- */
- static int
- new_state (re_st_table* tbl)
- {
- if (tbl->cap <= tbl->siz)
- {
- tbl->cap *= 2;
- tbl->list = realloc(tbl->list, tbl->cap * sizeof(re_state));
- }
- const int CAP = 4;
- re_st_trans* trans = malloc(CAP * sizeof(re_st_trans));
- re_state* state = &tbl->list[tbl->siz];
- state->siz = 0;
- state->cap = CAP;
- state->trans = trans;
- return tbl->siz++;
- }
- /**
- * Create a transition in `tbl' from state `st_id' to `st_dest'
- * with character `ch' (for RE_TRANS_SINGLE)
- */
- static re_st_trans*
- transition (re_st_table* tbl, int st_id, int st_dest, int kind, char ch)
- {
- re_state* state = &tbl->list[st_id];
- if (state->cap <= state->siz)
- {
- state->cap *= 2;
- state->trans = realloc(state->trans, state->cap * sizeof(re_st_trans));
- }
- re_st_trans* tr = &state->trans[state->siz];
- tr->kind = kind;
- tr->dest = st_dest;
- tr->ch = ch;
- state->siz++;
- return tr;
- }
- /**
- * Same as above ( transition(...) ), but without argument `ch'
- */
- #define transition_(tbl,src,dst,k) transition(tbl,src,dst,k,'\0')
- /**
- * Create a character map transition (RE_TRANS_MAP) with
- * map `chars'
- */
- static re_st_trans*
- transition_map (re_st_table* tbl, int st_id, int st_dest, int* char_map)
- {
- re_st_trans* tr = transition_(tbl, st_id, st_dest, RE_TRANS_MAP);
- tr->char_map = char_map;
- return tr;
- }
- /**
- * RegExp COMPILE
- */
- /**
- * Compile an expression into state transitions (into `tbl'),
- * from previous state `st_from'. Returns the new state after
- * the transition(s).
- */
- static int
- compile (re_st_table* tbl, int st_from, rexpr* expr)
- {
- int st_final, st_1, st_2;
- head:
- switch (expr->kind)
- {
- case RE_CHAR:
- st_final = new_state(tbl);
- transition(tbl, st_from, st_final, RE_TRANS_SINGLE, expr->ch);
- return st_final;
- case RE_WILD:
- st_final = new_state(tbl);
- transition_(tbl, st_from, st_final, RE_TRANS_WILD);
- return st_final;
- case RE_AND:
- st_from = compile(tbl, st_from, expr->e[0]);
- expr = expr->e[1];
- /* sorry, EWD */
- /* the semantically consistent call would be
- st_final = compile(tbl, st_from, expr->e[1]);
- but the goto ensures that we don't add any
- stack space (also a for(;;) loop would indent too far) */
- goto head;
- case RE_OR:
- st_1 = new_state(tbl);
- st_2 = new_state(tbl);
- st_final = new_state(tbl);
- transition_(tbl, st_from, st_1, RE_TRANS_EPS);
- transition_(tbl, st_from, st_2, RE_TRANS_EPS);
- st_1 = compile(tbl, st_1, expr->e[0]);
- st_2 = compile(tbl, st_2, expr->e[1]);
- transition_(tbl, st_1, st_final, RE_TRANS_EPS);
- transition_(tbl, st_2, st_final, RE_TRANS_EPS);
- return st_final;
- case RE_MANY:
- st_final = compile(tbl, st_from, expr->e[0]);
- transition_(tbl, st_from, st_final, RE_TRANS_EPS);
- transition_(tbl, st_final, st_from, RE_TRANS_EPS);
- return st_final;
- case RE_MORE:
- st_final = compile(tbl, st_from, expr->e[0]);
- transition_(tbl, st_final, st_from, RE_TRANS_EPS);
- return st_final;
- case RE_MAYBE:
- st_final = compile(tbl, st_from, expr->e[0]);
- transition_(tbl, st_from, st_final, RE_TRANS_EPS);
- return st_final;
- default:
- fprintf(stderr, "invalid expression compile\n");
- exit(-1);
- }
- }
- /**
- * RegExp EXECUTE
- */
- static int
- does_match (const re_st_trans* trans, char ch)
- {
- switch (trans->kind)
- {
- case RE_TRANS_SINGLE:
- return trans->ch == ch;
- case RE_TRANS_WILD:
- return 1;
- case RE_TRANS_MAP:
- return trans->chars[(unsigned char) ch];
- default:
- return 0;
- }
- }
- static int
- follow_matches (const re_st_table* tbl, int* states, int* target, char ch)
- {
- int retval = 0;
- int i, j;
- re_state state;
- for (i = 0; i < tbl->siz; i++)
- {
- if (!states[i])
- continue;
- state = tbl->list[i];
- for (j = 0; j < state.siz; j++)
- {
- if (does_match(&state.trans[j], ch))
- {
- target[state.trans[j].dest] = 1;
- retval = 1;
- }
- }
- }
- return retval;
- }
- static void
- follow_epsilon (const re_st_table* tbl, int* states)
- {
- /* if in state `s1' and there is a transition 's1: eps -> s2',
- then also go in state `s2' */
- int i, j, dest;
- re_state state;
- for (i = 0; i < tbl->siz; i++)
- {
- if (!states[i])
- continue;
- state = tbl->list[i];
- for (j = 0; j < state.siz; j++)
- {
- /* only follow RE_TRANS_EPS transitions */
- if (state.trans[j].kind != RE_TRANS_EPS)
- continue;
- /* backtrack the dest state? */
- dest = state.trans[j].dest;
- if (dest < i && !states[dest])
- i = dest - 1;
- states[dest] = 1;
- }
- }
- }
- /*
- * Finds the longest match to get from `init_state' to
- * `accept_state' in `string'. Returns -1 if no match,
- * or the length of the substring match if a match is found
- */
- static int
- compute_longest_match (const re_st_table* tbl, int init_state, int accept_state, const char* string)
- {
- #define CLEAR(b) memset((b), 0, sizeof(int) * tbl->siz)
- const char* str = string;
- char ch;
- int length = -1;
- /* `flip' is kind of like a "backbuffer" for `states'
- we compute the new set of states into `flip' and then
- swap it with `states'.
- */
- int* states, *flip, *temp;
- states = malloc(sizeof(int) * tbl->siz);
- flip = malloc(sizeof(int) * tbl->siz);
- /* set up initial state */
- CLEAR(flip);
- CLEAR(states);
- states[init_state] = 1;
- follow_epsilon(tbl, states);
- for (;;) {
- /* if accept_state, then it is an acceptable end-of-match */
- if (states[accept_state])
- length = (int) (str - string);
- ch = *str++;
- /* printf("(char %c)", ch);
- * int i;
- * for (i = 0; i < tbl->siz; i++)
- * {
- * if (states[i])
- * printf(" %s%d",
- * (i == accept_state) ? "*" : "",
- * i);
- * }
- * printf("\n");
- */
- /* `follow_matches' returns 0 if the new set of
- states is empty, in which case we should stop
- checking additional characters */
- if (!follow_matches(tbl, states, flip, ch))
- break;
- /* swap */
- temp = states;
- states = flip;
- flip = temp;
- CLEAR(flip);
- follow_epsilon(tbl, states);
- }
- free(states);
- free(flip);
- return length;
- #undef CLEAR
- }
- int
- re_compile (re_regexp* out, const char* regexp)
- {
- int retval = 0;
- /* initialize RE parser */
- re_pool = new_mem_pool();
- re_error = 0;
- *re_error_string = '\0';
- /* parse input regexp */
- rexpr* expr = re_parse_expr(regexp, NULL);
- if (re_error)
- goto cleanup;
- /* compile expr -> states */
- re_st_table tbl = new_re_st_table();
- int init_state = new_state(&tbl);
- int accept_state = compile(&tbl, init_state, expr);
- if (re_error)
- {
- re_st_table_free(tbl);
- goto cleanup;
- }
- /* return as re_regexp object */
- if (out)
- {
- out->tbl = tbl;
- out->init_state = init_state;
- out->accept_state = accept_state;
- }
- else /* no output?? */
- re_st_table_free(tbl);
- /* okay! */
- retval = 1;
- cleanup:
- /* deinitialize */
- free_mem_pool(re_pool);
- return retval;
- }
- /* front-end for interfaces */
- /* (same return value as above) */
- int
- re_match (const re_regexp* regexp, const char* string)
- {
- return compute_longest_match(®exp->tbl,
- regexp->init_state,
- regexp->accept_state,
- string);
- }
- void
- re_free (re_regexp regexp)
- {
- re_st_table_free(regexp.tbl);
- }
- int
- main (int argc, char** argv)
- {
- if (argc < 3)
- {
- printf("usage: re [regexp] [string]\n");
- return 0;
- }
- const char* re_string = argv[1];
- const char* inp_string = argv[2];
- re_regexp regexp;
- if (!re_compile(®exp, re_string))
- {
- fprintf(stderr, "regexp error: %s\n", re_get_error());
- return 1;
- }
- int length = re_match(®exp, inp_string);
- if (length == -1)
- {
- printf("no match.\n");
- }
- else
- {
- char matched[length + 1];
- memcpy(matched, inp_string, length);
- matched[length] = '\0';
- printf("matched string: '%s'\n", matched);
- }
- re_free(regexp);
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement