nuttx/libs/libc/regex/regcomp.c
guoshichao e1096bd35c libc/regex: add regex implementation
add regex implementation for libc, the implementation are ported from
musl project

Signed-off-by: guoshichao <guoshichao@xiaomi.com>
2023-05-17 10:25:18 +08:00

4037 lines
102 KiB
C

/****************************************************************************
* libs/libc/regex/regcomp.c
*
* regcomp.c - TRE POSIX compatible regex compilation functions.
*
* Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
****************************************************************************/
/****************************************************************************
* Included Files
****************************************************************************/
#include <string.h>
#include <stdlib.h>
#include <regex.h>
#include <limits.h>
#include <stdint.h>
#include <ctype.h>
#include "tre.h"
#include <assert.h>
/* from tre-compile.h
*/
/****************************************************************************
* Public Functions
****************************************************************************/
/* One entry of a position set; tre_ast_node_t keeps two such sets in its
 * `firstpos' and `lastpos' members.  The code that fills and consumes
 * these entries is not in this part of the file, so the field meanings
 * are presumably the same as the like-named tre_literal_t fields --
 * TODO confirm against the position-set code.
 */
typedef struct
{
int position;
int code_min;
int code_max;
int *tags;
int assertions;
tre_ctype_t class;
tre_ctype_t *neg_classes;
int backref;
} tre_pos_and_tags_t;
/* from tre-ast.c and tre-ast.h
*/
/* The different AST node types. */
typedef enum
{
LITERAL, /* `obj' is a tre_literal_t (tre_ast_new_literal). */
CATENATION, /* `obj' is a tre_catenation_t (tre_ast_new_catenation). */
ITERATION, /* `obj' is a tre_iteration_t (tre_ast_new_iter). */
UNION /* `obj' is a tre_union_t (tre_ast_new_union). */
} tre_ast_type_t;
/* Special subtypes of TRE_LITERAL.
*/
/* The values below are stored in `code_min' of a tre_literal_t.  For the
 * non-empty special leaves the callers in this file pass the payload in
 * `code_max': the assertion constant for ASSERTION, the tag number for
 * TAG and the group number for BACKREF.
 */
#define EMPTY -1 /* Empty leaf (denotes empty string). */
#define ASSERTION -2 /* Assertion leaf. */
#define TAG -3 /* Tag leaf. */
#define BACKREF -4 /* Back reference leaf. */
/* Predicates on a pointer `x' to a tre_literal_t.  Every special leaf has
 * a negative `code_min', which is what IS_SPECIAL tests.
 */
#define IS_SPECIAL(x) ((x)->code_min < 0)
#define IS_EMPTY(x) ((x)->code_min == EMPTY)
#define IS_ASSERTION(x) ((x)->code_min == ASSERTION)
#define IS_TAG(x) ((x)->code_min == TAG)
#define IS_BACKREF(x) ((x)->code_min == BACKREF)
/* A generic AST node. All AST nodes consist of this node on the top
* level with `obj' pointing to the actual content.
*/
typedef struct
{
tre_ast_type_t type; /* Type of the node. */
void *obj; /* Pointer to actual node. */
/* Initialised to -1 by tre_ast_new_node(); presumably "not computed
 * yet" -- the code that computes it is outside this part of the file.
 */
int nullable;
/* Submatch number of this node, or -1 when the node is not a submatch
 * (marksub() and tre_add_tags() both test `submatch_id >= 0').
 */
int submatch_id;
/* Number of submatches in this subtree: the constructors add up the
 * children's counts and marksub() increments it for the node itself.
 */
int num_submatches;
/* Written by the first pass of tre_add_tags() and read there to
 * reserve tag numbers for a catenation's left child.
 */
int num_tags;
tre_pos_and_tags_t *firstpos;
tre_pos_and_tags_t *lastpos;
} tre_ast_node_t;
/* A "literal" node. These are created for assertions, back references,
* tags, matching parameter settings, and all expressions that match one
* character.
*/
typedef struct
{
/* For ordinary leaves: the inclusive range [code_min, code_max] of
 * character codes (e.g. `.' is created as 0..TRE_CHAR_MAX).  A negative
 * code_min marks a special leaf, see EMPTY/ASSERTION/TAG/BACKREF.
 */
long code_min;
long code_max;
/* Position number taken from the parse context; the empty, assertion
 * and tag leaves created in this file use -1.
 */
int position;
/* Character class from tre_ctype() for a `[:name:]' bracket term,
 * 0 otherwise (the node is calloc'ed).
 */
tre_ctype_t class;
/* For negated brackets: array of excluded classes terminated by a 0
 * entry (built in parse_bracket()), otherwise NULL.
 */
tre_ctype_t *neg_classes;
} tre_literal_t;
/* A "catenation" node. These are created when two regexps are concatenated.
* If there are more than one subexpressions in sequence, the `left' part
* holds all but the last, and `right' part holds the last subexpression
* (catenation is left associative).
*/
typedef struct
{
tre_ast_node_t *left; /* Everything but the last subexpression. */
tre_ast_node_t *right; /* The last subexpression. */
} tre_catenation_t;
/* An "iteration" node. These are created for the "*", "+", "?", and "{m,n}"
* operators.
*/
typedef struct
{
/* Subexpression to match. */
tre_ast_node_t *arg;
/* Minimum number of consecutive matches. */
int min;
/* Maximum number of consecutive matches.  The parser passes -1 when
 * there is no upper bound (`*', `+' and `{m,}').
 */
int max;
/* If 0, match as many characters as possible, if 1 match as few as
 * possible. Note that this does not always mean the same thing as
 * matching as many/few repetitions as possible.
 */
unsigned int minimal : 1;
} tre_iteration_t;
/* A "union" node. These are created for the "|" operator, and by the
 * parser for bracket expressions and case-insensitive literals, which it
 * expresses as a union of literal leaves.
 */
typedef struct
{
tre_ast_node_t *left; /* First alternative. */
tre_ast_node_t *right; /* Second alternative. */
} tre_union_t;
/* Allocates a generic AST node from `mem' and attaches `obj' to it.
 *
 * `type' is the tre_ast_type_t matching the payload `obj'.  The node is
 * allocated zero-filled with tre_mem_calloc(), then `nullable' and
 * `submatch_id' are set to -1.
 *
 * Returns the new node, or NULL when the allocation failed or `obj' is
 * NULL (which lets callers pass an unchecked payload allocation).
 */

static tre_ast_node_t *tre_ast_new_node(tre_mem_t mem, int type, void *obj)
{
  tre_ast_node_t *fresh;

  /* The node is allocated before `obj' is looked at, as callers hand in
   * the result of their own allocation without checking it first.
   */

  fresh = tre_mem_calloc(mem, sizeof(tre_ast_node_t));
  if (fresh == NULL || obj == NULL)
    {
      return NULL;
    }

  fresh->type        = type;
  fresh->obj         = obj;
  fresh->submatch_id = -1;
  fresh->nullable    = -1;
  return fresh;
}
/* Creates a LITERAL leaf.
 *
 * `code_min' and `code_max' are stored as given: either an inclusive
 * character range or, when `code_min' is negative, one of the special
 * leaf kinds (EMPTY, ASSERTION, TAG, BACKREF) with its payload in
 * `code_max'.  `position' is stored in the literal unchanged.
 *
 * Returns the new node or NULL if either allocation failed.
 */

static tre_ast_node_t *tre_ast_new_literal(tre_mem_t mem, int code_min,
                                           int code_max, int position)
{
  tre_literal_t *payload = tre_mem_calloc(mem, sizeof(tre_literal_t));
  tre_ast_node_t *wrapper = tre_ast_new_node(mem, LITERAL, payload);

  /* tre_ast_new_node() returns NULL for a NULL payload, so a non-NULL
   * wrapper implies that `payload' is valid.
   */

  if (wrapper != NULL)
    {
      payload->position = position;
      payload->code_max = code_max;
      payload->code_min = code_min;
    }

  return wrapper;
}
/* Creates an ITERATION node repeating `arg' between `min' and `max'
 * times; `minimal' is stored in the one-bit `minimal' field.
 *
 * `arg' must not be NULL: its submatch count is copied to the new node.
 *
 * Returns the new node or NULL if either allocation failed.
 */

static tre_ast_node_t *tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg,
                                        int min, int max, int minimal)
{
  tre_iteration_t *rep = tre_mem_calloc(mem, sizeof(tre_iteration_t));
  tre_ast_node_t *wrapper = tre_ast_new_node(mem, ITERATION, rep);

  if (wrapper == NULL)
    {
      return NULL;
    }

  rep->arg     = arg;
  rep->min     = min;
  rep->max     = max;
  rep->minimal = minimal;

  wrapper->num_submatches = arg->num_submatches;
  return wrapper;
}
/* Creates a UNION node for `left' | `right'.
 *
 * When `left' is NULL no node is created and `right' is returned as is,
 * so callers can build a union incrementally starting from NULL.
 *
 * Returns the resulting node, or NULL if an allocation failed or if
 * `right' is NULL while `left' is not.
 */

static tre_ast_node_t *tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left,
                                         tre_ast_node_t *right)
{
  tre_union_t *branches;
  tre_ast_node_t *wrapper;

  if (left == NULL)
    {
      return right;
    }

  branches = tre_mem_calloc(mem, sizeof(tre_union_t));
  wrapper = tre_ast_new_node(mem, UNION, branches);
  if (wrapper == NULL || right == NULL)
    {
      return NULL;
    }

  branches->left = left;
  branches->right = right;
  wrapper->num_submatches = left->num_submatches + right->num_submatches;
  return wrapper;
}
/* Creates a CATENATION node for `left' followed by `right'.
 *
 * When `left' is NULL no node is created and `right' is returned as is,
 * so callers can build a sequence incrementally starting from NULL.
 * `right' must not be NULL when `left' is set: its submatch count is
 * read.
 *
 * Returns the resulting node or NULL if an allocation failed.
 */

static tre_ast_node_t *tre_ast_new_catenation(tre_mem_t mem,
                                              tre_ast_node_t *left,
                                              tre_ast_node_t *right)
{
  tre_catenation_t *pair;
  tre_ast_node_t *wrapper;

  if (left == NULL)
    {
      return right;
    }

  pair = tre_mem_calloc(mem, sizeof(tre_catenation_t));
  wrapper = tre_ast_new_node(mem, CATENATION, pair);
  if (wrapper == NULL)
    {
      return NULL;
    }

  pair->left = left;
  pair->right = right;
  wrapper->num_submatches = left->num_submatches + right->num_submatches;
  return wrapper;
}
/* from tre-stack.c and tre-stack.h
*/
typedef struct tre_stack_rec tre_stack_t;
/* Creates a new stack object. `size' is the initial capacity and
 * `max_size' the maximum capacity, both counted in stack items (the
 * implementation allocates sizeof(union tre_stack_item) * size bytes),
 * and `increment' specifies how many more items will be allocated with
 * realloc() if all space gets used up. Returns the stack object or NULL
 * if out of memory.
 */
static tre_stack_t *tre_stack_new(int size, int max_size, int increment);
/* Frees the stack object. */
static void tre_stack_destroy(tre_stack_t *s);
/* Returns the current number of objects in the stack. */
static int tre_stack_num_objects(tre_stack_t *s);
/* Each tre_stack_push_*(tre_stack_t *s, <type> value) function pushes
 * `value' on top of stack `s'. Returns REG_ESPACE if out of memory.
 * This tries to realloc() more space before failing if maximum size
 * has not yet been reached. Returns REG_OK if successful.
 */
#define declare_pushf(typetag, type) \
static reg_errcode_t tre_stack_push_ ## typetag(tre_stack_t * s, type value)
declare_pushf(voidptr, void *);
declare_pushf(int, int);
/* Each tre_stack_pop_*(tre_stack_t *s) function pops the topmost
 * element off of stack `s' and returns it. The stack must not be
 * empty.
 */
#define declare_popf(typetag, type) \
static type tre_stack_pop_ ## typetag(tre_stack_t * s)
declare_popf(voidptr, void *);
declare_popf(int, int);
/* Just to save some typing.  All three macros push `value' with the
 * tre_stack_push_<typetag>() function and differ in how a failure is
 * reported.
 *
 * STACK_PUSH stores the result in a variable named `status' that must
 * exist in the calling scope and does nothing else.
 */
#define STACK_PUSH(s, typetag, value) \
do \
{ \
status = tre_stack_push_ ## typetag(s, value); \
} \
while (/* CONSTCOND */ 0)
/* STACK_PUSHX also stores the result in the caller's `status', and on
 * failure executes `break'.  It is deliberately not wrapped in
 * do { } while (0) so that the `break' leaves the caller's enclosing
 * loop or switch; it can therefore only be used as a full statement
 * inside one.
 */
#define STACK_PUSHX(s, typetag, value) \
{ \
status = tre_stack_push_ ## typetag(s, value); \
if (status != REG_OK) \
break; \
}
/* STACK_PUSHR returns the error code from the calling function on
 * failure; it uses its own local `_status' and needs no caller variable.
 */
#define STACK_PUSHR(s, typetag, value) \
{ \
reg_errcode_t _status; \
_status = tre_stack_push_ ## typetag(s, value); \
if (_status != REG_OK) \
return _status; \
}
/* One stack slot.  The stack does not record which member was stored;
 * the code that pops must use the same typetag that was pushed.
 */
union tre_stack_item
{
void *voidptr_value;
int int_value;
};
/* Growable stack of tre_stack_item slots. */
struct tre_stack_rec
{
int size; /* Number of slots currently allocated in `stack'. */
int max_size; /* Upper limit for `size'. */
int increment; /* Number of slots added on each growth step. */
int ptr; /* Number of slots in use; index of the next free slot. */
union tre_stack_item *stack; /* Slot array obtained from xmalloc(). */
};
/* Allocates an empty stack with room for `size' items.
 *
 * `max_size' and `increment' are recorded for later growth in
 * tre_stack_push().
 *
 * Returns the stack, or NULL if either allocation failed (nothing is
 * leaked in that case).
 */

static tre_stack_t *tre_stack_new(int size, int max_size, int increment)
{
  tre_stack_t *stk = xmalloc(sizeof(*stk));

  if (stk == NULL)
    {
      return NULL;
    }

  stk->stack = xmalloc(sizeof(*stk->stack) * size);
  if (stk->stack == NULL)
    {
      xfree(stk);
      return NULL;
    }

  stk->ptr       = 0;
  stk->size      = size;
  stk->max_size  = max_size;
  stk->increment = increment;
  return stk;
}
/* Releases the slot array and then the stack object itself.  `s' must be
 * a stack returned by tre_stack_new().
 */

static void tre_stack_destroy(tre_stack_t *s)
{
  union tre_stack_item *slots = s->stack;

  xfree(slots);
  xfree(s);
}
/* Returns how many items are currently stored on stack `s'. */

static int tre_stack_num_objects(tre_stack_t *s)
{
  int in_use = s->ptr;

  return in_use;
}
/* Pushes `value' on stack `s', growing the slot array when it is full.
 *
 * Growth adds `increment' slots, clamped to `max_size'.
 *
 * Returns REG_OK on success, or REG_ESPACE when the stack already is at
 * its maximum size or the reallocation failed; the stack is unchanged in
 * both failure cases.
 */

static reg_errcode_t tre_stack_push(tre_stack_t *s,
                                    union tre_stack_item value)
{
  union tre_stack_item *grown;
  int target;

  /* Fast path: there is a free slot. */

  if (s->ptr < s->size)
    {
      s->stack[s->ptr] = value;
      s->ptr++;
      return REG_OK;
    }

  if (s->size >= s->max_size)
    {
      return REG_ESPACE;
    }

  target = s->size + s->increment;
  if (target > s->max_size)
    {
      target = s->max_size;
    }

  grown = xrealloc(s->stack, sizeof(*grown) * target);
  if (grown == NULL)
    {
      return REG_ESPACE;
    }

  assert(target > s->size);
  s->size = target;
  s->stack = grown;

  /* There is room now, so this takes the fast path above. */

  tre_stack_push(s, value);
  return REG_OK;
}
/* Defines tre_stack_push_<typetag>(): wraps `value' in a tre_stack_item
 * through the member <typetag>_value and forwards to tre_stack_push().
 */
#define define_pushf(typetag, type) \
declare_pushf(typetag, type) { \
union tre_stack_item item; \
item.typetag ## _value = value; \
return tre_stack_push(s, item); \
}
define_pushf(int, int)
define_pushf(voidptr, void *)
/* Defines tre_stack_pop_<typetag>(): decrements the item count and
 * returns the <typetag>_value member of the slot that was on top.  There
 * is no check for an empty stack.
 */
#define define_popf(typetag, type) \
declare_popf(typetag, type) { \
return s->stack[--s->ptr].typetag ## _value; \
}
define_popf(int, int)
define_popf(voidptr, void *)
/* from tre-parse.c and tre-parse.h
*/
/* Parse context. */
/* State shared by tre_parse() and the parse_*() helpers.  The helpers
 * report their result through `n' and `s' and return only an error code.
 */
typedef struct
{
/* Memory allocator. The AST is allocated using this. */
tre_mem_t mem;
/* Stack used for keeping track of regexp syntax. */
tre_stack_t *stack;
/* The parsed node after a parse function returns. */
tre_ast_node_t *n;
/* Position in the regexp pattern after a parse function returns. */
const char *s;
/* The first character of the regexp. */
const char *re;
/* Current submatch ID.  tre_parse() stores the total number of
 * submatch IDs it handed out here when it succeeds.
 */
int submatch_id;
/* Current position (number of literal). */
int position;
/* The highest back reference or -1 if none seen so far. */
int max_backref;
/* Compilation flags. */
int cflags;
} tre_parse_ctx_t;
/* Escape shorthands such as \t, \w and \s, each mapped to the pattern
 * text it stands for.
 */

typedef struct
{
  char c;                /* Character following the backslash. */
  const char *expansion; /* Pattern text the escape is replaced with. */
} tre_macro;

/* Table of supported shorthands; the entry with c == 0 terminates it. */

static const tre_macro tre_macros[] =
{
  { 't', "\t" },
  { 'n', "\n" },
  { 'r', "\r" },
  { 'f', "\f" },
  { 'a', "\a" },
  { 'e', "\033" },
  { 'w', "[[:alnum:]_]" },
  { 'W', "[^[:alnum:]_]" },
  { 's', "[[:space:]]" },
  { 'S', "[^[:space:]]" },
  { 'd', "[[:digit:]]" },
  { 'D', "[^[:digit:]]" },
  { '\0', NULL }
};

/* Looks up the shorthand named by the first character of `s' (the
 * character after the backslash).
 *
 * Returns the expansion text from `tre_macros', or NULL when the
 * character is not a known shorthand; this includes the terminating NUL
 * of the pattern.
 */

static const char *tre_expand_macro(const char *s)
{
  const tre_macro *entry = tre_macros;

  while (entry->c != '\0' && entry->c != *s)
    {
      entry++;
    }

  /* Either the matching entry or the terminator, whose expansion is
   * NULL.
   */

  return entry->expansion;
}
/* qsort() comparator for an array of `tre_literal_t *', ordering the
 * literals by ascending `code_min'.
 *
 * `a' and `b' point to array elements, i.e. to pointers to literals.
 * Returns a negative, zero or positive value as required by qsort().
 *
 * The result is computed from two comparisons rather than from the
 * difference of the `long' code values, so it cannot overflow or be
 * truncated when converted to `int'.
 */

static int tre_compare_lit(const void *a, const void *b)
{
  const tre_literal_t *const *la = a;
  const tre_literal_t *const *lb = b;
  long lhs = la[0]->code_min;
  long rhs = lb[0]->code_min;

  return (lhs > rhs) - (lhs < rhs);
}
/* Growable list of the literals collected for one bracket expression. */
struct literals
{
tre_mem_t mem; /* Allocator the tre_literal_t objects come from. */
tre_literal_t **a; /* Pointer array, grown with xrealloc() and freed by
                    * parse_bracket() with xfree().
                    */
int len; /* Number of entries used in `a'. */
int cap; /* Number of entries allocated in `a'. */
};
/* Appends a new zero-initialised literal to the list `p'.
 *
 * The pointer array is doubled when it is full, up to a limit of 1 << 15
 * entries.  `p->cap' must be greater than zero.
 *
 * Returns the new literal, or 0 if the limit is reached or an allocation
 * failed.  On failure `p' is left consistent: `cap' always matches the
 * allocated array and `len' only counts valid entries.
 */

static tre_literal_t *tre_new_lit(struct literals *p)
{
  tre_literal_t *lit;

  if (p->len >= p->cap)
    {
      tre_literal_t **grown;
      int new_cap;

      if (p->cap >= 1 << 15)
        {
          return 0;
        }

      /* Commit the new capacity only once the array really has grown. */

      new_cap = p->cap * 2;
      grown = xrealloc(p->a, new_cap * sizeof *p->a);
      if (!grown)
        {
          return 0;
        }

      p->a = grown;
      p->cap = new_cap;
    }

  lit = tre_mem_calloc(p->mem, sizeof *lit);
  if (!lit)
    {
      return 0;
    }

  p->a[p->len++] = lit;
  return lit;
}
/* Appends to `ls' the opposite-case counterparts of the code points in
 * [min, max].
 *
 * The range is scanned for runs of cased characters whose counterparts
 * are consecutive code points; each run adds one literal covering the
 * counterpart range, with position -1.  Characters that are neither
 * lower nor upper case add nothing.
 *
 * Returns 0 on success and -1 if tre_new_lit() failed.
 */
static int add_icase_literals(struct literals *ls, int min, int max)
{
tre_literal_t *lit;
int b;
int e;
int c;
for (c = min; c <= max; )
{
/* assumes islower(c) and isupper(c) are exclusive
* and toupper(c)!=c if islower(c).
* multiple opposite case characters are not supported
*/
if (tre_islower(c))
{
/* `b' is the first counterpart of the run.  `c' and `e' advance
 * together for as long as the counterpart of `c' is exactly `e', so
 * `e' ends up one past the last counterpart.
 */
b = e = tre_toupper(c);
for (c++, e++; c <= max; c++, e++)
{
if (tre_toupper(c) != e)
{
break;
}
}
}
else if (tre_isupper(c))
{
/* Same as above with the mapping direction reversed. */
b = e = tre_tolower(c);
for (c++, e++; c <= max; c++, e++)
{
if (tre_tolower(c) != e)
{
break;
}
}
}
else
{
c++;
continue;
}
lit = tre_new_lit(ls);
if (!lit)
{
return -1;
}
lit->code_min = b;
/* `e' is one past the end of the run, see above. */
lit->code_max = e - 1;
lit->position = -1;
}
return 0;
}
/* Maximum number of character classes in a negated bracket expression. */
#define MAX_NEG_CLASSES 64
/* Negation state of the bracket expression being parsed. */
struct neg
{
int negate; /* Non-zero if the bracket starts with `^'. */
int len; /* Number of entries used in `a'. */
tre_ctype_t a[MAX_NEG_CLASSES]; /* Classes named inside a negated
                                 * bracket.
                                 */
};
/* TODO: parse bracket into a set of non-overlapping [lo, hi] ranges */
/* bracket grammar:
* Bracket = '[' List ']' | '[^' List ']'
* List = Term | List Term
* Term = Char | Range | Chclass | Eqclass
* Range = Char '-' Char | Char '-' '-'
* Char = Coll | coll_single
* Meta = ']' | '-'
* Coll = '[.' coll_single '.]' | '[.' coll_multi '.]'
* | '[.' Meta '.]'
* Eqclass = '[=' coll_single '=]' | '[=' coll_multi '=]'
* Chclass = '[:' class ':]'
*
* coll_single is a single char collating element but it can be
* '-' only at the beginning or end of a List and
* ']' only at the beginning of a List and
* '^' anywhere except after the opening '['
*/
/* Parses the terms of a bracket expression up to the closing `]'.
 *
 * `s' points at the first term, after `[' and after `^' if there is one.
 * Single characters, ranges and character classes are appended to `ls';
 * character classes of a negated bracket are collected in `neg' instead.
 *
 * Returns REG_OK with ctx->s set just past the closing `]', or:
 *   REG_EBRACK   - the pattern ends before `]'
 *   REG_BADPAT   - mbtowc() rejected the next character
 *   REG_ERANGE   - misplaced `-' or a range whose end precedes its start
 *   REG_ECOLLATE - `[.' or `[=' (not supported)
 *   REG_ECTYPE   - unknown or unterminated `[:class:]'
 *   REG_ESPACE   - out of memory or too many negated classes
 */
static reg_errcode_t parse_bracket_terms(tre_parse_ctx_t *ctx, const char *s,
struct literals *ls,
struct neg *neg)
{
const char *start = s;
tre_ctype_t class;
int min;
int max;
wchar_t wc;
int len;
for (; ; )
{
class = 0;
len = mbtowc(&wc, s, -1);
if (len <= 0)
{
/* len == 0 means the terminating NUL was reached. */
return *s ? REG_BADPAT : REG_EBRACK;
}
/* A `]' in first position is an ordinary character. */
if (*s == ']' && s != start)
{
ctx->s = s + 1;
return REG_OK;
}
if (*s == '-' && s != start && s[1] != ']' &&
(s[1] != '-' || s[2] == ']'))
{
/* extension: [a-z--@] is accepted as [a-z]|[--@] */
return REG_ERANGE;
}
if (*s == '[' && (s[1] == '.' || s[1] == '='))
{
/* collating symbols and equivalence classes are not supported */
return REG_ECOLLATE;
}
if (*s == '[' && s[1] == ':')
{
char tmp[CHARCLASS_NAME_MAX + 1];
s += 2;
/* Copy the class name up to the next `:' and look it up; `class'
 * stays 0 if no `:' is found within CHARCLASS_NAME_MAX characters.
 */
for (len = 0; len < CHARCLASS_NAME_MAX && s[len]; len++)
{
if (s[len] == ':')
{
memcpy(tmp, s, len);
tmp[len] = 0;
class = tre_ctype(tmp);
break;
}
}
if (!class || s[len + 1] != ']')
{
return REG_ECTYPE;
}
/* A class term spans the whole code range; membership is decided
 * by `class'.
 */
min = 0;
max = TRE_CHAR_MAX;
s += len + 2;
}
else
{
min = max = wc;
s += len;
if (*s == '-' && s[1] != ']')
{
s++;
len = mbtowc(&wc, s, -1);
max = wc;
/* XXX - Should use collation order instead of
* encoding values in character ranges.
*/
if (len <= 0 || min > max)
{
return REG_ERANGE;
}
s += len;
}
}
if (class && neg->negate)
{
if (neg->len >= MAX_NEG_CLASSES)
{
return REG_ESPACE;
}
neg->a[neg->len++] = class;
}
else
{
tre_literal_t *lit = tre_new_lit(ls);
if (!lit)
{
return REG_ESPACE;
}
lit->code_min = min;
lit->code_max = max;
lit->class = class;
lit->position = -1;
/* Add opposite-case codepoints if REG_ICASE is present.
* It seems that POSIX requires that bracket negation
* should happen before case-folding, but most practical
* implementations do it the other way around. Changing
* the order would need efficient representation of
* case-fold ranges and bracket range sets even with
* simple patterns so this is ok for now.
*/
if (ctx->cflags & REG_ICASE && !class)
{
if (add_icase_literals(ls, min, max))
{
return REG_ESPACE;
}
}
}
}
}
/* Parses a bracket expression and builds its AST.
 *
 * `s' points just after the opening `['.  The terms are collected with
 * parse_bracket_terms() and turned into a union of literal leaves that
 * all carry the same position number.  For a negated bracket the
 * collected ranges are replaced by the gaps between them.
 *
 * On return ctx->n holds the resulting node (NULL on failure) and
 * ctx->position has been advanced by one, also on failure; ctx->s is set
 * by parse_bracket_terms() on success.
 *
 * Returns REG_OK, REG_ESPACE or an error from parse_bracket_terms().
 */
static reg_errcode_t parse_bracket(tre_parse_ctx_t *ctx, const char *s)
{
int i;
int max;
int min;
int negmax;
int negmin;
tre_ast_node_t *node = 0, *n;
tre_ctype_t *nc = 0;
tre_literal_t *lit;
struct literals ls;
struct neg neg;
reg_errcode_t err;
ls.mem = ctx->mem;
ls.len = 0;
ls.cap = 32;
/* Temporary pointer array; released at parse_bracket_done. */
ls.a = xmalloc(ls.cap * sizeof *ls.a);
if (!ls.a)
{
return REG_ESPACE;
}
neg.len = 0;
neg.negate = *s == '^';
if (neg.negate)
{
s++;
}
err = parse_bracket_terms(ctx, s, &ls, &neg);
if (err != REG_OK)
{
goto parse_bracket_done;
}
if (neg.negate)
{
/* Sort the array if we need to negate it. */
qsort(ls.a, ls.len, sizeof *ls.a, tre_compare_lit);
/* extra lit for the last negated range */
lit = tre_new_lit(&ls);
if (!lit)
{
err = REG_ESPACE;
goto parse_bracket_done;
}
lit->code_min = TRE_CHAR_MAX + 1;
lit->code_max = TRE_CHAR_MAX + 1;
lit->position = -1;
/* negated classes */
if (neg.len)
{
/* Copy the classes into a 0-terminated array owned by the AST. */
nc = tre_mem_alloc(ctx->mem, (neg.len + 1) * sizeof *neg.a);
if (!nc)
{
err = REG_ESPACE;
goto parse_bracket_done;
}
memcpy(nc, neg.a, neg.len * sizeof *neg.a);
nc[neg.len] = 0;
}
}
/* Build a union of the items in the array, negated if necessary. */
negmax = negmin = 0;
for (i = 0; i < ls.len; i++)
{
lit = ls.a[i];
min = lit->code_min;
max = lit->code_max;
if (neg.negate)
{
/* `negmin' is the lowest code not yet covered by a range seen so
 * far.  Each literal is rewritten to the gap [negmin, min - 1] in
 * front of it; a literal that leaves no gap is dropped.
 */
if (min <= negmin)
{
/* Overlap. */
negmin = MAX(max + 1, negmin);
continue;
}
negmax = min - 1;
lit->code_min = negmin;
lit->code_max = negmax;
negmin = max + 1;
}
lit->position = ctx->position;
lit->neg_classes = nc;
n = tre_ast_new_node(ctx->mem, LITERAL, lit);
/* A NULL `n' makes tre_ast_new_union() return NULL as well, so one
 * check covers both allocations.
 */
node = tre_ast_new_union(ctx->mem, node, n);
if (!node)
{
err = REG_ESPACE;
break;
}
}
parse_bracket_done:
xfree(ls.a);
ctx->position++;
ctx->n = node;
return err;
}
/* Parses the decimal repetition count at the start of `s'.
 *
 * `*n' receives the parsed value, or -1 if `s' does not start with a
 * digit.  Parsing stops early as soon as the value exceeds RE_DUP_MAX,
 * so `*n' cannot overflow; the caller rejects such values.
 *
 * Returns a pointer to the first character that was not consumed.
 */

static const char *parse_dup_count(const char *s, int *n)
{
  *n = -1;

  /* isdigit() is only defined for values representable as unsigned char
   * (and EOF), hence the casts: plain `char' may be negative.
   */

  if (!isdigit((unsigned char)*s))
    {
      return s;
    }

  *n = 0;
  do
    {
      *n = 10 * *n + (*s - '0');
      s++;
    }
  while (isdigit((unsigned char)*s) && *n <= RE_DUP_MAX);

  return s;
}
/* Parses an interval expression and applies it to the node in ctx->n.
 *
 * `s' points just after the opening `{'.  Accepted forms are `m', `m,'
 * and `m,n', followed by `}' in an ERE or `\}' in a BRE.
 *
 * On success ctx->n is replaced by an iteration node around the old
 * ctx->n (or by an empty literal for `{0}' / `{0,0}') and ctx->s is set
 * past the closing brace.
 *
 * Returns REG_OK, REG_BADBR for a malformed interval or REG_ESPACE.
 */
static reg_errcode_t parse_dup(tre_parse_ctx_t *ctx, const char *s)
{
int min;
int max;
s = parse_dup_count(s, &min);
if (*s == ',')
{
/* `max' is -1 here if no number follows the comma. */
s = parse_dup_count(s + 1, &max);
}
else
{
max = min;
}
/* The last two tests consume the closing `\}' or `}' as a side effect;
 * they are only reached when all range checks before them passed.
 */
if ((max < min && max >= 0) ||
max > RE_DUP_MAX ||
min > RE_DUP_MAX ||
min < 0 ||
(!(ctx->cflags & REG_EXTENDED) && *s++ != '\\') ||
*s++ != '}')
{
return REG_BADBR;
}
if (min == 0 && max == 0)
{
/* Zero repetitions: the atom is replaced by an empty leaf. */
ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
}
else
{
ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0);
}
if (!ctx->n)
{
return REG_ESPACE;
}
ctx->s = s;
return REG_OK;
}
/* Converts one hexadecimal digit character to its value.
 *
 * Returns 0..15 for `0'-`9', `a'-`f' and `A'-`F', and -1 for any other
 * value of `c'.
 */

static int hexval(unsigned c)
{
  unsigned digit = c - '0';
  unsigned letter;

  /* Unsigned wrap-around turns everything below '0' into a large value,
   * so one comparison checks both ends of the range.
   */

  if (digit < 10)
    {
      return digit;
    }

  /* Setting bit 5 maps `A'-`F' onto `a'-`f'. */

  letter = (c | 32) - 'a';
  if (letter < 6)
    {
      return letter + 10;
    }

  return -1;
}
/* Marks `node' as submatch number `subid' and stores it in ctx->n.
 *
 * A node can carry only one submatch ID.  If `node' already has one, it
 * is wrapped in a catenation with an empty leaf and the wrapper receives
 * the new ID.
 *
 * Returns REG_OK, or REG_ESPACE if the wrapper could not be allocated
 * (ctx->n is left untouched in that case).
 */

static reg_errcode_t marksub(tre_parse_ctx_t *ctx, tre_ast_node_t *node,
                             int subid)
{
  tre_ast_node_t *target = node;

  if (node->submatch_id >= 0)
    {
      tre_ast_node_t *empty;
      tre_ast_node_t *wrapper;

      empty = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
      if (empty == NULL)
        {
          return REG_ESPACE;
        }

      wrapper = tre_ast_new_catenation(ctx->mem, empty, node);
      if (wrapper == NULL)
        {
          return REG_ESPACE;
        }

      wrapper->num_submatches = node->num_submatches;
      target = wrapper;
    }

  target->submatch_id = subid;
  target->num_submatches++;
  ctx->n = target;
  return REG_OK;
}
/* BRE grammar:
* Regex = Branch | '^' | '$' | '^$' | '^' Branch
* | Branch '$' | '^' Branch '$'
* Branch = Atom | Branch Atom
* Atom = char | quoted_char | '.' | Bracket | Atom Dup
* | '\(' Branch '\)' | back_ref
* Dup = '*' | '\{' Count '\}' | '\{' Count ',\}'
* | '\{' Count ',' Count '\}'
*
* (leading ^ and trailing $ in a sub expr may be an anchor or
* literal as well)
*
* ERE grammar:
* Regex = Branch | Regex '|' Branch
* Branch = Atom | Branch Atom
* Atom = char | quoted_char | '.' | Bracket | Atom Dup
* | '(' Regex ')' | '^' | '$'
* Dup = '*' | '+' | '?' | '{' Count '}' | '{' Count ',}'
* | '{' Count ',' Count '}'
*
* (a*+?, ^*, $+, \X, {, (|a) are unspecified)
*/
/* Parses one atom of the pattern starting at `s'.
 *
 * On success ctx->n holds the node for the atom and ctx->s points to the
 * first character after it.  Literal-like atoms take their position
 * number from ctx->position, which is advanced.
 *
 * Repetition operators and `|' in an ERE, and the end of the pattern,
 * produce an empty leaf without consuming anything; the caller deals
 * with the operator itself.
 *
 * Returns REG_OK, REG_ESPACE, REG_EESCAPE (pattern ends with a
 * backslash), REG_EBRACE (unterminated \x{...}), REG_BADPAT (invalid
 * multibyte character) or an error from parse_bracket().
 */

static reg_errcode_t parse_atom(tre_parse_ctx_t *ctx, const char *s)
{
  int len;
  int ere = ctx->cflags & REG_EXTENDED;
  const char *p;
  tre_ast_node_t *node;
  wchar_t wc;

  switch (*s)
    {
      case '[':
        {
          /* parse_bracket() sets ctx->n and ctx->s itself. */

          return parse_bracket(ctx, s + 1);
        }

      case '\\':
        {
          p = tre_expand_macro(s + 1);
          if (p)
            {
              /* assume \X expansion is a single atom */

              reg_errcode_t err = parse_atom(ctx, p);

              /* The recursive call left ctx->s inside the expansion
               * text; point it back into the pattern, after `\X'.
               */

              ctx->s = s + 2;
              return err;
            }

          /* extensions: \b, \B, \<, \>, \xHH \x{HHHH} */

          switch (*++s)
            {
              case 0:
                {
                  return REG_EESCAPE;
                }

              case 'b':
                {
                  node = tre_ast_new_literal(ctx->mem, ASSERTION,
                                             ASSERT_AT_WB, -1);
                }
                break;

              case 'B':
                {
                  node = tre_ast_new_literal(ctx->mem, ASSERTION,
                                             ASSERT_AT_WB_NEG, -1);
                }
                break;

              case '<':
                {
                  node = tre_ast_new_literal(ctx->mem, ASSERTION,
                                             ASSERT_AT_BOW, -1);
                }
                break;

              case '>':
                {
                  node = tre_ast_new_literal(ctx->mem, ASSERTION,
                                             ASSERT_AT_EOW, -1);
                }
                break;

              case 'x':
                {
                  int i;
                  int v = 0;
                  int c;

                  /* \xHH takes up to 2 hex digits, \x{H...} up to 8. */

                  s++;
                  len = 2;
                  if (*s == '{')
                    {
                      len = 8;
                      s++;
                    }

                  for (i = 0; i < len && v < 0x110000; i++)
                    {
                      c = hexval(s[i]);
                      if (c < 0)
                        {
                          break;
                        }

                      v = 16 * v + c;
                    }

                  s += i;
                  if (len == 8)
                    {
                      if (*s != '}')
                        {
                          return REG_EBRACE;
                        }

                      s++;
                    }

                  node = tre_ast_new_literal(ctx->mem, v, v, ctx->position);
                  ctx->position++;

                  /* Compensate for the common s++ after this switch. */

                  s--;
                }
                break;

              default:
                {
                  /* The cast keeps isdigit() defined when plain `char'
                   * is signed and the byte has its high bit set.
                   */

                  if (isdigit((unsigned char)*s))
                    {
                      /* back reference */

                      int val = *s - '0';

                      node = tre_ast_new_literal(ctx->mem, BACKREF, val,
                                                 ctx->position);
                      ctx->max_backref = MAX(val, ctx->max_backref);
                    }
                  else
                    {
                      /* extension: accept unknown escaped char as a
                       * literal.  It goes through the regular literal
                       * path so that a multibyte character is decoded
                       * as a whole; using the raw `char' value would
                       * yield a negative code (mistaken for a special
                       * leaf) or only the first byte of the character.
                       */

                      goto parse_literal;
                    }

                  ctx->position++;
                }
            }

          s++;
        }
        break;

      case '.':
        {
          if (ctx->cflags & REG_NEWLINE)
            {
              /* Any character except newline: two ranges around '\n',
               * each with a position number of its own.
               */

              tre_ast_node_t *tmp1;
              tre_ast_node_t *tmp2;

              tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n' - 1,
                                         ctx->position++);
              tmp2 = tre_ast_new_literal(ctx->mem, '\n' + 1, TRE_CHAR_MAX,
                                         ctx->position++);
              if (tmp1 && tmp2)
                {
                  node = tre_ast_new_union(ctx->mem, tmp1, tmp2);
                }
              else
                {
                  node = 0;
                }
            }
          else
            {
              node = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX,
                                         ctx->position++);
            }

          s++;
        }
        break;

      case '^':
        {
          /* '^' has a special meaning everywhere in EREs, and at the
           * beginning of a BRE.
           */

          if (!ere && s != ctx->re)
            {
              goto parse_literal;
            }

          node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOL,
                                     -1);
          s++;
        }
        break;

      case '$':
        {
          /* '$' is special everywhere in EREs, and at the end of the
           * pattern in BREs.
           */

          if (!ere && s[1])
            {
              goto parse_literal;
            }

          node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOL,
                                     -1);
          s++;
        }
        break;

      case '*':
      case '|':
      case '{':
      case '+':
      case '?':
        {
          /* Ordinary characters in a BRE at this point. */

          if (!ere)
            {
              goto parse_literal;
            }
        }

        /* fallthrough: in an ERE the operator applies to an empty atom */

      case 0:
        {
          /* Nothing is consumed here. */

          node = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
        }
        break;

      default:
        {
parse_literal:
          len = mbtowc(&wc, s, -1);
          if (len < 0)
            {
              return REG_BADPAT;
            }

          if (ctx->cflags & REG_ICASE &&
              (tre_isupper(wc) || tre_islower(wc)))
            {
              tre_ast_node_t *tmp1;
              tre_ast_node_t *tmp2;

              /* multiple opposite case characters are not supported */

              tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(wc),
                                         tre_toupper(wc), ctx->position);
              tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(wc),
                                         tre_tolower(wc), ctx->position);
              if (tmp1 && tmp2)
                {
                  node = tre_ast_new_union(ctx->mem, tmp1, tmp2);
                }
              else
                {
                  node = 0;
                }
            }
          else
            {
              node = tre_ast_new_literal(ctx->mem, wc, wc, ctx->position);
            }

          ctx->position++;
          s += len;
        }
        break;
    }

  if (!node)
    {
      return REG_ESPACE;
    }

  ctx->n = node;
  ctx->s = s;
  return REG_OK;
}
/* Push pointer `v' on stack `s'.  The result is stored in `err' and, if
 * it is not REG_OK, returned from the calling function.
 */
#define PUSHPTR(err, s, v) do { \
if ((err = tre_stack_push_voidptr(s, v)) != REG_OK) \
return err; \
} while (0)
/* Same as PUSHPTR for an int value. */
#define PUSHINT(err, s, v) do { \
if ((err = tre_stack_push_int(s, v)) != REG_OK) \
return err; \
} while (0)
/* Parses the whole pattern ctx->re into an AST.
 *
 * The parser is iterative.  When a group is opened, the union and branch
 * built so far and the submatch ID of the new group are pushed on
 * ctx->stack; they are popped again when the group is closed.  The
 * whole pattern is treated as group 0.
 *
 * On success ctx->n holds the root node and ctx->submatch_id the number
 * of submatch IDs handed out.
 *
 * Returns REG_OK, REG_EPAREN for unbalanced parentheses, REG_ESPACE if
 * out of memory, or an error from parse_atom() / parse_dup().
 */

static reg_errcode_t tre_parse(tre_parse_ctx_t *ctx)
{
  tre_ast_node_t *nbranch = 0;  /* Catenation of the current branch. */
  tre_ast_node_t *nunion = 0;   /* Union of the finished branches. */
  int ere = ctx->cflags & REG_EXTENDED;
  const char *s = ctx->re;
  int subid = 0;
  int depth = 0;
  reg_errcode_t err;
  tre_stack_t *stack = ctx->stack;

  PUSHINT(err, stack, subid++);
  for (; ; )
    {
      if ((!ere && *s == '\\' && s[1] == '(') || (ere && *s == '('))
        {
          /* Group start: save the enclosing state, begin a new one. */

          PUSHPTR(err, stack, nunion);
          PUSHPTR(err, stack, nbranch);
          PUSHINT(err, stack, subid++);
          s++;
          if (!ere)
            {
              s++;
            }

          depth++;
          nbranch = nunion = 0;
          continue;
        }

      if ((!ere && *s == '\\' && s[1] == ')') ||
          (ere && *s == ')' && depth))
        {
          /* Group end right after `(' or `|': the branch is empty. */

          ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
          if (!ctx->n)
            {
              return REG_ESPACE;
            }
        }
      else
        {
          err = parse_atom(ctx, s);
          if (err != REG_OK)
            {
              return err;
            }

          s = ctx->s;
        }

parse_iter:

      /* extension: repetitions are accepted after an empty node
       * eg. (+), ^*, a$?, a|{2}
       */

      switch (*s)
        {
          case '+':
          case '?':
            {
              if (!ere)
                {
                  break;
                }
            }

            /* fallthrough */

          case '*':
            {
              int min = 0;
              int max = -1;

              if (*s == '+')
                {
                  min = 1;
                }

              if (*s == '?')
                {
                  max = 1;
                }

              s++;
              ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0);
              if (!ctx->n)
                {
                  return REG_ESPACE;
                }

              /* extension: multiple consecutive *+?{,} is unspecified,
               * but (a+)+ has to be supported so accepting a++ makes
               * sense, note however that the RE_DUP_MAX limit can be
               * circumvented: (a{255}){255} uses a lot of memory..
               */

              goto parse_iter;
            }

          case '\\':
            {
              /* `\{' starts an interval in a BRE only. */

              if (ere || s[1] != '{')
                {
                  break;
                }

              s++;
              goto parse_brace;
            }

          case '{':
            if (!ere)
              {
                break;
              }

parse_brace:
            err = parse_dup(ctx, s + 1);
            if (err != REG_OK)
              {
                return err;
              }

            s = ctx->s;
            goto parse_iter;
        }

      /* ctx->n is not NULL here, so a NULL result can only mean that the
       * catenation node could not be allocated.
       */

      nbranch = tre_ast_new_catenation(ctx->mem, nbranch, ctx->n);
      if (!nbranch)
        {
          return REG_ESPACE;
        }

      if ((ere && *s == '|') || (ere && *s == ')' && depth) ||
          (!ere && *s == '\\' && s[1] == ')') || !*s)
        {
          /* extension: empty branch is unspecified (), (|a), (a|)
           * here they are not rejected but match on empty string
           */

          int c = *s;

          /* Same reasoning as above: `nbranch' is not NULL, so NULL
           * means the union node could not be allocated.  Without this
           * check marksub() would dereference NULL.
           */

          nunion = tre_ast_new_union(ctx->mem, nunion, nbranch);
          if (!nunion)
            {
              return REG_ESPACE;
            }

          nbranch = 0;
          if (c != '|')
            {
              /* End of a group or of the pattern. */

              if (c == '\\')
                {
                  if (!depth)
                    {
                      return REG_EPAREN;
                    }

                  s += 2;
                }
              else if (c == ')')
                {
                  s++;
                }

              depth--;
              err = marksub(ctx, nunion, tre_stack_pop_int(stack));
              if (err != REG_OK)
                {
                  return err;
                }

              if (!c && depth < 0)
                {
                  /* End of pattern with all groups closed. */

                  ctx->submatch_id = subid;
                  return REG_OK;
                }

              if (!c || depth < 0)
                {
                  return REG_EPAREN;
                }

              /* Resume the enclosing group; the finished group is in
               * ctx->n (set by marksub) and may be followed by a
               * repetition operator.
               */

              nbranch = tre_stack_pop_voidptr(stack);
              nunion = tre_stack_pop_voidptr(stack);
              goto parse_iter;
            }

          s++;
        }
    }
}
/* from tre-compile.c
*/
/* TODO:
* - Fix tre_ast_to_tnfa() to recurse using a stack instead of recursive
* function calls.
*/
/* Algorithms to setup tags so that submatch addressing can be done. */
/* Inserts a catenation node to the root of the tree given in `node'.
* As the left child a new tag with number `tag_id' to `node' is added,
* and the right child is the old root.
*/
/* Puts a tag leaf with number `tag_id' in front of the tree in `node'.
 *
 * `node' is converted in place into a CATENATION.  Its left child is the
 * new tag leaf; its right child is a new node that takes over the old
 * `obj' and `type'.  The other members of `node' are not modified.
 *
 * Returns REG_OK or REG_ESPACE if an allocation failed (`node' is left
 * unchanged in that case).
 */

static reg_errcode_t tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node,
                                      int tag_id)
{
  tre_catenation_t *c;

  c = tre_mem_alloc(mem, sizeof(*c));
  if (c == NULL)
    {
      return REG_ESPACE;
    }

  c->left = tre_ast_new_literal(mem, TAG, tag_id, -1);
  if (c->left == NULL)
    {
      return REG_ESPACE;
    }

  c->right = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
  if (c->right == NULL)
    {
      return REG_ESPACE;
    }

  /* The node comes from tre_mem_alloc(), which presumably does not clear
   * the memory (its definition is not in this file), so every member is
   * set explicitly -- including num_submatches, which used to be left
   * uninitialised.
   */

  c->right->obj = node->obj;
  c->right->type = node->type;
  c->right->nullable = -1;
  c->right->submatch_id = -1;
  c->right->firstpos = NULL;
  c->right->lastpos = NULL;
  c->right->num_tags = 0;
  c->right->num_submatches = 0;
  node->obj = c;
  node->type = CATENATION;
  return REG_OK;
}
/* Inserts a catenation node to the root of the tree given in `node'.
* As the right child a new tag with number `tag_id' to `node' is added,
* and the left child is the old root.
*/
/* Puts a tag leaf with number `tag_id' behind the tree in `node'.
 *
 * `node' is converted in place into a CATENATION.  Its right child is
 * the new tag leaf; its left child is a new node that takes over the old
 * `obj' and `type'.  The other members of `node' are not modified.
 *
 * Returns REG_OK or REG_ESPACE if an allocation failed (`node' is left
 * unchanged in that case).
 */

static reg_errcode_t tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node,
                                       int tag_id)
{
  tre_catenation_t *c;

  c = tre_mem_alloc(mem, sizeof(*c));
  if (c == NULL)
    {
      return REG_ESPACE;
    }

  c->right = tre_ast_new_literal(mem, TAG, tag_id, -1);
  if (c->right == NULL)
    {
      return REG_ESPACE;
    }

  c->left = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
  if (c->left == NULL)
    {
      return REG_ESPACE;
    }

  /* The node comes from tre_mem_alloc(), which presumably does not clear
   * the memory (its definition is not in this file), so every member is
   * set explicitly -- including num_submatches, which used to be left
   * uninitialised.
   */

  c->left->obj = node->obj;
  c->left->type = node->type;
  c->left->nullable = -1;
  c->left->submatch_id = -1;
  c->left->firstpos = NULL;
  c->left->lastpos = NULL;
  c->left->num_tags = 0;
  c->left->num_submatches = 0;
  node->obj = c;
  node->type = CATENATION;
  return REG_OK;
}
/* Work items of the explicit stack used by tre_add_tags() in place of
 * recursion.  Each symbol is pushed last, on top of its arguments, and
 * popped first.
 */
typedef enum
{
ADDTAGS_RECURSE, /* Process the node pushed below this symbol. */
ADDTAGS_AFTER_ITERATION, /* Pushed before an iteration's argument. */
ADDTAGS_AFTER_UNION_LEFT,
ADDTAGS_AFTER_UNION_RIGHT,
ADDTAGS_AFTER_CAT_LEFT, /* Pushed before a catenation's left child. */
ADDTAGS_AFTER_CAT_RIGHT, /* Pushed before a catenation's right child. */
ADDTAGS_SET_SUBMATCH_END /* Add the end of the submatch whose ID is
                          * pushed below this symbol to the regset.
                          */
} tre_addtags_symbol_t;
/* Element type of the `saved_states' array in tre_add_tags(), which
 * initialises `tag' to -1 in every element.  The code that uses the
 * saved values is outside this part of the file; presumably a saved
 * copy of the `tag' / `next_tag' counters -- TODO confirm.
 */
typedef struct
{
int tag;
int next_tag;
} tre_tag_states_t;
/* Go through `regset' and set submatch data for submatches that are
* using this tag.
*/
/* Assigns `tag' to every submatch boundary listed in `regset' and then
 * empties the set.
 *
 * `regset' is terminated by a negative entry.  An entry 2 * id stands
 * for the start of submatch `id' and 2 * id + 1 for its end; the tag is
 * stored in tnfa->submatch_data[id].so_tag or .eo_tag accordingly.
 */

static void tre_purge_regset(int *regset, tre_tnfa_t *tnfa, int tag)
{
  const int *entry;

  for (entry = regset; *entry >= 0; entry++)
    {
      int id = *entry / 2;

      if (*entry % 2 == 0)
        {
          tnfa->submatch_data[id].so_tag = tag;
        }
      else
        {
          tnfa->submatch_data[id].eo_tag = tag;
        }
    }

  regset[0] = -1;
}
/* Adds tags to appropriate locations in the parse tree in `tree', so that
* subexpressions marked for submatch addressing can be traced.
*/
static reg_errcode_t tre_add_tags(tre_mem_t mem, tre_stack_t *stack,
tre_ast_node_t *tree, tre_tnfa_t *tnfa)
{
reg_errcode_t status = REG_OK;
tre_addtags_symbol_t symbol;
tre_ast_node_t *node = tree; /* Tree node we are currently looking
* at. */
int bottom = tre_stack_num_objects(stack);
/* True for first pass (counting number of needed tags) */
int first_pass = (mem == NULL || tnfa == NULL);
int *regset;
int *orig_regset;
/* num_tags: Total number of tags.
* num_minimals: Number of special minimal tags.
* tag: The tag that is to be added next.
* next_tag: Next tag to use after this one.
* parents: Stack of submatches the current submatch is contained in.
* minimal_tag: Tag that marks the beginning of a minimal match.
*/
int num_tags = 0;
int num_minimals = 0;
int tag = 0;
int next_tag = 1;
int *parents;
int minimal_tag = -1;
tre_tag_states_t *saved_states;
tre_tag_direction_t direction = TRE_TAG_MINIMIZE;
if (!first_pass)
{
tnfa->end_tag = 0;
tnfa->minimal_tags[0] = -1;
}
regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches + 1) * 2));
if (regset == NULL)
{
return REG_ESPACE;
}
regset[0] = -1;
orig_regset = regset;
parents = xmalloc(sizeof(*parents) * (tnfa->num_submatches + 1));
if (parents == NULL)
{
xfree(regset);
return REG_ESPACE;
}
parents[0] = -1;
saved_states = xmalloc(sizeof(*saved_states) * (tnfa->num_submatches + 1));
if (saved_states == NULL)
{
xfree(regset);
xfree(parents);
return REG_ESPACE;
}
else
{
unsigned int i;
for (i = 0; i <= tnfa->num_submatches; i++)
{
saved_states[i].tag = -1;
}
}
STACK_PUSH(stack, voidptr, node);
STACK_PUSH(stack, int, ADDTAGS_RECURSE);
while (tre_stack_num_objects(stack) > bottom)
{
if (status != REG_OK)
{
break;
}
symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack);
switch (symbol)
{
case ADDTAGS_SET_SUBMATCH_END:
{
int id = tre_stack_pop_int(stack);
int i;
/* Add end of this submatch to regset. */
for (i = 0; regset[i] >= 0; i++)
{
}
regset[i] = id * 2 + 1;
regset[i + 1] = -1;
/* Pop this submatch from the parents stack. */
for (i = 0; parents[i] >= 0; i++)
{
}
parents[i - 1] = -1;
break;
}
case ADDTAGS_RECURSE:
{
node = tre_stack_pop_voidptr(stack);
if (node->submatch_id >= 0)
{
int id = node->submatch_id;
int i;
/* Add start of this submatch to regset. */
for (i = 0; regset[i] >= 0; i++)
{
}
regset[i] = id * 2;
regset[i + 1] = -1;
if (!first_pass)
{
for (i = 0; parents[i] >= 0; i++)
{
}
tnfa->submatch_data[id].parents = NULL;
if (i > 0)
{
int *p = xmalloc(sizeof(*p) * (i + 1));
if (p == NULL)
{
status = REG_ESPACE;
break;
}
assert(tnfa->submatch_data[id].parents == NULL);
tnfa->submatch_data[id].parents = p;
for (i = 0; parents[i] >= 0; i++)
{
p[i] = parents[i];
}
p[i] = -1;
}
}
/* Add end of this submatch to regset after processing this
* node.
*/
STACK_PUSHX(stack, int, node->submatch_id);
STACK_PUSHX(stack, int, ADDTAGS_SET_SUBMATCH_END);
}
switch (node->type)
{
case LITERAL:
{
tre_literal_t *lit = node->obj;
if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
{
int i;
if (regset[0] >= 0)
{
/* Regset is not empty, so add a tag before the
* literal or backref.
*/
if (!first_pass)
{
status = tre_add_tag_left(mem,
node,
tag);
tnfa->tag_directions[tag] = direction;
if (minimal_tag >= 0)
{
for (i = 0; tnfa->minimal_tags[i] >= 0; i++)
{
}
tnfa->minimal_tags[i] = tag;
tnfa->minimal_tags[i + 1] = minimal_tag;
tnfa->minimal_tags[i + 2] = -1;
minimal_tag = -1;
num_minimals++;
}
tre_purge_regset(regset, tnfa, tag);
}
else
{
node->num_tags = 1;
}
regset[0] = -1;
tag = next_tag;
num_tags++;
next_tag++;
}
}
else
{
assert(!IS_TAG(lit));
}
break;
}
case CATENATION:
{
tre_catenation_t *cat = node->obj;
tre_ast_node_t *left = cat->left;
tre_ast_node_t *right = cat->right;
int reserved_tag = -1;
/* After processing right child. */
STACK_PUSHX(stack, voidptr, node);
STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_RIGHT);
/* Process right child. */
STACK_PUSHX(stack, voidptr, right);
STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
/* After processing left child. */
STACK_PUSHX(stack, int, next_tag + left->num_tags);
if (left->num_tags > 0 && right->num_tags > 0)
{
/* Reserve the next tag to the right child. */
reserved_tag = next_tag;
next_tag++;
}
STACK_PUSHX(stack, int, reserved_tag);
STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_LEFT);
/* Process left child. */
STACK_PUSHX(stack, voidptr, left);
STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
}
break;
case ITERATION:
{
tre_iteration_t *iter = node->obj;
if (first_pass)
{
STACK_PUSHX(stack, int, regset[0] >= 0 || iter->minimal);
}
else
{
STACK_PUSHX(stack, int, tag);
STACK_PUSHX(stack, int, iter->minimal);
}
STACK_PUSHX(stack, voidptr, node);
STACK_PUSHX(stack, int, ADDTAGS_AFTER_ITERATION);
STACK_PUSHX(stack, voidptr, iter->arg);
STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
/* Regset is not empty, so add a tag here. */
if (regset[0] >= 0 || iter->minimal)
{
if (!first_pass)
{
int i;
status = tre_add_tag_left(mem, node, tag);
if (iter->minimal)
{
tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;
}
else
{
tnfa->tag_directions[tag] = direction;
}
if (minimal_tag >= 0)
{
for (i = 0; tnfa->minimal_tags[i] >= 0; i++)
{
}
tnfa->minimal_tags[i] = tag;
tnfa->minimal_tags[i + 1] = minimal_tag;
tnfa->minimal_tags[i + 2] = -1;
minimal_tag = -1;
num_minimals++;
}
tre_purge_regset(regset, tnfa, tag);
}
regset[0] = -1;
tag = next_tag;
num_tags++;
next_tag++;
}
direction = TRE_TAG_MINIMIZE;
}
break;
case UNION:
{
tre_union_t *uni = node->obj;
tre_ast_node_t *left = uni->left;
tre_ast_node_t *right = uni->right;
int left_tag;
int right_tag;
if (regset[0] >= 0)
{
left_tag = next_tag;
right_tag = next_tag + 1;
}
else
{
left_tag = tag;
right_tag = next_tag;
}
/* After processing right child. */
STACK_PUSHX(stack, int, right_tag);
STACK_PUSHX(stack, int, left_tag);
STACK_PUSHX(stack, voidptr, regset);
STACK_PUSHX(stack, int, regset[0] >= 0);
STACK_PUSHX(stack, voidptr, node);
STACK_PUSHX(stack, voidptr, right);
STACK_PUSHX(stack, voidptr, left);
STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_RIGHT);
/* Process right child. */
STACK_PUSHX(stack, voidptr, right);
STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
/* After processing left child. */
STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_LEFT);
/* Process left child. */
STACK_PUSHX(stack, voidptr, left);
STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
/* Regset is not empty, so add a tag here. */
if (regset[0] >= 0)
{
if (!first_pass)
{
int i;
status = tre_add_tag_left(mem, node,
tag);
tnfa->tag_directions[tag] = direction;
if (minimal_tag >= 0)
{
for (i = 0; tnfa->minimal_tags[i] >= 0; i++)
{
}
tnfa->minimal_tags[i] = tag;
tnfa->minimal_tags[i + 1] = minimal_tag;
tnfa->minimal_tags[i + 2] = -1;
minimal_tag = -1;
num_minimals++;
}
tre_purge_regset(regset, tnfa, tag);
}
regset[0] = -1;
tag = next_tag;
num_tags++;
next_tag++;
}
if (node->num_submatches > 0)
{
/* The next two tags are reserved for markers. */
next_tag++;
tag = next_tag;
next_tag++;
}
break;
}
}
if (node->submatch_id >= 0)
{
int i;
/* Push this submatch on the parents stack. */
for (i = 0; parents[i] >= 0; i++)
{
}
parents[i] = node->submatch_id;
parents[i + 1] = -1;
}
}
break; /* end case: ADDTAGS_RECURSE */
case ADDTAGS_AFTER_ITERATION:
{
int minimal = 0;
int enter_tag;
node = tre_stack_pop_voidptr(stack);
if (first_pass)
{
node->num_tags =
((tre_iteration_t *)node->obj)->arg->num_tags +
tre_stack_pop_int(
stack);
minimal_tag = -1;
}
else
{
minimal = tre_stack_pop_int(stack);
enter_tag = tre_stack_pop_int(stack);
if (minimal)
{
minimal_tag = enter_tag;
}
}
if (!first_pass)
{
if (minimal)
{
direction = TRE_TAG_MINIMIZE;
}
else
{
direction = TRE_TAG_MAXIMIZE;
}
}
break;
}
case ADDTAGS_AFTER_CAT_LEFT:
{
int new_tag = tre_stack_pop_int(stack);
next_tag = tre_stack_pop_int(stack);
if (new_tag >= 0)
{
tag = new_tag;
}
break;
}
case ADDTAGS_AFTER_CAT_RIGHT:
{
node = tre_stack_pop_voidptr(stack);
if (first_pass)
{
node->num_tags =
((tre_catenation_t *)node->obj)->left->num_tags +
((tre_catenation_t *)node->obj)->right->num_tags;
}
}
break;
case ADDTAGS_AFTER_UNION_LEFT:
{
/* Lift the bottom of the `regset' array so that when processing
* the right operand the items currently in the array are
* invisible. The original bottom was saved at ADDTAGS_UNION
* and
* will be restored at ADDTAGS_AFTER_UNION_RIGHT below.
*/
while (*regset >= 0)
{
regset++;
}
}
break;
case ADDTAGS_AFTER_UNION_RIGHT:
{
int added_tags;
int tag_left;
int tag_right;
tre_ast_node_t *left = tre_stack_pop_voidptr(stack);
tre_ast_node_t *right = tre_stack_pop_voidptr(stack);
node = tre_stack_pop_voidptr(stack);
added_tags = tre_stack_pop_int(stack);
if (first_pass)
{
node->num_tags = ((tre_union_t *)node->obj)->left->num_tags +
((tre_union_t *)node->obj)->right->num_tags +
added_tags +
((node->num_submatches > 0) ? 2 : 0);
}
regset = tre_stack_pop_voidptr(stack);
tag_left = tre_stack_pop_int(stack);
tag_right = tre_stack_pop_int(stack);
/* Add tags after both children, the left child gets a smaller
* tag than the right child. This guarantees that we prefer
* the left child over the right child.
*/
/* XXX - This is not always necessary (if the children have
* tags which must be seen for every match of that child).
*/
/* XXX - Check if this is the only place where tre_add_tag_right
* is used. If so, use tre_add_tag_left (putting the tag before
* the child as opposed after the child) and throw away
* tre_add_tag_right.
*/
if (node->num_submatches > 0)
{
if (!first_pass)
{
status = tre_add_tag_right(mem,
left,
tag_left);
tnfa->tag_directions[tag_left] = TRE_TAG_MAXIMIZE;
status = tre_add_tag_right(
mem,
right,
tag_right);
tnfa->tag_directions[tag_right] = TRE_TAG_MAXIMIZE;
}
num_tags += 2;
}
direction = TRE_TAG_MAXIMIZE;
break;
}
default:
{
assert(0);
}
break;
/* end switch(symbol)
*/
}
/* end while(tre_stack_num_objects(stack) > bottom)
*/
}
if (!first_pass)
{
tre_purge_regset(regset, tnfa, tag);
}
if (!first_pass && minimal_tag >= 0)
{
int i;
for (i = 0; tnfa->minimal_tags[i] >= 0; i++)
{
}
tnfa->minimal_tags[i] = tag;
tnfa->minimal_tags[i + 1] = minimal_tag;
tnfa->minimal_tags[i + 2] = -1;
minimal_tag = -1;
num_minimals++;
}
assert(tree->num_tags == num_tags);
tnfa->end_tag = num_tags;
tnfa->num_tags = num_tags;
tnfa->num_minimals = num_minimals;
xfree(orig_regset);
xfree(parents);
xfree(saved_states);
return status;
}
/* AST to TNFA compilation routines.
*/
/* Work-list symbols used by tre_copy_ast().  Each symbol is pushed on the
 * stack together with one pointer operand and tells the copy loop what to
 * do with that operand when it is popped.
 */
typedef enum
{
COPY_RECURSE, /* Operand is an AST node: copy it into the current slot. */
COPY_SET_RESULT_PTR /* Operand is the slot that receives the next copy. */
} tre_copyast_symbol_t;
/* Flags for tre_copy_ast(). */
/* COPY_REMOVE_TAGS: every tag literal is replaced by an EMPTY literal in the
 * copy.
 */
#define COPY_REMOVE_TAGS 1
/* COPY_MAXIMIZE_FIRST_TAG: the first tag literal met during the copy gets
 * its entry in `tag_directions' set to TRE_TAG_MAXIMIZE.
 */
#define COPY_MAXIMIZE_FIRST_TAG 2
/* Makes a deep copy of the AST rooted at `ast' and stores the root of the
 * copy in `*copy'.  The walk is iterative and keeps its work list on
 * `stack'.
 *
 *   mem            - allocator handed to the tre_ast_new_*() constructors.
 *   stack          - scratch stack; only entries above its height at entry
 *                    are consumed.
 *   ast            - root of the tree to copy.
 *   flags          - COPY_REMOVE_TAGS replaces every tag literal by an EMPTY
 *                    literal; COPY_MAXIMIZE_FIRST_TAG sets the direction of
 *                    the first tag met to TRE_TAG_MAXIMIZE.
 *   pos_add        - in/out: offset added to the position of every copied
 *                    ordinary literal and back reference; on return it has
 *                    been increased by the number of such literals.
 *   tag_directions - indexed by tag number; only written when
 *                    COPY_MAXIMIZE_FIRST_TAG is set and a tag is met.
 *   copy           - receives the root of the new tree.
 *   max_pos        - in/out: raised to the largest position handed to a
 *                    copied literal.
 *
 * Returns REG_OK on success and REG_ESPACE when a node cannot be allocated.
 * A failing stack push leaves its own status (the STACK_PUSH* macros are
 * defined elsewhere in this file).
 */

static reg_errcode_t tre_copy_ast(tre_mem_t mem, tre_stack_t *stack,
                                  tre_ast_node_t *ast, int flags,
                                  int *pos_add,
                                  tre_tag_direction_t *tag_directions,
                                  tre_ast_node_t **copy, int *max_pos)
{
  reg_errcode_t status = REG_OK;
  int bottom = tre_stack_num_objects(stack);
  int num_copied = 0;
  int first_tag = 1;
  tre_ast_node_t **result = copy;
  tre_copyast_symbol_t symbol;

  STACK_PUSH(stack, voidptr, ast);

  /* Only push the symbol when its operand made it onto the stack, otherwise
   * a successful second push would hide the failure of the first one and
   * leave a symbol without operand.
   */

  if (status == REG_OK)
    {
      STACK_PUSH(stack, int, COPY_RECURSE);
    }

  while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
    {
      tre_ast_node_t *node;

      symbol = (tre_copyast_symbol_t)tre_stack_pop_int(stack);
      switch (symbol)
        {
          case COPY_SET_RESULT_PTR:
            {
              /* The next copied node goes into the slot saved earlier. */

              result = tre_stack_pop_voidptr(stack);
            }
            break;

          case COPY_RECURSE:
            {
              node = tre_stack_pop_voidptr(stack);
              switch (node->type)
                {
                  case LITERAL:
                    {
                      tre_literal_t *lit = node->obj;
                      int pos = lit->position;
                      int min = lit->code_min;
                      int max = lit->code_max;

                      if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
                        {
                          /* XXX - e.g. [ab] has only one position but two
                           * nodes, so we are creating holes in the state
                           * space here.  Not fatal, just wastes memory.
                           */

                          pos += *pos_add;
                          num_copied++;
                        }
                      else if (IS_TAG(lit) && (flags & COPY_REMOVE_TAGS))
                        {
                          /* Change this tag to empty. */

                          min = EMPTY;
                          max = pos = -1;
                        }
                      else if (IS_TAG(lit) &&
                               (flags & COPY_MAXIMIZE_FIRST_TAG) &&
                               first_tag)
                        {
                          /* Maximize the first tag. */

                          tag_directions[max] = TRE_TAG_MAXIMIZE;
                          first_tag = 0;
                        }

                      *result = tre_ast_new_literal(mem, min, max, pos);
                      if (*result == NULL)
                        {
                          status = REG_ESPACE;
                        }
                      else
                        {
                          /* Carry the character class constraints over to
                           * the copy.  tre_compute_nfl() reads them from the
                           * literal, so without this a repeated bracket
                           * class would lose its class restriction.
                           */

                          tre_literal_t *dup = (*result)->obj;

                          dup->class = lit->class;
                          dup->neg_classes = lit->neg_classes;
                        }

                      if (pos > *max_pos)
                        {
                          *max_pos = pos;
                        }

                      break;
                    }

                  case UNION:
                    {
                      tre_union_t *uni = node->obj;
                      tre_union_t *tmp;

                      /* The children passed here are placeholders; both
                       * slots are overwritten when the children are copied.
                       */

                      *result = tre_ast_new_union(mem, uni->left,
                                                  uni->right);
                      if (*result == NULL)
                        {
                          status = REG_ESPACE;
                          break;
                        }

                      tmp = (*result)->obj;
                      result = &tmp->left;

                      /* Pushed in reverse: left child is copied first, then
                       * the output slot switches to `right', then the right
                       * child is copied.
                       */

                      STACK_PUSHX(stack, voidptr, uni->right);
                      STACK_PUSHX(stack, int, COPY_RECURSE);
                      STACK_PUSHX(stack, voidptr, &tmp->right);
                      STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
                      STACK_PUSHX(stack, voidptr, uni->left);
                      STACK_PUSHX(stack, int, COPY_RECURSE);
                      break;
                    }

                  case CATENATION:
                    {
                      tre_catenation_t *cat = node->obj;
                      tre_catenation_t *tmp;

                      *result = tre_ast_new_catenation(mem, cat->left,
                                                       cat->right);
                      if (*result == NULL)
                        {
                          status = REG_ESPACE;
                          break;
                        }

                      tmp = (*result)->obj;
                      tmp->left = NULL;
                      tmp->right = NULL;
                      result = &tmp->left;

                      /* Same ordering as for UNION above. */

                      STACK_PUSHX(stack, voidptr, cat->right);
                      STACK_PUSHX(stack, int, COPY_RECURSE);
                      STACK_PUSHX(stack, voidptr, &tmp->right);
                      STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
                      STACK_PUSHX(stack, voidptr, cat->left);
                      STACK_PUSHX(stack, int, COPY_RECURSE);
                      break;
                    }

                  case ITERATION:
                    {
                      tre_iteration_t *iter = node->obj;

                      /* Schedule the argument first; it is copied into the
                       * `arg' slot of the new iteration node set up below.
                       */

                      STACK_PUSHX(stack, voidptr, iter->arg);
                      STACK_PUSHX(stack, int, COPY_RECURSE);
                      *result = tre_ast_new_iter(mem, iter->arg, iter->min,
                                                 iter->max,
                                                 iter->minimal);
                      if (*result == NULL)
                        {
                          status = REG_ESPACE;
                          break;
                        }

                      iter = (*result)->obj;
                      result = &iter->arg;
                      break;
                    }

                  default:
                    {
                      assert(0);
                      break;
                    }
                }
            }
            break;
        }
    }

  *pos_add += num_copied;
  return status;
}
/* Work-list symbols used by tre_expand_ast().  Each symbol is pushed on the
 * stack together with the AST node it applies to.
 */
typedef enum
{
EXPAND_RECURSE, /* Visit the node and schedule its children. */
EXPAND_AFTER_ITER /* Argument of the iteration node is done: expand it. */
} tre_expand_ast_symbol_t;
/* Expands each iteration node whose minimum or maximum repeat count is
 * greater than one into a catenated sequence of copies of its argument.  The
 * iteration node is rewritten in place: its `type' and `obj' are replaced by
 * those of the generated sequence.
 *
 *   mem            - allocator passed to tre_copy_ast() and the
 *                    tre_ast_new_*() constructors.
 *   stack          - scratch stack for the iterative tree walk.
 *   ast            - root of the tree to process.
 *   position       - in/out: increased by the number of positions added by
 *                    the copies, and raised to the largest position seen if
 *                    that is bigger.
 *   tag_directions - forwarded to tre_copy_ast() for the copies that use
 *                    COPY_MAXIMIZE_FIRST_TAG.
 *
 * Returns REG_OK on success, REG_ESPACE when a node cannot be allocated, or
 * the error reported by tre_copy_ast().  NOTE(review): STACK_PUSHR and
 * STACK_PUSHX are defined elsewhere; presumably they return from the
 * function / break out of the switch when a push fails - confirm.
 */
static reg_errcode_t tre_expand_ast(tre_mem_t mem, tre_stack_t *stack,
tre_ast_node_t *ast, int *position,
tre_tag_direction_t *tag_directions)
{
reg_errcode_t status = REG_OK;
int bottom = tre_stack_num_objects(stack);
/* Offset currently added to literal positions while walking. */
int pos_add = 0;
/* Sum of the position increases caused by all expansions so far. */
int pos_add_total = 0;
int max_pos = 0;
/* Number of iteration nodes entered but not yet left. */
int iter_depth = 0;
STACK_PUSHR(stack, voidptr, ast);
STACK_PUSHR(stack, int, EXPAND_RECURSE);
while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
{
tre_ast_node_t *node;
tre_expand_ast_symbol_t symbol;
if (status != REG_OK)
{
break;
}
/* Every work item is a (node, symbol) pair; the symbol is on top. */
symbol = (tre_expand_ast_symbol_t)tre_stack_pop_int(stack);
node = tre_stack_pop_voidptr(stack);
switch (symbol)
{
case EXPAND_RECURSE:
{
switch (node->type)
{
case LITERAL:
{
tre_literal_t *lit = node->obj;
/* Only ordinary literals and back references own a position. */
if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
{
lit->position += pos_add;
if (lit->position > max_pos)
{
max_pos = lit->position;
}
}
break;
}
case UNION:
{
tre_union_t *uni = node->obj;
/* Right is pushed first so that left is visited first. */
STACK_PUSHX(stack, voidptr, uni->right);
STACK_PUSHX(stack, int, EXPAND_RECURSE);
STACK_PUSHX(stack, voidptr, uni->left);
STACK_PUSHX(stack, int, EXPAND_RECURSE);
break;
}
case CATENATION:
{
tre_catenation_t *cat = node->obj;
STACK_PUSHX(stack, voidptr, cat->right);
STACK_PUSHX(stack, int, EXPAND_RECURSE);
STACK_PUSHX(stack, voidptr, cat->left);
STACK_PUSHX(stack, int, EXPAND_RECURSE);
break;
}
case ITERATION:
{
tre_iteration_t *iter = node->obj;
/* Save the current offset; it is popped again at EXPAND_AFTER_ITER. */
STACK_PUSHX(stack, int, pos_add);
STACK_PUSHX(stack, voidptr, node);
STACK_PUSHX(stack, int, EXPAND_AFTER_ITER);
STACK_PUSHX(stack, voidptr, iter->arg);
STACK_PUSHX(stack, int, EXPAND_RECURSE);
/* If we are going to expand this node at EXPAND_AFTER_ITER
 * then don't increase the `pos' fields of the nodes now, it
 * will get done when expanding.
 */
if (iter->min > 1 || iter->max > 1)
{
pos_add = 0;
}
iter_depth++;
break;
}
default:
{
assert(0);
break;
}
}
}
break;
case EXPAND_AFTER_ITER:
{
tre_iteration_t *iter = node->obj;
int pos_add_last;
/* Restore the offset that was active when the iteration was entered. */
pos_add = tre_stack_pop_int(stack);
pos_add_last = pos_add;
if (iter->min > 1 || iter->max > 1)
{
/* seq1 collects the mandatory copies, seq2 the optional tail. */
tre_ast_node_t *seq1 = NULL, *seq2 = NULL;
int j;
/* Value of pos_add just before the most recent tre_copy_ast() call. */
int pos_add_save = pos_add;
/* Create a catenated sequence of copies of the node. */
for (j = 0; j < iter->min; j++)
{
tre_ast_node_t *copy;
/* Remove tags from all but the last copy. */
int flags =
((j + 1 <
iter->min) ? COPY_REMOVE_TAGS :
COPY_MAXIMIZE_FIRST_TAG);
pos_add_save = pos_add;
status = tre_copy_ast(mem, stack, iter->arg, flags,
&pos_add, tag_directions,
&copy, &max_pos);
if (status != REG_OK)
{
return status;
}
if (seq1 != NULL)
{
seq1 = tre_ast_new_catenation(mem, seq1, copy);
}
else
{
seq1 = copy;
}
if (seq1 == NULL)
{
return REG_ESPACE;
}
}
if (iter->max == -1)
{
/* No upper limit. */
/* The tail is one more copy wrapped in an iteration with min 0 and
 * max -1.
 */
pos_add_save = pos_add;
status = tre_copy_ast(mem, stack, iter->arg, 0,
&pos_add, NULL, &seq2,
&max_pos);
if (status != REG_OK)
{
return status;
}
seq2 = tre_ast_new_iter(mem, seq2, 0, -1, 0);
if (seq2 == NULL)
{
return REG_ESPACE;
}
}
else
{
/* Finite maximum: build (max - min) nested optional copies, each one
 * of the form union(EMPTY, catenation(copy, previous tail)).
 */
for (j = iter->min; j < iter->max; j++)
{
tre_ast_node_t *tmp, *copy;
pos_add_save = pos_add;
status = tre_copy_ast(mem, stack, iter->arg, 0,
&pos_add, NULL, &copy,
&max_pos);
if (status != REG_OK)
{
return status;
}
if (seq2 != NULL)
{
seq2 = tre_ast_new_catenation(mem, copy, seq2);
}
else
{
seq2 = copy;
}
if (seq2 == NULL)
{
return REG_ESPACE;
}
tmp = tre_ast_new_literal(mem, EMPTY, -1, -1);
if (tmp == NULL)
{
return REG_ESPACE;
}
seq2 = tre_ast_new_union(mem, tmp, seq2);
if (seq2 == NULL)
{
return REG_ESPACE;
}
}
}
/* Drop the increase made by the last copy: pos_add goes back to the value
 * it had right before the final tre_copy_ast() call.
 */
pos_add = pos_add_save;
if (seq1 == NULL)
{
seq1 = seq2;
}
else if (seq2 != NULL)
{
seq1 = tre_ast_new_catenation(mem, seq1, seq2);
}
if (seq1 == NULL)
{
return REG_ESPACE;
}
/* Turn the iteration node itself into the root of the new sequence. */
node->obj = seq1->obj;
node->type = seq1->type;
}
iter_depth--;
pos_add_total += pos_add - pos_add_last;
/* Back at the outermost level: later nodes are shifted by the total. */
if (iter_depth == 0)
{
pos_add = pos_add_total;
}
break;
}
default:
{
assert(0);
break;
}
}
}
*position += pos_add_total;
/* `max_pos' should never be larger than `*position' if the above
 * code works, but just an extra safeguard let's make sure
 * `*position' is set large enough so enough memory will be
 * allocated for the transition table.
 */
if (max_pos > *position)
{
*position = max_pos;
}
return status;
}
/* Allocates a position set that contains no positions: a single terminator
 * entry whose `position', `code_min' and `code_max' are all -1.
 *
 * Returns the new set, or NULL when the allocation fails.
 */

static tre_pos_and_tags_t *tre_set_empty(tre_mem_t mem)
{
  tre_pos_and_tags_t *sentinel = tre_mem_calloc(mem, sizeof(*sentinel));

  if (sentinel != NULL)
    {
      sentinel->position = -1;
      sentinel->code_min = -1;
      sentinel->code_max = -1;
    }

  return sentinel;
}
/* Allocates a position set holding exactly one position, followed by a
 * terminator entry whose `position', `code_min' and `code_max' are -1.
 *
 *   mem         - allocator for the set.
 *   position    - position stored in the single entry.
 *   code_min    - lower bound of the entry's character range.
 *   code_max    - upper bound of the entry's character range.
 *   class       - character class stored in the entry.
 *   neg_classes - negated class list stored in the entry (pointer is
 *                 stored as is, not copied).
 *   backref     - back reference number stored in the entry.
 *
 * Returns the new set, or NULL when the allocation fails.
 */

static tre_pos_and_tags_t *tre_set_one(tre_mem_t mem, int position,
                                       int code_min, int code_max,
                                       tre_ctype_t class,
                                       tre_ctype_t *neg_classes, int backref)
{
  tre_pos_and_tags_t *pair = tre_mem_calloc(mem, sizeof(*pair) * 2);
  tre_pos_and_tags_t *entry;
  tre_pos_and_tags_t *sentinel;

  if (!pair)
    {
      return NULL;
    }

  entry    = &pair[0];
  sentinel = &pair[1];

  entry->position    = position;
  entry->code_min    = code_min;
  entry->code_max    = code_max;
  entry->class       = class;
  entry->neg_classes = neg_classes;
  entry->backref     = backref;

  sentinel->position = -1;
  sentinel->code_min = -1;
  sentinel->code_max = -1;

  return pair;
}
/* Builds a new position set holding the entries of `set1' followed by the
 * entries of `set2'.
 *
 *   mem        - allocator for the new set and its tag arrays.
 *   set1       - first set, terminated by an entry with position < 0.
 *                Each copied entry gets `assertions' OR-ed into its
 *                assertions and `tags' appended to its own tag list.
 *   set2       - second set, same termination; entries are copied with
 *                their own assertions and tags only.
 *   tags       - extra tags for the `set1' entries, terminated by a
 *                negative value; may be NULL.
 *   assertions - extra assertion bits for the `set1' entries.
 *
 * Tag arrays are always duplicated, never shared with the inputs.
 *
 * Returns the new set, or NULL when any allocation fails.
 */

static tre_pos_and_tags_t *tre_set_union(tre_mem_t mem,
                                         tre_pos_and_tags_t *set1,
                                         tre_pos_and_tags_t *set2, int *tags,
                                         int assertions)
{
  tre_pos_and_tags_t *merged;
  tre_pos_and_tags_t *src;
  tre_pos_and_tags_t *dst;
  int *tag_copy;
  int extra = 0;
  int len1 = 0;
  int len2 = 0;
  int own;
  int idx;
  int k;

  /* Measure the three terminated input arrays. */

  if (tags != NULL)
    {
      while (tags[extra] >= 0)
        {
          extra++;
        }
    }

  while (set1[len1].position >= 0)
    {
      len1++;
    }

  while (set2[len2].position >= 0)
    {
      len2++;
    }

  merged = tre_mem_calloc(mem, sizeof(*merged) * (len1 + len2 + 1));
  if (merged == NULL)
    {
      return NULL;
    }

  /* Entries of the first set: add `assertions' and append `tags'. */

  for (idx = 0; idx < len1; idx++)
    {
      src = &set1[idx];
      dst = &merged[idx];

      dst->position    = src->position;
      dst->code_min    = src->code_min;
      dst->code_max    = src->code_max;
      dst->assertions  = src->assertions | assertions;
      dst->class       = src->class;
      dst->neg_classes = src->neg_classes;
      dst->backref     = src->backref;

      if (src->tags == NULL && tags == NULL)
        {
          dst->tags = NULL;
          continue;
        }

      own = 0;
      if (src->tags != NULL)
        {
          while (src->tags[own] >= 0)
            {
              own++;
            }
        }

      tag_copy = tre_mem_alloc(mem, (sizeof(*tag_copy) * (own + extra + 1)));
      if (tag_copy == NULL)
        {
          return NULL;
        }

      for (k = 0; k < own; k++)
        {
          tag_copy[k] = src->tags[k];
        }

      for (k = 0; k < extra; k++)
        {
          tag_copy[own + k] = tags[k];
        }

      tag_copy[own + extra] = -1;
      dst->tags = tag_copy;
    }

  /* Entries of the second set: plain copies. */

  for (idx = 0; idx < len2; idx++)
    {
      src = &set2[idx];
      dst = &merged[len1 + idx];

      dst->position    = src->position;
      dst->code_min    = src->code_min;
      dst->code_max    = src->code_max;

      /* XXX - why not | assertions here as well? */

      dst->assertions  = src->assertions;
      dst->class       = src->class;
      dst->neg_classes = src->neg_classes;
      dst->backref     = src->backref;

      if (src->tags == NULL)
        {
          dst->tags = NULL;
          continue;
        }

      own = 0;
      while (src->tags[own] >= 0)
        {
          own++;
        }

      tag_copy = tre_mem_alloc(mem, sizeof(*tag_copy) * (own + 1));
      if (tag_copy == NULL)
        {
          return NULL;
        }

      for (k = 0; k < own; k++)
        {
          tag_copy[k] = src->tags[k];
        }

      tag_copy[own] = -1;
      dst->tags = tag_copy;
    }

  /* Terminate the merged set. */

  merged[len1 + len2].position = -1;
  return merged;
}
/* Finds the empty path through `node' which is the one that should be
 * taken according to POSIX.2 rules, and collects what lies on that path.
 *
 *   stack         - scratch stack for the iterative walk.
 *   node          - root of the subtree to examine.
 *   tags          - if not NULL, a list terminated by a negative value to
 *                   which every tag on the path is appended (duplicates
 *                   are skipped).  The caller must provide room for all
 *                   tags plus the terminator.
 *   assertions    - if not NULL, the assertion bits on the path are OR-ed
 *                   into `*assertions'.
 *   num_tags_seen - if not NULL, set to the number of tag literals met on
 *                   the path (duplicates are counted).
 *
 * Returns REG_OK, or the status of a failed stack push.
 */

static reg_errcode_t tre_match_empty(tre_stack_t *stack,
                                     tre_ast_node_t *node,
                                     int *tags, int *assertions,
                                     int *num_tags_seen)
{
  tre_literal_t *lit;
  tre_union_t *uni;
  tre_catenation_t *cat;
  tre_iteration_t *iter;
  int i;
  int bottom = tre_stack_num_objects(stack);
  reg_errcode_t status = REG_OK;

  if (num_tags_seen)
    {
      *num_tags_seen = 0;
    }

  status = tre_stack_push_voidptr(stack, node);

  /* Walk through the tree recursively. */

  while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
    {
      node = tre_stack_pop_voidptr(stack);

      switch (node->type)
        {
          case LITERAL:
            {
              lit = (tre_literal_t *)node->obj;
              switch (lit->code_min)
                {
                  case TAG:
                    {
                      if (lit->code_max >= 0)
                        {
                          if (tags != NULL)
                            {
                              /* Add the tag to `tags' unless it is
                               * already there.
                               */

                              for (i = 0; tags[i] >= 0; i++)
                                {
                                  if (tags[i] == lit->code_max)
                                    {
                                      break;
                                    }
                                }

                              if (tags[i] < 0)
                                {
                                  tags[i] = lit->code_max;
                                  tags[i + 1] = -1;
                                }
                            }

                          if (num_tags_seen)
                            {
                              (*num_tags_seen)++;
                            }
                        }
                    }
                    break;

                  case ASSERTION:
                    {
                      /* The assertion code must lie inside the valid
                       * range.  Both bounds have to hold; with `||' the
                       * check could never fail.
                       */

                      assert(lit->code_max >= 1 &&
                             lit->code_max <= ASSERT_LAST);
                      if (assertions != NULL)
                        {
                          *assertions |= lit->code_max;
                        }
                    }
                    break;

                  case EMPTY:
                    {
                    }
                    break;

                  default:
                    {
                      /* Only nullable literals can be on an empty path. */

                      assert(0);
                    }
                    break;
                }
            }
            break;

          case UNION:
            {
              /* Subexpressions starting earlier take priority over ones
               * starting later, so we prefer the left subexpression over
               * the right subexpression.
               */

              uni = (tre_union_t *)node->obj;
              if (uni->left->nullable)
                {
                  STACK_PUSHX(stack, voidptr, uni->left);
                }
              else if (uni->right->nullable)
                {
                  STACK_PUSHX(stack, voidptr, uni->right);
                }
              else
                {
                  assert(0);
                }
            }
            break;

          case CATENATION:
            {
              /* The path must go through both children. */

              cat = (tre_catenation_t *)node->obj;
              assert(cat->left->nullable);
              assert(cat->right->nullable);
              STACK_PUSHX(stack, voidptr, cat->left);
              STACK_PUSHX(stack, voidptr, cat->right);
            }
            break;

          case ITERATION:
            {
              /* A match with an empty string is preferred over no match
               * at all, so we go through the argument if possible.
               */

              iter = (tre_iteration_t *)node->obj;
              if (iter->arg->nullable)
                {
                  STACK_PUSHX(stack, voidptr, iter->arg);
                }
            }
            break;

          default:
            {
              assert(0);
            }
            break;
        }
    }

  return status;
}
/* Work-list symbols used by tre_compute_nfl().  Each symbol is pushed on the
 * stack together with the AST node it applies to.
 */
typedef enum
{
NFL_RECURSE, /* Visit the node; schedule children and a POST step. */
NFL_POST_UNION, /* Children of a union are done: combine them. */
NFL_POST_CATENATION, /* Children of a catenation are done: combine them. */
NFL_POST_ITERATION /* Argument of an iteration is done: derive from it. */
} tre_nfl_stack_symbol_t;
/* Computes and fills in the fields `nullable', `firstpos', and `lastpos' for
 * the nodes of the AST `tree'.
 *
 *   mem   - allocator used for the position sets.
 *   stack - scratch stack for the iterative post-order walk; also handed to
 *           tre_match_empty().
 *   tree  - root of the AST to annotate.
 *
 * Returns REG_OK on success, REG_ESPACE when a set or a temporary tag array
 * cannot be allocated, or the error reported by tre_match_empty().
 * NOTE(review): STACK_PUSHR is defined elsewhere; presumably it returns from
 * the function when a push fails - confirm.
 */
static reg_errcode_t tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack,
tre_ast_node_t *tree)
{
int bottom = tre_stack_num_objects(stack);
STACK_PUSHR(stack, voidptr, tree);
STACK_PUSHR(stack, int, NFL_RECURSE);
while (tre_stack_num_objects(stack) > bottom)
{
tre_nfl_stack_symbol_t symbol;
tre_ast_node_t *node;
/* Every work item is a (node, symbol) pair; the symbol is on top. */
symbol = (tre_nfl_stack_symbol_t)tre_stack_pop_int(stack);
node = tre_stack_pop_voidptr(stack);
switch (symbol)
{
case NFL_RECURSE:
{
switch (node->type)
{
case LITERAL:
{
tre_literal_t *lit = (tre_literal_t *)node->obj;
if (IS_BACKREF(lit))
{
/* Back references: nullable = false, firstpos = {i},
 * lastpos = {i}.
 */
node->nullable = 0;
/* firstpos accepts the full character range and has no backref. */
node->firstpos = tre_set_one(mem, lit->position, 0,
TRE_CHAR_MAX, 0, NULL, -1);
if (!node->firstpos)
{
return REG_ESPACE;
}
/* lastpos carries the back reference number taken from code_max. */
node->lastpos = tre_set_one(mem, lit->position, 0,
TRE_CHAR_MAX, 0, NULL,
(int)lit->code_max);
if (!node->lastpos)
{
return REG_ESPACE;
}
}
else if (lit->code_min < 0)
{
/* Tags, empty strings, params, and zero width assertions:
 * nullable = true, firstpos = {}, and lastpos = {}.
 */
node->nullable = 1;
node->firstpos = tre_set_empty(mem);
if (!node->firstpos)
{
return REG_ESPACE;
}
node->lastpos = tre_set_empty(mem);
if (!node->lastpos)
{
return REG_ESPACE;
}
}
else
{
/* Literal at position i: nullable = false, firstpos = {i},
 * lastpos = {i}.
 */
node->nullable = 0;
/* firstpos gets only the character range, no class information. */
node->firstpos = tre_set_one(mem, lit->position,
(int)lit->code_min,
(int)lit->code_max,
0, NULL,
-1);
if (!node->firstpos)
{
return REG_ESPACE;
}
/* lastpos additionally carries the literal's class and neg_classes. */
node->lastpos = tre_set_one(mem, lit->position,
(int)lit->code_min,
(int)lit->code_max, lit->class,
lit->neg_classes, -1);
if (!node->lastpos)
{
return REG_ESPACE;
}
}
break;
}
case UNION:
{
/* Compute the attributes for the two subtrees, and after that
 * for this node.
 */
STACK_PUSHR(stack, voidptr, node);
STACK_PUSHR(stack, int, NFL_POST_UNION);
STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->right);
STACK_PUSHR(stack, int, NFL_RECURSE);
STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->left);
STACK_PUSHR(stack, int, NFL_RECURSE);
}
break;
case CATENATION:
{
/* Compute the attributes for the two subtrees, and after that
 * for this node.
 */
STACK_PUSHR(stack, voidptr, node);
STACK_PUSHR(stack, int, NFL_POST_CATENATION);
STACK_PUSHR(stack, voidptr,
((tre_catenation_t *)node->obj)->right);
STACK_PUSHR(stack, int, NFL_RECURSE);
STACK_PUSHR(stack, voidptr,
((tre_catenation_t *)node->obj)->left);
STACK_PUSHR(stack, int, NFL_RECURSE);
}
break;
case ITERATION:
{
/* Compute the attributes for the subtree, and after that for
 * this node.
 */
STACK_PUSHR(stack, voidptr, node);
STACK_PUSHR(stack, int, NFL_POST_ITERATION);
STACK_PUSHR(stack, voidptr,
((tre_iteration_t *)node->obj)->arg);
STACK_PUSHR(stack, int, NFL_RECURSE);
}
break;
}
}
break; /* end case: NFL_RECURSE */
case NFL_POST_UNION:
{
tre_union_t *uni = (tre_union_t *)node->obj;
/* A union is nullable when either branch is; its position sets are the
 * plain unions of the branches' sets (no extra tags or assertions).
 */
node->nullable = uni->left->nullable || uni->right->nullable;
node->firstpos = tre_set_union(mem, uni->left->firstpos,
uni->right->firstpos, NULL, 0);
if (!node->firstpos)
{
return REG_ESPACE;
}
node->lastpos = tre_set_union(mem, uni->left->lastpos,
uni->right->lastpos, NULL, 0);
if (!node->lastpos)
{
return REG_ESPACE;
}
break;
}
case NFL_POST_ITERATION:
{
tre_iteration_t *iter = (tre_iteration_t *)node->obj;
/* Nullable when zero repetitions are allowed or the argument is. */
if (iter->min == 0 || iter->arg->nullable)
{
node->nullable = 1;
}
else
{
node->nullable = 0;
}
/* The position sets are shared with the argument, not copied. */
node->firstpos = iter->arg->firstpos;
node->lastpos = iter->arg->lastpos;
break;
}
case NFL_POST_CATENATION:
{
int num_tags;
int *tags;
int assertions;
reg_errcode_t status;
tre_catenation_t *cat = node->obj;
node->nullable = cat->left->nullable && cat->right->nullable;
/* Compute firstpos. */
if (cat->left->nullable)
{
/* The left side matches the empty string. Make a first pass
 * with tre_match_empty() to get the number of tags and
 * parameters.
 */
status =
tre_match_empty(stack, cat->left, NULL, NULL, &num_tags);
if (status != REG_OK)
{
return status;
}
/* Allocate arrays for the tags and parameters. */
tags = xmalloc(sizeof(*tags) * (num_tags + 1));
if (!tags)
{
return REG_ESPACE;
}
tags[0] = -1;
assertions = 0;
/* Second pass with tre_match_empty() to get the list of
 * tags and parameters.
 */
status = tre_match_empty(stack, cat->left, tags, &assertions,
NULL);
if (status != REG_OK)
{
xfree(tags);
return status;
}
/* The right child's first positions are passed as the first set, so
 * they are the ones that receive the tags and assertions found on the
 * empty path through the left child.
 */
node->firstpos = tre_set_union(mem, cat->right->firstpos,
cat->left->firstpos, tags,
assertions);
xfree(tags);
if (!node->firstpos)
{
return REG_ESPACE;
}
}
else
{
node->firstpos = cat->left->firstpos;
}
/* Compute lastpos. */
if (cat->right->nullable)
{
/* The right side matches the empty string. Make a first pass
 * with tre_match_empty() to get the number of tags and
 * parameters.
 */
status = tre_match_empty(stack, cat->right, NULL, NULL,
&num_tags);
if (status != REG_OK)
{
return status;
}
/* Allocate arrays for the tags and parameters. */
tags = xmalloc(sizeof(int) * (num_tags + 1));
if (!tags)
{
return REG_ESPACE;
}
tags[0] = -1;
assertions = 0;
/* Second pass with tre_match_empty() to get the list of
 * tags and parameters.
 */
status = tre_match_empty(stack, cat->right, tags, &assertions,
NULL);
if (status != REG_OK)
{
xfree(tags);
return status;
}
/* The left child's last positions are passed as the first set, so they
 * receive the tags and assertions found on the empty path through the
 * right child.
 */
node->lastpos = tre_set_union(mem, cat->left->lastpos,
cat->right->lastpos, tags,
assertions);
xfree(tags);
if (!node->lastpos)
{
return REG_ESPACE;
}
}
else
{
node->lastpos = cat->right->lastpos;
}
break;
}
default:
{
assert(0);
}
break;
}
}
return REG_OK;
}
/* Adds a transition from each position in `p1' to each position in `p2'.
 *
 * The function has two modes:
 *  - `transitions' is NULL: nothing is created; `counts[pos]' is increased
 *    once for every (p1, p2) pair, giving an upper bound for the number of
 *    transitions leaving each p1 position.
 *  - `transitions' is not NULL: the transitions are written into the table,
 *    using `offs[pos]' as the index of the first slot of position `pos'.
 *
 *   p1, p2      - position sets terminated by an entry with position < 0.
 *   transitions - transition table to fill, or NULL for the counting mode.
 *   counts      - per-position counters (only used in counting mode).
 *   offs        - per-position start offsets into `transitions' (only used
 *                 in fill mode).
 *
 * Returns REG_OK, or REG_ESPACE when a neg_classes or tags array cannot be
 * allocated with xmalloc().
 */
static reg_errcode_t tre_make_trans(tre_pos_and_tags_t *p1,
tre_pos_and_tags_t *p2,
tre_tnfa_transition_t *transitions,
int *counts, int *offs)
{
tre_pos_and_tags_t *orig_p2 = p2;
tre_tnfa_transition_t *trans;
int i;
int j;
int k;
int l;
int dup;
int prev_p2_pos;
if (transitions != NULL)
{
while (p1->position >= 0)
{
p2 = orig_p2;
prev_p2_pos = -1;
while (p2->position >= 0)
{
/* Optimization: if this position was already handled, skip it.
 */
/* Only a repeat of the immediately preceding p2 entry is detected. */
if (p2->position == prev_p2_pos)
{
p2++;
continue;
}
prev_p2_pos = p2->position;
/* Set `trans' to point to the next unused transition from
 * position `p1->position'.
 */
trans = transitions + offs[p1->position];
while (trans->state != NULL)
{
#if 0
/* If we find a previous transition from `p1->position' to
 * `p2->position', it is overwritten. This can happen only
 * if there are nested loops in the regexp, like in
 * "((a)*)*". In POSIX.2 repetition using the outer loop
 * is always preferred over using the inner loop.
 * Therefore the transition for the inner loop is useless
 * and can be thrown away.
 */
/* XXX - The same position is used for all nodes in a
 * bracket expression, so this optimization cannot be
 * used (it will break bracket expressions) unless I
 * figure out a way to detect it here.
 */
if (trans->state_id == p2->position)
{
break;
}
#endif
trans++;
}
/* Mark the slot after the one being filled as the new end of list. */
if (trans->state == NULL)
{
(trans + 1)->state = NULL;
}
/* Use the character ranges, assertions, etc. from `p1' for
 * the transition from `p1' to `p2'.
 */
trans->code_min = p1->code_min;
trans->code_max = p1->code_max;
/* The target state is the first transition slot of `p2->position'. */
trans->state = transitions + offs[p2->position];
trans->state_id = p2->position;
trans->assertions = p1->assertions | p2->assertions |
(p1->class ? ASSERT_CHAR_CLASS : 0) |
(p1->neg_classes !=
NULL ? ASSERT_CHAR_CLASS_NEG : 0);
/* `u' holds either the back reference number or the character class. */
if (p1->backref >= 0)
{
assert((trans->assertions & ASSERT_CHAR_CLASS) == 0);
assert(p2->backref < 0);
trans->u.backref = p1->backref;
trans->assertions |= ASSERT_BACKREF;
}
else
{
trans->u.class = p1->class;
}
/* The transition gets its own xmalloc()ed copy of the zero-terminated
 * negated class list.
 */
if (p1->neg_classes != NULL)
{
for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++)
{
}
trans->neg_classes =
xmalloc(sizeof(*trans->neg_classes) * (i + 1));
if (trans->neg_classes == NULL)
{
return REG_ESPACE;
}
for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++)
{
trans->neg_classes[i] = p1->neg_classes[i];
}
trans->neg_classes[i] = (tre_ctype_t)0;
}
else
{
trans->neg_classes = NULL;
}
/* Find out how many tags this transition has. */
/* i = number of tags on p1, j = number of tags on p2. */
i = 0;
if (p1->tags != NULL)
{
while (p1->tags[i] >= 0)
{
i++;
}
}
j = 0;
if (p2->tags != NULL)
{
while (p2->tags[j] >= 0)
{
j++;
}
}
/* If we are overwriting a transition, free the old tag array.
 */
if (trans->tags != NULL)
{
xfree(trans->tags);
}
trans->tags = NULL;
/* If there were any tags, allocate an array and fill it. */
if (i + j > 0)
{
trans->tags = xmalloc(sizeof(*trans->tags) * (i + j + 1));
if (!trans->tags)
{
return REG_ESPACE;
}
/* First all tags of p1 ... */
i = 0;
if (p1->tags != NULL)
{
while (p1->tags[i] >= 0)
{
trans->tags[i] = p1->tags[i];
i++;
}
}
/* ... then the tags of p2; `l' is the next free index in the result. */
l = i;
j = 0;
if (p2->tags != NULL)
{
while (p2->tags[j] >= 0)
{
/* Don't add duplicates. */
/* Only the p1 part (the first `i' entries) is searched. */
dup = 0;
for (k = 0; k < i; k++)
{
if (trans->tags[k] == p2->tags[j])
{
dup = 1;
break;
}
}
if (!dup)
{
trans->tags[l++] = p2->tags[j];
}
j++;
}
}
trans->tags[l] = -1;
}
p2++;
}
p1++;
}
}
else
{
/* Compute a maximum limit for the number of transitions leaving
 * from each state.
 */
while (p1->position >= 0)
{
p2 = orig_p2;
while (p2->position >= 0)
{
counts[p1->position]++;
p2++;
}
p1++;
}
}
return REG_OK;
}
/* Converts the syntax tree to a TNFA.  All the transitions in the TNFA are
 * labelled with one character range (there are no transitions on empty
 * strings).  The TNFA takes O(n^2) space in the worst case, `n' is size of
 * the regexp.
 *
 * The tree is walked recursively.  For every catenation, transitions are
 * made from the last positions of the left child to the first positions of
 * the right child; for every unbounded iteration, from the last positions
 * of the argument back to its first positions.  `transitions', `counts' and
 * `offs' are forwarded unchanged to tre_make_trans().
 *
 * Returns REG_OK, or the first error reported by tre_make_trans() or by a
 * recursive call.
 */

static reg_errcode_t tre_ast_to_tnfa(tre_ast_node_t *node,
                                     tre_tnfa_transition_t *transitions,
                                     int *counts, int *offs)
{
  reg_errcode_t rc = REG_OK;

  /* XXX - recurse using a stack!. */

  if (node->type == UNION)
    {
      tre_union_t *alt = (tre_union_t *)node->obj;

      rc = tre_ast_to_tnfa(alt->left, transitions, counts, offs);
      if (rc == REG_OK)
        {
          rc = tre_ast_to_tnfa(alt->right, transitions, counts, offs);
        }
    }
  else if (node->type == CATENATION)
    {
      tre_catenation_t *seq = (tre_catenation_t *)node->obj;

      /* Link every last position of the left child to every first
       * position of the right child, then descend into both children.
       */

      rc = tre_make_trans(seq->left->lastpos, seq->right->firstpos,
                          transitions, counts, offs);
      if (rc == REG_OK)
        {
          rc = tre_ast_to_tnfa(seq->left, transitions, counts, offs);
        }

      if (rc == REG_OK)
        {
          rc = tre_ast_to_tnfa(seq->right, transitions, counts, offs);
        }
    }
  else if (node->type == ITERATION)
    {
      tre_iteration_t *rep = (tre_iteration_t *)node->obj;

      assert(rep->max == -1 || rep->max == 1);
      if (rep->max == -1)
        {
          assert(rep->min == 0 || rep->min == 1);

          /* Unbounded loop: link every last position of the argument
           * back to every first position of the argument.
           */

          rc = tre_make_trans(rep->arg->lastpos, rep->arg->firstpos,
                              transitions, counts, offs);
        }

      if (rc == REG_OK)
        {
          rc = tre_ast_to_tnfa(rep->arg, transitions, counts, offs);
        }
    }

  /* LITERAL nodes (and anything else) contribute no transitions here. */

  return rc;
}
/* Stores `err' in the local variable `errcode' and jumps to the label
 * `error_exit'.  Both the variable and the label must exist in the function
 * that uses the macro.  The do/while (0) wrapper makes the macro behave like
 * a single statement.  NOTE(review): the constant `if (1)' around the goto
 * presumably silences unreachable-code diagnostics - confirm.
 */
#define ERROR_EXIT(err) \
do \
{ \
errcode = err; \
if (/* CONSTCOND */ 1) \
goto error_exit; \
} \
while (/* CONSTCOND */ 0)
int regcomp(regex_t *restrict preg, const char *restrict regex, int cflags)
{
tre_stack_t *stack;
tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r;
tre_pos_and_tags_t *p;
int *counts = NULL;
int *offs = NULL;
int i;
int add = 0;
tre_tnfa_transition_t *transitions, *initial;
tre_tnfa_t *tnfa = NULL;
tre_submatch_data_t *submatch_data;
tre_tag_direction_t *tag_directions = NULL;
reg_errcode_t errcode;
tre_mem_t mem;
/* Parse context. */
tre_parse_ctx_t parse_ctx;
/* Allocate a stack used throughout the compilation process for various
* purposes.
*/
stack = tre_stack_new(512, 10240, 128);
if (!stack)
{
return REG_ESPACE;
}
/* Allocate a fast memory allocator. */
mem = tre_mem_new();
if (!mem)
{
tre_stack_destroy(stack);
return REG_ESPACE;
}
/* Parse the regexp. */
memset(&parse_ctx, 0, sizeof(parse_ctx));
parse_ctx.mem = mem;
parse_ctx.stack = stack;
parse_ctx.re = regex;
parse_ctx.cflags = cflags;
parse_ctx.max_backref = -1;
errcode = tre_parse(&parse_ctx);
if (errcode != REG_OK)
{
ERROR_EXIT(errcode);
}
preg->re_nsub = parse_ctx.submatch_id - 1;
tree = parse_ctx.n;
#ifdef TRE_DEBUG
tre_ast_print(tree);
#endif /* TRE_DEBUG */
/* Referring to nonexistent subexpressions is illegal. */
if (parse_ctx.max_backref > (int)preg->re_nsub)
{
ERROR_EXIT(REG_ESUBREG);
}
/* Allocate the TNFA struct. */
tnfa = xcalloc(1, sizeof(tre_tnfa_t));
if (tnfa == NULL)
{
ERROR_EXIT(REG_ESPACE);
}
tnfa->have_backrefs = parse_ctx.max_backref >= 0;
tnfa->have_approx = 0;
tnfa->num_submatches = parse_ctx.submatch_id;
/* Set up tags for submatch addressing. If REG_NOSUB is set and the
* regexp does not have back references, this can be skipped.
*/
if (tnfa->have_backrefs || !(cflags & REG_NOSUB))
{
/* Figure out how many tags we will need. */
errcode = tre_add_tags(NULL, stack, tree, tnfa);
if (errcode != REG_OK)
{
ERROR_EXIT(errcode);
}
if (tnfa->num_tags > 0)
{
tag_directions =
xmalloc(sizeof(*tag_directions) * (tnfa->num_tags + 1));
if (tag_directions == NULL)
{
ERROR_EXIT(REG_ESPACE);
}
tnfa->tag_directions = tag_directions;
memset(tag_directions, -1,
sizeof(*tag_directions) * (tnfa->num_tags + 1));
}
tnfa->minimal_tags =
xcalloc((unsigned)tnfa->num_tags * 2 + 1,
sizeof(*tnfa->minimal_tags));
if (tnfa->minimal_tags == NULL)
{
ERROR_EXIT(REG_ESPACE);
}
submatch_data =
xcalloc((unsigned)parse_ctx.submatch_id, sizeof(*submatch_data));
if (submatch_data == NULL)
{
ERROR_EXIT(REG_ESPACE);
}
tnfa->submatch_data = submatch_data;
errcode = tre_add_tags(mem, stack, tree, tnfa);
if (errcode != REG_OK)
{
ERROR_EXIT(errcode);
}
}
/* Expand iteration nodes. */
errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position,
tag_directions);
if (errcode != REG_OK)
{
ERROR_EXIT(errcode);
}
/* Add a dummy node for the final state.
* XXX - For certain patterns this dummy node can be optimized away,
* for example "a*" or "ab*". Figure out a simple way to detect
* this possibility.
*/
tmp_ast_l = tree;
tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++);
if (tmp_ast_r == NULL)
{
ERROR_EXIT(REG_ESPACE);
}
tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r);
if (tree == NULL)
{
ERROR_EXIT(REG_ESPACE);
}
errcode = tre_compute_nfl(mem, stack, tree);
if (errcode != REG_OK)
{
ERROR_EXIT(errcode);
}
counts = xmalloc(sizeof(int) * parse_ctx.position);
if (counts == NULL)
{
ERROR_EXIT(REG_ESPACE);
}
offs = xmalloc(sizeof(int) * parse_ctx.position);
if (offs == NULL)
{
ERROR_EXIT(REG_ESPACE);
}
for (i = 0; i < parse_ctx.position; i++)
{
counts[i] = 0;
}
tre_ast_to_tnfa(tree, NULL, counts, NULL);
add = 0;
for (i = 0; i < parse_ctx.position; i++)
{
offs[i] = add;
add += counts[i] + 1;
counts[i] = 0;
}
transitions = xcalloc((unsigned)add + 1, sizeof(*transitions));
if (transitions == NULL)
{
ERROR_EXIT(REG_ESPACE);
}
tnfa->transitions = transitions;
tnfa->num_transitions = add;
errcode = tre_ast_to_tnfa(tree, transitions, counts, offs);
if (errcode != REG_OK)
{
ERROR_EXIT(errcode);
}
tnfa->firstpos_chars = NULL;
p = tree->firstpos;
i = 0;
while (p->position >= 0)
{
i++;
p++;
}
initial = xcalloc((unsigned)i + 1, sizeof(tre_tnfa_transition_t));
if (initial == NULL)
{
ERROR_EXIT(REG_ESPACE);
}
tnfa->initial = initial;
i = 0;
for (p = tree->firstpos; p->position >= 0; p++)
{
initial[i].state = transitions + offs[p->position];
initial[i].state_id = p->position;
initial[i].tags = NULL;
/* Copy the arrays p->tags, and p->params, they are allocated
* from a tre_mem object.
*/
if (p->tags)
{
int j;
for (j = 0; p->tags[j] >= 0; j++)
{
}
initial[i].tags = xmalloc(sizeof(*p->tags) * (j + 1));
if (!initial[i].tags)
{
ERROR_EXIT(REG_ESPACE);
}
memcpy(initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1));
}
initial[i].assertions = p->assertions;
i++;
}
initial[i].state = NULL;
tnfa->num_transitions = add;
tnfa->final = transitions + offs[tree->lastpos[0].position];
tnfa->num_states = parse_ctx.position;
tnfa->cflags = cflags;
tre_mem_destroy(mem);
tre_stack_destroy(stack);
xfree(counts);
xfree(offs);
preg->TRE_REGEX_T_FIELD = (void *)tnfa;
return REG_OK;
error_exit:
/* Free everything that was allocated and return the error code. */
tre_mem_destroy(mem);
if (stack != NULL)
{
tre_stack_destroy(stack);
}
if (counts != NULL)
{
xfree(counts);
}
if (offs != NULL)
{
xfree(offs);
}
preg->TRE_REGEX_T_FIELD = (void *)tnfa;
regfree(preg);
return errcode;
}
/****************************************************************************
 * Name: regfree
 *
 * Description:
 *   Release the TNFA attached to `preg` together with everything it owns:
 *   the per-transition tag and negated-class arrays, the transition table,
 *   the initial-transition array and its tag arrays, the submatch data and
 *   its parent lists, the tag directions, firstpos_chars and minimal_tags.
 *
 *   Every member is checked for NULL before it is released, which allows
 *   the function to be used on a TNFA that was only partially constructed.
 *
 * Input Parameters:
 *   preg - The pattern object whose compiled data is to be released.  If
 *          it holds no compiled data (NULL field) the call does nothing.
 *
 * Returned Value:
 *   None.  On return the compiled-pattern field of `preg` is NULL, so
 *   calling regfree() again on the same object is a harmless no-op.
 *
 ****************************************************************************/

void regfree(regex_t *preg)
{
  tre_tnfa_t *tnfa;
  unsigned int i;
  tre_tnfa_transition_t *trans;

  tnfa = (void *)preg->TRE_REGEX_T_FIELD;
  if (!tnfa)
    {
      return;
    }

  /* Only slots with a non-NULL state are inspected for owned arrays. */

  for (i = 0; i < tnfa->num_transitions; i++)
    {
      if (tnfa->transitions[i].state)
        {
          if (tnfa->transitions[i].tags)
            {
              xfree(tnfa->transitions[i].tags);
            }

          if (tnfa->transitions[i].neg_classes)
            {
              xfree(tnfa->transitions[i].neg_classes);
            }
        }
    }

  if (tnfa->transitions)
    {
      xfree(tnfa->transitions);
    }

  if (tnfa->initial)
    {
      /* The array is terminated by an entry whose state is NULL. */

      for (trans = tnfa->initial; trans->state; trans++)
        {
          if (trans->tags)
            {
              xfree(trans->tags);
            }
        }

      xfree(tnfa->initial);
    }

  if (tnfa->submatch_data)
    {
      for (i = 0; i < tnfa->num_submatches; i++)
        {
          if (tnfa->submatch_data[i].parents)
            {
              xfree(tnfa->submatch_data[i].parents);
            }
        }

      xfree(tnfa->submatch_data);
    }

  if (tnfa->tag_directions)
    {
      xfree(tnfa->tag_directions);
    }

  if (tnfa->firstpos_chars)
    {
      xfree(tnfa->firstpos_chars);
    }

  if (tnfa->minimal_tags)
    {
      xfree(tnfa->minimal_tags);
    }

  xfree(tnfa);

  /* Do not leave a dangling pointer behind: a second regfree() on the
   * same object would otherwise walk and free already released memory.
   */

  preg->TRE_REGEX_T_FIELD = NULL;
}