1/*
2  regcomp.c - TRE POSIX compatible regex compilation functions.
3
4  Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>
5  All rights reserved.
6
7  Redistribution and use in source and binary forms, with or without
8  modification, are permitted provided that the following conditions
9  are met:
10
11    1. Redistributions of source code must retain the above copyright
12       notice, this list of conditions and the following disclaimer.
13
14    2. Redistributions in binary form must reproduce the above copyright
15       notice, this list of conditions and the following disclaimer in the
16       documentation and/or other materials provided with the distribution.
17
18  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
19  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
22  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30*/
31
32#include <string.h>
33#include <stdlib.h>
34#include <regex.h>
35#include <limits.h>
36#include <stdint.h>
37#include <ctype.h>
38
39#include "tre.h"
40
41#include <assert.h>
42
43/***********************************************************************
44 from tre-compile.h
45***********************************************************************/
46
47typedef struct {
48  int position;
49  int code_min;
50  int code_max;
51  int *tags;
52  int assertions;
53  tre_ctype_t class;
54  tre_ctype_t *neg_classes;
55  int backref;
56} tre_pos_and_tags_t;
57
58
59/***********************************************************************
60 from tre-ast.c and tre-ast.h
61***********************************************************************/
62
63/* The different AST node types. */
64typedef enum {
65  LITERAL,
66  CATENATION,
67  ITERATION,
68  UNION
69} tre_ast_type_t;
70
71/* Special subtypes of TRE_LITERAL. */
72#define EMPTY	  -1   /* Empty leaf (denotes empty string). */
73#define ASSERTION -2   /* Assertion leaf. */
74#define TAG	  -3   /* Tag leaf. */
75#define BACKREF	  -4   /* Back reference leaf. */
76
77#define IS_SPECIAL(x)	((x)->code_min < 0)
78#define IS_EMPTY(x)	((x)->code_min == EMPTY)
79#define IS_ASSERTION(x) ((x)->code_min == ASSERTION)
80#define IS_TAG(x)	((x)->code_min == TAG)
81#define IS_BACKREF(x)	((x)->code_min == BACKREF)
82
83
84/* A generic AST node.  All AST nodes consist of this node on the top
85   level with `obj' pointing to the actual content. */
86typedef struct {
87  tre_ast_type_t type;   /* Type of the node. */
88  void *obj;             /* Pointer to actual node. */
89  int nullable;
90  int submatch_id;
91  int num_submatches;
92  int num_tags;
93  tre_pos_and_tags_t *firstpos;
94  tre_pos_and_tags_t *lastpos;
95} tre_ast_node_t;
96
97
98/* A "literal" node.  These are created for assertions, back references,
99   tags, matching parameter settings, and all expressions that match one
100   character. */
101typedef struct {
102  long code_min;
103  long code_max;
104  int position;
105  tre_ctype_t class;
106  tre_ctype_t *neg_classes;
107} tre_literal_t;
108
109/* A "catenation" node.	 These are created when two regexps are concatenated.
110   If there are more than one subexpressions in sequence, the `left' part
111   holds all but the last, and `right' part holds the last subexpression
112   (catenation is left associative). */
113typedef struct {
114  tre_ast_node_t *left;
115  tre_ast_node_t *right;
116} tre_catenation_t;
117
118/* An "iteration" node.	 These are created for the "*", "+", "?", and "{m,n}"
119   operators. */
120typedef struct {
121  /* Subexpression to match. */
122  tre_ast_node_t *arg;
123  /* Minimum number of consecutive matches. */
124  int min;
125  /* Maximum number of consecutive matches. */
126  int max;
127  /* If 0, match as many characters as possible, if 1 match as few as
128     possible.	Note that this does not always mean the same thing as
129     matching as many/few repetitions as possible. */
130  unsigned int minimal:1;
131} tre_iteration_t;
132
133/* An "union" node.  These are created for the "|" operator. */
134typedef struct {
135  tre_ast_node_t *left;
136  tre_ast_node_t *right;
137} tre_union_t;
138
139
140static tre_ast_node_t *
141tre_ast_new_node(tre_mem_t mem, int type, void *obj)
142{
143	tre_ast_node_t *node = tre_mem_calloc(mem, sizeof *node);
144	if (!node || !obj)
145		return 0;
146	node->obj = obj;
147	node->type = type;
148	node->nullable = -1;
149	node->submatch_id = -1;
150	return node;
151}
152
153static tre_ast_node_t *
154tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position)
155{
156	tre_ast_node_t *node;
157	tre_literal_t *lit;
158
159	lit = tre_mem_calloc(mem, sizeof *lit);
160	node = tre_ast_new_node(mem, LITERAL, lit);
161	if (!node)
162		return 0;
163	lit->code_min = code_min;
164	lit->code_max = code_max;
165	lit->position = position;
166	return node;
167}
168
169static tre_ast_node_t *
170tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max, int minimal)
171{
172	tre_ast_node_t *node;
173	tre_iteration_t *iter;
174
175	iter = tre_mem_calloc(mem, sizeof *iter);
176	node = tre_ast_new_node(mem, ITERATION, iter);
177	if (!node)
178		return 0;
179	iter->arg = arg;
180	iter->min = min;
181	iter->max = max;
182	iter->minimal = minimal;
183	node->num_submatches = arg->num_submatches;
184	return node;
185}
186
187static tre_ast_node_t *
188tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
189{
190	tre_ast_node_t *node;
191	tre_union_t *un;
192
193	if (!left)
194		return right;
195	un = tre_mem_calloc(mem, sizeof *un);
196	node = tre_ast_new_node(mem, UNION, un);
197	if (!node || !right)
198		return 0;
199	un->left = left;
200	un->right = right;
201	node->num_submatches = left->num_submatches + right->num_submatches;
202	return node;
203}
204
205static tre_ast_node_t *
206tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
207{
208	tre_ast_node_t *node;
209	tre_catenation_t *cat;
210
211	if (!left)
212		return right;
213	cat = tre_mem_calloc(mem, sizeof *cat);
214	node = tre_ast_new_node(mem, CATENATION, cat);
215	if (!node)
216		return 0;
217	cat->left = left;
218	cat->right = right;
219	node->num_submatches = left->num_submatches + right->num_submatches;
220	return node;
221}
222
223
224/***********************************************************************
225 from tre-stack.c and tre-stack.h
226***********************************************************************/
227
228typedef struct tre_stack_rec tre_stack_t;
229
230/* Creates a new stack object.	`size' is initial size in bytes, `max_size'
231   is maximum size, and `increment' specifies how much more space will be
232   allocated with realloc() if all space gets used up.	Returns the stack
233   object or NULL if out of memory. */
234static tre_stack_t *
235tre_stack_new(int size, int max_size, int increment);
236
237/* Frees the stack object. */
238static void
239tre_stack_destroy(tre_stack_t *s);
240
241/* Returns the current number of objects in the stack. */
242static int
243tre_stack_num_objects(tre_stack_t *s);
244
245/* Each tre_stack_push_*(tre_stack_t *s, <type> value) function pushes
246   `value' on top of stack `s'.  Returns REG_ESPACE if out of memory.
247   This tries to realloc() more space before failing if maximum size
248   has not yet been reached.  Returns REG_OK if successful. */
249#define declare_pushf(typetag, type)					      \
250  static reg_errcode_t tre_stack_push_ ## typetag(tre_stack_t *s, type value)
251
252declare_pushf(voidptr, void *);
253declare_pushf(int, int);
254
255/* Each tre_stack_pop_*(tre_stack_t *s) function pops the topmost
256   element off of stack `s' and returns it.  The stack must not be
257   empty. */
258#define declare_popf(typetag, type)		  \
259  static type tre_stack_pop_ ## typetag(tre_stack_t *s)
260
261declare_popf(voidptr, void *);
262declare_popf(int, int);
263
264/* Just to save some typing. */
265#define STACK_PUSH(s, typetag, value)					      \
266  do									      \
267    {									      \
268      status = tre_stack_push_ ## typetag(s, value);			      \
269    }									      \
270  while (/*CONSTCOND*/0)
271
272#define STACK_PUSHX(s, typetag, value)					      \
273  {									      \
274    status = tre_stack_push_ ## typetag(s, value);			      \
275    if (status != REG_OK)						      \
276      break;								      \
277  }
278
279#define STACK_PUSHR(s, typetag, value)					      \
280  {									      \
281    reg_errcode_t _status;						      \
282    _status = tre_stack_push_ ## typetag(s, value);			      \
283    if (_status != REG_OK)						      \
284      return _status;							      \
285  }
286
287union tre_stack_item {
288  void *voidptr_value;
289  int int_value;
290};
291
292struct tre_stack_rec {
293  int size;
294  int max_size;
295  int increment;
296  int ptr;
297  union tre_stack_item *stack;
298};
299
300
301static tre_stack_t *
302tre_stack_new(int size, int max_size, int increment)
303{
304  tre_stack_t *s;
305
306  s = xmalloc(sizeof(*s));
307  if (s != NULL)
308    {
309      s->stack = xmalloc(sizeof(*s->stack) * size);
310      if (s->stack == NULL)
311	{
312	  xfree(s);
313	  return NULL;
314	}
315      s->size = size;
316      s->max_size = max_size;
317      s->increment = increment;
318      s->ptr = 0;
319    }
320  return s;
321}
322
323static void
324tre_stack_destroy(tre_stack_t *s)
325{
326  xfree(s->stack);
327  xfree(s);
328}
329
330static int
331tre_stack_num_objects(tre_stack_t *s)
332{
333  return s->ptr;
334}
335
336static reg_errcode_t
337tre_stack_push(tre_stack_t *s, union tre_stack_item value)
338{
339  if (s->ptr < s->size)
340    {
341      s->stack[s->ptr] = value;
342      s->ptr++;
343    }
344  else
345    {
346      if (s->size >= s->max_size)
347	{
348	  return REG_ESPACE;
349	}
350      else
351	{
352	  union tre_stack_item *new_buffer;
353	  int new_size;
354	  new_size = s->size + s->increment;
355	  if (new_size > s->max_size)
356	    new_size = s->max_size;
357	  new_buffer = xrealloc(s->stack, sizeof(*new_buffer) * new_size);
358	  if (new_buffer == NULL)
359	    {
360	      return REG_ESPACE;
361	    }
362	  assert(new_size > s->size);
363	  s->size = new_size;
364	  s->stack = new_buffer;
365	  tre_stack_push(s, value);
366	}
367    }
368  return REG_OK;
369}
370
371#define define_pushf(typetag, type)  \
372  declare_pushf(typetag, type) {     \
373    union tre_stack_item item;	     \
374    item.typetag ## _value = value;  \
375    return tre_stack_push(s, item);  \
376}
377
378define_pushf(int, int)
379define_pushf(voidptr, void *)
380
381#define define_popf(typetag, type)		    \
382  declare_popf(typetag, type) {			    \
383    return s->stack[--s->ptr].typetag ## _value;    \
384  }
385
386define_popf(int, int)
387define_popf(voidptr, void *)
388
389
390/***********************************************************************
391 from tre-parse.c and tre-parse.h
392***********************************************************************/
393
394/* Parse context. */
395typedef struct {
396	/* Memory allocator. The AST is allocated using this. */
397	tre_mem_t mem;
398	/* Stack used for keeping track of regexp syntax. */
399	tre_stack_t *stack;
400	/* The parsed node after a parse function returns. */
401	tre_ast_node_t *n;
402	/* Position in the regexp pattern after a parse function returns. */
403	const char *s;
404	/* The first character of the last subexpression parsed. */
405	const char *start;
406	/* Current submatch ID. */
407	int submatch_id;
408	/* Current position (number of literal). */
409	int position;
410	/* The highest back reference or -1 if none seen so far. */
411	int max_backref;
412	/* Compilation flags. */
413	int cflags;
414} tre_parse_ctx_t;
415
416/* Some macros for expanding \w, \s, etc. */
417static const struct {
418	char c;
419	const char *expansion;
420} tre_macros[] = {
421	{'t', "\t"}, {'n', "\n"}, {'r', "\r"},
422	{'f', "\f"}, {'a', "\a"}, {'e', "\033"},
423	{'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
424	{'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"},
425	{ 0, 0 }
426};
427
428/* Expands a macro delimited by `regex' and `regex_end' to `buf', which
429   must have at least `len' items.  Sets buf[0] to zero if the there
430   is no match in `tre_macros'. */
431static const char *tre_expand_macro(const char *s)
432{
433	int i;
434	for (i = 0; tre_macros[i].c && tre_macros[i].c != *s; i++);
435	return tre_macros[i].expansion;
436}
437
438static int
439tre_compare_lit(const void *a, const void *b)
440{
441	const tre_literal_t *const *la = a;
442	const tre_literal_t *const *lb = b;
443	/* assumes the range of valid code_min is < INT_MAX */
444	return la[0]->code_min - lb[0]->code_min;
445}
446
447struct literals {
448	tre_mem_t mem;
449	tre_literal_t **a;
450	int len;
451	int cap;
452};
453
454static tre_literal_t *tre_new_lit(struct literals *p)
455{
456	tre_literal_t **a;
457	if (p->len >= p->cap) {
458		if (p->cap >= 1<<15)
459			return 0;
460		p->cap *= 2;
461		a = xrealloc(p->a, p->cap * sizeof *p->a);
462		if (!a)
463			return 0;
464		p->a = a;
465	}
466	a = p->a + p->len++;
467	*a = tre_mem_calloc(p->mem, sizeof **a);
468	return *a;
469}
470
471static int add_icase_literals(struct literals *ls, int min, int max)
472{
473	tre_literal_t *lit;
474	int b, e, c;
475	for (c=min; c<=max; ) {
476		/* assumes islower(c) and isupper(c) are exclusive
477		   and toupper(c)!=c if islower(c).
478		   multiple opposite case characters are not supported */
479		if (tre_islower(c)) {
480			b = e = tre_toupper(c);
481			for (c++, e++; c<=max; c++, e++)
482				if (tre_toupper(c) != e) break;
483		} else if (tre_isupper(c)) {
484			b = e = tre_tolower(c);
485			for (c++, e++; c<=max; c++, e++)
486				if (tre_tolower(c) != e) break;
487		} else {
488			c++;
489			continue;
490		}
491		lit = tre_new_lit(ls);
492		if (!lit)
493			return -1;
494		lit->code_min = b;
495		lit->code_max = e-1;
496		lit->position = -1;
497	}
498	return 0;
499}
500
501
502/* Maximum number of character classes in a negated bracket expression. */
503#define MAX_NEG_CLASSES 64
504
505struct neg {
506	int negate;
507	int len;
508	tre_ctype_t a[MAX_NEG_CLASSES];
509};
510
511// TODO: parse bracket into a set of non-overlapping [lo,hi] ranges
512
513/*
514bracket grammar:
515Bracket  =  '[' List ']'  |  '[^' List ']'
516List     =  Term  |  List Term
517Term     =  Char  |  Range  |  Chclass  |  Eqclass
518Range    =  Char '-' Char  |  Char '-' '-'
519Char     =  Coll  |  coll_single
520Meta     =  ']'  |  '-'
521Coll     =  '[.' coll_single '.]'  |  '[.' coll_multi '.]'  |  '[.' Meta '.]'
522Eqclass  =  '[=' coll_single '=]'  |  '[=' coll_multi '=]'
523Chclass  =  '[:' class ':]'
524
525coll_single is a single char collating element but it can be
526 '-' only at the beginning or end of a List and
527 ']' only at the beginning of a List and
528 '^' anywhere except after the openning '['
529*/
530
531static reg_errcode_t parse_bracket_terms(tre_parse_ctx_t *ctx, const char *s, struct literals *ls, struct neg *neg)
532{
533	const char *start = s;
534	tre_ctype_t class;
535	int min, max;
536	wchar_t wc;
537	int len;
538
539	for (;;) {
540		class = 0;
541		len = mbtowc(&wc, s, -1);
542		if (len <= 0)
543			return *s ? REG_BADPAT : REG_EBRACK;
544		if (*s == ']' && s != start) {
545			ctx->s = s+1;
546			return REG_OK;
547		}
548		if (*s == '-' && s != start && s[1] != ']' &&
549		    /* extension: [a-z--@] is accepted as [a-z]|[--@] */
550		    (s[1] != '-' || s[2] == ']'))
551			return REG_ERANGE;
552		if (*s == '[' && (s[1] == '.' || s[1] == '='))
553			/* collating symbols and equivalence classes are not supported */
554			return REG_ECOLLATE;
555		if (*s == '[' && s[1] == ':') {
556			char tmp[CHARCLASS_NAME_MAX+1];
557			s += 2;
558			for (len=0; len < CHARCLASS_NAME_MAX && s[len]; len++) {
559				if (s[len] == ':') {
560					memcpy(tmp, s, len);
561					tmp[len] = 0;
562					class = tre_ctype(tmp);
563					break;
564				}
565			}
566			if (!class || s[len+1] != ']')
567				return REG_ECTYPE;
568			min = 0;
569			max = TRE_CHAR_MAX;
570			s += len+2;
571		} else {
572			min = max = wc;
573			s += len;
574			if (*s == '-' && s[1] != ']') {
575				s++;
576				len = mbtowc(&wc, s, -1);
577				max = wc;
578				/* XXX - Should use collation order instead of
579				   encoding values in character ranges. */
580				if (len <= 0 || min > max)
581					return REG_ERANGE;
582				s += len;
583			}
584		}
585
586		if (class && neg->negate) {
587			if (neg->len >= MAX_NEG_CLASSES)
588				return REG_ESPACE;
589			neg->a[neg->len++] = class;
590		} else  {
591			tre_literal_t *lit = tre_new_lit(ls);
592			if (!lit)
593				return REG_ESPACE;
594			lit->code_min = min;
595			lit->code_max = max;
596			lit->class = class;
597			lit->position = -1;
598
599			/* Add opposite-case codepoints if REG_ICASE is present.
600			   It seems that POSIX requires that bracket negation
601			   should happen before case-folding, but most practical
602			   implementations do it the other way around. Changing
603			   the order would need efficient representation of
604			   case-fold ranges and bracket range sets even with
605			   simple patterns so this is ok for now. */
606			if (ctx->cflags & REG_ICASE && !class)
607				if (add_icase_literals(ls, min, max))
608					return REG_ESPACE;
609		}
610	}
611}
612
613static reg_errcode_t parse_bracket(tre_parse_ctx_t *ctx, const char *s)
614{
615	int i, max, min, negmax, negmin;
616	tre_ast_node_t *node = 0, *n;
617	tre_ctype_t *nc = 0;
618	tre_literal_t *lit;
619	struct literals ls;
620	struct neg neg;
621	reg_errcode_t err;
622
623	ls.mem = ctx->mem;
624	ls.len = 0;
625	ls.cap = 32;
626	ls.a = xmalloc(ls.cap * sizeof *ls.a);
627	if (!ls.a)
628		return REG_ESPACE;
629	neg.len = 0;
630	neg.negate = *s == '^';
631	if (neg.negate)
632		s++;
633
634	err = parse_bracket_terms(ctx, s, &ls, &neg);
635	if (err != REG_OK)
636		goto parse_bracket_done;
637
638	if (neg.negate) {
639		/*
640		 * With REG_NEWLINE, POSIX requires that newlines are not matched by
641		 * any form of a non-matching list.
642		 */
643		if (ctx->cflags & REG_NEWLINE) {
644			lit = tre_new_lit(&ls);
645			if (!lit) {
646				err = REG_ESPACE;
647				goto parse_bracket_done;
648			}
649			lit->code_min = '\n';
650			lit->code_max = '\n';
651			lit->position = -1;
652		}
653		/* Sort the array if we need to negate it. */
654		qsort(ls.a, ls.len, sizeof *ls.a, tre_compare_lit);
655		/* extra lit for the last negated range */
656		lit = tre_new_lit(&ls);
657		if (!lit) {
658			err = REG_ESPACE;
659			goto parse_bracket_done;
660		}
661		lit->code_min = TRE_CHAR_MAX+1;
662		lit->code_max = TRE_CHAR_MAX+1;
663		lit->position = -1;
664		/* negated classes */
665		if (neg.len) {
666			nc = tre_mem_alloc(ctx->mem, (neg.len+1)*sizeof *neg.a);
667			if (!nc) {
668				err = REG_ESPACE;
669				goto parse_bracket_done;
670			}
671			memcpy(nc, neg.a, neg.len*sizeof *neg.a);
672			nc[neg.len] = 0;
673		}
674	}
675
676	/* Build a union of the items in the array, negated if necessary. */
677	negmax = negmin = 0;
678	for (i = 0; i < ls.len; i++) {
679		lit = ls.a[i];
680		min = lit->code_min;
681		max = lit->code_max;
682		if (neg.negate) {
683			if (min <= negmin) {
684				/* Overlap. */
685				negmin = MAX(max + 1, negmin);
686				continue;
687			}
688			negmax = min - 1;
689			lit->code_min = negmin;
690			lit->code_max = negmax;
691			negmin = max + 1;
692		}
693		lit->position = ctx->position;
694		lit->neg_classes = nc;
695		n = tre_ast_new_node(ctx->mem, LITERAL, lit);
696		node = tre_ast_new_union(ctx->mem, node, n);
697		if (!node) {
698			err = REG_ESPACE;
699			break;
700		}
701	}
702
703parse_bracket_done:
704	xfree(ls.a);
705	ctx->position++;
706	ctx->n = node;
707	return err;
708}
709
710static const char *parse_dup_count(const char *s, int *n)
711{
712	*n = -1;
713	if (!isdigit(*s))
714		return s;
715	*n = 0;
716	for (;;) {
717		*n = 10 * *n + (*s - '0');
718		s++;
719		if (!isdigit(*s) || *n > RE_DUP_MAX)
720			break;
721	}
722	return s;
723}
724
725static const char *parse_dup(const char *s, int ere, int *pmin, int *pmax)
726{
727	int min, max;
728
729	s = parse_dup_count(s, &min);
730	if (*s == ',')
731		s = parse_dup_count(s+1, &max);
732	else
733		max = min;
734
735	if (
736		(max < min && max >= 0) ||
737		max > RE_DUP_MAX ||
738		min > RE_DUP_MAX ||
739		min < 0 ||
740		(!ere && *s++ != '\\') ||
741		*s++ != '}'
742	)
743		return 0;
744	*pmin = min;
745	*pmax = max;
746	return s;
747}
748
749static int hexval(unsigned c)
750{
751	if (c-'0'<10) return c-'0';
752	c |= 32;
753	if (c-'a'<6) return c-'a'+10;
754	return -1;
755}
756
757static reg_errcode_t marksub(tre_parse_ctx_t *ctx, tre_ast_node_t *node, int subid)
758{
759	if (node->submatch_id >= 0) {
760		tre_ast_node_t *n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
761		if (!n)
762			return REG_ESPACE;
763		n = tre_ast_new_catenation(ctx->mem, n, node);
764		if (!n)
765			return REG_ESPACE;
766		n->num_submatches = node->num_submatches;
767		node = n;
768	}
769	node->submatch_id = subid;
770	node->num_submatches++;
771	ctx->n = node;
772	return REG_OK;
773}
774
775/*
776BRE grammar:
777Regex  =  Branch  |  '^'  |  '$'  |  '^$'  |  '^' Branch  |  Branch '$'  |  '^' Branch '$'
778Branch =  Atom  |  Branch Atom
779Atom   =  char  |  quoted_char  |  '.'  |  Bracket  |  Atom Dup  |  '\(' Branch '\)'  |  back_ref
780Dup    =  '*'  |  '\{' Count '\}'  |  '\{' Count ',\}'  |  '\{' Count ',' Count '\}'
781
782(leading ^ and trailing $ in a sub expr may be an anchor or literal as well)
783
784ERE grammar:
785Regex  =  Branch  |  Regex '|' Branch
786Branch =  Atom  |  Branch Atom
787Atom   =  char  |  quoted_char  |  '.'  |  Bracket  |  Atom Dup  |  '(' Regex ')'  |  '^'  |  '$'
788Dup    =  '*'  |  '+'  |  '?'  |  '{' Count '}'  |  '{' Count ',}'  |  '{' Count ',' Count '}'
789
790(a*+?, ^*, $+, \X, {, (|a) are unspecified)
791*/
792
793static reg_errcode_t parse_atom(tre_parse_ctx_t *ctx, const char *s)
794{
795	int len, ere = ctx->cflags & REG_EXTENDED;
796	const char *p;
797	tre_ast_node_t *node;
798	wchar_t wc;
799	switch (*s) {
800	case '[':
801		return parse_bracket(ctx, s+1);
802	case '\\':
803		p = tre_expand_macro(s+1);
804		if (p) {
805			/* assume \X expansion is a single atom */
806			reg_errcode_t err = parse_atom(ctx, p);
807			ctx->s = s+2;
808			return err;
809		}
810		/* extensions: \b, \B, \<, \>, \xHH \x{HHHH} */
811		switch (*++s) {
812		case 0:
813			return REG_EESCAPE;
814		case 'b':
815			node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB, -1);
816			break;
817		case 'B':
818			node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB_NEG, -1);
819			break;
820		case '<':
821			node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOW, -1);
822			break;
823		case '>':
824			node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOW, -1);
825			break;
826		case 'x':
827			s++;
828			int i, v = 0, c;
829			len = 2;
830			if (*s == '{') {
831				len = 8;
832				s++;
833			}
834			for (i=0; i<len && v<0x110000; i++) {
835				c = hexval(s[i]);
836				if (c < 0) break;
837				v = 16*v + c;
838			}
839			s += i;
840			if (len == 8) {
841				if (*s != '}')
842					return REG_EBRACE;
843				s++;
844			}
845			node = tre_ast_new_literal(ctx->mem, v, v, ctx->position++);
846			s--;
847			break;
848		case '{':
849		case '+':
850		case '?':
851			/* extension: treat \+, \? as repetitions in BRE */
852			/* reject repetitions after empty expression in BRE */
853			if (!ere)
854				return REG_BADRPT;
855		case '|':
856			/* extension: treat \| as alternation in BRE */
857			if (!ere) {
858				node = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
859				s--;
860				goto end;
861			}
862			/* fallthrough */
863		default:
864			if (!ere && (unsigned)*s-'1' < 9) {
865				/* back reference */
866				int val = *s - '0';
867				node = tre_ast_new_literal(ctx->mem, BACKREF, val, ctx->position++);
868				ctx->max_backref = MAX(val, ctx->max_backref);
869			} else {
870				/* extension: accept unknown escaped char
871				   as a literal */
872				goto parse_literal;
873			}
874		}
875		s++;
876		break;
877	case '.':
878		if (ctx->cflags & REG_NEWLINE) {
879			tre_ast_node_t *tmp1, *tmp2;
880			tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n'-1, ctx->position++);
881			tmp2 = tre_ast_new_literal(ctx->mem, '\n'+1, TRE_CHAR_MAX, ctx->position++);
882			if (tmp1 && tmp2)
883				node = tre_ast_new_union(ctx->mem, tmp1, tmp2);
884			else
885				node = 0;
886		} else {
887			node = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, ctx->position++);
888		}
889		s++;
890		break;
891	case '^':
892		/* '^' has a special meaning everywhere in EREs, and at beginning of BRE. */
893		if (!ere && s != ctx->start)
894			goto parse_literal;
895		node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOL, -1);
896		s++;
897		break;
898	case '$':
899		/* '$' is special everywhere in EREs, and at the end of a BRE subexpression. */
900		if (!ere && s[1] && (s[1]!='\\'|| (s[2]!=')' && s[2]!='|')))
901			goto parse_literal;
902		node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOL, -1);
903		s++;
904		break;
905	case '*':
906	case '{':
907	case '+':
908	case '?':
909		/* reject repetitions after empty expression in ERE */
910		if (ere)
911			return REG_BADRPT;
912	case '|':
913		if (!ere)
914			goto parse_literal;
915	case 0:
916		node = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
917		break;
918	default:
919parse_literal:
920		len = mbtowc(&wc, s, -1);
921		if (len < 0)
922			return REG_BADPAT;
923		if (ctx->cflags & REG_ICASE && (tre_isupper(wc) || tre_islower(wc))) {
924			tre_ast_node_t *tmp1, *tmp2;
925			/* multiple opposite case characters are not supported */
926			tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(wc), tre_toupper(wc), ctx->position);
927			tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(wc), tre_tolower(wc), ctx->position);
928			if (tmp1 && tmp2)
929				node = tre_ast_new_union(ctx->mem, tmp1, tmp2);
930			else
931				node = 0;
932		} else {
933			node = tre_ast_new_literal(ctx->mem, wc, wc, ctx->position);
934		}
935		ctx->position++;
936		s += len;
937		break;
938	}
939end:
940	if (!node)
941		return REG_ESPACE;
942	ctx->n = node;
943	ctx->s = s;
944	return REG_OK;
945}
946
947#define PUSHPTR(err, s, v) do { \
948	if ((err = tre_stack_push_voidptr(s, v)) != REG_OK) \
949		return err; \
950} while(0)
951
952#define PUSHINT(err, s, v) do { \
953	if ((err = tre_stack_push_int(s, v)) != REG_OK) \
954		return err; \
955} while(0)
956
957static reg_errcode_t tre_parse(tre_parse_ctx_t *ctx)
958{
959	tre_ast_node_t *nbranch=0, *nunion=0;
960	int ere = ctx->cflags & REG_EXTENDED;
961	const char *s = ctx->start;
962	int subid = 0;
963	int depth = 0;
964	reg_errcode_t err;
965	tre_stack_t *stack = ctx->stack;
966
967	PUSHINT(err, stack, subid++);
968	for (;;) {
969		if ((!ere && *s == '\\' && s[1] == '(') ||
970		    (ere && *s == '(')) {
971			PUSHPTR(err, stack, nunion);
972			PUSHPTR(err, stack, nbranch);
973			PUSHINT(err, stack, subid++);
974			s++;
975			if (!ere)
976				s++;
977			depth++;
978			nbranch = nunion = 0;
979			ctx->start = s;
980			continue;
981		}
982		if ((!ere && *s == '\\' && s[1] == ')') ||
983		    (ere && *s == ')' && depth)) {
984			ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
985			if (!ctx->n)
986				return REG_ESPACE;
987		} else {
988			err = parse_atom(ctx, s);
989			if (err != REG_OK)
990				return err;
991			s = ctx->s;
992		}
993
994	parse_iter:
995		for (;;) {
996			int min, max;
997
998			if (*s!='\\' && *s!='*') {
999				if (!ere)
1000					break;
1001				if (*s!='+' && *s!='?' && *s!='{')
1002					break;
1003			}
1004			if (*s=='\\' && ere)
1005				break;
1006			/* extension: treat \+, \? as repetitions in BRE */
1007			if (*s=='\\' && s[1]!='+' && s[1]!='?' && s[1]!='{')
1008				break;
1009			if (*s=='\\')
1010				s++;
1011
1012			/* handle ^* at the start of a BRE. */
1013			if (!ere && s==ctx->start+1 && s[-1]=='^')
1014				break;
1015
1016			/* extension: multiple consecutive *+?{,} is unspecified,
1017			   but (a+)+ has to be supported so accepting a++ makes
1018			   sense, note however that the RE_DUP_MAX limit can be
1019			   circumvented: (a{255}){255} uses a lot of memory.. */
1020			if (*s=='{') {
1021				s = parse_dup(s+1, ere, &min, &max);
1022				if (!s)
1023					return REG_BADBR;
1024			} else {
1025				min=0;
1026				max=-1;
1027				if (*s == '+')
1028					min = 1;
1029				if (*s == '?')
1030					max = 1;
1031				s++;
1032			}
1033			if (max == 0)
1034				ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1035			else
1036				ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0);
1037			if (!ctx->n)
1038				return REG_ESPACE;
1039		}
1040
1041		nbranch = tre_ast_new_catenation(ctx->mem, nbranch, ctx->n);
1042		if ((ere && *s == '|') ||
1043		    (ere && *s == ')' && depth) ||
1044		    (!ere && *s == '\\' && s[1] == ')') ||
1045		    /* extension: treat \| as alternation in BRE */
1046		    (!ere && *s == '\\' && s[1] == '|') ||
1047		    !*s) {
1048			/* extension: empty branch is unspecified (), (|a), (a|)
1049			   here they are not rejected but match on empty string */
1050			int c = *s;
1051			nunion = tre_ast_new_union(ctx->mem, nunion, nbranch);
1052			nbranch = 0;
1053
1054			if (c == '\\' && s[1] == '|') {
1055				s+=2;
1056				ctx->start = s;
1057			} else if (c == '|') {
1058				s++;
1059				ctx->start = s;
1060			} else {
1061				if (c == '\\') {
1062					if (!depth) return REG_EPAREN;
1063					s+=2;
1064				} else if (c == ')')
1065					s++;
1066				depth--;
1067				err = marksub(ctx, nunion, tre_stack_pop_int(stack));
1068				if (err != REG_OK)
1069					return err;
1070				if (!c && depth<0) {
1071					ctx->submatch_id = subid;
1072					return REG_OK;
1073				}
1074				if (!c || depth<0)
1075					return REG_EPAREN;
1076				nbranch = tre_stack_pop_voidptr(stack);
1077				nunion = tre_stack_pop_voidptr(stack);
1078				goto parse_iter;
1079			}
1080		}
1081	}
1082}
1083
1084
1085/***********************************************************************
1086 from tre-compile.c
1087***********************************************************************/
1088
1089
1090/*
1091  TODO:
1092   - Fix tre_ast_to_tnfa() to recurse using a stack instead of recursive
1093     function calls.
1094*/
1095
1096/*
1097  Algorithms to setup tags so that submatch addressing can be done.
1098*/
1099
1100
1101/* Inserts a catenation node to the root of the tree given in `node'.
1102   As the left child a new tag with number `tag_id' to `node' is added,
1103   and the right child is the old root. */
1104static reg_errcode_t
1105tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1106{
1107  tre_catenation_t *c;
1108
1109  c = tre_mem_alloc(mem, sizeof(*c));
1110  if (c == NULL)
1111    return REG_ESPACE;
1112  c->left = tre_ast_new_literal(mem, TAG, tag_id, -1);
1113  if (c->left == NULL)
1114    return REG_ESPACE;
1115  c->right = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
1116  if (c->right == NULL)
1117    return REG_ESPACE;
1118
1119  c->right->obj = node->obj;
1120  c->right->type = node->type;
1121  c->right->nullable = -1;
1122  c->right->submatch_id = -1;
1123  c->right->firstpos = NULL;
1124  c->right->lastpos = NULL;
1125  c->right->num_tags = 0;
1126  c->right->num_submatches = 0;
1127  node->obj = c;
1128  node->type = CATENATION;
1129  return REG_OK;
1130}
1131
1132/* Inserts a catenation node to the root of the tree given in `node'.
1133   As the right child a new tag with number `tag_id' to `node' is added,
1134   and the left child is the old root. */
1135static reg_errcode_t
1136tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1137{
1138  tre_catenation_t *c;
1139
1140  c = tre_mem_alloc(mem, sizeof(*c));
1141  if (c == NULL)
1142    return REG_ESPACE;
1143  c->right = tre_ast_new_literal(mem, TAG, tag_id, -1);
1144  if (c->right == NULL)
1145    return REG_ESPACE;
1146  c->left = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
1147  if (c->left == NULL)
1148    return REG_ESPACE;
1149
1150  c->left->obj = node->obj;
1151  c->left->type = node->type;
1152  c->left->nullable = -1;
1153  c->left->submatch_id = -1;
1154  c->left->firstpos = NULL;
1155  c->left->lastpos = NULL;
1156  c->left->num_tags = 0;
1157  c->left->num_submatches = 0;
1158  node->obj = c;
1159  node->type = CATENATION;
1160  return REG_OK;
1161}
1162
1163typedef enum {
1164  ADDTAGS_RECURSE,
1165  ADDTAGS_AFTER_ITERATION,
1166  ADDTAGS_AFTER_UNION_LEFT,
1167  ADDTAGS_AFTER_UNION_RIGHT,
1168  ADDTAGS_AFTER_CAT_LEFT,
1169  ADDTAGS_AFTER_CAT_RIGHT,
1170  ADDTAGS_SET_SUBMATCH_END
1171} tre_addtags_symbol_t;
1172
1173
1174typedef struct {
1175  int tag;
1176  int next_tag;
1177} tre_tag_states_t;
1178
1179
1180/* Go through `regset' and set submatch data for submatches that are
1181   using this tag. */
1182static void
1183tre_purge_regset(int *regset, tre_tnfa_t *tnfa, int tag)
1184{
1185  int i;
1186
1187  for (i = 0; regset[i] >= 0; i++)
1188    {
1189      int id = regset[i] / 2;
1190      int start = !(regset[i] % 2);
1191      if (start)
1192	tnfa->submatch_data[id].so_tag = tag;
1193      else
1194	tnfa->submatch_data[id].eo_tag = tag;
1195    }
1196  regset[0] = -1;
1197}
1198
1199
1200/* Adds tags to appropriate locations in the parse tree in `tree', so that
1201   subexpressions marked for submatch addressing can be traced. */
1202static reg_errcode_t
1203tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
1204	     tre_tnfa_t *tnfa)
1205{
1206  reg_errcode_t status = REG_OK;
1207  tre_addtags_symbol_t symbol;
1208  tre_ast_node_t *node = tree; /* Tree node we are currently looking at. */
1209  int bottom = tre_stack_num_objects(stack);
1210  /* True for first pass (counting number of needed tags) */
1211  int first_pass = (mem == NULL || tnfa == NULL);
1212  int *regset, *orig_regset;
1213  int num_tags = 0; /* Total number of tags. */
1214  int num_minimals = 0;	 /* Number of special minimal tags. */
1215  int tag = 0;	    /* The tag that is to be added next. */
1216  int next_tag = 1; /* Next tag to use after this one. */
1217  int *parents;	    /* Stack of submatches the current submatch is
1218		       contained in. */
1219  int minimal_tag = -1; /* Tag that marks the beginning of a minimal match. */
1220  tre_tag_states_t *saved_states;
1221
1222  tre_tag_direction_t direction = TRE_TAG_MINIMIZE;
1223  if (!first_pass)
1224    {
1225      tnfa->end_tag = 0;
1226      tnfa->minimal_tags[0] = -1;
1227    }
1228
1229  regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches + 1) * 2));
1230  if (regset == NULL)
1231    return REG_ESPACE;
1232  regset[0] = -1;
1233  orig_regset = regset;
1234
1235  parents = xmalloc(sizeof(*parents) * (tnfa->num_submatches + 1));
1236  if (parents == NULL)
1237    {
1238      xfree(regset);
1239      return REG_ESPACE;
1240    }
1241  parents[0] = -1;
1242
1243  saved_states = xmalloc(sizeof(*saved_states) * (tnfa->num_submatches + 1));
1244  if (saved_states == NULL)
1245    {
1246      xfree(regset);
1247      xfree(parents);
1248      return REG_ESPACE;
1249    }
1250  else
1251    {
1252      unsigned int i;
1253      for (i = 0; i <= tnfa->num_submatches; i++)
1254	saved_states[i].tag = -1;
1255    }
1256
1257  STACK_PUSH(stack, voidptr, node);
1258  STACK_PUSH(stack, int, ADDTAGS_RECURSE);
1259
1260  while (tre_stack_num_objects(stack) > bottom)
1261    {
1262      if (status != REG_OK)
1263	break;
1264
1265      symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack);
1266      switch (symbol)
1267	{
1268
1269	case ADDTAGS_SET_SUBMATCH_END:
1270	  {
1271	    int id = tre_stack_pop_int(stack);
1272	    int i;
1273
1274	    /* Add end of this submatch to regset. */
1275	    for (i = 0; regset[i] >= 0; i++);
1276	    regset[i] = id * 2 + 1;
1277	    regset[i + 1] = -1;
1278
1279	    /* Pop this submatch from the parents stack. */
1280	    for (i = 0; parents[i] >= 0; i++);
1281	    parents[i - 1] = -1;
1282	    break;
1283	  }
1284
1285	case ADDTAGS_RECURSE:
1286	  node = tre_stack_pop_voidptr(stack);
1287
1288	  if (node->submatch_id >= 0)
1289	    {
1290	      int id = node->submatch_id;
1291	      int i;
1292
1293
1294	      /* Add start of this submatch to regset. */
1295	      for (i = 0; regset[i] >= 0; i++);
1296	      regset[i] = id * 2;
1297	      regset[i + 1] = -1;
1298
1299	      if (!first_pass)
1300		{
1301		  for (i = 0; parents[i] >= 0; i++);
1302		  tnfa->submatch_data[id].parents = NULL;
1303		  if (i > 0)
1304		    {
1305		      int *p = xmalloc(sizeof(*p) * (i + 1));
1306		      if (p == NULL)
1307			{
1308			  status = REG_ESPACE;
1309			  break;
1310			}
1311		      assert(tnfa->submatch_data[id].parents == NULL);
1312		      tnfa->submatch_data[id].parents = p;
1313		      for (i = 0; parents[i] >= 0; i++)
1314			p[i] = parents[i];
1315		      p[i] = -1;
1316		    }
1317		}
1318
1319	      /* Add end of this submatch to regset after processing this
1320		 node. */
1321	      STACK_PUSHX(stack, int, node->submatch_id);
1322	      STACK_PUSHX(stack, int, ADDTAGS_SET_SUBMATCH_END);
1323	    }
1324
1325	  switch (node->type)
1326	    {
1327	    case LITERAL:
1328	      {
1329		tre_literal_t *lit = node->obj;
1330
1331		if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1332		  {
1333		    int i;
1334		    if (regset[0] >= 0)
1335		      {
1336			/* Regset is not empty, so add a tag before the
1337			   literal or backref. */
1338			if (!first_pass)
1339			  {
1340			    status = tre_add_tag_left(mem, node, tag);
1341			    tnfa->tag_directions[tag] = direction;
1342			    if (minimal_tag >= 0)
1343			      {
1344				for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1345				tnfa->minimal_tags[i] = tag;
1346				tnfa->minimal_tags[i + 1] = minimal_tag;
1347				tnfa->minimal_tags[i + 2] = -1;
1348				minimal_tag = -1;
1349				num_minimals++;
1350			      }
1351			    tre_purge_regset(regset, tnfa, tag);
1352			  }
1353			else
1354			  {
1355			    node->num_tags = 1;
1356			  }
1357
1358			regset[0] = -1;
1359			tag = next_tag;
1360			num_tags++;
1361			next_tag++;
1362		      }
1363		  }
1364		else
1365		  {
1366		    assert(!IS_TAG(lit));
1367		  }
1368		break;
1369	      }
1370	    case CATENATION:
1371	      {
1372		tre_catenation_t *cat = node->obj;
1373		tre_ast_node_t *left = cat->left;
1374		tre_ast_node_t *right = cat->right;
1375		int reserved_tag = -1;
1376
1377
1378		/* After processing right child. */
1379		STACK_PUSHX(stack, voidptr, node);
1380		STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_RIGHT);
1381
1382		/* Process right child. */
1383		STACK_PUSHX(stack, voidptr, right);
1384		STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1385
1386		/* After processing left child. */
1387		STACK_PUSHX(stack, int, next_tag + left->num_tags);
1388		if (left->num_tags > 0 && right->num_tags > 0)
1389		  {
1390		    /* Reserve the next tag to the right child. */
1391		    reserved_tag = next_tag;
1392		    next_tag++;
1393		  }
1394		STACK_PUSHX(stack, int, reserved_tag);
1395		STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_LEFT);
1396
1397		/* Process left child. */
1398		STACK_PUSHX(stack, voidptr, left);
1399		STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1400
1401		}
1402	      break;
1403	    case ITERATION:
1404	      {
1405		tre_iteration_t *iter = node->obj;
1406
1407		if (first_pass)
1408		  {
1409		    STACK_PUSHX(stack, int, regset[0] >= 0 || iter->minimal);
1410		  }
1411		else
1412		  {
1413		    STACK_PUSHX(stack, int, tag);
1414		    STACK_PUSHX(stack, int, iter->minimal);
1415		  }
1416		STACK_PUSHX(stack, voidptr, node);
1417		STACK_PUSHX(stack, int, ADDTAGS_AFTER_ITERATION);
1418
1419		STACK_PUSHX(stack, voidptr, iter->arg);
1420		STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1421
1422		/* Regset is not empty, so add a tag here. */
1423		if (regset[0] >= 0 || iter->minimal)
1424		  {
1425		    if (!first_pass)
1426		      {
1427			int i;
1428			status = tre_add_tag_left(mem, node, tag);
1429			if (iter->minimal)
1430			  tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;
1431			else
1432			  tnfa->tag_directions[tag] = direction;
1433			if (minimal_tag >= 0)
1434			  {
1435			    for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1436			    tnfa->minimal_tags[i] = tag;
1437			    tnfa->minimal_tags[i + 1] = minimal_tag;
1438			    tnfa->minimal_tags[i + 2] = -1;
1439			    minimal_tag = -1;
1440			    num_minimals++;
1441			  }
1442			tre_purge_regset(regset, tnfa, tag);
1443		      }
1444
1445		    regset[0] = -1;
1446		    tag = next_tag;
1447		    num_tags++;
1448		    next_tag++;
1449		  }
1450		direction = TRE_TAG_MINIMIZE;
1451	      }
1452	      break;
1453	    case UNION:
1454	      {
1455		tre_union_t *uni = node->obj;
1456		tre_ast_node_t *left = uni->left;
1457		tre_ast_node_t *right = uni->right;
1458		int left_tag;
1459		int right_tag;
1460
1461		if (regset[0] >= 0)
1462		  {
1463		    left_tag = next_tag;
1464		    right_tag = next_tag + 1;
1465		  }
1466		else
1467		  {
1468		    left_tag = tag;
1469		    right_tag = next_tag;
1470		  }
1471
1472		/* After processing right child. */
1473		STACK_PUSHX(stack, int, right_tag);
1474		STACK_PUSHX(stack, int, left_tag);
1475		STACK_PUSHX(stack, voidptr, regset);
1476		STACK_PUSHX(stack, int, regset[0] >= 0);
1477		STACK_PUSHX(stack, voidptr, node);
1478		STACK_PUSHX(stack, voidptr, right);
1479		STACK_PUSHX(stack, voidptr, left);
1480		STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_RIGHT);
1481
1482		/* Process right child. */
1483		STACK_PUSHX(stack, voidptr, right);
1484		STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1485
1486		/* After processing left child. */
1487		STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_LEFT);
1488
1489		/* Process left child. */
1490		STACK_PUSHX(stack, voidptr, left);
1491		STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1492
1493		/* Regset is not empty, so add a tag here. */
1494		if (regset[0] >= 0)
1495		  {
1496		    if (!first_pass)
1497		      {
1498			int i;
1499			status = tre_add_tag_left(mem, node, tag);
1500			tnfa->tag_directions[tag] = direction;
1501			if (minimal_tag >= 0)
1502			  {
1503			    for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1504			    tnfa->minimal_tags[i] = tag;
1505			    tnfa->minimal_tags[i + 1] = minimal_tag;
1506			    tnfa->minimal_tags[i + 2] = -1;
1507			    minimal_tag = -1;
1508			    num_minimals++;
1509			  }
1510			tre_purge_regset(regset, tnfa, tag);
1511		      }
1512
1513		    regset[0] = -1;
1514		    tag = next_tag;
1515		    num_tags++;
1516		    next_tag++;
1517		  }
1518
1519		if (node->num_submatches > 0)
1520		  {
1521		    /* The next two tags are reserved for markers. */
1522		    next_tag++;
1523		    tag = next_tag;
1524		    next_tag++;
1525		  }
1526
1527		break;
1528	      }
1529	    }
1530
1531	  if (node->submatch_id >= 0)
1532	    {
1533	      int i;
1534	      /* Push this submatch on the parents stack. */
1535	      for (i = 0; parents[i] >= 0; i++);
1536	      parents[i] = node->submatch_id;
1537	      parents[i + 1] = -1;
1538	    }
1539
1540	  break; /* end case: ADDTAGS_RECURSE */
1541
1542	case ADDTAGS_AFTER_ITERATION:
1543	  {
1544	    int minimal = 0;
1545	    int enter_tag;
1546	    node = tre_stack_pop_voidptr(stack);
1547	    if (first_pass)
1548	      {
1549		node->num_tags = ((tre_iteration_t *)node->obj)->arg->num_tags
1550		  + tre_stack_pop_int(stack);
1551		minimal_tag = -1;
1552	      }
1553	    else
1554	      {
1555		minimal = tre_stack_pop_int(stack);
1556		enter_tag = tre_stack_pop_int(stack);
1557		if (minimal)
1558		  minimal_tag = enter_tag;
1559	      }
1560
1561	    if (!first_pass)
1562	      {
1563		if (minimal)
1564		  direction = TRE_TAG_MINIMIZE;
1565		else
1566		  direction = TRE_TAG_MAXIMIZE;
1567	      }
1568	    break;
1569	  }
1570
1571	case ADDTAGS_AFTER_CAT_LEFT:
1572	  {
1573	    int new_tag = tre_stack_pop_int(stack);
1574	    next_tag = tre_stack_pop_int(stack);
1575	    if (new_tag >= 0)
1576	      {
1577		tag = new_tag;
1578	      }
1579	    break;
1580	  }
1581
1582	case ADDTAGS_AFTER_CAT_RIGHT:
1583	  node = tre_stack_pop_voidptr(stack);
1584	  if (first_pass)
1585	    node->num_tags = ((tre_catenation_t *)node->obj)->left->num_tags
1586	      + ((tre_catenation_t *)node->obj)->right->num_tags;
1587	  break;
1588
1589	case ADDTAGS_AFTER_UNION_LEFT:
1590	  /* Lift the bottom of the `regset' array so that when processing
1591	     the right operand the items currently in the array are
1592	     invisible.	 The original bottom was saved at ADDTAGS_UNION and
1593	     will be restored at ADDTAGS_AFTER_UNION_RIGHT below. */
1594	  while (*regset >= 0)
1595	    regset++;
1596	  break;
1597
1598	case ADDTAGS_AFTER_UNION_RIGHT:
1599	  {
1600	    int added_tags, tag_left, tag_right;
1601	    tre_ast_node_t *left = tre_stack_pop_voidptr(stack);
1602	    tre_ast_node_t *right = tre_stack_pop_voidptr(stack);
1603	    node = tre_stack_pop_voidptr(stack);
1604	    added_tags = tre_stack_pop_int(stack);
1605	    if (first_pass)
1606	      {
1607		node->num_tags = ((tre_union_t *)node->obj)->left->num_tags
1608		  + ((tre_union_t *)node->obj)->right->num_tags + added_tags
1609		  + ((node->num_submatches > 0) ? 2 : 0);
1610	      }
1611	    regset = tre_stack_pop_voidptr(stack);
1612	    tag_left = tre_stack_pop_int(stack);
1613	    tag_right = tre_stack_pop_int(stack);
1614
1615	    /* Add tags after both children, the left child gets a smaller
1616	       tag than the right child.  This guarantees that we prefer
1617	       the left child over the right child. */
1618	    /* XXX - This is not always necessary (if the children have
1619	       tags which must be seen for every match of that child). */
1620	    /* XXX - Check if this is the only place where tre_add_tag_right
1621	       is used.	 If so, use tre_add_tag_left (putting the tag before
1622	       the child as opposed after the child) and throw away
1623	       tre_add_tag_right. */
1624	    if (node->num_submatches > 0)
1625	      {
1626		if (!first_pass)
1627		  {
1628		    status = tre_add_tag_right(mem, left, tag_left);
1629		    tnfa->tag_directions[tag_left] = TRE_TAG_MAXIMIZE;
1630		    if (status == REG_OK)
1631		      status = tre_add_tag_right(mem, right, tag_right);
1632		    tnfa->tag_directions[tag_right] = TRE_TAG_MAXIMIZE;
1633		  }
1634		num_tags += 2;
1635	      }
1636	    direction = TRE_TAG_MAXIMIZE;
1637	    break;
1638	  }
1639
1640	default:
1641	  assert(0);
1642	  break;
1643
1644	} /* end switch(symbol) */
1645    } /* end while(tre_stack_num_objects(stack) > bottom) */
1646
1647  if (!first_pass)
1648    tre_purge_regset(regset, tnfa, tag);
1649
1650  if (!first_pass && minimal_tag >= 0)
1651    {
1652      int i;
1653      for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1654      tnfa->minimal_tags[i] = tag;
1655      tnfa->minimal_tags[i + 1] = minimal_tag;
1656      tnfa->minimal_tags[i + 2] = -1;
1657      minimal_tag = -1;
1658      num_minimals++;
1659    }
1660
1661  assert(tree->num_tags == num_tags);
1662  tnfa->end_tag = num_tags;
1663  tnfa->num_tags = num_tags;
1664  tnfa->num_minimals = num_minimals;
1665  xfree(orig_regset);
1666  xfree(parents);
1667  xfree(saved_states);
1668  return status;
1669}
1670
1671
1672
1673/*
1674  AST to TNFA compilation routines.
1675*/
1676
1677typedef enum {
1678  COPY_RECURSE,
1679  COPY_SET_RESULT_PTR
1680} tre_copyast_symbol_t;
1681
1682/* Flags for tre_copy_ast(). */
1683#define COPY_REMOVE_TAGS	 1
1684#define COPY_MAXIMIZE_FIRST_TAG	 2
1685
1686static reg_errcode_t
1687tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
1688	     int flags, int *pos_add, tre_tag_direction_t *tag_directions,
1689	     tre_ast_node_t **copy, int *max_pos)
1690{
1691  reg_errcode_t status = REG_OK;
1692  int bottom = tre_stack_num_objects(stack);
1693  int num_copied = 0;
1694  int first_tag = 1;
1695  tre_ast_node_t **result = copy;
1696  tre_copyast_symbol_t symbol;
1697
1698  STACK_PUSH(stack, voidptr, ast);
1699  STACK_PUSH(stack, int, COPY_RECURSE);
1700
1701  while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
1702    {
1703      tre_ast_node_t *node;
1704      if (status != REG_OK)
1705	break;
1706
1707      symbol = (tre_copyast_symbol_t)tre_stack_pop_int(stack);
1708      switch (symbol)
1709	{
1710	case COPY_SET_RESULT_PTR:
1711	  result = tre_stack_pop_voidptr(stack);
1712	  break;
1713	case COPY_RECURSE:
1714	  node = tre_stack_pop_voidptr(stack);
1715	  switch (node->type)
1716	    {
1717	    case LITERAL:
1718	      {
1719		tre_literal_t *lit = node->obj;
1720		int pos = lit->position;
1721		int min = lit->code_min;
1722		int max = lit->code_max;
1723		if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1724		  {
1725		    /* XXX - e.g. [ab] has only one position but two
1726		       nodes, so we are creating holes in the state space
1727		       here.  Not fatal, just wastes memory. */
1728		    pos += *pos_add;
1729		    num_copied++;
1730		  }
1731		else if (IS_TAG(lit) && (flags & COPY_REMOVE_TAGS))
1732		  {
1733		    /* Change this tag to empty. */
1734		    min = EMPTY;
1735		    max = pos = -1;
1736		  }
1737		else if (IS_TAG(lit) && (flags & COPY_MAXIMIZE_FIRST_TAG)
1738			 && first_tag)
1739		  {
1740		    /* Maximize the first tag. */
1741		    tag_directions[max] = TRE_TAG_MAXIMIZE;
1742		    first_tag = 0;
1743		  }
1744		*result = tre_ast_new_literal(mem, min, max, pos);
1745		if (*result == NULL)
1746		  status = REG_ESPACE;
1747		else {
1748		  tre_literal_t *p = (*result)->obj;
1749		  p->class = lit->class;
1750		  p->neg_classes = lit->neg_classes;
1751		}
1752
1753		if (pos > *max_pos)
1754		  *max_pos = pos;
1755		break;
1756	      }
1757	    case UNION:
1758	      {
1759		tre_union_t *uni = node->obj;
1760		tre_union_t *tmp;
1761		*result = tre_ast_new_union(mem, uni->left, uni->right);
1762		if (*result == NULL)
1763		  {
1764		    status = REG_ESPACE;
1765		    break;
1766		  }
1767		tmp = (*result)->obj;
1768		result = &tmp->left;
1769		STACK_PUSHX(stack, voidptr, uni->right);
1770		STACK_PUSHX(stack, int, COPY_RECURSE);
1771		STACK_PUSHX(stack, voidptr, &tmp->right);
1772		STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
1773		STACK_PUSHX(stack, voidptr, uni->left);
1774		STACK_PUSHX(stack, int, COPY_RECURSE);
1775		break;
1776	      }
1777	    case CATENATION:
1778	      {
1779		tre_catenation_t *cat = node->obj;
1780		tre_catenation_t *tmp;
1781		*result = tre_ast_new_catenation(mem, cat->left, cat->right);
1782		if (*result == NULL)
1783		  {
1784		    status = REG_ESPACE;
1785		    break;
1786		  }
1787		tmp = (*result)->obj;
1788		tmp->left = NULL;
1789		tmp->right = NULL;
1790		result = &tmp->left;
1791
1792		STACK_PUSHX(stack, voidptr, cat->right);
1793		STACK_PUSHX(stack, int, COPY_RECURSE);
1794		STACK_PUSHX(stack, voidptr, &tmp->right);
1795		STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
1796		STACK_PUSHX(stack, voidptr, cat->left);
1797		STACK_PUSHX(stack, int, COPY_RECURSE);
1798		break;
1799	      }
1800	    case ITERATION:
1801	      {
1802		tre_iteration_t *iter = node->obj;
1803		STACK_PUSHX(stack, voidptr, iter->arg);
1804		STACK_PUSHX(stack, int, COPY_RECURSE);
1805		*result = tre_ast_new_iter(mem, iter->arg, iter->min,
1806					   iter->max, iter->minimal);
1807		if (*result == NULL)
1808		  {
1809		    status = REG_ESPACE;
1810		    break;
1811		  }
1812		iter = (*result)->obj;
1813		result = &iter->arg;
1814		break;
1815	      }
1816	    default:
1817	      assert(0);
1818	      break;
1819	    }
1820	  break;
1821	}
1822    }
1823  *pos_add += num_copied;
1824  return status;
1825}
1826
1827typedef enum {
1828  EXPAND_RECURSE,
1829  EXPAND_AFTER_ITER
1830} tre_expand_ast_symbol_t;
1831
1832/* Expands each iteration node that has a finite nonzero minimum or maximum
1833   iteration count to a catenated sequence of copies of the node. */
1834static reg_errcode_t
1835tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
1836	       int *position, tre_tag_direction_t *tag_directions)
1837{
1838  reg_errcode_t status = REG_OK;
1839  int bottom = tre_stack_num_objects(stack);
1840  int pos_add = 0;
1841  int pos_add_total = 0;
1842  int max_pos = 0;
1843  int iter_depth = 0;
1844
1845  STACK_PUSHR(stack, voidptr, ast);
1846  STACK_PUSHR(stack, int, EXPAND_RECURSE);
1847  while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
1848    {
1849      tre_ast_node_t *node;
1850      tre_expand_ast_symbol_t symbol;
1851
1852      if (status != REG_OK)
1853	break;
1854
1855      symbol = (tre_expand_ast_symbol_t)tre_stack_pop_int(stack);
1856      node = tre_stack_pop_voidptr(stack);
1857      switch (symbol)
1858	{
1859	case EXPAND_RECURSE:
1860	  switch (node->type)
1861	    {
1862	    case LITERAL:
1863	      {
1864		tre_literal_t *lit= node->obj;
1865		if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1866		  {
1867		    lit->position += pos_add;
1868		    if (lit->position > max_pos)
1869		      max_pos = lit->position;
1870		  }
1871		break;
1872	      }
1873	    case UNION:
1874	      {
1875		tre_union_t *uni = node->obj;
1876		STACK_PUSHX(stack, voidptr, uni->right);
1877		STACK_PUSHX(stack, int, EXPAND_RECURSE);
1878		STACK_PUSHX(stack, voidptr, uni->left);
1879		STACK_PUSHX(stack, int, EXPAND_RECURSE);
1880		break;
1881	      }
1882	    case CATENATION:
1883	      {
1884		tre_catenation_t *cat = node->obj;
1885		STACK_PUSHX(stack, voidptr, cat->right);
1886		STACK_PUSHX(stack, int, EXPAND_RECURSE);
1887		STACK_PUSHX(stack, voidptr, cat->left);
1888		STACK_PUSHX(stack, int, EXPAND_RECURSE);
1889		break;
1890	      }
1891	    case ITERATION:
1892	      {
1893		tre_iteration_t *iter = node->obj;
1894		STACK_PUSHX(stack, int, pos_add);
1895		STACK_PUSHX(stack, voidptr, node);
1896		STACK_PUSHX(stack, int, EXPAND_AFTER_ITER);
1897		STACK_PUSHX(stack, voidptr, iter->arg);
1898		STACK_PUSHX(stack, int, EXPAND_RECURSE);
1899		/* If we are going to expand this node at EXPAND_AFTER_ITER
1900		   then don't increase the `pos' fields of the nodes now, it
1901		   will get done when expanding. */
1902		if (iter->min > 1 || iter->max > 1)
1903		  pos_add = 0;
1904		iter_depth++;
1905		break;
1906	      }
1907	    default:
1908	      assert(0);
1909	      break;
1910	    }
1911	  break;
1912	case EXPAND_AFTER_ITER:
1913	  {
1914	    tre_iteration_t *iter = node->obj;
1915	    int pos_add_last;
1916	    pos_add = tre_stack_pop_int(stack);
1917	    pos_add_last = pos_add;
1918	    if (iter->min > 1 || iter->max > 1)
1919	      {
1920		tre_ast_node_t *seq1 = NULL, *seq2 = NULL;
1921		int j;
1922		int pos_add_save = pos_add;
1923
1924		/* Create a catenated sequence of copies of the node. */
1925		for (j = 0; j < iter->min; j++)
1926		  {
1927		    tre_ast_node_t *copy;
1928		    /* Remove tags from all but the last copy. */
1929		    int flags = ((j + 1 < iter->min)
1930				 ? COPY_REMOVE_TAGS
1931				 : COPY_MAXIMIZE_FIRST_TAG);
1932		    pos_add_save = pos_add;
1933		    status = tre_copy_ast(mem, stack, iter->arg, flags,
1934					  &pos_add, tag_directions, &copy,
1935					  &max_pos);
1936		    if (status != REG_OK)
1937		      return status;
1938		    if (seq1 != NULL)
1939		      seq1 = tre_ast_new_catenation(mem, seq1, copy);
1940		    else
1941		      seq1 = copy;
1942		    if (seq1 == NULL)
1943		      return REG_ESPACE;
1944		  }
1945
1946		if (iter->max == -1)
1947		  {
1948		    /* No upper limit. */
1949		    pos_add_save = pos_add;
1950		    status = tre_copy_ast(mem, stack, iter->arg, 0,
1951					  &pos_add, NULL, &seq2, &max_pos);
1952		    if (status != REG_OK)
1953		      return status;
1954		    seq2 = tre_ast_new_iter(mem, seq2, 0, -1, 0);
1955		    if (seq2 == NULL)
1956		      return REG_ESPACE;
1957		  }
1958		else
1959		  {
1960		    for (j = iter->min; j < iter->max; j++)
1961		      {
1962			tre_ast_node_t *tmp, *copy;
1963			pos_add_save = pos_add;
1964			status = tre_copy_ast(mem, stack, iter->arg, 0,
1965					      &pos_add, NULL, &copy, &max_pos);
1966			if (status != REG_OK)
1967			  return status;
1968			if (seq2 != NULL)
1969			  seq2 = tre_ast_new_catenation(mem, copy, seq2);
1970			else
1971			  seq2 = copy;
1972			if (seq2 == NULL)
1973			  return REG_ESPACE;
1974			tmp = tre_ast_new_literal(mem, EMPTY, -1, -1);
1975			if (tmp == NULL)
1976			  return REG_ESPACE;
1977			seq2 = tre_ast_new_union(mem, tmp, seq2);
1978			if (seq2 == NULL)
1979			  return REG_ESPACE;
1980		      }
1981		  }
1982
1983		pos_add = pos_add_save;
1984		if (seq1 == NULL)
1985		  seq1 = seq2;
1986		else if (seq2 != NULL)
1987		  seq1 = tre_ast_new_catenation(mem, seq1, seq2);
1988		if (seq1 == NULL)
1989		  return REG_ESPACE;
1990		node->obj = seq1->obj;
1991		node->type = seq1->type;
1992	      }
1993
1994	    iter_depth--;
1995	    pos_add_total += pos_add - pos_add_last;
1996	    if (iter_depth == 0)
1997	      pos_add = pos_add_total;
1998
1999	    break;
2000	  }
2001	default:
2002	  assert(0);
2003	  break;
2004	}
2005    }
2006
2007  *position += pos_add_total;
2008
2009  /* `max_pos' should never be larger than `*position' if the above
2010     code works, but just an extra safeguard let's make sure
2011     `*position' is set large enough so enough memory will be
2012     allocated for the transition table. */
2013  if (max_pos > *position)
2014    *position = max_pos;
2015
2016  return status;
2017}
2018
2019static tre_pos_and_tags_t *
2020tre_set_empty(tre_mem_t mem)
2021{
2022  tre_pos_and_tags_t *new_set;
2023
2024  new_set = tre_mem_calloc(mem, sizeof(*new_set));
2025  if (new_set == NULL)
2026    return NULL;
2027
2028  new_set[0].position = -1;
2029  new_set[0].code_min = -1;
2030  new_set[0].code_max = -1;
2031
2032  return new_set;
2033}
2034
2035static tre_pos_and_tags_t *
2036tre_set_one(tre_mem_t mem, int position, int code_min, int code_max,
2037	    tre_ctype_t class, tre_ctype_t *neg_classes, int backref)
2038{
2039  tre_pos_and_tags_t *new_set;
2040
2041  new_set = tre_mem_calloc(mem, sizeof(*new_set) * 2);
2042  if (new_set == NULL)
2043    return NULL;
2044
2045  new_set[0].position = position;
2046  new_set[0].code_min = code_min;
2047  new_set[0].code_max = code_max;
2048  new_set[0].class = class;
2049  new_set[0].neg_classes = neg_classes;
2050  new_set[0].backref = backref;
2051  new_set[1].position = -1;
2052  new_set[1].code_min = -1;
2053  new_set[1].code_max = -1;
2054
2055  return new_set;
2056}
2057
2058static tre_pos_and_tags_t *
2059tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2,
2060	      int *tags, int assertions)
2061{
2062  int s1, s2, i, j;
2063  tre_pos_and_tags_t *new_set;
2064  int *new_tags;
2065  int num_tags;
2066
2067  for (num_tags = 0; tags != NULL && tags[num_tags] >= 0; num_tags++);
2068  for (s1 = 0; set1[s1].position >= 0; s1++);
2069  for (s2 = 0; set2[s2].position >= 0; s2++);
2070  new_set = tre_mem_calloc(mem, sizeof(*new_set) * (s1 + s2 + 1));
2071  if (!new_set )
2072    return NULL;
2073
2074  for (s1 = 0; set1[s1].position >= 0; s1++)
2075    {
2076      new_set[s1].position = set1[s1].position;
2077      new_set[s1].code_min = set1[s1].code_min;
2078      new_set[s1].code_max = set1[s1].code_max;
2079      new_set[s1].assertions = set1[s1].assertions | assertions;
2080      new_set[s1].class = set1[s1].class;
2081      new_set[s1].neg_classes = set1[s1].neg_classes;
2082      new_set[s1].backref = set1[s1].backref;
2083      if (set1[s1].tags == NULL && tags == NULL)
2084	new_set[s1].tags = NULL;
2085      else
2086	{
2087	  for (i = 0; set1[s1].tags != NULL && set1[s1].tags[i] >= 0; i++);
2088	  new_tags = tre_mem_alloc(mem, (sizeof(*new_tags)
2089					 * (i + num_tags + 1)));
2090	  if (new_tags == NULL)
2091	    return NULL;
2092	  for (j = 0; j < i; j++)
2093	    new_tags[j] = set1[s1].tags[j];
2094	  for (i = 0; i < num_tags; i++)
2095	    new_tags[j + i] = tags[i];
2096	  new_tags[j + i] = -1;
2097	  new_set[s1].tags = new_tags;
2098	}
2099    }
2100
2101  for (s2 = 0; set2[s2].position >= 0; s2++)
2102    {
2103      new_set[s1 + s2].position = set2[s2].position;
2104      new_set[s1 + s2].code_min = set2[s2].code_min;
2105      new_set[s1 + s2].code_max = set2[s2].code_max;
2106      /* XXX - why not | assertions here as well? */
2107      new_set[s1 + s2].assertions = set2[s2].assertions;
2108      new_set[s1 + s2].class = set2[s2].class;
2109      new_set[s1 + s2].neg_classes = set2[s2].neg_classes;
2110      new_set[s1 + s2].backref = set2[s2].backref;
2111      if (set2[s2].tags == NULL)
2112	new_set[s1 + s2].tags = NULL;
2113      else
2114	{
2115	  for (i = 0; set2[s2].tags[i] >= 0; i++);
2116	  new_tags = tre_mem_alloc(mem, sizeof(*new_tags) * (i + 1));
2117	  if (new_tags == NULL)
2118	    return NULL;
2119	  for (j = 0; j < i; j++)
2120	    new_tags[j] = set2[s2].tags[j];
2121	  new_tags[j] = -1;
2122	  new_set[s1 + s2].tags = new_tags;
2123	}
2124    }
2125  new_set[s1 + s2].position = -1;
2126  return new_set;
2127}
2128
2129/* Finds the empty path through `node' which is the one that should be
2130   taken according to POSIX.2 rules, and adds the tags on that path to
2131   `tags'.   `tags' may be NULL.  If `num_tags_seen' is not NULL, it is
2132   set to the number of tags seen on the path. */
2133static reg_errcode_t
2134tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags,
2135		int *assertions, int *num_tags_seen)
2136{
2137  tre_literal_t *lit;
2138  tre_union_t *uni;
2139  tre_catenation_t *cat;
2140  tre_iteration_t *iter;
2141  int i;
2142  int bottom = tre_stack_num_objects(stack);
2143  reg_errcode_t status = REG_OK;
2144  if (num_tags_seen)
2145    *num_tags_seen = 0;
2146
2147  status = tre_stack_push_voidptr(stack, node);
2148
2149  /* Walk through the tree recursively. */
2150  while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2151    {
2152      node = tre_stack_pop_voidptr(stack);
2153
2154      switch (node->type)
2155	{
2156	case LITERAL:
2157	  lit = (tre_literal_t *)node->obj;
2158	  switch (lit->code_min)
2159	    {
2160	    case TAG:
2161	      if (lit->code_max >= 0)
2162		{
2163		  if (tags != NULL)
2164		    {
2165		      /* Add the tag to `tags'. */
2166		      for (i = 0; tags[i] >= 0; i++)
2167			if (tags[i] == lit->code_max)
2168			  break;
2169		      if (tags[i] < 0)
2170			{
2171			  tags[i] = lit->code_max;
2172			  tags[i + 1] = -1;
2173			}
2174		    }
2175		  if (num_tags_seen)
2176		    (*num_tags_seen)++;
2177		}
2178	      break;
2179	    case ASSERTION:
2180	      assert(lit->code_max >= 1
2181		     || lit->code_max <= ASSERT_LAST);
2182	      if (assertions != NULL)
2183		*assertions |= lit->code_max;
2184	      break;
2185	    case EMPTY:
2186	      break;
2187	    default:
2188	      assert(0);
2189	      break;
2190	    }
2191	  break;
2192
2193	case UNION:
2194	  /* Subexpressions starting earlier take priority over ones
2195	     starting later, so we prefer the left subexpression over the
2196	     right subexpression. */
2197	  uni = (tre_union_t *)node->obj;
2198	  if (uni->left->nullable)
2199	    STACK_PUSHX(stack, voidptr, uni->left)
2200	  else if (uni->right->nullable)
2201	    STACK_PUSHX(stack, voidptr, uni->right)
2202	  else
2203	    assert(0);
2204	  break;
2205
2206	case CATENATION:
2207	  /* The path must go through both children. */
2208	  cat = (tre_catenation_t *)node->obj;
2209	  assert(cat->left->nullable);
2210	  assert(cat->right->nullable);
2211	  STACK_PUSHX(stack, voidptr, cat->left);
2212	  STACK_PUSHX(stack, voidptr, cat->right);
2213	  break;
2214
2215	case ITERATION:
2216	  /* A match with an empty string is preferred over no match at
2217	     all, so we go through the argument if possible. */
2218	  iter = (tre_iteration_t *)node->obj;
2219	  if (iter->arg->nullable)
2220	    STACK_PUSHX(stack, voidptr, iter->arg);
2221	  break;
2222
2223	default:
2224	  assert(0);
2225	  break;
2226	}
2227    }
2228
2229  return status;
2230}
2231
2232
2233typedef enum {
2234  NFL_RECURSE,
2235  NFL_POST_UNION,
2236  NFL_POST_CATENATION,
2237  NFL_POST_ITERATION
2238} tre_nfl_stack_symbol_t;
2239
2240
2241/* Computes and fills in the fields `nullable', `firstpos', and `lastpos' for
2242   the nodes of the AST `tree'. */
2243static reg_errcode_t
2244tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree)
2245{
2246  int bottom = tre_stack_num_objects(stack);
2247
2248  STACK_PUSHR(stack, voidptr, tree);
2249  STACK_PUSHR(stack, int, NFL_RECURSE);
2250
2251  while (tre_stack_num_objects(stack) > bottom)
2252    {
2253      tre_nfl_stack_symbol_t symbol;
2254      tre_ast_node_t *node;
2255
2256      symbol = (tre_nfl_stack_symbol_t)tre_stack_pop_int(stack);
2257      node = tre_stack_pop_voidptr(stack);
2258      switch (symbol)
2259	{
2260	case NFL_RECURSE:
2261	  switch (node->type)
2262	    {
2263	    case LITERAL:
2264	      {
2265		tre_literal_t *lit = (tre_literal_t *)node->obj;
2266		if (IS_BACKREF(lit))
2267		  {
2268		    /* Back references: nullable = false, firstpos = {i},
2269		       lastpos = {i}. */
2270		    node->nullable = 0;
2271		    node->firstpos = tre_set_one(mem, lit->position, 0,
2272					     TRE_CHAR_MAX, 0, NULL, -1);
2273		    if (!node->firstpos)
2274		      return REG_ESPACE;
2275		    node->lastpos = tre_set_one(mem, lit->position, 0,
2276						TRE_CHAR_MAX, 0, NULL,
2277						(int)lit->code_max);
2278		    if (!node->lastpos)
2279		      return REG_ESPACE;
2280		  }
2281		else if (lit->code_min < 0)
2282		  {
2283		    /* Tags, empty strings, params, and zero width assertions:
2284		       nullable = true, firstpos = {}, and lastpos = {}. */
2285		    node->nullable = 1;
2286		    node->firstpos = tre_set_empty(mem);
2287		    if (!node->firstpos)
2288		      return REG_ESPACE;
2289		    node->lastpos = tre_set_empty(mem);
2290		    if (!node->lastpos)
2291		      return REG_ESPACE;
2292		  }
2293		else
2294		  {
2295		    /* Literal at position i: nullable = false, firstpos = {i},
2296		       lastpos = {i}. */
2297		    node->nullable = 0;
2298		    node->firstpos =
2299		      tre_set_one(mem, lit->position, (int)lit->code_min,
2300				  (int)lit->code_max, 0, NULL, -1);
2301		    if (!node->firstpos)
2302		      return REG_ESPACE;
2303		    node->lastpos = tre_set_one(mem, lit->position,
2304						(int)lit->code_min,
2305						(int)lit->code_max,
2306						lit->class, lit->neg_classes,
2307						-1);
2308		    if (!node->lastpos)
2309		      return REG_ESPACE;
2310		  }
2311		break;
2312	      }
2313
2314	    case UNION:
2315	      /* Compute the attributes for the two subtrees, and after that
2316		 for this node. */
2317	      STACK_PUSHR(stack, voidptr, node);
2318	      STACK_PUSHR(stack, int, NFL_POST_UNION);
2319	      STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->right);
2320	      STACK_PUSHR(stack, int, NFL_RECURSE);
2321	      STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->left);
2322	      STACK_PUSHR(stack, int, NFL_RECURSE);
2323	      break;
2324
2325	    case CATENATION:
2326	      /* Compute the attributes for the two subtrees, and after that
2327		 for this node. */
2328	      STACK_PUSHR(stack, voidptr, node);
2329	      STACK_PUSHR(stack, int, NFL_POST_CATENATION);
2330	      STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->right);
2331	      STACK_PUSHR(stack, int, NFL_RECURSE);
2332	      STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->left);
2333	      STACK_PUSHR(stack, int, NFL_RECURSE);
2334	      break;
2335
2336	    case ITERATION:
2337	      /* Compute the attributes for the subtree, and after that for
2338		 this node. */
2339	      STACK_PUSHR(stack, voidptr, node);
2340	      STACK_PUSHR(stack, int, NFL_POST_ITERATION);
2341	      STACK_PUSHR(stack, voidptr, ((tre_iteration_t *)node->obj)->arg);
2342	      STACK_PUSHR(stack, int, NFL_RECURSE);
2343	      break;
2344	    }
2345	  break; /* end case: NFL_RECURSE */
2346
2347	case NFL_POST_UNION:
2348	  {
2349	    tre_union_t *uni = (tre_union_t *)node->obj;
2350	    node->nullable = uni->left->nullable || uni->right->nullable;
2351	    node->firstpos = tre_set_union(mem, uni->left->firstpos,
2352					   uni->right->firstpos, NULL, 0);
2353	    if (!node->firstpos)
2354	      return REG_ESPACE;
2355	    node->lastpos = tre_set_union(mem, uni->left->lastpos,
2356					  uni->right->lastpos, NULL, 0);
2357	    if (!node->lastpos)
2358	      return REG_ESPACE;
2359	    break;
2360	  }
2361
2362	case NFL_POST_ITERATION:
2363	  {
2364	    tre_iteration_t *iter = (tre_iteration_t *)node->obj;
2365
2366	    if (iter->min == 0 || iter->arg->nullable)
2367	      node->nullable = 1;
2368	    else
2369	      node->nullable = 0;
2370	    node->firstpos = iter->arg->firstpos;
2371	    node->lastpos = iter->arg->lastpos;
2372	    break;
2373	  }
2374
2375	case NFL_POST_CATENATION:
2376	  {
2377	    int num_tags, *tags, assertions;
2378	    reg_errcode_t status;
2379	    tre_catenation_t *cat = node->obj;
2380	    node->nullable = cat->left->nullable && cat->right->nullable;
2381
2382	    /* Compute firstpos. */
2383	    if (cat->left->nullable)
2384	      {
2385		/* The left side matches the empty string.  Make a first pass
2386		   with tre_match_empty() to get the number of tags and
2387		   parameters. */
2388		status = tre_match_empty(stack, cat->left,
2389					 NULL, NULL, &num_tags);
2390		if (status != REG_OK)
2391		  return status;
2392		/* Allocate arrays for the tags and parameters. */
2393		tags = xmalloc(sizeof(*tags) * (num_tags + 1));
2394		if (!tags)
2395		  return REG_ESPACE;
2396		tags[0] = -1;
2397		assertions = 0;
2398		/* Second pass with tre_mach_empty() to get the list of
2399		   tags and parameters. */
2400		status = tre_match_empty(stack, cat->left, tags,
2401					 &assertions, NULL);
2402		if (status != REG_OK)
2403		  {
2404		    xfree(tags);
2405		    return status;
2406		  }
2407		node->firstpos =
2408		  tre_set_union(mem, cat->right->firstpos, cat->left->firstpos,
2409				tags, assertions);
2410		xfree(tags);
2411		if (!node->firstpos)
2412		  return REG_ESPACE;
2413	      }
2414	    else
2415	      {
2416		node->firstpos = cat->left->firstpos;
2417	      }
2418
2419	    /* Compute lastpos. */
2420	    if (cat->right->nullable)
2421	      {
2422		/* The right side matches the empty string.  Make a first pass
2423		   with tre_match_empty() to get the number of tags and
2424		   parameters. */
2425		status = tre_match_empty(stack, cat->right,
2426					 NULL, NULL, &num_tags);
2427		if (status != REG_OK)
2428		  return status;
2429		/* Allocate arrays for the tags and parameters. */
2430		tags = xmalloc(sizeof(int) * (num_tags + 1));
2431		if (!tags)
2432		  return REG_ESPACE;
2433		tags[0] = -1;
2434		assertions = 0;
2435		/* Second pass with tre_mach_empty() to get the list of
2436		   tags and parameters. */
2437		status = tre_match_empty(stack, cat->right, tags,
2438					 &assertions, NULL);
2439		if (status != REG_OK)
2440		  {
2441		    xfree(tags);
2442		    return status;
2443		  }
2444		node->lastpos =
2445		  tre_set_union(mem, cat->left->lastpos, cat->right->lastpos,
2446				tags, assertions);
2447		xfree(tags);
2448		if (!node->lastpos)
2449		  return REG_ESPACE;
2450	      }
2451	    else
2452	      {
2453		node->lastpos = cat->right->lastpos;
2454	      }
2455	    break;
2456	  }
2457
2458	default:
2459	  assert(0);
2460	  break;
2461	}
2462    }
2463
2464  return REG_OK;
2465}
2466
2467
2468/* Adds a transition from each position in `p1' to each position in `p2'. */
2469static reg_errcode_t
2470tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2,
2471	       tre_tnfa_transition_t *transitions,
2472	       int *counts, int *offs)
2473{
2474  tre_pos_and_tags_t *orig_p2 = p2;
2475  tre_tnfa_transition_t *trans;
2476  int i, j, k, l, dup, prev_p2_pos;
2477
2478  if (transitions != NULL)
2479    while (p1->position >= 0)
2480      {
2481	p2 = orig_p2;
2482	prev_p2_pos = -1;
2483	while (p2->position >= 0)
2484	  {
2485	    /* Optimization: if this position was already handled, skip it. */
2486	    if (p2->position == prev_p2_pos)
2487	      {
2488		p2++;
2489		continue;
2490	      }
2491	    prev_p2_pos = p2->position;
2492	    /* Set `trans' to point to the next unused transition from
2493	       position `p1->position'. */
2494	    trans = transitions + offs[p1->position];
2495	    while (trans->state != NULL)
2496	      {
2497#if 0
2498		/* If we find a previous transition from `p1->position' to
2499		   `p2->position', it is overwritten.  This can happen only
2500		   if there are nested loops in the regexp, like in "((a)*)*".
2501		   In POSIX.2 repetition using the outer loop is always
2502		   preferred over using the inner loop.	 Therefore the
2503		   transition for the inner loop is useless and can be thrown
2504		   away. */
2505		/* XXX - The same position is used for all nodes in a bracket
2506		   expression, so this optimization cannot be used (it will
2507		   break bracket expressions) unless I figure out a way to
2508		   detect it here. */
2509		if (trans->state_id == p2->position)
2510		  {
2511		    break;
2512		  }
2513#endif
2514		trans++;
2515	      }
2516
2517	    if (trans->state == NULL)
2518	      (trans + 1)->state = NULL;
2519	    /* Use the character ranges, assertions, etc. from `p1' for
2520	       the transition from `p1' to `p2'. */
2521	    trans->code_min = p1->code_min;
2522	    trans->code_max = p1->code_max;
2523	    trans->state = transitions + offs[p2->position];
2524	    trans->state_id = p2->position;
2525	    trans->assertions = p1->assertions | p2->assertions
2526	      | (p1->class ? ASSERT_CHAR_CLASS : 0)
2527	      | (p1->neg_classes != NULL ? ASSERT_CHAR_CLASS_NEG : 0);
2528	    if (p1->backref >= 0)
2529	      {
2530		assert((trans->assertions & ASSERT_CHAR_CLASS) == 0);
2531		assert(p2->backref < 0);
2532		trans->u.backref = p1->backref;
2533		trans->assertions |= ASSERT_BACKREF;
2534	      }
2535	    else
2536	      trans->u.class = p1->class;
2537	    if (p1->neg_classes != NULL)
2538	      {
2539		for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++);
2540		trans->neg_classes =
2541		  xmalloc(sizeof(*trans->neg_classes) * (i + 1));
2542		if (trans->neg_classes == NULL)
2543		  return REG_ESPACE;
2544		for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++)
2545		  trans->neg_classes[i] = p1->neg_classes[i];
2546		trans->neg_classes[i] = (tre_ctype_t)0;
2547	      }
2548	    else
2549	      trans->neg_classes = NULL;
2550
2551	    /* Find out how many tags this transition has. */
2552	    i = 0;
2553	    if (p1->tags != NULL)
2554	      while(p1->tags[i] >= 0)
2555		i++;
2556	    j = 0;
2557	    if (p2->tags != NULL)
2558	      while(p2->tags[j] >= 0)
2559		j++;
2560
2561	    /* If we are overwriting a transition, free the old tag array. */
2562	    if (trans->tags != NULL)
2563	      xfree(trans->tags);
2564	    trans->tags = NULL;
2565
2566	    /* If there were any tags, allocate an array and fill it. */
2567	    if (i + j > 0)
2568	      {
2569		trans->tags = xmalloc(sizeof(*trans->tags) * (i + j + 1));
2570		if (!trans->tags)
2571		  return REG_ESPACE;
2572		i = 0;
2573		if (p1->tags != NULL)
2574		  while(p1->tags[i] >= 0)
2575		    {
2576		      trans->tags[i] = p1->tags[i];
2577		      i++;
2578		    }
2579		l = i;
2580		j = 0;
2581		if (p2->tags != NULL)
2582		  while (p2->tags[j] >= 0)
2583		    {
2584		      /* Don't add duplicates. */
2585		      dup = 0;
2586		      for (k = 0; k < i; k++)
2587			if (trans->tags[k] == p2->tags[j])
2588			  {
2589			    dup = 1;
2590			    break;
2591			  }
2592		      if (!dup)
2593			trans->tags[l++] = p2->tags[j];
2594		      j++;
2595		    }
2596		trans->tags[l] = -1;
2597	      }
2598
2599	    p2++;
2600	  }
2601	p1++;
2602      }
2603  else
2604    /* Compute a maximum limit for the number of transitions leaving
2605       from each state. */
2606    while (p1->position >= 0)
2607      {
2608	p2 = orig_p2;
2609	while (p2->position >= 0)
2610	  {
2611	    counts[p1->position]++;
2612	    p2++;
2613	  }
2614	p1++;
2615      }
2616  return REG_OK;
2617}
2618
2619/* Converts the syntax tree to a TNFA.	All the transitions in the TNFA are
2620   labelled with one character range (there are no transitions on empty
2621   strings).  The TNFA takes O(n^2) space in the worst case, `n' is size of
2622   the regexp. */
2623static reg_errcode_t
2624tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions,
2625		int *counts, int *offs)
2626{
2627  tre_union_t *uni;
2628  tre_catenation_t *cat;
2629  tre_iteration_t *iter;
2630  reg_errcode_t errcode = REG_OK;
2631
2632  /* XXX - recurse using a stack!. */
2633  switch (node->type)
2634    {
2635    case LITERAL:
2636      break;
2637    case UNION:
2638      uni = (tre_union_t *)node->obj;
2639      errcode = tre_ast_to_tnfa(uni->left, transitions, counts, offs);
2640      if (errcode != REG_OK)
2641	return errcode;
2642      errcode = tre_ast_to_tnfa(uni->right, transitions, counts, offs);
2643      break;
2644
2645    case CATENATION:
2646      cat = (tre_catenation_t *)node->obj;
2647      /* Add a transition from each position in cat->left->lastpos
2648	 to each position in cat->right->firstpos. */
2649      errcode = tre_make_trans(cat->left->lastpos, cat->right->firstpos,
2650			       transitions, counts, offs);
2651      if (errcode != REG_OK)
2652	return errcode;
2653      errcode = tre_ast_to_tnfa(cat->left, transitions, counts, offs);
2654      if (errcode != REG_OK)
2655	return errcode;
2656      errcode = tre_ast_to_tnfa(cat->right, transitions, counts, offs);
2657      break;
2658
2659    case ITERATION:
2660      iter = (tre_iteration_t *)node->obj;
2661      assert(iter->max == -1 || iter->max == 1);
2662
2663      if (iter->max == -1)
2664	{
2665	  assert(iter->min == 0 || iter->min == 1);
2666	  /* Add a transition from each last position in the iterated
2667	     expression to each first position. */
2668	  errcode = tre_make_trans(iter->arg->lastpos, iter->arg->firstpos,
2669				   transitions, counts, offs);
2670	  if (errcode != REG_OK)
2671	    return errcode;
2672	}
2673      errcode = tre_ast_to_tnfa(iter->arg, transitions, counts, offs);
2674      break;
2675    }
2676  return errcode;
2677}
2678
2679
2680#define ERROR_EXIT(err)		  \
2681  do				  \
2682    {				  \
2683      errcode = err;		  \
2684      if (/*CONSTCOND*/1)	  \
2685      	goto error_exit;	  \
2686    }				  \
2687 while (/*CONSTCOND*/0)
2688
2689
2690int
2691regcomp(regex_t *restrict preg, const char *restrict regex, int cflags)
2692{
2693  tre_stack_t *stack;
2694  tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r;
2695  tre_pos_and_tags_t *p;
2696  int *counts = NULL, *offs = NULL;
2697  int i, add = 0;
2698  tre_tnfa_transition_t *transitions, *initial;
2699  tre_tnfa_t *tnfa = NULL;
2700  tre_submatch_data_t *submatch_data;
2701  tre_tag_direction_t *tag_directions = NULL;
2702  reg_errcode_t errcode;
2703  tre_mem_t mem;
2704
2705  /* Parse context. */
2706  tre_parse_ctx_t parse_ctx;
2707
2708  /* Allocate a stack used throughout the compilation process for various
2709     purposes. */
2710  stack = tre_stack_new(512, 1024000, 128);
2711  if (!stack)
2712    return REG_ESPACE;
2713  /* Allocate a fast memory allocator. */
2714  mem = tre_mem_new();
2715  if (!mem)
2716    {
2717      tre_stack_destroy(stack);
2718      return REG_ESPACE;
2719    }
2720
2721  /* Parse the regexp. */
2722  memset(&parse_ctx, 0, sizeof(parse_ctx));
2723  parse_ctx.mem = mem;
2724  parse_ctx.stack = stack;
2725  parse_ctx.start = regex;
2726  parse_ctx.cflags = cflags;
2727  parse_ctx.max_backref = -1;
2728  errcode = tre_parse(&parse_ctx);
2729  if (errcode != REG_OK)
2730    ERROR_EXIT(errcode);
2731  preg->re_nsub = parse_ctx.submatch_id - 1;
2732  tree = parse_ctx.n;
2733
2734#ifdef TRE_DEBUG
2735  tre_ast_print(tree);
2736#endif /* TRE_DEBUG */
2737
2738  /* Referring to nonexistent subexpressions is illegal. */
2739  if (parse_ctx.max_backref > (int)preg->re_nsub)
2740    ERROR_EXIT(REG_ESUBREG);
2741
2742  /* Allocate the TNFA struct. */
2743  tnfa = xcalloc(1, sizeof(tre_tnfa_t));
2744  if (tnfa == NULL)
2745    ERROR_EXIT(REG_ESPACE);
2746  tnfa->have_backrefs = parse_ctx.max_backref >= 0;
2747  tnfa->have_approx = 0;
2748  tnfa->num_submatches = parse_ctx.submatch_id;
2749
2750  /* Set up tags for submatch addressing.  If REG_NOSUB is set and the
2751     regexp does not have back references, this can be skipped. */
2752  if (tnfa->have_backrefs || !(cflags & REG_NOSUB))
2753    {
2754
2755      /* Figure out how many tags we will need. */
2756      errcode = tre_add_tags(NULL, stack, tree, tnfa);
2757      if (errcode != REG_OK)
2758	ERROR_EXIT(errcode);
2759
2760      if (tnfa->num_tags > 0)
2761	{
2762	  tag_directions = xmalloc(sizeof(*tag_directions)
2763				   * (tnfa->num_tags + 1));
2764	  if (tag_directions == NULL)
2765	    ERROR_EXIT(REG_ESPACE);
2766	  tnfa->tag_directions = tag_directions;
2767	  memset(tag_directions, -1,
2768		 sizeof(*tag_directions) * (tnfa->num_tags + 1));
2769	}
2770      tnfa->minimal_tags = xcalloc((unsigned)tnfa->num_tags * 2 + 1,
2771				   sizeof(*tnfa->minimal_tags));
2772      if (tnfa->minimal_tags == NULL)
2773	ERROR_EXIT(REG_ESPACE);
2774
2775      submatch_data = xcalloc((unsigned)parse_ctx.submatch_id,
2776			      sizeof(*submatch_data));
2777      if (submatch_data == NULL)
2778	ERROR_EXIT(REG_ESPACE);
2779      tnfa->submatch_data = submatch_data;
2780
2781      errcode = tre_add_tags(mem, stack, tree, tnfa);
2782      if (errcode != REG_OK)
2783	ERROR_EXIT(errcode);
2784
2785    }
2786
2787  /* Expand iteration nodes. */
2788  errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position,
2789			   tag_directions);
2790  if (errcode != REG_OK)
2791    ERROR_EXIT(errcode);
2792
2793  /* Add a dummy node for the final state.
2794     XXX - For certain patterns this dummy node can be optimized away,
2795	   for example "a*" or "ab*".	Figure out a simple way to detect
2796	   this possibility. */
2797  tmp_ast_l = tree;
2798  tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++);
2799  if (tmp_ast_r == NULL)
2800    ERROR_EXIT(REG_ESPACE);
2801
2802  tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r);
2803  if (tree == NULL)
2804    ERROR_EXIT(REG_ESPACE);
2805
2806  errcode = tre_compute_nfl(mem, stack, tree);
2807  if (errcode != REG_OK)
2808    ERROR_EXIT(errcode);
2809
2810  counts = xmalloc(sizeof(int) * parse_ctx.position);
2811  if (counts == NULL)
2812    ERROR_EXIT(REG_ESPACE);
2813
2814  offs = xmalloc(sizeof(int) * parse_ctx.position);
2815  if (offs == NULL)
2816    ERROR_EXIT(REG_ESPACE);
2817
2818  for (i = 0; i < parse_ctx.position; i++)
2819    counts[i] = 0;
2820  tre_ast_to_tnfa(tree, NULL, counts, NULL);
2821
2822  add = 0;
2823  for (i = 0; i < parse_ctx.position; i++)
2824    {
2825      offs[i] = add;
2826      add += counts[i] + 1;
2827      counts[i] = 0;
2828    }
2829  transitions = xcalloc((unsigned)add + 1, sizeof(*transitions));
2830  if (transitions == NULL)
2831    ERROR_EXIT(REG_ESPACE);
2832  tnfa->transitions = transitions;
2833  tnfa->num_transitions = add;
2834
2835  errcode = tre_ast_to_tnfa(tree, transitions, counts, offs);
2836  if (errcode != REG_OK)
2837    ERROR_EXIT(errcode);
2838
2839  tnfa->firstpos_chars = NULL;
2840
2841  p = tree->firstpos;
2842  i = 0;
2843  while (p->position >= 0)
2844    {
2845      i++;
2846      p++;
2847    }
2848
2849  initial = xcalloc((unsigned)i + 1, sizeof(tre_tnfa_transition_t));
2850  if (initial == NULL)
2851    ERROR_EXIT(REG_ESPACE);
2852  tnfa->initial = initial;
2853
2854  i = 0;
2855  for (p = tree->firstpos; p->position >= 0; p++)
2856    {
2857      initial[i].state = transitions + offs[p->position];
2858      initial[i].state_id = p->position;
2859      initial[i].tags = NULL;
2860      /* Copy the arrays p->tags, and p->params, they are allocated
2861	 from a tre_mem object. */
2862      if (p->tags)
2863	{
2864	  int j;
2865	  for (j = 0; p->tags[j] >= 0; j++);
2866	  initial[i].tags = xmalloc(sizeof(*p->tags) * (j + 1));
2867	  if (!initial[i].tags)
2868	    ERROR_EXIT(REG_ESPACE);
2869	  memcpy(initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1));
2870	}
2871      initial[i].assertions = p->assertions;
2872      i++;
2873    }
2874  initial[i].state = NULL;
2875
2876  tnfa->num_transitions = add;
2877  tnfa->final = transitions + offs[tree->lastpos[0].position];
2878  tnfa->num_states = parse_ctx.position;
2879  tnfa->cflags = cflags;
2880
2881  tre_mem_destroy(mem);
2882  tre_stack_destroy(stack);
2883  xfree(counts);
2884  xfree(offs);
2885
2886  preg->TRE_REGEX_T_FIELD = (void *)tnfa;
2887  return REG_OK;
2888
2889 error_exit:
2890  /* Free everything that was allocated and return the error code. */
2891  tre_mem_destroy(mem);
2892  if (stack != NULL)
2893    tre_stack_destroy(stack);
2894  if (counts != NULL)
2895    xfree(counts);
2896  if (offs != NULL)
2897    xfree(offs);
2898  preg->TRE_REGEX_T_FIELD = (void *)tnfa;
2899  regfree(preg);
2900  return errcode;
2901}
2902
2903
2904
2905
2906void
2907regfree(regex_t *preg)
2908{
2909  tre_tnfa_t *tnfa;
2910  unsigned int i;
2911  tre_tnfa_transition_t *trans;
2912
2913  tnfa = (void *)preg->TRE_REGEX_T_FIELD;
2914  if (!tnfa)
2915    return;
2916
2917  for (i = 0; i < tnfa->num_transitions; i++)
2918    if (tnfa->transitions[i].state)
2919      {
2920	if (tnfa->transitions[i].tags)
2921	  xfree(tnfa->transitions[i].tags);
2922	if (tnfa->transitions[i].neg_classes)
2923	  xfree(tnfa->transitions[i].neg_classes);
2924      }
2925  if (tnfa->transitions)
2926    xfree(tnfa->transitions);
2927
2928  if (tnfa->initial)
2929    {
2930      for (trans = tnfa->initial; trans->state; trans++)
2931	{
2932	  if (trans->tags)
2933	    xfree(trans->tags);
2934	}
2935      xfree(tnfa->initial);
2936    }
2937
2938  if (tnfa->submatch_data)
2939    {
2940      for (i = 0; i < tnfa->num_submatches; i++)
2941	if (tnfa->submatch_data[i].parents)
2942	  xfree(tnfa->submatch_data[i].parents);
2943      xfree(tnfa->submatch_data);
2944    }
2945
2946  if (tnfa->tag_directions)
2947    xfree(tnfa->tag_directions);
2948  if (tnfa->firstpos_chars)
2949    xfree(tnfa->firstpos_chars);
2950  if (tnfa->minimal_tags)
2951    xfree(tnfa->minimal_tags);
2952  xfree(tnfa);
2953}
2954