OpenSolaris_b135/cmd/awk/b.c

Compare this file to the similar file:
Show the results in this format:

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved  	*/

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#define	DEBUG

#include "awk.h"
#include "y.tab.h"

#define	HAT	(NCHARS-1)	/* matches ^ in regular expr */
				/* NCHARS is 2**n */
#define	MAXLIN (3 * LINE_MAX)

#define	type(v)		(v)->nobj
#define	left(v)		(v)->narg[0]
#define	right(v)	(v)->narg[1]
#define	parent(v)	(v)->nnext

#define	LEAF	case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
#define	UNARY	case STAR: case PLUS: case QUEST:

/*
 * encoding in tree Nodes:
 *	leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL):
 *		left is index, right contains value or pointer to value
 *	unary (STAR, PLUS, QUEST): left is child, right is null
 *	binary (CAT, OR): left and right are children
 *	parent contains pointer to parent
 */

int	setvec[MAXLIN];
int	tmpset[MAXLIN];
Node	*point[MAXLIN];

int	rtok;		/* next token in current re */
int	rlxval;
uchar	*rlxstr;
uchar	*prestr;	/* current position in current re */
uchar	*lastre;	/* origin of last re */

static	int setcnt;
static	int poscnt;

uchar	*patbeg;
int	patlen;

#define	NFA	20	/* cache this many dynamic fa's */
fa	*fatab[NFA];
int	nfatab	= 0;	/* entries in fatab */

static fa	*mkdfa(uchar *, int);
static int	makeinit(fa *, int);
static void	penter(Node *);
static void	freetr(Node *);
static void	overflo(char *);
static void	cfoll(fa *, Node *);
static void	follow(Node *);
static Node	*reparse(uchar *);
static int	relex(void);
static void	freefa(fa *);
static int	cgoto(fa *, int, int);

fa *
makedfa(uchar *s, int anchor)	/* returns dfa for reg expr s */
{
	int i, use, nuse;
	fa *pfa;

	if (compile_time)	/* a constant for sure */
		return (mkdfa(s, anchor));
	for (i = 0; i < nfatab; i++) {	/* is it there already? */
		if (fatab[i]->anchor == anchor &&
		    strcmp((char *)fatab[i]->restr, (char *)s) == 0) {
			fatab[i]->use++;
			return (fatab[i]);
		}
	}
	pfa = mkdfa(s, anchor);
	if (nfatab < NFA) {	/* room for another */
		fatab[nfatab] = pfa;
		fatab[nfatab]->use = 1;
		nfatab++;
		return (pfa);
	}
	use = fatab[0]->use;	/* replace least-recently used */
	nuse = 0;
	for (i = 1; i < nfatab; i++)
		if (fatab[i]->use < use) {
			use = fatab[i]->use;
			nuse = i;
		}
	freefa(fatab[nuse]);
	fatab[nuse] = pfa;
	pfa->use = 1;
	return (pfa);
}

fa *
mkdfa(uchar *s, int anchor)	/* does the real work of making a dfa */
	/* anchor = 1 for anchored matches, else 0 */
{
	Node *p, *p1;
	fa *f;

	p = reparse(s);
	p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
		/* put ALL STAR in front of reg.  exp. */
	p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
		/* put FINAL after reg.  exp. */

	poscnt = 0;
	penter(p1);	/* enter parent pointers and leaf indices */
	if ((f = (fa *)calloc(1, sizeof (fa) + poscnt * sizeof (rrow))) == NULL)
		overflo("no room for fa");
	/* penter has computed number of positions in re */
	f->accept = poscnt-1;
	cfoll(f, p1);	/* set up follow sets */
	freetr(p1);
	if ((f->posns[0] =
	    (int *)calloc(1, *(f->re[0].lfollow) * sizeof (int))) == NULL) {
			overflo("out of space in makedfa");
	}
	if ((f->posns[1] = (int *)calloc(1, sizeof (int))) == NULL)
		overflo("out of space in makedfa");
	*f->posns[1] = 0;
	f->initstat = makeinit(f, anchor);
	f->anchor = anchor;
	f->restr = tostring(s);
	return (f);
}

static int
makeinit(fa *f, int anchor)
{
	register int i, k;

	f->curstat = 2;
	f->out[2] = 0;
	f->reset = 0;
	k = *(f->re[0].lfollow);
	xfree(f->posns[2]);
	if ((f->posns[2] = (int *)calloc(1, (k+1) * sizeof (int))) == NULL)
		overflo("out of space in makeinit");
	for (i = 0; i <= k; i++) {
		(f->posns[2])[i] = (f->re[0].lfollow)[i];
	}
	if ((f->posns[2])[1] == f->accept)
		f->out[2] = 1;
	for (i = 0; i < NCHARS; i++)
		f->gototab[2][i] = 0;
	f->curstat = cgoto(f, 2, HAT);
	if (anchor) {
		*f->posns[2] = k-1;	/* leave out position 0 */
		for (i = 0; i < k; i++) {
			(f->posns[0])[i] = (f->posns[2])[i];
		}

		f->out[0] = f->out[2];
		if (f->curstat != 2)
			--(*f->posns[f->curstat]);
	}
	return (f->curstat);
}

void
penter(Node *p)	/* set up parent pointers and leaf indices */
{
	switch (type(p)) {
	LEAF
		left(p) = (Node *) poscnt;
		point[poscnt++] = p;
		break;
	UNARY
		penter(left(p));
		parent(left(p)) = p;
		break;
	case CAT:
	case OR:
		penter(left(p));
		penter(right(p));
		parent(left(p)) = p;
		parent(right(p)) = p;
		break;
	default:
		ERROR "unknown type %d in penter", type(p) FATAL;
		break;
	}
}

static void
freetr(Node *p)	/* free parse tree */
{
	switch (type(p)) {
	LEAF
		xfree(p);
		break;
	UNARY
		freetr(left(p));
		xfree(p);
		break;
	case CAT:
	case OR:
		freetr(left(p));
		freetr(right(p));
		xfree(p);
		break;
	default:
		ERROR "unknown type %d in freetr", type(p) FATAL;
		break;
	}
}

uchar *
cclenter(uchar *p)
{
	register int i, c;
	uchar *op, *chars, *ret;
	size_t	bsize;

	init_buf(&chars, &bsize, LINE_INCR);
	op = p;
	i = 0;
	while ((c = *p++) != 0) {
		if (c == '\\') {
			if ((c = *p++) == 't')
				c = '\t';
			else if (c == 'n')
				c = '\n';
			else if (c == 'f')
				c = '\f';
			else if (c == 'r')
				c = '\r';
			else if (c == 'b')
				c = '\b';
			else if (c == '\\')
				c = '\\';
			else if (isdigit(c)) {
				int n = c - '0';
				if (isdigit(*p)) {
					n = 8 * n + *p++ - '0';
					if (isdigit(*p))
						n = 8 * n + *p++ - '0';
				}
				c = n;
			} /* else */
				/* c = c; */
		} else if (c == '-' && i > 0 && chars[i-1] != 0) {
			if (*p != 0) {
				c = chars[i-1];
				while ((uchar)c < *p) {	/* fails if *p is \\ */
					expand_buf(&chars, &bsize, i);
					chars[i++] = ++c;
				}
				p++;
				continue;
			}
		}
		expand_buf(&chars, &bsize, i);
		chars[i++] = c;
	}
	chars[i++] = '\0';
	dprintf(("cclenter: in = |%s|, out = |%s|\n", op, chars));
	xfree(op);
	ret = tostring(chars);
	free(chars);
	return (ret);
}

static void
overflo(char *s)
{
	ERROR "regular expression too big: %s", gettext((char *)s) FATAL;
}

/* enter follow set of each leaf of vertex v into lfollow[leaf] */
static void
cfoll(fa *f, Node *v)
{
	register int i;
	register int *p;

	switch (type(v)) {
	LEAF
		f->re[(int)left(v)].ltype = type(v);
		f->re[(int)left(v)].lval = (int)right(v);
		for (i = 0; i <= f->accept; i++)
			setvec[i] = 0;
		setcnt = 0;
		follow(v);	/* computes setvec and setcnt */
		if ((p = (int *)calloc(1, (setcnt+1) * sizeof (int))) == NULL)
			overflo("follow set overflow");
		f->re[(int)left(v)].lfollow = p;
		*p = setcnt;
		for (i = f->accept; i >= 0; i--) {
			if (setvec[i] == 1)
				*++p = i;
		}
		break;
	UNARY
		cfoll(f, left(v));
		break;
	case CAT:
	case OR:
		cfoll(f, left(v));
		cfoll(f, right(v));
		break;
	default:
		ERROR "unknown type %d in cfoll", type(v) FATAL;
	}
}

/*
 * collects initially active leaves of p into setvec
 * returns 0 or 1 depending on whether p matches empty string
 */
static int
first(Node *p)
{
	register int b;

	switch (type(p)) {
	LEAF
		if (setvec[(int)left(p)] != 1) {
			setvec[(int)left(p)] = 1;
			setcnt++;
		}
		if (type(p) == CCL && (*(uchar *)right(p)) == '\0')
			return (0);		/* empty CCL */
		else
			return (1);
	case PLUS:
		if (first(left(p)) == 0)
			return (0);
		return (1);
	case STAR:
	case QUEST:
		(void) first(left(p));
		return (0);
	case CAT:
		if (first(left(p)) == 0 && first(right(p)) == 0)
			return (0);
		return (1);
	case OR:
		b = first(right(p));
		if (first(left(p)) == 0 || b == 0)
			return (0);
		return (1);
	}
	ERROR "unknown type %d in first", type(p) FATAL;
	return (-1);
}

/* collects leaves that can follow v into setvec */
static void
follow(Node *v)
{
	Node *p;

	if (type(v) == FINAL)
		return;
	p = parent(v);
	switch (type(p)) {
	case STAR:
	case PLUS:
		(void) first(v);
		follow(p);
		return;

	case OR:
	case QUEST:
		follow(p);
		return;

	case CAT:
		if (v == left(p)) {	/* v is left child of p */
			if (first(right(p)) == 0) {
				follow(p);
				return;
			}
		} else		/* v is right child */
			follow(p);
		return;
	default:
		ERROR "unknown type %d in follow", type(p) FATAL;
		break;
	}
}

static int
member(uchar c, uchar *s)	/* is c in s? */
{
	while (*s)
		if (c == *s++)
			return (1);
	return (0);
}


int
match(fa *f, uchar *p)
{
	register int s, ns;

	s = f->reset ? makeinit(f, 0) : f->initstat;
	if (f->out[s])
		return (1);
	do {
		if ((ns = f->gototab[s][*p]) != 0)
			s = ns;
		else
			s = cgoto(f, s, *p);
		if (f->out[s])
			return (1);
	} while (*p++ != 0);
	return (0);
}

int
pmatch(fa *f, uchar *p)
{
	register int s, ns;
	register uchar *q;
	int i, k;

	if (f->reset) {
		f->initstat = s = makeinit(f, 1);
	} else {
		s = f->initstat;
	}
	patbeg = p;
	patlen = -1;
	do {
		q = p;
		do {
			if (f->out[s])		/* final state */
				patlen = q-p;
			if ((ns = f->gototab[s][*q]) != 0)
				s = ns;
			else
				s = cgoto(f, s, *q);
			if (s == 1) {	/* no transition */
				if (patlen >= 0) {
					patbeg = p;
					return (1);
				} else
					goto nextin;	/* no match */
			}
		} while (*q++ != 0);
		if (f->out[s])
			patlen = q - p - 1;	/* don't count $ */
		if (patlen >= 0) {
			patbeg = p;
			return (1);
		}
	nextin:
		s = 2;
		if (f->reset) {
			for (i = 2; i <= f->curstat; i++)
				xfree(f->posns[i]);
			k = *f->posns[0];
			if ((f->posns[2] =
			    (int *)calloc(1, (k + 1) * sizeof (int))) == NULL) {
				overflo("out of space in pmatch");
			}
			for (i = 0; i <= k; i++)
				(f->posns[2])[i] = (f->posns[0])[i];
			f->initstat = f->curstat = 2;
			f->out[2] = f->out[0];
			for (i = 0; i < NCHARS; i++)
				f->gototab[2][i] = 0;
		}
	} while (*p++ != 0);
	return (0);
}

int
nematch(fa *f, uchar *p)
{
	register int s, ns;
	register uchar *q;
	int i, k;

	if (f->reset) {
		f->initstat = s = makeinit(f, 1);
	} else {
		s = f->initstat;
	}
	patlen = -1;
	while (*p) {
		q = p;
		do {
			if (f->out[s])		/* final state */
				patlen = q-p;
			if ((ns = f->gototab[s][*q]) != 0)
				s = ns;
			else
				s = cgoto(f, s, *q);
			if (s == 1) {	/* no transition */
				if (patlen > 0) {
					patbeg = p;
					return (1);
				} else
					goto nnextin;	/* no nonempty match */
			}
		} while (*q++ != 0);
		if (f->out[s])
			patlen = q - p - 1;	/* don't count $ */
		if (patlen > 0) {
			patbeg = p;
			return (1);
		}
	nnextin:
		s = 2;
		if (f->reset) {
			for (i = 2; i <= f->curstat; i++)
				xfree(f->posns[i]);
			k = *f->posns[0];
			if ((f->posns[2] =
			    (int *)calloc(1, (k + 1) * sizeof (int))) == NULL) {
				overflo("out of state space");
			}
			for (i = 0; i <= k; i++)
				(f->posns[2])[i] = (f->posns[0])[i];
			f->initstat = f->curstat = 2;
			f->out[2] = f->out[0];
			for (i = 0; i < NCHARS; i++)
				f->gototab[2][i] = 0;
		}
		p++;
	}
	return (0);
}

static Node *regexp(void), *primary(void), *concat(Node *);
static Node *alt(Node *), *unary(Node *);

static Node *
reparse(uchar *p)
{
	/* parses regular expression pointed to by p */
	/* uses relex() to scan regular expression */
	Node *np;

	dprintf(("reparse <%s>\n", p));
	lastre = prestr = p;	/* prestr points to string to be parsed */
	rtok = relex();
	if (rtok == '\0')
		ERROR "empty regular expression" FATAL;
	np = regexp();
	if (rtok == '\0') {
		return (np);
	} else {
		ERROR "syntax error in regular expression %s at %s",
		    lastre, prestr FATAL;
	}
	/*NOTREACHED*/
	return (NULL);
}

static Node *
regexp(void)
{
	return (alt(concat(primary())));
}

static Node *
primary(void)
{
	Node *np;

	switch (rtok) {
	case CHAR:
		np = op2(CHAR, NIL, (Node *)rlxval);
		rtok = relex();
		return (unary(np));
	case ALL:
		rtok = relex();
		return (unary(op2(ALL, NIL, NIL)));
	case DOT:
		rtok = relex();
		return (unary(op2(DOT, NIL, NIL)));
	case CCL:
		/*LINTED align*/
		np = op2(CCL, NIL, (Node *)cclenter(rlxstr));
		rtok = relex();
		return (unary(np));
	case NCCL:
		/*LINTED align*/
		np = op2(NCCL, NIL, (Node *)cclenter(rlxstr));
		rtok = relex();
		return (unary(np));
	case '^':
		rtok = relex();
		return (unary(op2(CHAR, NIL, (Node *)HAT)));
	case '$':
		rtok = relex();
		return (unary(op2(CHAR, NIL, NIL)));
	case '(':
		rtok = relex();
		if (rtok == ')') {	/* special pleading for () */
			rtok = relex();
			return (unary(op2(CCL, NIL,
			    /*LINTED align*/
			    (Node *)tostring((uchar *)""))));
		}
		np = regexp();
		if (rtok == ')') {
			rtok = relex();
			return (unary(np));
		} else {
			ERROR "syntax error in regular expression %s at %s",
			    lastre, prestr FATAL;
		}
	default:
		ERROR "illegal primary in regular expression %s at %s",
		    lastre, prestr FATAL;
	}
	/*NOTREACHED*/
	return (NULL);
}

static Node *
concat(Node *np)
{
	switch (rtok) {
	case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
		return (concat(op2(CAT, np, primary())));
	default:
		return (np);
	}
}

static Node *
alt(Node *np)
{
	if (rtok == OR) {
		rtok = relex();
		return (alt(op2(OR, np, concat(primary()))));
	}
	return (np);
}

static Node *
unary(Node *np)
{
	switch (rtok) {
	case STAR:
		rtok = relex();
		return (unary(op2(STAR, np, NIL)));
	case PLUS:
		rtok = relex();
		return (unary(op2(PLUS, np, NIL)));
	case QUEST:
		rtok = relex();
		return (unary(op2(QUEST, np, NIL)));
	default:
		return (np);
	}
}

static int
relex(void)		/* lexical analyzer for reparse */
{
	register int c;
	uchar *cbuf;
	int clen, cflag;

	switch (c = *prestr++) {
	case '|': return OR;
	case '*': return STAR;
	case '+': return PLUS;
	case '?': return QUEST;
	case '.': return DOT;
	case '\0': prestr--; return '\0';
	case '^':
	case '$':
	case '(':
	case ')':
		return (c);
	case '\\':
		if ((c = *prestr++) == 't')
			c = '\t';
		else if (c == 'n')
			c = '\n';
		else if (c == 'f')
			c = '\f';
		else if (c == 'r')
			c = '\r';
		else if (c == 'b')
			c = '\b';
		else if (c == '\\')
			c = '\\';
		else if (isdigit(c)) {
			int n = c - '0';
			if (isdigit(*prestr)) {
				n = 8 * n + *prestr++ - '0';
				if (isdigit(*prestr))
					n = 8 * n + *prestr++ - '0';
			}
			c = n;
		} /* else it's now in c */
		rlxval = c;
		return (CHAR);
	default:
		rlxval = c;
		return (CHAR);
	case '[':
		clen = 0;
		if (*prestr == '^') {
			cflag = 1;
			prestr++;
		} else
			cflag = 0;
		init_buf(&cbuf, NULL, strlen((char *)prestr) * 2 + 1);
		for (;;) {
			if ((c = *prestr++) == '\\') {
				cbuf[clen++] = '\\';
				if ((c = *prestr++) == '\0') {
					ERROR
			"nonterminated character class %s", lastre FATAL;
				}
				cbuf[clen++] = c;
			} else if (c == ']') {
				cbuf[clen] = 0;
				rlxstr = tostring(cbuf);
				free(cbuf);
				if (cflag == 0)
					return (CCL);
				else
					return (NCCL);
			} else if (c == '\n') {
				ERROR "newline in character class %s...",
				    lastre FATAL;
			} else if (c == '\0') {
				ERROR "nonterminated character class %s",
				    lastre FATAL;
			} else
				cbuf[clen++] = c;
		}
		/*NOTREACHED*/
	}
	return (0);
}

static int
cgoto(fa *f, int s, int c)
{
	register int i, j, k;
	register int *p, *q;

	for (i = 0; i <= f->accept; i++)
		setvec[i] = 0;
	setcnt = 0;
	/* compute positions of gototab[s,c] into setvec */
	p = f->posns[s];
	for (i = 1; i <= *p; i++) {
		if ((k = f->re[p[i]].ltype) != FINAL) {
			if (k == CHAR && c == f->re[p[i]].lval ||
			    k == DOT && c != 0 && c != HAT ||
			    k == ALL && c != 0 ||
			    k == CCL &&
			    member(c, (uchar *)f->re[p[i]].lval) ||
			    k == NCCL &&
			    !member(c, (uchar *)f->re[p[i]].lval) &&
			    c != 0 && c != HAT) {
				q = f->re[p[i]].lfollow;
				for (j = 1; j <= *q; j++) {
					if (setvec[q[j]] == 0) {
						setcnt++;
						setvec[q[j]] = 1;
					}
				}
			}
		}
	}
	/* determine if setvec is a previous state */
	tmpset[0] = setcnt;
	j = 1;
	for (i = f->accept; i >= 0; i--)
		if (setvec[i]) {
			tmpset[j++] = i;
		}
	/* tmpset == previous state? */
	for (i = 1; i <= f->curstat; i++) {
		p = f->posns[i];
		if ((k = tmpset[0]) != p[0])
			goto different;
		for (j = 1; j <= k; j++)
			if (tmpset[j] != p[j])
				goto different;
		/* setvec is state i */
		f->gototab[s][c] = i;
		return (i);
	different:;
	}

	/* add tmpset to current set of states */
	if (f->curstat >= NSTATES-1) {
		f->curstat = 2;
		f->reset = 1;
		for (i = 2; i < NSTATES; i++)
			xfree(f->posns[i]);
	} else
		++(f->curstat);
	for (i = 0; i < NCHARS; i++)
		f->gototab[f->curstat][i] = 0;
	xfree(f->posns[f->curstat]);
	if ((p = (int *)calloc(1, (setcnt + 1) * sizeof (int))) == NULL)
		overflo("out of space in cgoto");

	f->posns[f->curstat] = p;
	f->gototab[s][c] = f->curstat;
	for (i = 0; i <= setcnt; i++)
		p[i] = tmpset[i];
	if (setvec[f->accept])
		f->out[f->curstat] = 1;
	else
		f->out[f->curstat] = 0;
	return (f->curstat);
}

static void
freefa(fa *f)
{

	register int i;

	if (f == NULL)
		return;
	for (i = 0; i <= f->curstat; i++)
		xfree(f->posns[i]);
	for (i = 0; i <= f->accept; i++)
		xfree(f->re[i].lfollow);
	xfree(f->restr);
	xfree(f);
}