V10/cmd/gre/eg.c

Compare this file to the similar file:
Show the results in this format:

#include	"re.h"
#include	"lre.h"
#include	"hdr.h"

int re_debug = 0;

#define	RESET(r, ps)	COPY(r, ps, begin)
#define	SET(r, ps, n)	{if(r->ps.base[n] == 0) r->ps.count++, r->ps.base[n] = r->ps.last, r->ps.last = n; }
#define	GET(ps, n)	for(n = ps.last; n > 0; n = ps.base[n])
#define	COPY(r, to, from)	memmove((char *)r->to.base, (char *)r->from.base, r->maxid*sizeof(int)), r->to.count = r->from.count, r->to.last = r->from.last

static State *addstate(re_re *r, Positionset *ps);
static int first(re_re *, Expr *);
static int match(re_re *, Expr *, int);
static State *nextstate(re_re *, State *, int);
#ifdef	DEBUG
static void ppr(Positionset *, char *);
#endif

typedef enum { NORMAL, ACCEPT, ACCEPT_EOP, FAIL } Out;

static void
eptr(register re_re *r, register Expr *e)
{
	if((e->id < 0) || (e->id >= r->maxid)){
		EPR "eptr abort: r=%ld[maxid=%d] e=%ld[id=%d]\n", r, r->maxid, e, e->id);
		abort();
	}
	r->ptr[e->id] = e;
	if((e->type != Charclass) && (e->type != Compcharclass)){
		if(e->l)
			eptr(r, e->l);
		if(e->r)
			eptr(r, e->r);
	}
}

re_re *
egprep(enum Parsetype parser, unsigned char *b, unsigned char *e, unsigned char *map, int dotstar)
{
	register re_re *r;
	Expr *ecomp;
	int i;

	r = (re_re *)egmalloc(sizeof (re_re), "allocating re_re");
	if(r == 0)
		return 0;
	memset((char *)r, 0, sizeof (re_re));
	eg_lexinit((char *)b, (char *)e);
	if(map)
		memmove(r->mymap, (char *)map, 256);
	else
		for(i = 0; i < 256; i++)
			r->mymap[i] = i;
	eg_lex();
	if(dotstar){
		ecomp = eg_newexpr(Star, 0, eg_newexpr(Dot, '.', (Expr *)0, (Expr *)0), (Expr *)0);
		ecomp = eg_newexpr(Cat, 0, ecomp, eg_eall(parser, r->mymap));
	} else
		ecomp = eg_eall(parser, r->mymap);
	r->root = eg_newexpr(EOP, '#', ecomp, (Expr *)0);
	egpost(r);
	r->carat = 0;
	if(r->backref || r->parens)
		egbr(r);
	else
		eginit(r, dotstar);
	return(r);
}

void
eginit(register re_re *r, int dotstar)
{
	int n;

#ifdef	DEBUG
	if(TRACE(6))
		PR "eginit(r=%d, dotstar=%d)\n", r, dotstar);
#endif
#ifdef	DEBUG
	if(TRACE(10)){
		char buf[EPRINTSIZE];
		eg_epr(r->root, buf, 0);
		PR "eginit: r=%d expr='%s'\n", r, buf);
	}
#endif
	r->ptr = (Expr **)egmalloc(r->maxid*sizeof(Expr *), "ptr");
	if (!r->ptr)
		return;
	eptr(r, r->root);
	r->firstpos.base = (int *)egmalloc(n = r->maxid*sizeof(int), "first base");
	if (!r->firstpos.base){
		free((char*)r->ptr);
		return;
	}
	r->begin.base = (int *)egmalloc(n, "begin base");
	if (!r->begin.base){
		free((char*)r->firstpos.base);
		free((char*)r->ptr);
		return;
	}
	r->tmp.base = (int *)egmalloc(n, "tmp base");
	if (!r->tmp.base){
		free((char*)r->begin.base);
		free((char*)r->firstpos.base);
		free((char*)r->ptr);
		return;
	}
	memset((char *)r->begin.base, 0, n);
	r->begin.count = 0;
	r->begin.last = -1;
	r->carat = dotstar;
	RESET(r, firstpos);
	if(first(r, r->root->l) == 0){
		/*
			nullable, so include me
		*/
		SET(r, firstpos, r->root->id)
	}
	if(dotstar)
		COPY(r, begin, firstpos);
	eg_stateinit(r);
	eg_clrstates(r);
	eg_posinit(r);
	(void)addstate(r, &r->firstpos);
	eg_savestate(r, dotstar? nextstate(r, r->states, RE_CARAT):r->states);
	eg_posset(r);
}

unsigned char *
eg_quickmatch(register re_re *r, register unsigned char *b, register unsigned char *e, int endpts)
{
	register State *s, *t;

#ifdef	DEBUG
	if(TRACE(10)){
		char buf[EPRINTSIZE];
		eg_epr(r->root, buf, 0);
		PR "qm: r=%d expr='%s' endpts=%d\n", r, buf, endpts);
	}
#endif
	s = eg_startstate(r);
	if(endpts&RE_BEG){
		if(!r->carat){
			if(t = s->tab[RE_CARAT])
				s = t;
			else
				s = nextstate(r, s, RE_CARAT);
			if(s->out == FAIL)
				s = eg_startstate(r);
		}
		if(s->out){
#ifdef	DEBUG
			if(TRACE(6))
				PR "match at ^: '%s' out=%d\n", b, s->out);
#endif
			return((s->out == FAIL)? 0:b);
		}
	}
	while(b < e){
#ifdef	DEBUG
		if(TRACE(4))
			PR "state %d@%d[%d pos]: char '%c'\n", s-r->states, (int)s, s->npos, *b);
#endif
		if(t = s->tab[*(unsigned char *)b])
			s = t;
		else
			s = nextstate(r, s, *(unsigned char *)b);
		if(s->out){
#ifdef	DEBUG
			if(TRACE(6))
				PR "match at input '%s' out=%d\n", b, s->out);
#endif
			return((s->out == FAIL)? 0:b);
		}
		b++;
	}
	if(endpts&RE_END){
		if(t = s->tab[RE_DOLLAR])
			s = t;
		else
			s = nextstate(r, s, RE_DOLLAR);
	}
	if(s->out){
#ifdef	DEBUG
		if(TRACE(6))
			PR "match at $ '%s' out=%d\n", b, s->out);
#endif
		return((s->out == FAIL)? 0:b);
	}
#ifdef	DEBUG
	if(TRACE(3)){
		char buf[EPRINTSIZE];

		eg_epr(r->root, buf, 1);
		PR "pat = %s\n", buf);
	}
#endif
	return(0);
}

unsigned char *
eg_lquickmatch(register re_re *r, register unsigned char *b, register unsigned char *e, int endpts)
{
	register State *s, *t;
	int outedness = 0;

#ifdef	DEBUG
	if(TRACE(10)){
		char buf[EPRINTSIZE];
		eg_epr(r->root, buf, 0);
		PR "lqm: r=%d carat=%d expr='%s' endpts=%d\n", r, r->carat, buf, endpts);
	}
#endif
	s = eg_startstate(r);
	if(endpts&RE_BEG){
		if(!r->carat){
			if(t = s->tab[RE_CARAT])
				s = t;
			else
				s = nextstate(r, s, RE_CARAT);
			if(s->out == FAIL)
				s = eg_startstate(r);
		}
		if(s->out){
#ifdef	DEBUG
			if(TRACE(6))
				PR "match at ^: '%s' out=%d\n", b, s->out);
#endif
			if(s->out == ACCEPT_EOP)
				return(b);
			else if(s->out == FAIL)
				return(0);
			outedness = 1;
		}
	}
	/*
		look for first match
	*/
	if(outedness == 0)
	for(; b < e; b++){
#ifdef	DEBUG
		if(TRACE(4))
			PR "state %d@%d[%d pos]: char '%c'\n", s-r->states, (int)s, s->npos, *b);
#endif
		if(t = s->tab[*(unsigned char *)b])
			s = t;
		else
			s = nextstate(r, s, *(unsigned char *)b);
		if(s->out){
#ifdef	DEBUG
			if(TRACE(4))
				PR "	out=%d, state %d@%d\n", s->out,
					s-r->states, (int)s);
#endif
			b++;
			if((s->out == ACCEPT_EOP) || (s->out == FAIL))
				goto finish;
			outedness = 1;
			break;
		}
	}
	/*
		in the following loop, we have seen a match already
		because only way outedness is zero is if b>=e
	*/
	for(; b < e; b++){
#ifdef	DEBUG
		if(TRACE(4))
			PR "statex %d@%d[%d pos]: char '%c'\n", s-r->states, (int)s, s->npos, *b);
#endif
		if(t = s->tab[*(unsigned char *)b])
			s = t;
		else
			s = nextstate(r, s, *(unsigned char *)b);
		if(s->out == ACCEPT)
			continue;
		if((s->out == NORMAL) || (s->out == FAIL)){
#ifdef	DEBUG
			if(TRACE(6))
				PR "unmatch at input '%s' out=%d\n", b, s->out);
#endif
			return(b);
		}
	}
	if(endpts&RE_END){
		if(t = s->tab[RE_DOLLAR])
			s = t;
		else
			s = nextstate(r, s, RE_DOLLAR);
	}
finish:
	if((s->out && (s->out != FAIL)) || outedness){
#ifdef	DEBUG
		if(TRACE(6))
			PR "match at $ '%s' out=%d\n", b, s->out);
#endif
		return(b);
	}
#ifdef	DEBUG
	if(TRACE(3)){
		char buf[EPRINTSIZE];

		eg_epr(r->root, buf, 1);
		PR "lqm('%s') failed; out=%d, s->out=%d \n", buf, outedness, s->out);
	}
#endif
	return(0);
}

static
match(register re_re *r, register Expr *e, int a)
{
	switch(e->type)
	{
	case Dollar:	return(a == RE_DOLLAR);
	case Carat:	return(a == RE_CARAT);
	case Dot:	return(a < 256);
	case Literal:	return(r->mymap[a] == r->mymap[e->lit]);
	case Charclass:	if(a >= 256) return(0);
			return(memchr((char *)e->r, r->mymap[a], (int)e->l) != 0);
	case Compcharclass:
			if(a >= 256) return(0);
			return(memchr((char *)e->r, r->mymap[a], (int)e->l) == 0);
	}
	return(0);
}

	/*
		generates the followset for a node in firstpos
	*/

static void
follow(register re_re *r, register Expr *e)
{
	register Expr *p;

	if(e->type == EOP)
		return;
	else
		p = e->parent;
	switch(p->type)
	{
	case EOP:
		SET(r, firstpos, p->id)
		break;
	case Plus:
	case Star:
		(void)first(r, e);
		follow(r, p);
		break;
	case Quest:
	case Alternate:
		follow(r, p);
		break;
	case Cat:
		if(e == p->l){
			if(first(r, p->r) == 0){
				follow(r, p);
				break;
			}
		} else
			follow(r, p);
		break;
	case Group:
		follow(r, p);
		break;
	}
}

	/*
		first returns 0 if e is nullable and in the process,
		sets up firstpos.
	*/

static
first(register re_re *r, register Expr *e)
{
	int k;

	switch(e->type)
	{
	case Carat:
	case Dollar:
	case Literal:
	case Dot:
	case Charclass:
	case Compcharclass:
		SET(r, firstpos, e->id)
		return(1);
	case EOP:
	case Star:
	case Quest:
		(void)first(r, e->l);
		return(0);
	case Group:
	case Plus:
		return(first(r, e->l));
	case Cat:
		return(first(r, e->l) || first(r, e->r));
	case Alternate:
		k = first(r, e->r);
		return(first(r, e->l) && k);
	default:
		EPR "bad type %d\n", e->type);
		abort();
		return(0);
	}
}

static void
efollow(register re_re *r, register Expr *e)
{
	register i, *p;

	RESET(r, firstpos);
	follow(r, e);
	e->flen = r->firstpos.count;
	e->follow = (int *)malloc((unsigned)(e->flen*sizeof(int)));
	if(e->follow == 0){
		char buf[100];
		SPR buf, "efollow malloc fail(%ld)\n", e->flen);
		re_error(buf);
		return;
	}
	p = e->follow;
	GET(r->firstpos, i)
		*p++ = i;
	if(p != e->follow+e->flen){
		char buf[100];
		SPR buf, "efollow length error: %d %ld\n", p-e->follow, e->flen);
		re_error(buf);
		return;
	}
}

static State *
addstate(register re_re *r, register Positionset *ps)
{
	register *p, j;
	register State *s;

	s = r->states + eg_getstate(r);
	memset((char *)s->tab, 0, sizeof(s->tab));
	s->pos = eg_posalloc(r, (int)ps->count);
	s->npos = ps->count;
	p = r->posbase+s->pos;
	GET((*ps), j)
		*p++ = j;
	if(s->out = (ps->base[r->root->id] != 0)){
		if((ps->base[r->root->id] <= 0) && (ps->last == r->root->id))
			s->out = ACCEPT_EOP;	/* marker for only the EOP marker */
		else
			s->out = ACCEPT;
	}
	if(s->npos == 0)
		s->out = FAIL;
#ifdef	DEBUG
	if(TRACE(3)){
		char buf[2000];
		eg_spr(s->npos, s->pos+r->posbase, buf);
		PR "new state[%d]@%d %s,pos=%d out=%d\n", s-r->states, s, buf, s->pos, s->out);
	}
#endif
	return(s);
}

static State *
nextstate(register re_re *r, register State *s, int a)
{
	register int p, *q, *eq;
	register long i;
	State *news;
	Expr *e;

	RESET(r, tmp);
#ifdef	DEBUG
	if(TRACE(5)){
		char buf[2000];
		ppr(&r->tmp, buf);
		PR "nextstate: reset: %s\n", buf);
	}
#endif
	for(i = s->npos, p = s->pos; i > 0; i--){
		e = r->ptr[r->posbase[p++]];
#ifdef	DEBUG
		if(TRACE(11)){
			PR "i=%d e->type=%d\n", i, e->type);
		}
#endif
		if(e->type == EOP) continue;
		if(match(r, e, a)){
#ifdef	DEBUG
			if(TRACE(11)){PR "matched %c\n", a);}
#endif
			if(e->follow == 0)
				efollow(r, e);
			for(q = e->follow, eq = q+e->flen; q < eq; q++){
				SET(r, tmp, *q)
			}
		}
	}
#ifdef	DEBUG
	if(TRACE(5)){
		char buf[2000];
		ppr(&r->tmp, buf);
		PR "nextstate(%d, '%c'): found %s\n", s-r->states, a, buf);
	}
#endif
	if(news = eg_stateof(r, &r->tmp)){
		if(a <= RE_DOLLAR)
			s->tab[a] = news;
	} else
		news = addstate(r, &r->tmp);
#ifdef	DEBUG
	if(TRACE(5)){
		PR "nextstate(%d, '%c'): returning %ld %d out=%d\n", s-r->states, a,
			(long)news, news-r->states, news->out);
	}
#endif
	return(news);
}

void *
egmalloc(int n, char *s)
{
	char *x;
	char buf[256];

	if((x = malloc(n)) == 0){
		SPR buf, "malloc fail(%d): %s", n, s);
		re_error(buf);
		return 0;
	}
	return((void *)x);
}

void *
egrealloc(char *p, int n, char *s)
{
	char *x;
	char buf[256];

	if((x = realloc(p, n)) == 0){
		SPR buf, "realloc fail(%d): %s", n, s);
		re_error(buf);
		return 0;
	}
	return((void *)x);
}

#ifdef	DEBUG
static void
ppr(register Positionset *ps, register char *p)
{
	register n;

	if(ps->count < 1){
		SPR p, "{}");
		return;
	}
	*p++ = '{';
	GET((*ps), n){
		SPR p, "%d[=%d],", n, ps->base[n]);
		p = strchr(p, 0);
	}
	p[-1] = '}';
}
#endif