V9/jtools/src/sam/regexp.c
#include "sam.h"
Range sel;
String lastregexp;
/*
* Machine Information
*/
typedef struct Inst{
int type; /* < 0x200 ==> literal, otherwise action */
struct Inst *right;
struct Inst *left;
}Inst;
#define next left /* Left branch is usually just next ptr */
#define NPROG 1024
Inst program[NPROG];
Inst *progp;
Inst *startinst; /* First inst. of program; might not be program[0] */
Inst *bstartinst; /* same for backwards machine */
typedef struct Ilist{
Inst *inst; /* Instruction of the thread */
Posn startp; /* first char of match */
}Ilist;
#define NLIST 128
Ilist *tl, *nl; /* This list, next list */
Ilist list[2][NLIST];
/*
* Actions and Tokens
*
* 0x2xx are operators, value == precedence
* 0x3xx are tokens, i.e. operands for operators
*/
#define OPERATOR 0x200 /* Bitmask of all operators */
#define START 0x200 /* Start, used for marker on stack */
#define RBRA 0x201 /* Right bracket, ) */
#define LBRA 0x202 /* Left bracket, ( */
#define OR 0x203 /* Alternation, | */
#define CAT 0x204 /* Concatentation, implicit operator */
#define STAR 0x205 /* Closure, * */
#define PLUS 0x206 /* a+ == aa* */
#define QUEST 0x207 /* a? == a|nothing, i.e. 0 or 1 a's */
#define ANY 0x300 /* Any character but newline, . */
#define ANYNL 0x301 /* Any character, including newline, @ */
#define NOP 0x302 /* No operation, internal use only */
#define BOL 0x303 /* Beginning of line, ^ */
#define EOL 0x304 /* End of line, $ */
#define CCLASS 0x305 /* Character class, [] */
#define END 0x377 /* Terminate: match found */
/*
* Parser Information
*/
typedef struct Node{
Inst *first;
Inst *last;
}Node;
#define NSTACK 20
Node andstack[NSTACK];
Node *andp;
int atorstack[NSTACK];
int *atorp;
int lastwasand; /* Last token was operand */
int backwards;
int nbra;
uchar *exprp; /* pointer to next character in source expression */
#define NCLASS 8
int nclass;
char class[NCLASS][32]; /* 32 bytes == 256 bits, one bit per char */
Inst *
newinst(t)
int t;
{
if(progp>= &program[NPROG])
error(Etoolong);
progp->type=t;
progp->left=0;
progp->right=0;
return progp++;
}
Inst *
realcompile(s)
uchar *s;
{
register token;
startlex(s);
atorp=atorstack;
andp=andstack;
lastwasand=FALSE;
/* Start with a low priority operator to prime parser */
pushator(START-1);
while((token=lex()) != END){
if((token&0x300) == OPERATOR)
operator(token);
else
operand(token);
}
/* Close with a low priority operator */
evaluntil(START);
/* Force END */
operand(END);
evaluntil(START);
if(nbra)
error(Eleftpar);
--andp; /* points to first and only operand */
return andp->first;
}
compile(r)
Regexp *r;
{
register String *s= &r->text;
Inst *oprogp;
if(s->n==lastregexp.n &&
strncmp(s->s, lastregexp.s, lastregexp.n)==0)
return;
progp=program;
backwards=FALSE;
startinst=realcompile(s->s);
optimize(program);
oprogp=progp;
backwards=TRUE;
bstartinst=realcompile(s->s);
optimize(oprogp);
strdupstr(&lastregexp, s);
}
operand(t)
int t;
{
register Inst *i;
if(lastwasand)
operator(CAT); /* catenate is implicit */
i=newinst(t);
if(t==CCLASS) /* ugh */
i->right=(Inst *)class[nclass-1]; /* UGH! */
pushand(i, i);
lastwasand=TRUE;
}
operator(t)
int t;
{
if(t==RBRA && --nbra<0)
error(Erightpar);
if(t==LBRA){
nbra++;
if(lastwasand)
operator(CAT);
}else
evaluntil(t);
if(t!=RBRA)
pushator(t);
lastwasand=FALSE;
if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
lastwasand=TRUE; /* these look like operands */
}
cant(s)
char *s;
{
char buf[100];
sprint(buf, "can't happen: %s", s);
panic(buf);
}
pushand(f, l)
Inst *f, *l;
{
if(andp >= &andstack[NSTACK])
cant("operand stack overflow");
andp->first=f;
andp->last=l;
andp++;
}
pushator(t)
int t;
{
if(atorp >= &atorstack[NSTACK])
cant("operator stack overflow");
*atorp++=t;
}
Node *
popand(op)
{
if(andp <= &andstack[0])
error_c(Emissop, op);
return --andp;
}
popator()
{
if(atorp <= &atorstack[0])
cant("operator stack underflow");
return *--atorp;
}
evaluntil(pri)
register pri;
{
register Node *op1, *op2, *t;
register Inst *inst1, *inst2;
while(pri==RBRA || atorp[-1]>=pri){
switch(popator()){
case LBRA:
return; /* must have been RBRA */
default:
panic("unknown regexp operator");
break;
case OR:
op2=popand('|');
op1=popand('|');
inst2=newinst(NOP);
op2->last->next=inst2;
op1->last->next=inst2;
inst1=newinst(OR);
inst1->right=op1->first;
inst1->left=op2->first;
pushand(inst1, inst2);
break;
case CAT:
op2=popand(0);
op1=popand(0);
if(backwards && op2->first->type!=END)
t=op1, op1=op2, op2=t;
op1->last->next=op2->first;
pushand(op1->first, op2->last);
break;
case STAR:
op2=popand('*');
inst1=newinst(OR);
op2->last->next=inst1;
inst1->right=op2->first;
pushand(inst1, inst1);
break;
case PLUS:
op2=popand('+');
inst1=newinst(OR);
op2->last->next=inst1;
inst1->right=op2->first;
pushand(op2->first, inst1);
break;
case QUEST:
op2=popand('?');
inst1=newinst(OR);
inst2=newinst(NOP);
inst1->left=inst2;
inst1->right=op2->first;
op2->last->next=inst2;
pushand(inst1, inst2);
break;
}
}
}
optimize(start)
register Inst *start;
{
register Inst *inst, *target;
for(inst=start; inst->type!=END; inst++){
target=inst->next;
while(target->type == NOP)
target=target->next;
inst->next=target;
}
}
#ifdef DEBUG
dumpstack(){
Node *stk;
int *ip;
dprint("operators\n");
for(ip=atorstack; ip<atorp; ip++)
dprint("0%o\n", *ip);
dprint("operands\n");
for(stk=andstack; stk<andp; stk++)
dprint("0%o\t0%o\n", stk->first->type, stk->last->type);
}
dump(){
Inst *l;
l=program;
do{
dprint("%d:\t0%o\t%d\t%d\n", l-program, l->type,
l->left-program, l->right-program);
}while(l++->type);
}
#endif
startlex(s)
uchar *s;
{
exprp=s;
nclass=0;
nbra=0;
}
char *
newclass(){
register char *p;
register n;
if(nclass >= NCLASS)
error(Eclass);
p=class[nclass++];
for(n=0; n<32; n++)
p[n]=0;
return p;
}
lex(){
register c= *exprp++;
switch(c){
case '\\':
if(*exprp)
if((c= *exprp++)=='n')
c='\n';
break;
case 0:
c=END;
--exprp; /* In case we come here again */
break;
case '*':
c=STAR;
break;
case '?':
c=QUEST;
break;
case '+':
c=PLUS;
break;
case '|':
c=OR;
break;
case '.':
c=ANY;
break;
case '@':
c=ANYNL;
break;
case '(':
c=LBRA;
break;
case ')':
c=RBRA;
break;
case '^':
c=BOL;
break;
case '$':
c=EOL;
break;
case '[':
c=CCLASS;
bldcclass();
break;
}
return c;
}
nextrec(){
if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0))
error(Ebadclass);
if(exprp[0]=='\\'){
exprp++;
if(*exprp=='n'){
exprp++;
return '\n';
}
return *exprp++|0x100;
}
return *exprp++;
}
bldcclass(){
register c1, c2;
register char *classp;
register negate=FALSE;
classp=newclass();
/* we have already seen the '[' */
if(*exprp=='^'){
negate=TRUE;
exprp++;
}
while((c1=c2=nextrec()) != ']'){
if(*exprp=='-'){
exprp++; /* eat '-' */
if((c2=nextrec()) == ']')
error(Ebadclass);
}
for((c1&=0xFF), (c2&=0xFF); c1<=c2; c1++)
classp[c1/8] |= 1<<(c1&07);
}
if(negate){
for(c1=0; c1<32; c1++)
classp[c1]^=0xFF;
classp[1]&=~(1<<('\n'-8)); /* exclude '\n' */
}
classp[0]&=0xFE; /* exclude NUL */
}
/*
* Note optimization in addinst:
* *l must be pending when addinst called; if *l has been looked
* at already, the optimization is a bug.
*/
addinst(l, inst, startp)
register Ilist *l;
register Inst *inst;
Posn startp;
{
register Ilist *p;
for(p=l; p->inst; p++){
if(p->inst==inst){
if(startp < p->startp)
p->startp=startp; /* this would be bug */
return; /* It's already there */
}
}
p->inst=inst;
p->startp=startp;
(++p)->inst=0;
}
execute(f, startp, eof)
File *f;
Posn startp, eof;
{
register flag=0;
register Inst *inst;
register Ilist *tlp;
register Posn p=startp;
int nnl=0, ntl;
register c;
int wrapped=0;
register startchar=startinst->type<OPERATOR? startinst->type : 0;
list[0][0].inst=list[1][0].inst=0;
sel.p1= -1;
Fgetcset(f, startp);
/* Execute machine once for each character */
for(;;p++){
doloop:
c=Fgetc(f);
if(p>=eof || c<0){
switch(wrapped++){
case 0: /* let loop run one more click */
case 2:
break;
case 1: /* expired; wrap to beginning */
if(sel.p1>=0 || eof!=INFINITY)
goto Return;
list[0][0].inst=list[1][0].inst=0;
Fgetcset(f, (Posn)0);
p=0;
goto doloop;
default:
goto Return;
}
}else if(((wrapped && p>=startp) || sel.p1>0) && nnl==0)
break;
/* fast check for first char */
if(startchar && nnl==0 && c!=startchar)
continue;
tl=list[flag];
nl=list[flag^=1];
nl->inst=0;
ntl=nnl;
nnl=0;
if(sel.p1<0 && (!wrapped || p<startp || startp==eof)){
/* Add first instruction to this list */
if(++ntl >= NLIST)
Overflow:
error(Eoverflow);
addinst(tl, startinst, p);
}
/* Execute machine until this list is empty */
for(tlp=tl; inst=tlp->inst; tlp++){ /* assignment = */
Switchstmt:
switch(inst->type){
default: /* regular character */
if(inst->type==c){
Addinst:
if(++nnl >= NLIST)
goto Overflow;
addinst(nl, inst->next, tlp->startp);
}
break;
case ANYNL:
goto Addinst;
case ANY:
if(c!='\n')
goto Addinst;
break;
case BOL:
if(p==0){
Step:
inst=inst->next;
goto Switchstmt;
}
if(f->getci>1){
if(f->getcbuf[f->getci-2]=='\n')
goto Step;
}else{
uchar c;
if(Fchars(f, &c, p-1, p)==1 && c=='\n')
goto Step;
}
break;
case EOL:
if(c=='\n')
goto Step;
break;
case CCLASS:
if(c>=0 && ((char *)inst->right)[c/8]&(1<<(c&07)))
goto Addinst;
break;
case OR:
/* evaluate right choice later */
if(++ntl >= NLIST)
goto Overflow;
addinst(tlp, inst->right, tlp->startp);
/* efficiency: advance and re-evaluate */
inst=inst->left;
goto Switchstmt;
case END: /* Match! */
newmatch(tlp->startp, p);
break;
}
}
}
Return:
return sel.p1>=0;
}
newmatch(p1, p2)
Posn p1, p2;
{
if(sel.p1<0 || p1<sel.p1 || (p1==sel.p1 && p2>sel.p2)){
sel.p1=p1;
sel.p2=p2;
}
}
bexecute(f, startp)
register File *f;
Posn startp;
{
register flag=0;
register Inst *inst;
register Ilist *tlp;
register Posn p=startp;
int nnl=0, ntl;
register c;
int wrapped=0;
register startchar=bstartinst->type<OPERATOR? bstartinst->type : 0;
list[0][0].inst=list[1][0].inst=0;
sel.p1= -1;
Fgetcset(f, startp);
/* Execute machine once for each character, including terminal NUL */
for(;;--p){
doloop:
if((c=Fbgetc(f))==-1){
switch(wrapped++){
case 0: /* let loop run one more click */
case 2:
break;
case 1: /* expired; wrap to end */
if(sel.p1>=0)
case 3:
goto Return;
list[0][0].inst=list[1][0].inst=0;
Fgetcset(f, f->nbytes);
p=f->nbytes;
goto doloop;
default:
goto Return;
}
}else if(((wrapped && p<=startp) || sel.p1>0) && nnl==0)
break;
/* fast check for first char */
if(startchar && nnl==0 && c!=startchar)
continue;
tl=list[flag];
nl=list[flag^=1];
nl->inst=0;
ntl=nnl;
nnl=0;
if(sel.p1<0 && (!wrapped || p>startp)){
/* Add first instruction to this list */
if(++ntl >= NLIST)
Overflow:
error(Eoverflow);
/* the minus is so the optimizations in addinst work */
addinst(tl, bstartinst, -p);
}
/* Execute machine until this list is empty */
for(tlp=tl; inst=tlp->inst; tlp++){ /* assignment = */
Switchstmt:
switch(inst->type){
default: /* regular character */
if(inst->type==c){
Addinst:
if(++nnl >= NLIST)
goto Overflow;
addinst(nl, inst->next, tlp->startp);
}
break;
case ANYNL:
goto Addinst;
case ANY:
if(c!='\n')
goto Addinst;
break;
case BOL:
if(c=='\n'){
Step:
inst=inst->next;
goto Switchstmt;
}
break;
case EOL:
if(f->getci<f->ngetc-1){
if(f->getcbuf[f->getci+1]=='\n')
goto Step;
}else if(p<f->nbytes-1){
uchar c;
if(Fchars(f, &c, p+1, p+2)==1 && c=='\n')
goto Step;
}
break;
case CCLASS:
if(((char *)inst->right)[c/8]&(1<<(c&07)))
goto Addinst;
break;
case OR:
/* evaluate right choice later */
if(++ntl >= NLIST)
goto Overflow;
addinst(tlp, inst->right, tlp->startp);
/* efficiency: advance and re-evaluate */
inst=inst->left;
goto Switchstmt;
case END: /* Match! */
bnewmatch(-tlp->startp, p); /* minus sign */
break;
}
}
}
Return:
return sel.p1>=0;
}
bnewmatch(p2, p1) /* note the reversal; p1<=p2 */
Posn p1, p2;
{
if(sel.p1<0 || p2>sel.p2 || (p2==sel.p2 && p1<sel.p1)){
sel.p1=p1;
sel.p2=p2;
}
}