#include "regprog.h" /* * Parser Information */ typedef struct Node{ Inst *first; Inst *last; }Node; #define NSTACK 20 static Node andstack[NSTACK]; static Node *andp; static int atorstack[NSTACK]; static int *atorp; static int cursubid; /* id of current subexpression */ static int subidstack[NSTACK]; /* parallel to atorstack */ static int *subidp; static int lastwasand; /* Last token was operand */ static int nbra; static char *exprp; /* pointer to next character in source expression */ static int nclass; static Class *classp; static Inst *freep; static int errors; /* predeclared crap */ static void operator(); static void pushand(); static void pushator(); static void evaluntil(); static void bldcclass(); static void rcerror(s) char *s; { errors++; regerror(s); } static Inst * newinst(t) int t; { freep->type=t; freep->left=0; freep->right=0; return freep++; } static void operand(t) int t; { register Inst *i; if(lastwasand) operator(CAT); /* catenate is implicit */ i=newinst(t); if(t==CCLASS) /* ugh */ i->right=(Inst *)&(classp[nclass-1]); /* UGH! */ pushand(i, i); lastwasand=TRUE; } static void operator(t) int t; { if(t==RBRA && --nbra<0) rcerror("unmatched right paren"); if(t==LBRA) { if (++cursubid >= NSUBEXP) rcerror ("too many subexpressions"); nbra++; if (lastwasand) operator(CAT); } else evaluntil(t); if(t!=RBRA) pushator(t); lastwasand=FALSE; if(t==STAR || t==QUEST || t==PLUS || t==RBRA) lastwasand=TRUE; /* these look like operands */ } static void regerr2(s, c) char *s; { char buf[100]; char *cp = buf; while(*s) *cp++ = *s++; *cp++ = c; *cp = '\0'; rcerror(buf); } static void cant(s) char *s; { char buf[100]; strcpy(buf, "can't happen: "); strcat(buf, s); rcerror(buf); } static void pushand(f, l) Inst *f, *l; { if(andp >= &andstack[NSTACK]) cant("operand stack overflow"); andp->first=f; andp->last=l; andp++; } static void pushator(t) int t; { if(atorp >= &atorstack[NSTACK]) cant("operator stack overflow"); *atorp++=t; *subidp++=cursubid; } static Node * popand(op) { register Inst *inst; if(andp <= &andstack[0]) { regerr2("missing operand for ", op); inst=newinst(NOP); pushand(inst,inst); } return --andp; } static int popator() { if(atorp <= &atorstack[0]) cant("operator stack underflow"); --subidp; return *--atorp; } static void evaluntil(pri) register pri; { register Node *op1, *op2; register Inst *inst1, *inst2; while(pri==RBRA || atorp[-1]>=pri){ switch(popator()){ default: rcerror("unknown operator in evaluntil"); break; case LBRA: /* must have been RBRA */ op1=popand('('); inst2=newinst(RBRA); inst2->subid = *subidp; op1->last->next = inst2; inst1=newinst(LBRA); inst1->subid = *subidp; inst1->next=op1->first; pushand(inst1, inst2); return; case OR: op2=popand('|'); op1=popand('|'); inst2=newinst(NOP); op2->last->next=inst2; op1->last->next=inst2; inst1=newinst(OR); inst1->right=op1->first; inst1->left=op2->first; pushand(inst1, inst2); break; case CAT: op2=popand(0); op1=popand(0); op1->last->next=op2->first; pushand(op1->first, op2->last); break; case STAR: op2=popand('*'); inst1=newinst(OR); op2->last->next=inst1; inst1->right=op2->first; pushand(inst1, inst1); break; case PLUS: op2=popand('+'); inst1=newinst(OR); op2->last->next=inst1; inst1->right=op2->first; pushand(op2->first, inst1); break; case QUEST: op2=popand('?'); inst1=newinst(OR); inst2=newinst(NOP); inst1->left=inst2; inst1->right=op2->first; op2->last->next=inst2; pushand(inst1, inst2); break; } } } static void optimize(pp) Prog *pp; { register Inst *inst, *target; for(inst=pp->firstinst; inst->type!=END; inst++){ target=inst->next; while(target->type == NOP) target=target->next; inst->next=target; } } #ifdef DEBUG static void dumpstack(){ Node *stk; int *ip; printf("operators\n"); for(ip=atorstack; ip<atorp; ip++) printf("0%o\n", *ip); printf("operands\n"); for(stk=andstack; stk<andp; stk++) printf("0%o\t0%o\n", stk->first->type, stk->last->type); } static void dump(pp) Prog *pp; { Inst *l; l=pp->firstinst; do{ printf("%d:\t0%o\t%d\t%d\n", l-pp->firstinst, l->type, l->left-pp->firstinst, l->right-pp->firstinst); }while(l++->type); } #endif static void startlex(s) char *s; { exprp=s; nclass=0; nbra=0; } static Class * newclass(){ register Class *p; register n; if(nclass >= NCLASS) regerr2("too many character classes; limit", NCLASS+'0'); p = &(classp[nclass++]); for(n=0; n<16; n++) p->map[n]=0; return p; } static int lex(){ register c= *exprp++; switch(c){ case '\\': if(*exprp) c= *exprp++; break; case 0: c=END; --exprp; /* In case we come here again */ break; case '*': c=STAR; break; case '?': c=QUEST; break; case '+': c=PLUS; break; case '|': c=OR; break; case '.': c=ANY; break; case '(': c=LBRA; break; case ')': c=RBRA; break; case '^': c=BOL; break; case '$': c=EOL; break; case '[': c=CCLASS; bldcclass(); break; } return c; } static int nextc(){ if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0)) rcerror("malformed '[]'"); if(exprp[0]=='\\'){ exprp++; return *exprp++|0200; } return *exprp++; } static void bldcclass(){ register c1, c2; register Class *classp; register negate=FALSE; classp=newclass(); /* we have already seen the '[' */ if(*exprp=='^'){ negate=TRUE; exprp++; } while((c1=c2=nextc()) != ']'){ if(*exprp=='-'){ exprp++; /* eat '-' */ if((c2=nextc()) == ']') rcerror("malformed '[]'"); } for((c1&=0177), (c2&=0177); c1<=c2; c1++) classp->map[c1/8] |= 1<<(c1&07); } if(negate) for(c1=0; c1<16; c1++) classp->map[c1]^=0377; classp->map[0] &= 0376; /* exclude NUL */ } extern regexp * regcomp(s) char *s; { register token; Prog *pp; /* get memory for the program */ pp = (Prog *)malloc(sizeof(Prog) + 3*sizeof(Inst)*strlen(s)); if (pp == NULL) { rcerror("out of memory"); return NULL; } freep = pp->firstinst; classp = pp->class; errors = 0; /* go compile the sucker */ startlex(s); atorp=atorstack; andp=andstack; subidp=subidstack; lastwasand=FALSE; cursubid=0; /* Start with a low priority operator to prime parser */ pushator(START-1); while((token=lex()) != END){ if((token&0300) == OPERATOR) operator(token); else operand(token); } /* Close with a low priority operator */ evaluntil(START); /* Force END */ operand(END); evaluntil(START); #ifdef DEBUG dumpstack(); #endif if(nbra) rcerror("unmatched left paren"); --andp; /* points to first and only operand */ pp->startinst=andp->first; #ifdef DEBUG dump(pp); #endif optimize(pp); #ifdef DEBUG printf("start: %d\n", andp->first-pp->firstinst); dump(pp); #endif if (errors) { free(pp); pp = NULL; } return (regexp *)pp; }