#include "sam.h" Range sel; String lastregexp; /* * Machine Information */ typedef struct Inst{ int type; /* < 0x200 ==> literal, otherwise action */ struct Inst *right; struct Inst *left; }Inst; #define next left /* Left branch is usually just next ptr */ #define NPROG 1024 Inst program[NPROG]; Inst *progp; Inst *startinst; /* First inst. of program; might not be program[0] */ Inst *bstartinst; /* same for backwards machine */ typedef struct Ilist{ Inst *inst; /* Instruction of the thread */ Posn startp; /* first char of match */ }Ilist; #define NLIST 128 Ilist *tl, *nl; /* This list, next list */ Ilist list[2][NLIST]; /* * Actions and Tokens * * 0x2xx are operators, value == precedence * 0x3xx are tokens, i.e. operands for operators */ #define OPERATOR 0x200 /* Bitmask of all operators */ #define START 0x200 /* Start, used for marker on stack */ #define RBRA 0x201 /* Right bracket, ) */ #define LBRA 0x202 /* Left bracket, ( */ #define OR 0x203 /* Alternation, | */ #define CAT 0x204 /* Concatentation, implicit operator */ #define STAR 0x205 /* Closure, * */ #define PLUS 0x206 /* a+ == aa* */ #define QUEST 0x207 /* a? == a|nothing, i.e. 0 or 1 a's */ #define ANY 0x300 /* Any character but newline, . */ #define ANYNL 0x301 /* Any character, including newline, @ */ #define NOP 0x302 /* No operation, internal use only */ #define BOL 0x303 /* Beginning of line, ^ */ #define EOL 0x304 /* End of line, $ */ #define CCLASS 0x305 /* Character class, [] */ #define END 0x377 /* Terminate: match found */ /* * Parser Information */ typedef struct Node{ Inst *first; Inst *last; }Node; #define NSTACK 20 Node andstack[NSTACK]; Node *andp; int atorstack[NSTACK]; int *atorp; int lastwasand; /* Last token was operand */ int backwards; int nbra; uchar *exprp; /* pointer to next character in source expression */ #define NCLASS 8 int nclass; char class[NCLASS][32]; /* 32 bytes == 256 bits, one bit per char */ Inst * newinst(t) int t; { if(progp>= &program[NPROG]) error(Etoolong); progp->type=t; progp->left=0; progp->right=0; return progp++; } Inst * realcompile(s) uchar *s; { register token; startlex(s); atorp=atorstack; andp=andstack; lastwasand=FALSE; /* Start with a low priority operator to prime parser */ pushator(START-1); while((token=lex()) != END){ if((token&0x300) == OPERATOR) operator(token); else operand(token); } /* Close with a low priority operator */ evaluntil(START); /* Force END */ operand(END); evaluntil(START); if(nbra) error(Eleftpar); --andp; /* points to first and only operand */ return andp->first; } compile(r) Regexp *r; { register String *s= &r->text; Inst *oprogp; if(s->n==lastregexp.n && strncmp(s->s, lastregexp.s, lastregexp.n)==0) return; progp=program; backwards=FALSE; startinst=realcompile(s->s); optimize(program); oprogp=progp; backwards=TRUE; bstartinst=realcompile(s->s); optimize(oprogp); strdupstr(&lastregexp, s); } operand(t) int t; { register Inst *i; if(lastwasand) operator(CAT); /* catenate is implicit */ i=newinst(t); if(t==CCLASS) /* ugh */ i->right=(Inst *)class[nclass-1]; /* UGH! */ pushand(i, i); lastwasand=TRUE; } operator(t) int t; { if(t==RBRA && --nbra<0) error(Erightpar); if(t==LBRA){ nbra++; if(lastwasand) operator(CAT); }else evaluntil(t); if(t!=RBRA) pushator(t); lastwasand=FALSE; if(t==STAR || t==QUEST || t==PLUS || t==RBRA) lastwasand=TRUE; /* these look like operands */ } cant(s) char *s; { char buf[100]; sprint(buf, "can't happen: %s", s); panic(buf); } pushand(f, l) Inst *f, *l; { if(andp >= &andstack[NSTACK]) cant("operand stack overflow"); andp->first=f; andp->last=l; andp++; } pushator(t) int t; { if(atorp >= &atorstack[NSTACK]) cant("operator stack overflow"); *atorp++=t; } Node * popand(op) { if(andp <= &andstack[0]) error_c(Emissop, op); return --andp; } popator() { if(atorp <= &atorstack[0]) cant("operator stack underflow"); return *--atorp; } evaluntil(pri) register pri; { register Node *op1, *op2, *t; register Inst *inst1, *inst2; while(pri==RBRA || atorp[-1]>=pri){ switch(popator()){ case LBRA: return; /* must have been RBRA */ default: panic("unknown regexp operator"); break; case OR: op2=popand('|'); op1=popand('|'); inst2=newinst(NOP); op2->last->next=inst2; op1->last->next=inst2; inst1=newinst(OR); inst1->right=op1->first; inst1->left=op2->first; pushand(inst1, inst2); break; case CAT: op2=popand(0); op1=popand(0); if(backwards && op2->first->type!=END) t=op1, op1=op2, op2=t; op1->last->next=op2->first; pushand(op1->first, op2->last); break; case STAR: op2=popand('*'); inst1=newinst(OR); op2->last->next=inst1; inst1->right=op2->first; pushand(inst1, inst1); break; case PLUS: op2=popand('+'); inst1=newinst(OR); op2->last->next=inst1; inst1->right=op2->first; pushand(op2->first, inst1); break; case QUEST: op2=popand('?'); inst1=newinst(OR); inst2=newinst(NOP); inst1->left=inst2; inst1->right=op2->first; op2->last->next=inst2; pushand(inst1, inst2); break; } } } optimize(start) register Inst *start; { register Inst *inst, *target; for(inst=start; inst->type!=END; inst++){ target=inst->next; while(target->type == NOP) target=target->next; inst->next=target; } } #ifdef DEBUG dumpstack(){ Node *stk; int *ip; dprint("operators\n"); for(ip=atorstack; ip<atorp; ip++) dprint("0%o\n", *ip); dprint("operands\n"); for(stk=andstack; stk<andp; stk++) dprint("0%o\t0%o\n", stk->first->type, stk->last->type); } dump(){ Inst *l; l=program; do{ dprint("%d:\t0%o\t%d\t%d\n", l-program, l->type, l->left-program, l->right-program); }while(l++->type); } #endif startlex(s) uchar *s; { exprp=s; nclass=0; nbra=0; } char * newclass(){ register char *p; register n; if(nclass >= NCLASS) error(Eclass); p=class[nclass++]; for(n=0; n<32; n++) p[n]=0; return p; } lex(){ register c= *exprp++; switch(c){ case '\\': if(*exprp) if((c= *exprp++)=='n') c='\n'; break; case 0: c=END; --exprp; /* In case we come here again */ break; case '*': c=STAR; break; case '?': c=QUEST; break; case '+': c=PLUS; break; case '|': c=OR; break; case '.': c=ANY; break; case '@': c=ANYNL; break; case '(': c=LBRA; break; case ')': c=RBRA; break; case '^': c=BOL; break; case '$': c=EOL; break; case '[': c=CCLASS; bldcclass(); break; } return c; } nextrec(){ if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0)) error(Ebadclass); if(exprp[0]=='\\'){ exprp++; if(*exprp=='n'){ exprp++; return '\n'; } return *exprp++|0x100; } return *exprp++; } bldcclass(){ register c1, c2; register char *classp; register negate=FALSE; classp=newclass(); /* we have already seen the '[' */ if(*exprp=='^'){ negate=TRUE; exprp++; } while((c1=c2=nextrec()) != ']'){ if(*exprp=='-'){ exprp++; /* eat '-' */ if((c2=nextrec()) == ']') error(Ebadclass); } for((c1&=0xFF), (c2&=0xFF); c1<=c2; c1++) classp[c1/8] |= 1<<(c1&07); } if(negate){ for(c1=0; c1<32; c1++) classp[c1]^=0xFF; classp[1]&=~(1<<('\n'-8)); /* exclude '\n' */ } classp[0]&=0xFE; /* exclude NUL */ } /* * Note optimization in addinst: * *l must be pending when addinst called; if *l has been looked * at already, the optimization is a bug. */ addinst(l, inst, startp) register Ilist *l; register Inst *inst; Posn startp; { register Ilist *p; for(p=l; p->inst; p++){ if(p->inst==inst){ if(startp < p->startp) p->startp=startp; /* this would be bug */ return; /* It's already there */ } } p->inst=inst; p->startp=startp; (++p)->inst=0; } execute(f, startp, eof) File *f; Posn startp, eof; { register flag=0; register Inst *inst; register Ilist *tlp; register Posn p=startp; int nnl=0, ntl; register c; int wrapped=0; register startchar=startinst->type<OPERATOR? startinst->type : 0; list[0][0].inst=list[1][0].inst=0; sel.p1= -1; Fgetcset(f, startp); /* Execute machine once for each character */ for(;;p++){ doloop: c=Fgetc(f); if(p>=eof || c<0){ switch(wrapped++){ case 0: /* let loop run one more click */ case 2: break; case 1: /* expired; wrap to beginning */ if(sel.p1>=0 || eof!=INFINITY) goto Return; list[0][0].inst=list[1][0].inst=0; Fgetcset(f, (Posn)0); p=0; goto doloop; default: goto Return; } }else if(((wrapped && p>=startp) || sel.p1>0) && nnl==0) break; /* fast check for first char */ if(startchar && nnl==0 && c!=startchar) continue; tl=list[flag]; nl=list[flag^=1]; nl->inst=0; ntl=nnl; nnl=0; if(sel.p1<0 && (!wrapped || p<startp || startp==eof)){ /* Add first instruction to this list */ if(++ntl >= NLIST) Overflow: error(Eoverflow); addinst(tl, startinst, p); } /* Execute machine until this list is empty */ for(tlp=tl; inst=tlp->inst; tlp++){ /* assignment = */ Switchstmt: switch(inst->type){ default: /* regular character */ if(inst->type==c){ Addinst: if(++nnl >= NLIST) goto Overflow; addinst(nl, inst->next, tlp->startp); } break; case ANYNL: goto Addinst; case ANY: if(c!='\n') goto Addinst; break; case BOL: if(p==0){ Step: inst=inst->next; goto Switchstmt; } if(f->getci>1){ if(f->getcbuf[f->getci-2]=='\n') goto Step; }else{ uchar c; if(Fchars(f, &c, p-1, p)==1 && c=='\n') goto Step; } break; case EOL: if(c=='\n') goto Step; break; case CCLASS: if(c>=0 && ((char *)inst->right)[c/8]&(1<<(c&07))) goto Addinst; break; case OR: /* evaluate right choice later */ if(++ntl >= NLIST) goto Overflow; addinst(tlp, inst->right, tlp->startp); /* efficiency: advance and re-evaluate */ inst=inst->left; goto Switchstmt; case END: /* Match! */ newmatch(tlp->startp, p); break; } } } Return: return sel.p1>=0; } newmatch(p1, p2) Posn p1, p2; { if(sel.p1<0 || p1<sel.p1 || (p1==sel.p1 && p2>sel.p2)){ sel.p1=p1; sel.p2=p2; } } bexecute(f, startp) register File *f; Posn startp; { register flag=0; register Inst *inst; register Ilist *tlp; register Posn p=startp; int nnl=0, ntl; register c; int wrapped=0; register startchar=bstartinst->type<OPERATOR? bstartinst->type : 0; list[0][0].inst=list[1][0].inst=0; sel.p1= -1; Fgetcset(f, startp); /* Execute machine once for each character, including terminal NUL */ for(;;--p){ doloop: if((c=Fbgetc(f))==-1){ switch(wrapped++){ case 0: /* let loop run one more click */ case 2: break; case 1: /* expired; wrap to end */ if(sel.p1>=0) case 3: goto Return; list[0][0].inst=list[1][0].inst=0; Fgetcset(f, f->nbytes); p=f->nbytes; goto doloop; default: goto Return; } }else if(((wrapped && p<=startp) || sel.p1>0) && nnl==0) break; /* fast check for first char */ if(startchar && nnl==0 && c!=startchar) continue; tl=list[flag]; nl=list[flag^=1]; nl->inst=0; ntl=nnl; nnl=0; if(sel.p1<0 && (!wrapped || p>startp)){ /* Add first instruction to this list */ if(++ntl >= NLIST) Overflow: error(Eoverflow); /* the minus is so the optimizations in addinst work */ addinst(tl, bstartinst, -p); } /* Execute machine until this list is empty */ for(tlp=tl; inst=tlp->inst; tlp++){ /* assignment = */ Switchstmt: switch(inst->type){ default: /* regular character */ if(inst->type==c){ Addinst: if(++nnl >= NLIST) goto Overflow; addinst(nl, inst->next, tlp->startp); } break; case ANYNL: goto Addinst; case ANY: if(c!='\n') goto Addinst; break; case BOL: if(c=='\n'){ Step: inst=inst->next; goto Switchstmt; } break; case EOL: if(f->getci<f->ngetc-1){ if(f->getcbuf[f->getci+1]=='\n') goto Step; }else if(p<f->nbytes-1){ uchar c; if(Fchars(f, &c, p+1, p+2)==1 && c=='\n') goto Step; } break; case CCLASS: if(((char *)inst->right)[c/8]&(1<<(c&07))) goto Addinst; break; case OR: /* evaluate right choice later */ if(++ntl >= NLIST) goto Overflow; addinst(tlp, inst->right, tlp->startp); /* efficiency: advance and re-evaluate */ inst=inst->left; goto Switchstmt; case END: /* Match! */ bnewmatch(-tlp->startp, p); /* minus sign */ break; } } } Return: return sel.p1>=0; } bnewmatch(p2, p1) /* note the reversal; p1<=p2 */ Posn p1, p2; { if(sel.p1<0 || p2>sel.p2 || (p2==sel.p2 && p1<sel.p1)){ sel.p1=p1; sel.p2=p2; } }