#include "re.h" #include "lre.h" #include "hdr.h" static Exprtype toktype; static int toklit; static char *beg, *end; static int maxid; static Expr *e0(void); static Expr *d0(void); static Expr *r18(void); static void err(char*); static int parno; static jmp_buf gohome; static unsigned char *mymap; void egpost(re_re *r) { r->maxid = maxid; r->backref = r->root->backref; r->parens = r->root->parens; } Expr * eg_newexpr(Exprtype t, int l, Expr *left, Expr *right) { register Expr *e = (Expr *)egmalloc(sizeof(Expr), "eg_newexpr"); if (!e) return 0; e->type = t; e->parent = 0; e->lit = l; if(e->lit) e->id = maxid++; else e->id = 0; e->backref = 0; e->parens = 0; if(e->l = left){ left->parent = e; if(left->backref) e->backref = 1; if(left->parens) e->parens = 1; } if(e->r = right){ right->parent = e; if(right->backref) e->backref = 1; if(right->parens) e->parens = 1; } e->follow = 0; e->flen = 0; e->reallit = 0; return(e); } void eg_lexinit(char *b, char *e) { beg = b; end = e; maxid = 1; parno = 1; } void eg_lex(void) { if(beg == end){ toktype = EOP; toklit = -1; } else switch(toklit = *beg++) { case '.': toktype = Dot; break; case '*': toktype = Star; break; case '+': toktype = Plus; break; case '?': toktype = Quest; break; case '^': toktype = Carat; break; case '$': toktype = Dollar; break; case '[': toktype = Charclass; break; case '\n': case '|': toktype = Alternate; break; case '(': toktype = Lpar; break; case ')': toktype = Rpar; break; case '\\': toktype = Backslash; if(beg == end) err("bad \\"); else toklit = *beg++; break; default: toktype = Literal; break; } } void eg_epr(register Expr *e, char *res, int doset) { char r1[EPRINTSIZE], r2[EPRINTSIZE], rid[EPRINTSIZE]; int ids = 0; /* sort of a debugging flag */ if(e == 0){ SPR res, "!0!"); return; } r1[0] = 0; if(ids) SPR rid, "%d:", e->id); else rid[0] = 0; switch(e->type) { case Literal: if(doset) eg_spr(e->flen, e->follow, r1); SPR res, "%s'%c'%s", rid, e->lit, r1); break; case Dot: case Carat: case Dollar: if(doset) eg_spr(e->flen, e->follow, r1); SPR res, "%s%c%s", rid, e->lit, r1); break; case Compcharclass: case Charclass: *res++ = '['; if(e->type == Compcharclass) *res++ = '^'; memmove(res, (char *)e->r, (int)e->l); res += (int)e->l; *res++ = ']'; *res = 0; break; case Cat: eg_epr(e->l, r1, doset); eg_epr(e->r, r2, doset); SPR res, "%s%s%s", rid, r1, r2); break; case Alternate: eg_epr(e->l, r1, doset); eg_epr(e->r, r2, doset); SPR res, "%s(%s|%s)", rid, r1, r2); break; case Star: eg_epr(e->l, r1, doset); SPR res, "%s(%s)*", rid, r1); break; case Plus: eg_epr(e->l, r1, doset); SPR res, "%s(%s)+", rid, r1); break; case Quest: eg_epr(e->l, r1, doset); SPR res, "%s(%s)?", rid, r1); break; case Group: eg_epr(e->l, r1, doset); SPR res, "%sG<%s>", rid, r1); break; case EOP: eg_epr(e->l, r1, doset); SPR res, "%s%s<EOP>", rid, r1); break; case Backref: SPR res, "%s\\%d", rid, e->lit); break; default: SPR res, "<undef type %d>", e->type); err(res); break; } } static void ccl(int *count, char **str, int oldrange) { register n; int cnt; char tab[256], *s; int range, lastc, i; #define TSET(b) {if(tab[n = mymap[(b)]] == 0){ tab[n] = 1; cnt++; } } cnt = 0; memset(tab, 0, sizeof tab); lastc = -1; range = 0; /* scan for chars */ for(; (beg < end); beg++){ toklit = *beg; if(*beg == ']'){ if(!oldrange) break; if(lastc >= 0) break; } if(*beg == '-'){ if(lastc < 0){ TSET('-') lastc = *(unsigned char *)beg; } else range = 1; continue; } if(*beg == '\\'){ if(++beg >= end){ err("unexpected eop after \\ in char class"); beg--; } } if(range){ for(i = *(unsigned char *)beg; i >= lastc; i--) TSET(i) range = 0; } else TSET(*(unsigned char *)beg) lastc = *(unsigned char *)beg; } if(range){ if(oldrange) TSET('-') else err("unterminated range in []"); } if(beg < end) beg++; else err("eop during ccl"); if(cnt == 0) err("empty charclass"); eg_lex(); *count = cnt; *str = s = (char *)egmalloc(cnt, "charclass defn"); if (!s) return; for(n = 0; n < 256; n++) if(tab[n]) *s++ = n; } /* gre patterns: e0: e1 { '|' e1 }* e1: e2 { e2 }* e2: e3 { '*' | '?' | '+' | \{ m,n \} }* e3: lit | '.' | '^' | '$' | '(' e0 ')' */ static Expr * e3(void) { Expr *e; Exprtype t; int cnt; char *s; switch(toktype) { case Backslash: if((toklit >= '1') && (toklit <= '9')){ e = eg_newexpr(Backref, toklit-'0', (Expr *)0, (Expr *)0); e->backref = 1; } else e = eg_newexpr(Literal, toklit, (Expr *)0, (Expr *)0); eg_lex(); break; case Literal: e = eg_newexpr(Literal, toklit, (Expr *)0, (Expr *)0); eg_lex(); break; case Dot: e = eg_newexpr(Dot, '.', (Expr *)0, (Expr *)0); eg_lex(); break; case Carat: e = eg_newexpr(Carat, '^', (Expr *)0, (Expr *)0); eg_lex(); break; case Dollar: e = eg_newexpr(Dollar, '$', (Expr *)0, (Expr *)0); eg_lex(); break; case Charclass: t = toktype; if(*beg == '^'){ t = Compcharclass; beg++; } ccl(&cnt, &s, 0); e = eg_newexpr(t, '[', (Expr *)0, (Expr *)0); e->l = (Expr *)cnt; /* num of chars */ e->r = (Expr *)s; /* chars */ break; case Lpar: eg_lex(); cnt = parno++; e = e0(); if(toktype == Rpar) eg_lex(); else err("expected a ')'"); e = eg_newexpr(Group, cnt, e, (Expr *)0); e->parens = 1; return(e); case EOP: default: err("expected a lit or '('"); e = 0; } return(e); } static int integer(void) { int n; n = 0; while((toktype == Literal) && (toklit >= '0') && (toklit <= '9')){ n = 10*n + toklit-'0'; eg_lex(); } return(n); } static Expr * ecopy(Expr *e) { Expr *ee; char res[256]; if(e == 0) return(e); switch(e->type) { case Literal: case Dot: case Carat: case Dollar: case Backref: return(eg_newexpr(e->type, e->lit, (Expr *)0, (Expr *)0)); case Compcharclass: case Charclass: ee = eg_newexpr(e->type, e->lit, (Expr *)0, (Expr *)0); ee->r = (Expr *)egmalloc((int)e->l, "expr copy"); if (!ee->r) return 0; ee->l = e->l; memmove((char *)ee->r, (char *)e->r, (int)e->l); return(ee); case Cat: case Alternate: return(eg_newexpr(e->type, e->lit, ecopy(e->l), ecopy(e->r))); case Star: case Plus: case Quest: case Group: case EOP: return(eg_newexpr(e->type, e->lit, ecopy(e->l), (Expr *)0)); default: SPR res, "<undef type %d>", e->type); err(res); return((Expr *)0); } } static Expr * edup(Expr *expr, int n, int opt) { if(n == 1){ expr = ecopy(expr); if(opt) expr = eg_newexpr(Quest, 0, expr, (Expr *)0); return(expr); } return(eg_newexpr(Cat, 0, edup(expr, n-n/2, opt), edup(expr, n/2, opt))); } static Expr * range(Expr *expr) { int beg, end; Expr *e, *e1; if((toktype == Literal) && (toklit >= '0') && (toklit <= '9')) beg = integer(); else err("expected a number in range"); if((toktype == Literal) && (toklit == ',')){ end = -1; eg_lex(); } else end = -2; if((toktype == Literal) && (toklit >= '0') && (toklit <= '9')) end = integer(); if((toktype == Backslash) && (toklit == '}')) eg_lex(); else err("expected \\} in range"); e1 = edup(expr, beg, 0); if(end == -2) e = e1; else if(end == -1) e = eg_newexpr(Cat, 0, e1, eg_newexpr(Star, 0, expr, (Expr *)0)); else { if(end < beg) err("bad range specification"); e = (end > beg)? eg_newexpr(Cat, 0, e1, edup(expr, end-beg, 1)) : e1; } return(e); } static Expr * e2(void) { Expr *e; Exprtype t; e = e3(); while((toktype == Star) || (toktype == Plus) || (toktype == Quest) || ((toktype == Backslash) && (toklit == '{'))){ if((toktype == Backslash) && (toklit == '{')){ eg_lex(); e = range(e); } else { t = toktype; eg_lex(); e = eg_newexpr(t, 0, e, (Expr *)0); } } return(e); } static Expr * e1(void) { Expr *e, *f; e = e2(); while((toktype == Literal) || (toktype == Dot) || (toktype == Lpar) || (toktype == Backslash) || (toktype == Dollar) || (toktype == Carat) || (toktype == Charclass)){ f = e2(); e = eg_newexpr(Cat, 0, e, f); } return(e); } static Expr * e0(void) { Expr *e, *f; e = e1(); while(toktype == Alternate){ eg_lex(); if(toktype == EOP) continue; f = e1(); e = eg_newexpr(Alternate, 0, e, f); } return(e); } /* egrep patterns: d0: d1 { '|' d1 }* d1: d2 { d2 }* d2: d3 { '*' | '?' | '+' } d3: lit | '.' | '^' | '$' | '(' d0 ')' */ static Expr * d3(void) { Expr *e; Exprtype t; int cnt; char *s; switch(toktype) { case Backslash: case Literal: e = eg_newexpr(Literal, toklit, (Expr *)0, (Expr *)0); eg_lex(); break; case Dot: e = eg_newexpr(Dot, '.', (Expr *)0, (Expr *)0); eg_lex(); break; case Carat: e = eg_newexpr(Carat, '^', (Expr *)0, (Expr *)0); eg_lex(); break; case Dollar: e = eg_newexpr(Dollar, '$', (Expr *)0, (Expr *)0); eg_lex(); break; case Charclass: t = toktype; if(*beg == '^'){ t = Compcharclass; beg++; } ccl(&cnt, &s, 1); e = eg_newexpr(t, '[', (Expr *)0, (Expr *)0); e->l = (Expr *)cnt; /* num of chars */ e->r = (Expr *)s; /* chars */ break; case Lpar: eg_lex(); e = d0(); if(toktype == Rpar) eg_lex(); else err("expected a ')'"); return(e); default: err("expected a lit or '('"); e = 0; } return(e); } static Expr * d2(void) { Expr *e; Exprtype t; e = d3(); while((toktype == Star) || (toktype == Plus) || (toktype == Quest)){ t = toktype; eg_lex(); e = eg_newexpr(t, 0, e, (Expr *)0); } return(e); } static Expr * d1(void) { Expr *e, *f; e = d2(); while((toktype == Literal) || (toktype == Dot) || (toktype == Lpar) || (toktype == Dollar) || (toktype == Backslash) || (toktype == Carat) || (toktype == Charclass)){ f = d2(); e = eg_newexpr(Cat, 0, e, f); } return(e); } static Expr * d0(void) { Expr *e, *f; e = d1(); while(toktype == Alternate){ eg_lex(); if(toktype == EOP) continue; f = d1(); e = eg_newexpr(Alternate, 0, e, f); } return(e); } /* grep patterns: r0: r18 | '^' r18 | '^' r18 '$' | r18 '$' r18: r17 { r17 }* r17: r14 | r14 '*' | '\(' r18 '\)' r14: lit | '.' | '*' | '\' d */ static Expr * r14(void) { Expr *e; Exprtype t; int cnt; char *s; switch(toktype) { case Alternate: case Plus: case Quest: case Star: case Lpar: case Rpar: case Dollar: case Carat: case Literal: e = eg_newexpr(Literal, toklit, (Expr *)0, (Expr *)0); eg_lex(); break; case Backslash: if((toklit >= '1') && (toklit <= '9')){ e = eg_newexpr(Backref, toklit-'0', (Expr *)0, (Expr *)0); e->backref = 1; } else { e = eg_newexpr(Literal, toklit, (Expr *)0, (Expr *)0); e->reallit = 1; } eg_lex(); break; case Dot: e = eg_newexpr(Dot, '.', (Expr *)0, (Expr *)0); eg_lex(); break; case Charclass: t = toktype; if(*beg == '^'){ t = Compcharclass; beg++; } ccl(&cnt, &s, 1); e = eg_newexpr(t, '[', (Expr *)0, (Expr *)0); e->l = (Expr *)cnt; /* num of chars */ e->r = (Expr *)s; /* chars */ break; default: err("expected a one-char RE"); eg_lex(); /* make sure we don't loop */ e = 0; } return(e); } static Expr * r17(void) { Expr *e; int cnt; if((toktype == Backslash) && (toklit == '(')){ eg_lex(); cnt = parno++; e = r18(); if((toktype == Backslash) && (toklit == ')')) eg_lex(); else err("expected a closing \\)"); e = eg_newexpr(Group, cnt, e, (Expr *)0); e->parens = 1; } else { e = r14(); if(toktype == Star){ e = eg_newexpr(Star, 0, e, (Expr *)0); eg_lex(); } } return(e); } static Expr * r18(void) { Expr *e, *f; e = r17(); while(toktype != EOP){ if((toktype == Backslash) && (toklit == ')')) break; f = r17(); e = eg_newexpr(Cat, 0, e, f); } return(e); } static Expr * r0(void) { Expr *e, *e1; if(toktype == Carat){ e1 = eg_newexpr(Carat, '^', (Expr *)0, (Expr *)0); eg_lex(); } else e1 = 0; if(toktype == EOP) e = e1; else { e = r18(); /* did we see a dollar that is not a literal? */ if(e && (toktype == EOP)){ /* singleton dollar */ if((e->type == Literal) && (e->lit == '$')) e->type = Dollar; /* any other dollar */ if((e->type == Cat) && !e->r->reallit && (e->r->type == Literal) && (e->r->lit == '$')) e->r->type = Dollar; } if(e1){ if(e) e = eg_newexpr(Cat, 0, e1, e); else e = e1; } } if(toktype == Dollar){ if(e) e = eg_newexpr(Cat, 0, e, eg_newexpr(Dollar, '$', (Expr *)0, (Expr *)0)); else e = eg_newexpr(Dollar, '$', (Expr *)0, (Expr *)0); eg_lex(); } return(e); } Expr * eg_eall(enum Parsetype type, unsigned char *map) { Expr *e; mymap = map; if(setjmp(gohome) == 0){ if(type == egrepparse) while(toktype == Alternate) /* bogus but user-friendly */ eg_lex(); switch(type) { case greparse: e = e0(); break; case grepparse: e = r0(); break; case egrepparse: e = d0(); break; } if(type == egrepparse) while(toktype == Alternate) /* bogus but user-friendly */ eg_lex(); if(toktype != EOP) err("expected end of pattern"); } else e = 0; /*{char buf1[4096]; e, buf1, 0); print("e='%s'\n", buf1);}/**/ return(e); } void eg_spr(long c, int *p, register char *buf) { if(c > 0){ *buf++ = '{'; *buf = 0; while(--c > 0){ SPR buf, "%d,", *p++); buf = strchr(buf, 0); } SPR buf, "%d}", *p); } else SPR buf, "{}"); } static void err(char *s) { char buf[4096]; if(toklit < 0) SPR buf, "expression error: %s but got end of expression", s); else SPR buf, "expression error: %s near '%c'", s, toklit); re_error(buf); longjmp(gohome, 1); }