#define TD /* define this symbol to obtain my changes td76.06.01 */ #ifdef TD #define DIS /* define this symbol to get -d option - ian j aug 77 */ #define ELSE /* turns #endif #ifdef into #else - if your `cc' understands it! */ #endif /* diff - differential file comparison * * Uses an algorithm due to Harold Stone, which finds * a pair of longest identical subsequences in the two * files. * * The major goal is to generate the match vector J. * J[i] is the index of the line in file1 corresponding * to line i file0. J[i] = 0 if there is no * such line in file1. * * Lines are hashed so as to work in core. All potential * matches are located by sorting the lines of each file * on the hash (called value_____). In particular, this * collects the equivalence classes in file1 together. * Subroutine equiv____ replaces the value of each line in * file0 by the index of the first element of its * matching equivalence in (the reordered) file1. * To save space equiv_____ squeezes file1 into a single * array member______ in which the equivalence classes * are simply concatenated, except that their first * members are flagged by changing sign. * * Next the indices that point into member______ are unsorted_______ into * array class_____ according to the original order of file0. * * The cleverness lies in routine stone______. This marches * through the lines of file0, developing a vector klist_____ * of "k-candidates". At step i a k-candidate is a matched * pair of lines x,y (x in file0 y in file1) such that * there is a common subsequence of lenght k * between the first i lines of file0 and the first y * lines of file1, but there is no such subsequence for * any smaller y. x is the earliest possible mate to y * that occurs in such a subsequence. * * Whenever any of the members of the equivalence class of * lines in file1 matable to a line in file0 has serial number * less than the y of some k-candidate, that k-candidate * with the smallest such y is replaced. The new * k-candidate is chained (via pred____) to the current * k-1 candidate so that the actual subsequence can * be recovered. When a member has serial number greater * that the y of all k-candidates, the klist is extended. * At the end, the longest subsequence is pulled out * and placed in the array J by unravel_______. * * With J in hand, the matches there recorded are * check_____ed against reality to assure that no spurious * matches have crept in due to hashing. If they have, * they are broken, and "jackpot " is recorded--a harmless * matter except that a true match for a spuriously * mated line may now be unnecessarily reported as a change. * * Much of the complexity of the program comes simply * from trying to minimize core utilization and * maximize the range of doable problems by dynamically * allocating what is needed and reusing what is not. * The core requirements for problems larger than somewhat * are (in words) 2*length(file0) + length(file1) + * 3*(number of k-candidates installed), typically about * 6n words for files of length n. There is also space for buf1 * used which could, by moving data underfoot and reallocating * buf1 together with buf2, be completely overlaid. */ struct buf { int fdes; char data[516]; } *buf1, *buf2; struct cand { int x; int y; struct cand *pred; } cand; struct line { int serial; int value; } *file[2], line; int len[2]; int *class; /*will be overlaid on file[0]*/ int *member; /*will be overlaid on file[1]*/ struct cand **klist; /*will be overlaid on file[0] after class*/ int *J; /*will be overlaid on class*/ int *ixold; /*will be overlaid on klist*/ int *ixnew; /*will be overlaid on file[1]*/ char *area; char *top; xalloc(n) { register char *p; p = area; n = (n+1) & ~1; area =+ n; while(area > top) { if(sbrk(1024) == -1) { mesg("Out of space\n"); exit(1); } top =+ 1024; } return(p); } mesg(s) char *s; { while(*s) write(2,s++,1); } sort(a,n) /*shellsort CACM #201*/ struct line *a; { struct line w; register int j,m; struct line *ai; register struct line *aim; int k, i; for(j=1;j<=n;j=* 2) m = 2*j - 1; for(m=/2;m!=0;m=/2) { k = n-m; for(j=1;j<=k;j++) { for( i = j ; i > 0 ; i =- m ) { ai = &a[i]; aim = &ai[m]; if(aim->value > ai[0].value || aim->value == ai[0].value && aim->serial > ai[0].serial) break; w.value = ai[0].value; ai[0].value = aim->value; aim->value = w.value; w.serial = ai[0].serial; ai[0].serial = aim->serial; aim->serial = w.serial; } } } } unsort(f, l, b) struct line *f; int *b; { int *a; int i; a = xalloc((l+1)*sizeof(a[0])); for(i=1;i<=l;i++) a[f[i].serial] = f[i].value; for(i=1;i<=l;i++) b[i] = a[i]; area = a; } prepare(i, arg) char *arg; { register char *temp; temp = file[i] = area; xalloc(sizeof(line)); input(arg); len[i] = (area - temp)/sizeof(line) - 1; xalloc(sizeof(line)); sort(file[i], len[i]); } input(arg) { register int h, i; register struct line *p; if(fopen(arg,buf1) == -1) { mesg("Cannot open "); mesg(arg); mesg("\n"); exit(1); } for(i=0; h=readhash(buf1);) { p = xalloc(sizeof(line)); p->serial = ++i; p->value = h; } close(buf1->fdes); } equiv(a,n,b,m,c) struct line *a, *b; int *c; { register int i, j; i = j = 1; while(i<=n && j<=m) { if(a[i].value <b[j].value) a[i++].value = 0; else if(a[i].value == b[j].value) a[i++].value = j; else j++; } while(i <= n) a[i++].value = 0; b[m+1].value = 0; j = 0; while(++j <= m) { c[j] = -b[j].serial; while(b[j+1].value == b[j].value) { j++; c[j] = b[j].serial; } } c[j] = -1; } #ifdef TD int cflag, asflag; #ifdef DIS int dflag; #endif #endif struct cand * newcand(x,y,pred) struct cand *pred; { struct cand *p; p = xalloc(sizeof(cand)); p->x = x; p->y = y; p->pred = pred; return(p); } search(c, k, y) struct cand **c; { register int i, j, l; int t; i = 0; j = k+1; while((l=(i+j)/2) > i) { t = c[l]->y; if(t > y) j = l; else if(t < y) i = l; else return(l); } return(l+1); } struct cand * unravel(p) struct cand *p; { int i; for(i=0; i<=len[0]; i++) J[i] = 0; while(p) { J[p->x] = p->y; p = p->pred; } } /* check does double duty: 1. ferret out any fortuitous correspondences due to counfounding by hashing (which result in "jackpot") 2. collect random access indexes to the two files */ check(argv) char **argv; { register int i, j; int ctold, ctnew; int jackpot; char c,d; fopen(argv[1],buf1); fopen(argv[2],buf2); j = 1; ctold = ctnew = 0; ixold[0] = ixnew[0] = 0; jackpot = 0; for(i=1;i<=len[0];i++) { if(J[i]==0) { while(getc(buf1)!='\n') ctold++; ixold[i] = ++ctold; continue; } while(j<J[i]) { while(getc(buf2)!='\n') ctnew++; ixnew[j] = ++ctnew; j++; } while((c=getc(buf1))==(d=getc(buf2))) { if(c=='\n') break; ctold++; ctnew++; } while(c!='\n') { jackpot++; J[i] = 0; c = getc(buf1); ctold++; } ixold[i] = ++ctold; while(d!='\n') { jackpot++; J[i] = 0; d = getc(buf2); ctnew++; } ixnew[j] = ++ctnew; j++; } for(;j<=len[1];j++) { while(getc(buf2)!='\n') ctnew++; ixnew[j] = ++ctnew; } close(buf1->fdes); close(buf2->fdes); return(jackpot); } output(argv) char **argv; { int dir; int m; int i0,i1,j0,j1; extern fout; dir = **argv=='-'; fout = dup(1); buf1->fdes = open(argv[1],0); buf2->fdes = open(argv[2],0); #ifdef TD #ifndef DIS if(cflag || asflag){ #endif #ifdef DIS if(cflag || asflag || dflag){ #endif coutput(argv[3]); return; } #endif m = len[0]; J[0] = 0; J[m+1] = len[1]+1; if(dir==0) for(i0=1;i0<=m;i0=i1+1) { while(i0<=m&&J[i0]==J[i0-1]+1) i0++; j0 = J[i0-1]+1; i1 = i0-1; while(i1<m&&J[i1+1]==0) i1++; j1 = J[i1+1]-1; J[i1] = j1; change(i0,i1,j0,j1,dir); } else for(i0=m;i0>=1;i0=i1-1) { while(i0>=1&&J[i0]==J[i0+1]-1&&J[i0]!=0) i0--; j0 = J[i0+1]-1; i1 = i0+1; while(i1>1&&J[i1-1]==0) i1--; j1 = J[i1-1]+1; J[i1] = j1; change(i1,i0,j1,j0,dir); } if(m==0) change(1,0,1,len[1],dir); flush(); } change(a,b,c,d,dir) { if(a>b&&c>d) return; range(a,b); putchar(a>b?'a':c>d?'d':'c'); if(dir==0) range(c,d); putchar('\n'); if(dir==0) { fetch(ixold,a,b,buf1,"< "); /* ijcm */ if(a<=b&&c<=d) printf("---\n"); } fetch(ixnew,c,d,buf2,dir==0?"> ":""); /* ijcm */ if(dir!=0&&c<=d) printf(".\n"); } range(a,b) { if(a>b) printf("%d",b); if(a<=b) printf("%d",a); if(a<b) printf(",%d",b); } fetch(f,a,b,lb,pref) int *f; struct buf *lb; char *pref; { int i, j; int nc; for(i=a;i<=b;i++) { seek(lb->fdes,f[i-1],0); nc = read(lb->fdes,lb->data,f[i]-f[i-1]); printf(pref); for(j=0;j<nc;j++) putchar(lb->data[j]); } } #ifdef TD coutput(symbol){ int i0, i1, j0, j1, m; char *ifdef, *ifndef, *endif, *endifn, *prefix, *prefixn, *prefixz; prefixz = prefixn = prefix = ""; if(asflag){ ifdef=".if %s\n"; ifndef=".if %s-1\n"; endifn=endif=".endif\n"; printf("%s=0\n", symbol); } else if (cflag) { ifdef="#ifdef %s\n"; ifndef="#ifndef %s\n"; endifn=endif="#endif %s\n"; #ifdef DIS } else { prefixz = " "; ifdef=endif= "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n"; prefix = "++ "; ifndef=endifn= "-----------------------------------------------------------------------------------------------------------------------------------\n"; prefixn = "-- "; #endif } m=len[0]; J[0]=0; J[m+1]=len[1]+1; i1=0; for(i0=1;i0<=m;i0=i1+1){ while(i0<=m&&J[i0]==J[i0-1]+1) i0++; fetch(ixold, i1+1, i0-1, buf1, prefixz); j0 = J[i0-1]+1; i1 = i0-1; while(i1<m&&J[i1+1]==0) i1++; if(i1>=i0){ printf(ifndef, symbol); fetch(ixold, i0, i1, buf1, prefixn); #ifndef ELSE printf(endifn, symbol); #else if (! cflag) printf(endifn, symbol); #endif } j1 = J[i1+1]-1; J[i1] = j1; if(j1>=j0){ #ifndef ELSE printf(ifdef, symbol); #else if (! cflag || i1 < i0) printf(ifdef, symbol); else printf("#else %s\n", symbol); #endif fetch(ixnew, j0, j1, buf2, prefix); printf(endif, symbol); } #ifdef ELSE else if (cflag && i1 >= i0) printf(endifn, symbol); #endif } flush(); } #endif stone(a,n,b,c) int *a; int *b; struct cand **c; { register int i, k,y; int j, l; int skip; k = 0; c[0] = 0; for(i=1; i<=n; i++) { j = a[i]; if(j==0) continue; skip = 0; do { y = b[j]; if(y<0) y = -y; if(skip) continue; l = search(c, k, y); if(l > k) { c[k+1] = newcand(i,y,c[k]); skip = 1; k++; } else if(c[l]->y > y && c[l]->x < i) c[l] = newcand(i,y,c[l-1]); } while(b[++j] > 0); } return(k); } main(argc, argv) char **argv; { int k; struct buf buff1, buff2; /* allocate them here to save on data space */ if(argc>1 && *argv[1]=='-') { #ifdef TD cflag = argv[1][1]=='c'; asflag = argv[1][1]=='s'; #ifdef DIS dflag = argv[1][1]=='d'; #endif #endif argc--; argv++; } #ifndef TD if(argc!=3) { #endif #ifdef TD if((!cflag && !asflag && argc!=3) || ((cflag || asflag) && argc!=4)){ #endif mesg("Arg count\n"); exit(1); } area = top = sbrk(0); buf1 = &buff1; prepare(0, argv[1]); prepare(1, argv[2]); member = file[1]; equiv(file[0], len[0], file[1], len[1], member); class = file[0]; unsort(file[0], len[0], class); klist = &class[len[0]+2]; area = &member[len[1]+2]; k = stone(class, len[0], member, klist); J = class; unravel(klist[k]); ixold = klist; ixnew = file[1]; area = &ixnew[len[1]+2]; buf2 = &buff2; if(check(argv)) mesg("Jackpot\n"); output(argv); } /* * hash routine for diff * * This code replaces that which resided in diff2.s * of previous implementations, I wanted I&D space diff * * Effectively spreads the string out into 7-bit * bytes, then sums the result 1's-complement * by 16-bit bytes and adds 1 to avoid zero answer. * So said the comments in diff2.s * * Ian J */ readhash(buf) { long lsum; register n = 0; /* shift */ register unsigned c, k; struct { int hiword; int loword; }; lsum = 1; if( (c = getc(buf)) == -1 ) return 0; /* end of file */ for( ; c != '\n' ; c = getc(buf) ) { if( c == -1 ) { write(2,"incomplete line omitted\n",24); return 0; } k = c; c =<< n; n =- 16; c =| k << n; lsum =+ c; lsum.loword =+ lsum.hiword; lsum.hiword = 0; n = (n + 23) & 017; } return lsum.loword; }