SRI-NOSC/s1/diffA.c

Compare this file to the similar file:
Show the results in this format:
#/*
Module Name:
	diff - differential file comparison

Installation:
	if $1x = finalx goto final
	as diffB.s; mv a.out diffB.o
	cc diffA.c diffB.o
	rm -f diffA.o diffB.o
	exit
: final
	as diffB.s; mv a.out diffB.o
	cc -O -s diffA.c diffB.o
	if ! -r a.out exit
	su cp a.out /usr/bin/diff
	rm -f a.out diffA.o diffB.o

Synopsis:
	diff	file file	output changes (same as before)
	diff -	file file	output editor script (same as before)
	diff -c	file file	output c style ifdefs
	diff -s	file file	output assmebler style ifdefs
	diff -.	file file	output file with * / . tagged lines

Function:
	Uses an algorithm due to Harold Stone, which finds
	a pair of longest identical subsequences in the two
	files.

	The major goal is to generate the match vector J.
	J[i] is the index of the line in file1 corresponding
	to line i file0. J[i] = 0 if there is no
	such line in file1.

	Lines are hashed so as to work in core. All potential
	matches are located by sorting the lines of each file
	on the hash (called value_____). In particular, this
	collects the equivalence classes in file1 together.
	Subroutine equiv_____  replaces the value of each line in
	file0 by the index of the first element of its 
	matching equivalence in (the reordered) file1.
	To save space equiv_____ squeezes file1 into a single
	array member______ in which the equivalence classes
	are simply concatenated, except that their first
	members are flagged by changing sign.

	Next the indices that point into member______ are unsorted_______   into
	array class_____ according to the original order of file0.

	The cleverness lies in routine stone______. This marches
	through the lines of file0, developing a vector klist_____
	of "k-candidates". At step i a k-candidate is a matched
	pair of lines x,y (x in file0 y in file1) such that
	there is a common subsequence of lenght k
	between the first i lines of file0 and the first y 
	lines of file1, but there is no such subsequence for
	any smaller y. x is the earliest possible mate to y
	that occurs in such a subsequence.

	Whenever any of the members of the equivalence class of
	lines in file1 matable to a line in file0 has serial number 
	less than the y of some k-candidate, that k-candidate 
	with the smallest such y is replaced. The new 
	k-candidate is chained (via pred____) to the current
	k-1 candidate so that the actual subsequence can
	be recovered. When a member has serial number greater
	that the y of all k-candidates, the klist is extended.
	At the end, the longest subsequence is pulled out
	and placed in the array J by unravel_______.

	With J in hand, the matches there recorded are
	check_____ed against reality to assure that no spurious
	matches have crept in due to hashing. If they have,
	they are broken, and "jackpot " is recorded--a harmless
	matter except that a true match for a spuriously
	mated line may now be unnecessarily reported as a change.

	Much of the complexity of the program comes simply
	from trying to minimize core utilization and
	maximize the range of doable problems by dynamically
	allocating what is needed and reusing what is not.
	The core requirements for problems larger than somewhat
	are (in words) 2*length(file0) + length(file1) +
	3*(number of k-candidates installed),  typically about
	6n words for files of length n. There is also space for buf1
	used which could, by moving data underfoot and reallocating
	buf1 together with buf2, be completely overlaid.

Restrictions:

Diagnostics:

Files:

See Also:

Bugs:

Globals contained:

Routines contained:

Modules referenced:
	diffB.s -- assembly language hashing subroutine

Compile time parameters and effects:

Module History:
	Original from Bell Labs distribution tape, author unknown,
	with some fixes by Rick Balocca and Jody Kravitz at the U of Illinois
*/
struct buf {
	int fdes;
	char data[516];
} *buf1, *buf2;

struct cand {
	int x;
	int y;
	struct cand *pred;
} cand;
struct line {
	int serial;
	int value;
} *file[2], line;
int len[2];
int *class;	/*will be overlaid on file[0]*/
int *member;	/*will be overlaid on file[1]*/
struct cand **klist;	/*will be overlaid on file[0] after class*/
int *J;		/*will be overlaid on class*/
int *ixold;	/*will be overlaid on klist*/
int *ixnew;	/*will be overlaid on file[1]*/

char *area;
char *top;
alloc(n)
{
	register char *p;
	p = area;
	n = (n+1) & ~1;
	area =+ n;
	while(area > top) {
		if(sbrk(1024) == -1) {
			mesg("Out of space\n");
			exit(1);
		}
		top =+ 1024;
	}
	return(p);
}

mesg(s)
char *s;
{
	while(*s)
		write(2,s++,1);
}

sort(a,n)	/*shellsort CACM #201*/
struct line *a;
{
	struct line w;
	register int j,m;
	struct line *ai;
	register struct line *aim;
	int k;
	for(j=1;j<=n;j=* 2)
		m = 2*j - 1;
	for(m=/2;m!=0;m=/2) {
		k = n-m;
		for(j=1;j<=k;j++) {
			for(ai = &a[j]; ai > a && ai <= &a[n]; ai =- m) {
				aim = &ai[m];
				if(aim->value > ai[0].value ||
				   aim->value == ai[0].value &&
				   aim->serial > ai[0].serial)
					break;
				w.value = ai[0].value;
				ai[0].value = aim->value;
				aim->value = w.value;
				w.serial = ai[0].serial;
				ai[0].serial = aim->serial;
				aim->serial = w.serial;
			}
		}
	}
}

unsort(f, l, b)
struct line *f;
int *b;
{
	register int *a;
	register int i;
	a = alloc((l+1)*sizeof(a[0]));
	for(i=1;i<=l;i++)
		a[f[i].serial] = f[i].value;
	for(i=1;i<=l;i++)
		b[i] = a[i];
	area = a;
}

prepare(i, arg)
char *arg;
{
	register char *temp;
	temp = file[i] = area;
	alloc(sizeof(line));
	input(arg);
	len[i] = (area - temp)/sizeof(line) - 1;
	alloc(sizeof(line));
	sort(file[i], len[i]);
}

input(arg)
{
	register int h, i;
	register struct line *p;
	if(fopen(arg,buf1) == -1) {
		mesg("Cannot open ");
		mesg(arg);
		mesg("\n");
		exit(1);
	}
	for(i=0; h=readhash(buf1);) {
		p = alloc(sizeof(line));
		p->serial = ++i;
		p->value = h;
	}
	close(buf1->fdes);
}

equiv(a,n,b,m,c)
struct line *a, *b;
int *c;
{
	register int i, j;
	i = j = 1;
	while(i<=n && j<=m) {
		if(a[i].value <b[j].value)
			a[i++].value = 0;
		else if(a[i].value == b[j].value)
			a[i++].value = j;
		else
			j++;
	}
	while(i <= n)
		a[i++].value = 0;
	b[m+1].value = 0;
	j = 0;
	while(++j <= m) {
		c[j] = -b[j].serial;
		while(b[j+1].value == b[j].value) {
			j++;
			c[j] = b[j].serial;
		}
	}
	c[j] = -1;
}

main(argc, argv)
char **argv;
{
	register int k;

	if(argc>1 && *argv[1]=='-') {
		argc--;
		argv++;
	}
	if(argc!=3) {
		mesg("Arg count\n");
		exit(1);
	}

	area = top = sbrk(0);
	buf1 = alloc(sizeof(*buf1));
	prepare(0, argv[1]);
	prepare(1, argv[2]);

	member = file[1];
	equiv(file[0], len[0], file[1], len[1], member);

	class = file[0];
	unsort(file[0], len[0], class);

	klist = &class[len[0]+2];
	area = &member[len[1]+2];
	k = stone(class, len[0], member, klist);
	J = class;
	unravel(klist[k]);

	ixold = klist;
	ixnew = file[1];
	area = &ixnew[len[1]+2];
	buf2 = alloc(sizeof(*buf2));
	if(check(argv))
		mesg("Jackpot\n");
	output(argv);
}

stone(a,n,b,c)
int *a;
int *b;
struct cand **c;
{
	register int i, k,y;
	int j, l;
	int skip;
	k = 0;
	c[0] = 0;
	for(i=1; i<=n; i++) {
		j = a[i];
		if(j==0)
			continue;
		skip = 0;
		do {
			y = b[j];
			if(y<0) y = -y;
			if(skip)
				continue;
			l = search(c, k, y);
			if(l > k) {
				c[k+1] = newcand(i,y,c[k]);
				skip = 1;
				k++;
			}
			else if(c[l]->y > y && c[l]->x < i)
				c[l] = newcand(i,y,c[l-1]);
		} while(b[++j] > 0);
	}
	return(k);
}

struct cand *
newcand(x,y,pred)
struct cand *pred;
{
	register struct cand *p;
	p = alloc(sizeof(cand));
	p->x = x;
	p->y = y;
	p->pred = pred;
	return(p);
}

search(c, k, y)
struct cand **c;
{
	register int i, j, l;
	int t;
	i = 0;
	j = k+1;
	while((l=(i+j)/2) > i) {
		t = c[l]->y;
		if(t > y)
			j = l;
		else if(t < y)
			i = l;
		else
			return(l);
	}
	return(l+1);
}

unravel(p)
struct cand *p;
{
	register int i;
	for(i=0; i<=len[0]; i++)
		J[i] = 0;
	while(p) {
		J[p->x] = p->y;
		p = p->pred;
	}
}

/* check does double duty:
1.  ferret out any fortuitous correspondences due
to counfounding by hashing (which result in "jackpot")
2.  collect random access indexes to the two files */

check(argv)
char **argv;
{
	register int i, j;
	register int ctold;
	int ctnew;
	int jackpot;
	char c,d;
	fopen(argv[1],buf1);
	fopen(argv[2],buf2);
	j = 1;
	ctold = ctnew = 0;
	ixold[0] = ixnew[0] = 0;
	jackpot = 0;
	for(i=1;i<=len[0];i++) {
		if(J[i]==0) {
			while(getc(buf1)!='\n') ctold++;
			ixold[i] = ++ctold;
			continue;
		}
		while(j<J[i]) {
			while(getc(buf2)!='\n') ctnew++;
			ixnew[j] = ++ctnew;
			j++;
		}
		while((c=getc(buf1))==(d=getc(buf2))) {
			if(c=='\n') break;
			ctold++;
			ctnew++;
		}
		while(c!='\n') {
			jackpot++;
			J[i] = 0;
			c = getc(buf1);
			ctold++;
		}
		ixold[i] = ++ctold;
		while(d!='\n') {
			jackpot++;
			J[i] = 0;
			d = getc(buf2);
			ctnew++;
		}
		ixnew[j] = ++ctnew;
		j++;
	}
	for(;j<=len[1];j++) {
		while(getc(buf2)!='\n') ctnew++;
		ixnew[j] = ++ctnew;
	}
	close(buf1->fdes);
	close(buf2->fdes);
	return(jackpot);
}

output(argv)
char **argv;
{
	int dir;
	int m;
	register int i0;
	register int i1;
	register int j1;
	int j0, i00;
	extern fout;
	if (**argv == '-')
		switch (argv[0][1]) {
		case '\0':	dir = 1;	break; /* editor script */
		case 'c':	dir = 2;	break; /* c style ifdefs */
		case 's':	dir = 3;	break;	/* assm ifdefs */
		case '.':	dir = 4;	break;	/* *'s and .'s */
		default:	dir = 1;	break;	/* Turkey ! */
		}
	else
		dir = 0;
	fout = dup(1);
	buf1->fdes = open(argv[1],0);
	buf2->fdes = open(argv[2],0);
	m = len[0];
	J[0] = 0;
	J[m+1] = len[1]+1;
	switch (dir) {
	case 0:
		for(i0=1;i0<=m;i0=i1+1) {
			while(i0<=m&&J[i0]==J[i0-1]+1) i0++;
			j0 = J[i0-1]+1;
			i1 = i0-1;
			while(i1<m&&J[i1+1]==0) i1++;
			j1 = J[i1+1]-1;
			J[i1] = j1;
			change(i0,i1,j0,j1,dir);
		}
		break;

	case 1:
		for(i0=m;i0>=1;i0=i1-1) {
			while(i0>=1&&J[i0]==J[i0+1]-1&&J[i0]!=0) i0--;
			j0 = J[i0+1]-1;
			i1 = i0+1;
			while(i1>1&&J[i1-1]==0) i1--;
			j1 = J[i1-1]+1;
			J[i1] = j1;
			change(i1,i0,j1,j0,dir);
		}
		break;

	case 2:
	case 3:
	case 4:
		for(i0=1;i0<=m;i0=i1+1) {
			i00 = i0;
			while(i0<=m&&J[i0]==J[i0-1]+1) i0++;
			fetch (ixold, i00, i0-1, buf1, (dir == 4) ? "  ":"");
			j0 = J[i0-1]+1;
			i1 = i0-1;
			while(i1<m&&J[i1+1]==0) i1++;
			j1 = J[i1+1]-1;
			J[i1] = j1;
			change(i0,i1,j0,j1,dir);
		}
		break;
	}
	if(m==0)
		change(1,0,1,len[1],dir);
	flush();
}

change(a,b,c,d,dir)
{
	if(a>b&&c>d) return;

	switch (dir) {
	case 0:	/* some useful info */
		range(a,b);
		putchar(a>b?'a':c>d?'d':'c');
		range(c,d);
		putchar('\n');
		fetch(ixold,a,b,buf1,"* ");
		if(a<=b&&c<=d) printf("---\n");
		fetch(ixnew,c,d,buf2,". ");
		break;

	case 1:	/* editor script file */
		range(a,b);
		putchar(a>b?'a':c>d?'d':'c');
		putchar('\n');
		fetch(ixnew,c,d,buf2,"");
		if(c<=d) printf(".\n");
		break;

	case 2:	/* c style ifdefs */
	case 3:	/* assembler style ifdefs */
		if (a <= b)
			printf ("#ifndef NEW\n");
		fetch(ixold,a,b,buf1,"");
		if (a <= b)
			printf ("#endif\n");
		if (c <= d)
			printf ("#ifdef NEW\n");
		fetch(ixnew,c,d,buf2,"");
		if (c <= d)
			printf ("#endif\n");
		break;

	case 4:	/* list it all out, *'s and .'s in front of lines */
		fetch(ixold,a,b,buf1,". ");
		fetch(ixnew,c,d,buf2,"* ");
		break;

	}
}

range(a,b)
{
	if(a>b) printf("%d",b);
	if(a<=b) printf("%d",a);
	if(a<b) printf(",%d",b);
}

fetch(f,a,b,lb,pref)
int *f;
struct buf *lb;
char *pref;
{
	register int i;
	register int j;
	register int nc;
	for(i=a;i<=b;i++) {
		seek(lb->fdes,f[i-1],0);
		nc = read(lb->fdes,lb->data,f[i]-f[i-1]);
		printf(pref);
		for(j=0;j<nc;j++)
			putchar(lb->data[j]);
	}
}