V10/cmd/sort/merge.c

Compare this file to the similar file:
Show the results in this format:

/* Copyright 1990, AT&T Bell Labs */
#include <stdlib.h>
#include <string.h>
#include "fsort.h"

#define NMERGE 16

enum { IEOF, IDUP, IOK };

int nextfile;

struct merge {
	char *name;		/* name of file for diagnostics */
	FILE *file;
	struct rec *rec;	/* pointer to the data (and key) */
	short del;		/* delete at close */
	short serial;	/* adjudicates equality for stable sort */
};


struct merge mfile[2*NMERGE];	/* -u needs 2 records per file */
int nmfile = 0;			/* number of initialized structs */
struct merge *flist[2*NMERGE];
int nflist = 0;

static void mergephase(int, char*);
static int insert(void);
static int compare(struct merge*, struct merge*);
extern int link(char*, char*);
extern int unlink(char*);
static void mv(int, int);

static void
recalloc(struct merge *m)
{
	if(m->rec)
		return;
	m->rec = (struct rec*)malloc(MINREC);
	if(m->rec == 0)
		fatal("no space for merge records", "", 0);
	m->rec->next = (struct rec*)((uchar*)m->rec + MINREC);
}

static void
recinit(int j, int n, int flag)
{
	int i;
	struct merge *m;
	while(nmfile < (uflag&&!flag? 2*n: n))
		recalloc(&mfile[nmfile++]);
	for(i=0; i<n; i++) {
		m = &mfile[i];
		m->name = flag? filename(j+i): files[j+i];
		m->file = fileopen(m->name, "r");
		m->serial = i;
		m->del = flag;
		mfile[i+n].name = m->name; /*for uflag*/
		mfile[i+n].file = m->file;
		mfile[i+n].serial = i;
	}
}

static void
recfinal(int n, int flag)
{
	int i;
	if(flag)
		for(i=0; i<n; i++)
			free(mfile[i].name);
}

/* misfortune : fields are parsed in their entirety
   before comparison.  lazy parsing might be in order */
/* flag is 0 for merging input files, 1 for intermediates */

void
merge(int nf, int flag)
{
	char buf[BUFSIZ];
	FILE *input, *output;
	char *name;
	int i, j, bunches, n;
	int nmgd = 0;

	do {
		bunches = (nf+NMERGE-1)/NMERGE;
		
		for(i=bunches,j=0; i>0; nmgd+=n, j+=n, nf-=n,i--) {
			n = (nf+i-1)/i;
			recinit(j, n, flag);
			if(bunches > 1 || !flag && overwrite(nmgd))
				name = filename(nextfile++);
			else
				name = oname;
			mergephase(uflag&&!flag? 2*n: n, name);
			recfinal(n, flag);
		}
		if(name == oname)
			return;
		for(i=nextfile, j=bunches; --j>=0 && --i>j; )
			mv(i, j);
		nf = nextfile = bunches;
		flag = 1;
	} while(nf > 1);

	input = fileopen(name, "r");
	output = fileopen(oname, "w");
	while(n = fread(buf, 1, sizeof(buf), input))
		if(fwrite(buf, 1, n, output) != n)
			fatal("error writing", oname, 0);
	fileclose(input, name);
	unlink(name);
	fileclose(output, oname);
}

static void
mv(int i, int j)
{
	char *old = filename(i);
	char *new = filename(j);
	unlink(new);
	if(link(old,new) == -1 || unlink(old) == -1)
		fatal("cannot move", old, 0);
	free(old);
	free(new);
}

		/* merge 1st n>=1 files in mfile[] */
static void
mergephase(int n, char *name)
{
	int i, c;
	struct merge *m;
	uchar *p, *e;
	FILE *output = fileopen(name, "w");

	nflist = 0;
	for(i=0; i<n; i++) {
		flist[nflist] = &mfile[i];
		while(insert() == IDUP)
			continue;
	}
	while(nflist > 0) {
		m = flist[0];
		p = data(m->rec);
		i = m->rec->dlen;
		e = p + i++;
		c = *e;
		*e = '\n';
		if(fwrite((char*)p, 1, i, output) != i)
			fatal("error writing", oname, 0);
		*e = c;
		nflist--;
		memmove(flist, flist+1, nflist*sizeof(*flist));
		flist[nflist] = m;
		while(insert() == IDUP)
			continue;
	}

	for(i=0; i<n; i++) {
		fileclose(mfile[i].file, 0);
		if(mfile[i].del)
			unlink(mfile[i].name);
	}
	fileclose(output, name);
}

static int
fillrec(struct merge *m)
{
	struct rec *r = getline(m->rec, m->file);
	if(r == 0)
		return IOK;
	if(r == ENDFILE)
		return IEOF;
	free(m->rec);
	m->rec = r;
	return IOK;
}

	/* opportunity for optimization:
	   one call of insert is preceded by memmove, which
           insert will undo right away if there is significant
	   clustering so that successive outputs are likely
	   to come from the same input file */

static int
insert(void)
{
	int i;
	int bot,top, t;
	struct merge *m = flist[nflist];
	struct merge *temp;
	if(fillrec(m) == IEOF)
		return IEOF;
	bot = 0;
	top = nflist;
	while(bot < top) {
		i = (bot+top)/2;
		t = compare(m, flist[i]);
		if(t < 0)
			top = i;
		else if(t > 0)
			bot = i + 1;
		else if(uflag) {
			if(m->serial < flist[i]->serial) {
				temp = flist[i];
				flist[i] = flist[nflist];
				flist[nflist] = temp;
			}
			return IDUP;
		} else if(m->serial < flist[i]->serial)
			top = i;
		else
			bot = i + 1;
	}
	memmove(flist+bot+1,flist+bot,(nflist-bot)*sizeof(*flist));
	flist[bot] = m;
	nflist++;
	return IOK;
}		

static int		
compare(struct merge *mi, struct merge *mj)
{
	uchar *ip, *jp;
	uchar *ei, *ej;
	uchar *trans, *keep;
	int li, lj, k;
	if(simplekeyed) {
		trans = fields->trans;
		keep = fields->keep;
		ip = data(mi->rec);
		jp = data(mj->rec);
		ei = ip + mi->rec->dlen;
		ej = jp + mj->rec->dlen;
		for( ; ; ip++, jp++) {
			while(ip<ei && !keep[*ip]) ip++;
			while(jp<ej && !keep[*jp]) jp++;
			if(ip>=ei || jp>=ej) break;
			k = trans[*ip] - trans[*jp];
			if(k != 0) break;
		}
		if(ip<ei && jp<ej)
			return k*signedrflag;
		else if(ip < ei)
			return signedrflag;
		else if(jp < ej)
			return -signedrflag;
		else if(sflag)
			return 0;
	} else if(keyed) {
		ip = key(mi->rec);
		jp = key(mj->rec);
		li = mi->rec->klen;
		lj = mj->rec->klen;
		for(k=li<lj?li:lj; --k>=0; ip++, jp++)
			if(*ip != *jp)
				break;
		if(k < 0) {
			if(li != lj)
				fatal("theorem disproved","",0);
			if(sflag)
				return 0;
		} else
			return *ip - *jp;
		
	}
	li = mi->rec->dlen;
	lj = mj->rec->dlen;
	ip = data(mi->rec);
	jp = data(mj->rec);
	for(k=li<lj?li:lj; --k>=0; ip++, jp++)
		if(*ip != *jp)
			break;
	return (k<0? li-lj: *ip-*jp)*signedrflag;
}

void
check(char *name)
{
	int i, t;

	recalloc(&mfile[0]);
	recalloc(&mfile[1]);
	mfile[0].file = mfile[1].file = fileopen(name, "r");
	if(mfile[0].file == 0)
		fatal("can't open ", name, 0);

	if(fillrec(&mfile[0]) == IEOF)
		return;
	for(i=1; fillrec(&mfile[i])!=IEOF; i^=1) {
		t = compare(&mfile[i^1], &mfile[i]);
		if(t>0 || t==0 && uflag) {
			if(mfile[i].rec->dlen)
				fatal("disorder:",
				      (char*)data(mfile[i].rec),
				      mfile[i].rec->dlen);
			else
				fatal("disorder at empty record","",0);
		}

	}
}