4.4BSD/usr/src/contrib/bib/src/invert.c

Compare this file to the similar file:
Show the results in this format:

#ifndef lint
static char sccsid[] = "@(#)invert.c	2.7	5/27/93";
#endif not lint
#
/*  input:  records of lines, separated by blank lines
    output: key:file1 start/length ... start/length:file2 start/length ...
*/

# include "stdio.h"
# include "streams.h"
# include "bib.h"
# define makelow(c) ('A'<=(c) && (c)<='Z' ? (c)-'A'+'a' : c)

int     max_kcnt = 100;     /*  max number of keys                      */
int     max_klen =   6;     /*  max length of keys                      */
char    *ignore =           /*  string of line starts to ignore         */
            "CNOPVX";
char    *INDEX=             /*  name of output file                     */
            INDXFILE;

char    *bibtmpfile =          /*  name of temporary file                  */
            INVTEMPFILE;

int	silent = 0;	    /*  0 => statistics printed			*/
			    /*  1 => no statisitics printed		*/

char *sort_it =
        "sort -u +0 -1 +1 -2 +2n -3 +3n %s -o %s";
char sortcmd[maxstr];

int     argc;
char    **argv;

main(argcount,arglist)
int argcount;
char **arglist;
{   char            *filename;
    FILE            *input, *output;
    long int        start,length;
    char            word[maxstr];
    int             kcnt;
    char            tag_line[maxstr];
    int 	    bol = 1; /* at beginning of line */

    long int	    records = 0;  /*  number of records read           */
    long int	    keys    = 0;  /*  number of keys read (occurences) */
    long int	    distinct;     /*  number of distinct keys          */
    long int	    shorten();

    InitDirectory(BMACLIB,N_BMACLIB);
    InitDirectory(COMFILE,N_COMFILE);

    argc= argcount-1;
    argv= arglist+1;
    mktemp(bibtmpfile);
    output= fopen(bibtmpfile,"w");

    for ( flags() ; argc>0 ; argc--, argv++ ,flags() )
    {   /* open input file              */
            filename=   *argv;
            input=      fopen(filename,"r");
            if (input==NULL)
            {   fprintf(stderr,"invert: error in open of %s\n", filename);
                continue;
            }
      start=      0L;
      length=     0L;

      for(;;) /* each record  */ {
	 /* find start of next record (exit if none)     */
	 start= nextrecord(input,start+length);
	 if (start==EOF)   break;
	 records++;
	 kcnt= 0;
	 length= recsize(input,start);
	 sprintf(tag_line, " %s %d %d\n", filename, start, length);

	 while (ftell(input) < start+length && kcnt < max_kcnt) {
	    getword(input,word,ignore,&bol);
	    makekey(word,max_klen,COMFILE);
	    if (*word != NULL) {
	       fputs(word,output); fputs(tag_line,output);
	       kcnt++; keys++;
	       }
	    }
	 }
       fclose(input);
       }
    fclose(output);

    sprintf(sortcmd, sort_it, bibtmpfile, bibtmpfile);
    system(sortcmd);

    distinct = shorten(bibtmpfile,INDEX);
    if( silent == 0 )
	fprintf(stderr,
	    "%d documents   %d distinct keys  %d key occurrences\n",
	    records, distinct, keys);
    exit(0);
}



/*  Flag    Meaning                             Default
    -ki     Keys per record                     100
    -li     max Length of keys                  6
    -%str   ignore lines that begin with %x     CNOPVX
            where x is in str
            str is a seq of chars
    -cfile  file contains Common words          /usr/new/lib/bib/common
            do not use common words as keys
    -pfile  name of output file                 INDEX
    -s	    do not print statistics		statistics printed
*/

# define    operand     (strlen(*argv+2)==0 ? (argv++,argc--,*argv) : *argv+2)

flags()
{
    char *p;
    for (; argc>0 && *argv[0]=='-';  argc--,argv++)
    {   switch ((*argv)[1])
        {   case 'k':   max_kcnt= atoi(operand);
                        break;
            case 'l':   max_klen= atoi(operand);
                        break;
            case 'c':   strcpy(COMFILE,operand);
                        break;
            case '%':   ignore=  *argv+2;
                        break;
            case 'p':   INDEX=  operand;
                        break;
	    case 's':	silent= 1;
			break;
	    case 'd':
		p = &argv[0][2];
		if (!p) { 
			argv++;
			p = &argv[0][0];
		}
		strreplace(COMFILE, BMACLIB, p);
		strcpy(BMACLIB, p);
		break;
            default:    fprintf(stderr,"unknown flag '%s'\n", *argv);
        }
    }
}


/*  shorten(inf,outf): file "inf" consists of lines of the form:
        key file start length
    sorted by key and file.  replace lines with the same key
    with one line of the form:
        key:file1 start/length ... start/length:file2 start/length ...
    rename as file "outf"
    returns number of lines in output
*/
long shorten(inf,outf)
char *inf, *outf;
{   FILE *in, *out;
    char line[maxstr];
    char key[maxstr],  newkey[maxstr],
         file[maxstr], newfile[maxstr];
    long int start, length;
    long int lines = 0;

    in=  fopen(inf, "r");
    out= fopen(outf, "w");
    if (in==NULL || out==NULL)
    {   fprintf(stderr,"invert: error in opening file for compression\n");
        return(0);
    }

    getline(in,line);
    sscanf(line,"%s%s%d%d", key, file, &start, &length);
    fprintf(out, "%s :%s %d/%d", key, file, start, length);
    for ( getline(in, line) ; !feof(in);  getline(in, line))
    {   sscanf(line,"%s%s%d%d", newkey, newfile, &start, &length);
        if (strcmp(key,newkey)!=0)
        {   strcpy(key, newkey);
            strcpy(file, newfile);
            fprintf(out, "\n%s :%s %d/%d",  key, file, start, length);
	    lines++;
        }
        else if (strcmp(file,newfile)!=0)
        {   strcpy(file,newfile);
            fprintf(out, ":%s %d/%d", file, start, length);
        }
        else
            fprintf(out, " %d/%d", start, length);
    }
    fprintf(out, "\n");
    lines++;

    fclose(in); fclose(out);
    unlink(inf);
    return (lines);
}