V10/cmd/spell/pcode.c

Compare this file to the similar file:
Show the results in this format:

#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include "code.h"

#ifndef __cplusplus

void	exit(int);
void	qsort(void*, unsigned, int, int(*)(void*, void*));

#else

#include <memory.h>
extern "C" {
void	exit(int);
void	qsort(void*, unsigned, int, int(*)(void*, void*));
}

#endif

/* read an annotated spelling list in the form
	word <tab> affixcode [ , affixcode ] ...
   print a reencoded version
	octal <tab> word
 */

typedef long Bits;
typedef	struct	Dict	Dict;
struct	Dict
{
	char*	word;
	Bits	encode;
};

Dict	words[200000];
char	space[500000];
Bits	encodes[4094];
long	nspace;
long	nwords;
int	ncodes;

void	readinput(FILE*);
long	typecode(char*);
int	wcmp(void*, void*);
void	pdict(void);
void	sput(int);

main(int argc, char *argv[])
{
	FILE* f;

	nwords = 0;
	nspace = 0;
	ncodes = 0;
	if(argc <= 1)
		readinput(stdin);
	while(argc > 1) {
		f = fopen(argv[1], "r");
		if(f == 0) {
			fprintf(stderr, "Cannot open %s\n", argv[1]);
			exit(1);
		}
		readinput(f);
		fclose(f);
		argc--;
		argv++;
	}
	fprintf(stderr, "words = %ld; space = %ld; codes = %d\n",
		nwords, nspace, ncodes);
	qsort(words, nwords, sizeof(words[0]), wcmp);
	pdict();
	return 0;
}

wcmp(void *a, void *b)
{

	return strcmp(((Dict*)a)->word, ((Dict*)b)->word);
}

void
readinput(FILE* f)
{
	long i;
	char *code, *bword;
	char line[200];
	long lineno = 0;

	while(fgets(line, sizeof(line), f)) {
		line[strlen(line)-1] = 0;
		lineno++;
		code = line;
		while(isspace(*code))
			code++;
		bword = code;
		while(*code && !isspace(*code))
			code++;

		i = code-bword;
		memcpy(space+nspace, bword, i);
		words[nwords].word = space+nspace;
		nspace += i;
		space[nspace] = 0;
		nspace++;

		if(*code) {
			*code++ = 0;
			while(isspace(*code))
				code++;
		}
		words[nwords].encode = typecode(code);
		nwords++;
		if(nwords >= sizeof(words)/sizeof(words[0])) {
			fprintf(stderr, "words array too small\n");
			exit(1);
		}
		if(nspace >= sizeof(space)/sizeof(space[0])) {
			fprintf(stderr, "space array too small\n");
			exit(1);
		}
	}
}


typedef	struct	Class	Class;
struct	Class
{
	char*	codename;
	long	bits;
};
Class	codea[]  =
{
	{ "a", ADJ },
	{ "adv", ADV },
	0
};
Class	codec[] =
{
	{ "comp", COMP },
	0
};
Class	coded[] =
{
	{ "d", DONT_TOUCH},
	0
};

Class	codee[] =
{
	{ "ed",	ED },
	{ "er", ACTOR },
	0
};

Class	codei[] =
{
	{ "in", IN },
	{ "ion", ION },
	0
};

Class	codem[] =
{
	{ "man", MAN },
	{ "ms", MONO },
	0
};

Class	coden[] =
{
	{ "n", NOUN },
	{ "na", N_AFFIX },
	{ "nopref", NOPREF },
	0
};

Class	codep[] =
{
	{ "pc", PROP_COLLECT },
	0
};
Class	codes[] =
{
	{ "s", STOP },
	0
};

Class	codev[] =
{
	{ "v", VERB },
	{ "va", V_AFFIX },
	{ "vi", V_IRREG },
	0
};

Class	codey[] =
{
	{ "y", _Y },
	0
};

Class	codez[] =
{
	0
};
Class*	codetab[] =
{
	codea,
	codez,
	codec,
	coded,
	codee,
	codez,
	codez,
	codez,
	codei,
	codez,
	codez,
	codez,
	codem,
	coden,
	codez,
	codep,
	codez,
	codez,
	codes,
	codez,
	codez,
	codev,
	codez,
	codez,
	codey,
	codez,
};

long
typecode(char *str)
{
	Class *p;
	long code;
	int n, i;
	char *s, *sp, *st;

	code = 0;

loop:
	for(s=str; *s != 0 && *s != ','; s++)
		;
	for(p = codetab[*str-'a']; sp = p->codename; p++) {
		st = str;
		for(n=s-str;; st++,sp++) {
			if(*st != *sp)
				goto cont;
			n--;
			if(n == 0)
				break;
		}
		code |= p->bits;
		if(*s == 0)
			goto out;
		str = s+1;
		goto loop;
	cont:;
	}
	fprintf(stderr, "Unknown affix code \"%s\"\n", str);
	return 0;
out:
	for(i=0; i<ncodes; i++)
		if(encodes[i] == code)
			return i;
	encodes[i] = code;
	ncodes++;
	return i;
}

void
sput(int s)
{

	putchar(s>>8);
	putchar(s);
}

void
lput(long l)
{
	putchar(l>>24);
	putchar(l>>16);
	putchar(l>>8);
	putchar(l);
}

/*
 * spit out the encoded dictionary
 * all numbers are encoded big-endian.
 *	struct
 *	{
 *		short	ncodes;
 *		int	encodes[ncodes];
 *		struct
 *		{
 *			short	encode;
 *			char	word[*];
 *		} words[*];
 *	};
 * 0x8000 flag for code word
 * 0x7800 count of number of common bytes with previous word
 * 0x07ff index into codes array for affixes
 */
void
pdict(void)
{
	long i, count;
	Bits encode;
	int j, c;
	char *lastword, *thisword, *word;

	sput(ncodes);
	for(i=0; i<ncodes; i++)
		lput(encodes[i]);

	count = ncodes*4 + 2;
	lastword = "";
	for(i=0; i<nwords; i++) {
		word = words[i].word;
		thisword = word;
		for(j=0; *thisword == *lastword; j++) {
			if(*thisword == 0) {
				fprintf(stderr, "identical words: %s\n", word);
				break;
			}
			thisword++;
			lastword++;
		}
		if(j > 15)
			j = 15;
		encode = words[i].encode;
		c = (1<<15) | (j<<11) | encode;
		sput(c);
		count += 2;
		for(thisword=word+j; c = *thisword; thisword++) {
			putchar(c);
			count++;
		}
		lastword = word;
	}
	fprintf(stderr, "output bytes = %ld\n", count);
}