/* V10/cmd/spell/pcode.c */
#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include "code.h"
#ifndef __cplusplus
void exit(int);
void qsort(void*, unsigned, int, int(*)(void*, void*));
#else
#include <memory.h>
extern "C" {
void exit(int);
void qsort(void*, unsigned, int, int(*)(void*, void*));
}
#endif
/* read an annotated spelling list in the form
word <tab> affixcode [ , affixcode ] ...
and write a reencoded, binary version to standard output
(the layout is described in the comment above pdict)
*/
/* a set of affix-class flag bits; typecode ORs the per-code bits together */
typedef long Bits;
typedef struct Dict Dict;
/* one dictionary entry, filled in by readinput */
struct Dict
{
char* word;	/* NUL-terminated spelling; points into space[] */
Bits encode;	/* value returned by typecode: an index into encodes[] */
};
Dict words[200000];	/* all entries read so far; sorted by main before output */
char space[500000];	/* storage for the NUL-terminated words, packed end to end */
Bits encodes[4094];	/* the distinct flag combinations seen by typecode */
long nspace;	/* bytes of space[] in use */
long nwords;	/* entries of words[] in use */
int ncodes;	/* entries of encodes[] in use */
void readinput(FILE*);	/* parse one input stream into words[] and space[] */
long typecode(char*);	/* map a comma-separated code list to an encodes[] index */
int wcmp(void*, void*);	/* qsort comparison on Dict.word */
void pdict(void);	/* write the encoded dictionary to standard output */
void sput(int);	/* write a 16-bit value, high byte first */
/*
 * Read every file named on the command line (or standard input when no
 * file is named), report the table sizes on stderr, sort the words with
 * wcmp and write the encoded dictionary to standard output via pdict.
 * Exits with status 1 if a named file cannot be opened.
 */
int
main(int argc, char *argv[])
{
	FILE *f;
	int i;

	nwords = 0;
	nspace = 0;
	ncodes = 0;
	if(argc <= 1)
		readinput(stdin);
	for(i = 1; i < argc; i++) {
		f = fopen(argv[i], "r");
		if(f == 0) {
			fprintf(stderr, "Cannot open %s\n", argv[i]);
			exit(1);
		}
		readinput(f);
		fclose(f);
	}
	fprintf(stderr, "words = %ld; space = %ld; codes = %d\n",
		nwords, nspace, ncodes);
	qsort(words, nwords, sizeof(words[0]), wcmp);
	pdict();
	return 0;
}
wcmp(void *a, void *b)
{
return strcmp(((Dict*)a)->word, ((Dict*)b)->word);
}
/*
 * Read lines from f and append one entry to words[] per line.
 *
 * On each line leading white space is skipped, the word is the run of
 * characters up to the next white space, and whatever follows (after
 * more white space) is handed to typecode as the affix code list.
 * The word is copied, NUL-terminated, into space[].
 *
 * Exits with status 1 when words[] or space[] would be exhausted.
 * A line longer than the local buffer is still delivered by fgets in
 * several pieces, each of which is treated as a line of its own.
 */
void
readinput(FILE* f)
{
	long i;
	size_t len;
	char *code, *bword;
	char line[200];

	while(fgets(line, sizeof(line), f)) {
		/*
		 * Drop the newline only when there is one: the last line of
		 * a file, or a piece of an over-long line, has none and must
		 * not lose its final character.
		 */
		len = strlen(line);
		if(len > 0 && line[len-1] == '\n')
			line[len-1] = 0;

		/* <ctype.h> functions need an unsigned char value */
		code = line;
		while(isspace((unsigned char)*code))
			code++;
		bword = code;
		while(*code && !isspace((unsigned char)*code))
			code++;
		i = code-bword;

		/* make sure the word and its NUL fit before copying anything */
		if(nspace+i+1 >= (long)(sizeof(space)/sizeof(space[0]))) {
			fprintf(stderr, "space array too small\n");
			exit(1);
		}
		memcpy(space+nspace, bword, i);
		words[nwords].word = space+nspace;
		nspace += i;
		space[nspace] = 0;
		nspace++;

		/* terminate the word in line[] and step to the code list */
		if(*code) {
			*code++ = 0;
			while(isspace((unsigned char)*code))
				code++;
		}
		words[nwords].encode = typecode(code);
		nwords++;
		if(nwords >= (long)(sizeof(words)/sizeof(words[0]))) {
			fprintf(stderr, "words array too small\n");
			exit(1);
		}
	}
}
/* one entry of an affix-code lookup table, searched by typecode */
typedef struct Class Class;
struct Class
{
char* codename;	/* code name as written in the input; 0 marks the end of a table */
long bits;	/* flag bits typecode ORs into its result when this name matches */
};
Class codea[] =
{
{ "a", ADJ },
{ "adv", ADV },
0
};
Class codec[] =
{
{ "comp", COMP },
0
};
Class coded[] =
{
{ "d", DONT_TOUCH},
0
};
Class codee[] =
{
{ "ed", ED },
{ "er", ACTOR },
0
};
Class codei[] =
{
{ "in", IN },
{ "ion", ION },
0
};
Class codem[] =
{
{ "man", MAN },
{ "ms", MONO },
0
};
Class coden[] =
{
{ "n", NOUN },
{ "na", N_AFFIX },
{ "nopref", NOPREF },
0
};
Class codep[] =
{
{ "pc", PROP_COLLECT },
0
};
Class codes[] =
{
{ "s", STOP },
0
};
Class codev[] =
{
{ "v", VERB },
{ "va", V_AFFIX },
{ "vi", V_IRREG },
0
};
Class codey[] =
{
{ "y", _Y },
0
};
Class codez[] =
{
0
};
/*
 * Table of code tables, indexed by (first letter of the code) - 'a'.
 * Letters that start no code name point at the empty table codez.
 */
Class* codetab[] =
{
	codea,	/* a */
	codez,	/* b */
	codec,	/* c */
	coded,	/* d */
	codee,	/* e */
	codez,	/* f */
	codez,	/* g */
	codez,	/* h */
	codei,	/* i */
	codez,	/* j */
	codez,	/* k */
	codez,	/* l */
	codem,	/* m */
	coden,	/* n */
	codez,	/* o */
	codep,	/* p */
	codez,	/* q */
	codez,	/* r */
	codes,	/* s */
	codez,	/* t */
	codez,	/* u */
	codev,	/* v */
	codez,	/* w */
	codez,	/* x */
	codey,	/* y */
	codez,	/* z */
};
/*
 * Translate a comma-separated list of affix code names into an index
 * into encodes[].
 *
 * Each name is looked up in the table selected by its first letter; an
 * entry matches when the whole name equals the leading characters of
 * the entry's codename, and the first matching entry wins.  The bits of
 * all matched entries are ORed together.  The combination is then
 * looked up in encodes[] and appended there if it is new.
 *
 * Returns the index of the combination in encodes[].  If any name is
 * unknown (including an empty name or one that does not start with a
 * lower-case letter) a message is printed on stderr and 0 is returned.
 * Exits with status 1 if encodes[] is full.
 */
long
typecode(char *str)
{
	Class *p;
	long code;
	int n, i;
	char *s;

	code = 0;
	for(;;) {
		/* s is left on the ',' or NUL that ends the current name */
		for(s=str; *s != 0 && *s != ','; s++)
			;

		/*
		 * Only a-z may index codetab; anything else (an empty name,
		 * a trailing comma, a capital, ...) is an unknown code.
		 */
		p = 0;
		if(*str >= 'a' && *str <= 'z') {
			n = s-str;
			for(p = codetab[*str-'a']; p->codename; p++)
				if(strncmp(str, p->codename, n) == 0)
					break;
		}
		if(p == 0 || p->codename == 0) {
			fprintf(stderr, "Unknown affix code \"%s\"\n", str);
			return 0;
		}

		code |= p->bits;
		if(*s == 0)
			break;
		str = s+1;
	}

	/* reuse an existing slot for this combination if there is one */
	for(i=0; i<ncodes; i++)
		if(encodes[i] == code)
			return i;
	if(ncodes >= (int)(sizeof(encodes)/sizeof(encodes[0]))) {
		fprintf(stderr, "codes array too small\n");
		exit(1);
	}
	encodes[i] = code;
	ncodes++;
	return i;
}
/*
 * Write the low 16 bits of s to standard output as two bytes,
 * more significant byte first.
 */
void
sput(int s)
{
	int hi, lo;

	hi = s>>8;
	lo = s;
	putchar(hi);
	putchar(lo);
}
/*
 * Write the low 32 bits of l to standard output as four bytes,
 * most significant byte first.
 */
void
lput(long l)
{
	int shift;

	for(shift = 24; shift >= 0; shift -= 8)
		putchar(l>>shift);
}
/*
* spit out the encoded dictionary
* all numbers are encoded big-endian.
* struct
* {
* short ncodes;
* int encodes[ncodes];
* struct
* {
* short encode;
* char word[*];
* } words[*];
* };
* 0x8000 flag for code word
* 0x7800 count of number of common bytes with previous word
* 0x07ff index into codes array for affixes
*/
/*
 * Write the dictionary to standard output: ncodes as a 16-bit value,
 * the ncodes entries of encodes[] as 32-bit values, then for every word
 * (in the order of words[]) a 16-bit header followed by the bytes of
 * the word that are not shared with the previous word.
 *
 * The header is 0x8000 | (shared prefix length << 11) | encodes index;
 * the prefix length is capped at 15 because it has only four bits.
 * A word equal to its predecessor is reported on stderr and still
 * written (as a header with no bytes after it).
 * The number of bytes written is reported on stderr at the end.
 *
 * Exits with status 1, before writing anything, if there are more
 * codes than the 11-bit index field of the header can address.
 */
void
pdict(void)
{
	long i, count;
	Bits encode;
	int j, c;
	char *lastword, *thisword, *word;

	/* an index above 0x07ff would spill into the prefix-length bits */
	if(ncodes > 0x07ff+1) {
		fprintf(stderr, "too many affix codes: %d\n", ncodes);
		exit(1);
	}

	sput(ncodes);
	for(i=0; i<ncodes; i++)
		lput(encodes[i]);
	count = ncodes*4 + 2;

	lastword = "";
	for(i=0; i<nwords; i++) {
		word = words[i].word;

		/* j = number of leading bytes shared with the previous word */
		for(j=0; word[j] == lastword[j]; j++) {
			if(word[j] == 0) {
				fprintf(stderr, "identical words: %s\n", word);
				break;
			}
		}
		if(j > 15)
			j = 15;

		encode = words[i].encode;
		c = (int)((1<<15) | (j<<11) | encode);
		sput(c);
		count += 2;

		/* only the part after the shared prefix is written */
		for(thisword=word+j; (c = *thisword) != 0; thisword++) {
			putchar(c);
			count++;
		}
		lastword = word;
	}
	fprintf(stderr, "output bytes = %ld\n", count);
}