/*
 * V10/cmd/lp/file.c
 */

#include <u.h>
#include <libc.h>
#include <ctype.h>
/*
 * mbuf holds the status of the file currently being examined.
 * With plan9 defined it is a Dir filled in by dirstat(); otherwise
 * it is a struct stat filled in by stat(), and the names print and
 * OREAD used throughout this file are mapped to printf and open
 * mode 0.
 */
#ifdef plan9
Dir	mbuf;
#else
#define print	printf
struct stat mbuf;
#define OREAD	0
#endif
/*
 * file - determine type of file
 */

uchar	buf[6000];	/* leading bytes of the file being examined */
short	cfreq[140];	/* histogram: ascii code 0-127, or one of the C* class codes */
short	wfreq[50];	/* count of dictionary words seen, indexed by dict[].flag */
int	nbuf;		/* number of bytes read into buf */
int	flag;		/* size and gross-classification bits (Short, Fascii, ...) */
int	(*call[])(void);	/* classifier table; defined at the end of the file */

enum
{
	Cword,
	Fword,
	Aword,
	I1,
	I2,
	I3,
	Clatin	= 128,
	Cbinary,
	Cnull,
	Ceascii,
};
/*
 * Words looked for in ascii files, each tagged with the wfreq[]
 * slot it is counted in.  type() finds entries by binary search
 * with strcmp(), so the table must be kept in strcmp() order
 * (upper case sorts before lower case).
 */
struct
{
	char*	word;
	int	flag;
} dict[] =
{
	"TEXT",		Aword,
	"block",	Fword,
	"char",		Cword,
	"common",	Fword,
	"data",		Fword,
	"dimension",	Fword,
	"double",	Cword,
	"extern",	Cword,
	"fio",		I2,
	"float",	Cword,
	"function",	Fword,
	"h",		I3,
	"include",	I1,
	"int",		Cword,
	"integer",	Fword,
	"libc",		I2,
	"long",		Cword,
	"real",		Fword,
	"register",	Cword,
	"short",	Cword,
	"static",	Cword,
	"stdio",	I2,
	"struct",	Cword,
	"subroutine",	Fword,
	"u",		I2,
	"void",		Cword,
};

/*
 * Bits kept in flag.  Exactly one size bit and one F* bit are set
 * by type() for each file.
 */
enum
{
	Short	= 1<<0,		/* at most 100 bytes were read */
	Long	= 1<<1,		/* more than 100 bytes were read */

	Fascii	= 1<<2,		/* only printable or white-space ascii */
	Flatin	= 1<<3,		/* has bytes >= 160, none in 128-159 */
	Fbinary	= 1<<4,		/* has bytes in 128-159 */
	Feascii	= 1<<5,		/* has non-null control bytes, no bytes >= 128 */
	Fnull	= 1<<6,		/* has null bytes, otherwise plain ascii */
};

void	type(char*, int);	/* classify one file and print its description */
long	lendian(uchar*);	/* combine 4 bytes, least significant byte first */

/*
 * Classify every file named on the command line.
 * The length of the longest name is found first and handed to
 * type() so that all descriptions start in the same column.
 */
int
main(int argc, char *argv[])
{
	int arg, width;

	width = 0;
	arg = 1;
	while(arg < argc) {
		if(strlen(argv[arg]) > width)
			width = strlen(argv[arg]);
		arg++;
	}

	arg = 1;
	while(arg < argc) {
		type(argv[arg], width);
		arg++;
	}
	exit(0);
}

/*
 * type - classify one file and print "name: description".
 *
 * file is the path to examine.  nlen is the length of the longest
 * name given on the command line; it is used to pad the name so
 * that descriptions line up.
 *
 * The file is stat'ed, its first sizeof(buf) bytes are read into
 * buf, a byte histogram (cfreq) and dictionary-word counts (wfreq)
 * are built, and then each routine in call[] is tried in turn.
 * If none of them recognises the file the gross classification
 * (ascii, extended ascii, latin ascii, binary) is printed.
 */
void
type(char *file, int nlen)
{
	int i, f, l, m, c;
	char *p, *ep, word[20];

	/* the width taken by %*s must be an int, whatever type strlen returns */
	print("%s:%*s", file, nlen-(int)strlen(file)+1, "");
#ifdef plan9
	if(dirstat(file, &mbuf) < 0) {
		print("cannot stat\n");
		return;
	}
	if(mbuf.mode & CHDIR) {
		print("directory\n");
		return;
	}
	if(mbuf.type != 'M') {
		print("special file #%c\n", mbuf.type);
		return;
	}
#else
	if(stat(file, &mbuf) < 0) {
		print("cannot stat\n");
		return;
	}
	switch(mbuf.st_mode&S_IFMT) {
	case S_IFDIR:
		print("directory\n");
		return;
	case S_IFCHR:
		print("character special file\n");
		return;
	case S_IFBLK:
		print("block special file\n");
		return;
	}
#endif

	f = open(file, OREAD);
	if(f < 0) {
		print("cannot open\n");
		return;
	}
	nbuf = read(f, buf, sizeof(buf));
	close(f);

	if(nbuf < 0) {
		print("cannot read\n");
		return;
	}
	if(nbuf == 0) {
		print("empty\n");
		return;
	}

	/*
	 * buf is shared by every file on the command line.  Several
	 * classifiers look at fixed offsets without consulting nbuf,
	 * so clear whatever an earlier, longer file left behind.
	 */
	memset(buf+nbuf, 0, sizeof(buf)-nbuf);

	/*
	 * build histogram table
	 */
	memset(cfreq, 0, sizeof(cfreq));
	flag = 0;
	if(nbuf > 100)
		flag |= Long;
	else
		flag |= Short;

	for(i=0; i<nbuf; i++) {
		f = buf[i] & 0xff;
		if(f >= 128) {
			if(f >= 128+32)
				f = Clatin;	/* latin */
			else
				f = Cbinary;	/* not latin */
		} else if(!isprint(f) && !isspace(f)) {
			if(f == 0)
				f = Cnull;
			else
				f = Ceascii;
		}
		cfreq[f]++;
	}

	/*
	 * gross classify; the first class present, in this order, wins
	 */
	if(cfreq[Cbinary])
		flag |= Fbinary;
	else
	if(cfreq[Clatin])
		flag |= Flatin;
	else
	if(cfreq[Ceascii])
		flag |= Feascii;
	else
	if(cfreq[Cnull])
		flag |= Fnull;
	else
		flag |= Fascii;

	if(flag & Fnull) {
		print("null\n");
		return;
	}

	/*
	 * lookup dictionary words
	 */
	memset(wfreq, 0, sizeof(wfreq));
	if(flag & Fascii) {
		/* leave room in word[] for the terminating null */
		ep = word+sizeof(word)-2;
		for(i=0; i<nbuf; i++) {
			f = buf[i];
			if(!isalpha(f))
				continue;
			/* collect a run of alphanumerics starting with a letter */
			p = word;
			for(; i<nbuf; i++) {
				f = buf[i];
				if(!isalnum(f))
					break;
				*p++ = f;
				if(p >= ep)
					break;
			}
			*p = 0;
			/* binary search of dict[]; f and l bound the open range [f, l) */
			f = 0;
			l = sizeof(dict)/sizeof(dict[0]);
			for(;;) {
				if(f >= l)
					break;
				m = (f+l)/2;
				c = strcmp(dict[m].word, word);
				if(c == 0) {
					wfreq[dict[m].flag]++;
					break;
				}
				if(c < 0)
					f = m+1;
				else
					l = m;
			}
		}
	}

	/*
	 * call individual classify routines;
	 * the first one that returns non-zero has printed the answer
	 */
	for(i=0; call[i]; i++)
		if((*call[i])())
			return;

	/*
	 * if all else fails,
	 * print out gross classification
	 */
	if(flag & Short)
		print("short ");
	if(flag & Fascii)
		print("ascii\n");
	else
	if(flag & Feascii)
		print("extended ascii\n");
	else
	if(flag & Flatin)
		print("latin ascii\n");
	else
		print("binary\n");
}

/*
 * lendian - combine the four bytes at p into one value,
 * least significant byte first.
 *
 * The bytes are widened to unsigned long before shifting so that a
 * fourth byte of 0x80 or more is never shifted into the sign bit
 * of an int, and so that the result is not sign-extended when long
 * is wider than 32 bits.
 */
long
lendian(uchar *p)
{
	unsigned long v;

	v = (unsigned long)p[0];
	v |= (unsigned long)p[1] << 8;
	v |= (unsigned long)p[2] << 16;
	v |= (unsigned long)p[3] << 24;
	return v;
}

/*
 * long0 - recognise a file by the little-endian value of its
 * first four bytes.  Prints a description and returns 1 on a
 * match, returns 0 otherwise.
 */
int
long0(void)
{
	static struct
	{
		unsigned	magic;
		char*	desc;
	} tab[] =
	{
		{ 0411,		"jfr 411 executable\n" },
		{ 0177555,	"very old archive\n" },
		{ 0177545,	"old archive\n" },
		{ 0135246,	"view2d input file\n" },	/* andrew/ehg */
		{ 0135256,	"apl file\n" },			/* andrew */
		{ 0164200,	"Lucasfilm picture\n" },	/* td */
		{ 0600560,	"mux downloadable file\n" },
		{ 0x07010000,	"68020 plan9 executable\n" },
		{ 0x07040000,	"mips plan9 executable\n" },
		{ 0x97010000,	"hobbit plan9 executable\n" },
		{ 0xab020000,	"sparc plan9 executable\n" },
		{ 0xeb010000,	"386 plan9 executable\n" },
		{ 0x0b1f1bdc,	"daisy\n" },
		{ 0x64205300,	"S data object\n" },
	};
	unsigned magic;
	int k, isexec;

	magic = (unsigned)lendian(buf);

	/*
	 * the a.out family shares one description;
	 * only the leading qualifiers differ
	 */
	isexec = 1;
	if(magic == 0413) {
		print("demand paged ");
		print("pure ");
	} else if(magic == 0410)
		print("pure ");
	else if(magic == 0406)
		print("mpx 68000 ");
	else if(magic != 0407)
		isexec = 0;

	if(isexec) {
		print("unix vax executable");
		if(lendian(buf+4) != 0)
			print(" not stripped");
		print("\n");
		return 1;
	}

	/* everything else is a plain magic number to message lookup */
	for(k=0; k<sizeof(tab)/sizeof(tab[0]); k++)
		if(tab[k].magic == magic) {
			print("%s", tab[k].desc);
			return 1;
		}
	return 0;
}

/*
 * short0 - recognise a file by the little-endian value of its
 * first two bytes.  Prints a description and returns 1 on a
 * match, returns 0 otherwise.
 */
int
short0(void)
{
	long magic;
	char *desc;

	magic = lendian(buf) & 0xffff;

	if(magic == 070707)
		desc = "cpio archive\n";
	else if(magic == 0x02f7)
		desc = "tex dvi\n";
	else if(magic == 0405 || magic == 0407 || magic == 0410 || magic == 0411)
		desc = "pdp-11 executable\n";
	else if(magic == 0x0000)
		desc = "bitmap\n";
	else
		return 0;

	print("%s", desc);
	return 1;
}

/*
 * initial words to classify file
 *
 * Entries come in pairs: a string the file must start with,
 * followed by the description printed when it does.  istring()
 * scans the pairs in order and stops at the first match, so a
 * longer prefix must come before any shorter prefix of it.
 * The list ends with a pair of nulls.
 */
char*	iwords[] =
{
	"!<arch>\n__.SYMDEF",
		"archive random library",
	"!<arch>\n",
		"archive",
	"070707",
		"cpio archive - ascii header",
	"#FIG",
		"fig output",
	"#!/bin/echo",
		"cyntax object file",
	"#!/bin/rc",
		"rc executable file",
	"#!/bin/sh",
		"sh executable file",
	"%!",
		"postscript",
	"@document(",
		"imagen",
	"x T i300",
		"troff output for i300",
	"x T im300",
		"troff output for im300",
	"x T post",
		"troff output for post",
	"x T opost",
		"troff output for opost",
	"x T Latin1",
		"troff output for Latin1",
	"x T 202",
		"troff output for 202",
	"x T aps",
		"troff output for aps",
	0,0
};

/*
 * istring - recognise a file by the string it starts with.
 *
 * First tries each prefix in iwords[]; then looks for a leading
 * "TYPE=" line, whose value up to the newline is printed as the
 * picture type.  Both tests require that enough bytes were
 * actually read, so leftover bytes past nbuf are never matched.
 * Returns 1 after printing a description, 0 if nothing matched.
 */
int
istring(void)
{
	int i, n;
	char *p;

	for(i=0; (p=iwords[i]) != 0; i+=2) {
		n = strlen(p);
		if(nbuf >= n && strncmp((char*)buf, p, n) == 0) {
			print("%s\n", iwords[i+1]);
			return 1;
		}
	}
	if(nbuf >= 5 && strncmp((char*)buf, "TYPE=", 5) == 0) {	/* td */
		/* the value runs to the first newline or the end of the data */
		for(i=5; i<nbuf; i++)
			if(buf[i] == '\n')
				break;
		print("%.*s picture\n", i-5, (char*)buf+5);
		return 1;
	}
	return 0;
}

/*
 * ismung - guess whether the data is compressed or encrypted.
 *
 * The top three bits of each of the first 64 bytes are tallied
 * into 8 buckets.  If the tallies stay close to the even share of
 * 8 per bucket (sum of squared deviations, divided by 8, no more
 * than 24.322) the bytes are taken to be scrambled: "compressed"
 * when the file starts with the bytes 037 0235, "encrypted"
 * otherwise.  Files of fewer than 64 bytes are not tested.
 */
int
ismung(void)
{
	int k, dev, tally[8];
	float chisq;

	if(nbuf < 64)
		return 0;

	for(k=0; k<8; k++)
		tally[k] = 0;
	for(k=0; k<64; k++)
		tally[(buf[k]>>5)&07]++;

	chisq = 0.;
	for(k=0; k<8; k++) {
		dev = tally[k]-8;
		chisq += dev*dev;
	}
	chisq /= 8.;

	if(chisq > 24.322)
		return 0;

	if(buf[0]==037 && buf[1]==0235)
		print("compressed\n");
	else
		print("encrypted\n");
	return 1;
}

/*
 * lettersum - total cfreq[] count of the letters in set,
 * taking each letter in both the case given and lower case.
 */
static int
lettersum(char *set)
{
	int total;

	total = 0;
	for(; *set; set++)
		total += cfreq[*set] + cfreq[tolower(*set)];
	return total;
}

/*
 * english by punctuation and frequencies
 *
 * Only ascii and extended ascii files are considered.  The file is
 * rejected when more than a fifth of its punctuation marks are not
 * followed by a space or newline, or when '<', '>', '/' or ';'
 * are too common relative to 'e'.  It is accepted when vowels
 * make up at least a fifth of the non-space bytes and the letters
 * ETAION outnumber VJKQXZ at least ten to one.
 */
int
isenglish(void)
{
	int i, c, next, vow, comm, rare, badpun, punct;

	if((flag & (Fascii|Feascii)) == 0)
		return 0;

	badpun = 0;
	punct = 0;
	for(i=0; i+1<nbuf; i++) {
		c = buf[i];
		if(c != '.' && c != ',' && c != ')' && c != '%'
		&& c != ';' && c != ':' && c != '?')
			continue;
		punct++;
		next = buf[i+1];
		if(next != ' ' && next != '\n')
			badpun++;
	}
	if(badpun*5 > punct)
		return 0;

	/* shell file test */
	if(cfreq['>']+cfreq['<']+cfreq['/'] > cfreq['e'])
		return 0;
	if(2*cfreq[';'] > cfreq['e'])
		return 0;

	vow = lettersum("AEIOU");
	comm = lettersum("ETAION");
	rare = lettersum("VJKQXZ");

	if(vow*5 < nbuf-cfreq[' '] || comm < 10*rare)
		return 0;

	print("English text\n");
	return 1;
}

/*
 * isc - guess C source from the word and character counts.
 * Any one of three signs is enough:
 *	at least two "include"s, each matched by a header name,
 *	an "h" and a '.';
 *	at least five C keywords and five ';';
 *	at least ten ';' and ten '='.
 * Prints the description and returns 1 on success, else returns 0.
 */
int
isc(void)
{
	int ninc, looksc;

	ninc = wfreq[I1];

	looksc = 0;
	if(ninc >= 2 && wfreq[I2] >= ninc && wfreq[I3] >= ninc && cfreq['.'] >= ninc)
		looksc = 1;	/* includes */
	else if(wfreq[Cword] >= 5 && cfreq[';'] >= 5)
		looksc = 1;	/* declarations */
	else if(cfreq[';'] >= 10 && cfreq['='] >= 10)
		looksc = 1;	/* assignments */

	if(!looksc)
		return 0;
	print("c program text\n");
	return 1;
}

/*
 * isas - guess assembler source: the word tagged Aword in dict[]
 * ("TEXT") must occur at least twice.
 * Prints the description and returns 1 on success, else returns 0.
 */
int
isas(void)
{
	if(wfreq[Aword] < 2)
		return 0;

	print("assembler program text\n");
	return 1;
}

/*
 * iscint - recognise compiler intermediate files by the fixed
 * bytes they start with.  Each format begins with an opcode
 * (one or two bytes), a type byte, a symbol byte of 1 and then
 * '<', the first character of a file name.
 * Prints the description and returns 1 on a match, else returns 0.
 */
int
iscint(void)
{
	char *desc;

	desc = 0;

	/* one-byte opcode: as, type, sym, name */
	if(buf[2] == 1 && buf[3] == '<') {
		if(buf[0] == 0x3a && buf[1] == 0x11)		/* ANAME, D_FILE */
			desc = "mips .v intermediate\n";
		else if(buf[0] == 0x43 && buf[1] == 0x0d)
			desc = "hobbit .z intermediate\n";
		else if(buf[0] == 0x74 && buf[1] == 0x10)
			desc = "sparc .k intermediate\n";
	}

	/* two-byte opcode: aslo, ashi, type, sym, name */
	if(desc == 0 && buf[3] == 1 && buf[4] == '<') {
		if(buf[0] == 0x4d && buf[1] == 0x01 && buf[2] == 0x32)	/* ANAME, D_FILE */
			desc = "68020 .2 intermediate\n";
		else if(buf[0] == 0x7e && buf[1] == 0x00 && buf[2] == 0x45)
			desc = "386 .8 intermediate\n";
	}

	if(desc == 0)
		return 0;
	print("%s", desc);
	return 1;
}

/*
 * pick up a number with
 * syntax ' '*[0-9]+' '
 *
 * bp points at a field of P9BITLEN bytes: optional leading
 * spaces, then digits filling the field up to its last byte,
 * which must be a space.  Returns the value, or -1 if the field
 * is malformed or the value does not fit in an int.
 */
#define	P9BITLEN	12
int
p9bitnum(uchar *bp)
{
	int n, c, d, len, maxn;

	maxn = (int)(~0U >> 1);		/* largest int */
	len = P9BITLEN;
	while(*bp == ' ') {
		bp++;
		len--;
		if(len <= 0)
			return -1;
	}
	n = 0;
	while(len > 1) {
		c = *bp++;
		if(!isdigit(c))
			return -1;
		d = c - '0';
		/* refuse the digit if n*10 + d would overflow */
		if(n > (maxn - d)/10)
			return -1;
		n = n*10 + d;
		len--;
	}
	if(*bp != ' ')
		return -1;
	return n;
}

/*
 * isp9bit - recognise a plan 9 bitmap.
 *
 * The data must start with five P9BITLEN-byte decimal fields:
 * ldep, lox, loy, hix, hiy.  From them the expected file length
 * is computed as the 60-byte header plus (hiy-loy) rows of
 * (hix-lox)*(1<<ldep) bits, each row rounded up to whole bytes.
 * Prints the description and returns 1 on a match, else returns 0.
 */
int
isp9bit(void)
{
	int ldep, lox, loy, hix, hiy;
	long len;

	ldep = p9bitnum(buf + 0*P9BITLEN);
	lox = p9bitnum(buf + 1*P9BITLEN);
	loy = p9bitnum(buf + 2*P9BITLEN);
	hix = p9bitnum(buf + 3*P9BITLEN);
	hiy = p9bitnum(buf + 4*P9BITLEN);

	if(ldep < 0 || lox < 0 || loy < 0 || hix < 0 || hiy < 0)
		return 0;

	/*
	 * ldep comes straight from the file; a shift count that does
	 * not fit in an int is undefined, so such a header is rejected
	 * (assumes 8-bit bytes)
	 */
	if(ldep >= (int)(8*sizeof(int))-1)
		return 0;

	len = (hix-lox) * (1<<ldep);	/* row length */
	len = (len + 7) / 8;		/* rounded to bytes */
	len *= (hiy-loy);		/* col length */
	len += 60;			/* size of initial ascii */

	/*
	 * for regular file length is non-zero and must match calculation above
	 * for /dev/window and /dev/screen the length is always zero
	 */
#ifdef plan9
	if(mbuf.length != len && mbuf.length != 0)
#else
	if(mbuf.st_size != len && mbuf.st_size != 0)
#endif
		return 0;
	print("plan 9 bitmap\n");
	return 1;
}

/*
 * Classifiers tried by type(), in this order.  Each returns
 * non-zero after printing a description; the first to do so ends
 * the search, so earlier entries take precedence.  The list is
 * terminated by a null pointer.
 */
int	(*call[])(void) =
{
	long0,		/* recognizable by first 4 bytes */
	short0,		/* recognizable by first 2 bytes */
	istring,	/* recognizable by first string */
	iscint,		/* c intermediate */
	isc,		/* c compiler key words */
	isas,		/* assembler key words */
	ismung,		/* entropy compressed/encrypted */
	isenglish,	/* char frequency English */
	isp9bit,	/* plan 9 bitmap (as from /dev/window) */
	0
};