V10/cmd/lp/file.c
#include <u.h>
#include <libc.h>
#include <ctype.h>
#ifdef plan9
Dir mbuf;
#else
#define print printf
struct stat mbuf;
#define OREAD 0
#endif
/*
* file - determine type of file
*/
/* leading bytes of the file being classified; filled by read() in type() */
uchar buf[6000];
/*
 * histogram of the bytes in buf: slots below 128 are indexed by the
 * byte value itself, the Clatin/Cbinary/Cnull/Ceascii slots count
 * whole classes of bytes
 */
short cfreq[140];
/* number of dictionary words seen in buf, indexed by the word's dict flag */
short wfreq[50];
/* number of bytes read into buf for the current file */
int nbuf;
/* gross classification of the current file: Short/Long plus one F* bit */
int flag;
/* zero-terminated list of classifier routines; defined later in this file */
int (*call[])(void);
/*
 * Index values for the two frequency tables.
 *
 * Cword..I3 are the flag values stored in dict[] and index wfreq[].
 * Clatin..Ceascii index cfreq[] just above the 7-bit range and count
 * classes of bytes rather than individual byte values.
 */
enum
{
	Cword	= 0,	/* word listed as C in dict[] */
	Fword	= 1,	/* word listed as Fortran-like in dict[] */
	Aword	= 2,	/* word listed as assembler in dict[] */
	I1	= 3,	/* "include" */
	I2	= 4,	/* a header base name: fio, libc, stdio, u */
	I3	= 5,	/* "h" */

	Clatin	= 128,	/* byte >= 128+32 */
	Cbinary	= 129,	/* byte in 128 .. 128+31 */
	Cnull	= 130,	/* NUL byte */
	Ceascii	= 131,	/* other 7-bit byte that is neither printable nor space */
};
/*
 * Words that hint at the language of a text file, each tagged with the
 * wfreq[] slot it is counted in.
 *
 * type() looks words up with a binary search using strcmp(), so the
 * entries must stay in strcmp() order (upper case sorts before lower).
 */
struct
{
	char	*word;
	int	flag;
} dict[] =
{
	{ "TEXT",	Aword },
	{ "block",	Fword },
	{ "char",	Cword },
	{ "common",	Fword },
	{ "data",	Fword },
	{ "dimension",	Fword },
	{ "double",	Cword },
	{ "extern",	Cword },
	{ "fio",	I2 },
	{ "float",	Cword },
	{ "function",	Fword },
	{ "h",		I3 },
	{ "include",	I1 },
	{ "int",	Cword },
	{ "integer",	Fword },
	{ "libc",	I2 },
	{ "long",	Cword },
	{ "real",	Fword },
	{ "register",	Cword },
	{ "short",	Cword },
	{ "static",	Cword },
	{ "stdio",	I2 },
	{ "struct",	Cword },
	{ "subroutine",	Fword },
	{ "u",		I2 },
	{ "void",	Cword },
};
/*
 * Bits kept in the global flag.  type() sets exactly one of Short/Long
 * and exactly one of the F* bits for each file.
 */
enum
{
	Short	= 0x01,	/* at most 100 bytes were read */
	Long	= 0x02,	/* more than 100 bytes were read */
	Fascii	= 0x04,	/* printable ascii */
	Flatin	= 0x08,	/* has bytes >= 128+32, none in 128..128+31 */
	Fbinary	= 0x10,	/* has bytes in 128..128+31 */
	Feascii	= 0x20,	/* including extended */
	Fnull	= 0x40,	/* ascii apart from NUL bytes */
};
/* classify the named file and print the result; the int is the width of the longest name */
void type(char*, int);
/* combine four bytes, least significant byte first, into one value */
long lendian(uchar*);
/*
 * Classify every file named on the command line.
 *
 * A first pass finds the longest name so that type() can pad each
 * "name:" prefix to the same column.
 */
int
main(int argc, char *argv[])
{
	int arg, len, widest;

	widest = 0;
	for(arg=1; arg<argc; arg++) {
		len = strlen(argv[arg]);
		if(len > widest)
			widest = len;
	}
	arg = 1;
	while(arg < argc) {
		type(argv[arg], widest);
		arg++;
	}
	exit(0);
}
/*
 * type - classify one file and print "<file>: <description>".
 *
 * file is the path to examine.  nlen is the length of the longest name
 * being reported; it is used to pad the "file:" prefix so that the
 * descriptions line up.
 *
 * The routine
 *	1. stats the file and reports directories and special files,
 *	2. reads up to sizeof(buf) bytes into buf (count in nbuf),
 *	3. fills cfreq[] with a histogram of those bytes and sets flag,
 *	4. for plain ascii, counts dictionary words into wfreq[],
 *	5. runs the routines in call[] until one claims the file,
 *	6. otherwise prints the gross classification held in flag.
 */
void
type(char *file, int nlen)
{
	int i, fd, ch, lo, hi, mid, cmp;
	char *p, *ep, word[20];

	/*
	 * the argument consumed by %* must be an int, so convert the
	 * strlen result before doing the arithmetic
	 */
	print("%s:%*s", file, nlen-(int)strlen(file)+1, "");

#ifdef plan9
	if(dirstat(file, &mbuf) < 0) {
		print("cannot stat\n");
		return;
	}
	if(mbuf.mode & CHDIR) {
		print("directory\n");
		return;
	}
	if(mbuf.type != 'M') {
		print("special file #%c\n", mbuf.type);
		return;
	}
#else
	if(stat(file, &mbuf) < 0) {
		print("cannot stat\n");
		return;
	}
	switch(mbuf.st_mode&S_IFMT) {
	case S_IFDIR:
		print("directory\n");
		return;
	case S_IFCHR:
		print("character special file\n");
		return;
	case S_IFBLK:
		print("block special file\n");
		return;
	}
#endif

	fd = open(file, OREAD);
	if(fd < 0) {
		print("cannot open\n");
		return;
	}
	nbuf = read(fd, buf, sizeof(buf));
	close(fd);
	if(nbuf < 0) {
		print("cannot read\n");
		return;
	}
	if(nbuf == 0) {
		print("empty\n");
		return;
	}

	/*
	 * buf is shared by every file on the command line and several
	 * classifiers look at fixed offsets without consulting nbuf.
	 * Clear what this read did not fill so that a short file is
	 * never judged by bytes left over from the previous one.
	 */
	memset(buf+nbuf, 0, sizeof(buf)-nbuf);

	/*
	 * build histogram table
	 */
	memset(cfreq, 0, sizeof(cfreq));
	flag = 0;
	if(nbuf > 100)
		flag |= Long;
	else
		flag |= Short;
	for(i=0; i<nbuf; i++) {
		ch = buf[i] & 0xff;
		if(ch >= 128) {
			if(ch >= 128+32)
				ch = Clatin;	/* latin */
			else
				ch = Cbinary;	/* not latin */
		} else if(!isprint(ch) && !isspace(ch)) {
			if(ch == 0)
				ch = Cnull;
			else
				ch = Ceascii;
		}
		cfreq[ch]++;
	}

	/*
	 * gross classify: the first class present, in this order, wins
	 */
	if(cfreq[Cbinary])
		flag |= Fbinary;
	else if(cfreq[Clatin])
		flag |= Flatin;
	else if(cfreq[Ceascii])
		flag |= Feascii;
	else if(cfreq[Cnull])
		flag |= Fnull;
	else
		flag |= Fascii;
	if(flag & Fnull) {
		print("null\n");
		return;
	}

	/*
	 * lookup dictionary words
	 */
	memset(wfreq, 0, sizeof(wfreq));
	if(flag & Fascii) {
		/* leaves room for the terminating NUL; longer words are cut here */
		ep = word+sizeof(word)-2;
		for(i=0; i<nbuf; i++) {
			if(!isalpha(buf[i]))
				continue;
			/* collect a letter followed by letters and digits */
			p = word;
			for(; i<nbuf; i++) {
				ch = buf[i];
				if(!isalnum(ch))
					break;
				*p++ = ch;
				if(p >= ep)
					break;
			}
			*p = 0;

			/* binary search; dict[] is kept in strcmp order */
			lo = 0;
			hi = sizeof(dict)/sizeof(dict[0]);
			while(lo < hi) {
				mid = (lo+hi)/2;
				cmp = strcmp(dict[mid].word, word);
				if(cmp == 0) {
					wfreq[dict[mid].flag]++;
					break;
				}
				if(cmp < 0)
					lo = mid+1;
				else
					hi = mid;
			}
		}
	}

	/*
	 * call individual classify routines;
	 * the first to return non-zero has printed the answer
	 */
	for(i=0; call[i]; i++)
		if((*call[i])())
			return;

	/*
	 * if all else fails,
	 * print out gross classification
	 */
	if(flag & Short)
		print("short ");
	if(flag & Fascii)
		print("ascii\n");
	else if(flag & Feascii)
		print("extended ascii\n");
	else if(flag & Flatin)
		print("latin ascii\n");
	else
		print("binary\n");
}
/*
 * lendian - assemble the four bytes at p into one value, p[0] being the
 * least significant byte.
 *
 * The word is built in unsigned arithmetic: shifting a promoted byte
 * with its top bit set left by 24 would overflow a signed int, which is
 * undefined.  Where long is wider than 32 bits the result lies in
 * 0..0xffffffff; callers should mask or truncate rather than rely on
 * the sign of the result.
 */
long
lendian(uchar *p)
{
	unsigned long v;

	v = (unsigned long)p[0];
	v |= (unsigned long)p[1] << 8;
	v |= (unsigned long)p[2] << 16;
	v |= (unsigned long)p[3] << 24;
	return (long)v;
}
/*
 * long0 - recognize a file by the little-endian 32-bit word at the start
 * of buf.  Prints a description and returns 1 on a match, else returns 0.
 *
 * The four vax a.out magic numbers share one message built from optional
 * prefixes, plus " not stripped" when the word at offset 4 is non-zero.
 * Every other magic number maps straight to a fixed message.
 */
int
long0(void)
{
	static struct
	{
		unsigned	magic;
		char	*what;
	} known[] =
	{
		{ 0411,		"jfr 411 executable\n" },
		{ 0177555,	"very old archive\n" },
		{ 0177545,	"old archive\n" },
		{ 0135246,	"view2d input file\n" },	/* andrew/ehg */
		{ 0135256,	"apl file\n" },			/* andrew */
		{ 0164200,	"Lucasfilm picture\n" },	/* td */
		{ 0600560,	"mux downloadable file\n" },
		{ 0x07010000,	"68020 plan9 executable\n" },
		{ 0x07040000,	"mips plan9 executable\n" },
		{ 0x97010000,	"hobbit plan9 executable\n" },
		{ 0xab020000,	"sparc plan9 executable\n" },
		{ 0xeb010000,	"386 plan9 executable\n" },
		{ 0x0b1f1bdc,	"daisy\n" },
		{ 0x64205300,	"S data object\n" },
	};
	unsigned magic;
	int k;

	magic = (unsigned)lendian(buf);

	if(magic == 0413 || magic == 0410 || magic == 0406 || magic == 0407) {
		/* 0413 is reported as both demand paged and pure */
		if(magic == 0413)
			print("demand paged ");
		if(magic == 0413 || magic == 0410)
			print("pure ");
		if(magic == 0406)
			print("mpx 68000 ");
		print("unix vax executable");
		if(lendian(buf+4) != 0)
			print(" not stripped");
		print("\n");
		return 1;
	}

	for(k=0; k<(int)(sizeof(known)/sizeof(known[0])); k++)
		if(known[k].magic == magic) {
			print("%s", known[k].what);
			return 1;
		}
	return 0;
}
/*
 * short0 - recognize a file by the little-endian 16-bit word at the
 * start of buf.  Prints a description and returns 1 on a match, else
 * returns 0.
 */
int
short0(void)
{
	long magic;
	char *what;

	magic = lendian(buf) & 0xffff;
	if(magic == 070707)
		what = "cpio archive\n";
	else if(magic == 0x02f7)
		what = "tex dvi\n";
	else if(magic == 0405 || magic == 0407 || magic == 0410 || magic == 0411)
		what = "pdp-11 executable\n";
	else if(magic == 0x0000)
		what = "bitmap\n";
	else
		return 0;
	print("%s", what);
	return 1;
}
/*
 * Leading strings that identify a file, stored as pairs:
 * the text the file must begin with, then the description to print.
 * The list ends with a pair of null pointers.
 *
 * Entries are tried in order and the first match wins, so a string
 * must come before any other entry that is a prefix of it.
 */
char* iwords[] =
{
	"!<arch>\n__.SYMDEF",
	"archive random library",
	"!<arch>\n",
	"archive",
	"070707",
	"cpio archive - ascii header",
	"#FIG",
	"fig output",
	"#!/bin/echo",
	"cyntax object file",
	"#!/bin/rc",
	"rc executable file",
	"#!/bin/sh",
	"sh executable file",
	"%!",
	"postscript",
	"@document(",
	"imagen",
	"x T i300",
	"troff output for i300",
	"x T im300",
	"troff output for im300",
	"x T post",
	"troff output for post",
	"x T opost",
	"troff output for opost",
	"x T Latin1",
	"troff output for Latin1",
	"x T 202",
	"troff output for 202",
	"x T aps",
	"troff output for aps",
	0, 0
};
/*
 * istring - recognize a file by the text it begins with.
 *
 * First tries each prefix in iwords[] and prints the paired description.
 * Failing that, a file starting with "TYPE=" is reported as
 * "<rest of first line> picture".  Returns 1 if something was printed,
 * otherwise 0.
 */
int
istring(void)
{
	int i, n;
	char *p;

	for(i=0; (p=iwords[i]) != 0; i+=2) {
		n = strlen(p);
		if(nbuf >= n && strncmp((char*)buf, p, n) == 0) {
			print("%s\n", iwords[i+1]);
			return 1;
		}
	}

	/*
	 * as in the loop above, only compare bytes that were really
	 * read: buf is not cleared between files
	 */
	if(nbuf >= 5 && strncmp((char*)buf, "TYPE=", 5) == 0) {	/* td */
		for(i=5; i<nbuf; i++)
			if(buf[i] == '\n')
				break;
		/* buf is not NUL terminated; the precision bounds the print */
		print("%.*s picture\n", i-5, (char*)buf+5);
		return 1;
	}
	return 0;
}
/*
 * ismung - guess whether the data is compressed or encrypted.
 *
 * The top three bits of each of the first 64 bytes are tallied into
 * eight buckets; a perfectly even spread puts 8 in each.  The mean
 * squared deviation from 8 is then compared with a threshold: a small
 * value, i.e. an even spread of byte values, is taken as the sign of
 * compressed or encrypted data.  Files shorter than 64 bytes are never
 * claimed.  A file whose first two bytes are 037 0235 is reported as
 * compressed, anything else as encrypted.
 */
int
ismung(void)
{
	int k, dev, bucket[8];
	float cs;

	if(nbuf < 64)
		return 0;

	for(k=0; k<8; k++)
		bucket[k] = 0;
	for(k=0; k<64; k++)
		bucket[(buf[k]>>5)&07]++;

	cs = 0.;
	for(k=0; k<8; k++) {
		dev = bucket[k]-8;
		cs += dev*dev;
	}
	cs /= 8.;
	if(cs > 24.322)
		return 0;

	if(buf[0]==037 && buf[1]==0235)
		print("compressed\n");
	else
		print("encrypted\n");
	return 1;
}
/*
 * total occurrences in cfreq[] of the given upper case letters,
 * counting both the upper and the lower case form of each
 */
static int
bothcases(char *letters)
{
	int total;

	total = 0;
	while(*letters) {
		total += cfreq[*letters];
		total += cfreq[tolower(*letters)];
		letters++;
	}
	return total;
}

/*
 * isenglish - guess English prose from punctuation and letter frequencies.
 *
 * Only ascii and extended ascii files are considered.  The file is
 * rejected when
 *	- more than a fifth of the punctuation marks . , ) % ; : ? are
 *	  not followed by a space or newline,
 *	- there are more > < / characters than the letter e (shell file test),
 *	- there are more than half as many semicolons as the letter e.
 * It is accepted when vowels make up at least a fifth of the non-blank
 * bytes and the letters ETAION outnumber VJKQXZ at least ten to one.
 */
int
isenglish(void)
{
	int k, marks, misplaced, vowels, common, rare;

	if((flag & (Fascii|Feascii)) == 0)
		return 0;

	marks = 0;
	misplaced = 0;
	for(k=0; k+1<nbuf; k++) {
		switch(buf[k]) {
		default:
			continue;
		case '.':
		case ',':
		case ')':
		case '%':
		case ';':
		case ':':
		case '?':
			break;
		}
		marks++;
		if(buf[k+1] != ' ' && buf[k+1] != '\n')
			misplaced++;
	}
	if(misplaced*5 > marks)
		return 0;

	if(cfreq['>']+cfreq['<']+cfreq['/'] > cfreq['e'])	/* shell file test */
		return 0;
	if(2*cfreq[';'] > cfreq['e'])
		return 0;

	vowels = bothcases("AEIOU");
	common = bothcases("ETAION");
	rare = bothcases("VJKQXZ");
	if(vowels*5 < nbuf-cfreq[' '] || common < 10*rare)
		return 0;

	print("English text\n");
	return 1;
}
/*
 * isc - guess C source from the word and character counts.
 *
 * Any one of three signs is enough:
 *	includes:     at least two "include" words, each matched by a
 *	              header base name, an "h" and a '.'
 *	declarations: at least five C keywords and five semicolons
 *	assignments:  at least ten semicolons and ten '=' characters
 * Prints the verdict and returns 1 on success, else returns 0.
 */
int
isc(void)
{
	int nincl, looksc;

	nincl = wfreq[I1];
	looksc = (nincl >= 2 && wfreq[I2] >= nincl && wfreq[I3] >= nincl && cfreq['.'] >= nincl)
		|| (wfreq[Cword] >= 5 && cfreq[';'] >= 5)
		|| (cfreq[';'] >= 10 && cfreq['='] >= 10);
	if(!looksc)
		return 0;
	print("c program text\n");
	return 1;
}
/*
 * isas - guess assembler source: at least two dictionary words tagged
 * Aword were seen.  Prints the verdict and returns 1, else returns 0.
 */
int
isas(void)
{
	if(wfreq[Aword] < 2)
		return 0;
	print("assembler program text\n");
	return 1;
}
/*
 * iscint - recognize compiler intermediate files by their first bytes.
 *
 * Each format starts with an opcode (one byte, or two as low/high),
 * a type byte, a symbol byte of 1 and then '<' as the first character
 * of a file name.  Prints the format and returns 1 on a match, else
 * returns 0.
 */
int
iscint(void)
{
	/* as = ANAME, type = D_FILE, sym, name of file */
	if(buf[0] == 0x3a && buf[1] == 0x11 && buf[2] == 1 && buf[3] == '<') {
		print("mips .v intermediate\n");
		return 1;
	}
	/* aslo, ashi = ANAME, type = D_FILE, sym, name of file */
	if(buf[0] == 0x4d && buf[1] == 0x01 && buf[2] == 0x32 && buf[3] == 1 && buf[4] == '<') {
		print("68020 .2 intermediate\n");
		return 1;
	}
	/* as = ANAME, type, sym, name of file */
	if(buf[0] == 0x43 && buf[1] == 0x0d && buf[2] == 1 && buf[3] == '<') {
		print("hobbit .z intermediate\n");
		return 1;
	}
	/* as = ANAME, type, sym, name of file */
	if(buf[0] == 0x74 && buf[1] == 0x10 && buf[2] == 1 && buf[3] == '<') {
		print("sparc .k intermediate\n");
		return 1;
	}
	/* aslo, ashi = ANAME, type = D_FILE, sym, name of file */
	if(buf[0] == 0x7e && buf[1] == 0x00 && buf[2] == 0x45 && buf[3] == 1 && buf[4] == '<') {
		print("386 .8 intermediate\n");
		return 1;
	}
	return 0;
}
/*
 * p9bitnum - read one fixed-width decimal field.
 *
 * A field is P9BITLEN bytes: optional leading spaces, then digits up
 * to byte P9BITLEN-2, then a single space in the last byte.
 * Returns the value, or -1 if the bytes at bp do not have that shape.
 */
#define P9BITLEN 12
int
p9bitnum(uchar *bp)
{
	int n, c, pos;

	/* skip the padding; a field of nothing but spaces is not a number */
	pos = 0;
	while(bp[pos] == ' ') {
		pos++;
		if(pos >= P9BITLEN)
			return -1;
	}

	/* everything up to the last byte must be a digit */
	n = 0;
	for(; pos < P9BITLEN-1; pos++) {
		c = bp[pos];
		if(!isdigit(c))
			return -1;
		n = n*10 + c-'0';
	}

	/* and the field must end in a space */
	if(bp[pos] != ' ')
		return -1;
	return n;
}
int
isp9bit(void)
{
int ldep, lox, loy, hix, hiy;
long len;
ldep = p9bitnum(buf + 0*P9BITLEN);
lox = p9bitnum(buf + 1*P9BITLEN);
loy = p9bitnum(buf + 2*P9BITLEN);
hix = p9bitnum(buf + 3*P9BITLEN);
hiy = p9bitnum(buf + 4*P9BITLEN);
if(ldep < 0 || lox < 0 || loy < 0 || hix < 0 || hiy < 0)
return 0;
len = (hix-lox) * (1<<ldep); /* row length */
len = (len + 7) / 8; /* rounded to bytes */
len *= (hiy-loy); /* col length */
len += 60; /* size of initial ascii */
/*
* for regular file length is non-zero and must match calculation above
* for /dev/window and /dev/screen the length is always zero
*/
#ifdef plan9
if(mbuf.length != len && mbuf.length != 0)
#else
if(mbuf.st_size != len && mbuf.st_size != 0)
#endif
return 0;
print("plan 9 bitmap\n");
return 1;
}
/*
 * Classifier routines, tried by type() in the order listed until one
 * returns non-zero.  Each routine takes no arguments, inspects the
 * globals set up by type() and prints its own verdict when it claims
 * the file.  Order matters: an earlier routine wins over a later one.
 * The list is terminated by a zero entry.
 */
int (*call[])(void) =
{
long0, /* recognizable by first 4 bytes */
short0, /* recognizable by first 2 bytes */
istring, /* recognizable by first string */
iscint, /* c intermediate */
isc, /* c compiler key words */
isas, /* assembler key words */
ismung, /* entropy compressed/encrypted */
isenglish, /* char frequency English */
isp9bit, /* plan 9 bitmap (as from /dev/window) */
0
};