v13i028: Replacement for the file(1) command, Part02/02

Rich Salz rsalz at bbn.com
Tue Feb 9 07:26:33 AEST 1988


Submitted-by: "Ian F. Darwin" <ian at sq.com>
Posting-number: Volume 13, Issue 28
Archive-name: file/part02

: to unbundle, sh this file
echo x - strtok.c 1>&2
cat >strtok.c <<'@@@End of strtok.c'
/*
 * Get next token from string s (NULL on 2nd, 3rd, etc. calls),
 * where tokens are nonempty strings separated by runs of
 * chars from delim.  Writes NULs into s to end tokens.  delim need not
 * remain constant from call to call.
 *
 * Copyright (c) Henry Spencer.
 * Written by Henry Spencer.
 *
 * This software is not subject to any license of the American Telephone
 * and Telegraph Company or of the Regents of the University of California.
 *
 * Permission is granted to anyone to use this software for any purpose on
 * any computer system, and to alter it and redistribute it freely, subject
 * to the following restrictions:
 *
 * 1. The author is not responsible for the consequences of use of this
 *    software, no matter how awful, even if they arise from flaws in it.
 *
 * 2. The origin of this software must not be misrepresented, either by
 *    explicit claim or by omission.  Since few users ever read sources,
 *    credits must appear in the documentation.
 *
 * 3. Altered versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.  Since few users
 *    ever read sources, credits must appear in the documentation.
 *
 * 4. This notice may not be removed or altered.
 */

#define	NULL	0
#define CONST

/*
 * Where the next call with s == NULL resumes scanning; NULL when the
 * previous string has been used up (or no string has been given yet).
 */
static char *scanpoint = NULL;

/*
 * strtok - return the next token of s, or of the remembered string
 * when s is NULL.  A token is a maximal run of characters none of
 * which occurs in delim.  The delimiter that ends the token is
 * overwritten with a NUL and scanning resumes just past it next time.
 */
char *				/* NULL if no token left */
strtok(s, delim)
char *s;
register CONST char *delim;
{
	register char *cursor;
	register CONST char *d;
	char *start;

	/* Pick the string to work on; nothing remembered means no token. */
	cursor = (s != NULL) ? s : scanpoint;
	if (cursor == NULL)
		return(NULL);

	/*
	 * Step over any delimiters in front of the token.
	 */
	while (*cursor != '\0') {
		d = delim;
		while (*d != '\0' && *d != *cursor)
			d++;
		if (*d == '\0')
			break;		/* *cursor is not a delimiter */
		cursor++;
	}
	if (*cursor == '\0') {
		/* Only delimiters (or nothing) were left. */
		scanpoint = NULL;
		return(NULL);
	}

	start = cursor;

	/*
	 * Walk the token until a delimiter or the end of the string.
	 */
	while (*cursor != '\0') {
		for (d = delim; *d != '\0'; d++) {
			if (*d == *cursor) {
				*cursor = '\0';
				scanpoint = cursor + 1;
				return(start);
			}
		}
		cursor++;
	}

	/*
	 * The token ran to the end of the string; nothing is left over.
	 */
	scanpoint = NULL;
	return(start);
}
@@@End of strtok.c
echo x - strchr.c 1>&2
cat >strchr.c <<'@@@End of strchr.c'
/*
 * Local copy of strchr (a.k.a. index) for portability.
 * Totally public domain.
 */

#include <stdio.h>

/*
 * strchr - return a pointer to the first occurrence of c in s, or
 * NULL if c does not occur.  The terminating NUL counts as part of
 * the string, so searching for '\0' yields a pointer to the terminator.
 */
char *
strchr(s, c)
char *s, c;
{
	char *p;

	for (p = s; ; p++) {
		if (*p == c)
			return(p);
		if (*p == '\0')
			return(NULL);
	}
}

@@@End of strchr.c
echo x - file.h 1>&2
cat >file.h <<'@@@End of file.h'
/*
 * file.h - definitions for file(1) program
 # @(#)$Header: file.h,v 1.4 87/09/18 10:56:09 ian Exp $
 *
 * Copyright (c) Ian F. Darwin, 1987.
 * Written by Ian F. Darwin.
 *
 * This software is not subject to any license of the American Telephone
 * and Telegraph Company or of the Regents of the University of California.
 *
 * Permission is granted to anyone to use this software for any purpose on
 * any computer system, and to alter it and redistribute it freely, subject
 * to the following restrictions:
 *
 * 1. The author is not responsible for the consequences of use of this
 *    software, no matter how awful, even if they arise from flaws in it.
 *
 * 2. The origin of this software must not be misrepresented, either by
 *    explicit claim or by omission.  Since few users ever read sources,
 *    credits must appear in the documentation.
 *
 * 3. Altered versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.  Since few users
 *    ever read sources, credits must appear in the documentation.
 *
 * 4. This notice may not be removed or altered.
 */

#define HOWMANY	1024		/* how much of the file to look at */
#define MAXMAGIS 250		/* max entries in /etc/magic */
#define MAXDESC	50		/* max leng of text description */
#define MAXstring 32		/* max leng of "string" types */
#define ckfputs(str,fil) {if (fputs(str,fil)==EOF) error(ckfmsg,"");}

/*
 * One test from the magic file: where to look, how to compare, what to
 * compare against and what description goes with it.
 */
struct magic {
	short contflag;		/* 1 if '>0' appears (line written with a leading '>') */
	long offset;		/* offset to magic number */
	char reln;		/* relation (0=eq, '>'=gt, etc) */
	char type;		/* BYTE, SHORT, LONG or STRING (codes below) */
	char vallen;		/* length of string value, if any */
/* Codes stored in 'type'. */
#define 			BYTE	1
#define				SHORT	2
#define				LONG	4
#define				STRING	5
	union VALUETYPE {
		char b;			/* used when type is BYTE */
		short h;		/* used when type is SHORT */
		long l;			/* used when type is LONG */
		char s[MAXstring];	/* used when type is STRING */
	} value;		/* either number or string */
	char desc[MAXDESC];	/* description */
};

extern void error(), exit();
@@@End of file.h
echo x - names.h 1>&2
cat >names.h <<'@@@End of names.h'
/*
 * Names.h - names and types used by ascmagic in file(1).
 * These tokens are here because they can appear anywhere in
 * the first HOWMANY bytes, while tokens in /etc/magic must
 * appear at fixed offsets into the file. Don't make HOWMANY
 * too high unless you have a very fast CPU.
 *
 * Copyright (c) Ian F. Darwin, 1987.
 * Written by Ian F. Darwin.
 *
 * This software is not subject to any license of the American Telephone
 * and Telegraph Company or of the Regents of the University of California.
 *
 * Permission is granted to anyone to use this software for any purpose on
 * any computer system, and to alter it and redistribute it freely, subject
 * to the terms in the accompanying LEGAL.NOTICE file.
 */

/* these types are used to index the table 'types': keep em in sync! */
#define L_C	0		/* first and foremost on UNIX */
#define	L_FORT	1		/* the oldest one */
#define L_MAKE	2		/* Makefiles */
#define L_PLI	3		/* PL/1 */
#define L_MACH	4		/* some kinda assembler */
#define L_ENG	5		/* English */
#define	L_PAS	6		/* Pascal */
#define	L_MAIL	7		/* Electronic mail */
#define	L_NEWS	8		/* Usenet Netnews */

/*
 * Description string for each L_* language code.  The position of a
 * string in this table is the value of the matching L_* constant, so
 * the two lists must be edited together.  The table ends with a
 * catch-all error string followed by a null pointer.
 */
char *types[] = {
	"c program text",		/* L_C */
	"fortran program text",		/* L_FORT */
	"make commands text" ,		/* L_MAKE */
	"pl/1 program text",		/* L_PLI */
	"assembler program text",	/* L_MACH */
	"English text",			/* L_ENG */
	"pascal program text",		/* L_PAS */
	"mail text",			/* L_MAIL */
	"news text",			/* L_NEWS */
	"can't happen error on names.h/types",	/* no L_* code maps here */
	0};

/*
 * Keyword table: each entry pairs a string with the L_* code (an index
 * into types[]) that it indicates.  The final all-zero entry is a
 * terminator; NNAMES, defined after the table, leaves it out of the count.
 */
struct names {
	char *name;		/* keyword text */
	short type;		/* L_* code for this keyword */
} names[] = {
	/* These must be sorted by eye for optimal hit rate */
	/* Add to this list only after substantial meditation */
	{"/*",		L_C},	/* must precede "The", "the", etc. */
	{"#include",	L_C},
	{"char",	L_C},
	{"The",		L_ENG},
	{"the",		L_ENG},
	{"double",	L_C},
	{"extern",	L_C},
	{"float",	L_C},
	{"real",	L_C},
	{"struct",	L_C},
	{"union",	L_C},
	{"CFLAGS",	L_MAKE},
	{"LDFLAGS",	L_MAKE},
	{"all:",	L_MAKE},
	{".PRECIOUS",	L_MAKE},
/* Too many files of text have these words in them.  Find another way
 * to recognize Fortrash.
 */
#ifdef	NOTDEF
	{"subroutine",	L_FORT},
	{"function",	L_FORT},
	{"block",	L_FORT},
	{"common",	L_FORT},
	{"dimension",	L_FORT},
	{"integer",	L_FORT},
	{"data",	L_FORT},
#endif	/*NOTDEF*/
	{".ascii",	L_MACH},
	{".asciiz",	L_MACH},
	{".byte",	L_MACH},
	{".even",	L_MACH},
	{".globl",	L_MACH},
	{"clr",		L_MACH},
	{"(input,",	L_PAS},
	{"dcl",		L_PLI},
	{"Received:",	L_MAIL},
	{">From",	L_MAIL},
	{"Return-Path:",L_MAIL},
	{"Cc:",		L_MAIL},
	{"Newsgroups:",	L_NEWS},
	{"Path:",	L_NEWS},
	{"Organization:",L_NEWS},
	0};
/* Number of real entries in names[], not counting the zero terminator. */
#define NNAMES ((sizeof(names)/sizeof(struct names)) - 1)
@@@End of names.h
echo x - tar.h 1>&2
cat >tar.h <<'@@@End of tar.h'
/*
 * Header file for public domain tar (tape archive) program.
 *
 * @(#)tar.h 1.20 86/10/29	Public Domain.
 *
 * Created 25 August 1985 by John Gilmore, ihnp4!hoptoad!gnu.
 */

/*
 * Kludge for handling systems that can't cope with multiple
 * external definitions of a variable.  In ONE routine (tar.c),
 * we #define TAR_EXTERN to null; here, we set it to "extern" if
 * it is not already set.
 */
#ifndef TAR_EXTERN
#define TAR_EXTERN extern
#endif

/*
 * Header block on tape.
 *
 * I'm going to use traditional DP naming conventions here.
 * A "block" is a big chunk of stuff that we do I/O on.
 * A "record" is a piece of info that we care about.
 * Typically many "record"s fit into a "block".
 */
#define	RECORDSIZE	512
#define	NAMSIZ	100
#define	TUNMLEN	32
#define	TGNMLEN	32

/*
 * One RECORDSIZE-byte record, viewable either as raw bytes (charptr)
 * or as a header.  All header members are plain char arrays.
 */
union record {
	char		charptr[RECORDSIZE];	/* the record as raw bytes */
	struct header {
		char	name[NAMSIZ];
		char	mode[8];
		char	uid[8];
		char	gid[8];
		char	size[12];
		char	mtime[12];
		char	chksum[8];		/* holds CHKBLANKS while the checksum is computed */
		char	linkflag;		/* one of the LF_* file type codes */
		char	linkname[NAMSIZ];
		char	magic[8];		/* TMAGIC if uname and gname are valid */
		char	uname[TUNMLEN];
		char	gname[TGNMLEN];
		char	devmajor[8];
		char	devminor[8];
	} header;
};

/* The checksum field is filled with this while the checksum is computed. */
#define	CHKBLANKS	"        "	/* 8 blanks, no null */

/* The magic field is filled with this if uname and gname are valid. */
#define	TMAGIC		"ustar  "	/* 7 chars and a null */

/* The linkflag defines the type of file */
#define	LF_OLDNORMAL	'\0'		/* Normal disk file, Unix compat */
#define	LF_NORMAL	'0'		/* Normal disk file */
#define	LF_LINK		'1'		/* Link to previously dumped file */
#define	LF_SYMLINK	'2'		/* Symbolic link */
#define	LF_CHR		'3'		/* Character special file */
#define	LF_BLK		'4'		/* Block special file */
#define	LF_DIR		'5'		/* Directory */
#define	LF_FIFO		'6'		/* FIFO special file */
#define	LF_CONTIG	'7'		/* Contiguous file */
/* Further link types may be defined later. */

/*
 * Exit codes from the "tar" program
 */
#define	EX_SUCCESS	0		/* success! */
#define	EX_ARGSBAD	1		/* invalid args */
#define	EX_BADFILE	2		/* invalid filename */
#define	EX_BADARCH	3		/* bad archive */
#define	EX_SYSTEM	4		/* system gave unexpected error */


/*
 * Global variables
 */
TAR_EXTERN union record	*ar_block;	/* Start of block of archive */
TAR_EXTERN union record	*ar_record;	/* Current record of archive */
TAR_EXTERN union record	*ar_last;	/* Last+1 record of archive block */
TAR_EXTERN char		ar_reading;	/* 0 writing, !0 reading archive */
TAR_EXTERN int		blocking;	/* Size of each block, in records */
TAR_EXTERN int		blocksize;	/* Size of each block, in bytes */
TAR_EXTERN char		*ar_file;	/* File containing archive */
TAR_EXTERN char		*name_file;	/* File containing names to work on */
TAR_EXTERN char		*tar;		/* Name of this program */

/*
 * Flags from the command line
 */
TAR_EXTERN char	f_reblock;		/* -B */
TAR_EXTERN char	f_create;		/* -c */
TAR_EXTERN char	f_debug;		/* -d */
TAR_EXTERN char	f_sayblock;		/* -D */
TAR_EXTERN char	f_follow_links;		/* -h */
TAR_EXTERN char	f_ignorez;		/* -i */
TAR_EXTERN char	f_keep;			/* -k */
TAR_EXTERN char	f_modified;		/* -m */
TAR_EXTERN char	f_oldarch;		/* -o */
TAR_EXTERN char	f_use_protection;	/* -p */
TAR_EXTERN char	f_sorted_names;		/* -s */
TAR_EXTERN char	f_list;			/* -t */
TAR_EXTERN char	f_namefile;		/* -T */
TAR_EXTERN char	f_verbose;		/* -v */
TAR_EXTERN char	f_extract;		/* -x */
TAR_EXTERN char	f_compress;		/* -z */

/*
 * We now default to Unix Standard format rather than 4.2BSD tar format.
 * The code can actually produce all three:
 *	f_standard	ANSI standard
 *	f_oldarch	V7
 *	neither		4.2BSD
 * but we don't bother, since 4.2BSD can read ANSI standard format anyway.
 * The only advantage to the "neither" option is that we can cmp(1) our
 * output to the output of 4.2BSD tar, for debugging.
 */
#define		f_standard		(!f_oldarch)

/*
 * Structure for keeping track of filenames and lists thereof.
 */
/* One node of the singly linked list headed by namelist (tail: namelast). */
struct name {
	struct name	*next;		/* following node in the list */
	short		length;		/* NOTE(review): presumably strlen of name -- confirm in tar sources */
	char		found;		/* NOTE(review): presumably set once the name is matched -- confirm */
	char		name[NAMSIZ+1];	/* the name; one byte more than NAMSIZ */
};

TAR_EXTERN struct name	*namelist;	/* Points to first name in list */
TAR_EXTERN struct name	*namelast;	/* Points to last name in list */

TAR_EXTERN int		archive;	/* File descriptor for archive file */
TAR_EXTERN int		errors;		/* # of files in error */

/*
 *
 * Due to the next struct declaration, each routine that includes
 * "tar.h" must also include <sys/types.h>.  I tried to make it automatic,
 * but System V has no defines in <sys/types.h>, so there is no way of
 * knowing when it has been included.  In addition, it cannot be included
 * twice, but must be included exactly once.  Argghh!
 *
 * Thanks, typedef.  Thanks, USG.
 */
/*
 * One node of the singly linked list headed by linklist.
 * dev_t and ino_t come from <sys/types.h>, which the includer must
 * pull in before this header.
 */
struct link {
	struct link	*next;		/* following node in the list */
	dev_t		dev;
	ino_t		ino;
	short		linkcount;	/* NOTE(review): meaning not visible here -- confirm in tar sources */
	char		name[NAMSIZ+1];	/* one byte more than NAMSIZ */
};

TAR_EXTERN struct link	*linklist;	/* Points to first link in list */


/*
 * Error recovery stuff
 */
TAR_EXTERN char		read_error_flag;


/*
 * Declarations of functions available to the world.
 */
/*LINTLIBRARY*/
union record *findrec();
void userec();
union record *endofrecs();
void anno();
#define	 annorec(stream, msg)	anno(stream, msg, 0)	/* Cur rec */
#define	annofile(stream, msg)	anno(stream, msg, 1)	/* Saved rec */
@@@End of tar.h
echo x - Makefile 1>&2
cat >Makefile <<'@@@End of Makefile'
# Makefile for file(1) cmd. 
# Copyright (c) Ian F. Darwin 86/09/01 - see LEGAL.NOTICE.
# @(#)$Header: Makefile,v 1.17 88/01/15 13:03:16 ian Exp $
#

SHELL	= /bin/sh
MAGIC	= /etc/magic
DEFS	= -DMAGIC='"$(MAGIC)"' # -Dvoid=int
COPTS	= -O # -g
CFLAGS	= $(COPTS) $(DEFS)
SHAR	= bundle
OFILE	= /bin/file.orig	# old or distributed version, for comparison
# Where new binary lives; typically /usr/local (BSD), /usr/lbin (USG).
BINDIR	= /usr/local
# For installing our man pages; 
# MANCxxx is manual section for Commands, MANFxxx is section for file formats.
# MANxDIR is directory names; MANxEXT is the filename extension. Usual values:
# Variable	V7		4BSD		Sys V
# MANCDIR 	/usr/man/man1	/usr/man/man1	/usr/man/u_man/man1
# MANFDIR 	/usr/man/man5	/usr/man/man5	/usr/man/u_man/man4
# MANCEXT	1		1		1
# MANFEXT	5		5		4
# --- possible alternative for 4BSD ---
# MANCDIR			/usr/man/manl
# MANCEXT			l
# --- possible alternative for USG ---
# MANCDIR			/usr/man/local/man1
# MANCEXT			1

MANCDIR	= /usr/man/manl
MANFDIR	= /usr/man/man5
MANCEXT	= l
MANFEXT	= 5

# There are no system-dependent configuration options (except maybe CFLAGS).
# Delete any of LOCALSRCS and LOCALOBJS that are in your C library.
LOCALSRCS = getopt.c strtol.c strtok.c strchr.c
SRCS = file.c apprentice.c fsmagic.c softmagic.c ascmagic.c is_tar.c \
	print.c $(LOCALSRCS)
LOCALOBJS = getopt.o strtol.o strtok.o strchr.o
OBJS = file.o apprentice.o fsmagic.o softmagic.o ascmagic.o is_tar.o \
	print.o $(LOCALOBJS)

ALLSRC = LEGAL.NOTICE README PORTING $(SRCS) *.h \
	Makefile file.1 magic.4 magdir/[a-z]* tst/Makefile

all:		file magic

try:		all $(OFILE)
		cd tst; make
		time $(OFILE) -m ./magic * tst/* >/tmp/t1
		time ./file -m ./magic * tst/* >/tmp/t2
		-diff -b /tmp/t[12]
		what ./file >lastnocore

file:		$(OBJS)
		cc $(CFLAGS) $(OBJS) -o $@
lint:		$(SRCS)
		lint -ha $(DEFS) $(SRCS) | tee $@
magic:		magdir
#		exclude RCS or SCCS dirs:
		cat magdir/[a-z]* >$@

ascmagic.o:	names.h

apprentice.o ascmagic.o file.o fsmagic.o print.o softmagic.o: file.h

# Install the binary, the assembled magic file and both manual pages.
# Every directory the recipe copies into is listed as a prerequisite
# (the second manual directory was previously $(MANCDIR) repeated, so
# $(MANFDIR) was never checked).
install:	file magic file.1 magic.4 $(BINDIR) $(MANCDIR) $(MANFDIR)
		cp file		$(BINDIR)/file
		cp magic	$(MAGIC)
		cp file.1	$(MANCDIR)/file.$(MANCEXT)
		cp magic.4	$(MANFDIR)/magic.$(MANFEXT)

# Remove build products.  The "lint" target tees its output into a file
# named after itself, so that file is removed here too (lint.out is kept
# in the list for trees that still have one).
clean:
		rm -f *.o file magic lint lint.out
		(cd tst; make clean)

dist:		$(ALLSRC)
#		Some versions of shar can't handle a single file from
#		a subdirectory, so we manually insert mkdir as needed.
#		Put the extra "mkdir" AFTER the ": to unbundle..." line.
		$(SHAR) $(ALLSRC) | sed -e '1a\
		mkdir magdir tst' >$@

@@@End of Makefile
echo x - file.1 1>&2
cat >file.1 <<'@@@End of file.1'
..TH FILE 1 "Copyright but distributable"
..SH NAME
..I file
\- determine file type
..SH SYNOPSIS
..B file
[
..B -c
]
[
..B -f
namefile ]
[
..B -m 
magicfile ]
file ...
..SH DESCRIPTION
..I File
tests each argument in an attempt to classify it.
There are three sets of tests, performed in this order:
filesystem tests, magic number tests, and language tests.
The
..I first
test that succeeds causes the file type to be printed.
..PP
The type printed will usually contain one of the words
..B text
(the file contains only ASCII characters and is 
probably safe to read on an ASCII terminal),
..B executable
(the file contains the result of compiling a program
in a form understandable to some \s-1UNIX\s0 kernel or another),
or
..B data
meaning anything else (data is usually `binary' or non-printable).
Exceptions are well-known file formats (core files, tar archives)
that are known to contain binary data.
When modifying the file
..I /etc/magic
or the program itself, 
..B "preserve these keywords" .
People depend on knowing that all the readable files in a directory
have the word ``text'' printed.
Don't do as one computer vendor did \- change ``shell commands text''
to ``shell script''.
..PP
The filesystem tests are based on examining the return from a
..I stat (2)
system call.
The program checks to see if the file is empty,
or if it's some sort of special file.
Any known file types appropriate to the system you are running on
(sockets and symbolic links on 4.2BSD, named pipes (FIFOs) on System V)
are intuited if they are defined in
the system header file
..I sys/stat.h  .
..PP
The magic number tests are used to check for files with data in
particular fixed formats.
The canonical example of this is a binary executable (compiled program)
..I a.out
file, whose format is defined in 
..I a.out.h
and possibly
..I exec.h
in the standard include directory.
These files have a `magic number' stored in a particular place
near the beginning of the file that tells the \s-1UNIX\s0 operating system
that the file is a binary executable, and which of several types thereof.
The concept of `magic number' has been applied by extension to data files.
Any file with some invariant identifier at a small fixed
offset into the file can usually be described in this way.
The information in these files is read from the magic file
..I /etc/magic .
..PP
If an argument appears to be an
..SM ASCII 
file,
..I file
attempts to guess its language.
The language tests look for particular strings (cf \fInames.h\fP)
that can appear anywhere in the first few blocks of a file.
For example, the keyword
..I .br
indicates that the file is most likely a troff input file,
just as the keyword 
..I struct
indicates a C program.
These tests are less reliable than the previous
two groups, so they are performed last.
The language test routines also test for some miscellany
(such as 
..I tar
archives) and determine whether an unknown file should be
labelled as `ascii text' or `data'. 
..PP
Use
..B -m
..I file
to specify an alternate file of magic numbers.
..PP
The
..B -c
option causes a checking printout of the parsed form of the magic file.
This is usually used in conjunction with 
..B -m
to debug a new magic file before installing it.
..PP
The 
..B -f
..I namefile
option specifies that the names of the files to be examined
are to be read (one per line) from 
..I namefile
before the argument list.
Either 
..I namefile
or at least one filename argument must be present;
to test the standard input, use ``-'' as a filename argument.
..SH FILES
..I /etc/magic
\- default list of magic numbers
..SH SEE ALSO
..IR Magic (FILES)
\- description of magic file format.
..br
..IR Strings (1), " od" (1)
\- tools for examining non-textfiles.
..SH STANDARDS CONFORMANCE
This program is believed to exceed the System V Interface Definition
of FILE(CMD), as near as one can determine from the vague language
contained therein. 
Its behaviour is mostly compatible with the System V program of the same name.
This version knows more magic, however, so it will produce
different (albeit more accurate) output in many cases. 
..PP
The one significant difference 
between this version and System V
is that this version treats any white space
as a delimiter, so that spaces in pattern strings must be escaped.
For example,
..br
>10	string	language impress\ 	(imPRESS data)
..br
in an existing magic file would have to be changed to
..br
>10	string	language\e impress	(imPRESS data)
..PP
The Sun Microsystems implementation of System V compatibility
includes a file(1) command that has some extensions.
My version differs from Sun's only in minor ways.
The significant one is the `&' operator, which Sun's program expects as,
for example,
..br
>16	long&0x7fffffff	>0		not stripped
..br
would be entered in my version as
..br
>16	long	&0x7fffffff	not stripped
..br
which is a little less general; it simply tests (location 16)&0x7fffffff
and returns its truth value as a C expression.
..SH MAGIC DIRECTORY
The magic file entries have been collected from various sources,
mainly USENET, and contributed by various authors.
Ian Darwin (address below) will collect additional
or corrected magic file entries.
A consolidation of magic file entries 
will be distributed periodically.
..PP
The order of entries in the magic file is significant.
Depending on what system you are using, the order that
they are put together may be incorrect.
If your old
..I file
command uses a magic file,
keep the old magic file around for comparison purposes
(rename it to 
..IR /etc/magic.orig ).
..SH HISTORY
There has been a 
..I file
command in every UNIX since at least Research Version 6
(man page dated January, 1975).
The System V version introduced one significant major change:
the external list of magic number types.
This slowed the program down slightly but made it a lot more flexible.
..PP
This program, based on the System V version,
was written by Ian Darwin without looking at anybody else's source code.
..PP
John Gilmore revised the code extensively, making it better than
the first version.
Geoff Collyer found several inadequacies
and provided some magic file entries.
The program has undergone continued evolution since.
..SH NOTICE
Copyright (c) Ian F. Darwin,  1986 and 1987.
Written by Ian F. Darwin, UUCP address {utzoo | ihnp4}!darwin!ian,
Internet address ian at sq.com,
postal address: P.O. Box 603, Station F, Toronto, Ontario, CANADA M4Y 2L8.
..PP
..I Strtok.c
and
..I getopt.c
written by and copyright by Henry Spencer, utzoo!henry.
..PP
This software is not subject to any license of the American Telephone
and Telegraph Company or of the Regents of the University of California.
..PP
Permission is granted to anyone to use this software for any purpose on
any computer system, and to alter it and redistribute it freely, subject
to the following restrictions:
..PP 
1. The author is not responsible for the consequences of use of this
software, no matter how awful, even if they arise from flaws in it.
..PP
2. The origin of this software must not be misrepresented, either by
explicit claim or by omission.  Since few users ever read sources,
credits must appear in the documentation.
..PP
3. Altered versions must be plainly marked as such, and must not be
misrepresented as being the original software.  Since few users
ever read sources, credits must appear in the documentation.
..PP
4. This notice may not be removed or altered.
..PP
A few support files (\fIgetopt\fP, \fIstrtok\fP)
distributed with this package
are by Henry Spencer and are subject to the same terms as above.
..PP
A few simple support files (\fIstrtol\fP, \fIstrchr\fP)
distributed with this package
are in the public domain; they are so marked.
..PP
The files
..I tar.h
and
..I is_tar.c
were written by John Gilmore from his public-domain
..I tar
program, and are not covered by the above restrictions.
..SH BUGS
There must be a way to automate the construction of the Magic
file from all the glop in magdir. What is it?
..PP
..I File
uses several algorithms that favor speed over accuracy,
thus it can be misled about the contents of ASCII files.
..PP
The support for ASCII files (primarily for programming languages)
is simplistic, inefficient and requires recompilation to update.
..PP
Should there be an ``else'' clause to follow a series of continuation lines?
..PP
Is it worthwhile to implement recursive file inspection,
so that compressed files, uuencoded, etc., can say ``compressed
ascii text'' or ``compressed executable'' or ``compressed tar archive''
or whatever? 
..PP
The magic file and keywords should have regular expression support.
..PP
It might be advisable to allow upper-case letters in keywords
for e.g., troff commands vs man page macros.
Regular expression support would make this easy.
..PP
The program doesn't grok \s-2FORTRAN\s0.
It should be able to figure \s-2FORTRAN\s0 by seeing some keywords which 
appear indented at the start of line.
Regular expression support would make this easy.
..PP
The list of keywords in 
..I ascmagic
probably belongs in the Magic file.
This could be done by using some keyword like `*' for the offset value.
..PP
The program should malloc the magic file structures,
rather than using a fixed-size array as at present.
..PP
The magic file should be compiled into binary 
(or better yet, fixed-length ASCII strings 
for use in heterogenous network environments) for faster startup.
Then the program would run as fast as the Version 7 program of the same name,
with the flexibility of the System V version.
But then there would have to be yet another magic number for the 
..I magic.out
file.
..PP
Another optimisation would be to sort
the magic file so that we can just run down all the
tests for the first byte, first word, first long, etc, once we
have fetched it.  Complain about conflicts in the magic file entries.
Make a rule that the magic entries sort based on file offset rather
than position within the magic file?
..PP
The program should provide a way to give an estimate 
of ``how good'' a guess is.
We end up removing guesses (e.g. ``From '' as first 5 chars of file) because
they are not as good as other guesses (e.g. ``Newsgroups:'' versus
``Return-Path:'').  Still, if the others don't pan out, it should be
possible to use the first guess.  
..PP
Perhaps the program should automatically try all tests with
byte-swapping done, to avoid having to figure out the byte-swapped values
when constructing the magic file.
Of course this will run more slowly, so it should probably be
an option (-a?).
..PP
This manual page, and particularly this section, is too long.
@@@End of file.1
echo x - magic.4 1>&2
cat >magic.4 <<'@@@End of magic.4'
..TH MAGIC FILES "Public Domain"
..\" install as magic.4 on USG, magic.5 on V7 or Berkeley systems.
..SH NAME
magic \- file command's magic number file
..SH DESCRIPTION
The
..IR file (1)
command identifies the type of a file using,
among other tests,
a test for whether the file begins with a certain
..IR "magic number" .
The file
..B /etc/magic
specifies what magic numbers are to be tested for,
what message to print if a particular magic number is found,
and additional information to extract from the file.
..PP
Each line of the file specifies a test to be performed.
A test compares the data starting at a particular offset
in the file with a 1-byte, 2-byte, or 4-byte numeric value or
a string.  If the test succeeds, a message is printed.
The line consists of the following fields:
..IP offset \w'message'u+2n
A number specifying the offset, in bytes, into the file of the data
which is to be tested.
..IP type
The type of the data to be tested.  The possible values are:
..RS
..IP byte \w'message'u+2n
A one-byte value.
..IP short
A two-byte value (on most systems).
..IP long
A four-byte value (on most systems).
..IP string
A string of bytes.
..RE
..IP test
The value to be compared with the value from the file.  If the type is
numeric, this value
is specified in C form; if it is a string, it is specified as a C string
with the usual escapes permitted (e.g. \en for new-line).
..IP
Numeric values
may be preceded by a character indicating the operation to be performed.
It may be
..BR = ,
to specify that the value from the file must equal the specified value,
..BR < ,
to specify that the value from the file must be less than the specified
value,
..BR > ,
to specify that the value from the file must be greater than the specified
value,
or
..BR & ,
to specify that the value is to be AND'ed with the
numeric value before any comparisons are done.
Numeric values are specified in C form; e.g.
..B 13
is decimal,
..B 013
is octal, and
..B 0x13
is hexadecimal.
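..\" NOTE(review): an orphaned clause, "to specify that any value will
..\" match.", stood here; the operator it described is not listed above.
If the operator character
is omitted, it is assumed to be
..BR = .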
..IP
For string values, the byte string from the
file must match the specified byte string. 
The operators =, < and > (but not &) can be applied to strings.
The length used for matching is that of the string argument
in the magic file.
..IP message
The message to be printed if the comparison succeeds.  If the string
contains a
..IR printf (3S)
format specification, the value from the file (with any specified masking
performed) is printed using the message as the format string.
..PP
Some file formats contain additional information which is to be printed
along with the file type.  A line which begins with the character
..B >
indicates additional tests and messages to be printed.  If the test on the
line preceding the first line with a
..B >
succeeds, the tests specified in all the subsequent lines beginning with
..B >
are performed, and the messages printed if the tests succeed.  The next
line which does not begin with a
..B >
terminates this.
..SH BUGS
The formats 
..I long
and
..I short
are system-dependent; perhaps they should be specified as a number
of bytes (2B, 4B, etc), 
since the files being recognized typically come from
a system on which the lengths are invariant.
..PP
There should be more than one level of subtests,
with the level possibly indicated by
the number of
..B >
at the beginning of the line.
..SH SEE ALSO
..IR file (1)
\- the command that reads this file.
..\"
..\" From: guy at sun.uucp (Guy Harris)
..\" Newsgroups: net.bugs.usg
..\" Subject: /etc/magic's format isn't well documented
..\" Message-ID: <2752 at sun.uucp>
..\" Date: 3 Sep 85 08:19:07 GMT
..\" Organization: Sun Microsystems, Inc.
..\" Lines: 136
..\" 
..\" Here's a manual page for the format accepted by the "file" made by adding
..\" the changes I posted to the S5R2 version.
..\"
..\" Modified for Ian Darwin's version of the file command.
..\" @(#)$Header: magic.4,v 1.5 87/11/06 20:54:31 ian Exp $
@@@End of magic.4
echo x - magdir/aa 1>&2
cat >magdir/aa <<'@@@End of magdir/aa'
#! file
# Magic data for file(1) command.
# Machine-generated from src/cmd/file/magdir/*; edit there only!
# Format is described in magic(files), where:
# files is 5 on V7 and BSD, 4 on SV, and ?? in the SVID.
@@@End of magdir/aa
echo x - magdir/aalocal 1>&2
cat >magdir/aalocal <<'@@@End of magdir/aalocal'
#	Add any locally-observed files here.  Remember:
#	text if readable, executable if runnable binary, data if unreadable.
22	short	023000		core dump data
@@@End of magdir/aalocal
echo x - magdir/arc 1>&2
cat >magdir/arc <<'@@@End of magdir/arc'
0	byte		26		'arc' archive
>1	byte		0		(empty)
>1	byte		1		(old format)
@@@End of magdir/arc
echo x - magdir/archive 1>&2
cat >magdir/archive <<'@@@End of magdir/archive'
0	short		070707		cpio archive
0	string		070707		ASCII cpio archive
0	long		0177555		very old archive
0	short		0177555		very old PDP-11 archive
0	long		0177545		old archive
0	short		0177545		old PDP-11 archive
0	long		0100554		apl workspace
0	string		=<ar>		archive
0	string		!<arch>		archive
>8	string		__.SYMDEF	random library
0	string		-h-		archive (Software Tools format) text
@@@End of magdir/archive
echo x - magdir/c 1>&2
cat >magdir/c <<'@@@End of magdir/c'
#	this first will upset you if you're a PL/1 shop...
#	in which case rm it; ascmagic will catch real C programs
0	string		/*		c program text
#	check for various C program generators...
#	offsets derived empirically, your offsets may vary!
#	(this obviously belongs in ascmagic.c/names.h!).
53	string		yyprevious	c program text (from lex)
@@@End of magdir/c
echo x - magdir/commands 1>&2
cat >magdir/commands <<'@@@End of magdir/commands'
0	string		#!\ /bin/sh	commands text
0	string		#!/bin/sh	commands text
0	string		#!\ /bin/csh	C shell commands text
0	string		#!/bin/csh	C shell commands text
0	string		#!\ /bin/awk	awk commands text
0	string		#!/bin/awk	awk commands text
0	string		#!\ /		some kinda commands text
0	string		#!/		some kinda commands text
0	string		#!\ 		commands text
>3	string		>\0		 for %s
#	An "antique" kernel is either unmodified early V7,
#	without DMR's 1979 mod for #!, or any kernel
#	derived from a pre-v7 kernel (i.e., System V)
0	string		:\ 		shell archive or commands for antique kernel text
@@@End of magdir/commands
echo x - magdir/compress 1>&2
cat >magdir/compress <<'@@@End of magdir/compress'
0	short		017037		packed data
# CPL 	- added pack to /etc/magic
0	short		017436		packed data
0	short		0145405		huf output

0	string		\037\235	compressed data
# non block compressed
>2	byte		12		- with 12 bits
>2	byte		13		- with 13 bits
>2	byte		14		- with 14 bits
>2	byte		15		- with 15 bits
>2	byte		16		- with 16 bits
# block compressed
>2	byte		140		- with 12 bits
>2	byte		141		- with 13 bits
>2	byte		142		- with 14 bits
>2	byte		143		- with 15 bits
>2	byte		144		- with 16 bits
@@@End of magdir/compress
echo x - magdir/convex 1>&2
cat >magdir/convex <<'@@@End of magdir/convex'
0	long		0513		Convex executable
@@@End of magdir/convex
echo x - magdir/diff 1>&2
cat >magdir/diff <<'@@@End of magdir/diff'
#
# magic file lines for output from "diff"...
0	string		diff\ 	'diff' output text
0	string		***\ 		'diff' output text
0	string		Only\ in\ 	'diff' output text
0	string		Common\ subdirectories:\ 	'diff' output text
@@@End of magdir/diff
echo x - magdir/ditroff 1>&2
cat >magdir/ditroff <<'@@@End of magdir/ditroff'
# Magic numbers for ditroff intermediate language
0	string		x\ T\ cat	titroff output for the C/A/T text
0	string		x\ T\ ps	titroff output for PostScript
0	string		x\ T 		titroff output text
@@@End of magdir/ditroff
echo x - magdir/fonts 1>&2
cat >magdir/fonts <<'@@@End of magdir/fonts'
0	string	FONT	ASCII vfont text
0	short	0436	Berkeley vfont data
0	short	017001	byte-swapped Berkeley vfont data
@@@End of magdir/fonts
echo x - magdir/frame 1>&2
cat >magdir/frame <<'@@@End of magdir/frame'
# Magic number for FrameMaker files
# Thanks to Berry Kercheval
#
0	string		\<MakerFile	FrameMaker document
@@@End of magdir/frame
echo x - magdir/imagen 1>&2
cat >magdir/imagen <<'@@@End of magdir/imagen'
# Tell file about magic for IMAGEN printer-ready files:
0	string	@document(		Imagen printer
# this only works if "language xxx" is first item in Imagen header.
>10	string	language\ impress	(imPRESS data)
>10	string	language\ daisy		(daisywheel text)
>10	string	language\ diablo		(daisywheel text)
>10	string	language\ printer	(line printer emulation)
>10	string	language\ tektronix	(Tektronix 4014 emulation)
# Add any other languages that your Imagen uses - remember
# to keep the word `text' if the file is human-readable.
#
# Now magic for IMAGEN font files...
0	string		Rast		RST-format raster font data
# %s prints the string found at offset 45 (presumably the face name).
>45	string		>0		face %s
@@@End of magdir/imagen
echo x - magdir/intel 1>&2
cat >magdir/intel <<'@@@End of magdir/intel'
# various intel-CPU magic numbers
# Version fields are tested as `short', so they are printed with %d
# (a short is passed to printf as an int, not a long).
0	short		01006		80286 executable (STL)
>31	byte		<0x040		small model
>31	byte		=0x048		large model
>31	byte		=0x049		huge model
>16	long		>0		not stripped
0	string		MZ		DOS executable (EXE)
0	string		LZ		DOS executable (built-in)
0	byte		0xe9		DOS executable (COM)
0	byte		0xeb		DOS executable (COM)
0	short		=0512		80286 executable small model (COFF)
>12	long		>0		not stripped
>22	short		>0		- version %d
0	short		=0522		80286 executable large model (COFF)
>12	long		>0		not stripped
>22	short		>0		- version %d
0	short		=0514		80386 executable
>12	long		>0		not stripped
>22	short		>0		- version %d

@@@End of magdir/intel
echo x - magdir/magic 1>&2
cat >magdir/magic <<'@@@End of magdir/magic'
0	string		#magic		magic text file for file(1) cmd
@@@End of magdir/magic
echo x - magdir/mail.news 1>&2
cat >magdir/mail.news <<'@@@End of magdir/mail.news'
# Unfortunately, saved netnews also has From line added in some news software.
#0	string		From 		mail text
# There are tests to ascmagic.c to cope with mail and news.
0	string		Relay-Version: 	old news text
0	string		#!\ rnews	batched news text
0	string		N#!\ rnews	mailed, batched news text
0	string		Forward\ to 	mail forwarding text
0	string		Pipe\ to 	mail piping text
0	string		Return-Path:	smtp mail text
0	string		Path:		news text
0	string		Xref:		news text
0	string		From:		news or mail text
0	string		Article 	saved news text
@@@End of magdir/mail.news
echo x - magdir/mirage 1>&2
cat >magdir/mirage <<'@@@End of magdir/mirage'
0	long	31415		Mirage Assembler m.out executable
@@@End of magdir/mirage
echo x - magdir/misc 1>&2
cat >magdir/misc <<'@@@End of magdir/misc'
0	string		begin 		uuencoded mail text
@@@End of magdir/misc
echo x - magdir/misc2 1>&2
cat >magdir/misc2 <<'@@@End of magdir/misc2'
#	derived empirically, your offsets may vary!
53	string		yyprevious	c program text (from lex)
@@@End of magdir/misc2
echo x - magdir/olda.out 1>&2
cat >magdir/olda.out <<'@@@End of magdir/olda.out'
0	long		0407		executable
>16	long		>0		not stripped
#>2	short		>0		- version %ld
0	short		0407		PDP-11 executable
>8	short		>0		not stripped
0	short		0401		unix-rt ldp
0	short		0405		old overlay
0	long		0410		pure executable
>16	long		>0		not stripped
#>2	short		>0		- version %ld
0	short		0410		PDP-11 pure executable
>8	short		>0		not stripped
#>2	short		>0		- version %ld
0	short		0411		PDP-11 separate I&D executable
>8	short		>0		not stripped
#>2	short		>0		- version %ld
0	long		0413		demand paged pure executable
>16	long		>0		not stripped
#>2	short		>0		- version %ld
0	long		0420		demand paged (first page unmapped) pure executable
>16	long		>0		not stripped
#>2	short		>0		- version %ld
0	short		0437		pdp11 kernel overlay
@@@End of magdir/olda.out
echo x - magdir/postscript 1>&2
cat >magdir/postscript <<'@@@End of magdir/postscript'
#
# Let us not forget PostScript
0	string	%!			PostScript text
>2	string	PS-Adobe-		conforming
>11	string	1.0			at level %s
@@@End of magdir/postscript
echo x - magdir/rasterfile 1>&2
cat >magdir/rasterfile <<'@@@End of magdir/rasterfile'
# Sun rasterfiles
0	string	\x59\xa6\x6a\x95	rasterfile
>4	long	>0		%d
>8	long	>0		x %d
>12	long	>0		x %d
>20	long	0		old format
>20	long	2		compressed
>24	long	1		with color map
@@@End of magdir/rasterfile
echo x - magdir/sccs 1>&2
cat >magdir/sccs <<'@@@End of magdir/sccs'
# SCCS archive structure:
# \001h01207
# \001s 00276/00000/00000
# \001d D 1.1 87/09/23 08:09:20 ian 1 0
# \001c date and time created 87/09/23 08:09:20 by ian
# \001e
# \001u
# \001U
# ... etc.
# Now '\001h' happens to be the same as the 3B20's a.out magic number (0550).
# *Sigh*. And these both came from various parts of the USG.
# Maybe we should just switch everybody from SCCS to RCS!
# Further, you can't just say '\001h0', because the five-digit number
# is a checksum that could (presumably) have any leading digit,
# and we don't have regular expression matching yet. 
# Hence the following official kludge:
8	string		\001s\ 			SCCS archive.
@@@End of magdir/sccs
echo x - magdir/sequent 1>&2
cat >magdir/sequent <<'@@@End of magdir/sequent'
# For Sequent's multiprocessor systems (incomplete).
0	long	000352		BALANCE NS32000 .o
0	long	010352		BALANCE NS32000 executable (0 @ 0)
>16	long	>0		not stripped
0	long	020352		BALANCE NS32000 executable (invalid @ 0)
>16	long	>0		not stripped
0	long	030352		BALANCE NS32000 standalone executable
>16	long	>0		not stripped
# Also need info on Sequent "Symmetry" series...
@@@End of magdir/sequent
echo x - magdir/softquad 1>&2
cat >magdir/softquad <<'@@@End of magdir/softquad'
# SoftQuad troff magic numbers
# SoftQuad @(#)magic	1.2 86/09/15
0	short		0125252		SoftQuad DESC or font file binary
>2	short		>0		- version %d
@@@End of magdir/softquad
echo x - magdir/sun 1>&2
cat >magdir/sun <<'@@@End of magdir/sun'
# Values for Sun MC680x0 binaries
0	short		2		mc68020
>2	short		0407		executable
>2	short		0410		pure executable
>2	short		0413		demand paged executable
>16	long		>0		not stripped
0	short		1		mc68010
>2	short		0407		executable
>2	short		0410		pure executable
>2	short		0413		demand paged executable
>16	long		>0		not stripped
0	short		0		old sun-2
>2	short		0407		executable
>2	short		0410		pure executable
>2	short		0413		demand paged executable
>16	long		>0		not stripped
0	long		0x080456	core file
>128	string		>0		from '%s'
#
0	short		05401		byte-swapped demand paged executable
0	short		010001		byte-swapped demand paged executable
@@@End of magdir/sun
echo x - magdir/tower 1>&2
cat >magdir/tower <<'@@@End of magdir/tower'
# NCR Tower objects, contributed by
# Michael R. Wayne  ***  TMC & Associates  ***  INTERNET: wayne at ford-vax.arpa
# uucp: {philabs | pyramid} !fmsrl7!wayne   OR   wayne at fmsrl7.UUCP
#
# Version fields are tested as `short', so they are printed with %d
# (a short is passed to printf as an int, not a long).
0	short		000610	Tower/XP rel 2 object
>12	   long			>0	not stripped
>20	   short		0407	executable
>20	   short		0410	pure executable
>22	   short		>0	-version %d
0	short		000615	Tower/XP rel 2 object
>12	   long			>0	not stripped
>20	   short		0407	executable
>20	   short		0410	pure executable
>22	   short		>0	-version %d
0	short		000620	Tower/XP rel 3 object
>12	   long			>0	not stripped
>20	   short		0407	executable
>20	   short		0410	pure executable
>22	   short		>0	-version %d
0	short		000625	Tower/XP rel 3 object
>12	   long			>0	not stripped
>20	   short		0407	executable
>20	   short		0410	pure executable
>22	   short		>0	-version %d
0	short		000630	Tower32/600/400 68020 object
>12	   long			>0	not stripped
>20	   short		0407	executable
>20	   short		0410	pure executable
>22	   short		>0	-version %d
0	short		000640	Tower32/800 68020
>18	   short		&020000	w/68881 object
>18	   short		&040000	compatible object
>18	   short		&~060000	object
>20	   short		0407	executable
>20	   short		0413	pure executable
>12	   long			>0	not stripped
>22	   short		>0	-version %d
0	short		000645	Tower32/800 68010
>18	   short		&040000	compatible object
>18	   short		&~060000 object
>20	   short		0407	executable
>20	   short		0413	pure executable
>12	   long			>0	not stripped
>22	   short		>0	-version %d
@@@End of magdir/tower
echo x - magdir/typeset 1>&2
cat >magdir/typeset <<'@@@End of magdir/typeset'
# other typesetting magic
0	string		\100\357	very old (C/A/T) troff output data
0	string		Interpress/Xerox	Xerox InterPress data
@@@End of magdir/typeset
echo x - magdir/varied.out 1>&2
cat >magdir/varied.out <<'@@@End of magdir/varied.out'
#	Herewith many of the object file formats used by USG systems.
#	The `versions' should be un-commented if they work for you.
0	short		0570		SysV executable
>12	long		>0		not stripped
#>22	short		>0		- version %ld
0	short		0575		SysV pure executable
>12	long		>0		not stripped
#>22	short		>0		- version %ld
0	short		0502		basic-16 executable
>12	long		>0		not stripped
0	short		0503		basic-16 executable (TV)
>12	long		>0		not stripped
0	short		0510		x86 executable
>12	long		>0		not stripped
0	short		0511		x86 executable (TV)
>12	long		>0		not stripped
0	short		0550		3b20 executable
>12	long		>0		not stripped
0	short		0551		3b20 executable (TV)
>12	long		>0		not stripped
0	short		0560		WE32000 executable
>12	long		>0		not stripped
0	short		0561		WE32000 executable (TV)
>12	long		>0		not stripped
0	short		0610		Perkin-Elmer executable

@@@End of magdir/varied.out
echo x - magdir/vax.byteswap 1>&2
cat >magdir/vax.byteswap <<'@@@End of magdir/vax.byteswap'
# Byte-swapped VAXen
# From: dupuy at amsterdam.columbia.edu (Alexander Dupuy)
# 
# Here are a few lines you can add to /etc/magic on your sun workstations in
# order to recognize VAX executables and objects.... you could do something
# similar (in reverse) for your vaxen, but since 4.3+NFS' file(1) doesn't look
# for /etc/magic, I've never bothered.  It really should be built in to file(1)
# so you would see the state of setuid/setgid/sticky bits.  Or actually, there
# should be support for checking that sort of thing in /etc/magic.
#
0	long		00700200000	VAX executable
>16	long		&0x7fffffff	not stripped
0	long		01000200000	VAX pure executable
>16	long		&0x7fffffff	not stripped
0	long		01300200000	VAX demand-paged pure executable
>16	long		&0x7fffffff	not stripped
0	long		01100200000	PDP-11 executable
@@@End of magdir/vax.byteswap
echo x - magdir/xenix 1>&2
cat >magdir/xenix <<'@@@End of magdir/xenix'
# XENIX executable formats: derived empirically; treat as folklore until proven
# Lines starting with `>' are sub-tests of the offset-0 entry above them.
0	short	01006		XENIX (x.out) executable
>8	short	1		Middle model
>16	short	>0		not stripped
0	short	02600		XENIX 8086 relocatable or 80286 small model

@@@End of magdir/xenix
echo x - tst/Makefile 1>&2
cat >tst/Makefile <<'@@@End of tst/Makefile'
# Make up some fake test files that are easily produced.
# By no means an exhaustive test!
# @(#) $Header: Makefile,v 1.4 87/11/07 12:46:09 ian Exp $
all:	ar cmd emp i t x
ar:
	echo '<ar> fake fake fake' >$@
	echo 070707 fake fake fake >$@.asc
	echo '!<arch>.__.SYMDEF fake fake fake' >$@.ranlib
	echo - -h- >$@.swt
cmd:
	echo '#! /bin/sh' >$@
	echo '#!/bin/sh' >c.sh2
	echo '#! /bin/csh' >c.csh1
	echo '#!/bin/csh' >c.csh2
	echo '#! /bin/awk' >c.awk1
	echo '#!/bin/awk' >c.awk2
	echo '#! /' >c.misc1
	echo '#!/' >c.misc2
	echo ': ' >c.broken
emp:
	touch $@
i:
	echo '@document(language impress)fake fake' >$@
	echo '@document(language diablo)fake fake' >$@.d
t:
	rm -f $@
	tar cvf $@ *
x:
	echo 'Interpress/Xerox fake fake fake' >$@

clean:
	rm -f [a-z]*
@@@End of tst/Makefile
exit 0
-- 
For comp.sources.unix stuff, mail to sources at uunet.uu.net.



More information about the Comp.sources.unix mailing list