Coherent4.2.10/conf/streams/src/struser.c

Compare this file to the similar file:
Show the results in this format:
#define	_DDI_DKI	1
#define	_DDI_DKI_IMPL	1
#define	_SYSV4		1

/*
 */

#include <common/ccompat.h>
#include <kernel/ddi_lock.h>
#include <kernel/strmlib.h>
#include <sys/confinfo.h>
#include <sys/types.h>
#include <sys/kmem.h>
#include <sys/poll.h>
#include <sys/ksynch.h>
#include <sys/file.h>
#include <sys/uio.h>
#include <sys/cmn_err.h>
#include <sys/errno.h>
#include <sys/signal.h>
#include <sys/fd.h>
#include <sys/cred.h>
#include <stropts.h>
#include <string.h>
#include <poll.h>

/*
 * Forward declaration of SHEAD_DO_CLOSE (), which is local to this module.
 * It has to be declared ahead of its definition because of the mutual
 * recursion between the final close process and the streams unlink code.
 *
 * It takes the stream head, an open-mode value and the caller's credentials
 * and returns an int.
 */

__LOCAL__ int	SHEAD_DO_CLOSE	__PROTO ((shead_t * sheadp, int mode,
					  cred_t * credp));


/*
 * Lock information record naming the STREAMS queue schedule lock. This one
 * has external linkage, so it is available to other modules.
 */

lkinfo_t __stream_schedule_lkinfo = {
	"STREAMS queue schedule", INTERNAL_LOCK
};

/*
 * Lock information record naming the lock for the bufcall ()/esbbcall ()
 * event list. Also has external linkage.
 */

lkinfo_t __stream_event_lkinfo = {
	"STREAMS bufcall ()/esbbcall () event list", INTERNAL_LOCK
};

/*
 * Lock information record for the per-stream-head basic lock; it is passed
 * to LOCK_ALLOC () by SHEAD_INIT () in this file.
 */

__LOCAL__ lkinfo_t _stream_head_lkinfo = {
	"stream head lock", INTERNAL_LOCK
};


/*
 * Allocate a group of queue pairs in one piece of zeroed memory and perform
 * the part of the initialization that is common to every queue. The remaining
 * fields are filled in by the caller, usually from the "streamtab" structure.
 *
 * Since at initial stream open time we will be allocating at least two queue
 * pairs and one or two stream head structures, all of those allocations are
 * satisfied in a single step here.
 *
 *	npairs	number of queue pairs to allocate (must be 1 or 2).
 *	extra	number of additional bytes to allocate after the queues.
 *
 * Returns the first (read) queue of the group, or NULL if the allocation
 * failed. When "extra" is non-zero, the "q_ptr" members of the first pair
 * point at the extra space.
 *
 * Call only from base level. This function may sleep.
 */

#if	__USE_PROTO__
__LOCAL__ queue_t * (QUEUE_ALLOC) (int npairs, size_t extra)
#else
__LOCAL__ queue_t *
QUEUE_ALLOC __ARGS ((npairs, extra))
int		npairs;
size_t		extra;
#endif
{
	queue_t	      *	base;
	queue_t	      *	rq;
	queue_t	      *	wq;
	size_t		queue_bytes;
	int		remaining;

	ASSERT (npairs > 0 && npairs < 3);

	/*
	 * kmem_zalloc () hands back zeroed memory, which takes care of most
	 * of the member initialization for us.
	 */

	queue_bytes = 2 * npairs * sizeof (* base);

	base = (queue_t *) kmem_zalloc (queue_bytes + extra, KM_SLEEP);

	if (base == NULL)
		return NULL;

	rq = base;
	remaining = npairs;

	do {
		wq = rq + 1;

		/*
		 * Read side first. If another pair follows this one, its
		 * read queue is linked back to ours, so that treating "base"
		 * as the stream head the read-side "q_next" links run from
		 * the last pair initialized towards the first.
		 */

		if (remaining > 1)
			(rq + 2)->q_next = rq;

		rq->q_flag = QWANTR | QREADR | QPROCSOFF;

		SFREEZE_INIT (rq);

		/*
		 * Now the write side, whose "q_next" links run in the other
		 * direction, from the "head" towards the device.
		 */

		if (remaining > 1)
			wq->q_next = wq + 2;

		wq->q_flag = QWANTR | QPROCSOFF;

		SFREEZE_INIT (wq);

		rq += 2;
	} while (-- remaining > 0);

	/*
	 * Any "extra" space lives directly after the queues. Both queues of
	 * the first pair get a pointer to it in "q_ptr", since that pair will
	 * almost always be the stream head queue pair.
	 */

	if (extra > 0) {

		base->q_ptr = (char *) base + queue_bytes;
		W (base)->q_ptr = base->q_ptr;
	}

	return base;
}


/*
 * Copy the packet-size limits and flow-control watermarks for a queue from
 * the "module_info" structure reached through the queue's "q_qinfo" member.
 * The caller must have set "q_qinfo" beforehand.
 */

#if	__USE_PROTO__
__LOCAL__ void (QUEUE_INITOPT) (queue_t * q)
#else
__LOCAL__ void
QUEUE_INITOPT __ARGS ((q))
queue_t	      *	q;
#endif
{
	struct module_info
		      *	info;

	info = q->q_qinfo->qi_minfo;

	/* Flow-control watermarks. */

	q->q_hiwat = info->mi_hiwat;
	q->q_lowat = info->mi_lowat;

	/* Packet size limits. */

	q->q_minpsz = info->mi_minpsz;
	q->q_maxpsz = info->mi_maxpsz;
}


/*
 * Set up a queue pair's initial options.
 */

typedef enum {
	QI_NORMAL,
	QI_MUX
} qiflag_t;

#if	__USE_PROTO__
__LOCAL__ void (QUEUE_INIT) (queue_t * q, struct streamtab * stab,
			     qiflag_t mux)
#else
void
QUEUE_INIT __ARGS ((q, stab, mux))
queue_t	      *	q;
struct streamtab
	      *	stab;
qiflag_t	mux;
#endif
{
	q->q_qinfo = mux == QI_NORMAL ? stab->st_rdinit : stab->st_muxrinit;
	QUEUE_INITOPT (q);

	q = W (q);

	q->q_qinfo = mux == QI_NORMAL ? stab->st_wrinit : stab->st_muxwinit;
	QUEUE_INITOPT (q);
}


/*
 * This function is the dual to the QBAND_ALLOC () function; it releases all
 * the "qband" structures chained from the "q_bandp" member of the given
 * queue. The queue's own members are not modified.
 *
 * The caller must have the queue frozen or not linked on any stream.
 */

#if	__USE_PROTO__
void (QBAND_FREE) (queue_t * q)
#else
void
QBAND_FREE __ARGS ((q))
queue_t	      *	q;
#endif
{
	qband_t	      *	run_start;
	qband_t	      *	scan;
	int		run_len;

	QUEUE_TRACE (q, "QBAND_FREE");

	/*
	 * The chain is released one allocation at a time. Allocation
	 * boundaries are found from the QB_FIRST flag in the entries rather
	 * than purely from comparing addresses (the address comparison is
	 * kept as an extra check). In theory an allocator might fail if
	 * adjacent allocations were coalesced into a single free, even
	 * though no such allocator is known.
	 *
	 * The same code works when all the entries live in one vector, so no
	 * conditional compilation is needed.
	 *
	 * Note that the ASSERT () in the QB_FIRST case means the very first
	 * entry on the chain is expected not to carry QB_FIRST.
	 */

	run_start = q->q_bandp;
	run_len = 0;

	scan = run_start;

	while (scan != NULL) {

		if ((scan->qb_flag & QB_FIRST) == 0) {
			/*
			 * Another entry in the current allocation; it must
			 * sit directly after the ones already counted.
			 */

			ASSERT (scan == run_start + run_len);
			run_len ++;
		} else {
			/*
			 * Start of a new allocation, so the one we have been
			 * counting is complete and can be released.
			 */

			ASSERT (run_len > 0);

			kmem_free (run_start,
				   sizeof (* run_start) * run_len);

			run_start = scan;
			run_len = 1;
		}

		scan = scan->qb_next;
	}

	/* Release the final allocation, if there were any entries at all. */

	if (run_len > 0)
		kmem_free (run_start, sizeof (* run_start) * run_len);
}


/*
 * Tear down an individual queue: take it off the schedule, discard queued
 * messages, back-enable if a writer was waiting on it, and release the
 * freeze state and any band structures. The memory for the queue itself is
 * not released here.
 */

#if	__USE_PROTO__
__LOCAL__ void (QUEUE_DESTROY) (queue_t * q)
#else
__LOCAL__ void
QUEUE_DESTROY __ARGS ((q))
queue_t	      *	q;
#endif
{
	mblk_t	      *	msg;
	mblk_t	      *	following;

	QSCHED_UNSCHEDULE (q, str_mem->sm_sched);

	/*
	 * Discard every message still on the queue. The successor has to be
	 * fetched before the message is freed.
	 */

	msg = q->q_first;

	while (msg != NULL) {

		following = msg->b_next;
		freemsg (msg);
		msg = following;
	}

	if ((q->q_flag & QWANTW) != 0)
		QUEUE_BACKENAB (q);

	SFREEZE_DESTROY (q);

	if (q->q_nband > 0)
		QBAND_FREE (q);
}


/*
 * Destroy every queue in a pair or group of pairs and release the memory.
 * The arguments must match those used to allocate the group exactly, since
 * they determine the size passed to kmem_free (). The canonical forms make
 * this easy to get right: modules are always lone queue pairs, regular
 * streams match a driver and stream head (with associated extra data for the
 * stream head), and stream pipes consist of two pairs of queue structures
 * with two head structures.
 *
 *	rq	first queue of the group, as returned by the allocator.
 *	npairs	number of queue pairs in the group (must be 1 or 2).
 *	extra	number of extra bytes that were allocated after the queues.
 */

#if	__USE_PROTO__
__LOCAL__ void (QUEUE_FREE) (queue_t * rq, int npairs, size_t extra)
#else
__LOCAL__ void
QUEUE_FREE __ARGS ((rq, npairs, extra))
queue_t	      *	rq;
int		npairs;
size_t		extra;
#endif
{
	queue_t	      *	victim;
	int		remaining;

	ASSERT (rq != NULL);
	ASSERT (npairs > 0 && npairs < 3);

	/* Each pair contributes two queues to tear down. */

	remaining = npairs * 2;
	victim = rq;

	do {

		QUEUE_DESTROY (victim);
		victim ++;
	} while (-- remaining > 0);

	kmem_free (rq, 2 * sizeof (* rq) * npairs + extra);
}


/*
 * STREAM HEAD MANAGEMENT NOTES:
 *
 * The management of stream head structures introduces some interesting
 * synchronization problems arising from the interaction of the rules
 * associated with driver close routines and the fact that stream head
 * structures are dynamically allocated.
 *
 * The first problem is really one of specification; what does it mean for a
 * driver close () entry point to return EINTR? System V does not talk about
 * this case specifically, so we prohibit it by treating error returns from
 * the close () entry point uniformly by still actually closing the device.
 *
 * The second problem is this; a driver should not be re-opened until the
 * close process has completed. However, completion of the close process will
 * normally involve deallocation of the stream head (where presumably the
 * open routines are waiting).
 *
 * Our problem is that it is not possible to reliably determine whether there
 * are any other contexts waiting on a sleep lock. While it might be possible
 * to do so using SLEEP_LOCKAVAIL (), this would require that the calling
 * context release the lock. Under a plausible implementation of SLEEP_LOCK ()
 * where sleep locks are basically implemented with the sleep () and wakeup ()
 * functions and a "locked" flag, there will be no way for a process to find
 * out whether there are any functions waiting on the lock, since the unlock
 * implementation could simply clear the "locked" flag and issue wakeup (),
 * so that there could be any number of contexts waiting to run and test the
 * "locked" flag, yet SLEEP_LOCKAVAIL () in the calling context would return
 * true. Deallocating the lock at this time would be potentially disastrous.
 *
 * Actually, even a quality implementation of sleep locks (such as is provided
 * with this STREAMS system) cannot be easily used this way, since it is very
 * difficult to ensure that the information returned by SLEEP_LOCKAVAIL () is
 * current.
 *
 * The situation can be resolved by maintaining a count of processes wishing
 * to lock the item. The count can be maintained by using the basic-lock
 * action associated with DDI/DKI synchronization variables, and the new
 * SV_SIGNAL () operation can be used to pass the ownership of the lock to
 * a waiting process reliably.
 *
 * Once this is in place, it becomes clear how the count of waiting processes
 * can be used to simplify the destruction of stream heads; essentially,
 * when a process wishes to release the "lock" on the stream head, if both
 * the open count and the waiting count are 0, then the memory for the item
 * can be safely released. Otherwise, control simply passes to the next
 * waiting process.
 *
 * So, if a process is performing a final close on a stream, and some open
 * requests are queued, the close will leave the stream's memory alone and
 * simply pass it on to the waiting open (which can detect that the stream
 * needs to be treated as new since the open count is 0). If the waiting open
 * was interrupted by a signal, it would still have to decrement the count of
 * waiting processes before it releases its lock, at which time it would know
 * to remove the item from the directory and release the memory.
 *
 * For this to work properly takes some coordination in the policy for the
 * directory; the basic lock used to guard lock operations should be held
 * during searches of the directory to ensure that the count value is correct
 * with respect to all processes; a process that has a pointer to the stream
 * head (obtained from the directory) which it has not incremented the count
 * for is an error. Note that this only applies to operations which might
 * later affect the count, of course.
 *
 *
 * STREAM OPERATIONS AND LOCKS:
 *
 * open ()	This operation cannot begin while there is a final close in
 *		progress. If this operation increments the "open count" before
 *		calling the device open routines, it is possible that it will
 *		also have to perform final close duties if a driver or module
 *		fails the open. This function may cause the creation of a new
 *		queue pair and directory entry.
 *
 *		The multiprocessor DDI/DKI also mandates that a particular
 *		device number's open () routine only have one instance active
 *		at any given time.
 *
 * close ()	Normally, this does not require extended locking, but the case
 *		of beginning a final close is special, since only then will
 *		the queue drain and final close procedures begin. Since there
 *		cannot be outstanding ioctl ()s during final close, the timer
 *		code used to control ioctl ()s can be shared with this for
 *		timing out while waiting for a write queue to drain.
 *
 * read (), getmsg (), getpmsg ()
 *		Under normal circumstances, these functions require no special
 *		treatment. It would be desirable to support an extension to
 *		STREAMS which supported "safe" multiple readers, where the
 *		serialization of reads is guaranteed.
 *
 * write (), putmsg (), putpmsg ()
 *		These functions require little special treatment. It would be
 *		desirable to support an extension to STREAMS which guaranteed
 *		serialization of writes, for instance to guarantee unlimited-
 *		length atomic pipe writes.
 *
 * ioctl ()	Depending on the details of the operation, we may need
 *
 *			a read lock on the stream head (eg. I_GETCLTIME).
 *			a write lock on the stream head (eg. I_SRDOPT).
 *			a block on open and pop (I_PUSH).
 *			a block on close and push (I_POP).
 *			a long-term lock on the message queue.
 *
 *		The long-term lock operations revolve around the operations
 *		that send messages downstream : I_LINK, I_UNLINK, I_PLINK,
 *		I_PUNLINK, and I_STR. These operations are also special in
 *		that they are capable of timing out.
 *
 *		Since the close or open routines invoked by an I_PUSH or I_POP
 *		operation may block, they require analogous locking to the
 *		open () and close () cases.
 *
 * LOCK SUMMARY:
 *	EXCLUSIVE LONG-TERM LOCK WITH OPTIONAL TIMEOUT:
 *	    open/close category:
 *		open (), close (), I_PUSH, I_POP
 *	    ioctl category:
 *		I_LINK, I_UNLINK, I_PLINK, I_PUNLINK, I_STR
 *
 *	    In theory, a single lock will do. However, once we take into
 *	    account terminal behaviour w.r.t. CLOCAL and other similar
 *	    situations, it seems that creating the subcategories above will
 *	    suit us better.
 *
 *	    Final close is a special case that blocks all other cases, which
 *	    can be distinguished fairly clearly.
 *
 *	STREAM HEAD WRITE LOCK:
 *		I_SRDOPT, I_SETSIG, I_SWROPT, I_SETCLTIME
 *		Certain stream head message processing routines may also write
 *		lock the stream head, such as M_SETOPT processing.
 *
 *	All other streams operations should acquire a stream head read lock
 *	before reading stream head variables.
 */
/*
 * STREAM HEAD WAIT NOTES:
 *
 * In addition to the above discussion about locking, there are other
 * operations that may cause a process to block while at the stream head. For
 * instance, read (), write (), I_RECVFD, and I_STR operations may cause the
 * outer context to block until some kind of message arrives.
 *
 * The question we are immediately faced with is what level of specificity to
 * provide in the arrangement of synchronization variables and basic locks.
 * Until the implementation is complete and we can perform detailed
 * measurements with a variety of (pathological) loads on a variety of
 * systems, we really don't know. For simplicity, the current system performs
 * all stream head blocking by sleeping on the "sh_wait_sv" synchronization
 * variable that is also used by the above locking operations.
 *
 * However, to give some isolation from changes in this scheme, we mandate a
 * generic layer to deal with this. Not only does this insulate operations
 * from the details of synchronization, but it allows us to perform some
 * simple optimizations that may allow this simple scheme to perform better.
 */


/*
 * Initialize a stream head structure, assuming that memory was allocated with
 * kmem_zalloc () and so NULL pointers and 0-value fields need not be filled
 * in (this assumption is checked with assertions).
 *
 *	sheadp	the stream head to initialize.
 *	stabp	the "streamtab" entry for the stream, stored in "sh_tab".
 *	dev	the device number, stored in "sh_dev".
 *	rq	the stream head read queue, whose "q_ptr" must already point
 *		at "sheadp".
 *
 * This function may sleep waiting for memory to become available to allocate
 * the locks needed by the stream head.
 */

#if	__USE_PROTO__
__LOCAL__ void (SHEAD_INIT) (shead_t * sheadp, struct streamtab * stabp,
			     n_dev_t dev, queue_t * rq)
#else
__LOCAL__ void
SHEAD_INIT __ARGS ((sheadp, stabp, dev, rq))
shead_t	      *	sheadp;
struct streamtab
	      *	stabp;
n_dev_t		dev;
queue_t	      *	rq;
#endif
{
	ASSERT (sheadp != NULL);
	ASSERT (rq != NULL);

	/*
	 * The sh_lock_count and sh_time_count members are initialized in the
	 * lock code.
	 */

	ASSERT (sheadp->sh_open_count == 0);
	ASSERT (sheadp->sh_attach_count == 0);
	ASSERT (sheadp->sh_lock_count == 0);
	ASSERT (sheadp->sh_time_count == 0);
	ASSERT (sheadp->sh_rerrcode == 0);
	ASSERT (sheadp->sh_werrcode == 0);
	ASSERT (sheadp->sh_wroff == 0);

	ASSERT (sheadp->sh_read_bufcall == 0);
	ASSERT (sheadp->sh_timeout_id == 0);

	ASSERT (sheadp->sh_sigs == NULL);
	ASSERT (sheadp->sh_linked == NULL);
	ASSERT (sheadp->sh_ioc_msg == NULL);

	ASSERT (rq->q_ptr == sheadp);

	sheadp->sh_dev = dev;
	sheadp->sh_tab = stabp;
	sheadp->sh_head = rq;

	sheadp->sh_pollhead = phalloc (KM_SLEEP);
	sheadp->sh_flags = SH_MASTER;
	sheadp->sh_readopt = RNORM | RPROTNORM;

	sheadp->sh_basic_lockp = LOCK_ALLOC (stream_head_hierarchy, plstr,
					     & _stream_head_lkinfo, KM_SLEEP);
	sheadp->sh_wait_sv = SV_ALLOC (KM_SLEEP);

	/*
	 * Both the basic lock and the synchronization variable are required
	 * by the rest of the stream head code, so both allocations must have
	 * succeeded, not merely one of them.
	 */

	ASSERT (sheadp->sh_basic_lockp != NULL && sheadp->sh_wait_sv != NULL);


	/*
	 * The default time to wait for a queue to drain while closing is 15s.
	 */

	sheadp->sh_cltime = drv_usectohz (15000000L);
}


/*
 * Turn a stream head structure back into raw bits: cancel any outstanding
 * read bufcall and release the basic lock, the synchronization variable and
 * the poll head. The stream head must be completely idle; the memory for the
 * structure itself is not released here.
 */

#if	__USE_PROTO__
__LOCAL__ void (SHEAD_DESTROY) (shead_t * sheadp)
#else
__LOCAL__ void
SHEAD_DESTROY __ARGS((sheadp))
shead_t	      *	sheadp;
#endif
{
	ASSERT (sheadp != NULL);

	/* Nothing may still be attached to the stream head... */

	ASSERT (sheadp->sh_sigs == NULL);
	ASSERT (sheadp->sh_linked == NULL);

	/* ... no one may be using or waiting for it... */

	ASSERT (sheadp->sh_open_count == 0);
	ASSERT (sheadp->sh_attach_count == 0);
	ASSERT (sheadp->sh_lock_count == 0);

	/* ... and no timeout may be pending. */

	ASSERT (sheadp->sh_timeout_id == 0);

	if (sheadp->sh_read_bufcall == 0) {
		/* No bufcall outstanding, nothing to cancel. */
	} else
		unbufcall (sheadp->sh_read_bufcall);

	LOCK_DEALLOC (sheadp->sh_basic_lockp);
	SV_DEALLOC (sheadp->sh_wait_sv);

	phfree (sheadp->sh_pollhead);
}


/*
 * Work out which of the global stream head lists a stream head belongs on:
 * PIPE_SLIST if SHEAD_IS_PIPE () says it is a pipe, DEV_SLIST otherwise.
 */

#if	__USE_PROTO__
__LOCAL__ slist_id_t (SHEAD_ID) (shead_t * sheadp)
#else
__LOCAL__ slist_id_t
SHEAD_ID __ARGS ((sheadp))
shead_t	      *	sheadp;
#endif
{
	ASSERT (sheadp != NULL);

	return SHEAD_IS_PIPE (sheadp) ? PIPE_SLIST : DEV_SLIST;
}


/*
 * Find a stream head on the list "id" purely by its device number. No
 * reference is taken on the entry, so the caller must have good reason to
 * believe that an open reference to the stream exists and is not going to go
 * away.
 *
 * A NULL return is a serious error (and is reported with a console warning),
 * because it means the caller did not have the knowledge of the system state
 * that it claimed to have.
 */

#if	__USE_PROTO__
shead_t * (SHEAD_FIND) (n_dev_t dev, slist_id_t id)
#else
shead_t *
SHEAD_FIND __ARGS ((dev, id))
n_dev_t		dev;
slist_id_t	id;
#endif
{
	shead_t	      *	found;
	pl_t		saved_pl;

	/* The list may only be walked with the directory lock held. */

	saved_pl = RW_RDLOCK (str_mem->sm_head_lock, plstr);

	found = str_mem->sm_streams [id];

	while (found != NULL && found->sh_dev != dev)
		found = found->sh_next;

	RW_UNLOCK (str_mem->sm_head_lock, saved_pl);

	if (found != NULL) {
		ASSERT (SHEAD_ID (found) == id);
	} else
		cmn_err (CE_WARN, "Unable to locate stream in SHEAD_FIND ()");

	return found;
}


/*
 * Locate an existing stream head by device number on the list "id" and
 * increment its lock count while the directory is still read-locked.
 *
 * Returns the stream head, or NULL if there is no entry for the device. When
 * an entry is returned, the SHEAD_LOCK () taken here has not been released
 * by this function.
 */

#if	__USE_PROTO__
__LOCAL__ shead_t * (SHEAD_FIND_AND_LOCK) (n_dev_t dev, slist_id_t id)
#else
__LOCAL__ shead_t *
SHEAD_FIND_AND_LOCK __ARGS ((dev, id))
n_dev_t		dev;
slist_id_t	id;
#endif
{
	shead_t	      *	found;
	pl_t		saved_pl;

	saved_pl = RW_RDLOCK (str_mem->sm_head_lock, plstr);

	found = str_mem->sm_streams [id];

	while (found != NULL && found->sh_dev != dev)
		found = found->sh_next;

	if (found != NULL) {
		/*
		 * We have the entry we want, so bump its reference count.
		 * The entry cannot disappear underneath us because we still
		 * hold a read lock on the list that contains it.
		 */

		(void) SHEAD_LOCK (found);

		found->sh_lock_count ++;
	}

	RW_UNLOCK (str_mem->sm_head_lock, saved_pl);
	return found;
}


/*
 * Put a stream head at the front of the appropriate global list. The
 * operation fails if the list already holds an entry with the same device
 * number.
 *
 * The stream head should be locked against further opens at this point.
 *
 * The return value is 0 on success, -1 on error.
 */

#if	__USE_PROTO__
__LOCAL__ int (SHEAD_ADD) (shead_t * sheadp)
#else
__LOCAL__ int
SHEAD_ADD __ARGS ((sheadp))
shead_t	      *	sheadp;
#endif
{
	pl_t		saved_pl;
	shead_t	      *	clash;
	slist_id_t	id;
	int		status;

	id = SHEAD_ID (sheadp);

	ASSERT (sheadp->sh_ref_count == 1);
	ASSERT ((sheadp->sh_lock_mask & SH_OPENCLOSE) != 0);

	saved_pl = RW_WRLOCK (str_mem->sm_head_lock, plstr);

	/* Look for an existing entry that uses the same device number. */

	clash = str_mem->sm_streams [id];

	while (clash != NULL && clash->sh_dev != sheadp->sh_dev)
		clash = clash->sh_next;

	if (clash == NULL) {
		/* No conflict, so push the new entry on the front. */

		sheadp->sh_next = str_mem->sm_streams [id];
		str_mem->sm_streams [id] = sheadp;

		status = 0;
	} else
		status = -1;

	RW_UNLOCK (str_mem->sm_head_lock, saved_pl);
	return status;
}


/*
 * Change the device number of a stream head, for a clone open situation.
 * The "sh_dev" member has to be changed with the stream head list lock held
 * for writing, to avoid confusing anyone who is looking for the original
 * device number.
 *
 * The rename fails if the new number is already in use by an entry on the
 * same list. The return value is -1 on error, or 0 on success. If the stream
 * head being renamed was not seen on the list a warning is issued, but the
 * rename is still performed.
 */

#if	__USE_PROTO__
__LOCAL__ int (SHEAD_RENAME) (shead_t * sheadp, n_dev_t dev)
#else
__LOCAL__ int
SHEAD_RENAME __ARGS ((sheadp, dev))
shead_t	      *	sheadp;
n_dev_t		dev;
#endif
{
	pl_t		saved_pl;
	shead_t	      *	scan;
	int		listed = 0;	/* set once we see "sheadp" itself */
	slist_id_t	id;

	id = SHEAD_ID (sheadp);

	saved_pl = RW_WRLOCK (str_mem->sm_head_lock, plstr);

	/*
	 * Walk the list until we either run off the end or hit an entry
	 * that already owns the requested device number.
	 */

	scan = str_mem->sm_streams [id];

	while (scan != NULL) {

		if (scan->sh_dev == dev)
			break;

		if (scan == sheadp)
			listed = 1;

		scan = scan->sh_next;
	}

	if (scan != NULL) {
		/*
		 * The new number is taken; give up without touching the
		 * stream head.
		 */

		RW_UNLOCK (str_mem->sm_head_lock, saved_pl);
		return -1;
	}

	/* The number is free, so the stream head can take it over. */

	sheadp->sh_dev = dev;
	RW_UNLOCK (str_mem->sm_head_lock, saved_pl);

	if (! listed)
		cmn_err (CE_WARN, "SHEAD_RENAME () of unlisted stream head");

	return 0;
}


/*
 * This function decrements the lock count of the stream head; this may cause
 * the stream head to become unreferenced, in which case it is removed from
 * the global directory and its memory is reclaimed.
 *
 * The caller must hold the stream head lock, must pass the master end of a
 * stream pipe, and must own one of the references counted in
 * "sh_lock_count". The stream head lock is released before returning.
 */

#if	__USE_PROTO__
__LOCAL__ void (SHEAD_UNREFERENCE) (shead_t * sheadp)
#else
__LOCAL__ void
SHEAD_UNREFERENCE __ARGS ((sheadp))
shead_t	      *	sheadp;
#endif
{
	slist_id_t	id;
	int		unlink;

	SHEAD_ASSERT_LOCKED (sheadp);
	ASSERT (sheadp == SHEAD_MASTER (sheadp));
	ASSERT (sheadp->sh_lock_count > 0);

	id = SHEAD_ID (sheadp);

	/*
	 * We will delete a stream if there are no references holding it open,
	 * either pending locks or open references. If this is a stream pipe,
	 * then we need to check both ends of the stream pipe. Because this
	 * function is called from the sleep-locking code, we know that
	 * "sheadp" points to the master end.
	 *
	 * NOTE(review): as written, a pipe qualifies when either end has a
	 * zero open count rather than when both do; confirm that this is the
	 * intended policy.
	 */

	unlink = sheadp->sh_open_count == 0 ||
			(SHEAD_IS_PIPE (sheadp) &&
			 SHEAD_M2SLAVE (sheadp)->sh_open_count == 0);

	/*
	 * We normally assume that we won't be unlinking the stream, because
	 * to do that we need a lock on a global list (which is more expensive
	 * than a lock on an individual stream).
	 *
	 * However, due to the relative hierarchy positions of the locks, if
	 * we discover we are likely to be the ones to dequeue the item, we
	 * take out a write lock then.
	 */

	if (sheadp->sh_lock_count > 1 && ! unlink) {
		/*
		 * Take the short path out.
		 */

		sheadp->sh_lock_count --;
		SHEAD_UNLOCK (sheadp, plbase);
		return;
	}

	/*
	 * Escalate to a write lock on the stream head list; we will need
	 * to recheck the unlink condition after we escalate, since the
	 * stream head lock has to be dropped to do so.
	 */

	SHEAD_UNLOCK (sheadp, plbase);

	(void) RW_WRLOCK (str_mem->sm_head_lock, plstr);
	(void) SHEAD_LOCK (sheadp);

	unlink = -- sheadp->sh_lock_count == 0 &&
			(sheadp->sh_open_count == 0 ||
			 (SHEAD_IS_PIPE (sheadp) &&
			  SHEAD_M2SLAVE (sheadp)->sh_open_count == 0));

	SHEAD_UNLOCK (sheadp, plstr);

	if (unlink) {
		shead_t	     **	linkp;

		/*
		 * Remove from the singly-threaded list by locating the link
		 * that points at the entry (either the list head or the
		 * "sh_next" member of the immediate predecessor). Walking
		 * the links rather than the entries copes with an empty list
		 * without dereferencing a NULL pointer.
		 */

		linkp = & str_mem->sm_streams [id];

		while (* linkp != NULL && * linkp != sheadp)
			linkp = & (* linkp)->sh_next;

		if (* linkp == sheadp)
			* linkp = sheadp->sh_next;
		else
			cmn_err (CE_WARN, "Failure unlinking stream from global directory");

		/*
		 * Note that stream pipes consist of four queues (that is,
		 * two queue pairs) and two stream heads! Regular streams
		 * also have two pairs, but only one stream head. The
		 * arguments here must match the allocation exactly, and
		 * QUEUE_FREE () accepts at most two pairs.
		 */

		SHEAD_DESTROY (sheadp);

		if (SHEAD_IS_PIPE (sheadp))
			QUEUE_FREE (sheadp->sh_head, 2,
				    2 * sizeof (* sheadp));
		else
			QUEUE_FREE (sheadp->sh_head, 2, sizeof (* sheadp));
	}

	RW_UNLOCK (str_mem->sm_head_lock, plbase);
}


/*
 * Search every global stream head list for a stream linked below "upper"
 * with the given multiplexor ID. A multiplexor ID of -1 matches any ID, so
 * the first stream found linked below the upper stream is returned. The
 * "cmd" value distinguishes the persistent ID space (I_PLINK, I_PUNLINK)
 * from the regular one (anything else).
 *
 * Returns the matching stream head, or NULL if there is none. No reference
 * is taken on the result.
 */

#if	__USE_PROTO__
__LOCAL__ shead_t * (SHEAD_FIND_MUXID) (shead_t * upper, int cmd,
					muxid_t muxid)
#else
__LOCAL__ shead_t *
SHEAD_FIND_MUXID __ARGS ((upper, cmd, muxid))
shead_t	      *	upper;
int		cmd;
muxid_t		muxid;
#endif
{
	shead_t	      *	scan;
	shead_t	      *	match;
	pl_t		saved_pl;
	int		list;
	int		wanted;

	/*
	 * Reduce "cmd" to the SH_PLINK flag setting a candidate must have.
	 */

	if (cmd == I_PLINK || cmd == I_PUNLINK)
		wanted = SH_PLINK;
	else
		wanted = 0;

	match = NULL;

	/*
	 * The lists may only be walked with the directory lock held.
	 */

	saved_pl = RW_RDLOCK (str_mem->sm_head_lock, plstr);

	for (list = DEV_SLIST ; match == NULL && list < SLIST_MAX ;
	     list ++) {

		for (scan = str_mem->sm_streams [list] ; scan != NULL ;
		     scan = scan->sh_next) {

			if (scan->sh_linked != upper)
				continue;

			if ((scan->sh_flags & SH_PLINK) != wanted)
				continue;

			if (muxid != -1 && scan->sh_muxid != muxid)
				continue;

			match = scan;
			break;
		}
	}

	RW_UNLOCK (str_mem->sm_head_lock, saved_pl);

	return match;
}


/*
 * A local helper function for stream head timeouts. It is the function
 * handed to ltimeout () by SHEAD_START_TIMEOUT (), which also calls it
 * directly when a timeout cannot be scheduled. It wakes everyone waiting on
 * the stream head and resets the timeout bookkeeping.
 *
 * "arg" is the stream head, which must be locked.
 */

#if	__USE_PROTO__
__LOCAL__ void shead_timer_func (_VOID * arg)
#else
__LOCAL__ void
shead_timer_func (arg)
_VOID	      *	arg;
#endif
{
	shead_t	      *	sheadp = (shead_t *) arg;
	unsigned	held;

	SHEAD_ASSERT_LOCKED (sheadp);

	SV_BROADCAST (sheadp->sh_wait_sv, 0);

	/* There is no longer a timeout pending or requested. */

	sheadp->sh_lock_mask &= ~ SH_TIMEFLAG;
	sheadp->sh_timeout_id = 0;
	sheadp->sh_time_count = 0;

	/*
	 * Advance "sh_time_count" on behalf of the processes that have the
	 * stream head locked, by counting one for each bit set in the
	 * SH_LOCK_MASK part of the lock mask.
	 */

	for (held = sheadp->sh_lock_mask & SH_LOCK_MASK ; held != 0 ;
	     held >>= 1) {

		if ((held & 1) != 0)
			sheadp->sh_time_count ++;
	}
}


/*
 * This function is called when the time has come to actually initiate a
 * timeout. Nothing happens unless a timeout has been requested (SH_TIMEFLAG
 * is set) and none is currently scheduled.
 *
 * Returns 1 normally. Returns 0 if the timeout could not be scheduled, in
 * which case the timeout function has been run directly and the caller
 * should behave as though the timeout has already expired.
 *
 * The stream head must be locked.
 */

#if	__USE_PROTO__
int (SHEAD_START_TIMEOUT) (shead_t * sheadp)
#else
int
SHEAD_START_TIMEOUT __ARGS ((sheadp))
shead_t	      *	sheadp;
#endif
{
	__clock_t	now;

	SHEAD_ASSERT_LOCKED (sheadp);

	if ((sheadp->sh_lock_mask & SH_TIMEFLAG) != 0 &&
	    sheadp->sh_timeout_id == 0) {

		(void) drv_getparm (LBOLT, & now);

		/*
		 * The delay is the distance from the current tick to the
		 * recorded horizon.
		 */

		sheadp->sh_timeout_id = ltimeout (shead_timer_func, sheadp,
						  sheadp->sh_timeout_tick - now,
						  sheadp->sh_basic_lockp,
						  plstr);

		if (sheadp->sh_timeout_id == 0) {
			/*
			 * No timeout could be scheduled, so fake a normal
			 * expiry by running the timeout function ourselves
			 * and tell the caller to time out immediately.
			 */

			shead_timer_func (sheadp);
			return 0;
		}
	}

	return 1;
}


/*
 * Indicate that no timeout will be necessary for this lock item. The caller
 * is still counted in "sh_time_count", and if that makes it the last of the
 * "sh_lock_count" processes to declare itself, any timeout the others asked
 * for is started.
 *
 * The stream head must be locked.
 */

#if	__USE_PROTO__
void (SHEAD_NO_TIMEOUT) (shead_t * sheadp)
#else
void
SHEAD_NO_TIMEOUT __ARGS ((sheadp))
shead_t	      *	sheadp;
#endif
{
	SHEAD_ASSERT_LOCKED (sheadp);

	ASSERT (sheadp->sh_time_count < sheadp->sh_lock_count);

	sheadp->sh_time_count ++;

	if (sheadp->sh_time_count != sheadp->sh_lock_count)
		return;

	/*
	 * Everyone has now registered, and we were the last, so it falls to
	 * us to initiate the timeout for the stream head.
	 */

	(void) SHEAD_START_TIMEOUT (sheadp);
}


/*
 * Indicate that a timeout is desired for this lock item at the given clock
 * tick.
 *
 *	sheadp		the stream head, which must be locked.
 *	end_time	the clock tick (in LBOLT units) at which the caller
 *			wants to be timed out.
 *
 * Returns 0 if the caller should time out immediately, either because
 * "end_time" has already passed or because the timeout could not be
 * scheduled. Returns 1 if the caller can go to sleep and wait for the
 * timeout.
 */

#if	__USE_PROTO__
int (SHEAD_LOCK_TIMEOUT) (shead_t * sheadp, __clock_t end_time)
#else
int
SHEAD_LOCK_TIMEOUT __ARGS ((sheadp, end_time))
shead_t	      *	sheadp;
__clock_t	end_time;
#endif
{
	__clock_t	the_time;

	SHEAD_ASSERT_LOCKED (sheadp);

	ASSERT (sheadp->sh_time_count < sheadp->sh_lock_count);


	/*
	 * If our horizon falls before the current latest value (or if there
	 * is no latest value), select our horizon time. If the horizon time
	 * is *before* the current time, return 0.
	 *
	 * Comparing time values introduces the usual problems when dealing
	 * with sequence spaces in C. While the following expression is not
	 * as efficient as relying on the semantics of unsigned->signed
	 * casting, avoiding implementation-defined behaviour is important.
	 *
	 * "clock_t" MUST be unsigned for this to work.
	 */

	ASSERT ((__clock_t) -1 > 0);

	(void) drv_getparm (LBOLT, & the_time);

	if ((__clock_t) (the_time - end_time) < ((__clock_t) -1 >> 1)) {
		/*
		 * The indicated time has already passed, so we return a
		 * timeout indication directly.
		 */

		sheadp->sh_time_count ++;
		return 0;
	}


	if ((sheadp->sh_lock_mask & SH_TIMEFLAG) == 0 ||
	    (__clock_t) (sheadp->sh_timeout_tick - end_time) <
			((__clock_t) -1 >> 1)) {
		/*
		 * "end_time" will occur before the current latest time. If
		 * a timeout has been scheduled, we cancel it because we want
		 * to post a more recent one.
		 */

		if (sheadp->sh_timeout_id != 0) {

			untimeout (sheadp->sh_timeout_id);
			sheadp->sh_timeout_id = 0;
		}

		/*
		 * Record the new horizon; SHEAD_START_TIMEOUT () works out
		 * the delay to request from "sh_timeout_tick", so without
		 * this the selected time would never take effect.
		 */

		sheadp->sh_timeout_tick = end_time;
		sheadp->sh_lock_mask |= SH_TIMEFLAG;
	}

	if (++ sheadp->sh_time_count == sheadp->sh_lock_count) {
		/*
		 * Since we are the last process to register an end time, we
		 * get to actually initiate a timeout for the stream head.
		 */

		return SHEAD_START_TIMEOUT (sheadp);
	} else {
		/*
		 * We can go to sleep and rely on someone else to actually
		 * initiate the timeout.
		 */

		return 1;
	}
}


/*
 * This function is used when a process wants to cancel a timeout after having
 * registered one. If a timeout is currently scheduled, the caller's
 * registration is withdrawn by decrementing "sh_time_count"; the scheduled
 * timeout itself is left alone. If none is scheduled, nothing is done.
 *
 * The stream head must be locked.
 */

#if	__USE_PROTO__
void (SHEAD_END_TIMEOUT) (shead_t * sheadp)
#else
void
SHEAD_END_TIMEOUT __ARGS ((sheadp))
shead_t	      *	sheadp;
#endif
{
	SHEAD_ASSERT_LOCKED (sheadp);

	if (sheadp->sh_timeout_id == 0)
		return;

	/*
	 * NOTE(review): the registration functions assert that
	 * "sh_time_count" stays below "sh_lock_count" before incrementing
	 * it, so it is unclear when it can be one greater than
	 * "sh_lock_count" here; confirm against the callers.
	 */

	ASSERT (sheadp->sh_time_count == sheadp->sh_lock_count + 1);
	sheadp->sh_time_count --;
}


/*
 * This function is used when a lock holder wishes to sleep waiting for a
 * timeout. One count is first taken back off "sh_time_count", and the
 * request is then registered through SHEAD_LOCK_TIMEOUT (), whose return
 * value is passed straight back.
 *
 * The stream head must be locked.
 */

#if	__USE_PROTO__
int (SHEAD_LOCKED_TIMEOUT) (shead_t * sheadp, __clock_t end_time)
#else
int
SHEAD_LOCKED_TIMEOUT __ARGS ((sheadp, end_time))
shead_t	      *	sheadp;
__clock_t	end_time;
#endif
{
	int		sleep_ok;

	SHEAD_ASSERT_LOCKED (sheadp);

	ASSERT (sheadp->sh_time_count > 0);

	/* Make room for the registration performed below. */

	-- sheadp->sh_time_count;

	sleep_ok = SHEAD_LOCK_TIMEOUT (sheadp, end_time);

	return sleep_ok;
}


/*
 * Common code to test whether a stream has experienced an error condition.
 * The stream head must be locked. The value of the expression is
 *
 *	EINVAL		if the stream head is linked ("sh_linked" is set);
 *	sh_werrcode	if "mode" includes FWRITE and a write error is set;
 *	sh_rerrcode	if "mode" includes FREAD (this may be 0);
 *	0		otherwise.
 *
 * Both arguments are parenthesized in the expansion so that any expression
 * can be passed, but note that "sheadp" is evaluated more than once.
 */

#define	_shead_error(sheadp,mode) \
		(SHEAD_ASSERT_LOCKED (sheadp), \
		 (sheadp)->sh_linked != NULL ? EINVAL : \
		   ((mode) & FWRITE) != 0 && (sheadp)->sh_werrcode != 0 ? \
			(sheadp)->sh_werrcode : \
		     ((mode) & FREAD) != 0 ? (sheadp)->sh_rerrcode : 0)

/*
 * Test for error or hangup conditions on the stream head given by "sheadp".
 * The caller must hold the basic lock on the stream head.
 *
 * Returns 0, with the lock still held, if the stream is healthy. Otherwise
 * the basic lock is released and a non-zero error number is returned: the
 * value reported by _shead_error (), or ENXIO if the stream has been hung up
 * and "mode" includes FREAD or FWRITE.
 */

#if	__USE_PROTO__
__LOCAL__ int (SHEAD_ERRHUP_LOCKED) (shead_t * sheadp, int mode)
#else
__LOCAL__ int
SHEAD_ERRHUP_LOCKED __ARGS ((sheadp, mode))
shead_t	      *	sheadp;
int		mode;
#endif
{
	int		error;

	SHEAD_ASSERT_LOCKED (sheadp);

	error = _shead_error (sheadp, mode);

	if (error == 0) {
		/*
		 * No error is recorded; a hangup only matters to callers
		 * that want to read or write.
		 */

		if ((mode & (FREAD | FWRITE)) == 0 ||
		    SHEAD_HANGUP (sheadp) == 0)
			return 0;

		error = ENXIO;
	}

	/* Failure paths hand back the error with the lock released. */

	SHEAD_UNLOCK (sheadp, plbase);
	return error;
}


/*
 * Definitions used for the "interruptible" parameter to SHEAD_WAIT (),
 * SHEAD_SLEEP_LOCK () and SHEAD_SLEEP_LOCKED ().
 */

enum {
	DONT_SIGNAL = 0,	/* sleep with SV_WAIT (); signals ignored */
	CHECK_SIGNALS = 1	/* sleep with SV_WAIT_SIG (); may get EINTR */
};


/*
 * Common interface for waiting for an event at a stream head. The wait uses
 * the same synchronization variable ("sh_wait_sv") as the stream head locking
 * code in this implementation.
 *
 * The caller must hold the basic lock on the stream head. "mode" selects
 * which error/hangup conditions are tested before sleeping (see
 * SHEAD_ERRHUP_LOCKED ()), "category" is the wait flag recorded in
 * "sh_lock_mask" for SHEAD_WAKE () to match, and "interruptible" is either
 * DONT_SIGNAL or CHECK_SIGNALS.
 *
 * Returns 0 on a normal wakeup, the error number from SHEAD_ERRHUP_LOCKED ()
 * if the stream is in error, or EINTR if SV_WAIT_SIG () reports that the
 * sleep was interrupted. Callers in this file re-acquire the basic lock after
 * every return from this function.
 */

#if	__USE_PROTO__
__LOCAL__ int (SHEAD_WAIT) (shead_t * sheadp, int mode, cat_t category,
			    int interruptible)
#else
__LOCAL__ int
SHEAD_WAIT __ARGS ((sheadp, mode, category, interruptible))
shead_t	      *	sheadp;
int		mode;
cat_t		category;
int		interruptible;
#endif
{
	int		errcode;

	SHEAD_ASSERT_LOCKED (sheadp);

	/*
	 * Never sleep on a stream that is already in error or hung up.
	 */

	errcode = SHEAD_ERRHUP_LOCKED (sheadp, mode);

	if (errcode != 0)
		return errcode;

	/*
	 * Advertise the kind of event we are about to sleep on.
	 */

	sheadp->sh_lock_mask |= category;

	if (interruptible != CHECK_SIGNALS) {

		SV_WAIT (sheadp->sh_wait_sv, primed, sheadp->sh_basic_lockp);
		return 0;
	}

	if (SV_WAIT_SIG (sheadp->sh_wait_sv, primed,
			 sheadp->sh_basic_lockp) == 0)
		return EINTR;

	return 0;
}


/*
 * Variant of SHEAD_WAIT () that honours non-blocking mode.
 *
 * The caller must hold the basic lock on the stream head. If "mode" contains
 * FNDELAY or FNONBLOCK the basic lock is released (to "plbase") and EAGAIN is
 * returned without sleeping; otherwise the request is passed unchanged to
 * SHEAD_WAIT () and its result is returned.
 */

#if	__USE_PROTO__
__LOCAL__ int (SHEAD_WAIT_NONBLOCK) (shead_t * sheadp, int mode,
				     cat_t category, int interruptible)
#else
__LOCAL__ int
SHEAD_WAIT_NONBLOCK __ARGS ((sheadp, mode, category, interruptible))
shead_t	      *	sheadp;
int		mode;
cat_t		category;
int		interruptible;
#endif
{
	SHEAD_ASSERT_LOCKED (sheadp);

	if ((mode & (FNDELAY | FNONBLOCK)) == 0)
		return SHEAD_WAIT (sheadp, mode, category, interruptible);

	/*
	 * The caller asked not to block.
	 */

	SHEAD_UNLOCK (sheadp, plbase);
	return EAGAIN;
}


/*
 * Used by lower-level code to signal events to functions that have waited
 * via SHEAD_WAIT ().
 *
 * Takes the basic lock on the stream head itself. If any of the bits in
 * "category" are set in "sh_lock_mask" they are cleared and every sleeper on
 * "sh_wait_sv" is woken; if none are set, nobody is woken. The previous
 * priority level is restored on exit.
 */

#if	__USE_PROTO__
void (SHEAD_WAKE) (shead_t * sheadp, cat_t category)
#else
void
SHEAD_WAKE __ARGS ((sheadp, category))
shead_t	      *	sheadp;
cat_t		category;
#endif
{
	pl_t		saved_pl;

	saved_pl = SHEAD_LOCK (sheadp);

	if ((sheadp->sh_lock_mask & category) == 0) {
		/*
		 * Nobody registered an interest in this kind of event.
		 */

		SHEAD_UNLOCK (sheadp, saved_pl);
		return;
	}

	sheadp->sh_lock_mask &= ~ category;
	SV_BROADCAST (sheadp->sh_wait_sv, 0);

	SHEAD_UNLOCK (sheadp, saved_pl);
}


/*
 * Common code for locking a stream head, shared between open-style locks
 * (which may need to allocate new head structures and need special
 * coordination with the stream head destruction code) and other kinds.
 *
 * We expect that the caller will have taken out a basic lock on the stream
 * head and that the caller will have incremented the lock count of the item
 * to prevent it from being deallocated while we wait.
 *
 * "category" is the set of lock bits wanted in "sh_lock_mask", "timeout" is
 * a relative time in clock ticks (0 means wait indefinitely), and
 * "interruptible" is DONT_SIGNAL or CHECK_SIGNALS.
 *
 * Returns 0 with the category bits set and the basic lock released, or
 *	ETIME	if the timeout expired,
 *	EINTR	if the sleep was interrupted by a signal,
 *	ENODEV	if the stream head's device number changed while we slept.
 * On failure the reference taken by the caller is dropped with
 * SHEAD_UNREFERENCE ().
 */

#if	__USE_PROTO__
int (SHEAD_SLEEP_LOCKED) (shead_t * sheadp, cat_t category, __clock_t timeout,
			  int interruptible)
#else
int
SHEAD_SLEEP_LOCKED __ARGS ((sheadp, category, timeout, interruptible))
shead_t	      *	sheadp;
cat_t		category;
__clock_t	timeout;
int		interruptible;
#endif
{
	__clock_t	end_time;	/* LBOLT when we time out */
	n_dev_t		devno;
	int		retval;

	SHEAD_ASSERT_LOCKED (sheadp);

	/*
	 * Since we expect the caller to have incremented the lock count, then
	 * the caller must have selected the master end of the stream pipe.
	 */

	ASSERT (sheadp == SHEAD_MASTER (sheadp));

	/*
	 * If we are going to be (possibly) timing out, calculate the time
	 * when that will happen.
	 */

	if (timeout > 0) {

		(void) drv_getparm (LBOLT, & end_time);
		end_time += timeout;
	}


	/*
	 * There is a special case that we have to note; in the case of a
	 * clone open, the device number of a stream head may be altered by
	 * the driver open routine. In this case, drivers waiting on the old
	 * number will have to be notified and give up on their lock attempts.
	 */

	devno = sheadp->sh_dev;


	/*
	 * Now we begin the actual business of locking the stream head.
	 */

	for (;;) {
		int		sigflg;

		/*
		 * Check to see whether our category is blocked. At this point
		 * we hold the basic lock.
		 *
		 * An earlier version of this code had an explicit check for
		 * final close. I have no idea why, because if a stream is in
		 * final close, what else can happen?
		 */

		if ((sheadp->sh_lock_mask & category) == 0) {
			/*
			 * We can acquire a lock on the stream head in our
			 * chosen category, so we do so. Since we will not
			 * need a timeout, we increment the timeout count.
			 *
			 * If during later processing we need a timeout, we
			 * hook into this mechanism, but for simplicity we
			 * assume we won't.
			 */

			SHEAD_NO_TIMEOUT (sheadp);

			/*
			 * The category bits must be claimed while the basic
			 * lock is still held; otherwise a second contender
			 * could pass the test above before we mark the
			 * category as taken.
			 */

			sheadp->sh_lock_mask |= category;

			SHEAD_UNLOCK (sheadp, plbase);

			return 0;
		}


		/*
		 * We need to wait, interruptibly. We might also want to time
		 * out at some stage.
		 *
		 * We (optionally) call a function to register the time we
		 * want to expire; this function also takes care of checking
		 * for timeout expiry.
		 */

		if (timeout == 0)
			SHEAD_NO_TIMEOUT (sheadp);
		else if (SHEAD_LOCK_TIMEOUT (sheadp, end_time) == 0) {
			/*
			 * Our horizon time has passed, so we return ETIME.
			 */

			retval = ETIME;
			break;
		}


		/*
		 * Now we can wait. No matter how we wake up, we will need
		 * to relock the global basic lock.
		 *
		 * The caller may not want this wait to be interruptible; this
		 * is reasonable when the lock is being acquired in some
		 * nested context where things are difficult to back out.
		 */

		if (interruptible != DONT_SIGNAL)
			sigflg = SV_WAIT_SIG (sheadp->sh_wait_sv, primed,
					      sheadp->sh_basic_lockp);
		else {
			SV_WAIT (sheadp->sh_wait_sv, primed,
				 sheadp->sh_basic_lockp);
			sigflg = 1;
		}

		(void) SHEAD_LOCK (sheadp);

		if (sigflg == 0) {
			/*
			 * We have been interrupted by a signal, so bang out
			 * to the caller with EINTR.
			 */

			retval = EINTR;
			break;
		} else if (sheadp->sh_dev != devno) {
			/*
			 * The device number has been altered. Flag the fact
			 * to the caller and give up this lock attempt.
			 */

			retval = ENODEV;
			break;
		}


		/*
		 * Now we have the global basic lock, we can wrap around to
		 * the start of the loop to recheck all our conditions.
		 */
	}


	/*
	 * For some reason we are aborting the lock attempt. The code which
	 * made us take this exit path has set "retval" to an error code, and
	 * we hold the basic lock.
	 */

	if (sheadp->sh_time_count > sheadp->sh_lock_count) {

		sheadp->sh_time_count --;
		ASSERT (sheadp->sh_time_count == sheadp->sh_lock_count);

		SHEAD_START_TIMEOUT (sheadp);
	}

	SHEAD_UNREFERENCE (sheadp);
	return retval;
}


/*
 * Entry point to the stream head locking system for routines that already
 * have a reference to the stream head.
 *
 * "category", "timeout" and "interruptible" have the same meaning as for
 * SHEAD_SLEEP_LOCKED (), which does the real work and supplies the return
 * value. The one case handled entirely here is a request for nothing but
 * read/write locks on a stream head that does not have SH_RWLOCKING set:
 * that returns 0 immediately without taking any sleep lock.
 */

#if	__USE_PROTO__
int (SHEAD_SLEEP_LOCK) (shead_t * sheadp, cat_t category, __clock_t timeout,
			int interruptible)
#else
int
SHEAD_SLEEP_LOCK __ARGS ((sheadp, category, timeout, interruptible))
shead_t	      *	sheadp;
cat_t		category;
__clock_t	timeout;
int		interruptible;
#endif
{
	int		rw_only;

	SHEAD_LOCK (sheadp);

	/*
	 * The read and write lock modes are experimental, and are honoured
	 * only when the stream head has the SH_RWLOCKING mode bit set.
	 */

	rw_only = (category & ~ (SH_READ_LOCK | SH_WRITE_LOCK)) == 0;

	if (rw_only && (sheadp->sh_flags & SH_RWLOCKING) == 0) {

		SHEAD_UNLOCK (sheadp, plbase);
		return 0;
	}


	/*
	 * Locks on either end of a stream pipe are always taken on the master
	 * stream head of the pair, so that changes made from the two ends are
	 * single-threaded. SHEAD_SLEEP_UNLOCK () performs the matching
	 * redirection.
	 */

	sheadp = SHEAD_MASTER (sheadp);


	/*
	 * Count ourselves as a lock holder/waiter before handing over to the
	 * common locking code, which expects this to have been done.
	 */

	sheadp->sh_lock_count ++;

	return SHEAD_SLEEP_LOCKED (sheadp, category, timeout, interruptible);
}


/*
 * This is a special form of the stream head locking code for open () access,
 * which specially coordinates with the close code to discover when to
 * allocate a new stream head, and carefully avoids the problems that can
 * occur if the stream head were to be deallocated while we are waiting for
 * it to be unlocked.
 *
 * We also have to do some funky stuff here because of clone opens.
 *
 * "dev" identifies the stream, "stabp" is used to initialize a newly created
 * stream head, and "retvalp" (which must not be NULL) receives the status.
 *
 * Returns the stream head, locked in the SH_OPENCLOSE category, with
 * "* retvalp" set to 0; or NULL with "* retvalp" set to an error number
 * (ENFILE if no queue pair could be allocated, otherwise the error from
 * SHEAD_SLEEP_LOCKED ()).
 */

#if	__USE_PROTO__
shead_t * (SHEAD_OPEN_LOCK) (n_dev_t dev, struct streamtab * stabp,
			     int * retvalp)
#else
shead_t *
SHEAD_OPEN_LOCK __ARGS ((dev, stabp, retvalp))
n_dev_t		dev;
struct streamtab
	      * stabp;
int	      *	retvalp;
#endif
{
	shead_t	      *	sheadp;

	ASSERT (retvalp != NULL);

	/*
	 * PHASE 1: Locate the stream head. If the stream head did not
	 * previously exist, we might be able to lock it immediately by virtue
	 * of being able to create it that way. Of course, simultaneous open
	 * attempts might result in this looping as only one of the created
	 * stream heads will be entered in the stream directory.
	 */

	* retvalp = 0;

	for (;;) {
		queue_t	      *	q;

		/*
		 * The first thing we need to do is *find* the stream. We call
		 * a find routine that increments a reference count so that
		 * we can be sure that the stream will not be deallocated
		 * while we wait.
		 */

		if ((sheadp = SHEAD_FIND_AND_LOCK (dev, DEV_SLIST)) != NULL) {
			/*
			 * Sleep lock time; our call to SHEAD_FIND_AND_LOCK ()
			 * will have incremented the lock count of the stream
			 * head so it won't disappear underneath us.
			 */

			* retvalp = SHEAD_SLEEP_LOCKED (sheadp, SH_OPENCLOSE,
							0, CHECK_SIGNALS);

			if (* retvalp == 0)
				return sheadp;

			if (* retvalp != ENODEV)
				return NULL;

			/*
			 * The lock attempt failed because a clone open
			 * changed the stream head's device number, so we
			 * need to try again. The ENODEV status is discarded
			 * here; if the retry ends up creating a fresh stream
			 * head the caller must not see a stale error.
			 */

			* retvalp = 0;
			continue;
		}


		/*
		 * There ain't no such stream, so we have to allocate a queue
		 * pair.
		 */

		if ((q = QUEUE_ALLOC (2, sizeof (* sheadp))) == NULL) {

			* retvalp = ENFILE;
			return NULL;
		}

		sheadp = (shead_t *) q->q_ptr;

		SHEAD_INIT (sheadp, stabp, dev, q);

		/*
		 * The new stream head is created already locked for
		 * open/close by us.
		 */

		sheadp->sh_ref_count = 1;
		sheadp->sh_lock_mask = SH_OPENCLOSE;
		sheadp->sh_time_count = sheadp->sh_lock_count = 1;

		if (SHEAD_ADD (sheadp) == 0)
			return sheadp;

		/*
		 * The new queue could not be added to the stream
		 * directory, presumably because of a nearly
		 * simultaneous open attempt.
		 *
		 * We undo the allocation we wrought before retrying.
		 */

		SHEAD_DESTROY (sheadp);
		QUEUE_FREE (q, 2, sizeof (* sheadp));
	}
}


/*
 * Unlock a stream head.
 *
 * Releases the lock bits in "category" that were previously acquired with
 * SHEAD_SLEEP_LOCK () or SHEAD_OPEN_LOCK (), wakes every sleeper on the
 * stream head so that they can re-test their conditions, and drops the
 * caller's entry in the timeout and reference accounting.
 *
 * As in SHEAD_SLEEP_LOCK (), a request that names only read/write locks on a
 * stream head without SH_RWLOCKING is a no-op.
 */

#if	__USE_PROTO__
void (SHEAD_SLEEP_UNLOCK) (shead_t * sheadp, cat_t category)
#else
void
SHEAD_SLEEP_UNLOCK __ARGS ((sheadp, category))
shead_t	      *	sheadp;
cat_t		category;
#endif
{
	ASSERT (sheadp != NULL);
	ASSERT (category != 0);

	/*
	 * See if locking is necessary for the read and write operations.
	 */

	if ((sheadp->sh_flags & SH_RWLOCKING) == 0 &&
	    (category & ~ (SH_READ_LOCK | SH_WRITE_LOCK)) == 0) {
		/*
		 * Since we don't actually acquire any locks, we return early.
		 */

		ASSERT (sheadp->sh_open_count > 0);
		return;
	}


	/*
	 * We direct all locking operations on the slave part of a stream pipe
	 * to the master end.
	 */

	sheadp = SHEAD_MASTER (sheadp);

	(void) SHEAD_LOCK (sheadp);

	/*
	 * The lock state lives in the master stream head, so the consistency
	 * checks are made there, under the basic lock.
	 */

	ASSERT (sheadp->sh_lock_count > 0);
	ASSERT ((sheadp->sh_lock_mask & category) == category);


	/*
	 * Unlike SHEAD_WAKE (), we can assume that our category mask will not
	 * be NULL because of the difference in interpretation between lock
	 * flags (indicating a holder) and wait flags (indicating a waiter).
	 */

	sheadp->sh_lock_mask &= ~ category;
	SV_BROADCAST (sheadp->sh_wait_sv, 0);


	/*
	 * We don't use SHEAD_END_TIMEOUT () here since we are a lock holder
	 * and shead_time_func () makes sure to keep our "sh_time_count" entry
	 * greater than 0.
	 */

	ASSERT (sheadp->sh_time_count > 0);
	sheadp->sh_time_count --;

	if (sheadp->sh_time_count == 0 && sheadp->sh_timeout_id != 0) {
		/*
		 * Since there is no-one waiting for anything, cancel any
		 * pending timeouts.
		 */

		untimeout (sheadp->sh_timeout_id);

		/*
		 * Clear only the timeout flag; every other bit in the mask
		 * must be left alone.
		 */

		sheadp->sh_lock_mask &= ~ SH_TIMEFLAG;
		sheadp->sh_timeout_id = 0;
	}


	/*
	 * Now that we have done everything that requires access to the stream
	 * head, decrement the lock count.
	 */

	SHEAD_UNREFERENCE (sheadp);
}


/*
 * Debugging aid: assert that the stream head is sleep-locked in every
 * category named by "category". We cannot tell who holds the lock, so this
 * only checks that somebody does. For a stream pipe the check is made
 * against the master stream head.
 */

#if	__USE_PROTO__
__LOCAL__ void (ASSERT_SLEEP_LOCKED) (shead_t * sheadp, cat_t category)
#else
__LOCAL__ void
ASSERT_SLEEP_LOCKED __ARGS ((sheadp, category))
shead_t	      *	sheadp;
cat_t		category;
#endif
{
	shead_t	      *	holder;

	ASSERT (sheadp != NULL);

	holder = SHEAD_IS_PIPE (sheadp) ? SHEAD_MASTER (sheadp) : sheadp;

	ASSERT ((holder->sh_lock_mask & category) == category);
}


/*
 * Wait for a queue to drain. The caller must have the stream head locked
 * (SH_OPENCLOSE) when calling this function.
 *
 * "sheadp" is the stream head whose "sh_cltime" gives the maximum time to
 * wait, and "q" is the queue to watch. The function returns when "q" has no
 * messages queued ("q_first" is NULL), when the time limit has passed, or
 * immediately if "sh_cltime" is zero. Nothing is returned to say which.
 */

#if	__USE_PROTO__
__LOCAL__ void (DRAIN_QUEUE) (shead_t * sheadp, queue_t * q)
#else
__LOCAL__ void
DRAIN_QUEUE __ARGS ((sheadp, q))
shead_t	      *	sheadp;
queue_t	      *	q;
#endif
{
	__clock_t	end_time;

	ASSERT_SLEEP_LOCKED (sheadp, SH_OPENCLOSE);

	/*
	 * When we are in final close, the STREAMS specification says we
	 * should wait for up to 15 seconds for the write-side queue to be
	 * drained of data, unless we are in O_NONBLOCK mode.
	 *
	 * Here the limit comes from "sh_cltime"; a value of zero means that
	 * we do not wait at all.
	 */

	if (sheadp->sh_cltime == 0)
		return;

	/*
	 * Convert the relative close time into an absolute LBOLT value.
	 */

	(void) drv_getparm (LBOLT, & end_time);
	end_time += sheadp->sh_cltime;

	for (;;) {
		/*
		 * We need to acquire a basic lock to pass to SV_WAIT (), and
		 * the code that wakes us up will attempt to acquire the same
		 * lock (see QUEUE_DRAINED ()). Since the wakeup code must
		 * acquire the lock while holding the stream frozen, we must
		 * do things in the same order to prevent the possibility of
		 * deadlock.
		 */

		(void) QFREEZE_TRACE (q, "DRAIN_QUEUE");

		if (q->q_first == NULL) {
			/*
			 * No messages on the queue => our job is done.
			 */

			QUNFREEZE_TRACE (q, plbase);
			break;
		}


		/*
		 * We are going to wait for this queue to become empty, so we
		 * set a flag to indicate that we are interested in finding
		 * out when that happens. Note that we don't ever clear the
		 * flag in this routine; that is the responsibility of the
		 * code which will wake us up.
		 */

		q->q_flag |= QDRAIN;


		/*
		 * We don't build on SHEAD_WAIT (), although we do expect to
		 * be woken up via SHEAD_WAKE (). We transfer our lock from
		 * the queue to the stream head: the stream head basic lock is
		 * taken first and only then is the queue unfrozen.
		 */

		(void) SHEAD_LOCK (sheadp);

		QUNFREEZE_TRACE (q, plstr);


		/*
		 * Register when we want to time out. If that time has already
		 * passed, then exit to the caller.
		 */

		if (SHEAD_LOCKED_TIMEOUT (sheadp, end_time) == 0) {

			SHEAD_UNLOCK (sheadp, plbase);
			return;
		}

		/*
		 * Record the kind of event we are sleeping on so that
		 * SHEAD_WAKE () knows that there is somebody to wake.
		 */

		sheadp->sh_lock_mask |= SH_DRAIN_WAIT;

		SV_WAIT (sheadp->sh_wait_sv, primed, sheadp->sh_basic_lockp);

		/*
		 * We were signalled, so we try again. The loop re-freezes the
		 * queue and re-tests it from scratch.
		 */
	}
}


/*
 * Used in the implementation of the I_LIST ioctl (). Returns the number of
 * write-side queues below the stream head that do not have QPROCSOFF set;
 * this is the count of modules on the stream, including the topmost driver.
 */

#if	__USE_PROTO__
__LOCAL__ int (SHEAD_MODCOUNT) (shead_t * sheadp)
#else
__LOCAL__ int
SHEAD_MODCOUNT __ARGS ((sheadp))
shead_t	      *	sheadp;
#endif
{
	pl_t		saved_pl;
	int		nmods;
	queue_t	      *	wq;

	ASSERT (sheadp != NULL);

	/*
	 * The walk can be disturbed by pushes and pops made from either end
	 * of a stream pipe, so it is always protected by the lock at the
	 * master end.
	 */

	saved_pl = SHEAD_LOCK (SHEAD_MASTER (sheadp));

	nmods = 0;
	wq = W (sheadp->sh_head)->q_next;

	while (wq != NULL) {
		/*
		 * Queues whose procedures are turned off are not counted.
		 */

		if ((wq->q_flag & QPROCSOFF) == 0)
			nmods ++;

		wq = wq->q_next;
	}

	SHEAD_UNLOCK (SHEAD_MASTER (sheadp), saved_pl);

	return nmods;
}


/*
 * Utility routine to return the first write queue below the stream head that
 * has not been disabled with qprocsoff (). The stream head is locked for the
 * duration of the walk. Running off the end of the stream without finding
 * such a queue is treated as fatal (CE_PANIC).
 */

#if	__USE_PROTO__
__LOCAL__ queue_t * (TOP_QUEUE) (shead_t * sheadp)
#else
__LOCAL__ queue_t *
TOP_QUEUE __ARGS ((sheadp))
shead_t	      *	sheadp;
#endif
{
	pl_t		saved_pl;
	queue_t	      *	wq;

	ASSERT (sheadp != NULL);

	/*
	 * The walk can be disturbed by pushes and pops made from either end
	 * of a stream pipe, so it is always protected by the lock at the
	 * master end.
	 */

	saved_pl = SHEAD_LOCK (SHEAD_MASTER (sheadp));

	for (wq = W (sheadp->sh_head)->q_next ;
	     (wq->q_flag & QPROCSOFF) != 0 ; ) {

		wq = wq->q_next;

		if (wq == NULL)
			cmn_err (CE_PANIC, "Off end of stream in TOP_QUEUE ()");
	}

	SHEAD_UNLOCK (SHEAD_MASTER (sheadp), saved_pl);

	return wq;
}


/*
 * Utility routine for POP_MODULE () and the I_LOOK processing code that finds
 * the queue entry for the first module on the stream (if any).
 *
 * Returns the read queue of the topmost module, or NULL if the thing below
 * the stream head is a driver, or if its read and write queues are
 * interchanged relative to the stream head (as they would be at the
 * crossover point of a STREAMS-based FIFO).
 */

#if	__USE_PROTO__
__LOCAL__ queue_t * (TOP_MODULE) (shead_t * sheadp)
#else
__LOCAL__ queue_t *
TOP_MODULE __ARGS ((sheadp))
shead_t	      *	sheadp;
#endif
{
	queue_t	      *	wq;		/* write queue below the head */
	queue_t	      *	below;		/* whatever follows "wq" */
	pl_t		freeze_pl;

	wq = TOP_QUEUE (sheadp);

	/*
	 * A queue whose QREADR sense differs from the stream head write
	 * queue is on the far side of a crossover.
	 */

	if ((wq->q_flag & QREADR) != (W (sheadp->sh_head)->q_flag & QREADR))
		return NULL;

	/*
	 * Telling a module from a driver does not need QUEUE_NEXT (), since
	 * the exact value of "q_next" is unimportant; all that matters is
	 * whether or not it is NULL. The queue is frozen only for the read.
	 */

	freeze_pl = QFREEZE_TRACE (wq, "TOP_MODULE");
	below = wq->q_next;
	QUNFREEZE_TRACE (wq, freeze_pl);

	return below == NULL ? NULL : R (wq);
}


/*
 * Code common to both POP_MODULE () and PUSH_MODULE () for removing and
 * deallocating a queue pair from a stream.
 *
 * "module" is the read queue of the pair directly below the stream head
 * "sheadp". The pair is spliced out of both the write side and the read side
 * of the stream and then released with QUEUE_FREE ().
 */

#if	__USE_PROTO__
__LOCAL__ void (POP_AND_FREE) (shead_t * sheadp, queue_t * module)
#else
__LOCAL__ void
POP_AND_FREE __ARGS ((sheadp, module))
shead_t	      *	sheadp;
queue_t	      *	module;
#endif
{
	pl_t		saved_pl;
	pl_t		freeze_pl;
	queue_t	      *	head_wq;	/* stream head write queue */
	queue_t	      *	below_rq;	/* read queue below the module */


	/*
	 * Each queue is frozen while its "q_next" is rewritten. That alone is
	 * not enough on a stream pipe, where the stream head at the far end
	 * could be editing the same pointers, so the whole operation is also
	 * done under the lock belonging to the master end of the pipe. Since
	 * both ends contend for that one lock, no concurrent modification of
	 * the stream is possible.
	 */

	saved_pl = SHEAD_LOCK (SHEAD_MASTER (sheadp));

	/*
	 * Write side: make the stream head skip over the module.
	 */

	head_wq = W (sheadp->sh_head);

	ASSERT (head_wq->q_next == W (module));

	freeze_pl = QFREEZE_TRACE (head_wq, "POP_AND_FREE");
	head_wq->q_next = W (module)->q_next;
	QUNFREEZE_TRACE (head_wq, freeze_pl);


	/*
	 * Read side: make the queue that fed the module feed the stream head
	 * instead.
	 */

	below_rq = OTHERQ (head_wq->q_next);

	ASSERT (below_rq->q_next == module);

	freeze_pl = QFREEZE_TRACE (below_rq, "POP_AND_FREE");
	below_rq->q_next = sheadp->sh_head;
	QUNFREEZE_TRACE (below_rq, freeze_pl);


	SHEAD_UNLOCK (SHEAD_MASTER (sheadp), saved_pl);


	/*
	 * Nothing refers to the pair any more, so de-initialize it and free
	 * the memory.
	 */

	QUEUE_FREE (module, 1, 0);
}


/*
 * Pop a module from a stream. In order to request this, the caller must have
 * the stream head sleep-locked for modification (SH_OPENCLOSE; see
 * SHEAD_SLEEP_LOCK ()).
 *
 * "q" is the read queue of the module to remove; "mode" and "credp" are
 * passed through to the module's close routine.
 *
 * Returns the value returned by the module's close routine (0 on success).
 * The module is removed and freed whatever that value is.
 */

#if	__USE_PROTO__
int (POP_MODULE) (shead_t * sheadp, queue_t * q, int mode, cred_t * credp)
#else
int
POP_MODULE __ARGS ((sheadp, q, mode, credp))
shead_t	      *	sheadp;
queue_t	      *	q;
int		mode;
cred_t	      *	credp;
#endif
{
	int		close_err;

	ASSERT (q != NULL);
	ASSERT_SLEEP_LOCKED (sheadp, SH_OPENCLOSE);

	/*
	 * Popping a module blows it away. No draining is done here; a caller
	 * that needs the module drained first (as in final close) has to do
	 * so itself.
	 */

	close_err = (* q->q_qinfo->qi_qclose) (q, mode, credp);

	if (close_err != 0)
		cmn_err (CE_WARN, "module close returned %d in POP_MODULE",
			 close_err);

	/*
	 * A well-behaved close routine turns off its put and service
	 * routines; do it on the module's behalf if it forgot.
	 */

	if ((q->q_flag & QPROCSOFF) == 0) {

		cmn_err (CE_WARN, "Module %s did not call qprocsoff ()",
			 q->q_qinfo->qi_minfo->mi_idname);
		qprocsoff (q);
	}


	/*
	 * The queue pair is released even when the close reported an error.
	 */

	POP_AND_FREE (sheadp, q);

	return close_err;
}


/*
 * This function pushes the indicated module onto a stream. The caller must
 * have the stream head sleep-locked for modification (SH_OPENCLOSE).
 *
 * "mode" and "credp" are passed through to the module's open routine, which
 * is called with the MODOPEN flag.
 *
 * Returns 0 on success, ENOSR if no queue pair could be allocated, or the
 * error returned by the module's open routine. If the open fails the new
 * queue pair is unlinked from the stream and freed again.
 */

#if	__USE_PROTO__
int (PUSH_MODULE) (shead_t * sheadp, int mode, cred_t * credp,
		   modsw_t * module)
#else
int
PUSH_MODULE __ARGS ((sheadp, mode, credp, module))
shead_t	      *	sheadp;
int		mode;
cred_t	      *	credp;
modsw_t	      *	module;
#endif
{
	queue_t	      *	q;
	queue_t	      *	prev;
	pl_t		prev_pl;
	pl_t		q_pl;
	int		retval;
	n_dev_t		dev;
	char	      *	modname;

	ASSERT (module != NULL);
	ASSERT_SLEEP_LOCKED (sheadp, SH_OPENCLOSE);

	if ((q = QUEUE_ALLOC (1, 0)) == NULL)
		return ENOSR;

	QUEUE_INIT (q, module->mod_stream, QI_NORMAL);

	/*
	 * First we have to link the module into the stream. The newly
	 * allocated queues will have the QPROCSOFF flag set so that they are
	 * ignored by other stream elements until the new module has been
	 * opened.
	 *
	 * As in POP_AND_FREE (), we take the lock on the stream head, which
	 * must be the master end if this is a stream pipe.
	 */

	prev_pl = SHEAD_LOCK (SHEAD_MASTER (sheadp));

	/*
	 * Write side: the new queue goes between the stream head and
	 * whatever used to follow it.
	 */

	prev = W (sheadp->sh_head);
	W (q)->q_next = prev->q_next;

	q_pl = QFREEZE_TRACE (prev, "PUSH_MODULE");
	prev->q_next = W (q);
	QUNFREEZE_TRACE (prev, q_pl);

	/*
	 * Read side: the queue that used to feed the stream head now feeds
	 * the new module, which in turn feeds the stream head.
	 */

	prev = OTHERQ (W (q)->q_next);
	q->q_next = sheadp->sh_head;

	ASSERT (prev->q_next == sheadp->sh_head);

	q_pl = QFREEZE_TRACE (prev, "PUSH_MODULE");
	prev->q_next = q;
	QUNFREEZE_TRACE (prev, q_pl);

	SHEAD_UNLOCK (SHEAD_MASTER (sheadp), prev_pl);


	/*
	 * Ask the module to set itself up. A local copy of the device number
	 * is passed so that we can detect a module that (wrongly) alters it.
	 */

	dev = sheadp->sh_dev;

	retval = (* q->q_qinfo->qi_qopen) (q, & dev, mode, MODOPEN,
					   credp);

	modname = q->q_qinfo->qi_minfo->mi_idname;

	if (dev != sheadp->sh_dev)
		cmn_err (CE_WARN, "Module \"%s\" altered its \"dev\" parameter",
			 modname);

	if (retval != 0) {
		/*
		 * OK, don't really open this. The module should not have
		 * turned on its put and service routines; if it did, they
		 * must be turned off again before the queue pair is freed.
		 */

		if ((q->q_flag & QPROCSOFF) == 0) {

			cmn_err (CE_WARN, "PUSH_MODULE () : Module %s enabled queue!",
				 modname);
			qprocsoff (q);
		}

		POP_AND_FREE (sheadp, q);
	} else if ((q->q_flag & QPROCSOFF) != 0) {
		/*
		 * The open succeeded but the module forgot to enable its
		 * queues, so do it on the module's behalf.
		 */

		cmn_err (CE_WARN, "PUSH_MODULE () : Module %s did not enable queue!",
			 modname);
		qprocson (q);
	}

	return retval;
}


/*
 * This function sends an IOCTL message downstream, then blocks until either
 * a reply arrives at the stream head, the (optional) timeout has expired, or
 * a signal is received.
 *
 * The caller must have the stream head locked for ioctl () processing.
 *
 * "msg" is the message to send (ownership passes to the stream), "mode" is
 * the open mode used for error/hangup checks, "errretp" receives the error
 * number on failure, and "timeout_time" is a relative timeout in clock ticks
 * (0 means wait indefinitely).
 *
 * Returns the reply message taken from "sh_ioc_msg", or NULL on failure with
 * "* errretp" set (ETIME on timeout, otherwise the value returned by
 * SHEAD_WAIT ()). "* errretp" is not written on success.
 */

#if	__USE_PROTO__
mblk_t * (IOCTL_SEND) (shead_t * sheadp, int mode, mblk_t * msg,
		       int * errretp, __clock_t timeout_time)
#else
mblk_t *
IOCTL_SEND __ARGS ((sheadp, mode, msg, errretp, timeout_time))
shead_t	      *	sheadp;
int		mode;
mblk_t	      *	msg;
int	      *	errretp;
__clock_t	timeout_time;
#endif
{
	__clock_t	end_time;
	int		retval;

	ASSERT_SLEEP_LOCKED (sheadp, SH_IOCTL_LOCK);
	ASSERT (errretp != NULL);
	ASSERT (msg != NULL);

	/*
	 * Work out the absolute LBOLT time at which the optional timeout
	 * expires. "end_time" is only used when "timeout_time" is positive.
	 */

	if (timeout_time > 0) {

		(void) drv_getparm (LBOLT, & end_time);
		end_time += timeout_time;
	}


	/*
	 * Now send the client's message downstream. We queue the message on
	 * the write queue rather than directly putting it to avoid the
	 * possibility of deadlock if an acknowledgement is sent back to us
	 * while we are holding the basic lock below.
	 *
	 * We also have to check for the possibility that an error has occurred
	 * on the stream.
	 */

	(void) SHEAD_LOCK (sheadp);

	for (;;) {
		if (sheadp->sh_ioc_msg != NULL) {
			/*
			 * If there is some stale ioctl () message lying
			 * around, dispose of it.
			 */

			freemsg (sheadp->sh_ioc_msg);
			sheadp->sh_ioc_msg = NULL;
		}

		/*
		 * The request is queued on the first pass only; "msg" is
		 * cleared so that later passes just wait.
		 */

		if (msg != NULL)
			putq (W (sheadp->sh_head), msg);

		msg = NULL;


		/*
		 * Before we go to sleep, we schedule our timeout.
		 */

		if (timeout_time > 0 &&
		    SHEAD_LOCKED_TIMEOUT (sheadp, end_time) == 0) {
			/*
			 * We have timed out.
			 */

			retval = ETIME;
			break;
		}


		if (sheadp->sh_open_count == 0) {
			/*
			 * This is a kernel-generated ioctl () and signalling
			 * is not allowed to interrupt us. We pass a zero mode
			 * so that read/write error codes and hangups are not
			 * tested.
			 */

			ASSERT_SLEEP_LOCKED (sheadp, SH_OPENCLOSE);

			retval = SHEAD_WAIT (sheadp, 0, SH_IOCTL_WAIT,
					     DONT_SIGNAL);
		} else {
			/*
			 * SHEAD_WAIT () checks for hangups as well, if the
			 * mode includes read or write.
			 */

			ASSERT ((mode & (FREAD | FWRITE)) != 0);

			retval = SHEAD_WAIT (sheadp, mode, SH_IOCTL_WAIT,
					     CHECK_SIGNALS);
		}


		/*
		 * We take out the basic lock here again so we can read
		 * "sheadp->sh_ioc_msg" atomically.
		 */

		(void) SHEAD_LOCK (sheadp);

		msg = sheadp->sh_ioc_msg;
		sheadp->sh_ioc_msg = NULL;

		if (retval != 0 || msg != NULL)
			break;

		/*
		 * We have been woken up either by a timeout, or for some
		 * activity at the stream head not related to us. We loop to
		 * deal with this.
		 */
	}

	/*
	 * We hold the basic lock on every path out of the loop. On failure
	 * any reply that arrived anyway is discarded.
	 */

	if (retval != 0) {

		if (errretp != NULL)
			* errretp = retval;

		if (msg != NULL)
			freemsg (msg);

		msg = NULL;
	}

	SHEAD_UNLOCK (sheadp, plbase);

	return msg;
}


/*
 * This function manages transparent ioctl () processing; it sets itself up
 * to service M_COPYIN and M_COPYOUT requests from a driver until it sees an
 * M_IOCACK or M_IOCNAK.
 */

/*
 * Overlay of the message layouts used while a transparent ioctl () is being
 * processed, so that one message block can be viewed as any of them.
 */

typedef union {
	struct iocblk	ioc;		/* M_IOCTL, M_IOCACK, M_IOCNAK */
	struct copyreq	req;		/* M_COPYIN, M_COPYOUT */
	struct copyresp	resp;		/* M_IOCDATA */
} x_ioc_t;

/*
 * Parameters and results for TRANSPARENT_IOCTL ():
 *
 * "sheadp" must be sleep-locked for ioctl () processing (SH_IOCTL_LOCK).
 * "mode" is the open mode, passed to IOCTL_SEND () for error/hangup checks.
 * "cmd" and "arg" are the user's ioctl () command and argument; "arg" is sent
 * downstream as the sole contents of a data block linked to the M_IOCTL.
 * "credp" is recorded in the request. "* rvalp" receives "ioc_rval" from an
 * M_IOCACK.
 *
 * Returns 0 or an error number: ENOSR if a message block cannot be allocated,
 * the error from IOCTL_SEND (), "ioc_error" from the M_IOCACK or M_IOCNAK,
 * EFAULT if a user copy fails, or ENXIO if an unexpected message type comes
 * back.
 *
 * Message ownership: once a message has been handed to IOCTL_SEND () or
 * putq () it belongs to the stream and must not be freed here.
 */

#if	__USE_PROTO__
__LOCAL__ int (TRANSPARENT_IOCTL) (shead_t * sheadp, int mode, int cmd,
				   _VOID * arg, cred_t * credp, int * rvalp)
#else
__LOCAL__ int
TRANSPARENT_IOCTL __ARGS ((sheadp, mode, cmd, arg, credp, rvalp))
shead_t	      *	sheadp;
int		mode;
int		cmd;
_VOID	      *	arg;
cred_t	      *	credp;
int	      *	rvalp;
#endif
{
	mblk_t	      *	msg;
	mblk_t	      *	data;
	int		retval;
	x_ioc_t	      *	ioc;

	ASSERT_SLEEP_LOCKED (sheadp, SH_IOCTL_LOCK);

	/*
	 * The message block allocated for a transparent ioctl () must be
	 * large enough to hold any of the ioctl-related message types so that
	 * modules and drivers (and the stream head) can just change the
	 * message type to reply to a message.
	 */

	if ((msg = MSGB_ALLOC (sizeof (* ioc), BPRI_LO, KM_SLEEP)) == NULL)
		return ENOSR;

	ioc = (x_ioc_t *) msg->b_rptr;
	msg->b_wptr = (unsigned char *) (ioc + 1);

	msg->b_datap->db_type = M_IOCTL;

	ioc->ioc.ioc_cmd = cmd;
	ioc->ioc.ioc_cr = credp;
	ioc->ioc.ioc_id = ++ sheadp->sh_ioc_seq;
	ioc->ioc.ioc_count = TRANSPARENT;
	ioc->ioc.ioc_rval = ioc->ioc.ioc_error = 0;


	/*
	 * A transparent ioctl () gets a single data block containing the
	 * value of "arg".
	 */

	if ((data = MSGB_ALLOC (sizeof (arg), BPRI_LO, KM_SLEEP)) == NULL) {

		retval = ENOSR;
		goto done;
	}

	* (_VOID **) data->b_rptr = arg;
	data->b_wptr += sizeof (arg);

	/*
	 * Attach the data block to the request; without this the block would
	 * never travel downstream and would be leaked.
	 */

	msg->b_cont = data;


	for (;;) {
		/*
		 * Now we send the ioctl () and wait for the acknowledgement.
		 * Transparent ioctl ()'s wait forever, but M_ERROR and
		 * hangup events are interesting to us, so we can blow out.
		 */

		if ((msg = IOCTL_SEND (sheadp, mode, msg, & retval,
				       0)) == NULL)
			return retval;

		/*
		 * The reply is normally our own message turned around, but
		 * nothing guarantees that, so always look at the message we
		 * were actually given.
		 */

		ioc = (x_ioc_t *) msg->b_rptr;


		/*
		 * Transparent ioctl ()'s dont have to worry about data coming
		 * back in the M_IOCACK message, they just have to process the
		 * M_COPYIN and M_COPYOUT requests.
		 */

		switch (msg->b_datap->db_type) {

		case M_IOCNAK:
			retval = ioc->ioc.ioc_error;
			goto done;

		case M_IOCACK:
			* rvalp = ioc->ioc.ioc_rval;
			retval = ioc->ioc.ioc_error;

			if (ioc->ioc.ioc_count > 0)
				cmn_err (CE_WARN, "Transparent ioctl () processing forbids data in M_IOCACK");
			goto done;

		case M_COPYIN:
			/*
			 * Discard whatever came up with the request before
			 * building the reply data.
			 */

			if ((data = msg->b_cont) != NULL)
				freemsg (data);

			/*
			 * The STREAMS documentation is unclear as to whether
			 * these blocks are split up or not, but it seems
			 * unlikely.
			 */

			if ((data = MSGB_ALLOC (ioc->req.cq_size, BPRI_LO,
						KM_SLEEP)) == NULL)
				retval = ENOSR;
			else if (copyin (ioc->req.cq_addr, data->b_rptr,
					 ioc->req.cq_size) != 0) {
				freemsg (data);
				data = NULL;
				retval = EFAULT;
			} else {

				data->b_wptr = data->b_rptr +
						ioc->req.cq_size;
				retval = 0;
			}

			msg->b_cont = data;
			break;

		case M_COPYOUT:
			retval = 0;

			while (ioc->req.cq_size > 0 &&
			       (data = msg->b_cont) != NULL) {
				size_t		unit;

				/*
				 * Copy a single M_DATA block at a time. After
				 * copying, we free the block.
				 */

				unit = data->b_wptr - data->b_rptr;
				if (unit > ioc->req.cq_size)
					unit = ioc->req.cq_size;

				if (copyout (data->b_rptr, ioc->req.cq_addr,
					     unit) != 0) {
					retval = EFAULT;
					break;
				}

				ioc->req.cq_size -= unit;
				ioc->req.cq_addr += unit;

				msg->b_cont = data->b_cont;
				freeb (data);
			}

			/*
			 * Throw away uncopied extra data.
			 */

			if ((data = msg->b_cont) != NULL)
				freemsg (data);

			msg->b_cont = NULL;
			break;

		default:
			cmn_err (CE_WARN, "Invalid message type %d received during transparent ioctl () processing",
				 msg->b_datap->db_type);
			retval = ENXIO;
			goto done;
		}

		/*
		 * In common code for M_COPYIN and M_COPYOUT, we turn around
		 * the request message. If the request succeeded, we wrap
		 * around to the top of the loop to serve the next request;
		 * if there has been an error, we just put the message and
		 * bail out.
		 */

		msg->b_datap->db_type = M_IOCDATA;
		ioc->resp.cp_rval = (caddr_t) retval;

		if (retval != 0) {
			/*
			 * The failure reply now belongs to the write queue,
			 * so return directly rather than via "done", which
			 * would free a message that is still queued.
			 */

			putq (W (sheadp->sh_head), msg);
			return retval;
		}
	}

done:
	/*
	 * Only reached while we still own "msg".
	 */

	freemsg (msg);

	return retval;
}


/*
 * Common code for the ioctl () message processing of the stream link and
 * unlink commands, which all send a message of a single common form
 * downstream.
 *
 * An M_IOCTL message carrying "cmd" is built, followed by a data block
 * holding a "linkblk" structure that names the bottom-most write queue of
 * the upper stream, "lower->sh_head" and "muxid". The message is sent with
 * IOCTL_SEND () using the upper stream's close timeout.
 *
 * Both stream heads must be sleep-locked by the caller, and "credp" must not
 * be NULL.
 *
 * Returns 1 if the driver answered with M_IOCACK, and 0 for anything that
 * counts as a negative acknowledgement: M_IOCNAK, no reply message from
 * IOCTL_SEND (), an unexpected reply type, or failure to allocate the
 * request. The callers treat the result purely as a boolean, so allocation
 * failure must not be reported by returning an error number here; ENOSR is
 * stored through "retvalp" instead. When a reply arrives and "retvalp" is not
 * NULL, the reply's "ioc_error" (or ENXIO for an unexpected reply type) is
 * stored through it.
 */

#if	__USE_PROTO__
__LOCAL__ int (LINK_MESSAGE) (shead_t * upper, int mode, shead_t * lower,
			      int cmd, int muxid, cred_t * credp,
			      int * retvalp)
#else
__LOCAL__ int
LINK_MESSAGE __ARGS ((upper, mode, lower, cmd, muxid, credp, retvalp))
shead_t	      *	upper;
int		mode;
shead_t	      *	lower;
int		cmd;
int		muxid;
cred_t	      *	credp;
int	      *	retvalp;
#endif
{
	mblk_t	      *	msg;
	struct iocblk *	ioc;
	struct linkblk * linkblk;
	int		ackflag;
	queue_t	      *	q;

	ASSERT_SLEEP_LOCKED (upper, SH_OPENCLOSE | SH_IOCTL_LOCK);
	ASSERT_SLEEP_LOCKED (lower, SH_OPENCLOSE | SH_IOCTL_LOCK);
	ASSERT (credp != NULL);

	/*
	 * Set ourselves up for a STREAMS ioctl (). Note that since we keep
	 * transparent ioctl () processing separate from normal I_STR code, we
	 * don't have to allocate an initial message block that can be turned
	 * into a "copyreq" or "copyresp" structure.
	 *
	 * An allocation failure is reported as a negative acknowledgement
	 * (return value 0) with ENOSR as the error number.
	 */

	if ((msg = MSGB_ALLOC (sizeof (* ioc), BPRI_LO, KM_SLEEP)) == NULL) {

		if (retvalp != NULL)
			* retvalp = ENOSR;
		return 0;
	}

	ioc = (struct iocblk *) msg->b_rptr;
	msg->b_wptr = (unsigned char *) (ioc + 1);

	msg->b_datap->db_type = M_IOCTL;

	ioc->ioc_cmd = cmd;
	ioc->ioc_cr = credp;
	ioc->ioc_id = ++ upper->sh_ioc_seq;
	ioc->ioc_count = sizeof (struct linkblk);
	ioc->ioc_rval = ioc->ioc_error = 0;


	/*
	 * Now we allocate and fill in the data part of the I_...LINK message.
	 */

	if ((msg->b_cont = MSGB_ALLOC (sizeof (struct linkblk), BPRI_LO,
				       KM_SLEEP)) == NULL) {
		freeb (msg);

		if (retvalp != NULL)
			* retvalp = ENOSR;
		return 0;
	}

	linkblk = (struct linkblk *) msg->b_cont->b_rptr;
	msg->b_cont->b_wptr = (unsigned char *) (linkblk + 1);


	/*
	 * Find the bottom-most write queue on the upper stream. To make this
	 * walk of the queue safe with respect to I_PUSH and I_POP, we take
	 * out a read lock on the stream head. See the PUSH_MODULE () and
	 * POP_MODULE () routines for more details on this.
	 *
	 * We don't use QUEUE_NEXT () because the intermediate modules are of
	 * no interest to us; we just want the driver at the bottom.
	 */

	{
		queue_t	      *	next;

		(void) SHEAD_LOCK (upper);

		next = W (upper->sh_head);

		do {
			q = next;

			(void) QFREEZE_TRACE (q, "LINK_MESSAGE");

			next = q->q_next;

			QUNFREEZE_TRACE (q, plbase);
		} while (next != NULL);

		SHEAD_UNLOCK (upper, plbase);
	}

	linkblk->l_qtop = q;
	linkblk->l_qbot = lower->sh_head;
	linkblk->l_index = muxid;


	/*
	 * Now we send the ioctl () and wait for the acknowledgement. Since
	 * there is no mechanism for managing this timeout, we will use the
	 * close timeout (which is appropriate given that this operation will
	 * often be performed as the result of a close ()).
	 */

	if ((msg = IOCTL_SEND (upper, mode, msg, retvalp,
			       upper->sh_cltime)) == NULL)
		return 0;		/* counts as a negative ack */

	/*
	 * Now we see what kind of message the driver has sent to us.
	 */

	ioc = (struct iocblk *) msg->b_rptr;

	if (retvalp != NULL)
		* retvalp = ioc->ioc_error;

	switch (msg->b_datap->db_type) {

	case M_IOCNAK:
		ackflag = 0;
		break;

	case M_IOCACK:
		/*
		 * We do not copy the "ioc_rval" member out because it is not
		 * documented as forming the return value from such a link-
		 * style ioctl () request.
		 */

		/*
		 * Since "arg" for an I_UNLINK or I_PUNLINK is not a pointer,
		 * it makes no sense for a driver to attempt to return data
		 * for the user. Since the canonical multiplexing driver code
		 * clears ioc_count and simply turns around the message, it
		 * is not a problem for there to be M_DATA messages following
		 * the M_IOCACK, but it is a problem if "ioc_count" is greater
		 * than 0.
		 */

		ackflag = 1;

		if (ioc->ioc_count > 0)
			cmn_err (CE_WARN, "Driver %s returned data with link/unlink ioctl ()",
				 q->q_qinfo->qi_minfo->mi_idname);

		break;

	default:
		cmn_err (CE_WARN, "Invalid message type %d received during link/unlink processing",
			 msg->b_datap->db_type);

		/*
		 * "retvalp" is optional (see the test above), so it must be
		 * checked here as well.
		 */

		if (retvalp != NULL)
			* retvalp = ENXIO;

		ackflag = 0;		/* treat as a negative ack */
		break;
	}

	freemsg (msg);

	return ackflag;
}


/*
 * Helper for the link/unlink code: put a stream head that was linked below a
 * multiplexor back into the ordinary, unlinked state.
 *
 * The link reference taken on the open count is dropped here. The return
 * value is non-zero when that was the last open reference, in which case the
 * caller is responsible for the final close processing.
 */

#if	__USE_PROTO__
__LOCAL__ int (SHEAD_INIT_UNLINKED) (shead_t * sheadp)
#else
__LOCAL__ int
SHEAD_INIT_UNLINKED __ARGS ((sheadp))
shead_t	      *	sheadp;
#endif
{
	pl_t		saved_pl;
	int		last_reference;
	queue_t	      *	rq;

	ASSERT_SLEEP_LOCKED (sheadp, SH_OPENCLOSE | SH_IOCTL_LOCK);

	/*
	 * Under the stream head lock, forget the link and give back the
	 * open reference that the link was holding.
	 */

	saved_pl = SHEAD_LOCK (sheadp);

	sheadp->sh_flags &= ~ SH_PLINK;
	sheadp->sh_linked = NULL;

	sheadp->sh_open_count --;
	last_reference = (sheadp->sh_open_count == 0);

	SHEAD_UNLOCK (sheadp, saved_pl);


	/*
	 * The driver no longer owns the queue pair, so both "q_ptr" members
	 * are pointed back at the stream head and the queues are
	 * reinitialised for normal (non-multiplexor) operation.
	 */

	rq = sheadp->sh_head;

	saved_pl = QFREEZE_TRACE (rq, "SHEAD_INIT_UNLINKED");

	rq->q_ptr = sheadp;
	W (rq)->q_ptr = sheadp;

	QUEUE_INIT (rq, sheadp->sh_tab, QI_NORMAL);

	QUNFREEZE_TRACE (rq, saved_pl);

	return last_reference;
}


/*
 * This function takes care of unlinking a lower stream from an upper stream.
 *
 * The caller should have both the upper and lower stream heads locked for
 * open/close processing.
 *
 * Returns 1 if the lower stream was unlinked (which may also have closed it)
 * and 0 if it was not, either because the request does not match the link or
 * because the unlink message was not positively acknowledged. When the
 * request is rejected here and "retvalp" is not NULL, EINVAL is stored
 * through it; otherwise "retvalp" is filled in by LINK_MESSAGE ().
 */

#if	__USE_PROTO__
__LOCAL__ int (LOCKED_UNLINK) (shead_t * upper, int mode, shead_t * lower,
			       int cmd, cred_t * credp, int * retvalp)
#else
__LOCAL__ int
LOCKED_UNLINK __ARGS ((upper, mode, lower, cmd, credp, retvalp))
shead_t	      *	upper;
int		mode;
shead_t	      *	lower;
int		cmd;
cred_t	      *	credp;
int	      *	retvalp;
#endif
{
	int		persistent;

	ASSERT_SLEEP_LOCKED (upper, SH_OPENCLOSE | SH_IOCTL_LOCK);
	ASSERT_SLEEP_LOCKED (lower, SH_OPENCLOSE | SH_IOCTL_LOCK);

	/*
	 * Check that the right kind of unlink command is being issued: the
	 * lower stream must be linked below "upper", and a persistent link
	 * may only be removed with I_PUNLINK while a regular link may only be
	 * removed with the non-persistent command.
	 *
	 * The failure is reported as "not unlinked" (0) rather than by
	 * returning the error number, because the callers treat the result
	 * as a boolean and would otherwise take EINVAL for success.
	 */

	persistent = (lower->sh_flags & SH_PLINK) != 0;

	if (lower->sh_linked != upper ||
	    persistent != (cmd == I_PUNLINK)) {

		if (retvalp != NULL)
			* retvalp = EINVAL;
		return 0;
	}

	/*
	 * Send the message downstream and get the return result.
	 */

	if (LINK_MESSAGE (upper, mode, lower, cmd, lower->sh_muxid, credp,
			  retvalp) == 0)
		return 0;

	/*
	 * The unlink was properly acknowledged. Note that this may cause the
	 * lower stream to close, if the link held the last open reference.
	 */

	if (SHEAD_INIT_UNLINKED (lower) != 0)
		SHEAD_DO_CLOSE (lower, mode, credp);

	return 1;
}


/*
 * Wrapper around LOCKED_UNLINK () for callers that only hold the upper
 * stream: the lower stream head is sleep-locked here, the unlink is
 * performed, and the lower stream head is unlocked again.
 *
 * The result of the lock attempt is always stored through "retvalp" first;
 * if the unlink goes ahead, LOCKED_UNLINK () is handed the same pointer.
 */

#if	__USE_PROTO__
__LOCAL__ int (SHEAD_UNLINK) (shead_t * upper, shead_t * lower, int cmd,
			      int mode, cred_t * credp, int * retvalp)
#else
__LOCAL__ int
SHEAD_UNLINK __ARGS ((upper, lower, cmd, mode, credp, retvalp))
shead_t	      *	upper;
shead_t	      *	lower;
int		cmd;
int		mode;
cred_t	      *	credp;
int	      *	retvalp;
#endif
{
	int		lock_error;
	int		unlinked;

	/*
	 * The lower stream has to be locked before it can be unlinked. Other
	 * operations may be in progress on it (open file descriptors may
	 * still refer to it, and it may be a stream pipe), and since stream
	 * deallocation is checked for in the unlock code, taking the lock is
	 * also what makes an unlink of an otherwise unreferenced stream do
	 * the right thing.
	 */

	lock_error = SHEAD_SLEEP_LOCK (lower, SH_OPENCLOSE | SH_IOCTL_LOCK,
				       0, DONT_SIGNAL);
	* retvalp = lock_error;

	if (lock_error != 0) {
		/*
		 * NOTE(review): this path returns 1, the same value used for
		 * a successful unlink - confirm that this is intended.
		 */

		cmn_err (CE_WARN, "Unable to lock stream in SHEAD_UNLINK (), error %d",
			 lock_error);
		return 1;
	}


	/*
	 * Unlink the lower stream; LOCKED_UNLINK () also takes care of
	 * closing it if its time has come.
	 */

	unlinked = LOCKED_UNLINK (upper, mode, lower, cmd, credp, retvalp);

	SHEAD_SLEEP_UNLOCK (lower, SH_OPENCLOSE | SH_IOCTL_LOCK);

	return unlinked;
}


/*
 * Loop-detection algorithm for use in SHEAD_LINK ().
 *
 * The STREAMS documentation is very vague about what constitutes a cycle in
 * the link graph, probably deliberately.
 *
 * For our purposes we consider a cycle to be caused by any path of links
 * which lead upwards (to user level) from the *device* indicated by "upper"
 * to the *device* indicated by "lower".
 *
 * Frankly, given the nature of multiplexing drivers in terms of routing
 * information, I really don't see much advantage in this; it is perfectly
 * easy for messages cycles to form other ways, and given the typical nature
 * of multiplexor services interfaces a cycle in the hierarchy need not cause
 * a loop. Still, that's the way it's specified.
 *
 * THIS IS A RECURSIVE ALGORITHM FOR DEPTH-FIRST SEARCH. This fact does not
 * really worry me in the slightest, because the depth of the recursion here
 * is no worse than the possible recursive depth of the multiplexor put ()
 * calls themselves.
 */

#if	__USE_PROTO__
__LOCAL__ int (DETECT_LOOP) (shead_t * upper, shead_t * lower)
#else
__LOCAL__ int
DETECT_LOOP __ARGS ((upper, lower))
shead_t	      *	upper;
shead_t	      *	lower;
#endif
{
	shead_t	      *	parent;
	shead_t	      *	candidate;

	/*
	 * Streams are compared by their "sh_tab" member, which stands in for
	 * the major part of the (internal) device number. Reaching the
	 * device of "lower" means a cycle has been found.
	 */

	if (upper->sh_tab == lower->sh_tab)
		return 1;

	/*
	 * If "upper" is not itself linked below another multiplexor there is
	 * nowhere further up to go.
	 */

	parent = upper->sh_linked;

	if (parent == NULL)
		return 0;

	/*
	 * Otherwise recurse into every stream that belongs to the same device
	 * as the multiplexor "upper" is linked under. Only device streams can
	 * be multiplexors, so only the device stream list is scanned.
	 */

	candidate = str_mem->sm_streams [DEV_SLIST];

	while (candidate != NULL) {

		if (candidate->sh_tab == parent->sh_tab &&
		    DETECT_LOOP (candidate, lower) != 0)
			return 1;

		candidate = candidate->sh_next;
	}

	return 0;
}


/*
 * This function deals with linking one stream below another. There are
 * numerous conditions which might prevent this from happening, including a
 * refusal from the multiplexing driver.
 *
 * The upper stream must be locked for open/close operations. The lower stream
 * will also be locked by this routine (and unlocked again before returning).
 *
 * Returns 0 if the link was established, and an error number otherwise:
 * EINVAL if the upper stream is a pipe or has no multiplexor queue
 * initialisation entries, if the link would form a cycle, if the lower stream
 * cannot be locked, is hung up, in error or already linked, or if the driver
 * refused the link without supplying an error number of its own; otherwise
 * the error number reported for the link request.
 */

#if	__USE_PROTO__
int (SHEAD_LINK) (shead_t * upper, int mode, shead_t * lower, int cmd,
		  cred_t * credp)
#else
int
SHEAD_LINK __ARGS ((upper, mode, lower, cmd, credp))
shead_t	      *	upper;
int		mode;
shead_t	      *	lower;
int		cmd;
cred_t	      *	credp;
#endif
{
	int		retval;
	muxid_t		muxid;

	ASSERT (lower != NULL);

	ASSERT_SLEEP_LOCKED (upper, SH_OPENCLOSE | SH_IOCTL_LOCK);

	if (SHEAD_IS_PIPE (upper) ||
	    upper->sh_tab->st_muxrinit == NULL ||
	    upper->sh_tab->st_muxwinit == NULL)
		return EINVAL;

	/*
	 * Perform some of the non-recursive setup (mainly locking) for the
	 * cycle-detection algorithm. We use the read/write lock on the stream
	 * head since the detection algorithm walks over the global stream
	 * list many times. Note that this single-threads checking operations,
	 * which we would also have to do if we were pushing marker bits
	 * around in a non-recursive implementation.
	 */

	(void) RW_WRLOCK (str_mem->sm_head_lock, plstr);

	retval = DETECT_LOOP (upper, lower);

	RW_UNLOCK (str_mem->sm_head_lock, plbase);

	if (retval != 0)
		return EINVAL;

	/*
	 * Generate a suitable multiplexor ID for the link. We use the device
	 * number of the lower stream as a suitable seed point.
	 */

	for (muxid = (muxid_t) lower->sh_dev ;
	     SHEAD_FIND_MUXID (upper, cmd, muxid) != NULL ; muxid ++)
		; /* DO NOTHING */

	/*
	 * We set up the lower now, before we send the I_LINK, so that by the
	 * time the driver sees the I_LINK the stream is ready for use. We
	 * NULL out the "q_ptr" member of the lower queue so that messages
	 * arriving early can be correctly handled.
	 *
	 * We take a sleep lock on the lower stream. If we can't lock it,
	 * return EINVAL since whatever error caused the failure isn't really
	 * relevant to the stream the caller is dealing with.
	 */

	if ((retval = SHEAD_SLEEP_LOCK (lower, SH_OPENCLOSE | SH_IOCTL_LOCK,
					0, DONT_SIGNAL)) != 0)
		return EINVAL;

	/*
	 * Note that the "q_ptr" field of the queue gets zeroed to avoid
	 * communicating our state to the driver. In addition, we have to
	 * count the link as an extra open now.
	 *
	 * Paranoia time; we check for errors, hangups and whether the lower
	 * stream is linked at this late stage so the cutover is atomic. This
	 * requires some cooperation from the stream head read side service
	 * routine; look to see that it tests for "sh_linked" in the service
	 * routine somewhere...
	 *
	 * A stream that is already linked below a multiplexor must be
	 * refused; linking it again would overwrite "sh_linked" and
	 * "sh_muxid" and leave the first multiplexor with a stale link.
	 */

	(void) SHEAD_LOCK (lower);

	if (SHEAD_HANGUP (lower) || _shead_error (lower, FREAD | FWRITE) ||
	    lower->sh_linked != NULL) {

		SHEAD_UNLOCK (lower, plbase);
		SHEAD_SLEEP_UNLOCK (lower, SH_OPENCLOSE | SH_IOCTL_LOCK);
		return EINVAL;
	}

	lower->sh_open_count ++;	/* duplicate reference */

	lower->sh_linked = upper;
	lower->sh_muxid = muxid;

	if (cmd == I_PLINK)
		lower->sh_flags |= SH_PLINK;

	SHEAD_UNLOCK (lower, plbase);


	(void) QFREEZE_TRACE (lower->sh_head, "LINK_STREAMS");

	lower->sh_head->q_ptr = NULL;
	W (lower->sh_head)->q_ptr = NULL;

	QUEUE_INIT (lower->sh_head, lower->sh_tab, QI_MUX);

	QUNFREEZE_TRACE (lower->sh_head, plbase);


	/*
	 * Send the message downstream and get the return result.
	 */

	if (LINK_MESSAGE (upper, mode, lower, cmd, muxid, credp,
			  & retval) == 0) {
		int		final;

		/*
		 * The driver failed the link; restore the lower stream. This
		 * won't cause the stream to close (because of our open
		 * reference) but we take care to ensure that the counts are
		 * properly maintained.
		 */

		final = SHEAD_INIT_UNLINKED (lower);

		if (final != 0)
			cmn_err (CE_WARN, "Final close in SHEAD_LINK () ????");

		/*
		 * A refusal that carries no error number of its own (for
		 * instance a negative acknowledgement whose "ioc_error" is
		 * zero) must still be reported as a failure, otherwise the
		 * caller would believe that the link exists.
		 */

		if (retval == 0)
			retval = EINVAL;
	}


	/*
	 * Unlock the lower stream for proper symmetry.
	 */

	SHEAD_SLEEP_UNLOCK (lower, SH_OPENCLOSE | SH_IOCTL_LOCK);
	return retval;
}


/*
 * This function deals with a special case in SHEAD_DO_CLOSE () below where
 * the last close of a stream pipe end causes the other end to be detached
 * from all the filesystem entries it has been mounted over.
 *
 * NOT IMPLEMENTED: the body below does not examine "other" at all; it only
 * reports a CE_PANIC-level message through cmn_err ().
 */

#if	__USE_PROTO__
__LOCAL__ void (SHEAD_PIPE_DETACH) (shead_t * __NOTUSED (other))
#else
__LOCAL__ void
SHEAD_PIPE_DETACH __ARGS ((other))
shead_t	      *	other;
#endif
{
	/*
	 * Until we know how attachments are going to be performed and what
	 * kind of structure exists, we cannot implement this function.
	 */

	cmn_err (CE_PANIC, "UNIMPLEMENTED : SHEAD_PIPE_DETACH ()");
}


/*
 * Final-close processing for a single stream head, factored out of
 * SHEAD_DO_CLOSE (). It is a separate routine because closing one end of a
 * stream pipe does not really close that side until the other end closes as
 * well, at which point both sides are run through this code.
 */

#if	__USE_PROTO__
__LOCAL__ void (SHEAD_FINAL_CLOSE) (shead_t * sheadp, int mode,
				    cred_t * credp)
#else
__LOCAL__ void
SHEAD_FINAL_CLOSE __ARGS ((sheadp, mode, credp))
shead_t	      *	sheadp;
int		mode;
cred_t	      *	credp;
#endif
{
	shead_t	      *	linked;
	queue_t	      *	module;
	queue_t	      *	driver;
	int		status;

	/*
	 * The steps of a final close are performed in the order given in the
	 * DDI/DDK entry for close(D2DK):
	 *	Non-persistent multiplexor links are unlinked.
	 *	For each module and driver from the head to the driver:
	 *		Wait for the write queue to drain.
	 *		Call close () routine.
	 *		Remove module/driver from stream.
	 *		Free remaining messages.
	 *		Deallocate queue pair.
	 *
	 * We may behave as if we held an ioctl () lock on the stream, since
	 * during final close no other legal references to the stream exist
	 * and so no other process can legitimately hold a lock on the stream
	 * head.
	 */

	for (;;) {
		linked = SHEAD_FIND_MUXID (sheadp, I_UNLINK, -1);

		if (linked == NULL)
			break;

		if (SHEAD_UNLINK (sheadp, linked, I_UNLINK, mode, credp,
				  & status) != 0)
			continue;

		/*
		 * A driver has no good reason to fail an I_UNLINK request;
		 * complain and break the link anyway.
		 */

		cmn_err (CE_WARN, "Driver failed unlink () during final close (%d)",
			 status);
		linked->sh_linked = NULL;
	}


	/*
	 * The stream head's own write queue is drained before any of the
	 * modules below it.
	 */

	DRAIN_QUEUE (sheadp, W (sheadp->sh_head));


	/*
	 * Drain and pop each module in turn until none are left.
	 */

	for (module = TOP_MODULE (sheadp) ; module != NULL ;
	     module = TOP_MODULE (sheadp)) {

		DRAIN_QUEUE (sheadp, W (module));

		(void) POP_MODULE (sheadp, module, mode, credp);
	}


	/*
	 * Pipes have no driver to close. For everything else, close the
	 * driver; once that has been done there is no reason for any context
	 * to reference this object except to unlock it.
	 */

	if (SHEAD_IS_PIPE (sheadp))
		return;

	driver = TOP_QUEUE (sheadp);

	DRAIN_QUEUE (sheadp, driver);

	driver = R (driver);

	status = (* driver->q_qinfo->qi_qclose) (driver, mode, credp);

	if (status != 0)
		cmn_err (CE_WARN, "Driver close returned %d", status);

	if ((driver->q_flag & QPROCSOFF) != 0)
		return;

	/*
	 * The driver's close routine left its put/service procedures
	 * enabled; warn about it and turn them off on its behalf.
	 */

	cmn_err (CE_WARN, "Driver %s did not call qprocsoff ()",
		 driver->q_qinfo->qi_minfo->mi_idname);
	qprocsoff (driver);
}


/*
 * Bulk of the close processing for a stream head whose open count has
 * reached zero. Used by the regular stream close path and by the stream open
 * code when certain irregularities are detected.
 *
 * The caller must hold the stream head locked for close operations. For a
 * pipe that means both ends must be locked (so that the caller can sequence
 * the lock order on the basis of the master/slave bits).
 *
 * Always returns 0.
 */

#if	__USE_PROTO__
__LOCAL__ int (SHEAD_DO_CLOSE) (shead_t * sheadp, int mode, cred_t * credp)
#else
__LOCAL__ int
SHEAD_DO_CLOSE __ARGS ((sheadp, mode, credp))
shead_t	      *	sheadp;
int		mode;
cred_t	      *	credp;
#endif
{
	shead_t	      *	peer;

	ASSERT (credp != NULL);
	ASSERT_SLEEP_LOCKED (sheadp, SH_OPENCLOSE | SH_IOCTL_LOCK);

	/*
	 * Open references, filesystem attachments and multiplexor links are
	 * tracked as separate notions, but any one of them is a reason to
	 * keep the stream around. Code that changes any of those quantities
	 * so that the stream might become unreferenced has to call this
	 * function so that the stream memory is properly reclaimed.
	 */

	ASSERT (sheadp->sh_open_count == 0);

	/*
	 * The ordinary (non-pipe) case: just run the final close.
	 *
	 * The last part of the cleanup (deallocating the queue pair and any
	 * remaining messages) is left to the unlock call which the caller
	 * will perform.
	 */

	if (! SHEAD_IS_PIPE (sheadp)) {

		SHEAD_FINAL_CLOSE (sheadp, mode, credp);
		return 0;
	}

	/*
	 * Pipes need extra care. The first end of a stream pipe to close
	 * sends an M_HANGUP message to the other end rather than actually
	 * closing; the other end then destroys both sides of the pipe when
	 * it finally closes.
	 *
	 * In addition, when an unattached end of a streams pipe is closed,
	 * the other end is automatically detached.
	 *
	 * If the other end of the pipe is linked below a multiplexor, no
	 * attempt is made to unlink it automatically; doing so would
	 * introduce a possibility of deadlock.
	 */

	peer = SHEAD_OTHER (sheadp);

	if (peer->sh_attach_count > 0)
		SHEAD_PIPE_DETACH (peer);

	if (peer->sh_open_count > 0) {
		/*
		 * The last close/detach/unlink of the other end will clean
		 * everything up.
		 */

		putctl (W (peer->sh_head), M_HANGUP);
		return 0;
	}

	/*
	 * Both ends are now unreferenced: clean up the other end first, then
	 * this one. As above, the remaining deallocation is left to the
	 * caller's unlock.
	 */

	SHEAD_FINAL_CLOSE (peer, mode, credp);
	SHEAD_FINAL_CLOSE (sheadp, mode, credp);

	return 0;
}


/*
 * Build an M_FLUSH message and send it down the stream.
 *
 * "flag" must contain at least one of the FLUSHRW bits and nothing else,
 * otherwise EINVAL is returned. A non-zero "band" adds FLUSHBAND to the flag
 * byte and appends the band number as a second byte. Returns ENOSR if the
 * message cannot be allocated, 0 once the message has been sent.
 */

#if	__USE_PROTO__
__LOCAL__ int (SHEAD_FLUSH) (shead_t * sheadp, int flag, uchar_t band)
#else
__LOCAL__ int
SHEAD_FLUSH __ARGS ((sheadp, flag, band))
shead_t	      *	sheadp;
int		flag;
uchar_t		band;
#endif
{
	mblk_t	      *	flushmsg;

	if ((flag & FLUSHRW) == 0 || (flag & ~ FLUSHRW) != 0)
		return EINVAL;

	flushmsg = MSGB_ALLOC (2, BPRI_LO, KM_SLEEP);

	if (flushmsg == NULL)
		return ENOSR;

	flushmsg->b_datap->db_type = M_FLUSH;

	if (band > 0) {
		/*
		 * Banded flush: flag byte with FLUSHBAND set, followed by
		 * the band number.
		 */

		flushmsg->b_wptr [0] = (unsigned char) (flag | FLUSHBAND);
		flushmsg->b_wptr [1] = band;
		flushmsg->b_wptr += 2;
	} else {
		flushmsg->b_wptr [0] = (unsigned char) flag;
		flushmsg->b_wptr += 1;
	}

	/*
	 * Send the message downstream...
	 */

	put (W (sheadp->sh_head), flushmsg);

	return 0;
}


/*
 * Look up a module's "modsw" entry by name. At most the first FMNAMESZ
 * characters of the name take part in the comparison. Returns NULL if no
 * entry of "modsw" matches.
 */

#if	__USE_PROTO__
modsw_t * (FIND_MODULE) (__CONST__ char * modname)
#else
modsw_t *
FIND_MODULE __ARGS ((modname))
__CONST__ char * modname;
#endif
{
	modsw_t	      *	entry = modsw;
	modsw_t	      *	limit = modsw + nmodsw;

	while (entry != limit) {
		__CONST__ char * idname;

		/*
		 * The name is taken from the module information attached to
		 * the read-side queue initialisation structure; entries
		 * without a name can never match.
		 */

		idname = entry->mod_stream->st_rdinit->qi_minfo->mi_idname;

		if (idname != NULL &&
		    strncmp (modname, idname, FMNAMESZ) == 0)
			return entry;

		entry ++;
	}

	return NULL;
}


/*
 * This function holds code to process an I_STR ioctl ().
 *
 * An M_IOCTL message is built from "strioc" (with "ic_len" bytes copied in
 * from "ic_dp" when "ic_len" is greater than zero), sent downstream with
 * IOCTL_SEND () and the reply is interpreted:
 *	M_IOCACK	"ioc_rval" is stored through "rvalp" and up to
 *			"ioc_count" bytes of reply data are copied out to
 *			"ic_dp"; the result is "ioc_error".
 *	M_IOCNAK	the result is "ioc_error", or EINVAL if the driver
 *			left "ioc_error" as zero, so that a negative
 *			acknowledgement is never reported as success.
 *	other		a warning is logged and the result is ENXIO.
 *
 * Other results are ENOSR (no memory for the request), EFAULT (a copy to or
 * from the user's address space failed) and whatever IOCTL_SEND () stores in
 * its error argument when it returns no reply.
 *
 * The caller must hold the ioctl () lock on the stream head. Note that
 * "strioc->ic_dp" is advanced as reply data is copied out.
 */

#if	__USE_PROTO__
__LOCAL__ int (ISTR_IOCTL) (shead_t * sheadp, int mode,
			    struct strioctl * strioc, cred_t * credp,
			    int * rvalp)
#else
__LOCAL__ int
ISTR_IOCTL __ARGS ((sheadp, mode, strioc, credp, rvalp))
shead_t	      *	sheadp;
int		mode;
struct strioctl
	      *	strioc;
cred_t	      *	credp;
int	      *	rvalp;
#endif
{
	__clock_t	ticks;
	mblk_t	      *	msg;
	mblk_t	      *	data;
	struct iocblk *	ioc;
	int		retval;

	ASSERT_SLEEP_LOCKED (sheadp, SH_IOCTL_LOCK);

	/*
	 * Set up timeout; "ic_timeout" == -1 means infinite
	 * timeout, while "ic_timeout" == 0 means default
	 * (which is infinite).
	 */

	if (strioc->ic_timeout != -1 && strioc->ic_timeout != 0) {
		/*
		 * We take care to deal with overflow here.
		 *
		 * NOTE(review): after saturating to (__clock_t) -1 the value
		 * is still multiplied by 1000000L, which looks as if it
		 * would overflow anyway; the right correction depends on
		 * whether "__clock_t" is signed - confirm.
		 */

		if ((ticks = strioc->ic_timeout) > (__clock_t) -1 / 1000000L)
			ticks = (__clock_t) -1;

		ticks = drv_usectohz (ticks * 1000000L);
	} else
		ticks = 0;

	if ((msg = MSGB_ALLOC (sizeof (* ioc), BPRI_LO, KM_SLEEP)) == NULL)
		return ENOSR;

	ioc = (struct iocblk *) msg->b_rptr;
	msg->b_wptr = (unsigned char *) (ioc + 1);

	msg->b_datap->db_type = M_IOCTL;

	ioc->ioc_cmd = strioc->ic_cmd;
	ioc->ioc_cr = credp;
	ioc->ioc_id = ++ sheadp->sh_ioc_seq;
	ioc->ioc_count = strioc->ic_len;
	ioc->ioc_rval = ioc->ioc_error = 0;


	/*
	 * If there is data to be sent downstream, we allocate a buffer to
	 * hold it and copy the data into that buffer.
	 */

	if (ioc->ioc_count > 0) {

		if ((data = MSGB_ALLOC (strioc->ic_len, BPRI_LO,
					KM_SLEEP)) == NULL) {

			retval = ENOSR;
			goto done;
		}

		/*
		 * The data block is attached before the copy so that the
		 * common exit frees it along with the request.
		 */

		msg->b_cont = data;
		data->b_wptr += strioc->ic_len;

		if (copyin (strioc->ic_dp, data->b_rptr,
			    strioc->ic_len) != 0) {

			retval = EFAULT;
			goto done;
		}
	}

	/*
	 * Now we send the ioctl () and wait for the acknowledgement.
	 */

	if ((msg = IOCTL_SEND (sheadp, mode, msg, & retval, ticks)) == NULL)
		return retval;


	/*
	 * What do we do with the results? We copy at most "ioc_count" bytes
	 * of data back into the user's address space if this is M_IOCACK.
	 */

	ioc = (struct iocblk *) msg->b_rptr;

	switch (msg->b_datap->db_type) {

	case M_IOCNAK:
		/*
		 * A negative acknowledgement is always a failure; if the
		 * driver did not say why, report EINVAL.
		 */

		retval = ioc->ioc_error != 0 ? ioc->ioc_error : EINVAL;
		break;

	case M_IOCACK:
		* rvalp = ioc->ioc_rval;
		retval = ioc->ioc_error;

		data = msg->b_cont;

		while (ioc->ioc_count > 0) {
			size_t		len;

			if (data == NULL) {

				cmn_err (CE_WARN, "ISTR_IOCTL : Insufficient M_DATA blocks for ioc_count");
				break;
			}

			/*
			 * Copy out one block at a time, never more than the
			 * driver said was valid.
			 */

			len = data->b_wptr - data->b_rptr;

			if (len > ioc->ioc_count)
				len = ioc->ioc_count;

			if (len > 0 &&
			    copyout (data->b_rptr, strioc->ic_dp,
				     len) != 0) {

				retval = EFAULT;
				goto done;
			}

			ioc->ioc_count -= len;
			data = data->b_cont;
			strioc->ic_dp += len;
		}
		break;

	default:
		cmn_err (CE_WARN, "Invalid message type %d received during I_STR processing",
			 msg->b_datap->db_type);
		retval = ENXIO;
		break;
	}

done:
	freemsg (msg);

	return retval;
}


/*
 * Here are the details of the implementation of sigpoll_t.
 *
 * One record exists for each process registered for signal notification on a
 * stream head; the records form a singly-linked list whose first element is
 * the stream head's "sh_sigs" member.
 */

struct sigpoll {
	sigpoll_t     *	sp_next;	/* single-threaded */
	_VOID	      *	sp_proc;	/* from proc_ref () */
	short		sp_events;	/* events to signal per <stropts.h> */
};


/*
 * Deliver the signal carried by an M_SIG or M_PCSIG message that has reached
 * the stream head. The stream head lock is held while the signal is
 * delivered.
 */

#if	__USE_PROTO__
void (SHEAD_SIGNAL) (shead_t * sheadp, uchar_t signal)
#else
void
SHEAD_SIGNAL __ARGS ((sheadp, signal))
shead_t	      *	sheadp;
uchar_t		signal;
#endif
{
	pl_t		saved_pl;
	sigpoll_t     *	reg;

	saved_pl = SHEAD_LOCK (sheadp);

	if (signal != SIGPOLL) {
		/*
		 * Any other signal goes to the controlling process group of
		 * the stream. A stream which is not a controlling tty has no
		 * such group, and nothing is sent.
		 */

		if (sheadp->sh_pgrp != 0)
			proc_kill_group (sheadp->sh_pgrp, signal);
	} else {
		/*
		 * SIGPOLL goes only to the processes which asked for it with
		 * I_SETSIG, and of those only to the ones that registered an
		 * interest in S_MSG.
		 */

		reg = sheadp->sh_sigs;

		while (reg != NULL) {

			if ((reg->sp_events & S_MSG) != 0)
				proc_signal (reg->sp_proc, signal);

			reg = reg->sp_next;
		}
	}

	SHEAD_UNLOCK (sheadp, saved_pl);
}


/*
 * Single point of access for user-level code that wants to look at the front
 * of the stream head message queue. Besides returning the first message it
 * keeps the STREAMS scheduling policy working (by setting QWANTR when the
 * queue is found empty) and processes M_SIG messages in-band.
 *
 * The caller should have the stream head read queue frozen. Returns the
 * first message that is not an M_SIG, which is left on the queue, or NULL if
 * there is none.
 */

#if	__USE_PROTO__
__LOCAL__ mblk_t * (SHEAD_FIRSTMSG) (shead_t * sheadp)
#else
__LOCAL__ mblk_t *
SHEAD_FIRSTMSG __ARGS ((sheadp))
shead_t	      *	sheadp;
#endif
{
	queue_t	      *	rq = sheadp->sh_head;
	mblk_t	      *	first;

	QFROZEN_TRACE (rq, "SHEAD_FIRSTMSG");

	for (;;) {
		first = rq->q_first;

		if (first == NULL)
			break;

		if (first->b_datap->db_type != M_SIG)
			return first;

		/*
		 * An M_SIG at the front of the queue is consumed here: take
		 * it off the queue, deliver the signal number held in its
		 * first data byte, and look at the queue again.
		 */

		rmvq (rq, first);
		SHEAD_SIGNAL (sheadp, * first->b_rptr);
		freemsg (first);
	}

	/*
	 * Nothing to read; record that a reader wants data.
	 */

	rq->q_flag |= QWANTR;

	return NULL;
}


/*
 * This function does the necessary work to implement POLLWRBAND, which tests
 * to see if any of the previously written-to bands of flow in the next
 * downstream queue with a service procedure are not flow controlled.
 *
 * Returns 1 if a band can be written (including the cases where there is no
 * downstream queue with a service procedure, or that queue has no bands),
 * and 0 if every band is flow controlled. In the latter case QB_WANTW is set
 * on the highest band so that the caller is back-enabled when it becomes
 * writeable.
 */

#if	__USE_PROTO__
__LOCAL__ int (POLL_WRBAND) (queue_t * q)
#else
__LOCAL__ int
POLL_WRBAND __ARGS ((q))
queue_t	      *	q;
#endif
{
	pl_t		prev_pl;
	qband_t	      *	qbandp;

	prev_pl = QFREEZE_TRACE (q, "POLL_WRBAND");

	do {
		q = QUEUE_NEXT (q);

		/*
		 * NOTE(review): nothing is unfrozen on this path; presumably
		 * QUEUE_NEXT () has already released the freeze when it
		 * returns NULL - confirm.
		 */

		if (q == NULL)
			return 1;

	} while (q->q_qinfo->qi_srvp == NULL);


	/*
	 * We have found a queue with a service procedure, and have it frozen.
	 * If there are no bands, then we can write a band...
	 */

	if (q->q_nband == 0) {

		QUNFREEZE_TRACE (q, prev_pl);
		return 1;
	}

	qbandp = QUEUE_BAND (q, q->q_nband);

	ASSERT (qbandp != NULL);

	/*
	 * Since if a band is blocked implies lower bands are blocked, we can
	 * simply test the highest-numbered band: if it is not full, there is
	 * a band that can be written.
	 */

	if ((qbandp->qb_flag & QB_FULL) == 0) {

		QUNFREEZE_TRACE (q, prev_pl);
		return 1;
	}


	/*
	 * If all are full, request notification via back-enabling when the
	 * highest band becomes writeable.
	 */

	qbandp->qb_flag |= QB_WANTW;

	QUNFREEZE_TRACE (q, prev_pl);
	return 0;
}


/*
 * Work out which of the requested events for a SIGPOLL request are satisfied
 * right now. Returns the mask of satisfied events, or 0 if none are.
 */

#if	__USE_PROTO__
__LOCAL__ short (SHEAD_POLL_CHECK) (shead_t * sheadp, short events)
#else
__LOCAL__ short
SHEAD_POLL_CHECK __ARGS ((sheadp, events))
shead_t	      *	sheadp;
short		events;
#endif
{
	queue_t	      *	rq = sheadp->sh_head;
	short		ready = 0;
	pl_t		saved_pl;
	mblk_t	      *	first;

	/*
	 * First the write-side conditions, which do not depend on what is
	 * queued at the stream head.
	 *
	 * For output, canputnext () is used deliberately: it sets the
	 * back-enable flag, so that we are properly notified when the
	 * condition becomes true.
	 *
	 * The banded test is more involved, since it asks whether *any*
	 * downstream priority band is writeable; POLL_WRBAND () examines the
	 * next queue with a service procedure to find out.
	 */

	if ((events & __POLL_OUTPUT) != 0 && canputnext (W (rq)))
		ready |= __POLL_OUTPUT;

	if ((events & __POLL_WRBAND) != 0 && POLL_WRBAND (W (rq)))
		ready |= __POLL_WRBAND;


	/*
	 * Error and hangup state, read under the stream head lock. A hangup
	 * cancels any write-side readiness found above.
	 */

	saved_pl = SHEAD_LOCK (sheadp);

	if ((events & S_ERROR) != 0 &&
	    (sheadp->sh_rerrcode != 0 || sheadp->sh_werrcode != 0))
		ready |= POLLERR;

	if ((events & S_HANGUP) != 0 &&
	    (sheadp->sh_flags & SH_HANGUP) != 0) {

		ready |= POLLHUP;
		ready &= ~ (__POLL_OUTPUT | __POLL_WRBAND);
	}

	SHEAD_UNLOCK (sheadp, saved_pl);


	/*
	 * Finally the read-side conditions, which depend on the type and
	 * band of the first queued message (if there is one and it is a data
	 * message).
	 */

	saved_pl = QFREEZE_TRACE (rq, "SHEAD_POLL_CHECK");

	first = SHEAD_FIRSTMSG (sheadp);

	if (first != NULL && datamsg (first->b_datap->db_type)) {

		if (pcmsg (first->b_datap->db_type)) {

			if ((events & __POLL_HIPRI) != 0)
				ready |= __POLL_HIPRI;
		} else {

			if ((events & __POLL_INPUT) != 0)
				ready |= __POLL_INPUT;

			if ((events & __POLL_RDNORM) != 0 &&
			    first->b_band == 0)
				ready |= __POLL_RDNORM;

			if ((events & __POLL_RDBAND) != 0 &&
			    first->b_band > 0)
				ready |= __POLL_RDBAND;
		}
	}

	QUNFREEZE_TRACE (rq, saved_pl);

	return ready;
}


/*
 * This function attempts to locate any sigpoll record for the current
 * process. The stream head needs to be locked out against modification of
 * the signal list, which means an IOCTL lock is sufficient.
 *
 * Returns the record whose "sp_proc" matches the value proc_ref () gives for
 * the caller, or NULL if the list holds no such record.
 */

#if	__USE_PROTO__
__LOCAL__ sigpoll_t * (FIND_SIGPOLL) (shead_t * sheadp)
#else
__LOCAL__ sigpoll_t *
FIND_SIGPOLL __ARGS ((sheadp))
shead_t	      *	sheadp;
#endif
{
	_VOID	      *	proc;
	sigpoll_t     *	scan;

	ASSERT_SLEEP_LOCKED (sheadp, SH_IOCTL_LOCK);

	/*
	 * See if the process is already registered (so that we can modify or
	 * free an existing record).
	 *
	 * The records are individually allocated list cells chained through
	 * "sp_next", so the walk must follow that link rather than step
	 * through memory as if "sh_sigs" were an array.
	 */

	proc = proc_ref ();

	for (scan = sheadp->sh_sigs ; scan != NULL ; scan = scan->sp_next)
		if (scan->sp_proc == proc)
			break;

	/*
	 * The reference was only needed for the comparison.
	 */

	proc_unref (proc);

	return scan;
}


/*
 * This function deals with (de)registering a process for SIGPOLL. The caller
 * should have an IOCTL lock on the stream head so that there are no other
 * contexts which could modify the list of registered signals.
 *
 * A non-zero "events" registers the calling process, or replaces the event
 * mask of its existing record; the new mask is then checked at once and any
 * already-satisfied event is signalled. A zero "events" removes the calling
 * process's record.
 *
 * Returns 0 on success, EINVAL if "events" is zero and the process is not
 * registered, and EAGAIN if no memory is available for a new record.
 */

#if	__USE_PROTO__
__LOCAL__ int (REGISTER_SIGPOLL) (shead_t * sheadp, short events)
#else
__LOCAL__ int
REGISTER_SIGPOLL __ARGS ((sheadp, events))
shead_t	      *	sheadp;
short		events;
#endif
{
	_VOID	      *	proc;
	sigpoll_t     *	scan;
	sigpoll_t     *	prev;

	ASSERT_SLEEP_LOCKED (sheadp, SH_IOCTL_LOCK);

	/*
	 * See if the process is already registered (so that we can modify or
	 * free an existing record). We don't use the FIND_SIGPOLL () routine
	 * because we want to locate the previous record also.
	 *
	 * The records are individually allocated list cells, so the walk
	 * follows "sp_next".
	 */

	proc = proc_ref ();

	for (prev = NULL, scan = sheadp->sh_sigs ; scan != NULL ;
	     prev = scan, scan = scan->sp_next) {

		if (scan->sp_proc == proc) {
			/*
			 * We have found a preexisting record... now modify
			 * or free it. The record holds a process reference
			 * of its own, so the one taken for the search can be
			 * given back right away.
			 */

			proc_unref (proc);

			(void) SHEAD_LOCK (sheadp);

			if ((scan->sp_events = events) == 0) {
				/*
				 * Free the cell after unlinking it from the
				 * list of registered processes. We take out
				 * a lock on the stream head to protect
				 * against streams-level contexts walking the
				 * list.
				 */

				if (prev == NULL)
					sheadp->sh_sigs = scan->sp_next;
				else
					prev->sp_next = scan->sp_next;

				SHEAD_UNLOCK (sheadp, plbase);

				/*
				 * We unlock the stream head before calling
				 * the heap manager as a matter of courtesy.
				 * The second proc_unref () gives back the
				 * reference that the record was holding.
				 */

				kmem_free (scan, sizeof (* scan));
				proc_unref (proc);
				return 0;
			}

			SHEAD_UNLOCK (sheadp, plbase);
			goto sigcheck;
		}
	}


	if (events == 0) {
		/*
		 * Not found, nothing to do. That is an error according to the
		 * streamio (7) man pages. No record is going to keep the
		 * process reference taken above, so release it.
		 */

		proc_unref (proc);
		return EINVAL;
	}


	/*
	 * We need to make a new event.
	 */

	if ((scan = (sigpoll_t *) kmem_alloc (sizeof (* scan),
					      KM_SLEEP)) == NULL) {

		proc_unref (proc);
		return EAGAIN;
	}

	/*
	 * The new record takes over the process reference.
	 */

	scan->sp_proc = proc;
	scan->sp_events = events;
	scan->sp_next = sheadp->sh_sigs;

	/*
	 * Since we are the only process context modifying the list, we only
	 * need to lock the last stage of the insert against interrupt-level
	 * contexts walking the list.
	 */

	(void) SHEAD_LOCK (sheadp);

	sheadp->sh_sigs = scan;

	SHEAD_UNLOCK (sheadp, plbase);

sigcheck:
	if ((events = SHEAD_POLL_CHECK (sheadp, scan->sp_events)) != 0) {
		/*
		 * We have a winner! Check for the SIGURG special case, and
		 * otherwise/also send SIGPOLL via proc_signal ().
		 */

		if ((events & __POLL_RDBAND) != 0 &&
		    (scan->sp_events & S_BANDURG) != 0) {

			proc_signal (scan->sp_proc, SIGURG);
			events &= ~ __POLL_RDBAND;
		}

		if (events != 0)
			proc_signal (scan->sp_proc, SIGPOLL);
	}

	return 0;
}


/*
 * Intended to map a file descriptor onto a stream head by way of the
 * abstract file-description functions declared in <sys/fhsys.h>; the caller
 * must lock the stream somehow.
 *
 * As written here no stream head is ever produced: the result is always
 * NULL, with EBADF stored through "retvalp" when fd_get () finds nothing for
 * "fd" and EINVAL stored otherwise.
 */


#if	__USE_PROTO__
__LOCAL__ shead_t * (FH_TO_STREAM) (int fd, int * retvalp)
#else
__LOCAL__ shead_t *
FH_TO_STREAM __ARGS ((fd, retvalp))
int		fd;
int	      *	retvalp;
#endif
{
	ASSERT (retvalp != NULL);

	/*
	 * Only the error number distinguishes an unknown descriptor from a
	 * known one.
	 */

	* retvalp = (fd_get (fd) == NULL) ? EBADF : EINVAL;

	return NULL;
}


/*
 * The following structure is used by the routines below to deal with the
 * M_PASSFP message type. The structure of the data contained in that message
 * is completely opaque to STREAMS routines, so we keep the definition local.
 */

struct passfp {
	__fd_t	      *	fdp;	/* system file table entry address */
	n_uid_t		uid;	/* sender's user id; FH_SEND () stores cr_uid */
	n_gid_t		gid;	/* sender's group id; FH_SEND () stores cr_gid */
};


/*
 * Receive a file descriptor that the other end of a stream pipe passed with
 * an I_SENDFD ioctl (), and create a local descriptor for the same file.
 *
 * "sheadp" is the receiving stream head, "mode" the open mode of the file
 * (FREAD is required) and "recvp" the "strrecvfd" structure to fill in.
 *
 * Returns 0 on success, otherwise an error number: EINVAL if the stream is
 * not a pipe, EBADF if not open for reading, EBADMSG if the first queued
 * message is not M_PASSFP, EMFILE if no descriptor slot is free, EOVERFLOW
 * if the sender's ids do not fit the old-style id types, or whatever
 * SHEAD_WAIT_NONBLOCK () or fd_recv () report.
 */

#if	__USE_PROTO__
__LOCAL__ int (FH_RECV) (shead_t * sheadp, int mode, struct strrecvfd * recvp)
#else
__LOCAL__ int
FH_RECV __ARGS ((sheadp, mode, recvp))
shead_t	      *	sheadp;
int		mode;
struct strrecvfd
	      *	recvp;
#endif
{
	mblk_t	      *	mp;
	struct passfp *	passed;
	int		error;

	ASSERT (sheadp != NULL);
	ASSERT (recvp != NULL);

	if (! SHEAD_IS_PIPE (sheadp))
		return EINVAL;

	if ((mode & FREAD) == 0)
		return EBADF;

	/*
	 * Only the read side matters from here on.
	 */

	mode &= ~ FWRITE;

	/*
	 * Wait until the stream head has a message queued. We leave the
	 * loop with the read queue still frozen.
	 */

	for (;;) {
		(void) QFREEZE_TRACE (sheadp->sh_head, "FH_RECV");

		mp = SHEAD_FIRSTMSG (sheadp);
		if (mp != NULL)
			break;

		/*
		 * Queue empty: take the stream head lock before thawing the
		 * queue so that the wait below starts with the lock held.
		 */

		(void) SHEAD_LOCK (sheadp);

		QUNFREEZE_TRACE (sheadp->sh_head, plstr);

		/*
		 * Sleep unless non-blocking mode was requested or the stream
		 * has been hung up, then look again.
		 */

		error = SHEAD_WAIT_NONBLOCK (sheadp, mode, SH_READ_WAIT,
					     CHECK_SIGNALS);
		if (error != 0)
			return error;
	}

	/*
	 * Only dequeue the message once we know it is a passed file and
	 * that we have a local descriptor slot to put it in.
	 */

	error = 0;

	if (mp->b_datap->db_type != M_PASSFP)
		error = EBADMSG;
	else if ((recvp->fd = fd_get_free ()) == ERROR_FD)
		error = EMFILE;
	else
		rmvq (sheadp->sh_head, mp);

	QUNFREEZE_TRACE (sheadp->sh_head, plbase);

	if (error != 0)
		return error;

	/*
	 * From here on the message belongs to us; it is freed on every path
	 * below, whether or not installing the descriptor succeeds.
	 */

	passed = (struct passfp *) mp->b_rptr;

	error = fd_recv (recvp->fd, passed->fdp);

	if (error == 0) {
		/*
		 * Narrow the ids to the old-style types and detect values
		 * that do not survive the conversion.
		 */

		if ((recvp->uid = (o_uid_t) passed->uid) != passed->uid)
			error = EOVERFLOW;
		else if ((recvp->gid = (o_gid_t) passed->gid) != passed->gid)
			error = EOVERFLOW;
	}

	freemsg (mp);
	return error;
}


/*
 * Send a file descriptor to the process at the other end of a stream pipe
 * by queueing an M_PASSFP message carrying a "struct passfp".
 *
 * "sheadp" is the sending stream head, "fd" the user-level descriptor to
 * pass, and "credp" the sender's credentials (user and group id are copied
 * into the message).
 *
 * Returns 0 on success, EINVAL if the stream is not a pipe, EBADF if "fd"
 * is not a valid descriptor, EAGAIN if the destination is flow-controlled or
 * no message block could be allocated, or the error reported by
 * SHEAD_ERRHUP_LOCKED ().
 */

#if	__USE_PROTO__
__LOCAL__ int (FH_SEND) (shead_t * sheadp, int fd, cred_t * credp)
#else
__LOCAL__ int
FH_SEND __ARGS ((sheadp, fd, credp))
shead_t	      *	sheadp;
int		fd;
cred_t	      *	credp;
#endif
{
	__fd_t	      *	filep;
	shead_t	      *	peer;
	mblk_t	      *	mp;
	struct passfp *	payload;
	int		error;

	ASSERT (sheadp != NULL);
	ASSERT (credp != NULL);

	if (! SHEAD_IS_PIPE (sheadp))
		return EINVAL;

	/*
	 * Turn the user-level descriptor into the kernel-level file table
	 * entry that actually gets passed.
	 */

	filep = fd_get (fd);
	if (filep == NULL)
		return EBADF;

	peer = SHEAD_OTHER (sheadp);

	if (! canput (W (peer->sh_head)))
		return EAGAIN;

	mp = MSGB_ALLOC (sizeof (* payload), BPRI_LO, KM_SLEEP);
	if (mp == NULL)
		return EAGAIN;

	/*
	 * Build the message. From now on the block must be freed if it
	 * cannot be delivered.
	 */

	payload = (struct passfp *) mp->b_rptr;
	payload->fdp = filep;
	payload->uid = credp->cr_uid;
	payload->gid = credp->cr_gid;

	mp->b_wptr = (unsigned char *) (payload + 1);
	mp->b_datap->db_type = M_PASSFP;

	(void) SHEAD_LOCK (sheadp);

	error = SHEAD_ERRHUP_LOCKED (sheadp, FWRITE);
	if (error != 0) {
		/*
		 * NOTE(review): no unlock here; other callers of
		 * SHEAD_ERRHUP_LOCKED () in this file also skip the unlock
		 * on its failure path, so it presumably releases the lock
		 * itself. Confirm against its definition.
		 */

		freemsg (mp);
		return error;
	}

	putq (W (peer->sh_head), mp);

	SHEAD_UNLOCK (sheadp, plbase);
	return error;
}


/*
 * This function factors out the details of copying data out from kernel space
 * into a "strbuf" stream buffer structure in user space.
 *
 * "bufp" describes the user buffer ("buf" and "maxlen" are read, "len" is
 * written); "mpp" points at the head of the message, which is updated as
 * fully-consumed blocks are unlinked and freed; "data" selects the part to
 * copy, CONTROL_PART or DATA_PART.
 *
 * The return value of this function is 0 on success or EFAULT if a
 * copyout () to user space fails. Callers in this file only test the result
 * against 0.
 */

enum {
	CONTROL_PART,
	DATA_PART
};

#if	__USE_PROTO__
__LOCAL__ int (COPYOUT_BUF) (struct strbuf * bufp, mblk_t ** mpp, int data)
#else
__LOCAL__ int
COPYOUT_BUF __ARGS ((bufp, mpp, data))
struct strbuf *	bufp;
mblk_t	     **	mpp;
int		data;
#endif
{
	mblk_t	      *	prev;
	mblk_t	      *	scan;
	int		remaining;
	caddr_t		outaddr;

	ASSERT (mpp != NULL);

	/*
	 * The first thing we do is check some special values; if a "strbuf"
	 * entry is NULL or has its "maxlen" member set to "-1", we do nothing
	 * with the message.
	 */

	if (bufp == NULL || bufp->maxlen == -1)
		return 0;

	/*
	 * Next, if this is a data-part copy, skip any initial M_PROTO or
	 * M_PCPROTO message blocks. There *should* only ever be one of these
	 * at the front of a message, but we are required to effectively
	 * coalesce multiple control blocks.
	 */

	prev = NULL;
	scan = * mpp;

	if (data == DATA_PART) {
		/*
		 * Find the data portion of the message, if any.
		 */

		while (scan != NULL) {

			if (scan->b_datap->db_type == M_DATA)
				break;
			scan = (prev = scan)->b_cont;
		}
	} else
		/*
		 * A message that starts with M_DATA has no control part. The
		 * NULL test covers a message that has already been consumed
		 * completely.
		 */

		if (scan != NULL && scan->b_datap->db_type == M_DATA)
			scan = NULL;

	/*
	 * If there is no control (or data, as appropriate) part to the
	 * message, then we set the "len" member of the "strbuf" to -1.
	 */

	if (scan == NULL) {

		bufp->len = -1;
		return 0;
	}


	/*
	 * Now we do the actual copy; the form of this loop is organized so
	 * that zero-length message blocks will be consumed if "maxlen" is
	 * set to 0. This is important not only to comply with the manual page
	 * for getmsg ()/getpmsg (), but also ensures that trailing zero-
	 * length blocks at the end of a message get cleaned up properly.
	 */

	bufp->len = 0;
	remaining = bufp->maxlen;
	outaddr = (caddr_t) bufp->buf;

	for (;;) {
		size_t		copylen = scan->b_wptr - scan->b_rptr;
		mblk_t	      *	next;

		if (copylen > remaining)
			copylen = remaining;

		if (copylen > 0) {
			/*
			 * Copy the data to the user. Don't forget that
			 * copyout () is like bcopy () in that the arguments
			 * are src, dest, len !
			 */

			if (copyout (scan->b_rptr, outaddr, copylen) != 0)
				return EFAULT;

			/*
			 * Advance the user-space cursor as well as the
			 * kernel one, so that the next message block is
			 * appended after this one rather than written over
			 * the start of the user buffer.
			 */

			bufp->len += copylen;
			scan->b_rptr += copylen;
			outaddr += copylen;
			remaining -= copylen;
		}

		if (scan->b_rptr != scan->b_wptr) {
			/*
			 * Since this message block was not fully consumed, we
			 * can infer that we have copied all the data we can.
			 */

			ASSERT (remaining == 0);
			break;
		}


		/*
		 * This message block has been consumed; unlink and free it.
		 */

		next = scan->b_cont;
		if (prev == NULL)
			* mpp = next;
		else
			prev->b_cont = next;
		freeb (scan);


		/*
		 * Get ready to go around the loop again; if we are copying
		 * the control part of a message, we have to test for the end
		 * of the control part here.
		 */

		if ((scan = next) == NULL)
			break;

		if (data == CONTROL_PART && scan->b_datap->db_type == M_DATA)
			break;
	}

	return 0;
}


/*
 * This is a buffer callback function used by SHEAD_PEEK () to deal with
 * dupmsg () failures via bufcall ().
 */

#if	__USE_PROTO__
__LOCAL__ void peek_bufcall_func (_VOID * arg)
#else
__LOCAL__ void
peek_bufcall_func (arg)
_VOID	      *	arg;
#endif
{
	pl_t		prev_pl;
	shead_t	      *	sheadp = (shead_t *) arg;

	/*
	 * We freeze the stream head read queue to synchronize ourselves with
	 * the SHEAD_PEEK () and avoid race conditions where we try and wake
	 * up the process that scheduled us before they have slept.
	 */

	prev_pl = QFREEZE_TRACE (sheadp->sh_head, "peek_bufcall_func");

	sheadp->sh_read_bufcall = 0;

	SHEAD_WAKE (sheadp, SH_PEEK_WAIT);

	QUNFREEZE_TRACE (sheadp->sh_head, prev_pl);
}


/*
 * Helper for I_PEEK and read ()-like functions that want to recover from an
 * out-of-memory situation by sleeping briefly until buffers are available.
 *
 * A bufcall () is scheduled unless one is already outstanding for this
 * stream head, and then the caller is put to sleep waiting for it. The
 * caller must hold the stream head lock on entry; "mode" is not used.
 *
 * This function returns with the stream head read queue unlocked. The
 * result is ENOSR if the bufcall () could not be scheduled, otherwise
 * whatever SHEAD_WAIT_NONBLOCK () returns (0 on success).
 */

#if	__USE_PROTO__
__LOCAL__ int (SHEAD_READ_BUFCALL) (shead_t * sheadp, int __NOTUSED (mode))
#else
__LOCAL__ int
SHEAD_READ_BUFCALL __ARGS ((sheadp, mode))
shead_t	      *	sheadp;
int		mode;
#endif
{
	SHEAD_ASSERT_LOCKED (sheadp);

	if (sheadp->sh_read_bufcall == 0) {
		/*
		 * No callback is pending yet, so ask for one.
		 */

		sheadp->sh_read_bufcall = bufcall (1024, BPRI_LO,
						   peek_bufcall_func,
						   sheadp);

		if (sheadp->sh_read_bufcall == 0) {
			SHEAD_UNLOCK (sheadp, plbase);
			return ENOSR;
		}
	}

	return SHEAD_WAIT_NONBLOCK (sheadp, FREAD, SH_PEEK_WAIT,
				    CHECK_SIGNALS);
}


/*
 * This function implements I_PEEK read-ahead for streams. The caller should
 * have the stream head locked for read/write access.
 *
 * "peek" is the kernel copy of the user's "strpeek" request; its "flags"
 * member must be 0 or RS_HIPRI on entry and is rewritten to describe the
 * message found. "rvalp" receives the ioctl () return value: 0 if no
 * suitable message is queued, 1 if one was found.
 *
 * The function result is 0 on success, EINVAL for bad flags, EBADMSG if the
 * first message is not M_DATA, M_PROTO or M_PCPROTO, EFAULT if copying to
 * user space fails, or an error from SHEAD_READ_BUFCALL ().
 */

#if	__USE_PROTO__
__LOCAL__ int (SHEAD_PEEK) (shead_t * sheadp, struct strpeek * peek,
			    int * rvalp)
#else
__LOCAL__ int
SHEAD_PEEK __ARGS ((sheadp, peek, rvalp))
shead_t	      *	sheadp;
struct strpeek * peek;
int	      *	rvalp;
#endif
{
	mblk_t	      *	msg;
	int		retval;

	ASSERT (sheadp != NULL);
	ASSERT (peek != NULL);
	ASSERT (rvalp != NULL);

	if (peek->flags != 0 && peek->flags != RS_HIPRI)
		return EINVAL;

	/*
	 * Look at the stream head to see if there are any queued messages.
	 *
	 * Some notes about this are in order; as mentioned in the discussion
	 * on locking elsewhere, the possibility of page fault resolution
	 * during copies to user space complicates life, because copy routines
	 * may sleep while a page is made resident.
	 *
	 * This affects routines which read from the stream head in various
	 * ways, some of which are outlined in the general section on locking.
	 * Other than data consistency, it also affects the way messages are
	 * deallocated; this routine wants to take a copy of the data in a
	 * message, but we need to ensure that the message will not be
	 * deallocated not just by other processes but also by actions at the
	 * stream level such as M_FLUSH.
	 *
	 * Other than getting into a web of locking flags and reference counts
	 * (there are already too many of those), the simplest ways of dealing
	 * with this are a) create a duplicate reference to the message data
	 * with dupmsg (), and b) simply dequeue the message and put it back
	 * when we're done.
	 *
	 * Neither alternative is without unpleasant consequences; a) has to
	 * deal with a lack of available storage to duplicate the message
	 * blocks, and b) has to deal with such things as reads blocking while
	 * the message is dequeued.
	 *
	 * a) and b) seem to have more-or-less equal implementation costs, but
	 * while a) is guaranteed to maintain the semantics of all stream
	 * operations, b) is not. Unless we can predict all the consequences
	 * of b) and either work around them or determine that they are benign
	 * then a) seems preferable.
	 */

	for (;;) {
		(void) QFREEZE_TRACE (sheadp->sh_head, "SHEAD_PEEK");

		if ((msg = SHEAD_FIRSTMSG (sheadp)) == NULL) {
no_message:
			/*
			 * No go; arrange for the value 0 to be returned to
			 * the caller of the ioctl (). This label is also the
			 * target of the RS_HIPRI rejection below, which
			 * arrives with the queue still frozen.
			 */

			QUNFREEZE_TRACE (sheadp->sh_head, plbase);
			* rvalp = 0;

			return 0;
		}

		switch (msg->b_datap->db_type) {

		case M_DATA:
		case M_PROTO:
			/*
			 * If the caller has asked for high-priority messages
			 * only, then we arrange to return 0.
			 */

			if ((peek->flags & RS_HIPRI) != 0)
				goto no_message;
			break;

		case M_PCPROTO:
			/*
			 * High-priority messages satisfy either request.
			 */
			break;

		default:
			/*
			 * Any other message type cannot be peeked at.
			 */
			QUNFREEZE_TRACE (sheadp->sh_head, plbase);
			return EBADMSG;
		}


		/*
		 * Attempt to obtain a duplicate reference to this message.
		 * On success "msg" now refers to the duplicate, not to the
		 * message that is still on the queue.
		 */

		if ((msg = dupmsg (msg)) != NULL) {

			QUNFREEZE_TRACE (sheadp->sh_head, plbase);
			break;
		}


		/*
		 * Wait a short time for buffers to be available. To do this,
		 * we transfer our locking attention from the queue to the
		 * stream head: the stream head lock is taken before the
		 * queue is unfrozen.
		 */

		(void) SHEAD_LOCK (sheadp);

		QUNFREEZE_TRACE (sheadp->sh_head, plstr);

		if ((retval = SHEAD_READ_BUFCALL (sheadp, FREAD)) != 0)
			return retval;
	}


	/*
	 * Now we have a duplicate copy of a message to transfer to user
	 * space. From this point on, any attempt to exit from this routine
	 * must take care of freeing this message. However, we don't have to
	 * touch the queue again.
	 */

	peek->flags = msg->b_datap->db_type == M_PCPROTO ? RS_HIPRI : 0;

	/*
	 * Copy the control part first, then the data part; the second copy
	 * is skipped if the first one fails.
	 */

	retval = (COPYOUT_BUF (& peek->ctlbuf, & msg, CONTROL_PART) != 0 ||
		  COPYOUT_BUF (& peek->databuf, & msg,
			       DATA_PART) != 0) ? EFAULT : 0;

	/*
	 * COPYOUT_BUF () frees the blocks it consumes completely, so only
	 * whatever is left of the duplicate needs freeing here.
	 */

	if (msg != NULL)
		freemsg (msg);

	/*
	 * Note that 1 is stored even when the copy failed with EFAULT.
	 */

	* rvalp = 1;

	return retval;
}


/*
 * Helper function to atomically read the stream head write offset.
 *
 * The value is sampled while holding the stream head basic lock, and the
 * previous priority level is restored on unlock, so this may be called
 * without the lock held.
 *
 * NOTE(review): despite the name and the description above, the member
 * actually read is "sh_readopt", which is the read-options word written by
 * SHEAD_SRDOPT (). SHEAD_MAKEMSG () adds the returned value to the data
 * length as a write offset, so this looks like the wrong field; confirm
 * against the shead_t definition and switch to the write-offset member.
 */

#if	__USE_PROTO__
__LOCAL__ short (SHEAD_WRITEOFFSET) (shead_t * sheadp)
#else
__LOCAL__ short
SHEAD_WRITEOFFSET __ARGS ((sheadp))
shead_t	      *	sheadp;
#endif
{
	pl_t		prev_pl;
	short		opt;

	prev_pl = SHEAD_LOCK (sheadp);

	/* NOTE(review): read-options field, not a write offset; see above. */
	opt = sheadp->sh_readopt;

	SHEAD_UNLOCK (sheadp, prev_pl);

	return opt;
}


/*
 * Test a stream for error or hangup conditions before a write and, for
 * messages that are not high-priority, wait for flow control on the given
 * band to be relieved.
 *
 * "mode" is passed to the error check and to the wait; "band" is the
 * priority band to test with bcanputnext (); a non-zero "hipri" bypasses the
 * flow-control test altogether.
 *
 * Returns 0 when the write may proceed, otherwise the error reported by
 * SHEAD_ERRHUP_LOCKED () or SHEAD_WAIT_NONBLOCK ().
 */

#if	__USE_PROTO__
__LOCAL__ int (SHEAD_WRITE_TEST) (shead_t * sheadp, int mode, int band,
				   int hipri)
#else
__LOCAL__ int
SHEAD_WRITE_TEST __ARGS ((sheadp, mode, band, hipri))
shead_t	      *	sheadp;
int		mode;
int		band;
int		hipri;
#endif
{
	int		error;

	for (;;) {
		(void) SHEAD_LOCK (sheadp);

		/*
		 * A failed error/hangup test leaves without an explicit
		 * unlock, exactly like the other users of
		 * SHEAD_ERRHUP_LOCKED () in this file.
		 */

		error = SHEAD_ERRHUP_LOCKED (sheadp, mode);
		if (error != 0)
			return error;

		if (hipri != 0 || bcanputnext (W (sheadp->sh_head), band)) {
			/*
			 * Clear to send; "error" is 0 at this point.
			 */

			SHEAD_UNLOCK (sheadp, plbase);
			return error;
		}

		/*
		 * Flow controlled: wait (still holding the lock on entry to
		 * the wait) and then test everything again.
		 */

		error = SHEAD_WAIT_NONBLOCK (sheadp, mode, SH_WRITE_WAIT,
					     CHECK_SIGNALS);
		if (error != 0)
			return error;
	}
}


/*
 * This function deals with all the grunge of creating a message for sending
 * down a stream. There are many conditions that need to be checked, including
 * whether the message fits within the size limits given by the downstream
 * queue.
 *
 * Since the message is going to be written to the stream, we also deal with
 * waiting for flow control here. We only allocate memory for the user's data
 * when we have an indication that it will be valid to write it... this may
 * not be an optimal choice for a high-performance system with vast amounts
 * of STREAMS buffer space.
 *
 * "sheadp" is the stream head, "mode" the open mode of the file (FWRITE is
 * required), "ctlbuf" and "databuf" the control and data parts (either may
 * be NULL or have a "len" of -1 to mean "absent"), "flags" exactly one of
 * MSG_BAND or MSG_HIPRI, and "band" the priority band.
 *
 * On success the new message is returned and 0 is stored through "retvalp".
 * NULL is returned with 0 stored if there was nothing to send, and NULL with
 * an error number stored (EINVAL, EBADF, ERANGE, ENOSR, EFAULT, or whatever
 * SHEAD_WRITE_TEST () reports) on failure.
 */

#if	__USE_PROTO__
__LOCAL__ mblk_t * (SHEAD_MAKEMSG) (shead_t * sheadp, int mode,
				    __CONST__ struct strbuf * ctlbuf,
				    __CONST__ struct strbuf * databuf,
				    int flags, int band, int * retvalp)
#else
__LOCAL__ mblk_t *
SHEAD_MAKEMSG __ARGS ((sheadp, mode, ctlbuf, databuf, flags, band, retvalp))
shead_t	      *	sheadp;
int		mode;
__CONST__ struct strbuf
	      *	ctlbuf;
__CONST__ struct strbuf
	      *	databuf;
int		flags;
int		band;
int	      *	retvalp;
#endif
{
	queue_t	      *	q;
	int		ctlsize;
	int		datasize;
	int		wroff;
	mblk_t	      *	ctlmsg;
	mblk_t	      *	datamsg;

	ASSERT (sheadp != NULL);
	ASSERT (retvalp != NULL);

	/*
	 * The "flags" value specifies one of the constants MSG_BAND or
	 * MSG_HIPRI. Despite being arranged as flags, only one is allowed to
	 * be given.
	 */

	if (flags != MSG_BAND && flags != MSG_HIPRI) {

		* retvalp = EINVAL;
		return NULL;
	}

	if ((mode & FWRITE) == 0) {

		* retvalp = EBADF;
		return NULL;
	}

	mode &= ~ FREAD;

	/*
	 * Writes are refused on a stream head whose "sh_linked" member is
	 * set.
	 */

	if (sheadp->sh_linked != NULL) {

		* retvalp = EINVAL;
		return NULL;
	}


	/*
	 * The absolute maximum possible size for the control or data parts of
	 * a STREAMS message are configured system-wide.
	 */

	ctlsize = ctlbuf == NULL ? -1 : ctlbuf->len;
	datasize = databuf == NULL ? -1 : databuf->len;
	wroff = SHEAD_WRITEOFFSET (sheadp);

	q = TOP_QUEUE (sheadp);

	if (ctlsize > str_mem->sm_maxctlsize ||
	    (datasize + wroff) > str_mem->sm_maxdatasize ||
	    (datasize + wroff) < q->q_minpsz ||
	    (datasize + wroff) > q->q_maxpsz) {

		* retvalp = ERANGE;
		return NULL;
	}


	/*
	 * You can't send a high-priority message without a control part, or a
	 * high-priority message with a non-zero band number.
	 *
	 * Otherwise, if no data at all is specified, then no message will be
	 * sent.
	 */

	if (flags == MSG_HIPRI) {

		if (ctlsize < 0 || band != 0) {

			* retvalp = EINVAL;
			return NULL;
		}
	} else if (datasize < 0 && ctlsize < 0) {

		* retvalp = 0;
		return NULL;
	}


	/*
	 * Let's see if the stream is flow controlled; if it is, we either
	 * block or return EAGAIN depending on the FNDELAY/FNONBLOCK setting.
	 *
	 * The last argument is a boolean "high priority" indication. Both
	 * MSG_BAND and MSG_HIPRI are non-zero, so passing "flags" directly
	 * would make every message bypass the flow-control test.
	 */

	if ((* retvalp = SHEAD_WRITE_TEST (sheadp, mode, band,
					   flags == MSG_HIPRI)) != 0)
		return NULL;

	/*
	 * Now we allocate data space for the messages. If the size of a
	 * component is -1, then we don't allocate any space for that part,
	 * otherwise we allocate a component (possibly of length 0).
	 *
	 * Don't forget that copyin () has arguments in the bcopy () order,
	 * ie. src, dest, len
	 */

	if (ctlsize >= 0) {
		/*
		 * Special case; the stream head is required to ensure that
		 * the control part of any message has at least 64 bytes of
		 * space. This is specified in the putmsg (2) manual page!
		 */

		if ((ctlmsg = MSGB_ALLOC (ctlsize < 64 ? 64 : ctlsize,
					  BPRI_LO, KM_SLEEP)) == NULL) {
			* retvalp = ENOSR;
			return NULL;
		}

		ctlmsg->b_datap->db_type = flags == MSG_HIPRI ? M_PCPROTO :
								M_PROTO;
		ctlmsg->b_band = band;

		if (ctlsize > 0 &&
		    copyin (ctlbuf->buf, ctlmsg->b_rptr, ctlsize) != 0) {

			freeb (ctlmsg);
			* retvalp = EFAULT;
			return NULL;
		}

		ctlmsg->b_wptr += ctlsize;
	} else
		ctlmsg = NULL;		/* paranoia */

	if (datasize >= 0) {

		if ((datamsg = MSGB_ALLOC (datasize + wroff, BPRI_LO,
					   KM_SLEEP)) == NULL) {
			if (ctlsize >= 0)
				freeb (ctlmsg);

			* retvalp = ENOSR;
			return NULL;
		}

		/*
		 * With no control part the data block is the whole message;
		 * otherwise it is chained behind the control block. Either
		 * way "ctlmsg" is the head of the message from here on.
		 */

		if (ctlsize < 0)
			ctlmsg = datamsg;
		else
			ctlmsg->b_cont = datamsg;

		datamsg->b_band = band;

		/*
		 * Leave "wroff" bytes of unused space in front of the data.
		 */

		datamsg->b_wptr = datamsg->b_rptr += wroff;

		if (datasize > 0 &&
		    copyin (databuf->buf, datamsg->b_rptr, datasize) != 0) {

			/*
			 * Frees the data block and the control block (if
			 * any), since they are linked by now.
			 */

			freemsg (ctlmsg);

			* retvalp = EFAULT;
			return NULL;
		}

		datamsg->b_wptr += datasize;
	}


	/*
	 * Since the allocation requests could have blocked for memory to
	 * become available, and the copy requests could have blocked to
	 * resolve page faults, we could actually be a fair way down the track
	 * by now.
	 *
	 * If we were paranoid, we would recheck the flow control parameters
	 * and a bunch of other stuff, but there doesn't seem to be a whole
	 * lot of point to that. As long as we don't use data that could have
	 * changed while we waited, we're fine.
	 */

	* retvalp = 0;
	return ctlmsg;
}


/*
 * This function implements the I_FDINSERT ioctl ().
 *
 * A message is built from the control and data parts in "fdinsp", and the
 * read queue pointer of the stream named by "fdinsp->fildes" is stored in
 * the control part at byte offset "fdinsp->offset" before the message is
 * queued on the write side of this stream head.
 *
 * "sheadp" is the stream to send on, "mode" the open mode of the file and
 * "fdinsp" the kernel copy of the user's request.
 *
 * Returns 0 on success, EINVAL for any malformed request (including a bad
 * file descriptor), or the error reported by SHEAD_MAKEMSG ().
 *
 * NOTE(review): FH_TO_STREAM () as it appears in this file never returns a
 * stream head, so until it is completed this ioctl () always yields EINVAL.
 */

#if	__USE_PROTO__
__LOCAL__ int (SHEAD_FDINSERT) (shead_t * sheadp, int mode,
				struct strfdinsert * fdinsp)
#else
__LOCAL__ int
SHEAD_FDINSERT __ARGS ((sheadp, mode, fdinsp))
shead_t	      *	sheadp;
int		mode;
struct strfdinsert
	      *	fdinsp;
#endif
{
	int		retval;
	shead_t	      *	other;
	mblk_t	      *	msg;

	ASSERT (sheadp != NULL);
	ASSERT (fdinsp != NULL);

	/*
	 * There are a large number of error conditions to check for this
	 * function. Don't lose sight of the fact that the big set of chained
	 * conditions below computes "other" for us.
	 *
	 * The offset must be non-negative, aligned on an "int" boundary, and
	 * leave room for a queue pointer inside the control part, which must
	 * therefore be present. The sign tests come first so that the size
	 * comparison can safely be made in unsigned arithmetic.
	 */
	/*
	 * ALIGNMENT-DEPENDENT CODE.
	 */

	if ((fdinsp->flags != 0 && fdinsp->flags != RS_HIPRI) ||
	    fdinsp->offset < 0 ||
	    fdinsp->ctlbuf.len < 0 ||
	    ((unsigned) fdinsp->offset & (sizeof (int) - 1)) != 0 ||
	    (size_t) fdinsp->offset + sizeof (queue_t *) >
					(size_t) fdinsp->ctlbuf.len ||
	    (other = FH_TO_STREAM (fdinsp->fildes, & retval)) == NULL) {
		/*
		 * A failure of any of the above returns EINVAL; according to
		 * streamio (7) EINVAL rather than EBADF results from a bad
		 * file descriptor.
		 */

		return EINVAL;
	}

	/*
	 * Make a band 0 message (possibly high-priority). This can fail if
	 * the requested message is too large for the advertised limits set
	 * by the next thing downstream.
	 *
	 * IMPORTANT: we *rely* on this function making a message with a
	 * single large control block at least as big as the advertised size.
	 * If we cannot rely on this function we have to do lots of extra
	 * checking which I'd rather avoid.
	 */

	msg = SHEAD_MAKEMSG (sheadp, mode, & fdinsp->ctlbuf,
			     & fdinsp->databuf,
			     fdinsp->flags ? MSG_HIPRI : MSG_BAND, 0,
			     & retval);
	if (msg == NULL)
		return retval;

	/*
	 * Note that we don't recheck for hangups even though we could have
	 * waited a potentially long time in SHEAD_MAKEMSG (). The hangup
	 * condition has plenty of slop in it with the time it takes write
	 * messages to move down the queue anyway; drivers and modules have to
	 * be able to cope.
	 */

	* (queue_t **) (msg->b_rptr + fdinsp->offset) = other->sh_head;

	putq (W (sheadp->sh_head), msg);

	return 0;
}


/*
 * This function implements the I_FIND ioctl ().
 *
 * "modname" names the module to look for. If it is not a known module the
 * result is EINVAL and "rvalp" is left untouched; otherwise the result is 0
 * and 1 or 0 is stored through "rvalp" according to whether a queue using
 * that module's write-side "qinit" is present on the stream below "sheadp".
 */

#if	__USE_PROTO__
__LOCAL__ int (SHEAD_FIND_MODINFO) (shead_t * sheadp, char * modname,
				    int * rvalp)
#else
__LOCAL__ int
SHEAD_FIND_MODINFO __ARGS ((sheadp, modname, rvalp))
shead_t	      *	sheadp;
char	      *	modname;
int	      *	rvalp;
#endif
{
	modsw_t	      *	module;
	queue_t	      *	scan;
	pl_t		prev_pl;
	int		found = 0;

	/*
	 * First try to find the module in the global list of modules, then
	 * attempt to find that module's info on the stream.
	 */

	if ((module = FIND_MODULE (modname)) == NULL)
		return EINVAL;

	/*
	 * Walk down the write side of the stream until either the write side
	 * module info matches, a cross-point is found, or we run off the end
	 * of the stream.
	 *
	 * The outcome is recorded in an explicit flag; the scan pointer is
	 * still non-NULL when the walk ends at the last queue without a
	 * match, so it cannot be used to tell success from failure.
	 */

	scan = W (sheadp->sh_head);

	prev_pl = SHEAD_LOCK (sheadp);

	while (scan->q_next != NULL) {
		/*
		 * A change in the read/write sense of the next queue marks a
		 * cross-point (the far side of a pipe); stop there.
		 */

		if ((scan->q_flag & QREADR) !=
		    (scan->q_next->q_flag & QREADR))
			break;

		scan = scan->q_next;

		/*
		 * Queues with QPROCSOFF set are skipped.
		 */

		if ((scan->q_flag & QPROCSOFF) != 0)
			continue;

		if (scan->q_qinfo == module->mod_stream->st_wrinit) {
			found = 1;
			break;
		}
	}

	SHEAD_UNLOCK (sheadp, prev_pl);

	* rvalp = found;

	return 0;
}


/*
 * This function implements the I_LIST ioctl () for the case where the user
 * supplies a buffer to copy the module/driver names into.
 *
 * "slistp" is the kernel copy of the user's "str_list"; "sl_nmods" gives the
 * capacity of the user array "sl_modlist". The number of names actually
 * copied is stored through "rvalp".
 *
 * Returns 0 on success, EINVAL if "sl_nmods" is less than 1, EAGAIN if the
 * temporary kernel buffer cannot be allocated, or EFAULT if the copy to
 * user space fails.
 */

#if	__USE_PROTO__
__LOCAL__ int (SHEAD_LIST) (shead_t * sheadp, struct str_list * slistp,
			    int * rvalp)
#else
__LOCAL__ int
SHEAD_LIST __ARGS ((sheadp, slistp, rvalp))
shead_t	      *	sheadp;
struct str_list
	      *	slistp;
int	      *	rvalp;
#endif
{
	queue_t	      *	scan;
	int		modcount;
	int		i;
	struct str_mlist
		      *	buf;
	struct str_mlist
		      *	temp;

	if (slistp->sl_nmods < 1)
		return EINVAL;

	/*
	 * The EAGAIN error documented for I_LIST in streamio (7) is
	 * suggestive of how it is implemented; rather than get involved in
	 * tricky synchronization issues, copy the module names into a kernel
	 * buffer and then copy that to user level.
	 */

	modcount = SHEAD_MODCOUNT (sheadp);

	if (modcount > slistp->sl_nmods)
		modcount = slistp->sl_nmods;


	/*
	 * If we were paranoid, we'd use kmem_zalloc () to ensure that the
	 * data we copy to user space contains no sensitive information. As it
	 * happens, strncpy () null-pads every entry we fill in, and only the
	 * entries we have filled in are copied out.
	 */

	if ((buf = (struct str_mlist *) kmem_alloc (modcount * sizeof (* buf),
						    KM_SLEEP)) == NULL)
		return EAGAIN;

	/*
	 * Now fill the buffer in by moving down the stream. We don't worry
	 * about the race between the calculation of the buffer size and the
	 * time we fill it in, because the problem also exists for the user;
	 * in order to know how much space to allocate at user level, some
	 * arrangement must have been made to ensure things are stable.
	 *
	 * What we do have to guarantee is that we never store more than
	 * "modcount" entries, since that is all the buffer can hold.
	 */

	i = 0;
	scan = W (sheadp->sh_head);
	temp = buf;

	(void) SHEAD_LOCK (sheadp);

	while (scan->q_next != NULL &&
	       ((scan->q_flag & QREADR) == (scan->q_next->q_flag & QREADR))) {

		scan = scan->q_next;

		if ((scan->q_flag & QPROCSOFF) != 0)
			continue;

		/*
		 * Stop as soon as the buffer is full; "i" always equals the
		 * number of entries stored so far.
		 */

		if (i >= modcount)
			break;

		/*
		 * Now actually copy the module name. Note that we count on
		 * strncpy () null-padding the target for security.
		 */

		strncpy (temp->l_name, scan->q_qinfo->qi_minfo->mi_idname,
			 sizeof (temp->l_name) - 1);
		temp->l_name [sizeof (temp->l_name) - 1] = 0;

		/*
		 * Move on to the next slot so that each name gets an entry
		 * of its own.
		 */

		temp ++;
		i ++;
	}

	SHEAD_UNLOCK (sheadp, plbase);

	/*
	 * After unlocking we can safely call copyout (), which we could not
	 * use inside the loop because it may sleep resolving a page fault.
	 * Don't forget that copyout () is like bcopy (), not memcpy ()!
	 */

	* rvalp = i;

	i = copyout (buf, slistp->sl_modlist, i * sizeof (* buf));

	kmem_free (buf, modcount * sizeof (* buf));

	return i == 0 ? 0 : EFAULT;
}


/*
 * Set the stream head read-mode flag bits, on behalf of either an M_SETOPT
 * message or an I_SRDOPT ioctl (). The caller must hold the stream head
 * lock.
 *
 * "flag" carries the new read mode in its RMODEMASK bits and, optionally,
 * one of the protocol-handling bits RPROTNORM, RPROTDAT or RPROTDIS.
 *
 * Returns 0 on success or EINVAL if the read mode is the invalid
 * combination __RINVAL, in which case nothing is changed.
 */

#if	__USE_PROTO__
int (SHEAD_SRDOPT) (shead_t * sheadp, int flag)
#else
int
SHEAD_SRDOPT __ARGS ((sheadp, flag))
shead_t	      *	sheadp;
int		flag;
#endif
{
	int		readmode;
	int		protbits;

	SHEAD_ASSERT_LOCKED (sheadp);

	readmode = flag & RMODEMASK;

	if (readmode == __RINVAL)
		return EINVAL;

	/*
	 * The streamio (7) man pages seem ambiguous about whether a request
	 * that sets several protocol options at once must, may, or may not
	 * be diagnosed. Arbitrarily, we do not diagnose it; the first option
	 * found in the order below wins. If none is given, the non-mode bits
	 * currently in force are retained.
	 */

	if ((flag & RPROTNORM) != 0)
		protbits = RPROTNORM;
	else if ((flag & RPROTDAT) != 0)
		protbits = RPROTDAT;
	else if ((flag & RPROTDIS) != 0)
		protbits = RPROTDIS;
	else
		protbits = sheadp->sh_readopt & ~ RMODEMASK;

	sheadp->sh_readopt = readmode | protbits;

	return 0;
}


/*
 * This table determines how many bytes to copy from user space at the start
 * of ioctl () processing and how many bytes to copy to user space at the end
 * of processing (presuming no other errors have occurred yet).
 */

/*
 * Marker stored in the "in_len" member of an ioctl table entry to flag a
 * command that is not supported; STREAMS_IOCTL () rejects such entries with
 * EINVAL.
 */

#define	BADLEN		((unsigned short) -1)

/*
 * The kind of locking STREAMS_IOCTL () sets up before processing a command.
 */

typedef enum {
	IO_NOLOCK = 0,		/* no lock is held during processing */
	IO_BASIC_LOCK,		/* keep holding the stream head basic lock */
	IO_READFREEZE,		/* freeze the stream head read queue */
	IO_SLEEP_LOCK		/* take a sleep lock via SHEAD_SLEEP_LOCK () */
} iolock_t;

/*
 * Values for the "hup_chk" member: whether STREAMS_IOCTL () calls
 * SHEAD_ERRHUP_LOCKED () before processing the command.
 */

enum {	NOHUP,			/* skip the error/hangup check */
	HUP			/* perform the error/hangup check */
};

/*
 * One entry per STREAMS ioctl () command, indexed by the low byte of the
 * command number. Every entry must supply all five members in order; a
 * missing initializer silently shifts the remaining values into the wrong
 * members.
 */

static struct ioinfo {
	iolock_t	lock;		/* lock type */
	cat_t		cat;		/* category flag */
	unsigned short	in_len;		/* bytes to copy in */
	unsigned short	out_len;	/* bytes to copy out */
	unsigned char	hup_chk;	/* check for hangup */
} _ioctl_table [] = {
	{ IO_NOLOCK, SH_NONE, BADLEN, BADLEN, NOHUP },
					/* illegal */
	/*
	 * I_NREAD returns its byte count through the argument pointer, so
	 * nothing is copied in and a size_t is copied out.
	 */
	{ IO_READFREEZE, SH_NONE, 0, sizeof (size_t), NOHUP },
						/* I_NREAD */
	{ IO_SLEEP_LOCK, SH_OPENCLOSE | SH_IOCTL_LOCK,
		FMNAMESZ + 1, 0, HUP },		/* I_PUSH */
	{ IO_SLEEP_LOCK, SH_OPENCLOSE | SH_IOCTL_LOCK, 0, 0, HUP },
						/* I_POP */
	{ IO_NOLOCK, SH_NONE, FMNAMESZ + 1, FMNAMESZ + 1, NOHUP },
						/* I_LOOK */
	{ IO_NOLOCK, SH_NONE, 0, 0, HUP },	/* I_FLUSH */
	{ IO_BASIC_LOCK, SH_NONE, 0, 0, NOHUP },/* I_SRDOPT */
	{ IO_BASIC_LOCK, SH_NONE, 0, sizeof (int), NOHUP },
						/* I_GRDOPT */
	{ IO_SLEEP_LOCK, SH_IOCTL_LOCK, sizeof (struct strioctl),
		sizeof (struct strioctl), HUP },/* I_STR */
	{ IO_SLEEP_LOCK, SH_IOCTL_LOCK, 0, 0, NOHUP },
						/* I_SETSIG */
	{ IO_SLEEP_LOCK, SH_IOCTL_LOCK, 0, sizeof (int), NOHUP },
						/* I_GETSIG */
	{ IO_NOLOCK, SH_NONE, FMNAMESZ + 1, 0, NOHUP },
						/* I_FIND */
	{ IO_SLEEP_LOCK, SH_IOCTL_LOCK, 0, 0, HUP },
						/* I_LINK */
	{ IO_SLEEP_LOCK, SH_IOCTL_LOCK, 0, 0, HUP },
						/* I_UNLINK */
	{ IO_SLEEP_LOCK, SH_READ_LOCK, 0, sizeof (struct strrecvfd), HUP },
						/* I_RECVFD */
	{ IO_SLEEP_LOCK, SH_READ_LOCK, sizeof (struct strpeek),
		sizeof (struct strpeek), NOHUP },/* I_PEEK */
	{ IO_NOLOCK, SH_NONE, sizeof (struct strfdinsert), 0, NOHUP },
						/* I_FDINSERT */
	{ IO_NOLOCK, SH_NONE, 0, 0, HUP },	/* I_SENDFD */
	{ IO_NOLOCK, SH_NONE, BADLEN, BADLEN, NOHUP },
					/* --- */
	{ IO_BASIC_LOCK, SH_NONE, 0, 0, NOHUP },/* I_SWROPT */
	{ IO_BASIC_LOCK, SH_NONE, 0, sizeof (int), NOHUP },
						/* I_GWROPT */
	{ IO_NOLOCK, SH_NONE, 0, 0, NOHUP },	/* I_LIST */ /* special */
	/*
	 * NOTE(review): the two entries below pair IO_BASIC_LOCK with a
	 * sleep-lock category, unlike every other entry (I_LINK and I_UNLINK
	 * use IO_SLEEP_LOCK). The category is only consulted for
	 * IO_SLEEP_LOCK; confirm which lock type is intended.
	 */
	{ IO_BASIC_LOCK, SH_OPENCLOSE | SH_IOCTL_LOCK, 0, 0, HUP },
						/* I_PLINK */
	{ IO_BASIC_LOCK, SH_OPENCLOSE | SH_IOCTL_LOCK, 0, 0, HUP },
						/* I_PUNLINK */
	{ IO_NOLOCK, SH_NONE, BADLEN, BADLEN, NOHUP },
					/* I_SETEV */
	{ IO_NOLOCK, SH_NONE, BADLEN, BADLEN, NOHUP },
					/* I_GETEV */
	{ IO_NOLOCK, SH_NONE, BADLEN, BADLEN, NOHUP },
					/* I_STREV */
	{ IO_NOLOCK, SH_NONE, BADLEN, BADLEN, NOHUP },
					/* I_UNSTREV */
	{ IO_NOLOCK, SH_NONE, sizeof (struct bandinfo), 0, HUP },
						/* I_FLUSHBAND */
	{ IO_READFREEZE, SH_NONE, 0, 0, NOHUP },/* I_CKBAND */
	{ IO_READFREEZE, SH_NONE, 0, sizeof (int), NOHUP },
						/* I_GETBAND */
	{ IO_READFREEZE, SH_NONE, 0, 0, NOHUP },/* I_ATMARK */
	{ IO_BASIC_LOCK, SH_NONE, sizeof (__clock_t), 0, NOHUP },
						/* I_SETCLTIME */
	{ IO_BASIC_LOCK, SH_NONE, 0, sizeof (__clock_t), NOHUP },
						/* I_GETCLTIME */
	{ IO_NOLOCK, SH_NONE, 0, 0, HUP }	/* I_CANPUT */
};

/*
 * Entry used by STREAMS_IOCTL () for any command that does not carry the
 * STREAM_I id: sleep lock in the ioctl category, nothing copied in or out by
 * the generic code, and a hangup check first.
 */

struct ioinfo	_transparent = {
	IO_SLEEP_LOCK, SH_IOCTL_LOCK, 0, 0, HUP
};


/*
 * Symbol for accessing the default ioctl () and close timeouts. Consider
 * making this a static variable and initializing it at boot time.
 *
 * The constant is 15000000 microseconds (15 seconds); drv_usectohz () is
 * presumably what converts it to clock ticks, and it is re-evaluated on
 * every use of the macro.
 */

#define	IOCTL_TIMEOUT	drv_usectohz ((__clock_t) 15000000L)


/*
 * Main ioctl () processing for STREAMS files and pipes.
 *
 * Commands whose upper bits carry the STREAM_I magic id are looked up in
 * "_ioctl_table" by their low byte; every other command is treated as a
 * transparent ioctl () and uses the "_transparent" entry. The selected entry
 * tells us how many bytes to copy in from "arg" before the command runs,
 * which kind of lock to hold while it runs, whether to check the stream for
 * an error or hangup first, and how many bytes of the local buffer to copy
 * back out to "arg" afterwards.
 *
 * Returns 0 on success or an error number on failure. Commands that produce
 * an integer result store it through "rvalp".
 *
 * This switch+table is an abomination, but trying to emulate C++ style in C
 * to subsume the switch and the above table would probably kill me. This
 * probably is about as short as it can really be.
 */

#if	__USE_PROTO__
int (STREAMS_IOCTL) (shead_t * sheadp, int cmd, _VOID * arg, int mode,
		     cred_t * credp, int * rvalp)
#else
int
STREAMS_IOCTL (sheadp, cmd, arg, mode, credp, rvalp)
shead_t	      *	sheadp;
int		cmd;
_VOID	      *	arg;
int		mode;
cred_t	      *	credp;
int	      *	rvalp;
#endif
{
	/*
	 * Staging area for data copied in from or out to the user; one
	 * member for each shape of argument used by the commands below.
	 */

	union {
		int		i_int;
		__clock_t	i_clock;
		size_t		i_size;
		char		i_modname [FMNAMESZ + 1];
		struct strioctl	i_strioc;
		struct strrecvfd i_recvfd;
		struct strbuf	i_strbuf;
		struct strfdinsert
				i_fdinsert;
		struct str_list	i_list;
		struct bandinfo	i_band;
		struct strpeek	i_peek;
	} iocbuf;
	int		retval;
	struct ioinfo *	info;

	ASSERT (sheadp != NULL);

	/*
	 * We start out by copying in the data specified by the table, which
	 * means we also get to range-check the ioctl entry if it has the
	 * magic STREAMS id.
	 */

	if ((cmd & ~ 0xFF) == STREAM_I) {
		int		index = cmd & 0xFF;

		if (index >= sizeof (_ioctl_table) / sizeof (* info))
			return EINVAL;

		info = & _ioctl_table [index];
	} else
		info = & _transparent;

	/*
	 * Table slots marked with BADLEN correspond to command numbers that
	 * we do not support.
	 */

	if (info->in_len == BADLEN)
		return EINVAL;

	/*
	 * We *must* do the copy before the lock! Never forget that
	 * copyin ()/copyout () can block in page fault resolution.
	 */

	if (info->in_len > 0 && copyin (arg, & iocbuf, info->in_len) != 0)
		return EFAULT;

	/*
	 * We may wish to check for a hangup or error before proceeding.
	 */

	(void) SHEAD_LOCK (sheadp);

	if (info->hup_chk == HUP &&
	    (retval = SHEAD_ERRHUP_LOCKED (sheadp, mode)) != 0) {
		/*
		 * Hand back the condition that SHEAD_ERRHUP_LOCKED ()
		 * actually reported rather than a fixed error number, so
		 * that the caller sees the real reason for the failure.
		 *
		 * NOTE(review): as in STREAMS_GETPMSG () and
		 * SHEAD_READ_DATA (), no SHEAD_UNLOCK () is issued on this
		 * path; presumably SHEAD_ERRHUP_LOCKED () drops the lock
		 * itself when it fails. Confirm against its definition.
		 */

		return retval;
	}

	if (info->lock != IO_BASIC_LOCK)
		SHEAD_UNLOCK (sheadp, plbase);

	switch (info->lock) {

	case IO_BASIC_LOCK:
		/* stay holding on to the basic lock */
		break;

	case IO_READFREEZE:
		(void) QFREEZE_TRACE (sheadp->sh_head, "STREAMS_IOCTL");
		break;

	case IO_SLEEP_LOCK:
		if ((retval = SHEAD_SLEEP_LOCK (sheadp, info->cat,
						IOCTL_TIMEOUT,
						CHECK_SIGNALS)) != 0)
			return retval;
		break;

	default:
		break;
	}

	retval = 0;

	switch (cmd) {

	case I_NREAD:		/* Get message length, count */
		{
			mblk_t	      *	scan;
			mblk_t	      *	first = NULL;

			/*
			 * The return value counts the data messages queued
			 * at the stream head; the size of the first one is
			 * staged for copying out to the user.
			 */

			* rvalp = 0;

			for (scan = SHEAD_FIRSTMSG (sheadp) ; scan != NULL ;
			     scan = scan->b_next) {

				if (datamsg (scan->b_datap->db_type)) {

					if (first == NULL)
						first = scan;
					(* rvalp) ++;
				}
			}

			if (first != NULL)
				iocbuf.i_int = msgdsize (first);
			else
				iocbuf.i_int = 0;
		}
		break;

	case I_PUSH:		/* push named module */
		{
			modsw_t	      *	module;

			/*
			 * The name came straight from user space, so make
			 * sure it is 0-terminated before treating it as a
			 * string. Valid names are at most FMNAMESZ bytes.
			 */

			iocbuf.i_modname [FMNAMESZ] = 0;

			if ((module = FIND_MODULE (iocbuf.i_modname)) == NULL)
				retval = EINVAL;
			else
				retval = PUSH_MODULE (sheadp, mode, credp,
						      module);
		}
		break;

	case I_POP:		/* pop topmost module */
		{
			queue_t	      *	q;

			if ((q = TOP_MODULE (sheadp)) == NULL)
				retval = EINVAL;
			else
				retval = POP_MODULE (sheadp, q, mode, credp);
		}
		break;

	case I_LOOK:		/* get topmost module name */
		{
			queue_t	      *	q;

			if ((q = TOP_MODULE (sheadp)) == NULL)
				retval = EINVAL;
			else {
				/*
				 * Copy the module name, taking care to
				 * 0-terminate it at FMNAMESZ bytes long.
				 */

				strncpy (iocbuf.i_modname,
					 q->q_qinfo->qi_minfo->mi_idname,
					 FMNAMESZ);
				iocbuf.i_modname [FMNAMESZ] = 0;
			}
		}
		break;

	case I_FLUSH:		/* flush read and/or write side */
		retval = SHEAD_FLUSH (sheadp, (int) arg, 0);
		break;

	case I_SRDOPT:		/* set read options */
		retval = SHEAD_SRDOPT (sheadp, (int) arg);
		break;

	case I_GRDOPT:		/* retrieve read options */
		iocbuf.i_int = sheadp->sh_readopt;
		break;

	case I_STR:		/* send ioctl () data down a stream */
		if ((iocbuf.i_strioc.ic_cmd & ~ 0xFF) == STREAM_I ||
		    (unsigned) iocbuf.i_strioc.ic_len == TRANSPARENT) {
			/*
			 * We do not permit I_STR to send STREAMS ioctl ()
			 * codes downstream; in certain cases such as I_LINK
			 * this could produce disastrous results.
			 *
			 * We also do not permit TRANSPARENT length I_STR
			 * messages. While the STREAMS documentation neither
			 * explicitly permits nor forbids this, we keep the
			 * transparent ioctl () behaviour separate.
			 */

			retval = EINVAL;
			break;

		}

		retval = ISTR_IOCTL (sheadp, mode, & iocbuf.i_strioc, credp,
				     rvalp);
		break;

	case I_SETSIG:		/* register events for SIGPOLL signal */
		{
			int		events = (short) (ulong_t) arg;

			if ((events & ~ __POLL_MASK) != 0)
				retval = EINVAL;
			else
				retval = REGISTER_SIGPOLL (sheadp, events);
		}
		break;

	case I_GETSIG:		/* return registered event mask */
		{
			sigpoll_t     *	sigs;

			if ((sigs = FIND_SIGPOLL (sheadp)) == NULL)
				retval = EINVAL;
			else
				* rvalp = sigs->sp_events;
		}
		break;

	case I_FIND:		/* determine if module exists on stream */
		/*
		 * As for I_PUSH, 0-terminate the user-supplied name before
		 * it is used as a string.
		 */

		iocbuf.i_modname [FMNAMESZ] = 0;

		retval = SHEAD_FIND_MODINFO (sheadp, iocbuf.i_modname, rvalp);
		break;

	case I_LINK:		/* link stream below another */
	case I_PLINK:		/* create a persistent link */
		{
			shead_t	      *	lower;

			/*
			 * "arg" is the file descriptor of the stream to be
			 * linked underneath this one. If the lookup fails,
			 * FH_TO_STREAM () is given "retval" to report why.
			 */

			if ((lower = FH_TO_STREAM ((int) arg,
						   & retval)) != NULL) {
				retval = SHEAD_LINK (sheadp, mode, lower, cmd,
						     credp);

				if (retval == 0)
					* rvalp = lower->sh_muxid;
			}
		}
		break;

	case I_UNLINK:		/* remove a (or all) link(s) below a stream */
	case I_PUNLINK:		/* undo a single or all persistent link(s) */
		/*
		 * A mux ID of -1 means "all links", so in that case we keep
		 * going around the loop until no more links can be found.
		 */

		do {
			shead_t	      *	lower;

			if ((lower = SHEAD_FIND_MUXID (sheadp, cmd,
						       (int) arg)) == NULL) {
				/*
				 * We return EINVAL if a specific mux ID was
				 * given, 0 otherwise.
				 */

				retval = (int) arg == -1 ? 0 : EINVAL;
				break;

			}

			(void) SHEAD_UNLINK (sheadp, lower, cmd, mode, credp,
					     & retval);
		} while ((int) arg == -1);
		break;

	case I_RECVFD:		/* receive a file descriptor from stream */
		retval = FH_RECV (sheadp, mode, & iocbuf.i_recvfd);
		break;

	case I_PEEK:		/* examine data at stream head */
		retval = SHEAD_PEEK (sheadp, & iocbuf.i_peek, rvalp);
		break;

	case I_FDINSERT:	/* send read queue pointer down stream */
		retval = SHEAD_FDINSERT (sheadp, mode, & iocbuf.i_fdinsert);
		break;

	case I_SENDFD:		/* send a file descriptor down a pipe */
		retval = FH_SEND (sheadp, (int) arg, credp);
		break;

	case I_SWROPT:		/* set write options for stream */
		{
			int		flag = (int) arg;

			/* SNDZERO is the only write option we know about. */

			if ((flag & ~ SNDZERO) != 0)
				retval = EINVAL;
			else
				sheadp->sh_wropt = flag;
		}
		break;

	case I_GWROPT:		/* retrieve write options for stream */
		iocbuf.i_int = sheadp->sh_wropt;
		break;

	case I_LIST:		/* get names of all modules/drivers */
		/*
		 * The value of "arg" is a pointer to a structure for this
		 * entry, but since a NULL value is legal we don't copy the
		 * data in automatically.
		 *
		 * Here we select the call type and copy in the structure for
		 * the non-NULL case.
		 */

		if (arg == NULL)
			* rvalp = SHEAD_MODCOUNT (sheadp);
		else if (copyin (arg, & iocbuf, sizeof (iocbuf.i_list)) != 0)
			retval = EFAULT;
		else
			retval = SHEAD_LIST (sheadp, & iocbuf.i_list, rvalp);
		break;

	case I_SETEV:		/* The meaning of these ioctl ()'s is not */
	case I_GETEV:		/* documented, although their names and */
	case I_STREV:		/* numeric values are given in the System */
	case I_UNSTREV:		/* V ABI. */
		retval = EINVAL;
		break;

	case I_FLUSHBAND:	/* flush messages in a priority band */
		retval = SHEAD_FLUSH (sheadp, iocbuf.i_band.bi_flag,
				      iocbuf.i_band.bi_pri);
		break;

	case I_CKBAND:		/* check for existence of band on stream */
		/*
		 * The band number has to fit in an unsigned character.
		 */

		if ((uchar_t) (ulong_t) arg != (ulong_t) arg)
			retval = EINVAL;
		else {
			mblk_t	      *	scan;

			for (scan = SHEAD_FIRSTMSG (sheadp) ; scan != NULL ;
			     scan = scan->b_next)
				if (datamsg (scan->b_datap->db_type) &&
				    scan->b_band == (uchar_t) (ulong_t) arg)
					break;

			* rvalp = scan != NULL;
		}
		break;

	case I_GETBAND:		/* get the band number of the first message */
		{
			mblk_t	     *	scan;

			for (scan = SHEAD_FIRSTMSG (sheadp) ; scan != NULL ;
			     scan = scan->b_next) {

				if (datamsg (scan->b_datap->db_type))
					break;
			}

			if (scan == NULL)
				retval = ENODATA;
			else
				iocbuf.i_int = scan->b_band;
		}
		break;

	case I_ATMARK:		/* test for (last) mark on messages */
		{
			mblk_t	      *	scan = SHEAD_FIRSTMSG (sheadp);

			if ((ulong_t) arg != ANYMARK &&
			    (ulong_t) arg != LASTMARK) {

				retval = EINVAL;
				break;
			}

			if (scan == NULL || (scan->b_flag & MSGMARK) == 0) {

				* rvalp = 0;
				break;
			}

			* rvalp = 1;

			/*
			 * For LASTMARK the first message only qualifies if
			 * no later message on the queue is marked as well.
			 */

			if ((ulong_t) arg == LASTMARK)
				while ((scan = scan->b_next) != NULL)
					if ((scan->b_flag & MSGMARK) != 0) {
						* rvalp = 0;
						break;
					}
		}
		break;

	case I_SETCLTIME:	/* set close timeout for stream */
		if (iocbuf.i_clock != 0)
			sheadp->sh_cltime = iocbuf.i_clock;
		else
			retval = EINVAL;
		break;

	case I_GETCLTIME:	/* retrieve current close timeout */
		iocbuf.i_clock = sheadp->sh_cltime;
		break;

	case I_CANPUT:		/* test if band is writeable */
		if ((uchar_t) (ulong_t) arg != (ulong_t) arg)
			retval = EINVAL;
		else
			* rvalp = bcanputnext (W (sheadp->sh_head),
					       (uchar_t) (ulong_t) arg);
		break;

	default:
		/*
		 * Anything that is not one of the STREAMS-specific commands
		 * above is passed on as a transparent ioctl ().
		 */

		ASSERT (info == & _transparent);

		retval = TRANSPARENT_IOCTL (sheadp, mode, cmd, arg, credp,
					    rvalp);
		break;
	}


	/*
	 * Perform any necessary unlocking operations and copy back any
	 * results into the data area pointed to by "arg" (if this is a
	 * STREAMS-specific ioctl ()).
	 */

	switch (info->lock) {

	case IO_BASIC_LOCK:
		SHEAD_UNLOCK (sheadp, plbase);
		break;

	case IO_READFREEZE:
		(void) QUNFREEZE_TRACE (sheadp->sh_head, plbase);
		break;

	case IO_SLEEP_LOCK:
		SHEAD_SLEEP_UNLOCK (sheadp, info->cat);
		break;

	default:
		break;
	}


	/*
	 * We only copy out results if there is no error. We have to
	 * do this *after* unlocking, above; copyout () can block in
	 * page fault resolution!
	 */

	if (retval == 0 && info->out_len > 0 &&
	    copyout (& iocbuf, arg, info->out_len) != 0)
		retval = EFAULT;

	return retval;
}


/*
 * Fetch the stream head read options while holding the stream head basic
 * lock, so that the caller gets a consistent snapshot of "sh_readopt". The
 * priority level that was in force on entry is restored before returning.
 */

#if	__USE_PROTO__
__LOCAL__ short (SHEAD_READOPT) (shead_t * sheadp)
#else
__LOCAL__ short
SHEAD_READOPT __ARGS ((sheadp))
shead_t	      *	sheadp;
#endif
{
	short		snapshot;
	pl_t		saved_pl = SHEAD_LOCK (sheadp);

	snapshot = sheadp->sh_readopt;
	SHEAD_UNLOCK (sheadp, saved_pl);

	return snapshot;
}


/*
 * Stream head user-level getmsg ()/getpmsg () processing.
 *
 * On entry "* flagsp" and "* bandp" describe which messages the caller is
 * willing to accept; on success they are overwritten with the kind
 * (MSG_HIPRI or MSG_BAND) and the band of the message retrieved, and
 * "* rvalp" receives a mask of MORECTL/MOREDATA bits for whatever part of
 * the message did not fit in the caller's buffers. Returns 0 on success or
 * an error number on failure.
 *
 * This is way too many parameters; this should be bundled into a block like
 * the "uio" structure is.
 */

#if	__USE_PROTO__
int (STREAMS_GETPMSG) (shead_t * sheadp, struct strbuf * ctlbuf,
		       struct strbuf * databuf, int * bandp, int * flagsp,
		       int mode, cred_t * credp, int * rvalp)
#else
int
STREAMS_GETPMSG __ARGS ((sheadp, ctlbuf, databuf, bandp, flagsp, mode, credp,
			 rvalp))
shead_t	      *	sheadp;
int		mode;
struct strbuf *	ctlbuf;
struct strbuf *	databuf;
int	      *	bandp;
int	      *	flagsp;
cred_t	      *	credp;
int	      *	rvalp;
#endif
{
	mblk_t	      *	msg;
	mblk_t	      *	part;
	int		error;

	/*
	 * The file has to be open for reading, and a stream that has been
	 * linked under a multiplexor cannot be read from directly.
	 */

	if ((mode & FREAD) == 0)
		return EBADF;

	mode &= ~ FWRITE;

	if (sheadp->sh_linked != NULL)
		return EINVAL;

	/*
	 * As discussed in the general comment on read synchronization and in
	 * the I_PEEK documentation, read locking has some warts. Here we get
	 * around them by taking a message off the queue, working on it, and
	 * putting back whatever is left over.
	 *
	 * getmsg ()/getpmsg () always honour message boundaries, so there is
	 * nothing to be gained from single-threading callers.
	 *
	 * The read queue is frozen before the stream head basic lock is
	 * taken because that is our canonical lock ordering.
	 */

	for (;;) {
		(void) QFREEZE_TRACE (sheadp->sh_head, "SHEAD_GETPMSG");

		(void) SHEAD_LOCK (sheadp);

		error = SHEAD_ERRHUP_LOCKED (sheadp, FREAD);

		if (error != 0) {

			QUNFREEZE_TRACE (sheadp->sh_head, plbase);
			return error;
		}

		msg = SHEAD_FIRSTMSG (sheadp);

		if (msg != NULL) {

			switch (msg->b_datap->db_type) {

			case M_PCPROTO:
				/* High-priority messages are always taken. */

				rmvq (sheadp->sh_head, msg);
				break;

			case M_DATA:
			case M_PROTO:
				/*
				 * An ordinary message is taken unless the
				 * caller wants high-priority messages only,
				 * or asked for a minimum band that this
				 * message does not reach.
				 */

				if (* flagsp != MSG_HIPRI &&
				    (* flagsp != MSG_BAND ||
				     msg->b_band >= * bandp))
					rmvq (sheadp->sh_head, msg);
				else {
					sheadp->sh_head->q_lastband =
						msg->b_band;
					msg = NULL;
				}
				break;

			default:
				error = EBADMSG;
				break;
			}
		}

		QUNFREEZE_TRACE (sheadp->sh_head, plbase);

		if (error != 0) {
			SHEAD_UNLOCK (sheadp, plbase);
			return error;
		}

		if (msg != NULL)
			break;

		/*
		 * Nothing suitable is queued. If the stream has been hung up
		 * there is no point in waiting; otherwise wait for a message
		 * (unless O_NDELAY or O_NONBLOCK has been specified).
		 */

		if (SHEAD_HANGUP (sheadp)) {
			SHEAD_UNLOCK (sheadp, plbase);

			/*
			 * Hangups are not an error for getpmsg (); we just
			 * report empty control and data parts.
			 */

			if (ctlbuf != NULL)
				ctlbuf->len = 0;
			if (databuf != NULL)
				databuf->len = 0;

			* rvalp = 0;
			return 0;
		}

		error = SHEAD_WAIT_NONBLOCK (sheadp, mode, SH_READ_WAIT,
					     CHECK_SIGNALS);

		if (error != 0)
			return error;
	}


	/*
	 * From here on "msg" belongs to us; whatever is left of it after the
	 * transfer has to go back on the queue.
	 */

	* bandp = msg->b_band;

	if (msg->b_datap->db_type == M_PCPROTO)
		* flagsp = MSG_HIPRI;
	else
		* flagsp = MSG_BAND;

	/*
	 * Transfer the control part and then the data part; the data part is
	 * not attempted if the control part could not be copied.
	 */

	if (COPYOUT_BUF (ctlbuf, & msg, CONTROL_PART) != 0)
		error = EFAULT;
	else if (COPYOUT_BUF (databuf, & msg, DATA_PART) != 0)
		error = EFAULT;
	else
		error = 0;

	/*
	 * Build the return mask saying which kinds of message block are
	 * still left over after the transfer.
	 */

	* rvalp = 0;

	part = msg;

	while (part != NULL) {

		if (part->b_datap->db_type == M_DATA)
			* rvalp |= MOREDATA;
		else
			* rvalp |= MORECTL;

		part = part->b_cont;
	}

	if (msg != NULL)
		putbq (sheadp->sh_head, msg);

	SHEAD_UNLOCK (sheadp, plbase);
	return error;
}


/*
 * Helper for STREAMS_READ (), split out to keep the read logic manageable:
 * wait until a readable message is available at the stream head and take it
 * off the read queue.
 *
 * "resid" is the number of bytes the reader still wants; it is only used as
 * the payload of an M_READ message if the stream is configured to send one.
 *
 * We return 0 on success, or an error number on failure. The value stored
 * through "mpp" is only valid if 0 is returned.
 */

#if	__USE_PROTO__
__LOCAL__ int (SHEAD_READ_DATA) (shead_t * sheadp, int mode, mblk_t ** mpp,
				 int resid)
#else
__LOCAL__ int
SHEAD_READ_DATA __ARGS ((sheadp, mode, mpp, resid))
shead_t	      *	sheadp;
int		mode;
mblk_t	     **	mpp;
int		resid;
#endif
{
	int		error;

	for (;;) {
		(void) QFREEZE_TRACE (sheadp->sh_head, "SHEAD_READ");

		(void) SHEAD_LOCK (sheadp);

		error = SHEAD_ERRHUP_LOCKED (sheadp, FREAD);

		if (error != 0) {

			QUNFREEZE_TRACE (sheadp->sh_head, plbase);
			return error;
		}

		* mpp = SHEAD_FIRSTMSG (sheadp);

		if (* mpp != NULL) {

			switch ((* mpp)->b_datap->db_type) {

			case M_DATA:
				rmvq (sheadp->sh_head, * mpp);
				break;

			case M_PROTO:
			case M_PCPROTO:
				/*
				 * Whether protocol messages may be read
				 * depends on the read mode of the stream;
				 * with RPROTNORM set they are refused.
				 */

				if ((sheadp->sh_readopt & RPROTNORM) != 0)
					error = EBADMSG;
				else
					rmvq (sheadp->sh_head, * mpp);
				break;

			default:
				error = EBADMSG;
				break;
			}
		}

		QUNFREEZE_TRACE (sheadp->sh_head, plbase);

		/*
		 * Either we have a message for the caller or we have found
		 * one that cannot be read; in both cases we are finished.
		 */

		if (error != 0 || * mpp != NULL) {

			SHEAD_UNLOCK (sheadp, plbase);
			return error;
		}


		/*
		 * We are about to sleep in a read (), so this is where an
		 * M_READ message is sent downstream if the stream has been
		 * configured that way.
		 *
		 * If we can't generate an M_READ message, schedule a bufcall.
		 * If we can't do that, return EAGAIN???
		 */

		if (SHEAD_READMSG (sheadp)) {
			mblk_t	      *	readmsg;

			readmsg = MSGB_ALLOC (sizeof (int), BPRI_LO,
					      KM_NOSLEEP);

			if (readmsg == NULL) {
				/*
				 * Wait briefly for buffer memory for the
				 * M_READ and then go around again. Note that
				 * SHEAD_READ_BUFCALL () unlocks the stream
				 * head for us.
				 */

				error = SHEAD_READ_BUFCALL (sheadp, mode);

				if (error != 0)
					return error;

				continue;
			}

			/*
			 * The M_READ message carries the outstanding byte
			 * count as a single "int".
			 */

			readmsg->b_datap->db_type = M_READ;
			* (int *) readmsg->b_rptr = resid;
			readmsg->b_wptr += sizeof (int);

			putq (W (sheadp->sh_head), readmsg);
		}

		error = SHEAD_WAIT_NONBLOCK (sheadp, mode, SH_READ_WAIT,
					     CHECK_SIGNALS);

		if (error != 0)
			return error;
	}

}


/*
 * In order to keep the logic of STREAMS_READ () manageable, this section of
 * code has been factored into this routine, which manages the transfer of
 * message data to the user.
 *
 * "msg" is a message that has already been removed from the stream head
 * read queue; this routine either frees it or puts it (or what remains of
 * it) back on the queue.
 *
 * We return EAGAIN to the caller if more data needs to be read, 0 if the
 * read has successfully completed, or some other error number on failure.
 * EAGAIN is only returned from the data transfer while the user's request
 * is still unsatisfied; once "uio_resid" has reached zero the read is
 * complete, and asking the caller to fetch another message would only make
 * it wait for data that could not be delivered anyway.
 */

#if	__USE_PROTO__
__LOCAL__ int (SHEAD_READ_MOVE) (shead_t * sheadp, uio_t * uiop,
				 mblk_t * msg)
#else
__LOCAL__ int
SHEAD_READ_MOVE __ARGS ((sheadp, uiop, msg))
shead_t	      *	sheadp;
uio_t	      *	uiop;
mblk_t	      *	msg;
#endif
{
	int		readopt = SHEAD_READOPT (sheadp);
	mblk_t	      *	scan;

	if ((readopt & RMODEMASK) == RNORM &&
	    (msg->b_cont == NULL && msg->b_rptr == msg->b_wptr)) {
		/*
		 * We have run into a zero-length message. Put it
		 * back and terminate the read.
		 */

		putbq (sheadp->sh_head, msg);
		return 0;
	}

	if ((readopt & RPROTDIS) != 0) {

		while (msg != NULL && msg->b_datap->db_type != M_DATA) {
			/*
			 * Consume the control part of the message.
			 */

			scan = msg->b_cont;
			freeb (msg);
			msg = scan;
		}

		/*
		 * The message consisted of nothing but control blocks, so
		 * ask the caller to fetch another one.
		 */

		if (msg == NULL)
			return EAGAIN;
	}


	/*
	 * Actual data transfer time; copy each message segment to the user
	 * with uiomove ().
	 */

	do {
		size_t		unit = msg->b_wptr - msg->b_rptr;

		if (unit > uiop->uio_resid)
			unit = uiop->uio_resid;

		if (unit > 0 &&
		    uiomove (msg->b_rptr, unit, UIO_READ, uiop) != 0) {
			/*
			 * Address fault time. But first, put back the data.
			 */

			putbq (sheadp->sh_head, msg);

			return EFAULT;
		}

		msg->b_rptr += unit;

		if (msg->b_wptr != msg->b_rptr) {
			/*
			 * Since we didn't finish this message block, we must
			 * be finished with the read (). If we are in message-
			 * discard mode we throw away the remaining data.
			 */

			ASSERT (uiop->uio_resid == 0);

			if ((readopt & RMODEMASK) == RMSGD)
				freemsg (msg);
			else
				putbq (sheadp->sh_head, msg);

			return 0;
		}

		scan = msg->b_cont;
		freeb (msg);
	} while ((msg = scan) != NULL);


	/*
	 * We have run out of message; what now? In byte-stream mode we look
	 * for more data, but only if the user's buffer still has room for
	 * some. In every other case the read is complete.
	 */

	if ((readopt & RMODEMASK) == RNORM && uiop->uio_resid > 0)
		return EAGAIN;

	return 0;
}


/*
 * Stream head user-level read processing.
 *
 * The work is split between two helpers: SHEAD_READ_DATA () obtains the next
 * message from the stream head and SHEAD_READ_MOVE () transfers it to the
 * user. We alternate between the two for as long as SHEAD_READ_MOVE () asks
 * for more data by returning EAGAIN.
 *
 * Returns 0 on success (including a short read) or an error number.
 */

#if	__USE_PROTO__
int (STREAMS_READ) (shead_t * sheadp, uio_t * uiop, cred_t * credp)
#else
int
STREAMS_READ __ARGS ((sheadp, uiop, credp))
shead_t	      *	sheadp;
uio_t	      *	uiop;
cred_t	      *	credp;
#endif
{
	mblk_t	      *	msg;
	int		error;
	int		mode;
	int		requested = uiop->uio_resid;

	if ((uiop->uio_fmode & FREAD) == 0)
		return EBADF;

	mode = uiop->uio_fmode & ~ FWRITE;

	if (sheadp->sh_linked != NULL)
		return EINVAL;

	/*
	 * Reads are single-threaded by means of a sleep lock on the stream
	 * head. This matters in byte-stream mode, where we may want a read
	 * to be atomic; modes that honor message boundaries cannot be
	 * protected against multiple readers anyway.
	 */

	error = SHEAD_SLEEP_LOCK (sheadp, SH_READ_LOCK, 0, CHECK_SIGNALS);

	if (error != 0)
		return error;

	for (;;) {
		error = SHEAD_READ_DATA (sheadp, mode, & msg,
					 uiop->uio_resid);

		if (error != 0)
			break;

		/*
		 * Once we have "msg" it is our responsibility; the transfer
		 * routine either frees it or puts it back.
		 */

		error = SHEAD_READ_MOVE (sheadp, uiop, msg);

		if (error != EAGAIN)
			break;
	}


	/*
	 * An error that strikes after some data has already been moved is
	 * reported as a short read rather than as a failure.
	 */

	if (error != 0 && requested != uiop->uio_resid)
		error = 0;

	SHEAD_SLEEP_UNLOCK (sheadp, SH_READ_LOCK);
	return error;
}


/*
 * Fetch the stream head write options while holding the stream head basic
 * lock, so that the caller gets a consistent snapshot of "sh_wropt". The
 * priority level that was in force on entry is restored before returning.
 */

#if	__USE_PROTO__
__LOCAL__ short (SHEAD_WRITEOPT) (shead_t * sheadp)
#else
__LOCAL__ short
SHEAD_WRITEOPT __ARGS ((sheadp))
shead_t	      *	sheadp;
#endif
{
	short		snapshot;
	pl_t		saved_pl = SHEAD_LOCK (sheadp);

	snapshot = sheadp->sh_wropt;
	SHEAD_UNLOCK (sheadp, saved_pl);

	return snapshot;
}


/*
 * Stream head user-level putmsg ()/putpmsg () processing: build a message
 * from the caller's control and data buffers and queue it on the write side
 * of the stream head. Returns 0 (and sets "* rvalp" to 0) on success, or the
 * error number reported by SHEAD_MAKEMSG () if no message could be built.
 *
 * As with getmsg (), there are enough parameters being passed that this
 * should be abstracted into a structure like uio(D4DK).
 */

#if	__USE_PROTO__
int (STREAMS_PUTPMSG) (shead_t * sheadp, __CONST__ struct strbuf * ctlbuf,
		       __CONST__ struct strbuf * databuf, int band, int flags,
		       int mode, cred_t * credp, int * rvalp)
#else
int
STREAMS_PUTPMSG __ARGS ((sheadp, ctlbuf, databuf, band, flags, mode, credp,
			 rvalp))
shead_t	      *	sheadp;
int		mode;
__CONST__ struct strbuf
	      *	ctlbuf;
__CONST__ struct strbuf
	      *	databuf;
int		band;
int		flags;
cred_t	      *	credp;
int	      *	rvalp;
#endif
{
	int		error;
	mblk_t	      *	newmsg;

	/*
	 * Build the (possibly high-priority) message. This can fail if the
	 * requested message is too large for the limits advertised by the
	 * next thing downstream. SHEAD_MAKEMSG () also takes care of the
	 * error and hangup checks, FNDELAY/FNONBLOCK, and flow control.
	 */

	newmsg = SHEAD_MAKEMSG (sheadp, mode, ctlbuf, databuf, flags, band,
				& error);

	if (newmsg == NULL)
		return error;

	/*
	 * We do not check for a hangup again here, even though we may have
	 * waited a potentially long time in SHEAD_MAKEMSG (). There is
	 * plenty of slop in the hangup condition anyway, given the time
	 * write messages take to move down the queue; drivers and modules
	 * have to be able to cope.
	 */

	putq (W (sheadp->sh_head), newmsg);

	* rvalp = 0;
	return 0;
}


/*
 * Stream head user-level write processing.
 *
 * The user's data is cut into one or more messages, each no larger than the
 * maximum packet size of the topmost queue less the stream head write
 * offset, and each message is queued on the write side of the stream head.
 *
 * Returns 0 on success (including a short write) or an error number.
 */

#if	__USE_PROTO__
int (STREAMS_WRITE) (shead_t * sheadp, uio_t * uiop, cred_t * credp)
#else
int
STREAMS_WRITE __ARGS ((sheadp, uiop, credp))
shead_t	      *	sheadp;
uio_t	      *	uiop;
cred_t	      *	credp;
#endif
{
	queue_t	      *	topq;
	short		wropt = SHEAD_WRITEOPT (sheadp);
	mblk_t	      *	blk;
	int		chunk;
	int		offset;
	int		error;
	int		mode;
	int		requested;

	if ((uiop->uio_fmode & FWRITE) == 0)
		return EBADF;

	mode = uiop->uio_fmode & ~ FREAD;

	if (sheadp->sh_linked != NULL)
		return EINVAL;

	/*
	 * A zero-length write is a no-op unless the stream has been set up
	 * (SNDZERO) to turn it into a zero-length message.
	 */

	requested = uiop->uio_resid;

	if (requested == 0 && (wropt & SNDZERO) == 0)
		return 0;

	/*
	 * Unlike putmsg ()/putpmsg (), a write () may spread its data over
	 * several messages and take a considerable time doing so; we
	 * therefore lock the stream head so that only one write () is in
	 * progress at any time.
	 *
	 * The primary purpose of this is to allow PIPE_BUF to be effectively
	 * unlimited. This is an experimental idea, though.
	 */

	error = SHEAD_SLEEP_LOCK (sheadp, SH_WRITE_LOCK, 0, CHECK_SIGNALS);

	if (error != 0)
		return error;

	offset = SHEAD_WRITEOFFSET (sheadp);

	topq = TOP_QUEUE (sheadp);

	do {
		chunk = uiop->uio_resid;

		if (chunk + offset > topq->q_maxpsz) {
			/*
			 * Special case (documented on the write (2) manual
			 * page): if we can't fit within the max/min range
			 * and the minimum is greater than 0, return ERANGE.
			 */

			if (topq->q_minpsz > 0) {

				error = ERANGE;
				break;
			}

			chunk = topq->q_maxpsz - offset;

		} else if (chunk + offset < topq->q_minpsz) {

			error = ERANGE;
			break;
		}


		/*
		 * If the stream is flow controlled we either block or get
		 * EAGAIN back, depending on the FNDELAY/FNONBLOCK setting.
		 */

		error = SHEAD_WRITE_TEST (sheadp, mode, 0, 0);

		if (error != 0)
			break;

		blk = MSGB_ALLOC (chunk + offset, BPRI_LO, KM_SLEEP);

		if (blk == NULL) {
			error = ENOSR;
			break;
		}

		/*
		 * Leave the write offset free at the front of the message
		 * block, and start with an empty data area after it.
		 */

		blk->b_rptr += offset;
		blk->b_wptr = blk->b_rptr;

		if (chunk > 0 &&
		    uiomove (blk->b_rptr, chunk, UIO_WRITE, uiop) != 0) {

			freemsg (blk);

			error = EFAULT;
			break;
		}

		blk->b_wptr += chunk;

		/*
		 * Send the message on its way, and see if there is more
		 * data.
		 */

		putq (W (sheadp->sh_head), blk);

	} while (uiop->uio_resid > 0);


	/*
	 * An error that strikes after some data has actually been written
	 * is reported as a short write rather than as a failure.
	 */

	if (error != 0 && uiop->uio_resid != requested)
		error = 0;

	SHEAD_SLEEP_UNLOCK (sheadp, SH_WRITE_LOCK);
	return error;
}


/*
 * Stream head user-level open processing.
 *
 * Locks (creating if necessary) the stream head for "* devp", then calls the
 * open entry point of every module on the stream and finally of the driver.
 * The driver alone may change the device number, and only on the first open
 * of the stream; in that case the new number is passed back through "devp".
 *
 * Returns 0 on success or an error number on failure.
 */

extern struct streamtab headinfo;

#if	__USE_PROTO__
int (STREAMS_OPEN) (n_dev_t * devp, struct streamtab * stabp, int mode,
		    cred_t * credp, int cloneflag)
#else
int
STREAMS_OPEN (devp, stabp, mode, credp, cloneflag)
n_dev_t	      *	devp;
struct streamtab
	      *	stabp;
int		mode;
cred_t	      *	credp;
int		cloneflag;
#endif
{
	shead_t	      *	sheadp;
	queue_t	      *	q;
	queue_t	      *	drvq;
	int		retval;
	n_dev_t		dev;

	ASSERT (devp != NULL);
	ASSERT (stabp != NULL);
	ASSERT (credp != NULL);

	dev = * devp;

	if ((sheadp = SHEAD_OPEN_LOCK (* devp, stabp, & retval)) == NULL)
		return retval;

	/*
	 * If this is the first open of the stream, set up the stream head
	 * entry points and the driver entry points.
	 */

	q = W (sheadp->sh_head)->q_next;

	if (sheadp->sh_open_count == 0) {

		QUEUE_INIT (sheadp->sh_head, & headinfo, QI_NORMAL);
		qprocson (sheadp->sh_head);

		QUEUE_INIT (R (q), stabp, QI_NORMAL);
	}


	/*
	 * Now we have a stream head (locked, no less), we can call the open
	 * entry points of all the modules and the driver. In the special case
	 * where the open count of the entry is 0, we allow the driver to
	 * change the "dev_t" value to a previously unused number.
	 *
	 * The walk leaves "q" set to NULL, so we keep track of the last
	 * queue visited (the driver's) in "drvq" for the diagnostics below.
	 */

	drvq = q;

	do {
		drvq = q;

		retval = (* R (q)->q_qinfo->qi_qopen)
				(R (q), & dev, mode,
				 q->q_next != NULL ? MODOPEN :
					 cloneflag ? CLONEOPEN : 0, credp);

		if (dev != * devp && q->q_next != NULL) {
			/*
			 * A module has changed the device number that we
			 * passed a pointer to. This is not valid!
			 */

			cmn_err (CE_WARN, "Module \"%s\" changed its device number",
				 q->q_qinfo->qi_minfo->mi_idname);
			retval = ENXIO;
		}

		if (retval != 0)
			goto failure;
	} while ((q = q->q_next) != NULL);


	/*
	 * The modules and driver have all OK'ed the open, so increment the
	 * open count. Here we also check for the clone case.
	 *
	 * If we want to detect an error after this point, we should execute
	 * a close.
	 */

	sheadp->sh_open_count ++;

	if (dev != * devp) {
		/*
		 * The driver has requested that the device number of the
		 * queue be assigned differently than the initial device
		 * number. This is only really valid if this is the first open
		 * of the given queue.
		 */

		if (sheadp->sh_open_count > 1) {

			cmn_err (CE_WARN, "Driver \"%s\" changed its device number after inital open",
				 drvq->q_qinfo->qi_minfo->mi_idname);

			sheadp->sh_open_count --;

			retval = ENXIO;
			goto failure;
		}


		/*
		 * Other open attempts may be waiting on the stream head for
		 * the original device number; they need a wakeup.
		 */

		if (SHEAD_RENAME (sheadp, dev) != 0) {

			cmn_err (CE_WARN, "Clone device number chosen by driver \"%s\"is in use",
				 drvq->q_qinfo->qi_minfo->mi_idname);

			if (-- sheadp->sh_open_count == 0)
				SHEAD_DO_CLOSE (sheadp, mode, credp);

			retval = ENXIO;
			goto failure;
		}

		* devp = dev;
	}

failure:
	/*
	 * Common exit, reached both on success (with "retval" equal to 0)
	 * and when a module or the driver has failed the open request. We
	 * unlock the stream head, which may deallocate the stream head if
	 * the open count is 0.
	 */

	SHEAD_SLEEP_UNLOCK (sheadp, SH_OPENCLOSE);
	return retval;
}


/*
 * Stream head interface to generic polling.
 *
 * "events" is the set of events the caller is interested in. If any of them
 * is currently pending it is reported through "reventsp"; otherwise
 * "* reventsp" is set to 0 and, if "anyyet" is zero, the stream's pollhead
 * is passed back through "phpp". Always returns 0.
 */

#if	__USE_PROTO__
int (STREAMS_CHPOLL) (shead_t * sheadp, short events, int anyyet,
		      short * reventsp, struct pollhead ** phpp)
#else
int
STREAMS_CHPOLL __ARGS ((sheadp, events, anyyet, reventsp, phpp))
shead_t	      *	sheadp;
short		events;
int		anyyet;
short	      *	reventsp;
struct pollhead
	     **	phpp;
#endif
{
	short		my_events;

	/*
	 * The chpoll () entry point uses the POLL... constants rather than
	 * the S_... constants that I_SETSIG uses. We convert to the S_...
	 * form for our internal use... see <sys/poll.h>
	 *
	 * NOTE(review): the original mask listed __POLL_OUTPUT twice; one
	 * of the two may have been meant to be a separate "write normal"
	 * bit. Confirm against <sys/poll.h>.
	 */

	my_events = (events & (__POLL_INPUT | __POLL_HIPRI | __POLL_OUTPUT |
			       __POLL_RDNORM | __POLL_RDBAND |
			       __POLL_WRBAND));

	/*
	 * Error and hangup are added to the set of events being checked;
	 * they must not displace the other events the caller asked for.
	 */

	if ((events & POLLERR) != 0)
		my_events |= S_ERROR;
	if ((events & POLLHUP) != 0)
		my_events |= S_HANGUP;

	if ((my_events = SHEAD_POLL_CHECK (sheadp, my_events)) == 0) {

		* reventsp = 0;

		if (anyyet == 0)
			* phpp = sheadp->sh_pollhead;
	} else
		* reventsp = my_events;

	return 0;
}


/*
 * Stream head user-level close processing.
 *
 * Drops one reference from the stream head open count while holding the
 * open/close and ioctl sleep locks; when the count reaches zero the final
 * close is carried out by SHEAD_DO_CLOSE (). Returns 0, or the error from
 * SHEAD_SLEEP_LOCK () if the locks could not be obtained.
 */

#if	__USE_PROTO__
int (STREAMS_CLOSE) (shead_t * sheadp, int mode, cred_t * credp)
#else
int
STREAMS_CLOSE __ARGS ((sheadp, mode, credp))
shead_t	      *	sheadp;
int		mode;
cred_t	      *	credp;
#endif
{
	int		error;

	error = SHEAD_SLEEP_LOCK (sheadp, SH_OPENCLOSE | SH_IOCTL_LOCK, 0,
				  CHECK_SIGNALS);

	if (error != 0)
		return error;

	sheadp->sh_open_count --;

	if (sheadp->sh_open_count == 0)
		SHEAD_DO_CLOSE (sheadp, mode, credp);

	SHEAD_SLEEP_UNLOCK (sheadp, SH_OPENCLOSE | SH_IOCTL_LOCK);
	return 0;
}