OpenSolaris_b135/stand/lib/sock/socket.c

Compare this file to the similar file:
Show the results in this format:

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 *
 * socket.c, Code implementing a simple socket interface.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include "socket_impl.h"
#include <sys/isa_defs.h>
#include <sys/sysmacros.h>
#include <sys/bootconf.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>
#include <sys/uio.h>
#include <sys/salib.h>
#include "socket_inet.h"
#include "ipv4.h"
#include "ipv4_impl.h"
#include "udp_inet.h"
#include "tcp_inet.h"
#include "mac.h"
#include "mac_impl.h"
#include <sys/promif.h>

/*
 * Table of every socket slot handled by this file, indexed by socket id
 * (the value so_check_fd() returns).  socket() marks all entries
 * INETBOOT_UNUSED the first time it is called.
 */
struct inetboot_socket	sockets[MAXSOCKET] = { 0 };

/*
 * Default send and receive socket buffer size.  The expansions are
 * parenthesized so the macros behave as single values inside larger
 * expressions (e.g. `x / SO_DEF_RCVBUF').
 */
#define	SO_DEF_SNDBUF	(48 * 1024)
#define	SO_DEF_RCVBUF	(48 * 1024)

/* Default max socket buffer size */
#define	SO_MAX_BUF	(4 * 1024 * 1024)

/*
 * Forward declarations of the file-private helpers defined further down:
 * the per-socket-type back ends of sendto() and the bind helpers used by
 * bind() and connect().
 */
static ssize_t dgram_sendto(int, const void *, size_t, int,
    const struct sockaddr *, int);
static ssize_t stream_sendto(int, const void *, size_t, int);
static int bind_check(int, const struct sockaddr *);
static int quickbind(int);

/*
 * Translate descriptor `fd' into its index in sockets[].  Returns the
 * index on success.  If the descriptor maps outside the table or names a
 * slot that is not in use, ENOTSOCK is stored through `errno' and -1 is
 * returned.
 */
int
so_check_fd(int fd, int *errno)
{
	int idx = FD_TO_SOCKET(fd);

	if (idx >= 0 && idx < MAXSOCKET &&
	    sockets[idx].type != INETBOOT_UNUSED)
		return (idx);

	*errno = ENOTSOCK;
	return (-1);
}

/*
 * Create an endpoint for network communication. Returns a descriptor.
 *
 * Notes:
 *	Only PF_INET communication domains are supported. Within
 * 	this domain, only SOCK_RAW, SOCK_DGRAM and SOCK_STREAM types are
 *	supported.
 */
int
socket(int domain, int type, int protocol)
{
	static int sock_initialized;
	int i;

	errno = 0;

	if (!sock_initialized) {
		for (i = 0; i < MAXSOCKET; i++)
			sockets[i].type = INETBOOT_UNUSED;
		sock_initialized = B_TRUE;
	}
	if (domain != AF_INET) {
		errno = EPROTONOSUPPORT;
		return (-1);
	}

	/* Find available socket */
	for (i = 0; i < MAXSOCKET; i++) {
		if (sockets[i].type == INETBOOT_UNUSED)
			break;
	}
	if (i >= MAXSOCKET) {
		errno = EMFILE;	/* No slots left. */
		return (-1);
	}

	/* Some socket initialization... */
	sockets[i].so_rcvbuf = SO_DEF_RCVBUF;
	sockets[i].so_sndbuf = SO_DEF_SNDBUF;

	/*
	 * Note that we ignore the protocol field for SOCK_DGRAM and
	 * SOCK_STREAM.  When we support different protocols in future,
	 * this needs to be changed.
	 */
	switch (type) {
	case SOCK_RAW:
		ipv4_raw_socket(&sockets[i], (uint8_t)protocol);
		break;
	case SOCK_DGRAM:
		udp_socket_init(&sockets[i]);
		break;
	case SOCK_STREAM:
		tcp_socket_init(&sockets[i]);
		break;
	default:
		errno = EPROTOTYPE;
		break;
	}

	if (errno != 0)
		return (-1);

	/* IPv4 generic initialization. */
	ipv4_socket_init(&sockets[i]);

	/* MAC generic initialization. */
	mac_socket_init(&sockets[i]);

	return (i + SOCKETTYPE);
}

/*
 * Copy the address socket `s' is bound to into `name' and set `*namelen'
 * to sizeof (struct sockaddr_in).  Fails with ENOMEM if the caller's
 * buffer (`*namelen' on entry) is too small for a struct sockaddr_in.
 * Returns 0 on success, -1 with errno set on failure.
 */
int
getsockname(int s, struct sockaddr *name,  socklen_t *namelen)
{
	struct sockaddr_in *out = (struct sockaddr_in *)name;
	int sock_id;

	errno = 0;
	sock_id = so_check_fd(s, &errno);
	if (sock_id == -1)
		return (-1);

	if (*namelen < sizeof (struct sockaddr_in)) {
		errno = ENOMEM;
		return (-1);
	}

	/* Whole-structure copy of the bound address. */
	*out = sockets[sock_id].bind;
	*namelen = sizeof (struct sockaddr_in);
	return (0);
}

/*
 * Retrieve the value of a socket option into `optval'.
 *
 * The SOL_SOCKET options we support are:
 * SO_RCVTIMEO	-	Value is in msecs, and is of uint32_t.
 * SO_DONTROUTE	-	Value is an int, and is a boolean (nonzero if set).
 * SO_REUSEADDR -	Value is an int boolean.
 * SO_RCVBUF -		Value is an int.
 * SO_SNDBUF -		Value is an int.
 * SO_LINGER -		Value is a struct linger.
 *
 * `*optlen' must exactly match the size of the option's value; if it
 * does not, `*optlen' is set to 0 and the call fails with EINVAL.
 * No options are implemented at the IPPROTO_TCP and IPPROTO_IP levels;
 * those, unknown levels and unknown options fail with ENOPROTOOPT.
 *
 * Returns 0 on success, -1 with errno set on failure.
 */
int
getsockopt(int s, int level, int option, void *optval, socklen_t *optlen)
{
	int i;

	errno = 0;
	if ((i = so_check_fd(s, &errno)) == -1)
		return (-1);

	switch (level) {
	case SOL_SOCKET: {
		switch (option) {
		case SO_RCVTIMEO:
			if (*optlen == sizeof (uint32_t)) {
				*(uint32_t *)optval = sockets[i].in_timeout;
			} else {
				*optlen = 0;
				errno = EINVAL;
			}
			break;
		case SO_DONTROUTE:
			if (*optlen == sizeof (int)) {
				*(int *)optval =
				    (sockets[i].out_flags & SO_DONTROUTE);
			} else {
				*optlen = 0;
				errno = EINVAL;
			}
			break;
		case SO_REUSEADDR:
			if (*optlen == sizeof (int)) {
				*(int *)optval =
				    (sockets[i].so_opt & SO_REUSEADDR);
			} else {
				*optlen = 0;
				errno = EINVAL;
			}
			break;
		case SO_RCVBUF:
			if (*optlen == sizeof (int)) {
				*(int *)optval = sockets[i].so_rcvbuf;
			} else {
				*optlen = 0;
				errno = EINVAL;
			}
			break;
		case SO_SNDBUF:
			if (*optlen == sizeof (int)) {
				*(int *)optval = sockets[i].so_sndbuf;
			} else {
				*optlen = 0;
				errno = EINVAL;
			}
			break;
		case SO_LINGER:
			if (*optlen == sizeof (struct linger)) {
				/* struct copy */
				*(struct linger *)optval = sockets[i].so_linger;
			} else {
				*optlen = 0;
				errno = EINVAL;
			}
			/*
			 * Must not fall into the default case, which would
			 * turn a successful lookup into ENOPROTOOPT.
			 */
			break;
		default:
			errno = ENOPROTOOPT;
			break;
		}
		break;
	} /* case SOL_SOCKET */
	case IPPROTO_TCP:
	case IPPROTO_IP: {
		switch (option) {
		default:
			*optlen = 0;
			errno = ENOPROTOOPT;
			break;
		}
		break;
	} /* case IPPROTO_IP or IPPROTO_TCP */
	default:
		errno = ENOPROTOOPT;
		break;
	} /* switch (level) */

	if (errno != 0)
		return (-1);
	else
		return (0);
}

/*
 * Generate a network-order source port from the privileged range if
 * "reserved" is true, dynamic/private range otherwise. We consider the
 * range of 512-1023 privileged ports as ports we can use. This mirrors
 * historical rpc client practice for privileged port selection.
 *
 * Both ranges are handed out round-robin from static counters: the
 * reserved counter restarts at IPPORT_RESERVED / 2 once it reaches
 * IPPORT_RESERVED, and the dynamic counter restarts at
 * IPPORT_DYNAMIC_START once it wraps around the top of the port space.
 */
in_port_t
get_source_port(boolean_t reserved)
{
	static in_port_t	dynamic = IPPORT_DYNAMIC_START - 1,
	    rsvdport = (IPPORT_RESERVED / 2) - 1;
	in_port_t		p;

	if (reserved) {
		if (++rsvdport >= IPPORT_RESERVED)
			p = rsvdport = IPPORT_RESERVED / 2;
		else
			p = rsvdport;
	} else {
		/*
		 * in_port_t is an unsigned 16-bit type, so incrementing
		 * past the highest port wraps to 0.  Never hand out port 0
		 * or anything else below the dynamic range.
		 */
		if (++dynamic < IPPORT_DYNAMIC_START)
			dynamic = IPPORT_DYNAMIC_START;
		p = dynamic;
	}

	return (htons(p));
}

/*
 * The socket options we support are:
 * SO_RECVTIMEO	-	Value is uint32_t msecs.
 * SO_DONTROUTE	-	Value is int boolean (nonzero == TRUE, zero == FALSE).
 * SO_REUSEADDR -	value is int boolean.
 * SO_RCVBUF -		Value is int.
 * SO_SNDBUF -		Value is int.
 */
int
setsockopt(int s, int level, int option, const void *optval, socklen_t optlen)
{
	int i;

	errno = 0;
	if ((i = so_check_fd(s, &errno)) == -1)
		return (-1);

	switch (level) {
	case SOL_SOCKET: {
		switch (option) {
		case SO_RCVTIMEO:
			if (optlen == sizeof (uint32_t))
				sockets[i].in_timeout = *(uint32_t *)optval;
			else {
				errno = EINVAL;
			}
			break;
		case SO_DONTROUTE:
			if (optlen == sizeof (int)) {
				if (*(int *)optval)
					sockets[i].out_flags |= SO_DONTROUTE;
				else
					sockets[i].out_flags &= ~SO_DONTROUTE;
			} else {
				errno = EINVAL;
			}
			break;
		case SO_REUSEADDR:
			if (optlen == sizeof (int)) {
				if (*(int *)optval)
					sockets[i].so_opt |= SO_REUSEADDR;
				else
					sockets[i].so_opt &= ~SO_REUSEADDR;
			} else {
				errno = EINVAL;
			}
			break;
		case SO_RCVBUF:
			if (optlen == sizeof (int)) {
				sockets[i].so_rcvbuf = *(int *)optval;
				if (sockets[i].so_rcvbuf > SO_MAX_BUF)
					sockets[i].so_rcvbuf = SO_MAX_BUF;
				(void) tcp_opt_set(sockets[i].pcb,
				    level, option, optval, optlen);
			} else {
				errno = EINVAL;
			}
			break;
		case SO_SNDBUF:
			if (optlen == sizeof (int)) {
				sockets[i].so_sndbuf = *(int *)optval;
				if (sockets[i].so_sndbuf > SO_MAX_BUF)
					sockets[i].so_sndbuf = SO_MAX_BUF;
				(void) tcp_opt_set(sockets[i].pcb,
				    level, option, optval, optlen);
			} else {
				errno = EINVAL;
			}
			break;
		case SO_LINGER:
			if (optlen == sizeof (struct linger)) {
				/* struct copy */
				sockets[i].so_linger = *(struct linger *)optval;
				(void) tcp_opt_set(sockets[i].pcb,
				    level, option, optval, optlen);
			} else {
				errno = EINVAL;
			}
			break;
		default:
			errno = ENOPROTOOPT;
			break;
		}
		break;
	} /* case SOL_SOCKET */
	case IPPROTO_TCP:
	case IPPROTO_IP: {
		switch (option) {
		default:
			errno = ENOPROTOOPT;
			break;
		}
		break;
	} /* case IPPROTO_IP  or IPPROTO_TCP */
	default:
		errno = ENOPROTOOPT;
		break;
	} /* switch (level) */

	if (errno != 0)
		return (-1);
	else
		return (0);
}

/*
 * Shut down part of a full-duplex connection.
 *
 * Only supported for TCP sockets
 */
int
shutdown(int s, int how)
{
	int sock_id;
	int i;

	errno = 0;
	if ((sock_id = so_check_fd(s, &errno)) == -1)
		return (-1);

	/* shutdown only supported for TCP sockets */
	if (sockets[sock_id].type != INETBOOT_STREAM) {
		errno = EOPNOTSUPP;
		return (-1);
	}

	if (!(sockets[sock_id].so_state & SS_ISCONNECTED)) {
		errno = ENOTCONN;
		return (-1);
	}

	switch (how) {
	case 0:
		sockets[sock_id].so_state |= SS_CANTRCVMORE;
		break;
	case 1:
		sockets[sock_id].so_state |= SS_CANTSENDMORE;
		break;
	case 2:
		sockets[sock_id].so_state |= (SS_CANTRCVMORE | SS_CANTSENDMORE);
		break;
	default:
		errno = EINVAL;
		return (-1);
	}

	switch (sockets[sock_id].so_state &
	    (SS_CANTRCVMORE | SS_CANTSENDMORE)) {
	case (SS_CANTRCVMORE | SS_CANTSENDMORE):
		/* Call lower level protocol close routine. */
		for (i = TRANSPORT_LVL; i >= MEDIA_LVL; i--) {
			if (sockets[sock_id].close[i] != NULL) {
				(void) sockets[sock_id].close[i](sock_id);
			}
		}
		nuke_grams(&sockets[sock_id].inq);
		break;
	case SS_CANTRCVMORE:
		nuke_grams(&sockets[sock_id].inq);
		break;
	case SS_CANTSENDMORE:
		/* Call lower level protocol close routine. */
		if (tcp_shutdown(sock_id) < 0)
			return (-1);
		break;
	default:
		errno = EINVAL;
		return (-1);
	}

	return (0);
}

/*
 * "close" a socket.
 */
int
socket_close(int s)
{
	int sock_id, i;

	errno = 0;
	if ((sock_id = so_check_fd(s, &errno)) == -1)
		return (-1);

	/* Call lower level protocol close routine. */
	for (i = TRANSPORT_LVL; i >= MEDIA_LVL; i--) {
		if (sockets[sock_id].close[i] != NULL) {
			/*
			 * Note that the close() routine of other
			 * layers can return an error.  But right
			 * now, the only mechanism to report that
			 * back is for the close() routine to set
			 * the errno and socket_close() will return
			 * an error.  But the close operation will
			 * not be stopped.
			 */
			(void) sockets[sock_id].close[i](sock_id);
		}
	}

	/*
	 * Clear the input queue.  This has to be done
	 * after the lower level protocol close routines have been
	 * called as they may want to do something about the queue.
	 */
	nuke_grams(&sockets[sock_id].inq);

	bzero((caddr_t)&sockets[sock_id], sizeof (struct inetboot_socket));
	sockets[sock_id].type = INETBOOT_UNUSED;

	return (0);
}

/*
 * Read up to `nbyte' bytes from socket `s' into `buf' by polling
 * recvfrom() in non-blocking mode.  If `read_timeout' is non-zero, give
 * up with errno set to EINTR once more than `read_timeout' seconds have
 * gone by (elapsed time is prom_gettime() ticks, presumably
 * milliseconds, rounded to the nearest second).  Returns the number of
 * bytes read, or -1 on failure.
 */
int
socket_read(int s, void *buf, size_t nbyte, int read_timeout)
{
	ssize_t	got;
	uint_t	t0, elapsed;

	t0 = prom_gettime();
	do {
		got = recvfrom(s, buf, nbyte, MSG_DONTWAIT, NULL, NULL);

		/* Anything but "no data yet" ends the polling. */
		if (got != -1 || errno != EWOULDBLOCK)
			return (got);

		elapsed = (uint_t)((prom_gettime() - t0) + 500) / 1000;
	} while (read_timeout == 0 || elapsed <= read_timeout);

	errno = EINTR;
	return (-1);
}

/*
 * Send `nbyte' bytes from `buf' to the address `addr' points to, using
 * socket `s'.  This is sendto() with no flags.  Returns the number of
 * bytes written on success, or -1 on failure.
 */
int
socket_write(int s, const void *buf, size_t nbyte, struct sockaddr_in *addr)
{
	struct sockaddr *dst = (struct sockaddr *)addr;

	return (sendto(s, buf, nbyte, 0, dst, sizeof (*addr)));
}

/*
 * Check whether `addr' may be bound to socket `sock_id'.  Unless the
 * socket has SO_REUSEADDR set, the address is refused (EADDRINUSE, -1)
 * when an in-use, bound socket of the same protocol already holds the
 * same IP address and port.  Returns 0 if the address is acceptable.
 */
static int
bind_check(int sock_id, const struct sockaddr *addr)
{
	struct sockaddr_in *want = (struct sockaddr_in *)addr;
	int slot;

	/* Do not check for duplicate bind() if SO_REUSEADDR option is set. */
	if (sockets[sock_id].so_opt & SO_REUSEADDR)
		return (0);

	for (slot = 0; slot < MAXSOCKET; slot++) {
		/* Only bound sockets of the same protocol can conflict. */
		if (sockets[slot].type == INETBOOT_UNUSED ||
		    sockets[slot].proto != sockets[sock_id].proto ||
		    !sockets[slot].bound)
			continue;

		if (sockets[slot].bind.sin_addr.s_addr ==
		    want->sin_addr.s_addr &&
		    sockets[slot].bind.sin_port == want->sin_port) {
			errno = EADDRINUSE;
			return (-1);
		}
	}
	return (0);
}

/*
 * Assign a name (local address) to an unnamed socket.
 *
 * A NULL `name' unbinds the socket and always succeeds.  Otherwise
 * `namelen' must be sizeof (struct sockaddr_in) (EINVAL) and the address
 * family must be AF_INET (EAFNOSUPPORT).  Re-binding an already bound
 * socket succeeds only if the address is identical to the current one
 * (EINVAL otherwise).  For stream sockets tcp_bind() is called as well.
 *
 * Returns 0 on success, -1 with errno set on failure.
 */
int
bind(int s, const struct sockaddr *name, socklen_t namelen)
{
	int i;

	errno = 0;

	if ((i = so_check_fd(s, &errno)) == -1)
		return (-1);

	if (name == NULL) {
		/* unbind */
		if (sockets[i].bound) {
			bzero((caddr_t)&sockets[i].bind,
			    sizeof (struct sockaddr_in));
			sockets[i].bound = B_FALSE;
		}
		return (0);
	}
	/* `name' is known to be non-NULL from here on. */
	if (namelen != sizeof (struct sockaddr_in)) {
		errno = EINVAL;
		return (-1);
	}
	if (name->sa_family != AF_INET) {
		errno = EAFNOSUPPORT;
		return (-1);
	}
	if (sockets[i].bound) {
		if (bcmp((caddr_t)&sockets[i].bind, (caddr_t)name,
		    namelen) == 0) {
			/* attempt to bind to same address ok... */
			return (0);
		}
		errno = EINVAL;	/* already bound */
		return (-1);
	}

	/* Check for duplicate bind(). */
	if (bind_check(i, name) < 0)
		return (-1);

	/*
	 * The address is stored before tcp_bind() is called; the socket is
	 * only flagged as bound once the transport has accepted it.
	 */
	bcopy((caddr_t)name, (caddr_t)&sockets[i].bind, namelen);
	if (sockets[i].type == INETBOOT_STREAM) {
		if (tcp_bind(i) < 0) {
			return (-1);
		}
	}
	sockets[i].bound = B_TRUE;

	return (0);
}

/*
 * Bind socket `sock_id' to the local IP address and the first anonymous
 * port (SMALLEST_ANON_PORT..LARGEST_ANON_PORT) that bind_check() accepts.
 * Returns 0 on success, or -1 if no port in the range is available.
 * errno is always cleared on return.
 */
static int
quickbind(int sock_id)
{
	int i;
	struct sockaddr_in addr;

	/*
	 * Start from an all-zero address so that no uninitialized stack
	 * bytes (e.g. sin_zero) end up in sockets[].bind, which bind()
	 * compares byte for byte.
	 */
	bzero((caddr_t)&addr, sizeof (addr));

	/*
	 * XXX This needs more work.  Right now, if ipv4_setipaddr()
	 * have not been called, this will be wrong.  But we need
	 * something better.  Need to be revisited.
	 */
	ipv4_getipaddr(&addr.sin_addr);
	addr.sin_family = AF_INET;

	for (i = SMALLEST_ANON_PORT; i <= LARGEST_ANON_PORT; i++) {
		addr.sin_port = htons(i);
		if (bind_check(sock_id, (struct sockaddr *)&addr) == 0)
			break;
	}
	/* Need to clear errno as it is probably set by bind_check(). */
	errno = 0;

	if (i <= LARGEST_ANON_PORT) {
		bcopy((caddr_t)&addr, (caddr_t)&sockets[sock_id].bind,
		    sizeof (struct sockaddr_in));
		sockets[sock_id].bound = B_TRUE;
#ifdef DEBUG
		printf("quick bind done addr %s port %d\n",
		    inet_ntoa(sockets[sock_id].bind.sin_addr),
		    ntohs(sockets[sock_id].bind.sin_port));
#endif
		return (0);
	} else {
		return (-1);
	}
}

int
listen(int fd, int backlog)
{
	int sock_id;

	errno = 0;
	if ((sock_id = so_check_fd(fd, &errno)) == -1)
		return (-1);

	if (sockets[sock_id].type != INETBOOT_STREAM) {
		errno = EOPNOTSUPP;
		return (-1);
	}
	if (sockets[sock_id].so_error != 0) {
		errno = sockets[sock_id].so_error;
		return (-1);
	}
	return (tcp_listen(sock_id, backlog));
}

/*
 * Accept a connection on listening stream socket `fd'.  The work is done
 * by tcp_accept(); the socket it returns is marked connected and its
 * descriptor is handed back to the caller.
 *
 * Non-stream sockets fail with EOPNOTSUPP, and a pending socket error is
 * reported through errno.  Returns the new descriptor, or -1 on failure.
 */
int
accept(int fd, struct sockaddr *addr,  socklen_t *addr_len)
{
	int sock_id;
	int new_sd;

	errno = 0;
	if ((sock_id = so_check_fd(fd, &errno)) == -1)
		return (-1);

	if (sockets[sock_id].type != INETBOOT_STREAM) {
		errno = EOPNOTSUPP;
		return (-1);
	}
	if (sockets[sock_id].so_error != 0) {
		errno = sockets[sock_id].so_error;
		return (-1);
	}
	if ((new_sd = tcp_accept(sock_id, addr, addr_len)) == -1)
		return (-1);

	/*
	 * Validate the descriptor before using it as an index; an invalid
	 * one must not be allowed to reach sockets[-1].
	 */
	if ((sock_id = so_check_fd(new_sd, &errno)) == -1)
		return (-1);
	sockets[sock_id].so_state |= SS_ISCONNECTED;
	return (new_sd);
}

int
connect(int fd, const  struct sockaddr *addr, socklen_t addr_len)
{
	int sock_id;
	int so_type;

	errno = 0;
	if ((sock_id = so_check_fd(fd, &errno)) == -1)
		return (-1);

	so_type = sockets[sock_id].type;

	if (addr == NULL || addr_len == 0) {
		errno = EINVAL;
		return (-1);
	}
	/* Don't allow connect for raw socket. */
	if (so_type == INETBOOT_RAW) {
		errno = EPROTONOSUPPORT;
		return (-1);
	}

	if (sockets[sock_id].so_state & SS_ISCONNECTED) {
		errno = EINVAL;
		return (-1);
	}

	if (sockets[sock_id].so_error != 0) {
		errno = sockets[sock_id].so_error;
		return (-1);
	}

	/* If the socket is not bound, we need to do a quick bind. */
	if (!sockets[sock_id].bound) {
		/* For TCP socket, just call tcp_bind(). */
		if (so_type == INETBOOT_STREAM) {
			if (tcp_bind(sock_id) < 0)
				return (-1);
		} else {
			if (quickbind(sock_id) < 0) {
				errno = EADDRNOTAVAIL;
				return (-1);
			}
		}
	}
	/* Should do some sanity check for addr .... */
	bcopy((caddr_t)addr, &sockets[sock_id].remote,
	    sizeof (struct sockaddr_in));

	if (sockets[sock_id].type == INETBOOT_STREAM) {
		/* Call TCP connect routine. */
		if (tcp_connect(sock_id) == 0)
			sockets[sock_id].so_state |= SS_ISCONNECTED;
		else {
			if (sockets[sock_id].so_error != 0)
				errno = sockets[sock_id].so_error;
			return (-1);
		}
	} else {
		sockets[sock_id].so_state |= SS_ISCONNECTED;
	}
	return (0);
}

/*
 * Receive data from socket `s'.  Equivalent to recvfrom() with no
 * interest in the sender's address.
 */
ssize_t
recv(int s, void *buf, size_t len, int flags)
{
	ssize_t got;

	got = recvfrom(s, buf, len, flags, NULL, NULL);
	return (got);
}

/*
 * Receive a message from socket `s' into `buf', copying at most `len'
 * bytes.  Despite the historical wording, this handles stream sockets as
 * well as datagram and raw ones.  Legal flags are 0 and MSG_DONTWAIT.
 * MSG_WAITALL is not currently supported.
 *
 * If `from' and `fromlen' are both non-NULL the peer address is copied
 * out: the socket's remote address for stream sockets, the sender address
 * stored in the inetgram otherwise.  Only the non-stream case updates
 * `*fromlen'.
 *
 * Without MSG_DONTWAIT the call keeps polling the lower layers until data
 * arrives; with it, EWOULDBLOCK is returned when nothing is queued.
 *
 * Errors: EINVAL for a bad descriptor (replacing the ENOTSOCK stored by
 * so_check_fd()), a NULL buffer, a zero length or unsupported flags;
 * ENOTCONN for an unconnected stream socket; otherwise the socket's
 * so_error when a lower layer reports failure.
 *
 * Returns length of message for success (0 once receiving has been shut
 * down and no error is pending), -1 if error occurred.
 */
ssize_t
recvfrom(int s, void *buf, size_t len, int flags, struct sockaddr *from,
    socklen_t *fromlen)
{
	int			sock_id, i;
	ssize_t			datalen, bytes = 0;
	struct inetgram		*icp;
	enum SockType		so_type;
	char			*tmp_buf;
	mblk_t			*mp;

	errno = 0;

	if ((sock_id = so_check_fd(s, &errno)) == -1) {
		errno = EINVAL;
		return (-1);
	}

	if (sockets[sock_id].type == INETBOOT_STREAM &&
	    !(sockets[sock_id].so_state & SS_ISCONNECTED)) {
		errno = ENOTCONN;
		return (-1);
	}

	if (buf == NULL || len == 0) {
		errno = EINVAL;
		return (-1);
	}
	/* Yup - MSG_WAITALL not implemented */
	if ((flags & ~MSG_DONTWAIT) != 0) {
		errno = EINVAL;
		return (-1);
	}

retry:
	/*
	 * Only poll the network when nothing is queued.  Each layer's input
	 * routine is run from the media level upwards.
	 */
	if (sockets[sock_id].inq == NULL) {
		/* Go out and check the wire */
		for (i = MEDIA_LVL; i < APP_LVL; i++) {
			if (sockets[sock_id].input[i] != NULL) {
				if (sockets[sock_id].input[i](sock_id) < 0) {
					if (sockets[sock_id].so_error != 0) {
						errno =
						    sockets[sock_id].so_error;
					}
					return (-1);
				}
			}
		}
	}

	so_type = sockets[sock_id].type;

	/* Remove unknown inetgrams from the head of inq.  Can this happen? */
	while ((icp = sockets[sock_id].inq) != NULL) {
		if ((so_type == INETBOOT_DGRAM ||
		    so_type == INETBOOT_STREAM) &&
		    icp->igm_level != APP_LVL) {
#ifdef	DEBUG
			printf("recvfrom: unexpected level %d frame found\n",
			    icp->igm_level);
#endif	/* DEBUG */
			del_gram(&sockets[sock_id].inq, icp, B_TRUE);
			continue;
		} else {
			break;
		}
	}


	/* icp is now the first usable inetgram, or NULL if there is none. */
	if (icp == NULL) {
		/*
		 * Checking for error should be done everytime a lower layer
		 * input routing is called.  For example, if TCP gets a RST,
		 * this should be reported asap.
		 */
		if (sockets[sock_id].so_state & SS_CANTRCVMORE) {
			if (sockets[sock_id].so_error != 0) {
				errno = sockets[sock_id].so_error;
				return (-1);
			} else {
				return (0);
			}
		}

		if ((flags & MSG_DONTWAIT) == 0)
			goto retry;	/* wait forever */

		/* no data */
		errno = EWOULDBLOCK;
		return (-1);
	}

	if (from != NULL && fromlen != NULL) {
		switch (so_type) {
		case INETBOOT_STREAM:
			/* Need to copy from the socket's remote address. */
			bcopy(&(sockets[sock_id].remote), from, MIN(*fromlen,
			    sizeof (struct sockaddr_in)));
			break;
		case INETBOOT_RAW:
		case INETBOOT_DGRAM:
		default:
			/* Report back how much address data is provided. */
			if (*fromlen > sizeof (icp->igm_saddr))
				*fromlen = sizeof (icp->igm_saddr);
			bcopy((caddr_t)&(icp->igm_saddr), (caddr_t)from,
			    MIN(*fromlen, sizeof (struct sockaddr_in)));
			break;
		}
	}

	mp = icp->igm_mp;
	switch (so_type) {
	case INETBOOT_STREAM:
		/*
		 * If the message has igm_id == TCP_CALLB_MAGIC_ID, we need
		 * to drain the data held by tcp and try again.
		 */
		if (icp->igm_id == TCP_CALLB_MAGIC_ID) {
			del_gram(&sockets[sock_id].inq, icp, B_TRUE);
			tcp_rcv_drain_sock(sock_id);
			goto retry;
		}

		/*
		 * TCP should put only user data in the inetgram.
		 *
		 * Byte-stream semantics: keep consuming queued inetgrams
		 * until the caller's buffer is full or the queue is empty.
		 */
		tmp_buf = (char *)buf;
		while (len > 0 && icp != NULL) {
			datalen = mp->b_wptr - mp->b_rptr;
			if (len < datalen) {
				/*
				 * Partial read: the gram stays queued with
				 * its read pointer advanced.
				 */
				bcopy(mp->b_rptr, tmp_buf, len);
				bytes += len;
				mp->b_rptr += len;
				break;
			} else {
				bcopy(mp->b_rptr, tmp_buf, datalen);
				len -= datalen;
				bytes += datalen;
				tmp_buf += datalen;
				del_gram(&sockets[sock_id].inq, icp, B_TRUE);

				/*
				 * If we have any embedded magic messages just
				 * drop them.
				 */
				while ((icp = sockets[sock_id].inq) != NULL) {
					if (icp->igm_id != TCP_CALLB_MAGIC_ID)
						break;
					del_gram(&sockets[sock_id].inq, icp,
					    B_TRUE);
				}

				if (icp == NULL)
					break;
				mp = icp->igm_mp;
			}
		}
		/*
		 * Add the delivered bytes back to so_rcvbuf (presumably the
		 * TCP input path subtracts them when it queues data --
		 * confirm against the tcp code).
		 */
		sockets[sock_id].so_rcvbuf += (int32_t)bytes;
		break;
	case INETBOOT_DGRAM:
		/*
		 * Message semantics: copy what fits; the gram is deleted, so
		 * any remainder of the datagram is discarded.
		 */
		datalen = mp->b_wptr - mp->b_rptr;
		if (len < datalen)
			bytes = len;
		else
			bytes = datalen;
		bcopy(mp->b_rptr, buf, bytes);
		del_gram(&sockets[sock_id].inq, icp, B_TRUE);
		break;
	case INETBOOT_RAW:
	default:
		/* Same handling as the datagram case. */
		datalen = mp->b_wptr - mp->b_rptr;
		if (len < datalen)
			bytes = len;
		else
			bytes = datalen;
		bcopy(mp->b_rptr, buf, bytes);
		del_gram(&sockets[sock_id].inq, icp, B_TRUE);
		break;
	}

	/*
	 * NOTE(review): on the datagram and raw paths icp has already been
	 * freed by del_gram() above, so the DEBUG print below reads freed
	 * memory.
	 */
#ifdef	DEBUG
	printf("recvfrom(%d): data: (0x%x,%d)\n", sock_id,
	    (icp != NULL) ? icp->igm_mp : 0, bytes);
#endif	/* DEBUG */
	return (bytes);
}


/*
 * Send a message on socket `s'.  Equivalent to sendto() with no
 * destination address, so the socket has to be connected.
 */
ssize_t
send(int s, const void *msg, size_t len, int flags)
{
	ssize_t sent;

	sent = sendto(s, msg, len, flags, NULL, 0);
	return (sent);
}

/*
 * Transmit a message through a socket.
 *
 * Supported flags: MSG_DONTROUTE or 0.
 */
ssize_t
sendto(int s, const void *msg, size_t len, int flags, const struct sockaddr *to,
    socklen_t tolen)
{
	enum SockType so_type;
	int sock_id;
	ssize_t bytes;

	errno = 0;

	if ((sock_id = so_check_fd(s, &errno)) == -1) {
		return (-1);
	}
	if (msg == NULL) {
		errno = EINVAL;
		return (-1);
	}
	so_type = sockets[sock_id].type;
	if ((flags & ~MSG_DONTROUTE) != 0) {
		errno = EINVAL;
		return (-1);
	}
	if (sockets[sock_id].so_error != 0) {
		errno = sockets[sock_id].so_error;
		return (-1);
	}
	if (to != NULL && to->sa_family != AF_INET) {
		errno = EAFNOSUPPORT;
		return (-1);
	}

	switch (so_type) {
	case INETBOOT_RAW:
	case INETBOOT_DGRAM:
		if (!(sockets[sock_id].so_state & SS_ISCONNECTED) &&
		    (to == NULL || tolen != sizeof (struct sockaddr_in))) {
			errno = EINVAL;
			return (-1);
		}
		bytes = dgram_sendto(sock_id, msg, len, flags, to, tolen);
		break;
	case INETBOOT_STREAM:
		if (!((sockets[sock_id].so_state & SS_ISCONNECTED) ||
		    (sockets[sock_id].so_state & SS_ISCONNECTING))) {
			errno = EINVAL;
			return (-1);
		}
		if (sockets[sock_id].so_state & SS_CANTSENDMORE) {
			errno = EPIPE;
			return (-1);
		}
		bytes = stream_sendto(sock_id, msg, len, flags);
		break;
	default:
		/* Should not happen... */
		errno = EPROTOTYPE;
		return (-1);
	}
	return (bytes);
}

/*
 * Send `len' bytes of `msg' on datagram or raw socket `i'.
 *
 * The destination is `to' when it is non-NULL, otherwise the socket's
 * remote (connected) address.  The socket's input queue is flushed
 * before sending.  The message is copied into a freshly allocated mblk
 * with room in front for the headers of each layer, then passed down
 * through the output routine of every layer from transport to media.
 *
 * Returns `len' on success.  On failure returns -1 with errno set:
 * ENOMEM if no buffer could be allocated, or the layer's own errno (EIO
 * if it left none) when an output routine fails.
 */
static ssize_t
dgram_sendto(int i, const void *msg, size_t len, int flags,
    const struct sockaddr *to, int tolen)
{
	struct inetgram		oc;
	int			l, offset;
	size_t			tlen, alen;
	mblk_t			*mp;

#ifdef	DEBUG
	/* `to' is NULL when sending on a connected socket. */
	if (to != NULL) {
	struct sockaddr_in *sin = (struct sockaddr_in *)to;
	printf("sendto(%d): msg of length: %d sent to port %d and host: %s\n",
	    i, len, ntohs(sin->sin_port), inet_ntoa(sin->sin_addr));
	}
#endif	/* DEBUG */

	nuke_grams(&sockets[i].inq); /* flush the input queue */

	/* calculate offset for data */
	offset = sockets[i].headerlen[MEDIA_LVL](NULL) +
	    (sockets[i].headerlen[NETWORK_LVL])(NULL);

	bzero((caddr_t)&oc, sizeof (oc));
	if (sockets[i].type != INETBOOT_RAW) {
		offset += (sockets[i].headerlen[TRANSPORT_LVL])(NULL);
		oc.igm_level = TRANSPORT_LVL;
	} else
		oc.igm_level = NETWORK_LVL;
	oc.igm_oflags = flags;

	if (to != NULL) {
		/*
		 * sendto() only validates tolen for unconnected sockets,
		 * so never copy more than igm_saddr can hold.
		 */
		alen = MIN((size_t)tolen, sizeof (oc.igm_saddr));
		bcopy((caddr_t)to, (caddr_t)&oc.igm_saddr, alen);
	} else {
		bcopy((caddr_t)&sockets[i].remote, (caddr_t)&oc.igm_saddr,
		    sizeof (struct sockaddr_in));
	}

	/*
	 * Get a legal source port if the socket isn't bound.  Zero is zero
	 * in either byte order, so no conversion is needed for the test.
	 */
	if (sockets[i].bound == B_FALSE && oc.igm_saddr.sin_port == 0) {
		((struct sockaddr_in *)&oc.igm_saddr)->sin_port =
		    get_source_port(B_FALSE);
	}

	/* Round up to 16bit value for checksum purposes */
	if (sockets[i].type == INETBOOT_DGRAM) {
		tlen = ((len + sizeof (uint16_t) - 1) &
		    ~(sizeof (uint16_t) - 1));
	} else
		tlen = len;

	if ((oc.igm_mp = allocb(tlen + offset, 0)) == NULL) {
		errno = ENOMEM;
		return (-1);
	}
	mp = oc.igm_mp;
	/* Leave `offset' bytes of headroom for the layers' headers. */
	mp->b_rptr = mp->b_wptr += offset;
	bcopy((caddr_t)msg, mp->b_wptr, len);
	mp->b_wptr += len;
	for (l = TRANSPORT_LVL; l >= MEDIA_LVL; l--) {
		if (sockets[i].output[l] != NULL) {
			if (sockets[i].output[l](i, &oc) < 0) {
				freeb(mp);
				if (errno == 0)
					errno = EIO;
				return (-1);
			}
		}
	}
	freeb(mp);
	return (len);
}

/*
 * Send `len' bytes of `msg' on stream socket `i' by calling TCP's send
 * routine directly, since TCP has to decide whether the data can go out
 * now.  `flags' is accepted but currently ignored for TCP sockets.
 *
 * Returns the count reported by tcp_send(), or -1 on failure with errno
 * taken from the socket's so_error when that is set.
 */
/* ARGSUSED */
static ssize_t
stream_sendto(int i, const void *msg, size_t len, int flags)
{
	int sent;

	assert(sockets[i].pcb != NULL);

	sent = tcp_send(i, sockets[i].pcb, msg, len);
	if (sent >= 0)
		return (sent);

	if (sockets[i].so_error != 0)
		errno = sockets[i].so_error;
	return (-1);
}

/*
 * Walk the igm_next chain starting at `igp' and return the final
 * inetgram.  Returns NULL when the list is empty.
 */
struct inetgram *
last_gram(struct inetgram *igp)
{
	if (igp == NULL)
		return (NULL);

	while (igp->igm_next != NULL)
		igp = igp->igm_next;
	return (igp);
}

/*
 * Adds an inetgram or list of inetgrams to the end of the list.
 */
void
add_grams(struct inetgram **igpp, struct inetgram *newgp)
{
	struct inetgram	 *wp;

	if (newgp == NULL)
		return;

	if (*igpp == NULL)
		*igpp = newgp;
	else {
		wp = last_gram(*igpp);
		wp->igm_next = newgp;
	}
}

/*
 * Free every inetgram on the list headed by `*lgpp', leaving the list
 * empty.
 */
void
nuke_grams(struct inetgram **lgpp)
{
	struct inetgram *head;

	/* del_gram() advances *lgpp each time the head is removed. */
	while ((head = *lgpp) != NULL)
		del_gram(lgpp, head, B_TRUE);
}

/*
 * Unlink inetgram `igp' from the list headed by `*lgpp'.  If `freeit' is
 * set, the gram's mblk (when present) and the gram itself are freed as
 * well.  Nothing happens if either pointer is NULL or `igp' is not on the
 * list.
 */
void
del_gram(struct inetgram **lgpp, struct inetgram *igp, int freeit)
{
	struct inetgram	**linkp;

	if (lgpp == NULL || igp == NULL)
		return;

	/*
	 * linkp always addresses the pointer that leads to the gram being
	 * examined: first the list head, then each igm_next in turn.
	 */
	for (linkp = lgpp; *linkp != NULL; linkp = &(*linkp)->igm_next) {
		if (*linkp != igp)
			continue;

		/* detach igp from the list */
		*linkp = igp->igm_next;
		igp->igm_next = NULL;

		if (freeit) {
			if (igp->igm_mp != NULL)
				freeb(igp->igm_mp);
			bkmem_free((caddr_t)igp, sizeof (struct inetgram));
		}
		return;
	}
}

/*
 * Table mapping the name of a network configuration strategy to its
 * NCT_* identifier.  get_netconfig_strategy() looks names up here by
 * exact string comparison; note that "bootp" and "dhcp" share one id.
 */
struct nct_t nct[] = {
	"bootp",	NCT_BOOTP_DHCP,
	"dhcp",		NCT_BOOTP_DHCP,
	"rarp",		NCT_RARP_BOOTPARAMS,
	"manual",	NCT_MANUAL
};
/* Number of entries in nct[]. */
int	nct_entries = sizeof (nct) / sizeof (nct[0]);

/*
 * Figure out from the bootpath what kind of network configuration strategy
 * we should use. Returns the network config strategy.
 *
 * The sources are tried in this order:
 *	1. a cached PROM DHCP reply, which means NCT_BOOTP_DHCP;
 *	2. the "net-config-strategy" property of /chosen;
 *	3. the text after the ':' in the last component of the boot path.
 * The name found in 2 or 3 is looked up in nct[]; NCT_DEFAULT is returned
 * when it matches no entry.
 */
int
get_netconfig_strategy(void)
{
	int	i;
#define	ISSPACE(c) ((c) == ' ' || (c) == '\t' || (c) == '\n' || (c) == '\0')
	char	lbootpath[OBP_MAXPATHLEN];
	char	net_options[NCT_BUFSIZE];
	char	*op, *nop, *sp;
	pnode_t	cn;
	int	proplen;

	/* If the PROM DHCP cache exists, we're done */
	if (prom_cached_reply(B_TRUE))
		return (NCT_BOOTP_DHCP);

	/*
	 *	Newer (version 4) PROMs will put the name in the
	 *	"net-config-strategy" property.
	 *
	 *	A negative length (property missing) or one that does not
	 *	leave room for the terminator sends us to the fallback.
	 */
	cn = prom_finddevice("/chosen");
	proplen = prom_getproplen(cn, "net-config-strategy");
	if (proplen >= 0 && proplen < (int)sizeof (net_options)) {
		(void) prom_getprop(cn, "net-config-strategy", net_options);
		net_options[proplen] = '\0';
	} else {

		/*
		 * We're reduced to scanning bootpath for the protocol to use.
		 * Since there was no "net-config-strategy" property, this is
		 * an old PROM, so we need to excise any extraneous key/value
		 * initializations from bootpath[].
		 *
		 * The copy stops one short of the end of lbootpath[] so the
		 * terminator always fits.
		 */
		for (op = prom_bootpath(), sp = lbootpath; op != NULL &&
		    !ISSPACE(*op) && sp < &lbootpath[OBP_MAXPATHLEN - 1];
		    sp++, op++)
			*sp = *op;
		*sp = '\0';
		/* find the last '/' (in the device path) */
		if ((op = strrchr(lbootpath, '/')) == NULL)	/* last '/' */
			op = lbootpath;
		else
			op++;
		/* then look for the ':' separating it from the protocol */
		while (*op != ':' && *op != '\0')
			op++;

		if (*op == ':') {
			/*
			 * Copy the protocol name, again keeping the last
			 * byte of net_options[] free for the terminator.
			 */
			for (nop = net_options, op++;
			    *op != '\0' && *op != '/' && !ISSPACE(*op) &&
			    nop < &net_options[NCT_BUFSIZE - 1]; nop++, op++)
				*nop = *op;
			*nop = '\0';
		} else
			net_options[0] = '\0';
	}

#undef	ISSPACE

	for (i = 0; i < nct_entries; i++)
		if (strcmp(net_options, nct[i].p_name) == 0)
			return (nct[i].p_id);

	return (NCT_DEFAULT);
}

/* Modified STREAM routines for ease of porting core TCP code. */

/*
 * Allocate a message block with a zeroed data buffer of `size' bytes.
 * The read and write pointers both start at the beginning of the buffer.
 * `pri' is ignored.  Returns the new block, or NULL if either the header
 * or the data buffer cannot be allocated.
 */
/*ARGSUSED*/
mblk_t *
allocb(size_t size, uint_t pri)
{
	unsigned char *base;
	mblk_t *mp;

	if ((mp = (mblk_t *)bkmem_zalloc(sizeof (mblk_t))) == NULL)
		return (NULL);
	if ((base = (unsigned char *)bkmem_zalloc(size)) == NULL) {
		/* Do not leak the header when the data buffer fails. */
		bkmem_free((caddr_t)mp, sizeof (mblk_t));
		return (NULL);
	}

	mp->b_next = mp->b_prev = mp->b_cont = NULL;
	mp->b_rptr = mp->b_wptr = mp->b_datap = (unsigned char *)base;
	mp->b_size = size;

	return (mp);
}

/*
 * Free a single message block: first its data buffer (b_size bytes at
 * b_datap), then the mblk_t header itself.  Continuation blocks are not
 * touched.
 */
void
freeb(mblk_t *mp)
{
	caddr_t data = (caddr_t)(mp->b_datap);

#ifdef DEBUG
	printf("freeb datap %x\n", mp->b_datap);
#endif
	bkmem_free(data, mp->b_size);

#ifdef DEBUG
	printf("freeb mp %x\n", mp);
#endif
	bkmem_free((caddr_t)mp, sizeof (mblk_t));
}

/*
 * Free a whole message: `mp' and every block chained to it through
 * b_cont.  A NULL `mp' is a no-op.
 */
void
freemsg(mblk_t *mp)
{
	mblk_t *next;

	for (; mp != NULL; mp = next) {
		/* Fetch the link before freeb() releases the block. */
		next = mp->b_cont;
		freeb(mp);
	}
}

/*
 * Make a private copy of message block `bp'.  The copy has a data buffer
 * of the same size with identical contents, and its read and write
 * pointers sit at the same offsets as in the original.  The continuation
 * pointer of the copy is NULL.  Returns NULL if allocation fails.
 */
mblk_t *
copyb(mblk_t *bp)
{
	mblk_t *nbp;
	unsigned char *ndp;

	/*
	 * Compare the pointers directly: a difference cast to an unsigned
	 * type can never be negative, which would make the check vacuous.
	 */
	assert(bp->b_wptr >= bp->b_rptr);

	if (!(nbp = allocb(bp->b_size, 0)))
		return (NULL);
	nbp->b_cont = NULL;
	ndp = nbp->b_datap;

	nbp->b_rptr = ndp + (bp->b_rptr - bp->b_datap);
	nbp->b_wptr = nbp->b_rptr + (bp->b_wptr - bp->b_rptr);
	bcopy(bp->b_datap, nbp->b_datap, bp->b_size);
	return (nbp);
}

/*
 * Duplicate message block `mp'.  To keep things simple the duplicate is
 * a full copy made by copyb() rather than a shared reference.
 */
mblk_t *
dupb(mblk_t *mp)
{
	mblk_t *dup;

	dup = copyb(mp);
	return (dup);
}

/*
 * Return the number of data bytes in a message, i.e. the sum of
 * (b_wptr - b_rptr) over `bp' and every block chained through b_cont.
 */
size_t
msgdsize(mblk_t *bp)
{
	size_t total = 0;

	while (bp != NULL) {
		assert(bp->b_wptr >= bp->b_rptr);
		total += bp->b_wptr - bp->b_rptr;
		bp = bp->b_cont;
	}
	return (total);
}