OpenSolaris_b135/stand/lib/inet/ibd.c

Compare this file to the similar file:
Show the results in this format:

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <dhcp_impl.h>
#include <sys/types.h>
#include <socket_impl.h>
#include <socket_inet.h>
#include <sys/time.h>
#include <sys/socket.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/if_ether.h>
#include <sys/promif.h>
#include <sys/prom_plat.h>
#include <sys/salib.h>
#include <sys/bootdebug.h>
#include <sys/ib/clients/ibd/ibd.h>

#include "ipv4.h"
#include "dhcpv4.h"
#include "ipv4_impl.h"
#include "mac.h"
#include "mac_impl.h"
#include "ibd_inet.h"

/*
 * ARP message body as used by this IPoIB mac layer: the fixed-size ARP
 * header followed by the sender and target hardware addresses (each an
 * ipoib_mac_t) and the 4-byte sender and target protocol addresses.
 */
struct ibd_arp {
	struct arphdr	ea_hdr;		/* fixed-size header */
	ipoib_mac_t	arp_sha;	/* sender hardware address */
	uchar_t		arp_spa[4];	/* sender protocol address */
	ipoib_mac_t	arp_tha;	/* target hardware address */
	uchar_t		arp_tpa[4];	/* target protocol address */
};

extern int errno;
/* Link-layer broadcast address; read from /chosen:ipib-broadcast in ibd_init(). */
ipoib_mac_t ibdbroadcastaddr;

/*
 * Assumptions about OBP behavior (refer to FWARC 2002/702, 2003/251):
 * 1. prom_write() accepts the 20 byte destination address as the
 * first component in the send buffer. The buffer pointer points
 * to the start of this 20 byte address. The length parameter is
 * the IPoIB datagram size with the 20 byte of destination
 * address.
 * 2. OBP will not provide max-frame-size, since obp can only
 * determine that by querying the IBA mcg, and thus the property
 * has to be /chosen:ipib-frame-size. This will refer to the IPoIB
 * link MTU as per section 4.0 of ietf i/d, ie, the 4 byte IPoIB
 * header plus the IP payload mtu. Plus the 20 bytes of addressing
 * information.
 * 3. OBP will not provide mac-address property for IPoIB since there
 * are built in assumptions about 6 byte address with that. Instead,
 * /chosen:ipib-address will provide the local address.
 * 4. prom_read() returns 20 byte 0'ed filler followed by 4 byte
 * IPoIB header followed by IP payload. The return value is -2,
 * -1, 0, or the length of the received IPoIB datagram along with
 * the 20 bytes MBZ. The buffer pointer points to the start of
 * the 20 MBZ bytes. The length parameter reflects the max data
 * size that should be copied into the buffer including the 20
 * MBZ bytes.
 * 5. OBP will not provide chosen-network-type, only
 * network-interface-type = ipib. On an Infiniband device, this
 * however does not guarantee that it is a network device.
 * 6. OBP will provide the DHCP client id in /chosen:client-id.
 * 7. /chosen:ipib-broadcast will provide the broadcast address.
 * 8. OBP will validate that RARP is not being used before
 * allowing boot to proceed to inetboot.
 */

/*
 * Complete ARP frame as passed to prom_write() and parsed out of the
 * prom_read() buffer: the ipoib_ptxhdr_t link header (destination
 * address plus IPoIB header) followed by the ARP message body.
 */
struct arp_packet {
	ipoib_ptxhdr_t		arp_eh;
	struct ibd_arp		arp_ea;
};

/*
 * Debug printf: prints only when RB_DEBUG is set in boothowto.
 * NOTE: this expands to a bare "if" statement, so it must only be used
 * as a complete statement inside a braced block; otherwise a following
 * "else" would bind to the hidden "if".
 */
#define	dprintf	if (boothowto & RB_DEBUG) printf

/*
 * Format an IPoIB link-layer address as IPOIB_ADDRL colon-separated
 * hexadecimal bytes (no zero padding).
 *
 * The text is built in a static buffer, so the returned pointer is
 * overwritten by the next call.
 */
static char *
ibd_print(ipoib_mac_t *ea)
{
	static char text[(3 * IPOIB_ADDRL) + 1];
	unsigned char *octet = (unsigned char *)ea;
	char *cursor = text;
	int idx;

	for (idx = 0; idx < IPOIB_ADDRL; idx++) {
		/* every byte but the first is preceded by a colon */
		if (idx == 0)
			cursor += sprintf(cursor, "%x", octet[idx]);
		else
			cursor += sprintf(cursor, ":%x", octet[idx]);
	}
	return (text);
}


/*
 * Common ARP code. Broadcast the packet and wait for the right response.
 *
 * The caller supplies "out" with the IPoIB type, the ARP opcode and the
 * target addresses already set; this routine fills in the broadcast link
 * destination, the fixed ARP header fields and the sender addresses.
 *
 * If arp is called for, caller expects a hardware address in the
 * source hardware address (sha) field of the "out" argument.
 *
 * IPoIB does not support RARP (see ibd_revarp()), so only ARP replies
 * whose sender protocol address matches the requested target are accepted.
 *
 * The request is rebroadcast after a delay that starts at 2 seconds and
 * doubles after every attempt, up to a maximum of 64 seconds.
 *
 * Returns TRUE if transaction succeeded, FALSE otherwise.
 *
 * The timeout argument is the number of milliseconds to wait for a
 * response. An infinite timeout can be specified as 0xffffffff.
 */
static int
ibd_comarp(struct arp_packet *out, uint32_t timeout)
{
	struct arp_packet *in = (struct arp_packet *)mac_state.mac_buf;
	int count, feedback, len, delay = 2;
	char    *ind = "-\\|/";
	struct in_addr tmp_ia;
	uint32_t wait_time;
	uint32_t resend_time;

	bcopy((caddr_t)&ibdbroadcastaddr, (caddr_t)&out->arp_eh.ipoib_dest,
	    IPOIB_ADDRL);

	out->arp_ea.arp_hrd =  htons(ARPHRD_IB);
	out->arp_ea.arp_pro = htons(ETHERTYPE_IP);
	out->arp_ea.arp_hln = IPOIB_ADDRL;
	out->arp_ea.arp_pln = sizeof (struct in_addr);
	bcopy(mac_state.mac_addr_buf, (caddr_t)&out->arp_ea.arp_sha,
	    IPOIB_ADDRL);
	ipv4_getipaddr(&tmp_ia);
	tmp_ia.s_addr = htonl(tmp_ia.s_addr);
	bcopy((caddr_t)&tmp_ia, (caddr_t)out->arp_ea.arp_spa,
	    sizeof (struct in_addr));
	feedback = 0;

	wait_time = prom_gettime() + timeout;
	for (count = 0; timeout == ~0U || prom_gettime() < wait_time; count++) {
		if (count == IBD_WAITCNT) {
			/*
			 * Since IPoIB does not support RARP (see ibd_revarp),
			 * we know that out->arp_ea.arp_op == ARPOP_REQUEST.
			 */
			bcopy((caddr_t)out->arp_ea.arp_tpa,
			    (caddr_t)&tmp_ia, sizeof (struct in_addr));
			printf("\nRequesting MAC address for: %s\n",
			    inet_ntoa(tmp_ia));
		}

		(void) prom_write(mac_state.mac_dev, (caddr_t)out,
		    sizeof (*out), 0, NETWORK);

		if (count >= IBD_WAITCNT)
			printf("%c\b", ind[feedback++ % 4]); /* activity */

		/* listen for replies until it is time to rebroadcast */
		resend_time = prom_gettime() + (delay * 1000);
		while (prom_gettime() <= resend_time) {
			len = prom_read(mac_state.mac_dev, mac_state.mac_buf,
			    mac_state.mac_mtu, 0, NETWORK);
			/*
			 * Compare as signed: prom_read() returns a negative
			 * status when nothing was read, and an unsigned
			 * comparison against sizeof would let that through
			 * and parse stale buffer contents.
			 */
			if (len < (int)sizeof (struct arp_packet))
				continue;
			if (in->arp_ea.arp_pro != ntohs(ETHERTYPE_IP))
				continue;
			/*
			 * Since IPoIB does not support RARP (see ibd_revarp),
			 * we know that out->arp_ea.arp_op == ARPOP_REQUEST.
			 */
			if (in->arp_eh.ipoib_rhdr.ipoib_type !=
			    ntohs(ETHERTYPE_ARP))
				continue;
			if (in->arp_ea.arp_op != ntohs(ARPOP_REPLY))
				continue;
			if (bcmp((caddr_t)in->arp_ea.arp_spa,
			    (caddr_t)out->arp_ea.arp_tpa,
			    sizeof (struct in_addr)) != 0)
				continue;
			if (boothowto & RB_VERBOSE) {
				bcopy((caddr_t)in->arp_ea.arp_spa,
				    (caddr_t)&tmp_ia,
				    sizeof (struct in_addr));
				printf("Found %s @ %s\n",
				    inet_ntoa(tmp_ia),
				    ibd_print(&in->arp_ea.arp_sha));
			}
			/* copy hardware addr into "out" for caller */
			bcopy((caddr_t)&in->arp_ea.arp_sha,
			    (caddr_t)&out->arp_ea.arp_sha, IPOIB_ADDRL);
			return (TRUE);
		}

		delay = delay * 2;	/* Double the request delay */
		if (delay > 64)		/* maximum delay is 64 seconds */
			delay = 64;
	}
	return (FALSE);
}

/*
 * ARP client side
 * Broadcasts to determine MAC address given network order IP address.
 * See RFC 826
 *
 * Returns TRUE if successful, FALSE otherwise.
 */
static int
ibd_arp(struct in_addr *ip, void *hap, uint32_t timeout)
{
	ipoib_mac_t *ep = (ipoib_mac_t *)hap;
	struct arp_packet out;
	int result;

	if (!initialized)
		prom_panic("IPoIB device is not initialized.");

	bzero((char *)&out, sizeof (struct arp_packet));

	out.arp_eh.ipoib_rhdr.ipoib_type = htons(ETHERTYPE_ARP);
	out.arp_ea.arp_op = htons(ARPOP_REQUEST);
	bcopy((caddr_t)&ibdbroadcastaddr, (caddr_t)&out.arp_ea.arp_tha,
	    IPOIB_ADDRL);
	bcopy((caddr_t)ip, (caddr_t)out.arp_ea.arp_tpa,
	    sizeof (struct in_addr));

	result = ibd_comarp(&out, timeout);

	if (result && (ep != NULL)) {
		bcopy((caddr_t)&out.arp_ea.arp_sha, (caddr_t)ep, IPOIB_ADDRL);
	}
	return (result);
}

/*
 * Reverse ARP client side.
 *
 * RARP (see RFC 903) would determine our Internet address given our MAC
 * address, but it is not supported here: this entry point unconditionally
 * calls prom_panic().
 */
static void
ibd_revarp(void)
{
	prom_panic("IPoIB can not boot with RARP.");
}

/*
 * Report how many bytes upper layers must leave free at the front of an
 * output buffer: room for the IPoIB header plus the 20 byte destination
 * address that ibd_output() prepends. The inetgram argument is unused.
 */
/* ARGSUSED */
static int
ibd_header_len(struct inetgram *igm)
{
	int reserve;

	reserve = IPOIB_HDRSIZE + IPOIB_ADDRL;
	return (reserve);
}

/*
 * Handle an IP datagram addressed to our MAC address or to the link
 * layer broadcast address. Also respond to ARP requests. Generates
 * inetgrams as long as there's data and the mac level IP timeout timer
 * hasn't expired. As soon as there is no data, we try for
 * IBD_INPUT_ATTEMPTS for more, then exit the loop, even if there is time
 * left, since we expect to have data waiting for us when we're called, we just
 * don't know how much.
 *
 * We work around slow proms (some proms have hard sleeps for as much as
 * 3msec) even though there is data waiting.
 *
 * "index" selects the socket whose input queue receives the inetgrams and
 * whose in_timeout (if nonzero) overrides the mac level input timeout.
 *
 * Returns the total number of frames placed on the socket (each inetgram
 * is tagged NETWORK_LVL), or -1 with errno set to ENOMEM if an allocation
 * fails before any frame has been queued.
 * Caller is expected to free up the inetgram resources.
 */
static int
ibd_input(int index)
{
	struct inetgram		*inp;
	ipoib_ptxhdr_t		*eh;
	int		frames = 0;	/* successful frames */
	int		attempts = 0;	/* failed attempts after success */
	/*
	 * Plain ints: prom_read()'s result is kept at full width rather
	 * than being truncated into a 16 bit value.
	 */
	int		len = 0, data_len;
	uint32_t	timeout, reltime;
	uint32_t	pre_pr, post_pr; /* prom_read interval */

#ifdef	DEBUG
	int		failures = 0;		/* total failures */
	int		total_attempts = 0;	/* total prom_read */
	int		no_data = 0;		/* no data in prom */
	int		arps = 0;		/* arp requests processed */
	uint32_t	tot_pr = 0;		/* prom_read time */
	uint32_t	tot_pc = 0;		/* inetgram creation time */
	uint32_t	pre_pc;
	/* stays 0 if the loop exits before the time check is evaluated */
	uint32_t	now = 0;
#endif	/* DEBUG */

	if (!initialized)
		prom_panic("IPoIB device is not initialized.");

	if ((reltime = sockets[index].in_timeout) == 0)
		reltime = mac_state.mac_in_timeout;
	timeout = prom_gettime() + reltime;

	do {
		if (frames > IBD_MAX_FRAMES) {
			/* someone is trying a denial of service attack */
			break;
		}

		/*
		 * The following is being paranoid about possible bugs
		 * where prom_read() returns a nonzero length, even when
		 * it's not read a packet; it zeroes out the header to
		 * compensate. Paranoia from calvin prom (V2) days.
		 */
		bzero(mac_state.mac_buf, sizeof (ipoib_ptxhdr_t));

		/*
		 * Prom_read() will return 0 or -2 if no data is present. A
		 * return value of -1 means an error has occurred. We adjust
		 * the timeout by calling the time spent in prom_read() "free".
		 * prom_read() returns the number of bytes actually read, but
		 * will only copy "len" bytes into our buffer. Adjust in
		 * case the MTU is wrong.
		 */
		pre_pr = prom_gettime();
		len = prom_read(mac_state.mac_dev, mac_state.mac_buf,
		    mac_state.mac_mtu, 0, NETWORK);
		post_pr = prom_gettime();
		timeout += (post_pr - pre_pr);
#ifdef	DEBUG
		tot_pr += (post_pr - pre_pr);
		total_attempts++;
#endif	/* DEBUG */

		if (len > mac_state.mac_mtu) {
			dprintf("ibd_input: adjusting MTU %d -> %d\n",
			    mac_state.mac_mtu, len);
			bkmem_free(mac_state.mac_buf, mac_state.mac_mtu);
			mac_state.mac_mtu = len;
			mac_state.mac_buf = bkmem_alloc(mac_state.mac_mtu);
			if (mac_state.mac_buf == NULL) {
				prom_panic("ibd_input: Cannot reallocate "
				    "netbuf memory.");
			}
			len = 0; /* pretend there was no data */
		}

		if (len == -1) {
#ifdef	DEBUG
			failures++;
#endif	/* DEBUG */
			break;
		}
		if (len == 0 || len == -2) {
			if (frames != 0)
				attempts++;
#ifdef	DEBUG
			no_data++;
#endif	/* DEBUG */
			continue;
		}

		/*
		 * The length checks below are done in signed arithmetic so
		 * that an unexpected negative length is rejected instead of
		 * being converted to a huge unsigned value.
		 */
		eh = (ipoib_ptxhdr_t *)mac_state.mac_buf;
		if (eh->ipoib_rhdr.ipoib_type == ntohs(ETHERTYPE_IP) &&
		    len >= (int)(sizeof (ipoib_ptxhdr_t) +
		    sizeof (struct ip))) {

			int offset;
#ifdef	DEBUG
			pre_pc = prom_gettime();
#endif	/* DEBUG */

			inp = (struct inetgram *)bkmem_zalloc(
			    sizeof (struct inetgram));
			if (inp == NULL) {
				errno = ENOMEM;
				return (frames == 0 ? -1 : frames);
			}
			/* strip the link header; queue only the IP datagram */
			offset = sizeof (ipoib_ptxhdr_t);
			data_len = len - offset;
			inp->igm_mp = allocb(data_len, 0);
			if (inp->igm_mp == NULL) {
				errno = ENOMEM;
				bkmem_free((caddr_t)inp,
				    sizeof (struct inetgram));
				return (frames == 0 ? -1 : frames);
			}
			bcopy((caddr_t)(mac_state.mac_buf + offset),
			    inp->igm_mp->b_rptr, data_len);
			inp->igm_mp->b_wptr += data_len;
			inp->igm_level = NETWORK_LVL;
			add_grams(&sockets[index].inq, inp);
			frames++;
			attempts = 0;
#ifdef	DEBUG
			tot_pc += prom_gettime() - pre_pc;
#endif	/* DEBUG */
			continue;
		}

		if (eh->ipoib_rhdr.ipoib_type == ntohs(ETHERTYPE_ARP) &&
		    len >= (int)sizeof (struct arp_packet)) {

			struct in_addr		ip;
			struct ibd_arp		*ea;

#ifdef	DEBUG
			printf("ibd_input: ARP message received\n");
			arps++;
#endif	/* DEBUG */

			ea = (struct ibd_arp *)(mac_state.mac_buf +
			    sizeof (ipoib_ptxhdr_t));
			if (ea->arp_pro != ntohs(ETHERTYPE_IP))
				continue;

			ipv4_getipaddr(&ip);
			ip.s_addr = ntohl(ip.s_addr);

			/*
			 * Answer requests for our own (configured) address
			 * by rewriting the received frame in place into the
			 * reply and sending it back to the requester.
			 */
			if (ea->arp_op == ntohs(ARPOP_REQUEST) &&
			    ip.s_addr != INADDR_ANY &&
			    (bcmp((caddr_t)ea->arp_tpa, (caddr_t)&ip,
			    sizeof (struct in_addr)) == 0)) {
				ea->arp_op = htons(ARPOP_REPLY);
				bcopy((caddr_t)&ea->arp_sha,
				    (caddr_t)&eh->ipoib_dest, IPOIB_ADDRL);
				bcopy((caddr_t)&ea->arp_sha,
				    (caddr_t)&ea->arp_tha, IPOIB_ADDRL);
				bcopy((caddr_t)ea->arp_spa,
				    (caddr_t)ea->arp_tpa,
				    sizeof (struct in_addr));
				bcopy(mac_state.mac_addr_buf,
				    (caddr_t)&ea->arp_sha,
				    mac_state.mac_addr_len);
				bcopy((caddr_t)&ip, (caddr_t)ea->arp_spa,
				    sizeof (struct in_addr));
				(void) prom_write(mac_state.mac_dev,
				    mac_state.mac_buf,
				    sizeof (struct arp_packet), 0, NETWORK);
				/* don't charge for ARP replies */
				timeout += reltime;
			}
		}
	} while (attempts < IBD_INPUT_ATTEMPTS &&
#ifdef	DEBUG
	    (now = prom_gettime()) < timeout);
#else
	    prom_gettime() < timeout);
#endif	/* DEBUG */

#ifdef	DEBUG
	printf("ibd_input(%d): T/S/N/A/F/P/M: %d/%d/%d/%d/%d/%d/%d "
	    "T/O: %d < %d = %s\n", index, total_attempts, frames, no_data,
	    arps, failures, tot_pr, tot_pc, now, timeout,
	    (now < timeout) ? "TRUE" : "FALSE");
#endif	/* DEBUG */
	return (frames);
}

/*
 * Send out an IPoIB datagram. We expect a IP frame appropriately fragmented
 * at this level.
 *
 * Errno is set and -1 is returned if an error occurs. Number of bytes sent
 * is returned on success.
 */
/* ARGSUSED */
static int
ibd_output(int index, struct inetgram *ogp)
{
	int			header_len, result;
	ipoib_ptxhdr_t		eh;
	struct ip		*ip;
	struct in_addr		tmpip, ipdst;
	int			broadcast = FALSE;
	int			size;
	mblk_t			*mp;

	if (!initialized)
		prom_panic("IPoIB device is not initialized.");

	if (ogp->igm_level != MEDIA_LVL) {
		dprintf("ibd_output: frame type wrong: socket: %d\n",
		    index * SOCKETTYPE);
		errno = EINVAL;
		return (-1);
	}

	header_len = IPOIB_HDRSIZE + IPOIB_ADDRL;
	mp = ogp->igm_mp;
	size = mp->b_wptr - mp->b_rptr;
	if (size > (mac_state.mac_mtu - IPOIB_ADDRL)) {
		dprintf("ibd_output: frame size too big: %d\n", size);
		errno = E2BIG;
		return (-1);
	}

	size += header_len;
	ip = (struct ip *)(mp->b_rptr);

	eh.ipoib_rhdr.ipoib_type = htons(ETHERTYPE_IP);
	eh.ipoib_rhdr.ipoib_mbz = 0;
	bcopy((caddr_t)&ip->ip_dst, (caddr_t)&ipdst, sizeof (ipdst));

	if (ipdst.s_addr == htonl(INADDR_BROADCAST))
		broadcast = TRUE; /* limited broadcast */

	if (!broadcast) {
		struct in_addr mask;

		ipv4_getnetmask(&mask);
		mask.s_addr = htonl(mask.s_addr);
		if (mask.s_addr != htonl(INADDR_BROADCAST) &&
		    (ipdst.s_addr & ~mask.s_addr) == 0) {
			broadcast = TRUE; /* directed broadcast */
		} else {
			if (ogp->igm_router.s_addr != htonl(INADDR_ANY))
				tmpip.s_addr = ogp->igm_router.s_addr;
			else
				tmpip.s_addr = ipdst.s_addr;

			result = mac_get_arp(&tmpip, (void *)&eh.ipoib_dest,
			    IPOIB_ADDRL, mac_state.mac_arp_timeout);
			if (!result) {
				errno = ETIMEDOUT;
				dprintf("ibd_output: ARP request for %s "
				    "timed out.\n", inet_ntoa(tmpip));
				return (-1);
			}
		}
	}

	if (broadcast)
		bcopy((caddr_t)&ibdbroadcastaddr, (caddr_t)&eh.ipoib_dest,
		    IPOIB_ADDRL);

	/* add the ibd header */
	mp->b_rptr -= sizeof (eh);
	bcopy((caddr_t)&eh, mp->b_rptr, sizeof (eh));

#ifdef	DEBUG
	printf("ibd_output(%d): level(%d) frame(0x%x) len(%d)\n",
	    index, ogp->igm_level, mp->b_rptr, size);
#endif	/* DEBUG */

	return (prom_write(mac_state.mac_dev, (char *)mp->b_rptr, size,
	    0, NETWORK));
}

/*
 * Initialize the IPoIB mac layer state from the OBP /chosen node.
 *
 * Reads the local address (ipib-address), the broadcast address
 * (ipib-broadcast), the DHCP client id (client-id) and the frame size
 * (ipib-frame-size), then installs the timeouts and the ibd entry
 * points in mac_state.
 *
 * Each property's length is checked with prom_getproplen() before
 * prom_getprop() copies it into fixed-size storage, so an unexpectedly
 * long property cannot overrun its destination.
 *
 * Panics via prom_panic() if memory cannot be allocated, /chosen cannot
 * be found, or an address or client-id property is missing or has an
 * unexpected length. If the frame size cannot be read, the MTU falls
 * back to IBDSIZE + IPOIB_ADDRL.
 */
void
ibd_init(void)
{
	pnode_t	chosen;
	char	*mtuprop = "ipib-frame-size";
	char	*bcastprop = "ipib-broadcast";
	char	*addrprop = "ipib-address";
	char	*cidprop = "client-id";
	int	cidlen;
	uint8_t	dhcpcid[DHCP_MAX_CID_LEN];

	mac_state.mac_addr_len = IPOIB_ADDRL;
	mac_state.mac_addr_buf = bkmem_alloc(mac_state.mac_addr_len);
	if (mac_state.mac_addr_buf == NULL)
		prom_panic("ibd_init: Cannot allocate memory.");

	chosen = prom_finddevice("/chosen");
	if (chosen == OBP_NONODE || chosen == OBP_BADNODE)
		prom_panic("ibd_init: Cannot find /chosen.");

	/* mac_addr_buf holds exactly IPOIB_ADDRL bytes */
	if ((prom_getproplen(chosen, addrprop) != IPOIB_ADDRL) ||
	    (prom_getprop(chosen, addrprop,
	    (caddr_t)mac_state.mac_addr_buf) != IPOIB_ADDRL))
		prom_panic("ibd_init: Cannot find /chosen:ipib-address\n.");

	if ((prom_getproplen(chosen, bcastprop) != IPOIB_ADDRL) ||
	    (prom_getprop(chosen, bcastprop,
	    (caddr_t)&ibdbroadcastaddr) != IPOIB_ADDRL))
		prom_panic("ibd_init: Cannot find /chosen:ipib-broadcast\n.");

	if (((cidlen = prom_getproplen(chosen, cidprop)) <= 0) ||
	    (cidlen > DHCP_MAX_CID_LEN) || (prom_getprop(chosen, cidprop,
	    (caddr_t)&dhcpcid) != cidlen))
		prom_panic("ibd_init: Invalid /chosen:client-id\n.");
	dhcp_set_client_id(dhcpcid, cidlen);

	/*
	 * Note that prom reports mtu including 20 bytes of
	 * addressing information. A property too large for mac_mtu
	 * is treated like a missing one.
	 */
	if ((prom_getproplen(chosen, mtuprop) >
	    (int)sizeof (mac_state.mac_mtu)) ||
	    (prom_getprop(chosen, mtuprop,
	    (caddr_t)&mac_state.mac_mtu) <= 0))
		mac_state.mac_mtu = IBDSIZE + IPOIB_ADDRL;

	/* default ARP and input timeouts for this mac type */
	mac_state.mac_arp_timeout = IBD_ARP_TIMEOUT;
	mac_state.mac_in_timeout = IBD_IN_TIMEOUT;

	/* install the IPoIB entry points */
	mac_state.mac_arp = ibd_arp;
	mac_state.mac_rarp = ibd_revarp;
	mac_state.mac_header_len = ibd_header_len;
	mac_state.mac_input = ibd_input;
	mac_state.mac_output = ibd_output;
}