NetBSD-5.0.2/sys/arch/xen/xen/xennetback.c

/*      $NetBSD: xennetback.c,v 1.34 2008/10/21 15:46:32 cegger Exp $      */

/*
 * Copyright (c) 2005 Manuel Bouyer.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by Manuel Bouyer.
 * 4. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: xennetback.c,v 1.34 2008/10/21 15:46:32 cegger Exp $");

#include "opt_xen.h"

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/queue.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <sys/errno.h>
#include <sys/device.h>
#include <sys/intr.h>

#include <net/if.h>
#include <net/if_types.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <net/netisr.h>
#include "bpfilter.h"
#if NBPFILTER > 0
#include <net/bpf.h>
#include <net/bpfdesc.h>
#endif

#include <net/if_ether.h>


#include <xen/xen.h>
#include <xen/xen_shm.h>
#include <xen/evtchn.h>
#include <xen/ctrl_if.h>

#ifdef XEN3
#else
#include <xen/xen-public/io/domain_controller.h>
#endif

#include <uvm/uvm.h>

#ifdef XENDEBUG_NET
#define XENPRINTF(x) printf x
#else
#define XENPRINTF(x)
#endif

/* hash list for TX requests */
/* descriptor of a packet being handled by the kernel */
struct xni_pkt {
	int pkt_id; /* packet's ID */
	struct xnetback_instance *pkt_xneti; /* pointer back to our softc */
	struct xni_page *pkt_page; /* page containing this packet */
};

struct xni_page {
	SLIST_ENTRY(xni_page) xni_page_next;
	int refcount;
	vaddr_t va; /* address the page is mapped to */
	paddr_t ma; /* page's machine address */
};

/* hash list of packets mapped by machine address */
SLIST_HEAD(xni_pages_hash, xni_page);
#define XNI_PAGE_HASH_SIZE 256 /* must be power of 2 */
#define XNI_PAGE_HASH_MASK (XNI_PAGE_HASH_SIZE - 1)
struct xni_pages_hash xni_tx_pages_hash[XNI_PAGE_HASH_SIZE];

/* pools for xni_pkt and xni_page */
struct pool xni_pkt_pool;
struct pool xni_page_pool;
/* ratecheck(9) for pool allocation failures */
struct timeval xni_pool_errintvl = { 30, 0 };  /* 30s, each */
/*
 * Backend network device driver for Xen
 */

static void xnetback_ctrlif_rx(ctrl_msg_t *, unsigned long);

/* state of a xnetback instance */
typedef enum {CONNECTED, DISCONNECTED} xnetback_state_t;

/* we keep the xnetback instances in a linked list */
struct xnetback_instance {
	SLIST_ENTRY(xnetback_instance) next; 
	domid_t domid;		/* attached to this domain */
	uint32_t handle;	/* domain-specific handle */
	xnetback_state_t status;
	void *xni_softintr;

	/* network interface stuff */
	struct ethercom xni_ec;
	struct callout xni_restart;
	uint8_t xni_enaddr[ETHER_ADDR_LEN];

	/* remote domain communication stuff */
	unsigned int xni_evtchn;
	paddr_t xni_ma_rxring; /* machine address of rx shared ring */
	paddr_t xni_ma_txring; /* machine address of tx shared ring */

	netif_tx_interface_t *xni_txring;
	netif_rx_interface_t *xni_rxring;
	NETIF_RING_IDX rxreq_cons; /* our index in the RX request ring */
};
#define xni_if    xni_ec.ec_if
#define xni_bpf   xni_if.if_bpf

static int  xennetback_ifioctl(struct ifnet *, u_long, void *);
static void xennetback_ifstart(struct ifnet *);
static void xennetback_ifsoftstart(void *);
static void xennetback_ifwatchdog(struct ifnet *);
static int  xennetback_ifinit(struct ifnet *);
static void xennetback_ifstop(struct ifnet *, int);

static inline void xennetback_tx_response(struct xnetback_instance *,
    int, int);
static void xennetback_tx_free(struct mbuf * , void *, size_t, void *);


SLIST_HEAD(, xnetback_instance) xnetback_instances;

static struct xnetback_instance *xnetif_lookup(domid_t, uint32_t);
static int  xennetback_evthandler(void *);

/*
 * Number of packets to transmit in one hypercall (= number of pages to
 * transmit at once).
 */
#define NB_XMIT_PAGES_BATCH 64
/*
 * We will transfer a mapped page to the remote domain, and remap another
 * page in its place immediately. For this we keep a list of available pages.
 * When the list is empty, we ask the hypervisor to give us
 * NB_XMIT_PAGES_BATCH pages back.
 */
static unsigned long mcl_pages[NB_XMIT_PAGES_BATCH]; /* our physical pages */
int mcl_pages_alloc; /* current index in mcl_pages */
static int  xennetback_get_mcl_page(paddr_t *);
static void xennetback_get_new_mcl_pages(void);
/*
 * If we can't transfer the mbuf directly, we have to copy it to a page which
 * will be transferred to the remote domain. We use a pool_cache
 * for this, or the mbuf cluster pool cache if MCLBYTES == PAGE_SIZE.
 */
#if MCLBYTES != PAGE_SIZE
pool_cache_t xmit_pages_cache;
#endif
pool_cache_t xmit_pages_cachep;

/* arrays used in xennetback_ifstart(), too large to allocate on stack */
static mmu_update_t xstart_mmu[NB_XMIT_PAGES_BATCH * 3];
static multicall_entry_t xstart_mcl[NB_XMIT_PAGES_BATCH * 2];
struct mbuf *mbufs_sent[NB_XMIT_PAGES_BATCH];
struct _pages_pool_free {
	vaddr_t va;
	paddr_t pa;
} pages_pool_free[NB_XMIT_PAGES_BATCH];

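/*
 * Initialise the backend driver: steal physical pages that will later
 * replace the ones given away to foreign domains, set up the packet and
 * page pools, register the control interface handler and report the
 * driver as up.
 */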
void
xennetback_init()
{
	ctrl_msg_t cmsg;
	netif_be_driver_status_t st;
	int i;
	struct pglist mlist;
	struct vm_page *pg;

	if ( !xendomain_is_dom0() &&
	     !(xen_start_info.flags & SIF_NET_BE_DOMAIN))
		return;

	XENPRINTF(("xennetback_init\n"));

	/* initialize the mapped pages hash table */
	for (i = 0; i < XNI_PAGE_HASH_SIZE; i++) {
		SLIST_INIT(&xni_tx_pages_hash[i]);
	}


	/*
	 * steal some non-managed pages from the VM system, to replace
	 * mbuf cluster or xmit_pages_pool pages given to foreign domains.
	 */
	if (uvm_pglistalloc(PAGE_SIZE * NB_XMIT_PAGES_BATCH, 0, 0xffffffff,
	    0, 0, &mlist, NB_XMIT_PAGES_BATCH, 0) != 0)
		panic("xennetback_init: uvm_pglistalloc");
	for (i = 0, pg = mlist.tqh_first; pg != NULL;
	    pg = pg->pageq.queue.tqe_next, i++)
		mcl_pages[i] = xpmap_ptom(VM_PAGE_TO_PHYS(pg)) >> PAGE_SHIFT;
	if (i != NB_XMIT_PAGES_BATCH)
		panic("xennetback_init: %d mcl pages", i);
	mcl_pages_alloc = NB_XMIT_PAGES_BATCH - 1;

	/* initialise pools */
	pool_init(&xni_pkt_pool, sizeof(struct xni_pkt), 0, 0, 0,
	    "xnbpkt", NULL, IPL_VM);
	pool_init(&xni_page_pool, sizeof(struct xni_page), 0, 0, 0,
	    "xnbpa", NULL, IPL_VM);
#if MCLBYTES != PAGE_SIZE
	xmit_pages_cache = pool_cache_init(PAGE_SIZE, 0, 0, 0, "xnbxm", NULL,
	    IPL_VM, NULL, NULL, NULL);
	xmit_pages_cachep = xmit_pages_cache;
#else
	xmit_pages_cachep = mcl_cache;
#endif

	/*
	 * initialize the backend driver, register the control message handler
	 * and send the driver-up message.
	 */
	SLIST_INIT(&xnetback_instances);
	(void)ctrl_if_register_receiver(CMSG_NETIF_BE, xnetback_ctrlif_rx,
	    CALLBACK_IN_BLOCKING_CONTEXT);

	cmsg.type      = CMSG_NETIF_BE;
	cmsg.subtype   = CMSG_NETIF_BE_DRIVER_STATUS;
	cmsg.length    = sizeof(netif_be_driver_status_t);
	st.status      = NETIF_DRIVER_STATUS_UP;
	memcpy(cmsg.msg, &st, sizeof(st));
	ctrl_if_send_message_block(&cmsg, NULL, 0, 0);
}

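/*
 * Control interface receiver: handle CREATE, DESTROY, CONNECT and
 * DISCONNECT requests from the domain controller for the xvif
 * pseudo-interfaces, and send a status reply for each message.
 */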
static void
xnetback_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
{
	struct xnetback_instance *xneti;

	XENPRINTF(("xnetback msg %d\n", msg->subtype));
	switch (msg->subtype) {
	case CMSG_NETIF_BE_CREATE:
	{
		netif_be_create_t *req = (netif_be_create_t *)&msg->msg[0];
		struct ifnet *ifp;
		extern int ifqmaxlen; /* XXX */

		if (msg->length != sizeof(netif_be_create_t))
			goto error;
		if (xnetif_lookup(req->domid, req->netif_handle) != NULL) {
			req->status = NETIF_BE_STATUS_INTERFACE_EXISTS;
			goto end;
		}
		xneti = malloc(sizeof(struct xnetback_instance), M_DEVBUF,
		    M_NOWAIT | M_ZERO);
		if (xneti == NULL) {
			req->status = NETIF_BE_STATUS_OUT_OF_MEMORY;
			goto end;
		}
		xneti->domid = req->domid;
		xneti->handle = req->netif_handle;
		xneti->status = DISCONNECTED;

		xneti->xni_softintr = softint_establish(SOFTINT_NET,
		    xennetback_ifsoftstart, xneti);
		if (xneti->xni_softintr == NULL) {
			free(xneti, M_DEVBUF);
			req->status = NETIF_BE_STATUS_OUT_OF_MEMORY;
			goto end;
		}

		ifp = &xneti->xni_if;
		ifp->if_softc = xneti;

		/* create pseudo-interface */
		memcpy(xneti->xni_enaddr, req->mac, ETHER_ADDR_LEN);
		/* we can't use the same MAC addr as our guest */
		xneti->xni_enaddr[3]++;
		snprintf(ifp->if_xname, IFNAMSIZ, "xvif%d.%d",
		    req->domid, req->netif_handle);
		printf("%s: Ethernet address %s\n", ifp->if_xname,
		    ether_sprintf(xneti->xni_enaddr));
		ifp->if_flags =
		    IFF_BROADCAST|IFF_SIMPLEX|IFF_NOTRAILERS|IFF_MULTICAST;
		ifp->if_snd.ifq_maxlen =
		    max(ifqmaxlen, NETIF_RX_RING_SIZE * 2);
		ifp->if_ioctl = xennetback_ifioctl;
		ifp->if_start = xennetback_ifstart;
		ifp->if_watchdog = xennetback_ifwatchdog;
		ifp->if_init = xennetback_ifinit;
		ifp->if_stop = xennetback_ifstop;
		ifp->if_timer = 0;
		IFQ_SET_READY(&ifp->if_snd);
		if_attach(ifp);
		ether_ifattach(&xneti->xni_if, xneti->xni_enaddr);

		req->status = NETIF_BE_STATUS_OKAY;
		SLIST_INSERT_HEAD(&xnetback_instances, xneti, next);
		break;
	}
	case CMSG_NETIF_BE_DESTROY:
	{
		netif_be_destroy_t *req = (netif_be_destroy_t *)&msg->msg[0];
		if (msg->length != sizeof(netif_be_destroy_t))
			goto error;
		xneti = xnetif_lookup(req->domid, req->netif_handle);
		if (xneti == NULL) {
			req->status = NETIF_BE_STATUS_INTERFACE_NOT_FOUND;
			goto end;
		}
		if (xneti->status == CONNECTED) {
			req->status = NETIF_BE_STATUS_INTERFACE_CONNECTED;
			goto end;
		}
		SLIST_REMOVE(&xnetback_instances,
		    xneti, xnetback_instance, next);

		ether_ifdetach(&xneti->xni_if);
		if_detach(&xneti->xni_if);

		free(xneti, M_DEVBUF);
		req->status = NETIF_BE_STATUS_OKAY;
		break;
	}
	case CMSG_NETIF_BE_CONNECT:
	{
		netif_be_connect_t *req = (netif_be_connect_t *)&msg->msg[0];
		vaddr_t ring_rxaddr, ring_txaddr;
		int error;

		if (msg->length != sizeof(netif_be_connect_t))
			goto error;
		xneti = xnetif_lookup(req->domid, req->netif_handle);
		if (xneti == NULL) {
			req->status = NETIF_BE_STATUS_INTERFACE_NOT_FOUND;
			goto end;
		}
		if (xneti->status == CONNECTED) {
			req->status = NETIF_BE_STATUS_INTERFACE_CONNECTED;
			goto end;
		}
		ring_rxaddr = uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
		    UVM_KMF_VAONLY);
		if (ring_rxaddr == 0) {
			printf("%s: can't alloc ring VM\n",
			    xneti->xni_if.if_xname);
			req->status = NETIF_BE_STATUS_OUT_OF_MEMORY;
			goto end;
		}
		ring_txaddr = uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
		    UVM_KMF_VAONLY);
		if (ring_txaddr == 0) {
			printf("%s: can't alloc ring VM\n",
			    xneti->xni_if.if_xname);
			uvm_km_free(kernel_map, ring_rxaddr, PAGE_SIZE,
			    UVM_KMF_VAONLY);
			req->status = NETIF_BE_STATUS_OUT_OF_MEMORY;
			goto end;
		}
		xneti->xni_ma_rxring = req->rx_shmem_frame << PAGE_SHIFT;
		xneti->xni_ma_txring = req->tx_shmem_frame << PAGE_SHIFT;
		error = pmap_enter_ma(pmap_kernel(), ring_rxaddr,
		   xneti->xni_ma_rxring, 0, VM_PROT_READ | VM_PROT_WRITE,
		   PMAP_WIRED | PMAP_CANFAIL, req->domid);
		if (error) {
			goto fail_1;
		}
		error = pmap_enter_ma(pmap_kernel(), ring_txaddr,
		   xneti->xni_ma_txring, 0, VM_PROT_READ | VM_PROT_WRITE,
		   PMAP_WIRED | PMAP_CANFAIL, req->domid);
		if (error) {
			pmap_remove(pmap_kernel(), ring_rxaddr,
			    ring_rxaddr + PAGE_SIZE);
			pmap_update(pmap_kernel());
fail_1:
			uvm_km_free(kernel_map, ring_rxaddr, PAGE_SIZE,
			    UVM_KMF_VAONLY);
			uvm_km_free(kernel_map, ring_txaddr, PAGE_SIZE,
			    UVM_KMF_VAONLY);
			printf("%s: can't remap ring: error %d\n",
			    xneti->xni_if.if_xname, error);
			if (error == ENOMEM)
				req->status = NETIF_BE_STATUS_OUT_OF_MEMORY;
			else if (error == EFAULT)
				req->status = NETIF_BE_STATUS_MAPPING_ERROR;
			else
				req->status = NETIF_BE_STATUS_ERROR;
			goto end;
		}
		xneti->xni_rxring = (void *)ring_rxaddr;
		xneti->xni_txring = (void *)ring_txaddr;
		xneti->xni_evtchn = req->evtchn;
		event_set_handler(xneti->xni_evtchn, xennetback_evthandler,
		    xneti, IPL_NET, xneti->xni_if.if_xname);
		printf("%s using event channel %d\n",
		    xneti->xni_if.if_xname, xneti->xni_evtchn);
		hypervisor_enable_event(xneti->xni_evtchn);
		xneti->status = CONNECTED;
		if (xneti->xni_if.if_flags & IFF_UP)
			xneti->xni_if.if_flags |= IFF_RUNNING;
		req->status = NETIF_BE_STATUS_OKAY;
		break;
	}
	case CMSG_NETIF_BE_DISCONNECT:
	{
		netif_be_disconnect_t *req =
		    (netif_be_disconnect_t *)&msg->msg[0];
		vaddr_t ring_addr;

		if (msg->length != sizeof(netif_be_disconnect_t))
			goto error;
		xneti = xnetif_lookup(req->domid, req->netif_handle);
		if (xneti == NULL) {
			req->status = NETIF_BE_STATUS_INTERFACE_NOT_FOUND;
			goto end;
		}
		xneti->status = DISCONNECTED;
		xneti->xni_if.if_flags &= ~(IFF_RUNNING | IFF_OACTIVE);
		xneti->xni_if.if_timer = 0;
		hypervisor_mask_event(xneti->xni_evtchn);
		event_remove_handler(xneti->xni_evtchn,
		    xennetback_evthandler, xneti);
		softint_disestablish(xneti->xni_softintr);
		ring_addr = (vaddr_t)xneti->xni_rxring;
		pmap_remove(pmap_kernel(), ring_addr, ring_addr + PAGE_SIZE);
		uvm_km_free(kernel_map, ring_addr, PAGE_SIZE,
		    UVM_KMF_VAONLY);
		ring_addr = (vaddr_t)xneti->xni_txring;
		pmap_remove(pmap_kernel(), ring_addr, ring_addr + PAGE_SIZE);
		uvm_km_free(kernel_map, ring_addr, PAGE_SIZE,
		    UVM_KMF_VAONLY);

		req->status = NETIF_BE_STATUS_OKAY;
		break;
	}
	default:
error:
		printf("xnetback: wrong message subtype %d len %d\n",
		    msg->subtype, msg->length);
		msg->length = 0;
	}
end:
	XENPRINTF(("xnetback msg rep %d\n", msg->length));
	ctrl_if_send_response(msg);
	return;
}

/* lookup a xneti based on domain id and interface handle */
static struct xnetback_instance *
xnetif_lookup(domid_t dom , uint32_t handle)
{
	struct xnetback_instance *xneti;

	SLIST_FOREACH(xneti, &xnetback_instances, next) {
		if (xneti->domid == dom && xneti->handle == handle)
			return xneti;
	}
	return NULL;
}


/* get a page to replace an mbuf cluster page given to a domain */
static int
xennetback_get_mcl_page(paddr_t *map)
{
	if (mcl_pages_alloc < 0)
		/*
		 * we exhausted our allocation. We can't allocate new ones yet
		 * because the current pages may not have been loaned to
		 * the remote domain yet. We have to let the caller do this.
		 */
		return -1;

	*map = mcl_pages[mcl_pages_alloc] << PAGE_SHIFT;
	mcl_pages_alloc--;
	return 0;
	
}

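/*
 * Refill mcl_pages[] with fresh machine pages from the hypervisor, to
 * replace those already given away to foreign domains.
 */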
static void
xennetback_get_new_mcl_pages(void)
{
	int nb_pages;

	/* get some new pages. */
	nb_pages = HYPERVISOR_dom_mem_op(MEMOP_increase_reservation, mcl_pages,
	    NB_XMIT_PAGES_BATCH, 0);
	if (nb_pages <= 0) {
		printf("xennetback: can't get new mcl pages (%d)\n", nb_pages);
		return;
	}
	if (nb_pages != NB_XMIT_PAGES_BATCH)
		printf("xennetback: got only %d new mcl pages\n", nb_pages);

	mcl_pages_alloc = nb_pages - 1;
}


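/*
 * Queue a response on the TX ring, and notify the frontend if it
 * requested an event at this response index.
 */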
static inline void
xennetback_tx_response(struct xnetback_instance *xneti, int id, int status)
{
	NETIF_RING_IDX resp_prod;
	netif_tx_response_t *txresp;

	resp_prod = xneti->xni_txring->resp_prod;
	txresp = &xneti->xni_txring->ring[MASK_NETIF_TX_IDX(resp_prod)].resp;

	txresp->id = id;
	txresp->status = status;
	x86_lfence();
	xneti->xni_txring->resp_prod++;
	x86_lfence();
	if (xneti->xni_txring->event == xneti->xni_txring->resp_prod) {
		XENPRINTF(("%s send event\n", xneti->xni_if.if_xname));
		hypervisor_notify_via_evtchn(xneti->xni_evtchn);
	}
}


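/*
 * Event channel handler: process the guest's TX requests (packets
 * transmitted by the guest, received by us), map the guest pages and
 * hand the packets to the network stack.
 */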
static int
xennetback_evthandler(void *arg)
{
	struct xnetback_instance *xneti = arg;
	struct ifnet *ifp = &xneti->xni_if;
	netif_tx_request_t *txreq;
	struct xni_pkt *pkt;
	struct xni_pages_hash *pkt_hash;
	struct xni_page *pkt_page;
	NETIF_RING_IDX req_prod;
	NETIF_RING_IDX req_cons, i;
	vaddr_t pkt_va;
	paddr_t pkt_ma;
	struct mbuf *m;

	XENPRINTF(("xennetback_evthandler "));
again:
	req_prod = xneti->xni_txring->req_prod;
	x86_lfence(); /* ensure we see all requests up to req_prod */
	req_cons = xneti->xni_txring->req_cons;
	XENPRINTF(("%s event req_prod %d resp_prod %d req_cons %d event %d\n",
	    xneti->xni_if.if_xname,
	    xneti->xni_txring->req_prod,
	    xneti->xni_txring->resp_prod,
	    xneti->xni_txring->req_cons,
	    xneti->xni_txring->event));
	for (i = 0; req_cons != req_prod; req_cons++, i++) {
		txreq =
		    &xneti->xni_txring->ring[MASK_NETIF_TX_IDX(req_cons)].req;
		XENPRINTF(("%s pkt size %d\n", xneti->xni_if.if_xname, txreq->size));
		if (__predict_false((ifp->if_flags & (IFF_UP | IFF_RUNNING)) !=
		    (IFF_UP | IFF_RUNNING))) {
			/* interface not up, drop */
			xennetback_tx_response(xneti, txreq->id,
			    NETIF_RSP_DROPPED);
			continue;
		}
		/*
		 * Do some sanity checks, and map the packet's page.
		 */
		if (__predict_false(txreq->size < ETHER_HDR_LEN ||
		   txreq->size > (ETHER_MAX_LEN - ETHER_CRC_LEN))) {
			printf("%s: packet size %d too big\n",
			    ifp->if_xname, txreq->size);
			xennetback_tx_response(xneti, txreq->id,
			    NETIF_RSP_ERROR);
			ifp->if_ierrors++;
			continue;
		}
		/* don't cross page boundaries */
		if (__predict_false(
		    (txreq->addr & PAGE_MASK) + txreq->size > PAGE_SIZE)) {
			printf("%s: packet cross page boundary\n",
			    ifp->if_xname);
			xennetback_tx_response(xneti, txreq->id,
			    NETIF_RSP_ERROR);
			ifp->if_ierrors++;
			continue;
		}
		/* get a mbuf for this packet */
		MGETHDR(m, M_DONTWAIT, MT_DATA);
		if (__predict_false(m == NULL)) {
			static struct timeval lasttime;
			if (ratecheck(&lasttime, &xni_pool_errintvl))
				printf("%s: mbuf alloc failed\n",
				    ifp->if_xname);
			xennetback_tx_response(xneti, txreq->id,
			    NETIF_RSP_DROPPED);
			ifp->if_ierrors++;
			continue;
		}

		pkt_ma = txreq->addr  & ~PAGE_MASK;
		XENPRINTF(("%s pkt ma 0x%lx size %d id %d req_cons %d\n",
		    xneti->xni_if.if_xname, pkt_ma,
		    txreq->size, txreq->id, MASK_NETIF_TX_IDX(req_cons)));
		    
		pkt = pool_get(&xni_pkt_pool, PR_NOWAIT);
		if (__predict_false(pkt == NULL)) {
			static struct timeval lasttime;
			if (ratecheck(&lasttime, &xni_pool_errintvl))
				printf("%s: xnbpkt alloc failed\n",
				    ifp->if_xname);
			xennetback_tx_response(xneti, txreq->id,
			    NETIF_RSP_DROPPED);
			ifp->if_ierrors++;
			m_freem(m);
			continue;
		}

		pkt_hash = &xni_tx_pages_hash[
		    (pkt_ma >> PAGE_SHIFT) & XNI_PAGE_HASH_MASK];
		SLIST_FOREACH(pkt_page, pkt_hash, xni_page_next) {
			if (pkt_page->ma == pkt_ma)
				break;
		}
		if (pkt_page == NULL) {
			pkt_page = pool_get(&xni_page_pool, PR_NOWAIT);
			if (__predict_false(pkt_page == NULL)) {
				static struct timeval lasttime;
				if (ratecheck(&lasttime, &xni_pool_errintvl))
					printf("%s: xnbpa alloc failed\n",
					    ifp->if_xname);
				xennetback_tx_response(xneti, txreq->id,
				    NETIF_RSP_DROPPED);
				ifp->if_ierrors++;
				m_freem(m);
				pool_put(&xni_pkt_pool, pkt);
				continue;
			}
			pkt_page->refcount = 0;
			if (__predict_false(xen_shm_map(&pkt_ma,
			    1, xneti->domid, &pkt_va, 0) != 0)) {
				static struct timeval lasttime;
				if (ratecheck(&lasttime, &xni_pool_errintvl))
					printf("%s: can't map packet page\n",
					    ifp->if_xname);
				xennetback_tx_response(xneti, txreq->id,
				    NETIF_RSP_DROPPED);
				ifp->if_ierrors++;
				m_freem(m);
				pool_put(&xni_pkt_pool, pkt);
				pool_put(&xni_page_pool, pkt_page);
				continue;
			}
			XENPRINTF(("new pkt_page va 0x%lx mbuf %p\n",
			    pkt_va, m));
			pkt_page->ma = pkt_ma;
			pkt_page->va = pkt_va;
			SLIST_INSERT_HEAD(pkt_hash, pkt_page, xni_page_next);
		} else {
			KASSERT(pkt_page->refcount > 0);
			pkt_va = pkt_page->va;
			XENPRINTF(("pkt_page refcount %d va 0x%lx m %p\n",
			    pkt_page->refcount, pkt_va, m));
		}
		if ((ifp->if_flags & IFF_PROMISC) == 0) {
			struct ether_header *eh =
			    (void*)(pkt_va | (txreq->addr & PAGE_MASK));
			if (ETHER_IS_MULTICAST(eh->ether_dhost) == 0 &&
			    memcmp(CLLADDR(ifp->if_sadl), eh->ether_dhost,
			    ETHER_ADDR_LEN) != 0) {
				pool_put(&xni_pkt_pool, pkt);
				m_freem(m);
				if (pkt_page->refcount == 0) {
					xen_shm_unmap(pkt_page->va,
					    &pkt_page->ma, 1, xneti->domid);
					SLIST_REMOVE(pkt_hash, pkt_page,
					    xni_page, xni_page_next);
					pool_put(&xni_page_pool, pkt_page);
				}
				xennetback_tx_response(xneti, txreq->id,
				    NETIF_RSP_OKAY);
				continue; /* packet is not for us */
			}
		}

		if (MASK_NETIF_TX_IDX(req_cons + 1) ==
		    MASK_NETIF_TX_IDX(xneti->xni_txring->resp_prod)) {
			/*
			 * This is the last TX buffer. Copy the data and
			 * ack it. Delaying it until the mbuf is
			 * freed will stall transmit.
			 */
			pool_put(&xni_pkt_pool, pkt);
			m->m_len = min(MHLEN, txreq->size);
			m->m_pkthdr.len = 0;
			m_copyback(m, 0, txreq->size,
			    (void *)(pkt_va | (txreq->addr & PAGE_MASK)));
			if (pkt_page->refcount == 0) {
				xen_shm_unmap(pkt_page->va, &pkt_page->ma, 1,
				    xneti->domid);
				SLIST_REMOVE(pkt_hash, pkt_page, xni_page,
				    xni_page_next);
				pool_put(&xni_page_pool, pkt_page);
			}
			if (m->m_pkthdr.len < txreq->size) {
				ifp->if_ierrors++;
				m_freem(m);
				xennetback_tx_response(xneti, txreq->id,
				    NETIF_RSP_DROPPED);
				continue;
			}
			xennetback_tx_response(xneti, txreq->id,
			    NETIF_RSP_OKAY);
		} else {
			pkt->pkt_id = txreq->id;
			pkt->pkt_xneti = xneti;
			pkt->pkt_page = pkt_page;
			pkt_page->refcount++;

			MEXTADD(m, pkt_va | (txreq->addr & PAGE_MASK),
			    txreq->size, M_DEVBUF, xennetback_tx_free, pkt);
			m->m_pkthdr.len = m->m_len = txreq->size;
		}
		m->m_pkthdr.rcvif = ifp;
		ifp->if_ipackets++;
		
#if NBPFILTER > 0
		if (ifp->if_bpf)
			bpf_mtap(ifp->if_bpf, m);
#endif
		(*ifp->if_input)(ifp, m);
	}
	xneti->xni_txring->req_cons = req_cons;
	/*
	 * make sure the guest will see our replies before testing for more
	 * work.
	 */
	x86_lfence(); /* ensure we see all requests up to req_prod */
	if (i > 0)
		goto again; /* more work to do ? */

	/* check to see if we can transmit more packets */
	softint_schedule(xneti->xni_softintr);

	return 1;
}
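
/*
 * External storage free callback for mbufs built in xennetback_evthandler():
 * send the delayed TX response, then drop the reference on the guest page
 * and unmap it once the last reference is gone.
 */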
static void
xennetback_tx_free(struct mbuf *m, void *va, size_t size, void *arg)
{
	int s = splnet();
	struct xni_pkt *pkt = arg;
	struct xni_page *pkt_page = pkt->pkt_page;
	struct xnetback_instance *xneti = pkt->pkt_xneti;
	NETIF_RING_IDX resp_prod;
	netif_tx_response_t *txresp;
	struct xni_pages_hash *pkt_hash;

	XENPRINTF(("xennetback_tx_free ma 0x%lx refcount %d\n",
	   pkt_page->ma, pkt_page->refcount));

	resp_prod = xneti->xni_txring->resp_prod;
	XENPRINTF(("ack id %d resp_prod %d\n",
	    pkt->pkt_id, MASK_NETIF_TX_IDX(resp_prod)));
	txresp = &xneti->xni_txring->ring[MASK_NETIF_TX_IDX(resp_prod)].resp;
	txresp->id = pkt->pkt_id;
	txresp->status = NETIF_RSP_OKAY;
	x86_lfence();
	resp_prod++;
	xneti->xni_txring->resp_prod = resp_prod;
	x86_lfence();
	if (resp_prod == xneti->xni_txring->event) {
		XENPRINTF(("%s send event\n",
		    xneti->xni_if.if_xname));
		hypervisor_notify_via_evtchn(xneti->xni_evtchn);
	}
	KASSERT(pkt_page->refcount > 0);
	pkt_page->refcount--;
	pool_put(&xni_pkt_pool, pkt);

	if (pkt_page->refcount == 0) {
		xen_shm_unmap(pkt_page->va, &pkt_page->ma, 1, xneti->domid);
		pkt_hash = &xni_tx_pages_hash[
		    (pkt_page->ma >> PAGE_SHIFT) & XNI_PAGE_HASH_MASK];
		SLIST_REMOVE(pkt_hash, pkt_page, xni_page, xni_page_next);
		pool_put(&xni_page_pool, pkt_page);
	}
	if (m)
		pool_cache_put(mb_cache, m);
	splx(s);
}

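/*
 * Interface ioctl: everything is handled by ether_ioctl(); ENETRESET is
 * ignored as there is no hardware filter to reprogram.
 */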
static int
xennetback_ifioctl(struct ifnet *ifp, u_long cmd, void *data)
{
	//struct xnetback_instance *xneti = ifp->if_softc;
	//struct ifreq *ifr = (struct ifreq *)data;
	int s, error;

	s = splnet();
	error = ether_ioctl(ifp, cmd, data);
	if (error == ENETRESET)
		error = 0;
	splx(s);
	return error;
}

static void
xennetback_ifstart(struct ifnet *ifp)
{
	struct xnetback_instance *xneti = ifp->if_softc;

	/*
	 * The Xen communication channel is much more efficient if we can
	 * schedule a batch of packets for the domain. To achieve this, we
	 * schedule a soft interrupt, and just return. This way, the network
	 * stack will enqueue all pending mbufs in the interface's send queue
	 * before it is processed by xennetback_ifsoftstart().
	 */
	softint_schedule(xneti->xni_softintr);
}

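/*
 * Soft interrupt handler: dequeue pending mbufs and give their pages to
 * the guest by page flipping, batching the MMU updates in a single
 * multicall, then replenish our replacement page list as needed.
 */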
static void
xennetback_ifsoftstart(void *arg)
{
	struct xnetback_instance *xneti = arg;
	struct ifnet *ifp = &xneti->xni_if;
	struct mbuf *m;
	vaddr_t xmit_va;
	paddr_t xmit_pa;
	paddr_t xmit_ma;
	paddr_t newp_ma = 0; /* XXX gcc */
	int i, j, nppitems;
	mmu_update_t *mmup;
	multicall_entry_t *mclp;
	netif_rx_request_t *rxreq;
	netif_rx_response_t *rxresp;
	NETIF_RING_IDX req_prod, resp_prod;
	int do_event = 0;

	XENPRINTF(("xennetback_ifsoftstart "));
	int s = splnet();
	if (__predict_false(
	    (ifp->if_flags & (IFF_RUNNING|IFF_OACTIVE)) != IFF_RUNNING)) {
		splx(s);
		return;
	}

	while (!IFQ_IS_EMPTY(&ifp->if_snd)) {
		XENPRINTF(("pkt\n"));
		req_prod = xneti->xni_rxring->req_prod;
		resp_prod = xneti->xni_rxring->resp_prod;
		x86_lfence();

		mmup = xstart_mmu;
		mclp = xstart_mcl;
		for (nppitems = 0, i = 0; !IFQ_IS_EMPTY(&ifp->if_snd);) {
			XENPRINTF(("have a packet\n"));
			IFQ_POLL(&ifp->if_snd, m);
			if (__predict_false(m == NULL))
				panic("xennetback_ifstart: IFQ_POLL");
			if (__predict_false(req_prod == xneti->rxreq_cons)) {
				/* out of ring space */
				XENPRINTF(("xennetback_ifstart: ring full "
				    "req_prod 0x%x req_cons 0x%x resp_prod "
				    "0x%x\n",
				    req_prod, xneti->rxreq_cons, resp_prod));
				ifp->if_timer = 1;
				break;
			}
			if (__predict_false(i == NB_XMIT_PAGES_BATCH))
				break; /* we filled the array */
			if (__predict_false(
			    xennetback_get_mcl_page(&newp_ma) != 0))
				break; /* out of memory */
			/* start filling ring */
			rxreq = &xneti->xni_rxring->ring[
			    MASK_NETIF_RX_IDX(xneti->rxreq_cons)].req;
			rxresp = &xneti->xni_rxring->ring[
			    MASK_NETIF_RX_IDX(resp_prod)].resp;
			rxresp->id = rxreq->id;
			rxresp->status = m->m_pkthdr.len;
			if ((m->m_flags & M_CLUSTER) != 0 &&
			    !M_READONLY(m) && MCLBYTES == PAGE_SIZE) {
				/* we can give this page away */
				xmit_pa = m->m_ext.ext_paddr;
				xmit_ma = xpmap_ptom(xmit_pa);
				xmit_va = (vaddr_t)m->m_ext.ext_buf;
				KASSERT(xmit_pa != M_PADDR_INVALID);
				KASSERT((xmit_va & PAGE_MASK) == 0);
				rxresp->addr =
				    xmit_ma + m->m_data - m->m_ext.ext_buf;
			} else {
				/* we have to copy the packet */
				xmit_va = (vaddr_t)pool_cache_get_paddr(
				    xmit_pages_cachep,
				    PR_NOWAIT, &xmit_pa);
				if (__predict_false(xmit_va == 0))
					break; /* out of memory */

				KASSERT(xmit_pa != POOL_PADDR_INVALID);
				xmit_ma = xpmap_ptom(xmit_pa);
				XENPRINTF(("xennetback_get_xmit_page: got va "
				    "0x%x ma 0x%x\n", (u_int)xmit_va,
				    (u_int)xmit_ma));
				m_copydata(m, 0, m->m_pkthdr.len,
				    (void *)xmit_va);
				rxresp->addr = xmit_ma;
				pages_pool_free[nppitems].va = xmit_va;
				pages_pool_free[nppitems].pa = xmit_pa;
				nppitems++;
			}
			/*
			 * transfer the page containing the packet to the
			 * remote domain, and map newp in its place.
			 */
			xpmap_phys_to_machine_mapping[
			    (xmit_pa - XPMAP_OFFSET) >> PAGE_SHIFT] =
			    newp_ma >> PAGE_SHIFT;
			mmup[0].ptr = newp_ma | MMU_MACHPHYS_UPDATE;
			mmup[0].val =
			    (xmit_pa - XPMAP_OFFSET) >> PAGE_SHIFT;
			mmup[1].ptr = MMU_EXTENDED_COMMAND;
			mmup[1].val = MMUEXT_SET_FOREIGNDOM |
			    ((int)xneti->domid << 16);
			mmup[2].ptr = xmit_ma | MMU_EXTENDED_COMMAND;
			mmup[2].val = MMUEXT_REASSIGN_PAGE;
			mclp[0].op = __HYPERVISOR_update_va_mapping;
			mclp[0].args[0] = xmit_va >> PAGE_SHIFT;
			mclp[0].args[1] =
			    newp_ma | PG_V | PG_RW | PG_U | PG_M;
			mclp[0].args[2] = UVMF_INVLPG;
			mclp[1].op = __HYPERVISOR_mmu_update;
			mclp[1].args[0] = (unsigned long)mmup;
			mclp[1].args[1] = 3;
			mclp[1].args[2] = 0;
			mmup += 3;
			mclp += 2;

			/* done with this packet */
			xneti->rxreq_cons++;
			resp_prod++;
			do_event = 1;
			IFQ_DEQUEUE(&ifp->if_snd, m);
			i++; /* this packet has been queued */
			ifp->if_opackets++;
#if NBPFILTER > 0
			if (ifp->if_bpf)
				bpf_mtap(ifp->if_bpf, m);
#endif
			mbufs_sent[i-1] = m;
		}
		if (i != 0) {
			/* update the MMU */
			if (HYPERVISOR_multicall(xstart_mcl, i * 2) != 0) {
				panic("%s: HYPERVISOR_multicall failed", 
				    ifp->if_xname);
			}
			for (j = 0; j < i * 2; j++) {
				if (xstart_mcl[j].args[5] != 0)
					printf("%s: xstart_mcl[%d] failed\n",
					    ifp->if_xname, j);
			}
			x86_lfence();
			/* update pointer */
			xneti->xni_rxring->resp_prod += i;
			x86_lfence();
			/* now we can free the mbufs */
			for (j = 0; j < i; j++) {
				m_freem(mbufs_sent[j]);
			}
			for (j = 0; j < nppitems; j++) {
				pool_cache_put_paddr(xmit_pages_cachep,
				    (void *)pages_pool_free[j].va,
				    pages_pool_free[j].pa);
			}
		}
		/* send event */
		if (do_event) {
			x86_lfence();
			XENPRINTF(("%s receive event\n",
			    xneti->xni_if.if_xname));
			hypervisor_notify_via_evtchn(xneti->xni_evtchn);
			do_event = 0;
		}
		/* check if we need to get back some pages */
		if (mcl_pages_alloc < 0) {
			xennetback_get_new_mcl_pages();
			if (mcl_pages_alloc < 0) {
				/*
				 * set up the watchdog to try again, because
				 * xennetback_ifstart() will never be called
				 * again if the queue is full.
				 */
				printf("xennetback_ifstart: no mcl_pages\n");
				ifp->if_timer = 1;
				break;
			}
		}
		if ((volatile NETIF_RING_IDX)(xneti->xni_rxring->req_prod) ==
		    xneti->rxreq_cons) {
			/* ring full */
			break;
		}
	}
	splx(s);
}


static void
xennetback_ifwatchdog(struct ifnet * ifp)
{
	/*
	 * We can get to the following condition:
	 * transmit stalls because the ring is full when the ifq is full too.
	 * In this case (as, unfortunately, we don't get an interrupt from Xen
	 * on transmit) nothing will ever call xennetback_ifstart() again.
	 * Here we abuse the watchdog to get out of this condition.
	 */
	XENPRINTF(("xennetback_ifwatchdog\n"));
	xennetback_ifstart(ifp);
}


static int
xennetback_ifinit(struct ifnet *ifp)
{
	struct xnetback_instance *xneti = ifp->if_softc;
	int s = splnet();

	if ((ifp->if_flags & IFF_UP) == 0) {
		splx(s);
		return 0;
	}
	if (xneti->status == CONNECTED)
		ifp->if_flags |= IFF_RUNNING;
	splx(s);
	return 0;
}

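/*
 * Stop the interface; if we are still connected to the frontend, process
 * any requests still pending on the ring first.
 */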
static void
xennetback_ifstop(struct ifnet *ifp, int disable)
{
	struct xnetback_instance *xneti = ifp->if_softc;
	int s = splnet();

	ifp->if_flags &= ~(IFF_RUNNING | IFF_OACTIVE);
	ifp->if_timer = 0;
	if (xneti->status == CONNECTED) {
		XENPRINTF(("%s: req_prod 0x%x resp_prod 0x%x req_cons 0x%x "
		    "event 0x%x\n", ifp->if_xname, xneti->xni_txring->req_prod,
		    xneti->xni_txring->resp_prod, xneti->xni_txring->req_cons,
		    xneti->xni_txring->event));
		xennetback_evthandler(ifp->if_softc); /* flush pending RX requests */
	}
	splx(s);
}