/*      $NetBSD: xbdback.c,v 1.34 2008/10/21 15:46:32 cegger Exp $      */

/*
 * Copyright (c) 2005 Manuel Bouyer.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by Manuel Bouyer.
 * 4. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: xbdback.c,v 1.34 2008/10/21 15:46:32 cegger Exp $");

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/queue.h>
#include <sys/kernel.h>
#include <sys/conf.h>
#include <sys/disk.h>
#include <sys/disklabel.h>
#include <sys/fcntl.h>
#include <sys/vnode.h>
#include <sys/kauth.h>
#include <sys/workqueue.h>

#include <machine/pmap.h>
#include <xen/hypervisor.h>
#include <xen/xen.h>
#include <xen/evtchn.h>
#include <xen/ctrl_if.h>
#include <xen/xen_shm.h>

#ifdef XENDEBUG_VBD
#define XENPRINTF(x) printf x
#else
#define XENPRINTF(x)
#endif

#define u16 uint16_t

/*
 * Backend block device driver for Xen.
 */

/* Max number of pages per request.  The request may not be page aligned. */
#define BLKIF_MAX_PAGES_PER_REQUEST (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1)

/* Values are expressed in 512-byte sectors. */
#define VBD_BSIZE 512
#define VBD_MAXSECT ((PAGE_SIZE / VBD_BSIZE) - 1)

struct xbd_vbd;
struct xbdback_request;
struct xbdback_io;
struct xbdback_fragment;
struct xbdback_instance;

/* state of a xbdback instance */
typedef enum {CONNECTED, DISCONNECTING, DISCONNECTED} xbdback_state_t;

/*
 * Since there are a variety of conditions that can block our I/O
 * processing, which isn't allowed to suspend its thread's execution,
 * such things will be done in a sort of continuation-passing style.
 *
 * The return value is NULL to indicate that execution has blocked; if
 * it's finished, set xbdi->cont (see below) to NULL and the return
 * value doesn't matter.  Otherwise, the return value is passed as the
 * second parameter to the new value of xbdi->cont.
 */
typedef void *(*xbdback_cont_t)(struct xbdback_instance *, void *);
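/*
 * Illustrative sketch only (not part of the driver): a hypothetical
 * two-step continuation chain as driven by xbdback_trampoline() below.
 * The names example_step1/example_step2 are invented for the example.
 *
 *	static void *
 *	example_step1(struct xbdback_instance *xbdi, void *obj)
 *	{
 *		xbdi->cont = example_step2;	// who runs next
 *		return xbdback_pool_get(...);	// NULL => blocked; the
 *						// trampoline simply stops
 *	}
 *
 *	static void *
 *	example_step2(struct xbdback_instance *xbdi, void *obj)
 *	{
 *		// obj is whatever step1 returned (here, the pool item)
 *		xbdi->cont = NULL;		// chain finished
 *		return xbdi;			// return value now ignored
 *	}
 *
 * When the pool item later becomes available, xbdback_pool_put() hands
 * it back to xbdback_trampoline(), which resumes at xbdi->cont.
 */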
/* we keep the xbdback instances in a linked list */
struct xbdback_instance {
	SLIST_ENTRY(xbdback_instance) next;
	domid_t domid;		/* attached to this domain */
	uint32_t handle;	/* domain-specific handle */
	volatile xbdback_state_t status;
	/* parameters for the communication */
	unsigned int evtchn;
	paddr_t ma_ring;
	/* private parameters for communication */
	blkif_ring_t *blk_ring;
	BLKIF_RING_IDX resp_prod; /* our current reply index */
	BLKIF_RING_IDX req_cons; /* our current request index */
	/* disconnection must be postponed until all I/O is done */
	volatile unsigned refcnt;
	uint8_t disconnect_rspid; /* request id of the disconnect request */
	/*
	 * State for I/O processing/coalescing follows; this has to
	 * live here instead of on the stack because of the
	 * continuation-ness (see above).
	 */
	BLKIF_RING_IDX req_prod; /* limit on request indices */
	xbdback_cont_t cont, cont_aux;
	SIMPLEQ_ENTRY(xbdback_instance) on_hold; /* waiting on resources */

	/* _request state */
	struct xbdback_request *req; /* if NULL, ignore following */
	blkif_request_t *xen_req;
	int segno;
	struct xbd_vbd *req_vbd;

	/* _io state */
	struct xbdback_io *io; /* if NULL, ignore next field */
	daddr_t next_sector;
	unsigned long last_fas, this_fas;

	/* other state */
	int same_page; /* are we merging two segments on the same page? */
	SLIST_HEAD(, xbd_vbd) vbds; /* list of virtual block devices */
};

/* Manipulation of the above reference count. */
/* XXXjld@panix.com: not MP-safe, and move the i386 asm elsewhere. */
#define xbdi_get(xbdip) (++(xbdip)->refcnt)
#define xbdi_put(xbdip)						\
do {								\
	__asm volatile("decl %0"				\
	    : "=m"((xbdip)->refcnt) : "m"((xbdip)->refcnt));	\
	if (0 == (xbdip)->refcnt)				\
		xbdback_finish_disconnect(xbdip);		\
} while (/* CONSTCOND */ 0)

/* each xbdback instance has a list of vbds associated with it */
struct xbd_vbd {
	SLIST_ENTRY(xbd_vbd) next;
	blkif_vdev_t vdev; /* interface-specific ID */
	int flags;
#define VBD_F_RO 0x01 /* device is read-only */
	int type; /* VDISK_TYPE_foo */
	/* for now we allow only one extent per vbd */
	dev_t dev; /* underlying device */
	const struct bdevsw *bdevsw; /* pointer to the device's bdevsw */
	struct vnode *vp;
	int start;
	int size;
};

SLIST_HEAD(, xbdback_instance) xbdback_instances;
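/*
 * Illustrative sketch only (not part of the driver): how the reference
 * count pairs up.  The instance starts at refcnt == 1 (set in
 * CMSG_BLKIF_BE_CREATE); each in-flight xbdback_io takes one more
 * reference (xbdi_get() in xbdback_co_io_gotio()) and drops it when
 * the I/O completes (xbdi_put() in xbdback_iodone()).  The disconnect
 * path drops the initial reference, so xbdback_finish_disconnect()
 * runs only once the last I/O is done:
 *
 *	xbdi_get(xbdi);		// I/O issued
 *	...
 *	xbdi_put(xbdi);		// I/O done; may trigger the
 *				// deferred disconnect
 */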
/*
 * For each request from a guest, a xbdback_request is allocated from
 * a pool.  This will describe the request until completion.  The
 * request may require multiple I/O operations to perform, so the
 * per-I/O information is not stored here.
 */
struct xbdback_request {
	struct xbdback_instance *rq_xbdi; /* our xbd instance */
	uint8_t rq_operation;
	unsigned long rq_id;
	int rq_iocount; /* reference count; or, number of outstanding I/O's */
	int rq_ioerrs;
};

/*
 * For each I/O operation associated with one of those requests, an
 * xbdback_io is allocated from a pool.  It may correspond to multiple
 * Xen disk requests, or parts of them, if several arrive at once that
 * can be coalesced.
 */
struct xbdback_io {
	struct work xio_work;
	struct buf xio_buf; /* our I/O */
	/* The instance pointer is duplicated for convenience. */
	struct xbdback_instance *xio_xbdi; /* our xbd instance */
	SLIST_HEAD(, xbdback_fragment) xio_rq; /* xbd requests involved */
	vaddr_t xio_vaddr; /* the virtual address to map the request at */
	paddr_t xio_ma[XENSHM_MAX_PAGES_PER_REQUEST]; /* guest pages to map */
	uint16_t xio_nrma; /* number of guest pages */
	uint16_t xio_mapped;
};

/*
 * Rather than have the xbdback_io keep an array of the
 * xbdback_requests involved, since the actual number will probably be
 * small but might be as large as BLKIF_RING_SIZE, use a list.  This
 * would be threaded through xbdback_request, but one of them might be
 * part of multiple I/O's, alas.
 */
struct xbdback_fragment {
	struct xbdback_request *car;
	SLIST_ENTRY(xbdback_fragment) cdr;
};
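/*
 * Illustrative layout only (not part of the driver): one coalesced
 * xbdback_io whose fragment list names the two guest requests it
 * carries data for.  Completion walks this list and finalizes each
 * xbdback_request whose rq_iocount drops to zero.
 *
 *	xbdback_io.xio_rq --> fragment{car = request A}
 *	                  --> fragment{car = request B}
 *
 * Request A may simultaneously appear on another xbdback_io's list,
 * which is why the link lives in the fragment and not in the request.
 */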
/*
 * Wrap our pools with a chain of xbdback_instances whose I/O
 * processing has blocked for want of memory from that pool.
 */
struct xbdback_pool {
	struct pool p;
	SIMPLEQ_HEAD(xbdback_iqueue, xbdback_instance) q;
	struct timeval last_warning;
} xbdback_request_pool, xbdback_io_pool, xbdback_fragment_pool;

static struct xbdback_iqueue xbdback_shmq;
static int xbdback_shmcb; /* have we already registered a callback? */

struct timeval xbdback_poolsleep_intvl = { 5, 0 };
#ifdef DEBUG
struct timeval xbdback_fragio_intvl = { 60, 0 };
#endif

static void xbdback_ctrlif_rx(ctrl_msg_t *, unsigned long);
static int  xbdback_evthandler(void *);
static void xbdback_finish_disconnect(struct xbdback_instance *);

static struct xbdback_instance *xbdif_lookup(domid_t, uint32_t);
static struct xbd_vbd *vbd_lookup(struct xbdback_instance *, blkif_vdev_t);

static void *xbdback_co_main(struct xbdback_instance *, void *);
static void *xbdback_co_main_loop(struct xbdback_instance *, void *);
static void *xbdback_co_main_incr(struct xbdback_instance *, void *);
static void *xbdback_co_main_done(struct xbdback_instance *, void *);
static void *xbdback_co_main_done2(struct xbdback_instance *, void *);

static void *xbdback_co_io(struct xbdback_instance *, void *);
static void *xbdback_co_io_gotreq(struct xbdback_instance *, void *);
static void *xbdback_co_io_loop(struct xbdback_instance *, void *);
static void *xbdback_co_io_gotio(struct xbdback_instance *, void *);
static void *xbdback_co_io_gotio2(struct xbdback_instance *, void *);
static void *xbdback_co_io_gotfrag(struct xbdback_instance *, void *);
static void *xbdback_co_io_gotfrag2(struct xbdback_instance *, void *);

static void *xbdback_co_flush(struct xbdback_instance *, void *);
static void *xbdback_co_flush_done(struct xbdback_instance *, void *);

static void *xbdback_co_probe(struct xbdback_instance *, void *);
static void *xbdback_co_probe_gotio(struct xbdback_instance *, void *);
static void *xbdback_co_probe_gotvm(struct xbdback_instance *, void *);

static int  xbdback_shm_callback(void *);
static void xbdback_io_error(struct xbdback_io *, int);
static void xbdback_do_io(struct work *, void *);
static void xbdback_iodone(struct buf *);
static void xbdback_send_reply(struct xbdback_instance *, int, int, int);

static void *xbdback_map_shm(struct xbdback_io *);
static void xbdback_unmap_shm(struct xbdback_io *);

static void *xbdback_pool_get(struct xbdback_pool *,
    struct xbdback_instance *);
static void xbdback_pool_put(struct xbdback_pool *, void *);
static void xbdback_trampoline(struct xbdback_instance *, void *);

struct workqueue *xbdback_workqueue;

void
xbdback_init(void)
{
	ctrl_msg_t cmsg;
	blkif_be_driver_status_t st;

	if (!xendomain_is_dom0() &&
	    !(xen_start_info.flags & SIF_BLK_BE_DOMAIN))
		return;

	XENPRINTF(("xbdback_init\n"));

	/*
	 * Initialize the backend driver, register the control message
	 * handler, and send the driver-up message.
	 */
	SLIST_INIT(&xbdback_instances);
	SIMPLEQ_INIT(&xbdback_shmq);
	xbdback_shmcb = 0;
	pool_init(&xbdback_request_pool.p, sizeof(struct xbdback_request),
	    0, 0, 0, "xbbrp", NULL, IPL_BIO);
	SIMPLEQ_INIT(&xbdback_request_pool.q);
	pool_init(&xbdback_io_pool.p, sizeof(struct xbdback_io),
	    0, 0, 0, "xbbip", NULL, IPL_BIO);
	SIMPLEQ_INIT(&xbdback_io_pool.q);
	pool_init(&xbdback_fragment_pool.p, sizeof(struct xbdback_fragment),
	    0, 0, 0, "xbbfp", NULL, IPL_BIO);
	SIMPLEQ_INIT(&xbdback_fragment_pool.q);
	/* we allocate enough to handle a whole ring at once */
	if (pool_prime(&xbdback_request_pool.p, BLKIF_RING_SIZE) != 0)
		printf("xbdback: failed to prime request pool\n");
	if (pool_prime(&xbdback_io_pool.p, BLKIF_RING_SIZE) != 0)
		printf("xbdback: failed to prime io pool\n");
	if (pool_prime(&xbdback_fragment_pool.p,
	    BLKIF_MAX_SEGMENTS_PER_REQUEST * BLKIF_RING_SIZE) != 0)
		printf("xbdback: failed to prime fragment pool\n");
	if (workqueue_create(&xbdback_workqueue, "xbdbackd",
	    xbdback_do_io, NULL, PRI_BIO, IPL_BIO, 0))
		printf("xbdback: failed to init workqueue\n");

	(void)ctrl_if_register_receiver(CMSG_BLKIF_BE, xbdback_ctrlif_rx,
	    CALLBACK_IN_BLOCKING_CONTEXT);

	cmsg.type = CMSG_BLKIF_BE;
	cmsg.subtype = CMSG_BLKIF_BE_DRIVER_STATUS;
	cmsg.length = sizeof(blkif_be_driver_status_t);
	st.status = BLKIF_DRIVER_STATUS_UP;
	memcpy(cmsg.msg, &st, sizeof(st));
	ctrl_if_send_message_block(&cmsg, NULL, 0, 0);
}

static void
xbdback_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
{
	struct xbdback_instance *xbdi;
	struct xbd_vbd *vbd;
	int error;

	XENPRINTF(("xbdback msg %d\n", msg->subtype));
	switch (msg->subtype) {
	case CMSG_BLKIF_BE_CREATE:
	{
		blkif_be_create_t *req = (blkif_be_create_t *)&msg->msg[0];
		if (msg->length != sizeof(blkif_be_create_t))
			goto error;
		if (xbdif_lookup(req->domid, req->blkif_handle) != NULL) {
			req->status = BLKIF_BE_STATUS_INTERFACE_EXISTS;
			goto end;
		}
		xbdi = malloc(sizeof(struct xbdback_instance), M_DEVBUF,
		    M_NOWAIT | M_ZERO);
		if (xbdi == NULL) {
			req->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
			goto end;
		}
		xbdi->domid = req->domid;
		xbdi->handle = req->blkif_handle;
		xbdi->status = DISCONNECTED;
		xbdi->refcnt = 1;
		xbdi->blk_ring = NULL;
		SLIST_INIT(&xbdi->vbds);
		SLIST_INSERT_HEAD(&xbdback_instances, xbdi, next);
		req->status = BLKIF_BE_STATUS_OKAY;
		break;
	}
	case CMSG_BLKIF_BE_DESTROY:
	{
		blkif_be_destroy_t *req = (blkif_be_destroy_t *)&msg->msg[0];
		if (msg->length != sizeof(blkif_be_destroy_t))
			goto error;
		xbdi = xbdif_lookup(req->domid, req->blkif_handle);
		if (xbdi == NULL) {
			req->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
			goto end;
		}
		if (xbdi->status != DISCONNECTED) {
			req->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED;
			goto end;
		}
		if (xbdi->blk_ring != NULL)
			uvm_km_free(kernel_map, (vaddr_t)xbdi->blk_ring,
			    PAGE_SIZE, UVM_KMF_VAONLY);
		SLIST_REMOVE(&xbdback_instances, xbdi, xbdback_instance,
		    next);
		free(xbdi, M_DEVBUF);
		req->status = BLKIF_BE_STATUS_OKAY;
		break;
	}
	case CMSG_BLKIF_BE_CONNECT:
	{
		blkif_be_connect_t *req = (blkif_be_connect_t *)&msg->msg[0];
		vaddr_t ring_addr;
		char evname[16];

		if (msg->length != sizeof(blkif_be_connect_t))
			goto error;
		xbdi = xbdif_lookup(req->domid, req->blkif_handle);
		if (xbdi == NULL) {
			req->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
			goto end;
		}
		if (xbdi->status != DISCONNECTED) {
			req->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED;
			goto end;
		}
		if (xbdi->blk_ring == NULL) {
			ring_addr = uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
			    UVM_KMF_VAONLY);
			if (ring_addr == 0) {
				req->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
				goto end;
			}
		} else {
			ring_addr = (vaddr_t)xbdi->blk_ring;
			xbdi->blk_ring = NULL;
		}
		xbdi->ma_ring = req->shmem_frame << PAGE_SHIFT;
		error = pmap_enter_ma(pmap_kernel(), ring_addr, xbdi->ma_ring,
		    0, VM_PROT_READ | VM_PROT_WRITE,
		    PMAP_WIRED | PMAP_CANFAIL, req->domid);
		if (error) {
			uvm_km_free(kernel_map, ring_addr, PAGE_SIZE,
			    UVM_KMF_VAONLY);
			if (error == ENOMEM)
				req->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
			else if (error == EFAULT)
				req->status = BLKIF_BE_STATUS_MAPPING_ERROR;
			else
				req->status = BLKIF_BE_STATUS_ERROR;
			goto end;
		}
		xbdi->blk_ring = (void *)ring_addr;
		xbdi->evtchn = req->evtchn;
		snprintf(evname, sizeof(evname), "xbdback%d", xbdi->domid);
		event_set_handler(xbdi->evtchn, xbdback_evthandler,
		    xbdi, IPL_BIO, evname);
		printf("xbd backend %d for domain %d using event channel "
		    "%d\n", xbdi->handle, xbdi->domid, xbdi->evtchn);
		hypervisor_enable_event(xbdi->evtchn);
		xbdi->status = CONNECTED;
		req->status = BLKIF_BE_STATUS_OKAY;
		break;
	}
	case CMSG_BLKIF_BE_DISCONNECT:
	{
		blkif_be_disconnect_t *req =
		    (blkif_be_disconnect_t *)&msg->msg[0];
		int s;

		if (msg->length != sizeof(blkif_be_disconnect_t))
			goto error;
		xbdi = xbdif_lookup(req->domid, req->blkif_handle);
		if (xbdi == NULL) {
			req->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
			goto end;
		}
		hypervisor_mask_event(xbdi->evtchn);
		event_remove_handler(xbdi->evtchn, xbdback_evthandler, xbdi);
		xbdi->status = DISCONNECTING;
		xbdi->disconnect_rspid = msg->id;
		s = splbio();
		xbdi_put(xbdi);
		splx(s);
		return;
	}
	case CMSG_BLKIF_BE_VBD_CREATE:
	{
		blkif_be_vbd_create_t *req =
		    (blkif_be_vbd_create_t *)&msg->msg[0];
		if (msg->length != sizeof(blkif_be_vbd_create_t))
			goto error;
		xbdi = xbdif_lookup(req->domid, req->blkif_handle);
		if (xbdi == NULL) {
			req->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
			goto end;
		}
		vbd = malloc(sizeof(struct xbd_vbd), M_DEVBUF,
		    M_NOWAIT | M_ZERO);
		if (vbd == NULL) {
			req->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
			goto end;
		}
		vbd->vdev = req->vdevice;
		if (req->readonly)
			vbd->flags |= VBD_F_RO;
		SLIST_INSERT_HEAD(&xbdi->vbds, vbd, next);
		req->status = BLKIF_BE_STATUS_OKAY;
		break;
	}
	case CMSG_BLKIF_BE_VBD_DESTROY:
	{
		blkif_be_vbd_destroy_t *req =
		    (blkif_be_vbd_destroy_t *)&msg->msg[0];
		if (msg->length != sizeof(blkif_be_vbd_destroy_t))
			goto error;
		xbdi = xbdif_lookup(req->domid, req->blkif_handle);
		if (xbdi == NULL) {
			req->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
			goto end;
		}
		vbd = vbd_lookup(xbdi, req->vdevice);
		if (vbd == NULL) {
			req->status = BLKIF_BE_STATUS_VBD_NOT_FOUND;
			goto end;
		}
		if (vbd->size) {
			printf("xbd backend: detach device %s%d%c "
			    "for domain %d\n",
			    devsw_blk2name(major(vbd->dev)),
			    DISKUNIT(vbd->dev), DISKPART(vbd->dev) + 'a',
			    xbdi->domid);
			vbd->start = vbd->size = vbd->dev = 0;
			vn_close(vbd->vp, FREAD, NOCRED);
		}
		SLIST_REMOVE(&xbdi->vbds, vbd, xbd_vbd, next);
		free(vbd, M_DEVBUF);
		req->status = BLKIF_BE_STATUS_OKAY;
		break;
	}
	case CMSG_BLKIF_BE_VBD_GROW:
	{
		blkif_be_vbd_grow_t *req =
		    (blkif_be_vbd_grow_t *)&msg->msg[0];
		const char *devname;
		int major;
		struct partinfo dpart;

		if (msg->length != sizeof(blkif_be_vbd_grow_t))
			goto error;
		xbdi = xbdif_lookup(req->domid, req->blkif_handle);
		if (xbdi == NULL) {
			req->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
			goto end;
		}
		vbd = vbd_lookup(xbdi, req->vdevice);
		if (vbd == NULL) {
			req->status = BLKIF_BE_STATUS_VBD_NOT_FOUND;
			goto end;
		}
		if (vbd->size != 0) {
			req->status = BLKIF_BE_STATUS_VBD_EXISTS;
			goto end;
		}
		if (req->extent.sector_start != 0) {
			req->status = BLKIF_BE_STATUS_EXTENT_NOT_FOUND;
			goto end;
		}
		major = major(req->extent.device);
		devname = devsw_blk2name(major);
		if (devname == NULL) {
			printf("xbdback VBD grow domain %d: unknown device "
			    "0x%x\n", xbdi->domid, req->extent.device);
			req->status = BLKIF_BE_STATUS_EXTENT_NOT_FOUND;
			goto end;
		}
		vbd->dev = req->extent.device;
		vbd->bdevsw = bdevsw_lookup(vbd->dev);
		if (vbd->bdevsw == NULL) {
			printf("xbdback VBD grow domain %d: no bdevsw for "
			    "device 0x%x\n", xbdi->domid, req->extent.device);
			req->status = BLKIF_BE_STATUS_EXTENT_NOT_FOUND;
			goto end;
		}
		error = bdevvp(vbd->dev, &vbd->vp);
		if (error) {
			printf("xbdback VBD grow domain %d: can't open "
			    "device 0x%x (error %d)\n", xbdi->domid,
			    req->extent.device, error);
			req->status = BLKIF_BE_STATUS_EXTENT_NOT_FOUND;
			goto end;
		}
		error = vn_lock(vbd->vp, LK_EXCLUSIVE | LK_RETRY);
		if (error) {
			printf("xbdback VBD grow domain %d: can't lock "
			    "device 0x%x (error %d)\n", xbdi->domid,
			    req->extent.device, error);
			vrele(vbd->vp);
			req->status = BLKIF_BE_STATUS_EXTENT_NOT_FOUND;
			goto end;
		}
		error = VOP_OPEN(vbd->vp, FREAD, NOCRED);
		if (error) {
			printf("xbdback VBD grow domain %d: can't open2 "
			    "device 0x%x (error %d)\n", xbdi->domid,
			    req->extent.device, error);
			vput(vbd->vp);
			req->status = BLKIF_BE_STATUS_EXTENT_NOT_FOUND;
			goto end;
		}
		VOP_UNLOCK(vbd->vp, 0);
		error = VOP_IOCTL(vbd->vp, DIOCGPART, &dpart, FREAD, 0);
		if (error) {
			printf("xbdback VBD grow domain %d: can't ioctl "
			    "device 0x%x (error %d)\n", xbdi->domid,
			    req->extent.device, error);
			vbd->start = vbd->size = vbd->dev = 0;
			vn_close(vbd->vp, FREAD, NOCRED);
			vbd->vp = NULL;
			req->status = BLKIF_BE_STATUS_EXTENT_NOT_FOUND;
			goto end;
		}
		vbd->size = req->extent.sector_length * (512 / DEV_BSIZE);
		/*
		 * Clamp to the size of the underlying partition.  Note
		 * the stray semicolon that used to terminate this `if'
		 * made the clamp unconditional; it has been removed.
		 */
		if (vbd->size == 0 || vbd->size > dpart.part->p_size)
			vbd->size = dpart.part->p_size;
		printf("xbd backend: attach device %s%d%c (size %d) "
		    "for domain %d\n", devname, DISKUNIT(vbd->dev),
		    DISKPART(vbd->dev) + 'a', vbd->size, xbdi->domid);
		req->status = BLKIF_BE_STATUS_OKAY;
		break;
	}
	case CMSG_BLKIF_BE_VBD_SHRINK:
	{
		blkif_be_vbd_shrink_t *req =
		    (blkif_be_vbd_shrink_t *)&msg->msg[0];
		if (msg->length != sizeof(blkif_be_vbd_shrink_t))
			goto error;
		xbdi = xbdif_lookup(req->domid, req->blkif_handle);
		if (xbdi == NULL) {
			req->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
			goto end;
		}
		vbd = vbd_lookup(xbdi, req->vdevice);
		if (vbd == NULL) {
			req->status = BLKIF_BE_STATUS_VBD_NOT_FOUND;
			goto end;
		}
		if (vbd->size == 0) {
			req->status = BLKIF_BE_STATUS_VBD_NOT_FOUND;
			goto end;
		}
		printf("xbd backend: detach device %s%d%c for domain %d\n",
		    devsw_blk2name(major(vbd->dev)), DISKUNIT(vbd->dev),
		    DISKPART(vbd->dev) + 'a', xbdi->domid);
		vbd->start = vbd->size = vbd->dev = 0;
		vn_close(vbd->vp, FREAD, NOCRED);
		req->status = BLKIF_BE_STATUS_OKAY;
		break;
	}
	default:
error:
		printf("xbdback: wrong message subtype %d len %d\n",
		    msg->subtype, msg->length);
		msg->length = 0;
	}
end:
	XENPRINTF(("xbdback msg rep size %d\n", msg->length));
	ctrl_if_send_response(msg);
	return;
}

static void
xbdback_finish_disconnect(struct xbdback_instance *xbdi)
{
	ctrl_msg_t cmsg;
	blkif_be_disconnect_t *pdisc = (blkif_be_disconnect_t *)&cmsg.msg;
	vaddr_t ring_addr;

	KASSERT(xbdi->status == DISCONNECTING);

	ring_addr = (vaddr_t)xbdi->blk_ring;
	pmap_remove(pmap_kernel(), ring_addr, ring_addr + PAGE_SIZE);
	xbdi->status = DISCONNECTED;

	memset(&cmsg, 0, sizeof(cmsg));
	cmsg.type = CMSG_BLKIF_BE;
	cmsg.subtype = CMSG_BLKIF_BE_DISCONNECT;
	cmsg.id = xbdi->disconnect_rspid;
	cmsg.length = sizeof(blkif_be_disconnect_t);
	pdisc->domid = xbdi->domid;
	pdisc->blkif_handle = xbdi->handle;
	pdisc->status = BLKIF_BE_STATUS_OKAY;

	ctrl_if_send_response(&cmsg);
}
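/*
 * Illustrative sequence only (not part of the driver): one plausible
 * order for the control messages a backend instance sees over its
 * lifetime, all dispatched through xbdback_ctrlif_rx() above:
 *
 *	CMSG_BLKIF_BE_CREATE      allocate the xbdback_instance
 *	CMSG_BLKIF_BE_VBD_CREATE  attach a virtual block device
 *	CMSG_BLKIF_BE_VBD_GROW    give the vbd its (single) extent
 *	CMSG_BLKIF_BE_CONNECT     map the ring, bind the event channel
 *	... guest I/O ...
 *	CMSG_BLKIF_BE_DISCONNECT  deferred until the last xbdi_put()
 *	CMSG_BLKIF_BE_DESTROY     free the instance
 */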
static struct xbdback_instance *
xbdif_lookup(domid_t dom, uint32_t handle)
{
	struct xbdback_instance *xbdi;

	SLIST_FOREACH(xbdi, &xbdback_instances, next) {
		if (xbdi->domid == dom && xbdi->handle == handle)
			return xbdi;
	}
	return NULL;
}

static struct xbd_vbd *
vbd_lookup(struct xbdback_instance *xbdi, blkif_vdev_t vdev)
{
	struct xbd_vbd *vbd;

	SLIST_FOREACH(vbd, &xbdi->vbds, next) {
		if (vbd->vdev == vdev)
			return vbd;
	}
	return NULL;
}

static int
xbdback_evthandler(void *arg)
{
	struct xbdback_instance *xbdi = arg;

	if (xbdi->cont == NULL) {
		xbdi->cont = xbdback_co_main;
		xbdback_trampoline(xbdi, xbdi);
	}
	return 1;
}

static void *
xbdback_co_main(struct xbdback_instance *xbdi, void *obj)
{
	(void)obj;
	xbdi->req_prod = xbdi->blk_ring->req_prod;
	x86_lfence(); /* ensure we see all requests up to req_prod */
	/*
	 * Note that we may eventually be handed a full ring of requests;
	 * in that case MASK_BLKIF_IDX(req_cons) == MASK_BLKIF_IDX(req_prod).
	 */
	xbdi->cont = xbdback_co_main_loop;
	return xbdi;
}

static void *
xbdback_co_main_loop(struct xbdback_instance *xbdi, void *obj)
{
	blkif_request_t *req;

	(void)obj;
	if (xbdi->req_cons != xbdi->req_prod) {
		req = xbdi->xen_req = &xbdi->blk_ring->ring[
		    MASK_BLKIF_IDX(xbdi->req_cons)].req;
		XENPRINTF(("xbdback op %d req_cons 0x%x req_prod 0x%x/0x%x "
		    "resp_prod 0x%x/0x%x\n", req->operation,
		    xbdi->req_cons, xbdi->req_prod,
		    MASK_BLKIF_IDX(xbdi->req_prod),
		    xbdi->blk_ring->resp_prod,
		    MASK_BLKIF_IDX(xbdi->blk_ring->resp_prod)));
		switch (req->operation) {
		case BLKIF_OP_PROBE:
			xbdi->cont = xbdback_co_probe;
			break;
		case BLKIF_OP_READ:
		case BLKIF_OP_WRITE:
			xbdi->cont = xbdback_co_io;
			break;
		default:
			printf("xbdback_evthandler domain %d: unknown "
			    "operation %d\n", xbdi->domid, req->operation);
			xbdback_send_reply(xbdi, req->id, req->operation,
			    BLKIF_RSP_ERROR);
			xbdi->cont = xbdback_co_main_incr;
			break;
		}
	} else {
		xbdi->cont = xbdback_co_main_done;
	}
	return xbdi;
}

static void *
xbdback_co_main_incr(struct xbdback_instance *xbdi, void *obj)
{
	(void)obj;
	xbdi->req_cons++;
	xbdi->cont = xbdback_co_main_loop;
	return xbdi;
}

static void *
xbdback_co_main_done(struct xbdback_instance *xbdi, void *obj)
{
	(void)obj;
	if (xbdi->io != NULL) {
		xbdi->cont = xbdback_co_flush;
		xbdi->cont_aux = xbdback_co_main_done2;
	} else {
		xbdi->cont = xbdback_co_main_done2;
	}
	return xbdi;
}

static void *
xbdback_co_main_done2(struct xbdback_instance *xbdi, void *obj)
{
	if (xbdi->req_prod == xbdi->blk_ring->req_prod) {
		xbdi->cont = NULL;
	} else {
		xbdi->cont = xbdback_co_main;
	}
	return xbdi;
}

static void *
xbdback_co_io(struct xbdback_instance *xbdi, void *obj)
{
	int error;

	(void)obj;
	if (xbdi->xen_req->nr_segments < 1 ||
	    xbdi->xen_req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST) {
		printf("xbdback_io domain %d: %d segments\n",
		    xbdi->domid, xbdi->xen_req->nr_segments);
		error = EINVAL;
		goto end;
	}
	xbdi->req_vbd = vbd_lookup(xbdi, xbdi->xen_req->device);
	if (xbdi->req_vbd == NULL) {
		printf("xbdback_io domain %d: unknown vbd %d\n",
		    xbdi->domid, xbdi->xen_req->device);
		error = EINVAL;
		goto end;
	}
	if (xbdi->xen_req->operation == BLKIF_OP_WRITE) {
		if (xbdi->req_vbd->flags & VBD_F_RO) {
			error = EROFS;
			goto end;
		}
	}

	xbdi->segno = 0;

	xbdi->cont = xbdback_co_io_gotreq;
	return xbdback_pool_get(&xbdback_request_pool, xbdi);
end:
	xbdback_send_reply(xbdi, xbdi->xen_req->id,
	    xbdi->xen_req->operation, error);
	xbdi->cont = xbdback_co_main_incr;
	return xbdi;
}

static void *
xbdback_co_io_gotreq(struct xbdback_instance *xbdi, void *obj)
{
	struct xbdback_request *xrq;

	xrq = xbdi->req = obj;

	xrq->rq_xbdi = xbdi;
	xrq->rq_iocount = 0;
	xrq->rq_ioerrs = 0;
	xrq->rq_id = xbdi->xen_req->id;
	xrq->rq_operation = xbdi->xen_req->operation;

	/*
	 * Request-level reasons not to coalesce: different device,
	 * different op, or noncontiguous disk sectors (vs. previous
	 * request handed to us).
	 */
	xbdi->cont = xbdback_co_io_loop;
	if (xbdi->io != NULL) {
		struct xbdback_request *last_req;

		last_req = SLIST_FIRST(&xbdi->io->xio_rq)->car;
		XENPRINTF(("xbdback_io domain %d: hoping for sector %ld;"
		    " got %ld\n", xbdi->domid, (long)xbdi->next_sector,
		    (long)xbdi->xen_req->sector_number));
		if (xrq->rq_operation != last_req->rq_operation ||
		    xbdi->xen_req->sector_number != xbdi->next_sector) {
			XENPRINTF(("xbdback_io domain %d: segment break\n",
			    xbdi->domid));
			xbdi->next_sector = xbdi->xen_req->sector_number;
			xbdi->cont_aux = xbdi->cont;
			xbdi->cont = xbdback_co_flush;
		}
	} else {
		xbdi->next_sector = xbdi->xen_req->sector_number;
	}
	return xbdi;
}

static void *
xbdback_co_io_loop(struct xbdback_instance *xbdi, void *obj)
{
	(void)obj;
	if (xbdi->segno < xbdi->xen_req->nr_segments) {
		unsigned long this_fas, last_fas;
		/*
		 * Segment-level reason to coalesce: handling full
		 * pages, or adjacent sector ranges from the same page
		 * (and yes, this latter does happen).  But not if the
		 * array of client pseudo-physical pages is full.
		 */
		this_fas = xbdi->xen_req->frame_and_sects[xbdi->segno];
		XENPRINTF(("xbdback_io domain %d: frame_and_sects[%d]=0%lo\n",
		    xbdi->domid, xbdi->segno, this_fas));
		last_fas = xbdi->last_fas = xbdi->this_fas;
		xbdi->this_fas = this_fas;
		if (xbdi->io != NULL) {
			if (blkif_last_sect(last_fas) == VBD_MAXSECT &&
			    blkif_first_sect(this_fas) == 0 &&
			    xbdi->io->xio_nrma <
			    XENSHM_MAX_PAGES_PER_REQUEST) {
				xbdi->same_page = 0;
			} else if (blkif_last_sect(last_fas) + 1 ==
			    blkif_first_sect(this_fas) &&
			    (last_fas & ~PAGE_MASK) ==
			    (this_fas & ~PAGE_MASK)) {
#ifdef DEBUG
				static struct timeval gluetimer;
				if (ratecheck(&gluetimer,
				    &xbdback_fragio_intvl))
					printf("xbdback: domain %d sending"
					    " excessively fragmented I/O\n",
					    xbdi->domid);
#endif
				XENPRINTF(("xbdback_io domain %d: glue same "
				    "page", xbdi->domid));
				xbdi->same_page = 1;
			} else {
				xbdi->cont_aux = xbdback_co_io_loop;
				xbdi->cont = xbdback_co_flush;
				return xbdi;
			}
		} else
			xbdi->same_page = 0;

		if (xbdi->io == NULL) {
			/*
			 * buf_init() now happens in xbdback_co_io_gotio(),
			 * so that an allocation that blocks here (NULL
			 * return stops the trampoline) is still
			 * initialized when it resumes via
			 * xbdback_pool_put().
			 */
			xbdi->cont = xbdback_co_io_gotio;
			return xbdback_pool_get(&xbdback_io_pool, xbdi);
		} else {
			xbdi->cont = xbdback_co_io_gotio2;
		}
	} else {
		/* done with the loop over segments; get next request */
		xbdi->cont = xbdback_co_main_incr;
	}
	return xbdi;
}

static void *
xbdback_co_io_gotio(struct xbdback_instance *xbdi, void *obj)
{
	struct xbdback_io *xbd_io;
	vaddr_t start_offset; /* start offset in vm area */
	int buf_flags;

	xbdi_get(xbdi);

	xbd_io = xbdi->io = obj;
	buf_init(&xbd_io->xio_buf);
	xbd_io->xio_xbdi = xbdi;
	SLIST_INIT(&xbd_io->xio_rq);
	xbd_io->xio_nrma = 0;
	xbd_io->xio_mapped = 0;

	start_offset = blkif_first_sect(xbdi->this_fas) * VBD_BSIZE;

	if (xbdi->xen_req->operation == BLKIF_OP_WRITE) {
		buf_flags = B_WRITE;
	} else {
		buf_flags = B_READ;
	}

	xbd_io->xio_buf.b_flags = buf_flags;
	xbd_io->xio_buf.b_cflags = 0;
	xbd_io->xio_buf.b_oflags = 0;
	xbd_io->xio_buf.b_iodone = xbdback_iodone;
	xbd_io->xio_buf.b_proc = NULL;
	xbd_io->xio_buf.b_vp = xbdi->req_vbd->vp;
	xbd_io->xio_buf.b_objlock = &xbdi->req_vbd->vp->v_interlock;
	xbd_io->xio_buf.b_dev = xbdi->req_vbd->dev;
	xbd_io->xio_buf.b_blkno = xbdi->next_sector;
	xbd_io->xio_buf.b_bcount = 0;
	xbd_io->xio_buf.b_data = (void *)start_offset;
	xbd_io->xio_buf.b_private = xbd_io;

	xbdi->cont = xbdback_co_io_gotio2;
	return xbdi;
}
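/*
 * Worked example (not part of the driver) of the coalescing test in
 * xbdback_co_io_loop() above.  Each frame_and_sects entry packs a
 * machine page address with the first/last 512-byte sector numbers
 * used within that page.  With 4KB pages, VBD_MAXSECT is 7, so:
 *
 *	previous segment: sectors 0..7 of page P   (last_sect == 7)
 *	next segment:     sectors 0..x of page Q   (first_sect == 0)
 *		=> whole-page append; a new entry goes in xio_ma[]
 *
 *	previous segment: sectors 0..3 of page P
 *	next segment:     sectors 4..7 of page P
 *		=> same-page glue (same_page = 1); no new xio_ma[] entry
 *
 * Anything else breaks the run: the current xbdback_io is flushed and
 * a fresh one is started.
 */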
static void *
xbdback_co_io_gotio2(struct xbdback_instance *xbdi, void *obj)
{
	(void)obj;
	if (xbdi->segno == 0 || SLIST_EMPTY(&xbdi->io->xio_rq)) {
		/* if this is the first segment of a new request */
		/* or if it's the first segment of the io */
		xbdi->cont = xbdback_co_io_gotfrag;
		return xbdback_pool_get(&xbdback_fragment_pool, xbdi);
	}
	xbdi->cont = xbdback_co_io_gotfrag2;
	return xbdi;
}

static void *
xbdback_co_io_gotfrag(struct xbdback_instance *xbdi, void *obj)
{
	struct xbdback_fragment *xbd_fr;

	xbd_fr = obj;
	xbd_fr->car = xbdi->req;
	SLIST_INSERT_HEAD(&xbdi->io->xio_rq, xbd_fr, cdr);
	++xbdi->req->rq_iocount;

	xbdi->cont = xbdback_co_io_gotfrag2;
	return xbdi;
}

static void *
xbdback_co_io_gotfrag2(struct xbdback_instance *xbdi, void *obj)
{
	struct xbdback_io *xbd_io;
	int seg_size;
	unsigned long this_fas;

	this_fas = xbdi->this_fas;
	xbd_io = xbdi->io;

	seg_size = blkif_last_sect(this_fas) -
	    blkif_first_sect(this_fas) + 1;
	if (seg_size < 0) {
		printf("xbdback_io domain %d: negative-size request\n",
		    xbdi->domid);
		xbdback_io_error(xbdi->io, EINVAL);
		xbdi->io = NULL;
		xbdi->cont = xbdback_co_main_incr;
		return xbdi;
	}

	if (!xbdi->same_page) {
		XENPRINTF(("xbdback_io domain %d: appending page 0%lo\n",
		    xbdi->domid, (this_fas & ~PAGE_MASK)));
		xbd_io->xio_ma[xbd_io->xio_nrma++] = (this_fas & ~PAGE_MASK);
	}

	xbd_io->xio_buf.b_bcount += (daddr_t)(seg_size * VBD_BSIZE);
	XENPRINTF(("xbdback_io domain %d: start sect %d size %d\n",
	    xbdi->domid, (int)xbdi->next_sector, seg_size));

	/* Finally, the end of the segment loop! */
	xbdi->next_sector += seg_size;
	++xbdi->segno;
	xbdi->cont = xbdback_co_io_loop;
	return xbdi;
}

static void *
xbdback_co_flush(struct xbdback_instance *xbdi, void *obj)
{
	(void)obj;
	XENPRINTF(("xbdback_io domain %d: flush sect %ld size %d ptr 0x%lx\n",
	    xbdi->domid, (long)xbdi->io->xio_buf.b_blkno,
	    (int)xbdi->io->xio_buf.b_bcount, (long)xbdi->io));
	xbdi->cont = xbdback_co_flush_done;
	return xbdback_map_shm(xbdi->io);
}

static void *
xbdback_co_flush_done(struct xbdback_instance *xbdi, void *obj)
{
	(void)obj;
	workqueue_enqueue(xbdback_workqueue, &xbdi->io->xio_work, NULL);
	xbdi->io = NULL;
	xbdi->cont = xbdi->cont_aux;
	return xbdi;
}

static void
xbdback_io_error(struct xbdback_io *xbd_io, int error)
{
	xbd_io->xio_buf.b_error = error;
	xbdback_iodone(&xbd_io->xio_buf);
}

static void
xbdback_do_io(struct work *wk, void *dummy)
{
	struct xbdback_io *xbd_io = (void *)wk;
	KASSERT(&xbd_io->xio_work == wk);

	xbd_io->xio_buf.b_data = (void *)
	    ((vaddr_t)xbd_io->xio_buf.b_data + xbd_io->xio_vaddr);
#ifdef DIAGNOSTIC
	{
	vaddr_t bdata = (vaddr_t)xbd_io->xio_buf.b_data;
	int nsegs =
	    ((((bdata + xbd_io->xio_buf.b_bcount - 1) & ~PAGE_MASK) -
	    (bdata & ~PAGE_MASK)) >> PAGE_SHIFT) + 1;
	if ((bdata & ~PAGE_MASK) != (xbd_io->xio_vaddr & ~PAGE_MASK)) {
		printf("xbdback_do_io vaddr 0x%lx bdata 0x%lx\n",
		    xbd_io->xio_vaddr, bdata);
		panic("xbdback_do_io: bdata page change");
	}
	if (nsegs > xbd_io->xio_nrma) {
		printf("xbdback_do_io vaddr 0x%lx bcount 0x%x doesn't fit in"
		    " %d pages\n", bdata, xbd_io->xio_buf.b_bcount,
		    xbd_io->xio_nrma);
		panic("xbdback_do_io: not enough pages");
	}
	}
#endif
	if ((xbd_io->xio_buf.b_flags & B_READ) == 0) {
		mutex_enter(&xbd_io->xio_buf.b_vp->v_interlock);
		xbd_io->xio_buf.b_vp->v_numoutput++;
		mutex_exit(&xbd_io->xio_buf.b_vp->v_interlock);
	}
	bdev_strategy(&xbd_io->xio_buf);
}
/*
 * This gets reused by xbdback_io_error to report errors from other
 * sources.
 */
static void
xbdback_iodone(struct buf *bp)
{
	struct xbdback_io *xbd_io;
	struct xbdback_instance *xbdi;
	int errp;

	xbd_io = bp->b_private;
	xbdi = xbd_io->xio_xbdi;

	XENPRINTF(("xbdback_io domain %d: iodone ptr 0x%lx\n",
	    xbdi->domid, (long)xbd_io));

	if (xbd_io->xio_mapped)
		xbdback_unmap_shm(xbd_io);
	if (bp->b_error != 0) {
		printf("xbd IO domain %d: error %d\n",
		    xbdi->domid, bp->b_error);
		errp = 1;
	} else
		errp = 0;

	/* for each constituent xbd request */
	while (!SLIST_EMPTY(&xbd_io->xio_rq)) {
		struct xbdback_fragment *xbd_fr;
		struct xbdback_request *xbd_req;
		struct xbdback_instance *rxbdi;
		int error;

		xbd_fr = SLIST_FIRST(&xbd_io->xio_rq);
		xbd_req = xbd_fr->car;
		SLIST_REMOVE_HEAD(&xbd_io->xio_rq, cdr);
		xbdback_pool_put(&xbdback_fragment_pool, xbd_fr);

		if (errp)
			++xbd_req->rq_ioerrs;

		/* finalize it only if this was its last I/O */
		if (--xbd_req->rq_iocount > 0)
			continue;

		rxbdi = xbd_req->rq_xbdi;
		KASSERT(xbdi == rxbdi);

		error = xbd_req->rq_ioerrs > 0 ?
		    BLKIF_RSP_ERROR : BLKIF_RSP_OKAY;
		XENPRINTF(("xbdback_io domain %d: end request %lu "
		    "error=%d\n", xbdi->domid, xbd_req->rq_id, error));
		xbdback_send_reply(xbdi, xbd_req->rq_id,
		    xbd_req->rq_operation, error);
		xbdback_pool_put(&xbdback_request_pool, xbd_req);
	}
	xbdi_put(xbdi);
	buf_destroy(&xbd_io->xio_buf);
	xbdback_pool_put(&xbdback_io_pool, xbd_io);
}

static void *
xbdback_co_probe(struct xbdback_instance *xbdi, void *obj)
{
	(void)obj;
	/*
	 * There should be only one page in the request.  Map it and store
	 * the reply.
	 */
	if (xbdi->xen_req->nr_segments != 1) {
		printf("xbdback_probe: %d segments\n",
		    xbdi->xen_req->nr_segments);
		xbdback_send_reply(xbdi, xbdi->xen_req->id,
		    xbdi->xen_req->operation, EINVAL);
		xbdi->cont = xbdback_co_main_incr;
		return xbdi;
	}
	/* buf_init() is done in xbdback_co_probe_gotio(); see below */
	xbdi->cont = xbdback_co_probe_gotio;
	return xbdback_pool_get(&xbdback_io_pool, xbdi);
}

static void *
xbdback_co_probe_gotio(struct xbdback_instance *xbdi, void *obj)
{
	struct xbdback_io *xbd_io;

	xbd_io = xbdi->io = obj;
	/*
	 * buf_init() happens here rather than at allocation time, so
	 * that an allocation that blocked in xbdback_pool_get() is
	 * also initialized when it resumes through the trampoline.
	 */
	buf_init(&xbd_io->xio_buf);
	xbd_io->xio_xbdi = xbdi;
	xbd_io->xio_nrma = 1;
	xbd_io->xio_ma[0] = (xbdi->xen_req->frame_and_sects[0] & ~PAGE_MASK);
	xbd_io->xio_mapped = 0;

	xbdi->cont = xbdback_co_probe_gotvm;
	xbdi->cont_aux = xbdback_co_main_incr;
	return xbdback_map_shm(xbdi->io);
}

static void *
xbdback_co_probe_gotvm(struct xbdback_instance *xbdi, void *obj)
{
	struct xbd_vbd *vbd;
	blkif_request_t *req;
	vdisk_t *vdisk_reply;
	int i;

	vdisk_reply = (void *)xbdi->io->xio_vaddr;
	req = xbdi->xen_req;
	i = 0;
	SLIST_FOREACH(vbd, &xbdi->vbds, next) {
		if (i >= PAGE_SIZE / sizeof(vdisk_t)) {
			printf("xbdback_probe domain %d: too many VBDs\n",
			    xbdi->domid);
			break;
		}
		XENPRINTF(("xbdback_probe: reply %d\n", i));
		vdisk_reply[i].capacity = vbd->size;
		vdisk_reply[i].device = vbd->vdev;
		vdisk_reply[i].info = VDISK_TYPE_DISK | VDISK_FLAG_VIRT;
		if (vbd->flags & VBD_F_RO)
			vdisk_reply[i].info |= VDISK_FLAG_RO;
		i++;
	}
	xbdback_unmap_shm(xbdi->io);

	XENPRINTF(("xbdback_probe: nreplies=%d\n", i));
	xbdback_send_reply(xbdi, req->id, req->operation, i);
	buf_destroy(&xbdi->io->xio_buf);
	xbdback_pool_put(&xbdback_io_pool, xbdi->io);
	xbdi->io = NULL;
	xbdi->cont = xbdback_co_main_incr;
	return xbdi;
}
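/*
 * Illustrative sketch only (not part of the driver): the probe reply
 * written by xbdback_co_probe_gotvm() above is simply an array of
 * vdisk_t records placed at the start of the guest-supplied page, and
 * the operation's status field carries the entry count:
 *
 *	vdisk_reply[0] = { .capacity = vbd->size,
 *	                   .device   = vbd->vdev,
 *	                   .info     = VDISK_TYPE_DISK | VDISK_FLAG_VIRT };
 *	...
 *	xbdback_send_reply(xbdi, req->id, req->operation, nreplies);
 */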
/*
 * Called once a request has completed: place the reply in the ring
 * and notify the guest OS.
 */
static void
xbdback_send_reply(struct xbdback_instance *xbdi, int id, int op, int status)
{
	blkif_response_t *resp;

	resp = &xbdi->blk_ring->ring[MASK_BLKIF_IDX(xbdi->resp_prod)].resp;
	resp->id = id;
	resp->operation = op;
	resp->status = status;
	xbdi->resp_prod++;
	x86_lfence(); /* ensure the guest sees all our replies */
	xbdi->blk_ring->resp_prod = xbdi->resp_prod;
	hypervisor_notify_via_evtchn(xbdi->evtchn);
}

/*
 * Map a request into our virtual address space.  The xio_ma array is
 * to be filled out by the caller.
 */
static void *
xbdback_map_shm(struct xbdback_io *xbd_io)
{
	struct xbdback_instance *xbdi;
	int error, s;

	KASSERT(xbd_io->xio_mapped == 0);

	xbdi = xbd_io->xio_xbdi;
	error = xen_shm_map(xbd_io->xio_ma, xbd_io->xio_nrma,
	    xbd_io->xio_xbdi->domid, &xbd_io->xio_vaddr, 0);

	switch (error) {
	case 0:
		xbd_io->xio_mapped = 1;
		return (void *)xbd_io->xio_vaddr;
	case ENOMEM:
		s = splvm();
		if (!xbdback_shmcb) {
			if (xen_shm_callback(xbdback_shm_callback, xbdi)
			    != 0) {
				splx(s);
				panic("xbdback_map_shm: "
				    "xen_shm_callback failed");
			}
			xbdback_shmcb = 1;
		}
		SIMPLEQ_INSERT_TAIL(&xbdback_shmq, xbdi, on_hold);
		splx(s);
		return NULL;
	default:
		printf("xbdback_map_shm: xen_shm error %d\n", error);
		xbdback_io_error(xbdi->io, error);
		xbdi->io = NULL;
		xbdi->cont = xbdi->cont_aux;
		return xbdi;
	}
}

static int
xbdback_shm_callback(void *arg)
{
	int error, s;

	s = splvm();
	while (!SIMPLEQ_EMPTY(&xbdback_shmq)) {
		struct xbdback_instance *xbdi;
		struct xbdback_io *xbd_io;

		xbdi = SIMPLEQ_FIRST(&xbdback_shmq);
		xbd_io = xbdi->io;
		KASSERT(xbd_io->xio_mapped == 0);

		switch ((error = xen_shm_map(xbd_io->xio_ma,
		    xbd_io->xio_nrma, xbdi->domid,
		    &xbd_io->xio_vaddr, XSHM_CALLBACK))) {
		case ENOMEM:
			splx(s);
			return -1; /* will try again later */
		case 0:
			xbd_io->xio_mapped = 1;
			SIMPLEQ_REMOVE_HEAD(&xbdback_shmq, on_hold);
			splx(s);
			xbdback_trampoline(xbdi, xbdi);
			s = splvm();
			break;
		default:
			SIMPLEQ_REMOVE_HEAD(&xbdback_shmq, on_hold);
			splx(s);
			printf("xbdback_shm_callback: xen_shm error %d\n",
			    error);
			xbdi->cont = xbdi->cont_aux;
			xbdback_io_error(xbd_io, error);
			xbdback_trampoline(xbdi, xbdi);
			s = splvm();
			break;
		}
	}
	xbdback_shmcb = 0;
	splx(s);
	return 0;
}

/* unmap a request from our virtual address space (request is done) */
static void
xbdback_unmap_shm(struct xbdback_io *xbd_io)
{
	KASSERT(xbd_io->xio_mapped == 1);
	xbd_io->xio_mapped = 0;
	xen_shm_unmap(xbd_io->xio_vaddr, xbd_io->xio_ma,
	    xbd_io->xio_nrma, xbd_io->xio_xbdi->domid);
	xbd_io->xio_vaddr = -1;
}

/* Obtain memory from a pool, in cooperation with the continuations. */
static void *
xbdback_pool_get(struct xbdback_pool *pp, struct xbdback_instance *xbdi)
{
	int s;
	void *item;

	item = pool_get(&pp->p, PR_NOWAIT);
	if (item == NULL) {
		if (ratecheck(&pp->last_warning, &xbdback_poolsleep_intvl))
			printf("xbdback_pool_get: %s is full\n",
			    pp->p.pr_wchan);
		s = splvm();
		SIMPLEQ_INSERT_TAIL(&pp->q, xbdi, on_hold);
		splx(s);
	}
	return item;
}
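/*
 * Illustrative flow only (not part of the driver): how a blocked
 * allocation resumes.  xbdback_pool_get() queues the instance and
 * returns NULL, which stops xbdback_trampoline(); a later
 * xbdback_pool_put() dequeues the instance and restarts the
 * trampoline with the freed item as the continuation's argument:
 *
 *	item = xbdback_pool_get(&pool, xbdi);	// NULL: xbdi now on pool.q
 *	...
 *	xbdback_pool_put(&pool, item2);		// => xbdback_trampoline(
 *						//        xbdi, item2)
 */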
/*
 * Restore memory to a pool... unless an xbdback instance had been
 * waiting for it, in which case that instance gets the memory first.
 */
static void
xbdback_pool_put(struct xbdback_pool *pp, void *item)
{
	int s;

	s = splvm();
	if (SIMPLEQ_EMPTY(&pp->q)) {
		splx(s);
		pool_put(&pp->p, item);
	} else {
		struct xbdback_instance *xbdi = SIMPLEQ_FIRST(&pp->q);
		SIMPLEQ_REMOVE_HEAD(&pp->q, on_hold);
		splx(s);
		xbdback_trampoline(xbdi, item);
	}
}

static void
xbdback_trampoline(struct xbdback_instance *xbdi, void *obj)
{
	xbdback_cont_t cont;

	while (obj != NULL && xbdi->cont != NULL) {
		cont = xbdi->cont;
#ifdef DIAGNOSTIC
		xbdi->cont = (xbdback_cont_t)0xDEADBEEF;
#endif
		obj = (*cont)(xbdi, obj);
#ifdef DIAGNOSTIC
		if (xbdi->cont == (xbdback_cont_t)0xDEADBEEF) {
			printf("xbdback_trampoline: 0x%lx didn't set "
			    "xbdi->cont!\n", (long)cont);
			panic("xbdback_trampoline: bad continuation");
		}
#endif
	}
}