NetBSD-5.0.2/sys/arch/xen/xen/xbdback_xenbus.c

/*      $NetBSD: xbdback_xenbus.c,v 1.20 2008/10/24 18:02:58 jym Exp $      */

/*
 * Copyright (c) 2006 Manuel Bouyer.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by Manuel Bouyer.
 * 4. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: xbdback_xenbus.c,v 1.20 2008/10/24 18:02:58 jym Exp $");

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/queue.h>
#include <sys/kernel.h>
#include <sys/conf.h>
#include <sys/disk.h>
#include <sys/disklabel.h>
#include <sys/fcntl.h>
#include <sys/vnode.h>
#include <sys/kauth.h>
#include <sys/workqueue.h>

#include <xen/xen.h>
#include <xen/xen_shm.h>
#include <xen/evtchn.h>
#include <xen/xenbus.h>
#include <xen/xen3-public/io/protocols.h>

/* #define XENDEBUG_VBD */
#ifdef XENDEBUG_VBD
#define XENPRINTF(x) printf x
#else
#define XENPRINTF(x)
#endif

#define BLKIF_RING_SIZE __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE)

/*
 * Backend block device driver for Xen
 */

/* Max number of pages per request. The request may not be page aligned */
#define BLKIF_MAX_PAGES_PER_REQUEST (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1)

/* Values are expressed in 512-byte sectors */
#define VBD_BSIZE 512
#define VBD_MAXSECT ((PAGE_SIZE / VBD_BSIZE) - 1)

struct xbdback_request;
struct xbdback_io;
struct xbdback_fragment;
struct xbdback_instance;

/* state of a xbdback instance */
typedef enum {CONNECTED, DISCONNECTING, DISCONNECTED} xbdback_state_t;

/*
 * Since a variety of conditions can block our I/O processing, and that
 * processing is not allowed to suspend its thread's execution, blocking
 * points are handled in a sort of continuation-passing style.
 *
 * A NULL return value indicates that execution has blocked; if the
 * chain is finished, set xbdi->xbdi_cont (see below) to NULL and the
 * return value doesn't matter.  Otherwise the return value is passed
 * as the second parameter to the new value of xbdi->xbdi_cont.
 */
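/*
 * Illustrative sketch only (the continuation name below is generic,
 * not a function in this file): a continuation that needs an item from
 * a pool is typically written as
 *
 *	xbdi->xbdi_cont = xbdback_got_item;
 *	return xbdback_pool_get(&some_pool, xbdi);
 *
 * If the pool is empty, xbdback_pool_get() returns NULL and the
 * trampoline stops; the chain resumes with the item as 'obj' once
 * xbdback_pool_put() hands one back.
 */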
typedef void *(* xbdback_cont_t)(struct xbdback_instance *, void *);

enum xbdi_proto {
	XBDIP_NATIVE,
	XBDIP_32,
	XBDIP_64
};


/* we keep the xbdback instances in a linked list */
struct xbdback_instance {
	SLIST_ENTRY(xbdback_instance) next;
	struct xenbus_device *xbdi_xbusd; /* our xenstore entry */
	struct xenbus_watch xbdi_watch; /* to watch our store */
	domid_t xbdi_domid;		/* attached to this domain */
	uint32_t xbdi_handle;	/* domain-specific handle */
	xbdback_state_t xbdi_status;
	/* backing device parameters */
	dev_t xbdi_dev;
	const struct bdevsw *xbdi_bdevsw; /* pointer to the device's bdevsw */
	struct vnode *xbdi_vp;
	uint64_t xbdi_size;
	int xbdi_ro; /* is device read-only ? */
	/* parameters for the communication */
	unsigned int xbdi_evtchn;
	/* private parameters for communication */
	blkif_back_ring_proto_t xbdi_ring;
	enum xbdi_proto xbdi_proto;
	grant_handle_t xbdi_ring_handle; /* to unmap the ring */
	vaddr_t xbdi_ring_va; /* to unmap the ring */
	/* disconnection must be postponed until all I/O is done */
	volatile unsigned xbdi_refcnt;
	/* 
	 * State for I/O processing/coalescing follows; this has to
	 * live here instead of on the stack because of the
	 * continuation-ness (see above).
	 */
	RING_IDX xbdi_req_prod; /* limit on request indices */
	xbdback_cont_t xbdi_cont, xbdi_cont_aux;
	SIMPLEQ_ENTRY(xbdback_instance) xbdi_on_hold; /* waiting on resources */
	/* _request state */
	struct xbdback_request *xbdi_req; /* if NULL, ignore following */
	blkif_request_t xbdi_xen_req;
	int xbdi_segno;
	/* _io state */
	struct xbdback_io *xbdi_io; /* if NULL, ignore next field */
	daddr_t xbdi_next_sector;
	uint8_t xbdi_last_fs, xbdi_this_fs; /* first sectors */
	uint8_t xbdi_last_ls, xbdi_this_ls; /* last sectors */
	grant_ref_t xbdi_thisgrt, xbdi_lastgrt; /* grants */
	/* other state */
	int xbdi_same_page; /* are we merging two segments on the same page? */
};
/* Manipulation of the above reference count. */
/* XXXjld@panix.com: not MP-safe, and move the i386 asm elsewhere. */
#define xbdi_get(xbdip) (++(xbdip)->xbdi_refcnt)
#define xbdi_put(xbdip)                                      \
do {                                                         \
	__asm volatile("decl %0"                           \
	    : "=m"((xbdip)->xbdi_refcnt) : "m"((xbdip)->xbdi_refcnt)); \
	if (0 == (xbdip)->xbdi_refcnt)                            \
               xbdback_finish_disconnect(xbdip);             \
} while (/* CONSTCOND */ 0)

SLIST_HEAD(, xbdback_instance) xbdback_instances;

/*
 * For each request from a guest, a xbdback_request is allocated from
 * a pool.  This will describe the request until completion.  The
 * request may require multiple IO operations to perform, so the
 * per-IO information is not stored here.
 */
struct xbdback_request {
	struct xbdback_instance *rq_xbdi; /* our xbd instance */
	uint64_t rq_id;
	int rq_iocount; /* reference count; or, number of outstanding I/O's */
	int rq_ioerrs;
	uint8_t rq_operation;
};

/*
 * For each I/O operation associated with one of those requests, an
 * xbdback_io is allocated from a pool.  It may correspond to multiple
 * Xen disk requests, or parts of them, if several arrive at once that
 * can be coalesced.
 */
struct xbdback_io {
	struct work xio_work;
	struct buf xio_buf; /* our I/O */
	/* The instance pointer is duplicated for convenience. */
	struct xbdback_instance *xio_xbdi; /* our xbd instance */
	SLIST_HEAD(, xbdback_fragment) xio_rq; /* xbd requests involved */
	vaddr_t xio_vaddr; /* the virtual address to map the request at */
	grant_ref_t xio_gref[XENSHM_MAX_PAGES_PER_REQUEST]; /* grants to map */
	grant_handle_t xio_gh[XENSHM_MAX_PAGES_PER_REQUEST]; /* handles to unmap */
	uint16_t xio_nrma; /* number of guest pages */
	uint16_t xio_mapped;
};

/*
 * Rather than have the xbdback_io keep an array of the
 * xbdback_requests involved, since the actual number will probably be
 * small but might be as large as BLKIF_RING_SIZE, use a list.  This
 * would be threaded through xbdback_request, but one of them might be
 * part of multiple I/O's, alas.
 */
struct xbdback_fragment {
	struct xbdback_request *car;
	SLIST_ENTRY(xbdback_fragment) cdr;
};

/*
 * Wrap our pools with a chain of xbdback_instances whose I/O
 * processing has blocked for want of memory from that pool.
 */
struct xbdback_pool {
	struct pool p;
	SIMPLEQ_HEAD(xbdback_iqueue, xbdback_instance) q;
	struct timeval last_warning;
} xbdback_request_pool, xbdback_io_pool, xbdback_fragment_pool;
static struct xbdback_iqueue xbdback_shmq;
static int xbdback_shmcb; /* have we already registered a callback? */

struct timeval xbdback_poolsleep_intvl = { 5, 0 };
#ifdef DEBUG
struct timeval xbdback_fragio_intvl = { 60, 0 };
#endif
       void xbdbackattach(int);
static int  xbdback_xenbus_create(struct xenbus_device *);
static int  xbdback_xenbus_destroy(void *);
static void xbdback_frontend_changed(void *, XenbusState);
static void xbdback_backend_changed(struct xenbus_watch *,
    const char **, unsigned int);
static int  xbdback_evthandler(void *);
static void xbdback_finish_disconnect(struct xbdback_instance *);

static struct xbdback_instance *xbdif_lookup(domid_t, uint32_t);

static void *xbdback_co_main(struct xbdback_instance *, void *);
static void *xbdback_co_main_loop(struct xbdback_instance *, void *);
static void *xbdback_co_main_incr(struct xbdback_instance *, void *);
static void *xbdback_co_main_done(struct xbdback_instance *, void *);
static void *xbdback_co_main_done2(struct xbdback_instance *, void *);

static void *xbdback_co_io(struct xbdback_instance *, void *);
static void *xbdback_co_io_gotreq(struct xbdback_instance *, void *);
static void *xbdback_co_io_loop(struct xbdback_instance *, void *);
static void *xbdback_co_io_gotio(struct xbdback_instance *, void *);
static void *xbdback_co_io_gotio2(struct xbdback_instance *, void *);
static void *xbdback_co_io_gotfrag(struct xbdback_instance *, void *);
static void *xbdback_co_io_gotfrag2(struct xbdback_instance *, void *);

static void *xbdback_co_flush(struct xbdback_instance *, void *);
static void *xbdback_co_flush_done(struct xbdback_instance *, void *);

static int  xbdback_shm_callback(void *);
static void xbdback_io_error(struct xbdback_io *, int);
static void xbdback_do_io(struct work *, void *);
static void xbdback_iodone(struct buf *);
static void xbdback_send_reply(struct xbdback_instance *, uint64_t, int, int);

static void *xbdback_map_shm(struct xbdback_io *);
static void xbdback_unmap_shm(struct xbdback_io *);

static void *xbdback_pool_get(struct xbdback_pool *,
			      struct xbdback_instance *);
static void xbdback_pool_put(struct xbdback_pool *, void *);
static void xbdback_trampoline(struct xbdback_instance *, void *);

static struct xenbus_backend_driver xbd_backend_driver = {
	.xbakd_create = xbdback_xenbus_create,
	.xbakd_type = "vbd"
};

struct workqueue *xbdback_workqueue;

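/*
 * Attach-time initialization: set up the instance list, the queue of
 * instances waiting for shared-memory space, the request/io/fragment
 * pools and the I/O workqueue, then register the "vbd" backend driver
 * with xenbus.
 */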
void
xbdbackattach(int n)
{
	XENPRINTF(("xbdbackattach\n"));

	/*
	 * initialize the backend driver, register the control message handler
	 * and send driver up message.
	 */
	SLIST_INIT(&xbdback_instances);
	SIMPLEQ_INIT(&xbdback_shmq);
	xbdback_shmcb = 0;
	pool_init(&xbdback_request_pool.p, sizeof(struct xbdback_request),
	    0, 0, 0, "xbbrp", NULL, IPL_BIO);
	SIMPLEQ_INIT(&xbdback_request_pool.q);
	pool_init(&xbdback_io_pool.p, sizeof(struct xbdback_io),
	    0, 0, 0, "xbbip", NULL, IPL_BIO);
	SIMPLEQ_INIT(&xbdback_io_pool.q);
	pool_init(&xbdback_fragment_pool.p, sizeof(struct xbdback_fragment),
	    0, 0, 0, "xbbfp", NULL, IPL_BIO);
	SIMPLEQ_INIT(&xbdback_fragment_pool.q);
	/* we allocate enough to handle a whole ring at once */
	if (pool_prime(&xbdback_request_pool.p, BLKIF_RING_SIZE) != 0)
		printf("xbdback: failed to prime request pool\n");
	if (pool_prime(&xbdback_io_pool.p, BLKIF_RING_SIZE) != 0)
		printf("xbdback: failed to prime io pool\n");
	if (pool_prime(&xbdback_fragment_pool.p,
            BLKIF_MAX_SEGMENTS_PER_REQUEST * BLKIF_RING_SIZE) != 0)
		printf("xbdback: failed to prime fragment pool\n");

	if (workqueue_create(&xbdback_workqueue, "xbdbackd",
	    xbdback_do_io, NULL, PRI_BIO, IPL_BIO, 0))
		printf("xbdback: failed to init workqueue\n");
	xenbus_backend_register(&xbd_backend_driver);
}

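/*
 * A new "vbd" backend appeared in the store: read the frontend domain
 * id, parse the handle from the last component of the xenstore path,
 * allocate and register a new instance, and watch its physical-device
 * node.
 */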
static int
xbdback_xenbus_create(struct xenbus_device *xbusd)
{
	struct xbdback_instance *xbdi;
	long domid, handle;
	int error, i;
	char *ep;

	if ((error = xenbus_read_ul(NULL, xbusd->xbusd_path,
	    "frontend-id", &domid, 10)) != 0) {
		aprint_error("xbdback: can't read %s/frontend-id: %d\n",
		    xbusd->xbusd_path, error);
		return error;
	}

	/*
	 * get handle: this is the last component of the path; which is
	 * a decimal number. $path/dev contains the device name, which is not
	 * appropriate.
	 */
	for (i = strlen(xbusd->xbusd_path); i > 0; i--) {
		if (xbusd->xbusd_path[i] == '/')
			break;
	}
	if (i == 0) {
		aprint_error("xbdback: can't parse %s\n",
		    xbusd->xbusd_path);
		return EFTYPE;
	}
	handle = strtoul(&xbusd->xbusd_path[i+1], &ep, 10);
	if (*ep != '\0') {
		aprint_error("xbdback: can't parse %s\n",
		    xbusd->xbusd_path);
		return EFTYPE;
	}
			
	if (xbdif_lookup(domid, handle) != NULL) {
		return EEXIST;
	}
	xbdi = malloc(sizeof(struct xbdback_instance), M_DEVBUF,
	    M_NOWAIT | M_ZERO);
	if (xbdi == NULL) {
		return ENOMEM;
	}
	xbdi->xbdi_domid = domid;
	xbdi->xbdi_handle = handle;
	xbdi->xbdi_status = DISCONNECTED;
	xbdi->xbdi_refcnt = 1;
	SLIST_INSERT_HEAD(&xbdback_instances, xbdi, next);
	xbusd->xbusd_u.b.b_cookie = xbdi;	
	xbusd->xbusd_u.b.b_detach = xbdback_xenbus_destroy;
	xbusd->xbusd_otherend_changed = xbdback_frontend_changed;
	xbdi->xbdi_xbusd = xbusd;

	error = xenbus_watch_path2(xbusd, xbusd->xbusd_path, "physical-device",
	    &xbdi->xbdi_watch,  xbdback_backend_changed);
	if (error) {
		printf("failed to watch on %s/physical-device: %d\n",
		    xbusd->xbusd_path, error);
		goto fail;
	}
	xbdi->xbdi_watch.xbw_dev = xbusd;
	error = xenbus_switch_state(xbusd, NULL, XenbusStateInitWait);
	if (error) {
		printf("failed to switch state on %s: %d\n",
		    xbusd->xbusd_path, error);
		goto fail2;
	}
	return 0;
fail2:
	unregister_xenbus_watch(&xbdi->xbdi_watch);
fail:
	free(xbdi, M_DEVBUF);
	return error;
}

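/*
 * Tear an instance down: finish disconnecting from the frontend if
 * needed, drop the xenstore watch, unmap the shared ring, close the
 * backing device and free the instance.
 */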
static int
xbdback_xenbus_destroy(void *arg)
{
	struct xbdback_instance *xbdi = arg;
	struct xenbus_device *xbusd = xbdi->xbdi_xbusd;
	struct gnttab_unmap_grant_ref ungrop;
	int err, s;

	XENPRINTF(("xbdback_xenbus_destroy state %d\n", xbdi->xbdi_status));

	if (xbdi->xbdi_status != DISCONNECTED) {
		hypervisor_mask_event(xbdi->xbdi_evtchn);
		event_remove_handler(xbdi->xbdi_evtchn, xbdback_evthandler,
		    xbdi);
		xbdi->xbdi_status = DISCONNECTING;
		s = splbio();
		xbdi_put(xbdi);
		while (xbdi->xbdi_status != DISCONNECTED) {
			tsleep(&xbdi->xbdi_status, PRIBIO, "xbddis", 0);
		}
		splx(s);
	}
	/* unregister watch */
	if (xbdi->xbdi_watch.node) {
		unregister_xenbus_watch(&xbdi->xbdi_watch);
		free(xbdi->xbdi_watch.node, M_DEVBUF);
		xbdi->xbdi_watch.node = NULL;
	}
	/* unmap ring */
	if (xbdi->xbdi_ring_va != 0) {
		ungrop.host_addr = xbdi->xbdi_ring_va;
		ungrop.handle = xbdi->xbdi_ring_handle;
		ungrop.dev_bus_addr = 0;
		err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
		    &ungrop, 1);
		if (err)
		    printf("xbdback %s: unmap_grant_ref failed: %d\n",
			xbusd->xbusd_otherend, err);
		uvm_km_free(kernel_map, xbdi->xbdi_ring_va,
		    PAGE_SIZE, UVM_KMF_VAONLY);
	}
	/* close device */
	if (xbdi->xbdi_size) {
		printf("xbd backend: detach device %s%d%c for domain %d\n",
		    devsw_blk2name(major(xbdi->xbdi_dev)),
		    DISKUNIT(xbdi->xbdi_dev), DISKPART(xbdi->xbdi_dev) + 'a',
		    xbdi->xbdi_domid);
		vn_close(xbdi->xbdi_vp, FREAD, NOCRED);
	}
	SLIST_REMOVE(&xbdback_instances, xbdi, xbdback_instance, next);
	free(xbdi, M_DEVBUF);
	return 0;
}

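/*
 * The frontend changed state.  On Initialised/Connected, read the ring
 * reference, event channel and protocol from the store, map the ring
 * and bind the event channel.  On Closing, tear the connection down.
 */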
static void
xbdback_frontend_changed(void *arg, XenbusState new_state)
{
	struct xbdback_instance *xbdi = arg;
	struct xenbus_device *xbusd = xbdi->xbdi_xbusd;
	u_long ring_ref, revtchn;
	struct gnttab_map_grant_ref grop;
	struct gnttab_unmap_grant_ref ungrop;
	evtchn_op_t evop;
	char evname[16];
	const char *proto;
	char *xsproto;
	int len;
	int err, s;

	XENPRINTF(("xbdback %s: new state %d\n", xbusd->xbusd_path, new_state));
	switch(new_state) {
	case XenbusStateInitialising:
		break;
	case XenbusStateInitialised:
	case XenbusStateConnected:
		if (xbdi->xbdi_status == CONNECTED)
			break;
		/* read communication information */
		err = xenbus_read_ul(NULL, xbusd->xbusd_otherend,
		    "ring-ref", &ring_ref, 10);
		if (err) {
			xenbus_dev_fatal(xbusd, err, "reading %s/ring-ref",
			    xbusd->xbusd_otherend);
			break;
		}
		err = xenbus_read_ul(NULL, xbusd->xbusd_otherend,
		    "event-channel", &revtchn, 10);
		if (err) {
			xenbus_dev_fatal(xbusd, err, "reading %s/event-channel",
			    xbusd->xbusd_otherend);
			break;
		}
		err = xenbus_read(NULL, xbusd->xbusd_otherend, "protocol",
		    &len, &xsproto);
		if (err) {
			proto = "unspecified";
			xbdi->xbdi_proto = XBDIP_NATIVE;
		} else {
			if(strcmp(xsproto, XEN_IO_PROTO_ABI_NATIVE) == 0) {
				xbdi->xbdi_proto = XBDIP_NATIVE;
				proto = XEN_IO_PROTO_ABI_NATIVE;
			} else if(strcmp(xsproto, XEN_IO_PROTO_ABI_X86_32) == 0) {
				xbdi->xbdi_proto = XBDIP_32;
				proto = XEN_IO_PROTO_ABI_X86_32;
			} else if(strcmp(xsproto, XEN_IO_PROTO_ABI_X86_64) == 0) {
				xbdi->xbdi_proto = XBDIP_64;
				proto = XEN_IO_PROTO_ABI_X86_64;
			} else {
				printf("xbd domain %d: unknown proto %s\n",
				    xbdi->xbdi_domid, xsproto);
				free(xsproto, M_DEVBUF);
				return;
			}
			free(xsproto, M_DEVBUF);
		}
		/* allocate VA space and map rings */
		xbdi->xbdi_ring_va = uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
		    UVM_KMF_VAONLY);
		if (xbdi->xbdi_ring_va == 0) {
			xenbus_dev_fatal(xbusd, ENOMEM,
			    "can't get VA for ring", xbusd->xbusd_otherend);
			break;
		}
		grop.host_addr = xbdi->xbdi_ring_va;
		grop.flags = GNTMAP_host_map;
		grop.ref = ring_ref;
		grop.dom = xbdi->xbdi_domid;
		err = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
		    &grop, 1);
		if (err || grop.status) {
			printf("xbdback %s: can't map grant ref: %d/%d\n",
			    xbusd->xbusd_path, err, grop.status);
			xenbus_dev_fatal(xbusd, EINVAL,
			    "can't map ring", xbusd->xbusd_otherend);
			goto err;
		}
		xbdi->xbdi_ring_handle = grop.handle;
		switch(xbdi->xbdi_proto) {
		case XBDIP_NATIVE:
		{
			blkif_sring_t *sring = (void *)xbdi->xbdi_ring_va;
			BACK_RING_INIT(&xbdi->xbdi_ring.ring_n,
			    sring, PAGE_SIZE);
			break;
		}
		case XBDIP_32:
		{
			blkif_x86_32_sring_t *sring =
			    (void *)xbdi->xbdi_ring_va;
			BACK_RING_INIT(&xbdi->xbdi_ring.ring_32,
			    sring, PAGE_SIZE);
			break;
		}
		case XBDIP_64:
		{
			blkif_x86_64_sring_t *sring =
			    (void *)xbdi->xbdi_ring_va;
			BACK_RING_INIT(&xbdi->xbdi_ring.ring_64,
			    sring, PAGE_SIZE);
			break;
		}
		}
		evop.cmd = EVTCHNOP_bind_interdomain;
		evop.u.bind_interdomain.remote_dom = xbdi->xbdi_domid;
		evop.u.bind_interdomain.remote_port = revtchn;
		err = HYPERVISOR_event_channel_op(&evop);
		if (err) {
			aprint_error("blkback %s: "
			    "can't get event channel: %d\n",
			    xbusd->xbusd_otherend, err);
			xenbus_dev_fatal(xbusd, err,
			    "can't bind event channel", xbusd->xbusd_otherend);
			goto err2;
		}
		xbdi->xbdi_evtchn = evop.u.bind_interdomain.local_port;
		snprintf(evname, sizeof(evname), "xbd%d.%d",
		    xbdi->xbdi_domid, xbdi->xbdi_handle);
		event_set_handler(xbdi->xbdi_evtchn, xbdback_evthandler,
		    xbdi, IPL_BIO, evname);
		aprint_verbose("xbd backend 0x%x for domain %d "
		    "using event channel %d, protocol %s\n", xbdi->xbdi_handle,
		    xbdi->xbdi_domid, xbdi->xbdi_evtchn, proto);
		hypervisor_enable_event(xbdi->xbdi_evtchn);
		hypervisor_notify_via_evtchn(xbdi->xbdi_evtchn);
		xbdi->xbdi_status = CONNECTED;
		break;
	case XenbusStateClosing:
		hypervisor_mask_event(xbdi->xbdi_evtchn);
		event_remove_handler(xbdi->xbdi_evtchn, xbdback_evthandler,
		    xbdi);
		xbdi->xbdi_status = DISCONNECTING;
		s = splbio();
		xbdi_put(xbdi);
		while (xbdi->xbdi_status != DISCONNECTED) {
			tsleep(&xbdi->xbdi_status, PRIBIO, "xbddis", 0);
		}
		splx(s);
		xenbus_switch_state(xbusd, NULL, XenbusStateClosing);
		break;
	case XenbusStateClosed:
		/* otherend_changed() should handle it for us */
		panic("xbdback_frontend_changed: closed\n");
	case XenbusStateUnknown:
	case XenbusStateInitWait:
	default:
		aprint_error("xbdback %s: invalid frontend state %d\n",
		    xbusd->xbusd_path, new_state);
	}
	return;
err2:
	/* unmap ring */
	ungrop.host_addr = xbdi->xbdi_ring_va;
	ungrop.handle = xbdi->xbdi_ring_handle;
	ungrop.dev_bus_addr = 0;
	err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
	    &ungrop, 1);
	if (err)
	    printf("xbdback %s: unmap_grant_ref failed: %d\n",
		xbusd->xbusd_path, err);
err:
	uvm_km_free(kernel_map, xbdi->xbdi_ring_va, PAGE_SIZE, UVM_KMF_VAONLY);
	return;
}

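/*
 * Our physical-device node changed: open the corresponding block
 * device, determine its size (wedge or partition), publish sectors,
 * info and sector-size to the store, and switch to Connected.
 */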
static void
xbdback_backend_changed(struct xenbus_watch *watch,
    const char **vec, unsigned int len)
{
	struct xenbus_device *xbusd = watch->xbw_dev;
	struct xbdback_instance *xbdi = xbusd->xbusd_u.b.b_cookie;
	int err;
	long dev;
	char *mode;
	struct xenbus_transaction *xbt;
	const char *devname;
	int major;

	err = xenbus_read_ul(NULL, xbusd->xbusd_path, "physical-device",
	    &dev, 10);
	/*
	 * An error can occur as the watch can fire just after being
	 * registered. So we have to ignore errors here :(
	 */
	if (err)
		return;
	if (xbdi->xbdi_status == CONNECTED && xbdi->xbdi_dev != dev) {
		printf("xbdback %s: changing physical device from 0x%x to "
		    "0x%lx not supported\n", xbusd->xbusd_path, xbdi->xbdi_dev,
		    dev);
		return;
	}
	xbdi->xbdi_dev = dev;
	err = xenbus_read(NULL, xbusd->xbusd_path, "mode", NULL, &mode);
	if (err) {
		printf("xbdback: failed to read %s/mode: %d\n",
		    xbusd->xbusd_path, err);
		return;
	}
	if (mode[0] == 'w')
		xbdi->xbdi_ro = 0;
	else
		xbdi->xbdi_ro = 1;
	major = major(xbdi->xbdi_dev);
	devname = devsw_blk2name(major);
	if (devname == NULL) {
		printf("xbdback %s: unknown device 0x%x\n", xbusd->xbusd_path,
		    xbdi->xbdi_dev);
		return;
	}
	xbdi->xbdi_bdevsw = bdevsw_lookup(xbdi->xbdi_dev);
	if (xbdi->xbdi_bdevsw == NULL) {
		printf("xbdback %s: no bdevsw for device 0x%x\n",
		    xbusd->xbusd_path, xbdi->xbdi_dev);
		return;
	}
	err = bdevvp(xbdi->xbdi_dev, &xbdi->xbdi_vp);
	if (err) {
		printf("xbdback %s: can't open device 0x%x: %d\n",
		    xbusd->xbusd_path, xbdi->xbdi_dev, err);
		return;
	}
	err = vn_lock(xbdi->xbdi_vp, LK_EXCLUSIVE | LK_RETRY);
	if (err) {
		printf("xbdback %s: can't vn_lock device 0x%x: %d\n",
		    xbusd->xbusd_path, xbdi->xbdi_dev, err);
		vrele(xbdi->xbdi_vp);
		return;
	}
	err  = VOP_OPEN(xbdi->xbdi_vp, FREAD, NOCRED);
	if (err) {
		printf("xbdback %s: can't VOP_OPEN device 0x%x: %d\n",
		    xbusd->xbusd_path, xbdi->xbdi_dev, err);
		vput(xbdi->xbdi_vp);
		return;
	}
	VOP_UNLOCK(xbdi->xbdi_vp, 0);
	if (strcmp(devname, "dk") == 0) {
		/* dk device; get wedge data */
		struct dkwedge_info wi;
		err = VOP_IOCTL(xbdi->xbdi_vp, DIOCGWEDGEINFO, &wi,
		    FREAD, NOCRED);
		if (err) {
			printf("xbdback %s: can't DIOCGWEDGEINFO device "
			    "0x%x: %d\n", xbusd->xbusd_path,
			    xbdi->xbdi_dev, err);
			xbdi->xbdi_size = xbdi->xbdi_dev = 0;
			vn_close(xbdi->xbdi_vp, FREAD, NOCRED);
			xbdi->xbdi_vp = NULL;
			return;
		}
		xbdi->xbdi_size = wi.dkw_size;
		printf("xbd backend: attach device %s (size %" PRIu64 ") "
		    "for domain %d\n", wi.dkw_devname, xbdi->xbdi_size,
		    xbdi->xbdi_domid);
	} else {
		/* disk device, get partition data */
		struct partinfo dpart;
		err = VOP_IOCTL(xbdi->xbdi_vp, DIOCGPART, &dpart, FREAD, 0);
		if (err) {
			printf("xbdback %s: can't DIOCGPART device 0x%x: %d\n",
			    xbusd->xbusd_path, xbdi->xbdi_dev, err);
			xbdi->xbdi_size = xbdi->xbdi_dev = 0;
			vn_close(xbdi->xbdi_vp, FREAD, NOCRED);
			xbdi->xbdi_vp = NULL;
			return;
		}
		xbdi->xbdi_size = dpart.part->p_size;
		printf("xbd backend: attach device %s%d%c (size %" PRIu64 ") "
		    "for domain %d\n", devname, DISKUNIT(xbdi->xbdi_dev),
		    DISKPART(xbdi->xbdi_dev) + 'a', xbdi->xbdi_size,
		    xbdi->xbdi_domid);
	}
again:
	xbt = xenbus_transaction_start();
	if (xbt == NULL) {
		printf("xbdback %s: can't start transaction\n",
		    xbusd->xbusd_path);
		    return;
	}
	err = xenbus_printf(xbt, xbusd->xbusd_path, "sectors", "%" PRIu64 ,
	    xbdi->xbdi_size);
	if (err) {
		printf("xbdback: failed to write %s/sectors: %d\n",
		    xbusd->xbusd_path, err);
		goto abort;
	}
	err = xenbus_printf(xbt, xbusd->xbusd_path, "info", "%u",
	    xbdi->xbdi_ro ? VDISK_READONLY : 0);
	if (err) {
		printf("xbdback: failed to write %s/info: %d\n",
		    xbusd->xbusd_path, err);
		goto abort;
	}
	err = xenbus_printf(xbt, xbusd->xbusd_path, "sector-size", "%lu",
	    (u_long)DEV_BSIZE);
	if (err) {
		printf("xbdback: failed to write %s/sector-size: %d\n",
		    xbusd->xbusd_path, err);
		goto abort;
	}
	err = xenbus_transaction_end(xbt, 0);
	if (err == EAGAIN)
		goto again;
	if (err) {
		printf("xbdback %s: can't end transaction: %d\n",
		    xbusd->xbusd_path, err);
	}
	err = xenbus_switch_state(xbusd, NULL, XenbusStateConnected);
	if (err) {
		printf("xbdback %s: can't switch state: %d\n",
		    xbusd->xbusd_path, err);
	}
	return;
abort:
	xenbus_transaction_end(xbt, 1);
}


static void
xbdback_finish_disconnect(struct xbdback_instance *xbdi)
{
	KASSERT(xbdi->xbdi_status == DISCONNECTING);

	xbdi->xbdi_status = DISCONNECTED;
	wakeup(&xbdi->xbdi_status);
}

static struct xbdback_instance *
xbdif_lookup(domid_t dom, uint32_t handle)
{
	struct xbdback_instance *xbdi;

	SLIST_FOREACH(xbdi, &xbdback_instances, next) {
		if (xbdi->xbdi_domid == dom && xbdi->xbdi_handle == handle)
			return xbdi;
	}
	return NULL;
}

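/*
 * Ring event from the frontend: kick the continuation machinery off at
 * xbdback_co_main unless a chain is already running.
 */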
static int
xbdback_evthandler(void *arg)
{
	struct xbdback_instance *xbdi = arg;

	XENPRINTF(("xbdback_evthandler domain %d: cont %p\n",
	    xbdi->xbdi_domid, xbdi->xbdi_cont));

	if (xbdi->xbdi_cont == NULL) {
		xbdi->xbdi_cont = xbdback_co_main;
		xbdback_trampoline(xbdi, xbdi);
	}
	return 1;
}

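/*
 * Record how far the frontend's request producer has gone, then start
 * consuming requests.
 */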
static void *
xbdback_co_main(struct xbdback_instance *xbdi, void *obj)
{
	(void)obj;
	xbdi->xbdi_req_prod = xbdi->xbdi_ring.ring_n.sring->req_prod;
	x86_lfence(); /* ensure we see all requests up to req_prod */
	/*
	 * note that we'll eventually get a full ring of requests.
	 * in this case, MASK_BLKIF_IDX(req_cons) == MASK_BLKIF_IDX(req_prod)
	 */
	xbdi->xbdi_cont = xbdback_co_main_loop;
	return xbdi;
}

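/*
 * Fetch the next request from the ring, converting it from the
 * frontend's ABI (native, x86_32 or x86_64) to the native layout, and
 * dispatch READ/WRITE requests; anything else gets an error reply.
 */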
static void *
xbdback_co_main_loop(struct xbdback_instance *xbdi, void *obj) 
{
	blkif_request_t *req = &xbdi->xbdi_xen_req;
	blkif_x86_32_request_t *req32;
	blkif_x86_64_request_t *req64;
	int i;

	(void)obj;
	if (xbdi->xbdi_ring.ring_n.req_cons != xbdi->xbdi_req_prod) {
		switch(xbdi->xbdi_proto) {
		case XBDIP_NATIVE:
			memcpy(req, RING_GET_REQUEST(&xbdi->xbdi_ring.ring_n,
			    xbdi->xbdi_ring.ring_n.req_cons),
			    sizeof(blkif_request_t));
			break;
		case XBDIP_32:
			req32 = RING_GET_REQUEST(&xbdi->xbdi_ring.ring_32,
			    xbdi->xbdi_ring.ring_n.req_cons);
			req->operation = req32->operation;
			req->nr_segments = req32->nr_segments;
			req->handle = req32->handle;
			req->id = req32->id;
			req->sector_number = req32->sector_number;
			for (i = 0; i < req->nr_segments; i++)
				req->seg[i] = req32->seg[i];
			break;
			    
		case XBDIP_64:
			req64 = RING_GET_REQUEST(&xbdi->xbdi_ring.ring_64,
			    xbdi->xbdi_ring.ring_n.req_cons);
			req->operation = req64->operation;
			req->nr_segments = req64->nr_segments;
			req->handle = req64->handle;
			req->id = req64->id;
			req->sector_number = req64->sector_number;
			for (i = 0; i < req->nr_segments; i++)
				req->seg[i] = req64->seg[i];
			break;
		}
		XENPRINTF(("xbdback op %d req_cons 0x%x req_prod 0x%x "
		    "resp_prod 0x%x id %" PRIu64 "\n", req->operation,
			xbdi->xbdi_ring.ring_n.req_cons,
			xbdi->xbdi_req_prod,
			xbdi->xbdi_ring.ring_n.rsp_prod_pvt,
			req->id));
		switch(req->operation) {
		case BLKIF_OP_READ:
		case BLKIF_OP_WRITE:
			xbdi->xbdi_cont = xbdback_co_io;
			break;
		default:
			printf("xbdback_evthandler domain %d: unknown "
			    "operation %d\n", xbdi->xbdi_domid, req->operation);
			xbdback_send_reply(xbdi, req->id, req->operation,
			    BLKIF_RSP_ERROR);
			xbdi->xbdi_cont = xbdback_co_main_incr;
			break;
		}
	} else {
		xbdi->xbdi_cont = xbdback_co_main_done;
	}
	return xbdi;
}

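/* Advance the ring consumer index and look at the next request. */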
static void *
xbdback_co_main_incr(struct xbdback_instance *xbdi, void *obj)
{
	(void)obj;
	xbdi->xbdi_ring.ring_n.req_cons++;
	xbdi->xbdi_cont = xbdback_co_main_loop;
	return xbdi;
}

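/* Ring looks empty: flush any pending coalesced I/O before finishing. */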
static void *
xbdback_co_main_done(struct xbdback_instance *xbdi, void *obj)
{
	(void)obj;
	if (xbdi->xbdi_io != NULL) {
		xbdi->xbdi_cont = xbdback_co_flush;
		xbdi->xbdi_cont_aux = xbdback_co_main_done2;
	} else {
		xbdi->xbdi_cont = xbdback_co_main_done2;
	}
	return xbdi;
}

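/* Re-check the ring for requests that arrived while we were working. */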
static void *
xbdback_co_main_done2(struct xbdback_instance *xbdi, void *obj)
{
	int work_to_do;

	RING_FINAL_CHECK_FOR_REQUESTS(&xbdi->xbdi_ring.ring_n, work_to_do);
	if (work_to_do)
		xbdi->xbdi_cont = xbdback_co_main;
	else
		xbdi->xbdi_cont = NULL;
	return xbdi;
}

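/*
 * Sanity-check the request (segment count, writes to a read-only
 * device) and allocate an xbdback_request to track it.
 */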
static void *
xbdback_co_io(struct xbdback_instance *xbdi, void *obj)
{	
	int error;

	(void)obj;
	if (xbdi->xbdi_xen_req.nr_segments < 1 ||
	    xbdi->xbdi_xen_req.nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST ) {
		printf("xbdback_io domain %d: %d segments\n",
		       xbdi->xbdi_domid, xbdi->xbdi_xen_req.nr_segments);
		error = EINVAL;
		goto end;
	}
	if (xbdi->xbdi_xen_req.operation == BLKIF_OP_WRITE) {
		if (xbdi->xbdi_ro) {
			error = EROFS;
			goto end;
		}
	}

	xbdi->xbdi_segno = 0;

	xbdi->xbdi_cont = xbdback_co_io_gotreq;
	return xbdback_pool_get(&xbdback_request_pool, xbdi);
 end:
	xbdback_send_reply(xbdi, xbdi->xbdi_xen_req.id,
	    xbdi->xbdi_xen_req.operation, error);
	xbdi->xbdi_cont = xbdback_co_main_incr;
	return xbdi;
}

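/*
 * obj is the freshly allocated xbdback_request: fill it in from the
 * Xen request and decide whether it can be coalesced with the I/O
 * currently being built; if not, flush that I/O first.
 */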
static void *
xbdback_co_io_gotreq(struct xbdback_instance *xbdi, void *obj)
{
	struct xbdback_request *xrq;

	xrq = xbdi->xbdi_req = obj;
	
	xrq->rq_xbdi = xbdi;
	xrq->rq_iocount = 0;
	xrq->rq_ioerrs = 0;
	xrq->rq_id = xbdi->xbdi_xen_req.id;
	xrq->rq_operation = xbdi->xbdi_xen_req.operation;

	/* 
	 * Request-level reasons not to coalesce: different device,
	 * different op, or noncontiguous disk sectors (vs. previous
	 * request handed to us).
	 */
	xbdi->xbdi_cont = xbdback_co_io_loop;
	if (xbdi->xbdi_io != NULL) {
		struct xbdback_request *last_req;
		last_req = SLIST_FIRST(&xbdi->xbdi_io->xio_rq)->car;
		XENPRINTF(("xbdback_io domain %d: hoping for sector %" PRIu64
		    "; got %" PRIu64 "\n", xbdi->xbdi_domid,
		    xbdi->xbdi_next_sector,
		    xbdi->xbdi_xen_req.sector_number));
		if ((xrq->rq_operation != last_req->rq_operation)
		    || (xbdi->xbdi_xen_req.sector_number !=
		    xbdi->xbdi_next_sector)) {
			XENPRINTF(("xbdback_io domain %d: segment break\n",
			    xbdi->xbdi_domid));
			xbdi->xbdi_next_sector =
			    xbdi->xbdi_xen_req.sector_number;
			xbdi->xbdi_cont_aux = xbdi->xbdi_cont; 
			xbdi->xbdi_cont = xbdback_co_flush;
		}
	} else {
		xbdi->xbdi_next_sector = xbdi->xbdi_xen_req.sector_number;
	}
	return xbdi;
}


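/*
 * Loop over the segments of the current request, deciding for each one
 * whether it can be merged into the I/O being built or whether a new
 * xbdback_io (or a flush of the current one) is needed.
 */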
static void *
xbdback_co_io_loop(struct xbdback_instance *xbdi, void *obj)
{
	struct xbdback_io *xio;

	(void)obj;
	if (xbdi->xbdi_segno < xbdi->xbdi_xen_req.nr_segments) {
		uint8_t this_fs, this_ls, last_fs, last_ls;
		grant_ref_t thisgrt, lastgrt;
		/* 
		 * Segment-level reason to coalesce: handling full
		 * pages, or adjacent sector ranges from the same page
		 * (and yes, this latter does happen).  But not if the
		 * array of client pseudo-physical pages is full.
		 */
		this_fs = xbdi->xbdi_xen_req.seg[xbdi->xbdi_segno].first_sect;
		this_ls = xbdi->xbdi_xen_req.seg[xbdi->xbdi_segno].last_sect;
		thisgrt = xbdi->xbdi_xen_req.seg[xbdi->xbdi_segno].gref;
		XENPRINTF(("xbdback_io domain %d: "
			   "first,last_sect[%d]=0%o,0%o\n",
			   xbdi->xbdi_domid, xbdi->xbdi_segno,
			   this_fs, this_ls));
		last_fs = xbdi->xbdi_last_fs = xbdi->xbdi_this_fs;
		last_ls = xbdi->xbdi_last_ls = xbdi->xbdi_this_ls;
		lastgrt = xbdi->xbdi_lastgrt = xbdi->xbdi_thisgrt;
		xbdi->xbdi_this_fs = this_fs;
		xbdi->xbdi_this_ls = this_ls;
		xbdi->xbdi_thisgrt = thisgrt;
		if (xbdi->xbdi_io != NULL) {
			if (last_ls == VBD_MAXSECT
			    && this_fs == 0
			    && xbdi->xbdi_io->xio_nrma
			    < XENSHM_MAX_PAGES_PER_REQUEST) {
				xbdi->xbdi_same_page = 0;
			} else if (last_ls + 1
				       == this_fs
#ifdef notyet
				   && (last_fas & ~PAGE_MASK)
				       == (this_fas & ~PAGE_MASK)
#else 
				  && 0 /* can't know frame number yet */
#endif
			    ) {
#ifdef DEBUG
				static struct timeval gluetimer;
				if (ratecheck(&gluetimer,
					      &xbdback_fragio_intvl))
					printf("xbdback: domain %d sending"
					    " excessively fragmented I/O\n",
					    xbdi->xbdi_domid);
#endif
				printf("xbdback_io: would maybe glue same "
				    "page sec %d (%d->%d)\n",
				    xbdi->xbdi_segno, this_fs, this_ls);
				panic("notyet!");
				XENPRINTF(("xbdback_io domain %d: glue same "
				    "page", xbdi->xbdi_domid));
				xbdi->xbdi_same_page = 1;
			} else {
				xbdi->xbdi_cont_aux = xbdback_co_io_loop;
				xbdi->xbdi_cont = xbdback_co_flush;
				return xbdi;
			}
		} else
			xbdi->xbdi_same_page = 0;

		if (xbdi->xbdi_io == NULL) {
			xbdi->xbdi_cont = xbdback_co_io_gotio;
			xio = xbdback_pool_get(&xbdback_io_pool, xbdi);
			buf_init(&xio->xio_buf);
			return xio;
		} else {
			xbdi->xbdi_cont = xbdback_co_io_gotio2;
		}
	} else {
		/* done with the loop over segments; get next request */
		xbdi->xbdi_cont = xbdback_co_main_incr;
	}
	return xbdi;			
}


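/*
 * obj is a fresh xbdback_io: take a reference on the instance and set
 * up the io and its struct buf (device, starting block, iodone
 * callback); b_data temporarily holds the offset of the first sector
 * within the first mapped page.
 */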
static void *
xbdback_co_io_gotio(struct xbdback_instance *xbdi, void *obj)
{
	struct xbdback_io *xbd_io;
	vaddr_t start_offset; /* start offset in vm area */
	int buf_flags;

	xbdi_get(xbdi);
	
	xbd_io = xbdi->xbdi_io = obj;
	xbd_io->xio_xbdi = xbdi;
	SLIST_INIT(&xbd_io->xio_rq);
	xbd_io->xio_nrma = 0;
	xbd_io->xio_mapped = 0;

	start_offset = xbdi->xbdi_this_fs * VBD_BSIZE;
	
	if (xbdi->xbdi_xen_req.operation == BLKIF_OP_WRITE) {
		buf_flags = B_WRITE;
	} else {
		buf_flags = B_READ;
	}

	xbd_io->xio_buf.b_flags = buf_flags;
	xbd_io->xio_buf.b_cflags = 0;
	xbd_io->xio_buf.b_oflags = 0;
	xbd_io->xio_buf.b_iodone = xbdback_iodone;
	xbd_io->xio_buf.b_proc = NULL;
	xbd_io->xio_buf.b_vp = xbdi->xbdi_vp;
	xbd_io->xio_buf.b_objlock = &xbdi->xbdi_vp->v_interlock;
	xbd_io->xio_buf.b_dev = xbdi->xbdi_dev;
	xbd_io->xio_buf.b_blkno = xbdi->xbdi_next_sector;
	xbd_io->xio_buf.b_bcount = 0;
	xbd_io->xio_buf.b_data = (void *)start_offset;
	xbd_io->xio_buf.b_private = xbd_io;

	xbdi->xbdi_cont = xbdback_co_io_gotio2;
	return xbdi;
}


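/*
 * If this is the first segment of the request or of the I/O, allocate
 * a fragment to link the request to the I/O; otherwise go straight to
 * the per-segment accounting.
 */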
static void *
xbdback_co_io_gotio2(struct xbdback_instance *xbdi, void *obj)
{
	(void)obj;
	if (xbdi->xbdi_segno == 0 || SLIST_EMPTY(&xbdi->xbdi_io->xio_rq)) {
		/* if this is the first segment of a new request */
		/* or if it's the first segment of the io */
		xbdi->xbdi_cont = xbdback_co_io_gotfrag;
		return xbdback_pool_get(&xbdback_fragment_pool, xbdi);
	}
	xbdi->xbdi_cont = xbdback_co_io_gotfrag2;
	return xbdi;
}


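/*
 * obj is the fragment: link the current request to the I/O and bump
 * the request's outstanding I/O count.
 */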
static void *
xbdback_co_io_gotfrag(struct xbdback_instance *xbdi, void *obj)
{
	struct xbdback_fragment *xbd_fr;

	xbd_fr = obj;
	xbd_fr->car = xbdi->xbdi_req;
	SLIST_INSERT_HEAD(&xbdi->xbdi_io->xio_rq, xbd_fr, cdr);
	++xbdi->xbdi_req->rq_iocount;

	xbdi->xbdi_cont = xbdback_co_io_gotfrag2;
	return xbdi;
}

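/*
 * Account for the current segment: validate its size, record its grant
 * unless it shares a page with the previous segment, and extend the
 * buffer accordingly.
 */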
static void *
xbdback_co_io_gotfrag2(struct xbdback_instance *xbdi, void *obj)
{
	struct xbdback_io *xbd_io;
	int seg_size;
	uint8_t this_fs, this_ls;

	this_fs = xbdi->xbdi_this_fs;
	this_ls = xbdi->xbdi_this_ls;
	xbd_io = xbdi->xbdi_io;
	seg_size = this_ls - this_fs + 1;

	if (seg_size < 0) {
		printf("xbdback_io domain %d: negative-size request (%d %d)\n",
		       xbdi->xbdi_domid, this_ls, this_fs);
		xbdback_io_error(xbdi->xbdi_io, EINVAL);
		xbdi->xbdi_io = NULL;
		xbdi->xbdi_cont = xbdback_co_main_incr;
		return xbdi;
	}
	
	if (!xbdi->xbdi_same_page) {
		XENPRINTF(("xbdback_io domain %d: appending grant %u\n",
			   xbdi->xbdi_domid, (u_int)xbdi->xbdi_thisgrt));
		xbd_io->xio_gref[xbd_io->xio_nrma++] = xbdi->xbdi_thisgrt;
	}

	xbd_io->xio_buf.b_bcount += (daddr_t)(seg_size * VBD_BSIZE);
	XENPRINTF(("xbdback_io domain %d: start sect %d size %d\n",
	    xbdi->xbdi_domid, (int)xbdi->xbdi_next_sector, seg_size));
	
	/* Finally, the end of the segment loop! */
	xbdi->xbdi_next_sector += seg_size;
	++xbdi->xbdi_segno;
	xbdi->xbdi_cont = xbdback_co_io_loop;
	return xbdi;
}


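/*
 * Map the coalesced I/O's granted pages into our address space; this
 * may block waiting for VA space, in which case the chain resumes from
 * the xen_shm callback.
 */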
static void *
xbdback_co_flush(struct xbdback_instance *xbdi, void *obj)
{
	(void)obj;
	XENPRINTF(("xbdback_io domain %d: flush sect %ld size %d ptr 0x%lx\n",
	    xbdi->xbdi_domid, (long)xbdi->xbdi_io->xio_buf.b_blkno,
	    (int)xbdi->xbdi_io->xio_buf.b_bcount, (long)xbdi->xbdi_io));
	xbdi->xbdi_cont = xbdback_co_flush_done;
	return xbdback_map_shm(xbdi->xbdi_io);
}

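/*
 * The shared pages are mapped: hand the I/O to the workqueue and
 * resume the continuation saved in xbdi_cont_aux.
 */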
static void *
xbdback_co_flush_done(struct xbdback_instance *xbdi, void *obj)
{
	(void)obj;
	workqueue_enqueue(xbdback_workqueue, &xbdi->xbdi_io->xio_work, NULL);
	xbdi->xbdi_io = NULL;
	xbdi->xbdi_cont = xbdi->xbdi_cont_aux;
	return xbdi;
}

static void
xbdback_io_error(struct xbdback_io *xbd_io, int error)
{
	xbd_io->xio_buf.b_error = error;
	xbdback_iodone(&xbd_io->xio_buf);
}

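/*
 * Workqueue handler: turn the stored sector offset into a pointer
 * within the mapped area and hand the buffer to the backing device's
 * strategy routine.
 */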
static void
xbdback_do_io(struct work *wk, void *dummy)
{
	struct xbdback_io *xbd_io = (void *)wk;
	KASSERT(&xbd_io->xio_work == wk);

	xbd_io->xio_buf.b_data =
	    (void *)((vaddr_t)xbd_io->xio_buf.b_data + xbd_io->xio_vaddr);
#ifdef DIAGNOSTIC
	{
	vaddr_t bdata = (vaddr_t)xbd_io->xio_buf.b_data;
	int nsegs =
	    ((((bdata + xbd_io->xio_buf.b_bcount - 1) & ~PAGE_MASK) -
	    (bdata & ~PAGE_MASK)) >> PAGE_SHIFT) + 1;
	if ((bdata & ~PAGE_MASK) != (xbd_io->xio_vaddr & ~PAGE_MASK)) {
		printf("xbdback_do_io vaddr 0x%lx bdata 0x%lx\n",
		    xbd_io->xio_vaddr, bdata);
		panic("xbdback_do_io: bdata page change");
	}
	if (nsegs > xbd_io->xio_nrma) {
		printf("xbdback_do_io vaddr 0x%lx bcount 0x%x doesn't fit in "
		    " %d pages\n", bdata, xbd_io->xio_buf.b_bcount,
		    xbd_io->xio_nrma);
		panic("xbdback_do_io: not enough pages");
	}
	}
#endif
	if ((xbd_io->xio_buf.b_flags & B_READ) == 0) {
		mutex_enter(&xbd_io->xio_buf.b_vp->v_interlock);
		xbd_io->xio_buf.b_vp->v_numoutput++;
		mutex_exit(&xbd_io->xio_buf.b_vp->v_interlock);
	}
	bdev_strategy(&xbd_io->xio_buf);
}

/* This gets reused by xbdback_io_error to report errors from other sources. */
static void
xbdback_iodone(struct buf *bp)
{
	struct xbdback_io *xbd_io;
	struct xbdback_instance *xbdi;
	int errp;

	xbd_io = bp->b_private;
	xbdi = xbd_io->xio_xbdi;

	XENPRINTF(("xbdback_io domain %d: iodone ptr 0x%lx\n",
		   xbdi->xbdi_domid, (long)xbd_io));

	if (xbd_io->xio_mapped)
		xbdback_unmap_shm(xbd_io);

	if (bp->b_error != 0) {
		printf("xbd IO domain %d: error %d\n",
		       xbdi->xbdi_domid, bp->b_error);
		errp = 1;
	} else
		errp = 0;

	
	/* for each constituent xbd request */
	while(!SLIST_EMPTY(&xbd_io->xio_rq)) {
		struct xbdback_fragment *xbd_fr;
		struct xbdback_request *xbd_req;
		struct xbdback_instance *rxbdi;
		int error;
		
		xbd_fr = SLIST_FIRST(&xbd_io->xio_rq);
		xbd_req = xbd_fr->car;
		SLIST_REMOVE_HEAD(&xbd_io->xio_rq, cdr);
		xbdback_pool_put(&xbdback_fragment_pool, xbd_fr);
		
		if (errp)
			++xbd_req->rq_ioerrs;
		
		/* finalize it only if this was its last I/O */
		if (--xbd_req->rq_iocount > 0)
			continue;

		rxbdi = xbd_req->rq_xbdi;
		KASSERT(xbdi == rxbdi);
		
		error = xbd_req->rq_ioerrs > 0
		    ? BLKIF_RSP_ERROR
		    : BLKIF_RSP_OKAY;

		XENPRINTF(("xbdback_io domain %d: end request %" PRIu64 " error=%d\n",
		    xbdi->xbdi_domid, xbd_req->rq_id, error));
		xbdback_send_reply(xbdi, xbd_req->rq_id,
		    xbd_req->rq_operation, error);
		xbdback_pool_put(&xbdback_request_pool, xbd_req);
	}
	xbdi_put(xbdi);
	buf_destroy(&xbd_io->xio_buf);
	xbdback_pool_put(&xbdback_io_pool, xbd_io);
}

/*
 * called once a request has completed. Place the reply in the ring and
 * notify the guest OS
 */
static void
xbdback_send_reply(struct xbdback_instance *xbdi, uint64_t id,
    int op, int status)
{
	blkif_response_t *resp_n;
	blkif_x86_32_response_t *resp32;
	blkif_x86_64_response_t *resp64;
	int notify;

	switch(xbdi->xbdi_proto) {
	case XBDIP_NATIVE:
		resp_n = RING_GET_RESPONSE(&xbdi->xbdi_ring.ring_n,
		    xbdi->xbdi_ring.ring_n.rsp_prod_pvt);
		resp_n->id        = id;
		resp_n->operation = op;
		resp_n->status    = status;
		break;
	case XBDIP_32:
		resp32 = RING_GET_RESPONSE(&xbdi->xbdi_ring.ring_32,
		    xbdi->xbdi_ring.ring_n.rsp_prod_pvt);
		resp32->id        = id;
		resp32->operation = op;
		resp32->status    = status;
		break;
	case XBDIP_64:
		resp64 = RING_GET_RESPONSE(&xbdi->xbdi_ring.ring_64,
		    xbdi->xbdi_ring.ring_n.rsp_prod_pvt);
		resp64->id        = id;
		resp64->operation = op;
		resp64->status    = status;
		break;
	}
	xbdi->xbdi_ring.ring_n.rsp_prod_pvt++;
	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbdi->xbdi_ring.ring_n, notify);
	if (notify) {
		XENPRINTF(("xbdback_send_reply notify %d\n", xbdi->xbdi_domid));
		hypervisor_notify_via_evtchn(xbdi->xbdi_evtchn);
	}
}

/*
 * Map a request into our virtual address space.  The caller is
 * expected to have filled the xbd_io->xio_gref array; on success,
 * xio_vaddr and xio_gh describe the resulting mapping.
 */
static void *
xbdback_map_shm(struct xbdback_io *xbd_io)
{
	struct xbdback_instance *xbdi;
	struct xbdback_request *xbd_rq;
	int error, s;

#ifdef XENDEBUG_VBD
	int i;
	printf("xbdback_map_shm map grant ");
	for (i = 0; i < xbd_io->xio_nrma; i++) {
		printf("%u ", (u_int)xbd_io->xio_gref[i]);
	}
#endif

	KASSERT(xbd_io->xio_mapped == 0);

	xbdi = xbd_io->xio_xbdi;
	xbd_rq = SLIST_FIRST(&xbd_io->xio_rq)->car;
	error = xen_shm_map(xbd_io->xio_nrma, xbdi->xbdi_domid,
	    xbd_io->xio_gref, &xbd_io->xio_vaddr, xbd_io->xio_gh, 
	    (xbd_rq->rq_operation == BLKIF_OP_WRITE) ? XSHM_RO: 0);

	switch(error) {
	case 0:
#ifdef XENDEBUG_VBD
		printf("handle ");
		for (i = 0; i < xbd_io->xio_nrma; i++) {
			printf("%u ", (u_int)xbd_io->xio_gh[i]);
		}
		printf("\n");
#endif
		xbd_io->xio_mapped = 1;
		return (void *)xbd_io->xio_vaddr;
	case ENOMEM:
		s = splvm();
		if (!xbdback_shmcb) {
			if (xen_shm_callback(xbdback_shm_callback, xbdi)
			    != 0) {
				splx(s);
				panic("xbdback_map_shm: "
				      "xen_shm_callback failed");
			}
			xbdback_shmcb = 1;
		}
		SIMPLEQ_INSERT_TAIL(&xbdback_shmq, xbdi, xbdi_on_hold);
		splx(s);
		return NULL;
	default:
		printf("xbdback_map_shm: xen_shm error %d ",
		       error);
		xbdback_io_error(xbdi->xbdi_io, error);
		xbdi->xbdi_io = NULL;
		xbdi->xbdi_cont = xbdi->xbdi_cont_aux;
		return xbdi;
	}
}

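/*
 * Called by the xen_shm layer when mapping space becomes available
 * again: retry the mappings of the instances queued on xbdback_shmq.
 */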
static int
xbdback_shm_callback(void *arg)
{
        int error, s;

	s = splvm();
	while(!SIMPLEQ_EMPTY(&xbdback_shmq)) {
		struct xbdback_instance *xbdi;
		struct xbdback_io *xbd_io;
		struct xbdback_request *xbd_rq;
		
		xbdi = SIMPLEQ_FIRST(&xbdback_shmq);
		xbd_io = xbdi->xbdi_io;
		xbd_rq = SLIST_FIRST(&xbd_io->xio_rq)->car;
		KASSERT(xbd_io->xio_mapped == 0);
		
		error = xen_shm_map(xbd_io->xio_nrma,
		    xbdi->xbdi_domid, xbd_io->xio_gref,
		    &xbd_io->xio_vaddr, xbd_io->xio_gh, 
		    XSHM_CALLBACK |
		    ((xbd_rq->rq_operation == BLKIF_OP_WRITE) ? XSHM_RO: 0));
		switch(error) {
		case ENOMEM:
			splx(s);
			return -1; /* will try again later */
		case 0:
			xbd_io->xio_mapped = 1;
			SIMPLEQ_REMOVE_HEAD(&xbdback_shmq, xbdi_on_hold);
			splx(s);
			xbdback_trampoline(xbdi, xbdi);
			s = splvm();
			break;
		default:
			SIMPLEQ_REMOVE_HEAD(&xbdback_shmq, xbdi_on_hold);
			splx(s);
			printf("xbdback_shm_callback: xen_shm error %d\n",
			       error);
			xbdi->xbdi_cont = xbdi->xbdi_cont_aux;
			xbdback_io_error(xbd_io, error);
			xbdback_trampoline(xbdi, xbdi);
			s = splvm();
			break;
		}
	}
	xbdback_shmcb = 0;
	splx(s);
	return 0;
}

/* unmap a request from our virtual address space (request is done) */
static void
xbdback_unmap_shm(struct xbdback_io *xbd_io)
{
#ifdef XENDEBUG_VBD
	int i;
	printf("xbdback_unmap_shm handle ");
	for (i = 0; i < xbd_io->xio_nrma; i++) {
		printf("%u ", (u_int)xbd_io->xio_gh[i]);
	}
	printf("\n");
#endif

	KASSERT(xbd_io->xio_mapped == 1);
	xbd_io->xio_mapped = 0;
	xen_shm_unmap(xbd_io->xio_vaddr, xbd_io->xio_nrma,
	    xbd_io->xio_gh);
	xbd_io->xio_vaddr = -1;
}

/* Obtain memory from a pool, in cooperation with the continuations. */
static void *xbdback_pool_get(struct xbdback_pool *pp,
			      struct xbdback_instance *xbdi)
{
	int s;
	void *item;

	item = pool_get(&pp->p, PR_NOWAIT);
	if (item == NULL) {
		if (ratecheck(&pp->last_warning, &xbdback_poolsleep_intvl))
			printf("xbdback_pool_get: %s is full\n",
			       pp->p.pr_wchan);
		s = splvm();
		SIMPLEQ_INSERT_TAIL(&pp->q, xbdi, xbdi_on_hold);
		splx(s);
	}
	return item;
}

/*
 * Restore memory to a pool... unless an xbdback instance had been
 * waiting for it, in which case that gets the memory first.
 */
static void xbdback_pool_put(struct xbdback_pool *pp, void *item)
{
	int s;
	
	s = splvm();
	if (SIMPLEQ_EMPTY(&pp->q)) {
		splx(s);
		pool_put(&pp->p, item);
	} else {
		struct xbdback_instance *xbdi = SIMPLEQ_FIRST(&pp->q);
		SIMPLEQ_REMOVE_HEAD(&pp->q, xbdi_on_hold);
		splx(s);
		xbdback_trampoline(xbdi, item);
	}
}

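/*
 * Drive the continuation chain: keep calling the current continuation
 * until one blocks (returns NULL) or the chain ends (xbdi_cont is set
 * to NULL).
 */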
static void
xbdback_trampoline(struct xbdback_instance *xbdi, void *obj)
{
	xbdback_cont_t cont;

	while(obj != NULL && xbdi->xbdi_cont != NULL) {
		cont = xbdi->xbdi_cont;
#ifdef DIAGNOSTIC
		xbdi->xbdi_cont = (xbdback_cont_t)0xDEADBEEF;
#endif
		obj = (*cont)(xbdi, obj);
#ifdef DIAGNOSTIC
		if (xbdi->xbdi_cont == (xbdback_cont_t)0xDEADBEEF) {
			printf("xbdback_trampoline: 0x%lx didn't set "
			       "xbdi->xbdi_cont!\n", (long)cont);
			panic("xbdback_trampoline: bad continuation");
		}
#endif
	}
}