NetBSD-5.0.2/sys/arch/xen/xen/xbd_xenbus.c

/*      $NetBSD: xbd_xenbus.c,v 1.34.2.1 2009/03/08 03:12:50 snj Exp $      */

/*
 * Copyright (c) 2006 Manuel Bouyer.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by Manuel Bouyer.
 * 4. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: xbd_xenbus.c,v 1.34.2.1 2009/03/08 03:12:50 snj Exp $");

#include "opt_xen.h"
#include "rnd.h"

#include <sys/param.h>
#include <sys/buf.h>
#include <sys/bufq.h>
#include <sys/device.h>
#include <sys/disk.h>
#include <sys/disklabel.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/stat.h>
#include <sys/vnode.h>

#include <dev/dkvar.h>

#include <uvm/uvm.h>

#if NRND > 0
#include <sys/rnd.h>
#endif

#include <xen/hypervisor.h>
#include <xen/evtchn.h>
#include <xen/granttables.h>
#include <xen/xen3-public/io/blkif.h>
#include <xen/xen3-public/io/protocols.h>

#include <xen/xenbus.h>
#include "locators.h"

#undef XBD_DEBUG
#ifdef XBD_DEBUG
#define DPRINTF(x) printf x
#else
#define DPRINTF(x)
#endif

#define GRANT_INVALID_REF -1

#define XBD_RING_SIZE __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE)

#define XEN_BSHIFT      9               /* log2(XEN_BSIZE) */
#define XEN_BSIZE       (1 << XEN_BSHIFT) 
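
/*
 * The blkif protocol addresses the disk in 512-byte (XEN_BSIZE)
 * sectors; each request segment covers at most one page, i.e. at most
 * 8 such sectors (see the first_sect/last_sect computation in
 * xbdstart()).  XBD_RING_SIZE is the number of request slots that fit
 * in the one-page shared ring; sc_reqs below is sized to match, so a
 * free xbd_req always exists for every ring slot.
 */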

struct xbd_req {
	SLIST_ENTRY(xbd_req) req_next;
	uint16_t req_id; /* ID passed to backend */
	grant_ref_t req_gntref[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	int req_nr_segments; /* number of segments in this request */
	struct buf *req_bp; /* buffer associated with this request */
	void *req_data; /* pointer to the data buffer */
};

struct xbd_xenbus_softc {
	device_t sc_dev;
	struct dk_softc sc_dksc;
	struct dk_intf *sc_di;
	struct xenbus_device *sc_xbusd;

	blkif_front_ring_t sc_ring;

	unsigned int sc_evtchn;

	grant_ref_t sc_ring_gntref;

	struct xbd_req sc_reqs[XBD_RING_SIZE];
	SLIST_HEAD(,xbd_req) sc_xbdreq_head; /* list of free requests */

	int sc_backend_status; /* our status with backend */
#define BLKIF_STATE_DISCONNECTED 0
#define BLKIF_STATE_CONNECTED    1
#define BLKIF_STATE_SUSPENDED    2
	int sc_shutdown;

	uint64_t sc_sectors; /* number of sectors for this device */
	u_long sc_secsize; /* sector size */
	uint64_t sc_xbdsize; /* size of disk in DEV_BSIZE */
	u_long sc_info; /* VDISK_* */
	u_long sc_handle; /* from backend */
#if NRND > 0
	rndsource_element_t     sc_rnd_source;
#endif
};

#if 0
/* too big to be on stack */
static multicall_entry_t rq_mcl[XBD_RING_SIZE+1];
static paddr_t rq_pages[XBD_RING_SIZE];
#endif

static int  xbd_xenbus_match(device_t, cfdata_t, void *);
static void xbd_xenbus_attach(device_t, device_t, void *);
static int  xbd_xenbus_detach(device_t, int);

static int  xbd_xenbus_resume(void *);
static int  xbd_handler(void *);
static int  xbdstart(struct dk_softc *, struct buf *);
static void xbd_backend_changed(void *, XenbusState);
static void xbd_connect(struct xbd_xenbus_softc *);

static int  xbd_map_align(struct xbd_req *);
static void xbd_unmap_align(struct xbd_req *);

CFATTACH_DECL_NEW(xbd_xenbus, sizeof(struct xbd_xenbus_softc),
   xbd_xenbus_match, xbd_xenbus_attach, xbd_xenbus_detach, NULL);

dev_type_open(xbdopen);
dev_type_close(xbdclose);
dev_type_read(xbdread);
dev_type_write(xbdwrite);
dev_type_ioctl(xbdioctl);
dev_type_strategy(xbdstrategy);
dev_type_dump(xbddump);
dev_type_size(xbdsize);

const struct bdevsw xbd_bdevsw = {
	xbdopen, xbdclose, xbdstrategy, xbdioctl,
	xbddump, xbdsize, D_DISK
};

const struct cdevsw xbd_cdevsw = {
	xbdopen, xbdclose, xbdread, xbdwrite, xbdioctl,
	nostop, notty, nopoll, nommap, nokqfilter, D_DISK
};

extern struct cfdriver xbd_cd;

/* Pseudo-disk Interface */
static struct dk_intf dkintf_esdi = {
        DTYPE_ESDI,
	"Xen Virtual ESDI",
	xbdopen,
	xbdclose,
	xbdstrategy,
	xbdstart,
};

static struct dkdriver xbddkdriver = {
        .d_strategy = xbdstrategy,
	.d_minphys = minphys,
};

static int
xbd_xenbus_match(device_t parent, cfdata_t match, void *aux)
{
	struct xenbusdev_attach_args *xa = aux;

	if (strcmp(xa->xa_type, "vbd") != 0)
		return 0;

	if (match->cf_loc[XENBUSCF_ID] != XENBUSCF_ID_DEFAULT &&
	    match->cf_loc[XENBUSCF_ID] != xa->xa_id)
		return 0;

	return 1;
}
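
/*
 * Attach: set up the dk(4) pseudo-disk glue, build the free request
 * list and call xbd_xenbus_resume() to allocate the shared ring and
 * event channel and advertise them to the backend.  The attach stays
 * pending (config_pending_incr) until the backend reaches
 * XenbusStateConnected in xbd_backend_changed().
 */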

static void
xbd_xenbus_attach(device_t parent, device_t self, void *aux)
{
	struct xbd_xenbus_softc *sc = device_private(self);
	struct xenbusdev_attach_args *xa = aux;
	RING_IDX i;
#ifdef XBD_DEBUG
	char **dir, *val;
	int dir_n = 0;
	char id_str[20];
	int err;
#endif

	config_pending_incr();
	aprint_normal(": Xen Virtual Block Device Interface\n");

	sc->sc_dev = self;

#ifdef XBD_DEBUG
	printf("path: %s\n", xa->xa_xbusd->xbusd_path);
	snprintf(id_str, sizeof(id_str), "%d", xa->xa_id);
	err = xenbus_directory(NULL, "device/vbd", id_str, &dir_n, &dir);
	if (err) {
		aprint_error_dev(self, "xenbus_directory err %d\n", err);
	} else {
		printf("%s/\n", xa->xa_xbusd->xbusd_path);
		for (i = 0; i < dir_n; i++) {
			printf("\t/%s", dir[i]);
			err = xenbus_read(NULL, xa->xa_xbusd->xbusd_path, dir[i],
			    NULL, &val);
			if (err) {
				aprint_error_dev(self, "xenbus_read err %d\n", err);
			} else {
				printf(" = %s\n", val);
				free(val, M_DEVBUF);
			}
		}
	}
#endif /* XBD_DEBUG */
	sc->sc_xbusd = xa->xa_xbusd;
	sc->sc_xbusd->xbusd_otherend_changed = xbd_backend_changed;

	dk_sc_init(&sc->sc_dksc, sc, device_xname(self));
	disk_init(&sc->sc_dksc.sc_dkdev, device_xname(self), &xbddkdriver);
	sc->sc_di = &dkintf_esdi;
	/* initialize free requests list */
	SLIST_INIT(&sc->sc_xbdreq_head);
	for (i = 0; i < XBD_RING_SIZE; i++) {
		sc->sc_reqs[i].req_id = i;
		SLIST_INSERT_HEAD(&sc->sc_xbdreq_head, &sc->sc_reqs[i],
		    req_next);
	}

	sc->sc_backend_status = BLKIF_STATE_DISCONNECTED;
	sc->sc_shutdown = 1;
	/* initialise shared structures and tell backend that we are ready */
	xbd_xenbus_resume(sc);

#if NRND > 0
	rnd_attach_source(&sc->sc_rnd_source, device_xname(self),
	    RND_TYPE_DISK, RND_FLAG_NO_COLLECT | RND_FLAG_NO_ESTIMATE);
#endif
}
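
/*
 * Detach: drain pending I/O, revoke the vnodes of any open instances,
 * tear down the disk and the event handler, and wait for the backend
 * to release the ring grant before freeing the ring page.
 */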

static int
xbd_xenbus_detach(device_t dev, int flags)
{
	struct xbd_xenbus_softc *sc = device_private(dev);
	int s, bmaj, cmaj, i, mn;
	s = splbio();
	DPRINTF(("%s: xbd_detach\n", device_xname(dev)));
	if (sc->sc_shutdown == 0) {
		sc->sc_shutdown = 1;
		/* wait for requests to complete */
		while (sc->sc_backend_status == BLKIF_STATE_CONNECTED &&
		    sc->sc_dksc.sc_dkdev.dk_stats->io_busy > 0)
			tsleep(xbd_xenbus_detach, PRIBIO, "xbddetach", hz/2);
	}
	splx(s);

	/* locate the major number */
	bmaj = bdevsw_lookup_major(&xbd_bdevsw);
	cmaj = cdevsw_lookup_major(&xbd_cdevsw);

	/* Nuke the vnodes for any open instances. */
	for (i = 0; i < MAXPARTITIONS; i++) {
		mn = DISKMINOR(device_unit(dev), i);
		vdevgone(bmaj, mn, mn, VBLK);
		vdevgone(cmaj, mn, mn, VCHR);
	}
	if (sc->sc_backend_status == BLKIF_STATE_CONNECTED) {
		/* Delete all of our wedges. */
		dkwedge_delall(&sc->sc_dksc.sc_dkdev);

		s = splbio();
		/* Kill off any queued buffers. */
		bufq_drain(sc->sc_dksc.sc_bufq);
		bufq_free(sc->sc_dksc.sc_bufq);
		splx(s);

		/* detach disk */
		disk_detach(&sc->sc_dksc.sc_dkdev);
		disk_destroy(&sc->sc_dksc.sc_dkdev);
#if NRND > 0
		/* Unhook the entropy source. */
		rnd_detach_source(&sc->sc_rnd_source);
#endif
	}

	hypervisor_mask_event(sc->sc_evtchn);
	event_remove_handler(sc->sc_evtchn, &xbd_handler, sc);
	while (xengnt_status(sc->sc_ring_gntref)) {
		tsleep(xbd_xenbus_detach, PRIBIO, "xbd_ref", hz/2);
	}
	xengnt_revoke_access(sc->sc_ring_gntref);
	uvm_km_free(kernel_map, (vaddr_t)sc->sc_ring.sring,
	    PAGE_SIZE, UVM_KMF_WIRED);
	return 0;
}
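
/*
 * Allocate and grant the shared ring page, allocate an event channel,
 * then publish ring-ref, event-channel and protocol in the xenstore
 * and switch the frontend to XenbusStateInitialised.  Called from
 * xbd_xenbus_attach() for the initial setup.
 */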

static int
xbd_xenbus_resume(void *p)
{
	struct xbd_xenbus_softc *sc = p;
	struct xenbus_transaction *xbt;
	int error;
	blkif_sring_t *ring;
	paddr_t ma;
	const char *errmsg;

	sc->sc_ring_gntref = GRANT_INVALID_REF;


	/* setup device: alloc event channel and shared ring */
	ring = (void *)uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
		UVM_KMF_ZERO | UVM_KMF_WIRED);
	if (ring == NULL)
		panic("xbd_xenbus_resume: can't alloc rings");

	SHARED_RING_INIT(ring);
	FRONT_RING_INIT(&sc->sc_ring, ring, PAGE_SIZE);

	(void)pmap_extract_ma(pmap_kernel(), (vaddr_t)ring, &ma);
	error = xenbus_grant_ring(sc->sc_xbusd, ma, &sc->sc_ring_gntref);
	if (error)
		return error;
	error = xenbus_alloc_evtchn(sc->sc_xbusd, &sc->sc_evtchn);
	if (error)
		return error;
	aprint_verbose_dev(sc->sc_dev, "using event channel %d\n",
	    sc->sc_evtchn);
	event_set_handler(sc->sc_evtchn, &xbd_handler, sc,
	    IPL_BIO, device_xname(sc->sc_dev));

again:
	xbt = xenbus_transaction_start();
	if (xbt == NULL)
		return ENOMEM;
	error = xenbus_printf(xbt, sc->sc_xbusd->xbusd_path,
	    "ring-ref","%u", sc->sc_ring_gntref);
	if (error) {
		errmsg = "writing ring-ref";
		goto abort_transaction;
	}
	error = xenbus_printf(xbt, sc->sc_xbusd->xbusd_path,
	    "event-channel", "%u", sc->sc_evtchn);
	if (error) {
		errmsg = "writing event channel";
		goto abort_transaction;
	}
	error = xenbus_printf(xbt, sc->sc_xbusd->xbusd_path,
	    "protocol", "%s", XEN_IO_PROTO_ABI_NATIVE);
	if (error) {
		errmsg = "writing protocol";
		goto abort_transaction;
	}
	error = xenbus_switch_state(sc->sc_xbusd, xbt, XenbusStateInitialised);
	if (error) {
		errmsg = "writing frontend XenbusStateInitialised";
		goto abort_transaction;
	}
	error = xenbus_transaction_end(xbt, 0);
	if (error == EAGAIN)
		goto again;
	if (error) {
		xenbus_dev_fatal(sc->sc_xbusd, error, "completing transaction");
		return -1;
	}
	return 0;

abort_transaction:
	xenbus_transaction_end(xbt, 1);
	xenbus_dev_fatal(sc->sc_xbusd, error, "%s", errmsg);
	return error;
}
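
/*
 * Xenbus watch callback: react to backend state changes.  On Closing,
 * drain outstanding I/O and acknowledge with XenbusStateClosed; on
 * Connected, fetch the disk parameters, synthesize a flat geometry
 * for dk(4), attach the disk and read the disklabel.
 */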

static void
xbd_backend_changed(void *arg, XenbusState new_state)
{
	struct xbd_xenbus_softc *sc = device_private((device_t)arg);
	struct dk_geom *pdg;
	char buf[9];
	int s;
	DPRINTF(("%s: new backend state %d\n", device_xname(sc->sc_dev), new_state));

	switch (new_state) {
	case XenbusStateUnknown:
	case XenbusStateInitialising:
	case XenbusStateInitWait:
	case XenbusStateInitialised:
		break;
	case XenbusStateClosing:
		s = splbio();
		sc->sc_shutdown = 1;
		/* wait for requests to complete */
		while (sc->sc_backend_status == BLKIF_STATE_CONNECTED &&
		    sc->sc_dksc.sc_dkdev.dk_stats->io_busy > 0)
			tsleep(xbd_xenbus_detach, PRIBIO, "xbddetach",
			    hz/2);
		splx(s);
		xenbus_switch_state(sc->sc_xbusd, NULL, XenbusStateClosed);
		break;
	case XenbusStateConnected:
		/*
		 * note that xbd_backend_changed() can only be called by
		 * the xenbus thread.
		 */

		if (sc->sc_backend_status == BLKIF_STATE_CONNECTED)
			/* already connected */
			return;

		xbd_connect(sc);
		sc->sc_shutdown = 0;
		hypervisor_enable_event(sc->sc_evtchn);

		sc->sc_xbdsize =
		    sc->sc_sectors * (uint64_t)sc->sc_secsize / DEV_BSIZE;
		sc->sc_dksc.sc_size = sc->sc_xbdsize;
		pdg = &sc->sc_dksc.sc_geom;
		pdg->pdg_secsize = DEV_BSIZE;
		pdg->pdg_ntracks = 1;
		pdg->pdg_nsectors = 1024 * (1024 / pdg->pdg_secsize);
		pdg->pdg_ncylinders = sc->sc_dksc.sc_size / pdg->pdg_nsectors;

		bufq_alloc(&sc->sc_dksc.sc_bufq, "fcfs", 0);
		sc->sc_dksc.sc_flags |= DKF_INITED;
		disk_attach(&sc->sc_dksc.sc_dkdev);

		sc->sc_backend_status = BLKIF_STATE_CONNECTED;

		/* try to read the disklabel */
		dk_getdisklabel(sc->sc_di, &sc->sc_dksc, 0 /* XXX ? */);
		format_bytes(buf, sizeof(buf), sc->sc_sectors * sc->sc_secsize);
		aprint_verbose_dev(sc->sc_dev,
				"%s, %d bytes/sect x %" PRIu64 " sectors\n",
				buf, (int)pdg->pdg_secsize, sc->sc_xbdsize);
		/* Discover wedges on this disk. */
		dkwedge_discover(&sc->sc_dksc.sc_dkdev);

		/* the disk should be working now */
		config_pending_decr();
		break;
	default:
		panic("bad backend state %d", new_state);
	}
}
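
/*
 * Read the backend-provided parameters (virtual-device handle, size
 * in sectors, sector size, VDISK_* info flags) from the xenstore and
 * acknowledge them by switching to XenbusStateConnected.
 */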

static void
xbd_connect(struct xbd_xenbus_softc *sc)
{
	int err;
	unsigned long long sectors;

	err = xenbus_read_ul(NULL,
	    sc->sc_xbusd->xbusd_path, "virtual-device", &sc->sc_handle, 10);
	if (err)
		panic("%s: can't read number from %s/virtual-device\n",
		    device_xname(sc->sc_dev), sc->sc_xbusd->xbusd_path);
	err = xenbus_read_ull(NULL,
	    sc->sc_xbusd->xbusd_otherend, "sectors", &sectors, 10);
	if (err)
		panic("%s: can't read number from %s/sectors\n", 
		    device_xname(sc->sc_dev), sc->sc_xbusd->xbusd_otherend);
	sc->sc_sectors = sectors;

	err = xenbus_read_ul(NULL,
	    sc->sc_xbusd->xbusd_otherend, "info", &sc->sc_info, 10);
	if (err)
		panic("%s: can't read number from %s/info\n", 
		    device_xname(sc->sc_dev), sc->sc_xbusd->xbusd_otherend);
	err = xenbus_read_ul(NULL,
	    sc->sc_xbusd->xbusd_otherend, "sector-size", &sc->sc_secsize, 10);
	if (err)
		panic("%s: can't read number from %s/sector-size\n", 
		    device_xname(sc->sc_dev), sc->sc_xbusd->xbusd_otherend);

	xenbus_switch_state(sc->sc_xbusd, NULL, XenbusStateConnected);
}
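
/*
 * Interrupt handler: walk the response ring, revoke the grants of
 * each completed request, finish the corresponding buf with biodone()
 * and return the request to the free list, then call dk_iodone() so
 * queued buffers get another chance to be submitted.
 */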

static int
xbd_handler(void *arg)
{
	struct xbd_xenbus_softc *sc = arg;
	struct buf *bp;
	RING_IDX resp_prod, i;
	int more_to_do;
	int seg;

	DPRINTF(("xbd_handler(%s)\n", device_xname(sc->sc_dev)));

	if (__predict_false(sc->sc_backend_status != BLKIF_STATE_CONNECTED))
		return 0;
again:
	resp_prod = sc->sc_ring.sring->rsp_prod;
	x86_lfence(); /* ensure we see replies up to resp_prod */
	for (i = sc->sc_ring.rsp_cons; i != resp_prod; i++) {
		blkif_response_t *rep = RING_GET_RESPONSE(&sc->sc_ring, i);
		struct xbd_req *xbdreq = &sc->sc_reqs[rep->id];
		bp = xbdreq->req_bp;
		DPRINTF(("xbd_handler(%p): b_bcount = %ld\n",
		    bp, (long)bp->b_bcount));
		for (seg = xbdreq->req_nr_segments - 1; seg >= 0; seg--) {
			if (__predict_false(
			    xengnt_status(xbdreq->req_gntref[seg]))) {
				aprint_verbose_dev(sc->sc_dev,
					"grant still used by backend\n");
				sc->sc_ring.rsp_cons = i;
				xbdreq->req_nr_segments = seg + 1;
				goto done;
			}
			xengnt_revoke_access(
			    xbdreq->req_gntref[seg]);
			xbdreq->req_nr_segments--;
		}
		if (rep->operation != BLKIF_OP_READ &&
		    rep->operation != BLKIF_OP_WRITE) {
				aprint_error_dev(sc->sc_dev,
					 "bad operation %d from backend\n",
					 rep->operation);
				bp->b_error = EIO;
				bp->b_resid = bp->b_bcount;
				goto next;
		}
		if (rep->status != BLKIF_RSP_OKAY) {
				bp->b_error = EIO;
				bp->b_resid = bp->b_bcount;
				goto next;
		}
		/* b_resid was set in xbdstart */
next:
		if (bp->b_data != xbdreq->req_data)
			xbd_unmap_align(xbdreq);
		disk_unbusy(&sc->sc_dksc.sc_dkdev,
		    (bp->b_bcount - bp->b_resid),
		    (bp->b_flags & B_READ));
#if NRND > 0
		rnd_add_uint32(&sc->sc_rnd_source,
		    bp->b_blkno);
#endif
		biodone(bp);
		SLIST_INSERT_HEAD(&sc->sc_xbdreq_head, xbdreq, req_next);
	}
	x86_lfence();
	sc->sc_ring.rsp_cons = i;
	RING_FINAL_CHECK_FOR_RESPONSES(&sc->sc_ring, more_to_do);
	if (more_to_do)
		goto again;
done:
	dk_iodone(sc->sc_di, &sc->sc_dksc);
	return 1;
}
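
/*
 * Block/character device entry points: thin wrappers that look up the
 * unit's softc and hand the work to the dk(4) helpers.
 */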

int
xbdopen(dev_t dev, int flags, int fmt, struct lwp *l)
{
	struct	xbd_xenbus_softc *sc;

	sc = device_lookup_private(&xbd_cd, DISKUNIT(dev));
	if (sc == NULL)
		return (ENXIO);
	if ((flags & FWRITE) && (sc->sc_info & VDISK_READONLY))
		return EROFS;

	DPRINTF(("xbdopen(0x%04x, %d)\n", dev, flags));
	return dk_open(sc->sc_di, &sc->sc_dksc, dev, flags, fmt, l);
}

int
xbdclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	struct xbd_xenbus_softc *sc;

	sc = device_lookup_private(&xbd_cd, DISKUNIT(dev));

	DPRINTF(("xbdclose(%d, %d)\n", dev, flags));
	return dk_close(sc->sc_di, &sc->sc_dksc, dev, flags, fmt, l);
}

void
xbdstrategy(struct buf *bp)
{
	struct xbd_xenbus_softc *sc;

	sc = device_lookup_private(&xbd_cd, DISKUNIT(bp->b_dev));

	DPRINTF(("xbdstrategy(%p): b_bcount = %ld\n", bp,
	    (long)bp->b_bcount));

	if (sc == NULL || sc->sc_shutdown) {
		bp->b_error = EIO;
		biodone(bp);
		return;
	}
	if (__predict_false((sc->sc_info & VDISK_READONLY) &&
	    (bp->b_flags & B_READ) == 0)) {
		bp->b_error = EROFS;
		biodone(bp);
		return;
	}

	dk_strategy(sc->sc_di, &sc->sc_dksc, bp);
	return;
}

int
xbdsize(dev_t dev)
{
	struct	xbd_xenbus_softc *sc;

	DPRINTF(("xbdsize(%d)\n", dev));

	sc = device_lookup_private(&xbd_cd, DISKUNIT(dev));
	if (sc == NULL || sc->sc_shutdown)
		return -1;
	return dk_size(sc->sc_di, &sc->sc_dksc, dev);
}

int
xbdread(dev_t dev, struct uio *uio, int flags)
{
	struct xbd_xenbus_softc *sc = device_lookup_private(&xbd_cd, DISKUNIT(dev));
	struct  dk_softc *dksc = &sc->sc_dksc;

	if ((dksc->sc_flags & DKF_INITED) == 0)
		return ENXIO;
	return physio(xbdstrategy, NULL, dev, B_READ, minphys, uio);
}

int
xbdwrite(dev_t dev, struct uio *uio, int flags)
{
	struct xbd_xenbus_softc *sc = device_lookup_private(&xbd_cd, DISKUNIT(dev));
	struct  dk_softc *dksc = &sc->sc_dksc;

	if ((dksc->sc_flags & DKF_INITED) == 0)
		return ENXIO;
	if (__predict_false(sc->sc_info & VDISK_READONLY))
		return EROFS;
	return physio(xbdstrategy, NULL, dev, B_WRITE, minphys, uio);
}

int
xbdioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
	struct xbd_xenbus_softc *sc = device_lookup_private(&xbd_cd, DISKUNIT(dev));
	struct	dk_softc *dksc;
	int	error;
	struct	disk *dk;

	DPRINTF(("xbdioctl(%d, %08lx, %p, %d, %p)\n",
	    dev, cmd, data, flag, l));
	dksc = &sc->sc_dksc;
	dk = &dksc->sc_dkdev;

	switch (cmd) {
	case DIOCSSTRATEGY:
		error = EOPNOTSUPP;
		break;
	default:
		error = dk_ioctl(sc->sc_di, dksc, dev, cmd, data, flag, l);
		break;
	}

	return error;
}

int
xbddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
	struct xbd_xenbus_softc *sc;

	sc  = device_lookup_private(&xbd_cd, DISKUNIT(dev));
	if (sc == NULL)
		return (ENXIO);

	DPRINTF(("xbddump(%d, %" PRId64 ", %p, %lu)\n", dev, blkno, va,
	    (unsigned long)size));
	return dk_dump(sc->sc_di, &sc->sc_dksc, dev, blkno, va, size);
}
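
/*
 * Called by the dk(4) queueing code to submit one buf to the backend.
 * The transfer is split into physical segments of at most one page
 * each; every page is granted to the backend and described by a
 * first_sect/last_sect pair in XEN_BSIZE units.  Returning -1 leaves
 * the buf on the queue so it is retried once ring slots or request
 * structures become available again.
 */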

static int
xbdstart(struct dk_softc *dksc, struct buf *bp)
{
	struct xbd_xenbus_softc *sc;
	struct xbd_req *xbdreq;
	blkif_request_t *req;
	int ret = 0, runqueue = 1;
	size_t bcount, off;
	paddr_t ma;
	vaddr_t va;
	int nsects, nbytes, seg;
	int notify;

	DPRINTF(("xbdstart(%p): b_bcount = %ld\n", bp, (long)bp->b_bcount));

	sc = device_lookup_private(&xbd_cd, DISKUNIT(bp->b_dev));
	if (sc == NULL || sc->sc_shutdown) {
		bp->b_error = EIO;
		goto err;
	}

	if (bp->b_rawblkno < 0 || bp->b_rawblkno > sc->sc_xbdsize) {
		/* invalid block number */
		bp->b_error = EINVAL;
		goto err;
	}

	if (bp->b_rawblkno == sc->sc_xbdsize) {
		/* at end of disk; return short read */
		bp->b_resid = bp->b_bcount;
		biodone(bp);
		return 0;
	}
		

	if (RING_FULL(&sc->sc_ring)) {
		DPRINTF(("xbdstart: ring_full\n"));
		ret = -1;
		goto out;
	}

	dksc = &sc->sc_dksc;

	xbdreq = SLIST_FIRST(&sc->sc_xbdreq_head);
	if (__predict_false(xbdreq == NULL)) {
		DPRINTF(("xbdstart: no req\n"));
		ret = -1; /* dk_start should not remove bp from queue */
		goto out;
	}

	xbdreq->req_bp = bp;
	xbdreq->req_data = bp->b_data;
	if ((vaddr_t)bp->b_data & (XEN_BSIZE - 1)) {
		if (__predict_false(xbd_map_align(xbdreq) != 0)) {
			ret = -1;
			goto out;
		}
	}
	/* now we're sure we'll send this buf */
	disk_busy(&dksc->sc_dkdev);
	SLIST_REMOVE_HEAD(&sc->sc_xbdreq_head, req_next);
	req = RING_GET_REQUEST(&sc->sc_ring, sc->sc_ring.req_prod_pvt);
	req->id = xbdreq->req_id;
	req->operation = bp->b_flags & B_READ ? BLKIF_OP_READ : BLKIF_OP_WRITE;
	req->sector_number = bp->b_rawblkno;
	req->handle = sc->sc_handle;

	va = (vaddr_t)xbdreq->req_data & ~PAGE_MASK;
	off = (vaddr_t)xbdreq->req_data & PAGE_MASK;
	if (bp->b_rawblkno + bp->b_bcount / DEV_BSIZE >= sc->sc_xbdsize) {
		bcount = (sc->sc_xbdsize - bp->b_rawblkno) * DEV_BSIZE;
		bp->b_resid = bp->b_bcount - bcount;
	} else {
		bcount = bp->b_bcount;
		bp->b_resid = 0;
	}
	for (seg = 0; bcount > 0;) {
		pmap_extract_ma(pmap_kernel(), va, &ma);
		KASSERT((ma & (XEN_BSIZE - 1)) == 0);
		if (bcount > PAGE_SIZE - off)
			nbytes = PAGE_SIZE - off;
		else
			nbytes = bcount;
		nsects = nbytes >> XEN_BSHIFT;
		req->seg[seg].first_sect = off >> XEN_BSHIFT;
		req->seg[seg].last_sect = (off >> XEN_BSHIFT) + nsects - 1;
		KASSERT(req->seg[seg].first_sect <= req->seg[seg].last_sect);
		KASSERT(req->seg[seg].last_sect < 8);
		if (__predict_false(xengnt_grant_access(
		    sc->sc_xbusd->xbusd_otherend_id, ma,
		    (bp->b_flags & B_READ) == 0, &xbdreq->req_gntref[seg])))
			panic("xbdstart: xengnt_grant_access"); /* XXX XXX !!! */
		req->seg[seg].gref = xbdreq->req_gntref[seg];
		seg++;
		KASSERT(seg <= BLKIF_MAX_SEGMENTS_PER_REQUEST);
		va += PAGE_SIZE;
		off = 0;
		bcount -= nbytes;
	}
	xbdreq->req_nr_segments = req->nr_segments = seg;
	sc->sc_ring.req_prod_pvt++;
	if (BUFQ_PEEK(sc->sc_dksc.sc_bufq)) {
		 /* we will be called again; don't notify guest yet */
		runqueue = 0;
	}

out:
	if (runqueue) {
		RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&sc->sc_ring, notify);
		if (notify)
			hypervisor_notify_via_evtchn(sc->sc_evtchn);
	}
	return ret;

err:
	bp->b_resid = bp->b_bcount;
	biodone(bp);
	return 0;
}
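
/*
 * The blkif segment descriptors can only address XEN_BSIZE-aligned
 * offsets within a page, so when a buf's data is not aligned, bounce
 * it through a temporary buffer allocated from kmem_map;
 * xbd_unmap_align() copies the data back for reads and releases the
 * bounce buffer on completion.
 */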

static int
xbd_map_align(struct xbd_req *req)
{
	int s = splvm();

	req->req_data = (void *)uvm_km_alloc(kmem_map, req->req_bp->b_bcount,
	    PAGE_SIZE, UVM_KMF_WIRED | UVM_KMF_NOWAIT);
	splx(s);
	if (__predict_false(req->req_data == NULL))
		return ENOMEM;
	if ((req->req_bp->b_flags & B_READ) == 0)
		memcpy(req->req_data, req->req_bp->b_data,
		    req->req_bp->b_bcount);
	return 0;
}

static void
xbd_unmap_align(struct xbd_req *req)
{
	int s;
	if (req->req_bp->b_flags & B_READ)
		memcpy(req->req_bp->b_data, req->req_data,
		    req->req_bp->b_bcount);
	s = splvm();
	uvm_km_free(kmem_map, (vaddr_t)req->req_data, req->req_bp->b_bcount,
	    UVM_KMF_WIRED);
	splx(s);
}