/*      $NetBSD: xbdback.c,v 1.34 2008/10/21 15:46:32 cegger Exp $      */

/*
 * Copyright (c) 2005 Manuel Bouyer.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by Manuel Bouyer.
 * 4. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: xbdback.c,v 1.34 2008/10/21 15:46:32 cegger Exp $");

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/queue.h>
#include <sys/kernel.h>
#include <sys/conf.h>
#include <sys/disk.h>
#include <sys/disklabel.h>
#include <sys/fcntl.h>
#include <sys/vnode.h>
#include <sys/kauth.h>
#include <sys/workqueue.h>

#include <machine/pmap.h>
#include <xen/hypervisor.h>
#include <xen/xen.h>
#include <xen/evtchn.h>
#include <xen/ctrl_if.h>
#include <xen/xen_shm.h>

#ifdef XENDEBUG_VBD
#define XENPRINTF(x) printf x
#else
#define XENPRINTF(x)
#endif

#define u16 uint16_t

/*
 * Backend block device driver for Xen.
 */

/* Max number of pages per request.  The request may not be page aligned. */
#define BLKIF_MAX_PAGES_PER_REQUEST (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1)

/* Values are expressed in 512-byte sectors. */
#define VBD_BSIZE 512
#define VBD_MAXSECT ((PAGE_SIZE / VBD_BSIZE) - 1)

struct xbd_vbd;
struct xbdback_request;
struct xbdback_io;
struct xbdback_fragment;
struct xbdback_instance;

/* state of a xbdback instance */
typedef enum {CONNECTED, DISCONNECTING, DISCONNECTED} xbdback_state_t;

/*
 * Since there are a variety of conditions that can block our I/O
 * processing, which isn't allowed to suspend its thread's execution,
 * such things will be done in a sort of continuation-passing style.
 *
 * The return value is NULL to indicate that execution has blocked; if
 * it's finished, set xbdi->cont (see below) to NULL and the return
 * value doesn't matter.  Otherwise, the return value is passed as the
 * second parameter to the new value of xbdi->cont.
 */
typedef void *(*xbdback_cont_t)(struct xbdback_instance *, void *);
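/*
 * Illustrative sketch only (not part of the driver): a hypothetical
 * two-step continuation chain as driven by xbdback_trampoline() below.
 * The names example_step1/example_step2 are invented for the example.
 *
 *	static void *
 *	example_step1(struct xbdback_instance *xbdi, void *obj)
 *	{
 *		xbdi->cont = example_step2;	// who runs next
 *		return xbdback_pool_get(...);	// NULL => blocked; the
 *						// trampoline simply stops
 *	}
 *
 *	static void *
 *	example_step2(struct xbdback_instance *xbdi, void *obj)
 *	{
 *		// obj is whatever step1 returned (here, the pool item)
 *		xbdi->cont = NULL;		// chain finished
 *		return xbdi;			// return value now ignored
 *	}
 *
 * When the pool item later becomes available, xbdback_pool_put() hands
 * it back to xbdback_trampoline(), which resumes at xbdi->cont.
 */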
/* we keep the xbdback instances in a linked list */
struct xbdback_instance {
	SLIST_ENTRY(xbdback_instance) next;
	domid_t domid;		/* attached to this domain */
	uint32_t handle;	/* domain-specific handle */
	volatile xbdback_state_t status;
	/* parameters for the communication */
	unsigned int evtchn;
	paddr_t ma_ring;
	/* private parameters for communication */
	blkif_ring_t *blk_ring;
	BLKIF_RING_IDX resp_prod; /* our current reply index */
	BLKIF_RING_IDX req_cons; /* our current request index */
	/* disconnection must be postponed until all I/O is done */
	volatile unsigned refcnt;
	uint8_t disconnect_rspid; /* request id of the disconnect request */
	/*
	 * State for I/O processing/coalescing follows; this has to
	 * live here instead of on the stack because of the
	 * continuation-ness (see above).
	 */
	BLKIF_RING_IDX req_prod; /* limit on request indices */
	xbdback_cont_t cont, cont_aux;
	SIMPLEQ_ENTRY(xbdback_instance) on_hold; /* waiting on resources */

	/* _request state */
	struct xbdback_request *req; /* if NULL, ignore following */
	blkif_request_t *xen_req;
	int segno;
	struct xbd_vbd *req_vbd;

	/* _io state */
	struct xbdback_io *io; /* if NULL, ignore next field */
	daddr_t next_sector;
	unsigned long last_fas, this_fas;

	/* other state */
	int same_page; /* are we merging two segments on the same page? */
	SLIST_HEAD(, xbd_vbd) vbds; /* list of virtual block devices */
};

/* Manipulation of the above reference count. */
/* XXXjld@panix.com: not MP-safe, and move the i386 asm elsewhere. */
#define xbdi_get(xbdip) (++(xbdip)->refcnt)
#define xbdi_put(xbdip)						\
do {								\
	__asm volatile("decl %0"				\
	    : "=m"((xbdip)->refcnt) : "m"((xbdip)->refcnt));	\
	if (0 == (xbdip)->refcnt)				\
		xbdback_finish_disconnect(xbdip);		\
} while (/* CONSTCOND */ 0)

/* each xbdback instance has a list of vbds associated with it */
struct xbd_vbd {
	SLIST_ENTRY(xbd_vbd) next;
	blkif_vdev_t vdev; /* interface-specific ID */
	int flags;
#define VBD_F_RO 0x01 /* device is read-only */
	int type; /* VDISK_TYPE_foo */
	/* for now we allow only one extent per vbd */
	dev_t dev; /* underlying device */
	const struct bdevsw *bdevsw; /* pointer to the device's bdevsw */
	struct vnode *vp;
	int start;
	int size;
};

SLIST_HEAD(, xbdback_instance) xbdback_instances;
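/*
 * Illustrative sketch only (not part of the driver): how the reference
 * count pairs up.  The instance starts at refcnt == 1 (set in
 * CMSG_BLKIF_BE_CREATE); each in-flight xbdback_io takes one more
 * reference (xbdi_get() in xbdback_co_io_gotio()) and drops it when
 * the I/O completes (xbdi_put() in xbdback_iodone()).  The disconnect
 * path drops the initial reference, so xbdback_finish_disconnect()
 * runs only once the last I/O is done:
 *
 *	xbdi_get(xbdi);		// I/O issued
 *	...
 *	xbdi_put(xbdi);		// I/O done; may trigger the
 *				// deferred disconnect
 */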
/*
 * For each request from a guest, a xbdback_request is allocated from
 * a pool.  This will describe the request until completion.  The
 * request may require multiple I/O operations to perform, so the
 * per-I/O information is not stored here.
 */
struct xbdback_request {
	struct xbdback_instance *rq_xbdi; /* our xbd instance */
	uint8_t rq_operation;
	unsigned long rq_id;
	int rq_iocount; /* reference count; or, number of outstanding I/O's */
	int rq_ioerrs;
};

/*
 * For each I/O operation associated with one of those requests, an
 * xbdback_io is allocated from a pool.  It may correspond to multiple
 * Xen disk requests, or parts of them, if several arrive at once that
 * can be coalesced.
 */
struct xbdback_io {
	struct work xio_work;
	struct buf xio_buf; /* our I/O */
	/* The instance pointer is duplicated for convenience. */
	struct xbdback_instance *xio_xbdi; /* our xbd instance */
	SLIST_HEAD(, xbdback_fragment) xio_rq; /* xbd requests involved */
	vaddr_t xio_vaddr; /* the virtual address to map the request at */
	paddr_t xio_ma[XENSHM_MAX_PAGES_PER_REQUEST]; /* guest pages to map */
	uint16_t xio_nrma; /* number of guest pages */
	uint16_t xio_mapped;
};

/*
 * Rather than have the xbdback_io keep an array of the
 * xbdback_requests involved, since the actual number will probably be
 * small but might be as large as BLKIF_RING_SIZE, use a list.  This
 * would be threaded through xbdback_request, but one of them might be
 * part of multiple I/O's, alas.
 */
struct xbdback_fragment {
	struct xbdback_request *car;
	SLIST_ENTRY(xbdback_fragment) cdr;
};
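/*
 * Illustrative layout only (not part of the driver): one coalesced
 * xbdback_io whose fragment list names the two guest requests it
 * carries data for.  Completion walks this list and finalizes each
 * xbdback_request whose rq_iocount drops to zero.
 *
 *	xbdback_io.xio_rq --> fragment{car = request A}
 *	                  --> fragment{car = request B}
 *
 * Request A may simultaneously appear on another xbdback_io's list,
 * which is why the link lives in the fragment and not in the request.
 */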
/*
 * Wrap our pools with a chain of xbdback_instances whose I/O
 * processing has blocked for want of memory from that pool.
 */
struct xbdback_pool {
	struct pool p;
	SIMPLEQ_HEAD(xbdback_iqueue, xbdback_instance) q;
	struct timeval last_warning;
} xbdback_request_pool, xbdback_io_pool, xbdback_fragment_pool;

static struct xbdback_iqueue xbdback_shmq;
static int xbdback_shmcb; /* have we already registered a callback? */

struct timeval xbdback_poolsleep_intvl = { 5, 0 };
#ifdef DEBUG
struct timeval xbdback_fragio_intvl = { 60, 0 };
#endif

static void xbdback_ctrlif_rx(ctrl_msg_t *, unsigned long);
static int  xbdback_evthandler(void *);
static void xbdback_finish_disconnect(struct xbdback_instance *);

static struct xbdback_instance *xbdif_lookup(domid_t, uint32_t);
static struct xbd_vbd *vbd_lookup(struct xbdback_instance *, blkif_vdev_t);

static void *xbdback_co_main(struct xbdback_instance *, void *);
static void *xbdback_co_main_loop(struct xbdback_instance *, void *);
static void *xbdback_co_main_incr(struct xbdback_instance *, void *);
static void *xbdback_co_main_done(struct xbdback_instance *, void *);
static void *xbdback_co_main_done2(struct xbdback_instance *, void *);

static void *xbdback_co_io(struct xbdback_instance *, void *);
static void *xbdback_co_io_gotreq(struct xbdback_instance *, void *);
static void *xbdback_co_io_loop(struct xbdback_instance *, void *);
static void *xbdback_co_io_gotio(struct xbdback_instance *, void *);
static void *xbdback_co_io_gotio2(struct xbdback_instance *, void *);
static void *xbdback_co_io_gotfrag(struct xbdback_instance *, void *);
static void *xbdback_co_io_gotfrag2(struct xbdback_instance *, void *);

static void *xbdback_co_flush(struct xbdback_instance *, void *);
static void *xbdback_co_flush_done(struct xbdback_instance *, void *);

static void *xbdback_co_probe(struct xbdback_instance *, void *);
static void *xbdback_co_probe_gotio(struct xbdback_instance *, void *);
static void *xbdback_co_probe_gotvm(struct xbdback_instance *, void *);

static int  xbdback_shm_callback(void *);
static void xbdback_io_error(struct xbdback_io *, int);
static void xbdback_do_io(struct work *, void *);
static void xbdback_iodone(struct buf *);
static void xbdback_send_reply(struct xbdback_instance *, int, int, int);

static void *xbdback_map_shm(struct xbdback_io *);
static void xbdback_unmap_shm(struct xbdback_io *);

static void *xbdback_pool_get(struct xbdback_pool *,
    struct xbdback_instance *);
static void xbdback_pool_put(struct xbdback_pool *, void *);
static void xbdback_trampoline(struct xbdback_instance *, void *);

struct workqueue *xbdback_workqueue;

void
xbdback_init(void)
{
	ctrl_msg_t cmsg;
	blkif_be_driver_status_t st;

	if (!xendomain_is_dom0() &&
	    !(xen_start_info.flags & SIF_BLK_BE_DOMAIN))
		return;

	XENPRINTF(("xbdback_init\n"));

	/*
	 * Initialize the backend driver, register the control message
	 * handler, and send the driver-up message.
	 */
	SLIST_INIT(&xbdback_instances);
	SIMPLEQ_INIT(&xbdback_shmq);
	xbdback_shmcb = 0;
	pool_init(&xbdback_request_pool.p, sizeof(struct xbdback_request),
	    0, 0, 0, "xbbrp", NULL, IPL_BIO);
	SIMPLEQ_INIT(&xbdback_request_pool.q);
	pool_init(&xbdback_io_pool.p, sizeof(struct xbdback_io),
	    0, 0, 0, "xbbip", NULL, IPL_BIO);
	SIMPLEQ_INIT(&xbdback_io_pool.q);
	pool_init(&xbdback_fragment_pool.p, sizeof(struct xbdback_fragment),
	    0, 0, 0, "xbbfp", NULL, IPL_BIO);
	SIMPLEQ_INIT(&xbdback_fragment_pool.q);
	/* we allocate enough to handle a whole ring at once */
	if (pool_prime(&xbdback_request_pool.p, BLKIF_RING_SIZE) != 0)
		printf("xbdback: failed to prime request pool\n");
	if (pool_prime(&xbdback_io_pool.p, BLKIF_RING_SIZE) != 0)
		printf("xbdback: failed to prime io pool\n");
	if (pool_prime(&xbdback_fragment_pool.p,
	    BLKIF_MAX_SEGMENTS_PER_REQUEST * BLKIF_RING_SIZE) != 0)
		printf("xbdback: failed to prime fragment pool\n");
	if (workqueue_create(&xbdback_workqueue, "xbdbackd",
	    xbdback_do_io, NULL, PRI_BIO, IPL_BIO, 0))
		printf("xbdback: failed to init workqueue\n");

	(void)ctrl_if_register_receiver(CMSG_BLKIF_BE, xbdback_ctrlif_rx,
	    CALLBACK_IN_BLOCKING_CONTEXT);

	cmsg.type = CMSG_BLKIF_BE;
	cmsg.subtype = CMSG_BLKIF_BE_DRIVER_STATUS;
	cmsg.length = sizeof(blkif_be_driver_status_t);
	st.status = BLKIF_DRIVER_STATUS_UP;
	memcpy(cmsg.msg, &st, sizeof(st));
	ctrl_if_send_message_block(&cmsg, NULL, 0, 0);
}

static void
xbdback_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
{
	struct xbdback_instance *xbdi;
	struct xbd_vbd *vbd;
	int error;

	XENPRINTF(("xbdback msg %d\n", msg->subtype));
	switch (msg->subtype) {
	case CMSG_BLKIF_BE_CREATE:
	{
		blkif_be_create_t *req = (blkif_be_create_t *)&msg->msg[0];
		if (msg->length != sizeof(blkif_be_create_t))
			goto error;
		if (xbdif_lookup(req->domid, req->blkif_handle) != NULL) {
			req->status = BLKIF_BE_STATUS_INTERFACE_EXISTS;
			goto end;
		}
		xbdi = malloc(sizeof(struct xbdback_instance), M_DEVBUF,
		    M_NOWAIT | M_ZERO);
		if (xbdi == NULL) {
			req->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
			goto end;
		}
		xbdi->domid = req->domid;
		xbdi->handle = req->blkif_handle;
		xbdi->status = DISCONNECTED;
		xbdi->refcnt = 1;
		xbdi->blk_ring = NULL;
		SLIST_INIT(&xbdi->vbds);
		SLIST_INSERT_HEAD(&xbdback_instances, xbdi, next);
		req->status = BLKIF_BE_STATUS_OKAY;
		break;
	}
	case CMSG_BLKIF_BE_DESTROY:
	{
		blkif_be_destroy_t *req = (blkif_be_destroy_t *)&msg->msg[0];
		if (msg->length != sizeof(blkif_be_destroy_t))
			goto error;
		xbdi = xbdif_lookup(req->domid, req->blkif_handle);
		if (xbdi == NULL) {
			req->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
			goto end;
		}
		if (xbdi->status != DISCONNECTED) {
			req->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED;
			goto end;
		}
		if (xbdi->blk_ring != NULL)
			uvm_km_free(kernel_map, (vaddr_t)xbdi->blk_ring,
			    PAGE_SIZE, UVM_KMF_VAONLY);
		SLIST_REMOVE(&xbdback_instances, xbdi, xbdback_instance,
		    next);
		free(xbdi, M_DEVBUF);
		req->status = BLKIF_BE_STATUS_OKAY;
		break;
	}
	case CMSG_BLKIF_BE_CONNECT:
	{
		blkif_be_connect_t *req = (blkif_be_connect_t *)&msg->msg[0];
		vaddr_t ring_addr;
		char evname[16];

		if (msg->length != sizeof(blkif_be_connect_t))
			goto error;
		xbdi = xbdif_lookup(req->domid, req->blkif_handle);
		if (xbdi == NULL) {
			req->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
			goto end;
		}
		if (xbdi->status != DISCONNECTED) {
			req->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED;
			goto end;
		}
		if (xbdi->blk_ring == NULL) {
			ring_addr = uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
			    UVM_KMF_VAONLY);
			if (ring_addr == 0) {
				req->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
				goto end;
			}
		} else {
			ring_addr = (vaddr_t)xbdi->blk_ring;
			xbdi->blk_ring = NULL;
		}
		xbdi->ma_ring = req->shmem_frame << PAGE_SHIFT;
		error = pmap_enter_ma(pmap_kernel(), ring_addr, xbdi->ma_ring,
		    0, VM_PROT_READ | VM_PROT_WRITE,
		    PMAP_WIRED | PMAP_CANFAIL, req->domid);
		if (error) {
			uvm_km_free(kernel_map, ring_addr, PAGE_SIZE,
			    UVM_KMF_VAONLY);
			if (error == ENOMEM)
				req->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
			else if (error == EFAULT)
				req->status = BLKIF_BE_STATUS_MAPPING_ERROR;
			else
				req->status = BLKIF_BE_STATUS_ERROR;
			goto end;
		}
		xbdi->blk_ring = (void *)ring_addr;
		xbdi->evtchn = req->evtchn;
		snprintf(evname, sizeof(evname), "xbdback%d", xbdi->domid);
		event_set_handler(xbdi->evtchn, xbdback_evthandler,
		    xbdi, IPL_BIO, evname);
		printf("xbd backend %d for domain %d using event channel "
		    "%d\n", xbdi->handle, xbdi->domid, xbdi->evtchn);
		hypervisor_enable_event(xbdi->evtchn);
		xbdi->status = CONNECTED;
		req->status = BLKIF_BE_STATUS_OKAY;
		break;
	}
	case CMSG_BLKIF_BE_DISCONNECT:
	{
		blkif_be_disconnect_t *req =
		    (blkif_be_disconnect_t *)&msg->msg[0];
		int s;

		if (msg->length != sizeof(blkif_be_disconnect_t))
			goto error;
		xbdi = xbdif_lookup(req->domid, req->blkif_handle);
		if (xbdi == NULL) {
			req->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
			goto end;
		}
		hypervisor_mask_event(xbdi->evtchn);
		event_remove_handler(xbdi->evtchn, xbdback_evthandler, xbdi);
		xbdi->status = DISCONNECTING;
		xbdi->disconnect_rspid = msg->id;
		s = splbio();
		xbdi_put(xbdi);
		splx(s);
		return;
	}
	case CMSG_BLKIF_BE_VBD_CREATE:
	{
		blkif_be_vbd_create_t *req =
		    (blkif_be_vbd_create_t *)&msg->msg[0];
		if (msg->length != sizeof(blkif_be_vbd_create_t))
			goto error;
		xbdi = xbdif_lookup(req->domid, req->blkif_handle);
		if (xbdi == NULL) {
			req->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
			goto end;
		}
		vbd = malloc(sizeof(struct xbd_vbd), M_DEVBUF,
		    M_NOWAIT | M_ZERO);
		if (vbd == NULL) {
			req->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
			goto end;
		}
		vbd->vdev = req->vdevice;
		if (req->readonly)
			vbd->flags |= VBD_F_RO;
		SLIST_INSERT_HEAD(&xbdi->vbds, vbd, next);
		req->status = BLKIF_BE_STATUS_OKAY;
		break;
	}
	case CMSG_BLKIF_BE_VBD_DESTROY:
	{
		blkif_be_vbd_destroy_t *req =
		    (blkif_be_vbd_destroy_t *)&msg->msg[0];
		if (msg->length != sizeof(blkif_be_vbd_destroy_t))
			goto error;
		xbdi = xbdif_lookup(req->domid, req->blkif_handle);
		if (xbdi == NULL) {
			req->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
			goto end;
		}
		vbd = vbd_lookup(xbdi, req->vdevice);
		if (vbd == NULL) {
			req->status = BLKIF_BE_STATUS_VBD_NOT_FOUND;
			goto end;
		}
		if (vbd->size) {
			printf("xbd backend: detach device %s%d%c "
			    "for domain %d\n",
			    devsw_blk2name(major(vbd->dev)),
			    DISKUNIT(vbd->dev), DISKPART(vbd->dev) + 'a',
			    xbdi->domid);
			vbd->start = vbd->size = vbd->dev = 0;
			vn_close(vbd->vp, FREAD, NOCRED);
		}
		SLIST_REMOVE(&xbdi->vbds, vbd, xbd_vbd, next);
		free(vbd, M_DEVBUF);
		req->status = BLKIF_BE_STATUS_OKAY;
		break;
	}
	case CMSG_BLKIF_BE_VBD_GROW:
	{
		blkif_be_vbd_grow_t *req =
		    (blkif_be_vbd_grow_t *)&msg->msg[0];
		const char *devname;
		int major;
		struct partinfo dpart;

		if (msg->length != sizeof(blkif_be_vbd_grow_t))
			goto error;
		xbdi = xbdif_lookup(req->domid, req->blkif_handle);
		if (xbdi == NULL) {
			req->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
			goto end;
		}
		vbd = vbd_lookup(xbdi, req->vdevice);
		if (vbd == NULL) {
			req->status = BLKIF_BE_STATUS_VBD_NOT_FOUND;
			goto end;
		}
		if (vbd->size != 0) {
			req->status = BLKIF_BE_STATUS_VBD_EXISTS;
			goto end;
		}
		if (req->extent.sector_start != 0) {
			req->status = BLKIF_BE_STATUS_EXTENT_NOT_FOUND;
			goto end;
		}
		major = major(req->extent.device);
		devname = devsw_blk2name(major);
		if (devname == NULL) {
			printf("xbdback VBD grow domain %d: unknown device "
			    "0x%x\n", xbdi->domid, req->extent.device);
			req->status = BLKIF_BE_STATUS_EXTENT_NOT_FOUND;
			goto end;
		}
		vbd->dev = req->extent.device;
		vbd->bdevsw = bdevsw_lookup(vbd->dev);
		if (vbd->bdevsw == NULL) {
			printf("xbdback VBD grow domain %d: no bdevsw for "
			    "device 0x%x\n", xbdi->domid, req->extent.device);
			req->status = BLKIF_BE_STATUS_EXTENT_NOT_FOUND;
			goto end;
		}
		error = bdevvp(vbd->dev, &vbd->vp);
		if (error) {
			printf("xbdback VBD grow domain %d: can't open "
			    "device 0x%x (error %d)\n", xbdi->domid,
			    req->extent.device, error);
			req->status = BLKIF_BE_STATUS_EXTENT_NOT_FOUND;
			goto end;
		}
		error = vn_lock(vbd->vp, LK_EXCLUSIVE | LK_RETRY);
		if (error) {
			printf("xbdback VBD grow domain %d: can't lock "
			    "device 0x%x (error %d)\n", xbdi->domid,
			    req->extent.device, error);
			vrele(vbd->vp);
			req->status = BLKIF_BE_STATUS_EXTENT_NOT_FOUND;
			goto end;
		}
		error = VOP_OPEN(vbd->vp, FREAD, NOCRED);
		if (error) {
			printf("xbdback VBD grow domain %d: can't open2 "
			    "device 0x%x (error %d)\n", xbdi->domid,
			    req->extent.device, error);
			vput(vbd->vp);
			req->status = BLKIF_BE_STATUS_EXTENT_NOT_FOUND;
			goto end;
		}
		VOP_UNLOCK(vbd->vp, 0);
		error = VOP_IOCTL(vbd->vp, DIOCGPART, &dpart, FREAD, 0);
		if (error) {
			printf("xbdback VBD grow domain %d: can't ioctl "
			    "device 0x%x (error %d)\n", xbdi->domid,
			    req->extent.device, error);
			vbd->start = vbd->size = vbd->dev = 0;
			vn_close(vbd->vp, FREAD, NOCRED);
			vbd->vp = NULL;
			req->status = BLKIF_BE_STATUS_EXTENT_NOT_FOUND;
			goto end;
		}
		vbd->size = req->extent.sector_length * (512 / DEV_BSIZE);
		/*
		 * Clamp to the size of the underlying partition.  Note
		 * the stray semicolon that used to terminate this `if'
		 * made the clamp unconditional; it has been removed.
		 */
		if (vbd->size == 0 || vbd->size > dpart.part->p_size)
			vbd->size = dpart.part->p_size;
		printf("xbd backend: attach device %s%d%c (size %d) "
		    "for domain %d\n", devname, DISKUNIT(vbd->dev),
		    DISKPART(vbd->dev) + 'a', vbd->size, xbdi->domid);
		req->status = BLKIF_BE_STATUS_OKAY;
		break;
	}
	case CMSG_BLKIF_BE_VBD_SHRINK:
	{
		blkif_be_vbd_shrink_t *req =
		    (blkif_be_vbd_shrink_t *)&msg->msg[0];
		if (msg->length != sizeof(blkif_be_vbd_shrink_t))
			goto error;
		xbdi = xbdif_lookup(req->domid, req->blkif_handle);
		if (xbdi == NULL) {
			req->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
			goto end;
		}
		vbd = vbd_lookup(xbdi, req->vdevice);
		if (vbd == NULL) {
			req->status = BLKIF_BE_STATUS_VBD_NOT_FOUND;
			goto end;
		}
		if (vbd->size == 0) {
			req->status = BLKIF_BE_STATUS_VBD_NOT_FOUND;
			goto end;
		}
		printf("xbd backend: detach device %s%d%c for domain %d\n",
		    devsw_blk2name(major(vbd->dev)), DISKUNIT(vbd->dev),
		    DISKPART(vbd->dev) + 'a', xbdi->domid);
		vbd->start = vbd->size = vbd->dev = 0;
		vn_close(vbd->vp, FREAD, NOCRED);
		req->status = BLKIF_BE_STATUS_OKAY;
		break;
	}
	default:
error:
		printf("xbdback: wrong message subtype %d len %d\n",
		    msg->subtype, msg->length);
		msg->length = 0;
	}
end:
	XENPRINTF(("xbdback msg rep size %d\n", msg->length));
	ctrl_if_send_response(msg);
	return;
}

static void
xbdback_finish_disconnect(struct xbdback_instance *xbdi)
{
	ctrl_msg_t cmsg;
	blkif_be_disconnect_t *pdisc = (blkif_be_disconnect_t *)&cmsg.msg;
	vaddr_t ring_addr;

	KASSERT(xbdi->status == DISCONNECTING);

	ring_addr = (vaddr_t)xbdi->blk_ring;
	pmap_remove(pmap_kernel(), ring_addr, ring_addr + PAGE_SIZE);
	xbdi->status = DISCONNECTED;

	memset(&cmsg, 0, sizeof(cmsg));
	cmsg.type = CMSG_BLKIF_BE;
	cmsg.subtype = CMSG_BLKIF_BE_DISCONNECT;
	cmsg.id = xbdi->disconnect_rspid;
	cmsg.length = sizeof(blkif_be_disconnect_t);
	pdisc->domid = xbdi->domid;
	pdisc->blkif_handle = xbdi->handle;
	pdisc->status = BLKIF_BE_STATUS_OKAY;

	ctrl_if_send_response(&cmsg);
}
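/*
 * Illustrative sequence only (not part of the driver): one plausible
 * order for the control messages a backend instance sees over its
 * lifetime, all dispatched through xbdback_ctrlif_rx() above:
 *
 *	CMSG_BLKIF_BE_CREATE      allocate the xbdback_instance
 *	CMSG_BLKIF_BE_VBD_CREATE  attach a virtual block device
 *	CMSG_BLKIF_BE_VBD_GROW    give the vbd its (single) extent
 *	CMSG_BLKIF_BE_CONNECT     map the ring, bind the event channel
 *	... guest I/O ...
 *	CMSG_BLKIF_BE_DISCONNECT  deferred until the last xbdi_put()
 *	CMSG_BLKIF_BE_DESTROY     free the instance
 */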
static struct xbdback_instance *
xbdif_lookup(domid_t dom, uint32_t handle)
{
	struct xbdback_instance *xbdi;

	SLIST_FOREACH(xbdi, &xbdback_instances, next) {
		if (xbdi->domid == dom && xbdi->handle == handle)
			return xbdi;
	}
	return NULL;
}

static struct xbd_vbd *
vbd_lookup(struct xbdback_instance *xbdi, blkif_vdev_t vdev)
{
	struct xbd_vbd *vbd;

	SLIST_FOREACH(vbd, &xbdi->vbds, next) {
		if (vbd->vdev == vdev)
			return vbd;
	}
	return NULL;
}

static int
xbdback_evthandler(void *arg)
{
	struct xbdback_instance *xbdi = arg;

	if (xbdi->cont == NULL) {
		xbdi->cont = xbdback_co_main;
		xbdback_trampoline(xbdi, xbdi);
	}
	return 1;
}

static void *
xbdback_co_main(struct xbdback_instance *xbdi, void *obj)
{
	(void)obj;
	xbdi->req_prod = xbdi->blk_ring->req_prod;
	x86_lfence(); /* ensure we see all requests up to req_prod */
	/*
	 * Note that we may eventually be handed a full ring of requests;
	 * in that case MASK_BLKIF_IDX(req_cons) == MASK_BLKIF_IDX(req_prod).
	 */
	xbdi->cont = xbdback_co_main_loop;
	return xbdi;
}

static void *
xbdback_co_main_loop(struct xbdback_instance *xbdi, void *obj)
{
	blkif_request_t *req;

	(void)obj;
	if (xbdi->req_cons != xbdi->req_prod) {
		req = xbdi->xen_req = &xbdi->blk_ring->ring[
		    MASK_BLKIF_IDX(xbdi->req_cons)].req;
		XENPRINTF(("xbdback op %d req_cons 0x%x req_prod 0x%x/0x%x "
		    "resp_prod 0x%x/0x%x\n", req->operation,
		    xbdi->req_cons, xbdi->req_prod,
		    MASK_BLKIF_IDX(xbdi->req_prod),
		    xbdi->blk_ring->resp_prod,
		    MASK_BLKIF_IDX(xbdi->blk_ring->resp_prod)));
		switch (req->operation) {
		case BLKIF_OP_PROBE:
			xbdi->cont = xbdback_co_probe;
			break;
		case BLKIF_OP_READ:
		case BLKIF_OP_WRITE:
			xbdi->cont = xbdback_co_io;
			break;
		default:
			printf("xbdback_evthandler domain %d: unknown "
			    "operation %d\n", xbdi->domid, req->operation);
			xbdback_send_reply(xbdi, req->id, req->operation,
			    BLKIF_RSP_ERROR);
			xbdi->cont = xbdback_co_main_incr;
			break;
		}
	} else {
		xbdi->cont = xbdback_co_main_done;
	}
	return xbdi;
}

static void *
xbdback_co_main_incr(struct xbdback_instance *xbdi, void *obj)
{
	(void)obj;
	xbdi->req_cons++;
	xbdi->cont = xbdback_co_main_loop;
	return xbdi;
}

static void *
xbdback_co_main_done(struct xbdback_instance *xbdi, void *obj)
{
	(void)obj;
	if (xbdi->io != NULL) {
		xbdi->cont = xbdback_co_flush;
		xbdi->cont_aux = xbdback_co_main_done2;
	} else {
		xbdi->cont = xbdback_co_main_done2;
	}
	return xbdi;
}

static void *
xbdback_co_main_done2(struct xbdback_instance *xbdi, void *obj)
{
	if (xbdi->req_prod == xbdi->blk_ring->req_prod) {
		xbdi->cont = NULL;
	} else {
		xbdi->cont = xbdback_co_main;
	}
	return xbdi;
}

static void *
xbdback_co_io(struct xbdback_instance *xbdi, void *obj)
{
	int error;

	(void)obj;
	if (xbdi->xen_req->nr_segments < 1 ||
	    xbdi->xen_req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST) {
		printf("xbdback_io domain %d: %d segments\n",
		    xbdi->domid, xbdi->xen_req->nr_segments);
		error = EINVAL;
		goto end;
	}
	xbdi->req_vbd = vbd_lookup(xbdi, xbdi->xen_req->device);
	if (xbdi->req_vbd == NULL) {
		printf("xbdback_io domain %d: unknown vbd %d\n",
		    xbdi->domid, xbdi->xen_req->device);
		error = EINVAL;
		goto end;
	}
	if (xbdi->xen_req->operation == BLKIF_OP_WRITE) {
		if (xbdi->req_vbd->flags & VBD_F_RO) {
			error = EROFS;
			goto end;
		}
	}

	xbdi->segno = 0;

	xbdi->cont = xbdback_co_io_gotreq;
	return xbdback_pool_get(&xbdback_request_pool, xbdi);
end:
	xbdback_send_reply(xbdi, xbdi->xen_req->id,
	    xbdi->xen_req->operation, error);
	xbdi->cont = xbdback_co_main_incr;
	return xbdi;
}

static void *
xbdback_co_io_gotreq(struct xbdback_instance *xbdi, void *obj)
{
	struct xbdback_request *xrq;

	xrq = xbdi->req = obj;

	xrq->rq_xbdi = xbdi;
	xrq->rq_iocount = 0;
	xrq->rq_ioerrs = 0;
	xrq->rq_id = xbdi->xen_req->id;
	xrq->rq_operation = xbdi->xen_req->operation;

	/*
	 * Request-level reasons not to coalesce: different device,
	 * different op, or noncontiguous disk sectors (vs. previous
	 * request handed to us).
	 */
	xbdi->cont = xbdback_co_io_loop;
	if (xbdi->io != NULL) {
		struct xbdback_request *last_req;

		last_req = SLIST_FIRST(&xbdi->io->xio_rq)->car;
		XENPRINTF(("xbdback_io domain %d: hoping for sector %ld;"
		    " got %ld\n", xbdi->domid, (long)xbdi->next_sector,
		    (long)xbdi->xen_req->sector_number));
		if (xrq->rq_operation != last_req->rq_operation ||
		    xbdi->xen_req->sector_number != xbdi->next_sector) {
			XENPRINTF(("xbdback_io domain %d: segment break\n",
			    xbdi->domid));
			xbdi->next_sector = xbdi->xen_req->sector_number;
			xbdi->cont_aux = xbdi->cont;
			xbdi->cont = xbdback_co_flush;
		}
	} else {
		xbdi->next_sector = xbdi->xen_req->sector_number;
	}
	return xbdi;
}

static void *
xbdback_co_io_loop(struct xbdback_instance *xbdi, void *obj)
{
	(void)obj;
	if (xbdi->segno < xbdi->xen_req->nr_segments) {
		unsigned long this_fas, last_fas;
		/*
		 * Segment-level reason to coalesce: handling full
		 * pages, or adjacent sector ranges from the same page
		 * (and yes, this latter does happen).  But not if the
		 * array of client pseudo-physical pages is full.
		 */
		this_fas = xbdi->xen_req->frame_and_sects[xbdi->segno];
		XENPRINTF(("xbdback_io domain %d: frame_and_sects[%d]=0%lo\n",
		    xbdi->domid, xbdi->segno, this_fas));
		last_fas = xbdi->last_fas = xbdi->this_fas;
		xbdi->this_fas = this_fas;
		if (xbdi->io != NULL) {
			if (blkif_last_sect(last_fas) == VBD_MAXSECT &&
			    blkif_first_sect(this_fas) == 0 &&
			    xbdi->io->xio_nrma <
			    XENSHM_MAX_PAGES_PER_REQUEST) {
				xbdi->same_page = 0;
			} else if (blkif_last_sect(last_fas) + 1 ==
			    blkif_first_sect(this_fas) &&
			    (last_fas & ~PAGE_MASK) ==
			    (this_fas & ~PAGE_MASK)) {
#ifdef DEBUG
				static struct timeval gluetimer;
				if (ratecheck(&gluetimer,
				    &xbdback_fragio_intvl))
					printf("xbdback: domain %d sending"
					    " excessively fragmented I/O\n",
					    xbdi->domid);
#endif
				XENPRINTF(("xbdback_io domain %d: glue same "
				    "page", xbdi->domid));
				xbdi->same_page = 1;
			} else {
				xbdi->cont_aux = xbdback_co_io_loop;
				xbdi->cont = xbdback_co_flush;
				return xbdi;
			}
		} else
			xbdi->same_page = 0;

		if (xbdi->io == NULL) {
			/*
			 * buf_init() now happens in xbdback_co_io_gotio(),
			 * so that an allocation that blocks here (NULL
			 * return stops the trampoline) is still
			 * initialized when it resumes via
			 * xbdback_pool_put().
			 */
			xbdi->cont = xbdback_co_io_gotio;
			return xbdback_pool_get(&xbdback_io_pool, xbdi);
		} else {
			xbdi->cont = xbdback_co_io_gotio2;
		}
	} else {
		/* done with the loop over segments; get next request */
		xbdi->cont = xbdback_co_main_incr;
	}
	return xbdi;
}

static void *
xbdback_co_io_gotio(struct xbdback_instance *xbdi, void *obj)
{
	struct xbdback_io *xbd_io;
	vaddr_t start_offset; /* start offset in vm area */
	int buf_flags;

	xbdi_get(xbdi);

	xbd_io = xbdi->io = obj;
	buf_init(&xbd_io->xio_buf);
	xbd_io->xio_xbdi = xbdi;
	SLIST_INIT(&xbd_io->xio_rq);
	xbd_io->xio_nrma = 0;
	xbd_io->xio_mapped = 0;

	start_offset = blkif_first_sect(xbdi->this_fas) * VBD_BSIZE;

	if (xbdi->xen_req->operation == BLKIF_OP_WRITE) {
		buf_flags = B_WRITE;
	} else {
		buf_flags = B_READ;
	}

	xbd_io->xio_buf.b_flags = buf_flags;
	xbd_io->xio_buf.b_cflags = 0;
	xbd_io->xio_buf.b_oflags = 0;
	xbd_io->xio_buf.b_iodone = xbdback_iodone;
	xbd_io->xio_buf.b_proc = NULL;
	xbd_io->xio_buf.b_vp = xbdi->req_vbd->vp;
	xbd_io->xio_buf.b_objlock = &xbdi->req_vbd->vp->v_interlock;
	xbd_io->xio_buf.b_dev = xbdi->req_vbd->dev;
	xbd_io->xio_buf.b_blkno = xbdi->next_sector;
	xbd_io->xio_buf.b_bcount = 0;
	xbd_io->xio_buf.b_data = (void *)start_offset;
	xbd_io->xio_buf.b_private = xbd_io;

	xbdi->cont = xbdback_co_io_gotio2;
	return xbdi;
}
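/*
 * Worked example (not part of the driver) of the coalescing test in
 * xbdback_co_io_loop() above.  Each frame_and_sects entry packs a
 * machine page address with the first/last 512-byte sector numbers
 * used within that page.  With 4KB pages, VBD_MAXSECT is 7, so:
 *
 *	previous segment: sectors 0..7 of page P   (last_sect == 7)
 *	next segment:     sectors 0..x of page Q   (first_sect == 0)
 *		=> whole-page append; a new entry goes in xio_ma[]
 *
 *	previous segment: sectors 0..3 of page P
 *	next segment:     sectors 4..7 of page P
 *		=> same-page glue (same_page = 1); no new xio_ma[] entry
 *
 * Anything else breaks the run: the current xbdback_io is flushed and
 * a fresh one is started.
 */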
static void *
xbdback_co_io_gotio2(struct xbdback_instance *xbdi, void *obj)
{
	(void)obj;
	if (xbdi->segno == 0 || SLIST_EMPTY(&xbdi->io->xio_rq)) {
		/* if this is the first segment of a new request */
		/* or if it's the first segment of the io */
		xbdi->cont = xbdback_co_io_gotfrag;
		return xbdback_pool_get(&xbdback_fragment_pool, xbdi);
	}
	xbdi->cont = xbdback_co_io_gotfrag2;
	return xbdi;
}

static void *
xbdback_co_io_gotfrag(struct xbdback_instance *xbdi, void *obj)
{
	struct xbdback_fragment *xbd_fr;

	xbd_fr = obj;
	xbd_fr->car = xbdi->req;
	SLIST_INSERT_HEAD(&xbdi->io->xio_rq, xbd_fr, cdr);
	++xbdi->req->rq_iocount;

	xbdi->cont = xbdback_co_io_gotfrag2;
	return xbdi;
}

static void *
xbdback_co_io_gotfrag2(struct xbdback_instance *xbdi, void *obj)
{
	struct xbdback_io *xbd_io;
	int seg_size;
	unsigned long this_fas;

	this_fas = xbdi->this_fas;
	xbd_io = xbdi->io;

	seg_size = blkif_last_sect(this_fas) -
	    blkif_first_sect(this_fas) + 1;
	if (seg_size < 0) {
		printf("xbdback_io domain %d: negative-size request\n",
		    xbdi->domid);
		xbdback_io_error(xbdi->io, EINVAL);
		xbdi->io = NULL;
		xbdi->cont = xbdback_co_main_incr;
		return xbdi;
	}

	if (!xbdi->same_page) {
		XENPRINTF(("xbdback_io domain %d: appending page 0%lo\n",
		    xbdi->domid, (this_fas & ~PAGE_MASK)));
		xbd_io->xio_ma[xbd_io->xio_nrma++] = (this_fas & ~PAGE_MASK);
	}

	xbd_io->xio_buf.b_bcount += (daddr_t)(seg_size * VBD_BSIZE);
	XENPRINTF(("xbdback_io domain %d: start sect %d size %d\n",
	    xbdi->domid, (int)xbdi->next_sector, seg_size));

	/* Finally, the end of the segment loop! */
	xbdi->next_sector += seg_size;
	++xbdi->segno;
	xbdi->cont = xbdback_co_io_loop;
	return xbdi;
}

static void *
xbdback_co_flush(struct xbdback_instance *xbdi, void *obj)
{
	(void)obj;
	XENPRINTF(("xbdback_io domain %d: flush sect %ld size %d ptr 0x%lx\n",
	    xbdi->domid, (long)xbdi->io->xio_buf.b_blkno,
	    (int)xbdi->io->xio_buf.b_bcount, (long)xbdi->io));
	xbdi->cont = xbdback_co_flush_done;
	return xbdback_map_shm(xbdi->io);
}

static void *
xbdback_co_flush_done(struct xbdback_instance *xbdi, void *obj)
{
	(void)obj;
	workqueue_enqueue(xbdback_workqueue, &xbdi->io->xio_work, NULL);
	xbdi->io = NULL;
	xbdi->cont = xbdi->cont_aux;
	return xbdi;
}

static void
xbdback_io_error(struct xbdback_io *xbd_io, int error)
{
	xbd_io->xio_buf.b_error = error;
	xbdback_iodone(&xbd_io->xio_buf);
}

static void
xbdback_do_io(struct work *wk, void *dummy)
{
	struct xbdback_io *xbd_io = (void *)wk;
	KASSERT(&xbd_io->xio_work == wk);

	xbd_io->xio_buf.b_data = (void *)
	    ((vaddr_t)xbd_io->xio_buf.b_data + xbd_io->xio_vaddr);
#ifdef DIAGNOSTIC
	{
	vaddr_t bdata = (vaddr_t)xbd_io->xio_buf.b_data;
	int nsegs =
	    ((((bdata + xbd_io->xio_buf.b_bcount - 1) & ~PAGE_MASK) -
	    (bdata & ~PAGE_MASK)) >> PAGE_SHIFT) + 1;
	if ((bdata & ~PAGE_MASK) != (xbd_io->xio_vaddr & ~PAGE_MASK)) {
		printf("xbdback_do_io vaddr 0x%lx bdata 0x%lx\n",
		    xbd_io->xio_vaddr, bdata);
		panic("xbdback_do_io: bdata page change");
	}
	if (nsegs > xbd_io->xio_nrma) {
		printf("xbdback_do_io vaddr 0x%lx bcount 0x%x doesn't fit in"
		    " %d pages\n", bdata, xbd_io->xio_buf.b_bcount,
		    xbd_io->xio_nrma);
		panic("xbdback_do_io: not enough pages");
	}
	}
#endif
	if ((xbd_io->xio_buf.b_flags & B_READ) == 0) {
		mutex_enter(&xbd_io->xio_buf.b_vp->v_interlock);
		xbd_io->xio_buf.b_vp->v_numoutput++;
		mutex_exit(&xbd_io->xio_buf.b_vp->v_interlock);
	}
	bdev_strategy(&xbd_io->xio_buf);
}
/*
 * This gets reused by xbdback_io_error to report errors from other
 * sources.
 */
static void
xbdback_iodone(struct buf *bp)
{
	struct xbdback_io *xbd_io;
	struct xbdback_instance *xbdi;
	int errp;

	xbd_io = bp->b_private;
	xbdi = xbd_io->xio_xbdi;

	XENPRINTF(("xbdback_io domain %d: iodone ptr 0x%lx\n",
	    xbdi->domid, (long)xbd_io));

	if (xbd_io->xio_mapped)
		xbdback_unmap_shm(xbd_io);
	if (bp->b_error != 0) {
		printf("xbd IO domain %d: error %d\n",
		    xbdi->domid, bp->b_error);
		errp = 1;
	} else
		errp = 0;

	/* for each constituent xbd request */
	while (!SLIST_EMPTY(&xbd_io->xio_rq)) {
		struct xbdback_fragment *xbd_fr;
		struct xbdback_request *xbd_req;
		struct xbdback_instance *rxbdi;
		int error;

		xbd_fr = SLIST_FIRST(&xbd_io->xio_rq);
		xbd_req = xbd_fr->car;
		SLIST_REMOVE_HEAD(&xbd_io->xio_rq, cdr);
		xbdback_pool_put(&xbdback_fragment_pool, xbd_fr);

		if (errp)
			++xbd_req->rq_ioerrs;

		/* finalize it only if this was its last I/O */
		if (--xbd_req->rq_iocount > 0)
			continue;

		rxbdi = xbd_req->rq_xbdi;
		KASSERT(xbdi == rxbdi);

		error = xbd_req->rq_ioerrs > 0 ?
		    BLKIF_RSP_ERROR : BLKIF_RSP_OKAY;
		XENPRINTF(("xbdback_io domain %d: end request %lu "
		    "error=%d\n", xbdi->domid, xbd_req->rq_id, error));
		xbdback_send_reply(xbdi, xbd_req->rq_id,
		    xbd_req->rq_operation, error);
		xbdback_pool_put(&xbdback_request_pool, xbd_req);
	}
	xbdi_put(xbdi);
	buf_destroy(&xbd_io->xio_buf);
	xbdback_pool_put(&xbdback_io_pool, xbd_io);
}

static void *
xbdback_co_probe(struct xbdback_instance *xbdi, void *obj)
{
	(void)obj;
	/*
	 * There should be only one page in the request.  Map it and store
	 * the reply.
	 */
	if (xbdi->xen_req->nr_segments != 1) {
		printf("xbdback_probe: %d segments\n",
		    xbdi->xen_req->nr_segments);
		xbdback_send_reply(xbdi, xbdi->xen_req->id,
		    xbdi->xen_req->operation, EINVAL);
		xbdi->cont = xbdback_co_main_incr;
		return xbdi;
	}
	/* buf_init() is done in xbdback_co_probe_gotio(); see below */
	xbdi->cont = xbdback_co_probe_gotio;
	return xbdback_pool_get(&xbdback_io_pool, xbdi);
}

static void *
xbdback_co_probe_gotio(struct xbdback_instance *xbdi, void *obj)
{
	struct xbdback_io *xbd_io;

	xbd_io = xbdi->io = obj;
	/*
	 * buf_init() happens here rather than at allocation time, so
	 * that an allocation that blocked in xbdback_pool_get() is
	 * also initialized when it resumes through the trampoline.
	 */
	buf_init(&xbd_io->xio_buf);
	xbd_io->xio_xbdi = xbdi;
	xbd_io->xio_nrma = 1;
	xbd_io->xio_ma[0] = (xbdi->xen_req->frame_and_sects[0] & ~PAGE_MASK);
	xbd_io->xio_mapped = 0;

	xbdi->cont = xbdback_co_probe_gotvm;
	xbdi->cont_aux = xbdback_co_main_incr;
	return xbdback_map_shm(xbdi->io);
}

static void *
xbdback_co_probe_gotvm(struct xbdback_instance *xbdi, void *obj)
{
	struct xbd_vbd *vbd;
	blkif_request_t *req;
	vdisk_t *vdisk_reply;
	int i;

	vdisk_reply = (void *)xbdi->io->xio_vaddr;
	req = xbdi->xen_req;
	i = 0;
	SLIST_FOREACH(vbd, &xbdi->vbds, next) {
		if (i >= PAGE_SIZE / sizeof(vdisk_t)) {
			printf("xbdback_probe domain %d: too many VBDs\n",
			    xbdi->domid);
			break;
		}
		XENPRINTF(("xbdback_probe: reply %d\n", i));
		vdisk_reply[i].capacity = vbd->size;
		vdisk_reply[i].device = vbd->vdev;
		vdisk_reply[i].info = VDISK_TYPE_DISK | VDISK_FLAG_VIRT;
		if (vbd->flags & VBD_F_RO)
			vdisk_reply[i].info |= VDISK_FLAG_RO;
		i++;
	}
	xbdback_unmap_shm(xbdi->io);

	XENPRINTF(("xbdback_probe: nreplies=%d\n", i));
	xbdback_send_reply(xbdi, req->id, req->operation, i);
	buf_destroy(&xbdi->io->xio_buf);
	xbdback_pool_put(&xbdback_io_pool, xbdi->io);
	xbdi->io = NULL;
	xbdi->cont = xbdback_co_main_incr;
	return xbdi;
}
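/*
 * Illustrative sketch only (not part of the driver): the probe reply
 * written by xbdback_co_probe_gotvm() above is simply an array of
 * vdisk_t records placed at the start of the guest-supplied page, and
 * the operation's status field carries the entry count:
 *
 *	vdisk_reply[0] = { .capacity = vbd->size,
 *	                   .device   = vbd->vdev,
 *	                   .info     = VDISK_TYPE_DISK | VDISK_FLAG_VIRT };
 *	...
 *	xbdback_send_reply(xbdi, req->id, req->operation, nreplies);
 */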
/*
 * Called once a request has completed: place the reply in the ring
 * and notify the guest OS.
 */
static void
xbdback_send_reply(struct xbdback_instance *xbdi, int id, int op, int status)
{
	blkif_response_t *resp;

	resp = &xbdi->blk_ring->ring[MASK_BLKIF_IDX(xbdi->resp_prod)].resp;
	resp->id = id;
	resp->operation = op;
	resp->status = status;
	xbdi->resp_prod++;
	x86_lfence(); /* ensure the guest sees all our replies */
	xbdi->blk_ring->resp_prod = xbdi->resp_prod;
	hypervisor_notify_via_evtchn(xbdi->evtchn);
}

/*
 * Map a request into our virtual address space.  The xio_ma array is
 * to be filled out by the caller.
 */
static void *
xbdback_map_shm(struct xbdback_io *xbd_io)
{
	struct xbdback_instance *xbdi;
	int error, s;

	KASSERT(xbd_io->xio_mapped == 0);

	xbdi = xbd_io->xio_xbdi;
	error = xen_shm_map(xbd_io->xio_ma, xbd_io->xio_nrma,
	    xbd_io->xio_xbdi->domid, &xbd_io->xio_vaddr, 0);

	switch (error) {
	case 0:
		xbd_io->xio_mapped = 1;
		return (void *)xbd_io->xio_vaddr;
	case ENOMEM:
		s = splvm();
		if (!xbdback_shmcb) {
			if (xen_shm_callback(xbdback_shm_callback, xbdi)
			    != 0) {
				splx(s);
				panic("xbdback_map_shm: "
				    "xen_shm_callback failed");
			}
			xbdback_shmcb = 1;
		}
		SIMPLEQ_INSERT_TAIL(&xbdback_shmq, xbdi, on_hold);
		splx(s);
		return NULL;
	default:
		printf("xbdback_map_shm: xen_shm error %d\n", error);
		xbdback_io_error(xbdi->io, error);
		xbdi->io = NULL;
		xbdi->cont = xbdi->cont_aux;
		return xbdi;
	}
}

static int
xbdback_shm_callback(void *arg)
{
	int error, s;

	s = splvm();
	while (!SIMPLEQ_EMPTY(&xbdback_shmq)) {
		struct xbdback_instance *xbdi;
		struct xbdback_io *xbd_io;

		xbdi = SIMPLEQ_FIRST(&xbdback_shmq);
		xbd_io = xbdi->io;
		KASSERT(xbd_io->xio_mapped == 0);

		switch ((error = xen_shm_map(xbd_io->xio_ma,
		    xbd_io->xio_nrma, xbdi->domid,
		    &xbd_io->xio_vaddr, XSHM_CALLBACK))) {
		case ENOMEM:
			splx(s);
			return -1; /* will try again later */
		case 0:
			xbd_io->xio_mapped = 1;
			SIMPLEQ_REMOVE_HEAD(&xbdback_shmq, on_hold);
			splx(s);
			xbdback_trampoline(xbdi, xbdi);
			s = splvm();
			break;
		default:
			SIMPLEQ_REMOVE_HEAD(&xbdback_shmq, on_hold);
			splx(s);
			printf("xbdback_shm_callback: xen_shm error %d\n",
			    error);
			xbdi->cont = xbdi->cont_aux;
			xbdback_io_error(xbd_io, error);
			xbdback_trampoline(xbdi, xbdi);
			s = splvm();
			break;
		}
	}
	xbdback_shmcb = 0;
	splx(s);
	return 0;
}

/* unmap a request from our virtual address space (request is done) */
static void
xbdback_unmap_shm(struct xbdback_io *xbd_io)
{
	KASSERT(xbd_io->xio_mapped == 1);
	xbd_io->xio_mapped = 0;
	xen_shm_unmap(xbd_io->xio_vaddr, xbd_io->xio_ma,
	    xbd_io->xio_nrma, xbd_io->xio_xbdi->domid);
	xbd_io->xio_vaddr = -1;
}

/* Obtain memory from a pool, in cooperation with the continuations. */
static void *
xbdback_pool_get(struct xbdback_pool *pp, struct xbdback_instance *xbdi)
{
	int s;
	void *item;

	item = pool_get(&pp->p, PR_NOWAIT);
	if (item == NULL) {
		if (ratecheck(&pp->last_warning, &xbdback_poolsleep_intvl))
			printf("xbdback_pool_get: %s is full\n",
			    pp->p.pr_wchan);
		s = splvm();
		SIMPLEQ_INSERT_TAIL(&pp->q, xbdi, on_hold);
		splx(s);
	}
	return item;
}
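/*
 * Illustrative flow only (not part of the driver): how a blocked
 * allocation resumes.  xbdback_pool_get() queues the instance and
 * returns NULL, which stops xbdback_trampoline(); a later
 * xbdback_pool_put() dequeues the instance and restarts the
 * trampoline with the freed item as the continuation's argument:
 *
 *	item = xbdback_pool_get(&pool, xbdi);	// NULL: xbdi now on pool.q
 *	...
 *	xbdback_pool_put(&pool, item2);		// => xbdback_trampoline(
 *						//        xbdi, item2)
 */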
/*
 * Restore memory to a pool... unless an xbdback instance had been
 * waiting for it, in which case that instance gets the memory first.
 */
static void
xbdback_pool_put(struct xbdback_pool *pp, void *item)
{
	int s;

	s = splvm();
	if (SIMPLEQ_EMPTY(&pp->q)) {
		splx(s);
		pool_put(&pp->p, item);
	} else {
		struct xbdback_instance *xbdi = SIMPLEQ_FIRST(&pp->q);
		SIMPLEQ_REMOVE_HEAD(&pp->q, on_hold);
		splx(s);
		xbdback_trampoline(xbdi, item);
	}
}

static void
xbdback_trampoline(struct xbdback_instance *xbdi, void *obj)
{
	xbdback_cont_t cont;

	while (obj != NULL && xbdi->cont != NULL) {
		cont = xbdi->cont;
#ifdef DIAGNOSTIC
		xbdi->cont = (xbdback_cont_t)0xDEADBEEF;
#endif
		obj = (*cont)(xbdi, obj);
#ifdef DIAGNOSTIC
		if (xbdi->cont == (xbdback_cont_t)0xDEADBEEF) {
			printf("xbdback_trampoline: 0x%lx didn't set "
			    "xbdi->cont!\n", (long)cont);
			panic("xbdback_trampoline: bad continuation");
		}
#endif
	}
}