NetBSD-5.0.2/sys/arch/xen/xen/xenevt.c

/*      $NetBSD: xenevt.c,v 1.29.4.1 2009/04/04 23:36:27 snj Exp $      */

/*
 * Copyright (c) 2005 Manuel Bouyer.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by Manuel Bouyer.
 * 4. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: xenevt.c,v 1.29.4.1 2009/04/04 23:36:27 snj Exp $");

#include "opt_xen.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/systm.h>
#include <sys/device.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/poll.h>
#include <sys/select.h>
#include <sys/proc.h>
#include <sys/conf.h>
#include <sys/intr.h>
#include <sys/kmem.h>
#include <sys/simplelock.h>

#include <uvm/uvm_extern.h>

#include <xen/hypervisor.h>
#include <xen/xenpmap.h>
#include <xen/xenio.h>
#ifdef XEN3
#include <xen/xenio3.h>
#endif
#include <xen/xen.h>

/*
 * Interface between the event channel and userland.
 * Each process with a xenevt device instance open can register the events
 * it wants to receive. It collects pending events via read(), blocking if
 * necessary until some event is available. Pending events are ack'd by
 * write()ing their port numbers back to the device. Some special
 * operations (such as event binding) are done through ioctl().
 * Processes get a device instance by opening a cloning device.
 */
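
/*
 * Illustrative userland usage (a sketch for clarity, not part of this
 * driver; error handling omitted; the ioctls are those handled below):
 *
 *	int fd = open("/dev/xenevt", O_RDWR);
 *	struct ioctl_evtchn_bind_virq bind = { .virq = VIRQ_DOM_EXC };
 *	ioctl(fd, IOCTL_EVTCHN_BIND_VIRQ, &bind);
 *	evtchn_port_t port;
 *	read(fd, &port, sizeof(port));	-- blocks until an event arrives
 *	uint16_t chan = port;
 *	write(fd, &chan, sizeof(chan));	-- ack: re-enables (unmasks) the port
 */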

void		xenevtattach(int);
static int	xenevt_fread(struct file *, off_t *, struct uio *,
    kauth_cred_t, int);
static int	xenevt_fwrite(struct file *, off_t *, struct uio *,
    kauth_cred_t, int);
static int	xenevt_fioctl(struct file *, u_long, void *);
static int	xenevt_fpoll(struct file *, int);
static int	xenevt_fclose(struct file *);
/* static int	xenevt_fkqfilter(struct file *, struct knote *); */

static const struct fileops xenevt_fileops = {
	.fo_read = xenevt_fread,
	.fo_write = xenevt_fwrite,
	.fo_ioctl = xenevt_fioctl,
	.fo_fcntl = fnullop_fcntl,
	.fo_poll = xenevt_fpoll,
	.fo_stat = fbadop_stat,
	.fo_close = xenevt_fclose,
	.fo_kqfilter = /* xenevt_fkqfilter */ fnullop_kqfilter,
	.fo_drain = fnullop_drain,
};

dev_type_open(xenevtopen);
dev_type_read(xenevtread);
dev_type_mmap(xenevtmmap);
const struct cdevsw xenevt_cdevsw = {
	xenevtopen, nullclose, xenevtread, nowrite, noioctl,
	nostop, notty, nopoll, xenevtmmap, nokqfilter, D_OTHER
};

/* minor numbers */
#define DEV_EVT 0
#define DEV_XSD 1

/* per-instance data */
#define XENEVT_RING_SIZE 2048
#define XENEVT_RING_MASK 2047

#ifndef XEN3
typedef uint16_t evtchn_port_t;
#endif

#define BYTES_PER_PORT (sizeof(evtchn_port_t) / sizeof(uint8_t))

struct xenevt_d {
	struct simplelock lock;
	STAILQ_ENTRY(xenevt_d) pendingq;
	bool pending;
	evtchn_port_t ring[XENEVT_RING_SIZE];
	u_int ring_read; /* read index */
	u_int ring_write; /* write index */
	u_int flags;
#define XENEVT_F_OVERFLOW 0x01 /* ring overflow */
	struct selinfo sel; /* used by poll */
};

/* event -> user device mapping */
static struct xenevt_d *devevent[NR_EVENT_CHANNELS];

/* pending events */
static void *devevent_sih;
struct simplelock devevent_pending_lock = SIMPLELOCK_INITIALIZER;
STAILQ_HEAD(, xenevt_d) devevent_pending =
    STAILQ_HEAD_INITIALIZER(devevent_pending);

static void xenevt_donotify(struct xenevt_d *);
static void xenevt_record(struct xenevt_d *, evtchn_port_t);

/* pending events */
#ifdef XEN3
long xenevt_ev1;
long xenevt_ev2[NR_EVENT_CHANNELS];
#else
uint32_t xenevt_ev1;
uint32_t xenevt_ev2[NR_EVENT_CHANNELS];
#endif
static int xenevt_processevt(void *);

/* called at boot time */
void
xenevtattach(int n)
{
	struct intrhand *ih;
	int s;
	int level = IPL_HIGH;
#ifdef MULTIPROCESSOR
	bool mpsafe = (level != IPL_VM);
#endif /* MULTIPROCESSOR */

	devevent_sih = softint_establish(SOFTINT_SERIAL,
	    (void (*)(void *))xenevt_notify, NULL);
	memset(devevent, 0, sizeof(devevent));
	xenevt_ev1 = 0;
	memset(xenevt_ev2, 0, sizeof(xenevt_ev2));

	/* register a handler at splhigh, so that spllower() will call us */
	MALLOC(ih, struct intrhand *, sizeof (struct intrhand), M_DEVBUF,
	     M_WAITOK|M_ZERO);
	if (ih == NULL)
		panic("can't allocate xenevt interrupt source");
	ih->ih_level = level;
	ih->ih_fun = ih->ih_realfun = xenevt_processevt;
	ih->ih_arg = ih->ih_realarg = NULL;
	ih->ih_ipl_next = NULL;
#ifdef MULTIPROCESSOR
	if (!mpsafe) {
		ih->ih_fun = intr_biglock_wrapper;
		ih->ih_arg = ih;
	}
#endif /* MULTIPROCESSOR */

	s = splhigh();
	event_set_iplhandler(ih, level);
	splx(s);
}

/* register pending event - always called with interrupt disabled */
void
xenevt_setipending(int l1, int l2)
{
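	/*
	 * l1 indexes a word of xenevt_ev2 (and a bit of xenevt_ev1);
	 * l2 is a bit within that word. Together they name one event port.
	 */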
	xenevt_ev1 |= 1UL << l1;
	xenevt_ev2[l1] |= 1UL << l2;
	curcpu()->ci_ipending |= 1 << IPL_HIGH;
}

/* process pending events */
static int
xenevt_processevt(void *v)
{
	long l1, l2;
	int l1i, l2i;
	int port;

	l1 = xen_atomic_xchg(&xenevt_ev1, 0);
	while ((l1i = xen_ffs(l1)) != 0) {
		l1i--;
		l1 &= ~(1UL << l1i);
		l2 = xen_atomic_xchg(&xenevt_ev2[l1i], 0);
		while ((l2i = xen_ffs(l2)) != 0) {
			l2i--;
			l2 &= ~(1UL << l2i);
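			/*
			 * Recombine word and bit index into a port number;
			 * e.g. with 64-bit longs (LONG_SHIFT == 6), word 2
			 * bit 3 is port 2 * 64 + 3 == 131.
			 */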
			port = (l1i << LONG_SHIFT) + l2i;
			xenevt_event(port);
		}
	}

	return 0;
}


/* event callback, called at splhigh() */
void
xenevt_event(int port)
{
	struct xenevt_d *d;

	d = devevent[port];
	if (d != NULL) {
		xenevt_record(d, port);

		if (d->pending) {
			return;
		}

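		/*
		 * Queue the instance and defer the wakeup to a soft
		 * interrupt: we are called at splhigh() here.
		 */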
		simple_lock(&devevent_pending_lock);
		STAILQ_INSERT_TAIL(&devevent_pending, d, pendingq);
		simple_unlock(&devevent_pending_lock);
		d->pending = true;
		softint_schedule(devevent_sih);
	}
}

void
xenevt_notify(void)
{
	int s = splhigh();
	simple_lock(&devevent_pending_lock);
	while (/* CONSTCOND */ 1) {
		struct xenevt_d *d;

		d = STAILQ_FIRST(&devevent_pending);
		if (d == NULL) {
			break;
		}
		STAILQ_REMOVE_HEAD(&devevent_pending, pendingq);
		simple_unlock(&devevent_pending_lock);
		splx(s);

		d->pending = false;
		xenevt_donotify(d);

		s = splhigh();
		simple_lock(&devevent_pending_lock);
	}
	simple_unlock(&devevent_pending_lock);
	splx(s);
}

static void
xenevt_donotify(struct xenevt_d *d)
{
	int s;

	s = splsoftserial();
	simple_lock(&d->lock);

	selnotify(&d->sel, 0, 1);
	wakeup(&d->ring_read);

	simple_unlock(&d->lock);
	splx(s);
}

static void
xenevt_record(struct xenevt_d *d, evtchn_port_t port)
{

	/*
	 * One slot is deliberately kept empty so that a full ring can be
	 * told apart from an empty one, so the ring holds at most
	 * XENEVT_RING_SIZE - 1 entries. Not really an issue, and the
	 * alternative would be more complex.
	 */

	if (d->ring_read ==
	    ((d->ring_write + 1) & XENEVT_RING_MASK)) {
		d->flags |= XENEVT_F_OVERFLOW;
		printf("xenevt_event: ring overflow port %d\n", port);
	} else {
		d->ring[d->ring_write] = port;
		d->ring_write = (d->ring_write + 1) & XENEVT_RING_MASK;
	}
}

/* open the xenevt device; this is where we clone */
int
xenevtopen(dev_t dev, int flags, int mode, struct lwp *l)
{
	struct xenevt_d *d;
	struct file *fp;
	int fd, error;

	switch (minor(dev)) {
	case DEV_EVT:
		/* fd_allocfile() allocates the descriptor for us. */
		if ((error = fd_allocfile(&fp, &fd)) != 0)
			return error;

		d = malloc(sizeof(*d), M_DEVBUF, M_WAITOK | M_ZERO);
		simple_lock_init(&d->lock);
		selinit(&d->sel);
		return fd_clone(fp, fd, flags, &xenevt_fileops, d);
#ifdef XEN3
	case DEV_XSD:
		/* no clone for /dev/xsd_kva */
		return (0);
#endif
	default:
		break;
	}
	return ENODEV;
}

/* read from device: only for /dev/xsd_kva; xenevt is read through xenevt_fread() */
int
xenevtread(dev_t dev, struct uio *uio, int flags)
{
#ifdef XEN3
#define LD_STRLEN 22 /* 20 digits for a 64-bit integer, '\n' and NUL */
	if (minor(dev) == DEV_XSD) {
		char strbuf[LD_STRLEN], *bf;
		int off, error;
		size_t len;

		off = (int)uio->uio_offset;
		if (off < 0)
			return EINVAL;
		len = snprintf(strbuf, sizeof(strbuf), "%ld\n",
		    xen_start_info.store_mfn);
		if (off >= len) {
			bf = strbuf;
			len = 0;
		} else {
			bf = &strbuf[off];
			len -= off;
		}
		error = uiomove(bf, len, uio);
		return error;
	}
#endif
	return ENODEV;
}

/* mmap: only for xsd_kva */
paddr_t
xenevtmmap(dev_t dev, off_t off, int prot)
{
#ifdef XEN3
	if (minor(dev) == DEV_XSD) {
		/* only one page, so off is always 0 */
		if (off != 0)
			return -1;
		return x86_btop(
		    xpmap_mtop(xen_start_info.store_mfn << PAGE_SHIFT));
	}
#endif
	return -1;
}

static int
xenevt_fclose(struct file *fp)
{
	struct xenevt_d *d = fp->f_data;
	int i;

	for (i = 0; i < NR_EVENT_CHANNELS; i++ ) {
		if (devevent[i] == d) {
#ifdef XEN3
			evtchn_op_t op = { .cmd = 0 };
			int error;
#endif
			hypervisor_mask_event(i);
			devevent[i] = NULL;
#ifdef XEN3
			op.cmd = EVTCHNOP_close;
			op.u.close.port = i;
			if ((error = HYPERVISOR_event_channel_op(&op))) {
				printf("xenevt_fclose: error %d from "
				    "hypervisor\n", -error);
			}
#endif
		}
	}
	seldestroy(&d->sel);
	free(d, M_DEVBUF);
	fp->f_data = NULL;

	return (0);
}

static int
xenevt_fread(struct file *fp, off_t *offp, struct uio *uio,
    kauth_cred_t cred, int flags)
{
	struct xenevt_d *d = fp->f_data;
	int error;
	size_t len, uio_len;
	int ring_read;
	int ring_write;
	int s;

	error = 0;
	s = splsoftserial();
	simple_lock(&d->lock);
	while (error == 0) {
		ring_read = d->ring_read;
		ring_write = d->ring_write;
		if (ring_read != ring_write) {
			break;
		}
		if (d->flags & XENEVT_F_OVERFLOW) {
			break;
		}

		/* nothing to read */
		if (fp->f_flag & FNONBLOCK) {
			error = EAGAIN;
		} else {
			error = ltsleep(&d->ring_read, PRIBIO | PCATCH,
			    "xenevt", 0, &d->lock);
		}
	}
	if (error == 0 && (d->flags & XENEVT_F_OVERFLOW)) {
		error = EFBIG;
	}
	simple_unlock(&d->lock);
	splx(s);

	if (error) {
		return error;
	}

	uio_len = uio->uio_resid / BYTES_PER_PORT;
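	/*
	 * Copy out the first contiguous chunk; if the valid region wraps
	 * past the end of the ring, the second uiomove() below picks up
	 * the remainder.
	 */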
	if (ring_read <= ring_write)
		len = ring_write - ring_read;
	else
		len = XENEVT_RING_SIZE - ring_read;
	if (len > uio_len)
		len = uio_len;
	error = uiomove(&d->ring[ring_read], len * BYTES_PER_PORT, uio);
	if (error)
		return error;
	ring_read = (ring_read + len) & XENEVT_RING_MASK;
	uio_len = uio->uio_resid / BYTES_PER_PORT;
	if (uio_len == 0)
		goto done;
	/* if the ring wrapped, read the second part (len is 0 otherwise) */
	len = ring_write - ring_read;
	if (len > uio_len)
		len = uio_len;
	error = uiomove(&d->ring[ring_read], len * BYTES_PER_PORT, uio);
	if (error)
		return error;
	ring_read = (ring_read + len) & XENEVT_RING_MASK;

done:
	s = splsoftserial();
	simple_lock(&d->lock);
	d->ring_read = ring_read;
	simple_unlock(&d->lock);
	splx(s);

	return 0;
}

static int
xenevt_fwrite(struct file *fp, off_t *offp, struct uio *uio,
    kauth_cred_t cred, int flags)
{
	struct xenevt_d *d = fp->f_data;
	uint16_t *chans;
	int i, nentries, error;

	if (uio->uio_resid == 0)
		return (0);
	nentries = uio->uio_resid / sizeof(uint16_t);
	if (nentries > NR_EVENT_CHANNELS)
		return EMSGSIZE;
	chans = kmem_alloc(nentries * sizeof(uint16_t), KM_SLEEP);
	if (chans == NULL)
		return ENOMEM;
	error = uiomove(chans, uio->uio_resid, uio);
	if (error)
		goto out;
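	/* each 16-bit value names a channel, owned by us, to unmask (ack) */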
	for (i = 0; i < nentries; i++) {
		if (chans[i] < NR_EVENT_CHANNELS &&
		    devevent[chans[i]] == d) {
			hypervisor_unmask_event(chans[i]);
		}
	}
out:
	kmem_free(chans, nentries * sizeof(uint16_t));
	return error;
}

static int
xenevt_fioctl(struct file *fp, u_long cmd, void *addr)
{
	struct xenevt_d *d = fp->f_data;
#ifdef XEN3
	evtchn_op_t op = { .cmd = 0 };
	int error;
#else
	u_int *arg = addr;
#endif

	switch (cmd) {
	case EVTCHN_RESET:
#ifdef XEN3
	case IOCTL_EVTCHN_RESET:
#endif
		d->ring_read = d->ring_write = 0;
		d->flags = 0;
		break;
#ifdef XEN3
	case IOCTL_EVTCHN_BIND_VIRQ:
	{
		struct ioctl_evtchn_bind_virq *bind_virq = addr;
		op.cmd = EVTCHNOP_bind_virq;
		op.u.bind_virq.virq = bind_virq->virq;
		op.u.bind_virq.vcpu = 0;
		if ((error = HYPERVISOR_event_channel_op(&op))) {
			printf("IOCTL_EVTCHN_BIND_VIRQ failed: "
			    "virq %d error %d\n", bind_virq->virq, error);
			return -error;
		}
		bind_virq->port = op.u.bind_virq.port;
		devevent[bind_virq->port] = d;
		hypervisor_unmask_event(bind_virq->port);
		break;
	}
	case IOCTL_EVTCHN_BIND_INTERDOMAIN:
	{
		struct ioctl_evtchn_bind_interdomain *bind_intd = addr;
		op.cmd = EVTCHNOP_bind_interdomain;
		op.u.bind_interdomain.remote_dom = bind_intd->remote_domain;
		op.u.bind_interdomain.remote_port = bind_intd->remote_port;
		if ((error = HYPERVISOR_event_channel_op(&op)))
			return -error;
		bind_intd->port = op.u.bind_interdomain.local_port;
		devevent[bind_intd->port] = d;
		hypervisor_unmask_event(bind_intd->port);
		break;
	}
	case IOCTL_EVTCHN_BIND_UNBOUND_PORT:
	{
		struct ioctl_evtchn_bind_unbound_port *bind_unbound = addr;
		op.cmd = EVTCHNOP_alloc_unbound;
		op.u.alloc_unbound.dom = DOMID_SELF;
		op.u.alloc_unbound.remote_dom = bind_unbound->remote_domain;
		if ((error = HYPERVISOR_event_channel_op(&op)))
			return -error;
		bind_unbound->port = op.u.alloc_unbound.port;
		devevent[bind_unbound->port] = d;
		hypervisor_unmask_event(bind_unbound->port);
		break;
	}
	case IOCTL_EVTCHN_UNBIND:
	{
		struct ioctl_evtchn_unbind *unbind = addr;

		if (unbind->port >= NR_EVENT_CHANNELS)
			return EINVAL;
		if (devevent[unbind->port] != d)
			return ENOTCONN;
		devevent[unbind->port] = NULL;
		hypervisor_mask_event(unbind->port);
		op.cmd = EVTCHNOP_close;
		op.u.close.port = unbind->port;
		if ((error = HYPERVISOR_event_channel_op(&op)))
			return -error;
		break;
	}
	case IOCTL_EVTCHN_NOTIFY:
	{
		struct ioctl_evtchn_notify *notify = addr;

		if (notify->port >= NR_EVENT_CHANNELS)
			return EINVAL;
		if (devevent[notify->port] != d)
			return ENOTCONN;
		hypervisor_notify_via_evtchn(notify->port);
		break;
	}
#else /* !XEN3 */
	case EVTCHN_BIND:
		if (*arg >= NR_EVENT_CHANNELS)
			return EINVAL;
		if (devevent[*arg] != NULL)
			return EISCONN;
		devevent[*arg] = d;
		hypervisor_unmask_event(*arg);
		break;
	case EVTCHN_UNBIND:
		if (*arg >= NR_EVENT_CHANNELS)
			return EINVAL;
		if (devevent[*arg] != d)
			return ENOTCONN;
		devevent[*arg] = NULL;
		hypervisor_mask_event(*arg);
		break;
#endif /* !XEN3 */
	case FIONBIO:
		break;
	default:
		return EINVAL;
	}
	return 0;
}

/*
 * Support for the poll() system call.
 *
 * Return true if the specified operation will not block indefinitely.
 */

static int
xenevt_fpoll(struct file *fp, int events)
{
	struct xenevt_d *d = fp->f_data;
	int revents = events & (POLLOUT | POLLWRNORM); /* we can always write */
	int s;

	s = splsoftserial();
	simple_lock(&d->lock);
	if (events & (POLLIN | POLLRDNORM)) {
		if (d->ring_read != d->ring_write) {
			revents |= events & (POLLIN | POLLRDNORM);
		} else {
			/* Record that someone is waiting */
			selrecord(curlwp, &d->sel);
		}
	}
	simple_unlock(&d->lock);
	splx(s);
	return (revents);
}