NetBSD-5.0.2/sys/arch/xen/xen/clock.c

Compare this file to the similar file:
Show the results in this format:
/*	$NetBSD: clock.c,v 1.47 2008/10/21 15:46:32 cegger Exp $	*/

/*
 *
 * Copyright (c) 2004 Christian Limpach.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by Christian Limpach.
 * 4. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "opt_xen.h"

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: clock.c,v 1.47 2008/10/21 15:46:32 cegger Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/timetc.h>
#include <sys/timevar.h>
#include <sys/kernel.h>
#include <sys/device.h>
#include <sys/sysctl.h>

#include <xen/xen.h>
#include <xen/hypervisor.h>
#include <xen/evtchn.h>
#include <machine/cpu_counter.h>

#include <dev/clock_subr.h>

/* Timer event handler; defined at the end of this file. */
static int xen_timer_handler(void *, struct intrframe *);

/*
 * A timecounter: Xen system_time extrapolated with a TSC.
 *
 * xen_get_timecount() returns the low bits of a nanosecond count, hence
 * the 1 GHz frequency and the all-ones counter mask below.
 */
u_int xen_get_timecount(struct timecounter*);
static struct timecounter xen_timecounter = {
	.tc_get_timecount = xen_get_timecount,
	.tc_poll_pps = NULL,
	.tc_counter_mask = ~0U,
	.tc_frequency = 1000000000ULL,
	.tc_name = "xen_system_time",
	.tc_quality = 10000 /*
			     * This needs to take precedence over any hardware
			     * timecounters (e.g., ACPI in Xen3 dom0), because
			     * they can't correct for Xen scheduling latency.
			     */
};

/*
 * These are periodically updated in shared_info, and then copied here
 * by get_time_values_from_xen().
 */
static volatile uint64_t shadow_tsc_stamp;	/* TSC value at last update */
static volatile uint64_t shadow_system_time;	/* system time at last update */
static volatile unsigned long shadow_time_version; /* XXXSMP */
static volatile uint32_t shadow_freq_mul;	/* TSC->ns multiplier (Xen3) */
static volatile int8_t shadow_freq_shift;	/* TSC->ns pre-shift (Xen3) */
static volatile struct timespec shadow_ts;	/* wall-clock value from Xen */

/* The time when the last hardclock(9) call should have taken place. */
static volatile uint64_t processed_system_time;

/*
 * The clock (as returned by xen_get_timecount) may need to be held
 * back to maintain the illusion that hardclock(9) was called when it
 * was supposed to be, not when Xen got around to scheduling us.
 * Set by xen_timer_handler() while it catches up, and cleared afterwards.
 */
static volatile uint64_t xen_clock_bias = 0;

#ifdef DOM0OPS
/*
 * If we're dom0, send our time to Xen every minute or so.
 * The interval is in ticks; 0 disables the periodic push.
 */
int xen_timepush_ticks = 0;
static callout_t xen_timepush_co;
#endif

/* Nanoseconds per hardclock tick; re-evaluated from hz at every use. */
#define NS_PER_TICK (1000000000ULL/hz)

/*
 * Reads a consistent set of time-base values from Xen, into a shadow data
 * area.  Must be called at splhigh (per timecounter requirements).
 *
 * Each group of values is read between two samples of a version counter
 * and the read is retried until both samples agree, so the shadow copy
 * never mixes fields from two different updates.
 */
static void
get_time_values_from_xen(void)
{
#ifdef XEN3
	volatile struct vcpu_time_info *t = &curcpu()->ci_vcpu->time;
	uint32_t tversion;

	/*
	 * Per-VCPU time base.  Retry while the version is odd (presumably
	 * an update in progress -- confirm against the Xen interface) or
	 * while it changed during the copy.
	 */
	do {
		shadow_time_version = t->version;
		x86_lfence();
		shadow_tsc_stamp = t->tsc_timestamp;
		shadow_system_time = t->system_time;
		shadow_freq_mul = t->tsc_to_system_mul;
		shadow_freq_shift = t->tsc_shift;
		x86_lfence();
	} while ((t->version & 1) || (shadow_time_version != t->version));
	/* Wall-clock base, guarded by its own version counter. */
	do {
		tversion = HYPERVISOR_shared_info->wc_version;
		x86_lfence();
		shadow_ts.tv_sec = HYPERVISOR_shared_info->wc_sec;
		shadow_ts.tv_nsec = HYPERVISOR_shared_info->wc_nsec;
		x86_lfence();
	} while ((HYPERVISOR_shared_info->wc_version & 1) ||
	    (tversion != HYPERVISOR_shared_info->wc_version));
#else /* XEN3 */
	/*
	 * Xen2: one pair of counters covers everything; time_version2 is
	 * sampled first and compared against time_version1 afterwards.
	 */
	do {
		shadow_time_version = HYPERVISOR_shared_info->time_version2;
		x86_lfence();
		shadow_ts.tv_sec = HYPERVISOR_shared_info->wc_sec;
		shadow_ts.tv_nsec = HYPERVISOR_shared_info->wc_usec;
		shadow_tsc_stamp = HYPERVISOR_shared_info->tsc_timestamp;
		shadow_system_time = HYPERVISOR_shared_info->system_time;
		x86_lfence();
	} while (shadow_time_version != HYPERVISOR_shared_info->time_version1);
	/* wc_usec was stored above in microseconds; convert to nanoseconds. */
	shadow_ts.tv_nsec *= 1000;
#endif
}

/*
 * Reports whether the shadow copy still matches what Xen publishes:
 * nonzero if the version recorded by get_time_values_from_xen() equals
 * the version currently visible, zero if Xen has updated since.
 */
static inline int
time_values_up_to_date(void)
{
	int current;

	x86_lfence();
#ifdef XEN3
	current = shadow_time_version == curcpu()->ci_vcpu->time.version;
#else
	current = shadow_time_version == HYPERVISOR_shared_info->time_version1;
#endif
	x86_lfence();

	return current;
}

#ifdef XEN3
/*
 * Convert a cycle count into nanoseconds using the multiplier and shift
 * that Xen 3 supplies, avoiding a (slow) divide instruction.
 *
 * delta is first shifted by `shift' (left if non-negative, right by
 * -shift otherwise), then multiplied by the 32-bit fraction mul_frac;
 * the result is that product with its low 32 bits dropped.
 */
static inline uint64_t
scale_delta(uint64_t delta, uint32_t mul_frac, int8_t shift)
{
	uint32_t hi, lo;

	delta = (shift < 0) ? (delta >> -shift) : (delta << shift);

	/*
	 * Split the 64-bit operand into halves so that each multiply is a
	 * plain 32*32=>64-bit operation; the low half's product only
	 * contributes its upper 32 bits to the result.
	 */
	hi = (uint32_t)(delta >> 32);
	lo = (uint32_t)(delta & 0xFFFFFFFF);

	return ((uint64_t)hi * mul_frac) + (((uint64_t)lo * mul_frac) >> 32);
}
#endif

/*
 * Use cycle counter to determine ns elapsed since last Xen time update.
 * Must be called at splhigh (per timecounter requirements).
 *
 * Returns the number of nanoseconds corresponding to the cycles counted
 * since shadow_tsc_stamp.
 */
static uint64_t
get_tsc_offset_ns(void)
{
	uint64_t tsc_delta, offset;
#ifndef XEN3
	struct cpu_info *ci = curcpu();
#endif

	tsc_delta = cpu_counter() - shadow_tsc_stamp;
#ifndef XEN3
	/*
	 * NOTE(review): the product wraps 64 bits once tsc_delta exceeds
	 * roughly 1.8e10 cycles.
	 */
	offset = tsc_delta * 1000000000ULL / cpu_frequency(ci);
#else
	offset = scale_delta(tsc_delta, shadow_freq_mul,
	    shadow_freq_shift);
#endif
#ifdef XEN_CLOCK_DEBUG
	/* uint64_t is not always unsigned long long; print with PRIu64. */
	if (tsc_delta > 100000000000ULL || offset > 10000000000ULL)
		printf("get_tsc_offset_ns: tsc_delta=%"PRIu64
		    " offset=%"PRIu64" pst=%"PRIu64" sst=%"PRIu64"\n",
		    tsc_delta, offset,
		    processed_system_time, shadow_system_time);
#endif

	return offset;
}

/*
 * Returns the current system_time, taking care that the timestamp
 * used is valid for the TSC measurement in question.  Xen2 doesn't
 * ensure that this won't step backwards, so we enforce monotonicity
 * on our own in that case.  Must be called at splhigh.
 *
 * The result is shadow_system_time plus the TSC-derived offset, in
 * nanoseconds.  If the shadow values turn out to be stale they are
 * re-read from Xen and the computation is repeated.
 */
static uint64_t
get_system_time(void)
{
#ifndef XEN3
	/* Last value returned, used to clamp backward steps. */
	static volatile uint64_t oldstime = 0;
#endif
	uint64_t offset, stime;

	for (;;) {
		offset = get_tsc_offset_ns();
		stime = shadow_system_time + offset;

		/* if the timestamp went stale before we used it, refresh */
		if (time_values_up_to_date()) {
#ifndef XEN3
			/*
			 * Work around an intermittent Xen2 bug where, for
			 * a period of 1<<32 ns, currently running domains
			 * don't get their timer events as usual (and also
			 * aren't preempted in favor of other runnable
			 * domains).  Setting the timer into the past in
			 * this way causes it to fire immediately.
			 */
			if (offset > 4*10000000ULL) {
#ifdef XEN_CLOCK_DEBUG
				printf("get_system_time: overlarge offset %"
				    PRIu64" (pst=%"PRIu64" sst=%"PRIu64
				    "); poking timer...\n",
				    offset, processed_system_time,
				    shadow_system_time);
#endif
				HYPERVISOR_set_timer_op(shadow_system_time);
			}
#endif
			break;
		}
		get_time_values_from_xen();
	}

#ifndef XEN3
	/* Never hand out a value smaller than the previous one. */
	if (stime < oldstime) {
#ifdef XEN_CLOCK_DEBUG
		printf("get_system_time: system_time backstep: %"
		    PRIu64" -> %"PRIu64" (%"PRIu64" ns)\n",
		    oldstime, stime, oldstime-stime);
#endif
		stime = oldstime;
	}
	oldstime = stime;
#endif

	return stime;
}

/*
 * Store the current wall-clock time in *wt.
 *
 * The shadow values are refreshed at splhigh first.  Under Xen3 the
 * value Xen publishes is the wall time less system time, so the current
 * system time is added and the sum is split back into seconds and
 * nanoseconds.  Under Xen2 the published value is used as is.
 */
static void
xen_wall_time(struct timespec *wt)
{
	int ipl;
#ifdef XEN3
	uint64_t total_ns;
#endif

	ipl = splhigh();
	get_time_values_from_xen();
	*wt = shadow_ts;
#ifdef XEN3
	total_ns = wt->tv_nsec;
	total_ns += get_system_time();
	splx(ipl);

	/* Carry whole seconds out of the nanosecond total. */
	wt->tv_sec += total_ns / 1000000000L;
	wt->tv_nsec = total_ns % 1000000000L;
#else
	splx(ipl);
#endif
}

/*
 * todr "get time" hook: fill *tvp with the Xen wall-clock time,
 * truncating nanoseconds to microseconds.  The todr handle is unused.
 * Always returns 0.
 */
static int
xen_rtc_get(todr_chip_handle_t todr, volatile struct timeval *tvp)
{
	struct timespec now;

	xen_wall_time(&now);

	tvp->tv_sec = now.tv_sec;
	tvp->tv_usec = now.tv_nsec / 1000;
	return 0;
}

/*
 * todr "set time" hook.  In DOM0OPS kernels running as a privileged
 * domain, hand the new wall-clock time to the hypervisor together with
 * the system time it corresponds to; otherwise do nothing.  The result
 * of the hypercall is not examined.  Always returns 0.
 */
static int
xen_rtc_set(todr_chip_handle_t todr, volatile struct timeval *tvp)
{
#ifdef DOM0OPS
	dom0_op_t settime_op;
	int ipl;

	if (!xendomain_is_privileged())
		return 0;

	settime_op.cmd = DOM0_SETTIME;
	/* XXX is rtc_offset handled correctly everywhere? */
	settime_op.u.settime.secs = tvp->tv_sec;
#ifdef XEN3
	settime_op.u.settime.nsecs = tvp->tv_usec * 1000;
#else
	settime_op.u.settime.usecs = tvp->tv_usec;
#endif

	/* get_system_time() must be called at splhigh. */
	ipl = splhigh();
	settime_op.u.settime.system_time = get_system_time();
	splx(ipl);

	HYPERVISOR_dom0_op(&settime_op);
#endif

	return 0;
}

void
startrtclock(void)
{
	static struct todr_chip_handle	tch;
	tch.todr_gettime = xen_rtc_get;
	tch.todr_settime = xen_rtc_set;
	tch.todr_setwen = NULL;

	todr_attach(&tch);
}

/*
 * Wait approximately `n' microseconds.
 *
 * Delays shorter than half a second spin on the CPU cycle counter;
 * longer ones poll the system time published by Xen.
 */
void
xen_delay(unsigned int n)
{
	if (n < 500000) {
		/*
		 * shadow_system_time is updated every hz tick, it's not
		 * precise enough for short delays. Use the CPU counter
		 * instead. We assume it's working at this point.
		 */
		uint64_t cc, cc2, when;
		struct cpu_info *ci = curcpu();

		cc = cpu_counter();
		when = cc + (uint64_t)n * cpu_frequency(ci) / 1000000LL;
		if (when < cc) {
			/* wait for counter to wrap */
			cc2 = cpu_counter();
			while (cc2 > cc)
				cc2 = cpu_counter();
		}
		cc2 = cpu_counter();
		while (cc2 < when)
			cc2 = cpu_counter();

		return;
	} else {
		uint64_t when;
		int s;

		/* for large delays, shadow_system_time is OK */
		s = splhigh();
		get_time_values_from_xen();
		/*
		 * Widen n before converting to nanoseconds: in unsigned
		 * int arithmetic n * 1000 wraps for delays longer than
		 * about 4.29 seconds, which would end the wait early.
		 */
		when = shadow_system_time + (uint64_t)n * 1000;
		while (shadow_system_time < when) {
			/* Briefly drop the IPL between polls. */
			splx(s);
			s = splhigh();
			get_time_values_from_xen();
		}
		splx(s);
	}
}

#ifdef DOM0OPS
/*
 * Callout handler: push the time of day via resettodr(), then re-arm
 * the callout passed in `arg' unless the interval has been set to 0.
 */
/* ARGSUSED */
static void
xen_timepush(void *arg)
{
	callout_t *self = arg;

	resettodr();

	if (xen_timepush_ticks <= 0)
		return;
	callout_schedule(self, xen_timepush_ticks);
}

/*
 * sysctl handler for xen_timepush_ticks.
 *
 * The lookup runs against a local copy of the value.  A write of a
 * negative interval is rejected with EINVAL.  Otherwise a changed value
 * is stored and the callout is rescheduled (positive interval) or
 * stopped (zero).  Returns 0 on success or the error from the lookup.
 */
/* ARGSUSED */
static int
sysctl_xen_timepush(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	int requested, error;

	requested = xen_timepush_ticks;
	node = *rnode;
	node.sysctl_data = &requested;

	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;

	if (requested < 0)
		return EINVAL;
	if (requested == xen_timepush_ticks)
		return 0;

	xen_timepush_ticks = requested;
	if (requested > 0)
		callout_schedule(&xen_timepush_co, requested);
	else
		callout_stop(&xen_timepush_co);

	return 0;
}
#endif

/* ARGSUSED */
u_int
xen_get_timecount(struct timecounter *tc)
{
	uint64_t ns;
	int s;
	
	s = splhigh();
	ns = get_system_time() - xen_clock_bias;
	splx(s);

	return (u_int)ns;
}

/*
 * Set up the Xen clock: bind the timer VIRQ to an event channel, take
 * an initial time reading, register the timecounter and install the
 * timer handler.  In DOM0OPS kernels, also set the default time-push
 * interval and, in a privileged domain, create its sysctl knob and
 * start the periodic push.
 */
void
xen_initclocks(void)
{
	int evtch;

#ifdef DOM0OPS
	callout_init(&xen_timepush_co, 0);
#endif
	evtch = bind_virq_to_evtch(VIRQ_TIMER);
	aprint_verbose("Xen clock: using event channel %d\n", evtch);

	/* Baseline from which xen_timer_handler() counts elapsed ticks. */
	get_time_values_from_xen();
	processed_system_time = shadow_system_time;
	tc_init(&xen_timecounter);
	/* The splhigh requirements start here. */

	/*
	 * NOTE(review): the handler takes (void *, struct intrframe *) but
	 * is registered through a one-argument function pointer type;
	 * presumably the event dispatcher supplies the frame -- confirm.
	 */
	event_set_handler(evtch, (int (*)(void *))xen_timer_handler,
	    NULL, IPL_CLOCK, "clock");
	hypervisor_enable_event(evtch);

#ifdef DOM0OPS
	xen_timepush_ticks = 53 * hz + 3; /* avoid exact # of min/sec */
	if (xendomain_is_privileged()) {
		/* The return value of sysctl_createv() is not checked. */
		sysctl_createv(NULL, 0, NULL, NULL, CTLFLAG_READWRITE,
		    CTLTYPE_INT, "xen_timepush_ticks", SYSCTL_DESCR("How often"
		    " to update the hypervisor's time-of-day; 0 to disable"),
		    sysctl_xen_timepush, 0, &xen_timepush_ticks, 0, 
		    CTL_MACHDEP, CTL_CREATE, CTL_EOL);
		callout_reset(&xen_timepush_co, xen_timepush_ticks,
		    &xen_timepush, &xen_timepush_co);
	}
#endif
}

/*
 * Timer event handler.
 *
 * Measures how far the system time has advanced past
 * processed_system_time and calls hardclock(9) once per whole tick of
 * that distance.  While catching up, xen_clock_bias holds the amount
 * still outstanding so that xen_get_timecount() reports the time each
 * tick should have happened; it is cleared once the loop is done.
 * Always returns 0.
 */
/* ARGSUSED */
static int
xen_timer_handler(void *arg, struct intrframe *regs)
{
	int64_t behind;
	int ipl;
	int nticks = 0;

	/*
	 * No explicit refresh of the shadow values is needed here:
	 * get_system_time() re-reads them itself when they are stale.
	 */
	ipl = splhigh();
	behind = (int64_t)(get_system_time() - processed_system_time);
	splx(ipl);

	/* Several ticks may have passed without our being run; catch up. */
	for (; behind >= (int64_t)NS_PER_TICK; nticks++) {
		ipl = splhigh();
		processed_system_time += NS_PER_TICK;
		behind -= NS_PER_TICK;
		xen_clock_bias = behind;
		splx(ipl);
		hardclock((struct clockframe *)regs);
	}

	if (xen_clock_bias != 0) {
		ipl = splhigh();
		xen_clock_bias = 0;
		splx(ipl);
	}

	if (nticks == 0)
		return 0;

	/*
	 * Re-arm the timer here, if needed; Xen's auto-ticking while runnable
	 * is useful only for HZ==100, and even then may be out of phase with
	 * the processed_system_time steps.
	 */
	HYPERVISOR_set_timer_op(processed_system_time + NS_PER_TICK);

	return 0;
}

/*
 * Intentionally empty: the requested rate is ignored.
 * NOTE(review): presumably there is no separate statistics clock to
 * reprogram on this port -- confirm.
 */
void
setstatclockrate(int arg)
{
}

/*
 * Idle the CPU until the next event: arm the Xen timer for the next
 * expected tick and block in the hypervisor.  If the timer cannot be
 * armed, do not block; call __sti() instead (presumably re-enabling
 * event delivery -- confirm).
 */
void
idle_block(void)
{
	int rc;

	/*
	 * We set the timer to when we expect the next timer
	 * interrupt.  We could set the timer to later if we could
	 * easily find out when we will have more work (callouts) to
	 * process from hardclock.
	 */
	rc = HYPERVISOR_set_timer_op(processed_system_time + NS_PER_TICK);
	if (rc != 0) {
		__sti();
		return;
	}

	HYPERVISOR_block();
}