OpenSolaris_b135/uts/i86xpv/os/xen_mmu.c

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */


#include <sys/mach_mmu.h>
#include <sys/machsystm.h>
#include <sys/cmn_err.h>
#include <sys/promif.h>
#include <sys/hypervisor.h>
#include <sys/bootconf.h>
#include <sys/ontrap.h>
#include <sys/rwlock.h>
#include <sys/sysmacros.h>
#include <vm/seg_kmem.h>
#include <vm/kboot_mmu.h>
#include <vm/hat_pte.h>
#include <vm/hat.h>
#include <vm/htable.h>
#include <vm/hat_i86.h>

start_info_t *xen_info;
ulong_t mfn_count;
mfn_t *mfn_list;
mfn_t *mfn_list_pages;		/* pages that make a table of mfn's */
				/* that make up the pa_to_ma table */
mfn_t *mfn_list_pages_page;	/* page of mfn's for mfn_list_pages */
mfn_t cached_max_mfn;
uintptr_t xen_virt_start;
pfn_t *mfn_to_pfn_mapping;
caddr_t xb_addr;		/* virtual addr for the store_mfn page */


/*
 * We need to prevent migration or suspension of a domU while it's
 * manipulating MFN values, as the MFN values will spontaneously
 * change. The next 4 routines provide a mechanism for that.
 * The basic idea is to use a reader/writer lock: readers are any
 * threads that are manipulating MFNs. Only the thread which is going to
 * actually call HYPERVISOR_suspend() becomes a writer.
 *
 * Since various places need to manipulate MFNs and also call the HAT,
 * we track whether a thread has acquired reader status and allow it to
 * recursively do so again. This prevents deadlocks if a migration request
 * is started and waits for some reader, but then the previous reader needs
 * to call into the HAT.
 */
#define	NUM_M2P_LOCKS 128
static struct {
	krwlock_t m2p_rwlock;
	char m2p_pad[64 - sizeof (krwlock_t)];	/* 64 byte cache line size */
} m2p_lock[NUM_M2P_LOCKS];

#define	XM2P_HASH	((uintptr_t)curthread->t_tid & (NUM_M2P_LOCKS - 1))
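
/*
 * Illustrative usage from a reader (a sketch, not taken from a real
 * caller):
 *
 *	xen_block_migrate();
 *	mfn = pfn_to_mfn(pfn);		(stable while reader lock is held)
 *	... use mfn ...
 *	xen_allow_migrate();
 *
 * The suspend/migrate path brackets its work with xen_start_migrate() and
 * xen_end_migrate(), which take every lock in m2p_lock[] as writer.
 */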

void
xen_block_migrate(void)
{
	if (!DOMAIN_IS_INITDOMAIN(xen_info) &&
	    ++curthread->t_xpvcntr == 1)
		rw_enter(&m2p_lock[XM2P_HASH].m2p_rwlock, RW_READER);
}

void
xen_allow_migrate(void)
{
	if (!DOMAIN_IS_INITDOMAIN(xen_info) &&
	    --curthread->t_xpvcntr == 0)
		rw_exit(&m2p_lock[XM2P_HASH].m2p_rwlock);
}

void
xen_start_migrate(void)
{
	int i;

	ASSERT(curthread->t_xpvcntr == 0);
	++curthread->t_xpvcntr; /* this allows calls into HAT */
	for (i = 0; i < NUM_M2P_LOCKS; ++i)
		rw_enter(&m2p_lock[i].m2p_rwlock, RW_WRITER);
}

void
xen_end_migrate(void)
{
	int i;

	for (i = 0; i < NUM_M2P_LOCKS; ++i)
		rw_exit(&m2p_lock[i].m2p_rwlock);
	ASSERT(curthread->t_xpvcntr == 1);
	--curthread->t_xpvcntr;
}
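
/*
 * Update one page table entry via the hypervisor.  A PV domain's page
 * tables are write protected by Xen, so instead of storing pteval
 * directly we hand the machine address of the entry and the new value
 * to HYPERVISOR_mmu_update().
 */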

/*ARGSUSED*/
void
set_pteval(paddr_t table, uint_t index, uint_t level, x86pte_t pteval)
{
	mmu_update_t t;
	maddr_t mtable = pa_to_ma(table);
	int retcnt;

	t.ptr = (mtable + index * pte_size) | MMU_NORMAL_PT_UPDATE;
	t.val = pteval;
	if (HYPERVISOR_mmu_update(&t, 1, &retcnt, DOMID_SELF) || retcnt != 1)
		bop_panic("HYPERVISOR_mmu_update() failed");
}

/*
 * The start_info_t and mfn_list are initially mapped in low "boot" memory.
 * Each has a page aligned address and size. We relocate them up into the
 * kernel's normal address space at this point in time. We also create
 * the arrays that let the hypervisor suspend/resume a domain.
 */
void
xen_relocate_start_info(void)
{
	maddr_t mach_addr;
	size_t sz;
	size_t sz2;
	offset_t off;
	uintptr_t addr;
	uintptr_t old;
	int i, j;

	/*
	 * In dom0, we have to account for the console_info structure
	 * which might immediately follow the start_info in memory.
	 */
	sz = sizeof (start_info_t);
	if (DOMAIN_IS_INITDOMAIN(xen_info) &&
	    xen_info->console.dom0.info_off >= sizeof (start_info_t)) {
		sz += xen_info->console.dom0.info_off - sizeof (start_info_t) +
		    xen_info->console.dom0.info_size;
	}
	sz = P2ROUNDUP(sz, MMU_PAGESIZE);
	addr = (uintptr_t)vmem_alloc(heap_arena, sz, VM_SLEEP);
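	/*
	 * Map the new range page by page; the underlying machine pages are
	 * not necessarily contiguous, so each page's machine address is
	 * looked up separately.
	 */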
	for (off = 0; off < sz; off += MMU_PAGESIZE) {
		mach_addr = pa_to_ma(pfn_to_pa(va_to_pfn(
		    (caddr_t)xen_info + off)));
		kbm_map_ma(mach_addr, addr + off, 0);
	}
	boot_mapin((caddr_t)addr, sz);
	old = (uintptr_t)xen_info;
	xen_info = (start_info_t *)addr;
	for (off = 0; off < sz; off += MMU_PAGESIZE)
		kbm_unmap(old + off);

	/*
	 * Relocate the mfn_list, any number of pages.
	 */
	sz = P2ROUNDUP(mfn_count * sizeof (mfn_t), MMU_PAGESIZE);
	addr = (uintptr_t)vmem_xalloc(heap_arena, sz, MMU_PAGESIZE, 0,
	    0, 0, 0, VM_SLEEP);
	for (off = 0; off < sz; off += MMU_PAGESIZE) {
		mach_addr =
		    pa_to_ma(pfn_to_pa(va_to_pfn((caddr_t)mfn_list + off)));
		kbm_map_ma(mach_addr, addr + off, 0);
	}
	boot_mapin((caddr_t)addr, sz);
	old = (uintptr_t)mfn_list;
	mfn_list = (mfn_t *)addr;
	xen_info->mfn_list = (mfn_t)addr;
	for (off = 0; off < sz; off += MMU_PAGESIZE)
		kbm_unmap(old + off);

	/*
	 * Create the lists of mfn_list pages needed by suspend/resume.
	 * Note we skip this for domain 0 as it can't suspend/resume.
	 */
	if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
		sz2 = P2ROUNDUP(mmu_btop(sz) * sizeof (mfn_t), MMU_PAGESIZE);
		mfn_list_pages = kmem_zalloc(sz2, KM_SLEEP);
		mfn_list_pages_page = kmem_zalloc(MMU_PAGESIZE, KM_SLEEP);
		i = 0;
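		/*
		 * mfn_list_pages[j] holds the MFN of the j'th page of
		 * mfn_list; each page worth of those entries in turn has
		 * its own MFN recorded in mfn_list_pages_page[].  With 4K
		 * pages and 8 byte mfn_t values (the 64-bit case), that is
		 * one mfn_list_pages_page[] entry per 512 pages of mfn_list.
		 */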
		for (off = 0; off < sz; off += MMU_PAGESIZE) {
			j = mmu_btop(off);
			if (((j * sizeof (mfn_t)) & MMU_PAGEOFFSET) == 0) {
				mfn_list_pages_page[i++] =
				    pfn_to_mfn(va_to_pfn(&mfn_list_pages[j]));
			}
			mfn_list_pages[j] =
			    pfn_to_mfn(va_to_pfn((caddr_t)mfn_list + off));
		}
		HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
		    pfn_to_mfn(va_to_pfn(mfn_list_pages_page));
		HYPERVISOR_shared_info->arch.max_pfn = xen_info->nr_pages;
	}

	/*
	 * Remap the shared info (for I/O) into high memory, too.
	 */
	sz = MMU_PAGESIZE;
	addr = (uintptr_t)vmem_alloc(heap_arena, sz, VM_SLEEP);
	kbm_map_ma(xen_info->shared_info, addr, 0);
	/* shared info has no PFN so don't do: boot_mapin((caddr_t)addr, sz) */
	old = (uintptr_t)HYPERVISOR_shared_info;
	HYPERVISOR_shared_info = (void *)addr;
	kbm_unmap(old);

	/*
	 * Remap the console info into high memory, too.
	 */
	if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
		sz = MMU_PAGESIZE;
		addr = (uintptr_t)vmem_alloc(heap_arena, sz, VM_SLEEP);
		kbm_map_ma(mfn_to_ma(xen_info->console.domU.mfn), addr, 0);
		boot_mapin((caddr_t)addr, sz);
		old = (uintptr_t)HYPERVISOR_console_page;
		HYPERVISOR_console_page = (void *)addr;
		kbm_unmap(old);
	} else {
		HYPERVISOR_console_page = NULL;
	}

	/*
	 * On domUs we need to have the xenbus page (store_mfn) mapped into
	 * the kernel. This is referenced as xb_addr.
	 */
	if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
		xb_addr = vmem_alloc(heap_arena, MMU_PAGESIZE, VM_SLEEP);
		kbm_map_ma(mfn_to_ma(xen_info->store_mfn),
		    (uintptr_t)xb_addr, 0);
		boot_mapin(xb_addr, MMU_PAGESIZE);
	}
}

/*
 * Generate the pfn value to use for a foreign mfn. A foreign page has no
 * entry in mfn_list[], so we manufacture a pfn by setting the
 * PFN_IS_FOREIGN_MFN bit above the mfn value.
 */
pfn_t
xen_assign_pfn(mfn_t mfn)
{
	pfn_t pfn;

#ifdef DEBUG
	/*
	 * make sure this MFN isn't in our list of MFNs
	 */
	on_trap_data_t otd;
	uint_t	on_trap_ready = (t0.t_stk != NULL);

	if (on_trap_ready) {
		if (on_trap(&otd, OT_DATA_ACCESS) == 0) {
			pfn = mfn_to_pfn_mapping[mfn];
			if (pfn < mfn_count && mfn_list[pfn] == mfn)
				panic("xen_assign_pfn() mfn belongs to us");
		}
		no_trap();
	}
#endif /* DEBUG */

	if (mfn == MFN_INVALID)
		panic("xen_assign_pfn(MFN_INVALID) not allowed");
	pfn = (pfn_t)mfn | PFN_IS_FOREIGN_MFN;
	if (pfn == mfn)
		panic("xen_assign_pfn(mfn) PFN_IS_FOREIGN_MFN bit already set");
	return (pfn);
}

void
xen_release_pfn(pfn_t pfn)
{
	if (pfn == PFN_INVALID)
		panic("xen_release_pfn(PFN_INVALID) not allowed");
	if ((pfn & PFN_IS_FOREIGN_MFN) == 0)
		panic("mfn high bit not set");
}

uint_t
pfn_is_foreign(pfn_t pfn)
{
	if (pfn == PFN_INVALID)
		return (0);
	return ((pfn & PFN_IS_FOREIGN_MFN) != 0);
}
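
/*
 * Translate the mfn in a PTE back to a pfn.  PTEs that map foreign pages
 * carry PT_FOREIGN in their software bits, so the foreign pfn encoding
 * can be regenerated without consulting the M2P table.
 */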

pfn_t
pte2pfn(x86pte_t pte, level_t l)
{
	mfn_t mfn = PTE2MFN(pte, l);

	if ((pte & PT_SOFTWARE) >= PT_FOREIGN)
		return ((pfn_t)mfn | PFN_IS_FOREIGN_MFN);
	return (mfn_to_pfn(mfn));
}

mfn_t
pfn_to_mfn(pfn_t pfn)
{
	if (pfn == PFN_INVALID)
		panic("pfn_to_mfn(PFN_INVALID) not allowed");

	if (pfn & PFN_IS_FOREIGN_MFN)
		return (pfn & ~PFN_IS_FOREIGN_MFN);

	if (pfn >= mfn_count)
		panic("pfn_to_mfn(): illegal PFN 0x%lx", pfn);

	return (mfn_list[pfn]);
}

/*
 * This routine translates an MFN back into the corresponding PFN value.
 * It has to be careful since the mfn_to_pfn_mapping[] might fault
 * as that table is sparse. It also has to check for non-faulting, but
 * out of range MFNs that exceed the table.
 */
pfn_t
mfn_to_pfn(mfn_t mfn)
{
	pfn_t pfn;
	on_trap_data_t otd;
	uint_t	on_trap_ready = (t0.t_stk != NULL);

	/*
	 * Cleared at a suspend or migrate
	 */
	if (cached_max_mfn == 0)
		cached_max_mfn =
		    HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);

	if (cached_max_mfn < mfn)
		return ((pfn_t)mfn | PFN_IS_FOREIGN_MFN);

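	/*
	 * mfn_to_pfn_mapping[] is the sparse, hypervisor-provided M2P
	 * table, so the lookup below may fault; on_trap() catches that and
	 * the MFN is treated as foreign.  Early in boot, before t0 is
	 * initialized, on_trap() isn't usable and the table is read bare.
	 */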
	if (on_trap_ready && on_trap(&otd, OT_DATA_ACCESS)) {
		pfn = (pfn_t)mfn | PFN_IS_FOREIGN_MFN;
	} else {
		pfn = mfn_to_pfn_mapping[mfn];

		if (pfn == PFN_INVALID || pfn >= mfn_count ||
		    pfn_to_mfn(pfn) != mfn)
			pfn = (pfn_t)mfn | PFN_IS_FOREIGN_MFN;
	}

	if (on_trap_ready)
		no_trap();

	/*
	 * If khat_running is set then we should be checking
	 * in domUs that migration is blocked while using the
	 * mfn_to_pfn_mapping[] table.
	 */
	ASSERT(!khat_running || DOMAIN_IS_INITDOMAIN(xen_info) ||
	    rw_read_held(&m2p_lock[XM2P_HASH].m2p_rwlock));

	return (pfn);
}

/*
 * From a pseudo-physical address, find the corresponding machine address.
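 *
 * For example (illustrative numbers, 4K pages): pa 0x5003 is pfn 0x5 with
 * page offset 0x3; if mfn_list[0x5] == 0x2a5, the result is
 * mfn_to_ma(0x2a5) + 0x3, i.e. 0x2a5003.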
 */
maddr_t
pa_to_ma(paddr_t pa)
{
	mfn_t mfn = pfn_to_mfn(mmu_btop(pa));

	if (mfn == MFN_INVALID)
		panic("pa_to_ma() got MFN_INVALID");
	return (mfn_to_ma(mfn) + (pa & MMU_PAGEOFFSET));
}

/*
 * From a machine address, find the corresponding pseudo-physical address.
 */
paddr_t
ma_to_pa(maddr_t ma)
{
	pfn_t pfn = mfn_to_pfn(mmu_btop(ma));

	if (pfn == PFN_INVALID)
		panic("ma_to_pa() got PFN_INVALID");
	return (pfn_to_pa(pfn) + (ma & MMU_PAGEOFFSET));
}

/*
 * When calling reassign_pfn(), the page must be (at least) read locked
 * to make sure swrand does not try to grab it.
 */
#ifdef DEBUG
#define	CHECK_PAGE_LOCK(pfn)	{			\
	page_t *pp = page_numtopp_nolock(pfn);		\
	if ((pp != NULL) && (!PAGE_LOCKED(pp))) {	\
		panic("reassign_pfn() called with unlocked page (pfn 0x%lx)", \
		    pfn);				\
	}						\
}
#else	/* DEBUG */
#define	CHECK_PAGE_LOCK(pfn)
#endif	/* DEBUG */

/*
 * Reassign a new machine page to back a physical address.
 */
void
reassign_pfn(pfn_t pfn, mfn_t mfn)
{
	int mmu_update_return;
	mmu_update_t t;
	extern void update_contig_pfnlist(pfn_t, mfn_t, mfn_t);

	ASSERT(pfn != PFN_INVALID);
	ASSERT(!pfn_is_foreign(pfn));

	ASSERT(pfn < mfn_count);
	update_contig_pfnlist(pfn, mfn_list[pfn], mfn);
	if (mfn == MFN_INVALID) {
		CHECK_PAGE_LOCK(pfn);
		if (kpm_vbase != NULL && xen_kpm_page(pfn, 0) < 0)
			panic("reassign_pfn(): failed to remove kpm mapping");
		mfn_list[pfn] = mfn;
		return;
	}

	/*
	 * Verify that previously given away pages are still page locked.
	 */
	if (mfn_list[pfn] == MFN_INVALID) {
		CHECK_PAGE_LOCK(pfn);
	}
	mfn_list[pfn] = mfn;

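	/*
	 * Ask the hypervisor to update its machine-to-physical table so
	 * that the new mfn translates back to this pfn.
	 */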
	t.ptr = mfn_to_ma(mfn) | MMU_MACHPHYS_UPDATE;
	t.val = pfn;

	if (HYPERVISOR_mmu_update(&t, 1, &mmu_update_return, DOMID_SELF))
		panic("HYPERVISOR_mmu_update() failed");
	ASSERT(mmu_update_return == 1);

	if (kpm_vbase != NULL && xen_kpm_page(pfn, PT_VALID | PT_WRITABLE) < 0)
		panic("reassign_pfn(): failed to enable kpm mapping");
}