OpenSolaris_b135/cmd/mdb/intel/modules/mdb_kb/mdb_kb.c

Compare this file to the similar file:
Show the results in this format:

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * KVM backend for hypervisor domain dumps.  We don't use libkvm for
 * such dumps, since they do not have a namelist file or the typical
 * dump structures we expect to aid bootstrapping.  Instead, we
 * bootstrap based upon a debug_info structure at a known VA, using the
 * guest's own page tables to resolve to physical addresses, and
 * construct the namelist in a manner similar to ksyms_snapshot().
 *
 * Note that there are two formats understood by this module: the older,
 * ad hoc format, which we call 'core' within this file, and an
 * ELF-based format, known as 'elf'.
 *
 * We only support the older format generated on Solaris dom0: before we
 * fixed it, core dump files were broken whenever a PFN didn't map a
 * real MFN (!).
 */

#include <strings.h>
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <stdarg.h>
#include <unistd.h>
#include <fcntl.h>
#include <gelf.h>
#include <errno.h>

#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/debug_info.h>
#include <sys/xen_mmu.h>
#include <sys/elf.h>
#include <sys/machelf.h>
#include <sys/modctl.h>
#include <sys/kobj.h>
#include <sys/kobj_impl.h>
#include <sys/sysmacros.h>
#include <sys/privmregs.h>
#include <vm/as.h>

#include <mdb/mdb_io.h>
#include <mdb/mdb_kb.h>
#include <mdb/mdb_target_impl.h>

#include <xen/public/xen.h>
#include <xen/public/version.h>
#include <xen/public/elfnote.h>

#define	XKB_SHDR_NULL 0
#define	XKB_SHDR_SYMTAB 1
#define	XKB_SHDR_STRTAB 2
#define	XKB_SHDR_SHSTRTAB 3
#define	XKB_SHDR_NUM 4

#define	XKB_WALK_LOCAL 0x1
#define	XKB_WALK_GLOBAL 0x2
#define	XKB_WALK_STR 0x4
#define	XKB_WALK_ALL (XKB_WALK_LOCAL | XKB_WALK_GLOBAL | XKB_WALK_STR)

#if defined(__i386)
#define	DEBUG_INFO 0xf4bff000
#define	DEBUG_INFO_HVM 0xfe7ff000
#elif defined(__amd64)
#define	DEBUG_INFO 0xfffffffffb7ff000
#define	DEBUG_INFO_HVM 0xfffffffffb7ff000
#endif

#define	PAGE_SIZE 0x1000
#define	PAGE_SHIFT 12
#define	PAGE_OFFSET(a) ((a) & (PAGE_SIZE - 1))
#define	PAGE_MASK(a) ((a) & ~(PAGE_SIZE - 1))
#define	PAGE_ALIGNED(a) (((a) & (PAGE_SIZE -1)) == 0)
#define	PT_PADDR_LGPG 0x000fffffffffe000ull
#define	PT_PADDR 0x000ffffffffff000ull
#define	PT_VALID 0x1
#define	PT_PAGESIZE 0x080
#define	PTE_IS_LGPG(p, l) ((l) > 0 && ((p) & PT_PAGESIZE))

#define	XC_CORE_MAGIC 0xF00FEBED
#define	XC_CORE_MAGIC_HVM 0xF00FEBEE

#define	VGCF_HVM_GUEST (1<<1)

typedef struct xc_core_header {
	unsigned int xch_magic;
	unsigned int xch_nr_vcpus;
	unsigned int xch_nr_pages;
	unsigned int xch_ctxt_offset;
	unsigned int xch_index_offset;
	unsigned int xch_pages_offset;
} xc_core_header_t;

struct xc_elf_header {
	uint64_t xeh_magic;
	uint64_t xeh_nr_vcpus;
	uint64_t xeh_nr_pages;
	uint64_t xeh_page_size;
};

struct xc_elf_version {
	uint64_t xev_major;
	uint64_t xev_minor;
	xen_extraversion_t xev_extra;
	xen_compile_info_t xev_compile_info;
	xen_capabilities_info_t xev_capabilities;
	xen_changeset_info_t xev_changeset;
	xen_platform_parameters_t xev_platform_parameters;
	uint64_t xev_pagesize;
};

/*
 * Either an old-style (3.0.4) core format, or the ELF format.
 */
typedef enum {
	XKB_FORMAT_UNKNOWN = 0,
	XKB_FORMAT_CORE = 1,
	XKB_FORMAT_ELF = 2
} xkb_type_t;

typedef struct mfn_map {
	mfn_t mm_mfn;
	char *mm_map;
} mfn_map_t;

typedef struct mmu_info {
	size_t mi_max;
	size_t mi_shift[4];
	size_t mi_ptes;
	size_t mi_ptesize;
} mmu_info_t;

typedef struct xkb_core {
	xc_core_header_t xc_hdr;
	void *xc_p2m_buf;
} xkb_core_t;

typedef struct xkb_elf {
	mdb_gelf_file_t *xe_gelf;
	size_t *xe_off;
	struct xc_elf_header xe_hdr;
	struct xc_elf_version xe_version;
} xkb_elf_t;

typedef struct xkb {
	char *xkb_path;
	int xkb_fd;
	int xkb_is_hvm;

	xkb_type_t xkb_type;
	xkb_core_t xkb_core;
	xkb_elf_t xkb_elf;

	size_t xkb_nr_vcpus;
	size_t xkb_nr_pages;
	size_t xkb_pages_off;
	xen_pfn_t xkb_max_pfn;
	mfn_t xkb_max_mfn;
	int xkb_is_pae;

	mmu_info_t xkb_mmu;
	debug_info_t xkb_info;

	void *xkb_vcpu_data;
	size_t xkb_vcpu_data_sz;
	struct vcpu_guest_context **xkb_vcpus;

	char *xkb_pages;
	mfn_t *xkb_p2m;
	xen_pfn_t *xkb_m2p;
	mfn_map_t xkb_pt_map[4];
	mfn_map_t xkb_map;

	char *xkb_namelist;
	size_t xkb_namesize;
} xkb_t;

static const char xkb_shstrtab[] = "\0.symtab\0.strtab\0.shstrtab\0";

typedef struct xkb_namelist {
	Ehdr	kh_elf_hdr;
	Phdr	kh_text_phdr;
	Phdr	kh_data_phdr;
	Shdr	kh_shdr[XKB_SHDR_NUM];
	char	shstrings[sizeof (xkb_shstrtab)];
} xkb_namelist_t;

static int xkb_build_ksyms(xkb_t *);
static offset_t xkb_mfn_to_offset(xkb_t *, mfn_t);
static mfn_t xkb_va_to_mfn(xkb_t *, uintptr_t, mfn_t);
static ssize_t xkb_read(xkb_t *, uintptr_t, void *, size_t);
static int xkb_read_word(xkb_t *, uintptr_t, uintptr_t *);
static char *xkb_map_mfn(xkb_t *, mfn_t, mfn_map_t *);
static int xkb_close(xkb_t *);

/*
 * Jump through the hoops we need to to correctly identify a core file
 * of either the old or new format.
 */
int
xkb_identify(const char *file, int *longmode)
{
	xc_core_header_t header;
	mdb_gelf_file_t *gf = NULL;
	mdb_gelf_sect_t *sect = NULL;
	mdb_io_t *io = NULL;
	char *notes = NULL;
	char *pos;
	int ret = 0;
	size_t sz;
	int fd;

	if ((fd = open64(file, O_RDONLY)) == -1)
		return (-1);

	if (pread64(fd, &header, sizeof (header), 0) != sizeof (header)) {
		(void) close(fd);
		return (0);
	}

	(void) close(fd);

	if (header.xch_magic == XC_CORE_MAGIC) {
		*longmode = 0;

		/*
		 * Indeed.
		 */
		sz = header.xch_index_offset - header.xch_ctxt_offset;
#ifdef _LP64
		if (sizeof (struct vcpu_guest_context) *
		    header.xch_nr_vcpus == sz)
			*longmode = 1;
#else
		if (sizeof (struct vcpu_guest_context) *
		    header.xch_nr_vcpus != sz)
			*longmode = 1;
#endif /* _LP64 */

		return (1);
	}

	if ((io = mdb_fdio_create_path(NULL, file, O_RDONLY, 0)) == NULL)
		return (-1);

	if ((gf = mdb_gelf_create(io, ET_NONE, GF_FILE)) == NULL)
		goto out;

	if ((sect = mdb_gelf_sect_by_name(gf, ".note.Xen")) == NULL)
		goto out;

	if ((notes = mdb_gelf_sect_load(gf, sect)) == NULL)
		goto out;

	for (pos = notes; pos < notes + sect->gs_shdr.sh_size; ) {
		struct xc_elf_version *vers;
		/* LINTED - alignment */
		Elf64_Nhdr *nhdr = (Elf64_Nhdr *)pos;
		char *desc;
		char *name;

		name = pos + sizeof (*nhdr);
		desc = (char *)P2ROUNDUP((uintptr_t)name + nhdr->n_namesz, 4);

		pos = desc + nhdr->n_descsz;

		if (nhdr->n_type != XEN_ELFNOTE_DUMPCORE_XEN_VERSION)
			continue;

		/*
		 * The contents of this struct differ between 32 and 64
		 * bit; however, not until past the 'xev_capabilities'
		 * member, so we can just about get away with this.
		 */

		/* LINTED - alignment */
		vers = (struct xc_elf_version *)desc;

		if (strstr(vers->xev_capabilities, "x86_64")) {
			/*
			 * 64-bit hypervisor, but it can still be
			 * a 32-bit domain core. 32-bit domain cores
			 * are also dumped in Elf64 format, but they
			 * have e_machine set to EM_386, not EM_AMD64.
			 */
			if (gf->gf_ehdr.e_machine == EM_386)
				*longmode = 0;
			else
				*longmode = 1;
		} else if (strstr(vers->xev_capabilities, "x86_32") ||
		    strstr(vers->xev_capabilities, "x86_32p")) {
			/*
			 * 32-bit hypervisor, can only be a 32-bit core.
			 */
			*longmode = 0;
		} else {
			mdb_warn("couldn't derive word size of dump; "
			    "assuming 64-bit");
			*longmode = 1;
		}
	}

	ret = 1;

out:
	if (gf != NULL)
		mdb_gelf_destroy(gf);
	else if (io != NULL)
		mdb_io_destroy(io);
	return (ret);
}

static void *
xkb_fail(xkb_t *xkb, const char *msg, ...)
{
	va_list args;

	va_start(args, msg);
	if (xkb != NULL)
		(void) fprintf(stderr, "%s: ", xkb->xkb_path);
	(void) vfprintf(stderr, msg, args);
	(void) fprintf(stderr, "\n");
	va_end(args);
	if (xkb != NULL)
		(void) xkb_close(xkb);

	errno = ENOEXEC;

	return (NULL);
}

static int
xkb_build_m2p(xkb_t *xkb)
{
	size_t i;

	for (i = 0; i <= xkb->xkb_max_pfn; i++) {
		if (xkb->xkb_p2m[i] != MFN_INVALID &&
		    xkb->xkb_p2m[i] > xkb->xkb_max_mfn)
			xkb->xkb_max_mfn = xkb->xkb_p2m[i];
	}

	xkb->xkb_m2p = mdb_alloc((xkb->xkb_max_mfn + 1) * sizeof (xen_pfn_t),
	    UM_SLEEP);

	for (i = 0; i <= xkb->xkb_max_mfn; i++)
		xkb->xkb_m2p[i] = PFN_INVALID;

	for (i = 0; i <= xkb->xkb_max_pfn; i++) {
		if (xkb->xkb_p2m[i] != MFN_INVALID)
			xkb->xkb_m2p[xkb->xkb_p2m[i]] = i;
	}

	return (1);
}

/*
 * With FORMAT_CORE, we can use the table in the dump file directly.
 * Just to make things fun, they've not page-aligned the p2m table.
 */
static int
xkb_map_p2m(xkb_t *xkb)
{
	offset_t off;
	size_t size;
	xkb_core_t *xc = &xkb->xkb_core;
	size_t count = xkb->xkb_nr_pages;
	size_t boff = xc->xc_hdr.xch_index_offset;

	size = (sizeof (mfn_t) * count) + (PAGE_SIZE * 2);
	size = PAGE_MASK(size);
	off = PAGE_MASK(boff);

	/* LINTED - alignment */
	xc->xc_p2m_buf = (mfn_t *)mmap(NULL, size, PROT_READ,
	    MAP_SHARED, xkb->xkb_fd, off);

	if (xc->xc_p2m_buf == (xen_pfn_t *)MAP_FAILED) {
		(void) xkb_fail(xkb, "cannot map p2m table");
		return (0);
	}

	/* LINTED - alignment */
	xkb->xkb_p2m = (mfn_t *)((char *)xc->xc_p2m_buf +
	    PAGE_OFFSET(boff));

	return (1);
}

/*
 * With FORMAT_ELF, we have a set of <pfn,mfn> pairs, which we convert
 * into a linear array indexed by pfn for convenience.  We also need to
 * track the mapping between mfn and the offset in the file: a pfn with
 * no mfn will not appear in the core file.
 */
static int
xkb_build_p2m(xkb_t *xkb)
{
	xkb_elf_t *xe = &xkb->xkb_elf;
	mdb_gelf_sect_t *sect;
	size_t size;
	size_t i;

	struct elf_p2m {
		uint64_t pfn;
		uint64_t gmfn;
	} *p2m;

	sect = mdb_gelf_sect_by_name(xe->xe_gelf, ".xen_p2m");

	if (sect == NULL) {
		(void) xkb_fail(xkb, "cannot find section .xen_p2m");
		return (0);
	}

	if ((p2m = mdb_gelf_sect_load(xe->xe_gelf, sect)) == NULL) {
		(void) xkb_fail(xkb, "couldn't read .xen_p2m");
		return (0);
	}

	for (i = 0; i < xkb->xkb_nr_pages; i++) {
		if (p2m[i].pfn > xkb->xkb_max_pfn)
			xkb->xkb_max_pfn = p2m[i].pfn;
	}

	size = sizeof (xen_pfn_t) * (xkb->xkb_max_pfn + 1);
	xkb->xkb_p2m = mdb_alloc(size, UM_SLEEP);
	size = sizeof (size_t) * (xkb->xkb_max_pfn + 1);
	xe->xe_off = mdb_alloc(size, UM_SLEEP);

	for (i = 0; i <= xkb->xkb_max_pfn; i++) {
		xkb->xkb_p2m[i] = PFN_INVALID;
		xe->xe_off[i] = (size_t)-1;
	}

	for (i = 0; i < xkb->xkb_nr_pages; i++) {
		xkb->xkb_p2m[p2m[i].pfn] = p2m[i].gmfn;
		xe->xe_off[p2m[i].pfn] = i;
	}

	return (1);
}

/*
 * For HVM images, we don't have the corresponding MFN list; the table
 * is just a mapping from page index in the dump to the corresponding
 * PFN.  To simplify the other code, we'll pretend that these PFNs are
 * really MFNs as well, by populating xkb_p2m.
 */
static int
xkb_build_fake_p2m(xkb_t *xkb)
{
	xkb_elf_t *xe = &xkb->xkb_elf;
	mdb_gelf_sect_t *sect;
	size_t size;
	size_t i;

	uint64_t *p2pfn;

	sect = mdb_gelf_sect_by_name(xe->xe_gelf, ".xen_pfn");

	if (sect == NULL) {
		(void) xkb_fail(xkb, "cannot find section .xen_pfn");
		return (0);
	}

	if ((p2pfn = mdb_gelf_sect_load(xe->xe_gelf, sect)) == NULL) {
		(void) xkb_fail(xkb, "couldn't read .xen_pfn");
		return (0);
	}

	for (i = 0; i < xkb->xkb_nr_pages; i++) {
		if (p2pfn[i] != PFN_INVALID && p2pfn[i] > xkb->xkb_max_pfn)
			xkb->xkb_max_pfn = p2pfn[i];
	}

	size = sizeof (xen_pfn_t) * (xkb->xkb_max_pfn + 1);
	xkb->xkb_p2m = mdb_alloc(size, UM_SLEEP);

	size = sizeof (size_t) * (xkb->xkb_max_pfn + 1);
	xe->xe_off = mdb_alloc(size, UM_SLEEP);

	for (i = 0; i <= xkb->xkb_max_pfn; i++) {
		xkb->xkb_p2m[i] = PFN_INVALID;
		xe->xe_off[i] = (size_t)-1;
	}

	for (i = 0; i < xkb->xkb_nr_pages; i++) {
		if (p2pfn[i] == PFN_INVALID)
			continue;
		xkb->xkb_p2m[p2pfn[i]] = p2pfn[i];
		xe->xe_off[p2pfn[i]] = i;
	}

	return (1);
}

/*
 * Return the MFN of the top-level page table for the given as.
 */
static mfn_t
xkb_as_to_mfn(xkb_t *xkb, struct as *as)
{
	uintptr_t asp = (uintptr_t)as;
	uintptr_t hatp;
	uintptr_t htablep;
	uintptr_t pfn;

	if (!xkb_read_word(xkb, asp + offsetof(struct as, a_hat), &hatp))
		return (MFN_INVALID);
	if (!xkb_read_word(xkb, hatp + xkb->xkb_info.di_hat_htable_off,
	    &htablep))
		return (MFN_INVALID);
	if (!xkb_read_word(xkb, htablep + xkb->xkb_info.di_ht_pfn_off,
	    &pfn))
		return (MFN_INVALID);

	if (pfn > xkb->xkb_max_pfn)
		return (MFN_INVALID);

	return (xkb->xkb_p2m[pfn]);
}

static mfn_t
xkb_cr3_to_pfn(xkb_t *xkb)
{
	uint64_t cr3 = xkb->xkb_vcpus[0]->ctrlreg[3];
	if (xkb->xkb_is_hvm)
		return (cr3 >> PAGE_SHIFT);
	return (xen_cr3_to_pfn(cr3));
}

static ssize_t
xkb_read_helper(xkb_t *xkb, struct as *as, int phys, uint64_t addr,
    void *buf, size_t size)
{
	size_t left = size;
	int windowed = (xkb->xkb_pages == NULL);
	mfn_t tlmfn = xkb_cr3_to_pfn(xkb);

	if (as != NULL && (tlmfn = xkb_as_to_mfn(xkb, as)) == MFN_INVALID)
		return (-1);

	while (left) {
		uint64_t pos = addr + (size - left);
		char *outpos = (char *)buf + (size - left);
		size_t pageoff = PAGE_OFFSET(pos);
		size_t sz = MIN(left, PAGE_SIZE - pageoff);
		mfn_t mfn;

		if (!phys) {
			mfn = xkb_va_to_mfn(xkb, pos, tlmfn);
			if (mfn == MFN_INVALID)
				return (-1);
		} else {
			xen_pfn_t pfn = pos >> PAGE_SHIFT;
			if (pfn > xkb->xkb_max_pfn)
				return (-1);
			mfn = xkb->xkb_p2m[pfn];
			if (mfn == MFN_INVALID)
				return (-1);
		}

		/*
		 * If we're windowed then pread() is much faster.
		 */
		if (windowed) {
			offset_t off = xkb_mfn_to_offset(xkb, mfn);
			int ret;

			if (off == ~1ULL)
				return (-1);

			off += pageoff;

			ret = pread64(xkb->xkb_fd, outpos, sz, off);
			if (ret == -1)
				return (-1);
			if (ret != sz)
				return ((size - left) + ret);

			left -= ret;
		} else {
			if (xkb_map_mfn(xkb, mfn, &xkb->xkb_map) == NULL)
				return (-1);

			bcopy(xkb->xkb_map.mm_map + pageoff, outpos, sz);

			left -= sz;
		}
	}

	return (size);
}

static ssize_t
xkb_pread(xkb_t *xkb, uint64_t addr, void *buf, size_t size)
{
	return (xkb_read_helper(xkb, NULL, 1, addr, buf, size));
}

static ssize_t
xkb_aread(xkb_t *xkb, uintptr_t addr, void *buf, size_t size, struct as *as)
{
	return (xkb_read_helper(xkb, as, 0, addr, buf, size));
}

static ssize_t
xkb_read(xkb_t *xkb, uintptr_t addr, void *buf, size_t size)
{
	return (xkb_aread(xkb, addr, buf, size, NULL));
}

static int
xkb_read_word(xkb_t *xkb, uintptr_t addr, uintptr_t *buf)
{
	if (xkb_read(xkb, addr, buf, sizeof (uintptr_t)) !=
	    sizeof (uintptr_t))
		return (0);
	return (1);
}

static char *
xkb_readstr(xkb_t *xkb, uintptr_t addr)
{
	char *str = mdb_alloc(1024, UM_SLEEP);
	size_t i;

	for (i = 0; i < 1024; i++) {
		if (xkb_read(xkb, addr + i, &str[i], 1) != 1) {
			mdb_free(str, 1024);
			return (NULL);
		}

		if (str[i] == '\0')
			break;
	}

	if (i == 1024) {
		mdb_free(str, 1024);
		return (NULL);
	}

	return (str);
}

static offset_t
xkb_pfn_to_off(xkb_t *xkb, xen_pfn_t pfn)
{
	if (pfn == PFN_INVALID || pfn > xkb->xkb_max_pfn)
		return (-1ULL);

	if (xkb->xkb_type == XKB_FORMAT_CORE)
		return (PAGE_SIZE * pfn);

	return (PAGE_SIZE * (xkb->xkb_elf.xe_off[pfn]));
}

static offset_t
xkb_mfn_to_offset(xkb_t *xkb, mfn_t mfn)
{
	xen_pfn_t pfn;

	if (mfn > xkb->xkb_max_mfn)
		return (-1ULL);

	pfn = xkb->xkb_m2p[mfn];

	if (pfn == PFN_INVALID)
		return (-1ULL);

	return (xkb->xkb_pages_off + xkb_pfn_to_off(xkb, pfn));
}

static char *
xkb_map_mfn(xkb_t *xkb, mfn_t mfn, mfn_map_t *mm)
{
	int windowed = (xkb->xkb_pages == NULL);
	offset_t off;

	if (mm->mm_mfn == mfn)
		return (mm->mm_map);

	mm->mm_mfn = mfn;

	if (windowed) {
		if (mm->mm_map != (char *)MAP_FAILED) {
			(void) munmap(mm->mm_map, PAGE_SIZE);
			mm->mm_map = (void *)MAP_FAILED;
		}

		if ((off = xkb_mfn_to_offset(xkb, mfn)) == (-1ULL))
			return (NULL);

		mm->mm_map = mmap(NULL, PAGE_SIZE, PROT_READ, MAP_SHARED,
		    xkb->xkb_fd, off);

		if (mm->mm_map == (char *)MAP_FAILED)
			return (NULL);
	} else {
		xen_pfn_t pfn;

		mm->mm_map = NULL;

		if (mfn > xkb->xkb_max_mfn)
			return (NULL);

		pfn = xkb->xkb_m2p[mfn];

		if (pfn == PFN_INVALID)
			return (NULL);

		mm->mm_map = xkb->xkb_pages + xkb_pfn_to_off(xkb, pfn);
	}

	return (mm->mm_map);
}

static uint64_t
xkb_get_pte(mmu_info_t *mmu, char *ptep)
{
	uint64_t pte = 0;

	if (mmu->mi_ptesize == 8) {
		/* LINTED - alignment */
		pte = *((uint64_t *)ptep);
	} else {
		/* LINTED - alignment */
		pte = *((uint32_t *)ptep);
	}

	return (pte);
}

static mfn_t
xkb_pte_to_base_mfn(uint64_t pte, size_t level)
{
	if (PTE_IS_LGPG(pte, level)) {
		pte &= PT_PADDR_LGPG;
	} else {
		pte &= PT_PADDR;
	}

	return (pte >> PAGE_SHIFT);
}

/*
 * Resolve the given VA into an MFN, using the provided mfn as a top-level page
 * table.
 */
static mfn_t
xkb_va_to_mfn(xkb_t *xkb, uintptr_t va, mfn_t mfn)
{
	mmu_info_t *mmu = &xkb->xkb_mmu;
	uint64_t pte;
	size_t level;

	for (level = mmu->mi_max; ; --level) {
		size_t entry;

		if (xkb_map_mfn(xkb, mfn, &xkb->xkb_pt_map[level]) == NULL)
			return (MFN_INVALID);

		entry = (va >> mmu->mi_shift[level]) & (mmu->mi_ptes - 1);

		pte = xkb_get_pte(mmu, (char *)xkb->xkb_pt_map[level].mm_map +
		    entry * mmu->mi_ptesize);

		if ((mfn = xkb_pte_to_base_mfn(pte, level)) == MFN_INVALID)
			return (MFN_INVALID);

		if (level == 0)
			break;

		/*
		 * Currently 'mfn' refers to the base MFN of the
		 * large-page mapping.  Add on the 4K-sized index into
		 * the large-page mapping to get the right MFN within
		 * the mapping.
		 */
		if (PTE_IS_LGPG(pte, level)) {
			mfn += (va & ((1 << mmu->mi_shift[level]) - 1)) >>
			    PAGE_SHIFT;
			break;
		}
	}

	return (mfn);
}

static int
xkb_read_module(xkb_t *xkb, uintptr_t modulep, struct module *module,
    uintptr_t *sym_addr, uintptr_t *sym_count, uintptr_t *str_addr)
{
	if (xkb_read(xkb, modulep, module, sizeof (struct module)) !=
	    sizeof (struct module))
		return (0);

	if (!xkb_read_word(xkb, (uintptr_t)module->symhdr +
	    offsetof(Shdr, sh_addr), sym_addr))
		return (0);

	if (!xkb_read_word(xkb, (uintptr_t)module->strhdr +
	    offsetof(Shdr, sh_addr), str_addr))
		return (0);

	if (!xkb_read_word(xkb, (uintptr_t)module->symhdr +
	    offsetof(Shdr, sh_size), sym_count))
		return (0);
	*sym_count /= sizeof (Sym);

	return (1);
}

static int
xkb_read_modsyms(xkb_t *xkb, char **buf, size_t *sizes, int types,
    uintptr_t sym_addr, uintptr_t str_addr, uintptr_t sym_count)
{
	size_t i;

	for (i = 0; i < sym_count; i++) {
		Sym sym;
		char *name;
		size_t sz;
		int type = XKB_WALK_GLOBAL;

		if (xkb_read(xkb, sym_addr + i * sizeof (sym), &sym,
		    sizeof (sym)) != sizeof (sym))
			return (0);

		if (GELF_ST_BIND(sym.st_info) == STB_LOCAL)
			type = XKB_WALK_LOCAL;

		name = xkb_readstr(xkb, str_addr + sym.st_name);

		sym.st_shndx = SHN_ABS;
		sym.st_name = sizes[XKB_WALK_STR];

		sizes[type] += sizeof (sym);
		sz = strlen(name) + 1;
		sizes[XKB_WALK_STR] += sz;

		if (buf != NULL) {
			if (types & type) {
				bcopy(&sym, *buf, sizeof (sym));
				*buf += sizeof (sym);
			}
			if (types & XKB_WALK_STR) {
				bcopy(name, *buf, sz);
				*buf += sz;
			}
		}

		mdb_free(name, 1024);
	}

	return (1);
}

static int
xkb_walk_syms(xkb_t *xkb, uintptr_t modhead, char **buf,
    size_t *sizes, int types)
{
	uintptr_t modctl = modhead;
	uintptr_t modulep;
	struct module module;
	uintptr_t sym_count;
	uintptr_t sym_addr;
	uintptr_t str_addr;
	size_t max_iter = 500;

	bzero(sizes, sizeof (*sizes) * (XKB_WALK_STR + 1));

	/*
	 * empty first symbol
	 */
	sizes[XKB_WALK_LOCAL] += sizeof (Sym);
	sizes[XKB_WALK_STR] += 1;

	if (buf != NULL) {
		if (types & XKB_WALK_LOCAL) {
			Sym tmp;
			bzero(&tmp, sizeof (tmp));
			bcopy(&tmp, *buf, sizeof (tmp));
			*buf += sizeof (tmp);
		}
		if (types & XKB_WALK_STR) {
			**buf = '\0';
			(*buf)++;
		}
	}

	for (;;) {
		if (!xkb_read_word(xkb,
		    modctl + offsetof(struct modctl, mod_mp), &modulep))
			return (0);

		if (modulep == NULL)
			goto next;

		if (!xkb_read_module(xkb, modulep, &module, &sym_addr,
		    &sym_count, &str_addr))
			return (0);

		if ((module.flags & KOBJ_NOKSYMS))
			goto next;

		if (!xkb_read_modsyms(xkb, buf, sizes, types, sym_addr,
		    str_addr, sym_count))
			return (0);

next:
		if (!xkb_read_word(xkb,
		    modctl + offsetof(struct modctl, mod_next), &modctl))
			return (0);

		if (modctl == modhead)
			break;
		/*
		 * Try and prevent us looping forever if we have a broken list.
		 */
		if (--max_iter == 0)
			break;
	}

	return (1);
}

/*
 * Userspace equivalent of ksyms_snapshot().  Since we don't have a namelist
 * file for hypervisor images, we fabricate one here using code similar
 * to that of /dev/ksyms.
 */
static int
xkb_build_ksyms(xkb_t *xkb)
{
	debug_info_t *info = &xkb->xkb_info;
	size_t sizes[XKB_WALK_STR + 1];
	xkb_namelist_t *hdr;
	char *buf;
	struct modctl modules;
	uintptr_t module;
	Shdr *shp;

	if (xkb_read(xkb, info->di_modules, &modules,
	    sizeof (struct modctl)) != sizeof (struct modctl))
		return (0);

	module = (uintptr_t)modules.mod_mp;

	if (!xkb_walk_syms(xkb, info->di_modules, NULL, sizes,
	    XKB_WALK_LOCAL | XKB_WALK_GLOBAL | XKB_WALK_STR))
		return (0);

	xkb->xkb_namesize = sizeof (xkb_namelist_t);
	xkb->xkb_namesize += sizes[XKB_WALK_LOCAL];
	xkb->xkb_namesize += sizes[XKB_WALK_GLOBAL];
	xkb->xkb_namesize += sizes[XKB_WALK_STR];

	if ((xkb->xkb_namelist = mdb_zalloc(xkb->xkb_namesize, UM_SLEEP))
	    == NULL)
		return (0);

	/* LINTED - alignment */
	hdr = (xkb_namelist_t *)xkb->xkb_namelist;

	if (xkb_read(xkb, module + offsetof(struct module, hdr),
	    &hdr->kh_elf_hdr, sizeof (Ehdr)) != sizeof (Ehdr))
		return (0);

	hdr->kh_elf_hdr.e_phoff = offsetof(xkb_namelist_t, kh_text_phdr);
	hdr->kh_elf_hdr.e_shoff = offsetof(xkb_namelist_t, kh_shdr);
	hdr->kh_elf_hdr.e_phnum = 2;
	hdr->kh_elf_hdr.e_shnum = XKB_SHDR_NUM;
	hdr->kh_elf_hdr.e_shstrndx = XKB_SHDR_SHSTRTAB;

	hdr->kh_text_phdr.p_type = PT_LOAD;
	hdr->kh_text_phdr.p_vaddr = (Addr)info->di_s_text;
	hdr->kh_text_phdr.p_memsz = (Word)(info->di_e_text - info->di_s_text);
	hdr->kh_text_phdr.p_flags = PF_R | PF_X;

	hdr->kh_data_phdr.p_type = PT_LOAD;
	hdr->kh_data_phdr.p_vaddr = (Addr)info->di_s_data;
	hdr->kh_data_phdr.p_memsz = (Word)(info->di_e_data - info->di_s_data);
	hdr->kh_data_phdr.p_flags = PF_R | PF_W | PF_X;

	shp = &hdr->kh_shdr[XKB_SHDR_SYMTAB];
	shp->sh_name = 1;	/* xkb_shstrtab[1] = ".symtab" */
	shp->sh_type = SHT_SYMTAB;
	shp->sh_offset = sizeof (xkb_namelist_t);
	shp->sh_size = sizes[XKB_WALK_LOCAL] + sizes[XKB_WALK_GLOBAL];
	shp->sh_link = XKB_SHDR_STRTAB;
	shp->sh_info = sizes[XKB_WALK_LOCAL] / sizeof (Sym);
	shp->sh_addralign = sizeof (Addr);
	shp->sh_entsize = sizeof (Sym);
	shp->sh_addr = (Addr)(xkb->xkb_namelist + shp->sh_offset);


	shp = &hdr->kh_shdr[XKB_SHDR_STRTAB];
	shp->sh_name = 9;	/* xkb_shstrtab[9] = ".strtab" */
	shp->sh_type = SHT_STRTAB;
	shp->sh_offset = sizeof (xkb_namelist_t) +
	    sizes[XKB_WALK_LOCAL] + sizes[XKB_WALK_GLOBAL];
	shp->sh_size = sizes[XKB_WALK_STR];
	shp->sh_addralign = 1;
	shp->sh_addr = (Addr)(xkb->xkb_namelist + shp->sh_offset);


	shp = &hdr->kh_shdr[XKB_SHDR_SHSTRTAB];
	shp->sh_name = 17;	/* xkb_shstrtab[17] = ".shstrtab" */
	shp->sh_type = SHT_STRTAB;
	shp->sh_offset = offsetof(xkb_namelist_t, shstrings);
	shp->sh_size = sizeof (xkb_shstrtab);
	shp->sh_addralign = 1;
	shp->sh_addr = (Addr)(xkb->xkb_namelist + shp->sh_offset);

	bcopy(xkb_shstrtab, hdr->shstrings, sizeof (xkb_shstrtab));

	buf = xkb->xkb_namelist + sizeof (xkb_namelist_t);

	if (!xkb_walk_syms(xkb, info->di_modules, &buf, sizes,
	    XKB_WALK_LOCAL))
		return (0);
	if (!xkb_walk_syms(xkb, info->di_modules, &buf, sizes,
	    XKB_WALK_GLOBAL))
		return (0);
	if (!xkb_walk_syms(xkb, info->di_modules, &buf, sizes,
	    XKB_WALK_STR))
		return (0);

	return (1);
}

static xkb_t *
xkb_open_core(xkb_t *xkb)
{
	xkb_core_t *xc = &xkb->xkb_core;
	size_t sz;
	int i;
	struct vcpu_guest_context *vcp;

	xkb->xkb_type = XKB_FORMAT_CORE;

	if ((xkb->xkb_fd = open64(xkb->xkb_path, O_RDONLY)) == -1)
		return (xkb_fail(xkb, "cannot open %s", xkb->xkb_path));

	if (pread64(xkb->xkb_fd, &xc->xc_hdr, sizeof (xc->xc_hdr), 0) !=
	    sizeof (xc->xc_hdr))
		return (xkb_fail(xkb, "invalid dump file"));

	if (xc->xc_hdr.xch_magic == XC_CORE_MAGIC_HVM)
		return (xkb_fail(xkb, "cannot process HVM images"));

	if (xc->xc_hdr.xch_magic != XC_CORE_MAGIC) {
		return (xkb_fail(xkb, "invalid magic %d",
		    xc->xc_hdr.xch_magic));
	}

	/*
	 * With FORMAT_CORE, all pages are in the dump (non-existing
	 * ones are zeroed out).
	 */
	xkb->xkb_nr_pages = xc->xc_hdr.xch_nr_pages;
	xkb->xkb_pages_off = xc->xc_hdr.xch_pages_offset;
	xkb->xkb_max_pfn = xc->xc_hdr.xch_nr_pages - 1;
	xkb->xkb_nr_vcpus = xc->xc_hdr.xch_nr_vcpus;

	sz = xkb->xkb_nr_vcpus * sizeof (struct vcpu_guest_context);
	xkb->xkb_vcpu_data_sz = sz;
	xkb->xkb_vcpu_data = mdb_alloc(sz, UM_SLEEP);

	if (pread64(xkb->xkb_fd, xkb->xkb_vcpu_data, sz,
	    xc->xc_hdr.xch_ctxt_offset) != sz)
		return (xkb_fail(xkb, "cannot read VCPU contexts"));

	sz = xkb->xkb_nr_vcpus * sizeof (struct vcpu_guest_context *);
	xkb->xkb_vcpus = mdb_alloc(sz, UM_SLEEP);

	vcp = xkb->xkb_vcpu_data;
	for (i = 0; i < xkb->xkb_nr_vcpus; i++)
		xkb->xkb_vcpus[i] = &vcp[i];

	/*
	 * Try to map all the data pages. If we can't, fall back to the
	 * window/pread() approach, which is significantly slower.
	 */
	xkb->xkb_pages = mmap(NULL, PAGE_SIZE * xkb->xkb_nr_pages,
	    PROT_READ, MAP_SHARED, xkb->xkb_fd, xc->xc_hdr.xch_pages_offset);

	if (xkb->xkb_pages == (char *)MAP_FAILED)
		xkb->xkb_pages = NULL;

	/*
	 * We'd like to adapt for correctness' sake, but we have no way of
	 * detecting a PAE guest, since cr4 writes are disallowed.
	 */
	xkb->xkb_is_pae = 1;

	if (!xkb_map_p2m(xkb))
		return (NULL);

	return (xkb);
}

static xkb_t *
xkb_open_elf(xkb_t *xkb)
{
	xkb_elf_t *xe = &xkb->xkb_elf;
	mdb_gelf_sect_t *sect;
	char *notes;
	char *pos;
	mdb_io_t *io;
	size_t sz;
	int i;
	void *dp;

	if ((io = mdb_fdio_create_path(NULL, xkb->xkb_path,
	    O_RDONLY, 0)) == NULL)
		return (xkb_fail(xkb, "failed to open"));

	xe->xe_gelf = mdb_gelf_create(io, ET_NONE, GF_FILE);

	if (xe->xe_gelf == NULL) {
		mdb_io_destroy(io);
		return (xkb);
	}

	xkb->xkb_fd = mdb_fdio_fileno(io);

	sect = mdb_gelf_sect_by_name(xe->xe_gelf, ".note.Xen");

	if (sect == NULL)
		return (xkb);

	if ((notes = mdb_gelf_sect_load(xe->xe_gelf, sect)) == NULL)
		return (xkb);

	/*
	 * Now we know this is indeed a hypervisor core dump, even if
	 * it's corrupted.
	 */
	xkb->xkb_type = XKB_FORMAT_ELF;

	for (pos = notes; pos < notes + sect->gs_shdr.sh_size; ) {
		/* LINTED - alignment */
		Elf64_Nhdr *nhdr = (Elf64_Nhdr *)pos;
		uint64_t vers;
		char *desc;
		char *name;

		name = pos + sizeof (*nhdr);
		desc = (char *)P2ROUNDUP((uintptr_t)name + nhdr->n_namesz, 4);

		pos = desc + nhdr->n_descsz;

		switch (nhdr->n_type) {
		case XEN_ELFNOTE_DUMPCORE_NONE:
			break;

		case XEN_ELFNOTE_DUMPCORE_HEADER:
			if (nhdr->n_descsz != sizeof (struct xc_elf_header)) {
				return (xkb_fail(xkb, "invalid ELF note "
				    "XEN_ELFNOTE_DUMPCORE_HEADER\n"));
			}

			bcopy(desc, &xe->xe_hdr,
			    sizeof (struct xc_elf_header));
			break;

		case XEN_ELFNOTE_DUMPCORE_XEN_VERSION:
			if (nhdr->n_descsz < sizeof (struct xc_elf_version)) {
				return (xkb_fail(xkb, "invalid ELF note "
				    "XEN_ELFNOTE_DUMPCORE_XEN_VERSION\n"));
			}

			bcopy(desc, &xe->xe_version,
			    sizeof (struct xc_elf_version));
			break;

		case XEN_ELFNOTE_DUMPCORE_FORMAT_VERSION:
			/* LINTED - alignment */
			vers = *((uint64_t *)desc);
			if ((vers >> 32) != 0) {
				return (xkb_fail(xkb, "unknown major "
				    "version %d (expected 0)\n",
				    (int)(vers >> 32)));
			}

			if ((vers & 0xffffffff) != 1) {
				mdb_warn("unexpected dump minor number "
				    "version %d (expected 1)\n",
				    (int)(vers & 0xffffffff));
			}
			break;

		default:
			mdb_warn("unknown ELF note %d(%s)\n",
			    nhdr->n_type, name);
			break;
		}
	}

	xkb->xkb_is_hvm = xe->xe_hdr.xeh_magic == XC_CORE_MAGIC_HVM;

	if (xe->xe_hdr.xeh_magic != XC_CORE_MAGIC &&
	    xe->xe_hdr.xeh_magic != XC_CORE_MAGIC_HVM) {
		return (xkb_fail(xkb, "invalid magic %d",
		    xe->xe_hdr.xeh_magic));
	}

	xkb->xkb_nr_pages = xe->xe_hdr.xeh_nr_pages;
	xkb->xkb_is_pae = (strstr(xe->xe_version.xev_capabilities,
	    "x86_32p") != NULL);

	sect = mdb_gelf_sect_by_name(xe->xe_gelf, ".xen_prstatus");

	if (sect == NULL)
		return (xkb_fail(xkb, "cannot find section .xen_prstatus"));

	if (sect->gs_shdr.sh_entsize < sizeof (vcpu_guest_context_t))
		return (xkb_fail(xkb, "invalid section .xen_prstatus"));

	xkb->xkb_nr_vcpus = sect->gs_shdr.sh_size / sect->gs_shdr.sh_entsize;

	xkb->xkb_vcpu_data = mdb_gelf_sect_load(xe->xe_gelf, sect);
	if (xkb->xkb_vcpu_data == NULL)
		return (xkb_fail(xkb, "cannot load section .xen_prstatus"));
	xkb->xkb_vcpu_data_sz = sect->gs_shdr.sh_size;

	/*
	 * The vcpu_guest_context structures saved in the core file
	 * are actually unions of the 64-bit and 32-bit versions.
	 * Don't rely on the entry size to match the size of
	 * the structure, but set up an array of pointers.
	 */
	sz = xkb->xkb_nr_vcpus * sizeof (struct vcpu_guest_context *);
	xkb->xkb_vcpus = mdb_alloc(sz, UM_SLEEP);
	for (i = 0; i < xkb->xkb_nr_vcpus; i++) {
		dp = ((char *)xkb->xkb_vcpu_data +
		    i * sect->gs_shdr.sh_entsize);
		xkb->xkb_vcpus[i] = dp;
	}

	sect = mdb_gelf_sect_by_name(xe->xe_gelf, ".xen_pages");

	if (sect == NULL)
		return (xkb_fail(xkb, "cannot find section .xen_pages"));

	if (!PAGE_ALIGNED(sect->gs_shdr.sh_offset))
		return (xkb_fail(xkb, ".xen_pages is not page aligned"));

	if (sect->gs_shdr.sh_entsize != PAGE_SIZE)
		return (xkb_fail(xkb, "invalid section .xen_pages"));

	xkb->xkb_pages_off = sect->gs_shdr.sh_offset;

	/*
	 * Try to map all the data pages. If we can't, fall back to the
	 * window/pread() approach, which is significantly slower.
	 */
	xkb->xkb_pages = mmap(NULL, PAGE_SIZE * xkb->xkb_nr_pages,
	    PROT_READ, MAP_SHARED, xkb->xkb_fd, xkb->xkb_pages_off);

	if (xkb->xkb_pages == (char *)MAP_FAILED)
		xkb->xkb_pages = NULL;

	if (xkb->xkb_is_hvm) {
		if (!xkb_build_fake_p2m(xkb))
			return (NULL);
	} else {
		if (!xkb_build_p2m(xkb))
			return (NULL);
	}

	return (xkb);
}

static void
xkb_init_mmu(xkb_t *xkb)
{
#if defined(__amd64)
	xkb->xkb_mmu.mi_max = 3;
	xkb->xkb_mmu.mi_shift[0] = 12;
	xkb->xkb_mmu.mi_shift[1] = 21;
	xkb->xkb_mmu.mi_shift[2] = 30;
	xkb->xkb_mmu.mi_shift[3] = 39;
	xkb->xkb_mmu.mi_ptes = 512;
	xkb->xkb_mmu.mi_ptesize = 8;
#elif defined(__i386)
	if (xkb->xkb_is_pae) {
		xkb->xkb_mmu.mi_max = 2;
		xkb->xkb_mmu.mi_shift[0] = 12;
		xkb->xkb_mmu.mi_shift[1] = 21;
		xkb->xkb_mmu.mi_shift[2] = 30;
		xkb->xkb_mmu.mi_ptes = 512;
		xkb->xkb_mmu.mi_ptesize = 8;
	} else {
		xkb->xkb_mmu.mi_max = 1;
		xkb->xkb_mmu.mi_shift[0] = 12;
		xkb->xkb_mmu.mi_shift[1] = 22;
		xkb->xkb_mmu.mi_ptes = 1024;
		xkb->xkb_mmu.mi_ptesize = 4;
	}
#endif
}

/*ARGSUSED*/
xkb_t *
xkb_open(const char *namelist, const char *corefile, const char *swapfile,
    int flag, const char *err)
{
	uintptr_t debug_info = DEBUG_INFO;
	struct stat64 corestat;
	xkb_t *xkb = NULL;
	size_t i;

	if (stat64(corefile, &corestat) == -1)
		return (xkb_fail(xkb, "cannot stat %s", corefile));

	if (flag != O_RDONLY)
		return (xkb_fail(xkb, "invalid open flags"));

	xkb = mdb_zalloc(sizeof (*xkb), UM_SLEEP);

	for (i = 0; i < 4; i++) {
		xkb->xkb_pt_map[i].mm_mfn = MFN_INVALID;
		xkb->xkb_pt_map[i].mm_map = (char *)MAP_FAILED;
	}

	xkb->xkb_type = XKB_FORMAT_UNKNOWN;
	xkb->xkb_map.mm_mfn = MFN_INVALID;
	xkb->xkb_map.mm_map = (char *)MAP_FAILED;
	xkb->xkb_core.xc_p2m_buf = (char *)MAP_FAILED;
	xkb->xkb_fd = -1;

	xkb->xkb_path = strdup(corefile);

	if ((xkb = xkb_open_elf(xkb)) == NULL)
		return (NULL);

	if (xkb->xkb_type == XKB_FORMAT_UNKNOWN) {
		if (!xkb_open_core(xkb))
			return (NULL);
	}

	xkb_init_mmu(xkb);

	if (!xkb_build_m2p(xkb))
		return (NULL);

	if (xkb->xkb_is_hvm)
		debug_info = DEBUG_INFO_HVM;

	if (xkb_read(xkb, debug_info, &xkb->xkb_info,
	    sizeof (xkb->xkb_info)) != sizeof (xkb->xkb_info))
		return (xkb_fail(xkb, "cannot read debug_info"));

	if (xkb->xkb_info.di_magic != DEBUG_INFO_MAGIC) {
		return (xkb_fail(xkb, "invalid debug info magic %d",
		    xkb->xkb_info.di_magic));
	}

	if (xkb->xkb_info.di_version != DEBUG_INFO_VERSION) {
		return (xkb_fail(xkb, "unknown debug info version %d",
		    xkb->xkb_info.di_version));
	}

	if (!xkb_build_ksyms(xkb))
		return (xkb_fail(xkb, "cannot construct namelist"));

	return (xkb);
}

int
xkb_close(xkb_t *xkb)
{
	size_t i, sz;

	if (xkb == NULL)
		return (0);

	if (xkb->xkb_m2p != NULL) {
		mdb_free(xkb->xkb_m2p,
		    (xkb->xkb_max_mfn + 1) * sizeof (xen_pfn_t));
	}

	if (xkb->xkb_pages != NULL) {
		(void) munmap((void *)xkb->xkb_pages,
		    PAGE_SIZE * xkb->xkb_nr_pages);
	} else {
		for (i = 0; i < 4; i++) {
			char *addr = xkb->xkb_pt_map[i].mm_map;
			if (addr != (char *)MAP_FAILED)
				(void) munmap((void *)addr, PAGE_SIZE);
		}
		if (xkb->xkb_map.mm_map != (char *)MAP_FAILED) {
			(void) munmap((void *)xkb->xkb_map.mm_map,
			    PAGE_SIZE);
		}
	}

	if (xkb->xkb_namelist != NULL)
		mdb_free(xkb->xkb_namelist, xkb->xkb_namesize);

	if (xkb->xkb_type == XKB_FORMAT_ELF) {
		xkb_elf_t *xe = &xkb->xkb_elf;

		if (xe->xe_gelf != NULL)
			mdb_gelf_destroy(xe->xe_gelf);

		sz = sizeof (xen_pfn_t) * (xkb->xkb_max_pfn + 1);

		if (xkb->xkb_p2m != NULL)
			mdb_free(xkb->xkb_p2m, sz);

		sz = sizeof (size_t) * (xkb->xkb_max_pfn + 1);

		if (xe->xe_off != NULL)
			mdb_free(xe->xe_off, sz);

	} else if (xkb->xkb_type == XKB_FORMAT_CORE) {
		xkb_core_t *xc = &xkb->xkb_core;

		if (xkb->xkb_fd != -1)
			(void) close(xkb->xkb_fd);

		sz = (xkb->xkb_nr_pages * sizeof (mfn_t)) + (PAGE_SIZE * 2);
		sz = PAGE_MASK(sz);

		if (xc->xc_p2m_buf != (xen_pfn_t *)MAP_FAILED)
			(void) munmap(xc->xc_p2m_buf, sz);

		if (xkb->xkb_vcpu_data != NULL)
			mdb_free(xkb->xkb_vcpu_data, xkb->xkb_vcpu_data_sz);
	}

	if (xkb->xkb_vcpus != NULL) {
		sz = sizeof (struct vcpu_guest_context *) *
		    xkb->xkb_nr_vcpus;
		mdb_free(xkb->xkb_vcpus, sz);
	}

	free(xkb->xkb_path);

	mdb_free(xkb, sizeof (*xkb));
	return (0);
}

/*ARGSUSED*/
static mdb_io_t *
xkb_sym_io(xkb_t *xkb, const char *symfile)
{
	mdb_io_t *io = mdb_memio_create(xkb->xkb_namelist, xkb->xkb_namesize);

	if (io == NULL)
		mdb_warn("failed to create namelist from %s", xkb->xkb_path);

	return (io);
}

uint64_t
xkb_vtop(xkb_t *xkb, struct as *as, uintptr_t addr)
{
	mfn_t tlmfn = xkb_cr3_to_pfn(xkb);
	mfn_t mfn;

	if (as != NULL && (tlmfn = xkb_as_to_mfn(xkb, as)) == MFN_INVALID)
		return (-1ULL);

	mfn = xkb_va_to_mfn(xkb, addr, tlmfn);

	if (mfn == MFN_INVALID || mfn > xkb->xkb_max_mfn)
		return (-1ULL);

	return (((uint64_t)xkb->xkb_m2p[mfn] << PAGE_SHIFT)
	    | PAGE_OFFSET(addr));
}

static int
xkb_getmregs(xkb_t *xkb, uint_t cpu, struct privmregs *mregs)
{
	struct vcpu_guest_context *vcpu;
	struct cpu_user_regs *ur;
	struct regs *regs;

	if (cpu >= xkb->xkb_nr_vcpus) {
		errno = EINVAL;
		return (-1);
	}

	bzero(mregs, sizeof (*mregs));

	vcpu = xkb->xkb_vcpus[cpu];
	ur = &vcpu->user_regs;
	regs = &mregs->pm_gregs;

	regs->r_ss = ur->ss;
	regs->r_cs = ur->cs;
	regs->r_ds = ur->ds;
	regs->r_es = ur->es;
	regs->r_fs = ur->fs;
	regs->r_gs = ur->gs;
	regs->r_trapno = ur->entry_vector;
	regs->r_err = ur->error_code;
#ifdef __amd64
	regs->r_savfp = ur->rbp;
	regs->r_savpc = ur->rip;
	regs->r_rdi = ur->rdi;
	regs->r_rsi = ur->rsi;
	regs->r_rdx = ur->rdx;
	regs->r_rcx = ur->rcx;
	regs->r_r8 = ur->r8;
	regs->r_r9 = ur->r9;
	regs->r_rax = ur->rax;
	regs->r_rbx = ur->rbx;
	regs->r_rbp = ur->rbp;
	regs->r_r10 = ur->r10;
	regs->r_r11 = ur->r11;
	regs->r_r12 = ur->r12;
	regs->r_r13 = ur->r13;
	regs->r_r14 = ur->r14;
	regs->r_r15 = ur->r15;
	regs->r_rip = ur->rip;
	regs->r_rfl = ur->rflags;
	regs->r_rsp = ur->rsp;
#else
	regs->r_savfp = ur->ebp;
	regs->r_savpc = ur->eip;
	regs->r_edi = ur->edi;
	regs->r_esi = ur->esi;
	regs->r_ebp = ur->ebp;
	regs->r_esp = ur->esp;
	regs->r_ebx = ur->ebx;
	regs->r_edx = ur->edx;
	regs->r_ecx = ur->ecx;
	regs->r_eax = ur->eax;
	regs->r_eip = ur->eip;
	regs->r_efl = ur->eflags;
	regs->r_uesp = 0;
#endif

	bcopy(&vcpu->ctrlreg, &mregs->pm_cr, 8 * sizeof (ulong_t));
	bcopy(&vcpu->debugreg, &mregs->pm_dr, 8 * sizeof (ulong_t));

	mregs->pm_flags = PM_GREGS | PM_CRREGS | PM_DRREGS;

	return (0);
}

static mdb_kb_ops_t xpv_kb_ops = {
	.kb_open = (void *(*)())xkb_open,
	.kb_close = (int (*)())xkb_close,
	.kb_sym_io = (mdb_io_t *(*)())xkb_sym_io,
	.kb_kread = (ssize_t (*)())xkb_read,
	.kb_kwrite = (ssize_t (*)())mdb_tgt_notsup,
	.kb_aread = (ssize_t (*)())xkb_aread,
	.kb_awrite = (ssize_t (*)())mdb_tgt_notsup,
	.kb_pread = (ssize_t (*)())xkb_pread,
	.kb_pwrite = (ssize_t (*)())mdb_tgt_notsup,
	.kb_vtop = (uint64_t (*)())xkb_vtop,
	.kb_getmregs = (int (*)())xkb_getmregs
};

mdb_kb_ops_t *
mdb_kb_ops(void)
{
	return (&xpv_kb_ops);
}

static const mdb_dcmd_t dcmds[] = { NULL, };
static const mdb_walker_t walkers[] = { NULL, };
static const mdb_modinfo_t modinfo = { MDB_API_VERSION, dcmds, walkers };

const mdb_modinfo_t *
_mdb_init(void)
{
	return (&modinfo);
}

void
_mdb_fini(void)
{
}