/* OpenSolaris_b135/uts/common/os/sunmdi.c */

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Multipath driver interface (MDI) implementation; see mdi_impl.h for a more
 * detailed discussion of the overall mpxio architecture.
 *
 * Default locking order:
 *
 * _NOTE(LOCK_ORDER(mdi_mutex mdi_vhci::vh_phci_mutex))
 * _NOTE(LOCK_ORDER(mdi_mutex mdi_vhci::vh_client_mutex))
 * _NOTE(LOCK_ORDER(mdi_vhci::vh_phci_mutex mdi_phci::ph_mutex))
 * _NOTE(LOCK_ORDER(mdi_vhci::vh_client_mutex mdi_client::ct_mutex))
 * _NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_pathinfo::pi_mutex))
 * _NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_client::ct_mutex))
 * _NOTE(LOCK_ORDER(mdi_client::ct_mutex mdi_pathinfo::pi_mutex))
 */

#include <sys/note.h>
#include <sys/types.h>
#include <sys/varargs.h>
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/uio.h>
#include <sys/buf.h>
#include <sys/modctl.h>
#include <sys/open.h>
#include <sys/kmem.h>
#include <sys/poll.h>
#include <sys/conf.h>
#include <sys/bootconf.h>
#include <sys/cmn_err.h>
#include <sys/stat.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/ddipropdefs.h>
#include <sys/sunndi.h>
#include <sys/ndi_impldefs.h>
#include <sys/promif.h>
#include <sys/sunmdi.h>
#include <sys/mdi_impldefs.h>
#include <sys/taskq.h>
#include <sys/epm.h>
#include <sys/sunpm.h>
#include <sys/modhash.h>
#include <sys/disp.h>
#include <sys/autoconf.h>
#include <sys/sysmacros.h>

/*
 * Debug logging support.
 *
 * On DEBUG builds MDI_DEBUG(level, (args)) calls i_mdi_log() with the
 * parenthesized argument list whenever mdi_debug >= level.  On non-DEBUG
 * builds MDI_DEBUG() expands to nothing, so its arguments are never
 * evaluated.
 *
 * NOTE(review): the DEBUG expansion is a bare "if" statement without a
 * do { } while (0) wrapper; always use MDI_DEBUG() inside braces so that it
 * cannot capture a following "else".
 */
#ifdef	DEBUG
#include <sys/debug.h>
/* verbosity threshold compared against the level given to MDI_DEBUG() */
int	mdi_debug = 1;
/* presumably consulted by i_mdi_log() (defined elsewhere in this file) */
int	mdi_debug_logonly = 0;
#define	MDI_DEBUG(dbglevel, pargs) if (mdi_debug >= (dbglevel))	i_mdi_log pargs
/* leading (severity, function-name) argument pairs for i_mdi_log() */
#define	MDI_WARN	CE_WARN, __func__
#define	MDI_NOTE	CE_NOTE, __func__
#define	MDI_CONT	CE_CONT, __func__
static void i_mdi_log(int, const char *, dev_info_t *, const char *, ...);
#else	/* !DEBUG */
#define	MDI_DEBUG(dbglevel, pargs)
#endif	/* DEBUG */
/* presumably consulted by i_mdi_log(); not referenced in this part of file */
int	mdi_debug_consoleonly = 0;
/* argument handed to delay_random() when a try-lock attempt must be retried */
int	mdi_delay = 3;

/* symbols defined outside this file */
extern pri_t	minclsyspri;
extern int	modrootloaded;

/*
 * Global mutex:
 * Protects vHCI list and structure members.
 */
kmutex_t	mdi_mutex;

/*
 * Registered vHCI class driver lists
 */
int		mdi_vhci_count;
mdi_vhci_t	*mdi_vhci_head;
mdi_vhci_t	*mdi_vhci_tail;

/*
 * Client Hash Table size
 */
static int	mdi_client_table_size = CLIENT_HASH_TABLE_SIZE;

/*
 * taskq interface definitions
 */
#define	MDI_TASKQ_N_THREADS	8
#define	MDI_TASKQ_PRI		minclsyspri
#define	MDI_TASKQ_MINALLOC	(4*mdi_taskq_n_threads)
#define	MDI_TASKQ_MAXALLOC	(500*mdi_taskq_n_threads)

taskq_t				*mdi_taskq;
static uint_t			mdi_taskq_n_threads = MDI_TASKQ_N_THREADS;

#define	TICKS_PER_SECOND	(drv_usectohz(1000000))

/*
 * The data should be "quiet" for this interval (in seconds) before the
 * vhci cached data is flushed to the disk.
 */
static int mdi_vhcache_flush_delay = 10;

/* number of seconds the vhcache flush daemon will sleep idle before exiting */
static int mdi_vhcache_flush_daemon_idle_time = 60;

/*
 * MDI falls back to discovery of all paths when a bus_config_one fails.
 * The following parameters can be used to tune this operation.
 *
 * mdi_path_discovery_boot
 *	Number of times path discovery will be attempted during early boot.
 *	Probably there is no reason to ever set this value to greater than one.
 *
 * mdi_path_discovery_postboot
 *	Number of times path discovery will be attempted after early boot.
 *	Set it to a minimum of two to allow for discovery of iscsi paths which
 *	may happen very late during booting.
 *
 * mdi_path_discovery_interval
 *	Minimum number of seconds MDI will wait between successive discovery
 *	of all paths. Set it to -1 to disable discovery of all paths.
 */
static int mdi_path_discovery_boot = 1;
static int mdi_path_discovery_postboot = 2;
static int mdi_path_discovery_interval = 10;

/*
 * number of seconds the asynchronous configuration thread will sleep idle
 * before exiting.
 */
static int mdi_async_config_idle_time = 600;

static int mdi_bus_config_cache_hash_size = 256;

/* turns off multithreaded configuration for certain operations */
static int mdi_mtc_off = 0;

/*
 * The "path" to a pathinfo node is identical to the /devices path to a
 * devinfo node had the device been enumerated under a pHCI instead of
 * a vHCI.  This pathinfo "path" is associated with a 'path_instance'.
 * This association persists across create/delete of the pathinfo nodes,
 * but not across reboot.
 */
static uint_t		mdi_pathmap_instance = 1;	/* 0 -> any path */
static int		mdi_pathmap_hash_size = 256;
static kmutex_t		mdi_pathmap_mutex;
static mod_hash_t	*mdi_pathmap_bypath;		/* "path"->instance */
static mod_hash_t	*mdi_pathmap_byinstance;	/* instance->"path" */
static mod_hash_t	*mdi_pathmap_sbyinstance;	/* inst->shortpath */

/*
 * MDI component property name/value string definitions
 */
const char 		*mdi_component_prop = "mpxio-component";
const char		*mdi_component_prop_vhci = "vhci";
const char		*mdi_component_prop_phci = "phci";
const char		*mdi_component_prop_client = "client";

/*
 * MDI client global unique identifier property name
 */
const char		*mdi_client_guid_prop = "client-guid";

/*
 * MDI client load balancing property name/value string definitions
 */
const char		*mdi_load_balance = "load-balance";
const char		*mdi_load_balance_none = "none";
const char		*mdi_load_balance_rr = "round-robin";
const char		*mdi_load_balance_lba = "logical-block";

/*
 * Obsolete vHCI class definition; to be removed after Leadville update
 */
const char *mdi_vhci_class_scsi = MDI_HCI_CLASS_SCSI;

static char vhci_greeting[] =
	"\tThere already exists one vHCI driver for class %s\n"
	"\tOnly one vHCI driver for each class is allowed\n";

/*
 * Static function prototypes
 */
static int		i_mdi_phci_offline(dev_info_t *, uint_t);
static int		i_mdi_client_offline(dev_info_t *, uint_t);
static int		i_mdi_phci_pre_detach(dev_info_t *, ddi_detach_cmd_t);
static void		i_mdi_phci_post_detach(dev_info_t *,
			    ddi_detach_cmd_t, int);
static int		i_mdi_client_pre_detach(dev_info_t *,
			    ddi_detach_cmd_t);
static void		i_mdi_client_post_detach(dev_info_t *,
			    ddi_detach_cmd_t, int);
static void		i_mdi_pm_hold_pip(mdi_pathinfo_t *);
static void		i_mdi_pm_rele_pip(mdi_pathinfo_t *);
static int 		i_mdi_lba_lb(mdi_client_t *ct,
			    mdi_pathinfo_t **ret_pip, struct buf *buf);
static void		i_mdi_pm_hold_client(mdi_client_t *, int);
static void		i_mdi_pm_rele_client(mdi_client_t *, int);
static void		i_mdi_pm_reset_client(mdi_client_t *);
static int		i_mdi_power_all_phci(mdi_client_t *);
static void		i_mdi_log_sysevent(dev_info_t *, char *, char *);


/*
 * Internal mdi_pathinfo node functions
 */
static void		i_mdi_pi_kstat_destroy(mdi_pathinfo_t *);

static mdi_vhci_t	*i_mdi_vhci_class2vhci(char *);
static mdi_vhci_t	*i_devi_get_vhci(dev_info_t *);
static mdi_phci_t	*i_devi_get_phci(dev_info_t *);
static void		i_mdi_phci_lock(mdi_phci_t *, mdi_pathinfo_t *);
static void		i_mdi_phci_unlock(mdi_phci_t *);
static mdi_pathinfo_t	*i_mdi_pi_alloc(mdi_phci_t *, char *, mdi_client_t *);
static void		i_mdi_phci_add_path(mdi_phci_t *, mdi_pathinfo_t *);
static void		i_mdi_client_add_path(mdi_client_t *, mdi_pathinfo_t *);
static void		i_mdi_pi_free(mdi_phci_t *ph, mdi_pathinfo_t *,
			    mdi_client_t *);
static void		i_mdi_phci_remove_path(mdi_phci_t *, mdi_pathinfo_t *);
static void		i_mdi_client_remove_path(mdi_client_t *,
			    mdi_pathinfo_t *);

static int		i_mdi_pi_state_change(mdi_pathinfo_t *,
			    mdi_pathinfo_state_t, int);
static int		i_mdi_pi_offline(mdi_pathinfo_t *, int);
static dev_info_t	*i_mdi_devinfo_create(mdi_vhci_t *, char *, char *,
			    char **, int);
static dev_info_t	*i_mdi_devinfo_find(mdi_vhci_t *, char *, char *);
static int		i_mdi_devinfo_remove(dev_info_t *, dev_info_t *, int);
static int		i_mdi_is_child_present(dev_info_t *, dev_info_t *);
static mdi_client_t	*i_mdi_client_alloc(mdi_vhci_t *, char *, char *);
static void		i_mdi_client_enlist_table(mdi_vhci_t *, mdi_client_t *);
static void		i_mdi_client_delist_table(mdi_vhci_t *, mdi_client_t *);
static mdi_client_t	*i_mdi_client_find(mdi_vhci_t *, char *, char *);
static void		i_mdi_client_update_state(mdi_client_t *);
static int		i_mdi_client_compute_state(mdi_client_t *,
			    mdi_phci_t *);
static void		i_mdi_client_lock(mdi_client_t *, mdi_pathinfo_t *);
static void		i_mdi_client_unlock(mdi_client_t *);
static int		i_mdi_client_free(mdi_vhci_t *, mdi_client_t *);
static mdi_client_t	*i_devi_get_client(dev_info_t *);
/*
 * NOTE: this will be removed once the NWS files are changed to use the new
 * mdi_{enable,disable}_path interfaces
 */
static int		i_mdi_pi_enable_disable(dev_info_t *, dev_info_t *,
				int, int);
static mdi_pathinfo_t 	*i_mdi_enable_disable_path(mdi_pathinfo_t *pip,
				mdi_vhci_t *vh, int flags, int op);
/*
 * Failover related function prototypes
 */
static int		i_mdi_failover(void *);

/*
 * misc internal functions
 */
static int		i_mdi_get_hash_key(char *);
static int		i_map_nvlist_error_to_mdi(int);
static void		i_mdi_report_path_state(mdi_client_t *,
			    mdi_pathinfo_t *);

static void		setup_vhci_cache(mdi_vhci_t *);
static int		destroy_vhci_cache(mdi_vhci_t *);
static int		stop_vhcache_async_threads(mdi_vhci_config_t *);
static boolean_t	stop_vhcache_flush_thread(void *, int);
static void		free_string_array(char **, int);
static void		free_vhcache_phci(mdi_vhcache_phci_t *);
static void		free_vhcache_pathinfo(mdi_vhcache_pathinfo_t *);
static void		free_vhcache_client(mdi_vhcache_client_t *);
static int		mainnvl_to_vhcache(mdi_vhci_cache_t *, nvlist_t *);
static nvlist_t		*vhcache_to_mainnvl(mdi_vhci_cache_t *);
static void		vhcache_phci_add(mdi_vhci_config_t *, mdi_phci_t *);
static void		vhcache_phci_remove(mdi_vhci_config_t *, mdi_phci_t *);
static void		vhcache_pi_add(mdi_vhci_config_t *,
			    struct mdi_pathinfo *);
static void		vhcache_pi_remove(mdi_vhci_config_t *,
			    struct mdi_pathinfo *);
static void		free_phclient_path_list(mdi_phys_path_t *);
static void		sort_vhcache_paths(mdi_vhcache_client_t *);
static int		flush_vhcache(mdi_vhci_config_t *, int);
static void		vhcache_dirty(mdi_vhci_config_t *);
static void		free_async_client_config(mdi_async_client_config_t *);
static void		single_threaded_vhconfig_enter(mdi_vhci_config_t *);
static void		single_threaded_vhconfig_exit(mdi_vhci_config_t *);
static nvlist_t		*read_on_disk_vhci_cache(char *);
extern int		fread_nvlist(char *, nvlist_t **);
extern int		fwrite_nvlist(char *, nvlist_t *);

/*
 * i_mdi_init():
 *		One-time initialization of the global MDI state: the global
 *		mdi_mutex, the MDI taskq and the three
 *		'path_instance' <-> "path" hash maps.  Called from
 *		mdi_vhci_register(); every call after the first returns
 *		immediately.
 *
 * NOTE(review): the 'initialized' flag is tested and set without holding any
 * lock, so this relies on the callers being serialized -- confirm.
 */
static void
i_mdi_init(void)
{
	static int initialized = 0;

	if (initialized)
		return;
	initialized = 1;

	mutex_init(&mdi_mutex, NULL, MUTEX_DEFAULT, NULL);

	/* Create our taskq resources */
	mdi_taskq = taskq_create("mdi_taskq", mdi_taskq_n_threads,
	    MDI_TASKQ_PRI, MDI_TASKQ_MINALLOC, MDI_TASKQ_MAXALLOC,
	    TASKQ_PREPOPULATE | TASKQ_CPR_SAFE);
	ASSERT(mdi_taskq != NULL);	/* taskq_create never fails */

	/* Allocate ['path_instance' <-> "path"] maps */
	mutex_init(&mdi_pathmap_mutex, NULL, MUTEX_DRIVER, NULL);
	mdi_pathmap_bypath = mod_hash_create_strhash(
	    "mdi_pathmap_bypath", mdi_pathmap_hash_size,
	    mod_hash_null_valdtor);
	mdi_pathmap_byinstance = mod_hash_create_idhash(
	    "mdi_pathmap_byinstance", mdi_pathmap_hash_size,
	    mod_hash_null_valdtor);
	mdi_pathmap_sbyinstance = mod_hash_create_idhash(
	    "mdi_pathmap_sbyinstance", mdi_pathmap_hash_size,
	    mod_hash_null_valdtor);
}

/*
 * mdi_get_component_type():
 *		Report which mpxio role(s) a devinfo node currently plays by
 *		returning the node's devi_mdi_component field.
 * Return Values:
 *		MDI_COMPONENT_NONE
 *		MDI_COMPONENT_VHCI
 *		MDI_COMPONENT_PHCI
 *		MDI_COMPONENT_CLIENT
 * XXX This doesn't work under multi-level MPxIO and should be
 *	removed when clients migrate to the mdi_component_is_*() interfaces.
 */
int
mdi_get_component_type(dev_info_t *dip)
{
	struct dev_info	*devi = DEVI(dip);

	return (devi->devi_mdi_component);
}

/*
 * mdi_vhci_register():
 *		Register a vHCI module with the mpxio framework.
 *		A vHCI driver calls this from its attach(9e) handler to
 *		register itself as the 'class' vHCI driver, together with its
 *		MDI entry points (vops).  Several threads may race to call
 *		mdi_vhci_register() because vHCI drivers get loaded and
 *		attached as a side effect of pHCI instance registration
 *		(mdi_phci_register()); the registration is therefore done
 *		under mdi_mutex.
 *
 *		Only one vHCI per class is accepted: if the class is already
 *		registered with a non-NULL ops vector the call fails.  For a
 *		new class a mdi_vhci_t is allocated, its load-balance policy is
 *		taken from the LOAD_BALANCE_PROP property of vdip (default
 *		LOAD_BALANCE_RR) and it is appended to the global vHCI list.
 *
 *		NOTE(review): if an entry for 'class' exists with a NULL
 *		vh_ops it is reused as-is; vh_ops and vh_dip are not updated
 *		on that path -- confirm whether that state can occur.
 * Return Values:
 *		MDI_SUCCESS
 *		MDI_FAILURE
 */
/*ARGSUSED*/
int
mdi_vhci_register(char *class, dev_info_t *vdip, mdi_vhci_ops_t *vops,
    int flags)
{
	mdi_vhci_t		*vh;

	/* Registrant can't be older */
	ASSERT(vops->vo_revision <= MDI_VHCI_OPS_REV);

#ifdef DEBUG
	/*
	 * The IB nexus driver is only loaded when IB hardware is present,
	 * and its load/attach is driven from the attach of an IB HCA (pHCI)
	 * driver.  devfs offers no clean way to configure one subtree from
	 * another, so the parent-busy ASSERT is skipped for the IB class.
	 */
	if (strcmp(class, MDI_HCI_CLASS_IB) != 0)
		ASSERT(DEVI_BUSY_OWNED(ddi_get_parent(vdip)));
#endif

	i_mdi_init();

	mutex_enter(&mdi_mutex);

	/* Look for an existing vHCI entry of the same class */
	vh = mdi_vhci_head;
	while (vh != NULL && strcmp(vh->vh_class, class) != 0)
		vh = vh->vh_next;

	/* A class that already has an ops vector cannot register again */
	if (vh != NULL && vh->vh_ops != NULL) {
		mutex_exit(&mdi_mutex);
		cmn_err(CE_NOTE, vhci_greeting, class);
		return (MDI_FAILURE);
	}

	if (vh == NULL) {
		char	*lb_value;

		/* First registration for this class: build the vHCI */
		vh = kmem_zalloc(sizeof (mdi_vhci_t), KM_SLEEP);
		vh->vh_client_table = kmem_zalloc(mdi_client_table_size *
		    sizeof (struct client_hash), KM_SLEEP);

		vh->vh_class = kmem_zalloc(strlen(class) + 1, KM_SLEEP);
		(void) strcpy(vh->vh_class, class);

		/* Round-robin unless the property selects another policy */
		vh->vh_lb = LOAD_BALANCE_RR;
		if (ddi_prop_lookup_string(DDI_DEV_T_ANY, vdip,
		    0, LOAD_BALANCE_PROP, &lb_value) == DDI_SUCCESS) {
			if (strcmp(lb_value, LOAD_BALANCE_PROP_NONE) == 0)
				vh->vh_lb = LOAD_BALANCE_NONE;
			else if (strcmp(lb_value, LOAD_BALANCE_PROP_LBA) == 0)
				vh->vh_lb = LOAD_BALANCE_LBA;
			ddi_prop_free(lb_value);
		}

		mutex_init(&vh->vh_phci_mutex, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&vh->vh_client_mutex, NULL, MUTEX_DEFAULT, NULL);

		/* Remember the registering node and its ops vector */
		vh->vh_dip = vdip;
		vh->vh_ops = vops;

		setup_vhci_cache(vh);

		/* Append to the global list of vHCIs */
		if (mdi_vhci_tail != NULL)
			mdi_vhci_tail->vh_next = vh;
		if (mdi_vhci_head == NULL)
			mdi_vhci_head = vh;
		mdi_vhci_tail = vh;
		mdi_vhci_count++;
	}

	/* Mark the devinfo node as a vHCI and point it back at 'vh' */
	DEVI(vdip)->devi_mdi_component |= MDI_COMPONENT_VHCI;
	DEVI(vdip)->devi_mdi_xhci = (caddr_t)vh;

	mutex_exit(&mdi_mutex);
	return (MDI_SUCCESS);
}

/*
 * mdi_vhci_unregister():
 *		Unregister a vHCI module from the mpxio framework.
 *		Called from the detach(9E) entry point of a vHCI driver.
 *		Fails if vdip is not a registered vHCI, if the vHCI still has
 *		references, pHCIs or clients, or if its cache cannot be
 *		destroyed.  On success the vHCI is unlinked from the global
 *		list and all of its resources are freed.
 * Return Values:
 *		MDI_SUCCESS
 *		MDI_FAILURE
 */
/*ARGSUSED*/
int
mdi_vhci_unregister(dev_info_t *vdip, int flags)
{
	mdi_vhci_t	*vh;
	mdi_vhci_t	*cur;
	mdi_vhci_t	*prev = NULL;
	int		in_use;

	ASSERT(DEVI_BUSY_OWNED(ddi_get_parent(vdip)));

	/* The node must carry a vHCI back reference */
	vh = i_devi_get_vhci(vdip);
	if (vh == NULL)
		return (MDI_FAILURE);

	mutex_enter(&mdi_mutex);

	/* Locate 'vh' on the global list, remembering its predecessor */
	cur = mdi_vhci_head;
	while (cur != NULL && cur != vh) {
		prev = cur;
		cur = cur->vh_next;
	}
	if (cur == NULL) {
		mutex_exit(&mdi_mutex);
		return (MDI_FAILURE);
	}

	/*
	 * Every pHCI and client must already be unregistered, and nobody
	 * may hold a reference, before the vHCI itself can go away.
	 */
	MDI_VHCI_PHCI_LOCK(vh);
	in_use = (vh->vh_refcnt || vh->vh_phci_count || vh->vh_client_count);
	MDI_VHCI_PHCI_UNLOCK(vh);
	if (in_use) {
		mutex_exit(&mdi_mutex);
		return (MDI_FAILURE);
	}

	if (destroy_vhci_cache(vh) != MDI_SUCCESS) {
		mutex_exit(&mdi_mutex);
		return (MDI_FAILURE);
	}

	/* Unlink from the global list, fixing up head and tail */
	if (mdi_vhci_head == vh)
		mdi_vhci_head = vh->vh_next;
	else
		prev->vh_next = vh->vh_next;
	if (mdi_vhci_tail == vh)
		mdi_vhci_tail = prev;
	mdi_vhci_count--;
	mutex_exit(&mdi_mutex);

	/* Detach from the devinfo node and release everything we own */
	vh->vh_ops = NULL;
	DEVI(vdip)->devi_mdi_component &= ~MDI_COMPONENT_VHCI;
	DEVI(vdip)->devi_mdi_xhci = NULL;

	kmem_free(vh->vh_class, strlen(vh->vh_class)+1);
	kmem_free(vh->vh_client_table,
	    mdi_client_table_size * sizeof (struct client_hash));
	mutex_destroy(&vh->vh_phci_mutex);
	mutex_destroy(&vh->vh_client_mutex);
	kmem_free(vh, sizeof (mdi_vhci_t));

	return (MDI_SUCCESS);
}

/*
 * i_mdi_vhci_class2vhci():
 *		Walk the global vHCI list, under mdi_mutex, looking for the
 *		entry whose class name equals 'class'.  The caller must not
 *		already hold mdi_mutex.
 * Return Values:
 *		Handle to the matching vHCI component
 *		NULL if no vHCI of that class is registered
 */
static mdi_vhci_t *
i_mdi_vhci_class2vhci(char *class)
{
	mdi_vhci_t	*cur;

	ASSERT(!MUTEX_HELD(&mdi_mutex));

	mutex_enter(&mdi_mutex);
	cur = mdi_vhci_head;
	while (cur != NULL && strcmp(cur->vh_class, class) != 0)
		cur = cur->vh_next;
	mutex_exit(&mdi_mutex);

	return (cur);
}

/*
 * i_devi_get_vhci():
 *		Utility function to get the handle to a vHCI component from
 *		its devinfo node.  Declared static to agree with the forward
 *		declaration earlier in this file.
 * Return Values:
 *		Handle to a vHCI component (the node's devi_mdi_xhci pointer)
 *		NULL if MDI_VHCI(vdip) is false
 */
static mdi_vhci_t *
i_devi_get_vhci(dev_info_t *vdip)
{
	if (!MDI_VHCI(vdip))
		return (NULL);

	return ((mdi_vhci_t *)DEVI(vdip)->devi_mdi_xhci);
}

/*
 * mdi_phci_register():
 *		Register a pHCI module with the mpxio framework.
 *		A pHCI driver calls this from its attach(9e) handler to bind
 *		the instance 'pdip' to the vHCI registered for 'class'.
 *		Registration is refused when the "mpxio-disable" property of
 *		pdip is "yes" or when no vHCI of that class is registered.
 *		On success a mdi_phci_t is allocated, hooked to the devinfo
 *		node, added to the vHCI cache and appended to the vHCI's pHCI
 *		list, and an ESC_DDI_INITIATOR_REGISTER sysevent is logged.
 * Return Values:
 *		MDI_SUCCESS
 *		MDI_FAILURE
 */
/*ARGSUSED*/
int
mdi_phci_register(char *class, dev_info_t *pdip, int flags)
{
	mdi_vhci_t		*vh;
	mdi_phci_t		*ph;
	char			*disable_val;

	/*
	 * Some subsystems (fcp, for one) register the pHCI from a thread
	 * other than the one running the pHCI attach(9E), with attach
	 * waiting for that thread to finish.  All we can assert is that
	 * somebody has entered the parent (DEVI_BUSY_CHANGING); we cannot
	 * assert DEVI_BUSY_OWNED, which would mean this thread did.
	 */
	ASSERT(DEVI_BUSY_CHANGING(ddi_get_parent(pdip)));

	/*
	 * mpxio stays enabled unless "mpxio-disable" exists and is "yes";
	 * in that case leave a brief message and refuse the registration.
	 */
	if (ddi_prop_lookup_string(DDI_DEV_T_ANY, pdip, 0, "mpxio-disable",
	    &disable_val) == DDI_SUCCESS) {
		int	disabled = (strcmp(disable_val, "yes") == 0);

		if (disabled) {
			MDI_DEBUG(1, (MDI_CONT, pdip,
			    "?multipath capabilities disabled via %s.conf.",
			    ddi_driver_name(pdip)));
		}
		ddi_prop_free(disable_val);
		if (disabled)
			return (MDI_FAILURE);
	}

	/* A vHCI for this class must already be registered */
	vh = (mdi_vhci_t *)i_mdi_vhci_class2vhci(class);
	if (vh == NULL)
		return (MDI_FAILURE);

	/* Build the pHCI extension */
	ph = kmem_zalloc(sizeof (mdi_phci_t), KM_SLEEP);
	mutex_init(&ph->ph_mutex, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&ph->ph_unstable_cv, NULL, CV_DRIVER, NULL);
	ph->ph_dip = pdip;
	ph->ph_vhci = vh;
	ph->ph_next = NULL;
	ph->ph_unstable = 0;
	ph->ph_vprivate = 0;

	MDI_PHCI_LOCK(ph);
	MDI_PHCI_SET_POWER_UP(ph);
	MDI_PHCI_UNLOCK(ph);

	/* Mark the devinfo node as a pHCI and point it back at 'ph' */
	DEVI(pdip)->devi_mdi_component |= MDI_COMPONENT_PHCI;
	DEVI(pdip)->devi_mdi_xhci = (caddr_t)ph;

	vhcache_phci_add(vh->vh_config, ph);

	/* Append to the vHCI's list of pHCIs */
	MDI_VHCI_PHCI_LOCK(vh);
	if (vh->vh_phci_tail != NULL)
		vh->vh_phci_tail->ph_next = ph;
	if (vh->vh_phci_head == NULL)
		vh->vh_phci_head = ph;
	vh->vh_phci_tail = ph;
	vh->vh_phci_count++;
	MDI_VHCI_PHCI_UNLOCK(vh);

	i_mdi_log_sysevent(pdip, class, ESC_DDI_INITIATOR_REGISTER);
	return (MDI_SUCCESS);
}

/*
 * mdi_phci_unregister():
 *		Unregister a pHCI module from the mpxio framework.
 *		Called by pHCI drivers from their detach(9E) handler to
 *		unregister an instance from the framework.  The pHCI is
 *		unlinked from its vHCI's list, every pathinfo node still on
 *		the pHCI has its pi_phci pointer cleared, an
 *		ESC_DDI_INITIATOR_UNREGISTER sysevent is logged and the
 *		mdi_phci_t is freed.
 *
 *		Fails, without modifying any state, if pdip is not a pHCI, has
 *		no vHCI, or is not present on its vHCI's pHCI list.
 * Return Values:
 *		MDI_SUCCESS
 *		MDI_FAILURE
 */
/*ARGSUSED*/
int
mdi_phci_unregister(dev_info_t *pdip, int flags)
{
	mdi_vhci_t		*vh;
	mdi_phci_t		*ph;
	mdi_phci_t		*tmp;
	mdi_phci_t		*prev = NULL;
	mdi_pathinfo_t		*pip;

	ASSERT(DEVI_BUSY_CHANGING(ddi_get_parent(pdip)));

	ph = i_devi_get_phci(pdip);
	if (ph == NULL) {
		MDI_DEBUG(1, (MDI_WARN, pdip, "!not a valid pHCI"));
		return (MDI_FAILURE);
	}

	vh = ph->ph_vhci;
	ASSERT(vh != NULL);
	if (vh == NULL) {
		MDI_DEBUG(1, (MDI_WARN, pdip, "!not a valid vHCI"));
		return (MDI_FAILURE);
	}

	/* Find 'ph' on the vHCI's pHCI list, remembering its predecessor */
	MDI_VHCI_PHCI_LOCK(vh);
	for (tmp = vh->vh_phci_head; tmp != NULL && tmp != ph;
	    tmp = tmp->ph_next)
		prev = tmp;

	/*
	 * Never unlink a pHCI that is not on the list: doing so would
	 * dereference a NULL 'prev' on an empty list, or truncate the list
	 * and corrupt vh_phci_count otherwise.
	 */
	if (tmp == NULL) {
		MDI_VHCI_PHCI_UNLOCK(vh);
		MDI_DEBUG(1, (MDI_WARN, pdip, "!pHCI not on vHCI list"));
		return (MDI_FAILURE);
	}

	if (ph == vh->vh_phci_head) {
		vh->vh_phci_head = ph->ph_next;
	} else {
		prev->ph_next = ph->ph_next;
	}

	if (ph == vh->vh_phci_tail) {
		vh->vh_phci_tail = prev;
	}

	vh->vh_phci_count--;
	MDI_VHCI_PHCI_UNLOCK(vh);

	/* Walk remaining pathinfo nodes and disassociate them from pHCI */
	MDI_PHCI_LOCK(ph);
	for (pip = (mdi_pathinfo_t *)ph->ph_path_head; pip;
	    pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link)
		MDI_PI(pip)->pi_phci = NULL;
	MDI_PHCI_UNLOCK(ph);

	i_mdi_log_sysevent(pdip, vh->vh_class,
	    ESC_DDI_INITIATOR_UNREGISTER);
	vhcache_phci_remove(vh->vh_config, ph);

	/*
	 * Drop the devinfo node's back reference before the pHCI memory is
	 * released so that devi_mdi_xhci never points at freed memory.
	 */
	DEVI(pdip)->devi_mdi_component &= ~MDI_COMPONENT_PHCI;
	DEVI(pdip)->devi_mdi_xhci = NULL;

	cv_destroy(&ph->ph_unstable_cv);
	mutex_destroy(&ph->ph_mutex);
	kmem_free(ph, sizeof (mdi_phci_t));
	return (MDI_SUCCESS);
}

/*
 * i_devi_get_phci():
 *		Utility function to return the pHCI extension hanging off a
 *		devinfo node (its devi_mdi_xhci pointer), or NULL when
 *		MDI_PHCI(pdip) is false.
 */
static mdi_phci_t *
i_devi_get_phci(dev_info_t *pdip)
{
	if (!MDI_PHCI(pdip))
		return (NULL);

	return ((mdi_phci_t *)DEVI(pdip)->devi_mdi_xhci);
}

/*
 * mdi_devi_enter():
 *		Single thread mdi entry into a pHCI devinfo node for modifying
 *		its children.  If necessary we perform an ndi_devi_enter of
 *		the vHCI before doing an ndi_devi_enter of 'phci_dip'.
 *
 *		The value stored through 'circular' has two parts: the vHCI
 *		recursion value in the upper 16 bits and the pHCI recursion
 *		value in the lower 16 bits.  A vHCI part of -1 records that
 *		the vHCI is not held by this call; mdi_devi_exit() then skips
 *		the vHCI exit.
 */
void
mdi_devi_enter(dev_info_t *phci_dip, int *circular)
{
	dev_info_t	*vdip;
	int		vcircular, pcircular;

	/* Verify calling context */
	ASSERT(MDI_PHCI(phci_dip));
	vdip = mdi_devi_get_vdip(phci_dip);
	ASSERT(vdip);			/* A pHCI always has a vHCI */

	/*
	 * If the pHCI is detaching then the framework has already entered
	 * the vHCI on a thread that went down the code path leading to
	 * detach_node().  This framework enter of the vHCI during pHCI
	 * detach is done to avoid deadlock with vHCI power management
	 * operations, which enter the vHCI and then enter down the path
	 * to the pHCI.  If the pHCI is detaching then we piggyback this
	 * call's enter of the vHCI on the framework's vHCI enter that has
	 * already occurred - this is OK because we know that the framework
	 * thread doing the detach is waiting for our completion.
	 *
	 * We should check DEVI_IS_DETACHING under an enter of the parent to
	 * avoid a race with detach - but we can't do that because the
	 * framework has already entered the parent, so we have some
	 * complexity instead.
	 */
	for (;;) {
		if (ndi_devi_tryenter(vdip, &vcircular)) {
			ASSERT(vcircular != -1);
			if (DEVI_IS_DETACHING(phci_dip)) {
				ndi_devi_exit(vdip, vcircular);
				vcircular = -1;
			}
			break;
		} else if (DEVI_IS_DETACHING(phci_dip)) {
			vcircular = -1;
			break;
		} else if (servicing_interrupt()) {
			/*
			 * Don't delay an interrupt (and ensure adaptive
			 * mutex inversion support).
			 */
			ndi_devi_enter(vdip, &vcircular);
			break;
		} else {
			delay_random(mdi_delay);
		}
	}

	ndi_devi_enter(phci_dip, &pcircular);

	/*
	 * Pack both recursion values into one int.  vcircular may be -1
	 * here and left-shifting a negative int is undefined behaviour, so
	 * the packing is done in unsigned arithmetic.
	 */
	*circular = (int)(((uint_t)vcircular << 16) |
	    ((uint_t)pcircular & 0xFFFF));
}

/*
 * mdi_devi_tryenter():
 *		Non-blocking variant of mdi_devi_enter.  Tries to enter the
 *		vHCI and then the pHCI; if the pHCI cannot be entered the vHCI
 *		is exited again.  On success '*circular' receives the vHCI
 *		recursion value in its upper 16 bits and the pHCI recursion
 *		value in its lower 16 bits.
 * Return Values:
 *		1	both nodes entered (locked)
 *		0	busy; '*circular' is left untouched
 */
int
mdi_devi_tryenter(dev_info_t *phci_dip, int *circular)
{
	dev_info_t	*vdip;
	int		vcircular, pcircular;

	/* Verify calling context */
	ASSERT(MDI_PHCI(phci_dip));
	vdip = mdi_devi_get_vdip(phci_dip);
	ASSERT(vdip);			/* A pHCI always has a vHCI */

	if (!ndi_devi_tryenter(vdip, &vcircular))
		return (0);		/* busy */

	if (!ndi_devi_tryenter(phci_dip, &pcircular)) {
		/* back out of the vHCI so that nothing is held on failure */
		ndi_devi_exit(vdip, vcircular);
		return (0);		/* busy */
	}

	*circular = (vcircular << 16) | (pcircular & 0xFFFF);
	return (1);			/* locked */
}

/*
 * mdi_devi_exit():
 *		Release an mdi_devi_enter or a successful mdi_devi_tryenter.
 *		'circular' is split back into its pHCI (low 16 bits) and vHCI
 *		(high 16 bits) recursion values.  The pHCI is exited first;
 *		the vHCI is exited only when its value is not -1.
 */
void
mdi_devi_exit(dev_info_t *phci_dip, int circular)
{
	dev_info_t	*vdip;
	int		vcircular;
	int		pcircular;

	/* Verify calling context */
	ASSERT(MDI_PHCI(phci_dip));
	vdip = mdi_devi_get_vdip(phci_dip);
	ASSERT(vdip);			/* A pHCI always has a vHCI */

	/* sign-extend each 16-bit half so that 0xFFFF reads back as -1 */
	vcircular = (short)((circular >> 16) & 0xFFFF);
	pcircular = (short)(circular & 0xFFFF);

	ndi_devi_exit(phci_dip, pcircular);
	if (vcircular == -1)
		return;
	ndi_devi_exit(vdip, vcircular);
}

/*
 * mdi_devi_exit_phci() and mdi_devi_enter_phci() bracket a pHCI driver's
 * calls to mdi_pi_online/offline, made after the pathinfo node has been held
 * via mdi_hold_path/mdi_rele_path, to avoid deadlock with the vHCI power
 * management code during path online/offline.  Every mdi_devi_exit_phci must
 * be paired with a mdi_devi_enter_phci, and both must happen inside the scope
 * of an active mdi_devi_enter that produced the 'circular' value.
 *
 * mdi_devi_exit_phci():
 *		Take a hold on the pHCI node, then exit it using the pHCI
 *		recursion value kept in the low 16 bits of 'circular'.
 */
void
mdi_devi_exit_phci(dev_info_t *phci_dip, int circular)
{
	/* Verify calling context */
	ASSERT(MDI_PHCI(phci_dip));

	/* The hold is kept until mdi_devi_enter_phci re-enters the node */
	ndi_hold_devi(phci_dip);

	ndi_devi_exit(phci_dip, (short)(circular & 0xFFFF));
}

/*
 * mdi_devi_enter_phci():
 *		Re-enter a pHCI devinfo node and release the hold placed on
 *		it by mdi_devi_exit_phci().  '*circular' is only read, never
 *		written: on DEBUG builds the recursion value returned by the
 *		new ndi_devi_enter is checked against the pHCI value stored in
 *		the low 16 bits of '*circular'.
 */
void
mdi_devi_enter_phci(dev_info_t *phci_dip, int *circular)
{
	int		pcircular;

	/* Verify calling context */
	ASSERT(MDI_PHCI(phci_dip));

	/* enter first, so the node is entered before the hold is dropped */
	ndi_devi_enter(phci_dip, &pcircular);

	/* Drop hold from mdi_devi_exit_phci. */
	ndi_rele_devi(phci_dip);

	/* verify matching mdi_devi_exit_phci/mdi_devi_enter_phci use */
	ASSERT(pcircular == ((short)(*circular & 0xFFFF)));
}

/*
 * mdi_devi_get_vdip():
 *		Given a pHCI dip, return the dip of the vHCI it is registered
 *		with.  Returns NULL when 'pdip' is not a pHCI or has no vHCI.
 */
dev_info_t *
mdi_devi_get_vdip(dev_info_t *pdip)
{
	mdi_phci_t	*ph = i_devi_get_phci(pdip);

	if (ph == NULL || ph->ph_vhci == NULL)
		return (NULL);

	return (ph->ph_vhci->vh_dip);
}

/*
 * mdi_devi_pdip_entered():
 *		Return 1 if 'vdip' is a vHCI and the calling thread has done
 *		an ndi_devi_enter of at least one of its pHCIs
 *		(DEVI_BUSY_OWNED is true for that pHCI's dip); otherwise 0.
 *		The pHCI list is walked under the vHCI's pHCI lock.
 */
int
mdi_devi_pdip_entered(dev_info_t *vdip)
{
	mdi_vhci_t	*vh;
	mdi_phci_t	*ph;
	int		entered = 0;

	vh = i_devi_get_vhci(vdip);
	if (vh == NULL)
		return (0);

	MDI_VHCI_PHCI_LOCK(vh);
	for (ph = vh->vh_phci_head; ph != NULL; ph = ph->ph_next) {
		if (ph->ph_dip && DEVI_BUSY_OWNED(ph->ph_dip)) {
			entered = 1;
			break;
		}
	}
	MDI_VHCI_PHCI_UNLOCK(vh);

	return (entered);
}

/*
 * mdi_phci_path2devinfo():
 *		Utility function to search the pHCIs registered with vHCI
 *		'vdip' for the one whose devfs pathname, as produced by
 *		ddi_pathname(), equals 'pathname'.
 * Return Values:
 *		dev_info node of the matching pHCI
 *		NULL if 'vdip' is not a vHCI or no pHCI matches
 */
dev_info_t *
mdi_phci_path2devinfo(dev_info_t *vdip, caddr_t pathname)
{
	mdi_vhci_t	*vh;
	mdi_phci_t	*ph;
	char		*pathbuf;
	dev_info_t 	*match = NULL;

	vh = i_devi_get_vhci(vdip);
	ASSERT(vh != NULL);

	/* Invalid vHCI component, return failure */
	if (vh == NULL)
		return (NULL);

	/* scratch buffer reused for each pHCI's pathname */
	pathbuf = kmem_zalloc(MAXPATHLEN, KM_SLEEP);

	MDI_VHCI_PHCI_LOCK(vh);
	for (ph = vh->vh_phci_head; ph != NULL; ph = ph->ph_next) {
		dev_info_t	*pdip = ph->ph_dip;

		ASSERT(pdip != NULL);
		pathbuf[0] = '\0';
		(void) ddi_pathname(pdip, pathbuf);
		if (strcmp(pathbuf, pathname) == 0) {
			match = pdip;
			break;
		}
	}
	MDI_VHCI_PHCI_UNLOCK(vh);

	kmem_free(pathbuf, MAXPATHLEN);
	return (match);
}

/*
 * mdi_phci_get_path_count():
 *		Return the number of path information nodes associated with a
 *		given pHCI device (its ph_path_count), or 0 when 'pdip' is not
 *		a pHCI.  The count is read without taking the pHCI lock.
 */
int
mdi_phci_get_path_count(dev_info_t *pdip)
{
	mdi_phci_t	*ph = i_devi_get_phci(pdip);

	return ((ph == NULL) ? 0 : ph->ph_path_count);
}

/*
 * i_mdi_phci_lock():
 *		Lock a pHCI device
 * Arguments:
 *		ph	pHCI to lock
 *		pip	NULL for a plain lock of 'ph'.  Non-NULL requests
 *			reverse-order locking; the code below unlocks and
 *			re-locks 'pip', so the caller is expected to hold its
 *			lock on entry.
 * Return Values:
 *		None
 * Note:
 *		The default locking order is:
 *		_NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_pathinfo::pi_mutex))
 *		But there are a number of situations where locks need to be
 *		grabbed in reverse order.  This routine implements a
 *		try-and-lock mechanism depending on the requested parameter
 *		option.  While it waits, the pathinfo lock is dropped, with a
 *		hold (MDI_PI_HOLD) placed on the pathinfo node for the
 *		duration of the unlocked window.
 */
static void
i_mdi_phci_lock(mdi_phci_t *ph, mdi_pathinfo_t *pip)
{
	if (pip) {
		/* Reverse locking is requested. */
		while (MDI_PHCI_TRYLOCK(ph) == 0) {
			if (servicing_interrupt()) {
				/*
				 * Interrupt context: do not delay.  Drop the
				 * pathinfo lock and take both locks in the
				 * default order instead.
				 */
				MDI_PI_HOLD(pip);
				MDI_PI_UNLOCK(pip);
				MDI_PHCI_LOCK(ph);
				MDI_PI_LOCK(pip);
				MDI_PI_RELE(pip);
				break;
			} else {
				/*
				 * tryenter failed. Try to grab again
				 * after a small delay
				 */
				MDI_PI_HOLD(pip);
				MDI_PI_UNLOCK(pip);
				delay_random(mdi_delay);
				MDI_PI_LOCK(pip);
				MDI_PI_RELE(pip);
			}
		}
	} else {
		/* No pathinfo lock involved: plain blocking lock */
		MDI_PHCI_LOCK(ph);
	}
}

/*
 * i_mdi_phci_unlock():
 *		Unlock the pHCI component.  Function wrapper around
 *		MDI_PHCI_UNLOCK(); the counterpart of i_mdi_phci_lock().
 */
static void
i_mdi_phci_unlock(mdi_phci_t *ph)
{
	MDI_PHCI_UNLOCK(ph);
}

/*
 * i_mdi_devinfo_create():
 *		Create a client device's devinfo node as a child of the vHCI
 *		and decorate it with the MDI_CLIENT_GUID_PROP property and,
 *		when 'compatible' is non-NULL, a "compatible" string array of
 *		'ncompatible' entries.  The caller must hold the vHCI client
 *		lock.
 * Return Values:
 *		dev_info	the newly allocated client node
 *		NULL		allocation or a property update failed; any
 *				partially built node has been freed
 * Notes:
 *		An existing node with the same name and guid trips the ASSERT
 *		below.  Where the ASSERT is compiled out, the duplicate is
 *		only reported with a warning and a new node is created anyway.
 */
static dev_info_t *
i_mdi_devinfo_create(mdi_vhci_t *vh, char *name, char *guid,
	char **compatible, int ncompatible)
{
	dev_info_t *cdip;

	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));

	/* Verify for duplicate entry */
	cdip = i_mdi_devinfo_find(vh, name, guid);
	ASSERT(cdip == NULL);
	if (cdip != NULL) {
		cmn_err(CE_WARN,
		    "i_mdi_devinfo_create: client %s@%s already exists",
		    name ? name : "", guid ? guid : "");
	}

	ndi_devi_alloc_sleep(vh->vh_dip, name, DEVI_SID_NODEID, &cdip);
	if (cdip == NULL)
		return (NULL);

	/*
	 * Attach the global unique identifier property and then, if
	 * supplied, the compatible property.  The node is handed back only
	 * when every requested property went in.
	 */
	if (ndi_prop_update_string(DDI_DEV_T_NONE, cdip,
	    MDI_CLIENT_GUID_PROP, guid) == DDI_PROP_SUCCESS &&
	    (compatible == NULL ||
	    ndi_prop_update_string_array(DDI_DEV_T_NONE, cdip,
	    "compatible", compatible, ncompatible) == DDI_PROP_SUCCESS)) {
		return (cdip);
	}

	/* Failure: throw the half-built node away */
	(void) ndi_prop_remove_all(cdip);
	(void) ndi_devi_free(cdip);
	return (NULL);
}

/*
 * i_mdi_devinfo_find():
 *		Search the children of the vHCI, with the vHCI node entered,
 *		for a devinfo node whose node name equals 'name' and whose
 *		MDI_CLIENT_GUID_PROP property equals 'guid'.
 * Return Values:
 *		Handle to the first matching dev_info node, or NULL
 */
static dev_info_t *
i_mdi_devinfo_find(mdi_vhci_t *vh, caddr_t name, char *guid)
{
	dev_info_t 		*cdip;
	char			*node_guid;
	int			same_guid;
	int			circular;

	ndi_devi_enter(vh->vh_dip, &circular);
	for (cdip = (dev_info_t *)DEVI(vh->vh_dip)->devi_child;
	    cdip != NULL;
	    cdip = (dev_info_t *)DEVI(cdip)->devi_sibling) {
		/* the node name must match first */
		if (strcmp(DEVI(cdip)->devi_node_name, name) != 0)
			continue;

		/* children without a guid property cannot match */
		if (ddi_prop_lookup_string(DDI_DEV_T_ANY, cdip,
		    DDI_PROP_DONTPASS, MDI_CLIENT_GUID_PROP,
		    &node_guid) != DDI_PROP_SUCCESS)
			continue;

		same_guid = (strcmp(node_guid, guid) == 0);
		ddi_prop_free(node_guid);
		if (same_guid)
			break;
	}
	ndi_devi_exit(vh->vh_dip, circular);

	return (cdip);
}

/*
 * i_mdi_devinfo_remove():
 *		Offline and remove a client device node.  The node is only
 *		touched when it is still a child of the vHCI node or when
 *		MDI_CLIENT_FLAGS_DEV_NOT_SUPPORTED is set in 'flags';
 *		otherwise MDI_SUCCESS is returned without doing anything.
 * Return Values:
 *		MDI_SUCCESS, MDI_BUSY or MDI_FAILURE (mapped from the
 *		ndi_devi_offline() result)
 */
static int
i_mdi_devinfo_remove(dev_info_t *vdip, dev_info_t *cdip, int flags)
{
	int	ndi_rv;

	if (i_mdi_is_child_present(vdip, cdip) != MDI_SUCCESS &&
	    !(flags & MDI_CLIENT_FLAGS_DEV_NOT_SUPPORTED)) {
		/* Nothing to remove */
		return (MDI_SUCCESS);
	}

	ndi_rv = ndi_devi_offline(cdip, NDI_DEVFS_CLEAN | NDI_DEVI_REMOVE);
	if (ndi_rv == NDI_SUCCESS) {
		return (MDI_SUCCESS);
	}

	MDI_DEBUG(1, (MDI_NOTE, cdip,
	    "!failed: cdip %p", (void *)cdip));

	/* Convert the NDI failure to an MDI error code */
	return ((ndi_rv == NDI_BUSY) ? MDI_BUSY : MDI_FAILURE);
}

/*
 * i_devi_get_client()
 *		Utility function returning the mdi_client_t hung off a devinfo
 *		node, or NULL when MDI_CLIENT() does not recognise the node as
 *		a client.
 */
static mdi_client_t *
i_devi_get_client(dev_info_t *cdip)
{
	if (!MDI_CLIENT(cdip)) {
		return (NULL);
	}
	return ((mdi_client_t *)DEVI(cdip)->devi_mdi_client);
}

/*
 * i_mdi_is_child_present():
 *		Check whether 'cdip' is currently linked on the child list of
 *		'vdip'.  The list is walked with 'vdip' entered via
 *		ndi_devi_enter().
 * Return Values:
 *		MDI_SUCCESS if the node is found, MDI_FAILURE otherwise
 */
static int
i_mdi_is_child_present(dev_info_t *vdip, dev_info_t *cdip)
{
	int		found = MDI_FAILURE;
	struct dev_info	*child;
	int		circ;

	ndi_devi_enter(vdip, &circ);
	for (child = DEVI(vdip)->devi_child; child != NULL;
	    child = child->devi_sibling) {
		if (child == DEVI(cdip)) {
			found = MDI_SUCCESS;
			break;
		}
	}
	ndi_devi_exit(vdip, circ);
	return (found);
}


/*
 * i_mdi_client_lock():
 *		Grab client component lock
 * Arguments:
 *		ct	Client whose lock is to be acquired.
 *		pip	NULL for a plain MDI_CLIENT_LOCK(); otherwise a
 *			pathinfo node passed in for the reverse-order
 *			acquisition described below.  The code unlocks and
 *			relocks it around each wait, so it is expected to be
 *			locked on entry and is locked again on return.
 * Return Values:
 *		None
 * Note:
 *		The default locking order is:
 *		_NOTE(LOCK_ORDER(mdi_client::ct_mutex mdi_pathinfo::pi_mutex))
 *		But there are a number of situations where locks need to be
 *		grabbed in reverse order.  This routine implements a try and
 *		lock mechanism depending on the requested parameter option.
 */
static void
i_mdi_client_lock(mdi_client_t *ct, mdi_pathinfo_t *pip)
{
	if (pip) {
		/*
		 * Reverse locking is requested.
		 */
		while (MDI_CLIENT_TRYLOCK(ct) == 0) {
			if (servicing_interrupt()) {
				/*
				 * Servicing an interrupt: no delay loop.
				 * Hold the path so it stays referenced,
				 * drop its lock, block on the client lock,
				 * then retake the path lock and drop the hold.
				 */
				MDI_PI_HOLD(pip);
				MDI_PI_UNLOCK(pip);
				MDI_CLIENT_LOCK(ct);
				MDI_PI_LOCK(pip);
				MDI_PI_RELE(pip);
				break;
			} else {
				/*
				 * tryenter failed. Try to grab again
				 * after a small delay.  The path lock is
				 * dropped (with a hold in place) for the
				 * duration of the delay.
				 */
				MDI_PI_HOLD(pip);
				MDI_PI_UNLOCK(pip);
				delay_random(mdi_delay);
				MDI_PI_LOCK(pip);
				MDI_PI_RELE(pip);
			}
		}
	} else {
		MDI_CLIENT_LOCK(ct);
	}
}

/*
 * i_mdi_client_unlock():
 *		Unlock a client component.  Thin wrapper that applies
 *		MDI_CLIENT_UNLOCK() to the given client.
 */
static void
i_mdi_client_unlock(mdi_client_t *ct)
{
	MDI_CLIENT_UNLOCK(ct);
}

/*
 * i_mdi_client_alloc():
 *		Allocate a client structure, fill in its initial state
 *		(private copies of the driver name and guid, FAILED state with
 *		the offline/detach/power-up settings applied, load-balance
 *		policy inherited from the vHCI, empty path list) and link it
 *		into the vHCI's client hash table.  Caller should hold the
 *		vhci client lock.
 * Return Values:
 *		Handle to a client component
 */
/*ARGSUSED*/
static mdi_client_t *
i_mdi_client_alloc(mdi_vhci_t *vh, char *name, char *lguid)
{
	mdi_client_t	*client;
	size_t		name_sz;
	size_t		guid_sz;

	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));

	client = kmem_zalloc(sizeof (*client), KM_SLEEP);
	mutex_init(&client->ct_mutex, NULL, MUTEX_DEFAULT, NULL);

	/* Hash linkage and back pointers */
	client->ct_hnext = NULL;
	client->ct_hprev = NULL;
	client->ct_dip = NULL;
	client->ct_vhci = vh;

	/* Private copies of the identifying strings */
	name_sz = strlen(name) + 1;
	client->ct_drvname = kmem_alloc(name_sz, KM_SLEEP);
	(void) strcpy(client->ct_drvname, name);
	guid_sz = strlen(lguid) + 1;
	client->ct_guid = kmem_alloc(guid_sz, KM_SLEEP);
	(void) strcpy(client->ct_guid, lguid);

	client->ct_cprivate = NULL;
	client->ct_vprivate = NULL;

	/* Initial state; the SET macros are applied under the client lock */
	client->ct_flags = 0;
	client->ct_state = MDI_CLIENT_STATE_FAILED;
	MDI_CLIENT_LOCK(client);
	MDI_CLIENT_SET_OFFLINE(client);
	MDI_CLIENT_SET_DETACH(client);
	MDI_CLIENT_SET_POWER_UP(client);
	MDI_CLIENT_UNLOCK(client);

	/* Failover / stability bookkeeping and their condition variables */
	client->ct_failover_flags = 0;
	client->ct_failover_status = 0;
	cv_init(&client->ct_failover_cv, NULL, CV_DRIVER, NULL);
	client->ct_unstable = 0;
	cv_init(&client->ct_unstable_cv, NULL, CV_DRIVER, NULL);
	cv_init(&client->ct_powerchange_cv, NULL, CV_DRIVER, NULL);

	/* Load-balancing policy comes from the vHCI */
	client->ct_lb = vh->vh_lb;
	client->ct_lb_args = kmem_zalloc(sizeof (client_lb_args_t), KM_SLEEP);
	client->ct_lb_args->region_size = LOAD_BALANCE_DEFAULT_REGION_SIZE;

	/* No paths yet */
	client->ct_path_count = 0;
	client->ct_path_head = NULL;
	client->ct_path_tail = NULL;
	client->ct_path_last = NULL;

	/*
	 * Add this client component to our client hash queue
	 */
	i_mdi_client_enlist_table(vh, client);
	return (client);
}

/*
 * i_mdi_client_enlist_table():
 *		Insert the client at the head of the hash chain selected by
 *		its guid and bump the per-chain and per-vHCI client counts.
 *		Caller should hold the vhci client lock.
 */
static void
i_mdi_client_enlist_table(mdi_vhci_t *vh, mdi_client_t *ct)
{
	struct client_hash	*bucket;

	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));

	bucket = &vh->vh_client_table[i_mdi_get_hash_key(ct->ct_guid)];

	/* Push onto the front of the chain */
	ct->ct_hnext = (mdi_client_t *)bucket->ct_hash_head;
	bucket->ct_hash_head = ct;

	bucket->ct_hash_count++;
	vh->vh_client_count++;
}

/*
 * i_mdi_client_delist_table():
 *		Detach the client device from the client hash table.  If the
 *		client is not on the chain selected by its guid, nothing is
 *		changed.  Caller should hold the vhci client lock.
 */
static void
i_mdi_client_delist_table(mdi_vhci_t *vh, mdi_client_t *ct)
{
	struct client_hash	*bucket;
	mdi_client_t		*cur;
	mdi_client_t		*prev;

	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));

	bucket = &vh->vh_client_table[i_mdi_get_hash_key(ct->ct_guid)];

	/* Locate the client and remember its predecessor on the chain */
	prev = NULL;
	for (cur = (mdi_client_t *)bucket->ct_hash_head;
	    cur != NULL && cur != ct; cur = cur->ct_hnext) {
		prev = cur;
	}

	if (cur == NULL) {
		/* Not on this chain */
		return;
	}

	bucket->ct_hash_count--;
	if (prev != NULL) {
		prev->ct_hnext = ct->ct_hnext;
	} else {
		/* Client was the first entry */
		bucket->ct_hash_head = ct->ct_hnext;
	}
	ct->ct_hnext = NULL;
	vh->vh_client_count--;
}


/*
 * i_mdi_client_free():
 *		Free a client component: break the link between the client
 *		and its devinfo node, remove the client from the hash table,
 *		release all memory and synchronization objects owned by it
 *		and finally remove the devinfo node.
 * Arguments:
 *		vh	vHCI the client belongs to; its client lock must be
 *			held by the caller.
 *		ct	Client to free.  Must not be referenced after return.
 * Return Values:
 *		MDI_SUCCESS (the result of the devinfo removal is ignored)
 * Notes:
 *		The vHCI client lock is dropped and re-acquired around the
 *		devinfo node removal.
 *		A client without a devinfo node (ct_dip == NULL) is handled:
 *		the devinfo teardown steps are skipped for it.
 */
static int
i_mdi_client_free(mdi_vhci_t *vh, mdi_client_t *ct)
{
	int		rv = MDI_SUCCESS;
	int		flags = ct->ct_flags;
	dev_info_t	*cdip;
	dev_info_t	*vdip;

	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));

	vdip = vh->vh_dip;
	cdip = ct->ct_dip;

	/*
	 * Only touch the devinfo node when there is one; cdip is tested
	 * against NULL again below before the node is removed.
	 */
	if (cdip != NULL) {
		(void) ndi_prop_remove(DDI_DEV_T_NONE, cdip,
		    MDI_CLIENT_GUID_PROP);
		DEVI(cdip)->devi_mdi_component &= ~MDI_COMPONENT_CLIENT;
		DEVI(cdip)->devi_mdi_client = NULL;
	}

	/*
	 * Clear out back ref. to dev_info_t node
	 */
	ct->ct_dip = NULL;

	/*
	 * Remove this client from our hash queue
	 */
	i_mdi_client_delist_table(vh, ct);

	/*
	 * Uninitialize and free the component
	 */
	kmem_free(ct->ct_drvname, strlen(ct->ct_drvname) + 1);
	kmem_free(ct->ct_guid, strlen(ct->ct_guid) + 1);
	kmem_free(ct->ct_lb_args, sizeof (client_lb_args_t));
	cv_destroy(&ct->ct_failover_cv);
	cv_destroy(&ct->ct_unstable_cv);
	cv_destroy(&ct->ct_powerchange_cv);
	mutex_destroy(&ct->ct_mutex);
	kmem_free(ct, sizeof (*ct));

	if (cdip != NULL) {
		/* flags was sampled from ct before the structure was freed */
		MDI_VHCI_CLIENT_UNLOCK(vh);
		(void) i_mdi_devinfo_remove(vdip, cdip, flags);
		MDI_VHCI_CLIENT_LOCK(vh);
	}
	return (rv);
}

/*
 * i_mdi_client_find():
 *		Look up a client by guid on the hash chain the guid maps to.
 *		When 'cname' is non-NULL the client's driver name must match
 *		it as well; a NULL 'cname' matches on guid alone.
 *		Caller should hold the vhci client lock.
 * Return Values:
 *		Matching client or NULL
 */
static mdi_client_t *
i_mdi_client_find(mdi_vhci_t *vh, char *cname, char *guid)
{
	struct client_hash	*bucket;
	mdi_client_t		*cand;

	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));

	bucket = &vh->vh_client_table[i_mdi_get_hash_key(guid)];

	for (cand = bucket->ct_hash_head; cand != NULL;
	    cand = cand->ct_hnext) {
		if (strcmp(cand->ct_guid, guid) != 0)
			continue;
		if (cname == NULL || strcmp(cand->ct_drvname, cname) == 0)
			break;
	}
	return (cand);
}

/*
 * i_mdi_client_update_state():
 *		Compute and update client device state.  The state is
 *		computed over all paths (no pHCI is excluded) and stored with
 *		MDI_CLIENT_SET_STATE().  Caller must hold the client lock.
 * Notes:
 *		A client device can be in any of three possible states:
 *
 *		MDI_CLIENT_STATE_OPTIMAL - Client in optimal state with more
 *		than one online/standby paths. Can tolerate failures.
 *		MDI_CLIENT_STATE_DEGRADED - Client device in degraded state with
 *		no alternate paths available as standby. A failure on the online
 *		would result in loss of access to device data.
 *		MDI_CLIENT_STATE_FAILED - Client device in failed state with
 *		no paths available to access the device.
 */
static void
i_mdi_client_update_state(mdi_client_t *ct)
{
	int state;

	ASSERT(MDI_CLIENT_LOCKED(ct));
	/* NULL: do not exclude the paths of any particular pHCI */
	state = i_mdi_client_compute_state(ct, NULL);
	MDI_CLIENT_SET_STATE(ct, state);
}

/*
 * i_mdi_client_compute_state():
 *		Compute client device state from the number of its paths that
 *		are ONLINE or STANDBY: none gives MDI_CLIENT_STATE_FAILED,
 *		exactly one gives MDI_CLIENT_STATE_DEGRADED, two or more give
 *		MDI_CLIENT_STATE_OPTIMAL.  Caller must hold the client lock.
 *
 *		mdi_phci_t *	Pointer to pHCI structure whose paths should
 *				be excluded while computing the new value
 *				(paths with pi_phci equal to this pointer are
 *				skipped).  Used by i_mdi_phci_offline() to
 *				find the new client state after DR of a pHCI.
 */
static int
i_mdi_client_compute_state(mdi_client_t *ct, mdi_phci_t *ph)
{
	int		usable_count = 0;
	mdi_pathinfo_t	*cur, *following;

	ASSERT(MDI_CLIENT_LOCKED(ct));

	for (cur = ct->ct_path_head; cur != NULL; cur = following) {
		MDI_PI_LOCK(cur);
		/* Fetch the link while the path is still locked */
		following = (mdi_pathinfo_t *)MDI_PI(cur)->pi_client_link;
		if (MDI_PI(cur)->pi_phci != ph &&
		    (((MDI_PI(cur)->pi_state & MDI_PATHINFO_STATE_MASK) ==
		    MDI_PATHINFO_STATE_ONLINE) ||
		    ((MDI_PI(cur)->pi_state & MDI_PATHINFO_STATE_MASK) ==
		    MDI_PATHINFO_STATE_STANDBY))) {
			usable_count++;
		}
		MDI_PI_UNLOCK(cur);
	}

	if (usable_count == 0) {
		MDI_DEBUG(2, (MDI_NOTE, ct->ct_dip,
		    "client state failed: ct = %p", (void *)ct));
		return (MDI_CLIENT_STATE_FAILED);
	}
	if (usable_count == 1) {
		/* A single usable path: no alternate to fall back on */
		return (MDI_CLIENT_STATE_DEGRADED);
	}
	return (MDI_CLIENT_STATE_OPTIMAL);
}

/*
 * i_mdi_client2devinfo():
 *		Utility function returning the devinfo node recorded in the
 *		client (ct_dip).  'ct' is dereferenced unconditionally and
 *		must not be NULL.
 */
dev_info_t *
i_mdi_client2devinfo(mdi_client_t *ct)
{
	return (ct->ct_dip);
}

/*
 * mdi_client_path2devinfo():
 *		Given the parent devinfo and child devfs pathname, search the
 *		parent's children for the node whose ddi_pathname() equals
 *		'pathname'.
 * Return Values:
 *		Matching child dev_info node or NULL
 */
dev_info_t *
mdi_client_path2devinfo(dev_info_t *vdip, char *pathname)
{
	dev_info_t	*child;
	char		*pathbuf;
	int		circ;

	/* Scratch buffer for each child's pathname */
	pathbuf = kmem_zalloc(MAXPATHLEN, KM_SLEEP);

	/* Lock parent against changes for the duration of the walk */
	ndi_devi_enter(vdip, &circ);
	for (child = (dev_info_t *)DEVI(vdip)->devi_child; child != NULL;
	    child = (dev_info_t *)DEVI(child)->devi_sibling) {
		pathbuf[0] = '\0';
		(void) ddi_pathname(child, pathbuf);
		if (strcmp(pathbuf, pathname) == 0) {
			break;
		}
	}
	ndi_devi_exit(vdip, circ);

	kmem_free(pathbuf, MAXPATHLEN);
	return (child);
}

/*
 * mdi_client_get_path_count():
 *		Utility function to get number of path information nodes
 *		associated with a given client device.  Returns 0 when the
 *		devinfo node is not an MDI client.
 */
int
mdi_client_get_path_count(dev_info_t *cdip)
{
	mdi_client_t	*client = i_devi_get_client(cdip);

	if (client == NULL) {
		return (0);
	}
	return (client->ct_path_count);
}


/*
 * i_mdi_get_hash_key():
 *		Create a hash using strings as keys: the character values of
 *		the string are summed and the sum is reduced modulo
 *		(CLIENT_HASH_TABLE_SIZE - 1).
 */
static int
i_mdi_get_hash_key(char *str)
{
	uint32_t	sum = 0;
	uint32_t	ch;
	int		i = 0;

	while (str[i] != '\0') {
		ch = str[i];
		sum += ch;
		i++;
	}
	return (sum % (CLIENT_HASH_TABLE_SIZE - 1));
}

/*
 * mdi_get_lb_policy():
 *		Get current load balancing policy for a given client device.
 *		LOAD_BALANCE_NONE is returned when the devinfo node is not an
 *		MDI client.
 */
client_lb_t
mdi_get_lb_policy(dev_info_t *cdip)
{
	mdi_client_t	*client = i_devi_get_client(cdip);

	if (client == NULL) {
		return (LOAD_BALANCE_NONE);
	}
	return (client->ct_lb);
}

/*
 * mdi_set_lb_region_size():
 *		Set current region size for the load-balance
 * Return Values:
 *		MDI_SUCCESS
 *		MDI_FAILURE	- not an MDI client, or the client has no
 *				  load-balance arguments
 */
int
mdi_set_lb_region_size(dev_info_t *cdip, int region_size)
{
	mdi_client_t	*client = i_devi_get_client(cdip);

	if (client == NULL || client->ct_lb_args == NULL) {
		return (MDI_FAILURE);
	}
	client->ct_lb_args->region_size = region_size;
	return (MDI_SUCCESS);
}

/*
 * mdi_set_lb_policy():
 *		Set current load balancing policy for a given client device
 * Return Values:
 *		MDI_SUCCESS
 *		MDI_FAILURE	- the devinfo node is not an MDI client
 */
int
mdi_set_lb_policy(dev_info_t *cdip, client_lb_t lb)
{
	mdi_client_t	*client = i_devi_get_client(cdip);

	if (client == NULL) {
		return (MDI_FAILURE);
	}
	client->ct_lb = lb;
	return (MDI_SUCCESS);
}

/*
 * mdi_failover():
 *		failover function called by the vHCI drivers to initiate
 *		a failover operation.  This is typically due to non-availability
 *		of online paths to route I/O requests.  Failover can be
 *		triggered through user application also.
 *
 *		The vHCI driver calls mdi_failover() to initiate a failover
 *		operation. mdi_failover() calls back into the vHCI driver's
 *		vo_failover() entry point to perform the actual failover
 *		operation.  The reason for requiring the vHCI driver to
 *		initiate failover by calling mdi_failover(), instead of directly
 *		executing vo_failover() itself, is to ensure that the mdi
 *		framework can keep track of the client state properly.
 *		Additionally, mdi_failover() provides as a convenience the
 *		option of performing the failover operation synchronously or
 *		asynchronously
 *
 *		Upon successful completion of the failover operation, the
 *		paths that were previously ONLINE will be in the STANDBY state,
 *		and the newly activated paths will be in the ONLINE state.
 *
 *		The flags modifier determines whether the activation is done
 *		synchronously: MDI_FAILOVER_SYNC
 * Return Values:
 *		MDI_SUCCESS
 *		MDI_FAILURE
 *		MDI_BUSY
 *		MDI_ACCEPT	- asynchronous request (MDI_FAILOVER_ASYNC) was
 *				  handed to the taskq
 *		For a synchronous request the value returned by
 *		i_mdi_failover() is passed back.
 */
/*ARGSUSED*/
int
mdi_failover(dev_info_t *vdip, dev_info_t *cdip, int flags)
{
	int			rv;
	mdi_client_t		*ct;

	ct = i_devi_get_client(cdip);
	ASSERT(ct != NULL);
	if (ct == NULL) {
		/* cdip is not a valid client device. Nothing more to do. */
		return (MDI_FAILURE);
	}

	MDI_CLIENT_LOCK(ct);

	if (MDI_CLIENT_IS_PATH_FREE_IN_PROGRESS(ct)) {
		/* A path to the client is being freed */
		MDI_CLIENT_UNLOCK(ct);
		return (MDI_BUSY);
	}


	if (MDI_CLIENT_IS_FAILED(ct)) {
		/*
		 * Client is in failed state. Nothing more to do.
		 */
		MDI_CLIENT_UNLOCK(ct);
		return (MDI_FAILURE);
	}

	if (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) {
		/*
		 * Failover is already in progress; return BUSY
		 */
		MDI_CLIENT_UNLOCK(ct);
		return (MDI_BUSY);
	}
	/*
	 * Make sure that mdi_pathinfo node state changes are processed.
	 * We do not allow failovers to progress while client path state
	 * changes are in progress
	 */
	if (ct->ct_unstable) {
		if (flags == MDI_FAILOVER_ASYNC) {
			/* Asynchronous callers are not made to wait */
			MDI_CLIENT_UNLOCK(ct);
			return (MDI_BUSY);
		} else {
			/*
			 * Wait on ct_unstable_cv (with ct_mutex as the
			 * associated lock) until the client is stable.
			 */
			while (ct->ct_unstable)
				cv_wait(&ct->ct_unstable_cv, &ct->ct_mutex);
		}
	}

	/*
	 * Client device is in stable state. Before proceeding, perform sanity
	 * checks again.
	 */
	if ((MDI_CLIENT_IS_DETACHED(ct)) || (MDI_CLIENT_IS_FAILED(ct)) ||
	    (!i_ddi_devi_attached(ct->ct_dip))) {
		/*
		 * Client is detached or failed, or its devinfo node is not
		 * attached. Nothing more to do.
		 */
		MDI_CLIENT_UNLOCK(ct);
		return (MDI_FAILURE);
	}

	/*
	 * Set the client state as failover in progress.
	 */
	MDI_CLIENT_SET_FAILOVER_IN_PROGRESS(ct);
	ct->ct_failover_flags = flags;
	MDI_CLIENT_UNLOCK(ct);

	if (flags == MDI_FAILOVER_ASYNC) {
		/*
		 * Submit the initiate failover request via CPR safe
		 * taskq threads.
		 * NOTE(review): the dispatch flag is KM_SLEEP and the
		 * dispatch result is discarded - confirm KM_SLEEP is an
		 * acceptable stand-in for the taskq TQ_SLEEP flag here.
		 */
		(void) taskq_dispatch(mdi_taskq, (task_func_t *)i_mdi_failover,
		    ct, KM_SLEEP);
		return (MDI_ACCEPT);
	} else {
		/*
		 * Synchronous failover mode.  Typically invoked from the user
		 * land.
		 */
		rv = i_mdi_failover(ct);
	}
	return (rv);
}

/*
 * i_mdi_failover():
 *		internal failover function. Invokes vHCI drivers failover
 *		callback function and process the failover status
 * Arguments:
 *		arg	The mdi_client_t to fail over (passed as void * so
 *			the function can also be dispatched on a taskq).
 * Return Values:
 *		Status returned by the vHCI driver's vo_failover() callback,
 *		or MDI_SUCCESS when the driver does not provide one.  The
 *		same value is saved in ct_failover_status.
 *
 * Note: A client device in failover state can not be detached or freed.
 *	 Must be called without the client lock held; the lock is taken
 *	 only after the callback returns.
 */
static int
i_mdi_failover(void *arg)
{
	int		rv = MDI_SUCCESS;
	mdi_client_t	*ct = (mdi_client_t *)arg;
	mdi_vhci_t	*vh = ct->ct_vhci;

	ASSERT(!MDI_CLIENT_LOCKED(ct));

	if (vh->vh_ops->vo_failover != NULL) {
		/*
		 * Call vHCI drivers callback routine
		 */
		rv = (*vh->vh_ops->vo_failover)(vh->vh_dip, ct->ct_dip,
		    ct->ct_failover_flags);
	}

	MDI_CLIENT_LOCK(ct);
	MDI_CLIENT_CLEAR_FAILOVER_IN_PROGRESS(ct);

	/*
	 * Save the failover return status
	 */
	ct->ct_failover_status = rv;

	/*
	 * As a result of failover, client status would have been changed.
	 * Update the client state and wake up anyone waiting on this client
	 * device.
	 */
	i_mdi_client_update_state(ct);

	cv_broadcast(&ct->ct_failover_cv);
	MDI_CLIENT_UNLOCK(ct);
	return (rv);
}

/*
 * i_mdi_lba_lb():
 *		Load balancing by logical block.
 *		IOs within the range described by region_size
 *		would go on the same path. This would improve the
 *		performance by cache-hit on some of the RAID devices.
 *		Search only for online paths (At some point we
 *		may want to balance across target ports).
 *
 *		The paths are scanned twice.  The first pass counts the
 *		ONLINE preferred and ONLINE non-preferred paths; preferred
 *		paths are used if there are any, non-preferred ones otherwise.
 *		The block number shifted right by region_size, modulo the
 *		number of eligible paths, gives the index of the path to use,
 *		and the second pass returns the eligible path with that index.
 *		The index counts eligible paths only, so ineligible paths
 *		interleaved on the client's path list do not cause a miss.
 * Arguments:
 *		ct	Client whose path list is searched.
 *		ret_pip	Set to the selected path on success; untouched
 *			otherwise.  The path is returned held (MDI_PI_HOLD).
 *		bp	buf whose b_blkno selects the region.
 * Return Values:
 *		MDI_SUCCESS	- a path was selected
 *		MDI_FAILURE	- no eligible path was found; the caller
 *				  falls back to round-robin
 */
static int
i_mdi_lba_lb(mdi_client_t *ct, mdi_pathinfo_t **ret_pip, struct buf *bp)
{
	int		path_index = -1;
	int		online_path_count = 0;
	int		online_nonpref_path_count = 0;
	int		region_size = ct->ct_lb_args->region_size;
	mdi_pathinfo_t	*pip;
	mdi_pathinfo_t	*next;
	int		preferred = 0, path_cnt;

	/* Pass 1: count the online paths, split by preference */
	pip = ct->ct_path_head;
	while (pip) {
		MDI_PI_LOCK(pip);
		if (MDI_PI(pip)->pi_state ==
		    MDI_PATHINFO_STATE_ONLINE && MDI_PI(pip)->pi_preferred) {
			online_path_count++;
		} else if (MDI_PI(pip)->pi_state ==
		    MDI_PATHINFO_STATE_ONLINE && !MDI_PI(pip)->pi_preferred) {
			online_nonpref_path_count++;
		}
		next = (mdi_pathinfo_t *)
		    MDI_PI(pip)->pi_client_link;
		MDI_PI_UNLOCK(pip);
		pip = next;
	}
	/* if found any online/preferred then use this type */
	if (online_path_count > 0) {
		path_cnt = online_path_count;
		preferred = 1;
	} else if (online_nonpref_path_count > 0) {
		path_cnt = online_nonpref_path_count;
		preferred = 0;
	} else {
		path_cnt = 0;
	}
	if (path_cnt) {
		path_index = (bp->b_blkno >> region_size) % path_cnt;

		/*
		 * Pass 2: pick the path_index'th eligible path.  Only
		 * eligible paths consume the index; paths in any other state
		 * or with the other preference are stepped over.
		 */
		pip = ct->ct_path_head;
		while (pip) {
			MDI_PI_LOCK(pip);
			if ((MDI_PI(pip)->pi_state ==
			    MDI_PATHINFO_STATE_ONLINE) &&
			    MDI_PI(pip)->pi_preferred == preferred) {
				if (path_index == 0) {
					/* Return the path in hold state */
					MDI_PI_HOLD(pip);
					MDI_PI_UNLOCK(pip);
					*ret_pip = pip;
					return (MDI_SUCCESS);
				}
				path_index--;
			}
			next = (mdi_pathinfo_t *)
			    MDI_PI(pip)->pi_client_link;
			MDI_PI_UNLOCK(pip);
			pip = next;
		}
		/*
		 * Only reached when no path was selected, e.g. because path
		 * states changed between the two passes.
		 */
		MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
		    "lba %llx: path %s %p",
		    bp->b_lblkno, mdi_pi_spathname(pip), (void *)pip));
	}
	return (MDI_FAILURE);
}

/*
 * mdi_select_path():
 *		select a path to access a client device.
 *
 *		mdi_select_path() function is called by the vHCI drivers to
 *		select a path to route the I/O request to.  The caller passes
 *		the block I/O data transfer structure ("buf") as one of the
 *		parameters.  The mpxio framework uses the buf structure
 *		contents to maintain per path statistics (total I/O size /
 *		count pending).  If more than one online paths are available to
 *		select, the framework automatically selects a suitable path
 *		for routing I/O request. If a failover operation is active for
 *		this client device the call shall be failed with MDI_BUSY error
 *		code.
 *
 *		By default this function returns a suitable path in online
 *		state based on the current load balancing policy.  Currently
 *		we support LOAD_BALANCE_NONE (Previously selected online path
 *		will continue to be used till the path is usable) and
 *		LOAD_BALANCE_RR (Online paths will be selected in a round
 *		robin fashion), LOAD_BALANCE_LBA (Online paths will be selected
 *		based on the logical block).  The load balancing policy is
 *		configured through the vHCI driver's configuration file
 *		(driver.conf).
 *
 *		vHCI drivers may override this default behavior by specifying
 *		appropriate flags.  The meaning of the fourth argument depends
 *		on the flags specified. If MDI_SELECT_PATH_INSTANCE is set
 *		then the argument is the "path instance" of the path to select.
 *		If MDI_SELECT_PATH_INSTANCE is not set then the argument is
 *		"start_pip". A non NULL "start_pip" is the starting point to
 *		walk and find the next appropriate path.  The following values
 *		are currently defined: MDI_SELECT_ONLINE_PATH (to select an
 *		ONLINE path) and/or MDI_SELECT_STANDBY_PATH (to select an
 *		STANDBY path).
 *
 *		The non-standard behavior is used by the scsi_vhci driver,
 *		whenever it has to use a STANDBY/FAULTED path.  Eg. during
 *		attach of client devices (to avoid an unnecessary failover
 *		when the STANDBY path comes up first), during failover
 *		(to activate a STANDBY path as ONLINE).
 *
 *		The selected path is returned in a mdi_hold_path() state
 *		(pi_ref_cnt). Caller should release the hold by calling
 *		mdi_rele_path().
 *
 * Return Values:
 *		MDI_SUCCESS	- Completed successfully
 *		MDI_BUSY 	- Client device is busy failing over
 *		MDI_NOPATH	- Client device is online, but no valid path are
 *				  available to access this client device
 *		MDI_FAILURE	- Invalid client device or state
 *		MDI_DEVI_ONLINING
 *				- Client device (struct dev_info state) is in
 *				  onlining state.
 */

/*ARGSUSED*/
int
mdi_select_path(dev_info_t *cdip, struct buf *bp, int flags,
    void *arg, mdi_pathinfo_t **ret_pip)
{
	mdi_client_t	*ct;
	mdi_pathinfo_t	*pip;
	mdi_pathinfo_t	*next;
	mdi_pathinfo_t	*head;
	mdi_pathinfo_t	*start;
	client_lb_t	lbp;	/* load balancing policy */
	int		sb = 1;	/* standard behavior */
	int		preferred = 1;	/* preferred path */
	int		cond, cont = 1;
	int		retry = 0;	/* set when a transient path was seen */
	mdi_pathinfo_t	*start_pip;	/* request starting pathinfo */
	int		path_instance;	/* request specific path instance */

	/* determine type of arg based on flags */
	if (flags & MDI_SELECT_PATH_INSTANCE) {
		path_instance = (int)(intptr_t)arg;
		start_pip = NULL;
	} else {
		path_instance = 0;
		start_pip = (mdi_pathinfo_t *)arg;
	}

	if (flags != 0) {
		/*
		 * disable default behavior
		 */
		sb = 0;
	}

	*ret_pip = NULL;
	ct = i_devi_get_client(cdip);
	if (ct == NULL) {
		/* mdi extensions are NULL, Nothing more to do */
		return (MDI_FAILURE);
	}

	MDI_CLIENT_LOCK(ct);

	/* The client state checks below apply to standard behavior only */
	if (sb) {
		if (MDI_CLIENT_IS_FAILED(ct)) {
			/*
			 * Client is not ready to accept any I/O requests.
			 * Fail this request.
			 */
			MDI_DEBUG(2, (MDI_NOTE, cdip,
			    "client state offline ct = %p", (void *)ct));
			MDI_CLIENT_UNLOCK(ct);
			return (MDI_FAILURE);
		}

		if (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) {
			/*
			 * Check for Failover is in progress. If so tell the
			 * caller that this device is busy.
			 */
			MDI_DEBUG(2, (MDI_NOTE, cdip,
			    "client failover in progress ct = %p",
			    (void *)ct));
			MDI_CLIENT_UNLOCK(ct);
			return (MDI_BUSY);
		}

		/*
		 * Check to see whether the client device is attached.
		 * If not so, let the vHCI driver manually select a path
		 * (standby) and let the probe/attach process to continue.
		 */
		if (MDI_CLIENT_IS_DETACHED(ct) || !i_ddi_devi_attached(cdip)) {
			MDI_DEBUG(4, (MDI_NOTE, cdip,
			    "devi is onlining ct = %p", (void *)ct));
			MDI_CLIENT_UNLOCK(ct);
			return (MDI_DEVI_ONLINING);
		}
	}

	/*
	 * Cache in the client list head.  If head of the list is NULL
	 * return MDI_NOPATH
	 */
	head = ct->ct_path_head;
	if (head == NULL) {
		MDI_CLIENT_UNLOCK(ct);
		return (MDI_NOPATH);
	}

	/* Caller is specifying a specific pathinfo path by path_instance */
	if (path_instance) {
		/* search for pathinfo with correct path_instance */
		for (pip = head;
		    pip && (mdi_pi_get_path_instance(pip) != path_instance);
		    pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link)
			;

		/* If path can't be selected then MDI_NOPATH is returned. */
		if (pip == NULL) {
			MDI_CLIENT_UNLOCK(ct);
			return (MDI_NOPATH);
		}

		/*
		 * Verify state of path. When asked to select a specific
		 * path_instance, we select the requested path in any
		 * state (ONLINE, OFFLINE, STANDBY, FAULT) other than INIT.
		 * We don't however select paths where the pHCI has detached.
		 * NOTE: last pathinfo node of an opened client device may
		 * exist in an OFFLINE state after the pHCI associated with
		 * that path has detached (but pi_phci will be NULL if that
		 * has occurred).
		 */
		MDI_PI_LOCK(pip);
		if ((MDI_PI(pip)->pi_state == MDI_PATHINFO_STATE_INIT) ||
		    (MDI_PI(pip)->pi_phci == NULL)) {
			MDI_PI_UNLOCK(pip);
			MDI_CLIENT_UNLOCK(ct);
			return (MDI_FAILURE);
		}

		/* Return MDI_BUSY if we have a transient condition */
		if (MDI_PI_IS_TRANSIENT(pip)) {
			MDI_PI_UNLOCK(pip);
			MDI_CLIENT_UNLOCK(ct);
			return (MDI_BUSY);
		}

		/*
		 * Return the path in hold state. Caller should release the
		 * hold by calling mdi_rele_path()
		 */
		MDI_PI_HOLD(pip);
		MDI_PI_UNLOCK(pip);
		*ret_pip = pip;
		MDI_CLIENT_UNLOCK(ct);
		return (MDI_SUCCESS);
	}

	/*
	 * for non default behavior, bypass current
	 * load balancing policy and always use LOAD_BALANCE_RR
	 * except that the start point will be adjusted based
	 * on the provided start_pip
	 */
	lbp = sb ? ct->ct_lb : LOAD_BALANCE_RR;

	switch (lbp) {
	case LOAD_BALANCE_NONE:
		/*
		 * Load balancing is None  or Alternate path mode
		 * Start looking for a online mdi_pathinfo node starting from
		 * last known selected path
		 */
		preferred = 1;
		pip = (mdi_pathinfo_t *)ct->ct_path_last;
		if (pip == NULL) {
			pip = head;
		}
		start = pip;
		do {
			MDI_PI_LOCK(pip);
			/*
			 * No need to explicitly check if the path is disabled.
			 * Since we are checking for state == ONLINE and the
			 * same variable is used for DISABLE/ENABLE information.
			 */
			if ((MDI_PI(pip)->pi_state  ==
				MDI_PATHINFO_STATE_ONLINE) &&
				preferred == MDI_PI(pip)->pi_preferred) {
				/*
				 * Return the path in hold state. Caller should
				 * release the hold by calling mdi_rele_path()
				 */
				MDI_PI_HOLD(pip);
				MDI_PI_UNLOCK(pip);
				ct->ct_path_last = pip;
				*ret_pip = pip;
				MDI_CLIENT_UNLOCK(ct);
				return (MDI_SUCCESS);
			}

			/*
			 * Path is busy.
			 */
			if (MDI_PI_IS_DRV_DISABLE_TRANSIENT(pip) ||
			    MDI_PI_IS_TRANSIENT(pip))
				retry = 1;
			/*
			 * Keep looking for a next available online path
			 */
			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
			if (next == NULL) {
				/* wrap around to the head of the list */
				next = head;
			}
			MDI_PI_UNLOCK(pip);
			pip = next;
			/*
			 * Back at the starting path: the first time round
			 * switch to non-preferred paths, the second time stop.
			 */
			if (start == pip && preferred) {
				preferred = 0;
			} else if (start == pip && !preferred) {
				cont = 0;
			}
		} while (cont);
		break;

	case LOAD_BALANCE_LBA:
		/*
		 * Make sure we are looking
		 * for an online path. Otherwise, if it is for a STANDBY
		 * path request, it will go through and fetch an ONLINE
		 * path which is not desirable.
		 */
		if ((ct->ct_lb_args != NULL) &&
			    (ct->ct_lb_args->region_size) && bp &&
				(sb || (flags == MDI_SELECT_ONLINE_PATH))) {
			if (i_mdi_lba_lb(ct, ret_pip, bp)
				    == MDI_SUCCESS) {
				MDI_CLIENT_UNLOCK(ct);
				return (MDI_SUCCESS);
			}
		}
		/* LBA selection not applicable or failed: use round robin */
		/* FALLTHROUGH */
	case LOAD_BALANCE_RR:
		/*
		 * Load balancing is Round Robin. Start looking for a online
		 * mdi_pathinfo node starting from last known selected path
		 * as the start point.  If override flags are specified,
		 * process accordingly.
		 * If the search is already in effect(start_pip not null),
		 * then lets just use the same path preference to continue the
		 * traversal.
		 */

		if (start_pip != NULL) {
			preferred = MDI_PI(start_pip)->pi_preferred;
		} else {
			preferred = 1;
		}

		/*
		 * The search begins at the node following 'start' (or at the
		 * head of the list when there is no start node).
		 */
		start = sb ? (mdi_pathinfo_t *)ct->ct_path_last : start_pip;
		if (start == NULL) {
			pip = head;
		} else {
			pip = (mdi_pathinfo_t *)MDI_PI(start)->pi_client_link;
			if (pip == NULL) {
				if ( flags & MDI_SELECT_NO_PREFERRED) {
					/*
					 * Return since we hit the end of list
					 */
					MDI_CLIENT_UNLOCK(ct);
					return (MDI_NOPATH);
				}

				if (!sb) {
					if (preferred == 0) {
						/*
						 * Looks like we have completed
						 * the traversal as preferred
						 * value is 0. Time to bail out.
						 */
						*ret_pip = NULL;
						MDI_CLIENT_UNLOCK(ct);
						return (MDI_NOPATH);
					} else {
						/*
						 * Looks like we reached the
						 * end of the list. Lets enable
						 * traversal of non preferred
						 * paths.
						 */
						preferred = 0;
					}
				}
				pip = head;
			}
		}
		start = pip;
		do {
			MDI_PI_LOCK(pip);
			/*
			 * Work out in 'cond' whether this path satisfies the
			 * request.  Non-standard requests are matched against
			 * exact flag combinations; any other combination
			 * matches no path.
			 */
			if (sb) {
				cond = ((MDI_PI(pip)->pi_state ==
				    MDI_PATHINFO_STATE_ONLINE &&
					MDI_PI(pip)->pi_preferred ==
						preferred) ? 1 : 0);
			} else {
				if (flags == MDI_SELECT_ONLINE_PATH) {
					cond = ((MDI_PI(pip)->pi_state ==
					    MDI_PATHINFO_STATE_ONLINE &&
						MDI_PI(pip)->pi_preferred ==
						preferred) ? 1 : 0);
				} else if (flags == MDI_SELECT_STANDBY_PATH) {
					cond = ((MDI_PI(pip)->pi_state ==
					    MDI_PATHINFO_STATE_STANDBY &&
						MDI_PI(pip)->pi_preferred ==
						preferred) ? 1 : 0);
				} else if (flags == (MDI_SELECT_ONLINE_PATH |
				    MDI_SELECT_STANDBY_PATH)) {
					cond = (((MDI_PI(pip)->pi_state ==
					    MDI_PATHINFO_STATE_ONLINE ||
					    (MDI_PI(pip)->pi_state ==
					    MDI_PATHINFO_STATE_STANDBY)) &&
						MDI_PI(pip)->pi_preferred ==
						preferred) ? 1 : 0);
				} else if (flags ==
					(MDI_SELECT_STANDBY_PATH |
					MDI_SELECT_ONLINE_PATH |
					MDI_SELECT_USER_DISABLE_PATH)) {
					cond = (((MDI_PI(pip)->pi_state ==
					    MDI_PATHINFO_STATE_ONLINE ||
					    (MDI_PI(pip)->pi_state ==
					    MDI_PATHINFO_STATE_STANDBY) ||
						(MDI_PI(pip)->pi_state ==
					    (MDI_PATHINFO_STATE_ONLINE|
					    MDI_PATHINFO_STATE_USER_DISABLE)) ||
						(MDI_PI(pip)->pi_state ==
					    (MDI_PATHINFO_STATE_STANDBY |
					    MDI_PATHINFO_STATE_USER_DISABLE)))&&
						MDI_PI(pip)->pi_preferred ==
						preferred) ? 1 : 0);
				} else if (flags ==
				    (MDI_SELECT_STANDBY_PATH |
				    MDI_SELECT_ONLINE_PATH |
				    MDI_SELECT_NO_PREFERRED)) {
					/* path preference is ignored here */
					cond = (((MDI_PI(pip)->pi_state ==
					    MDI_PATHINFO_STATE_ONLINE) ||
					    (MDI_PI(pip)->pi_state ==
					    MDI_PATHINFO_STATE_STANDBY))
					    ? 1 : 0);
				} else {
					cond = 0;
				}
			}
			/*
			 * No need to explicitly check if the path is disabled.
			 * Since we are checking for state == ONLINE and the
			 * same variable is used for DISABLE/ENABLE information.
			 */
			if (cond) {
				/*
				 * Return the path in hold state. Caller should
				 * release the hold by calling mdi_rele_path()
				 */
				MDI_PI_HOLD(pip);
				MDI_PI_UNLOCK(pip);
				if (sb)
					ct->ct_path_last = pip;
				*ret_pip = pip;
				MDI_CLIENT_UNLOCK(ct);
				return (MDI_SUCCESS);
			}
			/*
			 * Path is busy.
			 */
			if (MDI_PI_IS_DRV_DISABLE_TRANSIENT(pip) ||
			    MDI_PI_IS_TRANSIENT(pip))
				retry = 1;

			/*
			 * Keep looking for a next available online path.
			 * pip is locked whenever control reaches do_again,
			 * including via the goto further down.
			 */
do_again:
			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
			if (next == NULL) {
				if ( flags & MDI_SELECT_NO_PREFERRED) {
					/*
					 * Bail out since we hit the end of list
					 */
					MDI_PI_UNLOCK(pip);
					break;
				}

				if (!sb) {
					if (preferred == 1) {
						/*
						 * Looks like we reached the
						 * end of the list. Lets enable
						 * traversal of non preferred
						 * paths.
						 */
						preferred = 0;
						next = head;
					} else {
						/*
						 * We have done both the passes
						 * Preferred as well as for
						 * Non-preferred. Bail out now.
						 */
						cont = 0;
					}
				} else {
					/*
					 * Standard behavior case.
					 */
					next = head;
				}
			}
			MDI_PI_UNLOCK(pip);
			if (cont == 0) {
				break;
			}
			pip = next;

			if (!sb) {
				/*
				 * We need to handle the selection of
				 * non-preferred path in the following
				 * case:
				 *
				 * +------+   +------+   +------+   +-----+
				 * | A : 1| - | B : 1| - | C : 0| - |NULL |
				 * +------+   +------+   +------+   +-----+
				 *
				 * If we start the search with B, we need to
				 * skip beyond B to pick C which is non -
				 * preferred in the second pass. The following
				 * test, if true, will allow us to skip over
				 * the 'start'(B in the example) to select
				 * other non preferred elements.
				 */
				if ((start_pip != NULL) && (start_pip == pip) &&
				    (MDI_PI(start_pip)->pi_preferred
				    != preferred)) {
					/*
					 * try again after going past the start
					 * pip
					 */
					MDI_PI_LOCK(pip);
					goto do_again;
				}
			} else {
				/*
				 * Standard behavior case
				 */
				if (start == pip && preferred) {
					/* look for nonpreferred paths */
					preferred = 0;
				} else if (start == pip && !preferred) {
					/*
					 * Exit condition
					 */
					cont = 0;
				}
			}
		} while (cont);
		break;
	}

	/*
	 * No path was selected.  Report MDI_BUSY if a transient path was
	 * seen during the search, MDI_NOPATH otherwise.
	 */
	MDI_CLIENT_UNLOCK(ct);
	if (retry == 1) {
		return (MDI_BUSY);
	} else {
		return (MDI_NOPATH);
	}
}

/*
 * mdi_get_next_phci_path():
 *		Step through the mdi_pathinfo nodes chained off a client.
 *		A NULL 'pip' yields the first node on the client's path list;
 *		a non-NULL 'pip' yields the node that follows it on the client
 *		link.
 *
 * Return Values:
 *		mdi_pathinfo node handle
 *		NULL (ct_dip is not an MDI client, or no further node)
 *
 * Note:
 *		Caller should hold the branch's devinfo node to get a consistent
 *		snap shot of the mdi_pathinfo nodes.
 *
 *		Even when the list itself is stable, the state and properties
 *		of each mdi_pathinfo node are volatile.  Bracket accesses with
 *		mdi_pi_lock() and mdi_pi_unlock() to read stable properties.
 *
 *		If a node is needed beyond the period the devinfo node is held
 *		(for example, for I/O), hold it against unexpected removal with
 *		mdi_hold_path() and drop the hold with mdi_rele_path() on
 *		completion.
 */
mdi_pathinfo_t *
mdi_get_next_phci_path(dev_info_t *ct_dip, mdi_pathinfo_t *pip)
{
	mdi_client_t *ct;

	if (!MDI_CLIENT(ct_dip))
		return (NULL);

	ct = (mdi_client_t *)DEVI(ct_dip)->devi_mdi_client;
	ASSERT(ct != NULL);

	/* No cursor supplied: start at the head of the client list. */
	return ((pip != NULL) ?
	    (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link :
	    (mdi_pathinfo_t *)ct->ct_path_head);
}

/*
 * mdi_get_next_client_path():
 *		Step through the mdi_pathinfo nodes chained off a pHCI.
 *		A NULL 'pip' yields the first node on the pHCI's path list;
 *		a non-NULL 'pip' yields the node that follows it on the pHCI
 *		link.
 *
 * Return Values:
 *		mdi_pathinfo node handle
 *		NULL (ph_dip is not a pHCI, or no further node)
 *
 * Note:
 *		The usage notes on mdi_get_next_phci_path() apply here as well.
 */
mdi_pathinfo_t *
mdi_get_next_client_path(dev_info_t *ph_dip, mdi_pathinfo_t *pip)
{
	mdi_phci_t *ph;

	if (!MDI_PHCI(ph_dip))
		return (NULL);

	ph = (mdi_phci_t *)DEVI(ph_dip)->devi_mdi_xhci;
	ASSERT(ph != NULL);

	/* No cursor supplied: start at the head of the pHCI list. */
	return ((pip != NULL) ?
	    (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link :
	    (mdi_pathinfo_t *)ph->ph_path_head);
}

/*
 * mdi_hold_path():
 *		Take a hold on the mdi_pathinfo node so that it is not freed
 *		unexpectedly.  The hold is taken under the node's lock.
 *		A NULL 'pip' is ignored.
 * Return Values:
 *		None
 */
void
mdi_hold_path(mdi_pathinfo_t *pip)
{
	if (pip == NULL)
		return;

	MDI_PI_LOCK(pip);
	MDI_PI_HOLD(pip);
	MDI_PI_UNLOCK(pip);
}


/*
 * mdi_rele_path():
 *		Drop a hold on a mdi_pathinfo node that was obtained either
 *		through the mdi_select_path() mechanism or by an explicit
 *		mdi_hold_path() call.  A NULL 'pip' is ignored.
 * Return Values:
 *		None
 */
void
mdi_rele_path(mdi_pathinfo_t *pip)
{
	if (pip == NULL)
		return;

	MDI_PI_LOCK(pip);
	MDI_PI_RELE(pip);

	/* Reference count reached zero: wake any waiters on pi_ref_cv. */
	if (MDI_PI(pip)->pi_ref_cnt == 0)
		cv_broadcast(&MDI_PI(pip)->pi_ref_cv);

	MDI_PI_UNLOCK(pip);
}

/*
 * mdi_pi_lock():
 *		Acquire the lock of the mdi_pathinfo node.  'pip' is asserted
 *		to be non-NULL; a NULL 'pip' is otherwise ignored.
 * Note:
 *		The caller releases the lock again with mdi_pi_unlock().
 */
void
mdi_pi_lock(mdi_pathinfo_t *pip)
{
	ASSERT(pip != NULL);
	if (pip == NULL)
		return;

	MDI_PI_LOCK(pip);
}


/*
 * mdi_pi_unlock():
 *		Release the lock of the mdi_pathinfo node.  'pip' is asserted
 *		to be non-NULL; a NULL 'pip' is otherwise ignored.
 * Note:
 *		The node is expected to have been locked with mdi_pi_lock().
 */
void
mdi_pi_unlock(mdi_pathinfo_t *pip)
{
	ASSERT(pip != NULL);
	if (pip == NULL)
		return;

	MDI_PI_UNLOCK(pip);
}

/*
 * mdi_pi_find():
 *		Look up the mdi_pathinfo node whose path address equals "paddr".
 *		With a NULL "caddr" the search walks the path list of the pHCI
 *		identified by "pdip".  With a non-NULL "caddr" the client with
 *		that address is located under the pHCI's vHCI and the client's
 *		path list is searched for a node that belongs to this pHCI and
 *		carries the matching address.
 * Return Values:
 *		mdi_pathinfo node handle
 *		NULL (bad arguments, invalid or offline pHCI, invalid vHCI,
 *		unknown client, or no matching node)
 * Notes:
 *		Caller need not hold any locks to call this function.
 *		The returned node is not held; the list lock is dropped before
 *		returning.
 */
mdi_pathinfo_t *
mdi_pi_find(dev_info_t *pdip, char *caddr, char *paddr)
{
	mdi_phci_t		*ph;
	mdi_vhci_t		*vh;
	mdi_client_t		*ct;
	mdi_pathinfo_t		*pip = NULL;

	MDI_DEBUG(2, (MDI_NOTE, pdip,
	    "caddr@%s paddr@%s", caddr ? caddr : "", paddr ? paddr : ""));
	if (pdip == NULL || paddr == NULL) {
		return (NULL);
	}

	/* Without a valid pHCI there is nothing to search. */
	if ((ph = i_devi_get_phci(pdip)) == NULL) {
		MDI_DEBUG(2, (MDI_WARN, pdip, "invalid phci"));
		return (NULL);
	}

	/* Likewise for the vHCI the pHCI is registered with. */
	if ((vh = ph->ph_vhci) == NULL) {
		MDI_DEBUG(2, (MDI_WARN, pdip, "invalid vhci"));
		return (NULL);
	}

	if (caddr == NULL) {
		/*
		 * No client address given: match on unit address alone
		 * against the nodes hanging off the pHCI.
		 */
		MDI_PHCI_LOCK(ph);
		if (MDI_PHCI_IS_OFFLINE(ph)) {
			MDI_DEBUG(2, (MDI_WARN, pdip,
			    "offline phci %p", (void *)ph));
			MDI_PHCI_UNLOCK(ph);
			return (NULL);
		}

		for (pip = (mdi_pathinfo_t *)ph->ph_path_head; pip != NULL;
		    pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link) {
			if (strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
				break;
			}
		}
		MDI_PHCI_UNLOCK(ph);
		MDI_DEBUG(2, (MDI_NOTE, pdip,
		    "found %s %p", mdi_pi_spathname(pip), (void *)pip));
		return (pip);
	}

	/*
	 * XXX - Is the remainder of this function really required?
	 * A consumer can obtain the same node with
	 * mdi_pi_find(pdip, NULL, paddr); searching the pHCI list or the
	 * client list gives the same answer.
	 */

	/*
	 * Locate the client named by 'caddr'.
	 *
	 * XXX - Passing NULL as the client name works as long as client
	 * addresses (caddr) are unique on a per-vhci basis.
	 */
	MDI_VHCI_CLIENT_LOCK(vh);
	ct = i_mdi_client_find(vh, NULL, caddr);
	if (ct == NULL) {
		/*
		 * Unknown client, so no mdi_pathinfo node can have been
		 * created for it yet.
		 */
		MDI_VHCI_CLIENT_UNLOCK(vh);
		MDI_DEBUG(2, (MDI_NOTE, pdip,
		    "client not found for caddr @%s", caddr ? caddr : ""));
		return (NULL);
	}

	/*
	 * Take the client lock before letting go of the vHCI client lock
	 * (acquisition follows the documented lock order); the vHCI lock is
	 * not needed for the list walk itself.
	 */
	MDI_CLIENT_LOCK(ct);
	MDI_VHCI_CLIENT_UNLOCK(vh);

	/* Match on both the owning pHCI and the unit address. */
	for (pip = (mdi_pathinfo_t *)ct->ct_path_head; pip != NULL;
	    pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link) {
		if ((MDI_PI(pip)->pi_phci == ph) &&
		    strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
			break;
		}
	}
	MDI_CLIENT_UNLOCK(ct);
	MDI_DEBUG(2, (MDI_NOTE, pdip,
	    "found: %s %p", mdi_pi_spathname(pip), (void *)pip));
	return (pip);
}

/*
 * mdi_pi_alloc_compatible():
 *		Return, through 'ret_pip', the mdi_pathinfo node that connects
 *		the pHCI 'pdip' to the client (cname, caddr) at path address
 *		'paddr'.  The client and its devinfo node are created when they
 *		do not exist yet ('compatible'/'ncompatible' are handed to the
 *		devinfo creation), and a new mdi_pathinfo node is allocated
 *		only when the client has no node for this pHCI and address;
 *		otherwise the existing node is returned.
 *
 *		The node identifies a unique device path, can have properties
 *		attached, and is passed to mdi_pi_online() to fully attach and
 *		online the path and client device node.  It must be destroyed
 *		with mdi_pi_free() when the path is no longer operational or
 *		when the caller fails to attach a client device node via
 *		mdi_pi_online(); the framework will not free the resources
 *		allocated.
 *
 *		This function can be called from both interrupt and kernel
 *		contexts.  DDI_NOSLEEP flag should be used while calling
 *		from interrupt contexts.
 * Return Values:
 *		MDI_SUCCESS
 *		MDI_FAILURE	(NULL argument, detaching or invalid pHCI,
 *				invalid vHCI)
 *		MDI_BUSY	(pHCI is not in the ready state)
 *		MDI_NOMEM	(client devinfo node could not be created)
 */
/*ARGSUSED*/
int
mdi_pi_alloc_compatible(dev_info_t *pdip, char *cname, char *caddr, char *paddr,
    char **compatible, int ncompatible, int flags, mdi_pathinfo_t **ret_pip)
{
	mdi_vhci_t	*vh;
	mdi_phci_t	*ph;
	mdi_client_t	*ct;
	mdi_pathinfo_t	*pip = NULL;
	dev_info_t	*cdip;
	int		rv = MDI_NOMEM;
	int		new_path = 0;

	MDI_DEBUG(2, (MDI_NOTE, pdip,
	    "cname %s: caddr@%s paddr@%s",
	    cname ? cname : "", caddr ? caddr : "", paddr ? paddr : ""));

	/* Every pointer argument is mandatory. */
	if (pdip == NULL || cname == NULL || caddr == NULL || paddr == NULL ||
	    ret_pip == NULL) {
		return (MDI_FAILURE);
	}

	*ret_pip = NULL;

	/* A pHCI that is detaching gets no new paths. */
	if (DEVI_IS_DETACHING(pdip)) {
		MDI_DEBUG(1, (MDI_WARN, pdip,
		    "!detaching pHCI=%p", (void *)pdip));
		return (MDI_FAILURE);
	}

	ph = i_devi_get_phci(pdip);
	ASSERT(ph != NULL);
	if (ph == NULL) {
		/* Not a registered pHCI */
		MDI_DEBUG(1, (MDI_WARN, pdip,
		    "!invalid pHCI=%p", (void *)pdip));
		return (MDI_FAILURE);
	}

	MDI_PHCI_LOCK(ph);
	vh = ph->ph_vhci;
	if (vh == NULL) {
		/* pHCI has no vHCI behind it */
		MDI_DEBUG(1, (MDI_WARN, pdip,
		    "!invalid vHCI=%p", (void *)pdip));
		MDI_PHCI_UNLOCK(ph);
		return (MDI_FAILURE);
	}

	if (MDI_PHCI_IS_READY(ph) == 0) {
		/*
		 * Node creation is refused while the pHCI is in
		 * offline/suspended states
		 */
		MDI_DEBUG(1, (MDI_WARN, pdip,
		    "pHCI=%p is not ready", (void *)ph));
		MDI_PHCI_UNLOCK(ph);
		return (MDI_BUSY);
	}

	/* Flag the pHCI unstable for the duration of the allocation. */
	MDI_PHCI_UNSTABLE(ph);
	MDI_PHCI_UNLOCK(ph);

	/* Find the client, creating it on first use. */
	MDI_VHCI_CLIENT_LOCK(vh);
	ct = i_mdi_client_find(vh, cname, caddr);
	if (ct == NULL) {
		ct = i_mdi_client_alloc(vh, cname, caddr);
		ASSERT(ct != NULL);
	}

	if (ct->ct_dip == NULL) {
		/* Client has no devinfo node yet: create one. */
		ct->ct_dip = i_mdi_devinfo_create(vh, cname, caddr,
		    compatible, ncompatible);
		if (ct->ct_dip == NULL) {
			/* rv is still MDI_NOMEM at this point */
			(void) i_mdi_client_free(vh, ct);
			goto fail;
		}
	}
	cdip = ct->ct_dip;

	/* Tie the devinfo node to the client structure. */
	DEVI(cdip)->devi_mdi_component |= MDI_COMPONENT_CLIENT;
	DEVI(cdip)->devi_mdi_client = (caddr_t)ct;

	/* Does the client already have a path via this pHCI and address? */
	MDI_CLIENT_LOCK(ct);
	for (pip = (mdi_pathinfo_t *)ct->ct_path_head; pip != NULL;
	    pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link) {
		if ((MDI_PI(pip)->pi_phci == ph) &&
		    strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
			break;
		}
	}
	MDI_CLIENT_UNLOCK(ct);

	if (pip == NULL) {
		/*
		 * First time this path is seen for the client: build a fresh
		 * pathinfo node and remember that we did so.
		 */
		pip = i_mdi_pi_alloc(ph, paddr, ct);
		ASSERT(pip != NULL);
		new_path = 1;
	}
	rv = MDI_SUCCESS;

fail:
	/* Done with the vHCI-wide client lock. */
	MDI_VHCI_CLIENT_UNLOCK(vh);

	/* The pHCI is stable again. */
	MDI_PHCI_LOCK(ph);
	MDI_PHCI_STABLE(ph);
	MDI_PHCI_UNLOCK(ph);
	*ret_pip = pip;

	MDI_DEBUG(2, (MDI_NOTE, pdip,
	    "alloc %s %p", mdi_pi_spathname(pip), (void *)pip));

	/* Only a newly created node is added to the vhci cache. */
	if (new_path)
		vhcache_pi_add(vh->vh_config, MDI_PI(pip));

	return (rv);
}

/*
 * mdi_pi_alloc():
 *		Form of mdi_pi_alloc_compatible() for callers that supply no
 *		"compatible" list: all arguments are forwarded unchanged and
 *		the list is passed as NULL with a count of zero.
 * Return Values:
 *		As returned by mdi_pi_alloc_compatible()
 */
/*ARGSUSED*/
int
mdi_pi_alloc(dev_info_t *pdip, char *cname, char *caddr, char *paddr,
    int flags, mdi_pathinfo_t **ret_pip)
{
	int	rv;

	rv = mdi_pi_alloc_compatible(pdip, cname, caddr, paddr, NULL, 0,
	    flags, ret_pip);
	return (rv);
}

/*
 * i_mdi_pi_alloc():
 *		Allocate and initialize a mdi_pathinfo node for the path from
 *		pHCI 'ph' to client 'ct' at address 'paddr', assign it a
 *		'path_instance', and link it onto both the pHCI and the client
 *		path lists.  The new node starts out in the INIT state with the
 *		transient flag set and inherits the pHCI's disable flags.
 *		All allocations sleep (KM_SLEEP).
 * Notes:
 *		Caller must hold the vHCI client lock of ph->ph_vhci
 *		(asserted).
 * Return Values:
 *		mdi_pathinfo
 */
/*ARGSUSED*/
static mdi_pathinfo_t *
i_mdi_pi_alloc(mdi_phci_t *ph, char *paddr, mdi_client_t *ct)
{
	mdi_pathinfo_t	*pip;
	int		ct_circular;
	int		ph_circular;
	static char	path[MAXPATHLEN];	/* mdi_pathmap_mutex protects */
	char		*path_persistent;
	int		path_instance;
	size_t		len;
	mod_hash_val_t	hv;

	ASSERT(MDI_VHCI_CLIENT_LOCKED(ph->ph_vhci));

	pip = kmem_zalloc(sizeof (struct mdi_pathinfo), KM_SLEEP);
	mutex_init(&MDI_PI(pip)->pi_mutex, NULL, MUTEX_DEFAULT, NULL);
	MDI_PI(pip)->pi_state = MDI_PATHINFO_STATE_INIT |
	    MDI_PATHINFO_STATE_TRANSIENT;

	/* A new path starts with whatever disable state the pHCI has. */
	if (MDI_PHCI_IS_USER_DISABLED(ph))
		MDI_PI_SET_USER_DISABLE(pip);

	if (MDI_PHCI_IS_DRV_DISABLED_TRANSIENT(ph))
		MDI_PI_SET_DRV_DISABLE_TRANS(pip);

	if (MDI_PHCI_IS_DRV_DISABLED(ph))
		MDI_PI_SET_DRV_DISABLE(pip);

	MDI_PI(pip)->pi_old_state = MDI_PATHINFO_STATE_INIT;
	cv_init(&MDI_PI(pip)->pi_state_cv, NULL, CV_DEFAULT, NULL);
	MDI_PI(pip)->pi_client = ct;
	MDI_PI(pip)->pi_phci = ph;
	MDI_PI(pip)->pi_addr = kmem_alloc(strlen(paddr) + 1, KM_SLEEP);
	(void) strcpy(MDI_PI(pip)->pi_addr, paddr);

	/*
	 * We form the "path" to the pathinfo node, and see if we have
	 * already allocated a 'path_instance' for that "path".  If so,
	 * we use the already allocated 'path_instance'.  If not, we
	 * allocate a new 'path_instance' and associate it with a copy of
	 * the "path" string (which is never freed). The association
	 * between a 'path_instance' and this "path" string persists until
	 * reboot.
	 */
	mutex_enter(&mdi_pathmap_mutex);
	(void) ddi_pathname(ph->ph_dip, path);

	/*
	 * Append "/<node name>@<address>" to the pHCI pathname.  The append
	 * is bounded by the space left in the static buffer so that an
	 * unexpectedly long name or address is truncated rather than written
	 * past the end of 'path'.
	 */
	len = strlen(path);
	(void) snprintf(path + len, sizeof (path) - len, "/%s@%s",
	    mdi_pi_get_node_name(pip), mdi_pi_get_addr(pip));
	if (mod_hash_find(mdi_pathmap_bypath, (mod_hash_key_t)path, &hv) == 0) {
		path_instance = (uint_t)(intptr_t)hv;
	} else {
		/* allocate a new 'path_instance' and persistent "path" */
		path_instance = mdi_pathmap_instance++;
		path_persistent = i_ddi_strdup(path, KM_SLEEP);
		(void) mod_hash_insert(mdi_pathmap_bypath,
		    (mod_hash_key_t)path_persistent,
		    (mod_hash_val_t)(intptr_t)path_instance);
		(void) mod_hash_insert(mdi_pathmap_byinstance,
		    (mod_hash_key_t)(intptr_t)path_instance,
		    (mod_hash_val_t)path_persistent);

		/* create shortpath name */
		(void) snprintf(path, sizeof (path), "%s%d/%s@%s",
		    ddi_driver_name(ph->ph_dip), ddi_get_instance(ph->ph_dip),
		    mdi_pi_get_node_name(pip), mdi_pi_get_addr(pip));
		path_persistent = i_ddi_strdup(path, KM_SLEEP);
		(void) mod_hash_insert(mdi_pathmap_sbyinstance,
		    (mod_hash_key_t)(intptr_t)path_instance,
		    (mod_hash_val_t)path_persistent);
	}
	mutex_exit(&mdi_pathmap_mutex);
	MDI_PI(pip)->pi_path_instance = path_instance;

	(void) nvlist_alloc(&MDI_PI(pip)->pi_prop, NV_UNIQUE_NAME, KM_SLEEP);
	ASSERT(MDI_PI(pip)->pi_prop != NULL);
	MDI_PI(pip)->pi_pprivate = NULL;
	MDI_PI(pip)->pi_cprivate = NULL;
	MDI_PI(pip)->pi_vprivate = NULL;
	MDI_PI(pip)->pi_client_link = NULL;
	MDI_PI(pip)->pi_phci_link = NULL;
	MDI_PI(pip)->pi_ref_cnt = 0;
	MDI_PI(pip)->pi_kstats = NULL;
	MDI_PI(pip)->pi_preferred = 1;
	cv_init(&MDI_PI(pip)->pi_ref_cv, NULL, CV_DEFAULT, NULL);

	/*
	 * Lock both dev_info nodes against changes in parallel.
	 *
	 * The ndi_devi_enter(Client), is atypical since the client is a leaf.
	 * This atypical operation is done to synchronize pathinfo nodes
	 * during devinfo snapshot (see di_register_pip) by 'pretending' that
	 * the pathinfo nodes are children of the Client.
	 */
	ndi_devi_enter(ct->ct_dip, &ct_circular);
	ndi_devi_enter(ph->ph_dip, &ph_circular);

	i_mdi_phci_add_path(ph, pip);
	i_mdi_client_add_path(ct, pip);

	ndi_devi_exit(ph->ph_dip, ph_circular);
	ndi_devi_exit(ct->ct_dip, ct_circular);

	return (pip);
}

/*
 * mdi_pi_pathname_by_instance():
 *	Translate a 'path_instance' into its "path" string by looking it up
 *	in the mdi_pathmap_byinstance hash under mdi_pathmap_mutex.
 *	Returns the "path", or NULL when the instance is not in the hash.
 *	NOTE: returned "path" remains valid forever (until reboot).
 */
char *
mdi_pi_pathname_by_instance(int path_instance)
{
	mod_hash_val_t	hv;
	char		*path = NULL;

	mutex_enter(&mdi_pathmap_mutex);
	if (mod_hash_find(mdi_pathmap_byinstance,
	    (mod_hash_key_t)(intptr_t)path_instance, &hv) == 0)
		path = (char *)hv;
	mutex_exit(&mdi_pathmap_mutex);

	return (path);
}

/*
 * mdi_pi_spathname_by_instance():
 *	Translate a 'path_instance' into its "shortpath" string by looking it
 *	up in the mdi_pathmap_sbyinstance hash under mdi_pathmap_mutex.
 *	Returns the "shortpath", or NULL when the instance is not in the hash.
 *	NOTE: returned "shortpath" remains valid forever (until reboot).
 */
char *
mdi_pi_spathname_by_instance(int path_instance)
{
	mod_hash_val_t	hv;
	char		*path = NULL;

	mutex_enter(&mdi_pathmap_mutex);
	if (mod_hash_find(mdi_pathmap_sbyinstance,
	    (mod_hash_key_t)(intptr_t)path_instance, &hv) == 0)
		path = (char *)hv;
	mutex_exit(&mdi_pathmap_mutex);

	return (path);
}


/*
 * i_mdi_phci_add_path():
 *		Append a mdi_pathinfo node to the tail of the pHCI path list
 *		and bump the pHCI path count.
 * Notes:
 *		The pHCI devinfo node must be busy-owned by the caller
 *		(asserted).  The per-pHCI mutex is taken and dropped here.
 */
static void
i_mdi_phci_add_path(mdi_phci_t *ph, mdi_pathinfo_t *pip)
{
	ASSERT(DEVI_BUSY_OWNED(ph->ph_dip));

	MDI_PHCI_LOCK(ph);
	if (ph->ph_path_head != NULL) {
		/* List already populated: hang the node off the old tail. */
		MDI_PI(ph->ph_path_tail)->pi_phci_link = MDI_PI(pip);
	} else {
		/* First node on the list. */
		ph->ph_path_head = pip;
	}
	ph->ph_path_tail = pip;
	ph->ph_path_count++;
	MDI_PHCI_UNLOCK(ph);
}

/*
 * i_mdi_client_add_path():
 *		Append a mdi_pathinfo node to the tail of the client path list
 *		and bump the client path count.
 * Notes:
 *		The client devinfo node must be busy-owned by the caller
 *		(asserted).  The client mutex is taken and dropped here.
 */
static void
i_mdi_client_add_path(mdi_client_t *ct, mdi_pathinfo_t *pip)
{
	ASSERT(DEVI_BUSY_OWNED(ct->ct_dip));

	MDI_CLIENT_LOCK(ct);
	if (ct->ct_path_head != NULL) {
		/* List already populated: hang the node off the old tail. */
		MDI_PI(ct->ct_path_tail)->pi_client_link = MDI_PI(pip);
	} else {
		/* First node on the list. */
		ct->ct_path_head = pip;
	}
	ct->ct_path_tail = pip;
	ct->ct_path_count++;
	MDI_CLIENT_UNLOCK(ct);
}

/*
 * mdi_pi_free():
 *		Free the mdi_pathinfo node and also client device node if this
 *		is the last path to the device.
 *
 *		The node must be in the offline, init or initing state.  If
 *		references are still outstanding the call waits, in 60 second
 *		steps, for the count to drop to zero and gives up with MDI_BUSY
 *		when a wait times out.  Unless the node is still initing, the
 *		vHCI's vo_pi_uninit() callback (when one is registered) is
 *		invoked first and the node is only freed if it succeeds.  A
 *		vHCI without a vo_pi_uninit() callback is treated as a
 *		successful uninit.
 * Return Values:
 *		MDI_SUCCESS
 *		MDI_FAILURE	(invalid pHCI, vHCI or client)
 *		MDI_BUSY	(wrong state or references still pending)
 *		otherwise the value returned by vo_pi_uninit()
 */
/*ARGSUSED*/
int
mdi_pi_free(mdi_pathinfo_t *pip, int flags)
{
	/*
	 * Default to success: it is the result when the node is still
	 * initing or when the vHCI registered no vo_pi_uninit() callback.
	 * (Previously 'rv' was read uninitialized in the latter case.)
	 */
	int		rv = MDI_SUCCESS;
	mdi_vhci_t	*vh;
	mdi_phci_t	*ph;
	mdi_client_t	*ct;
	int		(*f)();
	int		client_held = 0;

	MDI_PI_LOCK(pip);
	ph = MDI_PI(pip)->pi_phci;
	ASSERT(ph != NULL);
	if (ph == NULL) {
		/*
		 * Invalid pHCI device, return failure
		 */
		MDI_DEBUG(1, (MDI_WARN, NULL,
		    "!invalid pHCI: pip %s %p",
		    mdi_pi_spathname(pip), (void *)pip));
		MDI_PI_UNLOCK(pip);
		return (MDI_FAILURE);
	}

	vh = ph->ph_vhci;
	ASSERT(vh != NULL);
	if (vh == NULL) {
		/* Invalid vHCI device, return failure */
		MDI_DEBUG(1, (MDI_WARN, ph->ph_dip,
		    "!invalid vHCI: pip %s %p",
		    mdi_pi_spathname(pip), (void *)pip));
		MDI_PI_UNLOCK(pip);
		return (MDI_FAILURE);
	}

	ct = MDI_PI(pip)->pi_client;
	ASSERT(ct != NULL);
	if (ct == NULL) {
		/*
		 * Invalid Client device, return failure
		 */
		MDI_DEBUG(1, (MDI_WARN, ph->ph_dip,
		    "!invalid client: pip %s %p",
		    mdi_pi_spathname(pip), (void *)pip));
		MDI_PI_UNLOCK(pip);
		return (MDI_FAILURE);
	}

	/*
	 * Check to see for busy condition.  A mdi_pathinfo can only be freed
	 * if the node state is either offline or init and the reference count
	 * is zero.
	 */
	if (!(MDI_PI_IS_OFFLINE(pip) || MDI_PI_IS_INIT(pip) ||
	    MDI_PI_IS_INITING(pip))) {
		/*
		 * Node is busy
		 */
		MDI_DEBUG(1, (MDI_WARN, ct->ct_dip,
		    "!busy: pip %s %p", mdi_pi_spathname(pip), (void *)pip));
		MDI_PI_UNLOCK(pip);
		return (MDI_BUSY);
	}

	while (MDI_PI(pip)->pi_ref_cnt != 0) {
		/*
		 * Give a chance for pending I/Os to complete.
		 */
		MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
		    "!%d cmds still pending on path: %s %p",
		    MDI_PI(pip)->pi_ref_cnt,
		    mdi_pi_spathname(pip), (void *)pip));
		if (cv_reltimedwait(&MDI_PI(pip)->pi_ref_cv,
		    &MDI_PI(pip)->pi_mutex, drv_usectohz(60 * 1000000),
		    TR_CLOCK_TICK) == -1) {
			/*
			 * The timeout time reached without ref_cnt being zero
			 * being signaled.
			 */
			MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
			    "!Timeout reached on path %s %p without the cond",
			    mdi_pi_spathname(pip), (void *)pip));
			MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
			    "!%d cmds still pending on path %s %p",
			    MDI_PI(pip)->pi_ref_cnt,
			    mdi_pi_spathname(pip), (void *)pip));
			MDI_PI_UNLOCK(pip);
			return (MDI_BUSY);
		}
	}

	/* Remember whether a PM hold has to be released for this path. */
	if (MDI_PI(pip)->pi_pm_held) {
		client_held = 1;
	}
	MDI_PI_UNLOCK(pip);

	vhcache_pi_remove(vh->vh_config, MDI_PI(pip));

	MDI_CLIENT_LOCK(ct);

	/* Prevent further failovers till MDI_VHCI_CLIENT_LOCK is held */
	MDI_CLIENT_SET_PATH_FREE_IN_PROGRESS(ct);

	/*
	 * Wait till failover is complete before removing this node.
	 */
	while (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct))
		cv_wait(&ct->ct_failover_cv, &ct->ct_mutex);

	/*
	 * Drop the client lock so that the vHCI client lock can be taken
	 * first, then re-take the client lock.
	 */
	MDI_CLIENT_UNLOCK(ct);
	MDI_VHCI_CLIENT_LOCK(vh);
	MDI_CLIENT_LOCK(ct);
	MDI_CLIENT_CLEAR_PATH_FREE_IN_PROGRESS(ct);

	/*
	 * A node that is still initing is not handed to vo_pi_uninit();
	 * 'rv' keeps its MDI_SUCCESS default in that case, and also when
	 * the vHCI provides no uninit callback.
	 */
	if (!MDI_PI_IS_INITING(pip)) {
		f = vh->vh_ops->vo_pi_uninit;
		if (f != NULL) {
			rv = (*f)(vh->vh_dip, pip, 0);
		}
	}

	/*
	 * If vo_pi_uninit() completed successfully.
	 */
	if (rv == MDI_SUCCESS) {
		if (client_held) {
			MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
			    "i_mdi_pm_rele_client\n"));
			i_mdi_pm_rele_client(ct, 1);
		}
		i_mdi_pi_free(ph, pip, ct);
		if (ct->ct_path_count == 0) {
			/*
			 * Client lost its last path.
			 * Clean up the client device
			 */
			MDI_CLIENT_UNLOCK(ct);
			(void) i_mdi_client_free(ct->ct_vhci, ct);
			MDI_VHCI_CLIENT_UNLOCK(vh);
			return (rv);
		}
	}
	MDI_CLIENT_UNLOCK(ct);
	MDI_VHCI_CLIENT_UNLOCK(vh);

	/* The node was not freed: put it back into the vhci cache. */
	if (rv == MDI_FAILURE)
		vhcache_pi_add(vh->vh_config, MDI_PI(pip));

	return (rv);
}

/*
 * i_mdi_pi_free():
 *		Tear down a mdi_pathinfo node: destroy its kstats, unlink it
 *		from the client and pHCI path lists, destroy its mutex and
 *		condition variables, and release its address string, property
 *		list and the node itself.
 * Notes:
 *		Caller must hold the client lock (asserted).
 */
static void
i_mdi_pi_free(mdi_phci_t *ph, mdi_pathinfo_t *pip, mdi_client_t *ct)
{
	int	ph_circular;
	int	ct_circular;
	char	*addr;

	ASSERT(MDI_CLIENT_LOCKED(ct));

	/* Per-path kstats go first. */
	i_mdi_pi_kstat_destroy(pip);

	/*
	 * Unlink from both lists with both devinfo nodes entered; see the
	 * comments in i_mdi_pi_alloc() for why the client is entered too.
	 */
	ndi_devi_enter(ct->ct_dip, &ct_circular);
	ndi_devi_enter(ph->ph_dip, &ph_circular);
	i_mdi_client_remove_path(ct, pip);
	i_mdi_phci_remove_path(ph, pip);
	ndi_devi_exit(ph->ph_dip, ph_circular);
	ndi_devi_exit(ct->ct_dip, ct_circular);

	mutex_destroy(&MDI_PI(pip)->pi_mutex);
	cv_destroy(&MDI_PI(pip)->pi_state_cv);
	cv_destroy(&MDI_PI(pip)->pi_ref_cv);

	/* The address was allocated as strlen + 1 bytes; free the same. */
	addr = MDI_PI(pip)->pi_addr;
	if (addr != NULL) {
		kmem_free(addr, strlen(addr) + 1);
		MDI_PI(pip)->pi_addr = NULL;
	}

	if (MDI_PI(pip)->pi_prop != NULL) {
		(void) nvlist_free(MDI_PI(pip)->pi_prop);
		MDI_PI(pip)->pi_prop = NULL;
	}

	kmem_free(pip, sizeof (struct mdi_pathinfo));
}


/*
 * i_mdi_phci_remove_path():
 *		Unlink a mdi_pathinfo node from the pHCI path list, fixing up
 *		the list head, tail and count.  The node's pHCI link and pHCI
 *		back pointer are cleared whether or not it was on the list.
 * Notes:
 *		The pHCI devinfo node must be busy-owned by the caller
 *		(asserted).  The per-pHCI mutex is taken and dropped here.
 */
static void
i_mdi_phci_remove_path(mdi_phci_t *ph, mdi_pathinfo_t *pip)
{
	mdi_pathinfo_t	*path;
	mdi_pathinfo_t	*prev = NULL;

	ASSERT(DEVI_BUSY_OWNED(ph->ph_dip));

	MDI_PHCI_LOCK(ph);

	/* Locate 'pip', remembering the node in front of it. */
	for (path = ph->ph_path_head; path != NULL;
	    path = (mdi_pathinfo_t *)MDI_PI(path)->pi_phci_link) {
		if (path == pip)
			break;
		prev = path;
	}

	if (path != NULL) {
		if (prev == NULL) {
			/* Removing the head: successor becomes the head. */
			ph->ph_path_head =
			    (mdi_pathinfo_t *)MDI_PI(path)->pi_phci_link;
		} else {
			MDI_PI(prev)->pi_phci_link = MDI_PI(path)->pi_phci_link;
		}
		if (ph->ph_path_tail == path) {
			ph->ph_path_tail = prev;
		}
		ph->ph_path_count--;
	}

	/* Detach the node from the pHCI. */
	MDI_PI(pip)->pi_phci_link = NULL;
	MDI_PI(pip)->pi_phci = NULL;
	MDI_PHCI_UNLOCK(ph);
}

/*
 * i_mdi_client_remove_path():
 *		Unlink a mdi_pathinfo node from the client path list, fixing
 *		up the list head, tail, count and the client's ct_path_last
 *		pointer.  The node's client link and client back pointer are
 *		cleared whether or not it was on the list.
 * Notes:
 *		The client devinfo node must be busy-owned and the client lock
 *		held by the caller (both asserted).
 */
static void
i_mdi_client_remove_path(mdi_client_t *ct, mdi_pathinfo_t *pip)
{
	mdi_pathinfo_t	*path;
	mdi_pathinfo_t	*prev = NULL;

	ASSERT(DEVI_BUSY_OWNED(ct->ct_dip));

	ASSERT(MDI_CLIENT_LOCKED(ct));

	/* Locate 'pip', remembering the node in front of it. */
	for (path = ct->ct_path_head; path != NULL;
	    path = (mdi_pathinfo_t *)MDI_PI(path)->pi_client_link) {
		if (path == pip)
			break;
		prev = path;
	}

	if (path != NULL) {
		if (prev == NULL) {
			/* Removing the head: successor becomes the head. */
			ct->ct_path_head =
			    (mdi_pathinfo_t *)MDI_PI(path)->pi_client_link;
		} else {
			MDI_PI(prev)->pi_client_link =
			    MDI_PI(path)->pi_client_link;
		}
		if (ct->ct_path_tail == path) {
			ct->ct_path_tail = prev;
		}
		/* Do not leave ct_path_last pointing at the removed node. */
		if (ct->ct_path_last == path) {
			ct->ct_path_last = ct->ct_path_head;
		}
		ct->ct_path_count--;
	}

	/* Detach the node from the client. */
	MDI_PI(pip)->pi_client_link = NULL;
	MDI_PI(pip)->pi_client = NULL;
}

/*
 * i_mdi_pi_state_change():
 *		Move a mdi_pathinfo node to the requested 'state' (ONLINE,
 *		STANDBY, FAULT or OFFLINE are handled).  A node that is still
 *		initing is first passed to the vHCI's vo_pi_init() callback.
 *		The node is then marked with the matching transient state, the
 *		vHCI's vo_pi_state_change() callback is invoked, and on success
 *		the client state is re-evaluated, which may online or offline
 *		the client devinfo node.  The pHCI is marked unstable while the
 *		transition is in progress and stable again on exit.
 *
 *		'flag' is passed through to vo_pi_state_change(); NDI_USER_REQ
 *		in 'flag' selects the user-initiated offline handling below.
 *
 * Return Values:
 *		MDI_SUCCESS
 *		MDI_FAILURE
 *		MDI_BUSY	(pHCI not ready, or devinfo online/offline
 *				reported NDI_BUSY)
 *		otherwise the value returned by vo_pi_state_change()
 */
/*ARGSUSED*/
static int
i_mdi_pi_state_change(mdi_pathinfo_t *pip, mdi_pathinfo_state_t state, int flag)
{
	int		rv = MDI_SUCCESS;
	mdi_vhci_t	*vh;
	mdi_phci_t	*ph;
	mdi_client_t	*ct;
	int		(*f)();
	dev_info_t	*cdip;

	MDI_PI_LOCK(pip);

	ph = MDI_PI(pip)->pi_phci;
	ASSERT(ph);
	if (ph == NULL) {
		/*
		 * Invalid pHCI device, fail the request
		 */
		MDI_PI_UNLOCK(pip);
		MDI_DEBUG(1, (MDI_WARN, NULL,
		    "!invalid phci: pip %s %p",
		    mdi_pi_spathname(pip), (void *)pip));
		return (MDI_FAILURE);
	}

	vh = ph->ph_vhci;
	ASSERT(vh);
	if (vh == NULL) {
		/*
		 * Invalid vHCI device, fail the request
		 */
		MDI_PI_UNLOCK(pip);
		MDI_DEBUG(1, (MDI_WARN, ph->ph_dip,
		    "!invalid vhci: pip %s %p",
		    mdi_pi_spathname(pip), (void *)pip));
		return (MDI_FAILURE);
	}

	ct = MDI_PI(pip)->pi_client;
	ASSERT(ct != NULL);
	if (ct == NULL) {
		/*
		 * Invalid client device, fail the request
		 */
		MDI_PI_UNLOCK(pip);
		MDI_DEBUG(1, (MDI_WARN, ph->ph_dip,
		    "!invalid client: pip %s %p",
		    mdi_pi_spathname(pip), (void *)pip));
		return (MDI_FAILURE);
	}

	/*
	 * If this path has not been initialized yet, Callback vHCI driver's
	 * pathinfo node initialize entry point.  The pathinfo lock is dropped
	 * across the callback; a vHCI without vo_pi_init() is not an error.
	 */

	if (MDI_PI_IS_INITING(pip)) {
		MDI_PI_UNLOCK(pip);
		f = vh->vh_ops->vo_pi_init;
		if (f != NULL) {
			rv = (*f)(vh->vh_dip, pip, 0);
			if (rv != MDI_SUCCESS) {
				MDI_DEBUG(1, (MDI_WARN, ct->ct_dip,
				    "!vo_pi_init failed: vHCI %p, pip %s %p",
				    (void *)vh, mdi_pi_spathname(pip),
				    (void *)pip));
				return (MDI_FAILURE);
			}
		}
		MDI_PI_LOCK(pip);
		MDI_PI_CLEAR_TRANSIENT(pip);
	}

	/*
	 * Do not allow state transition when pHCI is in offline/suspended
	 * states
	 */
	i_mdi_phci_lock(ph, pip);
	if (MDI_PHCI_IS_READY(ph) == 0) {
		MDI_DEBUG(1, (MDI_WARN, ct->ct_dip,
		    "!pHCI not ready, pHCI=%p", (void *)ph));
		MDI_PI_UNLOCK(pip);
		i_mdi_phci_unlock(ph);
		return (MDI_BUSY);
	}
	/* Undone at state_change_exit, where the pHCI is marked stable. */
	MDI_PHCI_UNSTABLE(ph);
	i_mdi_phci_unlock(ph);

	/*
	 * Check if mdi_pathinfo state is in transient state.
	 * If yes, offlining is in progress and wait till transient state is
	 * cleared.
	 */
	if (MDI_PI_IS_TRANSIENT(pip)) {
		while (MDI_PI_IS_TRANSIENT(pip)) {
			cv_wait(&MDI_PI(pip)->pi_state_cv,
			    &MDI_PI(pip)->pi_mutex);
		}
	}

	/*
	 * Grab the client lock in reverse order sequence and release the
	 * mdi_pathinfo mutex.
	 */
	i_mdi_client_lock(ct, pip);
	MDI_PI_UNLOCK(pip);

	/*
	 * Wait till failover state is cleared
	 */
	while (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct))
		cv_wait(&ct->ct_failover_cv, &ct->ct_mutex);

	/*
	 * Mark the mdi_pathinfo node state as transient
	 */
	MDI_PI_LOCK(pip);
	switch (state) {
	case MDI_PATHINFO_STATE_ONLINE:
		MDI_PI_SET_ONLINING(pip);
		break;

	case MDI_PATHINFO_STATE_STANDBY:
		MDI_PI_SET_STANDBYING(pip);
		break;

	case MDI_PATHINFO_STATE_FAULT:
		/*
		 * Mark the pathinfo state as FAULTED
		 */
		MDI_PI_SET_FAULTING(pip);
		MDI_PI_ERRSTAT(pip, MDI_PI_HARDERR);
		break;

	case MDI_PATHINFO_STATE_OFFLINE:
		/*
		 * ndi_devi_offline() cannot hold pip or ct locks.
		 */
		MDI_PI_UNLOCK(pip);

		/*
		 * If this is a user initiated path online->offline operation
		 * who's success would transition a client from DEGRADED to
		 * FAILED then only proceed if we can offline the client first.
		 */
		cdip = ct->ct_dip;
		if ((flag & NDI_USER_REQ) &&
		    MDI_PI_IS_ONLINE(pip) &&
		    (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_DEGRADED)) {
			i_mdi_client_unlock(ct);
			rv = ndi_devi_offline(cdip, NDI_DEVFS_CLEAN);
			if (rv != NDI_SUCCESS) {
				/*
				 * Convert to MDI error code
				 */
				switch (rv) {
				case NDI_BUSY:
					rv = MDI_BUSY;
					break;
				default:
					rv = MDI_FAILURE;
					break;
				}
				/*
				 * Neither the pip lock nor the client lock
				 * is held at this point; only the pHCI
				 * unstable mark remains to be undone.
				 */
				goto state_change_exit;
			} else {
				i_mdi_client_lock(ct, NULL);
			}
		}
		/*
		 * Mark the mdi_pathinfo node state as transient
		 */
		MDI_PI_LOCK(pip);
		MDI_PI_SET_OFFLINING(pip);
		break;
	}
	MDI_PI_UNLOCK(pip);
	MDI_CLIENT_UNSTABLE(ct);
	i_mdi_client_unlock(ct);

	/*
	 * Let the vHCI driver act on the state change.  No pip or client
	 * lock is held across the callback.  'rv' keeps its current value
	 * when the vHCI has no vo_pi_state_change() entry point.
	 */
	f = vh->vh_ops->vo_pi_state_change;
	if (f != NULL)
		rv = (*f)(vh->vh_dip, pip, state, 0, flag);

	MDI_CLIENT_LOCK(ct);
	MDI_PI_LOCK(pip);
	if (rv == MDI_NOT_SUPPORTED) {
		MDI_CLIENT_SET_DEV_NOT_SUPPORTED(ct);
	}
	if (rv != MDI_SUCCESS) {
		MDI_DEBUG(2, (MDI_WARN, ct->ct_dip,
		    "vo_pi_state_change failed: rv %x", rv));
	}
	/*
	 * Resolve the transient state: clear the transient condition on
	 * success, otherwise put back the value given by MDI_PI_OLD_STATE().
	 */
	if (MDI_PI_IS_TRANSIENT(pip)) {
		if (rv == MDI_SUCCESS) {
			MDI_PI_CLEAR_TRANSIENT(pip);
		} else {
			MDI_PI(pip)->pi_state = MDI_PI_OLD_STATE(pip);
		}
	}

	/*
	 * Wake anyone waiting for this mdi_pathinfo node
	 */
	cv_broadcast(&MDI_PI(pip)->pi_state_cv);
	MDI_PI_UNLOCK(pip);

	/*
	 * Mark the client device as stable
	 */
	MDI_CLIENT_STABLE(ct);
	if (rv == MDI_SUCCESS) {
		if (ct->ct_unstable == 0) {
			cdip = ct->ct_dip;

			/*
			 * Onlining the mdi_pathinfo node will impact the
			 * client state Update the client and dev_info node
			 * state accordingly.  From here until the conversion
			 * switch below, 'rv' carries an NDI error code.
			 */
			rv = NDI_SUCCESS;
			i_mdi_client_update_state(ct);
			switch (MDI_CLIENT_STATE(ct)) {
			case MDI_CLIENT_STATE_OPTIMAL:
			case MDI_CLIENT_STATE_DEGRADED:
				if (cdip && !i_ddi_devi_attached(cdip) &&
				    ((state == MDI_PATHINFO_STATE_ONLINE) ||
				    (state == MDI_PATHINFO_STATE_STANDBY))) {

					/*
					 * Must do ndi_devi_online() through
					 * hotplug thread for deferred
					 * attach mechanism to work
					 */
					MDI_CLIENT_UNLOCK(ct);
					rv = ndi_devi_online(cdip, 0);
					MDI_CLIENT_LOCK(ct);
					if ((rv != NDI_SUCCESS) &&
					    (MDI_CLIENT_STATE(ct) ==
					    MDI_CLIENT_STATE_DEGRADED)) {
						/*
						 * ndi_devi_online failed.
						 * Reset client flags to
						 * offline.
						 */
						MDI_DEBUG(1, (MDI_WARN, cdip,
						    "!ndi_devi_online failed "
						    "error %x", rv));
						MDI_CLIENT_SET_OFFLINE(ct);
					}
					if (rv != NDI_SUCCESS) {
						/* Reset the path state */
						MDI_PI_LOCK(pip);
						MDI_PI(pip)->pi_state =
						    MDI_PI_OLD_STATE(pip);
						MDI_PI_UNLOCK(pip);
					}
				}
				break;

			case MDI_CLIENT_STATE_FAILED:
				/*
				 * This is the last path case for
				 * non-user initiated events.
				 */
				if (((flag & NDI_USER_REQ) == 0) &&
				    cdip && (i_ddi_node_state(cdip) >=
				    DS_INITIALIZED)) {
					MDI_CLIENT_UNLOCK(ct);
					rv = ndi_devi_offline(cdip,
					    NDI_DEVFS_CLEAN);
					MDI_CLIENT_LOCK(ct);

					if (rv != NDI_SUCCESS) {
						/*
						 * ndi_devi_offline failed.
						 * Reset client flags to
						 * online as the path could not
						 * be offlined.
						 */
						MDI_DEBUG(1, (MDI_WARN, cdip,
						    "!ndi_devi_offline failed: "
						    "error %x", rv));
						MDI_CLIENT_SET_ONLINE(ct);
					}
				}
				break;
			}
			/*
			 * Convert to MDI error code
			 */
			switch (rv) {
			case NDI_SUCCESS:
				MDI_CLIENT_SET_REPORT_DEV_NEEDED(ct);
				i_mdi_report_path_state(ct, pip);
				rv = MDI_SUCCESS;
				break;
			case NDI_BUSY:
				rv = MDI_BUSY;
				break;
			default:
				rv = MDI_FAILURE;
				break;
			}
		}
	}
	MDI_CLIENT_UNLOCK(ct);

state_change_exit:
	/*
	 * Mark the pHCI as stable again.
	 */
	MDI_PHCI_LOCK(ph);
	MDI_PHCI_STABLE(ph);
	MDI_PHCI_UNLOCK(ph);
	return (rv);
}

/*
 * mdi_pi_online():
 *		Bring the path_info node online so that mdi_select_path() can
 *		pick it for transporting I/O requests to client devices.
 *		After a successful state change, a path that has no PM hold
 *		yet gets one, the client's pHCIs are powered when the client's
 *		power count is zero, and a PM hold is taken on the client.
 * Return Values:
 *		MDI_SUCCESS
 *		MDI_FAILURE
 *		otherwise the failure code from i_mdi_pi_state_change(), or
 *		the result of i_mdi_power_all_phci() when that is invoked
 */
int
mdi_pi_online(mdi_pathinfo_t *pip, int flags)
{
	mdi_client_t	*ct = MDI_PI(pip)->pi_client;
	int		took_pip_hold;
	int		rv;

	ASSERT(ct != NULL);
	rv = i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_ONLINE, flags);
	if (rv != MDI_SUCCESS)
		return (rv);

	/* Take the per-path PM hold only if the path does not have one. */
	MDI_PI_LOCK(pip);
	took_pip_hold = (MDI_PI(pip)->pi_pm_held == 0);
	if (took_pip_hold) {
		MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
		    "i_mdi_pm_hold_pip %p", (void *)pip));
		i_mdi_pm_hold_pip(pip);
	}
	MDI_PI_UNLOCK(pip);

	if (!took_pip_hold)
		return (rv);

	/* A fresh path hold is matched by a hold on the client. */
	MDI_CLIENT_LOCK(ct);
	if (ct->ct_power_cnt == 0) {
		rv = i_mdi_power_all_phci(ct);
	}

	MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
	    "i_mdi_pm_hold_client %p", (void *)ct));
	i_mdi_pm_hold_client(ct, 1);
	MDI_CLIENT_UNLOCK(ct);

	return (rv);
}

/*
 * mdi_pi_standby():
 *		Request the transition of the mdi_pathinfo node to the standby
 *		state; a thin front end to i_mdi_pi_state_change().
 *
 * Return Values:
 *		MDI_SUCCESS
 *		MDI_FAILURE
 *		(or any other code i_mdi_pi_state_change() returns)
 */
int
mdi_pi_standby(mdi_pathinfo_t *pip, int flags)
{
	int	rv;

	rv = i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_STANDBY, flags);
	return (rv);
}

/*
 * mdi_pi_fault():
 *		Request the transition of the mdi_pathinfo node to the
 *		fault'ed state; a thin front end to i_mdi_pi_state_change().
 * Return Values:
 *		MDI_SUCCESS
 *		MDI_FAILURE
 *		(or any other code i_mdi_pi_state_change() returns)
 */
int
mdi_pi_fault(mdi_pathinfo_t *pip, int flags)
{
	int	rv;

	rv = i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_FAULT, flags);
	return (rv);
}

/*
 * mdi_pi_offline():
 *		Offline a mdi_pathinfo node.  When the state change succeeds
 *		and the path carries a PM hold, one PM hold on the client is
 *		released.
 * Return Values:
 *		MDI_SUCCESS
 *		MDI_FAILURE
 *		(or any other code i_mdi_pi_state_change() returns)
 */
int
mdi_pi_offline(mdi_pathinfo_t *pip, int flags)
{
	mdi_client_t	*ct;
	int		pm_held;
	int		ret;

	/*
	 * NDI_DEVI_REMOVE used to be overloaded on this interface to mean
	 * "user initiated operation" (i.e. devctl).  Translate it to
	 * NDI_USER_REQ, which is what callers should pass nowadays.
	 */
	if (flags & NDI_DEVI_REMOVE)
		flags = (flags & ~NDI_DEVI_REMOVE) | NDI_USER_REQ;

	ret = i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_OFFLINE, flags);
	if (ret != MDI_SUCCESS)
		return (ret);

	/* Sample the path's PM hold flag under the pathinfo lock. */
	MDI_PI_LOCK(pip);
	pm_held = (MDI_PI(pip)->pi_pm_held != 0);
	MDI_PI_UNLOCK(pip);

	if (pm_held) {
		ct = MDI_PI(pip)->pi_client;
		MDI_CLIENT_LOCK(ct);
		MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
		    "i_mdi_pm_rele_client\n"));
		i_mdi_pm_rele_client(ct, 1);
		MDI_CLIENT_UNLOCK(ct);
	}

	return (ret);
}

/*
 * i_mdi_pi_offline():
 *		Offline a mdi_pathinfo node and call the vHCI driver's callback
 *
 *		Steps performed:
 *		1. Wait (in 60 second timed waits, repeated until satisfied)
 *		   for pi_ref_cnt to drop to zero.
 *		2. If the vHCI registered a vo_pi_state_change callback, call
 *		   it with the pathinfo lock dropped.
 *		3. Mark the path OFFLINE and wake waiters on pi_state_cv.  This
 *		   happens even when the callback reported a failure.
 *		4. If the callback succeeded (or none is registered), update
 *		   the client state and, when the client ends up FAILED and its
 *		   devinfo node is at least DS_INITIALIZED, offline the client
 *		   devinfo node.  Finally report the path state.
 *
 * Return Values:
 *		MDI_SUCCESS
 *		MDI_BUSY	(ndi_devi_offline() of the client was busy)
 *		MDI_FAILURE
 *		or the vHCI callback's return value when it fails
 */
static int
i_mdi_pi_offline(mdi_pathinfo_t *pip, int flags)
{
	dev_info_t	*vdip = NULL;
	mdi_vhci_t	*vh = NULL;
	mdi_client_t	*ct = NULL;
	int		(*f)();
	/*
	 * Start out successful: when the vHCI has not registered a state
	 * change callback nothing can veto the offline, and rv must still
	 * hold a defined value for the checks further down.
	 */
	int		rv = MDI_SUCCESS;

	MDI_PI_LOCK(pip);
	ct = MDI_PI(pip)->pi_client;
	ASSERT(ct != NULL);

	while (MDI_PI(pip)->pi_ref_cnt != 0) {
		/*
		 * Give a chance for pending I/Os to complete.
		 */
		MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
		    "!%d cmds still pending on path %s %p",
		    MDI_PI(pip)->pi_ref_cnt, mdi_pi_spathname(pip),
		    (void *)pip));
		if (cv_reltimedwait(&MDI_PI(pip)->pi_ref_cv,
		    &MDI_PI(pip)->pi_mutex, drv_usectohz(60 * 1000000),
		    TR_CLOCK_TICK) == -1) {
			/*
			 * The timeout time reached without ref_cnt being zero
			 * being signaled.
			 */
			MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
			    "!Timeout reached on path %s %p without the cond",
			    mdi_pi_spathname(pip), (void *)pip));
			MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
			    "!%d cmds still pending on path %s %p",
			    MDI_PI(pip)->pi_ref_cnt,
			    mdi_pi_spathname(pip), (void *)pip));
		}
	}
	vh = ct->ct_vhci;
	vdip = vh->vh_dip;

	/*
	 * Notify vHCI that has registered this event
	 */
	ASSERT(vh->vh_ops);
	f = vh->vh_ops->vo_pi_state_change;

	if (f != NULL) {
		/* The callback is invoked without the pathinfo lock held */
		MDI_PI_UNLOCK(pip);
		if ((rv = (*f)(vdip, pip, MDI_PATHINFO_STATE_OFFLINE, 0,
		    flags)) != MDI_SUCCESS) {
			MDI_DEBUG(1, (MDI_WARN, ct->ct_dip,
			    "!vo_path_offline failed: vdip %s%d %p: path %s %p",
			    ddi_driver_name(vdip), ddi_get_instance(vdip),
			    (void *)vdip, mdi_pi_spathname(pip), (void *)pip));
		}
		MDI_PI_LOCK(pip);
	}

	/*
	 * Set the mdi_pathinfo node state and clear the transient condition
	 */
	MDI_PI_SET_OFFLINE(pip);
	cv_broadcast(&MDI_PI(pip)->pi_state_cv);
	MDI_PI_UNLOCK(pip);

	MDI_CLIENT_LOCK(ct);
	if (rv == MDI_SUCCESS) {
		if (ct->ct_unstable == 0) {
			dev_info_t	*cdip = ct->ct_dip;

			/*
			 * Offlining the mdi_pathinfo node will impact the
			 * client state.  Update the client and dev_info node
			 * state accordingly.
			 */
			i_mdi_client_update_state(ct);
			rv = NDI_SUCCESS;
			if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_FAILED) {
				if (cdip &&
				    (i_ddi_node_state(cdip) >=
				    DS_INITIALIZED)) {
					/*
					 * The client lock is dropped across
					 * the NDI offline call.
					 */
					MDI_CLIENT_UNLOCK(ct);
					rv = ndi_devi_offline(cdip,
					    NDI_DEVFS_CLEAN);
					MDI_CLIENT_LOCK(ct);
					if (rv != NDI_SUCCESS) {
						/*
						 * ndi_devi_offline failed.
						 * Reset client flags to
						 * online.
						 */
						MDI_DEBUG(4, (MDI_WARN, cdip,
						    "ndi_devi_offline failed: "
						    "error %x", rv));
						MDI_CLIENT_SET_ONLINE(ct);
					}
				}
			}
			/*
			 * Convert to MDI error code
			 */
			switch (rv) {
			case NDI_SUCCESS:
				rv = MDI_SUCCESS;
				break;
			case NDI_BUSY:
				rv = MDI_BUSY;
				break;
			default:
				rv = MDI_FAILURE;
				break;
			}
		}
		MDI_CLIENT_SET_REPORT_DEV_NEEDED(ct);
		i_mdi_report_path_state(ct, pip);
	}

	MDI_CLIENT_UNLOCK(ct);

	/*
	 * Change in the mdi_pathinfo node state will impact the client state
	 */
	MDI_DEBUG(2, (MDI_NOTE, ct->ct_dip,
	    "ct = %p pip = %p", (void *)ct, (void *)pip));
	return (rv);
}

/*
 * mdi_pi_get_node_name():
 *		Get the name associated with a mdi_pathinfo node.
 *		Pathinfo nodes are not directly named, so the node name
 *		of the client's dev_info node is returned instead.
 *
 * Return Values:
 *		char *	(NULL when pip, its client, or the client's
 *			dev_info node is NULL)
 */
char *
mdi_pi_get_node_name(mdi_pathinfo_t *pip)
{
	mdi_client_t	*client;

	client = (pip != NULL) ? MDI_PI(pip)->pi_client : NULL;
	if ((client != NULL) && (client->ct_dip != NULL))
		return (ddi_node_name(client->ct_dip));

	return (NULL);
}

/*
 * mdi_pi_get_addr():
 *		Return the unit address (pi_addr) stored in a mdi_pathinfo
 *		node.
 *
 * Return Values:
 *		char *	(NULL when pip is NULL)
 */
char *
mdi_pi_get_addr(mdi_pathinfo_t *pip)
{
	return ((pip != NULL) ? MDI_PI(pip)->pi_addr : NULL);
}

/*
 * mdi_pi_get_path_instance():
 *		Return the 'path_instance' (pi_path_instance) of a
 *		mdi_pathinfo node.
 *
 * Return Values:
 *		path_instance, or 0 when pip is NULL
 */
int
mdi_pi_get_path_instance(mdi_pathinfo_t *pip)
{
	return ((pip != NULL) ? MDI_PI(pip)->pi_path_instance : 0);
}

/*
 * mdi_pi_pathname():
 *		Return pointer to path to pathinfo node, looked up by the
 *		node's path instance.  Returns NULL when pip is NULL.
 */
char *
mdi_pi_pathname(mdi_pathinfo_t *pip)
{
	int	path_instance;

	if (pip == NULL)
		return (NULL);

	path_instance = mdi_pi_get_path_instance(pip);
	return (mdi_pi_pathname_by_instance(path_instance));
}

/*
 * mdi_pi_spathname():
 *		Return pointer to shortpath to pathinfo node.  This is used
 *		in debug messages, so an empty string (never NULL) is handed
 *		back when pip is NULL or the shortpath is unknown.
 */
char *
mdi_pi_spathname(mdi_pathinfo_t *pip)
{
	char	*name;

	if (pip == NULL)
		return ("");

	name = mdi_pi_spathname_by_instance(mdi_pi_get_path_instance(pip));
	return ((name != NULL) ? name : "");
}

/*
 * mdi_pi_pathname_obp():
 *		Copy the "obp-path" string property of a mdi_pathinfo node
 *		into the caller supplied buffer 'path'.  No length check is
 *		done here; the caller must supply a buffer large enough for
 *		the property value.
 *
 * Return Values:
 *		path on success; NULL when pip or path is NULL or the
 *		property lookup does not return MDI_SUCCESS.
 */
char *
mdi_pi_pathname_obp(mdi_pathinfo_t *pip, char *path)
{
	char	*prop_val = NULL;

	if (pip == NULL || path == NULL)
		return (NULL);

	if (mdi_prop_lookup_string(pip, "obp-path", &prop_val) != MDI_SUCCESS)
		return (NULL);

	(void) strcpy(path, prop_val);
	(void) mdi_prop_free(prop_val);
	return (path);
}

/*
 * mdi_pi_pathname_obp_set():
 *		Build the "obp-path" string property for a mdi_pathinfo node
 *		from the path of its pHCI dev_info node, optionally followed
 *		by "/" and 'component', and store it on the node.
 *
 *		The path is taken from ddi_pathname_obp(); if that returns
 *		NULL, ddi_pathname() is used instead.  Text that does not fit
 *		in the MAXPATHLEN scratch buffer is truncated rather than
 *		written past the end of the buffer.
 *
 * Return Values:
 *		MDI_FAILURE when pip is NULL or it has no pHCI dev_info node,
 *		otherwise the result of mdi_prop_update_string().
 */
int
mdi_pi_pathname_obp_set(mdi_pathinfo_t *pip, char *component)
{
	dev_info_t *pdip;
	char *obp_path = NULL;
	int rc = MDI_FAILURE;

	if (pip == NULL)
		return (MDI_FAILURE);

	pdip = mdi_pi_get_phci(pip);
	if (pdip == NULL)
		return (MDI_FAILURE);

	obp_path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);

	if (ddi_pathname_obp(pdip, obp_path) == NULL) {
		(void) ddi_pathname(pdip, obp_path);
	}

	if (component) {
		/*
		 * The third argument of strncat() limits the number of
		 * characters appended, not the size of the destination, so
		 * it has to be the space that is still free (minus the
		 * terminating NUL).
		 */
		(void) strncat(obp_path, "/",
		    MAXPATHLEN - strlen(obp_path) - 1);
		(void) strncat(obp_path, component,
		    MAXPATHLEN - strlen(obp_path) - 1);
	}
	rc = mdi_prop_update_string(pip, "obp-path", obp_path);

	if (obp_path)
		kmem_free(obp_path, MAXPATHLEN);
	return (rc);
}

/*
 * mdi_pi_get_client():
 *		Get the client devinfo associated with a mdi_pathinfo node
 *
 * Return Values:
 *		Handle to client device dev_info node, or NULL when pip is
 *		NULL or the node has no client attached (the same guard
 *		mdi_pi_get_node_name() applies).
 */
dev_info_t *
mdi_pi_get_client(mdi_pathinfo_t *pip)
{
	dev_info_t	*dip = NULL;

	if ((pip != NULL) && (MDI_PI(pip)->pi_client != NULL)) {
		dip = MDI_PI(pip)->pi_client->ct_dip;
	}
	return (dip);
}

/*
 * mdi_pi_get_phci():
 *		Get the pHCI devinfo associated with the mdi_pathinfo node
 *
 * Return Values:
 *		Handle to dev_info node, or NULL when pip is NULL or the
 *		node has no pHCI attached.
 */
dev_info_t *
mdi_pi_get_phci(mdi_pathinfo_t *pip)
{
	mdi_phci_t	*phci;

	if (pip == NULL)
		return (NULL);

	phci = MDI_PI(pip)->pi_phci;
	return ((phci != NULL) ? phci->ph_dip : NULL);
}

/*
 * mdi_pi_get_client_private():
 *		Return the client private pointer (pi_cprivate) stored in the
 *		mdi_pathinfo node, or NULL when pip is NULL.
 */
void *
mdi_pi_get_client_private(mdi_pathinfo_t *pip)
{
	if (pip == NULL)
		return (NULL);

	return (MDI_PI(pip)->pi_cprivate);
}

/*
 * mdi_pi_set_client_private():
 *		Store the client private pointer (pi_cprivate) in the
 *		mdi_pathinfo node.  A NULL pip is silently ignored.
 */
void
mdi_pi_set_client_private(mdi_pathinfo_t *pip, void *priv)
{
	if (pip == NULL)
		return;

	MDI_PI(pip)->pi_cprivate = priv;
}

/*
 * mdi_pi_get_phci_private():
 *		Return the pHCI private pointer (pi_pprivate) stored in the
 *		mdi_pathinfo node, or NULL when pip is NULL.
 */
caddr_t
mdi_pi_get_phci_private(mdi_pathinfo_t *pip)
{
	if (pip == NULL)
		return (NULL);

	return (MDI_PI(pip)->pi_pprivate);
}

/*
 * mdi_pi_set_phci_private():
 *		Store the pHCI private pointer (pi_pprivate) in the
 *		mdi_pathinfo node.  A NULL pip is silently ignored.
 */
void
mdi_pi_set_phci_private(mdi_pathinfo_t *pip, caddr_t priv)
{
	if (pip == NULL)
		return;

	MDI_PI(pip)->pi_pprivate = priv;
}

/*
 * mdi_pi_get_state():
 *		Get the mdi_pathinfo node state.  Transient states are
 *		internal and are not exposed: while a transition is in
 *		progress the last good (old) state is reported.  A NULL pip
 *		yields MDI_PATHINFO_STATE_INIT.
 */
mdi_pathinfo_state_t
mdi_pi_get_state(mdi_pathinfo_t *pip)
{
	mdi_pathinfo_state_t    cur = MDI_PATHINFO_STATE_INIT;

	if (pip == NULL)
		return (cur);

	if (MDI_PI_IS_TRANSIENT(pip))
		cur = MDI_PI_OLD_STATE(pip);	/* mid-transition */
	else
		cur = MDI_PI_STATE(pip);

	return (cur);
}

/*
 * mdi_pi_get_flags():
 *		Return the pi_flags word of the mdi_pathinfo node, or 0 when
 *		pip is NULL.
 */
uint_t
mdi_pi_get_flags(mdi_pathinfo_t *pip)
{
	if (pip == NULL)
		return (0);

	return (MDI_PI(pip)->pi_flags);
}

/*
 * Note that the following function needs to be the new interface for
 * mdi_pi_get_state when mpxio gets integrated to ON.
 *
 * mdi_pi_get_state2():
 *		Report both the state and the extended state of a
 *		mdi_pathinfo node.  While the node is in a transient state
 *		the last good (old) values are reported.  For a NULL pip,
 *		*state is MDI_PATHINFO_STATE_INIT and *ext_state is 0, so
 *		both outputs are always defined on return.
 *
 * Return Values:
 *		MDI_SUCCESS (always)
 */
int
mdi_pi_get_state2(mdi_pathinfo_t *pip, mdi_pathinfo_state_t *state,
		uint32_t *ext_state)
{
	*state = MDI_PATHINFO_STATE_INIT;
	*ext_state = 0;

	if (pip) {
		if (MDI_PI_IS_TRANSIENT(pip)) {
			/*
			 * mdi_pathinfo is in state transition.  Return the
			 * last good state.
			 */
			*state = MDI_PI_OLD_STATE(pip);
			*ext_state = MDI_PI_OLD_EXT_STATE(pip);
		} else {
			*state = MDI_PI_STATE(pip);
			*ext_state = MDI_PI_EXT_STATE(pip);
		}
	}
	return (MDI_SUCCESS);
}

/*
 * mdi_pi_get_preferred:
 *	Return the preferred path flag (pi_preferred) of the node, or 0 when
 *	pip is NULL.
 */
int
mdi_pi_get_preferred(mdi_pathinfo_t *pip)
{
	return ((pip != NULL) ? MDI_PI(pip)->pi_preferred : 0);
}

/*
 * mdi_pi_set_preferred:
 *	Store the preferred path flag (pi_preferred) in the node.  A NULL pip
 *	is silently ignored.
 */
void
mdi_pi_set_preferred(mdi_pathinfo_t *pip, int preferred)
{
	if (pip == NULL)
		return;

	MDI_PI(pip)->pi_preferred = preferred;
}

/*
 * mdi_pi_set_state():
 *		Set the mdi_pathinfo node state.  The bits covered by
 *		MDI_PATHINFO_EXT_STATE_MASK are carried over from the current
 *		pi_state; everything else is replaced by 'state'.  A NULL pip
 *		is silently ignored.
 */
void
mdi_pi_set_state(mdi_pathinfo_t *pip, mdi_pathinfo_state_t state)
{
	uint32_t	kept_ext;

	if (pip == NULL)
		return;

	kept_ext = MDI_PI(pip)->pi_state & MDI_PATHINFO_EXT_STATE_MASK;
	MDI_PI(pip)->pi_state = state;
	MDI_PI(pip)->pi_state |= kept_ext;

	/* Path has changed state, invalidate DINFOCACHE snap shot. */
	i_ddi_di_cache_invalidate();
}

/*
 * Property functions:
 */
/*
 * i_map_nvlist_error_to_mdi():
 *		Translate the errno style return value of an nvlist routine
 *		into a DDI_PROP_* code:
 *		    0			-> DDI_PROP_SUCCESS
 *		    EINVAL, ENOTSUP	-> DDI_PROP_INVAL_ARG
 *		    ENOMEM		-> DDI_PROP_NO_MEMORY
 *		    anything else	-> DDI_PROP_NOT_FOUND
 */
int
i_map_nvlist_error_to_mdi(int val)
{
	if (val == 0)
		return (DDI_PROP_SUCCESS);

	if ((val == EINVAL) || (val == ENOTSUP))
		return (DDI_PROP_INVAL_ARG);

	if (val == ENOMEM)
		return (DDI_PROP_NO_MEMORY);

	return (DDI_PROP_NOT_FOUND);
}

/*
 * mdi_pi_get_next_prop():
 *		Property walk function: returns the nvpair that follows
 *		'prev' in the node's property list.  The caller should hold
 *		mdi_pi_lock() for the whole walk and release it with
 *		mdi_pi_unlock() afterwards to get a consistent view; this is
 *		asserted once the NULL checks have passed.
 *
 * Return Values:
 *		Next nvpair, or NULL when pip is NULL or it has no property
 *		list.
 */
nvpair_t *
mdi_pi_get_next_prop(mdi_pathinfo_t *pip, nvpair_t *prev)
{
	if (pip == NULL)
		return (NULL);
	if (MDI_PI(pip)->pi_prop == NULL)
		return (NULL);

	ASSERT(MDI_PI_LOCKED(pip));
	return (nvlist_next_nvpair(MDI_PI(pip)->pi_prop, prev));
}

/*
 * mdi_prop_remove():
 *		Remove the named property from the node's property list.
 *		When 'name' is NULL every property on the list is removed.
 *		The pathinfo lock is taken here, so the caller must not
 *		already hold it.
 *
 * Return Values:
 *		DDI_PROP_NOT_FOUND	pip is NULL or has no property list
 *		DDI_PROP_SUCCESS	otherwise
 */
int
mdi_prop_remove(mdi_pathinfo_t *pip, char *name)
{
	if (pip == NULL)
		return (DDI_PROP_NOT_FOUND);

	ASSERT(!MDI_PI_LOCKED(pip));
	MDI_PI_LOCK(pip);
	if (MDI_PI(pip)->pi_prop == NULL) {
		MDI_PI_UNLOCK(pip);
		return (DDI_PROP_NOT_FOUND);
	}

	if (name != NULL) {
		(void) nvlist_remove_all(MDI_PI(pip)->pi_prop, name);
	} else {
		char		namebuf[MAXNAMELEN];
		nvpair_t	*cur;
		nvpair_t	*succ;

		/*
		 * Fetch the successor before removing the current pair, and
		 * remove by a private copy of the name.
		 */
		for (cur = nvlist_next_nvpair(MDI_PI(pip)->pi_prop, NULL);
		    cur != NULL; cur = succ) {
			succ = nvlist_next_nvpair(MDI_PI(pip)->pi_prop, cur);
			(void) snprintf(namebuf, sizeof (namebuf), "%s",
			    nvpair_name(cur));
			(void) nvlist_remove_all(MDI_PI(pip)->pi_prop,
			    namebuf);
		}
	}
	MDI_PI_UNLOCK(pip);
	return (DDI_PROP_SUCCESS);
}

/*
 * mdi_prop_size():
 * 		Get buffer size needed to pack the property data
 *		(NV_ENCODE_NATIVE encoding).
 * 		Caller should hold the mdi_pathinfo_t lock to get a consistent
 *		buffer size.
 *
 *		*buflenp is always defined on return: it is 0 when pip is
 *		NULL, when there is no property list, or when nvlist_size()
 *		fails without reporting a size.
 *
 * Return Values:
 *		DDI_PROP_NOT_FOUND	pip is NULL or has no property list
 *		otherwise the nvlist_size() result mapped through
 *		i_map_nvlist_error_to_mdi()
 */
int
mdi_prop_size(mdi_pathinfo_t *pip, size_t *buflenp)
{
	int	rv;
	size_t	bufsize = 0;	/* stays 0 if nvlist_size() does not set it */

	*buflenp = 0;
	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
		return (DDI_PROP_NOT_FOUND);
	}
	ASSERT(MDI_PI_LOCKED(pip));
	rv = nvlist_size(MDI_PI(pip)->pi_prop,
	    &bufsize, NV_ENCODE_NATIVE);
	*buflenp = bufsize;
	return (i_map_nvlist_error_to_mdi(rv));
}

/*
 * mdi_prop_pack():
 * 		Pack the node's property list (NV_ENCODE_NATIVE encoding)
 *		into *bufp, with buflen giving the buffer length passed to
 *		nvlist_pack().  The caller should hold the mdi_pathinfo_t
 *		lock to get consistent data; this is asserted.
 *
 * Return Values:
 *		DDI_PROP_NOT_FOUND	pip is NULL or has no property list
 *		otherwise the nvlist_pack() result mapped through
 *		i_map_nvlist_error_to_mdi()
 */
int
mdi_prop_pack(mdi_pathinfo_t *pip, char **bufp, uint_t buflen)
{
	size_t	packlen = buflen;
	int	nverr;

	if (pip == NULL)
		return (DDI_PROP_NOT_FOUND);
	if (MDI_PI(pip)->pi_prop == NULL)
		return (DDI_PROP_NOT_FOUND);

	ASSERT(MDI_PI_LOCKED(pip));

	nverr = nvlist_pack(MDI_PI(pip)->pi_prop, bufp, &packlen,
	    NV_ENCODE_NATIVE, KM_SLEEP);
	return (i_map_nvlist_error_to_mdi(nverr));
}

/*
 * mdi_prop_update_byte():
 *		Create/Update a byte property on the node's property list.
 *		Takes the pathinfo lock; the caller must not hold it.
 *
 * Return Values:
 *		DDI_PROP_INVAL_ARG	pip is NULL
 *		DDI_PROP_NOT_FOUND	node has no property list
 *		otherwise the nvlist_add_byte() result mapped through
 *		i_map_nvlist_error_to_mdi()
 */
int
mdi_prop_update_byte(mdi_pathinfo_t *pip, char *name, uchar_t data)
{
	int	nverr;
	int	have_list;

	if (pip == NULL)
		return (DDI_PROP_INVAL_ARG);

	ASSERT(!MDI_PI_LOCKED(pip));
	MDI_PI_LOCK(pip);
	have_list = (MDI_PI(pip)->pi_prop != NULL);
	if (have_list)
		nverr = nvlist_add_byte(MDI_PI(pip)->pi_prop, name, data);
	MDI_PI_UNLOCK(pip);

	if (!have_list)
		return (DDI_PROP_NOT_FOUND);
	return (i_map_nvlist_error_to_mdi(nverr));
}

/*
 * mdi_prop_update_byte_array():
 *		Create/Update a byte array property of 'nelements' entries on
 *		the node's property list.  Takes the pathinfo lock; the
 *		caller must not hold it.
 *
 * Return Values:
 *		DDI_PROP_INVAL_ARG	pip is NULL
 *		DDI_PROP_NOT_FOUND	node has no property list
 *		otherwise the nvlist_add_byte_array() result mapped through
 *		i_map_nvlist_error_to_mdi()
 */
int
mdi_prop_update_byte_array(mdi_pathinfo_t *pip, char *name, uchar_t *data,
    uint_t nelements)
{
	int	nverr;
	int	have_list;

	if (pip == NULL)
		return (DDI_PROP_INVAL_ARG);

	ASSERT(!MDI_PI_LOCKED(pip));
	MDI_PI_LOCK(pip);
	have_list = (MDI_PI(pip)->pi_prop != NULL);
	if (have_list) {
		nverr = nvlist_add_byte_array(MDI_PI(pip)->pi_prop, name,
		    data, nelements);
	}
	MDI_PI_UNLOCK(pip);

	if (!have_list)
		return (DDI_PROP_NOT_FOUND);
	return (i_map_nvlist_error_to_mdi(nverr));
}

/*
 * mdi_prop_update_int():
 *		Create/Update a 32 bit integer property on the node's
 *		property list.  Takes the pathinfo lock; the caller must not
 *		hold it.
 *
 * Return Values:
 *		DDI_PROP_INVAL_ARG	pip is NULL
 *		DDI_PROP_NOT_FOUND	node has no property list
 *		otherwise the nvlist_add_int32() result mapped through
 *		i_map_nvlist_error_to_mdi()
 */
int
mdi_prop_update_int(mdi_pathinfo_t *pip, char *name, int data)
{
	int	nverr;
	int	have_list;

	if (pip == NULL)
		return (DDI_PROP_INVAL_ARG);

	ASSERT(!MDI_PI_LOCKED(pip));
	MDI_PI_LOCK(pip);
	have_list = (MDI_PI(pip)->pi_prop != NULL);
	if (have_list) {
		nverr = nvlist_add_int32(MDI_PI(pip)->pi_prop, name,
		    (int32_t)data);
	}
	MDI_PI_UNLOCK(pip);

	if (!have_list)
		return (DDI_PROP_NOT_FOUND);
	return (i_map_nvlist_error_to_mdi(nverr));
}

/*
 * mdi_prop_update_int64():
 *		Create/Update a 64 bit integer property on the node's
 *		property list.  Takes the pathinfo lock; the caller must not
 *		hold it.
 *
 * Return Values:
 *		DDI_PROP_INVAL_ARG	pip is NULL
 *		DDI_PROP_NOT_FOUND	node has no property list
 *		otherwise the nvlist_add_int64() result mapped through
 *		i_map_nvlist_error_to_mdi()
 */
int
mdi_prop_update_int64(mdi_pathinfo_t *pip, char *name, int64_t data)
{
	int	nverr;
	int	have_list;

	if (pip == NULL)
		return (DDI_PROP_INVAL_ARG);

	ASSERT(!MDI_PI_LOCKED(pip));
	MDI_PI_LOCK(pip);
	have_list = (MDI_PI(pip)->pi_prop != NULL);
	if (have_list)
		nverr = nvlist_add_int64(MDI_PI(pip)->pi_prop, name, data);
	MDI_PI_UNLOCK(pip);

	if (!have_list)
		return (DDI_PROP_NOT_FOUND);
	return (i_map_nvlist_error_to_mdi(nverr));
}

/*
 * mdi_prop_update_int_array():
 *		Create/Update an int array property of 'nelements' entries
 *		(stored as an int32 array) on the node's property list.
 *		Takes the pathinfo lock; the caller must not hold it.
 *
 * Return Values:
 *		DDI_PROP_INVAL_ARG	pip is NULL
 *		DDI_PROP_NOT_FOUND	node has no property list
 *		otherwise the nvlist_add_int32_array() result mapped through
 *		i_map_nvlist_error_to_mdi()
 */
int
mdi_prop_update_int_array(mdi_pathinfo_t *pip, char *name, int *data,
	    uint_t nelements)
{
	int	nverr;
	int	have_list;

	if (pip == NULL)
		return (DDI_PROP_INVAL_ARG);

	ASSERT(!MDI_PI_LOCKED(pip));
	MDI_PI_LOCK(pip);
	have_list = (MDI_PI(pip)->pi_prop != NULL);
	if (have_list) {
		nverr = nvlist_add_int32_array(MDI_PI(pip)->pi_prop, name,
		    (int32_t *)data, nelements);
	}
	MDI_PI_UNLOCK(pip);

	if (!have_list)
		return (DDI_PROP_NOT_FOUND);
	return (i_map_nvlist_error_to_mdi(nverr));
}

/*
 * mdi_prop_update_string():
 *		Create/Update a string property on the node's property list.
 *		Takes the pathinfo lock; the caller must not hold it.
 *
 * Return Values:
 *		DDI_PROP_INVAL_ARG	pip is NULL
 *		DDI_PROP_NOT_FOUND	node has no property list
 *		otherwise the nvlist_add_string() result mapped through
 *		i_map_nvlist_error_to_mdi()
 */
int
mdi_prop_update_string(mdi_pathinfo_t *pip, char *name, char *data)
{
	int	nverr;
	int	have_list;

	if (pip == NULL)
		return (DDI_PROP_INVAL_ARG);

	ASSERT(!MDI_PI_LOCKED(pip));
	MDI_PI_LOCK(pip);
	have_list = (MDI_PI(pip)->pi_prop != NULL);
	if (have_list)
		nverr = nvlist_add_string(MDI_PI(pip)->pi_prop, name, data);
	MDI_PI_UNLOCK(pip);

	if (!have_list)
		return (DDI_PROP_NOT_FOUND);
	return (i_map_nvlist_error_to_mdi(nverr));
}

/*
 * mdi_prop_update_string_array():
 *		Create/Update a string array property of 'nelements' entries
 *		on the node's property list.  Takes the pathinfo lock; the
 *		caller must not hold it.
 *
 * Return Values:
 *		DDI_PROP_INVAL_ARG	pip is NULL
 *		DDI_PROP_NOT_FOUND	node has no property list
 *		otherwise the nvlist_add_string_array() result mapped through
 *		i_map_nvlist_error_to_mdi()
 */
int
mdi_prop_update_string_array(mdi_pathinfo_t *pip, char *name, char **data,
    uint_t nelements)
{
	int	nverr;
	int	have_list;

	if (pip == NULL)
		return (DDI_PROP_INVAL_ARG);

	ASSERT(!MDI_PI_LOCKED(pip));
	MDI_PI_LOCK(pip);
	have_list = (MDI_PI(pip)->pi_prop != NULL);
	if (have_list) {
		nverr = nvlist_add_string_array(MDI_PI(pip)->pi_prop, name,
		    data, nelements);
	}
	MDI_PI_UNLOCK(pip);

	if (!have_list)
		return (DDI_PROP_NOT_FOUND);
	return (i_map_nvlist_error_to_mdi(nverr));
}

/*
 * mdi_prop_lookup_byte():
 * 		Look for the byte property identified by name on the node's
 *		property list and hand the result back through 'data'.  The
 *		pathinfo lock is not taken here.
 *
 * Return Values:
 *		DDI_PROP_NOT_FOUND	pip is NULL or has no property list
 *		otherwise the nvlist_lookup_byte() result mapped through
 *		i_map_nvlist_error_to_mdi()
 */
int
mdi_prop_lookup_byte(mdi_pathinfo_t *pip, char *name, uchar_t *data)
{
	if (pip == NULL)
		return (DDI_PROP_NOT_FOUND);
	if (MDI_PI(pip)->pi_prop == NULL)
		return (DDI_PROP_NOT_FOUND);

	return (i_map_nvlist_error_to_mdi(
	    nvlist_lookup_byte(MDI_PI(pip)->pi_prop, name, data)));
}


/*
 * mdi_prop_lookup_byte_array():
 * 		Look for the byte array property identified by name.  The
 *		data returned is the actual property (not a copy) and is
 *		valid as long as the mdi_pathinfo_t node is alive.  The
 *		pathinfo lock is not taken here.
 *
 * Return Values:
 *		DDI_PROP_NOT_FOUND	pip is NULL or has no property list
 *		otherwise the nvlist_lookup_byte_array() result mapped
 *		through i_map_nvlist_error_to_mdi()
 */
int
mdi_prop_lookup_byte_array(mdi_pathinfo_t *pip, char *name, uchar_t **data,
    uint_t *nelements)
{
	if (pip == NULL)
		return (DDI_PROP_NOT_FOUND);
	if (MDI_PI(pip)->pi_prop == NULL)
		return (DDI_PROP_NOT_FOUND);

	return (i_map_nvlist_error_to_mdi(
	    nvlist_lookup_byte_array(MDI_PI(pip)->pi_prop, name, data,
	    nelements)));
}

/*
 * mdi_prop_lookup_int():
 * 		Look for the int (32 bit) property identified by name on the
 *		node's property list and hand the result back through 'data'.
 *		The pathinfo lock is not taken here.
 *
 * Return Values:
 *		DDI_PROP_NOT_FOUND	pip is NULL or has no property list
 *		otherwise the nvlist_lookup_int32() result mapped through
 *		i_map_nvlist_error_to_mdi()
 */
int
mdi_prop_lookup_int(mdi_pathinfo_t *pip, char *name, int *data)
{
	if (pip == NULL)
		return (DDI_PROP_NOT_FOUND);
	if (MDI_PI(pip)->pi_prop == NULL)
		return (DDI_PROP_NOT_FOUND);

	return (i_map_nvlist_error_to_mdi(
	    nvlist_lookup_int32(MDI_PI(pip)->pi_prop, name,
	    (int32_t *)data)));
}

/*
 * mdi_prop_lookup_int64():
 * 		Look for the int64 property identified by name on the node's
 *		property list and hand the result back through 'data'.  The
 *		pathinfo lock is not taken here.
 *
 * Return Values:
 *		DDI_PROP_NOT_FOUND	pip is NULL or has no property list
 *		otherwise the nvlist_lookup_int64() result mapped through
 *		i_map_nvlist_error_to_mdi()
 */
int
mdi_prop_lookup_int64(mdi_pathinfo_t *pip, char *name, int64_t *data)
{
	if (pip == NULL)
		return (DDI_PROP_NOT_FOUND);
	if (MDI_PI(pip)->pi_prop == NULL)
		return (DDI_PROP_NOT_FOUND);

	return (i_map_nvlist_error_to_mdi(
	    nvlist_lookup_int64(MDI_PI(pip)->pi_prop, name, data)));
}

/*
 * mdi_prop_lookup_int_array():
 * 		Look for the int (32 bit) array property identified by name.
 *		The data returned is the actual property (not a copy) and is
 *		valid as long as the mdi_pathinfo_t node is alive.  The
 *		pathinfo lock is not taken here.
 *
 * Return Values:
 *		DDI_PROP_NOT_FOUND	pip is NULL or has no property list
 *		otherwise the nvlist_lookup_int32_array() result mapped
 *		through i_map_nvlist_error_to_mdi()
 */
int
mdi_prop_lookup_int_array(mdi_pathinfo_t *pip, char *name, int **data,
    uint_t *nelements)
{
	if (pip == NULL)
		return (DDI_PROP_NOT_FOUND);
	if (MDI_PI(pip)->pi_prop == NULL)
		return (DDI_PROP_NOT_FOUND);

	return (i_map_nvlist_error_to_mdi(
	    nvlist_lookup_int32_array(MDI_PI(pip)->pi_prop, name,
	    (int32_t **)data, nelements)));
}

/*
 * mdi_prop_lookup_string():
 * 		Look for the string property identified by name.  The data
 *		returned is the actual property (not a copy) and is valid as
 *		long as the mdi_pathinfo_t node is alive.  The pathinfo lock
 *		is not taken here.
 *
 * Return Values:
 *		DDI_PROP_NOT_FOUND	pip is NULL or has no property list
 *		otherwise the nvlist_lookup_string() result mapped through
 *		i_map_nvlist_error_to_mdi()
 */
int
mdi_prop_lookup_string(mdi_pathinfo_t *pip, char *name, char **data)
{
	if (pip == NULL)
		return (DDI_PROP_NOT_FOUND);
	if (MDI_PI(pip)->pi_prop == NULL)
		return (DDI_PROP_NOT_FOUND);

	return (i_map_nvlist_error_to_mdi(
	    nvlist_lookup_string(MDI_PI(pip)->pi_prop, name, data)));
}

/*
 * mdi_prop_lookup_string_array():
 * 		Look for the string array property identified by name.  The
 *		data returned is the actual property (not a copy) and is
 *		valid as long as the mdi_pathinfo_t node is alive.  The
 *		pathinfo lock is not taken here.
 *
 * Return Values:
 *		DDI_PROP_NOT_FOUND	pip is NULL or has no property list
 *		otherwise the nvlist_lookup_string_array() result mapped
 *		through i_map_nvlist_error_to_mdi()
 */
int
mdi_prop_lookup_string_array(mdi_pathinfo_t *pip, char *name, char ***data,
    uint_t *nelements)
{
	if (pip == NULL)
		return (DDI_PROP_NOT_FOUND);
	if (MDI_PI(pip)->pi_prop == NULL)
		return (DDI_PROP_NOT_FOUND);

	return (i_map_nvlist_error_to_mdi(
	    nvlist_lookup_string_array(MDI_PI(pip)->pi_prop, name, data,
	    nelements)));
}

/*
 * mdi_prop_free():
 * 		Counterpart of ddi_prop_free() for the mdi_prop_lookup_xx()
 *		routines.  The underlying nvlist_lookup_xx() functions hand
 *		back a pointer to the actual property data rather than a
 *		copy, so the data stays valid for as long as the
 *		mdi_pathinfo_t node is valid and there is nothing to release
 *		here: the argument is ignored.
 *
 * Return Values:
 *		DDI_PROP_SUCCESS (always)
 */
/*ARGSUSED*/
int
mdi_prop_free(void *data)
{
	/* Nothing was allocated by the lookup, so nothing is freed. */
	return (DDI_PROP_SUCCESS);
}

/*
 * i_mdi_report_path_state():
 *		Emit one "multipath status" message describing the client
 *		state and the state of path 'pip'.  The load balancing policy
 *		is appended only when the client is optimal and the path is
 *		online.  Nothing is reported unless the client has a dev_info
 *		node with a valid instance and its "report dev needed" flag is
 *		set; the flag is cleared once the message has been issued.
 *
 *		The caller must hold the client lock.
 */
/*ARGSUSED*/
static void
i_mdi_report_path_state(mdi_client_t *ct, mdi_pathinfo_t *pip)
{
	dev_info_t	*cdip = ct->ct_dip;
	char		*ct_path;
	char		*ct_status;
	char		*status;
	char		lb_buf[64];
	int		client_optimal = 0;
	int		path_online = 0;

	ASSERT(MDI_CLIENT_LOCKED(ct));
	if (cdip == NULL)
		return;
	if (ddi_get_instance(cdip) == -1)
		return;
	if (MDI_CLIENT_IS_REPORT_DEV_NEEDED(ct) == 0)
		return;

	/* Textual form of the client state */
	switch (MDI_CLIENT_STATE(ct)) {
	case MDI_CLIENT_STATE_OPTIMAL:
		ct_status = "optimal";
		client_optimal = 1;
		break;
	case MDI_CLIENT_STATE_DEGRADED:
		ct_status = "degraded";
		break;
	case MDI_CLIENT_STATE_FAILED:
		ct_status = "failed";
		break;
	default:
		ct_status = "unknown";
		break;
	}

	lb_buf[0] = 0;		/* not interested in load balancing config */

	/* Textual form of the path state; "removed" takes precedence */
	if (MDI_PI_FLAGS_IS_DEVICE_REMOVED(pip)) {
		status = "removed";
	} else if (MDI_PI_IS_OFFLINE(pip)) {
		status = "offline";
	} else if (MDI_PI_IS_ONLINE(pip)) {
		status = "online";
		path_online = 1;
	} else if (MDI_PI_IS_STANDBY(pip)) {
		status = "standby";
	} else if (MDI_PI_IS_FAULT(pip)) {
		status = "faulted";
	} else {
		status = "unknown";
	}

	ct_path = kmem_alloc(MAXPATHLEN, KM_SLEEP);

	/*
	 * NOTE: Keeping "multipath status: %s" and
	 * "Load balancing: %s" format unchanged in case someone
	 * scrubs /var/adm/messages looking for these messages.
	 */
	if (!(client_optimal && path_online)) {
		cmn_err(mdi_debug_consoleonly ? CE_NOTE : CE_CONT,
		    "?%s (%s%d) multipath status: %s: "
		    "path %d %s is %s\n",
		    ddi_pathname(cdip, ct_path), ddi_driver_name(cdip),
		    ddi_get_instance(cdip), ct_status,
		    mdi_pi_get_path_instance(pip),
		    mdi_pi_spathname(pip), status);
	} else {
		if (ct->ct_lb == LOAD_BALANCE_LBA) {
			(void) snprintf(lb_buf, sizeof (lb_buf),
			    "%s, region-size: %d", mdi_load_balance_lba,
			    ct->ct_lb_args->region_size);
		} else if (ct->ct_lb == LOAD_BALANCE_NONE) {
			(void) snprintf(lb_buf, sizeof (lb_buf),
			    "%s", mdi_load_balance_none);
		} else {
			(void) snprintf(lb_buf, sizeof (lb_buf), "%s",
			    mdi_load_balance_rr);
		}

		cmn_err(mdi_debug_consoleonly ? CE_NOTE : CE_CONT,
		    "?%s (%s%d) multipath status: %s: "
		    "path %d %s is %s: Load balancing: %s\n",
		    ddi_pathname(cdip, ct_path), ddi_driver_name(cdip),
		    ddi_get_instance(cdip), ct_status,
		    mdi_pi_get_path_instance(pip),
		    mdi_pi_spathname(pip), status, lb_buf);
	}

	kmem_free(ct_path, MAXPATHLEN);
	MDI_CLIENT_CLEAR_REPORT_DEV_NEEDED(ct);
}

#ifdef	DEBUG
/*
 * i_mdi_log():
 *		Utility function for error message management
 *
 *		The message is formatted into a local buffer, prefixed with
 *		"mdi: <driver><instance>: <func>: " and handed to cmn_err().
 *		A leading '!', '?' or '^' in the formatted message selects
 *		log-only, boot-only or console-only delivery and is moved to
 *		the front of the final cmn_err() format.
 *
 *		level	CE_* severity passed to cmn_err (CE_NOTE is emitted
 *			as CE_CONT unless mdi_debug_consoleonly is set)
 *		func	name of the calling function, printed in the prefix
 *		dip	dev_info node used for the "<driver><instance>: "
 *			prefix; may be NULL
 *		fmt	printf-style format followed by its arguments
 *
 *		NOTE: Implementation takes care of trailing \n for cmn_err,
 *		MDI_DEBUG should not terminate fmt strings with \n.
 *
 *		NOTE: If the level is >= 2, and there is no leading !?^
 *		then a leading ! is implied (but can be overriden via
 *		mdi_debug_consoleonly). If you are using kmdb on the console,
 *		consider setting mdi_debug_consoleonly to 1 as an aid.
 */
/*PRINTFLIKE4*/
static void
i_mdi_log(int level, const char *func, dev_info_t *dip, const char *fmt, ...)
{
	char		name[MAXNAMELEN];
	char		buf[512];
	char		*bp;
	va_list		ap;
	int		log_only = 0;
	int		boot_only = 0;
	int		console_only = 0;

	/* Build the "<driver><instance>: " prefix, empty without a dip */
	if (dip) {
		(void) snprintf(name, sizeof(name), "%s%d: ",
		    ddi_driver_name(dip), ddi_get_instance(dip));
	} else {
		name[0] = 0;
	}

	/* Format the caller's message; output longer than buf is truncated */
	va_start(ap, fmt);
	(void) vsnprintf(buf, sizeof(buf), fmt, ap);
	va_end(ap);

	/*
	 * Strip a leading destination character from the message and
	 * remember which destination was requested.
	 */
	switch (buf[0]) {
	case '!':
		bp = &buf[1];
		log_only = 1;
		break;
	case '?':
		bp = &buf[1];
		boot_only = 1;
		break;
	case '^':
		bp = &buf[1];
		console_only = 1;
		break;
	default:
		if (level >= 2)
			log_only = 1;		/* ! implied */
		bp = buf;
		break;
	}
	/* Global overrides; console-only wins when both are set */
	if (mdi_debug_logonly) {
		log_only = 1;
		boot_only = 0;
		console_only = 0;
	}
	if (mdi_debug_consoleonly) {
		log_only = 0;
		boot_only = 0;
		console_only = 1;
		level = CE_NOTE;
		/* Jump straight to the formats without a trailing \n */
		goto console;
	}

	switch (level) {
	case CE_NOTE:
		level = CE_CONT;
		/* FALLTHROUGH */
	case CE_CONT:
		/* These formats carry an explicit trailing newline */
		if (boot_only) {
			cmn_err(level, "?mdi: %s%s: %s\n", name, func, bp);
		} else if (console_only) {
			cmn_err(level, "^mdi: %s%s: %s\n", name, func, bp);
		} else if (log_only) {
			cmn_err(level, "!mdi: %s%s: %s\n", name, func, bp);
		} else {
			cmn_err(level, "mdi: %s%s: %s\n", name, func, bp);
		}
		break;

	case CE_WARN:
	case CE_PANIC:
	console:
		/* Also reached via goto with level forced to CE_NOTE */
		if (boot_only) {
			cmn_err(level, "?mdi: %s%s: %s", name, func, bp);
		} else if (console_only) {
			cmn_err(level, "^mdi: %s%s: %s", name, func, bp);
		} else if (log_only) {
			cmn_err(level, "!mdi: %s%s: %s", name, func, bp);
		} else {
			cmn_err(level, "mdi: %s%s: %s", name, func, bp);
		}
		break;
	default:
		/* Any other level: no destination prefix and no func name */
		cmn_err(level, "mdi: %s%s", name, bp);
		break;
	}
}
#endif	/* DEBUG */

/*
 * i_mdi_client_online():
 *		Client online notification.  Marks the client online,
 *		restores its binding to the dev_info node, powers up all
 *		pHCIs when the client currently has no power holds
 *		(ct_power_cnt is 0), and takes one PM hold on the client.
 */
void
i_mdi_client_online(dev_info_t *ct_dip)
{
	mdi_client_t	*client = i_devi_get_client(ct_dip);

	ASSERT(client != NULL);

	MDI_CLIENT_LOCK(client);
	MDI_CLIENT_SET_ONLINE(client);

	/* catch for any memory leaks */
	ASSERT((client->ct_dip == NULL) || (client->ct_dip == ct_dip));
	client->ct_dip = ct_dip;

	if (client->ct_power_cnt == 0) {
		(void) i_mdi_power_all_phci(client);
	}

	MDI_DEBUG(4, (MDI_NOTE, ct_dip,
	    "i_mdi_pm_hold_client %p", (void *)client));
	i_mdi_pm_hold_client(client, 1);

	MDI_CLIENT_UNLOCK(client);
}

/*
 * i_mdi_phci_online():
 *		pHCI online notification.  Marks the pHCI bound to ph_dip as
 *		online under the pHCI lock.
 */
void
i_mdi_phci_online(dev_info_t *ph_dip)
{
	mdi_phci_t	*phci = i_devi_get_phci(ph_dip);

	ASSERT(phci != NULL);

	MDI_PHCI_LOCK(phci);
	MDI_PHCI_SET_ONLINE(phci);
	MDI_PHCI_UNLOCK(phci);
}

/*
 * mdi_devi_online():
 * 		Online notification from NDI framework on pHCI/client
 *		device online.  A node may be both a pHCI and a client; the
 *		pHCI side is handled first.
 *
 * Return Values:
 *		NDI_SUCCESS (always)
 */
/*ARGSUSED*/
int
mdi_devi_online(dev_info_t *dip, uint_t flags)
{
	if (MDI_PHCI(dip))
		i_mdi_phci_online(dip);

	if (MDI_CLIENT(dip))
		i_mdi_client_online(dip);

	return (NDI_SUCCESS);
}

/*
 * mdi_devi_offline():
 * 		Offline notification from NDI framework on pHCI/Client device
 *		offline.  The client side is offlined first; if the node is
 *		also a pHCI and the pHCI offline then fails, the client is
 *		put back online.
 *
 * Return Values:
 *		NDI_SUCCESS
 *		otherwise the failing result of i_mdi_client_offline() or
 *		i_mdi_phci_offline()
 */
/*ARGSUSED*/
int
mdi_devi_offline(dev_info_t *dip, uint_t flags)
{
	int		rv;

	if (MDI_CLIENT(dip)) {
		rv = i_mdi_client_offline(dip, flags);
		if (rv != NDI_SUCCESS)
			return (rv);
	}

	if (!MDI_PHCI(dip))
		return (NDI_SUCCESS);

	rv = i_mdi_phci_offline(dip, flags);
	if ((rv != NDI_SUCCESS) && MDI_CLIENT(dip)) {
		/* set client back online */
		i_mdi_client_online(dip);
	}

	return (rv);
}

/*
 * i_mdi_phci_offline():
 *		pHCI component offline notification.
 *
 *		Phases:
 *		1. Bail out early if the pHCI is unknown, already offline, or
 *		   unstable.
 *		2. Walk every path of the pHCI.  Fail with NDI_BUSY if any
 *		   client has a failover in progress or is unstable.  For a
 *		   client whose computed state (i_mdi_client_compute_state())
 *		   is FAILED, offline the client's dev_info node.
 *		3. If such a client offline failed, walk the paths visited so
 *		   far and online/offline their client nodes according to the
 *		   current client state, then fail with NDI_BUSY.
 *		4. Otherwise mark the pHCI offline, flag all paths as
 *		   offlining, wait a little for pending commands, and offline
 *		   each path.  If a path does not end up offline, mark the
 *		   pHCI online again and fail with NDI_BUSY.
 *
 *		The pHCI lock is dropped around every ndi_devi_online() /
 *		ndi_devi_offline() call and around the delay.
 *
 * Return Values:
 *		NDI_SUCCESS
 *		NDI_BUSY
 */
/*ARGSUSED*/
static int
i_mdi_phci_offline(dev_info_t *dip, uint_t flags)
{
	int		rv = NDI_SUCCESS;
	mdi_phci_t	*ph;
	mdi_client_t	*ct;
	mdi_pathinfo_t	*pip;
	mdi_pathinfo_t	*next;
	mdi_pathinfo_t	*failed_pip = NULL;
	dev_info_t	*cdip;

	/*
	 * pHCI component offline notification
	 * Make sure that this pHCI instance is free to be offlined.
	 * If it is OK to proceed, Offline and remove all the child
	 * mdi_pathinfo nodes.  This process automatically offlines
	 * corresponding client devices, for which this pHCI provides
	 * critical services.
	 */
	ph = i_devi_get_phci(dip);
	MDI_DEBUG(2, (MDI_NOTE, dip,
	    "called %p %p", (void *)dip, (void *)ph));
	if (ph == NULL) {
		/* No pHCI bound to this node: nothing to do (NDI_SUCCESS) */
		return (rv);
	}

	MDI_PHCI_LOCK(ph);

	if (MDI_PHCI_IS_OFFLINE(ph)) {
		MDI_DEBUG(1, (MDI_WARN, dip,
		    "!pHCI already offlined: %p", (void *)dip));
		MDI_PHCI_UNLOCK(ph);
		return (NDI_SUCCESS);
	}

	/*
	 * Check to see if the pHCI can be offlined
	 */
	if (ph->ph_unstable) {
		MDI_DEBUG(1, (MDI_WARN, dip,
		    "!One or more target devices are in transient state. "
		    "This device can not be removed at this moment. "
		    "Please try again later."));
		MDI_PHCI_UNLOCK(ph);
		return (NDI_BUSY);
	}

	pip = ph->ph_path_head;
	while (pip != NULL) {
		MDI_PI_LOCK(pip);
		/* Remember the successor before any lock is dropped */
		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;

		/*
		 * The mdi_pathinfo state is OK. Check the client state.
		 * If failover in progress fail the pHCI from offlining
		 */
		ct = MDI_PI(pip)->pi_client;
		i_mdi_client_lock(ct, pip);
		if ((MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) ||
		    (ct->ct_unstable)) {
			/*
			 * Failover is in progress, Fail the DR
			 */
			MDI_DEBUG(1, (MDI_WARN, dip,
			    "!pHCI device is busy. "
			    "This device can not be removed at this moment. "
			    "Please try again later."));
			MDI_PI_UNLOCK(pip);
			i_mdi_client_unlock(ct);
			MDI_PHCI_UNLOCK(ph);
			return (NDI_BUSY);
		}
		MDI_PI_UNLOCK(pip);

		/*
		 * Check to see of we are removing the last path of this
		 * client device...
		 */
		cdip = ct->ct_dip;
		if (cdip && (i_ddi_node_state(cdip) >= DS_INITIALIZED) &&
		    (i_mdi_client_compute_state(ct, ph) ==
		    MDI_CLIENT_STATE_FAILED)) {
			/* Both locks are dropped across the NDI call */
			i_mdi_client_unlock(ct);
			MDI_PHCI_UNLOCK(ph);
			if (ndi_devi_offline(cdip,
			    NDI_DEVFS_CLEAN) != NDI_SUCCESS) {
				/*
				 * ndi_devi_offline() failed.
				 * This pHCI provides the critical path
				 * to one or more client devices.
				 * Return busy.
				 */
				MDI_PHCI_LOCK(ph);
				MDI_DEBUG(1, (MDI_WARN, dip,
				    "!pHCI device is busy. "
				    "This device can not be removed at this "
				    "moment. Please try again later."));
				failed_pip = pip;
				break;
			} else {
				MDI_PHCI_LOCK(ph);
				pip = next;
			}
		} else {
			i_mdi_client_unlock(ct);
			pip = next;
		}
	}

	if (failed_pip) {
		/*
		 * Undo pass: revisit the paths in front of the one that
		 * failed and bring their client nodes in line with the
		 * current client state.
		 */
		pip = ph->ph_path_head;
		while (pip != failed_pip) {
			MDI_PI_LOCK(pip);
			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
			ct = MDI_PI(pip)->pi_client;
			i_mdi_client_lock(ct, pip);
			cdip = ct->ct_dip;
			switch (MDI_CLIENT_STATE(ct)) {
			case MDI_CLIENT_STATE_OPTIMAL:
			case MDI_CLIENT_STATE_DEGRADED:
				if (cdip) {
					MDI_PI_UNLOCK(pip);
					i_mdi_client_unlock(ct);
					MDI_PHCI_UNLOCK(ph);
					(void) ndi_devi_online(cdip, 0);
					MDI_PHCI_LOCK(ph);
					pip = next;
					continue;
				}
				break;

			case MDI_CLIENT_STATE_FAILED:
				if (cdip) {
					MDI_PI_UNLOCK(pip);
					i_mdi_client_unlock(ct);
					MDI_PHCI_UNLOCK(ph);
					(void) ndi_devi_offline(cdip,
						NDI_DEVFS_CLEAN);
					MDI_PHCI_LOCK(ph);
					pip = next;
					continue;
				}
				break;
			}
			/* No dev_info node (or other state): just move on */
			MDI_PI_UNLOCK(pip);
			i_mdi_client_unlock(ct);
			pip = next;
		}
		MDI_PHCI_UNLOCK(ph);
		return (NDI_BUSY);
	}

	/*
	 * Mark the pHCI as offline
	 */
	MDI_PHCI_SET_OFFLINE(ph);

	/*
	 * Mark the child mdi_pathinfo nodes as transient
	 */
	pip = ph->ph_path_head;
	while (pip != NULL) {
		MDI_PI_LOCK(pip);
		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
		MDI_PI_SET_OFFLINING(pip);
		MDI_PI_UNLOCK(pip);
		pip = next;
	}
	MDI_PHCI_UNLOCK(ph);
	/*
	 * Give a chance for any pending commands to execute
	 */
	delay_random(mdi_delay);
	MDI_PHCI_LOCK(ph);
	pip = ph->ph_path_head;
	while (pip != NULL) {
		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
		/*
		 * The return value is ignored; success is judged by the
		 * resulting path state below.
		 */
		(void) i_mdi_pi_offline(pip, flags);
		MDI_PI_LOCK(pip);
		/* NOTE(review): ct is assigned here but not used afterwards */
		ct = MDI_PI(pip)->pi_client;
		if (!MDI_PI_IS_OFFLINE(pip)) {
			MDI_DEBUG(1, (MDI_WARN, dip,
			    "!pHCI device is busy. "
			    "This device can not be removed at this moment. "
			    "Please try again later."));
			MDI_PI_UNLOCK(pip);
			/* Roll the pHCI back to online before failing */
			MDI_PHCI_SET_ONLINE(ph);
			MDI_PHCI_UNLOCK(ph);
			return (NDI_BUSY);
		}
		MDI_PI_UNLOCK(pip);
		pip = next;
	}
	MDI_PHCI_UNLOCK(ph);

	return (rv);
}

/*
 * mdi_phci_mark_retiring():
 *		Walk the paths hanging off pHCI 'dip'.  For every client
 *		whose dev_info node is at least DS_INITIALIZED and whose
 *		computed state (i_mdi_client_compute_state()) is FAILED, i.e.
 *		this is its last path, mark the client dev_info node as
 *		retiring.  Nothing is done when dip is not a pHCI, has no
 *		pHCI structure, or the pHCI is already offline.
 *
 *		The pHCI lock is dropped around e_ddi_mark_retiring().
 */
void
mdi_phci_mark_retiring(dev_info_t *dip, char **cons_array)
{
	mdi_phci_t	*ph;
	mdi_client_t	*ct;
	mdi_pathinfo_t	*pip;
	mdi_pathinfo_t	*next;
	dev_info_t	*cdip;
	int		last_path;

	if (!MDI_PHCI(dip))
		return;

	ph = i_devi_get_phci(dip);
	if (ph == NULL)
		return;

	MDI_PHCI_LOCK(ph);

	if (MDI_PHCI_IS_OFFLINE(ph)) {
		/* has no last path */
		MDI_PHCI_UNLOCK(ph);
		return;
	}

	for (pip = ph->ph_path_head; pip != NULL; pip = next) {
		MDI_PI_LOCK(pip);
		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;

		ct = MDI_PI(pip)->pi_client;
		i_mdi_client_lock(ct, pip);
		MDI_PI_UNLOCK(pip);

		/* Evaluate the last-path condition under the client lock */
		cdip = ct->ct_dip;
		last_path = (cdip &&
		    (i_ddi_node_state(cdip) >= DS_INITIALIZED) &&
		    (i_mdi_client_compute_state(ct, ph) ==
		    MDI_CLIENT_STATE_FAILED));
		i_mdi_client_unlock(ct);

		if (last_path) {
			/* Last path. Mark client dip as retiring */
			MDI_PHCI_UNLOCK(ph);
			(void) e_ddi_mark_retiring(cdip, cons_array);
			MDI_PHCI_LOCK(ph);
		}
	}

	MDI_PHCI_UNLOCK(ph);
}

/*
 * mdi_phci_retire_notify():
 *		Walk the paths hanging off pHCI 'dip' and, for every client
 *		for which this is the last path (dev_info node at least
 *		DS_INITIALIZED and computed client state FAILED), pass the
 *		retire notification on to the client dev_info node.
 *
 *		*constraint is set to 0 when constraints cannot be checked:
 *		the pHCI is unstable, or a client is unstable or has a
 *		failover in progress.  Nothing is done when dip is not a
 *		pHCI, has no pHCI structure, or the pHCI is already offline.
 *
 *		The pHCI lock is dropped around e_ddi_retire_notify().
 */
void
mdi_phci_retire_notify(dev_info_t *dip, int *constraint)
{
	mdi_phci_t	*ph;
	mdi_client_t	*ct;
	mdi_pathinfo_t	*pip;
	mdi_pathinfo_t	*next;
	dev_info_t	*cdip;
	int		last_path;

	if (!MDI_PHCI(dip))
		return;

	ph = i_devi_get_phci(dip);
	if (ph == NULL)
		return;

	MDI_PHCI_LOCK(ph);

	if (MDI_PHCI_IS_OFFLINE(ph)) {
		MDI_PHCI_UNLOCK(ph);
		/* not last path */
		return;
	}

	if (ph->ph_unstable) {
		MDI_PHCI_UNLOCK(ph);
		/* can't check for constraints */
		*constraint = 0;
		return;
	}

	for (pip = ph->ph_path_head; pip != NULL; pip = next) {
		MDI_PI_LOCK(pip);
		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;

		/*
		 * The mdi_pathinfo state is OK. Check the client state.
		 * If failover in progress fail the pHCI from offlining
		 */
		ct = MDI_PI(pip)->pi_client;
		i_mdi_client_lock(ct, pip);
		if ((MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) ||
		    (ct->ct_unstable)) {
			/*
			 * Failover is in progress, can't check for constraints
			 */
			MDI_PI_UNLOCK(pip);
			i_mdi_client_unlock(ct);
			MDI_PHCI_UNLOCK(ph);
			*constraint = 0;
			return;
		}
		MDI_PI_UNLOCK(pip);

		/*
		 * Check to see of we are retiring the last path of this
		 * client device...  (evaluated under the client lock)
		 */
		cdip = ct->ct_dip;
		last_path = (cdip &&
		    (i_ddi_node_state(cdip) >= DS_INITIALIZED) &&
		    (i_mdi_client_compute_state(ct, ph) ==
		    MDI_CLIENT_STATE_FAILED));
		i_mdi_client_unlock(ct);

		if (last_path) {
			MDI_PHCI_UNLOCK(ph);
			(void) e_ddi_retire_notify(cdip, constraint);
			MDI_PHCI_LOCK(ph);
		}
	}

	MDI_PHCI_UNLOCK(ph);
}

/*
 * mdi_phci_retire_finalize():
 *		Offline the path(s) hanging off the pHCI.  Unless phci_only
 *		is set, a client for which this is the last path is first
 *		handed to e_ddi_retire_finalize() with constraint forced to
 *		0, so that the client dip itself is never retired.
 *
 *		If the pHCI or any client is in a transient state, or any
 *		path refuses to go offline, a warning is logged and the pHCI
 *		is left (or put back) online.
 */
void
mdi_phci_retire_finalize(dev_info_t *dip, int phci_only)
{
	mdi_phci_t	*phci;
	mdi_pathinfo_t	*path;
	mdi_pathinfo_t	*following;
	int		transient;

	if (!MDI_PHCI(dip))
		return;

	/* no pHCI structure: no last path and no pips */
	if ((phci = i_devi_get_phci(dip)) == NULL)
		return;

	MDI_PHCI_LOCK(phci);

	/* already offline: no last path and no pips */
	if (MDI_PHCI_IS_OFFLINE(phci)) {
		MDI_PHCI_UNLOCK(phci);
		return;
	}

	/*
	 * Remember whether anything is in a transient state; the decision
	 * to give up is only taken once every client has been visited.
	 */
	transient = phci->ph_unstable ? 1 : 0;

	for (path = phci->ph_path_head; path != NULL; path = following) {
		mdi_client_t	*client;
		dev_info_t	*client_dip;
		int		last_path;
		int		constraint;

		MDI_PI_LOCK(path);
		following = (mdi_pathinfo_t *)MDI_PI(path)->pi_phci_link;

		/* A failover in progress prevents offlining the pHCI */
		client = MDI_PI(path)->pi_client;
		i_mdi_client_lock(client, path);
		if ((MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(client)) ||
		    (client->ct_unstable)) {
			transient = 1;
		}
		MDI_PI_UNLOCK(path);

		/* Are we removing the last path of this client device? */
		client_dip = client->ct_dip;
		last_path = (!phci_only && client_dip &&
		    (i_ddi_node_state(client_dip) >= DS_INITIALIZED) &&
		    (i_mdi_client_compute_state(client, phci) ==
		    MDI_CLIENT_STATE_FAILED));
		i_mdi_client_unlock(client);
		if (!last_path)
			continue;

		/*
		 * Clients are not retired, only the path to them is.  When
		 * the last path goes away MPXIO fails all I/O to the client;
		 * since a path error must not retire the client itself,
		 * constraint is passed as 0.  The pHCI lock is dropped
		 * around the call.
		 */
		MDI_PHCI_UNLOCK(phci);
		constraint = 0;
		(void) e_ddi_retire_finalize(client_dip, &constraint);
		MDI_PHCI_LOCK(phci);
	}

	/* Something was in transition: the pip(s) cannot be offlined */
	if (transient) {
		cmn_err(CE_WARN, "%s%d: mdi_phci_retire_finalize: "
		    "pHCI in transient state, cannot retire",
		    ddi_driver_name(dip), ddi_get_instance(dip));
		MDI_PHCI_UNLOCK(phci);
		return;
	}

	/* Mark the pHCI offline, then flag every child path as offlining */
	MDI_PHCI_SET_OFFLINE(phci);

	for (path = phci->ph_path_head; path != NULL; path = following) {
		MDI_PI_LOCK(path);
		following = (mdi_pathinfo_t *)MDI_PI(path)->pi_phci_link;
		MDI_PI_SET_OFFLINING(path);
		MDI_PI_UNLOCK(path);
	}
	MDI_PHCI_UNLOCK(phci);

	/* Give a chance for any pending commands to execute */
	delay_random(mdi_delay);

	/* Now actually offline each path, backing out on the first failure */
	MDI_PHCI_LOCK(phci);
	for (path = phci->ph_path_head; path != NULL; path = following) {
		following = (mdi_pathinfo_t *)MDI_PI(path)->pi_phci_link;
		(void) i_mdi_pi_offline(path, 0);
		MDI_PI_LOCK(path);
		if (!MDI_PI_IS_OFFLINE(path)) {
			cmn_err(CE_WARN, "mdi_phci_retire_finalize: "
			    "path %d %s busy, cannot offline",
			    mdi_pi_get_path_instance(path),
			    mdi_pi_spathname(path));
			MDI_PI_UNLOCK(path);
			MDI_PHCI_SET_ONLINE(phci);
			MDI_PHCI_UNLOCK(phci);
			return;
		}
		MDI_PI_UNLOCK(path);
	}
	MDI_PHCI_UNLOCK(phci);
}

/*
 * mdi_phci_unretire():
 *		Bring a pHCI back online.  dip must be a pHCI node; this is
 *		only asserted, not checked at run time.
 */
void
mdi_phci_unretire(dev_info_t *dip)
{
	ASSERT(MDI_PHCI(dip));

	/*
	 * Online the phci
	 */
	i_mdi_phci_online(dip);
}

/*
 * i_mdi_client_offline():
 *		Take the client component behind dip offline.  The request
 *		is refused with NDI_BUSY while the client is unstable (one or
 *		more paths in a transient state) or a failover is in
 *		progress.  When NDI_DEVI_REMOVE is set in flags the client's
 *		link to its dev_info node is cleared as well.
 *
 * Return Values:
 *		NDI_SUCCESS (also when dip has no client structure)
 *		NDI_BUSY
 */
/*ARGSUSED*/
static int
i_mdi_client_offline(dev_info_t *dip, uint_t flags)
{
	mdi_client_t	*client = i_devi_get_client(dip);

	MDI_DEBUG(2, (MDI_NOTE, dip,
	    "called %p %p", (void *)dip, (void *)client));

	if (client == NULL)
		return (NDI_SUCCESS);

	MDI_CLIENT_LOCK(client);

	/* Paths in a transient state: don't allow the offline */
	if (client->ct_unstable) {
		MDI_DEBUG(1, (MDI_WARN, dip,
		    "!One or more paths to "
		    "this device are in transient state. "
		    "This device can not be removed at this moment. "
		    "Please try again later."));
		MDI_CLIENT_UNLOCK(client);
		return (NDI_BUSY);
	}

	/* Failover in progress: don't allow DR of the client device */
	if (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(client)) {
		MDI_DEBUG(1, (MDI_WARN, dip,
		    "!Client device is Busy. "
		    "This device can not be removed at this moment. "
		    "Please try again later."));
		MDI_CLIENT_UNLOCK(client);
		return (NDI_BUSY);
	}

	MDI_CLIENT_SET_OFFLINE(client);

	/* On removal, unbind our relationship with the dev_info node */
	if (flags & NDI_DEVI_REMOVE) {
		client->ct_dip = NULL;
	}
	MDI_CLIENT_UNLOCK(client);

	return (NDI_SUCCESS);
}

/*
 * mdi_pre_attach():
 *		Pre attach() notification handler.  The only request that is
 *		rejected is the old DDI_PM_RESUME command on a node that is
 *		an MDI component; everything else is let through.
 */
/*ARGSUSED*/
int
mdi_pre_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int	is_mdi_node;

	is_mdi_node = (DEVI(dip)->devi_mdi_component != MDI_COMPONENT_NONE);

	/* don't support old DDI_PM_RESUME */
	if (is_mdi_node && (cmd == DDI_PM_RESUME)) {
		return (DDI_FAILURE);
	}

	return (DDI_SUCCESS);
}

/*
 * mdi_post_attach():
 *		Post attach() notification handler.  Records the outcome of
 *		an attach or resume in the MDI state of the node:
 *
 *		pHCI:	DDI_ATTACH -> attached on success, detached on error
 *			DDI_RESUME -> resumed on success, suspended on error
 *		client:	DDI_ATTACH -> on error mark detached and reset the
 *			client's power state; on success tell the vHCI (if it
 *			registered vo_client_attached) and mark attached
 *			DDI_RESUME -> resumed on success, suspended on error
 *
 *		A node may be both a pHCI and a client; both halves are then
 *		processed.  Other commands are ignored.
 */
/*ARGSUSED*/
void
mdi_post_attach(dev_info_t *dip, ddi_attach_cmd_t cmd, int error)
{
	mdi_phci_t	*ph;
	mdi_client_t	*ct;
	mdi_vhci_t	*vh;

	if (MDI_PHCI(dip)) {
		ph = i_devi_get_phci(dip);
		ASSERT(ph != NULL);

		MDI_PHCI_LOCK(ph);
		switch (cmd) {
		case DDI_ATTACH:
			MDI_DEBUG(2, (MDI_NOTE, dip,
			    "phci post_attach called %p", (void *)ph));
			if (error == DDI_SUCCESS) {
				MDI_PHCI_SET_ATTACH(ph);
			} else {
				MDI_DEBUG(1, (MDI_NOTE, dip,
				    "!pHCI post_attach failed: error %d",
				    error));
				MDI_PHCI_SET_DETACH(ph);
			}
			break;

		case DDI_RESUME:
			MDI_DEBUG(2, (MDI_NOTE, dip,
			    "pHCI post_resume: called %p", (void *)ph));
			if (error == DDI_SUCCESS) {
				MDI_PHCI_SET_RESUME(ph);
			} else {
				MDI_DEBUG(1, (MDI_NOTE, dip,
				    "!pHCI post_resume failed: error %d",
				    error));
				MDI_PHCI_SET_SUSPEND(ph);
			}
			break;
		}
		MDI_PHCI_UNLOCK(ph);
	}

	if (MDI_CLIENT(dip)) {
		ct = i_devi_get_client(dip);
		ASSERT(ct != NULL);

		MDI_CLIENT_LOCK(ct);
		switch (cmd) {
		case DDI_ATTACH:
			MDI_DEBUG(2, (MDI_NOTE, dip,
			    "client post_attach called %p", (void *)ct));
			if (error != DDI_SUCCESS) {
				MDI_DEBUG(1, (MDI_NOTE, dip,
				    "!client post_attach failed: error %d",
				    error));
				MDI_CLIENT_SET_DETACH(ct);
				MDI_DEBUG(4, (MDI_WARN, dip,
				    "i_mdi_pm_reset_client"));
				i_mdi_pm_reset_client(ct);
				break;
			}

			/*
			 * Client device has successfully attached, inform
			 * the vhci.
			 */
			vh = ct->ct_vhci;
			if (vh->vh_ops->vo_client_attached)
				(*vh->vh_ops->vo_client_attached)(dip);

			MDI_CLIENT_SET_ATTACH(ct);
			break;

		case DDI_RESUME:
			/*
			 * This message used to read "post_attach", which
			 * made resume traces indistinguishable from attach
			 * traces; it now matches the pHCI resume message.
			 */
			MDI_DEBUG(2, (MDI_NOTE, dip,
			    "client post_resume: called %p", (void *)ct));
			if (error == DDI_SUCCESS) {
				MDI_CLIENT_SET_RESUME(ct);
			} else {
				MDI_DEBUG(1, (MDI_NOTE, dip,
				    "!client post_resume failed: error %d",
				    error));
				MDI_CLIENT_SET_SUSPEND(ct);
			}
			break;
		}
		MDI_CLIENT_UNLOCK(ct);
	}
}

/*
 * mdi_pre_detach():
 *		Pre detach notification handler.  The client half of the
 *		node (if any) is notified first and its result is
 *		deliberately ignored; only the pHCI half (if any) can fail
 *		the operation.
 */
/*ARGSUSED*/
int
mdi_pre_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	if (MDI_CLIENT(dip)) {
		(void) i_mdi_client_pre_detach(dip, cmd);
	}

	if (MDI_PHCI(dip)) {
		return (i_mdi_phci_pre_detach(dip, cmd));
	}

	return (DDI_SUCCESS);
}

/*
 * i_mdi_phci_pre_detach():
 *		pHCI half of the pre detach notification.
 *
 *		DDI_DETACH:  fails unless the pHCI is already offline;
 *		otherwise the pHCI is marked detached.
 *		DDI_SUSPEND: every client reachable through this pHCI that is
 *		neither detached nor already suspended is suspended first
 *		with devi_detach(DDI_SUSPEND).  If one of them fails, the
 *		clients visited before it that are in suspended state are
 *		resumed again and the failure is returned; otherwise the
 *		pHCI is marked suspended.
 *		Any other command fails.
 *
 * Return Values:
 *		DDI_SUCCESS (also when dip has no pHCI structure)
 *		DDI_FAILURE, or the failing devi_detach() result
 */
/*ARGSUSED*/
static int
i_mdi_phci_pre_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	int		rv = DDI_SUCCESS;
	mdi_phci_t	*ph;
	mdi_client_t	*ct;
	mdi_pathinfo_t	*pip;
	mdi_pathinfo_t	*failed_pip = NULL;
	mdi_pathinfo_t	*next;

	ph = i_devi_get_phci(dip);
	if (ph == NULL) {
		return (rv);
	}

	MDI_PHCI_LOCK(ph);
	switch (cmd) {
	case DDI_DETACH:
		MDI_DEBUG(2, (MDI_NOTE, dip,
		    "pHCI pre_detach: called %p", (void *)ph));
		if (!MDI_PHCI_IS_OFFLINE(ph)) {
			/*
			 * mdi_pathinfo nodes are still attached to
			 * this pHCI. Fail the detach for this pHCI.
			 */
			MDI_DEBUG(2, (MDI_WARN, dip,
			    "pHCI pre_detach: paths are still attached %p",
			    (void *)ph));
			rv = DDI_FAILURE;
			break;
		}
		MDI_PHCI_SET_DETACH(ph);
		break;

	case DDI_SUSPEND:
		/*
		 * pHCI is getting suspended.  Since mpxio client
		 * devices may not be suspended at this point, to avoid
		 * a potential stack overflow, it is important to suspend
		 * client devices before pHCI can be suspended.
		 */

		MDI_DEBUG(2, (MDI_NOTE, dip,
		    "pHCI pre_suspend: called %p", (void *)ph));
		/*
		 * Suspend all the client devices accessible through this pHCI
		 */
		pip = ph->ph_path_head;
		while (pip != NULL && rv == DDI_SUCCESS) {
			dev_info_t *cdip;
			/*
			 * Pick up the successor, the client and its dip
			 * while the path lock is held.
			 */
			MDI_PI_LOCK(pip);
			next =
			    (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
			ct = MDI_PI(pip)->pi_client;
			i_mdi_client_lock(ct, pip);
			cdip = ct->ct_dip;
			MDI_PI_UNLOCK(pip);
			if ((MDI_CLIENT_IS_DETACHED(ct) == 0) &&
			    MDI_CLIENT_IS_SUSPENDED(ct) == 0) {
				/* client lock is dropped before suspending */
				i_mdi_client_unlock(ct);
				if ((rv = devi_detach(cdip, DDI_SUSPEND)) !=
				    DDI_SUCCESS) {
					/*
					 * Suspend of one of the client
					 * device has failed.
					 */
					MDI_DEBUG(1, (MDI_WARN, dip,
					    "!suspend of device (%s%d) failed.",
					    ddi_driver_name(cdip),
					    ddi_get_instance(cdip)));
					/* remember where the rollback stops */
					failed_pip = pip;
					break;
				}
			} else {
				i_mdi_client_unlock(ct);
			}
			pip = next;
		}

		if (rv == DDI_SUCCESS) {
			/*
			 * Suspend of client devices is complete. Proceed
			 * with pHCI suspend.
			 */
			MDI_PHCI_SET_SUSPEND(ph);
		} else {
			/*
			 * Revert back all the suspended client device states
			 * to converse.
			 */
			/* walks from the head up to, not including, failed_pip */
			pip = ph->ph_path_head;
			while (pip != failed_pip) {
				dev_info_t *cdip;
				MDI_PI_LOCK(pip);
				next =
				    (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
				ct = MDI_PI(pip)->pi_client;
				i_mdi_client_lock(ct, pip);
				cdip = ct->ct_dip;
				MDI_PI_UNLOCK(pip);
				if (MDI_CLIENT_IS_SUSPENDED(ct)) {
					i_mdi_client_unlock(ct);
					(void) devi_attach(cdip, DDI_RESUME);
				} else {
					i_mdi_client_unlock(ct);
				}
				pip = next;
			}
		}
		break;

	default:
		rv = DDI_FAILURE;
		break;
	}
	MDI_PHCI_UNLOCK(ph);
	return (rv);
}

/*
 * i_mdi_client_pre_detach():
 *		Client half of the pre detach notification: mark the client
 *		detached for DDI_DETACH or suspended for DDI_SUSPEND.
 *
 * Return Values:
 *		DDI_SUCCESS (also when dip has no client structure)
 *		DDI_FAILURE for any other command
 */
/*ARGSUSED*/
static int
i_mdi_client_pre_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	mdi_client_t	*client = i_devi_get_client(dip);
	int		status = DDI_SUCCESS;

	if (client == NULL)
		return (DDI_SUCCESS);

	MDI_CLIENT_LOCK(client);
	if (cmd == DDI_DETACH) {
		MDI_DEBUG(2, (MDI_NOTE, dip,
		    "client pre_detach: called %p",
		    (void *)client));
		MDI_CLIENT_SET_DETACH(client);
	} else if (cmd == DDI_SUSPEND) {
		MDI_DEBUG(2, (MDI_NOTE, dip,
		    "client pre_suspend: called %p",
		    (void *)client));
		MDI_CLIENT_SET_SUSPEND(client);
	} else {
		status = DDI_FAILURE;
	}
	MDI_CLIENT_UNLOCK(client);

	return (status);
}

/*
 * mdi_post_detach():
 *		Post detach notification handler.  Called with the result of
 *		the detach/suspend in error; dispatches to the pHCI and/or
 *		client handler depending on which MDI roles dip has.
 */
/*ARGSUSED*/
void
mdi_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error)
{
	/*
	 * Let each MDI role of this node bring its state in line with the
	 * outcome of the detach/suspend (the handlers themselves decide
	 * what to do for success versus failure).
	 */
	if (MDI_PHCI(dip))
		i_mdi_phci_post_detach(dip, cmd, error);

	if (MDI_CLIENT(dip))
		i_mdi_client_post_detach(dip, cmd, error);
}

/*
 * i_mdi_phci_post_detach():
 *		pHCI half of the post detach notification.  When the detach
 *		or suspend failed, the pHCI is put back into the converse
 *		state (attached, respectively resumed).  Nothing is changed
 *		on success or for other commands.
 */
/*ARGSUSED*/
static void
i_mdi_phci_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error)
{
	mdi_phci_t	*phci = i_devi_get_phci(dip);
	int		failed = (error != DDI_SUCCESS);

	if (phci == NULL)
		return;

	MDI_PHCI_LOCK(phci);
	if (cmd == DDI_DETACH) {
		MDI_DEBUG(2, (MDI_NOTE, dip,
		    "pHCI post_detach: called %p",
		    (void *)phci));
		/* detach did not happen: we are still attached */
		if (failed) {
			MDI_PHCI_SET_ATTACH(phci);
		}
	} else if (cmd == DDI_SUSPEND) {
		MDI_DEBUG(2, (MDI_NOTE, dip,
		    "pHCI post_suspend: called %p",
		    (void *)phci));
		/* suspend did not happen: we are still running */
		if (failed) {
			MDI_PHCI_SET_RESUME(phci);
		}
	}
	MDI_PHCI_UNLOCK(phci);
}

/*
 * i_mdi_client_post_detach():
 *		Client half of the post detach notification.
 *
 *		DDI_DETACH:  the client's power holds are adjusted
 *		regardless of the outcome - released by ct_path_count if the
 *		client dip is currently attaching, otherwise reset - and on
 *		failure the client is marked attached again.
 *		DDI_SUSPEND: on failure the client is marked resumed again.
 */
/*ARGSUSED*/
static void
i_mdi_client_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error)
{
	mdi_client_t	*client = i_devi_get_client(dip);
	int		failed = (error != DDI_SUCCESS);

	if (client == NULL)
		return;

	MDI_CLIENT_LOCK(client);
	if (cmd == DDI_DETACH) {
		MDI_DEBUG(2, (MDI_NOTE, dip,
		    "client post_detach: called %p", (void *)client));
		if (DEVI_IS_ATTACHING(client->ct_dip)) {
			MDI_DEBUG(4, (MDI_NOTE, dip,
			    "i_mdi_pm_rele_client\n"));
			i_mdi_pm_rele_client(client, client->ct_path_count);
		} else {
			MDI_DEBUG(4, (MDI_NOTE, dip,
			    "i_mdi_pm_reset_client\n"));
			i_mdi_pm_reset_client(client);
		}
		/* detach did not happen: restore the converse state */
		if (failed) {
			MDI_CLIENT_SET_ATTACH(client);
		}
	} else if (cmd == DDI_SUSPEND) {
		MDI_DEBUG(2, (MDI_NOTE, dip,
		    "called %p", (void *)client));
		/* suspend did not happen: restore the converse state */
		if (failed) {
			MDI_CLIENT_SET_RESUME(client);
		}
	}
	MDI_CLIENT_UNLOCK(client);
}

/*
 * mdi_pi_kstat_exists():
 *		Returns 1 if per-path kstats are attached to pip, 0 if not.
 */
int
mdi_pi_kstat_exists(mdi_pathinfo_t *pip)
{
	return (MDI_PI(pip)->pi_kstats != NULL);
}

/*
 * mdi_pi_kstat_create():
 *		Create and install the per-path (client - pHCI) statistics:
 *		an I/O kstat named ksname (nread, nwritten, reads, writes)
 *		and a named error kstat called ksname with ",err" appended
 *		(soft, hard, transport, interconnect and pHCI error
 *		counters plus failover counts).
 *
 *		Note that ",err" is appended to the caller's ksname buffer in
 *		place, so the buffer must have room for it and is modified
 *		on return.  If the path already has kstats nothing is done.
 *
 * Return Values:
 *		MDI_SUCCESS
 *		MDI_FAILURE	either kstat could not be created
 */
int
mdi_pi_kstat_create(mdi_pathinfo_t *pip, char *ksname)
{
	kstat_t			*io_ks;
	kstat_t			*err_ks;
	struct pi_errs		*errs;
	struct mdi_pi_kstats	*stats;

	if (MDI_PI(pip)->pi_kstats != NULL)
		return (MDI_SUCCESS);

	io_ks = kstat_create("mdi", 0, ksname, "iopath",
	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT);
	if (io_ks == NULL)
		return (MDI_FAILURE);

	(void) strcat(ksname, ",err");
	err_ks = kstat_create("mdi", 0, ksname, "iopath_errors",
	    KSTAT_TYPE_NAMED,
	    sizeof (struct pi_errs) / sizeof (kstat_named_t), 0);
	if (err_ks == NULL) {
		/* don't leave a half-built pair behind */
		kstat_delete(io_ks);
		return (MDI_FAILURE);
	}

	/* Name each 32-bit error counter */
	errs = (struct pi_errs *)err_ks->ks_data;
	kstat_named_init(&errs->pi_softerrs, "Soft Errors",
	    KSTAT_DATA_UINT32);
	kstat_named_init(&errs->pi_harderrs, "Hard Errors",
	    KSTAT_DATA_UINT32);
	kstat_named_init(&errs->pi_transerrs, "Transport Errors",
	    KSTAT_DATA_UINT32);
	kstat_named_init(&errs->pi_icnt_busy, "Interconnect Busy",
	    KSTAT_DATA_UINT32);
	kstat_named_init(&errs->pi_icnt_errors, "Interconnect Errors",
	    KSTAT_DATA_UINT32);
	kstat_named_init(&errs->pi_phci_rsrc, "pHCI No Resources",
	    KSTAT_DATA_UINT32);
	kstat_named_init(&errs->pi_phci_localerr, "pHCI Local Errors",
	    KSTAT_DATA_UINT32);
	kstat_named_init(&errs->pi_phci_invstate, "pHCI Invalid State",
	    KSTAT_DATA_UINT32);
	kstat_named_init(&errs->pi_failedfrom, "Failed From",
	    KSTAT_DATA_UINT32);
	kstat_named_init(&errs->pi_failedto, "Failed To",
	    KSTAT_DATA_UINT32);

	/* Bundle both kstats with an initial reference for this path */
	stats = kmem_alloc(sizeof (*stats), KM_SLEEP);
	stats->pi_kstat_ref = 1;
	stats->pi_kstat_iostats = io_ks;
	stats->pi_kstat_errstats = err_ks;
	kstat_install(io_ks);
	kstat_install(err_ks);
	MDI_PI(pip)->pi_kstats = stats;

	return (MDI_SUCCESS);
}

/*
 * i_mdi_pi_kstat_destroy():
 *		Detach the per-path kstats from pip.  The kstat bundle may be
 *		shared between multiple pathinfo nodes, so it is reference
 *		counted: the kstats themselves are deleted and the bundle
 *		freed only when the last reference goes away.
 */
static void
i_mdi_pi_kstat_destroy(mdi_pathinfo_t *pip)
{
	struct mdi_pi_kstats *stats = MDI_PI(pip)->pi_kstats;

	if (stats == NULL)
		return;

	MDI_PI(pip)->pi_kstats = NULL;

	/* Drop this pathinfo's reference; others may still be using it */
	ASSERT(stats->pi_kstat_ref > 0);
	stats->pi_kstat_ref--;
	if (stats->pi_kstat_ref != 0)
		return;

	kstat_delete(stats->pi_kstat_iostats);
	kstat_delete(stats->pi_kstat_errstats);
	kmem_free(stats, sizeof (*stats));
}

/*
 * mdi_pi_kstat_iosupdate():
 *		Account a completed buf against the path's I/O kstat: bump
 *		the read or write operation count and add the number of
 *		bytes actually transferred (b_bcount - b_resid).
 *
 *		I/O can be driven across a path before its statistics exist
 *		(e.g. probe(9e)), so a path without kstats, or a NULL bp, is
 *		silently ignored.
 */
void
mdi_pi_kstat_iosupdate(mdi_pathinfo_t *pip, struct buf *bp)
{
	kstat_io_t	*io;
	size_t		moved;

	ASSERT(pip != NULL);

	if (bp == NULL || MDI_PI(pip)->pi_kstats == NULL)
		return;

	io = KSTAT_IO_PTR(MDI_PI(pip)->pi_kstats->pi_kstat_iostats);
	moved = bp->b_bcount - bp->b_resid;

	if (bp->b_flags & B_READ) {
		io->reads++;
		io->nread += moved;
	} else {
		io->writes++;
		io->nwritten += moved;
	}
}

/*
 * mdi_pi_enable_path():
 *		Enable the path (specific client/target/initiator).
 *		Enabling a path means that MPxIO may select it for routing
 *		future I/O requests, subject to other path state
 *		constraints.
 *
 * Return Values:
 *		MDI_SUCCESS
 *		MDI_FAILURE	the path has no pHCI
 */
int
mdi_pi_enable_path(mdi_pathinfo_t *pip, int flags)
{
	mdi_phci_t	*phci = MDI_PI(pip)->pi_phci;

	if (phci != NULL) {
		(void) i_mdi_enable_disable_path(pip, phci->ph_vhci, flags,
		    MDI_ENABLE_OP);
		MDI_DEBUG(5, (MDI_NOTE, phci->ph_dip,
		    "!returning success pip = %p. ph = %p",
		    (void *)pip, (void *)phci));
		return (MDI_SUCCESS);
	}

	MDI_DEBUG(1, (MDI_NOTE, mdi_pi_get_phci(pip),
	    "!failed: path %s %p: NULL ph",
	    mdi_pi_spathname(pip), (void *)pip));
	return (MDI_FAILURE);
}

/*
 * mdi_pi_disable_path():
 *		Disable the path (specific client/target/initiator).
 *		Disabling a path means that MPxIO will not select it for
 *		routing any new I/O requests.
 *
 * Return Values:
 *		MDI_SUCCESS
 *		MDI_FAILURE	the path has no pHCI
 */
int
mdi_pi_disable_path(mdi_pathinfo_t *pip, int flags)
{
	mdi_phci_t	*phci = MDI_PI(pip)->pi_phci;

	if (phci != NULL) {
		(void) i_mdi_enable_disable_path(pip, phci->ph_vhci, flags,
		    MDI_DISABLE_OP);
		MDI_DEBUG(5, (MDI_NOTE, phci->ph_dip,
		    "!returning success pip = %p. ph = %p",
		    (void *)pip, (void *)phci));
		return (MDI_SUCCESS);
	}

	MDI_DEBUG(1, (MDI_NOTE, mdi_pi_get_phci(pip),
	    "!failed: path %s %p: NULL ph",
	    mdi_pi_spathname(pip), (void *)pip));
	return (MDI_FAILURE);
}

/*
 * mdi_pi_disable():
 *		Disable the path through a particular pHCI (pdip) for a
 *		particular client (cdip).  Disabling a path means that MPxIO
 *		will not select the disabled path for routing any new I/O
 *		requests.  The work is done by i_mdi_pi_enable_disable()
 *		with MDI_DISABLE_OP, whose result is returned unchanged.
 *
 * NOTE: this will be removed once the NWS files are changed to use the new
 * mdi_{enable,disable}_path interfaces
 */
int
mdi_pi_disable(dev_info_t *cdip, dev_info_t *pdip, int flags)
{
	return (i_mdi_pi_enable_disable(cdip, pdip, flags, MDI_DISABLE_OP));
}

/*
 * mdi_pi_enable():
 *		Enable the path through a particular pHCI (pdip) for a
 *		particular client (cdip).  Enabling a path means that MPxIO
 *		may select the enabled path for routing future I/O requests,
 *		subject to other path state constraints.  The work is done
 *		by i_mdi_pi_enable_disable() with MDI_ENABLE_OP, whose result
 *		is returned unchanged.
 *
 * NOTE: this will be removed once the NWS files are changed to use the new
 * mdi_{enable,disable}_path interfaces
 */

int
mdi_pi_enable(dev_info_t *cdip, dev_info_t *pdip, int flags)
{
	return (i_mdi_pi_enable_disable(cdip, pdip, flags, MDI_ENABLE_OP));
}

/*
 * i_mdi_enable_disable_path():
 *		Common routine for enabling/disabling a single path.
 *
 *		pip	path to operate on
 *		vh	vHCI whose vo_pi_state_change callback (if any) is
 *			told about the change before and after it happens
 *		flags	which disable bit to operate on: USER_DISABLE,
 *			DRIVER_DISABLE or DRIVER_DISABLE_TRANSIENT; any other
 *			value changes no path state
 *		op	MDI_ENABLE_OP or MDI_DISABLE_OP
 *
 *		For DRIVER_DISABLE_TRANSIENT the disable is only applied if
 *		the "before" callback succeeded; otherwise the transient
 *		disable is cleared.  A vHCI that registers no callback is
 *		treated as having succeeded.
 *
 * Return Values:
 *		The next path on the pHCI's path list (pi_phci_link), so
 *		callers can use this routine to walk that list.
 */
static mdi_pathinfo_t *
i_mdi_enable_disable_path(mdi_pathinfo_t *pip, mdi_vhci_t *vh, int flags,
		int op)
{
	int		sync_flag = 0;
	/*
	 * rv must start out as success: it is consulted below for
	 * DRIVER_DISABLE_TRANSIENT even when there is no callback to set it.
	 */
	int		rv = MDI_SUCCESS;
	mdi_pathinfo_t 	*next;
	int		(*f)() = NULL;

	/*
	 * Check to make sure the path is not already in the
	 * requested state. If it is just return the next path
	 * as we have nothing to do here.
	 */
	if ((MDI_PI_IS_DISABLE(pip) && op == MDI_DISABLE_OP) ||
	    (!MDI_PI_IS_DISABLE(pip) && op == MDI_ENABLE_OP)) {
		MDI_PI_LOCK(pip);
		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
		MDI_PI_UNLOCK(pip);
		return (next);
	}

	f = vh->vh_ops->vo_pi_state_change;

	/* the caller's flags are passed to the callback in bits 8-11 */
	sync_flag = (flags << 8) & 0xf00;

	/*
	 * Do a callback into the mdi consumer to let it
	 * know that path is about to get enabled/disabled.
	 */
	if (f != NULL) {
		rv = (*f)(vh->vh_dip, pip, 0,
			MDI_PI_EXT_STATE(pip),
			MDI_EXT_STATE_CHANGE | sync_flag |
			op | MDI_BEFORE_STATE_CHANGE);
		if (rv != MDI_SUCCESS) {
			MDI_DEBUG(2, (MDI_WARN, vh->vh_dip,
			    "vo_pi_state_change: failed rv = %x", rv));
		}
	}
	MDI_PI_LOCK(pip);
	next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;

	switch (flags) {
		case USER_DISABLE:
			if (op == MDI_DISABLE_OP) {
				MDI_PI_SET_USER_DISABLE(pip);
			} else {
				MDI_PI_SET_USER_ENABLE(pip);
			}
			break;
		case DRIVER_DISABLE:
			if (op == MDI_DISABLE_OP) {
				MDI_PI_SET_DRV_DISABLE(pip);
			} else {
				MDI_PI_SET_DRV_ENABLE(pip);
			}
			break;
		case DRIVER_DISABLE_TRANSIENT:
			/* only disable if the consumer agreed beforehand */
			if (op == MDI_DISABLE_OP && rv == MDI_SUCCESS) {
				MDI_PI_SET_DRV_DISABLE_TRANS(pip);
			} else {
				MDI_PI_SET_DRV_ENABLE_TRANS(pip);
			}
			break;
		default:
			/* unknown flag: leave the path state untouched */
			break;
	}
	MDI_PI_UNLOCK(pip);
	/*
	 * Do a callback into the mdi consumer to let it
	 * know that path is now enabled/disabled.
	 */
	if (f != NULL) {
		rv = (*f)(vh->vh_dip, pip, 0,
			MDI_PI_EXT_STATE(pip),
			MDI_EXT_STATE_CHANGE | sync_flag |
			op | MDI_AFTER_STATE_CHANGE);
		if (rv != MDI_SUCCESS) {
			MDI_DEBUG(2, (MDI_WARN, vh->vh_dip,
			    "vo_pi_state_change failed: rv = %x", rv));
		}
	}
	return (next);
}

/*
 * i_mdi_pi_enable_disable():
 *		Common routine for doing enable/disable by dev_info node.
 *
 *		cdip	client node, or NULL to operate on the pHCI itself
 *		pdip	pHCI node
 *		flags	USER_DISABLE, DRIVER_DISABLE or
 *			DRIVER_DISABLE_TRANSIENT
 *		op	MDI_ENABLE_OP or MDI_DISABLE_OP
 *
 *		With cdip == NULL the pHCI is marked enabled/disabled and
 *		every path on it is then enabled/disabled in turn.  With a
 *		client given, only that client's path through this pHCI is
 *		changed.
 *
 * Return Values:
 *		MDI_SUCCESS
 *		MDI_FAILURE	pdip has no pHCI, op or flags is invalid, cdip
 *				has no client, or the client has no path
 *				through this pHCI
 *
 * NOTE: this will be removed once the NWS files are changed to use the new
 * mdi_{enable,disable}_path has been putback
 */
int
i_mdi_pi_enable_disable(dev_info_t *cdip, dev_info_t *pdip, int flags, int op)
{

	mdi_phci_t	*ph;
	mdi_vhci_t	*vh = NULL;
	mdi_client_t	*ct;
	mdi_pathinfo_t	*next, *pip;
	int		found_it;

	ph = i_devi_get_phci(pdip);
	MDI_DEBUG(5, (MDI_NOTE, cdip ? cdip : pdip,
	    "!op = %d pdip = %p cdip = %p", op, (void *)pdip,
	    (void *)cdip));
	if (ph == NULL) {
		MDI_DEBUG(1, (MDI_NOTE, cdip ? cdip : pdip,
		    "!failed: operation %d: NULL ph", op));
		return (MDI_FAILURE);
	}

	if ((op != MDI_ENABLE_OP) && (op != MDI_DISABLE_OP)) {
		MDI_DEBUG(1, (MDI_NOTE, cdip ? cdip : pdip,
		    "!failed: invalid operation %d", op));
		return (MDI_FAILURE);
	}

	vh = ph->ph_vhci;

	if (cdip == NULL) {
		/*
		 * Need to mark the Phci as enabled/disabled.
		 */
		MDI_DEBUG(4, (MDI_NOTE, cdip ? cdip : pdip,
		    "op %d for the phci", op));
		MDI_PHCI_LOCK(ph);
		switch (flags) {
			case USER_DISABLE:
				if (op == MDI_DISABLE_OP) {
					MDI_PHCI_SET_USER_DISABLE(ph);
				} else {
					MDI_PHCI_SET_USER_ENABLE(ph);
				}
				break;
			case DRIVER_DISABLE:
				if (op == MDI_DISABLE_OP) {
					MDI_PHCI_SET_DRV_DISABLE(ph);
				} else {
					MDI_PHCI_SET_DRV_ENABLE(ph);
				}
				break;
			case DRIVER_DISABLE_TRANSIENT:
				if (op == MDI_DISABLE_OP) {
					MDI_PHCI_SET_DRV_DISABLE_TRANSIENT(ph);
				} else {
					MDI_PHCI_SET_DRV_ENABLE_TRANSIENT(ph);
				}
				break;
			default:
				/*
				 * Unknown flag.  The lock is dropped here, so
				 * we must bail out rather than fall through
				 * to the path walk below, which ends with a
				 * second MDI_PHCI_UNLOCK().
				 */
				MDI_PHCI_UNLOCK(ph);
				MDI_DEBUG(1, (MDI_NOTE, cdip ? cdip : pdip,
				    "!invalid flag argument= %d", flags));
				return (MDI_FAILURE);
		}

		/*
		 * Phci has been disabled. Now try to enable/disable
		 * path info's to each client.
		 */
		pip = ph->ph_path_head;
		while (pip != NULL) {
			/* returns the next path on the pHCI's list */
			pip = i_mdi_enable_disable_path(pip, vh, flags, op);
		}
		MDI_PHCI_UNLOCK(ph);
	} else {

		/*
		 * Disable a specific client.
		 */
		ct = i_devi_get_client(cdip);
		if (ct == NULL) {
			MDI_DEBUG(1, (MDI_NOTE, cdip ? cdip : pdip,
			    "!failed: operation = %d: NULL ct", op));
			return (MDI_FAILURE);
		}

		/* find the client's path that goes through this pHCI */
		MDI_CLIENT_LOCK(ct);
		pip = ct->ct_path_head;
		found_it = 0;
		while (pip != NULL) {
			MDI_PI_LOCK(pip);
			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
			if (MDI_PI(pip)->pi_phci == ph) {
				MDI_PI_UNLOCK(pip);
				found_it = 1;
				break;
			}
			MDI_PI_UNLOCK(pip);
			pip = next;
		}


		MDI_CLIENT_UNLOCK(ct);
		if (found_it == 0) {
			MDI_DEBUG(1, (MDI_NOTE, cdip ? cdip : pdip,
			    "!failed. Could not find corresponding pip\n"));
			return (MDI_FAILURE);
		}

		(void) i_mdi_enable_disable_path(pip, vh, flags, op);
	}

	MDI_DEBUG(5, (MDI_NOTE, cdip ? cdip : pdip,
	    "!op %d returning success pdip = %p cdip = %p",
	    op, (void *)pdip, (void *)cdip));
	return (MDI_SUCCESS);
}

/*
 * i_mdi_pm_hold_pip():
 *		Ensure the pHCI behind a path is held powered up.  Must be
 *		called with the path locked; the path lock is dropped around
 *		pm_hold_power() and reacquired before returning.  The hold
 *		is taken at most once per path (tracked in pi_pm_held).
 */
static void
i_mdi_pm_hold_pip(mdi_pathinfo_t *pip)
{
	dev_info_t	*phci_dip;

	ASSERT(pip != NULL);
	ASSERT(MDI_PI_LOCKED(pip));

	/* already holding the pHCI on behalf of this path */
	if (MDI_PI(pip)->pi_pm_held)
		return;

	phci_dip = mdi_pi_get_phci(pip);
	MDI_DEBUG(4, (MDI_NOTE, phci_dip,
	    "%s %p", mdi_pi_spathname(pip), (void *)pip));
	if (phci_dip == NULL)
		return;

	MDI_PI_UNLOCK(pip);
	MDI_DEBUG(4, (MDI_NOTE, phci_dip, "kidsupcnt was %d",
	    DEVI(phci_dip)->devi_pm_kidsupcnt));
	pm_hold_power(phci_dip);
	MDI_DEBUG(4, (MDI_NOTE, phci_dip, "kidsupcnt is %d",
	    DEVI(phci_dip)->devi_pm_kidsupcnt));
	MDI_PI_LOCK(pip);

	/* If PM_GET_PM_INFO is NULL the pm_hold_power above was a noop */
	if (DEVI(phci_dip)->devi_pm_info) {
		MDI_PI(pip)->pi_pm_held = 1;
	}
}

/*
 * i_mdi_pm_rele_pip():
 *		Allow the pHCI behind a path to be powered down again by
 *		releasing the hold taken in i_mdi_pm_hold_pip().  Must be
 *		called with the path locked; the path lock is dropped around
 *		pm_rele_power() and reacquired before returning.  A path
 *		that holds nothing is left alone.
 */
static void
i_mdi_pm_rele_pip(mdi_pathinfo_t *pip)
{
	dev_info_t	*phci_dip;

	ASSERT(pip != NULL);
	ASSERT(MDI_PI_LOCKED(pip));

	if (!MDI_PI(pip)->pi_pm_held)
		return;

	phci_dip = mdi_pi_get_phci(pip);
	ASSERT(phci_dip != NULL);

	MDI_DEBUG(4, (MDI_NOTE, phci_dip,
	    "%s %p", mdi_pi_spathname(pip), (void *)pip));

	MDI_PI_UNLOCK(pip);
	MDI_DEBUG(4, (MDI_NOTE, phci_dip,
	    "kidsupcnt was %d", DEVI(phci_dip)->devi_pm_kidsupcnt));
	pm_rele_power(phci_dip);
	MDI_DEBUG(4, (MDI_NOTE, phci_dip,
	    "kidsupcnt is %d", DEVI(phci_dip)->devi_pm_kidsupcnt));
	MDI_PI_LOCK(pip);

	MDI_PI(pip)->pi_pm_held = 0;
}

/*
 * i_mdi_pm_hold_client():
 *		Add incr to the client's power hold count (ct_power_cnt).
 *		Must be called with the client locked.  Only the counter is
 *		touched here; no pHCI is powered up or held by this routine.
 */
static void
i_mdi_pm_hold_client(mdi_client_t *ct, int incr)
{
	ASSERT(MDI_CLIENT_LOCKED(ct));

	ct->ct_power_cnt += incr;
	MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
	    "%p ct_power_cnt = %d incr = %d",
	    (void *)ct, ct->ct_power_cnt, incr));
	/* the count must never go negative */
	ASSERT(ct->ct_power_cnt >= 0);
}

/*
 * i_mdi_rele_all_phci():
 *		Release the power hold on the pHCI of every path of the
 *		client.  Must be called with the client locked.  Each path
 *		is held (mdi_hold_path) for the duration of its release.
 */
static void
i_mdi_rele_all_phci(mdi_client_t *ct)
{
	mdi_pathinfo_t  *path;

	ASSERT(MDI_CLIENT_LOCKED(ct));

	for (path = (mdi_pathinfo_t *)ct->ct_path_head; path != NULL;
	    path = (mdi_pathinfo_t *)MDI_PI(path)->pi_client_link) {
		mdi_hold_path(path);
		MDI_PI_LOCK(path);
		i_mdi_pm_rele_pip(path);
		MDI_PI_UNLOCK(path);
		mdi_rele_path(path);
	}
}

/*
 * i_mdi_pm_rele_client():
 *		Drop decr from the client's power hold count - but only if
 *		the client dip is attached - and, once the count is zero,
 *		release the power holds on all of the client's pHCIs.  Must
 *		be called with the client locked.
 */
static void
i_mdi_pm_rele_client(mdi_client_t *ct, int decr)
{
	ASSERT(MDI_CLIENT_LOCKED(ct));

	if (i_ddi_devi_attached(ct->ct_dip)) {
		ct->ct_power_cnt -= decr;
		MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
		    "%p ct_power_cnt = %d decr = %d",
		    (void *)ct, ct->ct_power_cnt, decr));
	}

	ASSERT(ct->ct_power_cnt >= 0);

	/* holds remain: the pHCIs must stay powered */
	if (ct->ct_power_cnt != 0)
		return;

	i_mdi_rele_all_phci(ct);
}

/*
 * i_mdi_pm_reset_client():
 *		Throw away all of the client's power holds: zero the hold
 *		count, release the power hold on every pHCI of the client,
 *		clear the config/unconfig hold markers and record that a
 *		reset took place (ct_powercnt_reset).  Must be called with
 *		the client locked.
 */
static void
i_mdi_pm_reset_client(mdi_client_t *ct)
{
	MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
	    "%p ct_power_cnt = %d", (void *)ct, ct->ct_power_cnt));
	ASSERT(MDI_CLIENT_LOCKED(ct));
	ct->ct_power_cnt = 0;
	i_mdi_rele_all_phci(ct);
	ct->ct_powercnt_config = 0;
	ct->ct_powercnt_unconfig = 0;
	ct->ct_powercnt_reset = 1;
}

/*
 * i_mdi_power_one_phci():
 *		Take a power hold on the pHCI behind pip and bring all of
 *		its components to full power.  If the power up fails the
 *		hold is released again.
 *
 * Return Values:
 *		MDI_SUCCESS
 *		MDI_FAILURE	pm_powerup() of the pHCI failed
 */
static int
i_mdi_power_one_phci(mdi_pathinfo_t *pip)
{
	dev_info_t	*phci_dip;

	MDI_PI_LOCK(pip);
	i_mdi_pm_hold_pip(pip);
	phci_dip = mdi_pi_get_phci(pip);
	MDI_PI_UNLOCK(pip);

	/* bring all components of phci to full power */
	MDI_DEBUG(4, (MDI_NOTE, phci_dip,
	    "pm_powerup for %s%d %p", ddi_driver_name(phci_dip),
	    ddi_get_instance(phci_dip), (void *)pip));

	if (pm_powerup(phci_dip) != DDI_FAILURE)
		return (MDI_SUCCESS);

	MDI_DEBUG(4, (MDI_NOTE, phci_dip,
	    "pm_powerup FAILED for %s%d %p",
	    ddi_driver_name(phci_dip), ddi_get_instance(phci_dip),
	    (void *)pip));

	/* give back the hold taken above */
	MDI_PI_LOCK(pip);
	i_mdi_pm_rele_pip(pip);
	MDI_PI_UNLOCK(pip);

	return (MDI_FAILURE);
}

/*
 * i_mdi_power_all_phci():
 *		Power up the pHCI of every path of the client that is in
 *		INIT, ONLINE or STANDBY state.  Must be called with the
 *		client locked; the client lock is dropped while each pHCI is
 *		powered and reacquired afterwards, with the path held across
 *		that window.
 *
 * Return Values:
 *		MDI_SUCCESS	at least one pHCI was powered up
 *		MDI_FAILURE	none was
 */
static int
i_mdi_power_all_phci(mdi_client_t *ct)
{
	mdi_pathinfo_t  *path;
	int		powered = 0;

	ASSERT(MDI_CLIENT_LOCKED(ct));

	for (path = (mdi_pathinfo_t *)ct->ct_path_head; path != NULL;
	    path = (mdi_pathinfo_t *)MDI_PI(path)->pi_client_link) {
		/*
		 * Don't power if MDI_PATHINFO_STATE_FAULT
		 * or MDI_PATHINFO_STATE_OFFLINE.
		 */
		if (!(MDI_PI_IS_INIT(path) ||
		    MDI_PI_IS_ONLINE(path) || MDI_PI_IS_STANDBY(path)))
			continue;

		mdi_hold_path(path);
		MDI_CLIENT_UNLOCK(ct);
		if (i_mdi_power_one_phci(path) == MDI_SUCCESS) {
			powered = 1;
		}

		ASSERT(ct == MDI_PI(path)->pi_client);
		MDI_CLIENT_LOCK(ct);
		mdi_rele_path(path);
	}

	if (powered)
		return (MDI_SUCCESS);
	return (MDI_FAILURE);
}

/*
 * mdi_bus_power():
 *		1. Place the phci(s) into powered up state so that
 *		   client can do power management
 *		2. Ensure phci powered up as client power managing
 *
 *		Only BUS_POWER_PRE_NOTIFICATION, BUS_POWER_POST_NOTIFICATION
 *		and BUS_POWER_HAS_CHANGED are processed here.
 *		BUS_POWER_NOINVOL is rejected and every other op is handed
 *		to pm_busop_bus_power().
 * Return Values:
 *		MDI_SUCCESS
 *		MDI_FAILURE
 *		(or whatever pm_busop_bus_power() returns for ops passed on)
 */
int
mdi_bus_power(dev_info_t *parent, void *impl_arg, pm_bus_power_op_t op,
    void *arg, void *result)
{
	int			ret = MDI_SUCCESS;
	pm_bp_child_pwrchg_t	*bpc;
	mdi_client_t		*ct;
	dev_info_t		*cdip;
	pm_bp_has_changed_t	*bphc;

	/*
	 * BUS_POWER_NOINVOL not supported
	 */
	if (op == BUS_POWER_NOINVOL)
		return (MDI_FAILURE);

	/*
	 * ignore other OPs.
	 * return quickly to save CPU cycles on the ct processing
	 */
	/* arg is interpreted according to op; only one of bpc/bphc is set */
	switch (op) {
	case BUS_POWER_PRE_NOTIFICATION:
	case BUS_POWER_POST_NOTIFICATION:
		bpc = (pm_bp_child_pwrchg_t *)arg;
		cdip = bpc->bpc_dip;
		break;
	case BUS_POWER_HAS_CHANGED:
		bphc = (pm_bp_has_changed_t *)arg;
		cdip = bphc->bphc_dip;
		break;
	default:
		return (pm_busop_bus_power(parent, impl_arg, op, arg, result));
	}

	ASSERT(MDI_CLIENT(cdip));

	ct = i_devi_get_client(cdip);
	if (ct == NULL)
		return (MDI_FAILURE);

	/*
	 * wait till the mdi_pathinfo node state change are processed
	 */
	MDI_CLIENT_LOCK(ct);
	switch (op) {
	case BUS_POWER_PRE_NOTIFICATION:
		MDI_DEBUG(4, (MDI_NOTE, bpc->bpc_dip,
		    "BUS_POWER_PRE_NOTIFICATION:"
		    "%s@%s, olevel=%d, nlevel=%d, comp=%d",
		    ddi_node_name(bpc->bpc_dip), PM_ADDR(bpc->bpc_dip),
		    bpc->bpc_olevel, bpc->bpc_nlevel, bpc->bpc_comp));

		/* serialize power level change per client */
		while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
			cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);

		/* cleared again in BUS_POWER_POST_NOTIFICATION */
		MDI_CLIENT_SET_POWER_TRANSITION(ct);

		/* nobody holds the pHCIs yet: power them up first */
		if (ct->ct_power_cnt == 0) {
			ret = i_mdi_power_all_phci(ct);
		}

		/*
		 * if new_level > 0:
		 *	- hold phci(s)
		 *	- power up phci(s) if not already
		 * ignore power down
		 */
		if (bpc->bpc_nlevel > 0) {
			if (!DEVI_IS_ATTACHING(ct->ct_dip)) {
				MDI_DEBUG(4, (MDI_NOTE, bpc->bpc_dip,
				    "i_mdi_pm_hold_client\n"));
				i_mdi_pm_hold_client(ct, ct->ct_path_count);
			}
		}
		break;
	case BUS_POWER_POST_NOTIFICATION:
		MDI_DEBUG(4, (MDI_NOTE, bpc->bpc_dip,
		    "BUS_POWER_POST_NOTIFICATION:"
		    "%s@%s, olevel=%d, nlevel=%d, comp=%d result=%d",
		    ddi_node_name(bpc->bpc_dip), PM_ADDR(bpc->bpc_dip),
		    bpc->bpc_olevel, bpc->bpc_nlevel, bpc->bpc_comp,
		    *(int *)result));

		/* record the new power state only if the change succeeded */
		if (*(int *)result == DDI_SUCCESS) {
			if (bpc->bpc_nlevel > 0) {
				MDI_CLIENT_SET_POWER_UP(ct);
			} else {
				MDI_CLIENT_SET_POWER_DOWN(ct);
			}
		}

		/* release the hold we did in pre-notification */
		if (bpc->bpc_nlevel > 0 && (*(int *)result != DDI_SUCCESS) &&
		    !DEVI_IS_ATTACHING(ct->ct_dip)) {
			MDI_DEBUG(4, (MDI_NOTE, bpc->bpc_dip,
			    "i_mdi_pm_rele_client\n"));
			i_mdi_pm_rele_client(ct, ct->ct_path_count);
		}

		/* successful power down */
		if (bpc->bpc_nlevel == 0 && (*(int *)result == DDI_SUCCESS)) {
			/* another thread might started attaching */
			if (DEVI_IS_ATTACHING(ct->ct_dip)) {
				MDI_DEBUG(4, (MDI_NOTE, bpc->bpc_dip,
				    "i_mdi_pm_rele_client\n"));
				i_mdi_pm_rele_client(ct, ct->ct_path_count);
			/* detaching has been taken care in pm_post_unconfig */
			} else if (!DEVI_IS_DETACHING(ct->ct_dip)) {
				MDI_DEBUG(4, (MDI_NOTE, bpc->bpc_dip,
				    "i_mdi_pm_reset_client\n"));
				i_mdi_pm_reset_client(ct);
			}
		}

		/* end of the transition: wake anyone waiting to start theirs */
		MDI_CLIENT_CLEAR_POWER_TRANSITION(ct);
		cv_broadcast(&ct->ct_powerchange_cv);

		break;

	/* need to do more */
	case BUS_POWER_HAS_CHANGED:
		MDI_DEBUG(4, (MDI_NOTE, bphc->bphc_dip,
		    "BUS_POWER_HAS_CHANGED:"
		    "%s@%s, olevel=%d, nlevel=%d, comp=%d",
		    ddi_node_name(bphc->bphc_dip), PM_ADDR(bphc->bphc_dip),
		    bphc->bphc_olevel, bphc->bphc_nlevel, bphc->bphc_comp));

		/* power level went up: power and hold the pHCI(s) */
		if (bphc->bphc_nlevel > 0 &&
		    bphc->bphc_nlevel > bphc->bphc_olevel) {
			if (ct->ct_power_cnt == 0) {
				ret = i_mdi_power_all_phci(ct);
			}
			MDI_DEBUG(4, (MDI_NOTE, bphc->bphc_dip,
			    "i_mdi_pm_hold_client\n"));
			i_mdi_pm_hold_client(ct, ct->ct_path_count);
		}

		/* powered down from a known (not -1) level: drop the holds */
		if (bphc->bphc_nlevel == 0 && bphc->bphc_olevel != -1) {
			MDI_DEBUG(4, (MDI_NOTE, bphc->bphc_dip,
			    "i_mdi_pm_rele_client\n"));
			i_mdi_pm_rele_client(ct, ct->ct_path_count);
		}
		break;
	}

	MDI_CLIENT_UNLOCK(ct);
	return (ret);
}

/*
 * i_mdi_pm_pre_config_one():
 *		Pre-configuration power handling for one child of the vHCI.
 *
 *		After waiting out any power transition in progress on the
 *		client, nothing more is done when the client is not in the
 *		failed state or when ct_powercnt_config is already set.
 *		Otherwise i_mdi_power_all_phci() is called (only when
 *		ct_power_cnt is zero), i_mdi_pm_hold_client() is called with
 *		ct_path_count, ct_powercnt_config is set and
 *		ct_powercnt_reset is cleared.
 *
 * Return Values:
 *		MDI_FAILURE	child has no associated mdi client
 *		MDI_SUCCESS	no hold was necessary
 *		otherwise	result of i_mdi_power_all_phci(), or
 *				MDI_SUCCESS if that call was not made
 */
static int
i_mdi_pm_pre_config_one(dev_info_t *child)
{
	mdi_client_t	*client = i_devi_get_client(child);
	int		rv = MDI_SUCCESS;

	if (client == NULL)
		return (MDI_FAILURE);

	MDI_CLIENT_LOCK(client);
	while (MDI_CLIENT_IS_POWER_TRANSITION(client))
		cv_wait(&client->ct_powerchange_cv, &client->ct_mutex);

	if (!MDI_CLIENT_IS_FAILED(client)) {
		MDI_CLIENT_UNLOCK(client);
		MDI_DEBUG(4, (MDI_NOTE, child, "already configured\n"));
	} else if (client->ct_powercnt_config) {
		MDI_CLIENT_UNLOCK(client);
		MDI_DEBUG(4, (MDI_NOTE, child, "already held\n"));
	} else {
		/* the hold is taken even when powering the pHCIs fails */
		if (client->ct_power_cnt == 0)
			rv = i_mdi_power_all_phci(client);
		MDI_DEBUG(4, (MDI_NOTE, child, "i_mdi_pm_hold_client\n"));
		i_mdi_pm_hold_client(client, client->ct_path_count);
		client->ct_powercnt_config = 1;
		client->ct_powercnt_reset = 0;
		MDI_CLIENT_UNLOCK(client);
	}
	return (rv);
}

/*
 * i_mdi_pm_pre_config():
 *		Pre-configuration power handling for the vHCI vdip.
 *		With a non-NULL child only that child is processed (the
 *		caller is expected to own vdip busy).  With a NULL child,
 *		vdip is entered and every child is processed in sibling
 *		order, stopping at the first child that does not return
 *		MDI_SUCCESS.
 *
 * Return Values:
 *		Result of the last i_mdi_pm_pre_config_one() call made, or
 *		MDI_SUCCESS when vdip has no children.
 */
static int
i_mdi_pm_pre_config(dev_info_t *vdip, dev_info_t *child)
{
	dev_info_t	*cdip, *sibling;
	int		rv = MDI_SUCCESS;
	int		circ;

	ASSERT(MDI_VHCI(vdip));

	/* ndi_devi_config_one */
	if (child != NULL) {
		ASSERT(DEVI_BUSY_OWNED(vdip));
		return (i_mdi_pm_pre_config_one(child));
	}

	/* devi_config_common */
	ndi_devi_enter(vdip, &circ);
	for (cdip = ddi_get_child(vdip); cdip != NULL; cdip = sibling) {
		/* fetch the sibling before working on the current child */
		sibling = ddi_get_next_sibling(cdip);

		rv = i_mdi_pm_pre_config_one(cdip);
		if (rv != MDI_SUCCESS)
			break;
	}
	ndi_devi_exit(vdip, circ);
	return (rv);
}

/*
 * i_mdi_pm_pre_unconfig_one():
 *		Pre-unconfiguration power handling for one child of the vHCI.
 *
 *		After waiting out any power transition in progress:
 *		- a client whose node is not attached is left alone;
 *		- a powered-down client is refused when NDI_AUTODETACH is
 *		  set in flags;
 *		- a client with ct_powercnt_unconfig already set only has
 *		  *held set to 1;
 *		- otherwise i_mdi_power_all_phci() is called (only when
 *		  ct_power_cnt is zero), i_mdi_pm_hold_client() is called
 *		  with ct_path_count, ct_powercnt_unconfig is set and
 *		  ct_powercnt_reset is cleared.  *held is set to 1 only when
 *		  the result is MDI_SUCCESS.
 *
 * Return Values:
 *		MDI_FAILURE	no mdi client, or auto-detach of a
 *				powered-down client
 *		MDI_SUCCESS	nothing to do, or hold already present
 *		otherwise	result of i_mdi_power_all_phci(), or
 *				MDI_SUCCESS if that call was not made
 */
static int
i_mdi_pm_pre_unconfig_one(dev_info_t *child, int *held, int flags)
{
	mdi_client_t	*client = i_devi_get_client(child);
	int		rv = MDI_SUCCESS;

	if (client == NULL)
		return (MDI_FAILURE);

	MDI_CLIENT_LOCK(client);
	while (MDI_CLIENT_IS_POWER_TRANSITION(client))
		cv_wait(&client->ct_powerchange_cv, &client->ct_mutex);

	if (!i_ddi_devi_attached(client->ct_dip)) {
		MDI_DEBUG(4, (MDI_NOTE, child, "node detached already\n"));
		MDI_CLIENT_UNLOCK(client);
	} else if (MDI_CLIENT_IS_POWERED_DOWN(client) &&
	    (flags & NDI_AUTODETACH)) {
		MDI_DEBUG(4, (MDI_NOTE, child, "auto-modunload\n"));
		MDI_CLIENT_UNLOCK(client);
		rv = MDI_FAILURE;
	} else if (client->ct_powercnt_unconfig) {
		MDI_DEBUG(4, (MDI_NOTE, child, "ct_powercnt_held\n"));
		MDI_CLIENT_UNLOCK(client);
		*held = 1;
	} else {
		if (client->ct_power_cnt == 0)
			rv = i_mdi_power_all_phci(client);
		MDI_DEBUG(4, (MDI_NOTE, child, "i_mdi_pm_hold_client\n"));
		i_mdi_pm_hold_client(client, client->ct_path_count);
		client->ct_powercnt_unconfig = 1;
		client->ct_powercnt_reset = 0;
		MDI_CLIENT_UNLOCK(client);
		if (rv == MDI_SUCCESS)
			*held = 1;
	}
	return (rv);
}

/*
 * i_mdi_pm_pre_unconfig():
 *		Pre-unconfiguration power handling for the vHCI vdip.
 *		*held is cleared first.  With a non-NULL child only that
 *		child is processed.  With a NULL child every child of vdip
 *		is processed; a failing child does not stop the walk.
 *
 * Return Values:
 *		Single child: result of i_mdi_pm_pre_unconfig_one().
 *		All children: MDI_SUCCESS if *held ended up set (or there
 *		are no children), otherwise the result for the last child.
 */
static int
i_mdi_pm_pre_unconfig(dev_info_t *vdip, dev_info_t *child, int *held,
    int flags)
{
	dev_info_t	*cdip, *sibling;
	int		rv = MDI_SUCCESS;
	int		circ;

	ASSERT(MDI_VHCI(vdip));
	*held = 0;

	/* ndi_devi_unconfig_one */
	if (child != NULL) {
		ASSERT(DEVI_BUSY_OWNED(vdip));
		return (i_mdi_pm_pre_unconfig_one(child, held, flags));
	}

	/* devi_unconfig_common */
	ndi_devi_enter(vdip, &circ);
	for (cdip = ddi_get_child(vdip); cdip != NULL; cdip = sibling) {
		sibling = ddi_get_next_sibling(cdip);
		rv = i_mdi_pm_pre_unconfig_one(cdip, held, flags);
	}
	ndi_devi_exit(vdip, circ);

	/* a hold on any child makes the whole operation a success */
	if (*held)
		rv = MDI_SUCCESS;

	return (rv);
}

/*
 * i_mdi_pm_post_config_one():
 *		Post-configuration power handling for one child of the vHCI.
 *
 *		Nothing is done when the child has no mdi client, when
 *		ct_powercnt_reset is set or ct_powercnt_config is clear, or
 *		when the client is still in the failed state.  Otherwise
 *		the client is either reset via i_mdi_pm_reset_client() or
 *		released via i_mdi_pm_rele_client() with the number of its
 *		paths that are online or standby, and ct_powercnt_config is
 *		cleared.
 */
static void
i_mdi_pm_post_config_one(dev_info_t *child)
{
	mdi_client_t	*client = i_devi_get_client(child);

	if (client == NULL)
		return;

	MDI_CLIENT_LOCK(client);
	while (MDI_CLIENT_IS_POWER_TRANSITION(client))
		cv_wait(&client->ct_powerchange_cv, &client->ct_mutex);

	if (client->ct_powercnt_reset || !client->ct_powercnt_config) {
		MDI_DEBUG(4, (MDI_NOTE, child, "not configured\n"));
		MDI_CLIENT_UNLOCK(client);
		return;
	}

	/* client has not been updated */
	if (MDI_CLIENT_IS_FAILED(client)) {
		MDI_DEBUG(4, (MDI_NOTE, child, "client failed\n"));
		MDI_CLIENT_UNLOCK(client);
		return;
	}

	/* another thread might have powered it down or detached it */
	if ((MDI_CLIENT_IS_POWERED_DOWN(client) &&
	    !DEVI_IS_ATTACHING(client->ct_dip)) ||
	    (!i_ddi_devi_attached(client->ct_dip) &&
	    !DEVI_IS_ATTACHING(client->ct_dip))) {
		MDI_DEBUG(4, (MDI_NOTE, child, "i_mdi_pm_reset_client\n"));
		i_mdi_pm_reset_client(client);
	} else {
		mdi_pathinfo_t	*pip, *link;
		int		usable_paths = 0;

		MDI_DEBUG(4, (MDI_NOTE, child, "i_mdi_pm_rele_client\n"));

		/* count the paths that are online or standby */
		for (pip = client->ct_path_head; pip != NULL; pip = link) {
			MDI_PI_LOCK(pip);
			link = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
			if (MDI_PI_IS_ONLINE(pip) || MDI_PI_IS_STANDBY(pip))
				usable_paths++;
			MDI_PI_UNLOCK(pip);
		}
		i_mdi_pm_rele_client(client, usable_paths);
	}
	client->ct_powercnt_config = 0;
	MDI_CLIENT_UNLOCK(client);
}

/*
 * i_mdi_pm_post_config():
 *		Post-configuration power handling for the vHCI vdip.
 *		With a non-NULL child only that child is processed; with a
 *		NULL child vdip is entered and every child is processed.
 */
static void
i_mdi_pm_post_config(dev_info_t *vdip, dev_info_t *child)
{
	dev_info_t	*cdip, *sibling;
	int		circ;

	ASSERT(MDI_VHCI(vdip));

	/* ndi_devi_config_one */
	if (child != NULL) {
		ASSERT(DEVI_BUSY_OWNED(vdip));
		i_mdi_pm_post_config_one(child);
		return;
	}

	/* devi_config_common */
	ndi_devi_enter(vdip, &circ);
	for (cdip = ddi_get_child(vdip); cdip != NULL; cdip = sibling) {
		sibling = ddi_get_next_sibling(cdip);
		i_mdi_pm_post_config_one(cdip);
	}
	ndi_devi_exit(vdip, circ);
}

/*
 * i_mdi_pm_post_unconfig_one():
 *		Post-unconfiguration power handling for one child of the
 *		vHCI.
 *
 *		Nothing is done when the child has no mdi client, or when
 *		ct_powercnt_unconfig is clear or ct_powercnt_reset is set.
 *		Otherwise the client is either reset via
 *		i_mdi_pm_reset_client(), or released via
 *		i_mdi_pm_rele_client() with the number of its paths that are
 *		online or standby; ct_powercnt_unconfig is cleared here only
 *		on the release path.
 */
static void
i_mdi_pm_post_unconfig_one(dev_info_t *child)
{
	mdi_client_t	*client = i_devi_get_client(child);

	if (client == NULL)
		return;

	MDI_CLIENT_LOCK(client);
	while (MDI_CLIENT_IS_POWER_TRANSITION(client))
		cv_wait(&client->ct_powerchange_cv, &client->ct_mutex);

	if (!client->ct_powercnt_unconfig || client->ct_powercnt_reset) {
		MDI_DEBUG(4, (MDI_NOTE, child, "not held\n"));
		MDI_CLIENT_UNLOCK(client);
		return;
	}

	/* failure detaching or another thread just attached it */
	if ((MDI_CLIENT_IS_POWERED_DOWN(client) &&
	    i_ddi_devi_attached(client->ct_dip)) ||
	    (!i_ddi_devi_attached(client->ct_dip) &&
	    !DEVI_IS_ATTACHING(client->ct_dip))) {
		MDI_DEBUG(4, (MDI_NOTE, child, "i_mdi_pm_reset_client\n"));
		i_mdi_pm_reset_client(client);
	} else {
		mdi_pathinfo_t	*pip, *link;
		int		usable_paths = 0;

		MDI_DEBUG(4, (MDI_NOTE, child, "i_mdi_pm_rele_client\n"));

		/* count the paths that are online or standby */
		for (pip = client->ct_path_head; pip != NULL; pip = link) {
			MDI_PI_LOCK(pip);
			link = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
			if (MDI_PI_IS_ONLINE(pip) || MDI_PI_IS_STANDBY(pip))
				usable_paths++;
			MDI_PI_UNLOCK(pip);
		}
		i_mdi_pm_rele_client(client, usable_paths);
		client->ct_powercnt_unconfig = 0;
	}

	MDI_CLIENT_UNLOCK(client);
}

/*
 * i_mdi_pm_post_unconfig():
 *		Post-unconfiguration power handling for the vHCI vdip.
 *		Does nothing when held is zero.  With a non-NULL child only
 *		that child is processed; with a NULL child vdip is entered
 *		and every child is processed.
 */
static void
i_mdi_pm_post_unconfig(dev_info_t *vdip, dev_info_t *child, int held)
{
	dev_info_t	*cdip, *sibling;
	int		circ;

	ASSERT(MDI_VHCI(vdip));

	if (!held) {
		MDI_DEBUG(4, (MDI_NOTE, vdip, "held = %d", held));
		return;
	}

	if (child != NULL) {
		ASSERT(DEVI_BUSY_OWNED(vdip));
		i_mdi_pm_post_unconfig_one(child);
		return;
	}

	ndi_devi_enter(vdip, &circ);
	for (cdip = ddi_get_child(vdip); cdip != NULL; cdip = sibling) {
		sibling = ddi_get_next_sibling(cdip);
		i_mdi_pm_post_unconfig_one(cdip);
	}
	ndi_devi_exit(vdip, circ);
}

/*
 * mdi_power():
 *		Dispatch the power management operation op for the vHCI vdip.
 *
 *		When devnm is non-NULL, vdip is entered for the duration of
 *		the call and the child named devnm is looked up; the result
 *		(possibly NULL) selects between single-child and
 *		all-children handling in the pre/post config routines.
 *
 *		The meaning of args depends on op:
 *		MDI_PM_PRE_UNCONFIG	int *, receives the "held" indication
 *		MDI_PM_POST_UNCONFIG	int *, the "held" value is read
 *		MDI_PM_HOLD_POWER,
 *		MDI_PM_RELE_POWER	dev_info_t * of the client
 *		args is not used for the other operations.
 *
 * Return Values:
 *		Result of the pre-config/pre-unconfig routine for those two
 *		operations; MDI_SUCCESS for every other operation, including
 *		unrecognized ones.
 */
int
mdi_power(dev_info_t *vdip, mdi_pm_op_t op, void *args, char *devnm, int flags)
{
	int			circ, ret = MDI_SUCCESS;
	dev_info_t		*client_dip = NULL;
	mdi_client_t		*ct;

	/*
	 * Handling ndi_devi_config_one and ndi_devi_unconfig_one.
	 * Power up pHCI for the named client device.
	 * Note: Before the client is enumerated under vhci by phci,
	 * client_dip can be NULL. Then proceed to power up all the
	 * pHCIs.
	 */
	if (devnm != NULL) {
		ndi_devi_enter(vdip, &circ);
		client_dip = ndi_devi_findchild(vdip, devnm);
	}

	MDI_DEBUG(4, (MDI_NOTE, vdip,
	    "op = %d %s %p", op, devnm ? devnm : "", (void *)client_dip));

	switch (op) {
	case MDI_PM_PRE_CONFIG:
		ret = i_mdi_pm_pre_config(vdip, client_dip);
		break;

	case MDI_PM_PRE_UNCONFIG:
		ret = i_mdi_pm_pre_unconfig(vdip, client_dip, (int *)args,
		    flags);
		break;

	case MDI_PM_POST_CONFIG:
		i_mdi_pm_post_config(vdip, client_dip);
		break;

	case MDI_PM_POST_UNCONFIG:
		i_mdi_pm_post_unconfig(vdip, client_dip, *(int *)args);
		break;

	case MDI_PM_HOLD_POWER:
	case MDI_PM_RELE_POWER:
		ASSERT(args);

		/* for these two operations args carries the client dip */
		client_dip = (dev_info_t *)args;
		ASSERT(MDI_CLIENT(client_dip));

		ct = i_devi_get_client(client_dip);
		MDI_CLIENT_LOCK(ct);

		if (op == MDI_PM_HOLD_POWER) {
			/* a hold is taken only when none is outstanding */
			if (ct->ct_power_cnt == 0) {
				(void) i_mdi_power_all_phci(ct);
				MDI_DEBUG(4, (MDI_NOTE, client_dip,
				    "i_mdi_pm_hold_client\n"));
				i_mdi_pm_hold_client(ct, ct->ct_path_count);
			}
		} else {
			/* release while attaching, otherwise reset */
			if (DEVI_IS_ATTACHING(ct->ct_dip)) {
				MDI_DEBUG(4, (MDI_NOTE, client_dip,
				    "i_mdi_pm_rele_client\n"));
				i_mdi_pm_rele_client(ct, ct->ct_path_count);
			} else {
				MDI_DEBUG(4, (MDI_NOTE, client_dip,
				    "i_mdi_pm_reset_client\n"));
				i_mdi_pm_reset_client(ct);
			}
		}

		MDI_CLIENT_UNLOCK(ct);
		break;

	default:
		break;
	}

	/* pairs with the ndi_devi_enter() done above for a named child */
	if (devnm)
		ndi_devi_exit(vdip, circ);

	return (ret);
}

/*
 * mdi_component_is_vhci():
 *		Report whether dip is a vHCI.  When it is and mdi_class is
 *		non-NULL, *mdi_class is set to the vHCI's class name.
 *
 * Return Values:
 *		MDI_SUCCESS	dip is a vHCI
 *		MDI_FAILURE	dip is not a vHCI (*mdi_class is untouched)
 */
int
mdi_component_is_vhci(dev_info_t *dip, const char **mdi_class)
{
	if (!MDI_VHCI(dip))
		return (MDI_FAILURE);

	if (mdi_class != NULL) {
		mdi_vhci_t *vh = DEVI(dip)->devi_mdi_xhci;

		ASSERT(vh);
		*mdi_class = vh->vh_class;
	}

	return (MDI_SUCCESS);
}

/*
 * mdi_component_is_phci():
 *		Report whether dip is a pHCI.  When it is and mdi_class is
 *		non-NULL, *mdi_class is set to the class name of the vHCI
 *		the pHCI refers to.
 *
 * Return Values:
 *		MDI_SUCCESS	dip is a pHCI
 *		MDI_FAILURE	dip is not a pHCI (*mdi_class is untouched)
 */
int
mdi_component_is_phci(dev_info_t *dip, const char **mdi_class)
{
	if (!MDI_PHCI(dip))
		return (MDI_FAILURE);

	if (mdi_class != NULL) {
		mdi_phci_t *ph = DEVI(dip)->devi_mdi_xhci;

		ASSERT(ph);
		*mdi_class = ph->ph_vhci->vh_class;
	}

	return (MDI_SUCCESS);
}

/*
 * mdi_component_is_client():
 *		Report whether dip is an mdi client.  When it is and
 *		mdi_class is non-NULL, *mdi_class is set to the class name
 *		of the vHCI the client refers to.
 *
 * Return Values:
 *		MDI_SUCCESS	dip is a client
 *		MDI_FAILURE	dip is not a client (*mdi_class is untouched)
 */
int
mdi_component_is_client(dev_info_t *dip, const char **mdi_class)
{
	if (!MDI_CLIENT(dip))
		return (MDI_FAILURE);

	if (mdi_class != NULL) {
		mdi_client_t *ctp = DEVI(dip)->devi_mdi_client;

		ASSERT(ctp);
		*mdi_class = ctp->ct_vhci->vh_class;
	}

	return (MDI_SUCCESS);
}

/*
 * mdi_client_get_vhci_private():
 *		Get the vhci private information associated with the
 *		client node.  Returns NULL when dip is not an mdi client.
 */
void *
mdi_client_get_vhci_private(dev_info_t *dip)
{
	mdi_client_t	*client;

	ASSERT(mdi_component_is_client(dip, NULL) == MDI_SUCCESS);
	if (mdi_component_is_client(dip, NULL) != MDI_SUCCESS)
		return (NULL);

	client = i_devi_get_client(dip);
	return (client->ct_vprivate);
}

/*
 * mdi_client_set_vhci_private():
 *		Set the vhci private information in the client node.
 *		Nothing is stored when dip is not an mdi client.
 */
void
mdi_client_set_vhci_private(dev_info_t *dip, void *data)
{
	mdi_client_t	*client;

	ASSERT(mdi_component_is_client(dip, NULL) == MDI_SUCCESS);
	if (mdi_component_is_client(dip, NULL) != MDI_SUCCESS)
		return;

	client = i_devi_get_client(dip);
	client->ct_vprivate = data;
}
/*
 * mdi_pi_get_vhci_private():
 *		Get the vhci private information associated with the
 *		mdi_pathinfo node.  Returns NULL when pip is NULL.
 */
void *
mdi_pi_get_vhci_private(mdi_pathinfo_t *pip)
{
	if (pip == NULL)
		return (NULL);

	return (MDI_PI(pip)->pi_vprivate);
}

/*
 * mdi_pi_set_vhci_private():
 *		Set the vhci private information in the mdi_pathinfo node.
 *		A NULL pip is ignored.
 */
void
mdi_pi_set_vhci_private(mdi_pathinfo_t *pip, void *priv)
{
	if (pip == NULL)
		return;

	MDI_PI(pip)->pi_vprivate = priv;
}

/*
 * mdi_phci_get_vhci_private():
 *		Get the vhci private information associated with the
 *		mdi_phci node.  Returns NULL when dip is not a pHCI.
 */
void *
mdi_phci_get_vhci_private(dev_info_t *dip)
{
	mdi_phci_t	*phci;

	ASSERT(mdi_component_is_phci(dip, NULL) == MDI_SUCCESS);
	if (mdi_component_is_phci(dip, NULL) != MDI_SUCCESS)
		return (NULL);

	phci = i_devi_get_phci(dip);
	return (phci->ph_vprivate);
}

/*
 * mdi_phci_set_vhci_private():
 *		Set the vhci private information in the mdi_phci node.
 *		Nothing is stored when dip is not a pHCI.
 */
void
mdi_phci_set_vhci_private(dev_info_t *dip, void *priv)
{
	mdi_phci_t	*phci;

	ASSERT(mdi_component_is_phci(dip, NULL) == MDI_SUCCESS);
	if (mdi_component_is_phci(dip, NULL) != MDI_SUCCESS)
		return;

	phci = i_devi_get_phci(dip);
	phci->ph_vprivate = priv;
}

/*
 * mdi_pi_ishidden():
 *		Return the value of MDI_PI_FLAGS_IS_HIDDEN() for the path.
 *		No lock is taken here.
 */
int
mdi_pi_ishidden(mdi_pathinfo_t *pip)
{
	return (MDI_PI_FLAGS_IS_HIDDEN(pip));
}

/*
 * mdi_pi_device_isremoved():
 *		Return the value of MDI_PI_FLAGS_IS_DEVICE_REMOVED() for the
 *		path.  No lock is taken here; mdi_pi_device_remove() and
 *		mdi_pi_device_insert() call this with the path lock held.
 */
int
mdi_pi_device_isremoved(mdi_pathinfo_t *pip)
{
	return (MDI_PI_FLAGS_IS_DEVICE_REMOVED(pip));
}

/*
 * When processing hotplug, if mdi_pi_offline-mdi_pi_free fails then this
 * interface is used to represent device removal.
 *
 * Under the path lock the path is flagged both device-removed and hidden,
 * unless it is already flagged device-removed.  When the flags were changed
 * the devinfo snapshot cache is invalidated after the lock is dropped.
 *
 * Returns 1 if the path was newly marked removed, 0 if it already was.
 */
int
mdi_pi_device_remove(mdi_pathinfo_t *pip)
{
	int	changed = 0;

	MDI_PI_LOCK(pip);
	if (!mdi_pi_device_isremoved(pip)) {
		MDI_PI_FLAGS_SET_DEVICE_REMOVED(pip);
		MDI_PI_FLAGS_SET_HIDDEN(pip);
		changed = 1;
	}
	MDI_PI_UNLOCK(pip);

	if (changed)
		i_ddi_di_cache_invalidate();

	return (changed);
}

/*
 * When processing hotplug, if a path marked mdi_pi_device_isremoved()
 * is now accessible then this interface is used to represent device
 * insertion.
 *
 * Under the path lock the device-removed and hidden flags are cleared,
 * unless the path is not flagged device-removed.  When the flags were
 * changed the devinfo snapshot cache is invalidated after the lock is
 * dropped.
 *
 * Returns 1 if the removed state was cleared, 0 if the path was not marked
 * removed.
 */
int
mdi_pi_device_insert(mdi_pathinfo_t *pip)
{
	int	changed = 0;

	MDI_PI_LOCK(pip);
	if (mdi_pi_device_isremoved(pip)) {
		MDI_PI_FLAGS_CLR_DEVICE_REMOVED(pip);
		MDI_PI_FLAGS_CLR_HIDDEN(pip);
		changed = 1;
	}
	MDI_PI_UNLOCK(pip);

	if (changed)
		i_ddi_di_cache_invalidate();

	return (changed);
}

/*
 * List of vhci class names:
 * A vhci class name must be in this list only if the corresponding vhci
 * driver intends to use the mdi provided bus config implementation
 * (i.e., mdi_vhci_bus_config()).
 */
static char *vhci_class_list[] = { MDI_HCI_CLASS_SCSI, MDI_HCI_CLASS_IB };
/* number of entries in vhci_class_list[] */
#define	N_VHCI_CLASSES	(sizeof (vhci_class_list) / sizeof (char *))

/*
 * During boot time, the on-disk vhci cache for every vhci class is read
 * in the form of an nvlist and stored here.
 * The array is indexed in parallel with vhci_class_list[];
 * setup_vhci_cache() takes the entry for its class and sets the slot
 * to NULL.
 */
static nvlist_t *vhcache_nvl[N_VHCI_CLASSES];

/*
 * nvpair names in vhci cache nvlist.
 * MDI_VHCI_CACHE_VERSION is the only version value accepted by
 * mainnvl_to_vhcache() and the value written by vhcache_to_mainnvl().
 */
#define	MDI_VHCI_CACHE_VERSION	1
#define	MDI_NVPNAME_VERSION	"version"
#define	MDI_NVPNAME_PHCIS	"phcis"
#define	MDI_NVPNAME_CTADDRMAP	"clientaddrmap"

/*
 * Given vhci class name, return its on-disk vhci cache filename.
 * The returned string holds the full path and is allocated here with
 * kmem_alloc(); its allocation size is strlen() of the result plus one.
 */
static char *
vhclass2vhcache_filename(char *vhclass)
{
	/*
	 * On-disk vhci cache file name format;
	 * for scsi_vhci the filename is "/etc/devices/mdi_scsi_vhci_cache".
	 */
	static char *fmt = "/etc/devices/mdi_%s_cache";
	char *path;
	int bufsz;

	/*
	 * The two characters of "%s" are replaced by vhclass and one byte
	 * is needed for the terminating NUL, hence the net -1.
	 */
	bufsz = strlen(fmt) + strlen(vhclass) - 1;
	path = kmem_alloc(bufsz, KM_SLEEP);
	(void) snprintf(path, bufsz, fmt, vhclass);
	ASSERT(bufsz == (strlen(path) + 1));
	return (path);
}

/*
 * initialize the vhci cache related data structures and read the on-disk
 * vhci cached data into memory.
 *
 * A zeroed mdi_vhci_config_t is allocated and attached to vh->vh_config.
 * The cached nvlist comes from vhcache_nvl[] when one was pre-read for this
 * vhci class, otherwise from read_on_disk_vhci_cache() once the root
 * file system modules are loaded.  The nvlist is freed here after it has
 * been converted.
 */
static void
setup_vhci_cache(mdi_vhci_t *vh)
{
	mdi_vhci_config_t *vhc;
	mdi_vhci_cache_t *vhcache;
	int i;
	nvlist_t *nvl = NULL;

	vhc = kmem_zalloc(sizeof (mdi_vhci_config_t), KM_SLEEP);
	vh->vh_config = vhc;
	vhcache = &vhc->vhc_vhcache;

	vhc->vhc_vhcache_filename = vhclass2vhcache_filename(vh->vh_class);

	mutex_init(&vhc->vhc_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&vhc->vhc_cv, NULL, CV_DRIVER, NULL);

	rw_init(&vhcache->vhcache_lock, NULL, RW_DRIVER, NULL);

	/*
	 * Create string hash; same as mod_hash_create_strhash() except that
	 * we use NULL key destructor.
	 */
	vhcache->vhcache_client_hash = mod_hash_create_extended(vh->vh_class,
	    mdi_bus_config_cache_hash_size,
	    mod_hash_null_keydtor, mod_hash_null_valdtor,
	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);

	/*
	 * The on-disk vhci cache is read during booting prior to the
	 * lights-out period by mdi_read_devices_files().
	 */
	for (i = 0; i < N_VHCI_CLASSES; i++) {
		if (strcmp(vhci_class_list[i], vh->vh_class) == 0) {
			/* take the pre-read nvlist and empty the slot */
			nvl = vhcache_nvl[i];
			vhcache_nvl[i] = NULL;
			break;
		}
	}

	/*
	 * this is to cover the case of some one manually causing unloading
	 * (or detaching) and reloading (or attaching) of a vhci driver.
	 */
	if (nvl == NULL && modrootloaded)
		nvl = read_on_disk_vhci_cache(vh->vh_class);

	if (nvl != NULL) {
		rw_enter(&vhcache->vhcache_lock, RW_WRITER);
		if (mainnvl_to_vhcache(vhcache, nvl) == MDI_SUCCESS)
			vhcache->vhcache_flags |= MDI_VHCI_CACHE_SETUP_DONE;
		else  {
			cmn_err(CE_WARN,
			    "%s: data file corrupted, will recreate",
			    vhc->vhc_vhcache_filename);
		}
		rw_exit(&vhcache->vhcache_lock);
		nvlist_free(nvl);
	}

	/* the id is kept so the callback can be removed with callb_delete() */
	vhc->vhc_cbid = callb_add(stop_vhcache_flush_thread, vhc,
	    CB_CL_UADMIN_PRE_VFS, "mdi_vhcache_flush");

	/* snapshot the global path discovery settings for this vhci */
	vhc->vhc_path_discovery_boot = mdi_path_discovery_boot;
	vhc->vhc_path_discovery_postboot = mdi_path_discovery_postboot;
}

/*
 * free all vhci cache related resources
 *
 * Returns MDI_FAILURE, without freeing anything, when the async threads
 * could not be stopped; MDI_SUCCESS once the cache filename, the client
 * hash, every cached phci/client/pathinfo entry, the locks and the
 * mdi_vhci_config_t itself have been released.
 */
static int
destroy_vhci_cache(mdi_vhci_t *vh)
{
	mdi_vhci_config_t *vhc = vh->vh_config;
	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
	mdi_vhcache_phci_t *cphci;
	mdi_vhcache_client_t *cct;

	if (stop_vhcache_async_threads(vhc) != MDI_SUCCESS)
		return (MDI_FAILURE);

	kmem_free(vhc->vhc_vhcache_filename,
	    strlen(vhc->vhc_vhcache_filename) + 1);

	mod_hash_destroy_strhash(vhcache->vhcache_client_hash);

	/* release the cached phci entries */
	cphci = vhcache->vhcache_phci_head;
	while (cphci != NULL) {
		mdi_vhcache_phci_t *next_cphci = cphci->cphci_next;

		free_vhcache_phci(cphci);
		cphci = next_cphci;
	}

	/* release each cached client together with its pathinfo entries */
	cct = vhcache->vhcache_client_head;
	while (cct != NULL) {
		mdi_vhcache_client_t *next_cct = cct->cct_next;
		mdi_vhcache_pathinfo_t *cpi = cct->cct_cpi_head;

		while (cpi != NULL) {
			mdi_vhcache_pathinfo_t *next_cpi = cpi->cpi_next;

			free_vhcache_pathinfo(cpi);
			cpi = next_cpi;
		}
		free_vhcache_client(cct);
		cct = next_cct;
	}

	rw_destroy(&vhcache->vhcache_lock);

	mutex_destroy(&vhc->vhc_lock);
	cv_destroy(&vhc->vhc_cv);
	kmem_free(vhc, sizeof (mdi_vhci_config_t));
	return (MDI_SUCCESS);
}

/*
 * Stop all vhci cache related async threads and free their resources.
 *
 * MDI_VHC_EXIT is raised and vhc_cv broadcast, then this routine polls
 * (dropping vhc_lock around each delay) until the flush thread flag is
 * clear and vhc_acc_thrcount is zero.  Queued async client config requests
 * are freed, a dirty cache is flushed, and the callback registered in
 * vhc_cbid is deleted.
 *
 * Returns MDI_FAILURE when the flush fails (the cache is marked dirty
 * again) or when callb_delete() fails; MDI_SUCCESS otherwise.
 */
static int
stop_vhcache_async_threads(mdi_vhci_config_t *vhc)
{
	mdi_async_client_config_t *acc, *acc_next;

	mutex_enter(&vhc->vhc_lock);
	vhc->vhc_flags |= MDI_VHC_EXIT;
	ASSERT(vhc->vhc_acc_thrcount >= 0);
	cv_broadcast(&vhc->vhc_cv);

	/* wait for the flush thread and all async config threads to go away */
	while ((vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_THREAD) ||
	    vhc->vhc_acc_thrcount != 0) {
		mutex_exit(&vhc->vhc_lock);
		delay_random(mdi_delay);
		mutex_enter(&vhc->vhc_lock);
	}

	vhc->vhc_flags &= ~MDI_VHC_EXIT;

	/* discard requests that were queued but never picked up */
	for (acc = vhc->vhc_acc_list_head; acc != NULL; acc = acc_next) {
		acc_next = acc->acc_next;
		free_async_client_config(acc);
	}
	vhc->vhc_acc_list_head = NULL;
	vhc->vhc_acc_list_tail = NULL;
	vhc->vhc_acc_count = 0;

	if (vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY) {
		/* the flush itself is done without vhc_lock held */
		vhc->vhc_flags &= ~MDI_VHC_VHCACHE_DIRTY;
		mutex_exit(&vhc->vhc_lock);
		if (flush_vhcache(vhc, 0) != MDI_SUCCESS) {
			vhcache_dirty(vhc);
			return (MDI_FAILURE);
		}
	} else
		mutex_exit(&vhc->vhc_lock);

	if (callb_delete(vhc->vhc_cbid) != 0)
		return (MDI_FAILURE);

	return (MDI_SUCCESS);
}

/*
 * Stop vhci cache flush thread
 *
 * Registered with callb_add() by setup_vhci_cache(); arg is the
 * mdi_vhci_config_t.  MDI_VHC_EXIT is raised and vhc_cv broadcast, the
 * routine then polls until the flush thread flag clears, and finally
 * flushes a dirty cache with vhc_lock dropped.  Always returns B_TRUE.
 */
/* ARGSUSED */
static boolean_t
stop_vhcache_flush_thread(void *arg, int code)
{
	mdi_vhci_config_t *vhc = (mdi_vhci_config_t *)arg;
	int need_flush;

	mutex_enter(&vhc->vhc_lock);
	vhc->vhc_flags |= MDI_VHC_EXIT;
	cv_broadcast(&vhc->vhc_cv);

	while (vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_THREAD) {
		mutex_exit(&vhc->vhc_lock);
		delay_random(mdi_delay);
		mutex_enter(&vhc->vhc_lock);
	}

	/* claim the dirty state under the lock, flush after dropping it */
	need_flush = ((vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY) != 0);
	if (need_flush)
		vhc->vhc_flags &= ~MDI_VHC_VHCACHE_DIRTY;
	mutex_exit(&vhc->vhc_lock);

	if (need_flush)
		(void) flush_vhcache(vhc, 1);

	return (B_TRUE);
}

/*
 * Enqueue the vhcache phci (cphci) at the tail of the list
 */
static void
enqueue_vhcache_phci(mdi_vhci_cache_t *vhcache, mdi_vhcache_phci_t *cphci)
{
	mdi_vhcache_phci_t **linkp;

	cphci->cphci_next = NULL;

	/* pick the link to update: the head of an empty list or tail->next */
	if (vhcache->vhcache_phci_head == NULL)
		linkp = &vhcache->vhcache_phci_head;
	else
		linkp = &vhcache->vhcache_phci_tail->cphci_next;

	*linkp = cphci;
	vhcache->vhcache_phci_tail = cphci;
}

/*
 * Enqueue the vhcache pathinfo (cpi) at the tail of the list
 */
static void
enqueue_tail_vhcache_pathinfo(mdi_vhcache_client_t *cct,
    mdi_vhcache_pathinfo_t *cpi)
{
	mdi_vhcache_pathinfo_t **linkp;

	cpi->cpi_next = NULL;

	/* pick the link to update: the head of an empty list or tail->next */
	if (cct->cct_cpi_head == NULL)
		linkp = &cct->cct_cpi_head;
	else
		linkp = &cct->cct_cpi_tail->cpi_next;

	*linkp = cpi;
	cct->cct_cpi_tail = cpi;
}

/*
 * Enqueue the vhcache pathinfo (cpi) at the correct location in the
 * ordered list. All cpis which do not have MDI_CPI_HINT_PATH_DOES_NOT_EXIST
 * flag set come at the beginning of the list. All cpis which have this
 * flag set come at the end of the list.
 *
 * A cpi carrying the flag, or any cpi added to an empty list, simply goes
 * to the tail.  Otherwise the new cpi is linked in just before the first
 * flagged cpi, or at the tail when no cpi is flagged.
 */
static void
enqueue_vhcache_pathinfo(mdi_vhcache_client_t *cct,
    mdi_vhcache_pathinfo_t *newcpi)
{
	mdi_vhcache_pathinfo_t *cur, *before;

	if (cct->cct_cpi_head == NULL ||
	    (newcpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST)) {
		enqueue_tail_vhcache_pathinfo(cct, newcpi);
		return;
	}

	/* skip over the leading run of unflagged cpis */
	before = NULL;
	cur = cct->cct_cpi_head;
	while (cur != NULL &&
	    !(cur->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST)) {
		before = cur;
		cur = cur->cpi_next;
	}

	if (before != NULL)
		before->cpi_next = newcpi;
	else
		cct->cct_cpi_head = newcpi;

	newcpi->cpi_next = cur;

	/* nothing follows the new cpi, so it is the new tail */
	if (cur == NULL)
		cct->cct_cpi_tail = newcpi;
}

/*
 * Enqueue the vhcache client (cct) at the tail of the list
 */
static void
enqueue_vhcache_client(mdi_vhci_cache_t *vhcache,
    mdi_vhcache_client_t *cct)
{
	mdi_vhcache_client_t **linkp;

	cct->cct_next = NULL;

	/* pick the link to update: the head of an empty list or tail->next */
	if (vhcache->vhcache_client_head == NULL)
		linkp = &vhcache->vhcache_client_head;
	else
		linkp = &vhcache->vhcache_client_tail->cct_next;

	*linkp = cct;
	vhcache->vhcache_client_tail = cct;
}

/*
 * Free an array of nelem strings along with the array itself.
 * Each non-NULL element is freed with a size of strlen() + 1; a NULL
 * array pointer is ignored.
 */
static void
free_string_array(char **str, int nelem)
{
	int idx;

	if (str == NULL)
		return;

	for (idx = 0; idx < nelem; idx++) {
		if (str[idx] != NULL)
			kmem_free(str[idx], strlen(str[idx]) + 1);
	}
	kmem_free(str, sizeof (char *) * nelem);
}

/*
 * Free a vhcache phci entry together with its path string.
 */
static void
free_vhcache_phci(mdi_vhcache_phci_t *cphci)
{
	char *path = cphci->cphci_path;

	kmem_free(path, strlen(path) + 1);
	kmem_free(cphci, sizeof (mdi_vhcache_phci_t));
}

/*
 * Free a vhcache pathinfo entry together with its address string.
 */
static void
free_vhcache_pathinfo(mdi_vhcache_pathinfo_t *cpi)
{
	char *addr = cpi->cpi_addr;

	kmem_free(addr, strlen(addr) + 1);
	kmem_free(cpi, sizeof (mdi_vhcache_pathinfo_t));
}

/*
 * Free a vhcache client entry together with its "name@addr" string.
 * The pathinfo entries hanging off the client are not touched here.
 */
static void
free_vhcache_client(mdi_vhcache_client_t *cct)
{
	char *name_addr = cct->cct_name_addr;

	kmem_free(name_addr, strlen(name_addr) + 1);
	kmem_free(cct, sizeof (mdi_vhcache_client_t));
}

/*
 * Build a newly allocated "<ct_name>@<ct_addr>" string.
 * When ret_len is non-NULL it receives the allocation size (string length
 * plus the terminating NUL), which is the size to pass to kmem_free().
 */
static char *
vhcache_mknameaddr(char *ct_name, char *ct_addr, int *ret_len)
{
	/* +2: one byte for '@' and one for the terminating NUL */
	int bufsz = strlen(ct_name) + strlen(ct_addr) + 2;
	char *buf = kmem_alloc(bufsz, KM_SLEEP);

	(void) snprintf(buf, bufsz, "%s@%s", ct_name, ct_addr);

	if (ret_len != NULL)
		*ret_len = bufsz;
	return (buf);
}

/*
 * Copy the contents of paddrnvl to vhci cache.
 * paddrnvl nvlist contains path information for a vhci client.
 * See the comment in mainnvl_to_vhcache() for the format of this nvlist.
 *
 * Every nvpair is expected to be a two element uint32 array holding
 * (phci-id, cpi_flags).  One vhcache pathinfo is created per well-formed
 * nvpair and appended to cct's path list.  An nvpair whose value cannot
 * be fetched as a uint32 array, or which has fewer than two elements,
 * is skipped: the nvlist comes from an on-disk file and the ASSERTs
 * below are compiled out on non-DEBUG kernels, so without the check a
 * malformed entry would lead to reading through an uninitialized pointer
 * or past the end of the array.
 *
 * NOTE(review): val[0] is used as an index into cphci_list[] without a
 * range check because the length of the list is not available here.
 */
static void
paddrnvl_to_vhcache(nvlist_t *nvl, mdi_vhcache_phci_t *cphci_list[],
    mdi_vhcache_client_t *cct)
{
	nvpair_t *nvp = NULL;
	mdi_vhcache_pathinfo_t *cpi;
	uint_t nelem;
	uint32_t *val;
	int err;

	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
		ASSERT(nvpair_type(nvp) == DATA_TYPE_UINT32_ARRAY);

		nelem = 0;
		val = NULL;
		err = nvpair_value_uint32_array(nvp, &val, &nelem);
		ASSERT(err == 0 && nelem == 2);

		/* ignore entries that cannot supply both array elements */
		if (err != 0 || val == NULL || nelem < 2)
			continue;

		cpi = kmem_zalloc(sizeof (*cpi), KM_SLEEP);
		cpi->cpi_addr = i_ddi_strdup(nvpair_name(nvp), KM_SLEEP);
		cpi->cpi_cphci = cphci_list[val[0]];
		cpi->cpi_flags = val[1];
		enqueue_tail_vhcache_pathinfo(cct, cpi);
	}
}

/*
 * Copy the contents of caddrmapnvl to vhci cache.
 * caddrmapnvl nvlist contains vhci client address to phci client address
 * mappings. See the comment in mainnvl_to_vhcache() for the format of
 * this nvlist.
 *
 * One vhcache client is created per nvpair, named after the nvpair,
 * filled with the paths found in the nvpair's nvlist value, appended to
 * the vhcache client list and inserted into the client hash.  An nvpair
 * whose value cannot be fetched as an nvlist is skipped before anything
 * is allocated: the data comes from an on-disk file and the type ASSERT
 * is compiled out on non-DEBUG kernels, so without the check an
 * uninitialized nvlist pointer would be walked.
 */
static void
caddrmapnvl_to_vhcache(mdi_vhci_cache_t *vhcache, nvlist_t *nvl,
    mdi_vhcache_phci_t *cphci_list[])
{
	nvpair_t *nvp = NULL;
	nvlist_t *paddrnvl;
	mdi_vhcache_client_t *cct;

	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
		ASSERT(nvpair_type(nvp) == DATA_TYPE_NVLIST);

		paddrnvl = NULL;
		if (nvpair_value_nvlist(nvp, &paddrnvl) != 0 ||
		    paddrnvl == NULL)
			continue;

		cct = kmem_zalloc(sizeof (*cct), KM_SLEEP);
		cct->cct_name_addr = i_ddi_strdup(nvpair_name(nvp), KM_SLEEP);
		paddrnvl_to_vhcache(paddrnvl, cphci_list, cct);
		/* the client must contain at least one path */
		ASSERT(cct->cct_cpi_head != NULL);

		enqueue_vhcache_client(vhcache, cct);
		/* the key is the client's own name string, not a copy */
		(void) mod_hash_insert(vhcache->vhcache_client_hash,
		    (mod_hash_key_t)cct->cct_name_addr, (mod_hash_val_t)cct);
	}
}

/*
 * Copy the contents of the main nvlist to vhci cache.
 *
 * VHCI busconfig cached data is stored in the form of a nvlist on the disk.
 * The nvlist contains the mappings between the vhci client addresses and
 * their corresponding phci client addresses.
 *
 * The structure of the nvlist is as follows:
 *
 * Main nvlist:
 *	NAME		TYPE		DATA
 *	version		int32		version number
 *	phcis		string array	array of phci paths
 *	clientaddrmap	nvlist_t	c2paddrs_nvl (see below)
 *
 * structure of c2paddrs_nvl:
 *	NAME		TYPE		DATA
 *	caddr1		nvlist_t	paddrs_nvl1
 *	caddr2		nvlist_t	paddrs_nvl2
 *	...
 * where caddr1, caddr2, ... are vhci client name and addresses in the
 * form of "<clientname>@<clientaddress>".
 * (for example: "ssd@2000002037cd9f72");
 * paddrs_nvl1, paddrs_nvl2, .. are nvlists that contain path information.
 *
 * structure of paddrs_nvl:
 *	NAME		TYPE		DATA
 *	pi_addr1	uint32_array	(phci-id, cpi_flags)
 *	pi_addr2	uint32_array	(phci-id, cpi_flags)
 *	...
 * where pi_addr1, pi_addr2, ... are bus specific addresses of pathinfo nodes
 * (so called pi_addrs, for example: "w2100002037cd9f72,0");
 * phci-ids are integers that identify the pHCIs to which the bus specific
 * addresses belong. These integers are used as an index into the phcis
 * string array in the main nvlist to get the pHCI path.
 *
 * The caller must hold vhcache_lock as writer.
 *
 * Returns MDI_FAILURE when the version entry is missing or does not match
 * MDI_VHCI_CACHE_VERSION.  Returns MDI_SUCCESS otherwise, including when
 * the nvlist carries no phcis array (nothing is added to the cache then).
 */
static int
mainnvl_to_vhcache(mdi_vhci_cache_t *vhcache, nvlist_t *nvl)
{
	mdi_vhcache_phci_t **cphci_list;
	nvlist_t *caddrmapnvl;
	char **phcis;
	uint_t nphcis, idx;
	size_t list_size;
	int32_t ver;

	ASSERT(RW_WRITE_HELD(&vhcache->vhcache_lock));

	if (nvlist_lookup_int32(nvl, MDI_NVPNAME_VERSION, &ver) != 0)
		return (MDI_FAILURE);
	if (ver != MDI_VHCI_CACHE_VERSION)
		return (MDI_FAILURE);

	if (nvlist_lookup_string_array(nvl, MDI_NVPNAME_PHCIS, &phcis,
	    &nphcis) != 0)
		return (MDI_SUCCESS);

	ASSERT(nphcis > 0);

	/*
	 * Build a temporary table that maps a phci-id (the index into the
	 * phcis array) to the vhcache phci created for it.
	 */
	list_size = sizeof (mdi_vhcache_phci_t *) * nphcis;
	cphci_list = kmem_alloc(list_size, KM_SLEEP);
	for (idx = 0; idx < nphcis; idx++) {
		mdi_vhcache_phci_t *cphci;

		cphci = kmem_zalloc(sizeof (mdi_vhcache_phci_t), KM_SLEEP);
		cphci->cphci_path = i_ddi_strdup(phcis[idx], KM_SLEEP);
		enqueue_vhcache_phci(vhcache, cphci);
		cphci_list[idx] = cphci;
	}

	ASSERT(vhcache->vhcache_phci_head != NULL);

	if (nvlist_lookup_nvlist(nvl, MDI_NVPNAME_CTADDRMAP, &caddrmapnvl) == 0)
		caddrmapnvl_to_vhcache(vhcache, caddrmapnvl, cphci_list);

	kmem_free(cphci_list, list_size);
	return (MDI_SUCCESS);
}

/*
 * Build paddrnvl for the specified client using the information in the
 * vhci cache and add it to the caddrmapnvl.
 * Each cached path contributes one (phci-id, cpi_flags) uint32 array named
 * by the path address.  The locally built nvlist is always freed before
 * returning.
 * Returns 0 on success, errno on failure.
 */
static int
vhcache_to_paddrnvl(mdi_vhci_cache_t *vhcache, mdi_vhcache_client_t *cct,
    nvlist_t *caddrmapnvl)
{
	mdi_vhcache_pathinfo_t *cpi;
	nvlist_t *paddrnvl;
	uint32_t pair[2];
	int err;

	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));

	err = nvlist_alloc(&paddrnvl, 0, KM_SLEEP);
	if (err != 0)
		return (err);

	/* the walk ends early at the first add that fails */
	for (cpi = cct->cct_cpi_head; cpi != NULL && err == 0;
	    cpi = cpi->cpi_next) {
		pair[0] = cpi->cpi_cphci->cphci_id;
		pair[1] = cpi->cpi_flags;
		err = nvlist_add_uint32_array(paddrnvl, cpi->cpi_addr,
		    pair, 2);
	}

	if (err == 0) {
		err = nvlist_add_nvlist(caddrmapnvl, cct->cct_name_addr,
		    paddrnvl);
	}

	nvlist_free(paddrnvl);
	return (err);
}

/*
 * Build caddrmapnvl using the information in the vhci cache
 * and add it to the mainnvl.
 * The locally built nvlist is always freed before returning.
 * Returns 0 on success, errno on failure.
 */
static int
vhcache_to_caddrmapnvl(mdi_vhci_cache_t *vhcache, nvlist_t *mainnvl)
{
	mdi_vhcache_client_t *cct;
	nvlist_t *caddrmapnvl;
	int err;

	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));

	err = nvlist_alloc(&caddrmapnvl, NV_UNIQUE_NAME, KM_SLEEP);
	if (err != 0)
		return (err);

	/* the walk ends early at the first client that fails */
	for (cct = vhcache->vhcache_client_head; cct != NULL && err == 0;
	    cct = cct->cct_next)
		err = vhcache_to_paddrnvl(vhcache, cct, caddrmapnvl);

	if (err == 0) {
		err = nvlist_add_nvlist(mainnvl, MDI_NVPNAME_CTADDRMAP,
		    caddrmapnvl);
	}

	nvlist_free(caddrmapnvl);
	return (err);
}

/*
 * Build nvlist using the information in the vhci cache.
 * See the comment in mainnvl_to_vhcache() for the format of the nvlist.
 * Returns nvl on success, NULL on failure.
 *
 * vhcache_lock is taken as reader here.  When the cache has no phcis the
 * returned nvlist holds only the version entry.  As a side effect every
 * cached phci gets its cphci_id set to its position in the phci list.
 */
static nvlist_t *
vhcache_to_mainnvl(mdi_vhci_cache_t *vhcache)
{
	mdi_vhcache_phci_t *cphci;
	uint_t phci_count;
	char **phcis;
	nvlist_t *nvl;
	int err, i;

	if ((err = nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) != 0) {
		nvl = NULL;
		goto out;
	}

	if ((err = nvlist_add_int32(nvl, MDI_NVPNAME_VERSION,
	    MDI_VHCI_CACHE_VERSION)) != 0)
		goto out;

	rw_enter(&vhcache->vhcache_lock, RW_READER);
	if (vhcache->vhcache_phci_head == NULL) {
		rw_exit(&vhcache->vhcache_lock);
		return (nvl);
	}

	/* number the phcis; the ids are used as indexes into the phcis array */
	phci_count = 0;
	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
	    cphci = cphci->cphci_next)
		cphci->cphci_id = phci_count++;

	/* build phci pathname list */
	phcis = kmem_alloc(sizeof (char *) * phci_count, KM_SLEEP);
	for (cphci = vhcache->vhcache_phci_head, i = 0; cphci != NULL;
	    cphci = cphci->cphci_next, i++)
		phcis[i] = i_ddi_strdup(cphci->cphci_path, KM_SLEEP);

	/* the temporary array is freed whether or not the add succeeded */
	err = nvlist_add_string_array(nvl, MDI_NVPNAME_PHCIS, phcis,
	    phci_count);
	free_string_array(phcis, phci_count);

	if (err == 0 &&
	    (err = vhcache_to_caddrmapnvl(vhcache, nvl)) == 0) {
		rw_exit(&vhcache->vhcache_lock);
		return (nvl);
	}

	rw_exit(&vhcache->vhcache_lock);
out:
	/* failure: release the partially built nvlist, if any */
	if (nvl)
		nvlist_free(nvl);
	return (NULL);
}

/*
 * Lookup vhcache phci structure for the specified phci path.
 * The caller must hold vhcache_lock.  Returns the first entry whose path
 * string equals phci_path, or NULL when there is none.
 */
static mdi_vhcache_phci_t *
lookup_vhcache_phci_by_name(mdi_vhci_cache_t *vhcache, char *phci_path)
{
	mdi_vhcache_phci_t *cphci;

	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));

	cphci = vhcache->vhcache_phci_head;
	while (cphci != NULL && strcmp(cphci->cphci_path, phci_path) != 0)
		cphci = cphci->cphci_next;

	return (cphci);
}

/*
 * Lookup vhcache phci structure for the specified phci.
 * The caller must hold vhcache_lock.  Returns the first entry whose
 * cphci_phci pointer equals ph, or NULL when there is none.
 */
static mdi_vhcache_phci_t *
lookup_vhcache_phci_by_addr(mdi_vhci_cache_t *vhcache, mdi_phci_t *ph)
{
	mdi_vhcache_phci_t *cphci;

	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));

	cphci = vhcache->vhcache_phci_head;
	while (cphci != NULL && cphci->cphci_phci != ph)
		cphci = cphci->cphci_next;

	return (cphci);
}

/*
 * Add the specified phci to the vhci cache if not already present.
 *
 * The phci is identified by the device path of ph->ph_dip.  An existing
 * entry with that path is simply pointed at ph; otherwise a new entry is
 * created and appended, and the cache is marked dirty.
 */
static void
vhcache_phci_add(mdi_vhci_config_t *vhc, mdi_phci_t *ph)
{
	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
	mdi_vhcache_phci_t *entry;
	char *devpath;
	int is_new = 0;

	rw_enter(&vhcache->vhcache_lock, RW_WRITER);

	devpath = kmem_alloc(MAXPATHLEN, KM_SLEEP);
	(void) ddi_pathname(ph->ph_dip, devpath);

	entry = lookup_vhcache_phci_by_name(vhcache, devpath);
	if (entry == NULL) {
		entry = kmem_zalloc(sizeof (*entry), KM_SLEEP);
		entry->cphci_path = i_ddi_strdup(devpath, KM_SLEEP);
		entry->cphci_phci = ph;
		enqueue_vhcache_phci(vhcache, entry);
		is_new = 1;
	} else {
		entry->cphci_phci = ph;
	}

	rw_exit(&vhcache->vhcache_lock);

	/*
	 * Since a new phci has been added, reset
	 * vhc_path_discovery_cutoff_time to allow for discovery of paths
	 * during next vhcache_discover_paths().
	 */
	mutex_enter(&vhc->vhc_lock);
	vhc->vhc_path_discovery_cutoff_time = 0;
	mutex_exit(&vhc->vhc_lock);

	kmem_free(devpath, MAXPATHLEN);
	if (is_new)
		vhcache_dirty(vhc);
}

/*
 * Remove the reference to the specified phci from the vhci cache.
 * Only the cphci_phci back pointer is cleared; the cache entry itself
 * stays on the list.
 */
static void
vhcache_phci_remove(mdi_vhci_config_t *vhc, mdi_phci_t *ph)
{
	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
	mdi_vhcache_phci_t *entry;

	rw_enter(&vhcache->vhcache_lock, RW_WRITER);

	entry = lookup_vhcache_phci_by_addr(vhcache, ph);
	if (entry != NULL) {
		/* do not remove the actual mdi_vhcache_phci structure */
		entry->cphci_phci = NULL;
	}

	rw_exit(&vhcache->vhcache_lock);
}

/*
 * Initialize the lookup token dst: as a copy of src when src is non-NULL,
 * otherwise as an empty token (NULL client, lookup time of zero).
 */
static void
init_vhcache_lookup_token(mdi_vhcache_lookup_token_t *dst,
    mdi_vhcache_lookup_token_t *src)
{
	if (src != NULL) {
		dst->lt_cct = src->lt_cct;
		dst->lt_cct_lookup_time = src->lt_cct_lookup_time;
		return;
	}

	dst->lt_cct = NULL;
	dst->lt_cct_lookup_time = 0;
}

/*
 * Look up vhcache client for the specified client.
 *
 * The caller must hold vhcache_lock.  When token is non-NULL it is used
 * as a one entry lookup cache: a token recorded after the last vhcache
 * clean is answered directly, and every hash lookup done here updates the
 * token with its outcome.
 *
 * Returns the vhcache client, or NULL when there is none for
 * "<ct_name>@<ct_addr>".
 */
static mdi_vhcache_client_t *
lookup_vhcache_client(mdi_vhci_cache_t *vhcache, char *ct_name, char *ct_addr,
    mdi_vhcache_lookup_token_t *token)
{
	mod_hash_val_t hv;
	char *key;
	int keysz;
	int found;

	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));

	/*
	 * If no vhcache clean occurred since the last lookup, we can
	 * simply return the cct from the last lookup operation.
	 * It works because ccts are never freed except during the vhcache
	 * cleanup operation.
	 */
	if (token != NULL &&
	    vhcache->vhcache_clean_time < token->lt_cct_lookup_time)
		return (token->lt_cct);

	key = vhcache_mknameaddr(ct_name, ct_addr, &keysz);
	found = (mod_hash_find(vhcache->vhcache_client_hash,
	    (mod_hash_key_t)key, &hv) == 0);
	if (!found)
		hv = NULL;

	/* remember the outcome; a miss leaves the token empty */
	if (token != NULL) {
		if (found) {
			token->lt_cct = (mdi_vhcache_client_t *)hv;
			token->lt_cct_lookup_time = ddi_get_lbolt64();
		} else {
			token->lt_cct = NULL;
			token->lt_cct_lookup_time = 0;
		}
	}

	kmem_free(key, keysz);
	return ((mdi_vhcache_client_t *)hv);
}

/*
 * Record the specified path in the vhci cache.
 *
 * A vhcache client is created for the path's client if none exists yet.
 * If the path is already cached, its entry is re-pointed at pip and a stale
 * MDI_CPI_HINT_PATH_DOES_NOT_EXIST hint is cleared; otherwise a new path
 * entry is created.  The cache is marked dirty whenever its contents
 * change.
 */
static void
vhcache_pi_add(mdi_vhci_config_t *vhc, struct mdi_pathinfo *pip)
{
	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
	mdi_phci_t *ph = pip->pi_phci;
	mdi_client_t *ct = pip->pi_client;
	mdi_vhcache_client_t *cct;
	mdi_vhcache_pathinfo_t *cpi;
	int dirty = 0;

	rw_enter(&vhcache->vhcache_lock, RW_WRITER);

	/* make sure a vhcache client exists for this path's client */
	cct = lookup_vhcache_client(vhcache, ct->ct_drvname, ct->ct_guid,
	    NULL);
	if (cct == NULL) {
		cct = kmem_zalloc(sizeof (*cct), KM_SLEEP);
		cct->cct_name_addr = vhcache_mknameaddr(ct->ct_drvname,
		    ct->ct_guid, NULL);
		enqueue_vhcache_client(vhcache, cct);
		(void) mod_hash_insert(vhcache->vhcache_client_hash,
		    (mod_hash_key_t)cct->cct_name_addr, (mod_hash_val_t)cct);
		dirty = 1;
	}

	/* look for an existing entry: same phci and same path address */
	cpi = cct->cct_cpi_head;
	while (cpi != NULL) {
		if (cpi->cpi_cphci->cphci_phci == ph &&
		    strcmp(cpi->cpi_addr, pip->pi_addr) == 0)
			break;
		cpi = cpi->cpi_next;
	}

	if (cpi != NULL) {
		cpi->cpi_pip = pip;
		if (cpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST) {
			/* the path exists after all: drop the hint, re-sort */
			cpi->cpi_flags &= ~MDI_CPI_HINT_PATH_DOES_NOT_EXIST;
			sort_vhcache_paths(cct);
			dirty = 1;
		}
	} else {
		cpi = kmem_zalloc(sizeof (*cpi), KM_SLEEP);
		cpi->cpi_addr = i_ddi_strdup(pip->pi_addr, KM_SLEEP);
		cpi->cpi_cphci = lookup_vhcache_phci_by_addr(vhcache, ph);
		ASSERT(cpi->cpi_cphci != NULL);
		cpi->cpi_pip = pip;
		enqueue_vhcache_pathinfo(cct, cpi);
		dirty = 1;
	}

	rw_exit(&vhcache->vhcache_lock);

	if (dirty)
		vhcache_dirty(vhc);
}

/*
 * Drop the vhci cache's reference to the specified path.
 *
 * The cached path entry is kept; only its pointer to pip is cleared.
 */
static void
vhcache_pi_remove(mdi_vhci_config_t *vhc, struct mdi_pathinfo *pip)
{
	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
	mdi_client_t *ct = pip->pi_client;
	mdi_vhcache_client_t *cct;
	mdi_vhcache_pathinfo_t *cpi;

	rw_enter(&vhcache->vhcache_lock, RW_WRITER);

	cct = lookup_vhcache_client(vhcache, ct->ct_drvname, ct->ct_guid,
	    NULL);
	if (cct != NULL) {
		/* advance to the entry that refers to pip, if there is one */
		cpi = cct->cct_cpi_head;
		while (cpi != NULL && cpi->cpi_pip != pip)
			cpi = cpi->cpi_next;

		if (cpi != NULL)
			cpi->cpi_pip = NULL;
	}

	rw_exit(&vhcache->vhcache_lock);
}

/*
 * Write the vhci cache out to its backing file.
 *
 * force_flag non-zero skips the i_ddi_io_initialized() check.
 *
 * Returns MDI_SUCCESS on success, and also when the write failed with
 * EROFS (the vhci is then flagged MDI_VHC_READONLY_FS).  Returns
 * MDI_FAILURE on any other error, or when I/O is not yet initialized and
 * the flush is not forced.
 */
static int
flush_vhcache(mdi_vhci_config_t *vhc, int force_flag)
{
	nvlist_t *nvl;
	int err;
	int rv = MDI_SUCCESS;

	/*
	 * The system may shut down before i_ddi_io_initialized (during
	 * stmsboot for example).  A forced flush must still go through in
	 * that case, so the check only applies to unforced flushes.
	 */
	if (force_flag == 0 && !i_ddi_io_initialized())
		return (MDI_FAILURE);

	nvl = vhcache_to_mainnvl(&vhc->vhc_vhcache);
	if (nvl == NULL) {
		err = EFAULT;
	} else {
		err = fwrite_nvlist(vhc->vhc_vhcache_filename, nvl);
		nvlist_free(nvl);
	}

	mutex_enter(&vhc->vhc_lock);
	if (err == 0) {
		/* report recovery once if earlier flushes had been failing */
		if (vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_ERROR) {
			cmn_err(CE_CONT,
			    "%s: update now ok\n", vhc->vhc_vhcache_filename);
			vhc->vhc_flags &= ~MDI_VHC_VHCACHE_FLUSH_ERROR;
		}
	} else if (err == EROFS) {
		/* read-only filesystem: remember it and stop being dirty */
		vhc->vhc_flags |= MDI_VHC_READONLY_FS;
		vhc->vhc_flags &= ~(MDI_VHC_VHCACHE_FLUSH_ERROR |
		    MDI_VHC_VHCACHE_DIRTY);
	} else {
		/* log only the first failure of a run of failures */
		if (!(vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_ERROR)) {
			cmn_err(CE_CONT, "%s: update failed\n",
			    vhc->vhc_vhcache_filename);
			vhc->vhc_flags |= MDI_VHC_VHCACHE_FLUSH_ERROR;
		}
		rv = MDI_FAILURE;
	}
	mutex_exit(&vhc->vhc_lock);

	return (rv);
}

/*
 * Body of the vhci cache flush thread; arg is the mdi_vhci_config_t.
 *
 * While the cache is marked MDI_VHC_VHCACHE_DIRTY the thread waits until
 * vhc_flush_at_ticks and then calls flush_vhcache().  Once the cache is
 * clean it lingers for the idle timeout period waiting for more work and
 * exits if none arrives.  It also exits as soon as MDI_VHC_EXIT is set.
 * On exit it clears MDI_VHC_VHCACHE_FLUSH_THREAD, which is the flag
 * vhcache_dirty() tests to decide whether a new thread must be created.
 */
static void
vhcache_flush_thread(void *arg)
{
	mdi_vhci_config_t *vhc = (mdi_vhci_config_t *)arg;
	clock_t idle_time, quit_at_ticks;
	callb_cpr_t cprinfo;

	/* idle period before exiting, converted from seconds to ticks */
	idle_time = mdi_vhcache_flush_daemon_idle_time * TICKS_PER_SECOND;

	CALLB_CPR_INIT(&cprinfo, &vhc->vhc_lock, callb_generic_cpr,
	    "mdi_vhcache_flush");
	mutex_enter(&vhc->vhc_lock);
	for (; ; ) {
		/* phase 1: the cache is dirty; flush it when it is due */
		while (!(vhc->vhc_flags & MDI_VHC_EXIT) &&
		    (vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY)) {
			if (ddi_get_lbolt() < vhc->vhc_flush_at_ticks) {
				/* not due yet: sleep until the flush time */
				CALLB_CPR_SAFE_BEGIN(&cprinfo);
				(void) cv_timedwait(&vhc->vhc_cv,
				    &vhc->vhc_lock, vhc->vhc_flush_at_ticks);
				CALLB_CPR_SAFE_END(&cprinfo, &vhc->vhc_lock);
			} else {
				/*
				 * Clear the dirty flag before dropping the
				 * lock; a vhcache_dirty() call made while the
				 * flush is in progress sets it again.
				 */
				vhc->vhc_flags &= ~MDI_VHC_VHCACHE_DIRTY;
				mutex_exit(&vhc->vhc_lock);

				/* on failure, re-mark dirty to retry later */
				if (flush_vhcache(vhc, 0) != MDI_SUCCESS)
					vhcache_dirty(vhc);

				mutex_enter(&vhc->vhc_lock);
			}
		}

		quit_at_ticks = ddi_get_lbolt() + idle_time;

		/* phase 2: the cache is clean; idle until work or timeout */
		while (!(vhc->vhc_flags & MDI_VHC_EXIT) &&
		    !(vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY) &&
		    ddi_get_lbolt() < quit_at_ticks) {
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			(void) cv_timedwait(&vhc->vhc_cv, &vhc->vhc_lock,
			    quit_at_ticks);
			CALLB_CPR_SAFE_END(&cprinfo, &vhc->vhc_lock);
		}

		/* leave on exit request or when the idle period saw no work */
		if ((vhc->vhc_flags & MDI_VHC_EXIT) ||
		    !(vhc->vhc_flags & MDI_VHC_VHCACHE_DIRTY))
			goto out;
	}

out:
	/* vhc_lock is still held here */
	vhc->vhc_flags &= ~MDI_VHC_VHCACHE_FLUSH_THREAD;
	/* CALLB_CPR_EXIT releases the vhc->vhc_lock */
	CALLB_CPR_EXIT(&cprinfo);
}

/*
 * Mark the vhci cache dirty and arrange for the flush thread to write it
 * out after mdi_vhcache_flush_delay seconds.
 *
 * Nothing is done while the cache is still being built or once the cache
 * file has been found to live on a read-only filesystem.  A flush thread
 * is created if one is not already running; otherwise the running thread
 * is woken up.
 */
static void
vhcache_dirty(mdi_vhci_config_t *vhc)
{
	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
	int setup_done;
	int need_thread = 0;

	/* do not flush cache until the cache is fully built */
	rw_enter(&vhcache->vhcache_lock, RW_READER);
	setup_done =
	    ((vhcache->vhcache_flags & MDI_VHCI_CACHE_SETUP_DONE) != 0);
	rw_exit(&vhcache->vhcache_lock);
	if (!setup_done)
		return;

	mutex_enter(&vhc->vhc_lock);
	if (vhc->vhc_flags & MDI_VHC_READONLY_FS) {
		mutex_exit(&vhc->vhc_lock);
		return;
	}

	vhc->vhc_flags |= MDI_VHC_VHCACHE_DIRTY;
	vhc->vhc_flush_at_ticks = ddi_get_lbolt() +
	    mdi_vhcache_flush_delay * TICKS_PER_SECOND;

	if (!(vhc->vhc_flags & MDI_VHC_VHCACHE_FLUSH_THREAD)) {
		/* claim the flag now; the thread is created after unlock */
		vhc->vhc_flags |= MDI_VHC_VHCACHE_FLUSH_THREAD;
		need_thread = 1;
	} else {
		cv_broadcast(&vhc->vhc_cv);
	}
	mutex_exit(&vhc->vhc_lock);

	if (need_thread)
		(void) thread_create(NULL, 0, vhcache_flush_thread, vhc,
		    0, &p0, TS_RUN, minclsyspri);
}

/*
 * phci bus config structure - one for each phci bus config operation that
 * we initiate on behalf of a vhci.  Allocated by bus_config_all_phcis() and
 * freed, together with phbc_phci_path, by bus_config_phci().
 */
typedef struct mdi_phci_bus_config_s {
	char *phbc_phci_path;		/* private copy of the phci's path */
	struct mdi_vhci_bus_config_s *phbc_vhbusconfig;	/* vhci bus config */
	struct mdi_phci_bus_config_s *phbc_next;	/* next in work list */
} mdi_phci_bus_config_t;

/*
 * vhci bus config structure - one for each vhci bus config operation.
 * It is shared by all the per-phci operations started for that vhci
 * operation.
 */
typedef struct mdi_vhci_bus_config_s {
	ddi_bus_config_op_t vhbc_op;	/* bus config op */
	major_t vhbc_op_major;		/* bus config op major */
	uint_t vhbc_op_flags;		/* bus config op flags */
	kmutex_t vhbc_lock;		/* protects vhbc_thr_count */
	kcondvar_t vhbc_cv;		/* signalled when count reaches 0 */
	int vhbc_thr_count;		/* per-phci operations outstanding */
} mdi_vhci_bus_config_t;

/*
 * Bus config one phci; arg is the mdi_phci_bus_config_t describing it.
 *
 * Consumes arg: the structure and its path string are freed here.  When
 * done, the outstanding operation count in the shared vhci bus config
 * structure is decremented and, if it drops to zero, the waiter is woken.
 */
static void
bus_config_phci(void *arg)
{
	mdi_phci_bus_config_t *phbc = (mdi_phci_bus_config_t *)arg;
	mdi_vhci_bus_config_t *vhbc = phbc->phbc_vhbusconfig;
	dev_info_t *ph_dip;

	/*
	 * Holding the phci by path configures every path component up to
	 * the phci; after that the phci's children are configured.
	 */
	ph_dip = e_ddi_hold_devi_by_path(phbc->phbc_phci_path, 0);
	if (ph_dip != NULL) {
		switch (vhbc->vhbc_op) {
		case BUS_CONFIG_DRIVER:
		case BUS_UNCONFIG_DRIVER:
			(void) ndi_devi_config_driver(ph_dip,
			    vhbc->vhbc_op_flags, vhbc->vhbc_op_major);
			break;
		default:
			(void) ndi_devi_config(ph_dip, vhbc->vhbc_op_flags);
			break;
		}

		/* release the hold that e_ddi_hold_devi_by_path() placed */
		ndi_rele_devi(ph_dip);
	}

	kmem_free(phbc->phbc_phci_path, strlen(phbc->phbc_phci_path) + 1);
	kmem_free(phbc, sizeof (*phbc));

	/* report completion; the last one to finish wakes the waiter */
	mutex_enter(&vhbc->vhbc_lock);
	if (--vhbc->vhbc_thr_count == 0)
		cv_broadcast(&vhbc->vhbc_cv);
	mutex_exit(&vhbc->vhbc_lock);
}

/*
 * Bus config all phcis associated with the vhci in parallel.
 * op must be BUS_CONFIG_DRIVER or BUS_CONFIG_ALL.
 *
 * One bus_config_phci() operation is started for each phci in the vhci
 * cache, each in its own thread unless mdi_mtc_off is set, in which case
 * they run inline one after another.  The function returns only after all
 * of them have completed.  Of the caller's flags only NDI_CONFIG_REPROBE
 * and NDI_DRV_CONF_REPROBE are passed on; NDI_NO_EVENT is always added.
 */
static void
bus_config_all_phcis(mdi_vhci_cache_t *vhcache, uint_t flags,
    ddi_bus_config_op_t op, major_t maj)
{
	mdi_phci_bus_config_t *phbc_head = NULL, *phbc, *phbc_next;
	mdi_vhci_bus_config_t *vhbc;
	mdi_vhcache_phci_t *cphci;

	rw_enter(&vhcache->vhcache_lock, RW_READER);
	/* nothing to do if the cache knows of no phcis */
	if (vhcache->vhcache_phci_head == NULL) {
		rw_exit(&vhcache->vhcache_lock);
		return;
	}

	vhbc = kmem_zalloc(sizeof (*vhbc), KM_SLEEP);

	/*
	 * Build the work list while holding the cache lock.  Each element
	 * gets its own copy of the phci path so that the cache lock need
	 * not be held while the operations run.
	 */
	for (cphci = vhcache->vhcache_phci_head; cphci != NULL;
	    cphci = cphci->cphci_next) {
		/* skip phcis that haven't attached before root is available */
		if (!modrootloaded && (cphci->cphci_phci == NULL))
			continue;
		phbc = kmem_zalloc(sizeof (*phbc), KM_SLEEP);
		phbc->phbc_phci_path = i_ddi_strdup(cphci->cphci_path,
		    KM_SLEEP);
		phbc->phbc_vhbusconfig = vhbc;
		phbc->phbc_next = phbc_head;
		phbc_head = phbc;
		vhbc->vhbc_thr_count++;
	}
	rw_exit(&vhcache->vhcache_lock);

	vhbc->vhbc_op = op;
	vhbc->vhbc_op_major = maj;
	vhbc->vhbc_op_flags = NDI_NO_EVENT |
	    (flags & (NDI_CONFIG_REPROBE | NDI_DRV_CONF_REPROBE));
	mutex_init(&vhbc->vhbc_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&vhbc->vhbc_cv, NULL, CV_DRIVER, NULL);

	/*
	 * now create threads to initiate bus config on all phcis in parallel.
	 * bus_config_phci() frees phbc, so the next pointer must be fetched
	 * before phbc is handed over.
	 */
	for (phbc = phbc_head; phbc != NULL; phbc = phbc_next) {
		phbc_next = phbc->phbc_next;
		if (mdi_mtc_off)
			bus_config_phci((void *)phbc);
		else
			(void) thread_create(NULL, 0, bus_config_phci, phbc,
			    0, &p0, TS_RUN, minclsyspri);
	}

	mutex_enter(&vhbc->vhbc_lock);
	/* wait until all threads exit */
	while (vhbc->vhbc_thr_count > 0)
		cv_wait(&vhbc->vhbc_cv, &vhbc->vhbc_lock);
	mutex_exit(&vhbc->vhbc_lock);

	mutex_destroy(&vhbc->vhbc_lock);
	cv_destroy(&vhbc->vhbc_cv);
	kmem_free(vhbc, sizeof (*vhbc));
}

/*
 * Single threaded version of bus_config_all_phcis(): the call is bracketed
 * by single_threaded_vhconfig_enter()/single_threaded_vhconfig_exit() so
 * that only one such operation runs on the vhci at a time.
 */
static void
st_bus_config_all_phcis(mdi_vhci_config_t *vhc, uint_t flags,
    ddi_bus_config_op_t op, major_t maj)
{
	single_threaded_vhconfig_enter(vhc);
	bus_config_all_phcis(&vhc->vhc_vhcache, flags, op, maj);
	single_threaded_vhconfig_exit(vhc);
}

/*
 * Perform BUS_CONFIG_ONE on the specified child of the phci.
 *
 * path is "<phci path>/<child name@addr>", i.e. it includes the child
 * component in addition to the phci path.  The string is split in place at
 * its last '/' for the duration of the call and restored before returning,
 * so it must be writable.
 *
 * Returns MDI_SUCCESS if the child was configured.  Returns MDI_FAILURE
 * otherwise, including when path contains no '/' and therefore names no
 * child component.
 */
static int
bus_config_one_phci_child(char *path)
{
	dev_info_t *ph_dip, *child;
	char *sep, *devnm;
	int rv = MDI_FAILURE;

	/* extract the child component of the phci */
	sep = strrchr(path, '/');
	if (sep == NULL)
		return (MDI_FAILURE);	/* no child component to configure */
	*sep = '\0';
	devnm = sep + 1;

	/*
	 * first configure all path components up to the phci and then
	 * configure the phci child.
	 */
	if ((ph_dip = e_ddi_hold_devi_by_path(path, 0)) != NULL) {
		if (ndi_devi_config_one(ph_dip, devnm, &child, NDI_NO_EVENT) ==
		    NDI_SUCCESS) {
			/*
			 * release the hold that ndi_devi_config_one() placed
			 */
			ndi_rele_devi(child);
			rv = MDI_SUCCESS;
		}

		/* release the hold that e_ddi_hold_devi_by_path() placed */
		ndi_rele_devi(ph_dip);
	}

	/* put the separator back so the caller's string is unchanged */
	*sep = '/';
	return (rv);
}

/*
 * Build a list of phci client paths for the specified vhci client.
 *
 * Only paths that aren't configured yet are included: cached paths with
 * no pathinfo node, and paths whose pathinfo node is still in the init
 * state.  Each element holds a "<phci path>/<ct_name>@<path addr>" string.
 *
 * Returns the head of the list, or NULL if every path is configured.  The
 * list is released with free_phclient_path_list().
 */
static mdi_phys_path_t *
build_phclient_path_list(mdi_vhcache_client_t *cct, char *ct_name)
{
	mdi_vhcache_pathinfo_t *cpi;
	mdi_phys_path_t *head = NULL, *tail = NULL, *node;
	int unconfigured, pathlen;

	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
		if (cpi->cpi_pip == NULL) {
			unconfigured = 1;
		} else {
			MDI_PI_LOCK(cpi->cpi_pip);
			unconfigured = MDI_PI_IS_INIT(cpi->cpi_pip) ? 1 : 0;
			MDI_PI_UNLOCK(cpi->cpi_pip);
		}

		if (!unconfigured)
			continue;

		/* the 3 extra bytes are for '/', '@' and the terminator */
		pathlen = strlen(cpi->cpi_cphci->cphci_path) +
		    strlen(ct_name) + strlen(cpi->cpi_addr) + 3;

		node = kmem_alloc(sizeof (*node), KM_SLEEP);
		node->phys_path = kmem_alloc(pathlen, KM_SLEEP);
		(void) snprintf(node->phys_path, pathlen, "%s/%s@%s",
		    cpi->cpi_cphci->cphci_path, ct_name, cpi->cpi_addr);
		node->phys_path_next = NULL;

		/* append, preserving the order of the cached paths */
		if (head == NULL)
			head = node;
		else
			tail->phys_path_next = node;
		tail = node;
	}

	return (head);
}

/*
 * Release a phci client path list: every element and the path string it
 * carries.  A NULL (empty) list is accepted.
 */
static void
free_phclient_path_list(mdi_phys_path_t *pp_head)
{
	mdi_phys_path_t *cur = pp_head;
	mdi_phys_path_t *following;

	while (cur != NULL) {
		/* fetch the link before the element is freed */
		following = cur->phys_path_next;
		kmem_free(cur->phys_path, strlen(cur->phys_path) + 1);
		kmem_free(cur, sizeof (*cur));
		cur = following;
	}
}

/*
 * Allocate an async client config structure and initialize it with the
 * specified values.
 *
 * ct_name and ct_addr are duplicated and tok is copied; pp_head is stored
 * as is, so the new structure refers to the caller's path list.
 */
static mdi_async_client_config_t *
alloc_async_client_config(char *ct_name, char *ct_addr,
    mdi_phys_path_t *pp_head, mdi_vhcache_lookup_token_t *tok)
{
	mdi_async_client_config_t *newacc;

	newacc = kmem_alloc(sizeof (*newacc), KM_SLEEP);

	newacc->acc_next = NULL;
	newacc->acc_phclient_path_list_head = pp_head;
	newacc->acc_ct_name = i_ddi_strdup(ct_name, KM_SLEEP);
	newacc->acc_ct_addr = i_ddi_strdup(ct_addr, KM_SLEEP);
	init_vhcache_lookup_token(&newacc->acc_token, tok);

	return (newacc);
}

/*
 * Release an async client config structure along with everything it
 * refers to: its phci client path list and its name and address strings.
 */
static void
free_async_client_config(mdi_async_client_config_t *acc)
{
	/* free_phclient_path_list() does nothing for an empty (NULL) list */
	free_phclient_path_list(acc->acc_phclient_path_list_head);

	kmem_free(acc->acc_ct_name, strlen(acc->acc_ct_name) + 1);
	kmem_free(acc->acc_ct_addr, strlen(acc->acc_ct_addr) + 1);
	kmem_free(acc, sizeof (*acc));
}

/*
 * Sort vhcache pathinfos (cpis) of the specified client so that all cpis
 * without the MDI_CPI_HINT_PATH_DOES_NOT_EXIST flag precede all cpis that
 * have it.
 *
 * This is done by emptying the client's list and re-inserting every cpi
 * through enqueue_vhcache_pathinfo(), which is relied upon to place each
 * one according to its flag.
 */
static void
sort_vhcache_paths(mdi_vhcache_client_t *cct)
{
	mdi_vhcache_pathinfo_t *cur, *following;

	/* detach the whole list from the client */
	cur = cct->cct_cpi_head;
	cct->cct_cpi_head = NULL;
	cct->cct_cpi_tail = NULL;

	while (cur != NULL) {
		/* fetch the link before the cpi is re-queued */
		following = cur->cpi_next;
		enqueue_vhcache_pathinfo(cct, cur);
		cur = following;
	}
}

/*
 * Verify whether MDI_CPI_HINT_PATH_DOES_NOT_EXIST flag setting is correct for
 * every vhcache pathinfo of the specified client. If not adjust the flag
 * setting appropriately.
 *
 * The flag is considered correct when it is set exactly on those cpis that
 * have no pathinfo node (cpi_pip == NULL).
 *
 * Note that MDI_CPI_HINT_PATH_DOES_NOT_EXIST flag is persisted in the
 * on-disk vhci cache. So every time this flag is updated the cache must be
 * flushed.
 */
static void
adjust_sort_vhcache_paths(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr,
    mdi_vhcache_lookup_token_t *tok)
{
	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
	mdi_vhcache_client_t *cct;
	mdi_vhcache_pathinfo_t *cpi;

	rw_enter(&vhcache->vhcache_lock, RW_READER);
	if ((cct = lookup_vhcache_client(vhcache, ct_name, ct_addr, tok))
	    == NULL) {
		rw_exit(&vhcache->vhcache_lock);
		return;
	}

	/*
	 * to avoid unnecessary on-disk cache updates, first check if an
	 * update is really needed. If no update is needed simply return.
	 * The loop stops at the first cpi whose flag disagrees with the
	 * presence of its pathinfo node.
	 */
	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
		if ((cpi->cpi_pip != NULL &&
		    (cpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST)) ||
		    (cpi->cpi_pip == NULL &&
		    !(cpi->cpi_flags & MDI_CPI_HINT_PATH_DOES_NOT_EXIST))) {
			break;
		}
	}
	if (cpi == NULL) {
		rw_exit(&vhcache->vhcache_lock);
		return;
	}

	/*
	 * An update is needed, which requires the writer lock.  If the
	 * in-place upgrade fails the lock is dropped and re-acquired as
	 * writer, so the client has to be looked up again afterwards.
	 */
	if (rw_tryupgrade(&vhcache->vhcache_lock) == 0) {
		rw_exit(&vhcache->vhcache_lock);
		rw_enter(&vhcache->vhcache_lock, RW_WRITER);
		if ((cct = lookup_vhcache_client(vhcache, ct_name, ct_addr,
		    tok)) == NULL) {
			rw_exit(&vhcache->vhcache_lock);
			return;
		}
	}

	/* recompute the flag of every cpi, then restore the sort order */
	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
		if (cpi->cpi_pip != NULL)
			cpi->cpi_flags &= ~MDI_CPI_HINT_PATH_DOES_NOT_EXIST;
		else
			cpi->cpi_flags |= MDI_CPI_HINT_PATH_DOES_NOT_EXIST;
	}
	sort_vhcache_paths(cct);

	rw_exit(&vhcache->vhcache_lock);
	/* schedule the flush that persists the updated flags */
	vhcache_dirty(vhc);
}

/*
 * Synchronously bus configure every path on the list pp_head for the
 * client ct_name@ct_addr, then bring the client's cached path hints up to
 * date.  Failures to configure individual paths are ignored.  The list is
 * not freed.
 */
static void
config_client_paths_sync(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr,
    mdi_phys_path_t *pp_head, mdi_vhcache_lookup_token_t *tok)
{
	mdi_phys_path_t *cur = pp_head;

	while (cur != NULL) {
		(void) bus_config_one_phci_child(cur->phys_path);
		cur = cur->phys_path_next;
	}

	adjust_sort_vhcache_paths(vhc, ct_name, ct_addr, tok);
}

/*
 * Dequeue elements from vhci async client config list and bus configure
 * their corresponding phci clients.
 *
 * arg is the mdi_vhci_config_t.  The thread exits when MDI_VHC_EXIT is set
 * or when the list has stayed empty for the idle period
 * (mdi_async_config_idle_time seconds); on the way out it decrements
 * vhc_acc_thrcount.
 */
static void
config_client_paths_thread(void *arg)
{
	mdi_vhci_config_t *vhc = (mdi_vhci_config_t *)arg;
	mdi_async_client_config_t *acc;
	clock_t quit_at_ticks;
	clock_t idle_time = mdi_async_config_idle_time * TICKS_PER_SECOND;
	callb_cpr_t cprinfo;

	CALLB_CPR_INIT(&cprinfo, &vhc->vhc_lock, callb_generic_cpr,
	    "mdi_config_client_paths");

	for (; ; ) {
		/* the idle period restarts after every element processed */
		quit_at_ticks = ddi_get_lbolt() + idle_time;

		mutex_enter(&vhc->vhc_lock);
		/* wait for work, an exit request, or the idle deadline */
		while (!(vhc->vhc_flags & MDI_VHC_EXIT) &&
		    vhc->vhc_acc_list_head == NULL &&
		    ddi_get_lbolt() < quit_at_ticks) {
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			(void) cv_timedwait(&vhc->vhc_cv, &vhc->vhc_lock,
			    quit_at_ticks);
			CALLB_CPR_SAFE_END(&cprinfo, &vhc->vhc_lock);
		}

		/* vhc_lock stays held across the jump to out */
		if ((vhc->vhc_flags & MDI_VHC_EXIT) ||
		    vhc->vhc_acc_list_head == NULL)
			goto out;

		/* unlink the element at the head of the list */
		acc = vhc->vhc_acc_list_head;
		vhc->vhc_acc_list_head = acc->acc_next;
		if (vhc->vhc_acc_list_head == NULL)
			vhc->vhc_acc_list_tail = NULL;
		vhc->vhc_acc_count--;
		mutex_exit(&vhc->vhc_lock);

		/* configure the paths with vhc_lock dropped */
		config_client_paths_sync(vhc, acc->acc_ct_name,
		    acc->acc_ct_addr, acc->acc_phclient_path_list_head,
		    &acc->acc_token);

		/* this also frees the path list the element carries */
		free_async_client_config(acc);
	}

out:
	vhc->vhc_acc_thrcount--;
	/* CALLB_CPR_EXIT releases the vhc->vhc_lock */
	CALLB_CPR_EXIT(&cprinfo);
}

/*
 * Arrange for all the phci client paths (pp_head) for the specified client
 * to be bus configured asynchronously by a thread.
 *
 * This function takes over the list pp_head: it is either queued for a
 * config thread (which frees it after processing), freed at once when a
 * request for the same client is already queued, or, when mdi_mtc_off is
 * set, processed synchronously and then freed.
 */
static void
config_client_paths_async(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr,
    mdi_phys_path_t *pp_head, mdi_vhcache_lookup_token_t *tok)
{
	mdi_async_client_config_t *queued, *newacc;
	int need_thread = 0;

	if (pp_head == NULL)
		return;

	/* multi-threaded config disabled: do the work right here */
	if (mdi_mtc_off) {
		config_client_paths_sync(vhc, ct_name, ct_addr, pp_head, tok);
		free_phclient_path_list(pp_head);
		return;
	}

	newacc = alloc_async_client_config(ct_name, ct_addr, pp_head, tok);
	ASSERT(newacc);

	mutex_enter(&vhc->vhc_lock);

	/* a request for this client is already pending: drop the new one */
	queued = vhc->vhc_acc_list_head;
	while (queued != NULL) {
		if (strcmp(ct_name, queued->acc_ct_name) == 0 &&
		    strcmp(ct_addr, queued->acc_ct_addr) == 0) {
			free_async_client_config(newacc);
			mutex_exit(&vhc->vhc_lock);
			return;
		}
		queued = queued->acc_next;
	}

	/* append the request to the tail of the list */
	if (vhc->vhc_acc_list_head == NULL)
		vhc->vhc_acc_list_head = newacc;
	else
		vhc->vhc_acc_list_tail->acc_next = newacc;
	vhc->vhc_acc_list_tail = newacc;
	vhc->vhc_acc_count++;

	/* start another thread only if requests outnumber the threads */
	if (vhc->vhc_acc_count > vhc->vhc_acc_thrcount) {
		vhc->vhc_acc_thrcount++;
		need_thread = 1;
	} else {
		cv_broadcast(&vhc->vhc_cv);
	}
	mutex_exit(&vhc->vhc_lock);

	if (need_thread)
		(void) thread_create(NULL, 0, config_client_paths_thread, vhc,
		    0, &p0, TS_RUN, minclsyspri);
}

/*
 * Count the paths of the specified client that are online, i.e. the cached
 * paths that have a pathinfo node whose state is MDI_PATHINFO_STATE_ONLINE.
 */
static int
nonline_paths(mdi_vhcache_client_t *cct)
{
	mdi_vhcache_pathinfo_t *cpi;
	int nonline = 0;

	for (cpi = cct->cct_cpi_head; cpi != NULL; cpi = cpi->cpi_next) {
		/* a cached path without a pathinfo node cannot be online */
		if (cpi->cpi_pip == NULL)
			continue;

		MDI_PI_LOCK(cpi->cpi_pip);
		if (cpi->cpi_pip->pi_state == MDI_PATHINFO_STATE_ONLINE)
			nonline++;
		MDI_PI_UNLOCK(cpi->cpi_pip);
	}

	return (nonline);
}

/*
 * Bus configure all paths for the specified vhci client.
 * If at least one path for the client is already online, the remaining paths
 * will be configured asynchronously. Otherwise, it synchronously configures
 * the paths until at least one path is online and then rest of the paths
 * will be configured asynchronously.
 *
 * The caller must hold vhcache_lock on entry; it is released on every
 * return path of this function.
 */
static void
config_client_paths(mdi_vhci_config_t *vhc, char *ct_name, char *ct_addr)
{
	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
	mdi_phys_path_t *pp_head, *pp;
	mdi_vhcache_client_t *cct;
	mdi_vhcache_lookup_token_t tok;

	ASSERT(RW_LOCK_HELD(&vhcache->vhcache_lock));

	init_vhcache_lookup_token(&tok, NULL);

	/*
	 * Nothing to do without a client name and address, when the client
	 * is not in the cache, or when it has no unconfigured paths.
	 */
	if (ct_name == NULL || ct_addr == NULL ||
	    (cct = lookup_vhcache_client(vhcache, ct_name, ct_addr, &tok))
	    == NULL ||
	    (pp_head = build_phclient_path_list(cct, ct_name)) == NULL) {
		rw_exit(&vhcache->vhcache_lock);
		return;
	}

	/*
	 * if at least one path is online, configure the rest asynchronously.
	 * config_client_paths_async() takes over pp_head, so it is not
	 * freed here.
	 */
	if (nonline_paths(cct) > 0) {
		rw_exit(&vhcache->vhcache_lock);
		config_client_paths_async(vhc, ct_name, ct_addr, pp_head, &tok);
		return;
	}

	rw_exit(&vhcache->vhcache_lock);

	/* configure paths one at a time until one of them is online */
	for (pp = pp_head; pp != NULL; pp = pp->phys_path_next) {
		if (bus_config_one_phci_child(pp->phys_path) == MDI_SUCCESS) {
			rw_enter(&vhcache->vhcache_lock, RW_READER);

			/* the lock was dropped, so look the client up again */
			if ((cct = lookup_vhcache_client(vhcache, ct_name,
			    ct_addr, &tok)) == NULL) {
				rw_exit(&vhcache->vhcache_lock);
				goto out;
			}

			/*
			 * A path is online now: hand the rest of the list to
			 * the async code and cut it off from our list, so
			 * that only the part already processed is freed
			 * below.
			 */
			if (nonline_paths(cct) > 0 &&
			    pp->phys_path_next != NULL) {
				rw_exit(&vhcache->vhcache_lock);
				config_client_paths_async(vhc, ct_name, ct_addr,
				    pp->phys_path_next, &tok);
				pp->phys_path_next = NULL;
				goto out;
			}

			rw_exit(&vhcache->vhcache_lock);
		}
	}

	/* every path was tried synchronously; update the cached hints */
	adjust_sort_vhcache_paths(vhc, ct_name, ct_addr, &tok);
out:
	free_phclient_path_list(pp_head);
}

/*
 * Enter the single threaded vhci config section: wait until no other
 * thread has MDI_VHC_SINGLE_THREADED set, then set it.  Paired with
 * single_threaded_vhconfig_exit().
 */
static void
single_threaded_vhconfig_enter(mdi_vhci_config_t *vhc)
{
	mutex_enter(&vhc->vhc_lock);

	while ((vhc->vhc_flags & MDI_VHC_SINGLE_THREADED) != 0)
		cv_wait(&vhc->vhc_cv, &vhc->vhc_lock);
	vhc->vhc_flags |= MDI_VHC_SINGLE_THREADED;

	mutex_exit(&vhc->vhc_lock);
}

/*
 * Leave the single threaded vhci config section entered with
 * single_threaded_vhconfig_enter(): clear MDI_VHC_SINGLE_THREADED and wake
 * up the threads waiting on vhc_cv.
 */
static void
single_threaded_vhconfig_exit(mdi_vhci_config_t *vhc)
{
	mutex_enter(&vhc->vhc_lock);
	vhc->vhc_flags &= ~MDI_VHC_SINGLE_THREADED;
	/* vhc_cv has several kinds of waiters, so all of them are woken */
	cv_broadcast(&vhc->vhc_cv);
	mutex_exit(&vhc->vhc_lock);
}

/*
 * Entry of a built-in phci driver table.  The tables initialize entries
 * positionally, so the order of the members matters.
 */
typedef struct mdi_phci_driver_info {
	char	*phdriver_name;	/* name of the phci driver */

	/* set to non zero if the phci driver supports root device */
	int	phdriver_root_support;
} mdi_phci_driver_info_t;

/*
 * vhci class and root support capability of a phci driver can be
 * specified using ddi-vhci-class and ddi-no-root-support properties in the
 * phci driver.conf file. The built-in tables below contain this information
 * for those phci drivers whose driver.conf files don't yet contain this info.
 *
 * All phci drivers except iscsi have root device support.
 */
static mdi_phci_driver_info_t scsi_phci_driver_list[] = {
	{ "fp", 1 },
	{ "iscsi", 0 },
	{ "ibsrp", 1 }
};

static mdi_phci_driver_info_t ib_phci_driver_list[] = {
	{ "tavor", 1 }
};

/*
 * Allocate a zero-filled buffer of new_size bytes and, if old_ptr is not
 * NULL, move the contents of the old buffer into it (as much as fits) and
 * free the old buffer.  old_size must be the allocated size of old_ptr.
 *
 * Returns the new buffer.
 */
static void *
mdi_realloc(void *old_ptr, size_t old_size, size_t new_size)
{
	void *new_ptr = kmem_zalloc(new_size, KM_SLEEP);

	if (old_ptr == NULL)
		return (new_ptr);

	bcopy(old_ptr, new_ptr, MIN(old_size, new_size));
	kmem_free(old_ptr, old_size);
	return (new_ptr);
}

/*
 * Append a phci driver to the parallel arrays *driver_list (driver names)
 * and *root_support_list (root support indicators).
 *
 * *cur_elements is the number of entries in use and *max_elements the
 * number of entries allocated; when the arrays are full they are grown by
 * 10 entries.  driver_name is duplicated.
 */
static void
add_to_phci_list(char ***driver_list, int **root_support_list,
    int *cur_elements, int *max_elements, char *driver_name, int root_support)
{
	int slot = *cur_elements;
	int grown_max;

	ASSERT(slot <= *max_elements);

	if (slot == *max_elements) {
		/* both arrays are full, so slot equals their current size */
		grown_max = *max_elements + 10;
		*max_elements = grown_max;
		*driver_list = mdi_realloc(*driver_list,
		    sizeof (char *) * slot, sizeof (char *) * grown_max);
		*root_support_list = mdi_realloc(*root_support_list,
		    sizeof (int) * slot, sizeof (int) * grown_max);
	}

	(*driver_list)[slot] = i_ddi_strdup(driver_name, KM_SLEEP);
	(*root_support_list)[slot] = root_support;
	*cur_elements = slot + 1;
}

/*
 * Build the list of phci drivers that belong to the specified vhci class.
 *
 * The list is made of the drivers flagged DN_PHCI_DRIVER whose global
 * DDI_VHCI_CLASS property matches vhci_class, followed by the drivers of
 * the matching built-in table that were not already found that way.
 *
 * On return *driver_list and *root_support_list are parallel arrays with
 * *cur_elements entries in use out of *max_elements allocated; both
 * pointers are NULL if no driver was found.
 */
static void
get_phci_driver_list(char *vhci_class, char ***driver_list,
    int **root_support_list, int *cur_elements, int *max_elements)
{
	mdi_phci_driver_info_t	*builtin, *p;
	int		nbuiltin, root_support, i, j, nconf, listed;
	major_t		m;
	struct devnames	*dnp;
	ddi_prop_t	*propp;

	*driver_list = NULL;
	*root_support_list = NULL;
	*cur_elements = 0;
	*max_elements = 0;

	/* add the phci drivers derived from the phci driver.conf files */
	for (m = 0; m < devcnt; m++) {
		dnp = &devnamesp[m];
		if (!(dnp->dn_flags & DN_PHCI_DRIVER))
			continue;

		LOCK_DEV_OPS(&dnp->dn_lock);
		if (dnp->dn_global_prop_ptr != NULL &&
		    (propp = i_ddi_prop_search(DDI_DEV_T_ANY,
		    DDI_VHCI_CLASS, DDI_PROP_TYPE_STRING,
		    &dnp->dn_global_prop_ptr->prop_list)) != NULL &&
		    strcmp(propp->prop_val, vhci_class) == 0) {
			/* root is supported unless the driver says otherwise */
			root_support = 1;
			if (i_ddi_prop_search(DDI_DEV_T_ANY,
			    DDI_NO_ROOT_SUPPORT, DDI_PROP_TYPE_INT,
			    &dnp->dn_global_prop_ptr->prop_list) != NULL)
				root_support = 0;

			add_to_phci_list(driver_list, root_support_list,
			    cur_elements, max_elements, dnp->dn_name,
			    root_support);
		}
		UNLOCK_DEV_OPS(&dnp->dn_lock);
	}

	nconf = *cur_elements;

	/* pick the built-in table for this vhci class, if there is one */
	if (strcmp(vhci_class, MDI_HCI_CLASS_SCSI) == 0) {
		builtin = scsi_phci_driver_list;
		nbuiltin = sizeof (scsi_phci_driver_list) /
		    sizeof (scsi_phci_driver_list[0]);
	} else if (strcmp(vhci_class, MDI_HCI_CLASS_IB) == 0) {
		builtin = ib_phci_driver_list;
		nbuiltin = sizeof (ib_phci_driver_list) /
		    sizeof (ib_phci_driver_list[0]);
	} else {
		builtin = NULL;
		nbuiltin = 0;
	}

	/* add the built-in drivers not already found via driver.conf */
	for (i = 0; i < nbuiltin; i++) {
		p = &builtin[i];

		listed = 0;
		for (j = 0; j < nconf && !listed; j++) {
			if (strcmp((*driver_list)[j], p->phdriver_name) == 0)
				listed = 1;
		}

		if (!listed) {
			add_to_phci_list(driver_list, root_support_list,
			    cur_elements, max_elements, p->phdriver_name,
			    p->phdriver_root_support);
		}
	}
}

/*
 * Attach the phci driver instances associated with the specified vhci class.
 * If root is mounted attach all phci driver instances.
 * If root is not mounted, attach the instances of only those phci
 * drivers that have the root support.
 *
 * Attaching is done by holding the installed driver; the hold is released
 * again immediately.
 */
static void
attach_phci_drivers(char *vhci_class)
{
	char	**driver_list;
	int	*root_support_list;
	int	cur_elements, max_elements, i;
	major_t	m;

	get_phci_driver_list(vhci_class, &driver_list, &root_support_list,
	    &cur_elements, &max_elements);

	for (i = 0; i < cur_elements; i++) {
		/* before root is mounted only root capable drivers qualify */
		if (!modrootloaded && !root_support_list[i])
			continue;

		m = ddi_name_to_major(driver_list[i]);
		if (m == DDI_MAJOR_T_NONE)
			continue;

		if (ddi_hold_installed_driver(m))
			ddi_rele_driver(m);
	}

	/* an empty list has nothing allocated */
	if (driver_list == NULL)
		return;

	for (i = 0; i < cur_elements; i++)
		kmem_free(driver_list[i], strlen(driver_list[i]) + 1);
	kmem_free(driver_list, sizeof (char *) * max_elements);
	kmem_free(root_support_list, sizeof (int) * max_elements);
}

/*
 * Build vhci cache:
 *
 * Attach phci driver instances and then drive BUS_CONFIG_ALL on
 * the phci driver instances. During this process the cache gets built.
 *
 * Cache is built fully if the root is mounted.
 * If the root is not mounted, phci drivers that do not have root support
 * are not attached. As a result the cache is built partially. The entries
 * in the cache reflect only those phci drivers that have root support.
 *
 * Returns 1 if the cache was built by this call, 0 if it had already been
 * set up.
 */
static int
build_vhci_cache(mdi_vhci_t *vh)
{
	mdi_vhci_config_t *vhc = vh->vh_config;
	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
	int already_built;

	single_threaded_vhconfig_enter(vhc);

	rw_enter(&vhcache->vhcache_lock, RW_READER);
	already_built =
	    ((vhcache->vhcache_flags & MDI_VHCI_CACHE_SETUP_DONE) != 0);
	rw_exit(&vhcache->vhcache_lock);

	if (already_built) {
		single_threaded_vhconfig_exit(vhc);
		return (0);
	}

	attach_phci_drivers(vh->vh_class);
	bus_config_all_phcis(vhcache, NDI_DRV_CONF_REPROBE | NDI_NO_EVENT,
	    BUS_CONFIG_ALL, DDI_MAJOR_T_NONE);

	rw_enter(&vhcache->vhcache_lock, RW_WRITER);
	vhcache->vhcache_flags |= MDI_VHCI_CACHE_SETUP_DONE;
	rw_exit(&vhcache->vhcache_lock);

	single_threaded_vhconfig_exit(vhc);

	/* the cache is complete now, so it may be flushed to disk */
	vhcache_dirty(vhc);
	return (1);
}

/*
 * Determine if discovery of paths is needed.
 *
 * Discovery is needed while the discovery budget for the current phase
 * (before or after i_ddi_io_initialized) is not used up; each such call
 * consumes one unit of that budget.  After that it is needed only when
 * mdi_path_discovery_interval is not -1 and the cutoff time has passed.
 *
 * Returns 1 if paths should be discovered, 0 otherwise.
 */
static int
vhcache_do_discovery(mdi_vhci_config_t *vhc)
{
	int io_ready;
	int discover;

	mutex_enter(&vhc->vhc_lock);

	io_ready = i_ddi_io_initialized();
	if (io_ready == 0 && vhc->vhc_path_discovery_boot > 0) {
		vhc->vhc_path_discovery_boot--;
		discover = 1;
	} else if (io_ready != 0 && vhc->vhc_path_discovery_postboot > 0) {
		vhc->vhc_path_discovery_postboot--;
		discover = 1;
	} else if (mdi_path_discovery_interval != -1 &&
	    ddi_get_lbolt64() >= vhc->vhc_path_discovery_cutoff_time) {
		/*
		 * Do full path discovery at most once per
		 * mdi_path_discovery_interval.  This is to avoid a series
		 * of full path discoveries when opening stale /dev/[r]dsk
		 * links.
		 */
		discover = 1;
	} else {
		discover = 0;
	}

	mutex_exit(&vhc->vhc_lock);
	return (discover);
}

/*
 * Discover all paths:
 *
 * Attach phci driver instances and then drive BUS_CONFIG_ALL on all the phci
 * driver instances. During this process all paths will be discovered.
 *
 * The work is skipped when vhcache_do_discovery() reports that no discovery
 * is needed.  Returns 1 if discovery was performed, 0 if it was skipped.
 */
static int
vhcache_discover_paths(mdi_vhci_t *vh)
{
	mdi_vhci_config_t *vhc = vh->vh_config;
	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;

	single_threaded_vhconfig_enter(vhc);

	if (!vhcache_do_discovery(vhc)) {
		single_threaded_vhconfig_exit(vhc);
		return (0);
	}

	attach_phci_drivers(vh->vh_class);
	bus_config_all_phcis(vhcache, NDI_DRV_CONF_REPROBE | NDI_NO_EVENT,
	    BUS_CONFIG_ALL, DDI_MAJOR_T_NONE);

	/* push the time of the next full discovery one interval out */
	mutex_enter(&vhc->vhc_lock);
	vhc->vhc_path_discovery_cutoff_time = ddi_get_lbolt64() +
	    mdi_path_discovery_interval * TICKS_PER_SECOND;
	mutex_exit(&vhc->vhc_lock);

	single_threaded_vhconfig_exit(vhc);
	return (1);
}

/*
 * Generic vhci bus config implementation:
 *
 * Parameters
 *	vdip	vhci dip
 *	flags	bus config flags
 *	op	bus config operation
 *	The remaining parameters are bus config operation specific
 *
 * for BUS_CONFIG_ONE
 *	arg	pointer to name@addr
 *	child	upon successful return from this function, *child will be
 *		set to the configured and held devinfo child node of vdip.
 *	ct_addr	pointer to client address (i.e. GUID)
 *
 * for BUS_CONFIG_DRIVER
 *	arg	major number of the driver
 *	child and ct_addr parameters are ignored
 *
 * for BUS_CONFIG_ALL
 *	arg, child, and ct_addr parameters are ignored
 *
 * Note that for the rest of the bus config operations, this function simply
 * calls the framework provided default bus config routine.
 *
 * Return Values:
 *	MDI_SUCCESS	ndi_busop_bus_config() succeeded, either on the first
 *			attempt or, for BUS_CONFIG_ONE, after a full path
 *			discovery.
 *	MDI_FAILURE	otherwise
 */
int
mdi_vhci_bus_config(dev_info_t *vdip, uint_t flags, ddi_bus_config_op_t op,
    void *arg, dev_info_t **child, char *ct_addr)
{
	mdi_vhci_t *vh = i_devi_get_vhci(vdip);
	mdi_vhci_config_t *vhc = vh->vh_config;
	mdi_vhci_cache_t *vhcache = &vhc->vhc_vhcache;
	int rv = 0;		/* 1 if build_vhci_cache() just built cache */
	int params_valid = 0;	/* 1 if BUS_CONFIG_ONE arg was name@addr */
	char *cp;

	/*
	 * To bus config vhcis we relay operation, possibly using another
	 * thread, to phcis. The phci driver then interacts with MDI to cause
	 * vhci child nodes to be enumerated under the vhci node.  Adding a
	 * vhci child requires an ndi_devi_enter of the vhci. Since another
	 * thread may be adding the child, to avoid deadlock we can't wait
	 * for the relayed operations to complete if we have already entered
	 * the vhci node.
	 */
	if (DEVI_BUSY_OWNED(vdip)) {
		MDI_DEBUG(2, (MDI_NOTE, vdip,
		    "vhci dip is busy owned %p", (void *)vdip));
		goto default_bus_config;
	}

	/*
	 * Build the cache on first use.  build_vhci_cache() takes the cache
	 * lock itself, so the lock is dropped around the call and then
	 * re-acquired; every case of the switch below is entered with the
	 * lock held as reader and is responsible for releasing it.
	 */
	rw_enter(&vhcache->vhcache_lock, RW_READER);
	if (!(vhcache->vhcache_flags & MDI_VHCI_CACHE_SETUP_DONE)) {
		rw_exit(&vhcache->vhcache_lock);
		rv = build_vhci_cache(vh);
		rw_enter(&vhcache->vhcache_lock, RW_READER);
	}

	switch (op) {
	case BUS_CONFIG_ONE:
		if (arg != NULL && ct_addr != NULL) {
			/* extract node name */
			cp = (char *)arg;
			while (*cp != '\0' && *cp != '@')
				cp++;
			if (*cp == '@') {
				params_valid = 1;
				/*
				 * Temporarily terminate arg at the '@' so that
				 * only the node name is passed down; the '@'
				 * is restored right after the call.
				 */
				*cp = '\0';
				config_client_paths(vhc, (char *)arg, ct_addr);
				/* config_client_paths() releases cache_lock */
				*cp = '@';
				break;
			}
		}

		rw_exit(&vhcache->vhcache_lock);
		break;

	case BUS_CONFIG_DRIVER:
		rw_exit(&vhcache->vhcache_lock);
		/*
		 * When the cache was just built (rv != 0) the phcis have
		 * already been driven with BUS_CONFIG_ALL, so the relay is
		 * skipped.
		 */
		if (rv == 0)
			st_bus_config_all_phcis(vhc, flags, op,
			    (major_t)(uintptr_t)arg);
		break;

	case BUS_CONFIG_ALL:
		rw_exit(&vhcache->vhcache_lock);
		/*
		 * No specific driver: pass DDI_MAJOR_T_NONE, matching the
		 * other BUS_CONFIG_ALL relays in this file.
		 */
		if (rv == 0)
			st_bus_config_all_phcis(vhc, flags, op,
			    DDI_MAJOR_T_NONE);
		break;

	default:
		rw_exit(&vhcache->vhcache_lock);
		break;
	}

default_bus_config:
	/*
	 * All requested child nodes are enumerated under the vhci.
	 * Now configure them.
	 */
	if (ndi_busop_bus_config(vdip, flags, op, arg, child, 0) ==
	    NDI_SUCCESS) {
		return (MDI_SUCCESS);
	} else if (op == BUS_CONFIG_ONE && rv == 0 && params_valid) {
		/* discover all paths and try configuring again */
		if (vhcache_discover_paths(vh) &&
		    ndi_busop_bus_config(vdip, flags, op, arg, child, 0) ==
		    NDI_SUCCESS)
			return (MDI_SUCCESS);
	}

	return (MDI_FAILURE);
}

/*
 * Read the on-disk vhci cache into an nvlist for the specified vhci class.
 */
static nvlist_t *
read_on_disk_vhci_cache(char *vhci_class)
{
	nvlist_t *nvl;
	int err;
	char *filename;

	filename = vhclass2vhcache_filename(vhci_class);

	if ((err = fread_nvlist(filename, &nvl)) == 0) {
		kmem_free(filename, strlen(filename) + 1);
		return (nvl);
	} else if (err == EIO)
		cmn_err(CE_WARN, "%s: I/O error, will recreate", filename);
	else if (err == EINVAL)
		cmn_err(CE_WARN,
		    "%s: data file corrupted, will recreate", filename);

	kmem_free(filename, strlen(filename) + 1);
	return (NULL);
}

/*
 * Load the on-disk vhci cache of every vhci class into vhcache_nvl[].
 * Called during booting by i_ddi_read_devices_files().
 */
void
mdi_read_devices_files(void)
{
	int class_idx = 0;

	while (class_idx < N_VHCI_CLASSES) {
		vhcache_nvl[class_idx] =
		    read_on_disk_vhci_cache(vhci_class_list[class_idx]);
		class_idx++;
	}
}

/*
 * Remove all stale entries from the vhci cache.
 *
 * With the cache lock held as writer, the client list, each client's path
 * list and the phci list are detached and rebuilt from the entries that are
 * still live:
 *	- a path is kept when both its cached phci's cphci_phci and its own
 *	  cpi_pip are non-NULL;
 *	- a client is kept when at least one of its paths was kept, otherwise
 *	  it is removed from the client hash and freed;
 *	- a cached phci is kept when its cphci_phci is non-NULL, otherwise it
 *	  is freed.
 * Afterwards the clean time is recorded and vhcache_dirty() is called.
 */
static void
clean_vhcache(mdi_vhci_config_t *vhc)
{
	mdi_vhci_cache_t	*cache = &vhc->vhc_vhcache;
	mdi_vhcache_client_t	*cl, *next_cl;
	mdi_vhcache_pathinfo_t	*pth, *next_pth;
	mdi_vhcache_phci_t	*ph, *next_ph;
	int			phci_live, pip_live;

	rw_enter(&cache->vhcache_lock, RW_WRITER);

	/* Detach the client list, then re-queue the survivors. */
	cl = cache->vhcache_client_head;
	cache->vhcache_client_head = cache->vhcache_client_tail = NULL;
	while (cl != NULL) {
		next_cl = cl->cct_next;

		/* Same treatment for this client's path list. */
		pth = cl->cct_cpi_head;
		cl->cct_cpi_head = cl->cct_cpi_tail = NULL;
		while (pth != NULL) {
			next_pth = pth->cpi_next;
			phci_live = (pth->cpi_cphci->cphci_phci != NULL);
			pip_live = (pth->cpi_pip != NULL);

			if (phci_live && pip_live) {
				enqueue_tail_vhcache_pathinfo(cl, pth);
			} else if (pip_live) {
				/* Not valid to have a path without a phci. */
				free_vhcache_pathinfo(pth);
			}
			/*
			 * NOTE(review): an entry whose cpi_pip is NULL is
			 * neither re-queued nor freed here; confirm whether
			 * it is released elsewhere.
			 */
			pth = next_pth;
		}

		if (cl->cct_cpi_head == NULL) {
			/* No paths left: drop the client entirely. */
			(void) mod_hash_destroy(cache->vhcache_client_hash,
			    (mod_hash_key_t)cl->cct_name_addr);
			free_vhcache_client(cl);
		} else {
			enqueue_vhcache_client(cache, cl);
		}
		cl = next_cl;
	}

	/* Detach the phci list, then re-queue the survivors. */
	ph = cache->vhcache_phci_head;
	cache->vhcache_phci_head = cache->vhcache_phci_tail = NULL;
	while (ph != NULL) {
		next_ph = ph->cphci_next;
		if (ph->cphci_phci == NULL)
			free_vhcache_phci(ph);
		else
			enqueue_vhcache_phci(cache, ph);
		ph = next_ph;
	}

	cache->vhcache_clean_time = ddi_get_lbolt64();
	rw_exit(&cache->vhcache_lock);
	vhcache_dirty(vhc);
}

/*
 * Remove all stale entries from the vhci cache of every registered vhci.
 * Called by i_ddi_clean_devices_files() during the execution of devfsadm -C
 */
void
mdi_clean_vhcache(void)
{
	mdi_vhci_t *cur;

	mutex_enter(&mdi_mutex);
	cur = mdi_vhci_head;
	while (cur != NULL) {
		/*
		 * Bump the reference count before dropping mdi_mutex, and
		 * give the reference back once the mutex has been re-taken.
		 */
		cur->vh_refcnt++;
		mutex_exit(&mdi_mutex);

		clean_vhcache(cur->vh_config);

		mutex_enter(&mdi_mutex);
		cur->vh_refcnt--;
		cur = cur->vh_next;
	}
	mutex_exit(&mdi_mutex);
}

/*
 * mdi_vhci_walk_clients():
 *		Walker routine to traverse client dev_info nodes
 * ddi_walk_devs(ddi_get_child(vdip), f, arg) would visit the entire tree
 * below each client, including nexus devices, which we don't want.
 * So only the immediate children of vdip are visited, starting from the
 * first client.  The callback runs with the vhci client lock and the
 * client's own lock held; the walk stops as soon as the callback returns
 * anything other than DDI_WALK_CONTINUE.
 */
void
mdi_vhci_walk_clients(dev_info_t *vdip,
    int (*f)(dev_info_t *, void *), void *arg)
{
	mdi_vhci_t	*vh = i_devi_get_vhci(vdip);
	dev_info_t	*cdip, *next_dip;
	mdi_client_t	*ct;
	int		walk_rv;

	MDI_VHCI_CLIENT_LOCK(vh);
	for (cdip = ddi_get_child(vdip); cdip != NULL; cdip = next_dip) {
		ct = i_devi_get_client(cdip);
		MDI_CLIENT_LOCK(ct);

		walk_rv = (*f)(cdip, arg);
		if (walk_rv == DDI_WALK_CONTINUE)
			next_dip = ddi_get_next_sibling(cdip);
		else
			next_dip = NULL;

		MDI_CLIENT_UNLOCK(ct);
	}
	MDI_VHCI_CLIENT_UNLOCK(vh);
}

/*
 * mdi_vhci_walk_phcis():
 *		Walker routine to traverse phci dev_info nodes
 * The callback is invoked on each phci's dip with the vhci phci lock and
 * that phci's own lock held; the walk stops as soon as the callback
 * returns anything other than DDI_WALK_CONTINUE.
 */
void
mdi_vhci_walk_phcis(dev_info_t *vdip,
    int (*f)(dev_info_t *, void *), void *arg)
{
	mdi_vhci_t	*vh = i_devi_get_vhci(vdip);
	mdi_phci_t	*cur, *following;
	int		walk_rv;

	MDI_VHCI_PHCI_LOCK(vh);
	for (cur = vh->vh_phci_head; cur != NULL; cur = following) {
		MDI_PHCI_LOCK(cur);

		walk_rv = (*f)(cur->ph_dip, arg);
		following = (walk_rv == DDI_WALK_CONTINUE) ?
		    cur->ph_next : NULL;

		MDI_PHCI_UNLOCK(cur);
	}
	MDI_VHCI_PHCI_UNLOCK(vh);
}


/*
 * mdi_walk_vhcis():
 *		Walker routine to traverse vhci dev_info nodes
 * The callback is invoked on the dip of each registered vhci with
 * mdi_mutex dropped; the walk stops as soon as the callback returns
 * anything other than DDI_WALK_CONTINUE.
 */
void
mdi_walk_vhcis(int (*f)(dev_info_t *, void *), void *arg)
{
	mdi_vhci_t	*cur;
	int		walk_rv;

	mutex_enter(&mdi_mutex);

	/* Scan the already registered vhcis. */
	for (cur = mdi_vhci_head; cur != NULL; cur = cur->vh_next) {
		/* The reference count is raised while the mutex is dropped. */
		cur->vh_refcnt++;
		mutex_exit(&mdi_mutex);

		walk_rv = (*f)(cur->vh_dip, arg);

		mutex_enter(&mdi_mutex);
		cur->vh_refcnt--;

		if (walk_rv != DDI_WALK_CONTINUE)
			break;
	}

	mutex_exit(&mdi_mutex);
}

/*
 * i_mdi_log_sysevent():
 *		Logs events for pickup by syseventd
 * The event carries the driver name, driver major, instance, device
 * pathname and the supplied class string of dip.  If any attribute cannot
 * be added, no event is logged.
 */
static void
i_mdi_log_sysevent(dev_info_t *dip, char *ph_vh_class, char *subclass)
{
	char		*devpath;
	nvlist_t	*attrs;

	if (nvlist_alloc(&attrs, NV_UNIQUE_NAME_TYPE,
	    KM_SLEEP) != DDI_SUCCESS) {
		MDI_DEBUG(1, (MDI_WARN, dip, "!unable to send sysevent"));
		return;
	}

	devpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
	(void) ddi_pathname(dip, devpath);

	/* The && chain stops at the first attribute that fails to add. */
	if (nvlist_add_string(attrs, DDI_DRIVER_NAME,
	    ddi_driver_name(dip)) == DDI_SUCCESS &&
	    nvlist_add_int32(attrs, DDI_DRIVER_MAJOR,
	    (int32_t)ddi_driver_major(dip)) == DDI_SUCCESS &&
	    nvlist_add_int32(attrs, DDI_INSTANCE,
	    (int32_t)ddi_get_instance(dip)) == DDI_SUCCESS &&
	    nvlist_add_string(attrs, DDI_PATHNAME,
	    devpath) == DDI_SUCCESS &&
	    nvlist_add_string(attrs, DDI_CLASS,
	    ph_vh_class) == DDI_SUCCESS) {
		(void) ddi_log_sysevent(dip, DDI_VENDOR_SUNW, EC_DDI,
		    subclass, attrs, NULL, DDI_SLEEP);
	}

	kmem_free(devpath, MAXPATHLEN);
	nvlist_free(attrs);
}

/*
 * mdi_get_phci_driver_list():
 *		Return the phci driver list obtained from
 *		get_phci_driver_list() for the given vhci class, resized
 *		from max_elements to cur_elements entries.  *ndrivers is set
 *		to the number of entries.  Returns NULL when no list was
 *		obtained.
 *		NOTE(review): presumably the result is released with
 *		mdi_free_phci_driver_list() - confirm against callers.
 */
char **
mdi_get_phci_driver_list(char *vhci_class, int	*ndrivers)
{
	char	**full_list, **trimmed_list = NULL;
	int	*root_support;
	int	used, allocated;

	get_phci_driver_list(vhci_class, &full_list, &root_support,
	    &used, &allocated);

	if (full_list != NULL) {
		/* Only the names are returned; drop the root support list. */
		kmem_free(root_support, sizeof (int) * allocated);
		trimmed_list = mdi_realloc(full_list,
		    sizeof (char *) * allocated, sizeof (char *) * used);
	}

	*ndrivers = used;
	return (trimmed_list);
}

void
mdi_free_phci_driver_list(char **driver_list, int ndrivers)
{
	char	**p;
	int	i;

	if (driver_list) {
		for (i = 0, p = driver_list; i < ndrivers; i++, p++)
			kmem_free(*p, strlen(*p) + 1);
		kmem_free(driver_list, sizeof (char *) * ndrivers);
	}
}

/*
 * mdi_is_dev_supported():
 *		function called by pHCI bus config operation to determine if a
 *		device should be represented as a child of the vHCI or the
 *		pHCI.  This decision is made by the vHCI, using cinfo identity
 *		information passed by the pHCI - specifics of the cinfo
 *		representation are by agreement between the pHCI and vHCI.
 * Return Values:
 *		MDI_FAILURE if pdip is not a pHCI, if no vHCI is found for
 *		the class, or if the vHCI has no vo_is_dev_supported op;
 *		otherwise the value returned by that op.
 */
int
mdi_is_dev_supported(char *class, dev_info_t *pdip, void *cinfo)
{
	mdi_vhci_t	*vhci;

	ASSERT(class && pdip);

	/*
	 * For dev_supported, mdi_phci_register() must have established pdip as
	 * a pHCI.
	 *
	 * NOTE: mdi_phci_register() does "mpxio-disable" processing, and
	 * MDI_PHCI(pdip) will return false if mpxio is disabled.
	 */
	if (!MDI_PHCI(pdip))
		return (MDI_FAILURE);

	/* Return MDI_FAILURE if vHCI does not support asking the question. */
	vhci = (mdi_vhci_t *)i_mdi_vhci_class2vhci(class);
	if (vhci == NULL)
		return (MDI_FAILURE);
	if (vhci->vh_ops->vo_is_dev_supported == NULL)
		return (MDI_FAILURE);

	/* Return vHCI answer */
	return (vhci->vh_ops->vo_is_dev_supported(vhci->vh_dip, pdip, cinfo));
}

/*
 * mdi_dc_return_dev_state():
 *		Map the state of a pathinfo node onto a devctl device state
 *		and copy it out to dcp->cpyout_buf.
 * Return Values:
 *		MDI_SUCCESS
 *		MDI_FAILURE if pip or dcp is NULL, or if the copyout fails
 */
int
mdi_dc_return_dev_state(mdi_pathinfo_t *pip, struct devctl_iocdata *dcp)
{
	uint_t state_out = 0;
	dev_info_t *client_dip;

	if (pip == NULL || dcp == NULL)
		return (MDI_FAILURE);

	client_dip = mdi_pi_get_client(pip);

	switch (mdi_pi_get_state(pip)) {
	case MDI_PATHINFO_STATE_INIT:
	case MDI_PATHINFO_STATE_FAULT:
		state_out = DEVICE_DOWN;
		break;
	case MDI_PATHINFO_STATE_ONLINE:
		state_out = DEVICE_ONLINE;
		/* An online path whose client is still referenced is busy. */
		if (client_dip != NULL &&
		    devi_stillreferenced(client_dip) == DEVI_REFERENCED)
			state_out |= DEVICE_BUSY;
		break;
	case MDI_PATHINFO_STATE_STANDBY:
		state_out = DEVICE_ONLINE;
		break;
	case MDI_PATHINFO_STATE_OFFLINE:
		state_out = DEVICE_OFFLINE;
		break;
	default:
		/* Any other state leaves state_out at 0. */
		ASSERT(MDI_PI(pip)->pi_state);
	}

	if (copyout(&state_out, dcp->cpyout_buf, sizeof (uint_t)) != 0)
		return (MDI_FAILURE);

	return (MDI_SUCCESS);
}