OpenSolaris_b135/lib/lvm/libmeta/common/meta_mirror_resync.c

Compare this file to the similar file:
Show the results in this format:

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * mirror operations
 */

#include <meta.h>
#include <sys/lvm/md_mirror.h>
#include <thread.h>

extern	int	md_in_daemon;
extern md_mn_client_list_t *mdmn_clients;

/*
 * chain of mirrors
 *
 * One node describes one mirror.  The routines in this file keep an array
 * of these chains indexed by pass number (MD_PASS_MAX + 1 entries).
 * Nodes are allocated in setup_units() and released by free_units();
 * namep is copied from the mirror name list entry and is not freed
 * together with the node.
 */
typedef struct mm_unit_list {
	struct mm_unit_list	*next;	/* next in chain, NULL ends the chain */
	mdname_t		*namep;	/* mirror name */
	mm_pass_num_t		pass;	/* pass number (index of the chain) */
	uint_t			done;	/* non-zero: no longer polled */
} mm_unit_list_t;

/*
 * meta_mirror_resync:
 * ------------------
 * Issue one resync command to a single mirror via MD_IOCSETSYNC.
 *
 * Input Parameters:
 *	sp	set the mirror belongs to (must not be NULL)
 *	mirnp	name of the mirror
 *	size	handed to the driver as ri_copysize
 *	cmd	resync command, translated into ri_flags below
 * Output Parameters:
 *	ep	error return structure
 *
 * Returns 0 on success.  Returns -1 when the driver name cannot be
 * obtained; otherwise returns whatever the md error routine that filled
 * in ep returned.
 *
 * meta_lock for this set should be held on entry.
 */
int
meta_mirror_resync(
	mdsetname_t		*sp,
	mdname_t		*mirnp,
	daddr_t			size,
	md_error_t		*ep,
	md_resync_cmd_t		cmd	/* Start/Block/Unblock/Kill */
)
{
	md_resync_ioctl_t	req;
	char			*drvname;

	/* the caller must hand us the set that owns the mirror */
	assert(sp != NULL);
	assert(sp->setno == MD_MIN2SET(meta_getminor(mirnp->dev)));

	/* only mirror metadevices can be resynced */
	drvname = metagetmiscname(mirnp, ep);
	if (drvname == NULL)
		return (-1);
	if (strcmp(drvname, MD_MIRROR) != 0)
		return (mdmderror(ep, MDE_NOT_MM, meta_getminor(mirnp->dev),
		    mirnp->cname));

	/* build the ioctl request */
	(void) memset(&req, 0, sizeof (req));
	MD_SETDRIVERNAME(&req, MD_MIRROR, sp->setno);
	req.ri_mnum = meta_getminor(mirnp->dev);
	req.ri_copysize = size;

	/* map the command onto the driver's flag bits */
	switch (cmd) {
	case MD_RESYNC_START:
		req.ri_flags = 0;
		break;
	case MD_RESYNC_FORCE_MNSTART:
		/* flags are still zero from the memset above */
		req.ri_flags = MD_RI_RESYNC_FORCE_MNSTART;
		break;
	case MD_RESYNC_KILL:
		req.ri_flags = MD_RI_KILL;
		break;
	case MD_RESYNC_KILL_NO_WAIT:
		req.ri_flags = MD_RI_KILL | MD_RI_NO_WAIT;
		break;
	case MD_RESYNC_BLOCK:
		req.ri_flags = MD_RI_BLOCK;
		break;
	case MD_RESYNC_UNBLOCK:
		req.ri_flags = MD_RI_UNBLOCK;
		break;
	default:
		/* TODO: Add new error MDE_BAD_RESYNC_FLAGS */
		return (mderror(ep, MDE_BAD_RESYNC_OPT, mirnp->cname));
	}

	/* hand the request to the driver */
	if (metaioctl(MD_IOCSETSYNC, &req, &req.mde, mirnp->cname) == 0)
		return (0);

	/* move the driver's error into the caller's structure */
	return (mdstealerror(ep, &req.mde));
}

/*
 * free_units:	release every node on every per-pass chain and leave each
 *		chain head set to NULL.  Only the nodes are freed, not the
 *		names they point at.
 */
static void
free_units(
	mm_unit_list_t	*mirrors[MD_PASS_MAX + 1]
)
{
	uint_t		pass;

	for (pass = 0; pass <= MD_PASS_MAX; pass++) {
		mm_unit_list_t	*cur = mirrors[pass];

		/* walk the chain, remembering the successor before freeing */
		while (cur != NULL) {
			mm_unit_list_t	*following = cur->next;

			Free(cur);
			cur = following;
		}
		mirrors[pass] = NULL;
	}
}

/*
 * setup_units:	build lists of units for each pass
 *
 * Every mirror in the set whose unit structure can be read is pushed onto
 * the head of mirrors[pass_num]; an out-of-range pass number is filed
 * under MD_PASS_MAX.
 *
 * Returns 0 when every mirror was queued, -1 when the mirror names could
 * not be obtained or when at least one mirror could not be read.  In the
 * latter case the chains may still hold the mirrors that were read
 * successfully, so the caller remains responsible for free_units().
 */
static int
setup_units(
	mdsetname_t	*sp,
	mm_unit_list_t	*mirrors[MD_PASS_MAX + 1],
	md_error_t	*ep
)
{
	mdnamelist_t	*names = NULL;
	mdnamelist_t	*np;
	int		status = 0;

	/* should have a set */
	assert(sp != NULL);

	/* collect the names of all mirrors in the set */
	if (meta_get_mirror_names(sp, &names, 0, ep) < 0)
		return (-1);

	np = names;
	while (np != NULL) {
		mdname_t	*mirnamep = np->namep;
		md_mirror_t	*mirrorp;
		mm_unit_list_t	*unit;
		mm_pass_num_t	pass;

		/* step on now so that the skip below needs no bookkeeping */
		np = np->next;

		/* get unit structure; record, but ignore errors */
		mirrorp = meta_get_mirror(sp, mirnamep, ep);
		if (mirrorp == NULL) {
			status = -1;
			continue;
		}

		/* out-of-range pass numbers all land in the last pass */
		pass = mirrorp->pass_num;
		if ((pass < 0) || (pass > MD_PASS_MAX))
			pass = MD_PASS_MAX;

		/* save info and push onto the front of that pass's chain */
		unit = Zalloc(sizeof (*unit));
		unit->namep = mirnamep;
		unit->pass = pass;
		unit->next = mirrors[pass];
		mirrors[pass] = unit;
	}

	/* cleanup, return error */
	metafreenamelist(names);
	return (status);
}

/*
 * meta_mirror_resync_all:
 * ----------------------
 * Resync all mirrors of a set (in background).
 *
 * The mirrors are grouped by pass number.  All resyncs of one pass are
 * started together; the next pass is not started until every resync of
 * the current pass has finished, except that nothing waits for the
 * highest populated pass.
 *
 * Input Parameters:
 *	sp	set to scan for mirrors (must not be NULL)
 *	size	copy size handed on to each resync request
 * Output Parameters:
 *	ep	error return structure
 *
 * Returns:
 *	-1	the list of mirrors could not be built completely
 *	0	in the parent once md_daemonize() reported a child
 *	rval	(0 or -1) when md_daemonize() failed and the passes were
 *		run by the calling process itself
 * The child created by md_daemonize() never returns; it calls exit(0).
 */
int
meta_mirror_resync_all(
	mdsetname_t	*sp,
	daddr_t		size,
	md_error_t	*ep
)
{
	mm_unit_list_t	*mirrors[MD_PASS_MAX + 1];
	mm_pass_num_t	pass, max_pass;
	int		rval = 0, fval;

	/* should have a set */
	assert(sp != NULL);

	/*
	 * Get mirrors.  setup_units() may fail after it has already queued
	 * some units, so release whatever it built before giving up.
	 */
	(void) memset(mirrors, 0, sizeof (mirrors));
	if (setup_units(sp, mirrors, ep) != 0) {
		free_units(mirrors);
		return (-1);
	}

	/* fork a process */
	if ((fval = md_daemonize(sp, ep)) != 0) {
		/*
		 * md_daemonize will fork off a process.  This is the
		 * parent or error.
		 */
		if (fval > 0) {
			free_units(mirrors);
			return (0);
		}
		/* error: carry on and do the work in this process */
		mdclrerror(ep);
	}
	/*
	 * Closing stdin/out/err here.
	 * In case this was called thru rsh, the calling process on the other
	 * side will know, it doesn't have to wait until all the resyncs have
	 * finished.
	 * Also initialise the rpc client pool so that this process will use
	 * a unique pool of clients. If we don't do this, all of the forked
	 * clients will end up using the same pool of clients which can result
	 * in hung clients.
	 */
	if (meta_is_mn_set(sp, ep)) {
		(void) close(0);
		(void) close(1);
		(void) close(2);
		mdmn_clients = NULL;
	}
	assert((fval == 0) || (fval == -1));

	/*
	 * Determine which pass level is the highest that contains mirrors to
	 * resync. We only need to wait for completion of earlier levels below
	 * this high watermark. If all mirrors are at the same pass level
	 * there is no requirement to wait for completion.
	 */
	max_pass = 1;
	for (pass = MD_PASS_MAX; pass > 1; --pass) {
		if (mirrors[pass] != NULL) {
			max_pass = pass;
			break;
		}
	}

	/*
	 * max_pass now contains the highest pass-level with resyncable mirrors
	 */

	/* do passes */
	for (pass = 1; (pass <= MD_PASS_MAX); ++pass) {
		int			dispatched = 0;
		unsigned		howlong = 1;
		mm_unit_list_t		*lp;

		/* skip empty passes */
		if (mirrors[pass] == NULL)
			continue;

		/* dispatch all resyncs in pass */
		for (lp = mirrors[pass]; (lp != NULL); lp = lp->next) {
			int	failed;

			if (meta_is_mn_set(sp, ep)) {
				failed = (meta_mn_send_setsync(sp, lp->namep,
				    size, ep) != 0);
			} else {
				failed = (meta_mirror_resync(sp, lp->namep,
				    size, ep, MD_RESYNC_START) != 0);
			}

			if (failed) {
				/* nothing was started, so nothing to poll */
				rval = -1;
				lp->done = 1;
			} else {
				++dispatched;
			}
		}

		/*
		 * Wait for them to finish iff we are at a level lower than
		 * max_pass. This orders the resyncs into distinct levels.
		 * I.e. level 2 resyncs won't start until all level 1 ones
		 * have completed.
		 */
		if (pass == max_pass)
			continue;

		while (dispatched > 0) {

			/* wait a while */
			(void) sleep(howlong);

			/* see if any finished */
			for (lp = mirrors[pass]; lp != NULL; lp = lp->next) {
				md_resync_ioctl_t	ri;

				if (lp->done)
					continue;

				(void) memset(&ri, '\0', sizeof (ri));
				ri.ri_mnum = meta_getminor(lp->namep->dev);
				MD_SETDRIVERNAME(&ri, MD_MIRROR, sp->setno);
				if (metaioctl(MD_IOCGETSYNC, &ri, &ri.mde,
				    lp->namep->cname) != 0) {
					/* cannot query it: stop polling it */
					(void) mdstealerror(ep, &ri.mde);
					rval = -1;
					lp->done = 1;
					--dispatched;
				} else if (! (ri.ri_flags & MD_RI_INPROGRESS)) {
					lp->done = 1;
					--dispatched;
				}
			}

			/* wait a little longer next time (at most 10s) */
			if (howlong < 10)
				++howlong;
		}
	}

	/* cleanup, return success */
	free_units(mirrors);
	if (fval == 0)  /* we are the child process so exit */
		exit(0);
	return (rval);
}

/*
 * meta_mn_mirror_resync_all:
 * -------------------------
 * Resync all mirrors associated with given set (arg). Called when master
 * node is adding a node to a diskset.  Only want to initiate the resync on
 * the current node.
 */
void *
meta_mn_mirror_resync_all(void *arg)
{
	set_t		setno = *((set_t *)arg);
	mdsetname_t	*sp;
	mm_unit_list_t	*mirrors[MD_PASS_MAX + 1];
	mm_pass_num_t	pass, max_pass;
	md_error_t	mde = mdnullerror;
	int		fval;


	/* should have a set */
	assert(setno != NULL);

	if ((sp = metasetnosetname(setno, &mde)) == NULL) {
		mde_perror(&mde, "");
		return (NULL);
	}

	if (!(meta_is_mn_set(sp, &mde))) {
		mde_perror(&mde, "");
		return (NULL);
	}

	/* fork a process */
	if ((fval = md_daemonize(sp, &mde)) != 0) {
		/*
		 * md_daemonize will fork off a process.  The is the
		 * parent or error.
		 */
		if (fval > 0) {
			return (NULL);
		}
		mde_perror(&mde, "");
		return (NULL);
	}
	/*
	 * Child process should never return back to rpc.metad, but
	 * should exit.
	 * Flush all internally cached data inherited from parent process
	 * since cached data will be cleared when parent process RPC request
	 * has completed (which is possibly before this child process
	 * can complete).
	 * Child process can retrieve and cache its own copy of data from
	 * rpc.metad that won't be changed by the parent process.
	 *
	 * Reset md_in_daemon since this child will be a client of rpc.metad
	 * not part of the rpc.metad daemon itself.
	 * md_in_daemon is used by rpc.metad so that libmeta can tell if
	 * this thread is rpc.metad or any other thread.  (If this thread
	 * was rpc.metad it could use some short circuit code to get data
	 * directly from rpc.metad instead of doing an RPC call to rpc.metad).
	 */
	md_in_daemon = 0;
	metaflushsetname(sp);
	sr_cache_flush_setno(setno);
	if ((sp = metasetnosetname(setno, &mde)) == NULL) {
		mde_perror(&mde, "");
		md_exit(sp, 1);
	}

	if (meta_lock(sp, TRUE, &mde) != 0) {
		mde_perror(&mde, "");
		md_exit(sp, 1);
	}

	/*
	 * Closing stdin/out/err here.
	 */
	(void) close(0);
	(void) close(1);
	(void) close(2);
	assert(fval == 0);

	/* get mirrors */
	(void) memset(mirrors, 0, sizeof (mirrors));
	if (setup_units(sp, mirrors, &mde) != 0) {
		(void) meta_unlock(sp, &mde);
		md_exit(sp, 1);
	}

	/*
	 * Determine which pass level is the highest that contains mirrors to
	 * resync. We only need to wait for completion of earlier levels below
	 * this high watermark. If all mirrors are at the same pass level
	 * there is no requirement to wait for completion.
	 */
	max_pass = 1;
	for (pass = MD_PASS_MAX; pass > 1; --pass) {
		if (mirrors[pass] != NULL) {
			max_pass = pass;
			break;
		}
	}

	/*
	 * max_pass now contains the highest pass-level with resyncable mirrors
	 */
	/* do passes */
	for (pass = 1; (pass <= MD_PASS_MAX); ++pass) {
		int			dispatched = 0;
		unsigned		howlong = 1;
		mm_unit_list_t		*lp;

		/* skip empty passes */
		if (mirrors[pass] == NULL)
			continue;

		/* dispatch all resyncs in pass */
		for (lp = mirrors[pass]; (lp != NULL); lp = lp->next) {
			if (meta_mirror_resync(sp, lp->namep, 0, &mde,
			    MD_RESYNC_FORCE_MNSTART) != 0) {
				mdclrerror(&mde);
				lp->done = 1;
			} else {
				++dispatched;
			}
		}

		/*
		 * Wait for them to finish iff we are at a level lower than
		 * max_pass. This orders the resyncs into distinct levels.
		 * I.e. level 2 resyncs won't start until all level 1 ones
		 * have completed.
		 */
		if (pass == max_pass)
			continue;

		howlong = 1;
		while (dispatched > 0) {

			/* wait a while */
			(void) sleep(howlong);

			/* see if any finished */
			for (lp = mirrors[pass]; lp != NULL; lp = lp->next) {
				md_resync_ioctl_t	ri;

				if (lp->done)
					continue;

				(void) memset(&ri, '\0', sizeof (ri));
				ri.ri_mnum = meta_getminor(lp->namep->dev);
				MD_SETDRIVERNAME(&ri, MD_MIRROR, sp->setno);
				if (metaioctl(MD_IOCGETSYNC, &ri, &ri.mde,
				    lp->namep->cname) != 0) {
					mdclrerror(&mde);
					lp->done = 1;
					--dispatched;
				} else if (! (ri.ri_flags & MD_RI_INPROGRESS)) {
					lp->done = 1;
					--dispatched;
				}
			}

			/* wait a little longer next time */
			if (howlong < 10)
				++howlong;
		}
	}

	/* cleanup, return success */
	free_units(mirrors);
	(void) meta_unlock(sp, &mde);
	md_exit(sp, 0);
	/*NOTREACHED*/
	return (NULL);
}

/*
 * meta_mirror_resync_process:
 * --------------------------
 * Modify any resync that is in progress on this node for the given set.
 *
 * Input Parameters:
 *	sp	setname to scan for mirrors
 *	cmd	action to take:
 *		MD_RESYNC_KILL	- kill all resync threads
 *		MD_RESYNC_BLOCK	- block all resync threads
 *		MD_RESYNC_UNBLOCK - resume all resync threads
 * Output Parameters
 *	ep	error return structure
 *
 * Nothing is done when the list of mirrors cannot be built completely.
 * Failures of individual mirrors are not reported through a return value;
 * ep is simply passed to every call in turn.
 *
 * meta_lock for this set should be held on entry.
 */
static void
meta_mirror_resync_process(mdsetname_t *sp, md_error_t *ep, md_resync_cmd_t cmd)
{
	mm_unit_list_t	*mirrors[MD_PASS_MAX + 1];
	mm_pass_num_t	pass;

	/*
	 * Grab all the mirrors from the set (if any).  setup_units() may
	 * fail after it has already queued some units, so release them
	 * before returning.
	 */
	(void) memset(mirrors, 0, sizeof (mirrors));
	if (setup_units(sp, mirrors, ep) != 0) {
		free_units(mirrors);
		return;
	}

	/* do passes */
	for (pass = 1; (pass <= MD_PASS_MAX); ++pass) {
		mm_unit_list_t		*lp;

		/* Process all resyncs in pass (an empty pass has no units) */
		for (lp = mirrors[pass]; (lp != NULL); lp = lp->next) {
			(void) meta_mirror_resync(sp, lp->namep, 0, ep,
			    cmd);
		}
	}

	/* Clear up mirror units */
	free_units(mirrors);
}

/*
 * meta_mirror_resync_process_all:
 * ------------------------------
 * Issue the given resync command to all mirrors contained in all multi-node
 * sets.
 *
 * Input Parameters:
 *	cmd	- MD_RESYNC_KILL, MD_RESYNC_BLOCK, MD_RESYNC_UNBLOCK
 *
 * Sets that cannot be looked up, are not multi-node, or cannot be locked
 * are skipped silently.  Each set is locked with meta_lock() for the
 * duration of the command and unlocked afterwards.
 */
static void
meta_mirror_resync_process_all(md_resync_cmd_t cmd)
{
	set_t		setno, max_sets;
	md_error_t	mde = mdnullerror;
	mdsetname_t	*this_sp;
	md_set_desc	*sd;

	/*
	 * Traverse all sets looking for multi-node capable ones.
	 */
	max_sets = get_max_sets(&mde);
	for (setno = 1; setno < max_sets; setno++) {
		/*
		 * Start each set with a clean error.  mdclrerror() is used
		 * rather than overwriting mde so that anything recorded by
		 * the previous iteration is released first.
		 */
		mdclrerror(&mde);

		if ((this_sp = metasetnosetname(setno, &mde)) == NULL)
			continue;
		if ((sd = metaget_setdesc(this_sp, &mde)) == NULL)
			continue;
		if (!MD_MNSET_DESC(sd))
			continue;
		if (meta_lock(this_sp, TRUE, &mde) != 0)
			continue;

		meta_mirror_resync_process(this_sp, &mde, cmd);
		(void) meta_unlock(this_sp, &mde);
	}

	/* nobody sees mde after this point; release whatever it holds */
	mdclrerror(&mde);
}

/*
 * meta_mirror_resync_kill_all:
 * ---------------------------
 * Abort any resync that is in progress on this node. Scan all sets for all
 * mirrors.
 * Note: this routine is provided for future use. For example to kill all
 *	 resyncs on a node this could be used as long as the
 *	 mddoors / rpc.mdcommd tuple is running on all members of the cluster.
 */
void
meta_mirror_resync_kill_all(void)
{
	meta_mirror_resync_process_all(MD_RESYNC_KILL);
}

/*
 * meta_mirror_resync_block_all:
 * ----------------------------
 * Block all resyncs that are in progress. This causes the resync state to
 * freeze on this machine, and can be resumed by calling
 * meta_mirror_resync_unblock_all.
 */
void
meta_mirror_resync_block_all(void)
{
	meta_mirror_resync_process_all(MD_RESYNC_BLOCK);
}

/*
 * meta_mirror_resync_unblock_all:
 * ------------------------------
 * Unblock all previously blocked resync threads on this node.
 */
void
meta_mirror_resync_unblock_all(void)
{
	meta_mirror_resync_process_all(MD_RESYNC_UNBLOCK);
}

/*
 * meta_mirror_resync_unblock:
 * --------------------------
 * Unblock any previously blocked resync threads for the given set.
 * Errors are not reported to the caller.
 * meta_lock for this set should be held on entry.
 */
void
meta_mirror_resync_unblock(mdsetname_t *sp)
{
	md_error_t	mde = mdnullerror;

	meta_mirror_resync_process(sp, &mde, MD_RESYNC_UNBLOCK);

	/* the error is private to this call; release whatever it holds */
	mdclrerror(&mde);
}

/*
 * meta_mirror_resync_kill:
 * -----------------------
 * Kill any resync threads running on mirrors in the given set.
 * Errors are not reported to the caller.
 * Called when releasing a set (meta_set_prv.c`halt_set)
 */
void
meta_mirror_resync_kill(mdsetname_t *sp)
{
	md_error_t	mde = mdnullerror;

	meta_mirror_resync_process(sp, &mde, MD_RESYNC_KILL);

	/* the error is private to this call; release whatever it holds */
	mdclrerror(&mde);
}