Net2/usr/src/lib/libc/db/btree/split.c

Compare this file to the similar file:
Show the results in this format:

/*-
 * Copyright (c) 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Mike Olson.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#if defined(LIBC_SCCS) && !defined(lint)
static char sccsid[] = "@(#)split.c	5.2 (Berkeley) 2/22/91";
#endif /* LIBC_SCCS and not lint */

#include <sys/types.h>
#include <db.h>
#include <stdlib.h>
#include <string.h>
#include "btree.h"

/*
 *  _BT_SPLIT -- Split a page into two pages.
 *
 *	Splits are caused by insertions, and propogate up the tree in
 *	the usual way.  The root page is always page 1 in the file on
 *	disk, so root splits are handled specially.  On entry to this
 *	routine, t->bt_curpage is the page to be split.
 *
 *	Parameters:
 *		t -- btree in which to do split.
 *
 *	Returns:
 *		RET_SUCCESS, RET_ERROR.
 *
 *	Side Effects:
 *		Changes the notion of the current page.
 */

int
_bt_split(t)
	BTREE_P t;
{
	BTHEADER *h;
	BTHEADER *left, *right;
	pgno_t nextpgno, parent;
	int nbytes, len;
	IDATUM *id;
	DATUM *d;
	char *src;
	IDATUM *new;
	pgno_t oldchain;
	u_char flags;

	h = (BTHEADER *) t->bt_curpage;

	/* split root page specially, since it must remain page 1 */
	if (h->h_pgno == P_ROOT) {
		return (_bt_splitroot(t));
	}

	/*
	 *  This is a little complicated.  We go to some trouble to
	 *  figure out which of the three possible cases -- in-memory tree,
	 *  disk tree (no cache), and disk tree (cache) -- we have, in order
	 *  to avoid unnecessary copying.  If we have a disk cache, then we
	 *  have to do some extra copying, though, since the cache code
	 *  manages buffers externally to this code.
	 */

	if (ISDISK(t) && ISCACHE(t)) {
		if ((left = (BTHEADER *) malloc((unsigned) t->bt_psize))
		    == (BTHEADER *) NULL)
			return (RET_ERROR);
		left->h_pgno = left->h_prevpg = left->h_nextpg = P_NONE;
		left->h_flags = t->bt_curpage->h_flags;
		left->h_lower = (index_t)
			  (((char *) &(left->h_linp[0])) - ((char *) left));
		left->h_upper = t->bt_psize;

	} else {
		if ((left = _bt_allocpg(t)) == (BTHEADER *) NULL)
			return (RET_ERROR);
	}
	left->h_pgno = h->h_pgno;

	if ((right = _bt_allocpg(t)) == (BTHEADER *) NULL)
		return (RET_ERROR);
	right->h_pgno = ++(t->bt_npages);

	/* now do the split */
	if (_bt_dopsplit(t, left, right) == RET_ERROR)
		return (RET_ERROR);

	right->h_prevpg = left->h_pgno;
	nextpgno = right->h_nextpg = h->h_nextpg;
	left->h_nextpg = right->h_pgno;
	left->h_prevpg = h->h_prevpg;

	/* okay, now use the left half of the page as the new page */
	if (ISDISK(t) && ISCACHE(t)) {
		(void) bcopy((char *) left, (char *) t->bt_curpage,
			     (int) t->bt_psize);
		(void) free ((char *) left);
		left = t->bt_curpage;
	} else {
		(void) free((char *) t->bt_curpage);
		t->bt_curpage = left;
	}

	/*
	 *  Write the new pages out.  We need them to stay where they are
	 *  until we're done updating the parent pages.
	 */

	if (_bt_write(t, left, NORELEASE) == RET_ERROR)
		return (RET_ERROR);
	if (_bt_write(t, right, NORELEASE) == RET_ERROR)
		return (RET_ERROR);

	/* update 'prev' pointer of old neighbor of left */
	if (nextpgno != P_NONE) {
		if (_bt_getpage(t, nextpgno) == RET_ERROR)
			return (RET_ERROR);
		h = t->bt_curpage;
		h->h_prevpg = right->h_pgno;
		h->h_flags |= F_DIRTY;
	}

	if ((parent = _bt_pop(t)) != P_NONE) {
		if (right->h_flags & F_LEAF) {
			d = (DATUM *) GETDATUM(right, 0);
			len = d->d_ksize;
			if (d->d_flags & D_BIGKEY) {
				bcopy(&(d->d_bytes[0]),
				      (char *) &oldchain,
				      sizeof(oldchain));
				if (_bt_markchain(t, oldchain) == RET_ERROR)
					return (RET_ERROR);
				src = (char *) &oldchain;
				flags = D_BIGKEY;
			} else {
				src = (char *) &(d->d_bytes[0]);
				flags = 0;
			}
		} else {
			id = (IDATUM *) GETDATUM(right, 0);
			len = id->i_size;
			flags = id->i_flags;
			src = (char *) &(id->i_bytes[0]);
		}
		nbytes = len + (sizeof(IDATUM) - sizeof(char));
		new = (IDATUM *) malloc((unsigned) nbytes);
		if (new == (IDATUM *) NULL)
			return (RET_ERROR);
		new->i_size = len;
		new->i_pgno = right->h_pgno;
		new->i_flags = flags;
		if (len > 0)
			(void) bcopy(src, (char *) &(new->i_bytes[0]), len);

		nbytes = LONGALIGN(nbytes) + sizeof(index_t);
		if (_bt_getpage(t, parent) == RET_ERROR)
			return (RET_ERROR);

		h = t->bt_curpage;

		/*
		 *  Split the parent if we need to, then reposition the
		 *  tree's current page pointer for the new datum.
		 */
		if ((h->h_upper - h->h_lower) < nbytes) {
			if (_bt_split(t) == RET_ERROR)
				return (RET_ERROR);
			if (_bt_reposition(t, new, parent, right->h_prevpg)
			      == RET_ERROR)
				return (RET_ERROR);
		}

		/* okay, now insert the new idatum */
		if (_bt_inserti(t, new, right->h_prevpg) == RET_ERROR)
			return (RET_ERROR);
	}

	/*
	 *  Okay, split is done; don't need right page stapled down anymore.
	 *  The page we call 'left' above is the new version of the old
	 *  (split) page, so we can't release it.
	 */

	if (_bt_release(t, right) == RET_ERROR)
		return (RET_ERROR);
	if (ISDISK(t) && !ISCACHE(t))
		(void) free((char *) right);

	return (RET_SUCCESS);
}

/*
 *  _BT_REPOSITION -- Reposition the current page pointer of a btree.
 *
 *	After splitting a node in the tree in order to make room for
 *	an insertion, we need to figure out which page after the split
 *	should get the item we want to insert.  This routine positions
 *	the tree's current page pointer appropriately.
 *
 *	Parameters:
 *		t -- tree to position
 *		new -- the item we want to insert
 *		parent -- parent of the node that we just split
 *		prev -- page number of item directly to the left of
 *			new's position in the tree.
 *
 *	Returns:
 *		RET_SUCCESS, RET_ERROR.
 *
 *	Side Effects:
 *		None.
 */

int
_bt_reposition(t, new, parent, prev)
	BTREE_P t;
	IDATUM *new;
	pgno_t parent;
	pgno_t prev;
{
	index_t i, next;
	IDATUM *idx;

	if (parent == P_ROOT) {

		/*
		 *  If we just split the root page, then there are guaranteed
		 *  to be exactly two IDATUMs on it.  Look at both of them
		 *  to see if they point to the page that we want.
		 */

		if (_bt_getpage(t, parent) == RET_ERROR)
			return (RET_ERROR);

		next = NEXTINDEX(t->bt_curpage);
		for (i = 0; i < next; i++) {
			idx = (IDATUM *) GETDATUM(t->bt_curpage, i);
			if (_bt_getpage(t, idx->i_pgno) == RET_ERROR)
				return (RET_ERROR);
			if (_bt_isonpage(t, new, prev) == RET_SUCCESS)
				return (RET_SUCCESS);
			if (_bt_getpage(t, parent) == RET_ERROR)
				return (RET_ERROR);
		}
	} else {

		/*
		 *  Get the parent page -- which is where the new item would
		 *  have gone -- and figure out whether the new item now goes
		 *  on the parent, or the page immediately to the right of
		 *  the parent.
		 */

		if (_bt_getpage(t, parent) == RET_ERROR)
			return (RET_ERROR);
		if (_bt_isonpage(t, new, prev) == RET_SUCCESS)
			return (RET_SUCCESS);
		if (_bt_getpage(t, t->bt_curpage->h_nextpg) == RET_ERROR)
			return (RET_ERROR);
		if (_bt_isonpage(t, new, prev) == RET_SUCCESS)
			return (RET_SUCCESS);
	}
	return (RET_ERROR);
}

/*
 *  _BT_ISONPAGE -- Is the IDATUM for a given page number on the current page?
 *
 *	This routine is used by _bt_reposition to decide whether the current
 *	page is the correct one on which to insert a new item.
 *
 *	Parameters:
 *		t -- tree to check
 *		new -- the item that will be inserted (used for binary search)
 *		prev -- page number of page whose IDATUM is immediately to
 *			the left of new's position in the tree.
 *
 *	Returns:
 *		RET_SUCCESS, RET_ERROR (corresponding to TRUE, FALSE).
 */

int
_bt_isonpage(t, new, prev)
	BTREE_P t;
	IDATUM *new;
	pgno_t prev;
{
	BTHEADER *h = (BTHEADER *) t->bt_curpage;
	index_t i, next;
	IDATUM *idx;

	i = _bt_binsrch(t, &(new->i_bytes[0]));
	while (i != 0 && _bt_cmp(t, &(new->i_bytes[0]), i) == 0)
		--i;
	next = NEXTINDEX(h);
	idx = (IDATUM *) GETDATUM(h, i);
	while (i < next && idx->i_pgno != prev) {
		i++;
		idx = (IDATUM *) GETDATUM(h, i);
	}
	if (idx->i_pgno == prev)
		return (RET_SUCCESS);
	else
		return (RET_ERROR);
}

/*
 *  _BT_SPLITROOT -- Split the root of a btree.
 *
 *	The root page for a btree is always page one.  This means that in
 *	order to split the root, we need to do extra work.
 *
 *	Parameters:
 *		t -- tree to split
 *
 *	Returns:
 *		RET_SUCCESS, RET_ERROR.
 *
 *	Side Effects:
 *		Splits root upward in the usual way, adding two new pages
 *		to the tree (rather than just one, as in usual splits).
 */

int
_bt_splitroot(t)
	BTREE_P t;
{
	BTHEADER *h = t->bt_curpage;
	BTHEADER *left, *right;
	IDATUM *id;
	BTHEADER *where_h;
	char *src, *dest;
	int len, nbytes;
	u_long was_leaf;
	pgno_t oldchain;
	u_char flags;

	/* get two new pages for the split */
	if ((left = _bt_allocpg(t)) == (BTHEADER *) NULL)
		return (RET_ERROR);
	left->h_pgno = ++(t->bt_npages);
	if ((right = _bt_allocpg(t)) == (BTHEADER *) NULL)
		return (RET_ERROR);
	right->h_pgno = ++(t->bt_npages);

	/* do the split */
	if (_bt_dopsplit(t, left, right) == RET_ERROR)
		return (RET_ERROR);

	/* connect the new pages correctly */
	right->h_prevpg = left->h_pgno;
	left->h_nextpg = right->h_pgno;

	/*
	 *  Write the child pages out now.  We need them to remain
	 *  where they are until we finish updating parent pages,
	 *  however.
	 */

	if (_bt_write(t, left, NORELEASE) == RET_ERROR)
		return (RET_ERROR);
	if (_bt_write(t, right, NORELEASE) == RET_ERROR)
		return (RET_ERROR);

	/* now change the root page into an internal page */
	was_leaf = (h->h_flags & F_LEAF);
	h->h_flags &= ~F_LEAF;
	h->h_lower = (index_t) (((char *) (&(h->h_linp[0]))) - ((char *) h));
	h->h_upper = (index_t) t->bt_psize;
	(void) bzero((char *) &(h->h_linp[0]), (int) (h->h_upper - h->h_lower));

	/* put two new keys on root page */
	where_h = left;
	while (where_h) {
		DATUM *data;
		IDATUM *idata;

		if (was_leaf) {
			data = (DATUM *) GETDATUM(where_h, 0);

			if (where_h == left) {
				len = 0;	/* first key in tree is null */
			} else {
				if (data->d_flags & D_BIGKEY) {
					bcopy(&(data->d_bytes[0]),
					      (char *) &oldchain,
					      sizeof(oldchain));
					if (_bt_markchain(t, oldchain) == RET_ERROR)
						return (RET_ERROR);
					src = (char *) &oldchain;
					flags = D_BIGKEY;
				} else {
					src = (char *) &(data->d_bytes[0]);
					flags = 0;
				}
				len = data->d_ksize;
			}
		} else {
			idata = (IDATUM *) GETDATUM(where_h, 0);
			len = idata->i_size;
			flags = idata->i_flags;
			src = &(idata->i_bytes[0]);
		}
		dest = ((char *) h) + h->h_upper;
		nbytes = len + (sizeof (IDATUM) - sizeof(char));
		dest -= LONGALIGN(nbytes);
		id = (IDATUM *) dest;
		id->i_size = len;
		id->i_pgno = where_h->h_pgno;
		id->i_flags = flags;
		if (len > 0)
			(void) bcopy((char *) src, (char *) &(id->i_bytes[0]), len);
		dest -= ((int) h);
		h->h_linp[NEXTINDEX(h)] = (index_t) dest;
		h->h_upper = (index_t) dest;
		h->h_lower += sizeof(index_t);

		/* next page */
		if (where_h == left)
			where_h = right;
		else
			where_h = (BTHEADER *) NULL;
	}

	if (_bt_release(t, left) == RET_ERROR)
		return (RET_ERROR);
	if (_bt_release(t, right) == RET_ERROR)
		return (RET_ERROR);

	/*
	 *  That's it, split is done.  If we're doing a non-cached disk
	 *  btree, we can free up the pages we allocated, as they're on
	 *  disk, now.
	 */

	if (ISDISK(t) && !ISCACHE(t)) {
		(void) free ((char *) left);
		(void) free ((char *) right);
	}

	h->h_flags |= F_DIRTY;

	return (RET_SUCCESS);
}

/*
 *  _BT_DOPSPLIT -- Do the work of splitting a page
 *
 *	This routine takes two page pointers and splits the data on the
 *	current page of the btree between them.
 *
 *	We do a lot of work here to handle duplicate keys on a page; we
 *	have to place these keys carefully to guarantee that later searches
 *	will find them correctly.  See comments in the code below for details.
 *
 *	Parameters:
 *		t -- tree to split
 *		left -- pointer to page to get left half of the data
 *		right -- pointer to page to get right half of the data
 *
 *	Returns:
 *		None.
 */

int
_bt_dopsplit(t, left, right)
	BTREE_P t;
	BTHEADER *left;
	BTHEADER *right;
{
	BTHEADER *h = t->bt_curpage;
	size_t psize;
	char *where;
	BTHEADER *where_h;
	index_t where_i;
	int nbytes, dsize, fixedsize, freespc;
	index_t i;
	index_t save_lower, save_upper, save_i;
	index_t switch_i;
	char *save_key;
	DATUM *d;
	CURSOR *c;
	index_t top;
	int free_save;
	pgno_t chain;
	int ignore;

	/*
	 *  Our strategy is to put half the bytes on each page.  We figure
	 *  out how many bytes we have total, and then move items until
	 *  the last item moved put at least 50% of the data on the left
	 *  page.
	 */
	save_key = (char *) NULL;
	psize = (int) t->bt_psize;
	where = ((char *) left) + psize;
	where_h = left;
	where_i = 0;
	nbytes = psize - (int) ((char *) &(h->h_linp[0]) - ((char *) h));
	freespc = nbytes;

	top = NEXTINDEX(h);
	if (h->h_flags & F_LEAF)
		fixedsize = (sizeof(DATUM) - sizeof(char));
	else
		fixedsize = (sizeof(IDATUM) - sizeof(char));

	save_key = (char *) NULL;

	/* move data */
	for (i = 0; i < top; i++) {

		/*
		 *  Internal and leaf pages have different layouts for
		 *  data items, but in both cases the first entry in the
		 *  data item is a size_t.
		 */
		d = (DATUM *) GETDATUM(h,i);
		if (h->h_flags & F_LEAF) {
			dsize = d->d_ksize + d->d_dsize + fixedsize;
		} else {
			dsize = d->d_ksize + fixedsize;
		}

		/*
		 *  If a page contains duplicate keys, we have to be
		 *  careful about splits.  A sequence of duplicate keys
		 *  may not begin in the middle of one page and end in
		 *  the middle of another; it must begin on a page boundary,
		 *  in order for searches on the internal nodes to work
		 *  correctly.
		 */
		if (where_h == left) {
			if (save_key == (char *) NULL) {
				if (h->h_flags & F_LEAF) {
					if (d->d_flags & D_BIGKEY) {
						free_save = TRUE;
						bcopy(&(d->d_bytes[0]),
						     (char *) &chain,
						     sizeof(chain));
						if (_bt_getbig(t, chain,
							       &save_key,
							       &ignore)
						    == RET_ERROR)
							return (RET_ERROR);
					} else {
						free_save = FALSE;
						save_key = (char *) &(d->d_bytes[0]);
					}
				} else {
					IDATUM *id = (IDATUM *) d;

					if (id->i_flags & D_BIGKEY) {
						free_save = TRUE;
						bcopy(&(id->i_bytes[0]),
						     (char *) &chain,
						     sizeof(chain));
						if (_bt_getbig(t, chain,
							       &save_key,
							       &ignore)
						    == RET_ERROR)
							return (RET_ERROR);
					} else {
						free_save = FALSE;
						save_key = (char *)
							&(id->i_bytes[0]);
					}
				}
				save_i = 0;
				save_lower = where_h->h_lower;
				save_upper = where_h->h_upper;
			} else {
				if (_bt_cmp(t, save_key, i) != 0) {
					save_lower = where_h->h_lower;
					save_upper = where_h->h_upper;
					save_i = i;
				}
				if (h->h_flags & F_LEAF) {
					if (free_save)
						(void) free(save_key);
					if (d->d_flags & D_BIGKEY) {
						free_save = TRUE;
						bcopy(&(d->d_bytes[0]),
						     (char *) &chain,
						     sizeof(chain));
						if (_bt_getbig(t, chain,
							       &save_key,
							       &ignore)
						    == RET_ERROR)
							return (RET_ERROR);
					} else {
						free_save = FALSE;
						save_key = (char *) &(d->d_bytes[0]);
					}
				} else {
					IDATUM *id = (IDATUM *) d;

					if (id->i_flags & D_BIGKEY) {
						free_save = TRUE;
						bcopy(&(id->i_bytes[0]),
						     (char *) &chain,
						     sizeof(chain));
						if (_bt_getbig(t, chain,
							       &save_key,
							       &ignore)
						    == RET_ERROR)
							return (RET_ERROR);
					} else {
						free_save = FALSE;
						save_key = (char *)
							&(id->i_bytes[0]);
					}
				}
			}
		}

		/* copy data and update page state */
		where -= LONGALIGN(dsize);
		(void) bcopy((char *) d, (char *) where, dsize);
		where_h->h_upper = where_h->h_linp[where_i] =
			(index_t) (where - (int) where_h);
		where_h->h_lower += sizeof(index_t);
		where_i++;

		/* if we've moved half, switch to the right-hand page */
		nbytes -= LONGALIGN(dsize) + sizeof(index_t);
		if ((freespc - nbytes) > nbytes) {
			nbytes = 2 * freespc;

			/* identical keys go on the same page */
			if (save_i > 0) {
				/* i gets incremented at loop bottom... */
				i = save_i - 1;
				where_h->h_lower = save_lower;
				where_h->h_upper = save_upper;
			}
			where = ((char *) right) + psize;
			where_h = right;
			switch_i = where_i;
			where_i = 0;
		}
	}

	/*
	 *  If there was an active scan on the database, and we just
	 *  split the page that the cursor was on, we may need to
	 *  adjust the cursor to point to the same entry as before the
	 *  split.
	 */

	c = &(t->bt_cursor);
	if ((t->bt_flags & BTF_SEQINIT)
	    && (c->c_pgno == h->h_pgno)
	    && (c->c_index >= switch_i)) {
		c->c_pgno = where_h->h_pgno;
		c->c_index -= where_i;
	}
	return (RET_SUCCESS);
}