Net2/usr/src/lib/libc/db/btree/btree.c

Compare this file to the similar file:
Show the results in this format:

/*-
 * Copyright (c) 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Mike Olson.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#if defined(LIBC_SCCS) && !defined(lint)
static char sccsid[] = "@(#)btree.c	5.9 (Berkeley) 5/7/91";
#endif /* LIBC_SCCS and not lint */

/*
 *  btree.c -- implementation of btree access method for 4.4BSD.
 *
 *	The design here is based on that of the btree access method used
 *	in the Postgres database system at UC Berkeley.  The implementation
 *	is wholly independent of the Postgres code.
 *
 *	This implementation supports btrees on disk (supply a filename) or
 *	in memory (don't).  Public interfaces defined here are:
 *
 *		btree_open()	-- wrapper; returns a filled DB struct for
 *				   a btree.
 *
 *		bt_open()	-- open a new or existing btree.
 *		bt_get()	-- fetch data from a tree by key.
 *		bt_seq()	-- do a sequential scan on a tree.
 *		bt_put()	-- add data to a tree by key.
 *		bt_delete()	-- remove data from a tree by key.
 *		bt_close()	-- close a btree.
 *		bt_sync()	-- force btree pages to disk (disk trees only).
 */

#include <sys/param.h>
#include <sys/stat.h>
#include <signal.h>
#include <errno.h>
#include <fcntl.h>
#include <db.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "btree.h"

BTREEINFO _DefaultBTInfo = {
	0,	/* flags */
	0,	/* cachesize */
	0,	/* psize */
	strcmp,	/* compare */
	0
};

/*
 *  BTREE_OPEN -- Wrapper routine to open a btree.
 *
 *	Creates and fills a DB struct, and calls the routine that actually
 *	opens the btree.
 *
 *	Parameters:
 *		f:  filename to open
 *		flags:  flag bits passed to open
 *		mode:  permission bits, used if O_CREAT specified
 *		b:  BTREEINFO pointer
 *
 *	Returns:
 *		Filled-in DBT on success; NULL on failure, with errno
 *		set as appropriate.
 *
 *	Side Effects:
 *		Allocates memory for the DB struct.
 */

DB *
btree_open(f, flags, mode, b)
	const char *f;
	int flags;
	int mode;
	const BTREEINFO *b;
{
	DB *db;
	BTREE t;

	if ((db = (DB *) malloc((unsigned) sizeof(DB))) == (DB *) NULL)
		return ((DB *) NULL);

	if ((t = bt_open(f, flags, mode, b)) == (BTREE) NULL) {
		(void) free ((char *) db);
		return ((DB *) NULL);
	}

	db->internal = (char *) t;
	db->close = bt_close;
	db->del = bt_delete;
	db->get = bt_get;
	db->put = bt_put;
	db->seq = bt_seq;
	db->sync = bt_sync;
	db->type = DB_BTREE;

	return (db);
}

/*
 *  BT_OPEN -- Open a btree.
 *
 *	This routine creates the correct kind (disk or in-memory) of
 *	btree and initializes it as required.
 *
 *	Parameters:
 *		f -- name of btree (NULL for in-memory btrees)
 *		flags -- flags passed to open()
 *		mode -- mode passed to open ()
 *		b -- BTREEINFO structure, describing btree
 *
 *	Returns:
 *		(Opaque) pointer to the btree.  On failure, returns NULL
 *		with errno set as appropriate.
 *
 *	Side Effects:
 *		Allocates memory, opens files.
 */

BTREE
bt_open(f, flags, mode, b)
	char *f;
	int flags;
	int mode;
	BTREEINFO *b;
{
	BTREE_P t;
	HTABLE ht;
	int nbytes;
	int fd;
	CURSOR *c;
	BTMETA m;
	struct stat buf;

	/* use the default info if none was provided */
	if (b == (BTREEINFO *) NULL)
		b = &_DefaultBTInfo;

	if ((t = (BTREE_P) malloc((unsigned) sizeof *t)) == (BTREE_P) NULL)
		return ((BTREE) NULL);

	if (b->compare)
		t->bt_compare = b->compare;
	else
		t->bt_compare = strcmp;

	t->bt_fname = f;
	t->bt_curpage = (BTHEADER *) NULL;
	t->bt_free = P_NONE;
	c = &(t->bt_cursor);
	c->c_pgno = P_NONE;
	c->c_index = 0;
	c->c_flags = (u_char) NULL;
	c->c_key = (char *) NULL;
	t->bt_stack = (BTSTACK *) NULL;
	t->bt_flags = 0;

	/*
	 *  If no file name was supplied, this is an in-memory btree.
	 *  Otherwise, it's a disk-based btree.
	 */
	if (f == (char *) NULL) {
		/* in memory */
		if ((t->bt_psize = b->psize) < MINPSIZE) {
			if (t->bt_psize != 0) {
				(void) free ((char *) t);
				errno = EINVAL;
				return ((BTREE) NULL);
			}
			t->bt_psize = getpagesize();
		}

		nbytes = HTSIZE * sizeof(HTBUCKET *);
		if ((ht = (HTABLE) malloc((unsigned) nbytes))
		    == (HTABLE) NULL) {
			(void) free((char *) t);
			return ((BTREE) NULL);
		}
		(void) bzero((char *) ht, nbytes);
		t->bt_s.bt_ht = ht;
		t->bt_npages = 0;
		t->bt_lorder = BYTE_ORDER;
		if (!(b->flags & R_DUP))
			t->bt_flags |= BTF_NODUPS;
	} else {
		/* on disk */
		if ((fd = open(f, O_RDONLY, 0)) < 0) {
			/* doesn't exist yet, be sure page is big enough */
			if ((t->bt_psize = b->psize) < sizeof(BTHEADER)
			    && b->psize != 0) {
				(void) free((char *) t);
				errno = EINVAL;
				return ((BTREE) NULL);
			}
			if (b->lorder == 0)
				b->lorder = BYTE_ORDER;

			if (b->lorder != BIG_ENDIAN
			    && b->lorder != LITTLE_ENDIAN) {
				(void) free((char *) t);
				errno = EINVAL;
				return ((BTREE) NULL);
			}
			t->bt_lorder = b->lorder;
			if (!(b->flags & R_DUP))
				t->bt_flags |= BTF_NODUPS;
		} else {
			/* exists, get page size from file */
			if (read(fd, &m, sizeof(m)) < sizeof(m)) {
				(void) close(fd);
				(void) free((char *) t);
				errno = EINVAL;
				return ((BTREE) NULL);
			}

			/* lorder always stored in host-independent format */
			NTOHL(m.m_lorder);
			if (m.m_lorder != BIG_ENDIAN
			    && m.m_lorder != LITTLE_ENDIAN) {
				(void) free((char *) t);
				errno = EINVAL;
				return ((BTREE) NULL);
			}
			t->bt_lorder = m.m_lorder;

			if (t->bt_lorder != BYTE_ORDER) {
				BLSWAP(m.m_magic);
				BLSWAP(m.m_version);
				BLSWAP(m.m_psize);
				BLSWAP(m.m_free);
				BLSWAP(m.m_flags);
			}

			if (m.m_magic != BTREEMAGIC
			    || m.m_version != BTREEVERSION
			    || m.m_psize < MINPSIZE) {
				(void) close(fd);
				(void) free((char *) t);
#ifndef EFTYPE
#define EFTYPE	-100
#endif
				errno = EFTYPE;
				return ((BTREE) NULL);
			}
			t->bt_psize = m.m_psize;
			t->bt_free = m.m_free;
			t->bt_flags |= (m.m_flags & BTF_NODUPS) | BTF_METAOK;
			(void) close(fd);
		}

		/* now open the file the way the user wanted it */
		if ((t->bt_s.bt_d.d_fd = open(f, flags, mode)) < 0) {
			(void) free ((char *) t);
			return ((BTREE) NULL);
		}

		/* access method files are always close-on-exec */
		if (fcntl(t->bt_s.bt_d.d_fd, F_SETFL, 1) == -1) {
			(void) close(t->bt_s.bt_d.d_fd);
			(void) free ((char *) t);
			return ((BTREE) NULL);
		}

		/* get number of pages, page size if necessary */
		(void) fstat(t->bt_s.bt_d.d_fd, &buf);
		if (t->bt_psize == 0)
			t->bt_psize = buf.st_blksize;
		t->bt_npages = (pgno_t) (buf.st_size / t->bt_psize);

		/* page zero is metadata, doesn't count */
		if (t->bt_npages > 0)
			--(t->bt_npages);

		if (b->cachesize == 0)
			b->cachesize = DEFCACHE;

		/* get an lru buffer cache, if the user asked for one */
		if ((b->cachesize / t->bt_psize) > 0) {
			BTDISK *d = &(t->bt_s.bt_d);

			d->d_cache = lruinit(d->d_fd,
					     (int) (b->cachesize / t->bt_psize),
					     (int) t->bt_psize,
					     (char *) t->bt_lorder,
					     _bt_pgin, _bt_pgout);

			if (d->d_cache == (char *) NULL) {
				(void) free((char *) t);
				return ((BTREE) NULL);
			}
		}
	}

	/* remember if tree was opened for write */
	if (((flags & O_WRONLY) == O_WRONLY)
	    || ((flags & O_RDWR) == O_RDWR))
		t->bt_flags |= BTF_ISWRITE;

	return ((BTREE) t);
}

/*
 *  BT_GET -- Get an entry from a btree.
 *
 *	Does a key lookup in the tree to find the specified key, and returns
 *	the key/data pair found.
 *
 *	Parameters:
 *		tree -- btree in which to do lookup
 *		key -- key to find
 *		data -- pointer to DBT in which to return data
 *		flag -- ignored
 *
 *	Returns:
 *		RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the key is not
 *		found.  If key is not found, nothing is stored in the
 *		return DBT 'data'.
 *
 *	Side Effects:
 *		None.
 *
 *	Warnings:
 *		Return data is statically allocated, and will be overwritten
 *		at the next call.
 */

int
bt_get(dbp, key, data, flag)
	DB *dbp; 
	DBT *key, *data;
	u_long flag;
{
	BTREE_P t = (BTREE_P) (dbp->internal);
	BTHEADER *h;
	DATUM *d;
	BTITEM *item;

#ifdef lint
	flag = flag;
#endif /* lint */

	/* lookup */
	item = _bt_search(t, key);

	/* clear parent pointer stack */
	while (_bt_pop(t) != P_NONE)
		continue;

	if (item == (BTITEM *) NULL)
		return (RET_ERROR);

	h = (BTHEADER *) t->bt_curpage;
	data->size = 0;
	data->data = (u_char *) NULL;

	/* match? */
	if (VALIDITEM(t, item)
	    && _bt_cmp(t, key->data, item->bti_index) == 0) {
		d = (DATUM *) GETDATUM(h, item->bti_index);
		return (_bt_buildret(t, d, data, key));
	}

	/* nope */
	return (RET_SPECIAL);
}

/*
 *  BT_PUT -- Add an entry to a btree.
 *
 *	The specified (key, data) pair is added to the tree.  If the tree
 *	was created for unique keys only, then duplicates will not be
 *	entered.  If the requested key exists in the tree, it will be over-
 *	written unless the flags parameter is R_NOOVERWRITE, in which case
 *	the update will not be done.  If duplicate keys are permitted in the
 *	tree, duplicates will be inserted and will not overwrite existing
 *	keys.  Nodes are split as required.
 *
 *	Parameters:
 *		tree -- btree in which to put the new entry
 *		key -- key component to add
 *		data -- data corresponding to key
 *		flag -- R_NOOVERWRITE or zero.
 *
 *	Returns:
 *		RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the
 *		NOOVERWRITE flag was set and the specified key exists
 *		in the database.
 *
 *	Side Effects:
 *		None.
 */

int
bt_put(dbp, key, data, flag)
	DB *dbp;
	DBT *key, *data;
	u_long flag;
{
	BTREE_P t;
	BTITEM *item;

	t = (BTREE_P)dbp->internal;

	/* look for this key in the tree */
	item = _bt_search(t, key);

	/*
	 *  If this tree was originally created without R_DUP, then duplicate
	 *  keys are not allowed.  We need to check this at insertion time.
	 */

	if (VALIDITEM(t, item) && _bt_cmp(t, key->data, item->bti_index) == 0) {
		if ((t->bt_flags & BTF_NODUPS) && flag == R_NOOVERWRITE) {
			if (_bt_delone(t, item->bti_index) == RET_ERROR) {
				while (_bt_pop(t) != P_NONE)
					continue;
				return (RET_ERROR);
			}
		}
	}

	return (_bt_insert(t, item, key, data, flag));
}

/*
 *  BT_DELETE -- delete a key from the tree.
 *
 *	Deletes all keys (and their associated data items) matching the
 *	supplied key from the tree.  If the flags entry is R_CURSOR, then
 *	the current item in the active scan is deleted.
 *
 *	Parameters:
 *		tree -- btree from which to delete key
 *		key -- key to delete
 *		flags -- R_CURSOR or zero
 *
 *	Returns:
 *		RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the specified
 *		key was not in the tree.
 *
 *	Side Effects:
 *		None.
 */

int
bt_delete(dbp, key, flags)
	DB *dbp;
	DBT *key;
	u_long flags;
{
	BTREE_P t;
	BTHEADER *h;
	BTITEM *item;
	int ndel = 0;

	t = (BTREE_P)dbp->internal;

	if (flags == R_CURSOR)
		return (_bt_crsrdel(t));

	/* find the first matching key in the tree */
	item = _bt_first(t, key);
	h = t->bt_curpage;

	/* don't need the descent stack for deletes */
	while (_bt_pop(t) != P_NONE)
		continue;

	/* delete all matching keys */
	for (;;) {
		while (VALIDITEM(t, item)
		       && (_bt_cmp(t, key->data, item->bti_index) == 0)) {
			if (_bt_delone(t, item->bti_index) == RET_ERROR)
				return (RET_ERROR);
			ndel++;
		}

		if (VALIDITEM(t, item) || h->h_nextpg == P_NONE)
			break;

		/* next page, if necessary */
		do {
			if (_bt_getpage(t, h->h_nextpg) == RET_ERROR)
				return (RET_ERROR);
			h = t->bt_curpage;
		} while (NEXTINDEX(h) == 0 && h->h_nextpg != P_NONE);

		item->bti_pgno = h->h_pgno;
		item->bti_index = 0;

		if (!VALIDITEM(t, item)
		    || _bt_cmp(t, key->data, item->bti_index) != 0)
			break;
	}

	/* flush changes to disk */
	if (ISDISK(t)) {
		if (h->h_flags & F_DIRTY) {
			if (_bt_write(t, t->bt_curpage, NORELEASE) == RET_ERROR)
				return (RET_ERROR);
		}
	}

	if (ndel == 0)
		return (RET_SPECIAL);

	return (RET_SUCCESS);
}

/*
 *  BT_SYNC -- sync the btree to disk.
 *
 *	Parameters:
 *		tree -- btree to sync.
 *
 *	Returns:
 *		RET_SUCCESS, RET_ERROR.
 */

bt_sync(dbp)
	DB *dbp;
{
	BTREE_P t;
	BTHEADER *h;
	pgno_t pgno;

	t = (BTREE_P)dbp->internal;

	/* if this is an in-memory btree, syncing is a no-op */
	if (!ISDISK(t))
		return (RET_SUCCESS);

	h = (BTHEADER *) t->bt_curpage;
	h->h_flags &= ~F_DIRTY;

	if (ISCACHE(t)) {
		pgno = t->bt_curpage->h_pgno;
		if (_bt_write(t, h, RELEASE) == RET_ERROR)
			return(RET_ERROR);
		if (lrusync(t->bt_s.bt_d.d_cache) < RET_ERROR)
			return (RET_ERROR);
		if (_bt_getpage(t, pgno) == RET_ERROR)
			return (RET_ERROR);
	} else {
		if (_bt_write(t, h, NORELEASE) == RET_ERROR)
			return (RET_ERROR);
	}

	return (fsync(t->bt_s.bt_d.d_fd));
}

/*
 *  BT_SEQ -- Sequential scan interface.
 *
 *	This routine supports sequential scans on the btree.  If called with
 *	flags set to R_CURSOR, or if no seq scan has been initialized in the
 *	current tree, we initialize the scan.  Otherwise, we advance the scan
 *	and return the next item.
 *
 *	Scans can be either keyed or non-keyed.  Keyed scans basically have
 *	a starting point somewhere in the middle of the tree.  Non-keyed
 *	scans start at an endpoint.  Also, scans can be backward (descending
 *	order), forward (ascending order), or no movement (keep returning
 *	the same item).
 *
 *	Flags is checked every time we enter the routine, so the user can
 *	change directions on an active scan if desired.  The key argument
 *	is examined only when we initialize the scan, in order to position
 *	it properly.
 *
 *	Items are returned via the key and data arguments passed in.
 *
 *	Parameters:
 *		tree -- btree in which to do scan
 *		key -- key, used to position scan on initialization, and
 *		       used to return key components to the user.
 *		data -- used to return data components to the user.
 *		flags -- specify R_CURSOR, R_FIRST, R_LAST, R_NEXT, or
 *			 R_PREV.
 *
 *	Returns:
 *		RET_SUCCESS, RET_ERROR, or RET_SPECIAL if no more data
 *		exists in the tree in the specified direction.
 *
 *	Side Effects:
 *		Changes the btree's notion of the current position in the
 *		scan.
 *
 *	Warnings:
 *		The key and data items returned are static and will be
 *		overwritten by the next bt_get or bt_seq.
 */

int
bt_seq(dbp, key, data, flags)
	DB *dbp;
	DBT *key, *data;
	u_long flags;
{
	BTREE_P t;
	BTHEADER *h;
	DATUM *d;
	int status;

	t = (BTREE_P)dbp->internal;

	/* do we need to initialize the scan? */
	if (flags == R_CURSOR || flags == R_LAST || flags == R_FIRST
	    || !(t->bt_flags & BTF_SEQINIT)) {

		/* initialize it */
		status = _bt_seqinit(t, key, flags);
	} else {
		/* just advance the current scan pointer */
		status = _bt_seqadvance(t, flags);
	}

	key->size = data->size = 0;
	key->data = data->data = (u_char *) NULL;

	h = t->bt_curpage;

	/* is there a valid item at the current scan location? */
	if (status == RET_SPECIAL) {
		if (flags == R_NEXT) {
			if (t->bt_cursor.c_index >= NEXTINDEX(h)) {
				if (NEXTINDEX(h) > 0)
					t->bt_cursor.c_index = NEXTINDEX(h) - 1;
				else
					t->bt_cursor.c_index = 0;
			}
		} else {
			t->bt_cursor.c_index = 0;
		}
		return (RET_SPECIAL);
	} else if (status == RET_ERROR)
		return (RET_ERROR);

	/* okay, return the data */
	d = (DATUM *) GETDATUM(h, t->bt_cursor.c_index);

	return (_bt_buildret(t, d, data, key));
}

/*
 *  BT_CLOSE -- Close a btree
 *
 *	Parameters:
 *		tree -- tree to close
 *
 *	Returns:
 *		RET_SUCCESS, RET_ERROR.
 *
 *	Side Effects:
 *		Frees space occupied by the tree.
 */

int
bt_close(dbp)
	DB *dbp;
{
	struct HTBUCKET *b, *sb;
	BTREE_P t;
	BTHEADER *h;
	HTABLE ht;
	int fd, i;
	char *cache;

	t = (BTREE_P)dbp->internal;

	if (t->bt_cursor.c_key != (char *) NULL)
		(void) free(t->bt_cursor.c_key);

	if (!ISDISK(t)) {
		/* in-memory tree, release hash table memory */
		ht = t->bt_s.bt_ht;
		for (i = 0; i < HTSIZE; i++) {
			if ((b = ht[i]) == (struct HTBUCKET *) NULL)
				break;
			do {
				sb = b;
				(void) free((char *) b->ht_page);
				b = b->ht_next;
				(void) free((char *) sb);
			} while (b != (struct HTBUCKET *) NULL);
		}
		(void) free ((char *) ht);
		(void) free ((char *) t);
		return (RET_SUCCESS);
	}

	if ((t->bt_flags & BTF_ISWRITE) && !(t->bt_flags & BTF_METAOK)) {
		if (_bt_wrtmeta(t) == RET_ERROR)
			return (RET_ERROR);
	}

	if (t->bt_curpage != (BTHEADER *) NULL) {
		h = t->bt_curpage;
		if (h->h_flags & F_DIRTY) {
			if (_bt_write(t, h, RELEASE) == RET_ERROR)
				return (RET_ERROR);
		} else {
			if (_bt_release(t, h) == RET_ERROR)
				return (RET_ERROR);
		}

		/* flush and free the cache, if there is one */
		if (ISCACHE(t)) {
			cache = t->bt_s.bt_d.d_cache;
			if (lrusync(cache) == RET_ERROR)
				return (RET_ERROR);
			lrufree(cache);
		}
		(void) free ((char *) h);
	}

	fd = t->bt_s.bt_d.d_fd;
	(void) free ((char *) t);
	return (close(fd));
}