NetBSD-5.0.2/sys/sys/wapbl.h

Compare this file to the similar file:
Show the results in this format:

/*	$NetBSD: wapbl.h,v 1.2 2008/07/31 05:38:06 simonb Exp $	*/

/*-
 * Copyright (c) 2003,2008 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef _SYS_WAPBL_H
#define	_SYS_WAPBL_H

#include <sys/mutex.h>

#include <miscfs/specfs/specdev.h>

/* This header file describes the api and data structures for
 * write ahead physical block logging (WAPBL) support.
 */

#if defined(_KERNEL_OPT)
#include "opt_wapbl.h"
#endif

/*
 * Debug configuration.  When WAPBL_DEBUG is defined and no explicit
 * WAPBL_DEBUG_PRINT mask has been supplied, default the mask to the
 * replay and open message classes.
 */
#ifdef WAPBL_DEBUG
#ifndef WAPBL_DEBUG_PRINT
#define	WAPBL_DEBUG_PRINT (WAPBL_PRINT_REPLAY | WAPBL_PRINT_OPEN)
#endif

/*
 * Extra debug options, currently compiled out.  WAPBL_DEBUG_BUFBYTES adds
 * the we_unsynced_bufbytes member to struct wapbl_entry.
 */
#if 0
#define	WAPBL_DEBUG_BUFBYTES
#define	WAPBL_DEBUG_SERIALIZE
#endif

#endif /* WAPBL_DEBUG */

#ifdef WAPBL_DEBUG_PRINT

/*
 * Debug message classes.  Each class occupies one bit so that classes can
 * be OR-ed together into the mask that WAPBL_PRINTF() tests against
 * wapbl_debug_print.
 */
enum {
	WAPBL_PRINT_OPEN	= 1 << 0,
	WAPBL_PRINT_FLUSH	= 1 << 1,
	WAPBL_PRINT_TRUNCATE	= 1 << 2,
	WAPBL_PRINT_TRANSACTION	= 1 << 3,
	WAPBL_PRINT_BUFFER	= 1 << 4,
	WAPBL_PRINT_BUFFER2	= 1 << 5,
	WAPBL_PRINT_ALLOC	= 1 << 6,
	WAPBL_PRINT_INODE	= 1 << 7,
	WAPBL_PRINT_WRITE	= 1 << 8,
	WAPBL_PRINT_IO		= 1 << 9,
	WAPBL_PRINT_REPLAY	= 1 << 10,
	WAPBL_PRINT_ERROR	= 1 << 11,
	WAPBL_PRINT_DISCARD	= 1 << 12,
	WAPBL_PRINT_BIODONE	= 1 << 13,
};

/*
 * Print a debug message when any bit of `mask' is set in wapbl_debug_print.
 * `a' is a complete, parenthesized printf argument list, e.g.
 *	WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("opened %d\n", n));
 *
 * The expansion is wrapped in do { } while (0) so the macro behaves as a
 * single statement: used as the body of an un-braced `if', it can no
 * longer capture an `else' that follows it.
 */
#define	WAPBL_PRINTF(mask, a) do {					\
	if (wapbl_debug_print & (mask))					\
		printf a;						\
} while (/*CONSTCOND*/0)
extern int wapbl_debug_print;
#else
/* Debug printing is compiled out: WAPBL_PRINTF() expands to nothing. */
#define	WAPBL_PRINTF(mask, a)
#endif

/****************************************************************/

/* The WAPBL journal layout.
 * 
 * The journal consists of a header followed by a circular buffer
 * region.  The circular data area is described by the header
 * wc_circ_off, wc_circ_size, wc_head and wc_tail fields as bytes
 * from the start of the journal header.  New records are inserted
 * at wc_head and the oldest valid record can be found at wc_tail.
 * When ((wc_head == wc_tail) && (wc_head == 0)), the journal is empty.
 * The condition of ((wc_head == wc_tail) && (wc_head != 0))
 * indicates a full journal, although this condition is rare.
 *
 * The journal header as well as its records are marked by a 32bit
 * type tag and length for ease of parsing.  Journal records are
 * padded so as to fall on journal device block boundaries.
 * (XXX There may currently be a bug where WAPBL_WC_BLOCKS records do
 * not end correctly on a journal device block boundary.  This would
 * need to be fixed if the journal block size does not match the
 * filesystem block size.)
 */

/*
 * Type tags for the 4 kinds of record found in the journal.  Each tag
 * identifies which of the structures below describes the record.  The
 * tags are consecutive integers starting at the "WABL" magic number.
 */
enum {
	/* "WABL", struct wapbl_wc_header */
	WAPBL_WC_HEADER		= 0x5741424c,
	/* struct wapbl_wc_inodelist */
	WAPBL_WC_INODES		= 0x5741424d,
	/* struct wapbl_wc_blocklist */
	WAPBL_WC_REVOCATIONS	= 0x5741424e,
	/* struct wapbl_wc_blocklist */
	WAPBL_WC_BLOCKS		= 0x5741424f,
};

/*
 * Null entry (on disk).
 *
 * This structure isn't used directly, but shares its header layout
 * (type tag followed by length) with all the other log structures, so
 * that a log record can be read and its type determined before it is
 * interpreted as one of the specific record structures.
 *
 * The trailing data is declared as a C99 flexible array member rather
 * than a GNU zero-length array; size and member offsets are unchanged.
 */
struct wapbl_wc_null {
	uint32_t	wc_type;	/* WAPBL_WC_* */
	int32_t		wc_len;		/* record length */
	uint8_t		wc_spare[];	/* actually longer */
};

/* journal header (on-disk)
 * This record is found at the start of the
 * journal, but not within the circular buffer region.  As well as
 * describing the journal parameters and matching filesystem, it
 * additionally serves as the atomic update record for journal
 * updates.
 *
 * wc_head, wc_tail, wc_circ_off and wc_circ_size are expressed in bytes
 * from the start of the journal header.  New records are inserted at
 * wc_head; the oldest valid record is found at wc_tail.
 *
 * NOTE: wc_spare stays a zero-length array (not a C99 flexible array
 * member) because struct wapbl_replay embeds this structure by value.
 */
struct wapbl_wc_header {
	uint32_t	wc_type;	/* WAPBL_WC_HEADER log magic number */
	int32_t		wc_len;		/* length of this journal entry */
	uint32_t	wc_checksum;
	uint32_t	wc_generation;
	int32_t		wc_fsid[2];	/* NOTE(review): presumably identifies
					   the matching filesystem - confirm */
	uint64_t	wc_time;
	uint32_t	wc_timensec;
	uint32_t	wc_version;
	uint32_t	wc_log_dev_bshift; /* NOTE(review): presumably log2 of
					      the log device block size */
	uint32_t	wc_fs_dev_bshift;  /* NOTE(review): presumably log2 of
					      the fs device block size */
	int64_t		wc_head;	/* where new records are inserted */
	int64_t		wc_tail;	/* oldest valid record */
	int64_t		wc_circ_off;	/* offset of circ buffer region */
	int64_t		wc_circ_size;	/* size of circular buffer region */
	uint8_t		wc_spare[0];	/* actually longer */
};

/*
 * List of blocks (on disk).
 *
 * This record describes a set of filesystem blocks, and is used with
 * two type tags, WAPBL_WC_BLOCKS and WAPBL_WC_REVOCATIONS.
 *
 * For WAPBL_WC_BLOCKS, a copy of each listed block follows the record,
 * starting at the next log device block boundary, i.e. one log device
 * block past the start of the record.  These records carry the bulk of
 * the filesystem journal data, which is written to the journal using
 * them before being written into the filesystem.
 *
 * The WAPBL_WC_REVOCATIONS record indicates that any previously listed
 * blocks should not be written into the filesystem.  This is important
 * so that deallocated and reallocated data blocks do not get
 * overwritten with stale data from the journal.  Revocation records do
 * not contain a copy of any actual block data.
 *
 * The entry list is declared as a C99 flexible array member rather than
 * a GNU zero-length array; size and member offsets are unchanged.
 */
struct wapbl_wc_blocklist {
	uint32_t	wc_type; /* WAPBL_WC_{REVOCATIONS,BLOCKS} */
	int32_t		wc_len;
	int32_t		wc_blkcount;	/* NOTE(review): presumably the number
					   of wc_blocks[] entries - confirm */
	int32_t		wc_unused;
	struct {
		int64_t	wc_daddr;
		int32_t	wc_unused;
		int32_t	wc_dlen;
	} wc_blocks[];			/* actually longer */
};

/*
 * List of inodes (on disk).
 *
 * This record describes the set of inodes which may be allocated but
 * are unlinked.  Inodes end up listed here while they are in the
 * process of being initialized and deinitialized.  Inodes unlinked
 * while in use by a process will be listed here and the actual deletion
 * must be completed on journal replay.
 *
 * The entry list is declared as a C99 flexible array member rather than
 * a GNU zero-length array; size and member offsets are unchanged.
 */
struct wapbl_wc_inodelist {
	uint32_t	wc_type; /* WAPBL_WC_INODES */
	int32_t		wc_len;
	int32_t		wc_inocnt;	/* NOTE(review): presumably the number
					   of wc_inodes[] entries - confirm */
	int32_t		wc_clear;	/* set if previously listed inodes
					   should be ignored */
	struct {
		uint32_t wc_inumber;
		uint32_t wc_imode;
	} wc_inodes[];		/* actually longer */
};

/****************************************************************/

#include <sys/queue.h>
#include <sys/vnode.h>
#include <sys/buf.h>

/*
 * Flush callback type.  wapbl_start() takes two callbacks of this type.
 * NOTE(review): the meaning of the arguments is not visible in this
 * header - presumably a mount point, parallel arrays of block addresses
 * and lengths, and their element count; confirm against the
 * implementation.
 */
typedef void (*wapbl_flush_fn_t)(struct mount *, daddr_t *, int *, int);

#ifdef _KERNEL

struct wapbl_entry;
struct wapbl_wc_header;
struct wapbl_replay;
struct wapbl;

/*
 * This structure holds per-transaction log information.
 */
struct wapbl_entry {
	struct wapbl *we_wapbl;		/* NOTE(review): presumably the log
					   this entry belongs to - confirm */
	SIMPLEQ_ENTRY(wapbl_entry) we_entries;	/* SIMPLEQ linkage */
	size_t we_bufcount;		/* Count of unsynced buffers */
	size_t we_reclaimable_bytes;	/* Number of on-disk bytes for this
					   transaction */
	int	we_error;
#ifdef WAPBL_DEBUG_BUFBYTES
	size_t we_unsynced_bufbytes;	/* Byte count of unsynced buffers */
#endif
};

/* NOTE(review): presumably one-time WAPBL initialization - confirm. */
void	wapbl_init(void);

/*
 * Start using a log.  The last two arguments are flush callbacks
 * (wapbl_flush_fn_t).
 * NOTE(review): the first argument presumably receives the new log
 * handle, and the int return is presumably 0 or an errno value - confirm.
 */
int	wapbl_start(struct wapbl **, struct mount *, struct vnode *, daddr_t,
		    size_t, size_t, struct wapbl_replay *,
		    wapbl_flush_fn_t, wapbl_flush_fn_t);

/* Discard the current transaction, potentially dangerous */
void	wapbl_discard(struct wapbl *);

/* Stop using a log */
int	wapbl_stop(struct wapbl *, int);

/*
 * Begin a new transaction or increment transaction recursion
 * level if called while a transaction is already in progress
 * by the current process.
 * NOTE(review): the string and int arguments are not described in this
 * header - presumably caller identification for debugging; confirm.
 */
int	wapbl_begin(struct wapbl *, const char *, int);


/* End a transaction or decrement the transaction recursion level */
void	wapbl_end(struct wapbl *);

/*
 * Add a new buffer to the current transaction.  The buffer's
 * data will be copied to the current transaction log and the
 * buffer will be marked B_LOCKED so that it will not be
 * flushed to disk by the syncer or reallocated.
 */
void	wapbl_add_buf(struct wapbl *, struct buf *);

/* Remove a buffer from the current transaction. */
void	wapbl_remove_buf(struct wapbl *, struct buf *);

/*
 * NOTE(review): undocumented here - presumably tells the log that a
 * buffer in the current transaction changed size, with the two long
 * arguments describing its previous size; confirm against the
 * implementation.
 */
void	wapbl_resize_buf(struct wapbl *, struct buf *, long, long);

/*
 * This will flush all completed transactions to disk and
 * start asynchronous writes on the associated buffers.
 */
int	wapbl_flush(struct wapbl *, int);

/*
 * Inodes that are allocated but have zero link count
 * must be registered with the current transaction
 * so they may be recorded in the log and cleaned up later.
 * Registration/unregistration of inode numbers already registered is ok.
 */
void	wapbl_register_inode(struct wapbl *, ino_t, mode_t);
void	wapbl_unregister_inode(struct wapbl *, ino_t, mode_t);

/*
 * Metadata block deallocations must be registered so
 * that revocation records can be written and to prevent
 * the corresponding blocks from being reused as data
 * blocks until the log is on disk.
 */
void	wapbl_register_deallocation(struct wapbl *, daddr_t, int);

/*
 * NOTE(review): presumably assert that the journal lock of `wl' is,
 * respectively is not, held by the caller - confirm.
 */
void	wapbl_jlock_assert(struct wapbl *wl);
void	wapbl_junlock_assert(struct wapbl *wl);

/*
 * Print information about log `wl' through the printf-like callback `pr'.
 * NOTE(review): `full' presumably selects more verbose output - confirm.
 */
void	wapbl_print(struct wapbl *wl, int full, void (*pr)(const char *, ...));

/* Only declared when WAPBL_DEBUG or DDB is configured. */
#if defined(WAPBL_DEBUG) || defined(DDB)
void	wapbl_dump(struct wapbl *);
#endif

/* NOTE(review): presumably the I/O completion handler for log buffers. */
void	wapbl_biodone(struct buf *);

/* Defined elsewhere; struct wapbl_ops is not declared in this header. */
extern struct wapbl_ops wapbl_ops;

/*
 * Map a vnode to its mount.  Returns NULL when vp is NULL.  For block
 * device vnodes (VBLK) the result is v_specmountpoint; for every other
 * vnode type it is v_mount.
 */
static __inline struct mount *
wapbl_vptomp(struct vnode *vp)
{

	if (vp == NULL)
		return NULL;

	return (vp->v_type == VBLK) ? vp->v_specmountpoint : vp->v_mount;
}

/*
 * Report whether the mount that vp maps to (via wapbl_vptomp()) has a
 * non-zero mnt_wapbl.  A NULL vp, or a vnode without a mount, yields
 * false.
 */
static __inline bool
wapbl_vphaswapbl(struct vnode *vp)
{
	struct mount *mnt;

	if (vp == NULL)
		return false;

	mnt = wapbl_vptomp(vp);
	return (mnt != NULL && mnt->mnt_wapbl) ? true : false;
}

#endif /* _KERNEL */

/****************************************************************/
/* Replay support */

/*
 * State kept while replaying a journal.
 * NOTE(review): member descriptions marked "presumably" are inferred
 * from names and types only - confirm against the implementation.
 */
struct wapbl_replay {
	struct vnode *wr_logvp;		/* presumably the vnode holding the log */
	struct vnode *wr_devvp;		/* presumably the filesystem device vnode */
	daddr_t wr_logpbn;		/* presumably the log's starting block */

	struct wapbl_wc_header wr_wc_header;	/* journal header, held by value */
	void *wr_scratch;		/* non-zero while the replay is open;
					   tested by wapbl_replay_isopen() */

	/* Hash table of struct wapbl_blk lists, with its mask and count. */
	LIST_HEAD(wapbl_blk_head, wapbl_blk) *wr_blkhash;
	u_long wr_blkhashmask;
	int wr_blkhashcnt;

	off_t wr_inodeshead;
	off_t wr_inodestail;
	int wr_inodescnt;		/* presumably the number of wr_inodes
					   entries */
	/* Same inumber/imode pair as the entries of wapbl_wc_inodelist. */
	struct {
		uint32_t wr_inumber;
		uint32_t wr_imode;
	} *wr_inodes;
};

/* True when the replay's scratch pointer (wr_scratch) is non-zero. */
#define	wapbl_replay_isopen(wr) ((wr)->wr_scratch != 0)

/* NOTE(review): presumably a function form of wapbl_replay_isopen(). */
int	wapbl_replay_isopen1(struct wapbl_replay *);
/*
 * Replay entry points.
 * NOTE(review): wapbl_replay_start() presumably returns the new replay
 * state through its first argument, and the int returns are presumably
 * 0 or an errno value - confirm against the implementation.
 */
int	wapbl_replay_start(struct wapbl_replay **, struct vnode *,
	daddr_t, size_t, size_t);
void	wapbl_replay_stop(struct wapbl_replay *);
void	wapbl_replay_free(struct wapbl_replay *);
int	wapbl_replay_verify(struct wapbl_replay *, struct vnode *);
int	wapbl_replay_write(struct wapbl_replay *, struct vnode *);
int	wapbl_replay_read(struct wapbl_replay *, void *, daddr_t, long);

/****************************************************************/

/*
 * I/O hooks: supply these two functions to provide I/O support.
 * NOTE(review): the arguments are presumably (buffer, length in bytes,
 * device vnode, block address) - confirm against an implementation.
 */
int wapbl_write(void *, size_t, struct vnode *, daddr_t);
int wapbl_read(void *, size_t, struct vnode *, daddr_t);

/****************************************************************/

#endif /* !_SYS_WAPBL_H */