/* NetBSD-5.0.2/sys/sys/wapbl.h */
/* $NetBSD: wapbl.h,v 1.2 2008/07/31 05:38:06 simonb Exp $ */
/*-
* Copyright (c) 2003,2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Wasabi Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _SYS_WAPBL_H
#define _SYS_WAPBL_H
#include <sys/mutex.h>
#include <miscfs/specfs/specdev.h>
/* This header file describes the api and data structures for
* write ahead physical block logging (WAPBL) support.
*/
#if defined(_KERNEL_OPT)
#include "opt_wapbl.h"
#endif
/*
 * Debug configuration.
 *
 * When WAPBL_DEBUG is defined and the build has not already chosen a
 * value for WAPBL_DEBUG_PRINT, default it to the replay and open message
 * categories (WAPBL_PRINT_* bits declared later in this header).
 */
#ifdef WAPBL_DEBUG
#ifndef WAPBL_DEBUG_PRINT
#define WAPBL_DEBUG_PRINT (WAPBL_PRINT_REPLAY | WAPBL_PRINT_OPEN)
#endif
/*
 * Further debug switches, compiled out by the "#if 0" below.
 * WAPBL_DEBUG_BUFBYTES adds the we_unsynced_bufbytes member to
 * struct wapbl_entry in this header. The effect of WAPBL_DEBUG_SERIALIZE
 * is not visible from this header.
 */
#if 0
#define WAPBL_DEBUG_BUFBYTES
#define WAPBL_DEBUG_SERIALIZE
#endif
#endif
#ifdef WAPBL_DEBUG_PRINT
/*
 * Debug message categories. Every enumerator is a distinct bit, so
 * categories can be OR'ed together into a mask and tested against
 * wapbl_debug_print by WAPBL_PRINTF().
 */
enum {
	WAPBL_PRINT_OPEN	= 0x1,
	WAPBL_PRINT_FLUSH	= 0x2,
	WAPBL_PRINT_TRUNCATE	= 0x4,
	WAPBL_PRINT_TRANSACTION	= 0x8,
	WAPBL_PRINT_BUFFER	= 0x10,
	WAPBL_PRINT_BUFFER2	= 0x20,
	WAPBL_PRINT_ALLOC	= 0x40,
	WAPBL_PRINT_INODE	= 0x80,
	WAPBL_PRINT_WRITE	= 0x100,
	WAPBL_PRINT_IO		= 0x200,
	WAPBL_PRINT_REPLAY	= 0x400,
	WAPBL_PRINT_ERROR	= 0x800,
	WAPBL_PRINT_DISCARD	= 0x1000,
	WAPBL_PRINT_BIODONE	= 0x2000,
};

/*
 * WAPBL_PRINTF(mask, a)
 *	mask	one or more WAPBL_PRINT_* bits
 *	a	a complete, parenthesized printf argument list,
 *		e.g. ("count=%d\n", count)
 *
 * Calls printf with the argument list `a' only when wapbl_debug_print
 * has at least one of the bits in `mask' set.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to
 * exactly one statement. A bare "if (...) printf a" would capture an
 * "else" that the caller wrote for its own enclosing "if".
 */
#define WAPBL_PRINTF(mask, a)						\
	do {								\
		if (wapbl_debug_print & (mask))				\
			printf a;					\
	} while (/*CONSTCOND*/ 0)

/* Mask of currently enabled WAPBL_PRINT_* categories; defined elsewhere. */
extern int wapbl_debug_print;
#else
/* Debug printing compiled out: expands to nothing, arguments are discarded. */
#define WAPBL_PRINTF(mask, a)
#endif
/****************************************************************/
/* The WAPBL journal layout.
*
* The journal consists of a header followed by a circular buffer
* region. The circular data area is described by the header
* wc_circ_off, wc_circ_size, wc_head and wc_tail fields as bytes
* from the start of the journal header. New records are inserted
* at wc_head and the oldest valid record can be found at wc_tail.
* When ((wc_head == wc_tail) && (wc_head == 0)), the journal is empty.
* The condition of ((wc_head == wc_tail) && (wc_head != 0))
* indicates a full journal, although this condition is rare.
*
* The journal header as well as its records are marked by a 32bit
* type tag and length for ease of parsing. Journal records are
* padded so as to fall on journal device block boundaries.
* (XXX I think there is currently a bug with respect to WC_BLOCKS not
* ending correctly on a journal device block boundary. This would need
* to be fixed if the journal block size does not match the filesystem's.)
*/
/*
 * Type tags for the 4 kinds of record found in the journal. A tag tells
 * the reader which of the structures declared below describes the data
 * that follows it.
 *
 * The values are consecutive, starting at the ASCII magic "WABL"; they
 * are written out in full here because they are part of the on-disk
 * format.
 */
enum {
	WAPBL_WC_HEADER		= 0x5741424c,	/* "WABL", struct wapbl_wc_header */
	WAPBL_WC_INODES		= 0x5741424d,	/* struct wapbl_wc_inodelist */
	WAPBL_WC_REVOCATIONS	= 0x5741424e,	/* struct wapbl_wc_blocklist */
	WAPBL_WC_BLOCKS		= 0x5741424f,	/* struct wapbl_wc_blocklist */
};
/*
 * Null entry (on disk).
 *
 * Not used as a record in its own right. Every other log structure
 * begins with the same two fields, so a record of unknown kind can be
 * read through this structure first in order to learn its type.
 */
struct wapbl_wc_null {
	uint32_t	wc_type;	/* WAPBL_WC_* */
	int32_t		wc_len;		/* record length */
	uint8_t		wc_spare[0];	/* actually longer */
};
/*
 * Journal header (on disk).
 *
 * Located at the start of the journal, outside the circular buffer
 * region. It describes the journal parameters and the matching
 * filesystem, and it also serves as the atomic update record for
 * journal updates.
 *
 * wc_head, wc_tail, wc_circ_off and wc_circ_size are byte counts measured
 * from the start of the journal header. New records are inserted at
 * wc_head; the oldest valid record is at wc_tail.
 */
struct wapbl_wc_header {
	uint32_t	wc_type;	/* WAPBL_WC_HEADER log magic number */
	int32_t		wc_len;		/* length of this journal entry */
	uint32_t	wc_checksum;
	uint32_t	wc_generation;
	int32_t		wc_fsid[2];
	uint64_t	wc_time;
	uint32_t	wc_timensec;
	uint32_t	wc_version;
	/* NOTE(review): presumably log2 of the device block sizes - confirm */
	uint32_t	wc_log_dev_bshift;
	uint32_t	wc_fs_dev_bshift;
	int64_t		wc_head;
	int64_t		wc_tail;
	int64_t		wc_circ_off;	/* offset of circular buffer region */
	int64_t		wc_circ_size;	/* size of circular buffer region */
	uint8_t		wc_spare[0];	/* actually longer */
};
/*
 * List of blocks (on disk).
 *
 * Describes a set of filesystem blocks. The same layout is used with two
 * type tags, WAPBL_WC_BLOCKS and WAPBL_WC_REVOCATIONS.
 *
 * WAPBL_WC_BLOCKS: a copy of each listed block follows the record,
 * starting at the next log device block boundary, i.e. one log device
 * block after the start of the record. These records carry the bulk of
 * the filesystem journal data, which is written here before being
 * written into the filesystem.
 *
 * WAPBL_WC_REVOCATIONS: indicates that any previously listed blocks
 * should not be written into the filesystem. This matters so that
 * deallocated and reallocated data blocks do not get overwritten with
 * stale data from the journal. Revocation records do not contain a copy
 * of any actual block data.
 */
struct wapbl_wc_blocklist {
	uint32_t	wc_type;	/* WAPBL_WC_{REVOCATIONS,BLOCKS} */
	int32_t		wc_len;
	/* NOTE(review): presumably the number of wc_blocks entries - confirm */
	int32_t		wc_blkcount;
	int32_t		wc_unused;
	struct {
		/* NOTE(review): presumably block address and length - confirm */
		int64_t	wc_daddr;
		int32_t	wc_unused;
		int32_t	wc_dlen;
	} wc_blocks[0];			/* actually longer */
};
/*
 * List of inodes (on disk).
 *
 * Describes the set of inodes which may be allocated but are unlinked.
 * Inodes are listed here while they are being initialized and
 * deinitialized. An inode that is unlinked while still in use by a
 * process is also listed here, and its actual deletion must be completed
 * on journal replay.
 */
struct wapbl_wc_inodelist {
	uint32_t	wc_type;	/* WAPBL_WC_INODES */
	int32_t		wc_len;
	/* NOTE(review): presumably the number of wc_inodes entries - confirm */
	int32_t		wc_inocnt;
	int32_t		wc_clear;	/* set if previously listed inodes
					   should be ignored */
	struct {
		uint32_t	wc_inumber;
		uint32_t	wc_imode;
	} wc_inodes[0];			/* actually longer */
};
/****************************************************************/
#include <sys/queue.h>
#include <sys/vnode.h>
#include <sys/buf.h>
/*
 * Callback type for the two function pointers accepted by wapbl_start().
 * NOTE(review): this header does not document what the daddr_t *, int *
 * and int arguments carry - confirm against the implementation.
 */
typedef void (*wapbl_flush_fn_t)(struct mount *, daddr_t *, int *, int);
#ifdef _KERNEL
/* Forward declarations of the types used by the kernel interface below. */
struct wapbl_entry;
struct wapbl_wc_header;
struct wapbl_replay;
struct wapbl;

/*
 * Per-transaction log bookkeeping.
 */
struct wapbl_entry {
	struct wapbl		*we_wapbl;
	SIMPLEQ_ENTRY(wapbl_entry) we_entries;
	size_t			we_bufcount;	/* count of unsynced buffers */
	size_t			we_reclaimable_bytes;
						/* number of on-disk bytes for
						   this transaction */
	int			we_error;
#ifdef WAPBL_DEBUG_BUFBYTES
	size_t			we_unsynced_bufbytes;
						/* byte count of unsynced
						   buffers */
#endif
};
/*
 * NOTE(review): no description is given in this header; presumably
 * one-time initialization of the WAPBL subsystem - confirm.
 */
void wapbl_init(void);
/*
 * Start using a log.
 * NOTE(review): the parameters are unnamed here. The struct wapbl ** is
 * presumably where the new log handle is returned, and the int result is
 * presumably 0 or an error number - confirm against the implementation.
 */
int wapbl_start(struct wapbl **, struct mount *, struct vnode *, daddr_t,
size_t, size_t, struct wapbl_replay *,
wapbl_flush_fn_t, wapbl_flush_fn_t);
/* Discard the current transaction, potentially dangerous */
void wapbl_discard(struct wapbl *);
/*
 * Stop using a log.
 * NOTE(review): the meaning of the int argument is not documented here.
 */
int wapbl_stop(struct wapbl *, int);
/*
 * Begin a new transaction or increment transaction recursion
 * level if called while a transaction is already in progress
 * by the current process.
 * NOTE(review): the const char * and int arguments are not documented
 * in this header - confirm their purpose.
 */
int wapbl_begin(struct wapbl *, const char *, int);
/* End a transaction or decrement the transaction recursion level */
void wapbl_end(struct wapbl *);
/*
 * Add a new buffer to the current transaction. The buffer's
 * data will be copied to the current transaction log and the
 * buffer will be marked B_LOCKED so that it will not be
 * flushed to disk by the syncer or reallocated.
 */
void wapbl_add_buf(struct wapbl *, struct buf *);
/* Remove a buffer from the current transaction. */
void wapbl_remove_buf(struct wapbl *, struct buf *);
/*
 * NOTE(review): undocumented in this header; presumably informs the log
 * that a buffer in the transaction changed size, with the two long
 * arguments describing the sizes - confirm.
 */
void wapbl_resize_buf(struct wapbl *, struct buf *, long, long);
/*
 * This will flush all completed transactions to disk and
 * start asynchronous writes on the associated buffers.
 * NOTE(review): the meaning of the int argument is not documented here.
 */
int wapbl_flush(struct wapbl *, int);
/*
 * Inodes that are allocated but have zero link count
 * must be registered with the current transaction
 * so they may be recorded in the log and cleaned up later.
 * Registering or unregistering an inode number that is already
 * registered is ok.
 */
void wapbl_register_inode(struct wapbl *, ino_t, mode_t);
void wapbl_unregister_inode(struct wapbl *, ino_t, mode_t);
/*
 * Metadata block deallocations must be registered so
 * that revocation records can be written and to prevent
 * the corresponding blocks from being reused as data
 * blocks until the log is on disk.
 */
void wapbl_register_deallocation(struct wapbl *, daddr_t, int);
/*
 * NOTE(review): undocumented here; by their names these presumably assert
 * that the journal lock is, respectively is not, held - confirm.
 */
void wapbl_jlock_assert(struct wapbl *wl);
void wapbl_junlock_assert(struct wapbl *wl);
/* Print the state of wl through the printf-like callback pr. */
void wapbl_print(struct wapbl *wl, int full, void (*pr)(const char *, ...));
/* Only declared when WAPBL_DEBUG or DDB is defined. */
#if defined(WAPBL_DEBUG) || defined(DDB)
void wapbl_dump(struct wapbl *);
#endif
/* NOTE(review): presumably the I/O completion hook for logged buffers - confirm. */
void wapbl_biodone(struct buf *);
/* struct wapbl_ops is not declared in this header. */
extern struct wapbl_ops wapbl_ops;
/*
 * Map a vnode to a mount point.
 *
 * Returns NULL when vp is NULL. For a block device vnode (v_type == VBLK)
 * the result is vp->v_specmountpoint; for every other vnode type it is
 * vp->v_mount.
 */
static __inline struct mount *
wapbl_vptomp(struct vnode *vp)
{
	if (vp == NULL)
		return NULL;
	if (vp->v_type == VBLK)
		return vp->v_specmountpoint;
	return vp->v_mount;
}
/*
 * Report whether the mount point that wapbl_vptomp() finds for vp has its
 * mnt_wapbl field set.
 *
 * Returns false when vp is NULL, when no mount point is found, or when
 * mnt_wapbl is zero; true otherwise.
 */
static __inline bool
wapbl_vphaswapbl(struct vnode *vp)
{
	struct mount *owner;

	owner = (vp != NULL) ? wapbl_vptomp(vp) : NULL;
	return (owner != NULL && owner->mnt_wapbl);
}
#endif /* _KERNEL */
/****************************************************************/
/*
 * Replay support.
 *
 * State kept while a journal is being replayed.
 * NOTE(review): apart from wr_scratch, the role of each member is inferred
 * from its name only - confirm against the replay implementation.
 */
struct wapbl_replay {
	struct vnode		*wr_logvp;
	struct vnode		*wr_devvp;
	daddr_t			wr_logpbn;

	struct wapbl_wc_header	wr_wc_header;
	void			*wr_scratch;	/* tested by
						   wapbl_replay_isopen() */

	LIST_HEAD(wapbl_blk_head, wapbl_blk) *wr_blkhash;
	u_long			wr_blkhashmask;
	int			wr_blkhashcnt;

	off_t			wr_inodeshead;
	off_t			wr_inodestail;
	int			wr_inodescnt;
	struct {
		uint32_t	wr_inumber;
		uint32_t	wr_imode;
	} *wr_inodes;
};
/* Non-zero while wr->wr_scratch is set. Evaluates wr exactly once. */
#define wapbl_replay_isopen(wr) ((wr)->wr_scratch != 0)
/* NOTE(review): presumably a function form of wapbl_replay_isopen() - confirm. */
int wapbl_replay_isopen1(struct wapbl_replay *);
/*
 * Replay interface.
 * NOTE(review): none of these are described in this header. By their
 * names they start, stop and free a replay, verify or write the journal
 * contents against a vnode, and read through the replay state - confirm
 * the arguments and return values against the implementation.
 */
int wapbl_replay_start(struct wapbl_replay **, struct vnode *,
daddr_t, size_t, size_t);
void wapbl_replay_stop(struct wapbl_replay *);
void wapbl_replay_free(struct wapbl_replay *);
int wapbl_replay_verify(struct wapbl_replay *, struct vnode *);
int wapbl_replay_write(struct wapbl_replay *, struct vnode *);
int wapbl_replay_read(struct wapbl_replay *, void *, daddr_t, long);
/****************************************************************/
/*
 * Supply these to provide I/O support.
 * NOTE(review): the arguments are unnamed; presumably a data buffer, its
 * size in bytes, the vnode to access and a block address on it - confirm.
 */
int wapbl_write(void *, size_t, struct vnode *, daddr_t);
int wapbl_read(void *, size_t, struct vnode *, daddr_t);
/****************************************************************/
#endif /* !_SYS_WAPBL_H */