OpenSolaris_b135/uts/sun4/sys/async.h

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#ifndef	_SYS_ASYNC_H
#define	_SYS_ASYNC_H

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/privregs.h>

#ifdef	__cplusplus
extern "C" {
#endif

#ifndef	_ASM

#include <sys/errorq.h>

/*
 * The async_flt structure is used to record all pertinent information about
 * an asynchronous CPU or bus-related memory error.  Typically, the structure
 * is initialized by a high-level interrupt or trap handler, and then enqueued
 * for later processing.  Separate queues are maintained for correctable and
 * uncorrectable errors.  The current CPU module determines the size of the
 * queue elements, so that it may declare a CPU-specific fault structure
 * which contains a struct async_flt as its first member.  Each async_flt also
 * contains a callback function (flt_func) that is invoked by the processing
 * code in order to actually log messages when the event is dequeued.  This
 * function may be called from a softint, from trap() as part of AST handling
 * before the victim thread returns to userland, or as part of panic().  As
 * such, the flt_func should basically only be calling cmn_err (but NOT with
 * the CE_PANIC flag).  It must not call panic(), acquire locks, or block.
 * The owner of the event is responsible for determining whether the event is
 * fatal; if so, the owner should set flt_panic and panic() after enqueuing
 * the event.  The event will then be dequeued and logged as part of panic
 * processing.  If flt_panic is not set, the queue function will schedule a
 * soft interrupt to process the event.
 */

/*
 * Forward declaration: async_func_t takes a pointer to struct async_flt, and
 * struct async_flt in turn stores an async_func_t (flt_func), so the tag has
 * to be visible before the typedef.
 */
struct async_flt;

/*
 * Type of the logging callback stored in async_flt.flt_func.  It takes a
 * pointer to an async_flt (presumably the record being logged) and a string.
 * NOTE(review): the meaning of the char * argument is not described in this
 * header -- confirm against the code that invokes flt_func.
 */
typedef void (*async_func_t)(struct async_flt *, char *);

/*
 * Generic asynchronous fault record.  Values for several members are defined
 * further down in this file:
 *
 *   flt_addr	AFLT_INV_ADDR when the address is invalid or unknown
 *   flt_status	ECC_* status flags
 *   flt_class	BUS_FAULT, CPU_FAULT, RECIRC_BUS_FAULT or RECIRC_CPU_FAULT
 *   flt_prot	AFLT_PROT_* values
 *
 * A CPU module may declare its own fault structure with a struct async_flt
 * as the first member, so take care when changing the layout below.
 */
struct async_flt {
	uint64_t	flt_id;		/* gethrtime() at time of fault */
	uint64_t	flt_stat;	/* async fault status register */
	uint64_t	flt_addr;	/* async fault address register */
	caddr_t		flt_pc;		/* program counter from error trap */
	async_func_t	flt_func;	/* logging function */
	uint_t		flt_bus_id;	/* hardware bus id# of cpu/sbus/pci */
	uint_t		flt_inst;	/* software instance of cpu/sbus/pci */
	ushort_t	flt_status;	/* error information (ECC_* flags) */
	ushort_t	flt_synd;	/* ECC syndrome */
	uchar_t		flt_in_memory;	/* fault occurred in memory if != 0 */
	uchar_t		flt_class;	/* fault class (cpu or bus) */
	uchar_t		flt_prot;	/* type of fault protection (if any) */
	uchar_t		flt_priv;	/* fault occurred in kernel if != 0 */
	uchar_t		flt_panic;	/* fault caused owner to panic() */
	uchar_t		flt_tl;		/* fault occurred at TL > 0 */
	uchar_t		flt_core;	/* fault occurred during core() dump */
	uchar_t		flt_pad;	/* reserved for future use */
	uint64_t	flt_disp;	/* error disposition information */
	uint64_t	flt_payload;	/* ereport payload information */
	char		*flt_erpt_class; /* ereport class string */
};

/*
 * Bus nexus drivers can use the bus_func_register() interface to register
 * callback functions for error handling and panic handling.  The handler
 * functions should be registered and unregistered from driver attach and
 * detach context, where it is safe to perform a sleeping allocation.  The
 * callbacks themselves can be invoked from panic, or from the CPU module's
 * asynchronous trap handler at high PIL.  As such, these routines may only
 * test for errors and enqueue async_flt events.  They may not grab adaptive
 * locks, call panic(), or invoke bus_func_register() or bus_func_unregister().
 * Each callback function should return one of the BF_* return status values
 * below.  The bus_func_invoke() function calls all the registered handlers of
 * the specified type, and returns the maximum of their return values (e.g.
 * BF_FATAL if any callback returned BF_FATAL).  If any callback returns
 * BF_FATAL, the system will panic at the end of callback processing.
 */

/*
 * Signature of a bus error/panic callback.  It takes a single opaque pointer
 * (presumably the bf_arg supplied at registration -- confirm in the
 * bus_func_invoke() implementation) and returns one of the BF_* status
 * values defined below.
 */
typedef	uint_t (*busfunc_t)(void *);

/*
 * Callback types.  bus_func_invoke() calls every registered handler of the
 * requested type; the type is recorded in bus_func_desc.bf_type.
 */
#define	BF_TYPE_UE		1	/* check for uncorrectable errors */
#define	BF_TYPE_ERRDIS		2	/* disable error detection */
#define	BF_TYPE_RESINTR		3	/* reset interrupts */

/*
 * Callback return status.  The values increase with severity, which is what
 * lets bus_func_invoke() report the worst result by returning the maximum of
 * the handlers' return values.
 */
#define	BF_NONE			0	/* no errors were detected */
#define	BF_NONFATAL		1	/* one or more non-fatal errors found */
#define	BF_FATAL		2	/* one or more fatal errors found */

/*
 * Descriptor for one registered bus callback.  Descriptors are chained
 * together through bf_next.
 */
typedef struct bus_func_desc {
	int bf_type;			/* type of function (BF_TYPE_*) */
	busfunc_t bf_func;		/* function to call */
	void *bf_arg;			/* function argument */
	struct bus_func_desc *bf_next;	/* pointer to next registered desc */
} bus_func_desc_t;

/*
 * Registration interface for bus callbacks.  Register and unregister only
 * from driver attach and detach context, never from within a callback.
 * NOTE(review): the three arguments presumably correspond to the bf_type,
 * bf_func and bf_arg members of bus_func_desc_t -- confirm in the
 * implementation.
 */
extern void bus_func_register(int, busfunc_t, void *);
extern void bus_func_unregister(int, busfunc_t, void *);
/*
 * NOTE(review): this header does not describe bus_async_log_err(); see its
 * definition for the calling contract.
 */
extern void bus_async_log_err(struct async_flt *);
/*
 * Calls all registered handlers of the specified BF_TYPE_* type and returns
 * the maximum of their BF_* return values.
 */
extern uint_t bus_func_invoke(int);

/*
 * NOTE(review): the routines below are only declared here; this header does
 * not describe their arguments or calling context -- see their definitions.
 */
extern void ecc_cpu_call(struct async_flt *, char *, int);

extern void ce_scrub(struct async_flt *);
extern void ecc_page_zero(void *);

extern void error_init(void);

/*
 * Global integer variables related to error handling.
 * NOTE(review): their exact semantics (and whether they are meant to be
 * tuned by an administrator) are not visible in this header -- see the code
 * that reads them.
 */
extern	int	ce_verbose_memory;
extern	int	ce_verbose_other;
extern	int	ce_show_data;
extern	int	ce_debug;
extern	int	ue_debug;

extern	int	aft_verbose;
extern	int	aft_panic;
extern	int	aft_testfatal;

/*
 * A global async_flt instance.
 * NOTE(review): presumably the fault record associated with a panic --
 * confirm against its users.
 */
extern struct async_flt panic_aflt;

/*
 * Error queues.  Separate queues are kept for correctable and uncorrectable
 * errors; presumably ce_queue and ue_queue respectively -- confirm where the
 * queues are created.
 */
extern errorq_t *ce_queue;
extern errorq_t *ue_queue;

#endif	/* !_ASM */

/*
 * ECC or parity error status for async_flt.flt_status.
 *
 * Each value is a distinct single bit.  Bits 0x0001 through 0x4000 are
 * assigned; 0x8000 is not assigned here.  These definitions sit outside the
 * _ASM guard, so they are also visible to assembly sources that include this
 * header.
 */
#define	ECC_C_TRAP		0x0001	/* Trap 0x63 Corrected ECC Error */
#define	ECC_I_TRAP		0x0002	/* Trap 0x0A Instr Access Error */
#define	ECC_ECACHE		0x0004	/* Ecache ECC Error */
#define	ECC_IOBUS		0x0008	/* PCI or sysio ECC Error */
#define	ECC_INTERMITTENT	0x0010	/* Intermittent ECC Error */
#define	ECC_PERSISTENT		0x0020	/* Persistent ECC Error */
#define	ECC_STICKY		0x0040	/* Sticky ECC Error */
#define	ECC_D_TRAP		0x0080	/* Trap 0x32 Data Access Error */
#define	ECC_F_TRAP		0x0100	/* Cheetah Trap 0x70 Fast ECC Error */
#define	ECC_DP_TRAP		0x0200	/* Cheetah+ Trap 0x71 D$ Parity Error */
#define	ECC_IP_TRAP		0x0400	/* Cheetah+ Trap 0x72 I$ Parity Error */
#define	ECC_ITLB_TRAP		0x0800	/* Panther ITLB Parity Error */
#define	ECC_DTLB_TRAP		0x1000	/* Panther DTLB Parity Error */
#define	ECC_IO_CE		0x2000	/* PCI or sysio CE */
#define	ECC_IO_UE		0x4000	/* PCI or sysio UE */

/*
 * Trap type numbers corresponding to the fault types defined above.  Each
 * TRAP_TYPE_ECC_<x> pairs with the ECC_<x>_TRAP status flag of the same
 * suffix.
 */
#define	TRAP_TYPE_ECC_I		0x0A	/* ECC_I_TRAP: Instr Access Error */
#define	TRAP_TYPE_ECC_D		0x32	/* ECC_D_TRAP: Data Access Error */
#define	TRAP_TYPE_ECC_F		0x70	/* ECC_F_TRAP: Fast ECC Error */
#define	TRAP_TYPE_ECC_C		0x63	/* ECC_C_TRAP: Corrected ECC Error */
#define	TRAP_TYPE_ECC_DP	0x71	/* ECC_DP_TRAP: D$ Parity Error */
#define	TRAP_TYPE_ECC_IP	0x72	/* ECC_IP_TRAP: I$ Parity Error */
#define	TRAP_TYPE_ECC_ITLB	0x08	/* ECC_ITLB_TRAP: ITLB Parity Error */
#define	TRAP_TYPE_ECC_DTLB	0x30	/* ECC_DTLB_TRAP: DTLB Parity Error */
#define	TRAP_TYPE_UNKNOWN	0	/* no matching trap type */

/*
 * Fault classes for async_flt.flt_class.  flt_class is a uchar_t, so these
 * are small enumerated values rather than bit flags.
 */
#define	BUS_FAULT		0	/* originating from bus drivers */
#define	CPU_FAULT		1	/* originating from CPUs */
#define	RECIRC_BUS_FAULT	2	/* scheduled diagnostic */
#define	RECIRC_CPU_FAULT	3	/* scheduled diagnostic */

/*
 * Invalid or unknown physical address for async_flt.flt_addr.
 *
 * -1ULL is the all-ones unsigned long long value (negating an unsigned value
 * wraps around), so it matches the uint64_t flt_addr field without any
 * sign-extension surprises.
 */
#define	AFLT_INV_ADDR	(-1ULL)

/*
 * Fault protection values for async_flt.flt_prot.  The async error handling
 * code may be able to recover from errors when kernel code has explicitly
 * protected itself using one of the mechanisms specified here.
 *
 * These are mutually exclusive enumerated values (0 through 3), not bit
 * flags; AFLT_PROT_NONE means no protection mechanism was active.
 */
#define	AFLT_PROT_NONE		0	/* no protection active */
#define	AFLT_PROT_ACCESS	1	/* on_trap OT_DATA_ACCESS protection */
#define	AFLT_PROT_EC		2	/* on_trap OT_DATA_EC protection */
#define	AFLT_PROT_COPY		3	/* t_lofault protection (ucopy, etc.) */

/*
 * These flags are used to indicate the validity of certain data based on
 * the various overwrite priority features of the AFSR/AFAR:
 * AFAR, ESYND and MSYND, each of which has a different overwrite priority.
 *
 * Given a specific afsr error bit and the entire afsr, there are three cases:
 *   INVALID:	The specified bit is lower overwrite priority than some other
 *		error bit which is on in the afsr (or IVU/IVC).
 *   VALID:	The specified bit is higher priority than all other error bits
 *		which are on in the afsr.
 *   AMBIGUOUS: Another error bit (or bits) of equal priority to the specified
 *		bit is on in the afsr.
 *
 * NB: The domain-to-SC communications depend on these values. If they are
 * changed, plat_ecc_unum.[ch] must be updated to match.
 */
#define	AFLT_STAT_INVALID	0	/* higher priority afsr bit is on */
#define	AFLT_STAT_VALID		1	/* this is highest priority afsr bit */
#define	AFLT_STAT_AMBIGUOUS	2	/* two afsr bits of equal priority */

/*
 * Maximum length of unum string.
 * NOTE(review): unlike DIMM_SERIAL_ID_LEN, this comment does not say whether
 * the length includes the terminating null -- confirm before sizing buffers.
 */
#define	UNUM_NAMLEN	60

/*
 * Maximum length of a DIMM serial id string, including the terminating null
 * (i.e. at most 15 characters of serial id).
 */
#define	DIMM_SERIAL_ID_LEN	16

#ifdef	__cplusplus
}
#endif

#endif	/* _SYS_ASYNC_H */