OpenSolaris_b135/uts/common/rpc/rdma_subr.c

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2008, The Ohio State University. All rights reserved.
 *
 * Portions of this source code is developed by the team members of
 * The Ohio State University's Network-Based Computing Laboratory (NBCL),
 * headed by Professor Dhabaleswar K. (DK) Panda.
 *
 * Acknowledgements to contributions from developors:
 *   Ranjit Noronha: noronha@cse.ohio-state.edu
 *   Lei Chai      : chail@cse.ohio-state.edu
 *   Weikuan Yu    : yuw@cse.ohio-state.edu
 *
 */

#include <sys/systm.h>
#include <sys/kstat.h>
#include <sys/modctl.h>
#include <sys/sdt.h>
#include <rpc/rpc_rdma.h>

#include <sys/ib/ibtl/ibti.h>

uint_t rdma_minchunk = RDMA_MINCHUNK;

/*
 * Globals
 */
int rdma_modloaded = 0;		/* flag to load RDMA plugin modules */
int rdma_dev_available = 0;	/* if any RDMA device is loaded */
kmutex_t rdma_modload_lock;	/* protects rdma_modloaded flag */

rdma_svc_wait_t rdma_wait;

rdma_registry_t	*rdma_mod_head = NULL;	/* head for RDMA modules */
krwlock_t	rdma_lock;		/* protects rdma_mod_head list */
ldi_ident_t rpcmod_li = NULL;	/* identifies us with ldi_ framework */

kmem_cache_t *clist_cache = NULL;

/*
 * Statics
 */
ldi_handle_t rpcib_handle = NULL;

/*
 * Externs
 */
extern	kstat_named_t	*rdmarcstat_ptr;
extern	uint_t		rdmarcstat_ndata;
extern	kstat_named_t	*rdmarsstat_ptr;
extern	uint_t		rdmarsstat_ndata;

void rdma_kstat_init();

/*
 * RDMATF module registration routine.
 * This routine is expected to be called by the init routine in
 * the plugin modules.
 */
rdma_stat
rdma_register_mod(rdma_mod_t *mod)
{
	rdma_registry_t **mp, *m;

	if (mod->rdma_version != RDMATF_VERS) {
		return (RDMA_BADVERS);
	}

	rw_enter(&rdma_lock, RW_WRITER);
	/*
	 * Ensure not already registered
	 */
	mp = &rdma_mod_head;
	while (*mp != NULL) {
		if (strncmp((*mp)->r_mod->rdma_api, mod->rdma_api,
		    KNC_STRSIZE) == 0) {
			if ((*mp)->r_mod_state == RDMA_MOD_INACTIVE) {
				(*mp)->r_mod_state = RDMA_MOD_ACTIVE;
				(*mp)->r_mod->rdma_ops = mod->rdma_ops;
				(*mp)->r_mod->rdma_count = mod->rdma_count;
				goto announce_hca;
			}
			rw_exit(&rdma_lock);
			return (RDMA_REG_EXIST);
		}
		mp = &((*mp)->r_next);
	}

	/*
	 * New one, create and add to registry
	 */
	m = kmem_alloc(sizeof (rdma_registry_t), KM_SLEEP);
	m->r_mod = kmem_alloc(sizeof (rdma_mod_t), KM_SLEEP);
	*m->r_mod = *mod;
	m->r_next = NULL;
	m->r_mod->rdma_api = kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
	(void) strncpy(m->r_mod->rdma_api, mod->rdma_api, KNC_STRSIZE);
	m->r_mod->rdma_api[KNC_STRSIZE - 1] = '\0';
	m->r_mod_state = RDMA_MOD_ACTIVE;
	*mp = m;

announce_hca:
	rw_exit(&rdma_lock);
	/*
	 * Start the nfs service on the rdma xprts.
	 * (this notification mechanism will need to change when we support
	 * multiple hcas and have support for multiple rdma plugins).
	 */
	mutex_enter(&rdma_wait.svc_lock);
	rdma_wait.svc_stat = RDMA_HCA_ATTACH;
	cv_signal(&rdma_wait.svc_cv);
	mutex_exit(&rdma_wait.svc_lock);

	return (RDMA_SUCCESS);
}

/*
 * RDMATF module unregistration routine.
 * This routine is expected to be called by the fini routine in
 * the plugin modules.
 */
rdma_stat
rdma_unregister_mod(rdma_mod_t *mod)
{
	rdma_registry_t **m, *mmod = NULL;

	rw_enter(&rdma_lock, RW_WRITER);

	m = &rdma_mod_head;
	while (*m != NULL) {
		if (strncmp((*m)->r_mod->rdma_api, mod->rdma_api,
		    KNC_STRSIZE) != 0) {
			m = &((*m)->r_next);
			continue;
		}
		/*
		 * Check if any device attached, if so return error
		 */
		if (mod->rdma_count != 0) {
			rw_exit(&rdma_lock);
			return (RDMA_FAILED);
		}
		/*
		 * Found entry. Mark it inactive.
		 */
		mmod = *m;
		mmod->r_mod->rdma_count = 0;
		mmod->r_mod_state = RDMA_MOD_INACTIVE;
		break;
	}

	rdma_modloaded = 0;
	rdma_dev_available = 0;
	rw_exit(&rdma_lock);

	/*
	 * Stop the nfs service running on the rdma xprts.
	 * (this notification mechanism will need to change when we support
	 * multiple hcas and have support for multiple rdma plugins).
	 */
	mutex_enter(&rdma_wait.svc_lock);
	rdma_wait.svc_stat = RDMA_HCA_DETACH;
	cv_signal(&rdma_wait.svc_cv);
	mutex_exit(&rdma_wait.svc_lock);

	/*
	 * Not found.
	 */
	return (RDMA_SUCCESS);
}

struct clist *
clist_alloc(void)
{
	struct clist *clp;

	clp = kmem_cache_alloc(clist_cache, KM_SLEEP);

	bzero(clp, sizeof (*clp));

	return (clp);
}

uint32_t
clist_len(struct clist *cl)
{
	uint32_t len = 0;
	while (cl) {
		len += cl->c_len;
		cl = cl->c_next;
	}
	return (len);
}

void
clist_zero_len(struct clist *cl)
{
	while (cl != NULL) {
		if (cl->c_dmemhandle.mrc_rmr == 0)
			break;
		cl->c_len = 0;
		cl = cl->c_next;
	}
}

/*
 * Creates a new chunk list entry, and
 * adds it to the end of a chunk list.
 */
void
clist_add(struct clist **clp, uint32_t xdroff, int len,
	struct mrc *shandle, caddr_t saddr,
	struct mrc *dhandle, caddr_t daddr)
{
	struct clist *cl;

	/* Find the end of the list */

	while (*clp != NULL)
		clp = &((*clp)->c_next);

	cl = clist_alloc();
	cl->c_xdroff = xdroff;
	cl->c_len = len;
	cl->w.c_saddr = (uint64_t)(uintptr_t)saddr;
	if (shandle)
		cl->c_smemhandle = *shandle;
	cl->u.c_daddr = (uint64_t)(uintptr_t)daddr;
	if (dhandle)
		cl->c_dmemhandle = *dhandle;
	cl->c_next = NULL;

	*clp = cl;
}

rdma_stat
clist_register(CONN *conn, struct clist *cl, clist_dstsrc dstsrc)
{
	struct clist *c;
	int status;

	for (c = cl; c; c = c->c_next) {
		if (c->c_len <= 0)
			continue;

		c->c_regtype = dstsrc;

		switch (dstsrc) {
		case CLIST_REG_SOURCE:
			status = RDMA_REGMEMSYNC(conn,
			    (caddr_t)(struct as *)c->c_adspc,
			    (caddr_t)(uintptr_t)c->w.c_saddr3, c->c_len,
			    &c->c_smemhandle, (void **)&c->c_ssynchandle,
			    (void *)c->rb_longbuf.rb_private);
			break;
		case CLIST_REG_DST:
			status = RDMA_REGMEMSYNC(conn,
			    (caddr_t)(struct as *)c->c_adspc,
			    (caddr_t)(uintptr_t)c->u.c_daddr3, c->c_len,
			    &c->c_dmemhandle, (void **)&c->c_dsynchandle,
			    (void *)c->rb_longbuf.rb_private);
			break;
		default:
			return (RDMA_INVAL);
		}
		if (status != RDMA_SUCCESS) {
			(void) clist_deregister(conn, cl);
			return (status);
		}
	}

	return (RDMA_SUCCESS);
}

rdma_stat
clist_deregister(CONN *conn, struct clist *cl)
{
	struct clist *c;

	for (c = cl; c; c = c->c_next) {
		switch (c->c_regtype) {
		case CLIST_REG_SOURCE:
			if (c->c_smemhandle.mrc_rmr != 0) {
				(void) RDMA_DEREGMEMSYNC(conn,
				    (caddr_t)(uintptr_t)c->w.c_saddr3,
				    c->c_smemhandle,
				    (void *)(uintptr_t)c->c_ssynchandle,
				    (void *)c->rb_longbuf.rb_private);
				c->c_smemhandle.mrc_rmr = 0;
				c->c_ssynchandle = NULL;
			}
			break;
		case CLIST_REG_DST:
			if (c->c_dmemhandle.mrc_rmr != 0) {
				(void) RDMA_DEREGMEMSYNC(conn,
				    (caddr_t)(uintptr_t)c->u.c_daddr3,
				    c->c_dmemhandle,
				    (void *)(uintptr_t)c->c_dsynchandle,
				    (void *)c->rb_longbuf.rb_private);
				c->c_dmemhandle.mrc_rmr = 0;
				c->c_dsynchandle = NULL;
			}
			break;
		default:
			/* clist unregistered. continue */
			break;
		}
	}

	return (RDMA_SUCCESS);
}

rdma_stat
clist_syncmem(CONN *conn, struct clist *cl, clist_dstsrc dstsrc)
{
	struct clist *c;
	rdma_stat status;

	c = cl;
	switch (dstsrc) {
	case CLIST_REG_SOURCE:
		while (c != NULL) {
			if (c->c_ssynchandle) {
				status = RDMA_SYNCMEM(conn,
				    (void *)(uintptr_t)c->c_ssynchandle,
				    (caddr_t)(uintptr_t)c->w.c_saddr3,
				    c->c_len, 0);
				if (status != RDMA_SUCCESS)
					return (status);
			}
			c = c->c_next;
		}
		break;
	case CLIST_REG_DST:
		while (c != NULL) {
			if (c->c_ssynchandle) {
				status = RDMA_SYNCMEM(conn,
				    (void *)(uintptr_t)c->c_dsynchandle,
				    (caddr_t)(uintptr_t)c->u.c_daddr3,
				    c->c_len, 1);
				if (status != RDMA_SUCCESS)
					return (status);
			}
			c = c->c_next;
		}
		break;
	default:
		return (RDMA_INVAL);
	}

	return (RDMA_SUCCESS);
}

/*
 * Frees up entries in chunk list
 */
void
clist_free(struct clist *cl)
{
	struct clist *c = cl;

	while (c != NULL) {
		cl = cl->c_next;
		kmem_cache_free(clist_cache, c);
		c = cl;
	}
}

rdma_stat
rdma_clnt_postrecv(CONN *conn, uint32_t xid)
{
	struct clist *cl = NULL;
	rdma_stat retval;
	rdma_buf_t rbuf = {0};

	rbuf.type = RECV_BUFFER;
	if (RDMA_BUF_ALLOC(conn, &rbuf)) {
		return (RDMA_NORESOURCE);
	}

	clist_add(&cl, 0, rbuf.len, &rbuf.handle, rbuf.addr,
	    NULL, NULL);
	retval = RDMA_CLNT_RECVBUF(conn, cl, xid);
	clist_free(cl);

	return (retval);
}

rdma_stat
rdma_clnt_postrecv_remove(CONN *conn, uint32_t xid)
{
	return (RDMA_CLNT_RECVBUF_REMOVE(conn, xid));
}

rdma_stat
rdma_svc_postrecv(CONN *conn)
{
	struct clist *cl = NULL;
	rdma_stat retval;
	rdma_buf_t rbuf = {0};

	rbuf.type = RECV_BUFFER;
	if (RDMA_BUF_ALLOC(conn, &rbuf)) {
		retval = RDMA_NORESOURCE;
	} else {
		clist_add(&cl, 0, rbuf.len, &rbuf.handle, rbuf.addr,
		    NULL, NULL);
		retval = RDMA_SVC_RECVBUF(conn, cl);
		clist_free(cl);
	}
	return (retval);
}

rdma_stat
rdma_buf_alloc(CONN *conn, rdma_buf_t *rbuf)
{
	return (RDMA_BUF_ALLOC(conn, rbuf));
}

void
rdma_buf_free(CONN *conn, rdma_buf_t *rbuf)
{
	if (!rbuf || rbuf->addr == NULL) {
		return;
	}
	RDMA_BUF_FREE(conn, rbuf);
	bzero(rbuf, sizeof (rdma_buf_t));
}

/*
 * Caller is holding rdma_modload_lock mutex
 */
int
rdma_modload()
{
	int status;
	ASSERT(MUTEX_HELD(&rdma_modload_lock));
	/*
	 * Load all available RDMA plugins which right now is only IB plugin.
	 * If no IB hardware is present, then quit right away.
	 * ENODEV -- For no device on the system
	 * EPROTONOSUPPORT -- For module not avilable either due to failure to
	 * load or some other reason.
	 */
	rdma_modloaded = 1;
	if (ibt_hw_is_present() == 0) {
		rdma_dev_available = 0;
		return (ENODEV);
	}

	rdma_dev_available = 1;
	if (rpcmod_li == NULL)
		return (EPROTONOSUPPORT);

	status = ldi_open_by_name("/devices/ib/rpcib@0:rpcib",
	    FREAD | FWRITE, kcred,
	    &rpcib_handle, rpcmod_li);

	if (status != 0)
		return (EPROTONOSUPPORT);


	/*
	 * We will need to reload the plugin module after it was unregistered
	 * but the resources below need to allocated only the first time.
	 */
	if (!clist_cache) {
		clist_cache = kmem_cache_create("rdma_clist",
		    sizeof (struct clist), _POINTER_ALIGNMENT, NULL,
		    NULL, NULL, NULL, 0, 0);
		rdma_kstat_init();
	}

	(void) ldi_close(rpcib_handle, FREAD|FWRITE, kcred);

	return (0);
}

void
rdma_kstat_init(void)
{
	kstat_t *ksp;

	/*
	 * The RDMA framework doesn't know how to deal with Zones, and is
	 * only available in the global zone.
	 */
	ASSERT(INGLOBALZONE(curproc));
	ksp = kstat_create_zone("unix", 0, "rpc_rdma_client", "rpc",
	    KSTAT_TYPE_NAMED, rdmarcstat_ndata,
	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, GLOBAL_ZONEID);
	if (ksp) {
		ksp->ks_data = (void *) rdmarcstat_ptr;
		kstat_install(ksp);
	}

	ksp = kstat_create_zone("unix", 0, "rpc_rdma_server", "rpc",
	    KSTAT_TYPE_NAMED, rdmarsstat_ndata,
	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, GLOBAL_ZONEID);
	if (ksp) {
		ksp->ks_data = (void *) rdmarsstat_ptr;
		kstat_install(ksp);
	}
}

rdma_stat
rdma_kwait(void)
{
	int ret;
	rdma_stat stat;

	mutex_enter(&rdma_wait.svc_lock);

	ret = cv_wait_sig(&rdma_wait.svc_cv, &rdma_wait.svc_lock);

	/*
	 * If signalled by a hca attach/detach, pass the right
	 * stat back.
	 */

	if (ret)
		stat =  rdma_wait.svc_stat;
	else
		stat = RDMA_INTR;

	mutex_exit(&rdma_wait.svc_lock);

	return (stat);
}