/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 2006-2009 Oracle.  All rights reserved.
 *
 * $Id$
 */

#include "db_config.h"

#define	__INCLUDE_NETWORKING	1
#include "db_int.h"

/*
 * Signature of a timer-driven action.  __repmgr_next_timeout() selects one
 * of these (sending a heartbeat, or reacting to a silent master), and
 * __repmgr_check_timeouts() invokes it once the deadline has passed.  A
 * non-zero return is treated as an error by the caller.
 */
typedef int (*HEARTBEAT_ACTION) __P((ENV *));

/* Forward declarations of the file-local helpers used in this file. */
static int accept_handshake __P((ENV *, REPMGR_CONNECTION *, char *));
static int accept_v1_handshake __P((ENV *, REPMGR_CONNECTION *, char *));
static int __repmgr_call_election __P((ENV *));
static int __repmgr_connect __P((ENV*, socket_t *, REPMGR_SITE *));
static int dispatch_msgin __P((ENV *, REPMGR_CONNECTION *));
static int find_version_info __P((ENV *, REPMGR_CONNECTION *, DBT *));
static int introduce_site __P((ENV *, char *, u_int, REPMGR_SITE**, u_int32_t));
static int __repmgr_next_timeout __P((ENV *,
    db_timespec *, HEARTBEAT_ACTION *));
static int dispatch_phase_completion __P((ENV *, REPMGR_CONNECTION *));
static REPMGR_CONNECTION *__repmgr_master_connection __P((ENV *));
static int process_parameters __P((ENV *,
    REPMGR_CONNECTION *, char *, u_int, u_int32_t, u_int32_t));
static int read_version_response __P((ENV *, REPMGR_CONNECTION *));
static int record_ack __P((ENV *, REPMGR_CONNECTION *));
static int __repmgr_retry_connections __P((ENV *));
static int send_handshake __P((ENV *, REPMGR_CONNECTION *, void *, size_t));
static int __repmgr_send_heartbeat __P((ENV *));
static int send_v1_handshake __P((ENV *,
    REPMGR_CONNECTION *, void *, size_t));
static int send_version_response __P((ENV *, REPMGR_CONNECTION *));
static int __repmgr_try_one __P((ENV *, u_int));

/*
 * Guard used while a connection is still being negotiated: any message other
 * than a REPMGR_HANDSHAKE is rejected.  The macro logs the offending message
 * type and the connection state, and then makes the *enclosing function*
 * return DB_REP_UNAVAIL; it can therefore only be used inside functions that
 * return int.
 *
 * The arguments are parenthesized so the expansion stays correct even when
 * the macro is handed an expression rather than a plain identifier.
 */
#define	ONLY_HANDSHAKE(env, conn) do {				     \
	if ((conn)->msg_type != REPMGR_HANDSHAKE) {		     \
		__db_errx((env), "unexpected msg type %d in state %d", \
		    (int)(conn)->msg_type, (conn)->state);	     \
		return (DB_REP_UNAVAIL);			     \
	}							     \
} while (0)

/*
 * Thread start routine for the select thread: runs the select loop until it
 * finishes.  If the loop ends with an error, the error is logged and reported
 * through __repmgr_thread_failure().  The thread result is always NULL.
 *
 * PUBLIC: void *__repmgr_select_thread __P((void *));
 */
void *
__repmgr_select_thread(args)
	void *args;
{
	ENV *env;
	int status;

	/* The thread argument is the environment handle. */
	env = args;

	status = __repmgr_select_loop(env);
	if (status == 0)
		return (NULL);

	__db_err(env, status, "select loop failed");
	__repmgr_thread_failure(env, status);
	return (NULL);
}

/*
 * PUBLIC: int __repmgr_accept __P((ENV *));
 */
int
__repmgr_accept(env)
	ENV *env;
{
	DB_REP *db_rep;
	REPMGR_CONNECTION *conn;
	struct sockaddr_in siaddr;
	socklen_t addrlen;
	socket_t s;
	int ret;
#ifdef DB_WIN32
	WSAEVENT event_obj;
#endif

	db_rep = env->rep_handle;
	addrlen = sizeof(siaddr);
	if ((s = accept(db_rep->listen_fd, (struct sockaddr *)&siaddr,
	    &addrlen)) == -1) {
		/*
		 * Some errors are innocuous and so should be ignored.  MSDN
		 * Library documents the Windows ones; the Unix ones are
		 * advocated in Stevens' UNPv1, section 16.6; and Linux
		 * Application Development, p. 416.
		 */
		switch (ret = net_errno) {
#ifdef DB_WIN32
		case WSAECONNRESET:
		case WSAEWOULDBLOCK:
#else
		case EINTR:
		case EWOULDBLOCK:
		case ECONNABORTED:
		case ENETDOWN:
#ifdef EPROTO
		case EPROTO:
#endif
		case ENOPROTOOPT:
		case EHOSTDOWN:
#ifdef ENONET
		case ENONET:
#endif
		case EHOSTUNREACH:
		case EOPNOTSUPP:
		case ENETUNREACH:
#endif
			RPRINT(env, DB_VERB_REPMGR_MISC, (env,
			    "accept error %d considered innocuous", ret));
			return (0);
		default:
			__db_err(env, ret, "accept error");
			return (ret);
		}
	}
	RPRINT(env, DB_VERB_REPMGR_MISC, (env, "accepted a new connection"));

	if ((ret = __repmgr_set_nonblocking(s)) != 0) {
		__db_err(env, ret, "can't set nonblock after accept");
		(void)closesocket(s);
		return (ret);
	}

#ifdef DB_WIN32
	if ((event_obj = WSACreateEvent()) == WSA_INVALID_EVENT) {
		ret = net_errno;
		__db_err(env, ret, "can't create WSA event");
		(void)closesocket(s);
		return (ret);
	}
	if (WSAEventSelect(s, event_obj, FD_READ|FD_CLOSE) == SOCKET_ERROR) {
		ret = net_errno;
		__db_err(env, ret, "can't set desired event bits");
		(void)WSACloseEvent(event_obj);
		(void)closesocket(s);
		return (ret);
	}
#endif
	if ((ret =
	    __repmgr_new_connection(env, &conn, s, CONN_NEGOTIATE)) != 0) {
#ifdef DB_WIN32
		(void)WSACloseEvent(event_obj);
#endif
		(void)closesocket(s);
		return (ret);
	}
	F_SET(conn, CONN_INCOMING);

	/*
	 * We don't yet know which site this connection is coming from.  So for
	 * now, put it on the "orphans" list; we'll move it to the appropriate
	 * site struct later when we discover who we're talking with, and what
	 * type of connection it is.
	 */
	conn->eid = -1;
	TAILQ_INSERT_TAIL(&db_rep->connections, conn, entries);

#ifdef DB_WIN32
	conn->event_object = event_obj;
#endif
	return (0);
}

/*
 * Works out how long the caller may wait for input before it has to wake up
 * and do something.  Returns TRUE with *timeout filled in when there is a
 * deadline, or FALSE (leaving *timeout untouched) when there is nothing to
 * wait for.
 *
 * The computed timeout may be zero, but it is never negative.
 *
 * PUBLIC: int __repmgr_compute_timeout __P((ENV *, db_timespec *));
 */
int
__repmgr_compute_timeout(env, timeout)
	ENV *env;
	db_timespec *timeout;
{
	DB_REP *db_rep;
	REPMGR_RETRY *first;
	db_timespec deadline, now;
	int have_timeout;

	db_rep = env->rep_handle;

	/*
	 * Two things can impose a deadline: heartbeat processing, and sites
	 * whose broken connections are waiting to be retried.
	 */
	have_timeout = __repmgr_next_timeout(env, &deadline, NULL);

	/*
	 * The retry list is kept in time order, so its head is the earliest
	 * retry.  Use it if it is the only deadline, or the sooner of the two.
	 */
	if (!TAILQ_EMPTY(&db_rep->retries)) {
		first = TAILQ_FIRST(&db_rep->retries);
		if (!have_timeout || timespeccmp(&first->time, &deadline, <))
			deadline = first->time;
		have_timeout = TRUE;
	}

	if (!have_timeout)
		return (have_timeout);

	/* Convert the absolute deadline to a wait time, clamped at zero. */
	__os_gettime(env, &now, 1);
	if (timespeccmp(&now, &deadline, <)) {
		*timeout = deadline;
		timespecsub(timeout, &now);
	} else {
		timespecclear(timeout);
	}

	return (have_timeout);
}

/*
 * Determines the next heartbeat-related action and the time at which it
 * falls due.  This lives in its own function because the same computation is
 * needed both before each select() call and afterwards, when checking for
 * timer expiration.
 *
 * Returns TRUE and stores the deadline in *deadline (and the action in
 * *action, when action is non-NULL); returns FALSE, storing nothing, when
 * there is no heartbeat work to schedule.
 */
static int
__repmgr_next_timeout(env, deadline, action)
	ENV *env;
	db_timespec *deadline;
	HEARTBEAT_ACTION *action;
{
	DB_REP *db_rep;
	HEARTBEAT_ACTION todo;
	REPMGR_CONNECTION *master_conn;
	REPMGR_SITE *master;
	db_timespec when;

	db_rep = env->rep_handle;

	if (db_rep->master_eid == SELF_EID && db_rep->heartbeat_frequency > 0) {
		/* We are the master: next heartbeat is due after last bcast. */
		when = db_rep->last_bcast;
		TIMESPEC_ADD_DB_TIMEOUT(&when, db_rep->heartbeat_frequency);
		todo = __repmgr_send_heartbeat;
	} else {
		/*
		 * Monitoring only makes sense with a working connection to a
		 * heartbeat-aware master; otherwise there is nothing to do.
		 */
		master_conn = __repmgr_master_connection(env);
		if (master_conn == NULL ||
		    IS_SUBORDINATE(db_rep) ||
		    !(db_rep->heartbeat_monitor_timeout > 0) ||
		    !(master_conn->version >= HEARTBEAT_MIN_VERSION))
			return (FALSE);

		master = SITE_FROM_EID(db_rep->master_eid);
		when = master->last_rcvd_timestamp;
		TIMESPEC_ADD_DB_TIMEOUT(&when,
		    db_rep->heartbeat_monitor_timeout);
		todo = __repmgr_call_election;
	}

	if (action != NULL)
		*action = todo;
	*deadline = when;
	return (TRUE);
}

/*
 * Broadcasts a REPMGR_HEARTBEAT message with empty control and rec parts.
 * The two counts reported by the broadcast routine are not needed here.
 * Returns whatever __repmgr_send_broadcast() returns.
 */
static int
__repmgr_send_heartbeat(env)
	ENV *env;
{
	DBT no_control, no_rec;
	u_int ignored1, ignored2;
	int ret;

	DB_INIT_DBT(no_control, NULL, 0);
	DB_INIT_DBT(no_rec, NULL, 0);

	ret = __repmgr_send_broadcast(env,
	    REPMGR_HEARTBEAT, &no_control, &no_rec, &ignored1, &ignored2);
	return (ret);
}

/*
 * Returns the connection to the current master, provided that the master is
 * a remote site with a valid EID, the site is in the SITE_CONNECTED state,
 * and its connection is in a ready state.  Returns NULL otherwise.
 */
static REPMGR_CONNECTION *
__repmgr_master_connection(env)
	ENV *env;
{
	DB_REP *db_rep;
	REPMGR_CONNECTION *candidate;
	REPMGR_SITE *master_site;

	db_rep = env->rep_handle;

	if (db_rep->master_eid != SELF_EID &&
	    IS_VALID_EID(db_rep->master_eid)) {
		master_site = SITE_FROM_EID(db_rep->master_eid);
		if (master_site->state == SITE_CONNECTED) {
			candidate = master_site->ref.conn;
			if (IS_READY_STATE(candidate->state))
				return (candidate);
		}
	}
	return (NULL);
}

/*
 * Heartbeat-monitor action: the master has been silent for too long.  Counts
 * a dropped connection in the statistics and busts the master connection,
 * returning the result of __repmgr_bust_connection().
 */
static int
__repmgr_call_election(env)
	ENV *env;
{
	REPMGR_CONNECTION *master_conn;
	int ret;

	master_conn = __repmgr_master_connection(env);
	/* This action is only selected while a master connection exists. */
	DB_ASSERT(env, master_conn != NULL);

	RPRINT(env, DB_VERB_REPMGR_MISC,
	    (env, "heartbeat monitor timeout expired"));
	STAT(env->rep_handle->region->mstat.st_connection_drop++);

	ret = __repmgr_bust_connection(env, master_conn);
	return (ret);
}

/*
 * Performs whatever timer-driven work has fallen due: first the pending
 * heartbeat-related action (if its deadline has passed), then any connection
 * retries.  Returns the first non-zero error encountered, else the result of
 * the retry pass.
 *
 * PUBLIC: int __repmgr_check_timeouts __P((ENV *));
 *
 * !!!
 * Assumes caller holds the mutex.
 */
int
__repmgr_check_timeouts(env)
	ENV *env;
{
	db_timespec deadline, now;
	HEARTBEAT_ACTION action;
	int ret;

	/* Find the next heartbeat-related action; run it only if it is due. */
	if (__repmgr_next_timeout(env, &deadline, &action)) {
		__os_gettime(env, &now, 1);
		if (timespeccmp(&deadline, &now, <=)) {
			ret = (*action)(env);
			if (ret != 0)
				return (ret);
		}
	}

	ret = __repmgr_retry_connections(env);
	return (ret);
}

/*
 * Starts a connection attempt for every entry on the retry list whose retry
 * time is already in the past.  Each such entry is removed from the list and
 * freed before the attempt is made.  Stops at, and returns, the first
 * non-zero result from __repmgr_try_one(); returns 0 otherwise.
 */
static int
__repmgr_retry_connections(env)
	ENV *env;
{
	DB_REP *db_rep;
	REPMGR_RETRY *due;
	db_timespec now;
	u_int eid;
	int ret;

	db_rep = env->rep_handle;
	__os_gettime(env, &now, 1);

	for (;;) {
		if (TAILQ_EMPTY(&db_rep->retries))
			break;

		/* Entries are in time order: stop at the first not yet due. */
		due = TAILQ_FIRST(&db_rep->retries);
		if (timespeccmp(&due->time, &now, >=))
			break;

		/* Remember the site before discarding the list entry. */
		eid = due->eid;
		TAILQ_REMOVE(&db_rep->retries, due, entries);
		__os_free(env, due);

		ret = __repmgr_try_one(env, eid);
		if (ret != 0)
			return (ret);
	}
	return (0);
}

/*
 * Makes an initial connection attempt to every known site, in EID order.
 * Stops at, and returns, the first non-zero result from __repmgr_try_one();
 * returns 0 if every attempt reported success.
 *
 * PUBLIC: int __repmgr_first_try_connections __P((ENV *));
 *
 * !!!
 * Assumes caller holds the mutex.
 */
int
__repmgr_first_try_connections(env)
	ENV *env;
{
	DB_REP *db_rep;
	u_int eid;
	int ret;

	db_rep = env->rep_handle;

	eid = 0;
	while (eid < db_rep->site_cnt) {
		ret = __repmgr_try_one(env, eid);
		if (ret != 0)
			return (ret);
		eid++;
	}
	return (0);
}

/*
 * Best-effort attempt to connect to the site with the given EID.  Only
 * disastrous failures are reported with a non-zero return.  For a re-tryable
 * failure another attempt is scheduled, which counts as success enough.
 */
static int
__repmgr_try_one(env, eid)
	ENV *env;
	u_int eid;
{
	ADDRINFO *resolved;
	DB_REP *db_rep;
	repmgr_netaddr_t *addr;
	int ret;

	db_rep = env->rep_handle;
	addr = &SITE_FROM_EID(eid)->net_addr;

	if (ADDR_LIST_FIRST(addr) == NULL) {
		/* No address list yet: look the host up now. */
		ret = __repmgr_getaddr(env,
		    addr->host, addr->port, 0, &resolved);
		if (ret == DB_REP_UNAVAIL)
			return (__repmgr_schedule_connection_attempt(
			    env, eid, FALSE));
		if (ret != 0)
			return (ret);

		/*
		 * Install the new list and evaluate ADDR_LIST_FIRST again
		 * for it; the value itself is deliberately ignored.
		 */
		addr->address_list = resolved;
		(void)ADDR_LIST_FIRST(addr);
	}

	/* At this point the site has a valid address. */
	return (__repmgr_connect_site(env, eid));
}

/*
 * Attempts to establish a connection with the site identified by eid,
 * beginning at the "current" element of the site's address list and trying
 * as many addresses as needed until the list is used up.
 *
 * If no address can be connected to, the failure is counted and a later
 * attempt is scheduled.  Otherwise a connection object is created and tied
 * to the site; when the connect completed immediately, the version proposal
 * is sent right away.
 *
 * PUBLIC: int __repmgr_connect_site __P((ENV *, u_int eid));
 */
int
__repmgr_connect_site(env, eid)
	ENV *env;
	u_int eid;
{
	DB_REP *db_rep;
	REPMGR_CONNECTION *new_conn;
	REPMGR_SITE *site;
	socket_t fd;
	int conn_state;
	int ret;
#ifdef DB_WIN32
	long desired_event;
	WSAEVENT event_obj;
#endif

	db_rep = env->rep_handle;
	site = SITE_FROM_EID(eid);

	ret = __repmgr_connect(env, &fd, site);
	if (ret == 0) {
		/* Connected at once. */
		conn_state = CONN_CONNECTED;
#ifdef DB_WIN32
		desired_event = FD_READ|FD_CLOSE;
#endif
	} else if (ret == INPROGRESS) {
		/* Non-blocking connect still under way. */
		conn_state = CONN_CONNECTING;
#ifdef DB_WIN32
		desired_event = FD_CONNECT;
#endif
	} else {
		STAT(db_rep->region->mstat.st_connect_fail++);
		return (
		    __repmgr_schedule_connection_attempt(env, eid, FALSE));
	}

#ifdef DB_WIN32
	event_obj = WSACreateEvent();
	if (event_obj == WSA_INVALID_EVENT) {
		ret = net_errno;
		__db_err(env, ret, "can't create WSA event");
		(void)closesocket(fd);
		return (ret);
	}
	if (WSAEventSelect(fd, event_obj, desired_event) == SOCKET_ERROR) {
		ret = net_errno;
		__db_err(env, ret, "can't set desired event bits");
		(void)WSACloseEvent(event_obj);
		(void)closesocket(fd);
		return (ret);
	}
#endif

	ret = __repmgr_new_connection(env, &new_conn, fd, conn_state);
	if (ret != 0) {
#ifdef DB_WIN32
		(void)WSACloseEvent(event_obj);
#endif
		(void)closesocket(fd);
		return (ret);
	}
#ifdef DB_WIN32
	new_conn->event_object = event_obj;
#endif

	/* Tie the connection and the site together. */
	new_conn->eid = (int)eid;
	site->ref.conn = new_conn;
	site->state = SITE_CONNECTED;

	if (conn_state != CONN_CONNECTED)
		return (0);

	/* Already connected: time-stamp the site and start negotiating. */
	__os_gettime(env, &site->last_rcvd_timestamp, 1);
	ret = __repmgr_propose_version(env, new_conn);
	if (ret == DB_REP_UNAVAIL)
		return (__repmgr_bust_connection(env, new_conn));
	return (ret);
}

static int
__repmgr_connect(env, socket_result, site)
	ENV *env;
	socket_t *socket_result;
	REPMGR_SITE *site;
{
	repmgr_netaddr_t *addr;
	ADDRINFO *ai;
	socket_t s;
	char *why;
	int ret;
	SITE_STRING_BUFFER buffer;

	/*
	 * Lint doesn't know about DB_ASSERT, so it can't tell that this
	 * loop will always get executed at least once, giving 'why' a value.
	 */
	COMPQUIET(why, "");
	addr = &site->net_addr;
	ai = ADDR_LIST_CURRENT(addr);
	DB_ASSERT(env, ai != NULL);
	for (; ai != NULL; ai = ADDR_LIST_NEXT(addr)) {

		if ((s = socket(ai->ai_family,
		    ai->ai_socktype, ai->ai_protocol)) == SOCKET_ERROR) {
			why = "can't create socket to connect";
			continue;
		}

		if ((ret = __repmgr_set_nonblocking(s)) != 0) {
			__db_err(env,
			    ret, "can't make nonblock socket to connect");
			(void)closesocket(s);
			return (ret);
		}

		if (connect(s, ai->ai_addr, (socklen_t)ai->ai_addrlen) != 0)
			ret = net_errno;

		if (ret == 0 || ret == INPROGRESS) {
			*socket_result = s;
			RPRINT(env, DB_VERB_REPMGR_MISC, (env,
			    "init connection to %s with result %d",
			    __repmgr_format_site_loc(site, buffer), ret));
			return (ret);
		}

		why = "connection failed";
		(void)closesocket(s);
	}

	/* We've exhausted all possible addresses. */
	ret = net_errno;
	__db_err(env, ret, "%s to %s", why,
	    __repmgr_format_site_loc(site, buffer));
	return (ret);
}

/*
 * Sends a proposal for version negotiation.
 *
 * The proposal is carried in the rec part of a V1-format handshake, after
 * the local host name, so that a V1 peer still sees a handshake it can
 * understand.  Returns 0 on success, or the error from allocating the
 * record or from sending the handshake.
 *
 * PUBLIC: int __repmgr_propose_version __P((ENV *, REPMGR_CONNECTION *));
 */
int
__repmgr_propose_version(env, conn)
	ENV *env;
	REPMGR_CONNECTION *conn;
{
	DB_REP *db_rep;
	__repmgr_version_proposal_args versions;
	repmgr_netaddr_t *my_addr;
	size_t hostname_len, rec_length;
	u_int8_t *buf, *p;
	int ret;

	db_rep = env->rep_handle;
	my_addr = &db_rep->my_addr;

	/*
	 * In repmgr wire protocol version 1, a handshake message had a rec part
	 * that looked like this:
	 *
	 *  +-----------------+----+
	 *  |  host name ...  | \0 |
	 *  +-----------------+----+
	 *
	 * To ensure its own sanity, the old repmgr would write a NUL into the
	 * last byte of a received message, and then use normal C library string
	 * operations (e.g., strlen, strcpy).
	 *
	 * Now, a version proposal has a rec part that looks like this:
	 *
	 *  +-----------------+----+------------------+------+
	 *  |  host name ...  | \0 |  extra info ...  |  \0  |
	 *  +-----------------+----+------------------+------+
	 *
	 * The "extra info" contains the version parameters, in marshaled form.
	 */

	hostname_len = strlen(my_addr->host);
	rec_length = hostname_len + 1 +
	    __REPMGR_VERSION_PROPOSAL_SIZE + 1;
	if ((ret = __os_malloc(env, rec_length, &buf)) != 0)
		return (ret);
	p = buf;
	(void)strcpy((char*)p, my_addr->host);

	p += hostname_len + 1;
	versions.min = DB_REPMGR_MIN_VERSION;
	versions.max = DB_REPMGR_VERSION;
	__repmgr_version_proposal_marshal(env, &versions, p);

	/*
	 * The final byte is the pad that a V1 receiver overwrites with a NUL.
	 * Nothing above stores into it, so set it explicitly: otherwise one
	 * uninitialized heap byte would be sent over the network.
	 */
	buf[rec_length - 1] = '\0';

	ret = send_v1_handshake(env, conn, buf, rec_length);
	__os_free(env, buf);
	return (ret);
}

/*
 * Sends a REPMGR_HANDSHAKE message in the version 1 layout: the control
 * part is a DB_REPMGR_V1_HANDSHAKE holding the local priority and port, and
 * the rec part is the caller's buffer of len bytes.  Returns the result of
 * __repmgr_send_one().
 */
static int
send_v1_handshake(env, conn, buf, len)
	ENV *env;
	REPMGR_CONNECTION *conn;
	void *buf;
	size_t len;
{
	DB_REP *db_rep;
	DB_REPMGR_V1_HANDSHAKE hs;
	DBT cntrl, rec;
	int ret;

	db_rep = env->rep_handle;

	/* Control part: fixed-format V1 header. */
	hs.version = 1;
	hs.priority = htonl(db_rep->region->priority);
	hs.port = db_rep->my_addr.port;
	cntrl.data = &hs;
	cntrl.size = sizeof(hs);

	/* Rec part: supplied by the caller. */
	rec.data = buf;
	rec.size = (u_int32_t)len;

	/*
	 * Blocking the select() thread would be disastrous, so "blockable"
	 * is passed as FALSE.  Blocking should never be needed anyway, since
	 * the handshake is always the first thing sent on a connection --
	 * fortunately, because dropping a handshake would be nearly as bad.
	 */
	ret = __repmgr_send_one(env,
	    conn, REPMGR_HANDSHAKE, &cntrl, &rec, FALSE);
	return (ret);
}

/*
 * PUBLIC: int __repmgr_read_from_site __P((ENV *, REPMGR_CONNECTION *));
 *
 * !!!
 * Caller is assumed to hold repmgr->mutex, 'cuz we call queue_put() from here.
 */
int
__repmgr_read_from_site(env, conn)
	ENV *env;
	REPMGR_CONNECTION *conn;
{
	DB_REP *db_rep;
	REPMGR_SITE *site;
	SITE_STRING_BUFFER buffer;
	size_t nr;
	int ret;

	db_rep = env->rep_handle;
	/*
	 * Keep reading pieces as long as we're making some progress, or until
	 * we complete the current read phase.
	 */
	for (;;) {
		if ((ret = __repmgr_readv(conn->fd,
		    &conn->iovecs.vectors[conn->iovecs.offset],
		    conn->iovecs.count - conn->iovecs.offset, &nr)) != 0) {
			switch (ret) {
#ifndef DB_WIN32
			case EINTR:
				continue;
#endif
			case WOULDBLOCK:
				return (0);
			default:
#ifdef EBADF
				DB_ASSERT(env, ret != EBADF);
#endif
				(void)__repmgr_format_eid_loc(env->rep_handle,
				    conn->eid, buffer);
				__db_err(env, ret,
				    "can't read from %s", buffer);
				STAT(env->rep_handle->
				    region->mstat.st_connection_drop++);
				return (DB_REP_UNAVAIL);
			}
		}

		if (nr > 0) {
			if (IS_VALID_EID(conn->eid)) {
				site = SITE_FROM_EID(conn->eid);
				__os_gettime(
				    env, &site->last_rcvd_timestamp, 1);
			}
			if (__repmgr_update_consumed(&conn->iovecs, nr))
				return (dispatch_phase_completion(env,
					    conn));
		} else {
			(void)__repmgr_format_eid_loc(env->rep_handle,
			    conn->eid, buffer);
			__db_errx(env, "EOF on connection from %s", buffer);
			STAT(env->rep_handle->
			    region->mstat.st_connection_drop++);
			return (DB_REP_UNAVAIL);
		}
	}
}

/*
 * Handles whatever needs to be done upon the completion of a reading phase on a
 * given connection.
 */
static int
dispatch_phase_completion(env, conn)
	ENV *env;
	REPMGR_CONNECTION *conn;
{
#define	MEM_ALIGN sizeof(double)
	DBT *dbt;
	u_int32_t control_size, rec_size;
	size_t memsize, control_offset, rec_offset;
	void *membase;
	int ret;

	switch (conn->reading_phase) {
	case SIZES_PHASE:
		/*
		 * We've received the header: a message type and the lengths of
		 * the two pieces of the message.  Set up buffers to read the
		 * two pieces.  This set-up is a bit different for a
		 * REPMGR_REP_MESSAGE, because we plan to pass it off to the msg
		 * threads.
		 */
		__repmgr_iovec_init(&conn->iovecs);
		control_size = ntohl(conn->control_size_buf);
		rec_size = ntohl(conn->rec_size_buf);

		if (conn->msg_type == REPMGR_REP_MESSAGE) {
			if (control_size == 0) {
				__db_errx(
				    env, "illegal size for rep msg");
				return (DB_REP_UNAVAIL);
			}
			/*
			 * Allocate a block of memory large enough to hold a
			 * DB_REPMGR_MESSAGE wrapper, plus the (one or) two DBT
			 * data areas that it points to.  Start by calculating
			 * the total memory needed, rounding up for the start of
			 * each DBT, to ensure possible alignment requirements.
			 */
			memsize = (size_t)
			    DB_ALIGN(sizeof(REPMGR_MESSAGE), MEM_ALIGN);
			control_offset = memsize;
			memsize += control_size;
			if (rec_size > 0) {
				memsize = (size_t)DB_ALIGN(memsize, MEM_ALIGN);
				rec_offset = memsize;
				memsize += rec_size;
			} else
				COMPQUIET(rec_offset, 0);
			if ((ret = __os_malloc(env, memsize, &membase)) != 0)
				return (ret);
			conn->input.rep_message = membase;

			conn->input.rep_message->originating_eid = conn->eid;
			DB_INIT_DBT(conn->input.rep_message->control,
			    (u_int8_t*)membase + control_offset, control_size);
			__repmgr_add_dbt(&conn->iovecs,
			    &conn->input.rep_message->control);

			if (rec_size > 0) {
				DB_INIT_DBT(conn->input.rep_message->rec,
				    (rec_size > 0 ?
					(u_int8_t*)membase + rec_offset : NULL),
				    rec_size);
				__repmgr_add_dbt(&conn->iovecs,
				    &conn->input.rep_message->rec);
			} else
				DB_INIT_DBT(conn->input.rep_message->rec,
				    NULL, 0);
		} else {
			conn->input.repmgr_msg.cntrl.size = control_size;
			conn->input.repmgr_msg.rec.size = rec_size;

			if (control_size > 0) {
				dbt = &conn->input.repmgr_msg.cntrl;
				if ((ret = __os_malloc(env, control_size,
				    &dbt->data)) != 0)
					return (ret);
				__repmgr_add_dbt(&conn->iovecs, dbt);
			}

			if (rec_size > 0) {
				dbt = &conn->input.repmgr_msg.rec;
				if ((ret = __os_malloc(env, rec_size,
				     &dbt->data)) != 0) {
					if (control_size > 0)
						__os_free(env,
						    conn->input.repmgr_msg.
						    cntrl.data);
					return (ret);
				}
				__repmgr_add_dbt(&conn->iovecs, dbt);
			}
		}

		conn->reading_phase = DATA_PHASE;

		if (control_size > 0 || rec_size > 0)
			break;

		/*
		 * However, if they're both 0, we're ready to complete
		 * DATA_PHASE.
		 */
		/* FALLTHROUGH */

	case DATA_PHASE:
		return (dispatch_msgin(env, conn));

	default:
		DB_ASSERT(env, FALSE);
	}

	return (0);
}

/*
 * Processes a completely received message, according to the connection's
 * current state.
 *
 * While the connection is still being negotiated only handshakes are
 * accepted.  Once it is ready, acks and heartbeats are handled in line and
 * replication messages are queued for the message threads.  Unless the
 * message buffer has been handed over to the queue, the control and rec
 * buffers are freed here on success, and the connection is then reset for
 * reading the next message.
 *
 * Returns 0 on success; DB_REP_UNAVAIL for a protocol violation; or the
 * error returned by the routine that handled the message.
 */
static int
dispatch_msgin(env, conn)
	ENV *env;
	REPMGR_CONNECTION *conn;
{
	DBT *dbt;
	char *hostname;
	int given, ret;

	given = FALSE;

	switch (conn->state) {
	case CONN_CONNECTED:
		/*
		 * In this state, we know we're working with an outgoing
		 * connection.  We've sent a version proposal, and now expect
		 * the response (which could be a dumb old V1 handshake).
		 */
		ONLY_HANDSHAKE(env, conn);
		if ((ret = read_version_response(env, conn)) != 0)
			return (ret);
		break;

	case CONN_NEGOTIATE:
		/*
		 * Since we're in this state, we know we're working with an
		 * incoming connection, and this is the first message we've
		 * received.  So it must be a version negotiation proposal (or a
		 * legacy V1 handshake).  (We'll verify this of course.)
		 */
		ONLY_HANDSHAKE(env, conn);
		if ((ret = send_version_response(env, conn)) != 0)
			return (ret);
		break;

	case CONN_PARAMETERS:
		/*
		 * We've previously agreed on a (>1) version, and are now simply
		 * awaiting the other side's parameters handshake.
		 */
		ONLY_HANDSHAKE(env, conn);
		dbt = &conn->input.repmgr_msg.rec;
		/*
		 * The rec size comes straight from the remote site.  If it is
		 * zero, no rec buffer was allocated for this message, so
		 * storing the terminator at index size-1 would write through
		 * an invalid pointer at a huge offset.  Reject the message
		 * instead, just as find_version_info() does.
		 */
		if (dbt->size == 0) {
			__db_errx(env, "handshake is missing rec part");
			return (DB_REP_UNAVAIL);
		}
		hostname = dbt->data;
		/* Force termination before treating the data as a string. */
		hostname[dbt->size-1] = '\0';
		if ((ret = accept_handshake(env, conn, hostname)) != 0)
			return (ret);
		conn->state = CONN_READY;
		break;

	case CONN_READY:	/* FALLTHROUGH */
	case CONN_CONGESTED:
		/*
		 * We have a complete message, so process it.  Acks and
		 * handshakes get processed here, in line.  Regular rep messages
		 * get posted to a queue, to be handled by a thread from the
		 * message thread pool.
		 */
		switch (conn->msg_type) {
		case REPMGR_ACK:
			if ((ret = record_ack(env, conn)) != 0)
				return (ret);
			break;

		case REPMGR_HEARTBEAT:
			/*
			 * The underlying byte-receiving mechanism will already
			 * have noted the fact that we got some traffic on this
			 * connection.  And that's all we really have to do, so
			 * there's nothing more needed at this point.
			 */
			break;

		case REPMGR_REP_MESSAGE:
			if ((ret = __repmgr_queue_put(env,
			    conn->input.rep_message)) != 0)
				return (ret);
			/*
			 * The queue has taken over responsibility for the
			 * rep_message buffer, and will free it later.
			 */
			given = TRUE;
			break;

		default:
			__db_errx(env,
			    "unexpected msg type rcvd in ready state: %d",
			    (int)conn->msg_type);
			return (DB_REP_UNAVAIL);
		}
		break;

	case CONN_DEFUNCT:
		/* Nothing to process; just discard the message below. */
		break;

	default:
		DB_ASSERT(env, FALSE);
	}

	/* Release the message buffers unless ownership was passed on. */
	if (!given) {
		dbt = &conn->input.repmgr_msg.cntrl;
		if (dbt->size > 0)
			__os_free(env, dbt->data);
		dbt = &conn->input.repmgr_msg.rec;
		if (dbt->size > 0)
			__os_free(env, dbt->data);
	}
	__repmgr_reset_for_reading(conn);
	return (0);
}

/*
 * Examines and verifies the incoming version proposal message, and sends an
 * appropriate response.
 *
 * If the message carries no version information the peer is a V1 site: its
 * handshake is accepted, a V1 handshake is sent back, and the connection
 * becomes CONN_READY.  Otherwise a version is chosen from the proposed
 * range, confirmed to the peer in a handshake, and the connection moves to
 * CONN_PARAMETERS.
 *
 * Returns 0 on success, DB_REP_UNAVAIL if the proposal is malformed or no
 * common version exists, or the error from a called routine.
 */
static int
send_version_response(env, conn)
	ENV *env;
	REPMGR_CONNECTION *conn;
{
	DB_REP *db_rep;
	__repmgr_version_proposal_args versions;
	__repmgr_version_confirmation_args conf;
	repmgr_netaddr_t *my_addr;
	char *hostname;
	u_int8_t buf[__REPMGR_VERSION_CONFIRMATION_SIZE+1];
	DBT vi;
	int ret;

	db_rep = env->rep_handle;
	my_addr = &db_rep->my_addr;

	if ((ret = find_version_info(env, conn, &vi)) != 0)
		return (ret);
	if (vi.size == 0) {
		/* No version info, so we must be talking to a v1 site. */
		hostname = conn->input.repmgr_msg.rec.data;
		if ((ret = accept_v1_handshake(env, conn, hostname)) != 0)
			return (ret);
		if ((ret = send_v1_handshake(env, conn, my_addr->host,
		     strlen(my_addr->host) + 1)) != 0)
			return (ret);
		conn->state = CONN_READY;
	} else {
		if ((ret = __repmgr_version_proposal_unmarshal(env,
		    &versions, vi.data, vi.size, NULL)) != 0)
			return (DB_REP_UNAVAIL);

		/* Prefer our own version; else the peer's highest, if ours. */
		if (DB_REPMGR_VERSION >= versions.min &&
		    DB_REPMGR_VERSION <= versions.max)
			conf.version = DB_REPMGR_VERSION;
		else if (versions.max >= DB_REPMGR_MIN_VERSION &&
		    versions.max <= DB_REPMGR_VERSION)
			conf.version = versions.max;
		else {
			/*
			 * User must have wired up a combination of versions
			 * exceeding what we said we'd support.
			 */
			__db_errx(env,
			    "No available version between %lu and %lu",
			    (u_long)versions.min, (u_long)versions.max);
			return (DB_REP_UNAVAIL);
		}
		conn->version = conf.version;

		__repmgr_version_confirmation_marshal(env, &conf, buf);
		/*
		 * The buffer has one extra trailing byte, which is sent along
		 * with the marshaled confirmation (the receiver overwrites the
		 * last byte of the rec part with a NUL).  Give it a defined
		 * value, so that no uninitialized stack byte goes on the wire.
		 */
		buf[sizeof(buf) - 1] = 0;
		if ((ret = send_handshake(env, conn, buf, sizeof(buf))) != 0)
			return (ret);

		conn->state = CONN_PARAMETERS;
	}
	return (ret);
}

/*
 * Sends a version-aware handshake to the remote site, only after we've verified
 * that it is indeed version-aware.  We can send either v2 or v3 handshake,
 * depending on the connection's version.
 */
static int
send_handshake(env, conn, opt, optlen)
	ENV *env;
	REPMGR_CONNECTION *conn;
	void *opt;
	size_t optlen;
{
	DB_REP *db_rep;
	REP *rep;
	DBT cntrl, rec;
	__repmgr_handshake_args hs;
	__repmgr_v2handshake_args v2hs;
	repmgr_netaddr_t *my_addr;
	size_t hostname_len, rec_len;
	void *buf;
	u_int8_t *p;
	u_int32_t cntrl_len;
	int ret;

	db_rep = env->rep_handle;
	rep = db_rep->region;
	my_addr = &db_rep->my_addr;

	/*
	 * The cntrl part has port and priority.  The rec part has the host
	 * name, followed by whatever optional extra data was passed to us.
	 *
	 * Version awareness was introduced with protocol version 2.
	 */
	DB_ASSERT(env, conn->version >= 2);
	cntrl_len = conn->version == 2 ?
	    __REPMGR_V2HANDSHAKE_SIZE : __REPMGR_HANDSHAKE_SIZE;
	hostname_len = strlen(my_addr->host);
	rec_len = hostname_len + 1 +
	    (opt == NULL ? 0 : optlen);

	if ((ret = __os_malloc(env, cntrl_len + rec_len, &buf)) != 0)
		return (ret);

	cntrl.data = p = buf;
	if (conn->version == 2) {
		/* Not allowed to use multi-process feature in v2 group. */
		DB_ASSERT(env, !IS_SUBORDINATE(db_rep));
		v2hs.port = my_addr->port;
		v2hs.priority = rep->priority;
		__repmgr_v2handshake_marshal(env, &v2hs, p);
	} else {
		hs.port = my_addr->port;
		hs.priority = rep->priority;
		hs.flags = IS_SUBORDINATE(db_rep) ? REPMGR_SUBORDINATE : 0;
		__repmgr_handshake_marshal(env, &hs, p);
	}
	cntrl.size = cntrl_len;

	p = rec.data = &p[cntrl_len];
	(void)strcpy((char*)p, my_addr->host);
	p += hostname_len + 1;
	if (opt != NULL) {
		memcpy(p, opt, optlen);
		p += optlen;
	}
	rec.size = (u_int32_t)(p - (u_int8_t*)rec.data);

	/* Never block on select thread: pass blockable as FALSE. */
	ret = __repmgr_send_one(env,
	    conn, REPMGR_HANDSHAKE, &cntrl, &rec, FALSE);
	__os_free(env, buf);
	return (ret);
}

/*
 * Handles the reply to a version proposal we sent on an outgoing
 * connection.  A reply without version information is treated as a V1
 * handshake.  Otherwise the confirmed version is checked against the range
 * we support, the peer's handshake is accepted, and our own parameters
 * handshake is sent.  On success the connection becomes CONN_READY and 0 is
 * returned; a malformed or unsupported confirmation yields DB_REP_UNAVAIL.
 */
static int
read_version_response(env, conn)
	ENV *env;
	REPMGR_CONNECTION *conn;
{
	__repmgr_version_confirmation_args conf;
	DBT vi;
	char *hostname;
	int ret;

	ret = find_version_info(env, conn, &vi);
	if (ret != 0)
		return (ret);
	hostname = conn->input.repmgr_msg.rec.data;

	if (vi.size == 0) {
		/* Plain V1 handshake. */
		ret = accept_v1_handshake(env, conn, hostname);
		if (ret != 0)
			return (ret);
		conn->state = CONN_READY;
		return (ret);
	}

	if (__repmgr_version_confirmation_unmarshal(env,
	    &conf, vi.data, vi.size, NULL) != 0)
		return (DB_REP_UNAVAIL);

	if (!(conf.version >= DB_REPMGR_MIN_VERSION &&
	    conf.version <= DB_REPMGR_VERSION)) {
		/*
		 * The remote site "confirmed" a version that lies outside
		 * the range we proposed, which it should never do.
		 */
		__db_errx(env,
		    "Can't support confirmed version %lu",
		    (u_long)conf.version);
		return (DB_REP_UNAVAIL);
	}
	conn->version = conf.version;

	ret = accept_handshake(env, conn, hostname);
	if (ret != 0)
		return (ret);
	ret = send_handshake(env, conn, NULL, 0);
	if (ret != 0)
		return (ret);

	conn->state = CONN_READY;
	return (ret);
}

/*
 * Inspects the rec part of a handshake message to see whether it carries
 * any version information.  This is the trick that allows version-aware
 * sites to exchange information without tripping up v1 sites, which do not
 * know to look for it.
 *
 * As a side effect the last byte of the rec part is overwritten with a NUL.
 * On return vi->size is 0 if there is no version information; otherwise
 * vi->data and vi->size describe it.  Returns DB_REP_UNAVAIL if the rec part
 * is empty, 0 otherwise.
 */
static int
find_version_info(env, conn, vi)
	ENV *env;
	REPMGR_CONNECTION *conn;
	DBT *vi;
{
	DBT *rec;
	char *name;
	u_int32_t name_space;

	rec = &conn->input.repmgr_msg.rec;
	if (rec->size == 0) {
		__db_errx(env, "handshake is missing rec part");
		return (DB_REP_UNAVAIL);
	}

	/* Terminate the buffer so that strlen() cannot run off its end. */
	name = rec->data;
	name[rec->size-1] = '\0';

	/* Bytes occupied by the host name, including its terminator. */
	name_space = (u_int32_t)strlen(name) + 1;

	if (name_space == rec->size) {
		/*
		 * Nothing but the host name: a simple legacy V1 handshake,
		 * which contains no version information.
		 */
		vi->size = 0;
		return (0);
	}

	/*
	 * Data follows the host name.  The rest can be treated as an
	 * ordinary byte buffer (to be read by one of the unmarshal
	 * functions).  Its length excludes the padding byte at the end,
	 * which has already been clobbered above.
	 */
	vi->data = &((u_int8_t *)rec->data)[name_space];
	vi->size = (rec->size - name_space) - 1;
	return (0);
}

static int
accept_handshake(env, conn, hostname)
	ENV *env;
	REPMGR_CONNECTION *conn;
	char *hostname;
{
	__repmgr_handshake_args hs;
	__repmgr_v2handshake_args hs2;
	u_int port;
	u_int32_t pri, flags;

	/*
	 * Current version is 3, and only other version that supports version
	 * negotiation is 2.
	 */
	DB_ASSERT(env, conn->version == 2 || conn->version == 3);

	/* Extract port and priority from cntrl. */
	if (conn->version == 2) {
		if (__repmgr_v2handshake_unmarshal(env, &hs2,
		    conn->input.repmgr_msg.cntrl.data,
		    conn->input.repmgr_msg.cntrl.size, NULL) != 0)
			return (DB_REP_UNAVAIL);
		port = hs2.port;
		pri = hs2.priority;
		flags = 0;
	} else {
		if (__repmgr_handshake_unmarshal(env, &hs,
		   conn->input.repmgr_msg.cntrl.data,
		   conn->input.repmgr_msg.cntrl.size, NULL) != 0)
			return (DB_REP_UNAVAIL);
		port = hs.port;
		pri = hs.priority;
		flags = hs.flags;
	}

	return (process_parameters(env,
		    conn, hostname, port, pri, flags));
}

static int
accept_v1_handshake(env, conn, hostname)
	ENV *env;
	REPMGR_CONNECTION *conn;
	char *hostname;
{
	DB_REPMGR_V1_HANDSHAKE *handshake;
	u_int32_t prio;

	handshake = conn->input.repmgr_msg.cntrl.data;
	if (conn->input.repmgr_msg.cntrl.size != sizeof(*handshake) ||
	    handshake->version != 1) {
		__db_errx(env, "malformed V1 handshake");
		return (DB_REP_UNAVAIL);
	}

	conn->version = 1;
	prio = ntohl(handshake->priority);
	return (process_parameters(env,
		    conn, hostname, handshake->port, prio, 0));
}

static int
process_parameters(env, conn, host, port, priority, flags)
	ENV *env;
	REPMGR_CONNECTION *conn;
	char *host;
	u_int port;
	u_int32_t priority, flags;
{
	DB_REP *db_rep;
	REPMGR_RETRY *retry;
	REPMGR_SITE *site;
	int eid, ret, sockopt;

	db_rep = env->rep_handle;

	if (F_ISSET(conn, CONN_INCOMING)) {
		/*
		 * Incoming connection: we don't yet know what site it belongs
		 * to, so it must be on the "orphans" list.
		 */
		DB_ASSERT(env, !IS_VALID_EID(conn->eid));
		TAILQ_REMOVE(&db_rep->connections, conn, entries);

		/*
		 * Now that we've been given the host and port, use them to find
		 * the site (or create a new one if necessary, etc.).
		 */
		if ((site = __repmgr_find_site(env, host, port)) != NULL) {
			eid = EID_FROM_SITE(site);
			if (LF_ISSET(REPMGR_SUBORDINATE)) {
				/*
				 * Accept it, as a supplementary source of
				 * input, but nothing else.
				 */
				TAILQ_INSERT_TAIL(&site->sub_conns,
				    conn, entries);
				conn->eid = eid;

#ifdef SO_KEEPALIVE
				sockopt = 1;
				if (setsockopt(conn->fd, SOL_SOCKET,
				    SO_KEEPALIVE, (sockopt_t)&sockopt,
				     sizeof(sockopt)) != 0) {
					ret = net_errno;
					__db_err(env, ret,
					   "can't set KEEPALIVE socket option");
					return (ret);
				}
#endif
			} else {
				if (site->state == SITE_IDLE) {
					RPRINT(env, DB_VERB_REPMGR_MISC, (env,
					"handshake from idle site %s:%u EID %u",
					    host, port, eid));
					retry = site->ref.retry;
					TAILQ_REMOVE(&db_rep->retries,
					    retry, entries);
					__os_free(env, retry);
				} else {
					/*
					 * We got an incoming connection for a
					 * site we were already connected to; at
					 * least we thought we were.
					 */
					RPRINT(env, DB_VERB_REPMGR_MISC, (env,
			     "connection from %s:%u EID %u supersedes existing",
					    host, port, eid));

					/*
					 * No need to schedule a retry for
					 * later, since we now have a
					 * replacement connection.
					 */
					__repmgr_disable_connection(env,
					     site->ref.conn);
				}
				conn->eid = eid;
				site->state = SITE_CONNECTED;
				site->ref.conn = conn;
				__os_gettime(env,
				    &site->last_rcvd_timestamp, 1);
			}
		} else {
			if ((ret = introduce_site(env,
			    host, port, &site, flags)) == 0)
				RPRINT(env, DB_VERB_REPMGR_MISC, (env,
			"handshake introduces unknown site %s:%u", host, port));
			else if (ret != EEXIST)
				return (ret);
			eid = EID_FROM_SITE(site);

			if (LF_ISSET(REPMGR_SUBORDINATE)) {
				TAILQ_INSERT_TAIL(&site->sub_conns,
				    conn, entries);
#ifdef SO_KEEPALIVE
				sockopt = 1;
				if ((ret = setsockopt(conn->fd, SOL_SOCKET,
				    SO_KEEPALIVE, (sockopt_t)&sockopt,
				     sizeof(sockopt))) != 0) {
					__db_err(env, ret,
					   "can't set KEEPALIVE socket option");
					return (ret);
				}
#endif
			} else {
				site->state = SITE_CONNECTED;
				site->ref.conn = conn;
				__os_gettime(env,
				    &site->last_rcvd_timestamp, 1);
			}
			conn->eid = eid;
		}
	} else {
		/*
		 * Since we initiated this as an outgoing connection, we
		 * obviously already know the host, port and site.  We just need
		 * the other site's priority.
		 */
		DB_ASSERT(env, IS_VALID_EID(conn->eid));
		site = SITE_FROM_EID(conn->eid);
		RPRINT(env, DB_VERB_REPMGR_MISC, (env,
		    "handshake from connection to %s:%lu EID %u",
		    site->net_addr.host,
		    (u_long)site->net_addr.port, conn->eid));
	}

	site->priority = priority;
	F_SET(site, SITE_HAS_PRIO);

	/*
	 * If we're moping around wishing we knew who the master was, then
	 * getting in touch with another site might finally provide sufficient
	 * connectivity to find out.  But just do this once, because otherwise
	 * we get messages while the subsequent rep_start operations are going
	 * on, and rep tosses them in that case.
	 */
	if (!IS_SUBORDINATE(db_rep) && /* us */
	    db_rep->master_eid == DB_EID_INVALID &&
	    db_rep->init_policy != DB_REP_MASTER &&
	    !db_rep->done_one &&
	    !LF_ISSET(REPMGR_SUBORDINATE)) { /* the remote site */
		db_rep->done_one = TRUE;
		RPRINT(env, DB_VERB_REPMGR_MISC, (env,
		    "handshake with no known master to wake election thread"));
		if ((ret = __repmgr_init_election(env, ELECT_REPSTART)) != 0)
			return (ret);
	}

	return (0);
}

/*
 * introduce_site --
 *	Add a site that we have just learned of through an incoming handshake,
 * as a non-peer, and hand the resulting site back through sitep.
 *
 * Returns whatever __repmgr_add_site_int() returns.
 */
static int
introduce_site(env, host, port, sitep, flags)
	ENV *env;
	char *host;
	u_int port;
	REPMGR_SITE **sitep;
	u_int32_t flags;
{
	int initial_state;

	/*
	 * SITE_CONNECTED means that we hold the site's main connection.  A
	 * subordinate connection is not the main one, so when that is how we
	 * first hear of the site it has to start out idle instead.
	 */
	if (LF_ISSET(REPMGR_SUBORDINATE))
		initial_state = SITE_IDLE;
	else
		initial_state = SITE_CONNECTED;

	return (__repmgr_add_site_int(env,
	    host, port, sitep, FALSE, initial_state));
}

/*
 * record_ack --
 *	Process an acknowledgement message received on conn: decode it, and
 * if it is for the current generation and advances the highest LSN the
 * sending site has acknowledged so far, save that LSN and wake any threads
 * waiting for acks.
 *
 * Returns DB_REP_UNAVAIL for a message that cannot be decoded, 0 when the ack
 * is ignored or recorded without incident, or the result of
 * __repmgr_wake_waiting_senders().
 */
static int
record_ack(env, conn)
	ENV *env;
	REPMGR_CONNECTION *conn;
{
	DB_REP *db_rep;
	REPMGR_SITE *site;
	__repmgr_ack_args *ackp, ack;
	SITE_STRING_BUFFER site_loc;
	u_int32_t cur_gen;

	db_rep = env->rep_handle;

	DB_ASSERT(env, conn->version > 0 &&
	    IS_READY_STATE(conn->state) && IS_VALID_EID(conn->eid));
	site = SITE_FROM_EID(conn->eid);

	/*
	 * Locate the ack contents.  Version 1 messages are used in place,
	 * straight out of the input buffer, once their size has been checked;
	 * later versions are unmarshaled into a local structure.
	 */
	if (conn->version != 1) {
		ackp = &ack;
		if (__repmgr_ack_unmarshal(env, ackp,
		    conn->input.repmgr_msg.cntrl.data,
		    conn->input.repmgr_msg.cntrl.size, NULL) != 0)
			return (DB_REP_UNAVAIL);
	} else {
		if (conn->input.repmgr_msg.cntrl.size != sizeof(ack) ||
		    conn->input.repmgr_msg.rec.size != 0) {
			__db_errx(env, "bad ack msg size");
			return (DB_REP_UNAVAIL);
		}
		ackp = conn->input.repmgr_msg.cntrl.data;
	}

	/* An ack from an earlier generation is stale: drop it. */
	cur_gen = db_rep->region->gen;
	if (ackp->generation < cur_gen) {
		RPRINT(env, DB_VERB_REPMGR_MISC, (env,
		    "ignoring stale ack (%lu<%lu), from %s",
		     (u_long)ackp->generation, (u_long)cur_gen,
		     __repmgr_format_site_loc(site, site_loc)));
		return (0);
	}
	RPRINT(env, DB_VERB_REPMGR_MISC, (env,
	    "got ack [%lu][%lu](%lu) from %s", (u_long)ackp->lsn.file,
	    (u_long)ackp->lsn.offset, (u_long)ackp->generation,
	    __repmgr_format_site_loc(site, site_loc)));

	/*
	 * Only an ack for the current generation whose LSN is beyond what the
	 * site has already acknowledged is worth recording.
	 */
	if (ackp->generation != cur_gen ||
	    LOG_COMPARE(&ackp->lsn, &site->max_ack) != 1)
		return (0);

	memcpy(&site->max_ack, &ackp->lsn, sizeof(DB_LSN));
	return (__repmgr_wake_waiting_senders(env));
}

/*
 * __repmgr_write_some --
 *	Send as much of the connection's queued outbound data as the socket
 * will take right now.  Each message that has been written in full is taken
 * off the queue, the connection is put back into CONN_READY state, and any
 * threads blocked waiting for queue space are signalled.
 *
 * Returns 0 when the queue has been emptied or the socket would block,
 * DB_REP_UNAVAIL on any other send failure, or an error from
 * __repmgr_signal() (or, on Windows, from WSAEventSelect()).
 *
 * PUBLIC: int __repmgr_write_some __P((ENV *, REPMGR_CONNECTION *));
 */
int
__repmgr_write_some(env, conn)
	ENV *env;
	REPMGR_CONNECTION *conn;
{
	QUEUED_OUTPUT *output;
	REPMGR_FLAT *msg;
	int nsent, ret;

	while (!STAILQ_EMPTY(&conn->outbound_queue)) {
		output = STAILQ_FIRST(&conn->outbound_queue);
		msg = output->msg;

		/* Resume from wherever the previous attempt left off. */
		nsent = send(conn->fd, &msg->data[output->offset],
		    (size_t)msg->length - output->offset, 0);
		if (nsent == SOCKET_ERROR) {
			ret = net_errno;
			/* A full socket buffer is not an error: try later. */
			if (ret == WOULDBLOCK)
				return (0);
			__db_err(env, ret, "writing data");
			STAT(env->rep_handle->
			    region->mstat.st_connection_drop++);
			return (DB_REP_UNAVAIL);
		}

		/* Partial write: go around again for the remainder. */
		output->offset += (size_t)nsent;
		if (output->offset < msg->length)
			continue;

		/*
		 * The whole message is out.  Retire its queue entry, and
		 * free the message itself if ours was the last reference.
		 */
		STAILQ_REMOVE_HEAD(&conn->outbound_queue, entries);
		__os_free(env, output);
		conn->out_queue_length--;
		if (--msg->ref_count <= 0)
			__os_free(env, msg);

		/*
		 * That frees up at least one slot in the outgoing queue, so
		 * wake any message threads that may be waiting for space.
		 * Leaving CONGESTED state here means that the next time the
		 * queue hits the high-water mark, the filling thread gets to
		 * try waiting again.
		 */
		conn->state = CONN_READY;
		if (conn->blockers > 0 &&
		    (ret = __repmgr_signal(&conn->drained)) != 0)
			return (ret);
	}

#ifdef DB_WIN32
	/*
	 * The queue is empty, so give up ownership of this connection again:
	 * the next call to send() can then write its message in line, rather
	 * than posting it to the queue for us.
	 */
	if (WSAEventSelect(conn->fd, conn->event_object, FD_READ|FD_CLOSE)
	    == SOCKET_ERROR) {
		ret = net_errno;
		__db_err(env, ret, "can't remove FD_WRITE event bit");
		return (ret);
	}
#endif

	return (0);
}