2024-12-23 17:34:34 +03:00 · 2009-11-03 05:41:02 +01:00 · 2009-11-03 05:41:02 +01:00 · d415d4d32f
commit d415d4d32f
parent 49397a8b3e
6 changed files with 73 additions and 5 deletions
--- a/docs-xml/smbdotconf/misc/ctdbtimeout.xml
+++ b/docs-xml/smbdotconf/misc/ctdbtimeout.xml
@ -0,0 +1,37 @@
+<samba:parameter name="ctdb timeout"
+                 context="G"
+				 type="integer"
+                 advanced="1"
+                 xmlns:samba="http://www.samba.org/samba/DTD/samba-doc">
+<description>
+	<para>This parameter specifies a timeout in seconds for the
+	  connection between Samba and ctdb. It is only valid if you
+	  have compiled Samba with clustering and if you have
+	  set <parameter>clustering=yes</parameter>.
+	</para>
+	<para>When something in the cluster blocks, it can happen that
+	  we wait indefinitely long for ctdb, just adding to the
+	  blocking condition. In a well-running cluster this should
+	  never happen, but there are too many components in a cluster
+	  that might have hiccups. Choosing the right balance for this
+	  value is very tricky, because on a busy cluster long service
+	  times to transfer something across the cluster might be
+	  valid. Setting it too short will degrade the service your
+	  cluster presents, setting it too long might make the cluster
+	  itself not recover from something severely broken for too
+	  long.
+	</para>
+	<para>
+	  Be aware that if you set this parameter, it needs to be in
+	  the file smb.conf. It is not really helpful to put it into
+	  a registry configuration (typical on a cluster), because
+	  accessing the registry requires contact with ctdb.
+	</para>
+	<para>Setting <parameter>ctdb timeout</parameter> to n makes
+	  any process waiting longer than n seconds for a reply by the
+	  cluster panic. Setting it to 0 (the default) makes Samba
+	  block forever, which is the highly recommended default.
+	</para>
+</description>
+<value type="default">0</value>
+</samba:parameter>
--- a/source3/include/packet.h
+++ b/source3/include/packet.h
@ -38,7 +38,8 @@ NTSTATUS packet_fd_read(struct packet_context *ctx);
 /*
 * Sync read, wait for the next chunk
 */
-NTSTATUS packet_fd_read_sync(struct packet_context *ctx);
+NTSTATUS packet_fd_read_sync(struct packet_context *ctx,
+			     struct timeval *timeout);

 /*
 * Handle an incoming packet:
--- a/source3/include/proto.h
+++ b/source3/include/proto.h
@ -4130,6 +4130,7 @@ int lp_cups_connection_timeout(void);
 const char *lp_ctdbd_socket(void);
 const char **lp_cluster_addresses(void);
 bool lp_clustering(void);
+int lp_ctdb_timeout(void);
 char *lp_printcommand(int );
 char *lp_lpqcommand(int );
 char *lp_lprmcommand(int );
--- a/source3/lib/ctdbd_conn.c
+++ b/source3/lib/ctdbd_conn.c
@ -275,6 +275,17 @@ static struct messaging_rec *ctdb_pull_messaging_rec(TALLOC_CTX *mem_ctx,
 	return result;
 }

+static NTSTATUS ctdb_packet_fd_read_sync(struct packet_context *ctx)
+{
+	struct timeval timeout;
+	struct timeval *ptimeout;
+	/* lp_ctdb_timeout() is in seconds; 0 selects a NULL (no) timeout. */
+	timeout = timeval_set(lp_ctdb_timeout(), 0);
+	ptimeout = (timeout.tv_sec != 0) ? &timeout : NULL;
+	/* NOTE(review): a negative setting is passed through unchanged. */
+	return packet_fd_read_sync(ctx, ptimeout);
+}
+
 /*
 * Read a full ctdbd request. If we have a messaging context, defer incoming
 * messages that might come in between.
@ -289,7 +300,7 @@ static NTSTATUS ctdb_read_req(struct ctdbd_connection *conn, uint32 reqid,

 again:

-	status = packet_fd_read_sync(conn->pkt);
+	status = ctdb_packet_fd_read_sync(conn->pkt);

 	if (NT_STATUS_EQUAL(status, NT_STATUS_NETWORK_BUSY)) {
 		/* EAGAIN */
@ -1156,7 +1167,7 @@ NTSTATUS ctdbd_traverse(uint32 db_id,
 			break;
 		}

-		status = packet_fd_read_sync(conn->pkt);
+		status = ctdb_packet_fd_read_sync(conn->pkt);

 		if (NT_STATUS_EQUAL(status, NT_STATUS_RETRY)) {
 			/*
--- a/source3/lib/packet.c
+++ b/source3/lib/packet.c
@ -101,7 +101,8 @@ NTSTATUS packet_fd_read(struct packet_context *ctx)
 	return NT_STATUS_OK;
 }

-NTSTATUS packet_fd_read_sync(struct packet_context *ctx)
+NTSTATUS packet_fd_read_sync(struct packet_context *ctx,
+			     struct timeval *timeout)
 {
 	int res;
 	fd_set r_fds;
@ -109,7 +110,12 @@ NTSTATUS packet_fd_read_sync(struct packet_context *ctx)
 	FD_ZERO(&r_fds);
 	FD_SET(ctx->fd, &r_fds);

-	res = sys_select(ctx->fd+1, &r_fds, NULL, NULL, NULL);
+	res = sys_select(ctx->fd+1, &r_fds, NULL, NULL, timeout);
+
+	if (res == 0) {
+		DEBUG(10, ("select timed out\n"));
+		return NT_STATUS_IO_TIMEOUT;
+	}

 	if (res == -1) {
 		DEBUG(10, ("select returned %s\n", strerror(errno)));
--- a/source3/param/loadparm.c
+++ b/source3/param/loadparm.c
@ -271,6 +271,7 @@ struct global {
 	char *ctdbdSocket;
 	char **szClusterAddresses;
 	bool clustering;
+	int ctdb_timeout;
 	int ldap_passwd_sync;
 	int ldap_replication_sleep;
 	int ldap_timeout; /* This is initialised in init_globals */
@ -2541,6 +2542,15 @@ static struct parm_struct parm_table[] = {
 		.enum_list	= NULL,
 		.flags		= FLAG_ADVANCED | FLAG_GLOBAL,
 	},
+	{
+		.label		= "ctdb timeout",
+		.type		= P_INTEGER,
+		.p_class	= P_GLOBAL,
+		.ptr		= &Globals.ctdb_timeout,
+		.special	= NULL,
+		.enum_list	= NULL,
+		.flags		= FLAG_ADVANCED | FLAG_GLOBAL,
+	},

 	{N_("Printing Options"), P_SEP, P_SEPARATOR},

@ -5107,6 +5117,7 @@ static void init_globals(bool first_time_only)
 	string_set(&Globals.ctdbdSocket, "");
 	Globals.szClusterAddresses = NULL;
 	Globals.clustering = False;
+	Globals.ctdb_timeout = 0;

 	Globals.winbind_cache_time = 300;	/* 5 minutes */
 	Globals.winbind_reconnect_delay = 30;	/* 30 seconds */
@ -5557,6 +5568,7 @@ FN_GLOBAL_INTEGER(lp_cups_connection_timeout, &Globals.cups_connection_timeout)
 FN_GLOBAL_CONST_STRING(lp_ctdbd_socket, &Globals.ctdbdSocket)
 FN_GLOBAL_LIST(lp_cluster_addresses, &Globals.szClusterAddresses)
 FN_GLOBAL_BOOL(lp_clustering, &Globals.clustering)
+FN_GLOBAL_INTEGER(lp_ctdb_timeout, &Globals.ctdb_timeout)
 FN_LOCAL_STRING(lp_printcommand, szPrintcommand)
 FN_LOCAL_STRING(lp_lpqcommand, szLpqcommand)
 FN_LOCAL_STRING(lp_lprmcommand, szLprmcommand)