From af55c910a4cbf7be87db7f6e7b17674d17df2d26 Mon Sep 17 00:00:00 2001
From: Rusty Russell
Date: Wed, 21 Jul 2010 12:29:55 +0930
Subject: [PATCH] freeze: abort vacuuming when we're going to freeze.

There are some reports of freeze timeouts, and it looks like vacuuming
might be the culprit. So we add code to tell them to abort when a
freeze is going on.

(This is based on the 1.0.112 branch version 517f05e42f, but far simpler
since tdb is now robust against processes being killed during transaction
commit)

CQ:S1018154 & S1018349
Signed-off-by: Rusty Russell

(This used to be ctdb commit f5d7dc679501e607c2c83a248a89d3cada9df146)
---
Review note: the debug message in ctdb_stop_vacuuming() prints the child
pid with %d and an (int) cast; the earlier %p did not match a pid_t
argument. A header comment was added to the new function, and the hunk
count and diffstat were adjusted accordingly.

 ctdb/include/ctdb_private.h |  3 +++
 ctdb/server/ctdb_freeze.c   |  3 +++
 ctdb/server/ctdb_vacuum.c   | 21 +++++++++++++++++++++
 3 files changed, 27 insertions(+)

diff --git a/ctdb/include/ctdb_private.h b/ctdb/include/ctdb_private.h
index ca3d613f5bc..cd387c53930 100644
--- a/ctdb/include/ctdb_private.h
+++ b/ctdb/include/ctdb_private.h
@@ -456,6 +456,8 @@ struct ctdb_context {
 
 	TALLOC_CTX *banning_ctx;
 
+	struct ctdb_vacuum_child_context *vacuumers;
+
 	/* mapping from pid to ctdb_client * */
 	struct ctdb_client_pid_list *client_pids;
 
@@ -1312,6 +1314,7 @@ int ctdb_ctrl_report_recd_lock_latency(struct ctdb_context *ctdb, struct timeval
 int32_t ctdb_control_stop_node(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply);
 int32_t ctdb_control_continue_node(struct ctdb_context *ctdb);
 
+void ctdb_stop_vacuuming(struct ctdb_context *ctdb);
 int ctdb_vacuum_init(struct ctdb_db_context *ctdb_db);
 
 int32_t ctdb_control_enable_script(struct ctdb_context *ctdb, TDB_DATA indata);
diff --git a/ctdb/server/ctdb_freeze.c b/ctdb/server/ctdb_freeze.c
index e641ef3ae68..4e977589e1e 100644
--- a/ctdb/server/ctdb_freeze.c
+++ b/ctdb/server/ctdb_freeze.c
@@ -272,6 +272,9 @@ int ctdb_start_freeze(struct ctdb_context *ctdb, uint32_t priority)
 		return 0;
 	}
 
+	/* Stop any vacuuming going on: we don't want to wait. */
+	ctdb_stop_vacuuming(ctdb);
+
 	/* if there isn't a freeze lock child then create one */
 	if (ctdb->freeze_handles[priority] == NULL) {
 		ctdb->freeze_handles[priority] = ctdb_freeze_lock(ctdb, priority);
diff --git a/ctdb/server/ctdb_vacuum.c b/ctdb/server/ctdb_vacuum.c
index f1e61dbf115..17afd79afde 100644
--- a/ctdb/server/ctdb_vacuum.c
+++ b/ctdb/server/ctdb_vacuum.c
@@ -36,7 +36,9 @@
 enum vacuum_child_status { VACUUM_RUNNING, VACUUM_OK, VACUUM_ERROR, VACUUM_TIMEOUT};
 
 struct ctdb_vacuum_child_context {
+	struct ctdb_vacuum_child_context *next, *prev;
 	struct ctdb_vacuum_handle *vacuum_handle;
+	/* fd child writes status to */
 	int fd[2];
 	pid_t child_pid;
 	enum vacuum_child_status status;
@@ -743,6 +745,8 @@ static int vacuum_child_destructor(struct ctdb_vacuum_child_context *child_ctx)
 		kill(child_ctx->child_pid, SIGKILL);
 	}
 
+	DLIST_REMOVE(ctdb->vacuumers, child_ctx);
+
 	event_add_timed(ctdb->ev, child_ctx->vacuum_handle,
 			timeval_current_ofs(get_vacuum_interval(ctdb_db), 0),
 			ctdb_vacuum_event, child_ctx->vacuum_handle);
@@ -861,6 +865,7 @@ ctdb_vacuum_event(struct event_context *ev, struct timed_event *te,
 	child_ctx->status = VACUUM_RUNNING;
 	child_ctx->start_time = timeval_current();
 
+	DLIST_ADD(ctdb->vacuumers, child_ctx);
 	talloc_set_destructor(child_ctx, vacuum_child_destructor);
 
 	event_add_timed(ctdb->ev, child_ctx,
@@ -878,6 +883,22 @@ ctdb_vacuum_event(struct event_context *ev, struct timed_event *te,
 
 	child_ctx->vacuum_handle = vacuum_handle;
 }
+
+/*
+ * Abort every vacuuming child currently linked on ctdb->vacuumers.
+ * Called from ctdb_start_freeze() so that a freeze does not wait on vacuuming.
+ */
+void ctdb_stop_vacuuming(struct ctdb_context *ctdb)
+{
+	/* Simply free them all. */
+	while (ctdb->vacuumers) {
+		DEBUG(DEBUG_INFO, ("Aborting vacuuming for %s (%d)\n",
+			   ctdb->vacuumers->vacuum_handle->ctdb_db->db_name,
+			   (int)ctdb->vacuumers->child_pid));
+		/* vacuum_child_destructor kills it, removes from list */
+		talloc_free(ctdb->vacuumers);
+	}
+}
 
 /* this function initializes the vacuuming context for a database
  * starts the vacuuming events