1
0
mirror of https://github.com/samba-team/samba.git synced 2025-03-08 04:58:40 +03:00

Redo the vacuuming process to make it scalable.

Vacuuming used to delete one record at a time on all nodes. That was
m*n behaviour and would require a huge storm of ctdb->ctdb controls, and it just wouldn't scale at all.

The new vacuuming process collects all records to be deleted locally and then sends only one control to the other nodes. This control contains a list of all records to be deleted.

(This used to be ctdb commit 9e625ece19a91f362c9539fa73b6b2108f0d9c53)
This commit is contained in:
Ronnie Sahlberg 2008-03-13 07:53:29 +11:00
parent e2930588b3
commit 74d57f8d51
4 changed files with 334 additions and 158 deletions

View File

@ -490,12 +490,13 @@ enum ctdb_controls {CTDB_CONTROL_PROCESS_EXISTS = 0,
CTDB_CONTROL_TRANSACTION_START = 65,
CTDB_CONTROL_TRANSACTION_COMMIT = 66,
CTDB_CONTROL_WIPE_DATABASE = 67,
CTDB_CONTROL_DELETE_RECORD = 68,
/* #68 removed */
CTDB_CONTROL_UPTIME = 69,
CTDB_CONTROL_START_RECOVERY = 70,
CTDB_CONTROL_END_RECOVERY = 71,
CTDB_CONTROL_RELOAD_NODES_FILE = 72,
CTDB_CONTROL_GET_RECLOCK_FILE = 73,
CTDB_CONTROL_TRY_DELETE_RECORDS = 74,
};
/*
@ -1161,6 +1162,7 @@ int32_t ctdb_control_get_tunable(struct ctdb_context *ctdb, TDB_DATA indata,
int32_t ctdb_control_set_tunable(struct ctdb_context *ctdb, TDB_DATA indata);
int32_t ctdb_control_list_tunables(struct ctdb_context *ctdb, TDB_DATA *outdata);
int32_t ctdb_control_get_reclock_file(struct ctdb_context *ctdb, TDB_DATA *outdata);
int32_t ctdb_control_try_delete_records(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata);
void ctdb_tunables_set_defaults(struct ctdb_context *ctdb);
@ -1229,8 +1231,6 @@ int32_t ctdb_control_wipe_database(struct ctdb_context *ctdb, TDB_DATA indata);
int ctdb_vacuum(struct ctdb_context *ctdb, int argc, const char **argv);
int ctdb_repack(struct ctdb_context *ctdb, int argc, const char **argv);
int32_t ctdb_control_delete_record(struct ctdb_context *ctdb, TDB_DATA indata);
void ctdb_block_signal(int signum);
void ctdb_unblock_signal(int signum);
int32_t ctdb_monitoring_mode(struct ctdb_context *ctdb);

View File

@ -353,9 +353,6 @@ static int32_t ctdb_control_dispatch(struct ctdb_context *ctdb,
CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_control_wipe_database));
return ctdb_control_wipe_database(ctdb, indata);
case CTDB_CONTROL_DELETE_RECORD:
return ctdb_control_delete_record(ctdb, indata);
case CTDB_CONTROL_UPTIME:
return ctdb_control_uptime(ctdb, outdata);
@ -369,6 +366,9 @@ static int32_t ctdb_control_dispatch(struct ctdb_context *ctdb,
CHECK_CONTROL_DATA_SIZE(0);
return ctdb_control_get_reclock_file(ctdb, outdata);
case CTDB_CONTROL_TRY_DELETE_RECORDS:
return ctdb_control_try_delete_records(ctdb, indata, outdata);
default:
DEBUG(DEBUG_CRIT,(__location__ " Unknown CTDB control opcode %u\n", opcode));
return -1;

View File

@ -641,16 +641,17 @@ bool ctdb_recovery_lock(struct ctdb_context *ctdb, bool keep)
return true;
}
/*
delete a record as part of the vacuum process
only delete if we are not lmaster or dmaster, and our rsn is <= the provided rsn
use non-blocking locks
return 0 if the record was successfully deleted (i.e. it does not exist
when the function returns)
or !0 if the record still exists in the tdb after returning.
*/
int32_t ctdb_control_delete_record(struct ctdb_context *ctdb, TDB_DATA indata)
static int delete_tdb_record(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, struct ctdb_rec_data *rec)
{
struct ctdb_rec_data *rec = (struct ctdb_rec_data *)indata.dptr;
struct ctdb_db_context *ctdb_db;
TDB_DATA key, data;
struct ctdb_ltdb_header *hdr, *hdr2;
@ -659,16 +660,6 @@ int32_t ctdb_control_delete_record(struct ctdb_context *ctdb, TDB_DATA indata)
int tdb_lock_nonblock(struct tdb_context *tdb, int list, int ltype);
int tdb_unlock(struct tdb_context *tdb, int list, int ltype);
if (indata.dsize < sizeof(uint32_t) || indata.dsize != rec->length) {
DEBUG(DEBUG_ERR,(__location__ " Bad record size in ctdb_control_delete_record\n"));
return -1;
}
ctdb_db = find_ctdb_db(ctdb, rec->reqid);
if (!ctdb_db) {
DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", rec->reqid));
return -1;
}
key.dsize = rec->keylen;
key.dptr = &rec->data[0];
@ -747,6 +738,7 @@ int32_t ctdb_control_delete_record(struct ctdb_context *ctdb, TDB_DATA indata)
}
/* NOTE(review): only this declaration is visible in the diff context;
   presumably this is the state handed to a recovery callback and `c` is the
   control request it relates to - confirm against the users of the struct */
struct recovery_callback_state {
struct ctdb_req_control *c;
};
@ -879,3 +871,89 @@ int32_t ctdb_control_get_reclock_file(struct ctdb_context *ctdb, TDB_DATA *outda
return 0;
}
/*
  try to delete all these records as part of the vacuuming process
  and return the records we failed to delete

  indata:  a struct ctdb_control_pulldb_reply followed by reply->count
           marshalled struct ctdb_rec_data entries packed back to back
           (each entry is rec->length bytes long).
  outdata: on success points at a newly allocated
           struct ctdb_control_pulldb_reply (talloc child of outdata) that
           holds a copy of every record delete_tdb_record() refused to
           delete; its count field is the number of such records.

  returns 0 on success and -1 on malformed input, unknown database or
  allocation failure.
*/
int32_t ctdb_control_try_delete_records(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
{
	struct ctdb_control_pulldb_reply *reply = (struct ctdb_control_pulldb_reply *)indata.dptr;
	const size_t blob_hdr_len = offsetof(struct ctdb_control_pulldb_reply, data);
	const size_t rec_hdr_len = offsetof(struct ctdb_rec_data, data);
	struct ctdb_db_context *ctdb_db;
	struct ctdb_control_pulldb_reply *records;
	struct ctdb_rec_data *rec;
	size_t remaining;
	uint32_t i;

	if (indata.dsize < blob_hdr_len) {
		DEBUG(DEBUG_ERR,(__location__ " invalid data in try_delete_records\n"));
		return -1;
	}

	ctdb_db = find_ctdb_db(ctdb, reply->db_id);
	if (!ctdb_db) {
		DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", reply->db_id));
		return -1;
	}

	DEBUG(DEBUG_DEBUG,("starting try_delete_records of %u records for dbid 0x%x\n",
		 reply->count, reply->db_id));

	/* create a blob to send back the records we couldnt delete */
	records = (struct ctdb_control_pulldb_reply *)
			talloc_zero_size(outdata, blob_hdr_len);
	if (records == NULL) {
		DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
		return -1;
	}
	records->db_id = ctdb_db->db_id;

	rec = (struct ctdb_rec_data *)&reply->data[0];
	remaining = indata.dsize - blob_hdr_len;

	for (i=0;i<reply->count;i++) {
		TDB_DATA key;

		/* The blob was built by another node. Before touching the
		   record make sure its fixed header, and the lengths that
		   header advertises, fit inside the bytes we actually
		   received. A zero or oversized length would otherwise make
		   us read (and later memcpy) outside indata.
		*/
		if (remaining < rec_hdr_len ||
		    rec->length < rec_hdr_len ||
		    rec->length > remaining ||
		    rec->keylen > rec->length - rec_hdr_len ||
		    rec->datalen > rec->length - rec_hdr_len - rec->keylen) {
			DEBUG(DEBUG_CRIT,(__location__ " truncated record in indata\n"));
			talloc_free(records);
			return -1;
		}

		/* every record must at least carry an ltdb header */
		if (rec->datalen < sizeof(struct ctdb_ltdb_header)) {
			DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record in indata\n"));
			talloc_free(records);
			return -1;
		}

		key.dptr = &rec->data[0];
		key.dsize = rec->keylen;

		/* If we cant delete the record we must add it to the reply
		   so the lmaster knows it may not purge this record
		*/
		if (delete_tdb_record(ctdb, ctdb_db, rec) != 0) {
			struct ctdb_control_pulldb_reply *grown;
			size_t old_size;

			DEBUG(DEBUG_INFO, (__location__ " Failed to vacuum delete record with hash 0x%08x\n", ctdb_hash(&key)));

			old_size = talloc_get_size(records);
			/* keep the old pointer until the realloc is known to
			   have worked so it can still be released on failure */
			grown = talloc_realloc_size(outdata, records, old_size + rec->length);
			if (grown == NULL) {
				DEBUG(DEBUG_ERR,(__location__ " Failed to expand\n"));
				talloc_free(records);
				return -1;
			}
			records = grown;
			records->count++;
			memcpy(old_size+(uint8_t *)records, rec, rec->length);
		}

		remaining -= rec->length;
		rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec);
	}

	outdata->dptr = (uint8_t *)records;
	outdata->dsize = talloc_get_size(records);

	return 0;
}

View File

@ -23,132 +23,12 @@
#include "system/network.h"
#include "../include/ctdb.h"
#include "../include/ctdb_private.h"
#include "../common/rb_tree.h"
#include "db_wrap.h"
/* should be tunable */
#define TIMELIMIT() timeval_current_ofs(10, 0)
/*
vacuum one record

Checks that the record stored under `key` consists of nothing but an ltdb
header and that this node is both its dmaster and its lmaster, sends
CTDB_CONTROL_DELETE_RECORD for it to the nodes returned by
list_of_vnnmap_nodes(), and if that control succeeds re-checks the record
and deletes it from the local tdb. *count is incremented after the local
tdb_delete() call.

Every path returns 0: a busy chain, a record that does not qualify, a
marshalling failure or a failed control all just leave the record in place.
*/
static int ctdb_vacuum_one(struct ctdb_context *ctdb, TDB_DATA key,
struct ctdb_db_context *ctdb_db, uint32_t *count)
{
TDB_DATA data;
struct ctdb_ltdb_header *hdr;
struct ctdb_rec_data *rec;
uint64_t rsn;
/* first pass: take the chain lock only long enough to fetch a copy */
if (tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, key) != 0) {
/* the chain is busy - come back later */
return 0;
}
data = tdb_fetch(ctdb_db->ltdb->tdb, key);
tdb_chainunlock(ctdb_db->ltdb->tdb, key);
if (data.dptr == NULL) {
return 0;
}
/* only records that are exactly one ltdb header long (no payload) are candidates */
if (data.dsize != sizeof(struct ctdb_ltdb_header)) {
free(data.dptr);
return 0;
}
hdr = (struct ctdb_ltdb_header *)data.dptr;
/* remember the rsn so the second pass can detect that the record changed */
rsn = hdr->rsn;
/* if we are not the lmaster and the dmaster then skip the record */
if (hdr->dmaster != ctdb->pnn ||
ctdb_lmaster(ctdb, &key) != ctdb->pnn) {
free(data.dptr);
return 0;
}
/* marshall key + header (no payload) into a record allocated on ctdb */
rec = ctdb_marshall_record(ctdb, ctdb_db->db_id, key, hdr, tdb_null);
free(data.dptr);
if (rec == NULL) {
/* try it again later */
return 0;
}
/* data is reused from here on as the control payload */
data.dptr = (void *)rec;
data.dsize = rec->length;
if (ctdb_client_async_control(ctdb, CTDB_CONTROL_DELETE_RECORD,
list_of_vnnmap_nodes(ctdb, ctdb->vnn_map, rec, false),
TIMELIMIT(), true, data) != 0) {
/* one or more nodes failed to delete a record - no problem! */
talloc_free(rec);
return 0;
}
talloc_free(rec);
/* its deleted on all other nodes - refetch, check and delete */
if (tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, key) != 0) {
/* the chain is busy - come back later */
return 0;
}
/* second pass: the chain lock is now held until the function returns */
data = tdb_fetch(ctdb_db->ltdb->tdb, key);
if (data.dptr == NULL) {
tdb_chainunlock(ctdb_db->ltdb->tdb, key);
return 0;
}
if (data.dsize != sizeof(struct ctdb_ltdb_header)) {
free(data.dptr);
tdb_chainunlock(ctdb_db->ltdb->tdb, key);
return 0;
}
hdr = (struct ctdb_ltdb_header *)data.dptr;
/* if we are not the lmaster and the dmaster then skip the record;
   also skip it when the rsn differs from the one seen in the first pass */
if (hdr->dmaster != ctdb->pnn ||
ctdb_lmaster(ctdb, &key) != ctdb->pnn ||
rsn != hdr->rsn) {
tdb_chainunlock(ctdb_db->ltdb->tdb, key);
free(data.dptr);
return 0;
}
/* SIGALRM is blocked for the duration of the delete; the return value of
   tdb_delete() is not checked */
ctdb_block_signal(SIGALRM);
tdb_delete(ctdb_db->ltdb->tdb, key);
ctdb_unblock_signal(SIGALRM);
tdb_chainunlock(ctdb_db->ltdb->tdb, key);
free(data.dptr);
(*count)++;
return 0;
}
/*
  vacuum records for which we are the lmaster

  Walks the list->count marshalled records stored back to back in
  list->data and hands the key of each one to ctdb_vacuum_one().
  Returns -1 as soon as ctdb_vacuum_one() reports a non-zero result,
  otherwise 0.
*/
static int ctdb_vacuum_local(struct ctdb_context *ctdb, struct ctdb_control_pulldb_reply *list,
			     struct ctdb_db_context *ctdb_db, uint32_t *count)
{
	struct ctdb_rec_data *cur = (struct ctdb_rec_data *)&list->data[0];
	int idx = 0;

	while (idx < list->count) {
		TDB_DATA reckey;

		reckey.dsize = cur->keylen;
		reckey.dptr = &cur->data[0];

		if (ctdb_vacuum_one(ctdb, reckey, ctdb_db, count) != 0) {
			return -1;
		}

		/* each record states its own total length, which is the
		   distance to the next one */
		cur = (struct ctdb_rec_data *)((uint8_t *)cur + cur->length);
		idx++;
	}

	return 0;
}
/*
a list of records to possibly delete
@ -156,24 +36,37 @@ static int ctdb_vacuum_local(struct ctdb_context *ctdb, struct ctdb_control_pull
struct vacuum_data {
/* upper bound on the number of records queued in delete_tree;
   0 means no bound is applied */
uint32_t vacuum_limit;
struct ctdb_context *ctdb;
/* the database being vacuumed */
struct ctdb_db_context *ctdb_db;
/* struct delete_record_data entries, indexed by ctdb_hash() of the key */
trbt_tree_t *delete_tree;
/* number of entries that have been inserted into delete_tree */
uint32_t delete_count;
/* array of vnn_map->size marshalled record blobs, indexed by lmaster */
struct ctdb_control_pulldb_reply **list;
/* set by the traverse when marshalling a record fails */
bool traverse_error;
/* NOTE(review): not touched in the visible hunks; presumably a running
   total of records seen by the traverse - confirm */
uint32_t total;
};
/* this structure contains the information for one record to be deleted.
   vacuum_traverse allocates one per candidate record as a talloc child of
   the delete tree and inserts it under the ctdb_hash() of the key. */
struct delete_record_data {
struct ctdb_context *ctdb;
struct ctdb_db_context *ctdb_db;
/* copy of the ltdb header seen during the traverse; its rsn is compared
   against the stored record again before the local delete */
struct ctdb_ltdb_header hdr;
/* private copy of the record key (talloc child of this structure) */
TDB_DATA key;
};
/*
traverse function for vacuuming
*/
static int vacuum_traverse(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *private)
{
struct vacuum_data *vdata = talloc_get_type(private, struct vacuum_data);
struct ctdb_context *ctdb = vdata->ctdb;
struct ctdb_db_context *ctdb_db = vdata->ctdb_db;
uint32_t lmaster;
struct ctdb_ltdb_header *hdr;
struct ctdb_rec_data *rec;
size_t old_size;
lmaster = ctdb_lmaster(vdata->ctdb, &key);
if (lmaster >= vdata->ctdb->vnn_map->size) {
lmaster = ctdb_lmaster(ctdb, &key);
if (lmaster >= ctdb->vnn_map->size) {
return 0;
}
@ -184,13 +77,53 @@ static int vacuum_traverse(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data,
hdr = (struct ctdb_ltdb_header *)data.dptr;
if (hdr->dmaster != vdata->ctdb->pnn) {
if (hdr->dmaster != ctdb->pnn) {
return 0;
}
/* is this a records we could possibly delete? I.e.
if the record is empty and also we are both lmaster
and dmaster for the record we should be able to delete it
*/
if ( (lmaster == ctdb->pnn)
&&( (vdata->delete_count < vdata->vacuum_limit)
||(vdata->vacuum_limit == 0) ) ){
uint32_t hash;
hash = ctdb_hash(&key);
if (trbt_lookup32(vdata->delete_tree, hash)) {
DEBUG(DEBUG_INFO, (__location__ " Hash collission when vacuuming, skipping this record.\n"));
} else {
struct delete_record_data *dd;
/* store key and header indexed by the key hash */
dd = talloc_zero(vdata->delete_tree, struct delete_record_data);
if (dd == NULL) {
DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
return -1;
}
dd->ctdb = ctdb;
dd->ctdb_db = ctdb_db;
dd->key.dsize = key.dsize;
dd->key.dptr = talloc_memdup(dd, key.dptr, key.dsize);
if (dd->key.dptr == NULL) {
DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
return -1;
}
dd->hdr = *hdr;
trbt_insert32(vdata->delete_tree, hash, dd);
vdata->delete_count++;
}
}
/* add the record to the blob ready to send to the nodes */
rec = ctdb_marshall_record(vdata->list[lmaster], vdata->ctdb->pnn, key, NULL, tdb_null);
rec = ctdb_marshall_record(vdata->list[lmaster], ctdb->pnn, key, NULL, tdb_null);
if (rec == NULL) {
DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
vdata->traverse_error = true;
@ -219,6 +152,84 @@ static int vacuum_traverse(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data,
return 0;
}
/* parameter block passed to delete_traverse */
struct delete_records_list {
/* marshalled blob of the records to delete; it is grown with
   talloc_realloc_size as records are appended, so the pointer can change */
struct ctdb_control_pulldb_reply *records;
};
/*
  traverse the tree of records to delete and marshall them into
  a blob

  param: struct delete_records_list whose records blob is extended
  data:  struct delete_record_data of the tree entry being visited

  The entry's key and ltdb header are marshalled and appended to
  recs->records, and recs->records->count is bumped. If marshalling or
  growing the blob fails the entry is skipped and the blob built so far
  is left untouched and still valid.

  NOTE(review): a skipped entry stays in the delete tree although it was
  never sent to the other nodes - the caller may want to drop it from the
  tree before deleting locally.
*/
static void
delete_traverse(void *param, void *data)
{
	struct delete_record_data *dd = talloc_get_type(data, struct delete_record_data);
	struct delete_records_list *recs = talloc_get_type(param, struct delete_records_list);
	struct ctdb_control_pulldb_reply *grown;
	struct ctdb_rec_data *rec;
	size_t old_size;

	rec = ctdb_marshall_record(dd, recs->records->db_id, dd->key, &dd->hdr, tdb_null);
	if (rec == NULL) {
		DEBUG(DEBUG_ERR, (__location__ " failed to marshall record\n"));
		return;
	}

	old_size = talloc_get_size(recs->records);
	/* do not overwrite recs->records with the result directly: on failure
	   that would lose the blob and make the next callback dereference
	   NULL */
	grown = talloc_realloc_size(NULL, recs->records, old_size + rec->length);
	if (grown == NULL) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to expand\n"));
		talloc_free(rec);
		return;
	}
	recs->records = grown;
	recs->records->count++;
	memcpy(old_size+(uint8_t *)(recs->records), rec, rec->length);

	/* the marshalled copy was only needed as the memcpy source */
	talloc_free(rec);
}
/*
  tree traversal callback: delete one queued record from the local tdb

  param: uint32_t counter, incremented for every record handed to
         tdb_delete()
  d:     struct delete_record_data describing the record

  The record is only deleted if the chain lock can be taken without
  blocking, the stored record is exactly one ltdb header long, this node
  is both its dmaster and its lmaster, and its rsn still matches the rsn
  captured when the record was queued.
*/
static void delete_record(void *param, void *d)
{
	struct delete_record_data *dd = talloc_get_type(d, struct delete_record_data);
	struct ctdb_context *ctdb = dd->ctdb;
	struct tdb_context *tdb = dd->ctdb_db->ltdb->tdb;
	uint32_t *deleted = (uint32_t *)param;
	TDB_DATA stored;

	/* its deleted on all other nodes - refetch, check and delete */
	if (tdb_chainlock_nonblock(tdb, dd->key) != 0) {
		/* the chain is busy - come back later */
		return;
	}

	stored = tdb_fetch(tdb, dd->key);
	if (stored.dptr == NULL) {
		goto unlock;
	}

	if (stored.dsize == sizeof(struct ctdb_ltdb_header)) {
		struct ctdb_ltdb_header *cur_hdr = (struct ctdb_ltdb_header *)stored.dptr;

		/* only go ahead while we are still dmaster and lmaster and the
		   record has the rsn we queued it with */
		if (cur_hdr->dmaster == ctdb->pnn &&
		    ctdb_lmaster(ctdb, &(dd->key)) == ctdb->pnn &&
		    dd->hdr.rsn == cur_hdr->rsn) {
			ctdb_block_signal(SIGALRM);
			tdb_delete(tdb, dd->key);
			ctdb_unblock_signal(SIGALRM);
			(*deleted)++;
		}
	}

	free(stored.dptr);
unlock:
	tdb_chainunlock(tdb, dd->key);
}
/* vacuum one database */
static int ctdb_vacuum_db(struct ctdb_context *ctdb, uint32_t db_id, struct ctdb_node_map *map,
@ -237,6 +248,11 @@ static int ctdb_vacuum_db(struct ctdb_context *ctdb, uint32_t db_id, struct ctdb
vdata->ctdb = ctdb;
vdata->vacuum_limit = vacuum_limit;
vdata->delete_tree = trbt_create(vdata, 0);
if (vdata->delete_tree == NULL) {
DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
return -1;
}
if (ctdb_ctrl_getdbname(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE, db_id, vdata, &name) != 0) {
DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", db_id));
@ -250,6 +266,7 @@ static int ctdb_vacuum_db(struct ctdb_context *ctdb, uint32_t db_id, struct ctdb
talloc_free(vdata);
return -1;
}
vdata->ctdb_db = ctdb_db;
/* the list needs to be of length num_nodes */
vdata->list = talloc_array(vdata, struct ctdb_control_pulldb_reply *, ctdb->vnn_map->size);
@ -301,23 +318,104 @@ static int ctdb_vacuum_db(struct ctdb_context *ctdb, uint32_t db_id, struct ctdb
}
}
for (i=0;i<ctdb->vnn_map->size;i++) {
uint32_t count = 0;
if (vdata->list[i]->count == 0) {
continue;
/* Process all records we can delete (if any) */
if (vdata->delete_count > 0) {
struct delete_records_list *recs;
TDB_DATA indata, outdata;
int ret;
int32_t res;
uint32_t count;
recs = talloc_zero(vdata, struct delete_records_list);
if (recs == NULL) {
DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
return -1;
}
recs->records = (struct ctdb_control_pulldb_reply *)
talloc_zero_size(vdata,
offsetof(struct ctdb_control_pulldb_reply, data));
if (recs->records == NULL) {
DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
return -1;
}
recs->records->db_id = db_id;
/* traverse the tree of all records we want to delete and
create a blob we can send to the other nodes.
*/
trbt_traversearray32(vdata->delete_tree, 1, delete_traverse, recs);
indata.dsize = talloc_get_size(recs->records);
indata.dptr = (void *)recs->records;
/* now tell all the other nodes to delete all these records
(if possible)
*/
for (i=0;i<ctdb->vnn_map->size;i++) {
	struct ctdb_control_pulldb_reply *records;
	struct ctdb_rec_data *rec;
	uint32_t n;

	if (ctdb->vnn_map->map[i] == ctdb->pnn) {
		/* we dont delete the records on the local node
		   just yet
		*/
		continue;
	}

	ret = ctdb_control(ctdb, ctdb->vnn_map->map[i], 0,
			CTDB_CONTROL_TRY_DELETE_RECORDS, 0,
			indata, recs, &outdata, &res,
			NULL, NULL);
	if (ret != 0 || res != 0) {
		DEBUG(DEBUG_ERR,("Failed to delete records on node %u\n", ctdb->vnn_map->map[i]));
		exit(10);
	}

	/* outdata contains the list of records coming back
	   from the node which the node could not delete.
	   Make sure at least the blob header is there before
	   reading its count.
	*/
	if (outdata.dptr == NULL ||
	    outdata.dsize < offsetof(struct ctdb_control_pulldb_reply, data)) {
		DEBUG(DEBUG_CRIT,(__location__ " invalid reply to try_delete_records\n"));
		exit(10);
	}
	records = (struct ctdb_control_pulldb_reply *)outdata.dptr;
	rec = (struct ctdb_rec_data *)&records->data[0];

	/* visit every returned record. The previous
	   "while (records->count-- > 1)" stopped one short, so the
	   last record a node refused to delete stayed in the tree and
	   was deleted locally anyway.
	*/
	for (n = 0; n < records->count; n++) {
		TDB_DATA reckey;

		reckey.dptr = &rec->data[0];
		reckey.dsize = rec->keylen;

		if (rec->datalen < sizeof(struct ctdb_ltdb_header)) {
			DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
			exit(10);
		}

		/* that other node couldnt delete the record
		   so we shouldnt delete it either.
		   remove it from the tree.
		*/
		talloc_free(trbt_lookup32(vdata->delete_tree, ctdb_hash(&reckey)));

		rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec);
	}
}
/* for records where we are the lmaster, we can try to delete them */
if (ctdb_vacuum_local(ctdb, vdata->list[i], ctdb_db, &count) != 0) {
DEBUG(DEBUG_ERR,(__location__ " Deletion error in vacuuming '%s'\n", name));
talloc_free(vdata);
return -1;
/* the only records remaining in the tree would be those
records where all other nodes could successfully
delete them, so we can now safely delete them on the
lmaster as well.
*/
count = 0;
trbt_traversearray32(vdata->delete_tree, 1, delete_record, &count);
if (vdata->delete_count != 0) {
printf("Deleted %u records out of %u on this node from '%s'\n", count, vdata->delete_count, name);
}
if (count != 0) {
printf("Deleted %u records on this node from '%s'\n", count, name);
}
}
}
/* this ensures we run our event queue */
ctdb_ctrl_getpnn(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE);