Fix DB iterator not resetting pauserehash causing dict being unable to rehash (#12757)

The DB iterator uses dictInitSafeIterator to initialize a regular safe dict iterator on the current slot dict. When dbIteratorNext is done with one dict, it jumps to the next slot dict, but nothing on that path calls dictResumeRehashing, so the previous dict's pauserehash stays > 0.

As a result, dictRehashMilliseconds returns early for that dict, leaving the slot dict in a state where rehashing can never complete.
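
For context, here is a minimal, self-contained model of that mechanism (an illustration only, not the actual dict.c code): a safe iterator bumps a pauserehash counter the first time it advances, resetting the iterator is what decrements it again, and the incremental rehash step refuses to run while the counter is non-zero. Skip the reset and rehashing stalls forever:

/* Illustrative model of the pauserehash mechanism described above; this is
 * a simplified stand-in, not the real Redis dict.c implementation. */
#include <stdio.h>

typedef struct {
    int pauserehash; /* > 0 means rehashing is paused */
    int rehashidx;   /* -1 means no rehash in progress */
} miniDict;

typedef struct {
    miniDict *d;
    int safe;
    int started;
} miniIterator;

/* A safe iterator pauses rehashing the first time it advances. */
static void miniIterNext(miniIterator *it) {
    if (!it->started) {
        it->started = 1;
        if (it->safe) it->d->pauserehash++;
    }
    /* ... walk entries ... */
}

/* Resetting the iterator is what resumes rehashing. */
static void miniIterReset(miniIterator *it) {
    if (it->started && it->safe) it->d->pauserehash--;
    it->started = 0;
}

/* Incremental rehash bails out while any safe iterator is still live. */
static int miniRehashStep(miniDict *d) {
    if (d->pauserehash > 0) return 0; /* the early return described above */
    if (d->rehashidx == -1) return 0; /* nothing to rehash */
    /* ... move some buckets ... */
    return 1;
}

int main(void) {
    miniDict d = { .pauserehash = 0, .rehashidx = 0 };
    miniIterator it = { .d = &d, .safe = 1, .started = 0 };

    miniIterNext(&it); /* pauserehash becomes 1 */
    /* Bug pattern: move on to the next dict without resetting the iterator. */
    printf("rehash ran: %d\n", miniRehashStep(&d)); /* 0: stuck */

    miniIterReset(&it); /* what the fix ensures happens */
    printf("rehash ran: %d\n", miniRehashStep(&d)); /* 1: rehash proceeds */
    return 0;
}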

In the "expire scan should skip dictionaries with lot's of empty buckets" test,
adding a `keys *` can reproduce the problem stably. `keys *` will call dbIteratorNext
to trigger a traversal of all slot dicts.

Added the dbReleaseIterator and dbIteratorInitNextSafeIterator methods, which call dictResetIterator.
The issue was introduced in #11695.
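
With this change, every dbIteratorInit / dbIteratorInitFromSlot call must be paired with dbReleaseIterator instead of a bare zfree, so the last dict's safe iterator is reset and its pauserehash drops back to 0. A sketch of that pattern, mirroring what keysCommand does in the diff below (walkAllKeys and the per-key handling are hypothetical, and DB_MAIN is assumed to be the main keyspace selector):

/* Hypothetical helper sketching the intended iterator lifecycle after this
 * fix; it assumes it lives inside the Redis tree next to db.c. */
#include "server.h"

void walkAllKeys(redisDb *db) {
    dbIterator *dbit = dbIteratorInit(db, DB_MAIN); /* DB_MAIN: assumed main keyspace */
    dictEntry *de;
    while ((de = dbIteratorNext(dbit)) != NULL) {
        sds key = dictGetKey(de);
        (void)key; /* ... process the key here ... */
    }
    /* Release, don't just zfree: this resets the last dict's safe iterator
     * so its pauserehash counter returns to 0 and rehashing can resume. */
    dbReleaseIterator(dbit);
}
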
Binbin 2023-11-14 20:28:46 +08:00 committed by GitHub
parent a9e73c00bc
commit fe36306340
7 changed files with 35 additions and 13 deletions

View File

@ -2328,12 +2328,12 @@ int rewriteAppendOnlyFileRio(rio *aof) {
if (server.rdb_key_save_delay)
debugDelay(server.rdb_key_save_delay);
}
zfree(dbit);
dbReleaseIterator(dbit);
}
return C_OK;
werr:
if (dbit) zfree(dbit);
if (dbit) dbReleaseIterator(dbit);
return C_ERR;
}

View File

@ -87,6 +87,12 @@ dictEntry *dbIteratorNext(dbIterator *dbit) {
if (!de) { /* No current dict or reached the end of the dictionary. */
dict *d = dbIteratorNextDict(dbit);
if (!d) return NULL;
if (dbit->di.d) {
/* Before we move to the next dict, reset the iter of the previous dict. */
dictIterator *iter = &dbit->di;
dictResetIterator(iter);
}
dictInitSafeIterator(&dbit->di, d);
de = dictNext(&dbit->di);
}
@ -95,7 +101,9 @@ dictEntry *dbIteratorNext(dbIterator *dbit) {
/* Returns DB iterator that can be used to iterate through sub-dictionaries.
* Primary database contains only one dictionary when node runs without cluster mode,
* or 16k dictionaries (one per slot) when node runs with cluster mode enabled. */
* or 16k dictionaries (one per slot) when node runs with cluster mode enabled.
*
* The caller should free the resulting dbit with dbReleaseIterator. */
dbIterator *dbIteratorInit(redisDb *db, dbKeyType keyType) {
dbIterator *dbit = zmalloc(sizeof(*dbit));
dbit->db = db;
@ -106,6 +114,9 @@ dbIterator *dbIteratorInit(redisDb *db, dbKeyType keyType) {
return dbit;
}
/* Returns DB iterator that can be used to iterate through sub-dictionaries.
*
* The caller should free the resulting dbit with dbReleaseIterator. */
dbIterator *dbIteratorInitFromSlot(redisDb *db, dbKeyType keyType, int slot) {
dbIterator *dbit = zmalloc(sizeof(*dbit));
dbit->db = db;
@ -116,6 +127,14 @@ dbIterator *dbIteratorInitFromSlot(redisDb *db, dbKeyType keyType, int slot) {
return dbit;
}
/* Free the dbit returned by dbIteratorInit or dbIteratorInitFromSlot. */
void dbReleaseIterator(dbIterator *dbit) {
dictIterator *iter = &dbit->di;
dictResetIterator(iter);
zfree(dbit);
}
/* Returns next non-empty slot strictly after given one, or -1 if provided slot is the last one. */
int dbGetNextNonEmptySlot(redisDb *db, int slot, dbKeyType keyType) {
unsigned long long next_key = cumulativeKeyCountRead(db, slot, keyType) + 1;
@ -999,7 +1018,7 @@ void keysCommand(client *c) {
if (c->flags & CLIENT_CLOSE_ASAP)
break;
}
zfree(dbit);
dbReleaseIterator(dbit);
setDeferredArrayLen(c,replylen,numkeys);
}
@ -1388,7 +1407,7 @@ unsigned long long int dbSize(redisDb *db, dbKeyType keyType) {
return db->sub_dict[keyType].key_count;
}
/* This method proivdes the cumulative sum of all the dictionary buckets
/* This method provides the cumulative sum of all the dictionary buckets
* across dictionaries in a database. */
unsigned long dbBuckets(redisDb *db, dbKeyType keyType) {
if (server.cluster_enabled) {
@ -2976,7 +2995,7 @@ void dbGetStats(char *buf, size_t bufsize, redisDb *db, int full, dbKeyType keyT
}
}
}
zfree(dbit);
dbReleaseIterator(dbit);
l = dictGetStatsMsg(buf, bufsize, mainHtStats, full);
dictFreeStats(mainHtStats);
buf += l;

View File

@ -315,7 +315,7 @@ void computeDatasetDigest(unsigned char *final) {
xorDigest(final,digest,20);
decrRefCount(keyobj);
}
zfree(dbit);
dbReleaseIterator(dbit);
}
}

View File

@ -1368,11 +1368,11 @@ ssize_t rdbSaveDb(rio *rdb, int dbid, int rdbflags, long *key_counter) {
}
}
}
zfree(dbit);
dbReleaseIterator(dbit);
return written;
werr:
if (dbit) zfree(dbit);
if (dbit) dbReleaseIterator(dbit);
return -1;
}

View File

@ -654,7 +654,7 @@ void tryResizeHashTables(int dbid) {
}
/* Save current iterator position in the resize_cursor. */
db->sub_dict[subdict].resize_cursor = slot;
zfree(dbit);
dbReleaseIterator(dbit);
}
}
@ -1511,7 +1511,6 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
* call this function when needed. */
updateDictResizePolicy();
/* AOF postponed flush: Try at every cron cycle if the slow fsync
* completed. */
if ((server.aof_state == AOF_ON || server.aof_state == AOF_WAIT_REWRITE) &&

View File

@ -997,7 +997,7 @@ typedef struct redisDb {
long long avg_ttl; /* Average TTL, just for stats */
unsigned long expires_cursor; /* Cursor of the active expire cycle. */
list *defrag_later; /* List of key names to attempt to defrag one by one, gradually. */
int dict_count; /* Indicates total number of dictionaires owned by this DB, 1 dict per slot in cluster mode. */
int dict_count; /* Indicates total number of dictionaries owned by this DB, 1 dict per slot in cluster mode. */
dbDictState sub_dict[2]; /* Metadata for main and expires dictionaries */
} redisDb;
@ -2436,6 +2436,7 @@ typedef struct dbIterator dbIterator;
/* DB iterator specific functions */
dbIterator *dbIteratorInit(redisDb *db, dbKeyType keyType);
dbIterator *dbIteratorInitFromSlot(redisDb *db, dbKeyType keyType, int slot);
void dbReleaseIterator(dbIterator *dbit);
dict *dbIteratorNextDict(dbIterator *dbit);
dict *dbGetDictFromIterator(dbIterator *dbit);
int dbIteratorGetCurrentSlot(dbIterator *dbit);

View File

@ -840,7 +840,7 @@ start_cluster 1 0 {tags {"expire external:skip cluster slow"}} {
# Collect two slots to help determine the expiry scan logic is able
# to go past certain slots which aren't valid for scanning at the given point of time.
# And the next non empyt slot after that still gets scanned and expiration happens.
# And the next non empty slot after that still gets scanned and expiration happens.
# hashslot(alice) is 749
r psetex alice 500 val
@ -862,6 +862,9 @@ start_cluster 1 0 {tags {"expire external:skip cluster slow"}} {
r del "{foo}$j"
}
# Trigger a full traversal of all dictionaries.
r keys *
r debug set-active-expire 1
# Verify {foo}100 still exists and remaining got cleaned up