Limit tracking custom errors (e.g. from LUA) while allowing non custom errors to be tracked normally (#500)

Implementing the change proposed here: https://github.com/valkey-io/valkey/issues/487 In this PR, we prevent tracking new custom error messages (e.g. LUA) if the number of error messages (in the errors RAX) is greater than 128. Instead, we will track any additional custom error prefix in a new counter: `errorstat_ERRORSTATS_OVERFLOW ` and if any non-custom flagged errors (e.g. MOVED / CLUSTERDOWN) occur, they will continue to be tracked as usual. This will address the issue of spammed error messages / memory usage of the errors RAX. Additionally, we will not have to execute `CONFIG RESETSTAT` to restore error stats functionality because normal error messages continue to be tracked. Example: ``` # Errorstats . . . errorstat_127:count=2 errorstat_128:count=2 errorstat_ERR:count=1 errorstat_ERRORSTATS_OVERFLOW:count=2 ``` --------- Signed-off-by: Karthik Subbarao <karthikrs2021@gmail.com> Signed-off-by: Madelyn Olson <madelyneolson@gmail.com>
2024-07-14 20:04:47 -07:00 · 2024-07-14 20:04:47 -07:00 · 418901dec4
commit 418901dec4
parent 34649bd034
5 changed files with 56 additions and 67 deletions
--- a/src/networking.c
+++ b/src/networking.c
@ -549,19 +549,26 @@ void afterErrorReply(client *c, const char *s, size_t len, int flags) {
        server.stat_total_error_replies++;
        /* Increment the error stats
         * If the string already starts with "-..." then the error prefix
-         * is provided by the caller ( we limit the search to 32 chars). Otherwise we use "-ERR". */
-        if (s[0] != '-') {
-            incrementErrorCount("ERR", 3);
-        } else {
+         * is provided by the caller (we limit the search to 32 chars). Otherwise we use "-ERR". */
+        char *err_prefix = "ERR";
+        size_t prefix_len = 3;
+        if (s[0] == '-') {
            char *spaceloc = memchr(s, ' ', len < 32 ? len : 32);
+            /* If we cannot retrieve the error prefix, use the default: "ERR". */
            if (spaceloc) {
                const size_t errEndPos = (size_t)(spaceloc - s);
-                incrementErrorCount(s + 1, errEndPos - 1);
-            } else {
-                /* Fallback to ERR if we can't retrieve the error prefix */
-                incrementErrorCount("ERR", 3);
+                err_prefix = (char *)s + 1;
+                prefix_len = errEndPos - 1;
            }
        }
+        /* After the errors RAX reaches its limit, instead of tracking
+         * custom errors (e.g. LUA), we track the error under `errorstat_ERRORSTATS_OVERFLOW` */
+        if (flags & ERR_REPLY_FLAG_CUSTOM && raxSize(server.errors) >= ERRORSTATS_LIMIT &&
+            !raxFind(server.errors, (unsigned char *)err_prefix, prefix_len, NULL)) {
+            err_prefix = ERRORSTATS_OVERFLOW_ERR;
+            prefix_len = strlen(ERRORSTATS_OVERFLOW_ERR);
+        }
+        incrementErrorCount(err_prefix, prefix_len);
    } else {
        /* stat_total_error_replies will not be updated, which means that
         * the cmd stats will not be updated as well, we still want this command
--- a/src/script_lua.c
+++ b/src/script_lua.c
@ -617,8 +617,9 @@ static void luaReplyToRedisReply(client *c, client *script_client, lua_State *lu
                    1); /* pop the error message, we will use luaExtractErrorInformation to get error information */
            errorInfo err_info = {0};
            luaExtractErrorInformation(lua, &err_info);
-            addReplyErrorFormatEx(c, err_info.ignore_err_stats_update ? ERR_REPLY_FLAG_NO_STATS_UPDATE : 0, "-%s",
-                                  err_info.msg);
+            addReplyErrorFormatEx(
+                c, ERR_REPLY_FLAG_CUSTOM | (err_info.ignore_err_stats_update ? ERR_REPLY_FLAG_NO_STATS_UPDATE : 0),
+                "-%s", err_info.msg);
            luaErrorInformationDiscard(&err_info);
            lua_pop(lua, 1); /* pop the result table */
            return;
--- a/src/server.c
+++ b/src/server.c
@ -2548,7 +2548,6 @@ void initServer(void) {
    server.main_thread_id = pthread_self();
    server.current_client = NULL;
    server.errors = raxNew();
-    server.errors_enabled = 1;
    server.execution_nesting = 0;
    server.clients = listCreate();
    server.clients_index = raxNew();
@ -3030,7 +3029,6 @@ void resetCommandTableStats(dict *commands) {
 void resetErrorTableStats(void) {
    freeErrorsRadixTreeAsync(server.errors);
    server.errors = raxNew();
-    server.errors_enabled = 1;
 }

 /* ========================== OP Array API ============================ */
@ -4067,48 +4065,9 @@ int processCommand(client *c) {

 /* ====================== Error lookup and execution ===================== */

-/* Users who abuse lua error_reply will generate a new error object on each
- * error call, which can make server.errors get bigger and bigger. This will
- * cause the server to block when calling INFO (we also return errorstats by
- * default). To prevent the damage it can cause, when a misuse is detected,
- * we will print the warning log and disable the errorstats to avoid adding
- * more new errors. It can be re-enabled via CONFIG RESETSTAT. */
-#define ERROR_STATS_NUMBER 128
 void incrementErrorCount(const char *fullerr, size_t namelen) {
-    /* errorstats is disabled, return ASAP. */
-    if (!server.errors_enabled) return;
-
    void *result;
    if (!raxFind(server.errors, (unsigned char *)fullerr, namelen, &result)) {
-        if (server.errors->numele >= ERROR_STATS_NUMBER) {
-            sds errors = sdsempty();
-            raxIterator ri;
-            raxStart(&ri, server.errors);
-            raxSeek(&ri, "^", NULL, 0);
-            while (raxNext(&ri)) {
-                char *tmpsafe;
-                errors = sdscatlen(errors, getSafeInfoString((char *)ri.key, ri.key_len, &tmpsafe), ri.key_len);
-                errors = sdscatlen(errors, ", ", 2);
-                if (tmpsafe != NULL) zfree(tmpsafe);
-            }
-            sdsrange(errors, 0, -3); /* Remove final ", ". */
-            raxStop(&ri);
-
-            /* Print the warning log and the contents of server.errors to the log. */
-            serverLog(LL_WARNING, "Errorstats stopped adding new errors because the number of "
-                                  "errors reached the limit, may be misuse of lua error_reply, "
-                                  "please check INFO ERRORSTATS, this can be re-enabled via "
-                                  "CONFIG RESETSTAT.");
-            serverLog(LL_WARNING, "Current errors code list: %s", errors);
-            sdsfree(errors);
-
-            /* Reset the errors and add a single element to indicate that it is disabled. */
-            resetErrorTableStats();
-            incrementErrorCount("ERRORSTATS_DISABLED", 19);
-            server.errors_enabled = 0;
-            return;
-        }
-
        struct serverError *error = zmalloc(sizeof(*error));
        error->count = 1;
        raxInsert(server.errors, (unsigned char *)fullerr, namelen, error, NULL);
--- a/src/server.h
+++ b/src/server.h
@ -1606,7 +1606,6 @@ struct valkeyServer {
    dict *orig_commands; /* Command table before command renaming. */
    aeEventLoop *el;
    rax *errors;                         /* Errors table */
-    int errors_enabled;                  /* If true, errorstats is enabled, and we will add new errors. */
    unsigned int lruclock;               /* Clock for LRU eviction */
    volatile sig_atomic_t shutdown_asap; /* Shutdown ordered by signal handler. */
    mstime_t shutdown_mstime;            /* Timestamp to limit graceful shutdown. */
@ -2623,10 +2622,20 @@ int serverCommunicateSystemd(const char *sd_notify_msg);
 void serverSetCpuAffinity(const char *cpulist);
 void dictVanillaFree(dict *d, void *val);

+/* ERROR STATS constants */
+
+/* Once the errors RAX reaches this limit, instead of tracking custom
+ * errors (e.g. LUA), we track the error under the prefix below. */
+#define ERRORSTATS_LIMIT 128
+#define ERRORSTATS_OVERFLOW_ERR "ERRORSTATS_OVERFLOW"
+
 /* afterErrorReply flags */
-#define ERR_REPLY_FLAG_NO_STATS_UPDATE                                                                                 \
-    (1ULL << 0) /* Indicating that we should not update                                                                \
-                   error stats after sending error reply */
+
+/* Indicating that we should not update error stats after sending error reply. */
+#define ERR_REPLY_FLAG_NO_STATS_UPDATE (1ULL << 0)
+/* Indicates the error message is custom (e.g. from LUA). */
+#define ERR_REPLY_FLAG_CUSTOM (1ULL << 1)
+
 /* networking.c -- Networking and Client related operations */

 /* Read flags for various read errors and states */
--- a/tests/unit/info.tcl
+++ b/tests/unit/info.tcl
@ -278,21 +278,34 @@ start_server {tags {"info" "external:skip"}} {
            r config resetstat
            for {set j 1} {$j <= 1100} {incr j} {
                assert_error "$j my error message" {
-                    r eval {return redis.error_reply(string.format('%s my error message', ARGV[1]))} 0 $j
+                    r eval {return server.error_reply(string.format('%s my error message', ARGV[1]))} 0 $j
                }
            }
-
-            assert_equal [count_log_message 0 "Errorstats stopped adding new errors"] 1
-            assert_equal [count_log_message 0 "Current errors code list"] 1
-            assert_equal "count=1" [errorstat ERRORSTATS_DISABLED]
-
-            # Since we currently have no metrics exposed for server.errors, we use lazyfree
-            # to verify that we only have 128 errors.
-            wait_for_condition 50 100 {
-                [s lazyfreed_objects] eq 128
-            } else {
-                fail "errorstats resetstat lazyfree error"
+            # Validate that custom LUA errors are tracked in `ERRORSTATS_OVERFLOW` when errors
+            # has 128 entries.
+            assert_equal "count=972" [errorstat ERRORSTATS_OVERFLOW]
+            # Validate that non LUA errors continue to be tracked even when we have >=128 entries.
+            assert_error {ERR syntax error} {r set a b c d e f g}
+            assert_equal "count=1" [errorstat ERR]
+            # Validate that custom errors that were already tracked continue to increment when past 128 entries.
+            assert_equal "count=1" [errorstat 1]
+            assert_error "1 my error message" {
+                r eval {return server.error_reply(string.format('1 my error message', ARGV[1]))} 0
            }
+            assert_equal "count=2" [errorstat 1]
+            # Test LUA error variants.
+            assert_error "My error message" {r eval {return server.error_reply('My error message')} 0}
+            assert_error "My error message" {r eval {return {err = 'My error message'}} 0}
+            assert_equal "count=974" [errorstat ERRORSTATS_OVERFLOW]
+            # Function calls that contain custom error messages should call be included in overflow counter
+            r FUNCTION LOAD replace [format "#!lua name=mylib\nserver.register_function('customerrorfn', function() return server.error_reply('My error message') end)"]
+            assert_error "My error message" {r fcall customerrorfn 0}
+            assert_equal "count=975" [errorstat ERRORSTATS_OVERFLOW]
+            # Function calls that contain non lua errors should continue to be tracked normally (in a separate counter).
+            r FUNCTION LOAD replace [format "#!lua name=mylib\nserver.register_function('invalidgetcmd', function() return server.call('get', 'x', 'x', 'x') end)"]
+            assert_error "ERR Wrong number of args*" {r fcall invalidgetcmd 0}
+            assert_equal "count=975" [errorstat ERRORSTATS_OVERFLOW]
+            assert_equal "count=2" [errorstat ERR]
        }

        # skip the following 2 tests if we are running with io-threads as the eventloop metrics are different in that case.