string-util: introduce strverscmp_improved()

Unfortunately, strverscmp() from libc or str_verscmp() do not correctly handle pre-release version, e.g. 247 vs 247~rc1. This implement a new comparison function, which is based on the RPM's rpmvercmp().
2024-12-23 21:35:11 +03:00 · 2021-02-04 05:55:59 +09:00 · 2021-02-04 05:55:59 +09:00 · 87b7d9b6ff
commit 87b7d9b6ff
parent e5bc5f1f5a
3 changed files with 239 additions and 0 deletions
--- a/src/fundamental/string-util-fundamental.c
+++ b/src/fundamental/string-util-fundamental.c
@ -1,6 +1,8 @@
 /* SPDX-License-Identifier: LGPL-2.1-or-later */

 #ifndef SD_BOOT
+#include <ctype.h>
+
 #include "macro.h"
 #endif
 #include "string-util-fundamental.h"
@ -74,3 +76,159 @@ sd_char* endswith_no_case(const sd_char *s, const sd_char *postfix) {

        return (sd_char*) s + sl - pl;
 }
+
+#ifdef SD_BOOT
+static sd_bool isdigit(sd_char a) {
+        return a >= '0' && a <= '9';
+}
+#endif
+
+static sd_bool is_alpha(sd_char a) {
+        /* Locale independent version of isalpha(). */
+        return (a >= 'a' && a <= 'z') || (a >= 'A' && a <= 'Z');
+}
+
+static sd_bool is_valid_version_char(sd_char a) {
+        return isdigit(a) || is_alpha(a) || IN_SET(a, '~', '-', '^', '.');
+}
+
+sd_int strverscmp_improved(const sd_char *a, const sd_char *b) {
+
+        /* This is based on RPM's rpmvercmp(). But this explicitly handles '-' and '.', as we usually
+         * want to directly compare strings which contain both version and release; e.g.
+         * '247.2-3.1.fc33.x86_64' or '5.11.0-0.rc5.20210128git76c057c84d28.137.fc34'.
+         * Unlike rpmvercmp(), this distiguishes e.g. 123a and 123.a, and 123a is newer.
+         *
+         * This splits the input strings into segments. Each segment is numeric or alpha, and may be
+         * prefixed with the following:
+         *  '~' : used for pre-releases, a segment prefixed with this is the oldest,
+         *  '-' : used for the separator between version and release,
+         *  '^' : used for patched releases, a segment with this is newer than one with '-'.
+         *  '.' : used for point releases.
+         * Note, no prefix segment is the newest. All non-supported characters are dropped, and
+         * handled as a separator of segments, e.g., 123_a is equivalent to 123a.
+         *
+         * By using this, version strings can be sorted like following:
+         *  (older) 122.1
+         *     ^    123~rc1-1
+         *     |    123
+         *     |    123-a
+         *     |    123-a.1
+         *     |    123-1
+         *     |    123-1.1
+         *     |    123^post1
+         *     |    123.a-1
+         *     |    123.1-1
+         *     v    123a-1
+         *  (newer) 124-1
+         */
+
+        if (isempty(a) || isempty(b))
+                return strcmp_ptr(a, b);
+
+        for (;;) {
+                const sd_char *aa, *bb;
+                sd_int r;
+
+                /* Drop leading invalid characters. */
+                while (*a != '\0' && !is_valid_version_char(*a))
+                        a++;
+                while (*b != '\0' && !is_valid_version_char(*b))
+                        b++;
+
+                /* Handle '~'. Used for pre-releases, e.g. 123~rc1, or 4.5~alpha1 */
+                if (*a == '~' || *b == '~') {
+                        /* The string prefixed with '~' is older. */
+                        r = CMP(*a != '~', *b != '~');
+                        if (r != 0)
+                                return r;
+
+                        /* Now both strings are prefixed with '~'. Compare remaining strings. */
+                        a++;
+                        b++;
+                }
+
+                /* If at least one string reaches the end, then longer is newer.
+                 * Note that except for '~' prefixed segments, a string has more segments is newer.
+                 * So, this check must be after the '~' check. */
+                if (*a == '\0' || *b == '\0')
+                        return strcmp(a, b);
+
+                /* Handle '-', which separates version and release, e.g 123.4-3.1.fc33.x86_64 */
+                if (*a == '-' || *b == '-') {
+                        /* The string prefixed with '-' is older (e.g., 123-9 vs 123.1-1) */
+                        r = CMP(*a != '-', *b != '-');
+                        if (r != 0)
+                                return r;
+
+                        a++;
+                        b++;
+                }
+
+                /* Handle '^'. Used for patched release. */
+                if (*a == '^' || *b == '^') {
+                        r = CMP(*a != '^', *b != '^');
+                        if (r != 0)
+                                return r;
+
+                        a++;
+                        b++;
+                }
+
+                /* Handle '.'. Used for point releases. */
+                if (*a == '.' || *b == '.') {
+                        r = CMP(*a != '.', *b != '.');
+                        if (r != 0)
+                                return r;
+
+                        a++;
+                        b++;
+                }
+
+                if (isdigit(*a) || isdigit(*b)) {
+                        /* Skip leading '0', to make 00123 equivalent to 123. */
+                        while (*a == '0')
+                                a++;
+                        while (*b == '0')
+                                b++;
+
+                        /* Find the leading numeric segments. One may be an empty string. So,
+                         * numeric segments are always newer than alpha segments. */
+                        for (aa = a; *aa != '\0' && isdigit(*aa); aa++)
+                                ;
+                        for (bb = b; *bb != '\0' && isdigit(*bb); bb++)
+                                ;
+
+                        /* To compare numeric segments without parsing their values, first compare the
+                         * lengths of the segments. Eg. 12345 vs 123, longer is newer. */
+                        r = CMP(aa - a, bb - b);
+                        if (r != 0)
+                                return r;
+
+                        /* Then, compare them as strings. */
+                        r = strncmp(a, b, aa - a);
+                        if (r != 0)
+                                return r;
+                } else {
+                        /* Find the leading non-numeric segments. */
+                        for (aa = a; *aa != '\0' && is_alpha(*aa); aa++)
+                                ;
+                        for (bb = b; *bb != '\0' && is_alpha(*bb); bb++)
+                                ;
+
+                        /* Note that the segments are usually not NUL-terminated. */
+                        r = strncmp(a, b, MIN(aa - a, bb - b));
+                        if (r != 0)
+                                return r;
+
+                        /* Longer is newer, e.g. abc vs abcde. */
+                        r = CMP(aa - a, bb - b);
+                        if (r != 0)
+                                return r;
+                }
+
+                /* The current segments are equivalent. Let's compare the next one. */
+                a = aa;
+                b = bb;
+        }
+}
--- a/src/fundamental/string-util-fundamental.h
+++ b/src/fundamental/string-util-fundamental.h
@ -63,3 +63,5 @@ static inline sd_bool isempty(const sd_char *a) {
 static inline const sd_char *yes_no(sd_bool b) {
        return b ? STR_C("yes") : STR_C("no");
 }
+
+sd_int strverscmp_improved(const sd_char *a, const sd_char *b);
--- a/src/test/test-string-util.c
+++ b/src/test/test-string-util.c
@ -886,6 +886,84 @@ static void test_string_contains_word(void) {
        assert_se(!string_contains_word("a:b:cc", ":#", ":cc"));
 }

+static void test_strverscmp_improved_one(const char *newer, const char *older) {
+        log_info("/* %s(%s, %s) */", __func__, strnull(newer), strnull(older));
+
+        assert_se(strverscmp_improved(newer, newer) == 0);
+        assert_se(strverscmp_improved(newer, older) >  0);
+        assert_se(strverscmp_improved(older, newer) <  0);
+        assert_se(strverscmp_improved(older, older) == 0);
+}
+
+static void test_strverscmp_improved(void) {
+        static const char * const versions[] = {
+                "",
+                "~1",
+                "ab",
+                "abb",
+                "abc",
+                "0001",
+                "002",
+                "12",
+                "122",
+                "122.9",
+                "123~rc1",
+                "123",
+                "123-a",
+                "123-a.1",
+                "123-a1",
+                "123-a1.1",
+                "123-3",
+                "123-3.1",
+                "123^patch1",
+                "123^1",
+                "123.a-1"
+                "123.1-1",
+                "123a-1",
+                "124",
+                NULL,
+        };
+        const char * const *p, * const *q;
+
+        STRV_FOREACH(p, versions)
+                STRV_FOREACH(q, p + 1)
+                        test_strverscmp_improved_one(*q, *p);
+
+        test_strverscmp_improved_one("123.45-67.89", "123.45-67.88");
+        test_strverscmp_improved_one("123.45-67.89a", "123.45-67.89");
+        test_strverscmp_improved_one("123.45-67.89", "123.45-67.ab");
+        test_strverscmp_improved_one("123.45-67.89", "123.45-67.9");
+        test_strverscmp_improved_one("123.45-67.89", "123.45-67");
+        test_strverscmp_improved_one("123.45-67.89", "123.45-66.89");
+        test_strverscmp_improved_one("123.45-67.89", "123.45-9.99");
+        test_strverscmp_improved_one("123.45-67.89", "123.42-99.99");
+        test_strverscmp_improved_one("123.45-67.89", "123-99.99");
+
+        /* '~' : pre-releases */
+        test_strverscmp_improved_one("123.45-67.89", "123~rc1-99.99");
+        test_strverscmp_improved_one("123-45.67.89", "123~rc1-99.99");
+        test_strverscmp_improved_one("123~rc2-67.89", "123~rc1-99.99");
+        test_strverscmp_improved_one("123^aa2-67.89", "123~rc1-99.99");
+        test_strverscmp_improved_one("123aa2-67.89", "123~rc1-99.99");
+
+        /* '-' : separator between version and release. */
+        test_strverscmp_improved_one("123.45-67.89", "123-99.99");
+        test_strverscmp_improved_one("123^aa2-67.89", "123-99.99");
+        test_strverscmp_improved_one("123aa2-67.89", "123-99.99");
+
+        /* '^' : patch releases */
+        test_strverscmp_improved_one("123.45-67.89", "123^45-67.89");
+        test_strverscmp_improved_one("123^aa2-67.89", "123^aa1-99.99");
+        test_strverscmp_improved_one("123aa2-67.89", "123^aa2-67.89");
+
+        /* '.' : point release */
+        test_strverscmp_improved_one("123aa2-67.89", "123.aa2-67.89");
+        test_strverscmp_improved_one("123.ab2-67.89", "123.aa2-67.89");
+
+        /* invalid characters */
+        assert_se(strverscmp_improved("123_aa2-67.89", "123aa+2-67.89") == 0);
+}
+
 int main(int argc, char *argv[]) {
        test_setup_logging(LOG_DEBUG);

@ -923,6 +1001,7 @@ int main(int argc, char *argv[]) {
        test_string_extract_line();
        test_string_contains_word_strv();
        test_string_contains_word();
+        test_strverscmp_improved();

        return 0;
 }