parser: Simplify xmlStringCurrentChar

Start to move away from using this function.
2025-03-23 02:50:08 +03:00 · 2023-09-22 15:45:20 +02:00 · 2023-09-22 15:45:20 +02:00 · b9db3d7d02
commit b9db3d7d02
parent f98fa86318
3 changed files with 12 additions and 98 deletions
--- a/HTMLparser.c
+++ b/HTMLparser.c
@ -321,7 +321,6 @@ htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
 ************/

 #define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
-#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)

 #define COPY_BUF(l,b,i,v)						\
    if (l == 1) b[i++] = v;						\
--- a/parserInternals.c
+++ b/parserInternals.c
@ -1029,104 +1029,18 @@ incomplete_sequence:
 */

 int
-xmlStringCurrentChar(xmlParserCtxtPtr ctxt, const xmlChar * cur, int *len)
-{
-    if ((len == NULL) || (cur == NULL)) return(0);
-    if ((ctxt == NULL) || (ctxt->input == NULL) ||
-        ((ctxt->input->flags & XML_INPUT_8_BIT) == 0)) {
-        /*
-         * We are supposed to handle UTF8, check it's valid
-         * From rfc2044: encoding of the Unicode values on UTF-8:
-         *
-         * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
-         * 0000 0000-0000 007F   0xxxxxxx
-         * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
-         * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
-         *
-         * Check for the 0x110000 limit too
-         */
-        unsigned char c;
-        unsigned int val;
+xmlStringCurrentChar(xmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED,
+                     const xmlChar *cur, int *len) {
+    int c;

-        c = *cur;
-        if (c & 0x80) {
-            if ((cur[1] & 0xc0) != 0x80)
-                goto encoding_error;
-            if ((c & 0xe0) == 0xe0) {
+    if ((cur == NULL) || (len == NULL))
+        return(0);

-                if ((cur[2] & 0xc0) != 0x80)
-                    goto encoding_error;
-                if ((c & 0xf0) == 0xf0) {
-                    if (((c & 0xf8) != 0xf0) || ((cur[3] & 0xc0) != 0x80))
-                        goto encoding_error;
-                    /* 4-byte code */
-                    *len = 4;
-                    val = (cur[0] & 0x7) << 18;
-                    val |= (cur[1] & 0x3f) << 12;
-                    val |= (cur[2] & 0x3f) << 6;
-                    val |= cur[3] & 0x3f;
-                } else {
-                    /* 3-byte code */
-                    *len = 3;
-                    val = (cur[0] & 0xf) << 12;
-                    val |= (cur[1] & 0x3f) << 6;
-                    val |= cur[2] & 0x3f;
-                }
-            } else {
-                /* 2-byte code */
-                *len = 2;
-                val = (cur[0] & 0x1f) << 6;
-                val |= cur[1] & 0x3f;
-            }
-            if (!IS_CHAR(val)) {
-	        xmlErrEncodingInt(ctxt, XML_ERR_INVALID_CHAR,
-				  "Char 0x%X out of allowed range\n", val);
-            }
-            return (val);
-        } else {
-            /* 1-byte code */
-            *len = 1;
-            return (*cur);
-        }
-    }
-    /*
-     * Assume it's a fixed length encoding (1) with
-     * a compatible encoding for the ASCII set, since
-     * XML constructs only use < 128 chars
-     */
-    *len = 1;
-    return (*cur);
-encoding_error:
+    /* cur is zero-terminated, so we can lie about its length. */
+    *len = 4;
+    c = xmlGetUTF8Char(cur, len);

-    /*
-     * An encoding problem may arise from a truncated input buffer
-     * splitting a character in the middle. In that case do not raise
-     * an error but return 0 to indicate an end of stream problem
-     */
-    if ((ctxt == NULL) || (ctxt->input == NULL) ||
-        (ctxt->input->end - ctxt->input->cur < 4)) {
-	*len = 0;
-	return(0);
-    }
-    /*
-     * If we detect an UTF8 error that probably mean that the
-     * input encoding didn't get properly advertised in the
-     * declaration header. Report the error and switch the encoding
-     * to ISO-Latin-1 (if you don't like this policy, just declare the
-     * encoding !)
-     */
-    {
-        char buffer[150];
-
-	snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
-			ctxt->input->cur[0], ctxt->input->cur[1],
-			ctxt->input->cur[2], ctxt->input->cur[3]);
-	__xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,
-		     "Input is not proper UTF-8, indicate encoding !\n%s",
-		     BAD_CAST buffer, NULL);
-    }
-    *len = 1;
-    return (*cur);
+    return((c < 0) ? 0 : c);
 }

 /**
--- a/xinclude.c
+++ b/xinclude.c
@ -1715,8 +1715,9 @@ xmlXIncludeLoadTxt(xmlXIncludeCtxtPtr ctxt, const xmlChar *url,
        int cur;
        int l;

-        cur = xmlStringCurrentChar(NULL, &content[i], &l);
-        if (!IS_CHAR(cur)) {
+        l = len - i;
+        cur = xmlGetUTF8Char(&content[i], &l);
+        if ((cur < 0) || (!IS_CHAR(cur))) {
            xmlXIncludeErr(ctxt, ref->elem, XML_XINCLUDE_INVALID_CHAR,
                           "%s contains invalid char\n", URL);
            goto error;