parser: Fix regression when push parsing UTF-8 sequences

Partial UTF-8 sequences are allowed when push parsing. Fixes #542.
2025-03-25 10:50:08 +03:00 · 2023-05-18 17:31:44 +02:00 · 2023-05-18 17:31:44 +02:00 · e0f3016f71
commit e0f3016f71
parent 687a2b719e
2 changed files with 67 additions and 34 deletions
--- a/parser.c
+++ b/parser.c
@ -4113,7 +4113,7 @@ xmlParsePubidLiteral(xmlParserCtxtPtr ctxt) {
    return(buf);
 }

-static void xmlParseCharDataComplex(xmlParserCtxtPtr ctxt);
+static void xmlParseCharDataComplex(xmlParserCtxtPtr ctxt, int partial);

 /*
 * used for the test in the inner loop of the char data testing
@ -4154,17 +4154,13 @@ static const unsigned char test_char_data[256] = {
 };

 /**
- * xmlParseCharData:
+ * xmlParseCharDataInternal:
 * @ctxt:  an XML parser context
- * @cdata:  unused
- *
- * DEPRECATED: Internal function, don't use.
+ * @partial:  buffer may contain partial UTF-8 sequences
 *
 * Parse character data. Always makes progress if the first char isn't
 * '<' or '&'.
 *
- * if we are within a CDATA section ']]>' marks an end of section.
- *
 * The right angle bracket (>) may be represented using the string "&gt;",
 * and must, for compatibility, be escaped using "&gt;" or a character
 * reference when it appears in the string "]]>" in content, when that
@ -4172,9 +4168,8 @@ static const unsigned char test_char_data[256] = {
 *
 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
 */
-
-void
-xmlParseCharData(xmlParserCtxtPtr ctxt, ATTRIBUTE_UNUSED int cdata) {
+static void
+xmlParseCharDataInternal(xmlParserCtxtPtr ctxt, int partial) {
    const xmlChar *in;
    int nbchar = 0;
    int line = ctxt->input->line;
@ -4307,7 +4302,7 @@ get_more:
             (*in == 0x09) || (*in == 0x0a));
    ctxt->input->line = line;
    ctxt->input->col = col;
-    xmlParseCharDataComplex(ctxt);
+    xmlParseCharDataComplex(ctxt, partial);
 }

 /**
@ -4322,7 +4317,7 @@ get_more:
 * of non-ASCII characters.
 */
 static void
-xmlParseCharDataComplex(xmlParserCtxtPtr ctxt) {
+xmlParseCharDataComplex(xmlParserCtxtPtr ctxt, int partial) {
    xmlChar buf[XML_PARSER_BIG_BUFFER_SIZE + 5];
    int nbchar = 0;
    int cur, l;
@ -4385,15 +4380,42 @@ xmlParseCharDataComplex(xmlParserCtxtPtr ctxt) {
 	    }
 	}
    }
-    if ((ctxt->input->cur < ctxt->input->end) && (!IS_CHAR(cur))) {
-	/* Generate the error and skip the offending character */
-        xmlFatalErrMsgInt(ctxt, XML_ERR_INVALID_CHAR,
-                          "PCDATA invalid Char value %d\n",
-	                  cur ? cur : CUR);
-	NEXT;
+    /*
+     * cur == 0 can mean
+     *
+     * - XML_PARSER_EOF or memory error. This is checked above.
+     * - An actual 0 character.
+     * - End of buffer.
+     * - An incomplete UTF-8 sequence. This is allowed if partial is set.
+     */
+    if (ctxt->input->cur < ctxt->input->end) {
+        if ((cur == 0) && (CUR != 0)) {
+            if (partial == 0) {
+                xmlFatalErrMsgInt(ctxt, XML_ERR_INVALID_CHAR,
+                        "Incomplete UTF-8 sequence starting with %02X\n", CUR);
+                NEXTL(1);
+            }
+        } else if ((cur != '<') && (cur != '&')) {
+            /* Generate the error and skip the offending character */
+            xmlFatalErrMsgInt(ctxt, XML_ERR_INVALID_CHAR,
+                              "PCDATA invalid Char value %d\n", cur);
+            NEXTL(l);
+        }
    }
 }

+/**
+ * xmlParseCharData:
+ * @ctxt:  an XML parser context
+ * @cdata:  unused
+ *
+ * DEPRECATED: Internal function, don't use.
+ *
+ * Wrapper kept under the old name: it forwards to
+ * xmlParseCharDataInternal() with partial set to 0. In that mode
+ * xmlParseCharDataComplex() raises XML_ERR_INVALID_CHAR for an
+ * incomplete UTF-8 sequence and skips one byte instead of leaving
+ * the sequence in the buffer.
+ */
+void
+xmlParseCharData(xmlParserCtxtPtr ctxt, ATTRIBUTE_UNUSED int cdata) {
+    xmlParseCharDataInternal(ctxt, 0);
+}
+
 /**
 * xmlParseExternalID:
 * @ctxt:  an XML parser context
@ -9656,7 +9678,7 @@ xmlParseContentInternal(xmlParserCtxtPtr ctxt) {
 	 * Last case, text. Note that References are handled directly.
 	 */
 	else {
-	    xmlParseCharData(ctxt, 0);
+	    xmlParseCharDataInternal(ctxt, 0);
 	}

 	SHRINK;
@ -11449,7 +11471,7 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
 			    goto done;
                    }
                    ctxt->checkIndex = 0;
-		    xmlParseCharData(ctxt, 0);
+		    xmlParseCharDataInternal(ctxt, !terminate);
 		}
 		break;
 	    }
--- a/parserInternals.c
+++ b/parserInternals.c
@ -936,14 +936,20 @@ xmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {

            avail = ctxt->input->end - ctxt->input->cur;

-	    if ((avail < 2) || (cur[1] & 0xc0) != 0x80)
+            if (avail < 2)
+                goto incomplete_sequence;
+	    if ((cur[1] & 0xc0) != 0x80)
 		goto encoding_error;
 	    if ((c & 0xe0) == 0xe0) {
-		if ((avail < 3) || (cur[2] & 0xc0) != 0x80)
+                if (avail < 3)
+                    goto incomplete_sequence;
+		if ((cur[2] & 0xc0) != 0x80)
 		    goto encoding_error;
 		if ((c & 0xf0) == 0xf0) {
+                    if (avail < 4)
+                        goto incomplete_sequence;
 		    if (((c & 0xf8) != 0xf0) ||
-			(avail < 4) || ((cur[3] & 0xc0) != 0x80))
+			((cur[3] & 0xc0) != 0x80))
 			goto encoding_error;
 		    /* 4-byte code */
 		    *len = 4;
@ -1005,17 +1011,8 @@ xmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
 	return(0xA);
    }
    return(*ctxt->input->cur);
-encoding_error:
-    /*
-     * An encoding problem may arise from a truncated input buffer
-     * splitting a character in the middle. In that case do not raise
-     * an error but return 0 to indicate an end of stream problem
-     */
-    if (ctxt->input->end - ctxt->input->cur < 4) {
-	*len = 0;
-	return(0);
-    }

+encoding_error:
    /*
     * If we detect an UTF8 error that probably mean that the
     * input encoding didn't get properly advertised in the
@ -1023,7 +1020,11 @@ encoding_error:
     * to ISO-Latin-1 (if you don't like this policy, just declare the
     * encoding !)
     */
-    {
+    if (ctxt->input->end - ctxt->input->cur < 4) {
+	__xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,
+		     "Input is not proper UTF-8, indicate encoding !\n",
+		     NULL, NULL);
+    } else {
        char buffer[150];

 	snprintf(&buffer[0], 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
@ -1036,6 +1037,16 @@ encoding_error:
    ctxt->charset = XML_CHAR_ENCODING_8859_1;
    *len = 1;
    return(*ctxt->input->cur);
+
+incomplete_sequence:
+    /*
+     * An encoding problem may arise from a truncated input buffer
+     * splitting a character in the middle. In that case do not raise
+     * an error but return 0. This should only happen when push parsing
+     * char data.
+     */
+    *len = 0;
+    return(0);
 }

 /**