1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-03-25 10:50:08 +03:00

parser: Fix regression when push parsing UTF-8 sequences

Partial UTF-8 sequences are allowed when push parsing.

Fixes #542.
This commit is contained in:
Nick Wellnhofer 2023-05-18 17:31:44 +02:00
parent 687a2b719e
commit e0f3016f71
2 changed files with 67 additions and 34 deletions

View File

@ -4113,7 +4113,7 @@ xmlParsePubidLiteral(xmlParserCtxtPtr ctxt) {
return(buf);
}
static void xmlParseCharDataComplex(xmlParserCtxtPtr ctxt);
static void xmlParseCharDataComplex(xmlParserCtxtPtr ctxt, int partial);
/*
* used for the test in the inner loop of the char data testing
@ -4154,17 +4154,13 @@ static const unsigned char test_char_data[256] = {
};
/**
* xmlParseCharData:
* xmlParseCharDataInternal:
* @ctxt: an XML parser context
* @cdata: unused
*
* DEPRECATED: Internal function, don't use.
* @partial: buffer may contain partial UTF-8 sequences
*
* Parse character data. Always makes progress if the first char isn't
* '<' or '&'.
*
* if we are within a CDATA section ']]>' marks an end of section.
*
* The right angle bracket (>) may be represented using the string "&gt;",
* and must, for compatibility, be escaped using "&gt;" or a character
* reference when it appears in the string "]]>" in content, when that
@ -4172,9 +4168,8 @@ static const unsigned char test_char_data[256] = {
*
* [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
*/
void
xmlParseCharData(xmlParserCtxtPtr ctxt, ATTRIBUTE_UNUSED int cdata) {
static void
xmlParseCharDataInternal(xmlParserCtxtPtr ctxt, int partial) {
const xmlChar *in;
int nbchar = 0;
int line = ctxt->input->line;
@ -4307,7 +4302,7 @@ get_more:
(*in == 0x09) || (*in == 0x0a));
ctxt->input->line = line;
ctxt->input->col = col;
xmlParseCharDataComplex(ctxt);
xmlParseCharDataComplex(ctxt, partial);
}
/**
@ -4322,7 +4317,7 @@ get_more:
* of non-ASCII characters.
*/
static void
xmlParseCharDataComplex(xmlParserCtxtPtr ctxt) {
xmlParseCharDataComplex(xmlParserCtxtPtr ctxt, int partial) {
xmlChar buf[XML_PARSER_BIG_BUFFER_SIZE + 5];
int nbchar = 0;
int cur, l;
@ -4385,15 +4380,42 @@ xmlParseCharDataComplex(xmlParserCtxtPtr ctxt) {
}
}
}
if ((ctxt->input->cur < ctxt->input->end) && (!IS_CHAR(cur))) {
/* Generate the error and skip the offending character */
xmlFatalErrMsgInt(ctxt, XML_ERR_INVALID_CHAR,
"PCDATA invalid Char value %d\n",
cur ? cur : CUR);
NEXT;
/*
* cur == 0 can mean
*
* - XML_PARSER_EOF or memory error. This is checked above.
* - An actual 0 character.
* - End of buffer.
* - An incomplete UTF-8 sequence. This is allowed if partial is set.
*/
if (ctxt->input->cur < ctxt->input->end) {
if ((cur == 0) && (CUR != 0)) {
if (partial == 0) {
xmlFatalErrMsgInt(ctxt, XML_ERR_INVALID_CHAR,
"Incomplete UTF-8 sequence starting with %02X\n", CUR);
NEXTL(1);
}
} else if ((cur != '<') && (cur != '&')) {
/* Generate the error and skip the offending character */
xmlFatalErrMsgInt(ctxt, XML_ERR_INVALID_CHAR,
"PCDATA invalid Char value %d\n", cur);
NEXTL(l);
}
}
}
/**
* xmlParseCharData:
* @ctxt: an XML parser context
* @cdata: unused
*
* DEPRECATED: Internal function, don't use.
*
* Non-static wrapper around the static xmlParseCharDataInternal().
* The @cdata argument is ignored; the call is forwarded with
* partial set to 0, meaning the buffer is not expected to contain
* partial UTF-8 sequences.
*/
void
xmlParseCharData(xmlParserCtxtPtr ctxt, ATTRIBUTE_UNUSED int cdata) {
/* partial == 0: an incomplete UTF-8 sequence is reported as an error */
xmlParseCharDataInternal(ctxt, 0);
}
/**
* xmlParseExternalID:
* @ctxt: an XML parser context
@ -9656,7 +9678,7 @@ xmlParseContentInternal(xmlParserCtxtPtr ctxt) {
* Last case, text. Note that References are handled directly.
*/
else {
xmlParseCharData(ctxt, 0);
xmlParseCharDataInternal(ctxt, 0);
}
SHRINK;
@ -11449,7 +11471,7 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
goto done;
}
ctxt->checkIndex = 0;
xmlParseCharData(ctxt, 0);
xmlParseCharDataInternal(ctxt, !terminate);
}
break;
}

View File

@ -936,14 +936,20 @@ xmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
avail = ctxt->input->end - ctxt->input->cur;
if ((avail < 2) || (cur[1] & 0xc0) != 0x80)
if (avail < 2)
goto incomplete_sequence;
if ((cur[1] & 0xc0) != 0x80)
goto encoding_error;
if ((c & 0xe0) == 0xe0) {
if ((avail < 3) || (cur[2] & 0xc0) != 0x80)
if (avail < 3)
goto incomplete_sequence;
if ((cur[2] & 0xc0) != 0x80)
goto encoding_error;
if ((c & 0xf0) == 0xf0) {
if (avail < 4)
goto incomplete_sequence;
if (((c & 0xf8) != 0xf0) ||
(avail < 4) || ((cur[3] & 0xc0) != 0x80))
((cur[3] & 0xc0) != 0x80))
goto encoding_error;
/* 4-byte code */
*len = 4;
@ -1005,17 +1011,8 @@ xmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
return(0xA);
}
return(*ctxt->input->cur);
encoding_error:
/*
* An encoding problem may arise from a truncated input buffer
* splitting a character in the middle. In that case do not raise
* an error but return 0 to indicate an end of stream problem
*/
if (ctxt->input->end - ctxt->input->cur < 4) {
*len = 0;
return(0);
}
encoding_error:
/*
* If we detect an UTF8 error that probably mean that the
* input encoding didn't get properly advertised in the
@ -1023,7 +1020,11 @@ encoding_error:
* to ISO-Latin-1 (if you don't like this policy, just declare the
* encoding !)
*/
{
if (ctxt->input->end - ctxt->input->cur < 4) {
__xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,
"Input is not proper UTF-8, indicate encoding !\n",
NULL, NULL);
} else {
char buffer[150];
snprintf(&buffer[0], 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
@ -1036,6 +1037,16 @@ encoding_error:
ctxt->charset = XML_CHAR_ENCODING_8859_1;
*len = 1;
return(*ctxt->input->cur);
incomplete_sequence:
/*
* An encoding problem may arise from a truncated input buffer
* splitting a character in the middle. In that case do not raise
* an error but return 0. This should only happen when push parsing
* char data.
*/
*len = 0;
return(0);
}
/**