mirror of
https://gitlab.gnome.org/GNOME/libxml2.git
synced 2025-03-25 10:50:08 +03:00
parser: Fix regression when push parsing UTF-8 sequences
Partial UTF-8 sequences are allowed when push parsing. Fixes #542.
This commit is contained in:
parent
687a2b719e
commit
e0f3016f71
62
parser.c
62
parser.c
@ -4113,7 +4113,7 @@ xmlParsePubidLiteral(xmlParserCtxtPtr ctxt) {
|
||||
return(buf);
|
||||
}
|
||||
|
||||
static void xmlParseCharDataComplex(xmlParserCtxtPtr ctxt);
|
||||
static void xmlParseCharDataComplex(xmlParserCtxtPtr ctxt, int partial);
|
||||
|
||||
/*
|
||||
* used for the test in the inner loop of the char data testing
|
||||
@ -4154,17 +4154,13 @@ static const unsigned char test_char_data[256] = {
|
||||
};
|
||||
|
||||
/**
|
||||
* xmlParseCharData:
|
||||
* xmlParseCharDataInternal:
|
||||
* @ctxt: an XML parser context
|
||||
* @cdata: unused
|
||||
*
|
||||
* DEPRECATED: Internal function, don't use.
|
||||
* @partial: buffer may contain partial UTF-8 sequences
|
||||
*
|
||||
* Parse character data. Always makes progress if the first char isn't
|
||||
* '<' or '&'.
|
||||
*
|
||||
* if we are within a CDATA section ']]>' marks an end of section.
|
||||
*
|
||||
* The right angle bracket (>) may be represented using the string "&gt;",
|
||||
* and must, for compatibility, be escaped using "&gt;" or a character
|
||||
* reference when it appears in the string "]]>" in content, when that
|
||||
@ -4172,9 +4168,8 @@ static const unsigned char test_char_data[256] = {
|
||||
*
|
||||
* [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
|
||||
*/
|
||||
|
||||
void
|
||||
xmlParseCharData(xmlParserCtxtPtr ctxt, ATTRIBUTE_UNUSED int cdata) {
|
||||
static void
|
||||
xmlParseCharDataInternal(xmlParserCtxtPtr ctxt, int partial) {
|
||||
const xmlChar *in;
|
||||
int nbchar = 0;
|
||||
int line = ctxt->input->line;
|
||||
@ -4307,7 +4302,7 @@ get_more:
|
||||
(*in == 0x09) || (*in == 0x0a));
|
||||
ctxt->input->line = line;
|
||||
ctxt->input->col = col;
|
||||
xmlParseCharDataComplex(ctxt);
|
||||
xmlParseCharDataComplex(ctxt, partial);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -4322,7 +4317,7 @@ get_more:
|
||||
* of non-ASCII characters.
|
||||
*/
|
||||
static void
|
||||
xmlParseCharDataComplex(xmlParserCtxtPtr ctxt) {
|
||||
xmlParseCharDataComplex(xmlParserCtxtPtr ctxt, int partial) {
|
||||
xmlChar buf[XML_PARSER_BIG_BUFFER_SIZE + 5];
|
||||
int nbchar = 0;
|
||||
int cur, l;
|
||||
@ -4385,15 +4380,42 @@ xmlParseCharDataComplex(xmlParserCtxtPtr ctxt) {
|
||||
}
|
||||
}
|
||||
}
|
||||
if ((ctxt->input->cur < ctxt->input->end) && (!IS_CHAR(cur))) {
|
||||
/* Generate the error and skip the offending character */
|
||||
xmlFatalErrMsgInt(ctxt, XML_ERR_INVALID_CHAR,
|
||||
"PCDATA invalid Char value %d\n",
|
||||
cur ? cur : CUR);
|
||||
NEXT;
|
||||
/*
|
||||
* cur == 0 can mean
|
||||
*
|
||||
* - XML_PARSER_EOF or memory error. This is checked above.
|
||||
* - An actual 0 character.
|
||||
* - End of buffer.
|
||||
* - An incomplete UTF-8 sequence. This is allowed if partial is set.
|
||||
*/
|
||||
if (ctxt->input->cur < ctxt->input->end) {
|
||||
if ((cur == 0) && (CUR != 0)) {
|
||||
if (partial == 0) {
|
||||
xmlFatalErrMsgInt(ctxt, XML_ERR_INVALID_CHAR,
|
||||
"Incomplete UTF-8 sequence starting with %02X\n", CUR);
|
||||
NEXTL(1);
|
||||
}
|
||||
} else if ((cur != '<') && (cur != '&')) {
|
||||
/* Generate the error and skip the offending character */
|
||||
xmlFatalErrMsgInt(ctxt, XML_ERR_INVALID_CHAR,
|
||||
"PCDATA invalid Char value %d\n", cur);
|
||||
NEXTL(l);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* xmlParseCharData:
|
||||
* @ctxt: an XML parser context
|
||||
* @cdata: unused
|
||||
*
|
||||
* DEPRECATED: Internal function, don't use.
|
||||
*/
|
||||
void
|
||||
xmlParseCharData(xmlParserCtxtPtr ctxt, ATTRIBUTE_UNUSED int cdata) {
|
||||
xmlParseCharDataInternal(ctxt, 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* xmlParseExternalID:
|
||||
* @ctxt: an XML parser context
|
||||
@ -9656,7 +9678,7 @@ xmlParseContentInternal(xmlParserCtxtPtr ctxt) {
|
||||
* Last case, text. Note that References are handled directly.
|
||||
*/
|
||||
else {
|
||||
xmlParseCharData(ctxt, 0);
|
||||
xmlParseCharDataInternal(ctxt, 0);
|
||||
}
|
||||
|
||||
SHRINK;
|
||||
@ -11449,7 +11471,7 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
|
||||
goto done;
|
||||
}
|
||||
ctxt->checkIndex = 0;
|
||||
xmlParseCharData(ctxt, 0);
|
||||
xmlParseCharDataInternal(ctxt, !terminate);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
@ -936,14 +936,20 @@ xmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
|
||||
|
||||
avail = ctxt->input->end - ctxt->input->cur;
|
||||
|
||||
if ((avail < 2) || (cur[1] & 0xc0) != 0x80)
|
||||
if (avail < 2)
|
||||
goto incomplete_sequence;
|
||||
if ((cur[1] & 0xc0) != 0x80)
|
||||
goto encoding_error;
|
||||
if ((c & 0xe0) == 0xe0) {
|
||||
if ((avail < 3) || (cur[2] & 0xc0) != 0x80)
|
||||
if (avail < 3)
|
||||
goto incomplete_sequence;
|
||||
if ((cur[2] & 0xc0) != 0x80)
|
||||
goto encoding_error;
|
||||
if ((c & 0xf0) == 0xf0) {
|
||||
if (avail < 4)
|
||||
goto incomplete_sequence;
|
||||
if (((c & 0xf8) != 0xf0) ||
|
||||
(avail < 4) || ((cur[3] & 0xc0) != 0x80))
|
||||
((cur[3] & 0xc0) != 0x80))
|
||||
goto encoding_error;
|
||||
/* 4-byte code */
|
||||
*len = 4;
|
||||
@ -1005,17 +1011,8 @@ xmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
|
||||
return(0xA);
|
||||
}
|
||||
return(*ctxt->input->cur);
|
||||
encoding_error:
|
||||
/*
|
||||
* An encoding problem may arise from a truncated input buffer
|
||||
* splitting a character in the middle. In that case do not raise
|
||||
* an error but return 0 to indicate an end of stream problem
|
||||
*/
|
||||
if (ctxt->input->end - ctxt->input->cur < 4) {
|
||||
*len = 0;
|
||||
return(0);
|
||||
}
|
||||
|
||||
encoding_error:
|
||||
/*
|
||||
* If we detect an UTF8 error that probably mean that the
|
||||
* input encoding didn't get properly advertised in the
|
||||
@ -1023,7 +1020,11 @@ encoding_error:
|
||||
* to ISO-Latin-1 (if you don't like this policy, just declare the
|
||||
* encoding !)
|
||||
*/
|
||||
{
|
||||
if (ctxt->input->end - ctxt->input->cur < 4) {
|
||||
__xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,
|
||||
"Input is not proper UTF-8, indicate encoding !\n",
|
||||
NULL, NULL);
|
||||
} else {
|
||||
char buffer[150];
|
||||
|
||||
snprintf(&buffer[0], 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
|
||||
@ -1036,6 +1037,16 @@ encoding_error:
|
||||
ctxt->charset = XML_CHAR_ENCODING_8859_1;
|
||||
*len = 1;
|
||||
return(*ctxt->input->cur);
|
||||
|
||||
incomplete_sequence:
|
||||
/*
|
||||
* An encoding problem may arise from a truncated input buffer
|
||||
* splitting a character in the middle. In that case do not raise
|
||||
* an error but return 0. This should only happen when push parsing
|
||||
* char data.
|
||||
*/
|
||||
*len = 0;
|
||||
return(0);
|
||||
}
|
||||
|
||||
/**
|
||||
|
Loading…
x
Reference in New Issue
Block a user