1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2024-10-26 12:25:09 +03:00

html: Reenable buggy detection of XML declarations

Switch to UTF-8 if a document starts with '<?xm' to match old behavior.
Also enable this check in the push parser.

Fixes #637.
This commit is contained in:
Nick Wellnhofer 2023-11-30 16:15:46 +01:00
parent c4d22fe4b4
commit 8672bf253b
5 changed files with 43 additions and 0 deletions

View File

@ -4851,6 +4851,14 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) {
xmlDetectEncoding(ctxt); xmlDetectEncoding(ctxt);
/*
* This is wrong but matches long-standing behavior. In most cases,
* a document starting with an XML declaration will specify UTF-8.
*/
if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
(xmlStrncmp(ctxt->input->cur, BAD_CAST "<?xm", 4) == 0))
xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_UTF8);
/*
* Wipe out everything which is before the first '<'
*/
@ -5408,6 +5416,16 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
*/ */
goto done; goto done;
case XML_PARSER_START: case XML_PARSER_START:
/*
* This is wrong but matches long-standing behavior. In most
* cases, a document starting with an XML declaration will
* specify UTF-8.
*/
if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
(xmlStrncmp(ctxt->input->cur, BAD_CAST "<?xm", 4) == 0)) {
xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_UTF8);
}
/*
* Very first chars read from the document flow.
*/

View File

@ -0,0 +1,4 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<?xml encoding="UTF-8"><html><body>
<p>&ouml;&auml;&uuml;&szlig;</p>
</body></html>

View File

@ -0,0 +1,13 @@
SAX.setDocumentLocator()
SAX.startDocument()
SAX.processingInstruction(xml, encoding="UTF-8")
SAX.startElement(html)
SAX.startElement(body)
SAX.startElement(p)
SAX.characters(&ouml;&auml;&uuml;&szlig;, 8)
SAX.endElement(p)
SAX.characters(
, 1)
SAX.endElement(body)
SAX.endElement(html)
SAX.endDocument()

View File

@ -2140,6 +2140,12 @@ pushBoundaryTest(const char *filename, const char *result,
int cur = 0; int cur = 0;
unsigned long avail, oldConsumed, consumed; unsigned long avail, oldConsumed, consumed;
/*
* HTML encoding detection doesn't work when data is fed bytewise.
*/
if (strcmp(filename, "./test/HTML/xml-declaration-1.html") == 0)
return(0);
/*
* If the parser made progress, check that exactly one construct was
* processed and that the input buffer is (almost) empty.

View File

@ -0,0 +1,2 @@
<?xml encoding="UTF-8">
<p>öäüß</p>