diff --git a/HTMLparser.c b/HTMLparser.c index fa1fe380..9c3359f8 100644 --- a/HTMLparser.c +++ b/HTMLparser.c @@ -4851,6 +4851,14 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) { xmlDetectEncoding(ctxt); + /* + * This is wrong but matches long-standing behavior. In most cases, + * a document starting with an XML declaration will specify UTF-8. + */ + if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) && + (xmlStrncmp(ctxt->input->cur, BAD_CAST "input->flags & XML_INPUT_HAS_ENCODING) == 0) && + (xmlStrncmp(ctxt->input->cur, BAD_CAST " + +

öäüß

+ diff --git a/result/HTML/xml-declaration-1.html.sax b/result/HTML/xml-declaration-1.html.sax new file mode 100644 index 00000000..83fe8eb6 --- /dev/null +++ b/result/HTML/xml-declaration-1.html.sax @@ -0,0 +1,13 @@ +SAX.setDocumentLocator() +SAX.startDocument() +SAX.processingInstruction(xml, encoding="UTF-8") +SAX.startElement(html) +SAX.startElement(body) +SAX.startElement(p) +SAX.characters(öäüß, 8) +SAX.endElement(p) +SAX.characters( +, 1) +SAX.endElement(body) +SAX.endElement(html) +SAX.endDocument() diff --git a/runtest.c b/runtest.c index ff65fe86..c78eec81 100644 --- a/runtest.c +++ b/runtest.c @@ -2140,6 +2140,12 @@ pushBoundaryTest(const char *filename, const char *result, int cur = 0; unsigned long avail, oldConsumed, consumed; + /* + * HTML encoding detection doesn't work when data is fed bytewise. + */ + if (strcmp(filename, "./test/HTML/xml-declaration-1.html") == 0) + return(0); + /* * If the parser made progress, check that exactly one construct was * processed and that the input buffer is (almost) empty. diff --git a/test/HTML/xml-declaration-1.html b/test/HTML/xml-declaration-1.html new file mode 100644 index 00000000..1950be71 --- /dev/null +++ b/test/HTML/xml-declaration-1.html @@ -0,0 +1,2 @@ + +

öäüß