diff --git a/HTMLparser.c b/HTMLparser.c
index 34df182b..00b64c13 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -4935,6 +4935,14 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
                  */
                 goto done;
             case XML_PARSER_START:
+                /*
+                 * Very first chars read from the document flow.
+                 */
+                if ((!terminate) && (avail < 4))
+                    goto done;
+
+                xmlDetectEncoding(ctxt);
+
                 /*
                  * This is wrong but matches long-standing behavior. In most
                  * cases, a document starting with an XML declaration will
@@ -4945,6 +4953,9 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
                     xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_UTF8);
                 }
 
+                /* fall through */
+
+            case XML_PARSER_XML_DECL:
                 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) {
                     ctxt->sax->setDocumentLocator(ctxt->userData,
                             (xmlSAXLocator *) &xmlDefaultSAXLocator);
@@ -4953,8 +4964,9 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
                         (!ctxt->disableSAX))
                     ctxt->sax->startDocument(ctxt->userData);
 
-                /* Allow callback to modify state */
-                if (ctxt->instate == XML_PARSER_START)
+                /* Move on to MISC unless the callback changed the state (tests) */
+                if ((ctxt->instate == XML_PARSER_START) ||
+                    (ctxt->instate == XML_PARSER_XML_DECL))
                     ctxt->instate = XML_PARSER_MISC;
                 break;
             case XML_PARSER_START_TAG: {
diff --git a/runtest.c b/runtest.c
index d06ec83e..f6bb9679 100644
--- a/runtest.c
+++ b/runtest.c
@@ -1797,6 +1797,8 @@ htmlTokenizerTest(const char *filename, const char *result,
     config.startTag = BAD_CAST startTag;
     config.inCharacters = 0;
     ctxt->_private = &config;
+    /* Start in the XML_DECL state to skip charset auto-detection */
+    ctxt->instate = XML_PARSER_XML_DECL;
     htmlCtxtUseOptions(ctxt, options | HTML_PARSE_HTML5);
     htmlParseChunk(ctxt, data, size, 1);
     htmlFreeParserCtxt(ctxt);