diff --git a/HTMLparser.c b/HTMLparser.c
index 34df182b..00b64c13 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -4935,6 +4935,14 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
*/
goto done;
case XML_PARSER_START:
+ /*
+ * Wait for the first 4 bytes (unless terminating), then detect the encoding.
+ */
+ if ((!terminate) && (avail < 4))
+ goto done;
+
+ xmlDetectEncoding(ctxt);
+
/*
* This is wrong but matches long-standing behavior. In most
* cases, a document starting with an XML declaration will
@@ -4945,6 +4953,9 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_UTF8);
}
+ /* fall through */
+
+ case XML_PARSER_XML_DECL:
if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) {
ctxt->sax->setDocumentLocator(ctxt->userData,
(xmlSAXLocator *) &xmlDefaultSAXLocator);
@@ -4953,8 +4964,9 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
(!ctxt->disableSAX))
ctxt->sax->startDocument(ctxt->userData);
- /* Allow callback to modify state */
- if (ctxt->instate == XML_PARSER_START)
+ /* Advance to MISC unless a callback already changed the state (used by tests) */
+ if ((ctxt->instate == XML_PARSER_START) ||
+ (ctxt->instate == XML_PARSER_XML_DECL))
ctxt->instate = XML_PARSER_MISC;
break;
case XML_PARSER_START_TAG: {
diff --git a/runtest.c b/runtest.c
index d06ec83e..f6bb9679 100644
--- a/runtest.c
+++ b/runtest.c
@@ -1797,6 +1797,8 @@ htmlTokenizerTest(const char *filename, const char *result,
config.startTag = BAD_CAST startTag;
config.inCharacters = 0;
ctxt->_private = &config;
+ /* Skip charset auto-detection */
+ ctxt->instate = XML_PARSER_XML_DECL;
htmlCtxtUseOptions(ctxt, options | HTML_PARSE_HTML5);
htmlParseChunk(ctxt, data, size, 1);
htmlFreeParserCtxt(ctxt);