parser: Stop switching to ISO-8859-1 on encoding errors

Use U+FFFD Replacement Character if invalid UTF-8 is encountered in recovery mode. Also rewrite xmlNextChar and xmlCurrentChar. Fixes #598.
2025-03-31 06:50:06 +03:00 · 2023-08-09 16:59:36 +02:00 · 2023-08-09 16:59:36 +02:00 · c082ef4644
commit c082ef4644
parent a9ada18352
3 changed files with 198 additions and 272 deletions
--- a/include/private/parser.h
+++ b/include/private/parser.h
@@ -24,7 +24,7 @@
 #define XML_INPUT_AUTO_UTF16BE      (3u << 1)
 #define XML_INPUT_AUTO_OTHER        (4u << 1)
 #define XML_INPUT_USES_ENC_DECL     (1u << 4)
-#define XML_INPUT_8_BIT             (1u << 5)
+#define XML_INPUT_ENCODING_ERROR    (1u << 5)

 XML_HIDDEN void
 xmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra);
--- a/parser.c
+++ b/parser.c
@@ -2292,8 +2292,8 @@ static int spacePop(xmlParserCtxtPtr ctxt) {
 #define CUR_CHAR(l) xmlCurrentChar(ctxt, &l)
 #define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)

-#define COPY_BUF(l,b,i,v)						\
-    if (l == 1) b[i++] = v;						\
+#define COPY_BUF(b, i, v)						\
+    if (v < 0x80) b[i++] = v;						\
    else i += xmlCopyCharMultiByte(&b[i],v)

 /**
@@ -2843,7 +2843,7 @@ xmlStringDecodeEntitiesInt(xmlParserCtxtPtr ctxt, const xmlChar *str, int len,
 	    int val = xmlParseStringCharRef(ctxt, &str);
 	    if (val == 0)
                goto int_error;
-	    COPY_BUF(0,buffer,nbchars,val);
+	    COPY_BUF(buffer, nbchars, val);
 	    if (nbchars + XML_PARSER_BUFFER_SIZE > buffer_size) {
 	        growBuffer(buffer, XML_PARSER_BUFFER_SIZE);
 	    }
@@ -2856,7 +2856,7 @@ xmlStringDecodeEntitiesInt(xmlParserCtxtPtr ctxt, const xmlChar *str, int len,
 	    if ((ent != NULL) &&
 		(ent->etype == XML_INTERNAL_PREDEFINED_ENTITY)) {
 		if (ent->content != NULL) {
-		    COPY_BUF(0,buffer,nbchars,ent->content[0]);
+		    COPY_BUF(buffer, nbchars, ent->content[0]);
 		    if (nbchars + XML_PARSER_BUFFER_SIZE > buffer_size) {
 			growBuffer(buffer, XML_PARSER_BUFFER_SIZE);
 		    }
@@ -2967,7 +2967,7 @@ xmlStringDecodeEntitiesInt(xmlParserCtxtPtr ctxt, const xmlChar *str, int len,
                rep = NULL;
 	    }
 	} else {
-	    COPY_BUF(l,buffer,nbchars,c);
+	    COPY_BUF(buffer, nbchars, c);
 	    str += l;
 	    if (nbchars + XML_PARSER_BUFFER_SIZE > buffer_size) {
 	        growBuffer(buffer, XML_PARSER_BUFFER_SIZE);
@@ -3742,11 +3742,11 @@ xmlParseStringName(xmlParserCtxtPtr ctxt, const xmlChar** str) {
 	return(NULL);
    }

-    COPY_BUF(l,buf,len,c);
+    COPY_BUF(buf, len, c);
    cur += l;
    c = CUR_SCHAR(cur, l);
    while (xmlIsNameChar(ctxt, c)) {
-	COPY_BUF(l,buf,len,c);
+	COPY_BUF(buf, len, c);
 	cur += l;
 	c = CUR_SCHAR(cur, l);
 	if (len >= XML_MAX_NAMELEN) { /* test bigentname.xml */
@@ -3776,7 +3776,7 @@ xmlParseStringName(xmlParserCtxtPtr ctxt, const xmlChar** str) {
 		    }
 		    buffer = tmp;
 		}
-		COPY_BUF(l,buffer,len,c);
+		COPY_BUF(buffer, len, c);
 		cur += l;
 		c = CUR_SCHAR(cur, l);
                if (len > maxLength) {
@@ -3825,7 +3825,7 @@ xmlParseNmtoken(xmlParserCtxtPtr ctxt) {
    c = CUR_CHAR(l);

    while (xmlIsNameChar(ctxt, c)) {
-	COPY_BUF(l,buf,len,c);
+	COPY_BUF(buf, len, c);
 	NEXTL(l);
 	c = CUR_CHAR(l);
 	if (len >= XML_MAX_NAMELEN) {
@@ -3855,7 +3855,7 @@ xmlParseNmtoken(xmlParserCtxtPtr ctxt) {
 		    }
 		    buffer = tmp;
 		}
-		COPY_BUF(l,buffer,len,c);
+		COPY_BUF(buffer, len, c);
                if (len > maxLength) {
                    xmlFatalErr(ctxt, XML_ERR_NAME_TOO_LONG, "NmToken");
                    xmlFree(buffer);
@@ -3957,7 +3957,7 @@ xmlParseEntityValue(xmlParserCtxtPtr ctxt, xmlChar **orig) {
 	    }
 	    buf = tmp;
 	}
-	COPY_BUF(l,buf,len,c);
+	COPY_BUF(buf, len, c);
 	NEXTL(l);

 	GROW;
@@ -4241,7 +4241,7 @@ xmlParseAttValueComplex(xmlParserCtxtPtr ctxt, int *attlen, int normalize) {
 	    if ((c == 0x20) || (c == 0xD) || (c == 0xA) || (c == 0x9)) {
 	        if ((len != 0) || (!normalize)) {
 		    if ((!normalize) || (!in_space)) {
-			COPY_BUF(l,buf,len,0x20);
+			COPY_BUF(buf, len, 0x20);
 			while (len + 10 > buf_size) {
 			    growBuffer(buf, 10);
 			}
@@ -4250,7 +4250,7 @@ xmlParseAttValueComplex(xmlParserCtxtPtr ctxt, int *attlen, int normalize) {
 		}
 	    } else {
 	        in_space = 0;
-		COPY_BUF(l,buf,len,c);
+		COPY_BUF(buf, len, c);
 		if (len + 10 > buf_size) {
 		    growBuffer(buf, 10);
 		}
@@ -4397,7 +4397,7 @@ xmlParseSystemLiteral(xmlParserCtxtPtr ctxt) {
 	    }
 	    buf = tmp;
 	}
-	COPY_BUF(l,buf,len,cur);
+	COPY_BUF(buf, len, cur);
        if (len > maxLength) {
            xmlFatalErr(ctxt, XML_ERR_NAME_TOO_LONG, "SystemLiteral");
            xmlFree(buf);
@@ -4721,7 +4721,7 @@ xmlParseCharDataComplex(xmlParserCtxtPtr ctxt, int partial) {
 	if ((cur == ']') && (NXT(1) == ']') && (NXT(2) == '>')) {
 	    xmlFatalErr(ctxt, XML_ERR_MISPLACED_CDATA_END, NULL);
 	}
-	COPY_BUF(l,buf,nbchar,cur);
+	COPY_BUF(buf, nbchar, cur);
 	/* move current position before possible calling of ctxt->sax->characters */
 	NEXTL(l);
 	if (nbchar >= XML_PARSER_BIG_BUFFER_SIZE) {
@@ -4964,7 +4964,7 @@ xmlParseCommentComplex(xmlParserCtxtPtr ctxt, xmlChar *buf,
 	    buf = new_buf;
            size = new_size;
 	}
-	COPY_BUF(ql,buf,len,q);
+	COPY_BUF(buf, len, q);
        if (len > maxLength) {
            xmlFatalErrMsgStr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
                         "Comment too big found", NULL);
@@ -5386,7 +5386,7 @@ xmlParsePI(xmlParserCtxtPtr ctxt) {
 		    buf = tmp;
                    size = new_size;
 		}
-		COPY_BUF(l,buf,len,cur);
+		COPY_BUF(buf, len, cur);
                if (len > maxLength) {
                    xmlFatalErrMsgStr(ctxt, XML_ERR_PI_NOT_FINISHED,
                                      "PI %s too big found", target);
@@ -7246,7 +7246,7 @@ xmlParseReference(xmlParserCtxtPtr ctxt) {
        /*
         * Just encode the value in UTF-8
         */
-        COPY_BUF(0, out, i, value);
+        COPY_BUF(out, i, value);
        out[i] = 0;
        if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL) &&
            (!ctxt->disableSAX))
@@ -10218,7 +10218,7 @@ xmlParseCDSect(xmlParserCtxtPtr ctxt) {
 	    buf = tmp;
 	    size *= 2;
 	}
-	COPY_BUF(rl,buf,len,r);
+	COPY_BUF(buf, len, r);
        if (len > maxLength) {
            xmlFatalErrMsg(ctxt, XML_ERR_CDATA_NOT_FINISHED,
                           "CData section too big found\n");
--- a/parserInternals.c
+++ b/parserInternals.c
@@ -38,7 +38,6 @@

 #define CUR(ctxt) ctxt->input->cur
 #define END(ctxt) ctxt->input->end
-#define VALID_CTXT(ctxt) (CUR(ctxt) <= END(ctxt))

 #include "private/buf.h"
 #include "private/enc.h"
@@ -697,154 +696,102 @@ xmlParserInputShrink(xmlParserInputPtr in) {
 void
 xmlNextChar(xmlParserCtxtPtr ctxt)
 {
+    const unsigned char *cur;
+    size_t avail;
+    int c;
+
    if ((ctxt == NULL) || (ctxt->instate == XML_PARSER_EOF) ||
        (ctxt->input == NULL))
        return;

-    if (!(VALID_CTXT(ctxt))) {
-        xmlErrInternal(ctxt, "Parser input data memory error\n", NULL);
-	ctxt->errNo = XML_ERR_INTERNAL_ERROR;
-        xmlStopParser(ctxt);
-	return;
-    }
+    avail = ctxt->input->end - ctxt->input->cur;

-    if (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK) {
+    if (avail < INPUT_CHUNK) {
        xmlParserGrow(ctxt);
        if ((ctxt->instate == XML_PARSER_EOF) ||
            (ctxt->input->cur >= ctxt->input->end))
            return;
+        avail = ctxt->input->end - ctxt->input->cur;
    }

-    if ((ctxt->input->flags & XML_INPUT_8_BIT) == 0) {
-        const unsigned char *cur;
-        unsigned char c;
+    cur = ctxt->input->cur;
+    c = *cur;

-        cur = ctxt->input->cur;
-
-        /*
-         *   2.11 End-of-Line Handling
-         *   the literal two-character sequence "#xD#xA" or a standalone
-         *   literal #xD, an XML processor must pass to the application
-         *   the single character #xA.
-         */
-        if ((*cur == '\n') || (*cur == '\r')) {
-            ctxt->input->line++;
-            ctxt->input->col = 1;
-            if ((*cur == '\r') && (cur[1] == '\n')) {
-                ctxt->input->cur++;
-                cur++;
-            }
-        } else {
-            ctxt->input->col++;
-        }
-
-        /*
-         * We are supposed to handle UTF8, check it's valid
-         * From rfc2044: encoding of the Unicode values on UTF-8:
-         *
-         * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
-         * 0000 0000-0000 007F   0xxxxxxx
-         * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
-         * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
-         *
-         * Check for the 0x110000 limit too
-         */
-
-        c = *cur;
-        if (c & 0x80) {
-            size_t avail;
-
-            if (c == 0xC0)
-	        goto encoding_error;
-
-            avail = ctxt->input->end - ctxt->input->cur;
-
-            if ((avail < 2) || (cur[1] & 0xc0) != 0x80)
-                goto encoding_error;
-            if ((c & 0xe0) == 0xe0) {
-                unsigned int val;
-
-                if ((avail < 3) || (cur[2] & 0xc0) != 0x80)
-                    goto encoding_error;
-                if ((c & 0xf0) == 0xf0) {
-                    if (((c & 0xf8) != 0xf0) ||
-                        (avail < 4) || ((cur[3] & 0xc0) != 0x80))
-                        goto encoding_error;
-                    /* 4-byte code */
-                    ctxt->input->cur += 4;
-                    val = (cur[0] & 0x7) << 18;
-                    val |= (cur[1] & 0x3f) << 12;
-                    val |= (cur[2] & 0x3f) << 6;
-                    val |= cur[3] & 0x3f;
-                } else {
-                    /* 3-byte code */
-                    ctxt->input->cur += 3;
-                    val = (cur[0] & 0xf) << 12;
-                    val |= (cur[1] & 0x3f) << 6;
-                    val |= cur[2] & 0x3f;
-                }
-                if (((val > 0xd7ff) && (val < 0xe000)) ||
-                    ((val > 0xfffd) && (val < 0x10000)) ||
-                    (val >= 0x110000)) {
-		xmlErrEncodingInt(ctxt, XML_ERR_INVALID_CHAR,
-				  "Char 0x%X out of allowed range\n",
-				  val);
-                }
-            } else
-                /* 2-byte code */
-                ctxt->input->cur += 2;
-        } else
-            /* 1-byte code */
+    if (c < 0x80) {
+        if (c == '\n') {
            ctxt->input->cur++;
-    } else {
-        const unsigned char *cur;
-
-        /*
-         * Assume it's a fixed length encoding (1) with
-         * a compatible encoding for the ASCII set, since
-         * XML constructs only use < 128 chars
-         */
-
-        cur = ctxt->input->cur;
-
-        if ((*cur == '\n') || (*cur == '\r')) {
            ctxt->input->line++;
            ctxt->input->col = 1;
-            if ((*cur == '\r') && (cur[1] == '\n')) {
-                ctxt->input->cur++;
-            }
+        } else if (c == '\r') {
+            /*
+             *   2.11 End-of-Line Handling
+             *   the literal two-character sequence "#xD#xA" or a standalone
+             *   literal #xD, an XML processor must pass to the application
+             *   the single character #xA.
+             */
+            ctxt->input->cur += ((cur[1] == '\n') ? 2 : 1);
+            ctxt->input->line++;
+            ctxt->input->col = 1;
+            return;
        } else {
+            ctxt->input->cur++;
            ctxt->input->col++;
        }
-        ctxt->input->cur++;
-    }
-    return;
-encoding_error:
-    /*
-     * If we detect an UTF8 error that probably mean that the
-     * input encoding didn't get properly advertised in the
-     * declaration header. Report the error and switch the encoding
-     * to ISO-Latin-1 (if you don't like this policy, just declare the
-     * encoding !)
-     */
-    if ((ctxt == NULL) || (ctxt->input == NULL) ||
-        (ctxt->input->end - ctxt->input->cur < 4)) {
-	__xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,
-		     "Input is not proper UTF-8, indicate encoding !\n",
-		     NULL, NULL);
    } else {
-        char buffer[150];
+        ctxt->input->col++;

-	snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
-			ctxt->input->cur[0], ctxt->input->cur[1],
-			ctxt->input->cur[2], ctxt->input->cur[3]);
-	__xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,
-		     "Input is not proper UTF-8, indicate encoding !\n%s",
-		     BAD_CAST buffer, NULL);
+        if ((avail < 2) || (cur[1] & 0xc0) != 0x80)
+            goto encoding_error;
+
+        if (c < 0xe0) {
+            /* 2-byte code */
+            if (c < 0xc2)
+                goto encoding_error;
+            ctxt->input->cur += 2;
+        } else {
+            unsigned int val = (c << 8) | cur[1];
+
+            if ((avail < 3) || (cur[2] & 0xc0) != 0x80)
+                goto encoding_error;
+
+            if (c < 0xf0) {
+                /* 3-byte code */
+                if ((val < 0xe0a0) || ((val >= 0xeda0) && (val < 0xee00)))
+                    goto encoding_error;
+                ctxt->input->cur += 3;
+            } else {
+                if ((avail < 4) || ((cur[3] & 0xc0) != 0x80))
+                    goto encoding_error;
+
+                /* 4-byte code */
+                if ((val < 0xf090) || (val >= 0xf490))
+                    goto encoding_error;
+                ctxt->input->cur += 4;
+            }
+        }
    }
-    if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) {
-        ctxt->input->flags |= XML_INPUT_HAS_ENCODING;
-        ctxt->input->flags |= XML_INPUT_8_BIT;
+
+    return;
+
+encoding_error:
+    /* Only report the first error */
+    if ((ctxt->input->flags & XML_INPUT_ENCODING_ERROR) == 0) {
+        if ((ctxt == NULL) || (ctxt->input == NULL) ||
+            (ctxt->input->end - ctxt->input->cur < 4)) {
+            __xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,
+                         "Input is not proper UTF-8, indicate encoding !\n",
+                         NULL, NULL);
+        } else {
+            char buffer[150];
+
+            snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
+                            ctxt->input->cur[0], ctxt->input->cur[1],
+                            ctxt->input->cur[2], ctxt->input->cur[3]);
+            __xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,
+                         "Input is not proper UTF-8, indicate encoding !\n%s",
+                         BAD_CAST buffer, NULL);
+        }
+        ctxt->input->flags |= XML_INPUT_ENCODING_ERROR;
    }
    ctxt->input->cur++;
    return;
@@ -872,149 +819,129 @@ encoding_error:

 int
 xmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
+    const unsigned char *cur;
+    size_t avail;
+    int c;
+
    if ((ctxt == NULL) || (len == NULL) || (ctxt->input == NULL)) return(0);
    if (ctxt->instate == XML_PARSER_EOF)
 	return(0);

-    if (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK) {
+    avail = ctxt->input->end - ctxt->input->cur;
+
+    if (avail < INPUT_CHUNK) {
        xmlParserGrow(ctxt);
        if (ctxt->instate == XML_PARSER_EOF)
            return(0);
+        avail = ctxt->input->end - ctxt->input->cur;
    }

-    if ((*ctxt->input->cur >= 0x20) && (*ctxt->input->cur <= 0x7F)) {
-	    *len = 1;
-	    return(*ctxt->input->cur);
-    }
-    if ((ctxt->input->flags & XML_INPUT_8_BIT) == 0) {
-	/*
-	 * We are supposed to handle UTF8, check it's valid
-	 * From rfc2044: encoding of the Unicode values on UTF-8:
-	 *
-	 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
-	 * 0000 0000-0000 007F   0xxxxxxx
-	 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
-	 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
-	 *
-	 * Check for the 0x110000 limit too
-	 */
-	const unsigned char *cur = ctxt->input->cur;
-	unsigned char c;
-	unsigned int val;
+    cur = ctxt->input->cur;
+    c = *cur;

-	c = *cur;
-	if (c & 0x80) {
-            size_t avail;
+    if (c < 0x80) {
+	/* 1-byte code */
+        if (c < 0x20) {
+            /*
+             *   2.11 End-of-Line Handling
+             *   the literal two-character sequence "#xD#xA" or a standalone
+             *   literal #xD, an XML processor must pass to the application
+             *   the single character #xA.
+             */
+            if (c == '\r') {
+                *len = ((cur[1] == '\n') ? 2 : 1);
+                c = '\n';
+            } else if (c == 0) {
+                if (ctxt->input->cur >= ctxt->input->end) {
+                    *len = 0;
+                } else {
+                    *len = 1;
+                    xmlErrEncodingInt(ctxt, XML_ERR_INVALID_CHAR,
+                            "Char 0x0 out of allowed range\n", c);
+                }
+            } else {
+                *len = 1;
+            }
+        } else {
+            *len = 1;
+        }

-	    if (((c & 0x40) == 0) || (c == 0xC0))
-		goto encoding_error;
+        return(c);
+    } else {
+        int val;

-            avail = ctxt->input->end - ctxt->input->cur;
+        if (avail < 2)
+            goto incomplete_sequence;
+        if ((cur[1] & 0xc0) != 0x80)
+            goto encoding_error;

-            if (avail < 2)
-                goto incomplete_sequence;
-	    if ((cur[1] & 0xc0) != 0x80)
-		goto encoding_error;
-	    if ((c & 0xe0) == 0xe0) {
-                if (avail < 3)
-                    goto incomplete_sequence;
-		if ((cur[2] & 0xc0) != 0x80)
-		    goto encoding_error;
-		if ((c & 0xf0) == 0xf0) {
-                    if (avail < 4)
-                        goto incomplete_sequence;
-		    if (((c & 0xf8) != 0xf0) ||
-			((cur[3] & 0xc0) != 0x80))
-			goto encoding_error;
-		    /* 4-byte code */
-		    *len = 4;
-		    val = (cur[0] & 0x7) << 18;
-		    val |= (cur[1] & 0x3f) << 12;
-		    val |= (cur[2] & 0x3f) << 6;
-		    val |= cur[3] & 0x3f;
-		    if (val < 0x10000)
-			goto encoding_error;
-		} else {
-		  /* 3-byte code */
-		    *len = 3;
-		    val = (cur[0] & 0xf) << 12;
-		    val |= (cur[1] & 0x3f) << 6;
-		    val |= cur[2] & 0x3f;
-		    if (val < 0x800)
-			goto encoding_error;
-		}
-	    } else {
-	      /* 2-byte code */
-		*len = 2;
-		val = (cur[0] & 0x1f) << 6;
-		val |= cur[1] & 0x3f;
-		if (val < 0x80)
-		    goto encoding_error;
-	    }
-	    if (!IS_CHAR(val)) {
-	        xmlErrEncodingInt(ctxt, XML_ERR_INVALID_CHAR,
-				  "Char 0x%X out of allowed range\n", val);
-	    }
-	    return(val);
-	} else {
-	    /* 1-byte code */
-	    *len = 1;
-	    if ((*ctxt->input->cur == 0) &&
-	        (ctxt->input->end > ctxt->input->cur)) {
-	        xmlErrEncodingInt(ctxt, XML_ERR_INVALID_CHAR,
-				  "Char 0x0 out of allowed range\n", 0);
-	    }
-	    if (*ctxt->input->cur == 0xD) {
-		if (ctxt->input->cur[1] == 0xA) {
-                    *len = 2;
-		}
-		return(0xA);
-	    }
-	    return(*ctxt->input->cur);
-	}
-    }
-    /*
-     * Assume it's a fixed length encoding (1) with
-     * a compatible encoding for the ASCII set, since
-     * XML constructs only use < 128 chars
-     */
-    *len = 1;
-    if (*ctxt->input->cur == 0xD) {
-	if (ctxt->input->cur[1] == 0xA) {
+        if (c < 0xe0) {
+            /* 2-byte code */
+            if (c < 0xc2)
+                goto encoding_error;
+            val = (c & 0x1f) << 6;
+            val |= cur[1] & 0x3f;
            *len = 2;
-	}
-	return(0xA);
+        } else {
+            if (avail < 3)
+                goto incomplete_sequence;
+            if ((cur[2] & 0xc0) != 0x80)
+                goto encoding_error;
+
+            if (c < 0xf0) {
+                /* 3-byte code */
+                val = (c & 0xf) << 12;
+                val |= (cur[1] & 0x3f) << 6;
+                val |= cur[2] & 0x3f;
+                if ((val < 0x800) || ((val >= 0xd800) && (val < 0xe000)))
+                    goto encoding_error;
+                *len = 3;
+            } else {
+                if (avail < 4)
+                    goto incomplete_sequence;
+                if ((cur[3] & 0xc0) != 0x80)
+                    goto encoding_error;
+
+                /* 4-byte code */
+                val = (c & 0x0f) << 18;
+                val |= (cur[1] & 0x3f) << 12;
+                val |= (cur[2] & 0x3f) << 6;
+                val |= cur[3] & 0x3f;
+                if ((val < 0x10000) || (val >= 0x110000))
+                    goto encoding_error;
+                *len = 4;
+            }
+        }
+
+        if (!IS_CHAR(val)) {
+            xmlErrEncodingInt(ctxt, XML_ERR_INVALID_CHAR,
+                              "Char 0x%X out of allowed range\n", val);
+        }
+
+        return(val);
    }
-    return(*ctxt->input->cur);

 encoding_error:
-    /*
-     * If we detect an UTF8 error that probably mean that the
-     * input encoding didn't get properly advertised in the
-     * declaration header. Report the error and switch the encoding
-     * to ISO-Latin-1 (if you don't like this policy, just declare the
-     * encoding !)
-     */
-    if (ctxt->input->end - ctxt->input->cur < 4) {
-	__xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,
-		     "Input is not proper UTF-8, indicate encoding !\n",
-		     NULL, NULL);
-    } else {
-        char buffer[150];
+    /* Only report the first error */
+    if ((ctxt->input->flags & XML_INPUT_ENCODING_ERROR) == 0) {
+        if (ctxt->input->end - ctxt->input->cur < 4) {
+            __xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,
+                         "Input is not proper UTF-8, indicate encoding !\n",
+                         NULL, NULL);
+        } else {
+            char buffer[150];

-	snprintf(&buffer[0], 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
-			ctxt->input->cur[0], ctxt->input->cur[1],
-			ctxt->input->cur[2], ctxt->input->cur[3]);
-	__xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,
-		     "Input is not proper UTF-8, indicate encoding !\n%s",
-		     BAD_CAST buffer, NULL);
-    }
-    if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) {
-        ctxt->input->flags |= XML_INPUT_HAS_ENCODING;
-        ctxt->input->flags |= XML_INPUT_8_BIT;
+            snprintf(&buffer[0], 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
+                            ctxt->input->cur[0], ctxt->input->cur[1],
+                            ctxt->input->cur[2], ctxt->input->cur[3]);
+            __xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,
+                         "Input is not proper UTF-8, indicate encoding !\n%s",
+                         BAD_CAST buffer, NULL);
+        }
+        ctxt->input->flags |= XML_INPUT_ENCODING_ERROR;
    }
    *len = 1;
-    return(*ctxt->input->cur);
+    return(0xFFFD); /* U+FFFD Replacement Character */

 incomplete_sequence:
    /*
@@ -1271,7 +1198,6 @@ xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
    in = input->buf;

    input->flags |= XML_INPUT_HAS_ENCODING;
-    input->flags &= ~XML_INPUT_8_BIT;

    /*
     * UTF-8 requires no encoding handler.