diff --git a/HTMLparser.c b/HTMLparser.c index 1af0190c..fa1fe380 100644 --- a/HTMLparser.c +++ b/HTMLparser.c @@ -321,7 +321,6 @@ htmlNodeInfoPop(htmlParserCtxtPtr ctxt) ************/ #define CUR_CHAR(l) htmlCurrentChar(ctxt, &l) -#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l) #define COPY_BUF(l,b,i,v) \ if (l == 1) b[i++] = v; \ diff --git a/parserInternals.c b/parserInternals.c index 51a472ee..6c3fb786 100644 --- a/parserInternals.c +++ b/parserInternals.c @@ -1029,104 +1029,18 @@ incomplete_sequence: */ int -xmlStringCurrentChar(xmlParserCtxtPtr ctxt, const xmlChar * cur, int *len) -{ - if ((len == NULL) || (cur == NULL)) return(0); - if ((ctxt == NULL) || (ctxt->input == NULL) || - ((ctxt->input->flags & XML_INPUT_8_BIT) == 0)) { - /* - * We are supposed to handle UTF8, check it's valid - * From rfc2044: encoding of the Unicode values on UTF-8: - * - * UCS-4 range (hex.) UTF-8 octet sequence (binary) - * 0000 0000-0000 007F 0xxxxxxx - * 0000 0080-0000 07FF 110xxxxx 10xxxxxx - * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx - * - * Check for the 0x110000 limit too - */ - unsigned char c; - unsigned int val; +xmlStringCurrentChar(xmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED, + const xmlChar *cur, int *len) { + int c; - c = *cur; - if (c & 0x80) { - if ((cur[1] & 0xc0) != 0x80) - goto encoding_error; - if ((c & 0xe0) == 0xe0) { + if ((cur == NULL) || (len == NULL)) + return(0); - if ((cur[2] & 0xc0) != 0x80) - goto encoding_error; - if ((c & 0xf0) == 0xf0) { - if (((c & 0xf8) != 0xf0) || ((cur[3] & 0xc0) != 0x80)) - goto encoding_error; - /* 4-byte code */ - *len = 4; - val = (cur[0] & 0x7) << 18; - val |= (cur[1] & 0x3f) << 12; - val |= (cur[2] & 0x3f) << 6; - val |= cur[3] & 0x3f; - } else { - /* 3-byte code */ - *len = 3; - val = (cur[0] & 0xf) << 12; - val |= (cur[1] & 0x3f) << 6; - val |= cur[2] & 0x3f; - } - } else { - /* 2-byte code */ - *len = 2; - val = (cur[0] & 0x1f) << 6; - val |= cur[1] & 0x3f; - } - if (!IS_CHAR(val)) { - xmlErrEncodingInt(ctxt, XML_ERR_INVALID_CHAR, - "Char 0x%X out of allowed range\n", val); - } - return (val); - } else { - /* 1-byte code */ - *len = 1; - return (*cur); - } - } - /* - * Assume it's a fixed length encoding (1) with - * a compatible encoding for the ASCII set, since - * XML constructs only use < 128 chars - */ - *len = 1; - return (*cur); -encoding_error: + /* cur is zero-terminated, so we can lie about its length. */ + *len = 4; + c = xmlGetUTF8Char(cur, len); - /* - * An encoding problem may arise from a truncated input buffer - * splitting a character in the middle. In that case do not raise - * an error but return 0 to indicate an end of stream problem - */ - if ((ctxt == NULL) || (ctxt->input == NULL) || - (ctxt->input->end - ctxt->input->cur < 4)) { - *len = 0; - return(0); - } - /* - * If we detect an UTF8 error that probably mean that the - * input encoding didn't get properly advertised in the - * declaration header. Report the error and switch the encoding - * to ISO-Latin-1 (if you don't like this policy, just declare the - * encoding !) - */ - { - char buffer[150]; - - snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n", - ctxt->input->cur[0], ctxt->input->cur[1], - ctxt->input->cur[2], ctxt->input->cur[3]); - __xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR, - "Input is not proper UTF-8, indicate encoding !\n%s", - BAD_CAST buffer, NULL); - } - *len = 1; - return (*cur); + return((c < 0) ? 0 : c); } /** diff --git a/xinclude.c b/xinclude.c index 8eaf4d85..9c144a1e 100644 --- a/xinclude.c +++ b/xinclude.c @@ -1715,8 +1715,9 @@ xmlXIncludeLoadTxt(xmlXIncludeCtxtPtr ctxt, const xmlChar *url, int cur; int l; - cur = xmlStringCurrentChar(NULL, &content[i], &l); - if (!IS_CHAR(cur)) { + l = len - i; + cur = xmlGetUTF8Char(&content[i], &l); + if ((cur < 0) || (!IS_CHAR(cur))) { xmlXIncludeErr(ctxt, ref->elem, XML_XINCLUDE_INVALID_CHAR, "%s contains invalid char\n", URL); goto error;