mirror of
https://gitlab.gnome.org/GNOME/libxml2.git
synced 2025-03-31 06:50:06 +03:00
parser: Stop switching to ISO-8859-1 on encoding errors
Use U+FFFD Replacement Character if invalid UTF-8 is encountered in recovery mode. Also rewrite xmlNextChar and xmlCurrentChar. Fixes #598.
This commit is contained in:
parent
a9ada18352
commit
c082ef4644
@ -24,7 +24,7 @@
|
||||
#define XML_INPUT_AUTO_UTF16BE (3u << 1)
|
||||
#define XML_INPUT_AUTO_OTHER (4u << 1)
|
||||
#define XML_INPUT_USES_ENC_DECL (1u << 4)
|
||||
#define XML_INPUT_8_BIT (1u << 5)
|
||||
#define XML_INPUT_ENCODING_ERROR (1u << 5)
|
||||
|
||||
XML_HIDDEN void
|
||||
xmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra);
|
||||
|
38
parser.c
38
parser.c
@ -2292,8 +2292,8 @@ static int spacePop(xmlParserCtxtPtr ctxt) {
|
||||
#define CUR_CHAR(l) xmlCurrentChar(ctxt, &l)
|
||||
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)
|
||||
|
||||
#define COPY_BUF(l,b,i,v) \
|
||||
if (l == 1) b[i++] = v; \
|
||||
#define COPY_BUF(b, i, v) \
|
||||
if (v < 0x80) b[i++] = v; \
|
||||
else i += xmlCopyCharMultiByte(&b[i],v)
|
||||
|
||||
/**
|
||||
@ -2843,7 +2843,7 @@ xmlStringDecodeEntitiesInt(xmlParserCtxtPtr ctxt, const xmlChar *str, int len,
|
||||
int val = xmlParseStringCharRef(ctxt, &str);
|
||||
if (val == 0)
|
||||
goto int_error;
|
||||
COPY_BUF(0,buffer,nbchars,val);
|
||||
COPY_BUF(buffer, nbchars, val);
|
||||
if (nbchars + XML_PARSER_BUFFER_SIZE > buffer_size) {
|
||||
growBuffer(buffer, XML_PARSER_BUFFER_SIZE);
|
||||
}
|
||||
@ -2856,7 +2856,7 @@ xmlStringDecodeEntitiesInt(xmlParserCtxtPtr ctxt, const xmlChar *str, int len,
|
||||
if ((ent != NULL) &&
|
||||
(ent->etype == XML_INTERNAL_PREDEFINED_ENTITY)) {
|
||||
if (ent->content != NULL) {
|
||||
COPY_BUF(0,buffer,nbchars,ent->content[0]);
|
||||
COPY_BUF(buffer, nbchars, ent->content[0]);
|
||||
if (nbchars + XML_PARSER_BUFFER_SIZE > buffer_size) {
|
||||
growBuffer(buffer, XML_PARSER_BUFFER_SIZE);
|
||||
}
|
||||
@ -2967,7 +2967,7 @@ xmlStringDecodeEntitiesInt(xmlParserCtxtPtr ctxt, const xmlChar *str, int len,
|
||||
rep = NULL;
|
||||
}
|
||||
} else {
|
||||
COPY_BUF(l,buffer,nbchars,c);
|
||||
COPY_BUF(buffer, nbchars, c);
|
||||
str += l;
|
||||
if (nbchars + XML_PARSER_BUFFER_SIZE > buffer_size) {
|
||||
growBuffer(buffer, XML_PARSER_BUFFER_SIZE);
|
||||
@ -3742,11 +3742,11 @@ xmlParseStringName(xmlParserCtxtPtr ctxt, const xmlChar** str) {
|
||||
return(NULL);
|
||||
}
|
||||
|
||||
COPY_BUF(l,buf,len,c);
|
||||
COPY_BUF(buf, len, c);
|
||||
cur += l;
|
||||
c = CUR_SCHAR(cur, l);
|
||||
while (xmlIsNameChar(ctxt, c)) {
|
||||
COPY_BUF(l,buf,len,c);
|
||||
COPY_BUF(buf, len, c);
|
||||
cur += l;
|
||||
c = CUR_SCHAR(cur, l);
|
||||
if (len >= XML_MAX_NAMELEN) { /* test bigentname.xml */
|
||||
@ -3776,7 +3776,7 @@ xmlParseStringName(xmlParserCtxtPtr ctxt, const xmlChar** str) {
|
||||
}
|
||||
buffer = tmp;
|
||||
}
|
||||
COPY_BUF(l,buffer,len,c);
|
||||
COPY_BUF(buffer, len, c);
|
||||
cur += l;
|
||||
c = CUR_SCHAR(cur, l);
|
||||
if (len > maxLength) {
|
||||
@ -3825,7 +3825,7 @@ xmlParseNmtoken(xmlParserCtxtPtr ctxt) {
|
||||
c = CUR_CHAR(l);
|
||||
|
||||
while (xmlIsNameChar(ctxt, c)) {
|
||||
COPY_BUF(l,buf,len,c);
|
||||
COPY_BUF(buf, len, c);
|
||||
NEXTL(l);
|
||||
c = CUR_CHAR(l);
|
||||
if (len >= XML_MAX_NAMELEN) {
|
||||
@ -3855,7 +3855,7 @@ xmlParseNmtoken(xmlParserCtxtPtr ctxt) {
|
||||
}
|
||||
buffer = tmp;
|
||||
}
|
||||
COPY_BUF(l,buffer,len,c);
|
||||
COPY_BUF(buffer, len, c);
|
||||
if (len > maxLength) {
|
||||
xmlFatalErr(ctxt, XML_ERR_NAME_TOO_LONG, "NmToken");
|
||||
xmlFree(buffer);
|
||||
@ -3957,7 +3957,7 @@ xmlParseEntityValue(xmlParserCtxtPtr ctxt, xmlChar **orig) {
|
||||
}
|
||||
buf = tmp;
|
||||
}
|
||||
COPY_BUF(l,buf,len,c);
|
||||
COPY_BUF(buf, len, c);
|
||||
NEXTL(l);
|
||||
|
||||
GROW;
|
||||
@ -4241,7 +4241,7 @@ xmlParseAttValueComplex(xmlParserCtxtPtr ctxt, int *attlen, int normalize) {
|
||||
if ((c == 0x20) || (c == 0xD) || (c == 0xA) || (c == 0x9)) {
|
||||
if ((len != 0) || (!normalize)) {
|
||||
if ((!normalize) || (!in_space)) {
|
||||
COPY_BUF(l,buf,len,0x20);
|
||||
COPY_BUF(buf, len, 0x20);
|
||||
while (len + 10 > buf_size) {
|
||||
growBuffer(buf, 10);
|
||||
}
|
||||
@ -4250,7 +4250,7 @@ xmlParseAttValueComplex(xmlParserCtxtPtr ctxt, int *attlen, int normalize) {
|
||||
}
|
||||
} else {
|
||||
in_space = 0;
|
||||
COPY_BUF(l,buf,len,c);
|
||||
COPY_BUF(buf, len, c);
|
||||
if (len + 10 > buf_size) {
|
||||
growBuffer(buf, 10);
|
||||
}
|
||||
@ -4397,7 +4397,7 @@ xmlParseSystemLiteral(xmlParserCtxtPtr ctxt) {
|
||||
}
|
||||
buf = tmp;
|
||||
}
|
||||
COPY_BUF(l,buf,len,cur);
|
||||
COPY_BUF(buf, len, cur);
|
||||
if (len > maxLength) {
|
||||
xmlFatalErr(ctxt, XML_ERR_NAME_TOO_LONG, "SystemLiteral");
|
||||
xmlFree(buf);
|
||||
@ -4721,7 +4721,7 @@ xmlParseCharDataComplex(xmlParserCtxtPtr ctxt, int partial) {
|
||||
if ((cur == ']') && (NXT(1) == ']') && (NXT(2) == '>')) {
|
||||
xmlFatalErr(ctxt, XML_ERR_MISPLACED_CDATA_END, NULL);
|
||||
}
|
||||
COPY_BUF(l,buf,nbchar,cur);
|
||||
COPY_BUF(buf, nbchar, cur);
|
||||
/* move current position before possible calling of ctxt->sax->characters */
|
||||
NEXTL(l);
|
||||
if (nbchar >= XML_PARSER_BIG_BUFFER_SIZE) {
|
||||
@ -4964,7 +4964,7 @@ xmlParseCommentComplex(xmlParserCtxtPtr ctxt, xmlChar *buf,
|
||||
buf = new_buf;
|
||||
size = new_size;
|
||||
}
|
||||
COPY_BUF(ql,buf,len,q);
|
||||
COPY_BUF(buf, len, q);
|
||||
if (len > maxLength) {
|
||||
xmlFatalErrMsgStr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
|
||||
"Comment too big found", NULL);
|
||||
@ -5386,7 +5386,7 @@ xmlParsePI(xmlParserCtxtPtr ctxt) {
|
||||
buf = tmp;
|
||||
size = new_size;
|
||||
}
|
||||
COPY_BUF(l,buf,len,cur);
|
||||
COPY_BUF(buf, len, cur);
|
||||
if (len > maxLength) {
|
||||
xmlFatalErrMsgStr(ctxt, XML_ERR_PI_NOT_FINISHED,
|
||||
"PI %s too big found", target);
|
||||
@ -7246,7 +7246,7 @@ xmlParseReference(xmlParserCtxtPtr ctxt) {
|
||||
/*
|
||||
* Just encode the value in UTF-8
|
||||
*/
|
||||
COPY_BUF(0, out, i, value);
|
||||
COPY_BUF(out, i, value);
|
||||
out[i] = 0;
|
||||
if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL) &&
|
||||
(!ctxt->disableSAX))
|
||||
@ -10218,7 +10218,7 @@ xmlParseCDSect(xmlParserCtxtPtr ctxt) {
|
||||
buf = tmp;
|
||||
size *= 2;
|
||||
}
|
||||
COPY_BUF(rl,buf,len,r);
|
||||
COPY_BUF(buf, len, r);
|
||||
if (len > maxLength) {
|
||||
xmlFatalErrMsg(ctxt, XML_ERR_CDATA_NOT_FINISHED,
|
||||
"CData section too big found\n");
|
||||
|
@ -38,7 +38,6 @@
|
||||
|
||||
#define CUR(ctxt) ctxt->input->cur
|
||||
#define END(ctxt) ctxt->input->end
|
||||
#define VALID_CTXT(ctxt) (CUR(ctxt) <= END(ctxt))
|
||||
|
||||
#include "private/buf.h"
|
||||
#include "private/enc.h"
|
||||
@ -697,154 +696,102 @@ xmlParserInputShrink(xmlParserInputPtr in) {
|
||||
void
|
||||
xmlNextChar(xmlParserCtxtPtr ctxt)
|
||||
{
|
||||
const unsigned char *cur;
|
||||
size_t avail;
|
||||
int c;
|
||||
|
||||
if ((ctxt == NULL) || (ctxt->instate == XML_PARSER_EOF) ||
|
||||
(ctxt->input == NULL))
|
||||
return;
|
||||
|
||||
if (!(VALID_CTXT(ctxt))) {
|
||||
xmlErrInternal(ctxt, "Parser input data memory error\n", NULL);
|
||||
ctxt->errNo = XML_ERR_INTERNAL_ERROR;
|
||||
xmlStopParser(ctxt);
|
||||
return;
|
||||
}
|
||||
avail = ctxt->input->end - ctxt->input->cur;
|
||||
|
||||
if (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK) {
|
||||
if (avail < INPUT_CHUNK) {
|
||||
xmlParserGrow(ctxt);
|
||||
if ((ctxt->instate == XML_PARSER_EOF) ||
|
||||
(ctxt->input->cur >= ctxt->input->end))
|
||||
return;
|
||||
avail = ctxt->input->end - ctxt->input->cur;
|
||||
}
|
||||
|
||||
if ((ctxt->input->flags & XML_INPUT_8_BIT) == 0) {
|
||||
const unsigned char *cur;
|
||||
unsigned char c;
|
||||
cur = ctxt->input->cur;
|
||||
c = *cur;
|
||||
|
||||
cur = ctxt->input->cur;
|
||||
|
||||
/*
|
||||
* 2.11 End-of-Line Handling
|
||||
* the literal two-character sequence "#xD#xA" or a standalone
|
||||
* literal #xD, an XML processor must pass to the application
|
||||
* the single character #xA.
|
||||
*/
|
||||
if ((*cur == '\n') || (*cur == '\r')) {
|
||||
ctxt->input->line++;
|
||||
ctxt->input->col = 1;
|
||||
if ((*cur == '\r') && (cur[1] == '\n')) {
|
||||
ctxt->input->cur++;
|
||||
cur++;
|
||||
}
|
||||
} else {
|
||||
ctxt->input->col++;
|
||||
}
|
||||
|
||||
/*
|
||||
* We are supposed to handle UTF8, check it's valid
|
||||
* From rfc2044: encoding of the Unicode values on UTF-8:
|
||||
*
|
||||
* UCS-4 range (hex.) UTF-8 octet sequence (binary)
|
||||
* 0000 0000-0000 007F 0xxxxxxx
|
||||
* 0000 0080-0000 07FF 110xxxxx 10xxxxxx
|
||||
* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
|
||||
*
|
||||
* Check for the 0x110000 limit too
|
||||
*/
|
||||
|
||||
c = *cur;
|
||||
if (c & 0x80) {
|
||||
size_t avail;
|
||||
|
||||
if (c == 0xC0)
|
||||
goto encoding_error;
|
||||
|
||||
avail = ctxt->input->end - ctxt->input->cur;
|
||||
|
||||
if ((avail < 2) || (cur[1] & 0xc0) != 0x80)
|
||||
goto encoding_error;
|
||||
if ((c & 0xe0) == 0xe0) {
|
||||
unsigned int val;
|
||||
|
||||
if ((avail < 3) || (cur[2] & 0xc0) != 0x80)
|
||||
goto encoding_error;
|
||||
if ((c & 0xf0) == 0xf0) {
|
||||
if (((c & 0xf8) != 0xf0) ||
|
||||
(avail < 4) || ((cur[3] & 0xc0) != 0x80))
|
||||
goto encoding_error;
|
||||
/* 4-byte code */
|
||||
ctxt->input->cur += 4;
|
||||
val = (cur[0] & 0x7) << 18;
|
||||
val |= (cur[1] & 0x3f) << 12;
|
||||
val |= (cur[2] & 0x3f) << 6;
|
||||
val |= cur[3] & 0x3f;
|
||||
} else {
|
||||
/* 3-byte code */
|
||||
ctxt->input->cur += 3;
|
||||
val = (cur[0] & 0xf) << 12;
|
||||
val |= (cur[1] & 0x3f) << 6;
|
||||
val |= cur[2] & 0x3f;
|
||||
}
|
||||
if (((val > 0xd7ff) && (val < 0xe000)) ||
|
||||
((val > 0xfffd) && (val < 0x10000)) ||
|
||||
(val >= 0x110000)) {
|
||||
xmlErrEncodingInt(ctxt, XML_ERR_INVALID_CHAR,
|
||||
"Char 0x%X out of allowed range\n",
|
||||
val);
|
||||
}
|
||||
} else
|
||||
/* 2-byte code */
|
||||
ctxt->input->cur += 2;
|
||||
} else
|
||||
/* 1-byte code */
|
||||
if (c < 0x80) {
|
||||
if (c == '\n') {
|
||||
ctxt->input->cur++;
|
||||
} else {
|
||||
const unsigned char *cur;
|
||||
|
||||
/*
|
||||
* Assume it's a fixed length encoding (1) with
|
||||
* a compatible encoding for the ASCII set, since
|
||||
* XML constructs only use < 128 chars
|
||||
*/
|
||||
|
||||
cur = ctxt->input->cur;
|
||||
|
||||
if ((*cur == '\n') || (*cur == '\r')) {
|
||||
ctxt->input->line++;
|
||||
ctxt->input->col = 1;
|
||||
if ((*cur == '\r') && (cur[1] == '\n')) {
|
||||
ctxt->input->cur++;
|
||||
}
|
||||
} else if (c == '\r') {
|
||||
/*
|
||||
* 2.11 End-of-Line Handling
|
||||
* the literal two-character sequence "#xD#xA" or a standalone
|
||||
* literal #xD, an XML processor must pass to the application
|
||||
* the single character #xA.
|
||||
*/
|
||||
ctxt->input->cur += ((cur[1] == '\n') ? 2 : 1);
|
||||
ctxt->input->line++;
|
||||
ctxt->input->col = 1;
|
||||
return;
|
||||
} else {
|
||||
ctxt->input->cur++;
|
||||
ctxt->input->col++;
|
||||
}
|
||||
ctxt->input->cur++;
|
||||
}
|
||||
return;
|
||||
encoding_error:
|
||||
/*
|
||||
* If we detect an UTF8 error that probably mean that the
|
||||
* input encoding didn't get properly advertised in the
|
||||
* declaration header. Report the error and switch the encoding
|
||||
* to ISO-Latin-1 (if you don't like this policy, just declare the
|
||||
* encoding !)
|
||||
*/
|
||||
if ((ctxt == NULL) || (ctxt->input == NULL) ||
|
||||
(ctxt->input->end - ctxt->input->cur < 4)) {
|
||||
__xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,
|
||||
"Input is not proper UTF-8, indicate encoding !\n",
|
||||
NULL, NULL);
|
||||
} else {
|
||||
char buffer[150];
|
||||
ctxt->input->col++;
|
||||
|
||||
snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
|
||||
ctxt->input->cur[0], ctxt->input->cur[1],
|
||||
ctxt->input->cur[2], ctxt->input->cur[3]);
|
||||
__xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,
|
||||
"Input is not proper UTF-8, indicate encoding !\n%s",
|
||||
BAD_CAST buffer, NULL);
|
||||
if ((avail < 2) || (cur[1] & 0xc0) != 0x80)
|
||||
goto encoding_error;
|
||||
|
||||
if (c < 0xe0) {
|
||||
/* 2-byte code */
|
||||
if (c < 0xc2)
|
||||
goto encoding_error;
|
||||
ctxt->input->cur += 2;
|
||||
} else {
|
||||
unsigned int val = (c << 8) | cur[1];
|
||||
|
||||
if ((avail < 3) || (cur[2] & 0xc0) != 0x80)
|
||||
goto encoding_error;
|
||||
|
||||
if (c < 0xf0) {
|
||||
/* 3-byte code */
|
||||
if ((val < 0xe0a0) || ((val >= 0xeda0) && (val < 0xee00)))
|
||||
goto encoding_error;
|
||||
ctxt->input->cur += 3;
|
||||
} else {
|
||||
if ((avail < 4) || ((cur[3] & 0xc0) != 0x80))
|
||||
goto encoding_error;
|
||||
|
||||
/* 4-byte code */
|
||||
if ((val < 0xf090) || (val >= 0xf490))
|
||||
goto encoding_error;
|
||||
ctxt->input->cur += 4;
|
||||
}
|
||||
}
|
||||
}
|
||||
if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) {
|
||||
ctxt->input->flags |= XML_INPUT_HAS_ENCODING;
|
||||
ctxt->input->flags |= XML_INPUT_8_BIT;
|
||||
|
||||
return;
|
||||
|
||||
encoding_error:
|
||||
/* Only report the first error */
|
||||
if ((ctxt->input->flags & XML_INPUT_ENCODING_ERROR) == 0) {
|
||||
if ((ctxt == NULL) || (ctxt->input == NULL) ||
|
||||
(ctxt->input->end - ctxt->input->cur < 4)) {
|
||||
__xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,
|
||||
"Input is not proper UTF-8, indicate encoding !\n",
|
||||
NULL, NULL);
|
||||
} else {
|
||||
char buffer[150];
|
||||
|
||||
snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
|
||||
ctxt->input->cur[0], ctxt->input->cur[1],
|
||||
ctxt->input->cur[2], ctxt->input->cur[3]);
|
||||
__xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,
|
||||
"Input is not proper UTF-8, indicate encoding !\n%s",
|
||||
BAD_CAST buffer, NULL);
|
||||
}
|
||||
ctxt->input->flags |= XML_INPUT_ENCODING_ERROR;
|
||||
}
|
||||
ctxt->input->cur++;
|
||||
return;
|
||||
@ -872,149 +819,129 @@ encoding_error:
|
||||
|
||||
int
|
||||
xmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
|
||||
const unsigned char *cur;
|
||||
size_t avail;
|
||||
int c;
|
||||
|
||||
if ((ctxt == NULL) || (len == NULL) || (ctxt->input == NULL)) return(0);
|
||||
if (ctxt->instate == XML_PARSER_EOF)
|
||||
return(0);
|
||||
|
||||
if (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK) {
|
||||
avail = ctxt->input->end - ctxt->input->cur;
|
||||
|
||||
if (avail < INPUT_CHUNK) {
|
||||
xmlParserGrow(ctxt);
|
||||
if (ctxt->instate == XML_PARSER_EOF)
|
||||
return(0);
|
||||
avail = ctxt->input->end - ctxt->input->cur;
|
||||
}
|
||||
|
||||
if ((*ctxt->input->cur >= 0x20) && (*ctxt->input->cur <= 0x7F)) {
|
||||
*len = 1;
|
||||
return(*ctxt->input->cur);
|
||||
}
|
||||
if ((ctxt->input->flags & XML_INPUT_8_BIT) == 0) {
|
||||
/*
|
||||
* We are supposed to handle UTF8, check it's valid
|
||||
* From rfc2044: encoding of the Unicode values on UTF-8:
|
||||
*
|
||||
* UCS-4 range (hex.) UTF-8 octet sequence (binary)
|
||||
* 0000 0000-0000 007F 0xxxxxxx
|
||||
* 0000 0080-0000 07FF 110xxxxx 10xxxxxx
|
||||
* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
|
||||
*
|
||||
* Check for the 0x110000 limit too
|
||||
*/
|
||||
const unsigned char *cur = ctxt->input->cur;
|
||||
unsigned char c;
|
||||
unsigned int val;
|
||||
cur = ctxt->input->cur;
|
||||
c = *cur;
|
||||
|
||||
c = *cur;
|
||||
if (c & 0x80) {
|
||||
size_t avail;
|
||||
if (c < 0x80) {
|
||||
/* 1-byte code */
|
||||
if (c < 0x20) {
|
||||
/*
|
||||
* 2.11 End-of-Line Handling
|
||||
* the literal two-character sequence "#xD#xA" or a standalone
|
||||
* literal #xD, an XML processor must pass to the application
|
||||
* the single character #xA.
|
||||
*/
|
||||
if (c == '\r') {
|
||||
*len = ((cur[1] == '\n') ? 2 : 1);
|
||||
c = '\n';
|
||||
} else if (c == 0) {
|
||||
if (ctxt->input->cur >= ctxt->input->end) {
|
||||
*len = 0;
|
||||
} else {
|
||||
*len = 1;
|
||||
xmlErrEncodingInt(ctxt, XML_ERR_INVALID_CHAR,
|
||||
"Char 0x0 out of allowed range\n", c);
|
||||
}
|
||||
} else {
|
||||
*len = 1;
|
||||
}
|
||||
} else {
|
||||
*len = 1;
|
||||
}
|
||||
|
||||
if (((c & 0x40) == 0) || (c == 0xC0))
|
||||
goto encoding_error;
|
||||
return(c);
|
||||
} else {
|
||||
int val;
|
||||
|
||||
avail = ctxt->input->end - ctxt->input->cur;
|
||||
if (avail < 2)
|
||||
goto incomplete_sequence;
|
||||
if ((cur[1] & 0xc0) != 0x80)
|
||||
goto encoding_error;
|
||||
|
||||
if (avail < 2)
|
||||
goto incomplete_sequence;
|
||||
if ((cur[1] & 0xc0) != 0x80)
|
||||
goto encoding_error;
|
||||
if ((c & 0xe0) == 0xe0) {
|
||||
if (avail < 3)
|
||||
goto incomplete_sequence;
|
||||
if ((cur[2] & 0xc0) != 0x80)
|
||||
goto encoding_error;
|
||||
if ((c & 0xf0) == 0xf0) {
|
||||
if (avail < 4)
|
||||
goto incomplete_sequence;
|
||||
if (((c & 0xf8) != 0xf0) ||
|
||||
((cur[3] & 0xc0) != 0x80))
|
||||
goto encoding_error;
|
||||
/* 4-byte code */
|
||||
*len = 4;
|
||||
val = (cur[0] & 0x7) << 18;
|
||||
val |= (cur[1] & 0x3f) << 12;
|
||||
val |= (cur[2] & 0x3f) << 6;
|
||||
val |= cur[3] & 0x3f;
|
||||
if (val < 0x10000)
|
||||
goto encoding_error;
|
||||
} else {
|
||||
/* 3-byte code */
|
||||
*len = 3;
|
||||
val = (cur[0] & 0xf) << 12;
|
||||
val |= (cur[1] & 0x3f) << 6;
|
||||
val |= cur[2] & 0x3f;
|
||||
if (val < 0x800)
|
||||
goto encoding_error;
|
||||
}
|
||||
} else {
|
||||
/* 2-byte code */
|
||||
*len = 2;
|
||||
val = (cur[0] & 0x1f) << 6;
|
||||
val |= cur[1] & 0x3f;
|
||||
if (val < 0x80)
|
||||
goto encoding_error;
|
||||
}
|
||||
if (!IS_CHAR(val)) {
|
||||
xmlErrEncodingInt(ctxt, XML_ERR_INVALID_CHAR,
|
||||
"Char 0x%X out of allowed range\n", val);
|
||||
}
|
||||
return(val);
|
||||
} else {
|
||||
/* 1-byte code */
|
||||
*len = 1;
|
||||
if ((*ctxt->input->cur == 0) &&
|
||||
(ctxt->input->end > ctxt->input->cur)) {
|
||||
xmlErrEncodingInt(ctxt, XML_ERR_INVALID_CHAR,
|
||||
"Char 0x0 out of allowed range\n", 0);
|
||||
}
|
||||
if (*ctxt->input->cur == 0xD) {
|
||||
if (ctxt->input->cur[1] == 0xA) {
|
||||
*len = 2;
|
||||
}
|
||||
return(0xA);
|
||||
}
|
||||
return(*ctxt->input->cur);
|
||||
}
|
||||
}
|
||||
/*
|
||||
* Assume it's a fixed length encoding (1) with
|
||||
* a compatible encoding for the ASCII set, since
|
||||
* XML constructs only use < 128 chars
|
||||
*/
|
||||
*len = 1;
|
||||
if (*ctxt->input->cur == 0xD) {
|
||||
if (ctxt->input->cur[1] == 0xA) {
|
||||
if (c < 0xe0) {
|
||||
/* 2-byte code */
|
||||
if (c < 0xc2)
|
||||
goto encoding_error;
|
||||
val = (c & 0x1f) << 6;
|
||||
val |= cur[1] & 0x3f;
|
||||
*len = 2;
|
||||
}
|
||||
return(0xA);
|
||||
} else {
|
||||
if (avail < 3)
|
||||
goto incomplete_sequence;
|
||||
if ((cur[2] & 0xc0) != 0x80)
|
||||
goto encoding_error;
|
||||
|
||||
if (c < 0xf0) {
|
||||
/* 3-byte code */
|
||||
val = (c & 0xf) << 12;
|
||||
val |= (cur[1] & 0x3f) << 6;
|
||||
val |= cur[2] & 0x3f;
|
||||
if ((val < 0x800) || ((val >= 0xd800) && (val < 0xe000)))
|
||||
goto encoding_error;
|
||||
*len = 3;
|
||||
} else {
|
||||
if (avail < 4)
|
||||
goto incomplete_sequence;
|
||||
if ((cur[3] & 0xc0) != 0x80)
|
||||
goto encoding_error;
|
||||
|
||||
/* 4-byte code */
|
||||
val = (c & 0x0f) << 18;
|
||||
val |= (cur[1] & 0x3f) << 12;
|
||||
val |= (cur[2] & 0x3f) << 6;
|
||||
val |= cur[3] & 0x3f;
|
||||
if ((val < 0x10000) || (val >= 0x110000))
|
||||
goto encoding_error;
|
||||
*len = 4;
|
||||
}
|
||||
}
|
||||
|
||||
if (!IS_CHAR(val)) {
|
||||
xmlErrEncodingInt(ctxt, XML_ERR_INVALID_CHAR,
|
||||
"Char 0x%X out of allowed range\n", val);
|
||||
}
|
||||
|
||||
return(val);
|
||||
}
|
||||
return(*ctxt->input->cur);
|
||||
|
||||
encoding_error:
|
||||
/*
|
||||
* If we detect an UTF8 error that probably mean that the
|
||||
* input encoding didn't get properly advertised in the
|
||||
* declaration header. Report the error and switch the encoding
|
||||
* to ISO-Latin-1 (if you don't like this policy, just declare the
|
||||
* encoding !)
|
||||
*/
|
||||
if (ctxt->input->end - ctxt->input->cur < 4) {
|
||||
__xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,
|
||||
"Input is not proper UTF-8, indicate encoding !\n",
|
||||
NULL, NULL);
|
||||
} else {
|
||||
char buffer[150];
|
||||
/* Only report the first error */
|
||||
if ((ctxt->input->flags & XML_INPUT_ENCODING_ERROR) == 0) {
|
||||
if (ctxt->input->end - ctxt->input->cur < 4) {
|
||||
__xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,
|
||||
"Input is not proper UTF-8, indicate encoding !\n",
|
||||
NULL, NULL);
|
||||
} else {
|
||||
char buffer[150];
|
||||
|
||||
snprintf(&buffer[0], 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
|
||||
ctxt->input->cur[0], ctxt->input->cur[1],
|
||||
ctxt->input->cur[2], ctxt->input->cur[3]);
|
||||
__xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,
|
||||
"Input is not proper UTF-8, indicate encoding !\n%s",
|
||||
BAD_CAST buffer, NULL);
|
||||
}
|
||||
if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) {
|
||||
ctxt->input->flags |= XML_INPUT_HAS_ENCODING;
|
||||
ctxt->input->flags |= XML_INPUT_8_BIT;
|
||||
snprintf(&buffer[0], 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
|
||||
ctxt->input->cur[0], ctxt->input->cur[1],
|
||||
ctxt->input->cur[2], ctxt->input->cur[3]);
|
||||
__xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,
|
||||
"Input is not proper UTF-8, indicate encoding !\n%s",
|
||||
BAD_CAST buffer, NULL);
|
||||
}
|
||||
ctxt->input->flags |= XML_INPUT_ENCODING_ERROR;
|
||||
}
|
||||
*len = 1;
|
||||
return(*ctxt->input->cur);
|
||||
return(0xFFFD); /* U+FFFD Replacement Character */
|
||||
|
||||
incomplete_sequence:
|
||||
/*
|
||||
@ -1271,7 +1198,6 @@ xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
|
||||
in = input->buf;
|
||||
|
||||
input->flags |= XML_INPUT_HAS_ENCODING;
|
||||
input->flags &= ~XML_INPUT_8_BIT;
|
||||
|
||||
/*
|
||||
* UTF-8 requires no encoding handler.
|
||||
|
Loading…
x
Reference in New Issue
Block a user