mirror of
https://gitlab.gnome.org/GNOME/libxml2.git
synced 2024-10-26 12:25:09 +03:00
html: Parse numeric character references according to HTML5
This commit is contained in:
parent
4eeac30944
commit
a6955c13c7
115
HTMLparser.c
115
HTMLparser.c
@ -3443,6 +3443,13 @@ done:
|
||||
return;
|
||||
}
|
||||
|
||||
/*
 * Replacement code points for numeric character references whose value
 * lies in the range 0x80..0x9F. htmlParseCharRef indexes this table with
 * (val - 0x80), so entry 0 replaces 0x80 and entry 31 replaces 0x9F.
 * Entries equal to their own position plus 0x80 (0x81, 0x8D, 0x8F, 0x90,
 * 0x9D) leave the value unchanged.
 * NOTE(review): the values look like the replacement table of the HTML5
 * "numeric character reference end state" -- confirm against the spec.
 */
static const short htmlC1Remap[32] = {
|
||||
0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
|
||||
0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F,
|
||||
0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
|
||||
0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178
|
||||
};
|
||||
|
||||
/**
|
||||
* htmlParseCharRef:
|
||||
* @ctxt: an HTML parser context
|
||||
@ -3462,63 +3469,57 @@ htmlParseCharRef(htmlParserCtxtPtr ctxt) {
|
||||
|
||||
if ((ctxt == NULL) || (ctxt->input == NULL))
|
||||
return(0);
|
||||
|
||||
if ((CUR == '&') && (NXT(1) == '#') &&
|
||||
((NXT(2) == 'x') || NXT(2) == 'X')) {
|
||||
SKIP(3);
|
||||
while (CUR != ';') {
|
||||
if ((CUR >= '0') && (CUR <= '9')) {
|
||||
if (val < 0x110000)
|
||||
val = val * 16 + (CUR - '0');
|
||||
} else if ((CUR >= 'a') && (CUR <= 'f')) {
|
||||
if (val < 0x110000)
|
||||
val = val * 16 + (CUR - 'a') + 10;
|
||||
} else if ((CUR >= 'A') && (CUR <= 'F')) {
|
||||
if (val < 0x110000)
|
||||
val = val * 16 + (CUR - 'A') + 10;
|
||||
while (1) {
|
||||
int c = CUR;
|
||||
|
||||
if ((c >= '0') && (c <= '9')) {
|
||||
c -= '0';
|
||||
} else if ((c >= 'a') && (c <= 'f')) {
|
||||
c = (c - 'a') + 10;
|
||||
} else if ((c >= 'A') && (c <= 'F')) {
|
||||
c = (c - 'A') + 10;
|
||||
} else {
|
||||
htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
|
||||
"htmlParseCharRef: missing semicolon\n",
|
||||
NULL, NULL);
|
||||
break;
|
||||
}
|
||||
val = val * 16 + c;
|
||||
if (val >= 0x110000)
|
||||
val = 0x110000;
|
||||
NEXT;
|
||||
}
|
||||
if (CUR == ';')
|
||||
SKIP(1);
|
||||
} else if ((CUR == '&') && (NXT(1) == '#')) {
|
||||
SKIP(2);
|
||||
while (CUR != ';') {
|
||||
if ((CUR >= '0') && (CUR <= '9')) {
|
||||
if (val < 0x110000)
|
||||
val = val * 10 + (CUR - '0');
|
||||
} else {
|
||||
htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
|
||||
"htmlParseCharRef: missing semicolon\n",
|
||||
NULL, NULL);
|
||||
while (1) {
|
||||
int c = CUR;
|
||||
|
||||
if ((c < '0') || (c > '9'))
|
||||
break;
|
||||
}
|
||||
val = val * 10 + (c - '0');
|
||||
if (val >= 0x110000)
|
||||
val = 0x110000;
|
||||
NEXT;
|
||||
}
|
||||
if (CUR == ';')
|
||||
SKIP(1);
|
||||
} else {
|
||||
htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
|
||||
"htmlParseCharRef: invalid value\n", NULL, NULL);
|
||||
}
|
||||
|
||||
/*
|
||||
* Check the value IS_CHAR ...
|
||||
* Remap C1 control characters
|
||||
*/
|
||||
if (IS_CHAR(val)) {
|
||||
return(val);
|
||||
} else if (val >= 0x110000) {
|
||||
htmlParseErr(ctxt, XML_ERR_INVALID_CHAR,
|
||||
"htmlParseCharRef: value too large\n", NULL, NULL);
|
||||
} else {
|
||||
htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
|
||||
"htmlParseCharRef: invalid xmlChar value %d\n",
|
||||
val);
|
||||
if ((val >= 0x80) && (val < 0xA0)) {
|
||||
val = htmlC1Remap[val - 0x80];
|
||||
} else if ((val <= 0) ||
|
||||
((val >= 0xD800) && (val < 0xE000)) ||
|
||||
(val > 0x10FFFF)) {
|
||||
val = 0xFFFD;
|
||||
}
|
||||
return(0);
|
||||
|
||||
return(val);
|
||||
}
|
||||
|
||||
|
||||
@ -4070,10 +4071,15 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt)
|
||||
*/
|
||||
static void
|
||||
htmlParseReference(htmlParserCtxtPtr ctxt) {
|
||||
const xmlChar *repl = NULL;
|
||||
int replLen = 0;
|
||||
xmlChar out[6];
|
||||
if (CUR != '&') return;
|
||||
|
||||
if (NXT(1) == '#') {
|
||||
if ((NXT(1) == '#') &&
|
||||
((IS_ASCII_DIGIT(NXT(2))) ||
|
||||
((UPP(2) == 'X') &&
|
||||
((IS_ASCII_DIGIT(NXT(3))) ||
|
||||
((UPP(3) >= 'A') && (UPP(3) <= 'F')))))) {
|
||||
unsigned int c;
|
||||
int bits, i = 0;
|
||||
|
||||
@ -4091,30 +4097,29 @@ htmlParseReference(htmlParserCtxtPtr ctxt) {
|
||||
}
|
||||
out[i] = 0;
|
||||
|
||||
htmlCheckParagraph(ctxt);
|
||||
if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
|
||||
ctxt->sax->characters(ctxt->userData, out, i);
|
||||
} else {
|
||||
const xmlChar *repl;
|
||||
int nameLen, replLen;
|
||||
repl = out;
|
||||
replLen = i;
|
||||
} else if (IS_ASCII_LETTER(NXT(1))) {
|
||||
int nameLen;
|
||||
|
||||
htmlCheckParagraph(ctxt);
|
||||
|
||||
SKIP(1);
|
||||
repl = htmlFindEntityPrefix(CUR_PTR,
|
||||
ctxt->input->end - CUR_PTR,
|
||||
repl = htmlFindEntityPrefix(CUR_PTR + 1,
|
||||
ctxt->input->end - CUR_PTR - 1,
|
||||
/* isAttr */ 0,
|
||||
&nameLen, &replLen);
|
||||
|
||||
if (repl != NULL)
|
||||
SKIP(nameLen + 1);
|
||||
}
|
||||
|
||||
if (repl == NULL) {
|
||||
if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
|
||||
ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
|
||||
} else {
|
||||
repl = BAD_CAST "&";
|
||||
replLen = 1;
|
||||
SKIP(1);
|
||||
}
|
||||
|
||||
htmlCheckParagraph(ctxt);
|
||||
if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
|
||||
ctxt->sax->characters(ctxt->userData, repl, replLen);
|
||||
SKIP(nameLen);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
Loading…
Reference in New Issue
Block a user