1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2024-10-26 12:25:09 +03:00

html: Parse numeric character references according to HTML5

This commit is contained in:
Nick Wellnhofer 2024-09-08 23:19:49 +02:00
parent 4eeac30944
commit a6955c13c7

View File

@ -3443,6 +3443,13 @@ done:
return;
}
/*
 * Replacement table for numeric character references whose value lies
 * in the C1 control range 0x80..0x9F.
 *
 * The table is indexed by (value - 0x80): entry i holds the code point
 * substituted for the reference 0x80 + i. Entries for 0x81, 0x8D, 0x8F,
 * 0x90 and 0x9D map to themselves; every other entry maps to a code
 * point outside the C1 range.
 *
 * NOTE(review): the values appear to follow the HTML5 numeric character
 * reference replacement table (Windows-1252 assignments) - confirm
 * against the specification before editing.
 */
static const short htmlC1Remap[32] = {
    /* 0x80 - 0x83 */ 0x20AC, 0x0081, 0x201A, 0x0192,
    /* 0x84 - 0x87 */ 0x201E, 0x2026, 0x2020, 0x2021,
    /* 0x88 - 0x8B */ 0x02C6, 0x2030, 0x0160, 0x2039,
    /* 0x8C - 0x8F */ 0x0152, 0x008D, 0x017D, 0x008F,
    /* 0x90 - 0x93 */ 0x0090, 0x2018, 0x2019, 0x201C,
    /* 0x94 - 0x97 */ 0x201D, 0x2022, 0x2013, 0x2014,
    /* 0x98 - 0x9B */ 0x02DC, 0x2122, 0x0161, 0x203A,
    /* 0x9C - 0x9F */ 0x0153, 0x009D, 0x017E, 0x0178
};
/**
* htmlParseCharRef:
* @ctxt: an HTML parser context
@ -3462,63 +3469,57 @@ htmlParseCharRef(htmlParserCtxtPtr ctxt) {
if ((ctxt == NULL) || (ctxt->input == NULL))
return(0);
if ((CUR == '&') && (NXT(1) == '#') &&
((NXT(2) == 'x') || NXT(2) == 'X')) {
SKIP(3);
while (CUR != ';') {
if ((CUR >= '0') && (CUR <= '9')) {
if (val < 0x110000)
val = val * 16 + (CUR - '0');
} else if ((CUR >= 'a') && (CUR <= 'f')) {
if (val < 0x110000)
val = val * 16 + (CUR - 'a') + 10;
} else if ((CUR >= 'A') && (CUR <= 'F')) {
if (val < 0x110000)
val = val * 16 + (CUR - 'A') + 10;
while (1) {
int c = CUR;
if ((c >= '0') && (c <= '9')) {
c -= '0';
} else if ((c >= 'a') && (c <= 'f')) {
c = (c - 'a') + 10;
} else if ((c >= 'A') && (c <= 'F')) {
c = (c - 'A') + 10;
} else {
htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
"htmlParseCharRef: missing semicolon\n",
NULL, NULL);
break;
}
val = val * 16 + c;
if (val >= 0x110000)
val = 0x110000;
NEXT;
}
if (CUR == ';')
SKIP(1);
} else if ((CUR == '&') && (NXT(1) == '#')) {
SKIP(2);
while (CUR != ';') {
if ((CUR >= '0') && (CUR <= '9')) {
if (val < 0x110000)
val = val * 10 + (CUR - '0');
} else {
htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
"htmlParseCharRef: missing semicolon\n",
NULL, NULL);
while (1) {
int c = CUR;
if ((c < '0') || (c > '9'))
break;
}
val = val * 10 + (c - '0');
if (val >= 0x110000)
val = 0x110000;
NEXT;
}
if (CUR == ';')
SKIP(1);
} else {
htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
"htmlParseCharRef: invalid value\n", NULL, NULL);
}
/*
* Check the value IS_CHAR ...
* Remap C1 control characters
*/
if (IS_CHAR(val)) {
return(val);
} else if (val >= 0x110000) {
htmlParseErr(ctxt, XML_ERR_INVALID_CHAR,
"htmlParseCharRef: value too large\n", NULL, NULL);
} else {
htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
"htmlParseCharRef: invalid xmlChar value %d\n",
val);
if ((val >= 0x80) && (val < 0xA0)) {
val = htmlC1Remap[val - 0x80];
} else if ((val <= 0) ||
((val >= 0xD800) && (val < 0xE000)) ||
(val > 0x10FFFF)) {
val = 0xFFFD;
}
return(0);
return(val);
}
@ -4070,10 +4071,15 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt)
*/
static void
htmlParseReference(htmlParserCtxtPtr ctxt) {
const xmlChar *repl = NULL;
int replLen = 0;
xmlChar out[6];
if (CUR != '&') return;
if (NXT(1) == '#') {
if ((NXT(1) == '#') &&
((IS_ASCII_DIGIT(NXT(2))) ||
((UPP(2) == 'X') &&
((IS_ASCII_DIGIT(NXT(3))) ||
((UPP(3) >= 'A') && (UPP(3) <= 'F')))))) {
unsigned int c;
int bits, i = 0;
@ -4091,30 +4097,29 @@ htmlParseReference(htmlParserCtxtPtr ctxt) {
}
out[i] = 0;
htmlCheckParagraph(ctxt);
if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
ctxt->sax->characters(ctxt->userData, out, i);
} else {
const xmlChar *repl;
int nameLen, replLen;
repl = out;
replLen = i;
} else if (IS_ASCII_LETTER(NXT(1))) {
int nameLen;
htmlCheckParagraph(ctxt);
SKIP(1);
repl = htmlFindEntityPrefix(CUR_PTR,
ctxt->input->end - CUR_PTR,
repl = htmlFindEntityPrefix(CUR_PTR + 1,
ctxt->input->end - CUR_PTR - 1,
/* isAttr */ 0,
&nameLen, &replLen);
if (repl != NULL)
SKIP(nameLen + 1);
}
if (repl == NULL) {
if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
} else {
repl = BAD_CAST "&";
replLen = 1;
SKIP(1);
}
htmlCheckParagraph(ctxt);
if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
ctxt->sax->characters(ctxt->userData, repl, replLen);
SKIP(nameLen);
}
}
}
/**