1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2024-10-26 12:25:09 +03:00

entities: Rework serialization of numeric character references

This commit is contained in:
Nick Wellnhofer 2024-07-12 03:07:57 +02:00
parent 8d1606265d
commit 1cfc5b8089
4 changed files with 92 additions and 71 deletions

View File

@ -43,6 +43,7 @@
#include "private/buf.h"
#include "private/enc.h"
#include "private/entities.h"
#include "private/error.h"
#ifdef LIBXML_ICU_ENABLED
@ -1744,8 +1745,7 @@ retry:
* and continue the transcoding phase, hoping the error
* did not mangle the encoder state.
*/
charrefLen = snprintf((char *) &charref[0], sizeof(charref),
"&#%d;", cur);
charrefLen = xmlSerializeDecCharRef((char *) charref, cur);
xmlBufGrow(out, charrefLen * 4);
c_out = xmlBufAvail(out);
c_in = charrefLen;
@ -1856,8 +1856,7 @@ retry:
* and continue the transcoding phase, hoping the error
* did not mangle the encoder state.
*/
charrefLen = snprintf((char *) &charref[0], sizeof(charref),
"&#%d;", cur);
charrefLen = xmlSerializeDecCharRef((char *) charref, cur);
xmlBufferShrink(in, len);
xmlBufferGrow(out, charrefLen * 4);
written = out->size - out->use - 1;

View File

@ -512,6 +512,71 @@ xmlGetDocEntity(const xmlDoc *doc, const xmlChar *name) {
return(xmlGetPredefinedEntity(name));
}
/*
 * Write VAL as a hexadecimal character reference ("&#x...;") into BUF.
 *
 * The most significant digit is located by testing the three low bytes
 * of VAL, so between one and six uppercase hex digits are emitted and
 * zero is written as "&#x0;".  No NUL terminator is appended; at most
 * 10 bytes are written.
 *
 * Returns the number of bytes stored in BUF.
 */
int
xmlSerializeHexCharRef(char *buf, int val) {
    static const char hexDigits[] = "0123456789ABCDEF";
    int pos = 0;
    int shift;

    buf[pos++] = '&';
    buf[pos++] = '#';
    buf[pos++] = 'x';

    /* Pick the bit offset of the highest non-zero nibble. */
    if (val & 0xFF0000)
        shift = (val & 0xF00000) ? 20 : 16;
    else if (val & 0x00FF00)
        shift = (val & 0x00F000) ? 12 : 8;
    else
        shift = (val & 0x0000F0) ? 4 : 0;

    /* Emit nibbles from most to least significant. */
    for (; shift >= 0; shift -= 4)
        buf[pos++] = hexDigits[(val >> shift) & 0x0F];

    buf[pos++] = ';';

    return(pos);
}
/*
 * Write VAL as a decimal character reference ("&#...;") into BUF.
 *
 * Between one and seven digits are emitted depending on the magnitude
 * of VAL.  No NUL terminator is appended; at most 10 bytes are written.
 *
 * Returns the number of bytes stored in BUF.
 */
int
xmlSerializeDecCharRef(char *buf, int val) {
    /* limits[k] is the smallest value that needs k + 2 digits. */
    static const int limits[] = {
        10, 100, 1000, 10000, 100000, 1000000
    };
    int ndigits = 1;
    char *first, *p;

    while ((ndigits < 7) && (val >= limits[ndigits - 1]))
        ndigits++;

    buf[0] = '&';
    buf[1] = '#';

    first = buf + 2;
    p = first + ndigits;
    *p = ';';

    /* Fill the digit slots from least to most significant. */
    while (p > first) {
        *--p = '0' + val % 10;
        val /= 10;
    }

    return(ndigits + 3);
}
static const char xmlEscapeSafe[128] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@ -540,7 +605,7 @@ xmlEscapeText(const xmlChar *text, int flags) {
unescaped = cur;
while (*cur != '\0') {
char buf[13];
char buf[12];
const xmlChar *end;
const xmlChar *repl;
size_t used;
@ -618,7 +683,7 @@ xmlEscapeText(const xmlChar *text, int flags) {
val = 0xFFFD;
}
replSize = snprintf(buf, sizeof(buf), "&#x%X;", val);
replSize = xmlSerializeHexCharRef(buf, val);
repl = BAD_CAST buf;
} else if ((flags & XML_ESCAPE_ALLOW_INVALID) ||
(c >= 0x20) ||

View File

@ -27,6 +27,11 @@
#define XML_ESCAPE_QUOT (1u << 3)
#define XML_ESCAPE_ALLOW_INVALID (1u << 4)
/*
 * Writes "&#x", one to six uppercase hexadecimal digits of val, and ';'
 * into buf. The output is NOT NUL-terminated and is at most 10 bytes.
 * Returns the number of bytes written.
 */
XML_HIDDEN int
xmlSerializeHexCharRef(char *buf, int val);
/*
 * Writes "&#", one to seven decimal digits of val, and ';' into buf.
 * The output is NOT NUL-terminated and is at most 10 bytes.
 * Returns the number of bytes written.
 */
XML_HIDDEN int
xmlSerializeDecCharRef(char *buf, int val);
XML_HIDDEN xmlChar *
xmlEscapeText(const xmlChar *text, int flags);

View File

@ -125,51 +125,10 @@ xmlSaveErr(xmlOutputBufferPtr out, int code, xmlNodePtr node,
* Special escaping routines *
* *
************************************************************************/
/*
 * Serialize VAL as a NUL-terminated hexadecimal character reference
 * ("&#x...;") into OUT.
 *
 * out: destination buffer; receives "&#x", one to six uppercase hex
 *      digits, ';' and a terminating NUL (at most 11 bytes).
 * val: value to serialize.
 *
 * Exactly as many digits are written as the magnitude ladder selects,
 * so zero produces "&#x0;" instead of leaving the digit slot unwritten,
 * and values above 0xFFFFFF cannot overwrite the "&#x" prefix (only
 * their low 24 bits are emitted).
 *
 * Returns a pointer to the terminating NUL byte.
 */
static char *
xmlSerializeHexCharRef(char *out, int val) {
    static const char hexDigits[] = "0123456789ABCDEF";
    /* Unsigned copy keeps the right shifts well defined. */
    unsigned int bits = (unsigned int) val;
    int ndigits, i;

    *out++ = '&';
    *out++ = '#';
    *out++ = 'x';

    if (val < 0x10) ndigits = 1;
    else if (val < 0x100) ndigits = 2;
    else if (val < 0x1000) ndigits = 3;
    else if (val < 0x10000) ndigits = 4;
    else if (val < 0x100000) ndigits = 5;
    else ndigits = 6;

    /* Fill the digit slots from least to most significant. */
    for (i = ndigits - 1; i >= 0; i--) {
        out[i] = hexDigits[bits & 0xF];
        bits >>= 4;
    }
    out += ndigits;

    *out++ = ';';
    *out = 0;
    return(out);
}
static void
xmlSerializeText(xmlOutputBufferPtr buf, const xmlChar *string,
unsigned flags) {
char tmp[12];
const char *base, *cur;
if (string == NULL)
@ -178,33 +137,12 @@ xmlSerializeText(xmlOutputBufferPtr buf, const xmlChar *string,
base = cur = (const char *) string;
while (*cur != 0) {
char tempBuf[12];
const char *repl = NULL;
int replSize = 0;
int chunkSize = 1;
int c = (unsigned char) *cur;
if ((c >= 0x80) && (flags & XML_ESCAPE_NON_ASCII)) {
int val = 0, l = 4;
if (base != cur)
xmlOutputBufferWrite(buf, cur - base, base);
val = xmlGetUTF8Char((const xmlChar *) cur, &l);
if (val < 0) {
val = 0xFFFD;
cur++;
} else {
if (!IS_CHAR(val))
val = 0xFFFD;
cur += l;
}
xmlSerializeHexCharRef(tmp, val);
xmlOutputBufferWriteString(buf, tmp);
base = cur;
continue;
}
switch (c) {
case '\t':
if (flags & XML_ESCAPE_ATTR) {
@ -255,6 +193,20 @@ xmlSerializeText(xmlOutputBufferPtr buf, const xmlChar *string,
if (c < 0x20) {
repl = "&#xFFFD;";
replSize = 8;
} else if ((c >= 0x80) && (flags & XML_ESCAPE_NON_ASCII)) {
int val = 0, l = 4;
val = xmlGetUTF8Char((const xmlChar *) cur, &l);
if (val < 0) {
val = 0xFFFD;
} else {
if (!IS_CHAR(val))
val = 0xFFFD;
chunkSize = l;
}
replSize = xmlSerializeHexCharRef(tempBuf, val);
repl = tempBuf;
}
break;
}
@ -265,7 +217,7 @@ xmlSerializeText(xmlOutputBufferPtr buf, const xmlChar *string,
if (base != cur)
xmlOutputBufferWrite(buf, cur - base, base);
xmlOutputBufferWrite(buf, replSize, repl);
cur++;
cur += chunkSize;
base = cur;
}
}