1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-03-31 06:50:06 +03:00

entities: Rework text escaping

This commit is contained in:
Nick Wellnhofer 2024-07-12 02:01:06 +02:00
parent cc45f618ae
commit 8d1606265d
5 changed files with 223 additions and 325 deletions

View File

@ -512,17 +512,162 @@ xmlGetDocEntity(const xmlDoc *doc, const xmlChar *name) {
return(xmlGetPredefinedEntity(name));
}
/*
* Macro used to grow the current buffer.
*/
#define growBufferReentrant() { \
xmlChar *tmp; \
size_t new_size = buffer_size * 2; \
if (new_size < buffer_size) goto mem_error; \
tmp = (xmlChar *) xmlRealloc(buffer, new_size); \
if (tmp == NULL) goto mem_error; \
buffer = tmp; \
buffer_size = new_size; \
/*
 * Lookup table indexed by ASCII code (0..127), consulted by the fast
 * scanning loop in xmlEscapeText. A value of 1 means the byte is skipped
 * without further inspection; a value of 0 makes the scanner stop so the
 * byte can be examined individually.
 *
 * The entries set to 0 are: every control character below 0x20 except
 * tab (0x09) and newline (0x0A), plus '"' (0x22), '&' (0x26), '<' (0x3C)
 * and '>' (0x3E). Note that this includes NUL and carriage return.
 */
static const char xmlEscapeSafe[128] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
};
/**
 * xmlEscapeText:
 * @text:  zero-terminated input string
 * @flags:  bitwise OR of XML_ESCAPE_* values
 *
 * Creates an escaped copy of @text.
 *
 * '<', '>' and '&' are always replaced with "&lt;", "&gt;" and "&amp;".
 * The flags select the remaining behavior:
 *
 * - XML_ESCAPE_QUOT: '"' is replaced with "&quot;".
 * - XML_ESCAPE_HTML: '\r' is copied instead of being replaced with
 *   "&#13;". Together with XML_ESCAPE_ATTR, "<!-- ... -->" and "&{...}"
 *   spans are copied verbatim.
 * - XML_ESCAPE_NON_ASCII: bytes >= 0x80 are decoded as UTF-8 and written
 *   as hexadecimal character references. Undecodable bytes become
 *   "&#xFFFD;" one byte at a time.
 * - XML_ESCAPE_ALLOW_INVALID: control characters and codepoints rejected
 *   by IS_CHAR are kept. Without this flag, control characters other
 *   than tab, newline and carriage return are dropped and invalid
 *   codepoints are written as "&#xFFFD;".
 *
 * Returns a newly allocated string which must be released with xmlFree,
 * or NULL if @text is NULL, an allocation failed, or the result would
 * not fit in a size_t.
 */
xmlChar *
xmlEscapeText(const xmlChar *text, int flags) {
    /* Largest usable payload size; one byte stays reserved for the NUL. */
    const size_t maxSize = (size_t) -1 - 1;
    const xmlChar *cur;
    const xmlChar *unescaped;
    xmlChar *buffer;
    xmlChar *out;
    size_t size = 50;

    if (text == NULL)
        return(NULL);

    buffer = xmlMalloc(size + 1);
    if (buffer == NULL)
        return(NULL);
    out = buffer;

    cur = text;
    /* Start of the pending run of bytes that can be copied unchanged. */
    unescaped = cur;

    while (*cur != '\0') {
        char buf[13];
        const xmlChar *end;
        const xmlChar *repl;
        size_t used;
        size_t replSize;
        size_t unescapedSize;
        size_t totalSize;
        /* Number of input bytes consumed by the replacement. */
        size_t chunkSize = 1;
        int c;

        /*
         * Accelerator: skip bytes which never need escaping. The table
         * entry for NUL is 0, so this loop always stops at the end of
         * the string.
         */
        while (1) {
            c = *cur;

            if (c < 0x80) {
                if (!xmlEscapeSafe[c])
                    break;
            } else {
                if (flags & XML_ESCAPE_NON_ASCII)
                    break;
            }

            cur += 1;
        }

        if (c == 0) {
            /* End of input: only flush the pending run. */
            chunkSize = 0;
            repl = BAD_CAST "";
            replSize = 0;
        } else if (c == '<') {
            /*
             * Special handling of server side include in HTML attributes
             */
            if ((flags & XML_ESCAPE_HTML) && (flags & XML_ESCAPE_ATTR) &&
                (cur[1] == '!') && (cur[2] == '-') && (cur[3] == '-') &&
                ((end = xmlStrstr(cur, BAD_CAST "-->")) != NULL)) {
                /* Copy the whole comment, including the closing "-->". */
                chunkSize = (size_t) (end - cur) + 3;
                repl = cur;
                replSize = chunkSize;
            } else {
                repl = BAD_CAST "&lt;";
                replSize = 4;
            }
        } else if (c == '>') {
            repl = BAD_CAST "&gt;";
            replSize = 4;
        } else if (c == '&') {
            /*
             * Special handling of &{...} construct from HTML 4, see
             * http://www.w3.org/TR/html401/appendix/notes.html#h-B.7.1
             */
            if ((flags & XML_ESCAPE_HTML) && (flags & XML_ESCAPE_ATTR) &&
                (cur[1] == '{') && (end = xmlStrchr(cur, '}'))) {
                /* Copy everything up to and including the '}'. */
                chunkSize = (size_t) (end - cur) + 1;
                repl = cur;
                replSize = chunkSize;
            } else {
                repl = BAD_CAST "&amp;";
                replSize = 5;
            }
        } else if ((flags & XML_ESCAPE_QUOT) && (c == '"')) {
            repl = BAD_CAST "&quot;";
            replSize = 6;
        } else if (((flags & XML_ESCAPE_HTML) == 0) && (c == '\r')) {
            repl = BAD_CAST "&#13;";
            replSize = 5;
        } else if ((flags & XML_ESCAPE_NON_ASCII) && (c >= 0x80)) {
            int val;
            int len = 4;
            int n;

            val = xmlGetUTF8Char(cur, &len);
            if (val < 0) {
                /* Undecodable byte: emit U+FFFD and resync one byte on. */
                val = 0xFFFD;
                chunkSize = 1;
            } else {
                chunkSize = len;
                if (((flags & XML_ESCAPE_ALLOW_INVALID) == 0) &&
                    (!IS_CHAR(val)))
                    val = 0xFFFD;
            }

            /*
             * snprintf returns an int which is negative on error and may
             * exceed the buffer size on truncation. Clamp it before it
             * is used as a memcpy length.
             */
            n = snprintf(buf, sizeof(buf), "&#x%X;", (unsigned) val);
            if (n < 0)
                n = 0;
            else if ((size_t) n >= sizeof(buf))
                n = sizeof(buf) - 1;
            replSize = n;
            repl = BAD_CAST buf;
        } else if ((flags & XML_ESCAPE_ALLOW_INVALID) ||
                   (c >= 0x20) ||
                   (c == '\n') || (c == '\t') || (c == '\r')) {
            /*
             * Default case, just copy: the byte joins the pending run.
             * Only flush here if it was the last byte of the input.
             */
            cur += 1;
            if (*cur != 0)
                continue;

            chunkSize = 0;
            repl = BAD_CAST "";
            replSize = 0;
        } else {
            /* Invalid control character: drop it. */
            repl = BAD_CAST "";
            replSize = 0;
        }

        used = out - buffer;
        unescapedSize = cur - unescaped;
        totalSize = unescapedSize + replSize;

        cur += chunkSize;

        if (totalSize > size - used) {
            xmlChar *tmp;
            size_t newSize;

            /* size_t arithmetic wraps silently, so check before adding. */
            if (totalSize > maxSize - size)
                goto error;
            newSize = size + totalSize;

            /*
             * Over-allocate while input remains, but fall back to the
             * exact requirement if doubling would overflow.
             */
            if ((*cur != 0) && (newSize <= maxSize / 2))
                newSize *= 2;

            tmp = xmlRealloc(buffer, newSize + 1);
            if (tmp == NULL)
                goto error;
            buffer = tmp;
            size = newSize;
            out = buffer + used;
        }

        memcpy(out, unescaped, unescapedSize);
        out += unescapedSize;
        memcpy(out, repl, replSize);
        out += replSize;

        unescaped = cur;
    }

    *out = 0;
    return(buffer);

error:
    xmlFree(buffer);
    return(NULL);
}
/**
@ -538,178 +683,18 @@ xmlGetDocEntity(const xmlDoc *doc, const xmlChar *name) {
*
* Returns A newly allocated string with the substitution done.
*/
static xmlChar *
xmlEncodeEntitiesInternal(xmlDocPtr doc, const xmlChar *input, int attr) {
const xmlChar *cur = input;
xmlChar *buffer = NULL;
xmlChar *out = NULL;
size_t buffer_size = 0;
int html = 0;
if (input == NULL) return(NULL);
/* HTML documents get special treatment in several branches below. */
if (doc != NULL)
html = (doc->type == XML_HTML_DOCUMENT_NODE);
/*
* Allocate a translation buffer.
*/
buffer_size = 1000;
buffer = (xmlChar *) xmlMalloc(buffer_size);
if (buffer == NULL)
return(NULL);
out = buffer;
while (*cur != '\0') {
/*
* Grow the buffer whenever fewer than 100 bytes remain, so the short
* writes in this iteration fit. growBufferReentrant jumps to
* mem_error on failure; it may move the buffer, so the output
* pointer is recomputed from the saved index.
*/
size_t indx = out - buffer;
if (indx + 100 > buffer_size) {
growBufferReentrant();
out = &buffer[indx];
}
/*
* By default one has to encode at least '<', '>', '"' and '&'.
*/
if (*cur == '<') {
const xmlChar *end;
/*
* Special handling of server side include in HTML attributes
*/
if (html && attr &&
(cur[1] == '!') && (cur[2] == '-') && (cur[3] == '-') &&
((end = xmlStrstr(cur, BAD_CAST "-->")) != NULL)) {
/* Copy the comment body verbatim, growing the buffer as needed. */
while (cur != end) {
*out++ = *cur++;
indx = out - buffer;
if (indx + 100 > buffer_size) {
growBufferReentrant();
out = &buffer[indx];
}
}
/* Copy the three bytes of the closing "-->". */
*out++ = *cur++;
*out++ = *cur++;
*out++ = *cur++;
continue;
}
*out++ = '&';
*out++ = 'l';
*out++ = 't';
*out++ = ';';
} else if (*cur == '>') {
*out++ = '&';
*out++ = 'g';
*out++ = 't';
*out++ = ';';
} else if (*cur == '&') {
/*
* Special handling of &{...} construct from HTML 4, see
* http://www.w3.org/TR/html401/appendix/notes.html#h-B.7.1
*/
if (html && attr && (cur[1] == '{') &&
(strchr((const char *) cur, '}'))) {
/* Copy everything up to the closing brace verbatim. */
while (*cur != '}') {
*out++ = *cur++;
indx = out - buffer;
if (indx + 100 > buffer_size) {
growBufferReentrant();
out = &buffer[indx];
}
}
/* Copy the '}' itself. */
*out++ = *cur++;
continue;
}
*out++ = '&';
*out++ = 'a';
*out++ = 'm';
*out++ = 'p';
*out++ = ';';
} else if (((*cur >= 0x20) && (*cur < 0x80)) ||
(*cur == '\n') || (*cur == '\t') || ((html) && (*cur == '\r'))) {
/*
* Default case, just copy. A carriage return is only copied
* for HTML documents.
*/
*out++ = *cur;
} else if (*cur >= 0x80) {
/*
* Bytes >= 0x80 are copied unchanged when the document declares an
* encoding or is HTML; otherwise they are written as character
* references.
*/
if (((doc != NULL) && (doc->encoding != NULL)) || (html)) {
/*
* Bjørn Reese <br@sseusa.com> provided the patch
xmlChar xc;
xc = (*cur & 0x3F) << 6;
if (cur[1] != 0) {
xc += *(++cur) & 0x3F;
*out++ = xc;
} else
*/
*out++ = *cur;
} else {
/*
* We assume we have UTF-8 input.
* It must match either:
* 110xxxxx 10xxxxxx
* 1110xxxx 10xxxxxx 10xxxxxx
* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
* That is:
* cur[0] is 11xxxxxx
* cur[1] is 10xxxxxx
* cur[2] is 10xxxxxx if cur[0] is 111xxxxx
* cur[3] is 10xxxxxx if cur[0] is 1111xxxx
* cur[0] is not 11111xxx
*/
char buf[13], *ptr;
int val, l;
l = 4;
val = xmlGetUTF8Char(cur, &l);
if (val < 0) {
/* Undecodable byte: emit U+FFFD and skip a single byte. */
val = 0xFFFD;
cur++;
} else {
/* Codepoints rejected by IS_CHAR are replaced with U+FFFD. */
if (!IS_CHAR(val))
val = 0xFFFD;
cur += l;
}
/*
* We could do multiple things here. Just save as a char ref
*/
snprintf(buf, sizeof(buf), "&#x%X;", val);
buf[sizeof(buf) - 1] = 0;
ptr = buf;
while (*ptr != 0) *out++ = *ptr++;
/* cur was already advanced above, so skip the cur++ at loop end. */
continue;
}
} else if (IS_BYTE_CHAR(*cur)) {
/* Remaining byte accepted by IS_BYTE_CHAR: decimal char reference. */
char buf[11], *ptr;
snprintf(buf, sizeof(buf), "&#%d;", *cur);
buf[sizeof(buf) - 1] = 0;
ptr = buf;
while (*ptr != 0) *out++ = *ptr++;
}
/* Bytes that matched no branch above are dropped from the output. */
cur++;
}
*out = 0;
return(buffer);
/* Reached from growBufferReentrant when the buffer cannot be enlarged. */
mem_error:
xmlFree(buffer);
return(NULL);
}
/**
* xmlEncodeAttributeEntities:
* @doc: the document containing the string
* @input: A string to convert to XML.
*
* Do a global encoding of a string, replacing the predefined entities
* and non ASCII values with their entities and CharRef counterparts for
* attribute values.
*
* Returns A newly allocated string with the substitution done.
*/
xmlChar *
xmlEncodeAttributeEntities(xmlDocPtr doc, const xmlChar *input) {
return xmlEncodeEntitiesInternal(doc, input, 1);
xmlEncodeEntitiesInternal(xmlDocPtr doc, const xmlChar *input,
unsigned flags) {
if (input == NULL)
return(NULL);
if ((doc != NULL) && (doc->type == XML_HTML_DOCUMENT_NODE))
flags |= XML_ESCAPE_HTML;
else if ((doc == NULL) || (doc->encoding == NULL))
flags |= XML_ESCAPE_NON_ASCII;
return(xmlEscapeText(input, flags));
}
/**
@ -722,6 +707,10 @@ xmlEncodeAttributeEntities(xmlDocPtr doc, const xmlChar *input) {
* Contrary to xmlEncodeEntities, this routine is reentrant, and result
* must be deallocated.
*
* This escapes '<', '>', '&' and '\r'. If the document has no encoding,
* non-ASCII codepoints are escaped. There is some special handling for
* HTML documents.
*
* Returns A newly allocated string with the substitution done.
*/
xmlChar *
@ -731,86 +720,23 @@ xmlEncodeEntitiesReentrant(xmlDocPtr doc, const xmlChar *input) {
/**
* xmlEncodeSpecialChars:
* @doc: the document containing the string
* @doc: unused
* @input: A string to convert to XML.
*
* Do a global encoding of a string, replacing the predefined entities.
* This routine is reentrant, and the result must be deallocated.
*
* This escapes '<', '>', '&', '"' and '\r' chars.
*
* Returns A newly allocated string with the substitution done.
*/
xmlChar *
xmlEncodeSpecialChars(const xmlDoc *doc ATTRIBUTE_UNUSED, const xmlChar *input) {
const xmlChar *cur = input;
xmlChar *buffer = NULL;
xmlChar *out = NULL;
size_t buffer_size = 0;
if (input == NULL) return(NULL);
xmlEncodeSpecialChars(const xmlDoc *doc ATTRIBUTE_UNUSED,
const xmlChar *input) {
if (input == NULL)
return(NULL);
/*
* allocate an translation buffer.
*/
buffer_size = 1000;
buffer = (xmlChar *) xmlMalloc(buffer_size);
if (buffer == NULL)
return(NULL);
out = buffer;
while (*cur != '\0') {
size_t indx = out - buffer;
if (indx + 10 > buffer_size) {
growBufferReentrant();
out = &buffer[indx];
}
/*
* By default one have to encode at least '<', '>', '"' and '&' !
*/
if (*cur == '<') {
*out++ = '&';
*out++ = 'l';
*out++ = 't';
*out++ = ';';
} else if (*cur == '>') {
*out++ = '&';
*out++ = 'g';
*out++ = 't';
*out++ = ';';
} else if (*cur == '&') {
*out++ = '&';
*out++ = 'a';
*out++ = 'm';
*out++ = 'p';
*out++ = ';';
} else if (*cur == '"') {
*out++ = '&';
*out++ = 'q';
*out++ = 'u';
*out++ = 'o';
*out++ = 't';
*out++ = ';';
} else if (*cur == '\r') {
*out++ = '&';
*out++ = '#';
*out++ = '1';
*out++ = '3';
*out++ = ';';
} else {
/*
* Works because on UTF-8, all extended sequences cannot
* result in bytes in the ASCII range.
*/
*out++ = *cur;
}
cur++;
}
*out = 0;
return(buffer);
mem_error:
xmlFree(buffer);
return(NULL);
return(xmlEscapeText(input, XML_ESCAPE_QUOT | XML_ESCAPE_ALLOW_INVALID));
}
/**

View File

@ -21,7 +21,17 @@
#define XML_ENT_VALIDATED (1u << 2)
#define XML_ENT_EXPANDING (1u << 3)
#define XML_ESCAPE_ATTR (1u << 0)
#define XML_ESCAPE_NON_ASCII (1u << 1)
#define XML_ESCAPE_HTML (1u << 2)
#define XML_ESCAPE_QUOT (1u << 3)
#define XML_ESCAPE_ALLOW_INVALID (1u << 4)
XML_HIDDEN xmlChar *
xmlEncodeAttributeEntities(xmlDocPtr doc, const xmlChar *input);
xmlEscapeText(const xmlChar *text, int flags);
XML_HIDDEN xmlChar *
xmlEncodeEntitiesInternal(xmlDocPtr doc, const xmlChar *input,
unsigned flags);
#endif /* XML_ENTITIES_H_PRIVATE__ */

8
tree.c
View File

@ -1557,11 +1557,11 @@ xmlNodeListGetStringInternal(xmlDocPtr doc, const xmlNode *node, int escMode) {
xmlChar *encoded;
if (escMode == 1)
encoded = xmlEncodeEntitiesReentrant(doc,
node->content);
encoded = xmlEncodeEntitiesInternal(doc, node->content,
0);
else if (escMode == 2)
encoded = xmlEncodeAttributeEntities(doc,
node->content);
encoded = xmlEncodeEntitiesInternal(doc, node->content,
XML_ESCAPE_ATTR);
else
encoded = xmlEncodeSpecialChars(doc, node->content);
if (encoded == NULL)

92
xmlIO.c
View File

@ -44,6 +44,7 @@
#include "private/buf.h"
#include "private/enc.h"
#include "private/entities.h"
#include "private/error.h"
#include "private/io.h"
@ -2374,66 +2375,6 @@ xmlOutputBufferWrite(xmlOutputBufferPtr out, int len, const char *data) {
return(written <= INT_MAX ? written : INT_MAX);
}
/**
* xmlEscapeContent:
* @out: a pointer to an array of bytes to store the result
* @outlen: the length of @out
* @in: a pointer to an array of unescaped UTF-8 bytes
* @inlen: the length of @in
*
* Take a block of UTF-8 chars in and escape them.
* Returns 0 if success, or -1 otherwise
* The value of @inlen after return is the number of octets consumed
* if the return value is positive, else unpredictable.
* The value of @outlen after return is the number of octets consumed.
*/
static int
xmlEscapeContent(unsigned char* out, int *outlen,
const xmlChar* in, int *inlen) {
unsigned char* outstart = out;
const unsigned char* base = in;
unsigned char* outend = out + *outlen;
const unsigned char* inend;
inend = in + (*inlen);
while ((in < inend) && (out < outend)) {
if (*in == '<') {
if (outend - out < 4) break;
*out++ = '&';
*out++ = 'l';
*out++ = 't';
*out++ = ';';
} else if (*in == '>') {
if (outend - out < 4) break;
*out++ = '&';
*out++ = 'g';
*out++ = 't';
*out++ = ';';
} else if (*in == '&') {
if (outend - out < 5) break;
*out++ = '&';
*out++ = 'a';
*out++ = 'm';
*out++ = 'p';
*out++ = ';';
} else if (*in == '\r') {
if (outend - out < 5) break;
*out++ = '&';
*out++ = '#';
*out++ = '1';
*out++ = '3';
*out++ = ';';
} else {
*out++ = *in;
}
++in;
}
*outlen = out - outstart;
*inlen = in - base;
return(0);
}
/**
* xmlOutputBufferWriteEscape:
* @out: a buffered parser output
@ -2454,13 +2395,36 @@ xmlOutputBufferWriteEscape(xmlOutputBufferPtr out, const xmlChar *str,
xmlCharEncodingOutputFunc escaping) {
int ret;
int written = 0;
int len;
size_t len;
if ((out == NULL) || (out->error) || (str == NULL))
return(-1);
len = strlen((const char *)str);
if (escaping == NULL)
escaping = xmlEscapeContent;
len = strlen((const char *) str);
if (len >= INT_MAX) {
out->error = XML_ERR_RESOURCE_LIMIT;
return(-1);
}
if (escaping == NULL) {
char *escaped = (char *) xmlEscapeText(str, XML_ESCAPE_ALLOW_INVALID);
if (escaped == NULL) {
out->error = XML_ERR_NO_MEMORY;
return(-1);
}
len = strlen(escaped);
if (len >= INT_MAX) {
out->error = XML_ERR_RESOURCE_LIMIT;
return(-1);
}
ret = xmlOutputBufferWrite(out, len, escaped);
xmlFree(escaped);
return(ret);
}
while (len > 0) {
xmlChar buf[1024];

View File

@ -23,6 +23,7 @@
#include "private/buf.h"
#include "private/enc.h"
#include "private/entities.h"
#include "private/error.h"
#include "private/io.h"
#include "private/save.h"
@ -31,9 +32,6 @@
#define XHTML_NS_NAME BAD_CAST "http://www.w3.org/1999/xhtml"
#define XML_ESCAPE_ATTR (1u << 0)
#define XML_ESCAPE_NON_ASCII (1u << 1)
struct _xmlSaveCtxt {
void *_private;
int type;