entities: Rework text escaping

2025-03-31 06:50:06 +03:00 · 2024-07-12 02:01:06 +02:00 · 2024-07-12 02:01:06 +02:00 · 8d1606265d
commit 8d1606265d
parent cc45f618ae
5 changed files with 223 additions and 325 deletions
--- a/entities.c
+++ b/entities.c
@ -512,17 +512,203 @@ xmlGetDocEntity(const xmlDoc *doc, const xmlChar *name) {
    return(xmlGetPredefinedEntity(name));
 }

-/*
- * Macro used to grow the current buffer.
- */
-#define growBufferReentrant() {						\
-    xmlChar *tmp;                                                       \
-    size_t new_size = buffer_size * 2;                                  \
-    if (new_size < buffer_size) goto mem_error;                         \
-    tmp = (xmlChar *) xmlRealloc(buffer, new_size);	                \
-    if (tmp == NULL) goto mem_error;                                    \
-    buffer = tmp;							\
-    buffer_size = new_size;						\
+/*
+ * Lookup table indexed by byte value (ASCII range only): 1 means the byte
+ * is always copied verbatim by xmlEscapeText, 0 means it needs a closer
+ * look. The 0 entries are NUL, the control characters below 0x20 other
+ * than '\t' and '\n', and the characters '"', '&', '<' and '>'.
+ */
+static const char xmlEscapeSafe[128] = {
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+};
+
+/**
+ * xmlEscapeText:
+ * @text:  zero-terminated string to escape, must not be NULL
+ * @flags:  bit mask of XML_ESCAPE_* flags
+ *
+ * Create an escaped copy of @text. '<', '>' and '&' are replaced by
+ * "&lt;", "&gt;" and "&amp;". The flags select the remaining behavior:
+ *
+ * - XML_ESCAPE_QUOT: '"' is replaced by "&quot;".
+ * - XML_ESCAPE_HTML: '\r' is copied instead of being replaced by "&#13;".
+ * - XML_ESCAPE_HTML and XML_ESCAPE_ATTR together: "<!--...-->" and
+ *   "&{...}" sequences are copied verbatim.
+ * - XML_ESCAPE_NON_ASCII: bytes >= 0x80 are decoded as UTF-8 and written
+ *   as hexadecimal character references. A byte that fails to decode is
+ *   written as "&#xFFFD;".
+ * - XML_ESCAPE_ALLOW_INVALID: control characters below 0x20 other than
+ *   '\t', '\n' and '\r' are copied instead of dropped, and code points
+ *   rejected by IS_CHAR are kept instead of being replaced by U+FFFD.
+ *
+ * Returns a newly allocated string that must be released with xmlFree,
+ * or NULL if an allocation fails or the buffer size would overflow.
+ */
+xmlChar *
+xmlEscapeText(const xmlChar *text, int flags) {
+    const xmlChar *cur;
+    xmlChar *buffer;
+    xmlChar *out;
+    const xmlChar *unescaped;
+    size_t size = 50;
+
+    buffer = xmlMalloc(size + 1);
+    if (buffer == NULL)
+        return(NULL);
+    out = buffer;
+
+    cur = text;
+    /* start of the pending run of input that is copied unchanged */
+    unescaped = cur;
+
+    while (*cur != '\0') {
+        char buf[13];
+        const xmlChar *end;
+        const xmlChar *repl;
+        size_t used;
+        size_t replSize;
+        size_t unescapedSize;
+        size_t totalSize;
+        size_t chunkSize = 1;
+        int c;
+
+        /* accelerator: skip bytes that never need escaping */
+        while (1) {
+            c = *cur;
+
+            if (c < 0x80) {
+                if (!xmlEscapeSafe[c])
+                    break;
+            } else {
+                if (flags & XML_ESCAPE_NON_ASCII)
+                    break;
+            }
+            cur += 1;
+        }
+
+        if (c == 0) {
+            /* end of input: only flush the pending run */
+            chunkSize = 0;
+            repl = BAD_CAST "";
+            replSize = 0;
+        } else if (c == '<') {
+            /*
+             * Special handling of server side include in HTML attributes
+             */
+            if ((flags & XML_ESCAPE_HTML) && (flags & XML_ESCAPE_ATTR) &&
+                (cur[1] == '!') && (cur[2] == '-') && (cur[3] == '-') &&
+                ((end = xmlStrstr(cur, BAD_CAST "-->")) != NULL)) {
+                chunkSize = (size_t) (end - cur) + 3;
+                repl = cur;
+                replSize = chunkSize;
+            } else {
+                repl = BAD_CAST "&lt;";
+                replSize = 4;
+            }
+        } else if (c == '>') {
+            repl = BAD_CAST "&gt;";
+            replSize = 4;
+        } else if (c == '&') {
+            /*
+             * Special handling of &{...} construct from HTML 4, see
+             * http://www.w3.org/TR/html401/appendix/notes.html#h-B.7.1
+             */
+            if ((flags & XML_ESCAPE_HTML) && (flags & XML_ESCAPE_ATTR) &&
+                (cur[1] == '{') &&
+                ((end = xmlStrchr(cur, '}')) != NULL)) {
+                chunkSize = (size_t) (end - cur) + 1;
+                repl = cur;
+                replSize = chunkSize;
+            } else {
+                repl = BAD_CAST "&amp;";
+                replSize = 5;
+            }
+        } else if ((flags & XML_ESCAPE_QUOT) && (c == '"')) {
+            repl = BAD_CAST "&quot;";
+            replSize = 6;
+        } else if (((flags & XML_ESCAPE_HTML) == 0) && (c == '\r')) {
+            repl = BAD_CAST "&#13;";
+            replSize = 5;
+        } else if ((flags & XML_ESCAPE_NON_ASCII) && (c >= 0x80)) {
+            int val;
+            int len = 4;
+
+            val = xmlGetUTF8Char(cur, &len);
+            if (val < 0) {
+                /* invalid UTF-8: replace a single byte and resync */
+                val = 0xFFFD;
+                len = 1;
+            } else if (((flags & XML_ESCAPE_ALLOW_INVALID) == 0) &&
+                       (!IS_CHAR(val))) {
+                val = 0xFFFD;
+            }
+            chunkSize = (size_t) len;
+
+            replSize = snprintf(buf, sizeof(buf), "&#x%X;", (unsigned) val);
+            repl = BAD_CAST buf;
+        } else if ((flags & XML_ESCAPE_ALLOW_INVALID) ||
+                   (c >= 0x20) ||
+                   (c == '\n') || (c == '\t') || (c == '\r')) {
+            /* default case, just copy: keep the byte in the pending run */
+            cur += 1;
+            if (*cur != 0)
+                continue;
+
+            chunkSize = 0;
+            repl = BAD_CAST "";
+            replSize = 0;
+        } else {
+            /* ignore: the byte is skipped and nothing is written for it */
+            repl = BAD_CAST "";
+            replSize = 0;
+        }
+
+        used = out - buffer;
+        unescapedSize = cur - unescaped;
+        totalSize = unescapedSize + replSize;
+
+        cur += chunkSize;
+
+        if (totalSize > size - used) {
+            /* largest size for which (size * 2) + 1 cannot wrap around */
+            const size_t maxHalf = (((size_t) -1) - 1) / 2;
+            xmlChar *tmp;
+
+            if ((size > maxHalf) || (totalSize > maxHalf - size)) {
+                xmlFree(buffer);
+                return(NULL);
+            }
+
+            size += totalSize;
+            /* more input follows: double to amortize further growth */
+            if (*cur != 0)
+                size *= 2;
+            tmp = xmlRealloc(buffer, size + 1);
+            if (tmp == NULL) {
+                xmlFree(buffer);
+                return(NULL);
+            }
+            buffer = tmp;
+            out = buffer + used;
+        }
+
+        memcpy(out, unescaped, unescapedSize);
+        out += unescapedSize;
+        memcpy(out, repl, replSize);
+        out += replSize;
+
+        unescaped = cur;
+    }
+
+    *out = 0;
+    return(buffer);
 }

 /**
@ -538,178 +683,18 @@ xmlGetDocEntity(const xmlDoc *doc, const xmlChar *name) {
 *
 * Returns A newly allocated string with the substitution done.
 */
-static xmlChar *
-xmlEncodeEntitiesInternal(xmlDocPtr doc, const xmlChar *input, int attr) {
-    const xmlChar *cur = input;
-    xmlChar *buffer = NULL;
-    xmlChar *out = NULL;
-    size_t buffer_size = 0;
-    int html = 0;
-
-    if (input == NULL) return(NULL);
-    if (doc != NULL)
-        html = (doc->type == XML_HTML_DOCUMENT_NODE);
-
-    /*
-     * allocate an translation buffer.
-     */
-    buffer_size = 1000;
-    buffer = (xmlChar *) xmlMalloc(buffer_size);
-    if (buffer == NULL)
-	return(NULL);
-    out = buffer;
-
-    while (*cur != '\0') {
-        size_t indx = out - buffer;
-        if (indx + 100 > buffer_size) {
-
-	    growBufferReentrant();
-	    out = &buffer[indx];
-	}
-
-	/*
-	 * By default one have to encode at least '<', '>', '"' and '&' !
-	 */
-	if (*cur == '<') {
-	    const xmlChar *end;
-
-	    /*
-	     * Special handling of server side include in HTML attributes
-	     */
-	    if (html && attr &&
-	        (cur[1] == '!') && (cur[2] == '-') && (cur[3] == '-') &&
-	        ((end = xmlStrstr(cur, BAD_CAST "-->")) != NULL)) {
-	        while (cur != end) {
-		    *out++ = *cur++;
-		    indx = out - buffer;
-		    if (indx + 100 > buffer_size) {
-			growBufferReentrant();
-			out = &buffer[indx];
-		    }
-		}
-		*out++ = *cur++;
-		*out++ = *cur++;
-		*out++ = *cur++;
-		continue;
-	    }
-	    *out++ = '&';
-	    *out++ = 'l';
-	    *out++ = 't';
-	    *out++ = ';';
-	} else if (*cur == '>') {
-	    *out++ = '&';
-	    *out++ = 'g';
-	    *out++ = 't';
-	    *out++ = ';';
-	} else if (*cur == '&') {
-	    /*
-	     * Special handling of &{...} construct from HTML 4, see
-	     * http://www.w3.org/TR/html401/appendix/notes.html#h-B.7.1
-	     */
-	    if (html && attr && (cur[1] == '{') &&
-	        (strchr((const char *) cur, '}'))) {
-	        while (*cur != '}') {
-		    *out++ = *cur++;
-		    indx = out - buffer;
-		    if (indx + 100 > buffer_size) {
-			growBufferReentrant();
-			out = &buffer[indx];
-		    }
-		}
-		*out++ = *cur++;
-		continue;
-	    }
-	    *out++ = '&';
-	    *out++ = 'a';
-	    *out++ = 'm';
-	    *out++ = 'p';
-	    *out++ = ';';
-	} else if (((*cur >= 0x20) && (*cur < 0x80)) ||
-	    (*cur == '\n') || (*cur == '\t') || ((html) && (*cur == '\r'))) {
-	    /*
-	     * default case, just copy !
-	     */
-	    *out++ = *cur;
-	} else if (*cur >= 0x80) {
-	    if (((doc != NULL) && (doc->encoding != NULL)) || (html)) {
-		/*
-		 * Bjørn Reese <br@sseusa.com> provided the patch
-	        xmlChar xc;
-	        xc = (*cur & 0x3F) << 6;
-	        if (cur[1] != 0) {
-		    xc += *(++cur) & 0x3F;
-		    *out++ = xc;
-	        } else
-		 */
-		*out++ = *cur;
-	    } else {
-		/*
-		 * We assume we have UTF-8 input.
-		 * It must match either:
-		 *   110xxxxx 10xxxxxx
-		 *   1110xxxx 10xxxxxx 10xxxxxx
-		 *   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
-		 * That is:
-		 *   cur[0] is 11xxxxxx
-		 *   cur[1] is 10xxxxxx
-		 *   cur[2] is 10xxxxxx if cur[0] is 111xxxxx
-		 *   cur[3] is 10xxxxxx if cur[0] is 1111xxxx
-		 *   cur[0] is not 11111xxx
-		 */
-		char buf[13], *ptr;
-		int val, l;
-
-                l = 4;
-                val = xmlGetUTF8Char(cur, &l);
-                if (val < 0) {
-                    val = 0xFFFD;
-                    cur++;
-                } else {
-                    if (!IS_CHAR(val))
-                        val = 0xFFFD;
-                    cur += l;
-		}
-		/*
-		 * We could do multiple things here. Just save as a char ref
-		 */
-		snprintf(buf, sizeof(buf), "&#x%X;", val);
-		buf[sizeof(buf) - 1] = 0;
-		ptr = buf;
-		while (*ptr != 0) *out++ = *ptr++;
-		continue;
-	    }
-	} else if (IS_BYTE_CHAR(*cur)) {
-	    char buf[11], *ptr;
-
-	    snprintf(buf, sizeof(buf), "&#%d;", *cur);
-	    buf[sizeof(buf) - 1] = 0;
-            ptr = buf;
-	    while (*ptr != 0) *out++ = *ptr++;
-	}
-	cur++;
-    }
-    *out = 0;
-    return(buffer);
-
-mem_error:
-    xmlFree(buffer);
-    return(NULL);
-}
-
-/**
- * xmlEncodeAttributeEntities:
- * @doc:  the document containing the string
- * @input:  A string to convert to XML.
- *
- * Do a global encoding of a string, replacing the predefined entities
- * and non ASCII values with their entities and CharRef counterparts for
- * attribute values.
- *
- * Returns A newly allocated string with the substitution done.
- */
 xmlChar *
-xmlEncodeAttributeEntities(xmlDocPtr doc, const xmlChar *input) {
-    return xmlEncodeEntitiesInternal(doc, input, 1);
+xmlEncodeEntitiesInternal(xmlDocPtr doc, const xmlChar *input,
+                          unsigned flags) {
+    if (input == NULL)
+        return(NULL);
+    if (doc == NULL)                    /* no document, so no encoding */
+        flags |= XML_ESCAPE_NON_ASCII;
+    else if (doc->type == XML_HTML_DOCUMENT_NODE)
+        flags |= XML_ESCAPE_HTML;       /* HTML: non-ASCII is not forced */
+    else if (doc->encoding == NULL)
+        flags |= XML_ESCAPE_NON_ASCII;
+    return(xmlEscapeText(input, flags));
 }

 /**
@ -722,6 +707,10 @@ xmlEncodeAttributeEntities(xmlDocPtr doc, const xmlChar *input) {
 * Contrary to xmlEncodeEntities, this routine is reentrant, and result
 * must be deallocated.
 *
+ * This escapes '<', '>', '&' and '\r'. If the document has no encoding,
+ * non-ASCII codepoints are escaped. There is some special handling for
+ * HTML documents.
+ *
 * Returns A newly allocated string with the substitution done.
 */
 xmlChar *
@ -731,86 +720,23 @@ xmlEncodeEntitiesReentrant(xmlDocPtr doc, const xmlChar *input) {

 /**
 * xmlEncodeSpecialChars:
- * @doc:  the document containing the string
+ * @doc:  unused
 * @input:  A string to convert to XML.
 *
 * Do a global encoding of a string, replacing the predefined entities
 * this routine is reentrant, and result must be deallocated.
 *
+ * This escapes '<', '>', '&', '"' and '\r'; other bytes are copied as is.
+ *
 * Returns A newly allocated string with the substitution done.
 */
 xmlChar *
-xmlEncodeSpecialChars(const xmlDoc *doc ATTRIBUTE_UNUSED, const xmlChar *input) {
-    const xmlChar *cur = input;
-    xmlChar *buffer = NULL;
-    xmlChar *out = NULL;
-    size_t buffer_size = 0;
-    if (input == NULL) return(NULL);
+xmlEncodeSpecialChars(const xmlDoc *doc ATTRIBUTE_UNUSED,
+                      const xmlChar *input) {
+    if (input == NULL) /* doc is unused; a NULL input yields NULL */
+        return(NULL);

-    /*
-     * allocate an translation buffer.
-     */
-    buffer_size = 1000;
-    buffer = (xmlChar *) xmlMalloc(buffer_size);
-    if (buffer == NULL)
-	return(NULL);
-    out = buffer;
-
-    while (*cur != '\0') {
-        size_t indx = out - buffer;
-        if (indx + 10 > buffer_size) {
-
-	    growBufferReentrant();
-	    out = &buffer[indx];
-	}
-
-	/*
-	 * By default one have to encode at least '<', '>', '"' and '&' !
-	 */
-	if (*cur == '<') {
-	    *out++ = '&';
-	    *out++ = 'l';
-	    *out++ = 't';
-	    *out++ = ';';
-	} else if (*cur == '>') {
-	    *out++ = '&';
-	    *out++ = 'g';
-	    *out++ = 't';
-	    *out++ = ';';
-	} else if (*cur == '&') {
-	    *out++ = '&';
-	    *out++ = 'a';
-	    *out++ = 'm';
-	    *out++ = 'p';
-	    *out++ = ';';
-	} else if (*cur == '"') {
-	    *out++ = '&';
-	    *out++ = 'q';
-	    *out++ = 'u';
-	    *out++ = 'o';
-	    *out++ = 't';
-	    *out++ = ';';
-	} else if (*cur == '\r') {
-	    *out++ = '&';
-	    *out++ = '#';
-	    *out++ = '1';
-	    *out++ = '3';
-	    *out++ = ';';
-	} else {
-	    /*
-	     * Works because on UTF-8, all extended sequences cannot
-	     * result in bytes in the ASCII range.
-	     */
-	    *out++ = *cur;
-	}
-	cur++;
-    }
-    *out = 0;
-    return(buffer);
-
-mem_error:
-    xmlFree(buffer);
-    return(NULL);
+    return(xmlEscapeText(input, XML_ESCAPE_QUOT | XML_ESCAPE_ALLOW_INVALID));
 }

 /**
--- a/include/private/entities.h
+++ b/include/private/entities.h
@ -21,7 +21,17 @@
 #define XML_ENT_VALIDATED   (1u << 2)
 #define XML_ENT_EXPANDING   (1u << 3)

+#define XML_ESCAPE_ATTR             (1u << 0) /* attribute value; with HTML keeps <!--..--> and &{..} */
+#define XML_ESCAPE_NON_ASCII        (1u << 1) /* write bytes >= 0x80 as &#x..; char refs */
+#define XML_ESCAPE_HTML             (1u << 2) /* HTML rules: '\r' is not escaped */
+#define XML_ESCAPE_QUOT             (1u << 3) /* escape '"' as &quot; */
+#define XML_ESCAPE_ALLOW_INVALID    (1u << 4) /* keep chars that would be dropped or become U+FFFD */
+
 XML_HIDDEN xmlChar *
-xmlEncodeAttributeEntities(xmlDocPtr doc, const xmlChar *input);
+xmlEscapeText(const xmlChar *text, int flags);
+
+XML_HIDDEN xmlChar *
+xmlEncodeEntitiesInternal(xmlDocPtr doc, const xmlChar *input,
+                          unsigned flags);

 #endif /* XML_ENTITIES_H_PRIVATE__ */
--- a/tree.c
+++ b/tree.c
@ -1557,11 +1557,11 @@ xmlNodeListGetStringInternal(xmlDocPtr doc, const xmlNode *node, int escMode) {
                    xmlChar *encoded;

                    if (escMode == 1)
-                        encoded = xmlEncodeEntitiesReentrant(doc,
-                                                             node->content);
+                        encoded = xmlEncodeEntitiesInternal(doc, node->content,
+                                                            0);
                    else if (escMode == 2)
-                        encoded = xmlEncodeAttributeEntities(doc,
-                                                             node->content);
+                        encoded = xmlEncodeEntitiesInternal(doc, node->content,
+                                                            XML_ESCAPE_ATTR);
                    else
                        encoded = xmlEncodeSpecialChars(doc, node->content);
                    if (encoded == NULL)
--- a/xmlIO.c
+++ b/xmlIO.c
@ -44,6 +44,7 @@

 #include "private/buf.h"
 #include "private/enc.h"
+#include "private/entities.h"
 #include "private/error.h"
 #include "private/io.h"

@ -2374,66 +2375,6 @@ xmlOutputBufferWrite(xmlOutputBufferPtr out, int len, const char *data) {
    return(written <= INT_MAX ? written : INT_MAX);
 }

-/**
- * xmlEscapeContent:
- * @out:  a pointer to an array of bytes to store the result
- * @outlen:  the length of @out
- * @in:  a pointer to an array of unescaped UTF-8 bytes
- * @inlen:  the length of @in
- *
- * Take a block of UTF-8 chars in and escape them.
- * Returns 0 if success, or -1 otherwise
- * The value of @inlen after return is the number of octets consumed
- *     if the return value is positive, else unpredictable.
- * The value of @outlen after return is the number of octets consumed.
- */
-static int
-xmlEscapeContent(unsigned char* out, int *outlen,
-                 const xmlChar* in, int *inlen) {
-    unsigned char* outstart = out;
-    const unsigned char* base = in;
-    unsigned char* outend = out + *outlen;
-    const unsigned char* inend;
-
-    inend = in + (*inlen);
-
-    while ((in < inend) && (out < outend)) {
-	if (*in == '<') {
-	    if (outend - out < 4) break;
-	    *out++ = '&';
-	    *out++ = 'l';
-	    *out++ = 't';
-	    *out++ = ';';
-	} else if (*in == '>') {
-	    if (outend - out < 4) break;
-	    *out++ = '&';
-	    *out++ = 'g';
-	    *out++ = 't';
-	    *out++ = ';';
-	} else if (*in == '&') {
-	    if (outend - out < 5) break;
-	    *out++ = '&';
-	    *out++ = 'a';
-	    *out++ = 'm';
-	    *out++ = 'p';
-	    *out++ = ';';
-	} else if (*in == '\r') {
-	    if (outend - out < 5) break;
-	    *out++ = '&';
-	    *out++ = '#';
-	    *out++ = '1';
-	    *out++ = '3';
-	    *out++ = ';';
-	} else {
-	    *out++ = *in;
-	}
-	++in;
-    }
-    *outlen = out - outstart;
-    *inlen = in - base;
-    return(0);
-}
-
 /**
 * xmlOutputBufferWriteEscape:
 * @out:  a buffered parser output
@ -2454,13 +2395,36 @@ xmlOutputBufferWriteEscape(xmlOutputBufferPtr out, const xmlChar *str,
                           xmlCharEncodingOutputFunc escaping) {
    int ret;
    int written = 0;
-    int len;
+    size_t len;

    if ((out == NULL) || (out->error) || (str == NULL))
        return(-1);
-    len = strlen((const char *)str);
-    if (escaping == NULL)
-        escaping = xmlEscapeContent;
+
+    len = strlen((const char *) str);
+    if (len >= INT_MAX) {
+        out->error = XML_ERR_RESOURCE_LIMIT;
+        return(-1);
+    }
+
+    if (escaping == NULL) {
+        char *escaped = (char *) xmlEscapeText(str, XML_ESCAPE_ALLOW_INVALID);
+
+        if (escaped == NULL) {
+            out->error = XML_ERR_NO_MEMORY;
+            return(-1);
+        }
+
+        len = strlen(escaped);
+        if (len >= INT_MAX) {
+            /* too long for the int length; escaped is still freed below */
+            out->error = XML_ERR_RESOURCE_LIMIT;
+            ret = -1;
+        } else {
+            ret = xmlOutputBufferWrite(out, len, escaped);
+        }
+        xmlFree(escaped);
+        return(ret);
+    }

    while (len > 0) {
        xmlChar buf[1024];
--- a/xmlsave.c
+++ b/xmlsave.c
@ -23,6 +23,7 @@

 #include "private/buf.h"
 #include "private/enc.h"
+#include "private/entities.h"
 #include "private/error.h"
 #include "private/io.h"
 #include "private/save.h"
@ -31,9 +32,6 @@

 #define XHTML_NS_NAME BAD_CAST "http://www.w3.org/1999/xhtml"

-#define XML_ESCAPE_ATTR         (1u << 0)
-#define XML_ESCAPE_NON_ASCII    (1u << 1)
-
 struct _xmlSaveCtxt {
    void *_private;
    int type;