1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-04-25 22:50:08 +03:00

html: Use hash table to check for duplicate attributes

This commit is contained in:
Nick Wellnhofer 2024-09-15 20:28:49 +02:00
parent 24a6149fc4
commit 0bc4608c50
3 changed files with 149 additions and 53 deletions

View File

@ -36,6 +36,7 @@
#include <libxml/uri.h> #include <libxml/uri.h>
#include "private/buf.h" #include "private/buf.h"
#include "private/dict.h"
#include "private/enc.h" #include "private/enc.h"
#include "private/error.h" #include "private/error.h"
#include "private/html.h" #include "private/html.h"
@ -2356,10 +2357,11 @@ htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
* Returns the Tag Name parsed or NULL * Returns the Tag Name parsed or NULL
*/ */
static const xmlChar * static xmlHashedString
htmlParseHTMLName(htmlParserCtxtPtr ctxt, int attr) { htmlParseHTMLName(htmlParserCtxtPtr ctxt, int attr) {
xmlHashedString ret;
xmlChar buf[HTML_PARSER_BUFFER_SIZE]; xmlChar buf[HTML_PARSER_BUFFER_SIZE];
const xmlChar *ret, *in; const xmlChar *in;
size_t avail; size_t avail;
int eof = PARSER_PROGRESSIVE(ctxt); int eof = PARSER_PROGRESSIVE(ctxt);
int nbchar = 0; int nbchar = 0;
@ -2436,8 +2438,8 @@ htmlParseHTMLName(htmlParserCtxtPtr ctxt, int attr) {
SHRINK; SHRINK;
ret = xmlDictLookup(ctxt->dict, buf, nbchar); ret = xmlDictLookupHashed(ctxt->dict, buf, nbchar);
if (ret == NULL) if (ret.name == NULL)
htmlErrMemory(ctxt); htmlErrMemory(ctxt);
return(ret); return(ret);
@ -3514,15 +3516,15 @@ bogus:
* Returns the attribute name, and the value in *value. * Returns the attribute name, and the value in *value.
*/ */
static const xmlChar * static xmlHashedString
htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) { htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
const xmlChar *name; xmlHashedString hname;
xmlChar *val = NULL; xmlChar *val = NULL;
*value = NULL; *value = NULL;
name = htmlParseHTMLName(ctxt, 1); hname = htmlParseHTMLName(ctxt, 1);
if (name == NULL) if (hname.name == NULL)
return(NULL); return(hname);
/* /*
* read the value * read the value
@ -3535,7 +3537,7 @@ htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
} }
*value = val; *value = val;
return(name); return(hname);
} }
/** /**
@ -3617,6 +3619,48 @@ htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
} }
/**
 * htmlAttrHashInsert:
 * @ctxt:  parser context
 * @size:  number of buckets in the hash table
 * @name:  attribute name
 * @hashValue:  hash value of name
 * @aindex:  index of the attribute name in ctxt->atts
 *
 * Looks up an attribute name in the hash table ctxt->attrHash and
 * registers it under @aindex if it isn't present yet.
 *
 * The start bucket is @hashValue masked with (@size - 1). Collisions
 * are resolved by probing the following buckets, wrapping around at
 * the end of the table. A bucket with a negative index counts as
 * empty. Names are compared by pointer, not by content.
 *
 * The probe loop only stops at an empty bucket or at a match, so the
 * table must always contain at least one empty bucket.
 *
 * This function doesn't allocate memory and never returns a negative
 * value.
 *
 * Returns INT_MAX if the name was not found and has been inserted,
 * otherwise the index stored for the existing attribute with the
 * same name.
 */
static int
htmlAttrHashInsert(xmlParserCtxtPtr ctxt, unsigned size, const xmlChar *name,
                   unsigned hashValue, int aindex) {
    xmlAttrHashBucket *buckets = ctxt->attrHash;
    unsigned pos = hashValue & (size - 1);

    for (;;) {
        int existing = buckets[pos].index;

        /* Empty bucket: the name is new, claim the slot. */
        if (existing < 0) {
            buckets[pos].index = aindex;
            return(INT_MAX);
        }

        /* Same name pointer: report the attribute seen earlier. */
        if (ctxt->atts[existing] == name)
            return(existing);

        /* Occupied by a different name: try the next bucket. */
        pos += 1;
        if (pos >= size)
            pos = 0;
    }
}
/** /**
* htmlParseStartTag: * htmlParseStartTag:
* @ctxt: an HTML parser context * @ctxt: an HTML parser context
@ -3657,7 +3701,7 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
maxatts = ctxt->maxatts; maxatts = ctxt->maxatts;
GROW; GROW;
name = htmlParseHTMLName(ctxt, 0); name = htmlParseHTMLName(ctxt, 0).name;
if (name == NULL) if (name == NULL)
return; return;
if (xmlStrEqual(name, BAD_CAST"meta")) if (xmlStrEqual(name, BAD_CAST"meta"))
@ -3717,6 +3761,8 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
(CUR != '>') && (CUR != '>') &&
((CUR != '/') || (NXT(1) != '>')) && ((CUR != '/') || (NXT(1) != '>')) &&
(PARSER_STOPPED(ctxt) == 0)) { (PARSER_STOPPED(ctxt) == 0)) {
xmlHashedString hattname;
/* unexpected-solidus-in-tag */ /* unexpected-solidus-in-tag */
if (CUR == '/') { if (CUR == '/') {
SKIP(1); SKIP(1);
@ -3724,55 +3770,50 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
continue; continue;
} }
GROW; GROW;
attname = htmlParseAttribute(ctxt, &attvalue); hattname = htmlParseAttribute(ctxt, &attvalue);
attname = hattname.name;
if (attname != NULL) { if (attname != NULL) {
/*
* Well formedness requires at most one declaration of an attribute
*/
for (i = 0; i < nbatts;i += 2) {
if (xmlStrEqual(atts[i], attname)) {
if (attvalue != NULL)
xmlFree(attvalue);
goto failed;
}
}
/* /*
* Add the pair to atts * Add the pair to atts
*/ */
if (atts == NULL) { if (nbatts + 4 > maxatts) {
maxatts = 22; /* allow for 10 attrs by default */ const xmlChar **tmp;
atts = (const xmlChar **) unsigned *utmp;
xmlMalloc(maxatts * sizeof(xmlChar *)); size_t newSize = maxatts ? maxatts * 2 : 22;
if (atts == NULL) {
htmlErrMemory(ctxt);
if (attvalue != NULL)
xmlFree(attvalue);
goto failed;
}
ctxt->atts = atts;
ctxt->maxatts = maxatts;
} else if (nbatts + 4 > maxatts) {
const xmlChar **n;
maxatts *= 2; tmp = xmlMalloc(newSize * sizeof(tmp[0]));
n = (const xmlChar **) xmlRealloc((void *) atts, if (tmp == NULL) {
maxatts * sizeof(const xmlChar *));
if (n == NULL) {
htmlErrMemory(ctxt); htmlErrMemory(ctxt);
if (attvalue != NULL) if (attvalue != NULL)
xmlFree(attvalue); xmlFree(attvalue);
goto failed; goto failed;
} }
atts = n;
utmp = xmlRealloc(ctxt->attallocs,
newSize / 2 * sizeof(utmp[0]));
if (utmp == NULL) {
htmlErrMemory(ctxt);
if (attvalue != NULL)
xmlFree(attvalue);
xmlFree(tmp);
goto failed;
}
if (maxatts > 0)
memcpy(tmp, atts, maxatts * sizeof(tmp[0]));
xmlFree(atts);
atts = tmp;
maxatts = newSize;
ctxt->atts = atts; ctxt->atts = atts;
ctxt->attallocs = utmp;
ctxt->maxatts = maxatts; ctxt->maxatts = maxatts;
} }
ctxt->attallocs[nbatts/2] = hattname.hashValue;
atts[nbatts++] = attname; atts[nbatts++] = attname;
atts[nbatts++] = attvalue; atts[nbatts++] = attvalue;
atts[nbatts] = NULL;
atts[nbatts + 1] = NULL;
} }
else { else {
if (attvalue != NULL) if (attvalue != NULL)
@ -3789,10 +3830,65 @@ failed:
} }
/* /*
* Handle specific association to the META tag * Verify that attribute names are unique.
*/ */
if (meta && (nbatts != 0)) if (nbatts > 2) {
htmlCheckMeta(ctxt, atts); unsigned attrHashSize;
int j, k;
attrHashSize = 4;
while (attrHashSize / 2 < (unsigned) nbatts / 2)
attrHashSize *= 2;
if (attrHashSize > ctxt->attrHashMax) {
xmlAttrHashBucket *tmp;
tmp = xmlRealloc(ctxt->attrHash, attrHashSize * sizeof(tmp[0]));
if (tmp == NULL) {
htmlErrMemory(ctxt);
goto done;
}
ctxt->attrHash = tmp;
ctxt->attrHashMax = attrHashSize;
}
memset(ctxt->attrHash, -1, attrHashSize * sizeof(ctxt->attrHash[0]));
for (i = 0, j = 0, k = 0; i < nbatts; i += 2, k++) {
unsigned hashValue;
int res;
attname = atts[i];
hashValue = ctxt->attallocs[k] | 0x80000000;
res = htmlAttrHashInsert(ctxt, attrHashSize, attname,
hashValue, j);
if (res < 0)
continue;
if (res == INT_MAX) {
atts[j] = atts[i];
atts[j+1] = atts[i+1];
j += 2;
} else {
xmlFree((xmlChar *) atts[i+1]);
}
}
nbatts = j;
}
if (nbatts > 0) {
atts[nbatts] = NULL;
atts[nbatts + 1] = NULL;
/*
* Handle specific association to the META tag
*/
if (meta)
htmlCheckMeta(ctxt, atts);
}
/* /*
* SAX: Start of Element ! * SAX: Start of Element !
@ -3857,7 +3953,7 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt)
return; return;
} }
name = htmlParseHTMLName(ctxt, 0); name = htmlParseHTMLName(ctxt, 0).name;
if (name == NULL) if (name == NULL)
return; return;

View File

@ -46,6 +46,10 @@
(((ctxt)->input->entity != NULL) && \ (((ctxt)->input->entity != NULL) && \
((ctxt)->input->entity->etype == XML_EXTERNAL_PARAMETER_ENTITY))) ((ctxt)->input->entity->etype == XML_EXTERNAL_PARAMETER_ENTITY)))
/*
 * Bucket of the attribute hash table that is used to detect duplicate
 * attribute names.
 */
struct _xmlAttrHashBucket {
/*
 * In the HTML parser: position of the attribute name in ctxt->atts,
 * or a negative value if the bucket is empty.
 * NOTE(review): the XML parser shares this struct - confirm that it
 * uses the same convention.
 */
int index;
};
XML_HIDDEN void XML_HIDDEN void
xmlCtxtVErr(xmlParserCtxtPtr ctxt, xmlNodePtr node, xmlErrorDomain domain, xmlCtxtVErr(xmlParserCtxtPtr ctxt, xmlNodePtr node, xmlErrorDomain domain,
xmlParserErrors code, xmlErrorLevel level, xmlParserErrors code, xmlErrorLevel level,

View File

@ -116,10 +116,6 @@ struct _xmlParserNsData {
int minNsIndex; int minNsIndex;
}; };
struct _xmlAttrHashBucket {
int index;
};
static int static int
xmlParseElementStart(xmlParserCtxtPtr ctxt); xmlParseElementStart(xmlParserCtxtPtr ctxt);