1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2024-10-26 12:25:09 +03:00

html: Parse named character references according to HTML5

This commit is contained in:
Nick Wellnhofer 2024-09-03 15:52:44 +02:00
parent d5cd0f07f8
commit 5951179239
25 changed files with 23265 additions and 432 deletions

View File

@ -2620,6 +2620,111 @@ htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
return(ret); return(ret);
} }
#include "html5ent.inc"
#define ENT_F_SEMICOLON 0x80u
#define ENT_F_SUBTABLE 0x40u
#define ENT_F_ALL 0xC0u
/**
 * htmlFindEntityPrefix:
 * @string:  candidate entity name; the first byte must be an ASCII letter
 * @slen:  number of readable bytes in @string
 * @isAttr:  non-zero when the reference occurs in an attribute value
 * @nlen:  set to the number of bytes of @string covered by the match,
 *         including the trailing ';' when one is present
 * @rlen:  set to the length in bytes of the replacement
 *
 * Look up a named character reference at the start of @string in the
 * tables from html5ent.inc (htmlEntAlpha, htmlEntValues, htmlEntStrings).
 *
 * As read by this function, a record in htmlEntStrings looks like:
 *
 *   bytes[0]          flags (ENT_F_ALL bits) | length of the name part
 *   bytes[1..len]     name part (the first letter of the entity is not
 *                     stored, it selects the range in htmlEntAlpha)
 *   2 bytes           only with ENT_F_SUBTABLE: offset from the current
 *                     index and number of entries of a sub-table holding
 *                     longer names that continue this one
 *   1 byte + data     replacement length and replacement bytes
 *
 * A name counts as terminated when it is followed by ';'. Names flagged
 * with ENT_F_SEMICOLON are also accepted without ';', except in attribute
 * values when the next byte is alphanumeric or '='. When a name has a
 * sub-table and is followed by an alphanumeric byte, the search descends
 * into it; a terminated match found at a deeper level replaces the
 * shallower one.
 *
 * The function never reads past string[slen - 1] and does not depend on
 * a terminator following the buffer.
 *
 * Returns a pointer to the replacement bytes, or NULL if no entity
 * matches. @nlen and @rlen are only written on success.
 */
static const xmlChar *
htmlFindEntityPrefix(const xmlChar *string, size_t slen, int isAttr,
                     int *nlen, int *rlen) {
    const xmlChar *match = NULL;
    unsigned left, right;
    int first;
    size_t matchLen = 0;
    size_t soff = 1;

    /* Validate the length before touching string[0]. */
    if (slen < 2)
        return(NULL);

    first = string[0];
    if (((first < 'A') || (first > 'Z')) &&
        ((first < 'a') || (first > 'z')))
        return(NULL);

    /*
     * Look up range by first character. Each htmlEntAlpha entry is three
     * bytes: a 16-bit little-endian start index and an 8-bit count.
     */
    first &= 63;
    left = htmlEntAlpha[first*3] | htmlEntAlpha[first*3+1] << 8;
    right = left + htmlEntAlpha[first*3+2];

    /*
     * Binary search. Invariant: soff < slen, so string[soff] is readable.
     */
    while (left < right) {
        const xmlChar *bytes;
        unsigned mid;
        size_t len, avail;
        int cmp;

        mid = left + (right - left) / 2;
        bytes = htmlEntStrings + htmlEntValues[mid];
        len = bytes[0] & ~ENT_F_ALL;
        /* Bytes of input left from soff on, always at least one. */
        avail = slen - soff;

        cmp = string[soff] - bytes[1];

        if (cmp == 0) {
            if (avail < len) {
                /*
                 * Fewer input bytes than name bytes: compare only what
                 * is there. An input that is a proper prefix of the
                 * name can't match it and sorts before it.
                 */
                cmp = strncmp((const char *) string + soff + 1,
                              (const char *) bytes + 2,
                              avail - 1);
                if (cmp == 0)
                    cmp = -1;
            } else {
                cmp = strncmp((const char *) string + soff + 1,
                              (const char *) bytes + 2,
                              len - 1);
            }
        }

        if (cmp < 0) {
            right = mid;
        } else if (cmp > 0) {
            left = mid + 1;
        } else {
            /* Byte following the matched name, 0 at end of input. */
            int term = soff + len < slen ? string[soff + len] : 0;
            int isAlnum, isTerm;

            isAlnum = (((term >= 'A') && (term <= 'Z')) ||
                       ((term >= 'a') && (term <= 'z')) ||
                       ((term >= '0') && (term <= '9')));
            isTerm = ((term == ';') ||
                      ((bytes[0] & ENT_F_SEMICOLON) &&
                       ((!isAttr) ||
                        ((!isAlnum) && (term != '=')))));

            if (isTerm) {
                /* Remember this match, a longer one may still follow. */
                match = bytes + len + 1;
                matchLen = soff + len;
                if (term == ';')
                    matchLen += 1;
            }

            if (bytes[0] & ENT_F_SUBTABLE) {
                /* Skip the two sub-table bytes before the replacement. */
                if (isTerm)
                    match += 2;

                if ((isAlnum) && (soff + len < slen)) {
                    /* Continue with the longer names in the sub-table. */
                    left = mid + bytes[len + 1];
                    right = left + bytes[len + 2];
                    soff += len;
                    continue;
                }
            }

            break;
        }
    }

    if (match == NULL)
        return(NULL);

    *nlen = (int) matchLen;
    *rlen = match[0];
    return(match + 1);
}
/** /**
* htmlParseHTMLAttribute: * htmlParseHTMLAttribute:
@ -2640,9 +2745,6 @@ htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
XML_MAX_HUGE_LENGTH : XML_MAX_HUGE_LENGTH :
XML_MAX_TEXT_LENGTH; XML_MAX_TEXT_LENGTH;
xmlChar *out = NULL; xmlChar *out = NULL;
const xmlChar *name = NULL;
const xmlChar *cur = NULL;
const htmlEntityDesc * ent;
/* /*
* allocate a translation buffer. * allocate a translation buffer.
@ -2662,6 +2764,16 @@ htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
(CUR != 0) && (CUR != stop)) { (CUR != 0) && (CUR != stop)) {
if ((stop == 0) && (CUR == '>')) break; if ((stop == 0) && (CUR == '>')) break;
if ((stop == 0) && (IS_BLANK_CH(CUR))) break; if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
if (out - buffer > buffer_size - 100) {
int indx = out - buffer;
growBuffer(buffer);
out = &buffer[indx];
}
GROW;
if (CUR == '&') { if (CUR == '&') {
if (NXT(1) == '#') { if (NXT(1) == '#') {
unsigned int c; unsigned int c;
@ -2680,70 +2792,28 @@ htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
for ( ; bits >= 0; bits-= 6) { for ( ; bits >= 0; bits-= 6) {
*out++ = ((c >> bits) & 0x3F) | 0x80; *out++ = ((c >> bits) & 0x3F) | 0x80;
} }
if (out - buffer > buffer_size - 100) {
int indx = out - buffer;
growBuffer(buffer);
out = &buffer[indx];
}
} else { } else {
ent = htmlParseEntityRef(ctxt, &name); const xmlChar *repl;
if (name == NULL) { int nameLen, replLen;
*out++ = '&';
if (out - buffer > buffer_size - 100) {
int indx = out - buffer;
growBuffer(buffer); SKIP(1);
out = &buffer[indx]; repl = htmlFindEntityPrefix(CUR_PTR,
} ctxt->input->end - CUR_PTR,
} else if (ent == NULL) { /* isAttr */ 1,
*out++ = '&'; &nameLen, &replLen);
cur = name;
while (*cur != 0) {
if (out - buffer > buffer_size - 100) {
int indx = out - buffer;
growBuffer(buffer); if (repl == NULL) {
out = &buffer[indx]; *out++ = '&';
}
*out++ = *cur++;
}
} else { } else {
unsigned int c; memcpy(out, repl, replLen);
int bits; out += replLen;
SKIP(nameLen);
if (out - buffer > buffer_size - 100) {
int indx = out - buffer;
growBuffer(buffer);
out = &buffer[indx];
}
c = ent->value;
if (c < 0x80)
{ *out++ = c; bits= -6; }
else if (c < 0x800)
{ *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
else if (c < 0x10000)
{ *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
else
{ *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
for ( ; bits >= 0; bits-= 6) {
*out++ = ((c >> bits) & 0x3F) | 0x80;
}
} }
} }
} else { } else {
unsigned int c; unsigned int c;
int bits, l; int bits, l;
if (out - buffer > buffer_size - 100) {
int indx = out - buffer;
growBuffer(buffer);
out = &buffer[indx];
}
c = CUR_CHAR(l); c = CUR_CHAR(l);
if (c < 0x80) if (c < 0x80)
{ *out++ = c; bits= -6; } { *out++ = c; bits= -6; }
@ -4086,9 +4156,7 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt)
*/ */
static void static void
htmlParseReference(htmlParserCtxtPtr ctxt) { htmlParseReference(htmlParserCtxtPtr ctxt) {
const htmlEntityDesc * ent;
xmlChar out[6]; xmlChar out[6];
const xmlChar *name;
if (CUR != '&') return; if (CUR != '&') return;
if (NXT(1) == '#') { if (NXT(1) == '#') {
@ -4113,42 +4181,24 @@ htmlParseReference(htmlParserCtxtPtr ctxt) {
if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
ctxt->sax->characters(ctxt->userData, out, i); ctxt->sax->characters(ctxt->userData, out, i);
} else { } else {
ent = htmlParseEntityRef(ctxt, &name); const xmlChar *repl;
if (name == NULL) { int nameLen, replLen;
htmlCheckParagraph(ctxt); htmlCheckParagraph(ctxt);
SKIP(1);
repl = htmlFindEntityPrefix(CUR_PTR,
ctxt->input->end - CUR_PTR,
/* isAttr */ 0,
&nameLen, &replLen);
if (repl == NULL) {
if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1); ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
return;
}
if ((ent == NULL) || !(ent->value > 0)) {
htmlCheckParagraph(ctxt);
if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
/* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
}
} else { } else {
unsigned int c;
int bits, i = 0;
c = ent->value;
if (c < 0x80)
{ out[i++]= c; bits= -6; }
else if (c < 0x800)
{ out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
else if (c < 0x10000)
{ out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
else
{ out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
for ( ; bits >= 0; bits-= 6) {
out[i++]= ((c >> bits) & 0x3F) | 0x80;
}
out[i] = 0;
htmlCheckParagraph(ctxt);
if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
ctxt->sax->characters(ctxt->userData, out, i); ctxt->sax->characters(ctxt->userData, repl, replLen);
SKIP(nameLen);
} }
} }
} }

1607
html5ent.inc Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1,2 +1,2 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd"> <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html><body><p>&amp;&Ugrave;</p></body></html> <html><body><p>&amp;j&Ugrave;</p></body></html>

View File

@ -1,3 +0,0 @@
./test/HTML/758518-entity.html:1: HTML parser error : htmlParseEntityRef: expecting ';'
Ù
^

View File

@ -1,11 +1,10 @@
SAX.setDocumentLocator() SAX.setDocumentLocator()
SAX.startDocument() SAX.startDocument()
SAX.error: htmlParseEntityRef: expecting ';'
SAX.startElement(html) SAX.startElement(html)
SAX.startElement(body) SAX.startElement(body)
SAX.startElement(p) SAX.startElement(p)
SAX.characters(&amp;, 1) SAX.characters(&amp;, 1)
SAX.characters(&Ugrave;, 2) SAX.characters(j&Ugrave;, 3)
SAX.endElement(p) SAX.endElement(p)
SAX.endElement(body) SAX.endElement(body)
SAX.endElement(html) SAX.endElement(html)

View File

@ -1,3 +1,3 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd"> <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html><body><p>&amp;&ecirc; <html><body><p>&amp;:&ecirc;
</p></body></html> </p></body></html>

View File

@ -1,3 +0,0 @@
./test/HTML/758605.html:1: HTML parser error : htmlParseEntityRef: expecting ';'
ê
^

View File

@ -1,13 +1,11 @@
SAX.setDocumentLocator() SAX.setDocumentLocator()
SAX.startDocument() SAX.startDocument()
SAX.error: htmlParseEntityRef: expecting ';'
SAX.startElement(html) SAX.startElement(html)
SAX.startElement(body) SAX.startElement(body)
SAX.startElement(p) SAX.startElement(p)
SAX.characters(&amp;, 1) SAX.characters(&amp;, 1)
SAX.characters(&ecirc;, 2) SAX.characters(:&ecirc;
SAX.characters( , 4)
, 1)
SAX.endElement(p) SAX.endElement(p)
SAX.endElement(body) SAX.endElement(body)
SAX.endElement(html) SAX.endElement(html)

View File

@ -0,0 +1,8 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html>
<body>
<a href="index.cgi?a&amp;lt=1&amp;gt=2">link</a>
<a href="index.cgi?a&amp;lta&amp;gta">link</a>
<a href="index.cgi?a&lt;&gt;">link</a>
</body>
</html>

View File

@ -0,0 +1,30 @@
SAX.setDocumentLocator()
SAX.startDocument()
SAX.startElement(html)
SAX.characters(
, 1)
SAX.startElement(body)
SAX.characters(
, 1)
SAX.startElement(a, href='index.cgi?a&amp;lt=1&amp;gt=2')
SAX.characters(link, 4)
SAX.endElement(a)
SAX.characters(
, 1)
SAX.startElement(a, href='index.cgi?a&amp;lta&amp;gta')
SAX.characters(link, 4)
SAX.endElement(a)
SAX.characters(
, 1)
SAX.startElement(a, href='index.cgi?a&lt;&gt;')
SAX.characters(link, 4)
SAX.endElement(a)
SAX.characters(
, 1)
SAX.endElement(body)
SAX.characters(
, 1)
SAX.endElement(html)
SAX.characters(
, 1)
SAX.endDocument()

View File

@ -1,15 +1,6 @@
./test/HTML/doc3.htm:10: HTML parser error : Misplaced DOCTYPE declaration ./test/HTML/doc3.htm:10: HTML parser error : Misplaced DOCTYPE declaration
<!-- END Naviscope Javascript --><!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2//EN <!-- END Naviscope Javascript --><!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2//EN
^ ^
./test/HTML/doc3.htm:52: HTML parser error : htmlParseEntityRef: expecting ';'
href="http://ads.gamesquad.net/addclick.exe/adclick.cgi?REGION=game|tech|ent&id
^
./test/HTML/doc3.htm:52: HTML parser error : htmlParseEntityRef: expecting ';'
_top"><img src="http://ads.gamesquad.net/addclick.exe/adcycle.cgi?group=52&media
^
./test/HTML/doc3.htm:52: HTML parser error : htmlParseEntityRef: expecting ';'
><img src="http://ads.gamesquad.net/addclick.exe/adcycle.cgi?group=52&media=1&id
^
./test/HTML/doc3.htm:148: HTML parser error : Unexpected end tag : p ./test/HTML/doc3.htm:148: HTML parser error : Unexpected end tag : p
</P></TD></TR></TBODY></TABLE></CENTER></TD></TR></TBODY></TABLE></CENTER></P> </P></TD></TR></TBODY></TABLE></CENTER></TD></TR></TBODY></TABLE></CENTER></P>
^ ^
@ -19,12 +10,6 @@ _top"><img src="http://ads.gamesquad.net/addclick.exe/adcycle.cgi?group=52&media
./test/HTML/doc3.htm:236: HTML parser error : Unexpected end tag : a ./test/HTML/doc3.htm:236: HTML parser error : Unexpected end tag : a
Specials<BR><BR></FONT></A><BR></FONT></A><B><FONT color=yellow Specials<BR><BR></FONT></A><BR></FONT></A><B><FONT color=yellow
^ ^
./test/HTML/doc3.htm:747: HTML parser error : htmlParseEntityRef: expecting ';'
er=0 alt="Advertisement" src="http://ads.adflight.com/ad_static.asp?pid=2097&sid
^
./test/HTML/doc3.htm:747: HTML parser error : htmlParseEntityRef: expecting ';'
Advertisement" src="http://ads.adflight.com/ad_static.asp?pid=2097&sid=1881&asid
^
./test/HTML/doc3.htm:747: HTML parser error : Unexpected end tag : li ./test/HTML/doc3.htm:747: HTML parser error : Unexpected end tag : li
light.com/ad_static.asp?pid=2097&sid=1881&asid=7708"></a></IFRAME></CENTER></LI> light.com/ad_static.asp?pid=2097&sid=1881&asid=7708"></a></IFRAME></CENTER></LI>
^ ^

View File

@ -85,10 +85,7 @@ SAX.comment( © 2000 GameSquad.net All Rights Reserved. )
SAX.startElement(iframe, border='0', frameborder='no', height='60', marginheight='0', marginwidth='0', scrolling='no', src='doc3_files/adcycle.htm', width='468') SAX.startElement(iframe, border='0', frameborder='no', height='60', marginheight='0', marginwidth='0', scrolling='no', src='doc3_files/adcycle.htm', width='468')
SAX.characters( SAX.characters(
, 1) , 1)
SAX.error: htmlParseEntityRef: expecting ';'
SAX.startElement(a, href='http://ads.gamesquad.net/addclick.exe/adclick.cgi?REGION=game|tech|ent&amp;id=1', target='_top') SAX.startElement(a, href='http://ads.gamesquad.net/addclick.exe/adclick.cgi?REGION=game|tech|ent&amp;id=1', target='_top')
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.startElement(img, src='http://ads.gamesquad.net/addclick.exe/adcycle.cgi?group=52&amp;media=1&amp;id=1', width='468', height='60', border='0', alt='GSN ROS Ad') SAX.startElement(img, src='http://ads.gamesquad.net/addclick.exe/adcycle.cgi?group=52&amp;media=1&amp;id=1', width='468', height='60', border='0', alt='GSN ROS Ad')
SAX.endElement(img) SAX.endElement(img)
SAX.endElement(a) SAX.endElement(a)
@ -2567,8 +2564,6 @@ SAX.endElement(font)
SAX.startElement(center) SAX.startElement(center)
SAX.startElement(iframe, frameborder='0', height='60', marginheight='0', marginwidth='0', noresize, scrolling='no', src='doc3_files/ad_iframe.htm', width='468') SAX.startElement(iframe, frameborder='0', height='60', marginheight='0', marginwidth='0', noresize, scrolling='no', src='doc3_files/ad_iframe.htm', width='468')
SAX.startElement(a, href='http://ads.adflight.com/go_static.asp?asid=7708', target='_top') SAX.startElement(a, href='http://ads.adflight.com/go_static.asp?asid=7708', target='_top')
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.startElement(img, width='468', height='60', border='0', alt='Advertisement', src='http://ads.adflight.com/ad_static.asp?pid=2097&amp;sid=1881&amp;asid=7708') SAX.startElement(img, width='468', height='60', border='0', alt='Advertisement', src='http://ads.adflight.com/ad_static.asp?pid=2097&amp;sid=1881&amp;asid=7708')
SAX.endElement(img) SAX.endElement(img)
SAX.endElement(a) SAX.endElement(a)

View File

@ -1,12 +0,0 @@
./test/HTML/entities.html:1: HTML parser error : htmlParseEntityRef: expecting ';'
<p tst="a&amp;b" tst2="a&b" tst3="a & b">
^
./test/HTML/entities.html:1: HTML parser error : htmlParseEntityRef: no name
<p tst="a&amp;b" tst2="a&b" tst3="a & b">
^
./test/HTML/entities.html:3: HTML parser error : htmlParseEntityRef: expecting ';'
a&b
^
./test/HTML/entities.html:4: HTML parser error : htmlParseEntityRef: no name
a & b
^

View File

@ -2,20 +2,15 @@ SAX.setDocumentLocator()
SAX.startDocument() SAX.startDocument()
SAX.startElement(html) SAX.startElement(html)
SAX.startElement(body) SAX.startElement(body)
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: no name
SAX.startElement(p, tst='a&amp;b', tst2='a&amp;b', tst3='a &amp; b') SAX.startElement(p, tst='a&amp;b', tst2='a&amp;b', tst3='a &amp; b')
SAX.characters( SAX.characters(
a, 2) a, 2)
SAX.characters(&amp;, 1) SAX.characters(&amp;, 1)
SAX.characters(b SAX.characters(b
a, 3) a, 3)
SAX.error: htmlParseEntityRef: expecting ';'
SAX.characters(&amp;, 1) SAX.characters(&amp;, 1)
SAX.characters(b, 1) SAX.characters(b
SAX.characters( a , 4)
a , 3)
SAX.error: htmlParseEntityRef: no name
SAX.characters(&amp;, 1) SAX.characters(&amp;, 1)
SAX.characters( b SAX.characters( b
, 3) , 3)

View File

@ -1,3 +0,0 @@
./test/HTML/fp40.htm:153: HTML parser error : htmlParseEntityRef: no name
technical articles from Microsoft's extensive Knowledge Base, FAQs, & troublesho
^

View File

@ -422,7 +422,6 @@ SAX.characters(
, 2) , 2)
SAX.startElement(p) SAX.startElement(p)
SAX.characters(For further technical informat, 254) SAX.characters(For further technical informat, 254)
SAX.error: htmlParseEntityRef: no name
SAX.characters(&amp;, 1) SAX.characters(&amp;, 1)
SAX.characters( troubleshooters to find SAX.characters( troubleshooters to find
fast, 302) fast, 302)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,15 +1,3 @@
./test/HTML/utf8bug.html:45: HTML parser error : htmlParseEntityRef: expecting ';'
<img src="showimage.aspx?path=Files_Upload\192.png&width=%>" border="0" />
^
./test/HTML/utf8bug.html:118: HTML parser error : htmlParseEntityRef: expecting ';'
<a href="showimage.aspx?path=Files_Upload\302.JPG&Width=" rel="lightbox" tit
^
./test/HTML/utf8bug.html:119: HTML parser error : htmlParseEntityRef: expecting ';'
<img src="showimage.aspx?path=Files_Upload\302.JPG&Width=220" align="left" b
^
./test/HTML/utf8bug.html:121: HTML parser error : Tag s1 invalid ./test/HTML/utf8bug.html:121: HTML parser error : Tag s1 invalid
ز همکاران است. روی آن کلیک کند.</FONT></FONT></STRONG><S1 ز همکاران است. روی آن کلیک کند.</FONT></FONT></STRONG><S1
^ ^
./test/HTML/utf8bug.html:177: HTML parser error : htmlParseEntityRef: expecting ';'
ین پاسخ را برای نویسنده مقاله رجانیوز copy&paste
^

View File

@ -146,7 +146,6 @@ SAX.startElement(a, href='RSS2.asp')
SAX.characters( SAX.characters(
, 2) , 2)
SAX.error: htmlParseEntityRef: expecting ';'
SAX.startElement(img, src='showimage.aspx?path=Files_Upload\192.png&amp;width=%&gt;', border='0') SAX.startElement(img, src='showimage.aspx?path=Files_Upload\192.png&amp;width=%&gt;', border='0')
SAX.endElement(img) SAX.endElement(img)
SAX.characters( SAX.characters(
@ -400,11 +399,9 @@ SAX.startElement(div, class='Image')
SAX.characters( SAX.characters(
, 10) , 10)
SAX.error: htmlParseEntityRef: expecting ';'
SAX.startElement(a, href='showimage.aspx?path=Files_Upload\302.JPG&amp;Width=', rel='lightbox', title='&#1588;&#1607;&#1610;&#1585; &#1576;&#1604;&#1575;&#1711;') SAX.startElement(a, href='showimage.aspx?path=Files_Upload\302.JPG&amp;Width=', rel='lightbox', title='&#1588;&#1607;&#1610;&#1585; &#1576;&#1604;&#1575;&#1711;')
SAX.characters( SAX.characters(
, 5) , 5)
SAX.error: htmlParseEntityRef: expecting ';'
SAX.startElement(img, src='showimage.aspx?path=Files_Upload\302.JPG&amp;Width=220', align='left', border='1') SAX.startElement(img, src='showimage.aspx?path=Files_Upload\302.JPG&amp;Width=220', align='left', border='1')
SAX.endElement(img) SAX.endElement(img)
SAX.characters( SAX.characters(
@ -665,10 +662,8 @@ SAX.endElement(font)
SAX.startElement(br) SAX.startElement(br)
SAX.endElement(br) SAX.endElement(br)
SAX.characters(&#1587;&#1604;&#1575;&#1605; , 834) SAX.characters(&#1587;&#1604;&#1575;&#1605; , 834)
SAX.error: htmlParseEntityRef: expecting ';'
SAX.characters(&amp;, 1) SAX.characters(&amp;, 1)
SAX.characters(paste, 5) SAX.characters(paste , 6)
SAX.characters( , 1)
SAX.startElement(br) SAX.startElement(br)
SAX.endElement(br) SAX.endElement(br)
SAX.characters( SAX.characters(

View File

@ -1,201 +1,15 @@
./test/HTML/wired.html:6: HTML parser error : htmlParseEntityRef: expecting ';'
<FORM METHOD=GET ACTION="http://nsads.hotwired.com/event.ng/Type=click&ProfileID
^
./test/HTML/wired.html:6: HTML parser error : htmlParseEntityRef: expecting ';'
D=GET ACTION="http://nsads.hotwired.com/event.ng/Type=click&ProfileID=9688&RunID
^
./test/HTML/wired.html:6: HTML parser error : htmlParseEntityRef: expecting ';'
N="http://nsads.hotwired.com/event.ng/Type=click&ProfileID=9688&RunID=14074&AdID
^
./test/HTML/wired.html:6: HTML parser error : htmlParseEntityRef: expecting ';'
s.hotwired.com/event.ng/Type=click&ProfileID=9688&RunID=14074&AdID=22584&GroupID
^
./test/HTML/wired.html:6: HTML parser error : htmlParseEntityRef: expecting ';'
com/event.ng/Type=click&ProfileID=9688&RunID=14074&AdID=22584&GroupID=1&FamilyID
^
./test/HTML/wired.html:6: HTML parser error : htmlParseEntityRef: expecting ';'
pe=click&ProfileID=9688&RunID=14074&AdID=22584&GroupID=1&FamilyID=2684&TagValues
^
./test/HTML/wired.html:52: HTML parser error : htmlParseEntityRef: expecting ';'
" align="RIGHT"><a href="http://nsads.hotwired.com/event.ng/Type=click&ProfileID
^
./test/HTML/wired.html:52: HTML parser error : htmlParseEntityRef: expecting ';'
GHT"><a href="http://nsads.hotwired.com/event.ng/Type=click&ProfileID=5597&RunID
^
./test/HTML/wired.html:52: HTML parser error : htmlParseEntityRef: expecting ';'
f="http://nsads.hotwired.com/event.ng/Type=click&ProfileID=5597&RunID=17167&AdID
^
./test/HTML/wired.html:52: HTML parser error : htmlParseEntityRef: expecting ';'
s.hotwired.com/event.ng/Type=click&ProfileID=5597&RunID=17167&AdID=22588&GroupID
^
./test/HTML/wired.html:52: HTML parser error : htmlParseEntityRef: expecting ';'
com/event.ng/Type=click&ProfileID=5597&RunID=17167&AdID=22588&GroupID=1&FamilyID
^
./test/HTML/wired.html:52: HTML parser error : htmlParseEntityRef: expecting ';'
pe=click&ProfileID=5597&RunID=17167&AdID=22588&GroupID=1&FamilyID=3228&TagValues
^
./test/HTML/wired.html:70: HTML parser error : Tag nobr invalid ./test/HTML/wired.html:70: HTML parser error : Tag nobr invalid
<td bgcolor="#FF0000" align="left" valign="center"><nobr><img src="http://static <td bgcolor="#FF0000" align="left" valign="center"><nobr><img src="http://static
^ ^
./test/HTML/wired.html:89: HTML parser error : htmlParseEntityRef: expecting ';'
on value="http://search.hotwired.com/search97/s97.vts?Action=FilterSearch&Filter
^
./test/HTML/wired.html:89: HTML parser error : htmlParseEntityRef: expecting ';'
d.com/search97/s97.vts?Action=FilterSearch&Filter=docs_filter.hts&ResultTemplate
^
./test/HTML/wired.html:89: HTML parser error : htmlParseEntityRef: expecting ';'
ction=FilterSearch&Filter=docs_filter.hts&ResultTemplate=vignette.hts&Collection
^
./test/HTML/wired.html:89: HTML parser error : htmlParseEntityRef: expecting ';'
Filter=docs_filter.hts&ResultTemplate=vignette.hts&Collection=vignette&QueryMode
^
./test/HTML/wired.html:89: HTML parser error : htmlParseEntityRef: expecting ';'
ter.hts&ResultTemplate=vignette.hts&Collection=vignette&QueryMode=Internet&Query
^
./test/HTML/wired.html:90: HTML parser error : htmlParseEntityRef: expecting ';'
on value="http://search.hotwired.com/search97/s97.vts?Action=FilterSearch&Filter
^
./test/HTML/wired.html:90: HTML parser error : htmlParseEntityRef: expecting ';'
d.com/search97/s97.vts?Action=FilterSearch&Filter=docs_filter.hts&ResultTemplate
^
./test/HTML/wired.html:90: HTML parser error : htmlParseEntityRef: expecting ';'
tion=FilterSearch&Filter=docs_filter.hts&ResultTemplate=webmonkey.hts&Collection
^
./test/HTML/wired.html:90: HTML parser error : htmlParseEntityRef: expecting ';'
lter=docs_filter.hts&ResultTemplate=webmonkey.hts&Collection=webmonkey&QueryMode
^
./test/HTML/wired.html:90: HTML parser error : htmlParseEntityRef: expecting ';'
r.hts&ResultTemplate=webmonkey.hts&Collection=webmonkey&QueryMode=Internet&Query
^
./test/HTML/wired.html:91: HTML parser error : htmlParseEntityRef: expecting ';'
="http://search.hotwired.com/search97/s97.vts?collection=webmonkey_guides&Action
^
./test/HTML/wired.html:91: HTML parser error : htmlParseEntityRef: expecting ';'
ired.com/search97/s97.vts?collection=webmonkey_guides&Action=FilterSearch&filter
^
./test/HTML/wired.html:91: HTML parser error : htmlParseEntityRef: expecting ';'
ction=webmonkey_guides&Action=FilterSearch&filter=docs_filter.hts&ResultTemplate
^
./test/HTML/wired.html:91: HTML parser error : htmlParseEntityRef: expecting ';'
ilterSearch&filter=docs_filter.hts&ResultTemplate=webmonkey_guides.hts&QueryMode
^
./test/HTML/wired.html:91: HTML parser error : htmlParseEntityRef: expecting ';'
ter=docs_filter.hts&ResultTemplate=webmonkey_guides.hts&QueryMode=Internet&Query
^
./test/HTML/wired.html:92: HTML parser error : htmlParseEntityRef: expecting ';'
on value="http://search.hotwired.com/search97/s97.vts?collection=hotwired&Action
^
./test/HTML/wired.html:92: HTML parser error : htmlParseEntityRef: expecting ';'
rch.hotwired.com/search97/s97.vts?collection=hotwired&Action=FilterSearch&filter
^
./test/HTML/wired.html:92: HTML parser error : htmlParseEntityRef: expecting ';'
ts?collection=hotwired&Action=FilterSearch&filter=docs_filter.hts&ResultTemplate
^
./test/HTML/wired.html:92: HTML parser error : htmlParseEntityRef: expecting ';'
ilterSearch&filter=docs_filter.hts&ResultTemplate=hotwired_archive.hts&QueryMode
^
./test/HTML/wired.html:92: HTML parser error : htmlParseEntityRef: expecting ';'
ter=docs_filter.hts&ResultTemplate=hotwired_archive.hts&QueryMode=Internet&Query
^
./test/HTML/wired.html:93: HTML parser error : htmlParseEntityRef: expecting ';'
on value="http://search.hotwired.com/search97/s97.vts?Action=FilterSearch&Filter
^
./test/HTML/wired.html:93: HTML parser error : htmlParseEntityRef: expecting ';'
d.com/search97/s97.vts?Action=FilterSearch&Filter=docs_filter.hts&ResultTemplate
^
./test/HTML/wired.html:93: HTML parser error : htmlParseEntityRef: expecting ';'
ction=FilterSearch&Filter=docs_filter.hts&ResultTemplate=magazine.hts&Collection
^
./test/HTML/wired.html:93: HTML parser error : htmlParseEntityRef: expecting ';'
Filter=docs_filter.hts&ResultTemplate=magazine.hts&Collection=magazine&QueryMode
^
./test/HTML/wired.html:93: HTML parser error : htmlParseEntityRef: expecting ';'
ter.hts&ResultTemplate=magazine.hts&Collection=magazine&QueryMode=Internet&Query
^
./test/HTML/wired.html:94: HTML parser error : htmlParseEntityRef: expecting ';'
on value="http://search.hotwired.com/search97/s97.vts?Action=FilterSearch&Filter
^
./test/HTML/wired.html:94: HTML parser error : htmlParseEntityRef: expecting ';'
d.com/search97/s97.vts?Action=FilterSearch&Filter=docs_filter.hts&ResultTemplate
^
./test/HTML/wired.html:94: HTML parser error : htmlParseEntityRef: expecting ';'
tion=FilterSearch&Filter=docs_filter.hts&ResultTemplate=animation.hts&Collection
^
./test/HTML/wired.html:94: HTML parser error : htmlParseEntityRef: expecting ';'
lter=docs_filter.hts&ResultTemplate=animation.hts&Collection=animation&QueryMode
^
./test/HTML/wired.html:94: HTML parser error : htmlParseEntityRef: expecting ';'
r.hts&ResultTemplate=animation.hts&Collection=animation&QueryMode=Internet&Query
^
./test/HTML/wired.html:95: HTML parser error : htmlParseEntityRef: expecting ';'
option value="http://search.hotwired.com/search97/s97.vts?collection=suck&Action
^
./test/HTML/wired.html:95: HTML parser error : htmlParseEntityRef: expecting ';'
/search.hotwired.com/search97/s97.vts?collection=suck&Action=FilterSearch&filter
^
./test/HTML/wired.html:95: HTML parser error : htmlParseEntityRef: expecting ';'
97.vts?collection=suck&Action=FilterSearch&filter=docs_filter.hts&ResultTemplate
^
./test/HTML/wired.html:95: HTML parser error : htmlParseEntityRef: expecting ';'
uck&Action=FilterSearch&filter=docs_filter.hts&ResultTemplate=suck.hts&QueryMode
^
./test/HTML/wired.html:95: HTML parser error : htmlParseEntityRef: expecting ';'
erSearch&filter=docs_filter.hts&ResultTemplate=suck.hts&QueryMode=Internet&Query
^
./test/HTML/wired.html:96: HTML parser error : htmlParseEntityRef: expecting ';'
lue="http://search.hotwired.com/search97/s97.vts?collection=uber_hotwired&Action
^
./test/HTML/wired.html:96: HTML parser error : htmlParseEntityRef: expecting ';'
otwired.com/search97/s97.vts?collection=uber_hotwired&Action=FilterSearch&filter
^
./test/HTML/wired.html:96: HTML parser error : htmlParseEntityRef: expecting ';'
llection=uber_hotwired&Action=FilterSearch&filter=docs_filter.hts&ResultTemplate
^
./test/HTML/wired.html:96: HTML parser error : htmlParseEntityRef: expecting ';'
n=FilterSearch&filter=docs_filter.hts&ResultTemplate=uber_hotwired.hts&QueryMode
^
./test/HTML/wired.html:96: HTML parser error : htmlParseEntityRef: expecting ';'
filter=docs_filter.hts&ResultTemplate=uber_hotwired.hts&QueryMode=Internet&Query
^
./test/HTML/wired.html:97: HTML parser error : htmlParseEntityRef: expecting ';'
<option value="http://www.hotbot.com/?SM=MC&DV=0&LG=any&RD=RG&DC=10&DE=2&_v=2&O
^
./test/HTML/wired.html:97: HTML parser error : htmlParseEntityRef: expecting ';'
<option value="http://www.hotbot.com/?SM=MC&DV=0&LG=any&RD=RG&DC=10&DE=2&_v=2&O
^
./test/HTML/wired.html:97: HTML parser error : htmlParseEntityRef: expecting ';'
<option value="http://www.hotbot.com/?SM=MC&DV=0&LG=any&RD=RG&DC=10&DE=2&_v=2&O
^
./test/HTML/wired.html:97: HTML parser error : htmlParseEntityRef: expecting ';'
<option value="http://www.hotbot.com/?SM=MC&DV=0&LG=any&RD=RG&DC=10&DE=2&_v=2&O
^
./test/HTML/wired.html:97: HTML parser error : htmlParseEntityRef: expecting ';'
<option value="http://www.hotbot.com/?SM=MC&DV=0&LG=any&RD=RG&DC=10&DE=2&_v=2&O
^
./test/HTML/wired.html:97: HTML parser error : htmlParseEntityRef: expecting ';'
<option value="http://www.hotbot.com/?SM=MC&DV=0&LG=any&RD=RG&DC=10&DE=2&_v=2&O
^
./test/HTML/wired.html:97: HTML parser error : htmlParseEntityRef: expecting ';'
option value="http://www.hotbot.com/?SM=MC&DV=0&LG=any&RD=RG&DC=10&DE=2&_v=2&OPs
^
./test/HTML/wired.html:97: HTML parser error : htmlParseEntityRef: expecting ';'
lue="http://www.hotbot.com/?SM=MC&DV=0&LG=any&RD=RG&DC=10&DE=2&_v=2&OPs=MDRTP&MT
^
./test/HTML/wired.html:170: HTML parser error : Unexpected end tag : form ./test/HTML/wired.html:170: HTML parser error : Unexpected end tag : form
</tr> </form> </tr> </form>
^ ^
./test/HTML/wired.html:248: HTML parser error : htmlParseEntityRef: expecting ';'
MG SRC="http://barnesandnoble.bfast.com/booklink/serve?sourceid=383471&is_search
^
./test/HTML/wired.html:265: HTML parser error : Unexpected end tag : form ./test/HTML/wired.html:265: HTML parser error : Unexpected end tag : form
</tr> </form> </tr> </form>
^ ^
./test/HTML/wired.html:346: HTML parser error : Opening and ending tag mismatch: td and font ./test/HTML/wired.html:346: HTML parser error : Opening and ending tag mismatch: td and font
</td> </td>
^ ^
./test/HTML/wired.html:374: HTML parser error : htmlParseEntityRef: no name
a, sans-serif"><b><a href="/news/commentarySection/0,1292,31926,00.html">Rants &
^
./test/HTML/wired.html:374: HTML parser error : Opening and ending tag mismatch: td and font ./test/HTML/wired.html:374: HTML parser error : Opening and ending tag mismatch: td and font
Readers on Apple's G4 ... AOL's passwords ... MS vs. Linux.</font><br><br> </td Readers on Apple's G4 ... AOL's passwords ... MS vs. Linux.</font><br><br> </td
^ ^
@ -205,15 +19,6 @@ Readers on Apple's G4 ... AOL's passwords ... MS vs. Linux.</font><br><br> </td
./test/HTML/wired.html:402: HTML parser error : Opening and ending tag mismatch: a and font ./test/HTML/wired.html:402: HTML parser error : Opening and ending tag mismatch: a and font
w.vignette.com/" style="text-decoration:none"><font color="#000000">Vignette</a> w.vignette.com/" style="text-decoration:none"><font color="#000000">Vignette</a>
^ ^
./test/HTML/wired.html:407: HTML parser error : htmlParseEntityRef: expecting ';'
ervlet/appservlet?from=/wired/sprint/&template=/security/security.html&SITE=
^
./test/HTML/wired.html:407: HTML parser error : htmlParseEntityRef: expecting ';'
ervlet/appservlet?from=/wired/sprint/&template=/security/security.html&SITE=
^
./test/HTML/wired.html:408: HTML parser error : htmlParseEntityRef: expecting ';'
wired.com&BANNER=Sprint" style="text-decoration:none"><font color="#000000">Spri
^
./test/HTML/wired.html:408: HTML parser error : Opening and ending tag mismatch: a and font ./test/HTML/wired.html:408: HTML parser error : Opening and ending tag mismatch: a and font
com&BANNER=Sprint" style="text-decoration:none"><font color="#000000">Sprint</a> com&BANNER=Sprint" style="text-decoration:none"><font color="#000000">Sprint</a>
^ ^
@ -250,6 +55,3 @@ com&BANNER=Sprint" style="text-decoration:none"><font color="#000000">Sprint</a>
./test/HTML/wired.html:414: HTML parser error : Opening and ending tag mismatch: td and font ./test/HTML/wired.html:414: HTML parser error : Opening and ending tag mismatch: td and font
</td> </td>
^ ^
./test/HTML/wired.html:432: HTML parser error : htmlParseEntityRef: expecting ';'
href="http://www.lycos.com/news/flash/hitlerbunker.html?v=wn1015&lpv=1">Lycos</a
^

View File

@ -19,12 +19,6 @@ SAX.characters(
, 5) , 5)
SAX.startElement(td, valign='top', align='LEFT') SAX.startElement(td, valign='top', align='LEFT')
SAX.startElement(table, border='0', cellpadding='0', cellspacing='0', width='468', height='60', bgcolor='#FFFFFF') SAX.startElement(table, border='0', cellpadding='0', cellspacing='0', width='468', height='60', bgcolor='#FFFFFF')
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.startElement(form, method='GET', action='http://nsads.hotwired.com/event.ng/Type=click&amp;ProfileID=9688&amp;RunID=14074&amp;AdID=22584&amp;GroupID=1&amp;FamilyID=2684&amp;TagValues=8.25.156.159.166.171.172.174.179.180.181.182.183.196.197.199.208.389.412.436.2041.6750.78456.79630.81880&amp;Redirect=http://www.springstreet.com/aa/citysearch.htm', id='form1', name='form1') SAX.startElement(form, method='GET', action='http://nsads.hotwired.com/event.ng/Type=click&amp;ProfileID=9688&amp;RunID=14074&amp;AdID=22584&amp;GroupID=1&amp;FamilyID=2684&amp;TagValues=8.25.156.159.166.171.172.174.179.180.181.182.183.196.197.199.208.389.412.436.2041.6750.78456.79630.81880&amp;Redirect=http://www.springstreet.com/aa/citysearch.htm', id='form1', name='form1')
SAX.characters( SAX.characters(
, 2) , 2)
@ -298,12 +292,6 @@ SAX.endElement(td)
SAX.characters( SAX.characters(
, 5) , 5)
SAX.startElement(td, valign='top', align='RIGHT') SAX.startElement(td, valign='top', align='RIGHT')
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.startElement(a, href='http://nsads.hotwired.com/event.ng/Type=click&amp;ProfileID=5597&amp;RunID=17167&amp;AdID=22588&amp;GroupID=1&amp;FamilyID=3228&amp;TagValues=8.25.159.171.172.174.179.180.181.182.183.196.197.199.208.241.389.412.436.2035.6749.6750.70367.78456.79630.81880&amp;Redirect=http:%2F%2Fwww.hp.com%2Fgo%2Foriginal%20', target='_top') SAX.startElement(a, href='http://nsads.hotwired.com/event.ng/Type=click&amp;ProfileID=5597&amp;RunID=17167&amp;AdID=22588&amp;GroupID=1&amp;FamilyID=3228&amp;TagValues=8.25.159.171.172.174.179.180.181.182.183.196.197.199.208.241.389.412.436.2035.6749.6750.70367.78456.79630.81880&amp;Redirect=http:%2F%2Fwww.hp.com%2Fgo%2Foriginal%20', target='_top')
SAX.startElement(img, src='http://static.wired.com/advertising/blipverts/hp_colorinkjet/hp_970c_120x60_6.gif', border='1', height='60', width='120', alt='True to the Original') SAX.startElement(img, src='http://static.wired.com/advertising/blipverts/hp_colorinkjet/hp_970c_120x60_6.gif', border='1', height='60', width='120', alt='True to the Original')
SAX.endElement(img) SAX.endElement(img)
@ -437,94 +425,46 @@ SAX.startElement(select, name='url')
SAX.characters( SAX.characters(
, 4) , 4)
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.startElement(option, value='http://search.hotwired.com/search97/s97.vts?Action=FilterSearch&amp;Filter=docs_filter.hts&amp;ResultTemplate=vignette.hts&amp;Collection=vignette&amp;QueryMode=Internet&amp;Query=', selected) SAX.startElement(option, value='http://search.hotwired.com/search97/s97.vts?Action=FilterSearch&amp;Filter=docs_filter.hts&amp;ResultTemplate=vignette.hts&amp;Collection=vignette&amp;QueryMode=Internet&amp;Query=', selected)
SAX.characters(Wired News, 10) SAX.characters(Wired News, 10)
SAX.endElement(option) SAX.endElement(option)
SAX.characters( SAX.characters(
, 3) , 3)
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.startElement(option, value='http://search.hotwired.com/search97/s97.vts?Action=FilterSearch&amp;Filter=docs_filter.hts&amp;ResultTemplate=webmonkey.hts&amp;Collection=webmonkey&amp;QueryMode=Internet&amp;Query=') SAX.startElement(option, value='http://search.hotwired.com/search97/s97.vts?Action=FilterSearch&amp;Filter=docs_filter.hts&amp;ResultTemplate=webmonkey.hts&amp;Collection=webmonkey&amp;QueryMode=Internet&amp;Query=')
SAX.characters(Webmonkey, 9) SAX.characters(Webmonkey, 9)
SAX.endElement(option) SAX.endElement(option)
SAX.characters( SAX.characters(
, 2) , 2)
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.startElement(option, value='http://search.hotwired.com/search97/s97.vts?collection=webmonkey_guides&amp;Action=FilterSearch&amp;filter=docs_filter.hts&amp;ResultTemplate=webmonkey_guides.hts&amp;QueryMode=Internet&amp;Query=') SAX.startElement(option, value='http://search.hotwired.com/search97/s97.vts?collection=webmonkey_guides&amp;Action=FilterSearch&amp;filter=docs_filter.hts&amp;ResultTemplate=webmonkey_guides.hts&amp;QueryMode=Internet&amp;Query=')
SAX.characters(Webmonkey Guides, 16) SAX.characters(Webmonkey Guides, 16)
SAX.endElement(option) SAX.endElement(option)
SAX.characters( SAX.characters(
, 2) , 2)
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.startElement(option, value='http://search.hotwired.com/search97/s97.vts?collection=hotwired&amp;Action=FilterSearch&amp;filter=docs_filter.hts&amp;ResultTemplate=hotwired_archive.hts&amp;QueryMode=Internet&amp;Query=') SAX.startElement(option, value='http://search.hotwired.com/search97/s97.vts?collection=hotwired&amp;Action=FilterSearch&amp;filter=docs_filter.hts&amp;ResultTemplate=hotwired_archive.hts&amp;QueryMode=Internet&amp;Query=')
SAX.characters(HotWired Archives, 17) SAX.characters(HotWired Archives, 17)
SAX.endElement(option) SAX.endElement(option)
SAX.characters( SAX.characters(
, 3) , 3)
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.startElement(option, value='http://search.hotwired.com/search97/s97.vts?Action=FilterSearch&amp;Filter=docs_filter.hts&amp;ResultTemplate=magazine.hts&amp;Collection=magazine&amp;QueryMode=Internet&amp;Query=') SAX.startElement(option, value='http://search.hotwired.com/search97/s97.vts?Action=FilterSearch&amp;Filter=docs_filter.hts&amp;ResultTemplate=magazine.hts&amp;Collection=magazine&amp;QueryMode=Internet&amp;Query=')
SAX.characters(Wired Magazine, 14) SAX.characters(Wired Magazine, 14)
SAX.endElement(option) SAX.endElement(option)
SAX.characters( SAX.characters(
, 3) , 3)
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.startElement(option, value='http://search.hotwired.com/search97/s97.vts?Action=FilterSearch&amp;Filter=docs_filter.hts&amp;ResultTemplate=animation.hts&amp;Collection=animation&amp;QueryMode=Internet&amp;Query=') SAX.startElement(option, value='http://search.hotwired.com/search97/s97.vts?Action=FilterSearch&amp;Filter=docs_filter.hts&amp;ResultTemplate=animation.hts&amp;Collection=animation&amp;QueryMode=Internet&amp;Query=')
SAX.characters(Animation Express, 17) SAX.characters(Animation Express, 17)
SAX.endElement(option) SAX.endElement(option)
SAX.characters( SAX.characters(
, 3) , 3)
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.startElement(option, value='http://search.hotwired.com/search97/s97.vts?collection=suck&amp;Action=FilterSearch&amp;filter=docs_filter.hts&amp;ResultTemplate=suck.hts&amp;QueryMode=Internet&amp;Query=') SAX.startElement(option, value='http://search.hotwired.com/search97/s97.vts?collection=suck&amp;Action=FilterSearch&amp;filter=docs_filter.hts&amp;ResultTemplate=suck.hts&amp;QueryMode=Internet&amp;Query=')
SAX.characters(Suck.com, 8) SAX.characters(Suck.com, 8)
SAX.endElement(option) SAX.endElement(option)
SAX.characters( SAX.characters(
, 2) , 2)
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.startElement(option, value='http://search.hotwired.com/search97/s97.vts?collection=uber_hotwired&amp;Action=FilterSearch&amp;filter=docs_filter.hts&amp;ResultTemplate=uber_hotwired.hts&amp;QueryMode=Internet&amp;Query=') SAX.startElement(option, value='http://search.hotwired.com/search97/s97.vts?collection=uber_hotwired&amp;Action=FilterSearch&amp;filter=docs_filter.hts&amp;ResultTemplate=uber_hotwired.hts&amp;QueryMode=Internet&amp;Query=')
SAX.characters(All of HotWired, 15) SAX.characters(All of HotWired, 15)
SAX.endElement(option) SAX.endElement(option)
SAX.characters( SAX.characters(
, 2) , 2)
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.startElement(option, value='http://www.hotbot.com/?SM=MC&amp;DV=0&amp;LG=any&amp;RD=RG&amp;DC=10&amp;DE=2&amp;_v=2&amp;OPs=MDRTP&amp;MT=') SAX.startElement(option, value='http://www.hotbot.com/?SM=MC&amp;DV=0&amp;LG=any&amp;RD=RG&amp;DC=10&amp;DE=2&amp;_v=2&amp;OPs=MDRTP&amp;MT=')
SAX.characters(The Web -&gt; HotBot, 17) SAX.characters(The Web -&gt; HotBot, 17)
SAX.endElement(option) SAX.endElement(option)
@ -1090,7 +1030,6 @@ SAX.endElement(input)
SAX.characters( SAX.characters(
, 2) , 2)
SAX.error: htmlParseEntityRef: expecting ';'
SAX.startElement(img, src='http://barnesandnoble.bfast.com/booklink/serve?sourceid=383471&amp;is_search=Y', border='0', align='top') SAX.startElement(img, src='http://barnesandnoble.bfast.com/booklink/serve?sourceid=383471&amp;is_search=Y', border='0', align='top')
SAX.endElement(img) SAX.endElement(img)
SAX.characters( SAX.characters(
@ -1612,7 +1551,6 @@ SAX.startElement(font, size='2', face='Arial,Helvetica, sans-serif')
SAX.startElement(b) SAX.startElement(b)
SAX.startElement(a, href='/news/commentarySection/0,1292,31926,00.html') SAX.startElement(a, href='/news/commentarySection/0,1292,31926,00.html')
SAX.characters(Rants , 6) SAX.characters(Rants , 6)
SAX.error: htmlParseEntityRef: no name
SAX.characters(&amp;, 1) SAX.characters(&amp;, 1)
SAX.characters( Raves, 6) SAX.characters( Raves, 6)
SAX.endElement(a) SAX.endElement(a)
@ -1948,9 +1886,6 @@ SAX.startElement(br)
SAX.endElement(br) SAX.endElement(br)
SAX.startElement(i) SAX.startElement(i)
SAX.characters(Sponsored by , 13) SAX.characters(Sponsored by , 13)
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.error: htmlParseEntityRef: expecting ';'
SAX.startElement(a, href='http://r.wired.com/r/wn_is_r_ssec/http://ad.doubleclick.net/clk;653163;3599571;s?http://www.sprintbiz.com/s SAX.startElement(a, href='http://r.wired.com/r/wn_is_r_ssec/http://ad.doubleclick.net/clk;653163;3599571;s?http://www.sprintbiz.com/s
ervlet/appservlet?from=/wired/sprint/&amp;template=/security/security.html&amp;SITE= ervlet/appservlet?from=/wired/sprint/&amp;template=/security/security.html&amp;SITE=
wired.com&amp;BANNER=Sprint', style='text-decoration:none') wired.com&amp;BANNER=Sprint', style='text-decoration:none')
@ -2093,7 +2028,6 @@ SAX.endElement(br)
SAX.endElement(p) SAX.endElement(p)
SAX.startElement(li) SAX.startElement(li)
SAX.characters(More from , 10) SAX.characters(More from , 10)
SAX.error: htmlParseEntityRef: expecting ';'
SAX.startElement(a, href='http://www.lycos.com/news/flash/hitlerbunker.html?v=wn1015&amp;lpv=1') SAX.startElement(a, href='http://www.lycos.com/news/flash/hitlerbunker.html?v=wn1015&amp;lpv=1')
SAX.characters(Lycos, 5) SAX.characters(Lycos, 5)
SAX.endElement(a) SAX.endElement(a)

7
test/HTML/attr-ents.html Normal file
View File

@ -0,0 +1,7 @@
<html>
<body>
<a href="index.cgi?a&lt=1&gt=2">link</a>
<a href="index.cgi?a&lta&gta">link</a>
<a href="index.cgi?a&lt&gt">link</a>
</body>
</html>

File diff suppressed because it is too large Load Diff

169
tools/genHtmlEnt.py Executable file
View File

@ -0,0 +1,169 @@
#!/usr/bin/env python3
import json
import sys
from dataclasses import dataclass
# The basic idea is to find named character references using binary
# search. Since entity strings may not have a terminator, this doesn't
# work if one entity string is a prefix of another. In this case,
# we branch to a subtable after matching the prefix.
#
# We create separate initial tables based on the first character
# of the entity name.
#
# The following tables are generated:
#
# htmlEntAlpha: start (two bytes, low byte first) and size of the
# initial tables, indexing into htmlEntValues
# htmlEntValues: concatenation of all table values, which index into
# htmlEntStrings
# htmlEntStrings: variable sized records containing entity name,
# replacement and optionally the position of a
# subtable
# Load the named character reference table from entities.json in the
# current working directory. The file is read as UTF-8 explicitly so the
# result does not depend on the locale's default encoding.
try:
    with open('entities.json', encoding='utf-8') as json_data:
        ents = json.load(json_data)
except FileNotFoundError:
    # The generated tables are printed to stdout, so the hint must go to
    # stderr; otherwise it would end up in a redirected output file.
    print('entities.json not found, try curl -LJO',
          'https://html.spec.whatwg.org/entities.json',
          file=sys.stderr)
    sys.exit(1)
def to_cchars(s):
    """Convert a string into a list of C array initializer items.

    The string is encoded to bytes. Each printable ASCII byte
    (0x20..0x7E) other than the single quote and the backslash becomes a
    quoted character literal string such as "'a'"; every other byte is
    returned as its integer value.
    """
    needs_number = (ord("'"), ord('\\'))

    def as_item(byte):
        if 0x20 <= byte <= 0x7E and byte not in needs_number:
            return f"'{chr(byte)}'"
        return byte

    return [as_item(byte) for byte in s.encode()]
@dataclass
class PrefixStackEntry:
    """An entry on the stack of entity-name prefixes currently open.

    prefix is the part of the entity name already matched when searching
    the table; table_id is the index into ``tables`` of the table that
    holds the names extending this prefix.
    """

    prefix: str
    table_id: int
@dataclass
class AlphaFixup:
    """Deferred entry for the first-character (alpha) table.

    table_id is the index into ``tables`` of an initial table; char is
    the first character's code modulo 64, used as the slot in the alpha
    table once the final table ranges are known.
    """

    table_id: int
    char: int
@dataclass
class StringFixup:
    """Deferred patch of the two subtable-position bytes in a record.

    table_id is the subtable's index into ``tables``; string_index is
    the position in ``strings`` of the two placeholder bytes;
    super_table_id and super_offset identify the table and the slot
    within it that hold the record owning the subtable.
    """

    table_id: int
    string_index: int
    super_table_id: int
    super_offset: int
# Keep only the entity strings that end with a semicolon, ordered by
# the name between the first and the last character.
keys = sorted(
    (key for key in ents.keys() if key.endswith(';')),
    key=lambda k: k[1:-1],
)

# Generator state: the flat record array, the list of tables (starting
# with 64 empty ones), the stack of open prefixes and the pending fixups.
strings = []
tables = [[] for _ in range(64)]
prefix_stack = []
alpha_fixups = []
string_fixups = []
for i, key in enumerate(keys):
    # Entity name without the first and the last character of the key
    name = key[1:-1]
    next_name = keys[i + 1][1:-1] if i + 1 < len(keys) else None

    # Drop prefixes that the current name no longer extends
    while prefix_stack and not name.startswith(prefix_stack[-1].prefix):
        prefix_stack.pop()

    # First character is initial prefix
    if not prefix_stack:
        table_id = len(tables)
        tables.append([])
        prefix_stack.append(PrefixStackEntry(name[0], table_id))
        alpha_fixups.append(AlphaFixup(table_id, ord(name[0]) % 64))

    top = prefix_stack[-1]

    # Register the record about to be emitted in the current table
    string_index = len(strings)
    table = tables[top.table_id]
    table_index = len(table)
    table.append(string_index)

    # Only the part of the name after the current prefix is stored
    name_chars = to_cchars(name[len(top.prefix):])
    repl_chars = to_cchars(ents[key]['characters'])

    # 0x80: the same entity also exists without the trailing semicolon
    semicolon_flag = 0x80 if key[:-1] in ents else 0
    # 0x40: the next name extends this one, so a subtable follows
    has_subtable = bool(next_name) and next_name.startswith(name)

    header = len(name_chars) | semicolon_flag
    if has_subtable:
        header |= 0x40

    record = [header, *name_chars]
    if has_subtable:
        # subtable position, to be fixed up
        record += [0, 0]
    record += [len(repl_chars), *repl_chars]
    strings += record

    if has_subtable:
        # Create subtable
        table_id = len(tables)
        tables.append([])

        string_fixups.append(StringFixup(
            table_id,
            string_index + 1 + len(name_chars),
            top.table_id,
            table_index,
        ))

        prefix_stack.append(PrefixStackEntry(name, table_id))
# Flatten all tables into one value list. ranges[k] and ranges[k+1]
# delimit the slice of values that belongs to table k.
ranges = [ 0 ]
values = []
for table in tables:
    values.extend(table)
    ranges.append(len(values))
# Create alpha table: three bytes per slot holding the low byte and the
# high byte of the table start, followed by the table size.
alpha = [ 0 for _ in range(59 * 3) ]
for fixup in alpha_fixups:
    first = ranges[fixup.table_id]
    size = ranges[fixup.table_id + 1] - first
    slot = fixup.char * 3
    alpha[slot:slot + 3] = [ first & 0xFF, first >> 8, size ]
# Fix up subtable positions: overwrite the two placeholder bytes with
# the subtable start relative to the owning entry's position in the
# value list, and the subtable size.
for fixup in string_fixups:
    sub_start = ranges[fixup.table_id]
    sub_size = ranges[fixup.table_id + 1] - sub_start
    super_index = ranges[fixup.super_table_id] + fixup.super_offset
    pos = fixup.string_index
    strings[pos:pos + 2] = [ sub_start - super_index, sub_size ]
# Print tables
def gen_table(ctype, cname, values, fmt, elems_per_line):
    """Return the C source of a static const array definition.

    ctype and cname are the element type and the array name, values are
    the elements, fmt is the %-format applied to each element and
    elems_per_line is the number of elements emitted before a line break.
    """
    pieces = []

    for pos, value in enumerate(values):
        if pos != 0:
            pieces.append(',')
        # Start a new line every elems_per_line elements
        pieces.append('\n ' if pos % elems_per_line == 0 else ' ')
        pieces.append(fmt % value)

    body = ''.join(pieces)
    return f'static const {ctype} {cname}[{len(values)}] = {{{body}\n}};\n'
# Emit the three generated tables on stdout, in this order.
for ctype, cname, data, fmt, per_line in (
    ('unsigned char', 'htmlEntAlpha', alpha, '%3d', 15),
    ('unsigned short', 'htmlEntValues', values, '%5d', 10),
    ('unsigned char', 'htmlEntStrings', strings, '%3s', 15),
):
    print(gen_table(ctype, cname, data, fmt, per_line))