1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2024-10-26 03:55:04 +03:00

html: Optimize htmlParseCharData

This commit is contained in:
Nick Wellnhofer 2024-09-12 01:45:34 +02:00
parent 440bd64c69
commit f77ec16db0
13 changed files with 5230 additions and 8569 deletions

View File

@ -51,6 +51,11 @@
(((c) == 0x09) || ((c) == 0x0A) || ((c) == 0x0C) || ((c) == 0x0D) || \
((c) == 0x20))
#define IS_HEX_DIGIT(c) \
((IS_ASCII_DIGIT(c)) || \
(((c) >= 'A') && ((c) <= 'F')) || \
(((c) >= 'a') && ((c) <= 'f')))
static int htmlOmittedDefaultValue = 1;
static int
@ -470,6 +475,65 @@ encoding_error:
return(*ctxt->input->cur);
}
static int
htmlValidateUtf8(xmlParserCtxtPtr ctxt, const xmlChar *str, size_t len) {
unsigned c = str[0];
int size;
if (c < 0xC2) {
goto invalid;
} else if (c < 0xE0) {
if (len < 2)
goto incomplete;
if ((str[1] & 0xC0) != 0x80)
goto invalid;
size = 2;
} else if (c < 0xF0) {
unsigned v;
if (len < 3)
goto incomplete;
v = str[1] << 8 | str[2]; /* hint to generate 16-bit load */
v |= c << 16;
if (((v & 0x00C0C0) != 0x008080) ||
((v & 0x0F2000) == 0x000000) ||
((v & 0x0F2000) == 0x0D2000))
goto invalid;
size = 3;
} else {
unsigned v;
if (len < 4)
goto incomplete;
v = c << 24 | str[1] << 16 | str[2] << 8 | str[3];
if (((v & 0x00C0C0C0) != 0x00808080) ||
(v < 0xF0900000) || (v >= 0xF4900000))
goto invalid;
size = 4;
}
return(size);
incomplete:
return(0);
invalid:
/* Only report the first error */
if ((ctxt->input->flags & XML_INPUT_ENCODING_ERROR) == 0) {
htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
"Invalid bytes in character encoding", NULL, NULL);
ctxt->input->flags |= XML_INPUT_ENCODING_ERROR;
}
return(-1);
}
/**
* htmlSkipBlankChars:
* @ctxt: the HTML parser context
@ -2234,7 +2298,7 @@ static const char *allowPCData[] = {
*
* Is this a sequence of blank chars that one can ignore ?
*
* Returns 1 if ignorable 0 otherwise.
* Returns 1 if ignorable 0 if whitespace, -1 otherwise.
*/
static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
@ -2244,7 +2308,7 @@ static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
xmlDtdPtr dtd;
for (j = 0;j < len;j++)
if (!(IS_WS_HTML(str[j]))) return(0);
if (!(IS_WS_HTML(str[j]))) return(-1);
if (CUR == 0) return(1);
if (CUR != '<') return(0);
@ -2476,6 +2540,109 @@ htmlParseHTMLName(htmlParserCtxtPtr ctxt, int attr) {
return(ret);
}
static const short htmlC1Remap[32] = {
0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F,
0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178
};
static int
htmlParseNCR(const xmlChar *string, size_t slen, int *dlen) {
const xmlChar *in = string;
const xmlChar *end = string + slen;
unsigned val = 0;
while (in < end) {
int c = *in;
if ((c < '0') || (c > '9'))
break;
val = val * 10 + (c - '0');
if (val >= 0x110000)
val = 0x110000;
in += 1;
}
if (*in == ';')
in += 1;
if ((val >= 0x80) && (val < 0xA0)) {
val = htmlC1Remap[val - 0x80];
} else if ((val <= 0) ||
((val >= 0xD800) && (val < 0xE000)) ||
(val > 0x10FFFF)) {
val = 0xFFFD;
}
*dlen = in - string;
return(val);
}
static int
htmlParseNCRHex(const xmlChar *string, size_t slen, int *dlen) {
const xmlChar *in = string;
const xmlChar *end = string + slen;
unsigned val = 0;
while (in < end) {
int c = *in;
if ((c >= '0') && (c <= '9')) {
c -= '0';
} else if ((c >= 'a') && (c <= 'f')) {
c = (c - 'a') + 10;
} else if ((c >= 'A') && (c <= 'F')) {
c = (c - 'A') + 10;
} else {
break;
}
val = val * 16 + c;
if (val >= 0x110000)
val = 0x110000;
in += 1;
}
if (*in == ';')
in += 1;
if ((val >= 0x80) && (val < 0xA0)) {
val = htmlC1Remap[val - 0x80];
} else if ((val <= 0) ||
((val >= 0xD800) && (val < 0xE000)) ||
(val > 0x10FFFF)) {
val = 0xFFFD;
}
*dlen = in - string;
return(val);
}
static const xmlChar *
htmlCodePointToUtf8(int c, xmlChar *out, int *osize) {
int i = 0;
int bits, hi;
if (c < 0x80) { bits = 0; hi = 0x00; }
else if (c < 0x800) { bits = 6; hi = 0xC0; }
else if (c < 0x10000) { bits = 12; hi = 0xE0; }
else { bits = 18; hi = 0xF0; }
out[i++] = (c >> bits) | hi;
while (bits > 0) {
bits -= 6;
out[i++] = ((c >> bits) & 0x3F) | 0x80;
}
*osize = i;
return(out);
}
#include "html5ent.inc"
#define ENT_F_SEMICOLON 0x80u
@ -2753,30 +2920,26 @@ htmlCharDataSAXCallback(htmlParserCtxtPtr ctxt, const xmlChar *buf,
if ((ctxt->sax == NULL) || (ctxt->disableSAX))
return;
if ((mode == 0) || (mode == DATA_RCDATA)) {
if (areBlanks(ctxt, buf, size)) {
if (ctxt->keepBlanks) {
if (ctxt->sax->characters != NULL)
ctxt->sax->characters(ctxt->userData, buf, size);
} else {
if (ctxt->sax->ignorableWhitespace != NULL)
ctxt->sax->ignorableWhitespace(ctxt->userData,
buf, size);
}
if ((mode == 0) || (mode == DATA_RCDATA) ||
(ctxt->sax->cdataBlock == NULL)) {
int blank = areBlanks(ctxt, buf, size);
if ((mode == 0) && (blank > 0) && (!ctxt->keepBlanks)) {
if (ctxt->sax->ignorableWhitespace != NULL)
ctxt->sax->ignorableWhitespace(ctxt->userData,
buf, size);
} else {
htmlCheckParagraph(ctxt);
if ((mode == 0) && (blank < 0))
htmlCheckParagraph(ctxt);
if (ctxt->sax->characters != NULL)
ctxt->sax->characters(ctxt->userData, buf, size);
}
} else {
if (ctxt->sax->cdataBlock != NULL) {
/*
* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
*/
ctxt->sax->cdataBlock(ctxt->userData, buf, size);
} else if (ctxt->sax->characters != NULL) {
ctxt->sax->characters(ctxt->userData, buf, size);
}
/*
* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
*/
ctxt->sax->cdataBlock(ctxt->userData, buf, size);
}
}
@ -2789,128 +2952,273 @@ htmlCharDataSAXCallback(htmlParserCtxtPtr ctxt, const xmlChar *buf,
*/
static int
htmlParseCharData(htmlParserCtxtPtr ctxt, int terminate) {
xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
int nbchar = 0;
htmlParseCharData(htmlParserCtxtPtr ctxt) {
xmlParserInputPtr input = ctxt->input;
xmlChar utf8Char[4];
int complete = 0;
int res = 0;
int cur, l, mode;
int done = 0;
int mode;
int eof = PARSER_PROGRESSIVE(ctxt);
int line, col;
mode = ctxt->endCheckState;
while ((!PARSER_STOPPED(ctxt)) &&
(ctxt->input->cur < ctxt->input->end)) {
cur = CUR_CHAR(l);
line = input->line;
col = input->col;
/*
* Check for end of text data
*/
if ((cur == '<') && (mode != DATA_PLAINTEXT)) {
int j, len;
while (!PARSER_STOPPED(ctxt)) {
const xmlChar *chunk, *in, *repl;
size_t avail;
int replSize;
int skip = 0;
if (mode == 0)
break;
chunk = input->cur;
avail = input->end - chunk;
in = chunk;
j = 1;
len = ctxt->input->end - ctxt->input->cur;
if (j < len) {
if ((mode == DATA_SCRIPT) && (NXT(j) == '!')) {
/* Check for comment start */
repl = BAD_CAST "";
replSize = 0;
while (!PARSER_STOPPED(ctxt)) {
size_t j;
int cur, size, cp;
if (avail <= 64) {
if (!eof) {
size_t oldAvail = avail;
size_t off = in - chunk;
input->cur = in;
xmlParserGrow(ctxt);
in = input->cur;
chunk = in - off;
input->cur = chunk;
avail = input->end - in;
if (oldAvail == avail)
eof = 1;
}
if (avail == 0) {
done = 1;
break;
}
}
cur = *in;
size = 1;
col += 1;
switch (cur) {
case '<':
if (mode == 0) {
done = 1;
goto next_chunk;
}
if (mode == DATA_PLAINTEXT)
break;
j = 1;
if (j < avail) {
if ((mode == DATA_SCRIPT) && (in[j] == '!')) {
/* Check for comment start */
j += 1;
if ((j < len) && (NXT(j) == '-')) {
j += 1;
if ((j < len) && (NXT(j) == '-'))
mode = DATA_SCRIPT_ESC1;
}
} else {
int i = 0;
int solidus = 0;
/* Check for tag */
if (NXT(j) == '/') {
j += 1;
solidus = 1;
}
if ((solidus) || (mode == DATA_SCRIPT_ESC1)) {
while ((j < len) &&
(ctxt->name[i] != 0) &&
(ctxt->name[i] == (NXT(j) | 32))) {
i += 1;
if ((j < avail) && (in[j] == '-')) {
j += 1;
if ((j < avail) && (in[j] == '-'))
mode = DATA_SCRIPT_ESC1;
}
} else {
int i = 0;
int solidus = 0;
/* Check for tag */
if (in[j] == '/') {
j += 1;
solidus = 1;
}
if ((ctxt->name[i] == 0) && (j < len)) {
int c = NXT(j);
if ((solidus) || (mode == DATA_SCRIPT_ESC1)) {
while ((j < avail) &&
(ctxt->name[i] != 0) &&
(ctxt->name[i] == (in[j] | 32))) {
i += 1;
j += 1;
}
if ((c == '>') || (c == '/') || (IS_WS_HTML(c))) {
if ((mode == DATA_SCRIPT_ESC1) && (!solidus)) {
mode = DATA_SCRIPT_ESC2;
} else if (mode == DATA_SCRIPT_ESC2) {
mode = DATA_SCRIPT_ESC1;
} else {
complete = 1;
res = 1;
break;
if ((ctxt->name[i] == 0) && (j < avail)) {
int c = in[j];
if ((c == '>') || (c == '/') ||
(IS_WS_HTML(c))) {
if ((mode == DATA_SCRIPT_ESC1) &&
(!solidus)) {
mode = DATA_SCRIPT_ESC2;
} else if (mode == DATA_SCRIPT_ESC2) {
mode = DATA_SCRIPT_ESC1;
} else {
complete = 1;
done = 1;
goto next_chunk;
}
}
}
}
}
}
}
/* Push parser */
if ((!terminate) && (j >= len)) {
res = 1;
if ((mode != 0) && (PARSER_PROGRESSIVE(ctxt))) {
in += 1;
done = 1;
goto next_chunk;
}
break;
case '-':
if ((mode != DATA_SCRIPT_ESC1) && (mode != DATA_SCRIPT_ESC2))
break;
/* Check for comment end */
j = 1;
if ((j < avail) && (in[j] == '-')) {
j += 1;
if ((j < avail) && (in[j] == '>'))
mode = DATA_SCRIPT;
}
break;
case '&':
if ((mode != 0) && (mode != DATA_RCDATA))
break;
cp = 0;
j = 1;
if ((j < avail) && (in[j] == '#')) {
j += 1;
if (j < avail) {
if ((in[j] | 0x20) == 'x') {
j += 1;
if ((j < avail) && (IS_HEX_DIGIT(in[j]))) {
cp = htmlParseNCRHex(in + j, avail - j, &skip);
skip += 3;
}
} else if (IS_ASCII_DIGIT(in[j])) {
cp = htmlParseNCR(in + j, avail - j, &skip);
skip += 2;
}
}
if (cp > 0) {
repl = htmlCodePointToUtf8(cp, utf8Char, &replSize);
goto next_chunk;
}
skip = 0;
} else {
repl = htmlFindEntityPrefix(in + j,
avail - j,
/* isAttr */ 0,
&skip, &replSize);
if (repl != NULL) {
skip += 1;
goto next_chunk;
}
skip = 0;
}
break;
case '\0':
skip = 1;
repl = BAD_CAST "\xEF\xBF\xBD";
replSize = 3;
goto next_chunk;
case '\n':
line += 1;
col = 1;
break;
case '\r':
skip = 1;
if (in[1] != 0x0A) {
repl = BAD_CAST "\x0A";
replSize = 1;
}
goto next_chunk;
default:
if (cur < 0x80)
break;
if ((input->flags & XML_INPUT_HAS_ENCODING) == 0) {
xmlChar * guess;
guess = htmlFindEncoding(ctxt);
if (guess == NULL) {
xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
} else {
xmlSwitchEncodingName(ctxt, (const char *) guess);
xmlFree(guess);
}
input->flags |= XML_INPUT_HAS_ENCODING;
goto restart;
}
size = htmlValidateUtf8(ctxt, in, avail);
if (size <= 0) {
skip = 1;
repl = BAD_CAST "\xEF\xBF\xBD";
replSize = 3;
goto next_chunk;
}
break;
}
} else if ((cur == '-') &&
((mode == DATA_SCRIPT_ESC1) ||
(mode == DATA_SCRIPT_ESC2))) {
int len = ctxt->input->end - ctxt->input->cur;
int j = 1;
/* Check for comment end */
if ((j < len) && (NXT(j) == '-')) {
j += 1;
if ((j < len) && (NXT(j) == '>'))
mode = DATA_SCRIPT;
}
/* Push parser */
if ((!terminate) && (j >= len)) {
res = 1;
break;
}
} else if ((cur == '&') &&
((mode == 0) || (mode == DATA_RCDATA))) {
break;
in += size;
avail -= size;
}
COPY_BUF(buf,nbchar,cur);
NEXTL(l);
if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
buf[nbchar] = 0;
htmlCharDataSAXCallback(ctxt, buf, nbchar, mode);
next_chunk:
if (in > chunk) {
input->cur += in - chunk;
htmlCharDataSAXCallback(ctxt, chunk, in - chunk, mode);
}
nbchar = 0;
SHRINK;
}
}
if (nbchar != 0) {
buf[nbchar] = 0;
htmlCharDataSAXCallback(ctxt, buf, nbchar, mode);
input->cur += skip;
if (replSize > 0)
htmlCharDataSAXCallback(ctxt, repl, replSize, mode);
SHRINK;
if (done)
break;
restart:
;
}
input->line = line;
input->col = col;
if (complete)
ctxt->endCheckState = 0;
else
ctxt->endCheckState = mode;
return(res);
return(complete);
}
/**
@ -3025,13 +3333,6 @@ done:
return;
}
static const short htmlC1Remap[32] = {
0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F,
0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178
};
/**
* htmlParseCharRef:
* @ctxt: an HTML parser context
@ -3754,68 +4055,6 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt)
}
}
/**
* htmlParseReference:
* @ctxt: an HTML parser context
*
* parse and handle entity references in content,
* this will end-up in a call to character() since this is either a
* CharRef, or a predefined entity.
*/
static void
htmlParseReference(htmlParserCtxtPtr ctxt) {
const xmlChar *repl = NULL;
int replLen = 0;
xmlChar out[6];
if ((NXT(1) == '#') &&
((IS_ASCII_DIGIT(NXT(2))) ||
((UPP(2) == 'X') &&
((IS_ASCII_DIGIT(NXT(3))) ||
((UPP(3) >= 'A') && (UPP(3) <= 'F')))))) {
unsigned int c;
int bits, i = 0;
c = htmlParseCharRef(ctxt);
if (c == 0)
return;
if (c < 0x80) { out[i++]= c; bits= -6; }
else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
for ( ; bits >= 0; bits-= 6) {
out[i++]= ((c >> bits) & 0x3F) | 0x80;
}
out[i] = 0;
repl = out;
replLen = i;
} else if (IS_ASCII_LETTER(NXT(1))) {
int nameLen;
repl = htmlFindEntityPrefix(CUR_PTR + 1,
ctxt->input->end - CUR_PTR - 1,
/* isAttr */ 0,
&nameLen, &replLen);
if (repl != NULL)
SKIP(nameLen + 1);
}
if (repl == NULL) {
repl = BAD_CAST "&";
replLen = 1;
SKIP(1);
}
htmlCheckParagraph(ctxt);
if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
ctxt->sax->characters(ctxt->userData, repl, replLen);
}
/**
* htmlParseContent:
* @ctxt: an HTML parser context
@ -3864,10 +4103,8 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {
ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
SKIP(1);
}
} else if ((CUR == '&') && ((mode == 0) || (mode == DATA_RCDATA))) {
htmlParseReference(ctxt);
} else {
htmlParseCharData(ctxt, /* terminate */ 1);
htmlParseCharData(ctxt);
}
SHRINK;
@ -4793,10 +5030,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
mode = ctxt->endCheckState;
if (mode != 0) {
int done = 0;
while ((PARSER_STOPPED(ctxt) == 0) &&
(!done) &&
(in->cur < in->end)) {
size_t extra;
@ -4808,13 +5042,8 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
goto done;
ctxt->checkIndex = 0;
if ((cur == '&') && (mode == DATA_RCDATA)) {
htmlParseReference(ctxt);
} else {
done = htmlParseCharData(ctxt, terminate);
}
cur = in->cur[0];
if (htmlParseCharData(ctxt))
break;
}
break;
@ -4897,15 +5126,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
(htmlParseLookupString(ctxt, 0, "<", 1, 0) < 0))
goto done;
ctxt->checkIndex = 0;
while ((PARSER_STOPPED(ctxt) == 0) &&
(cur != '<') && (in->cur < in->end)) {
if (cur == '&') {
htmlParseReference(ctxt);
} else {
htmlParseCharData(ctxt, terminate);
}
cur = in->cur[0];
}
htmlParseCharData(ctxt);
}
break;

View File

@ -9,11 +9,7 @@ SAX.characters(
SAX.startElement(style)
SAX.cdata(
.......
....................., 1000)
SAX.cdata(.............................., 1000)
SAX.cdata(.............................., 1000)
SAX.cdata(................
............., 977)
....................., 3977)
SAX.endElement(style)
SAX.characters(
, 1)

View File

@ -3,8 +3,7 @@ SAX.startDocument()
SAX.startElement(html)
SAX.startElement(body)
SAX.startElement(p)
SAX.characters(&amp;, 1)
SAX.characters(j&Ugrave;, 3)
SAX.characters(&amp;j&Ugrave;, 4)
SAX.endElement(p)
SAX.endElement(body)
SAX.endElement(html)

View File

@ -3,9 +3,8 @@ SAX.startDocument()
SAX.startElement(html)
SAX.startElement(body)
SAX.startElement(p)
SAX.characters(&amp;, 1)
SAX.characters(:&ecirc;
, 4)
SAX.characters(&amp;:&ecirc;
, 5)
SAX.endElement(p)
SAX.endElement(body)
SAX.endElement(html)

View File

@ -18,11 +18,7 @@ SAX.startElement(p)
SAX.characters(
Filler bytes follow:
1, 1000)
SAX.characters(89 123456789 123456789
1, 1000)
SAX.characters(89 123456789 123456789
1, 827)
1, 2827)
SAX.endElement(p)
SAX.characters(
, 1)

File diff suppressed because it is too large Load Diff

View File

@ -7,13 +7,9 @@ SAX.characters(
a, 2)
SAX.characters(&amp;, 1)
SAX.characters(b
a, 3)
SAX.characters(&amp;, 1)
SAX.characters(b
a , 4)
SAX.characters(&amp;, 1)
SAX.characters( b
, 3)
a&amp;b
a &amp; b
, 12)
SAX.endElement(p)
SAX.characters(
, 1)

View File

@ -421,10 +421,7 @@ SAX.characters(
, 2)
SAX.startElement(p)
SAX.characters(For further technical informat, 254)
SAX.characters(&amp;, 1)
SAX.characters( troubleshooters to find
fast, 302)
SAX.characters(For further technical informat, 557)
SAX.startElement(a, href='http://support.microsoft.com/support/')
SAX.characters(http://support.microsoft.com/s, 37)
SAX.endElement(a)

File diff suppressed because it is too large Load Diff

View File

@ -53,9 +53,13 @@ SAX.characters(Note , 5)
SAX.endElement(b)
SAX.endElement(dt)
SAX.startElement(dd)
SAX.characters(The Problem Domain package is , 58)
SAX.characters(The Problem Domain package is , 57)
SAX.characters(
, 1)
SAX.startElement(dd)
SAX.characters(Interface, thats stores and ma, 57)
SAX.characters(Interface, thats stores and ma, 56)
SAX.characters(
, 1)
SAX.endElement(dd)
SAX.endElement(dd)
SAX.endElement(dl)
@ -70,8 +74,9 @@ SAX.characters(
, 1)
SAX.startElement(dl)
SAX.characters(
, 2)
, 1)
SAX.characters(
, 1)
SAX.startElement(dt)
SAX.startElement(h4)
SAX.characters(Class , 6)
@ -164,8 +169,9 @@ SAX.characters(
, 1)
SAX.endElement(dl)
SAX.characters(
, 2)
, 1)
SAX.characters(
, 1)
SAX.startElement(h4)
SAX.startElement(b)
SAX.characters(Links, 5)

View File

@ -555,8 +555,7 @@ SAX.characters(&#1605;&#1585;&#1583; &#1607;, 693)
SAX.characters(", 1)
SAX.characters(&#1580;&#1608; &#1711;&#1740;, 11)
SAX.characters(", 1)
SAX.characters( &#1705;&#1607; &#1607;&#1585;, 1000)
SAX.characters( &#1608;&#1575;&#1740; &#1576;, 460)
SAX.characters( &#1705;&#1607; &#1607;&#1585;, 1460)
SAX.characters(", 1)
SAX.characters(&#1606;&#1607;, 4)
SAX.characters(", 1)
@ -601,8 +600,7 @@ SAX.characters( &#1608; , 4)
SAX.characters(", 1)
SAX.characters(&#1576;&#1587;&#1585;&#1593;, 10)
SAX.characters(", 1)
SAX.characters( &#1548; &#1605;&#1593;&#1583;, 1001)
SAX.characters(&#1583; &#1605;&#1585;&#1581;, 481)
SAX.characters( &#1548; &#1605;&#1593;&#1583;, 1482)
SAX.startElement(br)
SAX.endElement(br)
SAX.characters(
@ -660,9 +658,7 @@ SAX.characters(4/3/2008 - 7:04:00 PM, 21)
SAX.endElement(font)
SAX.startElement(br)
SAX.endElement(br)
SAX.characters(&#1587;&#1604;&#1575;&#1605; , 834)
SAX.characters(&amp;, 1)
SAX.characters(paste , 6)
SAX.characters(&#1587;&#1604;&#1575;&#1605; , 841)
SAX.startElement(br)
SAX.endElement(br)
SAX.characters(
@ -683,8 +679,7 @@ SAX.characters(4/3/2008 - 6:06:00 PM FOO!, 26)
SAX.endElement(font)
SAX.startElement(br)
SAX.endElement(br)
SAX.characters(&#1576;&#1575; &#1583;&#1585;, 1000)
SAX.characters(&#1688;&#1575;&#1606;&#1587; , 999)
SAX.characters(&#1576;&#1575; &#1583;&#1585;, 1999)
SAX.characters(&zwnj;, 3)
SAX.characters(&#1607;&#1575;&#1740; &#1588;, 98)
SAX.characters(&zwnj;, 3)

View File

@ -44,8 +44,9 @@ SAX.startElement(select, name='state')
SAX.characters(
, 1)
SAX.startElement(option, value='WA', selected)
SAX.characters(WA
, 3)
SAX.characters(WA, 2)
SAX.characters(
, 1)
SAX.endElement(option)
SAX.startElement(option, value='AL')
SAX.characters(AL, 2)
@ -229,8 +230,9 @@ SAX.characters(
SAX.startElement(option, value='SC')
SAX.characters(SC, 2)
SAX.endElement(option)
SAX.characters(
, 2)
SAX.characters( , 1)
SAX.characters(
, 1)
SAX.startElement(option, value='SD')
SAX.characters(SD, 2)
SAX.endElement(option)
@ -1549,9 +1551,7 @@ SAX.endElement(br)
SAX.startElement(font, size='2', face='Arial,Helvetica, sans-serif')
SAX.startElement(b)
SAX.startElement(a, href='/news/commentarySection/0,1292,31926,00.html')
SAX.characters(Rants , 6)
SAX.characters(&amp;, 1)
SAX.characters( Raves, 6)
SAX.characters(Rants &amp; Raves, 13)
SAX.endElement(a)
SAX.endElement(b)
SAX.endElement(font)
@ -2021,7 +2021,9 @@ SAX.endElement(br)
SAX.startElement(font, size='1', face='Arial, Geneva, sans-serif', color='#000000')
SAX.startElement(p)
SAX.characters(
Contruction workers in Berlin, 635)
Contruction workers in Berlin, 634)
SAX.characters(
, 1)
SAX.startElement(br)
SAX.endElement(br)
SAX.endElement(p)

View File

@ -2177,9 +2177,11 @@ pushBoundaryTest(const char *filename, const char *result,
*ctxt->input->cur :
base[cur];
if ((firstChar != '<') &&
((options & XML_PARSE_HTML) || (firstChar != '&')))
isText = 1;
if (options & XML_PARSE_HTML) {
isText = ((ctxt->endCheckState) || (firstChar != '<'));
} else {
isText = ((firstChar != '<') && (firstChar != '&'));
}
}
oldConsumed = ctxt->input->consumed +