1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2024-10-26 12:25:09 +03:00

Attribute normalization closing bug #3597

Small fixes in encoding.c
First bits of real progressive parsing,
   Daniel
This commit is contained in:
Daniel Veillard 1999-11-17 17:32:38 +00:00
parent d7e200c0b0
commit 7f8585025f
15 changed files with 303 additions and 94 deletions

View File

@ -1,3 +1,11 @@
Wed Nov 17 18:28:06 CET 1999
* encoding.c: bug fix and typos
* xmlIO.[ch] parser.c: first bits toward real progressive parsing
* parser.c: added attribute normalization closing bug #3597
* test/att* result/att* SAXresult/att*: testcase for attribute
normalization
Mon Nov 15 18:50:56 CET 1999 Daniel Veillard <Daniel.Veillard@w3.org>
* configure.in: closing bug #3163 by adding extra flags for the

5
SAXresult/att1 Normal file
View File

@ -0,0 +1,5 @@
SAX.setDocumentLocator()
SAX.startDocument()
SAX.startElement(doc, attr='to normalize with a space')
SAX.endElement(doc)
SAX.endDocument()

5
SAXresult/att2 Normal file
View File

@ -0,0 +1,5 @@
SAX.setDocumentLocator()
SAX.startDocument()
SAX.startElement(doc, attr='to normalize with a space')
SAX.endElement(doc)
SAX.endDocument()

View File

@ -51,10 +51,10 @@
/**
* isolat1ToUTF8:
* @out: a pointer ot an array of bytes to store the result
* @outlen: the lenght of @out
* @in: a pointer ot an array of ISO Latin 1 chars
* @inlen: the lenght of @in
* @out: a pointer to an array of bytes to store the result
* @outlen: the length of @out
* @in: a pointer to an array of ISO Latin 1 chars
* @inlen: the length of @in
*
* Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
* block of chars out.
@ -86,10 +86,10 @@ isolat1ToUTF8(unsigned char* out, int outlen, unsigned char* in, int inlen)
/**
* UTF8Toisolat1:
* @out: a pointer ot an array of bytes to store the result
* @outlen: the lenght of @out
* @in: a pointer ot an array of UTF-8 chars
* @inlen: the lenght of @in
* @out: a pointer to an array of bytes to store the result
* @outlen: the length of @out
* @in: a pointer to an array of UTF-8 chars
* @inlen: the length of @in
*
* Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
* block of chars out.
@ -123,10 +123,10 @@ UTF8Toisolat1(unsigned char* out, int outlen, unsigned char* in, int inlen)
/**
* UTF16ToUTF8:
* @out: a pointer ot an array of bytes to store the result
* @outlen: the lenght of @out
* @in: a pointer ot an array of UTF-16 chars (array of unsigned shorts)
* @inlen: the lenght of @in
* @out: a pointer to an array of bytes to store the result
* @outlen: the length of @out
* @in: a pointer to an array of UTF-16 chars (array of unsigned shorts)
* @inlen: the length of @in
*
* Take a block of UTF-16 ushorts in and try to convert it to an UTF-8
* block of chars out.
@ -161,7 +161,7 @@ UTF16ToUTF8(unsigned char* out, int outlen, unsigned short* in, int inlen)
else if (c < 0x10000) { *out++= (c >> 12) | 0xE0; bits= 6; }
else { *out++= (c >> 18) | 0xF0; bits= 12; }
for ( ; bits < 0; bits-= 6) {
for ( ; bits > 0; bits-= 6) {
if (out >= outend) return -1;
*out++= (c >> bits) & 0x3F;
}
@ -171,10 +171,10 @@ UTF16ToUTF8(unsigned char* out, int outlen, unsigned short* in, int inlen)
/**
* UTF8ToUTF16:
* @out: a pointer ot an array of shorts to store the result
* @outlen: the lenght of @out (number of shorts)
* @in: a pointer ot an array of UTF-8 chars
* @inlen: the lenght of @in
* @out: a pointer to an array of shorts to store the result
* @outlen: the length of @out (number of shorts)
* @in: a pointer to an array of UTF-8 chars
* @inlen: the length of @in
*
* Take a block of UTF-8 chars in and try to convert it to an UTF-16
* block of chars out.
@ -264,7 +264,7 @@ xmlDetectCharEncoding(const unsigned char* in)
/**
* xmlParseCharEncoding:
* @name: the encoding name as parsed, in UTF-8 format (ASCCI actually)
* @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
*
* Compare the string to the known encoding schemes already known. Note
* that the comparison is case insensitive accordingly to the section
@ -351,7 +351,7 @@ static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
/**
* xmlNewCharEncodingHandler:
* @name: the encoding name, in UTF-8 format (ASCCI actually)
* @name: the encoding name, in UTF-8 format (ASCII actually)
* @input: the xmlCharEncodingInputFunc to read that encoding
* @output: the xmlCharEncodingOutputFunc to write that encoding
*
@ -409,7 +409,7 @@ xmlNewCharEncodingHandler(const char *name, xmlCharEncodingInputFunc input,
*
* Initialize the char encoding support, it registers the default
* encoding supported.
* NOTE: while public theis function usually don't need to be called
* NOTE: while public, this function usually doesn't need to be called
* in normal processing.
*/
void

View File

@ -50,6 +50,9 @@ int xmlParserInputBufferRead (xmlParserInputBufferPtr in,
int len);
int xmlParserInputBufferGrow (xmlParserInputBufferPtr in,
int len);
int xmlParserInputBufferPush (xmlParserInputBufferPtr in,
int len,
char *buf);
void xmlFreeParserInputBuffer (xmlParserInputBufferPtr in);
char * xmlParserGetDirectory (const char *filename);

230
parser.c
View File

@ -2497,58 +2497,146 @@ xmlParseEntityValue(xmlParserCtxtPtr ctxt, xmlChar **orig) {
* [10] AttValue ::= '"' ([^<&"] | Reference)* '"' |
* "'" ([^<&'] | Reference)* "'"
*
* Returns the AttValue parsed or NULL.
* 3.3.3 Attribute-Value Normalization:
* Before the value of an attribute is passed to the application or
* checked for validity, the XML processor must normalize it as follows:
* - a character reference is processed by appending the referenced
* character to the attribute value
* - an entity reference is processed by recursively processing the
* replacement text of the entity
* - a whitespace character (#x20, #xD, #xA, #x9) is processed by
* appending #x20 to the normalized value, except that only a single
* #x20 is appended for a "#xD#xA" sequence that is part of an external
* parsed entity or the literal entity value of an internal parsed entity
* - other characters are processed by appending them to the normalized value
*
* Returns the AttValue parsed or NULL. The value has to be freed by the caller.
*/
xmlChar *
xmlParseAttValue(xmlParserCtxtPtr ctxt) {
/*
* NOTE(review): this listing appears to come from a diff rendered without
* +/- markers, so statements of the previous implementation (the
* xmlDecodeEntities() calls and "ret") and of the hand-written
* normalization loop are interleaved. Confirm against the real parser.c
* before relying on the statement order shown here.
*/
xmlChar *ret = NULL;
/* quote character expected to close the value, set from the opening one */
xmlChar limit = 0;
/* output buffer receiving the normalized value, and its allocated size */
xmlChar *buffer = NULL;
int buffer_size = 0;
/* write cursor inside buffer */
xmlChar *out = NULL;
/* read cursor over the content of a substituted entity */
xmlChar *current = NULL;
xmlEntityPtr ent;
/* input character currently being examined */
xmlChar cur;
/* non-zero when the last thing processed was a literal white space char */
int blank = 0;
SHRINK;
/* The value has to start with a quote; remember which one closes it. */
if (CUR == '"') {
ctxt->instate = XML_PARSER_ATTRIBUTE_VALUE;
limit = '"';
NEXT;
ret = xmlDecodeEntities(ctxt, -1, XML_SUBSTITUTE_REF, '"', '<', 0);
if (CUR == '<') {
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData,
"Unescaped '<' not allowed in attributes values\n");
ctxt->errNo = XML_ERR_LT_IN_ATTRIBUTE;
ctxt->wellFormed = 0;
}
if (CUR != '"') {
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
ctxt->errNo = XML_ERR_ATTRIBUTE_NOT_FINISHED;
ctxt->wellFormed = 0;
} else
NEXT;
} else if (CUR == '\'') {
limit = '\'';
ctxt->instate = XML_PARSER_ATTRIBUTE_VALUE;
NEXT;
ret = xmlDecodeEntities(ctxt, -1, XML_SUBSTITUTE_REF, '\'', '<', 0);
if (CUR == '<') {
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData,
"Unescaped '<' not allowed in attributes values\n");
ctxt->errNo = XML_ERR_LT_IN_ATTRIBUTE;
ctxt->wellFormed = 0;
}
if (CUR != '\'') {
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
ctxt->errNo = XML_ERR_ATTRIBUTE_NOT_FINISHED;
ctxt->wellFormed = 0;
} else
NEXT;
} else {
/* Neither quote found: report the error and give up. */
ctxt->errNo = XML_ERR_ATTRIBUTE_NOT_STARTED;
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData, "AttValue: \" or ' expected\n");
ctxt->wellFormed = 0;
return(NULL);
}
/*
* NOTE(review): as listed, this return makes everything below it
* unreachable; presumably it is a line of the previous implementation
* that the change removed. Verify.
*/
return(ret);
/*
* allocate a translation buffer.
*/
buffer_size = 100;
buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
if (buffer == NULL) {
perror("xmlParseAttValue: malloc failed");
return(NULL);
}
out = buffer;
/*
* Ok loop until we reach one of the ending char or a size limit.
* The loop stops on the closing quote, on '<', or on a zero character.
*/
cur = CUR;
while ((cur != limit) && (cur != '<')) {
if (cur == 0) break;
if ((cur == '&') && (NXT(1) == '#')) {
/*
* Character reference: store the value returned by xmlParseCharRef().
* NOTE(review): unlike the other branches there is no buffer growth
* check after this write; a long run of consecutive character
* references looks able to run past the buffer. Confirm.
*/
int val = xmlParseCharRef(ctxt);
*out++ = val;
blank = 0;
} else if (cur == '&') {
/* Entity reference. */
ent = xmlParseEntityRef(ctxt);
if ((ent != NULL) &&
(ctxt->replaceEntities != 0)) {
/* Substitution requested: copy the entity content to the output. */
current = ent->content;
while (*current != 0) {
*out++ = *current++;
/*
* Fewer than 10 slots left: grow. The offset is saved and "out" is
* recomputed from it afterwards, presumably because growBuffer()
* (defined outside this view) may move the buffer.
*/
if (out - buffer > buffer_size - 10) {
int index = out - buffer;
growBuffer(buffer);
out = &buffer[index];
}
}
} else if (ent != NULL) {
/* No substitution: write the reference back as '&' name ';'. */
int i = xmlStrlen(ent->name);
/* NOTE(review): this local shadows the outer "cur" declared above. */
const xmlChar *cur = ent->name;
*out++ = '&';
/*
* Room for the name is requested with a single growBuffer() call.
* NOTE(review): whether one call is always enough for a long name
* depends on growBuffer(), which is not visible here.
*/
if (out - buffer > buffer_size - i - 10) {
int index = out - buffer;
growBuffer(buffer);
out = &buffer[index];
}
for (;i > 0;i--)
*out++ = *cur++;
*out++ = ';';
}
blank = 0;
} else {
/* invalid for UTF-8 , use COPY(out); !!!!!! */
if ((cur == 0x20) || (cur == 0xD) || (cur == 0xA) || (cur == 0x9)) {
/*
* White space: emit a single 0x20 for the first character of a run
* and nothing for the following ones.
*/
if (!blank) {
*out++ = 0x20;
if (out - buffer > buffer_size - 10) {
int index = out - buffer;
growBuffer(buffer);
out = &buffer[index];
}
}
blank = 1;
} else {
/* Any other character is copied unchanged. */
*out++ = cur;
if (out - buffer > buffer_size - 10) {
int index = out - buffer;
growBuffer(buffer);
out = &buffer[index];
}
blank = 0;
}
NEXT;
}
cur = CUR;
}
/* Terminate the normalized string. */
*out++ = 0;
/*
* Report why the loop stopped if it was not on the closing quote.
* The second message always prints a single quote, whichever quote
* character "limit" holds.
*/
if (CUR == '<') {
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData,
"Unescaped '<' not allowed in attributes values\n");
ctxt->errNo = XML_ERR_LT_IN_ATTRIBUTE;
ctxt->wellFormed = 0;
} else if (CUR != limit) {
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n");
ctxt->errNo = XML_ERR_ATTRIBUTE_NOT_FINISHED;
ctxt->wellFormed = 0;
} else
NEXT;
/* The buffer is returned even when one of the errors above was raised. */
return(buffer);
}
/**
@ -4962,7 +5050,7 @@ xmlParseAttribute(xmlParserCtxtPtr ctxt, xmlChar **value) {
*
* [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
*
* Returns the element name parsed
* Returne the element name parsed
*/
xmlChar *
@ -5986,6 +6074,80 @@ xmlParseDocument(xmlParserCtxtPtr ctxt) {
return(0);
}
/************************************************************************
* *
* Progressive parsing interfaces *
* *
************************************************************************/
/**
* xmlParseLookupSequence:
* @ctxt: an XML parser context
* @first: the first char to lookup
* @next: the next char to lookup
*
* Try to find if a sequence (first, next) or just (first) if next
* is zero is available in the input stream.
* Since XML-1.0 is an LALR(2) grammar a sequence of 2 chars should be
* enough. If this doesn't prove true this function call may change.
*
* This is currently a placeholder: none of the arguments is examined and
* the function always reports that the sequence is not available.
*
* Returns 1 if the full sequence is available, 0 otherwise.
*/
int
xmlParseLookupSequence(xmlParserCtxtPtr ctxt, xmlChar first, xmlChar next) {
/* stub: the lookup is not implemented yet */
return(0);
}
/**
 * xmlParseTry:
 * @ctxt:  an XML parser context
 *
 * Try to progress on parsing.
 *
 * None of the parser states has an incremental handler yet. Each of them
 * therefore reports that no progress was possible; leaving the switch
 * without returning would spin forever in the enclosing loop, since
 * nothing changes ctxt->instate.
 *
 * Returns zero if no parsing was possible
 */
int
xmlParseTry(xmlParserCtxtPtr ctxt) {
    int ret = 0;

    if (ctxt == NULL)
        return(0);

    while (1) {
        switch (ctxt->instate) {
            case XML_PARSER_EOF:
                return(0);
            case XML_PARSER_PROLOG:
            case XML_PARSER_CONTENT:
            case XML_PARSER_ENTITY_DECL:
            case XML_PARSER_ENTITY_VALUE:
            case XML_PARSER_ATTRIBUTE_VALUE:
            case XML_PARSER_DTD:
            case XML_PARSER_EPILOG:
            case XML_PARSER_COMMENT:
            case XML_PARSER_CDATA_SECTION:
                /* no incremental handler for these states yet */
                return(ret);
            default:
                /* unknown state: nothing can be parsed */
                return(ret);
        }
    }
    return(ret);
}
/**
 * xmlParseChunk:
 * @ctxt:  an XML parser context
 * @chunk:  a char array
 * @size:  the size in bytes of the chunk
 * @terminate:  last chunk indicator (not consulted by this implementation)
 *
 * Parse a chunk of memory: a non-NULL chunk with a positive size is
 * handed to xmlParserInputBufferPush(); anything else is ignored. The
 * value returned by the push is not examined.
 *
 * NOTE(review): @chunk is const-qualified while the visible prototype of
 * xmlParserInputBufferPush() takes a plain char *; confirm the intended
 * constness.
 *
 * Returns zero if no error, the xmlParserErrors otherwise (this is the
 *         current ctxt->errNo).
 */
xmlParserErrors
xmlParseChunk(xmlParserCtxtPtr ctxt, const char *chunk, int size,
              int terminate) {
    /* nothing to push: just report the current error state */
    if ((chunk == NULL) || (size <= 0))
        return((xmlParserErrors) ctxt->errNo);

    xmlParserInputBufferPush(ctxt->input, size, chunk);
    return((xmlParserErrors) ctxt->errNo);
}
/************************************************************************
* *
* I/O front end functions to the parser *

View File

@ -2,42 +2,9 @@
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG April 1999//EN" "http://www.w3.org/Graphics/SVG/svg-19990412.dtd">
<svg width="800px" height="800px">
<desc>This sample SVG file draws a flower</desc>
<g style="transform: matrix(1 0 0 -1 -25.88 798.60);
stroke: #000; stroke-width: 1">
<path style="fill: #1A5466" d="m 242.33 684.19
C 346.44 757.48 271.45 647.38 213.17 641.37
C 271.45 647.38 383.43 575.21 256.71 613.30
C 383.43 575.21 251.04 565.90 205.23 609.68
C 251.04 565.90 265.13 432.88 210.71 557.95
C 265.13 432.88 175.04 531.37 175.67 596.26
C 175.04 531.37 80.63 437.67 138.96 559.82
C 80.63 437.67 100.67 569.80 146.75 611.20
C 100.67 569.80 -31.14 585.98 95.49 617.49
C -31.14 585.98 83.94 652.25 140.24 643.26
C 83.94 652.25 13.98 766.12 113.04 687.55
C 13.98 766.12 137.45 716.63 161.05 668.30
C 137.45 716.63 182.02 842.45 178.39 717.23
C 182.02 842.45 220.90 714.46 193.51 667.46
C 220.90 714.46 346.44 757.48 242.33 684.19 z"/>
<path style="fill: #34AACD" d="M 235.33 691.19
C 339.44 764.48 264.45 654.38 206.17 648.37
C 264.45 654.38 376.43 582.21 249.71 620.30
C 376.43 582.21 244.04 572.90 198.23 616.68
C 244.04 572.90 258.13 439.88 203.71 564.95
C 258.13 439.88 168.04 538.37 168.67 603.26
C 168.04 538.37 73.63 444.67 131.96 566.82
C 73.63 444.67 93.67 576.80 139.75 618.20
C 93.67 576.80 -38.14 592.98 88.49 624.49
C -38.14 592.98 76.94 659.25 133.24 650.26
C 76.94 659.25 6.98 773.12 106.04 694.55
C 6.98 773.12 130.45 723.63 154.05 675.30
C 130.45 723.63 175.02 849.45 171.39 724.23
C 175.02 849.45 213.90 721.46 186.51 674.46
C 213.90 721.46 339.44 764.48 235.33 691.19 z"/>
<path style="fill: #F881BF" d="M 199.44 634.43
C 199.44 622.16 189.19 612.21 176.54 612.21
C 163.89 612.21 153.63 622.16 153.63 634.43
C 153.63 646.71 163.89 656.66 176.54 656.66
C 189.19 656.66 199.44 646.71 199.44 634.43 z"/>
<g style="transform: matrix(1 0 0 -1 -25.88 798.60); stroke: #000; stroke-width: 1">
<path style="fill: #1A5466" d="m 242.33 684.19 C 346.44 757.48 271.45 647.38 213.17 641.37 C 271.45 647.38 383.43 575.21 256.71 613.30 C 383.43 575.21 251.04 565.90 205.23 609.68 C 251.04 565.90 265.13 432.88 210.71 557.95 C 265.13 432.88 175.04 531.37 175.67 596.26 C 175.04 531.37 80.63 437.67 138.96 559.82 C 80.63 437.67 100.67 569.80 146.75 611.20 C 100.67 569.80 -31.14 585.98 95.49 617.49 C -31.14 585.98 83.94 652.25 140.24 643.26 C 83.94 652.25 13.98 766.12 113.04 687.55 C 13.98 766.12 137.45 716.63 161.05 668.30 C 137.45 716.63 182.02 842.45 178.39 717.23 C 182.02 842.45 220.90 714.46 193.51 667.46 C 220.90 714.46 346.44 757.48 242.33 684.19 z"/>
<path style="fill: #34AACD" d="M 235.33 691.19 C 339.44 764.48 264.45 654.38 206.17 648.37 C 264.45 654.38 376.43 582.21 249.71 620.30 C 376.43 582.21 244.04 572.90 198.23 616.68 C 244.04 572.90 258.13 439.88 203.71 564.95 C 258.13 439.88 168.04 538.37 168.67 603.26 C 168.04 538.37 73.63 444.67 131.96 566.82 C 73.63 444.67 93.67 576.80 139.75 618.20 C 93.67 576.80 -38.14 592.98 88.49 624.49 C -38.14 592.98 76.94 659.25 133.24 650.26 C 76.94 659.25 6.98 773.12 106.04 694.55 C 6.98 773.12 130.45 723.63 154.05 675.30 C 130.45 723.63 175.02 849.45 171.39 724.23 C 175.02 849.45 213.90 721.46 186.51 674.46 C 213.90 721.46 339.44 764.48 235.33 691.19 z"/>
<path style="fill: #F881BF" d="M 199.44 634.43 C 199.44 622.16 189.19 612.21 176.54 612.21 C 163.89 612.21 153.63 622.16 153.63 634.43 C 153.63 646.71 163.89 656.66 176.54 656.66 C 189.19 656.66 199.44 646.71 199.44 634.43 z"/>
</g>
</svg>

Before

Width:  |  Height:  |  Size: 2.2 KiB

After

Width:  |  Height:  |  Size: 1.8 KiB

View File

@ -3,7 +3,7 @@
<svg width="4in" height="3in">
<defs>
<symbol id="Triangle1" min-x="0" min-y="0" max-x="300" max-y="200">
<path d="M 50 0 L 50 200 L 250 0 z"/>
<path d="M 50 0 L 50 200 L 250 0 z"/>
</symbol>
<symbol id="Triangle2" min-x="0" min-y="0" max-x="300" max-y="200">
<path d="M 50 0 L 250 200 L 250 0 z"/>

Before

Width:  |  Height:  |  Size: 848 B

After

Width:  |  Height:  |  Size: 847 B

2
result/att1 Normal file
View File

@ -0,0 +1,2 @@
<?xml version="1.0"?>
<doc attr="to normalize with a space"/>

2
result/att2 Normal file
View File

@ -0,0 +1,2 @@
<?xml version="1.0"?>
<doc attr="to normalize with a space"/>

View File

@ -1674,8 +1674,7 @@ match <termref def="NT-Nmtokens">Nmtokens</termref>.
<p>The XML processor must normalize attribute values before
passing them to the application, as described in
<specref ref="AVNormalize"/>.</p>-->
<p><termdef id="dt-enumerated" term="Enumerated Attribute
Values"><term>Enumerated attributes</term> can take one
<p><termdef id="dt-enumerated" term="Enumerated Attribute Values"><term>Enumerated attributes</term> can take one
of a list of values provided in the declaration</termdef>. There are two
kinds of enumerated types:
<scrap lang="ebnf"><head>Enumerated Attribute Types</head><prod id="NT-EnumeratedType"><lhs>EnumeratedType</lhs><rhs><nt def="NT-NotationType">NotationType</nt>

2
test/att1 Normal file
View File

@ -0,0 +1,2 @@
<doc attr="to normalize
with a space"/>

1
test/att2 Normal file
View File

@ -0,0 +1 @@
<doc attr="to normalize with a space"/>

50
xmlIO.c
View File

@ -249,6 +249,55 @@ xmlParserInputBufferCreateFd(int fd, xmlCharEncoding enc) {
return(ret);
}
/**
 * xmlParserInputBufferPush:
 * @in:  a buffered parser input
 * @len:  the size in bytes of the array.
 * @buf:  a char array
 *
 * Push the content of the array in the input buffer.
 * When @in has an encoder, the bytes are first converted through
 * in->encoder->input() into a temporary buffer which is then appended;
 * otherwise the bytes are appended as they are. @buf itself is only read.
 * This is used when operating the parser in progressive (push) mode.
 *
 * Returns the number of chars read and stored in the buffer, or -1
 *         in case of error (NULL argument, out of memory, or an encoder
 *         reporting a failure). A negative @len yields 0.
 */
int
xmlParserInputBufferPush(xmlParserInputBufferPtr in, int len, char *buf) {
    int nbchars = 0;

    if (len < 0) return(0);
    if ((in == NULL) || (buf == NULL)) return(-1);

    if (in->encoder != NULL) {
        /* same sizing as before: twice the input plus room for the 0 */
        int outlen = (len + 1) * 2 * sizeof(xmlChar);
        xmlChar *converted;

        converted = (xmlChar *) xmlMalloc(outlen);
        if (converted == NULL) {
            fprintf(stderr, "xmlParserInputBufferPush : out of memory !\n");
            return(-1);
        }
        /* convert the caller's data, not a local placeholder */
        nbchars = in->encoder->input(converted, outlen, BAD_CAST buf, len);
        /*
         * TODO : we really need to have something atomic or the
         *        encoder must report the number of bytes read
         */
        if (nbchars < 0) {
            /* a negative count must not be used as an index below */
            xmlFree(converted);
            return(-1);
        }
        converted[nbchars] = 0;
        xmlBufferAdd(in->buffer, converted, nbchars);
        xmlFree(converted);
    } else {
        /*
         * No transcoding: append the caller's bytes directly. No terminator
         * is written into @buf, since only @len bytes of it are known to
         * belong to the caller's array.
         */
        nbchars = len;
        xmlBufferAdd(in->buffer, (xmlChar *) buf, nbchars);
    }
#ifdef DEBUG_INPUT
    fprintf(stderr, "I/O: pushed %d chars, buffer %d/%d\n",
            nbchars, in->buffer->use, in->buffer->size);
#endif
    return(nbchars);
}
/**
* xmlParserInputBufferGrow:
* @in: a buffered parser input
@ -256,6 +305,7 @@ xmlParserInputBufferCreateFd(int fd, xmlCharEncoding enc) {
*
* Grow up the content of the input buffer, the old data are preserved
* This routine handle the I18N transcoding to internal UTF-8
* This routine is used when operating the parser in normal (pull) mode
* TODO: one should be able to remove one extra copy
*
* Returns the number of chars read and stored in the buffer, or -1

View File

@ -50,6 +50,9 @@ int xmlParserInputBufferRead (xmlParserInputBufferPtr in,
int len);
int xmlParserInputBufferGrow (xmlParserInputBufferPtr in,
int len);
int xmlParserInputBufferPush (xmlParserInputBufferPtr in,
int len,
char *buf);
void xmlFreeParserInputBuffer (xmlParserInputBufferPtr in);
char * xmlParserGetDirectory (const char *filename);