From 7f8585025f550f53c1c20b242a48ebc3a5704d74 Mon Sep 17 00:00:00 2001 From: Daniel Veillard Date: Wed, 17 Nov 1999 17:32:38 +0000 Subject: [PATCH] Attribute nomarlization closing bug #3597 Small fixes in encoding.c First bits of real progressive parsing, Daniel --- ChangeLog | 8 ++ SAXresult/att1 | 5 + SAXresult/att2 | 5 + encoding.c | 40 +++--- include/libxml/xmlIO.h | 3 + parser.c | 230 +++++++++++++++++++++++++----- result/SVG/flower2.xml | 41 +----- result/SVG/toap02.xml | 2 +- result/att1 | 2 + result/att2 | 2 + result/valid/REC-xml-19980210.xml | 3 +- test/att1 | 2 + test/att2 | 1 + xmlIO.c | 50 +++++++ xmlIO.h | 3 + 15 files changed, 303 insertions(+), 94 deletions(-) create mode 100644 SAXresult/att1 create mode 100644 SAXresult/att2 create mode 100644 result/att1 create mode 100644 result/att2 create mode 100644 test/att1 create mode 100644 test/att2 diff --git a/ChangeLog b/ChangeLog index d9dc9d28..f677325e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,11 @@ +Wed Nov 17 18:28:06 CET 1999 + + * encoding.c: bug fix and typos + * xmlIO.[ch] parser.c: first bits toward real progressive parsing + * parser.c: added attribute normalization closing bug #3597 + * test/att* result/att* SAXresult/att*: testcase for attribute + normalization + Mon Nov 15 18:50:56 CET 1999 Daniel Veillard * configure.in: closing bug #3163 by adding extra flags for the diff --git a/SAXresult/att1 b/SAXresult/att1 new file mode 100644 index 00000000..125e1b2c --- /dev/null +++ b/SAXresult/att1 @@ -0,0 +1,5 @@ +SAX.setDocumentLocator() +SAX.startDocument() +SAX.startElement(doc, attr='to normalize with a space') +SAX.endElement(doc) +SAX.endDocument() diff --git a/SAXresult/att2 b/SAXresult/att2 new file mode 100644 index 00000000..125e1b2c --- /dev/null +++ b/SAXresult/att2 @@ -0,0 +1,5 @@ +SAX.setDocumentLocator() +SAX.startDocument() +SAX.startElement(doc, attr='to normalize with a space') +SAX.endElement(doc) +SAX.endDocument() diff --git a/encoding.c b/encoding.c index 
5169cdea..50deb2c4 100644 --- a/encoding.c +++ b/encoding.c @@ -51,10 +51,10 @@ /** * isolat1ToUTF8: - * @out: a pointer ot an array of bytes to store the result - * @outlen: the lenght of @out - * @in: a pointer ot an array of ISO Latin 1 chars - * @inlen: the lenght of @in + * @out: a pointer to an array of bytes to store the result + * @outlen: the length of @out + * @in: a pointer to an array of ISO Latin 1 chars + * @inlen: the length of @in * * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8 * block of chars out. @@ -86,10 +86,10 @@ isolat1ToUTF8(unsigned char* out, int outlen, unsigned char* in, int inlen) /** * UTF8Toisolat1: - * @out: a pointer ot an array of bytes to store the result - * @outlen: the lenght of @out - * @in: a pointer ot an array of UTF-8 chars - * @inlen: the lenght of @in + * @out: a pointer to an array of bytes to store the result + * @outlen: the length of @out + * @in: a pointer to an array of UTF-8 chars + * @inlen: the length of @in * * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1 * block of chars out. @@ -123,10 +123,10 @@ UTF8Toisolat1(unsigned char* out, int outlen, unsigned char* in, int inlen) /** * UTF16ToUTF8: - * @out: a pointer ot an array of bytes to store the result - * @outlen: the lenght of @out - * @in: a pointer ot an array of UTF-16 chars (array of unsigned shorts) - * @inlen: the lenght of @in + * @out: a pointer to an array of bytes to store the result + * @outlen: the length of @out + * @in: a pointer to an array of UTF-16 chars (array of unsigned shorts) + * @inlen: the length of @in * * Take a block of UTF-16 ushorts in and try to convert it to an UTF-8 * block of chars out. 
@@ -161,7 +161,7 @@ UTF16ToUTF8(unsigned char* out, int outlen, unsigned short* in, int inlen) else if (c < 0x10000) { *out++= (c >> 12) | 0xE0; bits= 6; } else { *out++= (c >> 18) | 0xF0; bits= 12; } - for ( ; bits < 0; bits-= 6) { + for ( ; bits > 0; bits-= 6) { if (out >= outend) return -1; *out++= (c >> bits) & 0x3F; } @@ -171,10 +171,10 @@ UTF16ToUTF8(unsigned char* out, int outlen, unsigned short* in, int inlen) /** * UTF8ToUTF16: - * @out: a pointer ot an array of shorts to store the result - * @outlen: the lenght of @out (number of shorts) - * @in: a pointer ot an array of UTF-8 chars - * @inlen: the lenght of @in + * @out: a pointer to an array of shorts to store the result + * @outlen: the length of @out (number of shorts) + * @in: a pointer to an array of UTF-8 chars + * @inlen: the length of @in * * Take a block of UTF-8 chars in and try to convert it to an UTF-16 * block of chars out. @@ -264,7 +264,7 @@ xmlDetectCharEncoding(const unsigned char* in) /** * xmlParseCharEncoding: - * @name: the encoding name as parsed, in UTF-8 format (ASCCI actually) + * @name: the encoding name as parsed, in UTF-8 format (ASCII actually) * * Conpare the string to the known encoding schemes already known. Note * that the comparison is case insensitive accordingly to the section @@ -351,7 +351,7 @@ static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL; /** * xmlNewCharEncodingHandler: - * @name: the encoding name, in UTF-8 format (ASCCI actually) + * @name: the encoding name, in UTF-8 format (ASCII actually) * @input: the xmlCharEncodingInputFunc to read that encoding * @output: the xmlCharEncodingOutputFunc to write that encoding * @@ -409,7 +409,7 @@ xmlNewCharEncodingHandler(const char *name, xmlCharEncodingInputFunc input, * * Initialize the char encoding support, it registers the default * encoding supported. 
- * NOTE: while public theis function usually don't need to be called + * NOTE: while public, this function usually doesn't need to be called * in normal processing. */ void diff --git a/include/libxml/xmlIO.h b/include/libxml/xmlIO.h index a99ab23d..58baeb01 100644 --- a/include/libxml/xmlIO.h +++ b/include/libxml/xmlIO.h @@ -50,6 +50,9 @@ int xmlParserInputBufferRead (xmlParserInputBufferPtr in, int len); int xmlParserInputBufferGrow (xmlParserInputBufferPtr in, int len); +int xmlParserInputBufferPush (xmlParserInputBufferPtr in, + int len, + char *buf); void xmlFreeParserInputBuffer (xmlParserInputBufferPtr in); char * xmlParserGetDirectory (const char *filename); diff --git a/parser.c b/parser.c index 207e6496..3491efbc 100644 --- a/parser.c +++ b/parser.c @@ -2497,58 +2497,146 @@ xmlParseEntityValue(xmlParserCtxtPtr ctxt, xmlChar **orig) { * [10] AttValue ::= '"' ([^<&"] | Reference)* '"' | * "'" ([^<&'] | Reference)* "'" * - * Returns the AttValue parsed or NULL. + * 3.3.3 Attribute-Value Normalization: + * Before the value of an attribute is passed to the application or + * checked for validity, the XML processor must normalize it as follows: + * - a character reference is processed by appending the referenced + * character to the attribute value + * - an entity reference is processed by recursively processing the + * replacement text of the entity + * - a whitespace character (#x20, #xD, #xA, #x9) is processed by + * appending #x20 to the normalized value, except that only a single + * #x20 is appended for a "#xD#xA" sequence that is part of an external + * parsed entity or the literal entity value of an internal parsed entity + * - other characters are processed by appending them to the normalized value + * + * Returns the AttValue parsed or NULL. The value has to be freed by the caller. 
*/ xmlChar * xmlParseAttValue(xmlParserCtxtPtr ctxt) { - xmlChar *ret = NULL; + xmlChar limit = 0; + xmlChar *buffer = NULL; + int buffer_size = 0; + xmlChar *out = NULL; + + xmlChar *current = NULL; + xmlEntityPtr ent; + xmlChar cur; + int blank = 0; + SHRINK; if (CUR == '"') { ctxt->instate = XML_PARSER_ATTRIBUTE_VALUE; + limit = '"'; NEXT; - ret = xmlDecodeEntities(ctxt, -1, XML_SUBSTITUTE_REF, '"', '<', 0); - if (CUR == '<') { - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "Unescaped '<' not allowed in attributes values\n"); - ctxt->errNo = XML_ERR_LT_IN_ATTRIBUTE; - ctxt->wellFormed = 0; - } - if (CUR != '"') { - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n"); - ctxt->errNo = XML_ERR_ATTRIBUTE_NOT_FINISHED; - ctxt->wellFormed = 0; - } else - NEXT; } else if (CUR == '\'') { + limit = '\''; ctxt->instate = XML_PARSER_ATTRIBUTE_VALUE; NEXT; - ret = xmlDecodeEntities(ctxt, -1, XML_SUBSTITUTE_REF, '\'', '<', 0); - if (CUR == '<') { - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "Unescaped '<' not allowed in attributes values\n"); - ctxt->errNo = XML_ERR_LT_IN_ATTRIBUTE; - ctxt->wellFormed = 0; - } - if (CUR != '\'') { - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n"); - ctxt->errNo = XML_ERR_ATTRIBUTE_NOT_FINISHED; - ctxt->wellFormed = 0; - } else - NEXT; } else { ctxt->errNo = XML_ERR_ATTRIBUTE_NOT_STARTED; if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "AttValue: \" or ' expected\n"); ctxt->wellFormed = 0; + return(NULL); } - return(ret); + /* + * allocate a translation buffer. 
+ */ + buffer_size = 100; + buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar)); + if (buffer == NULL) { + perror("xmlParseAttValue: malloc failed"); + return(NULL); + } + out = buffer; + + /* + * Ok loop until we reach one of the ending char or a size limit. + */ + cur = CUR; + while ((cur != limit) && (cur != '<')) { + + if (cur == 0) break; + /* keep room for the next char written and for the final 0 */ + if (out - buffer > buffer_size - 10) { + int index = out - buffer; + + growBuffer(buffer); + out = &buffer[index]; + } + if ((cur == '&') && (NXT(1) == '#')) { + int val = xmlParseCharRef(ctxt); + /* a zero value is not appended, it would cut the string short */ + if (val != 0) { + *out++ = val; + blank = 0; + } + } else if (cur == '&') { + ent = xmlParseEntityRef(ctxt); + if ((ent != NULL) && + (ctxt->replaceEntities != 0)) { + current = ent->content; + while ((current != NULL) && (*current != 0)) { + *out++ = *current++; + if (out - buffer > buffer_size - 10) { + int index = out - buffer; + + growBuffer(buffer); + out = &buffer[index]; + } + } + } else if (ent != NULL) { + int i = xmlStrlen(ent->name); + const xmlChar *name = ent->name; + + /* no substitution: copy the reference back as &name; */ + *out++ = '&'; + while (out - buffer > buffer_size - i - 10) { + int index = out - buffer; + + growBuffer(buffer); + out = &buffer[index]; + } + for (;i > 0;i--) + *out++ = *name++; + *out++ = ';'; + } + blank = 0; + } else { 
+ /* invalid for UTF-8 , use COPY(out); !!!!!! */ + if ((cur == 0x20) || (cur == 0xD) || (cur == 0xA) || (cur == 0x9)) { + /* a run of blanks yields a single #x20 */ + if (!blank) { + *out++ = 0x20; + } + blank = 1; + } else { + *out++ = cur; + blank = 0; + } + NEXT; + } + cur = CUR; + } + *out++ = 0; + if (CUR == '<') { + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "Unescaped '<' not allowed in attributes values\n"); + ctxt->errNo = XML_ERR_LT_IN_ATTRIBUTE; + ctxt->wellFormed = 0; + } else if (CUR != limit) { + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n"); + ctxt->errNo = XML_ERR_ATTRIBUTE_NOT_FINISHED; + ctxt->wellFormed = 0; + } else + NEXT; + return(buffer); } /** @@ -4962,7 +5050,7 @@ xmlParseAttribute(xmlParserCtxtPtr ctxt, xmlChar **value) { * * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>' * - * Returns the element name parsed + * Returne the element name parsed */ xmlChar * @@ -5986,6 +6074,80 @@ xmlParseDocument(xmlParserCtxtPtr ctxt) { return(0); } +/************************************************************************ + * * + * Progressive parsing interfaces * + * * + ************************************************************************/ + +/** + * xmlParseLookupSequence: + * @ctxt: an XML parser context + * @first: the first char to lookup + * @next: the next char to lookup + * + * Try to find if a sequence (first, next) or just (first) if next + * is zero is available in the input stream. + * Since XML-1.0 is an LALR(2) grammar a sequence of 2 char should be + * enough. If this doesn't prove true this function call may change. + * + * Returns 1 if the full sequence is available, 0 otherwise. 
+ */ +int +xmlParseLookupSequence(xmlParserCtxtPtr ctxt, xmlChar first, xmlChar next) { + return(0); +} + +/** + * xmlParseTry: + * @ctxt: an XML parser context + * + * Try to progress on parsing + * + * Returns zero if no parsing was possible + */ +int +xmlParseTry(xmlParserCtxtPtr ctxt) { + int ret = 0; + + switch (ctxt->instate) { + case XML_PARSER_EOF: + return(0); + case XML_PARSER_PROLOG: + case XML_PARSER_CONTENT: + case XML_PARSER_ENTITY_DECL: + case XML_PARSER_ENTITY_VALUE: + case XML_PARSER_ATTRIBUTE_VALUE: + case XML_PARSER_DTD: + case XML_PARSER_EPILOG: + case XML_PARSER_COMMENT: + case XML_PARSER_CDATA_SECTION: + /* nothing is parsed for these states yet, so no progress */ + break; + } + return(ret); +} + +/** + * xmlParseChunk: + * @ctxt: an XML parser context + * @chunk: a char array + * @size: the size in bytes of the chunk + * @terminate: last chunk indicator (not used yet) + * + * Parse a Chunk of memory + * + * Returns zero if no error, the xmlParserErrors otherwise. + */ +xmlParserErrors +xmlParseChunk(xmlParserCtxtPtr ctxt, const char *chunk, int size, + int terminate) { + if ((size > 0) && (chunk != NULL) && + (ctxt->input != NULL) && (ctxt->input->buf != NULL)) + xmlParserInputBufferPush(ctxt->input->buf, size, (char *) chunk); + return((xmlParserErrors) ctxt->errNo); +} + /************************************************************************ * * * I/O front end functions to the parser * diff --git a/result/SVG/flower2.xml b/result/SVG/flower2.xml index f8970db3..b34513d3 100644 --- a/result/SVG/flower2.xml +++ b/result/SVG/flower2.xml @@ -2,42 +2,9 @@ This sample SVG file draws a flower - - - - + + + + diff --git a/result/SVG/toap02.xml b/result/SVG/toap02.xml index c62aa04b..0a903689 100644 --- a/result/SVG/toap02.xml +++ b/result/SVG/toap02.xml @@ -3,7 +3,7 @@ - + diff --git a/result/att1 b/result/att1 new file mode 100644 index 00000000..d3ed2adf --- /dev/null +++ b/result/att1 @@ -0,0 +1,2 @@ + + diff --git a/result/att2 b/result/att2 new file mode 100644 index 00000000..d3ed2adf --- /dev/null +++ b/result/att2 @@ -0,0 +1,2 @@ + + diff --git 
a/result/valid/REC-xml-19980210.xml b/result/valid/REC-xml-19980210.xml index 2ff55122..7f70749f 100644 --- a/result/valid/REC-xml-19980210.xml +++ b/result/valid/REC-xml-19980210.xml @@ -1674,8 +1674,7 @@ match Nmtokens.

The XML processor must normalize attribute values before passing them to the application, as described in .

--> -

Enumerated attributes can take one +

Enumerated attributes can take one of a list of values provided in the declaration. There are two kinds of enumerated types: Enumerated Attribute TypesEnumeratedTypeNotationType diff --git a/test/att1 b/test/att1 new file mode 100644 index 00000000..609e5cc0 --- /dev/null +++ b/test/att1 @@ -0,0 +1,2 @@ + diff --git a/test/att2 b/test/att2 new file mode 100644 index 00000000..e630ff54 --- /dev/null +++ b/test/att2 @@ -0,0 +1 @@ + diff --git a/xmlIO.c b/xmlIO.c index 668caaba..ea14efe0 100644 --- a/xmlIO.c +++ b/xmlIO.c @@ -249,6 +249,55 @@ xmlParserInputBufferCreateFd(int fd, xmlCharEncoding enc) { return(ret); } +/** + * xmlParserInputBufferPush: + * @in: a buffered parser input + * @len: the size in bytes of the array. + * @buf: a char array + * + * Push the content of the array in the input buffer + * This routine handles the I18N transcoding to internal UTF-8 + * This is used when operating the parser in progressive (push) mode. + * + * Returns the number of chars read and stored in the buffer, or -1 + * in case of error. 
+ */ +int +xmlParserInputBufferPush(xmlParserInputBufferPtr in, int len, char *buf) { + int nbchars = 0; + + if (len < 0) return(0); + if (in->encoder != NULL) { + xmlChar *buffer; + + buffer = (xmlChar *) xmlMalloc((len + 1) * 2 * sizeof(xmlChar)); + if (buffer == NULL) { + fprintf(stderr, "xmlParserInputBufferPush : out of memory !\n"); + return(-1); + } + nbchars = in->encoder->input(buffer, (len + 1) * 2 * sizeof(xmlChar), + BAD_CAST buf, len); + /* + * TODO : we really need to have something atomic or the + * encoder must report the number of bytes read + */ + if (nbchars >= 0) { + buffer[nbchars] = 0; + xmlBufferAdd(in->buffer, buffer, nbchars); + } + xmlFree(buffer); + } else { + /* no encoder: the caller's bytes are appended unmodified */ + nbchars = len; + xmlBufferAdd(in->buffer, (xmlChar *) buf, nbchars); + } +#ifdef DEBUG_INPUT + fprintf(stderr, "I/O: pushed %d chars, buffer %d/%d\n", + nbchars, in->buffer->use, in->buffer->size); +#endif + return(nbchars); +} + /** * xmlParserInputBufferGrow: * @in: a buffered parser input @@ -256,6 +305,7 @@ xmlParserInputBufferCreateFd(int fd, xmlCharEncoding enc) { * * Grow up the content of the input buffer, the old data are preserved * This routine handle the I18N transcoding to internal UTF-8 + * This routine is used when operating the parser in normal (pull) mode * TODO: one should be able to remove one extra copy * * Returns the number of chars read and stored in the buffer, or -1 diff --git a/xmlIO.h b/xmlIO.h index a99ab23d..58baeb01 100644 --- a/xmlIO.h +++ b/xmlIO.h @@ -50,6 +50,9 @@ int xmlParserInputBufferRead (xmlParserInputBufferPtr in, int len); int xmlParserInputBufferGrow (xmlParserInputBufferPtr in, int len); +int xmlParserInputBufferPush (xmlParserInputBufferPtr in, + int len, + char *buf); void xmlFreeParserInputBuffer (xmlParserInputBufferPtr in); char * xmlParserGetDirectory (const char *filename);