1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-02-05 05:47:00 +03:00

- Push mode for the HTML parser (new calls)

- Improved the memory debugger to provide content information
- cleanups, last known mem leak killed
Daniel
This commit is contained in:
Daniel Veillard 1999-12-29 12:49:06 +00:00
parent be849cf33f
commit 5e5c62351f
8 changed files with 941 additions and 103 deletions

View File

@ -1,3 +1,14 @@
Wed Dec 29 15:29:52 CET 1999 Daniel Veillard <Daniel.Veillard@w3.org>
* HTMLparser.[ch] testHTML.c: added push mode for the HTML parser
too htmlCreatePushParserCtxt() and htmlParseChunk()
* parser.c: a bit of cleanup.
* SAX.c, HTMLparser.c: some attributes may not have values (contrary
to XML); removed the last known mem leak
* HTMLtree.c: output message cleanup
* xmlmemory.c: display content info about memory blocks
* result/HTML/wired.* : missing att value warning change
Tue Dec 28 17:42:41 CET 1999 Daniel Veillard <Daniel.Veillard@w3.org>
* doc/* : rebuilt the documentation

View File

@ -41,11 +41,15 @@
#include "valid.h"
#include "parserInternals.h"
#include "xmlIO.h"
#include "xml-error.h"
#define HTML_MAX_NAMELEN 1000
#define INPUT_CHUNK 50
#define HTML_PARSER_BIG_BUFFER_SIZE 1024
#define HTML_PARSER_BUFFER_SIZE 100
/* #define DEBUG */
/* #define DEBUG_PUSH */
/************************************************************************
* *
@ -145,20 +149,6 @@ PUSH_AND_POP(extern, xmlChar*, name)
xmlParserInputGrow(ctxt->input, INPUT_CHUNK); \
}}
/****************************************
#define NEXT ((*ctxt->input->cur) ? \
(((*(ctxt->input->cur) == '\n') ? \
(ctxt->input->line++, ctxt->input->col = 1) : \
(ctxt->input->col++)), \
(ctxt->input->cur++), \
((*ctxt->input->cur) ? \
(xmlParserInputGrow(ctxt->input, 100), \
ctxt->input->cur): \
(ctxt->input->cur))) : \
((xmlParserInputGrow(ctxt->input, 100) > 0) ? \
ctxt->input->cur: \
(xmlPopInput(ctxt), ctxt->input->cur)))
****************************************/
#else
#endif
@ -926,7 +916,7 @@ htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
/*
* allocate a translation buffer.
*/
buffer_size = 1000;
buffer_size = HTML_PARSER_BIG_BUFFER_SIZE;
buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar));
if (buffer == NULL) {
perror("htmlDecodeEntities: malloc failed");
@ -1128,6 +1118,66 @@ htmlSwitchEncoding(htmlParserCtxtPtr ctxt, xmlCharEncoding enc)
}
}
/************************************************************************
* *
* Commodity functions to handle streams *
* *
************************************************************************/
/**
 * htmlFreeInputStream:
 * @input:  an htmlParserInputPtr, may be NULL
 *
 * Release an input stream and what it references: the filename and
 * directory strings, the base buffer (through the stream's own free
 * callback when one is set) and the attached parser input buffer.
 * The structure is overwritten with 0xFF bytes before being freed.
 */
void
htmlFreeInputStream(htmlParserInputPtr input) {
    if (input != NULL) {
        if (input->filename != NULL) {
            xmlFree((char *) input->filename);
        }
        if (input->directory != NULL) {
            xmlFree((char *) input->directory);
        }
        /* the base buffer is only released when a free callback exists */
        if ((input->base != NULL) && (input->free != NULL)) {
            input->free((xmlChar *) input->base);
        }
        if (input->buf != NULL) {
            xmlFreeParserInputBuffer(input->buf);
        }
        /* poison the structure so that stale uses are easier to spot */
        memset(input, -1, sizeof(htmlParserInput));
        xmlFree(input);
    }
}
/**
 * htmlNewInputStream:
 * @ctxt:  an HTML parser context, only used to report an allocation
 *         failure; may be NULL
 *
 * Create a new input stream structure with no filename, directory,
 * data or buffer attached, positioned at line 1, column 1.
 *
 * Returns the new input stream or NULL if the allocation failed
 */
htmlParserInputPtr
htmlNewInputStream(htmlParserCtxtPtr ctxt) {
    htmlParserInputPtr input;

    input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
    if (input == NULL) {
        /* without a context there is nowhere to report the failure */
        if (ctxt != NULL) {
            /* set before the callback so a handler can inspect it ... */
            ctxt->errNo = XML_ERR_NO_MEMORY;
            if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
                ctxt->sax->error(ctxt->userData,
                         "malloc: couldn't allocate a new input stream\n");
            /* ... and re-asserted in case the handler altered it */
            ctxt->errNo = XML_ERR_NO_MEMORY;
        }
        return(NULL);
    }
    input->filename = NULL;
    input->directory = NULL;
    input->base = NULL;
    input->cur = NULL;
    input->buf = NULL;
    input->line = 1;
    input->col = 1;
    input->free = NULL;
    input->consumed = 0;
    input->length = 0;
    return(input);
}
/************************************************************************
* *
@ -1268,12 +1318,13 @@ xmlChar *
htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
xmlChar *ret = NULL;
int i = 0;
xmlChar loc[100];
xmlChar loc[HTML_PARSER_BUFFER_SIZE];
if (!IS_LETTER(CUR) && (CUR != '_') &&
(CUR != ':')) return(NULL);
while ((i < 100) && ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)))) {
while ((i < HTML_PARSER_BUFFER_SIZE) &&
((IS_LETTER(CUR)) || (IS_DIGIT(CUR)))) {
if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
else loc[i] = CUR;
i++;
@ -1615,7 +1666,7 @@ void
htmlParseCharData(htmlParserCtxtPtr ctxt, int cdata) {
xmlChar *buf = NULL;
int len = 0;
int size = 100;
int size = HTML_PARSER_BUFFER_SIZE;
xmlChar q;
buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar));
@ -1742,17 +1793,16 @@ htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID, int strict) {
/**
* htmlParseComment:
* @ctxt: an HTML parser context
* @create: should we create a node, or just skip the content
*
* Parse an XML (SGML) comment <!-- .... -->
*
* [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
*/
void
htmlParseComment(htmlParserCtxtPtr ctxt, int create) {
htmlParseComment(htmlParserCtxtPtr ctxt) {
xmlChar *buf = NULL;
int len = 0;
int size = 100;
int size = HTML_PARSER_BUFFER_SIZE;
register xmlChar s, r, q;
/*
@ -1793,10 +1843,8 @@ htmlParseComment(htmlParserCtxtPtr ctxt, int create) {
ctxt->wellFormed = 0;
} else {
NEXT;
if (create) {
if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL)) {
ctxt->sax->comment(ctxt->userData, buf);
}
if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL)) {
ctxt->sax->comment(ctxt->userData, buf);
}
}
xmlFree(buf);
@ -1935,6 +1983,9 @@ htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
/*
* Create the document accordingly to the DOCTYPE
*/
if (ctxt->myDoc != NULL)
xmlFreeDoc(ctxt->myDoc);
ctxt->myDoc = htmlNewDoc(URI, ExternalID);
/*
@ -1968,7 +2019,7 @@ htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
xmlChar *
htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
xmlChar *name, *val;
xmlChar *name, *val = NULL;
*value = NULL;
name = htmlParseName(ctxt);
@ -1990,10 +2041,8 @@ htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
} else {
/* TODO : some attribute must have values, some may not */
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData,
"Specification mandate value for attribute %s\n", name);
ctxt->wellFormed = 0;
return(NULL);
ctxt->sax->warning(ctxt->userData,
"No value for attribute %s\n", name);
}
*value = val;
@ -2060,7 +2109,7 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
GROW;
attname = htmlParseAttribute(ctxt, &attvalue);
if ((attname != NULL) && (attvalue != NULL)) {
if (attname != NULL) {
/*
* Well formedness requires at most one declaration of an attribute
*/
@ -2072,7 +2121,8 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
attname);
ctxt->wellFormed = 0;
xmlFree(attname);
xmlFree(attvalue);
if (attvalue != NULL)
xmlFree(attvalue);
goto failed;
}
}
@ -2127,7 +2177,10 @@ failed:
ctxt->sax->startElement(ctxt->userData, name, atts);
if (atts != NULL) {
for (i = 0;i < nbatts;i++) xmlFree((xmlChar *) atts[i]);
for (i = 0;i < nbatts;i++) {
if (atts[i] != NULL)
xmlFree((xmlChar *) atts[i]);
}
xmlFree(atts);
}
if (name != NULL) xmlFree(name);
@ -2330,7 +2383,7 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {
*/
if ((CUR == '<') && (NXT(1) == '!') &&
(NXT(2) == '-') && (NXT(3) == '-')) {
htmlParseComment(ctxt, 1);
htmlParseComment(ctxt);
}
/*
@ -2384,11 +2437,11 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {
void
htmlParseElement(htmlParserCtxtPtr ctxt) {
const xmlChar *openTag = CUR_PTR;
xmlChar *oldname;
xmlChar *name;
xmlChar *currentNode = NULL;
htmlElemDescPtr info;
htmlParserNodeInfo node_info;
xmlChar *oldname;
int depth = ctxt->nameNr;
/* Capture start position */
@ -2585,8 +2638,9 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) {
*/
while ((CUR == '<') && (NXT(1) == '!') &&
(NXT(2) == '-') && (NXT(3) == '-')) {
ctxt->myDoc = htmlNewDoc(NULL, NULL);
htmlParseComment(ctxt, 1);
if (ctxt->myDoc == NULL)
ctxt->myDoc = htmlNewDoc(NULL, NULL);
htmlParseComment(ctxt);
SKIP_BLANKS;
}
@ -2721,6 +2775,7 @@ htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
xmlFree(oldname);
}
if (ctxt->nameTab != NULL) xmlFree(ctxt->nameTab);
if (ctxt->directory != NULL) xmlFree(ctxt->directory);
if (ctxt->inputTab != NULL) xmlFree(ctxt->inputTab);
if (ctxt->version != NULL) xmlFree((char *) ctxt->version);
if ((ctxt->sax != NULL) && (ctxt->sax != &htmlDefaultSAXHandler))
@ -2766,11 +2821,717 @@ htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding) {
return(ctxt);
}
/********************************************************************************
* *
* User entry points *
* *
********************************************************************************/
/************************************************************************
* *
* Progressive parsing interfaces *
* *
************************************************************************/
/**
 * htmlParseLookupSequence:
 * @ctxt:  an HTML parser context
 * @first:  the first char to lookup
 * @next:  the next char to lookup or zero
 * @third:  the next char to lookup or zero
 *
 * Check whether the sequence (first, next, third), (first, next) or
 * just (first) is available in the data of the current input.
 * The scan resumes at ctxt->checkIndex when that index lies beyond the
 * current position; on failure the position reached is stored back in
 * ctxt->checkIndex so the same bytes are not rescanned, on success
 * ctxt->checkIndex is reset to 0. This DOES change the state of the
 * parser, do not use liberally.
 * This is basically similar to xmlParseLookupSequence()
 *
 * Returns the offset of the sequence from the current parsing point if
 *         the full sequence is available, -1 otherwise.
 */
int
htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
                        xmlChar next, xmlChar third) {
    htmlParserInputPtr input = ctxt->input;
    const xmlChar *data;
    int consumed, pos, limit;

    if (input == NULL)
        return(-1);

    /* offset of the current parsing point from the start of the data */
    consumed = input->cur - input->base;
    if (consumed < 0)
        return(-1);

    /* skip what a previous failed lookup already examined */
    pos = (ctxt->checkIndex > consumed) ? ctxt->checkIndex : consumed;

    if (input->buf != NULL) {
        data = input->buf->buffer->content;
        limit = input->buf->buffer->use;
    } else {
        data = input->base;
        limit = input->length;
    }

    /* take into account the sequence length */
    if (third != 0)
        limit -= 2;
    else if (next != 0)
        limit -= 1;

    while (pos < limit) {
        int matched = (data[pos] == first);

        if (matched && (third != 0))
            matched = (data[pos + 1] == next) && (data[pos + 2] == third);
        else if (matched && (next != 0))
            matched = (data[pos + 1] == next);

        if (matched) {
            ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
            if (next == 0)
                fprintf(stderr, "HPP: lookup '%c' found at %d\n",
                        first, pos);
            else if (third == 0)
                fprintf(stderr, "HPP: lookup '%c%c' found at %d\n",
                        first, next, pos);
            else
                fprintf(stderr, "HPP: lookup '%c%c%c' found at %d\n",
                        first, next, third, pos);
#endif
            return(pos - consumed);
        }
        pos++;
    }

    /* remember how far we got for the next attempt */
    ctxt->checkIndex = pos;
#ifdef DEBUG_PUSH
    if (next == 0)
        fprintf(stderr, "HPP: lookup '%c' failed\n", first);
    else if (third == 0)
        fprintf(stderr, "HPP: lookup '%c%c' failed\n", first, next);
    else
        fprintf(stderr, "HPP: lookup '%c%c%c' failed\n", first, next, third);
#endif
    return(-1);
}
/**
 * htmlParseTry:
 * @ctxt: an HTML parser context
 *
 * Try to progress on parsing: loops over the push parser state machine
 * (ctxt->instate) and, for each state, parses the next construct only
 * when enough data is available in the current input (the closing
 * sequence is searched with htmlParseLookupSequence()). The loop is left
 * through "goto done" as soon as more data is needed, when the EOF state
 * is reached, or when there is no current input.
 *
 * Returns zero if no parsing was possible
 * NOTE(review): ret is initialised to 0 and never assigned afterwards, so
 * as written this function always returns 0.
 */
int
htmlParseTry(htmlParserCtxtPtr ctxt) {
int ret = 0;
htmlParserInputPtr in;
int avail;
xmlChar cur, next;
#ifdef DEBUG_PUSH
/* trace the state the parser is in when entering the function */
switch (ctxt->instate) {
case XML_PARSER_EOF:
fprintf(stderr, "HPP: try EOF\n"); break;
case XML_PARSER_START:
fprintf(stderr, "HPP: try START\n"); break;
case XML_PARSER_MISC:
fprintf(stderr, "HPP: try MISC\n");break;
case XML_PARSER_COMMENT:
fprintf(stderr, "HPP: try COMMENT\n");break;
case XML_PARSER_PROLOG:
fprintf(stderr, "HPP: try PROLOG\n");break;
case XML_PARSER_START_TAG:
fprintf(stderr, "HPP: try START_TAG\n");break;
case XML_PARSER_CONTENT:
fprintf(stderr, "HPP: try CONTENT\n");break;
case XML_PARSER_CDATA_SECTION:
fprintf(stderr, "HPP: try CDATA_SECTION\n");break;
case XML_PARSER_END_TAG:
fprintf(stderr, "HPP: try END_TAG\n");break;
case XML_PARSER_ENTITY_DECL:
fprintf(stderr, "HPP: try ENTITY_DECL\n");break;
case XML_PARSER_ENTITY_VALUE:
fprintf(stderr, "HPP: try ENTITY_VALUE\n");break;
case XML_PARSER_ATTRIBUTE_VALUE:
fprintf(stderr, "HPP: try ATTRIBUTE_VALUE\n");break;
case XML_PARSER_DTD:
fprintf(stderr, "HPP: try DTD\n");break;
case XML_PARSER_EPILOG:
fprintf(stderr, "HPP: try EPILOG\n");break;
case XML_PARSER_PI:
fprintf(stderr, "HPP: try PI\n");break;
}
#endif
while (1) {
in = ctxt->input;
if (in == NULL) break;
/*
 * avail is the number of bytes between the current position and the
 * end of the data: in->length for an input without buffer, the used
 * size of the buffer otherwise.
 */
if (in->buf == NULL)
avail = in->length - (in->cur - in->base);
else
avail = in->buf->buffer->use - (in->cur - in->base);
if (avail < 1)
goto done;
switch (ctxt->instate) {
case XML_PARSER_EOF:
/*
* Document parsing is done !
*/
goto done;
case XML_PARSER_START:
/*
* Very first chars read from the document flow.
*/
cur = in->cur[0];
if (IS_BLANK(cur)) {
SKIP_BLANKS;
/* blanks were consumed, recompute what is left */
if (in->buf == NULL)
avail = in->length - (in->cur - in->base);
else
avail = in->buf->buffer->use - (in->cur - in->base);
}
if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
ctxt->sax->setDocumentLocator(ctxt->userData,
&xmlDefaultSAXLocator);
/*
 * NOTE(review): in->cur[1] and UPP(2)..UPP(8) are read below without
 * checking avail again - presumably this relies on the data being
 * zero terminated; confirm.
 */
cur = in->cur[0];
next = in->cur[1];
if ((cur == '<') && (next == '!') &&
(UPP(2) == 'D') && (UPP(3) == 'O') &&
(UPP(4) == 'C') && (UPP(5) == 'T') &&
(UPP(6) == 'Y') && (UPP(7) == 'P') &&
(UPP(8) == 'E')) {
/* wait until the end of the DOCTYPE declaration has been pushed */
if (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0)
goto done;
#ifdef DEBUG_PUSH
fprintf(stderr, "HPP: Parsing internal subset\n");
#endif
htmlParseDocTypeDecl(ctxt);
ctxt->instate = XML_PARSER_PROLOG;
#ifdef DEBUG_PUSH
fprintf(stderr, "HPP: entering PROLOG\n");
#endif
} else {
/* no DOCTYPE at the very start: create the document without one */
ctxt->myDoc = htmlNewDoc(NULL, NULL);
ctxt->instate = XML_PARSER_MISC;
}
#ifdef DEBUG_PUSH
/* NOTE(review): also printed when the state was just set to PROLOG */
fprintf(stderr, "HPP: entering MISC\n");
#endif
break;
case XML_PARSER_MISC:
/* before any DOCTYPE: comments, a DOCTYPE, or the first start tag */
SKIP_BLANKS;
if (in->buf == NULL)
avail = in->length - (in->cur - in->base);
else
avail = in->buf->buffer->use - (in->cur - in->base);
if (avail < 2)
goto done;
cur = in->cur[0];
next = in->cur[1];
if ((cur == '<') && (next == '!') &&
(in->cur[2] == '-') && (in->cur[3] == '-')) {
/* a comment is only parsed once its closing "-->" is available */
if (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0)
goto done;
#ifdef DEBUG_PUSH
fprintf(stderr, "HPP: Parsing Comment\n");
#endif
htmlParseComment(ctxt);
ctxt->instate = XML_PARSER_MISC;
} else if ((cur == '<') && (next == '!') &&
(UPP(2) == 'D') && (UPP(3) == 'O') &&
(UPP(4) == 'C') && (UPP(5) == 'T') &&
(UPP(6) == 'Y') && (UPP(7) == 'P') &&
(UPP(8) == 'E')) {
if (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0)
goto done;
#ifdef DEBUG_PUSH
fprintf(stderr, "HPP: Parsing internal subset\n");
#endif
htmlParseDocTypeDecl(ctxt);
ctxt->instate = XML_PARSER_PROLOG;
#ifdef DEBUG_PUSH
fprintf(stderr, "HPP: entering PROLOG\n");
#endif
} else if ((cur == '<') && (next == '!') &&
(avail < 9)) {
/* "<!" seen but fewer than 9 bytes ("<!DOCTYPE"): wait for more */
goto done;
} else {
ctxt->instate = XML_PARSER_START_TAG;
#ifdef DEBUG_PUSH
fprintf(stderr, "HPP: entering START_TAG\n");
#endif
}
break;
case XML_PARSER_PROLOG:
/* after the DOCTYPE: comments, then the first start tag */
SKIP_BLANKS;
if (in->buf == NULL)
avail = in->length - (in->cur - in->base);
else
avail = in->buf->buffer->use - (in->cur - in->base);
if (avail < 2)
goto done;
cur = in->cur[0];
next = in->cur[1];
if ((cur == '<') && (next == '!') &&
(in->cur[2] == '-') && (in->cur[3] == '-')) {
if (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0)
goto done;
#ifdef DEBUG_PUSH
fprintf(stderr, "HPP: Parsing Comment\n");
#endif
htmlParseComment(ctxt);
ctxt->instate = XML_PARSER_PROLOG;
} else if ((cur == '<') && (next == '!') &&
(avail < 4)) {
/* "<!" seen but fewer than 4 bytes ("<!--"): wait for more */
goto done;
} else {
ctxt->instate = XML_PARSER_START_TAG;
#ifdef DEBUG_PUSH
fprintf(stderr, "HPP: entering START_TAG\n");
#endif
}
break;
case XML_PARSER_EPILOG:
/* after the last end tag: only comments are accepted */
SKIP_BLANKS;
if (in->buf == NULL)
avail = in->length - (in->cur - in->base);
else
avail = in->buf->buffer->use - (in->cur - in->base);
if (avail < 2)
goto done;
cur = in->cur[0];
next = in->cur[1];
if ((cur == '<') && (next == '!') &&
(in->cur[2] == '-') && (in->cur[3] == '-')) {
if (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0)
goto done;
#ifdef DEBUG_PUSH
fprintf(stderr, "HPP: Parsing Comment\n");
#endif
htmlParseComment(ctxt);
ctxt->instate = XML_PARSER_EPILOG;
} else if ((cur == '<') && (next == '!') &&
(avail < 4)) {
goto done;
} else {
/* anything else here is an error and ends the document */
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData,
"Extra content at the end of the document\n");
ctxt->wellFormed = 0;
ctxt->errNo = XML_ERR_DOCUMENT_END;
ctxt->instate = XML_PARSER_EOF;
#ifdef DEBUG_PUSH
fprintf(stderr, "HPP: entering EOF\n");
#endif
if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
ctxt->sax->endDocument(ctxt->userData);
goto done;
}
break;
case XML_PARSER_START_TAG: {
xmlChar *name, *oldname;
/* depth of the name stack before the tag is parsed */
int depth = ctxt->nameNr;
htmlElemDescPtr info;
if (avail < 2)
goto done;
cur = in->cur[0];
if (cur != '<') {
ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
fprintf(stderr, "HPP: entering CONTENT\n");
#endif
break;
}
/* the whole start tag, up to '>', must be available */
if (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0)
goto done;
/* keep a copy of the current element name to compare after parsing */
oldname = xmlStrdup(ctxt->name);
htmlParseStartTag(ctxt);
name = ctxt->name;
#ifdef DEBUG
if (oldname == NULL)
fprintf(stderr, "Start of element %s\n", name);
else if (name == NULL)
fprintf(stderr, "Start of element failed, was %s\n",
oldname);
else
fprintf(stderr, "Start of element %s, was %s\n",
name, oldname);
#endif
/*
 * Same stack depth and same current name as before, or no current
 * name at all: skip a possible '>' and stay in this state.
 */
if (((depth == ctxt->nameNr) &&
(!xmlStrcmp(oldname, ctxt->name))) ||
(name == NULL)) {
if (CUR == '>')
NEXT;
if (oldname != NULL)
xmlFree(oldname);
break;
}
if (oldname != NULL)
xmlFree(oldname);
/*
* Lookup the info for that element.
*/
info = htmlTagLookup(name);
if (info == NULL) {
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData, "Tag %s invalid\n",
name);
ctxt->wellFormed = 0;
} else if (info->depr) {
/***************************
if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL))
ctxt->sax->warning(ctxt->userData,
"Tag %s is deprecated\n",
name);
***************************/
}
/*
* Check for an Empty Element labelled the XML/SGML way
*/
if ((CUR == '/') && (NXT(1) == '>')) {
SKIP(2);
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
ctxt->sax->endElement(ctxt->userData, name);
oldname = htmlnamePop(ctxt);
#ifdef DEBUG
fprintf(stderr,"End of tag the XML way: popping out %s\n",
oldname);
#endif
if (oldname != NULL)
xmlFree(oldname);
ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
fprintf(stderr, "HPP: entering CONTENT\n");
#endif
break;
}
if (CUR == '>') {
NEXT;
} else {
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData,
"Couldn't find end of Start Tag %s\n",
name);
ctxt->wellFormed = 0;
/*
* end of parsing of this node.
*/
if (!xmlStrcmp(name, ctxt->name)) {
nodePop(ctxt);
oldname = htmlnamePop(ctxt);
#ifdef DEBUG
fprintf(stderr,
"End of start tag problem: popping out %s\n", oldname);
#endif
if (oldname != NULL)
xmlFree(oldname);
}
ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
fprintf(stderr, "HPP: entering CONTENT\n");
#endif
break;
}
/*
* Check for an Empty Element from DTD definition
*/
if ((info != NULL) && (info->empty)) {
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
ctxt->sax->endElement(ctxt->userData, name);
oldname = htmlnamePop(ctxt);
#ifdef DEBUG
fprintf(stderr,"End of empty tag %s : popping out %s\n", name, oldname);
#endif
if (oldname != NULL)
xmlFree(oldname);
}
ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
fprintf(stderr, "HPP: entering CONTENT\n");
#endif
break;
}
case XML_PARSER_CONTENT:
/*
* Handle preparsed entities and charRef
*/
if (ctxt->token != 0) {
/* this local array shadows the outer 'cur' inside this block only */
xmlChar cur[2] = { 0 , 0 } ;
cur[0] = (xmlChar) ctxt->token;
if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
ctxt->sax->characters(ctxt->userData, cur, 1);
ctxt->token = 0;
ctxt->checkIndex = 0;
}
if (avail < 2)
goto done;
cur = in->cur[0];
next = in->cur[1];
if ((cur == '<') && (next == '!') &&
(in->cur[2] == '-') && (in->cur[3] == '-')) {
if (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0)
goto done;
#ifdef DEBUG_PUSH
fprintf(stderr, "HPP: Parsing Comment\n");
#endif
htmlParseComment(ctxt);
ctxt->instate = XML_PARSER_CONTENT;
} else if ((cur == '<') && (next == '!') && (avail < 4)) {
goto done;
} else if ((cur == '<') && (next == '/')) {
ctxt->instate = XML_PARSER_END_TAG;
ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
fprintf(stderr, "HPP: entering END_TAG\n");
#endif
break;
} else if (cur == '<') {
ctxt->instate = XML_PARSER_START_TAG;
ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
fprintf(stderr, "HPP: entering START_TAG\n");
#endif
break;
} else if (cur == '&') {
/* a reference is only parsed once its closing ';' is available */
if (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0)
goto done;
#ifdef DEBUG_PUSH
fprintf(stderr, "HPP: Parsing Reference\n");
#endif
/* TODO: check generation of subtrees if noent !!! */
htmlParseReference(ctxt);
} else {
/* TODO Avoid the extra copy, handle directly !!!!!! */
/*
* Goal of the following test is :
* - minimize calls to the SAX 'character' callback
* when they are mergeable
*/
/*
 * With a single input and less than HTML_PARSER_BIG_BUFFER_SIZE bytes
 * available, wait for the next '<' before handing out the text.
 */
if ((ctxt->inputNr == 1) &&
(avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
if (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0)
goto done;
}
ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
fprintf(stderr, "HPP: Parsing char data\n");
#endif
htmlParseCharData(ctxt, 0);
}
break;
case XML_PARSER_END_TAG:
if (avail < 2)
goto done;
if (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0)
goto done;
htmlParseEndTag(ctxt);
/* an empty name stack means the outermost element was closed */
if (ctxt->nameNr == 0) {
ctxt->instate = XML_PARSER_EPILOG;
} else {
ctxt->instate = XML_PARSER_CONTENT;
}
ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
/* NOTE(review): also printed when the state was just set to EPILOG */
fprintf(stderr, "HPP: entering CONTENT\n");
#endif
break;
/*
 * The remaining states are not expected by this function: each one
 * reports an internal error on stderr and falls back to CONTENT (or
 * START_TAG for ATTRIBUTE_VALUE).
 */
case XML_PARSER_CDATA_SECTION:
fprintf(stderr, "HPP: internal error, state == CDATA\n");
ctxt->instate = XML_PARSER_CONTENT;
ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
fprintf(stderr, "HPP: entering CONTENT\n");
#endif
break;
case XML_PARSER_DTD:
fprintf(stderr, "HPP: internal error, state == DTD\n");
ctxt->instate = XML_PARSER_CONTENT;
ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
fprintf(stderr, "HPP: entering CONTENT\n");
#endif
break;
case XML_PARSER_COMMENT:
fprintf(stderr, "HPP: internal error, state == COMMENT\n");
ctxt->instate = XML_PARSER_CONTENT;
ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
fprintf(stderr, "HPP: entering CONTENT\n");
#endif
break;
case XML_PARSER_PI:
fprintf(stderr, "HPP: internal error, state == PI\n");
ctxt->instate = XML_PARSER_CONTENT;
ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
fprintf(stderr, "HPP: entering CONTENT\n");
#endif
break;
case XML_PARSER_ENTITY_DECL:
fprintf(stderr, "HPP: internal error, state == ENTITY_DECL\n");
ctxt->instate = XML_PARSER_CONTENT;
ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
fprintf(stderr, "HPP: entering CONTENT\n");
#endif
break;
case XML_PARSER_ENTITY_VALUE:
fprintf(stderr, "HPP: internal error, state == ENTITY_VALUE\n");
ctxt->instate = XML_PARSER_CONTENT;
ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
/* NOTE(review): message says DTD but the state set above is CONTENT */
fprintf(stderr, "HPP: entering DTD\n");
#endif
break;
case XML_PARSER_ATTRIBUTE_VALUE:
fprintf(stderr, "HPP: internal error, state == ATTRIBUTE_VALUE\n");
ctxt->instate = XML_PARSER_START_TAG;
ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
fprintf(stderr, "HPP: entering START_TAG\n");
#endif
break;
}
}
done:
#ifdef DEBUG_PUSH
fprintf(stderr, "HPP: done %d\n", ret);
#endif
return(ret);
}
/**
 * htmlParseChunk:
 * @ctxt:  an HTML parser context
 * @chunk:  a char array
 * @size:  the size in bytes of the chunk
 * @terminate:  last chunk indicator
 *
 * Parse a chunk of memory: the data is appended to the buffer of the
 * current input and htmlParseTry() is run on what is available. When
 * @terminate is set the document is closed: an error is raised unless
 * the parser is in the EOF, EPILOG or MISC state, the endDocument
 * callback is invoked if EOF was not already reached, and the parser is
 * put in the EOF state.
 *
 * Returns zero if no error, the xmlParserErrors otherwise.
 */
int
htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
               int terminate) {
    if (ctxt->instate != XML_PARSER_EOF) {
        if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
            (ctxt->input->buf != NULL)) {
            /*
             * The push may move the buffer content: remember the positions
             * as offsets and rebuild the pointers afterwards.
             */
            int baseOffset = ctxt->input->base -
                             ctxt->input->buf->buffer->content;
            int curOffset = ctxt->input->cur - ctxt->input->base;

            xmlParserInputBufferPush(ctxt->input->buf, size, chunk);

            ctxt->input->base = ctxt->input->buf->buffer->content + baseOffset;
            ctxt->input->cur = ctxt->input->base + curOffset;
#ifdef DEBUG_PUSH
            fprintf(stderr, "HPP: pushed %d\n", size);
#endif
        }
        htmlParseTry(ctxt);
    }

    if (!terminate)
        return((xmlParserErrors) ctxt->errNo);

    /* closing the document from any other state is an error */
    if ((ctxt->instate != XML_PARSER_EOF) &&
        (ctxt->instate != XML_PARSER_EPILOG) &&
        (ctxt->instate != XML_PARSER_MISC)) {
        if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
            ctxt->sax->error(ctxt->userData,
                "Extra content at the end of the document\n");
        ctxt->wellFormed = 0;
        ctxt->errNo = XML_ERR_DOCUMENT_END;
    }
    if (ctxt->instate != XML_PARSER_EOF) {
        if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
            ctxt->sax->endDocument(ctxt->userData);
    }
    ctxt->instate = XML_PARSER_EOF;

    return((xmlParserErrors) ctxt->errNo);
}
/************************************************************************
* *
* User entry points *
* *
************************************************************************/
/**
 * htmlCreatePushParserCtxt :
 * @sax:  a SAX handler, or NULL to keep the handler installed by
 *        htmlInitParserCtxt()
 * @user_data:  The user data returned on SAX callbacks (only taken into
 *              account when @sax is not NULL)
 * @chunk:  a pointer to an array of chars
 * @size:  number of chars in the array
 * @filename:  an optional file name or URI
 * @enc:  an optional encoding
 *
 * Create a parser context for using the HTML parser in push mode
 * To allow content encoding detection, @size should be >= 4
 * The value of @filename is used for fetching external entities
 * and error/warning reports.
 * The @sax structure is copied, the caller keeps ownership of it.
 *
 * Returns the new parser context or NULL
 */
htmlParserCtxtPtr
htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
                         const char *chunk, int size, const char *filename,
                         xmlCharEncoding enc) {
    htmlParserCtxtPtr ctxt;
    htmlParserInputPtr inputStream;
    xmlParserInputBufferPtr buf;

    buf = xmlAllocParserInputBuffer(enc);
    if (buf == NULL) return(NULL);

    ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt));
    if (ctxt == NULL) {
        /* release the whole input buffer, not only its outer structure */
        xmlFreeParserInputBuffer(buf);
        return(NULL);
    }
    memset(ctxt, 0, sizeof(htmlParserCtxt));
    htmlInitParserCtxt(ctxt);
    if (sax != NULL) {
        if (ctxt->sax != &htmlDefaultSAXHandler)
            xmlFree(ctxt->sax);
        ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
        if (ctxt->sax == NULL) {
            xmlFreeParserInputBuffer(buf);
            /* also releases what htmlInitParserCtxt() allocated */
            htmlFreeParserCtxt(ctxt);
            return(NULL);
        }
        memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
        if (user_data != NULL)
            ctxt->userData = user_data;
    }
    if (filename == NULL) {
        ctxt->directory = NULL;
    } else {
        ctxt->directory = xmlParserGetDirectory(filename);
    }

    inputStream = htmlNewInputStream(ctxt);
    if (inputStream == NULL) {
        /* buf is not attached to any stream yet, free it explicitly */
        xmlFreeParserInputBuffer(buf);
        htmlFreeParserCtxt(ctxt);
        return(NULL);
    }

    if (filename == NULL)
        inputStream->filename = NULL;
    else
        inputStream->filename = xmlMemStrdup(filename);
    inputStream->buf = buf;
    inputStream->base = inputStream->buf->buffer->content;
    inputStream->cur = inputStream->buf->buffer->content;

    inputPush(ctxt, inputStream);

    if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
        (ctxt->input->buf != NULL)) {
        /*
         * The push may move the buffer content: keep the positions as
         * offsets and rebuild base/cur afterwards, as htmlParseChunk()
         * does, so they never point into released memory.
         */
        int base = ctxt->input->base - ctxt->input->buf->buffer->content;
        int cur = ctxt->input->cur - ctxt->input->base;

        xmlParserInputBufferPush(ctxt->input->buf, size, chunk);

        ctxt->input->base = ctxt->input->buf->buffer->content + base;
        ctxt->input->cur = ctxt->input->base + cur;
#ifdef DEBUG_PUSH
        fprintf(stderr, "HPP: pushed %d\n", size);
#endif
    }

    return(ctxt);
}
/**
* htmlSAXParseDoc :

View File

@ -78,6 +78,20 @@ htmlDocPtr htmlSAXParseFile(const char *filename,
htmlDocPtr htmlParseFile (const char *filename,
const char *encoding);
/**
* Interfaces for the Push mode
*/
void htmlFreeParserCtxt (htmlParserCtxtPtr ctxt);
htmlParserCtxtPtr htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax,
void *user_data,
const char *chunk,
int size,
const char *filename,
xmlCharEncoding enc);
int htmlParseChunk (htmlParserCtxtPtr ctxt,
const char *chunk,
int size,
int terminate);
#ifdef __cplusplus
}
#endif

View File

@ -310,7 +310,7 @@ htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
if (cur == NULL) {
#ifdef DEBUG_TREE
fprintf(stderr, "xmlDocDumpMemory : document == NULL\n");
fprintf(stderr, "htmlxmlDocDumpMemory : document == NULL\n");
#endif
*mem = NULL;
*size = 0;
@ -343,7 +343,7 @@ htmlDocDump(FILE *f, xmlDocPtr cur) {
if (cur == NULL) {
#ifdef DEBUG_TREE
fprintf(stderr, "xmlDocDump : document == NULL\n");
fprintf(stderr, "htmlDocDump : document == NULL\n");
#endif
return;
}

View File

@ -78,6 +78,20 @@ htmlDocPtr htmlSAXParseFile(const char *filename,
htmlDocPtr htmlParseFile (const char *filename,
const char *encoding);
/**
* Interfaces for the Push mode
*/
void htmlFreeParserCtxt (htmlParserCtxtPtr ctxt);
htmlParserCtxtPtr htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax,
void *user_data,
const char *chunk,
int size,
const char *filename,
xmlCharEncoding enc);
int htmlParseChunk (htmlParserCtxtPtr ctxt,
const char *chunk,
int size,
int terminate);
#ifdef __cplusplus
}
#endif

View File

@ -7180,7 +7180,7 @@ xmlParseTry(xmlParserCtxtPtr ctxt) {
xmlParsePI(ctxt);
} else if ((cur == '<') && (next == '!') &&
(in->cur[2] == '-') && (in->cur[3] == '-')) {
if (xmlParseLookupSequence(ctxt, '-', '>', 0) < 0)
if (xmlParseLookupSequence(ctxt, '-', '-', '>') < 0)
goto done;
#ifdef DEBUG_PUSH
fprintf(stderr, "PP: Parsing Comment\n");
@ -7238,7 +7238,7 @@ xmlParseTry(xmlParserCtxtPtr ctxt) {
xmlParsePI(ctxt);
} else if ((cur == '<') && (next == '!') &&
(in->cur[2] == '-') && (in->cur[3] == '-')) {
if (xmlParseLookupSequence(ctxt, '-', '>', 0) < 0)
if (xmlParseLookupSequence(ctxt, '-', '-', '>') < 0)
goto done;
#ifdef DEBUG_PUSH
fprintf(stderr, "PP: Parsing Comment\n");
@ -7275,7 +7275,7 @@ xmlParseTry(xmlParserCtxtPtr ctxt) {
ctxt->instate = XML_PARSER_EPILOG;
} else if ((cur == '<') && (next == '!') &&
(in->cur[2] == '-') && (in->cur[3] == '-')) {
if (xmlParseLookupSequence(ctxt, '-', '>', 0) < 0)
if (xmlParseLookupSequence(ctxt, '-', '-', '>') < 0)
goto done;
#ifdef DEBUG_PUSH
fprintf(stderr, "PP: Parsing Comment\n");
@ -7425,7 +7425,7 @@ xmlParseTry(xmlParserCtxtPtr ctxt) {
xmlParsePI(ctxt);
} else if ((cur == '<') && (next == '!') &&
(in->cur[2] == '-') && (in->cur[3] == '-')) {
if (xmlParseLookupSequence(ctxt, '-', '>', 0) < 0)
if (xmlParseLookupSequence(ctxt, '-', '-', '>') < 0)
goto done;
#ifdef DEBUG_PUSH
fprintf(stderr, "PP: Parsing Comment\n");
@ -7531,7 +7531,7 @@ xmlParseTry(xmlParserCtxtPtr ctxt) {
}
break;
}
case XML_PARSER_END_TAG: {
case XML_PARSER_END_TAG:
if (avail < 2)
goto done;
if (xmlParseLookupSequence(ctxt, '>', 0, 0) < 0)
@ -7549,7 +7549,6 @@ xmlParseTry(xmlParserCtxtPtr ctxt) {
#endif
}
break;
}
case XML_PARSER_DTD: {
/*
* Sorry but progressive parsing of the internal subset

View File

@ -43,26 +43,7 @@ static int copy = 0;
static int sax = 0;
static int repeat = 0;
static int noout = 0;
/*
* Note: this is perfectly clean HTML, i.e. not a useful test.
static xmlChar buffer[] =
"<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\"\n\
\"http://www.w3.org/TR/REC-html40/loose.dtd\">\n\
<html>\n\
<head>\n\
<title>This service is temporary down</title>\n\
</head>\n\
\n\
<body bgcolor=\"#FFFFFF\">\n\
<h1 align=\"center\">Sorry, this service is temporary down</h1>\n\
We are doing our best to get it back on-line,\n\
\n\
<p>The W3C system administrators</p>\n\
</body>\n\
</html>\n\
";
*/
static int push = 0;
xmlSAXHandler emptySAXHandlerStruct = {
NULL, /* internalSubset */
@ -608,7 +589,35 @@ void parseAndPrintFile(char *filename) {
/*
* build an HTML tree from a string;
*/
doc = htmlParseFile(filename, NULL);
if (push) {
FILE *f;
f = fopen(filename, "r");
if (f != NULL) {
int res, size = 3;
char chars[1024];
htmlParserCtxtPtr ctxt;
if (repeat)
size = 1024;
res = fread(chars, 1, 4, f);
if (res > 0) {
ctxt = htmlCreatePushParserCtxt(NULL, NULL,
chars, res, filename, 0);
while ((res = fread(chars, 1, size, f)) > 0) {
htmlParseChunk(ctxt, chars, res, 0);
}
htmlParseChunk(ctxt, chars, 0, 1);
doc = ctxt->myDoc;
htmlFreeParserCtxt(ctxt);
}
}
} else {
doc = htmlParseFile(filename, NULL);
}
if (doc == NULL) {
fprintf(stderr, "Could not parse %s\n", filename);
}
/*
* test intermediate copy if needed.
@ -635,37 +644,6 @@ void parseAndPrintFile(char *filename) {
xmlFreeDoc(doc);
}
/*
 * Parse an HTML document held in memory, optionally replace the tree
 * by a copy of itself (when the "copy" flag is set), print it either
 * as HTML or as a debug dump (when the "debug" flag is set), then
 * free it.
 */
void parseAndPrintBuffer(xmlChar *buf) {
    /* build an HTML tree from the string */
    htmlDocPtr doc = htmlParseDoc(buf, NULL);

    /* test the intermediate copy if needed */
    if (copy) {
        htmlDocPtr parsed = doc;

        doc = xmlCopyDoc(parsed, 1);
        xmlFreeDoc(parsed);
    }

    /* print the tree */
    if (debug)
        xmlDebugDumpDocument(stdout, doc);
    else
        htmlDocDump(stdout, doc);

    /* and release it */
    xmlFreeDoc(doc);
}
int main(int argc, char **argv) {
int i, count;
int files = 0;
@ -675,6 +653,8 @@ int main(int argc, char **argv) {
debug++;
else if ((!strcmp(argv[i], "-copy")) || (!strcmp(argv[i], "--copy")))
copy++;
else if ((!strcmp(argv[i], "-push")) || (!strcmp(argv[i], "--push")))
push++;
else if ((!strcmp(argv[i], "-sax")) || (!strcmp(argv[i], "--sax")))
sax++;
else if ((!strcmp(argv[i], "-noout")) || (!strcmp(argv[i], "--noout")))
@ -708,8 +688,9 @@ int main(int argc, char **argv) {
printf("\t--debug : dump a debug tree of the in-memory document\n");
printf("\t--copy : used to test the internal copy implementation\n");
printf("\t--sax : debug the sequence of SAX callbacks\n");
printf("\t--repeat : parse the file 100 times, for timing or profiling\n");
printf("\t--repeat : parse the file 100 times, for timing\n");
printf("\t--noout : do not print the result\n");
printf("\t--push : use the push mode parser\n");
}
xmlCleanupParser();
xmlMemoryDump();

View File

@ -25,6 +25,9 @@
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif
#ifdef HAVE_CTYPE_H
#include <ctype.h>
#endif
#include "xmlmemory.h"
@ -368,6 +371,59 @@ xmlMemUsed(void) {
return(debugMemSize);
}
#ifdef MEM_LIST
/**
 * xmlMemContentShow:
 * @fp:  a FILE descriptor used as the output file
 * @p:  a memory block header
 *
 * Tries to show some content from the memory block:
 *  - when the block starts with fewer than 4 printable characters that
 *    are not followed by a zero byte (or starts with a non printable or
 *    zero byte), its content is scanned for a value which is the client
 *    address of a block registered in memlist, and the first one found
 *    is reported;
 *  - when the printable prefix is zero terminated, it is printed as a
 *    string (at most 25 characters);
 *  - otherwise the printable prefix is printed between brackets.
 * Nothing is read beyond the mh_size bytes of the block.
 */
void
xmlMemContentShow(FILE *fp, MEMHDR *p)
{
    int i,j,k,len;
    int terminated;
    const char *buf;

    if (p == NULL) return;

    len = p->mh_size;
    buf = HDR_2_CLIENT(p);

    for (i = 0;i < len;i++) {
        if (buf[i] == 0) break;
        /* isprint() is only defined for unsigned char values and EOF */
        if (!isprint((unsigned char) buf[i])) break;
    }
    /* buf[i] may only be inspected when the scan stopped inside the block */
    terminated = (i < len) && (buf[i] == 0);

    if ((i < 4) && ((!terminated) || (i == 0))) {
        if (len >= (int) sizeof(void *)) {
            MEMHDR *q;
            void *cur;
            unsigned char *raw = (unsigned char *) &cur;

            /* every candidate must fit entirely inside the block */
            for (j = 0;j + (int) sizeof(void *) <= len;j += 4) {
                /* byte copy: the offset may not be pointer aligned */
                for (k = 0;k < (int) sizeof(void *);k++)
                    raw[k] = (unsigned char) buf[j + k];
                q = CLIENT_2_HDR(cur);
                p = memlist;
                while ((p != NULL) && (p != q))
                    p = p->mh_next;
                /* p is NULL when the whole list was walked without a match */
                if (p != NULL) {
                    fprintf(fp, " pointer to #%lu at index %d",
                            p->mh_number, j);
                    return;
                }
            }
        }
    } else if ((i == 0) && terminated) {
        /* NOTE(review): unreachable, i == 0 is always handled above */
        fprintf(fp," null");
    } else {
        if (terminated) fprintf(fp," \"%.25s\"", buf);
        else {
            fprintf(fp," [");
            for (j = 0;j < i;j++)
                fprintf(fp,"%c", buf[j]);
            fprintf(fp,"]");
        }
    }
}
#endif
/**
* xmlMemShow:
* @fp: a FILE descriptor used as the output file
@ -403,6 +459,7 @@ xmlMemShow(FILE *fp, int nr)
fprintf(fp,"%s(%d)", p->mh_file, p->mh_line);
if (p->mh_tag != MEMTAG)
fprintf(fp," INVALID");
xmlMemContentShow(fp, p);
fprintf(fp,"\n");
nr--;
p = p->mh_next;
@ -453,6 +510,7 @@ xmlMemDisplay(FILE *fp)
if (p->mh_file != NULL) fprintf(fp,"%s(%d)", p->mh_file, p->mh_line);
if (p->mh_tag != MEMTAG)
fprintf(fp," INVALID");
xmlMemContentShow(fp, p);
fprintf(fp,"\n");
p = p->mh_next;
}