From 35fcbb84d2c480f29c7a870ec9b75fc8c8a5de07 Mon Sep 17 00:00:00 2001 From: Daniel Veillard Date: Wed, 12 Mar 2008 21:43:39 +0000 Subject: [PATCH] patch from Arnold Hendriks improving parsing of html within html bogus * HTMLparser.c: patch from Arnold Hendriks improving parsing of html within html bogus data, still not a complete fix though Daniel svn path=/trunk/; revision=3704 --- ChangeLog | 5 +++++ HTMLparser.c | 31 ++++++++++++++++--------------- 2 files changed, 21 insertions(+), 15 deletions(-) diff --git a/ChangeLog b/ChangeLog index 8143e16a..d62fb131 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +Wed Mar 12 18:56:22 CET 2008 Daniel Veillard + + * HTMLparser.c: patch from Arnold Hendriks improving parsing of + html within html bogus data, still not a complete fix though + Wed Mar 12 10:22:01 CET 2008 Daniel Veillard * python/types.c: fix a memory errro when using namespace nodes diff --git a/HTMLparser.c b/HTMLparser.c index 38af5e3f..7b983672 100644 --- a/HTMLparser.c +++ b/HTMLparser.c @@ -3423,7 +3423,7 @@ htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) { * * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>' * - * Returns 0 in case of success and -1 in case of error. + * Returns 0 in case of success, -1 in case of error and 1 if discarded */ static int @@ -3436,6 +3436,7 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) { int maxatts; int meta = 0; int i; + int discardtag = 0; if ((ctxt == NULL) || (ctxt->input == NULL)) { htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, @@ -3480,14 +3481,14 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) { htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, "htmlParseStartTag: misplaced tag\n", name, NULL); - return 0; + discardtag = 1; } if ((ctxt->nameNr != 1) && (xmlStrEqual(name, BAD_CAST"head"))) { htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, "htmlParseStartTag: misplaced tag\n", name, NULL); - return 0; + discardtag = 1; } if (xmlStrEqual(name, BAD_CAST"body")) { int indx; @@ -3496,9 +3497,7 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) { htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, "htmlParseStartTag: misplaced tag\n", name, NULL); - while ((IS_CHAR_CH(CUR)) && (CUR != '>')) - NEXT; - return 0; + discardtag = 1; } } } @@ -3597,12 +3596,14 @@ failed: /* * SAX: Start of Element ! */ - htmlnamePush(ctxt, name); - if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) { - if (nbatts != 0) - ctxt->sax->startElement(ctxt->userData, name, atts); - else - ctxt->sax->startElement(ctxt->userData, name, NULL); + if (!discardtag) { + htmlnamePush(ctxt, name); + if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) { + if (nbatts != 0) + ctxt->sax->startElement(ctxt->userData, name, atts); + else + ctxt->sax->startElement(ctxt->userData, name, NULL); + } } if (atts != NULL) { @@ -3612,7 +3613,7 @@ failed: } } - return 0; + return(discardtag); } /** @@ -3991,7 +3992,7 @@ htmlParseElement(htmlParserCtxtPtr ctxt) { failed = htmlParseStartTag(ctxt); name = ctxt->name; - if (failed || (name == NULL)) { + if ((failed == -1) || (name == NULL)) { if (CUR == '>') NEXT; return; @@ -4893,7 +4894,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { failed = htmlParseStartTag(ctxt); name = ctxt->name; - if (failed || + if ((failed == -1) || (name == NULL)) { if (CUR == '>') NEXT;