/*
 * parser.c : an XML 1.0 non-verifying parser
 *
 * See Copyright for the status of this software.
 *
 * $Id$
 */

/*
 * NOTE(review): the header names of the system includes below are missing
 * in this copy of the file (every directive is a bare "#include"). They
 * must be restored before the file can compile. The code in this file
 * calls fprintf(), malloc()/free() and memcpy(), so at least the standard
 * headers declaring those are required; the three guarded includes
 * presumably match their HAVE_xxx_H guards -- confirm against the
 * original sources.
 */
#include
#include
#include
#include /* for memset() only */
#include
#include
#ifdef HAVE_FCNTL_H
#include
#endif
#ifdef HAVE_UNISTD_H
#include
#endif
#ifdef HAVE_ZLIB_H
#include
#endif

#include "xml_tree.h"
#include "xml_parser.h"
#include "xml_entities.h"

/*
 * A few macros needed to help building the parser.
 *
 * All of them are function-like macros that may evaluate their argument
 * several times: never pass an expression with side effects.
 */

#ifdef UNICODE
/*
 * UNICODE version of the macros. Incomplete now TODO !!!!
 */

/*
 * IS_CHAR: true for tab, line feed, carriage return and for any value
 * >= 0x20 other than 0xFFFE and 0xFFFF.
 */
#define IS_CHAR(c) \
    (((c) == 0x09) || ((c) == 0x0a) || ((c) == 0x0d) || \
     (((c) >= 0x20) && ((c) != 0xFFFE) && ((c) != 0xFFFF)))

/*
 * SKIP_BLANKS: advance p while it points at 0x20, 0x09, 0x0a or 0x3000.
 * Expands to a complete "while" statement, trailing ';' included.
 * Carriage return (0x0d) is not skipped.
 * NOTE(review): SKIP_BLANKS is defined a second time, unconditionally and
 * with a different body, after the #ifdef UNICODE / #else / #endif group.
 */
#define SKIP_BLANKS(p) \
    while ((*(p) == 0x20) || (*(p) == 0x09) || (*(p) == 0xa) || \
           (*(p) == 0x3000)) (p)++;

/*
 * IS_BASECHAR: only covers A-Z, a-z, 0xc0-0xd6, 0xd8-0xf6, 0xf8-0xff and
 * 0xba; the remaining base characters are still TODO !!!!
 * NOTE(review): the (c >= 0xaa && c <= 0x5b) clause can never be true;
 * a test for the single value 0xaa was presumably intended -- confirm.
 */
#define IS_BASECHAR(c) \
    ((((c) >= 0x41) && ((c) <= 0x5a)) || \
     (((c) >= 0x61) && ((c) <= 0x7a)) || \
     (((c) >= 0xaa) && ((c) <= 0x5b)) || \
     (((c) >= 0xc0) && ((c) <= 0xd6)) || \
     (((c) >= 0xd8) && ((c) <= 0xf6)) || \
     (((c) >= 0xf8) && ((c) <= 0xff)) || \
      ((c) == 0xba))

/*
 * IS_DIGIT: only the ASCII digits 0x30-0x39 are recognised; the other
 * digit ranges are still TODO !!!!
 */
#define IS_DIGIT(c) (((c) >= 0x30) && ((c) <= 0x39))

/* I'm too lazy to complete this one TODO !!!!
*/ #define IS_COMBINING(c) 0 #define IS_IGNORABLE(c) \ ((((c) >= 0x200c) && ((c) <= 0x200f)) || \ (((c) >= 0x202a) && ((c) <= 0x202e)) || \ (((c) >= 0x206a) && ((c) <= 0x206f)) || \ ((c) == 0xfeff)) #define IS_EXTENDER(c) \ (((c) == 0xb7) || ((c) == 0x2d0) || ((c) == 0x2d1) || \ ((c) == 0x387) || ((c) == 0x640) || ((c) == 0xe46) || \ ((c) == 0xec6) || ((c) == 0x3005) \ (((c) >= 0x3031) && ((c) <= 0x3035)) || \ (((c) >= 0x309b) && ((c) <= 0x309e)) || \ (((c) >= 0x30fc) && ((c) <= 0x30fe)) || \ (((c) >= 0xff70) && ((c) <= 0xff9e)) || \ ((c) == 0xff9f)) #define IS_IDEOGRAPHIC(c) \ ((((c) >= 0x4e00) && ((c) <= 0x9fa5)) || \ (((c) >= 0xf900) && ((c) <= 0xfa2d)) || \ (((c) >= 0x3021) && ((c) <= 0x3029)) || \ ((c) == 0x3007)) #define IS_LETTER(c) (IS_BASECHAR(c) || IS_IDEOGRAPHIC(c)) /* I'm too lazy to complete this one ! */ #define IS_BLANK(c) (((c) == 0x20) || ((c) == 0x09) || ((c) == 0xa)) #else /* * 8bits / ASCII version of the macros. */ #define IS_CHAR(c) \ (((c) == 0x09) || ((c) == 0x0a) || ((c) == 0x0d) || ((c) >= 0x20)) #define IS_BASECHAR(c) \ ((((c) >= 0x41) && ((c) <= 0x5a)) || \ (((c) >= 0x61) && ((c) <= 0x7a)) || \ (((c) >= 0xaa) && ((c) <= 0x5b)) || \ (((c) >= 0xc0) && ((c) <= 0xd6)) || \ (((c) >= 0xd8) && ((c) <= 0xf6)) || \ (((c) >= 0xf8) && ((c) <= 0xff)) || \ ((c) == 0xba)) #define IS_DIGIT(c) (((c) >= 0x30) && ((c) <= 0x39)) #define IS_LETTER(c) IS_BASECHAR(c) #define IS_COMBINING(c) 0 #define IS_IGNORABLE(c) 0 #define IS_EXTENDER(c) ((c) == 0xb7) #define IS_BLANK(c) (((c) == 0x20) || ((c) == 0x09) || ((c) == 0xa)) #endif #define SKIP_EOL(p) \ if (*(p) == 0x13) { p++ ; if (*(p) == 0x10) p++; } \ if (*(p) == 0x10) { p++ ; if (*(p) == 0x13) p++; } #define SKIP_BLANKS(p) \ while (IS_BLANK(*(p))) (p)++; #define MOVETO_ENDTAG(p) \ while (IS_CHAR(*p) && (*(p) != '>')) (p)++; #define MOVETO_STARTTAG(p) \ while (IS_CHAR(*p) && (*(p) != '<')) (p)++; /* * Forward definition for recusive behaviour. 
*/ xmlNodePtr xmlParseElement(xmlParserCtxtPtr ctxt); /* * xmlHandleData : this routine represent's the specific application * behaviour when reading a piece of text. * * For example in WebDav, any piece made only of blanks is eliminated */ CHAR *xmlHandleData(CHAR *in) { CHAR *cur; if (in == NULL) return(NULL); cur = in; while (IS_CHAR(*cur)) { if (!IS_BLANK(*cur)) goto not_blank; cur++; } free(in); return(NULL); not_blank: return(in); } /* * xmlStrndup : a strdup for array of CHAR's */ CHAR *xmlStrndup(const CHAR *cur, int len) { CHAR *ret = malloc((len + 1) * sizeof(CHAR)); if (ret == NULL) { fprintf(stderr, "malloc of %d byte failed\n", (len + 1) * sizeof(CHAR)); return(NULL); } memcpy(ret, cur, len * sizeof(CHAR)); ret[len] = 0; return(ret); } /* * xmlStrdup : a strdup for CHAR's */ CHAR *xmlStrdup(const CHAR *cur) { const CHAR *p = cur; while (IS_CHAR(*p)) p++; return(xmlStrndup(cur, p - cur)); } /* * xmlStrcmp : a strcmp for CHAR's */ int xmlStrcmp(const CHAR *str1, const CHAR *str2) { register int tmp; do { tmp = *str1++ - *str2++; if (tmp != 0) return(tmp); } while ((*str1 != 0) && (*str2 != 0)); return (*str1 - *str2); } /* * xmlStrncmp : a strncmp for CHAR's */ int xmlStrncmp(const CHAR *str1, const CHAR *str2, int len) { register int tmp; if (len <= 0) return(0); do { tmp = *str1++ - *str2++; if (tmp != 0) return(tmp); len--; if (len <= 0) return(0); } while ((*str1 != 0) && (*str2 != 0)); return (*str1 - *str2); } /* * xmlStrchr : a strchr for CHAR's */ CHAR *xmlStrchr(const CHAR *str, CHAR val) { while (*str != 0) { if (*str == val) return((CHAR *) str); str++; } return(NULL); } /* * xmlParseName : parse an XML name. 
*/ CHAR *xmlParseName(xmlParserCtxtPtr ctxt) { const CHAR *q; CHAR *ret = NULL; /* * Name ::= (Letter | '_') (NameChar)* */ if (!IS_LETTER(ctxt->cur[0]) && (ctxt->cur[0] != '_')) return(NULL); q = ctxt->cur++; while ((IS_LETTER(ctxt->cur[0])) || (IS_DIGIT(ctxt->cur[0])) || (ctxt->cur[0] == '.') || (ctxt->cur[0] == '-') || (ctxt->cur[0] == '_') || (ctxt->cur[0] == ':') || (IS_COMBINING(ctxt->cur[0])) || (IS_IGNORABLE(ctxt->cur[0])) || (IS_EXTENDER(ctxt->cur[0]))) ctxt->cur++; ret = xmlStrndup(q, ctxt->cur - q); return(ret); } /* * Parse and return a string between quotes or doublequotes */ CHAR *xmlParseQuotedString(xmlParserCtxtPtr ctxt) { CHAR *ret = NULL; const CHAR *q; if (ctxt->cur[0] == '"') { ctxt->cur++; q = ctxt->cur; while (IS_CHAR(ctxt->cur[0]) && (ctxt->cur[0] != '"')) ctxt->cur++; if (ctxt->cur[0] != '"') fprintf(stderr, "String not closed \"%.50s\n", q); else { ret = xmlStrndup(q, ctxt->cur - q); ctxt->cur++; } } else if (ctxt->cur[0] == '\''){ ctxt->cur++; q = ctxt->cur; while (IS_CHAR(ctxt->cur[0]) && (ctxt->cur[0] != '\'')) ctxt->cur++; if (ctxt->cur[0] != '\'') fprintf(stderr, "String not closed '%.50s\n", q); else { ret = xmlStrndup(q, ctxt->cur - q); ctxt->cur++; } } return(ret); } /* * Skip an XML (SGML) comment * * TODO !!!! Save the comment in the tree !!! */ void xmlParserSkipComment(xmlParserCtxtPtr ctxt) { const CHAR *q, *start; const CHAR *r; /* * An extra check may avoid errors and isn't that costly ! */ if ((ctxt->cur[0] != '<') || (ctxt->cur[1] != '!') || (ctxt->cur[2] != '-') || (ctxt->cur[3] != '-')) return; ctxt->cur += 4; start = q = ctxt->cur; ctxt->cur++; r = ctxt->cur; ctxt->cur++; while (IS_CHAR(ctxt->cur[0]) && ((ctxt->cur[0] == ':') || (ctxt->cur[0] != '>') || (*r != '-') || (*q != '-'))) { ctxt->cur++;r++;q++; } if (!IS_CHAR(ctxt->cur[0])) { fprintf(stderr, "Comment not terminated