1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2024-10-26 12:25:09 +03:00
libxml2/xmlregexp.c
Nick Wellnhofer 68eadabd00 Fix exponential runtime in xmlFARecurseDeterminism
In order to prevent visiting a state twice, states must be marked as
visited for the whole duration of graph traversal because states might
be reached by different paths. Otherwise state graphs like the
following can lead to exponential runtime:

  ->O-->O-->O-->O-->O->
     \ / \ / \ / \ /
      O   O   O   O

Reset the "visited" flag only after the graph was traversed.

xmlFAComputesDeterminism still has massive performance problems when
handling fuzzed input. By design, it has quadratic time complexity in
the number of reachable states. Some issues might also stem from
redundant epsilon transitions. With this fix, fuzzing regexes with a
maximum length of 100 becomes feasible at least.

Found with libFuzzer.
2020-07-31 11:55:13 +02:00

8268 lines
218 KiB
C

/*
* regexp.c: generic and extensible Regular Expression engine
*
* Basically designed with the purpose of compiling regexps for
* the variety of validation/schemas mechanisms now available in
* XML related specifications these include:
* - XML-1.0 DTD validation
* - XML Schemas structure part 1
* - XML Schemas Datatypes part 2 especially Appendix F
* - RELAX-NG/TREX i.e. the counter proposal
*
* See Copyright for the status of this software.
*
* Daniel Veillard <veillard@redhat.com>
*/
#define IN_LIBXML
#include "libxml.h"
#ifdef LIBXML_REGEXP_ENABLED
/* #define DEBUG_ERR */
#include <stdio.h>
#include <string.h>
#ifdef HAVE_LIMITS_H
#include <limits.h>
#endif
#ifdef HAVE_STDINT_H
#include <stdint.h>
#endif
#include <libxml/tree.h>
#include <libxml/parserInternals.h>
#include <libxml/xmlregexp.h>
#include <libxml/xmlautomata.h>
#include <libxml/xmlunicode.h>
#ifndef INT_MAX
#define INT_MAX 123456789 /* easy to flag and big enough for our needs */
#endif
#ifndef SIZE_MAX
#define SIZE_MAX ((size_t) -1)
#endif
/* #define DEBUG_REGEXP_GRAPH */
/* #define DEBUG_REGEXP_EXEC */
/* #define DEBUG_PUSH */
/* #define DEBUG_COMPACTION */
#define MAX_PUSH 10000000
#ifdef ERROR
#undef ERROR
#endif
#define ERROR(str) \
ctxt->error = XML_REGEXP_COMPILE_ERROR; \
xmlRegexpErrCompile(ctxt, str);
#define NEXT ctxt->cur++
#define CUR (*(ctxt->cur))
#define NXT(index) (ctxt->cur[index])
#define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l)
#define NEXTL(l) ctxt->cur += l;
#define XML_REG_STRING_SEPARATOR '|'
/*
* Need PREV to check on a '-' within a Character Group. May only be used
* when it's guaranteed that cur is not at the beginning of ctxt->string!
*/
#define PREV (ctxt->cur[-1])
/**
* TODO:
*
* macro to flag unimplemented blocks
*/
#define TODO \
xmlGenericError(xmlGenericErrorContext, \
"Unimplemented block at %s:%d\n", \
__FILE__, __LINE__);
/************************************************************************
* *
* Datatypes and structures *
* *
************************************************************************/
/*
* Note: the order of the enums below is significant, do not shuffle
*/
typedef enum {
XML_REGEXP_EPSILON = 1,
XML_REGEXP_CHARVAL,
XML_REGEXP_RANGES,
XML_REGEXP_SUBREG, /* used for () sub regexps */
XML_REGEXP_STRING,
XML_REGEXP_ANYCHAR, /* . */
XML_REGEXP_ANYSPACE, /* \s */
XML_REGEXP_NOTSPACE, /* \S */
XML_REGEXP_INITNAME, /* \l */
XML_REGEXP_NOTINITNAME, /* \L */
XML_REGEXP_NAMECHAR, /* \c */
XML_REGEXP_NOTNAMECHAR, /* \C */
XML_REGEXP_DECIMAL, /* \d */
XML_REGEXP_NOTDECIMAL, /* \D */
XML_REGEXP_REALCHAR, /* \w */
XML_REGEXP_NOTREALCHAR, /* \W */
XML_REGEXP_LETTER = 100,
XML_REGEXP_LETTER_UPPERCASE,
XML_REGEXP_LETTER_LOWERCASE,
XML_REGEXP_LETTER_TITLECASE,
XML_REGEXP_LETTER_MODIFIER,
XML_REGEXP_LETTER_OTHERS,
XML_REGEXP_MARK,
XML_REGEXP_MARK_NONSPACING,
XML_REGEXP_MARK_SPACECOMBINING,
XML_REGEXP_MARK_ENCLOSING,
XML_REGEXP_NUMBER,
XML_REGEXP_NUMBER_DECIMAL,
XML_REGEXP_NUMBER_LETTER,
XML_REGEXP_NUMBER_OTHERS,
XML_REGEXP_PUNCT,
XML_REGEXP_PUNCT_CONNECTOR,
XML_REGEXP_PUNCT_DASH,
XML_REGEXP_PUNCT_OPEN,
XML_REGEXP_PUNCT_CLOSE,
XML_REGEXP_PUNCT_INITQUOTE,
XML_REGEXP_PUNCT_FINQUOTE,
XML_REGEXP_PUNCT_OTHERS,
XML_REGEXP_SEPAR,
XML_REGEXP_SEPAR_SPACE,
XML_REGEXP_SEPAR_LINE,
XML_REGEXP_SEPAR_PARA,
XML_REGEXP_SYMBOL,
XML_REGEXP_SYMBOL_MATH,
XML_REGEXP_SYMBOL_CURRENCY,
XML_REGEXP_SYMBOL_MODIFIER,
XML_REGEXP_SYMBOL_OTHERS,
XML_REGEXP_OTHER,
XML_REGEXP_OTHER_CONTROL,
XML_REGEXP_OTHER_FORMAT,
XML_REGEXP_OTHER_PRIVATE,
XML_REGEXP_OTHER_NA,
XML_REGEXP_BLOCK_NAME
} xmlRegAtomType;
typedef enum {
XML_REGEXP_QUANT_EPSILON = 1,
XML_REGEXP_QUANT_ONCE,
XML_REGEXP_QUANT_OPT,
XML_REGEXP_QUANT_MULT,
XML_REGEXP_QUANT_PLUS,
XML_REGEXP_QUANT_ONCEONLY,
XML_REGEXP_QUANT_ALL,
XML_REGEXP_QUANT_RANGE
} xmlRegQuantType;
typedef enum {
XML_REGEXP_START_STATE = 1,
XML_REGEXP_FINAL_STATE,
XML_REGEXP_TRANS_STATE,
XML_REGEXP_SINK_STATE,
XML_REGEXP_UNREACH_STATE
} xmlRegStateType;
typedef enum {
XML_REGEXP_MARK_NORMAL = 0,
XML_REGEXP_MARK_START,
XML_REGEXP_MARK_VISITED
} xmlRegMarkedType;
typedef struct _xmlRegRange xmlRegRange;
typedef xmlRegRange *xmlRegRangePtr;
struct _xmlRegRange {
int neg; /* 0 normal, 1 not, 2 exclude */
xmlRegAtomType type;
int start;
int end;
xmlChar *blockName;
};
typedef struct _xmlRegAtom xmlRegAtom;
typedef xmlRegAtom *xmlRegAtomPtr;
typedef struct _xmlAutomataState xmlRegState;
typedef xmlRegState *xmlRegStatePtr;
struct _xmlRegAtom {
int no;
xmlRegAtomType type;
xmlRegQuantType quant;
int min;
int max;
void *valuep;
void *valuep2;
int neg;
int codepoint;
xmlRegStatePtr start;
xmlRegStatePtr start0;
xmlRegStatePtr stop;
int maxRanges;
int nbRanges;
xmlRegRangePtr *ranges;
void *data;
};
typedef struct _xmlRegCounter xmlRegCounter;
typedef xmlRegCounter *xmlRegCounterPtr;
struct _xmlRegCounter {
int min;
int max;
};
typedef struct _xmlRegTrans xmlRegTrans;
typedef xmlRegTrans *xmlRegTransPtr;
struct _xmlRegTrans {
xmlRegAtomPtr atom;
int to;
int counter;
int count;
int nd;
};
struct _xmlAutomataState {
xmlRegStateType type;
xmlRegMarkedType mark;
xmlRegMarkedType markd;
xmlRegMarkedType reached;
int no;
int maxTrans;
int nbTrans;
xmlRegTrans *trans;
/* knowing states pointing to us can speed things up */
int maxTransTo;
int nbTransTo;
int *transTo;
};
typedef struct _xmlAutomata xmlRegParserCtxt;
typedef xmlRegParserCtxt *xmlRegParserCtxtPtr;
#define AM_AUTOMATA_RNG 1
struct _xmlAutomata {
xmlChar *string;
xmlChar *cur;
int error;
int neg;
xmlRegStatePtr start;
xmlRegStatePtr end;
xmlRegStatePtr state;
xmlRegAtomPtr atom;
int maxAtoms;
int nbAtoms;
xmlRegAtomPtr *atoms;
int maxStates;
int nbStates;
xmlRegStatePtr *states;
int maxCounters;
int nbCounters;
xmlRegCounter *counters;
int determinist;
int negs;
int flags;
int depth;
};
struct _xmlRegexp {
xmlChar *string;
int nbStates;
xmlRegStatePtr *states;
int nbAtoms;
xmlRegAtomPtr *atoms;
int nbCounters;
xmlRegCounter *counters;
int determinist;
int flags;
/*
* That's the compact form for determinists automatas
*/
int nbstates;
int *compact;
void **transdata;
int nbstrings;
xmlChar **stringMap;
};
typedef struct _xmlRegExecRollback xmlRegExecRollback;
typedef xmlRegExecRollback *xmlRegExecRollbackPtr;
struct _xmlRegExecRollback {
xmlRegStatePtr state;/* the current state */
int index; /* the index in the input stack */
int nextbranch; /* the next transition to explore in that state */
int *counts; /* save the automata state if it has some */
};
typedef struct _xmlRegInputToken xmlRegInputToken;
typedef xmlRegInputToken *xmlRegInputTokenPtr;
struct _xmlRegInputToken {
xmlChar *value;
void *data;
};
struct _xmlRegExecCtxt {
int status; /* execution status != 0 indicate an error */
int determinist; /* did we find an indeterministic behaviour */
xmlRegexpPtr comp; /* the compiled regexp */
xmlRegExecCallbacks callback;
void *data;
xmlRegStatePtr state;/* the current state */
int transno; /* the current transition on that state */
int transcount; /* the number of chars in char counted transitions */
/*
* A stack of rollback states
*/
int maxRollbacks;
int nbRollbacks;
xmlRegExecRollback *rollbacks;
/*
* The state of the automata if any
*/
int *counts;
/*
* The input stack
*/
int inputStackMax;
int inputStackNr;
int index;
int *charStack;
const xmlChar *inputString; /* when operating on characters */
xmlRegInputTokenPtr inputStack;/* when operating on strings */
/*
* error handling
*/
int errStateNo; /* the error state number */
xmlRegStatePtr errState; /* the error state */
xmlChar *errString; /* the string raising the error */
int *errCounts; /* counters at the error state */
int nbPush;
};
#define REGEXP_ALL_COUNTER 0x123456
#define REGEXP_ALL_LAX_COUNTER 0x123457
static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top);
static void xmlRegFreeState(xmlRegStatePtr state);
static void xmlRegFreeAtom(xmlRegAtomPtr atom);
static int xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr);
static int xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint);
static int xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint,
int neg, int start, int end, const xmlChar *blockName);
void xmlAutomataSetFlags(xmlAutomataPtr am, int flags);
/************************************************************************
* *
* Regexp memory error handler *
* *
************************************************************************/
/**
* xmlRegexpErrMemory:
* @extra: extra information
*
* Handle an out of memory condition
*/
static void
xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt, const char *extra)
{
const char *regexp = NULL;
if (ctxt != NULL) {
regexp = (const char *) ctxt->string;
ctxt->error = XML_ERR_NO_MEMORY;
}
__xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
regexp, NULL, 0, 0,
"Memory allocation failed : %s\n", extra);
}
/**
* xmlRegexpErrCompile:
* @extra: extra information
*
* Handle a compilation failure
*/
static void
xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt, const char *extra)
{
const char *regexp = NULL;
int idx = 0;
if (ctxt != NULL) {
regexp = (const char *) ctxt->string;
idx = ctxt->cur - ctxt->string;
ctxt->error = XML_REGEXP_COMPILE_ERROR;
}
__xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
XML_REGEXP_COMPILE_ERROR, XML_ERR_FATAL, NULL, 0, extra,
regexp, NULL, idx, 0,
"failed to compile: %s\n", extra);
}
/************************************************************************
* *
* Allocation/Deallocation *
* *
************************************************************************/
static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt);
/**
* xmlRegCalloc2:
* @dim1: size of first dimension
* @dim2: size of second dimension
* @elemSize: size of element
*
* Allocate a two-dimensional array and set all elements to zero.
*
* Returns the new array or NULL in case of error.
*/
static void*
xmlRegCalloc2(size_t dim1, size_t dim2, size_t elemSize) {
size_t totalSize;
void *ret;
/* Check for overflow */
if (dim1 > SIZE_MAX / dim2 / elemSize)
return (NULL);
totalSize = dim1 * dim2 * elemSize;
ret = xmlMalloc(totalSize);
if (ret != NULL)
memset(ret, 0, totalSize);
return (ret);
}
/**
* xmlRegEpxFromParse:
* @ctxt: the parser context used to build it
*
* Allocate a new regexp and fill it with the result from the parser
*
* Returns the new regexp or NULL in case of error
*/
static xmlRegexpPtr
xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) {
xmlRegexpPtr ret;
ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp));
if (ret == NULL) {
xmlRegexpErrMemory(ctxt, "compiling regexp");
return(NULL);
}
memset(ret, 0, sizeof(xmlRegexp));
ret->string = ctxt->string;
ret->nbStates = ctxt->nbStates;
ret->states = ctxt->states;
ret->nbAtoms = ctxt->nbAtoms;
ret->atoms = ctxt->atoms;
ret->nbCounters = ctxt->nbCounters;
ret->counters = ctxt->counters;
ret->determinist = ctxt->determinist;
ret->flags = ctxt->flags;
if (ret->determinist == -1) {
xmlRegexpIsDeterminist(ret);
}
if ((ret->determinist != 0) &&
(ret->nbCounters == 0) &&
(ctxt->negs == 0) &&
(ret->atoms != NULL) &&
(ret->atoms[0] != NULL) &&
(ret->atoms[0]->type == XML_REGEXP_STRING)) {
int i, j, nbstates = 0, nbatoms = 0;
int *stateRemap;
int *stringRemap;
int *transitions;
void **transdata;
xmlChar **stringMap;
xmlChar *value;
/*
* Switch to a compact representation
* 1/ counting the effective number of states left
* 2/ counting the unique number of atoms, and check that
* they are all of the string type
* 3/ build a table state x atom for the transitions
*/
stateRemap = xmlMalloc(ret->nbStates * sizeof(int));
if (stateRemap == NULL) {
xmlRegexpErrMemory(ctxt, "compiling regexp");
xmlFree(ret);
return(NULL);
}
for (i = 0;i < ret->nbStates;i++) {
if (ret->states[i] != NULL) {
stateRemap[i] = nbstates;
nbstates++;
} else {
stateRemap[i] = -1;
}
}
#ifdef DEBUG_COMPACTION
printf("Final: %d states\n", nbstates);
#endif
stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *));
if (stringMap == NULL) {
xmlRegexpErrMemory(ctxt, "compiling regexp");
xmlFree(stateRemap);
xmlFree(ret);
return(NULL);
}
stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int));
if (stringRemap == NULL) {
xmlRegexpErrMemory(ctxt, "compiling regexp");
xmlFree(stringMap);
xmlFree(stateRemap);
xmlFree(ret);
return(NULL);
}
for (i = 0;i < ret->nbAtoms;i++) {
if ((ret->atoms[i]->type == XML_REGEXP_STRING) &&
(ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) {
value = ret->atoms[i]->valuep;
for (j = 0;j < nbatoms;j++) {
if (xmlStrEqual(stringMap[j], value)) {
stringRemap[i] = j;
break;
}
}
if (j >= nbatoms) {
stringRemap[i] = nbatoms;
stringMap[nbatoms] = xmlStrdup(value);
if (stringMap[nbatoms] == NULL) {
for (i = 0;i < nbatoms;i++)
xmlFree(stringMap[i]);
xmlFree(stringRemap);
xmlFree(stringMap);
xmlFree(stateRemap);
xmlFree(ret);
return(NULL);
}
nbatoms++;
}
} else {
xmlFree(stateRemap);
xmlFree(stringRemap);
for (i = 0;i < nbatoms;i++)
xmlFree(stringMap[i]);
xmlFree(stringMap);
xmlFree(ret);
return(NULL);
}
}
#ifdef DEBUG_COMPACTION
printf("Final: %d atoms\n", nbatoms);
#endif
transitions = (int *) xmlRegCalloc2(nbstates + 1, nbatoms + 1,
sizeof(int));
if (transitions == NULL) {
xmlFree(stateRemap);
xmlFree(stringRemap);
for (i = 0;i < nbatoms;i++)
xmlFree(stringMap[i]);
xmlFree(stringMap);
xmlFree(ret);
return(NULL);
}
/*
* Allocate the transition table. The first entry for each
* state corresponds to the state type.
*/
transdata = NULL;
for (i = 0;i < ret->nbStates;i++) {
int stateno, atomno, targetno, prev;
xmlRegStatePtr state;
xmlRegTransPtr trans;
stateno = stateRemap[i];
if (stateno == -1)
continue;
state = ret->states[i];
transitions[stateno * (nbatoms + 1)] = state->type;
for (j = 0;j < state->nbTrans;j++) {
trans = &(state->trans[j]);
if ((trans->to == -1) || (trans->atom == NULL))
continue;
atomno = stringRemap[trans->atom->no];
if ((trans->atom->data != NULL) && (transdata == NULL)) {
transdata = (void **) xmlRegCalloc2(nbstates, nbatoms,
sizeof(void *));
if (transdata == NULL) {
xmlRegexpErrMemory(ctxt, "compiling regexp");
break;
}
}
targetno = stateRemap[trans->to];
/*
* if the same atom can generate transitions to 2 different
* states then it means the automata is not deterministic and
* the compact form can't be used !
*/
prev = transitions[stateno * (nbatoms + 1) + atomno + 1];
if (prev != 0) {
if (prev != targetno + 1) {
ret->determinist = 0;
#ifdef DEBUG_COMPACTION
printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n",
i, j, trans->atom->no, trans->to, atomno, targetno);
printf(" previous to is %d\n", prev);
#endif
if (transdata != NULL)
xmlFree(transdata);
xmlFree(transitions);
xmlFree(stateRemap);
xmlFree(stringRemap);
for (i = 0;i < nbatoms;i++)
xmlFree(stringMap[i]);
xmlFree(stringMap);
goto not_determ;
}
} else {
#if 0
printf("State %d trans %d: atom %d to %d : %d to %d\n",
i, j, trans->atom->no, trans->to, atomno, targetno);
#endif
transitions[stateno * (nbatoms + 1) + atomno + 1] =
targetno + 1; /* to avoid 0 */
if (transdata != NULL)
transdata[stateno * nbatoms + atomno] =
trans->atom->data;
}
}
}
ret->determinist = 1;
#ifdef DEBUG_COMPACTION
/*
* Debug
*/
for (i = 0;i < nbstates;i++) {
for (j = 0;j < nbatoms + 1;j++) {
printf("%02d ", transitions[i * (nbatoms + 1) + j]);
}
printf("\n");
}
printf("\n");
#endif
/*
* Cleanup of the old data
*/
if (ret->states != NULL) {
for (i = 0;i < ret->nbStates;i++)
xmlRegFreeState(ret->states[i]);
xmlFree(ret->states);
}
ret->states = NULL;
ret->nbStates = 0;
if (ret->atoms != NULL) {
for (i = 0;i < ret->nbAtoms;i++)
xmlRegFreeAtom(ret->atoms[i]);
xmlFree(ret->atoms);
}
ret->atoms = NULL;
ret->nbAtoms = 0;
ret->compact = transitions;
ret->transdata = transdata;
ret->stringMap = stringMap;
ret->nbstrings = nbatoms;
ret->nbstates = nbstates;
xmlFree(stateRemap);
xmlFree(stringRemap);
}
not_determ:
ctxt->string = NULL;
ctxt->nbStates = 0;
ctxt->states = NULL;
ctxt->nbAtoms = 0;
ctxt->atoms = NULL;
ctxt->nbCounters = 0;
ctxt->counters = NULL;
return(ret);
}
/**
* xmlRegNewParserCtxt:
* @string: the string to parse
*
* Allocate a new regexp parser context
*
* Returns the new context or NULL in case of error
*/
static xmlRegParserCtxtPtr
xmlRegNewParserCtxt(const xmlChar *string) {
xmlRegParserCtxtPtr ret;
ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt));
if (ret == NULL)
return(NULL);
memset(ret, 0, sizeof(xmlRegParserCtxt));
if (string != NULL)
ret->string = xmlStrdup(string);
ret->cur = ret->string;
ret->neg = 0;
ret->negs = 0;
ret->error = 0;
ret->determinist = -1;
return(ret);
}
/**
* xmlRegNewRange:
* @ctxt: the regexp parser context
* @neg: is that negative
* @type: the type of range
* @start: the start codepoint
* @end: the end codepoint
*
* Allocate a new regexp range
*
* Returns the new range or NULL in case of error
*/
static xmlRegRangePtr
xmlRegNewRange(xmlRegParserCtxtPtr ctxt,
int neg, xmlRegAtomType type, int start, int end) {
xmlRegRangePtr ret;
ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange));
if (ret == NULL) {
xmlRegexpErrMemory(ctxt, "allocating range");
return(NULL);
}
ret->neg = neg;
ret->type = type;
ret->start = start;
ret->end = end;
return(ret);
}
/**
* xmlRegFreeRange:
* @range: the regexp range
*
* Free a regexp range
*/
static void
xmlRegFreeRange(xmlRegRangePtr range) {
if (range == NULL)
return;
if (range->blockName != NULL)
xmlFree(range->blockName);
xmlFree(range);
}
/**
* xmlRegCopyRange:
* @range: the regexp range
*
* Copy a regexp range
*
* Returns the new copy or NULL in case of error.
*/
static xmlRegRangePtr
xmlRegCopyRange(xmlRegParserCtxtPtr ctxt, xmlRegRangePtr range) {
xmlRegRangePtr ret;
if (range == NULL)
return(NULL);
ret = xmlRegNewRange(ctxt, range->neg, range->type, range->start,
range->end);
if (ret == NULL)
return(NULL);
if (range->blockName != NULL) {
ret->blockName = xmlStrdup(range->blockName);
if (ret->blockName == NULL) {
xmlRegexpErrMemory(ctxt, "allocating range");
xmlRegFreeRange(ret);
return(NULL);
}
}
return(ret);
}
/**
* xmlRegNewAtom:
* @ctxt: the regexp parser context
* @type: the type of atom
*
* Allocate a new atom
*
* Returns the new atom or NULL in case of error
*/
static xmlRegAtomPtr
xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) {
xmlRegAtomPtr ret;
ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
if (ret == NULL) {
xmlRegexpErrMemory(ctxt, "allocating atom");
return(NULL);
}
memset(ret, 0, sizeof(xmlRegAtom));
ret->type = type;
ret->quant = XML_REGEXP_QUANT_ONCE;
ret->min = 0;
ret->max = 0;
return(ret);
}
/**
* xmlRegFreeAtom:
* @atom: the regexp atom
*
* Free a regexp atom
*/
static void
xmlRegFreeAtom(xmlRegAtomPtr atom) {
int i;
if (atom == NULL)
return;
for (i = 0;i < atom->nbRanges;i++)
xmlRegFreeRange(atom->ranges[i]);
if (atom->ranges != NULL)
xmlFree(atom->ranges);
if ((atom->type == XML_REGEXP_STRING) && (atom->valuep != NULL))
xmlFree(atom->valuep);
if ((atom->type == XML_REGEXP_STRING) && (atom->valuep2 != NULL))
xmlFree(atom->valuep2);
if ((atom->type == XML_REGEXP_BLOCK_NAME) && (atom->valuep != NULL))
xmlFree(atom->valuep);
xmlFree(atom);
}
/**
* xmlRegCopyAtom:
* @ctxt: the regexp parser context
* @atom: the original atom
*
* Allocate a new regexp range
*
* Returns the new atom or NULL in case of error
*/
static xmlRegAtomPtr
xmlRegCopyAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
xmlRegAtomPtr ret;
ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
if (ret == NULL) {
xmlRegexpErrMemory(ctxt, "copying atom");
return(NULL);
}
memset(ret, 0, sizeof(xmlRegAtom));
ret->type = atom->type;
ret->quant = atom->quant;
ret->min = atom->min;
ret->max = atom->max;
if (atom->nbRanges > 0) {
int i;
ret->ranges = (xmlRegRangePtr *) xmlMalloc(sizeof(xmlRegRangePtr) *
atom->nbRanges);
if (ret->ranges == NULL) {
xmlRegexpErrMemory(ctxt, "copying atom");
goto error;
}
for (i = 0;i < atom->nbRanges;i++) {
ret->ranges[i] = xmlRegCopyRange(ctxt, atom->ranges[i]);
if (ret->ranges[i] == NULL)
goto error;
ret->nbRanges = i + 1;
}
}
return(ret);
error:
xmlRegFreeAtom(ret);
return(NULL);
}
static xmlRegStatePtr
xmlRegNewState(xmlRegParserCtxtPtr ctxt) {
xmlRegStatePtr ret;
ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState));
if (ret == NULL) {
xmlRegexpErrMemory(ctxt, "allocating state");
return(NULL);
}
memset(ret, 0, sizeof(xmlRegState));
ret->type = XML_REGEXP_TRANS_STATE;
ret->mark = XML_REGEXP_MARK_NORMAL;
return(ret);
}
/**
* xmlRegFreeState:
* @state: the regexp state
*
* Free a regexp state
*/
static void
xmlRegFreeState(xmlRegStatePtr state) {
if (state == NULL)
return;
if (state->trans != NULL)
xmlFree(state->trans);
if (state->transTo != NULL)
xmlFree(state->transTo);
xmlFree(state);
}
/**
* xmlRegFreeParserCtxt:
* @ctxt: the regexp parser context
*
* Free a regexp parser context
*/
static void
xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) {
int i;
if (ctxt == NULL)
return;
if (ctxt->string != NULL)
xmlFree(ctxt->string);
if (ctxt->states != NULL) {
for (i = 0;i < ctxt->nbStates;i++)
xmlRegFreeState(ctxt->states[i]);
xmlFree(ctxt->states);
}
if (ctxt->atoms != NULL) {
for (i = 0;i < ctxt->nbAtoms;i++)
xmlRegFreeAtom(ctxt->atoms[i]);
xmlFree(ctxt->atoms);
}
if (ctxt->counters != NULL)
xmlFree(ctxt->counters);
xmlFree(ctxt);
}
/************************************************************************
* *
* Display of Data structures *
* *
************************************************************************/
static void
xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) {
switch (type) {
case XML_REGEXP_EPSILON:
fprintf(output, "epsilon "); break;
case XML_REGEXP_CHARVAL:
fprintf(output, "charval "); break;
case XML_REGEXP_RANGES:
fprintf(output, "ranges "); break;
case XML_REGEXP_SUBREG:
fprintf(output, "subexpr "); break;
case XML_REGEXP_STRING:
fprintf(output, "string "); break;
case XML_REGEXP_ANYCHAR:
fprintf(output, "anychar "); break;
case XML_REGEXP_ANYSPACE:
fprintf(output, "anyspace "); break;
case XML_REGEXP_NOTSPACE:
fprintf(output, "notspace "); break;
case XML_REGEXP_INITNAME:
fprintf(output, "initname "); break;
case XML_REGEXP_NOTINITNAME:
fprintf(output, "notinitname "); break;
case XML_REGEXP_NAMECHAR:
fprintf(output, "namechar "); break;
case XML_REGEXP_NOTNAMECHAR:
fprintf(output, "notnamechar "); break;
case XML_REGEXP_DECIMAL:
fprintf(output, "decimal "); break;
case XML_REGEXP_NOTDECIMAL:
fprintf(output, "notdecimal "); break;
case XML_REGEXP_REALCHAR:
fprintf(output, "realchar "); break;
case XML_REGEXP_NOTREALCHAR:
fprintf(output, "notrealchar "); break;
case XML_REGEXP_LETTER:
fprintf(output, "LETTER "); break;
case XML_REGEXP_LETTER_UPPERCASE:
fprintf(output, "LETTER_UPPERCASE "); break;
case XML_REGEXP_LETTER_LOWERCASE:
fprintf(output, "LETTER_LOWERCASE "); break;
case XML_REGEXP_LETTER_TITLECASE:
fprintf(output, "LETTER_TITLECASE "); break;
case XML_REGEXP_LETTER_MODIFIER:
fprintf(output, "LETTER_MODIFIER "); break;
case XML_REGEXP_LETTER_OTHERS:
fprintf(output, "LETTER_OTHERS "); break;
case XML_REGEXP_MARK:
fprintf(output, "MARK "); break;
case XML_REGEXP_MARK_NONSPACING:
fprintf(output, "MARK_NONSPACING "); break;
case XML_REGEXP_MARK_SPACECOMBINING:
fprintf(output, "MARK_SPACECOMBINING "); break;
case XML_REGEXP_MARK_ENCLOSING:
fprintf(output, "MARK_ENCLOSING "); break;
case XML_REGEXP_NUMBER:
fprintf(output, "NUMBER "); break;
case XML_REGEXP_NUMBER_DECIMAL:
fprintf(output, "NUMBER_DECIMAL "); break;
case XML_REGEXP_NUMBER_LETTER:
fprintf(output, "NUMBER_LETTER "); break;
case XML_REGEXP_NUMBER_OTHERS:
fprintf(output, "NUMBER_OTHERS "); break;
case XML_REGEXP_PUNCT:
fprintf(output, "PUNCT "); break;
case XML_REGEXP_PUNCT_CONNECTOR:
fprintf(output, "PUNCT_CONNECTOR "); break;
case XML_REGEXP_PUNCT_DASH:
fprintf(output, "PUNCT_DASH "); break;
case XML_REGEXP_PUNCT_OPEN:
fprintf(output, "PUNCT_OPEN "); break;
case XML_REGEXP_PUNCT_CLOSE:
fprintf(output, "PUNCT_CLOSE "); break;
case XML_REGEXP_PUNCT_INITQUOTE:
fprintf(output, "PUNCT_INITQUOTE "); break;
case XML_REGEXP_PUNCT_FINQUOTE:
fprintf(output, "PUNCT_FINQUOTE "); break;
case XML_REGEXP_PUNCT_OTHERS:
fprintf(output, "PUNCT_OTHERS "); break;
case XML_REGEXP_SEPAR:
fprintf(output, "SEPAR "); break;
case XML_REGEXP_SEPAR_SPACE:
fprintf(output, "SEPAR_SPACE "); break;
case XML_REGEXP_SEPAR_LINE:
fprintf(output, "SEPAR_LINE "); break;
case XML_REGEXP_SEPAR_PARA:
fprintf(output, "SEPAR_PARA "); break;
case XML_REGEXP_SYMBOL:
fprintf(output, "SYMBOL "); break;
case XML_REGEXP_SYMBOL_MATH:
fprintf(output, "SYMBOL_MATH "); break;
case XML_REGEXP_SYMBOL_CURRENCY:
fprintf(output, "SYMBOL_CURRENCY "); break;
case XML_REGEXP_SYMBOL_MODIFIER:
fprintf(output, "SYMBOL_MODIFIER "); break;
case XML_REGEXP_SYMBOL_OTHERS:
fprintf(output, "SYMBOL_OTHERS "); break;
case XML_REGEXP_OTHER:
fprintf(output, "OTHER "); break;
case XML_REGEXP_OTHER_CONTROL:
fprintf(output, "OTHER_CONTROL "); break;
case XML_REGEXP_OTHER_FORMAT:
fprintf(output, "OTHER_FORMAT "); break;
case XML_REGEXP_OTHER_PRIVATE:
fprintf(output, "OTHER_PRIVATE "); break;
case XML_REGEXP_OTHER_NA:
fprintf(output, "OTHER_NA "); break;
case XML_REGEXP_BLOCK_NAME:
fprintf(output, "BLOCK "); break;
}
}
static void
xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) {
switch (type) {
case XML_REGEXP_QUANT_EPSILON:
fprintf(output, "epsilon "); break;
case XML_REGEXP_QUANT_ONCE:
fprintf(output, "once "); break;
case XML_REGEXP_QUANT_OPT:
fprintf(output, "? "); break;
case XML_REGEXP_QUANT_MULT:
fprintf(output, "* "); break;
case XML_REGEXP_QUANT_PLUS:
fprintf(output, "+ "); break;
case XML_REGEXP_QUANT_RANGE:
fprintf(output, "range "); break;
case XML_REGEXP_QUANT_ONCEONLY:
fprintf(output, "onceonly "); break;
case XML_REGEXP_QUANT_ALL:
fprintf(output, "all "); break;
}
}
static void
xmlRegPrintRange(FILE *output, xmlRegRangePtr range) {
fprintf(output, " range: ");
if (range->neg)
fprintf(output, "negative ");
xmlRegPrintAtomType(output, range->type);
fprintf(output, "%c - %c\n", range->start, range->end);
}
static void
xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) {
fprintf(output, " atom: ");
if (atom == NULL) {
fprintf(output, "NULL\n");
return;
}
if (atom->neg)
fprintf(output, "not ");
xmlRegPrintAtomType(output, atom->type);
xmlRegPrintQuantType(output, atom->quant);
if (atom->quant == XML_REGEXP_QUANT_RANGE)
fprintf(output, "%d-%d ", atom->min, atom->max);
if (atom->type == XML_REGEXP_STRING)
fprintf(output, "'%s' ", (char *) atom->valuep);
if (atom->type == XML_REGEXP_CHARVAL)
fprintf(output, "char %c\n", atom->codepoint);
else if (atom->type == XML_REGEXP_RANGES) {
int i;
fprintf(output, "%d entries\n", atom->nbRanges);
for (i = 0; i < atom->nbRanges;i++)
xmlRegPrintRange(output, atom->ranges[i]);
} else if (atom->type == XML_REGEXP_SUBREG) {
fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no);
} else {
fprintf(output, "\n");
}
}
static void
xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) {
fprintf(output, " trans: ");
if (trans == NULL) {
fprintf(output, "NULL\n");
return;
}
if (trans->to < 0) {
fprintf(output, "removed\n");
return;
}
if (trans->nd != 0) {
if (trans->nd == 2)
fprintf(output, "last not determinist, ");
else
fprintf(output, "not determinist, ");
}
if (trans->counter >= 0) {
fprintf(output, "counted %d, ", trans->counter);
}
if (trans->count == REGEXP_ALL_COUNTER) {
fprintf(output, "all transition, ");
} else if (trans->count >= 0) {
fprintf(output, "count based %d, ", trans->count);
}
if (trans->atom == NULL) {
fprintf(output, "epsilon to %d\n", trans->to);
return;
}
if (trans->atom->type == XML_REGEXP_CHARVAL)
fprintf(output, "char %c ", trans->atom->codepoint);
fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to);
}
static void
xmlRegPrintState(FILE *output, xmlRegStatePtr state) {
int i;
fprintf(output, " state: ");
if (state == NULL) {
fprintf(output, "NULL\n");
return;
}
if (state->type == XML_REGEXP_START_STATE)
fprintf(output, "START ");
if (state->type == XML_REGEXP_FINAL_STATE)
fprintf(output, "FINAL ");
fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans);
for (i = 0;i < state->nbTrans; i++) {
xmlRegPrintTrans(output, &(state->trans[i]));
}
}
#ifdef DEBUG_REGEXP_GRAPH
static void
xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) {
int i;
fprintf(output, " ctxt: ");
if (ctxt == NULL) {
fprintf(output, "NULL\n");
return;
}
fprintf(output, "'%s' ", ctxt->string);
if (ctxt->error)
fprintf(output, "error ");
if (ctxt->neg)
fprintf(output, "neg ");
fprintf(output, "\n");
fprintf(output, "%d atoms:\n", ctxt->nbAtoms);
for (i = 0;i < ctxt->nbAtoms; i++) {
fprintf(output, " %02d ", i);
xmlRegPrintAtom(output, ctxt->atoms[i]);
}
if (ctxt->atom != NULL) {
fprintf(output, "current atom:\n");
xmlRegPrintAtom(output, ctxt->atom);
}
fprintf(output, "%d states:", ctxt->nbStates);
if (ctxt->start != NULL)
fprintf(output, " start: %d", ctxt->start->no);
if (ctxt->end != NULL)
fprintf(output, " end: %d", ctxt->end->no);
fprintf(output, "\n");
for (i = 0;i < ctxt->nbStates; i++) {
xmlRegPrintState(output, ctxt->states[i]);
}
fprintf(output, "%d counters:\n", ctxt->nbCounters);
for (i = 0;i < ctxt->nbCounters; i++) {
fprintf(output, " %d: min %d max %d\n", i, ctxt->counters[i].min,
ctxt->counters[i].max);
}
}
#endif
/************************************************************************
* *
* Finite Automata structures manipulations *
* *
************************************************************************/
static void
xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom,
int neg, xmlRegAtomType type, int start, int end,
xmlChar *blockName) {
xmlRegRangePtr range;
if (atom == NULL) {
ERROR("add range: atom is NULL");
return;
}
if (atom->type != XML_REGEXP_RANGES) {
ERROR("add range: atom is not ranges");
return;
}
if (atom->maxRanges == 0) {
atom->maxRanges = 4;
atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges *
sizeof(xmlRegRangePtr));
if (atom->ranges == NULL) {
xmlRegexpErrMemory(ctxt, "adding ranges");
atom->maxRanges = 0;
return;
}
} else if (atom->nbRanges >= atom->maxRanges) {
xmlRegRangePtr *tmp;
atom->maxRanges *= 2;
tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges *
sizeof(xmlRegRangePtr));
if (tmp == NULL) {
xmlRegexpErrMemory(ctxt, "adding ranges");
atom->maxRanges /= 2;
return;
}
atom->ranges = tmp;
}
range = xmlRegNewRange(ctxt, neg, type, start, end);
if (range == NULL)
return;
range->blockName = blockName;
atom->ranges[atom->nbRanges++] = range;
}
static int
xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) {
if (ctxt->maxCounters == 0) {
ctxt->maxCounters = 4;
ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters *
sizeof(xmlRegCounter));
if (ctxt->counters == NULL) {
xmlRegexpErrMemory(ctxt, "allocating counter");
ctxt->maxCounters = 0;
return(-1);
}
} else if (ctxt->nbCounters >= ctxt->maxCounters) {
xmlRegCounter *tmp;
ctxt->maxCounters *= 2;
tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters *
sizeof(xmlRegCounter));
if (tmp == NULL) {
xmlRegexpErrMemory(ctxt, "allocating counter");
ctxt->maxCounters /= 2;
return(-1);
}
ctxt->counters = tmp;
}
ctxt->counters[ctxt->nbCounters].min = -1;
ctxt->counters[ctxt->nbCounters].max = -1;
return(ctxt->nbCounters++);
}
static int
xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
if (atom == NULL) {
ERROR("atom push: atom is NULL");
return(-1);
}
if (ctxt->maxAtoms == 0) {
ctxt->maxAtoms = 4;
ctxt->atoms = (xmlRegAtomPtr *) xmlMalloc(ctxt->maxAtoms *
sizeof(xmlRegAtomPtr));
if (ctxt->atoms == NULL) {
xmlRegexpErrMemory(ctxt, "pushing atom");
ctxt->maxAtoms = 0;
return(-1);
}
} else if (ctxt->nbAtoms >= ctxt->maxAtoms) {
xmlRegAtomPtr *tmp;
ctxt->maxAtoms *= 2;
tmp = (xmlRegAtomPtr *) xmlRealloc(ctxt->atoms, ctxt->maxAtoms *
sizeof(xmlRegAtomPtr));
if (tmp == NULL) {
xmlRegexpErrMemory(ctxt, "allocating counter");
ctxt->maxAtoms /= 2;
return(-1);
}
ctxt->atoms = tmp;
}
atom->no = ctxt->nbAtoms;
ctxt->atoms[ctxt->nbAtoms++] = atom;
return(0);
}
static void
xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr target,
int from) {
if (target->maxTransTo == 0) {
target->maxTransTo = 8;
target->transTo = (int *) xmlMalloc(target->maxTransTo *
sizeof(int));
if (target->transTo == NULL) {
xmlRegexpErrMemory(ctxt, "adding transition");
target->maxTransTo = 0;
return;
}
} else if (target->nbTransTo >= target->maxTransTo) {
int *tmp;
target->maxTransTo *= 2;
tmp = (int *) xmlRealloc(target->transTo, target->maxTransTo *
sizeof(int));
if (tmp == NULL) {
xmlRegexpErrMemory(ctxt, "adding transition");
target->maxTransTo /= 2;
return;
}
target->transTo = tmp;
}
target->transTo[target->nbTransTo] = from;
target->nbTransTo++;
}
static void
xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
xmlRegAtomPtr atom, xmlRegStatePtr target,
int counter, int count) {
int nrtrans;
if (state == NULL) {
ERROR("add state: state is NULL");
return;
}
if (target == NULL) {
ERROR("add state: target is NULL");
return;
}
/*
* Other routines follow the philosophy 'When in doubt, add a transition'
* so we check here whether such a transition is already present and, if
* so, silently ignore this request.
*/
for (nrtrans = state->nbTrans - 1; nrtrans >= 0; nrtrans--) {
xmlRegTransPtr trans = &(state->trans[nrtrans]);
if ((trans->atom == atom) &&
(trans->to == target->no) &&
(trans->counter == counter) &&
(trans->count == count)) {
#ifdef DEBUG_REGEXP_GRAPH
printf("Ignoring duplicate transition from %d to %d\n",
state->no, target->no);
#endif
return;
}
}
if (state->maxTrans == 0) {
state->maxTrans = 8;
state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans *
sizeof(xmlRegTrans));
if (state->trans == NULL) {
xmlRegexpErrMemory(ctxt, "adding transition");
state->maxTrans = 0;
return;
}
} else if (state->nbTrans >= state->maxTrans) {
xmlRegTrans *tmp;
state->maxTrans *= 2;
tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans *
sizeof(xmlRegTrans));
if (tmp == NULL) {
xmlRegexpErrMemory(ctxt, "adding transition");
state->maxTrans /= 2;
return;
}
state->trans = tmp;
}
#ifdef DEBUG_REGEXP_GRAPH
printf("Add trans from %d to %d ", state->no, target->no);
if (count == REGEXP_ALL_COUNTER)
printf("all transition\n");
else if (count >= 0)
printf("count based %d\n", count);
else if (counter >= 0)
printf("counted %d\n", counter);
else if (atom == NULL)
printf("epsilon transition\n");
else if (atom != NULL)
xmlRegPrintAtom(stdout, atom);
#endif
state->trans[state->nbTrans].atom = atom;
state->trans[state->nbTrans].to = target->no;
state->trans[state->nbTrans].counter = counter;
state->trans[state->nbTrans].count = count;
state->trans[state->nbTrans].nd = 0;
state->nbTrans++;
xmlRegStateAddTransTo(ctxt, target, state->no);
}
static int
xmlRegStatePush(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) {
if (state == NULL) return(-1);
if (ctxt->maxStates == 0) {
ctxt->maxStates = 4;
ctxt->states = (xmlRegStatePtr *) xmlMalloc(ctxt->maxStates *
sizeof(xmlRegStatePtr));
if (ctxt->states == NULL) {
xmlRegexpErrMemory(ctxt, "adding state");
ctxt->maxStates = 0;
return(-1);
}
} else if (ctxt->nbStates >= ctxt->maxStates) {
xmlRegStatePtr *tmp;
ctxt->maxStates *= 2;
tmp = (xmlRegStatePtr *) xmlRealloc(ctxt->states, ctxt->maxStates *
sizeof(xmlRegStatePtr));
if (tmp == NULL) {
xmlRegexpErrMemory(ctxt, "adding state");
ctxt->maxStates /= 2;
return(-1);
}
ctxt->states = tmp;
}
state->no = ctxt->nbStates;
ctxt->states[ctxt->nbStates++] = state;
return(0);
}
/**
* xmlFAGenerateAllTransition:
* @ctxt: a regexp parser context
* @from: the from state
* @to: the target state or NULL for building a new one
* @lax:
*
*/
static void
xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt,
xmlRegStatePtr from, xmlRegStatePtr to,
int lax) {
if (to == NULL) {
to = xmlRegNewState(ctxt);
xmlRegStatePush(ctxt, to);
ctxt->state = to;
}
if (lax)
xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER);
else
xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER);
}
/**
* xmlFAGenerateEpsilonTransition:
* @ctxt: a regexp parser context
* @from: the from state
* @to: the target state or NULL for building a new one
*
*/
static void
xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt,
xmlRegStatePtr from, xmlRegStatePtr to) {
if (to == NULL) {
to = xmlRegNewState(ctxt);
xmlRegStatePush(ctxt, to);
ctxt->state = to;
}
xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1);
}
/**
* xmlFAGenerateCountedEpsilonTransition:
* @ctxt: a regexp parser context
* @from: the from state
* @to: the target state or NULL for building a new one
* counter: the counter for that transition
*
*/
static void
xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt,
xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
if (to == NULL) {
to = xmlRegNewState(ctxt);
xmlRegStatePush(ctxt, to);
ctxt->state = to;
}
xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1);
}
/**
* xmlFAGenerateCountedTransition:
* @ctxt: a regexp parser context
* @from: the from state
* @to: the target state or NULL for building a new one
* counter: the counter for that transition
*
*/
static void
xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt,
xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
if (to == NULL) {
to = xmlRegNewState(ctxt);
xmlRegStatePush(ctxt, to);
ctxt->state = to;
}
xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter);
}
/**
* xmlFAGenerateTransitions:
* @ctxt: a regexp parser context
* @from: the from state
* @to: the target state or NULL for building a new one
* @atom: the atom generating the transition
*
* Returns 0 if success and -1 in case of error.
*/
static int
xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from,
xmlRegStatePtr to, xmlRegAtomPtr atom) {
xmlRegStatePtr end;
int nullable = 0;
if (atom == NULL) {
ERROR("generate transition: atom == NULL");
return(-1);
}
if (atom->type == XML_REGEXP_SUBREG) {
/*
* this is a subexpression handling one should not need to
* create a new node except for XML_REGEXP_QUANT_RANGE.
*/
if (xmlRegAtomPush(ctxt, atom) < 0) {
return(-1);
}
if ((to != NULL) && (atom->stop != to) &&
(atom->quant != XML_REGEXP_QUANT_RANGE)) {
/*
* Generate an epsilon transition to link to the target
*/
xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
#ifdef DV
} else if ((to == NULL) && (atom->quant != XML_REGEXP_QUANT_RANGE) &&
(atom->quant != XML_REGEXP_QUANT_ONCE)) {
to = xmlRegNewState(ctxt);
xmlRegStatePush(ctxt, to);
ctxt->state = to;
xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
#endif
}
switch (atom->quant) {
case XML_REGEXP_QUANT_OPT:
atom->quant = XML_REGEXP_QUANT_ONCE;
/*
* transition done to the state after end of atom.
* 1. set transition from atom start to new state
* 2. set transition from atom end to this state.
*/
if (to == NULL) {
xmlFAGenerateEpsilonTransition(ctxt, atom->start, 0);
xmlFAGenerateEpsilonTransition(ctxt, atom->stop,
ctxt->state);
} else {
xmlFAGenerateEpsilonTransition(ctxt, atom->start, to);
}
break;
case XML_REGEXP_QUANT_MULT:
atom->quant = XML_REGEXP_QUANT_ONCE;
xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop);
xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
break;
case XML_REGEXP_QUANT_PLUS:
atom->quant = XML_REGEXP_QUANT_ONCE;
xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
break;
case XML_REGEXP_QUANT_RANGE: {
int counter;
xmlRegStatePtr inter, newstate;
/*
* create the final state now if needed
*/
if (to != NULL) {
newstate = to;
} else {
newstate = xmlRegNewState(ctxt);
xmlRegStatePush(ctxt, newstate);
}
/*
* The principle here is to use counted transition
* to avoid explosion in the number of states in the
* graph. This is clearly more complex but should not
* be exploitable at runtime.
*/
if ((atom->min == 0) && (atom->start0 == NULL)) {
xmlRegAtomPtr copy;
/*
* duplicate a transition based on atom to count next
* occurrences after 1. We cannot loop to atom->start
* directly because we need an epsilon transition to
* newstate.
*/
/* ???? For some reason it seems we never reach that
case, I suppose this got optimized out before when
building the automata */
copy = xmlRegCopyAtom(ctxt, atom);
if (copy == NULL)
return(-1);
copy->quant = XML_REGEXP_QUANT_ONCE;
copy->min = 0;
copy->max = 0;
if (xmlFAGenerateTransitions(ctxt, atom->start, NULL, copy)
< 0)
return(-1);
inter = ctxt->state;
counter = xmlRegGetCounter(ctxt);
ctxt->counters[counter].min = atom->min - 1;
ctxt->counters[counter].max = atom->max - 1;
/* count the number of times we see it again */
xmlFAGenerateCountedEpsilonTransition(ctxt, inter,
atom->stop, counter);
/* allow a way out based on the count */
xmlFAGenerateCountedTransition(ctxt, inter,
newstate, counter);
/* and also allow a direct exit for 0 */
xmlFAGenerateEpsilonTransition(ctxt, atom->start,
newstate);
} else {
/*
* either we need the atom at least once or there
* is an atom->start0 allowing to easily plug the
* epsilon transition.
*/
counter = xmlRegGetCounter(ctxt);
ctxt->counters[counter].min = atom->min - 1;
ctxt->counters[counter].max = atom->max - 1;
/* count the number of times we see it again */
xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop,
atom->start, counter);
/* allow a way out based on the count */
xmlFAGenerateCountedTransition(ctxt, atom->stop,
newstate, counter);
/* and if needed allow a direct exit for 0 */
if (atom->min == 0)
xmlFAGenerateEpsilonTransition(ctxt, atom->start0,
newstate);
}
atom->min = 0;
atom->max = 0;
atom->quant = XML_REGEXP_QUANT_ONCE;
ctxt->state = newstate;
}
default:
break;
}
return(0);
}
if ((atom->min == 0) && (atom->max == 0) &&
(atom->quant == XML_REGEXP_QUANT_RANGE)) {
/*
* we can discard the atom and generate an epsilon transition instead
*/
if (to == NULL) {
to = xmlRegNewState(ctxt);
if (to != NULL)
xmlRegStatePush(ctxt, to);
else {
return(-1);
}
}
xmlFAGenerateEpsilonTransition(ctxt, from, to);
ctxt->state = to;
xmlRegFreeAtom(atom);
return(0);
}
if (to == NULL) {
to = xmlRegNewState(ctxt);
if (to != NULL)
xmlRegStatePush(ctxt, to);
else {
return(-1);
}
}
end = to;
if ((atom->quant == XML_REGEXP_QUANT_MULT) ||
(atom->quant == XML_REGEXP_QUANT_PLUS)) {
/*
* Do not pollute the target state by adding transitions from
* it as it is likely to be the shared target of multiple branches.
* So isolate with an epsilon transition.
*/
xmlRegStatePtr tmp;
tmp = xmlRegNewState(ctxt);
if (tmp != NULL)
xmlRegStatePush(ctxt, tmp);
else {
return(-1);
}
xmlFAGenerateEpsilonTransition(ctxt, tmp, to);
to = tmp;
}
if (xmlRegAtomPush(ctxt, atom) < 0) {
return(-1);
}
if ((atom->quant == XML_REGEXP_QUANT_RANGE) &&
(atom->min == 0) && (atom->max > 0)) {
nullable = 1;
atom->min = 1;
if (atom->max == 1)
atom->quant = XML_REGEXP_QUANT_OPT;
}
xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1);
ctxt->state = end;
switch (atom->quant) {
case XML_REGEXP_QUANT_OPT:
atom->quant = XML_REGEXP_QUANT_ONCE;
xmlFAGenerateEpsilonTransition(ctxt, from, to);
break;
case XML_REGEXP_QUANT_MULT:
atom->quant = XML_REGEXP_QUANT_ONCE;
xmlFAGenerateEpsilonTransition(ctxt, from, to);
xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
break;
case XML_REGEXP_QUANT_PLUS:
atom->quant = XML_REGEXP_QUANT_ONCE;
xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
break;
case XML_REGEXP_QUANT_RANGE:
if (nullable)
xmlFAGenerateEpsilonTransition(ctxt, from, to);
break;
default:
break;
}
return(0);
}
/**
* xmlFAReduceEpsilonTransitions:
* @ctxt: a regexp parser context
* @fromnr: the from state
* @tonr: the to state
* @counter: should that transition be associated to a counted
*
*/
static void
xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr,
int tonr, int counter) {
int transnr;
xmlRegStatePtr from;
xmlRegStatePtr to;
#ifdef DEBUG_REGEXP_GRAPH
printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr, tonr);
#endif
from = ctxt->states[fromnr];
if (from == NULL)
return;
to = ctxt->states[tonr];
if (to == NULL)
return;
if ((to->mark == XML_REGEXP_MARK_START) ||
(to->mark == XML_REGEXP_MARK_VISITED))
return;
to->mark = XML_REGEXP_MARK_VISITED;
if (to->type == XML_REGEXP_FINAL_STATE) {
#ifdef DEBUG_REGEXP_GRAPH
printf("State %d is final, so %d becomes final\n", tonr, fromnr);
#endif
from->type = XML_REGEXP_FINAL_STATE;
}
for (transnr = 0;transnr < to->nbTrans;transnr++) {
if (to->trans[transnr].to < 0)
continue;
if (to->trans[transnr].atom == NULL) {
/*
* Don't remove counted transitions
* Don't loop either
*/
if (to->trans[transnr].to != fromnr) {
if (to->trans[transnr].count >= 0) {
int newto = to->trans[transnr].to;
xmlRegStateAddTrans(ctxt, from, NULL,
ctxt->states[newto],
-1, to->trans[transnr].count);
} else {
#ifdef DEBUG_REGEXP_GRAPH
printf("Found epsilon trans %d from %d to %d\n",
transnr, tonr, to->trans[transnr].to);
#endif
if (to->trans[transnr].counter >= 0) {
xmlFAReduceEpsilonTransitions(ctxt, fromnr,
to->trans[transnr].to,
to->trans[transnr].counter);
} else {
xmlFAReduceEpsilonTransitions(ctxt, fromnr,
to->trans[transnr].to,
counter);
}
}
}
} else {
int newto = to->trans[transnr].to;
if (to->trans[transnr].counter >= 0) {
xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
ctxt->states[newto],
to->trans[transnr].counter, -1);
} else {
xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
ctxt->states[newto], counter, -1);
}
}
}
to->mark = XML_REGEXP_MARK_NORMAL;
}
/**
* xmlFAEliminateSimpleEpsilonTransitions:
* @ctxt: a regexp parser context
*
* Eliminating general epsilon transitions can get costly in the general
* algorithm due to the large amount of generated new transitions and
* associated comparisons. However for simple epsilon transition used just
* to separate building blocks when generating the automata this can be
* reduced to state elimination:
* - if there exists an epsilon from X to Y
* - if there is no other transition from X
* then X and Y are semantically equivalent and X can be eliminated
* If X is the start state then make Y the start state, else replace the
* target of all transitions to X by transitions to Y.
*/
static void
xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
int statenr, i, j, newto;
xmlRegStatePtr state, tmp;
for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
state = ctxt->states[statenr];
if (state == NULL)
continue;
if (state->nbTrans != 1)
continue;
if (state->type == XML_REGEXP_UNREACH_STATE)
continue;
/* is the only transition out a basic transition */
if ((state->trans[0].atom == NULL) &&
(state->trans[0].to >= 0) &&
(state->trans[0].to != statenr) &&
(state->trans[0].counter < 0) &&
(state->trans[0].count < 0)) {
newto = state->trans[0].to;
if (state->type == XML_REGEXP_START_STATE) {
#ifdef DEBUG_REGEXP_GRAPH
printf("Found simple epsilon trans from start %d to %d\n",
statenr, newto);
#endif
} else {
#ifdef DEBUG_REGEXP_GRAPH
printf("Found simple epsilon trans from %d to %d\n",
statenr, newto);
#endif
for (i = 0;i < state->nbTransTo;i++) {
tmp = ctxt->states[state->transTo[i]];
for (j = 0;j < tmp->nbTrans;j++) {
if (tmp->trans[j].to == statenr) {
#ifdef DEBUG_REGEXP_GRAPH
printf("Changed transition %d on %d to go to %d\n",
j, tmp->no, newto);
#endif
tmp->trans[j].to = -1;
xmlRegStateAddTrans(ctxt, tmp, tmp->trans[j].atom,
ctxt->states[newto],
tmp->trans[j].counter,
tmp->trans[j].count);
}
}
}
if (state->type == XML_REGEXP_FINAL_STATE)
ctxt->states[newto]->type = XML_REGEXP_FINAL_STATE;
/* eliminate the transition completely */
state->nbTrans = 0;
state->type = XML_REGEXP_UNREACH_STATE;
}
}
}
}
/**
* xmlFAEliminateEpsilonTransitions:
* @ctxt: a regexp parser context
*
*/
static void
xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
int statenr, transnr;
xmlRegStatePtr state;
int has_epsilon;
if (ctxt->states == NULL) return;
/*
* Eliminate simple epsilon transition and the associated unreachable
* states.
*/
xmlFAEliminateSimpleEpsilonTransitions(ctxt);
for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
state = ctxt->states[statenr];
if ((state != NULL) && (state->type == XML_REGEXP_UNREACH_STATE)) {
#ifdef DEBUG_REGEXP_GRAPH
printf("Removed unreachable state %d\n", statenr);
#endif
xmlRegFreeState(state);
ctxt->states[statenr] = NULL;
}
}
has_epsilon = 0;
/*
* Build the completed transitions bypassing the epsilons
* Use a marking algorithm to avoid loops
* Mark sink states too.
* Process from the latest states backward to the start when
* there is long cascading epsilon chains this minimize the
* recursions and transition compares when adding the new ones
*/
for (statenr = ctxt->nbStates - 1;statenr >= 0;statenr--) {
state = ctxt->states[statenr];
if (state == NULL)
continue;
if ((state->nbTrans == 0) &&
(state->type != XML_REGEXP_FINAL_STATE)) {
state->type = XML_REGEXP_SINK_STATE;
}
for (transnr = 0;transnr < state->nbTrans;transnr++) {
if ((state->trans[transnr].atom == NULL) &&
(state->trans[transnr].to >= 0)) {
if (state->trans[transnr].to == statenr) {
state->trans[transnr].to = -1;
#ifdef DEBUG_REGEXP_GRAPH
printf("Removed loopback epsilon trans %d on %d\n",
transnr, statenr);
#endif
} else if (state->trans[transnr].count < 0) {
int newto = state->trans[transnr].to;
#ifdef DEBUG_REGEXP_GRAPH
printf("Found epsilon trans %d from %d to %d\n",
transnr, statenr, newto);
#endif
has_epsilon = 1;
state->trans[transnr].to = -2;
state->mark = XML_REGEXP_MARK_START;
xmlFAReduceEpsilonTransitions(ctxt, statenr,
newto, state->trans[transnr].counter);
state->mark = XML_REGEXP_MARK_NORMAL;
#ifdef DEBUG_REGEXP_GRAPH
} else {
printf("Found counted transition %d on %d\n",
transnr, statenr);
#endif
}
}
}
}
/*
* Eliminate the epsilon transitions
*/
if (has_epsilon) {
for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
state = ctxt->states[statenr];
if (state == NULL)
continue;
for (transnr = 0;transnr < state->nbTrans;transnr++) {
xmlRegTransPtr trans = &(state->trans[transnr]);
if ((trans->atom == NULL) &&
(trans->count < 0) &&
(trans->to >= 0)) {
trans->to = -1;
}
}
}
}
/*
* Use this pass to detect unreachable states too
*/
for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
state = ctxt->states[statenr];
if (state != NULL)
state->reached = XML_REGEXP_MARK_NORMAL;
}
state = ctxt->states[0];
if (state != NULL)
state->reached = XML_REGEXP_MARK_START;
while (state != NULL) {
xmlRegStatePtr target = NULL;
state->reached = XML_REGEXP_MARK_VISITED;
/*
* Mark all states reachable from the current reachable state
*/
for (transnr = 0;transnr < state->nbTrans;transnr++) {
if ((state->trans[transnr].to >= 0) &&
((state->trans[transnr].atom != NULL) ||
(state->trans[transnr].count >= 0))) {
int newto = state->trans[transnr].to;
if (ctxt->states[newto] == NULL)
continue;
if (ctxt->states[newto]->reached == XML_REGEXP_MARK_NORMAL) {
ctxt->states[newto]->reached = XML_REGEXP_MARK_START;
target = ctxt->states[newto];
}
}
}
/*
* find the next accessible state not explored
*/
if (target == NULL) {
for (statenr = 1;statenr < ctxt->nbStates;statenr++) {
state = ctxt->states[statenr];
if ((state != NULL) && (state->reached ==
XML_REGEXP_MARK_START)) {
target = state;
break;
}
}
}
state = target;
}
for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
state = ctxt->states[statenr];
if ((state != NULL) && (state->reached == XML_REGEXP_MARK_NORMAL)) {
#ifdef DEBUG_REGEXP_GRAPH
printf("Removed unreachable state %d\n", statenr);
#endif
xmlRegFreeState(state);
ctxt->states[statenr] = NULL;
}
}
}
static int
xmlFACompareRanges(xmlRegRangePtr range1, xmlRegRangePtr range2) {
int ret = 0;
if ((range1->type == XML_REGEXP_RANGES) ||
(range2->type == XML_REGEXP_RANGES) ||
(range2->type == XML_REGEXP_SUBREG) ||
(range1->type == XML_REGEXP_SUBREG) ||
(range1->type == XML_REGEXP_STRING) ||
(range2->type == XML_REGEXP_STRING))
return(-1);
/* put them in order */
if (range1->type > range2->type) {
xmlRegRangePtr tmp;
tmp = range1;
range1 = range2;
range2 = tmp;
}
if ((range1->type == XML_REGEXP_ANYCHAR) ||
(range2->type == XML_REGEXP_ANYCHAR)) {
ret = 1;
} else if ((range1->type == XML_REGEXP_EPSILON) ||
(range2->type == XML_REGEXP_EPSILON)) {
return(0);
} else if (range1->type == range2->type) {
if (range1->type != XML_REGEXP_CHARVAL)
ret = 1;
else if ((range1->end < range2->start) ||
(range2->end < range1->start))
ret = 0;
else
ret = 1;
} else if (range1->type == XML_REGEXP_CHARVAL) {
int codepoint;
int neg = 0;
/*
* just check all codepoints in the range for acceptance,
* this is usually way cheaper since done only once at
* compilation than testing over and over at runtime or
* pushing too many states when evaluating.
*/
if (((range1->neg == 0) && (range2->neg != 0)) ||
((range1->neg != 0) && (range2->neg == 0)))
neg = 1;
for (codepoint = range1->start;codepoint <= range1->end ;codepoint++) {
ret = xmlRegCheckCharacterRange(range2->type, codepoint,
0, range2->start, range2->end,
range2->blockName);
if (ret < 0)
return(-1);
if (((neg == 1) && (ret == 0)) ||
((neg == 0) && (ret == 1)))
return(1);
}
return(0);
} else if ((range1->type == XML_REGEXP_BLOCK_NAME) ||
(range2->type == XML_REGEXP_BLOCK_NAME)) {
if (range1->type == range2->type) {
ret = xmlStrEqual(range1->blockName, range2->blockName);
} else {
/*
* comparing a block range with anything else is way
* too costly, and maintaining the table is like too much
* memory too, so let's force the automata to save state
* here.
*/
return(1);
}
} else if ((range1->type < XML_REGEXP_LETTER) ||
(range2->type < XML_REGEXP_LETTER)) {
if ((range1->type == XML_REGEXP_ANYSPACE) &&
(range2->type == XML_REGEXP_NOTSPACE))
ret = 0;
else if ((range1->type == XML_REGEXP_INITNAME) &&
(range2->type == XML_REGEXP_NOTINITNAME))
ret = 0;
else if ((range1->type == XML_REGEXP_NAMECHAR) &&
(range2->type == XML_REGEXP_NOTNAMECHAR))
ret = 0;
else if ((range1->type == XML_REGEXP_DECIMAL) &&
(range2->type == XML_REGEXP_NOTDECIMAL))
ret = 0;
else if ((range1->type == XML_REGEXP_REALCHAR) &&
(range2->type == XML_REGEXP_NOTREALCHAR))
ret = 0;
else {
/* same thing to limit complexity */
return(1);
}
} else {
ret = 0;
/* range1->type < range2->type here */
switch (range1->type) {
case XML_REGEXP_LETTER:
/* all disjoint except in the subgroups */
if ((range2->type == XML_REGEXP_LETTER_UPPERCASE) ||
(range2->type == XML_REGEXP_LETTER_LOWERCASE) ||
(range2->type == XML_REGEXP_LETTER_TITLECASE) ||
(range2->type == XML_REGEXP_LETTER_MODIFIER) ||
(range2->type == XML_REGEXP_LETTER_OTHERS))
ret = 1;
break;
case XML_REGEXP_MARK:
if ((range2->type == XML_REGEXP_MARK_NONSPACING) ||
(range2->type == XML_REGEXP_MARK_SPACECOMBINING) ||
(range2->type == XML_REGEXP_MARK_ENCLOSING))
ret = 1;
break;
case XML_REGEXP_NUMBER:
if ((range2->type == XML_REGEXP_NUMBER_DECIMAL) ||
(range2->type == XML_REGEXP_NUMBER_LETTER) ||
(range2->type == XML_REGEXP_NUMBER_OTHERS))
ret = 1;
break;
case XML_REGEXP_PUNCT:
if ((range2->type == XML_REGEXP_PUNCT_CONNECTOR) ||
(range2->type == XML_REGEXP_PUNCT_DASH) ||
(range2->type == XML_REGEXP_PUNCT_OPEN) ||
(range2->type == XML_REGEXP_PUNCT_CLOSE) ||
(range2->type == XML_REGEXP_PUNCT_INITQUOTE) ||
(range2->type == XML_REGEXP_PUNCT_FINQUOTE) ||
(range2->type == XML_REGEXP_PUNCT_OTHERS))
ret = 1;
break;
case XML_REGEXP_SEPAR:
if ((range2->type == XML_REGEXP_SEPAR_SPACE) ||
(range2->type == XML_REGEXP_SEPAR_LINE) ||
(range2->type == XML_REGEXP_SEPAR_PARA))
ret = 1;
break;
case XML_REGEXP_SYMBOL:
if ((range2->type == XML_REGEXP_SYMBOL_MATH) ||
(range2->type == XML_REGEXP_SYMBOL_CURRENCY) ||
(range2->type == XML_REGEXP_SYMBOL_MODIFIER) ||
(range2->type == XML_REGEXP_SYMBOL_OTHERS))
ret = 1;
break;
case XML_REGEXP_OTHER:
if ((range2->type == XML_REGEXP_OTHER_CONTROL) ||
(range2->type == XML_REGEXP_OTHER_FORMAT) ||
(range2->type == XML_REGEXP_OTHER_PRIVATE))
ret = 1;
break;
default:
if ((range2->type >= XML_REGEXP_LETTER) &&
(range2->type < XML_REGEXP_BLOCK_NAME))
ret = 0;
else {
/* safety net ! */
return(1);
}
}
}
if (((range1->neg == 0) && (range2->neg != 0)) ||
((range1->neg != 0) && (range2->neg == 0)))
ret = !ret;
return(ret);
}
/**
* xmlFACompareAtomTypes:
* @type1: an atom type
* @type2: an atom type
*
* Compares two atoms type to check whether they intersect in some ways,
* this is used by xmlFACompareAtoms only
*
* Returns 1 if they may intersect and 0 otherwise
*/
static int
xmlFACompareAtomTypes(xmlRegAtomType type1, xmlRegAtomType type2) {
if ((type1 == XML_REGEXP_EPSILON) ||
(type1 == XML_REGEXP_CHARVAL) ||
(type1 == XML_REGEXP_RANGES) ||
(type1 == XML_REGEXP_SUBREG) ||
(type1 == XML_REGEXP_STRING) ||
(type1 == XML_REGEXP_ANYCHAR))
return(1);
if ((type2 == XML_REGEXP_EPSILON) ||
(type2 == XML_REGEXP_CHARVAL) ||
(type2 == XML_REGEXP_RANGES) ||
(type2 == XML_REGEXP_SUBREG) ||
(type2 == XML_REGEXP_STRING) ||
(type2 == XML_REGEXP_ANYCHAR))
return(1);
if (type1 == type2) return(1);
/* simplify subsequent compares by making sure type1 < type2 */
if (type1 > type2) {
xmlRegAtomType tmp = type1;
type1 = type2;
type2 = tmp;
}
switch (type1) {
case XML_REGEXP_ANYSPACE: /* \s */
/* can't be a letter, number, mark, punctuation, symbol */
if ((type2 == XML_REGEXP_NOTSPACE) ||
((type2 >= XML_REGEXP_LETTER) &&
(type2 <= XML_REGEXP_LETTER_OTHERS)) ||
((type2 >= XML_REGEXP_NUMBER) &&
(type2 <= XML_REGEXP_NUMBER_OTHERS)) ||
((type2 >= XML_REGEXP_MARK) &&
(type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
((type2 >= XML_REGEXP_PUNCT) &&
(type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
((type2 >= XML_REGEXP_SYMBOL) &&
(type2 <= XML_REGEXP_SYMBOL_OTHERS))
) return(0);
break;
case XML_REGEXP_NOTSPACE: /* \S */
break;
case XML_REGEXP_INITNAME: /* \l */
/* can't be a number, mark, separator, punctuation, symbol or other */
if ((type2 == XML_REGEXP_NOTINITNAME) ||
((type2 >= XML_REGEXP_NUMBER) &&
(type2 <= XML_REGEXP_NUMBER_OTHERS)) ||
((type2 >= XML_REGEXP_MARK) &&
(type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
((type2 >= XML_REGEXP_SEPAR) &&
(type2 <= XML_REGEXP_SEPAR_PARA)) ||
((type2 >= XML_REGEXP_PUNCT) &&
(type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
((type2 >= XML_REGEXP_SYMBOL) &&
(type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
((type2 >= XML_REGEXP_OTHER) &&
(type2 <= XML_REGEXP_OTHER_NA))
) return(0);
break;
case XML_REGEXP_NOTINITNAME: /* \L */
break;
case XML_REGEXP_NAMECHAR: /* \c */
/* can't be a mark, separator, punctuation, symbol or other */
if ((type2 == XML_REGEXP_NOTNAMECHAR) ||
((type2 >= XML_REGEXP_MARK) &&
(type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
((type2 >= XML_REGEXP_PUNCT) &&
(type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
((type2 >= XML_REGEXP_SEPAR) &&
(type2 <= XML_REGEXP_SEPAR_PARA)) ||
((type2 >= XML_REGEXP_SYMBOL) &&
(type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
((type2 >= XML_REGEXP_OTHER) &&
(type2 <= XML_REGEXP_OTHER_NA))
) return(0);
break;
case XML_REGEXP_NOTNAMECHAR: /* \C */
break;
case XML_REGEXP_DECIMAL: /* \d */
/* can't be a letter, mark, separator, punctuation, symbol or other */
if ((type2 == XML_REGEXP_NOTDECIMAL) ||
(type2 == XML_REGEXP_REALCHAR) ||
((type2 >= XML_REGEXP_LETTER) &&
(type2 <= XML_REGEXP_LETTER_OTHERS)) ||
((type2 >= XML_REGEXP_MARK) &&
(type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
((type2 >= XML_REGEXP_PUNCT) &&
(type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
((type2 >= XML_REGEXP_SEPAR) &&
(type2 <= XML_REGEXP_SEPAR_PARA)) ||
((type2 >= XML_REGEXP_SYMBOL) &&
(type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
((type2 >= XML_REGEXP_OTHER) &&
(type2 <= XML_REGEXP_OTHER_NA))
)return(0);
break;
case XML_REGEXP_NOTDECIMAL: /* \D */
break;
case XML_REGEXP_REALCHAR: /* \w */
/* can't be a mark, separator, punctuation, symbol or other */
if ((type2 == XML_REGEXP_NOTDECIMAL) ||
((type2 >= XML_REGEXP_MARK) &&
(type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
((type2 >= XML_REGEXP_PUNCT) &&
(type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
((type2 >= XML_REGEXP_SEPAR) &&
(type2 <= XML_REGEXP_SEPAR_PARA)) ||
((type2 >= XML_REGEXP_SYMBOL) &&
(type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
((type2 >= XML_REGEXP_OTHER) &&
(type2 <= XML_REGEXP_OTHER_NA))
)return(0);
break;
case XML_REGEXP_NOTREALCHAR: /* \W */
break;
/*
* at that point we know both type 1 and type2 are from
* character categories are ordered and are different,
* it becomes simple because this is a partition
*/
case XML_REGEXP_LETTER:
if (type2 <= XML_REGEXP_LETTER_OTHERS)
return(1);
return(0);
case XML_REGEXP_LETTER_UPPERCASE:
case XML_REGEXP_LETTER_LOWERCASE:
case XML_REGEXP_LETTER_TITLECASE:
case XML_REGEXP_LETTER_MODIFIER:
case XML_REGEXP_LETTER_OTHERS:
return(0);
case XML_REGEXP_MARK:
if (type2 <= XML_REGEXP_MARK_ENCLOSING)
return(1);
return(0);
case XML_REGEXP_MARK_NONSPACING:
case XML_REGEXP_MARK_SPACECOMBINING:
case XML_REGEXP_MARK_ENCLOSING:
return(0);
case XML_REGEXP_NUMBER:
if (type2 <= XML_REGEXP_NUMBER_OTHERS)
return(1);
return(0);
case XML_REGEXP_NUMBER_DECIMAL:
case XML_REGEXP_NUMBER_LETTER:
case XML_REGEXP_NUMBER_OTHERS:
return(0);
case XML_REGEXP_PUNCT:
if (type2 <= XML_REGEXP_PUNCT_OTHERS)
return(1);
return(0);
case XML_REGEXP_PUNCT_CONNECTOR:
case XML_REGEXP_PUNCT_DASH:
case XML_REGEXP_PUNCT_OPEN:
case XML_REGEXP_PUNCT_CLOSE:
case XML_REGEXP_PUNCT_INITQUOTE:
case XML_REGEXP_PUNCT_FINQUOTE:
case XML_REGEXP_PUNCT_OTHERS:
return(0);
case XML_REGEXP_SEPAR:
if (type2 <= XML_REGEXP_SEPAR_PARA)
return(1);
return(0);
case XML_REGEXP_SEPAR_SPACE:
case XML_REGEXP_SEPAR_LINE:
case XML_REGEXP_SEPAR_PARA:
return(0);
case XML_REGEXP_SYMBOL:
if (type2 <= XML_REGEXP_SYMBOL_OTHERS)
return(1);
return(0);
case XML_REGEXP_SYMBOL_MATH:
case XML_REGEXP_SYMBOL_CURRENCY:
case XML_REGEXP_SYMBOL_MODIFIER:
case XML_REGEXP_SYMBOL_OTHERS:
return(0);
case XML_REGEXP_OTHER:
if (type2 <= XML_REGEXP_OTHER_NA)
return(1);
return(0);
case XML_REGEXP_OTHER_CONTROL:
case XML_REGEXP_OTHER_FORMAT:
case XML_REGEXP_OTHER_PRIVATE:
case XML_REGEXP_OTHER_NA:
return(0);
default:
break;
}
return(1);
}
/**
* xmlFAEqualAtoms:
* @atom1: an atom
* @atom2: an atom
* @deep: if not set only compare string pointers
*
* Compares two atoms to check whether they are the same exactly
* this is used to remove equivalent transitions
*
* Returns 1 if same and 0 otherwise
*/
static int
xmlFAEqualAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) {
int ret = 0;
if (atom1 == atom2)
return(1);
if ((atom1 == NULL) || (atom2 == NULL))
return(0);
if (atom1->type != atom2->type)
return(0);
switch (atom1->type) {
case XML_REGEXP_EPSILON:
ret = 0;
break;
case XML_REGEXP_STRING:
if (!deep)
ret = (atom1->valuep == atom2->valuep);
else
ret = xmlStrEqual((xmlChar *)atom1->valuep,
(xmlChar *)atom2->valuep);
break;
case XML_REGEXP_CHARVAL:
ret = (atom1->codepoint == atom2->codepoint);
break;
case XML_REGEXP_RANGES:
/* too hard to do in the general case */
ret = 0;
default:
break;
}
return(ret);
}
/**
* xmlFACompareAtoms:
* @atom1: an atom
* @atom2: an atom
* @deep: if not set only compare string pointers
*
* Compares two atoms to check whether they intersect in some ways,
* this is used by xmlFAComputesDeterminism and xmlFARecurseDeterminism only
*
* Returns 1 if yes and 0 otherwise
*/
static int
xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) {
int ret = 1;
if (atom1 == atom2)
return(1);
if ((atom1 == NULL) || (atom2 == NULL))
return(0);
if ((atom1->type == XML_REGEXP_ANYCHAR) ||
(atom2->type == XML_REGEXP_ANYCHAR))
return(1);
if (atom1->type > atom2->type) {
xmlRegAtomPtr tmp;
tmp = atom1;
atom1 = atom2;
atom2 = tmp;
}
if (atom1->type != atom2->type) {
ret = xmlFACompareAtomTypes(atom1->type, atom2->type);
/* if they can't intersect at the type level break now */
if (ret == 0)
return(0);
}
switch (atom1->type) {
case XML_REGEXP_STRING:
if (!deep)
ret = (atom1->valuep != atom2->valuep);
else {
xmlChar *val1 = (xmlChar *)atom1->valuep;
xmlChar *val2 = (xmlChar *)atom2->valuep;
int compound1 = (xmlStrchr(val1, '|') != NULL);
int compound2 = (xmlStrchr(val2, '|') != NULL);
/* Ignore negative match flag for ##other namespaces */
if (compound1 != compound2)
return(0);
ret = xmlRegStrEqualWildcard(val1, val2);
}
break;
case XML_REGEXP_EPSILON:
goto not_determinist;
case XML_REGEXP_CHARVAL:
if (atom2->type == XML_REGEXP_CHARVAL) {
ret = (atom1->codepoint == atom2->codepoint);
} else {
ret = xmlRegCheckCharacter(atom2, atom1->codepoint);
if (ret < 0)
ret = 1;
}
break;
case XML_REGEXP_RANGES:
if (atom2->type == XML_REGEXP_RANGES) {
int i, j, res;
xmlRegRangePtr r1, r2;
/*
* need to check that none of the ranges eventually matches
*/
for (i = 0;i < atom1->nbRanges;i++) {
for (j = 0;j < atom2->nbRanges;j++) {
r1 = atom1->ranges[i];
r2 = atom2->ranges[j];
res = xmlFACompareRanges(r1, r2);
if (res == 1) {
ret = 1;
goto done;
}
}
}
ret = 0;
}
break;
default:
goto not_determinist;
}
done:
if (atom1->neg != atom2->neg) {
ret = !ret;
}
if (ret == 0)
return(0);
not_determinist:
return(1);
}
/**
* xmlFARecurseDeterminism:
* @ctxt: a regexp parser context
*
* Check whether the associated regexp is determinist,
* should be called after xmlFAEliminateEpsilonTransitions()
*
*/
static int
xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
int to, xmlRegAtomPtr atom) {
int ret = 1;
int res;
int transnr, nbTrans;
xmlRegTransPtr t1;
int deep = 1;
if (state == NULL)
return(ret);
if (state->markd == XML_REGEXP_MARK_VISITED)
return(ret);
if (ctxt->flags & AM_AUTOMATA_RNG)
deep = 0;
/*
* don't recurse on transitions potentially added in the course of
* the elimination.
*/
nbTrans = state->nbTrans;
for (transnr = 0;transnr < nbTrans;transnr++) {
t1 = &(state->trans[transnr]);
/*
* check transitions conflicting with the one looked at
*/
if (t1->atom == NULL) {
if (t1->to < 0)
continue;
state->markd = XML_REGEXP_MARK_VISITED;
res = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
to, atom);
if (res == 0) {
ret = 0;
/* t1->nd = 1; */
}
continue;
}
if (t1->to != to)
continue;
if (xmlFACompareAtoms(t1->atom, atom, deep)) {
ret = 0;
/* mark the transition as non-deterministic */
t1->nd = 1;
}
}
return(ret);
}
/**
* xmlFAFinishRecurseDeterminism:
* @ctxt: a regexp parser context
*
* Reset flags after checking determinism.
*/
static void
xmlFAFinishRecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) {
int transnr, nbTrans;
if (state == NULL)
return;
if (state->markd != XML_REGEXP_MARK_VISITED)
return;
state->markd = 0;
nbTrans = state->nbTrans;
for (transnr = 0; transnr < nbTrans; transnr++) {
xmlRegTransPtr t1 = &state->trans[transnr];
if ((t1->atom == NULL) && (t1->to >= 0))
xmlFAFinishRecurseDeterminism(ctxt, ctxt->states[t1->to]);
}
}
/**
* xmlFAComputesDeterminism:
* @ctxt: a regexp parser context
*
* Check whether the associated regexp is determinist,
* should be called after xmlFAEliminateEpsilonTransitions()
*
*/
static int
xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) {
int statenr, transnr;
xmlRegStatePtr state;
xmlRegTransPtr t1, t2, last;
int i;
int ret = 1;
int deep = 1;
#ifdef DEBUG_REGEXP_GRAPH
printf("xmlFAComputesDeterminism\n");
xmlRegPrintCtxt(stdout, ctxt);
#endif
if (ctxt->determinist != -1)
return(ctxt->determinist);
if (ctxt->flags & AM_AUTOMATA_RNG)
deep = 0;
/*
* First cleanup the automata removing cancelled transitions
*/
for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
state = ctxt->states[statenr];
if (state == NULL)
continue;
if (state->nbTrans < 2)
continue;
for (transnr = 0;transnr < state->nbTrans;transnr++) {
t1 = &(state->trans[transnr]);
/*
* Determinism checks in case of counted or all transitions
* will have to be handled separately
*/
if (t1->atom == NULL) {
/* t1->nd = 1; */
continue;
}
if (t1->to == -1) /* eliminated */
continue;
for (i = 0;i < transnr;i++) {
t2 = &(state->trans[i]);
if (t2->to == -1) /* eliminated */
continue;
if (t2->atom != NULL) {
if (t1->to == t2->to) {
/*
* Here we use deep because we want to keep the
* transitions which indicate a conflict
*/
if (xmlFAEqualAtoms(t1->atom, t2->atom, deep) &&
(t1->counter == t2->counter) &&
(t1->count == t2->count))
t2->to = -1; /* eliminated */
}
}
}
}
}
/*
* Check for all states that there aren't 2 transitions
* with the same atom and a different target.
*/
for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
state = ctxt->states[statenr];
if (state == NULL)
continue;
if (state->nbTrans < 2)
continue;
last = NULL;
for (transnr = 0;transnr < state->nbTrans;transnr++) {
t1 = &(state->trans[transnr]);
/*
* Determinism checks in case of counted or all transitions
* will have to be handled separately
*/
if (t1->atom == NULL) {
continue;
}
if (t1->to == -1) /* eliminated */
continue;
for (i = 0;i < transnr;i++) {
t2 = &(state->trans[i]);
if (t2->to == -1) /* eliminated */
continue;
if (t2->atom != NULL) {
/*
* But here we don't use deep because we want to
* find transitions which indicate a conflict
*/
if (xmlFACompareAtoms(t1->atom, t2->atom, 1)) {
ret = 0;
/* mark the transitions as non-deterministic ones */
t1->nd = 1;
t2->nd = 1;
last = t1;
}
} else if (t1->to != -1) {
/*
* do the closure in case of remaining specific
* epsilon transitions like choices or all
*/
ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
t2->to, t2->atom);
xmlFAFinishRecurseDeterminism(ctxt, ctxt->states[t1->to]);
/* don't shortcut the computation so all non deterministic
transition get marked down
if (ret == 0)
return(0);
*/
if (ret == 0) {
t1->nd = 1;
/* t2->nd = 1; */
last = t1;
}
}
}
/* don't shortcut the computation so all non deterministic
transition get marked down
if (ret == 0)
break; */
}
/*
* mark specifically the last non-deterministic transition
* from a state since there is no need to set-up rollback
* from it
*/
if (last != NULL) {
last->nd = 2;
}
/* don't shortcut the computation so all non deterministic
transition get marked down
if (ret == 0)
break; */
}
ctxt->determinist = ret;
return(ret);
}
/************************************************************************
* *
* Routines to check input against transition atoms *
* *
************************************************************************/
static int
xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg,
int start, int end, const xmlChar *blockName) {
int ret = 0;
switch (type) {
case XML_REGEXP_STRING:
case XML_REGEXP_SUBREG:
case XML_REGEXP_RANGES:
case XML_REGEXP_EPSILON:
return(-1);
case XML_REGEXP_ANYCHAR:
ret = ((codepoint != '\n') && (codepoint != '\r'));
break;
case XML_REGEXP_CHARVAL:
ret = ((codepoint >= start) && (codepoint <= end));
break;
case XML_REGEXP_NOTSPACE:
neg = !neg;
/* Falls through. */
case XML_REGEXP_ANYSPACE:
ret = ((codepoint == '\n') || (codepoint == '\r') ||
(codepoint == '\t') || (codepoint == ' '));
break;
case XML_REGEXP_NOTINITNAME:
neg = !neg;
/* Falls through. */
case XML_REGEXP_INITNAME:
ret = (IS_LETTER(codepoint) ||
(codepoint == '_') || (codepoint == ':'));
break;
case XML_REGEXP_NOTNAMECHAR:
neg = !neg;
/* Falls through. */
case XML_REGEXP_NAMECHAR:
ret = (IS_LETTER(codepoint) || IS_DIGIT(codepoint) ||
(codepoint == '.') || (codepoint == '-') ||
(codepoint == '_') || (codepoint == ':') ||
IS_COMBINING(codepoint) || IS_EXTENDER(codepoint));
break;
case XML_REGEXP_NOTDECIMAL:
neg = !neg;
/* Falls through. */
case XML_REGEXP_DECIMAL:
ret = xmlUCSIsCatNd(codepoint);
break;
case XML_REGEXP_REALCHAR:
neg = !neg;
/* Falls through. */
case XML_REGEXP_NOTREALCHAR:
ret = xmlUCSIsCatP(codepoint);
if (ret == 0)
ret = xmlUCSIsCatZ(codepoint);
if (ret == 0)
ret = xmlUCSIsCatC(codepoint);
break;
case XML_REGEXP_LETTER:
ret = xmlUCSIsCatL(codepoint);
break;
case XML_REGEXP_LETTER_UPPERCASE:
ret = xmlUCSIsCatLu(codepoint);
break;
case XML_REGEXP_LETTER_LOWERCASE:
ret = xmlUCSIsCatLl(codepoint);
break;
case XML_REGEXP_LETTER_TITLECASE:
ret = xmlUCSIsCatLt(codepoint);
break;
case XML_REGEXP_LETTER_MODIFIER:
ret = xmlUCSIsCatLm(codepoint);
break;
case XML_REGEXP_LETTER_OTHERS:
ret = xmlUCSIsCatLo(codepoint);
break;
case XML_REGEXP_MARK:
ret = xmlUCSIsCatM(codepoint);
break;
case XML_REGEXP_MARK_NONSPACING:
ret = xmlUCSIsCatMn(codepoint);
break;
case XML_REGEXP_MARK_SPACECOMBINING:
ret = xmlUCSIsCatMc(codepoint);
break;
case XML_REGEXP_MARK_ENCLOSING:
ret = xmlUCSIsCatMe(codepoint);
break;
case XML_REGEXP_NUMBER:
ret = xmlUCSIsCatN(codepoint);
break;
case XML_REGEXP_NUMBER_DECIMAL:
ret = xmlUCSIsCatNd(codepoint);
break;
case XML_REGEXP_NUMBER_LETTER:
ret = xmlUCSIsCatNl(codepoint);
break;
case XML_REGEXP_NUMBER_OTHERS:
ret = xmlUCSIsCatNo(codepoint);
break;
case XML_REGEXP_PUNCT:
ret = xmlUCSIsCatP(codepoint);
break;
case XML_REGEXP_PUNCT_CONNECTOR:
ret = xmlUCSIsCatPc(codepoint);
break;
case XML_REGEXP_PUNCT_DASH:
ret = xmlUCSIsCatPd(codepoint);
break;
case XML_REGEXP_PUNCT_OPEN:
ret = xmlUCSIsCatPs(codepoint);
break;
case XML_REGEXP_PUNCT_CLOSE:
ret = xmlUCSIsCatPe(codepoint);
break;
case XML_REGEXP_PUNCT_INITQUOTE:
ret = xmlUCSIsCatPi(codepoint);
break;
case XML_REGEXP_PUNCT_FINQUOTE:
ret = xmlUCSIsCatPf(codepoint);
break;
case XML_REGEXP_PUNCT_OTHERS:
ret = xmlUCSIsCatPo(codepoint);
break;
case XML_REGEXP_SEPAR:
ret = xmlUCSIsCatZ(codepoint);
break;
case XML_REGEXP_SEPAR_SPACE:
ret = xmlUCSIsCatZs(codepoint);
break;
case XML_REGEXP_SEPAR_LINE:
ret = xmlUCSIsCatZl(codepoint);
break;
case XML_REGEXP_SEPAR_PARA:
ret = xmlUCSIsCatZp(codepoint);
break;
case XML_REGEXP_SYMBOL:
ret = xmlUCSIsCatS(codepoint);
break;
case XML_REGEXP_SYMBOL_MATH:
ret = xmlUCSIsCatSm(codepoint);
break;
case XML_REGEXP_SYMBOL_CURRENCY:
ret = xmlUCSIsCatSc(codepoint);
break;
case XML_REGEXP_SYMBOL_MODIFIER:
ret = xmlUCSIsCatSk(codepoint);
break;
case XML_REGEXP_SYMBOL_OTHERS:
ret = xmlUCSIsCatSo(codepoint);
break;
case XML_REGEXP_OTHER:
ret = xmlUCSIsCatC(codepoint);
break;
case XML_REGEXP_OTHER_CONTROL:
ret = xmlUCSIsCatCc(codepoint);
break;
case XML_REGEXP_OTHER_FORMAT:
ret = xmlUCSIsCatCf(codepoint);
break;
case XML_REGEXP_OTHER_PRIVATE:
ret = xmlUCSIsCatCo(codepoint);
break;
case XML_REGEXP_OTHER_NA:
/* ret = xmlUCSIsCatCn(codepoint); */
/* Seems it doesn't exist anymore in recent Unicode releases */
ret = 0;
break;
case XML_REGEXP_BLOCK_NAME:
ret = xmlUCSIsBlock(codepoint, (const char *) blockName);
break;
}
if (neg)
return(!ret);
return(ret);
}
static int
xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) {
int i, ret = 0;
xmlRegRangePtr range;
if ((atom == NULL) || (!IS_CHAR(codepoint)))
return(-1);
switch (atom->type) {
case XML_REGEXP_SUBREG:
case XML_REGEXP_EPSILON:
return(-1);
case XML_REGEXP_CHARVAL:
return(codepoint == atom->codepoint);
case XML_REGEXP_RANGES: {
int accept = 0;
for (i = 0;i < atom->nbRanges;i++) {
range = atom->ranges[i];
if (range->neg == 2) {
ret = xmlRegCheckCharacterRange(range->type, codepoint,
0, range->start, range->end,
range->blockName);
if (ret != 0)
return(0); /* excluded char */
} else if (range->neg) {
ret = xmlRegCheckCharacterRange(range->type, codepoint,
0, range->start, range->end,
range->blockName);
if (ret == 0)
accept = 1;
else
return(0);
} else {
ret = xmlRegCheckCharacterRange(range->type, codepoint,
0, range->start, range->end,
range->blockName);
if (ret != 0)
accept = 1; /* might still be excluded */
}
}
return(accept);
}
case XML_REGEXP_STRING:
printf("TODO: XML_REGEXP_STRING\n");
return(-1);
case XML_REGEXP_ANYCHAR:
case XML_REGEXP_ANYSPACE:
case XML_REGEXP_NOTSPACE:
case XML_REGEXP_INITNAME:
case XML_REGEXP_NOTINITNAME:
case XML_REGEXP_NAMECHAR:
case XML_REGEXP_NOTNAMECHAR:
case XML_REGEXP_DECIMAL:
case XML_REGEXP_NOTDECIMAL:
case XML_REGEXP_REALCHAR:
case XML_REGEXP_NOTREALCHAR:
case XML_REGEXP_LETTER:
case XML_REGEXP_LETTER_UPPERCASE:
case XML_REGEXP_LETTER_LOWERCASE:
case XML_REGEXP_LETTER_TITLECASE:
case XML_REGEXP_LETTER_MODIFIER:
case XML_REGEXP_LETTER_OTHERS:
case XML_REGEXP_MARK:
case XML_REGEXP_MARK_NONSPACING:
case XML_REGEXP_MARK_SPACECOMBINING:
case XML_REGEXP_MARK_ENCLOSING:
case XML_REGEXP_NUMBER:
case XML_REGEXP_NUMBER_DECIMAL:
case XML_REGEXP_NUMBER_LETTER:
case XML_REGEXP_NUMBER_OTHERS:
case XML_REGEXP_PUNCT:
case XML_REGEXP_PUNCT_CONNECTOR:
case XML_REGEXP_PUNCT_DASH:
case XML_REGEXP_PUNCT_OPEN:
case XML_REGEXP_PUNCT_CLOSE:
case XML_REGEXP_PUNCT_INITQUOTE:
case XML_REGEXP_PUNCT_FINQUOTE:
case XML_REGEXP_PUNCT_OTHERS:
case XML_REGEXP_SEPAR:
case XML_REGEXP_SEPAR_SPACE:
case XML_REGEXP_SEPAR_LINE:
case XML_REGEXP_SEPAR_PARA:
case XML_REGEXP_SYMBOL:
case XML_REGEXP_SYMBOL_MATH:
case XML_REGEXP_SYMBOL_CURRENCY:
case XML_REGEXP_SYMBOL_MODIFIER:
case XML_REGEXP_SYMBOL_OTHERS:
case XML_REGEXP_OTHER:
case XML_REGEXP_OTHER_CONTROL:
case XML_REGEXP_OTHER_FORMAT:
case XML_REGEXP_OTHER_PRIVATE:
case XML_REGEXP_OTHER_NA:
case XML_REGEXP_BLOCK_NAME:
ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0,
(const xmlChar *)atom->valuep);
if (atom->neg)
ret = !ret;
break;
}
return(ret);
}
/************************************************************************
* *
* Saving and restoring state of an execution context *
* *
************************************************************************/
#ifdef DEBUG_REGEXP_EXEC
static void
xmlFARegDebugExec(xmlRegExecCtxtPtr exec) {
printf("state: %d:%d:idx %d", exec->state->no, exec->transno, exec->index);
if (exec->inputStack != NULL) {
int i;
printf(": ");
for (i = 0;(i < 3) && (i < exec->inputStackNr);i++)
printf("%s ", (const char *)
exec->inputStack[exec->inputStackNr - (i + 1)].value);
} else {
printf(": %s", &(exec->inputString[exec->index]));
}
printf("\n");
}
#endif
static void
xmlFARegExecSave(xmlRegExecCtxtPtr exec) {
#ifdef DEBUG_REGEXP_EXEC
printf("saving ");
exec->transno++;
xmlFARegDebugExec(exec);
exec->transno--;
#endif
#ifdef MAX_PUSH
if (exec->nbPush > MAX_PUSH) {
return;
}
exec->nbPush++;
#endif
if (exec->maxRollbacks == 0) {
exec->maxRollbacks = 4;
exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks *
sizeof(xmlRegExecRollback));
if (exec->rollbacks == NULL) {
xmlRegexpErrMemory(NULL, "saving regexp");
exec->maxRollbacks = 0;
return;
}
memset(exec->rollbacks, 0,
exec->maxRollbacks * sizeof(xmlRegExecRollback));
} else if (exec->nbRollbacks >= exec->maxRollbacks) {
xmlRegExecRollback *tmp;
int len = exec->maxRollbacks;
exec->maxRollbacks *= 2;
tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks,
exec->maxRollbacks * sizeof(xmlRegExecRollback));
if (tmp == NULL) {
xmlRegexpErrMemory(NULL, "saving regexp");
exec->maxRollbacks /= 2;
return;
}
exec->rollbacks = tmp;
tmp = &exec->rollbacks[len];
memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback));
}
exec->rollbacks[exec->nbRollbacks].state = exec->state;
exec->rollbacks[exec->nbRollbacks].index = exec->index;
exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1;
if (exec->comp->nbCounters > 0) {
if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
exec->rollbacks[exec->nbRollbacks].counts = (int *)
xmlMalloc(exec->comp->nbCounters * sizeof(int));
if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
xmlRegexpErrMemory(NULL, "saving regexp");
exec->status = -5;
return;
}
}
memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts,
exec->comp->nbCounters * sizeof(int));
}
exec->nbRollbacks++;
}
static void
xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) {
if (exec->nbRollbacks <= 0) {
exec->status = -1;
#ifdef DEBUG_REGEXP_EXEC
printf("rollback failed on empty stack\n");
#endif
return;
}
exec->nbRollbacks--;
exec->state = exec->rollbacks[exec->nbRollbacks].state;
exec->index = exec->rollbacks[exec->nbRollbacks].index;
exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch;
if (exec->comp->nbCounters > 0) {
if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
fprintf(stderr, "exec save: allocation failed");
exec->status = -6;
return;
}
if (exec->counts) {
memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts,
exec->comp->nbCounters * sizeof(int));
}
}
#ifdef DEBUG_REGEXP_EXEC
printf("restored ");
xmlFARegDebugExec(exec);
#endif
}
/************************************************************************
* *
* Verifier, running an input against a compiled regexp *
* *
************************************************************************/
static int
xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
xmlRegExecCtxt execval;
xmlRegExecCtxtPtr exec = &execval;
int ret, codepoint = 0, len, deter;
exec->inputString = content;
exec->index = 0;
exec->nbPush = 0;
exec->determinist = 1;
exec->maxRollbacks = 0;
exec->nbRollbacks = 0;
exec->rollbacks = NULL;
exec->status = 0;
exec->comp = comp;
exec->state = comp->states[0];
exec->transno = 0;
exec->transcount = 0;
exec->inputStack = NULL;
exec->inputStackMax = 0;
if (comp->nbCounters > 0) {
exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int));
if (exec->counts == NULL) {
xmlRegexpErrMemory(NULL, "running regexp");
return(-1);
}
memset(exec->counts, 0, comp->nbCounters * sizeof(int));
} else
exec->counts = NULL;
while ((exec->status == 0) && (exec->state != NULL) &&
((exec->inputString[exec->index] != 0) ||
((exec->state != NULL) &&
(exec->state->type != XML_REGEXP_FINAL_STATE)))) {
xmlRegTransPtr trans;
xmlRegAtomPtr atom;
/*
* If end of input on non-terminal state, rollback, however we may
* still have epsilon like transition for counted transitions
* on counters, in that case don't break too early. Additionally,
* if we are working on a range like "AB{0,2}", where B is not present,
* we don't want to break.
*/
len = 1;
if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) {
/*
* if there is a transition, we must check if
* atom allows minOccurs of 0
*/
if (exec->transno < exec->state->nbTrans) {
trans = &exec->state->trans[exec->transno];
if (trans->to >=0) {
atom = trans->atom;
if (!((atom->min == 0) && (atom->max > 0)))
goto rollback;
}
} else
goto rollback;
}
exec->transcount = 0;
for (;exec->transno < exec->state->nbTrans;exec->transno++) {
trans = &exec->state->trans[exec->transno];
if (trans->to < 0)
continue;
atom = trans->atom;
ret = 0;
deter = 1;
if (trans->count >= 0) {
int count;
xmlRegCounterPtr counter;
if (exec->counts == NULL) {
exec->status = -1;
goto error;
}
/*
* A counted transition.
*/
count = exec->counts[trans->count];
counter = &exec->comp->counters[trans->count];
#ifdef DEBUG_REGEXP_EXEC
printf("testing count %d: val %d, min %d, max %d\n",
trans->count, count, counter->min, counter->max);
#endif
ret = ((count >= counter->min) && (count <= counter->max));
if ((ret) && (counter->min != counter->max))
deter = 0;
} else if (atom == NULL) {
fprintf(stderr, "epsilon transition left at runtime\n");
exec->status = -2;
break;
} else if (exec->inputString[exec->index] != 0) {
codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
ret = xmlRegCheckCharacter(atom, codepoint);
if ((ret == 1) && (atom->min >= 0) && (atom->max > 0)) {
xmlRegStatePtr to = comp->states[trans->to];
/*
* this is a multiple input sequence
* If there is a counter associated increment it now.
* before potentially saving and rollback
* do not increment if the counter is already over the
* maximum limit in which case get to next transition
*/
if (trans->counter >= 0) {
xmlRegCounterPtr counter;
if ((exec->counts == NULL) ||
(exec->comp == NULL) ||
(exec->comp->counters == NULL)) {
exec->status = -1;
goto error;
}
counter = &exec->comp->counters[trans->counter];
if (exec->counts[trans->counter] >= counter->max)
continue; /* for loop on transitions */
#ifdef DEBUG_REGEXP_EXEC
printf("Increasing count %d\n", trans->counter);
#endif
exec->counts[trans->counter]++;
}
if (exec->state->nbTrans > exec->transno + 1) {
xmlFARegExecSave(exec);
}
exec->transcount = 1;
do {
/*
* Try to progress as much as possible on the input
*/
if (exec->transcount == atom->max) {
break;
}
exec->index += len;
/*
* End of input: stop here
*/
if (exec->inputString[exec->index] == 0) {
exec->index -= len;
break;
}
if (exec->transcount >= atom->min) {
int transno = exec->transno;
xmlRegStatePtr state = exec->state;
/*
* The transition is acceptable save it
*/
exec->transno = -1; /* trick */
exec->state = to;
xmlFARegExecSave(exec);
exec->transno = transno;
exec->state = state;
}
codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
len);
ret = xmlRegCheckCharacter(atom, codepoint);
exec->transcount++;
} while (ret == 1);
if (exec->transcount < atom->min)
ret = 0;
/*
* If the last check failed but one transition was found
* possible, rollback
*/
if (ret < 0)
ret = 0;
if (ret == 0) {
goto rollback;
}
if (trans->counter >= 0) {
if (exec->counts == NULL) {
exec->status = -1;
goto error;
}
#ifdef DEBUG_REGEXP_EXEC
printf("Decreasing count %d\n", trans->counter);
#endif
exec->counts[trans->counter]--;
}
} else if ((ret == 0) && (atom->min == 0) && (atom->max > 0)) {
/*
* we don't match on the codepoint, but minOccurs of 0
* says that's ok. Setting len to 0 inhibits stepping
* over the codepoint.
*/
exec->transcount = 1;
len = 0;
ret = 1;
}
} else if ((atom->min == 0) && (atom->max > 0)) {
/* another spot to match when minOccurs is 0 */
exec->transcount = 1;
len = 0;
ret = 1;
}
if (ret == 1) {
if ((trans->nd == 1) ||
((trans->count >= 0) && (deter == 0) &&
(exec->state->nbTrans > exec->transno + 1))) {
#ifdef DEBUG_REGEXP_EXEC
if (trans->nd == 1)
printf("Saving on nd transition atom %d for %c at %d\n",
trans->atom->no, codepoint, exec->index);
else
printf("Saving on counted transition count %d for %c at %d\n",
trans->count, codepoint, exec->index);
#endif
xmlFARegExecSave(exec);
}
if (trans->counter >= 0) {
xmlRegCounterPtr counter;
/* make sure we don't go over the counter maximum value */
if ((exec->counts == NULL) ||
(exec->comp == NULL) ||
(exec->comp->counters == NULL)) {
exec->status = -1;
goto error;
}
counter = &exec->comp->counters[trans->counter];
if (exec->counts[trans->counter] >= counter->max)
continue; /* for loop on transitions */
#ifdef DEBUG_REGEXP_EXEC
printf("Increasing count %d\n", trans->counter);
#endif
exec->counts[trans->counter]++;
}
if ((trans->count >= 0) &&
(trans->count < REGEXP_ALL_COUNTER)) {
if (exec->counts == NULL) {
exec->status = -1;
goto error;
}
#ifdef DEBUG_REGEXP_EXEC
printf("resetting count %d on transition\n",
trans->count);
#endif
exec->counts[trans->count] = 0;
}
#ifdef DEBUG_REGEXP_EXEC
printf("entering state %d\n", trans->to);
#endif
exec->state = comp->states[trans->to];
exec->transno = 0;
if (trans->atom != NULL) {
exec->index += len;
}
goto progress;
} else if (ret < 0) {
exec->status = -4;
break;
}
}
if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
rollback:
/*
* Failed to find a way out
*/
exec->determinist = 0;
#ifdef DEBUG_REGEXP_EXEC
printf("rollback from state %d on %d:%c\n", exec->state->no,
codepoint,codepoint);
#endif
xmlFARegExecRollBack(exec);
}
progress:
continue;
}
error:
if (exec->rollbacks != NULL) {
if (exec->counts != NULL) {
int i;
for (i = 0;i < exec->maxRollbacks;i++)
if (exec->rollbacks[i].counts != NULL)
xmlFree(exec->rollbacks[i].counts);
}
xmlFree(exec->rollbacks);
}
if (exec->state == NULL)
return(-1);
if (exec->counts != NULL)
xmlFree(exec->counts);
if (exec->status == 0)
return(1);
if (exec->status == -1) {
if (exec->nbPush > MAX_PUSH)
return(-1);
return(0);
}
return(exec->status);
}
/************************************************************************
* *
* Progressive interface to the verifier one atom at a time *
* *
************************************************************************/
#ifdef DEBUG_ERR
static void testerr(xmlRegExecCtxtPtr exec);
#endif
/**
* xmlRegNewExecCtxt:
* @comp: a precompiled regular expression
* @callback: a callback function used for handling progresses in the
* automata matching phase
* @data: the context data associated to the callback in this context
*
* Build a context used for progressive evaluation of a regexp.
*
* Returns the new context
*/
xmlRegExecCtxtPtr
xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) {
xmlRegExecCtxtPtr exec;
if (comp == NULL)
return(NULL);
if ((comp->compact == NULL) && (comp->states == NULL))
return(NULL);
exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt));
if (exec == NULL) {
xmlRegexpErrMemory(NULL, "creating execution context");
return(NULL);
}
memset(exec, 0, sizeof(xmlRegExecCtxt));
exec->inputString = NULL;
exec->index = 0;
exec->determinist = 1;
exec->maxRollbacks = 0;
exec->nbRollbacks = 0;
exec->rollbacks = NULL;
exec->status = 0;
exec->comp = comp;
if (comp->compact == NULL)
exec->state = comp->states[0];
exec->transno = 0;
exec->transcount = 0;
exec->callback = callback;
exec->data = data;
if (comp->nbCounters > 0) {
/*
* For error handling, exec->counts is allocated twice the size
* the second half is used to store the data in case of rollback
*/
exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)
* 2);
if (exec->counts == NULL) {
xmlRegexpErrMemory(NULL, "creating execution context");
xmlFree(exec);
return(NULL);
}
memset(exec->counts, 0, comp->nbCounters * sizeof(int) * 2);
exec->errCounts = &exec->counts[comp->nbCounters];
} else {
exec->counts = NULL;
exec->errCounts = NULL;
}
exec->inputStackMax = 0;
exec->inputStackNr = 0;
exec->inputStack = NULL;
exec->errStateNo = -1;
exec->errString = NULL;
exec->nbPush = 0;
return(exec);
}
/**
* xmlRegFreeExecCtxt:
* @exec: a regular expression evaluation context
*
* Free the structures associated to a regular expression evaluation context.
*/
void
xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) {
if (exec == NULL)
return;
if (exec->rollbacks != NULL) {
if (exec->counts != NULL) {
int i;
for (i = 0;i < exec->maxRollbacks;i++)
if (exec->rollbacks[i].counts != NULL)
xmlFree(exec->rollbacks[i].counts);
}
xmlFree(exec->rollbacks);
}
if (exec->counts != NULL)
xmlFree(exec->counts);
if (exec->inputStack != NULL) {
int i;
for (i = 0;i < exec->inputStackNr;i++) {
if (exec->inputStack[i].value != NULL)
xmlFree(exec->inputStack[i].value);
}
xmlFree(exec->inputStack);
}
if (exec->errString != NULL)
xmlFree(exec->errString);
xmlFree(exec);
}
static void
xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value,
void *data) {
#ifdef DEBUG_PUSH
printf("saving value: %d:%s\n", exec->inputStackNr, value);
#endif
if (exec->inputStackMax == 0) {
exec->inputStackMax = 4;
exec->inputStack = (xmlRegInputTokenPtr)
xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken));
if (exec->inputStack == NULL) {
xmlRegexpErrMemory(NULL, "pushing input string");
exec->inputStackMax = 0;
return;
}
} else if (exec->inputStackNr + 1 >= exec->inputStackMax) {
xmlRegInputTokenPtr tmp;
exec->inputStackMax *= 2;
tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack,
exec->inputStackMax * sizeof(xmlRegInputToken));
if (tmp == NULL) {
xmlRegexpErrMemory(NULL, "pushing input string");
exec->inputStackMax /= 2;
return;
}
exec->inputStack = tmp;
}
exec->inputStack[exec->inputStackNr].value = xmlStrdup(value);
exec->inputStack[exec->inputStackNr].data = data;
exec->inputStackNr++;
exec->inputStack[exec->inputStackNr].value = NULL;
exec->inputStack[exec->inputStackNr].data = NULL;
}
/**
* xmlRegStrEqualWildcard:
* @expStr: the string to be evaluated
* @valStr: the validation string
*
* Checks if both strings are equal or have the same content. "*"
* can be used as a wildcard in @valStr; "|" is used as a separator of
* substrings in both @expStr and @valStr.
*
* Returns 1 if the comparison is satisfied and the number of substrings
* is equal, 0 otherwise.
*/
static int
xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr) {
if (expStr == valStr) return(1);
if (expStr == NULL) return(0);
if (valStr == NULL) return(0);
do {
/*
* Eval if we have a wildcard for the current item.
*/
if (*expStr != *valStr) {
/* if one of them starts with a wildcard make valStr be it */
if (*valStr == '*') {
const xmlChar *tmp;
tmp = valStr;
valStr = expStr;
expStr = tmp;
}
if ((*valStr != 0) && (*expStr != 0) && (*expStr++ == '*')) {
do {
if (*valStr == XML_REG_STRING_SEPARATOR)
break;
valStr++;
} while (*valStr != 0);
continue;
} else
return(0);
}
expStr++;
valStr++;
} while (*valStr != 0);
if (*expStr != 0)
return (0);
else
return (1);
}
/**
* xmlRegCompactPushString:
* @exec: a regexp execution context
* @comp: the precompiled exec with a compact table
* @value: a string token input
* @data: data associated to the token to reuse in callbacks
*
* Push one input token in the execution context
*
* Returns: 1 if the regexp reached a final state, 0 if non-final, and
* a negative value in case of error.
*/
static int
xmlRegCompactPushString(xmlRegExecCtxtPtr exec,
xmlRegexpPtr comp,
const xmlChar *value,
void *data) {
int state = exec->index;
int i, target;
if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL))
return(-1);
if (value == NULL) {
/*
* are we at a final state ?
*/
if (comp->compact[state * (comp->nbstrings + 1)] ==
XML_REGEXP_FINAL_STATE)
return(1);
return(0);
}
#ifdef DEBUG_PUSH
printf("value pushed: %s\n", value);
#endif
/*
* Examine all outside transitions from current state
*/
for (i = 0;i < comp->nbstrings;i++) {
target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
if ((target > 0) && (target <= comp->nbstates)) {
target--; /* to avoid 0 */
if (xmlRegStrEqualWildcard(comp->stringMap[i], value)) {
exec->index = target;
if ((exec->callback != NULL) && (comp->transdata != NULL)) {
exec->callback(exec->data, value,
comp->transdata[state * comp->nbstrings + i], data);
}
#ifdef DEBUG_PUSH
printf("entering state %d\n", target);
#endif
if (comp->compact[target * (comp->nbstrings + 1)] ==
XML_REGEXP_SINK_STATE)
goto error;
if (comp->compact[target * (comp->nbstrings + 1)] ==
XML_REGEXP_FINAL_STATE)
return(1);
return(0);
}
}
}
/*
* Failed to find an exit transition out from current state for the
* current token
*/
#ifdef DEBUG_PUSH
printf("failed to find a transition for %s on state %d\n", value, state);
#endif
error:
if (exec->errString != NULL)
xmlFree(exec->errString);
exec->errString = xmlStrdup(value);
exec->errStateNo = state;
exec->status = -1;
#ifdef DEBUG_ERR
testerr(exec);
#endif
return(-1);
}
/**
* xmlRegExecPushStringInternal:
* @exec: a regexp execution context or NULL to indicate the end
* @value: a string token input
* @data: data associated to the token to reuse in callbacks
* @compound: value was assembled from 2 strings
*
* Push one input token in the execution context
*
* Returns: 1 if the regexp reached a final state, 0 if non-final, and
* a negative value in case of error.
*/
static int
xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec, const xmlChar *value,
void *data, int compound) {
xmlRegTransPtr trans;
xmlRegAtomPtr atom;
int ret;
int final = 0;
int progress = 1;
if (exec == NULL)
return(-1);
if (exec->comp == NULL)
return(-1);
if (exec->status != 0)
return(exec->status);
if (exec->comp->compact != NULL)
return(xmlRegCompactPushString(exec, exec->comp, value, data));
if (value == NULL) {
if (exec->state->type == XML_REGEXP_FINAL_STATE)
return(1);
final = 1;
}
#ifdef DEBUG_PUSH
printf("value pushed: %s\n", value);
#endif
/*
* If we have an active rollback stack push the new value there
* and get back to where we were left
*/
if ((value != NULL) && (exec->inputStackNr > 0)) {
xmlFARegExecSaveInputString(exec, value, data);
value = exec->inputStack[exec->index].value;
data = exec->inputStack[exec->index].data;
#ifdef DEBUG_PUSH
printf("value loaded: %s\n", value);
#endif
}
while ((exec->status == 0) &&
((value != NULL) ||
((final == 1) &&
(exec->state->type != XML_REGEXP_FINAL_STATE)))) {
/*
* End of input on non-terminal state, rollback, however we may
* still have epsilon like transition for counted transitions
* on counters, in that case don't break too early.
*/
if ((value == NULL) && (exec->counts == NULL))
goto rollback;
exec->transcount = 0;
for (;exec->transno < exec->state->nbTrans;exec->transno++) {
trans = &exec->state->trans[exec->transno];
if (trans->to < 0)
continue;
atom = trans->atom;
ret = 0;
if (trans->count == REGEXP_ALL_LAX_COUNTER) {
int i;
int count;
xmlRegTransPtr t;
xmlRegCounterPtr counter;
ret = 0;
#ifdef DEBUG_PUSH
printf("testing all lax %d\n", trans->count);
#endif
/*
* Check all counted transitions from the current state
*/
if ((value == NULL) && (final)) {
ret = 1;
} else if (value != NULL) {
for (i = 0;i < exec->state->nbTrans;i++) {
t = &exec->state->trans[i];
if ((t->counter < 0) || (t == trans))
continue;
counter = &exec->comp->counters[t->counter];
count = exec->counts[t->counter];
if ((count < counter->max) &&
(t->atom != NULL) &&
(xmlStrEqual(value, t->atom->valuep))) {
ret = 0;
break;
}
if ((count >= counter->min) &&
(count < counter->max) &&
(t->atom != NULL) &&
(xmlStrEqual(value, t->atom->valuep))) {
ret = 1;
break;
}
}
}
} else if (trans->count == REGEXP_ALL_COUNTER) {
int i;
int count;
xmlRegTransPtr t;
xmlRegCounterPtr counter;
ret = 1;
#ifdef DEBUG_PUSH
printf("testing all %d\n", trans->count);
#endif
/*
* Check all counted transitions from the current state
*/
for (i = 0;i < exec->state->nbTrans;i++) {
t = &exec->state->trans[i];
if ((t->counter < 0) || (t == trans))
continue;
counter = &exec->comp->counters[t->counter];
count = exec->counts[t->counter];
if ((count < counter->min) || (count > counter->max)) {
ret = 0;
break;
}
}
} else if (trans->count >= 0) {
int count;
xmlRegCounterPtr counter;
/*
* A counted transition.
*/
count = exec->counts[trans->count];
counter = &exec->comp->counters[trans->count];
#ifdef DEBUG_PUSH
printf("testing count %d: val %d, min %d, max %d\n",
trans->count, count, counter->min, counter->max);
#endif
ret = ((count >= counter->min) && (count <= counter->max));
} else if (atom == NULL) {
fprintf(stderr, "epsilon transition left at runtime\n");
exec->status = -2;
break;
} else if (value != NULL) {
ret = xmlRegStrEqualWildcard(atom->valuep, value);
if (atom->neg) {
ret = !ret;
if (!compound)
ret = 0;
}
if ((ret == 1) && (trans->counter >= 0)) {
xmlRegCounterPtr counter;
int count;
count = exec->counts[trans->counter];
counter = &exec->comp->counters[trans->counter];
if (count >= counter->max)
ret = 0;
}
if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
xmlRegStatePtr to = exec->comp->states[trans->to];
/*
* this is a multiple input sequence
*/
if (exec->state->nbTrans > exec->transno + 1) {
if (exec->inputStackNr <= 0) {
xmlFARegExecSaveInputString(exec, value, data);
}
xmlFARegExecSave(exec);
}
exec->transcount = 1;
do {
/*
* Try to progress as much as possible on the input
*/
if (exec->transcount == atom->max) {
break;
}
exec->index++;
value = exec->inputStack[exec->index].value;
data = exec->inputStack[exec->index].data;
#ifdef DEBUG_PUSH
printf("value loaded: %s\n", value);
#endif
/*
* End of input: stop here
*/
if (value == NULL) {
exec->index --;
break;
}
if (exec->transcount >= atom->min) {
int transno = exec->transno;
xmlRegStatePtr state = exec->state;
/*
* The transition is acceptable save it
*/
exec->transno = -1; /* trick */
exec->state = to;
if (exec->inputStackNr <= 0) {
xmlFARegExecSaveInputString(exec, value, data);
}
xmlFARegExecSave(exec);
exec->transno = transno;
exec->state = state;
}
ret = xmlStrEqual(value, atom->valuep);
exec->transcount++;
} while (ret == 1);
if (exec->transcount < atom->min)
ret = 0;
/*
* If the last check failed but one transition was found
* possible, rollback
*/
if (ret < 0)
ret = 0;
if (ret == 0) {
goto rollback;
}
}
}
if (ret == 1) {
if ((exec->callback != NULL) && (atom != NULL) &&
(data != NULL)) {
exec->callback(exec->data, atom->valuep,
atom->data, data);
}
if (exec->state->nbTrans > exec->transno + 1) {
if (exec->inputStackNr <= 0) {
xmlFARegExecSaveInputString(exec, value, data);
}
xmlFARegExecSave(exec);
}
if (trans->counter >= 0) {
#ifdef DEBUG_PUSH
printf("Increasing count %d\n", trans->counter);
#endif
exec->counts[trans->counter]++;
}
if ((trans->count >= 0) &&
(trans->count < REGEXP_ALL_COUNTER)) {
#ifdef DEBUG_REGEXP_EXEC
printf("resetting count %d on transition\n",
trans->count);
#endif
exec->counts[trans->count] = 0;
}
#ifdef DEBUG_PUSH
printf("entering state %d\n", trans->to);
#endif
if ((exec->comp->states[trans->to] != NULL) &&
(exec->comp->states[trans->to]->type ==
XML_REGEXP_SINK_STATE)) {
/*
* entering a sink state, save the current state as error
* state.
*/
if (exec->errString != NULL)
xmlFree(exec->errString);
exec->errString = xmlStrdup(value);
exec->errState = exec->state;
memcpy(exec->errCounts, exec->counts,
exec->comp->nbCounters * sizeof(int));
}
exec->state = exec->comp->states[trans->to];
exec->transno = 0;
if (trans->atom != NULL) {
if (exec->inputStack != NULL) {
exec->index++;
if (exec->index < exec->inputStackNr) {
value = exec->inputStack[exec->index].value;
data = exec->inputStack[exec->index].data;
#ifdef DEBUG_PUSH
printf("value loaded: %s\n", value);
#endif
} else {
value = NULL;
data = NULL;
#ifdef DEBUG_PUSH
printf("end of input\n");
#endif
}
} else {
value = NULL;
data = NULL;
#ifdef DEBUG_PUSH
printf("end of input\n");
#endif
}
}
goto progress;
} else if (ret < 0) {
exec->status = -4;
break;
}
}
if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
rollback:
/*
* if we didn't yet rollback on the current input
* store the current state as the error state.
*/
if ((progress) && (exec->state != NULL) &&
(exec->state->type != XML_REGEXP_SINK_STATE)) {
progress = 0;
if (exec->errString != NULL)
xmlFree(exec->errString);
exec->errString = xmlStrdup(value);
exec->errState = exec->state;
if (exec->comp->nbCounters)
memcpy(exec->errCounts, exec->counts,
exec->comp->nbCounters * sizeof(int));
}
/*
* Failed to find a way out
*/
exec->determinist = 0;
xmlFARegExecRollBack(exec);
if ((exec->inputStack != NULL ) && (exec->status == 0)) {
value = exec->inputStack[exec->index].value;
data = exec->inputStack[exec->index].data;
#ifdef DEBUG_PUSH
printf("value loaded: %s\n", value);
#endif
}
}
continue;
progress:
progress = 1;
continue;
}
if (exec->status == 0) {
return(exec->state->type == XML_REGEXP_FINAL_STATE);
}
#ifdef DEBUG_ERR
if (exec->status < 0) {
testerr(exec);
}
#endif
return(exec->status);
}
/**
* xmlRegExecPushString:
* @exec: a regexp execution context or NULL to indicate the end
* @value: a string token input
* @data: data associated to the token to reuse in callbacks
*
* Push one input token in the execution context
*
* Returns: 1 if the regexp reached a final state, 0 if non-final, and
* a negative value in case of error.
*/
int
xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value,
void *data) {
return(xmlRegExecPushStringInternal(exec, value, data, 0));
}
/**
* xmlRegExecPushString2:
* @exec: a regexp execution context or NULL to indicate the end
* @value: the first string token input
* @value2: the second string token input
* @data: data associated to the token to reuse in callbacks
*
* Push one input token in the execution context
*
* Returns: 1 if the regexp reached a final state, 0 if non-final, and
* a negative value in case of error.
*/
int
xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value,
const xmlChar *value2, void *data) {
xmlChar buf[150];
int lenn, lenp, ret;
xmlChar *str;
if (exec == NULL)
return(-1);
if (exec->comp == NULL)
return(-1);
if (exec->status != 0)
return(exec->status);
if (value2 == NULL)
return(xmlRegExecPushString(exec, value, data));
lenn = strlen((char *) value2);
lenp = strlen((char *) value);
if (150 < lenn + lenp + 2) {
str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
if (str == NULL) {
exec->status = -1;
return(-1);
}
} else {
str = buf;
}
memcpy(&str[0], value, lenp);
str[lenp] = XML_REG_STRING_SEPARATOR;
memcpy(&str[lenp + 1], value2, lenn);
str[lenn + lenp + 1] = 0;
if (exec->comp->compact != NULL)
ret = xmlRegCompactPushString(exec, exec->comp, str, data);
else
ret = xmlRegExecPushStringInternal(exec, str, data, 1);
if (str != buf)
xmlFree(str);
return(ret);
}
/**
* xmlRegExecGetValues:
* @exec: a regexp execution context
* @err: error extraction or normal one
* @nbval: pointer to the number of accepted values IN/OUT
* @nbneg: return number of negative transitions
* @values: pointer to the array of acceptable values
* @terminal: return value if this was a terminal state
*
* Extract information from the regexp execution, internal routine to
* implement xmlRegExecNextValues() and xmlRegExecErrInfo()
*
* Returns: 0 in case of success or -1 in case of error.
*/
static int
xmlRegExecGetValues(xmlRegExecCtxtPtr exec, int err,
int *nbval, int *nbneg,
xmlChar **values, int *terminal) {
int maxval;
int nb = 0;
if ((exec == NULL) || (nbval == NULL) || (nbneg == NULL) ||
(values == NULL) || (*nbval <= 0))
return(-1);
maxval = *nbval;
*nbval = 0;
*nbneg = 0;
if ((exec->comp != NULL) && (exec->comp->compact != NULL)) {
xmlRegexpPtr comp;
int target, i, state;
comp = exec->comp;
if (err) {
if (exec->errStateNo == -1) return(-1);
state = exec->errStateNo;
} else {
state = exec->index;
}
if (terminal != NULL) {
if (comp->compact[state * (comp->nbstrings + 1)] ==
XML_REGEXP_FINAL_STATE)
*terminal = 1;
else
*terminal = 0;
}
for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) {
target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
if ((target > 0) && (target <= comp->nbstates) &&
(comp->compact[(target - 1) * (comp->nbstrings + 1)] !=
XML_REGEXP_SINK_STATE)) {
values[nb++] = comp->stringMap[i];
(*nbval)++;
}
}
for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) {
target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
if ((target > 0) && (target <= comp->nbstates) &&
(comp->compact[(target - 1) * (comp->nbstrings + 1)] ==
XML_REGEXP_SINK_STATE)) {
values[nb++] = comp->stringMap[i];
(*nbneg)++;
}
}
} else {
int transno;
xmlRegTransPtr trans;
xmlRegAtomPtr atom;
xmlRegStatePtr state;
if (terminal != NULL) {
if (exec->state->type == XML_REGEXP_FINAL_STATE)
*terminal = 1;
else
*terminal = 0;
}
if (err) {
if (exec->errState == NULL) return(-1);
state = exec->errState;
} else {
if (exec->state == NULL) return(-1);
state = exec->state;
}
for (transno = 0;
(transno < state->nbTrans) && (nb < maxval);
transno++) {
trans = &state->trans[transno];
if (trans->to < 0)
continue;
atom = trans->atom;
if ((atom == NULL) || (atom->valuep == NULL))
continue;
if (trans->count == REGEXP_ALL_LAX_COUNTER) {
/* this should not be reached but ... */
TODO;
} else if (trans->count == REGEXP_ALL_COUNTER) {
/* this should not be reached but ... */
TODO;
} else if (trans->counter >= 0) {
xmlRegCounterPtr counter = NULL;
int count;
if (err)
count = exec->errCounts[trans->counter];
else
count = exec->counts[trans->counter];
if (exec->comp != NULL)
counter = &exec->comp->counters[trans->counter];
if ((counter == NULL) || (count < counter->max)) {
if (atom->neg)
values[nb++] = (xmlChar *) atom->valuep2;
else
values[nb++] = (xmlChar *) atom->valuep;
(*nbval)++;
}
} else {
if ((exec->comp != NULL) && (exec->comp->states[trans->to] != NULL) &&
(exec->comp->states[trans->to]->type !=
XML_REGEXP_SINK_STATE)) {
if (atom->neg)
values[nb++] = (xmlChar *) atom->valuep2;
else
values[nb++] = (xmlChar *) atom->valuep;
(*nbval)++;
}
}
}
for (transno = 0;
(transno < state->nbTrans) && (nb < maxval);
transno++) {
trans = &state->trans[transno];
if (trans->to < 0)
continue;
atom = trans->atom;
if ((atom == NULL) || (atom->valuep == NULL))
continue;
if (trans->count == REGEXP_ALL_LAX_COUNTER) {
continue;
} else if (trans->count == REGEXP_ALL_COUNTER) {
continue;
} else if (trans->counter >= 0) {
continue;
} else {
if ((exec->comp->states[trans->to] != NULL) &&
(exec->comp->states[trans->to]->type ==
XML_REGEXP_SINK_STATE)) {
if (atom->neg)
values[nb++] = (xmlChar *) atom->valuep2;
else
values[nb++] = (xmlChar *) atom->valuep;
(*nbneg)++;
}
}
}
}
return(0);
}
/**
* xmlRegExecNextValues:
* @exec: a regexp execution context
* @nbval: pointer to the number of accepted values IN/OUT
* @nbneg: return number of negative transitions
* @values: pointer to the array of acceptable values
* @terminal: return value if this was a terminal state
*
* Extract information from the regexp execution,
* the parameter @values must point to an array of @nbval string pointers
* on return nbval will contain the number of possible strings in that
* state and the @values array will be updated with them. The string values
* returned will be freed with the @exec context and don't need to be
* deallocated.
*
* Returns: 0 in case of success or -1 in case of error.
*/
int
xmlRegExecNextValues(xmlRegExecCtxtPtr exec, int *nbval, int *nbneg,
xmlChar **values, int *terminal) {
return(xmlRegExecGetValues(exec, 0, nbval, nbneg, values, terminal));
}
/**
* xmlRegExecErrInfo:
* @exec: a regexp execution context generating an error
* @string: return value for the error string
* @nbval: pointer to the number of accepted values IN/OUT
* @nbneg: return number of negative transitions
* @values: pointer to the array of acceptable values
* @terminal: return value if this was a terminal state
*
* Extract error information from the regexp execution, the parameter
* @string will be updated with the value pushed and not accepted,
* the parameter @values must point to an array of @nbval string pointers
* on return nbval will contain the number of possible strings in that
* state and the @values array will be updated with them. The string values
* returned will be freed with the @exec context and don't need to be
* deallocated.
*
* Returns: 0 in case of success or -1 in case of error.
*/
int
xmlRegExecErrInfo(xmlRegExecCtxtPtr exec, const xmlChar **string,
int *nbval, int *nbneg, xmlChar **values, int *terminal) {
if (exec == NULL)
return(-1);
if (string != NULL) {
if (exec->status != 0)
*string = exec->errString;
else
*string = NULL;
}
return(xmlRegExecGetValues(exec, 1, nbval, nbneg, values, terminal));
}
#ifdef DEBUG_ERR
static void testerr(xmlRegExecCtxtPtr exec) {
const xmlChar *string;
xmlChar *values[5];
int nb = 5;
int nbneg;
int terminal;
xmlRegExecErrInfo(exec, &string, &nb, &nbneg, &values[0], &terminal);
}
#endif
#if 0
static int
xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) {
xmlRegTransPtr trans;
xmlRegAtomPtr atom;
int ret;
int codepoint, len;
if (exec == NULL)
return(-1);
if (exec->status != 0)
return(exec->status);
while ((exec->status == 0) &&
((exec->inputString[exec->index] != 0) ||
(exec->state->type != XML_REGEXP_FINAL_STATE))) {
/*
* End of input on non-terminal state, rollback, however we may
* still have epsilon like transition for counted transitions
* on counters, in that case don't break too early.
*/
if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL))
goto rollback;
exec->transcount = 0;
for (;exec->transno < exec->state->nbTrans;exec->transno++) {
trans = &exec->state->trans[exec->transno];
if (trans->to < 0)
continue;
atom = trans->atom;
ret = 0;
if (trans->count >= 0) {
int count;
xmlRegCounterPtr counter;
/*
* A counted transition.
*/
count = exec->counts[trans->count];
counter = &exec->comp->counters[trans->count];
#ifdef DEBUG_REGEXP_EXEC
printf("testing count %d: val %d, min %d, max %d\n",
trans->count, count, counter->min, counter->max);
#endif
ret = ((count >= counter->min) && (count <= counter->max));
} else if (atom == NULL) {
fprintf(stderr, "epsilon transition left at runtime\n");
exec->status = -2;
break;
} else if (exec->inputString[exec->index] != 0) {
codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
ret = xmlRegCheckCharacter(atom, codepoint);
if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
xmlRegStatePtr to = exec->comp->states[trans->to];
/*
* this is a multiple input sequence
*/
if (exec->state->nbTrans > exec->transno + 1) {
xmlFARegExecSave(exec);
}
exec->transcount = 1;
do {
/*
* Try to progress as much as possible on the input
*/
if (exec->transcount == atom->max) {
break;
}
exec->index += len;
/*
* End of input: stop here
*/
if (exec->inputString[exec->index] == 0) {
exec->index -= len;
break;
}
if (exec->transcount >= atom->min) {
int transno = exec->transno;
xmlRegStatePtr state = exec->state;
/*
* The transition is acceptable save it
*/
exec->transno = -1; /* trick */
exec->state = to;
xmlFARegExecSave(exec);
exec->transno = transno;
exec->state = state;
}
codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
len);
ret = xmlRegCheckCharacter(atom, codepoint);
exec->transcount++;
} while (ret == 1);
if (exec->transcount < atom->min)
ret = 0;
/*
* If the last check failed but one transition was found
* possible, rollback
*/
if (ret < 0)
ret = 0;
if (ret == 0) {
goto rollback;
}
}
}
if (ret == 1) {
if (exec->state->nbTrans > exec->transno + 1) {
xmlFARegExecSave(exec);
}
/*
* restart count for expressions like this ((abc){2})*
*/
if (trans->count >= 0) {
#ifdef DEBUG_REGEXP_EXEC
printf("Reset count %d\n", trans->count);
#endif
exec->counts[trans->count] = 0;
}
if (trans->counter >= 0) {
#ifdef DEBUG_REGEXP_EXEC
printf("Increasing count %d\n", trans->counter);
#endif
exec->counts[trans->counter]++;
}
#ifdef DEBUG_REGEXP_EXEC
printf("entering state %d\n", trans->to);
#endif
exec->state = exec->comp->states[trans->to];
exec->transno = 0;
if (trans->atom != NULL) {
exec->index += len;
}
goto progress;
} else if (ret < 0) {
exec->status = -4;
break;
}
}
if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
rollback:
/*
* Failed to find a way out
*/
exec->determinist = 0;
xmlFARegExecRollBack(exec);
}
progress:
continue;
}
}
#endif
/************************************************************************
* *
* Parser for the Schemas Datatype Regular Expressions *
* http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs *
* *
************************************************************************/
/**
* xmlFAIsChar:
* @ctxt: a regexp parser context
*
* [10] Char ::= [^.\?*+()|#x5B#x5D]
*/
static int
xmlFAIsChar(xmlRegParserCtxtPtr ctxt) {
int cur;
int len;
cur = CUR_SCHAR(ctxt->cur, len);
if ((cur == '.') || (cur == '\\') || (cur == '?') ||
(cur == '*') || (cur == '+') || (cur == '(') ||
(cur == ')') || (cur == '|') || (cur == 0x5B) ||
(cur == 0x5D) || (cur == 0))
return(-1);
return(cur);
}
/**
* xmlFAParseCharProp:
* @ctxt: a regexp parser context
*
* [27] charProp ::= IsCategory | IsBlock
* [28] IsCategory ::= Letters | Marks | Numbers | Punctuation |
* Separators | Symbols | Others
* [29] Letters ::= 'L' [ultmo]?
* [30] Marks ::= 'M' [nce]?
* [31] Numbers ::= 'N' [dlo]?
* [32] Punctuation ::= 'P' [cdseifo]?
* [33] Separators ::= 'Z' [slp]?
* [34] Symbols ::= 'S' [mcko]?
* [35] Others ::= 'C' [cfon]?
* [36] IsBlock ::= 'Is' [a-zA-Z0-9#x2D]+
*/
static void
xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) {
int cur;
xmlRegAtomType type = (xmlRegAtomType) 0;
xmlChar *blockName = NULL;
cur = CUR;
if (cur == 'L') {
NEXT;
cur = CUR;
if (cur == 'u') {
NEXT;
type = XML_REGEXP_LETTER_UPPERCASE;
} else if (cur == 'l') {
NEXT;
type = XML_REGEXP_LETTER_LOWERCASE;
} else if (cur == 't') {
NEXT;
type = XML_REGEXP_LETTER_TITLECASE;
} else if (cur == 'm') {
NEXT;
type = XML_REGEXP_LETTER_MODIFIER;
} else if (cur == 'o') {
NEXT;
type = XML_REGEXP_LETTER_OTHERS;
} else {
type = XML_REGEXP_LETTER;
}
} else if (cur == 'M') {
NEXT;
cur = CUR;
if (cur == 'n') {
NEXT;
/* nonspacing */
type = XML_REGEXP_MARK_NONSPACING;
} else if (cur == 'c') {
NEXT;
/* spacing combining */
type = XML_REGEXP_MARK_SPACECOMBINING;
} else if (cur == 'e') {
NEXT;
/* enclosing */
type = XML_REGEXP_MARK_ENCLOSING;
} else {
/* all marks */
type = XML_REGEXP_MARK;
}
} else if (cur == 'N') {
NEXT;
cur = CUR;
if (cur == 'd') {
NEXT;
/* digital */
type = XML_REGEXP_NUMBER_DECIMAL;
} else if (cur == 'l') {
NEXT;
/* letter */
type = XML_REGEXP_NUMBER_LETTER;
} else if (cur == 'o') {
NEXT;
/* other */
type = XML_REGEXP_NUMBER_OTHERS;
} else {
/* all numbers */
type = XML_REGEXP_NUMBER;
}
} else if (cur == 'P') {
NEXT;
cur = CUR;
if (cur == 'c') {
NEXT;
/* connector */
type = XML_REGEXP_PUNCT_CONNECTOR;
} else if (cur == 'd') {
NEXT;
/* dash */
type = XML_REGEXP_PUNCT_DASH;
} else if (cur == 's') {
NEXT;
/* open */
type = XML_REGEXP_PUNCT_OPEN;
} else if (cur == 'e') {
NEXT;
/* close */
type = XML_REGEXP_PUNCT_CLOSE;
} else if (cur == 'i') {
NEXT;
/* initial quote */
type = XML_REGEXP_PUNCT_INITQUOTE;
} else if (cur == 'f') {
NEXT;
/* final quote */
type = XML_REGEXP_PUNCT_FINQUOTE;
} else if (cur == 'o') {
NEXT;
/* other */
type = XML_REGEXP_PUNCT_OTHERS;
} else {
/* all punctuation */
type = XML_REGEXP_PUNCT;
}
} else if (cur == 'Z') {
NEXT;
cur = CUR;
if (cur == 's') {
NEXT;
/* space */
type = XML_REGEXP_SEPAR_SPACE;
} else if (cur == 'l') {
NEXT;
/* line */
type = XML_REGEXP_SEPAR_LINE;
} else if (cur == 'p') {
NEXT;
/* paragraph */
type = XML_REGEXP_SEPAR_PARA;
} else {
/* all separators */
type = XML_REGEXP_SEPAR;
}
} else if (cur == 'S') {
NEXT;
cur = CUR;
if (cur == 'm') {
NEXT;
type = XML_REGEXP_SYMBOL_MATH;
/* math */
} else if (cur == 'c') {
NEXT;
type = XML_REGEXP_SYMBOL_CURRENCY;
/* currency */
} else if (cur == 'k') {
NEXT;
type = XML_REGEXP_SYMBOL_MODIFIER;
/* modifiers */
} else if (cur == 'o') {
NEXT;
type = XML_REGEXP_SYMBOL_OTHERS;
/* other */
} else {
/* all symbols */
type = XML_REGEXP_SYMBOL;
}
} else if (cur == 'C') {
NEXT;
cur = CUR;
if (cur == 'c') {
NEXT;
/* control */
type = XML_REGEXP_OTHER_CONTROL;
} else if (cur == 'f') {
NEXT;
/* format */
type = XML_REGEXP_OTHER_FORMAT;
} else if (cur == 'o') {
NEXT;
/* private use */
type = XML_REGEXP_OTHER_PRIVATE;
} else if (cur == 'n') {
NEXT;
/* not assigned */
type = XML_REGEXP_OTHER_NA;
} else {
/* all others */
type = XML_REGEXP_OTHER;
}
} else if (cur == 'I') {
const xmlChar *start;
NEXT;
cur = CUR;
if (cur != 's') {
ERROR("IsXXXX expected");
return;
}
NEXT;
start = ctxt->cur;
cur = CUR;
if (((cur >= 'a') && (cur <= 'z')) ||
((cur >= 'A') && (cur <= 'Z')) ||
((cur >= '0') && (cur <= '9')) ||
(cur == 0x2D)) {
NEXT;
cur = CUR;
while (((cur >= 'a') && (cur <= 'z')) ||
((cur >= 'A') && (cur <= 'Z')) ||
((cur >= '0') && (cur <= '9')) ||
(cur == 0x2D)) {
NEXT;
cur = CUR;
}
}
type = XML_REGEXP_BLOCK_NAME;
blockName = xmlStrndup(start, ctxt->cur - start);
} else {
ERROR("Unknown char property");
return;
}
if (ctxt->atom == NULL) {
ctxt->atom = xmlRegNewAtom(ctxt, type);
if (ctxt->atom != NULL)
ctxt->atom->valuep = blockName;
} else if (ctxt->atom->type == XML_REGEXP_RANGES) {
xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
type, 0, 0, blockName);
}
}
/**
* xmlFAParseCharClassEsc:
* @ctxt: a regexp parser context
*
* [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc )
* [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E]
* [25] catEsc ::= '\p{' charProp '}'
* [26] complEsc ::= '\P{' charProp '}'
* [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW])
*/
static void
xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) {
int cur;
if (CUR == '.') {
if (ctxt->atom == NULL) {
ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR);
} else if (ctxt->atom->type == XML_REGEXP_RANGES) {
xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
XML_REGEXP_ANYCHAR, 0, 0, NULL);
}
NEXT;
return;
}
if (CUR != '\\') {
ERROR("Escaped sequence: expecting \\");
return;
}
NEXT;
cur = CUR;
if (cur == 'p') {
NEXT;
if (CUR != '{') {
ERROR("Expecting '{'");
return;
}
NEXT;
xmlFAParseCharProp(ctxt);
if (CUR != '}') {
ERROR("Expecting '}'");
return;
}
NEXT;
} else if (cur == 'P') {
NEXT;
if (CUR != '{') {
ERROR("Expecting '{'");
return;
}
NEXT;
xmlFAParseCharProp(ctxt);
if (ctxt->atom != NULL)
ctxt->atom->neg = 1;
if (CUR != '}') {
ERROR("Expecting '}'");
return;
}
NEXT;
} else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') ||
(cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') ||
(cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') ||
(cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) ||
(cur == 0x5E)) {
if (ctxt->atom == NULL) {
ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
if (ctxt->atom != NULL) {
switch (cur) {
case 'n':
ctxt->atom->codepoint = '\n';
break;
case 'r':
ctxt->atom->codepoint = '\r';
break;
case 't':
ctxt->atom->codepoint = '\t';
break;
default:
ctxt->atom->codepoint = cur;
}
}
} else if (ctxt->atom->type == XML_REGEXP_RANGES) {
switch (cur) {
case 'n':
cur = '\n';
break;
case 'r':
cur = '\r';
break;
case 't':
cur = '\t';
break;
}
xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
XML_REGEXP_CHARVAL, cur, cur, NULL);
}
NEXT;
} else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') ||
(cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') ||
(cur == 'w') || (cur == 'W')) {
xmlRegAtomType type = XML_REGEXP_ANYSPACE;
switch (cur) {
case 's':
type = XML_REGEXP_ANYSPACE;
break;
case 'S':
type = XML_REGEXP_NOTSPACE;
break;
case 'i':
type = XML_REGEXP_INITNAME;
break;
case 'I':
type = XML_REGEXP_NOTINITNAME;
break;
case 'c':
type = XML_REGEXP_NAMECHAR;
break;
case 'C':
type = XML_REGEXP_NOTNAMECHAR;
break;
case 'd':
type = XML_REGEXP_DECIMAL;
break;
case 'D':
type = XML_REGEXP_NOTDECIMAL;
break;
case 'w':
type = XML_REGEXP_REALCHAR;
break;
case 'W':
type = XML_REGEXP_NOTREALCHAR;
break;
}
NEXT;
if (ctxt->atom == NULL) {
ctxt->atom = xmlRegNewAtom(ctxt, type);
} else if (ctxt->atom->type == XML_REGEXP_RANGES) {
xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
type, 0, 0, NULL);
}
} else {
ERROR("Wrong escape sequence, misuse of character '\\'");
}
}
/**
* xmlFAParseCharRange:
* @ctxt: a regexp parser context
*
* [17] charRange ::= seRange | XmlCharRef | XmlCharIncDash
* [18] seRange ::= charOrEsc '-' charOrEsc
* [20] charOrEsc ::= XmlChar | SingleCharEsc
* [21] XmlChar ::= [^\#x2D#x5B#x5D]
* [22] XmlCharIncDash ::= [^\#x5B#x5D]
*/
static void
xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) {
int cur, len;
int start = -1;
int end = -1;
if (CUR == '\0') {
ERROR("Expecting ']'");
return;
}
cur = CUR;
if (cur == '\\') {
NEXT;
cur = CUR;
switch (cur) {
case 'n': start = 0xA; break;
case 'r': start = 0xD; break;
case 't': start = 0x9; break;
case '\\': case '|': case '.': case '-': case '^': case '?':
case '*': case '+': case '{': case '}': case '(': case ')':
case '[': case ']':
start = cur; break;
default:
ERROR("Invalid escape value");
return;
}
end = start;
len = 1;
} else if ((cur != 0x5B) && (cur != 0x5D)) {
end = start = CUR_SCHAR(ctxt->cur, len);
} else {
ERROR("Expecting a char range");
return;
}
/*
* Since we are "inside" a range, we can assume ctxt->cur is past
* the start of ctxt->string, and PREV should be safe
*/
if ((start == '-') && (NXT(1) != ']') && (PREV != '[') && (PREV != '^')) {
NEXTL(len);
return;
}
NEXTL(len);
cur = CUR;
if ((cur != '-') || (NXT(1) == ']')) {
xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
XML_REGEXP_CHARVAL, start, end, NULL);
return;
}
NEXT;
cur = CUR;
if (cur == '\\') {
NEXT;
cur = CUR;
switch (cur) {
case 'n': end = 0xA; break;
case 'r': end = 0xD; break;
case 't': end = 0x9; break;
case '\\': case '|': case '.': case '-': case '^': case '?':
case '*': case '+': case '{': case '}': case '(': case ')':
case '[': case ']':
end = cur; break;
default:
ERROR("Invalid escape value");
return;
}
len = 1;
} else if ((cur != '\0') && (cur != 0x5B) && (cur != 0x5D)) {
end = CUR_SCHAR(ctxt->cur, len);
} else {
ERROR("Expecting the end of a char range");
return;
}
/* TODO check that the values are acceptable character ranges for XML */
if (end < start) {
ERROR("End of range is before start of range");
} else {
NEXTL(len);
xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
XML_REGEXP_CHARVAL, start, end, NULL);
}
return;
}
/**
* xmlFAParsePosCharGroup:
* @ctxt: a regexp parser context
*
* [14] posCharGroup ::= ( charRange | charClassEsc )+
*/
static void
xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) {
do {
if (CUR == '\\') {
xmlFAParseCharClassEsc(ctxt);
} else {
xmlFAParseCharRange(ctxt);
}
} while ((CUR != ']') && (CUR != '^') && (CUR != '-') &&
(CUR != 0) && (ctxt->error == 0));
}
/**
* xmlFAParseCharGroup:
* @ctxt: a regexp parser context
*
* [13] charGroup ::= posCharGroup | negCharGroup | charClassSub
* [15] negCharGroup ::= '^' posCharGroup
* [16] charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr
* [12] charClassExpr ::= '[' charGroup ']'
*/
static void
xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) {
int n = ctxt->neg;
while ((CUR != ']') && (ctxt->error == 0)) {
if (CUR == '^') {
int neg = ctxt->neg;
NEXT;
ctxt->neg = !ctxt->neg;
xmlFAParsePosCharGroup(ctxt);
ctxt->neg = neg;
} else if ((CUR == '-') && (NXT(1) == '[')) {
int neg = ctxt->neg;
ctxt->neg = 2;
NEXT; /* eat the '-' */
NEXT; /* eat the '[' */
xmlFAParseCharGroup(ctxt);
if (CUR == ']') {
NEXT;
} else {
ERROR("charClassExpr: ']' expected");
break;
}
ctxt->neg = neg;
break;
} else if (CUR != ']') {
xmlFAParsePosCharGroup(ctxt);
}
}
ctxt->neg = n;
}
/**
* xmlFAParseCharClass:
* @ctxt: a regexp parser context
*
* [11] charClass ::= charClassEsc | charClassExpr
* [12] charClassExpr ::= '[' charGroup ']'
*/
static void
xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) {
if (CUR == '[') {
NEXT;
ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES);
if (ctxt->atom == NULL)
return;
xmlFAParseCharGroup(ctxt);
if (CUR == ']') {
NEXT;
} else {
ERROR("xmlFAParseCharClass: ']' expected");
}
} else {
xmlFAParseCharClassEsc(ctxt);
}
}
/**
* xmlFAParseQuantExact:
* @ctxt: a regexp parser context
*
* [8] QuantExact ::= [0-9]+
*
* Returns 0 if success or -1 in case of error
*/
static int
xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) {
int ret = 0;
int ok = 0;
int overflow = 0;
while ((CUR >= '0') && (CUR <= '9')) {
if (ret > INT_MAX / 10) {
overflow = 1;
} else {
int digit = CUR - '0';
ret *= 10;
if (ret > INT_MAX - digit)
overflow = 1;
else
ret += digit;
}
ok = 1;
NEXT;
}
if ((ok != 1) || (overflow == 1)) {
return(-1);
}
return(ret);
}
/**
* xmlFAParseQuantifier:
* @ctxt: a regexp parser context
*
* [4] quantifier ::= [?*+] | ( '{' quantity '}' )
* [5] quantity ::= quantRange | quantMin | QuantExact
* [6] quantRange ::= QuantExact ',' QuantExact
* [7] quantMin ::= QuantExact ','
* [8] QuantExact ::= [0-9]+
*/
static int
xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) {
int cur;
cur = CUR;
if ((cur == '?') || (cur == '*') || (cur == '+')) {
if (ctxt->atom != NULL) {
if (cur == '?')
ctxt->atom->quant = XML_REGEXP_QUANT_OPT;
else if (cur == '*')
ctxt->atom->quant = XML_REGEXP_QUANT_MULT;
else if (cur == '+')
ctxt->atom->quant = XML_REGEXP_QUANT_PLUS;
}
NEXT;
return(1);
}
if (cur == '{') {
int min = 0, max = 0;
NEXT;
cur = xmlFAParseQuantExact(ctxt);
if (cur >= 0)
min = cur;
else {
ERROR("Improper quantifier");
}
if (CUR == ',') {
NEXT;
if (CUR == '}')
max = INT_MAX;
else {
cur = xmlFAParseQuantExact(ctxt);
if (cur >= 0)
max = cur;
else {
ERROR("Improper quantifier");
}
}
}
if (CUR == '}') {
NEXT;
} else {
ERROR("Unterminated quantifier");
}
if (max == 0)
max = min;
if (ctxt->atom != NULL) {
ctxt->atom->quant = XML_REGEXP_QUANT_RANGE;
ctxt->atom->min = min;
ctxt->atom->max = max;
}
return(1);
}
return(0);
}
/**
* xmlFAParseAtom:
* @ctxt: a regexp parser context
*
* [9] atom ::= Char | charClass | ( '(' regExp ')' )
*/
static int
xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) {
int codepoint, len;
codepoint = xmlFAIsChar(ctxt);
if (codepoint > 0) {
ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
if (ctxt->atom == NULL)
return(-1);
codepoint = CUR_SCHAR(ctxt->cur, len);
ctxt->atom->codepoint = codepoint;
NEXTL(len);
return(1);
} else if (CUR == '|') {
return(0);
} else if (CUR == 0) {
return(0);
} else if (CUR == ')') {
return(0);
} else if (CUR == '(') {
xmlRegStatePtr start, oldend, start0;
NEXT;
if (ctxt->depth >= 50) {
ERROR("xmlFAParseAtom: maximum nesting depth exceeded");
return(-1);
}
/*
* this extra Epsilon transition is needed if we count with 0 allowed
* unfortunately this can't be known at that point
*/
xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
start0 = ctxt->state;
xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
start = ctxt->state;
oldend = ctxt->end;
ctxt->end = NULL;
ctxt->atom = NULL;
ctxt->depth++;
xmlFAParseRegExp(ctxt, 0);
ctxt->depth--;
if (CUR == ')') {
NEXT;
} else {
ERROR("xmlFAParseAtom: expecting ')'");
}
ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG);
if (ctxt->atom == NULL)
return(-1);
ctxt->atom->start = start;
ctxt->atom->start0 = start0;
ctxt->atom->stop = ctxt->state;
ctxt->end = oldend;
return(1);
} else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) {
xmlFAParseCharClass(ctxt);
return(1);
}
return(0);
}
/**
* xmlFAParsePiece:
* @ctxt: a regexp parser context
*
* [3] piece ::= atom quantifier?
*/
static int
xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) {
int ret;
ctxt->atom = NULL;
ret = xmlFAParseAtom(ctxt);
if (ret == 0)
return(0);
if (ctxt->atom == NULL) {
ERROR("internal: no atom generated");
}
xmlFAParseQuantifier(ctxt);
return(1);
}
/**
* xmlFAParseBranch:
* @ctxt: a regexp parser context
* @to: optional target to the end of the branch
*
* @to is used to optimize by removing duplicate path in automata
* in expressions like (a|b)(c|d)
*
* [2] branch ::= piece*
*/
static int
xmlFAParseBranch(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr to) {
xmlRegStatePtr previous;
int ret;
previous = ctxt->state;
ret = xmlFAParsePiece(ctxt);
if (ret == 0) {
/* Empty branch */
xmlFAGenerateEpsilonTransition(ctxt, previous, to);
} else {
if (xmlFAGenerateTransitions(ctxt, previous,
(CUR=='|' || CUR==')' || CUR==0) ? to : NULL, ctxt->atom) < 0)
return(-1);
previous = ctxt->state;
ctxt->atom = NULL;
}
while ((ret != 0) && (ctxt->error == 0)) {
ret = xmlFAParsePiece(ctxt);
if (ret != 0) {
if (xmlFAGenerateTransitions(ctxt, previous,
(CUR=='|' || CUR==')' || CUR==0) ? to : NULL,
ctxt->atom) < 0)
return(-1);
previous = ctxt->state;
ctxt->atom = NULL;
}
}
return(0);
}
/**
* xmlFAParseRegExp:
* @ctxt: a regexp parser context
* @top: is this the top-level expression ?
*
* [1] regExp ::= branch ( '|' branch )*
*/
static void
xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) {
xmlRegStatePtr start, end;
/* if not top start should have been generated by an epsilon trans */
start = ctxt->state;
ctxt->end = NULL;
xmlFAParseBranch(ctxt, NULL);
if (top) {
#ifdef DEBUG_REGEXP_GRAPH
printf("State %d is final\n", ctxt->state->no);
#endif
ctxt->state->type = XML_REGEXP_FINAL_STATE;
}
if (CUR != '|') {
ctxt->end = ctxt->state;
return;
}
end = ctxt->state;
while ((CUR == '|') && (ctxt->error == 0)) {
NEXT;
ctxt->state = start;
ctxt->end = NULL;
xmlFAParseBranch(ctxt, end);
}
if (!top) {
ctxt->state = end;
ctxt->end = end;
}
}
/************************************************************************
* *
* The basic API *
* *
************************************************************************/
/**
* xmlRegexpPrint:
* @output: the file for the output debug
* @regexp: the compiled regexp
*
* Print the content of the compiled regular expression
*/
void
xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) {
int i;
if (output == NULL)
return;
fprintf(output, " regexp: ");
if (regexp == NULL) {
fprintf(output, "NULL\n");
return;
}
fprintf(output, "'%s' ", regexp->string);
fprintf(output, "\n");
fprintf(output, "%d atoms:\n", regexp->nbAtoms);
for (i = 0;i < regexp->nbAtoms; i++) {
fprintf(output, " %02d ", i);
xmlRegPrintAtom(output, regexp->atoms[i]);
}
fprintf(output, "%d states:", regexp->nbStates);
fprintf(output, "\n");
for (i = 0;i < regexp->nbStates; i++) {
xmlRegPrintState(output, regexp->states[i]);
}
fprintf(output, "%d counters:\n", regexp->nbCounters);
for (i = 0;i < regexp->nbCounters; i++) {
fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min,
regexp->counters[i].max);
}
}
/**
* xmlRegexpCompile:
* @regexp: a regular expression string
*
* Parses a regular expression conforming to XML Schemas Part 2 Datatype
* Appendix F and builds an automata suitable for testing strings against
* that regular expression
*
* Returns the compiled expression or NULL in case of error
*/
xmlRegexpPtr
xmlRegexpCompile(const xmlChar *regexp) {
xmlRegexpPtr ret;
xmlRegParserCtxtPtr ctxt;
ctxt = xmlRegNewParserCtxt(regexp);
if (ctxt == NULL)
return(NULL);
/* initialize the parser */
ctxt->end = NULL;
ctxt->start = ctxt->state = xmlRegNewState(ctxt);
xmlRegStatePush(ctxt, ctxt->start);
/* parse the expression building an automata */
xmlFAParseRegExp(ctxt, 1);
if (CUR != 0) {
ERROR("xmlFAParseRegExp: extra characters");
}
if (ctxt->error != 0) {
xmlRegFreeParserCtxt(ctxt);
return(NULL);
}
ctxt->end = ctxt->state;
ctxt->start->type = XML_REGEXP_START_STATE;
ctxt->end->type = XML_REGEXP_FINAL_STATE;
/* remove the Epsilon except for counted transitions */
xmlFAEliminateEpsilonTransitions(ctxt);
if (ctxt->error != 0) {
xmlRegFreeParserCtxt(ctxt);
return(NULL);
}
ret = xmlRegEpxFromParse(ctxt);
xmlRegFreeParserCtxt(ctxt);
return(ret);
}
/**
* xmlRegexpExec:
* @comp: the compiled regular expression
* @content: the value to check against the regular expression
*
* Check if the regular expression generates the value
*
* Returns 1 if it matches, 0 if not and a negative value in case of error
*/
int
xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) {
if ((comp == NULL) || (content == NULL))
return(-1);
return(xmlFARegExec(comp, content));
}
/**
* xmlRegexpIsDeterminist:
* @comp: the compiled regular expression
*
* Check if the regular expression is determinist
*
* Returns 1 if it yes, 0 if not and a negative value in case of error
*/
int
xmlRegexpIsDeterminist(xmlRegexpPtr comp) {
xmlAutomataPtr am;
int ret;
if (comp == NULL)
return(-1);
if (comp->determinist != -1)
return(comp->determinist);
am = xmlNewAutomata();
if (am == NULL)
return(-1);
if (am->states != NULL) {
int i;
for (i = 0;i < am->nbStates;i++)
xmlRegFreeState(am->states[i]);
xmlFree(am->states);
}
am->nbAtoms = comp->nbAtoms;
am->atoms = comp->atoms;
am->nbStates = comp->nbStates;
am->states = comp->states;
am->determinist = -1;
am->flags = comp->flags;
ret = xmlFAComputesDeterminism(am);
am->atoms = NULL;
am->states = NULL;
xmlFreeAutomata(am);
comp->determinist = ret;
return(ret);
}
/**
* xmlRegFreeRegexp:
* @regexp: the regexp
*
* Free a regexp
*/
void
xmlRegFreeRegexp(xmlRegexpPtr regexp) {
int i;
if (regexp == NULL)
return;
if (regexp->string != NULL)
xmlFree(regexp->string);
if (regexp->states != NULL) {
for (i = 0;i < regexp->nbStates;i++)
xmlRegFreeState(regexp->states[i]);
xmlFree(regexp->states);
}
if (regexp->atoms != NULL) {
for (i = 0;i < regexp->nbAtoms;i++)
xmlRegFreeAtom(regexp->atoms[i]);
xmlFree(regexp->atoms);
}
if (regexp->counters != NULL)
xmlFree(regexp->counters);
if (regexp->compact != NULL)
xmlFree(regexp->compact);
if (regexp->transdata != NULL)
xmlFree(regexp->transdata);
if (regexp->stringMap != NULL) {
for (i = 0; i < regexp->nbstrings;i++)
xmlFree(regexp->stringMap[i]);
xmlFree(regexp->stringMap);
}
xmlFree(regexp);
}
#ifdef LIBXML_AUTOMATA_ENABLED
/************************************************************************
* *
* The Automata interface *
* *
************************************************************************/
/**
* xmlNewAutomata:
*
* Create a new automata
*
* Returns the new object or NULL in case of failure
*/
xmlAutomataPtr
xmlNewAutomata(void) {
xmlAutomataPtr ctxt;
ctxt = xmlRegNewParserCtxt(NULL);
if (ctxt == NULL)
return(NULL);
/* initialize the parser */
ctxt->end = NULL;
ctxt->start = ctxt->state = xmlRegNewState(ctxt);
if (ctxt->start == NULL) {
xmlFreeAutomata(ctxt);
return(NULL);
}
ctxt->start->type = XML_REGEXP_START_STATE;
if (xmlRegStatePush(ctxt, ctxt->start) < 0) {
xmlRegFreeState(ctxt->start);
xmlFreeAutomata(ctxt);
return(NULL);
}
ctxt->flags = 0;
return(ctxt);
}
/**
* xmlFreeAutomata:
* @am: an automata
*
* Free an automata
*/
void
xmlFreeAutomata(xmlAutomataPtr am) {
if (am == NULL)
return;
xmlRegFreeParserCtxt(am);
}
/**
* xmlAutomataSetFlags:
* @am: an automata
* @flags: a set of internal flags
*
* Set some flags on the automata
*/
void
xmlAutomataSetFlags(xmlAutomataPtr am, int flags) {
if (am == NULL)
return;
am->flags |= flags;
}
/**
* xmlAutomataGetInitState:
* @am: an automata
*
* Initial state lookup
*
* Returns the initial state of the automata
*/
xmlAutomataStatePtr
xmlAutomataGetInitState(xmlAutomataPtr am) {
if (am == NULL)
return(NULL);
return(am->start);
}
/**
* xmlAutomataSetFinalState:
* @am: an automata
* @state: a state in this automata
*
* Makes that state a final state
*
* Returns 0 or -1 in case of error
*/
int
xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) {
if ((am == NULL) || (state == NULL))
return(-1);
state->type = XML_REGEXP_FINAL_STATE;
return(0);
}
/**
* xmlAutomataNewTransition:
* @am: an automata
* @from: the starting point of the transition
* @to: the target point of the transition or NULL
* @token: the input string associated to that transition
* @data: data passed to the callback function if the transition is activated
*
* If @to is NULL, this creates first a new target state in the automata
* and then adds a transition from the @from state to the target state
* activated by the value of @token
*
* Returns the target state or NULL in case of error
*/
xmlAutomataStatePtr
xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from,
xmlAutomataStatePtr to, const xmlChar *token,
void *data) {
xmlRegAtomPtr atom;
if ((am == NULL) || (from == NULL) || (token == NULL))
return(NULL);
atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
if (atom == NULL)
return(NULL);
atom->data = data;
atom->valuep = xmlStrdup(token);
if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
xmlRegFreeAtom(atom);
return(NULL);
}
if (to == NULL)
return(am->state);
return(to);
}
/**
* xmlAutomataNewTransition2:
* @am: an automata
* @from: the starting point of the transition
* @to: the target point of the transition or NULL
* @token: the first input string associated to that transition
* @token2: the second input string associated to that transition
* @data: data passed to the callback function if the transition is activated
*
* If @to is NULL, this creates first a new target state in the automata
* and then adds a transition from the @from state to the target state
* activated by the value of @token
*
* Returns the target state or NULL in case of error
*/
xmlAutomataStatePtr
xmlAutomataNewTransition2(xmlAutomataPtr am, xmlAutomataStatePtr from,
xmlAutomataStatePtr to, const xmlChar *token,
const xmlChar *token2, void *data) {
xmlRegAtomPtr atom;
if ((am == NULL) || (from == NULL) || (token == NULL))
return(NULL);
atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
if (atom == NULL)
return(NULL);
atom->data = data;
if ((token2 == NULL) || (*token2 == 0)) {
atom->valuep = xmlStrdup(token);
} else {
int lenn, lenp;
xmlChar *str;
lenn = strlen((char *) token2);
lenp = strlen((char *) token);
str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
if (str == NULL) {
xmlRegFreeAtom(atom);
return(NULL);
}
memcpy(&str[0], token, lenp);
str[lenp] = '|';
memcpy(&str[lenp + 1], token2, lenn);
str[lenn + lenp + 1] = 0;
atom->valuep = str;
}
if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
xmlRegFreeAtom(atom);
return(NULL);
}
if (to == NULL)
return(am->state);
return(to);
}
/**
* xmlAutomataNewNegTrans:
* @am: an automata
* @from: the starting point of the transition
* @to: the target point of the transition or NULL
* @token: the first input string associated to that transition
* @token2: the second input string associated to that transition
* @data: data passed to the callback function if the transition is activated
*
* If @to is NULL, this creates first a new target state in the automata
* and then adds a transition from the @from state to the target state
* activated by any value except (@token,@token2)
* Note that if @token2 is not NULL, then (X, NULL) won't match to follow
# the semantic of XSD ##other
*
* Returns the target state or NULL in case of error
*/
xmlAutomataStatePtr
xmlAutomataNewNegTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
xmlAutomataStatePtr to, const xmlChar *token,
const xmlChar *token2, void *data) {
xmlRegAtomPtr atom;
xmlChar err_msg[200];
if ((am == NULL) || (from == NULL) || (token == NULL))
return(NULL);
atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
if (atom == NULL)
return(NULL);
atom->data = data;
atom->neg = 1;
if ((token2 == NULL) || (*token2 == 0)) {
atom->valuep = xmlStrdup(token);
} else {
int lenn, lenp;
xmlChar *str;
lenn = strlen((char *) token2);
lenp = strlen((char *) token);
str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
if (str == NULL) {
xmlRegFreeAtom(atom);
return(NULL);
}
memcpy(&str[0], token, lenp);
str[lenp] = '|';
memcpy(&str[lenp + 1], token2, lenn);
str[lenn + lenp + 1] = 0;
atom->valuep = str;
}
snprintf((char *) err_msg, 199, "not %s", (const char *) atom->valuep);
err_msg[199] = 0;
atom->valuep2 = xmlStrdup(err_msg);
if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
xmlRegFreeAtom(atom);
return(NULL);
}
am->negs++;
if (to == NULL)
return(am->state);
return(to);
}
/**
* xmlAutomataNewCountTrans2:
* @am: an automata
* @from: the starting point of the transition
* @to: the target point of the transition or NULL
* @token: the input string associated to that transition
* @token2: the second input string associated to that transition
* @min: the minimum successive occurrences of token
* @max: the maximum successive occurrences of token
* @data: data associated to the transition
*
* If @to is NULL, this creates first a new target state in the automata
* and then adds a transition from the @from state to the target state
* activated by a succession of input of value @token and @token2 and
* whose number is between @min and @max
*
* Returns the target state or NULL in case of error
*/
xmlAutomataStatePtr
xmlAutomataNewCountTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
xmlAutomataStatePtr to, const xmlChar *token,
const xmlChar *token2,
int min, int max, void *data) {
xmlRegAtomPtr atom;
int counter;
if ((am == NULL) || (from == NULL) || (token == NULL))
return(NULL);
if (min < 0)
return(NULL);
if ((max < min) || (max < 1))
return(NULL);
atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
if (atom == NULL)
return(NULL);
if ((token2 == NULL) || (*token2 == 0)) {
atom->valuep = xmlStrdup(token);
} else {
int lenn, lenp;
xmlChar *str;
lenn = strlen((char *) token2);
lenp = strlen((char *) token);
str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
if (str == NULL) {
xmlRegFreeAtom(atom);
return(NULL);
}
memcpy(&str[0], token, lenp);
str[lenp] = '|';
memcpy(&str[lenp + 1], token2, lenn);
str[lenn + lenp + 1] = 0;
atom->valuep = str;
}
atom->data = data;
if (min == 0)
atom->min = 1;
else
atom->min = min;
atom->max = max;
/*
* associate a counter to the transition.
*/
counter = xmlRegGetCounter(am);
am->counters[counter].min = min;
am->counters[counter].max = max;
/* xmlFAGenerateTransitions(am, from, to, atom); */
if (to == NULL) {
to = xmlRegNewState(am);
xmlRegStatePush(am, to);
}
xmlRegStateAddTrans(am, from, atom, to, counter, -1);
xmlRegAtomPush(am, atom);
am->state = to;
if (to == NULL)
to = am->state;
if (to == NULL)
return(NULL);
if (min == 0)
xmlFAGenerateEpsilonTransition(am, from, to);
return(to);
}
/**
* xmlAutomataNewCountTrans:
* @am: an automata
* @from: the starting point of the transition
* @to: the target point of the transition or NULL
* @token: the input string associated to that transition
* @min: the minimum successive occurrences of token
* @max: the maximum successive occurrences of token
* @data: data associated to the transition
*
* If @to is NULL, this creates first a new target state in the automata
* and then adds a transition from the @from state to the target state
* activated by a succession of input of value @token and whose number
* is between @min and @max
*
* Returns the target state or NULL in case of error
*/
xmlAutomataStatePtr
xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
xmlAutomataStatePtr to, const xmlChar *token,
int min, int max, void *data) {
xmlRegAtomPtr atom;
int counter;
if ((am == NULL) || (from == NULL) || (token == NULL))
return(NULL);
if (min < 0)
return(NULL);
if ((max < min) || (max < 1))
return(NULL);
atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
if (atom == NULL)
return(NULL);
atom->valuep = xmlStrdup(token);
atom->data = data;
if (min == 0)
atom->min = 1;
else
atom->min = min;
atom->max = max;
/*
* associate a counter to the transition.
*/
counter = xmlRegGetCounter(am);
am->counters[counter].min = min;
am->counters[counter].max = max;
/* xmlFAGenerateTransitions(am, from, to, atom); */
if (to == NULL) {
to = xmlRegNewState(am);
xmlRegStatePush(am, to);
}
xmlRegStateAddTrans(am, from, atom, to, counter, -1);
xmlRegAtomPush(am, atom);
am->state = to;
if (to == NULL)
to = am->state;
if (to == NULL)
return(NULL);
if (min == 0)
xmlFAGenerateEpsilonTransition(am, from, to);
return(to);
}
/**
* xmlAutomataNewOnceTrans2:
* @am: an automata
* @from: the starting point of the transition
* @to: the target point of the transition or NULL
* @token: the input string associated to that transition
* @token2: the second input string associated to that transition
* @min: the minimum successive occurrences of token
* @max: the maximum successive occurrences of token
* @data: data associated to the transition
*
* If @to is NULL, this creates first a new target state in the automata
* and then adds a transition from the @from state to the target state
* activated by a succession of input of value @token and @token2 and whose
* number is between @min and @max, moreover that transition can only be
* crossed once.
*
* Returns the target state or NULL in case of error
*/
xmlAutomataStatePtr
xmlAutomataNewOnceTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
xmlAutomataStatePtr to, const xmlChar *token,
const xmlChar *token2,
int min, int max, void *data) {
xmlRegAtomPtr atom;
int counter;
if ((am == NULL) || (from == NULL) || (token == NULL))
return(NULL);
if (min < 1)
return(NULL);
if (max < min)
return(NULL);
atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
if (atom == NULL)
return(NULL);
if ((token2 == NULL) || (*token2 == 0)) {
atom->valuep = xmlStrdup(token);
} else {
int lenn, lenp;
xmlChar *str;
lenn = strlen((char *) token2);
lenp = strlen((char *) token);
str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
if (str == NULL) {
xmlRegFreeAtom(atom);
return(NULL);
}
memcpy(&str[0], token, lenp);
str[lenp] = '|';
memcpy(&str[lenp + 1], token2, lenn);
str[lenn + lenp + 1] = 0;
atom->valuep = str;
}
atom->data = data;
atom->quant = XML_REGEXP_QUANT_ONCEONLY;
atom->min = min;
atom->max = max;
/*
* associate a counter to the transition.
*/
counter = xmlRegGetCounter(am);
am->counters[counter].min = 1;
am->counters[counter].max = 1;
/* xmlFAGenerateTransitions(am, from, to, atom); */
if (to == NULL) {
to = xmlRegNewState(am);
xmlRegStatePush(am, to);
}
xmlRegStateAddTrans(am, from, atom, to, counter, -1);
xmlRegAtomPush(am, atom);
am->state = to;
return(to);
}
/**
* xmlAutomataNewOnceTrans:
* @am: an automata
* @from: the starting point of the transition
* @to: the target point of the transition or NULL
* @token: the input string associated to that transition
* @min: the minimum successive occurrences of token
* @max: the maximum successive occurrences of token
* @data: data associated to the transition
*
* If @to is NULL, this creates first a new target state in the automata
* and then adds a transition from the @from state to the target state
* activated by a succession of input of value @token and whose number
* is between @min and @max, moreover that transition can only be crossed
* once.
*
* Returns the target state or NULL in case of error
*/
xmlAutomataStatePtr
xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
xmlAutomataStatePtr to, const xmlChar *token,
int min, int max, void *data) {
xmlRegAtomPtr atom;
int counter;
if ((am == NULL) || (from == NULL) || (token == NULL))
return(NULL);
if (min < 1)
return(NULL);
if (max < min)
return(NULL);
atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
if (atom == NULL)
return(NULL);
atom->valuep = xmlStrdup(token);
atom->data = data;
atom->quant = XML_REGEXP_QUANT_ONCEONLY;
atom->min = min;
atom->max = max;
/*
* associate a counter to the transition.
*/
counter = xmlRegGetCounter(am);
am->counters[counter].min = 1;
am->counters[counter].max = 1;
/* xmlFAGenerateTransitions(am, from, to, atom); */
if (to == NULL) {
to = xmlRegNewState(am);
xmlRegStatePush(am, to);
}
xmlRegStateAddTrans(am, from, atom, to, counter, -1);
xmlRegAtomPush(am, atom);
am->state = to;
return(to);
}
/**
* xmlAutomataNewState:
* @am: an automata
*
* Create a new disconnected state in the automata
*
* Returns the new state or NULL in case of error
*/
xmlAutomataStatePtr
xmlAutomataNewState(xmlAutomataPtr am) {
xmlAutomataStatePtr to;
if (am == NULL)
return(NULL);
to = xmlRegNewState(am);
xmlRegStatePush(am, to);
return(to);
}
/**
* xmlAutomataNewEpsilon:
* @am: an automata
* @from: the starting point of the transition
* @to: the target point of the transition or NULL
*
* If @to is NULL, this creates first a new target state in the automata
* and then adds an epsilon transition from the @from state to the
* target state
*
* Returns the target state or NULL in case of error
*/
xmlAutomataStatePtr
xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from,
xmlAutomataStatePtr to) {
if ((am == NULL) || (from == NULL))
return(NULL);
xmlFAGenerateEpsilonTransition(am, from, to);
if (to == NULL)
return(am->state);
return(to);
}
/**
* xmlAutomataNewAllTrans:
* @am: an automata
* @from: the starting point of the transition
* @to: the target point of the transition or NULL
* @lax: allow to transition if not all all transitions have been activated
*
* If @to is NULL, this creates first a new target state in the automata
* and then adds a an ALL transition from the @from state to the
* target state. That transition is an epsilon transition allowed only when
* all transitions from the @from node have been activated.
*
* Returns the target state or NULL in case of error
*/
xmlAutomataStatePtr
xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
xmlAutomataStatePtr to, int lax) {
if ((am == NULL) || (from == NULL))
return(NULL);
xmlFAGenerateAllTransition(am, from, to, lax);
if (to == NULL)
return(am->state);
return(to);
}
/**
* xmlAutomataNewCounter:
* @am: an automata
* @min: the minimal value on the counter
* @max: the maximal value on the counter
*
* Create a new counter
*
* Returns the counter number or -1 in case of error
*/
int
xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) {
int ret;
if (am == NULL)
return(-1);
ret = xmlRegGetCounter(am);
if (ret < 0)
return(-1);
am->counters[ret].min = min;
am->counters[ret].max = max;
return(ret);
}
/**
* xmlAutomataNewCountedTrans:
* @am: an automata
* @from: the starting point of the transition
* @to: the target point of the transition or NULL
* @counter: the counter associated to that transition
*
* If @to is NULL, this creates first a new target state in the automata
* and then adds an epsilon transition from the @from state to the target state
* which will increment the counter provided
*
* Returns the target state or NULL in case of error
*/
xmlAutomataStatePtr
xmlAutomataNewCountedTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
xmlAutomataStatePtr to, int counter) {
if ((am == NULL) || (from == NULL) || (counter < 0))
return(NULL);
xmlFAGenerateCountedEpsilonTransition(am, from, to, counter);
if (to == NULL)
return(am->state);
return(to);
}
/**
* xmlAutomataNewCounterTrans:
* @am: an automata
* @from: the starting point of the transition
* @to: the target point of the transition or NULL
* @counter: the counter associated to that transition
*
* If @to is NULL, this creates first a new target state in the automata
* and then adds an epsilon transition from the @from state to the target state
* which will be allowed only if the counter is within the right range.
*
* Returns the target state or NULL in case of error
*/
xmlAutomataStatePtr
xmlAutomataNewCounterTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
xmlAutomataStatePtr to, int counter) {
if ((am == NULL) || (from == NULL) || (counter < 0))
return(NULL);
xmlFAGenerateCountedTransition(am, from, to, counter);
if (to == NULL)
return(am->state);
return(to);
}
/**
* xmlAutomataCompile:
* @am: an automata
*
* Compile the automata into a Reg Exp ready for being executed.
* The automata should be free after this point.
*
* Returns the compiled regexp or NULL in case of error
*/
xmlRegexpPtr
xmlAutomataCompile(xmlAutomataPtr am) {
xmlRegexpPtr ret;
if ((am == NULL) || (am->error != 0)) return(NULL);
xmlFAEliminateEpsilonTransitions(am);
/* xmlFAComputesDeterminism(am); */
ret = xmlRegEpxFromParse(am);
return(ret);
}
/**
* xmlAutomataIsDeterminist:
* @am: an automata
*
* Checks if an automata is determinist.
*
* Returns 1 if true, 0 if not, and -1 in case of error
*/
int
xmlAutomataIsDeterminist(xmlAutomataPtr am) {
int ret;
if (am == NULL)
return(-1);
ret = xmlFAComputesDeterminism(am);
return(ret);
}
#endif /* LIBXML_AUTOMATA_ENABLED */
#ifdef LIBXML_EXPR_ENABLED
/************************************************************************
* *
* Formal Expression handling code *
* *
************************************************************************/
/************************************************************************
* *
* Expression handling context *
* *
************************************************************************/
struct _xmlExpCtxt {
xmlDictPtr dict;
xmlExpNodePtr *table;
int size;
int nbElems;
int nb_nodes;
int maxNodes;
const char *expr;
const char *cur;
int nb_cons;
int tabSize;
};
/**
* xmlExpNewCtxt:
* @maxNodes: the maximum number of nodes
* @dict: optional dictionary to use internally
*
* Creates a new context for manipulating expressions
*
* Returns the context or NULL in case of error
*/
xmlExpCtxtPtr
xmlExpNewCtxt(int maxNodes, xmlDictPtr dict) {
xmlExpCtxtPtr ret;
int size = 256;
if (maxNodes <= 4096)
maxNodes = 4096;
ret = (xmlExpCtxtPtr) xmlMalloc(sizeof(xmlExpCtxt));
if (ret == NULL)
return(NULL);
memset(ret, 0, sizeof(xmlExpCtxt));
ret->size = size;
ret->nbElems = 0;
ret->maxNodes = maxNodes;
ret->table = xmlMalloc(size * sizeof(xmlExpNodePtr));
if (ret->table == NULL) {
xmlFree(ret);
return(NULL);
}
memset(ret->table, 0, size * sizeof(xmlExpNodePtr));
if (dict == NULL) {
ret->dict = xmlDictCreate();
if (ret->dict == NULL) {
xmlFree(ret->table);
xmlFree(ret);
return(NULL);
}
} else {
ret->dict = dict;
xmlDictReference(ret->dict);
}
return(ret);
}
/**
* xmlExpFreeCtxt:
* @ctxt: an expression context
*
* Free an expression context
*/
void
xmlExpFreeCtxt(xmlExpCtxtPtr ctxt) {
if (ctxt == NULL)
return;
xmlDictFree(ctxt->dict);
if (ctxt->table != NULL)
xmlFree(ctxt->table);
xmlFree(ctxt);
}
/************************************************************************
* *
* Structure associated to an expression node *
* *
************************************************************************/
#define MAX_NODES 10000
/* #define DEBUG_DERIV */
/*
* TODO:
* - Wildcards
* - public API for creation
*
* Started
* - regression testing
*
* Done
* - split into module and test tool
* - memleaks
*/
typedef enum {
XML_EXP_NILABLE = (1 << 0)
} xmlExpNodeInfo;
#define IS_NILLABLE(node) ((node)->info & XML_EXP_NILABLE)
struct _xmlExpNode {
unsigned char type;/* xmlExpNodeType */
unsigned char info;/* OR of xmlExpNodeInfo */
unsigned short key; /* the hash key */
unsigned int ref; /* The number of references */
int c_max; /* the maximum length it can consume */
xmlExpNodePtr exp_left;
xmlExpNodePtr next;/* the next node in the hash table or free list */
union {
struct {
int f_min;
int f_max;
} count;
struct {
xmlExpNodePtr f_right;
} children;
const xmlChar *f_str;
} field;
};
#define exp_min field.count.f_min
#define exp_max field.count.f_max
/* #define exp_left field.children.f_left */
#define exp_right field.children.f_right
#define exp_str field.f_str
static xmlExpNodePtr xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type);
static xmlExpNode forbiddenExpNode = {
XML_EXP_FORBID, 0, 0, 0, 0, NULL, NULL, {{ 0, 0}}
};
xmlExpNodePtr forbiddenExp = &forbiddenExpNode;
static xmlExpNode emptyExpNode = {
XML_EXP_EMPTY, 1, 0, 0, 0, NULL, NULL, {{ 0, 0}}
};
xmlExpNodePtr emptyExp = &emptyExpNode;
/************************************************************************
* *
* The custom hash table for unicity and canonicalization *
* of sub-expressions pointers *
* *
************************************************************************/
/*
* xmlExpHashNameComputeKey:
* Calculate the hash key for a token
*/
static unsigned short
xmlExpHashNameComputeKey(const xmlChar *name) {
unsigned short value = 0L;
char ch;
if (name != NULL) {
value += 30 * (*name);
while ((ch = *name++) != 0) {
value = value ^ ((value << 5) + (value >> 3) + (unsigned long)ch);
}
}
return (value);
}
/*
* xmlExpHashComputeKey:
* Calculate the hash key for a compound expression
*/
static unsigned short
xmlExpHashComputeKey(xmlExpNodeType type, xmlExpNodePtr left,
xmlExpNodePtr right) {
unsigned long value;
unsigned short ret;
switch (type) {
case XML_EXP_SEQ:
value = left->key;
value += right->key;
value *= 3;
ret = (unsigned short) value;
break;
case XML_EXP_OR:
value = left->key;
value += right->key;
value *= 7;
ret = (unsigned short) value;
break;
case XML_EXP_COUNT:
value = left->key;
value += right->key;
ret = (unsigned short) value;
break;
default:
ret = 0;
}
return(ret);
}
static xmlExpNodePtr
xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type) {
xmlExpNodePtr ret;
if (ctxt->nb_nodes >= MAX_NODES)
return(NULL);
ret = (xmlExpNodePtr) xmlMalloc(sizeof(xmlExpNode));
if (ret == NULL)
return(NULL);
memset(ret, 0, sizeof(xmlExpNode));
ret->type = type;
ret->next = NULL;
ctxt->nb_nodes++;
ctxt->nb_cons++;
return(ret);
}
/**
* xmlExpHashGetEntry:
* @table: the hash table
*
* Get the unique entry from the hash table. The entry is created if
* needed. @left and @right are consumed, i.e. their ref count will
* be decremented by the operation.
*
* Returns the pointer or NULL in case of error
*/
static xmlExpNodePtr
xmlExpHashGetEntry(xmlExpCtxtPtr ctxt, xmlExpNodeType type,
xmlExpNodePtr left, xmlExpNodePtr right,
const xmlChar *name, int min, int max) {
unsigned short kbase, key;
xmlExpNodePtr entry;
xmlExpNodePtr insert;
if (ctxt == NULL)
return(NULL);
/*
* Check for duplicate and insertion location.
*/
if (type == XML_EXP_ATOM) {
kbase = xmlExpHashNameComputeKey(name);
} else if (type == XML_EXP_COUNT) {
/* COUNT reduction rule 1 */
/* a{1} -> a */
if (min == max) {
if (min == 1) {
return(left);
}
if (min == 0) {
xmlExpFree(ctxt, left);
return(emptyExp);
}
}
if (min < 0) {
xmlExpFree(ctxt, left);
return(forbiddenExp);
}
if (max == -1)
kbase = min + 79;
else
kbase = max - min;
kbase += left->key;
} else if (type == XML_EXP_OR) {
/* Forbid reduction rules */
if (left->type == XML_EXP_FORBID) {
xmlExpFree(ctxt, left);
return(right);
}
if (right->type == XML_EXP_FORBID) {
xmlExpFree(ctxt, right);
return(left);
}
/* OR reduction rule 1 */
/* a | a reduced to a */
if (left == right) {
xmlExpFree(ctxt, right);
return(left);
}
/* OR canonicalization rule 1 */
/* linearize (a | b) | c into a | (b | c) */
if ((left->type == XML_EXP_OR) && (right->type != XML_EXP_OR)) {
xmlExpNodePtr tmp = left;
left = right;
right = tmp;
}
/* OR reduction rule 2 */
/* a | (a | b) and b | (a | b) are reduced to a | b */
if (right->type == XML_EXP_OR) {
if ((left == right->exp_left) ||
(left == right->exp_right)) {
xmlExpFree(ctxt, left);
return(right);
}
}
/* OR canonicalization rule 2 */
/* linearize (a | b) | c into a | (b | c) */
if (left->type == XML_EXP_OR) {
xmlExpNodePtr tmp;
/* OR canonicalization rule 2 */
if ((left->exp_right->type != XML_EXP_OR) &&
(left->exp_right->key < left->exp_left->key)) {
tmp = left->exp_right;
left->exp_right = left->exp_left;
left->exp_left = tmp;
}
left->exp_right->ref++;
tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_right, right,
NULL, 0, 0);
left->exp_left->ref++;
tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_left, tmp,
NULL, 0, 0);
xmlExpFree(ctxt, left);
return(tmp);
}
if (right->type == XML_EXP_OR) {
/* Ordering in the tree */
/* C | (A | B) -> A | (B | C) */
if (left->key > right->exp_right->key) {
xmlExpNodePtr tmp;
right->exp_right->ref++;
tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_right,
left, NULL, 0, 0);
right->exp_left->ref++;
tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left,
tmp, NULL, 0, 0);
xmlExpFree(ctxt, right);
return(tmp);
}
/* Ordering in the tree */
/* B | (A | C) -> A | (B | C) */
if (left->key > right->exp_left->key) {
xmlExpNodePtr tmp;
right->exp_right->ref++;
tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left,
right->exp_right, NULL, 0, 0);
right->exp_left->ref++;
tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left,
tmp, NULL, 0, 0);
xmlExpFree(ctxt, right);
return(tmp);
}
}
/* we know both types are != XML_EXP_OR here */
else if (left->key > right->key) {
xmlExpNodePtr tmp = left;
left = right;
right = tmp;
}
kbase = xmlExpHashComputeKey(type, left, right);
} else if (type == XML_EXP_SEQ) {
/* Forbid reduction rules */
if (left->type == XML_EXP_FORBID) {
xmlExpFree(ctxt, right);
return(left);
}
if (right->type == XML_EXP_FORBID) {
xmlExpFree(ctxt, left);
return(right);
}
/* Empty reduction rules */
if (right->type == XML_EXP_EMPTY) {
return(left);
}
if (left->type == XML_EXP_EMPTY) {
return(right);
}
kbase = xmlExpHashComputeKey(type, left, right);
} else
return(NULL);
key = kbase % ctxt->size;
if (ctxt->table[key] != NULL) {
for (insert = ctxt->table[key]; insert != NULL;
insert = insert->next) {
if ((insert->key == kbase) &&
(insert->type == type)) {
if (type == XML_EXP_ATOM) {
if (name == insert->exp_str) {
insert->ref++;
return(insert);
}
} else if (type == XML_EXP_COUNT) {
if ((insert->exp_min == min) && (insert->exp_max == max) &&
(insert->exp_left == left)) {
insert->ref++;
left->ref--;
return(insert);
}
} else if ((insert->exp_left == left) &&
(insert->exp_right == right)) {
insert->ref++;
left->ref--;
right->ref--;
return(insert);
}
}
}
}
entry = xmlExpNewNode(ctxt, type);
if (entry == NULL)
return(NULL);
entry->key = kbase;
if (type == XML_EXP_ATOM) {
entry->exp_str = name;
entry->c_max = 1;
} else if (type == XML_EXP_COUNT) {
entry->exp_min = min;
entry->exp_max = max;
entry->exp_left = left;
if ((min == 0) || (IS_NILLABLE(left)))
entry->info |= XML_EXP_NILABLE;
if (max < 0)
entry->c_max = -1;
else
entry->c_max = max * entry->exp_left->c_max;
} else {
entry->exp_left = left;
entry->exp_right = right;
if (type == XML_EXP_OR) {
if ((IS_NILLABLE(left)) || (IS_NILLABLE(right)))
entry->info |= XML_EXP_NILABLE;
if ((entry->exp_left->c_max == -1) ||
(entry->exp_right->c_max == -1))
entry->c_max = -1;
else if (entry->exp_left->c_max > entry->exp_right->c_max)
entry->c_max = entry->exp_left->c_max;
else
entry->c_max = entry->exp_right->c_max;
} else {
if ((IS_NILLABLE(left)) && (IS_NILLABLE(right)))
entry->info |= XML_EXP_NILABLE;
if ((entry->exp_left->c_max == -1) ||
(entry->exp_right->c_max == -1))
entry->c_max = -1;
else
entry->c_max = entry->exp_left->c_max + entry->exp_right->c_max;
}
}
entry->ref = 1;
if (ctxt->table[key] != NULL)
entry->next = ctxt->table[key];
ctxt->table[key] = entry;
ctxt->nbElems++;
return(entry);
}
/**
* xmlExpFree:
* @ctxt: the expression context
* @exp: the expression
*
* Dereference the expression
*/
void
xmlExpFree(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp) {
if ((exp == NULL) || (exp == forbiddenExp) || (exp == emptyExp))
return;
exp->ref--;
if (exp->ref == 0) {
unsigned short key;
/* Unlink it first from the hash table */
key = exp->key % ctxt->size;
if (ctxt->table[key] == exp) {
ctxt->table[key] = exp->next;
} else {
xmlExpNodePtr tmp;
tmp = ctxt->table[key];
while (tmp != NULL) {
if (tmp->next == exp) {
tmp->next = exp->next;
break;
}
tmp = tmp->next;
}
}
if ((exp->type == XML_EXP_SEQ) || (exp->type == XML_EXP_OR)) {
xmlExpFree(ctxt, exp->exp_left);
xmlExpFree(ctxt, exp->exp_right);
} else if (exp->type == XML_EXP_COUNT) {
xmlExpFree(ctxt, exp->exp_left);
}
xmlFree(exp);
ctxt->nb_nodes--;
}
}
/**
* xmlExpRef:
* @exp: the expression
*
* Increase the reference count of the expression
*/
void
xmlExpRef(xmlExpNodePtr exp) {
if (exp != NULL)
exp->ref++;
}
/**
* xmlExpNewAtom:
* @ctxt: the expression context
* @name: the atom name
* @len: the atom name length in byte (or -1);
*
* Get the atom associated to this name from that context
*
* Returns the node or NULL in case of error
*/
xmlExpNodePtr
xmlExpNewAtom(xmlExpCtxtPtr ctxt, const xmlChar *name, int len) {
if ((ctxt == NULL) || (name == NULL))
return(NULL);
name = xmlDictLookup(ctxt->dict, name, len);
if (name == NULL)
return(NULL);
return(xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, name, 0, 0));
}
/**
* xmlExpNewOr:
* @ctxt: the expression context
* @left: left expression
* @right: right expression
*
* Get the atom associated to the choice @left | @right
* Note that @left and @right are consumed in the operation, to keep
* an handle on them use xmlExpRef() and use xmlExpFree() to release them,
* this is true even in case of failure (unless ctxt == NULL).
*
* Returns the node or NULL in case of error
*/
xmlExpNodePtr
xmlExpNewOr(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) {
if (ctxt == NULL)
return(NULL);
if ((left == NULL) || (right == NULL)) {
xmlExpFree(ctxt, left);
xmlExpFree(ctxt, right);
return(NULL);
}
return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, left, right, NULL, 0, 0));
}
/**
* xmlExpNewSeq:
* @ctxt: the expression context
* @left: left expression
* @right: right expression
*
* Get the atom associated to the sequence @left , @right
* Note that @left and @right are consumed in the operation, to keep
* an handle on them use xmlExpRef() and use xmlExpFree() to release them,
* this is true even in case of failure (unless ctxt == NULL).
*
* Returns the node or NULL in case of error
*/
xmlExpNodePtr
xmlExpNewSeq(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) {
if (ctxt == NULL)
return(NULL);
if ((left == NULL) || (right == NULL)) {
xmlExpFree(ctxt, left);
xmlExpFree(ctxt, right);
return(NULL);
}
return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, left, right, NULL, 0, 0));
}
/**
* xmlExpNewRange:
* @ctxt: the expression context
* @subset: the expression to be repeated
* @min: the lower bound for the repetition
* @max: the upper bound for the repetition, -1 means infinite
*
* Get the atom associated to the range (@subset){@min, @max}
* Note that @subset is consumed in the operation, to keep
* an handle on it use xmlExpRef() and use xmlExpFree() to release it,
* this is true even in case of failure (unless ctxt == NULL).
*
* Returns the node or NULL in case of error
*/
xmlExpNodePtr
xmlExpNewRange(xmlExpCtxtPtr ctxt, xmlExpNodePtr subset, int min, int max) {
if (ctxt == NULL)
return(NULL);
if ((subset == NULL) || (min < 0) || (max < -1) ||
((max >= 0) && (min > max))) {
xmlExpFree(ctxt, subset);
return(NULL);
}
return(xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, subset,
NULL, NULL, min, max));
}
/************************************************************************
* *
* Public API for operations on expressions *
* *
************************************************************************/
static int
xmlExpGetLanguageInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
const xmlChar**list, int len, int nb) {
int tmp, tmp2;
tail:
switch (exp->type) {
case XML_EXP_EMPTY:
return(0);
case XML_EXP_ATOM:
for (tmp = 0;tmp < nb;tmp++)
if (list[tmp] == exp->exp_str)
return(0);
if (nb >= len)
return(-2);
list[nb] = exp->exp_str;
return(1);
case XML_EXP_COUNT:
exp = exp->exp_left;
goto tail;
case XML_EXP_SEQ:
case XML_EXP_OR:
tmp = xmlExpGetLanguageInt(ctxt, exp->exp_left, list, len, nb);
if (tmp < 0)
return(tmp);
tmp2 = xmlExpGetLanguageInt(ctxt, exp->exp_right, list, len,
nb + tmp);
if (tmp2 < 0)
return(tmp2);
return(tmp + tmp2);
}
return(-1);
}
/**
* xmlExpGetLanguage:
* @ctxt: the expression context
* @exp: the expression
* @langList: where to store the tokens
* @len: the allocated length of @list
*
* Find all the strings used in @exp and store them in @list
*
* Returns the number of unique strings found, -1 in case of errors and
* -2 if there is more than @len strings
*/
int
xmlExpGetLanguage(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
const xmlChar**langList, int len) {
if ((ctxt == NULL) || (exp == NULL) || (langList == NULL) || (len <= 0))
return(-1);
return(xmlExpGetLanguageInt(ctxt, exp, langList, len, 0));
}
static int
xmlExpGetStartInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
const xmlChar**list, int len, int nb) {
int tmp, tmp2;
tail:
switch (exp->type) {
case XML_EXP_FORBID:
return(0);
case XML_EXP_EMPTY:
return(0);
case XML_EXP_ATOM:
for (tmp = 0;tmp < nb;tmp++)
if (list[tmp] == exp->exp_str)
return(0);
if (nb >= len)
return(-2);
list[nb] = exp->exp_str;
return(1);
case XML_EXP_COUNT:
exp = exp->exp_left;
goto tail;
case XML_EXP_SEQ:
tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb);
if (tmp < 0)
return(tmp);
if (IS_NILLABLE(exp->exp_left)) {
tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len,
nb + tmp);
if (tmp2 < 0)
return(tmp2);
tmp += tmp2;
}
return(tmp);
case XML_EXP_OR:
tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb);
if (tmp < 0)
return(tmp);
tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len,
nb + tmp);
if (tmp2 < 0)
return(tmp2);
return(tmp + tmp2);
}
return(-1);
}
/**
* xmlExpGetStart:
* @ctxt: the expression context
* @exp: the expression
* @tokList: where to store the tokens
* @len: the allocated length of @list
*
* Find all the strings that appears at the start of the languages
* accepted by @exp and store them in @list. E.g. for (a, b) | c
* it will return the list [a, c]
*
* Returns the number of unique strings found, -1 in case of errors and
* -2 if there is more than @len strings
*/
int
xmlExpGetStart(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
const xmlChar**tokList, int len) {
if ((ctxt == NULL) || (exp == NULL) || (tokList == NULL) || (len <= 0))
return(-1);
return(xmlExpGetStartInt(ctxt, exp, tokList, len, 0));
}
/**
* xmlExpIsNillable:
* @exp: the expression
*
* Finds if the expression is nillable, i.e. if it accepts the empty sequence
*
* Returns 1 if nillable, 0 if not and -1 in case of error
*/
int
xmlExpIsNillable(xmlExpNodePtr exp) {
if (exp == NULL)
return(-1);
return(IS_NILLABLE(exp) != 0);
}
static xmlExpNodePtr
xmlExpStringDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, const xmlChar *str)
{
xmlExpNodePtr ret;
switch (exp->type) {
case XML_EXP_EMPTY:
return(forbiddenExp);
case XML_EXP_FORBID:
return(forbiddenExp);
case XML_EXP_ATOM:
if (exp->exp_str == str) {
#ifdef DEBUG_DERIV
printf("deriv atom: equal => Empty\n");
#endif
ret = emptyExp;
} else {
#ifdef DEBUG_DERIV
printf("deriv atom: mismatch => forbid\n");
#endif
/* TODO wildcards here */
ret = forbiddenExp;
}
return(ret);
case XML_EXP_OR: {
xmlExpNodePtr tmp;
#ifdef DEBUG_DERIV
printf("deriv or: => or(derivs)\n");
#endif
tmp = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
if (tmp == NULL) {
return(NULL);
}
ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str);
if (ret == NULL) {
xmlExpFree(ctxt, tmp);
return(NULL);
}
ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret,
NULL, 0, 0);
return(ret);
}
case XML_EXP_SEQ:
#ifdef DEBUG_DERIV
printf("deriv seq: starting with left\n");
#endif
ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
if (ret == NULL) {
return(NULL);
} else if (ret == forbiddenExp) {
if (IS_NILLABLE(exp->exp_left)) {
#ifdef DEBUG_DERIV
printf("deriv seq: left failed but nillable\n");
#endif
ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str);
}
} else {
#ifdef DEBUG_DERIV
printf("deriv seq: left match => sequence\n");
#endif
exp->exp_right->ref++;
ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, exp->exp_right,
NULL, 0, 0);
}
return(ret);
case XML_EXP_COUNT: {
int min, max;
xmlExpNodePtr tmp;
if (exp->exp_max == 0)
return(forbiddenExp);
ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
if (ret == NULL)
return(NULL);
if (ret == forbiddenExp) {
#ifdef DEBUG_DERIV
printf("deriv count: pattern mismatch => forbid\n");
#endif
return(ret);
}
if (exp->exp_max == 1)
return(ret);
if (exp->exp_max < 0) /* unbounded */
max = -1;
else
max = exp->exp_max - 1;
if (exp->exp_min > 0)
min = exp->exp_min - 1;
else
min = 0;
exp->exp_left->ref++;
tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, NULL,
NULL, min, max);
if (ret == emptyExp) {
#ifdef DEBUG_DERIV
printf("deriv count: match to empty => new count\n");
#endif
return(tmp);
}
#ifdef DEBUG_DERIV
printf("deriv count: match => sequence with new count\n");
#endif
return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, tmp,
NULL, 0, 0));
}
}
return(NULL);
}
/**
* xmlExpStringDerive:
* @ctxt: the expression context
* @exp: the expression
* @str: the string
* @len: the string len in bytes if available
*
* Do one step of Brzozowski derivation of the expression @exp with
* respect to the input string
*
* Returns the resulting expression or NULL in case of internal error
*/
xmlExpNodePtr
xmlExpStringDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
const xmlChar *str, int len) {
const xmlChar *input;
if ((exp == NULL) || (ctxt == NULL) || (str == NULL)) {
return(NULL);
}
/*
* check the string is in the dictionary, if yes use an interned
* copy, otherwise we know it's not an acceptable input
*/
input = xmlDictExists(ctxt->dict, str, len);
if (input == NULL) {
return(forbiddenExp);
}
return(xmlExpStringDeriveInt(ctxt, exp, input));
}
static int
xmlExpCheckCard(xmlExpNodePtr exp, xmlExpNodePtr sub) {
int ret = 1;
if (sub->c_max == -1) {
if (exp->c_max != -1)
ret = 0;
} else if ((exp->c_max >= 0) && (exp->c_max < sub->c_max)) {
ret = 0;
}
#if 0
if ((IS_NILLABLE(sub)) && (!IS_NILLABLE(exp)))
ret = 0;
#endif
return(ret);
}
static xmlExpNodePtr xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
xmlExpNodePtr sub);
/**
* xmlExpDivide:
* @ctxt: the expressions context
* @exp: the englobing expression
* @sub: the subexpression
* @mult: the multiple expression
* @remain: the remain from the derivation of the multiple
*
* Check if exp is a multiple of sub, i.e. if there is a finite number n
* so that sub{n} subsume exp
*
* Returns the multiple value if successful, 0 if it is not a multiple
* and -1 in case of internal error.
*/
static int
xmlExpDivide(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub,
xmlExpNodePtr *mult, xmlExpNodePtr *remain) {
int i;
xmlExpNodePtr tmp, tmp2;
if (mult != NULL) *mult = NULL;
if (remain != NULL) *remain = NULL;
if (exp->c_max == -1) return(0);
if (IS_NILLABLE(exp) && (!IS_NILLABLE(sub))) return(0);
for (i = 1;i <= exp->c_max;i++) {
sub->ref++;
tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT,
sub, NULL, NULL, i, i);
if (tmp == NULL) {
return(-1);
}
if (!xmlExpCheckCard(tmp, exp)) {
xmlExpFree(ctxt, tmp);
continue;
}
tmp2 = xmlExpExpDeriveInt(ctxt, tmp, exp);
if (tmp2 == NULL) {
xmlExpFree(ctxt, tmp);
return(-1);
}
if ((tmp2 != forbiddenExp) && (IS_NILLABLE(tmp2))) {
if (remain != NULL)
*remain = tmp2;
else
xmlExpFree(ctxt, tmp2);
if (mult != NULL)
*mult = tmp;
else
xmlExpFree(ctxt, tmp);
#ifdef DEBUG_DERIV
printf("Divide succeeded %d\n", i);
#endif
return(i);
}
xmlExpFree(ctxt, tmp);
xmlExpFree(ctxt, tmp2);
}
#ifdef DEBUG_DERIV
printf("Divide failed\n");
#endif
return(0);
}
/**
* xmlExpExpDeriveInt:
* @ctxt: the expressions context
* @exp: the englobing expression
* @sub: the subexpression
*
* Try to do a step of Brzozowski derivation but at a higher level
* the input being a subexpression.
*
* Returns the resulting expression or NULL in case of internal error
*/
static xmlExpNodePtr
xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
xmlExpNodePtr ret, tmp, tmp2, tmp3;
const xmlChar **tab;
int len, i;
/*
* In case of equality and if the expression can only consume a finite
* amount, then the derivation is empty
*/
if ((exp == sub) && (exp->c_max >= 0)) {
#ifdef DEBUG_DERIV
printf("Equal(exp, sub) and finite -> Empty\n");
#endif
return(emptyExp);
}
/*
* decompose sub sequence first
*/
if (sub->type == XML_EXP_EMPTY) {
#ifdef DEBUG_DERIV
printf("Empty(sub) -> Empty\n");
#endif
exp->ref++;
return(exp);
}
if (sub->type == XML_EXP_SEQ) {
#ifdef DEBUG_DERIV
printf("Seq(sub) -> decompose\n");
#endif
tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left);
if (tmp == NULL)
return(NULL);
if (tmp == forbiddenExp)
return(tmp);
ret = xmlExpExpDeriveInt(ctxt, tmp, sub->exp_right);
xmlExpFree(ctxt, tmp);
return(ret);
}
if (sub->type == XML_EXP_OR) {
#ifdef DEBUG_DERIV
printf("Or(sub) -> decompose\n");
#endif
tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left);
if (tmp == forbiddenExp)
return(tmp);
if (tmp == NULL)
return(NULL);
ret = xmlExpExpDeriveInt(ctxt, exp, sub->exp_right);
if ((ret == NULL) || (ret == forbiddenExp)) {
xmlExpFree(ctxt, tmp);
return(ret);
}
return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret, NULL, 0, 0));
}
if (!xmlExpCheckCard(exp, sub)) {
#ifdef DEBUG_DERIV
printf("CheckCard(exp, sub) failed -> Forbid\n");
#endif
return(forbiddenExp);
}
switch (exp->type) {
case XML_EXP_EMPTY:
if (sub == emptyExp)
return(emptyExp);
#ifdef DEBUG_DERIV
printf("Empty(exp) -> Forbid\n");
#endif
return(forbiddenExp);
case XML_EXP_FORBID:
#ifdef DEBUG_DERIV
printf("Forbid(exp) -> Forbid\n");
#endif
return(forbiddenExp);
case XML_EXP_ATOM:
if (sub->type == XML_EXP_ATOM) {
/* TODO: handle wildcards */
if (exp->exp_str == sub->exp_str) {
#ifdef DEBUG_DERIV
printf("Atom match -> Empty\n");
#endif
return(emptyExp);
}
#ifdef DEBUG_DERIV
printf("Atom mismatch -> Forbid\n");
#endif
return(forbiddenExp);
}
if ((sub->type == XML_EXP_COUNT) &&
(sub->exp_max == 1) &&
(sub->exp_left->type == XML_EXP_ATOM)) {
/* TODO: handle wildcards */
if (exp->exp_str == sub->exp_left->exp_str) {
#ifdef DEBUG_DERIV
printf("Atom match -> Empty\n");
#endif
return(emptyExp);
}
#ifdef DEBUG_DERIV
printf("Atom mismatch -> Forbid\n");
#endif
return(forbiddenExp);
}
#ifdef DEBUG_DERIV
printf("Complex exp vs Atom -> Forbid\n");
#endif
return(forbiddenExp);
case XML_EXP_SEQ:
/* try to get the sequence consumed only if possible */
if (xmlExpCheckCard(exp->exp_left, sub)) {
/* See if the sequence can be consumed directly */
#ifdef DEBUG_DERIV
printf("Seq trying left only\n");
#endif
ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
if ((ret != forbiddenExp) && (ret != NULL)) {
#ifdef DEBUG_DERIV
printf("Seq trying left only worked\n");
#endif
/*
* TODO: assumption here that we are determinist
* i.e. we won't get to a nillable exp left
* subset which could be matched by the right
* part too.
* e.g.: (a | b)+,(a | c) and 'a+,a'
*/
exp->exp_right->ref++;
return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret,
exp->exp_right, NULL, 0, 0));
}
#ifdef DEBUG_DERIV
} else {
printf("Seq: left too short\n");
#endif
}
/* Try instead to decompose */
if (sub->type == XML_EXP_COUNT) {
int min, max;
#ifdef DEBUG_DERIV
printf("Seq: sub is a count\n");
#endif
ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left);
if (ret == NULL)
return(NULL);
if (ret != forbiddenExp) {
#ifdef DEBUG_DERIV
printf("Seq , Count match on left\n");
#endif
if (sub->exp_max < 0)
max = -1;
else
max = sub->exp_max -1;
if (sub->exp_min > 0)
min = sub->exp_min -1;
else
min = 0;
exp->exp_right->ref++;
tmp = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret,
exp->exp_right, NULL, 0, 0);
if (tmp == NULL)
return(NULL);
sub->exp_left->ref++;
tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT,
sub->exp_left, NULL, NULL, min, max);
if (tmp2 == NULL) {
xmlExpFree(ctxt, tmp);
return(NULL);
}
ret = xmlExpExpDeriveInt(ctxt, tmp, tmp2);
xmlExpFree(ctxt, tmp);
xmlExpFree(ctxt, tmp2);
return(ret);
}
}
/* we made no progress on structured operations */
break;
case XML_EXP_OR:
#ifdef DEBUG_DERIV
printf("Or , trying both side\n");
#endif
ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
if (ret == NULL)
return(NULL);
tmp = xmlExpExpDeriveInt(ctxt, exp->exp_right, sub);
if (tmp == NULL) {
xmlExpFree(ctxt, ret);
return(NULL);
}
return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp, NULL, 0, 0));
case XML_EXP_COUNT: {
int min, max;
if (sub->type == XML_EXP_COUNT) {
/*
* Try to see if the loop is completely subsumed
*/
tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left);
if (tmp == NULL)
return(NULL);
if (tmp == forbiddenExp) {
int mult;
#ifdef DEBUG_DERIV
printf("Count, Count inner don't subsume\n");
#endif
mult = xmlExpDivide(ctxt, sub->exp_left, exp->exp_left,
NULL, &tmp);
if (mult <= 0) {
#ifdef DEBUG_DERIV
printf("Count, Count not multiple => forbidden\n");
#endif
return(forbiddenExp);
}
if (sub->exp_max == -1) {
max = -1;
if (exp->exp_max == -1) {
if (exp->exp_min <= sub->exp_min * mult)
min = 0;
else
min = exp->exp_min - sub->exp_min * mult;
} else {
#ifdef DEBUG_DERIV
printf("Count, Count finite can't subsume infinite\n");
#endif
xmlExpFree(ctxt, tmp);
return(forbiddenExp);
}
} else {
if (exp->exp_max == -1) {
#ifdef DEBUG_DERIV
printf("Infinite loop consume mult finite loop\n");
#endif
if (exp->exp_min > sub->exp_min * mult) {
max = -1;
min = exp->exp_min - sub->exp_min * mult;
} else {
max = -1;
min = 0;
}
} else {
if (exp->exp_max < sub->exp_max * mult) {
#ifdef DEBUG_DERIV
printf("loops max mult mismatch => forbidden\n");
#endif
xmlExpFree(ctxt, tmp);
return(forbiddenExp);
}
if (sub->exp_max * mult > exp->exp_min)
min = 0;
else
min = exp->exp_min - sub->exp_max * mult;
max = exp->exp_max - sub->exp_max * mult;
}
}
} else if (!IS_NILLABLE(tmp)) {
/*
* TODO: loop here to try to grow if working on finite
* blocks.
*/
#ifdef DEBUG_DERIV
printf("Count, Count remain not nillable => forbidden\n");
#endif
xmlExpFree(ctxt, tmp);
return(forbiddenExp);
} else if (sub->exp_max == -1) {
if (exp->exp_max == -1) {
if (exp->exp_min <= sub->exp_min) {
#ifdef DEBUG_DERIV
printf("Infinite loops Okay => COUNT(0,Inf)\n");
#endif
max = -1;
min = 0;
} else {
#ifdef DEBUG_DERIV
printf("Infinite loops min => Count(X,Inf)\n");
#endif
max = -1;
min = exp->exp_min - sub->exp_min;
}
} else if (exp->exp_min > sub->exp_min) {
#ifdef DEBUG_DERIV
printf("loops min mismatch 1 => forbidden ???\n");
#endif
xmlExpFree(ctxt, tmp);
return(forbiddenExp);
} else {
max = -1;
min = 0;
}
} else {
if (exp->exp_max == -1) {
#ifdef DEBUG_DERIV
printf("Infinite loop consume finite loop\n");
#endif
if (exp->exp_min > sub->exp_min) {
max = -1;
min = exp->exp_min - sub->exp_min;
} else {
max = -1;
min = 0;
}
} else {
if (exp->exp_max < sub->exp_max) {
#ifdef DEBUG_DERIV
printf("loops max mismatch => forbidden\n");
#endif
xmlExpFree(ctxt, tmp);
return(forbiddenExp);
}
if (sub->exp_max > exp->exp_min)
min = 0;
else
min = exp->exp_min - sub->exp_max;
max = exp->exp_max - sub->exp_max;
}
}
#ifdef DEBUG_DERIV
printf("loops match => SEQ(COUNT())\n");
#endif
exp->exp_left->ref++;
tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left,
NULL, NULL, min, max);
if (tmp2 == NULL) {
return(NULL);
}
ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2,
NULL, 0, 0);
return(ret);
}
tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
if (tmp == NULL)
return(NULL);
if (tmp == forbiddenExp) {
#ifdef DEBUG_DERIV
printf("loop mismatch => forbidden\n");
#endif
return(forbiddenExp);
}
if (exp->exp_min > 0)
min = exp->exp_min - 1;
else
min = 0;
if (exp->exp_max < 0)
max = -1;
else
max = exp->exp_max - 1;
#ifdef DEBUG_DERIV
printf("loop match => SEQ(COUNT())\n");
#endif
exp->exp_left->ref++;
tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left,
NULL, NULL, min, max);
if (tmp2 == NULL)
return(NULL);
ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2,
NULL, 0, 0);
return(ret);
}
}
#ifdef DEBUG_DERIV
printf("Fallback to derivative\n");
#endif
if (IS_NILLABLE(sub)) {
if (!(IS_NILLABLE(exp)))
return(forbiddenExp);
else
ret = emptyExp;
} else
ret = NULL;
/*
* here the structured derivation made no progress so
* we use the default token based derivation to force one more step
*/
if (ctxt->tabSize == 0)
ctxt->tabSize = 40;
tab = (const xmlChar **) xmlMalloc(ctxt->tabSize *
sizeof(const xmlChar *));
if (tab == NULL) {
return(NULL);
}
/*
* collect all the strings accepted by the subexpression on input
*/
len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0);
while (len < 0) {
const xmlChar **temp;
temp = (const xmlChar **) xmlRealloc((xmlChar **) tab, ctxt->tabSize * 2 *
sizeof(const xmlChar *));
if (temp == NULL) {
xmlFree((xmlChar **) tab);
return(NULL);
}
tab = temp;
ctxt->tabSize *= 2;
len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0);
}
for (i = 0;i < len;i++) {
tmp = xmlExpStringDeriveInt(ctxt, exp, tab[i]);
if ((tmp == NULL) || (tmp == forbiddenExp)) {
xmlExpFree(ctxt, ret);
xmlFree((xmlChar **) tab);
return(tmp);
}
tmp2 = xmlExpStringDeriveInt(ctxt, sub, tab[i]);
if ((tmp2 == NULL) || (tmp2 == forbiddenExp)) {
xmlExpFree(ctxt, tmp);
xmlExpFree(ctxt, ret);
xmlFree((xmlChar **) tab);
return(tmp);
}
tmp3 = xmlExpExpDeriveInt(ctxt, tmp, tmp2);
xmlExpFree(ctxt, tmp);
xmlExpFree(ctxt, tmp2);
if ((tmp3 == NULL) || (tmp3 == forbiddenExp)) {
xmlExpFree(ctxt, ret);
xmlFree((xmlChar **) tab);
return(tmp3);
}
if (ret == NULL)
ret = tmp3;
else {
ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp3, NULL, 0, 0);
if (ret == NULL) {
xmlFree((xmlChar **) tab);
return(NULL);
}
}
}
xmlFree((xmlChar **) tab);
return(ret);
}
/**
* xmlExpExpDerive:
* @ctxt: the expressions context
* @exp: the englobing expression
* @sub: the subexpression
*
* Evaluates the expression resulting from @exp consuming a sub expression @sub
* Based on algebraic derivation and sometimes direct Brzozowski derivation
* it usually takes less than linear time and can handle expressions generating
* infinite languages.
*
* Returns the resulting expression or NULL in case of internal error, the
* result must be freed
*/
xmlExpNodePtr
xmlExpExpDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
if ((exp == NULL) || (ctxt == NULL) || (sub == NULL))
return(NULL);
/*
* O(1) speedups
*/
if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) {
#ifdef DEBUG_DERIV
printf("Sub nillable and not exp : can't subsume\n");
#endif
return(forbiddenExp);
}
if (xmlExpCheckCard(exp, sub) == 0) {
#ifdef DEBUG_DERIV
printf("sub generate longer sequences than exp : can't subsume\n");
#endif
return(forbiddenExp);
}
return(xmlExpExpDeriveInt(ctxt, exp, sub));
}
/**
* xmlExpSubsume:
* @ctxt: the expressions context
* @exp: the englobing expression
* @sub: the subexpression
*
* Check whether @exp accepts all the languages accepted by @sub
* the input being a subexpression.
*
* Returns 1 if true 0 if false and -1 in case of failure.
*/
int
xmlExpSubsume(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
xmlExpNodePtr tmp;
if ((exp == NULL) || (ctxt == NULL) || (sub == NULL))
return(-1);
/*
* TODO: speedup by checking the language of sub is a subset of the
* language of exp
*/
/*
* O(1) speedups
*/
if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) {
#ifdef DEBUG_DERIV
printf("Sub nillable and not exp : can't subsume\n");
#endif
return(0);
}
if (xmlExpCheckCard(exp, sub) == 0) {
#ifdef DEBUG_DERIV
printf("sub generate longer sequences than exp : can't subsume\n");
#endif
return(0);
}
tmp = xmlExpExpDeriveInt(ctxt, exp, sub);
#ifdef DEBUG_DERIV
printf("Result derivation :\n");
PRINT_EXP(tmp);
#endif
if (tmp == NULL)
return(-1);
if (tmp == forbiddenExp)
return(0);
if (tmp == emptyExp)
return(1);
if ((tmp != NULL) && (IS_NILLABLE(tmp))) {
xmlExpFree(ctxt, tmp);
return(1);
}
xmlExpFree(ctxt, tmp);
return(0);
}
/************************************************************************
* *
* Parsing expression *
* *
************************************************************************/
static xmlExpNodePtr xmlExpParseExpr(xmlExpCtxtPtr ctxt);
#undef CUR
#define CUR (*ctxt->cur)
#undef NEXT
#define NEXT ctxt->cur++;
#undef IS_BLANK
#define IS_BLANK(c) ((c == ' ') || (c == '\n') || (c == '\r') || (c == '\t'))
#define SKIP_BLANKS while (IS_BLANK(*ctxt->cur)) ctxt->cur++;
static int
xmlExpParseNumber(xmlExpCtxtPtr ctxt) {
int ret = 0;
SKIP_BLANKS
if (CUR == '*') {
NEXT
return(-1);
}
if ((CUR < '0') || (CUR > '9'))
return(-1);
while ((CUR >= '0') && (CUR <= '9')) {
ret = ret * 10 + (CUR - '0');
NEXT
}
return(ret);
}
static xmlExpNodePtr
xmlExpParseOr(xmlExpCtxtPtr ctxt) {
const char *base;
xmlExpNodePtr ret;
const xmlChar *val;
SKIP_BLANKS
base = ctxt->cur;
if (*ctxt->cur == '(') {
NEXT
ret = xmlExpParseExpr(ctxt);
SKIP_BLANKS
if (*ctxt->cur != ')') {
fprintf(stderr, "unbalanced '(' : %s\n", base);
xmlExpFree(ctxt, ret);
return(NULL);
}
NEXT;
SKIP_BLANKS
goto parse_quantifier;
}
while ((CUR != 0) && (!(IS_BLANK(CUR))) && (CUR != '(') &&
(CUR != ')') && (CUR != '|') && (CUR != ',') && (CUR != '{') &&
(CUR != '*') && (CUR != '+') && (CUR != '?') && (CUR != '}'))
NEXT;
val = xmlDictLookup(ctxt->dict, BAD_CAST base, ctxt->cur - base);
if (val == NULL)
return(NULL);
ret = xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, val, 0, 0);
if (ret == NULL)
return(NULL);
SKIP_BLANKS
parse_quantifier:
if (CUR == '{') {
int min, max;
NEXT
min = xmlExpParseNumber(ctxt);
if (min < 0) {
xmlExpFree(ctxt, ret);
return(NULL);
}
SKIP_BLANKS
if (CUR == ',') {
NEXT
max = xmlExpParseNumber(ctxt);
SKIP_BLANKS
} else
max = min;
if (CUR != '}') {
xmlExpFree(ctxt, ret);
return(NULL);
}
NEXT
ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
min, max);
SKIP_BLANKS
} else if (CUR == '?') {
NEXT
ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
0, 1);
SKIP_BLANKS
} else if (CUR == '+') {
NEXT
ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
1, -1);
SKIP_BLANKS
} else if (CUR == '*') {
NEXT
ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
0, -1);
SKIP_BLANKS
}
return(ret);
}
static xmlExpNodePtr
xmlExpParseSeq(xmlExpCtxtPtr ctxt) {
xmlExpNodePtr ret, right;
ret = xmlExpParseOr(ctxt);
SKIP_BLANKS
while (CUR == '|') {
NEXT
right = xmlExpParseOr(ctxt);
if (right == NULL) {
xmlExpFree(ctxt, ret);
return(NULL);
}
ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, right, NULL, 0, 0);
if (ret == NULL)
return(NULL);
}
return(ret);
}
static xmlExpNodePtr
xmlExpParseExpr(xmlExpCtxtPtr ctxt) {
xmlExpNodePtr ret, right;
ret = xmlExpParseSeq(ctxt);
SKIP_BLANKS
while (CUR == ',') {
NEXT
right = xmlExpParseSeq(ctxt);
if (right == NULL) {
xmlExpFree(ctxt, ret);
return(NULL);
}
ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, right, NULL, 0, 0);
if (ret == NULL)
return(NULL);
}
return(ret);
}
/**
* xmlExpParse:
* @ctxt: the expressions context
* @expr: the 0 terminated string
*
* Minimal parser for regexps, it understand the following constructs
* - string terminals
* - choice operator |
* - sequence operator ,
* - subexpressions (...)
* - usual cardinality operators + * and ?
* - finite sequences { min, max }
* - infinite sequences { min, * }
* There is minimal checkings made especially no checking on strings values
*
* Returns a new expression or NULL in case of failure
*/
xmlExpNodePtr
xmlExpParse(xmlExpCtxtPtr ctxt, const char *expr) {
xmlExpNodePtr ret;
ctxt->expr = expr;
ctxt->cur = expr;
ret = xmlExpParseExpr(ctxt);
SKIP_BLANKS
if (*ctxt->cur != 0) {
xmlExpFree(ctxt, ret);
return(NULL);
}
return(ret);
}
static void
xmlExpDumpInt(xmlBufferPtr buf, xmlExpNodePtr expr, int glob) {
xmlExpNodePtr c;
if (expr == NULL) return;
if (glob) xmlBufferWriteChar(buf, "(");
switch (expr->type) {
case XML_EXP_EMPTY:
xmlBufferWriteChar(buf, "empty");
break;
case XML_EXP_FORBID:
xmlBufferWriteChar(buf, "forbidden");
break;
case XML_EXP_ATOM:
xmlBufferWriteCHAR(buf, expr->exp_str);
break;
case XML_EXP_SEQ:
c = expr->exp_left;
if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
xmlExpDumpInt(buf, c, 1);
else
xmlExpDumpInt(buf, c, 0);
xmlBufferWriteChar(buf, " , ");
c = expr->exp_right;
if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
xmlExpDumpInt(buf, c, 1);
else
xmlExpDumpInt(buf, c, 0);
break;
case XML_EXP_OR:
c = expr->exp_left;
if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
xmlExpDumpInt(buf, c, 1);
else
xmlExpDumpInt(buf, c, 0);
xmlBufferWriteChar(buf, " | ");
c = expr->exp_right;
if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
xmlExpDumpInt(buf, c, 1);
else
xmlExpDumpInt(buf, c, 0);
break;
case XML_EXP_COUNT: {
char rep[40];
c = expr->exp_left;
if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
xmlExpDumpInt(buf, c, 1);
else
xmlExpDumpInt(buf, c, 0);
if ((expr->exp_min == 0) && (expr->exp_max == 1)) {
rep[0] = '?';
rep[1] = 0;
} else if ((expr->exp_min == 0) && (expr->exp_max == -1)) {
rep[0] = '*';
rep[1] = 0;
} else if ((expr->exp_min == 1) && (expr->exp_max == -1)) {
rep[0] = '+';
rep[1] = 0;
} else if (expr->exp_max == expr->exp_min) {
snprintf(rep, 39, "{%d}", expr->exp_min);
} else if (expr->exp_max < 0) {
snprintf(rep, 39, "{%d,inf}", expr->exp_min);
} else {
snprintf(rep, 39, "{%d,%d}", expr->exp_min, expr->exp_max);
}
rep[39] = 0;
xmlBufferWriteChar(buf, rep);
break;
}
default:
fprintf(stderr, "Error in tree\n");
}
if (glob)
xmlBufferWriteChar(buf, ")");
}
/**
* xmlExpDump:
* @buf: a buffer to receive the output
* @expr: the compiled expression
*
* Serialize the expression as compiled to the buffer
*/
void
xmlExpDump(xmlBufferPtr buf, xmlExpNodePtr expr) {
if ((buf == NULL) || (expr == NULL))
return;
xmlExpDumpInt(buf, expr, 0);
}
/**
* xmlExpMaxToken:
* @expr: a compiled expression
*
* Indicate the maximum number of input a expression can accept
*
* Returns the maximum length or -1 in case of error
*/
int
xmlExpMaxToken(xmlExpNodePtr expr) {
if (expr == NULL)
return(-1);
return(expr->c_max);
}
/**
* xmlExpCtxtNbNodes:
* @ctxt: an expression context
*
* Debugging facility provides the number of allocated nodes at a that point
*
* Returns the number of nodes in use or -1 in case of error
*/
int
xmlExpCtxtNbNodes(xmlExpCtxtPtr ctxt) {
if (ctxt == NULL)
return(-1);
return(ctxt->nb_nodes);
}
/**
* xmlExpCtxtNbCons:
* @ctxt: an expression context
*
* Debugging facility provides the number of allocated nodes over lifetime
*
* Returns the number of nodes ever allocated or -1 in case of error
*/
int
xmlExpCtxtNbCons(xmlExpCtxtPtr ctxt) {
if (ctxt == NULL)
return(-1);
return(ctxt->nb_cons);
}
#endif /* LIBXML_EXPR_ENABLED */
#define bottom_xmlregexp
#include "elfgcchack.h"
#endif /* LIBXML_REGEXP_ENABLED */