mirror of
https://gitlab.gnome.org/GNOME/libxml2.git
synced 2024-12-28 07:21:26 +03:00
ea7751d53b
* testReader.c xmlreader.c valid.c include/libxml/tree.h include/libxml/valid.h include/libxml/xmlreader.h: working on DTD validation on top of xml reader interfaces. Allows to validate arbitrary large instances. This required some extensions to the valid module interface and augmenting the size of xmlID and xmlRef structs a bit. * uri.c xmlregexp.c: simple cleanup. Daniel
4320 lines
110 KiB
C
4320 lines
110 KiB
C
/*
|
|
* regexp.c: generic and extensible Regular Expression engine
|
|
*
|
|
* Basically designed with the purpose of compiling regexps for
|
|
* the variety of validation/shemas mechanisms now available in
|
|
* XML related specifications thise includes:
|
|
* - XML-1.0 DTD validation
|
|
* - XML Schemas structure part 1
|
|
* - XML Schemas Datatypes part 2 especially Appendix F
|
|
* - RELAX-NG/TREX i.e. the counter proposal
|
|
*
|
|
* See Copyright for the status of this software.
|
|
*
|
|
* Daniel Veillard <veillard@redhat.com>
|
|
*/
|
|
|
|
#define IN_LIBXML
|
|
#include "libxml.h"
|
|
|
|
#ifdef LIBXML_REGEXP_ENABLED
|
|
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
#include <libxml/tree.h>
|
|
#include <libxml/parserInternals.h>
|
|
#include <libxml/xmlregexp.h>
|
|
#include <libxml/xmlautomata.h>
|
|
#include <libxml/xmlunicode.h>
|
|
|
|
/* #define DEBUG_REGEXP_GRAPH */
|
|
/* #define DEBUG_REGEXP_EXEC */
|
|
/* #define DEBUG_PUSH */
|
|
/* #define DEBUG_COMPACTION */
|
|
|
|
#define ERROR(str) ctxt->error = 1; \
|
|
xmlGenericError(xmlGenericErrorContext, "Regexp: %s: %s\n", str, ctxt->cur)
|
|
#define NEXT ctxt->cur++
|
|
#define CUR (*(ctxt->cur))
|
|
#define NXT(index) (ctxt->cur[index])
|
|
|
|
#define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l)
|
|
#define NEXTL(l) ctxt->cur += l;
|
|
|
|
/**
|
|
* TODO:
|
|
*
|
|
* macro to flag unimplemented blocks
|
|
*/
|
|
#define TODO \
|
|
xmlGenericError(xmlGenericErrorContext, \
|
|
"Unimplemented block at %s:%d\n", \
|
|
__FILE__, __LINE__);
|
|
|
|
|
|
/************************************************************************
|
|
* *
|
|
* Datatypes and structures *
|
|
* *
|
|
************************************************************************/
|
|
|
|
typedef enum {
|
|
XML_REGEXP_EPSILON = 1,
|
|
XML_REGEXP_CHARVAL,
|
|
XML_REGEXP_RANGES,
|
|
XML_REGEXP_SUBREG,
|
|
XML_REGEXP_STRING,
|
|
XML_REGEXP_ANYCHAR, /* . */
|
|
XML_REGEXP_ANYSPACE, /* \s */
|
|
XML_REGEXP_NOTSPACE, /* \S */
|
|
XML_REGEXP_INITNAME, /* \l */
|
|
XML_REGEXP_NOTINITNAME, /* \l */
|
|
XML_REGEXP_NAMECHAR, /* \c */
|
|
XML_REGEXP_NOTNAMECHAR, /* \C */
|
|
XML_REGEXP_DECIMAL, /* \d */
|
|
XML_REGEXP_NOTDECIMAL, /* \d */
|
|
XML_REGEXP_REALCHAR, /* \w */
|
|
XML_REGEXP_NOTREALCHAR, /* \w */
|
|
XML_REGEXP_LETTER,
|
|
XML_REGEXP_LETTER_UPPERCASE,
|
|
XML_REGEXP_LETTER_LOWERCASE,
|
|
XML_REGEXP_LETTER_TITLECASE,
|
|
XML_REGEXP_LETTER_MODIFIER,
|
|
XML_REGEXP_LETTER_OTHERS,
|
|
XML_REGEXP_MARK,
|
|
XML_REGEXP_MARK_NONSPACING,
|
|
XML_REGEXP_MARK_SPACECOMBINING,
|
|
XML_REGEXP_MARK_ENCLOSING,
|
|
XML_REGEXP_NUMBER,
|
|
XML_REGEXP_NUMBER_DECIMAL,
|
|
XML_REGEXP_NUMBER_LETTER,
|
|
XML_REGEXP_NUMBER_OTHERS,
|
|
XML_REGEXP_PUNCT,
|
|
XML_REGEXP_PUNCT_CONNECTOR,
|
|
XML_REGEXP_PUNCT_DASH,
|
|
XML_REGEXP_PUNCT_OPEN,
|
|
XML_REGEXP_PUNCT_CLOSE,
|
|
XML_REGEXP_PUNCT_INITQUOTE,
|
|
XML_REGEXP_PUNCT_FINQUOTE,
|
|
XML_REGEXP_PUNCT_OTHERS,
|
|
XML_REGEXP_SEPAR,
|
|
XML_REGEXP_SEPAR_SPACE,
|
|
XML_REGEXP_SEPAR_LINE,
|
|
XML_REGEXP_SEPAR_PARA,
|
|
XML_REGEXP_SYMBOL,
|
|
XML_REGEXP_SYMBOL_MATH,
|
|
XML_REGEXP_SYMBOL_CURRENCY,
|
|
XML_REGEXP_SYMBOL_MODIFIER,
|
|
XML_REGEXP_SYMBOL_OTHERS,
|
|
XML_REGEXP_OTHER,
|
|
XML_REGEXP_OTHER_CONTROL,
|
|
XML_REGEXP_OTHER_FORMAT,
|
|
XML_REGEXP_OTHER_PRIVATE,
|
|
XML_REGEXP_OTHER_NA,
|
|
XML_REGEXP_BLOCK_NAME
|
|
} xmlRegAtomType;
|
|
|
|
typedef enum {
|
|
XML_REGEXP_QUANT_EPSILON = 1,
|
|
XML_REGEXP_QUANT_ONCE,
|
|
XML_REGEXP_QUANT_OPT,
|
|
XML_REGEXP_QUANT_MULT,
|
|
XML_REGEXP_QUANT_PLUS,
|
|
XML_REGEXP_QUANT_ONCEONLY,
|
|
XML_REGEXP_QUANT_ALL,
|
|
XML_REGEXP_QUANT_RANGE
|
|
} xmlRegQuantType;
|
|
|
|
typedef enum {
|
|
XML_REGEXP_START_STATE = 1,
|
|
XML_REGEXP_FINAL_STATE,
|
|
XML_REGEXP_TRANS_STATE
|
|
} xmlRegStateType;
|
|
|
|
typedef enum {
|
|
XML_REGEXP_MARK_NORMAL = 0,
|
|
XML_REGEXP_MARK_START,
|
|
XML_REGEXP_MARK_VISITED
|
|
} xmlRegMarkedType;
|
|
|
|
typedef struct _xmlRegRange xmlRegRange;
|
|
typedef xmlRegRange *xmlRegRangePtr;
|
|
|
|
struct _xmlRegRange {
|
|
int neg;
|
|
xmlRegAtomType type;
|
|
int start;
|
|
int end;
|
|
xmlChar *blockName;
|
|
};
|
|
|
|
typedef struct _xmlRegAtom xmlRegAtom;
|
|
typedef xmlRegAtom *xmlRegAtomPtr;
|
|
|
|
typedef struct _xmlAutomataState xmlRegState;
|
|
typedef xmlRegState *xmlRegStatePtr;
|
|
|
|
struct _xmlRegAtom {
|
|
int no;
|
|
xmlRegAtomType type;
|
|
xmlRegQuantType quant;
|
|
int min;
|
|
int max;
|
|
|
|
void *valuep;
|
|
void *valuep2;
|
|
int neg;
|
|
int codepoint;
|
|
xmlRegStatePtr start;
|
|
xmlRegStatePtr stop;
|
|
int maxRanges;
|
|
int nbRanges;
|
|
xmlRegRangePtr *ranges;
|
|
void *data;
|
|
};
|
|
|
|
typedef struct _xmlRegCounter xmlRegCounter;
|
|
typedef xmlRegCounter *xmlRegCounterPtr;
|
|
|
|
struct _xmlRegCounter {
|
|
int min;
|
|
int max;
|
|
};
|
|
|
|
typedef struct _xmlRegTrans xmlRegTrans;
|
|
typedef xmlRegTrans *xmlRegTransPtr;
|
|
|
|
struct _xmlRegTrans {
|
|
xmlRegAtomPtr atom;
|
|
int to;
|
|
int counter;
|
|
int count;
|
|
};
|
|
|
|
struct _xmlAutomataState {
|
|
xmlRegStateType type;
|
|
xmlRegMarkedType mark;
|
|
xmlRegMarkedType reached;
|
|
int no;
|
|
|
|
int maxTrans;
|
|
int nbTrans;
|
|
xmlRegTrans *trans;
|
|
};
|
|
|
|
typedef struct _xmlAutomata xmlRegParserCtxt;
|
|
typedef xmlRegParserCtxt *xmlRegParserCtxtPtr;
|
|
|
|
struct _xmlAutomata {
|
|
xmlChar *string;
|
|
xmlChar *cur;
|
|
|
|
int error;
|
|
int neg;
|
|
|
|
xmlRegStatePtr start;
|
|
xmlRegStatePtr end;
|
|
xmlRegStatePtr state;
|
|
|
|
xmlRegAtomPtr atom;
|
|
|
|
int maxAtoms;
|
|
int nbAtoms;
|
|
xmlRegAtomPtr *atoms;
|
|
|
|
int maxStates;
|
|
int nbStates;
|
|
xmlRegStatePtr *states;
|
|
|
|
int maxCounters;
|
|
int nbCounters;
|
|
xmlRegCounter *counters;
|
|
|
|
int determinist;
|
|
};
|
|
|
|
struct _xmlRegexp {
|
|
xmlChar *string;
|
|
int nbStates;
|
|
xmlRegStatePtr *states;
|
|
int nbAtoms;
|
|
xmlRegAtomPtr *atoms;
|
|
int nbCounters;
|
|
xmlRegCounter *counters;
|
|
int determinist;
|
|
/*
|
|
* That's the compact form for determinists automatas
|
|
*/
|
|
int nbstates;
|
|
int *compact;
|
|
void **transdata;
|
|
int nbstrings;
|
|
xmlChar **stringMap;
|
|
};
|
|
|
|
typedef struct _xmlRegExecRollback xmlRegExecRollback;
|
|
typedef xmlRegExecRollback *xmlRegExecRollbackPtr;
|
|
|
|
struct _xmlRegExecRollback {
|
|
xmlRegStatePtr state;/* the current state */
|
|
int index; /* the index in the input stack */
|
|
int nextbranch; /* the next transition to explore in that state */
|
|
int *counts; /* save the automate state if it has some */
|
|
};
|
|
|
|
typedef struct _xmlRegInputToken xmlRegInputToken;
|
|
typedef xmlRegInputToken *xmlRegInputTokenPtr;
|
|
|
|
struct _xmlRegInputToken {
|
|
xmlChar *value;
|
|
void *data;
|
|
};
|
|
|
|
struct _xmlRegExecCtxt {
|
|
int status; /* execution status != 0 indicate an error */
|
|
int determinist; /* did we found an inderterministic behaviour */
|
|
xmlRegexpPtr comp; /* the compiled regexp */
|
|
xmlRegExecCallbacks callback;
|
|
void *data;
|
|
|
|
xmlRegStatePtr state;/* the current state */
|
|
int transno; /* the current transition on that state */
|
|
int transcount; /* the number of char in char counted transitions */
|
|
|
|
/*
|
|
* A stack of rollback states
|
|
*/
|
|
int maxRollbacks;
|
|
int nbRollbacks;
|
|
xmlRegExecRollback *rollbacks;
|
|
|
|
/*
|
|
* The state of the automata if any
|
|
*/
|
|
int *counts;
|
|
|
|
/*
|
|
* The input stack
|
|
*/
|
|
int inputStackMax;
|
|
int inputStackNr;
|
|
int index;
|
|
int *charStack;
|
|
const xmlChar *inputString; /* when operating on characters */
|
|
xmlRegInputTokenPtr inputStack;/* when operating on strings */
|
|
|
|
};
|
|
|
|
#define REGEXP_ALL_COUNTER 0x123456
|
|
#define REGEXP_ALL_LAX_COUNTER 0x123457
|
|
|
|
static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top);
|
|
static void xmlRegFreeState(xmlRegStatePtr state);
|
|
static void xmlRegFreeAtom(xmlRegAtomPtr atom);
|
|
|
|
/************************************************************************
|
|
* *
|
|
* Allocation/Deallocation *
|
|
* *
|
|
************************************************************************/
|
|
|
|
static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt);
|
|
/**
|
|
* xmlRegEpxFromParse:
|
|
* @ctxt: the parser context used to build it
|
|
*
|
|
* Allocate a new regexp and fill it with the reult from the parser
|
|
*
|
|
* Returns the new regexp or NULL in case of error
|
|
*/
|
|
static xmlRegexpPtr
|
|
xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) {
|
|
xmlRegexpPtr ret;
|
|
|
|
ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp));
|
|
if (ret == NULL)
|
|
return(NULL);
|
|
memset(ret, 0, sizeof(xmlRegexp));
|
|
ret->string = ctxt->string;
|
|
ctxt->string = NULL;
|
|
ret->nbStates = ctxt->nbStates;
|
|
ctxt->nbStates = 0;
|
|
ret->states = ctxt->states;
|
|
ctxt->states = NULL;
|
|
ret->nbAtoms = ctxt->nbAtoms;
|
|
ctxt->nbAtoms = 0;
|
|
ret->atoms = ctxt->atoms;
|
|
ctxt->atoms = NULL;
|
|
ret->nbCounters = ctxt->nbCounters;
|
|
ctxt->nbCounters = 0;
|
|
ret->counters = ctxt->counters;
|
|
ctxt->counters = NULL;
|
|
ret->determinist = ctxt->determinist;
|
|
|
|
if ((ret->determinist != 0) &&
|
|
(ret->nbCounters == 0) &&
|
|
(ret->atoms != NULL) &&
|
|
(ret->atoms[0] != NULL) &&
|
|
(ret->atoms[0]->type == XML_REGEXP_STRING)) {
|
|
int i, j, nbstates = 0, nbatoms = 0;
|
|
int *stateRemap;
|
|
int *stringRemap;
|
|
int *transitions;
|
|
void **transdata;
|
|
xmlChar **stringMap;
|
|
xmlChar *value;
|
|
|
|
/*
|
|
* Switch to a compact representation
|
|
* 1/ counting the effective number of states left
|
|
* 2/ conting the unique number of atoms, and check that
|
|
* they are all of the string type
|
|
* 3/ build a table state x atom for the transitions
|
|
*/
|
|
|
|
stateRemap = xmlMalloc(ret->nbStates * sizeof(int));
|
|
for (i = 0;i < ret->nbStates;i++) {
|
|
if (ret->states[i] != NULL) {
|
|
stateRemap[i] = nbstates;
|
|
nbstates++;
|
|
} else {
|
|
stateRemap[i] = -1;
|
|
}
|
|
}
|
|
#ifdef DEBUG_COMPACTION
|
|
printf("Final: %d states\n", nbstates);
|
|
#endif
|
|
stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *));
|
|
stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int));
|
|
for (i = 0;i < ret->nbAtoms;i++) {
|
|
if ((ret->atoms[i]->type == XML_REGEXP_STRING) &&
|
|
(ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) {
|
|
value = ret->atoms[i]->valuep;
|
|
for (j = 0;j < nbatoms;j++) {
|
|
if (xmlStrEqual(stringMap[j], value)) {
|
|
stringRemap[i] = j;
|
|
break;
|
|
}
|
|
}
|
|
if (j >= nbatoms) {
|
|
stringRemap[i] = nbatoms;
|
|
stringMap[nbatoms] = xmlStrdup(value);
|
|
nbatoms++;
|
|
}
|
|
} else {
|
|
xmlFree(stateRemap);
|
|
xmlFree(stringRemap);
|
|
for (i = 0;i < nbatoms;i++)
|
|
xmlFree(stringMap[i]);
|
|
xmlFree(stringMap);
|
|
goto fail_compact;
|
|
}
|
|
}
|
|
#ifdef DEBUG_COMPACTION
|
|
printf("Final: %d atoms\n", nbatoms);
|
|
#endif
|
|
|
|
/*
|
|
* Allocate the transition table. The first entry for each
|
|
* state correspond to the state type.
|
|
*/
|
|
transitions = (int *) xmlMalloc(nbstates * (nbatoms + 1) * sizeof(int));
|
|
transdata = NULL;
|
|
memset(transitions, 0, nbstates * (nbatoms + 1) * sizeof(int));
|
|
|
|
for (i = 0;i < ret->nbStates;i++) {
|
|
int stateno, atomno, targetno, prev;
|
|
xmlRegStatePtr state;
|
|
xmlRegTransPtr trans;
|
|
|
|
stateno = stateRemap[i];
|
|
if (stateno == -1)
|
|
continue;
|
|
state = ret->states[i];
|
|
|
|
transitions[stateno * (nbatoms + 1)] = state->type;
|
|
|
|
for (j = 0;j < state->nbTrans;j++) {
|
|
trans = &(state->trans[j]);
|
|
if ((trans->to == -1) || (trans->atom == NULL))
|
|
continue;
|
|
atomno = stringRemap[trans->atom->no];
|
|
if ((trans->atom->data != NULL) && (transdata == NULL)) {
|
|
transdata = (void **) xmlMalloc(nbstates * nbatoms *
|
|
sizeof(void *));
|
|
if (transdata != NULL)
|
|
memset(transdata, 0,
|
|
nbstates * nbatoms * sizeof(void *));
|
|
}
|
|
targetno = stateRemap[trans->to];
|
|
/*
|
|
* if the same atome can generate transition to 2 different
|
|
* states then it means the automata is not determinist and
|
|
* the compact form can't be used !
|
|
*/
|
|
prev = transitions[stateno * (nbatoms + 1) + atomno + 1];
|
|
if (prev != 0) {
|
|
if (prev != targetno + 1) {
|
|
printf("not determinist\n");
|
|
ret->determinist = 0;
|
|
#ifdef DEBUG_COMPACTION
|
|
printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n",
|
|
i, j, trans->atom->no, trans->to, atomno, targetno);
|
|
printf(" previous to is %d\n", prev);
|
|
#endif
|
|
ret->determinist = 0;
|
|
if (transdata != NULL)
|
|
xmlFree(transdata);
|
|
xmlFree(transitions);
|
|
xmlFree(stateRemap);
|
|
xmlFree(stringRemap);
|
|
for (i = 0;i < nbatoms;i++)
|
|
xmlFree(stringMap[i]);
|
|
xmlFree(stringMap);
|
|
goto fail_compact;
|
|
}
|
|
} else {
|
|
#if 0
|
|
printf("State %d trans %d: atom %d to %d : %d to %d\n",
|
|
i, j, trans->atom->no, trans->to, atomno, targetno);
|
|
#endif
|
|
transitions[stateno * (nbatoms + 1) + atomno + 1] =
|
|
targetno + 1; /* to avoid 0 */
|
|
if (transdata != NULL)
|
|
transdata[stateno * nbatoms + atomno] =
|
|
trans->atom->data;
|
|
}
|
|
}
|
|
}
|
|
ret->determinist = 1;
|
|
#ifdef DEBUG_COMPACTION
|
|
/*
|
|
* Debug
|
|
*/
|
|
for (i = 0;i < nbstates;i++) {
|
|
for (j = 0;j < nbatoms + 1;j++) {
|
|
printf("%02d ", transitions[i * (nbatoms + 1) + j]);
|
|
}
|
|
printf("\n");
|
|
}
|
|
printf("\n");
|
|
#endif
|
|
/*
|
|
* Cleanup of the old data
|
|
*/
|
|
if (ret->states != NULL) {
|
|
for (i = 0;i < ret->nbStates;i++)
|
|
xmlRegFreeState(ret->states[i]);
|
|
xmlFree(ret->states);
|
|
}
|
|
ret->states = NULL;
|
|
ret->nbStates = 0;
|
|
if (ret->atoms != NULL) {
|
|
for (i = 0;i < ret->nbAtoms;i++)
|
|
xmlRegFreeAtom(ret->atoms[i]);
|
|
xmlFree(ret->atoms);
|
|
}
|
|
ret->atoms = NULL;
|
|
ret->nbAtoms = 0;
|
|
|
|
ret->compact = transitions;
|
|
ret->transdata = transdata;
|
|
ret->stringMap = stringMap;
|
|
ret->nbstrings = nbatoms;
|
|
ret->nbstates = nbstates;
|
|
xmlFree(stateRemap);
|
|
xmlFree(stringRemap);
|
|
}
|
|
fail_compact:
|
|
return(ret);
|
|
}
|
|
|
|
/**
|
|
* xmlRegNewParserCtxt:
|
|
* @string: the string to parse
|
|
*
|
|
* Allocate a new regexp parser context
|
|
*
|
|
* Returns the new context or NULL in case of error
|
|
*/
|
|
static xmlRegParserCtxtPtr
|
|
xmlRegNewParserCtxt(const xmlChar *string) {
|
|
xmlRegParserCtxtPtr ret;
|
|
|
|
ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt));
|
|
if (ret == NULL)
|
|
return(NULL);
|
|
memset(ret, 0, sizeof(xmlRegParserCtxt));
|
|
if (string != NULL)
|
|
ret->string = xmlStrdup(string);
|
|
ret->cur = ret->string;
|
|
ret->neg = 0;
|
|
ret->error = 0;
|
|
ret->determinist = -1;
|
|
return(ret);
|
|
}
|
|
|
|
/**
|
|
* xmlRegNewRange:
|
|
* @ctxt: the regexp parser context
|
|
* @neg: is that negative
|
|
* @type: the type of range
|
|
* @start: the start codepoint
|
|
* @end: the end codepoint
|
|
*
|
|
* Allocate a new regexp range
|
|
*
|
|
* Returns the new range or NULL in case of error
|
|
*/
|
|
static xmlRegRangePtr
|
|
xmlRegNewRange(xmlRegParserCtxtPtr ctxt,
|
|
int neg, xmlRegAtomType type, int start, int end) {
|
|
xmlRegRangePtr ret;
|
|
|
|
ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange));
|
|
if (ret == NULL) {
|
|
ERROR("failed to allocate regexp range");
|
|
return(NULL);
|
|
}
|
|
ret->neg = neg;
|
|
ret->type = type;
|
|
ret->start = start;
|
|
ret->end = end;
|
|
return(ret);
|
|
}
|
|
|
|
/**
|
|
* xmlRegFreeRange:
|
|
* @range: the regexp range
|
|
*
|
|
* Free a regexp range
|
|
*/
|
|
static void
|
|
xmlRegFreeRange(xmlRegRangePtr range) {
|
|
if (range == NULL)
|
|
return;
|
|
|
|
if (range->blockName != NULL)
|
|
xmlFree(range->blockName);
|
|
xmlFree(range);
|
|
}
|
|
|
|
/**
|
|
* xmlRegNewAtom:
|
|
* @ctxt: the regexp parser context
|
|
* @type: the type of atom
|
|
*
|
|
* Allocate a new regexp range
|
|
*
|
|
* Returns the new atom or NULL in case of error
|
|
*/
|
|
static xmlRegAtomPtr
|
|
xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) {
|
|
xmlRegAtomPtr ret;
|
|
|
|
ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
|
|
if (ret == NULL) {
|
|
ERROR("failed to allocate regexp atom");
|
|
return(NULL);
|
|
}
|
|
memset(ret, 0, sizeof(xmlRegAtom));
|
|
ret->type = type;
|
|
ret->quant = XML_REGEXP_QUANT_ONCE;
|
|
ret->min = 0;
|
|
ret->max = 0;
|
|
return(ret);
|
|
}
|
|
|
|
/**
|
|
* xmlRegFreeAtom:
|
|
* @atom: the regexp atom
|
|
*
|
|
* Free a regexp atom
|
|
*/
|
|
static void
|
|
xmlRegFreeAtom(xmlRegAtomPtr atom) {
|
|
int i;
|
|
|
|
if (atom == NULL)
|
|
return;
|
|
|
|
for (i = 0;i < atom->nbRanges;i++)
|
|
xmlRegFreeRange(atom->ranges[i]);
|
|
if (atom->ranges != NULL)
|
|
xmlFree(atom->ranges);
|
|
if (atom->type == XML_REGEXP_STRING)
|
|
xmlFree(atom->valuep);
|
|
xmlFree(atom);
|
|
}
|
|
|
|
static xmlRegStatePtr
|
|
xmlRegNewState(xmlRegParserCtxtPtr ctxt) {
|
|
xmlRegStatePtr ret;
|
|
|
|
ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState));
|
|
if (ret == NULL) {
|
|
ERROR("failed to allocate regexp state");
|
|
return(NULL);
|
|
}
|
|
memset(ret, 0, sizeof(xmlRegState));
|
|
ret->type = XML_REGEXP_TRANS_STATE;
|
|
ret->mark = XML_REGEXP_MARK_NORMAL;
|
|
return(ret);
|
|
}
|
|
|
|
/**
|
|
* xmlRegFreeState:
|
|
* @state: the regexp state
|
|
*
|
|
* Free a regexp state
|
|
*/
|
|
static void
|
|
xmlRegFreeState(xmlRegStatePtr state) {
|
|
if (state == NULL)
|
|
return;
|
|
|
|
if (state->trans != NULL)
|
|
xmlFree(state->trans);
|
|
xmlFree(state);
|
|
}
|
|
|
|
/**
|
|
* xmlRegFreeParserCtxt:
|
|
* @ctxt: the regexp parser context
|
|
*
|
|
* Free a regexp parser context
|
|
*/
|
|
static void
|
|
xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) {
|
|
int i;
|
|
if (ctxt == NULL)
|
|
return;
|
|
|
|
if (ctxt->string != NULL)
|
|
xmlFree(ctxt->string);
|
|
if (ctxt->states != NULL) {
|
|
for (i = 0;i < ctxt->nbStates;i++)
|
|
xmlRegFreeState(ctxt->states[i]);
|
|
xmlFree(ctxt->states);
|
|
}
|
|
if (ctxt->atoms != NULL) {
|
|
for (i = 0;i < ctxt->nbAtoms;i++)
|
|
xmlRegFreeAtom(ctxt->atoms[i]);
|
|
xmlFree(ctxt->atoms);
|
|
}
|
|
if (ctxt->counters != NULL)
|
|
xmlFree(ctxt->counters);
|
|
xmlFree(ctxt);
|
|
}
|
|
|
|
/************************************************************************
|
|
* *
|
|
* Display of Data structures *
|
|
* *
|
|
************************************************************************/
|
|
|
|
static void
|
|
xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) {
|
|
switch (type) {
|
|
case XML_REGEXP_EPSILON:
|
|
fprintf(output, "epsilon "); break;
|
|
case XML_REGEXP_CHARVAL:
|
|
fprintf(output, "charval "); break;
|
|
case XML_REGEXP_RANGES:
|
|
fprintf(output, "ranges "); break;
|
|
case XML_REGEXP_SUBREG:
|
|
fprintf(output, "subexpr "); break;
|
|
case XML_REGEXP_STRING:
|
|
fprintf(output, "string "); break;
|
|
case XML_REGEXP_ANYCHAR:
|
|
fprintf(output, "anychar "); break;
|
|
case XML_REGEXP_ANYSPACE:
|
|
fprintf(output, "anyspace "); break;
|
|
case XML_REGEXP_NOTSPACE:
|
|
fprintf(output, "notspace "); break;
|
|
case XML_REGEXP_INITNAME:
|
|
fprintf(output, "initname "); break;
|
|
case XML_REGEXP_NOTINITNAME:
|
|
fprintf(output, "notinitname "); break;
|
|
case XML_REGEXP_NAMECHAR:
|
|
fprintf(output, "namechar "); break;
|
|
case XML_REGEXP_NOTNAMECHAR:
|
|
fprintf(output, "notnamechar "); break;
|
|
case XML_REGEXP_DECIMAL:
|
|
fprintf(output, "decimal "); break;
|
|
case XML_REGEXP_NOTDECIMAL:
|
|
fprintf(output, "notdecimal "); break;
|
|
case XML_REGEXP_REALCHAR:
|
|
fprintf(output, "realchar "); break;
|
|
case XML_REGEXP_NOTREALCHAR:
|
|
fprintf(output, "notrealchar "); break;
|
|
case XML_REGEXP_LETTER:
|
|
fprintf(output, "LETTER "); break;
|
|
case XML_REGEXP_LETTER_UPPERCASE:
|
|
fprintf(output, "LETTER_UPPERCASE "); break;
|
|
case XML_REGEXP_LETTER_LOWERCASE:
|
|
fprintf(output, "LETTER_LOWERCASE "); break;
|
|
case XML_REGEXP_LETTER_TITLECASE:
|
|
fprintf(output, "LETTER_TITLECASE "); break;
|
|
case XML_REGEXP_LETTER_MODIFIER:
|
|
fprintf(output, "LETTER_MODIFIER "); break;
|
|
case XML_REGEXP_LETTER_OTHERS:
|
|
fprintf(output, "LETTER_OTHERS "); break;
|
|
case XML_REGEXP_MARK:
|
|
fprintf(output, "MARK "); break;
|
|
case XML_REGEXP_MARK_NONSPACING:
|
|
fprintf(output, "MARK_NONSPACING "); break;
|
|
case XML_REGEXP_MARK_SPACECOMBINING:
|
|
fprintf(output, "MARK_SPACECOMBINING "); break;
|
|
case XML_REGEXP_MARK_ENCLOSING:
|
|
fprintf(output, "MARK_ENCLOSING "); break;
|
|
case XML_REGEXP_NUMBER:
|
|
fprintf(output, "NUMBER "); break;
|
|
case XML_REGEXP_NUMBER_DECIMAL:
|
|
fprintf(output, "NUMBER_DECIMAL "); break;
|
|
case XML_REGEXP_NUMBER_LETTER:
|
|
fprintf(output, "NUMBER_LETTER "); break;
|
|
case XML_REGEXP_NUMBER_OTHERS:
|
|
fprintf(output, "NUMBER_OTHERS "); break;
|
|
case XML_REGEXP_PUNCT:
|
|
fprintf(output, "PUNCT "); break;
|
|
case XML_REGEXP_PUNCT_CONNECTOR:
|
|
fprintf(output, "PUNCT_CONNECTOR "); break;
|
|
case XML_REGEXP_PUNCT_DASH:
|
|
fprintf(output, "PUNCT_DASH "); break;
|
|
case XML_REGEXP_PUNCT_OPEN:
|
|
fprintf(output, "PUNCT_OPEN "); break;
|
|
case XML_REGEXP_PUNCT_CLOSE:
|
|
fprintf(output, "PUNCT_CLOSE "); break;
|
|
case XML_REGEXP_PUNCT_INITQUOTE:
|
|
fprintf(output, "PUNCT_INITQUOTE "); break;
|
|
case XML_REGEXP_PUNCT_FINQUOTE:
|
|
fprintf(output, "PUNCT_FINQUOTE "); break;
|
|
case XML_REGEXP_PUNCT_OTHERS:
|
|
fprintf(output, "PUNCT_OTHERS "); break;
|
|
case XML_REGEXP_SEPAR:
|
|
fprintf(output, "SEPAR "); break;
|
|
case XML_REGEXP_SEPAR_SPACE:
|
|
fprintf(output, "SEPAR_SPACE "); break;
|
|
case XML_REGEXP_SEPAR_LINE:
|
|
fprintf(output, "SEPAR_LINE "); break;
|
|
case XML_REGEXP_SEPAR_PARA:
|
|
fprintf(output, "SEPAR_PARA "); break;
|
|
case XML_REGEXP_SYMBOL:
|
|
fprintf(output, "SYMBOL "); break;
|
|
case XML_REGEXP_SYMBOL_MATH:
|
|
fprintf(output, "SYMBOL_MATH "); break;
|
|
case XML_REGEXP_SYMBOL_CURRENCY:
|
|
fprintf(output, "SYMBOL_CURRENCY "); break;
|
|
case XML_REGEXP_SYMBOL_MODIFIER:
|
|
fprintf(output, "SYMBOL_MODIFIER "); break;
|
|
case XML_REGEXP_SYMBOL_OTHERS:
|
|
fprintf(output, "SYMBOL_OTHERS "); break;
|
|
case XML_REGEXP_OTHER:
|
|
fprintf(output, "OTHER "); break;
|
|
case XML_REGEXP_OTHER_CONTROL:
|
|
fprintf(output, "OTHER_CONTROL "); break;
|
|
case XML_REGEXP_OTHER_FORMAT:
|
|
fprintf(output, "OTHER_FORMAT "); break;
|
|
case XML_REGEXP_OTHER_PRIVATE:
|
|
fprintf(output, "OTHER_PRIVATE "); break;
|
|
case XML_REGEXP_OTHER_NA:
|
|
fprintf(output, "OTHER_NA "); break;
|
|
case XML_REGEXP_BLOCK_NAME:
|
|
fprintf(output, "BLOCK "); break;
|
|
}
|
|
}
|
|
|
|
static void
|
|
xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) {
|
|
switch (type) {
|
|
case XML_REGEXP_QUANT_EPSILON:
|
|
fprintf(output, "epsilon "); break;
|
|
case XML_REGEXP_QUANT_ONCE:
|
|
fprintf(output, "once "); break;
|
|
case XML_REGEXP_QUANT_OPT:
|
|
fprintf(output, "? "); break;
|
|
case XML_REGEXP_QUANT_MULT:
|
|
fprintf(output, "* "); break;
|
|
case XML_REGEXP_QUANT_PLUS:
|
|
fprintf(output, "+ "); break;
|
|
case XML_REGEXP_QUANT_RANGE:
|
|
fprintf(output, "range "); break;
|
|
case XML_REGEXP_QUANT_ONCEONLY:
|
|
fprintf(output, "onceonly "); break;
|
|
case XML_REGEXP_QUANT_ALL:
|
|
fprintf(output, "all "); break;
|
|
}
|
|
}
|
|
static void
|
|
xmlRegPrintRange(FILE *output, xmlRegRangePtr range) {
|
|
fprintf(output, " range: ");
|
|
if (range->neg)
|
|
fprintf(output, "negative ");
|
|
xmlRegPrintAtomType(output, range->type);
|
|
fprintf(output, "%c - %c\n", range->start, range->end);
|
|
}
|
|
|
|
static void
|
|
xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) {
|
|
fprintf(output, " atom: ");
|
|
if (atom == NULL) {
|
|
fprintf(output, "NULL\n");
|
|
return;
|
|
}
|
|
xmlRegPrintAtomType(output, atom->type);
|
|
xmlRegPrintQuantType(output, atom->quant);
|
|
if (atom->quant == XML_REGEXP_QUANT_RANGE)
|
|
fprintf(output, "%d-%d ", atom->min, atom->max);
|
|
if (atom->type == XML_REGEXP_STRING)
|
|
fprintf(output, "'%s' ", (char *) atom->valuep);
|
|
if (atom->type == XML_REGEXP_CHARVAL)
|
|
fprintf(output, "char %c\n", atom->codepoint);
|
|
else if (atom->type == XML_REGEXP_RANGES) {
|
|
int i;
|
|
fprintf(output, "%d entries\n", atom->nbRanges);
|
|
for (i = 0; i < atom->nbRanges;i++)
|
|
xmlRegPrintRange(output, atom->ranges[i]);
|
|
} else if (atom->type == XML_REGEXP_SUBREG) {
|
|
fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no);
|
|
} else {
|
|
fprintf(output, "\n");
|
|
}
|
|
}
|
|
|
|
static void
|
|
xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) {
|
|
fprintf(output, " trans: ");
|
|
if (trans == NULL) {
|
|
fprintf(output, "NULL\n");
|
|
return;
|
|
}
|
|
if (trans->to < 0) {
|
|
fprintf(output, "removed\n");
|
|
return;
|
|
}
|
|
if (trans->counter >= 0) {
|
|
fprintf(output, "counted %d, ", trans->counter);
|
|
}
|
|
if (trans->count == REGEXP_ALL_COUNTER) {
|
|
fprintf(output, "all transition, ");
|
|
} else if (trans->count >= 0) {
|
|
fprintf(output, "count based %d, ", trans->count);
|
|
}
|
|
if (trans->atom == NULL) {
|
|
fprintf(output, "epsilon to %d\n", trans->to);
|
|
return;
|
|
}
|
|
if (trans->atom->type == XML_REGEXP_CHARVAL)
|
|
fprintf(output, "char %c ", trans->atom->codepoint);
|
|
fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to);
|
|
}
|
|
|
|
static void
|
|
xmlRegPrintState(FILE *output, xmlRegStatePtr state) {
|
|
int i;
|
|
|
|
fprintf(output, " state: ");
|
|
if (state == NULL) {
|
|
fprintf(output, "NULL\n");
|
|
return;
|
|
}
|
|
if (state->type == XML_REGEXP_START_STATE)
|
|
fprintf(output, "START ");
|
|
if (state->type == XML_REGEXP_FINAL_STATE)
|
|
fprintf(output, "FINAL ");
|
|
|
|
fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans);
|
|
for (i = 0;i < state->nbTrans; i++) {
|
|
xmlRegPrintTrans(output, &(state->trans[i]));
|
|
}
|
|
}
|
|
|
|
#ifdef DEBUG_REGEXP_GRAPH
|
|
static void
|
|
xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) {
|
|
int i;
|
|
|
|
fprintf(output, " ctxt: ");
|
|
if (ctxt == NULL) {
|
|
fprintf(output, "NULL\n");
|
|
return;
|
|
}
|
|
fprintf(output, "'%s' ", ctxt->string);
|
|
if (ctxt->error)
|
|
fprintf(output, "error ");
|
|
if (ctxt->neg)
|
|
fprintf(output, "neg ");
|
|
fprintf(output, "\n");
|
|
fprintf(output, "%d atoms:\n", ctxt->nbAtoms);
|
|
for (i = 0;i < ctxt->nbAtoms; i++) {
|
|
fprintf(output, " %02d ", i);
|
|
xmlRegPrintAtom(output, ctxt->atoms[i]);
|
|
}
|
|
if (ctxt->atom != NULL) {
|
|
fprintf(output, "current atom:\n");
|
|
xmlRegPrintAtom(output, ctxt->atom);
|
|
}
|
|
fprintf(output, "%d states:", ctxt->nbStates);
|
|
if (ctxt->start != NULL)
|
|
fprintf(output, " start: %d", ctxt->start->no);
|
|
if (ctxt->end != NULL)
|
|
fprintf(output, " end: %d", ctxt->end->no);
|
|
fprintf(output, "\n");
|
|
for (i = 0;i < ctxt->nbStates; i++) {
|
|
xmlRegPrintState(output, ctxt->states[i]);
|
|
}
|
|
fprintf(output, "%d counters:\n", ctxt->nbCounters);
|
|
for (i = 0;i < ctxt->nbCounters; i++) {
|
|
fprintf(output, " %d: min %d max %d\n", i, ctxt->counters[i].min,
|
|
ctxt->counters[i].max);
|
|
}
|
|
}
|
|
#endif
|
|
|
|
/************************************************************************
|
|
* *
|
|
* Finite Automata structures manipulations *
|
|
* *
|
|
************************************************************************/
|
|
|
|
static void
|
|
xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom,
|
|
int neg, xmlRegAtomType type, int start, int end,
|
|
xmlChar *blockName) {
|
|
xmlRegRangePtr range;
|
|
|
|
if (atom == NULL) {
|
|
ERROR("add range: atom is NULL");
|
|
return;
|
|
}
|
|
if (atom->type != XML_REGEXP_RANGES) {
|
|
ERROR("add range: atom is not ranges");
|
|
return;
|
|
}
|
|
if (atom->maxRanges == 0) {
|
|
atom->maxRanges = 4;
|
|
atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges *
|
|
sizeof(xmlRegRangePtr));
|
|
if (atom->ranges == NULL) {
|
|
ERROR("add range: allocation failed");
|
|
atom->maxRanges = 0;
|
|
return;
|
|
}
|
|
} else if (atom->nbRanges >= atom->maxRanges) {
|
|
xmlRegRangePtr *tmp;
|
|
atom->maxRanges *= 2;
|
|
tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges *
|
|
sizeof(xmlRegRangePtr));
|
|
if (tmp == NULL) {
|
|
ERROR("add range: allocation failed");
|
|
atom->maxRanges /= 2;
|
|
return;
|
|
}
|
|
atom->ranges = tmp;
|
|
}
|
|
range = xmlRegNewRange(ctxt, neg, type, start, end);
|
|
if (range == NULL)
|
|
return;
|
|
range->blockName = blockName;
|
|
atom->ranges[atom->nbRanges++] = range;
|
|
|
|
}
|
|
|
|
static int
|
|
xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) {
|
|
if (ctxt->maxCounters == 0) {
|
|
ctxt->maxCounters = 4;
|
|
ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters *
|
|
sizeof(xmlRegCounter));
|
|
if (ctxt->counters == NULL) {
|
|
ERROR("reg counter: allocation failed");
|
|
ctxt->maxCounters = 0;
|
|
return(-1);
|
|
}
|
|
} else if (ctxt->nbCounters >= ctxt->maxCounters) {
|
|
xmlRegCounter *tmp;
|
|
ctxt->maxCounters *= 2;
|
|
tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters *
|
|
sizeof(xmlRegCounter));
|
|
if (tmp == NULL) {
|
|
ERROR("reg counter: allocation failed");
|
|
ctxt->maxCounters /= 2;
|
|
return(-1);
|
|
}
|
|
ctxt->counters = tmp;
|
|
}
|
|
ctxt->counters[ctxt->nbCounters].min = -1;
|
|
ctxt->counters[ctxt->nbCounters].max = -1;
|
|
return(ctxt->nbCounters++);
|
|
}
|
|
|
|
static void
|
|
xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
|
|
if (atom == NULL) {
|
|
ERROR("atom push: atom is NULL");
|
|
return;
|
|
}
|
|
if (ctxt->maxAtoms == 0) {
|
|
ctxt->maxAtoms = 4;
|
|
ctxt->atoms = (xmlRegAtomPtr *) xmlMalloc(ctxt->maxAtoms *
|
|
sizeof(xmlRegAtomPtr));
|
|
if (ctxt->atoms == NULL) {
|
|
ERROR("atom push: allocation failed");
|
|
ctxt->maxAtoms = 0;
|
|
return;
|
|
}
|
|
} else if (ctxt->nbAtoms >= ctxt->maxAtoms) {
|
|
xmlRegAtomPtr *tmp;
|
|
ctxt->maxAtoms *= 2;
|
|
tmp = (xmlRegAtomPtr *) xmlRealloc(ctxt->atoms, ctxt->maxAtoms *
|
|
sizeof(xmlRegAtomPtr));
|
|
if (tmp == NULL) {
|
|
ERROR("atom push: allocation failed");
|
|
ctxt->maxAtoms /= 2;
|
|
return;
|
|
}
|
|
ctxt->atoms = tmp;
|
|
}
|
|
atom->no = ctxt->nbAtoms;
|
|
ctxt->atoms[ctxt->nbAtoms++] = atom;
|
|
}
|
|
|
|
static void
|
|
xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
|
|
xmlRegAtomPtr atom, xmlRegStatePtr target,
|
|
int counter, int count) {
|
|
if (state == NULL) {
|
|
ERROR("add state: state is NULL");
|
|
return;
|
|
}
|
|
if (target == NULL) {
|
|
ERROR("add state: target is NULL");
|
|
return;
|
|
}
|
|
if (state->maxTrans == 0) {
|
|
state->maxTrans = 4;
|
|
state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans *
|
|
sizeof(xmlRegTrans));
|
|
if (state->trans == NULL) {
|
|
ERROR("add range: allocation failed");
|
|
state->maxTrans = 0;
|
|
return;
|
|
}
|
|
} else if (state->nbTrans >= state->maxTrans) {
|
|
xmlRegTrans *tmp;
|
|
state->maxTrans *= 2;
|
|
tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans *
|
|
sizeof(xmlRegTrans));
|
|
if (tmp == NULL) {
|
|
ERROR("add range: allocation failed");
|
|
state->maxTrans /= 2;
|
|
return;
|
|
}
|
|
state->trans = tmp;
|
|
}
|
|
#ifdef DEBUG_REGEXP_GRAPH
|
|
printf("Add trans from %d to %d ", state->no, target->no);
|
|
if (count == REGEXP_ALL_COUNTER)
|
|
printf("all transition");
|
|
else if (count >= 0)
|
|
printf("count based %d", count);
|
|
else if (counter >= 0)
|
|
printf("counted %d", counter);
|
|
else if (atom == NULL)
|
|
printf("epsilon transition");
|
|
printf("\n");
|
|
#endif
|
|
|
|
state->trans[state->nbTrans].atom = atom;
|
|
state->trans[state->nbTrans].to = target->no;
|
|
state->trans[state->nbTrans].counter = counter;
|
|
state->trans[state->nbTrans].count = count;
|
|
state->nbTrans++;
|
|
}
|
|
|
|
static void
|
|
xmlRegStatePush(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) {
|
|
if (ctxt->maxStates == 0) {
|
|
ctxt->maxStates = 4;
|
|
ctxt->states = (xmlRegStatePtr *) xmlMalloc(ctxt->maxStates *
|
|
sizeof(xmlRegStatePtr));
|
|
if (ctxt->states == NULL) {
|
|
ERROR("add range: allocation failed");
|
|
ctxt->maxStates = 0;
|
|
return;
|
|
}
|
|
} else if (ctxt->nbStates >= ctxt->maxStates) {
|
|
xmlRegStatePtr *tmp;
|
|
ctxt->maxStates *= 2;
|
|
tmp = (xmlRegStatePtr *) xmlRealloc(ctxt->states, ctxt->maxStates *
|
|
sizeof(xmlRegStatePtr));
|
|
if (tmp == NULL) {
|
|
ERROR("add range: allocation failed");
|
|
ctxt->maxStates /= 2;
|
|
return;
|
|
}
|
|
ctxt->states = tmp;
|
|
}
|
|
state->no = ctxt->nbStates;
|
|
ctxt->states[ctxt->nbStates++] = state;
|
|
}
|
|
|
|
/**
|
|
* xmlFAGenerateAllTransition:
|
|
* @ctxt: a regexp parser context
|
|
* @from: the from state
|
|
* @to: the target state or NULL for building a new one
|
|
* @lax:
|
|
*
|
|
*/
|
|
static void
|
|
xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt,
|
|
xmlRegStatePtr from, xmlRegStatePtr to,
|
|
int lax) {
|
|
if (to == NULL) {
|
|
to = xmlRegNewState(ctxt);
|
|
xmlRegStatePush(ctxt, to);
|
|
ctxt->state = to;
|
|
}
|
|
if (lax)
|
|
xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER);
|
|
else
|
|
xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER);
|
|
}
|
|
|
|
/**
|
|
* xmlFAGenerateEpsilonTransition:
|
|
* @ctxt: a regexp parser context
|
|
* @from: the from state
|
|
* @to: the target state or NULL for building a new one
|
|
*
|
|
*/
|
|
static void
|
|
xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt,
|
|
xmlRegStatePtr from, xmlRegStatePtr to) {
|
|
if (to == NULL) {
|
|
to = xmlRegNewState(ctxt);
|
|
xmlRegStatePush(ctxt, to);
|
|
ctxt->state = to;
|
|
}
|
|
xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1);
|
|
}
|
|
|
|
/**
|
|
* xmlFAGenerateCountedEpsilonTransition:
|
|
* @ctxt: a regexp parser context
|
|
* @from: the from state
|
|
* @to: the target state or NULL for building a new one
|
|
* counter: the counter for that transition
|
|
*
|
|
*/
|
|
static void
|
|
xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt,
|
|
xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
|
|
if (to == NULL) {
|
|
to = xmlRegNewState(ctxt);
|
|
xmlRegStatePush(ctxt, to);
|
|
ctxt->state = to;
|
|
}
|
|
xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1);
|
|
}
|
|
|
|
/**
|
|
* xmlFAGenerateCountedTransition:
|
|
* @ctxt: a regexp parser context
|
|
* @from: the from state
|
|
* @to: the target state or NULL for building a new one
|
|
* counter: the counter for that transition
|
|
*
|
|
*/
|
|
static void
|
|
xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt,
|
|
xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
|
|
if (to == NULL) {
|
|
to = xmlRegNewState(ctxt);
|
|
xmlRegStatePush(ctxt, to);
|
|
ctxt->state = to;
|
|
}
|
|
xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter);
|
|
}
|
|
|
|
/**
|
|
* xmlFAGenerateTransitions:
|
|
* @ctxt: a regexp parser context
|
|
* @from: the from state
|
|
* @to: the target state or NULL for building a new one
|
|
* @atom: the atom generating the transition
|
|
*
|
|
*/
|
|
static void
|
|
xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from,
|
|
xmlRegStatePtr to, xmlRegAtomPtr atom) {
|
|
if (atom == NULL) {
|
|
ERROR("genrate transition: atom == NULL");
|
|
return;
|
|
}
|
|
if (atom->type == XML_REGEXP_SUBREG) {
|
|
/*
|
|
* this is a subexpression handling one should not need to
|
|
* create a new node excep for XML_REGEXP_QUANT_RANGE.
|
|
*/
|
|
xmlRegAtomPush(ctxt, atom);
|
|
if ((to != NULL) && (atom->stop != to) &&
|
|
(atom->quant != XML_REGEXP_QUANT_RANGE)) {
|
|
/*
|
|
* Generate an epsilon transition to link to the target
|
|
*/
|
|
xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
|
|
}
|
|
switch (atom->quant) {
|
|
case XML_REGEXP_QUANT_OPT:
|
|
atom->quant = XML_REGEXP_QUANT_ONCE;
|
|
xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop);
|
|
break;
|
|
case XML_REGEXP_QUANT_MULT:
|
|
atom->quant = XML_REGEXP_QUANT_ONCE;
|
|
xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop);
|
|
xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
|
|
break;
|
|
case XML_REGEXP_QUANT_PLUS:
|
|
atom->quant = XML_REGEXP_QUANT_ONCE;
|
|
xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
|
|
break;
|
|
case XML_REGEXP_QUANT_RANGE: {
|
|
int counter;
|
|
xmlRegStatePtr newstate;
|
|
|
|
/*
|
|
* This one is nasty:
|
|
* 1/ register a new counter
|
|
* 2/ register an epsilon transition associated to
|
|
* this counter going from atom->stop to atom->start
|
|
* 3/ create a new state
|
|
* 4/ generate a counted transition from atom->stop to
|
|
* that state
|
|
*/
|
|
counter = xmlRegGetCounter(ctxt);
|
|
ctxt->counters[counter].min = atom->min - 1;
|
|
ctxt->counters[counter].max = atom->max - 1;
|
|
atom->min = 0;
|
|
atom->max = 0;
|
|
atom->quant = XML_REGEXP_QUANT_ONCE;
|
|
xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop,
|
|
atom->start, counter);
|
|
if (to != NULL) {
|
|
newstate = to;
|
|
} else {
|
|
newstate = xmlRegNewState(ctxt);
|
|
xmlRegStatePush(ctxt, newstate);
|
|
ctxt->state = newstate;
|
|
}
|
|
xmlFAGenerateCountedTransition(ctxt, atom->stop,
|
|
newstate, counter);
|
|
}
|
|
default:
|
|
break;
|
|
}
|
|
return;
|
|
} else {
|
|
if (to == NULL) {
|
|
to = xmlRegNewState(ctxt);
|
|
xmlRegStatePush(ctxt, to);
|
|
}
|
|
xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1);
|
|
xmlRegAtomPush(ctxt, atom);
|
|
ctxt->state = to;
|
|
}
|
|
switch (atom->quant) {
|
|
case XML_REGEXP_QUANT_OPT:
|
|
atom->quant = XML_REGEXP_QUANT_ONCE;
|
|
xmlFAGenerateEpsilonTransition(ctxt, from, to);
|
|
break;
|
|
case XML_REGEXP_QUANT_MULT:
|
|
atom->quant = XML_REGEXP_QUANT_ONCE;
|
|
xmlFAGenerateEpsilonTransition(ctxt, from, to);
|
|
xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
|
|
break;
|
|
case XML_REGEXP_QUANT_PLUS:
|
|
atom->quant = XML_REGEXP_QUANT_ONCE;
|
|
xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* xmlFAReduceEpsilonTransitions:
|
|
* @ctxt: a regexp parser context
|
|
* @fromnr: the from state
|
|
* @tonr: the to state
|
|
* @cpunter: should that transition be associted to a counted
|
|
*
|
|
*/
|
|
static void
|
|
xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr,
|
|
int tonr, int counter) {
|
|
int transnr;
|
|
xmlRegStatePtr from;
|
|
xmlRegStatePtr to;
|
|
|
|
#ifdef DEBUG_REGEXP_GRAPH
|
|
printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr, tonr);
|
|
#endif
|
|
from = ctxt->states[fromnr];
|
|
if (from == NULL)
|
|
return;
|
|
to = ctxt->states[tonr];
|
|
if (to == NULL)
|
|
return;
|
|
if ((to->mark == XML_REGEXP_MARK_START) ||
|
|
(to->mark == XML_REGEXP_MARK_VISITED))
|
|
return;
|
|
|
|
to->mark = XML_REGEXP_MARK_VISITED;
|
|
if (to->type == XML_REGEXP_FINAL_STATE) {
|
|
#ifdef DEBUG_REGEXP_GRAPH
|
|
printf("State %d is final, so %d becomes final\n", tonr, fromnr);
|
|
#endif
|
|
from->type = XML_REGEXP_FINAL_STATE;
|
|
}
|
|
for (transnr = 0;transnr < to->nbTrans;transnr++) {
|
|
if (to->trans[transnr].atom == NULL) {
|
|
/*
|
|
* Don't remove counted transitions
|
|
* Don't loop either
|
|
*/
|
|
if (to->trans[transnr].to != fromnr) {
|
|
if (to->trans[transnr].count >= 0) {
|
|
int newto = to->trans[transnr].to;
|
|
|
|
xmlRegStateAddTrans(ctxt, from, NULL,
|
|
ctxt->states[newto],
|
|
-1, to->trans[transnr].count);
|
|
} else {
|
|
#ifdef DEBUG_REGEXP_GRAPH
|
|
printf("Found epsilon trans %d from %d to %d\n",
|
|
transnr, tonr, to->trans[transnr].to);
|
|
#endif
|
|
if (to->trans[transnr].counter >= 0) {
|
|
xmlFAReduceEpsilonTransitions(ctxt, fromnr,
|
|
to->trans[transnr].to,
|
|
to->trans[transnr].counter);
|
|
} else {
|
|
xmlFAReduceEpsilonTransitions(ctxt, fromnr,
|
|
to->trans[transnr].to,
|
|
counter);
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
int newto = to->trans[transnr].to;
|
|
|
|
if (to->trans[transnr].counter >= 0) {
|
|
xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
|
|
ctxt->states[newto],
|
|
to->trans[transnr].counter, -1);
|
|
} else {
|
|
xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
|
|
ctxt->states[newto], counter, -1);
|
|
}
|
|
}
|
|
}
|
|
to->mark = XML_REGEXP_MARK_NORMAL;
|
|
}
|
|
|
|
/**
|
|
* xmlFAEliminateEpsilonTransitions:
|
|
* @ctxt: a regexp parser context
|
|
*
|
|
*/
|
|
static void
|
|
xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
|
|
int statenr, transnr;
|
|
xmlRegStatePtr state;
|
|
|
|
/*
|
|
* build the completed transitions bypassing the epsilons
|
|
* Use a marking algorithm to avoid loops
|
|
*/
|
|
for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
|
|
state = ctxt->states[statenr];
|
|
if (state == NULL)
|
|
continue;
|
|
for (transnr = 0;transnr < state->nbTrans;transnr++) {
|
|
if ((state->trans[transnr].atom == NULL) &&
|
|
(state->trans[transnr].to >= 0)) {
|
|
if (state->trans[transnr].to == statenr) {
|
|
state->trans[transnr].to = -1;
|
|
#ifdef DEBUG_REGEXP_GRAPH
|
|
printf("Removed loopback epsilon trans %d on %d\n",
|
|
transnr, statenr);
|
|
#endif
|
|
} else if (state->trans[transnr].count < 0) {
|
|
int newto = state->trans[transnr].to;
|
|
|
|
#ifdef DEBUG_REGEXP_GRAPH
|
|
printf("Found epsilon trans %d from %d to %d\n",
|
|
transnr, statenr, newto);
|
|
#endif
|
|
state->mark = XML_REGEXP_MARK_START;
|
|
xmlFAReduceEpsilonTransitions(ctxt, statenr,
|
|
newto, state->trans[transnr].counter);
|
|
state->mark = XML_REGEXP_MARK_NORMAL;
|
|
#ifdef DEBUG_REGEXP_GRAPH
|
|
} else {
|
|
printf("Found counted transition %d on %d\n",
|
|
transnr, statenr);
|
|
#endif
|
|
}
|
|
}
|
|
}
|
|
}
|
|
/*
|
|
* Eliminate the epsilon transitions
|
|
*/
|
|
for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
|
|
state = ctxt->states[statenr];
|
|
if (state == NULL)
|
|
continue;
|
|
for (transnr = 0;transnr < state->nbTrans;transnr++) {
|
|
if ((state->trans[transnr].atom == NULL) &&
|
|
(state->trans[transnr].count < 0) &&
|
|
(state->trans[transnr].to >= 0)) {
|
|
state->trans[transnr].to = -1;
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Use this pass to detect unreachable states too
|
|
*/
|
|
for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
|
|
state = ctxt->states[statenr];
|
|
if (state != NULL)
|
|
state->reached = 0;
|
|
}
|
|
state = ctxt->states[0];
|
|
if (state != NULL)
|
|
state->reached = 1;
|
|
while (state != NULL) {
|
|
xmlRegStatePtr target = NULL;
|
|
state->reached = 2;
|
|
/*
|
|
* Mark all state reachable from the current reachable state
|
|
*/
|
|
for (transnr = 0;transnr < state->nbTrans;transnr++) {
|
|
if ((state->trans[transnr].to >= 0) &&
|
|
((state->trans[transnr].atom != NULL) ||
|
|
(state->trans[transnr].count >= 0))) {
|
|
int newto = state->trans[transnr].to;
|
|
|
|
if (ctxt->states[newto] == NULL)
|
|
continue;
|
|
if (ctxt->states[newto]->reached == 0) {
|
|
ctxt->states[newto]->reached = 1;
|
|
target = ctxt->states[newto];
|
|
}
|
|
}
|
|
}
|
|
/*
|
|
* find the next accessible state not explored
|
|
*/
|
|
if (target == NULL) {
|
|
for (statenr = 1;statenr < ctxt->nbStates;statenr++) {
|
|
state = ctxt->states[statenr];
|
|
if ((state != NULL) && (state->reached == 1)) {
|
|
target = state;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
state = target;
|
|
}
|
|
for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
|
|
state = ctxt->states[statenr];
|
|
if ((state != NULL) && (state->reached == 0)) {
|
|
#ifdef DEBUG_REGEXP_GRAPH
|
|
printf("Removed unreachable state %d\n", statenr);
|
|
#endif
|
|
xmlRegFreeState(state);
|
|
ctxt->states[statenr] = NULL;
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
/**
|
|
* xmlFACompareAtoms:
|
|
* @atom1: an atom
|
|
* @atom2: an atom
|
|
*
|
|
* Compares two atoms to check whether they are equivatents
|
|
*
|
|
* Returns 1 if yes and 0 otherwise
|
|
*/
|
|
static int
|
|
xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2) {
|
|
if (atom1 == atom2)
|
|
return(1);
|
|
if ((atom1 == NULL) || (atom2 == NULL))
|
|
return(0);
|
|
|
|
if (atom1->type != atom2->type)
|
|
return(0);
|
|
switch (atom1->type) {
|
|
case XML_REGEXP_STRING:
|
|
return(xmlStrEqual((xmlChar *)atom1->valuep,
|
|
(xmlChar *)atom2->valuep));
|
|
case XML_REGEXP_EPSILON:
|
|
return(1);
|
|
case XML_REGEXP_CHARVAL:
|
|
return(atom1->codepoint == atom2->codepoint);
|
|
case XML_REGEXP_RANGES:
|
|
TODO;
|
|
return(0);
|
|
default:
|
|
break;
|
|
}
|
|
return(1);
|
|
}
|
|
|
|
/**
|
|
* xmlFARecurseDeterminism:
|
|
* @ctxt: a regexp parser context
|
|
*
|
|
* Check whether the associated regexp is determinist,
|
|
* should be called after xmlFAEliminateEpsilonTransitions()
|
|
*
|
|
*/
|
|
static int
|
|
xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
|
|
int to, xmlRegAtomPtr atom) {
|
|
int ret = 1;
|
|
int transnr;
|
|
xmlRegTransPtr t1;
|
|
|
|
if (state == NULL)
|
|
return(ret);
|
|
for (transnr = 0;transnr < state->nbTrans;transnr++) {
|
|
t1 = &(state->trans[transnr]);
|
|
/*
|
|
* check transitions conflicting with the one looked at
|
|
*/
|
|
if (t1->atom == NULL) {
|
|
if (t1->to == -1)
|
|
continue;
|
|
ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
|
|
to, atom);
|
|
if (ret == 0)
|
|
return(0);
|
|
continue;
|
|
}
|
|
if (t1->to != to)
|
|
continue;
|
|
if (xmlFACompareAtoms(t1->atom, atom))
|
|
return(0);
|
|
}
|
|
return(ret);
|
|
}
|
|
|
|
/**
|
|
* xmlFAComputesDeterminism:
|
|
* @ctxt: a regexp parser context
|
|
*
|
|
* Check whether the associated regexp is determinist,
|
|
* should be called after xmlFAEliminateEpsilonTransitions()
|
|
*
|
|
*/
|
|
static int
|
|
xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) {
|
|
int statenr, transnr;
|
|
xmlRegStatePtr state;
|
|
xmlRegTransPtr t1, t2;
|
|
int i;
|
|
int ret = 1;
|
|
|
|
#ifdef DEBUG_REGEXP_GRAPH
|
|
printf("xmlFAComputesDeterminism\n");
|
|
xmlRegPrintCtxt(stdout, ctxt);
|
|
#endif
|
|
if (ctxt->determinist != -1)
|
|
return(ctxt->determinist);
|
|
|
|
/*
|
|
* Check for all states that there isn't 2 transitions
|
|
* with the same atom and a different target.
|
|
*/
|
|
for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
|
|
state = ctxt->states[statenr];
|
|
if (state == NULL)
|
|
continue;
|
|
for (transnr = 0;transnr < state->nbTrans;transnr++) {
|
|
t1 = &(state->trans[transnr]);
|
|
/*
|
|
* Determinism checks in case of counted or all transitions
|
|
* will have to be handled separately
|
|
*/
|
|
if (t1->atom == NULL)
|
|
continue;
|
|
if (t1->to == -1) /* eliminated */
|
|
continue;
|
|
for (i = 0;i < transnr;i++) {
|
|
t2 = &(state->trans[i]);
|
|
if (t2->to == -1) /* eliminated */
|
|
continue;
|
|
if (t2->atom != NULL) {
|
|
if (t1->to == t2->to) {
|
|
if (xmlFACompareAtoms(t1->atom, t2->atom))
|
|
t2->to = -1; /* eliminate */
|
|
} else {
|
|
/* not determinist ! */
|
|
if (xmlFACompareAtoms(t1->atom, t2->atom))
|
|
ret = 0;
|
|
}
|
|
} else if (t1->to != -1) {
|
|
/*
|
|
* do the closure in case of remaining specific
|
|
* epsilon transitions like choices or all
|
|
*/
|
|
ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
|
|
t2->to, t2->atom);
|
|
if (ret == 0)
|
|
return(0);
|
|
}
|
|
}
|
|
if (ret == 0)
|
|
break;
|
|
}
|
|
if (ret == 0)
|
|
break;
|
|
}
|
|
ctxt->determinist = ret;
|
|
return(ret);
|
|
}
|
|
|
|
/************************************************************************
|
|
* *
|
|
* Routines to check input against transition atoms *
|
|
* *
|
|
************************************************************************/
|
|
|
|
static int
|
|
xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg,
|
|
int start, int end, const xmlChar *blockName) {
|
|
int ret = 0;
|
|
|
|
switch (type) {
|
|
case XML_REGEXP_STRING:
|
|
case XML_REGEXP_SUBREG:
|
|
case XML_REGEXP_RANGES:
|
|
case XML_REGEXP_EPSILON:
|
|
return(-1);
|
|
case XML_REGEXP_ANYCHAR:
|
|
ret = ((codepoint != '\n') && (codepoint != '\r'));
|
|
break;
|
|
case XML_REGEXP_CHARVAL:
|
|
ret = ((codepoint >= start) && (codepoint <= end));
|
|
break;
|
|
case XML_REGEXP_NOTSPACE:
|
|
neg = !neg;
|
|
case XML_REGEXP_ANYSPACE:
|
|
ret = ((codepoint == '\n') || (codepoint == '\r') ||
|
|
(codepoint == '\t') || (codepoint == ' '));
|
|
break;
|
|
case XML_REGEXP_NOTINITNAME:
|
|
neg = !neg;
|
|
case XML_REGEXP_INITNAME:
|
|
ret = (xmlIsLetter(codepoint) ||
|
|
(codepoint == '_') || (codepoint == ':'));
|
|
break;
|
|
case XML_REGEXP_NOTNAMECHAR:
|
|
neg = !neg;
|
|
case XML_REGEXP_NAMECHAR:
|
|
ret = (xmlIsLetter(codepoint) || xmlIsDigit(codepoint) ||
|
|
(codepoint == '.') || (codepoint == '-') ||
|
|
(codepoint == '_') || (codepoint == ':') ||
|
|
xmlIsCombining(codepoint) || xmlIsExtender(codepoint));
|
|
break;
|
|
case XML_REGEXP_NOTDECIMAL:
|
|
neg = !neg;
|
|
case XML_REGEXP_DECIMAL:
|
|
ret = xmlUCSIsCatNd(codepoint);
|
|
break;
|
|
case XML_REGEXP_REALCHAR:
|
|
neg = !neg;
|
|
case XML_REGEXP_NOTREALCHAR:
|
|
ret = xmlUCSIsCatP(codepoint);
|
|
if (ret == 0)
|
|
ret = xmlUCSIsCatZ(codepoint);
|
|
if (ret == 0)
|
|
ret = xmlUCSIsCatC(codepoint);
|
|
break;
|
|
case XML_REGEXP_LETTER:
|
|
ret = xmlUCSIsCatL(codepoint);
|
|
break;
|
|
case XML_REGEXP_LETTER_UPPERCASE:
|
|
ret = xmlUCSIsCatLu(codepoint);
|
|
break;
|
|
case XML_REGEXP_LETTER_LOWERCASE:
|
|
ret = xmlUCSIsCatLl(codepoint);
|
|
break;
|
|
case XML_REGEXP_LETTER_TITLECASE:
|
|
ret = xmlUCSIsCatLt(codepoint);
|
|
break;
|
|
case XML_REGEXP_LETTER_MODIFIER:
|
|
ret = xmlUCSIsCatLm(codepoint);
|
|
break;
|
|
case XML_REGEXP_LETTER_OTHERS:
|
|
ret = xmlUCSIsCatLo(codepoint);
|
|
break;
|
|
case XML_REGEXP_MARK:
|
|
ret = xmlUCSIsCatM(codepoint);
|
|
break;
|
|
case XML_REGEXP_MARK_NONSPACING:
|
|
ret = xmlUCSIsCatMn(codepoint);
|
|
break;
|
|
case XML_REGEXP_MARK_SPACECOMBINING:
|
|
ret = xmlUCSIsCatMc(codepoint);
|
|
break;
|
|
case XML_REGEXP_MARK_ENCLOSING:
|
|
ret = xmlUCSIsCatMe(codepoint);
|
|
break;
|
|
case XML_REGEXP_NUMBER:
|
|
ret = xmlUCSIsCatN(codepoint);
|
|
break;
|
|
case XML_REGEXP_NUMBER_DECIMAL:
|
|
ret = xmlUCSIsCatNd(codepoint);
|
|
break;
|
|
case XML_REGEXP_NUMBER_LETTER:
|
|
ret = xmlUCSIsCatNl(codepoint);
|
|
break;
|
|
case XML_REGEXP_NUMBER_OTHERS:
|
|
ret = xmlUCSIsCatNo(codepoint);
|
|
break;
|
|
case XML_REGEXP_PUNCT:
|
|
ret = xmlUCSIsCatP(codepoint);
|
|
break;
|
|
case XML_REGEXP_PUNCT_CONNECTOR:
|
|
ret = xmlUCSIsCatPc(codepoint);
|
|
break;
|
|
case XML_REGEXP_PUNCT_DASH:
|
|
ret = xmlUCSIsCatPd(codepoint);
|
|
break;
|
|
case XML_REGEXP_PUNCT_OPEN:
|
|
ret = xmlUCSIsCatPs(codepoint);
|
|
break;
|
|
case XML_REGEXP_PUNCT_CLOSE:
|
|
ret = xmlUCSIsCatPe(codepoint);
|
|
break;
|
|
case XML_REGEXP_PUNCT_INITQUOTE:
|
|
ret = xmlUCSIsCatPi(codepoint);
|
|
break;
|
|
case XML_REGEXP_PUNCT_FINQUOTE:
|
|
ret = xmlUCSIsCatPf(codepoint);
|
|
break;
|
|
case XML_REGEXP_PUNCT_OTHERS:
|
|
ret = xmlUCSIsCatPo(codepoint);
|
|
break;
|
|
case XML_REGEXP_SEPAR:
|
|
ret = xmlUCSIsCatZ(codepoint);
|
|
break;
|
|
case XML_REGEXP_SEPAR_SPACE:
|
|
ret = xmlUCSIsCatZs(codepoint);
|
|
break;
|
|
case XML_REGEXP_SEPAR_LINE:
|
|
ret = xmlUCSIsCatZl(codepoint);
|
|
break;
|
|
case XML_REGEXP_SEPAR_PARA:
|
|
ret = xmlUCSIsCatZp(codepoint);
|
|
break;
|
|
case XML_REGEXP_SYMBOL:
|
|
ret = xmlUCSIsCatS(codepoint);
|
|
break;
|
|
case XML_REGEXP_SYMBOL_MATH:
|
|
ret = xmlUCSIsCatSm(codepoint);
|
|
break;
|
|
case XML_REGEXP_SYMBOL_CURRENCY:
|
|
ret = xmlUCSIsCatSc(codepoint);
|
|
break;
|
|
case XML_REGEXP_SYMBOL_MODIFIER:
|
|
ret = xmlUCSIsCatSk(codepoint);
|
|
break;
|
|
case XML_REGEXP_SYMBOL_OTHERS:
|
|
ret = xmlUCSIsCatSo(codepoint);
|
|
break;
|
|
case XML_REGEXP_OTHER:
|
|
ret = xmlUCSIsCatC(codepoint);
|
|
break;
|
|
case XML_REGEXP_OTHER_CONTROL:
|
|
ret = xmlUCSIsCatCc(codepoint);
|
|
break;
|
|
case XML_REGEXP_OTHER_FORMAT:
|
|
ret = xmlUCSIsCatCf(codepoint);
|
|
break;
|
|
case XML_REGEXP_OTHER_PRIVATE:
|
|
ret = xmlUCSIsCatCo(codepoint);
|
|
break;
|
|
case XML_REGEXP_OTHER_NA:
|
|
/* ret = xmlUCSIsCatCn(codepoint); */
|
|
/* Seems it doesn't exist anymore in recent Unicode releases */
|
|
ret = 0;
|
|
break;
|
|
case XML_REGEXP_BLOCK_NAME:
|
|
ret = xmlUCSIsBlock(codepoint, (const char *) blockName);
|
|
break;
|
|
}
|
|
if (neg)
|
|
return(!ret);
|
|
return(ret);
|
|
}
|
|
|
|
static int
|
|
xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) {
|
|
int i, ret = 0;
|
|
xmlRegRangePtr range;
|
|
|
|
if ((atom == NULL) || (!xmlIsChar(codepoint)))
|
|
return(-1);
|
|
|
|
switch (atom->type) {
|
|
case XML_REGEXP_SUBREG:
|
|
case XML_REGEXP_EPSILON:
|
|
return(-1);
|
|
case XML_REGEXP_CHARVAL:
|
|
return(codepoint == atom->codepoint);
|
|
case XML_REGEXP_RANGES: {
|
|
int accept = 0;
|
|
for (i = 0;i < atom->nbRanges;i++) {
|
|
range = atom->ranges[i];
|
|
if (range->neg) {
|
|
ret = xmlRegCheckCharacterRange(range->type, codepoint,
|
|
0, range->start, range->end,
|
|
range->blockName);
|
|
if (ret != 0)
|
|
return(0); /* excluded char */
|
|
} else {
|
|
ret = xmlRegCheckCharacterRange(range->type, codepoint,
|
|
0, range->start, range->end,
|
|
range->blockName);
|
|
if (ret != 0)
|
|
accept = 1; /* might still be excluded */
|
|
}
|
|
}
|
|
return(accept);
|
|
}
|
|
case XML_REGEXP_STRING:
|
|
printf("TODO: XML_REGEXP_STRING\n");
|
|
return(-1);
|
|
case XML_REGEXP_ANYCHAR:
|
|
case XML_REGEXP_ANYSPACE:
|
|
case XML_REGEXP_NOTSPACE:
|
|
case XML_REGEXP_INITNAME:
|
|
case XML_REGEXP_NOTINITNAME:
|
|
case XML_REGEXP_NAMECHAR:
|
|
case XML_REGEXP_NOTNAMECHAR:
|
|
case XML_REGEXP_DECIMAL:
|
|
case XML_REGEXP_NOTDECIMAL:
|
|
case XML_REGEXP_REALCHAR:
|
|
case XML_REGEXP_NOTREALCHAR:
|
|
case XML_REGEXP_LETTER:
|
|
case XML_REGEXP_LETTER_UPPERCASE:
|
|
case XML_REGEXP_LETTER_LOWERCASE:
|
|
case XML_REGEXP_LETTER_TITLECASE:
|
|
case XML_REGEXP_LETTER_MODIFIER:
|
|
case XML_REGEXP_LETTER_OTHERS:
|
|
case XML_REGEXP_MARK:
|
|
case XML_REGEXP_MARK_NONSPACING:
|
|
case XML_REGEXP_MARK_SPACECOMBINING:
|
|
case XML_REGEXP_MARK_ENCLOSING:
|
|
case XML_REGEXP_NUMBER:
|
|
case XML_REGEXP_NUMBER_DECIMAL:
|
|
case XML_REGEXP_NUMBER_LETTER:
|
|
case XML_REGEXP_NUMBER_OTHERS:
|
|
case XML_REGEXP_PUNCT:
|
|
case XML_REGEXP_PUNCT_CONNECTOR:
|
|
case XML_REGEXP_PUNCT_DASH:
|
|
case XML_REGEXP_PUNCT_OPEN:
|
|
case XML_REGEXP_PUNCT_CLOSE:
|
|
case XML_REGEXP_PUNCT_INITQUOTE:
|
|
case XML_REGEXP_PUNCT_FINQUOTE:
|
|
case XML_REGEXP_PUNCT_OTHERS:
|
|
case XML_REGEXP_SEPAR:
|
|
case XML_REGEXP_SEPAR_SPACE:
|
|
case XML_REGEXP_SEPAR_LINE:
|
|
case XML_REGEXP_SEPAR_PARA:
|
|
case XML_REGEXP_SYMBOL:
|
|
case XML_REGEXP_SYMBOL_MATH:
|
|
case XML_REGEXP_SYMBOL_CURRENCY:
|
|
case XML_REGEXP_SYMBOL_MODIFIER:
|
|
case XML_REGEXP_SYMBOL_OTHERS:
|
|
case XML_REGEXP_OTHER:
|
|
case XML_REGEXP_OTHER_CONTROL:
|
|
case XML_REGEXP_OTHER_FORMAT:
|
|
case XML_REGEXP_OTHER_PRIVATE:
|
|
case XML_REGEXP_OTHER_NA:
|
|
case XML_REGEXP_BLOCK_NAME:
|
|
ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0,
|
|
(const xmlChar *)atom->valuep);
|
|
if (atom->neg)
|
|
ret = !ret;
|
|
break;
|
|
}
|
|
return(ret);
|
|
}
|
|
|
|
/************************************************************************
|
|
* *
|
|
* Saving an restoring state of an execution context *
|
|
* *
|
|
************************************************************************/
|
|
|
|
#ifdef DEBUG_REGEXP_EXEC
|
|
static void
|
|
xmlFARegDebugExec(xmlRegExecCtxtPtr exec) {
|
|
printf("state: %d:%d:idx %d", exec->state->no, exec->transno, exec->index);
|
|
if (exec->inputStack != NULL) {
|
|
int i;
|
|
printf(": ");
|
|
for (i = 0;(i < 3) && (i < exec->inputStackNr);i++)
|
|
printf("%s ", exec->inputStack[exec->inputStackNr - (i + 1)]);
|
|
} else {
|
|
printf(": %s", &(exec->inputString[exec->index]));
|
|
}
|
|
printf("\n");
|
|
}
|
|
#endif
|
|
|
|
static void
|
|
xmlFARegExecSave(xmlRegExecCtxtPtr exec) {
|
|
#ifdef DEBUG_REGEXP_EXEC
|
|
printf("saving ");
|
|
exec->transno++;
|
|
xmlFARegDebugExec(exec);
|
|
exec->transno--;
|
|
#endif
|
|
|
|
if (exec->maxRollbacks == 0) {
|
|
exec->maxRollbacks = 4;
|
|
exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks *
|
|
sizeof(xmlRegExecRollback));
|
|
if (exec->rollbacks == NULL) {
|
|
fprintf(stderr, "exec save: allocation failed");
|
|
exec->maxRollbacks = 0;
|
|
return;
|
|
}
|
|
memset(exec->rollbacks, 0,
|
|
exec->maxRollbacks * sizeof(xmlRegExecRollback));
|
|
} else if (exec->nbRollbacks >= exec->maxRollbacks) {
|
|
xmlRegExecRollback *tmp;
|
|
int len = exec->maxRollbacks;
|
|
|
|
exec->maxRollbacks *= 2;
|
|
tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks,
|
|
exec->maxRollbacks * sizeof(xmlRegExecRollback));
|
|
if (tmp == NULL) {
|
|
fprintf(stderr, "exec save: allocation failed");
|
|
exec->maxRollbacks /= 2;
|
|
return;
|
|
}
|
|
exec->rollbacks = tmp;
|
|
tmp = &exec->rollbacks[len];
|
|
memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback));
|
|
}
|
|
exec->rollbacks[exec->nbRollbacks].state = exec->state;
|
|
exec->rollbacks[exec->nbRollbacks].index = exec->index;
|
|
exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1;
|
|
if (exec->comp->nbCounters > 0) {
|
|
if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
|
|
exec->rollbacks[exec->nbRollbacks].counts = (int *)
|
|
xmlMalloc(exec->comp->nbCounters * sizeof(int));
|
|
if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
|
|
fprintf(stderr, "exec save: allocation failed");
|
|
exec->status = -5;
|
|
return;
|
|
}
|
|
}
|
|
memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts,
|
|
exec->comp->nbCounters * sizeof(int));
|
|
}
|
|
exec->nbRollbacks++;
|
|
}
|
|
|
|
static void
|
|
xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) {
|
|
if (exec->nbRollbacks <= 0) {
|
|
exec->status = -1;
|
|
#ifdef DEBUG_REGEXP_EXEC
|
|
printf("rollback failed on empty stack\n");
|
|
#endif
|
|
return;
|
|
}
|
|
exec->nbRollbacks--;
|
|
exec->state = exec->rollbacks[exec->nbRollbacks].state;
|
|
exec->index = exec->rollbacks[exec->nbRollbacks].index;
|
|
exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch;
|
|
if (exec->comp->nbCounters > 0) {
|
|
if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
|
|
fprintf(stderr, "exec save: allocation failed");
|
|
exec->status = -6;
|
|
return;
|
|
}
|
|
memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts,
|
|
exec->comp->nbCounters * sizeof(int));
|
|
}
|
|
|
|
#ifdef DEBUG_REGEXP_EXEC
|
|
printf("restored ");
|
|
xmlFARegDebugExec(exec);
|
|
#endif
|
|
}
|
|
|
|
/************************************************************************
|
|
* *
|
|
* Verifyer, running an input against a compiled regexp *
|
|
* *
|
|
************************************************************************/
|
|
|
|
static int
|
|
xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
|
|
xmlRegExecCtxt execval;
|
|
xmlRegExecCtxtPtr exec = &execval;
|
|
int ret, codepoint, len;
|
|
|
|
exec->inputString = content;
|
|
exec->index = 0;
|
|
exec->determinist = 1;
|
|
exec->maxRollbacks = 0;
|
|
exec->nbRollbacks = 0;
|
|
exec->rollbacks = NULL;
|
|
exec->status = 0;
|
|
exec->comp = comp;
|
|
exec->state = comp->states[0];
|
|
exec->transno = 0;
|
|
exec->transcount = 0;
|
|
if (comp->nbCounters > 0) {
|
|
exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int));
|
|
if (exec->counts == NULL)
|
|
return(-1);
|
|
memset(exec->counts, 0, comp->nbCounters * sizeof(int));
|
|
} else
|
|
exec->counts = NULL;
|
|
while ((exec->status == 0) &&
|
|
((exec->inputString[exec->index] != 0) ||
|
|
(exec->state->type != XML_REGEXP_FINAL_STATE))) {
|
|
xmlRegTransPtr trans;
|
|
xmlRegAtomPtr atom;
|
|
|
|
/*
|
|
* End of input on non-terminal state, rollback, however we may
|
|
* still have epsilon like transition for counted transitions
|
|
* on counters, in that case don't break too early.
|
|
*/
|
|
if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL))
|
|
goto rollback;
|
|
|
|
exec->transcount = 0;
|
|
for (;exec->transno < exec->state->nbTrans;exec->transno++) {
|
|
trans = &exec->state->trans[exec->transno];
|
|
if (trans->to < 0)
|
|
continue;
|
|
atom = trans->atom;
|
|
ret = 0;
|
|
if (trans->count >= 0) {
|
|
int count;
|
|
xmlRegCounterPtr counter;
|
|
|
|
/*
|
|
* A counted transition.
|
|
*/
|
|
|
|
count = exec->counts[trans->count];
|
|
counter = &exec->comp->counters[trans->count];
|
|
#ifdef DEBUG_REGEXP_EXEC
|
|
printf("testing count %d: val %d, min %d, max %d\n",
|
|
trans->count, count, counter->min, counter->max);
|
|
#endif
|
|
ret = ((count >= counter->min) && (count <= counter->max));
|
|
} else if (atom == NULL) {
|
|
fprintf(stderr, "epsilon transition left at runtime\n");
|
|
exec->status = -2;
|
|
break;
|
|
} else if (exec->inputString[exec->index] != 0) {
|
|
codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
|
|
ret = xmlRegCheckCharacter(atom, codepoint);
|
|
if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
|
|
xmlRegStatePtr to = comp->states[trans->to];
|
|
|
|
/*
|
|
* this is a multiple input sequence
|
|
*/
|
|
if (exec->state->nbTrans > exec->transno + 1) {
|
|
xmlFARegExecSave(exec);
|
|
}
|
|
exec->transcount = 1;
|
|
do {
|
|
/*
|
|
* Try to progress as much as possible on the input
|
|
*/
|
|
if (exec->transcount == atom->max) {
|
|
break;
|
|
}
|
|
exec->index += len;
|
|
/*
|
|
* End of input: stop here
|
|
*/
|
|
if (exec->inputString[exec->index] == 0) {
|
|
exec->index -= len;
|
|
break;
|
|
}
|
|
if (exec->transcount >= atom->min) {
|
|
int transno = exec->transno;
|
|
xmlRegStatePtr state = exec->state;
|
|
|
|
/*
|
|
* The transition is acceptable save it
|
|
*/
|
|
exec->transno = -1; /* trick */
|
|
exec->state = to;
|
|
xmlFARegExecSave(exec);
|
|
exec->transno = transno;
|
|
exec->state = state;
|
|
}
|
|
codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
|
|
len);
|
|
ret = xmlRegCheckCharacter(atom, codepoint);
|
|
exec->transcount++;
|
|
} while (ret == 1);
|
|
if (exec->transcount < atom->min)
|
|
ret = 0;
|
|
|
|
/*
|
|
* If the last check failed but one transition was found
|
|
* possible, rollback
|
|
*/
|
|
if (ret < 0)
|
|
ret = 0;
|
|
if (ret == 0) {
|
|
goto rollback;
|
|
}
|
|
}
|
|
}
|
|
if (ret == 1) {
|
|
if (exec->state->nbTrans > exec->transno + 1) {
|
|
xmlFARegExecSave(exec);
|
|
}
|
|
if (trans->counter >= 0) {
|
|
#ifdef DEBUG_REGEXP_EXEC
|
|
printf("Increasing count %d\n", trans->counter);
|
|
#endif
|
|
exec->counts[trans->counter]++;
|
|
}
|
|
#ifdef DEBUG_REGEXP_EXEC
|
|
printf("entering state %d\n", trans->to);
|
|
#endif
|
|
exec->state = comp->states[trans->to];
|
|
exec->transno = 0;
|
|
if (trans->atom != NULL) {
|
|
exec->index += len;
|
|
}
|
|
goto progress;
|
|
} else if (ret < 0) {
|
|
exec->status = -4;
|
|
break;
|
|
}
|
|
}
|
|
if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
|
|
rollback:
|
|
/*
|
|
* Failed to find a way out
|
|
*/
|
|
exec->determinist = 0;
|
|
xmlFARegExecRollBack(exec);
|
|
}
|
|
progress:
|
|
continue;
|
|
}
|
|
if (exec->rollbacks != NULL) {
|
|
if (exec->counts != NULL) {
|
|
int i;
|
|
|
|
for (i = 0;i < exec->maxRollbacks;i++)
|
|
if (exec->rollbacks[i].counts != NULL)
|
|
xmlFree(exec->rollbacks[i].counts);
|
|
}
|
|
xmlFree(exec->rollbacks);
|
|
}
|
|
if (exec->counts != NULL)
|
|
xmlFree(exec->counts);
|
|
if (exec->status == 0)
|
|
return(1);
|
|
if (exec->status == -1)
|
|
return(0);
|
|
return(exec->status);
|
|
}
|
|
|
|
/************************************************************************
|
|
* *
|
|
* Progressive interface to the verifyer one atom at a time *
|
|
* *
|
|
************************************************************************/
|
|
|
|
/**
|
|
* xmlRegNewExecCtxt:
|
|
* @comp: a precompiled regular expression
|
|
* @callback: a callback function used for handling progresses in the
|
|
* automata matching phase
|
|
* @data: the context data associated to the callback in this context
|
|
*
|
|
* Build a context used for progressive evaluation of a regexp.
|
|
*
|
|
* Returns the new context
|
|
*/
|
|
xmlRegExecCtxtPtr
|
|
xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) {
|
|
xmlRegExecCtxtPtr exec;
|
|
|
|
if (comp == NULL)
|
|
return(NULL);
|
|
exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt));
|
|
if (exec == NULL) {
|
|
return(NULL);
|
|
}
|
|
memset(exec, 0, sizeof(xmlRegExecCtxt));
|
|
exec->inputString = NULL;
|
|
exec->index = 0;
|
|
exec->determinist = 1;
|
|
exec->maxRollbacks = 0;
|
|
exec->nbRollbacks = 0;
|
|
exec->rollbacks = NULL;
|
|
exec->status = 0;
|
|
exec->comp = comp;
|
|
if (comp->compact == NULL)
|
|
exec->state = comp->states[0];
|
|
exec->transno = 0;
|
|
exec->transcount = 0;
|
|
exec->callback = callback;
|
|
exec->data = data;
|
|
if (comp->nbCounters > 0) {
|
|
exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int));
|
|
if (exec->counts == NULL) {
|
|
xmlFree(exec);
|
|
return(NULL);
|
|
}
|
|
memset(exec->counts, 0, comp->nbCounters * sizeof(int));
|
|
} else
|
|
exec->counts = NULL;
|
|
exec->inputStackMax = 0;
|
|
exec->inputStackNr = 0;
|
|
exec->inputStack = NULL;
|
|
return(exec);
|
|
}
|
|
|
|
/**
|
|
* xmlRegFreeExecCtxt:
|
|
* @exec: a regular expression evaulation context
|
|
*
|
|
* Free the structures associated to a regular expression evaulation context.
|
|
*/
|
|
void
|
|
xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) {
|
|
if (exec == NULL)
|
|
return;
|
|
|
|
if (exec->rollbacks != NULL) {
|
|
if (exec->counts != NULL) {
|
|
int i;
|
|
|
|
for (i = 0;i < exec->maxRollbacks;i++)
|
|
if (exec->rollbacks[i].counts != NULL)
|
|
xmlFree(exec->rollbacks[i].counts);
|
|
}
|
|
xmlFree(exec->rollbacks);
|
|
}
|
|
if (exec->counts != NULL)
|
|
xmlFree(exec->counts);
|
|
if (exec->inputStack != NULL) {
|
|
int i;
|
|
|
|
for (i = 0;i < exec->inputStackNr;i++) {
|
|
if (exec->inputStack[i].value != NULL)
|
|
xmlFree(exec->inputStack[i].value);
|
|
}
|
|
xmlFree(exec->inputStack);
|
|
}
|
|
xmlFree(exec);
|
|
}
|
|
|
|
static void
|
|
xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value,
|
|
void *data) {
|
|
#ifdef DEBUG_PUSH
|
|
printf("saving value: %d:%s\n", exec->inputStackNr, value);
|
|
#endif
|
|
if (exec->inputStackMax == 0) {
|
|
exec->inputStackMax = 4;
|
|
exec->inputStack = (xmlRegInputTokenPtr)
|
|
xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken));
|
|
if (exec->inputStack == NULL) {
|
|
fprintf(stderr, "push input: allocation failed");
|
|
exec->inputStackMax = 0;
|
|
return;
|
|
}
|
|
} else if (exec->inputStackNr + 1 >= exec->inputStackMax) {
|
|
xmlRegInputTokenPtr tmp;
|
|
|
|
exec->inputStackMax *= 2;
|
|
tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack,
|
|
exec->inputStackMax * sizeof(xmlRegInputToken));
|
|
if (tmp == NULL) {
|
|
fprintf(stderr, "push input: allocation failed");
|
|
exec->inputStackMax /= 2;
|
|
return;
|
|
}
|
|
exec->inputStack = tmp;
|
|
}
|
|
exec->inputStack[exec->inputStackNr].value = xmlStrdup(value);
|
|
exec->inputStack[exec->inputStackNr].data = data;
|
|
exec->inputStackNr++;
|
|
exec->inputStack[exec->inputStackNr].value = NULL;
|
|
exec->inputStack[exec->inputStackNr].data = NULL;
|
|
}
|
|
|
|
|
|
/**
|
|
* xmlRegCompactPushString:
|
|
* @exec: a regexp execution context
|
|
* @comp: the precompiled exec with a compact table
|
|
* @value: a string token input
|
|
* @data: data associated to the token to reuse in callbacks
|
|
*
|
|
* Push one input token in the execution context
|
|
*
|
|
* Returns: 1 if the regexp reached a final state, 0 if non-final, and
|
|
* a negative value in case of error.
|
|
*/
|
|
static int
|
|
xmlRegCompactPushString(xmlRegExecCtxtPtr exec,
|
|
xmlRegexpPtr comp,
|
|
const xmlChar *value,
|
|
void *data) {
|
|
int state = exec->index;
|
|
int i, target;
|
|
|
|
if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL))
|
|
return(-1);
|
|
|
|
if (value == NULL) {
|
|
/*
|
|
* are we at a final state ?
|
|
*/
|
|
if (comp->compact[state * (comp->nbstrings + 1)] ==
|
|
XML_REGEXP_FINAL_STATE)
|
|
return(1);
|
|
return(0);
|
|
}
|
|
|
|
#ifdef DEBUG_PUSH
|
|
printf("value pushed: %s\n", value);
|
|
#endif
|
|
|
|
/*
|
|
* Examine all outside transition from current state
|
|
*/
|
|
for (i = 0;i < comp->nbstrings;i++) {
|
|
target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
|
|
if ((target > 0) && (target <= comp->nbstates)) {
|
|
target--; /* to avoid 0 */
|
|
if (xmlStrEqual(comp->stringMap[i], value)) {
|
|
exec->index = target;
|
|
if ((exec->callback != NULL) && (comp->transdata != NULL)) {
|
|
exec->callback(exec->data, value,
|
|
comp->transdata[state * comp->nbstrings + i], data);
|
|
}
|
|
#ifdef DEBUG_PUSH
|
|
printf("entering state %d\n", target);
|
|
#endif
|
|
if (comp->compact[target * (comp->nbstrings + 1)] ==
|
|
XML_REGEXP_FINAL_STATE)
|
|
return(1);
|
|
return(0);
|
|
}
|
|
}
|
|
}
|
|
/*
|
|
* Failed to find an exit transition out from current state for the
|
|
* current token
|
|
*/
|
|
#ifdef DEBUG_PUSH
|
|
printf("failed to find a transition for %s on state %d\n", value, state);
|
|
#endif
|
|
exec->status = -1;
|
|
return(-1);
|
|
}
|
|
|
|
/**
|
|
* xmlRegExecPushString:
|
|
* @exec: a regexp execution context or NULL to indicate the end
|
|
* @value: a string token input
|
|
* @data: data associated to the token to reuse in callbacks
|
|
*
|
|
* Push one input token in the execution context
|
|
*
|
|
* Returns: 1 if the regexp reached a final state, 0 if non-final, and
|
|
* a negative value in case of error.
|
|
*/
|
|
int
|
|
xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value,
|
|
void *data) {
|
|
xmlRegTransPtr trans;
|
|
xmlRegAtomPtr atom;
|
|
int ret;
|
|
int final = 0;
|
|
|
|
if (exec == NULL)
|
|
return(-1);
|
|
if (exec->comp == NULL)
|
|
return(-1);
|
|
if (exec->status != 0)
|
|
return(exec->status);
|
|
|
|
if (exec->comp->compact != NULL)
|
|
return(xmlRegCompactPushString(exec, exec->comp, value, data));
|
|
|
|
if (value == NULL) {
|
|
if (exec->state->type == XML_REGEXP_FINAL_STATE)
|
|
return(1);
|
|
final = 1;
|
|
}
|
|
|
|
#ifdef DEBUG_PUSH
|
|
printf("value pushed: %s\n", value);
|
|
#endif
|
|
/*
|
|
* If we have an active rollback stack push the new value there
|
|
* and get back to where we were left
|
|
*/
|
|
if ((value != NULL) && (exec->inputStackNr > 0)) {
|
|
xmlFARegExecSaveInputString(exec, value, data);
|
|
value = exec->inputStack[exec->index].value;
|
|
data = exec->inputStack[exec->index].data;
|
|
#ifdef DEBUG_PUSH
|
|
printf("value loaded: %s\n", value);
|
|
#endif
|
|
}
|
|
|
|
while ((exec->status == 0) &&
|
|
((value != NULL) ||
|
|
((final == 1) &&
|
|
(exec->state->type != XML_REGEXP_FINAL_STATE)))) {
|
|
|
|
/*
|
|
* End of input on non-terminal state, rollback, however we may
|
|
* still have epsilon like transition for counted transitions
|
|
* on counters, in that case don't break too early.
|
|
*/
|
|
if ((value == NULL) && (exec->counts == NULL))
|
|
goto rollback;
|
|
|
|
exec->transcount = 0;
|
|
for (;exec->transno < exec->state->nbTrans;exec->transno++) {
|
|
trans = &exec->state->trans[exec->transno];
|
|
if (trans->to < 0)
|
|
continue;
|
|
atom = trans->atom;
|
|
ret = 0;
|
|
if (trans->count == REGEXP_ALL_LAX_COUNTER) {
|
|
int i;
|
|
int count;
|
|
xmlRegTransPtr t;
|
|
xmlRegCounterPtr counter;
|
|
|
|
ret = 0;
|
|
|
|
#ifdef DEBUG_PUSH
|
|
printf("testing all lax %d\n", trans->count);
|
|
#endif
|
|
/*
|
|
* Check all counted transitions from the current state
|
|
*/
|
|
if ((value == NULL) && (final)) {
|
|
ret = 1;
|
|
} else if (value != NULL) {
|
|
for (i = 0;i < exec->state->nbTrans;i++) {
|
|
t = &exec->state->trans[i];
|
|
if ((t->counter < 0) || (t == trans))
|
|
continue;
|
|
counter = &exec->comp->counters[t->counter];
|
|
count = exec->counts[t->counter];
|
|
if ((count < counter->max) &&
|
|
(t->atom != NULL) &&
|
|
(xmlStrEqual(value, t->atom->valuep))) {
|
|
ret = 0;
|
|
break;
|
|
}
|
|
if ((count >= counter->min) &&
|
|
(count < counter->max) &&
|
|
(xmlStrEqual(value, t->atom->valuep))) {
|
|
ret = 1;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
} else if (trans->count == REGEXP_ALL_COUNTER) {
|
|
int i;
|
|
int count;
|
|
xmlRegTransPtr t;
|
|
xmlRegCounterPtr counter;
|
|
|
|
ret = 1;
|
|
|
|
#ifdef DEBUG_PUSH
|
|
printf("testing all %d\n", trans->count);
|
|
#endif
|
|
/*
|
|
* Check all counted transitions from the current state
|
|
*/
|
|
for (i = 0;i < exec->state->nbTrans;i++) {
|
|
t = &exec->state->trans[i];
|
|
if ((t->counter < 0) || (t == trans))
|
|
continue;
|
|
counter = &exec->comp->counters[t->counter];
|
|
count = exec->counts[t->counter];
|
|
if ((count < counter->min) || (count > counter->max)) {
|
|
ret = 0;
|
|
break;
|
|
}
|
|
}
|
|
} else if (trans->count >= 0) {
|
|
int count;
|
|
xmlRegCounterPtr counter;
|
|
|
|
/*
|
|
* A counted transition.
|
|
*/
|
|
|
|
count = exec->counts[trans->count];
|
|
counter = &exec->comp->counters[trans->count];
|
|
#ifdef DEBUG_PUSH
|
|
printf("testing count %d: val %d, min %d, max %d\n",
|
|
trans->count, count, counter->min, counter->max);
|
|
#endif
|
|
ret = ((count >= counter->min) && (count <= counter->max));
|
|
} else if (atom == NULL) {
|
|
fprintf(stderr, "epsilon transition left at runtime\n");
|
|
exec->status = -2;
|
|
break;
|
|
} else if (value != NULL) {
|
|
ret = xmlStrEqual(value, atom->valuep);
|
|
if ((ret == 1) && (trans->counter >= 0)) {
|
|
xmlRegCounterPtr counter;
|
|
int count;
|
|
|
|
count = exec->counts[trans->counter];
|
|
counter = &exec->comp->counters[trans->counter];
|
|
if (count >= counter->max)
|
|
ret = 0;
|
|
}
|
|
|
|
if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
|
|
xmlRegStatePtr to = exec->comp->states[trans->to];
|
|
|
|
/*
|
|
* this is a multiple input sequence
|
|
*/
|
|
if (exec->state->nbTrans > exec->transno + 1) {
|
|
if (exec->inputStackNr <= 0) {
|
|
xmlFARegExecSaveInputString(exec, value, data);
|
|
}
|
|
xmlFARegExecSave(exec);
|
|
}
|
|
exec->transcount = 1;
|
|
do {
|
|
/*
|
|
* Try to progress as much as possible on the input
|
|
*/
|
|
if (exec->transcount == atom->max) {
|
|
break;
|
|
}
|
|
exec->index++;
|
|
value = exec->inputStack[exec->index].value;
|
|
data = exec->inputStack[exec->index].data;
|
|
#ifdef DEBUG_PUSH
|
|
printf("value loaded: %s\n", value);
|
|
#endif
|
|
|
|
/*
|
|
* End of input: stop here
|
|
*/
|
|
if (value == NULL) {
|
|
exec->index --;
|
|
break;
|
|
}
|
|
if (exec->transcount >= atom->min) {
|
|
int transno = exec->transno;
|
|
xmlRegStatePtr state = exec->state;
|
|
|
|
/*
|
|
* The transition is acceptable save it
|
|
*/
|
|
exec->transno = -1; /* trick */
|
|
exec->state = to;
|
|
if (exec->inputStackNr <= 0) {
|
|
xmlFARegExecSaveInputString(exec, value, data);
|
|
}
|
|
xmlFARegExecSave(exec);
|
|
exec->transno = transno;
|
|
exec->state = state;
|
|
}
|
|
ret = xmlStrEqual(value, atom->valuep);
|
|
exec->transcount++;
|
|
} while (ret == 1);
|
|
if (exec->transcount < atom->min)
|
|
ret = 0;
|
|
|
|
/*
|
|
* If the last check failed but one transition was found
|
|
* possible, rollback
|
|
*/
|
|
if (ret < 0)
|
|
ret = 0;
|
|
if (ret == 0) {
|
|
goto rollback;
|
|
}
|
|
}
|
|
}
|
|
if (ret == 1) {
|
|
if ((exec->callback != NULL) && (atom != NULL)) {
|
|
exec->callback(exec->data, atom->valuep,
|
|
atom->data, data);
|
|
}
|
|
if (exec->state->nbTrans > exec->transno + 1) {
|
|
if (exec->inputStackNr <= 0) {
|
|
xmlFARegExecSaveInputString(exec, value, data);
|
|
}
|
|
xmlFARegExecSave(exec);
|
|
}
|
|
if (trans->counter >= 0) {
|
|
#ifdef DEBUG_PUSH
|
|
printf("Increasing count %d\n", trans->counter);
|
|
#endif
|
|
exec->counts[trans->counter]++;
|
|
}
|
|
#ifdef DEBUG_PUSH
|
|
printf("entering state %d\n", trans->to);
|
|
#endif
|
|
exec->state = exec->comp->states[trans->to];
|
|
exec->transno = 0;
|
|
if (trans->atom != NULL) {
|
|
if (exec->inputStack != NULL) {
|
|
exec->index++;
|
|
if (exec->index < exec->inputStackNr) {
|
|
value = exec->inputStack[exec->index].value;
|
|
data = exec->inputStack[exec->index].data;
|
|
#ifdef DEBUG_PUSH
|
|
printf("value loaded: %s\n", value);
|
|
#endif
|
|
} else {
|
|
value = NULL;
|
|
data = NULL;
|
|
#ifdef DEBUG_PUSH
|
|
printf("end of input\n");
|
|
#endif
|
|
}
|
|
} else {
|
|
value = NULL;
|
|
data = NULL;
|
|
#ifdef DEBUG_PUSH
|
|
printf("end of input\n");
|
|
#endif
|
|
}
|
|
}
|
|
goto progress;
|
|
} else if (ret < 0) {
|
|
exec->status = -4;
|
|
break;
|
|
}
|
|
}
|
|
if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
|
|
rollback:
|
|
/*
|
|
* Failed to find a way out
|
|
*/
|
|
exec->determinist = 0;
|
|
xmlFARegExecRollBack(exec);
|
|
if (exec->status == 0) {
|
|
value = exec->inputStack[exec->index].value;
|
|
data = exec->inputStack[exec->index].data;
|
|
#ifdef DEBUG_PUSH
|
|
printf("value loaded: %s\n", value);
|
|
#endif
|
|
}
|
|
}
|
|
progress:
|
|
continue;
|
|
}
|
|
if (exec->status == 0) {
|
|
return(exec->state->type == XML_REGEXP_FINAL_STATE);
|
|
}
|
|
return(exec->status);
|
|
}
|
|
|
|
#if 0
|
|
static int
|
|
xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) {
|
|
xmlRegTransPtr trans;
|
|
xmlRegAtomPtr atom;
|
|
int ret;
|
|
int codepoint, len;
|
|
|
|
if (exec == NULL)
|
|
return(-1);
|
|
if (exec->status != 0)
|
|
return(exec->status);
|
|
|
|
while ((exec->status == 0) &&
|
|
((exec->inputString[exec->index] != 0) ||
|
|
(exec->state->type != XML_REGEXP_FINAL_STATE))) {
|
|
|
|
/*
|
|
* End of input on non-terminal state, rollback, however we may
|
|
* still have epsilon like transition for counted transitions
|
|
* on counters, in that case don't break too early.
|
|
*/
|
|
if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL))
|
|
goto rollback;
|
|
|
|
exec->transcount = 0;
|
|
for (;exec->transno < exec->state->nbTrans;exec->transno++) {
|
|
trans = &exec->state->trans[exec->transno];
|
|
if (trans->to < 0)
|
|
continue;
|
|
atom = trans->atom;
|
|
ret = 0;
|
|
if (trans->count >= 0) {
|
|
int count;
|
|
xmlRegCounterPtr counter;
|
|
|
|
/*
|
|
* A counted transition.
|
|
*/
|
|
|
|
count = exec->counts[trans->count];
|
|
counter = &exec->comp->counters[trans->count];
|
|
#ifdef DEBUG_REGEXP_EXEC
|
|
printf("testing count %d: val %d, min %d, max %d\n",
|
|
trans->count, count, counter->min, counter->max);
|
|
#endif
|
|
ret = ((count >= counter->min) && (count <= counter->max));
|
|
} else if (atom == NULL) {
|
|
fprintf(stderr, "epsilon transition left at runtime\n");
|
|
exec->status = -2;
|
|
break;
|
|
} else if (exec->inputString[exec->index] != 0) {
|
|
codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
|
|
ret = xmlRegCheckCharacter(atom, codepoint);
|
|
if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
|
|
xmlRegStatePtr to = exec->comp->states[trans->to];
|
|
|
|
/*
|
|
* this is a multiple input sequence
|
|
*/
|
|
if (exec->state->nbTrans > exec->transno + 1) {
|
|
xmlFARegExecSave(exec);
|
|
}
|
|
exec->transcount = 1;
|
|
do {
|
|
/*
|
|
* Try to progress as much as possible on the input
|
|
*/
|
|
if (exec->transcount == atom->max) {
|
|
break;
|
|
}
|
|
exec->index += len;
|
|
/*
|
|
* End of input: stop here
|
|
*/
|
|
if (exec->inputString[exec->index] == 0) {
|
|
exec->index -= len;
|
|
break;
|
|
}
|
|
if (exec->transcount >= atom->min) {
|
|
int transno = exec->transno;
|
|
xmlRegStatePtr state = exec->state;
|
|
|
|
/*
|
|
* The transition is acceptable save it
|
|
*/
|
|
exec->transno = -1; /* trick */
|
|
exec->state = to;
|
|
xmlFARegExecSave(exec);
|
|
exec->transno = transno;
|
|
exec->state = state;
|
|
}
|
|
codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
|
|
len);
|
|
ret = xmlRegCheckCharacter(atom, codepoint);
|
|
exec->transcount++;
|
|
} while (ret == 1);
|
|
if (exec->transcount < atom->min)
|
|
ret = 0;
|
|
|
|
/*
|
|
* If the last check failed but one transition was found
|
|
* possible, rollback
|
|
*/
|
|
if (ret < 0)
|
|
ret = 0;
|
|
if (ret == 0) {
|
|
goto rollback;
|
|
}
|
|
}
|
|
}
|
|
if (ret == 1) {
|
|
if (exec->state->nbTrans > exec->transno + 1) {
|
|
xmlFARegExecSave(exec);
|
|
}
|
|
if (trans->counter >= 0) {
|
|
#ifdef DEBUG_REGEXP_EXEC
|
|
printf("Increasing count %d\n", trans->counter);
|
|
#endif
|
|
exec->counts[trans->counter]++;
|
|
}
|
|
#ifdef DEBUG_REGEXP_EXEC
|
|
printf("entering state %d\n", trans->to);
|
|
#endif
|
|
exec->state = exec->comp->states[trans->to];
|
|
exec->transno = 0;
|
|
if (trans->atom != NULL) {
|
|
exec->index += len;
|
|
}
|
|
goto progress;
|
|
} else if (ret < 0) {
|
|
exec->status = -4;
|
|
break;
|
|
}
|
|
}
|
|
if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
|
|
rollback:
|
|
/*
|
|
* Failed to find a way out
|
|
*/
|
|
exec->determinist = 0;
|
|
xmlFARegExecRollBack(exec);
|
|
}
|
|
progress:
|
|
continue;
|
|
}
|
|
}
|
|
#endif
|
|
/************************************************************************
|
|
* *
|
|
* Parser for the Shemas Datatype Regular Expressions *
|
|
* http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs *
|
|
* *
|
|
************************************************************************/
|
|
|
|
/**
|
|
* xmlFAIsChar:
|
|
* @ctxt: a regexp parser context
|
|
*
|
|
* [10] Char ::= [^.\?*+()|#x5B#x5D]
|
|
*/
|
|
static int
|
|
xmlFAIsChar(xmlRegParserCtxtPtr ctxt) {
|
|
int cur;
|
|
int len;
|
|
|
|
cur = CUR_SCHAR(ctxt->cur, len);
|
|
if ((cur == '.') || (cur == '\\') || (cur == '?') ||
|
|
(cur == '*') || (cur == '+') || (cur == '(') ||
|
|
(cur == ')') || (cur == '|') || (cur == 0x5B) ||
|
|
(cur == 0x5D) || (cur == 0))
|
|
return(-1);
|
|
return(cur);
|
|
}
|
|
|
|
/**
|
|
* xmlFAParseCharProp:
|
|
* @ctxt: a regexp parser context
|
|
*
|
|
* [27] charProp ::= IsCategory | IsBlock
|
|
* [28] IsCategory ::= Letters | Marks | Numbers | Punctuation |
|
|
* Separators | Symbols | Others
|
|
* [29] Letters ::= 'L' [ultmo]?
|
|
* [30] Marks ::= 'M' [nce]?
|
|
* [31] Numbers ::= 'N' [dlo]?
|
|
* [32] Punctuation ::= 'P' [cdseifo]?
|
|
* [33] Separators ::= 'Z' [slp]?
|
|
* [34] Symbols ::= 'S' [mcko]?
|
|
* [35] Others ::= 'C' [cfon]?
|
|
* [36] IsBlock ::= 'Is' [a-zA-Z0-9#x2D]+
|
|
*/
|
|
static void
|
|
xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) {
|
|
int cur;
|
|
xmlRegAtomType type = 0;
|
|
xmlChar *blockName = NULL;
|
|
|
|
cur = CUR;
|
|
if (cur == 'L') {
|
|
NEXT;
|
|
cur = CUR;
|
|
if (cur == 'u') {
|
|
NEXT;
|
|
type = XML_REGEXP_LETTER_UPPERCASE;
|
|
} else if (cur == 'l') {
|
|
NEXT;
|
|
type = XML_REGEXP_LETTER_LOWERCASE;
|
|
} else if (cur == 't') {
|
|
NEXT;
|
|
type = XML_REGEXP_LETTER_TITLECASE;
|
|
} else if (cur == 'm') {
|
|
NEXT;
|
|
type = XML_REGEXP_LETTER_MODIFIER;
|
|
} else if (cur == 'o') {
|
|
NEXT;
|
|
type = XML_REGEXP_LETTER_OTHERS;
|
|
} else {
|
|
type = XML_REGEXP_LETTER;
|
|
}
|
|
} else if (cur == 'M') {
|
|
NEXT;
|
|
cur = CUR;
|
|
if (cur == 'n') {
|
|
NEXT;
|
|
/* nonspacing */
|
|
type = XML_REGEXP_MARK_NONSPACING;
|
|
} else if (cur == 'c') {
|
|
NEXT;
|
|
/* spacing combining */
|
|
type = XML_REGEXP_MARK_SPACECOMBINING;
|
|
} else if (cur == 'e') {
|
|
NEXT;
|
|
/* enclosing */
|
|
type = XML_REGEXP_MARK_ENCLOSING;
|
|
} else {
|
|
/* all marks */
|
|
type = XML_REGEXP_MARK;
|
|
}
|
|
} else if (cur == 'N') {
|
|
NEXT;
|
|
cur = CUR;
|
|
if (cur == 'd') {
|
|
NEXT;
|
|
/* digital */
|
|
type = XML_REGEXP_NUMBER_DECIMAL;
|
|
} else if (cur == 'l') {
|
|
NEXT;
|
|
/* letter */
|
|
type = XML_REGEXP_NUMBER_LETTER;
|
|
} else if (cur == 'o') {
|
|
NEXT;
|
|
/* other */
|
|
type = XML_REGEXP_NUMBER_OTHERS;
|
|
} else {
|
|
/* all numbers */
|
|
type = XML_REGEXP_NUMBER;
|
|
}
|
|
} else if (cur == 'P') {
|
|
NEXT;
|
|
cur = CUR;
|
|
if (cur == 'c') {
|
|
NEXT;
|
|
/* connector */
|
|
type = XML_REGEXP_PUNCT_CONNECTOR;
|
|
} else if (cur == 'd') {
|
|
NEXT;
|
|
/* dash */
|
|
type = XML_REGEXP_PUNCT_DASH;
|
|
} else if (cur == 's') {
|
|
NEXT;
|
|
/* open */
|
|
type = XML_REGEXP_PUNCT_OPEN;
|
|
} else if (cur == 'e') {
|
|
NEXT;
|
|
/* close */
|
|
type = XML_REGEXP_PUNCT_CLOSE;
|
|
} else if (cur == 'i') {
|
|
NEXT;
|
|
/* initial quote */
|
|
type = XML_REGEXP_PUNCT_INITQUOTE;
|
|
} else if (cur == 'f') {
|
|
NEXT;
|
|
/* final quote */
|
|
type = XML_REGEXP_PUNCT_FINQUOTE;
|
|
} else if (cur == 'o') {
|
|
NEXT;
|
|
/* other */
|
|
type = XML_REGEXP_PUNCT_OTHERS;
|
|
} else {
|
|
/* all punctuation */
|
|
type = XML_REGEXP_PUNCT;
|
|
}
|
|
} else if (cur == 'Z') {
|
|
NEXT;
|
|
cur = CUR;
|
|
if (cur == 's') {
|
|
NEXT;
|
|
/* space */
|
|
type = XML_REGEXP_SEPAR_SPACE;
|
|
} else if (cur == 'l') {
|
|
NEXT;
|
|
/* line */
|
|
type = XML_REGEXP_SEPAR_LINE;
|
|
} else if (cur == 'p') {
|
|
NEXT;
|
|
/* paragraph */
|
|
type = XML_REGEXP_SEPAR_PARA;
|
|
} else {
|
|
/* all separators */
|
|
type = XML_REGEXP_SEPAR;
|
|
}
|
|
} else if (cur == 'S') {
|
|
NEXT;
|
|
cur = CUR;
|
|
if (cur == 'm') {
|
|
NEXT;
|
|
type = XML_REGEXP_SYMBOL_MATH;
|
|
/* math */
|
|
} else if (cur == 'c') {
|
|
NEXT;
|
|
type = XML_REGEXP_SYMBOL_CURRENCY;
|
|
/* currency */
|
|
} else if (cur == 'k') {
|
|
NEXT;
|
|
type = XML_REGEXP_SYMBOL_MODIFIER;
|
|
/* modifiers */
|
|
} else if (cur == 'o') {
|
|
NEXT;
|
|
type = XML_REGEXP_SYMBOL_OTHERS;
|
|
/* other */
|
|
} else {
|
|
/* all symbols */
|
|
type = XML_REGEXP_SYMBOL;
|
|
}
|
|
} else if (cur == 'C') {
|
|
NEXT;
|
|
cur = CUR;
|
|
if (cur == 'c') {
|
|
NEXT;
|
|
/* control */
|
|
type = XML_REGEXP_OTHER_CONTROL;
|
|
} else if (cur == 'f') {
|
|
NEXT;
|
|
/* format */
|
|
type = XML_REGEXP_OTHER_FORMAT;
|
|
} else if (cur == 'o') {
|
|
NEXT;
|
|
/* private use */
|
|
type = XML_REGEXP_OTHER_PRIVATE;
|
|
} else if (cur == 'n') {
|
|
NEXT;
|
|
/* not assigned */
|
|
type = XML_REGEXP_OTHER_NA;
|
|
} else {
|
|
/* all others */
|
|
type = XML_REGEXP_OTHER;
|
|
}
|
|
} else if (cur == 'I') {
|
|
const xmlChar *start;
|
|
NEXT;
|
|
cur = CUR;
|
|
if (cur != 's') {
|
|
ERROR("IsXXXX expected");
|
|
return;
|
|
}
|
|
NEXT;
|
|
start = ctxt->cur;
|
|
cur = CUR;
|
|
if (((cur >= 'a') && (cur <= 'z')) ||
|
|
((cur >= 'A') && (cur <= 'Z')) ||
|
|
((cur >= '0') && (cur <= '9')) ||
|
|
(cur == 0x2D)) {
|
|
NEXT;
|
|
cur = CUR;
|
|
while (((cur >= 'a') && (cur <= 'z')) ||
|
|
((cur >= 'A') && (cur <= 'Z')) ||
|
|
((cur >= '0') && (cur <= '9')) ||
|
|
(cur == 0x2D)) {
|
|
NEXT;
|
|
cur = CUR;
|
|
}
|
|
}
|
|
type = XML_REGEXP_BLOCK_NAME;
|
|
blockName = xmlStrndup(start, ctxt->cur - start);
|
|
} else {
|
|
ERROR("Unknown char property");
|
|
return;
|
|
}
|
|
if (ctxt->atom == NULL) {
|
|
ctxt->atom = xmlRegNewAtom(ctxt, type);
|
|
if (ctxt->atom != NULL)
|
|
ctxt->atom->valuep = blockName;
|
|
} else if (ctxt->atom->type == XML_REGEXP_RANGES) {
|
|
xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
|
|
type, 0, 0, blockName);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* xmlFAParseCharClassEsc:
|
|
* @ctxt: a regexp parser context
|
|
*
|
|
* [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc )
|
|
* [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E]
|
|
* [25] catEsc ::= '\p{' charProp '}'
|
|
* [26] complEsc ::= '\P{' charProp '}'
|
|
* [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW])
|
|
*/
|
|
static void
|
|
xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) {
|
|
int cur;
|
|
|
|
if (CUR == '.') {
|
|
if (ctxt->atom == NULL) {
|
|
ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR);
|
|
} else if (ctxt->atom->type == XML_REGEXP_RANGES) {
|
|
xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
|
|
XML_REGEXP_ANYCHAR, 0, 0, NULL);
|
|
}
|
|
NEXT;
|
|
return;
|
|
}
|
|
if (CUR != '\\') {
|
|
ERROR("Escaped sequence: expecting \\");
|
|
return;
|
|
}
|
|
NEXT;
|
|
cur = CUR;
|
|
if (cur == 'p') {
|
|
NEXT;
|
|
if (CUR != '{') {
|
|
ERROR("Expecting '{'");
|
|
return;
|
|
}
|
|
NEXT;
|
|
xmlFAParseCharProp(ctxt);
|
|
if (CUR != '}') {
|
|
ERROR("Expecting '}'");
|
|
return;
|
|
}
|
|
NEXT;
|
|
} else if (cur == 'P') {
|
|
NEXT;
|
|
if (CUR != '{') {
|
|
ERROR("Expecting '{'");
|
|
return;
|
|
}
|
|
NEXT;
|
|
xmlFAParseCharProp(ctxt);
|
|
ctxt->atom->neg = 1;
|
|
if (CUR != '}') {
|
|
ERROR("Expecting '}'");
|
|
return;
|
|
}
|
|
NEXT;
|
|
} else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') ||
|
|
(cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') ||
|
|
(cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') ||
|
|
(cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) ||
|
|
(cur == 0x5E)) {
|
|
if (ctxt->atom == NULL) {
|
|
ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
|
|
if (ctxt->atom != NULL)
|
|
ctxt->atom->codepoint = cur;
|
|
} else if (ctxt->atom->type == XML_REGEXP_RANGES) {
|
|
xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
|
|
XML_REGEXP_CHARVAL, cur, cur, NULL);
|
|
}
|
|
NEXT;
|
|
} else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') ||
|
|
(cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') ||
|
|
(cur == 'w') || (cur == 'W')) {
|
|
xmlRegAtomType type = XML_REGEXP_ANYSPACE;
|
|
|
|
switch (cur) {
|
|
case 's':
|
|
type = XML_REGEXP_ANYSPACE;
|
|
break;
|
|
case 'S':
|
|
type = XML_REGEXP_NOTSPACE;
|
|
break;
|
|
case 'i':
|
|
type = XML_REGEXP_INITNAME;
|
|
break;
|
|
case 'I':
|
|
type = XML_REGEXP_NOTINITNAME;
|
|
break;
|
|
case 'c':
|
|
type = XML_REGEXP_NAMECHAR;
|
|
break;
|
|
case 'C':
|
|
type = XML_REGEXP_NOTNAMECHAR;
|
|
break;
|
|
case 'd':
|
|
type = XML_REGEXP_DECIMAL;
|
|
break;
|
|
case 'D':
|
|
type = XML_REGEXP_NOTDECIMAL;
|
|
break;
|
|
case 'w':
|
|
type = XML_REGEXP_REALCHAR;
|
|
break;
|
|
case 'W':
|
|
type = XML_REGEXP_NOTREALCHAR;
|
|
break;
|
|
}
|
|
NEXT;
|
|
if (ctxt->atom == NULL) {
|
|
ctxt->atom = xmlRegNewAtom(ctxt, type);
|
|
} else if (ctxt->atom->type == XML_REGEXP_RANGES) {
|
|
xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
|
|
type, 0, 0, NULL);
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* xmlFAParseCharRef:
|
|
* @ctxt: a regexp parser context
|
|
*
|
|
* [19] XmlCharRef ::= ( '&#' [0-9]+ ';' ) | (' &#x' [0-9a-fA-F]+ ';' )
|
|
*/
|
|
static int
|
|
xmlFAParseCharRef(xmlRegParserCtxtPtr ctxt) {
|
|
int ret = 0, cur;
|
|
|
|
if ((CUR != '&') || (NXT(1) != '#'))
|
|
return(-1);
|
|
NEXT;
|
|
NEXT;
|
|
cur = CUR;
|
|
if (cur == 'x') {
|
|
NEXT;
|
|
cur = CUR;
|
|
if (((cur >= '0') && (cur <= '9')) ||
|
|
((cur >= 'a') && (cur <= 'f')) ||
|
|
((cur >= 'A') && (cur <= 'F'))) {
|
|
while (((cur >= '0') && (cur <= '9')) ||
|
|
((cur >= 'A') && (cur <= 'F'))) {
|
|
if ((cur >= '0') && (cur <= '9'))
|
|
ret = ret * 16 + cur - '0';
|
|
else if ((cur >= 'a') && (cur <= 'f'))
|
|
ret = ret * 16 + 10 + (cur - 'a');
|
|
else
|
|
ret = ret * 16 + 10 + (cur - 'A');
|
|
NEXT;
|
|
cur = CUR;
|
|
}
|
|
} else {
|
|
ERROR("Char ref: expecting [0-9A-F]");
|
|
return(-1);
|
|
}
|
|
} else {
|
|
if ((cur >= '0') && (cur <= '9')) {
|
|
while ((cur >= '0') && (cur <= '9')) {
|
|
ret = ret * 10 + cur - '0';
|
|
NEXT;
|
|
cur = CUR;
|
|
}
|
|
} else {
|
|
ERROR("Char ref: expecting [0-9]");
|
|
return(-1);
|
|
}
|
|
}
|
|
if (cur != ';') {
|
|
ERROR("Char ref: expecting ';'");
|
|
return(-1);
|
|
} else {
|
|
NEXT;
|
|
}
|
|
return(ret);
|
|
}
|
|
|
|
/**
|
|
* xmlFAParseCharRange:
|
|
* @ctxt: a regexp parser context
|
|
*
|
|
* [17] charRange ::= seRange | XmlCharRef | XmlCharIncDash
|
|
* [18] seRange ::= charOrEsc '-' charOrEsc
|
|
* [20] charOrEsc ::= XmlChar | SingleCharEsc
|
|
* [21] XmlChar ::= [^\#x2D#x5B#x5D]
|
|
* [22] XmlCharIncDash ::= [^\#x5B#x5D]
|
|
*/
|
|
static void
|
|
xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) {
|
|
int cur;
|
|
int start = -1;
|
|
int end = -1;
|
|
|
|
if ((CUR == '&') && (NXT(1) == '#')) {
|
|
end = start = xmlFAParseCharRef(ctxt);
|
|
xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
|
|
XML_REGEXP_CHARVAL, start, end, NULL);
|
|
return;
|
|
}
|
|
cur = CUR;
|
|
if (cur == '\\') {
|
|
NEXT;
|
|
cur = CUR;
|
|
switch (cur) {
|
|
case 'n': start = 0xA; break;
|
|
case 'r': start = 0xD; break;
|
|
case 't': start = 0x9; break;
|
|
case '\\': case '|': case '.': case '-': case '^': case '?':
|
|
case '*': case '+': case '{': case '}': case '(': case ')':
|
|
case '[': case ']':
|
|
start = cur; break;
|
|
default:
|
|
ERROR("Invalid escape value");
|
|
return;
|
|
}
|
|
end = start;
|
|
} else if ((cur != 0x5B) && (cur != 0x5D)) {
|
|
end = start = cur;
|
|
} else {
|
|
ERROR("Expecting a char range");
|
|
return;
|
|
}
|
|
NEXT;
|
|
if (start == '-') {
|
|
return;
|
|
}
|
|
cur = CUR;
|
|
if (cur != '-') {
|
|
xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
|
|
XML_REGEXP_CHARVAL, start, end, NULL);
|
|
return;
|
|
}
|
|
NEXT;
|
|
cur = CUR;
|
|
if (cur == '\\') {
|
|
NEXT;
|
|
cur = CUR;
|
|
switch (cur) {
|
|
case 'n': end = 0xA; break;
|
|
case 'r': end = 0xD; break;
|
|
case 't': end = 0x9; break;
|
|
case '\\': case '|': case '.': case '-': case '^': case '?':
|
|
case '*': case '+': case '{': case '}': case '(': case ')':
|
|
case '[': case ']':
|
|
end = cur; break;
|
|
default:
|
|
ERROR("Invalid escape value");
|
|
return;
|
|
}
|
|
} else if ((cur != 0x5B) && (cur != 0x5D)) {
|
|
end = cur;
|
|
} else {
|
|
ERROR("Expecting the end of a char range");
|
|
return;
|
|
}
|
|
NEXT;
|
|
/* TODO check that the values are acceptable character ranges for XML */
|
|
if (end < start) {
|
|
ERROR("End of range is before start of range");
|
|
} else {
|
|
xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
|
|
XML_REGEXP_CHARVAL, start, end, NULL);
|
|
}
|
|
return;
|
|
}
|
|
|
|
/**
|
|
* xmlFAParsePosCharGroup:
|
|
* @ctxt: a regexp parser context
|
|
*
|
|
* [14] posCharGroup ::= ( charRange | charClassEsc )+
|
|
*/
|
|
static void
|
|
xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) {
|
|
do {
|
|
if ((CUR == '\\') || (CUR == '.')) {
|
|
xmlFAParseCharClassEsc(ctxt);
|
|
} else {
|
|
xmlFAParseCharRange(ctxt);
|
|
}
|
|
} while ((CUR != ']') && (CUR != '^') && (CUR != '-') &&
|
|
(ctxt->error == 0));
|
|
}
|
|
|
|
/**
|
|
* xmlFAParseCharGroup:
|
|
* @ctxt: a regexp parser context
|
|
*
|
|
* [13] charGroup ::= posCharGroup | negCharGroup | charClassSub
|
|
* [15] negCharGroup ::= '^' posCharGroup
|
|
* [16] charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr
|
|
* [12] charClassExpr ::= '[' charGroup ']'
|
|
*/
|
|
static void
|
|
xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) {
|
|
int n = ctxt->neg;
|
|
while ((CUR != ']') && (ctxt->error == 0)) {
|
|
if (CUR == '^') {
|
|
int neg = ctxt->neg;
|
|
|
|
NEXT;
|
|
ctxt->neg = !ctxt->neg;
|
|
xmlFAParsePosCharGroup(ctxt);
|
|
ctxt->neg = neg;
|
|
} else if (CUR == '-') {
|
|
NEXT;
|
|
ctxt->neg = !ctxt->neg;
|
|
if (CUR != '[') {
|
|
ERROR("charClassExpr: '[' expected");
|
|
break;
|
|
}
|
|
NEXT;
|
|
xmlFAParseCharGroup(ctxt);
|
|
if (CUR == ']') {
|
|
NEXT;
|
|
} else {
|
|
ERROR("charClassExpr: ']' expected");
|
|
break;
|
|
}
|
|
break;
|
|
} else if (CUR != ']') {
|
|
xmlFAParsePosCharGroup(ctxt);
|
|
}
|
|
}
|
|
ctxt->neg = n;
|
|
}
|
|
|
|
/**
|
|
* xmlFAParseCharClass:
|
|
* @ctxt: a regexp parser context
|
|
*
|
|
* [11] charClass ::= charClassEsc | charClassExpr
|
|
* [12] charClassExpr ::= '[' charGroup ']'
|
|
*/
|
|
static void
|
|
xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) {
|
|
if (CUR == '[') {
|
|
NEXT;
|
|
ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES);
|
|
if (ctxt->atom == NULL)
|
|
return;
|
|
xmlFAParseCharGroup(ctxt);
|
|
if (CUR == ']') {
|
|
NEXT;
|
|
} else {
|
|
ERROR("xmlFAParseCharClass: ']' expected");
|
|
}
|
|
} else {
|
|
xmlFAParseCharClassEsc(ctxt);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* xmlFAParseQuantExact:
|
|
* @ctxt: a regexp parser context
|
|
*
|
|
* [8] QuantExact ::= [0-9]+
|
|
*/
|
|
static int
|
|
xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) {
|
|
int ret = 0;
|
|
int ok = 0;
|
|
|
|
while ((CUR >= '0') && (CUR <= '9')) {
|
|
ret = ret * 10 + (CUR - '0');
|
|
ok = 1;
|
|
NEXT;
|
|
}
|
|
if (ok != 1) {
|
|
return(-1);
|
|
}
|
|
return(ret);
|
|
}
|
|
|
|
/**
|
|
* xmlFAParseQuantifier:
|
|
* @ctxt: a regexp parser context
|
|
*
|
|
* [4] quantifier ::= [?*+] | ( '{' quantity '}' )
|
|
* [5] quantity ::= quantRange | quantMin | QuantExact
|
|
* [6] quantRange ::= QuantExact ',' QuantExact
|
|
* [7] quantMin ::= QuantExact ','
|
|
* [8] QuantExact ::= [0-9]+
|
|
*/
|
|
static int
|
|
xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) {
|
|
int cur;
|
|
|
|
cur = CUR;
|
|
if ((cur == '?') || (cur == '*') || (cur == '+')) {
|
|
if (ctxt->atom != NULL) {
|
|
if (cur == '?')
|
|
ctxt->atom->quant = XML_REGEXP_QUANT_OPT;
|
|
else if (cur == '*')
|
|
ctxt->atom->quant = XML_REGEXP_QUANT_MULT;
|
|
else if (cur == '+')
|
|
ctxt->atom->quant = XML_REGEXP_QUANT_PLUS;
|
|
}
|
|
NEXT;
|
|
return(1);
|
|
}
|
|
if (cur == '{') {
|
|
int min = 0, max = 0;
|
|
|
|
NEXT;
|
|
cur = xmlFAParseQuantExact(ctxt);
|
|
if (cur >= 0)
|
|
min = cur;
|
|
if (CUR == ',') {
|
|
NEXT;
|
|
cur = xmlFAParseQuantExact(ctxt);
|
|
if (cur >= 0)
|
|
max = cur;
|
|
}
|
|
if (CUR == '}') {
|
|
NEXT;
|
|
} else {
|
|
ERROR("Unterminated quantifier");
|
|
}
|
|
if (max == 0)
|
|
max = min;
|
|
if (ctxt->atom != NULL) {
|
|
ctxt->atom->quant = XML_REGEXP_QUANT_RANGE;
|
|
ctxt->atom->min = min;
|
|
ctxt->atom->max = max;
|
|
}
|
|
return(1);
|
|
}
|
|
return(0);
|
|
}
|
|
|
|
/**
|
|
* xmlFAParseAtom:
|
|
* @ctxt: a regexp parser context
|
|
*
|
|
* [9] atom ::= Char | charClass | ( '(' regExp ')' )
|
|
*/
|
|
static int
|
|
xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) {
|
|
int codepoint, len;
|
|
|
|
codepoint = xmlFAIsChar(ctxt);
|
|
if (codepoint > 0) {
|
|
ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
|
|
if (ctxt->atom == NULL)
|
|
return(-1);
|
|
codepoint = CUR_SCHAR(ctxt->cur, len);
|
|
ctxt->atom->codepoint = codepoint;
|
|
NEXTL(len);
|
|
return(1);
|
|
} else if (CUR == '|') {
|
|
return(0);
|
|
} else if (CUR == 0) {
|
|
return(0);
|
|
} else if (CUR == ')') {
|
|
return(0);
|
|
} else if (CUR == '(') {
|
|
xmlRegStatePtr start, oldend;
|
|
|
|
NEXT;
|
|
xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
|
|
start = ctxt->state;
|
|
oldend = ctxt->end;
|
|
ctxt->end = NULL;
|
|
ctxt->atom = NULL;
|
|
xmlFAParseRegExp(ctxt, 0);
|
|
if (CUR == ')') {
|
|
NEXT;
|
|
} else {
|
|
ERROR("xmlFAParseAtom: expecting ')'");
|
|
}
|
|
ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG);
|
|
if (ctxt->atom == NULL)
|
|
return(-1);
|
|
ctxt->atom->start = start;
|
|
ctxt->atom->stop = ctxt->state;
|
|
ctxt->end = oldend;
|
|
return(1);
|
|
} else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) {
|
|
xmlFAParseCharClass(ctxt);
|
|
return(1);
|
|
}
|
|
return(0);
|
|
}
|
|
|
|
/**
|
|
* xmlFAParsePiece:
|
|
* @ctxt: a regexp parser context
|
|
*
|
|
* [3] piece ::= atom quantifier?
|
|
*/
|
|
static int
|
|
xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) {
|
|
int ret;
|
|
|
|
ctxt->atom = NULL;
|
|
ret = xmlFAParseAtom(ctxt);
|
|
if (ret == 0)
|
|
return(0);
|
|
if (ctxt->atom == NULL) {
|
|
ERROR("internal: no atom generated");
|
|
}
|
|
xmlFAParseQuantifier(ctxt);
|
|
return(1);
|
|
}
|
|
|
|
/**
|
|
* xmlFAParseBranch:
|
|
* @ctxt: a regexp parser context
|
|
* @first: is taht the first
|
|
*
|
|
* [2] branch ::= piece*
|
|
*/
|
|
static void
|
|
xmlFAParseBranch(xmlRegParserCtxtPtr ctxt, int first) {
|
|
xmlRegStatePtr previous;
|
|
xmlRegAtomPtr prevatom = NULL;
|
|
int ret;
|
|
|
|
previous = ctxt->state;
|
|
ret = xmlFAParsePiece(ctxt);
|
|
if (ret != 0) {
|
|
if (first) {
|
|
xmlFAGenerateTransitions(ctxt, previous, NULL, ctxt->atom);
|
|
previous = ctxt->state;
|
|
} else {
|
|
prevatom = ctxt->atom;
|
|
}
|
|
ctxt->atom = NULL;
|
|
}
|
|
while ((ret != 0) && (ctxt->error == 0)) {
|
|
ret = xmlFAParsePiece(ctxt);
|
|
if (ret != 0) {
|
|
if (first) {
|
|
xmlFAGenerateTransitions(ctxt, previous, NULL, ctxt->atom);
|
|
} else {
|
|
xmlFAGenerateTransitions(ctxt, previous, NULL, prevatom);
|
|
prevatom = ctxt->atom;
|
|
}
|
|
previous = ctxt->state;
|
|
ctxt->atom = NULL;
|
|
}
|
|
}
|
|
if (!first) {
|
|
xmlFAGenerateTransitions(ctxt, previous, ctxt->end, prevatom);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* xmlFAParseRegExp:
|
|
* @ctxt: a regexp parser context
|
|
* @top: is that the top-level expressions ?
|
|
*
|
|
* [1] regExp ::= branch ( '|' branch )*
|
|
*/
|
|
static void
|
|
xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) {
|
|
xmlRegStatePtr start, end, oldend;
|
|
|
|
oldend = ctxt->end;
|
|
|
|
start = ctxt->state;
|
|
xmlFAParseBranch(ctxt, (ctxt->end == NULL));
|
|
if (CUR != '|') {
|
|
ctxt->end = ctxt->state;
|
|
return;
|
|
}
|
|
end = ctxt->state;
|
|
while ((CUR == '|') && (ctxt->error == 0)) {
|
|
NEXT;
|
|
ctxt->state = start;
|
|
ctxt->end = end;
|
|
xmlFAParseBranch(ctxt, 0);
|
|
}
|
|
if (!top)
|
|
ctxt->end = oldend;
|
|
}
|
|
|
|
/************************************************************************
|
|
* *
|
|
* The basic API *
|
|
* *
|
|
************************************************************************/
|
|
|
|
/**
|
|
* xmlRegexpPrint:
|
|
* @output: the file for the output debug
|
|
* @regexp: the compiled regexp
|
|
*
|
|
* Print the content of the compiled regular expression
|
|
*/
|
|
void
|
|
xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) {
|
|
int i;
|
|
|
|
fprintf(output, " regexp: ");
|
|
if (regexp == NULL) {
|
|
fprintf(output, "NULL\n");
|
|
return;
|
|
}
|
|
fprintf(output, "'%s' ", regexp->string);
|
|
fprintf(output, "\n");
|
|
fprintf(output, "%d atoms:\n", regexp->nbAtoms);
|
|
for (i = 0;i < regexp->nbAtoms; i++) {
|
|
fprintf(output, " %02d ", i);
|
|
xmlRegPrintAtom(output, regexp->atoms[i]);
|
|
}
|
|
fprintf(output, "%d states:", regexp->nbStates);
|
|
fprintf(output, "\n");
|
|
for (i = 0;i < regexp->nbStates; i++) {
|
|
xmlRegPrintState(output, regexp->states[i]);
|
|
}
|
|
fprintf(output, "%d counters:\n", regexp->nbCounters);
|
|
for (i = 0;i < regexp->nbCounters; i++) {
|
|
fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min,
|
|
regexp->counters[i].max);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* xmlRegexpCompile:
|
|
* @regexp: a regular expression string
|
|
*
|
|
* Parses a regular expression conforming to XML Schemas Part 2 Datatype
|
|
* Appendix F and build an automata suitable for testing strings against
|
|
* that regular expression
|
|
*
|
|
* Returns the compiled expression or NULL in case of error
|
|
*/
|
|
xmlRegexpPtr
|
|
xmlRegexpCompile(const xmlChar *regexp) {
|
|
xmlRegexpPtr ret;
|
|
xmlRegParserCtxtPtr ctxt;
|
|
|
|
ctxt = xmlRegNewParserCtxt(regexp);
|
|
if (ctxt == NULL)
|
|
return(NULL);
|
|
|
|
/* initialize the parser */
|
|
ctxt->end = NULL;
|
|
ctxt->start = ctxt->state = xmlRegNewState(ctxt);
|
|
xmlRegStatePush(ctxt, ctxt->start);
|
|
|
|
/* parse the expression building an automata */
|
|
xmlFAParseRegExp(ctxt, 1);
|
|
if (CUR != 0) {
|
|
ERROR("xmlFAParseRegExp: extra characters");
|
|
}
|
|
ctxt->end = ctxt->state;
|
|
ctxt->start->type = XML_REGEXP_START_STATE;
|
|
ctxt->end->type = XML_REGEXP_FINAL_STATE;
|
|
|
|
/* remove the Epsilon except for counted transitions */
|
|
xmlFAEliminateEpsilonTransitions(ctxt);
|
|
|
|
|
|
if (ctxt->error != 0) {
|
|
xmlRegFreeParserCtxt(ctxt);
|
|
return(NULL);
|
|
}
|
|
ret = xmlRegEpxFromParse(ctxt);
|
|
xmlRegFreeParserCtxt(ctxt);
|
|
return(ret);
|
|
}
|
|
|
|
/**
|
|
* xmlRegexpExec:
|
|
* @comp: the compiled regular expression
|
|
* @content: the value to check against the regular expression
|
|
*
|
|
* Check if the regular expression generate the value
|
|
*
|
|
* Returns 1 if it matches, 0 if not and a negativa value in case of error
|
|
*/
|
|
int
|
|
xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) {
|
|
if ((comp == NULL) || (content == NULL))
|
|
return(-1);
|
|
return(xmlFARegExec(comp, content));
|
|
}
|
|
|
|
/**
|
|
* xmlRegexpIsDeterminist:
|
|
* @comp: the compiled regular expression
|
|
*
|
|
* Check if the regular expression is determinist
|
|
*
|
|
* Returns 1 if it yes, 0 if not and a negativa value in case of error
|
|
*/
|
|
int
|
|
xmlRegexpIsDeterminist(xmlRegexpPtr comp) {
|
|
xmlAutomataPtr am;
|
|
int ret;
|
|
|
|
if (comp == NULL)
|
|
return(-1);
|
|
if (comp->determinist != -1)
|
|
return(comp->determinist);
|
|
|
|
am = xmlNewAutomata();
|
|
if (am->states != NULL) {
|
|
int i;
|
|
|
|
for (i = 0;i < am->nbStates;i++)
|
|
xmlRegFreeState(am->states[i]);
|
|
xmlFree(am->states);
|
|
}
|
|
am->nbAtoms = comp->nbAtoms;
|
|
am->atoms = comp->atoms;
|
|
am->nbStates = comp->nbStates;
|
|
am->states = comp->states;
|
|
am->determinist = -1;
|
|
ret = xmlFAComputesDeterminism(am);
|
|
am->atoms = NULL;
|
|
am->states = NULL;
|
|
xmlFreeAutomata(am);
|
|
return(ret);
|
|
}
|
|
|
|
/**
|
|
* xmlRegFreeRegexp:
|
|
* @regexp: the regexp
|
|
*
|
|
* Free a regexp
|
|
*/
|
|
void
|
|
xmlRegFreeRegexp(xmlRegexpPtr regexp) {
|
|
int i;
|
|
if (regexp == NULL)
|
|
return;
|
|
|
|
if (regexp->string != NULL)
|
|
xmlFree(regexp->string);
|
|
if (regexp->states != NULL) {
|
|
for (i = 0;i < regexp->nbStates;i++)
|
|
xmlRegFreeState(regexp->states[i]);
|
|
xmlFree(regexp->states);
|
|
}
|
|
if (regexp->atoms != NULL) {
|
|
for (i = 0;i < regexp->nbAtoms;i++)
|
|
xmlRegFreeAtom(regexp->atoms[i]);
|
|
xmlFree(regexp->atoms);
|
|
}
|
|
if (regexp->counters != NULL)
|
|
xmlFree(regexp->counters);
|
|
if (regexp->compact != NULL)
|
|
xmlFree(regexp->compact);
|
|
if (regexp->transdata != NULL)
|
|
xmlFree(regexp->transdata);
|
|
if (regexp->stringMap != NULL) {
|
|
for (i = 0; i < regexp->nbstrings;i++)
|
|
xmlFree(regexp->stringMap[i]);
|
|
xmlFree(regexp->stringMap);
|
|
}
|
|
|
|
xmlFree(regexp);
|
|
}
|
|
|
|
#ifdef LIBXML_AUTOMATA_ENABLED
|
|
/************************************************************************
|
|
* *
|
|
* The Automata interface *
|
|
* *
|
|
************************************************************************/
|
|
|
|
/**
|
|
* xmlNewAutomata:
|
|
*
|
|
* Create a new automata
|
|
*
|
|
* Returns the new object or NULL in case of failure
|
|
*/
|
|
xmlAutomataPtr
|
|
xmlNewAutomata(void) {
|
|
xmlAutomataPtr ctxt;
|
|
|
|
ctxt = xmlRegNewParserCtxt(NULL);
|
|
if (ctxt == NULL)
|
|
return(NULL);
|
|
|
|
/* initialize the parser */
|
|
ctxt->end = NULL;
|
|
ctxt->start = ctxt->state = xmlRegNewState(ctxt);
|
|
xmlRegStatePush(ctxt, ctxt->start);
|
|
|
|
return(ctxt);
|
|
}
|
|
|
|
/**
|
|
* xmlFreeAutomata:
|
|
* @am: an automata
|
|
*
|
|
* Free an automata
|
|
*/
|
|
void
|
|
xmlFreeAutomata(xmlAutomataPtr am) {
|
|
if (am == NULL)
|
|
return;
|
|
xmlRegFreeParserCtxt(am);
|
|
}
|
|
|
|
/**
|
|
* xmlAutomataGetInitState:
|
|
* @am: an automata
|
|
*
|
|
* Initial state lookup
|
|
*
|
|
* Returns the initial state of the automata
|
|
*/
|
|
xmlAutomataStatePtr
|
|
xmlAutomataGetInitState(xmlAutomataPtr am) {
|
|
if (am == NULL)
|
|
return(NULL);
|
|
return(am->start);
|
|
}
|
|
|
|
/**
|
|
* xmlAutomataSetFinalState:
|
|
* @am: an automata
|
|
* @state: a state in this automata
|
|
*
|
|
* Makes that state a final state
|
|
*
|
|
* Returns 0 or -1 in case of error
|
|
*/
|
|
int
|
|
xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) {
|
|
if ((am == NULL) || (state == NULL))
|
|
return(-1);
|
|
state->type = XML_REGEXP_FINAL_STATE;
|
|
return(0);
|
|
}
|
|
|
|
/**
|
|
* xmlAutomataNewTransition:
|
|
* @am: an automata
|
|
* @from: the starting point of the transition
|
|
* @to: the target point of the transition or NULL
|
|
* @token: the input string associated to that transition
|
|
* @data: data passed to the callback function if the transition is activated
|
|
*
|
|
* If @to is NULL, this create first a new target state in the automata
|
|
* and then adds a transition from the @from state to the target state
|
|
* activated by the value of @token
|
|
*
|
|
* Returns the target state or NULL in case of error
|
|
*/
|
|
xmlAutomataStatePtr
|
|
xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from,
|
|
xmlAutomataStatePtr to, const xmlChar *token,
|
|
void *data) {
|
|
xmlRegAtomPtr atom;
|
|
|
|
if ((am == NULL) || (from == NULL) || (token == NULL))
|
|
return(NULL);
|
|
atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
|
|
atom->data = data;
|
|
if (atom == NULL)
|
|
return(NULL);
|
|
atom->valuep = xmlStrdup(token);
|
|
|
|
xmlFAGenerateTransitions(am, from, to, atom);
|
|
if (to == NULL)
|
|
return(am->state);
|
|
return(to);
|
|
}
|
|
|
|
/**
|
|
* xmlAutomataNewCountTrans:
|
|
* @am: an automata
|
|
* @from: the starting point of the transition
|
|
* @to: the target point of the transition or NULL
|
|
* @token: the input string associated to that transition
|
|
* @min: the minimum successive occurences of token
|
|
* @max: the maximum successive occurences of token
|
|
* @data: data associated to the transition
|
|
*
|
|
* If @to is NULL, this create first a new target state in the automata
|
|
* and then adds a transition from the @from state to the target state
|
|
* activated by a succession of input of value @token and whose number
|
|
* is between @min and @max
|
|
*
|
|
* Returns the target state or NULL in case of error
|
|
*/
|
|
xmlAutomataStatePtr
|
|
xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
|
|
xmlAutomataStatePtr to, const xmlChar *token,
|
|
int min, int max, void *data) {
|
|
xmlRegAtomPtr atom;
|
|
|
|
if ((am == NULL) || (from == NULL) || (token == NULL))
|
|
return(NULL);
|
|
if (min < 0)
|
|
return(NULL);
|
|
if ((max < min) || (max < 1))
|
|
return(NULL);
|
|
atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
|
|
if (atom == NULL)
|
|
return(NULL);
|
|
atom->valuep = xmlStrdup(token);
|
|
atom->data = data;
|
|
if (min == 0)
|
|
atom->min = 1;
|
|
else
|
|
atom->min = min;
|
|
atom->max = max;
|
|
|
|
xmlFAGenerateTransitions(am, from, to, atom);
|
|
if (to == NULL)
|
|
to = am->state;
|
|
if (to == NULL)
|
|
return(NULL);
|
|
if (min == 0)
|
|
xmlFAGenerateEpsilonTransition(am, from, to);
|
|
return(to);
|
|
}
|
|
|
|
/**
|
|
* xmlAutomataNewOnceTrans:
|
|
* @am: an automata
|
|
* @from: the starting point of the transition
|
|
* @to: the target point of the transition or NULL
|
|
* @token: the input string associated to that transition
|
|
* @min: the minimum successive occurences of token
|
|
* @max: the maximum successive occurences of token
|
|
* @data: data associated to the transition
|
|
*
|
|
* If @to is NULL, this create first a new target state in the automata
|
|
* and then adds a transition from the @from state to the target state
|
|
* activated by a succession of input of value @token and whose number
|
|
* is between @min and @max, moreover that transistion can only be crossed
|
|
* once.
|
|
*
|
|
* Returns the target state or NULL in case of error
|
|
*/
|
|
xmlAutomataStatePtr
|
|
xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
|
|
xmlAutomataStatePtr to, const xmlChar *token,
|
|
int min, int max, void *data) {
|
|
xmlRegAtomPtr atom;
|
|
int counter;
|
|
|
|
if ((am == NULL) || (from == NULL) || (token == NULL))
|
|
return(NULL);
|
|
if (min < 1)
|
|
return(NULL);
|
|
if ((max < min) || (max < 1))
|
|
return(NULL);
|
|
atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
|
|
if (atom == NULL)
|
|
return(NULL);
|
|
atom->valuep = xmlStrdup(token);
|
|
atom->data = data;
|
|
atom->quant = XML_REGEXP_QUANT_ONCEONLY;
|
|
if (min == 0)
|
|
atom->min = 1;
|
|
else
|
|
atom->min = min;
|
|
atom->max = max;
|
|
/*
|
|
* associate a counter to the transition.
|
|
*/
|
|
counter = xmlRegGetCounter(am);
|
|
am->counters[counter].min = 1;
|
|
am->counters[counter].max = 1;
|
|
|
|
/* xmlFAGenerateTransitions(am, from, to, atom); */
|
|
if (to == NULL) {
|
|
to = xmlRegNewState(am);
|
|
xmlRegStatePush(am, to);
|
|
}
|
|
xmlRegStateAddTrans(am, from, atom, to, counter, -1);
|
|
xmlRegAtomPush(am, atom);
|
|
am->state = to;
|
|
if (to == NULL)
|
|
to = am->state;
|
|
if (to == NULL)
|
|
return(NULL);
|
|
return(to);
|
|
}
|
|
|
|
/**
|
|
* xmlAutomataNewState:
|
|
* @am: an automata
|
|
*
|
|
* Create a new disconnected state in the automata
|
|
*
|
|
* Returns the new state or NULL in case of error
|
|
*/
|
|
xmlAutomataStatePtr
|
|
xmlAutomataNewState(xmlAutomataPtr am) {
|
|
xmlAutomataStatePtr to;
|
|
|
|
if (am == NULL)
|
|
return(NULL);
|
|
to = xmlRegNewState(am);
|
|
xmlRegStatePush(am, to);
|
|
return(to);
|
|
}
|
|
|
|
/**
|
|
* xmlAutomataNewEpsilon:
|
|
* @am: an automata
|
|
* @from: the starting point of the transition
|
|
* @to: the target point of the transition or NULL
|
|
*
|
|
* If @to is NULL, this create first a new target state in the automata
|
|
* and then adds a an epsilon transition from the @from state to the
|
|
* target state
|
|
*
|
|
* Returns the target state or NULL in case of error
|
|
*/
|
|
xmlAutomataStatePtr
|
|
xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from,
|
|
xmlAutomataStatePtr to) {
|
|
if ((am == NULL) || (from == NULL))
|
|
return(NULL);
|
|
xmlFAGenerateEpsilonTransition(am, from, to);
|
|
if (to == NULL)
|
|
return(am->state);
|
|
return(to);
|
|
}
|
|
|
|
/**
|
|
* xmlAutomataNewAllTrans:
|
|
* @am: an automata
|
|
* @from: the starting point of the transition
|
|
* @to: the target point of the transition or NULL
|
|
* @lax: allow to transition if not all all transitions have been activated
|
|
*
|
|
* If @to is NULL, this create first a new target state in the automata
|
|
* and then adds a an ALL transition from the @from state to the
|
|
* target state. That transition is an epsilon transition allowed only when
|
|
* all transitions from the @from node have been activated.
|
|
*
|
|
* Returns the target state or NULL in case of error
|
|
*/
|
|
xmlAutomataStatePtr
|
|
xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
|
|
xmlAutomataStatePtr to, int lax) {
|
|
if ((am == NULL) || (from == NULL))
|
|
return(NULL);
|
|
xmlFAGenerateAllTransition(am, from, to, lax);
|
|
if (to == NULL)
|
|
return(am->state);
|
|
return(to);
|
|
}
|
|
|
|
/**
|
|
* xmlAutomataNewCounter:
|
|
* @am: an automata
|
|
* @min: the minimal value on the counter
|
|
* @max: the maximal value on the counter
|
|
*
|
|
* Create a new counter
|
|
*
|
|
* Returns the counter number or -1 in case of error
|
|
*/
|
|
int
|
|
xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) {
|
|
int ret;
|
|
|
|
if (am == NULL)
|
|
return(-1);
|
|
|
|
ret = xmlRegGetCounter(am);
|
|
if (ret < 0)
|
|
return(-1);
|
|
am->counters[ret].min = min;
|
|
am->counters[ret].max = max;
|
|
return(ret);
|
|
}
|
|
|
|
/**
|
|
* xmlAutomataNewCountedTrans:
|
|
* @am: an automata
|
|
* @from: the starting point of the transition
|
|
* @to: the target point of the transition or NULL
|
|
* @counter: the counter associated to that transition
|
|
*
|
|
* If @to is NULL, this create first a new target state in the automata
|
|
* and then adds an epsilon transition from the @from state to the target state
|
|
* which will increment the counter provided
|
|
*
|
|
* Returns the target state or NULL in case of error
|
|
*/
|
|
xmlAutomataStatePtr
|
|
xmlAutomataNewCountedTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
|
|
xmlAutomataStatePtr to, int counter) {
|
|
if ((am == NULL) || (from == NULL) || (counter < 0))
|
|
return(NULL);
|
|
xmlFAGenerateCountedEpsilonTransition(am, from, to, counter);
|
|
if (to == NULL)
|
|
return(am->state);
|
|
return(to);
|
|
}
|
|
|
|
/**
|
|
* xmlAutomataNewCounterTrans:
|
|
* @am: an automata
|
|
* @from: the starting point of the transition
|
|
* @to: the target point of the transition or NULL
|
|
* @counter: the counter associated to that transition
|
|
*
|
|
* If @to is NULL, this create first a new target state in the automata
|
|
* and then adds an epsilon transition from the @from state to the target state
|
|
* which will be allowed only if the counter is within the right range.
|
|
*
|
|
* Returns the target state or NULL in case of error
|
|
*/
|
|
xmlAutomataStatePtr
|
|
xmlAutomataNewCounterTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
|
|
xmlAutomataStatePtr to, int counter) {
|
|
if ((am == NULL) || (from == NULL) || (counter < 0))
|
|
return(NULL);
|
|
xmlFAGenerateCountedTransition(am, from, to, counter);
|
|
if (to == NULL)
|
|
return(am->state);
|
|
return(to);
|
|
}
|
|
|
|
/**
|
|
* xmlAutomataCompile:
|
|
* @am: an automata
|
|
*
|
|
* Compile the automata into a Reg Exp ready for being executed.
|
|
* The automata should be free after this point.
|
|
*
|
|
* Returns the compiled regexp or NULL in case of error
|
|
*/
|
|
xmlRegexpPtr
|
|
xmlAutomataCompile(xmlAutomataPtr am) {
|
|
xmlRegexpPtr ret;
|
|
|
|
xmlFAEliminateEpsilonTransitions(am);
|
|
/* xmlFAComputesDeterminism(am); */
|
|
ret = xmlRegEpxFromParse(am);
|
|
|
|
return(ret);
|
|
}
|
|
|
|
/**
|
|
* xmlAutomataIsDeterminist:
|
|
* @am: an automata
|
|
*
|
|
* Checks if an automata is determinist.
|
|
*
|
|
* Returns 1 if true, 0 if not, and -1 in case of error
|
|
*/
|
|
int
|
|
xmlAutomataIsDeterminist(xmlAutomataPtr am) {
|
|
int ret;
|
|
|
|
if (am == NULL)
|
|
return(-1);
|
|
|
|
ret = xmlFAComputesDeterminism(am);
|
|
return(ret);
|
|
}
|
|
#endif /* LIBXML_AUTOMATA_ENABLED */
|
|
#endif /* LIBXML_REGEXP_ENABLED */
|