diff options
Diffstat (limited to 'bundle/libxml/xmlregexp.c')
| -rw-r--r-- | bundle/libxml/xmlregexp.c | 4319 | 
1 files changed, 4319 insertions, 0 deletions
diff --git a/bundle/libxml/xmlregexp.c b/bundle/libxml/xmlregexp.c new file mode 100644 index 0000000000..129c1269ec --- /dev/null +++ b/bundle/libxml/xmlregexp.c @@ -0,0 +1,4319 @@ +/* + * regexp.c: generic and extensible Regular Expression engine + * + * Basically designed with the purpose of compiling regexps for  + * the variety of validation/shemas mechanisms now available in + * XML related specifications thise includes: + *    - XML-1.0 DTD validation + *    - XML Schemas structure part 1 + *    - XML Schemas Datatypes part 2 especially Appendix F + *    - RELAX-NG/TREX i.e. the counter proposal + * + * See Copyright for the status of this software. + * + * Daniel Veillard <veillard@redhat.com> + */ + +#define IN_LIBXML +#include "libxml.h" + +#ifdef LIBXML_REGEXP_ENABLED + +#include <stdio.h> +#include <string.h> +#include <libxml/tree.h> +#include <libxml/parserInternals.h> +#include <libxml/xmlregexp.h> +#include <libxml/xmlautomata.h> +#include <libxml/xmlunicode.h> + +/* #define DEBUG_REGEXP_GRAPH  */ +/* #define DEBUG_REGEXP_EXEC */ +/* #define DEBUG_PUSH */ +/* #define DEBUG_COMPACTION */ + +#define ERROR(str) ctxt->error = 1;					\ +    xmlGenericError(xmlGenericErrorContext, "Regexp: %s: %s\n", str, ctxt->cur) +#define NEXT ctxt->cur++ +#define CUR (*(ctxt->cur)) +#define NXT(index) (ctxt->cur[index]) + +#define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l) +#define NEXTL(l) ctxt->cur += l; + +/** + * TODO: + * + * macro to flag unimplemented blocks + */ +#define TODO 								\ +    xmlGenericError(xmlGenericErrorContext,				\ +	    "Unimplemented block at %s:%d\n",				\ +            __FILE__, __LINE__); + + +/************************************************************************ + * 									* + * 			Datatypes and structures			* + * 									* + ************************************************************************/ + +typedef enum { +    XML_REGEXP_EPSILON = 1, +    XML_REGEXP_CHARVAL, +    XML_REGEXP_RANGES, +    XML_REGEXP_SUBREG, +    XML_REGEXP_STRING, +    XML_REGEXP_ANYCHAR, /* . */ +    XML_REGEXP_ANYSPACE, /* \s */ +    XML_REGEXP_NOTSPACE, /* \S */ +    XML_REGEXP_INITNAME, /* \l */ +    XML_REGEXP_NOTINITNAME, /* \l */ +    XML_REGEXP_NAMECHAR, /* \c */ +    XML_REGEXP_NOTNAMECHAR, /* \C */ +    XML_REGEXP_DECIMAL, /* \d */ +    XML_REGEXP_NOTDECIMAL, /* \d */ +    XML_REGEXP_REALCHAR, /* \w */ +    XML_REGEXP_NOTREALCHAR, /* \w */ +    XML_REGEXP_LETTER, +    XML_REGEXP_LETTER_UPPERCASE, +    XML_REGEXP_LETTER_LOWERCASE, +    XML_REGEXP_LETTER_TITLECASE, +    XML_REGEXP_LETTER_MODIFIER, +    XML_REGEXP_LETTER_OTHERS, +    XML_REGEXP_MARK, +    XML_REGEXP_MARK_NONSPACING, +    XML_REGEXP_MARK_SPACECOMBINING, +    XML_REGEXP_MARK_ENCLOSING, +    XML_REGEXP_NUMBER, +    XML_REGEXP_NUMBER_DECIMAL, +    XML_REGEXP_NUMBER_LETTER, +    XML_REGEXP_NUMBER_OTHERS, +    XML_REGEXP_PUNCT, +    XML_REGEXP_PUNCT_CONNECTOR, +    XML_REGEXP_PUNCT_DASH, +    XML_REGEXP_PUNCT_OPEN, +    XML_REGEXP_PUNCT_CLOSE, +    XML_REGEXP_PUNCT_INITQUOTE, +    XML_REGEXP_PUNCT_FINQUOTE, +    XML_REGEXP_PUNCT_OTHERS, +    XML_REGEXP_SEPAR, +    XML_REGEXP_SEPAR_SPACE, +    XML_REGEXP_SEPAR_LINE, +    XML_REGEXP_SEPAR_PARA, +    XML_REGEXP_SYMBOL, +    XML_REGEXP_SYMBOL_MATH, +    XML_REGEXP_SYMBOL_CURRENCY, +    XML_REGEXP_SYMBOL_MODIFIER, +    XML_REGEXP_SYMBOL_OTHERS, +    XML_REGEXP_OTHER, +    XML_REGEXP_OTHER_CONTROL, +    XML_REGEXP_OTHER_FORMAT, +    XML_REGEXP_OTHER_PRIVATE, +    XML_REGEXP_OTHER_NA, +    XML_REGEXP_BLOCK_NAME +} xmlRegAtomType; + +typedef enum { +    XML_REGEXP_QUANT_EPSILON = 1, +    XML_REGEXP_QUANT_ONCE, +    XML_REGEXP_QUANT_OPT, +    XML_REGEXP_QUANT_MULT, +    XML_REGEXP_QUANT_PLUS, +    XML_REGEXP_QUANT_ONCEONLY, +    XML_REGEXP_QUANT_ALL, +    XML_REGEXP_QUANT_RANGE +} xmlRegQuantType; + +typedef enum { +    XML_REGEXP_START_STATE = 1, +    XML_REGEXP_FINAL_STATE, +    XML_REGEXP_TRANS_STATE +} xmlRegStateType; + +typedef enum { +    XML_REGEXP_MARK_NORMAL = 0, +    XML_REGEXP_MARK_START, +    XML_REGEXP_MARK_VISITED +} xmlRegMarkedType; + +typedef struct _xmlRegRange xmlRegRange; +typedef xmlRegRange *xmlRegRangePtr; + +struct _xmlRegRange { +    int neg; +    xmlRegAtomType type; +    int start; +    int end; +    xmlChar *blockName; +}; + +typedef struct _xmlRegAtom xmlRegAtom; +typedef xmlRegAtom *xmlRegAtomPtr; + +typedef struct _xmlAutomataState xmlRegState; +typedef xmlRegState *xmlRegStatePtr; + +struct _xmlRegAtom { +    int no; +    xmlRegAtomType type; +    xmlRegQuantType quant; +    int min; +    int max; + +    void *valuep; +    void *valuep2; +    int neg; +    int codepoint; +    xmlRegStatePtr start; +    xmlRegStatePtr stop; +    int maxRanges; +    int nbRanges; +    xmlRegRangePtr *ranges; +    void *data; +}; + +typedef struct _xmlRegCounter xmlRegCounter; +typedef xmlRegCounter *xmlRegCounterPtr; + +struct _xmlRegCounter { +    int min; +    int max; +}; + +typedef struct _xmlRegTrans xmlRegTrans; +typedef xmlRegTrans *xmlRegTransPtr; + +struct _xmlRegTrans { +    xmlRegAtomPtr atom; +    int to; +    int counter; +    int count; +}; + +struct _xmlAutomataState { +    xmlRegStateType type; +    xmlRegMarkedType mark; +    xmlRegMarkedType reached; +    int no; + +    int maxTrans; +    int nbTrans; +    xmlRegTrans *trans; +}; + +typedef struct _xmlAutomata xmlRegParserCtxt; +typedef xmlRegParserCtxt *xmlRegParserCtxtPtr; + +struct _xmlAutomata { +    xmlChar *string; +    xmlChar *cur; + +    int error; +    int neg; + +    xmlRegStatePtr start; +    xmlRegStatePtr end; +    xmlRegStatePtr state; + +    xmlRegAtomPtr atom; + +    int maxAtoms; +    int nbAtoms; +    xmlRegAtomPtr *atoms; + +    int maxStates; +    int nbStates; +    xmlRegStatePtr *states; + +    int maxCounters; +    int nbCounters; +    xmlRegCounter *counters; + +    int determinist; +}; + +struct _xmlRegexp { +    xmlChar *string; +    int nbStates; +    xmlRegStatePtr *states; +    int nbAtoms; +    xmlRegAtomPtr *atoms; +    int nbCounters; +    xmlRegCounter *counters; +    int determinist; +    /* +     * That's the compact form for determinists automatas +     */ +    int nbstates; +    int *compact; +    void **transdata; +    int nbstrings; +    xmlChar **stringMap; +}; + +typedef struct _xmlRegExecRollback xmlRegExecRollback; +typedef xmlRegExecRollback *xmlRegExecRollbackPtr; + +struct _xmlRegExecRollback { +    xmlRegStatePtr state;/* the current state */ +    int index;		/* the index in the input stack */ +    int nextbranch;	/* the next transition to explore in that state */ +    int *counts;	/* save the automate state if it has some */ +}; + +typedef struct _xmlRegInputToken xmlRegInputToken; +typedef xmlRegInputToken *xmlRegInputTokenPtr; + +struct _xmlRegInputToken { +    xmlChar *value; +    void *data; +}; + +struct _xmlRegExecCtxt { +    int status;		/* execution status != 0 indicate an error */ +    int determinist;	/* did we found an inderterministic behaviour */ +    xmlRegexpPtr comp;	/* the compiled regexp */ +    xmlRegExecCallbacks callback; +    void *data; + +    xmlRegStatePtr state;/* the current state */ +    int transno;	/* the current transition on that state */ +    int transcount;	/* the number of char in char counted transitions */ + +    /* +     * A stack of rollback states +     */ +    int maxRollbacks; +    int nbRollbacks; +    xmlRegExecRollback *rollbacks; + +    /* +     * The state of the automata if any +     */ +    int *counts; + +    /* +     * The input stack +     */ +    int inputStackMax; +    int inputStackNr; +    int index; +    int *charStack; +    const xmlChar *inputString; /* when operating on characters */ +    xmlRegInputTokenPtr inputStack;/* when operating on strings */ + +}; + +#define REGEXP_ALL_COUNTER	0x123456 +#define REGEXP_ALL_LAX_COUNTER	0x123457 + +static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top); +static void xmlRegFreeState(xmlRegStatePtr state); +static void xmlRegFreeAtom(xmlRegAtomPtr atom); + +/************************************************************************ + * 									* + * 			Allocation/Deallocation				* + * 									* + ************************************************************************/ + +static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt); +/** + * xmlRegEpxFromParse: + * @ctxt:  the parser context used to build it + * + * Allocate a new regexp and fill it with the reult from the parser + * + * Returns the new regexp or NULL in case of error + */ +static xmlRegexpPtr +xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) { +    xmlRegexpPtr ret; + +    ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp)); +    if (ret == NULL) +	return(NULL); +    memset(ret, 0, sizeof(xmlRegexp)); +    ret->string = ctxt->string; +    ctxt->string = NULL; +    ret->nbStates = ctxt->nbStates; +    ctxt->nbStates = 0; +    ret->states = ctxt->states; +    ctxt->states = NULL; +    ret->nbAtoms = ctxt->nbAtoms; +    ctxt->nbAtoms = 0; +    ret->atoms = ctxt->atoms; +    ctxt->atoms = NULL; +    ret->nbCounters = ctxt->nbCounters; +    ctxt->nbCounters = 0; +    ret->counters = ctxt->counters; +    ctxt->counters = NULL; +    ret->determinist = ctxt->determinist; + +    if ((ret->determinist != 0) && +	(ret->nbCounters == 0) && +	(ret->atoms != NULL) && +	(ret->atoms[0] != NULL) && +	(ret->atoms[0]->type == XML_REGEXP_STRING)) { +	int i, j, nbstates = 0, nbatoms = 0; +	int *stateRemap; +	int *stringRemap; +	int *transitions; +	void **transdata; +	xmlChar **stringMap; +        xmlChar *value; + +	/* +	 * Switch to a compact representation +	 * 1/ counting the effective number of states left +	 * 2/ conting the unique number of atoms, and check that +	 *    they are all of the string type +	 * 3/ build a table state x atom for the transitions +	 */ + +	stateRemap = xmlMalloc(ret->nbStates * sizeof(int)); +	for (i = 0;i < ret->nbStates;i++) { +	    if (ret->states[i] != NULL) { +		stateRemap[i] = nbstates; +		nbstates++; +	    } else { +		stateRemap[i] = -1; +	    } +	} +#ifdef DEBUG_COMPACTION +	printf("Final: %d states\n", nbstates); +#endif +	stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *)); +	stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int)); +	for (i = 0;i < ret->nbAtoms;i++) { +	    if ((ret->atoms[i]->type == XML_REGEXP_STRING) && +		(ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) { +		value = ret->atoms[i]->valuep; +                for (j = 0;j < nbatoms;j++) { +		    if (xmlStrEqual(stringMap[j], value)) { +			stringRemap[i] = j; +			break; +		    } +		} +		if (j >= nbatoms) { +		    stringRemap[i] = nbatoms; +		    stringMap[nbatoms] = xmlStrdup(value); +		    nbatoms++; +		} +	    } else { +		xmlFree(stateRemap); +		xmlFree(stringRemap); +		for (i = 0;i < nbatoms;i++) +		    xmlFree(stringMap[i]); +		xmlFree(stringMap); +		goto fail_compact; +	    } +	} +#ifdef DEBUG_COMPACTION +	printf("Final: %d atoms\n", nbatoms); +#endif + +	/* +	 * Allocate the transition table. The first entry for each +	 * state correspond to the state type. +	 */ +	transitions = (int *) xmlMalloc(nbstates * (nbatoms + 1) * sizeof(int)); +	transdata = NULL; +	memset(transitions, 0, nbstates * (nbatoms + 1) * sizeof(int)); + +	for (i = 0;i < ret->nbStates;i++) { +	    int stateno, atomno, targetno, prev; +	    xmlRegStatePtr state; +	    xmlRegTransPtr trans; + +	    stateno = stateRemap[i]; +	    if (stateno == -1) +		continue; +	    state = ret->states[i]; + +	    transitions[stateno * (nbatoms + 1)] = state->type; + +	    for (j = 0;j < state->nbTrans;j++) { +		trans = &(state->trans[j]); +		if ((trans->to == -1) || (trans->atom == NULL)) +		    continue; +                atomno = stringRemap[trans->atom->no]; +		if ((trans->atom->data != NULL) && (transdata == NULL)) { +		    transdata = (void **) xmlMalloc(nbstates * nbatoms * +			                            sizeof(void *)); +		    if (transdata != NULL) +			memset(transdata, 0, +			       nbstates * nbatoms * sizeof(void *)); +		} +		targetno = stateRemap[trans->to]; +		/* +		 * if the same atome can generate transition to 2 different +		 * states then it means the automata is not determinist and +		 * the compact form can't be used ! +		 */ +		prev = transitions[stateno * (nbatoms + 1) + atomno + 1]; +		if (prev != 0) { +		    if (prev != targetno + 1) { +			printf("not determinist\n"); +			ret->determinist = 0; +#ifdef DEBUG_COMPACTION +			printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n", +			       i, j, trans->atom->no, trans->to, atomno, targetno); +			printf("       previous to is %d\n", prev); +#endif +			ret->determinist = 0; +			if (transdata != NULL) +			    xmlFree(transdata); +			xmlFree(transitions); +			xmlFree(stateRemap); +			xmlFree(stringRemap); +			for (i = 0;i < nbatoms;i++) +			    xmlFree(stringMap[i]); +			xmlFree(stringMap); +			goto fail_compact; +		    } +		} else { +#if 0 +		    printf("State %d trans %d: atom %d to %d : %d to %d\n", +			   i, j, trans->atom->no, trans->to, atomno, targetno); +#endif +		    transitions[stateno * (nbatoms + 1) + atomno + 1] = +			targetno + 1; /* to avoid 0 */ +		    if (transdata != NULL) +			transdata[stateno * nbatoms + atomno] = +			    trans->atom->data; +		} +	    } +	} +	ret->determinist = 1; +#ifdef DEBUG_COMPACTION +	/* +	 * Debug +	 */ +	for (i = 0;i < nbstates;i++) { +	    for (j = 0;j < nbatoms + 1;j++) { +                printf("%02d ", transitions[i * (nbatoms + 1) + j]); +	    } +	    printf("\n"); +	} +	printf("\n"); +#endif +	/* +	 * Cleanup of the old data +	 */ +	if (ret->states != NULL) { +	    for (i = 0;i < ret->nbStates;i++) +		xmlRegFreeState(ret->states[i]); +	    xmlFree(ret->states); +	} +	ret->states = NULL; +	ret->nbStates = 0; +	if (ret->atoms != NULL) { +	    for (i = 0;i < ret->nbAtoms;i++) +		xmlRegFreeAtom(ret->atoms[i]); +	    xmlFree(ret->atoms); +	} +	ret->atoms = NULL; +	ret->nbAtoms = 0; + +	ret->compact = transitions; +	ret->transdata = transdata; +	ret->stringMap = stringMap; +	ret->nbstrings = nbatoms; +	ret->nbstates = nbstates; +	xmlFree(stateRemap); +	xmlFree(stringRemap); +    } +fail_compact: +    return(ret); +} + +/** + * xmlRegNewParserCtxt: + * @string:  the string to parse + * + * Allocate a new regexp parser context + * + * Returns the new context or NULL in case of error + */ +static xmlRegParserCtxtPtr +xmlRegNewParserCtxt(const xmlChar *string) { +    xmlRegParserCtxtPtr ret; + +    ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt)); +    if (ret == NULL) +	return(NULL); +    memset(ret, 0, sizeof(xmlRegParserCtxt)); +    if (string != NULL) +	ret->string = xmlStrdup(string); +    ret->cur = ret->string; +    ret->neg = 0; +    ret->error = 0; +    ret->determinist = -1; +    return(ret); +} + +/** + * xmlRegNewRange: + * @ctxt:  the regexp parser context + * @neg:  is that negative + * @type:  the type of range + * @start:  the start codepoint + * @end:  the end codepoint + * + * Allocate a new regexp range + * + * Returns the new range or NULL in case of error + */ +static xmlRegRangePtr +xmlRegNewRange(xmlRegParserCtxtPtr ctxt, +	       int neg, xmlRegAtomType type, int start, int end) { +    xmlRegRangePtr ret; + +    ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange)); +    if (ret == NULL) { +	ERROR("failed to allocate regexp range"); +	return(NULL); +    } +    ret->neg = neg; +    ret->type = type; +    ret->start = start; +    ret->end = end; +    return(ret); +} + +/** + * xmlRegFreeRange: + * @range:  the regexp range + * + * Free a regexp range + */ +static void +xmlRegFreeRange(xmlRegRangePtr range) { +    if (range == NULL) +	return; + +    if (range->blockName != NULL) +	xmlFree(range->blockName); +    xmlFree(range); +} + +/** + * xmlRegNewAtom: + * @ctxt:  the regexp parser context + * @type:  the type of atom + * + * Allocate a new regexp range + * + * Returns the new atom or NULL in case of error + */ +static xmlRegAtomPtr +xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) { +    xmlRegAtomPtr ret; + +    ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom)); +    if (ret == NULL) { +	ERROR("failed to allocate regexp atom"); +	return(NULL); +    } +    memset(ret, 0, sizeof(xmlRegAtom)); +    ret->type = type; +    ret->quant = XML_REGEXP_QUANT_ONCE; +    ret->min = 0; +    ret->max = 0; +    return(ret); +} + +/** + * xmlRegFreeAtom: + * @atom:  the regexp atom + * + * Free a regexp atom + */ +static void +xmlRegFreeAtom(xmlRegAtomPtr atom) { +    int i; + +    if (atom == NULL) +	return; + +    for (i = 0;i < atom->nbRanges;i++) +	xmlRegFreeRange(atom->ranges[i]); +    if (atom->ranges != NULL) +	xmlFree(atom->ranges); +    if (atom->type == XML_REGEXP_STRING) +	xmlFree(atom->valuep); +    xmlFree(atom); +} + +static xmlRegStatePtr +xmlRegNewState(xmlRegParserCtxtPtr ctxt) { +    xmlRegStatePtr ret; + +    ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState)); +    if (ret == NULL) { +	ERROR("failed to allocate regexp state"); +	return(NULL); +    } +    memset(ret, 0, sizeof(xmlRegState)); +    ret->type = XML_REGEXP_TRANS_STATE; +    ret->mark = XML_REGEXP_MARK_NORMAL; +    return(ret); +} + +/** + * xmlRegFreeState: + * @state:  the regexp state + * + * Free a regexp state + */ +static void +xmlRegFreeState(xmlRegStatePtr state) { +    if (state == NULL) +	return; + +    if (state->trans != NULL) +	xmlFree(state->trans); +    xmlFree(state); +} + +/** + * xmlRegFreeParserCtxt: + * @ctxt:  the regexp parser context + * + * Free a regexp parser context + */ +static void +xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) { +    int i; +    if (ctxt == NULL) +	return; + +    if (ctxt->string != NULL) +	xmlFree(ctxt->string); +    if (ctxt->states != NULL) { +	for (i = 0;i < ctxt->nbStates;i++) +	    xmlRegFreeState(ctxt->states[i]); +	xmlFree(ctxt->states); +    } +    if (ctxt->atoms != NULL) { +	for (i = 0;i < ctxt->nbAtoms;i++) +	    xmlRegFreeAtom(ctxt->atoms[i]); +	xmlFree(ctxt->atoms); +    } +    if (ctxt->counters != NULL) +	xmlFree(ctxt->counters); +    xmlFree(ctxt); +} + +/************************************************************************ + * 									* + * 			Display of Data structures			* + * 									* + ************************************************************************/ + +static void +xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) { +    switch (type) { +        case XML_REGEXP_EPSILON: +	    fprintf(output, "epsilon "); break; +        case XML_REGEXP_CHARVAL: +	    fprintf(output, "charval "); break; +        case XML_REGEXP_RANGES: +	    fprintf(output, "ranges "); break; +        case XML_REGEXP_SUBREG: +	    fprintf(output, "subexpr "); break; +        case XML_REGEXP_STRING: +	    fprintf(output, "string "); break; +        case XML_REGEXP_ANYCHAR: +	    fprintf(output, "anychar "); break; +        case XML_REGEXP_ANYSPACE: +	    fprintf(output, "anyspace "); break; +        case XML_REGEXP_NOTSPACE: +	    fprintf(output, "notspace "); break; +        case XML_REGEXP_INITNAME: +	    fprintf(output, "initname "); break; +        case XML_REGEXP_NOTINITNAME: +	    fprintf(output, "notinitname "); break; +        case XML_REGEXP_NAMECHAR: +	    fprintf(output, "namechar "); break; +        case XML_REGEXP_NOTNAMECHAR: +	    fprintf(output, "notnamechar "); break; +        case XML_REGEXP_DECIMAL: +	    fprintf(output, "decimal "); break; +        case XML_REGEXP_NOTDECIMAL: +	    fprintf(output, "notdecimal "); break; +        case XML_REGEXP_REALCHAR: +	    fprintf(output, "realchar "); break; +        case XML_REGEXP_NOTREALCHAR: +	    fprintf(output, "notrealchar "); break; +        case XML_REGEXP_LETTER: +            fprintf(output, "LETTER "); break; +        case XML_REGEXP_LETTER_UPPERCASE: +            fprintf(output, "LETTER_UPPERCASE "); break; +        case XML_REGEXP_LETTER_LOWERCASE: +            fprintf(output, "LETTER_LOWERCASE "); break; +        case XML_REGEXP_LETTER_TITLECASE: +            fprintf(output, "LETTER_TITLECASE "); break; +        case XML_REGEXP_LETTER_MODIFIER: +            fprintf(output, "LETTER_MODIFIER "); break; +        case XML_REGEXP_LETTER_OTHERS: +            fprintf(output, "LETTER_OTHERS "); break; +        case XML_REGEXP_MARK: +            fprintf(output, "MARK "); break; +        case XML_REGEXP_MARK_NONSPACING: +            fprintf(output, "MARK_NONSPACING "); break; +        case XML_REGEXP_MARK_SPACECOMBINING: +            fprintf(output, "MARK_SPACECOMBINING "); break; +        case XML_REGEXP_MARK_ENCLOSING: +            fprintf(output, "MARK_ENCLOSING "); break; +        case XML_REGEXP_NUMBER: +            fprintf(output, "NUMBER "); break; +        case XML_REGEXP_NUMBER_DECIMAL: +            fprintf(output, "NUMBER_DECIMAL "); break; +        case XML_REGEXP_NUMBER_LETTER: +            fprintf(output, "NUMBER_LETTER "); break; +        case XML_REGEXP_NUMBER_OTHERS: +            fprintf(output, "NUMBER_OTHERS "); break; +        case XML_REGEXP_PUNCT: +            fprintf(output, "PUNCT "); break; +        case XML_REGEXP_PUNCT_CONNECTOR: +            fprintf(output, "PUNCT_CONNECTOR "); break; +        case XML_REGEXP_PUNCT_DASH: +            fprintf(output, "PUNCT_DASH "); break; +        case XML_REGEXP_PUNCT_OPEN: +            fprintf(output, "PUNCT_OPEN "); break; +        case XML_REGEXP_PUNCT_CLOSE: +            fprintf(output, "PUNCT_CLOSE "); break; +        case XML_REGEXP_PUNCT_INITQUOTE: +            fprintf(output, "PUNCT_INITQUOTE "); break; +        case XML_REGEXP_PUNCT_FINQUOTE: +            fprintf(output, "PUNCT_FINQUOTE "); break; +        case XML_REGEXP_PUNCT_OTHERS: +            fprintf(output, "PUNCT_OTHERS "); break; +        case XML_REGEXP_SEPAR: +            fprintf(output, "SEPAR "); break; +        case XML_REGEXP_SEPAR_SPACE: +            fprintf(output, "SEPAR_SPACE "); break; +        case XML_REGEXP_SEPAR_LINE: +            fprintf(output, "SEPAR_LINE "); break; +        case XML_REGEXP_SEPAR_PARA: +            fprintf(output, "SEPAR_PARA "); break; +        case XML_REGEXP_SYMBOL: +            fprintf(output, "SYMBOL "); break; +        case XML_REGEXP_SYMBOL_MATH: +            fprintf(output, "SYMBOL_MATH "); break; +        case XML_REGEXP_SYMBOL_CURRENCY: +            fprintf(output, "SYMBOL_CURRENCY "); break; +        case XML_REGEXP_SYMBOL_MODIFIER: +            fprintf(output, "SYMBOL_MODIFIER "); break; +        case XML_REGEXP_SYMBOL_OTHERS: +            fprintf(output, "SYMBOL_OTHERS "); break; +        case XML_REGEXP_OTHER: +            fprintf(output, "OTHER "); break; +        case XML_REGEXP_OTHER_CONTROL: +            fprintf(output, "OTHER_CONTROL "); break; +        case XML_REGEXP_OTHER_FORMAT: +            fprintf(output, "OTHER_FORMAT "); break; +        case XML_REGEXP_OTHER_PRIVATE: +            fprintf(output, "OTHER_PRIVATE "); break; +        case XML_REGEXP_OTHER_NA: +            fprintf(output, "OTHER_NA "); break; +        case XML_REGEXP_BLOCK_NAME: +	    fprintf(output, "BLOCK "); break; +    } +} + +static void +xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) { +    switch (type) { +        case XML_REGEXP_QUANT_EPSILON: +	    fprintf(output, "epsilon "); break; +        case XML_REGEXP_QUANT_ONCE: +	    fprintf(output, "once "); break; +        case XML_REGEXP_QUANT_OPT: +	    fprintf(output, "? "); break; +        case XML_REGEXP_QUANT_MULT: +	    fprintf(output, "* "); break; +        case XML_REGEXP_QUANT_PLUS: +	    fprintf(output, "+ "); break; +	case XML_REGEXP_QUANT_RANGE: +	    fprintf(output, "range "); break; +	case XML_REGEXP_QUANT_ONCEONLY: +	    fprintf(output, "onceonly "); break; +	case XML_REGEXP_QUANT_ALL: +	    fprintf(output, "all "); break; +    } +} +static void +xmlRegPrintRange(FILE *output, xmlRegRangePtr range) { +    fprintf(output, "  range: "); +    if (range->neg) +	fprintf(output, "negative "); +    xmlRegPrintAtomType(output, range->type); +    fprintf(output, "%c - %c\n", range->start, range->end); +} + +static void +xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) { +    fprintf(output, " atom: "); +    if (atom == NULL) { +	fprintf(output, "NULL\n"); +	return; +    } +    xmlRegPrintAtomType(output, atom->type); +    xmlRegPrintQuantType(output, atom->quant); +    if (atom->quant == XML_REGEXP_QUANT_RANGE) +	fprintf(output, "%d-%d ", atom->min, atom->max); +    if (atom->type == XML_REGEXP_STRING) +	fprintf(output, "'%s' ", (char *) atom->valuep); +    if (atom->type == XML_REGEXP_CHARVAL) +	fprintf(output, "char %c\n", atom->codepoint); +    else if (atom->type == XML_REGEXP_RANGES) { +	int i; +	fprintf(output, "%d entries\n", atom->nbRanges); +	for (i = 0; i < atom->nbRanges;i++) +	    xmlRegPrintRange(output, atom->ranges[i]); +    } else if (atom->type == XML_REGEXP_SUBREG) { +	fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no); +    } else { +	fprintf(output, "\n"); +    } +} + +static void +xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) { +    fprintf(output, "  trans: "); +    if (trans == NULL) { +	fprintf(output, "NULL\n"); +	return; +    } +    if (trans->to < 0) { +	fprintf(output, "removed\n"); +	return; +    } +    if (trans->counter >= 0) { +	fprintf(output, "counted %d, ", trans->counter); +    } +    if (trans->count == REGEXP_ALL_COUNTER) { +	fprintf(output, "all transition, "); +    } else if (trans->count >= 0) { +	fprintf(output, "count based %d, ", trans->count); +    } +    if (trans->atom == NULL) { +	fprintf(output, "epsilon to %d\n", trans->to); +	return; +    } +    if (trans->atom->type == XML_REGEXP_CHARVAL) +	fprintf(output, "char %c ", trans->atom->codepoint); +    fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to); +} +     +static void +xmlRegPrintState(FILE *output, xmlRegStatePtr state) { +    int i; + +    fprintf(output, " state: "); +    if (state == NULL) { +	fprintf(output, "NULL\n"); +	return; +    } +    if (state->type == XML_REGEXP_START_STATE) +	fprintf(output, "START "); +    if (state->type == XML_REGEXP_FINAL_STATE) +	fprintf(output, "FINAL "); +     +    fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans); +    for (i = 0;i < state->nbTrans; i++) { +	xmlRegPrintTrans(output, &(state->trans[i])); +    } +} + +#ifdef DEBUG_REGEXP_GRAPH +static void +xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) { +    int i; + +    fprintf(output, " ctxt: "); +    if (ctxt == NULL) { +	fprintf(output, "NULL\n"); +	return; +    } +    fprintf(output, "'%s' ", ctxt->string); +    if (ctxt->error) +	fprintf(output, "error "); +    if (ctxt->neg) +	fprintf(output, "neg "); +    fprintf(output, "\n"); +    fprintf(output, "%d atoms:\n", ctxt->nbAtoms); +    for (i = 0;i < ctxt->nbAtoms; i++) { +	fprintf(output, " %02d ", i); +	xmlRegPrintAtom(output, ctxt->atoms[i]); +    } +    if (ctxt->atom != NULL) { +	fprintf(output, "current atom:\n"); +	xmlRegPrintAtom(output, ctxt->atom); +    } +    fprintf(output, "%d states:", ctxt->nbStates); +    if (ctxt->start != NULL) +	fprintf(output, " start: %d", ctxt->start->no); +    if (ctxt->end != NULL) +	fprintf(output, " end: %d", ctxt->end->no); +    fprintf(output, "\n"); +    for (i = 0;i < ctxt->nbStates; i++) { +	xmlRegPrintState(output, ctxt->states[i]); +    } +    fprintf(output, "%d counters:\n", ctxt->nbCounters); +    for (i = 0;i < ctxt->nbCounters; i++) { +	fprintf(output, " %d: min %d max %d\n", i, ctxt->counters[i].min, +		                                ctxt->counters[i].max); +    } +} +#endif + +/************************************************************************ + * 									* + *		 Finite Automata structures manipulations		* + * 									* + ************************************************************************/ + +static void  +xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom, +	           int neg, xmlRegAtomType type, int start, int end, +		   xmlChar *blockName) { +    xmlRegRangePtr range; + +    if (atom == NULL) { +	ERROR("add range: atom is NULL"); +	return; +    } +    if (atom->type != XML_REGEXP_RANGES) { +	ERROR("add range: atom is not ranges"); +	return; +    } +    if (atom->maxRanges == 0) { +	atom->maxRanges = 4; +	atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges * +		                             sizeof(xmlRegRangePtr)); +	if (atom->ranges == NULL) { +	    ERROR("add range: allocation failed"); +	    atom->maxRanges = 0; +	    return; +	} +    } else if (atom->nbRanges >= atom->maxRanges) { +	xmlRegRangePtr *tmp; +	atom->maxRanges *= 2; +	tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges * +		                             sizeof(xmlRegRangePtr)); +	if (tmp == NULL) { +	    ERROR("add range: allocation failed"); +	    atom->maxRanges /= 2; +	    return; +	} +	atom->ranges = tmp; +    } +    range = xmlRegNewRange(ctxt, neg, type, start, end); +    if (range == NULL) +	return; +    range->blockName = blockName; +    atom->ranges[atom->nbRanges++] = range; +     +} + +static int +xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) { +    if (ctxt->maxCounters == 0) { +	ctxt->maxCounters = 4; +	ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters * +		                             sizeof(xmlRegCounter)); +	if (ctxt->counters == NULL) { +	    ERROR("reg counter: allocation failed"); +	    ctxt->maxCounters = 0; +	    return(-1); +	} +    } else if (ctxt->nbCounters >= ctxt->maxCounters) { +	xmlRegCounter *tmp; +	ctxt->maxCounters *= 2; +	tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters * +		                           sizeof(xmlRegCounter)); +	if (tmp == NULL) { +	    ERROR("reg counter: allocation failed"); +	    ctxt->maxCounters /= 2; +	    return(-1); +	} +	ctxt->counters = tmp; +    } +    ctxt->counters[ctxt->nbCounters].min = -1; +    ctxt->counters[ctxt->nbCounters].max = -1; +    return(ctxt->nbCounters++); +} + +static void  +xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) { +    if (atom == NULL) { +	ERROR("atom push: atom is NULL"); +	return; +    } +    if (ctxt->maxAtoms == 0) { +	ctxt->maxAtoms = 4; +	ctxt->atoms = (xmlRegAtomPtr *) xmlMalloc(ctxt->maxAtoms * +		                             sizeof(xmlRegAtomPtr)); +	if (ctxt->atoms == NULL) { +	    ERROR("atom push: allocation failed"); +	    ctxt->maxAtoms = 0; +	    return; +	} +    } else if (ctxt->nbAtoms >= ctxt->maxAtoms) { +	xmlRegAtomPtr *tmp; +	ctxt->maxAtoms *= 2; +	tmp = (xmlRegAtomPtr *) xmlRealloc(ctxt->atoms, ctxt->maxAtoms * +		                             sizeof(xmlRegAtomPtr)); +	if (tmp == NULL) { +	    ERROR("atom push: allocation failed"); +	    ctxt->maxAtoms /= 2; +	    return; +	} +	ctxt->atoms = tmp; +    } +    atom->no = ctxt->nbAtoms; +    ctxt->atoms[ctxt->nbAtoms++] = atom; +} + +static void  +xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, +	            xmlRegAtomPtr atom, xmlRegStatePtr target, +		    int counter, int count) { +    if (state == NULL) { +	ERROR("add state: state is NULL"); +	return; +    } +    if (target == NULL) { +	ERROR("add state: target is NULL"); +	return; +    } +    if (state->maxTrans == 0) { +	state->maxTrans = 4; +	state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans * +		                             sizeof(xmlRegTrans)); +	if (state->trans == NULL) { +	    ERROR("add range: allocation failed"); +	    state->maxTrans = 0; +	    return; +	} +    } else if (state->nbTrans >= state->maxTrans) { +	xmlRegTrans *tmp; +	state->maxTrans *= 2; +	tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans * +		                             sizeof(xmlRegTrans)); +	if (tmp == NULL) { +	    ERROR("add range: allocation failed"); +	    state->maxTrans /= 2; +	    return; +	} +	state->trans = tmp; +    } +#ifdef DEBUG_REGEXP_GRAPH +    printf("Add trans from %d to %d ", state->no, target->no); +    if (count == REGEXP_ALL_COUNTER) +	printf("all transition"); +    else if (count >= 0) +	printf("count based %d", count); +    else if (counter >= 0) +	printf("counted %d", counter); +    else if (atom == NULL) +	printf("epsilon transition"); +    printf("\n"); +#endif + +    state->trans[state->nbTrans].atom = atom; +    state->trans[state->nbTrans].to = target->no; +    state->trans[state->nbTrans].counter = counter; +    state->trans[state->nbTrans].count = count; +    state->nbTrans++; +} + +static void  +xmlRegStatePush(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) { +    if (ctxt->maxStates == 0) { +	ctxt->maxStates = 4; +	ctxt->states = (xmlRegStatePtr *) xmlMalloc(ctxt->maxStates * +		                             sizeof(xmlRegStatePtr)); +	if (ctxt->states == NULL) { +	    ERROR("add range: allocation failed"); +	    ctxt->maxStates = 0; +	    return; +	} +    } else if (ctxt->nbStates >= ctxt->maxStates) { +	xmlRegStatePtr *tmp; +	ctxt->maxStates *= 2; +	tmp = (xmlRegStatePtr *) xmlRealloc(ctxt->states, ctxt->maxStates * +		                             sizeof(xmlRegStatePtr)); +	if (tmp == NULL) { +	    ERROR("add range: allocation failed"); +	    ctxt->maxStates /= 2; +	    return; +	} +	ctxt->states = tmp; +    } +    state->no = ctxt->nbStates; +    ctxt->states[ctxt->nbStates++] = state; +} + +/** + * xmlFAGenerateAllTransition: + * @ctxt:  a regexp parser context + * @from:  the from state + * @to:  the target state or NULL for building a new one + * @lax: + * + */ +static void +xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt, +			   xmlRegStatePtr from, xmlRegStatePtr to, +			   int lax) { +    if (to == NULL) { +	to = xmlRegNewState(ctxt); +	xmlRegStatePush(ctxt, to); +	ctxt->state = to; +    } +    if (lax) +	xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER); +    else +	xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER); +} + +/** + * xmlFAGenerateEpsilonTransition: + * @ctxt:  a regexp parser context + * @from:  the from state + * @to:  the target state or NULL for building a new one + * + */ +static void +xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt, +			       xmlRegStatePtr from, xmlRegStatePtr to) { +    if (to == NULL) { +	to = xmlRegNewState(ctxt); +	xmlRegStatePush(ctxt, to); +	ctxt->state = to; +    } +    xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1); +} + +/** + * xmlFAGenerateCountedEpsilonTransition: + * @ctxt:  a regexp parser context + * @from:  the from state + * @to:  the target state or NULL for building a new one + * counter:  the counter for that transition + * + */ +static void +xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt, +	    xmlRegStatePtr from, xmlRegStatePtr to, int counter) { +    if (to == NULL) { +	to = xmlRegNewState(ctxt); +	xmlRegStatePush(ctxt, to); +	ctxt->state = to; +    } +    xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1); +} + +/** + * xmlFAGenerateCountedTransition: + * @ctxt:  a regexp parser context + * @from:  the from state + * @to:  the target state or NULL for building a new one + * counter:  the counter for that transition + * + */ +static void +xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt, +	    xmlRegStatePtr from, xmlRegStatePtr to, int counter) { +    if (to == NULL) { +	to = xmlRegNewState(ctxt); +	xmlRegStatePush(ctxt, to); +	ctxt->state = to; +    } +    xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter); +} + +/** + * xmlFAGenerateTransitions: + * @ctxt:  a regexp parser context + * @from:  the from state + * @to:  the target state or NULL for building a new one + * @atom:  the atom generating the transition + * + */ +static void +xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from, +	                 xmlRegStatePtr to, xmlRegAtomPtr atom) { +    if (atom == NULL) { +	ERROR("genrate transition: atom == NULL"); +	return; +    } +    if (atom->type == XML_REGEXP_SUBREG) { +	/* +	 * this is a subexpression handling one should not need to +	 * create a new node excep for XML_REGEXP_QUANT_RANGE. +	 */ +	xmlRegAtomPush(ctxt, atom); +	if ((to != NULL) && (atom->stop != to) && +	    (atom->quant != XML_REGEXP_QUANT_RANGE)) { +	    /* +	     * Generate an epsilon transition to link to the target +	     */ +	    xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to); +	} +	switch (atom->quant) { +	    case XML_REGEXP_QUANT_OPT: +		atom->quant = XML_REGEXP_QUANT_ONCE; +		xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop); +		break; +	    case XML_REGEXP_QUANT_MULT: +		atom->quant = XML_REGEXP_QUANT_ONCE; +		xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop); +		xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start); +		break; +	    case XML_REGEXP_QUANT_PLUS: +		atom->quant = XML_REGEXP_QUANT_ONCE; +		xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start); +		break; +	    case XML_REGEXP_QUANT_RANGE: { +		int counter; +		xmlRegStatePtr newstate; + +		/* +		 * This one is nasty: +		 *   1/ register a new counter +		 *   2/ register an epsilon transition associated to +		 *      this counter going from atom->stop to atom->start +		 *   3/ create a new state +		 *   4/ generate a counted transition from atom->stop to +		 *      that state +		 */ +		counter = xmlRegGetCounter(ctxt); +		ctxt->counters[counter].min = atom->min - 1; +		ctxt->counters[counter].max = atom->max - 1; +		atom->min = 0; +		atom->max = 0; +		atom->quant = XML_REGEXP_QUANT_ONCE; +		xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop, +			                              atom->start, counter); +		if (to != NULL) { +		    newstate = to; +		} else { +		    newstate = xmlRegNewState(ctxt); +		    xmlRegStatePush(ctxt, newstate); +		    ctxt->state = newstate; +		} +		xmlFAGenerateCountedTransition(ctxt, atom->stop, +			                       newstate, counter); +	    } +	    default: +		break; +	} +	return; +    } else { +	if (to == NULL) { +	    to = xmlRegNewState(ctxt); +	    xmlRegStatePush(ctxt, to); +	} +	xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1); +	xmlRegAtomPush(ctxt, atom); +	ctxt->state = to; +    } +    switch (atom->quant) { +	case XML_REGEXP_QUANT_OPT: +	    atom->quant = XML_REGEXP_QUANT_ONCE; +	    xmlFAGenerateEpsilonTransition(ctxt, from, to); +	    break; +	case XML_REGEXP_QUANT_MULT: +	    atom->quant = XML_REGEXP_QUANT_ONCE; +	    xmlFAGenerateEpsilonTransition(ctxt, from, to); +	    xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1); +	    break; +	case XML_REGEXP_QUANT_PLUS: +	    atom->quant = XML_REGEXP_QUANT_ONCE; +	    xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1); +	    break; +	default: +	    break; +    } +} + +/** + * xmlFAReduceEpsilonTransitions: + * @ctxt:  a regexp parser context + * @fromnr:  the from state + * @tonr:  the to state  + * @cpunter:  should that transition be associted to a counted + * + */ +static void +xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr, +	                      int tonr, int counter) { +    int transnr; +    xmlRegStatePtr from; +    xmlRegStatePtr to; + +#ifdef DEBUG_REGEXP_GRAPH +    printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr, tonr); +#endif +    from = ctxt->states[fromnr]; +    if (from == NULL) +	return; +    to = ctxt->states[tonr]; +    if (to == NULL) +	return; +    if ((to->mark == XML_REGEXP_MARK_START) || +	(to->mark == XML_REGEXP_MARK_VISITED)) +	return; + +    to->mark = XML_REGEXP_MARK_VISITED; +    if (to->type == XML_REGEXP_FINAL_STATE) { +#ifdef DEBUG_REGEXP_GRAPH +	printf("State %d is final, so %d becomes final\n", tonr, fromnr); +#endif +	from->type = XML_REGEXP_FINAL_STATE; +    } +    for (transnr = 0;transnr < to->nbTrans;transnr++) { +	if (to->trans[transnr].atom == NULL) { +	    /* +	     * Don't remove counted transitions +	     * Don't loop either +	     */ +	    if (to->trans[transnr].to != fromnr) { +		if (to->trans[transnr].count >= 0) { +		    int newto = to->trans[transnr].to; + +		    xmlRegStateAddTrans(ctxt, from, NULL, +					ctxt->states[newto],  +					-1, to->trans[transnr].count); +		} else { +#ifdef DEBUG_REGEXP_GRAPH +		    printf("Found epsilon trans %d from %d to %d\n", +			   transnr, tonr, to->trans[transnr].to); +#endif +		    if (to->trans[transnr].counter >= 0) { +			xmlFAReduceEpsilonTransitions(ctxt, fromnr, +					      to->trans[transnr].to, +					      to->trans[transnr].counter); +		    } else { +			xmlFAReduceEpsilonTransitions(ctxt, fromnr, +					      to->trans[transnr].to, +					      counter); +		    } +		} +	    } +	} else { +	    int newto = to->trans[transnr].to; + +	    if (to->trans[transnr].counter >= 0) { +		xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,  +				    ctxt->states[newto],  +				    to->trans[transnr].counter, -1); +	    } else { +		xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,  +				    ctxt->states[newto], counter, -1); +	    } +	} +    } +    to->mark = XML_REGEXP_MARK_NORMAL; +} + +/** + * xmlFAEliminateEpsilonTransitions: + * @ctxt:  a regexp parser context + * + */ +static void +xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) { +    int statenr, transnr; +    xmlRegStatePtr state; + +    /* +     * build the completed transitions bypassing the epsilons +     * Use a marking algorithm to avoid loops +     */ +    for (statenr = 0;statenr < ctxt->nbStates;statenr++) { +	state = ctxt->states[statenr]; +	if (state == NULL) +	    continue; +	for (transnr = 0;transnr < state->nbTrans;transnr++) { +	    if ((state->trans[transnr].atom == NULL) && +		(state->trans[transnr].to >= 0)) { +		if (state->trans[transnr].to == statenr) { +		    state->trans[transnr].to = -1; +#ifdef DEBUG_REGEXP_GRAPH +		    printf("Removed loopback epsilon trans %d on %d\n", +			   transnr, statenr); +#endif +		} else if (state->trans[transnr].count < 0) { +		    int newto = state->trans[transnr].to; + +#ifdef DEBUG_REGEXP_GRAPH +		    printf("Found epsilon trans %d from %d to %d\n", +			   transnr, statenr, newto); +#endif +		    state->mark = XML_REGEXP_MARK_START; +		    xmlFAReduceEpsilonTransitions(ctxt, statenr, +				      newto, state->trans[transnr].counter); +		    state->mark = XML_REGEXP_MARK_NORMAL; +#ifdef DEBUG_REGEXP_GRAPH +		} else { +		    printf("Found counted transition %d on %d\n", +			   transnr, statenr); +#endif +	        } +	    } +	} +    } +    /* +     * Eliminate the epsilon transitions +     */ +    for (statenr = 0;statenr < ctxt->nbStates;statenr++) { +	state = ctxt->states[statenr]; +	if (state == NULL) +	    continue; +	for (transnr = 0;transnr < state->nbTrans;transnr++) { +	    if ((state->trans[transnr].atom == NULL) && +		(state->trans[transnr].count < 0) && +		(state->trans[transnr].to >= 0)) { +		state->trans[transnr].to = -1; +	    } +	} +    } + +    /* +     * Use this pass to detect unreachable states too +     */ +    for (statenr = 0;statenr < ctxt->nbStates;statenr++) { +	state = ctxt->states[statenr]; +	if (state != NULL) +	    state->reached = 0; +    } +    state = ctxt->states[0]; +    if (state != NULL) +	state->reached = 1; +    while (state != NULL) { +	xmlRegStatePtr target = NULL; +	state->reached = 2; +	/* +	 * Mark all state reachable from the current reachable state +	 */ +	for (transnr = 0;transnr < state->nbTrans;transnr++) { +	    if ((state->trans[transnr].to >= 0) && +		((state->trans[transnr].atom != NULL) || +		 (state->trans[transnr].count >= 0))) { +		int newto = state->trans[transnr].to; + +		if (ctxt->states[newto] == NULL) +		    continue; +		if (ctxt->states[newto]->reached == 0) { +		    ctxt->states[newto]->reached = 1; +		    target = ctxt->states[newto]; +		} +	    } +	} +	/* +	 * find the next accessible state not explored +	 */ +	if (target == NULL) { +	    for (statenr = 1;statenr < ctxt->nbStates;statenr++) { +		state = ctxt->states[statenr]; +		if ((state != NULL) && (state->reached == 1)) { +		    target = state; +		    break; +		} +	    } +	} +	state = target; +    } +    for (statenr = 0;statenr < ctxt->nbStates;statenr++) { +	state = ctxt->states[statenr]; +	if ((state != NULL) && (state->reached == 0)) { +#ifdef DEBUG_REGEXP_GRAPH +	    printf("Removed unreachable state %d\n", statenr); +#endif +	    xmlRegFreeState(state); +	    ctxt->states[statenr] = NULL; +	} +    } + +} + +/** + * xmlFACompareAtoms: + * @atom1:  an atom + * @atom2:  an atom + * + * Compares two atoms to check whether they are equivatents + * + * Returns 1 if yes and 0 otherwise + */ +static int +xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2) { +    if (atom1 == atom2) +	return(1); +    if ((atom1 == NULL) || (atom2 == NULL)) +	return(0); + +    if (atom1->type != atom2->type) +	return(0); +    switch (atom1->type) { +        case XML_REGEXP_STRING: +	    return(xmlStrEqual((xmlChar *)atom1->valuep, +			       (xmlChar *)atom2->valuep)); +        case XML_REGEXP_EPSILON: +	    return(1); +        case XML_REGEXP_CHARVAL: +	    return(atom1->codepoint == atom2->codepoint); +        case XML_REGEXP_RANGES: +	    TODO; +	    return(0); +	default: +	    break; +    } +    return(1); +} + +/** + * xmlFARecurseDeterminism: + * @ctxt:  a regexp parser context + * + * Check whether the associated regexp is determinist, + * should be called after xmlFAEliminateEpsilonTransitions() + * + */ +static int +xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, +	                 int to, xmlRegAtomPtr atom) { +    int ret = 1; +    int transnr; +    xmlRegTransPtr t1; + +    if (state == NULL) +	return(ret); +    for (transnr = 0;transnr < state->nbTrans;transnr++) { +	t1 = &(state->trans[transnr]); +	/* +	 * check transitions conflicting with the one looked at +	 */ +	if (t1->atom == NULL) { +	    if (t1->to == -1) +		continue; +	    ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to], +		                           to, atom); +	    if (ret == 0) +		return(0); +	    continue; +	} +	if (t1->to != to) +	    continue; +	if (xmlFACompareAtoms(t1->atom, atom)) +	    return(0); +    } +    return(ret); +} + +/** + * xmlFAComputesDeterminism: + * @ctxt:  a regexp parser context + * + * Check whether the associated regexp is determinist, + * should be called after xmlFAEliminateEpsilonTransitions() + * + */ +static int +xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) { +    int statenr, transnr; +    xmlRegStatePtr state; +    xmlRegTransPtr t1, t2; +    int i; +    int ret = 1; + +#ifdef DEBUG_REGEXP_GRAPH +    printf("xmlFAComputesDeterminism\n"); +    xmlRegPrintCtxt(stdout, ctxt); +#endif +    if (ctxt->determinist != -1) +	return(ctxt->determinist); + +    /* +     * Check for all states that there isn't 2 transitions +     * with the same atom and a different target. +     */ +    for (statenr = 0;statenr < ctxt->nbStates;statenr++) { +	state = ctxt->states[statenr]; +	if (state == NULL) +	    continue; +	for (transnr = 0;transnr < state->nbTrans;transnr++) { +	    t1 = &(state->trans[transnr]); +	    /* +	     * Determinism checks in case of counted or all transitions +	     * will have to be handled separately +	     */ +	    if (t1->atom == NULL) +		continue; +	    if (t1->to == -1) /* eliminated */ +		continue; +	    for (i = 0;i < transnr;i++) { +		t2 = &(state->trans[i]); +		if (t2->to == -1) /* eliminated */ +		    continue; +		if (t2->atom != NULL) { +		    if (t1->to == t2->to) { +			if (xmlFACompareAtoms(t1->atom, t2->atom)) +			    t2->to = -1; /* eliminate */ +		    } else { +			/* not determinist ! */ +			if (xmlFACompareAtoms(t1->atom, t2->atom)) +			    ret = 0; +		    } +		} else if (t1->to != -1) { +		    /* +		     * do the closure in case of remaining specific +		     * epsilon transitions like choices or all +		     */ +		    ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to], +						   t2->to, t2->atom); +		    if (ret == 0) +			return(0); +		} +	    } +	    if (ret == 0) +		break; +	} +	if (ret == 0) +	    break; +    } +    ctxt->determinist = ret; +    return(ret); +} + +/************************************************************************ + * 									* + *	Routines to check input against transition atoms		* + * 									* + ************************************************************************/ + +static int +xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg, +	                  int start, int end, const xmlChar *blockName) { +    int ret = 0; + +    switch (type) { +        case XML_REGEXP_STRING: +        case XML_REGEXP_SUBREG: +        case XML_REGEXP_RANGES: +        case XML_REGEXP_EPSILON: +	    return(-1); +        case XML_REGEXP_ANYCHAR: +	    ret = ((codepoint != '\n') && (codepoint != '\r')); +	    break; +        case XML_REGEXP_CHARVAL: +	    ret = ((codepoint >= start) && (codepoint <= end)); +	    break; +        case XML_REGEXP_NOTSPACE: +	    neg = !neg; +        case XML_REGEXP_ANYSPACE: +	    ret = ((codepoint == '\n') || (codepoint == '\r') || +		   (codepoint == '\t') || (codepoint == ' ')); +	    break; +        case XML_REGEXP_NOTINITNAME: +	    neg = !neg; +        case XML_REGEXP_INITNAME: +	    ret = (xmlIsLetter(codepoint) ||  +		   (codepoint == '_') || (codepoint == ':')); +	    break; +        case XML_REGEXP_NOTNAMECHAR: +	    neg = !neg; +        case XML_REGEXP_NAMECHAR: +	    ret = (xmlIsLetter(codepoint) || xmlIsDigit(codepoint) || +		   (codepoint == '.') || (codepoint == '-') || +		   (codepoint == '_') || (codepoint == ':') || +		   xmlIsCombining(codepoint) || xmlIsExtender(codepoint)); +	    break; +        case XML_REGEXP_NOTDECIMAL: +	    neg = !neg; +        case XML_REGEXP_DECIMAL: +	    ret = xmlUCSIsCatNd(codepoint); +	    break; +        case XML_REGEXP_REALCHAR: +	    neg = !neg; +        case XML_REGEXP_NOTREALCHAR: +	    ret = xmlUCSIsCatP(codepoint); +	    if (ret == 0) +		ret = xmlUCSIsCatZ(codepoint); +	    if (ret == 0) +		ret = xmlUCSIsCatC(codepoint); +	    break; +        case XML_REGEXP_LETTER: +	    ret = xmlUCSIsCatL(codepoint); +	    break; +        case XML_REGEXP_LETTER_UPPERCASE: +	    ret = xmlUCSIsCatLu(codepoint); +	    break; +        case XML_REGEXP_LETTER_LOWERCASE: +	    ret = xmlUCSIsCatLl(codepoint); +	    break; +        case XML_REGEXP_LETTER_TITLECASE: +	    ret = xmlUCSIsCatLt(codepoint); +	    break; +        case XML_REGEXP_LETTER_MODIFIER: +	    ret = xmlUCSIsCatLm(codepoint); +	    break; +        case XML_REGEXP_LETTER_OTHERS: +	    ret = xmlUCSIsCatLo(codepoint); +	    break; +        case XML_REGEXP_MARK: +	    ret = xmlUCSIsCatM(codepoint); +	    break; +        case XML_REGEXP_MARK_NONSPACING: +	    ret = xmlUCSIsCatMn(codepoint); +	    break; +        case XML_REGEXP_MARK_SPACECOMBINING: +	    ret = xmlUCSIsCatMc(codepoint); +	    break; +        case XML_REGEXP_MARK_ENCLOSING: +	    ret = xmlUCSIsCatMe(codepoint); +	    break; +        case XML_REGEXP_NUMBER: +	    ret = xmlUCSIsCatN(codepoint); +	    break; +        case XML_REGEXP_NUMBER_DECIMAL: +	    ret = xmlUCSIsCatNd(codepoint); +	    break; +        case XML_REGEXP_NUMBER_LETTER: +	    ret = xmlUCSIsCatNl(codepoint); +	    break; +        case XML_REGEXP_NUMBER_OTHERS: +	    ret = xmlUCSIsCatNo(codepoint); +	    break; +        case XML_REGEXP_PUNCT: +	    ret = xmlUCSIsCatP(codepoint); +	    break; +        case XML_REGEXP_PUNCT_CONNECTOR: +	    ret = xmlUCSIsCatPc(codepoint); +	    break; +        case XML_REGEXP_PUNCT_DASH: +	    ret = xmlUCSIsCatPd(codepoint); +	    break; +        case XML_REGEXP_PUNCT_OPEN: +	    ret = xmlUCSIsCatPs(codepoint); +	    break; +        case XML_REGEXP_PUNCT_CLOSE: +	    ret = xmlUCSIsCatPe(codepoint); +	    break; +        case XML_REGEXP_PUNCT_INITQUOTE: +	    ret = xmlUCSIsCatPi(codepoint); +	    break; +        case XML_REGEXP_PUNCT_FINQUOTE: +	    ret = xmlUCSIsCatPf(codepoint); +	    break; +        case XML_REGEXP_PUNCT_OTHERS: +	    ret = xmlUCSIsCatPo(codepoint); +	    break; +        case XML_REGEXP_SEPAR: +	    ret = xmlUCSIsCatZ(codepoint); +	    break; +        case XML_REGEXP_SEPAR_SPACE: +	    ret = xmlUCSIsCatZs(codepoint); +	    break; +        case XML_REGEXP_SEPAR_LINE: +	    ret = xmlUCSIsCatZl(codepoint); +	    break; +        case XML_REGEXP_SEPAR_PARA: +	    ret = xmlUCSIsCatZp(codepoint); +	    break; +        case XML_REGEXP_SYMBOL: +	    ret = xmlUCSIsCatS(codepoint); +	    break; +        case XML_REGEXP_SYMBOL_MATH: +	    ret = xmlUCSIsCatSm(codepoint); +	    break; +        case XML_REGEXP_SYMBOL_CURRENCY: +	    ret = xmlUCSIsCatSc(codepoint); +	    break; +        case XML_REGEXP_SYMBOL_MODIFIER: +	    ret = xmlUCSIsCatSk(codepoint); +	    break; +        case XML_REGEXP_SYMBOL_OTHERS: +	    ret = xmlUCSIsCatSo(codepoint); +	    break; +        case XML_REGEXP_OTHER: +	    ret = xmlUCSIsCatC(codepoint); +	    break; +        case XML_REGEXP_OTHER_CONTROL: +	    ret = xmlUCSIsCatCc(codepoint); +	    break; +        case XML_REGEXP_OTHER_FORMAT: +	    ret = xmlUCSIsCatCf(codepoint); +	    break; +        case XML_REGEXP_OTHER_PRIVATE: +	    ret = xmlUCSIsCatCo(codepoint); +	    break; +        case XML_REGEXP_OTHER_NA: +	    /* ret = xmlUCSIsCatCn(codepoint); */ +	    /* Seems it doesn't exist anymore in recent Unicode releases */ +	    ret = 0; +	    break; +        case XML_REGEXP_BLOCK_NAME: +	    ret = xmlUCSIsBlock(codepoint, (const char *) blockName); +	    break; +    } +    if (neg) +	return(!ret); +    return(ret); +} + +static int +xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) { +    int i, ret = 0; +    xmlRegRangePtr range; + +    if ((atom == NULL) || (!xmlIsChar(codepoint))) +	return(-1); + +    switch (atom->type) { +        case XML_REGEXP_SUBREG: +        case XML_REGEXP_EPSILON: +	    return(-1); +        case XML_REGEXP_CHARVAL: +            return(codepoint == atom->codepoint); +        case XML_REGEXP_RANGES: { +	    int accept = 0; +	    for (i = 0;i < atom->nbRanges;i++) { +		range = atom->ranges[i]; +		if (range->neg) { +		    ret = xmlRegCheckCharacterRange(range->type, codepoint, +						0, range->start, range->end, +						range->blockName); +		    if (ret != 0) +			return(0); /* excluded char */ +		} else { +		    ret = xmlRegCheckCharacterRange(range->type, codepoint, +						0, range->start, range->end, +						range->blockName); +		    if (ret != 0) +			accept = 1; /* might still be excluded */ +		} +	    } +	    return(accept); +	} +        case XML_REGEXP_STRING: +	    printf("TODO: XML_REGEXP_STRING\n"); +	    return(-1); +        case XML_REGEXP_ANYCHAR: +        case XML_REGEXP_ANYSPACE: +        case XML_REGEXP_NOTSPACE: +        case XML_REGEXP_INITNAME: +        case XML_REGEXP_NOTINITNAME: +        case XML_REGEXP_NAMECHAR: +        case XML_REGEXP_NOTNAMECHAR: +        case XML_REGEXP_DECIMAL: +        case XML_REGEXP_NOTDECIMAL: +        case XML_REGEXP_REALCHAR: +        case XML_REGEXP_NOTREALCHAR: +        case XML_REGEXP_LETTER: +        case XML_REGEXP_LETTER_UPPERCASE: +        case XML_REGEXP_LETTER_LOWERCASE: +        case XML_REGEXP_LETTER_TITLECASE: +        case XML_REGEXP_LETTER_MODIFIER: +        case XML_REGEXP_LETTER_OTHERS: +        case XML_REGEXP_MARK: +        case XML_REGEXP_MARK_NONSPACING: +        case XML_REGEXP_MARK_SPACECOMBINING: +        case XML_REGEXP_MARK_ENCLOSING: +        case XML_REGEXP_NUMBER: +        case XML_REGEXP_NUMBER_DECIMAL: +        case XML_REGEXP_NUMBER_LETTER: +        case XML_REGEXP_NUMBER_OTHERS: +        case XML_REGEXP_PUNCT: +        case XML_REGEXP_PUNCT_CONNECTOR: +        case XML_REGEXP_PUNCT_DASH: +        case XML_REGEXP_PUNCT_OPEN: +        case XML_REGEXP_PUNCT_CLOSE: +        case XML_REGEXP_PUNCT_INITQUOTE: +        case XML_REGEXP_PUNCT_FINQUOTE: +        case XML_REGEXP_PUNCT_OTHERS: +        case XML_REGEXP_SEPAR: +        case XML_REGEXP_SEPAR_SPACE: +        case XML_REGEXP_SEPAR_LINE: +        case XML_REGEXP_SEPAR_PARA: +        case XML_REGEXP_SYMBOL: +        case XML_REGEXP_SYMBOL_MATH: +        case XML_REGEXP_SYMBOL_CURRENCY: +        case XML_REGEXP_SYMBOL_MODIFIER: +        case XML_REGEXP_SYMBOL_OTHERS: +        case XML_REGEXP_OTHER: +        case XML_REGEXP_OTHER_CONTROL: +        case XML_REGEXP_OTHER_FORMAT: +        case XML_REGEXP_OTHER_PRIVATE: +        case XML_REGEXP_OTHER_NA: +	case XML_REGEXP_BLOCK_NAME: +	    ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0, +		                            (const xmlChar *)atom->valuep); +	    if (atom->neg) +		ret = !ret; +	    break; +    } +    return(ret); +} + +/************************************************************************ + * 									* + *	Saving an restoring state of an execution context		* + * 									* + ************************************************************************/ + +#ifdef DEBUG_REGEXP_EXEC +static void +xmlFARegDebugExec(xmlRegExecCtxtPtr exec) { +    printf("state: %d:%d:idx %d", exec->state->no, exec->transno, exec->index); +    if (exec->inputStack != NULL) { +	int i; +	printf(": "); +	for (i = 0;(i < 3) && (i < exec->inputStackNr);i++) +	    printf("%s ", exec->inputStack[exec->inputStackNr - (i + 1)]); +    } else { +	printf(": %s", &(exec->inputString[exec->index])); +    } +    printf("\n"); +} +#endif + +static void +xmlFARegExecSave(xmlRegExecCtxtPtr exec) { +#ifdef DEBUG_REGEXP_EXEC +    printf("saving "); +    exec->transno++; +    xmlFARegDebugExec(exec); +    exec->transno--; +#endif + +    if (exec->maxRollbacks == 0) { +	exec->maxRollbacks = 4; +	exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks * +		                             sizeof(xmlRegExecRollback)); +	if (exec->rollbacks == NULL) { +	    fprintf(stderr, "exec save: allocation failed"); +	    exec->maxRollbacks = 0; +	    return; +	} +	memset(exec->rollbacks, 0, +	       exec->maxRollbacks * sizeof(xmlRegExecRollback)); +    } else if (exec->nbRollbacks >= exec->maxRollbacks) { +	xmlRegExecRollback *tmp; +	int len = exec->maxRollbacks; + +	exec->maxRollbacks *= 2; +	tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks, +			exec->maxRollbacks * sizeof(xmlRegExecRollback)); +	if (tmp == NULL) { +	    fprintf(stderr, "exec save: allocation failed"); +	    exec->maxRollbacks /= 2; +	    return; +	} +	exec->rollbacks = tmp; +	tmp = &exec->rollbacks[len]; +	memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback)); +    } +    exec->rollbacks[exec->nbRollbacks].state = exec->state; +    exec->rollbacks[exec->nbRollbacks].index = exec->index; +    exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1; +    if (exec->comp->nbCounters > 0) { +	if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { +	    exec->rollbacks[exec->nbRollbacks].counts = (int *) +		xmlMalloc(exec->comp->nbCounters * sizeof(int)); +	    if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { +		fprintf(stderr, "exec save: allocation failed"); +		exec->status = -5; +		return; +	    } +	} +	memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts, +	       exec->comp->nbCounters * sizeof(int)); +    } +    exec->nbRollbacks++; +} + +static void +xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) { +    if (exec->nbRollbacks <= 0) { +	exec->status = -1; +#ifdef DEBUG_REGEXP_EXEC +	printf("rollback failed on empty stack\n"); +#endif +	return; +    } +    exec->nbRollbacks--; +    exec->state = exec->rollbacks[exec->nbRollbacks].state; +    exec->index = exec->rollbacks[exec->nbRollbacks].index; +    exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch; +    if (exec->comp->nbCounters > 0) { +	if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { +	    fprintf(stderr, "exec save: allocation failed"); +	    exec->status = -6; +	    return; +	} +	memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts, +	       exec->comp->nbCounters * sizeof(int)); +    } + +#ifdef DEBUG_REGEXP_EXEC +    printf("restored "); +    xmlFARegDebugExec(exec); +#endif +} + +/************************************************************************ + * 									* + *	Verifyer, running an input against a compiled regexp		* + * 									* + ************************************************************************/ + +static int +xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) { +    xmlRegExecCtxt execval; +    xmlRegExecCtxtPtr exec = &execval; +    int ret, codepoint, len; + +    exec->inputString = content; +    exec->index = 0; +    exec->determinist = 1; +    exec->maxRollbacks = 0; +    exec->nbRollbacks = 0; +    exec->rollbacks = NULL; +    exec->status = 0; +    exec->comp = comp; +    exec->state = comp->states[0]; +    exec->transno = 0; +    exec->transcount = 0; +    if (comp->nbCounters > 0) { +	exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)); +	if (exec->counts == NULL) +	    return(-1); +        memset(exec->counts, 0, comp->nbCounters * sizeof(int)); +    } else +	exec->counts = NULL; +    while ((exec->status == 0) && +	   ((exec->inputString[exec->index] != 0) || +	    (exec->state->type != XML_REGEXP_FINAL_STATE))) { +	xmlRegTransPtr trans; +	xmlRegAtomPtr atom; + +	/* +	 * End of input on non-terminal state, rollback, however we may +	 * still have epsilon like transition for counted transitions +	 * on counters, in that case don't break too early. +	 */ +	if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) +	    goto rollback; + +	exec->transcount = 0; +	for (;exec->transno < exec->state->nbTrans;exec->transno++) { +	    trans = &exec->state->trans[exec->transno]; +	    if (trans->to < 0) +		continue; +	    atom = trans->atom; +	    ret = 0; +	    if (trans->count >= 0) { +		int count; +		xmlRegCounterPtr counter; + +		/* +		 * A counted transition. +		 */ + +		count = exec->counts[trans->count]; +		counter = &exec->comp->counters[trans->count]; +#ifdef DEBUG_REGEXP_EXEC +		printf("testing count %d: val %d, min %d, max %d\n", +		       trans->count, count, counter->min,  counter->max); +#endif +		ret = ((count >= counter->min) && (count <= counter->max)); +	    } else if (atom == NULL) { +		fprintf(stderr, "epsilon transition left at runtime\n"); +		exec->status = -2; +		break; +	    } else if (exec->inputString[exec->index] != 0) { +                codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len); +		ret = xmlRegCheckCharacter(atom, codepoint); +		if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) { +		    xmlRegStatePtr to = comp->states[trans->to]; + +		    /* +		     * this is a multiple input sequence +		     */ +		    if (exec->state->nbTrans > exec->transno + 1) { +			xmlFARegExecSave(exec); +		    } +		    exec->transcount = 1; +		    do { +			/* +			 * Try to progress as much as possible on the input +			 */ +			if (exec->transcount == atom->max) { +			    break; +			} +			exec->index += len; +			/* +			 * End of input: stop here +			 */ +			if (exec->inputString[exec->index] == 0) { +			    exec->index -= len; +			    break; +			} +			if (exec->transcount >= atom->min) { +			    int transno = exec->transno; +			    xmlRegStatePtr state = exec->state; + +			    /* +			     * The transition is acceptable save it +			     */ +			    exec->transno = -1; /* trick */ +			    exec->state = to; +			    xmlFARegExecSave(exec); +			    exec->transno = transno; +			    exec->state = state; +			} +			codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), +				              len); +			ret = xmlRegCheckCharacter(atom, codepoint); +			exec->transcount++; +		    } while (ret == 1); +		    if (exec->transcount < atom->min) +			ret = 0; + +		    /* +		     * If the last check failed but one transition was found +		     * possible, rollback +		     */ +		    if (ret < 0) +			ret = 0; +		    if (ret == 0) { +			goto rollback; +		    } +		} +	    } +	    if (ret == 1) { +		if (exec->state->nbTrans > exec->transno + 1) { +		    xmlFARegExecSave(exec); +		} +		if (trans->counter >= 0) { +#ifdef DEBUG_REGEXP_EXEC +		    printf("Increasing count %d\n", trans->counter); +#endif +		    exec->counts[trans->counter]++; +		} +#ifdef DEBUG_REGEXP_EXEC +		printf("entering state %d\n", trans->to); +#endif +		exec->state = comp->states[trans->to]; +		exec->transno = 0; +		if (trans->atom != NULL) { +		    exec->index += len; +		} +		goto progress; +	    } else if (ret < 0) { +		exec->status = -4; +		break; +	    } +	} +	if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { +rollback: +	    /* +	     * Failed to find a way out +	     */ +	    exec->determinist = 0; +	    xmlFARegExecRollBack(exec); +	} +progress: +	continue; +    } +    if (exec->rollbacks != NULL) { +	if (exec->counts != NULL) { +	    int i; + +	    for (i = 0;i < exec->maxRollbacks;i++) +		if (exec->rollbacks[i].counts != NULL) +		    xmlFree(exec->rollbacks[i].counts); +	} +	xmlFree(exec->rollbacks); +    } +    if (exec->counts != NULL) +	xmlFree(exec->counts); +    if (exec->status == 0) +	return(1); +    if (exec->status == -1) +	return(0); +    return(exec->status); +} + +/************************************************************************ + * 									* + *	Progressive interface to the verifyer one atom at a time	* + * 									* + ************************************************************************/ + +/** + * xmlRegNewExecCtxt: + * @comp: a precompiled regular expression + * @callback: a callback function used for handling progresses in the + *            automata matching phase + * @data: the context data associated to the callback in this context + * + * Build a context used for progressive evaluation of a regexp. + * + * Returns the new context + */ +xmlRegExecCtxtPtr +xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) { +    xmlRegExecCtxtPtr exec; + +    if (comp == NULL) +	return(NULL); +    exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt)); +    if (exec == NULL) { +	return(NULL); +    } +    memset(exec, 0, sizeof(xmlRegExecCtxt)); +    exec->inputString = NULL; +    exec->index = 0; +    exec->determinist = 1; +    exec->maxRollbacks = 0; +    exec->nbRollbacks = 0; +    exec->rollbacks = NULL; +    exec->status = 0; +    exec->comp = comp; +    if (comp->compact == NULL) +	exec->state = comp->states[0]; +    exec->transno = 0; +    exec->transcount = 0; +    exec->callback = callback; +    exec->data = data; +    if (comp->nbCounters > 0) { +	exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)); +	if (exec->counts == NULL) { +	    xmlFree(exec); +	    return(NULL); +	} +        memset(exec->counts, 0, comp->nbCounters * sizeof(int)); +    } else +	exec->counts = NULL; +    exec->inputStackMax = 0; +    exec->inputStackNr = 0; +    exec->inputStack = NULL; +    return(exec); +} + +/** + * xmlRegFreeExecCtxt: + * @exec: a regular expression evaulation context + * + * Free the structures associated to a regular expression evaulation context. + */ +void +xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) { +    if (exec == NULL) +	return; + +    if (exec->rollbacks != NULL) { +	if (exec->counts != NULL) { +	    int i; + +	    for (i = 0;i < exec->maxRollbacks;i++) +		if (exec->rollbacks[i].counts != NULL) +		    xmlFree(exec->rollbacks[i].counts); +	} +	xmlFree(exec->rollbacks); +    } +    if (exec->counts != NULL) +	xmlFree(exec->counts); +    if (exec->inputStack != NULL) { +	int i; + +	for (i = 0;i < exec->inputStackNr;i++) { +	    if (exec->inputStack[i].value != NULL) +		xmlFree(exec->inputStack[i].value); +	} +	xmlFree(exec->inputStack); +    } +    xmlFree(exec); +} + +static void +xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value, +	                    void *data) { +#ifdef DEBUG_PUSH +    printf("saving value: %d:%s\n", exec->inputStackNr, value); +#endif +    if (exec->inputStackMax == 0) { +	exec->inputStackMax = 4; +	exec->inputStack = (xmlRegInputTokenPtr)  +	    xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken)); +	if (exec->inputStack == NULL) { +	    fprintf(stderr, "push input: allocation failed"); +	    exec->inputStackMax = 0; +	    return; +	} +    } else if (exec->inputStackNr + 1 >= exec->inputStackMax) { +	xmlRegInputTokenPtr tmp; + +	exec->inputStackMax *= 2; +	tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack, +			exec->inputStackMax * sizeof(xmlRegInputToken)); +	if (tmp == NULL) { +	    fprintf(stderr, "push input: allocation failed"); +	    exec->inputStackMax /= 2; +	    return; +	} +	exec->inputStack = tmp; +    } +    exec->inputStack[exec->inputStackNr].value = xmlStrdup(value); +    exec->inputStack[exec->inputStackNr].data = data; +    exec->inputStackNr++; +    exec->inputStack[exec->inputStackNr].value = NULL; +    exec->inputStack[exec->inputStackNr].data = NULL; +} + + +/** + * xmlRegCompactPushString: + * @exec: a regexp execution context + * @comp:  the precompiled exec with a compact table + * @value: a string token input + * @data: data associated to the token to reuse in callbacks + * + * Push one input token in the execution context + * + * Returns: 1 if the regexp reached a final state, 0 if non-final, and + *     a negative value in case of error. + */ +static int +xmlRegCompactPushString(xmlRegExecCtxtPtr exec, +	                xmlRegexpPtr comp, +	                const xmlChar *value, +	                void *data) { +    int state = exec->index; +    int i, target; + +    if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL)) +	return(-1); +     +    if (value == NULL) { +	/* +	 * are we at a final state ? +	 */ +	if (comp->compact[state * (comp->nbstrings + 1)] == +            XML_REGEXP_FINAL_STATE) +	    return(1); +	return(0); +    } + +#ifdef DEBUG_PUSH +    printf("value pushed: %s\n", value); +#endif + +    /* +     * Examine all outside transition from current state +     */ +    for (i = 0;i < comp->nbstrings;i++) { +	target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; +	if ((target > 0) && (target <= comp->nbstates)) { +	    target--; /* to avoid 0 */ +	    if (xmlStrEqual(comp->stringMap[i], value)) { +		exec->index = target; +		if ((exec->callback != NULL) && (comp->transdata != NULL)) { +		    exec->callback(exec->data, value, +			  comp->transdata[state * comp->nbstrings + i], data); +		} +#ifdef DEBUG_PUSH +		printf("entering state %d\n", target); +#endif +		if (comp->compact[target * (comp->nbstrings + 1)] == +		    XML_REGEXP_FINAL_STATE) +		    return(1); +		return(0); +	    } +	} +    } +    /* +     * Failed to find an exit transition out from current state for the +     * current token +     */ +#ifdef DEBUG_PUSH +    printf("failed to find a transition for %s on state %d\n", value, state); +#endif +    exec->status = -1; +    return(-1); +} + +/** + * xmlRegExecPushString: + * @exec: a regexp execution context + * @value: a string token input + * @data: data associated to the token to reuse in callbacks + * + * Push one input token in the execution context + * + * Returns: 1 if the regexp reached a final state, 0 if non-final, and + *     a negative value in case of error. + */ +int +xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value, +	             void *data) { +    xmlRegTransPtr trans; +    xmlRegAtomPtr atom; +    int ret; +    int final = 0; + +    if (exec == NULL) +	return(-1); +    if (exec->comp == NULL) +	return(-1); +    if (exec->status != 0) +	return(exec->status); + +    if (exec->comp->compact != NULL) +	return(xmlRegCompactPushString(exec, exec->comp, value, data)); + +    if (value == NULL) { +        if (exec->state->type == XML_REGEXP_FINAL_STATE) +	    return(1); +	final = 1; +    } + +#ifdef DEBUG_PUSH +    printf("value pushed: %s\n", value); +#endif +    /* +     * If we have an active rollback stack push the new value there +     * and get back to where we were left +     */ +    if ((value != NULL) && (exec->inputStackNr > 0)) { +	xmlFARegExecSaveInputString(exec, value, data); +	value = exec->inputStack[exec->index].value; +	data = exec->inputStack[exec->index].data; +#ifdef DEBUG_PUSH +	printf("value loaded: %s\n", value); +#endif +    } + +    while ((exec->status == 0) && +	   ((value != NULL) || +	    ((final == 1) && +	     (exec->state->type != XML_REGEXP_FINAL_STATE)))) { + +	/* +	 * End of input on non-terminal state, rollback, however we may +	 * still have epsilon like transition for counted transitions +	 * on counters, in that case don't break too early. +	 */ +	if ((value == NULL) && (exec->counts == NULL)) +	    goto rollback; + +	exec->transcount = 0; +	for (;exec->transno < exec->state->nbTrans;exec->transno++) { +	    trans = &exec->state->trans[exec->transno]; +	    if (trans->to < 0) +		continue; +	    atom = trans->atom; +	    ret = 0; +	    if (trans->count == REGEXP_ALL_LAX_COUNTER) { +		int i; +		int count; +		xmlRegTransPtr t; +		xmlRegCounterPtr counter; + +		ret = 0; + +#ifdef DEBUG_PUSH +		printf("testing all lax %d\n", trans->count); +#endif +		/* +		 * Check all counted transitions from the current state +		 */ +		if ((value == NULL) && (final)) { +		    ret = 1; +		} else if (value != NULL) { +		    for (i = 0;i < exec->state->nbTrans;i++) { +			t = &exec->state->trans[i]; +			if ((t->counter < 0) || (t == trans)) +			    continue; +			counter = &exec->comp->counters[t->counter]; +			count = exec->counts[t->counter]; +			if ((count < counter->max) &&  +		            (t->atom != NULL) && +			    (xmlStrEqual(value, t->atom->valuep))) { +			    ret = 0; +			    break; +			} +			if ((count >= counter->min) && +			    (count < counter->max) && +			    (xmlStrEqual(value, t->atom->valuep))) { +			    ret = 1; +			    break; +			} +		    } +		} +	    } else if (trans->count == REGEXP_ALL_COUNTER) { +		int i; +		int count; +		xmlRegTransPtr t; +		xmlRegCounterPtr counter; + +		ret = 1; + +#ifdef DEBUG_PUSH +		printf("testing all %d\n", trans->count); +#endif +		/* +		 * Check all counted transitions from the current state +		 */ +		for (i = 0;i < exec->state->nbTrans;i++) { +                    t = &exec->state->trans[i]; +		    if ((t->counter < 0) || (t == trans)) +			continue; +                    counter = &exec->comp->counters[t->counter]; +		    count = exec->counts[t->counter]; +		    if ((count < counter->min) || (count > counter->max)) { +			ret = 0; +			break; +		    } +		} +	    } else if (trans->count >= 0) { +		int count; +		xmlRegCounterPtr counter; + +		/* +		 * A counted transition. +		 */ + +		count = exec->counts[trans->count]; +		counter = &exec->comp->counters[trans->count]; +#ifdef DEBUG_PUSH +		printf("testing count %d: val %d, min %d, max %d\n", +		       trans->count, count, counter->min,  counter->max); +#endif +		ret = ((count >= counter->min) && (count <= counter->max)); +	    } else if (atom == NULL) { +		fprintf(stderr, "epsilon transition left at runtime\n"); +		exec->status = -2; +		break; +	    } else if (value != NULL) { +		ret = xmlStrEqual(value, atom->valuep); +		if ((ret == 1) && (trans->counter >= 0)) { +		    xmlRegCounterPtr counter; +		    int count; + +		    count = exec->counts[trans->counter]; +		    counter = &exec->comp->counters[trans->counter]; +		    if (count >= counter->max) +			ret = 0; +		} + +		if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) { +		    xmlRegStatePtr to = exec->comp->states[trans->to]; + +		    /* +		     * this is a multiple input sequence +		     */ +		    if (exec->state->nbTrans > exec->transno + 1) { +			if (exec->inputStackNr <= 0) { +			    xmlFARegExecSaveInputString(exec, value, data); +			} +			xmlFARegExecSave(exec); +		    } +		    exec->transcount = 1; +		    do { +			/* +			 * Try to progress as much as possible on the input +			 */ +			if (exec->transcount == atom->max) { +			    break; +			} +			exec->index++; +			value = exec->inputStack[exec->index].value; +			data = exec->inputStack[exec->index].data; +#ifdef DEBUG_PUSH +			printf("value loaded: %s\n", value); +#endif + +			/* +			 * End of input: stop here +			 */ +			if (value == NULL) { +			    exec->index --; +			    break; +			} +			if (exec->transcount >= atom->min) { +			    int transno = exec->transno; +			    xmlRegStatePtr state = exec->state; + +			    /* +			     * The transition is acceptable save it +			     */ +			    exec->transno = -1; /* trick */ +			    exec->state = to; +			    if (exec->inputStackNr <= 0) { +				xmlFARegExecSaveInputString(exec, value, data); +			    } +			    xmlFARegExecSave(exec); +			    exec->transno = transno; +			    exec->state = state; +			} +			ret = xmlStrEqual(value, atom->valuep); +			exec->transcount++; +		    } while (ret == 1); +		    if (exec->transcount < atom->min) +			ret = 0; + +		    /* +		     * If the last check failed but one transition was found +		     * possible, rollback +		     */ +		    if (ret < 0) +			ret = 0; +		    if (ret == 0) { +			goto rollback; +		    } +		} +	    } +	    if (ret == 1) { +		if ((exec->callback != NULL) && (atom != NULL)) { +		    exec->callback(exec->data, atom->valuep, +			           atom->data, data); +		} +		if (exec->state->nbTrans > exec->transno + 1) { +		    if (exec->inputStackNr <= 0) { +			xmlFARegExecSaveInputString(exec, value, data); +		    } +		    xmlFARegExecSave(exec); +		} +		if (trans->counter >= 0) { +#ifdef DEBUG_PUSH +		    printf("Increasing count %d\n", trans->counter); +#endif +		    exec->counts[trans->counter]++; +		} +#ifdef DEBUG_PUSH +		printf("entering state %d\n", trans->to); +#endif +		exec->state = exec->comp->states[trans->to]; +		exec->transno = 0; +		if (trans->atom != NULL) { +		    if (exec->inputStack != NULL) { +			exec->index++; +			if (exec->index < exec->inputStackNr) { +			    value = exec->inputStack[exec->index].value; +			    data = exec->inputStack[exec->index].data; +#ifdef DEBUG_PUSH +			    printf("value loaded: %s\n", value); +#endif +			} else { +			    value = NULL; +			    data = NULL; +#ifdef DEBUG_PUSH +			    printf("end of input\n"); +#endif +			} +		    } else { +			value = NULL; +			data = NULL; +#ifdef DEBUG_PUSH +			printf("end of input\n"); +#endif +		    } +		} +		goto progress; +	    } else if (ret < 0) { +		exec->status = -4; +		break; +	    } +	} +	if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { +rollback: +	    /* +	     * Failed to find a way out +	     */ +	    exec->determinist = 0; +	    xmlFARegExecRollBack(exec); +	    if (exec->status == 0) { +		value = exec->inputStack[exec->index].value; +		data = exec->inputStack[exec->index].data; +#ifdef DEBUG_PUSH +		printf("value loaded: %s\n", value); +#endif +	    } +	} +progress: +	continue; +    } +    if (exec->status == 0) { +        return(exec->state->type == XML_REGEXP_FINAL_STATE); +    } +    return(exec->status); +} + +#if 0 +static int +xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) { +    xmlRegTransPtr trans; +    xmlRegAtomPtr atom; +    int ret; +    int codepoint, len; + +    if (exec == NULL) +	return(-1); +    if (exec->status != 0) +	return(exec->status); + +    while ((exec->status == 0) && +	   ((exec->inputString[exec->index] != 0) || +	    (exec->state->type != XML_REGEXP_FINAL_STATE))) { + +	/* +	 * End of input on non-terminal state, rollback, however we may +	 * still have epsilon like transition for counted transitions +	 * on counters, in that case don't break too early. +	 */ +	if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) +	    goto rollback; + +	exec->transcount = 0; +	for (;exec->transno < exec->state->nbTrans;exec->transno++) { +	    trans = &exec->state->trans[exec->transno]; +	    if (trans->to < 0) +		continue; +	    atom = trans->atom; +	    ret = 0; +	    if (trans->count >= 0) { +		int count; +		xmlRegCounterPtr counter; + +		/* +		 * A counted transition. +		 */ + +		count = exec->counts[trans->count]; +		counter = &exec->comp->counters[trans->count]; +#ifdef DEBUG_REGEXP_EXEC +		printf("testing count %d: val %d, min %d, max %d\n", +		       trans->count, count, counter->min,  counter->max); +#endif +		ret = ((count >= counter->min) && (count <= counter->max)); +	    } else if (atom == NULL) { +		fprintf(stderr, "epsilon transition left at runtime\n"); +		exec->status = -2; +		break; +	    } else if (exec->inputString[exec->index] != 0) { +                codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len); +		ret = xmlRegCheckCharacter(atom, codepoint); +		if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) { +		    xmlRegStatePtr to = exec->comp->states[trans->to]; + +		    /* +		     * this is a multiple input sequence +		     */ +		    if (exec->state->nbTrans > exec->transno + 1) { +			xmlFARegExecSave(exec); +		    } +		    exec->transcount = 1; +		    do { +			/* +			 * Try to progress as much as possible on the input +			 */ +			if (exec->transcount == atom->max) { +			    break; +			} +			exec->index += len; +			/* +			 * End of input: stop here +			 */ +			if (exec->inputString[exec->index] == 0) { +			    exec->index -= len; +			    break; +			} +			if (exec->transcount >= atom->min) { +			    int transno = exec->transno; +			    xmlRegStatePtr state = exec->state; + +			    /* +			     * The transition is acceptable save it +			     */ +			    exec->transno = -1; /* trick */ +			    exec->state = to; +			    xmlFARegExecSave(exec); +			    exec->transno = transno; +			    exec->state = state; +			} +			codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), +				              len); +			ret = xmlRegCheckCharacter(atom, codepoint); +			exec->transcount++; +		    } while (ret == 1); +		    if (exec->transcount < atom->min) +			ret = 0; + +		    /* +		     * If the last check failed but one transition was found +		     * possible, rollback +		     */ +		    if (ret < 0) +			ret = 0; +		    if (ret == 0) { +			goto rollback; +		    } +		} +	    } +	    if (ret == 1) { +		if (exec->state->nbTrans > exec->transno + 1) { +		    xmlFARegExecSave(exec); +		} +		if (trans->counter >= 0) { +#ifdef DEBUG_REGEXP_EXEC +		    printf("Increasing count %d\n", trans->counter); +#endif +		    exec->counts[trans->counter]++; +		} +#ifdef DEBUG_REGEXP_EXEC +		printf("entering state %d\n", trans->to); +#endif +		exec->state = exec->comp->states[trans->to]; +		exec->transno = 0; +		if (trans->atom != NULL) { +		    exec->index += len; +		} +		goto progress; +	    } else if (ret < 0) { +		exec->status = -4; +		break; +	    } +	} +	if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { +rollback: +	    /* +	     * Failed to find a way out +	     */ +	    exec->determinist = 0; +	    xmlFARegExecRollBack(exec); +	} +progress: +	continue; +    } +} +#endif +/************************************************************************ + * 									* + *	Parser for the Shemas Datatype Regular Expressions		* + *	http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs	* + * 									* + ************************************************************************/ + +/** + * xmlFAIsChar: + * @ctxt:  a regexp parser context + * + * [10]   Char   ::=   [^.\?*+()|#x5B#x5D] + */ +static int +xmlFAIsChar(xmlRegParserCtxtPtr ctxt) { +    int cur; +    int len; + +    cur = CUR_SCHAR(ctxt->cur, len); +    if ((cur == '.') || (cur == '\\') || (cur == '?') || +	(cur == '*') || (cur == '+') || (cur == '(') || +	(cur == ')') || (cur == '|') || (cur == 0x5B) || +	(cur == 0x5D) || (cur == 0)) +	return(-1); +    return(cur); +} + +/** + * xmlFAParseCharProp: + * @ctxt:  a regexp parser context + * + * [27]   charProp   ::=   IsCategory | IsBlock + * [28]   IsCategory ::= Letters | Marks | Numbers | Punctuation | + *                       Separators | Symbols | Others  + * [29]   Letters   ::=   'L' [ultmo]? + * [30]   Marks   ::=   'M' [nce]? + * [31]   Numbers   ::=   'N' [dlo]? + * [32]   Punctuation   ::=   'P' [cdseifo]? + * [33]   Separators   ::=   'Z' [slp]? + * [34]   Symbols   ::=   'S' [mcko]? + * [35]   Others   ::=   'C' [cfon]? + * [36]   IsBlock   ::=   'Is' [a-zA-Z0-9#x2D]+ + */ +static void +xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) { +    int cur; +    xmlRegAtomType type = 0; +    xmlChar *blockName = NULL; +     +    cur = CUR; +    if (cur == 'L') { +	NEXT; +	cur = CUR; +	if (cur == 'u') { +	    NEXT; +	    type = XML_REGEXP_LETTER_UPPERCASE; +	} else if (cur == 'l') { +	    NEXT; +	    type = XML_REGEXP_LETTER_LOWERCASE; +	} else if (cur == 't') { +	    NEXT; +	    type = XML_REGEXP_LETTER_TITLECASE; +	} else if (cur == 'm') { +	    NEXT; +	    type = XML_REGEXP_LETTER_MODIFIER; +	} else if (cur == 'o') { +	    NEXT; +	    type = XML_REGEXP_LETTER_OTHERS; +	} else { +	    type = XML_REGEXP_LETTER; +	} +    } else if (cur == 'M') { +	NEXT; +	cur = CUR; +	if (cur == 'n') { +	    NEXT; +	    /* nonspacing */ +	    type = XML_REGEXP_MARK_NONSPACING; +	} else if (cur == 'c') { +	    NEXT; +	    /* spacing combining */ +	    type = XML_REGEXP_MARK_SPACECOMBINING; +	} else if (cur == 'e') { +	    NEXT; +	    /* enclosing */ +	    type = XML_REGEXP_MARK_ENCLOSING; +	} else { +	    /* all marks */ +	    type = XML_REGEXP_MARK; +	} +    } else if (cur == 'N') { +	NEXT; +	cur = CUR; +	if (cur == 'd') { +	    NEXT; +	    /* digital */ +	    type = XML_REGEXP_NUMBER_DECIMAL; +	} else if (cur == 'l') { +	    NEXT; +	    /* letter */ +	    type = XML_REGEXP_NUMBER_LETTER; +	} else if (cur == 'o') { +	    NEXT; +	    /* other */ +	    type = XML_REGEXP_NUMBER_OTHERS; +	} else { +	    /* all numbers */ +	    type = XML_REGEXP_NUMBER; +	} +    } else if (cur == 'P') { +	NEXT; +	cur = CUR; +	if (cur == 'c') { +	    NEXT; +	    /* connector */ +	    type = XML_REGEXP_PUNCT_CONNECTOR; +	} else if (cur == 'd') { +	    NEXT; +	    /* dash */ +	    type = XML_REGEXP_PUNCT_DASH; +	} else if (cur == 's') { +	    NEXT; +	    /* open */ +	    type = XML_REGEXP_PUNCT_OPEN; +	} else if (cur == 'e') { +	    NEXT; +	    /* close */ +	    type = XML_REGEXP_PUNCT_CLOSE; +	} else if (cur == 'i') { +	    NEXT; +	    /* initial quote */ +	    type = XML_REGEXP_PUNCT_INITQUOTE; +	} else if (cur == 'f') { +	    NEXT; +	    /* final quote */ +	    type = XML_REGEXP_PUNCT_FINQUOTE; +	} else if (cur == 'o') { +	    NEXT; +	    /* other */ +	    type = XML_REGEXP_PUNCT_OTHERS; +	} else { +	    /* all punctuation */ +	    type = XML_REGEXP_PUNCT; +	} +    } else if (cur == 'Z') { +	NEXT; +	cur = CUR; +	if (cur == 's') { +	    NEXT; +	    /* space */ +	    type = XML_REGEXP_SEPAR_SPACE; +	} else if (cur == 'l') { +	    NEXT; +	    /* line */ +	    type = XML_REGEXP_SEPAR_LINE; +	} else if (cur == 'p') { +	    NEXT; +	    /* paragraph */ +	    type = XML_REGEXP_SEPAR_PARA; +	} else { +	    /* all separators */ +	    type = XML_REGEXP_SEPAR; +	} +    } else if (cur == 'S') { +	NEXT; +	cur = CUR; +	if (cur == 'm') { +	    NEXT; +	    type = XML_REGEXP_SYMBOL_MATH; +	    /* math */ +	} else if (cur == 'c') { +	    NEXT; +	    type = XML_REGEXP_SYMBOL_CURRENCY; +	    /* currency */ +	} else if (cur == 'k') { +	    NEXT; +	    type = XML_REGEXP_SYMBOL_MODIFIER; +	    /* modifiers */ +	} else if (cur == 'o') { +	    NEXT; +	    type = XML_REGEXP_SYMBOL_OTHERS; +	    /* other */ +	} else { +	    /* all symbols */ +	    type = XML_REGEXP_SYMBOL; +	} +    } else if (cur == 'C') { +	NEXT; +	cur = CUR; +	if (cur == 'c') { +	    NEXT; +	    /* control */ +	    type = XML_REGEXP_OTHER_CONTROL; +	} else if (cur == 'f') { +	    NEXT; +	    /* format */ +	    type = XML_REGEXP_OTHER_FORMAT; +	} else if (cur == 'o') { +	    NEXT; +	    /* private use */ +	    type = XML_REGEXP_OTHER_PRIVATE; +	} else if (cur == 'n') { +	    NEXT; +	    /* not assigned */ +	    type = XML_REGEXP_OTHER_NA; +	} else { +	    /* all others */ +	    type = XML_REGEXP_OTHER; +	} +    } else if (cur == 'I') { +	const xmlChar *start; +	NEXT; +	cur = CUR; +	if (cur != 's') { +	    ERROR("IsXXXX expected"); +	    return; +	} +	NEXT; +	start = ctxt->cur; +	cur = CUR; +	if (((cur >= 'a') && (cur <= 'z')) ||  +	    ((cur >= 'A') && (cur <= 'Z')) ||  +	    ((cur >= '0') && (cur <= '9')) ||  +	    (cur == 0x2D)) { +	    NEXT; +	    cur = CUR; +	    while (((cur >= 'a') && (cur <= 'z')) ||  +		((cur >= 'A') && (cur <= 'Z')) ||  +		((cur >= '0') && (cur <= '9')) ||  +		(cur == 0x2D)) { +		NEXT; +		cur = CUR; +	    } +	} +	type = XML_REGEXP_BLOCK_NAME; +	blockName = xmlStrndup(start, ctxt->cur - start); +    } else { +	ERROR("Unknown char property"); +	return; +    } +    if (ctxt->atom == NULL) { +	ctxt->atom = xmlRegNewAtom(ctxt, type); +	if (ctxt->atom != NULL) +	    ctxt->atom->valuep = blockName; +    } else if (ctxt->atom->type == XML_REGEXP_RANGES) { +        xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, +		           type, 0, 0, blockName); +    } +} + +/** + * xmlFAParseCharClassEsc: + * @ctxt:  a regexp parser context + * + * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc )  + * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E] + * [25] catEsc   ::=   '\p{' charProp '}' + * [26] complEsc ::=   '\P{' charProp '}' + * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW]) + */ +static void +xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) { +    int cur; + +    if (CUR == '.') { +	if (ctxt->atom == NULL) { +	    ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR); +	} else if (ctxt->atom->type == XML_REGEXP_RANGES) { +	    xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, +			       XML_REGEXP_ANYCHAR, 0, 0, NULL); +	} +	NEXT; +	return; +    } +    if (CUR != '\\') { +	ERROR("Escaped sequence: expecting \\"); +	return; +    } +    NEXT; +    cur = CUR; +    if (cur == 'p') { +	NEXT; +	if (CUR != '{') { +	    ERROR("Expecting '{'"); +	    return; +	} +	NEXT; +	xmlFAParseCharProp(ctxt); +	if (CUR != '}') { +	    ERROR("Expecting '}'"); +	    return; +	} +	NEXT; +    } else if (cur == 'P') { +	NEXT; +	if (CUR != '{') { +	    ERROR("Expecting '{'"); +	    return; +	} +	NEXT; +	xmlFAParseCharProp(ctxt); +	ctxt->atom->neg = 1; +	if (CUR != '}') { +	    ERROR("Expecting '}'"); +	    return; +	} +	NEXT; +    } else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') || +	(cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') || +	(cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') || +	(cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) || +	(cur == 0x5E)) { +	if (ctxt->atom == NULL) { +	    ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL); +	    if (ctxt->atom != NULL) +		ctxt->atom->codepoint = cur; +	} else if (ctxt->atom->type == XML_REGEXP_RANGES) { +	    xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, +			       XML_REGEXP_CHARVAL, cur, cur, NULL); +	} +	NEXT; +    } else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') || +	(cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') || +	(cur == 'w') || (cur == 'W')) { +	xmlRegAtomType type = XML_REGEXP_ANYSPACE; + +	switch (cur) { +	    case 's':  +		type = XML_REGEXP_ANYSPACE; +		break; +	    case 'S':  +		type = XML_REGEXP_NOTSPACE; +		break; +	    case 'i':  +		type = XML_REGEXP_INITNAME; +		break; +	    case 'I':  +		type = XML_REGEXP_NOTINITNAME; +		break; +	    case 'c':  +		type = XML_REGEXP_NAMECHAR; +		break; +	    case 'C':  +		type = XML_REGEXP_NOTNAMECHAR; +		break; +	    case 'd':  +		type = XML_REGEXP_DECIMAL; +		break; +	    case 'D':  +		type = XML_REGEXP_NOTDECIMAL; +		break; +	    case 'w':  +		type = XML_REGEXP_REALCHAR; +		break; +	    case 'W':  +		type = XML_REGEXP_NOTREALCHAR; +		break; +	} +	NEXT; +	if (ctxt->atom == NULL) { +	    ctxt->atom = xmlRegNewAtom(ctxt, type); +	} else if (ctxt->atom->type == XML_REGEXP_RANGES) { +	    xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, +			       type, 0, 0, NULL); +	} +    } +} + +/** + * xmlFAParseCharRef: + * @ctxt:  a regexp parser context + * + * [19]   XmlCharRef   ::=   ( '&#' [0-9]+ ';' ) | (' &#x' [0-9a-fA-F]+ ';' ) + */ +static int +xmlFAParseCharRef(xmlRegParserCtxtPtr ctxt) { +    int ret = 0, cur; + +    if ((CUR != '&') || (NXT(1) != '#')) +	return(-1); +    NEXT; +    NEXT; +    cur = CUR; +    if (cur == 'x') { +	NEXT; +	cur = CUR; +	if (((cur >= '0') && (cur <= '9')) || +	    ((cur >= 'a') && (cur <= 'f')) || +	    ((cur >= 'A') && (cur <= 'F'))) { +	    while (((cur >= '0') && (cur <= '9')) || +		   ((cur >= 'A') && (cur <= 'F'))) { +		if ((cur >= '0') && (cur <= '9')) +		    ret = ret * 16 + cur - '0'; +		else if ((cur >= 'a') && (cur <= 'f')) +		    ret = ret * 16 + 10 + (cur - 'a'); +		else +		    ret = ret * 16 + 10 + (cur - 'A'); +		NEXT; +		cur = CUR; +	    } +	} else { +	    ERROR("Char ref: expecting [0-9A-F]"); +	    return(-1); +	} +    } else { +	if ((cur >= '0') && (cur <= '9')) { +	    while ((cur >= '0') && (cur <= '9')) { +		ret = ret * 10 + cur - '0'; +		NEXT; +		cur = CUR; +	    } +	} else { +	    ERROR("Char ref: expecting [0-9]"); +	    return(-1); +	} +    } +    if (cur != ';') { +	ERROR("Char ref: expecting ';'"); +	return(-1); +    } else { +	NEXT; +    } +    return(ret); +} + +/** + * xmlFAParseCharRange: + * @ctxt:  a regexp parser context + * + * [17]   charRange   ::=     seRange | XmlCharRef | XmlCharIncDash  + * [18]   seRange   ::=   charOrEsc '-' charOrEsc + * [20]   charOrEsc   ::=   XmlChar | SingleCharEsc + * [21]   XmlChar   ::=   [^\#x2D#x5B#x5D] + * [22]   XmlCharIncDash   ::=   [^\#x5B#x5D] + */ +static void +xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) { +    int cur; +    int start = -1; +    int end = -1; + +    if ((CUR == '&') && (NXT(1) == '#')) { +	end = start = xmlFAParseCharRef(ctxt); +        xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, +	                   XML_REGEXP_CHARVAL, start, end, NULL); +	return; +    } +    cur = CUR; +    if (cur == '\\') { +	NEXT; +	cur = CUR; +	switch (cur) { +	    case 'n': start = 0xA; break; +	    case 'r': start = 0xD; break; +	    case 't': start = 0x9; break; +	    case '\\': case '|': case '.': case '-': case '^': case '?': +	    case '*': case '+': case '{': case '}': case '(': case ')': +	    case '[': case ']': +		start = cur; break; +	    default: +		ERROR("Invalid escape value"); +		return; +	} +	end = start; +    } else if ((cur != 0x5B) && (cur != 0x5D)) { +	end = start = cur; +    } else { +	ERROR("Expecting a char range"); +	return; +    } +    NEXT; +    if (start == '-') { +	return; +    } +    cur = CUR; +    if (cur != '-') { +        xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, +		              XML_REGEXP_CHARVAL, start, end, NULL); +	return; +    } +    NEXT; +    cur = CUR; +    if (cur == '\\') { +	NEXT; +	cur = CUR; +	switch (cur) { +	    case 'n': end = 0xA; break; +	    case 'r': end = 0xD; break; +	    case 't': end = 0x9; break; +	    case '\\': case '|': case '.': case '-': case '^': case '?': +	    case '*': case '+': case '{': case '}': case '(': case ')': +	    case '[': case ']': +		end = cur; break; +	    default: +		ERROR("Invalid escape value"); +		return; +	} +    } else if ((cur != 0x5B) && (cur != 0x5D)) { +	end = cur; +    } else { +	ERROR("Expecting the end of a char range"); +	return; +    } +    NEXT; +    /* TODO check that the values are acceptable character ranges for XML */ +    if (end < start) { +	ERROR("End of range is before start of range"); +    } else { +        xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, +		           XML_REGEXP_CHARVAL, start, end, NULL); +    } +    return; +} + +/** + * xmlFAParsePosCharGroup: + * @ctxt:  a regexp parser context + * + * [14]   posCharGroup ::= ( charRange | charClassEsc  )+ + */ +static void +xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) { +    do { +	if ((CUR == '\\') || (CUR == '.')) { +	    xmlFAParseCharClassEsc(ctxt); +	} else { +	    xmlFAParseCharRange(ctxt); +	} +    } while ((CUR != ']') && (CUR != '^') && (CUR != '-') && +	     (ctxt->error == 0)); +} + +/** + * xmlFAParseCharGroup: + * @ctxt:  a regexp parser context + * + * [13]   charGroup    ::= posCharGroup | negCharGroup | charClassSub + * [15]   negCharGroup ::= '^' posCharGroup + * [16]   charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr   + * [12]   charClassExpr ::= '[' charGroup ']' + */ +static void +xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) { +    int n = ctxt->neg; +    while ((CUR != ']') && (ctxt->error == 0)) { +	if (CUR == '^') { +	    int neg = ctxt->neg; + +	    NEXT; +	    ctxt->neg = !ctxt->neg; +	    xmlFAParsePosCharGroup(ctxt); +	    ctxt->neg = neg; +	} else if (CUR == '-') { +	    NEXT; +	    ctxt->neg = !ctxt->neg; +	    if (CUR != '[') { +		ERROR("charClassExpr: '[' expected"); +		break; +	    } +	    NEXT; +	    xmlFAParseCharGroup(ctxt); +	    if (CUR == ']') { +		NEXT; +	    } else { +		ERROR("charClassExpr: ']' expected"); +		break; +	    } +	    break; +	} else if (CUR != ']') { +	    xmlFAParsePosCharGroup(ctxt); +	} +    } +    ctxt->neg = n; +} + +/** + * xmlFAParseCharClass: + * @ctxt:  a regexp parser context + * + * [11]   charClass   ::=     charClassEsc | charClassExpr + * [12]   charClassExpr   ::=   '[' charGroup ']' + */ +static void +xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) { +    if (CUR == '[') { +	NEXT; +	ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES); +	if (ctxt->atom == NULL) +	    return; +	xmlFAParseCharGroup(ctxt); +	if (CUR == ']') { +	    NEXT; +	} else { +	    ERROR("xmlFAParseCharClass: ']' expected"); +	} +    } else { +	xmlFAParseCharClassEsc(ctxt); +    } +} + +/** + * xmlFAParseQuantExact: + * @ctxt:  a regexp parser context + * + * [8]   QuantExact   ::=   [0-9]+ + */ +static int +xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) { +    int ret = 0; +    int ok = 0; + +    while ((CUR >= '0') && (CUR <= '9')) { +	ret = ret * 10 + (CUR - '0'); +	ok = 1; +	NEXT; +    } +    if (ok != 1) { +	return(-1); +    } +    return(ret); +} + +/** + * xmlFAParseQuantifier: + * @ctxt:  a regexp parser context + * + * [4]   quantifier   ::=   [?*+] | ( '{' quantity '}' ) + * [5]   quantity   ::=   quantRange | quantMin | QuantExact + * [6]   quantRange   ::=   QuantExact ',' QuantExact + * [7]   quantMin   ::=   QuantExact ',' + * [8]   QuantExact   ::=   [0-9]+ + */ +static int +xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) { +    int cur; + +    cur = CUR; +    if ((cur == '?') || (cur == '*') || (cur == '+')) { +	if (ctxt->atom != NULL) { +	    if (cur == '?') +		ctxt->atom->quant = XML_REGEXP_QUANT_OPT; +	    else if (cur == '*') +		ctxt->atom->quant = XML_REGEXP_QUANT_MULT; +	    else if (cur == '+') +		ctxt->atom->quant = XML_REGEXP_QUANT_PLUS; +	} +	NEXT; +	return(1); +    } +    if (cur == '{') { +	int min = 0, max = 0; + +	NEXT; +	cur = xmlFAParseQuantExact(ctxt); +	if (cur >= 0) +	    min = cur; +	if (CUR == ',') { +	    NEXT; +	    cur = xmlFAParseQuantExact(ctxt); +	    if (cur >= 0) +		max = cur; +	} +	if (CUR == '}') { +	    NEXT; +	} else { +	    ERROR("Unterminated quantifier"); +	} +	if (max == 0) +	    max = min; +	if (ctxt->atom != NULL) { +	    ctxt->atom->quant = XML_REGEXP_QUANT_RANGE; +	    ctxt->atom->min = min; +	    ctxt->atom->max = max; +	} +	return(1); +    } +    return(0); +} + +/** + * xmlFAParseAtom: + * @ctxt:  a regexp parser context + * + * [9]   atom   ::=   Char | charClass | ( '(' regExp ')' ) + */ +static int +xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) { +    int codepoint, len; + +    codepoint = xmlFAIsChar(ctxt); +    if (codepoint > 0) { +	ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL); +	if (ctxt->atom == NULL) +	    return(-1); +	codepoint = CUR_SCHAR(ctxt->cur, len); +	ctxt->atom->codepoint = codepoint; +	NEXTL(len); +	return(1); +    } else if (CUR == '|') { +	return(0); +    } else if (CUR == 0) { +	return(0); +    } else if (CUR == ')') { +	return(0); +    } else if (CUR == '(') { +	xmlRegStatePtr start, oldend; + +	NEXT; +	xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL); +	start = ctxt->state; +	oldend = ctxt->end; +	ctxt->end = NULL; +	ctxt->atom = NULL; +	xmlFAParseRegExp(ctxt, 0); +	if (CUR == ')') { +	    NEXT; +	} else { +	    ERROR("xmlFAParseAtom: expecting ')'"); +	} +	ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG); +	if (ctxt->atom == NULL) +	    return(-1); +	ctxt->atom->start = start; +	ctxt->atom->stop = ctxt->state; +	ctxt->end = oldend; +	return(1); +    } else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) { +	xmlFAParseCharClass(ctxt); +	return(1); +    } +    return(0); +} + +/** + * xmlFAParsePiece: + * @ctxt:  a regexp parser context + * + * [3]   piece   ::=   atom quantifier? + */ +static int +xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) { +    int ret; + +    ctxt->atom = NULL; +    ret = xmlFAParseAtom(ctxt); +    if (ret == 0) +	return(0); +    if (ctxt->atom == NULL) { +	ERROR("internal: no atom generated"); +    } +    xmlFAParseQuantifier(ctxt); +    return(1); +} + +/** + * xmlFAParseBranch: + * @ctxt:  a regexp parser context + * @first:  is taht the first + * + * [2]   branch   ::=   piece* + */ +static void +xmlFAParseBranch(xmlRegParserCtxtPtr ctxt, int first) { +    xmlRegStatePtr previous; +    xmlRegAtomPtr prevatom = NULL; +    int ret; + +    previous = ctxt->state; +    ret = xmlFAParsePiece(ctxt); +    if (ret != 0) { +	if (first) { +	    xmlFAGenerateTransitions(ctxt, previous, NULL, ctxt->atom); +	    previous = ctxt->state; +	} else { +	    prevatom = ctxt->atom; +	} +	ctxt->atom = NULL; +    } +    while ((ret != 0) && (ctxt->error == 0)) { +	ret = xmlFAParsePiece(ctxt); +	if (ret != 0) { +	    if (first) { +		xmlFAGenerateTransitions(ctxt, previous, NULL, ctxt->atom); +	    } else { +		xmlFAGenerateTransitions(ctxt, previous, NULL, prevatom); +		prevatom = ctxt->atom; +	    } +	    previous = ctxt->state; +	    ctxt->atom = NULL; +	} +    } +    if (!first) { +	xmlFAGenerateTransitions(ctxt, previous, ctxt->end, prevatom); +    } +} + +/** + * xmlFAParseRegExp: + * @ctxt:  a regexp parser context + * @top:  is that the top-level expressions ? + * + * [1]   regExp   ::=     branch  ( '|' branch )* + */ +static void +xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) { +    xmlRegStatePtr start, end, oldend; + +    oldend = ctxt->end; + +    start = ctxt->state; +    xmlFAParseBranch(ctxt, (ctxt->end == NULL)); +    if (CUR != '|') { +	ctxt->end = ctxt->state; +	return; +    } +    end = ctxt->state; +    while ((CUR == '|') && (ctxt->error == 0)) { +	NEXT; +	ctxt->state = start; +	ctxt->end = end; +	xmlFAParseBranch(ctxt, 0); +    } +    if (!top) +	ctxt->end = oldend; +} + +/************************************************************************ + * 									* + * 			The basic API					* + * 									* + ************************************************************************/ + +/** + * xmlRegexpPrint: + * @output: the file for the output debug + * @regexp: the compiled regexp + * + * Print the content of the compiled regular expression + */ +void +xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) { +    int i; + +    fprintf(output, " regexp: "); +    if (regexp == NULL) { +	fprintf(output, "NULL\n"); +	return; +    } +    fprintf(output, "'%s' ", regexp->string); +    fprintf(output, "\n"); +    fprintf(output, "%d atoms:\n", regexp->nbAtoms); +    for (i = 0;i < regexp->nbAtoms; i++) { +	fprintf(output, " %02d ", i); +	xmlRegPrintAtom(output, regexp->atoms[i]); +    } +    fprintf(output, "%d states:", regexp->nbStates); +    fprintf(output, "\n"); +    for (i = 0;i < regexp->nbStates; i++) { +	xmlRegPrintState(output, regexp->states[i]); +    } +    fprintf(output, "%d counters:\n", regexp->nbCounters); +    for (i = 0;i < regexp->nbCounters; i++) { +	fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min, +		                                regexp->counters[i].max); +    } +} + +/** + * xmlRegexpCompile: + * @regexp:  a regular expression string + * + * Parses a regular expression conforming to XML Schemas Part 2 Datatype + * Appendix F and build an automata suitable for testing strings against + * that regular expression + * + * Returns the compiled expression or NULL in case of error + */ +xmlRegexpPtr +xmlRegexpCompile(const xmlChar *regexp) { +    xmlRegexpPtr ret; +    xmlRegParserCtxtPtr ctxt; + +    ctxt = xmlRegNewParserCtxt(regexp); +    if (ctxt == NULL) +	return(NULL); + +    /* initialize the parser */ +    ctxt->end = NULL; +    ctxt->start = ctxt->state = xmlRegNewState(ctxt); +    xmlRegStatePush(ctxt, ctxt->start); + +    /* parse the expression building an automata */ +    xmlFAParseRegExp(ctxt, 1); +    if (CUR != 0) { +	ERROR("xmlFAParseRegExp: extra characters"); +    } +    ctxt->end = ctxt->state; +    ctxt->start->type = XML_REGEXP_START_STATE; +    ctxt->end->type = XML_REGEXP_FINAL_STATE; + +    /* remove the Epsilon except for counted transitions */ +    xmlFAEliminateEpsilonTransitions(ctxt); + + +    if (ctxt->error != 0) { +	xmlRegFreeParserCtxt(ctxt); +	return(NULL); +    } +    ret = xmlRegEpxFromParse(ctxt); +    xmlRegFreeParserCtxt(ctxt); +    return(ret); +} + +/** + * xmlRegexpExec: + * @comp:  the compiled regular expression + * @content:  the value to check against the regular expression + * + * Check if the regular expression generate the value + * + * Returns 1 if it matches, 0 if not and a negativa value in case of error + */ +int +xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) { +    if ((comp == NULL) || (content == NULL)) +	return(-1); +    return(xmlFARegExec(comp, content)); +} + +/** + * xmlRegexpIsDeterminist: + * @comp:  the compiled regular expression + * + * Check if the regular expression is determinist + * + * Returns 1 if it yes, 0 if not and a negativa value in case of error + */ +int +xmlRegexpIsDeterminist(xmlRegexpPtr comp) { +    xmlAutomataPtr am; +    int ret; + +    if (comp == NULL) +	return(-1); +    if (comp->determinist != -1) +	return(comp->determinist); + +    am = xmlNewAutomata(); +    if (am->states != NULL) { +	int i; + +	for (i = 0;i < am->nbStates;i++) +	    xmlRegFreeState(am->states[i]); +	xmlFree(am->states); +    } +    am->nbAtoms = comp->nbAtoms; +    am->atoms = comp->atoms; +    am->nbStates = comp->nbStates; +    am->states = comp->states; +    am->determinist = -1; +    ret = xmlFAComputesDeterminism(am); +    am->atoms = NULL; +    am->states = NULL; +    xmlFreeAutomata(am); +    return(ret); +} + +/** + * xmlRegFreeRegexp: + * @regexp:  the regexp + * + * Free a regexp + */ +void +xmlRegFreeRegexp(xmlRegexpPtr regexp) { +    int i; +    if (regexp == NULL) +	return; + +    if (regexp->string != NULL) +	xmlFree(regexp->string); +    if (regexp->states != NULL) { +	for (i = 0;i < regexp->nbStates;i++) +	    xmlRegFreeState(regexp->states[i]); +	xmlFree(regexp->states); +    } +    if (regexp->atoms != NULL) { +	for (i = 0;i < regexp->nbAtoms;i++) +	    xmlRegFreeAtom(regexp->atoms[i]); +	xmlFree(regexp->atoms); +    } +    if (regexp->counters != NULL) +	xmlFree(regexp->counters); +    if (regexp->compact != NULL) +	xmlFree(regexp->compact); +    if (regexp->transdata != NULL) +	xmlFree(regexp->transdata); +    if (regexp->stringMap != NULL) { +	for (i = 0; i < regexp->nbstrings;i++) +	    xmlFree(regexp->stringMap[i]); +	xmlFree(regexp->stringMap); +    } + +    xmlFree(regexp); +} + +#ifdef LIBXML_AUTOMATA_ENABLED +/************************************************************************ + * 									* + * 			The Automata interface				* + * 									* + ************************************************************************/ + +/** + * xmlNewAutomata: + * + * Create a new automata + * + * Returns the new object or NULL in case of failure + */ +xmlAutomataPtr +xmlNewAutomata(void) { +    xmlAutomataPtr ctxt; + +    ctxt = xmlRegNewParserCtxt(NULL); +    if (ctxt == NULL) +	return(NULL); + +    /* initialize the parser */ +    ctxt->end = NULL; +    ctxt->start = ctxt->state = xmlRegNewState(ctxt); +    xmlRegStatePush(ctxt, ctxt->start); + +    return(ctxt); +} + +/** + * xmlFreeAutomata: + * @am: an automata + * + * Free an automata + */ +void +xmlFreeAutomata(xmlAutomataPtr am) { +    if (am == NULL) +	return; +    xmlRegFreeParserCtxt(am); +} + +/** + * xmlAutomataGetInitState: + * @am: an automata + * + * Initial state lookup + * + * Returns the initial state of the automata + */ +xmlAutomataStatePtr +xmlAutomataGetInitState(xmlAutomataPtr am) { +    if (am == NULL) +	return(NULL); +    return(am->start); +} + +/** + * xmlAutomataSetFinalState: + * @am: an automata + * @state: a state in this automata + * + * Makes that state a final state + * + * Returns 0 or -1 in case of error + */ +int +xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) { +    if ((am == NULL) || (state == NULL)) +	return(-1); +    state->type = XML_REGEXP_FINAL_STATE; +    return(0); +} + +/** + * xmlAutomataNewTransition: + * @am: an automata + * @from: the starting point of the transition + * @to: the target point of the transition or NULL + * @token: the input string associated to that transition + * @data: data passed to the callback function if the transition is activated + * + * If @to is NULL, this create first a new target state in the automata + * and then adds a transition from the @from state to the target state + * activated by the value of @token + * + * Returns the target state or NULL in case of error + */ +xmlAutomataStatePtr +xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from, +			 xmlAutomataStatePtr to, const xmlChar *token, +			 void *data) { +    xmlRegAtomPtr atom; + +    if ((am == NULL) || (from == NULL) || (token == NULL)) +	return(NULL); +    atom = xmlRegNewAtom(am, XML_REGEXP_STRING); +    atom->data = data; +    if (atom == NULL) +	return(NULL); +    atom->valuep = xmlStrdup(token); + +    xmlFAGenerateTransitions(am, from, to, atom); +    if (to == NULL) +	return(am->state); +    return(to); +} + +/** + * xmlAutomataNewCountTrans: + * @am: an automata + * @from: the starting point of the transition + * @to: the target point of the transition or NULL + * @token: the input string associated to that transition + * @min:  the minimum successive occurences of token + * @max:  the maximum successive occurences of token + * @data:  data associated to the transition + * + * If @to is NULL, this create first a new target state in the automata + * and then adds a transition from the @from state to the target state + * activated by a succession of input of value @token and whose number + * is between @min and @max + * + * Returns the target state or NULL in case of error + */ +xmlAutomataStatePtr +xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, +			 xmlAutomataStatePtr to, const xmlChar *token, +			 int min, int max, void *data) { +    xmlRegAtomPtr atom; + +    if ((am == NULL) || (from == NULL) || (token == NULL)) +	return(NULL); +    if (min < 0) +	return(NULL); +    if ((max < min) || (max < 1)) +	return(NULL); +    atom = xmlRegNewAtom(am, XML_REGEXP_STRING); +    if (atom == NULL) +	return(NULL); +    atom->valuep = xmlStrdup(token); +    atom->data = data; +    if (min == 0) +	atom->min = 1; +    else +	atom->min = min; +    atom->max = max; + +    xmlFAGenerateTransitions(am, from, to, atom); +    if (to == NULL) +	to = am->state; +    if (to == NULL) +	return(NULL); +    if (min == 0) +	xmlFAGenerateEpsilonTransition(am, from, to); +    return(to); +} + +/** + * xmlAutomataNewOnceTrans: + * @am: an automata + * @from: the starting point of the transition + * @to: the target point of the transition or NULL + * @token: the input string associated to that transition + * @min:  the minimum successive occurences of token + * @max:  the maximum successive occurences of token + * @data:  data associated to the transition + * + * If @to is NULL, this create first a new target state in the automata + * and then adds a transition from the @from state to the target state + * activated by a succession of input of value @token and whose number + * is between @min and @max, moreover that transistion can only be crossed + * once. + * + * Returns the target state or NULL in case of error + */ +xmlAutomataStatePtr +xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, +			 xmlAutomataStatePtr to, const xmlChar *token, +			 int min, int max, void *data) { +    xmlRegAtomPtr atom; +    int counter; + +    if ((am == NULL) || (from == NULL) || (token == NULL)) +	return(NULL); +    if (min < 1) +	return(NULL); +    if ((max < min) || (max < 1)) +	return(NULL); +    atom = xmlRegNewAtom(am, XML_REGEXP_STRING); +    if (atom == NULL) +	return(NULL); +    atom->valuep = xmlStrdup(token); +    atom->data = data; +    atom->quant = XML_REGEXP_QUANT_ONCEONLY; +    if (min == 0) +	atom->min = 1; +    else +	atom->min = min; +    atom->max = max; +    /* +     * associate a counter to the transition. +     */ +    counter = xmlRegGetCounter(am); +    am->counters[counter].min = 1; +    am->counters[counter].max = 1; + +    /* xmlFAGenerateTransitions(am, from, to, atom); */ +    if (to == NULL) { +	to = xmlRegNewState(am); +	xmlRegStatePush(am, to); +    } +    xmlRegStateAddTrans(am, from, atom, to, counter, -1); +    xmlRegAtomPush(am, atom); +    am->state = to; +    if (to == NULL) +	to = am->state; +    if (to == NULL) +	return(NULL); +    return(to); +} + +/** + * xmlAutomataNewState: + * @am: an automata + * + * Create a new disconnected state in the automata + * + * Returns the new state or NULL in case of error + */ +xmlAutomataStatePtr +xmlAutomataNewState(xmlAutomataPtr am) { +    xmlAutomataStatePtr to;  + +    if (am == NULL) +	return(NULL); +    to = xmlRegNewState(am); +    xmlRegStatePush(am, to); +    return(to); +} + +/** + * xmlAutomataNewEpsilon: + * @am: an automata + * @from: the starting point of the transition + * @to: the target point of the transition or NULL + * + * If @to is NULL, this create first a new target state in the automata + * and then adds a an epsilon transition from the @from state to the + * target state + * + * Returns the target state or NULL in case of error + */ +xmlAutomataStatePtr +xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from, +		      xmlAutomataStatePtr to) { +    if ((am == NULL) || (from == NULL)) +	return(NULL); +    xmlFAGenerateEpsilonTransition(am, from, to); +    if (to == NULL) +	return(am->state); +    return(to); +} + +/** + * xmlAutomataNewAllTrans: + * @am: an automata + * @from: the starting point of the transition + * @to: the target point of the transition or NULL + * @lax: allow to transition if not all all transitions have been activated + * + * If @to is NULL, this create first a new target state in the automata + * and then adds a an ALL transition from the @from state to the + * target state. That transition is an epsilon transition allowed only when + * all transitions from the @from node have been activated. + * + * Returns the target state or NULL in case of error + */ +xmlAutomataStatePtr +xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, +		       xmlAutomataStatePtr to, int lax) { +    if ((am == NULL) || (from == NULL)) +	return(NULL); +    xmlFAGenerateAllTransition(am, from, to, lax); +    if (to == NULL) +	return(am->state); +    return(to); +} + +/** + * xmlAutomataNewCounter: + * @am: an automata + * @min:  the minimal value on the counter + * @max:  the maximal value on the counter + * + * Create a new counter + * + * Returns the counter number or -1 in case of error + */ +int		 +xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) { +    int ret; + +    if (am == NULL) +	return(-1); + +    ret = xmlRegGetCounter(am); +    if (ret < 0) +	return(-1); +    am->counters[ret].min = min; +    am->counters[ret].max = max; +    return(ret); +} + +/** + * xmlAutomataNewCountedTrans: + * @am: an automata + * @from: the starting point of the transition + * @to: the target point of the transition or NULL + * @counter: the counter associated to that transition + * + * If @to is NULL, this create first a new target state in the automata + * and then adds an epsilon transition from the @from state to the target state + * which will increment the counter provided + * + * Returns the target state or NULL in case of error + */ +xmlAutomataStatePtr +xmlAutomataNewCountedTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, +		xmlAutomataStatePtr to, int counter) { +    if ((am == NULL) || (from == NULL) || (counter < 0)) +	return(NULL); +    xmlFAGenerateCountedEpsilonTransition(am, from, to, counter); +    if (to == NULL) +	return(am->state); +    return(to); +} + +/** + * xmlAutomataNewCounterTrans: + * @am: an automata + * @from: the starting point of the transition + * @to: the target point of the transition or NULL + * @counter: the counter associated to that transition + * + * If @to is NULL, this create first a new target state in the automata + * and then adds an epsilon transition from the @from state to the target state + * which will be allowed only if the counter is within the right range. + * + * Returns the target state or NULL in case of error + */ +xmlAutomataStatePtr +xmlAutomataNewCounterTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, +		xmlAutomataStatePtr to, int counter) { +    if ((am == NULL) || (from == NULL) || (counter < 0)) +	return(NULL); +    xmlFAGenerateCountedTransition(am, from, to, counter); +    if (to == NULL) +	return(am->state); +    return(to); +} + +/** + * xmlAutomataCompile: + * @am: an automata + * + * Compile the automata into a Reg Exp ready for being executed. + * The automata should be free after this point. + * + * Returns the compiled regexp or NULL in case of error + */ +xmlRegexpPtr           +xmlAutomataCompile(xmlAutomataPtr am) { +    xmlRegexpPtr ret; + +    xmlFAEliminateEpsilonTransitions(am); +    /* xmlFAComputesDeterminism(am); */ +    ret = xmlRegEpxFromParse(am); + +    return(ret); +} + +/** + * xmlAutomataIsDeterminist: + * @am: an automata + * + * Checks if an automata is determinist. + * + * Returns 1 if true, 0 if not, and -1 in case of error + */ +int           +xmlAutomataIsDeterminist(xmlAutomataPtr am) { +    int ret; + +    if (am == NULL) +	return(-1); + +    ret = xmlFAComputesDeterminism(am); +    return(ret); +} +#endif /* LIBXML_AUTOMATA_ENABLED */ +#endif /* LIBXML_REGEXP_ENABLED */  | 
