diff options
Diffstat (limited to 'ext/pcre/pcrelib/pcre_study.c')
| -rw-r--r-- | ext/pcre/pcrelib/pcre_study.c | 448 |
1 files changed, 26 insertions, 422 deletions
diff --git a/ext/pcre/pcrelib/pcre_study.c b/ext/pcre/pcrelib/pcre_study.c index 2653624e0a..226cc65941 100644 --- a/ext/pcre/pcrelib/pcre_study.c +++ b/ext/pcre/pcrelib/pcre_study.c @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel - Copyright (c) 1997-2010 University of Cambridge + Copyright (c) 1997-2008 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -52,379 +52,6 @@ supporting functions. */ enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE }; - -/************************************************* -* Find the minimum subject length for a group * -*************************************************/ - -/* Scan a parenthesized group and compute the minimum length of subject that -is needed to match it. This is a lower bound; it does not mean there is a -string of that length that matches. In UTF8 mode, the result is in characters -rather than bytes. - -Arguments: - code pointer to start of group (the bracket) - startcode pointer to start of the whole pattern - options the compiling options - -Returns: the minimum length - -1 if \C was encountered - -2 internal error (missing capturing bracket) -*/ - -static int -find_minlength(const uschar *code, const uschar *startcode, int options) -{ -int length = -1; -BOOL utf8 = (options & PCRE_UTF8) != 0; -BOOL had_recurse = FALSE; -register int branchlength = 0; -register uschar *cc = (uschar *)code + 1 + LINK_SIZE; - -if (*code == OP_CBRA || *code == OP_SCBRA) cc += 2; - -/* Scan along the opcodes for this branch. If we get to the end of the -branch, check the length against that of the other branches. */ - -for (;;) - { - int d, min; - uschar *cs, *ce; - register int op = *cc; - - switch (op) - { - case OP_COND: - case OP_SCOND: - - /* If there is only one branch in a condition, the implied branch has zero - length, so we don't add anything. This covers the DEFINE "condition" - automatically. */ - - cs = cc + GET(cc, 1); - if (*cs != OP_ALT) - { - cc = cs + 1 + LINK_SIZE; - break; - } - - /* Otherwise we can fall through and treat it the same as any other - subpattern. */ - - case OP_CBRA: - case OP_SCBRA: - case OP_BRA: - case OP_SBRA: - case OP_ONCE: - d = find_minlength(cc, startcode, options); - if (d < 0) return d; - branchlength += d; - do cc += GET(cc, 1); while (*cc == OP_ALT); - cc += 1 + LINK_SIZE; - break; - - /* Reached end of a branch; if it's a ket it is the end of a nested - call. If it's ALT it is an alternation in a nested call. If it is - END it's the end of the outer call. All can be handled by the same code. */ - - case OP_ALT: - case OP_KET: - case OP_KETRMAX: - case OP_KETRMIN: - case OP_END: - if (length < 0 || (!had_recurse && branchlength < length)) - length = branchlength; - if (*cc != OP_ALT) return length; - cc += 1 + LINK_SIZE; - branchlength = 0; - had_recurse = FALSE; - break; - - /* Skip over assertive subpatterns */ - - case OP_ASSERT: - case OP_ASSERT_NOT: - case OP_ASSERTBACK: - case OP_ASSERTBACK_NOT: - do cc += GET(cc, 1); while (*cc == OP_ALT); - /* Fall through */ - - /* Skip over things that don't match chars */ - - case OP_REVERSE: - case OP_CREF: - case OP_NCREF: - case OP_RREF: - case OP_NRREF: - case OP_DEF: - case OP_OPT: - case OP_CALLOUT: - case OP_SOD: - case OP_SOM: - case OP_EOD: - case OP_EODN: - case OP_CIRC: - case OP_DOLL: - case OP_NOT_WORD_BOUNDARY: - case OP_WORD_BOUNDARY: - cc += _pcre_OP_lengths[*cc]; - break; - - /* Skip over a subpattern that has a {0} or {0,x} quantifier */ - - case OP_BRAZERO: - case OP_BRAMINZERO: - case OP_SKIPZERO: - cc += _pcre_OP_lengths[*cc]; - do cc += GET(cc, 1); while (*cc == OP_ALT); - cc += 1 + LINK_SIZE; - break; - - /* Handle literal characters and + repetitions */ - - case OP_CHAR: - case OP_CHARNC: - case OP_NOT: - case OP_PLUS: - case OP_MINPLUS: - case OP_POSPLUS: - case OP_NOTPLUS: - case OP_NOTMINPLUS: - case OP_NOTPOSPLUS: - branchlength++; - cc += 2; -#ifdef SUPPORT_UTF8 - if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f]; -#endif - break; - - case OP_TYPEPLUS: - case OP_TYPEMINPLUS: - case OP_TYPEPOSPLUS: - branchlength++; - cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2; - break; - - /* Handle exact repetitions. The count is already in characters, but we - need to skip over a multibyte character in UTF8 mode. */ - - case OP_EXACT: - case OP_NOTEXACT: - branchlength += GET2(cc,1); - cc += 4; -#ifdef SUPPORT_UTF8 - if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f]; -#endif - break; - - case OP_TYPEEXACT: - branchlength += GET2(cc,1); - cc += (cc[3] == OP_PROP || cc[3] == OP_NOTPROP)? 6 : 4; - break; - - /* Handle single-char non-literal matchers */ - - case OP_PROP: - case OP_NOTPROP: - cc += 2; - /* Fall through */ - - case OP_NOT_DIGIT: - case OP_DIGIT: - case OP_NOT_WHITESPACE: - case OP_WHITESPACE: - case OP_NOT_WORDCHAR: - case OP_WORDCHAR: - case OP_ANY: - case OP_ALLANY: - case OP_EXTUNI: - case OP_HSPACE: - case OP_NOT_HSPACE: - case OP_VSPACE: - case OP_NOT_VSPACE: - branchlength++; - cc++; - break; - - /* "Any newline" might match two characters */ - - case OP_ANYNL: - branchlength += 2; - cc++; - break; - - /* The single-byte matcher means we can't proceed in UTF-8 mode */ - - case OP_ANYBYTE: -#ifdef SUPPORT_UTF8 - if (utf8) return -1; -#endif - branchlength++; - cc++; - break; - - /* For repeated character types, we have to test for \p and \P, which have - an extra two bytes of parameters. */ - - case OP_TYPESTAR: - case OP_TYPEMINSTAR: - case OP_TYPEQUERY: - case OP_TYPEMINQUERY: - case OP_TYPEPOSSTAR: - case OP_TYPEPOSQUERY: - if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2; - cc += _pcre_OP_lengths[op]; - break; - - case OP_TYPEUPTO: - case OP_TYPEMINUPTO: - case OP_TYPEPOSUPTO: - if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2; - cc += _pcre_OP_lengths[op]; - break; - - /* Check a class for variable quantification */ - -#ifdef SUPPORT_UTF8 - case OP_XCLASS: - cc += GET(cc, 1) - 33; - /* Fall through */ -#endif - - case OP_CLASS: - case OP_NCLASS: - cc += 33; - - switch (*cc) - { - case OP_CRPLUS: - case OP_CRMINPLUS: - branchlength++; - /* Fall through */ - - case OP_CRSTAR: - case OP_CRMINSTAR: - case OP_CRQUERY: - case OP_CRMINQUERY: - cc++; - break; - - case OP_CRRANGE: - case OP_CRMINRANGE: - branchlength += GET2(cc,1); - cc += 5; - break; - - default: - branchlength++; - break; - } - break; - - /* Backreferences and subroutine calls are treated in the same way: we find - the minimum length for the subpattern. A recursion, however, causes an - a flag to be set that causes the length of this branch to be ignored. The - logic is that a recursion can only make sense if there is another - alternation that stops the recursing. That will provide the minimum length - (when no recursion happens). A backreference within the group that it is - referencing behaves in the same way. - - If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket - matches an empty string (by default it causes a matching failure), so in - that case we must set the minimum length to zero. */ - - case OP_REF: - if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) - { - ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1)); - if (cs == NULL) return -2; - do ce += GET(ce, 1); while (*ce == OP_ALT); - if (cc > cs && cc < ce) - { - d = 0; - had_recurse = TRUE; - } - else d = find_minlength(cs, startcode, options); - } - else d = 0; - cc += 3; - - /* Handle repeated back references */ - - switch (*cc) - { - case OP_CRSTAR: - case OP_CRMINSTAR: - case OP_CRQUERY: - case OP_CRMINQUERY: - min = 0; - cc++; - break; - - case OP_CRRANGE: - case OP_CRMINRANGE: - min = GET2(cc, 1); - cc += 5; - break; - - default: - min = 1; - break; - } - - branchlength += min * d; - break; - - case OP_RECURSE: - cs = ce = (uschar *)startcode + GET(cc, 1); - if (cs == NULL) return -2; - do ce += GET(ce, 1); while (*ce == OP_ALT); - if (cc > cs && cc < ce) - had_recurse = TRUE; - else - branchlength += find_minlength(cs, startcode, options); - cc += 1 + LINK_SIZE; - break; - - /* Anything else does not or need not match a character. We can get the - item's length from the table, but for those that can match zero occurrences - of a character, we must take special action for UTF-8 characters. */ - - case OP_UPTO: - case OP_NOTUPTO: - case OP_MINUPTO: - case OP_NOTMINUPTO: - case OP_POSUPTO: - case OP_STAR: - case OP_MINSTAR: - case OP_NOTMINSTAR: - case OP_POSSTAR: - case OP_NOTPOSSTAR: - case OP_QUERY: - case OP_MINQUERY: - case OP_NOTMINQUERY: - case OP_POSQUERY: - case OP_NOTPOSQUERY: - cc += _pcre_OP_lengths[op]; -#ifdef SUPPORT_UTF8 - if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f]; -#endif - break; - - /* For the record, these are the opcodes that are matched by "default": - OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP, - OP_THEN. */ - - default: - cc += _pcre_OP_lengths[op]; - break; - } - } -/* Control never gets here */ -} - - - /************************************************* * Set a bit and maybe its alternate case * *************************************************/ @@ -442,8 +69,7 @@ Returns: nothing */ static void -set_table_bit(uschar *start_bits, unsigned int c, BOOL caseless, - compile_data *cd) +set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd) { start_bits[c/8] |= (1 << (c&7)); if (caseless && (cd->ctypes[c] & ctype_letter) != 0) @@ -605,7 +231,7 @@ do case OP_QUERY: case OP_MINQUERY: case OP_POSQUERY: - set_table_bit(start_bits, tcode[1], caseless, cd); + set_bit(start_bits, tcode[1], caseless, cd); tcode += 2; #ifdef SUPPORT_UTF8 if (utf8 && tcode[-1] >= 0xc0) @@ -618,7 +244,7 @@ do case OP_UPTO: case OP_MINUPTO: case OP_POSUPTO: - set_table_bit(start_bits, tcode[3], caseless, cd); + set_bit(start_bits, tcode[3], caseless, cd); tcode += 4; #ifdef SUPPORT_UTF8 if (utf8 && tcode[-1] >= 0xc0) @@ -636,7 +262,7 @@ do case OP_PLUS: case OP_MINPLUS: case OP_POSPLUS: - set_table_bit(start_bits, tcode[1], caseless, cd); + set_bit(start_bits, tcode[1], caseless, cd); try_next = FALSE; break; @@ -872,15 +498,13 @@ Arguments: set NULL unless error Returns: pointer to a pcre_extra block, with study_data filled in and the - appropriate flags set; + appropriate flag set; NULL on error or if no optimization possible */ PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION pcre_study(const pcre *external_re, int options, const char **errorptr) { -int min; -BOOL bits_set = FALSE; uschar start_bits[32]; pcre_extra *extra; pcre_study_data *study; @@ -907,39 +531,30 @@ code = (uschar *)re + re->name_table_offset + (re->name_count * re->name_entry_size); /* For an anchored pattern, or an unanchored pattern that has a first char, or -a multiline pattern that matches only at "line starts", there is no point in -seeking a list of starting bytes. */ - -if ((re->options & PCRE_ANCHORED) == 0 && - (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0) - { - /* Set the character tables in the block that is passed around */ - - tables = re->tables; - if (tables == NULL) - (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES, - (void *)(&tables)); +a multiline pattern that matches only at "line starts", no further processing +at present. */ - compile_block.lcc = tables + lcc_offset; - compile_block.fcc = tables + fcc_offset; - compile_block.cbits = tables + cbits_offset; - compile_block.ctypes = tables + ctypes_offset; - - /* See if we can find a fixed set of initial characters for the pattern. */ +if ((re->options & PCRE_ANCHORED) != 0 || + (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) != 0) + return NULL; - memset(start_bits, 0, 32 * sizeof(uschar)); - bits_set = set_start_bits(code, start_bits, - (re->options & PCRE_CASELESS) != 0, (re->options & PCRE_UTF8) != 0, - &compile_block) == SSB_DONE; - } +/* Set the character tables in the block that is passed around */ -/* Find the minimum length of subject string. */ +tables = re->tables; +if (tables == NULL) + (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES, + (void *)(&tables)); -min = find_minlength(code, code, re->options); +compile_block.lcc = tables + lcc_offset; +compile_block.fcc = tables + fcc_offset; +compile_block.cbits = tables + cbits_offset; +compile_block.ctypes = tables + ctypes_offset; -/* Return NULL if no optimization is possible. */ +/* See if we can find a fixed set of initial characters for the pattern. */ -if (!bits_set && min < 0) return NULL; +memset(start_bits, 0, 32 * sizeof(uschar)); +if (set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0, + (re->options & PCRE_UTF8) != 0, &compile_block) != SSB_DONE) return NULL; /* Get a pcre_extra block and a pcre_study_data block. The study data is put in the latter, which is pointed to by the former, which may also get additional @@ -962,19 +577,8 @@ extra->flags = PCRE_EXTRA_STUDY_DATA; extra->study_data = study; study->size = sizeof(pcre_study_data); -study->flags = 0; - -if (bits_set) - { - study->flags |= PCRE_STUDY_MAPPED; - memcpy(study->start_bits, start_bits, sizeof(start_bits)); - } - -if (min >= 0) - { - study->flags |= PCRE_STUDY_MINLEN; - study->minlength = min; - } +study->options = PCRE_STUDY_MAPPED; +memcpy(study->start_bits, start_bits, sizeof(start_bits)); return extra; } |
