diff options
Diffstat (limited to 'ext/pcre/pcrelib/pcre.c')
| -rw-r--r-- | ext/pcre/pcrelib/pcre.c | 345 |
1 files changed, 234 insertions, 111 deletions
diff --git a/ext/pcre/pcrelib/pcre.c b/ext/pcre/pcrelib/pcre.c index 51cb6b6c87..6735b823d4 100644 --- a/ext/pcre/pcrelib/pcre.c +++ b/ext/pcre/pcrelib/pcre.c @@ -111,7 +111,7 @@ static const short int escapes[] = { static BOOL compile_regex(int, int, int *, uschar **, const uschar **, const char **, - BOOL, int, compile_data *); + BOOL, int, int *, int *, compile_data *); @@ -148,10 +148,13 @@ tables. */ * Return version string * *************************************************/ +#define STRING(a) # a +#define XSTRING(s) STRING(s) + const char * pcre_version(void) { -return PCRE_VERSION; +return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE); } @@ -162,7 +165,10 @@ return PCRE_VERSION; *************************************************/ /* This function picks potentially useful data out of the private -structure. +structure. The public options are passed back in an int - though the +re->options field has been expanded to a long int, all the public options +at the low end of it, and so even on 16-bit systems this will still be OK. +Therefore, I haven't changed the API for pcre_info(). Arguments: external_re points to compiled code @@ -181,7 +187,7 @@ pcre_info(const pcre *external_re, int *optptr, int *first_char) const real_pcre *re = (const real_pcre *)external_re; if (re == NULL) return PCRE_ERROR_NULL; if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC; -if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS); +if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS); if (first_char != NULL) *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char : ((re->options & PCRE_STARTLINE) != 0)? -1 : -2; @@ -532,6 +538,7 @@ for (;;) case OP_REVERSE: cc++; + /* Fall through */ case OP_CREF: case OP_OPT: @@ -627,6 +634,8 @@ Arguments: ptrptr points to the current pattern pointer errorptr points to pointer to error message optchanged set to the value of the last OP_OPT item compiled + reqchar set to the last literal character required, else -1 + countlits set to count of mandatory literal characters cd contains pointers to tables Returns: TRUE on success @@ -636,12 +645,15 @@ Returns: TRUE on success static BOOL compile_branch(int options, int *brackets, uschar **codeptr, const uschar **ptrptr, const char **errorptr, int *optchanged, - compile_data *cd) + int *reqchar, int *countlits, compile_data *cd) { int repeat_type, op_type; int repeat_min, repeat_max; int bravalue, length; int greedy_default, greedy_non_default; +int prevreqchar; +int condcount = 0; +int subcountlits = 0; register int c; register uschar *code = *codeptr; uschar *tempcode; @@ -655,6 +667,11 @@ uschar class[32]; greedy_default = ((options & PCRE_UNGREEDY) != 0); greedy_non_default = greedy_default ^ 1; +/* Initialize no required char, and count of literals */ + +*reqchar = prevreqchar = -1; +*countlits = 0; + /* Switch on next character until the end of the branch */ for (;; ptr++) @@ -664,6 +681,7 @@ for (;; ptr++) int class_lastchar; int newoptions; int condref; + int subreqchar; c = *ptr; if ((options & PCRE_EXTENDED) != 0) @@ -937,18 +955,19 @@ for (;; ptr++) { repeat_type = greedy_non_default; ptr++; } else repeat_type = greedy_default; - /* If the maximum is zero then the minimum must also be zero; Perl allows - this case, so we do too - by simply omitting the item altogether. */ - - if (repeat_max == 0) code = previous; - /* If previous was a string of characters, chop off the last one and use it as the subject of the repeat. If there was only one character, we can - abolish the previous item altogether. */ + abolish the previous item altogether. A repeat with a zero minimum wipes + out any reqchar setting, backing up to the previous value. We must also + adjust the countlits value. */ - else if (*previous == OP_CHARS) + if (*previous == OP_CHARS) { int len = previous[1]; + + if (repeat_min == 0) *reqchar = prevreqchar; + *countlits += repeat_min - 1; + if (len == 1) { c = previous[2]; @@ -987,7 +1006,15 @@ for (;; ptr++) code = previous; OUTPUT_SINGLE_REPEAT: - repeat_type += op_type; /* Combine both values for many cases */ + + /* If the maximum is zero then the minimum must also be zero; Perl allows + this case, so we do too - by simply omitting the item altogether. */ + + if (repeat_max == 0) goto END_REPEAT; + + /* Combine the op_type with the repeat_type */ + + repeat_type += op_type; /* A minimum of zero is handled either as the special case * or ?, or as an UPTO, with the maximum given. */ @@ -1064,10 +1091,15 @@ for (;; ptr++) } /* If previous was a character class or a back reference, we put the repeat - stuff after it. */ + stuff after it, but just skip the item if the repeat was {0,0}. */ else if (*previous == OP_CLASS || *previous == OP_REF) { + if (repeat_max == 0) + { + code = previous; + goto END_REPEAT; + } if (repeat_min == 0 && repeat_max == -1) *code++ = OP_CRSTAR + repeat_type; else if (repeat_min == 1 && repeat_max == -1) @@ -1118,14 +1150,22 @@ for (;; ptr++) if (repeat_min == 0) { + /* If we set up a required char from the bracket, we must back off + to the previous value and reset the countlits value too. */ + + if (subcountlits > 0) + { + *reqchar = prevreqchar; + *countlits -= subcountlits; + } + /* If the maximum is also zero, we just omit the group from the output altogether. */ if (repeat_max == 0) { code = previous; - previous = NULL; - break; + goto END_REPEAT; } /* If the maximum is 1 or unlimited, we just have to stick in the @@ -1230,59 +1270,6 @@ for (;; ptr++) correct offset was computed above. */ else code[-ketoffset] = OP_KETRMAX + repeat_type; - - -#ifdef NEVER - /* If the minimum is greater than zero, and the maximum is unlimited or - equal to the minimum, the first copy remains where it is, and is - replicated up to the minimum number of times. This case includes the + - repeat, but of course no replication is needed in that case. */ - - if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min)) - { - for (i = 1; i < repeat_min; i++) - { - memcpy(code, previous, len); - code += len; - } - } - - /* If the minimum is zero, stick BRAZERO in front of the first copy. - Then, if there is a fixed upper limit, replicated up to that many times, - sticking BRAZERO in front of all the optional ones. */ - - else - { - if (repeat_min == 0) - { - memmove(previous+1, previous, len); - code++; - *previous++ = OP_BRAZERO + repeat_type; - } - - for (i = 1; i < repeat_min; i++) - { - memcpy(code, previous, len); - code += len; - } - - for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++) - { - *code++ = OP_BRAZERO + repeat_type; - memcpy(code, previous, len); - code += len; - } - } - - /* If the maximum is unlimited, set a repeater in the final copy. We - can't just offset backwards from the current code point, because we - don't know if there's been an options resetting after the ket. The - correct offset was computed above. */ - - if (repeat_max == -1) code[-ketoffset] = OP_KETRMAX + repeat_type; -#endif - - } /* Else there's some kind of shambles */ @@ -1295,6 +1282,7 @@ for (;; ptr++) /* In all case we no longer have a previous item. */ + END_REPEAT: previous = NULL; break; @@ -1463,6 +1451,8 @@ for (;; ptr++) (bravalue == OP_ASSERTBACK || bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */ condref, /* Condition reference number */ + &subreqchar, /* For possible last char */ + &subcountlits, /* For literal count */ cd)) /* Tables block */ goto FAILED; @@ -1476,22 +1466,39 @@ for (;; ptr++) if (bravalue == OP_COND) { - int branchcount = 0; uschar *tc = code; + condcount = 0; do { - branchcount++; + condcount++; tc += (tc[1] << 8) | tc[2]; } while (*tc != OP_KET); - if (branchcount > 2) + if (condcount > 2) { *errorptr = ERR27; goto FAILED; } } + /* Handle updating of the required character. If the subpattern didn't + set one, leave it as it was. Otherwise, update it for normal brackets of + all kinds, forward assertions, and conditions with two branches. Don't + update the literal count for forward assertions, however. If the bracket + is followed by a quantifier with zero repeat, we have to back off. Hence + the definition of prevreqchar and subcountlits outside the main loop so + that they can be accessed for the back off. */ + + if (subreqchar > 0 && + (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_ASSERT || + (bravalue == OP_COND && condcount == 2))) + { + prevreqchar = *reqchar; + *reqchar = subreqchar; + if (bravalue != OP_ASSERT) *countlits += subcountlits; + } + /* Now update the main code pointer to the end of the group. */ code = tempcode; @@ -1586,6 +1593,12 @@ for (;; ptr++) while (length < 255 && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0); + /* Update the last character and the count of literals */ + + prevreqchar = (length > 1)? code[-2] : *reqchar; + *reqchar = code[-1]; + *countlits += length; + /* Compute the length and set it in the data vector, and advance to the next state. */ @@ -1629,6 +1642,8 @@ Argument: errorptr -> pointer to error message lookbehind TRUE if this is a lookbehind assertion condref > 0 for OPT_CREF setting at start of conditional group + reqchar -> place to put the last required character, or a negative number + countlits -> place to put the shortest literal count of any branch cd points to the data block with tables pointers Returns: TRUE on success @@ -1637,7 +1652,7 @@ Returns: TRUE on success static BOOL compile_regex(int options, int optchanged, int *brackets, uschar **codeptr, const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref, - compile_data *cd) + int *reqchar, int *countlits, compile_data *cd) { const uschar *ptr = *ptrptr; uschar *code = *codeptr; @@ -1645,7 +1660,10 @@ uschar *last_branch = code; uschar *start_bracket = code; uschar *reverse_count = NULL; int oldoptions = options & PCRE_IMS; +int branchreqchar, branchcountlits; +*reqchar = -1; +*countlits = INT_MAX; code += 3; /* At the start of a reference-based conditional group, insert the reference @@ -1684,7 +1702,8 @@ for (;;) /* Now compile the branch */ - if (!compile_branch(options,brackets,&code,&ptr,errorptr,&optchanged,cd)) + if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged, + &branchreqchar, &branchcountlits, cd)) { *ptrptr = ptr; return FALSE; @@ -1696,6 +1715,25 @@ for (;;) last_branch[1] = length >> 8; last_branch[2] = length & 255; + /* Save the last required character if all branches have the same; a current + value of -1 means unset, while -2 means "previous branch had no last required + char". */ + + if (*reqchar != -2) + { + if (branchreqchar >= 0) + { + if (*reqchar == -1) *reqchar = branchreqchar; + else if (*reqchar != branchreqchar) *reqchar = -2; + } + else *reqchar = -2; + } + + /* Keep the shortest literal count */ + + if (branchcountlits < *countlits) *countlits = branchcountlits; + DPRINTF(("literal count = %d min=%d\n", branchcountlits, *countlits)); + /* If lookbehind, check that this branch matches a fixed-length string, and put the length into the OP_REVERSE item. Temporarily mark the end of the branch with OP_END. */ @@ -1790,6 +1828,11 @@ for (;;) code += 2; break; + case OP_WORD_BOUNDARY: + case OP_NOT_WORD_BOUNDARY: + code++; + break; + case OP_ASSERT_NOT: case OP_ASSERTBACK: case OP_ASSERTBACK_NOT: @@ -1972,7 +2015,7 @@ pcre_compile(const char *pattern, int options, const char **errorptr, real_pcre *re; int length = 3; /* For initial BRA plus length */ int runlength; -int c, size; +int c, size, reqchar, countlits; int bracount = 0; int top_backref = 0; int branch_extra = 0; @@ -2312,13 +2355,16 @@ while ((c = *(++ptr)) != 0) will lead to an over-estimate on the length, but this shouldn't matter very much. We also have to allow for resetting options at the start of any alternations, which we do by setting - branch_newextra to 2. */ + branch_newextra to 2. Finally, we record whether the case-dependent + flag ever changes within the regex. This is used by the "required + character" code. */ case ':': if (((set|unset) & PCRE_IMS) != 0) { length += 4; branch_newextra = 2; + if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED; } goto END_OPTIONS; @@ -2519,7 +2565,7 @@ code = re->code; *code = OP_BRA; bracount = 0; (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1, - &compile_block); + &reqchar, &countlits, &compile_block); re->top_bracket = bracount; re->top_backref = top_backref; @@ -2579,6 +2625,15 @@ if ((options & PCRE_ANCHORED) == 0) } } +/* Save the last required character if there are at least two literal +characters on all paths, or if there is no first character setting. */ + +if (reqchar >= 0 && (countlits > 1 || (re->options & PCRE_FIRSTSET) == 0)) + { + re->req_char = reqchar; + re->options |= PCRE_REQCHSET; + } + /* Print out the compiled data for debugging */ #ifdef DEBUG @@ -2588,9 +2643,10 @@ printf("Length = %d top_bracket = %d top_backref = %d\n", if (re->options != 0) { - printf("%s%s%s%s%s%s%s%s\n", + printf("%s%s%s%s%s%s%s%s%s\n", ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "", ((re->options & PCRE_CASELESS) != 0)? "caseless " : "", + ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "", ((re->options & PCRE_EXTENDED) != 0)? "extended " : "", ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "", ((re->options & PCRE_DOTALL) != 0)? "dotall " : "", @@ -2605,6 +2661,12 @@ if ((re->options & PCRE_FIRSTSET) != 0) else printf("First char = \\x%02x\n", re->first_char); } +if ((re->options & PCRE_REQCHSET) != 0) + { + if (isprint(re->req_char)) printf("Req char = %c\n", re->req_char); + else printf("Req char = \\x%02x\n", re->req_char); + } + code_end = code; code_base = code = re->code; @@ -2838,7 +2900,7 @@ Returns: TRUE if matched static BOOL match_ref(int offset, register const uschar *eptr, int length, match_data *md, - int ims) + unsigned long int ims) { const uschar *p = md->start_subject + md->offset_vector[offset]; @@ -2897,9 +2959,10 @@ Returns: TRUE if matched static BOOL match(register const uschar *eptr, register const uschar *ecode, - int offset_top, match_data *md, int ims, BOOL condassert, const uschar *eptrb) + int offset_top, match_data *md, unsigned long int ims, BOOL condassert, + const uschar *eptrb) { -int original_ims = ims; /* Save for resetting on ')' */ +unsigned long int original_ims = ims; /* Save for resetting on ')' */ for (;;) { @@ -3014,9 +3077,11 @@ for (;;) ecode += 2; break; - /* End of the pattern */ + /* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched + an empty string - recursion will then try other alternatives, if any. */ case OP_END: + if (md->notempty && eptr == md->start_match) return FALSE; md->end_match_ptr = eptr; /* Record where we ended */ md->end_offset_top = offset_top; /* and how many extracts were taken */ return TRUE; @@ -3026,7 +3091,7 @@ for (;;) case OP_OPT: ims = ecode[1]; ecode += 2; - DPRINTF(("ims set to %02x\n", ims)); + DPRINTF(("ims set to %02lx\n", ims)); break; /* Assertion brackets. Check the alternative branches in turn - the @@ -3133,7 +3198,7 @@ for (;;) if (ecode[3] == OP_OPT) { ims = (ims & ~PCRE_IMS) | ecode[4]; - DPRINTF(("ims set to %02x at group repeat\n", ims)); + DPRINTF(("ims set to %02lx at group repeat\n", ims)); } if (*ecode == OP_KETRMIN) @@ -3227,7 +3292,7 @@ for (;;) the group. */ ims = original_ims; - DPRINTF(("ims reset to %02x\n", ims)); + DPRINTF(("ims reset to %02lx\n", ims)); /* For a non-repeating ket, just continue at this level. This also happens for a repeating ket if no characters were matched in the group. @@ -3321,12 +3386,8 @@ for (;;) case OP_NOT_WORD_BOUNDARY: case OP_WORD_BOUNDARY: { - /* Modified the next line to check previous character - in case we're not at the beginning of the whole string - (Andrey Zmievski) */ - BOOL prev_is_word = (eptr != md->start_subject) ? - ((md->ctypes[eptr[-1]] & ctype_word) != 0) : - ((md->ctypes[md->regprev] & ctype_word) != 0); + BOOL prev_is_word = (eptr != md->start_subject) && + ((md->ctypes[eptr[-1]] & ctype_word) != 0); BOOL cur_is_word = (eptr < md->end_subject) && ((md->ctypes[*eptr] & ctype_word) != 0); if ((*ecode++ == OP_WORD_BOUNDARY)? @@ -4117,6 +4178,7 @@ Arguments: external_extra points to "hints" from pcre_study() or is NULL subject points to the subject string length length of subject string (may contain binary zeros) + start_offset where to start in the subject string options option bits offsets points to a vector of ints to be filled in with offsets offsetcount the number of elements in the vector @@ -4129,16 +4191,19 @@ Returns: > 0 => success; value is the number of elements filled in int pcre_exec(const pcre *external_re, const pcre_extra *external_extra, - const char *subject, int length, const char *strbeg, int options, - int *offsets, int offsetcount, int minlen) + const char *subject, int length, int start_offset, int options, int *offsets, + int offsetcount) { int resetcount, ocount; int first_char = -1; -int ims = 0; +int req_char = -1; +int req_char2 = -1; +unsigned long int ims = 0; match_data match_block; const uschar *start_bits = NULL; -const uschar *start_match = (const uschar *)subject; +const uschar *start_match = (const uschar *)subject + start_offset; const uschar *end_subject; +const uschar *req_char_ptr = start_match - 1; const real_pcre *re = (const real_pcre *)external_re; const real_pcre_extra *extra = (const real_pcre_extra *)external_extra; BOOL using_temporary_offsets = FALSE; @@ -4159,21 +4224,13 @@ match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0; match_block.notbol = (options & PCRE_NOTBOL) != 0; match_block.noteol = (options & PCRE_NOTEOL) != 0; +match_block.notempty = (options & PCRE_NOTEMPTY) != 0; match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */ match_block.lcc = re->tables + lcc_offset; match_block.ctypes = re->tables + ctypes_offset; -/* Setup previous character (Andrey Zmievski) */ -if (subject == strbeg) - match_block.regprev = '\n'; -else { - match_block.regprev = subject[-1]; - if (!(re->options & PCRE_MULTILINE) && match_block.regprev == '\n') - match_block.regprev = '\0'; -} - /* The ims options can vary during the matching as a result of the presence of (?ims) items in the pattern. They are kept in a local variable so that restoring at the exit of a group is easy. */ @@ -4238,7 +4295,23 @@ if (!anchored) start_bits = extra->start_bits; } -/* Loop for unanchored matches; for anchored regexps the loop runs just once. */ +/* For anchored or unanchored matches, there may be a "last known required +character" set. If the PCRE_CASELESS is set, implying that the match starts +caselessly, or if there are any changes of this flag within the regex, set up +both cases of the character. Otherwise set the two values the same, which will +avoid duplicate testing (which takes significant time). This covers the vast +majority of cases. It will be suboptimal when the case flag changes in a regex +and the required character in fact is caseful. */ + +if ((re->options & PCRE_REQCHSET) != 0) + { + req_char = re->req_char; + req_char2 = ((re->options & (PCRE_CASELESS | PCRE_ICHANGED)) != 0)? + (re->tables + fcc_offset)[req_char] : req_char; + } + +/* Loop for handling unanchored repeated matching attempts; for anchored regexs +the loop runs just once. */ do { @@ -4267,14 +4340,14 @@ do else if (startline) { - if (start_match > match_block.start_subject) + if (start_match > match_block.start_subject + start_offset) { while (start_match < end_subject && start_match[-1] != '\n') start_match++; } } - /* Or to a non-unique first char */ + /* Or to a non-unique first char after study */ else if (start_bits != NULL) { @@ -4291,6 +4364,59 @@ do printf("\n"); #endif + /* If req_char is set, we know that that character must appear in the subject + for the match to succeed. If the first character is set, req_char must be + later in the subject; otherwise the test starts at the match point. This + optimization can save a huge amount of backtracking in patterns with nested + unlimited repeats that aren't going to match. We don't know what the state of + case matching may be when this character is hit, so test for it in both its + cases if necessary. However, the different cased versions will not be set up + unless PCRE_CASELESS was given or the casing state changes within the regex. + Writing separate code makes it go faster, as does using an autoincrement and + backing off on a match. */ + + if (req_char >= 0) + { + register const uschar *p = start_match + ((first_char >= 0)? 1 : 0); + + /* We don't need to repeat the search if we haven't yet reached the + place we found it at last time. */ + + if (p > req_char_ptr) + { + /* Do a single test if no case difference is set up */ + + if (req_char == req_char2) + { + while (p < end_subject) + { + if (*p++ == req_char) { p--; break; } + } + } + + /* Otherwise test for either case */ + + else + { + while (p < end_subject) + { + register int pp = *p++; + if (pp == req_char || pp == req_char2) { p--; break; } + } + } + + /* If we can't find the required character, break the matching loop */ + + if (p >= end_subject) break; + + /* If we have found the required character, save the point where we + found it, so that we don't search again next time round the loop if + the start hasn't passed this character yet. */ + + req_char_ptr = p; + } + } + /* When a match occurs, substrings will be set for all internal extractions; we just need to set up the whole thing as substring 0 before returning. If there were too many extractions, set the return code to zero. In the case @@ -4298,13 +4424,10 @@ do those back references that we can. In this case there need not be overflow if certain parts of the pattern were not used. */ + match_block.start_match = start_match; if (!match(start_match, re->code, 2, &match_block, ims, FALSE, start_match)) continue; - /* Check that the match is not closer than minlen (Andrey Zmievski) */ - if (start_match - match_block.start_subject < minlen) - continue; - /* Copy the offset information from temporary store if necessary */ if (using_temporary_offsets) |
