diff options
Diffstat (limited to 'ext/pcre/pcrelib/pcre_exec.c')
| -rw-r--r-- | ext/pcre/pcrelib/pcre_exec.c | 301 |
1 files changed, 180 insertions, 121 deletions
diff --git a/ext/pcre/pcrelib/pcre_exec.c b/ext/pcre/pcrelib/pcre_exec.c index 5520d153b5..eff51c7048 100644 --- a/ext/pcre/pcrelib/pcre_exec.c +++ b/ext/pcre/pcrelib/pcre_exec.c @@ -156,13 +156,39 @@ printf("\n"); if (length > md->end_subject - eptr) return FALSE; -/* Separate the caselesss case for speed */ +/* Separate the caseless case for speed. In UTF-8 mode we can only do this +properly if Unicode properties are supported. Otherwise, we can check only +ASCII characters. */ if ((ims & PCRE_CASELESS) != 0) { +#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UCP + if (md->utf8) + { + USPTR endptr = eptr + length; + while (eptr < endptr) + { + int c, d; + GETCHARINC(c, eptr); + GETCHARINC(d, p); + if (c != d && c != UCD_OTHERCASE(d)) return FALSE; + } + } + else +#endif +#endif + + /* The same code works when not in UTF-8 mode and in UTF-8 mode when there + is no UCP support. */ + while (length-- > 0) - if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; + { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; } } + +/* In the caseful case, we can just compare the bytes, whether or not we +are in UTF-8 mode. */ + else { while (length-- > 0) if (*p++ != *eptr++) return FALSE; } @@ -1146,11 +1172,11 @@ for (;;) do ecode += GET(ecode,1); while (*ecode == OP_ALT); break; - /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating - that it may occur zero times. It may repeat infinitely, or not at all - - i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper - repeat limits are compiled as a number of copies, with the optional ones - preceded by BRAZERO or BRAMINZERO. */ + /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group, + indicating that it may occur zero times. It may repeat infinitely, or not + at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets + with fixed upper repeat limits are compiled as a number of copies, with the + optional ones preceded by BRAZERO or BRAMINZERO. */ case OP_BRAZERO: { @@ -1172,6 +1198,14 @@ for (;;) } break; + case OP_SKIPZERO: + { + next = ecode+1; + do next += GET(next,1); while (*next == OP_ALT); + ecode = next + 1 + LINK_SIZE; + } + break; + /* End of a group, repeated or non-repeating. */ case OP_KET: @@ -1419,13 +1453,12 @@ for (;;) /* Match a single character type; inline for speed */ case OP_ANY: - if ((ims & PCRE_DOTALL) == 0) - { - if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); - } + if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); + /* Fall through */ + + case OP_ALLANY: if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH); - if (utf8) - while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; + if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; ecode++; break; @@ -1644,8 +1677,7 @@ for (;;) if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); { - int chartype, script; - int category = _pcre_ucp_findprop(c, &chartype, &script); + const ucd_record * prop = GET_UCD(c); switch(ecode[1]) { @@ -1654,24 +1686,24 @@ for (;;) break; case PT_LAMP: - if ((chartype == ucp_Lu || - chartype == ucp_Ll || - chartype == ucp_Lt) == (op == OP_NOTPROP)) + if ((prop->chartype == ucp_Lu || + prop->chartype == ucp_Ll || + prop->chartype == ucp_Lt) == (op == OP_NOTPROP)) RRETURN(MATCH_NOMATCH); break; case PT_GC: - if ((ecode[2] != category) == (op == OP_PROP)) + if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP)) RRETURN(MATCH_NOMATCH); break; case PT_PC: - if ((ecode[2] != chartype) == (op == OP_PROP)) + if ((ecode[2] != prop->chartype) == (op == OP_PROP)) RRETURN(MATCH_NOMATCH); break; case PT_SC: - if ((ecode[2] != script) == (op == OP_PROP)) + if ((ecode[2] != prop->script) == (op == OP_PROP)) RRETURN(MATCH_NOMATCH); break; @@ -1690,8 +1722,7 @@ for (;;) if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); { - int chartype, script; - int category = _pcre_ucp_findprop(c, &chartype, &script); + int category = UCD_CATEGORY(c); if (category == ucp_M) RRETURN(MATCH_NOMATCH); while (eptr < md->end_subject) { @@ -1700,7 +1731,7 @@ for (;;) { GETCHARLEN(c, eptr, len); } - category = _pcre_ucp_findprop(c, &chartype, &script); + category = UCD_CATEGORY(c); if (category != ucp_M) break; eptr += len; } @@ -1721,16 +1752,25 @@ for (;;) case OP_REF: { offset = GET2(ecode, 1) << 1; /* Doubled ref number */ - ecode += 3; /* Advance past item */ + ecode += 3; - /* If the reference is unset, set the length to be longer than the amount - of subject left; this ensures that every attempt at a match fails. We - can't just fail here, because of the possibility of quantifiers with zero - minima. */ + /* If the reference is unset, there are two possibilities: - length = (offset >= offset_top || md->offset_vector[offset] < 0)? - md->end_subject - eptr + 1 : - md->offset_vector[offset+1] - md->offset_vector[offset]; + (a) In the default, Perl-compatible state, set the length to be longer + than the amount of subject left; this ensures that every attempt at a + match fails. We can't just fail here, because of the possibility of + quantifiers with zero minima. + + (b) If the JavaScript compatibility flag is set, set the length to zero + so that the back reference matches an empty string. + + Otherwise, set the length to the length of what was matched by the + referenced subpattern. */ + + if (offset >= offset_top || md->offset_vector[offset] < 0) + length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1; + else + length = md->offset_vector[offset+1] - md->offset_vector[offset]; /* Set up for repetition, or handle the non-repeated case */ @@ -2156,7 +2196,7 @@ for (;;) if (fc != dc) { #ifdef SUPPORT_UCP - if (dc != _pcre_ucp_othercase(fc)) + if (dc != UCD_OTHERCASE(fc)) #endif RRETURN(MATCH_NOMATCH); } @@ -2247,7 +2287,7 @@ for (;;) #ifdef SUPPORT_UCP unsigned int othercase; if ((ims & PCRE_CASELESS) != 0 && - (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR) + (othercase = UCD_OTHERCASE(fc)) != fc) oclength = _pcre_ord2utf8(othercase, occhars); else oclength = 0; #endif /* SUPPORT_UCP */ @@ -2567,10 +2607,11 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28); if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(d, eptr); if (d < 256) d = md->lcc[d]; - if (fi >= max || eptr >= md->end_subject || fc == d) - RRETURN(MATCH_NOMATCH); + if (fc == d) RRETURN(MATCH_NOMATCH); + } } else @@ -2676,9 +2717,9 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32); if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(d, eptr); - if (fi >= max || eptr >= md->end_subject || fc == d) - RRETURN(MATCH_NOMATCH); + if (fc == d) RRETURN(MATCH_NOMATCH); } } else @@ -2852,7 +2893,7 @@ for (;;) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + prop_chartype = UCD_CHARTYPE(c); if ((prop_chartype == ucp_Lu || prop_chartype == ucp_Ll || prop_chartype == ucp_Lt) == prop_fail_result) @@ -2865,7 +2906,7 @@ for (;;) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + prop_category = UCD_CATEGORY(c); if ((prop_category == prop_value) == prop_fail_result) RRETURN(MATCH_NOMATCH); } @@ -2876,7 +2917,7 @@ for (;;) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + prop_chartype = UCD_CHARTYPE(c); if ((prop_chartype == prop_value) == prop_fail_result) RRETURN(MATCH_NOMATCH); } @@ -2887,7 +2928,7 @@ for (;;) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + prop_script = UCD_SCRIPT(c); if ((prop_script == prop_value) == prop_fail_result) RRETURN(MATCH_NOMATCH); } @@ -2906,7 +2947,7 @@ for (;;) for (i = 1; i <= min; i++) { GETCHARINCTEST(c, eptr); - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + prop_category = UCD_CATEGORY(c); if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH); while (eptr < md->end_subject) { @@ -2915,7 +2956,7 @@ for (;;) { GETCHARLEN(c, eptr, len); } - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + prop_category = UCD_CATEGORY(c); if (prop_category != ucp_M) break; eptr += len; } @@ -2933,14 +2974,22 @@ for (;;) case OP_ANY: for (i = 1; i <= min; i++) { - if (eptr >= md->end_subject || - ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr))) + if (eptr >= md->end_subject || IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); eptr++; while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; } break; + case OP_ALLANY: + for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + eptr++; + while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; + } + break; + case OP_ANYBYTE: eptr += min; break; @@ -3149,15 +3198,15 @@ for (;;) switch(ctype) { case OP_ANY: - if ((ims & PCRE_DOTALL) == 0) + for (i = 1; i <= min; i++) { - for (i = 1; i <= min; i++) - { - if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); - eptr++; - } + if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); + eptr++; } - else eptr += min; + break; + + case OP_ALLANY: + eptr += min; break; case OP_ANYBYTE: @@ -3323,7 +3372,7 @@ for (;;) if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + prop_chartype = UCD_CHARTYPE(c); if ((prop_chartype == ucp_Lu || prop_chartype == ucp_Ll || prop_chartype == ucp_Lt) == prop_fail_result) @@ -3338,7 +3387,7 @@ for (;;) if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + prop_category = UCD_CATEGORY(c); if ((prop_category == prop_value) == prop_fail_result) RRETURN(MATCH_NOMATCH); } @@ -3351,7 +3400,7 @@ for (;;) if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + prop_chartype = UCD_CHARTYPE(c); if ((prop_chartype == prop_value) == prop_fail_result) RRETURN(MATCH_NOMATCH); } @@ -3364,7 +3413,7 @@ for (;;) if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + prop_script = UCD_SCRIPT(c); if ((prop_script == prop_value) == prop_fail_result) RRETURN(MATCH_NOMATCH); } @@ -3386,7 +3435,7 @@ for (;;) if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + prop_category = UCD_CATEGORY(c); if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH); while (eptr < md->end_subject) { @@ -3395,7 +3444,7 @@ for (;;) { GETCHARLEN(c, eptr, len); } - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + prop_category = UCD_CATEGORY(c); if (prop_category != ucp_M) break; eptr += len; } @@ -3414,16 +3463,14 @@ for (;;) RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max || eptr >= md->end_subject || - (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 && - IS_NEWLINE(eptr))) + (ctype == OP_ANY && IS_NEWLINE(eptr))) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); switch(ctype) { - case OP_ANY: /* This is the DOTALL case */ - break; - + case OP_ANY: /* This is the non-NL case */ + case OP_ALLANY: case OP_ANYBYTE: break; @@ -3575,15 +3622,14 @@ for (;;) RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max || eptr >= md->end_subject || - ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr))) + (ctype == OP_ANY && IS_NEWLINE(eptr))) RRETURN(MATCH_NOMATCH); c = *eptr++; switch(ctype) { - case OP_ANY: /* This is the DOTALL case */ - break; - + case OP_ANY: /* This is the non-NL case */ + case OP_ALLANY: case OP_ANYBYTE: break; @@ -3716,7 +3762,7 @@ for (;;) int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(c, eptr, len); - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + prop_chartype = UCD_CHARTYPE(c); if ((prop_chartype == ucp_Lu || prop_chartype == ucp_Ll || prop_chartype == ucp_Lt) == prop_fail_result) @@ -3731,7 +3777,7 @@ for (;;) int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(c, eptr, len); - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + prop_category = UCD_CATEGORY(c); if ((prop_category == prop_value) == prop_fail_result) break; eptr+= len; @@ -3744,7 +3790,7 @@ for (;;) int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(c, eptr, len); - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + prop_chartype = UCD_CHARTYPE(c); if ((prop_chartype == prop_value) == prop_fail_result) break; eptr+= len; @@ -3757,7 +3803,7 @@ for (;;) int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(c, eptr, len); - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + prop_script = UCD_SCRIPT(c); if ((prop_script == prop_value) == prop_fail_result) break; eptr+= len; @@ -3786,7 +3832,7 @@ for (;;) { if (eptr >= md->end_subject) break; GETCHARINCTEST(c, eptr); - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + prop_category = UCD_CATEGORY(c); if (prop_category == ucp_M) break; while (eptr < md->end_subject) { @@ -3795,7 +3841,7 @@ for (;;) { GETCHARLEN(c, eptr, len); } - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + prop_category = UCD_CATEGORY(c); if (prop_category != ucp_M) break; eptr += len; } @@ -3817,7 +3863,7 @@ for (;;) BACKCHAR(eptr); GETCHARLEN(c, eptr, len); } - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + prop_category = UCD_CATEGORY(c); if (prop_category != ucp_M) break; eptr--; } @@ -3837,23 +3883,11 @@ for (;;) case OP_ANY: if (max < INT_MAX) { - if ((ims & PCRE_DOTALL) == 0) - { - for (i = min; i < max; i++) - { - if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break; - eptr++; - while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; - } - } - else + for (i = min; i < max; i++) { - for (i = min; i < max; i++) - { - if (eptr >= md->end_subject) break; - eptr++; - while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; - } + if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break; + eptr++; + while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; } } @@ -3861,20 +3895,26 @@ for (;;) else { - if ((ims & PCRE_DOTALL) == 0) + for (i = min; i < max; i++) { - for (i = min; i < max; i++) - { - if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break; - eptr++; - while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; - } + if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break; + eptr++; + while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; } - else + } + break; + + case OP_ALLANY: + if (max < INT_MAX) + { + for (i = min; i < max; i++) { - eptr = md->end_subject; + if (eptr >= md->end_subject) break; + eptr++; + while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; } } + else eptr = md->end_subject; /* Unlimited UTF-8 repeat */ break; /* The byte case is the same as non-UTF8 */ @@ -4062,17 +4102,14 @@ for (;;) switch(ctype) { case OP_ANY: - if ((ims & PCRE_DOTALL) == 0) + for (i = min; i < max; i++) { - for (i = min; i < max; i++) - { - if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break; - eptr++; - } - break; + if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break; + eptr++; } - /* For DOTALL case, fall through and treat as \C */ + break; + case OP_ALLANY: case OP_ANYBYTE: c = max - min; if (c > (unsigned int)(md->end_subject - eptr)) @@ -4346,7 +4383,7 @@ Returns: > 0 => success; value is the number of elements filled in < -1 => some kind of unexpected problem */ -PCRE_EXP_DEFN int +PCRE_EXP_DEFN int PCRE_CALL_CONVENTION pcre_exec(const pcre *argument_re, const pcre_extra *extra_data, PCRE_SPTR subject, int length, int start_offset, int options, int *offsets, int offsetcount) @@ -4448,6 +4485,7 @@ end_subject = md->end_subject; md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0; utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0; +md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0; md->notbol = (options & PCRE_NOTBOL) != 0; md->noteol = (options & PCRE_NOTEOL) != 0; @@ -4657,31 +4695,53 @@ for(;;) if (firstline) { USPTR t = start_match; +#ifdef SUPPORT_UTF8 + if (utf8) + { + while (t < md->end_subject && !IS_NEWLINE(t)) + { + t++; + while (t < end_subject && (*t & 0xc0) == 0x80) t++; + } + } + else +#endif while (t < md->end_subject && !IS_NEWLINE(t)) t++; end_subject = t; } - /* Now test for a unique first byte */ + /* Now advance to a unique first byte if there is one. */ if (first_byte >= 0) { if (first_byte_caseless) - while (start_match < end_subject && - md->lcc[*start_match] != first_byte) - { NEXTCHAR(start_match); } + while (start_match < end_subject && md->lcc[*start_match] != first_byte) + start_match++; else while (start_match < end_subject && *start_match != first_byte) - { NEXTCHAR(start_match); } + start_match++; } - /* Or to just after a linebreak for a multiline match if possible */ + /* Or to just after a linebreak for a multiline match */ else if (startline) { if (start_match > md->start_subject + start_offset) { - while (start_match <= end_subject && !WAS_NEWLINE(start_match)) - { NEXTCHAR(start_match); } +#ifdef SUPPORT_UTF8 + if (utf8) + { + while (start_match < end_subject && !WAS_NEWLINE(start_match)) + { + start_match++; + while(start_match < end_subject && (*start_match & 0xc0) == 0x80) + start_match++; + } + } + else +#endif + while (start_match < end_subject && !WAS_NEWLINE(start_match)) + start_match++; /* If we have just passed a CR and the newline option is ANY or ANYCRLF, and we are now at a LF, advance the match position by one more character. @@ -4695,16 +4755,15 @@ for(;;) } } - /* Or to a non-unique first char after study */ + /* Or to a non-unique first byte after study */ else if (start_bits != NULL) { while (start_match < end_subject) { register unsigned int c = *start_match; - if ((start_bits[c/8] & (1 << (c&7))) == 0) - { NEXTCHAR(start_match); } - else break; + if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; + else break; } } |
