diff options
Diffstat (limited to 'ext/pcre/pcrelib/pcre_exec.c')
-rw-r--r-- | ext/pcre/pcrelib/pcre_exec.c | 148 |
1 files changed, 51 insertions, 97 deletions
diff --git a/ext/pcre/pcrelib/pcre_exec.c b/ext/pcre/pcrelib/pcre_exec.c index eff51c7048..e28fe9ec86 100644 --- a/ext/pcre/pcrelib/pcre_exec.c +++ b/ext/pcre/pcrelib/pcre_exec.c @@ -156,39 +156,13 @@ printf("\n"); if (length > md->end_subject - eptr) return FALSE; -/* Separate the caseless case for speed. In UTF-8 mode we can only do this -properly if Unicode properties are supported. Otherwise, we can check only -ASCII characters. */ +/* Separate the caselesss case for speed */ if ((ims & PCRE_CASELESS) != 0) { -#ifdef SUPPORT_UTF8 -#ifdef SUPPORT_UCP - if (md->utf8) - { - USPTR endptr = eptr + length; - while (eptr < endptr) - { - int c, d; - GETCHARINC(c, eptr); - GETCHARINC(d, p); - if (c != d && c != UCD_OTHERCASE(d)) return FALSE; - } - } - else -#endif -#endif - - /* The same code works when not in UTF-8 mode and in UTF-8 mode when there - is no UCP support. */ - while (length-- > 0) - { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; } + if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; } - -/* In the caseful case, we can just compare the bytes, whether or not we -are in UTF-8 mode. */ - else { while (length-- > 0) if (*p++ != *eptr++) return FALSE; } @@ -1677,7 +1651,8 @@ for (;;) if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); { - const ucd_record * prop = GET_UCD(c); + int chartype, script; + int category = _pcre_ucp_findprop(c, &chartype, &script); switch(ecode[1]) { @@ -1686,24 +1661,24 @@ for (;;) break; case PT_LAMP: - if ((prop->chartype == ucp_Lu || - prop->chartype == ucp_Ll || - prop->chartype == ucp_Lt) == (op == OP_NOTPROP)) + if ((chartype == ucp_Lu || + chartype == ucp_Ll || + chartype == ucp_Lt) == (op == OP_NOTPROP)) RRETURN(MATCH_NOMATCH); break; case PT_GC: - if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP)) + if ((ecode[2] != category) == (op == OP_PROP)) RRETURN(MATCH_NOMATCH); break; case PT_PC: - if ((ecode[2] != prop->chartype) == (op == OP_PROP)) + if ((ecode[2] != chartype) == (op == OP_PROP)) RRETURN(MATCH_NOMATCH); break; case PT_SC: - if ((ecode[2] != prop->script) == (op == OP_PROP)) + if ((ecode[2] != script) == (op == OP_PROP)) RRETURN(MATCH_NOMATCH); break; @@ -1722,7 +1697,8 @@ for (;;) if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); { - int category = UCD_CATEGORY(c); + int chartype, script; + int category = _pcre_ucp_findprop(c, &chartype, &script); if (category == ucp_M) RRETURN(MATCH_NOMATCH); while (eptr < md->end_subject) { @@ -1731,7 +1707,7 @@ for (;;) { GETCHARLEN(c, eptr, len); } - category = UCD_CATEGORY(c); + category = _pcre_ucp_findprop(c, &chartype, &script); if (category != ucp_M) break; eptr += len; } @@ -2196,7 +2172,7 @@ for (;;) if (fc != dc) { #ifdef SUPPORT_UCP - if (dc != UCD_OTHERCASE(fc)) + if (dc != _pcre_ucp_othercase(fc)) #endif RRETURN(MATCH_NOMATCH); } @@ -2287,7 +2263,7 @@ for (;;) #ifdef SUPPORT_UCP unsigned int othercase; if ((ims & PCRE_CASELESS) != 0 && - (othercase = UCD_OTHERCASE(fc)) != fc) + (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR) oclength = _pcre_ord2utf8(othercase, occhars); else oclength = 0; #endif /* SUPPORT_UCP */ @@ -2607,11 +2583,10 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(d, eptr); if (d < 256) d = md->lcc[d]; - if (fc == d) RRETURN(MATCH_NOMATCH); - + if (fi >= max || eptr >= md->end_subject || fc == d) + RRETURN(MATCH_NOMATCH); } } else @@ -2717,9 +2692,9 @@ for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(d, eptr); - if (fc == d) RRETURN(MATCH_NOMATCH); + if (fi >= max || eptr >= md->end_subject || fc == d) + RRETURN(MATCH_NOMATCH); } } else @@ -2893,7 +2868,7 @@ for (;;) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); - prop_chartype = UCD_CHARTYPE(c); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); if ((prop_chartype == ucp_Lu || prop_chartype == ucp_Ll || prop_chartype == ucp_Lt) == prop_fail_result) @@ -2906,7 +2881,7 @@ for (;;) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); - prop_category = UCD_CATEGORY(c); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); if ((prop_category == prop_value) == prop_fail_result) RRETURN(MATCH_NOMATCH); } @@ -2917,7 +2892,7 @@ for (;;) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); - prop_chartype = UCD_CHARTYPE(c); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); if ((prop_chartype == prop_value) == prop_fail_result) RRETURN(MATCH_NOMATCH); } @@ -2928,7 +2903,7 @@ for (;;) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); - prop_script = UCD_SCRIPT(c); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); if ((prop_script == prop_value) == prop_fail_result) RRETURN(MATCH_NOMATCH); } @@ -2947,7 +2922,7 @@ for (;;) for (i = 1; i <= min; i++) { GETCHARINCTEST(c, eptr); - prop_category = UCD_CATEGORY(c); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH); while (eptr < md->end_subject) { @@ -2956,7 +2931,7 @@ for (;;) { GETCHARLEN(c, eptr, len); } - prop_category = UCD_CATEGORY(c); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); if (prop_category != ucp_M) break; eptr += len; } @@ -3372,7 +3347,7 @@ for (;;) if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); - prop_chartype = UCD_CHARTYPE(c); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); if ((prop_chartype == ucp_Lu || prop_chartype == ucp_Ll || prop_chartype == ucp_Lt) == prop_fail_result) @@ -3387,7 +3362,7 @@ for (;;) if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); - prop_category = UCD_CATEGORY(c); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); if ((prop_category == prop_value) == prop_fail_result) RRETURN(MATCH_NOMATCH); } @@ -3400,7 +3375,7 @@ for (;;) if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); - prop_chartype = UCD_CHARTYPE(c); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); if ((prop_chartype == prop_value) == prop_fail_result) RRETURN(MATCH_NOMATCH); } @@ -3413,7 +3388,7 @@ for (;;) if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); - prop_script = UCD_SCRIPT(c); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); if ((prop_script == prop_value) == prop_fail_result) RRETURN(MATCH_NOMATCH); } @@ -3435,7 +3410,7 @@ for (;;) if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); - prop_category = UCD_CATEGORY(c); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH); while (eptr < md->end_subject) { @@ -3444,7 +3419,7 @@ for (;;) { GETCHARLEN(c, eptr, len); } - prop_category = UCD_CATEGORY(c); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); if (prop_category != ucp_M) break; eptr += len; } @@ -3762,7 +3737,7 @@ for (;;) int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(c, eptr, len); - prop_chartype = UCD_CHARTYPE(c); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); if ((prop_chartype == ucp_Lu || prop_chartype == ucp_Ll || prop_chartype == ucp_Lt) == prop_fail_result) @@ -3777,7 +3752,7 @@ for (;;) int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(c, eptr, len); - prop_category = UCD_CATEGORY(c); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); if ((prop_category == prop_value) == prop_fail_result) break; eptr+= len; @@ -3790,7 +3765,7 @@ for (;;) int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(c, eptr, len); - prop_chartype = UCD_CHARTYPE(c); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); if ((prop_chartype == prop_value) == prop_fail_result) break; eptr+= len; @@ -3803,7 +3778,7 @@ for (;;) int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(c, eptr, len); - prop_script = UCD_SCRIPT(c); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); if ((prop_script == prop_value) == prop_fail_result) break; eptr+= len; @@ -3832,7 +3807,7 @@ for (;;) { if (eptr >= md->end_subject) break; GETCHARINCTEST(c, eptr); - prop_category = UCD_CATEGORY(c); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); if (prop_category == ucp_M) break; while (eptr < md->end_subject) { @@ -3841,7 +3816,7 @@ for (;;) { GETCHARLEN(c, eptr, len); } - prop_category = UCD_CATEGORY(c); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); if (prop_category != ucp_M) break; eptr += len; } @@ -3863,7 +3838,7 @@ for (;;) BACKCHAR(eptr); GETCHARLEN(c, eptr, len); } - prop_category = UCD_CATEGORY(c); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); if (prop_category != ucp_M) break; eptr--; } @@ -4383,7 +4358,7 @@ Returns: > 0 => success; value is the number of elements filled in < -1 => some kind of unexpected problem */ -PCRE_EXP_DEFN int PCRE_CALL_CONVENTION +PCRE_EXP_DEFN int pcre_exec(const pcre *argument_re, const pcre_extra *extra_data, PCRE_SPTR subject, int length, int start_offset, int options, int *offsets, int offsetcount) @@ -4695,53 +4670,31 @@ for(;;) if (firstline) { USPTR t = start_match; -#ifdef SUPPORT_UTF8 - if (utf8) - { - while (t < md->end_subject && !IS_NEWLINE(t)) - { - t++; - while (t < end_subject && (*t & 0xc0) == 0x80) t++; - } - } - else -#endif while (t < md->end_subject && !IS_NEWLINE(t)) t++; end_subject = t; } - /* Now advance to a unique first byte if there is one. */ + /* Now test for a unique first byte */ if (first_byte >= 0) { if (first_byte_caseless) - while (start_match < end_subject && md->lcc[*start_match] != first_byte) - start_match++; + while (start_match < end_subject && + md->lcc[*start_match] != first_byte) + { NEXTCHAR(start_match); } else while (start_match < end_subject && *start_match != first_byte) - start_match++; + { NEXTCHAR(start_match); } } - /* Or to just after a linebreak for a multiline match */ + /* Or to just after a linebreak for a multiline match if possible */ else if (startline) { if (start_match > md->start_subject + start_offset) { -#ifdef SUPPORT_UTF8 - if (utf8) - { - while (start_match < end_subject && !WAS_NEWLINE(start_match)) - { - start_match++; - while(start_match < end_subject && (*start_match & 0xc0) == 0x80) - start_match++; - } - } - else -#endif - while (start_match < end_subject && !WAS_NEWLINE(start_match)) - start_match++; + while (start_match <= end_subject && !WAS_NEWLINE(start_match)) + { NEXTCHAR(start_match); } /* If we have just passed a CR and the newline option is ANY or ANYCRLF, and we are now at a LF, advance the match position by one more character. @@ -4755,15 +4708,16 @@ for(;;) } } - /* Or to a non-unique first byte after study */ + /* Or to a non-unique first char after study */ else if (start_bits != NULL) { while (start_match < end_subject) { register unsigned int c = *start_match; - if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; - else break; + if ((start_bits[c/8] & (1 << (c&7))) == 0) + { NEXTCHAR(start_match); } + else break; } } |