summaryrefslogtreecommitdiff
path: root/ext/pcre/pcrelib/pcre_exec.c
diff options
context:
space:
mode:
authorIlia Alshanetsky <iliaa@php.net>2008-11-23 18:30:38 +0000
committerIlia Alshanetsky <iliaa@php.net>2008-11-23 18:30:38 +0000
commitd8b835620ccbc9e81bab42cd73a90a752c8d13da (patch)
tree44c468de98fbb6d1e990c10d4e0e205945a4a33b /ext/pcre/pcrelib/pcre_exec.c
parent3d8b5526af3b9483f10ecb5be463093344711c5f (diff)
downloadphp-git-d8b835620ccbc9e81bab42cd73a90a752c8d13da.tar.gz
MFB: Fixed bug #46643 (Upgraded PCRElib to 7.8)
Diffstat (limited to 'ext/pcre/pcrelib/pcre_exec.c')
-rw-r--r--ext/pcre/pcrelib/pcre_exec.c301
1 files changed, 180 insertions, 121 deletions
diff --git a/ext/pcre/pcrelib/pcre_exec.c b/ext/pcre/pcrelib/pcre_exec.c
index 5520d153b5..eff51c7048 100644
--- a/ext/pcre/pcrelib/pcre_exec.c
+++ b/ext/pcre/pcrelib/pcre_exec.c
@@ -156,13 +156,39 @@ printf("\n");
if (length > md->end_subject - eptr) return FALSE;
-/* Separate the caselesss case for speed */
+/* Separate the caseless case for speed. In UTF-8 mode we can only do this
+properly if Unicode properties are supported. Otherwise, we can check only
+ASCII characters. */
if ((ims & PCRE_CASELESS) != 0)
{
+#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UCP
+ if (md->utf8)
+ {
+ USPTR endptr = eptr + length;
+ while (eptr < endptr)
+ {
+ int c, d;
+ GETCHARINC(c, eptr);
+ GETCHARINC(d, p);
+ if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
+ }
+ }
+ else
+#endif
+#endif
+
+ /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
+ is no UCP support. */
+
while (length-- > 0)
- if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
+ { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
}
+
+/* In the caseful case, we can just compare the bytes, whether or not we
+are in UTF-8 mode. */
+
else
{ while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
@@ -1146,11 +1172,11 @@ for (;;)
do ecode += GET(ecode,1); while (*ecode == OP_ALT);
break;
- /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
- that it may occur zero times. It may repeat infinitely, or not at all -
- i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
- repeat limits are compiled as a number of copies, with the optional ones
- preceded by BRAZERO or BRAMINZERO. */
+ /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
+ indicating that it may occur zero times. It may repeat infinitely, or not
+ at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
+ with fixed upper repeat limits are compiled as a number of copies, with the
+ optional ones preceded by BRAZERO or BRAMINZERO. */
case OP_BRAZERO:
{
@@ -1172,6 +1198,14 @@ for (;;)
}
break;
+ case OP_SKIPZERO:
+ {
+ next = ecode+1;
+ do next += GET(next,1); while (*next == OP_ALT);
+ ecode = next + 1 + LINK_SIZE;
+ }
+ break;
+
/* End of a group, repeated or non-repeating. */
case OP_KET:
@@ -1419,13 +1453,12 @@ for (;;)
/* Match a single character type; inline for speed */
case OP_ANY:
- if ((ims & PCRE_DOTALL) == 0)
- {
- if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
- }
+ if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
+ /* Fall through */
+
+ case OP_ALLANY:
if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
- if (utf8)
- while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+ if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
ecode++;
break;
@@ -1644,8 +1677,7 @@ for (;;)
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINCTEST(c, eptr);
{
- int chartype, script;
- int category = _pcre_ucp_findprop(c, &chartype, &script);
+ const ucd_record * prop = GET_UCD(c);
switch(ecode[1])
{
@@ -1654,24 +1686,24 @@ for (;;)
break;
case PT_LAMP:
- if ((chartype == ucp_Lu ||
- chartype == ucp_Ll ||
- chartype == ucp_Lt) == (op == OP_NOTPROP))
+ if ((prop->chartype == ucp_Lu ||
+ prop->chartype == ucp_Ll ||
+ prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
RRETURN(MATCH_NOMATCH);
break;
case PT_GC:
- if ((ecode[2] != category) == (op == OP_PROP))
+ if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
RRETURN(MATCH_NOMATCH);
break;
case PT_PC:
- if ((ecode[2] != chartype) == (op == OP_PROP))
+ if ((ecode[2] != prop->chartype) == (op == OP_PROP))
RRETURN(MATCH_NOMATCH);
break;
case PT_SC:
- if ((ecode[2] != script) == (op == OP_PROP))
+ if ((ecode[2] != prop->script) == (op == OP_PROP))
RRETURN(MATCH_NOMATCH);
break;
@@ -1690,8 +1722,7 @@ for (;;)
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINCTEST(c, eptr);
{
- int chartype, script;
- int category = _pcre_ucp_findprop(c, &chartype, &script);
+ int category = UCD_CATEGORY(c);
if (category == ucp_M) RRETURN(MATCH_NOMATCH);
while (eptr < md->end_subject)
{
@@ -1700,7 +1731,7 @@ for (;;)
{
GETCHARLEN(c, eptr, len);
}
- category = _pcre_ucp_findprop(c, &chartype, &script);
+ category = UCD_CATEGORY(c);
if (category != ucp_M) break;
eptr += len;
}
@@ -1721,16 +1752,25 @@ for (;;)
case OP_REF:
{
offset = GET2(ecode, 1) << 1; /* Doubled ref number */
- ecode += 3; /* Advance past item */
+ ecode += 3;
- /* If the reference is unset, set the length to be longer than the amount
- of subject left; this ensures that every attempt at a match fails. We
- can't just fail here, because of the possibility of quantifiers with zero
- minima. */
+ /* If the reference is unset, there are two possibilities:
- length = (offset >= offset_top || md->offset_vector[offset] < 0)?
- md->end_subject - eptr + 1 :
- md->offset_vector[offset+1] - md->offset_vector[offset];
+ (a) In the default, Perl-compatible state, set the length to be longer
+ than the amount of subject left; this ensures that every attempt at a
+ match fails. We can't just fail here, because of the possibility of
+ quantifiers with zero minima.
+
+ (b) If the JavaScript compatibility flag is set, set the length to zero
+ so that the back reference matches an empty string.
+
+ Otherwise, set the length to the length of what was matched by the
+ referenced subpattern. */
+
+ if (offset >= offset_top || md->offset_vector[offset] < 0)
+ length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
+ else
+ length = md->offset_vector[offset+1] - md->offset_vector[offset];
/* Set up for repetition, or handle the non-repeated case */
@@ -2156,7 +2196,7 @@ for (;;)
if (fc != dc)
{
#ifdef SUPPORT_UCP
- if (dc != _pcre_ucp_othercase(fc))
+ if (dc != UCD_OTHERCASE(fc))
#endif
RRETURN(MATCH_NOMATCH);
}
@@ -2247,7 +2287,7 @@ for (;;)
#ifdef SUPPORT_UCP
unsigned int othercase;
if ((ims & PCRE_CASELESS) != 0 &&
- (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR)
+ (othercase = UCD_OTHERCASE(fc)) != fc)
oclength = _pcre_ord2utf8(othercase, occhars);
else oclength = 0;
#endif /* SUPPORT_UCP */
@@ -2567,10 +2607,11 @@ for (;;)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINC(d, eptr);
if (d < 256) d = md->lcc[d];
- if (fi >= max || eptr >= md->end_subject || fc == d)
- RRETURN(MATCH_NOMATCH);
+ if (fc == d) RRETURN(MATCH_NOMATCH);
+
}
}
else
@@ -2676,9 +2717,9 @@ for (;;)
{
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINC(d, eptr);
- if (fi >= max || eptr >= md->end_subject || fc == d)
- RRETURN(MATCH_NOMATCH);
+ if (fc == d) RRETURN(MATCH_NOMATCH);
}
}
else
@@ -2852,7 +2893,7 @@ for (;;)
{
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINCTEST(c, eptr);
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
+ prop_chartype = UCD_CHARTYPE(c);
if ((prop_chartype == ucp_Lu ||
prop_chartype == ucp_Ll ||
prop_chartype == ucp_Lt) == prop_fail_result)
@@ -2865,7 +2906,7 @@ for (;;)
{
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINCTEST(c, eptr);
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
+ prop_category = UCD_CATEGORY(c);
if ((prop_category == prop_value) == prop_fail_result)
RRETURN(MATCH_NOMATCH);
}
@@ -2876,7 +2917,7 @@ for (;;)
{
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINCTEST(c, eptr);
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
+ prop_chartype = UCD_CHARTYPE(c);
if ((prop_chartype == prop_value) == prop_fail_result)
RRETURN(MATCH_NOMATCH);
}
@@ -2887,7 +2928,7 @@ for (;;)
{
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINCTEST(c, eptr);
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
+ prop_script = UCD_SCRIPT(c);
if ((prop_script == prop_value) == prop_fail_result)
RRETURN(MATCH_NOMATCH);
}
@@ -2906,7 +2947,7 @@ for (;;)
for (i = 1; i <= min; i++)
{
GETCHARINCTEST(c, eptr);
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
+ prop_category = UCD_CATEGORY(c);
if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
while (eptr < md->end_subject)
{
@@ -2915,7 +2956,7 @@ for (;;)
{
GETCHARLEN(c, eptr, len);
}
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
+ prop_category = UCD_CATEGORY(c);
if (prop_category != ucp_M) break;
eptr += len;
}
@@ -2933,14 +2974,22 @@ for (;;)
case OP_ANY:
for (i = 1; i <= min; i++)
{
- if (eptr >= md->end_subject ||
- ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
+ if (eptr >= md->end_subject || IS_NEWLINE(eptr))
RRETURN(MATCH_NOMATCH);
eptr++;
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
}
break;
+ case OP_ALLANY:
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ eptr++;
+ while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+ }
+ break;
+
case OP_ANYBYTE:
eptr += min;
break;
@@ -3149,15 +3198,15 @@ for (;;)
switch(ctype)
{
case OP_ANY:
- if ((ims & PCRE_DOTALL) == 0)
+ for (i = 1; i <= min; i++)
{
- for (i = 1; i <= min; i++)
- {
- if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
- eptr++;
- }
+ if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
+ eptr++;
}
- else eptr += min;
+ break;
+
+ case OP_ALLANY:
+ eptr += min;
break;
case OP_ANYBYTE:
@@ -3323,7 +3372,7 @@ for (;;)
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINC(c, eptr);
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
+ prop_chartype = UCD_CHARTYPE(c);
if ((prop_chartype == ucp_Lu ||
prop_chartype == ucp_Ll ||
prop_chartype == ucp_Lt) == prop_fail_result)
@@ -3338,7 +3387,7 @@ for (;;)
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINC(c, eptr);
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
+ prop_category = UCD_CATEGORY(c);
if ((prop_category == prop_value) == prop_fail_result)
RRETURN(MATCH_NOMATCH);
}
@@ -3351,7 +3400,7 @@ for (;;)
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINC(c, eptr);
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
+ prop_chartype = UCD_CHARTYPE(c);
if ((prop_chartype == prop_value) == prop_fail_result)
RRETURN(MATCH_NOMATCH);
}
@@ -3364,7 +3413,7 @@ for (;;)
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINC(c, eptr);
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
+ prop_script = UCD_SCRIPT(c);
if ((prop_script == prop_value) == prop_fail_result)
RRETURN(MATCH_NOMATCH);
}
@@ -3386,7 +3435,7 @@ for (;;)
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINCTEST(c, eptr);
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
+ prop_category = UCD_CATEGORY(c);
if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
while (eptr < md->end_subject)
{
@@ -3395,7 +3444,7 @@ for (;;)
{
GETCHARLEN(c, eptr, len);
}
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
+ prop_category = UCD_CATEGORY(c);
if (prop_category != ucp_M) break;
eptr += len;
}
@@ -3414,16 +3463,14 @@ for (;;)
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (fi >= max || eptr >= md->end_subject ||
- (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 &&
- IS_NEWLINE(eptr)))
+ (ctype == OP_ANY && IS_NEWLINE(eptr)))
RRETURN(MATCH_NOMATCH);
GETCHARINC(c, eptr);
switch(ctype)
{
- case OP_ANY: /* This is the DOTALL case */
- break;
-
+ case OP_ANY: /* This is the non-NL case */
+ case OP_ALLANY:
case OP_ANYBYTE:
break;
@@ -3575,15 +3622,14 @@ for (;;)
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (fi >= max || eptr >= md->end_subject ||
- ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
+ (ctype == OP_ANY && IS_NEWLINE(eptr)))
RRETURN(MATCH_NOMATCH);
c = *eptr++;
switch(ctype)
{
- case OP_ANY: /* This is the DOTALL case */
- break;
-
+ case OP_ANY: /* This is the non-NL case */
+ case OP_ALLANY:
case OP_ANYBYTE:
break;
@@ -3716,7 +3762,7 @@ for (;;)
int len = 1;
if (eptr >= md->end_subject) break;
GETCHARLEN(c, eptr, len);
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
+ prop_chartype = UCD_CHARTYPE(c);
if ((prop_chartype == ucp_Lu ||
prop_chartype == ucp_Ll ||
prop_chartype == ucp_Lt) == prop_fail_result)
@@ -3731,7 +3777,7 @@ for (;;)
int len = 1;
if (eptr >= md->end_subject) break;
GETCHARLEN(c, eptr, len);
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
+ prop_category = UCD_CATEGORY(c);
if ((prop_category == prop_value) == prop_fail_result)
break;
eptr+= len;
@@ -3744,7 +3790,7 @@ for (;;)
int len = 1;
if (eptr >= md->end_subject) break;
GETCHARLEN(c, eptr, len);
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
+ prop_chartype = UCD_CHARTYPE(c);
if ((prop_chartype == prop_value) == prop_fail_result)
break;
eptr+= len;
@@ -3757,7 +3803,7 @@ for (;;)
int len = 1;
if (eptr >= md->end_subject) break;
GETCHARLEN(c, eptr, len);
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
+ prop_script = UCD_SCRIPT(c);
if ((prop_script == prop_value) == prop_fail_result)
break;
eptr+= len;
@@ -3786,7 +3832,7 @@ for (;;)
{
if (eptr >= md->end_subject) break;
GETCHARINCTEST(c, eptr);
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
+ prop_category = UCD_CATEGORY(c);
if (prop_category == ucp_M) break;
while (eptr < md->end_subject)
{
@@ -3795,7 +3841,7 @@ for (;;)
{
GETCHARLEN(c, eptr, len);
}
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
+ prop_category = UCD_CATEGORY(c);
if (prop_category != ucp_M) break;
eptr += len;
}
@@ -3817,7 +3863,7 @@ for (;;)
BACKCHAR(eptr);
GETCHARLEN(c, eptr, len);
}
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
+ prop_category = UCD_CATEGORY(c);
if (prop_category != ucp_M) break;
eptr--;
}
@@ -3837,23 +3883,11 @@ for (;;)
case OP_ANY:
if (max < INT_MAX)
{
- if ((ims & PCRE_DOTALL) == 0)
- {
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
- eptr++;
- while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
- }
- }
- else
+ for (i = min; i < max; i++)
{
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject) break;
- eptr++;
- while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
- }
+ if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
+ eptr++;
+ while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
}
}
@@ -3861,20 +3895,26 @@ for (;;)
else
{
- if ((ims & PCRE_DOTALL) == 0)
+ for (i = min; i < max; i++)
{
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
- eptr++;
- while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
- }
+ if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
+ eptr++;
+ while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
}
- else
+ }
+ break;
+
+ case OP_ALLANY:
+ if (max < INT_MAX)
+ {
+ for (i = min; i < max; i++)
{
- eptr = md->end_subject;
+ if (eptr >= md->end_subject) break;
+ eptr++;
+ while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
}
}
+ else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
break;
/* The byte case is the same as non-UTF8 */
@@ -4062,17 +4102,14 @@ for (;;)
switch(ctype)
{
case OP_ANY:
- if ((ims & PCRE_DOTALL) == 0)
+ for (i = min; i < max; i++)
{
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
- eptr++;
- }
- break;
+ if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
+ eptr++;
}
- /* For DOTALL case, fall through and treat as \C */
+ break;
+ case OP_ALLANY:
case OP_ANYBYTE:
c = max - min;
if (c > (unsigned int)(md->end_subject - eptr))
@@ -4346,7 +4383,7 @@ Returns: > 0 => success; value is the number of elements filled in
< -1 => some kind of unexpected problem
*/
-PCRE_EXP_DEFN int
+PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
int offsetcount)
@@ -4448,6 +4485,7 @@ end_subject = md->end_subject;
md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
+md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
md->notbol = (options & PCRE_NOTBOL) != 0;
md->noteol = (options & PCRE_NOTEOL) != 0;
@@ -4657,31 +4695,53 @@ for(;;)
if (firstline)
{
USPTR t = start_match;
+#ifdef SUPPORT_UTF8
+ if (utf8)
+ {
+ while (t < md->end_subject && !IS_NEWLINE(t))
+ {
+ t++;
+ while (t < end_subject && (*t & 0xc0) == 0x80) t++;
+ }
+ }
+ else
+#endif
while (t < md->end_subject && !IS_NEWLINE(t)) t++;
end_subject = t;
}
- /* Now test for a unique first byte */
+ /* Now advance to a unique first byte if there is one. */
if (first_byte >= 0)
{
if (first_byte_caseless)
- while (start_match < end_subject &&
- md->lcc[*start_match] != first_byte)
- { NEXTCHAR(start_match); }
+ while (start_match < end_subject && md->lcc[*start_match] != first_byte)
+ start_match++;
else
while (start_match < end_subject && *start_match != first_byte)
- { NEXTCHAR(start_match); }
+ start_match++;
}
- /* Or to just after a linebreak for a multiline match if possible */
+ /* Or to just after a linebreak for a multiline match */
else if (startline)
{
if (start_match > md->start_subject + start_offset)
{
- while (start_match <= end_subject && !WAS_NEWLINE(start_match))
- { NEXTCHAR(start_match); }
+#ifdef SUPPORT_UTF8
+ if (utf8)
+ {
+ while (start_match < end_subject && !WAS_NEWLINE(start_match))
+ {
+ start_match++;
+ while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
+ start_match++;
+ }
+ }
+ else
+#endif
+ while (start_match < end_subject && !WAS_NEWLINE(start_match))
+ start_match++;
/* If we have just passed a CR and the newline option is ANY or ANYCRLF,
and we are now at a LF, advance the match position by one more character.
@@ -4695,16 +4755,15 @@ for(;;)
}
}
- /* Or to a non-unique first char after study */
+ /* Or to a non-unique first byte after study */
else if (start_bits != NULL)
{
while (start_match < end_subject)
{
register unsigned int c = *start_match;
- if ((start_bits[c/8] & (1 << (c&7))) == 0)
- { NEXTCHAR(start_match); }
- else break;
+ if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
+ else break;
}
}