diff options
Diffstat (limited to 'ext/pcre/pcrelib/pcre_exec.c')
| -rw-r--r-- | ext/pcre/pcrelib/pcre_exec.c | 894 | 
1 files changed, 580 insertions, 314 deletions
diff --git a/ext/pcre/pcrelib/pcre_exec.c b/ext/pcre/pcrelib/pcre_exec.c index a67855bfef..890e0f731a 100644 --- a/ext/pcre/pcrelib/pcre_exec.c +++ b/ext/pcre/pcrelib/pcre_exec.c @@ -6,7 +6,7 @@  and semantics are as close as possible to those of the Perl 5 language.                         Written by Philip Hazel -           Copyright (c) 1997-2007 University of Cambridge +           Copyright (c) 1997-2006 University of Cambridge  -----------------------------------------------------------------------------  Redistribution and use in source and binary forms, with or without @@ -42,25 +42,22 @@ POSSIBILITY OF SUCH DAMAGE.  pattern matching using an NFA algorithm, trying to mimic Perl as closely as  possible. There are also some static supporting functions. */ -#define NLBLOCK md           /* The block containing newline information */ -#include "pcre_internal.h" +#define NLBLOCK md             /* Block containing newline information */ +#define PSSTART start_subject  /* Field containing processed string start */ +#define PSEND   end_subject    /* Field containing processed string end */ +#include "pcre_internal.h" -/* Structure for building a chain of data that actually lives on the -stack, for holding the values of the subject pointer at the start of each -subpattern, so as to detect when an empty string has been matched by a -subpattern - to break infinite loops. When NO_RECURSE is set, these blocks -are on the heap, not on the stack. */ +/* The chain of eptrblocks for tail recursions uses memory in stack workspace, +obtained at top level, the size of which is defined by EPTR_WORK_SIZE. */ -typedef struct eptrblock { -  struct eptrblock *epb_prev; -  USPTR epb_saved_eptr; -} eptrblock; +#define EPTR_WORK_SIZE (1000)  /* Flag bits for the match() function */ -#define match_condassert   0x01    /* Called to check a condition assertion */ -#define match_isgroup      0x02    /* Set if start of bracketed group */ +#define match_condassert     0x01  /* Called to check a condition assertion */ +#define match_cbegroup       0x02  /* Could-be-empty unlimited repeat group */ +#define match_tail_recursed  0x04  /* Tail recursive call */  /* Non-error returns from the match() function. Error returns are externally  defined PCRE_ERROR_xxx codes, which are all negative. */ @@ -101,7 +98,7 @@ Returns:     nothing  static void  pchars(const uschar *p, int length, BOOL is_subject, match_data *md)  { -int c; +unsigned int c;  if (is_subject && length > md->end_subject - p) length = md->end_subject - p;  while (length-- > 0)    if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c); @@ -291,7 +288,6 @@ typedef struct heapframe {    BOOL Xcur_is_word;    BOOL Xcondition; -  BOOL Xminimize;    BOOL Xprev_is_word;    unsigned long int Xoriginal_ims; @@ -303,11 +299,10 @@ typedef struct heapframe {    int Xprop_category;    int Xprop_chartype;    int Xprop_script; -  int *Xprop_test_variable;  #endif    int Xctype; -  int Xfc; +  unsigned int Xfc;    int Xfi;    int Xlength;    int Xmax; @@ -340,10 +335,7 @@ typedef struct heapframe {  *         Match from current position            *  *************************************************/ -/* On entry ecode points to the first opcode, and eptr to the first character -in the subject string, while eptrb holds the value of eptr at the start of the -last bracketed group - used for breaking infinite loops matching zero-length -strings. This function is called recursively in many circumstances. Whenever it +/* This function is called recursively in many circumstances. Whenever it  returns a negative (error) response, the outer incarnation must also return the  same response. @@ -353,8 +345,8 @@ performance. Tests using gcc on a SPARC disproved this; in the first case, it  made performance worse.  Arguments: -   eptr        pointer in subject -   ecode       position in code +   eptr        pointer to current character in subject +   ecode       pointer to current position in compiled code     offset_top  current top pointer     md          pointer to "static" info for the match     ims         current /i, /m, and /s options @@ -362,7 +354,9 @@ Arguments:                   brackets - for testing for empty matches     flags       can contain                   match_condassert - this is an assertion condition -                 match_isgroup - this is the start of a bracketed group +                 match_cbegroup - this is the start of an unlimited repeat +                   group that can match an empty string +                 match_tail_recursed - this is a tail_recursed group     rdepth      the recursion depth  Returns:       MATCH_MATCH if matched            )  these values are >= 0 @@ -377,14 +371,16 @@ match(REGISTER USPTR eptr, REGISTER const uschar *ecode,    int flags, unsigned int rdepth)  {  /* These variables do not need to be preserved over recursion in this function, -so they can be ordinary variables in all cases. Mark them with "register" -because they are used a lot in loops. */ +so they can be ordinary variables in all cases. Mark some of them with +"register" because they are used a lot in loops. */  register int  rrc;         /* Returns from recursive calls */  register int  i;           /* Used for loops not involving calls to RMATCH() */ -register unsigned int  c;  /* Character values not kept over RMATCH() calls */ +register unsigned int c;   /* Character values not kept over RMATCH() calls */  register BOOL utf8;        /* Local copy of UTF-8 flag for speed */ +BOOL minimize, possessive; /* Quantifier options */ +  /* When recursion is not being used, all "local" variables that have to be  preserved over calls to RMATCH() are part of a "frame" which is obtained from  heap storage. Set up the top-level frame here; others are obtained from the @@ -434,7 +430,6 @@ HEAP_RECURSE:  #define cur_is_word        frame->Xcur_is_word  #define condition          frame->Xcondition -#define minimize           frame->Xminimize  #define prev_is_word       frame->Xprev_is_word  #define original_ims       frame->Xoriginal_ims @@ -446,7 +441,6 @@ HEAP_RECURSE:  #define prop_category      frame->Xprop_category  #define prop_chartype      frame->Xprop_chartype  #define prop_script        frame->Xprop_script -#define prop_test_variable frame->Xprop_test_variable  #endif  #define ctype              frame->Xctype @@ -470,7 +464,7 @@ HEAP_RECURSE:  get preserved during recursion in the normal way. In this environment, fi and  i, and fc and c, can be the same variables. */ -#else +#else         /* NO_RECURSE not defined */  #define fi i  #define fc c @@ -489,7 +483,6 @@ recursion_info new_recursive;      /* within blocks below are for variables  */                                     /* that do not have to be preserved over  */  BOOL cur_is_word;                  /* a recursive call to RMATCH().          */  BOOL condition; -BOOL minimize;  BOOL prev_is_word;  unsigned long int original_ims; @@ -501,7 +494,6 @@ int prop_fail_result;  int prop_category;  int prop_chartype;  int prop_script; -int *prop_test_variable;  #endif  int ctype; @@ -516,7 +508,7 @@ int save_offset1, save_offset2, save_offset3;  int stacksave[REC_STACK_SAVE_MAX];  eptrblock newptrb; -#endif +#endif     /* NO_RECURSE */  /* These statements are here to stop the compiler complaining about unitialized  variables. */ @@ -524,9 +516,9 @@ variables. */  #ifdef SUPPORT_UCP  prop_value = 0;  prop_fail_result = 0; -prop_test_variable = NULL;  #endif +  /* This label is used for tail recursion, which is used in a few cases even  when NO_RECURSE is not defined, in order to reduce the amount of stack that is  used. Thanks to Ian Taylor for noticing this possibility and sending the @@ -556,24 +548,34 @@ utf8 = md->utf8;       /* Local copy of the flag */  utf8 = FALSE;  #endif -/* At the start of a bracketed group, add the current subject pointer to the -stack of such pointers, to be re-instated at the end of the group when we hit -the closing ket. When match() is called in other circumstances, we don't add to -this stack. */ +/* At the start of a group with an unlimited repeat that may match an empty +string, the match_cbegroup flag is set. When this is the case, add the current +subject pointer to the chain of such remembered pointers, to be checked when we +hit the closing ket, in order to break infinite loops that match no characters. +When match() is called in other circumstances, don't add to the chain. If this +is a tail recursion, use a block from the workspace, as the one on the stack is +already used. */ -if ((flags & match_isgroup) != 0) +if ((flags & match_cbegroup) != 0)    { -  newptrb.epb_prev = eptrb; -  newptrb.epb_saved_eptr = eptr; -  eptrb = &newptrb; +  eptrblock *p; +  if ((flags & match_tail_recursed) != 0) +    { +    if (md->eptrn >= EPTR_WORK_SIZE) RRETURN(PCRE_ERROR_NULLWSLIMIT); +    p = md->eptrchain + md->eptrn++; +    } +  else p = &newptrb; +  p->epb_saved_eptr = eptr; +  p->epb_prev = eptrb; +  eptrb = p;    } -/* Now start processing the operations. */ +/* Now start processing the opcodes. */  for (;;)    { +  minimize = possessive = FALSE;    op = *ecode; -  minimize = FALSE;    /* For partial matching, remember if we ever hit the end of the subject after    matching at least one subject character. */ @@ -583,33 +585,30 @@ for (;;)        eptr > md->start_match)      md->hitend = TRUE; -  /* Opening capturing bracket. If there is space in the offset vector, save -  the current subject position in the working slot at the top of the vector. We -  mustn't change the current values of the data slot, because they may be set -  from a previous iteration of this group, and be referred to by a reference -  inside the group. - -  If the bracket fails to match, we need to restore this value and also the -  values of the final offsets, in case they were set by a previous iteration of -  the same bracket. - -  If there isn't enough space in the offset vector, treat this as if it were a -  non-capturing bracket. Don't worry about setting the flag for the error case -  here; that is handled in the code for KET. */ - -  if (op > OP_BRA) +  switch(op)      { -    number = op - OP_BRA; - -    /* For extended extraction brackets (large number), we have to fish out the -    number from a dummy opcode at the start. */ - -    if (number > EXTRACT_BASIC_MAX) -      number = GET2(ecode, 2+LINK_SIZE); +    /* Handle a capturing bracket. If there is space in the offset vector, save +    the current subject position in the working slot at the top of the vector. +    We mustn't change the current values of the data slot, because they may be +    set from a previous iteration of this group, and be referred to by a +    reference inside the group. + +    If the bracket fails to match, we need to restore this value and also the +    values of the final offsets, in case they were set by a previous iteration +    of the same bracket. + +    If there isn't enough space in the offset vector, treat this as if it were +    a non-capturing bracket. Don't worry about setting the flag for the error +    case here; that is handled in the code for KET. */ + +    case OP_CBRA: +    case OP_SCBRA: +    number = GET2(ecode, 1+LINK_SIZE);      offset = number << 1;  #ifdef DEBUG -    printf("start bracket %d subject=", number); +    printf("start bracket %d\n", number); +    printf("subject=");      pchars(eptr, 16, TRUE, md);      printf("\n");  #endif @@ -624,10 +623,11 @@ for (;;)        DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));        md->offset_vector[md->offset_end - number] = eptr - md->start_subject; +      flags = (op == OP_SCBRA)? match_cbegroup : 0;        do          { -        RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, -          match_isgroup); +        RMATCH(rrc, eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, +          ims, eptrb, flags);          if (rrc != MATCH_NOMATCH) RRETURN(rrc);          md->capture_last = save_capture_last;          ecode += GET(ecode, 1); @@ -643,39 +643,35 @@ for (;;)        RRETURN(MATCH_NOMATCH);        } -    /* Insufficient room for saving captured contents */ +    /* Insufficient room for saving captured contents. Treat as a non-capturing +    bracket. */ -    else op = OP_BRA; -    } +    DPRINTF(("insufficient capture room: treat as non-capturing\n")); -  /* Other types of node can be handled by a switch */ - -  switch(op) -    { -    case OP_BRA:     /* Non-capturing bracket: optimized */ -    DPRINTF(("start bracket 0\n")); - -    /* Loop for all the alternatives */ +    /* Non-capturing bracket. Loop for all the alternatives. When we get to the +    final alternative within the brackets, we would return the result of a +    recursive call to match() whatever happened. We can reduce stack usage by +    turning this into a tail recursion. */ +    case OP_BRA: +    case OP_SBRA: +    DPRINTF(("start non-capturing bracket\n")); +    flags = (op >= OP_SBRA)? match_cbegroup : 0;      for (;;)        { -      /* When we get to the final alternative within the brackets, we would -      return the result of a recursive call to match() whatever happened. We -      can reduce stack usage by turning this into a tail recursion. */ -        if (ecode[GET(ecode, 1)] != OP_ALT) -       { -       ecode += 1 + LINK_SIZE; -       flags = match_isgroup; -       DPRINTF(("bracket 0 tail recursion\n")); -       goto TAIL_RECURSE; -       } +        { +        ecode += _pcre_OP_lengths[*ecode]; +        flags |= match_tail_recursed; +        DPRINTF(("bracket 0 tail recursion\n")); +        goto TAIL_RECURSE; +        }        /* For non-final alternatives, continue the loop for a NOMATCH result;        otherwise return. */ -      RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, -        match_isgroup); +      RMATCH(rrc, eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims, +        eptrb, flags);        if (rrc != MATCH_NOMATCH) RRETURN(rrc);        ecode += GET(ecode, 1);        } @@ -688,54 +684,72 @@ for (;;)      obeyed, we can use tail recursion to avoid using another stack frame. */      case OP_COND: -    if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */ +    case OP_SCOND: +    if (ecode[LINK_SIZE+1] == OP_RREF)         /* Recursion test */ +      { +      offset = GET2(ecode, LINK_SIZE + 2);     /* Recursion group number*/ +      condition = md->recursive != NULL && +        (offset == RREF_ANY || offset == md->recursive->group_num); +      ecode += condition? 3 : GET(ecode, 1); +      } + +    else if (ecode[LINK_SIZE+1] == OP_CREF)    /* Group used test */        {        offset = GET2(ecode, LINK_SIZE+2) << 1;  /* Doubled ref number */ -      condition = (offset == CREF_RECURSE * 2)? -        (md->recursive != NULL) : -        (offset < offset_top && md->offset_vector[offset] >= 0); -      ecode += condition? (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1)); -      flags = match_isgroup; -      goto TAIL_RECURSE; +      condition = offset < offset_top && md->offset_vector[offset] >= 0; +      ecode += condition? 3 : GET(ecode, 1); +      } + +    else if (ecode[LINK_SIZE+1] == OP_DEF)     /* DEFINE - always false */ +      { +      condition = FALSE; +      ecode += GET(ecode, 1);        }      /* The condition is an assertion. Call match() to evaluate it - setting -    the final argument TRUE causes it to stop at the end of an assertion. */ +    the final argument match_condassert causes it to stop at the end of an +    assertion. */      else        {        RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, -          match_condassert | match_isgroup); +          match_condassert);        if (rrc == MATCH_MATCH)          { -        ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2); +        condition = TRUE; +        ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);          while (*ecode == OP_ALT) ecode += GET(ecode, 1);          }        else if (rrc != MATCH_NOMATCH)          {          RRETURN(rrc);         /* Need braces because of following else */          } -      else ecode += GET(ecode, 1); +      else +        { +        condition = FALSE; +        ecode += GET(ecode, 1); +        } +      } -      /* We are now at the branch that is to be obeyed. As there is only one, -      we can use tail recursion to avoid using another stack frame. */ +    /* We are now at the branch that is to be obeyed. As there is only one, +    we can use tail recursion to avoid using another stack frame. If the second +    alternative doesn't exist, we can just plough on. */ +    if (condition || *ecode == OP_ALT) +      {        ecode += 1 + LINK_SIZE; -      flags = match_isgroup; +      flags = match_tail_recursed | ((op == OP_SCOND)? match_cbegroup : 0);        goto TAIL_RECURSE;        } -    /* Control never reaches here */ - -    /* Skip over conditional reference or large extraction number data if -    encountered. */ - -    case OP_CREF: -    case OP_BRANUMBER: -    ecode += 3; +    else +      { +      ecode += 1 + LINK_SIZE; +      }      break; -    /* End of the pattern. If we are in a recursion, we should restore the -    offsets appropriately and continue from after the call. */ + +    /* End of the pattern. If we are in a top-level recursion, we should +    restore the offsets appropriately and continue from after the call. */      case OP_END:      if (md->recursive != NULL && md->recursive->group_num == 0) @@ -777,8 +791,7 @@ for (;;)      case OP_ASSERTBACK:      do        { -      RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, -        match_isgroup); +      RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0);        if (rrc == MATCH_MATCH) break;        if (rrc != MATCH_NOMATCH) RRETURN(rrc);        ecode += GET(ecode, 1); @@ -804,8 +817,7 @@ for (;;)      case OP_ASSERTBACK_NOT:      do        { -      RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, -        match_isgroup); +      RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0);        if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);        if (rrc != MATCH_NOMATCH) RRETURN(rrc);        ecode += GET(ecode,1); @@ -826,8 +838,8 @@ for (;;)  #ifdef SUPPORT_UTF8      if (utf8)        { -      c = GET(ecode,1); -      for (i = 0; i < c; i++) +      i = GET(ecode, 1); +      while (i-- > 0)          {          eptr--;          if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH); @@ -840,7 +852,7 @@ for (;;)      /* No UTF-8 support, or not in UTF-8 mode: count is byte count */        { -      eptr -= GET(ecode,1); +      eptr -= GET(ecode, 1);        if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);        } @@ -897,13 +909,8 @@ for (;;)      case OP_RECURSE:        {        callpat = md->start_code + GET(ecode, 1); -      new_recursive.group_num = *callpat - OP_BRA; - -      /* For extended extraction brackets (large number), we have to fish out -      the number from a dummy opcode at the start. */ - -      if (new_recursive.group_num > EXTRACT_BASIC_MAX) -        new_recursive.group_num = GET2(callpat, 2+LINK_SIZE); +      new_recursive.group_num = (callpat == md->start_code)? 0 : +        GET2(callpat, 1 + LINK_SIZE);        /* Add to "recursing stack" */ @@ -936,10 +943,11 @@ for (;;)        restore the offset and recursion data. */        DPRINTF(("Recursing into group %d\n", new_recursive.group_num)); +      flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;        do          { -        RMATCH(rrc, eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims, -            eptrb, match_isgroup); +        RMATCH(rrc, eptr, callpat + _pcre_OP_lengths[*callpat], offset_top, +          md, ims, eptrb, flags);          if (rrc == MATCH_MATCH)            {            DPRINTF(("Recursion matched\n")); @@ -983,7 +991,7 @@ for (;;)      do        {        RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, -        eptrb, match_isgroup); +        eptrb, 0);        if (rrc == MATCH_MATCH) break;        if (rrc != MATCH_NOMATCH) RRETURN(rrc);        ecode += GET(ecode,1); @@ -997,7 +1005,7 @@ for (;;)      /* Continue as from after the assertion, updating the offsets high water      mark, since extracts may have been taken. */ -    do ecode += GET(ecode,1); while (*ecode == OP_ALT); +    do ecode += GET(ecode, 1); while (*ecode == OP_ALT);      offset_top = md->end_offset_top;      eptr = md->end_match_ptr; @@ -1031,15 +1039,15 @@ for (;;)        RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0);        if (rrc != MATCH_NOMATCH) RRETURN(rrc);        ecode = prev; -      flags = match_isgroup; +      flags = match_tail_recursed;        goto TAIL_RECURSE;        }      else  /* OP_KETRMAX */        { -      RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup); +      RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_cbegroup);        if (rrc != MATCH_NOMATCH) RRETURN(rrc);        ecode += 1 + LINK_SIZE; -      flags = 0; +      flags = match_tail_recursed;        goto TAIL_RECURSE;        }      /* Control never gets here */ @@ -1060,38 +1068,44 @@ for (;;)      case OP_BRAZERO:        {        next = ecode+1; -      RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, match_isgroup); +      RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, 0);        if (rrc != MATCH_NOMATCH) RRETURN(rrc);        do next += GET(next,1); while (*next == OP_ALT); -      ecode = next + 1+LINK_SIZE; +      ecode = next + 1 + LINK_SIZE;        }      break;      case OP_BRAMINZERO:        {        next = ecode+1; -      do next += GET(next,1); while (*next == OP_ALT); -      RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, -        match_isgroup); +      do next += GET(next, 1); while (*next == OP_ALT); +      RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);        if (rrc != MATCH_NOMATCH) RRETURN(rrc);        ecode++;        }      break; -    /* End of a group, repeated or non-repeating. If we are at the end of -    an assertion "group", stop matching and return MATCH_MATCH, but record the -    current high water mark for use by positive assertions. Do this also -    for the "once" (not-backup up) groups. */ +    /* End of a group, repeated or non-repeating. */      case OP_KET:      case OP_KETRMIN:      case OP_KETRMAX:      prev = ecode - GET(ecode, 1); -    saved_eptr = eptrb->epb_saved_eptr; -    /* Back up the stack of bracket start pointers. */ +    /* If this was a group that remembered the subject start, in order to break +    infinite repeats of empty string matches, retrieve the subject start from +    the chain. Otherwise, set it NULL. */ + +    if (*prev >= OP_SBRA) +      { +      saved_eptr = eptrb->epb_saved_eptr;   /* Value at start of group */ +      eptrb = eptrb->epb_prev;              /* Backup to previous group */ +      } +    else saved_eptr = NULL; -    eptrb = eptrb->epb_prev; +    /* If we are at the end of an assertion group, stop matching and return +    MATCH_MATCH, but record the current high water mark for use by positive +    assertions. Do this also for the "once" (atomic) groups. */      if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||          *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT || @@ -1102,18 +1116,15 @@ for (;;)        RRETURN(MATCH_MATCH);        } -    /* In all other cases except a conditional group we have to check the -    group number back at the start and if necessary complete handling an -    extraction by setting the offsets and bumping the high water mark. */ +    /* For capturing groups we have to check the group number back at the start +    and if necessary complete handling an extraction by setting the offsets and +    bumping the high water mark. Note that whole-pattern recursion is coded as +    a recurse into group 0, so it won't be picked up here. Instead, we catch it +    when the OP_END is reached. Other recursion is handled here. */ -    if (*prev != OP_COND) +    if (*prev == OP_CBRA || *prev == OP_SCBRA)        { -      number = *prev - OP_BRA; - -      /* For extended extraction brackets (large number), we have to fish out -      the number from a dummy opcode at the start. */ - -      if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE); +      number = GET2(prev, 1+LINK_SIZE);        offset = number << 1;  #ifdef DEBUG @@ -1121,42 +1132,34 @@ for (;;)        printf("\n");  #endif -      /* Test for a numbered group. This includes groups called as a result -      of recursion. Note that whole-pattern recursion is coded as a recurse -      into group 0, so it won't be picked up here. Instead, we catch it when -      the OP_END is reached. */ - -      if (number > 0) +      md->capture_last = number; +      if (offset >= md->offset_max) md->offset_overflow = TRUE; else          { -        md->capture_last = number; -        if (offset >= md->offset_max) md->offset_overflow = TRUE; else -          { -          md->offset_vector[offset] = -            md->offset_vector[md->offset_end - number]; -          md->offset_vector[offset+1] = eptr - md->start_subject; -          if (offset_top <= offset) offset_top = offset + 2; -          } +        md->offset_vector[offset] = +          md->offset_vector[md->offset_end - number]; +        md->offset_vector[offset+1] = eptr - md->start_subject; +        if (offset_top <= offset) offset_top = offset + 2; +        } -        /* Handle a recursively called group. Restore the offsets -        appropriately and continue from after the call. */ +      /* Handle a recursively called group. Restore the offsets +      appropriately and continue from after the call. */ -        if (md->recursive != NULL && md->recursive->group_num == number) -          { -          recursion_info *rec = md->recursive; -          DPRINTF(("Recursion (%d) succeeded - continuing\n", number)); -          md->recursive = rec->prevrec; -          md->start_match = rec->save_start; -          memcpy(md->offset_vector, rec->offset_save, -            rec->saved_max * sizeof(int)); -          ecode = rec->after_call; -          ims = original_ims; -          break; -          } +      if (md->recursive != NULL && md->recursive->group_num == number) +        { +        recursion_info *rec = md->recursive; +        DPRINTF(("Recursion (%d) succeeded - continuing\n", number)); +        md->recursive = rec->prevrec; +        md->start_match = rec->save_start; +        memcpy(md->offset_vector, rec->offset_save, +          rec->saved_max * sizeof(int)); +        ecode = rec->after_call; +        ims = original_ims; +        break;          }        } -    /* Reset the value of the ims flags, in case they got changed during -    the group. */ +    /* For both capturing and non-capturing groups, reset the value of the ims +    flags, in case they got changed during the group. */      ims = original_ims;      DPRINTF(("ims reset to %02lx\n", ims)); @@ -1177,20 +1180,22 @@ for (;;)      preceding bracket, in the appropriate order. In the second case, we can use      tail recursion to avoid using another stack frame. */ +    flags = (*prev >= OP_SBRA)? match_cbegroup : 0; +      if (*ecode == OP_KETRMIN)        {        RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);        if (rrc != MATCH_NOMATCH) RRETURN(rrc);        ecode = prev; -      flags = match_isgroup; +      flags |= match_tail_recursed;        goto TAIL_RECURSE;        }      else  /* OP_KETRMAX */        { -      RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup); +      RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, flags);        if (rrc != MATCH_NOMATCH) RRETURN(rrc);        ecode += 1 + LINK_SIZE; -      flags = 0; +      flags = match_tail_recursed;        goto TAIL_RECURSE;        }      /* Control never gets here */ @@ -1202,9 +1207,7 @@ for (;;)      if ((ims & PCRE_MULTILINE) != 0)        {        if (eptr != md->start_subject && -          (eptr == md->end_subject || -           eptr < md->start_subject + md->nllen || -           !IS_NEWLINE(eptr - md->nllen))) +          (eptr == md->end_subject || !WAS_NEWLINE(eptr)))          RRETURN(MATCH_NOMATCH);        ecode++;        break; @@ -1244,7 +1247,7 @@ for (;;)        if (!md->endonly)          {          if (eptr != md->end_subject && -            (eptr != md->end_subject - md->nllen || !IS_NEWLINE(eptr))) +            (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))            RRETURN(MATCH_NOMATCH);          ecode++;          break; @@ -1263,7 +1266,7 @@ for (;;)      case OP_EODN:      if (eptr != md->end_subject && -        (eptr != md->end_subject - md->nllen || !IS_NEWLINE(eptr))) +        (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))        RRETURN(MATCH_NOMATCH);      ecode++;      break; @@ -1319,8 +1322,7 @@ for (;;)      case OP_ANY:      if ((ims & PCRE_DOTALL) == 0)        { -      if (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)) -        RRETURN(MATCH_NOMATCH); +      if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);        }      if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);      if (utf8) @@ -1414,6 +1416,26 @@ for (;;)      ecode++;      break; +    case OP_ANYNL: +    if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); +    GETCHARINCTEST(c, eptr); +    switch(c) +      { +      default: RRETURN(MATCH_NOMATCH); +      case 0x000d: +      if (eptr < md->end_subject && *eptr == 0x0a) eptr++; +      break; +      case 0x000a: +      case 0x000b: +      case 0x000c: +      case 0x0085: +      case 0x2028: +      case 0x2029: +      break; +      } +    ecode++; +    break; +  #ifdef SUPPORT_UCP      /* Check the next character by Unicode property. We will get here only      if the support is in the binary; otherwise a compile-time error occurs. */ @@ -1456,7 +1478,6 @@ for (;;)          default:          RRETURN(PCRE_ERROR_INTERNAL); -        break;          }        ecode += 3; @@ -1926,7 +1947,7 @@ for (;;)        else          { -        int dc; +        unsigned int dc;          GETCHARINC(dc, eptr);          ecode += length; @@ -1953,13 +1974,17 @@ for (;;)        }      break; -    /* Match a single character repeatedly; different opcodes share code. */ +    /* Match a single character repeatedly. */      case OP_EXACT:      min = max = GET2(ecode, 1);      ecode += 3;      goto REPEATCHAR; +    case OP_POSUPTO: +    possessive = TRUE; +    /* Fall through */ +      case OP_UPTO:      case OP_MINUPTO:      min = 0; @@ -1968,6 +1993,27 @@ for (;;)      ecode += 3;      goto REPEATCHAR; +    case OP_POSSTAR: +    possessive = TRUE; +    min = 0; +    max = INT_MAX; +    ecode++; +    goto REPEATCHAR; + +    case OP_POSPLUS: +    possessive = TRUE; +    min = 1; +    max = INT_MAX; +    ecode++; +    goto REPEATCHAR; + +    case OP_POSQUERY: +    possessive = TRUE; +    min = 0; +    max = 1; +    ecode++; +    goto REPEATCHAR; +      case OP_STAR:      case OP_MINSTAR:      case OP_PLUS: @@ -2003,10 +2049,9 @@ for (;;)          uschar occhars[8];  #ifdef SUPPORT_UCP -        int othercase; +        unsigned int othercase;          if ((ims & PCRE_CASELESS) != 0 && -            (othercase = _pcre_ucp_othercase(fc)) >= 0 && -             othercase >= 0) +            (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR)            oclength = _pcre_ord2utf8(othercase, occhars);  #endif  /* SUPPORT_UCP */ @@ -2042,7 +2087,8 @@ for (;;)              }            /* Control never gets here */            } -        else + +        else  /* Maximize */            {            pp = eptr;            for (i = min; i < max; i++) @@ -2056,6 +2102,8 @@ for (;;)                eptr += oclength;                }              } + +          if (possessive) continue;            while (eptr >= pp)             {             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); @@ -2110,7 +2158,7 @@ for (;;)            }          /* Control never gets here */          } -      else +      else  /* Maximize */          {          pp = eptr;          for (i = min; i < max; i++) @@ -2118,6 +2166,7 @@ for (;;)            if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;            eptr++;            } +        if (possessive) continue;          while (eptr >= pp)            {            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); @@ -2146,7 +2195,7 @@ for (;;)            }          /* Control never gets here */          } -      else +      else  /* Maximize */          {          pp = eptr;          for (i = min; i < max; i++) @@ -2154,6 +2203,7 @@ for (;;)            if (eptr >= md->end_subject || fc != *eptr) break;            eptr++;            } +        if (possessive) continue;          while (eptr >= pp)            {            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); @@ -2206,6 +2256,34 @@ for (;;)      ecode += 3;      goto REPEATNOTCHAR; +    case OP_NOTPOSSTAR: +    possessive = TRUE; +    min = 0; +    max = INT_MAX; +    ecode++; +    goto REPEATNOTCHAR; + +    case OP_NOTPOSPLUS: +    possessive = TRUE; +    min = 1; +    max = INT_MAX; +    ecode++; +    goto REPEATNOTCHAR; + +    case OP_NOTPOSQUERY: +    possessive = TRUE; +    min = 0; +    max = 1; +    ecode++; +    goto REPEATNOTCHAR; + +    case OP_NOTPOSUPTO: +    possessive = TRUE; +    min = 0; +    max = GET2(ecode, 1); +    ecode += 3; +    goto REPEATNOTCHAR; +      case OP_NOTSTAR:      case OP_NOTMINSTAR:      case OP_NOTPLUS: @@ -2245,7 +2323,7 @@ for (;;)        /* UTF-8 mode */        if (utf8)          { -        register int d; +        register unsigned int d;          for (i = 1; i <= min; i++)            {            GETCHARINC(d, eptr); @@ -2270,7 +2348,7 @@ for (;;)          /* UTF-8 mode */          if (utf8)            { -          register int d; +          register unsigned int d;            for (fi = min;; fi++)              {              RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); @@ -2306,7 +2384,7 @@ for (;;)          /* UTF-8 mode */          if (utf8)            { -          register int d; +          register unsigned int d;            for (i = min; i < max; i++)              {              int len = 1; @@ -2316,7 +2394,8 @@ for (;;)              if (fc == d) break;              eptr += len;              } -          for(;;) +        if (possessive) continue; +        for(;;)              {              RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);              if (rrc != MATCH_NOMATCH) RRETURN(rrc); @@ -2333,6 +2412,7 @@ for (;;)              if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;              eptr++;              } +          if (possessive) continue;            while (eptr >= pp)              {              RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); @@ -2354,7 +2434,7 @@ for (;;)        /* UTF-8 mode */        if (utf8)          { -        register int d; +        register unsigned int d;          for (i = 1; i <= min; i++)            {            GETCHARINC(d, eptr); @@ -2377,7 +2457,7 @@ for (;;)          /* UTF-8 mode */          if (utf8)            { -          register int d; +          register unsigned int d;            for (fi = min;; fi++)              {              RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); @@ -2412,7 +2492,7 @@ for (;;)          /* UTF-8 mode */          if (utf8)            { -          register int d; +          register unsigned int d;            for (i = min; i < max; i++)              {              int len = 1; @@ -2421,6 +2501,7 @@ for (;;)              if (fc == d) break;              eptr += len;              } +          if (possessive) continue;            for(;;)              {              RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); @@ -2438,6 +2519,7 @@ for (;;)              if (eptr >= md->end_subject || fc == *eptr) break;              eptr++;              } +          if (possessive) continue;            while (eptr >= pp)              {              RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); @@ -2469,6 +2551,34 @@ for (;;)      ecode += 3;      goto REPEATTYPE; +    case OP_TYPEPOSSTAR: +    possessive = TRUE; +    min = 0; +    max = INT_MAX; +    ecode++; +    goto REPEATTYPE; + +    case OP_TYPEPOSPLUS: +    possessive = TRUE; +    min = 1; +    max = INT_MAX; +    ecode++; +    goto REPEATTYPE; + +    case OP_TYPEPOSQUERY: +    possessive = TRUE; +    min = 0; +    max = 1; +    ecode++; +    goto REPEATTYPE; + +    case OP_TYPEPOSUPTO: +    possessive = TRUE; +    min = 0; +    max = GET2(ecode, 1); +    ecode += 3; +    goto REPEATTYPE; +      case OP_TYPESTAR:      case OP_TYPEMINSTAR:      case OP_TYPEPLUS: @@ -2571,7 +2681,6 @@ for (;;)            default:            RRETURN(PCRE_ERROR_INTERNAL); -          break;            }          } @@ -2611,9 +2720,7 @@ for (;;)          for (i = 1; i <= min; i++)            {            if (eptr >= md->end_subject || -               ((ims & PCRE_DOTALL) == 0 && -                 eptr <= md->end_subject - md->nllen && -                 IS_NEWLINE(eptr))) +               ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))              RRETURN(MATCH_NOMATCH);            eptr++;            while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; @@ -2624,6 +2731,28 @@ for (;;)          eptr += min;          break; +        case OP_ANYNL: +        for (i = 1; i <= min; i++) +          { +          if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); +          GETCHARINC(c, eptr); +          switch(c) +            { +            default: RRETURN(MATCH_NOMATCH); +            case 0x000d: +            if (eptr < md->end_subject && *eptr == 0x0a) eptr++; +            break; +            case 0x000a: +            case 0x000b: +            case 0x000c: +            case 0x0085: +            case 0x2028: +            case 0x2029: +            break; +            } +          } +        break; +          case OP_NOT_DIGIT:          for (i = 1; i <= min; i++)            { @@ -2692,7 +2821,8 @@ for (;;)  #endif     /* SUPPORT_UTF8 */        /* Code for the non-UTF-8 case for minimum matching of operators other -      than OP_PROP and OP_NOTPROP. */ +      than OP_PROP and OP_NOTPROP. We can assume that there are the minimum +      number of bytes present, as this was tested above. */        switch(ctype)          { @@ -2701,8 +2831,7 @@ for (;;)            {            for (i = 1; i <= min; i++)              { -            if (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)) -              RRETURN(MATCH_NOMATCH); +            if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);              eptr++;              }            } @@ -2713,6 +2842,28 @@ for (;;)          eptr += min;          break; +        /* Because of the CRLF case, we can't assume the minimum number of +        bytes are present in this case. */ + +        case OP_ANYNL: +        for (i = 1; i <= min; i++) +          { +          if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); +          switch(*eptr++) +            { +            default: RRETURN(MATCH_NOMATCH); +            case 0x000d: +            if (eptr < md->end_subject && *eptr == 0x0a) eptr++; +            break; +            case 0x000a: +            case 0x000b: +            case 0x000c: +            case 0x0085: +            break; +            } +          } +        break; +          case OP_NOT_DIGIT:          for (i = 1; i <= min; i++)            if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH); @@ -2774,7 +2925,7 @@ for (;;)              GETCHARINC(c, eptr);              if (prop_fail_result) RRETURN(MATCH_NOMATCH);              } -          break; +          /* Control never gets here */            case PT_LAMP:            for (fi = min;; fi++) @@ -2789,7 +2940,7 @@ for (;;)                   prop_chartype == ucp_Lt) == prop_fail_result)                RRETURN(MATCH_NOMATCH);              } -          break; +          /* Control never gets here */            case PT_GC:            for (fi = min;; fi++) @@ -2802,7 +2953,7 @@ for (;;)              if ((prop_category == prop_value) == prop_fail_result)                RRETURN(MATCH_NOMATCH);              } -          break; +          /* Control never gets here */            case PT_PC:            for (fi = min;; fi++) @@ -2815,7 +2966,7 @@ for (;;)              if ((prop_chartype == prop_value) == prop_fail_result)                RRETURN(MATCH_NOMATCH);              } -          break; +          /* Control never gets here */            case PT_SC:            for (fi = min;; fi++) @@ -2828,11 +2979,10 @@ for (;;)              if ((prop_script == prop_value) == prop_fail_result)                RRETURN(MATCH_NOMATCH);              } -          break; +          /* Control never gets here */            default:            RRETURN(PCRE_ERROR_INTERNAL); -          break;            }          } @@ -2876,7 +3026,7 @@ for (;;)            if (rrc != MATCH_NOMATCH) RRETURN(rrc);            if (fi >= max || eptr >= md->end_subject ||                 (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 && -                eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr))) +                IS_NEWLINE(eptr)))              RRETURN(MATCH_NOMATCH);            GETCHARINC(c, eptr); @@ -2888,6 +3038,23 @@ for (;;)              case OP_ANYBYTE:              break; +            case OP_ANYNL: +            switch(c) +              { +              default: RRETURN(MATCH_NOMATCH); +              case 0x000d: +              if (eptr < md->end_subject && *eptr == 0x0a) eptr++; +              break; +              case 0x000a: +              case 0x000b: +              case 0x000c: +              case 0x0085: +              case 0x2028: +              case 0x2029: +              break; +              } +            break; +              case OP_NOT_DIGIT:              if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)                RRETURN(MATCH_NOMATCH); @@ -2932,8 +3099,7 @@ for (;;)            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);            if (rrc != MATCH_NOMATCH) RRETURN(rrc);            if (fi >= max || eptr >= md->end_subject || -               ((ims & PCRE_DOTALL) == 0 && -                 eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr))) +               ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))              RRETURN(MATCH_NOMATCH);            c = *eptr++; @@ -2945,6 +3111,21 @@ for (;;)              case OP_ANYBYTE:              break; +            case OP_ANYNL: +            switch(c) +              { +              default: RRETURN(MATCH_NOMATCH); +              case 0x000d: +              if (eptr < md->end_subject && *eptr == 0x0a) eptr++; +              break; +              case 0x000a: +              case 0x000b: +              case 0x000c: +              case 0x0085: +              break; +              } +            break; +              case OP_NOT_DIGIT:              if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);              break; @@ -2977,7 +3158,7 @@ for (;;)        /* Control never gets here */        } -    /* If maximizing it is worth using inline code for speed, doing the type +    /* If maximizing, it is worth using inline code for speed, doing the type      test once at the start (i.e. keep it out of the loop). Again, keep the      UTF-8 and UCP stuff separate. */ @@ -3058,6 +3239,7 @@ for (;;)          /* eptr is now past the end of the maximum run */ +        if (possessive) continue;          for(;;)            {            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); @@ -3093,6 +3275,7 @@ for (;;)          /* eptr is now past the end of the maximum run */ +        if (possessive) continue;          for(;;)            {            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); @@ -3135,9 +3318,7 @@ for (;;)                {                for (i = min; i < max; i++)                  { -                if (eptr >= md->end_subject || -                    (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr))) -                  break; +                if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;                  eptr++;                  while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;                  } @@ -3161,9 +3342,7 @@ for (;;)                {                for (i = min; i < max; i++)                  { -                if (eptr >= md->end_subject || -                    (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr))) -                  break; +                if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;                  eptr++;                  }                break; @@ -3171,7 +3350,8 @@ for (;;)              else                {                c = max - min; -              if (c > md->end_subject - eptr) c = md->end_subject - eptr; +              if (c > (unsigned int)(md->end_subject - eptr)) +                c = md->end_subject - eptr;                eptr += c;                }              } @@ -3181,10 +3361,32 @@ for (;;)            case OP_ANYBYTE:            c = max - min; -          if (c > md->end_subject - eptr) c = md->end_subject - eptr; +          if (c > (unsigned int)(md->end_subject - eptr)) +            c = md->end_subject - eptr;            eptr += c;            break; +          case OP_ANYNL: +          for (i = min; i < max; i++) +            { +            int len = 1; +            if (eptr >= md->end_subject) break; +            GETCHARLEN(c, eptr, len); +            if (c == 0x000d) +              { +              if (++eptr >= md->end_subject) break; +              if (*eptr == 0x000a) eptr++; +              } +            else +              { +              if (c != 0x000a && c != 0x000b && c != 0x000c && +                  c != 0x0085 && c != 0x2028 && c != 0x2029) +                break; +              eptr += len; +              } +            } +          break; +            case OP_NOT_DIGIT:            for (i = min; i < max; i++)              { @@ -3257,6 +3459,7 @@ for (;;)          /* eptr is now past the end of the maximum run */ +        if (possessive) continue;          for(;;)            {            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); @@ -3277,9 +3480,7 @@ for (;;)              {              for (i = min; i < max; i++)                { -              if (eptr >= md->end_subject || -                  (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr))) -                break; +              if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;                eptr++;                }              break; @@ -3288,10 +3489,30 @@ for (;;)            case OP_ANYBYTE:            c = max - min; -          if (c > md->end_subject - eptr) c = md->end_subject - eptr; +          if (c > (unsigned int)(md->end_subject - eptr)) +            c = md->end_subject - eptr;            eptr += c;            break; +          case OP_ANYNL: +          for (i = min; i < max; i++) +            { +            if (eptr >= md->end_subject) break; +            c = *eptr; +            if (c == 0x000d) +              { +              if (++eptr >= md->end_subject) break; +              if (*eptr == 0x000a) eptr++; +              } +            else +              { +              if (c != 0x000a && c != 0x000b && c != 0x000c && c != 0x0085) +                break; +              eptr++; +              } +            } +          break; +            case OP_NOT_DIGIT:            for (i = min; i < max; i++)              { @@ -3352,6 +3573,7 @@ for (;;)          /* eptr is now past the end of the maximum run */ +        if (possessive) continue;          while (eptr >= pp)            {            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); @@ -3366,14 +3588,12 @@ for (;;)        }      /* Control never gets here */ -    /* There's been some horrible disaster. Since all codes > OP_BRA are -    for capturing brackets, and there shouldn't be any gaps between 0 and -    OP_BRA, arrival here can only mean there is something seriously wrong -    in the code above or the OP_xxx definitions. */ +    /* There's been some horrible disaster. Arrival here can only mean there is +    something seriously wrong in the code above or the OP_xxx definitions. */      default:      DPRINTF(("Unknown opcode %d\n", *ecode)); -    RRETURN(PCRE_ERROR_UNKNOWN_NODE); +    RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);      }    /* Do not stick any code in here without much thought; it is assumed @@ -3411,7 +3631,6 @@ Undefine all the macros that were defined above to handle this. */  #undef cur_is_word  #undef condition -#undef minimize  #undef prev_is_word  #undef original_ims @@ -3484,6 +3703,7 @@ BOOL startline;  BOOL firstline;  BOOL first_byte_caseless = FALSE;  BOOL req_byte_caseless = FALSE; +BOOL utf8;  match_data match_block;  match_data *md = &match_block;  const uschar *tables; @@ -3491,6 +3711,7 @@ const uschar *start_bits = NULL;  USPTR start_match = (USPTR)subject + start_offset;  USPTR end_subject;  USPTR req_byte_ptr = start_match - 1; +eptrblock eptrchain[EPTR_WORK_SIZE];  pcre_study_data internal_study;  const pcre_study_data *study; @@ -3567,7 +3788,7 @@ md->end_subject = md->start_subject + length;  end_subject = md->end_subject;  md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0; -md->utf8 = (re->options & PCRE_UTF8) != 0; +utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;  md->notbol = (options & PCRE_NOTBOL) != 0;  md->noteol = (options & PCRE_NOTEOL) != 0; @@ -3576,6 +3797,7 @@ md->partial = (options & PCRE_PARTIAL) != 0;  md->hitend = FALSE;  md->recursive = NULL;                   /* No recursion at top level */ +md->eptrchain = eptrchain;              /* Make workspace generally available */  md->lcc = tables + lcc_offset;  md->ctypes = tables + ctypes_offset; @@ -3583,26 +3805,36 @@ md->ctypes = tables + ctypes_offset;  /* Handle different types of newline. The two bits give four cases. If nothing  is set at run time, whatever was used at compile time applies. */ -switch ((((options & PCRE_NEWLINE_CRLF) == 0)? re->options : options) & -         PCRE_NEWLINE_CRLF) +switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : options) & +       PCRE_NEWLINE_BITS)    { -  default:              newline = NEWLINE; break;   /* Compile-time default */ +  case 0: newline = NEWLINE; break;   /* Compile-time default */    case PCRE_NEWLINE_CR: newline = '\r'; break;    case PCRE_NEWLINE_LF: newline = '\n'; break;    case PCRE_NEWLINE_CR+         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break; +  case PCRE_NEWLINE_ANY: newline = -1; break; +  default: return PCRE_ERROR_BADNEWLINE;    } -if (newline > 255) +if (newline < 0)    { -  md->nllen = 2; -  md->nl[0] = (newline >> 8) & 255; -  md->nl[1] = newline & 255; +  md->nltype = NLTYPE_ANY;    }  else    { -  md->nllen = 1; -  md->nl[0] = newline; +  md->nltype = NLTYPE_FIXED; +  if (newline > 255) +    { +    md->nllen = 2; +    md->nl[0] = (newline >> 8) & 255; +    md->nl[1] = newline & 255; +    } +  else +    { +    md->nllen = 1; +    md->nl[0] = newline; +    }    }  /* Partial matching is supported only for a restricted set of regexes at the @@ -3615,7 +3847,7 @@ if (md->partial && (re->options & PCRE_NOPARTIAL) != 0)  back the character offset. */  #ifdef SUPPORT_UTF8 -if (md->utf8 && (options & PCRE_NO_UTF8_CHECK) == 0) +if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)    {    if (_pcre_valid_utf8((uschar *)subject, length) >= 0)      return PCRE_ERROR_BADUTF8; @@ -3707,10 +3939,13 @@ if ((re->options & PCRE_REQCHSET) != 0)    req_byte2 = (tables + fcc_offset)[req_byte];  /* case flipped */    } + +/* ==========================================================================*/ +  /* Loop for handling unanchored repeated matching attempts; for anchored regexs  the loop runs just once. */ -do +for(;;)    {    USPTR save_end_subject = end_subject; @@ -3725,14 +3960,14 @@ do    /* Advance to a unique first char if possible. If firstline is TRUE, the    start of the match is constrained to the first line of a multiline string. -  Implement this by temporarily adjusting end_subject so that we stop scanning -  at a newline. If the match fails at the newline, later code breaks this loop. -  */ +  That is, the match must be before or at the first newline. Implement this by +  temporarily adjusting end_subject so that we stop scanning at a newline. If +  the match fails at the newline, later code breaks this loop. */    if (firstline)      {      USPTR t = start_match; -    while (t <= save_end_subject - md->nllen && !IS_NEWLINE(t)) t++; +    while (t < md->end_subject && !IS_NEWLINE(t)) t++;      end_subject = t;      } @@ -3753,11 +3988,9 @@ do    else if (startline)      { -    if (start_match >= md->start_subject + md->nllen + -          start_offset) +    if (start_match > md->start_subject + start_offset)        { -      while (start_match <= end_subject && -             !IS_NEWLINE(start_match - md->nllen)) +      while (start_match <= end_subject && !WAS_NEWLINE(start_match))          start_match++;        }      } @@ -3793,8 +4026,8 @@ do    HOWEVER: when the subject string is very, very long, searching to its end can    take a long time, and give bad performance on quite ordinary patterns. This -  showed up when somebody was matching /^C/ on a 32-megabyte string... so we -  don't do this when the string is sufficiently long. +  showed up when somebody was matching something like /^\d+C/ on a 32-megabyte +  string... so we don't do this when the string is sufficiently long.    ALSO: this processing is disabled when partial matching is requested.    */ @@ -3826,9 +4059,14 @@ do            }          } -      /* If we can't find the required character, break the matching loop */ +      /* If we can't find the required character, break the matching loop, +      forcing a match failure. */ -      if (p >= end_subject) break; +      if (p >= end_subject) +        { +        rc = MATCH_NOMATCH; +        break; +        }        /* If we have found the required character, save the point where we        found it, so that we don't search again next time round the loop if @@ -3838,49 +4076,70 @@ do        }      } -  /* When a match occurs, substrings will be set for all internal extractions; -  we just need to set up the whole thing as substring 0 before returning. If -  there were too many extractions, set the return code to zero. In the case -  where we had to get some local store to hold offsets for backreferences, copy -  those back references that we can. In this case there need not be overflow -  if certain parts of the pattern were not used. */ +  /* OK, we can now run the match. */    md->start_match = start_match;    md->match_call_count = 0; +  md->eptrn = 0;                          /* Next free eptrchain slot */ +  rc = match(start_match, md->start_code, 2, md, ims, NULL, 0, 0); -  rc = match(start_match, md->start_code, 2, md, ims, NULL, match_isgroup, 0); +  /* Any return other than MATCH_NOMATCH breaks the loop. */ -  /* When the result is no match, if the subject's first character was a -  newline and the PCRE_FIRSTLINE option is set, break (which will return -  PCRE_ERROR_NOMATCH). The option requests that a match occur before the first -  newline in the subject. Otherwise, advance the pointer to the next character -  and continue - but the continuation will actually happen only when the -  pattern is not anchored. */ +  if (rc != MATCH_NOMATCH) break; -  if (rc == MATCH_NOMATCH) -    { -    if (firstline && -        start_match <= md->end_subject - md->nllen && -        IS_NEWLINE(start_match)) -      break; -    start_match++; +  /* If PCRE_FIRSTLINE is set, the match must happen before or at the first +  newline in the subject (though it may continue over the newline). Therefore, +  if we have just failed to match, starting at a newline, do not continue. */ + +  if (firstline && IS_NEWLINE(start_match)) break; + +  /* Advance the match position by one character. */ + +  start_match++;  #ifdef SUPPORT_UTF8 -    if (md->utf8) -      while(start_match < end_subject && (*start_match & 0xc0) == 0x80) -        start_match++; +  if (utf8) +    while(start_match < end_subject && (*start_match & 0xc0) == 0x80) +      start_match++;  #endif -    continue; -    } -  if (rc != MATCH_MATCH) -    { -    DPRINTF((">>>> error: returning %d\n", rc)); -    return rc; -    } +  /* Break the loop if the pattern is anchored or if we have passed the end of +  the subject. */ + +  if (anchored || start_match > end_subject) break; + +  /* If we have just passed a CR and the newline option is CRLF or ANY, and we +  are now at a LF, advance the match position by one more character. */ + +  if (start_match[-1] == '\r' && +       (md->nltype == NLTYPE_ANY || md->nllen == 2) && +       start_match < end_subject && +       *start_match == '\n') +    start_match++; + +  }   /* End of for(;;) "bumpalong" loop */ + +/* ==========================================================================*/ + +/* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping +conditions is true: -  /* We have a match! Copy the offset information from temporary store if -  necessary */ +(1) The pattern is anchored; +(2) We are past the end of the subject; + +(3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because +    this option requests that a match occur at or before the first newline in +    the subject. + +When we have a match and the offset vector is big enough to deal with any +backreferences, captured substring offsets will already be set up. In the case +where we had to get some local store to hold offsets for backreference +processing, copy those that we can. In this case there need not be overflow if +certain parts of the pattern were not used, even though there are more +capturing parentheses than vector slots. */ + +if (rc == MATCH_MATCH) +  {    if (using_temporary_offsets)      {      if (offsetcount >= 4) @@ -3889,15 +4148,18 @@ do          (offsetcount - 2) * sizeof(int));        DPRINTF(("Copied offsets from temporary memory\n"));        } -    if (md->end_offset_top > offsetcount) -      md->offset_overflow = TRUE; - +    if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;      DPRINTF(("Freeing temporary memory\n"));      (pcre_free)(md->offset_vector);      } +  /* Set the return code to the number of captured strings, or 0 if there are +  too many to fit into the vector. */ +    rc = md->offset_overflow? 0 : md->end_offset_top/2; +  /* If there is space, set up the whole thing as substring 0. */ +    if (offsetcount < 2) rc = 0; else      {      offsets[0] = start_match - md->start_subject; @@ -3908,9 +4170,8 @@ do    return rc;    } -/* This "while" is the end of the "do" above */ - -while (!anchored && start_match <= end_subject); +/* Control gets here if there has been an error, or if the overall match +attempt has failed at all permitted starting positions. */  if (using_temporary_offsets)    { @@ -3918,7 +4179,12 @@ if (using_temporary_offsets)    (pcre_free)(md->offset_vector);    } -if (md->partial && md->hitend) +if (rc != MATCH_NOMATCH) +  { +  DPRINTF((">>>> error: returning %d\n", rc)); +  return rc; +  } +else if (md->partial && md->hitend)    {    DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));    return PCRE_ERROR_PARTIAL;  | 
