diff options
Diffstat (limited to 'ext/pcre/pcrelib/pcre_compile.c')
-rw-r--r-- | ext/pcre/pcrelib/pcre_compile.c | 1605 |
1 files changed, 1274 insertions, 331 deletions
diff --git a/ext/pcre/pcrelib/pcre_compile.c b/ext/pcre/pcrelib/pcre_compile.c index 34721c8863..0f3ebf93fd 100644 --- a/ext/pcre/pcrelib/pcre_compile.c +++ b/ext/pcre/pcrelib/pcre_compile.c @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel - Copyright (c) 1997-2006 University of Cambridge + Copyright (c) 1997-2008 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -42,11 +42,12 @@ POSSIBILITY OF SUCH DAMAGE. supporting internal functions that are not used by other modules. */ +#include "config.h" + #define NLBLOCK cd /* Block containing newline information */ #define PSSTART start_pattern /* Field containing processed string start */ #define PSEND end_pattern /* Field containing processed string end */ - #include "pcre_internal.h" @@ -58,6 +59,18 @@ used by pcretest. DEBUG is not defined when building a production library. */ #endif +/* Macro for setting individual bits in class bitmaps. */ + +#define SETBIT(a,b) a[b/8] |= (1 << (b%8)) + +/* Maximum length value to check against when making sure that the integer that +holds the compiled pattern length does not overflow. We make it a bit less than +INT_MAX to allow for adding in group terminating bytes, so that we don't have +to check them every time. */ + +#define OFLOW_MAX (INT_MAX - 20) + + /************************************************* * Code parameters and static tables * *************************************************/ @@ -82,21 +95,21 @@ are simple data values; negative values are for special things like \d and so on. Zero means further processing is needed (for things like \x), or the escape is invalid. */ -#if !EBCDIC /* This is the "normal" table for ASCII systems */ +#ifndef EBCDIC /* This is the "normal" table for ASCII systems */ static const short int escapes[] = { 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */ 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */ '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */ - 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */ --ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */ +-ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */ +-ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */ -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */ '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */ - 0, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */ --ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */ +-ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */ +-ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */ 0, 0, -ESC_z /* x - z */ }; -#else /* This is the "abnormal" table for EBCDIC systems */ +#else /* This is the "abnormal" table for EBCDIC systems */ static const short int escapes[] = { /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|', /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0, @@ -106,18 +119,18 @@ static const short int escapes[] = { /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0, /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"', /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, -/* 88 */ 0, 0, 0, '{', 0, 0, 0, 0, +/* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0, /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p, /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0, -/* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0, +/* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0, /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0, /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0, /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-', /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G, -/* C8 */ 0, 0, 0, 0, 0, 0, 0, 0, -/* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P, +/* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0, +/* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P, /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0, -/* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X, +/* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X, /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0, /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0, /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0 @@ -125,14 +138,47 @@ static const short int escapes[] = { #endif -/* Tables of names of POSIX character classes and their lengths. The list is -terminated by a zero length entry. The first three must be alpha, lower, upper, -as this is assumed for handling case independence. */ +/* Table of special "verbs" like (*PRUNE). This is a short table, so it is +searched linearly. Put all the names into a single string, in order to reduce +the number of relocations when a shared library is dynamically linked. */ + +typedef struct verbitem { + int len; + int op; +} verbitem; + +static const char verbnames[] = + "ACCEPT\0" + "COMMIT\0" + "F\0" + "FAIL\0" + "PRUNE\0" + "SKIP\0" + "THEN"; + +static const verbitem verbs[] = { + { 6, OP_ACCEPT }, + { 6, OP_COMMIT }, + { 1, OP_FAIL }, + { 4, OP_FAIL }, + { 5, OP_PRUNE }, + { 4, OP_SKIP }, + { 4, OP_THEN } +}; + +static const int verbcount = sizeof(verbs)/sizeof(verbitem); + -static const char *const posix_names[] = { - "alpha", "lower", "upper", - "alnum", "ascii", "blank", "cntrl", "digit", "graph", - "print", "punct", "space", "word", "xdigit" }; +/* Tables of names of POSIX character classes and their lengths. The names are +now all in a single string, to reduce the number of relocations when a shared +library is dynamically loaded. The list of lengths is terminated by a zero +length entry. The first three must be alpha, lower, upper, as this is assumed +for handling case independence. */ + +static const char posix_names[] = + "alpha\0" "lower\0" "upper\0" "alnum\0" "ascii\0" "blank\0" + "cntrl\0" "digit\0" "graph\0" "print\0" "punct\0" "space\0" + "word\0" "xdigit"; static const uschar posix_name_lengths[] = { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 }; @@ -171,79 +217,91 @@ static const int posix_class_maps[] = { /* The texts of compile-time error messages. These are "char *" because they are passed to the outside world. Do not ever re-use any error number, because they are documented. Always add a new error instead. Messages marked DEAD below -are no longer used. */ - -static const char *error_texts[] = { - "no error", - "\\ at end of pattern", - "\\c at end of pattern", - "unrecognized character follows \\", - "numbers out of order in {} quantifier", +are no longer used. This used to be a table of strings, but in order to reduce +the number of relocations needed when a shared library is loaded dynamically, +it is now one long string. We cannot use a table of offsets, because the +lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we +simply count through to the one we want - this isn't a performance issue +because these strings are used only when there is a compilation error. */ + +static const char error_texts[] = + "no error\0" + "\\ at end of pattern\0" + "\\c at end of pattern\0" + "unrecognized character follows \\\0" + "numbers out of order in {} quantifier\0" /* 5 */ - "number too big in {} quantifier", - "missing terminating ] for character class", - "invalid escape sequence in character class", - "range out of order in character class", - "nothing to repeat", + "number too big in {} quantifier\0" + "missing terminating ] for character class\0" + "invalid escape sequence in character class\0" + "range out of order in character class\0" + "nothing to repeat\0" /* 10 */ - "operand of unlimited repeat could match the empty string", /** DEAD **/ - "internal error: unexpected repeat", - "unrecognized character after (?", - "POSIX named classes are supported only within a class", - "missing )", + "operand of unlimited repeat could match the empty string\0" /** DEAD **/ + "internal error: unexpected repeat\0" + "unrecognized character after (? or (?-\0" + "POSIX named classes are supported only within a class\0" + "missing )\0" /* 15 */ - "reference to non-existent subpattern", - "erroffset passed as NULL", - "unknown option bit(s) set", - "missing ) after comment", - "parentheses nested too deeply", /** DEAD **/ + "reference to non-existent subpattern\0" + "erroffset passed as NULL\0" + "unknown option bit(s) set\0" + "missing ) after comment\0" + "parentheses nested too deeply\0" /** DEAD **/ /* 20 */ - "regular expression too large", - "failed to get memory", - "unmatched parentheses", - "internal error: code overflow", - "unrecognized character after (?<", + "regular expression is too large\0" + "failed to get memory\0" + "unmatched parentheses\0" + "internal error: code overflow\0" + "unrecognized character after (?<\0" /* 25 */ - "lookbehind assertion is not fixed length", - "malformed number or name after (?(", - "conditional group contains more than two branches", - "assertion expected after (?(", - "(?R or (?digits must be followed by )", + "lookbehind assertion is not fixed length\0" + "malformed number or name after (?(\0" + "conditional group contains more than two branches\0" + "assertion expected after (?(\0" + "(?R or (?[+-]digits must be followed by )\0" /* 30 */ - "unknown POSIX class name", - "POSIX collating elements are not supported", - "this version of PCRE is not compiled with PCRE_UTF8 support", - "spare error", /** DEAD **/ - "character value in \\x{...} sequence is too large", + "unknown POSIX class name\0" + "POSIX collating elements are not supported\0" + "this version of PCRE is not compiled with PCRE_UTF8 support\0" + "spare error\0" /** DEAD **/ + "character value in \\x{...} sequence is too large\0" /* 35 */ - "invalid condition (?(0)", - "\\C not allowed in lookbehind assertion", - "PCRE does not support \\L, \\l, \\N, \\U, or \\u", - "number after (?C is > 255", - "closing ) for (?C expected", + "invalid condition (?(0)\0" + "\\C not allowed in lookbehind assertion\0" + "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0" + "number after (?C is > 255\0" + "closing ) for (?C expected\0" /* 40 */ - "recursive call could loop indefinitely", - "unrecognized character after (?P", - "syntax error in subpattern name (missing terminator)", - "two named subpatterns have the same name", - "invalid UTF-8 string", + "recursive call could loop indefinitely\0" + "unrecognized character after (?P\0" + "syntax error in subpattern name (missing terminator)\0" + "two named subpatterns have the same name\0" + "invalid UTF-8 string\0" /* 45 */ - "support for \\P, \\p, and \\X has not been compiled", - "malformed \\P or \\p sequence", - "unknown property name after \\P or \\p", - "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)", - "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")", + "support for \\P, \\p, and \\X has not been compiled\0" + "malformed \\P or \\p sequence\0" + "unknown property name after \\P or \\p\0" + "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0" + "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0" /* 50 */ - "repeated subpattern is too long", - "octal value is greater than \\377 (not in UTF-8 mode)", - "internal error: overran compiling workspace", - "internal error: previously-checked referenced subpattern not found", - "DEFINE group contains more than one branch", + "repeated subpattern is too long\0" /** DEAD **/ + "octal value is greater than \\377 (not in UTF-8 mode)\0" + "internal error: overran compiling workspace\0" + "internal error: previously-checked referenced subpattern not found\0" + "DEFINE group contains more than one branch\0" /* 55 */ - "repeating a DEFINE group is not allowed", - "inconsistent NEWLINE options", - "\\g is not followed by an (optionally braced) non-zero number" -}; + "repeating a DEFINE group is not allowed\0" + "inconsistent NEWLINE options\0" + "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0" + "a numbered reference must not be zero\0" + "(*VERB) with an argument is not supported\0" + /* 60 */ + "(*VERB) not recognized\0" + "number is too big\0" + "subpattern name expected\0" + "digit expected after (?+\0" + "] is an invalid data character in JavaScript compatibility mode"; /* Table to identify digits and hex digits. This is used when compiling @@ -262,7 +320,7 @@ For convenience, we use the same bit definitions as in chartables: Then we can use ctype_digit and ctype_xdigit in the code. */ -#if !EBCDIC /* This is the "normal" case, for ASCII systems */ +#ifndef EBCDIC /* This is the "normal" case, for ASCII systems */ static const unsigned char digitab[] = { 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */ @@ -298,7 +356,7 @@ static const unsigned char digitab[] = 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */ -#else /* This is the "abnormal" case, for EBCDIC systems */ +#else /* This is the "abnormal" case, for EBCDIC systems */ static const unsigned char digitab[] = { 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */ @@ -312,7 +370,7 @@ static const unsigned char digitab[] = 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */ - 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- ¬ */ + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */ @@ -346,7 +404,7 @@ static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */ 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */ 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */ - 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- ¬ */ + 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */ 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */ @@ -373,10 +431,32 @@ static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */ /* Definition to allow mutual recursion */ static BOOL - compile_regex(int, int, uschar **, const uschar **, int *, BOOL, int, int *, - int *, branch_chain *, compile_data *, int *); + compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int, + int *, int *, branch_chain *, compile_data *, int *); + +/************************************************* +* Find an error text * +*************************************************/ + +/* The error texts are now all in one long string, to save on relocations. As +some of the text is of unknown length, we can't use a table of offsets. +Instead, just count through the strings. This is not a performance issue +because it happens only when there has been a compilation error. + +Argument: the error number +Returns: pointer to the error string +*/ + +static const char * +find_error_text(int n) +{ +const char *s = error_texts; +for (; n > 0; n--) while (*s++ != 0); +return s; +} + /************************************************* * Handle escapes * @@ -399,7 +479,7 @@ Arguments: Returns: zero or positive => a data character negative => a special escape sequence - on error, errorptr is set + on error, errorcodeptr is set */ static int @@ -417,16 +497,16 @@ ptr--; /* Set pointer back to the last byte */ if (c == 0) *errorcodeptr = ERR1; -/* Non-alphamerics are literals. For digits or letters, do an initial lookup in -a table. A non-zero result is something that can be returned immediately. +/* Non-alphanumerics are literals. For digits or letters, do an initial lookup +in a table. A non-zero result is something that can be returned immediately. Otherwise further processing may be required. */ -#if !EBCDIC /* ASCII coding */ -else if (c < '0' || c > 'z') {} /* Not alphameric */ +#ifndef EBCDIC /* ASCII coding */ +else if (c < '0' || c > 'z') {} /* Not alphanumeric */ else if ((i = escapes[c - '0']) != 0) c = i; -#else /* EBCDIC coding */ -else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */ +#else /* EBCDIC coding */ +else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */ else if ((i = escapes[c - 0x48]) != 0) c = i; #endif @@ -450,13 +530,41 @@ else *errorcodeptr = ERR37; break; - /* \g must be followed by a number, either plain or braced. If positive, it - is an absolute backreference. If negative, it is a relative backreference. - This is a Perl 5.10 feature. */ + /* \g must be followed by one of a number of specific things: + + (1) A number, either plain or braced. If positive, it is an absolute + backreference. If negative, it is a relative backreference. This is a Perl + 5.10 feature. + + (2) Perl 5.10 also supports \g{name} as a reference to a named group. This + is part of Perl's movement towards a unified syntax for back references. As + this is synonymous with \k{name}, we fudge it up by pretending it really + was \k. + + (3) For Oniguruma compatibility we also support \g followed by a name or a + number either in angle brackets or in single quotes. However, these are + (possibly recursive) subroutine calls, _not_ backreferences. Just return + the -ESC_g code (cf \k). */ case 'g': + if (ptr[1] == '<' || ptr[1] == '\'') + { + c = -ESC_g; + break; + } + + /* Handle the Perl-compatible cases */ + if (ptr[1] == '{') { + const uschar *p; + for (p = ptr+2; *p != 0 && *p != '}'; p++) + if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break; + if (*p != 0 && *p != '}') + { + c = -ESC_k; + break; + } braced = TRUE; ptr++; } @@ -473,10 +581,22 @@ else while ((digitab[ptr[1]] & ctype_digit) != 0) c = c * 10 + *(++ptr) - '0'; - if (c == 0 || (braced && *(++ptr) != '}')) + if (c < 0) /* Integer overflow */ + { + *errorcodeptr = ERR61; + break; + } + + if (braced && *(++ptr) != '}') { *errorcodeptr = ERR57; - return 0; + break; + } + + if (c == 0) + { + *errorcodeptr = ERR58; + break; } if (negated) @@ -484,7 +604,7 @@ else if (c > bracount) { *errorcodeptr = ERR15; - return 0; + break; } c = bracount - (c - 1); } @@ -513,6 +633,11 @@ else c -= '0'; while ((digitab[ptr[1]] & ctype_digit) != 0) c = c * 10 + *(++ptr) - '0'; + if (c < 0) /* Integer overflow */ + { + *errorcodeptr = ERR61; + break; + } if (c < 10 || c <= bracount) { c = -(ESC_REF + c); @@ -562,10 +687,10 @@ else if (c == 0 && cc == '0') continue; /* Leading zeroes */ count++; -#if !EBCDIC /* ASCII coding */ +#ifndef EBCDIC /* ASCII coding */ if (cc >= 'a') cc -= 32; /* Convert to upper case */ c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10)); -#else /* EBCDIC coding */ +#else /* EBCDIC coding */ if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */ c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10)); #endif @@ -589,10 +714,10 @@ else { int cc; /* Some compilers don't like ++ */ cc = *(++ptr); /* in initializers */ -#if !EBCDIC /* ASCII coding */ +#ifndef EBCDIC /* ASCII coding */ if (cc >= 'a') cc -= 32; /* Convert to upper case */ c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10)); -#else /* EBCDIC coding */ +#else /* EBCDIC coding */ if (cc <= 'z') cc += 64; /* Convert to upper case */ c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10)); #endif @@ -608,23 +733,23 @@ else if (c == 0) { *errorcodeptr = ERR2; - return 0; + break; } -#if !EBCDIC /* ASCII coding */ +#ifndef EBCDIC /* ASCII coding */ if (c >= 'a' && c <= 'z') c -= 32; c ^= 0x40; -#else /* EBCDIC coding */ +#else /* EBCDIC coding */ if (c >= 'a' && c <= 'z') c += 64; c ^= 0xC0; #endif break; /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any - other alphameric following \ is an error if PCRE_EXTRA was set; otherwise, - for Perl compatibility, it is a literal. This code looks a bit odd, but - there used to be some cases other than the default, and there may be again - in future, so I haven't "optimized" it. */ + other alphanumeric following \ is an error if PCRE_EXTRA was set; + otherwise, for Perl compatibility, it is a literal. This code looks a bit + odd, but there used to be some cases other than the default, and there may + be again in future, so I haven't "optimized" it. */ default: if ((options & PCRE_EXTRA) != 0) switch(c) @@ -684,7 +809,7 @@ if (c == '{') *negptr = TRUE; ptr++; } - for (i = 0; i < sizeof(name) - 1; i++) + for (i = 0; i < (int)sizeof(name) - 1; i++) { c = *(++ptr); if (c == 0) goto ERROR_RETURN; @@ -713,7 +838,7 @@ top = _pcre_utt_size; while (bot < top) { i = (bot + top) >> 1; - c = strcmp(name, _pcre_utt[i].name); + c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset); if (c == 0) { *dptr = _pcre_utt[i].value; @@ -849,7 +974,7 @@ be terminated by '>' because that is checked in the first pass. Arguments: ptr current position in the pattern - count current count of capturing parens so far encountered + cd compile background data name name to seek, or NULL if seeking a numbered subpattern lorn name length, or subpattern number if name is NULL xmode TRUE if we are in /x mode @@ -858,10 +983,11 @@ Returns: the number of the named subpattern, or -1 if not found */ static int -find_parens(const uschar *ptr, int count, const uschar *name, int lorn, +find_parens(const uschar *ptr, compile_data *cd, const uschar *name, int lorn, BOOL xmode) { const uschar *thisname; +int count = cd->bracount; for (; *ptr != 0; ptr++) { @@ -881,12 +1007,37 @@ for (; *ptr != 0; ptr++) continue; } - /* Skip over character classes */ + /* Skip over character classes; this logic must be similar to the way they + are handled for real. If the first character is '^', skip it. Also, if the + first few characters (either before or after ^) are \Q\E or \E we skip them + too. This makes for compatibility with Perl. */ if (*ptr == '[') { + BOOL negate_class = FALSE; + for (;;) + { + int c = *(++ptr); + if (c == '\\') + { + if (ptr[1] == 'E') ptr++; + else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3; + else break; + } + else if (!negate_class && c == '^') + negate_class = TRUE; + else break; + } + + /* If the next character is ']', it is a data character that must be + skipped, except in JavaScript compatibility mode. */ + + if (ptr[1] == ']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0) + ptr++; + while (*(++ptr) != ']') { + if (*ptr == 0) return -1; if (*ptr == '\\') { if (*(++ptr) == 0) return -1; @@ -914,7 +1065,7 @@ for (; *ptr != 0; ptr++) /* An opening parens must now be a real metacharacter */ if (*ptr != '(') continue; - if (ptr[1] != '?') + if (ptr[1] != '?' && ptr[1] != '*') { count++; if (name == NULL && count == lorn) return count; @@ -1042,7 +1193,6 @@ for (;;) { int d; register int op = *cc; - switch (op) { case OP_CBRA: @@ -1131,6 +1281,7 @@ for (;;) case OP_TYPEEXACT: branchlength += GET2(cc,1); + if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2; cc += 4; break; @@ -1148,6 +1299,7 @@ for (;;) case OP_NOT_WORDCHAR: case OP_WORDCHAR: case OP_ANY: + case OP_ALLANY: branchlength++; cc++; break; @@ -1239,13 +1391,43 @@ for (;;) code += _pcre_OP_lengths[c]; } - /* In UTF-8 mode, opcodes that are followed by a character may be followed by - a multi-byte character. The length in the table is a minimum, so we have to - arrange to skip the extra bytes. */ + /* Otherwise, we can get the item's length from the table, except that for + repeated character types, we have to test for \p and \P, which have an extra + two bytes of parameters. */ else { + switch(c) + { + case OP_TYPESTAR: + case OP_TYPEMINSTAR: + case OP_TYPEPLUS: + case OP_TYPEMINPLUS: + case OP_TYPEQUERY: + case OP_TYPEMINQUERY: + case OP_TYPEPOSSTAR: + case OP_TYPEPOSPLUS: + case OP_TYPEPOSQUERY: + if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; + break; + + case OP_TYPEUPTO: + case OP_TYPEMINUPTO: + case OP_TYPEEXACT: + case OP_TYPEPOSUPTO: + if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2; + break; + } + + /* Add in the fixed length from the table */ + code += _pcre_OP_lengths[c]; + + /* In UTF-8 mode, opcodes that are followed by a character may be followed by + a multi-byte character. The length in the table is a minimum, so we have to + arrange to skip the extra bytes. */ + +#ifdef SUPPORT_UTF8 if (utf8) switch(c) { case OP_CHAR: @@ -1266,6 +1448,7 @@ for (;;) if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f]; break; } +#endif } } } @@ -1301,14 +1484,43 @@ for (;;) if (c == OP_XCLASS) code += GET(code, 1); - /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes - that are followed by a character may be followed by a multi-byte character. - The length in the table is a minimum, so we have to arrange to skip the extra - bytes. */ + /* Otherwise, we can get the item's length from the table, except that for + repeated character types, we have to test for \p and \P, which have an extra + two bytes of parameters. */ else { + switch(c) + { + case OP_TYPESTAR: + case OP_TYPEMINSTAR: + case OP_TYPEPLUS: + case OP_TYPEMINPLUS: + case OP_TYPEQUERY: + case OP_TYPEMINQUERY: + case OP_TYPEPOSSTAR: + case OP_TYPEPOSPLUS: + case OP_TYPEPOSQUERY: + if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; + break; + + case OP_TYPEPOSUPTO: + case OP_TYPEUPTO: + case OP_TYPEMINUPTO: + case OP_TYPEEXACT: + if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2; + break; + } + + /* Add in the fixed length from the table */ + code += _pcre_OP_lengths[c]; + + /* In UTF-8 mode, opcodes that are followed by a character may be followed + by a multi-byte character. The length in the table is a minimum, so we have + to arrange to skip the extra bytes. */ + +#ifdef SUPPORT_UTF8 if (utf8) switch(c) { case OP_CHAR: @@ -1329,6 +1541,7 @@ for (;;) if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f]; break; } +#endif } } } @@ -1343,8 +1556,9 @@ for (;;) can match the empty string or not. It is called from could_be_empty() below and from compile_branch() when checking for an unlimited repeat of a group that can match nothing. Note that first_significant_code() skips over -assertions. If we hit an unclosed bracket, we return "empty" - this means we've -struck an inner bracket whose current branch will already have been scanned. +backward and negative forward assertions when its final argument is TRUE. If we +hit an unclosed bracket, we return "empty" - this means we've struck an inner +bracket whose current branch will already have been scanned. Arguments: code points to start of search @@ -1366,7 +1580,29 @@ for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE c = *code; - if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE) + /* Skip over forward assertions; the other assertions are skipped by + first_significant_code() with a TRUE final argument. */ + + if (c == OP_ASSERT) + { + do code += GET(code, 1); while (*code == OP_ALT); + c = *code; + continue; + } + + /* Groups with zero repeats can of course be empty; skip them. */ + + if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO) + { + code += _pcre_OP_lengths[c]; + do code += GET(code, 1); while (*code == OP_ALT); + c = *code; + continue; + } + + /* For other groups, scan the branches. */ + + if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND) { BOOL empty_branch; if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */ @@ -1382,12 +1618,7 @@ for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE } while (*code == OP_ALT); if (!empty_branch) return FALSE; /* All branches are non-empty */ - - /* Move past the KET and fudge things so that the increment in the "for" - above has no effect. */ - - c = OP_END; - code += 1 + LINK_SIZE - _pcre_OP_lengths[c]; + c = *code; continue; } @@ -1395,11 +1626,15 @@ for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE switch (c) { - /* Check for quantifiers after a class */ + /* Check for quantifiers after a class. XCLASS is used for classes that + cannot be represented just by a bit map. This includes negated single + high-valued characters. The length in _pcre_OP_lengths[] is zero; the + actual length is stored in the compiled code, so we must update "code" + here. */ #ifdef SUPPORT_UTF8 case OP_XCLASS: - ccode = code + GET(code, 1); + ccode = code += GET(code, 1); goto CHECK_CLASS_REPEAT; #endif @@ -1443,6 +1678,7 @@ for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE case OP_NOT_WORDCHAR: case OP_WORDCHAR: case OP_ANY: + case OP_ALLANY: case OP_ANYBYTE: case OP_CHAR: case OP_CHARNC: @@ -1461,6 +1697,26 @@ for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE case OP_TYPEEXACT: return FALSE; + /* These are going to continue, as they may be empty, but we have to + fudge the length for the \p and \P cases. */ + + case OP_TYPESTAR: + case OP_TYPEMINSTAR: + case OP_TYPEPOSSTAR: + case OP_TYPEQUERY: + case OP_TYPEMINQUERY: + case OP_TYPEPOSQUERY: + if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; + break; + + /* Same for these */ + + case OP_TYPEUPTO: + case OP_TYPEMINUPTO: + case OP_TYPEPOSUPTO: + if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2; + break; + /* End of branch */ case OP_KET: @@ -1530,29 +1786,48 @@ return TRUE; *************************************************/ /* This function is called when the sequence "[:" or "[." or "[=" is -encountered in a character class. It checks whether this is followed by an -optional ^ and then a sequence of letters, terminated by a matching ":]" or -".]" or "=]". +encountered in a character class. It checks whether this is followed by a +sequence of characters terminated by a matching ":]" or ".]" or "=]". If we +reach an unescaped ']' without the special preceding character, return FALSE. + +Originally, this function only recognized a sequence of letters between the +terminators, but it seems that Perl recognizes any sequence of characters, +though of course unknown POSIX names are subsequently rejected. Perl gives an +"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE +didn't consider this to be a POSIX class. Likewise for [:1234:]. + +The problem in trying to be exactly like Perl is in the handling of escapes. We +have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX +class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code +below handles the special case of \], but does not try to do any other escape +processing. This makes it different from Perl for cases such as [:l\ower:] +where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize +"l\ower". This is a lesser evil that not diagnosing bad classes when Perl does, +I think. -Argument: +Arguments: ptr pointer to the initial [ endptr where to return the end pointer - cd pointer to compile data Returns: TRUE or FALSE */ static BOOL -check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd) +check_posix_syntax(const uschar *ptr, const uschar **endptr) { int terminator; /* Don't combine these lines; the Solaris cc */ terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */ -if (*(++ptr) == '^') ptr++; -while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++; -if (*ptr == terminator && ptr[1] == ']') +for (++ptr; *ptr != 0; ptr++) { - *endptr = ptr; - return TRUE; + if (*ptr == '\\' && ptr[1] == ']') ptr++; else + { + if (*ptr == ']') return FALSE; + if (*ptr == terminator && ptr[1] == ']') + { + *endptr = ptr; + return TRUE; + } + } } return FALSE; } @@ -1577,11 +1852,13 @@ Returns: a value representing the name, or -1 if unknown static int check_posix_name(const uschar *ptr, int len) { +const char *pn = posix_names; register int yield = 0; while (posix_name_lengths[yield] != 0) { if (len == posix_name_lengths[yield] && - strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield; + strncmp((const char *)ptr, pn, len) == 0) return yield; + pn += posix_name_lengths[yield] + 1; yield++; } return -1; @@ -1596,11 +1873,12 @@ return -1; that is referenced. This means that groups can be replicated for fixed repetition simply by copying (because the recursion is allowed to refer to earlier groups that are outside the current group). However, when a group is -optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before -it, after it has been compiled. This means that any OP_RECURSE items within it -that refer to the group itself or any contained groups have to have their -offsets adjusted. That one of the jobs of this function. Before it is called, -the partially compiled regex must be temporarily terminated with OP_END. +optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is +inserted before it, after it has been compiled. This means that any OP_RECURSE +items within it that refer to the group itself or any contained groups have to +have their offsets adjusted. That one of the jobs of this function. Before it +is called, the partially compiled regex must be temporarily terminated with +OP_END. This function has been extended with the possibility of forward references for recursions and subroutine calls. It must also check the list of such references @@ -1623,6 +1901,7 @@ adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd, uschar *save_hwm) { uschar *ptr = group; + while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL) { int offset; @@ -1884,7 +2163,6 @@ if (next >= 0) switch(op_code) /* For OP_NOT, "item" must be a single-byte character. */ case OP_NOT: - if (next < 0) return FALSE; /* Not a character */ if (item == next) return TRUE; if ((options & PCRE_CASELESS) == 0) return FALSE; #ifdef SUPPORT_UTF8 @@ -1921,6 +2199,50 @@ if (next >= 0) switch(op_code) case OP_NOT_WORDCHAR: return next <= 127 && (cd->ctypes[next] & ctype_word) != 0; + case OP_HSPACE: + case OP_NOT_HSPACE: + switch(next) + { + case 0x09: + case 0x20: + case 0xa0: + case 0x1680: + case 0x180e: + case 0x2000: + case 0x2001: + case 0x2002: + case 0x2003: + case 0x2004: + case 0x2005: + case 0x2006: + case 0x2007: + case 0x2008: + case 0x2009: + case 0x200A: + case 0x202f: + case 0x205f: + case 0x3000: + return op_code != OP_HSPACE; + default: + return op_code == OP_HSPACE; + } + + case OP_VSPACE: + case OP_NOT_VSPACE: + switch(next) + { + case 0x0a: + case 0x0b: + case 0x0c: + case 0x0d: + case 0x85: + case 0x2028: + case 0x2029: + return op_code != OP_VSPACE; + default: + return op_code == OP_VSPACE; + } + default: return FALSE; } @@ -1955,12 +2277,57 @@ switch(op_code) case ESC_W: return item <= 127 && (cd->ctypes[item] & ctype_word) != 0; + case ESC_h: + case ESC_H: + switch(item) + { + case 0x09: + case 0x20: + case 0xa0: + case 0x1680: + case 0x180e: + case 0x2000: + case 0x2001: + case 0x2002: + case 0x2003: + case 0x2004: + case 0x2005: + case 0x2006: + case 0x2007: + case 0x2008: + case 0x2009: + case 0x200A: + case 0x202f: + case 0x205f: + case 0x3000: + return -next != ESC_h; + default: + return -next == ESC_h; + } + + case ESC_v: + case ESC_V: + switch(item) + { + case 0x0a: + case 0x0b: + case 0x0c: + case 0x0d: + case 0x85: + case 0x2028: + case 0x2029: + return -next != ESC_v; + default: + return -next == ESC_v; + } + default: return FALSE; } case OP_DIGIT: - return next == -ESC_D || next == -ESC_s || next == -ESC_W; + return next == -ESC_D || next == -ESC_s || next == -ESC_W || + next == -ESC_h || next == -ESC_v; case OP_NOT_DIGIT: return next == -ESC_d; @@ -1969,10 +2336,23 @@ switch(op_code) return next == -ESC_S || next == -ESC_d || next == -ESC_w; case OP_NOT_WHITESPACE: - return next == -ESC_s; + return next == -ESC_s || next == -ESC_h || next == -ESC_v; + + case OP_HSPACE: + return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w; + + case OP_NOT_HSPACE: + return next == -ESC_h; + + /* Can't have \S in here because VT matches \S (Perl anomaly) */ + case OP_VSPACE: + return next == -ESC_V || next == -ESC_d || next == -ESC_w; + + case OP_NOT_VSPACE: + return next == -ESC_v; case OP_WORDCHAR: - return next == -ESC_W || next == -ESC_s; + return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v; case OP_NOT_WORDCHAR: return next == -ESC_w || next == -ESC_d; @@ -2045,6 +2425,7 @@ uschar classbits[32]; BOOL class_utf8; BOOL utf8 = (options & PCRE_UTF8) != 0; uschar *class_utf8data; +uschar *class_utf8data_base; uschar utf8_char[6]; #else BOOL utf8 = FALSE; @@ -2084,13 +2465,16 @@ req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0; for (;; ptr++) { BOOL negate_class; + BOOL should_flip_negation; BOOL possessive_quantifier; BOOL is_quantifier; BOOL is_recurse; + BOOL reset_bracount; int class_charcount; int class_lastchar; int newoptions; int recno; + int refsign; int skipbytes; int subreqbyte; int subfirstbyte; @@ -2123,6 +2507,15 @@ for (;; ptr++) */ if (code < last_code) code = last_code; + + /* Paranoid check for integer overflow */ + + if (OFLOW_MAX - *lengthptr < code - last_code) + { + *errorcodeptr = ERR20; + goto FAILED; + } + *lengthptr += code - last_code; DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c)); @@ -2235,6 +2628,11 @@ for (;; ptr++) *ptrptr = ptr; if (lengthptr != NULL) { + if (OFLOW_MAX - *lengthptr < code - last_code) + { + *errorcodeptr = ERR20; + goto FAILED; + } *lengthptr += code - last_code; /* To include callout length */ DPRINTF((">> end branch\n")); } @@ -2267,7 +2665,7 @@ for (;; ptr++) zerofirstbyte = firstbyte; zeroreqbyte = reqbyte; previous = code; - *code++ = OP_ANY; + *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY; break; @@ -2282,7 +2680,17 @@ for (;; ptr++) opcode is compiled. It may optionally have a bit map for characters < 256, but those above are are explicitly listed afterwards. A flag byte tells whether the bitmap is present, and whether this is a negated class or not. - */ + + In JavaScript compatibility mode, an isolated ']' causes an error. In + default (Perl) mode, it is treated as a data character. */ + + case ']': + if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0) + { + *errorcodeptr = ERR64; + goto FAILED; + } + goto NORMAL_CHAR; case '[': previous = code; @@ -2291,24 +2699,50 @@ for (;; ptr++) they are encountered at the top level, so we'll do that too. */ if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') && - check_posix_syntax(ptr, &tempptr, cd)) + check_posix_syntax(ptr, &tempptr)) { *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31; goto FAILED; } - /* If the first character is '^', set the negation flag and skip it. */ + /* If the first character is '^', set the negation flag and skip it. Also, + if the first few characters (either before or after ^) are \Q\E or \E we + skip them too. This makes for compatibility with Perl. */ - if ((c = *(++ptr)) == '^') + negate_class = FALSE; + for (;;) { - negate_class = TRUE; c = *(++ptr); + if (c == '\\') + { + if (ptr[1] == 'E') ptr++; + else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3; + else break; + } + else if (!negate_class && c == '^') + negate_class = TRUE; + else break; } - else + + /* Empty classes are allowed in JavaScript compatibility mode. Otherwise, + an initial ']' is taken as a data character -- the code below handles + that. In JS mode, [] must always fail, so generate OP_FAIL, whereas + [^] must match any character, so generate OP_ALLANY. */ + + if (c ==']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0) { - negate_class = FALSE; + *code++ = negate_class? OP_ALLANY : OP_FAIL; + if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; + zerofirstbyte = firstbyte; + break; } + /* If a class contains a negative special such as \S, we need to flip the + negation flag at the end, so that support for characters > 255 works + correctly (they are all included in the class). */ + + should_flip_negation = FALSE; + /* Keep a count of chars with values < 256 so that we can optimize the case of just a single character (as long as it's < 256). However, For higher valued UTF-8 characters, we don't yet do any optimization. */ @@ -2326,6 +2760,7 @@ for (;; ptr++) #ifdef SUPPORT_UTF8 class_utf8 = FALSE; /* No chars >= 256 */ class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */ + class_utf8data_base = class_utf8data; /* For resetting in pass 1 */ #endif /* Process characters until ] is reached. By writing this as a "do" it @@ -2341,6 +2776,18 @@ for (;; ptr++) { /* Braces are required because the */ GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */ } + + /* In the pre-compile phase, accumulate the length of any UTF-8 extra + data and reset the pointer. This is so that very large classes that + contain a zillion UTF-8 characters no longer overwrite the work space + (which is on the stack). */ + + if (lengthptr != NULL) + { + *lengthptr += class_utf8data - class_utf8data_base; + class_utf8data = class_utf8data_base; + } + #endif /* Inside \Q...\E everything is literal except \E */ @@ -2364,7 +2811,7 @@ for (;; ptr++) if (c == '[' && (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') && - check_posix_syntax(ptr, &tempptr, cd)) + check_posix_syntax(ptr, &tempptr)) { BOOL local_negate = FALSE; int posix_class, taboffset, tabopt; @@ -2381,6 +2828,7 @@ for (;; ptr++) if (*ptr == '^') { local_negate = TRUE; + should_flip_negation = TRUE; /* Note negative special */ ptr++; } @@ -2447,7 +2895,7 @@ for (;; ptr++) of the specials, which just set a flag. The sequence \b is a special case. Inside a class (and only there) it is treated as backspace. Elsewhere it marks a word boundary. Other escapes have preset maps ready - to or into the one we are building. We assume they have more than one + to 'or' into the one we are building. We assume they have more than one character in them, so set class_charcount bigger than one. */ if (c == '\\') @@ -2455,7 +2903,7 @@ for (;; ptr++) c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE); if (*errorcodeptr != 0) goto FAILED; - if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */ + if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */ else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */ else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */ else if (-c == ESC_Q) /* Handle start of quoted string */ @@ -2467,6 +2915,7 @@ for (;; ptr++) else inescq = TRUE; continue; } + else if (-c == ESC_E) continue; /* Ignore orphan \E */ if (c < 0) { @@ -2482,6 +2931,7 @@ for (;; ptr++) continue; case ESC_D: + should_flip_negation = TRUE; for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit]; continue; @@ -2490,6 +2940,7 @@ for (;; ptr++) continue; case ESC_W: + should_flip_negation = TRUE; for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word]; continue; @@ -2499,13 +2950,11 @@ for (;; ptr++) continue; case ESC_S: + should_flip_negation = TRUE; for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space]; classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */ continue; - case ESC_E: /* Perl ignores an orphan \E */ - continue; - default: /* Not recognized; fall through */ break; /* Need "default" setting to stop compiler warning. */ } @@ -2515,6 +2964,133 @@ for (;; ptr++) else if (c == -ESC_d || c == -ESC_D || c == -ESC_w || c == -ESC_W || c == -ESC_s || c == -ESC_S) continue; + /* We need to deal with \H, \h, \V, and \v in both phases because + they use extra memory. */ + + if (-c == ESC_h) + { + SETBIT(classbits, 0x09); /* VT */ + SETBIT(classbits, 0x20); /* SPACE */ + SETBIT(classbits, 0xa0); /* NSBP */ +#ifdef SUPPORT_UTF8 + if (utf8) + { + class_utf8 = TRUE; + *class_utf8data++ = XCL_SINGLE; + class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data); + *class_utf8data++ = XCL_SINGLE; + class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data); + *class_utf8data++ = XCL_RANGE; + class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data); + class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data); + *class_utf8data++ = XCL_SINGLE; + class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data); + *class_utf8data++ = XCL_SINGLE; + class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data); + *class_utf8data++ = XCL_SINGLE; + class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data); + } +#endif + continue; + } + + if (-c == ESC_H) + { + for (c = 0; c < 32; c++) + { + int x = 0xff; + switch (c) + { + case 0x09/8: x ^= 1 << (0x09%8); break; + case 0x20/8: x ^= 1 << (0x20%8); break; + case 0xa0/8: x ^= 1 << (0xa0%8); break; + default: break; + } + classbits[c] |= x; + } + +#ifdef SUPPORT_UTF8 + if (utf8) + { + class_utf8 = TRUE; + *class_utf8data++ = XCL_RANGE; + class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data); + class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data); + *class_utf8data++ = XCL_RANGE; + class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data); + class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data); + *class_utf8data++ = XCL_RANGE; + class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data); + class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data); + *class_utf8data++ = XCL_RANGE; + class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data); + class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data); + *class_utf8data++ = XCL_RANGE; + class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data); + class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data); + *class_utf8data++ = XCL_RANGE; + class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data); + class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data); + *class_utf8data++ = XCL_RANGE; + class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data); + class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data); + } +#endif + continue; + } + + if (-c == ESC_v) + { + SETBIT(classbits, 0x0a); /* LF */ + SETBIT(classbits, 0x0b); /* VT */ + SETBIT(classbits, 0x0c); /* FF */ + SETBIT(classbits, 0x0d); /* CR */ + SETBIT(classbits, 0x85); /* NEL */ +#ifdef SUPPORT_UTF8 + if (utf8) + { + class_utf8 = TRUE; + *class_utf8data++ = XCL_RANGE; + class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data); + class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data); + } +#endif + continue; + } + + if (-c == ESC_V) + { + for (c = 0; c < 32; c++) + { + int x = 0xff; + switch (c) + { + case 0x0a/8: x ^= 1 << (0x0a%8); + x ^= 1 << (0x0b%8); + x ^= 1 << (0x0c%8); + x ^= 1 << (0x0d%8); + break; + case 0x85/8: x ^= 1 << (0x85%8); break; + default: break; + } + classbits[c] |= x; + } + +#ifdef SUPPORT_UTF8 + if (utf8) + { + class_utf8 = TRUE; + *class_utf8data++ = XCL_RANGE; + class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data); + class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data); + *class_utf8data++ = XCL_RANGE; + class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data); + class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data); + } +#endif + continue; + } + /* We need to deal with \P and \p in both phases. */ #ifdef SUPPORT_UCP @@ -2566,6 +3142,12 @@ for (;; ptr++) oldptr = ptr; + /* Remember \r or \n */ + + if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF; + + /* Check for range */ + if (!inescq && ptr[1] == '-') { int d; @@ -2607,7 +3189,7 @@ for (;; ptr++) d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE); if (*errorcodeptr != 0) goto FAILED; - /* \b is backslash; \X is literal X; \R is literal R; any other + /* \b is backspace; \X is literal X; \R is literal R; any other special means the '-' was literal */ if (d < 0) @@ -2633,6 +3215,10 @@ for (;; ptr++) if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */ + /* Remember \r or \n */ + + if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF; + /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless matching, we have to use an XCLASS with extra data items. Caseless matching for characters > 127 is available only if UCP support is @@ -2655,14 +3241,18 @@ for (;; ptr++) unsigned int origd = d; while (get_othercase_range(&cc, origd, &occ, &ocd)) { - if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */ + if (occ >= (unsigned int)c && + ocd <= (unsigned int)d) + continue; /* Skip embedded ranges */ - if (occ < c && ocd >= c - 1) /* Extend the basic range */ + if (occ < (unsigned int)c && + ocd >= (unsigned int)c - 1) /* Extend the basic range */ { /* if there is overlap, */ c = occ; /* noting that if occ < c */ continue; /* we can't have ocd > d */ } /* because a subrange is */ - if (ocd > d && occ <= d + 1) /* always shorter than */ + if (ocd > (unsigned int)d && + occ <= (unsigned int)d + 1) /* always shorter than */ { /* the basic range. */ d = ocd; continue; @@ -2782,12 +3372,34 @@ for (;; ptr++) goto FAILED; } + +/* This code has been disabled because it would mean that \s counts as +an explicit \r or \n reference, and that's not really what is wanted. Now +we set the flag only if there is a literal "\r" or "\n" in the class. */ + +#if 0 + /* Remember whether \r or \n are in this class */ + + if (negate_class) + { + if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF; + } + else + { + if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF; + } +#endif + + /* If class_charcount is 1, we saw precisely one character whose value is - less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we - can optimize the negative case only if there were no characters >= 128 - because OP_NOT and the related opcodes like OP_NOTSTAR operate on - single-bytes only. This is an historical hangover. Maybe one day we can - tidy these opcodes to handle multi-byte characters. + less than 256. As long as there were no characters >= 128 and there was no + use of \p or \P, in other words, no use of any XCLASS features, we can + optimize. + + In UTF-8 mode, we can optimize the negative case only if there were no + characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR + operate on single-bytes only. This is an historical hangover. Maybe one day + we can tidy these opcodes to handle multi-byte characters. The optimization throws away the bit map. We turn the item into a 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note @@ -2797,10 +3409,8 @@ for (;; ptr++) reqbyte, save the previous value for reinstating. */ #ifdef SUPPORT_UTF8 - if (class_charcount == 1 && - (!utf8 || - (!class_utf8 && (!negate_class || class_lastchar < 128)))) - + if (class_charcount == 1 && !class_utf8 && + (!utf8 || !negate_class || class_lastchar < 128)) #else if (class_charcount == 1) #endif @@ -2843,11 +3453,14 @@ for (;; ptr++) zeroreqbyte = reqbyte; /* If there are characters with values > 255, we have to compile an - extended class, with its own opcode. If there are no characters < 256, - we can omit the bitmap in the actual compiled code. */ + extended class, with its own opcode, unless there was a negated special + such as \S in the class, because in that case all characters > 255 are in + the class, so any that were explicitly given as well can be ignored. If + (when there are explicit characters > 255 that must be listed) there are no + characters < 256, we can omit the bitmap in the actual compiled code. */ #ifdef SUPPORT_UTF8 - if (class_utf8) + if (class_utf8 && !should_flip_negation) { *class_utf8data++ = XCL_END; /* Marks the end of extra data */ *code++ = OP_XCLASS; @@ -2873,20 +3486,19 @@ for (;; ptr++) } #endif - /* If there are no characters > 255, negate the 32-byte map if necessary, - and copy it into the code vector. If this is the first thing in the branch, - there can be no first char setting, whatever the repeat count. Any reqbyte - setting must remain unchanged after any kind of repeat. */ + /* If there are no characters > 255, set the opcode to OP_CLASS or + OP_NCLASS, depending on whether the whole class was negated and whether + there were negative specials such as \S in the class. Then copy the 32-byte + map into the code vector, negating it if necessary. */ + *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS; if (negate_class) { - *code++ = OP_NCLASS; if (lengthptr == NULL) /* Save time in the pre-compile phase */ for (c = 0; c < 32; c++) code[c] = ~classbits[c]; } else { - *code++ = OP_CLASS; memcpy(code, classbits, 32); } code += 32; @@ -3072,7 +3684,7 @@ for (;; ptr++) /* All real repeats make it impossible to handle partial matching (maybe one day we will be able to remove this restriction). */ - if (repeat_max != 1) cd->nopartial = TRUE; + if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; /* Combine the op_type with the repeat_type */ @@ -3222,7 +3834,7 @@ for (;; ptr++) /* All real repeats make it impossible to handle partial matching (maybe one day we will be able to remove this restriction). */ - if (repeat_max != 1) cd->nopartial = TRUE; + if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; if (repeat_min == 0 && repeat_max == -1) *code++ = OP_CRSTAR + repeat_type; @@ -3258,14 +3870,6 @@ for (;; ptr++) goto FAILED; } - /* This is a paranoid check to stop integer overflow later on */ - - if (len > MAX_DUPLENGTH) - { - *errorcodeptr = ERR50; - goto FAILED; - } - /* If the maximum repeat count is unlimited, find the end of the bracket by scanning through from the start, and compute the offset back to it from the current code pointer. There may be an OP_OPT setting following @@ -3288,28 +3892,38 @@ for (;; ptr++) if (repeat_min == 0) { - /* If the maximum is also zero, we just omit the group from the output - altogether. */ - - if (repeat_max == 0) - { - code = previous; - goto END_REPEAT; - } - - /* If the maximum is 1 or unlimited, we just have to stick in the - BRAZERO and do no more at this point. However, we do need to adjust - any OP_RECURSE calls inside the group that refer to the group itself or - any internal or forward referenced group, because the offset is from - the start of the whole regex. Temporarily terminate the pattern while - doing this. */ - - if (repeat_max <= 1) + /* If the maximum is also zero, we used to just omit the group from the + output altogether, like this: + + ** if (repeat_max == 0) + ** { + ** code = previous; + ** goto END_REPEAT; + ** } + + However, that fails when a group is referenced as a subroutine from + elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it + so that it is skipped on execution. As we don't have a list of which + groups are referenced, we cannot do this selectively. + + If the maximum is 1 or unlimited, we just have to stick in the BRAZERO + and do no more at this point. However, we do need to adjust any + OP_RECURSE calls inside the group that refer to the group itself or any + internal or forward referenced group, because the offset is from the + start of the whole regex. Temporarily terminate the pattern while doing + this. */ + + if (repeat_max <= 1) /* Covers 0, 1, and unlimited */ { *code = OP_END; adjust_recurse(previous, 1, utf8, cd, save_hwm); memmove(previous+1, previous, len); code++; + if (repeat_max == 0) + { + *previous++ = OP_SKIPZERO; + goto END_REPEAT; + } *previous++ = OP_BRAZERO + repeat_type; } @@ -3354,10 +3968,21 @@ for (;; ptr++) if (repeat_min > 1) { /* In the pre-compile phase, we don't actually do the replication. We - just adjust the length as if we had. */ + just adjust the length as if we had. Do some paranoid checks for + potential integer overflow. */ if (lengthptr != NULL) - *lengthptr += (repeat_min - 1)*length_prevgroup; + { + int delta = (repeat_min - 1)*length_prevgroup; + if ((double)(repeat_min - 1)*(double)length_prevgroup > + (double)INT_MAX || + OFLOW_MAX - *lengthptr < delta) + { + *errorcodeptr = ERR20; + goto FAILED; + } + *lengthptr += delta; + } /* This is compiling for real */ @@ -3395,11 +4020,23 @@ for (;; ptr++) /* In the pre-compile phase, we don't actually do the replication. We just adjust the length as if we had. For each repetition we must add 1 to the length for BRAZERO and for all but the last repetition we must - add 2 + 2*LINKSIZE to allow for the nesting that occurs. */ + add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some + paranoid checks to avoid integer overflow. */ if (lengthptr != NULL && repeat_max > 0) - *lengthptr += repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) - - 2 - 2*LINK_SIZE; /* Last one doesn't nest */ + { + int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) - + 2 - 2*LINK_SIZE; /* Last one doesn't nest */ + if ((double)repeat_max * + (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE) + > (double)INT_MAX || + OFLOW_MAX - *lengthptr < delta) + { + *errorcodeptr = ERR20; + goto FAILED; + } + *lengthptr += delta; + } /* This is compiling for real */ @@ -3481,6 +4118,13 @@ for (;; ptr++) } } + /* If previous is OP_FAIL, it was generated by an empty class [] in + JavaScript mode. The other ways in which OP_FAIL can be generated, that is + by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat" + error above. We can just ignore the repeat in JS case. */ + + else if (*previous == OP_FAIL) goto END_REPEAT; + /* Else there's some kind of shambles */ else @@ -3507,7 +4151,9 @@ for (;; ptr++) int len; if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT || *tempcode == OP_NOTEXACT) - tempcode += _pcre_OP_lengths[*tempcode]; + tempcode += _pcre_OP_lengths[*tempcode] + + ((*tempcode == OP_TYPEEXACT && + (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0); len = code - tempcode; if (len > 0) switch (*tempcode) { @@ -3551,17 +4197,55 @@ for (;; ptr++) /* ===================================================================*/ /* Start of nested parenthesized sub-expression, or comment or lookahead or lookbehind or option setting or condition or all the other extended - parenthesis forms. First deal with the specials; all are introduced by ?, - and the appearance of any of them means that this is not a capturing - group. */ + parenthesis forms. */ case '(': newoptions = options; skipbytes = 0; bravalue = OP_CBRA; save_hwm = cd->hwm; + reset_bracount = FALSE; - if (*(++ptr) == '?') + /* First deal with various "verbs" that can be introduced by '*'. */ + + if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0) + { + int i, namelen; + const char *vn = verbnames; + const uschar *name = ++ptr; + previous = NULL; + while ((cd->ctypes[*++ptr] & ctype_letter) != 0); + if (*ptr == ':') + { + *errorcodeptr = ERR59; /* Not supported */ + goto FAILED; + } + if (*ptr != ')') + { + *errorcodeptr = ERR60; + goto FAILED; + } + namelen = ptr - name; + for (i = 0; i < verbcount; i++) + { + if (namelen == verbs[i].len && + strncmp((char *)name, vn, namelen) == 0) + { + *code = verbs[i].op; + if (*code++ == OP_ACCEPT) cd->had_accept = TRUE; + break; + } + vn += verbs[i].len + 1; + } + if (i < verbcount) continue; + *errorcodeptr = ERR60; + goto FAILED; + } + + /* Deal with the extended parentheses; all are introduced by '?', and the + appearance of any of them means that this is not a capturing group. */ + + else if (*ptr == '?') { int i, set, unset, namelen; int *optset; @@ -3582,6 +4266,11 @@ for (;; ptr++) /* ------------------------------------------------------------ */ + case '|': /* Reset capture count for each branch */ + reset_bracount = TRUE; + /* Fall through */ + + /* ------------------------------------------------------------ */ case ':': /* Non-capturing bracket */ bravalue = OP_BRA; ptr++; @@ -3617,6 +4306,7 @@ for (;; ptr++) code[1+LINK_SIZE] = OP_CREF; skipbytes = 3; + refsign = -1; /* Check for a test for recursion in a named group. */ @@ -3640,7 +4330,11 @@ for (;; ptr++) terminator = '\''; ptr++; } - else terminator = 0; + else + { + terminator = 0; + if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr); + } /* We now expect to read a name; any thing else is an error */ @@ -3676,7 +4370,29 @@ for (;; ptr++) if (lengthptr != NULL) break; /* In the real compile we do the work of looking for the actual - reference. */ + reference. If the string started with "+" or "-" we require the rest to + be digits, in which case recno will be set. */ + + if (refsign > 0) + { + if (recno <= 0) + { + *errorcodeptr = ERR58; + goto FAILED; + } + recno = (refsign == '-')? + cd->bracount - recno + 1 : recno +cd->bracount; + if (recno <= 0 || recno > cd->final_bracount) + { + *errorcodeptr = ERR15; + goto FAILED; + } + PUT2(code, 2+LINK_SIZE, recno); + break; + } + + /* Otherwise (did not start with "+" or "-"), start by looking for the + name. */ slot = cd->name_table; for (i = 0; i < cd->names_found; i++) @@ -3695,7 +4411,7 @@ for (;; ptr++) /* Search the pattern for a forward reference */ - else if ((i = find_parens(ptr, cd->bracount, name, namelen, + else if ((i = find_parens(ptr, cd, name, namelen, (options & PCRE_EXTENDED) != 0)) > 0) { PUT2(code, 2+LINK_SIZE, i); @@ -3742,9 +4458,10 @@ for (;; ptr++) skipbytes = 1; } - /* Check for the "name" actually being a subpattern number. */ + /* Check for the "name" actually being a subpattern number. We are + in the second pass here, so final_bracount is set. */ - else if (recno > 0) + else if (recno > 0 && recno <= cd->final_bracount) { PUT2(code, 2+LINK_SIZE, recno); } @@ -3768,8 +4485,14 @@ for (;; ptr++) /* ------------------------------------------------------------ */ case '!': /* Negative lookahead */ - bravalue = OP_ASSERT_NOT; ptr++; + if (*ptr == ')') /* Optimize (?!) */ + { + *code++ = OP_FAIL; + previous = NULL; + continue; + } + bravalue = OP_ASSERT_NOT; break; @@ -3932,7 +4655,9 @@ for (;; ptr++) /* We come here from the Python syntax above that handles both references (?P=name) and recursion (?P>name), as well as falling - through from the Perl recursion syntax (?&name). */ + through from the Perl recursion syntax (?&name). We also come here from + the Perl \k<name> or \k'name' back reference syntax and the \k{name} + .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */ NAMED_REF_OR_RECURSE: name = ++ptr; @@ -3944,6 +4669,11 @@ for (;; ptr++) if (lengthptr != NULL) { + if (namelen == 0) + { + *errorcodeptr = ERR62; + goto FAILED; + } if (*ptr != terminator) { *errorcodeptr = ERR42; @@ -3957,14 +4687,19 @@ for (;; ptr++) recno = 0; } - /* In the real compile, seek the name in the table */ + /* In the real compile, seek the name in the table. We check the name + first, and then check that we have reached the end of the name in the + table. That way, if the name that is longer than any in the table, + the comparison will fail without reading beyond the table entry. */ else { slot = cd->name_table; for (i = 0; i < cd->names_found; i++) { - if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break; + if (strncmp((char *)name, (char *)slot+2, namelen) == 0 && + slot[2+namelen] == 0) + break; slot += cd->name_entry_size; } @@ -3973,7 +4708,7 @@ for (;; ptr++) recno = GET2(slot, 0); } else if ((recno = /* Forward back reference */ - find_parens(ptr, cd->bracount, name, namelen, + find_parens(ptr, cd, name, namelen, (options & PCRE_EXTENDED) != 0)) <= 0) { *errorcodeptr = ERR15; @@ -3995,19 +4730,71 @@ for (;; ptr++) /* ------------------------------------------------------------ */ + case '-': case '+': case '0': case '1': case '2': case '3': case '4': /* Recursion or */ case '5': case '6': case '7': case '8': case '9': /* subroutine */ { const uschar *called; + terminator = ')'; + + /* Come here from the \g<...> and \g'...' code (Oniguruma + compatibility). However, the syntax has been checked to ensure that + the ... are a (signed) number, so that neither ERR63 nor ERR29 will + be called on this path, nor with the jump to OTHER_CHAR_AFTER_QUERY + ever be taken. */ + + HANDLE_NUMERICAL_RECURSION: + + if ((refsign = *ptr) == '+') + { + ptr++; + if ((digitab[*ptr] & ctype_digit) == 0) + { + *errorcodeptr = ERR63; + goto FAILED; + } + } + else if (refsign == '-') + { + if ((digitab[ptr[1]] & ctype_digit) == 0) + goto OTHER_CHAR_AFTER_QUERY; + ptr++; + } + recno = 0; while((digitab[*ptr] & ctype_digit) != 0) recno = recno * 10 + *ptr++ - '0'; - if (*ptr != ')') + + if (*ptr != terminator) { *errorcodeptr = ERR29; goto FAILED; } + if (refsign == '-') + { + if (recno == 0) + { + *errorcodeptr = ERR58; + goto FAILED; + } + recno = cd->bracount - recno + 1; + if (recno <= 0) + { + *errorcodeptr = ERR15; + goto FAILED; + } + } + else if (refsign == '+') + { + if (recno == 0) + { + *errorcodeptr = ERR58; + goto FAILED; + } + recno += cd->bracount; + } + /* Come here from code above that handles a named recursion */ HANDLE_RECURSION: @@ -4031,8 +4818,8 @@ for (;; ptr++) if (called == NULL) { - if (find_parens(ptr, cd->bracount, NULL, recno, - (options & PCRE_EXTENDED) != 0) < 0) + if (find_parens(ptr, cd, NULL, recno, + (options & PCRE_EXTENDED) != 0) < 0) { *errorcodeptr = ERR15; goto FAILED; @@ -4080,6 +4867,7 @@ for (;; ptr++) /* ------------------------------------------------------------ */ default: /* Other characters: check option setting */ + OTHER_CHAR_AFTER_QUERY: set = unset = 0; optset = &set; @@ -4091,7 +4879,7 @@ for (;; ptr++) case 'J': /* Record that it changed in the external options */ *optset |= PCRE_DUPNAMES; - cd->external_options |= PCRE_JCHANGED; + cd->external_flags |= PCRE_JCHANGED; break; case 'i': *optset |= PCRE_CASELESS; break; @@ -4141,7 +4929,7 @@ for (;; ptr++) (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE)) { cd->external_options = newoptions; - options = newoptions; + options = *optionsptr = newoptions; } else { @@ -4214,6 +5002,7 @@ for (;; ptr++) errorcodeptr, /* Where to put an error message */ (bravalue == OP_ASSERTBACK || bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */ + reset_bracount, /* True if (?| group */ skipbytes, /* Skip over bracket number */ &subfirstbyte, /* For possible first char */ &subreqbyte, /* For possible last char */ @@ -4230,9 +5019,11 @@ for (;; ptr++) is on the bracket. */ /* If this is a conditional bracket, check that there are no more than - two branches in the group, or just one if it's a DEFINE group. */ + two branches in the group, or just one if it's a DEFINE group. We do this + in the real compile phase, not in the pre-pass, where the whole group may + not be available. */ - if (bravalue == OP_COND) + if (bravalue == OP_COND && lengthptr == NULL) { uschar *tc = code; int condcount = 0; @@ -4279,23 +5070,29 @@ for (;; ptr++) goto FAILED; } - /* In the pre-compile phase, update the length by the length of the nested - group, less the brackets at either end. Then reduce the compiled code to - just the brackets so that it doesn't use much memory if it is duplicated by - a quantifier. */ + /* In the pre-compile phase, update the length by the length of the group, + less the brackets at either end. Then reduce the compiled code to just a + set of non-capturing brackets so that it doesn't use much memory if it is + duplicated by a quantifier.*/ if (lengthptr != NULL) { + if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE) + { + *errorcodeptr = ERR20; + goto FAILED; + } *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE; - code++; + *code++ = OP_BRA; PUTINC(code, 0, 1 + LINK_SIZE); *code++ = OP_KET; PUTINC(code, 0, 1 + LINK_SIZE); + break; /* No need to waste time with special character handling */ } /* Otherwise update the main code pointer to the end of the group. */ - else code = tempcode; + code = tempcode; /* For a DEFINE group, required and first character settings are not relevant. */ @@ -4392,12 +5189,71 @@ for (;; ptr++) zerofirstbyte = firstbyte; zeroreqbyte = reqbyte; - /* \k<name> or \k'name' is a back reference by name (Perl syntax) */ + /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n' + is a subroutine call by number (Oniguruma syntax). In fact, the value + -ESC_g is returned only for these cases. So we don't need to check for < + or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is + -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as + that is a synonym for a named back reference). */ - if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'')) + if (-c == ESC_g) { - is_recurse = FALSE; + const uschar *p; + save_hwm = cd->hwm; /* Normally this is set when '(' is read */ terminator = (*(++ptr) == '<')? '>' : '\''; + + /* These two statements stop the compiler for warning about possibly + unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In + fact, because we actually check for a number below, the paths that + would actually be in error are never taken. */ + + skipbytes = 0; + reset_bracount = FALSE; + + /* Test for a name */ + + if (ptr[1] != '+' && ptr[1] != '-') + { + BOOL isnumber = TRUE; + for (p = ptr + 1; *p != 0 && *p != terminator; p++) + { + if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE; + if ((cd->ctypes[*p] & ctype_word) == 0) break; + } + if (*p != terminator) + { + *errorcodeptr = ERR57; + break; + } + if (isnumber) + { + ptr++; + goto HANDLE_NUMERICAL_RECURSION; + } + is_recurse = TRUE; + goto NAMED_REF_OR_RECURSE; + } + + /* Test a signed number in angle brackets or quotes. */ + + p = ptr + 2; + while ((digitab[*p] & ctype_digit) != 0) p++; + if (*p != terminator) + { + *errorcodeptr = ERR57; + break; + } + ptr++; + goto HANDLE_NUMERICAL_RECURSION; + } + + /* \k<name> or \k'name' is a back reference by name (Perl syntax). + We also support \k{name} (.NET syntax) */ + + if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{')) + { + is_recurse = FALSE; + terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}'; goto NAMED_REF_OR_RECURSE; } @@ -4498,6 +5354,11 @@ for (;; ptr++) *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR; for (c = 0; c < mclength; c++) *code++ = mcbuffer[c]; + /* Remember if \r or \n were seen */ + + if (mcbuffer[0] == '\r' || mcbuffer[0] == '\n') + cd->external_flags |= PCRE_HASCRORLF; + /* Set the first and required bytes appropriately. If no previous first byte, set it from this character, but revert to none on a zero repeat. Otherwise, leave the firstbyte value alone, and don't change it on a zero @@ -4563,13 +5424,14 @@ This function is used during the pre-compile phase when we are trying to find out the amount of memory needed, as well as during the real compile phase. The value of lengthptr distinguishes the two phases. -Argument: +Arguments: options option bits, including any changes for this subpattern oldims previous settings of ims option bits codeptr -> the address of the current code pointer ptrptr -> the address of the current pattern pointer errorcodeptr -> pointer to error code variable lookbehind TRUE if this is a lookbehind assertion + reset_bracount TRUE to reset the count for each branch skipbytes skip this many bytes at start (for brackets and OP_COND) firstbyteptr place to put the first required character, or a negative number reqbyteptr place to put the last required character, or a negative number @@ -4583,8 +5445,9 @@ Returns: TRUE on success static BOOL compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr, - int *errorcodeptr, BOOL lookbehind, int skipbytes, int *firstbyteptr, - int *reqbyteptr, branch_chain *bcptr, compile_data *cd, int *lengthptr) + int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes, + int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd, + int *lengthptr) { const uschar *ptr = *ptrptr; uschar *code = *codeptr; @@ -4594,6 +5457,8 @@ uschar *reverse_count = NULL; int firstbyte, reqbyte; int branchfirstbyte, branchreqbyte; int length; +int orig_bracount; +int max_bracount; branch_chain bc; bc.outer = bcptr; @@ -4622,8 +5487,14 @@ code += 1 + LINK_SIZE + skipbytes; /* Loop for each alternative branch */ +orig_bracount = max_bracount = cd->bracount; for (;;) { + /* For a (?| group, reset the capturing bracket count so that each branch + uses the same numbers. */ + + if (reset_bracount) cd->bracount = orig_bracount; + /* Handle a change of ims options at the start of the branch */ if ((options & PCRE_IMS) != oldims) @@ -4653,6 +5524,11 @@ for (;;) return FALSE; } + /* Keep the highest bracket count in case (?| was used and some branch + has fewer than the rest. */ + + if (cd->bracount > max_bracount) max_bracount = cd->bracount; + /* In the real compile phase, there is some post-processing to be done. */ if (lengthptr == NULL) @@ -4716,26 +5592,29 @@ for (;;) } } - /* Reached end of expression, either ')' or end of pattern. Go back through - the alternative branches and reverse the chain of offsets, with the field in - the BRA item now becoming an offset to the first alternative. If there are - no alternatives, it points to the end of the group. The length in the - terminating ket is always the length of the whole bracketed item. If any of - the ims options were changed inside the group, compile a resetting op-code - following, except at the very end of the pattern. Return leaving the pointer - at the terminating char. */ + /* Reached end of expression, either ')' or end of pattern. In the real + compile phase, go back through the alternative branches and reverse the chain + of offsets, with the field in the BRA item now becoming an offset to the + first alternative. If there are no alternatives, it points to the end of the + group. The length in the terminating ket is always the length of the whole + bracketed item. If any of the ims options were changed inside the group, + compile a resetting op-code following, except at the very end of the pattern. + Return leaving the pointer at the terminating char. */ if (*ptr != '|') { - int branch_length = code - last_branch; - do + if (lengthptr == NULL) { - int prev_length = GET(last_branch, 1); - PUT(last_branch, 1, branch_length); - branch_length = prev_length; - last_branch -= branch_length; + int branch_length = code - last_branch; + do + { + int prev_length = GET(last_branch, 1); + PUT(last_branch, 1, branch_length); + branch_length = prev_length; + last_branch -= branch_length; + } + while (branch_length > 0); } - while (branch_length > 0); /* Fill in the ket */ @@ -4752,27 +5631,51 @@ for (;;) length += 2; } + /* Retain the highest bracket number, in case resetting was used. */ + + cd->bracount = max_bracount; + /* Set values to pass back */ *codeptr = code; *ptrptr = ptr; *firstbyteptr = firstbyte; *reqbyteptr = reqbyte; - if (lengthptr != NULL) *lengthptr += length; + if (lengthptr != NULL) + { + if (OFLOW_MAX - *lengthptr < length) + { + *errorcodeptr = ERR20; + return FALSE; + } + *lengthptr += length; + } return TRUE; } - /* Another branch follows; insert an "or" node. Its length field points back + /* Another branch follows. In the pre-compile phase, we can move the code + pointer back to where it was for the start of the first branch. (That is, + pretend that each branch is the only one.) + + In the real compile phase, insert an ALT node. Its length field points back to the previous branch while the bracket remains open. At the end the chain is reversed. It's done like this so that the start of the bracket has a zero offset until it is closed, making it possible to detect recursion. */ - *code = OP_ALT; - PUT(code, 1, code - last_branch); - bc.current = last_branch = code; - code += 1 + LINK_SIZE; + if (lengthptr != NULL) + { + code = *codeptr + 1 + LINK_SIZE + skipbytes; + length += 1 + LINK_SIZE; + } + else + { + *code = OP_ALT; + PUT(code, 1, code - last_branch); + bc.current = last_branch = code; + code += 1 + LINK_SIZE; + } + ptr++; - length += 1 + LINK_SIZE; } /* Control never reaches here */ } @@ -4850,14 +5753,14 @@ do { if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE; } - /* .* is not anchored unless DOTALL is set and it isn't in brackets that - are or may be referenced. */ + /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and + it isn't in brackets that are or may be referenced. */ else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR || - op == OP_TYPEPOSSTAR) && - (*options & PCRE_DOTALL) != 0) + op == OP_TYPEPOSSTAR)) { - if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE; + if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0) + return FALSE; } /* Check for explicit anchoring */ @@ -5039,7 +5942,7 @@ Returns: pointer to compiled data block, or NULL on error, with errorptr and erroroffset set */ -PCRE_DATA_SCOPE pcre * +PCRE_EXP_DEFN pcre * pcre_compile(const char *pattern, int options, const char **errorptr, int *erroroffset, const unsigned char *tables) { @@ -5047,7 +5950,7 @@ return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables); } -PCRE_DATA_SCOPE pcre * +PCRE_EXP_DEFN pcre * pcre_compile2(const char *pattern, int options, int *errorcodeptr, const char **errorptr, int *erroroffset, const unsigned char *tables) { @@ -5055,6 +5958,7 @@ real_pcre *re; int length = 1; /* For final END opcode */ int firstbyte, reqbyte, newline; int errorcode = 0; +int skipatstart = 0; #ifdef SUPPORT_UTF8 BOOL utf8; #endif @@ -5073,7 +5977,6 @@ to fill in forward references to subpatterns. */ uschar cworkspace[COMPILE_WORK_SIZE]; - /* Set this early so that early errors get offset 0. */ ptr = (const uschar *)pattern; @@ -5096,7 +5999,7 @@ if (errorcodeptr != NULL) *errorcodeptr = ERR0; if (erroroffset == NULL) { errorcode = ERR16; - goto PCRE_EARLY_ERROR_RETURN; + goto PCRE_EARLY_ERROR_RETURN2; } *erroroffset = 0; @@ -5109,7 +6012,7 @@ if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 && (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0) { errorcode = ERR44; - goto PCRE_UTF8_ERROR_RETURN; + goto PCRE_EARLY_ERROR_RETURN2; } #else if ((options & PCRE_UTF8) != 0) @@ -5133,21 +6036,69 @@ cd->fcc = tables + fcc_offset; cd->cbits = tables + cbits_offset; cd->ctypes = tables + ctypes_offset; +/* Check for global one-time settings at the start of the pattern, and remember +the offset for later. */ + +while (ptr[skipatstart] == '(' && ptr[skipatstart+1] == '*') + { + int newnl = 0; + int newbsr = 0; + + if (strncmp((char *)(ptr+skipatstart+2), "CR)", 3) == 0) + { skipatstart += 5; newnl = PCRE_NEWLINE_CR; } + else if (strncmp((char *)(ptr+skipatstart+2), "LF)", 3) == 0) + { skipatstart += 5; newnl = PCRE_NEWLINE_LF; } + else if (strncmp((char *)(ptr+skipatstart+2), "CRLF)", 5) == 0) + { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; } + else if (strncmp((char *)(ptr+skipatstart+2), "ANY)", 4) == 0) + { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; } + else if (strncmp((char *)(ptr+skipatstart+2), "ANYCRLF)", 8) == 0) + { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; } + + else if (strncmp((char *)(ptr+skipatstart+2), "BSR_ANYCRLF)", 12) == 0) + { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; } + else if (strncmp((char *)(ptr+skipatstart+2), "BSR_UNICODE)", 12) == 0) + { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; } + + if (newnl != 0) + options = (options & ~PCRE_NEWLINE_BITS) | newnl; + else if (newbsr != 0) + options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr; + else break; + } + +/* Check validity of \R options. */ + +switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) + { + case 0: + case PCRE_BSR_ANYCRLF: + case PCRE_BSR_UNICODE: + break; + default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN; + } + /* Handle different types of newline. The three bits give seven cases. The -current code allows for fixed one- or two-byte sequences, plus "any". */ +current code allows for fixed one- or two-byte sequences, plus "any" and +"anycrlf". */ -switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY)) +switch (options & PCRE_NEWLINE_BITS) { - case 0: newline = NEWLINE; break; /* Compile-time default */ + case 0: newline = NEWLINE; break; /* Build-time default */ case PCRE_NEWLINE_CR: newline = '\r'; break; case PCRE_NEWLINE_LF: newline = '\n'; break; case PCRE_NEWLINE_CR+ PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break; case PCRE_NEWLINE_ANY: newline = -1; break; + case PCRE_NEWLINE_ANYCRLF: newline = -2; break; default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN; } -if (newline < 0) +if (newline == -2) + { + cd->nltype = NLTYPE_ANYCRLF; + } +else if (newline < 0) { cd->nltype = NLTYPE_ANY; } @@ -5186,7 +6137,7 @@ to compile parts of the pattern into; the compiled code is discarded when it is no longer needed, so hopefully this workspace will never overflow, though there is a test for its doing so. */ -cd->bracount = 0; +cd->bracount = cd->final_bracount = 0; cd->names_found = 0; cd->name_entry_size = 0; cd->name_table = NULL; @@ -5196,8 +6147,8 @@ cd->hwm = cworkspace; cd->start_pattern = (const uschar *)pattern; cd->end_pattern = (const uschar *)(pattern + strlen(pattern)); cd->req_varyopt = 0; -cd->nopartial = FALSE; cd->external_options = options; +cd->external_flags = 0; /* Now do the pre-compile. On error, errorcode will be set non-zero, so we don't need to look at the result of the function here. The initial options have @@ -5205,10 +6156,12 @@ been put into the cd block so that they can be changed if an option setting is found within the regex right at the beginning. Bringing initial option settings outside can help speed up starting point checks. */ +ptr += skipatstart; code = cworkspace; *code = OP_BRA; (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS, - &code, &ptr, &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, &length); + &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, + &length); if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN; DPRINTF(("end pre-compile: length=%d workspace=%d\n", length, @@ -5234,14 +6187,16 @@ if (re == NULL) goto PCRE_EARLY_ERROR_RETURN; } -/* Put in the magic number, and save the sizes, initial options, and character -table pointer. NULL is used for the default character tables. The nullpad field -is at the end; it's there to help in the case when a regex compiled on a system -with 4-byte pointers is run on another with 8-byte pointers. */ +/* Put in the magic number, and save the sizes, initial options, internal +flags, and character table pointer. NULL is used for the default character +tables. The nullpad field is at the end; it's there to help in the case when a +regex compiled on a system with 4-byte pointers is run on another with 8-byte +pointers. */ re->magic_number = MAGIC_NUMBER; re->size = size; re->options = cd->external_options; +re->flags = cd->external_flags; re->dummy1 = 0; re->first_byte = 0; re->req_byte = 0; @@ -5259,6 +6214,7 @@ field. Reset the bracket count and the names_found field. Also reset the hwm field; this time it's used for remembering forward references to subpatterns. */ +cd->final_bracount = cd->bracount; /* Save for checking forward references */ cd->bracount = 0; cd->names_found = 0; cd->name_table = (uschar *)re + re->name_table_offset; @@ -5266,21 +6222,22 @@ codestart = cd->name_table + re->name_entry_size * re->name_count; cd->start_code = codestart; cd->hwm = cworkspace; cd->req_varyopt = 0; -cd->nopartial = FALSE; +cd->had_accept = FALSE; /* Set up a starting, non-extracting bracket, then compile the expression. On error, errorcode will be set non-zero, so we don't need to look at the result of the function here. */ -ptr = (const uschar *)pattern; +ptr = (const uschar *)pattern + skipatstart; code = (uschar *)codestart; *code = OP_BRA; (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr, - &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL); + &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL); re->top_bracket = cd->bracount; re->top_backref = cd->top_backref; +re->flags = cd->external_flags; -if (cd->nopartial) re->options |= PCRE_NOPARTIAL; +if (cd->had_accept) reqbyte = -1; /* Must disable after (*ACCEPT) */ /* If not reached end of pattern on success, there's an excess bracket. */ @@ -5321,10 +6278,8 @@ if (errorcode != 0) (pcre_free)(re); PCRE_EARLY_ERROR_RETURN: *erroroffset = ptr - (const uschar *)pattern; -#ifdef SUPPORT_UTF8 - PCRE_UTF8_ERROR_RETURN: -#endif - *errorptr = error_texts[errorcode]; + PCRE_EARLY_ERROR_RETURN2: + *errorptr = find_error_text(errorcode); if (errorcodeptr != NULL) *errorcodeptr = errorcode; return NULL; } @@ -5353,10 +6308,10 @@ if ((re->options & PCRE_ANCHORED) == 0) int ch = firstbyte & 255; re->first_byte = ((firstbyte & REQ_CASELESS) != 0 && cd->fcc[ch] == ch)? ch : firstbyte; - re->options |= PCRE_FIRSTSET; + re->flags |= PCRE_FIRSTSET; } else if (is_startline(codestart, 0, cd->backref_map)) - re->options |= PCRE_STARTLINE; + re->flags |= PCRE_STARTLINE; } } @@ -5370,7 +6325,7 @@ if (reqbyte >= 0 && int ch = reqbyte & 255; re->req_byte = ((reqbyte & REQ_CASELESS) != 0 && cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte; - re->options |= PCRE_REQCHSET; + re->flags |= PCRE_REQCHSET; } /* Print out the compiled data if debugging is enabled. This is never the @@ -5381,21 +6336,9 @@ case when building a production library. */ printf("Length = %d top_bracket = %d top_backref = %d\n", length, re->top_bracket, re->top_backref); -if (re->options != 0) - { - printf("%s%s%s%s%s%s%s%s%s\n", - ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "", - ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "", - ((re->options & PCRE_CASELESS) != 0)? "caseless " : "", - ((re->options & PCRE_EXTENDED) != 0)? "extended " : "", - ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "", - ((re->options & PCRE_DOTALL) != 0)? "dotall " : "", - ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "", - ((re->options & PCRE_EXTRA) != 0)? "extra " : "", - ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : ""); - } +printf("Options=%08x\n", re->options); -if ((re->options & PCRE_FIRSTSET) != 0) +if ((re->flags & PCRE_FIRSTSET) != 0) { int ch = re->first_byte & 255; const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)? @@ -5404,7 +6347,7 @@ if ((re->options & PCRE_FIRSTSET) != 0) else printf("First char = \\x%02x%s\n", ch, caseless); } -if ((re->options & PCRE_REQCHSET) != 0) +if ((re->flags & PCRE_REQCHSET) != 0) { int ch = re->req_byte & 255; const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)? @@ -5413,7 +6356,7 @@ if ((re->options & PCRE_REQCHSET) != 0) else printf("Req char = \\x%02x%s\n", ch, caseless); } -pcre_printint(re, stdout); +pcre_printint(re, stdout, TRUE); /* This check is done here in the debugging case so that the code that was compiled can be seen. */ @@ -5421,7 +6364,7 @@ was compiled can be seen. */ if (code - codestart > length) { (pcre_free)(re); - *errorptr = error_texts[ERR23]; + *errorptr = find_error_text(ERR23); *erroroffset = ptr - (uschar *)pattern; if (errorcodeptr != NULL) *errorcodeptr = ERR23; return NULL; |