summaryrefslogtreecommitdiff
path: root/ext/pcre
diff options
context:
space:
mode:
authorSVN Migration <svn@php.net>2003-02-27 17:43:39 +0000
committerSVN Migration <svn@php.net>2003-02-27 17:43:39 +0000
commit078bcec0997ad0e07b720c43cc9e6d0e046a75ab (patch)
tree36cb0f6be2ef078fe3374de8c087b93ecf82f812 /ext/pcre
parentfd61f69077f6156ca71dde60ecfd9ed9765a02db (diff)
downloadphp-git-PHP-5.tar.gz
This commit was manufactured by cvs2svn to create branch 'PHP_5'.PHP-5
Diffstat (limited to 'ext/pcre')
-rw-r--r--ext/pcre/CREDITS2
-rw-r--r--ext/pcre/config.m456
-rw-r--r--ext/pcre/config0.m456
-rw-r--r--ext/pcre/pcrelib/AUTHORS6
-rw-r--r--ext/pcre/pcrelib/COPYING50
-rw-r--r--ext/pcre/pcrelib/ChangeLog958
-rw-r--r--ext/pcre/pcrelib/INSTALL185
-rw-r--r--ext/pcre/pcrelib/LICENCE50
-rw-r--r--ext/pcre/pcrelib/NEWS85
-rw-r--r--ext/pcre/pcrelib/NON-UNIX-USE89
-rw-r--r--ext/pcre/pcrelib/README312
-rw-r--r--ext/pcre/pcrelib/chartables.c183
-rw-r--r--ext/pcre/pcrelib/dftables.c148
-rw-r--r--ext/pcre/pcrelib/dll.mk60
-rw-r--r--ext/pcre/pcrelib/doc/Tech.Notes253
-rw-r--r--ext/pcre/pcrelib/doc/pcre.31993
-rw-r--r--ext/pcre/pcrelib/doc/pcre.html2672
-rw-r--r--ext/pcre/pcrelib/doc/pcre.txt2307
-rw-r--r--ext/pcre/pcrelib/doc/pcregrep.192
-rw-r--r--ext/pcre/pcrelib/doc/pcregrep.html125
-rw-r--r--ext/pcre/pcrelib/doc/pcregrep.txt109
-rw-r--r--ext/pcre/pcrelib/doc/pcreposix.3149
-rw-r--r--ext/pcre/pcrelib/doc/pcreposix.html191
-rw-r--r--ext/pcre/pcrelib/doc/pcreposix.txt159
-rw-r--r--ext/pcre/pcrelib/doc/pcretest.1288
-rw-r--r--ext/pcre/pcrelib/doc/pcretest.html377
-rw-r--r--ext/pcre/pcrelib/doc/pcretest.txt329
-rw-r--r--ext/pcre/pcrelib/doc/perltest.txt34
-rw-r--r--ext/pcre/pcrelib/get.c227
-rw-r--r--ext/pcre/pcrelib/internal.h632
-rw-r--r--ext/pcre/pcrelib/maketables.c136
-rw-r--r--ext/pcre/pcrelib/pcre.c5992
-rw-r--r--ext/pcre/pcrelib/pcre.def22
-rw-r--r--ext/pcre/pcrelib/pcre.h150
-rw-r--r--ext/pcre/pcrelib/pcregrep.c640
-rw-r--r--ext/pcre/pcrelib/pcreposix.c301
-rw-r--r--ext/pcre/pcrelib/pcreposix.h88
-rw-r--r--ext/pcre/pcrelib/pcretest.c1274
-rw-r--r--ext/pcre/pcrelib/study.c409
-rw-r--r--ext/pcre/pcrelib/testdata/testinput13808
-rw-r--r--ext/pcre/pcrelib/testdata/testinput21162
-rw-r--r--ext/pcre/pcrelib/testdata/testinput365
-rw-r--r--ext/pcre/pcrelib/testdata/testinput4155
-rw-r--r--ext/pcre/pcrelib/testdata/testinput591
-rw-r--r--ext/pcre/pcrelib/testdata/testinput678
-rw-r--r--ext/pcre/pcrelib/testdata/testoutput16222
-rw-r--r--ext/pcre/pcrelib/testdata/testoutput24088
-rw-r--r--ext/pcre/pcrelib/testdata/testoutput3116
-rw-r--r--ext/pcre/pcrelib/testdata/testoutput4304
-rw-r--r--ext/pcre/pcrelib/testdata/testoutput5339
-rw-r--r--ext/pcre/pcrelib/testdata/testoutput6319
-rw-r--r--ext/pcre/php_pcre.c1508
-rw-r--r--ext/pcre/php_pcre.h78
-rw-r--r--ext/pcre/tests/bug20528.phpt24
-rw-r--r--ext/pcre/tests/bug21732.phpt29
55 files changed, 0 insertions, 39575 deletions
diff --git a/ext/pcre/CREDITS b/ext/pcre/CREDITS
deleted file mode 100644
index ae0fba35d4..0000000000
--- a/ext/pcre/CREDITS
+++ /dev/null
@@ -1,2 +0,0 @@
-Perl Compatible Regexps
-Andrei Zmievski
diff --git a/ext/pcre/config.m4 b/ext/pcre/config.m4
deleted file mode 100644
index 9e81ad2814..0000000000
--- a/ext/pcre/config.m4
+++ /dev/null
@@ -1,56 +0,0 @@
-dnl
-dnl $Id$
-dnl
-
-dnl By default we'll compile and link against the bundled PCRE library
-dnl if DIR is supplied, we'll use that for linking
-
-PHP_ARG_WITH(pcre-regex,for PCRE support,
-[ --without-pcre-regex Do not include Perl Compatible Regular Expressions
- support. Use --with-pcre-regex=DIR to specify DIR
- where PCRE's include and library files are located,
- if not using bundled library.],yes)
-
-if test "$PHP_PCRE_REGEX" != "no"; then
- if test "$PHP_PCRE_REGEX" = "yes"; then
- PHP_NEW_EXTENSION(pcre, pcrelib/maketables.c pcrelib/get.c pcrelib/study.c pcrelib/pcre.c php_pcre.c, $ext_shared,,-DSUPPORT_UTF8 -DLINK_SIZE=2 -I@ext_srcdir@/pcrelib)
- PHP_ADD_BUILD_DIR($ext_builddir/pcrelib)
- AC_DEFINE(HAVE_BUNDLED_PCRE, 1, [ ])
- else
- for i in $PHP_PCRE_REGEX $PHP_PCRE_REGEX/include $PHP_PCRE_REGEX/include/pcre; do
- test -f $i/pcre.h && PCRE_INCDIR=$i
- done
-
- if test -z "$PCRE_INCDIR"; then
- AC_MSG_ERROR([Could not find pcre.h in $PHP_PCRE_REGEX])
- fi
-
- for j in $PHP_PCRE_REGEX $PHP_PCRE_REGEX/lib; do
- test -f $j/libpcre.a -o -f $j/libpcre.$SHLIB_SUFFIX_NAME && PCRE_LIBDIR=$j
- done
-
- if test -z "$PCRE_LIBDIR" ; then
- AC_MSG_ERROR([Could not find libpcre.(a|$SHLIB_SUFFIX_NAME) in $PHP_PCRE_REGEX])
- fi
-
- changequote({,})
- pcre_major=`grep PCRE_MAJOR $PCRE_INCDIR/pcre.h | sed -e 's/[^0-9]//g'`
- pcre_minor=`grep PCRE_MINOR $PCRE_INCDIR/pcre.h | sed -e 's/[^0-9]//g'`
- changequote([,])
- pcre_minor_length=`echo "$pcre_minor" | wc -c | sed -e 's/[^0-9]//g'`
- if test "$pcre_minor_length" -eq 2 ; then
- pcre_minor="$pcre_minor"0
- fi
- pcre_version=$pcre_major$pcre_minor
- if test "$pcre_version" -lt 208; then
- AC_MSG_ERROR([The PCRE extension requires PCRE library version >= 2.08])
- fi
-
- PHP_ADD_LIBRARY_WITH_PATH(pcre, $PCRE_LIBDIR, PCRE_SHARED_LIBADD)
-
- AC_DEFINE(HAVE_PCRE, 1, [ ])
- PHP_ADD_INCLUDE($PCRE_INCDIR)
- PHP_NEW_EXTENSION(pcre, php_pcre.c, $ext_shared,,-DSUPPORT_UTF8 -DLINK_SIZE=2)
- fi
- PHP_SUBST(PCRE_SHARED_LIBADD)
-fi
diff --git a/ext/pcre/config0.m4 b/ext/pcre/config0.m4
deleted file mode 100644
index 9e81ad2814..0000000000
--- a/ext/pcre/config0.m4
+++ /dev/null
@@ -1,56 +0,0 @@
-dnl
-dnl $Id$
-dnl
-
-dnl By default we'll compile and link against the bundled PCRE library
-dnl if DIR is supplied, we'll use that for linking
-
-PHP_ARG_WITH(pcre-regex,for PCRE support,
-[ --without-pcre-regex Do not include Perl Compatible Regular Expressions
- support. Use --with-pcre-regex=DIR to specify DIR
- where PCRE's include and library files are located,
- if not using bundled library.],yes)
-
-if test "$PHP_PCRE_REGEX" != "no"; then
- if test "$PHP_PCRE_REGEX" = "yes"; then
- PHP_NEW_EXTENSION(pcre, pcrelib/maketables.c pcrelib/get.c pcrelib/study.c pcrelib/pcre.c php_pcre.c, $ext_shared,,-DSUPPORT_UTF8 -DLINK_SIZE=2 -I@ext_srcdir@/pcrelib)
- PHP_ADD_BUILD_DIR($ext_builddir/pcrelib)
- AC_DEFINE(HAVE_BUNDLED_PCRE, 1, [ ])
- else
- for i in $PHP_PCRE_REGEX $PHP_PCRE_REGEX/include $PHP_PCRE_REGEX/include/pcre; do
- test -f $i/pcre.h && PCRE_INCDIR=$i
- done
-
- if test -z "$PCRE_INCDIR"; then
- AC_MSG_ERROR([Could not find pcre.h in $PHP_PCRE_REGEX])
- fi
-
- for j in $PHP_PCRE_REGEX $PHP_PCRE_REGEX/lib; do
- test -f $j/libpcre.a -o -f $j/libpcre.$SHLIB_SUFFIX_NAME && PCRE_LIBDIR=$j
- done
-
- if test -z "$PCRE_LIBDIR" ; then
- AC_MSG_ERROR([Could not find libpcre.(a|$SHLIB_SUFFIX_NAME) in $PHP_PCRE_REGEX])
- fi
-
- changequote({,})
- pcre_major=`grep PCRE_MAJOR $PCRE_INCDIR/pcre.h | sed -e 's/[^0-9]//g'`
- pcre_minor=`grep PCRE_MINOR $PCRE_INCDIR/pcre.h | sed -e 's/[^0-9]//g'`
- changequote([,])
- pcre_minor_length=`echo "$pcre_minor" | wc -c | sed -e 's/[^0-9]//g'`
- if test "$pcre_minor_length" -eq 2 ; then
- pcre_minor="$pcre_minor"0
- fi
- pcre_version=$pcre_major$pcre_minor
- if test "$pcre_version" -lt 208; then
- AC_MSG_ERROR([The PCRE extension requires PCRE library version >= 2.08])
- fi
-
- PHP_ADD_LIBRARY_WITH_PATH(pcre, $PCRE_LIBDIR, PCRE_SHARED_LIBADD)
-
- AC_DEFINE(HAVE_PCRE, 1, [ ])
- PHP_ADD_INCLUDE($PCRE_INCDIR)
- PHP_NEW_EXTENSION(pcre, php_pcre.c, $ext_shared,,-DSUPPORT_UTF8 -DLINK_SIZE=2)
- fi
- PHP_SUBST(PCRE_SHARED_LIBADD)
-fi
diff --git a/ext/pcre/pcrelib/AUTHORS b/ext/pcre/pcrelib/AUTHORS
deleted file mode 100644
index 832dddca45..0000000000
--- a/ext/pcre/pcrelib/AUTHORS
+++ /dev/null
@@ -1,6 +0,0 @@
-Written by: Philip Hazel <ph10@cam.ac.uk>
-
-University of Cambridge Computing Service,
-Cambridge, England. Phone: +44 1223 334714.
-
-Copyright (c) 1997-2001 University of Cambridge
diff --git a/ext/pcre/pcrelib/COPYING b/ext/pcre/pcrelib/COPYING
deleted file mode 100644
index 8effa66492..0000000000
--- a/ext/pcre/pcrelib/COPYING
+++ /dev/null
@@ -1,50 +0,0 @@
-PCRE LICENCE
-------------
-
-PCRE is a library of functions to support regular expressions whose syntax
-and semantics are as close as possible to those of the Perl 5 language.
-
-Written by: Philip Hazel <ph10@cam.ac.uk>
-
-University of Cambridge Computing Service,
-Cambridge, England. Phone: +44 1223 334714.
-
-Copyright (c) 1997-2001 University of Cambridge
-
-Permission is granted to anyone to use this software for any purpose on any
-computer system, and to redistribute it freely, subject to the following
-restrictions:
-
-1. This software is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-
-2. The origin of this software must not be misrepresented, either by
- explicit claim or by omission. In practice, this means that if you use
- PCRE in software which you distribute to others, commercially or
- otherwise, you must put a sentence like this
-
- Regular expression support is provided by the PCRE library package,
- which is open source software, written by Philip Hazel, and copyright
- by the University of Cambridge, England.
-
- somewhere reasonably visible in your documentation and in any relevant
- files or online help data or similar. A reference to the ftp site for
- the source, that is, to
-
- ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/
-
- should also be given in the documentation.
-
-3. Altered versions must be plainly marked as such, and must not be
- misrepresented as being the original software.
-
-4. If PCRE is embedded in any software that is released under the GNU
- General Purpose Licence (GPL), or Lesser General Purpose Licence (LGPL),
- then the terms of that licence shall supersede any condition above with
- which it is incompatible.
-
-The documentation for PCRE, supplied in the "doc" directory, is distributed
-under the same terms as the software itself.
-
-End
diff --git a/ext/pcre/pcrelib/ChangeLog b/ext/pcre/pcrelib/ChangeLog
deleted file mode 100644
index 708b82a74b..0000000000
--- a/ext/pcre/pcrelib/ChangeLog
+++ /dev/null
@@ -1,958 +0,0 @@
-ChangeLog for PCRE
-------------------
-
-Version 4.00 ....
------------------
-
-1. If a comment in an extended regex that started immediately after a meta-item
-extended to the end of string, PCRE compiled incorrect data. This could lead to
-all kinds of weird effects. Example: /#/ was bad; /()#/ was bad; /a#/ was not.
-
-2. Moved to autoconf 2.53 and libtool 1.4.2.
-
-3. Perl 5.8 no longer needs "use utf8" for doing UTF-8 things. Consequently,
-the special perltest8 script is no longer needed - all the tests can be run
-from a single perltest script.
-
-4. From 5.004, Perl has not included the VT character (0x0b) in the set defined
-by \s. It has now been removed in PCRE. This means it isn't recognized as
-whitespace in /x regexes too, which is the same as Perl. Note that the POSIX
-class [:space:] *does* include VT, thereby creating a mess.
-
-5. Added the class [:blank:] (a GNU extension from Perl 5.8) to match only
-space and tab.
-
-6. Perl 5.005 was a long time ago. It's time to amalgamate the tests that use
-its new features into the main test script, reducing the number of scripts.
-
-7. Perl 5.8 has changed the meaning of patterns like /a(?i)b/. Earlier
-versions were backward compatible, and made the (?i) apply to the whole
-pattern, as if /i were given. Now it behaves more logically, and applies the
-option setting only to what follows. PCRE has been changed to follow suit.
-However, if it finds options settings right at the start of the pattern, it
-extracts them into the global options, as before. Thus, they show up in the
-info data.
-
-8. Added support for the \Q...\E escape sequence. Characters in between are
-treated as literals. This is slightly different from Perl in that $ and @ are
-also handled as literals inside the quotes. In Perl, they will cause variable
-interpolation. Note the following examples:
-
- Pattern PCRE matches Perl matches
-
- \Qabc$xyz\E abc$xyz abc followed by the contents of $xyz
- \Qabc\$xyz\E abc\$xyz abc\$xyz
- \Qabc\E\$\Qxyz\E abc$xyz abc$xyz
-
-9. Re-organized 3 code statements in pcretest to avoid "overflow in
-floating-point constant arithmetic" warnings from a Microsoft compiler. Added a
-(size_t) cast to one statement in pcretest and one in pcreposix to avoid
-signed/unsigned warnings.
-
-10. SunOS4 doesn't have strtoul(). This was used only for unpicking the -o
-option for pcretest, so I've replaced it by a simple function that does just
-that job.
-
-11. pcregrep was ending with code 0 instead of 2 for the commands "pcregrep" or
-"pcregrep -".
-
-12. Added "possessive quantifiers" ?+, *+, ++, and {,}+ which come from Sun's
-Java package. This provides some syntactic sugar for simple cases of what my
-documentation calls "once-only subpatterns". A pattern such as x*+ is the
-same as (?>x*). In other words, if what is inside (?>...) is just a single
-repeated item, you can use this simplified notation. Note that this only makes sense
-with greedy quantifiers. Consequently, the use of the possessive quantifier
-forces greediness, whatever the setting of the PCRE_UNGREEDY option.
-
-13. A change of greediness default within a pattern was not taking effect at
-the current level for patterns like /(b+(?U)a+)/. It did apply to parenthesized
-subpatterns that followed. Patterns like /b+(?U)a+/ worked because the option
-was abstracted outside.
-
-14. PCRE now supports the \G assertion. It is true when the current matching
-position is at the start point of the match. This differs from \A when the
-starting offset is non-zero. Used with the /g option of pcretest (or similar
-code), it works in the same way as it does for Perl's /g option.
-
-15. Some bugs concerning the handling of certain option changes within patterns
-have been fixed. These applied to options other than (?ims). For example,
-"a(?x: b c )d" did not match "XabcdY" but did match "Xa b c dY". It should have
-been the other way round. Some of this was related to change 7 above.
-
-16. PCRE now gives errors for /[.x.]/ and /[=x=]/ as unsupported POSIX
-features, as Perl does. Previously, PCRE gave the warnings only for /[[.x.]]/
-and /[[=x=]]/. PCRE now also gives an error for /[:name:]/ because it supports
-POSIX classes only within a class (e.g. /[[:alpha:]]/).
-
-17. Added support for Perl's \C escape. This matches one byte, even in UTF8
-mode. Unlike ".", it always matches newline, whatever the setting of
-PCRE_DOTALL. However, PCRE does not permit \C to appear in lookbehind
-assertions. (Perl allows it, but it doesn't (in general) work because it can't
-calculate the length of the lookbehind. At least, that's the case for Perl
-5.8.0)
-
-18. Added an error diagnosis for escapes that PCRE does not support: these are
-\L, \l, \N, \P, \p, \U, \u, and \X.
-
-19. Although correctly diagnosing a missing ']' in a character class, PCRE was
-reading past the end of the pattern in cases such as /[abcd/.
-
-20. PCRE was getting more memory than necessary for patterns with classes that
-contained both POSIX named classes and other characters, e.g. /[[:space:]abc/.
-
-21. Added some code, conditional on #ifdef VPCOMPAT, to make life easier for
-compiling PCRE for use with Virtual Pascal.
-
-22. Small fix to the Makefile to make it work properly if the build is done
-outside the source tree.
-
-23. Added a new extension: a condition to go with recursion. If a conditional
-subpattern starts with (?(R) the "true" branch is used if recursion has
-happened, whereas the "false" branch is used only at the top level.
-
-24. When there was a very long string of literal characters (over 255 bytes
-without UTF support, over 250 bytes with UTF support), the computation of how
-much memory was required could be incorrect, leading to segfaults or other
-strange effects.
-
-25. PCRE was incorrectly assuming anchoring (either to start of subject or to
-start of line for a non-DOTALL pattern) when a pattern started with (.*) and
-there was a subsequent back reference to those brackets. This meant that, for
-example, /(.*)\d+\1/ failed to match "abc123bc". Unfortunately, it isn't
-possible to check for precisely this case. All we can do is abandon the
-optimization if .* occurs inside capturing brackets when there are any back
-references whatsoever.
-
-26. The handling of the optimization for finding the first character of a
-non-anchored pattern, and for finding a character that is required later in the
-match were failing in some cases. This didn't break the matching; it just
-failed to optimize when it could. The way this is done has been re-implemented.
-
-27. Fixed typo in error message for invalid (?R item (it said "(?p").
-
-28. Added a new feature that provides some of the functionality that Perl
-provides with (?{...}). The facility is termed a "callout". The way it is done
-in PCRE is for the caller to provide an optional function, by setting
-pcre_callout to its entry point. Like pcre_malloc and pcre_free, this is a
-global variable. By default it is unset, which disables all calling out. To get
-the function called, the regex must include (?C) at appropriate points. This
-is, in fact, equivalent to (?C0), and any number <= 255 may be given with (?C).
-This provides a means of identifying different callout points. When PCRE
-reaches such a point in the regex, if pcre_callout has been set, the external
-function is called. It is provided with data in a structure called
-pcre_callout_block, which is defined in pcre.h. If the function returns 0,
-matching continues; if it returns a non-zero value, the match at the current
-point fails. However, backtracking will occur if possible.
-
-29. pcretest is upgraded to test the callout functionality. It provides a
-callout function that displays information. By default, it shows the start of
-the match and the current position in the text. There are some new data escapes
-to vary what happens:
-
- \C+ in addition, show current contents of captured substrings
- \C- do not supply a callout function
- \C!n return 1 when callout number n is reached
- \C!n!m return 1 when callout number n is reached for the mth time
-
-30. If pcregrep was called with the -l option and just a single file name, it
-output "<stdin>" if a match was found, instead of the file name.
-
-31. Improve the efficiency of the POSIX API to PCRE. If the number of capturing
-slots is less than POSIX_MALLOC_THRESHOLD, use a block on the stack to pass to
-pcre_exec(). This saves a malloc/free per call. The default value of
-POSIX_MALLOC_THRESHOLD is 5; it can be changed by --with-posix-malloc-threshold
-when configuring.
-
-32. The default maximum size of a compiled pattern is 64K. There have been a
-few cases of people hitting this limit. The code now uses macros to handle the
-storing of links as offsets within the compiled pattern. It defaults to 2-byte
-links, but this can be changed to 3 or 4 bytes by --with-link-size when
-configuring. Tests 2 and 5 work only with 2-byte links because they output
-debugging information about compiled patterns.
-
-33. Internal code re-arrangements:
-
- (a) Moved the debugging function for printing out a compiled regex into
- its own source file (printint.c) and used #include to pull it into
- pcretest.c and, when DEBUG is defined, into pcre.c, instead of having
- two separate copies.
-
- (b) Defined the list of op-code names for debugging as a macro in
- internal.h so that it is next to the definition of the opcodes.
-
- (c) Defined a table of op-code lengths for simpler skipping along compiled
- code. This is again a macro in internal.h so that it is next to the
- definition of the opcodes.
-
-34. Added support for recursive calls to individual subpatterns, along the
- lines of Robin Houston's patch (but implemented somewhat differently).
-
-35. Further mods to the Makefile to help Win32. Also, added code to pcregrep
- to allow it to read and process whole directories in Win32. This code was
- contributed by Lionel Fourquaux; it has not been tested by me.
-
-36. Added support for named subpatterns. The Python syntax (?P<name>...) is
- used to name a group. Names consist of alphanumerics and underscores, and
- must be unique. Back references use the syntax (?P=name) and recursive
- calls use (?P>name) which is a PCRE extension to the Python extension.
- Groups still have numbers. The function pcre_fullinfo() can be used after
- compilation to extract a name/number map. There are three relevant calls:
-
- PCRE_INFO_NAMEENTRYSIZE yields the size of each entry in the map
- PCRE_INFO_NAMECOUNT yields the number of entries
- PCRE_INFO_NAMETABLE yields a pointer to the map.
-
- The map is a vector of fixed-size entries. The size of each entry depends
- on the length of the longest name used. The first two bytes of each entry
- are the group number, most significant byte first. There follows the
- corresponding name, zero terminated. The names are in alphabetical order.
-
-
-Version 3.9 02-Jan-02
----------------------
-
-1. A bit of extraneous text had somehow crept into the pcregrep documentation.
-
-2. If --disable-static was given, the building process failed when trying to
-build pcretest and pcregrep. (For some reason it was using libtool to compile
-them, which is not right, as they aren't part of the library.)
-
-
-Version 3.8 18-Dec-01
----------------------
-
-1. The experimental UTF-8 code was completely screwed up. It was packing the
-bytes in the wrong order. How dumb can you get?
-
-
-Version 3.7 29-Oct-01
----------------------
-
-1. In updating pcretest to check change 1 of version 3.6, I screwed up.
-This caused pcretest, when used on the test data, to segfault. Unfortunately,
-this didn't happen under Solaris 8, where I normally test things.
-
-2. The Makefile had to be changed to make it work on BSD systems, where 'make'
-doesn't seem to recognize that ./xxx and xxx are the same file. (This entry
-isn't in ChangeLog distributed with 3.7 because I forgot when I hastily made
-this fix an hour or so after the initial 3.7 release.)
-
-
-Version 3.6 23-Oct-01
----------------------
-
-1. Crashed with /(sens|respons)e and \1ibility/ and "sense and sensibility" if
-offsets passed as NULL with zero offset count.
-
-2. The config.guess and config.sub files had not been updated when I moved to
-the latest autoconf.
-
-
-Version 3.5 15-Aug-01
----------------------
-
-1. Added some missing #if !defined NOPOSIX conditionals in pcretest.c that
-had been forgotten.
-
-2. By using declared but undefined structures, we can avoid using "void"
-definitions in pcre.h while keeping the internal definitions of the structures
-private.
-
-3. The distribution is now built using autoconf 2.50 and libtool 1.4. From a
-user point of view, this means that both static and shared libraries are built
-by default, but this can be individually controlled. More of the work of
-handling these static/shared cases is now inside libtool instead of PCRE's make
-file.
-
-4. The pcretest utility is now installed along with pcregrep because it is
-useful for users (to test regexs) and by doing this, it automatically gets
-relinked by libtool. The documentation has been turned into a man page, so
-there are now .1, .txt, and .html versions in /doc.
-
-5. Upgrades to pcregrep:
- (i) Added long-form option names like gnu grep.
- (ii) Added --help to list all options with an explanatory phrase.
- (iii) Added -r, --recursive to recurse into sub-directories.
- (iv) Added -f, --file to read patterns from a file.
-
-6. pcre_exec() was referring to its "code" argument before testing that
-argument for NULL (and giving an error if it was NULL).
-
-7. Upgraded Makefile.in to allow for compiling in a different directory from
-the source directory.
-
-8. Tiny buglet in pcretest: when pcre_fullinfo() was called to retrieve the
-options bits, the pointer it was passed was to an int instead of to an unsigned
-long int. This mattered only on 64-bit systems.
-
-9. Fixed typo (3.4/1) in pcre.h again. Sigh. I had changed pcre.h (which is
-generated) instead of pcre.in, which is its source. Also made the same change
-in several of the .c files.
-
-10. A new release of gcc defines printf() as a macro, which broke pcretest
-because it had an ifdef in the middle of a string argument for printf(). Fixed
-by using separate calls to printf().
-
-11. Added --enable-newline-is-cr and --enable-newline-is-lf to the configure
-script, to force use of CR or LF instead of \n in the source. On non-Unix
-systems, the value can be set in config.h.
-
-12. The limit of 200 on non-capturing parentheses is a _nesting_ limit, not an
-absolute limit. Changed the text of the error message to make this clear, and
-likewise updated the man page.
-
-13. The limit of 99 on the number of capturing subpatterns has been removed.
-The new limit is 65535, which I hope will not be a "real" limit.
-
-
-Version 3.4 22-Aug-00
----------------------
-
-1. Fixed typo in pcre.h: unsigned const char * changed to const unsigned char *.
-
-2. Diagnose condition (?(0) as an error instead of crashing on matching.
-
-
-Version 3.3 01-Aug-00
----------------------
-
-1. If an octal character was given, but the value was greater than \377, it
-was not getting masked to the least significant bits, as documented. This could
-lead to crashes in some systems.
-
-2. Perl 5.6 (if not earlier versions) accepts classes like [a-\d] and treats
-the hyphen as a literal. PCRE used to give an error; it now behaves like Perl.
-
-3. Added the functions pcre_free_substring() and pcre_free_substring_list().
-These just pass their arguments on to (pcre_free)(), but they are provided
-because some uses of PCRE bind it to non-C systems that can call its functions,
-but cannot call free() or pcre_free() directly.
-
-4. Add "make test" as a synonym for "make check". Corrected some comments in
-the Makefile.
-
-5. Add $(DESTDIR)/ in front of all the paths in the "install" target in the
-Makefile.
-
-6. Changed the name of pgrep to pcregrep, because Solaris has introduced a
-command called pgrep for grepping around the active processes.
-
-7. Added the beginnings of support for UTF-8 character strings.
-
-8. Arranged for the Makefile to pass over the settings of CC, CFLAGS, and
-RANLIB to ./ltconfig so that they are used by libtool. I think these are all
-the relevant ones. (AR is not passed because ./ltconfig does its own figuring
-out for the ar command.)
-
-
-Version 3.2 12-May-00
----------------------
-
-This is purely a bug fixing release.
-
-1. If the pattern /((Z)+|A)*/ was matched against ZABCDEFG it matched Z instead
-of ZA. This was just one example of several cases that could provoke this bug,
-which was introduced by change 9 of version 2.00. The code for breaking
-infinite loops after an iteration that matches an empty string wasn't working
-correctly.
-
-2. The pcretest program was not imitating Perl correctly for the pattern /a*/g
-when matched against abbab (for example). After matching an empty string, it
-wasn't forcing anchoring when setting PCRE_NOTEMPTY for the next attempt; this
-caused it to match further down the string than it should.
-
-3. The code contained an inclusion of sys/types.h. It isn't clear why this
-was there because it doesn't seem to be needed, and it causes trouble on some
-systems, as it is not a Standard C header. It has been removed.
-
-4. Made 4 silly changes to the source to avoid stupid compiler warnings that
-were reported on the Macintosh. The changes were from
-
- while ((c = *(++ptr)) != 0 && c != '\n');
-to
- while ((c = *(++ptr)) != 0 && c != '\n') ;
-
-Totally extraordinary, but if that's what it takes...
-
-5. PCRE is being used in one environment where neither memmove() nor bcopy() is
-available. Added HAVE_BCOPY and an autoconf test for it; if neither
-HAVE_MEMMOVE nor HAVE_BCOPY is set, use a built-in emulation function which
-assumes the way PCRE uses memmove() (always moving upwards).
-
-6. PCRE is being used in one environment where strchr() is not available. There
-was only one use in pcre.c, and writing it out to avoid strchr() probably gives
-faster code anyway.
-
-
-Version 3.1 09-Feb-00
----------------------
-
-The only change in this release is the fixing of some bugs in Makefile.in for
-the "install" target:
-
-(1) It was failing to install pcreposix.h.
-
-(2) It was overwriting the pcre.3 man page with the pcreposix.3 man page.
-
-
-Version 3.0 01-Feb-00
----------------------
-
-1. Add support for the /+ modifier to perltest (to output $` like it does in
-pcretest).
-
-2. Add support for the /g modifier to perltest.
-
-3. Fix pcretest so that it behaves even more like Perl for /g when the pattern
-matches null strings.
-
-4. Fix perltest so that it doesn't do unwanted things when fed an empty
-pattern. Perl treats empty patterns specially - it reuses the most recent
-pattern, which is not what we want. Replace // by /(?#)/ in order to avoid this
-effect.
-
-5. The POSIX interface was broken in that it was just handing over the POSIX
-captured string vector to pcre_exec(), but (since release 2.00) PCRE has
-required a bigger vector, with some working space on the end. This means that
-the POSIX wrapper now has to get and free some memory, and copy the results.
-
-6. Added some simple autoconf support, placing the test data and the
-documentation in separate directories, re-organizing some of the
-information files, and making it build pcre-config (a GNU standard). Also added
-libtool support for building PCRE as a shared library, which is now the
-default.
-
-7. Got rid of the leading zero in the definition of PCRE_MINOR because 08 and
-09 are not valid octal constants. Single digits will be used for minor values
-less than 10.
-
-8. Defined REG_EXTENDED and REG_NOSUB as zero in the POSIX header, so that
-existing programs that set these in the POSIX interface can use PCRE without
-modification.
-
-9. Added a new function, pcre_fullinfo() with an extensible interface. It can
-return all that pcre_info() returns, plus additional data. The pcre_info()
-function is retained for compatibility, but is considered to be obsolete.
-
-10. Added experimental recursion feature (?R) to handle one common case that
-Perl 5.6 will be able to do with (?p{...}).
-
-11. Added support for POSIX character classes like [:alpha:], which Perl is
-adopting.
-
-
-Version 2.08 31-Aug-99
-----------------------
-
-1. When startoffset was not zero and the pattern began with ".*", PCRE was not
-trying to match at the startoffset position, but instead was moving forward to
-the next newline as if a previous match had failed.
-
-2. pcretest was not making use of PCRE_NOTEMPTY when repeating for /g and /G,
-and could get into a loop if a null string was matched other than at the start
-of the subject.
-
-3. Added definitions of PCRE_MAJOR and PCRE_MINOR to pcre.h so the version can
-be distinguished at compile time, and for completeness also added PCRE_DATE.
-
-5. Added Paul Sokolovsky's minor changes to make it easy to compile a Win32 DLL
-in GnuWin32 environments.
-
-
-Version 2.07 29-Jul-99
-----------------------
-
-1. The documentation is now supplied in plain text form and HTML as well as in
-the form of man page sources.
-
-2. C++ compilers don't like assigning (void *) values to other pointer types.
-In particular this affects malloc(). Although there is no problem in Standard
-C, I've put in casts to keep C++ compilers happy.
-
-3. Typo in pcretest.c; a cast of (unsigned char *) in the POSIX regexec() call
-should be (const char *).
-
-4. If NOPOSIX is defined, pcretest.c compiles without POSIX support. This may
-be useful for non-Unix systems who don't want to bother with the POSIX stuff.
-However, I haven't made this a standard facility. The documentation doesn't
-mention it, and the Makefile doesn't support it.
-
-5. The Makefile now contains an "install" target, with editable destinations at
-the top of the file. The pcretest program is not installed.
-
-6. pgrep -V now gives the PCRE version number and date.
-
-7. Fixed bug: a zero repetition after a literal string (e.g. /abcde{0}/) was
-causing the entire string to be ignored, instead of just the last character.
-
-8. If a pattern like /"([^\\"]+|\\.)*"/ is applied in the normal way to a
-non-matching string, it can take a very, very long time, even for strings of
-quite modest length, because of the nested recursion. PCRE now does better in
-some of these cases. It does this by remembering the last required literal
-character in the pattern, and pre-searching the subject to ensure it is present
-before running the real match. In other words, it applies a heuristic to detect
-some types of certain failure quickly, and in the above example, if presented
-with a string that has no trailing " it gives "no match" very quickly.
-
-9. A new runtime option PCRE_NOTEMPTY causes null string matches to be ignored;
-other alternatives are tried instead.
-
-
-Version 2.06 09-Jun-99
-----------------------
-
-1. Change pcretest's output for amount of store used to show just the code
-space, because the remainder (the data block) varies in size between 32-bit and
-64-bit systems.
-
-2. Added an extra argument to pcre_exec() to supply an offset in the subject to
-start matching at. This allows lookbehinds to work when searching for multiple
-occurrences in a string.
-
-3. Added additional options to pcretest for testing multiple occurrences:
-
- /+ outputs the rest of the string that follows a match
- /g loops for multiple occurrences, using the new startoffset argument
- /G loops for multiple occurrences by passing an incremented pointer
-
-4. PCRE wasn't doing the "first character" optimization for patterns starting
-with \b or \B, though it was doing it for other lookbehind assertions. That is,
-it wasn't noticing that a match for a pattern such as /\bxyz/ has to start with
-the letter 'x'. On long subject strings, this gives a significant speed-up.
-
-
-Version 2.05 21-Apr-99
-----------------------
-
-1. Changed the type of magic_number from int to long int so that it works
-properly on 16-bit systems.
-
-2. Fixed a bug which caused patterns starting with .* not to work correctly
-when the subject string contained newline characters. PCRE was assuming
-anchoring for such patterns in all cases, which is not correct because .* will
-not pass a newline unless PCRE_DOTALL is set. It now assumes anchoring only if
-DOTALL is set at top level; otherwise it knows that patterns starting with .*
-must be retried after every newline in the subject.
-
-
-Version 2.04 18-Feb-99
-----------------------
-
-1. For parenthesized subpatterns with repeats whose minimum was zero, the
-computation of the store needed to hold the pattern was incorrect (too large).
-If such patterns were nested a few deep, this could multiply and become a real
-problem.
-
-2. Added /M option to pcretest to show the memory requirement of a specific
-pattern. Made -m a synonym of -s (which does this globally) for compatibility.
-
-3. Subpatterns of the form (regex){n,m} (i.e. limited maximum) were being
-compiled in such a way that the backtracking after subsequent failure was
-pessimal. Something like (a){0,3} was compiled as (a)?(a)?(a)? instead of
-((a)((a)(a)?)?)? with disastrous performance if the maximum was of any size.
-
-
-Version 2.03 02-Feb-99
-----------------------
-
-1. Fixed typo and small mistake in man page.
-
-2. Added 4th condition (GPL supersedes if conflict) and created separate
-LICENCE file containing the conditions.
-
-3. Updated pcretest so that patterns such as /abc\/def/ work like they do in
-Perl, that is the internal \ allows the delimiter to be included in the
-pattern. Locked out the use of \ as a delimiter. If \ immediately follows
-the final delimiter, add \ to the end of the pattern (to test the error).
-
-4. Added the convenience functions for extracting substrings after a successful
-match. Updated pcretest to make it able to test these functions.
-
-
-Version 2.02 14-Jan-99
-----------------------
-
-1. Initialized the working variables associated with each extraction so that
-their saving and restoring doesn't refer to uninitialized store.
-
-2. Put dummy code into study.c in order to trick the optimizer of the IBM C
-compiler for OS/2 into generating correct code. Apparently IBM isn't going to
-fix the problem.
-
-3. Pcretest: the timing code wasn't using LOOPREPEAT for timing execution
-calls, and wasn't printing the correct value for compiling calls. Increased the
-default value of LOOPREPEAT, and the number of significant figures in the
-times.
-
-4. Changed "/bin/rm" in the Makefile to "-rm" so it works on Windows NT.
-
-5. Renamed "deftables" as "dftables" to get it down to 8 characters, to avoid
-a building problem on Windows NT with a FAT file system.
-
-
-Version 2.01 21-Oct-98
-----------------------
-
-1. Changed the API for pcre_compile() to allow for the provision of a pointer
-to character tables built by pcre_maketables() in the current locale. If NULL
-is passed, the default tables are used.
-
-
-Version 2.00 24-Sep-98
-----------------------
-
-1. Since the (?>) facility is in Perl 5.005, don't require PCRE_EXTRA to enable
-it any more.
-
-2. Allow quantification of (?>) groups, and make it work correctly.
-
-3. The first character computation wasn't working for (?>) groups.
-
-4. Correct the implementation of \Z (it is permitted to match on the \n at the
-end of the subject) and add 5.005's \z, which really does match only at the
-very end of the subject.
-
-5. Remove the \X "cut" facility; Perl doesn't have it, and (?> is neater.
-
-6. Remove the ability to specify CASELESS, MULTILINE, DOTALL, and
-DOLLAR_END_ONLY at runtime, to make it possible to implement the Perl 5.005
-localized options. All options to pcre_study() were also removed.
-
-7. Add other new features from 5.005:
-
- $(?<= positive lookbehind
- $(?<! negative lookbehind
- (?imsx-imsx) added the unsetting capability
- such a setting is global if at outer level; local otherwise
- (?imsx-imsx:) non-capturing groups with option setting
- (?(cond)re|re) conditional pattern matching
-
- A backreference to itself in a repeated group matches the previous
- captured string.
-
-8. General tidying up of studying (both automatic and via "study")
-consequential on the addition of new assertions.
-
-9. As in 5.005, unlimited repeated groups that could match an empty substring
-are no longer faulted at compile time. Instead, the loop is forcibly broken at
-runtime if any iteration does actually match an empty substring.
-
-10. Include the RunTest script in the distribution.
-
-11. Added tests from the Perl 5.005_02 distribution. This showed up a few
-discrepancies, some of which were old and were also with respect to 5.004. They
-have now been fixed.
-
-
-Version 1.09 28-Apr-98
-----------------------
-
-1. A negated single character class followed by a quantifier with a minimum
-value of one (e.g. [^x]{1,6} ) was not compiled correctly. This could lead to
-program crashes, or just wrong answers. This did not apply to negated classes
-containing more than one character, or to minima other than one.
-
-
-Version 1.08 27-Mar-98
-----------------------
-
-1. Add PCRE_UNGREEDY to invert the greediness of quantifiers.
-
-2. Add (?U) and (?X) to set PCRE_UNGREEDY and PCRE_EXTRA respectively. The
-latter must appear before anything that relies on it in the pattern.
-
-
-Version 1.07 16-Feb-98
-----------------------
-
-1. A pattern such as /((a)*)*/ was not being diagnosed as in error (unlimited
-repeat of a potentially empty string).
-
-
-Version 1.06 23-Jan-98
-----------------------
-
-1. Added Markus Oberhumer's little patches for C++.
-
-2. Literal strings longer than 255 characters were broken.
-
-
-Version 1.05 23-Dec-97
-----------------------
-
-1. Negated character classes containing more than one character were failing if
-PCRE_CASELESS was set at run time.
-
-
-Version 1.04 19-Dec-97
-----------------------
-
-1. Corrected the man page, where some "const" qualifiers had been omitted.
-
-2. Made debugging output print "{0,xxx}" instead of just "{,xxx}" to agree with
-input syntax.
-
-3. Fixed memory leak which occurred when a regex with back references was
-matched with an offsets vector that wasn't big enough. The temporary memory
-that is used in this case wasn't being freed if the match failed.
-
-4. Tidied pcretest to ensure it frees memory that it gets.
-
-5. Temporary memory was being obtained in the case where the passed offsets
-vector was exactly big enough.
-
-6. Corrected definition of offsetof() from change 5 below.
-
-7. I had screwed up change 6 below and broken the rules for the use of
-setjmp(). Now fixed.
-
-
-Version 1.03 18-Dec-97
-----------------------
-
-1. An erroneous regex with a missing opening parenthesis was correctly
-diagnosed, but PCRE attempted to access brastack[-1], which could cause crashes
-on some systems.
-
-2. Replaced offsetof(real_pcre, code) by offsetof(real_pcre, code[0]) because
-it was reported that one broken compiler failed on the former because "code" is
-also an independent variable.
-
-3. The erroneous regex a[]b caused an array overrun reference.
-
-4. A regex ending with a one-character negative class (e.g. /[^k]$/) did not
-fail on data ending with that character. (It was going on too far, and checking
-the next character, typically a binary zero.) This was specific to the
-optimized code for single-character negative classes.
-
-5. Added a contributed patch from the TIN world which does the following:
-
- + Add an undef for memmove, in case the system defines a macro for it.
-
- + Add a definition of offsetof(), in case there isn't one. (I don't know
- the reason behind this - offsetof() is part of the ANSI standard - but
- it does no harm).
-
- + Reduce the ifdef's in pcre.c using macro DPRINTF, thereby eliminating
- most of the places where whitespace preceded '#'. I have given up and
- allowed the remaining 2 cases to be at the margin.
-
- + Rename some variables in pcre to eliminate shadowing. This seems very
- pedantic, but does no harm, of course.
-
-6. Moved the call to setjmp() into its own function, to get rid of warnings
-from gcc -Wall, and avoided calling it at all unless PCRE_EXTRA is used.
-
-7. Constructs such as \d{8,} were compiling into the equivalent of
-\d{8}\d{0,65527} instead of \d{8}\d* which didn't make much difference to the
-outcome, but in this particular case used more store than had been allocated,
-which caused the bug to be discovered because it threw up an internal error.
-
-8. The debugging code in both pcre and pcretest for outputting the compiled
-form of a regex was going wrong in the case of back references followed by
-curly-bracketed repeats.
-
-
-Version 1.02 12-Dec-97
-----------------------
-
-1. Typos in pcre.3 and comments in the source fixed.
-
-2. Applied a contributed patch to get rid of places where it used to remove
-'const' from variables, and fixed some signed/unsigned and uninitialized
-variable warnings.
-
-3. Added the "runtest" target to Makefile.
-
-4. Set default compiler flag to -O2 rather than just -O.
-
-
-Version 1.01 19-Nov-97
-----------------------
-
-1. PCRE was failing to diagnose unlimited repeat of empty string for patterns
-like /([ab]*)*/, that is, for classes with more than one character in them.
-
-2. Likewise, it wasn't diagnosing patterns with "once-only" subpatterns, such
-as /((?>a*))*/ (a PCRE_EXTRA facility).
-
-
-Version 1.00 18-Nov-97
-----------------------
-
-1. Added compile-time macros to support systems such as SunOS4 which don't have
-memmove() or strerror() but have other things that can be used instead.
-
-2. Arranged that "make clean" removes the executables.
-
-
-Version 0.99 27-Oct-97
-----------------------
-
-1. Fixed bug in code for optimizing classes with only one character. It was
-initializing a 32-byte map regardless, which could cause it to run off the end
-of the memory it had got.
-
-2. Added, conditional on PCRE_EXTRA, the proposed (?>REGEX) construction.
-
-
-Version 0.98 22-Oct-97
-----------------------
-
-1. Fixed bug in code for handling temporary memory usage when there are more
-back references than supplied space in the ovector. This could cause segfaults.
-
-
-Version 0.97 21-Oct-97
-----------------------
-
-1. Added the \X "cut" facility, conditional on PCRE_EXTRA.
-
-2. Optimized negated single characters not to use a bit map.
-
-3. Brought error texts together as macro definitions; clarified some of them;
-fixed one that was wrong - it said "range out of order" when it meant "invalid
-escape sequence".
-
-4. Changed some char * arguments to const char *.
-
-5. Added PCRE_NOTBOL and PCRE_NOTEOL (from POSIX).
-
-6. Added the POSIX-style API wrapper in pcreposix.a and testing facilities in
-pcretest.
-
-
-Version 0.96 16-Oct-97
-----------------------
-
-1. Added a simple "pgrep" utility to the distribution.
-
-2. Fixed an incompatibility with Perl: "{" is now treated as a normal character
-unless it appears in one of the precise forms "{ddd}", "{ddd,}", or "{ddd,ddd}"
-where "ddd" means "one or more decimal digits".
-
-3. Fixed serious bug. If a pattern had a back reference, but the call to
-pcre_exec() didn't supply a large enough ovector to record the related
-identifying subpattern, the match always failed. PCRE now remembers the number
-of the largest back reference, and gets some temporary memory in which to save
-the offsets during matching if necessary, in order to ensure that
-backreferences always work.
-
-4. Increased the compatibility with Perl in a number of ways:
-
- (a) . no longer matches \n by default; an option PCRE_DOTALL is provided
- to request this handling. The option can be set at compile or exec time.
-
- (b) $ matches before a terminating newline by default; an option
- PCRE_DOLLAR_ENDONLY is provided to override this (but not in multiline
- mode). The option can be set at compile or exec time.
-
- (c) The handling of \ followed by a digit other than 0 is now supposed to be
- the same as Perl's. If the decimal number it represents is less than 10
- or there aren't that many previous left capturing parentheses, an octal
- escape is read. Inside a character class, it's always an octal escape,
- even if it is a single digit.
-
- (d) An escaped but undefined alphabetic character is taken as a literal,
- unless PCRE_EXTRA is set. Currently this just reserves the remaining
- escapes.
-
- (e) {0} is now permitted. (The previous item is removed from the compiled
- pattern).
-
-5. Changed all the names of code files so that the basic parts are no longer
-than 10 characters, and abolished the teeny "globals.c" file.
-
-6. Changed the handling of character classes; they are now done with a 32-byte
-bit map always.
-
-7. Added the -d and /D options to pcretest to make it possible to look at the
-internals of compilation without having to recompile pcre.
-
-
-Version 0.95 23-Sep-97
-----------------------
-
-1. Fixed bug in pre-pass concerning escaped "normal" characters such as \x5c or
-\x20 at the start of a run of normal characters. These were being treated as
-real characters, instead of the source characters being re-checked.
-
-
-Version 0.94 18-Sep-97
-----------------------
-
-1. The functions are now thread-safe, with the caveat that the global variables
-containing pointers to malloc() and free() or alternative functions are the
-same for all threads.
-
-2. Get pcre_study() to generate a bitmap of initial characters for non-
-anchored patterns when this is possible, and use it if passed to pcre_exec().
-
-
-Version 0.93 15-Sep-97
-----------------------
-
-1. /(b)|(:+)/ was computing an incorrect first character.
-
-2. Add pcre_study() to the API and the passing of pcre_extra to pcre_exec(),
-but not actually doing anything yet.
-
-3. Treat "-" characters in classes that cannot be part of ranges as literals,
-as Perl does (e.g. [-az] or [az-]).
-
-4. Set the anchored flag if a branch starts with .* or .*? because that tests
-all possible positions.
-
-5. Split up into different modules to avoid including unneeded functions in a
-compiled binary. However, compile and exec are still in one module. The "study"
-function is split off.
-
-6. The character tables are now in a separate module whose source is generated
-by an auxiliary program - but can then be edited by hand if required. There are
-now no calls to isalnum(), isspace(), isdigit(), isxdigit(), tolower() or
-toupper() in the code.
-
-7. Turn the malloc/free function variables into pcre_malloc and pcre_free and
-make them global. Abolish the function for setting them, as the caller can now
-set them directly.
-
-
-Version 0.92 11-Sep-97
-----------------------
-
-1. A repeat with a fixed maximum and a minimum of 1 for an ordinary character
-(e.g. /a{1,3}/) was broken (I mis-optimized it).
-
-2. Caseless matching was not working in character classes if the characters in
-the pattern were in upper case.
-
-3. Make ranges like [W-c] work in the same way as Perl for caseless matching.
-
-4. Make PCRE_ANCHORED public and accept as a compile option.
-
-5. Add an options word to pcre_exec() and accept PCRE_ANCHORED and
-PCRE_CASELESS at run time. Add escapes \A and \I to pcretest to cause it to
-pass them.
-
-6. Give an error if bad option bits passed at compile or run time.
-
-7. Add PCRE_MULTILINE at compile and exec time, and (?m) as well. Add \M to
-pcretest to cause it to pass that flag.
-
-8. Add pcre_info(), to get the number of identifying subpatterns, the stored
-options, and the first character, if set.
-
-9. Recognize C+ or C{n,m} where n >= 1 as providing a fixed starting character.
-
-
-Version 0.91 10-Sep-97
-----------------------
-
-1. PCRE was failing to diagnose unlimited repeats of subpatterns that could
-match the empty string as in /(a*)*/. It was looping and ultimately crashing.
-
-2. PCRE was looping on encountering an indefinitely repeated back reference to
-a subpattern that had matched an empty string, e.g. /(a|)\1*/. It now does what
-Perl does - treats the match as successful.
-
-****
diff --git a/ext/pcre/pcrelib/INSTALL b/ext/pcre/pcrelib/INSTALL
deleted file mode 100644
index 08802812de..0000000000
--- a/ext/pcre/pcrelib/INSTALL
+++ /dev/null
@@ -1,185 +0,0 @@
-Basic Installation
-==================
-
- These are generic installation instructions that apply to systems that
-can run the `configure' shell script - Unix systems and any that imitate
-it. They are not specific to PCRE. There are PCRE-specific instructions
-for non-Unix systems in the file NON-UNIX-USE.
-
- The `configure' shell script attempts to guess correct values for
-various system-dependent variables used during compilation. It uses
-those values to create a `Makefile' in each directory of the package.
-It may also create one or more `.h' files containing system-dependent
-definitions. Finally, it creates a shell script `config.status' that
-you can run in the future to recreate the current configuration, a file
-`config.cache' that saves the results of its tests to speed up
-reconfiguring, and a file `config.log' containing compiler output
-(useful mainly for debugging `configure').
-
- If you need to do unusual things to compile the package, please try
-to figure out how `configure' could check whether to do them, and mail
-diffs or instructions to the address given in the `README' so they can
-be considered for the next release. If at some point `config.cache'
-contains results you don't want to keep, you may remove or edit it.
-
- The file `configure.in' is used to create `configure' by a program
-called `autoconf'. You only need `configure.in' if you want to change
-it or regenerate `configure' using a newer version of `autoconf'.
-
-The simplest way to compile this package is:
-
- 1. `cd' to the directory containing the package's source code and type
- `./configure' to configure the package for your system. If you're
- using `csh' on an old version of System V, you might need to type
- `sh ./configure' instead to prevent `csh' from trying to execute
- `configure' itself.
-
- Running `configure' takes a while. While running, it prints some
- messages telling which features it is checking for.
-
- 2. Type `make' to compile the package.
-
- 3. Optionally, type `make check' to run any self-tests that come with
- the package.
-
- 4. Type `make install' to install the programs and any data files and
- documentation.
-
- 5. You can remove the program binaries and object files from the
- source code directory by typing `make clean'. To also remove the
- files that `configure' created (so you can compile the package for
- a different kind of computer), type `make distclean'. There is
- also a `make maintainer-clean' target, but that is intended mainly
- for the package's developers. If you use it, you may have to get
- all sorts of other programs in order to regenerate files that came
- with the distribution.
-
-Compilers and Options
-=====================
-
- Some systems require unusual options for compilation or linking that
-the `configure' script does not know about. You can give `configure'
-initial values for variables by setting them in the environment. Using
-a Bourne-compatible shell, you can do that on the command line like
-this:
- CC=c89 CFLAGS=-O2 LIBS=-lposix ./configure
-
-Or on systems that have the `env' program, you can do it like this:
- env CPPFLAGS=-I/usr/local/include LDFLAGS=-s ./configure
-
-Compiling For Multiple Architectures
-====================================
-
- You can compile the package for more than one kind of computer at the
-same time, by placing the object files for each architecture in their
-own directory. To do this, you must use a version of `make' that
-supports the `VPATH' variable, such as GNU `make'. `cd' to the
-directory where you want the object files and executables to go and run
-the `configure' script. `configure' automatically checks for the
-source code in the directory that `configure' is in and in `..'.
-
- If you have to use a `make' that does not support the `VPATH'
-variable, you have to compile the package for one architecture at a time
-in the source code directory. After you have installed the package for
-one architecture, use `make distclean' before reconfiguring for another
-architecture.
-
-Installation Names
-==================
-
- By default, `make install' will install the package's files in
-`/usr/local/bin', `/usr/local/man', etc. You can specify an
-installation prefix other than `/usr/local' by giving `configure' the
-option `--prefix=PATH'.
-
- You can specify separate installation prefixes for
-architecture-specific files and architecture-independent files. If you
-give `configure' the option `--exec-prefix=PATH', the package will use
-PATH as the prefix for installing programs and libraries.
-Documentation and other data files will still use the regular prefix.
-
- In addition, if you use an unusual directory layout you can give
-options like `--bindir=PATH' to specify different values for particular
-kinds of files. Run `configure --help' for a list of the directories
-you can set and what kinds of files go in them.
-
- If the package supports it, you can cause programs to be installed
-with an extra prefix or suffix on their names by giving `configure' the
-option `--program-prefix=PREFIX' or `--program-suffix=SUFFIX'.
-
-Optional Features
-=================
-
- Some packages pay attention to `--enable-FEATURE' options to
-`configure', where FEATURE indicates an optional part of the package.
-They may also pay attention to `--with-PACKAGE' options, where PACKAGE
-is something like `gnu-as' or `x' (for the X Window System). The
-`README' should mention any `--enable-' and `--with-' options that the
-package recognizes.
-
- For packages that use the X Window System, `configure' can usually
-find the X include and library files automatically, but if it doesn't,
-you can use the `configure' options `--x-includes=DIR' and
-`--x-libraries=DIR' to specify their locations.
-
-Specifying the System Type
-==========================
-
- There may be some features `configure' can not figure out
-automatically, but needs to determine by the type of host the package
-will run on. Usually `configure' can figure that out, but if it prints
-a message saying it can not guess the host type, give it the
-`--host=TYPE' option. TYPE can either be a short name for the system
-type, such as `sun4', or a canonical name with three fields:
- CPU-COMPANY-SYSTEM
-
-See the file `config.sub' for the possible values of each field. If
-`config.sub' isn't included in this package, then this package doesn't
-need to know the host type.
-
- If you are building compiler tools for cross-compiling, you can also
-use the `--target=TYPE' option to select the type of system they will
-produce code for and the `--build=TYPE' option to select the type of
-system on which you are compiling the package.
-
-Sharing Defaults
-================
-
- If you want to set default values for `configure' scripts to share,
-you can create a site shell script called `config.site' that gives
-default values for variables like `CC', `cache_file', and `prefix'.
-`configure' looks for `PREFIX/share/config.site' if it exists, then
-`PREFIX/etc/config.site' if it exists. Or, you can set the
-`CONFIG_SITE' environment variable to the location of the site script.
-A warning: not all `configure' scripts look for a site script.
-
-Operation Controls
-==================
-
- `configure' recognizes the following options to control how it
-operates.
-
-`--cache-file=FILE'
- Use and save the results of the tests in FILE instead of
- `./config.cache'. Set FILE to `/dev/null' to disable caching, for
- debugging `configure'.
-
-`--help'
- Print a summary of the options to `configure', and exit.
-
-`--quiet'
-`--silent'
-`-q'
- Do not print messages saying which checks are being made. To
- suppress all normal output, redirect it to `/dev/null' (any error
- messages will still be shown).
-
-`--srcdir=DIR'
- Look for the package's source code in directory DIR. Usually
- `configure' can determine that directory automatically.
-
-`--version'
- Print the version of Autoconf used to generate the `configure'
- script, and exit.
-
-`configure' also accepts some other, not widely useful, options.
diff --git a/ext/pcre/pcrelib/LICENCE b/ext/pcre/pcrelib/LICENCE
deleted file mode 100644
index 8effa66492..0000000000
--- a/ext/pcre/pcrelib/LICENCE
+++ /dev/null
@@ -1,50 +0,0 @@
-PCRE LICENCE
-------------
-
-PCRE is a library of functions to support regular expressions whose syntax
-and semantics are as close as possible to those of the Perl 5 language.
-
-Written by: Philip Hazel <ph10@cam.ac.uk>
-
-University of Cambridge Computing Service,
-Cambridge, England. Phone: +44 1223 334714.
-
-Copyright (c) 1997-2001 University of Cambridge
-
-Permission is granted to anyone to use this software for any purpose on any
-computer system, and to redistribute it freely, subject to the following
-restrictions:
-
-1. This software is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-
-2. The origin of this software must not be misrepresented, either by
- explicit claim or by omission. In practice, this means that if you use
- PCRE in software which you distribute to others, commercially or
- otherwise, you must put a sentence like this
-
- Regular expression support is provided by the PCRE library package,
- which is open source software, written by Philip Hazel, and copyright
- by the University of Cambridge, England.
-
- somewhere reasonably visible in your documentation and in any relevant
- files or online help data or similar. A reference to the ftp site for
- the source, that is, to
-
- ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/
-
- should also be given in the documentation.
-
-3. Altered versions must be plainly marked as such, and must not be
- misrepresented as being the original software.
-
-4. If PCRE is embedded in any software that is released under the GNU
- General Purpose Licence (GPL), or Lesser General Purpose Licence (LGPL),
- then the terms of that licence shall supersede any condition above with
- which it is incompatible.
-
-The documentation for PCRE, supplied in the "doc" directory, is distributed
-under the same terms as the software itself.
-
-End
diff --git a/ext/pcre/pcrelib/NEWS b/ext/pcre/pcrelib/NEWS
deleted file mode 100644
index 27866b68a2..0000000000
--- a/ext/pcre/pcrelib/NEWS
+++ /dev/null
@@ -1,85 +0,0 @@
-News about PCRE releases
-------------------------
-
-Release 3.5 15-Aug-01
----------------------
-
-1. The configuring system has been upgraded to use later versions of autoconf
-and libtool. By default it builds both a shared and a static library if the OS
-supports it. You can use --disable-shared or --disable-static on the configure
-command if you want only one of them.
-
-2. The pcretest utility is now installed along with pcregrep because it is
-useful for users (to test regexs) and by doing this, it automatically gets
-relinked by libtool. The documentation has been turned into a man page, so
-there are now .1, .txt, and .html versions in /doc.
-
-3. Upgrades to pcregrep:
- (i) Added long-form option names like gnu grep.
- (ii) Added --help to list all options with an explanatory phrase.
- (iii) Added -r, --recursive to recurse into sub-directories.
- (iv) Added -f, --file to read patterns from a file.
-
-4. Added --enable-newline-is-cr and --enable-newline-is-lf to the configure
-script, to force use of CR or LF instead of \n in the source. On non-Unix
-systems, the value can be set in config.h.
-
-5. The limit of 200 on non-capturing parentheses is a _nesting_ limit, not an
-absolute limit. Changed the text of the error message to make this clear, and
-likewise updated the man page.
-
-6. The limit of 99 on the number of capturing subpatterns has been removed.
-The new limit is 65535, which I hope will not be a "real" limit.
-
-
-Release 3.3 01-Aug-00
----------------------
-
-There is some support for UTF-8 character strings. This is incomplete and
-experimental. The documentation describes what is and what is not implemented.
-Otherwise, this is just a bug-fixing release.
-
-
-Release 3.0 01-Feb-00
----------------------
-
-1. A "configure" script is now used to configure PCRE for Unix systems. It
-builds a Makefile, a config.h file, and the pcre-config script.
-
-2. PCRE is built as a shared library by default.
-
-3. There is support for POSIX classes such as [:alpha:].
-
-5. There is an experimental recursion feature.
-
-----------------------------------------------------------------------------
- IMPORTANT FOR THOSE UPGRADING FROM VERSIONS BEFORE 2.00
-
-Please note that there has been a change in the API such that a larger
-ovector is required at matching time, to provide some additional workspace.
-The new man page has details. This change was necessary in order to support
-some of the new functionality in Perl 5.005.
-
- IMPORTANT FOR THOSE UPGRADING FROM VERSION 2.00
-
-Another (I hope this is the last!) change has been made to the API for the
-pcre_compile() function. An additional argument has been added to make it
-possible to pass over a pointer to character tables built in the current
-locale by pcre_maketables(). To use the default tables, this new argument
-should be passed as NULL.
-
- IMPORTANT FOR THOSE UPGRADING FROM VERSION 2.05
-
-Yet another (and again I hope this really is the last) change has been made
-to the API for the pcre_exec() function. An additional argument has been
-added to make it possible to start the match other than at the start of the
-subject string. This is important if there are lookbehinds. The new man
-page has the details, but if you just want to convert existing programs, all
-you need to do is to stick in a new fifth argument to pcre_exec(), with a
-value of zero. For example, change
-
- pcre_exec(pattern, extra, subject, length, options, ovec, ovecsize)
-to
- pcre_exec(pattern, extra, subject, length, 0, options, ovec, ovecsize)
-
-****
diff --git a/ext/pcre/pcrelib/NON-UNIX-USE b/ext/pcre/pcrelib/NON-UNIX-USE
deleted file mode 100644
index 8cbad88819..0000000000
--- a/ext/pcre/pcrelib/NON-UNIX-USE
+++ /dev/null
@@ -1,89 +0,0 @@
-Compiling PCRE on non-Unix systems
-----------------------------------
-
-If you want to compile PCRE for a non-Unix system, note that it consists
-entirely of code written in Standard C, and so should compile successfully
-on any machine with a Standard C compiler and library, using normal compiling
-commands to do the following:
-
-(1) Copy or rename the file config.in as config.h, and change the macros that
-define HAVE_STRERROR and HAVE_MEMMOVE to define them as 1 rather than 0.
-Unfortunately, because of the way Unix autoconf works, the default setting has
-to be 0. You may also want to make changes to other macros in config.h. In
-particular, if you want to force a specific value for newline, you can define
-the NEWLINE macro. The default is to use '\n', thereby using whatever value
-your compiler gives to '\n'.
-
-(2) Copy or rename the file pcre.in as pcre.h, and change the macro definitions
-for PCRE_MAJOR, PCRE_MINOR, and PCRE_DATE near its start to the values set in
-configure.in.
-
-(3) Compile dftables.c as a stand-alone program, and then run it with
-the standard output sent to chartables.c. This generates a set of standard
-character tables.
-
-(4) Compile maketables.c, get.c, study.c and pcre.c and link them all
-together into an object library in whichever form your system keeps such
-libraries. This is the pcre library (chartables.c gets included by means of an
-#include directive).
-
-(5) Similarly, compile pcreposix.c and link it as the pcreposix library.
-
-(6) Compile the test program pcretest.c. This needs the functions in the
-pcre and pcreposix libraries when linking.
-
-(7) Run pcretest on the testinput files in the testdata directory, and check
-that the output matches the corresponding testoutput files. You must use the
--i option when checking testinput2.
-
-If you have a system without "configure" but where you can use a Makefile, edit
-Makefile.in to create Makefile, substituting suitable values for the variables
-at the head of the file.
-
-Some help in building a Win32 DLL of PCRE in GnuWin32 environments was
-contributed by Paul Sokolovsky. These environments are Mingw32
-(http://www.xraylith.wisc.edu/~khan/software/gnu-win32/) and CygWin
-(http://sourceware.cygnus.com/cygwin/). Paul comments:
-
- For CygWin, set CFLAGS=-mno-cygwin, and do 'make dll'. You'll get
- pcre.dll (containing pcreposix also), libpcre.dll.a, and dynamically
- linked pgrep and pcretest. If you have /bin/sh, run RunTest (three
- main test go ok, locale not supported).
-
-A script for building PCRE using Borland's C++ compiler for use with VPASCAL
-was contributed by Alexander Tokarev. It is called makevp.bat.
-
-These are some further comments about Win32 builds from Mark Evans:
-
-The documentation for Win32 builds is a bit shy. Under MSVC6 I
-followed their instructions to the letter, but there were still
-some things missing.
-
-(1) Must #define STATIC for entire project if linking statically.
- (I see no reason to use DLLs for code this compact.) This of
- course is a project setting in MSVC under Preprocessor.
-
-(2) Missing some #ifdefs relating to the function pointers
- pcre_malloc and pcre_free. See my solution below. (The stubs
- may not be mandatory but they made me feel better.)
-
-=========================
-#ifdef _WIN32
-#include <malloc.h>
-
-void* malloc_stub(size_t N)
-{ return malloc(N); }
-void free_stub(void* p)
-{ free(p); }
-void *(*pcre_malloc)(size_t) = &malloc_stub;
-void (*pcre_free)(void *) = &free_stub;
-
-#else
-
-void *(*pcre_malloc)(size_t) = malloc;
-void (*pcre_free)(void *) = free;
-
-#endif
-=========================
-
-****
diff --git a/ext/pcre/pcrelib/README b/ext/pcre/pcrelib/README
deleted file mode 100644
index 81cd1c6fa3..0000000000
--- a/ext/pcre/pcrelib/README
+++ /dev/null
@@ -1,312 +0,0 @@
-README file for PCRE (Perl-compatible regular expression library)
------------------------------------------------------------------
-
-The latest release of PCRE is always available from
-
- ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre-xxx.tar.gz
-
-Please read the NEWS file if you are upgrading from a previous release.
-
-PCRE has its own native API, but a set of "wrapper" functions that are based on
-the POSIX API are also supplied in the library libpcreposix. Note that this
-just provides a POSIX calling interface to PCRE: the regular expressions
-themselves still follow Perl syntax and semantics. The header file
-for the POSIX-style functions is called pcreposix.h. The official POSIX name is
-regex.h, but I didn't want to risk possible problems with existing files of
-that name by distributing it that way. To use it with an existing program that
-uses the POSIX API, it will have to be renamed or pointed at by a link.
-
-
-Contributions by users of PCRE
-------------------------------
-
-You can find contributions from PCRE users in the directory
-
- ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/Contrib
-
-where there is also a README file giving brief descriptions of what they are.
-Several of them provide support for compiling PCRE on various flavours of
-Windows systems (I myself do not use Windows). Some are complete in themselves;
-others are pointers to URLs containing relevant files.
-
-
-Building PCRE on a Unix-like system
------------------------------------
-
-To build PCRE on a Unix-like system, first run the "configure" command from the
-PCRE distribution directory, with your current directory set to the directory
-where you want the files to be created. This command is a standard GNU
-"autoconf" configuration script, for which generic instructions are supplied in
-INSTALL.
-
-Most commonly, people build PCRE within its own distribution directory, and in
-this case, on many systems, just running "./configure" is sufficient, but the
-usual methods of changing standard defaults are available. For example,
-
-CFLAGS='-O2 -Wall' ./configure --prefix=/opt/local
-
-specifies that the C compiler should be run with the flags '-O2 -Wall' instead
-of the default, and that "make install" should install PCRE under /opt/local
-instead of the default /usr/local.
-
-If you want to build in a different directory, just run "configure" with that
-directory as current. For example, suppose you have unpacked the PCRE source
-into /source/pcre/pcre-xxx, but you want to build it in /build/pcre/pcre-xxx:
-
-cd /build/pcre/pcre-xxx
-/source/pcre/pcre-xxx/configure
-
-If you want to make use of the experimental, incomplete support for UTF-8
-character strings in PCRE, you must add --enable-utf8 to the "configure"
-command. Without it, the code for handling UTF-8 is not included in the
-library. (Even when included, it still has to be enabled by an option at run
-time.)
-
-The "configure" script builds five files:
-
-. libtool is a script that builds shared and/or static libraries
-. Makefile is built by copying Makefile.in and making substitutions.
-. config.h is built by copying config.in and making substitutions.
-. pcre-config is built by copying pcre-config.in and making substitutions.
-. RunTest is a script for running tests
-
-Once "configure" has run, you can run "make". It builds two libraries called
-libpcre and libpcreposix, a test program called pcretest, and the pcregrep
-command. You can use "make install" to copy these, the public header files
-pcre.h and pcreposix.h, and the man pages to appropriate live directories on
-your system, in the normal way.
-
-Running "make install" also installs the command pcre-config, which can be used
-to recall information about the PCRE configuration and installation. For
-example,
-
- pcre-config --version
-
-prints the version number, and
-
- pcre-config --libs
-
-outputs information about where the library is installed. This command can be
-included in makefiles for programs that use PCRE, saving the programmer from
-having to remember too many details.
-
-There is one esoteric feature that is controlled by "configure". It concerns
-the character value used for "newline", and is something that you probably do
-not want to change on a Unix system. The default is to use whatever value your
-compiler gives to '\n'. By using --enable-newline-is-cr or
---enable-newline-is-lf you can force the value to be CR (13) or LF (10) if you
-really want to.
-
-
-Shared libraries on Unix systems
---------------------------------
-
-The default distribution builds PCRE as two shared libraries and two static
-libraries, as long as the operating system supports shared libraries. Shared
-library support relies on the "libtool" script which is built as part of the
-"configure" process.
-
-The libtool script is used to compile and link both shared and static
-libraries. They are placed in a subdirectory called .libs when they are newly
-built. The programs pcretest and pcregrep are built to use these uninstalled
-libraries (by means of wrapper scripts in the case of shared libraries). When
-you use "make install" to install shared libraries, pcregrep and pcretest are
-automatically re-built to use the newly installed shared libraries before being
-installed themselves. However, the versions left in the source directory still
-use the uninstalled libraries.
-
-To build PCRE using static libraries only you must use --disable-shared when
-configuring it. For example
-
-./configure --prefix=/usr/gnu --disable-shared
-
-Then run "make" in the usual way. Similarly, you can use --disable-static to
-build only shared libraries.
-
-
-Building on non-Unix systems
-----------------------------
-
-For a non-Unix system, read the comments in the file NON-UNIX-USE. PCRE has
-been compiled on Windows systems and on Macintoshes, but I don't know the
-details because I don't use those systems. It should be straightforward to
-build PCRE on any system that has a Standard C compiler, because it uses only
-Standard C functions.
-
-
-Testing PCRE
-------------
-
-To test PCRE on a Unix system, run the RunTest script that is created by the
-configuring process. (This can also be run by "make runtest", "make check", or
-"make test".) For other systems, see the instructions in NON-UNIX-USE.
-
-The script runs the pcretest test program (which is documented in the doc
-directory) on each of the testinput files (in the testdata directory) in turn,
-and compares the output with the contents of the corresponding testoutput file.
-A file called testtry is used to hold the output from pcretest. To run pcretest
-on just one of the test files, give its number as an argument to RunTest, for
-example:
-
- RunTest 2
-
-The first file can also be fed directly into the perltest script to check that
-Perl gives the same results. The only difference you should see is in the first
-few lines, where the Perl version is given instead of the PCRE version.
-
-The second set of tests check pcre_fullinfo(), pcre_info(), pcre_study(),
-pcre_copy_substring(), pcre_get_substring(), pcre_get_substring_list(), error
-detection, and run-time flags that are specific to PCRE, as well as the POSIX
-wrapper API. It also uses the debugging flag to check some of the internals of
-pcre_compile().
-
-If you build PCRE with a locale setting that is not the standard C locale, the
-character tables may be different (see next paragraph). In some cases, this may
-cause failures in the second set of tests. For example, in a locale where the
-isprint() function yields TRUE for characters in the range 128-255, the use of
-[:isascii:] inside a character class defines a different set of characters, and
-this shows up in this test as a difference in the compiled code, which is being
-listed for checking. Where the comparison test output contains [\x00-\x7f] the
-test will contain [\x00-\xff], and similarly in some other cases. This is not a
-bug in PCRE.
-
-The third set of tests checks pcre_maketables(), the facility for building a
-set of character tables for a specific locale and using them instead of the
-default tables. The tests make use of the "fr" (French) locale. Before running
-the test, the script checks for the presence of this locale by running the
-"locale" command. If that command fails, or if it doesn't include "fr" in the
-list of available locales, the third test cannot be run, and a comment is
-output to say why. If running this test produces instances of the error
-
- ** Failed to set locale "fr"
-
-in the comparison output, it means that locale is not available on your system,
-despite being listed by "locale". This does not mean that PCRE is broken.
-
-The fourth test checks the experimental, incomplete UTF-8 support. It is not
-run automatically unless PCRE is built with UTF-8 support. To do this you must
-set --enable-utf8 when running "configure". This file can also be fed directly
-to the perltest script, provided you are running Perl 5.8 or higher. (For Perl
-5.6, a small patch, commented in the script, can be used.)
-
-The fifth and final file tests error handling with UTF-8 encoding, and internal
-UTF-8 features of PCRE that are not relevant to Perl.
-
-
-Character tables
-----------------
-
-PCRE uses four tables for manipulating and identifying characters. The final
-argument of the pcre_compile() function is a pointer to a block of memory
-containing the concatenated tables. A call to pcre_maketables() can be used to
-generate a set of tables in the current locale. If the final argument for
-pcre_compile() is passed as NULL, a set of default tables that is built into
-the binary is used.
-
-The source file called chartables.c contains the default set of tables. This is
-not supplied in the distribution, but is built by the program dftables
-(compiled from dftables.c), which uses the ANSI C character handling functions
-such as isalnum(), isalpha(), isupper(), islower(), etc. to build the table
-sources. This means that the default C locale which is set for your system will
-control the contents of these default tables. You can change the default tables
-by editing chartables.c and then re-building PCRE. If you do this, you should
-probably also edit Makefile to ensure that the file doesn't ever get
-re-generated.
-
-The first two 256-byte tables provide lower casing and case flipping functions,
-respectively. The next table consists of three 32-byte bit maps which identify
-digits, "word" characters, and white space, respectively. These are used when
-building 32-byte bit maps that represent character classes.
-
-The final 256-byte table has bits indicating various character types, as
-follows:
-
- 1 white space character
- 2 letter
- 4 decimal digit
- 8 hexadecimal digit
- 16 alphanumeric or '_'
- 128 regular expression metacharacter or binary zero
-
-You should not alter the set of characters that contain the 128 bit, as that
-will cause PCRE to malfunction.
-
-
-Manifest
---------
-
-The distribution should contain the following files:
-
-(A) The actual source files of the PCRE library functions and their
- headers:
-
- dftables.c auxiliary program for building chartables.c
- get.c )
- maketables.c )
- study.c ) source of
- pcre.c ) the functions
- pcreposix.c )
- pcre.in "source" for the header for the external API; pcre.h
- is built from this by "configure"
- pcreposix.h header for the external POSIX wrapper API
- internal.h header for internal use
- config.in template for config.h, which is built by configure
-
-(B) Auxiliary files:
-
- AUTHORS information about the author of PCRE
- ChangeLog log of changes to the code
- INSTALL generic installation instructions
- LICENCE conditions for the use of PCRE
- COPYING the same, using GNU's standard name
- Makefile.in template for Unix Makefile, which is built by configure
- NEWS important changes in this release
- NON-UNIX-USE notes on building PCRE on non-Unix systems
- README this file
- RunTest.in template for a Unix shell script for running tests
- config.guess ) files used by libtool,
- config.sub ) used only when building a shared library
- configure a configuring shell script (built by autoconf)
- configure.in the autoconf input used to build configure
- doc/Tech.Notes notes on the encoding
- doc/pcre.3 man page source for the PCRE functions
- doc/pcre.html HTML version
- doc/pcre.txt plain text version
- doc/pcreposix.3 man page source for the POSIX wrapper API
- doc/pcreposix.html HTML version
- doc/pcreposix.txt plain text version
- doc/pcretest.txt documentation of test program
- doc/perltest.txt documentation of Perl test program
- doc/pcregrep.1 man page source for the pcregrep utility
- doc/pcregrep.html HTML version
- doc/pcregrep.txt plain text version
- install-sh a shell script for installing files
- ltmain.sh file used to build a libtool script
- pcretest.c comprehensive test program
- pcredemo.c simple demonstration of coding calls to PCRE
- perltest Perl test program
- perltest8 Perl test program for UTF-8 tests
- pcregrep.c source of a grep utility that uses PCRE
- pcre-config.in source of script which retains PCRE information
- testdata/testinput1 test data, compatible with Perl
- testdata/testinput2 test data for error messages and non-Perl things
- testdata/testinput3 test data for locale-specific tests
- testdata/testinput4 test data for UTF-8 tests compatible with Perl
- testdata/testinput5 test data for other UTF-8 tests
- testdata/testoutput1 test results corresponding to testinput1
- testdata/testoutput2 test results corresponding to testinput2
- testdata/testoutput3 test results corresponding to testinput3
- testdata/testoutput4 test results corresponding to testinput4
- testdata/testoutput5 test results corresponding to testinput5
-
-(C) Auxiliary files for Win32 DLL
-
- dll.mk
- pcre.def
-
-(D) Auxiliary file for VPASCAL
-
- makevp.bat
-
-Philip Hazel <ph10@cam.ac.uk>
-August 2002
diff --git a/ext/pcre/pcrelib/chartables.c b/ext/pcre/pcrelib/chartables.c
deleted file mode 100644
index 9055da2d2d..0000000000
--- a/ext/pcre/pcrelib/chartables.c
+++ /dev/null
@@ -1,183 +0,0 @@
-/*************************************************
-* Perl-Compatible Regular Expressions *
-*************************************************/
-
-/* This file is automatically written by the dftables auxiliary
-program. If you edit it by hand, you might like to edit the Makefile to
-prevent its ever being regenerated.
-
-This file is #included in the compilation of pcre.c to build the default
-character tables which are used when no tables are passed to the compile
-function. */
-
-static unsigned char pcre_default_tables[] = {
-
-/* This table is a lower casing table. */
-
- 0, 1, 2, 3, 4, 5, 6, 7,
- 8, 9, 10, 11, 12, 13, 14, 15,
- 16, 17, 18, 19, 20, 21, 22, 23,
- 24, 25, 26, 27, 28, 29, 30, 31,
- 32, 33, 34, 35, 36, 37, 38, 39,
- 40, 41, 42, 43, 44, 45, 46, 47,
- 48, 49, 50, 51, 52, 53, 54, 55,
- 56, 57, 58, 59, 60, 61, 62, 63,
- 64, 97, 98, 99,100,101,102,103,
- 104,105,106,107,108,109,110,111,
- 112,113,114,115,116,117,118,119,
- 120,121,122, 91, 92, 93, 94, 95,
- 96, 97, 98, 99,100,101,102,103,
- 104,105,106,107,108,109,110,111,
- 112,113,114,115,116,117,118,119,
- 120,121,122,123,124,125,126,127,
- 128,129,130,131,132,133,134,135,
- 136,137,138,139,140,141,142,143,
- 144,145,146,147,148,149,150,151,
- 152,153,154,155,156,157,158,159,
- 160,161,162,163,164,165,166,167,
- 168,169,170,171,172,173,174,175,
- 176,177,178,179,180,181,182,183,
- 184,185,186,187,188,189,190,191,
- 192,193,194,195,196,197,198,199,
- 200,201,202,203,204,205,206,207,
- 208,209,210,211,212,213,214,215,
- 216,217,218,219,220,221,222,223,
- 224,225,226,227,228,229,230,231,
- 232,233,234,235,236,237,238,239,
- 240,241,242,243,244,245,246,247,
- 248,249,250,251,252,253,254,255,
-
-/* This table is a case flipping table. */
-
- 0, 1, 2, 3, 4, 5, 6, 7,
- 8, 9, 10, 11, 12, 13, 14, 15,
- 16, 17, 18, 19, 20, 21, 22, 23,
- 24, 25, 26, 27, 28, 29, 30, 31,
- 32, 33, 34, 35, 36, 37, 38, 39,
- 40, 41, 42, 43, 44, 45, 46, 47,
- 48, 49, 50, 51, 52, 53, 54, 55,
- 56, 57, 58, 59, 60, 61, 62, 63,
- 64, 97, 98, 99,100,101,102,103,
- 104,105,106,107,108,109,110,111,
- 112,113,114,115,116,117,118,119,
- 120,121,122, 91, 92, 93, 94, 95,
- 96, 65, 66, 67, 68, 69, 70, 71,
- 72, 73, 74, 75, 76, 77, 78, 79,
- 80, 81, 82, 83, 84, 85, 86, 87,
- 88, 89, 90,123,124,125,126,127,
- 128,129,130,131,132,133,134,135,
- 136,137,138,139,140,141,142,143,
- 144,145,146,147,148,149,150,151,
- 152,153,154,155,156,157,158,159,
- 160,161,162,163,164,165,166,167,
- 168,169,170,171,172,173,174,175,
- 176,177,178,179,180,181,182,183,
- 184,185,186,187,188,189,190,191,
- 192,193,194,195,196,197,198,199,
- 200,201,202,203,204,205,206,207,
- 208,209,210,211,212,213,214,215,
- 216,217,218,219,220,221,222,223,
- 224,225,226,227,228,229,230,231,
- 232,233,234,235,236,237,238,239,
- 240,241,242,243,244,245,246,247,
- 248,249,250,251,252,253,254,255,
-
-/* This table contains bit maps for various character classes.
-Each map is 32 bytes long and the bits run from the least
-significant end of each byte. The classes that have their own
-maps are: space, xdigit, digit, upper, lower, word, graph
-print, punct, and cntrl. Other classes are built from combinations. */
-
- 0x00,0x3e,0x00,0x00,0x01,0x00,0x00,0x00,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-
- 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
- 0x7e,0x00,0x00,0x00,0x7e,0x00,0x00,0x00,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-
- 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
- 0xfe,0xff,0xff,0x07,0x00,0x00,0x00,0x00,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
- 0x00,0x00,0x00,0x00,0xfe,0xff,0xff,0x07,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-
- 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
- 0xfe,0xff,0xff,0x87,0xfe,0xff,0xff,0x07,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-
- 0x00,0x00,0x00,0x00,0xfe,0xff,0xff,0xff,
- 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x7f,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-
- 0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff,
- 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x7f,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-
- 0x00,0x00,0x00,0x00,0xfe,0xff,0x00,0xfc,
- 0x01,0x00,0x00,0xf8,0x01,0x00,0x00,0x78,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-
- 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-
-/* This table identifies various classes of character by individual bits:
- 0x01 white space character
- 0x02 letter
- 0x04 decimal digit
- 0x08 hexadecimal digit
- 0x10 alphanumeric or '_'
- 0x80 regular expression metacharacter or binary zero
-*/
-
- 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
- 0x00,0x01,0x01,0x01,0x01,0x01,0x00,0x00, /* 8- 15 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
- 0x01,0x00,0x00,0x00,0x80,0x00,0x00,0x00, /* - ' */
- 0x80,0x80,0x80,0x80,0x00,0x00,0x80,0x00, /* ( - / */
- 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
- 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x80, /* 8 - ? */
- 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* @ - G */
- 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* H - O */
- 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* P - W */
- 0x12,0x12,0x12,0x80,0x00,0x00,0x80,0x10, /* X - _ */
- 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* ` - g */
- 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* h - o */
- 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* p - w */
- 0x12,0x12,0x12,0x80,0x80,0x00,0x00,0x00, /* x -127 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
-
-/* End of chartables.c */
diff --git a/ext/pcre/pcrelib/dftables.c b/ext/pcre/pcrelib/dftables.c
deleted file mode 100644
index fe4ffcdb7a..0000000000
--- a/ext/pcre/pcrelib/dftables.c
+++ /dev/null
@@ -1,148 +0,0 @@
-/*************************************************
-* Perl-Compatible Regular Expressions *
-*************************************************/
-
-/*
-PCRE is a library of functions to support regular expressions whose syntax
-and semantics are as close as possible to those of the Perl 5 language.
-
-Written by: Philip Hazel <ph10@cam.ac.uk>
-
- Copyright (c) 1997-2001 University of Cambridge
-
------------------------------------------------------------------------------
-Permission is granted to anyone to use this software for any purpose on any
-computer system, and to redistribute it freely, subject to the following
-restrictions:
-
-1. This software is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-
-2. The origin of this software must not be misrepresented, either by
- explicit claim or by omission.
-
-3. Altered versions must be plainly marked as such, and must not be
- misrepresented as being the original software.
-
-4. If PCRE is embedded in any software that is released under the GNU
- General Purpose Licence (GPL), then the terms of that licence shall
- supersede any condition above with which it is incompatible.
------------------------------------------------------------------------------
-
-See the file Tech.Notes for some information on the internals.
-*/
-
-
-/* This is a support program to generate the file chartables.c, containing
-character tables of various kinds. They are built according to the default C
-locale and used as the default tables by PCRE. Now that pcre_maketables is
-a function visible to the outside world, we make use of its code from here in
-order to be consistent. */
-
-#include <ctype.h>
-#include <stdio.h>
-#include <string.h>
-
-#include "internal.h"
-
-#define DFTABLES /* maketables.c notices this */
-#include "maketables.c"
-
-
-int main(void)
-{
-int i;
-const unsigned char *tables = pcre_maketables();
-
-printf(
- "/*************************************************\n"
- "* Perl-Compatible Regular Expressions *\n"
- "*************************************************/\n\n"
- "/* This file is automatically written by the dftables auxiliary \n"
- "program. If you edit it by hand, you might like to edit the Makefile to \n"
- "prevent its ever being regenerated.\n\n"
- "This file is #included in the compilation of pcre.c to build the default\n"
- "character tables which are used when no tables are passed to the compile\n"
- "function. */\n\n"
- "static unsigned char pcre_default_tables[] = {\n\n"
- "/* This table is a lower casing table. */\n\n");
-
-printf(" ");
-for (i = 0; i < 256; i++)
- {
- if ((i & 7) == 0 && i != 0) printf("\n ");
- printf("%3d", *tables++);
- if (i != 255) printf(",");
- }
-printf(",\n\n");
-
-printf("/* This table is a case flipping table. */\n\n");
-
-printf(" ");
-for (i = 0; i < 256; i++)
- {
- if ((i & 7) == 0 && i != 0) printf("\n ");
- printf("%3d", *tables++);
- if (i != 255) printf(",");
- }
-printf(",\n\n");
-
-printf(
- "/* This table contains bit maps for various character classes.\n"
- "Each map is 32 bytes long and the bits run from the least\n"
- "significant end of each byte. The classes that have their own\n"
- "maps are: space, xdigit, digit, upper, lower, word, graph\n"
- "print, punct, and cntrl. Other classes are built from combinations. */\n\n");
-
-printf(" ");
-for (i = 0; i < cbit_length; i++)
- {
- if ((i & 7) == 0 && i != 0)
- {
- if ((i & 31) == 0) printf("\n");
- printf("\n ");
- }
- printf("0x%02x", *tables++);
- if (i != cbit_length - 1) printf(",");
- }
-printf(",\n\n");
-
-printf(
- "/* This table identifies various classes of character by individual bits:\n"
- " 0x%02x white space character\n"
- " 0x%02x letter\n"
- " 0x%02x decimal digit\n"
- " 0x%02x hexadecimal digit\n"
- " 0x%02x alphanumeric or '_'\n"
- " 0x%02x regular expression metacharacter or binary zero\n*/\n\n",
- ctype_space, ctype_letter, ctype_digit, ctype_xdigit, ctype_word,
- ctype_meta);
-
-printf(" ");
-for (i = 0; i < 256; i++)
- {
- if ((i & 7) == 0 && i != 0)
- {
- printf(" /* ");
- if (isprint(i-8)) printf(" %c -", i-8);
- else printf("%3d-", i-8);
- if (isprint(i-1)) printf(" %c ", i-1);
- else printf("%3d", i-1);
- printf(" */\n ");
- }
- printf("0x%02x", *tables++);
- if (i != 255) printf(",");
- }
-
-printf("};/* ");
-if (isprint(i-8)) printf(" %c -", i-8);
- else printf("%3d-", i-8);
-if (isprint(i-1)) printf(" %c ", i-1);
- else printf("%3d", i-1);
-printf(" */\n\n/* End of chartables.c */\n");
-
-return 0;
-}
-
-/* End of dftables.c */
diff --git a/ext/pcre/pcrelib/dll.mk b/ext/pcre/pcrelib/dll.mk
deleted file mode 100644
index d8b728e57e..0000000000
--- a/ext/pcre/pcrelib/dll.mk
+++ /dev/null
@@ -1,60 +0,0 @@
-# dll.mk - auxiliary Makefile to easily build dll's for mingw32 target
-# ver. 0.6 of 1999-03-25
-#
-# Homepage of this makefile - http://www.is.lg.ua/~paul/devel/
-# Homepage of original mingw32 project -
-# http://www.fu.is.saga-u.ac.jp/~colin/gcc.html
-#
-# How to use:
-# This makefile can:
-# 1. Create automatical .def file from list of objects
-# 2. Create .dll from objects and .def file, either automatical, or your
-# hand-written (maybe) file, which must have same basename as dll
-# WARNING! There MUST be object, which name match dll's name. Make sux.
-# 3. Create import library from .def (as for .dll, only its name required,
-# not dll itself)
-# By convention implibs for dll have .dll.a suffix, e.g. libstuff.dll.a
-# Why not just libstuff.a? 'Cos that's name for static lib, ok?
-# Process divided into 3 phases because:
-# 1. Pre-existent .def possible
-# 2. Generating implib is enough time-consuming
-#
-# Variables:
-# DLL_LDLIBS - libs for linking dll
-# DLL_LDFLAGS - flags for linking dll
-#
-# By using $(DLL_SUFFIX) instead of 'dll', e.g. stuff.$(DLL_SUFFIX)
-# you may help porting makefiles to other platforms
-#
-# Put this file in your make's include path (e.g. main include dir, for
-# more information see include section in make doc). Put in the beginning
-# of your own Makefile line "include dll.mk". Specify dependences, e.g.:
-#
-# Do all stuff in one step
-# libstuff.dll.a: $(OBJECTS) stuff.def
-# stuff.def: $(OBJECTS)
-#
-# Steps separated, pre-provided .def, link with user32
-#
-# DLL_LDLIBS=-luser32
-# stuff.dll: $(OBJECTS)
-# libstuff.dll.a: $(OBJECTS)
-
-
-DLLWRAP=dllwrap
-DLLTOOL=dlltool
-
-DLL_SUFFIX=dll
-
-.SUFFIXES: .o .$(DLL_SUFFIX)
-
-_%.def: %.o
- $(DLLTOOL) --export-all --output-def $@ $^
-
-%.$(DLL_SUFFIX): %.o
- $(DLLWRAP) --dllname $(notdir $@) --driver-name $(CC) --def $*.def -o $@ $(filter %.o,$^) $(DLL_LDFLAGS) $(DLL_LDLIBS)
-
-lib%.$(DLL_SUFFIX).a:%.def
- $(DLLTOOL) --dllname $(notdir $*.dll) --def $< --output-lib $@
-
-# End
diff --git a/ext/pcre/pcrelib/doc/Tech.Notes b/ext/pcre/pcrelib/doc/Tech.Notes
deleted file mode 100644
index df8f21892f..0000000000
--- a/ext/pcre/pcrelib/doc/Tech.Notes
+++ /dev/null
@@ -1,253 +0,0 @@
-Technical Notes about PCRE
---------------------------
-
-Many years ago I implemented some regular expression functions to an algorithm
-suggested by Martin Richards. These were not Unix-like in form, and were quite
-restricted in what they could do by comparison with Perl. The interesting part
-about the algorithm was that the amount of space required to hold the compiled
-form of an expression was known in advance. The code to apply an expression did
-not operate by backtracking, as the Henry Spencer and Perl code does, but
-instead checked all possibilities simultaneously by keeping a list of current
-states and checking all of them as it advanced through the subject string. (In
-the terminology of Jeffrey Friedl's book, it was a "DFA algorithm".) When the
-pattern was all used up, all remaining states were possible matches, and the
-one matching the longest subset of the subject string was chosen. This did not
-necessarily maximize the individual wild portions of the pattern, as is
-expected in Unix and Perl-style regular expressions.
-
-By contrast, the code originally written by Henry Spencer and subsequently
-heavily modified for Perl actually compiles the expression twice: once in a
-dummy mode in order to find out how much store will be needed, and then for
-real. The execution function operates by backtracking and maximizing (or,
-optionally, minimizing in Perl) the amount of the subject that matches
-individual wild portions of the pattern. This is an "NFA algorithm" in Friedl's
-terminology.
-
-For the set of functions that forms PCRE (which are unrelated to those
-mentioned above), I tried at first to invent an algorithm that used an amount
-of store bounded by a multiple of the number of characters in the pattern, to
-save on compiling time. However, because of the greater complexity in Perl
-regular expressions, I couldn't do this. In any case, a first pass through the
-pattern is needed, in order to find internal flag settings like (?i) at top
-level. So PCRE works by running a very degenerate first pass to calculate a
-maximum store size, and then a second pass to do the real compile - which may
-use a bit less than the predicted amount of store. The idea is that this is
-going to turn out faster because the first pass is degenerate and the second
-pass can just store stuff straight into the vector. It does make the compiling
-functions bigger, of course, but they have got quite big anyway to handle all
-the Perl stuff.
-
-The compiled form of a pattern is a vector of bytes, containing items of
-variable length. The first byte in an item is an opcode, and the length of the
-item is either implicit in the opcode or contained in the data bytes which
-follow it. A list of all the opcodes follows:
-
-Opcodes with no following data
-------------------------------
-
-These items are all just one byte long
-
- OP_END end of pattern
- OP_ANY match any character
- OP_SOD match start of data: \A
- OP_CIRC ^ (start of data, or after \n in multiline)
- OP_NOT_WORD_BOUNDARY \B
- OP_WORD_BOUNDARY \b
- OP_NOT_DIGIT \D
- OP_DIGIT \d
- OP_NOT_WHITESPACE \S
- OP_WHITESPACE \s
- OP_NOT_WORDCHAR \W
- OP_WORDCHAR \w
- OP_EODN match end of data or \n at end: \Z
- OP_EOD match end of data: \z
- OP_DOLL $ (end of data, or before \n in multiline)
- OP_RECURSE match the pattern recursively
-
-
-Repeating single characters
----------------------------
-
-The common repeats (*, +, ?) when applied to a single character appear as
-two-byte items using the following opcodes:
-
- OP_STAR
- OP_MINSTAR
- OP_PLUS
- OP_MINPLUS
- OP_QUERY
- OP_MINQUERY
-
-Those with "MIN" in their name are the minimizing versions. Each is followed by
-the character that is to be repeated. Other repeats make use of
-
- OP_UPTO
- OP_MINUPTO
- OP_EXACT
-
-which are followed by a two-byte count (most significant first) and the
-repeated character. OP_UPTO matches from 0 to the given number. A repeat with a
-non-zero minimum and a fixed maximum is coded as an OP_EXACT followed by an
-OP_UPTO (or OP_MINUPTO).
-
-
-Repeating character types
--------------------------
-
-Repeats of things like \d are done exactly as for single characters, except
-that instead of a character, the opcode for the type is stored in the data
-byte. The opcodes are:
-
- OP_TYPESTAR
- OP_TYPEMINSTAR
- OP_TYPEPLUS
- OP_TYPEMINPLUS
- OP_TYPEQUERY
- OP_TYPEMINQUERY
- OP_TYPEUPTO
- OP_TYPEMINUPTO
- OP_TYPEEXACT
-
-
-Matching a character string
----------------------------
-
-The OP_CHARS opcode is followed by a one-byte count and then that number of
-characters. If there are more than 255 characters in sequence, successive
-instances of OP_CHARS are used.
-
-
-Character classes
------------------
-
-OP_CLASS is used for a character class, provided there are at least two
-characters in the class. If there is only one character, OP_CHARS is used for a
-positive class, and OP_NOT for a negative one (that is, for something like
-[^a]). Another set of repeating opcodes (OP_NOTSTAR etc.) are used for a
-repeated, negated, single-character class. The normal ones (OP_STAR etc.) are
-used for a repeated positive single-character class.
-
-OP_CLASS is followed by a 32-byte bit map containing a 1 bit for every
-character that is acceptable. The bits are counted from the least significant
-end of each byte.
-
-
-Back references
----------------
-
-OP_REF is followed by two bytes containing the reference number.
-
-
-Repeating character classes and back references
------------------------------------------------
-
-Single-character classes are handled specially (see above). This section applies to
-OP_CLASS and OP_REF. In both cases, the repeat information follows the base
-item. The matching code looks at the following opcode to see if it is one of
-
- OP_CRSTAR
- OP_CRMINSTAR
- OP_CRPLUS
- OP_CRMINPLUS
- OP_CRQUERY
- OP_CRMINQUERY
- OP_CRRANGE
- OP_CRMINRANGE
-
-All but the last two are just single-byte items. The others are followed by
-four bytes of data, comprising the minimum and maximum repeat counts.
-
-
-Brackets and alternation
-------------------------
-
-A pair of non-capturing (round) brackets is wrapped round each expression at
-compile time, so alternation always happens in the context of brackets.
-
-Non-capturing brackets use the opcode OP_BRA, while capturing brackets use
-OP_BRA+1, OP_BRA+2, etc. [Note for North Americans: "bracket" to some English
-speakers, including myself, can be round, square, curly, or pointy. Hence this
-usage.]
-
-Originally PCRE was limited to 99 capturing brackets (so as not to use up all
-the opcodes). From release 3.5, there is no limit. What happens is that the
-first ones, up to EXTRACT_BASIC_MAX are handled with separate opcodes, as
-above. If there are more, the opcode is set to EXTRACT_BASIC_MAX+1, and the
-first operation in the bracket is OP_BRANUMBER, followed by a 2-byte bracket
-number. This opcode is ignored while matching, but is fished out when handling
-the bracket itself. (They could have all been done like this, but I was making
-minimal changes.)
-
-A bracket opcode is followed by two bytes which give the offset to the next
-alternative OP_ALT or, if there aren't any branches, to the matching KET
-opcode. Each OP_ALT is followed by two bytes giving the offset to the next one,
-or to the KET opcode.
-
-OP_KET is used for subpatterns that do not repeat indefinitely, while
-OP_KETRMIN and OP_KETRMAX are used for indefinite repetitions, minimally or
-maximally respectively. All three are followed by two bytes giving (as a
-positive number) the offset back to the matching BRA opcode.
-
-If a subpattern is quantified such that it is permitted to match zero times, it
-is preceded by one of OP_BRAZERO or OP_BRAMINZERO. These are single-byte
-opcodes which tell the matcher that skipping this subpattern entirely is a
-valid branch.
-
-A subpattern with an indefinite maximum repetition is replicated in the
-compiled data its minimum number of times (or once with a BRAZERO if the
-minimum is zero), with the final copy terminating with a KETRMIN or KETRMAX as
-appropriate.
-
-A subpattern with a bounded maximum repetition is replicated in a nested
-fashion up to the maximum number of times, with BRAZERO or BRAMINZERO before
-each replication after the minimum, so that, for example, (abc){2,5} is
-compiled as (abc)(abc)((abc)((abc)(abc)?)?)?. The 99 and 200 bracket limits do
-not apply to these internally generated brackets.
-
-
-Assertions
-----------
-
-Forward assertions are just like other subpatterns, but starting with one of
-the opcodes OP_ASSERT or OP_ASSERT_NOT. Backward assertions use the opcodes
-OP_ASSERTBACK and OP_ASSERTBACK_NOT, and the first opcode inside the assertion
-is OP_REVERSE, followed by a two byte count of the number of characters to move
-back the pointer in the subject string. When operating in UTF-8 mode, the count
-is a character count rather than a byte count. A separate count is present in
-each alternative of a lookbehind assertion, allowing them to have different
-fixed lengths.
-
-
-Once-only subpatterns
----------------------
-
-These are also just like other subpatterns, but they start with the opcode
-OP_ONCE.
-
-
-Conditional subpatterns
------------------------
-
-These are like other subpatterns, but they start with the opcode OP_COND. If
-the condition is a back reference, this is stored at the start of the
-subpattern using the opcode OP_CREF followed by two bytes containing the
-reference number. If the condition is "in recursion" (coded as "(?(R)"), the
-same scheme is used, with a "reference number" of 0xffff. Otherwise, a
-conditional subpattern always starts with one of the assertions.
-
-
-Changing options
-----------------
-
-If any of the /i, /m, or /s options are changed within a pattern, an OP_OPT
-opcode is compiled, followed by one byte containing the new settings of these
-flags. If there are several alternatives, there is an occurrence of OP_OPT at
-the start of all those following the first options change, to set appropriate
-options for the start of the alternative. Immediately after the end of the
-group there is another such item to reset the flags to their previous values. A
-change of flag right at the very start of the pattern can be handled entirely
-at compile time, and so does not cause anything to be put into the compiled
-data.
-
-Philip Hazel
-August 2002
diff --git a/ext/pcre/pcrelib/doc/pcre.3 b/ext/pcre/pcrelib/doc/pcre.3
deleted file mode 100644
index 4827c5e21d..0000000000
--- a/ext/pcre/pcrelib/doc/pcre.3
+++ /dev/null
@@ -1,1993 +0,0 @@
-.TH PCRE 3
-.SH NAME
-pcre - Perl-compatible regular expressions.
-.SH SYNOPSIS
-.B #include <pcre.h>
-.PP
-.SM
-.br
-.B pcre *pcre_compile(const char *\fIpattern\fR, int \fIoptions\fR,
-.ti +5n
-.B const char **\fIerrptr\fR, int *\fIerroffset\fR,
-.ti +5n
-.B const unsigned char *\fItableptr\fR);
-.PP
-.br
-.B pcre_extra *pcre_study(const pcre *\fIcode\fR, int \fIoptions\fR,
-.ti +5n
-.B const char **\fIerrptr\fR);
-.PP
-.br
-.B int pcre_exec(const pcre *\fIcode\fR, "const pcre_extra *\fIextra\fR,"
-.ti +5n
-.B "const char *\fIsubject\fR," int \fIlength\fR, int \fIstartoffset\fR,
-.ti +5n
-.B int \fIoptions\fR, int *\fIovector\fR, int \fIovecsize\fR);
-.PP
-.br
-.B int pcre_copy_substring(const char *\fIsubject\fR, int *\fIovector\fR,
-.ti +5n
-.B int \fIstringcount\fR, int \fIstringnumber\fR, char *\fIbuffer\fR,
-.ti +5n
-.B int \fIbuffersize\fR);
-.PP
-.br
-.B int pcre_get_substring(const char *\fIsubject\fR, int *\fIovector\fR,
-.ti +5n
-.B int \fIstringcount\fR, int \fIstringnumber\fR,
-.ti +5n
-.B const char **\fIstringptr\fR);
-.PP
-.br
-.B int pcre_get_substring_list(const char *\fIsubject\fR,
-.ti +5n
-.B int *\fIovector\fR, int \fIstringcount\fR, "const char ***\fIlistptr\fR);"
-.PP
-.br
-.B void pcre_free_substring(const char *\fIstringptr\fR);
-.PP
-.br
-.B void pcre_free_substring_list(const char **\fIstringptr\fR);
-.PP
-.br
-.B const unsigned char *pcre_maketables(void);
-.PP
-.br
-.B int pcre_fullinfo(const pcre *\fIcode\fR, "const pcre_extra *\fIextra\fR,"
-.ti +5n
-.B int \fIwhat\fR, void *\fIwhere\fR);
-.PP
-.br
-.B int pcre_info(const pcre *\fIcode\fR, int *\fIoptptr\fR, int
-.B *\fIfirstcharptr\fR);
-.PP
-.br
-.B char *pcre_version(void);
-.PP
-.br
-.B void *(*pcre_malloc)(size_t);
-.PP
-.br
-.B void (*pcre_free)(void *);
-
-
-
-.SH DESCRIPTION
-The PCRE library is a set of functions that implement regular expression
-pattern matching using the same syntax and semantics as Perl 5, with just a few
-differences (see below). The current implementation corresponds to Perl 5.005,
-with some additional features from later versions. This includes some
-experimental, incomplete support for UTF-8 encoded strings. Details of exactly
-what is and what is not supported are given below.
-
-PCRE has its own native API, which is described in this document. There is also
-a set of wrapper functions that correspond to the POSIX regular expression API.
-These are described in the \fBpcreposix\fR documentation.
-
-The native API function prototypes are defined in the header file \fBpcre.h\fR,
-and on Unix systems the library itself is called \fBlibpcre.a\fR, so can be
-accessed by adding \fB-lpcre\fR to the command for linking an application which
-calls it. The header file defines the macros PCRE_MAJOR and PCRE_MINOR to
-contain the major and minor release numbers for the library. Applications can
-use these to include support for different releases.
-
-The functions \fBpcre_compile()\fR, \fBpcre_study()\fR, and \fBpcre_exec()\fR
-are used for compiling and matching regular expressions. A sample program that
-demonstrates the simplest way of using them is given in the file
-\fIpcredemo.c\fR. The last section of this man page describes how to run it.
-
-The functions \fBpcre_copy_substring()\fR, \fBpcre_get_substring()\fR, and
-\fBpcre_get_substring_list()\fR are convenience functions for extracting
-captured substrings from a matched subject string; \fBpcre_free_substring()\fR
-and \fBpcre_free_substring_list()\fR are also provided, to free the memory used
-for extracted strings.
-
-The function \fBpcre_maketables()\fR is used (optionally) to build a set of
-character tables in the current locale for passing to \fBpcre_compile()\fR.
-
-The function \fBpcre_fullinfo()\fR is used to find out information about a
-compiled pattern; \fBpcre_info()\fR is an obsolete version which returns only
-some of the available information, but is retained for backwards compatibility.
-The function \fBpcre_version()\fR returns a pointer to a string containing the
-version of PCRE and its date of release.
-
-The global variables \fBpcre_malloc\fR and \fBpcre_free\fR initially contain
-the entry points of the standard \fBmalloc()\fR and \fBfree()\fR functions
-respectively. PCRE calls the memory management functions via these variables,
-so a calling program can replace them if it wishes to intercept the calls. This
-should be done before calling any PCRE functions.
-
-
-.SH MULTI-THREADING
-The PCRE functions can be used in multi-threading applications, with the
-proviso that the memory management functions pointed to by \fBpcre_malloc\fR
-and \fBpcre_free\fR are shared by all threads.
-
-The compiled form of a regular expression is not altered during matching, so
-the same compiled pattern can safely be used by several threads at once.
-
-
-.SH COMPILING A PATTERN
-The function \fBpcre_compile()\fR is called to compile a pattern into an
-internal form. The pattern is a C string terminated by a binary zero, and
-is passed in the argument \fIpattern\fR. A pointer to a single block of memory
-that is obtained via \fBpcre_malloc\fR is returned. This contains the compiled
-code and related data. The \fBpcre\fR type is defined for the returned block;
-this is a typedef for a structure whose contents are not externally defined. It
-is up to the caller to free the memory when it is no longer required.
-
-Although the compiled code of a PCRE regex is relocatable, that is, it does not
-depend on memory location, the complete \fBpcre\fR data block is not
-fully relocatable, because it contains a copy of the \fItableptr\fR argument,
-which is an address (see below).
-
-The size of a compiled pattern is roughly proportional to the length of the
-pattern string, except that each character class (other than those containing
-just a single character, negated or not) requires 33 bytes, and repeat
-quantifiers with a minimum greater than one or a bounded maximum cause the
-relevant portions of the compiled pattern to be replicated.
-
-The \fIoptions\fR argument contains independent bits that affect the
-compilation. It should be zero if no options are required. Some of the options,
-in particular, those that are compatible with Perl, can also be set and unset
-from within the pattern (see the detailed description of regular expressions
-below). For these options, the contents of the \fIoptions\fR argument specifies
-their initial settings at the start of compilation and execution. The
-PCRE_ANCHORED option can be set at the time of matching as well as at compile
-time.
-
-If \fIerrptr\fR is NULL, \fBpcre_compile()\fR returns NULL immediately.
-Otherwise, if compilation of a pattern fails, \fBpcre_compile()\fR returns
-NULL, and sets the variable pointed to by \fIerrptr\fR to point to a textual
-error message. The offset from the start of the pattern to the character where
-the error was discovered is placed in the variable pointed to by
-\fIerroffset\fR, which must not be NULL. If it is, an immediate error is given.
-
-If the final argument, \fItableptr\fR, is NULL, PCRE uses a default set of
-character tables which are built when it is compiled, using the default C
-locale. Otherwise, \fItableptr\fR must be the result of a call to
-\fBpcre_maketables()\fR. See the section on locale support below.
-
-This code fragment shows a typical straightforward call to \fBpcre_compile()\fR:
-
- pcre *re;
- const char *error;
- int erroffset;
- re = pcre_compile(
- "^A.*Z", /* the pattern */
- 0, /* default options */
- &error, /* for error message */
- &erroffset, /* for error offset */
- NULL); /* use default character tables */
-
-The following option bits are defined in the header file:
-
- PCRE_ANCHORED
-
-If this bit is set, the pattern is forced to be "anchored", that is, it is
-constrained to match only at the start of the string which is being searched
-(the "subject string"). This effect can also be achieved by appropriate
-constructs in the pattern itself, which is the only way to do it in Perl.
-
- PCRE_CASELESS
-
-If this bit is set, letters in the pattern match both upper and lower case
-letters. It is equivalent to Perl's /i option.
-
- PCRE_DOLLAR_ENDONLY
-
-If this bit is set, a dollar metacharacter in the pattern matches only at the
-end of the subject string. Without this option, a dollar also matches
-immediately before the final character if it is a newline (but not before any
-other newlines). The PCRE_DOLLAR_ENDONLY option is ignored if PCRE_MULTILINE is
-set. There is no equivalent to this option in Perl.
-
- PCRE_DOTALL
-
-If this bit is set, a dot metacharacter in the pattern matches all characters,
-including newlines. Without it, newlines are excluded. This option is
-equivalent to Perl's /s option. A negative class such as [^a] always matches a
-newline character, independent of the setting of this option.
-
- PCRE_EXTENDED
-
-If this bit is set, whitespace data characters in the pattern are totally
-ignored except when escaped or inside a character class, and characters between
-an unescaped # outside a character class and the next newline character,
-inclusive, are also ignored. This is equivalent to Perl's /x option, and makes
-it possible to include comments inside complicated patterns. Note, however,
-that this applies only to data characters. Whitespace characters may never
-appear within special character sequences in a pattern, for example within the
-sequence (?( which introduces a conditional subpattern.
-
- PCRE_EXTRA
-
-This option was invented in order to turn on additional functionality of PCRE
-that is incompatible with Perl, but it is currently of very little use. When
-set, any backslash in a pattern that is followed by a letter that has no
-special meaning causes an error, thus reserving these combinations for future
-expansion. By default, as in Perl, a backslash followed by a letter with no
-special meaning is treated as a literal. There are at present no other features
-controlled by this option. It can also be set by a (?X) option setting within a
-pattern.
-
- PCRE_MULTILINE
-
-By default, PCRE treats the subject string as consisting of a single "line" of
-characters (even if it actually contains several newlines). The "start of line"
-metacharacter (^) matches only at the start of the string, while the "end of
-line" metacharacter ($) matches only at the end of the string, or before a
-terminating newline (unless PCRE_DOLLAR_ENDONLY is set). This is the same as
-Perl.
-
-When PCRE_MULTILINE is set, the "start of line" and "end of line" constructs
-match immediately following or immediately before any newline in the subject
-string, respectively, as well as at the very start and end. This is equivalent
-to Perl's /m option. If there are no "\\n" characters in a subject string, or
-no occurrences of ^ or $ in a pattern, setting PCRE_MULTILINE has no
-effect.
-
- PCRE_UNGREEDY
-
-This option inverts the "greediness" of the quantifiers so that they are not
-greedy by default, but become greedy if followed by "?". It is not compatible
-with Perl. It can also be set by a (?U) option setting within the pattern.
-
- PCRE_UTF8
-
-This option causes PCRE to regard both the pattern and the subject as strings
-of UTF-8 characters instead of just byte strings. However, it is available only
-if PCRE has been built to include UTF-8 support. If not, the use of this option
-provokes an error. Support for UTF-8 is new, experimental, and incomplete.
-Details of exactly what it entails are given below.
-
-
-.SH STUDYING A PATTERN
-When a pattern is going to be used several times, it is worth spending more
-time analyzing it in order to speed up the time taken for matching. The
-function \fBpcre_study()\fR takes a pointer to a compiled pattern as its first
-argument, and returns a pointer to a \fBpcre_extra\fR block (another typedef
-for a structure with hidden contents) containing additional information about
-the pattern; this can be passed to \fBpcre_exec()\fR. If no additional
-information is available, NULL is returned.
-
-The second argument contains option bits. At present, no options are defined
-for \fBpcre_study()\fR, and this argument should always be zero.
-
-The third argument for \fBpcre_study()\fR is a pointer to an error message. If
-studying succeeds (even if no data is returned), the variable it points to is
-set to NULL. Otherwise it points to a textual error message.
-
-This is a typical call to \fBpcre_study\fR():
-
- pcre_extra *pe;
- pe = pcre_study(
- re, /* result of pcre_compile() */
- 0, /* no options exist */
- &error); /* set to NULL or points to a message */
-
-At present, studying a pattern is useful only for non-anchored patterns that do
-not have a single fixed starting character. A bitmap of possible starting
-characters is created.
-
-
-.SH LOCALE SUPPORT
-PCRE handles caseless matching, and determines whether characters are letters,
-digits, or whatever, by reference to a set of tables. The library contains a
-default set of tables which is created in the default C locale when PCRE is
-compiled. This is used when the final argument of \fBpcre_compile()\fR is NULL,
-and is sufficient for many applications.
-
-An alternative set of tables can, however, be supplied. Such tables are built
-by calling the \fBpcre_maketables()\fR function, which has no arguments, in the
-relevant locale. The result can then be passed to \fBpcre_compile()\fR as often
-as necessary. For example, to build and use tables that are appropriate for the
-French locale (where accented characters with codes greater than 128 are
-treated as letters), the following code could be used:
-
- setlocale(LC_CTYPE, "fr");
- tables = pcre_maketables();
- re = pcre_compile(..., tables);
-
-The tables are built in memory that is obtained via \fBpcre_malloc\fR. The
-pointer that is passed to \fBpcre_compile\fR is saved with the compiled
-pattern, and the same tables are used via this pointer by \fBpcre_study()\fR
-and \fBpcre_exec()\fR. Thus for any single pattern, compilation, studying and
-matching all happen in the same locale, but different patterns can be compiled
-in different locales. It is the caller's responsibility to ensure that the
-memory containing the tables remains available for as long as it is needed.
-
-
-.SH INFORMATION ABOUT A PATTERN
-The \fBpcre_fullinfo()\fR function returns information about a compiled
-pattern. It replaces the obsolete \fBpcre_info()\fR function, which is
-nevertheless retained for backwards compatibility (and is documented below).
-
-The first argument for \fBpcre_fullinfo()\fR is a pointer to the compiled
-pattern. The second argument is the result of \fBpcre_study()\fR, or NULL if
-the pattern was not studied. The third argument specifies which piece of
-information is required, while the fourth argument is a pointer to a variable
-to receive the data. The yield of the function is zero for success, or one of
-the following negative numbers:
-
- PCRE_ERROR_NULL the argument \fIcode\fR was NULL
- the argument \fIwhere\fR was NULL
- PCRE_ERROR_BADMAGIC the "magic number" was not found
- PCRE_ERROR_BADOPTION the value of \fIwhat\fR was invalid
-
-Here is a typical call of \fBpcre_fullinfo()\fR, to obtain the length of the
-compiled pattern:
-
- int rc;
- size_t length;
- rc = pcre_fullinfo(
- re, /* result of pcre_compile() */
- pe, /* result of pcre_study(), or NULL */
- PCRE_INFO_SIZE, /* what is required */
- &length); /* where to put the data */
-
-The possible values for the third argument are defined in \fBpcre.h\fR, and are
-as follows:
-
- PCRE_INFO_OPTIONS
-
-Return a copy of the options with which the pattern was compiled. The fourth
-argument should point to an \fBunsigned long int\fR variable. These option bits
-are those specified in the call to \fBpcre_compile()\fR, modified by any
-top-level option settings within the pattern itself, and with the PCRE_ANCHORED
-bit forcibly set if the form of the pattern implies that it can match only at
-the start of a subject string.
-
- PCRE_INFO_SIZE
-
-Return the size of the compiled pattern, that is, the value that was passed as
-the argument to \fBpcre_malloc()\fR when PCRE was getting memory in which to
-place the compiled data. The fourth argument should point to a \fBsize_t\fR
-variable.
-
- PCRE_INFO_CAPTURECOUNT
-
-Return the number of capturing subpatterns in the pattern. The fourth argument
-should point to an \fBint\fR variable.
-
- PCRE_INFO_BACKREFMAX
-
-Return the number of the highest back reference in the pattern. The fourth
-argument should point to an \fBint\fR variable. Zero is returned if there are
-no back references.
-
- PCRE_INFO_FIRSTCHAR
-
-Return information about the first character of any matched string, for a
-non-anchored pattern. If there is a fixed first character, e.g. from a pattern
-such as (cat|cow|coyote), it is returned in the integer pointed to by
-\fIwhere\fR. Otherwise, if either
-
-(a) the pattern was compiled with the PCRE_MULTILINE option, and every branch
-starts with "^", or
-
-(b) every branch of the pattern starts with ".*" and PCRE_DOTALL is not set
-(if it were set, the pattern would be anchored),
-
--1 is returned, indicating that the pattern matches only at the start of a
-subject string or after any "\\n" within the string. Otherwise -2 is returned.
-For anchored patterns, -2 is returned.
-
- PCRE_INFO_FIRSTTABLE
-
-If the pattern was studied, and this resulted in the construction of a 256-bit
-table indicating a fixed set of characters for the first character in any
-matching string, a pointer to the table is returned. Otherwise NULL is
-returned. The fourth argument should point to an \fBunsigned char *\fR
-variable.
-
- PCRE_INFO_LASTLITERAL
-
-For a non-anchored pattern, return the value of the rightmost literal character
-which must exist in any matched string, other than at its start. The fourth
-argument should point to an \fBint\fR variable. If there is no such character,
-or if the pattern is anchored, -1 is returned. For example, for the pattern
-/a\\d+z\\d+/ the returned value is 'z'.
-
-The \fBpcre_info()\fR function is now obsolete because its interface is too
-restrictive to return all the available data about a compiled pattern. New
-programs should use \fBpcre_fullinfo()\fR instead. The yield of
-\fBpcre_info()\fR is the number of capturing subpatterns, or one of the
-following negative numbers:
-
- PCRE_ERROR_NULL the argument \fIcode\fR was NULL
- PCRE_ERROR_BADMAGIC the "magic number" was not found
-
-If the \fIoptptr\fR argument is not NULL, a copy of the options with which the
-pattern was compiled is placed in the integer it points to (see
-PCRE_INFO_OPTIONS above).
-
-If the pattern is not anchored and the \fIfirstcharptr\fR argument is not NULL,
-it is used to pass back information about the first character of any matched
-string (see PCRE_INFO_FIRSTCHAR above).
-
-
-.SH MATCHING A PATTERN
-The function \fBpcre_exec()\fR is called to match a subject string against a
-pre-compiled pattern, which is passed in the \fIcode\fR argument. If the
-pattern has been studied, the result of the study should be passed in the
-\fIextra\fR argument. Otherwise this must be NULL.
-
-Here is an example of a simple call to \fBpcre_exec()\fR:
-
- int rc;
- int ovector[30];
- rc = pcre_exec(
- re, /* result of pcre_compile() */
- NULL, /* we didn't study the pattern */
- "some string", /* the subject string */
- 11, /* the length of the subject string */
- 0, /* start at offset 0 in the subject */
- 0, /* default options */
- ovector, /* vector for substring information */
- 30); /* number of elements in the vector */
-
-The PCRE_ANCHORED option can be passed in the \fIoptions\fR argument, whose
-unused bits must be zero. However, if a pattern was compiled with
-PCRE_ANCHORED, or turned out to be anchored by virtue of its contents, it
-cannot be made unanchored at matching time.
-
-There are also three further options that can be set only at matching time:
-
- PCRE_NOTBOL
-
-The first character of the string is not the beginning of a line, so the
-circumflex metacharacter should not match before it. Setting this without
-PCRE_MULTILINE (at compile time) causes circumflex never to match.
-
- PCRE_NOTEOL
-
-The end of the string is not the end of a line, so the dollar metacharacter
-should not match it nor (except in multiline mode) a newline immediately before
-it. Setting this without PCRE_MULTILINE (at compile time) causes dollar never
-to match.
-
- PCRE_NOTEMPTY
-
-An empty string is not considered to be a valid match if this option is set. If
-there are alternatives in the pattern, they are tried. If all the alternatives
-match the empty string, the entire match fails. For example, if the pattern
-
- a?b?
-
-is applied to a string not beginning with "a" or "b", it matches the empty
-string at the start of the subject. With PCRE_NOTEMPTY set, this match is not
-valid, so PCRE searches further into the string for occurrences of "a" or "b".
-
-Perl has no direct equivalent of PCRE_NOTEMPTY, but it does make a special case
-of a pattern match of the empty string within its \fBsplit()\fR function, and
-when using the /g modifier. It is possible to emulate Perl's behaviour after
-matching a null string by first trying the match again at the same offset with
-PCRE_NOTEMPTY set, and then if that fails by advancing the starting offset (see
-below) and trying an ordinary match again.
-
-The subject string is passed as a pointer in \fIsubject\fR, a length in
-\fIlength\fR, and a starting offset in \fIstartoffset\fR. Unlike the pattern
-string, the subject may contain binary zero characters. When the starting
-offset is zero, the search for a match starts at the beginning of the subject,
-and this is by far the most common case.
-
-A non-zero starting offset is useful when searching for another match in the
-same subject by calling \fBpcre_exec()\fR again after a previous success.
-Setting \fIstartoffset\fR differs from just passing over a shortened string and
-setting PCRE_NOTBOL in the case of a pattern that begins with any kind of
-lookbehind. For example, consider the pattern
-
- \\Biss\\B
-
-which finds occurrences of "iss" in the middle of words. (\\B matches only if
-the current position in the subject is not a word boundary.) When applied to
-the string "Mississippi", the first call to \fBpcre_exec()\fR finds the first
-occurrence. If \fBpcre_exec()\fR is called again with just the remainder of the
-subject, namely "issippi", it does not match, because \\B is always false at the
-start of the subject, which is deemed to be a word boundary. However, if
-\fBpcre_exec()\fR is passed the entire string again, but with \fIstartoffset\fR
-set to 4, it finds the second occurrence of "iss" because it is able to look
-behind the starting point to discover that it is preceded by a letter.
-
-If a non-zero starting offset is passed when the pattern is anchored, one
-attempt to match at the given offset is tried. This can only succeed if the
-pattern does not require the match to be at the start of the subject.
-
-In general, a pattern matches a certain portion of the subject, and in
-addition, further substrings from the subject may be picked out by parts of the
-pattern. Following the usage in Jeffrey Friedl's book, this is called
-"capturing" in what follows, and the phrase "capturing subpattern" is used for
-a fragment of a pattern that picks out a substring. PCRE supports several other
-kinds of parenthesized subpattern that do not cause substrings to be captured.
-
-Captured substrings are returned to the caller via a vector of integer offsets
-whose address is passed in \fIovector\fR. The number of elements in the vector
-is passed in \fIovecsize\fR. The first two-thirds of the vector is used to pass
-back captured substrings, each substring using a pair of integers. The
-remaining third of the vector is used as workspace by \fBpcre_exec()\fR while
-matching capturing subpatterns, and is not available for passing back
-information. The length passed in \fIovecsize\fR should always be a multiple of
-three. If it is not, it is rounded down.
-
-When a match has been successful, information about captured substrings is
-returned in pairs of integers, starting at the beginning of \fIovector\fR, and
-continuing up to two-thirds of its length at the most. The first element of a
-pair is set to the offset of the first character in a substring, and the second
-is set to the offset of the first character after the end of a substring. The
-first pair, \fIovector[0]\fR and \fIovector[1]\fR, identifies the portion of the
-subject string matched by the entire pattern. The next pair is used for the
-first capturing subpattern, and so on. The value returned by \fBpcre_exec()\fR
-is the number of pairs that have been set. If there are no capturing
-subpatterns, the return value from a successful match is 1, indicating that
-just the first pair of offsets has been set.
-
-Some convenience functions are provided for extracting the captured substrings
-as separate strings. These are described in the following section.
-
-It is possible for a capturing subpattern number \fIn+1\fR to match some
-part of the subject when subpattern \fIn\fR has not been used at all. For
-example, if the string "abc" is matched against the pattern (a|(z))(bc)
-subpatterns 1 and 3 are matched, but 2 is not. When this happens, both offset
-values corresponding to the unused subpattern are set to -1.
-
-If a capturing subpattern is matched repeatedly, it is the last portion of the
-string that it matched that gets returned.
-
-If the vector is too small to hold all the captured substrings, it is used as
-far as possible (up to two-thirds of its length), and the function returns a
-value of zero. In particular, if the substring offsets are not of interest,
-\fBpcre_exec()\fR may be called with \fIovector\fR passed as NULL and
-\fIovecsize\fR as zero. However, if the pattern contains back references and
-the \fIovector\fR isn't big enough to remember the related substrings, PCRE has
-to get additional memory for use during matching. Thus it is usually advisable
-to supply an \fIovector\fR.
-
-Note that \fBpcre_info()\fR can be used to find out how many capturing
-subpatterns there are in a compiled pattern. The smallest size for
-\fIovector\fR that will allow for \fIn\fR captured substrings in addition to
-the offsets of the substring matched by the whole pattern is (\fIn\fR+1)*3.
-
-If \fBpcre_exec()\fR fails, it returns a negative number. The following are
-defined in the header file:
-
- PCRE_ERROR_NOMATCH (-1)
-
-The subject string did not match the pattern.
-
- PCRE_ERROR_NULL (-2)
-
-Either \fIcode\fR or \fIsubject\fR was passed as NULL, or \fIovector\fR was
-NULL and \fIovecsize\fR was not zero.
-
- PCRE_ERROR_BADOPTION (-3)
-
-An unrecognized bit was set in the \fIoptions\fR argument.
-
- PCRE_ERROR_BADMAGIC (-4)
-
-PCRE stores a 4-byte "magic number" at the start of the compiled code, to catch
-the case when it is passed a junk pointer. This is the error it gives when the
-magic number isn't present.
-
- PCRE_ERROR_UNKNOWN_NODE (-5)
-
-While running the pattern match, an unknown item was encountered in the
-compiled pattern. This error could be caused by a bug in PCRE or by overwriting
-of the compiled pattern.
-
- PCRE_ERROR_NOMEMORY (-6)
-
-If a pattern contains back references, but the \fIovector\fR that is passed to
-\fBpcre_exec()\fR is not big enough to remember the referenced substrings, PCRE
-gets a block of memory at the start of matching to use for this purpose. If the
-call via \fBpcre_malloc()\fR fails, this error is given. The memory is freed at
-the end of matching.
-
-
-.SH EXTRACTING CAPTURED SUBSTRINGS
-Captured substrings can be accessed directly by using the offsets returned by
-\fBpcre_exec()\fR in \fIovector\fR. For convenience, the functions
-\fBpcre_copy_substring()\fR, \fBpcre_get_substring()\fR, and
-\fBpcre_get_substring_list()\fR are provided for extracting captured substrings
-as new, separate, zero-terminated strings. A substring that contains a binary
-zero is correctly extracted and has a further zero added on the end, but the
-result does not, of course, function as a C string.
-
-The first three arguments are the same for all three functions: \fIsubject\fR
-is the subject string which has just been successfully matched, \fIovector\fR
-is a pointer to the vector of integer offsets that was passed to
-\fBpcre_exec()\fR, and \fIstringcount\fR is the number of substrings that
-were captured by the match, including the substring that matched the entire
-regular expression. This is the value returned by \fBpcre_exec()\fR if it
-is greater than zero. If \fBpcre_exec()\fR returned zero, indicating that it
-ran out of space in \fIovector\fR, the value passed as \fIstringcount\fR should
-be the size of the vector divided by three.
-
-The functions \fBpcre_copy_substring()\fR and \fBpcre_get_substring()\fR
-extract a single substring, whose number is given as \fIstringnumber\fR. A
-value of zero extracts the substring that matched the entire pattern, while
-higher values extract the captured substrings. For \fBpcre_copy_substring()\fR,
-the string is placed in \fIbuffer\fR, whose length is given by
-\fIbuffersize\fR, while for \fBpcre_get_substring()\fR a new block of memory is
-obtained via \fBpcre_malloc\fR, and its address is returned via
-\fIstringptr\fR. The yield of the function is the length of the string, not
-including the terminating zero, or one of
-
- PCRE_ERROR_NOMEMORY (-6)
-
-The buffer was too small for \fBpcre_copy_substring()\fR, or the attempt to get
-memory failed for \fBpcre_get_substring()\fR.
-
- PCRE_ERROR_NOSUBSTRING (-7)
-
-There is no substring whose number is \fIstringnumber\fR.
-
-The \fBpcre_get_substring_list()\fR function extracts all available substrings
-and builds a list of pointers to them. All this is done in a single block of
-memory which is obtained via \fBpcre_malloc\fR. The address of the memory block
-is returned via \fIlistptr\fR, which is also the start of the list of string
-pointers. The end of the list is marked by a NULL pointer. The yield of the
-function is zero if all went well, or
-
- PCRE_ERROR_NOMEMORY (-6)
-
-if the attempt to get the memory block failed.
-
-When any of these functions encounter a substring that is unset, which can
-happen when capturing subpattern number \fIn+1\fR matches some part of the
-subject, but subpattern \fIn\fR has not been used at all, they return an empty
-string. This can be distinguished from a genuine zero-length substring by
-inspecting the appropriate offset in \fIovector\fR, which is negative for unset
-substrings.
-
-The two convenience functions \fBpcre_free_substring()\fR and
-\fBpcre_free_substring_list()\fR can be used to free the memory returned by
-a previous call of \fBpcre_get_substring()\fR or
-\fBpcre_get_substring_list()\fR, respectively. They do nothing more than call
-the function pointed to by \fBpcre_free\fR, which of course could be called
-directly from a C program. However, PCRE is used in some situations where it is
-linked via a special interface to another programming language which cannot use
-\fBpcre_free\fR directly; it is for these cases that the functions are
-provided.
-
-
-.SH LIMITATIONS
-There are some size limitations in PCRE but it is hoped that they will never in
-practice be relevant.
-The maximum length of a compiled pattern is 65539 (sic) bytes.
-All values in repeating quantifiers must be less than 65536.
-The maximum number of capturing subpatterns is 65535.
-There is no limit to the number of non-capturing subpatterns, but the maximum
-depth of nesting of all kinds of parenthesized subpattern, including capturing
-subpatterns, assertions, and other types of subpattern, is 200.
-
-The maximum length of a subject string is the largest positive number that an
-integer variable can hold. However, PCRE uses recursion to handle subpatterns
-and indefinite repetition. This means that the available stack space may limit
-the size of a subject string that can be processed by certain patterns.
-
-
-.SH DIFFERENCES FROM PERL
-The differences described here are with respect to Perl 5.005.
-
-1. By default, a whitespace character is any character that the C library
-function \fBisspace()\fR recognizes, though it is possible to compile PCRE with
-alternative character type tables. Normally \fBisspace()\fR matches space,
-formfeed, newline, carriage return, horizontal tab, and vertical tab. Perl 5
-no longer includes vertical tab in its set of whitespace characters. The \\v
-escape that was in the Perl documentation for a long time was never in fact
-recognized. However, the character itself was treated as whitespace at least
-up to 5.002. In 5.004 and 5.005 it does not match \\s.
-
-2. PCRE does not allow repeat quantifiers on lookahead assertions. Perl permits
-them, but they do not mean what you might think. For example, (?!a){3} does
-not assert that the next three characters are not "a". It just asserts that the
-next character is not "a" three times.
-
-3. Capturing subpatterns that occur inside negative lookahead assertions are
-counted, but their entries in the offsets vector are never set. Perl sets its
-numerical variables from any such patterns that are matched before the
-assertion fails to match something (thereby succeeding), but only if the
-negative lookahead assertion contains just one branch.
-
-4. Though binary zero characters are supported in the subject string, they are
-not allowed in a pattern string because it is passed as a normal C string,
-terminated by zero. The escape sequence "\\0" can be used in the pattern to
-represent a binary zero.
-
-5. The following Perl escape sequences are not supported: \\l, \\u, \\L, \\U,
-\\E, \\Q. In fact these are implemented by Perl's general string-handling and
-are not part of its pattern matching engine.
-
-6. The Perl \\G assertion is not supported as it is not relevant to single
-pattern matches.
-
-7. Fairly obviously, PCRE does not support the (?{code}) and (?p{code})
-constructions. However, there is some experimental support for recursive
-patterns using the non-Perl item (?R).
-
-8. There are at the time of writing some oddities in Perl 5.005_02 concerned
-with the settings of captured strings when part of a pattern is repeated. For
-example, matching "aba" against the pattern /^(a(b)?)+$/ sets $2 to the value
-"b", but matching "aabbaa" against /^(aa(bb)?)+$/ leaves $2 unset. However, if
-the pattern is changed to /^(aa(b(b))?)+$/ then $2 (and $3) are set.
-
-In Perl 5.004 $2 is set in both cases, and that is also true of PCRE. If in the
-future Perl changes to a consistent state that is different, PCRE may change to
-follow.
-
-9. Another as yet unresolved discrepancy is that in Perl 5.005_02 the pattern
-/^(a)?(?(1)a|b)+$/ matches the string "a", whereas in PCRE it does not.
-However, in both Perl and PCRE /^(a)?a/ matched against "a" leaves $1 unset.
-
-10. PCRE provides some extensions to the Perl regular expression facilities:
-
-(a) Although lookbehind assertions must match fixed length strings, each
-alternative branch of a lookbehind assertion can match a different length of
-string. Perl 5.005 requires them all to have the same length.
-
-(b) If PCRE_DOLLAR_ENDONLY is set and PCRE_MULTILINE is not set, the $ meta-
-character matches only at the very end of the string.
-
-(c) If PCRE_EXTRA is set, a backslash followed by a letter with no special
-meaning is faulted.
-
-(d) If PCRE_UNGREEDY is set, the greediness of the repetition quantifiers is
-inverted, that is, by default they are not greedy, but if followed by a
-question mark they are.
-
-(e) PCRE_ANCHORED can be used to force a pattern to be tried only at the start
-of the subject.
-
-(f) The PCRE_NOTBOL, PCRE_NOTEOL, and PCRE_NOTEMPTY options for
-\fBpcre_exec()\fR have no Perl equivalents.
-
-(g) The (?R) construct allows for recursive pattern matching (Perl 5.6 can do
-this using the (?p{code}) construct, which PCRE cannot of course support.)
-
-
-.SH REGULAR EXPRESSION DETAILS
-The syntax and semantics of the regular expressions supported by PCRE are
-described below. Regular expressions are also described in the Perl
-documentation and in a number of other books, some of which have copious
-examples. Jeffrey Friedl's "Mastering Regular Expressions", published by
-O'Reilly (ISBN 1-56592-257-3), covers them in great detail.
-
-The description here is intended as reference documentation. The basic
-operation of PCRE is on strings of bytes. However, there are the beginnings of
-some support for UTF-8 character strings. To use this support you must
-configure PCRE to include it, and then call \fBpcre_compile()\fR with the
-PCRE_UTF8 option. How this affects the pattern matching is described in the
-final section of this document.
-
-A regular expression is a pattern that is matched against a subject string from
-left to right. Most characters stand for themselves in a pattern, and match the
-corresponding characters in the subject. As a trivial example, the pattern
-
- The quick brown fox
-
-matches a portion of a subject string that is identical to itself. The power of
-regular expressions comes from the ability to include alternatives and
-repetitions in the pattern. These are encoded in the pattern by the use of
-\fImeta-characters\fR, which do not stand for themselves but instead are
-interpreted in some special way.
-
-There are two different sets of meta-characters: those that are recognized
-anywhere in the pattern except within square brackets, and those that are
-recognized in square brackets. Outside square brackets, the meta-characters are
-as follows:
-
- \\ general escape character with several uses
- ^ assert start of subject (or line, in multiline mode)
- $ assert end of subject (or line, in multiline mode)
- . match any character except newline (by default)
- [ start character class definition
- | start of alternative branch
- ( start subpattern
- ) end subpattern
- ? extends the meaning of (
- also 0 or 1 quantifier
- also quantifier minimizer
- * 0 or more quantifier
- + 1 or more quantifier
- { start min/max quantifier
-
-Part of a pattern that is in square brackets is called a "character class". In
-a character class the only meta-characters are:
-
- \\ general escape character
- ^ negate the class, but only if the first character
- - indicates character range
- ] terminates the character class
-
-The following sections describe the use of each of the meta-characters.
-
-
-.SH BACKSLASH
-The backslash character has several uses. Firstly, if it is followed by a
-non-alphameric character, it takes away any special meaning that character may
-have. This use of backslash as an escape character applies both inside and
-outside character classes.
-
-For example, if you want to match a "*" character, you write "\\*" in the
-pattern. This applies whether or not the following character would otherwise be
-interpreted as a meta-character, so it is always safe to precede a
-non-alphameric with "\\" to specify that it stands for itself. In particular,
-if you want to match a backslash, you write "\\\\".
-
-If a pattern is compiled with the PCRE_EXTENDED option, whitespace in the
-pattern (other than in a character class) and characters between a "#" outside
-a character class and the next newline character are ignored. An escaping
-backslash can be used to include a whitespace or "#" character as part of the
-pattern.
-
-A second use of backslash provides a way of encoding non-printing characters
-in patterns in a visible manner. There is no restriction on the appearance of
-non-printing characters, apart from the binary zero that terminates a pattern,
-but when a pattern is being prepared by text editing, it is usually easier to
-use one of the following escape sequences than the binary character it
-represents:
-
- \\a alarm, that is, the BEL character (hex 07)
- \\cx "control-x", where x is any character
- \\e escape (hex 1B)
- \\f formfeed (hex 0C)
- \\n newline (hex 0A)
- \\r carriage return (hex 0D)
- \\t tab (hex 09)
- \\xhh character with hex code hh
- \\ddd character with octal code ddd, or backreference
-
-The precise effect of "\\cx" is as follows: if "x" is a lower case letter, it
-is converted to upper case. Then bit 6 of the character (hex 40) is inverted.
-Thus "\\cz" becomes hex 1A, but "\\c{" becomes hex 3B, while "\\c;" becomes hex
-7B.
-
-After "\\x", up to two hexadecimal digits are read (letters can be in upper or
-lower case).
-
-After "\\0" up to two further octal digits are read. In both cases, if there
-are fewer than two digits, just those that are present are used. Thus the
-sequence "\\0\\x\\07" specifies two binary zeros followed by a BEL character.
-Make sure you supply two digits after the initial zero if the character that
-follows is itself an octal digit.
-
-The handling of a backslash followed by a digit other than 0 is complicated.
-Outside a character class, PCRE reads it and any following digits as a decimal
-number. If the number is less than 10, or if there have been at least that many
-previous capturing left parentheses in the expression, the entire sequence is
-taken as a \fIback reference\fR. A description of how this works is given
-later, following the discussion of parenthesized subpatterns.
-
-Inside a character class, or if the decimal number is greater than 9 and there
-have not been that many capturing subpatterns, PCRE re-reads up to three octal
-digits following the backslash, and generates a single byte from the least
-significant 8 bits of the value. Any subsequent digits stand for themselves.
-For example:
-
- \\040 is another way of writing a space
- \\40 is the same, provided there are fewer than 40
- previous capturing subpatterns
- \\7 is always a back reference
- \\11 might be a back reference, or another way of
- writing a tab
- \\011 is always a tab
- \\0113 is a tab followed by the character "3"
- \\113 is the character with octal code 113 (since there
- can be no more than 99 back references)
- \\377 is a byte consisting entirely of 1 bits
- \\81 is either a back reference, or a binary zero
- followed by the two characters "8" and "1"
-
-Note that octal values of 100 or greater must not be introduced by a leading
-zero, because no more than three octal digits are ever read.
-
-All the sequences that define a single byte value can be used both inside and
-outside character classes. In addition, inside a character class, the sequence
-"\\b" is interpreted as the backspace character (hex 08). Outside a character
-class it has a different meaning (see below).
-
-The third use of backslash is for specifying generic character types:
-
- \\d any decimal digit
- \\D any character that is not a decimal digit
- \\s any whitespace character
- \\S any character that is not a whitespace character
- \\w any "word" character
- \\W any "non-word" character
-
-Each pair of escape sequences partitions the complete set of characters into
-two disjoint sets. Any given character matches one, and only one, of each pair.
-
-A "word" character is any letter or digit or the underscore character, that is,
-any character which can be part of a Perl "word". The definition of letters and
-digits is controlled by PCRE's character tables, and may vary if locale-
-specific matching is taking place (see "Locale support" above). For example, in
-the "fr" (French) locale, some character codes greater than 128 are used for
-accented letters, and these are matched by \\w.
-
-These character type sequences can appear both inside and outside character
-classes. They each match one character of the appropriate type. If the current
-matching point is at the end of the subject string, all of them fail, since
-there is no character to match.
-
-The fourth use of backslash is for certain simple assertions. An assertion
-specifies a condition that has to be met at a particular point in a match,
-without consuming any characters from the subject string. The use of
-subpatterns for more complicated assertions is described below. The backslashed
-assertions are
-
- \\b word boundary
- \\B not a word boundary
- \\A start of subject (independent of multiline mode)
- \\Z end of subject or newline at end (independent of multiline mode)
- \\z end of subject (independent of multiline mode)
-
-These assertions may not appear in character classes (but note that "\\b" has a
-different meaning, namely the backspace character, inside a character class).
-
-A word boundary is a position in the subject string where the current character
-and the previous character do not both match \\w or \\W (i.e. one matches
-\\w and the other matches \\W), or the start or end of the string if the
-first or last character matches \\w, respectively.
-
-The \\A, \\Z, and \\z assertions differ from the traditional circumflex and
-dollar (described below) in that they only ever match at the very start and end
-of the subject string, whatever options are set. They are not affected by the
-PCRE_NOTBOL or PCRE_NOTEOL options. If the \fIstartoffset\fR argument of
-\fBpcre_exec()\fR is non-zero, \\A can never match. The difference between \\Z
-and \\z is that \\Z matches before a newline that is the last character of the
-string as well as at the end of the string, whereas \\z matches only at the
-end.
-
-
-.SH CIRCUMFLEX AND DOLLAR
-Outside a character class, in the default matching mode, the circumflex
-character is an assertion which is true only if the current matching point is
-at the start of the subject string. If the \fIstartoffset\fR argument of
-\fBpcre_exec()\fR is non-zero, circumflex can never match. Inside a character
-class, circumflex has an entirely different meaning (see below).
-
-Circumflex need not be the first character of the pattern if a number of
-alternatives are involved, but it should be the first thing in each alternative
-in which it appears if the pattern is ever to match that branch. If all
-possible alternatives start with a circumflex, that is, if the pattern is
-constrained to match only at the start of the subject, it is said to be an
-"anchored" pattern. (There are also other constructs that can cause a pattern
-to be anchored.)
-
-A dollar character is an assertion which is true only if the current matching
-point is at the end of the subject string, or immediately before a newline
-character that is the last character in the string (by default). Dollar need
-not be the last character of the pattern if a number of alternatives are
-involved, but it should be the last item in any branch in which it appears.
-Dollar has no special meaning in a character class.
-
-The meaning of dollar can be changed so that it matches only at the very end of
-the string, by setting the PCRE_DOLLAR_ENDONLY option at compile or matching
-time. This does not affect the \\Z assertion.
-
-The meanings of the circumflex and dollar characters are changed if the
-PCRE_MULTILINE option is set. When this is the case, they match immediately
-after and immediately before an internal "\\n" character, respectively, in
-addition to matching at the start and end of the subject string. For example,
-the pattern /^abc$/ matches the subject string "def\\nabc" in multiline mode,
-but not otherwise. Consequently, patterns that are anchored in single line mode
-because all branches start with "^" are not anchored in multiline mode, and a
-match for circumflex is possible when the \fIstartoffset\fR argument of
-\fBpcre_exec()\fR is non-zero. The PCRE_DOLLAR_ENDONLY option is ignored if
-PCRE_MULTILINE is set.
-
-Note that the sequences \\A, \\Z, and \\z can be used to match the start and
-end of the subject in both modes, and if all branches of a pattern start with
-\\A it is always anchored, whether PCRE_MULTILINE is set or not.
-
-
-.SH FULL STOP (PERIOD, DOT)
-Outside a character class, a dot in the pattern matches any one character in
-the subject, including a non-printing character, but not (by default) newline.
-If the PCRE_DOTALL option is set, dots match newlines as well. The handling of
-dot is entirely independent of the handling of circumflex and dollar, the only
-relationship being that they both involve newline characters. Dot has no
-special meaning in a character class.
-
-
-.SH SQUARE BRACKETS
-An opening square bracket introduces a character class, terminated by a closing
-square bracket. A closing square bracket on its own is not special. If a
-closing square bracket is required as a member of the class, it should be the
-first data character in the class (after an initial circumflex, if present) or
-escaped with a backslash.
-
-A character class matches a single character in the subject; the character must
-be in the set of characters defined by the class, unless the first character in
-the class is a circumflex, in which case the subject character must not be in
-the set defined by the class. If a circumflex is actually required as a member
-of the class, ensure it is not the first character, or escape it with a
-backslash.
-
-For example, the character class [aeiou] matches any lower case vowel, while
-[^aeiou] matches any character that is not a lower case vowel. Note that a
-circumflex is just a convenient notation for specifying the characters which
-are in the class by enumerating those that are not. It is not an assertion: it
-still consumes a character from the subject string, and fails if the current
-pointer is at the end of the string.
-
-When caseless matching is set, any letters in a class represent both their
-upper case and lower case versions, so for example, a caseless [aeiou] matches
-"A" as well as "a", and a caseless [^aeiou] does not match "A", whereas a
-caseful version would.
-
-The newline character is never treated in any special way in character classes,
-whatever the setting of the PCRE_DOTALL or PCRE_MULTILINE options is. A class
-such as [^a] will always match a newline.
-
-The minus (hyphen) character can be used to specify a range of characters in a
-character class. For example, [d-m] matches any letter between d and m,
-inclusive. If a minus character is required in a class, it must be escaped with
-a backslash or appear in a position where it cannot be interpreted as
-indicating a range, typically as the first or last character in the class.
-
-It is not possible to have the literal character "]" as the end character of a
-range. A pattern such as [W-]46] is interpreted as a class of two characters
-("W" and "-") followed by a literal string "46]", so it would match "W46]" or
-"-46]". However, if the "]" is escaped with a backslash it is interpreted as
-the end of range, so [W-\\]46] is interpreted as a single class containing a
-range followed by two separate characters. The octal or hexadecimal
-representation of "]" can also be used to end a range.
-
-Ranges operate in ASCII collating sequence. They can also be used for
-characters specified numerically, for example [\\000-\\037]. If a range that
-includes letters is used when caseless matching is set, it matches the letters
-in either case. For example, [W-c] is equivalent to [][\\^_`wxyzabc], matched
-caselessly, and if character tables for the "fr" locale are in use,
-[\\xc8-\\xcb] matches accented E characters in both cases.
-
-The character types \\d, \\D, \\s, \\S, \\w, and \\W may also appear in a
-character class, and add the characters that they match to the class. For
-example, [\\dABCDEF] matches any hexadecimal digit. A circumflex can
-conveniently be used with the upper case character types to specify a more
-restricted set of characters than the matching lower case type. For example,
-the class [^\\W_] matches any letter or digit, but not underscore.
-
-All non-alphameric characters other than \\, -, ^ (at the start) and the
-terminating ] are non-special in character classes, but it does no harm if they
-are escaped.
-
-
-.SH POSIX CHARACTER CLASSES
-Perl 5.6 (not yet released at the time of writing) is going to support the
-POSIX notation for character classes, which uses names enclosed by [: and :]
-within the enclosing square brackets. PCRE supports this notation. For example,
-
- [01[:alpha:]%]
-
-matches "0", "1", any alphabetic character, or "%". The supported class names
-are
-
- alnum letters and digits
- alpha letters
- ascii character codes 0 - 127
- cntrl control characters
- digit decimal digits (same as \\d)
- graph printing characters, excluding space
- lower lower case letters
- print printing characters, including space
- punct printing characters, excluding letters and digits
- space white space (same as \\s)
- upper upper case letters
- word "word" characters (same as \\w)
- xdigit hexadecimal digits
-
-.\" >>>>>>>>>>>>Only WORD is perl. BLANK is GNU.
-
-The names "ascii" and "word" are Perl extensions. Another Perl extension is
-negation, which is indicated by a ^ character after the colon. For example,
-
- [12[:^digit:]]
-
-matches "1", "2", or any non-digit. PCRE (and Perl) also recognize the POSIX
-syntax [.ch.] and [=ch=] where "ch" is a "collating element", but these are not
-supported, and an error is given if they are encountered.
-
-
-.SH VERTICAL BAR
-Vertical bar characters are used to separate alternative patterns. For example,
-the pattern
-
- gilbert|sullivan
-
-matches either "gilbert" or "sullivan". Any number of alternatives may appear,
-and an empty alternative is permitted (matching the empty string).
-The matching process tries each alternative in turn, from left to right,
-and the first one that succeeds is used. If the alternatives are within a
-subpattern (defined below), "succeeds" means matching the rest of the main
-pattern as well as the alternative in the subpattern.
-
-
-.SH INTERNAL OPTION SETTING
-The settings of PCRE_CASELESS, PCRE_MULTILINE, PCRE_DOTALL, and PCRE_EXTENDED
-can be changed from within the pattern by a sequence of Perl option letters
-enclosed between "(?" and ")". The option letters are
-
- i for PCRE_CASELESS
- m for PCRE_MULTILINE
- s for PCRE_DOTALL
- x for PCRE_EXTENDED
-
-For example, (?im) sets caseless, multiline matching. It is also possible to
-unset these options by preceding the letter with a hyphen, and a combined
-setting and unsetting such as (?im-sx), which sets PCRE_CASELESS and
-PCRE_MULTILINE while unsetting PCRE_DOTALL and PCRE_EXTENDED, is also
-permitted. If a letter appears both before and after the hyphen, the option is
-unset.
-
-The scope of these option changes depends on where in the pattern the setting
-occurs. For settings that are outside any subpattern (defined below), the
-effect is the same as if the options were set or unset at the start of
-matching. The following patterns all behave in exactly the same way:
-
- (?i)abc
- a(?i)bc
- ab(?i)c
- abc(?i)
-
-which in turn is the same as compiling the pattern abc with PCRE_CASELESS set.
-In other words, such "top level" settings apply to the whole pattern (unless
-there are other changes inside subpatterns). If there is more than one setting
-of the same option at top level, the rightmost setting is used.
-
-If an option change occurs inside a subpattern, the effect is different. This
-is a change of behaviour in Perl 5.005. An option change inside a subpattern
-affects only that part of the subpattern that follows it, so
-
- (a(?i)b)c
-
-matches abc and aBc and no other strings (assuming PCRE_CASELESS is not used).
-By this means, options can be made to have different settings in different
-parts of the pattern. Any changes made in one alternative do carry on
-into subsequent branches within the same subpattern. For example,
-
- (a(?i)b|c)
-
-matches "ab", "aB", "c", and "C", even though when matching "C" the first
-branch is abandoned before the option setting. This is because the effects of
-option settings happen at compile time. There would be some very weird
-behaviour otherwise.
-
-The PCRE-specific options PCRE_UNGREEDY and PCRE_EXTRA can be changed in the
-same way as the Perl-compatible options by using the characters U and X
-respectively. The (?X) flag setting is special in that it must always occur
-earlier in the pattern than any of the additional features it turns on, even
-when it is at top level. It is best put at the start.
-
-
-.SH SUBPATTERNS
-Subpatterns are delimited by parentheses (round brackets), which can be nested.
-Marking part of a pattern as a subpattern does two things:
-
-1. It localizes a set of alternatives. For example, the pattern
-
- cat(aract|erpillar|)
-
-matches one of the words "cat", "cataract", or "caterpillar". Without the
-parentheses, it would match "cataract", "erpillar" or the empty string.
-
-2. It sets up the subpattern as a capturing subpattern (as defined above).
-When the whole pattern matches, that portion of the subject string that matched
-the subpattern is passed back to the caller via the \fIovector\fR argument of
-\fBpcre_exec()\fR. Opening parentheses are counted from left to right (starting
-from 1) to obtain the numbers of the capturing subpatterns.
-
-For example, if the string "the red king" is matched against the pattern
-
- the ((red|white) (king|queen))
-
-the captured substrings are "red king", "red", and "king", and are numbered 1,
-2, and 3, respectively.
-
-The fact that plain parentheses fulfil two functions is not always helpful.
-There are often times when a grouping subpattern is required without a
-capturing requirement. If an opening parenthesis is followed by "?:", the
-subpattern does not do any capturing, and is not counted when computing the
-number of any subsequent capturing subpatterns. For example, if the string "the
-white queen" is matched against the pattern
-
- the ((?:red|white) (king|queen))
-
-the captured substrings are "white queen" and "queen", and are numbered 1 and
-2. The maximum number of captured substrings is 99, and the maximum number of
-all subpatterns, both capturing and non-capturing, is 200.
-
-As a convenient shorthand, if any option settings are required at the start of
-a non-capturing subpattern, the option letters may appear between the "?" and
-the ":". Thus the two patterns
-
- (?i:saturday|sunday)
- (?:(?i)saturday|sunday)
-
-match exactly the same set of strings. Because alternative branches are tried
-from left to right, and options are not reset until the end of the subpattern
-is reached, an option setting in one branch does affect subsequent branches, so
-the above patterns match "SUNDAY" as well as "Saturday".
-
-
-.SH REPETITION
-Repetition is specified by quantifiers, which can follow any of the following
-items:
-
- a single character, possibly escaped
- the . metacharacter
- a character class
- a back reference (see next section)
- a parenthesized subpattern (unless it is an assertion - see below)
-
-The general repetition quantifier specifies a minimum and maximum number of
-permitted matches, by giving the two numbers in curly brackets (braces),
-separated by a comma. The numbers must be less than 65536, and the first must
-be less than or equal to the second. For example:
-
- z{2,4}
-
-matches "zz", "zzz", or "zzzz". A closing brace on its own is not a special
-character. If the second number is omitted, but the comma is present, there is
-no upper limit; if the second number and the comma are both omitted, the
-quantifier specifies an exact number of required matches. Thus
-
- [aeiou]{3,}
-
-matches at least 3 successive vowels, but may match many more, while
-
- \\d{8}
-
-matches exactly 8 digits. An opening curly bracket that appears in a position
-where a quantifier is not allowed, or one that does not match the syntax of a
-quantifier, is taken as a literal character. For example, {,6} is not a
-quantifier, but a literal string of four characters.
-
-The quantifier {0} is permitted, causing the expression to behave as if the
-previous item and the quantifier were not present.
-
-For convenience (and historical compatibility) the three most common
-quantifiers have single-character abbreviations:
-
- * is equivalent to {0,}
- + is equivalent to {1,}
- ? is equivalent to {0,1}
-
-It is possible to construct infinite loops by following a subpattern that can
-match no characters with a quantifier that has no upper limit, for example:
-
- (a?)*
-
-Earlier versions of Perl and PCRE used to give an error at compile time for
-such patterns. However, because there are cases where this can be useful, such
-patterns are now accepted, but if any repetition of the subpattern does in fact
-match no characters, the loop is forcibly broken.
-
-By default, the quantifiers are "greedy", that is, they match as much as
-possible (up to the maximum number of permitted times), without causing the
-rest of the pattern to fail. The classic example of where this gives problems
-is in trying to match comments in C programs. These appear between the
-sequences /* and */ and within the sequence, individual * and / characters may
-appear. An attempt to match C comments by applying the pattern
-
- /\\*.*\\*/
-
-to the string
-
- /* first comment */ not comment /* second comment */
-
-fails, because it matches the entire string owing to the greediness of the .*
-item.
-
-However, if a quantifier is followed by a question mark, it ceases to be
-greedy, and instead matches the minimum number of times possible, so the
-pattern
-
- /\\*.*?\\*/
-
-does the right thing with the C comments. The meaning of the various
-quantifiers is not otherwise changed, just the preferred number of matches.
-Do not confuse this use of question mark with its use as a quantifier in its
-own right. Because it has two uses, it can sometimes appear doubled, as in
-
- \\d??\\d
-
-which matches one digit by preference, but can match two if that is the only
-way the rest of the pattern matches.
-
-If the PCRE_UNGREEDY option is set (an option which is not available in Perl),
-the quantifiers are not greedy by default, but individual ones can be made
-greedy by following them with a question mark. In other words, it inverts the
-default behaviour.
-
-When a parenthesized subpattern is quantified with a minimum repeat count that
-is greater than 1 or with a limited maximum, more store is required for the
-compiled pattern, in proportion to the size of the minimum or maximum.
-
-If a pattern starts with .* or .{0,} and the PCRE_DOTALL option (equivalent
-to Perl's /s) is set, thus allowing the . to match newlines, the pattern is
-implicitly anchored, because whatever follows will be tried against every
-character position in the subject string, so there is no point in retrying the
-overall match at any position after the first. PCRE treats such a pattern as
-though it were preceded by \\A. In cases where it is known that the subject
-string contains no newlines, it is worth setting PCRE_DOTALL when the pattern
-begins with .* in order to obtain this optimization, or alternatively using ^
-to indicate anchoring explicitly.
-
-When a capturing subpattern is repeated, the value captured is the substring
-that matched the final iteration. For example, after
-
- (tweedle[dume]{3}\\s*)+
-
-has matched "tweedledum tweedledee" the value of the captured substring is
-"tweedledee". However, if there are nested capturing subpatterns, the
-corresponding captured values may have been set in previous iterations. For
-example, after
-
- /(a|(b))+/
-
-matches "aba" the value of the second captured substring is "b".
-
-
-.SH BACK REFERENCES
-Outside a character class, a backslash followed by a digit greater than 0 (and
-possibly further digits) is a back reference to a capturing subpattern earlier
-(i.e. to its left) in the pattern, provided there have been that many previous
-capturing left parentheses.
-
-However, if the decimal number following the backslash is less than 10, it is
-always taken as a back reference, and causes an error only if there are not
-that many capturing left parentheses in the entire pattern. In other words, the
-parentheses that are referenced need not be to the left of the reference for
-numbers less than 10. See the section entitled "Backslash" above for further
-details of the handling of digits following a backslash.
-
-A back reference matches whatever actually matched the capturing subpattern in
-the current subject string, rather than anything matching the subpattern
-itself. So the pattern
-
- (sens|respons)e and \\1ibility
-
-matches "sense and sensibility" and "response and responsibility", but not
-"sense and responsibility". If caseful matching is in force at the time of the
-back reference, the case of letters is relevant. For example,
-
- ((?i)rah)\\s+\\1
-
-matches "rah rah" and "RAH RAH", but not "RAH rah", even though the original
-capturing subpattern is matched caselessly.
-
-There may be more than one back reference to the same subpattern. If a
-subpattern has not actually been used in a particular match, any back
-references to it always fail. For example, the pattern
-
- (a|(bc))\\2
-
-always fails if it starts to match "a" rather than "bc". Because there may be
-up to 99 back references, all digits following the backslash are taken
-as part of a potential back reference number. If the pattern continues with a
-digit character, some delimiter must be used to terminate the back reference.
-If the PCRE_EXTENDED option is set, this can be whitespace. Otherwise an empty
-comment can be used.
-
-A back reference that occurs inside the parentheses to which it refers fails
-when the subpattern is first used, so, for example, (a\\1) never matches.
-However, such references can be useful inside repeated subpatterns. For
-example, the pattern
-
- (a|b\\1)+
-
-matches any number of "a"s and also "aba", "ababbaa" etc. At each iteration of
-the subpattern, the back reference matches the character string corresponding
-to the previous iteration. In order for this to work, the pattern must be such
-that the first iteration does not need to match the back reference. This can be
-done using alternation, as in the example above, or by a quantifier with a
-minimum of zero.
-
-
-.SH ASSERTIONS
-An assertion is a test on the characters following or preceding the current
-matching point that does not actually consume any characters. The simple
-assertions coded as \\b, \\B, \\A, \\Z, \\z, ^ and $ are described above. More
-complicated assertions are coded as subpatterns. There are two kinds: those
-that look ahead of the current position in the subject string, and those that
-look behind it.
-
-An assertion subpattern is matched in the normal way, except that it does not
-cause the current matching position to be changed. Lookahead assertions start
-with (?= for positive assertions and (?! for negative assertions. For example,
-
- \\w+(?=;)
-
-matches a word followed by a semicolon, but does not include the semicolon in
-the match, and
-
- foo(?!bar)
-
-matches any occurrence of "foo" that is not followed by "bar". Note that the
-apparently similar pattern
-
- (?!foo)bar
-
-does not find an occurrence of "bar" that is preceded by something other than
-"foo"; it finds any occurrence of "bar" whatsoever, because the assertion
-(?!foo) is always true when the next three characters are "bar". A
-lookbehind assertion is needed to achieve this effect.
-
-Lookbehind assertions start with (?<= for positive assertions and (?<! for
-negative assertions. For example,
-
- (?<!foo)bar
-
-does find an occurrence of "bar" that is not preceded by "foo". The contents of
-a lookbehind assertion are restricted such that all the strings it matches must
-have a fixed length. However, if there are several alternatives, they do not
-all have to have the same fixed length. Thus
-
- (?<=bullock|donkey)
-
-is permitted, but
-
- (?<!dogs?|cats?)
-
-causes an error at compile time. Branches that match different length strings
-are permitted only at the top level of a lookbehind assertion. This is an
-extension compared with Perl 5.005, which requires all branches to match the
-same length of string. An assertion such as
-
- (?<=ab(c|de))
-
-is not permitted, because its single top-level branch can match two different
-lengths, but it is acceptable if rewritten to use two top-level branches:
-
- (?<=abc|abde)
-
-The implementation of lookbehind assertions is, for each alternative, to
-temporarily move the current position back by the fixed width and then try to
-match. If there are insufficient characters before the current position, the
-match is deemed to fail. Lookbehinds in conjunction with once-only subpatterns
-can be particularly useful for matching at the ends of strings; an example is
-given at the end of the section on once-only subpatterns.
-
-Several assertions (of any sort) may occur in succession. For example,
-
- (?<=\\d{3})(?<!999)foo
-
-matches "foo" preceded by three digits that are not "999". Notice that each of
-the assertions is applied independently at the same point in the subject
-string. First there is a check that the previous three characters are all
-digits, and then there is a check that the same three characters are not "999".
-This pattern does \fInot\fR match "foo" preceded by six characters, the first
-three of which are digits and the last three of which are not "999". For example, it
-doesn't match "123abcfoo". A pattern to do that is
-
- (?<=\\d{3}...)(?<!999)foo
-
-This time the first assertion looks at the preceding six characters, checking
-that the first three are digits, and then the second assertion checks that the
-preceding three characters are not "999".
-
-Assertions can be nested in any combination. For example,
-
- (?<=(?<!foo)bar)baz
-
-matches an occurrence of "baz" that is preceded by "bar" which in turn is not
-preceded by "foo", while
-
- (?<=\\d{3}(?!999)...)foo
-
-is another pattern which matches "foo" preceded by three digits and any three
-characters that are not "999".
-
-Assertion subpatterns are not capturing subpatterns, and may not be repeated,
-because it makes no sense to assert the same thing several times. If any kind
-of assertion contains capturing subpatterns within it, these are counted for
-the purposes of numbering the capturing subpatterns in the whole pattern.
-However, substring capturing is carried out only for positive assertions,
-because it does not make sense for negative assertions.
-
-Assertions count towards the maximum of 200 parenthesized subpatterns.
-
-
-.SH ONCE-ONLY SUBPATTERNS
-With both maximizing and minimizing repetition, failure of what follows
-normally causes the repeated item to be re-evaluated to see if a different
-number of repeats allows the rest of the pattern to match. Sometimes it is
-useful to prevent this, either to change the nature of the match, or to cause
-it to fail earlier than it otherwise might, when the author of the pattern knows
-there is no point in carrying on.
-
-Consider, for example, the pattern \\d+foo when applied to the subject line
-
- 123456bar
-
-After matching all 6 digits and then failing to match "foo", the normal
-action of the matcher is to try again with only 5 digits matching the \\d+
-item, and then with 4, and so on, before ultimately failing. Once-only
-subpatterns provide the means for specifying that once a portion of the pattern
-has matched, it is not to be re-evaluated in this way, so the matcher would
-give up immediately on failing to match "foo" the first time. The notation is
-another kind of special parenthesis, starting with (?> as in this example:
-
- (?>\\d+)foo
-
-This kind of parenthesis "locks up" the part of the pattern it contains once
-it has matched, and a failure further into the pattern is prevented from
-backtracking into it. Backtracking past it to previous items, however, works as
-normal.
-
-An alternative description is that a subpattern of this type matches the string
-of characters that an identical standalone pattern would match, if anchored at
-the current point in the subject string.
-
-Once-only subpatterns are not capturing subpatterns. Simple cases such as the
-above example can be thought of as a maximizing repeat that must swallow
-everything it can. So, while both \\d+ and \\d+? are prepared to adjust the
-number of digits they match in order to make the rest of the pattern match,
-(?>\\d+) can only match an entire sequence of digits.
-
-This construction can of course contain arbitrarily complicated subpatterns,
-and it can be nested.
-
-Once-only subpatterns can be used in conjunction with lookbehind assertions to
-specify efficient matching at the end of the subject string. Consider a simple
-pattern such as
-
- abcd$
-
-when applied to a long string which does not match. Because matching proceeds
-from left to right, PCRE will look for each "a" in the subject and then see if
-what follows matches the rest of the pattern. If the pattern is specified as
-
- ^.*abcd$
-
-the initial .* matches the entire string at first, but when this fails (because
-there is no following "a"), it backtracks to match all but the last character,
-then all but the last two characters, and so on. Once again the search for "a"
-covers the entire string, from right to left, so we are no better off. However,
-if the pattern is written as
-
- ^(?>.*)(?<=abcd)
-
-there can be no backtracking for the .* item; it can match only the entire
-string. The subsequent lookbehind assertion does a single test on the last four
-characters. If it fails, the match fails immediately. For long strings, this
-approach makes a significant difference to the processing time.
-
-When a pattern contains an unlimited repeat inside a subpattern that can itself
-be repeated an unlimited number of times, the use of a once-only subpattern is
-the only way to avoid some failing matches taking a very long time indeed.
-The pattern
-
- (\\D+|<\\d+>)*[!?]
-
-matches an unlimited number of substrings that either consist of non-digits, or
-digits enclosed in <>, followed by either ! or ?. When it matches, it runs
-quickly. However, if it is applied to
-
- aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-
-it takes a long time before reporting failure. This is because the string can
-be divided between the two repeats in a large number of ways, and all have to
-be tried. (The example used [!?] rather than a single character at the end,
-because both PCRE and Perl have an optimization that allows for fast failure
-when a single character is used. They remember the last single character that
-is required for a match, and fail early if it is not present in the string.)
-If the pattern is changed to
-
- ((?>\\D+)|<\\d+>)*[!?]
-
-sequences of non-digits cannot be broken, and failure happens quickly.
-
-
-.SH CONDITIONAL SUBPATTERNS
-It is possible to cause the matching process to obey a subpattern
-conditionally or to choose between two alternative subpatterns, depending on
-the result of an assertion, or whether a previous capturing subpattern matched
-or not. The two possible forms of conditional subpattern are
-
- (?(condition)yes-pattern)
- (?(condition)yes-pattern|no-pattern)
-
-If the condition is satisfied, the yes-pattern is used; otherwise the
-no-pattern (if present) is used. If there are more than two alternatives in the
-subpattern, a compile-time error occurs.
-
-There are two kinds of condition. If the text between the parentheses consists
-of a sequence of digits, the condition is satisfied if the capturing subpattern
-of that number has previously matched. The number must be greater than zero.
-Consider the following pattern, which contains non-significant white space to
-make it more readable (assume the PCRE_EXTENDED option) and to divide it into
-three parts for ease of discussion:
-
- ( \\( )? [^()]+ (?(1) \\) )
-
-The first part matches an optional opening parenthesis, and if that
-character is present, sets it as the first captured substring. The second part
-matches one or more characters that are not parentheses. The third part is a
-conditional subpattern that tests whether the first set of parentheses matched
-or not. If they did, that is, if the subject started with an opening parenthesis,
-the condition is true, and so the yes-pattern is executed and a closing
-parenthesis is required. Otherwise, since no-pattern is not present, the
-subpattern matches nothing. In other words, this pattern matches a sequence of
-non-parentheses, optionally enclosed in parentheses.
-
-If the condition is not a sequence of digits, it must be an assertion. This may
-be a positive or negative lookahead or lookbehind assertion. Consider this
-pattern, again containing non-significant white space, and with the two
-alternatives on the second line:
-
- (?(?=[^a-z]*[a-z])
- \\d{2}-[a-z]{3}-\\d{2} | \\d{2}-\\d{2}-\\d{2} )
-
-The condition is a positive lookahead assertion that matches an optional
-sequence of non-letters followed by a letter. In other words, it tests for the
-presence of at least one letter in the subject. If a letter is found, the
-subject is matched against the first alternative; otherwise it is matched
-against the second. This pattern matches strings in one of the two forms
-dd-aaa-dd or dd-dd-dd, where aaa are letters and dd are digits.
-
-
-.SH COMMENTS
-The sequence (?# marks the start of a comment which continues up to the next
-closing parenthesis. Nested parentheses are not permitted. The characters
-that make up a comment play no part in the pattern matching at all.
-
-If the PCRE_EXTENDED option is set, an unescaped # character outside a
-character class introduces a comment that continues up to the next newline
-character in the pattern.
-
-
-.SH RECURSIVE PATTERNS
-Consider the problem of matching a string in parentheses, allowing for
-unlimited nested parentheses. Without the use of recursion, the best that can
-be done is to use a pattern that matches up to some fixed depth of nesting. It
-is not possible to handle an arbitrary nesting depth. Perl 5.6 has provided an
-experimental facility that allows regular expressions to recurse (amongst other
-things). It does this by interpolating Perl code in the expression at run time,
-and the code can refer to the expression itself. A Perl pattern to solve the
-parentheses problem can be created like this:
-
- $re = qr{\\( (?: (?>[^()]+) | (?p{$re}) )* \\)}x;
-
-The (?p{...}) item interpolates Perl code at run time, and in this case refers
-recursively to the pattern in which it appears. Obviously, PCRE cannot support
-the interpolation of Perl code. Instead, the special item (?R) is provided for
-the specific case of recursion. This PCRE pattern solves the parentheses
-problem (assume the PCRE_EXTENDED option is set so that white space is
-ignored):
-
- \\( ( (?>[^()]+) | (?R) )* \\)
-
-First it matches an opening parenthesis. Then it matches any number of
-substrings which can either be a sequence of non-parentheses, or a recursive
-match of the pattern itself (i.e. a correctly parenthesized substring). Finally
-there is a closing parenthesis.
-
-This particular example pattern contains nested unlimited repeats, and so the
-use of a once-only subpattern for matching strings of non-parentheses is
-important when applying the pattern to strings that do not match. For example,
-when it is applied to
-
- (aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa()
-
-it yields "no match" quickly. However, if a once-only subpattern is not used,
-the match runs for a very long time indeed because there are so many different
-ways the + and * repeats can carve up the subject, and all have to be tested
-before failure can be reported.
-
-The values set for any capturing subpatterns are those from the outermost level
-of the recursion at which the subpattern value is set. If the pattern above is
-matched against
-
- (ab(cd)ef)
-
-the value for the capturing parentheses is "ef", which is the last value taken
-on at the top level. If additional parentheses are added, giving
-
- \\( ( ( (?>[^()]+) | (?R) )* ) \\)
- ^ ^
- ^ ^
-the string they capture is "ab(cd)ef", the contents of the top level
-parentheses. If there are more than 15 capturing parentheses in a pattern, PCRE
-has to obtain extra memory to store data during a recursion, which it does by
-using \fBpcre_malloc\fR, freeing it via \fBpcre_free\fR afterwards. If no
-memory can be obtained, it saves data for the first 15 capturing parentheses
-only, as there is no way to give an out-of-memory error from within a
-recursion.
-
-
-.SH PERFORMANCE
-Certain items that may appear in patterns are more efficient than others. It is
-more efficient to use a character class like [aeiou] than a set of alternatives
-such as (a|e|i|o|u). In general, the simplest construction that provides the
-required behaviour is usually the most efficient. Jeffrey Friedl's book
-contains a lot of discussion about optimizing regular expressions for efficient
-performance.
-
-When a pattern begins with .* and the PCRE_DOTALL option is set, the pattern is
-implicitly anchored by PCRE, since it can match only at the start of a subject
-string. However, if PCRE_DOTALL is not set, PCRE cannot make this optimization,
-because the . metacharacter does not then match a newline, and if the subject
-string contains newlines, the pattern may match from the character immediately
-following one of them instead of from the very start. For example, the pattern
-
- (.*) second
-
-matches the subject "first\\nand second" (where \\n stands for a newline
-character) with the first captured substring being "and". In order to do this,
-PCRE has to retry the match starting after every newline in the subject.
-
-If you are using such a pattern with subject strings that do not contain
-newlines, the best performance is obtained by setting PCRE_DOTALL, or starting
-the pattern with ^.* to indicate explicit anchoring. That saves PCRE from
-having to scan along the subject looking for a newline to restart at.
-
-Beware of patterns that contain nested indefinite repeats. These can take a
-long time to run when applied to a string that does not match. Consider the
-pattern fragment
-
- (a+)*
-
-This can match "aaaa" in 33 different ways, and this number increases very
-rapidly as the string gets longer. (The * repeat can match 0, 1, 2, 3, or 4
-times, and for each of those cases other than 0, the + repeats can match
-different numbers of times.) When the remainder of the pattern is such that the
-entire match is going to fail, PCRE has in principle to try every possible
-variation, and this can take an extremely long time.
-
-An optimization catches some of the more simple cases such as
-
- (a+)*b
-
-where a literal character follows. Before embarking on the standard matching
-procedure, PCRE checks that there is a "b" later in the subject string, and if
-there is not, it fails the match immediately. However, when there is no
-following literal this optimization cannot be used. You can see the difference
-by comparing the behaviour of
-
- (a+)*\\d
-
-with the pattern above. The pattern with the literal "b" gives a failure almost
-instantly when applied to a whole line of "a" characters, whereas the pattern
-with \\d takes an appreciable time with strings longer than about 20 characters.
-
-
-.SH UTF-8 SUPPORT
-Starting at release 3.3, PCRE has some support for character strings encoded
-in the UTF-8 format. This is incomplete, and is regarded as experimental. In
-order to use it, you must configure PCRE to include UTF-8 support in the code,
-and, in addition, you must call \fBpcre_compile()\fR with the PCRE_UTF8 option
-flag. When you do this, both the pattern and any subject strings that are
-matched against it are treated as UTF-8 strings instead of just strings of
-bytes, but only in the cases that are mentioned below.
-
-If you compile PCRE with UTF-8 support, but do not use it at run time, the
-library will be a bit bigger, but the additional run time overhead is limited
-to testing the PCRE_UTF8 flag in several places, so should not be very large.
-
-PCRE assumes that the strings it is given contain valid UTF-8 codes. It does
-not diagnose invalid UTF-8 strings. If you pass invalid UTF-8 strings to PCRE,
-the results are undefined.
-
-Running with PCRE_UTF8 set causes these changes in the way PCRE works:
-
-1. In a pattern, the escape sequence \\x{...}, where the contents of the braces
-is a string of hexadecimal digits, is interpreted as a UTF-8 character whose
-code number is the given hexadecimal number, for example: \\x{1234}. This
-inserts from one to six literal bytes into the pattern, using the UTF-8
-encoding. If a non-hexadecimal digit appears between the braces, the item is
-not recognized.
-
-2. The original hexadecimal escape sequence, \\xhh, generates a two-byte UTF-8
-character if its value is greater than 127.
-
-3. Repeat quantifiers are NOT correctly handled if they follow a multibyte
-character. For example, \\x{100}* and \\xc3+ do not work. If you want to
-repeat such characters, you must enclose them in non-capturing parentheses,
-for example (?:\\x{100}), at present.
-
-4. The dot metacharacter matches one UTF-8 character instead of a single byte.
-
-5. Unlike literal UTF-8 characters, the dot metacharacter followed by a
-repeat quantifier does operate correctly on UTF-8 characters instead of
-single bytes.
-
-6. Although the \\x{...} escape is permitted in a character class, characters
-whose values are greater than 255 cannot be included in a class.
-
-7. A class is matched against a UTF-8 character instead of just a single byte,
-but it can match only characters whose values are less than 256. Characters
-with greater values always fail to match a class.
-
-8. Repeated classes work correctly on multiple characters.
-
-9. Classes containing just a single character whose value is greater than 127
-(but less than 256), for example, [\\x80] or [^\\x{93}], do not work because
-these are optimized into single byte matches. In the first case, of course,
-the class brackets are just redundant.
-
-10. Lookbehind assertions move backwards in the subject by a fixed number of
-characters instead of a fixed number of bytes. Simple cases have been tested
-to work correctly, but there may be hidden gotchas herein.
-
-11. The character types such as \\d and \\w do not work correctly with UTF-8
-characters. They continue to test a single byte.
-
-12. Anything not explicitly mentioned here continues to work in bytes rather
-than in characters.
-
-The following UTF-8 features of Perl 5.6 are not implemented:
-
-1. The escape sequence \\C to match a single byte.
-
-2. The use of Unicode tables and properties and escapes \\p, \\P, and \\X.
-
-
-.SH SAMPLE PROGRAM
-The code below is a simple, complete demonstration program, to get you started
-with using PCRE. This code is also supplied in the file \fIpcredemo.c\fR in the
-PCRE distribution.
-
-The program compiles the regular expression that is its first argument, and
-matches it against the subject string in its second argument. No options are
-set, and default character tables are used. If matching succeeds, the program
-outputs the portion of the subject that matched, together with the contents of
-any captured substrings.
-
-On a Unix system that has PCRE installed in \fI/usr/local\fR, you can compile
-the demonstration program using a command like this:
-
- gcc -o pcredemo pcredemo.c -I/usr/local/include -L/usr/local/lib -lpcre
-
-Then you can run simple tests like this:
-
- ./pcredemo 'cat|dog' 'the cat sat on the mat'
-
-Note that there is a much more comprehensive test program, called
-\fBpcretest\fR, which supports many more facilities for testing regular
-expressions. The \fBpcredemo\fR program is provided as a simple coding example.
-
-On some operating systems (e.g. Solaris) you may get an error like this when
-you try to run \fBpcredemo\fR:
-
- ld.so.1: a.out: fatal: libpcre.so.0: open failed: No such file or directory
-
-This is caused by the way shared library support works on those systems. You
-need to add
-
- -R/usr/local/lib
-
-to the compile command to get round this problem. Here's the code:
-
- #include <stdio.h>
- #include <string.h>
- #include <pcre.h>
-
- #define OVECCOUNT 30 /* should be a multiple of 3 */
-
- int main(int argc, char **argv)
- {
- pcre *re;
- const char *error;
- int erroffset;
- int ovector[OVECCOUNT];
- int rc, i;
-
- if (argc != 3)
- {
- printf("Two arguments required: a regex and a "
- "subject string\\n");
- return 1;
- }
-
- /* Compile the regular expression in the first argument */
-
- re = pcre_compile(
- argv[1], /* the pattern */
- 0, /* default options */
- &error, /* for error message */
- &erroffset, /* for error offset */
- NULL); /* use default character tables */
-
- /* Compilation failed: print the error message and exit */
-
- if (re == NULL)
- {
- printf("PCRE compilation failed at offset %d: %s\\n",
- erroffset, error);
- return 1;
- }
-
- /* Compilation succeeded: match the subject in the second
- argument */
-
- rc = pcre_exec(
- re, /* the compiled pattern */
- NULL, /* we didn't study the pattern */
- argv[2], /* the subject string */
- (int)strlen(argv[2]), /* the length of the subject */
- 0, /* start at offset 0 in the subject */
- 0, /* default options */
- ovector, /* vector for substring information */
- OVECCOUNT); /* number of elements in the vector */
-
- /* Matching failed: handle error cases */
-
- if (rc < 0)
- {
- switch(rc)
- {
- case PCRE_ERROR_NOMATCH: printf("No match\\n"); break;
- /*
- Handle other special cases if you like
- */
- default: printf("Matching error %d\\n", rc); break;
- }
- return 1;
- }
-
- /* Match succeeded */
-
- printf("Match succeeded\\n");
-
- /* The output vector wasn't big enough */
-
- if (rc == 0)
- {
- rc = OVECCOUNT/3;
- printf("ovector only has room for %d captured "
- "substrings\\n", rc - 1);
- }
-
- /* Show substrings stored in the output vector */
-
- for (i = 0; i < rc; i++)
- {
- char *substring_start = argv[2] + ovector[2*i];
- int substring_length = ovector[2*i+1] - ovector[2*i];
- printf("%2d: %.*s\\n", i, substring_length,
- substring_start);
- }
-
- return 0;
- }
-
-
-.SH AUTHOR
-Philip Hazel <ph10@cam.ac.uk>
-.br
-University Computing Service,
-.br
-New Museums Site,
-.br
-Cambridge CB2 3QG, England.
-.br
-Phone: +44 1223 334714
-
-Last updated: 15 August 2001
-.br
-Copyright (c) 1997-2001 University of Cambridge.
diff --git a/ext/pcre/pcrelib/doc/pcre.html b/ext/pcre/pcrelib/doc/pcre.html
deleted file mode 100644
index d3d7b738d2..0000000000
--- a/ext/pcre/pcrelib/doc/pcre.html
+++ /dev/null
@@ -1,2672 +0,0 @@
-<HTML>
-<HEAD>
-<TITLE>pcre specification</TITLE>
-</HEAD>
-<body bgcolor="#FFFFFF" text="#00005A">
-<H1>pcre specification</H1>
-This HTML document has been generated automatically from the original man page.
-If there is any nonsense in it, please consult the man page in case the
-conversion went wrong.
-<UL>
-<LI><A NAME="TOC1" HREF="#SEC1">NAME</A>
-<LI><A NAME="TOC2" HREF="#SEC2">SYNOPSIS</A>
-<LI><A NAME="TOC3" HREF="#SEC3">DESCRIPTION</A>
-<LI><A NAME="TOC4" HREF="#SEC4">MULTI-THREADING</A>
-<LI><A NAME="TOC5" HREF="#SEC5">COMPILING A PATTERN</A>
-<LI><A NAME="TOC6" HREF="#SEC6">STUDYING A PATTERN</A>
-<LI><A NAME="TOC7" HREF="#SEC7">LOCALE SUPPORT</A>
-<LI><A NAME="TOC8" HREF="#SEC8">INFORMATION ABOUT A PATTERN</A>
-<LI><A NAME="TOC9" HREF="#SEC9">MATCHING A PATTERN</A>
-<LI><A NAME="TOC10" HREF="#SEC10">EXTRACTING CAPTURED SUBSTRINGS</A>
-<LI><A NAME="TOC11" HREF="#SEC11">LIMITATIONS</A>
-<LI><A NAME="TOC12" HREF="#SEC12">DIFFERENCES FROM PERL</A>
-<LI><A NAME="TOC13" HREF="#SEC13">REGULAR EXPRESSION DETAILS</A>
-<LI><A NAME="TOC14" HREF="#SEC14">BACKSLASH</A>
-<LI><A NAME="TOC15" HREF="#SEC15">CIRCUMFLEX AND DOLLAR</A>
-<LI><A NAME="TOC16" HREF="#SEC16">FULL STOP (PERIOD, DOT)</A>
-<LI><A NAME="TOC17" HREF="#SEC17">SQUARE BRACKETS</A>
-<LI><A NAME="TOC18" HREF="#SEC18">POSIX CHARACTER CLASSES</A>
-<LI><A NAME="TOC19" HREF="#SEC19">VERTICAL BAR</A>
-<LI><A NAME="TOC20" HREF="#SEC20">INTERNAL OPTION SETTING</A>
-<LI><A NAME="TOC21" HREF="#SEC21">SUBPATTERNS</A>
-<LI><A NAME="TOC22" HREF="#SEC22">REPETITION</A>
-<LI><A NAME="TOC23" HREF="#SEC23">BACK REFERENCES</A>
-<LI><A NAME="TOC24" HREF="#SEC24">ASSERTIONS</A>
-<LI><A NAME="TOC25" HREF="#SEC25">ONCE-ONLY SUBPATTERNS</A>
-<LI><A NAME="TOC26" HREF="#SEC26">CONDITIONAL SUBPATTERNS</A>
-<LI><A NAME="TOC27" HREF="#SEC27">COMMENTS</A>
-<LI><A NAME="TOC28" HREF="#SEC28">RECURSIVE PATTERNS</A>
-<LI><A NAME="TOC29" HREF="#SEC29">PERFORMANCE</A>
-<LI><A NAME="TOC30" HREF="#SEC30">UTF-8 SUPPORT</A>
-<LI><A NAME="TOC31" HREF="#SEC31">SAMPLE PROGRAM</A>
-<LI><A NAME="TOC32" HREF="#SEC32">AUTHOR</A>
-</UL>
-<LI><A NAME="SEC1" HREF="#TOC1">NAME</A>
-<P>
-pcre - Perl-compatible regular expressions.
-</P>
-<LI><A NAME="SEC2" HREF="#TOC1">SYNOPSIS</A>
-<P>
-<B>#include &#60;pcre.h&#62;</B>
-</P>
-<P>
-<B>pcre *pcre_compile(const char *<I>pattern</I>, int <I>options</I>,</B>
-<B>const char **<I>errptr</I>, int *<I>erroffset</I>,</B>
-<B>const unsigned char *<I>tableptr</I>);</B>
-</P>
-<P>
-<B>pcre_extra *pcre_study(const pcre *<I>code</I>, int <I>options</I>,</B>
-<B>const char **<I>errptr</I>);</B>
-</P>
-<P>
-<B>int pcre_exec(const pcre *<I>code</I>, const pcre_extra *<I>extra</I>,</B>
-<B>const char *<I>subject</I>, int <I>length</I>, int <I>startoffset</I>,</B>
-<B>int <I>options</I>, int *<I>ovector</I>, int <I>ovecsize</I>);</B>
-</P>
-<P>
-<B>int pcre_copy_substring(const char *<I>subject</I>, int *<I>ovector</I>,</B>
-<B>int <I>stringcount</I>, int <I>stringnumber</I>, char *<I>buffer</I>,</B>
-<B>int <I>buffersize</I>);</B>
-</P>
-<P>
-<B>int pcre_get_substring(const char *<I>subject</I>, int *<I>ovector</I>,</B>
-<B>int <I>stringcount</I>, int <I>stringnumber</I>,</B>
-<B>const char **<I>stringptr</I>);</B>
-</P>
-<P>
-<B>int pcre_get_substring_list(const char *<I>subject</I>,</B>
-<B>int *<I>ovector</I>, int <I>stringcount</I>, const char ***<I>listptr</I>);</B>
-</P>
-<P>
-<B>void pcre_free_substring(const char *<I>stringptr</I>);</B>
-</P>
-<P>
-<B>void pcre_free_substring_list(const char **<I>stringptr</I>);</B>
-</P>
-<P>
-<B>const unsigned char *pcre_maketables(void);</B>
-</P>
-<P>
-<B>int pcre_fullinfo(const pcre *<I>code</I>, const pcre_extra *<I>extra</I>,</B>
-<B>int <I>what</I>, void *<I>where</I>);</B>
-</P>
-<P>
-<B>int pcre_info(const pcre *<I>code</I>, int *<I>optptr</I>, int</B>
-<B>*<I>firstcharptr</I>);</B>
-</P>
-<P>
-<B>char *pcre_version(void);</B>
-</P>
-<P>
-<B>void *(*pcre_malloc)(size_t);</B>
-</P>
-<P>
-<B>void (*pcre_free)(void *);</B>
-</P>
-<LI><A NAME="SEC3" HREF="#TOC1">DESCRIPTION</A>
-<P>
-The PCRE library is a set of functions that implement regular expression
-pattern matching using the same syntax and semantics as Perl 5, with just a few
-differences (see below). The current implementation corresponds to Perl 5.005,
-with some additional features from later versions. This includes some
-experimental, incomplete support for UTF-8 encoded strings. Details of exactly
-what is and what is not supported are given below.
-</P>
-<P>
-PCRE has its own native API, which is described in this document. There is also
-a set of wrapper functions that correspond to the POSIX regular expression API.
-These are described in the <B>pcreposix</B> documentation.
-</P>
-<P>
-The native API function prototypes are defined in the header file <B>pcre.h</B>,
-and on Unix systems the library itself is called <B>libpcre.a</B>, so can be
-accessed by adding <B>-lpcre</B> to the command for linking an application which
-calls it. The header file defines the macros PCRE_MAJOR and PCRE_MINOR to
-contain the major and minor release numbers for the library. Applications can
-use these to include support for different releases.
-</P>
-<P>
-The functions <B>pcre_compile()</B>, <B>pcre_study()</B>, and <B>pcre_exec()</B>
-are used for compiling and matching regular expressions. A sample program that
-demonstrates the simplest way of using them is given in the file
-<I>pcredemo.c</I>. The last section of this man page describes how to run it.
-</P>
-<P>
-The functions <B>pcre_copy_substring()</B>, <B>pcre_get_substring()</B>, and
-<B>pcre_get_substring_list()</B> are convenience functions for extracting
-captured substrings from a matched subject string; <B>pcre_free_substring()</B>
-and <B>pcre_free_substring_list()</B> are also provided, to free the memory used
-for extracted strings.
-</P>
-<P>
-The function <B>pcre_maketables()</B> is used (optionally) to build a set of
-character tables in the current locale for passing to <B>pcre_compile()</B>.
-</P>
-<P>
-The function <B>pcre_fullinfo()</B> is used to find out information about a
-compiled pattern; <B>pcre_info()</B> is an obsolete version which returns only
-some of the available information, but is retained for backwards compatibility.
-The function <B>pcre_version()</B> returns a pointer to a string containing the
-version of PCRE and its date of release.
-</P>
-<P>
-The global variables <B>pcre_malloc</B> and <B>pcre_free</B> initially contain
-the entry points of the standard <B>malloc()</B> and <B>free()</B> functions
-respectively. PCRE calls the memory management functions via these variables,
-so a calling program can replace them if it wishes to intercept the calls. This
-should be done before calling any PCRE functions.
-</P>
-<LI><A NAME="SEC4" HREF="#TOC1">MULTI-THREADING</A>
-<P>
-The PCRE functions can be used in multi-threading applications, with the
-proviso that the memory management functions pointed to by <B>pcre_malloc</B>
-and <B>pcre_free</B> are shared by all threads.
-</P>
-<P>
-The compiled form of a regular expression is not altered during matching, so
-the same compiled pattern can safely be used by several threads at once.
-</P>
-<LI><A NAME="SEC5" HREF="#TOC1">COMPILING A PATTERN</A>
-<P>
-The function <B>pcre_compile()</B> is called to compile a pattern into an
-internal form. The pattern is a C string terminated by a binary zero, and
-is passed in the argument <I>pattern</I>. A pointer to a single block of memory
-that is obtained via <B>pcre_malloc</B> is returned. This contains the compiled
-code and related data. The <B>pcre</B> type is defined for the returned block;
-this is a typedef for a structure whose contents are not externally defined. It
-is up to the caller to free the memory when it is no longer required.
-</P>
-<P>
-Although the compiled code of a PCRE regex is relocatable, that is, it does not
-depend on memory location, the complete <B>pcre</B> data block is not
-fully relocatable, because it contains a copy of the <I>tableptr</I> argument,
-which is an address (see below).
-</P>
-<P>
-The size of a compiled pattern is roughly proportional to the length of the
-pattern string, except that each character class (other than those containing
-just a single character, negated or not) requires 33 bytes, and repeat
-quantifiers with a minimum greater than one or a bounded maximum cause the
-relevant portions of the compiled pattern to be replicated.
-</P>
-<P>
-The <I>options</I> argument contains independent bits that affect the
-compilation. It should be zero if no options are required. Some of the options,
-in particular, those that are compatible with Perl, can also be set and unset
-from within the pattern (see the detailed description of regular expressions
-below). For these options, the contents of the <I>options</I> argument specifies
-their initial settings at the start of compilation and execution. The
-PCRE_ANCHORED option can be set at the time of matching as well as at compile
-time.
-</P>
-<P>
-If <I>errptr</I> is NULL, <B>pcre_compile()</B> returns NULL immediately.
-Otherwise, if compilation of a pattern fails, <B>pcre_compile()</B> returns
-NULL, and sets the variable pointed to by <I>errptr</I> to point to a textual
-error message. The offset from the start of the pattern to the character where
-the error was discovered is placed in the variable pointed to by
-<I>erroffset</I>, which must not be NULL. If it is, an immediate error is given.
-</P>
-<P>
-If the final argument, <I>tableptr</I>, is NULL, PCRE uses a default set of
-character tables which are built when it is compiled, using the default C
-locale. Otherwise, <I>tableptr</I> must be the result of a call to
-<B>pcre_maketables()</B>. See the section on locale support below.
-</P>
-<P>
-This code fragment shows a typical straightforward call to <B>pcre_compile()</B>:
-</P>
-<P>
-<PRE>
- pcre *re;
- const char *error;
- int erroffset;
- re = pcre_compile(
- "^A.*Z", /* the pattern */
- 0, /* default options */
- &error, /* for error message */
- &erroffset, /* for error offset */
- NULL); /* use default character tables */
-</PRE>
-</P>
-<P>
-The following option bits are defined in the header file:
-</P>
-<P>
-<PRE>
- PCRE_ANCHORED
-</PRE>
-</P>
-<P>
-If this bit is set, the pattern is forced to be "anchored", that is, it is
-constrained to match only at the start of the string which is being searched
-(the "subject string"). This effect can also be achieved by appropriate
-constructs in the pattern itself, which is the only way to do it in Perl.
-</P>
-<P>
-<PRE>
- PCRE_CASELESS
-</PRE>
-</P>
-<P>
-If this bit is set, letters in the pattern match both upper and lower case
-letters. It is equivalent to Perl's /i option.
-</P>
-<P>
-<PRE>
- PCRE_DOLLAR_ENDONLY
-</PRE>
-</P>
-<P>
-If this bit is set, a dollar metacharacter in the pattern matches only at the
-end of the subject string. Without this option, a dollar also matches
-immediately before the final character if it is a newline (but not before any
-other newlines). The PCRE_DOLLAR_ENDONLY option is ignored if PCRE_MULTILINE is
-set. There is no equivalent to this option in Perl.
-</P>
-<P>
-<PRE>
- PCRE_DOTALL
-</PRE>
-</P>
-<P>
-If this bit is set, a dot metacharacter in the pattern matches all characters,
-including newlines. Without it, newlines are excluded. This option is
-equivalent to Perl's /s option. A negative class such as [^a] always matches a
-newline character, independent of the setting of this option.
-</P>
-<P>
-<PRE>
- PCRE_EXTENDED
-</PRE>
-</P>
-<P>
-If this bit is set, whitespace data characters in the pattern are totally
-ignored except when escaped or inside a character class, and characters between
-an unescaped # outside a character class and the next newline character,
-inclusive, are also ignored. This is equivalent to Perl's /x option, and makes
-it possible to include comments inside complicated patterns. Note, however,
-that this applies only to data characters. Whitespace characters may never
-appear within special character sequences in a pattern, for example within the
-sequence (?( which introduces a conditional subpattern.
-</P>
-<P>
-<PRE>
- PCRE_EXTRA
-</PRE>
-</P>
-<P>
-This option was invented in order to turn on additional functionality of PCRE
-that is incompatible with Perl, but it is currently of very little use. When
-set, any backslash in a pattern that is followed by a letter that has no
-special meaning causes an error, thus reserving these combinations for future
-expansion. By default, as in Perl, a backslash followed by a letter with no
-special meaning is treated as a literal. There are at present no other features
-controlled by this option. It can also be set by a (?X) option setting within a
-pattern.
-</P>
-<P>
-<PRE>
- PCRE_MULTILINE
-</PRE>
-</P>
-<P>
-By default, PCRE treats the subject string as consisting of a single "line" of
-characters (even if it actually contains several newlines). The "start of line"
-metacharacter (^) matches only at the start of the string, while the "end of
-line" metacharacter ($) matches only at the end of the string, or before a
-terminating newline (unless PCRE_DOLLAR_ENDONLY is set). This is the same as
-Perl.
-</P>
-<P>
-When PCRE_MULTILINE is set, the "start of line" and "end of line" constructs
-match immediately following or immediately before any newline in the subject
-string, respectively, as well as at the very start and end. This is equivalent
-to Perl's /m option. If there are no "\n" characters in a subject string, or
-no occurrences of ^ or $ in a pattern, setting PCRE_MULTILINE has no
-effect.
-</P>
-<P>
-<PRE>
- PCRE_UNGREEDY
-</PRE>
-</P>
-<P>
-This option inverts the "greediness" of the quantifiers so that they are not
-greedy by default, but become greedy if followed by "?". It is not compatible
-with Perl. It can also be set by a (?U) option setting within the pattern.
-</P>
-<P>
-<PRE>
- PCRE_UTF8
-</PRE>
-</P>
-<P>
-This option causes PCRE to regard both the pattern and the subject as strings
-of UTF-8 characters instead of just byte strings. However, it is available only
-if PCRE has been built to include UTF-8 support. If not, the use of this option
-provokes an error. Support for UTF-8 is new, experimental, and incomplete.
-Details of exactly what it entails are given below.
-</P>
-<LI><A NAME="SEC6" HREF="#TOC1">STUDYING A PATTERN</A>
-<P>
-When a pattern is going to be used several times, it is worth spending more
-time analyzing it in order to speed up the time taken for matching. The
-function <B>pcre_study()</B> takes a pointer to a compiled pattern as its first
-argument, and returns a pointer to a <B>pcre_extra</B> block (another typedef
-for a structure with hidden contents) containing additional information about
-the pattern; this can be passed to <B>pcre_exec()</B>. If no additional
-information is available, NULL is returned.
-</P>
-<P>
-The second argument contains option bits. At present, no options are defined
-for <B>pcre_study()</B>, and this argument should always be zero.
-</P>
-<P>
-The third argument for <B>pcre_study()</B> is a pointer to an error message. If
-studying succeeds (even if no data is returned), the variable it points to is
-set to NULL. Otherwise it points to a textual error message.
-</P>
-<P>
-This is a typical call to <B>pcre_study()</B>:
-</P>
-<P>
-<PRE>
- pcre_extra *pe;
- pe = pcre_study(
- re, /* result of pcre_compile() */
- 0, /* no options exist */
- &error); /* set to NULL or points to a message */
-</PRE>
-</P>
-<P>
-At present, studying a pattern is useful only for non-anchored patterns that do
-not have a single fixed starting character. A bitmap of possible starting
-characters is created.
-</P>
-<LI><A NAME="SEC7" HREF="#TOC1">LOCALE SUPPORT</A>
-<P>
-PCRE handles caseless matching, and determines whether characters are letters,
-digits, or whatever, by reference to a set of tables. The library contains a
-default set of tables which is created in the default C locale when PCRE is
-compiled. This is used when the final argument of <B>pcre_compile()</B> is NULL,
-and is sufficient for many applications.
-</P>
-<P>
-An alternative set of tables can, however, be supplied. Such tables are built
-by calling the <B>pcre_maketables()</B> function, which has no arguments, in the
-relevant locale. The result can then be passed to <B>pcre_compile()</B> as often
-as necessary. For example, to build and use tables that are appropriate for the
-French locale (where accented characters with codes greater than 128 are
-treated as letters), the following code could be used:
-</P>
-<P>
-<PRE>
- setlocale(LC_CTYPE, "fr");
- tables = pcre_maketables();
- re = pcre_compile(..., tables);
-</PRE>
-</P>
-<P>
-The tables are built in memory that is obtained via <B>pcre_malloc</B>. The
-pointer that is passed to <B>pcre_compile</B> is saved with the compiled
-pattern, and the same tables are used via this pointer by <B>pcre_study()</B>
-and <B>pcre_exec()</B>. Thus for any single pattern, compilation, studying and
-matching all happen in the same locale, but different patterns can be compiled
-in different locales. It is the caller's responsibility to ensure that the
-memory containing the tables remains available for as long as it is needed.
-</P>
-<LI><A NAME="SEC8" HREF="#TOC1">INFORMATION ABOUT A PATTERN</A>
-<P>
-The <B>pcre_fullinfo()</B> function returns information about a compiled
-pattern. It replaces the obsolete <B>pcre_info()</B> function, which is
-nevertheless retained for backwards compatibility (and is documented below).
-</P>
-<P>
-The first argument for <B>pcre_fullinfo()</B> is a pointer to the compiled
-pattern. The second argument is the result of <B>pcre_study()</B>, or NULL if
-the pattern was not studied. The third argument specifies which piece of
-information is required, while the fourth argument is a pointer to a variable
-to receive the data. The yield of the function is zero for success, or one of
-the following negative numbers:
-</P>
-<P>
-<PRE>
- PCRE_ERROR_NULL the argument <I>code</I> was NULL
- the argument <I>where</I> was NULL
- PCRE_ERROR_BADMAGIC the "magic number" was not found
- PCRE_ERROR_BADOPTION the value of <I>what</I> was invalid
-</PRE>
-</P>
-<P>
-Here is a typical call of <B>pcre_fullinfo()</B>, to obtain the length of the
-compiled pattern:
-</P>
-<P>
-<PRE>
- int rc;
- unsigned long int length;
- rc = pcre_fullinfo(
- re, /* result of pcre_compile() */
- pe, /* result of pcre_study(), or NULL */
- PCRE_INFO_SIZE, /* what is required */
- &length); /* where to put the data */
-</PRE>
-</P>
-<P>
-The possible values for the third argument are defined in <B>pcre.h</B>, and are
-as follows:
-</P>
-<P>
-<PRE>
- PCRE_INFO_OPTIONS
-</PRE>
-</P>
-<P>
-Return a copy of the options with which the pattern was compiled. The fourth
-argument should point to an <B>unsigned long int</B> variable. These option bits
-are those specified in the call to <B>pcre_compile()</B>, modified by any
-top-level option settings within the pattern itself, and with the PCRE_ANCHORED
-bit forcibly set if the form of the pattern implies that it can match only at
-the start of a subject string.
-</P>
-<P>
-<PRE>
- PCRE_INFO_SIZE
-</PRE>
-</P>
-<P>
-Return the size of the compiled pattern, that is, the value that was passed as
-the argument to <B>pcre_malloc()</B> when PCRE was getting memory in which to
-place the compiled data. The fourth argument should point to a <B>size_t</B>
-variable.
-</P>
-<P>
-<PRE>
- PCRE_INFO_CAPTURECOUNT
-</PRE>
-</P>
-<P>
-Return the number of capturing subpatterns in the pattern. The fourth argument
-should point to an <B>int</B> variable.
-</P>
-<P>
-<PRE>
- PCRE_INFO_BACKREFMAX
-</PRE>
-</P>
-<P>
-Return the number of the highest back reference in the pattern. The fourth
-argument should point to an <B>int</B> variable. Zero is returned if there are
-no back references.
-</P>
-<P>
-<PRE>
- PCRE_INFO_FIRSTCHAR
-</PRE>
-</P>
-<P>
-Return information about the first character of any matched string, for a
-non-anchored pattern. If there is a fixed first character, e.g. from a pattern
-such as (cat|cow|coyote), it is returned in the integer pointed to by
-<I>where</I>. Otherwise, if either
-</P>
-<P>
-(a) the pattern was compiled with the PCRE_MULTILINE option, and every branch
-starts with "^", or
-</P>
-<P>
-(b) every branch of the pattern starts with ".*" and PCRE_DOTALL is not set
-(if it were set, the pattern would be anchored),
-</P>
-<P>
--1 is returned, indicating that the pattern matches only at the start of a
-subject string or after any "\n" within the string. Otherwise -2 is returned.
-For anchored patterns, -2 is returned.
-</P>
-<P>
-<PRE>
- PCRE_INFO_FIRSTTABLE
-</PRE>
-</P>
-<P>
-If the pattern was studied, and this resulted in the construction of a 256-bit
-table indicating a fixed set of characters for the first character in any
-matching string, a pointer to the table is returned. Otherwise NULL is
-returned. The fourth argument should point to an <B>unsigned char *</B>
-variable.
-</P>
-<P>
-<PRE>
- PCRE_INFO_LASTLITERAL
-</PRE>
-</P>
-<P>
-For a non-anchored pattern, return the value of the rightmost literal character
-which must exist in any matched string, other than at its start. The fourth
-argument should point to an <B>int</B> variable. If there is no such character,
-or if the pattern is anchored, -1 is returned. For example, for the pattern
-/a\d+z\d+/ the returned value is 'z'.
-</P>
-<P>
-The <B>pcre_info()</B> function is now obsolete because its interface is too
-restrictive to return all the available data about a compiled pattern. New
-programs should use <B>pcre_fullinfo()</B> instead. The yield of
-<B>pcre_info()</B> is the number of capturing subpatterns, or one of the
-following negative numbers:
-</P>
-<P>
-<PRE>
- PCRE_ERROR_NULL the argument <I>code</I> was NULL
- PCRE_ERROR_BADMAGIC the "magic number" was not found
-</PRE>
-</P>
-<P>
-If the <I>optptr</I> argument is not NULL, a copy of the options with which the
-pattern was compiled is placed in the integer it points to (see
-PCRE_INFO_OPTIONS above).
-</P>
-<P>
-If the pattern is not anchored and the <I>firstcharptr</I> argument is not NULL,
-it is used to pass back information about the first character of any matched
-string (see PCRE_INFO_FIRSTCHAR above).
-</P>
-<LI><A NAME="SEC9" HREF="#TOC1">MATCHING A PATTERN</A>
-<P>
-The function <B>pcre_exec()</B> is called to match a subject string against a
-pre-compiled pattern, which is passed in the <I>code</I> argument. If the
-pattern has been studied, the result of the study should be passed in the
-<I>extra</I> argument. Otherwise this must be NULL.
-</P>
-<P>
-Here is an example of a simple call to <B>pcre_exec()</B>:
-</P>
-<P>
-<PRE>
- int rc;
- int ovector[30];
- rc = pcre_exec(
- re, /* result of pcre_compile() */
- NULL, /* we didn't study the pattern */
- "some string", /* the subject string */
- 11, /* the length of the subject string */
- 0, /* start at offset 0 in the subject */
- 0, /* default options */
- ovector, /* vector for substring information */
- 30); /* number of elements in the vector */
-</PRE>
-</P>
-<P>
-The PCRE_ANCHORED option can be passed in the <I>options</I> argument, whose
-unused bits must be zero. However, if a pattern was compiled with
-PCRE_ANCHORED, or turned out to be anchored by virtue of its contents, it
-cannot be made unanchored at matching time.
-</P>
-<P>
-There are also three further options that can be set only at matching time:
-</P>
-<P>
-<PRE>
- PCRE_NOTBOL
-</PRE>
-</P>
-<P>
-The first character of the string is not the beginning of a line, so the
-circumflex metacharacter should not match before it. Setting this without
-PCRE_MULTILINE (at compile time) causes circumflex never to match.
-</P>
-<P>
-<PRE>
- PCRE_NOTEOL
-</PRE>
-</P>
-<P>
-The end of the string is not the end of a line, so the dollar metacharacter
-should not match it nor (except in multiline mode) a newline immediately before
-it. Setting this without PCRE_MULTILINE (at compile time) causes dollar never
-to match.
-</P>
-<P>
-<PRE>
- PCRE_NOTEMPTY
-</PRE>
-</P>
-<P>
-An empty string is not considered to be a valid match if this option is set. If
-there are alternatives in the pattern, they are tried. If all the alternatives
-match the empty string, the entire match fails. For example, if the pattern
-</P>
-<P>
-<PRE>
- a?b?
-</PRE>
-</P>
-<P>
-is applied to a string not beginning with "a" or "b", it matches the empty
-string at the start of the subject. With PCRE_NOTEMPTY set, this match is not
-valid, so PCRE searches further into the string for occurrences of "a" or "b".
-</P>
-<P>
-Perl has no direct equivalent of PCRE_NOTEMPTY, but it does make a special case
-of a pattern match of the empty string within its <B>split()</B> function, and
-when using the /g modifier. It is possible to emulate Perl's behaviour after
-matching a null string by first trying the match again at the same offset with
-PCRE_NOTEMPTY set, and then if that fails by advancing the starting offset (see
-below) and trying an ordinary match again.
-</P>
-<P>
-The subject string is passed as a pointer in <I>subject</I>, a length in
-<I>length</I>, and a starting offset in <I>startoffset</I>. Unlike the pattern
-string, the subject may contain binary zero characters. When the starting
-offset is zero, the search for a match starts at the beginning of the subject,
-and this is by far the most common case.
-</P>
-<P>
-A non-zero starting offset is useful when searching for another match in the
-same subject by calling <B>pcre_exec()</B> again after a previous success.
-Setting <I>startoffset</I> differs from just passing over a shortened string and
-setting PCRE_NOTBOL in the case of a pattern that begins with any kind of
-lookbehind. For example, consider the pattern
-</P>
-<P>
-<PRE>
- \Biss\B
-</PRE>
-</P>
-<P>
-which finds occurrences of "iss" in the middle of words. (\B matches only if
-the current position in the subject is not a word boundary.) When applied to
-the string "Mississippi" the first call to <B>pcre_exec()</B> finds the first
-occurrence. If <B>pcre_exec()</B> is called again with just the remainder of the
-subject, namely "issippi", it does not match, because \B is always false at the
-start of the subject, which is deemed to be a word boundary. However, if
-<B>pcre_exec()</B> is passed the entire string again, but with <I>startoffset</I>
-set to 4, it finds the second occurrence of "iss" because it is able to look
-behind the starting point to discover that it is preceded by a letter.
-</P>
-<P>
-If a non-zero starting offset is passed when the pattern is anchored, one
-attempt to match at the given offset is tried. This can only succeed if the
-pattern does not require the match to be at the start of the subject.
-</P>
-<P>
-In general, a pattern matches a certain portion of the subject, and in
-addition, further substrings from the subject may be picked out by parts of the
-pattern. Following the usage in Jeffrey Friedl's book, this is called
-"capturing" in what follows, and the phrase "capturing subpattern" is used for
-a fragment of a pattern that picks out a substring. PCRE supports several other
-kinds of parenthesized subpattern that do not cause substrings to be captured.
-</P>
-<P>
-Captured substrings are returned to the caller via a vector of integer offsets
-whose address is passed in <I>ovector</I>. The number of elements in the vector
-is passed in <I>ovecsize</I>. The first two-thirds of the vector is used to pass
-back captured substrings, each substring using a pair of integers. The
-remaining third of the vector is used as workspace by <B>pcre_exec()</B> while
-matching capturing subpatterns, and is not available for passing back
-information. The length passed in <I>ovecsize</I> should always be a multiple of
-three. If it is not, it is rounded down.
-</P>
-<P>
-When a match has been successful, information about captured substrings is
-returned in pairs of integers, starting at the beginning of <I>ovector</I>, and
-continuing up to two-thirds of its length at the most. The first element of a
-pair is set to the offset of the first character in a substring, and the second
-is set to the offset of the first character after the end of a substring. The
-first pair, <I>ovector[0]</I> and <I>ovector[1]</I>, identify the portion of the
-subject string matched by the entire pattern. The next pair is used for the
-first capturing subpattern, and so on. The value returned by <B>pcre_exec()</B>
-is the number of pairs that have been set. If there are no capturing
-subpatterns, the return value from a successful match is 1, indicating that
-just the first pair of offsets has been set.
-</P>
-<P>
-Some convenience functions are provided for extracting the captured substrings
-as separate strings. These are described in the following section.
-</P>
-<P>
-It is possible for a capturing subpattern number <I>n+1</I> to match some
-part of the subject when subpattern <I>n</I> has not been used at all. For
-example, if the string "abc" is matched against the pattern (a|(z))(bc)
-subpatterns 1 and 3 are matched, but 2 is not. When this happens, both offset
-values corresponding to the unused subpattern are set to -1.
-</P>
-<P>
-If a capturing subpattern is matched repeatedly, it is the last portion of the
-string that it matched that gets returned.
-</P>
-<P>
-If the vector is too small to hold all the captured substrings, it is used as
-far as possible (up to two-thirds of its length), and the function returns a
-value of zero. In particular, if the substring offsets are not of interest,
-<B>pcre_exec()</B> may be called with <I>ovector</I> passed as NULL and
-<I>ovecsize</I> as zero. However, if the pattern contains back references and
-the <I>ovector</I> isn't big enough to remember the related substrings, PCRE has
-to get additional memory for use during matching. Thus it is usually advisable
-to supply an <I>ovector</I>.
-</P>
-<P>
-Note that <B>pcre_info()</B> can be used to find out how many capturing
-subpatterns there are in a compiled pattern. The smallest size for
-<I>ovector</I> that will allow for <I>n</I> captured substrings in addition to
-the offsets of the substring matched by the whole pattern is (<I>n</I>+1)*3.
-</P>
-<P>
-If <B>pcre_exec()</B> fails, it returns a negative number. The following are
-defined in the header file:
-</P>
-<P>
-<PRE>
- PCRE_ERROR_NOMATCH (-1)
-</PRE>
-</P>
-<P>
-The subject string did not match the pattern.
-</P>
-<P>
-<PRE>
- PCRE_ERROR_NULL (-2)
-</PRE>
-</P>
-<P>
-Either <I>code</I> or <I>subject</I> was passed as NULL, or <I>ovector</I> was
-NULL and <I>ovecsize</I> was not zero.
-</P>
-<P>
-<PRE>
- PCRE_ERROR_BADOPTION (-3)
-</PRE>
-</P>
-<P>
-An unrecognized bit was set in the <I>options</I> argument.
-</P>
-<P>
-<PRE>
- PCRE_ERROR_BADMAGIC (-4)
-</PRE>
-</P>
-<P>
-PCRE stores a 4-byte "magic number" at the start of the compiled code, to catch
-the case when it is passed a junk pointer. This is the error it gives when the
-magic number isn't present.
-</P>
-<P>
-<PRE>
- PCRE_ERROR_UNKNOWN_NODE (-5)
-</PRE>
-</P>
-<P>
-While running the pattern match, an unknown item was encountered in the
-compiled pattern. This error could be caused by a bug in PCRE or by overwriting
-of the compiled pattern.
-</P>
-<P>
-<PRE>
- PCRE_ERROR_NOMEMORY (-6)
-</PRE>
-</P>
-<P>
-If a pattern contains back references, but the <I>ovector</I> that is passed to
-<B>pcre_exec()</B> is not big enough to remember the referenced substrings, PCRE
-gets a block of memory at the start of matching to use for this purpose. If the
-call via <B>pcre_malloc()</B> fails, this error is given. The memory is freed at
-the end of matching.
-</P>
-<LI><A NAME="SEC10" HREF="#TOC1">EXTRACTING CAPTURED SUBSTRINGS</A>
-<P>
-Captured substrings can be accessed directly by using the offsets returned by
-<B>pcre_exec()</B> in <I>ovector</I>. For convenience, the functions
-<B>pcre_copy_substring()</B>, <B>pcre_get_substring()</B>, and
-<B>pcre_get_substring_list()</B> are provided for extracting captured substrings
-as new, separate, zero-terminated strings. A substring that contains a binary
-zero is correctly extracted and has a further zero added on the end, but the
-result does not, of course, function as a C string.
-</P>
-<P>
-The first three arguments are the same for all three functions: <I>subject</I>
-is the subject string which has just been successfully matched, <I>ovector</I>
-is a pointer to the vector of integer offsets that was passed to
-<B>pcre_exec()</B>, and <I>stringcount</I> is the number of substrings that
-were captured by the match, including the substring that matched the entire
-regular expression. This is the value returned by <B>pcre_exec</B> if it
-is greater than zero. If <B>pcre_exec()</B> returned zero, indicating that it
-ran out of space in <I>ovector</I>, the value passed as <I>stringcount</I> should
-be the size of the vector divided by three.
-</P>
-<P>
-The functions <B>pcre_copy_substring()</B> and <B>pcre_get_substring()</B>
-extract a single substring, whose number is given as <I>stringnumber</I>. A
-value of zero extracts the substring that matched the entire pattern, while
-higher values extract the captured substrings. For <B>pcre_copy_substring()</B>,
-the string is placed in <I>buffer</I>, whose length is given by
-<I>buffersize</I>, while for <B>pcre_get_substring()</B> a new block of memory is
-obtained via <B>pcre_malloc</B>, and its address is returned via
-<I>stringptr</I>. The yield of the function is the length of the string, not
-including the terminating zero, or one of
-</P>
-<P>
-<PRE>
- PCRE_ERROR_NOMEMORY (-6)
-</PRE>
-</P>
-<P>
-The buffer was too small for <B>pcre_copy_substring()</B>, or the attempt to get
-memory failed for <B>pcre_get_substring()</B>.
-</P>
-<P>
-<PRE>
- PCRE_ERROR_NOSUBSTRING (-7)
-</PRE>
-</P>
-<P>
-There is no substring whose number is <I>stringnumber</I>.
-</P>
-<P>
-The <B>pcre_get_substring_list()</B> function extracts all available substrings
-and builds a list of pointers to them. All this is done in a single block of
-memory which is obtained via <B>pcre_malloc</B>. The address of the memory block
-is returned via <I>listptr</I>, which is also the start of the list of string
-pointers. The end of the list is marked by a NULL pointer. The yield of the
-function is zero if all went well, or
-</P>
-<P>
-<PRE>
- PCRE_ERROR_NOMEMORY (-6)
-</PRE>
-</P>
-<P>
-if the attempt to get the memory block failed.
-</P>
-<P>
-When any of these functions encounters a substring that is unset, which can
-happen when capturing subpattern number <I>n+1</I> matches some part of the
-subject, but subpattern <I>n</I> has not been used at all, they return an empty
-string. This can be distinguished from a genuine zero-length substring by
-inspecting the appropriate offset in <I>ovector</I>, which is negative for unset
-substrings.
-</P>
-<P>
-The two convenience functions <B>pcre_free_substring()</B> and
-<B>pcre_free_substring_list()</B> can be used to free the memory returned by
-a previous call of <B>pcre_get_substring()</B> or
-<B>pcre_get_substring_list()</B>, respectively. They do nothing more than call
-the function pointed to by <B>pcre_free</B>, which of course could be called
-directly from a C program. However, PCRE is used in some situations where it is
-linked via a special interface to another programming language which cannot use
-<B>pcre_free</B> directly; it is for these cases that the functions are
-provided.
-</P>
-<LI><A NAME="SEC11" HREF="#TOC1">LIMITATIONS</A>
-<P>
-There are some size limitations in PCRE but it is hoped that they will never in
-practice be relevant.
-The maximum length of a compiled pattern is 65539 (sic) bytes.
-All values in repeating quantifiers must be less than 65536.
-The maximum number of capturing subpatterns is 65535.
-There is no limit to the number of non-capturing subpatterns, but the maximum
-depth of nesting of all kinds of parenthesized subpattern, including capturing
-subpatterns, assertions, and other types of subpattern, is 200.
-</P>
-<P>
-The maximum length of a subject string is the largest positive number that an
-integer variable can hold. However, PCRE uses recursion to handle subpatterns
-and indefinite repetition. This means that the available stack space may limit
-the size of a subject string that can be processed by certain patterns.
-</P>
-<LI><A NAME="SEC12" HREF="#TOC1">DIFFERENCES FROM PERL</A>
-<P>
-The differences described here are with respect to Perl 5.005.
-</P>
-<P>
-1. By default, a whitespace character is any character that the C library
-function <B>isspace()</B> recognizes, though it is possible to compile PCRE with
-alternative character type tables. Normally <B>isspace()</B> matches space,
-formfeed, newline, carriage return, horizontal tab, and vertical tab. Perl 5
-no longer includes vertical tab in its set of whitespace characters. The \v
-escape that was in the Perl documentation for a long time was never in fact
-recognized. However, the character itself was treated as whitespace at least
-up to 5.002. In 5.004 and 5.005 it does not match \s.
-</P>
-<P>
-2. PCRE does not allow repeat quantifiers on lookahead assertions. Perl permits
-them, but they do not mean what you might think. For example, (?!a){3} does
-not assert that the next three characters are not "a". It just asserts that the
-next character is not "a" three times.
-</P>
-<P>
-3. Capturing subpatterns that occur inside negative lookahead assertions are
-counted, but their entries in the offsets vector are never set. Perl sets its
-numerical variables from any such patterns that are matched before the
-assertion fails to match something (thereby succeeding), but only if the
-negative lookahead assertion contains just one branch.
-</P>
-<P>
-4. Though binary zero characters are supported in the subject string, they are
-not allowed in a pattern string because it is passed as a normal C string,
-terminated by zero. The escape sequence "\0" can be used in the pattern to
-represent a binary zero.
-</P>
-<P>
-5. The following Perl escape sequences are not supported: \l, \u, \L, \U,
-\E, \Q. In fact these are implemented by Perl's general string-handling and
-are not part of its pattern matching engine.
-</P>
-<P>
-6. The Perl \G assertion is not supported as it is not relevant to single
-pattern matches.
-</P>
-<P>
-7. Fairly obviously, PCRE does not support the (?{code}) and (?p{code})
-constructions. However, there is some experimental support for recursive
-patterns using the non-Perl item (?R).
-</P>
-<P>
-8. There are at the time of writing some oddities in Perl 5.005_02 concerned
-with the settings of captured strings when part of a pattern is repeated. For
-example, matching "aba" against the pattern /^(a(b)?)+$/ sets $2 to the value
-"b", but matching "aabbaa" against /^(aa(bb)?)+$/ leaves $2 unset. However, if
-the pattern is changed to /^(aa(b(b))?)+$/ then $2 (and $3) are set.
-</P>
-<P>
-In Perl 5.004 $2 is set in both cases, and that is also true of PCRE. If in the
-future Perl changes to a consistent state that is different, PCRE may change to
-follow.
-</P>
-<P>
-9. Another as yet unresolved discrepancy is that in Perl 5.005_02 the pattern
-/^(a)?(?(1)a|b)+$/ matches the string "a", whereas in PCRE it does not.
-However, in both Perl and PCRE /^(a)?a/ matched against "a" leaves $1 unset.
-</P>
-<P>
-10. PCRE provides some extensions to the Perl regular expression facilities:
-</P>
-<P>
-(a) Although lookbehind assertions must match fixed length strings, each
-alternative branch of a lookbehind assertion can match a different length of
-string. Perl 5.005 requires them all to have the same length.
-</P>
-<P>
-(b) If PCRE_DOLLAR_ENDONLY is set and PCRE_MULTILINE is not set, the $
-meta-character matches only at the very end of the string.
-</P>
-<P>
-(c) If PCRE_EXTRA is set, a backslash followed by a letter with no special
-meaning is faulted.
-</P>
-<P>
-(d) If PCRE_UNGREEDY is set, the greediness of the repetition quantifiers is
-inverted, that is, by default they are not greedy, but if followed by a
-question mark they are.
-</P>
-<P>
-(e) PCRE_ANCHORED can be used to force a pattern to be tried only at the start
-of the subject.
-</P>
-<P>
-(f) The PCRE_NOTBOL, PCRE_NOTEOL, and PCRE_NOTEMPTY options for
-<B>pcre_exec()</B> have no Perl equivalents.
-</P>
-<P>
-(g) The (?R) construct allows for recursive pattern matching (Perl 5.6 can do
-this using the (?p{code}) construct, which PCRE cannot of course support.)
-</P>
-<LI><A NAME="SEC13" HREF="#TOC1">REGULAR EXPRESSION DETAILS</A>
-<P>
-The syntax and semantics of the regular expressions supported by PCRE are
-described below. Regular expressions are also described in the Perl
-documentation and in a number of other books, some of which have copious
-examples. Jeffrey Friedl's "Mastering Regular Expressions", published by
-O'Reilly (ISBN 1-56592-257-3), covers them in great detail.
-</P>
-<P>
-The description here is intended as reference documentation. The basic
-operation of PCRE is on strings of bytes. However, there are the beginnings of
-some support for UTF-8 character strings. To use this support you must
-configure PCRE to include it, and then call <B>pcre_compile()</B> with the
-PCRE_UTF8 option. How this affects the pattern matching is described in the
-final section of this document.
-</P>
-<P>
-A regular expression is a pattern that is matched against a subject string from
-left to right. Most characters stand for themselves in a pattern, and match the
-corresponding characters in the subject. As a trivial example, the pattern
-</P>
-<P>
-<PRE>
- The quick brown fox
-</PRE>
-</P>
-<P>
-matches a portion of a subject string that is identical to itself. The power of
-regular expressions comes from the ability to include alternatives and
-repetitions in the pattern. These are encoded in the pattern by the use of
-<I>meta-characters</I>, which do not stand for themselves but instead are
-interpreted in some special way.
-</P>
-<P>
-There are two different sets of meta-characters: those that are recognized
-anywhere in the pattern except within square brackets, and those that are
-recognized in square brackets. Outside square brackets, the meta-characters are
-as follows:
-</P>
-<P>
-<PRE>
- \ general escape character with several uses
- ^ assert start of subject (or line, in multiline mode)
- $ assert end of subject (or line, in multiline mode)
- . match any character except newline (by default)
- [ start character class definition
- | start of alternative branch
- ( start subpattern
- ) end subpattern
- ? extends the meaning of (
- also 0 or 1 quantifier
- also quantifier minimizer
- * 0 or more quantifier
- + 1 or more quantifier
- { start min/max quantifier
-</PRE>
-</P>
-<P>
-Part of a pattern that is in square brackets is called a "character class". In
-a character class the only meta-characters are:
-</P>
-<P>
-<PRE>
- \ general escape character
- ^ negate the class, but only if the first character
- - indicates character range
- ] terminates the character class
-</PRE>
-</P>
-<P>
-The following sections describe the use of each of the meta-characters.
-</P>
-<LI><A NAME="SEC14" HREF="#TOC1">BACKSLASH</A>
-<P>
-The backslash character has several uses. Firstly, if it is followed by a
-non-alphameric character, it takes away any special meaning that character may
-have. This use of backslash as an escape character applies both inside and
-outside character classes.
-</P>
-<P>
-For example, if you want to match a "*" character, you write "\*" in the
-pattern. This applies whether or not the following character would otherwise be
-interpreted as a meta-character, so it is always safe to precede a
-non-alphameric with "\" to specify that it stands for itself. In particular,
-if you want to match a backslash, you write "\\".
-</P>
-<P>
-If a pattern is compiled with the PCRE_EXTENDED option, whitespace in the
-pattern (other than in a character class) and characters between a "#" outside
-a character class and the next newline character are ignored. An escaping
-backslash can be used to include a whitespace or "#" character as part of the
-pattern.
-</P>
-<P>
-A second use of backslash provides a way of encoding non-printing characters
-in patterns in a visible manner. There is no restriction on the appearance of
-non-printing characters, apart from the binary zero that terminates a pattern,
-but when a pattern is being prepared by text editing, it is usually easier to
-use one of the following escape sequences than the binary character it
-represents:
-</P>
-<P>
-<PRE>
- \a alarm, that is, the BEL character (hex 07)
- \cx "control-x", where x is any character
- \e escape (hex 1B)
- \f formfeed (hex 0C)
- \n newline (hex 0A)
- \r carriage return (hex 0D)
- \t tab (hex 09)
- \xhh character with hex code hh
- \ddd character with octal code ddd, or backreference
-</PRE>
-</P>
-<P>
-The precise effect of "\cx" is as follows: if "x" is a lower case letter, it
-is converted to upper case. Then bit 6 of the character (hex 40) is inverted.
-Thus "\cz" becomes hex 1A, but "\c{" becomes hex 3B, while "\c;" becomes hex
-7B.
-</P>
-<P>
-After "\x", up to two hexadecimal digits are read (letters can be in upper or
-lower case).
-</P>
-<P>
-After "\0" up to two further octal digits are read. In both cases, if there
-are fewer than two digits, just those that are present are used. Thus the
-sequence "\0\x\07" specifies two binary zeros followed by a BEL character.
-Make sure you supply two digits after the initial zero if the character that
-follows is itself an octal digit.
-</P>
-<P>
-The handling of a backslash followed by a digit other than 0 is complicated.
-Outside a character class, PCRE reads it and any following digits as a decimal
-number. If the number is less than 10, or if there have been at least that many
-previous capturing left parentheses in the expression, the entire sequence is
-taken as a <I>back reference</I>. A description of how this works is given
-later, following the discussion of parenthesized subpatterns.
-</P>
-<P>
-Inside a character class, or if the decimal number is greater than 9 and there
-have not been that many capturing subpatterns, PCRE re-reads up to three octal
-digits following the backslash, and generates a single byte from the least
-significant 8 bits of the value. Any subsequent digits stand for themselves.
-For example:
-</P>
-<P>
-<PRE>
- \040 is another way of writing a space
- \40 is the same, provided there are fewer than 40
- previous capturing subpatterns
- \7 is always a back reference
- \11 might be a back reference, or another way of
- writing a tab
- \011 is always a tab
- \0113 is a tab followed by the character "3"
- \113 is the character with octal code 113 (since there
- can be no more than 99 back references)
- \377 is a byte consisting entirely of 1 bits
- \81 is either a back reference, or a binary zero
- followed by the two characters "8" and "1"
-</PRE>
-</P>
-<P>
-Note that octal values of 100 or greater must not be introduced by a leading
-zero, because no more than three octal digits are ever read.
-</P>
-<P>
-All the sequences that define a single byte value can be used both inside and
-outside character classes. In addition, inside a character class, the sequence
-"\b" is interpreted as the backspace character (hex 08). Outside a character
-class it has a different meaning (see below).
-</P>
-<P>
-The third use of backslash is for specifying generic character types:
-</P>
-<P>
-<PRE>
- \d any decimal digit
- \D any character that is not a decimal digit
- \s any whitespace character
- \S any character that is not a whitespace character
- \w any "word" character
- \W any "non-word" character
-</PRE>
-</P>
-<P>
-Each pair of escape sequences partitions the complete set of characters into
-two disjoint sets. Any given character matches one, and only one, of each pair.
-</P>
-<P>
-A "word" character is any letter or digit or the underscore character, that is,
-any character which can be part of a Perl "word". The definition of letters and
-digits is controlled by PCRE's character tables, and may vary if
-locale-specific matching is taking place (see "Locale support" above). For example, in
-the "fr" (French) locale, some character codes greater than 128 are used for
-accented letters, and these are matched by \w.
-</P>
-<P>
-These character type sequences can appear both inside and outside character
-classes. They each match one character of the appropriate type. If the current
-matching point is at the end of the subject string, all of them fail, since
-there is no character to match.
-</P>
-<P>
-The fourth use of backslash is for certain simple assertions. An assertion
-specifies a condition that has to be met at a particular point in a match,
-without consuming any characters from the subject string. The use of
-subpatterns for more complicated assertions is described below. The backslashed
-assertions are
-</P>
-<P>
-<PRE>
- \b word boundary
- \B not a word boundary
- \A start of subject (independent of multiline mode)
- \Z end of subject or newline at end (independent of multiline mode)
- \z end of subject (independent of multiline mode)
-</PRE>
-</P>
-<P>
-These assertions may not appear in character classes (but note that "\b" has a
-different meaning, namely the backspace character, inside a character class).
-</P>
-<P>
-A word boundary is a position in the subject string where the current character
-and the previous character do not both match \w or \W (i.e. one matches
-\w and the other matches \W), or the start or end of the string if the
-first or last character matches \w, respectively.
-</P>
-<P>
-The \A, \Z, and \z assertions differ from the traditional circumflex and
-dollar (described below) in that they only ever match at the very start and end
-of the subject string, whatever options are set. They are not affected by the
-PCRE_NOTBOL or PCRE_NOTEOL options. If the <I>startoffset</I> argument of
-<B>pcre_exec()</B> is non-zero, \A can never match. The difference between \Z
-and \z is that \Z matches before a newline that is the last character of the
-string as well as at the end of the string, whereas \z matches only at the
-end.
-</P>
-<LI><A NAME="SEC15" HREF="#TOC1">CIRCUMFLEX AND DOLLAR</A>
-<P>
-Outside a character class, in the default matching mode, the circumflex
-character is an assertion which is true only if the current matching point is
-at the start of the subject string. If the <I>startoffset</I> argument of
-<B>pcre_exec()</B> is non-zero, circumflex can never match. Inside a character
-class, circumflex has an entirely different meaning (see below).
-</P>
-<P>
-Circumflex need not be the first character of the pattern if a number of
-alternatives are involved, but it should be the first thing in each alternative
-in which it appears if the pattern is ever to match that branch. If all
-possible alternatives start with a circumflex, that is, if the pattern is
-constrained to match only at the start of the subject, it is said to be an
-"anchored" pattern. (There are also other constructs that can cause a pattern
-to be anchored.)
-</P>
-<P>
-A dollar character is an assertion which is true only if the current matching
-point is at the end of the subject string, or immediately before a newline
-character that is the last character in the string (by default). Dollar need
-not be the last character of the pattern if a number of alternatives are
-involved, but it should be the last item in any branch in which it appears.
-Dollar has no special meaning in a character class.
-</P>
-<P>
-The meaning of dollar can be changed so that it matches only at the very end of
-the string, by setting the PCRE_DOLLAR_ENDONLY option at compile or matching
-time. This does not affect the \Z assertion.
-</P>
-<P>
-The meanings of the circumflex and dollar characters are changed if the
-PCRE_MULTILINE option is set. When this is the case, they match immediately
-after and immediately before an internal "\n" character, respectively, in
-addition to matching at the start and end of the subject string. For example,
-the pattern /^abc$/ matches the subject string "def\nabc" in multiline mode,
-but not otherwise. Consequently, patterns that are anchored in single line mode
-because all branches start with "^" are not anchored in multiline mode, and a
-match for circumflex is possible when the <I>startoffset</I> argument of
-<B>pcre_exec()</B> is non-zero. The PCRE_DOLLAR_ENDONLY option is ignored if
-PCRE_MULTILINE is set.
-</P>
-<P>
-Note that the sequences \A, \Z, and \z can be used to match the start and
-end of the subject in both modes, and if all branches of a pattern start with
-\A it is always anchored, whether PCRE_MULTILINE is set or not.
-</P>
-<LI><A NAME="SEC16" HREF="#TOC1">FULL STOP (PERIOD, DOT)</A>
-<P>
-Outside a character class, a dot in the pattern matches any one character in
-the subject, including a non-printing character, but not (by default) newline.
-If the PCRE_DOTALL option is set, dots match newlines as well. The handling of
-dot is entirely independent of the handling of circumflex and dollar, the only
-relationship being that they both involve newline characters. Dot has no
-special meaning in a character class.
-</P>
-<LI><A NAME="SEC17" HREF="#TOC1">SQUARE BRACKETS</A>
-<P>
-An opening square bracket introduces a character class, terminated by a closing
-square bracket. A closing square bracket on its own is not special. If a
-closing square bracket is required as a member of the class, it should be the
-first data character in the class (after an initial circumflex, if present) or
-escaped with a backslash.
-</P>
-<P>
-A character class matches a single character in the subject; the character must
-be in the set of characters defined by the class, unless the first character in
-the class is a circumflex, in which case the subject character must not be in
-the set defined by the class. If a circumflex is actually required as a member
-of the class, ensure it is not the first character, or escape it with a
-backslash.
-</P>
-<P>
-For example, the character class [aeiou] matches any lower case vowel, while
-[^aeiou] matches any character that is not a lower case vowel. Note that a
-circumflex is just a convenient notation for specifying the characters which
-are in the class by enumerating those that are not. It is not an assertion: it
-still consumes a character from the subject string, and fails if the current
-pointer is at the end of the string.
-</P>
-<P>
-When caseless matching is set, any letters in a class represent both their
-upper case and lower case versions, so for example, a caseless [aeiou] matches
-"A" as well as "a", and a caseless [^aeiou] does not match "A", whereas a
-caseful version would.
-</P>
-<P>
-The newline character is never treated in any special way in character classes,
-whatever the setting of the PCRE_DOTALL or PCRE_MULTILINE options is. A class
-such as [^a] will always match a newline.
-</P>
-<P>
-The minus (hyphen) character can be used to specify a range of characters in a
-character class. For example, [d-m] matches any letter between d and m,
-inclusive. If a minus character is required in a class, it must be escaped with
-a backslash or appear in a position where it cannot be interpreted as
-indicating a range, typically as the first or last character in the class.
-</P>
-<P>
-It is not possible to have the literal character "]" as the end character of a
-range. A pattern such as [W-]46] is interpreted as a class of two characters
-("W" and "-") followed by a literal string "46]", so it would match "W46]" or
-"-46]". However, if the "]" is escaped with a backslash it is interpreted as
-the end of range, so [W-\]46] is interpreted as a single class containing a
-range followed by two separate characters. The octal or hexadecimal
-representation of "]" can also be used to end a range.
-</P>
-<P>
-Ranges operate in ASCII collating sequence. They can also be used for
-characters specified numerically, for example [\000-\037]. If a range that
-includes letters is used when caseless matching is set, it matches the letters
-in either case. For example, [W-c] is equivalent to [][\^_`wxyzabc], matched
-caselessly, and if character tables for the "fr" locale are in use,
-[\xc8-\xcb] matches accented E characters in both cases.
-</P>
-<P>
-The character types \d, \D, \s, \S, \w, and \W may also appear in a
-character class, and add the characters that they match to the class. For
-example, [\dABCDEF] matches any hexadecimal digit. A circumflex can
-conveniently be used with the upper case character types to specify a more
-restricted set of characters than the matching lower case type. For example,
-the class [^\W_] matches any letter or digit, but not underscore.
-</P>
-<P>
-All non-alphameric characters other than \, -, ^ (at the start) and the
-terminating ] are non-special in character classes, but it does no harm if they
-are escaped.
-</P>
-<LI><A NAME="SEC18" HREF="#TOC1">POSIX CHARACTER CLASSES</A>
-<P>
-Perl 5.6 (not yet released at the time of writing) is going to support the
-POSIX notation for character classes, which uses names enclosed by [: and :]
-within the enclosing square brackets. PCRE supports this notation. For example,
-</P>
-<P>
-<PRE>
- [01[:alpha:]%]
-</PRE>
-</P>
-<P>
-matches "0", "1", any alphabetic character, or "%". The supported class names
-are
-</P>
-<P>
-<PRE>
- alnum letters and digits
- alpha letters
- ascii character codes 0 - 127
- cntrl control characters
- digit decimal digits (same as \d)
- graph printing characters, excluding space
- lower lower case letters
- print printing characters, including space
- punct printing characters, excluding letters and digits
- space white space (same as \s)
- upper upper case letters
- word "word" characters (same as \w)
- xdigit hexadecimal digits
-</PRE>
-</P>
-<P>
-&#62;&#62;&#62;&#62;&#62;&#62;&#62;&#62;&#62;&#62;&#62;&#62;Only WORD is perl. BLANK is GNU.
-</P>
-<P>
-The names "ascii" and "word" are Perl extensions. Another Perl extension is
-negation, which is indicated by a ^ character after the colon. For example,
-</P>
-<P>
-<PRE>
- [12[:^digit:]]
-</PRE>
-</P>
-<P>
-matches "1", "2", or any non-digit. PCRE (and Perl) also recognize the POSIX
-syntax [.ch.] and [=ch=] where "ch" is a "collating element", but these are not
-supported, and an error is given if they are encountered.
-</P>
-<LI><A NAME="SEC19" HREF="#TOC1">VERTICAL BAR</A>
-<P>
-Vertical bar characters are used to separate alternative patterns. For example,
-the pattern
-</P>
-<P>
-<PRE>
- gilbert|sullivan
-</PRE>
-</P>
-<P>
-matches either "gilbert" or "sullivan". Any number of alternatives may appear,
-and an empty alternative is permitted (matching the empty string).
-The matching process tries each alternative in turn, from left to right,
-and the first one that succeeds is used. If the alternatives are within a
-subpattern (defined below), "succeeds" means matching the rest of the main
-pattern as well as the alternative in the subpattern.
-</P>
-<LI><A NAME="SEC20" HREF="#TOC1">INTERNAL OPTION SETTING</A>
-<P>
-The settings of PCRE_CASELESS, PCRE_MULTILINE, PCRE_DOTALL, and PCRE_EXTENDED
-can be changed from within the pattern by a sequence of Perl option letters
-enclosed between "(?" and ")". The option letters are
-</P>
-<P>
-<PRE>
- i for PCRE_CASELESS
- m for PCRE_MULTILINE
- s for PCRE_DOTALL
- x for PCRE_EXTENDED
-</PRE>
-</P>
-<P>
-For example, (?im) sets caseless, multiline matching. It is also possible to
-unset these options by preceding the letter with a hyphen, and a combined
-setting and unsetting such as (?im-sx), which sets PCRE_CASELESS and
-PCRE_MULTILINE while unsetting PCRE_DOTALL and PCRE_EXTENDED, is also
-permitted. If a letter appears both before and after the hyphen, the option is
-unset.
-</P>
-<P>
-The scope of these option changes depends on where in the pattern the setting
-occurs. For settings that are outside any subpattern (defined below), the
-effect is the same as if the options were set or unset at the start of
-matching. The following patterns all behave in exactly the same way:
-</P>
-<P>
-<PRE>
- (?i)abc
- a(?i)bc
- ab(?i)c
- abc(?i)
-</PRE>
-</P>
-<P>
-which in turn is the same as compiling the pattern abc with PCRE_CASELESS set.
-In other words, such "top level" settings apply to the whole pattern (unless
-there are other changes inside subpatterns). If there is more than one setting
-of the same option at top level, the rightmost setting is used.
-</P>
-<P>
-If an option change occurs inside a subpattern, the effect is different. This
-is a change of behaviour in Perl 5.005. An option change inside a subpattern
-affects only that part of the subpattern that follows it, so
-</P>
-<P>
-<PRE>
- (a(?i)b)c
-</PRE>
-</P>
-<P>
-matches abc and aBc and no other strings (assuming PCRE_CASELESS is not used).
-By this means, options can be made to have different settings in different
-parts of the pattern. Any changes made in one alternative do carry on
-into subsequent branches within the same subpattern. For example,
-</P>
-<P>
-<PRE>
- (a(?i)b|c)
-</PRE>
-</P>
-<P>
-matches "ab", "aB", "c", and "C", even though when matching "C" the first
-branch is abandoned before the option setting. This is because the effects of
-option settings happen at compile time. There would be some very weird
-behaviour otherwise.
-</P>
-<P>
-The PCRE-specific options PCRE_UNGREEDY and PCRE_EXTRA can be changed in the
-same way as the Perl-compatible options by using the characters U and X
-respectively. The (?X) flag setting is special in that it must always occur
-earlier in the pattern than any of the additional features it turns on, even
-when it is at top level. It is best put at the start.
-</P>
-<LI><A NAME="SEC21" HREF="#TOC1">SUBPATTERNS</A>
-<P>
-Subpatterns are delimited by parentheses (round brackets), which can be nested.
-Marking part of a pattern as a subpattern does two things:
-</P>
-<P>
-1. It localizes a set of alternatives. For example, the pattern
-</P>
-<P>
-<PRE>
- cat(aract|erpillar|)
-</PRE>
-</P>
-<P>
-matches one of the words "cat", "cataract", or "caterpillar". Without the
-parentheses, it would match "cataract", "erpillar" or the empty string.
-</P>
-<P>
-2. It sets up the subpattern as a capturing subpattern (as defined above).
-When the whole pattern matches, that portion of the subject string that matched
-the subpattern is passed back to the caller via the <I>ovector</I> argument of
-<B>pcre_exec()</B>. Opening parentheses are counted from left to right (starting
-from 1) to obtain the numbers of the capturing subpatterns.
-</P>
-<P>
-For example, if the string "the red king" is matched against the pattern
-</P>
-<P>
-<PRE>
- the ((red|white) (king|queen))
-</PRE>
-</P>
-<P>
-the captured substrings are "red king", "red", and "king", and are numbered 1,
-2, and 3, respectively.
-</P>
-<P>
-The fact that plain parentheses fulfil two functions is not always helpful.
-There are often times when a grouping subpattern is required without a
-capturing requirement. If an opening parenthesis is followed by "?:", the
-subpattern does not do any capturing, and is not counted when computing the
-number of any subsequent capturing subpatterns. For example, if the string "the
-white queen" is matched against the pattern
-</P>
-<P>
-<PRE>
- the ((?:red|white) (king|queen))
-</PRE>
-</P>
-<P>
-the captured substrings are "white queen" and "queen", and are numbered 1 and
-2. The maximum number of captured substrings is 99, and the maximum number of
-all subpatterns, both capturing and non-capturing, is 200.
-</P>
-<P>
-As a convenient shorthand, if any option settings are required at the start of
-a non-capturing subpattern, the option letters may appear between the "?" and
-the ":". Thus the two patterns
-</P>
-<P>
-<PRE>
- (?i:saturday|sunday)
- (?:(?i)saturday|sunday)
-</PRE>
-</P>
-<P>
-match exactly the same set of strings. Because alternative branches are tried
-from left to right, and options are not reset until the end of the subpattern
-is reached, an option setting in one branch does affect subsequent branches, so
-the above patterns match "SUNDAY" as well as "Saturday".
-</P>
-<LI><A NAME="SEC22" HREF="#TOC1">REPETITION</A>
-<P>
-Repetition is specified by quantifiers, which can follow any of the following
-items:
-</P>
-<P>
-<PRE>
- a single character, possibly escaped
- the . metacharacter
- a character class
- a back reference (see next section)
- a parenthesized subpattern (unless it is an assertion - see below)
-</PRE>
-</P>
-<P>
-The general repetition quantifier specifies a minimum and maximum number of
-permitted matches, by giving the two numbers in curly brackets (braces),
-separated by a comma. The numbers must be less than 65536, and the first must
-be less than or equal to the second. For example:
-</P>
-<P>
-<PRE>
- z{2,4}
-</PRE>
-</P>
-<P>
-matches "zz", "zzz", or "zzzz". A closing brace on its own is not a special
-character. If the second number is omitted, but the comma is present, there is
-no upper limit; if the second number and the comma are both omitted, the
-quantifier specifies an exact number of required matches. Thus
-</P>
-<P>
-<PRE>
- [aeiou]{3,}
-</PRE>
-</P>
-<P>
-matches at least 3 successive vowels, but may match many more, while
-</P>
-<P>
-<PRE>
- \d{8}
-</PRE>
-</P>
-<P>
-matches exactly 8 digits. An opening curly bracket that appears in a position
-where a quantifier is not allowed, or one that does not match the syntax of a
-quantifier, is taken as a literal character. For example, {,6} is not a
-quantifier, but a literal string of four characters.
-</P>
-<P>
-The quantifier {0} is permitted, causing the expression to behave as if the
-previous item and the quantifier were not present.
-</P>
-<P>
-For convenience (and historical compatibility) the three most common
-quantifiers have single-character abbreviations:
-</P>
-<P>
-<PRE>
- * is equivalent to {0,}
- + is equivalent to {1,}
- ? is equivalent to {0,1}
-</PRE>
-</P>
-<P>
-It is possible to construct infinite loops by following a subpattern that can
-match no characters with a quantifier that has no upper limit, for example:
-</P>
-<P>
-<PRE>
- (a?)*
-</PRE>
-</P>
-<P>
-Earlier versions of Perl and PCRE used to give an error at compile time for
-such patterns. However, because there are cases where this can be useful, such
-patterns are now accepted, but if any repetition of the subpattern does in fact
-match no characters, the loop is forcibly broken.
-</P>
-<P>
-By default, the quantifiers are "greedy", that is, they match as much as
-possible (up to the maximum number of permitted times), without causing the
-rest of the pattern to fail. The classic example of where this gives problems
-is in trying to match comments in C programs. These appear between the
-sequences /* and */ and within the sequence, individual * and / characters may
-appear. An attempt to match C comments by applying the pattern
-</P>
-<P>
-<PRE>
- /\*.*\*/
-</PRE>
-</P>
-<P>
-to the string
-</P>
-<P>
-<PRE>
- /* first comment */ not comment /* second comment */
-</PRE>
-</P>
-<P>
-fails, because it matches the entire string owing to the greediness of the .*
-item.
-</P>
-<P>
-However, if a quantifier is followed by a question mark, it ceases to be
-greedy, and instead matches the minimum number of times possible, so the
-pattern
-</P>
-<P>
-<PRE>
- /\*.*?\*/
-</PRE>
-</P>
-<P>
-does the right thing with the C comments. The meaning of the various
-quantifiers is not otherwise changed, just the preferred number of matches.
-Do not confuse this use of question mark with its use as a quantifier in its
-own right. Because it has two uses, it can sometimes appear doubled, as in
-</P>
-<P>
-<PRE>
- \d??\d
-</PRE>
-</P>
-<P>
-which matches one digit by preference, but can match two if that is the only
-way the rest of the pattern matches.
-</P>
-<P>
-If the PCRE_UNGREEDY option is set (an option which is not available in Perl),
-the quantifiers are not greedy by default, but individual ones can be made
-greedy by following them with a question mark. In other words, it inverts the
-default behaviour.
-</P>
-<P>
-When a parenthesized subpattern is quantified with a minimum repeat count that
-is greater than 1 or with a limited maximum, more store is required for the
-compiled pattern, in proportion to the size of the minimum or maximum.
-</P>
-<P>
-If a pattern starts with .* or .{0,} and the PCRE_DOTALL option (equivalent
-to Perl's /s) is set, thus allowing the . to match newlines, the pattern is
-implicitly anchored, because whatever follows will be tried against every
-character position in the subject string, so there is no point in retrying the
-overall match at any position after the first. PCRE treats such a pattern as
-though it were preceded by \A. In cases where it is known that the subject
-string contains no newlines, it is worth setting PCRE_DOTALL when the pattern
-begins with .* in order to obtain this optimization, or alternatively using ^
-to indicate anchoring explicitly.
-</P>
-<P>
-When a capturing subpattern is repeated, the value captured is the substring
-that matched the final iteration. For example, after
-</P>
-<P>
-<PRE>
- (tweedle[dume]{3}\s*)+
-</PRE>
-</P>
-<P>
-has matched "tweedledum tweedledee" the value of the captured substring is
-"tweedledee". However, if there are nested capturing subpatterns, the
-corresponding captured values may have been set in previous iterations. For
-example, after
-</P>
-<P>
-<PRE>
- /(a|(b))+/
-</PRE>
-</P>
-<P>
-matches "aba" the value of the second captured substring is "b".
-</P>
-<LI><A NAME="SEC23" HREF="#TOC1">BACK REFERENCES</A>
-<P>
-Outside a character class, a backslash followed by a digit greater than 0 (and
-possibly further digits) is a back reference to a capturing subpattern earlier
-(i.e. to its left) in the pattern, provided there have been that many previous
-capturing left parentheses.
-</P>
-<P>
-However, if the decimal number following the backslash is less than 10, it is
-always taken as a back reference, and causes an error only if there are not
-that many capturing left parentheses in the entire pattern. In other words, the
-parentheses that are referenced need not be to the left of the reference for
-numbers less than 10. See the section entitled "Backslash" above for further
-details of the handling of digits following a backslash.
-</P>
-<P>
-A back reference matches whatever actually matched the capturing subpattern in
-the current subject string, rather than anything matching the subpattern
-itself. So the pattern
-</P>
-<P>
-<PRE>
- (sens|respons)e and \1ibility
-</PRE>
-</P>
-<P>
-matches "sense and sensibility" and "response and responsibility", but not
-"sense and responsibility". If caseful matching is in force at the time of the
-back reference, the case of letters is relevant. For example,
-</P>
-<P>
-<PRE>
- ((?i)rah)\s+\1
-</PRE>
-</P>
-<P>
-matches "rah rah" and "RAH RAH", but not "RAH rah", even though the original
-capturing subpattern is matched caselessly.
-</P>
-<P>
-There may be more than one back reference to the same subpattern. If a
-subpattern has not actually been used in a particular match, any back
-references to it always fail. For example, the pattern
-</P>
-<P>
-<PRE>
- (a|(bc))\2
-</PRE>
-</P>
-<P>
-always fails if it starts to match "a" rather than "bc". Because there may be
-up to 99 back references, all digits following the backslash are taken
-as part of a potential back reference number. If the pattern continues with a
-digit character, some delimiter must be used to terminate the back reference.
-If the PCRE_EXTENDED option is set, this can be whitespace. Otherwise an empty
-comment can be used.
-</P>
-<P>
-A back reference that occurs inside the parentheses to which it refers fails
-when the subpattern is first used, so, for example, (a\1) never matches.
-However, such references can be useful inside repeated subpatterns. For
-example, the pattern
-</P>
-<P>
-<PRE>
- (a|b\1)+
-</PRE>
-</P>
-<P>
-matches any number of "a"s and also "aba", "ababbaa" etc. At each iteration of
-the subpattern, the back reference matches the character string corresponding
-to the previous iteration. In order for this to work, the pattern must be such
-that the first iteration does not need to match the back reference. This can be
-done using alternation, as in the example above, or by a quantifier with a
-minimum of zero.
-</P>
-<LI><A NAME="SEC24" HREF="#TOC1">ASSERTIONS</A>
-<P>
-An assertion is a test on the characters following or preceding the current
-matching point that does not actually consume any characters. The simple
-assertions coded as \b, \B, \A, \Z, \z, ^ and $ are described above. More
-complicated assertions are coded as subpatterns. There are two kinds: those
-that look ahead of the current position in the subject string, and those that
-look behind it.
-</P>
-<P>
-An assertion subpattern is matched in the normal way, except that it does not
-cause the current matching position to be changed. Lookahead assertions start
-with (?= for positive assertions and (?! for negative assertions. For example,
-</P>
-<P>
-<PRE>
- \w+(?=;)
-</PRE>
-</P>
-<P>
-matches a word followed by a semicolon, but does not include the semicolon in
-the match, and
-</P>
-<P>
-<PRE>
- foo(?!bar)
-</PRE>
-</P>
-<P>
-matches any occurrence of "foo" that is not followed by "bar". Note that the
-apparently similar pattern
-</P>
-<P>
-<PRE>
- (?!foo)bar
-</PRE>
-</P>
-<P>
-does not find an occurrence of "bar" that is preceded by something other than
-"foo"; it finds any occurrence of "bar" whatsoever, because the assertion
-(?!foo) is always true when the next three characters are "bar". A
-lookbehind assertion is needed to achieve this effect.
-</P>
-<P>
-Lookbehind assertions start with (?&#60;= for positive assertions and (?&#60;! for
-negative assertions. For example,
-</P>
-<P>
-<PRE>
- (?&#60;!foo)bar
-</PRE>
-</P>
-<P>
-does find an occurrence of "bar" that is not preceded by "foo". The contents of
-a lookbehind assertion are restricted such that all the strings it matches must
-have a fixed length. However, if there are several alternatives, they do not
-all have to have the same fixed length. Thus
-</P>
-<P>
-<PRE>
- (?&#60;=bullock|donkey)
-</PRE>
-</P>
-<P>
-is permitted, but
-</P>
-<P>
-<PRE>
- (?&#60;!dogs?|cats?)
-</PRE>
-</P>
-<P>
-causes an error at compile time. Branches that match different length strings
-are permitted only at the top level of a lookbehind assertion. This is an
-extension compared with Perl 5.005, which requires all branches to match the
-same length of string. An assertion such as
-</P>
-<P>
-<PRE>
- (?&#60;=ab(c|de))
-</PRE>
-</P>
-<P>
-is not permitted, because its single top-level branch can match two different
-lengths, but it is acceptable if rewritten to use two top-level branches:
-</P>
-<P>
-<PRE>
- (?&#60;=abc|abde)
-</PRE>
-</P>
-<P>
-The implementation of lookbehind assertions is, for each alternative, to
-temporarily move the current position back by the fixed width and then try to
-match. If there are insufficient characters before the current position, the
-match is deemed to fail. Lookbehinds in conjunction with once-only subpatterns
-can be particularly useful for matching at the ends of strings; an example is
-given at the end of the section on once-only subpatterns.
-</P>
-<P>
-Several assertions (of any sort) may occur in succession. For example,
-</P>
-<P>
-<PRE>
- (?&#60;=\d{3})(?&#60;!999)foo
-</PRE>
-</P>
-<P>
-matches "foo" preceded by three digits that are not "999". Notice that each of
-the assertions is applied independently at the same point in the subject
-string. First there is a check that the previous three characters are all
-digits, and then there is a check that the same three characters are not "999".
-This pattern does <I>not</I> match "foo" preceded by six characters, the first
-three of which are digits and the last three of which are not "999". For example, it
-doesn't match "123abcfoo". A pattern to do that is
-</P>
-<P>
-<PRE>
- (?&#60;=\d{3}...)(?&#60;!999)foo
-</PRE>
-</P>
-<P>
-This time the first assertion looks at the preceding six characters, checking
-that the first three are digits, and then the second assertion checks that the
-preceding three characters are not "999".
-</P>
-<P>
-Assertions can be nested in any combination. For example,
-</P>
-<P>
-<PRE>
- (?&#60;=(?&#60;!foo)bar)baz
-</PRE>
-</P>
-<P>
-matches an occurrence of "baz" that is preceded by "bar" which in turn is not
-preceded by "foo", while
-</P>
-<P>
-<PRE>
- (?&#60;=\d{3}(?!999)...)foo
-</PRE>
-</P>
-<P>
-is another pattern which matches "foo" preceded by three digits and any three
-characters that are not "999".
-</P>
-<P>
-Assertion subpatterns are not capturing subpatterns, and may not be repeated,
-because it makes no sense to assert the same thing several times. If any kind
-of assertion contains capturing subpatterns within it, these are counted for
-the purposes of numbering the capturing subpatterns in the whole pattern.
-However, substring capturing is carried out only for positive assertions,
-because it does not make sense for negative assertions.
-</P>
-<P>
-Assertions count towards the maximum of 200 parenthesized subpatterns.
-</P>
-<LI><A NAME="SEC25" HREF="#TOC1">ONCE-ONLY SUBPATTERNS</A>
-<P>
-With both maximizing and minimizing repetition, failure of what follows
-normally causes the repeated item to be re-evaluated to see if a different
-number of repeats allows the rest of the pattern to match. Sometimes it is
-useful to prevent this, either to change the nature of the match, or to cause
-it to fail earlier than it otherwise might, when the author of the pattern knows
-there is no point in carrying on.
-</P>
-<P>
-Consider, for example, the pattern \d+foo when applied to the subject line
-</P>
-<P>
-<PRE>
- 123456bar
-</PRE>
-</P>
-<P>
-After matching all 6 digits and then failing to match "foo", the normal
-action of the matcher is to try again with only 5 digits matching the \d+
-item, and then with 4, and so on, before ultimately failing. Once-only
-subpatterns provide the means for specifying that once a portion of the pattern
-has matched, it is not to be re-evaluated in this way, so the matcher would
-give up immediately on failing to match "foo" the first time. The notation is
-another kind of special parenthesis, starting with (?&#62; as in this example:
-</P>
-<P>
-<PRE>
- (?&#62;\d+)foo
-</PRE>
-</P>
-<P>
-This kind of parenthesis "locks up" the part of the pattern it contains once
-it has matched, and a failure further into the pattern is prevented from
-backtracking into it. Backtracking past it to previous items, however, works as
-normal.
-</P>
-<P>
-An alternative description is that a subpattern of this type matches the string
-of characters that an identical standalone pattern would match, if anchored at
-the current point in the subject string.
-</P>
-<P>
-Once-only subpatterns are not capturing subpatterns. Simple cases such as the
-above example can be thought of as a maximizing repeat that must swallow
-everything it can. So, while both \d+ and \d+? are prepared to adjust the
-number of digits they match in order to make the rest of the pattern match,
-(?&#62;\d+) can only match an entire sequence of digits.
-</P>
-<P>
-This construction can of course contain arbitrarily complicated subpatterns,
-and it can be nested.
-</P>
-<P>
-Once-only subpatterns can be used in conjunction with lookbehind assertions to
-specify efficient matching at the end of the subject string. Consider a simple
-pattern such as
-</P>
-<P>
-<PRE>
- abcd$
-</PRE>
-</P>
-<P>
-when applied to a long string which does not match. Because matching proceeds
-from left to right, PCRE will look for each "a" in the subject and then see if
-what follows matches the rest of the pattern. If the pattern is specified as
-</P>
-<P>
-<PRE>
- ^.*abcd$
-</PRE>
-</P>
-<P>
-the initial .* matches the entire string at first, but when this fails (because
-there is no following "a"), it backtracks to match all but the last character,
-then all but the last two characters, and so on. Once again the search for "a"
-covers the entire string, from right to left, so we are no better off. However,
-if the pattern is written as
-</P>
-<P>
-<PRE>
- ^(?&#62;.*)(?&#60;=abcd)
-</PRE>
-</P>
-<P>
-there can be no backtracking for the .* item; it can match only the entire
-string. The subsequent lookbehind assertion does a single test on the last four
-characters. If it fails, the match fails immediately. For long strings, this
-approach makes a significant difference to the processing time.
-</P>
-<P>
-When a pattern contains an unlimited repeat inside a subpattern that can itself
-be repeated an unlimited number of times, the use of a once-only subpattern is
-the only way to avoid some failing matches taking a very long time indeed.
-The pattern
-</P>
-<P>
-<PRE>
- (\D+|&#60;\d+&#62;)*[!?]
-</PRE>
-</P>
-<P>
-matches an unlimited number of substrings that either consist of non-digits, or
-digits enclosed in &#60;&#62;, followed by either ! or ?. When it matches, it runs
-quickly. However, if it is applied to
-</P>
-<P>
-<PRE>
- aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-</PRE>
-</P>
-<P>
-it takes a long time before reporting failure. This is because the string can
-be divided between the two repeats in a large number of ways, and all have to
-be tried. (The example used [!?] rather than a single character at the end,
-because both PCRE and Perl have an optimization that allows for fast failure
-when a single character is used. They remember the last single character that
-is required for a match, and fail early if it is not present in the string.)
-If the pattern is changed to
-</P>
-<P>
-<PRE>
- ((?&#62;\D+)|&#60;\d+&#62;)*[!?]
-</PRE>
-</P>
-<P>
-sequences of non-digits cannot be broken, and failure happens quickly.
-</P>
-<LI><A NAME="SEC26" HREF="#TOC1">CONDITIONAL SUBPATTERNS</A>
-<P>
-It is possible to cause the matching process to obey a subpattern
-conditionally or to choose between two alternative subpatterns, depending on
-the result of an assertion, or whether a previous capturing subpattern matched
-or not. The two possible forms of conditional subpattern are
-</P>
-<P>
-<PRE>
- (?(condition)yes-pattern)
- (?(condition)yes-pattern|no-pattern)
-</PRE>
-</P>
-<P>
-If the condition is satisfied, the yes-pattern is used; otherwise the
-no-pattern (if present) is used. If there are more than two alternatives in the
-subpattern, a compile-time error occurs.
-</P>
-<P>
-There are two kinds of condition. If the text between the parentheses consists
-of a sequence of digits, the condition is satisfied if the capturing subpattern
-of that number has previously matched. The number must be greater than zero.
-Consider the following pattern, which contains non-significant white space to
-make it more readable (assume the PCRE_EXTENDED option) and to divide it into
-three parts for ease of discussion:
-</P>
-<P>
-<PRE>
- ( \( )? [^()]+ (?(1) \) )
-</PRE>
-</P>
-<P>
-The first part matches an optional opening parenthesis, and if that
-character is present, sets it as the first captured substring. The second part
-matches one or more characters that are not parentheses. The third part is a
-conditional subpattern that tests whether the first set of parentheses matched
-or not. If they did, that is, if the subject started with an opening parenthesis,
-the condition is true, and so the yes-pattern is executed and a closing
-parenthesis is required. Otherwise, since no-pattern is not present, the
-subpattern matches nothing. In other words, this pattern matches a sequence of
-non-parentheses, optionally enclosed in parentheses.
-</P>
-<P>
-If the condition is not a sequence of digits, it must be an assertion. This may
-be a positive or negative lookahead or lookbehind assertion. Consider this
-pattern, again containing non-significant white space, and with the two
-alternatives on the second line:
-</P>
-<P>
-<PRE>
- (?(?=[^a-z]*[a-z])
- \d{2}-[a-z]{3}-\d{2} | \d{2}-\d{2}-\d{2} )
-</PRE>
-</P>
-<P>
-The condition is a positive lookahead assertion that matches an optional
-sequence of non-letters followed by a letter. In other words, it tests for the
-presence of at least one letter in the subject. If a letter is found, the
-subject is matched against the first alternative; otherwise it is matched
-against the second. This pattern matches strings in one of the two forms
-dd-aaa-dd or dd-dd-dd, where aaa are letters and dd are digits.
-</P>
-<LI><A NAME="SEC27" HREF="#TOC1">COMMENTS</A>
-<P>
-The sequence (?# marks the start of a comment which continues up to the next
-closing parenthesis. Nested parentheses are not permitted. The characters
-that make up a comment play no part in the pattern matching at all.
-</P>
-<P>
-If the PCRE_EXTENDED option is set, an unescaped # character outside a
-character class introduces a comment that continues up to the next newline
-character in the pattern.
-</P>
-<LI><A NAME="SEC28" HREF="#TOC1">RECURSIVE PATTERNS</A>
-<P>
-Consider the problem of matching a string in parentheses, allowing for
-unlimited nested parentheses. Without the use of recursion, the best that can
-be done is to use a pattern that matches up to some fixed depth of nesting. It
-is not possible to handle an arbitrary nesting depth. Perl 5.6 has provided an
-experimental facility that allows regular expressions to recurse (amongst other
-things). It does this by interpolating Perl code in the expression at run time,
-and the code can refer to the expression itself. A Perl pattern to solve the
-parentheses problem can be created like this:
-</P>
-<P>
-<PRE>
- $re = qr{\( (?: (?&#62;[^()]+) | (?p{$re}) )* \)}x;
-</PRE>
-</P>
-<P>
-The (?p{...}) item interpolates Perl code at run time, and in this case refers
-recursively to the pattern in which it appears. Obviously, PCRE cannot support
-the interpolation of Perl code. Instead, the special item (?R) is provided for
-the specific case of recursion. This PCRE pattern solves the parentheses
-problem (assume the PCRE_EXTENDED option is set so that white space is
-ignored):
-</P>
-<P>
-<PRE>
- \( ( (?&#62;[^()]+) | (?R) )* \)
-</PRE>
-</P>
-<P>
-First it matches an opening parenthesis. Then it matches any number of
-substrings which can either be a sequence of non-parentheses, or a recursive
-match of the pattern itself (i.e. a correctly parenthesized substring). Finally
-there is a closing parenthesis.
-</P>
-<P>
-This particular example pattern contains nested unlimited repeats, and so the
-use of a once-only subpattern for matching strings of non-parentheses is
-important when applying the pattern to strings that do not match. For example,
-when it is applied to
-</P>
-<P>
-<PRE>
- (aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa()
-</PRE>
-</P>
-<P>
-it yields "no match" quickly. However, if a once-only subpattern is not used,
-the match runs for a very long time indeed because there are so many different
-ways the + and * repeats can carve up the subject, and all have to be tested
-before failure can be reported.
-</P>
-<P>
-The values set for any capturing subpatterns are those from the outermost level
-of the recursion at which the subpattern value is set. If the pattern above is
-matched against
-</P>
-<P>
-<PRE>
- (ab(cd)ef)
-</PRE>
-</P>
-<P>
-the value for the capturing parentheses is "ef", which is the last value taken
-on at the top level. If additional parentheses are added, giving
-</P>
-<P>
-<PRE>
- \( ( ( (?&#62;[^()]+) | (?R) )* ) \)
- ^ ^
- ^ ^
-</PRE>
-the string they capture is "ab(cd)ef", the contents of the top level
-parentheses. If there are more than 15 capturing parentheses in a pattern, PCRE
-has to obtain extra memory to store data during a recursion, which it does by
-using <B>pcre_malloc</B>, freeing it via <B>pcre_free</B> afterwards. If no
-memory can be obtained, it saves data for the first 15 capturing parentheses
-only, as there is no way to give an out-of-memory error from within a
-recursion.
-</P>
-<LI><A NAME="SEC29" HREF="#TOC1">PERFORMANCE</A>
-<P>
-Certain items that may appear in patterns are more efficient than others. It is
-more efficient to use a character class like [aeiou] than a set of alternatives
-such as (a|e|i|o|u). In general, the simplest construction that provides the
-required behaviour is usually the most efficient. Jeffrey Friedl's book
-contains a lot of discussion about optimizing regular expressions for efficient
-performance.
-</P>
-<P>
-When a pattern begins with .* and the PCRE_DOTALL option is set, the pattern is
-implicitly anchored by PCRE, since it can match only at the start of a subject
-string. However, if PCRE_DOTALL is not set, PCRE cannot make this optimization,
-because the . metacharacter does not then match a newline, and if the subject
-string contains newlines, the pattern may match from the character immediately
-following one of them instead of from the very start. For example, the pattern
-</P>
-<P>
-<PRE>
- (.*) second
-</PRE>
-</P>
-<P>
-matches the subject "first\nand second" (where \n stands for a newline
-character) with the first captured substring being "and". In order to do this,
-PCRE has to retry the match starting after every newline in the subject.
-</P>
-<P>
-If you are using such a pattern with subject strings that do not contain
-newlines, the best performance is obtained by setting PCRE_DOTALL, or starting
-the pattern with ^.* to indicate explicit anchoring. That saves PCRE from
-having to scan along the subject looking for a newline to restart at.
-</P>
-<P>
-Beware of patterns that contain nested indefinite repeats. These can take a
-long time to run when applied to a string that does not match. Consider the
-pattern fragment
-</P>
-<P>
-<PRE>
- (a+)*
-</PRE>
-</P>
-<P>
-This can match "aaaa" in 16 different ways, and this number increases very
-rapidly as the string gets longer. (The * repeat can match 0, 1, 2, 3, or 4
-times, and for each of those cases other than 0, the + repeats can match
-different numbers of times.) When the remainder of the pattern is such that the
-entire match is going to fail, PCRE has in principle to try every possible
-variation, and this can take an extremely long time.
-</P>
-<P>
-An optimization catches some of the more simple cases such as
-</P>
-<P>
-<PRE>
- (a+)*b
-</PRE>
-</P>
-<P>
-where a literal character follows. Before embarking on the standard matching
-procedure, PCRE checks that there is a "b" later in the subject string, and if
-there is not, it fails the match immediately. However, when there is no
-following literal this optimization cannot be used. You can see the difference
-by comparing the behaviour of
-</P>
-<P>
-<PRE>
- (a+)*\d
-</PRE>
-</P>
-<P>
-with the pattern above. The pattern above, (a+)*b, fails almost instantly when
-applied to a whole line of "a" characters, whereas (a+)*\d takes an
-appreciable time with strings longer than about 20 characters.
-</P>
-<LI><A NAME="SEC30" HREF="#TOC1">UTF-8 SUPPORT</A>
-<P>
-Starting at release 3.3, PCRE has some support for character strings encoded
-in the UTF-8 format. This is incomplete, and is regarded as experimental. In
-order to use it, you must configure PCRE to include UTF-8 support in the code,
-and, in addition, you must call <B>pcre_compile()</B> with the PCRE_UTF8 option
-flag. When you do this, both the pattern and any subject strings that are
-matched against it are treated as UTF-8 strings instead of just strings of
-bytes, but only in the cases that are mentioned below.
-</P>
-<P>
-If you compile PCRE with UTF-8 support, but do not use it at run time, the
-library will be a bit bigger, but the additional run time overhead is limited
-to testing the PCRE_UTF8 flag in several places, so should not be very large.
-</P>
-<P>
-PCRE assumes that the strings it is given contain valid UTF-8 codes. It does
-not diagnose invalid UTF-8 strings. If you pass invalid UTF-8 strings to PCRE,
-the results are undefined.
-</P>
-<P>
-Running with PCRE_UTF8 set causes these changes in the way PCRE works:
-</P>
-<P>
-1. In a pattern, the escape sequence \x{...}, where the contents of the braces
-is a string of hexadecimal digits, is interpreted as a UTF-8 character whose
-code number is the given hexadecimal number, for example: \x{1234}. This
-inserts from one to six literal bytes into the pattern, using the UTF-8
-encoding. If a non-hexadecimal digit appears between the braces, the item is
-not recognized.
-</P>
-<P>
-2. The original hexadecimal escape sequence, \xhh, generates a two-byte UTF-8
-character if its value is greater than 127.
-</P>
-<P>
-3. Repeat quantifiers are NOT correctly handled if they follow a multibyte
-character. For example, \x{100}* and \xc3+ do not work. If you want to
-repeat such characters, you must enclose them in non-capturing parentheses,
-for example (?:\x{100}), at present.
-</P>
-<P>
-4. The dot metacharacter matches one UTF-8 character instead of a single byte.
-</P>
-<P>
-5. Unlike literal UTF-8 characters, the dot metacharacter followed by a
-repeat quantifier does operate correctly on UTF-8 characters instead of
-single bytes.
-</P>
-<P>
-6. Although the \x{...} escape is permitted in a character class, characters
-whose values are greater than 255 cannot be included in a class.
-</P>
-<P>
-7. A class is matched against a UTF-8 character instead of just a single byte,
-but it can match only characters whose values are less than 256. Characters
-with greater values always fail to match a class.
-</P>
-<P>
-8. Repeated classes work correctly on multiple characters.
-</P>
-<P>
-9. Classes containing just a single character whose value is greater than 127
-(but less than 256), for example, [\x80] or [^\x{93}], do not work because
-these are optimized into single byte matches. In the first case, of course,
-the class brackets are just redundant.
-</P>
-<P>
-10. Lookbehind assertions move backwards in the subject by a fixed number of
-characters instead of a fixed number of bytes. Simple cases have been tested
-to work correctly, but there may be hidden gotchas herein.
-</P>
-<P>
-11. The character types such as \d and \w do not work correctly with UTF-8
-characters. They continue to test a single byte.
-</P>
-<P>
-12. Anything not explicitly mentioned here continues to work in bytes rather
-than in characters.
-</P>
-<P>
-The following UTF-8 features of Perl 5.6 are not implemented:
-</P>
-<P>
-1. The escape sequence \C to match a single byte.
-</P>
-<P>
-2. The use of Unicode tables and properties and escapes \p, \P, and \X.
-</P>
-<LI><A NAME="SEC31" HREF="#TOC1">SAMPLE PROGRAM</A>
-<P>
-The code below is a simple, complete demonstration program, to get you started
-with using PCRE. This code is also supplied in the file <I>pcredemo.c</I> in the
-PCRE distribution.
-</P>
-<P>
-The program compiles the regular expression that is its first argument, and
-matches it against the subject string in its second argument. No options are
-set, and default character tables are used. If matching succeeds, the program
-outputs the portion of the subject that matched, together with the contents of
-any captured substrings.
-</P>
-<P>
-On a Unix system that has PCRE installed in <I>/usr/local</I>, you can compile
-the demonstration program using a command like this:
-</P>
-<P>
-<PRE>
- gcc -o pcredemo pcredemo.c -I/usr/local/include -L/usr/local/lib -lpcre
-</PRE>
-</P>
-<P>
-Then you can run simple tests like this:
-</P>
-<P>
-<PRE>
- ./pcredemo 'cat|dog' 'the cat sat on the mat'
-</PRE>
-</P>
-<P>
-Note that there is a much more comprehensive test program, called
-<B>pcretest</B>, which supports many more facilities for testing regular
-expressions. The <B>pcredemo</B> program is provided as a simple coding example.
-</P>
-<P>
-On some operating systems (e.g. Solaris) you may get an error like this when
-you try to run <B>pcredemo</B>:
-</P>
-<P>
-<PRE>
- ld.so.1: a.out: fatal: libpcre.so.0: open failed: No such file or directory
-</PRE>
-</P>
-<P>
-This is caused by the way shared library support works on those systems. You
-need to add
-</P>
-<P>
-<PRE>
- -R/usr/local/lib
-</PRE>
-</P>
-<P>
-to the compile command to get round this problem. Here's the code:
-</P>
-<P>
-<PRE>
- #include &#60;stdio.h&#62;
- #include &#60;string.h&#62;
- #include &#60;pcre.h&#62;
-</PRE>
-</P>
-<P>
-<PRE>
- #define OVECCOUNT 30 /* should be a multiple of 3 */
-</PRE>
-</P>
-<P>
-<PRE>
- int main(int argc, char **argv)
- {
- pcre *re;
- const char *error;
- int erroffset;
- int ovector[OVECCOUNT];
- int rc, i;
-</PRE>
-</P>
-<P>
-<PRE>
- if (argc != 3)
- {
- printf("Two arguments required: a regex and a "
- "subject string\n");
- return 1;
- }
-</PRE>
-</P>
-<P>
-<PRE>
- /* Compile the regular expression in the first argument */
-</PRE>
-</P>
-<P>
-<PRE>
- re = pcre_compile(
- argv[1], /* the pattern */
- 0, /* default options */
- &error, /* for error message */
- &erroffset, /* for error offset */
- NULL); /* use default character tables */
-</PRE>
-</P>
-<P>
-<PRE>
- /* Compilation failed: print the error message and exit */
-</PRE>
-</P>
-<P>
-<PRE>
- if (re == NULL)
- {
- printf("PCRE compilation failed at offset %d: %s\n",
- erroffset, error);
- return 1;
- }
-</PRE>
-</P>
-<P>
-<PRE>
- /* Compilation succeeded: match the subject in the second
- argument */
-</PRE>
-</P>
-<P>
-<PRE>
- rc = pcre_exec(
- re, /* the compiled pattern */
- NULL, /* we didn't study the pattern */
- argv[2], /* the subject string */
- (int)strlen(argv[2]), /* the length of the subject */
- 0, /* start at offset 0 in the subject */
- 0, /* default options */
- ovector, /* vector for substring information */
- OVECCOUNT); /* number of elements in the vector */
-</PRE>
-</P>
-<P>
-<PRE>
- /* Matching failed: handle error cases */
-</PRE>
-</P>
-<P>
-<PRE>
- if (rc &#60; 0)
- {
- switch(rc)
- {
- case PCRE_ERROR_NOMATCH: printf("No match\n"); break;
- /*
- Handle other special cases if you like
- */
- default: printf("Matching error %d\n", rc); break;
- }
- return 1;
- }
-</PRE>
-</P>
-<P>
-<PRE>
- /* Match succeeded */
-</PRE>
-</P>
-<P>
-<PRE>
- printf("Match succeeded\n");
-</PRE>
-</P>
-<P>
-<PRE>
- /* The output vector wasn't big enough */
-</PRE>
-</P>
-<P>
-<PRE>
- if (rc == 0)
- {
- rc = OVECCOUNT/3;
- printf("ovector only has room for %d captured "
- "substrings\n", rc - 1);
- }
-</PRE>
-</P>
-<P>
-<PRE>
- /* Show substrings stored in the output vector */
-</PRE>
-</P>
-<P>
-<PRE>
- for (i = 0; i &#60; rc; i++)
- {
- char *substring_start = argv[2] + ovector[2*i];
- int substring_length = ovector[2*i+1] - ovector[2*i];
- printf("%2d: %.*s\n", i, substring_length,
- substring_start);
- }
-</PRE>
-</P>
-<P>
-<PRE>
- return 0;
- }
-</PRE>
-</P>
-<LI><A NAME="SEC32" HREF="#TOC1">AUTHOR</A>
-<P>
-Philip Hazel &#60;ph10@cam.ac.uk&#62;
-<BR>
-University Computing Service,
-<BR>
-New Museums Site,
-<BR>
-Cambridge CB2 3QG, England.
-<BR>
-Phone: +44 1223 334714
-</P>
-<P>
-Last updated: 15 August 2001
-<BR>
-Copyright (c) 1997-2001 University of Cambridge.
diff --git a/ext/pcre/pcrelib/doc/pcre.txt b/ext/pcre/pcrelib/doc/pcre.txt
deleted file mode 100644
index 46ede59754..0000000000
--- a/ext/pcre/pcrelib/doc/pcre.txt
+++ /dev/null
@@ -1,2307 +0,0 @@
-NAME
- pcre - Perl-compatible regular expressions.
-
-
-
-SYNOPSIS
- #include <pcre.h>
-
- pcre *pcre_compile(const char *pattern, int options,
- const char **errptr, int *erroffset,
- const unsigned char *tableptr);
-
- pcre_extra *pcre_study(const pcre *code, int options,
- const char **errptr);
-
- int pcre_exec(const pcre *code, const pcre_extra *extra,
- const char *subject, int length, int startoffset,
- int options, int *ovector, int ovecsize);
-
- int pcre_copy_substring(const char *subject, int *ovector,
- int stringcount, int stringnumber, char *buffer,
- int buffersize);
-
- int pcre_get_substring(const char *subject, int *ovector,
- int stringcount, int stringnumber,
- const char **stringptr);
-
- int pcre_get_substring_list(const char *subject,
- int *ovector, int stringcount, const char ***listptr);
-
- void pcre_free_substring(const char *stringptr);
-
- void pcre_free_substring_list(const char **stringptr);
-
- const unsigned char *pcre_maketables(void);
-
- int pcre_fullinfo(const pcre *code, const pcre_extra *extra,
- int what, void *where);
-
- int pcre_info(const pcre *code, int *optptr, int *firstcharptr);
-
- char *pcre_version(void);
-
- void *(*pcre_malloc)(size_t);
-
- void (*pcre_free)(void *);
-
-
-
-
-DESCRIPTION
- The PCRE library is a set of functions that implement regu-
- lar expression pattern matching using the same syntax and
- semantics as Perl 5, with just a few differences (see
-
- below). The current implementation corresponds to Perl
- 5.005, with some additional features from later versions.
- This includes some experimental, incomplete support for
- UTF-8 encoded strings. Details of exactly what is and what
- is not supported are given below.
-
- PCRE has its own native API, which is described in this
- document. There is also a set of wrapper functions that
- correspond to the POSIX regular expression API. These are
- described in the pcreposix documentation.
-
- The native API function prototypes are defined in the header
- file pcre.h, and on Unix systems the library itself is
- called libpcre.a, so can be accessed by adding -lpcre to the
- command for linking an application which calls it. The
- header file defines the macros PCRE_MAJOR and PCRE_MINOR to
- contain the major and minor release numbers for the library.
- Applications can use these to include support for different
- releases.
-
- The functions pcre_compile(), pcre_study(), and pcre_exec()
- are used for compiling and matching regular expressions. A
- sample program that demonstrates the simplest way of using
- them is given in the file pcredemo.c. The last section of
- this man page describes how to run it.
-
- The functions pcre_copy_substring(), pcre_get_substring(),
- and pcre_get_substring_list() are convenience functions for
- extracting captured substrings from a matched subject
- string; pcre_free_substring() and pcre_free_substring_list()
- are also provided, to free the memory used for extracted
- strings.
-
- The function pcre_maketables() is used (optionally) to build
- a set of character tables in the current locale for passing
- to pcre_compile().
-
- The function pcre_fullinfo() is used to find out information
- about a compiled pattern; pcre_info() is an obsolete version
- which returns only some of the available information, but is
- retained for backwards compatibility. The function
- pcre_version() returns a pointer to a string containing the
- version of PCRE and its date of release.
-
- The global variables pcre_malloc and pcre_free initially
- contain the entry points of the standard malloc() and free()
- functions respectively. PCRE calls the memory management
- functions via these variables, so a calling program can
- replace them if it wishes to intercept the calls. This
- should be done before calling any PCRE functions.
-
-
-
-MULTI-THREADING
- The PCRE functions can be used in multi-threading applica-
- tions, with the proviso that the memory management functions
- pointed to by pcre_malloc and pcre_free are shared by all
- threads.
-
- The compiled form of a regular expression is not altered
- during matching, so the same compiled pattern can safely be
- used by several threads at once.
-
-
-
-COMPILING A PATTERN
- The function pcre_compile() is called to compile a pattern
- into an internal form. The pattern is a C string terminated
- by a binary zero, and is passed in the argument pattern. A
- pointer to a single block of memory that is obtained via
- pcre_malloc is returned. This contains the compiled code and
- related data. The pcre type is defined for the returned
- block; this is a typedef for a structure whose contents are
- not externally defined. It is up to the caller to free the
- memory when it is no longer required.
-
- Although the compiled code of a PCRE regex is relocatable,
- that is, it does not depend on memory location, the complete
- pcre data block is not fully relocatable, because it con-
- tains a copy of the tableptr argument, which is an address
- (see below).
-
- The size of a compiled pattern is roughly proportional to
- the length of the pattern string, except that each character
- class (other than those containing just a single character,
- negated or not) requires 33 bytes, and repeat quantifiers
- with a minimum greater than one or a bounded maximum cause
- the relevant portions of the compiled pattern to be repli-
- cated.
-
- The options argument contains independent bits that affect
- the compilation. It should be zero if no options are
- required. Some of the options, in particular, those that are
- compatible with Perl, can also be set and unset from within
- the pattern (see the detailed description of regular expres-
- sions below). For these options, the contents of the options
- argument specifies their initial settings at the start of
- compilation and execution. The PCRE_ANCHORED option can be
- set at the time of matching as well as at compile time.
-
- If errptr is NULL, pcre_compile() returns NULL immediately.
- Otherwise, if compilation of a pattern fails, pcre_compile()
- returns NULL, and sets the variable pointed to by errptr to
- point to a textual error message. The offset from the start
- of the pattern to the character where the error was
- discovered is placed in the variable pointed to by
- erroffset, which must not be NULL. If it is, an immediate
- error is given.
-
- If the final argument, tableptr, is NULL, PCRE uses a
- default set of character tables which are built when it is
- compiled, using the default C locale. Otherwise, tableptr
- must be the result of a call to pcre_maketables(). See the
- section on locale support below.
-
- This code fragment shows a typical straightforward call to
- pcre_compile():
-
- pcre *re;
- const char *error;
- int erroffset;
- re = pcre_compile(
- "^A.*Z", /* the pattern */
- 0, /* default options */
- &error, /* for error message */
- &erroffset, /* for error offset */
- NULL); /* use default character tables */
-
- The following option bits are defined in the header file:
-
- PCRE_ANCHORED
-
- If this bit is set, the pattern is forced to be "anchored",
- that is, it is constrained to match only at the start of the
- string which is being searched (the "subject string"). This
- effect can also be achieved by appropriate constructs in the
- pattern itself, which is the only way to do it in Perl.
-
- PCRE_CASELESS
-
- If this bit is set, letters in the pattern match both upper
- and lower case letters. It is equivalent to Perl's /i
- option.
-
- PCRE_DOLLAR_ENDONLY
-
- If this bit is set, a dollar metacharacter in the pattern
- matches only at the end of the subject string. Without this
- option, a dollar also matches immediately before the final
- character if it is a newline (but not before any other new-
- lines). The PCRE_DOLLAR_ENDONLY option is ignored if
- PCRE_MULTILINE is set. There is no equivalent to this option
- in Perl.
-
- PCRE_DOTALL
-
- If this bit is set, a dot metacharacter in the pattern
- matches all characters, including newlines. Without it, new-
- lines are excluded. This option is equivalent to Perl's /s
- option. A negative class such as [^a] always matches a new-
- line character, independent of the setting of this option.
-
- PCRE_EXTENDED
-
- If this bit is set, whitespace data characters in the pat-
- tern are totally ignored except when escaped or inside a
- character class, and characters between an unescaped # out-
- side a character class and the next newline character,
- inclusive, are also ignored. This is equivalent to Perl's /x
- option, and makes it possible to include comments inside
- complicated patterns. Note, however, that this applies only
- to data characters. Whitespace characters may never appear
- within special character sequences in a pattern, for example
- within the sequence (?( which introduces a conditional sub-
- pattern.
-
- PCRE_EXTRA
-
- This option was invented in order to turn on additional
- functionality of PCRE that is incompatible with Perl, but it
- is currently of very little use. When set, any backslash in
- a pattern that is followed by a letter that has no special
- meaning causes an error, thus reserving these combinations
- for future expansion. By default, as in Perl, a backslash
- followed by a letter with no special meaning is treated as a
- literal. There are at present no other features controlled
- by this option. It can also be set by a (?X) option setting
- within a pattern.
-
- PCRE_MULTILINE
-
- By default, PCRE treats the subject string as consisting of
- a single "line" of characters (even if it actually contains
- several newlines). The "start of line" metacharacter (^)
- matches only at the start of the string, while the "end of
- line" metacharacter ($) matches only at the end of the
- string, or before a terminating newline (unless
- PCRE_DOLLAR_ENDONLY is set). This is the same as Perl.
-
- When PCRE_MULTILINE is set, the "start of line" and "end
- of line" constructs match immediately following or immedi-
- ately before any newline in the subject string, respec-
- tively, as well as at the very start and end. This is
- equivalent to Perl's /m option. If there are no "\n" charac-
- ters in a subject string, or no occurrences of ^ or $ in a
- pattern, setting PCRE_MULTILINE has no effect.
-
- PCRE_UNGREEDY
-
- This option inverts the "greediness" of the quantifiers so
- that they are not greedy by default, but become greedy if
- followed by "?". It is not compatible with Perl. It can also
- be set by a (?U) option setting within the pattern.
-
- PCRE_UTF8
-
- This option causes PCRE to regard both the pattern and the
- subject as strings of UTF-8 characters instead of just byte
- strings. However, it is available only if PCRE has been
- built to include UTF-8 support. If not, the use of this
- option provokes an error. Support for UTF-8 is new, experi-
- mental, and incomplete. Details of exactly what it entails
- are given below.
-
-
-
-STUDYING A PATTERN
- When a pattern is going to be used several times, it is
- worth spending more time analyzing it in order to speed up
- the time taken for matching. The function pcre_study() takes
- a pointer to a compiled pattern as its first argument, and
- returns a pointer to a pcre_extra block (another typedef for
- a structure with hidden contents) containing additional
- information about the pattern; this can be passed to
- pcre_exec(). If no additional information is available, NULL
- is returned.
-
- The second argument contains option bits. At present, no
- options are defined for pcre_study(), and this argument
- should always be zero.
-
- The third argument for pcre_study() is a pointer to an error
- message. If studying succeeds (even if no data is returned),
- the variable it points to is set to NULL. Otherwise it
- points to a textual error message.
-
- This is a typical call to pcre_study():
-
- pcre_extra *pe;
- pe = pcre_study(
- re, /* result of pcre_compile() */
- 0, /* no options exist */
- &error); /* set to NULL or points to a message */
-
- At present, studying a pattern is useful only for non-
- anchored patterns that do not have a single fixed starting
- character. A bitmap of possible starting characters is
- created.
-
-
-
-LOCALE SUPPORT
- PCRE handles caseless matching, and determines whether char-
- acters are letters, digits, or whatever, by reference to a
- set of tables. The library contains a default set of tables
- which is created in the default C locale when PCRE is com-
- piled. This is used when the final argument of
- pcre_compile() is NULL, and is sufficient for many applica-
- tions.
-
- An alternative set of tables can, however, be supplied. Such
- tables are built by calling the pcre_maketables() function,
- which has no arguments, in the relevant locale. The result
- can then be passed to pcre_compile() as often as necessary.
- For example, to build and use tables that are appropriate
- for the French locale (where accented characters with codes
- greater than 128 are treated as letters), the following code
- could be used:
-
- setlocale(LC_CTYPE, "fr");
- tables = pcre_maketables();
- re = pcre_compile(..., tables);
-
- The tables are built in memory that is obtained via
- pcre_malloc. The pointer that is passed to pcre_compile is
- saved with the compiled pattern, and the same tables are
- used via this pointer by pcre_study() and pcre_exec(). Thus
- for any single pattern, compilation, studying and matching
- all happen in the same locale, but different patterns can be
- compiled in different locales. It is the caller's responsi-
- bility to ensure that the memory containing the tables
- remains available for as long as it is needed.
-
-
-
-INFORMATION ABOUT A PATTERN
- The pcre_fullinfo() function returns information about a
- compiled pattern. It replaces the obsolete pcre_info() func-
- tion, which is nevertheless retained for backwards compatibil-
- ity (and is documented below).
-
- The first argument for pcre_fullinfo() is a pointer to the
- compiled pattern. The second argument is the result of
- pcre_study(), or NULL if the pattern was not studied. The
- third argument specifies which piece of information is
- required, while the fourth argument is a pointer to a vari-
- able to receive the data. The yield of the function is zero
- for success, or one of the following negative numbers:
-
- PCRE_ERROR_NULL the argument code was NULL
- the argument where was NULL
- PCRE_ERROR_BADMAGIC the "magic number" was not found
- PCRE_ERROR_BADOPTION the value of what was invalid
-
- Here is a typical call of pcre_fullinfo(), to obtain the
- length of the compiled pattern:
-
- int rc;
- size_t length;
- rc = pcre_fullinfo(
- re, /* result of pcre_compile() */
- pe, /* result of pcre_study(), or NULL */
- PCRE_INFO_SIZE, /* what is required */
- &length); /* where to put the data */
-
- The possible values for the third argument are defined in
- pcre.h, and are as follows:
-
- PCRE_INFO_OPTIONS
-
- Return a copy of the options with which the pattern was com-
- piled. The fourth argument should point to an unsigned long
- int variable. These option bits are those specified in the
- call to pcre_compile(), modified by any top-level option
- settings within the pattern itself, and with the
- PCRE_ANCHORED bit forcibly set if the form of the pattern
- implies that it can match only at the start of a subject
- string.
-
- PCRE_INFO_SIZE
-
- Return the size of the compiled pattern, that is, the value
- that was passed as the argument to pcre_malloc() when PCRE
- was getting memory in which to place the compiled data. The
- fourth argument should point to a size_t variable.
-
- PCRE_INFO_CAPTURECOUNT
-
- Return the number of capturing subpatterns in the pattern.
- The fourth argument should point to an int variable.
-
- PCRE_INFO_BACKREFMAX
-
- Return the number of the highest back reference in the pat-
- tern. The fourth argument should point to an int variable.
- Zero is returned if there are no back references.
-
- PCRE_INFO_FIRSTCHAR
-
- Return information about the first character of any matched
- string, for a non-anchored pattern. If there is a fixed
- first character, e.g. from a pattern such as
- (cat|cow|coyote), it is returned in the integer pointed to
- by where. Otherwise, if either
-
- (a) the pattern was compiled with the PCRE_MULTILINE option,
- and every branch starts with "^", or
-
- (b) every branch of the pattern starts with ".*" and
- PCRE_DOTALL is not set (if it were set, the pattern would be
- anchored),
-
- -1 is returned, indicating that the pattern matches only at
- the start of a subject string or after any "\n" within the
- string. Otherwise -2 is returned. For anchored patterns, -2
- is returned.
-
- PCRE_INFO_FIRSTTABLE
-
- If the pattern was studied, and this resulted in the con-
- struction of a 256-bit table indicating a fixed set of char-
- acters for the first character in any matching string, a
- pointer to the table is returned. Otherwise NULL is
- returned. The fourth argument should point to an unsigned
- char * variable.
-
- PCRE_INFO_LASTLITERAL
-
- For a non-anchored pattern, return the value of the right-
- most literal character which must exist in any matched
- string, other than at its start. The fourth argument should
- point to an int variable. If there is no such character, or
- if the pattern is anchored, -1 is returned. For example, for
- the pattern /a\d+z\d+/ the returned value is 'z'.
-
- The pcre_info() function is now obsolete because its inter-
- face is too restrictive to return all the available data
- about a compiled pattern. New programs should use
- pcre_fullinfo() instead. The yield of pcre_info() is the
- number of capturing subpatterns, or one of the following
- negative numbers:
-
- PCRE_ERROR_NULL the argument code was NULL
- PCRE_ERROR_BADMAGIC the "magic number" was not found
-
- If the optptr argument is not NULL, a copy of the options
- with which the pattern was compiled is placed in the integer
- it points to (see PCRE_INFO_OPTIONS above).
-
- If the pattern is not anchored and the firstcharptr argument
- is not NULL, it is used to pass back information about the
- first character of any matched string (see
- PCRE_INFO_FIRSTCHAR above).
-
-
-
-MATCHING A PATTERN
- The function pcre_exec() is called to match a subject string
-
-
-
-
-
-SunOS 5.8 Last change: 9
-
-
-
- against a pre-compiled pattern, which is passed in the code
- argument. If the pattern has been studied, the result of the
- study should be passed in the extra argument. Otherwise this
- must be NULL.
-
- Here is an example of a simple call to pcre_exec():
-
- int rc;
- int ovector[30];
- rc = pcre_exec(
- re, /* result of pcre_compile() */
- NULL, /* we didn't study the pattern */
- "some string", /* the subject string */
- 11, /* the length of the subject string */
- 0, /* start at offset 0 in the subject */
- 0, /* default options */
- ovector, /* vector for substring information */
- 30); /* number of elements in the vector */
-
- The PCRE_ANCHORED option can be passed in the options argu-
- ment, whose unused bits must be zero. However, if a pattern
- was compiled with PCRE_ANCHORED, or turned out to be
- anchored by virtue of its contents, it cannot be made
- unanchored at matching time.
-
- There are also three further options that can be set only at
- matching time:
-
- PCRE_NOTBOL
-
- The first character of the string is not the beginning of a
- line, so the circumflex metacharacter should not match
- before it. Setting this without PCRE_MULTILINE (at compile
- time) causes circumflex never to match.
-
- PCRE_NOTEOL
-
- The end of the string is not the end of a line, so the dol-
- lar metacharacter should not match it nor (except in multi-
- line mode) a newline immediately before it. Setting this
- without PCRE_MULTILINE (at compile time) causes dollar never
- to match.
-
- PCRE_NOTEMPTY
-
- An empty string is not considered to be a valid match if
- this option is set. If there are alternatives in the pat-
- tern, they are tried. If all the alternatives match the
- empty string, the entire match fails. For example, if the
- pattern
-
- a?b?
-
- is applied to a string not beginning with "a" or "b", it
- matches the empty string at the start of the subject. With
- PCRE_NOTEMPTY set, this match is not valid, so PCRE searches
- further into the string for occurrences of "a" or "b".
-
- Perl has no direct equivalent of PCRE_NOTEMPTY, but it does
- make a special case of a pattern match of the empty string
- within its split() function, and when using the /g modifier.
- It is possible to emulate Perl's behaviour after matching a
- null string by first trying the match again at the same
- offset with PCRE_NOTEMPTY set, and then if that fails by
- advancing the starting offset (see below) and trying an
- ordinary match again.
-
- The subject string is passed as a pointer in subject, a
- length in length, and a starting offset in startoffset.
- Unlike the pattern string, the subject may contain binary
- zero characters. When the starting offset is zero, the
- search for a match starts at the beginning of the subject,
- and this is by far the most common case.
-
- A non-zero starting offset is useful when searching for
- another match in the same subject by calling pcre_exec()
- again after a previous success. Setting startoffset differs
- from just passing over a shortened string and setting
- PCRE_NOTBOL in the case of a pattern that begins with any
- kind of lookbehind. For example, consider the pattern
-
- \Biss\B
-
- which finds occurrences of "iss" in the middle of words. (\B
- matches only if the current position in the subject is not a
- word boundary.) When applied to the string "Mississippi" the
- first call to pcre_exec() finds the first occurrence. If
- pcre_exec() is called again with just the remainder of the
- subject, namely "issippi", it does not match, because \B is
- always false at the start of the subject, which is deemed to
- be a word boundary. However, if pcre_exec() is passed the
- entire string again, but with startoffset set to 4, it finds
- the second occurrence of "iss" because it is able to look
- behind the starting point to discover that it is preceded by
- a letter.
-
- If a non-zero starting offset is passed when the pattern is
- anchored, one attempt to match at the given offset is tried.
- This can only succeed if the pattern does not require the
- match to be at the start of the subject.
-
- In general, a pattern matches a certain portion of the sub-
- ject, and in addition, further substrings from the subject
- may be picked out by parts of the pattern. Following the
- usage in Jeffrey Friedl's book, this is called "capturing"
- in what follows, and the phrase "capturing subpattern" is
- used for a fragment of a pattern that picks out a substring.
- PCRE supports several other kinds of parenthesized subpat-
- tern that do not cause substrings to be captured.
-
- Captured substrings are returned to the caller via a vector
- of integer offsets whose address is passed in ovector. The
- number of elements in the vector is passed in ovecsize. The
- first two-thirds of the vector is used to pass back captured
- substrings, each substring using a pair of integers. The
- remaining third of the vector is used as workspace by
- pcre_exec() while matching capturing subpatterns, and is not
- available for passing back information. The length passed in
- ovecsize should always be a multiple of three. If it is not,
- it is rounded down.
-
- When a match has been successful, information about captured
- substrings is returned in pairs of integers, starting at the
- beginning of ovector, and continuing up to two-thirds of its
- length at the most. The first element of a pair is set to
- the offset of the first character in a substring, and the
- second is set to the offset of the first character after the
- end of a substring. The first pair, ovector[0] and ovec-
- tor[1], identify the portion of the subject string matched
- by the entire pattern. The next pair is used for the first
- capturing subpattern, and so on. The value returned by
- pcre_exec() is the number of pairs that have been set. If
- there are no capturing subpatterns, the return value from a
- successful match is 1, indicating that just the first pair
- of offsets has been set.
-
- Some convenience functions are provided for extracting the
- captured substrings as separate strings. These are described
- in the following section.
-
- It is possible for a capturing subpattern number n+1 to
- match some part of the subject when subpattern n has not
- been used at all. For example, if the string "abc" is
- matched against the pattern (a|(z))(bc) subpatterns 1 and 3
- are matched, but 2 is not. When this happens, both offset
- values corresponding to the unused subpattern are set to -1.
-
- If a capturing subpattern is matched repeatedly, it is the
- last portion of the string that it matched that gets
- returned.
-
- If the vector is too small to hold all the captured sub-
- strings, it is used as far as possible (up to two-thirds of
- its length), and the function returns a value of zero. In
- particular, if the substring offsets are not of interest,
- pcre_exec() may be called with ovector passed as NULL and
- ovecsize as zero. However, if the pattern contains back
- references and the ovector isn't big enough to remember the
- related substrings, PCRE has to get additional memory for
- use during matching. Thus it is usually advisable to supply
- an ovector.
-
- Note that pcre_info() can be used to find out how many cap-
- turing subpatterns there are in a compiled pattern. The
- smallest size for ovector that will allow for n captured
- substrings in addition to the offsets of the substring
- matched by the whole pattern is (n+1)*3.
-
- If pcre_exec() fails, it returns a negative number. The fol-
- lowing are defined in the header file:
-
- PCRE_ERROR_NOMATCH (-1)
-
- The subject string did not match the pattern.
-
- PCRE_ERROR_NULL (-2)
-
- Either code or subject was passed as NULL, or ovector was
- NULL and ovecsize was not zero.
-
- PCRE_ERROR_BADOPTION (-3)
-
- An unrecognized bit was set in the options argument.
-
- PCRE_ERROR_BADMAGIC (-4)
-
- PCRE stores a 4-byte "magic number" at the start of the com-
- piled code, to catch the case when it is passed a junk
- pointer. This is the error it gives when the magic number
- isn't present.
-
- PCRE_ERROR_UNKNOWN_NODE (-5)
-
- While running the pattern match, an unknown item was encoun-
- tered in the compiled pattern. This error could be caused by
- a bug in PCRE or by overwriting of the compiled pattern.
-
- PCRE_ERROR_NOMEMORY (-6)
-
- If a pattern contains back references, but the ovector that
- is passed to pcre_exec() is not big enough to remember the
- referenced substrings, PCRE gets a block of memory at the
- start of matching to use for this purpose. If the call via
- pcre_malloc() fails, this error is given. The memory is
- freed at the end of matching.
-
-
-
-
-EXTRACTING CAPTURED SUBSTRINGS
- Captured substrings can be accessed directly by using the
- offsets returned by pcre_exec() in ovector. For convenience,
- the functions pcre_copy_substring(), pcre_get_substring(),
- and pcre_get_substring_list() are provided for extracting
- captured substrings as new, separate, zero-terminated
- strings. A substring that contains a binary zero is
- correctly extracted and has a further zero added on the end,
- but the result does not, of course, function as a C string.
-
- The first three arguments are the same for all three func-
- tions: subject is the subject string which has just been
- successfully matched, ovector is a pointer to the vector of
- integer offsets that was passed to pcre_exec(), and
- stringcount is the number of substrings that were captured
- by the match, including the substring that matched the
- entire regular expression. This is the value returned by
- pcre_exec() if it is greater than zero. If pcre_exec()
- returned zero, indicating that it ran out of space in ovec-
- tor, the value passed as stringcount should be the size of
- the vector divided by three.
-
- The functions pcre_copy_substring() and pcre_get_substring()
- extract a single substring, whose number is given as string-
- number. A value of zero extracts the substring that matched
- the entire pattern, while higher values extract the captured
- substrings. For pcre_copy_substring(), the string is placed
- in buffer, whose length is given by buffersize, while for
- pcre_get_substring() a new block of memory is obtained via
- pcre_malloc, and its address is returned via stringptr. The
- yield of the function is the length of the string, not
- including the terminating zero, or one of
-
- PCRE_ERROR_NOMEMORY (-6)
-
- The buffer was too small for pcre_copy_substring(), or the
- attempt to get memory failed for pcre_get_substring().
-
- PCRE_ERROR_NOSUBSTRING (-7)
-
- There is no substring whose number is stringnumber.
-
- The pcre_get_substring_list() function extracts all avail-
- able substrings and builds a list of pointers to them. All
- this is done in a single block of memory which is obtained
- via pcre_malloc. The address of the memory block is returned
- via listptr, which is also the start of the list of string
- pointers. The end of the list is marked by a NULL pointer.
- The yield of the function is zero if all went well, or
-
- PCRE_ERROR_NOMEMORY (-6)
-
- if the attempt to get the memory block failed.
-
- When any of these functions encounter a substring that is
- unset, which can happen when capturing subpattern number n+1
- matches some part of the subject, but subpattern n has not
- been used at all, they return an empty string. This can be
- distinguished from a genuine zero-length substring by
- inspecting the appropriate offset in ovector, which is nega-
- tive for unset substrings.
-
- The two convenience functions pcre_free_substring() and
- pcre_free_substring_list() can be used to free the memory
- returned by a previous call of pcre_get_substring() or
- pcre_get_substring_list(), respectively. They do nothing
- more than call the function pointed to by pcre_free, which
- of course could be called directly from a C program. How-
- ever, PCRE is used in some situations where it is linked via
- a special interface to another programming language which
- cannot use pcre_free directly; it is for these cases that
- the functions are provided.
-
-
-
-LIMITATIONS
- There are some size limitations in PCRE but it is hoped that
- they will never in practice be relevant. The maximum length
- of a compiled pattern is 65539 (sic) bytes. All values in
- repeating quantifiers must be less than 65536. The max-
- imum number of capturing subpatterns is 65535. There is no
- limit to the number of non-capturing subpatterns, but the
- maximum depth of nesting of all kinds of parenthesized sub-
- pattern, including capturing subpatterns, assertions, and
- other types of subpattern, is 200.
-
- The maximum length of a subject string is the largest posi-
- tive number that an integer variable can hold. However, PCRE
- uses recursion to handle subpatterns and indefinite repeti-
- tion. This means that the available stack space may limit
- the size of a subject string that can be processed by cer-
- tain patterns.
-
-
-
-DIFFERENCES FROM PERL
- The differences described here are with respect to Perl
- 5.005.
-
- 1. By default, a whitespace character is any character that
- the C library function isspace() recognizes, though it is
- possible to compile PCRE with alternative character type
- tables. Normally isspace() matches space, formfeed, newline,
- carriage return, horizontal tab, and vertical tab. Perl 5 no
- longer includes vertical tab in its set of whitespace char-
- acters. The \v escape that was in the Perl documentation for
- a long time was never in fact recognized. However, the char-
- acter itself was treated as whitespace at least up to 5.002.
- In 5.004 and 5.005 it does not match \s.
-
- 2. PCRE does not allow repeat quantifiers on lookahead
- assertions. Perl permits them, but they do not mean what you
- might think. For example, (?!a){3} does not assert that the
- next three characters are not "a". It just asserts that the
- next character is not "a" three times.
-
- 3. Capturing subpatterns that occur inside negative looka-
- head assertions are counted, but their entries in the
- offsets vector are never set. Perl sets its numerical vari-
- ables from any such patterns that are matched before the
- assertion fails to match something (thereby succeeding), but
- only if the negative lookahead assertion contains just one
- branch.
-
- 4. Though binary zero characters are supported in the sub-
- ject string, they are not allowed in a pattern string
- because it is passed as a normal C string, terminated by
- zero. The escape sequence "\0" can be used in the pattern to
- represent a binary zero.
-
- 5. The following Perl escape sequences are not supported:
- \l, \u, \L, \U, \E, \Q. In fact these are implemented by
- Perl's general string-handling and are not part of its pat-
- tern matching engine.
-
- 6. The Perl \G assertion is not supported as it is not
- relevant to single pattern matches.
-
- 7. Fairly obviously, PCRE does not support the (?{code}) and
- (?p{code}) constructions. However, there is some experimen-
- tal support for recursive patterns using the non-Perl item
- (?R).
-
- 8. There are at the time of writing some oddities in Perl
- 5.005_02 concerned with the settings of captured strings
- when part of a pattern is repeated. For example, matching
- "aba" against the pattern /^(a(b)?)+$/ sets $2 to the value
- "b", but matching "aabbaa" against /^(aa(bb)?)+$/ leaves $2
- unset. However, if the pattern is changed to
- /^(aa(b(b))?)+$/ then $2 (and $3) are set.
-
- In Perl 5.004 $2 is set in both cases, and that is also true
- of PCRE. If in the future Perl changes to a consistent state
- that is different, PCRE may change to follow.
-
- 9. Another as yet unresolved discrepancy is that in Perl
- 5.005_02 the pattern /^(a)?(?(1)a|b)+$/ matches the string
- "a", whereas in PCRE it does not. However, in both Perl and
- PCRE /^(a)?a/ matched against "a" leaves $1 unset.
-
- 10. PCRE provides some extensions to the Perl regular
- expression facilities:
-
- (a) Although lookbehind assertions must match fixed length
- strings, each alternative branch of a lookbehind assertion
- can match a different length of string. Perl 5.005 requires
- them all to have the same length.
-
- (b) If PCRE_DOLLAR_ENDONLY is set and PCRE_MULTILINE is not
- set, the $ meta-character matches only at the very end of
- the string.
-
- (c) If PCRE_EXTRA is set, a backslash followed by a letter
- with no special meaning is faulted.
-
- (d) If PCRE_UNGREEDY is set, the greediness of the repeti-
- tion quantifiers is inverted, that is, by default they are
- not greedy, but if followed by a question mark they are.
-
- (e) PCRE_ANCHORED can be used to force a pattern to be tried
- only at the start of the subject.
-
- (f) The PCRE_NOTBOL, PCRE_NOTEOL, and PCRE_NOTEMPTY options
- for pcre_exec() have no Perl equivalents.
-
- (g) The (?R) construct allows for recursive pattern matching.
- (Perl 5.6 can do this using the (?p{code}) construct, which
- PCRE cannot of course support.)
-
-
-
-REGULAR EXPRESSION DETAILS
- The syntax and semantics of the regular expressions sup-
- ported by PCRE are described below. Regular expressions are
- also described in the Perl documentation and in a number of
- other books, some of which have copious examples. Jeffrey
- Friedl's "Mastering Regular Expressions", published by
- O'Reilly (ISBN 1-56592-257-3), covers them in great detail.
-
- The description here is intended as reference documentation.
- The basic operation of PCRE is on strings of bytes. However,
- there are the beginnings of some support for UTF-8 character
- strings. To use this support you must configure PCRE to
- include it, and then call pcre_compile() with the PCRE_UTF8
- option. How this affects the pattern matching is described
- in the final section of this document.
-
- A regular expression is a pattern that is matched against a
- subject string from left to right. Most characters stand for
- themselves in a pattern, and match the corresponding charac-
- ters in the subject. As a trivial example, the pattern
-
- The quick brown fox
-
- matches a portion of a subject string that is identical to
- itself. The power of regular expressions comes from the
- ability to include alternatives and repetitions in the pat-
- tern. These are encoded in the pattern by the use of meta-
- characters, which do not stand for themselves but instead
- are interpreted in some special way.
-
- There are two different sets of meta-characters: those that
- are recognized anywhere in the pattern except within square
- brackets, and those that are recognized in square brackets.
- Outside square brackets, the meta-characters are as follows:
-
- \ general escape character with several uses
- ^ assert start of subject (or line, in multiline
- mode)
- $ assert end of subject (or line, in multiline mode)
- . match any character except newline (by default)
- [ start character class definition
- | start of alternative branch
- ( start subpattern
- ) end subpattern
- ? extends the meaning of (
- also 0 or 1 quantifier
- also quantifier minimizer
- * 0 or more quantifier
- + 1 or more quantifier
- { start min/max quantifier
-
- Part of a pattern that is in square brackets is called a
- "character class". In a character class the only meta-
- characters are:
-
- \ general escape character
- ^ negate the class, but only if the first character
- - indicates character range
- ] terminates the character class
-
- The following sections describe the use of each of the
- meta-characters.
-
-
-
-BACKSLASH
- The backslash character has several uses. Firstly, if it is
- followed by a non-alphameric character, it takes away any
- special meaning that character may have. This use of
- backslash as an escape character applies both inside and
- outside character classes.
-
- For example, if you want to match a "*" character, you write
- "\*" in the pattern. This applies whether or not the follow-
- ing character would otherwise be interpreted as a meta-
- character, so it is always safe to precede a non-alphameric
- with "\" to specify that it stands for itself. In particu-
- lar, if you want to match a backslash, you write "\\".
-
- If a pattern is compiled with the PCRE_EXTENDED option, white-
- space in the pattern (other than in a character class) and
- characters between a "#" outside a character class and the
- next newline character are ignored. An escaping backslash
- can be used to include a whitespace or "#" character as part
- of the pattern.
-
- A second use of backslash provides a way of encoding non-
- printing characters in patterns in a visible manner. There
- is no restriction on the appearance of non-printing charac-
- ters, apart from the binary zero that terminates a pattern,
- but when a pattern is being prepared by text editing, it is
- usually easier to use one of the following escape sequences
- than the binary character it represents:
-
- \a alarm, that is, the BEL character (hex 07)
- \cx "control-x", where x is any character
- \e escape (hex 1B)
- \f formfeed (hex 0C)
- \n newline (hex 0A)
- \r carriage return (hex 0D)
- \t tab (hex 09)
- \xhh character with hex code hh
- \ddd character with octal code ddd, or backreference
-
- The precise effect of "\cx" is as follows: if "x" is a lower
- case letter, it is converted to upper case. Then bit 6 of
- the character (hex 40) is inverted. Thus "\cz" becomes hex
- 1A, but "\c{" becomes hex 3B, while "\c;" becomes hex 7B.
-
- After "\x", up to two hexadecimal digits are read (letters
- can be in upper or lower case).
-
- After "\0" up to two further octal digits are read. In both
- cases, if there are fewer than two digits, just those that
- are present are used. Thus the sequence "\0\x\07" specifies
- two binary zeros followed by a BEL character. Make sure you
- supply two digits after the initial zero if the character
- that follows is itself an octal digit.
-
- The handling of a backslash followed by a digit other than 0
- is complicated. Outside a character class, PCRE reads it
- and any following digits as a decimal number. If the number
- is less than 10, or if there have been at least that many
- previous capturing left parentheses in the expression, the
- entire sequence is taken as a back reference. A description
- of how this works is given later, following the discussion
- of parenthesized subpatterns.
-
- Inside a character class, or if the decimal number is
- greater than 9 and there have not been that many capturing
- subpatterns, PCRE re-reads up to three octal digits follow-
- ing the backslash, and generates a single byte from the
- least significant 8 bits of the value. Any subsequent digits
- stand for themselves. For example:
-
- \040 is another way of writing a space
- \40 is the same, provided there are fewer than 40
- previous capturing subpatterns
- \7 is always a back reference
- \11 might be a back reference, or another way of
- writing a tab
- \011 is always a tab
- \0113 is a tab followed by the character "3"
- \113 is the character with octal code 113 (since there
- can be no more than 99 back references)
- \377 is a byte consisting entirely of 1 bits
- \81 is either a back reference, or a binary zero
- followed by the two characters "8" and "1"
-
- Note that octal values of 100 or greater must not be intro-
- duced by a leading zero, because no more than three octal
- digits are ever read.
-
- All the sequences that define a single byte value can be
- used both inside and outside character classes. In addition,
- inside a character class, the sequence "\b" is interpreted
- as the backspace character (hex 08). Outside a character
- class it has a different meaning (see below).
-
- The third use of backslash is for specifying generic charac-
- ter types:
-
- \d any decimal digit
- \D any character that is not a decimal digit
- \s any whitespace character
- \S any character that is not a whitespace character
- \w any "word" character
- \W any "non-word" character
-
- Each pair of escape sequences partitions the complete set of
- characters into two disjoint sets. Any given character
- matches one, and only one, of each pair.
-
- A "word" character is any letter or digit or the underscore
- character, that is, any character which can be part of a
- Perl "word". The definition of letters and digits is con-
- trolled by PCRE's character tables, and may vary if locale-
- specific matching is taking place (see "Locale support"
- above). For example, in the "fr" (French) locale, some char-
- acter codes greater than 128 are used for accented letters,
- and these are matched by \w.
-
- These character type sequences can appear both inside and
- outside character classes. They each match one character of
- the appropriate type. If the current matching point is at
- the end of the subject string, all of them fail, since there
- is no character to match.
-
- The fourth use of backslash is for certain simple asser-
- tions. An assertion specifies a condition that has to be met
- at a particular point in a match, without consuming any
- characters from the subject string. The use of subpatterns
- for more complicated assertions is described below. The
- backslashed assertions are
-
- \b word boundary
- \B not a word boundary
- \A start of subject (independent of multiline mode)
- \Z end of subject or newline at end (independent of
- multiline mode)
- \z end of subject (independent of multiline mode)
-
- These assertions may not appear in character classes (but
- note that "\b" has a different meaning, namely the backspace
- character, inside a character class).
-
- A word boundary is a position in the subject string where
- the current character and the previous character do not both
- match \w or \W (i.e. one matches \w and the other matches
- \W), or the start or end of the string if the first or last
- character matches \w, respectively.
-
- The \A, \Z, and \z assertions differ from the traditional
- circumflex and dollar (described below) in that they only
- ever match at the very start and end of the subject string,
- whatever options are set. They are not affected by the
- PCRE_NOTBOL or PCRE_NOTEOL options. If the startoffset argu-
- ment of pcre_exec() is non-zero, \A can never match. The
- difference between \Z and \z is that \Z matches before a
- newline that is the last character of the string as well as
- at the end of the string, whereas \z matches only at the
- end.
-
-
-
-CIRCUMFLEX AND DOLLAR
- Outside a character class, in the default matching mode, the
- circumflex character is an assertion which is true only if
- the current matching point is at the start of the subject
- string. If the startoffset argument of pcre_exec() is non-
- zero, circumflex can never match. Inside a character class,
- circumflex has an entirely different meaning (see below).
-
- Circumflex need not be the first character of the pattern if
- a number of alternatives are involved, but it should be the
- first thing in each alternative in which it appears if the
- pattern is ever to match that branch. If all possible alter-
- natives start with a circumflex, that is, if the pattern is
- constrained to match only at the start of the subject, it is
- said to be an "anchored" pattern. (There are also other con-
- structs that can cause a pattern to be anchored.)
-
- A dollar character is an assertion which is true only if the
- current matching point is at the end of the subject string,
- or immediately before a newline character that is the last
- character in the string (by default). Dollar need not be the
- last character of the pattern if a number of alternatives
- are involved, but it should be the last item in any branch
- in which it appears. Dollar has no special meaning in a
- character class.
-
- The meaning of dollar can be changed so that it matches only
- at the very end of the string, by setting the
- PCRE_DOLLAR_ENDONLY option at compile or matching time. This
- does not affect the \Z assertion.
-
- The meanings of the circumflex and dollar characters are
- changed if the PCRE_MULTILINE option is set. When this is
- the case, they match immediately after and immediately
- before an internal "\n" character, respectively, in addition
- to matching at the start and end of the subject string. For
- example, the pattern /^abc$/ matches the subject string
- "def\nabc" in multiline mode, but not otherwise. Conse-
- quently, patterns that are anchored in single line mode
- because all branches start with "^" are not anchored in mul-
- tiline mode, and a match for circumflex is possible when the
- startoffset argument of pcre_exec() is non-zero. The
- PCRE_DOLLAR_ENDONLY option is ignored if PCRE_MULTILINE is
- set.
-
- Note that the sequences \A, \Z, and \z can be used to match
- the start and end of the subject in both modes, and if all
- branches of a pattern start with \A it is always anchored,
- whether PCRE_MULTILINE is set or not.
-
-
-
-FULL STOP (PERIOD, DOT)
- Outside a character class, a dot in the pattern matches any
- one character in the subject, including a non-printing char-
- acter, but not (by default) newline. If the PCRE_DOTALL
- option is set, dots match newlines as well. The handling of
- dot is entirely independent of the handling of circumflex
- and dollar, the only relationship being that they both
- involve newline characters. Dot has no special meaning in a
- character class.
-
-
-
-SQUARE BRACKETS
- An opening square bracket introduces a character class, ter-
- minated by a closing square bracket. A closing square
- bracket on its own is not special. If a closing square
- bracket is required as a member of the class, it should be
- the first data character in the class (after an initial cir-
- cumflex, if present) or escaped with a backslash.
-
- A character class matches a single character in the subject;
- the character must be in the set of characters defined by
- the class, unless the first character in the class is a cir-
- cumflex, in which case the subject character must not be in
- the set defined by the class. If a circumflex is actually
- required as a member of the class, ensure it is not the
- first character, or escape it with a backslash.
-
- For example, the character class [aeiou] matches any lower
- case vowel, while [^aeiou] matches any character that is not
- a lower case vowel. Note that a circumflex is just a con-
- venient notation for specifying the characters which are in
- the class by enumerating those that are not. It is not an
- assertion: it still consumes a character from the subject
- string, and fails if the current pointer is at the end of
- the string.
-
- When caseless matching is set, any letters in a class
- represent both their upper case and lower case versions, so
- for example, a caseless [aeiou] matches "A" as well as "a",
- and a caseless [^aeiou] does not match "A", whereas a case-
- ful version would.
-
- The newline character is never treated in any special way in
- character classes, whatever the setting of the PCRE_DOTALL
- or PCRE_MULTILINE options is. A class such as [^a] will
- always match a newline.
-
- The minus (hyphen) character can be used to specify a range
- of characters in a character class. For example, [d-m]
- matches any letter between d and m, inclusive. If a minus
- character is required in a class, it must be escaped with a
- backslash or appear in a position where it cannot be inter-
- preted as indicating a range, typically as the first or last
- character in the class.
-
- It is not possible to have the literal character "]" as the
- end character of a range. A pattern such as [W-]46] is
- interpreted as a class of two characters ("W" and "-") fol-
- lowed by a literal string "46]", so it would match "W46]" or
- "-46]". However, if the "]" is escaped with a backslash it
- is interpreted as the end of range, so [W-\]46] is inter-
- preted as a single class containing a range followed by two
- separate characters. The octal or hexadecimal representation
- of "]" can also be used to end a range.
-
- Ranges operate in ASCII collating sequence. They can also be
- used for characters specified numerically, for example
- [\000-\037]. If a range that includes letters is used when
- caseless matching is set, it matches the letters in either
- case. For example, [W-c] is equivalent to [][\^_`wxyzabc],
- matched caselessly, and if character tables for the "fr"
- locale are in use, [\xc8-\xcb] matches accented E characters
- in both cases.
-
- The character types \d, \D, \s, \S, \w, and \W may also
- appear in a character class, and add the characters that
- they match to the class. For example, [\dABCDEF] matches any
- hexadecimal digit. A circumflex can conveniently be used
- with the upper case character types to specify a more res-
- tricted set of characters than the matching lower case type.
- For example, the class [^\W_] matches any letter or digit,
- but not underscore.
-
- All non-alphameric characters other than \, -, ^ (at the
- start) and the terminating ] are non-special in character
- classes, but it does no harm if they are escaped.
-
-
-
-POSIX CHARACTER CLASSES
- Perl 5.6 (not yet released at the time of writing) is going
- to support the POSIX notation for character classes, which
- uses names enclosed by [: and :] within the enclosing
- square brackets. PCRE supports this notation. For example,
-
- [01[:alpha:]%]
-
- matches "0", "1", any alphabetic character, or "%". The sup-
- ported class names are
-
- alnum letters and digits
- alpha letters
- ascii character codes 0 - 127
- cntrl control characters
- digit decimal digits (same as \d)
- graph printing characters, excluding space
- lower lower case letters
- print printing characters, including space
- punct printing characters, excluding letters and digits
- space white space (same as \s)
- upper upper case letters
- word "word" characters (same as \w)
- xdigit hexadecimal digits
-
- >>>>>>>>>>>>Only WORD is perl. BLANK is GNU.
-
- The names "ascii" and "word" are Perl extensions. Another
- Perl extension is negation, which is indicated by a ^ char-
- acter after the colon. For example,
-
- [12[:^digit:]]
-
- matches "1", "2", or any non-digit. PCRE (and Perl) also
- recognize the POSIX syntax [.ch.] and [=ch=] where "ch" is a
- "collating element", but these are not supported, and an
- error is given if they are encountered.
-
-
-
-VERTICAL BAR
- Vertical bar characters are used to separate alternative
- patterns. For example, the pattern
-
- gilbert|sullivan
-
- matches either "gilbert" or "sullivan". Any number of alter-
- natives may appear, and an empty alternative is permitted
- (matching the empty string). The matching process tries
- each alternative in turn, from left to right, and the first
- one that succeeds is used. If the alternatives are within a
- subpattern (defined below), "succeeds" means matching the
- rest of the main pattern as well as the alternative in the
- subpattern.
-
-
-
-INTERNAL OPTION SETTING
- The settings of PCRE_CASELESS, PCRE_MULTILINE, PCRE_DOTALL,
- and PCRE_EXTENDED can be changed from within the pattern by
- a sequence of Perl option letters enclosed between "(?" and
- ")". The option letters are
-
- i for PCRE_CASELESS
- m for PCRE_MULTILINE
- s for PCRE_DOTALL
- x for PCRE_EXTENDED
-
- For example, (?im) sets caseless, multiline matching. It is
- also possible to unset these options by preceding the letter
- with a hyphen, and a combined setting and unsetting such as
- (?im-sx), which sets PCRE_CASELESS and PCRE_MULTILINE while
- unsetting PCRE_DOTALL and PCRE_EXTENDED, is also permitted.
- If a letter appears both before and after the hyphen, the
- option is unset.
-
- The scope of these option changes depends on where in the
- pattern the setting occurs. For settings that are outside
- any subpattern (defined below), the effect is the same as if
- the options were set or unset at the start of matching. The
- following patterns all behave in exactly the same way:
-
- (?i)abc
- a(?i)bc
- ab(?i)c
- abc(?i)
-
- which in turn is the same as compiling the pattern abc with
- PCRE_CASELESS set. In other words, such "top level" set-
- tings apply to the whole pattern (unless there are other
- changes inside subpatterns). If there is more than one set-
- ting of the same option at top level, the rightmost setting
- is used.
-
- If an option change occurs inside a subpattern, the effect
- is different. This is a change of behaviour in Perl 5.005.
- An option change inside a subpattern affects only that part
- of the subpattern that follows it, so
-
- (a(?i)b)c
-
- matches abc and aBc and no other strings (assuming
- PCRE_CASELESS is not used). By this means, options can be
- made to have different settings in different parts of the
- pattern. Any changes made in one alternative do carry on
- into subsequent branches within the same subpattern. For
- example,
-
- (a(?i)b|c)
-
- matches "ab", "aB", "c", and "C", even though when matching
- "C" the first branch is abandoned before the option setting.
- This is because the effects of option settings happen at
- compile time. There would be some very weird behaviour oth-
- erwise.
-
- The PCRE-specific options PCRE_UNGREEDY and PCRE_EXTRA can
- be changed in the same way as the Perl-compatible options by
- using the characters U and X respectively. The (?X) flag
- setting is special in that it must always occur earlier in
- the pattern than any of the additional features it turns on,
- even when it is at top level. It is best put at the start.
-
-
-
-SUBPATTERNS
- Subpatterns are delimited by parentheses (round brackets),
- which can be nested. Marking part of a pattern as a subpat-
- tern does two things:
-
- 1. It localizes a set of alternatives. For example, the pat-
- tern
-
- cat(aract|erpillar|)
-
- matches one of the words "cat", "cataract", or "caterpil-
- lar". Without the parentheses, it would match "cataract",
- "erpillar" or the empty string.
-
- 2. It sets up the subpattern as a capturing subpattern (as
- defined above). When the whole pattern matches, that por-
- tion of the subject string that matched the subpattern is
- passed back to the caller via the ovector argument of
- pcre_exec(). Opening parentheses are counted from left to
- right (starting from 1) to obtain the numbers of the captur-
- ing subpatterns.
-
- For example, if the string "the red king" is matched against
- the pattern
-
- the ((red|white) (king|queen))
-
- the captured substrings are "red king", "red", and "king",
- and are numbered 1, 2, and 3, respectively.
-
- The fact that plain parentheses fulfil two functions is not
- always helpful. There are often times when a grouping sub-
- pattern is required without a capturing requirement. If an
- opening parenthesis is followed by "?:", the subpattern does
- not do any capturing, and is not counted when computing the
- number of any subsequent capturing subpatterns. For example,
- if the string "the white queen" is matched against the pat-
- tern
-
- the ((?:red|white) (king|queen))
-
- the captured substrings are "white queen" and "queen", and
- are numbered 1 and 2. The maximum number of captured sub-
- strings is 99, and the maximum number of all subpatterns,
- both capturing and non-capturing, is 200.
-
- As a convenient shorthand, if any option settings are
- required at the start of a non-capturing subpattern, the
- option letters may appear between the "?" and the ":". Thus
- the two patterns
-
- (?i:saturday|sunday)
- (?:(?i)saturday|sunday)
-
- match exactly the same set of strings. Because alternative
- branches are tried from left to right, and options are not
- reset until the end of the subpattern is reached, an option
- setting in one branch does affect subsequent branches, so
- the above patterns match "SUNDAY" as well as "Saturday".
-
-
-
-REPETITION
- Repetition is specified by quantifiers, which can follow any
- of the following items:
-
- a single character, possibly escaped
- the . metacharacter
- a character class
- a back reference (see next section)
- a parenthesized subpattern (unless it is an assertion -
- see below)
-
- The general repetition quantifier specifies a minimum and
- maximum number of permitted matches, by giving the two
- numbers in curly brackets (braces), separated by a comma.
- The numbers must be less than 65536, and the first must be
- less than or equal to the second. For example:
-
- z{2,4}
-
- matches "zz", "zzz", or "zzzz". A closing brace on its own
- is not a special character. If the second number is omitted,
- but the comma is present, there is no upper limit; if the
- second number and the comma are both omitted, the quantifier
- specifies an exact number of required matches. Thus
-
- [aeiou]{3,}
-
- matches at least 3 successive vowels, but may match many
- more, while
-
- \d{8}
-
- matches exactly 8 digits. An opening curly bracket that
- appears in a position where a quantifier is not allowed, or
- one that does not match the syntax of a quantifier, is taken
- as a literal character. For example, {,6} is not a
- quantifier, but a literal string of four characters.
-
- The quantifier {0} is permitted, causing the expression to
- behave as if the previous item and the quantifier were not
- present.
-
- For convenience (and historical compatibility) the three
- most common quantifiers have single-character abbreviations:
-
- * is equivalent to {0,}
- + is equivalent to {1,}
- ? is equivalent to {0,1}
-
- It is possible to construct infinite loops by following a
- subpattern that can match no characters with a quantifier
- that has no upper limit, for example:
-
- (a?)*
-
- Earlier versions of Perl and PCRE used to give an error at
- compile time for such patterns. However, because there are
- cases where this can be useful, such patterns are now
- accepted, but if any repetition of the subpattern does in
- fact match no characters, the loop is forcibly broken.
-
- By default, the quantifiers are "greedy", that is, they
- match as much as possible (up to the maximum number of per-
- mitted times), without causing the rest of the pattern to
- fail. The classic example of where this gives problems is in
- trying to match comments in C programs. These appear between
- the sequences /* and */ and within the sequence, individual
- * and / characters may appear. An attempt to match C com-
- ments by applying the pattern
-
- /\*.*\*/
-
- to the string
-
- /* first comment */ not comment /* second comment */
-
- fails, because it matches the entire string owing to the
- greediness of the .* item.
-
- However, if a quantifier is followed by a question mark, it
- ceases to be greedy, and instead matches the minimum number
- of times possible, so the pattern
-
- /\*.*?\*/
-
- does the right thing with the C comments. The meaning of the
- various quantifiers is not otherwise changed, just the pre-
- ferred number of matches. Do not confuse this use of
- question mark with its use as a quantifier in its own right.
- Because it has two uses, it can sometimes appear doubled, as
- in
-
- \d??\d
-
- which matches one digit by preference, but can match two if
- that is the only way the rest of the pattern matches.
-
- If the PCRE_UNGREEDY option is set (an option which is not
- available in Perl), the quantifiers are not greedy by
- default, but individual ones can be made greedy by following
- them with a question mark. In other words, it inverts the
- default behaviour.
-
- When a parenthesized subpattern is quantified with a minimum
- repeat count that is greater than 1 or with a limited max-
- imum, more store is required for the compiled pattern, in
- proportion to the size of the minimum or maximum.
-
- If a pattern starts with .* or .{0,} and the PCRE_DOTALL
- option (equivalent to Perl's /s) is set, thus allowing the .
- to match newlines, the pattern is implicitly anchored,
- because whatever follows will be tried against every charac-
- ter position in the subject string, so there is no point in
- retrying the overall match at any position after the first.
- PCRE treats such a pattern as though it were preceded by \A.
- In cases where it is known that the subject string contains
- no newlines, it is worth setting PCRE_DOTALL when the pat-
- tern begins with .* in order to obtain this optimization, or
- alternatively using ^ to indicate anchoring explicitly.
-
- When a capturing subpattern is repeated, the value captured
- is the substring that matched the final iteration. For exam-
- ple, after
-
- (tweedle[dume]{3}\s*)+
-
- has matched "tweedledum tweedledee" the value of the cap-
- tured substring is "tweedledee". However, if there are
- nested capturing subpatterns, the corresponding captured
- values may have been set in previous iterations. For exam-
- ple, after
-
- /(a|(b))+/
-
- matches "aba" the value of the second captured substring is
- "b".
-
-
-
-
-BACK REFERENCES
- Outside a character class, a backslash followed by a digit
- greater than 0 (and possibly further digits) is a back
- reference to a capturing subpattern earlier (i.e. to its
- left) in the pattern, provided there have been that many
- previous capturing left parentheses.
-
- However, if the decimal number following the backslash is
- less than 10, it is always taken as a back reference, and
- causes an error only if there are not that many capturing
- left parentheses in the entire pattern. In other words, the
- parentheses that are referenced need not be to the left of
- the reference for numbers less than 10. See the section
- entitled "Backslash" above for further details of the han-
- dling of digits following a backslash.
-
- A back reference matches whatever actually matched the cap-
- turing subpattern in the current subject string, rather than
- anything matching the subpattern itself. So the pattern
-
- (sens|respons)e and \1ibility
-
- matches "sense and sensibility" and "response and responsi-
- bility", but not "sense and responsibility". If caseful
- matching is in force at the time of the back reference, the
- case of letters is relevant. For example,
-
- ((?i)rah)\s+\1
-
- matches "rah rah" and "RAH RAH", but not "RAH rah", even
- though the original capturing subpattern is matched case-
- lessly.
-
- There may be more than one back reference to the same sub-
- pattern. If a subpattern has not actually been used in a
- particular match, any back references to it always fail. For
- example, the pattern
-
- (a|(bc))\2
-
- always fails if it starts to match "a" rather than "bc".
- Because there may be up to 99 back references, all digits
- following the backslash are taken as part of a potential
- back reference number. If the pattern continues with a digit
- character, some delimiter must be used to terminate the back
- reference. If the PCRE_EXTENDED option is set, this can be
- whitespace. Otherwise an empty comment can be used.
-
- A back reference that occurs inside the parentheses to which
- it refers fails when the subpattern is first used, so, for
- example, (a\1) never matches. However, such references can
- be useful inside repeated subpatterns. For example, the
- pattern
-
- (a|b\1)+
-
- matches any number of "a"s and also "aba", "ababbaa" etc. At
- each iteration of the subpattern, the back reference matches
- the character string corresponding to the previous itera-
- tion. In order for this to work, the pattern must be such
- that the first iteration does not need to match the back
- reference. This can be done using alternation, as in the
- example above, or by a quantifier with a minimum of zero.
-
-
-
-ASSERTIONS
- An assertion is a test on the characters following or
- preceding the current matching point that does not actually
- consume any characters. The simple assertions coded as \b,
- \B, \A, \Z, \z, ^ and $ are described above. More compli-
- cated assertions are coded as subpatterns. There are two
- kinds: those that look ahead of the current position in the
- subject string, and those that look behind it.
-
- An assertion subpattern is matched in the normal way, except
- that it does not cause the current matching position to be
- changed. Lookahead assertions start with (?= for positive
- assertions and (?! for negative assertions. For example,
-
- \w+(?=;)
-
- matches a word followed by a semicolon, but does not include
- the semicolon in the match, and
-
- foo(?!bar)
-
- matches any occurrence of "foo" that is not followed by
- "bar". Note that the apparently similar pattern
-
- (?!foo)bar
-
- does not find an occurrence of "bar" that is preceded by
- something other than "foo"; it finds any occurrence of "bar"
- whatsoever, because the assertion (?!foo) is always true
- when the next three characters are "bar". A lookbehind
- assertion is needed to achieve this effect.
-
- Lookbehind assertions start with (?<= for positive asser-
- tions and (?<! for negative assertions. For example,
-
- (?<!foo)bar
-
- does find an occurrence of "bar" that is not preceded by
- "foo". The contents of a lookbehind assertion are restricted
- such that all the strings it matches must have a fixed
- length. However, if there are several alternatives, they do
- not all have to have the same fixed length. Thus
-
- (?<=bullock|donkey)
-
- is permitted, but
-
- (?<!dogs?|cats?)
-
- causes an error at compile time. Branches that match dif-
- ferent length strings are permitted only at the top level of
- a lookbehind assertion. This is an extension compared with
- Perl 5.005, which requires all branches to match the same
- length of string. An assertion such as
-
- (?<=ab(c|de))
-
- is not permitted, because its single top-level branch can
- match two different lengths, but it is acceptable if rewrit-
- ten to use two top-level branches:
-
- (?<=abc|abde)
-
- The implementation of lookbehind assertions is, for each
- alternative, to temporarily move the current position back
- by the fixed width and then try to match. If there are
- insufficient characters before the current position, the
- match is deemed to fail. Lookbehinds in conjunction with
- once-only subpatterns can be particularly useful for match-
- ing at the ends of strings; an example is given at the end
- of the section on once-only subpatterns.
-
- Several assertions (of any sort) may occur in succession.
- For example,
-
- (?<=\d{3})(?<!999)foo
-
- matches "foo" preceded by three digits that are not "999".
- Notice that each of the assertions is applied independently
- at the same point in the subject string. First there is a
- check that the previous three characters are all digits, and
- then there is a check that the same three characters are not
- "999". This pattern does not match "foo" preceded by six
- characters, the first three of which are digits and the last
- three of which are not "999". For example, it doesn't match
- "123abcfoo". A pattern to do that is
-
- (?<=\d{3}...)(?<!999)foo
-
- This time the first assertion looks at the preceding six
- characters, checking that the first three are digits, and
- then the second assertion checks that the preceding three
- characters are not "999".
-
- Assertions can be nested in any combination. For example,
-
- (?<=(?<!foo)bar)baz
-
- matches an occurrence of "baz" that is preceded by "bar"
- which in turn is not preceded by "foo", while
-
- (?<=\d{3}(?!999)...)foo
-
- is another pattern which matches "foo" preceded by three
- digits and any three characters that are not "999".
-
- Assertion subpatterns are not capturing subpatterns, and may
- not be repeated, because it makes no sense to assert the
- same thing several times. If any kind of assertion contains
- capturing subpatterns within it, these are counted for the
- purposes of numbering the capturing subpatterns in the whole
- pattern. However, substring capturing is carried out only
- for positive assertions, because it does not make sense for
- negative assertions.
-
- Assertions count towards the maximum of 200 parenthesized
- subpatterns.
-
-
-
-ONCE-ONLY SUBPATTERNS
- With both maximizing and minimizing repetition, failure of
- what follows normally causes the repeated item to be re-
- evaluated to see if a different number of repeats allows the
- rest of the pattern to match. Sometimes it is useful to
- prevent this, either to change the nature of the match, or
- to cause it to fail earlier than it otherwise might, when the
- author of the pattern knows there is no point in carrying
- on.
-
- Consider, for example, the pattern \d+foo when applied to
- the subject line
-
- 123456bar
-
- After matching all 6 digits and then failing to match "foo",
- the normal action of the matcher is to try again with only 5
- digits matching the \d+ item, and then with 4, and so on,
- before ultimately failing. Once-only subpatterns provide the
- means for specifying that once a portion of the pattern has
- matched, it is not to be re-evaluated in this way, so the
- matcher would give up immediately on failing to match "foo"
- the first time. The notation is another kind of special
- parenthesis, starting with (?> as in this example:
-
- (?>\d+)foo
-
- This kind of parenthesis "locks up" the part of the pattern
- it contains once it has matched, and a failure further into
- the pattern is prevented from backtracking into it. Back-
- tracking past it to previous items, however, works as nor-
- mal.
-
- An alternative description is that a subpattern of this type
- matches the string of characters that an identical stan-
- dalone pattern would match, if anchored at the current point
- in the subject string.
-
- Once-only subpatterns are not capturing subpatterns. Simple
- cases such as the above example can be thought of as a max-
- imizing repeat that must swallow everything it can. So,
- while both \d+ and \d+? are prepared to adjust the number of
- digits they match in order to make the rest of the pattern
- match, (?>\d+) can only match an entire sequence of digits.
-
- This construction can of course contain arbitrarily compli-
- cated subpatterns, and it can be nested.
-
- Once-only subpatterns can be used in conjunction with look-
- behind assertions to specify efficient matching at the end
- of the subject string. Consider a simple pattern such as
-
- abcd$
-
- when applied to a long string which does not match. Because
- matching proceeds from left to right, PCRE will look for
- each "a" in the subject and then see if what follows matches
- the rest of the pattern. If the pattern is specified as
-
- ^.*abcd$
-
- the initial .* matches the entire string at first, but when
- this fails (because there is no following "a"), it back-
- tracks to match all but the last character, then all but the
- last two characters, and so on. Once again the search for
- "a" covers the entire string, from right to left, so we are
- no better off. However, if the pattern is written as
-
- ^(?>.*)(?<=abcd)
-
- there can be no backtracking for the .* item; it can match
- only the entire string. The subsequent lookbehind assertion
- does a single test on the last four characters. If it fails,
- the match fails immediately. For long strings, this approach
- makes a significant difference to the processing time.
-
- When a pattern contains an unlimited repeat inside a subpat-
- tern that can itself be repeated an unlimited number of
- times, the use of a once-only subpattern is the only way to
- avoid some failing matches taking a very long time indeed.
- The pattern
-
- (\D+|<\d+>)*[!?]
-
- matches an unlimited number of substrings that either con-
- sist of non-digits, or digits enclosed in <>, followed by
- either ! or ?. When it matches, it runs quickly. However, if
- it is applied to
-
- aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-
- it takes a long time before reporting failure. This is
- because the string can be divided between the two repeats in
- a large number of ways, and all have to be tried. (The exam-
- ple used [!?] rather than a single character at the end,
- because both PCRE and Perl have an optimization that allows
- for fast failure when a single character is used. They
- remember the last single character that is required for a
- match, and fail early if it is not present in the string.)
- If the pattern is changed to
-
- ((?>\D+)|<\d+>)*[!?]
-
- sequences of non-digits cannot be broken, and failure hap-
- pens quickly.
-
-
-
-CONDITIONAL SUBPATTERNS
- It is possible to cause the matching process to obey a sub-
- pattern conditionally or to choose between two alternative
- subpatterns, depending on the result of an assertion, or
- whether a previous capturing subpattern matched or not. The
- two possible forms of conditional subpattern are
-
- (?(condition)yes-pattern)
- (?(condition)yes-pattern|no-pattern)
-
- If the condition is satisfied, the yes-pattern is used; oth-
- erwise the no-pattern (if present) is used. If there are
- more than two alternatives in the subpattern, a compile-time
- error occurs.
-
- There are two kinds of condition. If the text between the
- parentheses consists of a sequence of digits, the condition
- is satisfied if the capturing subpattern of that number has
- previously matched. The number must be greater than zero.
- Consider the following pattern, which contains non-
- significant white space to make it more readable (assume the
- PCRE_EXTENDED option) and to divide it into three parts for
- ease of discussion:
-
- ( \( )? [^()]+ (?(1) \) )
-
- The first part matches an optional opening parenthesis, and
- if that character is present, sets it as the first captured
- substring. The second part matches one or more characters
- that are not parentheses. The third part is a conditional
- subpattern that tests whether the first set of parentheses
- matched or not. If they did, that is, if the subject started
- with an opening parenthesis, the condition is true, and so
- the yes-pattern is executed and a closing parenthesis is
- required. Otherwise, since no-pattern is not present, the
- subpattern matches nothing. In other words, this pattern
- matches a sequence of non-parentheses, optionally enclosed
- in parentheses.
-
- If the condition is not a sequence of digits, it must be an
- assertion. This may be a positive or negative lookahead or
- lookbehind assertion. Consider this pattern, again contain-
- ing non-significant white space, and with the two alterna-
- tives on the second line:
-
- (?(?=[^a-z]*[a-z])
- \d{2}-[a-z]{3}-\d{2} | \d{2}-\d{2}-\d{2} )
-
- The condition is a positive lookahead assertion that matches
- an optional sequence of non-letters followed by a letter. In
- other words, it tests for the presence of at least one
- letter in the subject. If a letter is found, the subject is
- matched against the first alternative; otherwise it is
- matched against the second. This pattern matches strings in
- one of the two forms dd-aaa-dd or dd-dd-dd, where aaa are
- letters and dd are digits.
-
-
-
-COMMENTS
- The sequence (?# marks the start of a comment which contin-
- ues up to the next closing parenthesis. Nested parentheses
- are not permitted. The characters that make up a comment
- play no part in the pattern matching at all.
-
- If the PCRE_EXTENDED option is set, an unescaped # character
- outside a character class introduces a comment that contin-
- ues up to the next newline character in the pattern.
-
-
-
-RECURSIVE PATTERNS
- Consider the problem of matching a string in parentheses,
- allowing for unlimited nested parentheses. Without the use
- of recursion, the best that can be done is to use a pattern
- that matches up to some fixed depth of nesting. It is not
- possible to handle an arbitrary nesting depth. Perl 5.6 has
- provided an experimental facility that allows regular
- expressions to recurse (amongst other things). It does this
- by interpolating Perl code in the expression at run time,
- and the code can refer to the expression itself. A Perl pat-
- tern to solve the parentheses problem can be created like
- this:
-
- $re = qr{\( (?: (?>[^()]+) | (?p{$re}) )* \)}x;
-
- The (?p{...}) item interpolates Perl code at run time, and
- in this case refers recursively to the pattern in which it
- appears. Obviously, PCRE cannot support the interpolation of
- Perl code. Instead, the special item (?R) is provided for
- the specific case of recursion. This PCRE pattern solves the
- parentheses problem (assume the PCRE_EXTENDED option is set
- so that white space is ignored):
-
- \( ( (?>[^()]+) | (?R) )* \)
-
- First it matches an opening parenthesis. Then it matches any
- number of substrings which can either be a sequence of non-
- parentheses, or a recursive match of the pattern itself
- (i.e. a correctly parenthesized substring). Finally there is
- a closing parenthesis.
-
- This particular example pattern contains nested unlimited
- repeats, and so the use of a once-only subpattern for match-
- ing strings of non-parentheses is important when applying
- the pattern to strings that do not match. For example, when
- it is applied to
-
- (aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa()
-
- it yields "no match" quickly. However, if a once-only sub-
- pattern is not used, the match runs for a very long time
- indeed because there are so many different ways the + and *
- repeats can carve up the subject, and all have to be tested
- before failure can be reported.
-
- The values set for any capturing subpatterns are those from
- the outermost level of the recursion at which the subpattern
- value is set. If the pattern above is matched against
-
- (ab(cd)ef)
-
- the value for the capturing parentheses is "ef", which is
- the last value taken on at the top level. If additional
- parentheses are added, giving
-
- \( ( ( (?>[^()]+) | (?R) )* ) \)
- ^ ^
- ^ ^
-
- the string they capture is
- "ab(cd)ef", the contents of the top level parentheses. If
- there are more than 15 capturing parentheses in a pattern,
- PCRE has to obtain extra memory to store data during a
- recursion, which it does by using pcre_malloc, freeing it
- via pcre_free afterwards. If no memory can be obtained, it
- saves data for the first 15 capturing parentheses only, as
- there is no way to give an out-of-memory error from within a
- recursion.
-
-
-
-PERFORMANCE
- Certain items that may appear in patterns are more efficient
- than others. It is more efficient to use a character class
- like [aeiou] than a set of alternatives such as (a|e|i|o|u).
- In general, the simplest construction that provides the
- required behaviour is usually the most efficient. Jeffrey
- Friedl's book contains a lot of discussion about optimizing
- regular expressions for efficient performance.
-
- When a pattern begins with .* and the PCRE_DOTALL option is
- set, the pattern is implicitly anchored by PCRE, since it
- can match only at the start of a subject string. However, if
- PCRE_DOTALL is not set, PCRE cannot make this optimization,
- because the . metacharacter does not then match a newline,
- and if the subject string contains newlines, the pattern may
- match from the character immediately following one of them
- instead of from the very start. For example, the pattern
-
- (.*) second
-
- matches the subject "first\nand second" (where \n stands for
- a newline character) with the first captured substring being
- "and". In order to do this, PCRE has to retry the match
- starting after every newline in the subject.
-
- If you are using such a pattern with subject strings that do
- not contain newlines, the best performance is obtained by
- setting PCRE_DOTALL, or starting the pattern with ^.* to
- indicate explicit anchoring. That saves PCRE from having to
- scan along the subject looking for a newline to restart at.
-
- Beware of patterns that contain nested indefinite repeats.
- These can take a long time to run when applied to a string
- that does not match. Consider the pattern fragment
-
- (a+)*
-
- This can match "aaaa" in 16 different ways, and this number
- increases very rapidly as the string gets longer. (The *
- repeat can match 0, 1, 2, 3, or 4 times, and for each of
- those cases other than 0, the + repeats can match different
- numbers of times.) When the remainder of the pattern is such
- that the entire match is going to fail, PCRE has in princi-
- ple to try every possible variation, and this can take an
- extremely long time.
-
- An optimization catches some of the more simple cases such
- as
-
- (a+)*b
-
- where a literal character follows. Before embarking on the
- standard matching procedure, PCRE checks that there is a "b"
- later in the subject string, and if there is not, it fails
- the match immediately. However, when there is no following
- literal this optimization cannot be used. You can see the
- difference by comparing the behaviour of
-
- (a+)*\d
-
- with the pattern above. The pattern above gives a failure
- almost instantly when applied to a whole line of "a"
- characters, whereas (a+)*\d takes an appreciable time with
- strings longer than about 20 characters.
-
-
-
-UTF-8 SUPPORT
- Starting at release 3.3, PCRE has some support for character
- strings encoded in the UTF-8 format. This is incomplete, and
- is regarded as experimental. In order to use it, you must
- configure PCRE to include UTF-8 support in the code, and, in
- addition, you must call pcre_compile() with the PCRE_UTF8
- option flag. When you do this, both the pattern and any sub-
- ject strings that are matched against it are treated as
- UTF-8 strings instead of just strings of bytes, but only in
- the cases that are mentioned below.
-
- If you compile PCRE with UTF-8 support, but do not use it at
- run time, the library will be a bit bigger, but the addi-
- tional run time overhead is limited to testing the PCRE_UTF8
- flag in several places, so should not be very large.
-
- PCRE assumes that the strings it is given contain valid
- UTF-8 codes. It does not diagnose invalid UTF-8 strings. If
- you pass invalid UTF-8 strings to PCRE, the results are
- undefined.
-
- Running with PCRE_UTF8 set causes these changes in the way
- PCRE works:
-
- 1. In a pattern, the escape sequence \x{...}, where the con-
- tents of the braces is a string of hexadecimal digits, is
- interpreted as a UTF-8 character whose code number is the
- given hexadecimal number, for example: \x{1234}. This
- inserts from one to six literal bytes into the pattern,
- using the UTF-8 encoding. If a non-hexadecimal digit appears
- between the braces, the item is not recognized.
-
- 2. The original hexadecimal escape sequence, \xhh, generates
- a two-byte UTF-8 character if its value is greater than 127.
-
- 3. Repeat quantifiers are NOT correctly handled if they fol-
- low a multibyte character. For example, \x{100}* and \xc3+
- do not work. If you want to repeat such characters, you must
- enclose them in non-capturing parentheses, for example
- (?:\x{100}), at present.
-
- 4. The dot metacharacter matches one UTF-8 character instead
- of a single byte.
-
- 5. Unlike literal UTF-8 characters, the dot metacharacter
- followed by a repeat quantifier does operate correctly on
- UTF-8 characters instead of single bytes.
-
- 6. Although the \x{...} escape is permitted in a character
- class, characters whose values are greater than 255 cannot
- be included in a class.
-
- 7. A class is matched against a UTF-8 character instead of
- just a single byte, but it can match only characters whose
- values are less than 256. Characters with greater values
- always fail to match a class.
-
- 8. Repeated classes work correctly on multiple characters.
-
- 9. Classes containing just a single character whose value is
- greater than 127 (but less than 256), for example, [\x80] or
- [^\x{93}], do not work because these are optimized into sin-
- gle byte matches. In the first case, of course, the class
- brackets are just redundant.
-
- 10. Lookbehind assertions move backwards in the subject by a
- fixed number of characters instead of a fixed number of
- bytes. Simple cases have been tested to work correctly, but
- there may be hidden gotchas herein.
-
- 11. The character types such as \d and \w do not work
- correctly with UTF-8 characters. They continue to test a
- single byte.
-
- 12. Anything not explicitly mentioned here continues to work
- in bytes rather than in characters.
-
- The following UTF-8 features of Perl 5.6 are not imple-
- mented:
-
- 1. The escape sequence \C to match a single byte.
-
- 2. The use of Unicode tables and properties and escapes \p,
- \P, and \X.
-
-
-
-SAMPLE PROGRAM
- The code below is a simple, complete demonstration program,
- to get you started with using PCRE. This code is also sup-
- plied in the file pcredemo.c in the PCRE distribution.
-
- The program compiles the regular expression that is its
- first argument, and matches it against the subject string in
- its second argument. No options are set, and default charac-
- ter tables are used. If matching succeeds, the program out-
- puts the portion of the subject that matched, together with
- the contents of any captured substrings.
-
- On a Unix system that has PCRE installed in /usr/local, you
- can compile the demonstration program using a command like
- this:
-
- gcc -o pcredemo pcredemo.c -I/usr/local/include
- -L/usr/local/lib -lpcre
-
- Then you can run simple tests like this:
-
- ./pcredemo 'cat|dog' 'the cat sat on the mat'
-
- Note that there is a much more comprehensive test program,
- called pcretest, which supports many more facilities for
- testing regular expressions. The pcredemo program is pro-
- vided as a simple coding example.
-
- On some operating systems (e.g. Solaris) you may get an
- error like this when you try to run pcredemo:
-
- ld.so.1: a.out: fatal: libpcre.so.0: open failed: No such
- file or directory
-
- This is caused by the way shared library support works on
- those systems. You need to add
-
- -R/usr/local/lib
-
- to the compile command to get round this problem. Here's the
- code:
-
- #include <stdio.h>
- #include <string.h>
- #include <pcre.h>
-
- #define OVECCOUNT 30 /* should be a multiple of 3 */
-
- int main(int argc, char **argv)
- {
- pcre *re;
- const char *error;
- int erroffset;
- int ovector[OVECCOUNT];
- int rc, i;
-
- if (argc != 3)
- {
- printf("Two arguments required: a regex and a "
- "subject string\n");
- return 1;
- }
-
- /* Compile the regular expression in the first argument */
-
- re = pcre_compile(
- argv[1], /* the pattern */
- 0, /* default options */
- &error, /* for error message */
- &erroffset, /* for error offset */
- NULL); /* use default character tables */
-
- /* Compilation failed: print the error message and exit */
-
- if (re == NULL)
- {
- printf("PCRE compilation failed at offset %d: %s\n",
- erroffset, error);
- return 1;
- }
-
- /* Compilation succeeded: match the subject in the second
- argument */
-
- rc = pcre_exec(
- re, /* the compiled pattern */
- NULL, /* we didn't study the pattern */
- argv[2], /* the subject string */
- (int)strlen(argv[2]), /* the length of the subject */
- 0, /* start at offset 0 in the subject */
- 0, /* default options */
- ovector, /* vector for substring information */
- OVECCOUNT); /* number of elements in the vector */
-
- /* Matching failed: handle error cases */
-
- if (rc < 0)
- {
- switch(rc)
- {
- case PCRE_ERROR_NOMATCH: printf("No match\n"); break;
- /*
- Handle other special cases if you like
- */
- default: printf("Matching error %d\n", rc); break;
- }
- return 1;
- }
-
- /* Match succeeded */
-
- printf("Match succeeded\n");
-
- /* The output vector wasn't big enough */
-
- if (rc == 0)
- {
- rc = OVECCOUNT/3;
- printf("ovector only has room for %d captured "
- "substrings\n", rc - 1);
- }
-
- /* Show substrings stored in the output vector */
-
- for (i = 0; i < rc; i++)
- {
- char *substring_start = argv[2] + ovector[2*i];
- int substring_length = ovector[2*i+1] - ovector[2*i];
- printf("%2d: %.*s\n", i, substring_length,
- substring_start);
- }
-
- return 0;
- }
-
-
-
-AUTHOR
- Philip Hazel <ph10@cam.ac.uk>
- University Computing Service,
- New Museums Site,
- Cambridge CB2 3QG, England.
- Phone: +44 1223 334714
- Last updated: 15 August 2001
- Copyright (c) 1997-2001 University of Cambridge.
diff --git a/ext/pcre/pcrelib/doc/pcregrep.1 b/ext/pcre/pcrelib/doc/pcregrep.1
deleted file mode 100644
index b55745aca8..0000000000
--- a/ext/pcre/pcrelib/doc/pcregrep.1
+++ /dev/null
@@ -1,92 +0,0 @@
-.TH PCREGREP 1
-.SH NAME
-pcregrep - a grep with Perl-compatible regular expressions.
-.SH SYNOPSIS
-.B pcregrep [-Vcfhilnrsvx] [pattern] [file1 file2 ...]
-
-
-.SH DESCRIPTION
-\fBpcregrep\fR searches files for character patterns, in the same way as other
-grep commands do, but it uses the PCRE regular expression library to support
-patterns that are compatible with the regular expressions of Perl 5. See
-\fBpcre(3)\fR for a full description of syntax and semantics.
-
-A pattern must be specified on the command line unless the \fB-f\fR option is
-used (see below).
-
-If no files are specified, \fBpcregrep\fR reads the standard input. By default,
-each line that matches the pattern is copied to the standard output, and if
-there is more than one file, the file name is printed before each line of
-output. However, there are options that can change how \fBpcregrep\fR behaves.
-
-Lines are limited to BUFSIZ characters. BUFSIZ is defined in \fB<stdio.h>\fR.
-The newline character is removed from the end of each line before it is matched
-against the pattern.
-
-
-.SH OPTIONS
-.TP 10
-\fB-V\fR
-Write the version number of the PCRE library being used to the standard error
-stream.
-.TP
-\fB-c\fR
-Do not print individual lines; instead just print a count of the number of
-lines that would otherwise have been printed. If several files are given, a
-count is printed for each of them.
-.TP
-\fB-f\fIfilename\fR
-Read a number of patterns from the file, one per line, and
-match all of them against each line of input. A line is output if any of the
-patterns match it. When \fB-f\fR is used, no pattern is taken from the command
-line; all arguments are treated as file names. There is a maximum of 100
-patterns. Trailing white space is removed, and blank lines are ignored. An
-empty file contains no patterns and therefore matches nothing.
-.TP
-\fB-h\fR
-Suppress printing of filenames when searching multiple files.
-.TP
-\fB-i\fR
-Ignore upper/lower case distinctions during comparisons.
-.TP
-\fB-l\fR
-Instead of printing lines from the files, just print the names of the files
-containing lines that would have been printed. Each file name is printed
-once, on a separate line.
-.TP
-\fB-n\fR
-Precede each line by its line number in the file.
-.TP
-\fB-r\fR
-If any file is a directory, recursively scan the files it contains. Without
-\fB-r\fR a directory is scanned as a normal file.
-.TP
-\fB-s\fR
-Work silently, that is, display nothing except error messages.
-The exit status indicates whether any matches were found.
-.TP
-\fB-v\fR
-Invert the sense of the match, so that lines which do \fInot\fR match the
-pattern are now the ones that are found.
-.TP
-\fB-x\fR
-Force the pattern to be anchored (it must start matching at the beginning of
-the line) and in addition, require it to match the entire line. This is
-equivalent to having ^ and $ characters at the start and end of each
-alternative branch in the regular expression.
-
-
-.SH SEE ALSO
-\fBpcre(3)\fR, Perl 5 documentation
-
-
-.SH DIAGNOSTICS
-Exit status is 0 if any matches were found, 1 if no matches were found, and 2
-for syntax errors or inaccessible files (even if matches were found).
-
-
-.SH AUTHOR
-Philip Hazel <ph10@cam.ac.uk>
-
-Last updated: 25 July 2002
-.br
-Copyright (c) 1997-2002 University of Cambridge.
diff --git a/ext/pcre/pcrelib/doc/pcregrep.html b/ext/pcre/pcrelib/doc/pcregrep.html
deleted file mode 100644
index 20151d4a08..0000000000
--- a/ext/pcre/pcrelib/doc/pcregrep.html
+++ /dev/null
@@ -1,125 +0,0 @@
-<HTML>
-<HEAD>
-<TITLE>pcregrep specification</TITLE>
-</HEAD>
-<body bgcolor="#FFFFFF" text="#00005A">
-<H1>pcregrep specification</H1>
-This HTML document has been generated automatically from the original man page.
-If there is any nonsense in it, please consult the man page in case the
-conversion went wrong.
-<UL>
-<LI><A NAME="TOC1" HREF="#SEC1">NAME</A>
-<LI><A NAME="TOC2" HREF="#SEC2">SYNOPSIS</A>
-<LI><A NAME="TOC3" HREF="#SEC3">DESCRIPTION</A>
-<LI><A NAME="TOC4" HREF="#SEC4">OPTIONS</A>
-<LI><A NAME="TOC5" HREF="#SEC5">SEE ALSO</A>
-<LI><A NAME="TOC6" HREF="#SEC6">DIAGNOSTICS</A>
-<LI><A NAME="TOC7" HREF="#SEC7">AUTHOR</A>
-</UL>
-<LI><A NAME="SEC1" HREF="#TOC1">NAME</A>
-<P>
-pcregrep - a grep with Perl-compatible regular expressions.
-</P>
-<LI><A NAME="SEC2" HREF="#TOC1">SYNOPSIS</A>
-<P>
-<B>pcregrep [-Vcfhilnrsvx] [pattern] [file1 file2 ...]</B>
-</P>
-<LI><A NAME="SEC3" HREF="#TOC1">DESCRIPTION</A>
-<P>
-<B>pcregrep</B> searches files for character patterns, in the same way as other
-grep commands do, but it uses the PCRE regular expression library to support
-patterns that are compatible with the regular expressions of Perl 5. See
-<B>pcre(3)</B> for a full description of syntax and semantics.
-</P>
-<P>
-A pattern must be specified on the command line unless the <B>-f</B> option is
-used (see below).
-</P>
-<P>
-If no files are specified, <B>pcregrep</B> reads the standard input. By default,
-each line that matches the pattern is copied to the standard output, and if
-there is more than one file, the file name is printed before each line of
-output. However, there are options that can change how <B>pcregrep</B> behaves.
-</P>
-<P>
-Lines are limited to BUFSIZ characters. BUFSIZ is defined in <B>&#60;stdio.h&#62;</B>.
-The newline character is removed from the end of each line before it is matched
-against the pattern.
-</P>
-<LI><A NAME="SEC4" HREF="#TOC1">OPTIONS</A>
-<P>
-<B>-V</B>
-Write the version number of the PCRE library being used to the standard error
-stream.
-</P>
-<P>
-<B>-c</B>
-Do not print individual lines; instead just print a count of the number of
-lines that would otherwise have been printed. If several files are given, a
-count is printed for each of them.
-</P>
-<P>
-<B>-f</B><I>filename</I> Read a number of patterns from the file, one per line, and
-match all of them against each line of input. A line is output if any of the
-patterns match it. When <B>-f</B> is used, no pattern is taken from the command
-line; all arguments are treated as file names. There is a maximum of 100
-patterns. Trailing white space is removed, and blank lines are ignored. An
-empty file contains no patterns and therefore matches nothing.
-</P>
-<P>
-<B>-h</B>
-Suppress printing of filenames when searching multiple files.
-</P>
-<P>
-<B>-i</B>
-Ignore upper/lower case distinctions during comparisons.
-</P>
-<P>
-<B>-l</B>
-Instead of printing lines from the files, just print the names of the files
-containing lines that would have been printed. Each file name is printed
-once, on a separate line.
-</P>
-<P>
-<B>-n</B>
-Precede each line by its line number in the file.
-</P>
-<P>
-<B>-r</B>
-If any file is a directory, recursively scan the files it contains. Without
-<B>-r</B> a directory is scanned as a normal file.
-</P>
-<P>
-<B>-s</B>
-Work silently, that is, display nothing except error messages.
-The exit status indicates whether any matches were found.
-</P>
-<P>
-<B>-v</B>
-Invert the sense of the match, so that lines which do <I>not</I> match the
-pattern are now the ones that are found.
-</P>
-<P>
-<B>-x</B>
-Force the pattern to be anchored (it must start matching at the beginning of
-the line) and in addition, require it to match the entire line. This is
-equivalent to having ^ and $ characters at the start and end of each
-alternative branch in the regular expression.
-</P>
-<LI><A NAME="SEC5" HREF="#TOC1">SEE ALSO</A>
-<P>
-<B>pcre(3)</B>, Perl 5 documentation
-</P>
-<LI><A NAME="SEC6" HREF="#TOC1">DIAGNOSTICS</A>
-<P>
-Exit status is 0 if any matches were found, 1 if no matches were found, and 2
-for syntax errors or inaccessible files (even if matches were found).
-</P>
-<LI><A NAME="SEC7" HREF="#TOC1">AUTHOR</A>
-<P>
-Philip Hazel &#60;ph10@cam.ac.uk&#62;
-</P>
-<P>
-Last updated: 25 July 2002
-<BR>
-Copyright (c) 1997-2002 University of Cambridge.
diff --git a/ext/pcre/pcrelib/doc/pcregrep.txt b/ext/pcre/pcrelib/doc/pcregrep.txt
deleted file mode 100644
index ce53f7a889..0000000000
--- a/ext/pcre/pcrelib/doc/pcregrep.txt
+++ /dev/null
@@ -1,109 +0,0 @@
-NAME
- pcregrep - a grep with Perl-compatible regular expressions.
-
-
-
-SYNOPSIS
- pcregrep [-Vcfhilnrsvx] [pattern] [file1 file2 ...]
-
-
-
-DESCRIPTION
- pcregrep searches files for character patterns, in the same
- way as other grep commands do, but it uses the PCRE regular
- expression library to support patterns that are compatible
- with the regular expressions of Perl 5. See pcre(3) for a
- full description of syntax and semantics.
-
- A pattern must be specified on the command line unless the
- -f option is used (see below).
-
- If no files are specified, pcregrep reads the standard
- input. By default, each line that matches the pattern is
- copied to the standard output, and if there is more than one
- file, the file name is printed before each line of output.
- However, there are options that can change how pcregrep
- behaves.
-
- Lines are limited to BUFSIZ characters. BUFSIZ is defined in
- <stdio.h>. The newline character is removed from the end of
- each line before it is matched against the pattern.
-
-
-
-OPTIONS
- -V Write the version number of the PCRE library being
- used to the standard error stream.
-
- -c Do not print individual lines; instead just print
- a count of the number of lines that would other-
- wise have been printed. If several files are
- given, a count is printed for each of them.
-
-
-
- -ffilename
- Read a number of patterns from the file, one
- per line, and match all of them against each
- line of input. A
- line is output if any of the patterns match it.
- When -f is used, no pattern is taken from the com-
- mand line; all arguments are treated as file
- names. There is a maximum of 100 patterns. Trail-
- ing white space is removed, and blank lines are
- ignored. An empty file contains no patterns and
- therefore matches nothing.
-
- -h Suppress printing of filenames when searching mul-
- tiple files.
-
- -i Ignore upper/lower case distinctions during com-
- parisons.
-
- -l Instead of printing lines from the files, just
- print the names of the files containing lines that
- would have been printed. Each file name is printed
- once, on a separate line.
-
- -n Precede each line by its line number in the file.
-
- -r If any file is a directory, recursively scan the
- files it contains. Without -r a directory is
- scanned as a normal file.
-
- -s Work silently, that is, display nothing except
- error messages. The exit status indicates whether
- any matches were found.
-
- -v Invert the sense of the match, so that lines which
- do not match the pattern are now the ones that are
- found.
-
- -x Force the pattern to be anchored (it must start
- matching at the beginning of the line) and in
- addition, require it to match the entire line.
- This is equivalent to having ^ and $ characters at
- the start and end of each alternative branch in
- the regular expression.
-
-
-
-SEE ALSO
- pcre(3), Perl 5 documentation
-
-
-
-
-
-DIAGNOSTICS
- Exit status is 0 if any matches were found, 1 if no matches
- were found, and 2 for syntax errors or inaccessible files
- (even if matches were found).
-
-
-
-AUTHOR
- Philip Hazel <ph10@cam.ac.uk>
-
- Last updated: 25 July 2002
- Copyright (c) 1997-2002 University of Cambridge.
diff --git a/ext/pcre/pcrelib/doc/pcreposix.3 b/ext/pcre/pcrelib/doc/pcreposix.3
deleted file mode 100644
index 41716ead91..0000000000
--- a/ext/pcre/pcrelib/doc/pcreposix.3
+++ /dev/null
@@ -1,149 +0,0 @@
-.TH PCRE 3
-.SH NAME
-pcreposix - POSIX API for Perl-compatible regular expressions.
-.SH SYNOPSIS
-.B #include <pcreposix.h>
-.PP
-.SM
-.br
-.B int regcomp(regex_t *\fIpreg\fR, const char *\fIpattern\fR,
-.ti +5n
-.B int \fIcflags\fR);
-.PP
-.br
-.B int regexec(regex_t *\fIpreg\fR, const char *\fIstring\fR,
-.ti +5n
-.B size_t \fInmatch\fR, regmatch_t \fIpmatch\fR[], int \fIeflags\fR);
-.PP
-.br
-.B size_t regerror(int \fIerrcode\fR, const regex_t *\fIpreg\fR,
-.ti +5n
-.B char *\fIerrbuf\fR, size_t \fIerrbuf_size\fR);
-.PP
-.br
-.B void regfree(regex_t *\fIpreg\fR);
-
-
-.SH DESCRIPTION
-This set of functions provides a POSIX-style API to the PCRE regular expression
-package. See the \fBpcre\fR documentation for a description of the native API,
-which contains additional functionality.
-
-The functions described here are just wrapper functions that ultimately call
-the native API. Their prototypes are defined in the \fBpcreposix.h\fR header
-file, and on Unix systems the library itself is called \fBpcreposix.a\fR, so
-can be accessed by adding \fB-lpcreposix\fR to the command for linking an
-application which uses them. Because the POSIX functions call the native ones,
-it is also necessary to add \fB-lpcre\fR.
-
-I have implemented only those option bits that can be reasonably mapped to PCRE
-native options. In addition, the options REG_EXTENDED and REG_NOSUB are defined
-with the value zero. They have no effect, but since programs that are written
-to the POSIX interface often use them, this makes it easier to slot in PCRE as
-a replacement library. Other POSIX options are not even defined.
-
-When PCRE is called via these functions, it is only the API that is POSIX-like
-in style. The syntax and semantics of the regular expressions themselves are
-still those of Perl, subject to the setting of various PCRE options, as
-described below.
-
-The header for these functions is supplied as \fBpcreposix.h\fR to avoid any
-potential clash with other POSIX libraries. It can, of course, be renamed or
-aliased as \fBregex.h\fR, which is the "correct" name. It provides two
-structure types, \fIregex_t\fR for compiled internal forms, and
-\fIregmatch_t\fR for returning captured substrings. It also defines some
-constants whose names start with "REG_"; these are used for setting options and
-identifying error codes.
-
-
-.SH COMPILING A PATTERN
-
-The function \fBregcomp()\fR is called to compile a pattern into an
-internal form. The pattern is a C string terminated by a binary zero, and
-is passed in the argument \fIpattern\fR. The \fIpreg\fR argument is a pointer
-to a regex_t structure which is used as a base for storing information about
-the compiled expression.
-
-The argument \fIcflags\fR is either zero, or contains one or more of the bits
-defined by the following macros:
-
- REG_ICASE
-
-The PCRE_CASELESS option is set when the expression is passed for compilation
-to the native function.
-
- REG_NEWLINE
-
-The PCRE_MULTILINE option is set when the expression is passed for compilation
-to the native function.
-
-In the absence of these flags, no options are passed to the native function.
-This means that the regex is compiled with PCRE default semantics. In
-particular, the way it handles newline characters in the subject string is the
-Perl way, not the POSIX way. Note that setting PCRE_MULTILINE has only
-\fIsome\fR of the effects specified for REG_NEWLINE. It does not affect the way
-newlines are matched by . (they aren't) or a negative class such as [^a] (they
-are).
-
-The yield of \fBregcomp()\fR is zero on success, and non-zero otherwise. The
-\fIpreg\fR structure is filled in on success, and one member of the structure
-is publicized: \fIre_nsub\fR contains the number of capturing subpatterns in
-the regular expression. Various error codes are defined in the header file.
-
-
-.SH MATCHING A PATTERN
-The function \fBregexec()\fR is called to match a pre-compiled pattern
-\fIpreg\fR against a given \fIstring\fR, which is terminated by a zero byte,
-subject to the options in \fIeflags\fR. These can be:
-
- REG_NOTBOL
-
-The PCRE_NOTBOL option is set when calling the underlying PCRE matching
-function.
-
- REG_NOTEOL
-
-The PCRE_NOTEOL option is set when calling the underlying PCRE matching
-function.
-
-The portion of the string that was matched, and also any captured substrings,
-are returned via the \fIpmatch\fR argument, which points to an array of
-\fInmatch\fR structures of type \fIregmatch_t\fR, containing the members
-\fIrm_so\fR and \fIrm_eo\fR. These contain the offset to the first character of
-each substring and the offset to the first character after the end of each
-substring, respectively. The 0th element of the vector relates to the entire
-portion of \fIstring\fR that was matched; subsequent elements relate to the
-capturing subpatterns of the regular expression. Unused entries in the array
-have both structure members set to -1.
-
-A successful match yields a zero return; various error codes are defined in the
-header file, of which REG_NOMATCH is the "expected" failure code.
-
-
-.SH ERROR MESSAGES
-The \fBregerror()\fR function maps a non-zero errorcode from either
-\fBregcomp\fR or \fBregexec\fR to a printable message. If \fIpreg\fR is not
-NULL, the error should have arisen from the use of that structure. A message
-terminated by a binary zero is placed in \fIerrbuf\fR. The length of the
-message, including the zero, is limited to \fIerrbuf_size\fR. The yield of the
-function is the size of buffer needed to hold the whole message.
-
-
-.SH STORAGE
-Compiling a regular expression causes memory to be allocated and associated
-with the \fIpreg\fR structure. The function \fBregfree()\fR frees all such
-memory, after which \fIpreg\fR may no longer be used as a compiled expression.
-
-
-.SH AUTHOR
-Philip Hazel <ph10@cam.ac.uk>
-.br
-University Computing Service,
-.br
-New Museums Site,
-.br
-Cambridge CB2 3QG, England.
-.br
-Phone: +44 1223 334714
-
-Copyright (c) 1997-2000 University of Cambridge.
diff --git a/ext/pcre/pcrelib/doc/pcreposix.html b/ext/pcre/pcrelib/doc/pcreposix.html
deleted file mode 100644
index 9c89478420..0000000000
--- a/ext/pcre/pcrelib/doc/pcreposix.html
+++ /dev/null
@@ -1,191 +0,0 @@
-<HTML>
-<HEAD>
-<TITLE>pcreposix specification</TITLE>
-</HEAD>
-<body bgcolor="#FFFFFF" text="#00005A">
-<H1>pcreposix specification</H1>
-This HTML document has been generated automatically from the original man page.
-If there is any nonsense in it, please consult the man page in case the
-conversion went wrong.
-<UL>
-<LI><A NAME="TOC1" HREF="#SEC1">NAME</A>
-<LI><A NAME="TOC2" HREF="#SEC2">SYNOPSIS</A>
-<LI><A NAME="TOC3" HREF="#SEC3">DESCRIPTION</A>
-<LI><A NAME="TOC4" HREF="#SEC4">COMPILING A PATTERN</A>
-<LI><A NAME="TOC5" HREF="#SEC5">MATCHING A PATTERN</A>
-<LI><A NAME="TOC6" HREF="#SEC6">ERROR MESSAGES</A>
-<LI><A NAME="TOC7" HREF="#SEC7">STORAGE</A>
-<LI><A NAME="TOC8" HREF="#SEC8">AUTHOR</A>
-</UL>
-<LI><A NAME="SEC1" HREF="#TOC1">NAME</A>
-<P>
-pcreposix - POSIX API for Perl-compatible regular expressions.
-</P>
-<LI><A NAME="SEC2" HREF="#TOC1">SYNOPSIS</A>
-<P>
-<B>#include &#60;pcreposix.h&#62;</B>
-</P>
-<P>
-<B>int regcomp(regex_t *<I>preg</I>, const char *<I>pattern</I>,</B>
-<B>int <I>cflags</I>);</B>
-</P>
-<P>
-<B>int regexec(regex_t *<I>preg</I>, const char *<I>string</I>,</B>
-<B>size_t <I>nmatch</I>, regmatch_t <I>pmatch</I>[], int <I>eflags</I>);</B>
-</P>
-<P>
-<B>size_t regerror(int <I>errcode</I>, const regex_t *<I>preg</I>,</B>
-<B>char *<I>errbuf</I>, size_t <I>errbuf_size</I>);</B>
-</P>
-<P>
-<B>void regfree(regex_t *<I>preg</I>);</B>
-</P>
-<LI><A NAME="SEC3" HREF="#TOC1">DESCRIPTION</A>
-<P>
-This set of functions provides a POSIX-style API to the PCRE regular expression
-package. See the <B>pcre</B> documentation for a description of the native API,
-which contains additional functionality.
-</P>
-<P>
-The functions described here are just wrapper functions that ultimately call
-the native API. Their prototypes are defined in the <B>pcreposix.h</B> header
-file, and on Unix systems the library itself is called <B>pcreposix.a</B>, so
-can be accessed by adding <B>-lpcreposix</B> to the command for linking an
-application which uses them. Because the POSIX functions call the native ones,
-it is also necessary to add <B>-lpcre</B>.
-</P>
-<P>
-I have implemented only those option bits that can be reasonably mapped to PCRE
-native options. In addition, the options REG_EXTENDED and REG_NOSUB are defined
-with the value zero. They have no effect, but since programs that are written
-to the POSIX interface often use them, this makes it easier to slot in PCRE as
-a replacement library. Other POSIX options are not even defined.
-</P>
-<P>
-When PCRE is called via these functions, it is only the API that is POSIX-like
-in style. The syntax and semantics of the regular expressions themselves are
-still those of Perl, subject to the setting of various PCRE options, as
-described below.
-</P>
-<P>
-The header for these functions is supplied as <B>pcreposix.h</B> to avoid any
-potential clash with other POSIX libraries. It can, of course, be renamed or
-aliased as <B>regex.h</B>, which is the "correct" name. It provides two
-structure types, <I>regex_t</I> for compiled internal forms, and
-<I>regmatch_t</I> for returning captured substrings. It also defines some
-constants whose names start with "REG_"; these are used for setting options and
-identifying error codes.
-</P>
-<LI><A NAME="SEC4" HREF="#TOC1">COMPILING A PATTERN</A>
-<P>
-The function <B>regcomp()</B> is called to compile a pattern into an
-internal form. The pattern is a C string terminated by a binary zero, and
-is passed in the argument <I>pattern</I>. The <I>preg</I> argument is a pointer
-to a regex_t structure which is used as a base for storing information about
-the compiled expression.
-</P>
-<P>
-The argument <I>cflags</I> is either zero, or contains one or more of the bits
-defined by the following macros:
-</P>
-<P>
-<PRE>
- REG_ICASE
-</PRE>
-</P>
-<P>
-The PCRE_CASELESS option is set when the expression is passed for compilation
-to the native function.
-</P>
-<P>
-<PRE>
- REG_NEWLINE
-</PRE>
-</P>
-<P>
-The PCRE_MULTILINE option is set when the expression is passed for compilation
-to the native function.
-</P>
-<P>
-In the absence of these flags, no options are passed to the native function.
-This means that the regex is compiled with PCRE default semantics. In
-particular, the way it handles newline characters in the subject string is the
-Perl way, not the POSIX way. Note that setting PCRE_MULTILINE has only
-<I>some</I> of the effects specified for REG_NEWLINE. It does not affect the way
-newlines are matched by . (they aren't) or a negative class such as [^a] (they
-are).
-</P>
-<P>
-The yield of <B>regcomp()</B> is zero on success, and non-zero otherwise. The
-<I>preg</I> structure is filled in on success, and one member of the structure
-is publicized: <I>re_nsub</I> contains the number of capturing subpatterns in
-the regular expression. Various error codes are defined in the header file.
-</P>
-<LI><A NAME="SEC5" HREF="#TOC1">MATCHING A PATTERN</A>
-<P>
-The function <B>regexec()</B> is called to match a pre-compiled pattern
-<I>preg</I> against a given <I>string</I>, which is terminated by a zero byte,
-subject to the options in <I>eflags</I>. These can be:
-</P>
-<P>
-<PRE>
- REG_NOTBOL
-</PRE>
-</P>
-<P>
-The PCRE_NOTBOL option is set when calling the underlying PCRE matching
-function.
-</P>
-<P>
-<PRE>
- REG_NOTEOL
-</PRE>
-</P>
-<P>
-The PCRE_NOTEOL option is set when calling the underlying PCRE matching
-function.
-</P>
-<P>
-The portion of the string that was matched, and also any captured substrings,
-are returned via the <I>pmatch</I> argument, which points to an array of
-<I>nmatch</I> structures of type <I>regmatch_t</I>, containing the members
-<I>rm_so</I> and <I>rm_eo</I>. These contain the offset to the first character of
-each substring and the offset to the first character after the end of each
-substring, respectively. The 0th element of the vector relates to the entire
-portion of <I>string</I> that was matched; subsequent elements relate to the
-capturing subpatterns of the regular expression. Unused entries in the array
-have both structure members set to -1.
-</P>
-<P>
-A successful match yields a zero return; various error codes are defined in the
-header file, of which REG_NOMATCH is the "expected" failure code.
-</P>
-<LI><A NAME="SEC6" HREF="#TOC1">ERROR MESSAGES</A>
-<P>
-The <B>regerror()</B> function maps a non-zero errorcode from either
-<B>regcomp</B> or <B>regexec</B> to a printable message. If <I>preg</I> is not
-NULL, the error should have arisen from the use of that structure. A message
-terminated by a binary zero is placed in <I>errbuf</I>. The length of the
-message, including the zero, is limited to <I>errbuf_size</I>. The yield of the
-function is the size of buffer needed to hold the whole message.
-</P>
-<LI><A NAME="SEC7" HREF="#TOC1">STORAGE</A>
-<P>
-Compiling a regular expression causes memory to be allocated and associated
-with the <I>preg</I> structure. The function <B>regfree()</B> frees all such
-memory, after which <I>preg</I> may no longer be used as a compiled expression.
-</P>
-<LI><A NAME="SEC8" HREF="#TOC1">AUTHOR</A>
-<P>
-Philip Hazel &#60;ph10@cam.ac.uk&#62;
-<BR>
-University Computing Service,
-<BR>
-New Museums Site,
-<BR>
-Cambridge CB2 3QG, England.
-<BR>
-Phone: +44 1223 334714
-</P>
-<P>
-Copyright (c) 1997-2000 University of Cambridge.
diff --git a/ext/pcre/pcrelib/doc/pcreposix.txt b/ext/pcre/pcrelib/doc/pcreposix.txt
deleted file mode 100644
index 2d76f7cdcc..0000000000
--- a/ext/pcre/pcrelib/doc/pcreposix.txt
+++ /dev/null
@@ -1,159 +0,0 @@
-NAME
- pcreposix - POSIX API for Perl-compatible regular expres-
- sions.
-
-
-
-SYNOPSIS
- #include <pcreposix.h>
-
- int regcomp(regex_t *preg, const char *pattern,
- int cflags);
-
- int regexec(regex_t *preg, const char *string,
- size_t nmatch, regmatch_t pmatch[], int eflags);
-
- size_t regerror(int errcode, const regex_t *preg,
- char *errbuf, size_t errbuf_size);
-
- void regfree(regex_t *preg);
-
-
-
-DESCRIPTION
- This set of functions provides a POSIX-style API to the PCRE
- regular expression package. See the pcre documentation for a
- description of the native API, which contains additional
- functionality.
-
- The functions described here are just wrapper functions that
- ultimately call the native API. Their prototypes are defined
- in the pcreposix.h header file, and on Unix systems the
- library itself is called pcreposix.a, so can be accessed by
- adding -lpcreposix to the command for linking an application
- which uses them. Because the POSIX functions call the native
- ones, it is also necessary to add -lpcre.
-
- I have implemented only those option bits that can be rea-
- sonably mapped to PCRE native options. In addition, the
- options REG_EXTENDED and REG_NOSUB are defined with the
- value zero. They have no effect, but since programs that are
- written to the POSIX interface often use them, this makes it
- easier to slot in PCRE as a replacement library. Other POSIX
- options are not even defined.
-
- When PCRE is called via these functions, it is only the API
- that is POSIX-like in style. The syntax and semantics of the
- regular expressions themselves are still those of Perl, sub-
- ject to the setting of various PCRE options, as described
- below.
-
- The header for these functions is supplied as pcreposix.h to
- avoid any potential clash with other POSIX libraries. It
- can, of course, be renamed or aliased as regex.h, which is
- the "correct" name. It provides two structure types, regex_t
- for compiled internal forms, and regmatch_t for returning
- captured substrings. It also defines some constants whose
- names start with "REG_"; these are used for setting options
- and identifying error codes.
-
-
-
-COMPILING A PATTERN
- The function regcomp() is called to compile a pattern into
- an internal form. The pattern is a C string terminated by a
- binary zero, and is passed in the argument pattern. The preg
- argument is a pointer to a regex_t structure which is used
- as a base for storing information about the compiled expres-
- sion.
-
- The argument cflags is either zero, or contains one or more
- of the bits defined by the following macros:
-
- REG_ICASE
-
- The PCRE_CASELESS option is set when the expression is
- passed for compilation to the native function.
-
- REG_NEWLINE
-
- The PCRE_MULTILINE option is set when the expression is
- passed for compilation to the native function.
-
- In the absence of these flags, no options are passed to the
- native function. This means that the regex is compiled with
- PCRE default semantics. In particular, the way it handles
- newline characters in the subject string is the Perl way,
- not the POSIX way. Note that setting PCRE_MULTILINE has only
- some of the effects specified for REG_NEWLINE. It does not
- affect the way newlines are matched by . (they aren't) or a
- negative class such as [^a] (they are).
-
- The yield of regcomp() is zero on success, and non-zero oth-
- erwise. The preg structure is filled in on success, and one
- member of the structure is publicized: re_nsub contains the
- number of capturing subpatterns in the regular expression.
- Various error codes are defined in the header file.
-
-
-
-MATCHING A PATTERN
- The function regexec() is called to match a pre-compiled
- pattern preg against a given string, which is terminated by
- a zero byte, subject to the options in eflags. These can be:
-
- REG_NOTBOL
-
- The PCRE_NOTBOL option is set when calling the underlying
- PCRE matching function.
-
- REG_NOTEOL
-
- The PCRE_NOTEOL option is set when calling the underlying
- PCRE matching function.
-
- The portion of the string that was matched, and also any
- captured substrings, are returned via the pmatch argument,
- which points to an array of nmatch structures of type
- regmatch_t, containing the members rm_so and rm_eo. These
- contain the offset to the first character of each substring
- and the offset to the first character after the end of each
- substring, respectively. The 0th element of the vector
- relates to the entire portion of string that was matched;
- subsequent elements relate to the capturing subpatterns of
- the regular expression. Unused entries in the array have
- both structure members set to -1.
-
- A successful match yields a zero return; various error codes
- are defined in the header file, of which REG_NOMATCH is the
- "expected" failure code.
-
-
-
-ERROR MESSAGES
- The regerror() function maps a non-zero errorcode from
- either regcomp or regexec to a printable message. If preg is
- not NULL, the error should have arisen from the use of that
- structure. A message terminated by a binary zero is placed
- in errbuf. The length of the message, including the zero, is
- limited to errbuf_size. The yield of the function is the
- size of buffer needed to hold the whole message.
-
-
-
-STORAGE
- Compiling a regular expression causes memory to be allocated
- and associated with the preg structure. The function reg-
- free() frees all such memory, after which preg may no longer
- be used as a compiled expression.
-
-
-
-AUTHOR
- Philip Hazel <ph10@cam.ac.uk>
- University Computing Service,
- New Museums Site,
- Cambridge CB2 3QG, England.
- Phone: +44 1223 334714
-
- Copyright (c) 1997-2000 University of Cambridge.
diff --git a/ext/pcre/pcrelib/doc/pcretest.1 b/ext/pcre/pcrelib/doc/pcretest.1
deleted file mode 100644
index 02afe7a49d..0000000000
--- a/ext/pcre/pcrelib/doc/pcretest.1
+++ /dev/null
@@ -1,288 +0,0 @@
-.TH PCRETEST 1
-.SH NAME
-pcretest - a program for testing Perl-compatible regular expressions.
-.SH SYNOPSIS
-.B pcretest "[-d] [-i] [-m] [-o osize] [-p] [-t] [source] [destination]"
-
-\fBpcretest\fR was written as a test program for the PCRE regular expression
-library itself, but it can also be used for experimenting with regular
-expressions. This man page describes the features of the test program; for
-details of the regular expressions themselves, see the \fBpcre\fR man page.
-
-.SH OPTIONS
-.TP 10
-\fB-d\fR
-Behave as if each regex had the \fB/D\fR modifier (see below); the internal
-form is output after compilation.
-.TP 10
-\fB-i\fR
-Behave as if each regex had the \fB/I\fR modifier; information about the
-compiled pattern is given after compilation.
-.TP 10
-\fB-m\fR
-Output the size of each compiled pattern after it has been compiled. This is
-equivalent to adding /M to each regular expression. For compatibility with
-earlier versions of pcretest, \fB-s\fR is a synonym for \fB-m\fR.
-.TP 10
-\fB-o\fR \fIosize\fR
-Set the number of elements in the output vector that is used when calling PCRE
-to be \fIosize\fR. The default value is 45, which is enough for 14 capturing
-subexpressions. The vector size can be changed for individual matching calls by
-including \\O in the data line (see below).
-.TP 10
-\fB-p\fR
-Behave as if each regex had the \fB/P\fR modifier; the POSIX wrapper API is used
-to call PCRE. None of the other options has any effect when \fB-p\fR is set.
-.TP 10
-\fB-t\fR
-Run each compile, study, and match many times with a timer, and output
-resulting time per compile or match (in milliseconds). Do not set \fB-t\fR with
-\fB-m\fR, because you will then get the size output 20000 times and the timing
-will be distorted.
-
-
-.SH DESCRIPTION
-
-If \fBpcretest\fR is given two filename arguments, it reads from the first and
-writes to the second. If it is given only one filename argument, it reads from
-that file and writes to stdout. Otherwise, it reads from stdin and writes to
-stdout, and prompts for each line of input, using "re>" to prompt for regular
-expressions, and "data>" to prompt for data lines.
-
-The program handles any number of sets of input on a single input file. Each
-set starts with a regular expression, and continues with any number of data
-lines to be matched against the pattern.
-
-Each line is matched separately and independently. If you want to do
-multiple-line matches, you have to use the \\n escape sequence in a single line
-of input to encode the newline characters. The maximum length of data line is
-30,000 characters.
-
-An empty line signals the end of the data lines, at which point a new regular
-expression is read. The regular expressions are given enclosed in any
-non-alphameric delimiters other than backslash, for example
-
- /(a|bc)x+yz/
-
-White space before the initial delimiter is ignored. A regular expression may
-be continued over several input lines, in which case the newline characters are
-included within it. It is possible to include the delimiter within the pattern
-by escaping it, for example
-
- /abc\\/def/
-
-If you do so, the escape and the delimiter form part of the pattern, but since
-delimiters are always non-alphameric, this does not affect its interpretation.
-If the terminating delimiter is immediately followed by a backslash, for
-example,
-
- /abc/\\
-
-then a backslash is added to the end of the pattern. This is done to provide a
-way of testing the error condition that arises if a pattern finishes with a
-backslash, because
-
- /abc\\/
-
-is interpreted as the first line of a pattern that starts with "abc/", causing
-pcretest to read the next line as a continuation of the regular expression.
-
-
-.SH PATTERN MODIFIERS
-
-The pattern may be followed by \fBi\fR, \fBm\fR, \fBs\fR, or \fBx\fR to set the
-PCRE_CASELESS, PCRE_MULTILINE, PCRE_DOTALL, or PCRE_EXTENDED options,
-respectively. For example:
-
- /caseless/i
-
-These modifier letters have the same effect as they do in Perl. There are
-others which set PCRE options that do not correspond to anything in Perl:
-\fB/A\fR, \fB/E\fR, and \fB/X\fR set PCRE_ANCHORED, PCRE_DOLLAR_ENDONLY, and
-PCRE_EXTRA respectively.
-
-Searching for all possible matches within each subject string can be requested
-by the \fB/g\fR or \fB/G\fR modifier. After finding a match, PCRE is called
-again to search the remainder of the subject string. The difference between
-\fB/g\fR and \fB/G\fR is that the former uses the \fIstartoffset\fR argument to
-\fBpcre_exec()\fR to start searching at a new point within the entire string
-(which is in effect what Perl does), whereas the latter passes over a shortened
-substring. This makes a difference to the matching process if the pattern
-begins with a lookbehind assertion (including \\b or \\B).
-
-If any call to \fBpcre_exec()\fR in a \fB/g\fR or \fB/G\fR sequence matches an
-empty string, the next call is done with the PCRE_NOTEMPTY and PCRE_ANCHORED
-flags set in order to search for another, non-empty, match at the same point.
-If this second match fails, the start offset is advanced by one, and the normal
-match is retried. This imitates the way Perl handles such cases when using the
-\fB/g\fR modifier or the \fBsplit()\fR function.
-
-There are a number of other modifiers for controlling the way \fBpcretest\fR
-operates.
-
-The \fB/+\fR modifier requests that as well as outputting the substring that
-matched the entire pattern, pcretest should in addition output the remainder of
-the subject string. This is useful for tests where the subject contains
-multiple copies of the same substring.
-
-The \fB/L\fR modifier must be followed directly by the name of a locale, for
-example,
-
- /pattern/Lfr
-
-For this reason, it must be the last modifier letter. The given locale is set,
-\fBpcre_maketables()\fR is called to build a set of character tables for the
-locale, and this is then passed to \fBpcre_compile()\fR when compiling the
-regular expression. Without an \fB/L\fR modifier, NULL is passed as the tables
-pointer; that is, \fB/L\fR applies only to the expression on which it appears.
-
-The \fB/I\fR modifier requests that \fBpcretest\fR output information about the
-compiled expression (whether it is anchored, has a fixed first character, and
-so on). It does this by calling \fBpcre_fullinfo()\fR after compiling an
-expression, and outputting the information it gets back. If the pattern is
-studied, the results of that are also output.
-
-The \fB/D\fR modifier is a PCRE debugging feature, which also assumes \fB/I\fR.
-It causes the internal form of compiled regular expressions to be output after
-compilation.
-
-The \fB/S\fR modifier causes \fBpcre_study()\fR to be called after the
-expression has been compiled, and the results used when the expression is
-matched.
-
-The \fB/M\fR modifier causes the size of memory block used to hold the compiled
-pattern to be output.
-
-The \fB/P\fR modifier causes \fBpcretest\fR to call PCRE via the POSIX wrapper
-API rather than its native API. When this is done, all other modifiers except
-\fB/i\fR, \fB/m\fR, and \fB/+\fR are ignored. REG_ICASE is set if \fB/i\fR is
-present, and REG_NEWLINE is set if \fB/m\fR is present. The wrapper functions
-force PCRE_DOLLAR_ENDONLY always, and PCRE_DOTALL unless REG_NEWLINE is set.
-
-The \fB/8\fR modifier causes \fBpcretest\fR to call PCRE with the PCRE_UTF8
-option set. This turns on the (currently incomplete) support for UTF-8
-character handling in PCRE, provided that it was compiled with this support
-enabled. This modifier also causes any non-printing characters in output
-strings to be printed using the \\x{hh...} notation if they are valid UTF-8
-sequences.
-
-
-.SH DATA LINES
-
-Before each data line is passed to \fBpcre_exec()\fR, leading and trailing
-whitespace is removed, and it is then scanned for \\ escapes. The following are
-recognized:
-
- \\a alarm (= BEL)
- \\b backspace
- \\e escape
- \\f formfeed
- \\n newline
- \\r carriage return
- \\t tab
- \\v vertical tab
- \\nnn octal character (up to 3 octal digits)
- \\xhh hexadecimal character (up to 2 hex digits)
- \\x{hh...} hexadecimal UTF-8 character
-
- \\A pass the PCRE_ANCHORED option to \fBpcre_exec()\fR
- \\B pass the PCRE_NOTBOL option to \fBpcre_exec()\fR
- \\Cdd call pcre_copy_substring() for substring dd
- after a successful match (any decimal number
- less than 32)
- \\Gdd call pcre_get_substring() for substring dd
- after a successful match (any decimal number
- less than 32)
- \\L call pcre_get_substring_list() after a
- successful match
- \\N pass the PCRE_NOTEMPTY option to \fBpcre_exec()\fR
- \\Odd set the size of the output vector passed to
- \fBpcre_exec()\fR to dd (any number of decimal
- digits)
- \\Z pass the PCRE_NOTEOL option to \fBpcre_exec()\fR
-
-When \\O is used, its value may be higher or lower than the size set by the \fB-o\fR
-option (or defaulted to 45); \\O applies only to the call of \fBpcre_exec()\fR
-for the line in which it appears.
-
-A backslash followed by anything else just escapes the anything else. If the
-very last character is a backslash, it is ignored. This gives a way of passing
-an empty line as data, since a real empty line terminates the data input.
-
-If \fB/P\fR was present on the regex, causing the POSIX wrapper API to be used,
-only \fB\\B\fR and \fB\\Z\fR have any effect, causing REG_NOTBOL and REG_NOTEOL
-to be passed to \fBregexec()\fR respectively.
-
-The use of \\x{hh...} to represent UTF-8 characters is not dependent on the use
-of the \fB/8\fR modifier on the pattern. It is recognized always. There may be
-any number of hexadecimal digits inside the braces. The result is from one to
-six bytes, encoded according to the UTF-8 rules.
-
-
-.SH OUTPUT FROM PCRETEST
-
-When a match succeeds, pcretest outputs the list of captured substrings that
-\fBpcre_exec()\fR returns, starting with number 0 for the string that matched
-the whole pattern. Here is an example of an interactive pcretest run.
-
- $ pcretest
- PCRE version 2.06 08-Jun-1999
-
- re> /^abc(\\d+)/
- data> abc123
- 0: abc123
- 1: 123
- data> xyz
- No match
-
-If the strings contain any non-printing characters, they are output as \\0x
-escapes, or as \\x{...} escapes if the \fB/8\fR modifier was present on the
-pattern. If the pattern has the \fB/+\fR modifier, then the output for
-substring 0 is followed by the rest of the subject string, identified by
-"0+" like this:
-
- re> /cat/+
- data> cataract
- 0: cat
- 0+ aract
-
-If the pattern has the \fB/g\fR or \fB/G\fR modifier, the results of successive
-matching attempts are output in sequence, like this:
-
- re> /\\Bi(\\w\\w)/g
- data> Mississippi
- 0: iss
- 1: ss
- 0: iss
- 1: ss
- 0: ipp
- 1: pp
-
-"No match" is output only if the first match attempt fails.
-
-If any of the sequences \fB\\C\fR, \fB\\G\fR, or \fB\\L\fR are present in a
-data line that is successfully matched, the substrings extracted by the
-convenience functions are output with C, G, or L after the string number
-instead of a colon. This is in addition to the normal full list. The string
-length (that is, the return from the extraction function) is given in
-parentheses after each string for \fB\\C\fR and \fB\\G\fR.
-
-Note that while patterns can be continued over several lines (a plain ">"
-prompt is used for continuations), data lines may not. However newlines can be
-included in data by means of the \\n escape.
-
-
-.SH AUTHOR
-Philip Hazel <ph10@cam.ac.uk>
-.br
-University Computing Service,
-.br
-New Museums Site,
-.br
-Cambridge CB2 3QG, England.
-.br
-Phone: +44 1223 334714
-
-Last updated: 25 August 2002
-.br
-Copyright (c) 1997-2002 University of Cambridge.
diff --git a/ext/pcre/pcrelib/doc/pcretest.html b/ext/pcre/pcrelib/doc/pcretest.html
deleted file mode 100644
index cec1214420..0000000000
--- a/ext/pcre/pcrelib/doc/pcretest.html
+++ /dev/null
@@ -1,377 +0,0 @@
-<HTML>
-<HEAD>
-<TITLE>pcretest specification</TITLE>
-</HEAD>
-<body bgcolor="#FFFFFF" text="#00005A">
-<H1>pcretest specification</H1>
-This HTML document has been generated automatically from the original man page.
-If there is any nonsense in it, please consult the man page in case the
-conversion went wrong.
-<UL>
-<LI><A NAME="TOC1" HREF="#SEC1">NAME</A>
-<LI><A NAME="TOC2" HREF="#SEC2">SYNOPSIS</A>
-<LI><A NAME="TOC3" HREF="#SEC3">OPTIONS</A>
-<LI><A NAME="TOC4" HREF="#SEC4">DESCRIPTION</A>
-<LI><A NAME="TOC5" HREF="#SEC5">PATTERN MODIFIERS</A>
-<LI><A NAME="TOC6" HREF="#SEC6">DATA LINES</A>
-<LI><A NAME="TOC7" HREF="#SEC7">OUTPUT FROM PCRETEST</A>
-<LI><A NAME="TOC8" HREF="#SEC8">AUTHOR</A>
-</UL>
-<LI><A NAME="SEC1" HREF="#TOC1">NAME</A>
-<P>
-pcretest - a program for testing Perl-compatible regular expressions.
-</P>
-<LI><A NAME="SEC2" HREF="#TOC1">SYNOPSIS</A>
-<P>
-<B>pcretest [-d] [-i] [-m] [-o osize] [-p] [-t] [source] [destination]</B>
-</P>
-<P>
-<B>pcretest</B> was written as a test program for the PCRE regular expression
-library itself, but it can also be used for experimenting with regular
-expressions. This man page describes the features of the test program; for
-details of the regular expressions themselves, see the <B>pcre</B> man page.
-</P>
-<LI><A NAME="SEC3" HREF="#TOC1">OPTIONS</A>
-<P>
-<B>-d</B>
-Behave as if each regex had the <B>/D</B> modifier (see below); the internal
-form is output after compilation.
-</P>
-<P>
-<B>-i</B>
-Behave as if each regex had the <B>/I</B> modifier; information about the
-compiled pattern is given after compilation.
-</P>
-<P>
-<B>-m</B>
-Output the size of each compiled pattern after it has been compiled. This is
-equivalent to adding /M to each regular expression. For compatibility with
-earlier versions of pcretest, <B>-s</B> is a synonym for <B>-m</B>.
-</P>
-<P>
-<B>-o</B> <I>osize</I>
-Set the number of elements in the output vector that is used when calling PCRE
-to be <I>osize</I>. The default value is 45, which is enough for 14 capturing
-subexpressions. The vector size can be changed for individual matching calls by
-including \O in the data line (see below).
-</P>
-<P>
-<B>-p</B>
-Behave as if each regex had the <B>/P</B> modifier; the POSIX wrapper API is used
-to call PCRE. None of the other options has any effect when <B>-p</B> is set.
-</P>
-<P>
-<B>-t</B>
-Run each compile, study, and match many times with a timer, and output
-resulting time per compile or match (in milliseconds). Do not set <B>-t</B> with
-<B>-m</B>, because you will then get the size output 20000 times and the timing
-will be distorted.
-</P>
-<LI><A NAME="SEC4" HREF="#TOC1">DESCRIPTION</A>
-<P>
-If <B>pcretest</B> is given two filename arguments, it reads from the first and
-writes to the second. If it is given only one filename argument, it reads from
-that file and writes to stdout. Otherwise, it reads from stdin and writes to
-stdout, and prompts for each line of input, using "re&#62;" to prompt for regular
-expressions, and "data&#62;" to prompt for data lines.
-</P>
-<P>
-The program handles any number of sets of input on a single input file. Each
-set starts with a regular expression, and continues with any number of data
-lines to be matched against the pattern.
-</P>
-<P>
-Each line is matched separately and independently. If you want to do
-multiple-line matches, you have to use the \n escape sequence in a single line
-of input to encode the newline characters. The maximum length of data line is
-30,000 characters.
-</P>
-<P>
-An empty line signals the end of the data lines, at which point a new regular
-expression is read. The regular expressions are given enclosed in any
-non-alphameric delimiters other than backslash, for example
-</P>
-<P>
-<PRE>
- /(a|bc)x+yz/
-</PRE>
-</P>
-<P>
-White space before the initial delimiter is ignored. A regular expression may
-be continued over several input lines, in which case the newline characters are
-included within it. It is possible to include the delimiter within the pattern
-by escaping it, for example
-</P>
-<P>
-<PRE>
- /abc\/def/
-</PRE>
-</P>
-<P>
-If you do so, the escape and the delimiter form part of the pattern, but since
-delimiters are always non-alphameric, this does not affect its interpretation.
-If the terminating delimiter is immediately followed by a backslash, for
-example,
-</P>
-<P>
-<PRE>
- /abc/\
-</PRE>
-</P>
-<P>
-then a backslash is added to the end of the pattern. This is done to provide a
-way of testing the error condition that arises if a pattern finishes with a
-backslash, because
-</P>
-<P>
-<PRE>
- /abc\/
-</PRE>
-</P>
-<P>
-is interpreted as the first line of a pattern that starts with "abc/", causing
-pcretest to read the next line as a continuation of the regular expression.
-</P>
-<LI><A NAME="SEC5" HREF="#TOC1">PATTERN MODIFIERS</A>
-<P>
-The pattern may be followed by <B>i</B>, <B>m</B>, <B>s</B>, or <B>x</B> to set the
-PCRE_CASELESS, PCRE_MULTILINE, PCRE_DOTALL, or PCRE_EXTENDED options,
-respectively. For example:
-</P>
-<P>
-<PRE>
- /caseless/i
-</PRE>
-</P>
-<P>
-These modifier letters have the same effect as they do in Perl. There are
-others which set PCRE options that do not correspond to anything in Perl:
-<B>/A</B>, <B>/E</B>, and <B>/X</B> set PCRE_ANCHORED, PCRE_DOLLAR_ENDONLY, and
-PCRE_EXTRA respectively.
-</P>
-<P>
-Searching for all possible matches within each subject string can be requested
-by the <B>/g</B> or <B>/G</B> modifier. After finding a match, PCRE is called
-again to search the remainder of the subject string. The difference between
-<B>/g</B> and <B>/G</B> is that the former uses the <I>startoffset</I> argument to
-<B>pcre_exec()</B> to start searching at a new point within the entire string
-(which is in effect what Perl does), whereas the latter passes over a shortened
-substring. This makes a difference to the matching process if the pattern
-begins with a lookbehind assertion (including \b or \B).
-</P>
-<P>
-If any call to <B>pcre_exec()</B> in a <B>/g</B> or <B>/G</B> sequence matches an
-empty string, the next call is done with the PCRE_NOTEMPTY and PCRE_ANCHORED
-flags set in order to search for another, non-empty, match at the same point.
-If this second match fails, the start offset is advanced by one, and the normal
-match is retried. This imitates the way Perl handles such cases when using the
-<B>/g</B> modifier or the <B>split()</B> function.
-</P>
-<P>
-There are a number of other modifiers for controlling the way <B>pcretest</B>
-operates.
-</P>
-<P>
-The <B>/+</B> modifier requests that as well as outputting the substring that
-matched the entire pattern, pcretest should in addition output the remainder of
-the subject string. This is useful for tests where the subject contains
-multiple copies of the same substring.
-</P>
-<P>
-The <B>/L</B> modifier must be followed directly by the name of a locale, for
-example,
-</P>
-<P>
-<PRE>
- /pattern/Lfr
-</PRE>
-</P>
-<P>
-For this reason, it must be the last modifier letter. The given locale is set,
-<B>pcre_maketables()</B> is called to build a set of character tables for the
-locale, and this is then passed to <B>pcre_compile()</B> when compiling the
-regular expression. Without an <B>/L</B> modifier, NULL is passed as the tables
-pointer; that is, <B>/L</B> applies only to the expression on which it appears.
-</P>
-<P>
-The <B>/I</B> modifier requests that <B>pcretest</B> output information about the
-compiled expression (whether it is anchored, has a fixed first character, and
-so on). It does this by calling <B>pcre_fullinfo()</B> after compiling an
-expression, and outputting the information it gets back. If the pattern is
-studied, the results of that are also output.
-</P>
-<P>
-The <B>/D</B> modifier is a PCRE debugging feature, which also assumes <B>/I</B>.
-It causes the internal form of compiled regular expressions to be output after
-compilation.
-</P>
-<P>
-The <B>/S</B> modifier causes <B>pcre_study()</B> to be called after the
-expression has been compiled, and the results used when the expression is
-matched.
-</P>
-<P>
-The <B>/M</B> modifier causes the size of memory block used to hold the compiled
-pattern to be output.
-</P>
-<P>
-The <B>/P</B> modifier causes <B>pcretest</B> to call PCRE via the POSIX wrapper
-API rather than its native API. When this is done, all other modifiers except
-<B>/i</B>, <B>/m</B>, and <B>/+</B> are ignored. REG_ICASE is set if <B>/i</B> is
-present, and REG_NEWLINE is set if <B>/m</B> is present. The wrapper functions
-force PCRE_DOLLAR_ENDONLY always, and PCRE_DOTALL unless REG_NEWLINE is set.
-</P>
-<P>
-The <B>/8</B> modifier causes <B>pcretest</B> to call PCRE with the PCRE_UTF8
-option set. This turns on the (currently incomplete) support for UTF-8
-character handling in PCRE, provided that it was compiled with this support
-enabled. This modifier also causes any non-printing characters in output
-strings to be printed using the \x{hh...} notation if they are valid UTF-8
-sequences.
-</P>
-<LI><A NAME="SEC6" HREF="#TOC1">DATA LINES</A>
-<P>
-Before each data line is passed to <B>pcre_exec()</B>, leading and trailing
-whitespace is removed, and it is then scanned for \ escapes. The following are
-recognized:
-</P>
-<P>
-<PRE>
- \a alarm (= BEL)
- \b backspace
- \e escape
- \f formfeed
- \n newline
- \r carriage return
- \t tab
- \v vertical tab
- \nnn octal character (up to 3 octal digits)
- \xhh hexadecimal character (up to 2 hex digits)
- \x{hh...} hexadecimal UTF-8 character
-</PRE>
-</P>
-<P>
-<PRE>
- \A pass the PCRE_ANCHORED option to <B>pcre_exec()</B>
- \B pass the PCRE_NOTBOL option to <B>pcre_exec()</B>
- \Cdd call pcre_copy_substring() for substring dd
- after a successful match (any decimal number
- less than 32)
- \Gdd call pcre_get_substring() for substring dd
- after a successful match (any decimal number
- less than 32)
- \L call pcre_get_substring_list() after a
- successful match
- \N pass the PCRE_NOTEMPTY option to <B>pcre_exec()</B>
- \Odd set the size of the output vector passed to
- <B>pcre_exec()</B> to dd (any number of decimal
- digits)
- \Z pass the PCRE_NOTEOL option to <B>pcre_exec()</B>
-</PRE>
-</P>
-<P>
-When \O is used, its value may be higher or lower than the size set by the <B>-o</B>
-option (or defaulted to 45); \O applies only to the call of <B>pcre_exec()</B>
-for the line in which it appears.
-</P>
-<P>
-A backslash followed by anything else just escapes the anything else. If the
-very last character is a backslash, it is ignored. This gives a way of passing
-an empty line as data, since a real empty line terminates the data input.
-</P>
-<P>
-If <B>/P</B> was present on the regex, causing the POSIX wrapper API to be used,
-only <B>\B</B> and <B>\Z</B> have any effect, causing REG_NOTBOL and REG_NOTEOL
-to be passed to <B>regexec()</B> respectively.
-</P>
-<P>
-The use of \x{hh...} to represent UTF-8 characters is not dependent on the use
-of the <B>/8</B> modifier on the pattern. It is recognized always. There may be
-any number of hexadecimal digits inside the braces. The result is from one to
-six bytes, encoded according to the UTF-8 rules.
-</P>
-<LI><A NAME="SEC7" HREF="#TOC1">OUTPUT FROM PCRETEST</A>
-<P>
-When a match succeeds, pcretest outputs the list of captured substrings that
-<B>pcre_exec()</B> returns, starting with number 0 for the string that matched
-the whole pattern. Here is an example of an interactive pcretest run.
-</P>
-<P>
-<PRE>
- $ pcretest
- PCRE version 2.06 08-Jun-1999
-</PRE>
-</P>
-<P>
-<PRE>
- re&#62; /^abc(\d+)/
- data&#62; abc123
- 0: abc123
- 1: 123
- data&#62; xyz
- No match
-</PRE>
-</P>
-<P>
-If the strings contain any non-printing characters, they are output as \0x
-escapes, or as \x{...} escapes if the <B>/8</B> modifier was present on the
-pattern. If the pattern has the <B>/+</B> modifier, then the output for
-substring 0 is followed by the rest of the subject string, identified by
-"0+" like this:
-</P>
-<P>
-<PRE>
- re&#62; /cat/+
- data&#62; cataract
- 0: cat
- 0+ aract
-</PRE>
-</P>
-<P>
-If the pattern has the <B>/g</B> or <B>/G</B> modifier, the results of successive
-matching attempts are output in sequence, like this:
-</P>
-<P>
-<PRE>
- re&#62; /\Bi(\w\w)/g
- data&#62; Mississippi
- 0: iss
- 1: ss
- 0: iss
- 1: ss
- 0: ipp
- 1: pp
-</PRE>
-</P>
-<P>
-"No match" is output only if the first match attempt fails.
-</P>
-<P>
-If any of the sequences <B>\C</B>, <B>\G</B>, or <B>\L</B> are present in a
-data line that is successfully matched, the substrings extracted by the
-convenience functions are output with C, G, or L after the string number
-instead of a colon. This is in addition to the normal full list. The string
-length (that is, the return from the extraction function) is given in
-parentheses after each string for <B>\C</B> and <B>\G</B>.
-</P>
-<P>
-Note that while patterns can be continued over several lines (a plain "&#62;"
-prompt is used for continuations), data lines may not. However newlines can be
-included in data by means of the \n escape.
-</P>
-<LI><A NAME="SEC8" HREF="#TOC1">AUTHOR</A>
-<P>
-Philip Hazel &#60;ph10@cam.ac.uk&#62;
-<BR>
-University Computing Service,
-<BR>
-New Museums Site,
-<BR>
-Cambridge CB2 3QG, England.
-<BR>
-Phone: +44 1223 334714
-</P>
-<P>
-Last updated: 25 August 2002
-<BR>
-Copyright (c) 1997-2002 University of Cambridge.
diff --git a/ext/pcre/pcrelib/doc/pcretest.txt b/ext/pcre/pcrelib/doc/pcretest.txt
deleted file mode 100644
index 54e989a83c..0000000000
--- a/ext/pcre/pcrelib/doc/pcretest.txt
+++ /dev/null
@@ -1,329 +0,0 @@
-NAME
- pcretest - a program for testing Perl-compatible regular
- expressions.
-
-
-
-SYNOPSIS
- pcretest [-d] [-i] [-m] [-o osize] [-p] [-t] [source] [des-
- tination]
-
- pcretest was written as a test program for the PCRE regular
- expression library itself, but it can also be used for
- experimenting with regular expressions. This man page
- describes the features of the test program; for details of
- the regular expressions themselves, see the pcre man page.
-
-
-
-OPTIONS
- -d Behave as if each regex had the /D modifier (see
- below); the internal form is output after compila-
- tion.
-
- -i Behave as if each regex had the /I modifier;
- information about the compiled pattern is given
- after compilation.
-
- -m Output the size of each compiled pattern after it
- has been compiled. This is equivalent to adding /M
- to each regular expression. For compatibility with
- earlier versions of pcretest, -s is a synonym for
- -m.
-
- -o osize Set the number of elements in the output vector
- that is used when calling PCRE to be osize. The
- default value is 45, which is enough for 14 cap-
- turing subexpressions. The vector size can be
- changed for individual matching calls by including
- \O in the data line (see below).
-
- -p Behave as if each regex had the /P modifier; the POSIX
- wrapper API is used to call PCRE. None of the
- other options has any effect when -p is set.
-
- -t Run each compile, study, and match many times with
- a timer, and output resulting time per compile or
- match (in milliseconds). Do not set -t with -m,
- because you will then get the size output 20000
- times and the timing will be distorted.
-
-
-
-DESCRIPTION
- If pcretest is given two filename arguments, it reads from
- the first and writes to the second. If it is given only one
-
-
-
-
-SunOS 5.8 Last change: 1
-
-
-
- filename argument, it reads from that file and writes to
- stdout. Otherwise, it reads from stdin and writes to stdout,
- and prompts for each line of input, using "re>" to prompt
- for regular expressions, and "data>" to prompt for data
- lines.
-
- The program handles any number of sets of input on a single
- input file. Each set starts with a regular expression, and
- continues with any number of data lines to be matched
- against the pattern.
-
- Each line is matched separately and independently. If you
- want to do multiple-line matches, you have to use the \n
- escape sequence in a single line of input to encode the new-
- line characters. The maximum length of data line is 30,000
- characters.
-
- An empty line signals the end of the data lines, at which
- point a new regular expression is read. The regular expres-
- sions are given enclosed in any non-alphameric delimiters
- other than backslash, for example
-
- /(a|bc)x+yz/
-
- White space before the initial delimiter is ignored. A regu-
- lar expression may be continued over several input lines, in
- which case the newline characters are included within it. It
- is possible to include the delimiter within the pattern by
- escaping it, for example
-
- /abc\/def/
-
- If you do so, the escape and the delimiter form part of the
- pattern, but since delimiters are always non-alphameric,
- this does not affect its interpretation. If the terminating
- delimiter is immediately followed by a backslash, for exam-
- ple,
-
- /abc/\
-
- then a backslash is added to the end of the pattern. This is
- done to provide a way of testing the error condition that
- arises if a pattern finishes with a backslash, because
-
- /abc\/
-
- is interpreted as the first line of a pattern that starts
- with "abc/", causing pcretest to read the next line as a
- continuation of the regular expression.
-
-
-
-PATTERN MODIFIERS
- The pattern may be followed by i, m, s, or x to set the
- PCRE_CASELESS, PCRE_MULTILINE, PCRE_DOTALL, or PCRE_EXTENDED
- options, respectively. For example:
-
- /caseless/i
-
- These modifier letters have the same effect as they do in
- Perl. There are others which set PCRE options that do not
- correspond to anything in Perl: /A, /E, and /X set
- PCRE_ANCHORED, PCRE_DOLLAR_ENDONLY, and PCRE_EXTRA respec-
- tively.
-
- Searching for all possible matches within each subject
- string can be requested by the /g or /G modifier. After
- finding a match, PCRE is called again to search the
- remainder of the subject string. The difference between /g
- and /G is that the former uses the startoffset argument to
- pcre_exec() to start searching at a new point within the
- entire string (which is in effect what Perl does), whereas
- the latter passes over a shortened substring. This makes a
- difference to the matching process if the pattern begins
- with a lookbehind assertion (including \b or \B).
-
- If any call to pcre_exec() in a /g or /G sequence matches an
- empty string, the next call is done with the PCRE_NOTEMPTY
- and PCRE_ANCHORED flags set in order to search for another,
- non-empty, match at the same point. If this second match
- fails, the start offset is advanced by one, and the normal
- match is retried. This imitates the way Perl handles such
- cases when using the /g modifier or the split() function.
-
- There are a number of other modifiers for controlling the
- way pcretest operates.
-
- The /+ modifier requests that as well as outputting the sub-
- string that matched the entire pattern, pcretest should in
- addition output the remainder of the subject string. This is
- useful for tests where the subject contains multiple copies
- of the same substring.
-
- The /L modifier must be followed directly by the name of a
- locale, for example,
-
- /pattern/Lfr
-
- For this reason, it must be the last modifier letter. The
- given locale is set, pcre_maketables() is called to build a
- set of character tables for the locale, and this is then
- passed to pcre_compile() when compiling the regular expres-
- sion. Without an /L modifier, NULL is passed as the tables
- pointer; that is, /L applies only to the expression on which
- it appears.
-
- The /I modifier requests that pcretest output information
- about the compiled expression (whether it is anchored, has a
- fixed first character, and so on). It does this by calling
- pcre_fullinfo() after compiling an expression, and output-
- ting the information it gets back. If the pattern is stu-
- died, the results of that are also output.
-
- The /D modifier is a PCRE debugging feature, which also
- assumes /I. It causes the internal form of compiled regular
- expressions to be output after compilation.
-
- The /S modifier causes pcre_study() to be called after the
- expression has been compiled, and the results used when the
- expression is matched.
-
- The /M modifier causes the size of memory block used to hold
- the compiled pattern to be output.
-
- The /P modifier causes pcretest to call PCRE via the POSIX
- wrapper API rather than its native API. When this is done,
- all other modifiers except /i, /m, and /+ are ignored.
- REG_ICASE is set if /i is present, and REG_NEWLINE is set if
- /m is present. The wrapper functions force
- PCRE_DOLLAR_ENDONLY always, and PCRE_DOTALL unless
- REG_NEWLINE is set.
-
- The /8 modifier causes pcretest to call PCRE with the
- PCRE_UTF8 option set. This turns on the (currently incom-
- plete) support for UTF-8 character handling in PCRE, pro-
- vided that it was compiled with this support enabled. This
- modifier also causes any non-printing characters in output
- strings to be printed using the \x{hh...} notation if they
- are valid UTF-8 sequences.
-
-
-
-DATA LINES
- Before each data line is passed to pcre_exec(), leading and
- trailing whitespace is removed, and it is then scanned for \
- escapes. The following are recognized:
-
- \a alarm (= BEL)
- \b backspace
- \e escape
- \f formfeed
- \n newline
- \r carriage return
- \t tab
- \v vertical tab
- \nnn octal character (up to 3 octal digits)
-
- \xhh hexadecimal character (up to 2 hex digits)
- \x{hh...} hexadecimal UTF-8 character
-
- \A pass the PCRE_ANCHORED option to pcre_exec()
- \B pass the PCRE_NOTBOL option to pcre_exec()
- \Cdd call pcre_copy_substring() for substring dd
- after a successful match (any decimal number
- less than 32)
- \Gdd call pcre_get_substring() for substring dd
- after a successful match (any decimal number
- less than 32)
- \L call pcre_get_substring_list() after a
- successful match
- \N pass the PCRE_NOTEMPTY option to pcre_exec()
- \Odd set the size of the output vector passed to
- pcre_exec() to dd (any number of decimal
- digits)
- \Z pass the PCRE_NOTEOL option to pcre_exec()
-
- When \O is used, it may be higher or lower than the size set
- by the -O option (or defaulted to 45); \O applies only to
- the call of pcre_exec() for the line in which it appears.
-
- A backslash followed by anything else just escapes the any-
- thing else. If the very last character is a backslash, it is
- ignored. This gives a way of passing an empty line as data,
- since a real empty line terminates the data input.
-
- If /P was present on the regex, causing the POSIX wrapper
- API to be used, only \B and \Z have any effect, causing
- REG_NOTBOL and REG_NOTEOL to be passed to regexec() respec-
- tively.
-
- The use of \x{hh...} to represent UTF-8 characters is not
- dependent on the use of the /8 modifier on the pattern. It
- is recognized always. There may be any number of hexadecimal
- digits inside the braces. The result is from one to six
- bytes, encoded according to the UTF-8 rules.
-
-
-
-OUTPUT FROM PCRETEST
- When a match succeeds, pcretest outputs the list of captured
- substrings that pcre_exec() returns, starting with number 0
- for the string that matched the whole pattern. Here is an
- example of an interactive pcretest run.
-
- $ pcretest
- PCRE version 2.06 08-Jun-1999
-
- re> /^abc(\d+)/
- data> abc123
-
- 0: abc123
- 1: 123
- data> xyz
- No match
-
- If the strings contain any non-printing characters, they are
- output as \0x escapes, or as \x{...} escapes if the /8
- modifier was present on the pattern. If the pattern has the
- /+ modifier, then the output for substring 0 is followed by
- the rest of the subject string, identified by "0+" like
- this:
-
- re> /cat/+
- data> cataract
- 0: cat
- 0+ aract
-
- If the pattern has the /g or /G modifier, the results of
- successive matching attempts are output in sequence, like
- this:
-
- re> /\Bi(\w\w)/g
- data> Mississippi
- 0: iss
- 1: ss
- 0: iss
- 1: ss
- 0: ipp
- 1: pp
-
- "No match" is output only if the first match attempt fails.
-
- If any of the sequences \C, \G, or \L are present in a data
- line that is successfully matched, the substrings extracted
- by the convenience functions are output with C, G, or L
- after the string number instead of a colon. This is in addi-
- tion to the normal full list. The string length (that is,
- the return from the extraction function) is given in
- parentheses after each string for \C and \G.
-
- Note that while patterns can be continued over several lines
- (a plain ">" prompt is used for continuations), data lines
- may not. However newlines can be included in data by means
- of the \n escape.
-
-
-
-AUTHOR
- Philip Hazel <ph10@cam.ac.uk>
- University Computing Service,
- New Museums Site,
- Cambridge CB2 3QG, England.
- Phone: +44 1223 334714
-
- Last updated: 25 August 2002
- Copyright (c) 1997-2002 University of Cambridge.
diff --git a/ext/pcre/pcrelib/doc/perltest.txt b/ext/pcre/pcrelib/doc/perltest.txt
deleted file mode 100644
index 9ea9d932a5..0000000000
--- a/ext/pcre/pcrelib/doc/perltest.txt
+++ /dev/null
@@ -1,34 +0,0 @@
-The perltest program
---------------------
-
-The perltest program tests Perl's regular expressions; it has the same
-specification as pcretest, and so can be given identical input, except that
-input patterns can be followed only by Perl's lower case modifiers and /+ (as
-used by pcretest), which is recognized and handled by the program.
-
-The data lines are processed as Perl double-quoted strings, so if they contain
-" \ $ or @ characters, these have to be escaped. For this reason, all such
-characters in testinput1 and testinput3 are escaped so that they can be used
-for perltest as well as for pcretest, and the special upper case modifiers such
-as /A that pcretest recognizes are not used in these files. The output should
-be identical, apart from the initial identifying banner.
-
-The perltest script can also test UTF-8 features. It works as is for Perl 5.8
-or higher. It recognizes the special modifier /8 that pcretest uses to invoke
-UTF-8 functionality. The testinput5 file can be fed to perltest to run UTF-8
-tests.
-
-For Perl 5.6, perltest won't work unmodified for the UTF-8 tests. You need to
-uncomment the "use utf8" lines that it contains. It is best to do this on a
-copy of the script, because for non-UTF-8 tests, these lines should remain
-commented out.
-
-The testinput2 and testinput4 files are not suitable for feeding to perltest,
-since they do make use of the special upper case modifiers and escapes that
-pcretest uses to test some features of PCRE. The first of these files also
-contains malformed regular expressions, in order to check that PCRE diagnoses
-them correctly. Similarly, testinput6 tests UTF-8 features that do not relate
-to Perl.
-
-Philip Hazel <ph10@cam.ac.uk>
-August 2002
diff --git a/ext/pcre/pcrelib/get.c b/ext/pcre/pcrelib/get.c
deleted file mode 100644
index 55e736dc24..0000000000
--- a/ext/pcre/pcrelib/get.c
+++ /dev/null
@@ -1,227 +0,0 @@
-/*************************************************
-* Perl-Compatible Regular Expressions *
-*************************************************/
-
-/*
-This is a library of functions to support regular expressions whose syntax
-and semantics are as close as possible to those of the Perl 5 language. See
-the file Tech.Notes for some information on the internals.
-
-Written by: Philip Hazel <ph10@cam.ac.uk>
-
- Copyright (c) 1997-2001 University of Cambridge
-
------------------------------------------------------------------------------
-Permission is granted to anyone to use this software for any purpose on any
-computer system, and to redistribute it freely, subject to the following
-restrictions:
-
-1. This software is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-
-2. The origin of this software must not be misrepresented, either by
- explicit claim or by omission.
-
-3. Altered versions must be plainly marked as such, and must not be
- misrepresented as being the original software.
-
-4. If PCRE is embedded in any software that is released under the GNU
- General Purpose Licence (GPL), then the terms of that licence shall
- supersede any condition above with which it is incompatible.
------------------------------------------------------------------------------
-*/
-
-/* This module contains some convenience functions for extracting substrings
-from the subject string after a regex match has succeeded. The original idea
-for these functions came from Scott Wimer <scottw@cgibuilder.com>. */
-
-
-/* Include the internals header, which itself includes Standard C headers plus
-the external pcre header. */
-
-#include "internal.h"
-
-
-
-/*************************************************
-* Copy captured string to given buffer *
-*************************************************/
-
-/* This function copies a single captured substring into a given buffer.
-Note that we use memcpy() rather than strncpy() in case there are binary zeros
-in the string.
-
-Arguments:
- subject the subject string that was matched
- ovector pointer to the offsets table
- stringcount the number of substrings that were captured
- (i.e. the yield of the pcre_exec call, unless
- that was zero, in which case it should be 1/3
- of the offset table size)
- stringnumber the number of the required substring
- buffer where to put the substring
- size the size of the buffer
-
-Returns: if successful:
- the length of the copied string, not including the zero
- that is put on the end; can be zero
- if not successful:
- PCRE_ERROR_NOMEMORY (-6) buffer too small
- PCRE_ERROR_NOSUBSTRING (-7) no such captured substring
-*/
-
-int
-pcre_copy_substring(const char *subject, int *ovector, int stringcount,
- int stringnumber, char *buffer, int size)
-{
-int yield;
-if (stringnumber < 0 || stringnumber >= stringcount)
- return PCRE_ERROR_NOSUBSTRING;
-stringnumber *= 2;
-yield = ovector[stringnumber+1] - ovector[stringnumber];
-if (size < yield + 1) return PCRE_ERROR_NOMEMORY;
-memcpy(buffer, subject + ovector[stringnumber], yield);
-buffer[yield] = 0;
-return yield;
-}
-
-
-
-/*************************************************
-* Copy all captured strings to new store *
-*************************************************/
-
-/* This function gets one chunk of store and builds a list of pointers and all
-of the captured substrings in it. A NULL pointer is put on the end of the list.
-
-Arguments:
- subject the subject string that was matched
- ovector pointer to the offsets table
- stringcount the number of substrings that were captured
- (i.e. the yield of the pcre_exec call, unless
- that was zero, in which case it should be 1/3
- of the offset table size)
- listptr set to point to the list of pointers
-
-Returns: if successful: 0
- if not successful:
- PCRE_ERROR_NOMEMORY (-6) failed to get store
-*/
-
-int
-pcre_get_substring_list(const char *subject, int *ovector, int stringcount,
- const char ***listptr)
-{
-int i;
-int size = sizeof(char *);
-int double_count = stringcount * 2;
-char **stringlist;
-char *p;
-
-for (i = 0; i < double_count; i += 2)
- size += sizeof(char *) + ovector[i+1] - ovector[i] + 1;
-
-stringlist = (char **)(pcre_malloc)(size);
-if (stringlist == NULL) return PCRE_ERROR_NOMEMORY;
-
-*listptr = (const char **)stringlist;
-p = (char *)(stringlist + stringcount + 1);
-
-for (i = 0; i < double_count; i += 2)
- {
- int len = ovector[i+1] - ovector[i];
- memcpy(p, subject + ovector[i], len);
- *stringlist++ = p;
- p += len;
- *p++ = 0;
- }
-
-*stringlist = NULL;
-return 0;
-}
-
-
-
-/*************************************************
-* Free store obtained by get_substring_list *
-*************************************************/
-
-/* This function exists for the benefit of people calling PCRE from non-C
-programs that can call its functions, but not free() or (pcre_free)() directly.
-
-Argument: the result of a previous pcre_get_substring_list()
-Returns: nothing
-*/
-
-void
-pcre_free_substring_list(const char **pointer)
-{
-(pcre_free)((void *)pointer);
-}
-
-
-
-/*************************************************
-* Copy captured string to new store *
-*************************************************/
-
-/* This function copies a single captured substring into a piece of new
-store
-
-Arguments:
- subject the subject string that was matched
- ovector pointer to the offsets table
- stringcount the number of substrings that were captured
- (i.e. the yield of the pcre_exec call, unless
- that was zero, in which case it should be 1/3
- of the offset table size)
- stringnumber the number of the required substring
- stringptr where to put a pointer to the substring
-
-Returns: if successful:
- the length of the string, not including the zero that
- is put on the end; can be zero
- if not successful:
- PCRE_ERROR_NOMEMORY (-6) failed to get store
- PCRE_ERROR_NOSUBSTRING (-7) substring not present
-*/
-
-int
-pcre_get_substring(const char *subject, int *ovector, int stringcount,
- int stringnumber, const char **stringptr)
-{
-int yield;
-char *substring;
-if (stringnumber < 0 || stringnumber >= stringcount)
- return PCRE_ERROR_NOSUBSTRING;
-stringnumber *= 2;
-yield = ovector[stringnumber+1] - ovector[stringnumber];
-substring = (char *)(pcre_malloc)(yield + 1);
-if (substring == NULL) return PCRE_ERROR_NOMEMORY;
-memcpy(substring, subject + ovector[stringnumber], yield);
-substring[yield] = 0;
-*stringptr = substring;
-return yield;
-}
-
-
-
-/*************************************************
-* Free store obtained by get_substring *
-*************************************************/
-
-/* This function exists for the benefit of people calling PCRE from non-C
-programs that can call its functions, but not free() or (pcre_free)() directly.
-
-Argument: the result of a previous pcre_get_substring()
-Returns: nothing
-*/
-
-void
-pcre_free_substring(const char *pointer)
-{
-(pcre_free)((void *)pointer);
-}
-
-/* End of get.c */
diff --git a/ext/pcre/pcrelib/internal.h b/ext/pcre/pcrelib/internal.h
deleted file mode 100644
index 2396668a33..0000000000
--- a/ext/pcre/pcrelib/internal.h
+++ /dev/null
@@ -1,632 +0,0 @@
-/*************************************************
-* Perl-Compatible Regular Expressions *
-*************************************************/
-
-
-/* This is a library of functions to support regular expressions whose syntax
-and semantics are as close as possible to those of the Perl 5 language. See
-the file Tech.Notes for some information on the internals.
-
-Written by: Philip Hazel <ph10@cam.ac.uk>
-
- Copyright (c) 1997-2003 University of Cambridge
-
------------------------------------------------------------------------------
-Permission is granted to anyone to use this software for any purpose on any
-computer system, and to redistribute it freely, subject to the following
-restrictions:
-
-1. This software is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-
-2. The origin of this software must not be misrepresented, either by
- explicit claim or by omission.
-
-3. Altered versions must be plainly marked as such, and must not be
- misrepresented as being the original software.
-
-4. If PCRE is embedded in any software that is released under the GNU
- General Purpose Licence (GPL), then the terms of that licence shall
- supersede any condition above with which it is incompatible.
------------------------------------------------------------------------------
-*/
-
-/* This header contains definitions that are shared between the different
-modules, but which are not relevant to the outside. */
-
-/* Get the definitions provided by running "configure" */
-
-#ifdef PHP_WIN32
-# include "config.w32.h"
-#elif defined(NETWARE)
-# include "config.nw.h"
-#else
-# include "php_config.h"
-#endif
-
-/* The value of NEWLINE determines the newline character. The default is to
-leave it up to the compiler, but some sites want to force a particular value.
-On Unix systems, "configure" can be used to override this default. */
-
-#ifndef NEWLINE
-#define NEWLINE '\n'
-#endif
-
-/* When compiling for use with the Virtual Pascal compiler, these functions
-need to have their names changed. PCRE must be compiled with the -DVPCOMPAT
-option on the command line. */
-
-#ifdef VPCOMPAT
-#define strncmp(s1,s2,m) _strncmp(s1,s2,m)
-#define memcpy(d,s,n) _memcpy(d,s,n)
-#define memmove(d,s,n) _memmove(d,s,n)
-#define memset(s,c,n) _memset(s,c,n)
-#else /* VPCOMPAT */
-
-/* To cope with SunOS4 and other systems that lack memmove() but have bcopy(),
-define a macro for memmove() if HAVE_MEMMOVE is false, provided that HAVE_BCOPY
-is set. Otherwise, include an emulating function for those systems that have
-neither (there are some non-Unix environments where this is the case). This assumes
-that all calls to memmove are moving strings upwards in store, which is the
-case in PCRE. */
-
-#if ! HAVE_MEMMOVE
-#undef memmove /* some systems may have a macro */
-#if HAVE_BCOPY
-#define memmove(a, b, c) bcopy(b, a, c)
-#else /* HAVE_BCOPY */
-void *
-pcre_memmove(unsigned char *dest, const unsigned char *src, size_t n)
-{
-int i;
-dest += n;
-src += n;
-for (i = 0; i < n; ++i) *(--dest) = *(--src);
-}
-#define memmove(a, b, c) pcre_memmove(a, b, c)
-#endif /* not HAVE_BCOPY */
-#endif /* not HAVE_MEMMOVE */
-#endif /* not VPCOMPAT */
-
-
-/* PCRE keeps offsets in its compiled code as 2-byte quantities by default.
-These are used, for example, to link from the start of a subpattern to its
-alternatives and its end. The use of 2 bytes per offset limits the size of the
-compiled regex to around 64K, which is big enough for almost everybody.
-However, I received a request for an even bigger limit. For this reason, and
-also to make the code easier to maintain, the storing and loading of offsets
-from the byte string is now handled by the macros that are defined here.
-
-The macros are controlled by the value of LINK_SIZE. This defaults to 2 in
-the config.h file, but can be overridden by using -D on the command line. This
-is automated on Unix systems via the "configure" command. */
-
-#if LINK_SIZE == 2
-
-#define PUT(a,n,d) \
- (a[n] = (d) >> 8), \
- (a[(n)+1] = (d) & 255)
-
-#define GET(a,n) \
- (((a)[n] << 8) | (a)[(n)+1])
-
-#define MAX_PATTERN_SIZE (1 << 16)
-
-
-#elif LINK_SIZE == 3
-
-#define PUT(a,n,d) \
- (a[n] = (d) >> 16), \
- (a[(n)+1] = (d) >> 8), \
- (a[(n)+2] = (d) & 255)
-
-#define GET(a,n) \
- (((a)[n] << 16) | ((a)[(n)+1] << 8) | (a)[(n)+2])
-
-#define MAX_PATTERN_SIZE (1 << 24)
-
-
-#elif LINK_SIZE == 4
-
-#define PUT(a,n,d) \
- (a[n] = (d) >> 24), \
- (a[(n)+1] = (d) >> 16), \
- (a[(n)+2] = (d) >> 8), \
- (a[(n)+3] = (d) & 255)
-
-#define GET(a,n) \
- (((a)[n] << 24) | ((a)[(n)+1] << 16) | ((a)[(n)+2] << 8) | (a)[(n)+3])
-
-#define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */
-
-
-#else
-#error LINK_SIZE must be either 2, 3, or 4
-#endif
-
-
-/* Convenience macro defined in terms of the others */
-
-#define PUTINC(a,n,d) PUT(a,n,d), a += LINK_SIZE
-
-
-/* PCRE uses some other 2-byte quantities that do not change when the size of
-offsets changes. These are used for repeat counts and for other things such as
-capturing parenthesis numbers in back references. */
-
-#define PUT2(a,n,d) \
- a[n] = (d) >> 8; \
- a[(n)+1] = (d) & 255
-
-#define GET2(a,n) \
- (((a)[n] << 8) | (a)[(n)+1])
-
-#define PUT2INC(a,n,d) PUT2(a,n,d), a += 2
-
-
-/* Standard C headers plus the external interface definition */
-
-#include <ctype.h>
-#include <limits.h>
-#include <stddef.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include "pcre.h"
-
-/* In case there is no definition of offsetof() provided - though any proper
-Standard C system should have one. */
-
-#ifndef offsetof
-#define offsetof(p_type,field) ((size_t)&(((p_type *)0)->field))
-#endif
-
-/* These are the public options that can change during matching. */
-
-#define PCRE_IMS (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL)
-
-/* Private options flags start at the most significant end of the four bytes,
-but skip the top bit so we can use ints for convenience without getting tangled
-with negative values. The public options defined in pcre.h start at the least
-significant end. Make sure they don't overlap, though now that we have expanded
-to four bytes there is plenty of space. */
-
-#define PCRE_FIRSTSET 0x40000000 /* first_char is set */
-#define PCRE_REQCHSET 0x20000000 /* req_char is set */
-#define PCRE_STARTLINE 0x10000000 /* start after \n for multiline */
-#define PCRE_ICHANGED 0x08000000 /* i option changes within regex */
-
-/* Options for the "extra" block produced by pcre_study(). */
-
-#define PCRE_STUDY_MAPPED 0x01 /* a map of starting chars exists */
-
-/* Masks for identifying the public options which are permitted at compile
-time, run time or study time, respectively. */
-
-#define PUBLIC_OPTIONS \
- (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \
- PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8)
-
-#define PUBLIC_EXEC_OPTIONS \
- (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY)
-
-#define PUBLIC_STUDY_OPTIONS 0 /* None defined */
-
-/* Magic number to provide a small check against being handed junk. */
-
-#define MAGIC_NUMBER 0x50435245UL /* 'PCRE' */
-
-/* Negative values for the firstchar and reqchar variables */
-
-#define REQ_UNSET (-2)
-#define REQ_NONE (-1)
-
-/* Flags added to firstchar or reqchar */
-
-#define REQ_CASELESS 0x0100 /* indicates caselessness */
-
-/* Miscellaneous definitions */
-
-typedef int BOOL;
-
-#define FALSE 0
-#define TRUE 1
-
-/* Escape items that are just an encoding of a particular data value. Note that
-ESC_n is defined as yet another macro, which is set in config.h to either \n
-(the default) or \r (which some people want). */
-
-#ifndef ESC_e
-#define ESC_e 27
-#endif
-
-#ifndef ESC_f
-#define ESC_f '\f'
-#endif
-
-#ifndef ESC_n
-#define ESC_n NEWLINE
-#endif
-
-#ifndef ESC_r
-#define ESC_r '\r'
-#endif
-
-#ifndef ESC_t
-#define ESC_t '\t'
-#endif
-
-/* These are escaped items that aren't just an encoding of a particular data
-value such as \n. They must have non-zero values, as check_escape() returns
-their negation. Also, they must appear in the same order as in the opcode
-definitions below, up to ESC_z. There's a dummy for OP_ANY because it
-corresponds to "." rather than an escape sequence. The final one must be
-ESC_REF as subsequent values are used for \1, \2, \3, etc. There are two
-tests in the code for an escape greater than ESC_b and less than ESC_Z to
-detect the types that may be repeated. These are the types that consume a
-character. If any new escapes are put in between that don't consume a
-character, that code will have to change. */
-
-enum { ESC_A = 1, ESC_G, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, ESC_W,
- ESC_w, ESC_dum1, ESC_C, ESC_Z, ESC_z, ESC_E, ESC_Q, ESC_REF };
-
-
-/* Opcode table: OP_BRA must be last, as all values >= it are used for brackets
-that extract substrings. Starting from 1 (i.e. after OP_END), the values up to
-OP_EOD must correspond in order to the list of escapes immediately above.
-Note that whenever this list is updated, the two macro definitions that follow
-must also be updated to match. */
-
-enum {
- OP_END, /* 0 End of pattern */
-
- /* Values corresponding to backslashed metacharacters */
-
- OP_SOD, /* 1 Start of data: \A */
- OP_SOM, /* 2 Start of match (subject + offset): \G */
- OP_NOT_WORD_BOUNDARY, /* 3 \B */
- OP_WORD_BOUNDARY, /* 4 \b */
- OP_NOT_DIGIT, /* 5 \D */
- OP_DIGIT, /* 6 \d */
- OP_NOT_WHITESPACE, /* 7 \S */
- OP_WHITESPACE, /* 8 \s */
- OP_NOT_WORDCHAR, /* 9 \W */
- OP_WORDCHAR, /* 10 \w */
- OP_ANY, /* 11 Match any character */
- OP_ANYBYTE, /* 12 Match any byte (\C); different to OP_ANY for UTF-8 */
- OP_EODN, /* 13 End of data or \n at end of data: \Z. */
- OP_EOD, /* 14 End of data: \z */
-
- OP_OPT, /* 15 Set runtime options */
- OP_CIRC, /* 16 Start of line - varies with multiline switch */
- OP_DOLL, /* 17 End of line - varies with multiline switch */
- OP_CHARS, /* 18 Match string of characters */
- OP_NOT, /* 19 Match anything but the following char */
-
- OP_STAR, /* 20 The maximizing and minimizing versions of */
- OP_MINSTAR, /* 21 all these opcodes must come in pairs, with */
- OP_PLUS, /* 22 the minimizing one second. */
- OP_MINPLUS, /* 23 This first set applies to single characters */
- OP_QUERY, /* 24 */
- OP_MINQUERY, /* 25 */
- OP_UPTO, /* 26 From 0 to n matches */
- OP_MINUPTO, /* 27 */
- OP_EXACT, /* 28 Exactly n matches */
-
- OP_NOTSTAR, /* 29 The maximizing and minimizing versions of */
- OP_NOTMINSTAR, /* 30 all these opcodes must come in pairs, with */
- OP_NOTPLUS, /* 31 the minimizing one second. */
- OP_NOTMINPLUS, /* 32 This set applies to "not" single characters */
- OP_NOTQUERY, /* 33 */
- OP_NOTMINQUERY, /* 34 */
- OP_NOTUPTO, /* 35 From 0 to n matches */
- OP_NOTMINUPTO, /* 36 */
- OP_NOTEXACT, /* 37 Exactly n matches */
-
- OP_TYPESTAR, /* 38 The maximizing and minimizing versions of */
- OP_TYPEMINSTAR, /* 39 all these opcodes must come in pairs, with */
- OP_TYPEPLUS, /* 40 the minimizing one second. These codes must */
- OP_TYPEMINPLUS, /* 41 be in exactly the same order as those above. */
- OP_TYPEQUERY, /* 42 This set applies to character types such as \d */
- OP_TYPEMINQUERY, /* 43 */
- OP_TYPEUPTO, /* 44 From 0 to n matches */
- OP_TYPEMINUPTO, /* 45 */
- OP_TYPEEXACT, /* 46 Exactly n matches */
-
- OP_CRSTAR, /* 47 The maximizing and minimizing versions of */
- OP_CRMINSTAR, /* 48 all these opcodes must come in pairs, with */
- OP_CRPLUS, /* 49 the minimizing one second. These codes must */
- OP_CRMINPLUS, /* 50 be in exactly the same order as those above. */
- OP_CRQUERY, /* 51 These are for character classes and back refs */
- OP_CRMINQUERY, /* 52 */
- OP_CRRANGE, /* 53 These are different to the three sets above. */
- OP_CRMINRANGE, /* 54 */
-
- OP_CLASS, /* 55 Match a character class */
- OP_REF, /* 56 Match a back reference */
- OP_RECURSE, /* 57 Match a numbered subpattern (possibly recursive) */
- OP_CALLOUT, /* 58 Call out to external function if provided */
-
- OP_ALT, /* 59 Start of alternation */
- OP_KET, /* 60 End of group that doesn't have an unbounded repeat */
- OP_KETRMAX, /* 61 These two must remain together and in this */
- OP_KETRMIN, /* 62 order. They are for groups that repeat for ever. */
-
- /* The assertions must come before ONCE and COND */
-
- OP_ASSERT, /* 63 Positive lookahead */
- OP_ASSERT_NOT, /* 64 Negative lookahead */
- OP_ASSERTBACK, /* 65 Positive lookbehind */
- OP_ASSERTBACK_NOT, /* 66 Negative lookbehind */
- OP_REVERSE, /* 67 Move pointer back - used in lookbehind assertions */
-
- /* ONCE and COND must come after the assertions, with ONCE first, as there's
- a test for >= ONCE for a subpattern that isn't an assertion. */
-
- OP_ONCE, /* 68 Once matched, don't back up into the subpattern */
- OP_COND, /* 69 Conditional group */
- OP_CREF, /* 70 Used to hold an extraction string number (cond ref) */
-
- OP_BRAZERO, /* 71 These two must remain together and in this */
- OP_BRAMINZERO, /* 72 order. */
-
- OP_BRANUMBER, /* 73 Used for extracting brackets whose number is greater
- than can fit into an opcode. */
-
- OP_BRA /* 74 This and greater values are used for brackets that
- extract substrings up to a basic limit. After that,
- use is made of OP_BRANUMBER. */
-};
-
-
-/* This macro defines textual names for all the opcodes. These are used only
-for debugging, in pcre.c when DEBUG is defined, and also in pcretest.c. The
-macro is referenced only in printint.c. */
-
-#define OP_NAME_LIST \
- "End", "\\A", "\\G", "\\B", "\\b", "\\D", "\\d", \
- "\\S", "\\s", "\\W", "\\w", "Any", "Anybyte", "\\Z", "\\z", \
- "Opt", "^", "$", "chars", "not", \
- "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
- "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
- "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
- "*", "*?", "+", "+?", "?", "??", "{", "{", \
- "class", "Ref", "Recurse", "Callout", \
- "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", \
- "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cond ref",\
- "Brazero", "Braminzero", "Branumber", "Bra"
-
-
-/* This macro defines the length of fixed length operations in the compiled
-regex. The lengths are used when searching for specific things, and also in the
-debugging printing of a compiled regex. We use a macro so that it can be
-incorporated both into pcre.c and pcretest.c without being publicly exposed. */
-
-#define OP_LENGTHS \
- 1, /* End */ \
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* \A, \G, \B, \b, \D, \d, \S, \s, \W, \w */ \
- 1, 1, 1, 1, 2, 1, 1, /* Any, Anybyte, \Z, \z, Opt, ^, $ */ \
- 2, /* Chars - the minimum length */ \
- 2, /* not */ \
- /* Positive single-char repeats */ \
- 2, 2, 2, 2, 2, 2, /* *, *?, +, +?, ?, ?? */ \
- 4, 4, 4, /* upto, minupto, exact */ \
- /* Negative single-char repeats */ \
- 2, 2, 2, 2, 2, 2, /* NOT *, *?, +, +?, ?, ?? */ \
- 4, 4, 4, /* NOT upto, minupto, exact */ \
- /* Positive type repeats */ \
- 2, 2, 2, 2, 2, 2, /* Type *, *?, +, +?, ?, ?? */ \
- 4, 4, 4, /* Type upto, minupto, exact */ \
- /* Multi-char class repeats */ \
- 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ \
- 5, 5, /* CRRANGE, CRMINRANGE */ \
- 33, 3, /* CLASS, REF */ \
- 1+LINK_SIZE, /* RECURSE */ \
- 2, /* CALLOUT */ \
- 1+LINK_SIZE, /* Alt */ \
- 1+LINK_SIZE, /* Ket */ \
- 1+LINK_SIZE, /* KetRmax */ \
- 1+LINK_SIZE, /* KetRmin */ \
- 1+LINK_SIZE, /* Assert */ \
- 1+LINK_SIZE, /* Assert not */ \
- 1+LINK_SIZE, /* Assert behind */ \
- 1+LINK_SIZE, /* Assert behind not */ \
- 1+LINK_SIZE, /* Reverse */ \
- 1+LINK_SIZE, /* Once */ \
- 1, /* COND */ \
- 3, /* CREF */ \
- 1, 1, /* BRAZERO, BRAMINZERO */ \
- 3, /* BRANUMBER */ \
- 1+LINK_SIZE /* BRA */ \
-
-
-/* The highest extraction number before we have to start using additional
-bytes. (Originally PCRE didn't have support for extraction counts higher than
-this number.) The value is limited by the number of opcodes left after OP_BRA,
-i.e. 255 - OP_BRA. We actually set it a bit lower to leave room for additional
-opcodes. */
-
-#define EXTRACT_BASIC_MAX 150
-
-/* A magic value for OP_CREF to indicate the "in recursion" condition. */
-
-#define CREF_RECURSE 0xffff
-
-/* The texts of compile-time error messages are defined as macros here so that
-they can be accessed by the POSIX wrapper and converted into error codes. Yes,
-I could have used error codes in the first place, but didn't feel like changing
-just to accommodate the POSIX wrapper. */
-
-#define ERR1 "\\ at end of pattern"
-#define ERR2 "\\c at end of pattern"
-#define ERR3 "unrecognized character follows \\"
-#define ERR4 "numbers out of order in {} quantifier"
-#define ERR5 "number too big in {} quantifier"
-#define ERR6 "missing terminating ] for character class"
-#define ERR7 "invalid escape sequence in character class"
-#define ERR8 "range out of order in character class"
-#define ERR9 "nothing to repeat"
-#define ERR10 "operand of unlimited repeat could match the empty string"
-#define ERR11 "internal error: unexpected repeat"
-#define ERR12 "unrecognized character after (?"
-#define ERR13 "POSIX named classes are supported only within a class"
-#define ERR14 "missing )"
-#define ERR15 "reference to non-existent subpattern"
-#define ERR16 "erroffset passed as NULL"
-#define ERR17 "unknown option bit(s) set"
-#define ERR18 "missing ) after comment"
-#define ERR19 "parentheses nested too deeply"
-#define ERR20 "regular expression too large"
-#define ERR21 "failed to get memory"
-#define ERR22 "unmatched parentheses"
-#define ERR23 "internal error: code overflow"
-#define ERR24 "unrecognized character after (?<"
-#define ERR25 "lookbehind assertion is not fixed length"
-#define ERR26 "malformed number after (?("
-#define ERR27 "conditional group contains more than two branches"
-#define ERR28 "assertion expected after (?("
-#define ERR29 "(?R or (?digits must be followed by )"
-#define ERR30 "unknown POSIX class name"
-#define ERR31 "POSIX collating elements are not supported"
-#define ERR32 "this version of PCRE is not compiled with PCRE_UTF8 support"
-#define ERR33 "characters with values > 255 are not yet supported in classes"
-#define ERR34 "character value in \\x{...} sequence is too large"
-#define ERR35 "invalid condition (?(0)"
-#define ERR36 "\\C not allowed in lookbehind assertion"
-#define ERR37 "PCRE does not support \\L, \\l, \\N, \\P, \\p, \\U, \\u, or \\X"
-#define ERR38 "number after (?C is > 255"
-#define ERR39 "closing ) for (?C expected"
-#define ERR40 "recursive call could loop indefinitely"
-#define ERR41 "unrecognized character after (?P"
-#define ERR42 "syntax error after (?P"
-#define ERR43 "two named groups have the same name"
-
-/* All character handling must be done as unsigned characters. Otherwise there
-are problems with top-bit-set characters and functions such as isspace().
-However, we leave the interface to the outside world as char *, because that
-should make things easier for callers. We define a short type for unsigned char
-to save lots of typing. I tried "uchar", but it causes problems on Digital
-Unix, where it is defined in sys/types, so use "uschar" instead. */
-
-typedef unsigned char uschar;
-
-/* The real format of the start of the pcre block; the index of names and the
-code vector run on as long as necessary after the end. */
-
-typedef struct real_pcre {
- unsigned long int magic_number;
- size_t size; /* Total that was malloced */
- const unsigned char *tables; /* Pointer to tables */
- unsigned long int options;
- unsigned short int top_bracket;
- unsigned short int top_backref;
- unsigned short int first_char;
- unsigned short int req_char;
- unsigned short int name_entry_size; /* Size of any name items; 0 => none */
- unsigned short int name_count; /* Number of name items */
-} real_pcre;
-
-/* The real format of the extra block returned by pcre_study(). */
-
-typedef struct real_pcre_extra {
- uschar options;
- uschar start_bits[32];
-} real_pcre_extra;
-
-
-/* Structure for passing "static" information around between the functions
-doing the compiling, so that they are thread-safe. */
-
-typedef struct compile_data {
- const uschar *lcc; /* Points to lower casing table */
- const uschar *fcc; /* Points to case-flipping table */
- const uschar *cbits; /* Points to character type table */
- const uschar *ctypes; /* Points to table of type maps */
- const uschar *start_code; /* The start of the compiled code */
- uschar *name_table; /* The name/number table */
- int names_found; /* Number of entries so far */
- int name_entry_size; /* Size of each entry */
-} compile_data;
-
-/* Structure for maintaining a chain of pointers to the currently incomplete
-branches, for testing for left recursion. */
-
-typedef struct branch_chain {
- struct branch_chain *outer;
- uschar *current;
-} branch_chain;
-
-/* Structure for items in a linked list that represents an explicit recursive
-call within the pattern. */
-
-typedef struct recursion_info {
- struct recursion_info *prev; /* Previous recursion record (or NULL) */
- int group_num; /* Number of group that was called */
- const uschar *after_call; /* "Return value": points after the call in the expr */
- const uschar *save_start; /* Old value of md->start_match */
- int *offset_save; /* Pointer to start of saved offsets */
- int saved_max; /* Number of saved offsets */
-} recursion_info;
-
-/* Structure for passing "static" information around between the functions
-doing the matching, so that they are thread-safe. */
-
-typedef struct match_data {
- int errorcode; /* As it says */
- int *offset_vector; /* Offset vector */
- int offset_end; /* One past the end */
- int offset_max; /* The maximum usable for return data */
- const uschar *lcc; /* Points to lower casing table */
- const uschar *ctypes; /* Points to table of type maps */
- BOOL offset_overflow; /* Set if too many extractions */
- BOOL notbol; /* NOTBOL flag */
- BOOL noteol; /* NOTEOL flag */
- BOOL utf8; /* UTF8 flag */
- BOOL endonly; /* Dollar not before final \n */
- BOOL notempty; /* Empty string match not wanted */
- const uschar *start_code; /* For use when recursing */
- const uschar *start_subject; /* Start of the subject string */
- const uschar *end_subject; /* End of the subject string */
- const uschar *start_match; /* Start of this match attempt */
- const uschar *end_match_ptr; /* Subject position at end match */
- int end_offset_top; /* Highwater mark at end of match */
- int capture_last; /* Most recent capture number */
- int start_offset; /* The start offset value */
- recursion_info *recursive; /* Linked list of recursion data */
-} match_data;
-
-/* Bit definitions for entries in the pcre_ctypes table. */
-
-#define ctype_space 0x01
-#define ctype_letter 0x02
-#define ctype_digit 0x04
-#define ctype_xdigit 0x08
-#define ctype_word 0x10 /* alphameric or '_' */
-#define ctype_meta 0x80 /* regexp meta char or zero (end pattern) */
-
-/* Offsets for the bitmap tables in pcre_cbits. Each table contains a set
-of bits for a class map. Some classes are built by combining these tables. */
-
-#define cbit_space 0 /* [:space:] or \s */
-#define cbit_xdigit 32 /* [:xdigit:] */
-#define cbit_digit 64 /* [:digit:] or \d */
-#define cbit_upper 96 /* [:upper:] */
-#define cbit_lower 128 /* [:lower:] */
-#define cbit_word 160 /* [:word:] or \w */
-#define cbit_graph 192 /* [:graph:] */
-#define cbit_print 224 /* [:print:] */
-#define cbit_punct 256 /* [:punct:] */
-#define cbit_cntrl 288 /* [:cntrl:] */
-#define cbit_length 320 /* Length of the cbits table */
-
-/* Offsets of the various tables from the base tables pointer, and
-total length. */
-
-#define lcc_offset 0
-#define fcc_offset 256
-#define cbits_offset 512
-#define ctypes_offset (cbits_offset + cbit_length)
-#define tables_length (ctypes_offset + 256)
-
-/* End of internal.h */
diff --git a/ext/pcre/pcrelib/maketables.c b/ext/pcre/pcrelib/maketables.c
deleted file mode 100644
index f89765214c..0000000000
--- a/ext/pcre/pcrelib/maketables.c
+++ /dev/null
@@ -1,136 +0,0 @@
-/*************************************************
-* Perl-Compatible Regular Expressions *
-*************************************************/
-
-/*
-PCRE is a library of functions to support regular expressions whose syntax
-and semantics are as close as possible to those of the Perl 5 language.
-
-Written by: Philip Hazel <ph10@cam.ac.uk>
-
- Copyright (c) 1997-2001 University of Cambridge
-
------------------------------------------------------------------------------
-Permission is granted to anyone to use this software for any purpose on any
-computer system, and to redistribute it freely, subject to the following
-restrictions:
-
-1. This software is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-
-2. The origin of this software must not be misrepresented, either by
- explicit claim or by omission.
-
-3. Altered versions must be plainly marked as such, and must not be
- misrepresented as being the original software.
-
-4. If PCRE is embedded in any software that is released under the GNU
- General Purpose Licence (GPL), then the terms of that licence shall
- supersede any condition above with which it is incompatible.
------------------------------------------------------------------------------
-
-See the file Tech.Notes for some information on the internals.
-*/
-
-
-/* This file is compiled on its own as part of the PCRE library. However,
-it is also included in the compilation of dftables.c, in which case the macro
-DFTABLES is defined. */
-
-#ifndef DFTABLES
-#include "internal.h"
-#endif
-
-
-
-/*************************************************
-* Create PCRE character tables *
-*************************************************/
-
-/* Build a fresh set of PCRE character tables and return a pointer to them.
-The tables are built using the ctype functions, so their contents depend on
-the locale that is current when the function is called. When this file is
-compiled as part of the library the memory is obtained via pcre_malloc(); when
-it is compiled inside dftables (DFTABLES defined), malloc() is used instead.
-
-The block holds, in this order: the lower casing table (256 bytes), the
-case-flipping table (256 bytes), the character class bit maps (cbit_length
-bytes) and the character type table (256 bytes).
-
-Arguments:   none
-Returns:     pointer to the contiguous block of data, or NULL if the memory
-             could not be obtained
-*/
-
-const unsigned char *
-pcre_maketables(void)
-{
-unsigned char *tables, *lower_tab, *flip_tab, *class_tab, *type_tab;
-int ch;
-
-#ifndef DFTABLES
-tables = (unsigned char*)(pcre_malloc)(tables_length);
-#else
-tables = (unsigned char*)malloc(tables_length);
-#endif
-
-if (tables == NULL) return NULL;
-
-/* Carve the block up into its four consecutive tables. */
-
-lower_tab = tables;
-flip_tab = lower_tab + 256;
-class_tab = flip_tab + 256;
-type_tab = class_tab + cbit_length;
-
-/* The lower casing table and the case-flipping table. */
-
-for (ch = 0; ch < 256; ch++)
-  {
-  lower_tab[ch] = tolower(ch);
-  flip_tab[ch] = islower(ch)? toupper(ch) : tolower(ch);
-  }
-
-/* The character class bit maps. No attempt is made to save effort on classes
-that are normally exclusive - in some locales things may be different. The map
-for "space" holds everything that isspace() accepts, including VT in the
-default locale, which is what the POSIX class [:space:] needs. */
-
-memset(class_tab, 0, cbit_length);
-for (ch = 0; ch < 256; ch++)
-  {
-  unsigned char *slot = class_tab + ch/8;   /* Byte holding this char's bit */
-  int bit = 1 << (ch&7);
-
-  if (isdigit(ch)) slot[cbit_digit] |= bit;
-  if (isupper(ch)) slot[cbit_upper] |= bit;
-  if (islower(ch)) slot[cbit_lower] |= bit;
-
-  /* A word character is a digit, a letter of either case, or underscore. */
-
-  if (isdigit(ch) || isupper(ch) || islower(ch) || ch == '_')
-    slot[cbit_word] |= bit;
-
-  if (isspace(ch)) slot[cbit_space] |= bit;
-  if (isxdigit(ch)) slot[cbit_xdigit] |= bit;
-  if (isgraph(ch)) slot[cbit_graph] |= bit;
-  if (isprint(ch)) slot[cbit_print] |= bit;
-  if (ispunct(ch)) slot[cbit_punct] |= bit;
-  if (iscntrl(ch)) slot[cbit_cntrl] |= bit;
-  }
-
-/* The character type table. VT (0x0b) is deliberately left out of the white
-space characters here, because Perl doesn't recognize it as such for \s and
-for comments within regexes. */
-
-for (ch = 0; ch < 256; ch++)
-  {
-  int flags = 0;
-  if (ch != 0x0b && isspace(ch)) flags |= ctype_space;
-  if (isalpha(ch)) flags |= ctype_letter;
-  if (isdigit(ch)) flags |= ctype_digit;
-  if (isxdigit(ch)) flags |= ctype_xdigit;
-  if (isalnum(ch) || ch == '_') flags |= ctype_word;
-  if (strchr("*+?{^.$|()[", ch) != 0) flags |= ctype_meta;
-  type_tab[ch] = flags;
-  }
-
-return tables;
-}
-
-/* End of maketables.c */
diff --git a/ext/pcre/pcrelib/pcre.c b/ext/pcre/pcrelib/pcre.c
deleted file mode 100644
index 8c74905131..0000000000
--- a/ext/pcre/pcrelib/pcre.c
+++ /dev/null
@@ -1,5992 +0,0 @@
-/*************************************************
-* Perl-Compatible Regular Expressions *
-*************************************************/
-
-/*
-This is a library of functions to support regular expressions whose syntax
-and semantics are as close as possible to those of the Perl 5 language. See
-the file Tech.Notes for some information on the internals.
-
-Written by: Philip Hazel <ph10@cam.ac.uk>
-
- Copyright (c) 1997-2003 University of Cambridge
-
------------------------------------------------------------------------------
-Permission is granted to anyone to use this software for any purpose on any
-computer system, and to redistribute it freely, subject to the following
-restrictions:
-
-1. This software is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-
-2. The origin of this software must not be misrepresented, either by
- explicit claim or by omission.
-
-3. Altered versions must be plainly marked as such, and must not be
- misrepresented as being the original software.
-
-4. If PCRE is embedded in any software that is released under the GNU
- General Purpose Licence (GPL), then the terms of that licence shall
- supersede any condition above with which it is incompatible.
------------------------------------------------------------------------------
-*/
-
-
-/* Define DEBUG to get debugging output on stdout. */
-
-/* #define DEBUG */
-
-/* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
-inline, and there are *still* stupid compilers about that don't like indented
-pre-processor statements. I suppose it's only been 10 years... */
-
-#ifdef DEBUG
-#define DPRINTF(p) printf p
-#else
-#define DPRINTF(p) /*nothing*/
-#endif
-
-/* Include the internals header, which itself includes Standard C headers plus
-the external pcre header. */
-
-#include "internal.h"
-
-
-/* Allow compilation as C++ source code, should anybody want to do that. */
-
-#ifdef __cplusplus
-#define class pcre_class
-#endif
-
-
-/* Maximum number of items on the nested bracket stacks at compile time. This
-applies to the nesting of all kinds of parentheses. It does not limit
-un-nested, non-capturing parentheses. This number can be made bigger if
-necessary - it is used to dimension one int and one unsigned char vector at
-compile time. */
-
-#define BRASTACK_SIZE 200
-
-
-
-/* Maximum number of ints of offset to save on the stack for recursive calls.
-If the offset vector is bigger, malloc is used. This should be a multiple of 3,
-because the offset vector is always a multiple of 3 long. */
-
-#define REC_STACK_SAVE_MAX 30
-
-
-/* The number of bytes in a literal character string above which we can't add
-any more is different when UTF-8 characters may be encountered. */
-
-#ifdef SUPPORT_UTF8
-#define MAXLIT 250
-#else
-#define MAXLIT 255
-#endif
-
-
-/* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
-the definition is next to the definition of the opcodes in internal.h. */
-
-static uschar OP_lengths[] = { OP_LENGTHS };
-
-/* Min and max values for the common repeats; for the maxima, 0 => infinity */
-
-static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
-static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
-
-/* Table for handling escaped characters in the range '0'-'z'. Positive returns
-are simple data values; negative values are for special things like \d and so
-on. Zero means further processing is needed (for things like \x), or the escape
-is invalid. */
-
-static const short int escapes[] = {
- 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
- 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
- '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
- 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
- 0, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
- 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
- '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
- 0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */
- 0, 0, ESC_r, -ESC_s, ESC_t, 0, 0, -ESC_w, /* p - w */
- 0, 0, -ESC_z /* x - z */
-};
-
-/* Tables of names of POSIX character classes and their lengths. The list is
-terminated by a zero length entry. The first three must be alpha, lower, upper,
-as this is assumed for handling case independence. */
-
-static const char *posix_names[] = {
- "alpha", "lower", "upper",
- "alnum", "ascii", "blank", "cntrl", "digit", "graph",
- "print", "punct", "space", "word", "xdigit" };
-
-static const uschar posix_name_lengths[] = {
- 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
-
-/* Table of class bit maps for each POSIX class; up to three may be combined
-to form the class. The table for [:blank:] is dynamically modified to remove
-the vertical space characters. */
-
-static const int posix_class_maps[] = {
- cbit_lower, cbit_upper, -1, /* alpha */
- cbit_lower, -1, -1, /* lower */
- cbit_upper, -1, -1, /* upper */
- cbit_digit, cbit_lower, cbit_upper, /* alnum */
- cbit_print, cbit_cntrl, -1, /* ascii */
- cbit_space, -1, -1, /* blank - a GNU extension */
- cbit_cntrl, -1, -1, /* cntrl */
- cbit_digit, -1, -1, /* digit */
- cbit_graph, -1, -1, /* graph */
- cbit_print, -1, -1, /* print */
- cbit_punct, -1, -1, /* punct */
- cbit_space, -1, -1, /* space */
- cbit_word, -1, -1, /* word - a Perl extension */
- cbit_xdigit,-1, -1 /* xdigit */
-};
-
-
-/* Definition to allow mutual recursion */
-
-static BOOL
- compile_regex(int, int, int *, uschar **, const uschar **, const char **,
- BOOL, int, int *, int *, branch_chain *, compile_data *);
-
-/* Structure for building a chain of data that actually lives on the
-stack, for holding the values of the subject pointer at the start of each
-subpattern, so as to detect when an empty string has been matched by a
-subpattern - to break infinite loops. */
-
-typedef struct eptrblock {
- struct eptrblock *prev;
- const uschar *saved_eptr;
-} eptrblock;
-
-/* Flag bits for the match() function */
-
-#define match_condassert 0x01 /* Called to check a condition assertion */
-#define match_isgroup 0x02 /* Set if start of bracketed group */
-
-
-
-/*************************************************
-* Global variables *
-*************************************************/
-
-/* PCRE is thread-clean and doesn't use any global variables in the normal
-sense. However, it calls memory allocation and free functions via the two
-indirections below, and it can optionally do callouts. These values can be
-changed by the caller, but are shared between all threads. However, when
-compiling for Virtual Pascal, things are done differently (see pcre.in). */
-
-#ifndef VPCOMPAT
-void *(*pcre_malloc)(size_t) = malloc;
-void (*pcre_free)(void *) = free;
-int (*pcre_callout)(pcre_callout_block *) = NULL;
-#endif
-
-
-/*************************************************
-* Macros and tables for character handling *
-*************************************************/
-
-/* When UTF-8 encoding is being used, a character is no longer just a single
-byte. The macros for character handling generate simple sequences when used in
-byte-mode, and more complicated ones for UTF-8 characters. */
-
-#ifndef SUPPORT_UTF8
-#define GETCHARINC(c, eptr) c = *eptr++;
-#define GETCHARLEN(c, eptr, len) c = *eptr;
-#define BACKCHAR(eptr)
-
-#else /* SUPPORT_UTF8 */
-
-/* Get the next UTF-8 character, advancing the pointer */
-
-#define GETCHARINC(c, eptr) \
- c = *eptr++; \
- if (md->utf8 && (c & 0xc0) == 0xc0) \
- { \
- int a = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
- int s = 6*a; \
- c = (c & utf8_table3[a]) << s; \
- while (a-- > 0) \
- { \
- s -= 6; \
- c |= (*eptr++ & 0x3f) << s; \
- } \
- }
-
-/* Get the next UTF-8 character, not advancing the pointer, setting length */
-
-#define GETCHARLEN(c, eptr, len) \
- c = *eptr; \
- len = 1; \
- if (md->utf8 && (c & 0xc0) == 0xc0) \
- { \
- int i; \
- int a = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
- int s = 6*a; \
- c = (c & utf8_table3[a]) << s; \
- for (i = 1; i <= a; i++) \
- { \
- s -= 6; \
- c |= (eptr[i] & 0x3f) << s; \
- } \
- len += a; \
- }
-
-/* If the pointer is not at the start of a character, move it back until
-it is. */
-
-#define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;
-
-#endif
-
-
-
-/*************************************************
-* Default character tables *
-*************************************************/
-
-/* A default set of character tables is included in the PCRE binary. Its source
-is built by the maketables auxiliary program, which uses the default C ctypes
-functions, and put in the file chartables.c. These tables are used by PCRE
-whenever the caller of pcre_compile() does not provide an alternate set of
-tables. */
-
-#include "chartables.c"
-
-
-
-#ifdef SUPPORT_UTF8
-/*************************************************
-* Tables for UTF-8 support *
-*************************************************/
-
-/* These are the breakpoints for different numbers of bytes in a UTF-8
-character. */
-
-static int utf8_table1[] = { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
-
-/* These are the indicator bits and the mask for the data bits to set in the
-first byte of a character, indexed by the number of additional bytes. */
-
-static int utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
-static int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
-
-/* Table of the number of extra bytes, indexed by the first byte of the
-character masked with 0x3f. The highest index for a valid UTF-8 character is
-in fact 0x3d. */
-
-static uschar utf8_table4[] = {
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
- 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
-
-
-/*************************************************
-* Convert character value to UTF-8 *
-*************************************************/
-
-/* Encode an integer value in the range 0 - 0x7fffffff as a UTF-8 character.
-The result occupies between 1 and 6 bytes: utf8_table1 gives the largest value
-that fits in each length, and utf8_table2 gives the indicator bits for the
-first byte.
-
-Arguments:
-  cvalue     the character value
-  buffer     pointer to buffer for result - at least 6 bytes long
-
-Returns:     number of bytes placed in the buffer
-*/
-
-static int
-ord2utf8(int cvalue, uschar *buffer)
-{
-register int extra = 0;    /* Number of bytes that follow the first one */
-register int k;
-
-while (extra < sizeof(utf8_table1)/sizeof(int) && cvalue > utf8_table1[extra])
-  extra++;
-
-/* Fill in the trailing bytes from last to first, six data bits in each. */
-
-for (k = extra; k > 0; k--)
-  {
-  buffer[k] = 0x80 | (cvalue & 0x3f);
-  cvalue >>= 6;
-  }
-
-/* What is left of the value goes in the first byte, under the indicator. */
-
-buffer[0] = utf8_table2[extra] | cvalue;
-return extra + 1;
-}
-#endif
-
-
-
-/*************************************************
-* Print compiled regex *
-*************************************************/
-
-/* The code for doing this is held in a separate file that is also included in
-pcretest.c. It defines a function called print_internals(). */
-
-#ifdef DEBUG
-#include "printint.c"
-#endif
-
-
-
-/*************************************************
-* Return version string *
-*************************************************/
-
-#define STRING(a) # a
-#define XSTRING(s) STRING(s)
-
-/* Return the version string, which is assembled at compile time from the
-PCRE_MAJOR, PCRE_MINOR and PCRE_DATE macros. */
-
-const char *
-pcre_version(void)
-{
-static const char version_text[] =
-  XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
-return version_text;
-}
-
-
-
-
-/*************************************************
-* (Obsolete) Return info about compiled pattern *
-*************************************************/
-
-/* This is the original "info" function. It picks potentially useful data out
-of the private structure, but its interface proved too rigid; it is kept for
-backwards compatibility. The public options are passed back in an int. Though
-the re->options field has been expanded to a long int, all the public options
-are at the low end of it, so even on 16-bit systems this is still OK, and the
-API for pcre_info() has therefore not been changed.
-
-Arguments:
-  external_re   points to compiled code
-  optptr        where to pass back the options (may be NULL)
-  first_char    where to pass back the first character,
-                or -1 if multiline and all branches start ^,
-                or -2 otherwise (may be NULL)
-
-Returns:        number of capturing subpatterns
-                or negative values on error
-*/
-
-int
-pcre_info(const pcre *external_re, int *optptr, int *first_char)
-{
-const real_pcre *code = (const real_pcre *)external_re;
-
-if (code == NULL) return PCRE_ERROR_NULL;
-if (code->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
-
-if (optptr != NULL) *optptr = (int)(code->options & PUBLIC_OPTIONS);
-
-if (first_char != NULL)
-  {
-  if ((code->options & PCRE_FIRSTSET) != 0)
-    *first_char = code->first_char;
-  else if ((code->options & PCRE_STARTLINE) != 0)
-    *first_char = -1;
-  else
-    *first_char = -2;
-  }
-
-return code->top_bracket;
-}
-
-
-
-/*************************************************
-* Return info about compiled pattern *
-*************************************************/
-
-/* This is the newer "info" function. Its interface is extensible: the caller
-names one item of information and supplies a place to put it, so that further
-items can be added compatibly. The type of the object that "where" must point
-to depends on the item requested, as can be seen from the casts below.
-
-Arguments:
-  external_re      points to compiled code
-  study_data       points to study data, or NULL
-  what             what information is required
-  where            where to put the information
-
-Returns:           0 if data returned, negative on error
-*/
-
-int
-pcre_fullinfo(const pcre *external_re, const pcre_extra *study_data, int what,
-  void *where)
-{
-const real_pcre *code = (const real_pcre *)external_re;
-const real_pcre_extra *studied = (const real_pcre_extra *)study_data;
-
-if (code == NULL || where == NULL) return PCRE_ERROR_NULL;
-if (code->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
-
-switch (what)
-  {
-  case PCRE_INFO_OPTIONS:
-  *((unsigned long int *)where) = code->options & PUBLIC_OPTIONS;
-  break;
-
-  case PCRE_INFO_SIZE:
-  *((size_t *)where) = code->size;
-  break;
-
-  case PCRE_INFO_CAPTURECOUNT:
-  *((int *)where) = code->top_bracket;
-  break;
-
-  case PCRE_INFO_BACKREFMAX:
-  *((int *)where) = code->top_backref;
-  break;
-
-  /* The first character if one is set; otherwise -1 if the pattern is known
-  to match only at the start of a line, or -2 if neither applies. */
-
-  case PCRE_INFO_FIRSTCHAR:
-  if ((code->options & PCRE_FIRSTSET) != 0)
-    *((int *)where) = code->first_char;
-  else if ((code->options & PCRE_STARTLINE) != 0)
-    *((int *)where) = -1;
-  else
-    *((int *)where) = -2;
-  break;
-
-  /* The starting-byte bit map exists only if study data was supplied and it
-  has the "mapped" flag set. */
-
-  case PCRE_INFO_FIRSTTABLE:
-  if (studied != NULL && (studied->options & PCRE_STUDY_MAPPED) != 0)
-    *((const uschar **)where) = studied->start_bits;
-  else
-    *((const uschar **)where) = NULL;
-  break;
-
-  case PCRE_INFO_LASTLITERAL:
-  if ((code->options & PCRE_REQCHSET) != 0)
-    *((int *)where) = code->req_char;
-  else
-    *((int *)where) = -1;
-  break;
-
-  case PCRE_INFO_NAMEENTRYSIZE:
-  *((int *)where) = code->name_entry_size;
-  break;
-
-  case PCRE_INFO_NAMECOUNT:
-  *((int *)where) = code->name_count;
-  break;
-
-  /* The name table follows immediately after the fixed-size header. */
-
-  case PCRE_INFO_NAMETABLE:
-  *((const uschar **)where) = (const uschar *)code + sizeof(real_pcre);
-  break;
-
-  default: return PCRE_ERROR_BADOPTION;
-  }
-
-return 0;
-}
-
-
-
-#ifdef DEBUG
-/*************************************************
-* Debugging function to print chars *
-*************************************************/
-
-/* Print a sequence of chars in printable format: printing characters are
-output as themselves, anything else as \x followed by two hex digits. When
-is_subject is set, the output is cut short at the end of the subject.
-
-Arguments:
-  p           points to characters
-  length      number to print
-  is_subject  TRUE if printing from within md->start_subject
-  md          pointer to matching data block, if is_subject is TRUE
-
-Returns:     nothing
-*/
-
-static void
-pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
-{
-int left;
-
-if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
-
-for (left = length; left > 0; left--)
-  {
-  int ch = *p++;
-  if (isprint(ch))
-    printf("%c", ch);
-  else
-    printf("\\x%02x", ch);
-  }
-}
-#endif
-
-
-
-
-/*************************************************
-* Handle escapes *
-*************************************************/
-
-/* This function is called when a \ has been encountered. It either returns a
-positive value for a simple escape such as \n, or a negative value which
-encodes one of the more complicated things such as \d. When UTF-8 is enabled,
-a positive value greater than 255 may be returned. On entry, ptr is pointing at
-the \. On exit, it is on the final character of the escape sequence.
-
-Arguments:
- ptrptr points to the pattern position pointer
- errorptr points to the pointer to the error message
- bracount number of previous extracting brackets
- options the options bits
- isclass TRUE if inside a character class
- cd pointer to char tables block
-
-Returns: zero or positive => a data character
- negative => a special escape sequence
- on error, errorptr is set
-*/
-
-static int
-check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
- int options, BOOL isclass, compile_data *cd)
-{
-const uschar *ptr = *ptrptr;
-int c, i;
-
-/* If backslash is at the end of the pattern, it's an error. */
-
-c = *(++ptr);
-if (c == 0) *errorptr = ERR1;
-
-/* Digits or letters may have special meaning; all others are literals. */
-
-else if (c < '0' || c > 'z') {}
-
-/* Do an initial lookup in a table. A non-zero result is something that can be
-returned immediately. Otherwise further processing may be required. */
-
-else if ((i = escapes[c - '0']) != 0) c = i;
-
-/* Escapes that need further processing, or are illegal. Note that i is zero
-whenever this branch is entered, because the table lookup above has just
-assigned it; the digit-counting loops for octal and \x below rely on that. */
-
-else
- {
- const uschar *oldptr;
- switch (c)
- {
- /* A number of Perl escapes are not handled by PCRE. We give an explicit
- error. */
-
- case 'l':
- case 'L':
- case 'N':
- case 'p':
- case 'P':
- case 'u':
- case 'U':
- case 'X':
- *errorptr = ERR37;
- break;
-
- /* The handling of escape sequences consisting of a string of digits
- starting with one that is not zero is not straightforward. By experiment,
- the way Perl works seems to be as follows:
-
- Outside a character class, the digits are read as a decimal number. If the
- number is less than 10, or if there are that many previous extracting
- left brackets, then it is a back reference. Otherwise, up to three octal
- digits are read to form an escaped byte. Thus \123 is likely to be octal
- 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
- value is greater than 377, the least significant 8 bits are taken. Inside a
- character class, \ followed by a digit is always an octal number. */
-
- case '1': case '2': case '3': case '4': case '5':
- case '6': case '7': case '8': case '9':
-
- if (!isclass)
- {
- oldptr = ptr;
- c -= '0';
- while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
- c = c * 10 + *(++ptr) - '0';
- if (c < 10 || c <= bracount)
- {
- c = -(ESC_REF + c);
- break;
- }
- ptr = oldptr; /* Put the pointer back and fall through */
- }
-
- /* Handle an octal number following \. If the first digit is 8 or 9, Perl
- generates a binary zero byte and treats the digit as a following literal.
- Thus we have to pull back the pointer by one. */
-
- if ((c = *ptr) >= '8')
- {
- ptr--;
- c = 0;
- break;
- }
-
- /* \0 always starts an octal number, but we may drop through to here with a
- larger first octal digit. At most two further octal digits are read. */
-
- case '0':
- c -= '0';
- while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
- ptr[1] != '8' && ptr[1] != '9')
- c = c * 8 + *(++ptr) - '0';
- c &= 255; /* Take least significant 8 bits */
- break;
-
- /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
- which can be greater than 0xff, but only if the ddd are hex digits. */
-
- case 'x':
-#ifdef SUPPORT_UTF8
- if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
- {
- const uschar *pt = ptr + 2;
- register int count = 0;
- c = 0;
- while ((cd->ctypes[*pt] & ctype_xdigit) != 0)
- {
- count++;
- c = c * 16 + cd->lcc[*pt] -
- (((cd->ctypes[*pt] & ctype_digit) != 0)? '0' : 'W'); /* 'a' - 'W' is 10 in ASCII */
- pt++;
- }
- if (*pt == '}')
- {
- if (c < 0 || count > 8) *errorptr = ERR34;
- ptr = pt;
- break;
- }
- /* If the sequence of hex digits does not end with '}', then we don't
- recognize this construct; fall through to the normal \x handling. */
- }
-#endif
-
- /* Read a single byte value given as at most two hex digits */
-
- c = 0;
- while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
- {
- ptr++;
- c = c * 16 + cd->lcc[*ptr] -
- (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
- }
- break;
-
- /* Other special escapes not starting with a digit are straightforward */
-
- case 'c':
- c = *(++ptr);
- if (c == 0)
- {
- *errorptr = ERR2;
- return 0;
- }
-
- /* A letter is upper-cased; then the 0x40 bit is flipped */
-
- if (c >= 'a' && c <= 'z') c = cd->fcc[c];
- c ^= 0x40;
- break;
-
- /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
- other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
- for Perl compatibility, it is a literal. This code looks a bit odd, but
- there used to be some cases other than the default, and there may be again
- in future, so I haven't "optimized" it. */
-
- default:
- if ((options & PCRE_EXTRA) != 0) switch(c)
- {
- default:
- *errorptr = ERR3;
- break;
- }
- break;
- }
- }
-
-*ptrptr = ptr;
-return c;
-}
-
-
-
-/*************************************************
-* Check for counted repeat *
-*************************************************/
-
-/* This function is called when a '{' is encountered in a place where it might
-start a quantifier. It looks ahead, without consuming anything, to decide
-whether what follows really is a quantifier. The only forms accepted are
-{ddd}, {ddd,} and {ddd,ddd}, where each ddd is one or more digits.
-
-Arguments:
-  p         pointer to the first char after '{'
-  cd        pointer to char tables block
-
-Returns:    TRUE or FALSE
-*/
-
-static BOOL
-is_counted_repeat(const uschar *p, compile_data *cd)
-{
-/* The minimum: there must be at least one digit. */
-
-if ((cd->ctypes[*p] & ctype_digit) == 0) return FALSE;
-do p++; while ((cd->ctypes[*p] & ctype_digit) != 0);
-
-/* After the minimum comes either the closing brace or a comma. */
-
-switch (*p)
-  {
-  case '}': return TRUE;
-  case ',': p++; break;
-  default: return FALSE;
-  }
-
-/* The maximum may be omitted altogether. */
-
-if (*p == '}') return TRUE;
-
-/* Otherwise it must be a run of digits followed by the closing brace. */
-
-if ((cd->ctypes[*p] & ctype_digit) == 0) return FALSE;
-do p++; while ((cd->ctypes[*p] & ctype_digit) != 0);
-return (*p == '}');
-}
-
-
-
-/*************************************************
-* Read repeat counts *
-*************************************************/
-
-/* Read an item of the form {n}, {n,} or {n,m} and return the values. This is
-called only after is_counted_repeat() has confirmed that a repeat-count
-quantifier exists, so the syntax is guaranteed to be correct, but we need to
-check the values.
-
-Each count is range-checked while its digits are being accumulated rather
-than once at the end. The digit run in the pattern is unbounded, so checking
-only afterwards would let "min * 10 + digit" overflow the int accumulator
-(undefined behaviour in C, and in practice a wrap-around that can sneak a
-huge count past the 65535 limit). With the check inside the loop the
-accumulator never exceeds 655359. A consequence is that an out-of-range
-count is always reported as ERR5, even when the second number is also
-smaller than the first.
-
-Arguments:
-  p          pointer to first char after '{'
-  minp       pointer to int for min
-  maxp       pointer to int for max
-               returned as -1 if no max
-  errorptr   points to pointer to error message
-  cd         pointer to character tables block
-
-Returns:     pointer to '}' on success;
-             current ptr on error, with errorptr set
-             (*minp and *maxp are left untouched on error)
-*/
-
-static const uschar *
-read_repeat_counts(const uschar *p, int *minp, int *maxp,
-  const char **errorptr, compile_data *cd)
-{
-int min = 0;
-int max = -1;
-
-/* Read the minimum, refusing to go past the largest permitted count. */
-
-while ((cd->ctypes[*p] & ctype_digit) != 0)
-  {
-  min = min * 10 + *p++ - '0';
-  if (min > 65535)
-    {
-    *errorptr = ERR5;
-    return p;
-    }
-  }
-
-/* {n} means exactly n. Otherwise p is at the comma; step over it and, unless
-the closing brace follows immediately (no maximum, max stays -1), read the
-maximum under the same limit. */
-
-if (*p == '}') max = min; else
-  {
-  if (*(++p) != '}')
-    {
-    max = 0;
-    while ((cd->ctypes[*p] & ctype_digit) != 0)
-      {
-      max = max * 10 + *p++ - '0';
-      if (max > 65535)
-        {
-        *errorptr = ERR5;
-        return p;
-        }
-      }
-    if (max < min)
-      {
-      *errorptr = ERR4;
-      return p;
-      }
-    }
-  }
-
-/* Both values are known to be in range; fill in the required variables, and
-pass back the pointer to the terminating '}'. */
-
-*minp = min;
-*maxp = max;
-return p;
-}
-
-
-
-/*************************************************
-* Find first significant op code *
-*************************************************/
-
-/* Several functions that scan a compiled expression (looking for a fixed
-first character, an anchoring op code, and so on) call this to step over
-items that have no bearing on the answer. Option-setting items, negative
-lookahead and both kinds of lookbehind assertion, callouts, condition
-references, bracket numbers and word-boundary tests are all passed over. For
-some callers a change in one particular option matters, so that can be
-tracked on the way.
-
-Arguments:
-  code         pointer to the start of the group
-  options      pointer to external options; only dereferenced when optbit
-                 is greater than zero
-  optbit       the option bit whose changing is significant, or
-                 zero if none are
-
-Returns:       pointer to the first significant opcode
-*/
-
-static const uschar*
-first_significant_code(const uschar *code, int *options, int optbit)
-{
-for (;;)
-  {
-  const int op = (int)*code;
-
-  /* An option change: note the new bits if the interesting one differs,
-  then step over the two-byte item. */
-
-  if (op == OP_OPT)
-    {
-    const int newbits = (int)code[1];
-    if (optbit > 0 && (newbits & optbit) != (*options & optbit))
-      *options = newbits;
-    code += 2;
-    continue;
-    }
-
-  /* An assertion to be ignored: follow the links through every alternative,
-  then step over the item at which the chain stops. */
-
-  if (op == OP_ASSERT_NOT || op == OP_ASSERTBACK ||
-      op == OP_ASSERTBACK_NOT)
-    {
-    do code += GET(code, 1); while (*code == OP_ALT);
-    code += OP_lengths[*code];
-    continue;
-    }
-
-  /* Fixed-length items that never influence the result. */
-
-  if (op == OP_CALLOUT || op == OP_CREF || op == OP_BRANUMBER ||
-      op == OP_WORD_BOUNDARY || op == OP_NOT_WORD_BOUNDARY)
-    {
-    code += OP_lengths[op];
-    continue;
-    }
-
-  /* Anything else is significant. */
-
-  return code;
-  }
-/* Control never reaches here */
-}
-
-
-
-
-/*************************************************
-* Find the fixed length of a pattern *
-*************************************************/
-
-/* Scan a pattern and compute the fixed length of subject that will match it,
-if the length is fixed. This is needed for dealing with backward assertions.
-
-The function walks the opcodes of each alternative of the group in turn,
-adding up the number of characters each item must match. Nested groups are
-handled by a recursive call. Every alternative must come out at the same
-length, and any item whose length cannot be determined makes the whole group
-variable-length.
-
-Arguments:
- code points to the start of the pattern (the bracket)
- options the compiling options (only passed on to recursive calls)
-
-Returns: the fixed length, or -1 if there is no fixed length,
- or -2 if \C was encountered
-*/
-
-static int
-find_fixedlength(uschar *code, int options)
-{
-/* Length agreed by the alternatives seen so far; -1 until the first one has
-been completed. */
-int length = -1;
-
-/* Running total for the alternative currently being scanned; cc starts just
-past the opening bracket item and its link field. */
-register int branchlength = 0;
-register uschar *cc = code + 1 + LINK_SIZE;
-
-/* Scan along the opcodes for this branch. If we get to the end of the
-branch, check the length against that of the other branches. */
-
-for (;;)
- {
- int d;
- register int op = *cc;
- /* Every opcode from OP_BRA upwards is treated as a plain bracket here. */
- if (op >= OP_BRA) op = OP_BRA;
-
- switch (op)
- {
- /* A nested group: its own fixed length is found recursively. A negative
- result (-1 or -2) is passed straight back. Then skip over all of the
- group's alternatives and the item that closes it. */
-
- case OP_BRA:
- case OP_ONCE:
- case OP_COND:
- d = find_fixedlength(cc, options);
- if (d < 0) return d;
- branchlength += d;
- do cc += GET(cc, 1); while (*cc == OP_ALT);
- cc += 1 + LINK_SIZE;
- break;
-
- /* Reached end of a branch; if it's a ket it is the end of a nested
- call. If it's ALT it is an alternation in a nested call. If it is
- END it's the end of the outer call. All can be handled by the same code. */
-
- case OP_ALT:
- case OP_KET:
- case OP_KETRMAX:
- case OP_KETRMIN:
- case OP_END:
- /* The first branch sets the length; each later one must agree with it. */
- if (length < 0) length = branchlength;
- else if (length != branchlength) return -1;
- if (*cc != OP_ALT) return length;
- /* Another alternative follows: restart the count for it. */
- cc += 1 + LINK_SIZE;
- branchlength = 0;
- break;
-
- /* Skip over assertive subpatterns */
-
- case OP_ASSERT:
- case OP_ASSERT_NOT:
- case OP_ASSERTBACK:
- case OP_ASSERTBACK_NOT:
- do cc += GET(cc, 1); while (*cc == OP_ALT);
- /* Fall through */
-
- /* Skip over things that don't match chars */
-
- case OP_REVERSE:
- case OP_BRANUMBER:
- case OP_CREF:
- case OP_OPT:
- case OP_CALLOUT:
- case OP_SOD:
- case OP_SOM:
- case OP_EOD:
- case OP_EODN:
- case OP_CIRC:
- case OP_DOLL:
- case OP_NOT_WORD_BOUNDARY:
- case OP_WORD_BOUNDARY:
- cc += OP_lengths[*cc];
- break;
-
- /* Handle char strings. In UTF-8 mode we must count characters, not bytes.
- This requires a scan of the string, unfortunately. We assume valid UTF-8
- strings, so all we do is reduce the length by one for every byte whose bits
- are 10xxxxxx. */
-
- case OP_CHARS:
- /* cc is advanced to the length byte, which is followed by the string. */
- branchlength += *(++cc);
-#ifdef SUPPORT_UTF8
- for (d = 1; d <= *cc; d++)
- if ((cc[d] & 0xc0) == 0x80) branchlength--;
-#endif
- cc += *cc + 1;
- break;
-
- /* Handle exact repetitions */
-
- case OP_EXACT:
- case OP_TYPEEXACT:
- /* The repeat count is the 2-byte value following the opcode. */
- branchlength += GET2(cc,1);
- cc += 4;
- break;
-
- /* Handle single-char matchers */
-
- case OP_NOT_DIGIT:
- case OP_DIGIT:
- case OP_NOT_WHITESPACE:
- case OP_WHITESPACE:
- case OP_NOT_WORDCHAR:
- case OP_WORDCHAR:
- case OP_ANY:
- branchlength++;
- cc++;
- break;
-
- /* The single-byte matcher isn't allowed */
-
- case OP_ANYBYTE:
- return -2;
-
- /* Check a class for variable quantification */
-
- case OP_CLASS:
- /* Step over the opcode and its 32-byte bit map to reach whatever follows
- the class. */
- cc += 33;
-
- switch (*cc)
- {
- case OP_CRSTAR:
- case OP_CRMINSTAR:
- case OP_CRQUERY:
- case OP_CRMINQUERY:
- return -1;
-
- /* A counted repeat is fixed-length only when min and max are equal. */
-
- case OP_CRRANGE:
- case OP_CRMINRANGE:
- if (GET2(cc,1) != GET2(cc,3)) return -1;
- branchlength += GET2(cc,1);
- cc += 5;
- break;
-
- /* Anything else is counted as one character and cc is left where it is,
- so whatever follows the class is examined by the main switch on the
- next time round the loop. */
-
- default:
- branchlength++;
- }
- break;
-
- /* Anything else is variable length */
-
- default:
- return -1;
- }
- }
-/* Control never gets here */
-}
-
-
-
-
-/*************************************************
-* Scan compiled regex for numbered bracket *
-*************************************************/
-
-/* Walk through a compiled pattern, item by item, until the capturing bracket
-with the given number is found. Opcodes above OP_BRA are capturing brackets
-whose number is the distance from OP_BRA; when that distance is greater than
-EXTRACT_BASIC_MAX the real number is read from a 2-byte field inside the
-item instead.
-
-Arguments:
-  code          points to start of expression
-  number        the required bracket number
-
-Returns:        pointer to the opcode for the bracket, or NULL if not found
-*/
-
-static const uschar *
-find_bracket(const uschar *code, int number)
-{
-register int c;
-
-while ((c = *code) != OP_END)
-  {
-  int n;
-
-  /* A character string: step over the string itself as well as the item. */
-
-  if (c == OP_CHARS)
-    {
-    code += code[1] + OP_lengths[c];
-    continue;
-    }
-
-  /* Anything that is not a capturing bracket is simply stepped over. */
-
-  if (c <= OP_BRA)
-    {
-    code += OP_lengths[c];
-    continue;
-    }
-
-  /* A capturing bracket: work out its number and see if it is the one. */
-
-  n = c - OP_BRA;
-  if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
-  if (n == number) return (uschar *)code;
-  code += OP_lengths[OP_BRA];
-  }
-
-return NULL;    /* Reached OP_END without finding it */
-}
-
-
-
-/*************************************************
-* Scan compiled branch for non-emptiness *
-*************************************************/
-
-/* This function scans through a branch of a compiled pattern to see whether it
-can match the empty string or not. It is called from could_be_empty() below,
-and it calls itself for each alternative of a nested group. Note that
-first_significant_code() skips over assertions. If we hit an unclosed bracket,
-we return "empty" - this means we've struck an inner bracket whose current
-branch will already have been scanned.
-
-After a closed group has been found to have a possibly-empty alternative, the
-scan resumes at the item immediately following the group's closing item, and
-that item is examined in the normal way. (Advancing by the length of the
-following item at that point would pass over it unexamined, so that a group
-followed by something that must match a character could wrongly be reported
-as able to match the empty string.)
-
-Arguments:
-  code        points to start of search
-  endcode     points to where to stop
-
-Returns:      TRUE if what is matched could be empty
-*/
-
-static BOOL
-could_be_empty_branch(const uschar *code, const uschar *endcode)
-{
-register int c;
-
-code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0);
-
-while (code < endcode)
-  {
-  c = *code;
-
-  if (c >= OP_BRA)
-    {
-    BOOL empty_branch;
-    if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
-
-    /* Scan a closed bracket */
-
-    empty_branch = FALSE;
-    do
-      {
-      if (!empty_branch && could_be_empty_branch(code, endcode))
-        empty_branch = TRUE;
-      code += GET(code, 1);
-      }
-    while (*code == OP_ALT);
-    if (!empty_branch) return FALSE;   /* All branches are non-empty */
-
-    /* The group might be empty. Step over the item that closes it (an
-    opcode plus a link field) and carry on with whatever comes next, without
-    skipping it. */
-
-    code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0);
-    continue;
-    }
-
-  /* Check for any quantifier after a class */
-
-  else if (c == OP_CLASS)
-    {
-    const uschar *ccode = code + 33;
-
-    switch (*ccode)
-      {
-      case OP_CRSTAR:            /* These could be empty; continue */
-      case OP_CRMINSTAR:
-      case OP_CRQUERY:
-      case OP_CRMINQUERY:
-      break;
-
-      default:                   /* Non-repeat => class must match */
-      case OP_CRPLUS:            /* These repeats aren't empty */
-      case OP_CRMINPLUS:
-      return FALSE;
-
-      case OP_CRRANGE:
-      case OP_CRMINRANGE:
-      if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
-      break;
-      }
-    }
-
-  /* Test for an opcode that must match a character. */
-
-  else switch (c)
-    {
-    case OP_NOT_DIGIT:
-    case OP_DIGIT:
-    case OP_NOT_WHITESPACE:
-    case OP_WHITESPACE:
-    case OP_NOT_WORDCHAR:
-    case OP_WORDCHAR:
-    case OP_ANY:
-    case OP_ANYBYTE:
-    case OP_CHARS:
-    case OP_NOT:
-    case OP_PLUS:
-    case OP_MINPLUS:
-    case OP_EXACT:
-    case OP_NOTPLUS:
-    case OP_NOTMINPLUS:
-    case OP_NOTEXACT:
-    case OP_TYPEPLUS:
-    case OP_TYPEMINPLUS:
-    case OP_TYPEEXACT:
-    return FALSE;
-
-    /* End of branch */
-
-    case OP_KET:
-    case OP_KETRMAX:
-    case OP_KETRMIN:
-    case OP_ALT:
-    return TRUE;
-    }
-
-  /* The item just examined can match nothing; move on to the next one. */
-
-  code = first_significant_code(code + OP_lengths[c], NULL, 0);
-  }
-
-return TRUE;
-}
-
-
-
-/*************************************************
-* Scan compiled regex for non-emptiness *
-*************************************************/
-
-/* Used when checking for left-recursive calls. The current branch of the
-current pattern is tested to see whether it could match the empty string. If
-it could, the branches at enclosing levels are tested in the same way,
-working outwards along the chain and stopping once the chain runs out or we
-pass beyond the bracket which is the subject of the recursion.
-
-Arguments:
-  code        points to start of the recursion
-  endcode     points to where to stop (current RECURSE item)
-  bcptr       points to the chain of current (unclosed) branch starts
-
-Returns:      TRUE if what is matched could be empty
-*/
-
-static BOOL
-could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr)
-{
-branch_chain *bc;
-
-/* One non-empty branch anywhere on the way out settles the matter. */
-
-for (bc = bcptr; bc != NULL && bc->current >= code; bc = bc->outer)
-  {
-  if (!could_be_empty_branch(bc->current, endcode)) return FALSE;
-  }
-
-return TRUE;
-}
-
-
-
-/*************************************************
-* Check for POSIX class syntax *
-*************************************************/
-
-/* Called when "[:" or "[." or "[=" is met inside a character class. The
-character after the '[' is remembered as the terminator; what follows must
-then be an optional ^, a (possibly empty) run of letters, the same terminator
-character again, and a closing ']'.
-
-Argument:
-  ptr      pointer to the initial [
-  endptr   where to return the end pointer; on success it is set to point
-             at the terminator that precedes the final ']', and it is not
-             written at all on failure
-  cd       pointer to compile data
-
-Returns:   TRUE or FALSE
-*/
-
-static BOOL
-check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
-{
-const uschar *s;
-int terminator;
-
-/* Plain assignments rather than initializers, in keeping with the original
-code's care over a Solaris cc warning about "non-constant" initializers. */
-
-s = ptr + 1;
-terminator = *s;
-s++;
-
-if (*s == '^') s++;
-while ((cd->ctypes[*s] & ctype_letter) != 0) s++;
-
-if (*s != terminator || s[1] != ']') return FALSE;
-
-*endptr = s;
-return TRUE;
-}
-
-
-
-
-/*************************************************
-* Check POSIX class name *
-*************************************************/
-
-/* Look up the name given in a POSIX-style class entry such as [:alnum:]. The
-table of known name lengths is searched in order up to its zero terminator;
-a name matches when both its length and its characters agree.
-
-Arguments:
-  ptr        points to the first letter
-  len        the length of the name
-
-Returns:     a value representing the name (its index in the name tables),
-               or -1 if unknown
-*/
-
-static int
-check_posix_name(const uschar *ptr, int len)
-{
-int index;
-
-for (index = 0; posix_name_lengths[index] != 0; index++)
-  {
-  if (posix_name_lengths[index] != len) continue;   /* Wrong length */
-  if (strncmp((const char *)ptr, posix_names[index], len) == 0)
-    return index;
-  }
-
-return -1;
-}
-
-
-
-
-/*************************************************
-* Compile one branch *
-*************************************************/
-
-/* Scan the pattern, compiling it into the code vector. If the options are
-changed during the branch, the pointer is used to change the external options
-bits.
-
-Arguments:
- optionsptr pointer to the option bits
- brackets points to number of extracting brackets used
- code points to the pointer to the current code point
- ptrptr points to the current pattern pointer
- errorptr points to pointer to error message
- firstcharptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
- reqcharptr set to the last literal character required, else < 0
- bcptr points to current branch chain
- cd contains pointers to tables etc.
-
-Returns: TRUE on success
- FALSE, with *errorptr set on error
-*/
-
-static BOOL
-compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
- const uschar **ptrptr, const char **errorptr, int *firstcharptr,
- int *reqcharptr, branch_chain *bcptr, compile_data *cd)
-{
-int repeat_type, op_type;
-int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
-int bravalue = 0;
-int length;
-int greedy_default, greedy_non_default;
-int firstchar, reqchar;
-int zeroreqchar, zerofirstchar;
-int req_caseopt;
-int condcount = 0;
-int options = *optionsptr;
-register int c;
-register uschar *code = *codeptr;
-uschar *tempcode;
-BOOL inescq = FALSE;
-BOOL groupsetfirstchar = FALSE;
-const uschar *ptr = *ptrptr;
-const uschar *tempptr;
-uschar *previous = NULL;
-uschar class[32];
-
-/* Set up the default and non-default settings for greediness */
-
-greedy_default = ((options & PCRE_UNGREEDY) != 0);
-greedy_non_default = greedy_default ^ 1;
-
-/* Initialize no first char, no required char. REQ_UNSET means "no char
-matching encountered yet". It gets changed to REQ_NONE if we hit something that
-matches a non-fixed char first char; reqchar just remains unset if we never
-find one.
-
-When we hit a repeat whose minimum is zero, we may have to adjust these values
-to take the zero repeat into account. This is implemented by setting them to
-zerofirstchar and zeroreqchar when such a repeat is encountered. The individual
-item types that can be repeated set these backoff variables appropriately. */
-
-firstchar = reqchar = zerofirstchar = zeroreqchar = REQ_UNSET;
-
-/* The variable req_caseopt contains either the REQ_CASELESS value or zero,
-according to the current setting of the caseless flag. REQ_CASELESS is a bit
-value > 255. It is added into the firstchar or reqchar variables to record the
-case status of the value. */
-
-req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
-
-/* Switch on next character until the end of the branch */
-
-for (;; ptr++)
- {
- BOOL negate_class;
- BOOL possessive_quantifier;
- int class_charcount;
- int class_lastchar;
- int newoptions;
- int recno;
- int skipbytes;
- int subreqchar;
- int subfirstchar;
-
- c = *ptr;
- if (inescq && c != 0) goto NORMAL_CHAR;
-
- if ((options & PCRE_EXTENDED) != 0)
- {
- if ((cd->ctypes[c] & ctype_space) != 0) continue;
- if (c == '#')
- {
- /* The space before the ; is to avoid a warning on a silly compiler
- on the Macintosh. */
- while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
- if (c != 0) continue; /* Else fall through to handle end of string */
- }
- }
-
- switch(c)
- {
- /* The branch terminates at end of string, |, or ). */
-
- case 0:
- case '|':
- case ')':
- *firstcharptr = firstchar;
- *reqcharptr = reqchar;
- *codeptr = code;
- *ptrptr = ptr;
- return TRUE;
-
- /* Handle single-character metacharacters. In multiline mode, ^ disables
- the setting of any following char as a first character. */
-
- case '^':
- if ((options & PCRE_MULTILINE) != 0)
- {
- if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
- }
- previous = NULL;
- *code++ = OP_CIRC;
- break;
-
- case '$':
- previous = NULL;
- *code++ = OP_DOLL;
- break;
-
- /* There can never be a first char if '.' is first, whatever happens about
- repeats. The value of reqchar doesn't change either. */
-
- case '.':
- if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
- zerofirstchar = firstchar;
- zeroreqchar = reqchar;
- previous = code;
- *code++ = OP_ANY;
- break;
-
- /* Character classes. These always build a 32-byte bitmap of the permitted
- characters, except in the special case where there is only one character.
- For negated classes, we build the map as usual, then invert it at the end.
- */
-
- case '[':
- previous = code;
- *code++ = OP_CLASS;
-
- /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
- they are encountered at the top level, so we'll do that too. */
-
- if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
- check_posix_syntax(ptr, &tempptr, cd))
- {
- *errorptr = (ptr[1] == ':')? ERR13 : ERR31;
- goto FAILED;
- }
-
- /* If the first character is '^', set the negation flag and skip it. */
-
- if ((c = *(++ptr)) == '^')
- {
- negate_class = TRUE;
- c = *(++ptr);
- }
- else negate_class = FALSE;
-
- /* Keep a count of chars so that we can optimize the case of just a single
- character. */
-
- class_charcount = 0;
- class_lastchar = -1;
-
- /* Initialize the 32-char bit map to all zeros. We have to build the
- map in a temporary bit of store, in case the class contains only 1
- character, because in that case the compiled code doesn't use the
- bit map. */
-
- memset(class, 0, 32 * sizeof(uschar));
-
- /* Process characters until ] is reached. By writing this as a "do" it
- means that an initial ] is taken as a data character. The first pass
- checked the overall syntax. */
-
- do
- {
- /* Handle POSIX class names. Perl allows a negation extension of the
- form [:^name:]. A square bracket that doesn't match the syntax is
- treated as a literal. We also recognize the POSIX constructions
- [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
- 5.6 and 5.8 do. */
-
- if (c == '[' &&
- (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
- check_posix_syntax(ptr, &tempptr, cd))
- {
- BOOL local_negate = FALSE;
- int posix_class, i;
- register const uschar *cbits = cd->cbits;
-
- if (ptr[1] != ':')
- {
- *errorptr = ERR31;
- goto FAILED;
- }
-
- ptr += 2;
- if (*ptr == '^')
- {
- local_negate = TRUE;
- ptr++;
- }
-
- posix_class = check_posix_name(ptr, tempptr - ptr);
- if (posix_class < 0)
- {
- *errorptr = ERR30;
- goto FAILED;
- }
-
- /* If matching is caseless, upper and lower are converted to
- alpha. This relies on the fact that the class table starts with
- alpha, lower, upper as the first 3 entries. */
-
- if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
- posix_class = 0;
-
- /* Or into the map we are building up to 3 of the static class
- tables, or their negations. The [:blank:] class sets up the same
- chars as the [:space:] class (all white space). We remove the vertical
- white space chars afterwards. */
-
- posix_class *= 3;
- for (i = 0; i < 3; i++)
- {
- BOOL isblank = strncmp(ptr, "blank", 5) == 0;
- int taboffset = posix_class_maps[posix_class + i];
- if (taboffset < 0) break;
- if (local_negate)
- {
- for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
- if (isblank) class[1] |= 0x3c;
- }
- else
- {
- for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
- if (isblank) class[1] &= ~0x3c;
- }
- }
-
- ptr = tempptr + 1;
- class_charcount = 10; /* Set > 1; assumes more than 1 per class */
- continue;
- }
-
- /* Backslash may introduce a single character, or it may introduce one
- of the specials, which just set a flag. Escaped items are checked for
- validity in the pre-compiling pass. The sequence \b is a special case.
- Inside a class (and only there) it is treated as backspace. Elsewhere
- it marks a word boundary. Other escapes have preset maps ready to
- or into the one we are building. We assume they have more than one
- character in them, so set class_charcount bigger than one. */
-
- if (c == '\\')
- {
- c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
- if (-c == ESC_b) c = '\b';
- else if (c < 0)
- {
- register const uschar *cbits = cd->cbits;
- class_charcount = 10; /* Greater than 1 is what matters */
- switch (-c)
- {
- case ESC_d:
- for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
- continue;
-
- case ESC_D:
- for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
- continue;
-
- case ESC_w:
- for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
- continue;
-
- case ESC_W:
- for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
- continue;
-
- case ESC_s:
- for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
- class[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
- continue;
-
- case ESC_S:
- for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
- class[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
- continue;
-
- default:
- *errorptr = ERR7;
- goto FAILED;
- }
- }
-
- /* Fall through if single character, but don't at present allow
- chars > 255 in UTF-8 mode. */
-
-#ifdef SUPPORT_UTF8
- if (c > 255)
- {
- *errorptr = ERR33;
- goto FAILED;
- }
-#endif
- }
-
- /* A single character may be followed by '-' to form a range. However,
- Perl does not permit ']' to be the end of the range. A '-' character
- here is treated as a literal. */
-
- if (ptr[1] == '-' && ptr[2] != ']')
- {
- int d;
- ptr += 2;
- d = *ptr;
-
- /* The second part of a range can be a single-character escape, but
- not any of the other escapes. Perl 5.6 treats a hyphen as a literal
- in such circumstances. */
-
- if (d == '\\')
- {
- const uschar *oldptr = ptr;
- d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
-
-#ifdef SUPPORT_UTF8
- if (d > 255)
- {
- *errorptr = ERR33;
- goto FAILED;
- }
-#endif
- /* \b is backspace; any other special means the '-' was literal */
-
- if (d < 0)
- {
- if (d == -ESC_b) d = '\b'; else
- {
- ptr = oldptr - 2;
- goto SINGLE_CHARACTER; /* A few lines below */
- }
- }
- }
-
- if (d < c)
- {
- *errorptr = ERR8;
- goto FAILED;
- }
-
- for (; c <= d; c++)
- {
- class[c/8] |= (1 << (c&7));
- if ((options & PCRE_CASELESS) != 0)
- {
- int uc = cd->fcc[c]; /* flip case */
- class[uc/8] |= (1 << (uc&7));
- }
- class_charcount++; /* in case a one-char range */
- class_lastchar = c;
- }
- continue; /* Go get the next char in the class */
- }
-
- /* Handle a lone single character - we can get here for a normal
- non-escape char, or after \ that introduces a single character. */
-
- SINGLE_CHARACTER:
-
- class [c/8] |= (1 << (c&7));
- if ((options & PCRE_CASELESS) != 0)
- {
- c = cd->fcc[c]; /* flip case */
- class[c/8] |= (1 << (c&7));
- }
- class_charcount++;
- class_lastchar = c;
- }
-
- /* Loop until ']' reached; the check for end of string happens inside the
- loop. This "while" is the end of the "do" above. */
-
- while ((c = *(++ptr)) != ']');
-
- /* If class_charcount is 1 and class_lastchar is not negative, we saw
- precisely one character. This doesn't need the whole 32-byte bit map. We
- turn it into a 1-character OP_CHARS if it's positive, or OP_NOT if it's
- negative. In the positive case, it can cause firstchar to be set.
- Otherwise, there can be no first char if this item is first, whatever
- repeat count may follow. In the case of reqchar, save the previous value
- for reinstating. */
-
- if (class_charcount == 1 && class_lastchar >= 0)
- {
- zeroreqchar = reqchar;
- if (negate_class)
- {
- if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
- zerofirstchar = firstchar;
- code[-1] = OP_NOT;
- }
- else
- {
- if (firstchar == REQ_UNSET)
- {
- zerofirstchar = REQ_NONE;
- firstchar = class_lastchar | req_caseopt;
- }
- else
- {
- zerofirstchar = firstchar;
- reqchar = class_lastchar | req_caseopt;
- }
- code[-1] = OP_CHARS;
- *code++ = 1;
- }
- *code++ = class_lastchar;
- }
-
- /* Otherwise, negate the 32-byte map if necessary, and copy it into
- the code vector. If this is the first thing in the branch, there can be
- no first char setting, whatever the repeat count. Any reqchar setting
- must remain unchanged after any kind of repeat. */
-
- else
- {
- if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
- zerofirstchar = firstchar;
- zeroreqchar = reqchar;
- if (negate_class)
- for (c = 0; c < 32; c++) code[c] = ~class[c];
- else
- memcpy(code, class, 32);
- code += 32;
- }
- break;
-
- /* Various kinds of repeat */
-
- case '{':
- if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
- ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
- if (*errorptr != NULL) goto FAILED;
- goto REPEAT;
-
- case '*':
- repeat_min = 0;
- repeat_max = -1;
- goto REPEAT;
-
- case '+':
- repeat_min = 1;
- repeat_max = -1;
- goto REPEAT;
-
- case '?':
- repeat_min = 0;
- repeat_max = 1;
-
- REPEAT:
- if (previous == NULL)
- {
- *errorptr = ERR9;
- goto FAILED;
- }
-
- if (repeat_min == 0)
- {
- firstchar = zerofirstchar; /* Adjust for zero repeat */
- reqchar = zeroreqchar; /* Ditto */
- }
-
- op_type = 0; /* Default single-char op codes */
- possessive_quantifier = FALSE; /* Default not possessive quantifier */
-
- /* Save start of previous item, in case we have to move it up to make space
- for an inserted OP_ONCE for the additional '+' extension. */
-
- tempcode = previous;
-
- /* If the next character is '+', we have a possessive quantifier. This
- implies greediness, whatever the setting of the PCRE_UNGREEDY option.
- If the next character is '?' this is a minimizing repeat, by default,
- but if PCRE_UNGREEDY is set, it works the other way round. We change the
- repeat type to the non-default. */
-
- if (ptr[1] == '+')
- {
- repeat_type = 0; /* Force greedy */
- possessive_quantifier = TRUE;
- ptr++;
- }
- else if (ptr[1] == '?')
- {
- repeat_type = greedy_non_default;
- ptr++;
- }
- else repeat_type = greedy_default;
-
- /* If previous was a recursion, we need to wrap it inside brackets so that
- it can be replicated if necessary. */
-
- if (*previous == OP_RECURSE)
- {
- memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
- code += 1 + LINK_SIZE;
- *previous = OP_BRA;
- PUT(previous, 1, code - previous);
- *code = OP_KET;
- PUT(code, 1, code - previous);
- code += 1 + LINK_SIZE;
- }
-
- /* If previous was a string of characters, chop off the last one and use it
- as the subject of the repeat. If there was only one character, we can
- abolish the previous item altogether. If a one-char item has a minimum of
- more than one, ensure that it is set in reqchar - it might not be if a
- sequence such as x{3} is the first thing in a branch because the x will
- have gone into firstchar instead. */
-
- if (*previous == OP_CHARS)
- {
- int len = previous[1];
- if (len == 1)
- {
- c = previous[2];
- code = previous;
- if (repeat_min > 1) reqchar = c | req_caseopt;
- }
- else
- {
- c = previous[len+1];
- previous[1]--;
- code--;
- tempcode = code; /* Adjust position to be moved for '+' */
- }
-
- goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
- }
-
- /* If previous was a single negated character ([^a] or similar), we use
- one of the special opcodes, replacing it. The code is shared with single-
- character repeats by setting op_type to add a suitable offset into
- repeat_type. */
-
- else if (*previous == OP_NOT)
- {
- op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
- c = previous[1];
- code = previous;
- goto OUTPUT_SINGLE_REPEAT;
- }
-
- /* If previous was a character type match (\d or similar), abolish it and
- create a suitable repeat item. The code is shared with single-character
- repeats by setting op_type to add a suitable offset into repeat_type. */
-
- else if (*previous < OP_EODN)
- {
- op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
- c = *previous;
- code = previous;
-
- OUTPUT_SINGLE_REPEAT:
-
- /* If the maximum is zero then the minimum must also be zero; Perl allows
- this case, so we do too - by simply omitting the item altogether. */
-
- if (repeat_max == 0) goto END_REPEAT;
-
- /* Combine the op_type with the repeat_type */
-
- repeat_type += op_type;
-
- /* A minimum of zero is handled either as the special case * or ?, or as
- an UPTO, with the maximum given. */
-
- if (repeat_min == 0)
- {
- if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
- else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
- else
- {
- *code++ = OP_UPTO + repeat_type;
- PUT2INC(code, 0, repeat_max);
- }
- }
-
- /* The case {1,} is handled as the special case + */
-
- else if (repeat_min == 1 && repeat_max == -1)
- *code++ = OP_PLUS + repeat_type;
-
- /* The case {n,n} is just an EXACT, while the general case {n,m} is
- handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
-
- else
- {
- if (repeat_min != 1)
- {
- *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
- PUT2INC(code, 0, repeat_min);
- }
-
- /* If the minimum is 1 and the previous item was a character string,
- we either have to put back the item that got cancelled if the string
- length was 1, or add the character back onto the end of a longer
- string. For a character type nothing need be done; it will just get
- put back naturally. Note that the final character is always going to
- get added below. */
-
- else if (*previous == OP_CHARS)
- {
- if (code == previous) code += 2; else previous[1]++;
- }
-
- /* For a single negated character we also have to put back the
- item that got cancelled. */
-
- else if (*previous == OP_NOT) code++;
-
- /* If the maximum is unlimited, insert an OP_STAR. */
-
- if (repeat_max < 0)
- {
- *code++ = c;
- *code++ = OP_STAR + repeat_type;
- }
-
- /* Else insert an UPTO if the max is greater than the min. */
-
- else if (repeat_max != repeat_min)
- {
- *code++ = c;
- repeat_max -= repeat_min;
- *code++ = OP_UPTO + repeat_type;
- PUT2INC(code, 0, repeat_max);
- }
- }
-
- /* The character or character type itself comes last in all cases. */
-
- *code++ = c;
- }
-
- /* If previous was a character class or a back reference, we put the repeat
- stuff after it, but just skip the item if the repeat was {0,0}. */
-
- else if (*previous == OP_CLASS || *previous == OP_REF)
- {
- if (repeat_max == 0)
- {
- code = previous;
- goto END_REPEAT;
- }
- if (repeat_min == 0 && repeat_max == -1)
- *code++ = OP_CRSTAR + repeat_type;
- else if (repeat_min == 1 && repeat_max == -1)
- *code++ = OP_CRPLUS + repeat_type;
- else if (repeat_min == 0 && repeat_max == 1)
- *code++ = OP_CRQUERY + repeat_type;
- else
- {
- *code++ = OP_CRRANGE + repeat_type;
- PUT2INC(code, 0, repeat_min);
- if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
- PUT2INC(code, 0, repeat_max);
- }
- }
-
- /* If previous was a bracket group, we may have to replicate it in certain
- cases. */
-
- else if (*previous >= OP_BRA || *previous == OP_ONCE ||
- *previous == OP_COND)
- {
- register int i;
- int ketoffset = 0;
- int len = code - previous;
- uschar *bralink = NULL;
-
- /* If the maximum repeat count is unlimited, find the end of the bracket
- by scanning through from the start, and compute the offset back to it
- from the current code pointer. There may be an OP_OPT setting following
- the final KET, so we can't find the end just by going back from the code
- pointer. */
-
- if (repeat_max == -1)
- {
- register uschar *ket = previous;
- do ket += GET(ket, 1); while (*ket != OP_KET);
- ketoffset = code - ket;
- }
-
- /* The case of a zero minimum is special because of the need to stick
- OP_BRAZERO in front of it, and because the group appears once in the
- data, whereas in other cases it appears the minimum number of times. For
- this reason, it is simplest to treat this case separately, as otherwise
- the code gets far too messy. There are several special subcases when the
- minimum is zero. */
-
- if (repeat_min == 0)
- {
- /* If the maximum is also zero, we just omit the group from the output
- altogether. */
-
- if (repeat_max == 0)
- {
- code = previous;
- goto END_REPEAT;
- }
-
- /* If the maximum is 1 or unlimited, we just have to stick in the
- BRAZERO and do no more at this point. */
-
- if (repeat_max <= 1)
- {
- memmove(previous+1, previous, len);
- code++;
- *previous++ = OP_BRAZERO + repeat_type;
- }
-
- /* If the maximum is greater than 1 and limited, we have to replicate
- in a nested fashion, sticking OP_BRAZERO before each set of brackets.
- The first one has to be handled carefully because it's the original
- copy, which has to be moved up. The remainder can be handled by code
- that is common with the non-zero minimum case below. We just have to
- adjust the value of repeat_max, since one less copy is required. */
-
- else
- {
- int offset;
- memmove(previous + 2 + LINK_SIZE, previous, len);
- code += 2 + LINK_SIZE;
- *previous++ = OP_BRAZERO + repeat_type;
- *previous++ = OP_BRA;
-
- /* We chain together the bracket offset fields that have to be
- filled in later when the ends of the brackets are reached. */
-
- offset = (bralink == NULL)? 0 : previous - bralink;
- bralink = previous;
- PUTINC(previous, 0, offset);
- }
-
- repeat_max--;
- }
-
- /* If the minimum is greater than zero, replicate the group as many
- times as necessary, and adjust the maximum to the number of subsequent
- copies that we need. If we set a first char from the group, and didn't
- set a required char, copy the latter from the former. */
-
- else
- {
- if (repeat_min > 1)
- {
- if (groupsetfirstchar && reqchar < 0) reqchar = firstchar;
- for (i = 1; i < repeat_min; i++)
- {
- memcpy(code, previous, len);
- code += len;
- }
- }
- if (repeat_max > 0) repeat_max -= repeat_min;
- }
-
- /* This code is common to both the zero and non-zero minimum cases. If
- the maximum is limited, it replicates the group in a nested fashion,
- remembering the bracket starts on a stack. In the case of a zero minimum,
- the first one was set up above. In all cases the repeat_max now specifies
- the number of additional copies needed. */
-
- if (repeat_max >= 0)
- {
- for (i = repeat_max - 1; i >= 0; i--)
- {
- *code++ = OP_BRAZERO + repeat_type;
-
- /* All but the final copy start a new nesting, maintaining the
- chain of brackets outstanding. */
-
- if (i != 0)
- {
- int offset;
- *code++ = OP_BRA;
- offset = (bralink == NULL)? 0 : code - bralink;
- bralink = code;
- PUTINC(code, 0, offset);
- }
-
- memcpy(code, previous, len);
- code += len;
- }
-
- /* Now chain through the pending brackets, and fill in their length
- fields (which are holding the chain links pro tem). */
-
- while (bralink != NULL)
- {
- int oldlinkoffset;
- int offset = code - bralink + 1;
- uschar *bra = code - offset;
- oldlinkoffset = GET(bra, 1);
- bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
- *code++ = OP_KET;
- PUTINC(code, 0, offset);
- PUT(bra, 1, offset);
- }
- }
-
- /* If the maximum is unlimited, set a repeater in the final copy. We
- can't just offset backwards from the current code point, because we
- don't know if there's been an options resetting after the ket. The
- correct offset was computed above. */
-
- else code[-ketoffset] = OP_KETRMAX + repeat_type;
- }
-
- /* Else there's some kind of shambles */
-
- else
- {
- *errorptr = ERR11;
- goto FAILED;
- }
-
- /* If the character following a repeat is '+', we wrap the entire repeated
- item inside OP_ONCE brackets. This is just syntactic sugar, taken from
- Sun's Java package. The repeated item starts at tempcode, not at previous,
- which might be the first part of a string whose (former) last char we
- repeated. However, we don't support '+' after a greediness '?'. */
-
- if (possessive_quantifier)
- {
- int len = code - tempcode;
- memmove(tempcode + 1+LINK_SIZE, tempcode, len);
- code += 1 + LINK_SIZE;
- len += 1 + LINK_SIZE;
- tempcode[0] = OP_ONCE;
- *code++ = OP_KET;
- PUTINC(code, 0, len);
- PUT(tempcode, 1, len);
- }
-
- /* In all cases we no longer have a previous item. */
-
- END_REPEAT:
- previous = NULL;
- break;
-
-
- /* Start of nested bracket sub-expression, or comment or lookahead or
- lookbehind or option setting or condition. First deal with special things
- that can come after a bracket; all are introduced by ?, and the appearance
- of any of them means that this is not a referencing group. They were
- checked for validity in the first pass over the string, so we don't have to
- check for syntax errors here. */
-
- case '(':
- newoptions = options;
- skipbytes = 0;
-
- if (*(++ptr) == '?')
- {
- int set, unset;
- int *optset;
-
- switch (*(++ptr))
- {
- case '#': /* Comment; skip to ket */
- ptr++;
- while (*ptr != ')') ptr++;
- continue;
-
- case ':': /* Non-extracting bracket */
- bravalue = OP_BRA;
- ptr++;
- break;
-
- case '(':
- bravalue = OP_COND; /* Conditional group */
-
- /* Condition to test for recursion */
-
- if (ptr[1] == 'R')
- {
- code[1+LINK_SIZE] = OP_CREF;
- PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
- skipbytes += 1+LINK_SIZE;
- ptr += 3;
- }
-
- /* Condition to test for a numbered subpattern match */
-
- else if ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
- {
- int condref = *(++ptr) - '0';
- while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
- if (condref == 0)
- {
- *errorptr = ERR35;
- goto FAILED;
- }
- ptr++;
- code[1+LINK_SIZE] = OP_CREF;
- PUT2(code, 2+LINK_SIZE, condref);
- skipbytes = 3;
- }
- /* For conditions that are assertions, we just fall through, having
- set bravalue above. */
- break;
-
- case '=': /* Positive lookahead */
- bravalue = OP_ASSERT;
- ptr++;
- break;
-
- case '!': /* Negative lookahead */
- bravalue = OP_ASSERT_NOT;
- ptr++;
- break;
-
- case '<': /* Lookbehinds */
- switch (*(++ptr))
- {
- case '=': /* Positive lookbehind */
- bravalue = OP_ASSERTBACK;
- ptr++;
- break;
-
- case '!': /* Negative lookbehind */
- bravalue = OP_ASSERTBACK_NOT;
- ptr++;
- break;
- }
- break;
-
- case '>': /* One-time brackets */
- bravalue = OP_ONCE;
- ptr++;
- break;
-
- case 'C': /* Callout - may be followed by digits */
- *code++ = OP_CALLOUT;
- {
- int n = 0;
- while ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
- n = n * 10 + *ptr - '0';
- if (n > 255)
- {
- *errorptr = ERR38;
- goto FAILED;
- }
- *code++ = n;
- }
- previous = NULL;
- continue;
-
- case 'P': /* Named subpattern handling */
- if (*(++ptr) == '<') /* Definition */
- {
- int i, namelen;
- const uschar *name = ++ptr;
- uschar *slot = cd->name_table;
-
- while (*ptr++ != '>');
- namelen = ptr - name - 1;
-
- for (i = 0; i < cd->names_found; i++)
- {
- int c = strncmp(name, slot+2, namelen);
- if (c == 0)
- {
- *errorptr = ERR43;
- goto FAILED;
- }
- if (c < 0)
- {
- memmove(slot + cd->name_entry_size, slot,
- (cd->names_found - i) * cd->name_entry_size);
- break;
- }
- slot += cd->name_entry_size;
- }
-
- PUT2(slot, 0, *brackets + 1);
- memcpy(slot + 2, name, namelen);
- slot[2+namelen] = 0;
- cd->names_found++;
- goto NUMBERED_GROUP;
- }
-
- if (*ptr == '=' || *ptr == '>') /* Reference or recursion */
- {
- int i, namelen;
- int type = *ptr++;
- const uschar *name = ptr;
- uschar *slot = cd->name_table;
-
- while (*ptr != ')') ptr++;
- namelen = ptr - name;
-
- for (i = 0; i < cd->names_found; i++)
- {
- if (strncmp(name, slot+2, namelen) == 0) break;
- slot += cd->name_entry_size;
- }
- if (i >= cd->names_found)
- {
- *errorptr = ERR15;
- goto FAILED;
- }
-
- recno = GET2(slot, 0);
-
- if (type == '>') goto HANDLE_RECURSION; /* A few lines below */
-
- /* Back reference */
-
- previous = code;
- *code++ = OP_REF;
- PUT2INC(code, 0, recno);
- continue;
- }
-
- /* Should never happen */
- break;
-
- case 'R': /* Pattern recursion */
- ptr++; /* Same as (?0) */
- /* Fall through */
-
- /* Recursion or "subroutine" call */
-
- case '0': case '1': case '2': case '3': case '4':
- case '5': case '6': case '7': case '8': case '9':
- {
- const uschar *called;
- recno = 0;
-
- while ((cd->ctypes[*ptr] & ctype_digit) != 0)
- recno = recno * 10 + *ptr++ - '0';
-
- /* Come here from code above that handles a named recursion */
-
- HANDLE_RECURSION:
-
- previous = code;
-
- /* Find the bracket that is being referenced. Temporarily end the
- regex in case it doesn't exist. */
-
- *code = OP_END;
- called = (recno == 0)?
- cd->start_code : find_bracket(cd->start_code, recno);
- if (called == NULL)
- {
- *errorptr = ERR15;
- goto FAILED;
- }
-
- /* If the subpattern is still open, this is a recursive call. We
- check to see if this is a left recursion that could loop for ever,
- and diagnose that case. */
-
- if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr))
- {
- *errorptr = ERR40;
- goto FAILED;
- }
-
- /* Insert the recursion/subroutine item */
-
- *code = OP_RECURSE;
- PUT(code, 1, called - cd->start_code);
- code += 1 + LINK_SIZE;
- }
- continue;
-
- /* Character after (? not specially recognized */
-
- default: /* Option setting */
- set = unset = 0;
- optset = &set;
-
- while (*ptr != ')' && *ptr != ':')
- {
- switch (*ptr++)
- {
- case '-': optset = &unset; break;
-
- case 'i': *optset |= PCRE_CASELESS; break;
- case 'm': *optset |= PCRE_MULTILINE; break;
- case 's': *optset |= PCRE_DOTALL; break;
- case 'x': *optset |= PCRE_EXTENDED; break;
- case 'U': *optset |= PCRE_UNGREEDY; break;
- case 'X': *optset |= PCRE_EXTRA; break;
- }
- }
-
- /* Set up the changed option bits, but don't change anything yet. */
-
- newoptions = (options | set) & (~unset);
-
- /* If the options ended with ')' this is not the start of a nested
- group with option changes, so the options change at this level. Compile
- code to change the ims options if this setting actually changes any of
- them. We also pass the new setting back so that it can be put at the
- start of any following branches, and when this group ends (if we are in
- a group), a resetting item can be compiled.
-
- Note that if this item is right at the start of the pattern, the
- options will have been abstracted and made global, so there will be no
- change to compile. */
-
- if (*ptr == ')')
- {
- if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
- {
- *code++ = OP_OPT;
- *code++ = newoptions & PCRE_IMS;
- }
-
- /* Change options at this level, and pass them back for use
- in subsequent branches. Reset the greedy defaults and the case
- value for firstchar and reqchar. */
-
- *optionsptr = options = newoptions;
- greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
- greedy_non_default = greedy_default ^ 1;
- req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
-
- previous = NULL; /* This item can't be repeated */
- continue; /* It is complete */
- }
-
- /* If the options ended with ':' we are heading into a nested group
- with possible change of options. Such groups are non-capturing and are
- not assertions of any kind. All we need to do is skip over the ':';
- the newoptions value is handled below. */
-
- bravalue = OP_BRA;
- ptr++;
- }
- }
-
- /* Else we have a referencing group; adjust the opcode. If the bracket
- number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
- arrange for the true number to follow later, in an OP_BRANUMBER item. */
-
- else
- {
- NUMBERED_GROUP:
- if (++(*brackets) > EXTRACT_BASIC_MAX)
- {
- bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
- code[1+LINK_SIZE] = OP_BRANUMBER;
- PUT2(code, 2+LINK_SIZE, *brackets);
- skipbytes = 3;
- }
- else bravalue = OP_BRA + *brackets;
- }
-
- /* Process nested bracketed re. Assertions may not be repeated, but other
- kinds can be. We copy code into a non-register variable in order to be able
- to pass its address because some compilers complain otherwise. Pass in a
- new setting for the ims options if they have changed. */
-
- previous = (bravalue >= OP_ONCE)? code : NULL;
- *code = bravalue;
- tempcode = code;
-
- if (!compile_regex(
- newoptions, /* The complete new option state */
- options & PCRE_IMS, /* The previous ims option state */
- brackets, /* Extracting bracket count */
- &tempcode, /* Where to put code (updated) */
- &ptr, /* Input pointer (updated) */
- errorptr, /* Where to put an error message */
- (bravalue == OP_ASSERTBACK ||
- bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
- skipbytes, /* Skip over OP_COND/OP_BRANUMBER */
- &subfirstchar, /* For possible first char */
- &subreqchar, /* For possible last char */
- bcptr, /* Current branch chain */
- cd)) /* Tables block */
- goto FAILED;
-
- /* At the end of compiling, code is still pointing to the start of the
- group, while tempcode has been updated to point past the end of the group
- and any option resetting that may follow it. The pattern pointer (ptr)
- is on the bracket. */
-
- /* If this is a conditional bracket, check that there are no more than
- two branches in the group. */
-
- else if (bravalue == OP_COND)
- {
- uschar *tc = code;
- condcount = 0;
-
- do {
- condcount++;
- tc += GET(tc,1);
- }
- while (*tc != OP_KET);
-
- if (condcount > 2)
- {
- *errorptr = ERR27;
- goto FAILED;
- }
-
- /* If there is just one branch, we must not make use of its firstchar or
- reqchar, because this is equivalent to an empty second branch. */
-
- if (condcount == 1) subfirstchar = subreqchar = REQ_NONE;
- }
-
- /* Handle updating of the required and first characters. Update for normal
- brackets of all kinds, and conditions with two branches (see code above).
- If the bracket is followed by a quantifier with zero repeat, we have to
- back off. Hence the definition of zeroreqchar and zerofirstchar outside the
- main loop so that they can be accessed for the back off. */
-
- zeroreqchar = reqchar;
- zerofirstchar = firstchar;
- groupsetfirstchar = FALSE;
-
- if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
- {
- /* If we have not yet set a firstchar in this branch, take it from the
- subpattern, remembering that it was set here so that a repeat of more
- than one can replicate it as reqchar if necessary. If the subpattern has
- no firstchar, set "none" for the whole branch. In both cases, a zero
- repeat forces firstchar to "none". */
-
- if (firstchar == REQ_UNSET)
- {
- if (subfirstchar >= 0)
- {
- firstchar = subfirstchar;
- groupsetfirstchar = TRUE;
- }
- else firstchar = REQ_NONE;
- zerofirstchar = REQ_NONE;
- }
-
- /* If firstchar was previously set, convert the subpattern's firstchar
- into reqchar if there wasn't one. */
-
- else if (subfirstchar >= 0 && subreqchar < 0) subreqchar = subfirstchar;
-
- /* If the subpattern set a required char (or set a first char that isn't
- really the first char - see above), set it. */
-
- if (subreqchar >= 0) reqchar = subreqchar;
- }
-
- /* For a forward assertion, we take the reqchar, if set. This can be
- helpful if the pattern that follows the assertion doesn't set a different
- char. For example, it's useful for /(?=abcde).+/. We can't set firstchar
- for an assertion, however because it leads to incorrect effect for patterns
- such as /(?=a)a.+/ when the "real" "a" would then become a reqchar instead
- of a firstchar. This is overcome by a scan at the end if there's no
- firstchar, looking for an asserted first char. */
-
- else if (bravalue == OP_ASSERT && subreqchar >= 0) reqchar = subreqchar;
-
- /* Now update the main code pointer to the end of the group. */
-
- code = tempcode;
-
- /* Error if hit end of pattern */
-
- if (*ptr != ')')
- {
- *errorptr = ERR14;
- goto FAILED;
- }
- break;
-
- /* Check \ for being a real metacharacter; if not, fall through and handle
- it as a data character at the start of a string. Escape items are checked
- for validity in the pre-compiling pass. */
-
- case '\\':
- tempptr = ptr;
- c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
-
- /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
- are arranged to be the negation of the corresponding OP_values. For the
- back references, the values are ESC_REF plus the reference number. Only
- back references and those types that consume a character may be repeated.
- We can test for values between ESC_b and ESC_Z for the latter; this may
- have to change if any new ones are ever created. */
-
- if (c < 0)
- {
- if (-c == ESC_Q) /* Handle start of quoted string */
- {
- if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
- else inescq = TRUE;
- continue;
- }
-
- /* For metasequences that actually match a character, we disable the
- setting of a first character if it hasn't already been set. */
-
- if (firstchar == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
- firstchar = REQ_NONE;
-
- /* Set values to reset to if this is followed by a zero repeat. */
-
- zerofirstchar = firstchar;
- zeroreqchar = reqchar;
-
- /* Back references are handled specially */
-
- if (-c >= ESC_REF)
- {
- int number = -c - ESC_REF;
- previous = code;
- *code++ = OP_REF;
- PUT2INC(code, 0, number);
- }
- else
- {
- previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
- *code++ = -c;
- }
- continue;
- }
-
- /* Data character: reset and fall through */
-
- ptr = tempptr;
- c = '\\';
-
- /* Handle a run of data characters until a metacharacter is encountered.
- The first character is guaranteed not to be whitespace or # when the
- extended flag is set. */
-
- NORMAL_CHAR:
- default:
- previous = code;
- *code = OP_CHARS;
- code += 2;
- length = 0;
-
- do
- {
- /* If in \Q...\E, check for the end; if not, we always have a literal */
-
- if (inescq)
- {
- if (c == '\\' && ptr[1] == 'E')
- {
- inescq = FALSE;
- ptr++;
- }
- else
- {
- *code++ = c;
- length++;
- }
- continue;
- }
-
- /* Skip white space and comments for /x patterns */
-
- if ((options & PCRE_EXTENDED) != 0)
- {
- if ((cd->ctypes[c] & ctype_space) != 0) continue;
- if (c == '#')
- {
- /* The space before the ; is to avoid a warning on a silly compiler
- on the Macintosh. */
- while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
- if (c == 0) break;
- continue;
- }
- }
-
- /* Backslash may introduce a data char or a metacharacter. Escaped items
- are checked for validity in the pre-compiling pass. Stop the string
- before a metaitem. */
-
- if (c == '\\')
- {
- tempptr = ptr;
- c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
- if (c < 0) { ptr = tempptr; break; }
-
- /* If a character is > 127 in UTF-8 mode, we have to turn it into
- two or more characters in the UTF-8 encoding. */
-
-#ifdef SUPPORT_UTF8
- if (c > 127 && (options & PCRE_UTF8) != 0)
- {
- uschar buffer[8];
- int len = ord2utf8(c, buffer);
- for (c = 0; c < len; c++) *code++ = buffer[c];
- length += len;
- continue;
- }
-#endif
- }
-
- /* Ordinary character or single-char escape */
-
- *code++ = c;
- length++;
- }
-
- /* This "while" is the end of the "do" above. */
-
- while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
-
- /* Update the first and last character */
-
- if (firstchar == REQ_UNSET)
- {
- if (length > 1)
- {
- zerofirstchar = firstchar = previous[2] | req_caseopt;
- zeroreqchar = (length > 2)? (code[-2] | req_caseopt) : reqchar;
- reqchar = code[-1] | req_caseopt;
- }
- else
- {
- zerofirstchar = REQ_NONE;
- firstchar = code[-1] | req_caseopt;
- zeroreqchar = reqchar;
- }
- }
- else /* firstchar previously set */
- {
- zerofirstchar = firstchar;
- zeroreqchar = (length > 1)? (code[-2] | req_caseopt) : reqchar;
- reqchar = code[-1] | req_caseopt;
- }
-
- /* Set the length in the data vector, and advance to the next state. */
-
- previous[1] = length;
- if (length < MAXLIT) ptr--;
- break;
- }
- } /* end of big loop */
-
-/* Control never reaches here by falling through, only by a goto for all the
-error states. Pass back the position in the pattern so that it can be displayed
-to the user for diagnosing the error. */
-
-FAILED:
-*ptrptr = ptr;
-return FALSE;
-}
-
-
-
-
-/*************************************************
-* Compile sequence of alternatives *
-*************************************************/
-
-/* On entry, ptr is pointing past the bracket character, but on return
-it points to the closing bracket, or vertical bar, or end of string.
-The code variable is pointing at the byte into which the BRA operator has been
-stored. If the ims options are changed at the start (for a (?ims: group) or
-during any branch, we need to insert an OP_OPT item at the start of every
-following branch to ensure they get set correctly at run time, and also pass
-the new options into every subsequent branch compile.
-
-Arguments:
- options option bits, including any changes for this subpattern
- oldims previous settings of ims option bits
- brackets -> int containing the number of extracting brackets used
- codeptr -> the address of the current code pointer
- ptrptr -> the address of the current pattern pointer
- errorptr -> pointer to error message
- lookbehind TRUE if this is a lookbehind assertion
- skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER)
- firstcharptr place to put the first required character, or a negative number
- reqcharptr place to put the last required character, or a negative number
- bcptr pointer to the chain of currently open branches
- cd points to the data block with tables pointers etc.
-
-Returns: TRUE on success
-*/
-
-static BOOL
-compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
-  const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int skipbytes,
-  int *firstcharptr, int *reqcharptr, branch_chain *bcptr, compile_data *cd)
-{
-const uschar *pattern = *ptrptr;    /* Next pattern character to read */
-uschar *out = *codeptr;             /* Next byte of compiled code to write */
-uschar *group_start = out;          /* The item that opens this group */
-uschar *branch_start = out;         /* Item heading the current branch */
-uschar *reverse_slot = NULL;        /* Length field of the latest OP_REVERSE */
-int group_first = REQ_UNSET;        /* First/required chars for the group */
-int group_req = REQ_UNSET;
-int branch_first, branch_req;       /* The same, for one branch */
-int span;
-branch_chain chain;
-
-chain.outer = bcptr;
-chain.current = out;
-
-/* A zero link in the opening item marks the bracket as still being open. */
-
-PUT(out, 1, 0);
-out += 1 + LINK_SIZE + skipbytes;
-
-/* One iteration per alternative; we leave the loop when the branch just
-compiled is not followed by a vertical bar. */
-
-for (;;)
-  {
-  /* Emit the ims option state if it differs from the state outside. */
-
-  if ((options & PCRE_IMS) != oldims)
-    {
-    *out++ = OP_OPT;
-    *out++ = options & PCRE_IMS;
-    }
-
-  /* A lookbehind branch starts with OP_REVERSE; its length field is zero for
-  now and is filled in once the branch has been compiled. */
-
-  if (lookbehind)
-    {
-    *out++ = OP_REVERSE;
-    reverse_slot = out;
-    PUTINC(out, 0, 0);
-    }
-
-  /* Compile the branch itself. On failure, hand back the pattern position. */
-
-  if (!compile_branch(&options, brackets, &out, &pattern, errorptr,
-        &branch_first, &branch_req, &chain, cd))
-    {
-    *ptrptr = pattern;
-    return FALSE;
-    }
-
-  /* Merge this branch's first and required characters into the values for
-  the whole group. A branch that is headed by OP_ALT is a second or later
-  alternative, and has to agree with what went before. */
-
-  if (*branch_start == OP_ALT)
-    {
-    /* A conflicting first char is abandoned; if there was no required char,
-    the old first char is demoted to that role. */
-
-    if (group_first >= 0 && group_first != branch_first)
-      {
-      if (group_req < 0) group_req = group_first;
-      group_first = REQ_NONE;
-      }
-
-    /* With no group first char, the branch's first char stands in for a
-    missing branch required char. */
-
-    if (group_first < 0 && branch_first >= 0 && branch_req < 0)
-      branch_req = branch_first;
-
-    /* The required chars must agree across branches. */
-
-    if (group_req != branch_req) group_req = REQ_NONE;
-    }
-
-  /* Otherwise this is the first branch: adopt its values directly. */
-
-  else
-    {
-    group_first = branch_first;
-    group_req = branch_req;
-    }
-
-  /* For a lookbehind, compute the branch's fixed length (with the branch
-  temporarily terminated by OP_END) and store it in the OP_REVERSE item. A
-  negative result is an error. */
-
-  if (lookbehind)
-    {
-    int fixed;
-    *out = OP_END;
-    fixed = find_fixedlength(branch_start, options);
-    DPRINTF(("fixed length = %d\n", fixed));
-    if (fixed < 0)
-      {
-      *errorptr = (fixed == -2)? ERR36 : ERR25;
-      *ptrptr = pattern;
-      return FALSE;
-      }
-    PUT(reverse_slot, 0, fixed);
-    }
-
-  /* Anything other than '|' ends the group. */
-
-  if (*pattern != '|') break;
-
-  /* Another branch follows. Insert an OP_ALT whose link field points back to
-  the previous branch for the time being; the chain is reversed when the group
-  is closed, which keeps the link in the opening item zero until then. */
-
-  *out = OP_ALT;
-  PUT(out, 1, out - branch_start);
-  chain.current = branch_start = out;
-  out += 1 + LINK_SIZE;
-  pattern++;
-  }
-
-/* The group is complete. Walk back along the chain of branches, replacing each
-backward link by the distance forward to the next branch (or to the end, for
-the last one). The walk stops at the item whose old link was zero. */
-
-span = out - branch_start;
-do
-  {
-  int earlier = GET(branch_start, 1);
-  PUT(branch_start, 1, span);
-  span = earlier;
-  branch_start -= span;
-  }
-while (span > 0);
-
-/* Add the closing OP_KET, whose link is the length of the whole group. */
-
-*out = OP_KET;
-PUT(out, 1, out - group_start);
-out += 1 + LINK_SIZE;
-
-/* If the ims options changed inside the group and we stopped on ')', compile
-an item that restores the previous setting. */
-
-if ((options & PCRE_IMS) != oldims && *pattern == ')')
-  {
-  *out++ = OP_OPT;
-  *out++ = oldims;
-  }
-
-/* Pass everything back to the caller. */
-
-*codeptr = out;
-*ptrptr = pattern;
-*firstcharptr = group_first;
-*reqcharptr = group_req;
-return TRUE;
-}
-
-
-
-
-/*************************************************
-* Check for anchored expression *
-*************************************************/
-
-/* Try to find out if this is an anchored regular expression. Consider each
-alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
-all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
-it's anchored. However, if this is a multiline pattern, then only OP_SOD
-counts, since OP_CIRC can match in the middle.
-
-A branch is also implicitly anchored if it starts with .* and DOTALL is set,
-because that will try the rest of the pattern at all possible matching points,
-so there is no point trying again.... er ....
-
-.... except when the .* appears inside capturing parentheses, and there is a
-subsequent back reference to those parentheses. We haven't enough information
-to catch that case precisely. The best we can do is to detect when .* is in
-capturing brackets and the highest back reference is greater than or equal to
-that level.
-
-Arguments:
- code points to start of expression (the bracket)
- options points to the options setting
- in_brackets TRUE if inside capturing parentheses
- top_backref the highest back reference in the regex
-
-Returns: TRUE or FALSE
-*/
-
-static BOOL
-is_anchored(register const uschar *code, int *options, BOOL in_brackets,
-  int top_backref)
-{
-for (;;)
-  {
-  const uschar *first =
-    first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE);
-  int op = *first;
-  BOOL branch_ok;
-
-  /* Any kind of bracket: every alternative inside it must be anchored. An
-  opcode above OP_BRA is a capturing bracket, so for those the nested call is
-  told that it is inside capturing parentheses. */
-
-  if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
-    {
-    BOOL capturing = (op > OP_BRA)? TRUE : in_brackets;
-    branch_ok = is_anchored(first, options, capturing, top_backref);
-    }
-
-  /* A starred type item with DOTALL set anchors the branch only if the type
-  is OP_ANY, and not when we are inside capturing brackets in a pattern that
-  contains back references. */
-
-  else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
-           (*options & PCRE_DOTALL) != 0)
-    {
-    branch_ok = (first[1] == OP_ANY && !(in_brackets && top_backref > 0));
-    }
-
-  /* Otherwise we need an explicit anchor: OP_SOD always counts, OP_CIRC only
-  when MULTILINE is not set. */
-
-  else
-    {
-    branch_ok = (op == OP_SOD ||
-      ((*options & PCRE_MULTILINE) == 0 && op == OP_CIRC));
-    }
-
-  if (!branch_ok) return FALSE;
-
-  /* Move on to the next alternative, if there is one. */
-
-  code += GET(code, 1);
-  if (*code != OP_ALT) return TRUE;
-  }
-}
-
-
-
-/*************************************************
-* Check for starting with ^ or .* *
-*************************************************/
-
-/* This is called to find out if every branch starts with ^ or .* so that
-"first char" processing can be done to speed things up in multiline
-matching and for non-DOTALL patterns that start with .* (which must start at
-the beginning or after \n). As in the case of is_anchored() (see above), we
-have to take account of back references to capturing brackets that contain .*
-because in that case we can't make the assumption.
-
-Arguments:
- code points to start of expression (the bracket)
- in_brackets TRUE if inside capturing parentheses
- top_backref the highest back reference in the regex
-
-Returns: TRUE or FALSE
-*/
-
-static BOOL
-is_startline(const uschar *code, BOOL in_brackets, int top_backref)
-{
-for (;;)
-  {
-  const uschar *first = first_significant_code(code + 1+LINK_SIZE, NULL, 0);
-  int op = *first;
-  BOOL branch_ok;
-
-  /* Any kind of bracket: every alternative inside it must qualify. An opcode
-  above OP_BRA is a capturing bracket, so for those the nested call is told
-  that it is inside capturing parentheses. */
-
-  if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
-    {
-    BOOL capturing = (op > OP_BRA)? TRUE : in_brackets;
-    branch_ok = is_startline(first, capturing, top_backref);
-    }
-
-  /* A starred type item qualifies only if the type is OP_ANY, and not when
-  we are inside capturing brackets in a pattern that contains back
-  references. */
-
-  else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
-    {
-    branch_ok = (first[1] == OP_ANY && !(in_brackets && top_backref > 0));
-    }
-
-  /* Otherwise only an explicit circumflex will do. */
-
-  else
-    {
-    branch_ok = (op == OP_CIRC);
-    }
-
-  if (!branch_ok) return FALSE;
-
-  /* Move on to the next alternative, if there is one. */
-
-  code += GET(code, 1);
-  if (*code != OP_ALT) return TRUE;
-  }
-}
-
-
-
-/*************************************************
-* Check for asserted fixed first char *
-*************************************************/
-
-/* During compilation, the "first char" settings from forward assertions are
-discarded, because they can cause conflicts with actual literals that follow.
-However, if we end up without a first char setting for an unanchored pattern,
-it is worth scanning the regex to see if there is an initial asserted first
-char. If all branches start with the same asserted char, or with a bracket all
-of whose alternatives start with the same asserted char (recurse ad lib), then
-we return that char, otherwise -1.
-
-Arguments:
- code points to start of expression (the bracket)
- options pointer to the options (used to check casing changes)
- inassert TRUE if in an assertion
-
-Returns: -1 or the fixed first char
-*/
-
-static int
-find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
-{
-int found = -1;     /* Character agreed on so far; negative means none yet */
-
-do
-  {
-  const uschar *item =
-    first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS);
-  int op = *item;
-
-  /* Treat every capturing bracket as a plain OP_BRA. */
-
-  if (op >= OP_BRA) op = OP_BRA;
-
-  /* Brackets of any kind: look inside. The nested scan counts as being in an
-  assertion only when the bracket itself is OP_ASSERT. Its result must agree
-  with whatever the earlier alternatives produced. */
-
-  if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
-    {
-    int inner = find_firstassertedchar(item, options, op == OP_ASSERT);
-    if (inner < 0) return -1;
-    if (found < 0) found = inner;
-    else if (found != inner) return -1;
-    }
-
-  /* Items that begin with a literal character. These are of interest only
-  inside an assertion. */
-
-  else if (op == OP_EXACT || op == OP_CHARS ||
-           op == OP_PLUS || op == OP_MINPLUS)
-    {
-    int ch;
-    if (!inassert) return -1;
-
-    /* For OP_PLUS and OP_MINPLUS the character is the byte after the opcode.
-    It lies one byte further on for OP_CHARS, and two for OP_EXACT. */
-
-    if (op == OP_EXACT) item += 2;
-    else if (op == OP_CHARS) item += 1;
-    ch = item[1];
-
-    /* The first character found is remembered, flagged as caseless if that
-    option is currently set; later ones are compared with the remembered
-    value. */
-
-    if (found < 0)
-      found = ((*options & PCRE_CASELESS) != 0)? (ch | REQ_CASELESS) : ch;
-    else if (found != ch) return -1;
-    }
-
-  /* Anything else means there is no asserted first character. */
-
-  else return -1;
-
-  code += GET(code, 1);
-  }
-while (*code == OP_ALT);
-
-return found;
-}
-
-
-
-
-/*************************************************
-* Compile a Regular Expression *
-*************************************************/
-
-/* This function takes a string and returns a pointer to a block of store
-holding a compiled version of the expression.
-
-The work is done in two passes over the pattern. The first pass only computes
-an upper bound on the amount of store needed for the compiled code (and picks
-up option settings at the very start of the pattern, the number of named
-groups and the highest back reference). The block is then obtained via
-pcre_malloc and the second pass, compile_regex(), generates the code into it.
-Finally the compiled code is inspected to see whether the pattern is anchored,
-has a fixed first character, or has a required character.
-
-Arguments:
- pattern the regular expression
- options various option bits
- errorptr pointer to pointer to error text
- erroroffset ptr offset in pattern where error was detected
- tables pointer to character tables or NULL
-
-Returns: pointer to compiled data block, or NULL on error,
- with errorptr and erroroffset set
-
-If errorptr is NULL no message can be passed back, so NULL is returned
-without touching anything. If erroroffset is NULL, *errorptr is set to ERR16
-and NULL is returned. The arguments are validated before anything else is
-written through them.
-*/
-
-pcre *
-pcre_compile(const char *pattern, int options, const char **errorptr,
- int *erroroffset, const unsigned char *tables)
-{
-real_pcre *re;
-int length = 1 + LINK_SIZE; /* For initial BRA plus length */
-int runlength;
-int c, firstchar, reqchar;
-int bracount = 0;
-int top_backref = 0;
-int branch_extra = 0;
-int branch_newextra;
-int item_count = -1;
-int name_count = 0;
-int max_name_size = 0;
-BOOL inescq = FALSE;
-unsigned int brastackptr = 0;
-size_t size;
-uschar *code;
-const uschar *codestart;
-const uschar *ptr;
-compile_data compile_block;
-int brastack[BRASTACK_SIZE];
-uschar bralenstack[BRASTACK_SIZE];
-
-/* We can't pass back an error message if errorptr is NULL; I guess the best we
-can do is just return NULL. This test must come before any other check that
-writes to *errorptr. */
-
-if (errorptr == NULL) return NULL;
-*errorptr = NULL;
-
-/* However, we can give a message for this error */
-
-if (erroroffset == NULL)
- {
- *errorptr = ERR16;
- return NULL;
- }
-*erroroffset = 0;
-
-/* Can't support UTF8 unless PCRE has been compiled to include the code. This
-is tested only after errorptr has been validated, because it stores a message
-through it. */
-
-#ifndef SUPPORT_UTF8
-if ((options & PCRE_UTF8) != 0)
- {
- *errorptr = ERR32;
- return NULL;
- }
-#endif
-
-if ((options & ~PUBLIC_OPTIONS) != 0)
- {
- *errorptr = ERR17;
- return NULL;
- }
-
-/* Set up pointers to the individual character tables */
-
-if (tables == NULL) tables = pcre_default_tables;
-compile_block.lcc = tables + lcc_offset;
-compile_block.fcc = tables + fcc_offset;
-compile_block.cbits = tables + cbits_offset;
-compile_block.ctypes = tables + ctypes_offset;
-
-/* Reflect pattern for debugging output */
-
-DPRINTF(("------------------------------------------------------------------\n"));
-DPRINTF(("%s\n", pattern));
-
-/* The first thing to do is to make a pass over the pattern to compute the
-amount of store required to hold the compiled code. This does not have to be
-perfect as long as errors are overestimates. At the same time we can detect any
-flag settings right at the start, and extract them. Make an attempt to correct
-for any counted white space if an "extended" flag setting appears late in the
-pattern. We can't be so clever for #-comments. */
-
-ptr = (const uschar *)(pattern - 1);
-while ((c = *(++ptr)) != 0)
- {
- int min, max;
- int class_charcount;
- int bracket_length;
- int duplength;
-
- /* If we are inside a \Q...\E sequence, all chars are literal */
-
- if (inescq) goto NORMAL_CHAR;
-
- /* Otherwise, first check for ignored whitespace and comments */
-
- if ((options & PCRE_EXTENDED) != 0)
- {
- if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
- if (c == '#')
- {
- /* The space before the ; is to avoid a warning on a silly compiler
- on the Macintosh. */
- while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
- if (c == 0) break;
- continue;
- }
- }
-
- item_count++; /* Is zero for the first non-comment item */
-
- switch(c)
- {
- /* A backslashed item may be an escaped "normal" character or a
- character type. For a "normal" character, put the pointers and
- character back so that tests for whitespace etc. in the input
- are done correctly. */
-
- case '\\':
- {
- const uschar *save_ptr = ptr;
- c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
- if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
- if (c >= 0)
- {
- ptr = save_ptr;
- c = '\\';
- goto NORMAL_CHAR;
- }
- }
-
- /* If \Q, enter "literal" mode */
-
- if (-c == ESC_Q)
- {
- inescq = TRUE;
- continue;
- }
-
- /* Other escapes need one byte */
-
- length++;
-
- /* A back reference needs an additional 2 bytes, plus either one or 5
- bytes for a repeat. We also need to keep the value of the highest
- back reference. */
-
- if (c <= -ESC_REF)
- {
- int refnum = -c - ESC_REF;
- if (refnum > top_backref) top_backref = refnum;
- length += 2; /* For single back reference */
- if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
- {
- ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
- if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
- if ((min == 0 && (max == 1 || max == -1)) ||
- (min == 1 && max == -1))
- length++;
- else length += 5;
- if (ptr[1] == '?') ptr++;
- }
- }
- continue;
-
- case '*': /* These repeats won't be after brackets; */
- case '+': /* those are handled separately */
- case '?':
- if (ptr[1] == '+') /* Handle "possessive quantifier" */
- {
- length += 2 + 2*LINK_SIZE;
- ptr++;
- }
- /* Fall through */
-
- case '^': /* Single-byte metacharacters */
- case '.':
- case '$':
- length++;
- continue;
-
- /* This covers the cases of repeats after a single char, metachar, class,
- or back reference. */
-
- case '{':
- if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
- ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
- if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
- if ((min == 0 && (max == 1 || max == -1)) ||
- (min == 1 && max == -1))
- length++;
- else
- {
- length--; /* Uncount the original char or metachar */
- if (min == 1) length++; else if (min > 0) length += 4;
- if (max > 0) length += 4; else length += 2;
- }
- if (ptr[1] == '?') ptr++; /* Needs no extra length */
- if (ptr[1] == '+') /* Possessive quantifier */
- {
- ptr++;
- length += 2 + 2*LINK_SIZE; /* Allow for atomic brackets */
- }
- continue;
-
- /* An alternation contains an offset to the next branch or ket. If any ims
- options changed in the previous branch(es), and/or if we are in a
- lookbehind assertion, extra space will be needed at the start of the
- branch. This is handled by branch_extra. */
-
- case '|':
- length += 1 + LINK_SIZE + branch_extra;
- continue;
-
- /* A character class uses 33 characters. Don't worry about character types
- that aren't allowed in classes - they'll get picked up during the compile.
- A character class that contains only one character uses 2 or 3 bytes,
- depending on whether it is negated or not. Notice this where we can. */
-
- case '[':
- class_charcount = 0;
- if (*(++ptr) == '^') ptr++;
-
- /* Written as a "do" so that an initial ']' is taken as data */
-
- if (*ptr != 0) do
- {
- if (*ptr == '\\')
- {
- int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
- &compile_block);
- if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
- if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
- }
-
- /* Check the syntax for POSIX stuff. The bits we actually handle are
- checked during the real compile phase. */
-
- else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block))
- {
- ptr++;
- class_charcount = 10; /* Make sure > 1 */
- }
-
- /* Anything else just counts as one char */
-
- else class_charcount++;
- }
- while (*(++ptr) != 0 && *ptr != ']'); /* Concludes "do" above */
-
- if (*ptr == 0) /* Missing terminating ']' */
- {
- *errorptr = ERR6;
- goto PCRE_ERROR_RETURN;
- }
-
- /* Repeats for negated single chars are handled by the general code */
-
- if (class_charcount == 1) length += 3; else
- {
- length += 33;
-
- /* A repeat needs either 1 or 5 bytes. */
-
- if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
- {
- ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
- if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
- if ((min == 0 && (max == 1 || max == -1)) ||
- (min == 1 && max == -1))
- length++;
- else length += 5;
- if (ptr[1] == '?') ptr++;
- }
- }
- continue;
-
- /* Brackets may be genuine groups or special things */
-
- case '(':
- branch_newextra = 0;
- bracket_length = 1 + LINK_SIZE;
-
- /* Handle special forms of bracket, which all start (? */
-
- if (ptr[1] == '?')
- {
- int set, unset;
- int *optset;
-
- switch (c = ptr[2])
- {
- /* Skip over comments entirely */
- case '#':
- ptr += 3;
- while (*ptr != 0 && *ptr != ')') ptr++;
- if (*ptr == 0)
- {
- *errorptr = ERR18;
- goto PCRE_ERROR_RETURN;
- }
- continue;
-
- /* Non-referencing groups and lookaheads just move the pointer on, and
- then behave like a non-special bracket, except that they don't increment
- the count of extracting brackets. Ditto for the "once only" bracket,
- which is in Perl from version 5.005. */
-
- case ':':
- case '=':
- case '!':
- case '>':
- ptr += 2;
- break;
-
- /* (?R) specifies a recursive call to the regex, which is an extension
- to provide the facility which can be obtained by (?p{perl-code}) in
- Perl 5.6. In Perl 5.8 this has become (??{perl-code}).
-
- From PCRE 4.00, items such as (?3) specify subroutine-like "calls" to
- the appropriate numbered brackets. This includes both recursive and
- non-recursive calls. (?R) is now synonymous with (?0). */
-
- case 'R':
- ptr++;
-
- case '0': case '1': case '2': case '3': case '4':
- case '5': case '6': case '7': case '8': case '9':
- ptr += 2;
- if (c != 'R')
- while ((compile_block.ctypes[*(++ptr)] & ctype_digit) != 0);
- if (*ptr != ')')
- {
- *errorptr = ERR29;
- goto PCRE_ERROR_RETURN;
- }
- length += 1 + LINK_SIZE;
-
- /* If this item is quantified, it will get wrapped inside brackets so
- as to use the code for quantified brackets. We jump down and use the
- code that handles this for real brackets. */
-
- if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{')
- {
- length += 2 + 2 * LINK_SIZE; /* to make bracketed */
- duplength = 5 + 3 * LINK_SIZE;
- goto HANDLE_QUANTIFIED_BRACKETS;
- }
- continue;
-
- /* (?C) is an extension which provides "callout" - to provide a bit of
- the functionality of the Perl (?{...}) feature. An optional number may
- follow (default is zero). */
-
- case 'C':
- ptr += 2;
- while ((compile_block.ctypes[*(++ptr)] & ctype_digit) != 0);
- if (*ptr != ')')
- {
- *errorptr = ERR39;
- goto PCRE_ERROR_RETURN;
- }
- length += 2;
- continue;
-
- /* Named subpatterns are an extension copied from Python */
-
- case 'P':
- ptr += 3;
- if (*ptr == '<')
- {
- const uschar *p = ++ptr;
- while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++;
- if (*ptr != '>')
- {
- *errorptr = ERR42;
- goto PCRE_ERROR_RETURN;
- }
- name_count++;
- if (ptr - p > max_name_size) max_name_size = (ptr - p);
- break;
- }
-
- if (*ptr == '=' || *ptr == '>')
- {
- while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0);
- if (*ptr != ')')
- {
- *errorptr = ERR42;
- goto PCRE_ERROR_RETURN;
- }
- break;
- }
-
- /* Unknown character after (?P */
-
- *errorptr = ERR41;
- goto PCRE_ERROR_RETURN;
-
- /* Lookbehinds are in Perl from version 5.005 */
-
- case '<':
- ptr += 3;
- if (*ptr == '=' || *ptr == '!')
- {
- branch_newextra = 1 + LINK_SIZE;
- length += 1 + LINK_SIZE; /* For the first branch */
- break;
- }
- *errorptr = ERR24;
- goto PCRE_ERROR_RETURN;
-
- /* Conditionals are in Perl from version 5.005. The bracket must either
- be followed by a number (for bracket reference) or by an assertion
- group, or (a PCRE extension) by 'R' for a recursion test. */
-
- case '(':
- if (ptr[3] == 'R' && ptr[4] == ')')
- {
- ptr += 4;
- length += 3;
- }
- else if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
- {
- ptr += 4;
- length += 3;
- while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
- if (*ptr != ')')
- {
- *errorptr = ERR26;
- goto PCRE_ERROR_RETURN;
- }
- }
- else /* An assertion must follow */
- {
- ptr++; /* Can treat like ':' as far as spacing is concerned */
- if (ptr[2] != '?' ||
- (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
- {
- ptr += 2; /* To get right offset in message */
- *errorptr = ERR28;
- goto PCRE_ERROR_RETURN;
- }
- }
- break;
-
- /* Else loop checking valid options until ) is met. Anything else is an
- error. If we are without any brackets, i.e. at top level, the settings
- act as if specified in the options, so massage the options immediately.
- This is for backward compatibility with Perl 5.004. */
-
- default:
- set = unset = 0;
- optset = &set;
- ptr += 2;
-
- for (;; ptr++)
- {
- c = *ptr;
- switch (c)
- {
- case 'i':
- *optset |= PCRE_CASELESS;
- continue;
-
- case 'm':
- *optset |= PCRE_MULTILINE;
- continue;
-
- case 's':
- *optset |= PCRE_DOTALL;
- continue;
-
- case 'x':
- *optset |= PCRE_EXTENDED;
- continue;
-
- case 'X':
- *optset |= PCRE_EXTRA;
- continue;
-
- case 'U':
- *optset |= PCRE_UNGREEDY;
- continue;
-
- case '-':
- optset = &unset;
- continue;
-
- /* A termination by ')' indicates an options-setting-only item; if
- this is at the very start of the pattern (indicated by item_count
- being zero), we use it to set the global options. This is helpful
- when analyzing the pattern for first characters, etc. Otherwise
- nothing is done here and it is handled during the compiling
- process.
-
- [Historical note: Up to Perl 5.8, options settings at top level
- were always global settings, wherever they appeared in the pattern.
- That is, they were equivalent to an external setting. From 5.8
- onwards, they apply only to what follows (which is what you might
- expect).] */
-
- case ')':
- if (item_count == 0)
- {
- options = (options | set) & (~unset);
- set = unset = 0; /* To save length */
- item_count--; /* To allow for several */
- }
-
- /* Fall through */
-
- /* A termination by ':' indicates the start of a nested group with
- the given options set. This is again handled at compile time, but
- we must allow for compiled space if any of the ims options are
- set. We also have to allow for resetting space at the end of
- the group, which is why 4 is added to the length and not just 2.
- If there are several changes of options within the same group, this
- will lead to an over-estimate on the length, but this shouldn't
- matter very much. We also have to allow for resetting options at
- the start of any alternations, which we do by setting
- branch_newextra to 2. Finally, we record whether the case-dependent
- flag ever changes within the regex. This is used by the "required
- character" code. */
-
- case ':':
- if (((set|unset) & PCRE_IMS) != 0)
- {
- length += 4;
- branch_newextra = 2;
- if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
- }
- goto END_OPTIONS;
-
- /* Unrecognized option character */
-
- default:
- *errorptr = ERR12;
- goto PCRE_ERROR_RETURN;
- }
- }
-
- /* If we hit a closing bracket, that's it - this is a freestanding
- option-setting. We need to ensure that branch_extra is updated if
- necessary. The only values branch_newextra can have here are 0 or 2.
- If the value is 2, then branch_extra must end up as either 2 or
- 3+LINK_SIZE, depending on whether this is a lookbehind group or not. */
-
- END_OPTIONS:
- if (c == ')')
- {
- if (branch_newextra == 2 &&
- (branch_extra == 0 || branch_extra == 1+LINK_SIZE))
- branch_extra += branch_newextra;
- continue;
- }
-
- /* If options were terminated by ':' control comes here. Fall through
- to handle the group below. */
- }
- }
-
- /* Extracting brackets must be counted so we can process escapes in a
- Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to
- need an additional 3 bytes of store per extracting bracket. */
-
- else
- {
- bracount++;
- if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
- }
-
- /* Save length for computing whole length at end if there's a repeat that
- requires duplication of the group. Also save the current value of
- branch_extra, and start the new group with the new value. If non-zero, this
- will either be 2 for a (?imsx: group, or 1+LINK_SIZE for a lookbehind
- assertion. */
-
- if (brastackptr >= sizeof(brastack)/sizeof(int))
- {
- *errorptr = ERR19;
- goto PCRE_ERROR_RETURN;
- }
-
- bralenstack[brastackptr] = branch_extra;
- branch_extra = branch_newextra;
-
- brastack[brastackptr++] = length;
- length += bracket_length;
- continue;
-
- /* Handle ket. Look for subsequent max/min; for certain sets of values we
- have to replicate this bracket up to that many times. If brastackptr is
- 0 this is an unmatched bracket which will generate an error, but take care
- not to try to access brastack[-1] when computing the length and restoring
- the branch_extra value. */
-
- case ')':
- length += 1 + LINK_SIZE;
- if (brastackptr > 0)
- {
- duplength = length - brastack[--brastackptr];
- branch_extra = bralenstack[brastackptr];
- }
- else duplength = 0;
-
- /* The following code is also used when a recursion such as (?3) is
- followed by a quantifier, because in that case, it has to be wrapped inside
- brackets so that the quantifier works. The value of duplength must be
- set before arrival. */
-
- HANDLE_QUANTIFIED_BRACKETS:
-
- /* Leave ptr at the final char; for read_repeat_counts this happens
- automatically; for the others we need an increment. */
-
- if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
- {
- ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
- if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
- }
- else if (c == '*') { min = 0; max = -1; ptr++; }
- else if (c == '+') { min = 1; max = -1; ptr++; }
- else if (c == '?') { min = 0; max = 1; ptr++; }
- else { min = 1; max = 1; }
-
- /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
- group, and if the maximum is greater than zero, we have to replicate
- maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
- bracket set. */
-
- if (min == 0)
- {
- length++;
- if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
- }
-
- /* When the minimum is greater than zero, we have to replicate up to
- minval-1 times, with no additions required in the copies. Then, if there
- is a limited maximum we have to replicate up to maxval-1 times allowing
- for a BRAZERO item before each optional copy and nesting brackets for all
- but one of the optional copies. */
-
- else
- {
- length += (min - 1) * duplength;
- if (max > min) /* Need this test as max=-1 means no limit */
- length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
- - (2 + 2*LINK_SIZE);
- }
-
- /* Allow space for once brackets for "possessive quantifier" */
-
- if (ptr[1] == '+')
- {
- ptr++;
- length += 2 + 2*LINK_SIZE;
- }
- continue;
-
- /* Non-special character. For a run of such characters the length required
- is the number of characters + 2, except that the maximum run length is 255.
- We won't get a skipped space or a non-data escape or the start of a #
- comment as the first character, so the length can't be zero. */
-
- NORMAL_CHAR:
- default:
- length += 2;
- runlength = 0;
- do
- {
- /* If in a \Q...\E sequence, check for end; otherwise it's a literal */
- if (inescq)
- {
- if (c == '\\' && ptr[1] == 'E')
- {
- inescq = FALSE;
- ptr++;
- }
- else runlength++;
- continue;
- }
-
- /* Skip whitespace and comments for /x */
-
- if ((options & PCRE_EXTENDED) != 0)
- {
- if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
- if (c == '#')
- {
- /* The space before the ; is to avoid a warning on a silly compiler
- on the Macintosh. */
- while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
-
- /* If the comment ran into the end of the pattern, ptr is now at the
- terminating zero. We must leave the run here: a "continue" would
- evaluate the loop condition below, which advances ptr again and
- would read beyond the end of the pattern. After the break, ptr is
- stepped back by one so that the main loop sees the terminator. */
-
- if (c == 0) break;
- continue;
- }
- }
-
- /* Backslash may introduce a data char or a metacharacter; stop the
- string before the latter. */
-
- if (c == '\\')
- {
- const uschar *saveptr = ptr;
- c = check_escape(&ptr, errorptr, bracount, options, FALSE,
- &compile_block);
- if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
- if (c < 0) { ptr = saveptr; break; }
-
-#ifdef SUPPORT_UTF8
- if (c > 127 && (options & PCRE_UTF8) != 0)
- {
- int i;
- for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
- if (c <= utf8_table1[i]) break;
- runlength += i;
- }
-#endif
- }
-
- /* Ordinary character or single-char escape */
-
- runlength++;
- }
-
- /* This "while" is the end of the "do" above. */
-
- while (runlength < MAXLIT &&
- (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
-
- if (runlength < MAXLIT) ptr--;
- length += runlength;
- continue;
- }
- }
-
-length += 2 + LINK_SIZE; /* For final KET and END */
-
-if (length > MAX_PATTERN_SIZE)
- {
- *errorptr = ERR20;
- return NULL;
- }
-
-/* Compute the size of data block needed and get it, either from malloc or
-externally provided function. */
-
-size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);
-re = (real_pcre *)(pcre_malloc)(size);
-
-if (re == NULL)
- {
- *errorptr = ERR21;
- return NULL;
- }
-
-/* Put in the magic number, and save the size, options, and table pointer */
-
-re->magic_number = MAGIC_NUMBER;
-re->size = size;
-re->options = options;
-re->tables = tables;
-re->name_entry_size = max_name_size + 3;
-re->name_count = name_count;
-
-/* The starting points of the name/number translation table and of the code are
-passed around in the compile data block. */
-
-compile_block.names_found = 0;
-compile_block.name_entry_size = max_name_size + 3;
-compile_block.name_table = (uschar *)re + sizeof(real_pcre);
-codestart = compile_block.name_table + re->name_entry_size * re->name_count;
-compile_block.start_code = codestart;
-
-/* Set up a starting, non-extracting bracket, then compile the expression. On
-error, *errorptr will be set non-NULL, so we don't need to look at the result
-of the function here. */
-
-ptr = (const uschar *)pattern;
-code = (uschar *)codestart;
-*code = OP_BRA;
-bracount = 0;
-(void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr,
- errorptr, FALSE, 0, &firstchar, &reqchar, NULL, &compile_block);
-re->top_bracket = bracount;
-re->top_backref = top_backref;
-
-/* If not reached end of pattern on success, there's an excess bracket. */
-
-if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
-
-/* Fill in the terminating state and check for disastrous overflow, but
-if debugging, leave the test till after things are printed out. */
-
-*code++ = OP_END;
-
-#ifndef DEBUG
-if (code - codestart > length) *errorptr = ERR23;
-#endif
-
-/* Give an error if there's back reference to a non-existent capturing
-subpattern. */
-
-if (top_backref > re->top_bracket) *errorptr = ERR15;
-
-/* Failed to compile, or error while post-processing. Errors found during the
-first pass jump straight to the label, because at that stage no block has been
-obtained and so there is nothing to free. */
-
-if (*errorptr != NULL)
- {
- (pcre_free)(re);
- PCRE_ERROR_RETURN:
- *erroroffset = ptr - (const uschar *)pattern;
- return NULL;
- }
-
-/* If the anchored option was not passed, set the flag if we can determine that
-the pattern is anchored by virtue of ^ characters or \A or anything else (such
-as starting with .* when DOTALL is set).
-
-Otherwise, if we know what the first character has to be, save it, because that
-speeds up unanchored matches no end. If not, see if we can set the
-PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
-start with ^. and also when all branches start with .* for non-DOTALL matches.
-*/
-
-if ((options & PCRE_ANCHORED) == 0)
- {
- int temp_options = options;
- if (is_anchored(codestart, &temp_options, FALSE, top_backref))
- re->options |= PCRE_ANCHORED;
- else
- {
- if (firstchar < 0)
- firstchar = find_firstassertedchar(codestart, &temp_options, FALSE);
- if (firstchar >= 0) /* Remove caseless flag for non-caseable chars */
- {
- int ch = firstchar & 255;
- re->first_char = ((firstchar & REQ_CASELESS) != 0 &&
- compile_block.fcc[ch] == ch)? ch : firstchar;
- re->options |= PCRE_FIRSTSET;
- }
- else if (is_startline(codestart, FALSE, top_backref))
- re->options |= PCRE_STARTLINE;
- }
- }
-
-/* Save the last required character if any. Remove caseless flag for
-non-caseable chars. */
-
-if ((re->options & PCRE_ANCHORED) != 0 && reqchar < 0 && firstchar >= 0)
- reqchar = firstchar;
-
-if (reqchar >= 0)
- {
- int ch = reqchar & 255;
- re->req_char = ((reqchar & REQ_CASELESS) != 0 &&
- compile_block.fcc[ch] == ch)? ch : reqchar;
- re->options |= PCRE_REQCHSET;
- }
-
-/* Print out the compiled data for debugging */
-
-#ifdef DEBUG
-
-printf("Length = %d top_bracket = %d top_backref = %d\n",
- length, re->top_bracket, re->top_backref);
-
-if (re->options != 0)
- {
- printf("%s%s%s%s%s%s%s%s%s\n",
- ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
- ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
- ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
- ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
- ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
- ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
- ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
- ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
- ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
- }
-
-if ((re->options & PCRE_FIRSTSET) != 0)
- {
- int ch = re->first_char & 255;
- char *caseless = ((re->first_char & REQ_CASELESS) == 0)? "" : " (caseless)";
- if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
- else printf("First char = \\x%02x%s\n", ch, caseless);
- }
-
-if ((re->options & PCRE_REQCHSET) != 0)
- {
- int ch = re->req_char & 255;
- char *caseless = ((re->req_char & REQ_CASELESS) == 0)? "" : " (caseless)";
- if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
- else printf("Req char = \\x%02x%s\n", ch, caseless);
- }
-
-print_internals(re, stdout);
-
-/* This check is done here in the debugging case so that the code that
-was compiled can be seen. */
-
-if (code - codestart > length)
- {
- *errorptr = ERR23;
- (pcre_free)(re);
- *erroroffset = ptr - (uschar *)pattern;
- return NULL;
- }
-#endif
-
-return (pcre *)re;
-}
-
-
-
-/*************************************************
-* Match a back-reference *
-*************************************************/
-
-/* If a back reference hasn't been set, the length that is passed is greater
-than the number of characters left in the string, so the match fails.
-
-Arguments:
- offset index into the offset vector
- eptr points into the subject
- length length to be matched
- md points to match data block
- ims the ims flags
-
-Returns: TRUE if matched
-*/
-
-static BOOL
-match_ref(int offset, register const uschar *eptr, int length, match_data *md,
-  unsigned long int ims)
-{
-/* Start of the previously captured text that the reference refers to */
-
-const uschar *ref = md->start_subject + md->offset_vector[offset];
-
-#ifdef DEBUG
-if (eptr < md->end_subject)
-  {
-  printf("matching subject ");
-  pchars(eptr, length, TRUE, md);
-  }
-else
-  printf("matching subject <null>");
-printf(" against backref ");
-pchars(ref, length, FALSE, md);
-printf("\n");
-#endif
-
-/* The match cannot succeed if the subject has fewer than "length" characters
-remaining. */
-
-if (md->end_subject - eptr < length) return FALSE;
-
-/* Case-sensitive comparison is kept as a separate, simpler loop for speed */
-
-if ((ims & PCRE_CASELESS) == 0)
-  {
-  for (; length > 0; length--)
-    if (*ref++ != *eptr++) return FALSE;
-  return TRUE;
-  }
-
-/* Caseless comparison: both sides are mapped through the lower-casing table
-before being compared. */
-
-for (; length > 0; length--)
-  if (md->lcc[*ref++] != md->lcc[*eptr++]) return FALSE;
-
-return TRUE;
-}
-
-
-
-/*************************************************
-* Match from current position *
-*************************************************/
-
-/* On entry ecode points to the first opcode, and eptr to the first character
-in the subject string, while eptrb holds the value of eptr at the start of the
-last bracketed group - used for breaking infinite loops matching zero-length
-strings.
-
-Arguments:
- eptr pointer in subject
- ecode position in code
- offset_top current top pointer
- md pointer to "static" info for the match
- ims current /i, /m, and /s options
- eptrb pointer to chain of blocks containing eptr at start of
- brackets - for testing for empty matches
- flags can contain
- match_condassert - this is an assertion condition
- match_isgroup - this is the start of a bracketed group
-
-Returns: TRUE if matched
-*/
-
-static BOOL
-match(register const uschar *eptr, register const uschar *ecode,
- int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
- int flags)
-{
-unsigned long int original_ims = ims; /* Save for resetting on ')' */
-eptrblock newptrb;
-
-/* At the start of a bracketed group, add the current subject pointer to the
-stack of such pointers, to be re-instated at the end of the group when we hit
-the closing ket. When match() is called in other circumstances, we don't add to
-the stack. */
-
-if ((flags & match_isgroup) != 0)
- {
- newptrb.prev = eptrb;
- newptrb.saved_eptr = eptr;
- eptrb = &newptrb;
- }
-
-/* Now start processing the operations. */
-
-for (;;)
- {
- int op = (int)*ecode;
- int min, max, ctype;
- register int i;
- register int c;
- BOOL minimize = FALSE;
-
- /* Opening capturing bracket. If there is space in the offset vector, save
- the current subject position in the working slot at the top of the vector. We
- mustn't change the current values of the data slot, because they may be set
- from a previous iteration of this group, and be referred to by a reference
- inside the group.
-
- If the bracket fails to match, we need to restore this value and also the
- values of the final offsets, in case they were set by a previous iteration of
- the same bracket.
-
- If there isn't enough space in the offset vector, treat this as if it were a
- non-capturing bracket. Don't worry about setting the flag for the error case
- here; that is handled in the code for KET. */
-
- if (op > OP_BRA)
- {
- int offset;
- int number = op - OP_BRA;
-
- /* For extended extraction brackets (large number), we have to fish out the
- number from a dummy opcode at the start. */
-
- if (number > EXTRACT_BASIC_MAX)
- number = GET2(ecode, 2+LINK_SIZE);
- offset = number << 1;
-
-#ifdef DEBUG
- printf("start bracket %d subject=", number);
- pchars(eptr, 16, TRUE, md);
- printf("\n");
-#endif
-
- if (offset < md->offset_max)
- {
- int save_offset1 = md->offset_vector[offset];
- int save_offset2 = md->offset_vector[offset+1];
- int save_offset3 = md->offset_vector[md->offset_end - number];
- int save_capture_last = md->capture_last;
-
- DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
- md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
-
- do
- {
- if (match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
- match_isgroup)) return TRUE;
- md->capture_last = save_capture_last;
- ecode += GET(ecode, 1);
- }
- while (*ecode == OP_ALT);
-
- DPRINTF(("bracket %d failed\n", number));
-
- md->offset_vector[offset] = save_offset1;
- md->offset_vector[offset+1] = save_offset2;
- md->offset_vector[md->offset_end - number] = save_offset3;
-
- return FALSE;
- }
-
- /* Insufficient room for saving captured contents */
-
- else op = OP_BRA;
- }
-
- /* Other types of node can be handled by a switch */
-
- switch(op)
- {
- case OP_BRA: /* Non-capturing bracket: optimized */
- DPRINTF(("start bracket 0\n"));
- do
- {
- if (match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
- match_isgroup)) return TRUE;
- ecode += GET(ecode, 1);
- }
- while (*ecode == OP_ALT);
- DPRINTF(("bracket 0 failed\n"));
- return FALSE;
-
- /* Conditional group: compilation checked that there are no more than
- two branches. If the condition is false, skipping the first branch takes us
- past the end if there is only one branch, but that's OK because that is
- exactly what going to the ket would do. */
-
- case OP_COND:
- if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */
- {
- int offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
- BOOL condition = (offset == CREF_RECURSE * 2)?
- (md->recursive != NULL) :
- (offset < offset_top && md->offset_vector[offset] >= 0);
- return match(eptr, ecode + (condition?
- (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1))),
- offset_top, md, ims, eptrb, match_isgroup);
- }
-
- /* The condition is an assertion. Call match() to evaluate it - setting
- the final argument TRUE causes it to stop at the end of an assertion. */
-
- else
- {
- if (match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
- match_condassert | match_isgroup))
- {
- ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2);
- while (*ecode == OP_ALT) ecode += GET(ecode, 1);
- }
- else ecode += GET(ecode, 1);
- return match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
- match_isgroup);
- }
- /* Control never reaches here */
-
- /* Skip over conditional reference or large extraction number data if
- encountered. */
-
- case OP_CREF:
- case OP_BRANUMBER:
- ecode += 3;
- break;
-
- /* End of the pattern. If we are in a recursion, we should restore the
- offsets appropriately and continue from after the call. */
-
- case OP_END:
- if (md->recursive != NULL && md->recursive->group_num == 0)
- {
- recursion_info *rec = md->recursive;
- DPRINTF(("Hit the end in a (?0) recursion\n"));
- md->recursive = rec->prev;
- memmove(md->offset_vector, rec->offset_save,
- rec->saved_max * sizeof(int));
- md->start_match = rec->save_start;
- ims = original_ims;
- ecode = rec->after_call;
- break;
- }
-
- /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
- string - backtracking will then try other alternatives, if any. */
-
- if (md->notempty && eptr == md->start_match) return FALSE;
- md->end_match_ptr = eptr; /* Record where we ended */
- md->end_offset_top = offset_top; /* and how many extracts were taken */
- return TRUE;
-
- /* Change option settings */
-
- case OP_OPT:
- ims = ecode[1];
- ecode += 2;
- DPRINTF(("ims set to %02lx\n", ims));
- break;
-
- /* Assertion brackets. Check the alternative branches in turn - the
- matching won't pass the KET for an assertion. If any one branch matches,
- the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
- start of each branch to move the current point backwards, so the code at
- this level is identical to the lookahead case. */
-
- case OP_ASSERT:
- case OP_ASSERTBACK:
- do
- {
- if (match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
- match_isgroup)) break;
- ecode += GET(ecode, 1);
- }
- while (*ecode == OP_ALT);
- if (*ecode == OP_KET) return FALSE;
-
- /* If checking an assertion for a condition, return TRUE. */
-
- if ((flags & match_condassert) != 0) return TRUE;
-
- /* Continue from after the assertion, updating the offsets high water
- mark, since extracts may have been taken during the assertion. */
-
- do ecode += GET(ecode,1); while (*ecode == OP_ALT);
- ecode += 1 + LINK_SIZE;
- offset_top = md->end_offset_top;
- continue;
-
- /* Negative assertion: all branches must fail to match */
-
- case OP_ASSERT_NOT:
- case OP_ASSERTBACK_NOT:
- do
- {
- if (match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
- match_isgroup)) return FALSE;
- ecode += GET(ecode,1);
- }
- while (*ecode == OP_ALT);
-
- if ((flags & match_condassert) != 0) return TRUE;
-
- ecode += 1 + LINK_SIZE;
- continue;
-
- /* Move the subject pointer back. This occurs only at the start of
- each branch of a lookbehind assertion. If we are too close to the start to
- move back, this match function fails. When working with UTF-8 we move
- back a number of characters, not bytes. */
-
- case OP_REVERSE:
-#ifdef SUPPORT_UTF8
- c = GET(ecode,1);
- for (i = 0; i < c; i++)
- {
- eptr--;
- BACKCHAR(eptr)
- }
-#else
- eptr -= GET(ecode,1);
-#endif
-
- if (eptr < md->start_subject) return FALSE;
- ecode += 1 + LINK_SIZE;
- break;
-
- /* The callout item calls an external function, if one is provided, passing
- details of the match so far. This is mainly for debugging, though the
- function is able to force a failure. */
-
- case OP_CALLOUT:
- if (pcre_callout != NULL)
- {
- pcre_callout_block cb;
- cb.version = 0; /* Version 0 of the callout block */
- cb.callout_number = ecode[1];
- cb.offset_vector = md->offset_vector;
- cb.subject = (const char *)md->start_subject;
- cb.subject_length = md->end_subject - md->start_subject;
- cb.start_match = md->start_match - md->start_subject;
- cb.current_position = eptr - md->start_subject;
- cb.capture_top = offset_top/2;
- cb.capture_last = md->capture_last;
- if ((*pcre_callout)(&cb) != 0) return FALSE;
- }
- ecode += 2;
- break;
-
- /* Recursion either matches the current regex, or some subexpression. The
- offset data is the offset to the starting bracket from the start of the
- whole pattern. However, it is possible that a BRAZERO was inserted before
- this bracket after we took the offset - we just skip it if encountered.
-
- If there are any capturing brackets started but not finished, we have to
- save their starting points and reinstate them after the recursion. However,
- we don't know how many such there are (offset_top records the completed
- total) so we just have to save all the potential data. There may be up to
- 65535 such values, which is too large to put on the stack, but using malloc
- for small numbers seems expensive. As a compromise, the stack is used when
- there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
- is used. A problem is what to do if the malloc fails ... there is no way of
- returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
- values on the stack, and accept that the rest may be wrong.
-
- There are also other values that have to be saved. We use a chained
- sequence of blocks that actually live on the stack. Thanks to Robin Houston
- for the original version of this logic. */
-
- case OP_RECURSE:
- {
- int stacksave[REC_STACK_SAVE_MAX];
- recursion_info new_recursive;
- const uschar *callpat = md->start_code + GET(ecode, 1);
-
- if (*callpat == OP_BRAZERO) callpat++;
-
- new_recursive.group_num = *callpat - OP_BRA;
-
- /* For extended extraction brackets (large number), we have to fish out
- the number from a dummy opcode at the start. */
-
- if (new_recursive.group_num > EXTRACT_BASIC_MAX)
- new_recursive.group_num = GET2(callpat, 2+LINK_SIZE);
-
- /* Add to "recursing stack" */
-
- new_recursive.prev = md->recursive;
- md->recursive = &new_recursive;
-
- /* Find where to continue from afterwards */
-
- ecode += 1 + LINK_SIZE;
- new_recursive.after_call = ecode;
-
- /* Now save the offset data. */
-
- new_recursive.saved_max = md->offset_end;
- if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
- new_recursive.offset_save = stacksave;
- else
- {
- new_recursive.offset_save = (int *)
- (pcre_malloc)(new_recursive.saved_max * sizeof(int));
-
- /* RH: Warning: This may cause INCORRECT RESULTS if we run out of
- memory here, because we won't be restoring all the stored strings
- correctly. We either need proper run-time error handling or, at the
- very least, some way to warn the user. Could we just spit a message to
- stderr?
-
- PH: No, Robin, no! You must NEVER write to stderr from inside a general
- library function, because you don't know anything about the state of
- the file descriptor.
-
- RH: Returning error values would be very tedious because of the
- recursion; and Philip Hazel says that longjmp() - in many ways the
- obvious solution - has previously caused problems on some platforms. */
-
- if (new_recursive.offset_save == NULL)
- {
- DPRINTF(("malloc() failed - results may be wrong\n"));
- new_recursive.offset_save = stacksave;
- new_recursive.saved_max = REC_STACK_SAVE_MAX;
- }
- }
-
- memcpy(new_recursive.offset_save, md->offset_vector,
- new_recursive.saved_max * sizeof(int));
- new_recursive.save_start = md->start_match;
- md->start_match = eptr;
-
- /* OK, now we can do the recursion. For each top-level alternative we
- restore the offset and recursion data. */
-
- DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
- do
- {
- if (match(eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
- match_isgroup))
- {
- md->recursive = new_recursive.prev;
- if (new_recursive.offset_save != stacksave)
- (pcre_free)(new_recursive.offset_save);
- return TRUE;
- }
-
- md->recursive = &new_recursive;
- memcpy(md->offset_vector, new_recursive.offset_save,
- new_recursive.saved_max * sizeof(int));
- callpat += GET(callpat, 1);
- }
- while (*callpat == OP_ALT);
-
- DPRINTF(("Recursion didn't match\n"));
- md->recursive = new_recursive.prev;
- if (new_recursive.offset_save != stacksave)
- (pcre_free)(new_recursive.offset_save);
- return FALSE;
- }
- break;
-
- /* "Once" brackets are like assertion brackets except that after a match,
- the point in the subject string is not moved back. Thus there can never be
- a move back into the brackets. Friedl calls these "atomic" subpatterns.
- Check the alternative branches in turn - the matching won't pass the KET
- for this kind of subpattern. If any one branch matches, we carry on as at
- the end of a normal bracket, leaving the subject pointer. */
-
- case OP_ONCE:
- {
- const uschar *prev = ecode;
- const uschar *saved_eptr = eptr;
-
- do
- {
- if (match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
- match_isgroup)) break;
- ecode += GET(ecode,1);
- }
- while (*ecode == OP_ALT);
-
- /* If hit the end of the group (which could be repeated), fail */
-
- if (*ecode != OP_ONCE && *ecode != OP_ALT) return FALSE;
-
- /* Continue as from after the assertion, updating the offsets high water
- mark, since extracts may have been taken. */
-
- do ecode += GET(ecode,1); while (*ecode == OP_ALT);
-
- offset_top = md->end_offset_top;
- eptr = md->end_match_ptr;
-
- /* For a non-repeating ket, just continue at this level. This also
- happens for a repeating ket if no characters were matched in the group.
- This is the forcible breaking of infinite loops as implemented in Perl
- 5.005. If there is an options reset, it will get obeyed in the normal
- course of events. */
-
- if (*ecode == OP_KET || eptr == saved_eptr)
- {
- ecode += 1+LINK_SIZE;
- break;
- }
-
- /* The repeating kets try the rest of the pattern or restart from the
- preceding bracket, in the appropriate order. We need to reset any options
- that changed within the bracket before re-running it, so check the next
- opcode. */
-
- if (ecode[1+LINK_SIZE] == OP_OPT)
- {
- ims = (ims & ~PCRE_IMS) | ecode[4];
- DPRINTF(("ims set to %02lx at group repeat\n", ims));
- }
-
- if (*ecode == OP_KETRMIN)
- {
- if (match(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0)
- ||
- match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
- return TRUE;
- }
- else /* OP_KETRMAX */
- {
- if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
- match(eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0))
- return TRUE;
- }
- }
- return FALSE;
-
- /* An alternation is the end of a branch; scan along to find the end of the
- bracketed group and go to there. */
-
- case OP_ALT:
- do ecode += GET(ecode,1); while (*ecode == OP_ALT);
- break;
-
- /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
- that it may occur zero times. It may repeat infinitely, or not at all -
- i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
- repeat limits are compiled as a number of copies, with the optional ones
- preceded by BRAZERO or BRAMINZERO. */
-
- case OP_BRAZERO:
- {
- const uschar *next = ecode+1;
- if (match(eptr, next, offset_top, md, ims, eptrb, match_isgroup))
- return TRUE;
- do next += GET(next,1); while (*next == OP_ALT);
- ecode = next + 1+LINK_SIZE;
- }
- break;
-
- case OP_BRAMINZERO:
- {
- const uschar *next = ecode+1;
- do next += GET(next,1); while (*next == OP_ALT);
- if (match(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb,
- match_isgroup)) return TRUE;
- ecode++;
- }
- break;
-
- /* End of a group, repeated or non-repeating. If we are at the end of
- an assertion "group", stop matching and return TRUE, but record the
- current high water mark for use by positive assertions. Do this also
- for the "once" (atomic, not-backing-up) groups. */
-
- case OP_KET:
- case OP_KETRMIN:
- case OP_KETRMAX:
- {
- const uschar *prev = ecode - GET(ecode, 1);
- const uschar *saved_eptr = eptrb->saved_eptr;
-
- eptrb = eptrb->prev; /* Back up the stack of bracket start pointers */
-
- if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
- *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
- *prev == OP_ONCE)
- {
- md->end_match_ptr = eptr; /* For ONCE */
- md->end_offset_top = offset_top;
- return TRUE;
- }
-
- /* In all other cases except a conditional group we have to check the
- group number back at the start and if necessary complete handling an
- extraction by setting the offsets and bumping the high water mark. */
-
- if (*prev != OP_COND)
- {
- int offset;
- int number = *prev - OP_BRA;
-
- /* For extended extraction brackets (large number), we have to fish out
- the number from a dummy opcode at the start. */
-
- if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE);
- offset = number << 1;
-
-#ifdef DEBUG
- printf("end bracket %d", number);
- printf("\n");
-#endif
-
- /* Test for a numbered group. This includes groups called as a result
- of recursion. Note that whole-pattern recursion is coded as a recurse
- into group 0, so it won't be picked up here. Instead, we catch it when
- the OP_END is reached. */
-
- if (number > 0)
- {
- md->capture_last = number;
- if (offset >= md->offset_max) md->offset_overflow = TRUE; else
- {
- md->offset_vector[offset] =
- md->offset_vector[md->offset_end - number];
- md->offset_vector[offset+1] = eptr - md->start_subject;
- if (offset_top <= offset) offset_top = offset + 2;
- }
-
- /* Handle a recursively called group. Restore the offsets
- appropriately and continue from after the call. */
-
- if (md->recursive != NULL && md->recursive->group_num == number)
- {
- recursion_info *rec = md->recursive;
- DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
- md->recursive = rec->prev;
- md->start_match = rec->save_start;
- memcpy(md->offset_vector, rec->offset_save,
- rec->saved_max * sizeof(int));
- ecode = rec->after_call;
- ims = original_ims;
- break;
- }
- }
- }
-
- /* Reset the value of the ims flags, in case they got changed during
- the group. */
-
- ims = original_ims;
- DPRINTF(("ims reset to %02lx\n", ims));
-
- /* For a non-repeating ket, just continue at this level. This also
- happens for a repeating ket if no characters were matched in the group.
- This is the forcible breaking of infinite loops as implemented in Perl
- 5.005. If there is an options reset, it will get obeyed in the normal
- course of events. */
-
- if (*ecode == OP_KET || eptr == saved_eptr)
- {
- ecode += 1 + LINK_SIZE;
- break;
- }
-
- /* The repeating kets try the rest of the pattern or restart from the
- preceding bracket, in the appropriate order. */
-
- if (*ecode == OP_KETRMIN)
- {
- if (match(eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0) ||
- match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
- return TRUE;
- }
- else /* OP_KETRMAX */
- {
- if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
- match(eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0))
- return TRUE;
- }
- }
- return FALSE;
-
- /* Start of subject unless notbol, or after internal newline if multiline */
-
- case OP_CIRC:
- if (md->notbol && eptr == md->start_subject) return FALSE;
- if ((ims & PCRE_MULTILINE) != 0)
- {
- if (eptr != md->start_subject && eptr[-1] != NEWLINE) return FALSE;
- ecode++;
- break;
- }
- /* ... else fall through */
-
- /* Start of subject assertion */
-
- case OP_SOD:
- if (eptr != md->start_subject) return FALSE;
- ecode++;
- break;
-
- /* Start of match assertion */
-
- case OP_SOM:
- if (eptr != md->start_subject + md->start_offset) return FALSE;
- ecode++;
- break;
-
- /* Assert before internal newline if multiline, or before a terminating
- newline unless endonly is set, else end of subject unless noteol is set. */
-
- case OP_DOLL:
- if ((ims & PCRE_MULTILINE) != 0)
- {
- if (eptr < md->end_subject) { if (*eptr != NEWLINE) return FALSE; }
- else { if (md->noteol) return FALSE; }
- ecode++;
- break;
- }
- else
- {
- if (md->noteol) return FALSE;
- if (!md->endonly)
- {
- if (eptr < md->end_subject - 1 ||
- (eptr == md->end_subject - 1 && *eptr != NEWLINE)) return FALSE;
-
- ecode++;
- break;
- }
- }
- /* ... else fall through */
-
- /* End of subject assertion (\z) */
-
- case OP_EOD:
- if (eptr < md->end_subject) return FALSE;
- ecode++;
- break;
-
- /* End of subject or ending \n assertion (\Z) */
-
- case OP_EODN:
- if (eptr < md->end_subject - 1 ||
- (eptr == md->end_subject - 1 && *eptr != NEWLINE)) return FALSE;
- ecode++;
- break;
-
- /* Word boundary assertions */
-
- case OP_NOT_WORD_BOUNDARY:
- case OP_WORD_BOUNDARY:
- {
- BOOL prev_is_word = (eptr != md->start_subject) &&
- ((md->ctypes[eptr[-1]] & ctype_word) != 0);
- BOOL cur_is_word = (eptr < md->end_subject) &&
- ((md->ctypes[*eptr] & ctype_word) != 0);
- if ((*ecode++ == OP_WORD_BOUNDARY)?
- cur_is_word == prev_is_word : cur_is_word != prev_is_word)
- return FALSE;
- }
- break;
-
- /* Match a single character type; inline for speed */
-
- case OP_ANY:
- if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)
- return FALSE;
- if (eptr++ >= md->end_subject) return FALSE;
-#ifdef SUPPORT_UTF8
- if (md->utf8)
- while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
-#endif
- ecode++;
- break;
-
- /* Match a single byte, even in UTF-8 mode. This opcode really does match
- any byte, even newline, independent of the setting of PCRE_DOTALL. */
-
- case OP_ANYBYTE:
- if (eptr++ >= md->end_subject) return FALSE;
- ecode++;
- break;
-
- case OP_NOT_DIGIT:
- if (eptr >= md->end_subject ||
- (md->ctypes[*eptr++] & ctype_digit) != 0)
- return FALSE;
- ecode++;
- break;
-
- case OP_DIGIT:
- if (eptr >= md->end_subject ||
- (md->ctypes[*eptr++] & ctype_digit) == 0)
- return FALSE;
- ecode++;
- break;
-
- case OP_NOT_WHITESPACE:
- if (eptr >= md->end_subject ||
- (md->ctypes[*eptr++] & ctype_space) != 0)
- return FALSE;
- ecode++;
- break;
-
- case OP_WHITESPACE:
- if (eptr >= md->end_subject ||
- (md->ctypes[*eptr++] & ctype_space) == 0)
- return FALSE;
- ecode++;
- break;
-
- case OP_NOT_WORDCHAR:
- if (eptr >= md->end_subject ||
- (md->ctypes[*eptr++] & ctype_word) != 0)
- return FALSE;
- ecode++;
- break;
-
- case OP_WORDCHAR:
- if (eptr >= md->end_subject ||
- (md->ctypes[*eptr++] & ctype_word) == 0)
- return FALSE;
- ecode++;
- break;
-
- /* Match a back reference, possibly repeatedly. Look past the end of the
- item to see if there is repeat information following. The code is similar
- to that for character classes, but repeated for efficiency. Then obey
- similar code to character type repeats - written out again for speed.
- However, if the referenced string is the empty string, always treat
- it as matched, any number of times (otherwise there could be infinite
- loops). */
-
- case OP_REF:
- {
- int length;
- int offset = GET2(ecode, 1) << 1; /* Doubled ref number */
- ecode += 3; /* Advance past item */
-
- /* If the reference is unset, set the length to be longer than the amount
- of subject left; this ensures that every attempt at a match fails. We
- can't just fail here, because of the possibility of quantifiers with zero
- minima. */
-
- length = (offset >= offset_top || md->offset_vector[offset] < 0)?
- md->end_subject - eptr + 1 :
- md->offset_vector[offset+1] - md->offset_vector[offset];
-
- /* Set up for repetition, or handle the non-repeated case */
-
- switch (*ecode)
- {
- case OP_CRSTAR:
- case OP_CRMINSTAR:
- case OP_CRPLUS:
- case OP_CRMINPLUS:
- case OP_CRQUERY:
- case OP_CRMINQUERY:
- c = *ecode++ - OP_CRSTAR;
- minimize = (c & 1) != 0;
- min = rep_min[c]; /* Pick up values from tables; */
- max = rep_max[c]; /* zero for max => infinity */
- if (max == 0) max = INT_MAX;
- break;
-
- case OP_CRRANGE:
- case OP_CRMINRANGE:
- minimize = (*ecode == OP_CRMINRANGE);
- min = GET2(ecode, 1);
- max = GET2(ecode, 3);
- if (max == 0) max = INT_MAX;
- ecode += 5;
- break;
-
- default: /* No repeat follows */
- if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
- eptr += length;
- continue; /* With the main loop */
- }
-
- /* If the length of the reference is zero, just continue with the
- main loop. */
-
- if (length == 0) continue;
-
- /* First, ensure the minimum number of matches are present. We get back
- the length of the reference string explicitly rather than passing the
- address of eptr, so that eptr can be a register variable. */
-
- for (i = 1; i <= min; i++)
- {
- if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
- eptr += length;
- }
-
- /* If min = max, continue at the same level without recursion.
- They are not both allowed to be zero. */
-
- if (min == max) continue;
-
- /* If minimizing, keep trying and advancing the pointer */
-
- if (minimize)
- {
- for (i = min;; i++)
- {
- if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
- return TRUE;
- if (i >= max || !match_ref(offset, eptr, length, md, ims))
- return FALSE;
- eptr += length;
- }
- /* Control never gets here */
- }
-
- /* If maximizing, find the longest string and work backwards */
-
- else
- {
- const uschar *pp = eptr;
- for (i = min; i < max; i++)
- {
- if (!match_ref(offset, eptr, length, md, ims)) break;
- eptr += length;
- }
- while (eptr >= pp)
- {
- if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
- return TRUE;
- eptr -= length;
- }
- return FALSE;
- }
- }
- /* Control never gets here */
-
-
-
- /* Match a character class, possibly repeatedly. Look past the end of the
- item to see if there is repeat information following. Then obey similar
- code to character type repeats - written out again for speed. */
-
- case OP_CLASS:
- {
- const uschar *data = ecode + 1; /* Save for matching */
- ecode += 33; /* Advance past the item */
-
- switch (*ecode)
- {
- case OP_CRSTAR:
- case OP_CRMINSTAR:
- case OP_CRPLUS:
- case OP_CRMINPLUS:
- case OP_CRQUERY:
- case OP_CRMINQUERY:
- c = *ecode++ - OP_CRSTAR;
- minimize = (c & 1) != 0;
- min = rep_min[c]; /* Pick up values from tables; */
- max = rep_max[c]; /* zero for max => infinity */
- if (max == 0) max = INT_MAX;
- break;
-
- case OP_CRRANGE:
- case OP_CRMINRANGE:
- minimize = (*ecode == OP_CRMINRANGE);
- min = GET2(ecode, 1);
- max = GET2(ecode, 3);
- if (max == 0) max = INT_MAX;
- ecode += 5;
- break;
-
- default: /* No repeat follows */
- min = max = 1;
- break;
- }
-
- /* First, ensure the minimum number of matches are present. */
-
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) return FALSE;
- GETCHARINC(c, eptr) /* Get character; increment eptr */
-
-#ifdef SUPPORT_UTF8
- /* We do not yet support class members > 255 */
- if (c > 255) return FALSE;
-#endif
-
- if ((data[c/8] & (1 << (c&7))) != 0) continue;
- return FALSE;
- }
-
- /* If max == min we can continue with the main loop without the
- need to recurse. */
-
- if (min == max) continue;
-
- /* If minimizing, keep testing the rest of the expression and advancing
- the pointer while it matches the class. */
-
- if (minimize)
- {
- for (i = min;; i++)
- {
- if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
- return TRUE;
- if (i >= max || eptr >= md->end_subject) return FALSE;
- GETCHARINC(c, eptr) /* Get character; increment eptr */
-
-#ifdef SUPPORT_UTF8
- /* We do not yet support class members > 255 */
- if (c > 255) return FALSE;
-#endif
- if ((data[c/8] & (1 << (c&7))) != 0) continue;
- return FALSE;
- }
- /* Control never gets here */
- }
-
- /* If maximizing, find the longest possible run, then work backwards. */
-
- else
- {
- const uschar *pp = eptr;
- int len = 1;
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len) /* Get character, set length if UTF-8 */
-
-#ifdef SUPPORT_UTF8
- /* We do not yet support class members > 255 */
- if (c > 255) break;
-#endif
- if ((data[c/8] & (1 << (c&7))) == 0) break;
- eptr += len;
- }
-
- while (eptr >= pp)
- {
- if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
- return TRUE;
-
-#ifdef SUPPORT_UTF8
- BACKCHAR(eptr)
-#endif
- }
- return FALSE;
- }
- }
- /* Control never gets here */
-
- /* Match a run of characters */
-
- case OP_CHARS:
- {
- register int length = ecode[1];
- ecode += 2;
-
-#ifdef DEBUG /* Sigh. Some compilers never learn. */
- if (eptr >= md->end_subject)
- printf("matching subject <null> against pattern ");
- else
- {
- printf("matching subject ");
- pchars(eptr, length, TRUE, md);
- printf(" against pattern ");
- }
- pchars(ecode, length, FALSE, md);
- printf("\n");
-#endif
-
- if (length > md->end_subject - eptr) return FALSE;
- if ((ims & PCRE_CASELESS) != 0)
- {
- while (length-- > 0)
- if (md->lcc[*ecode++] != md->lcc[*eptr++])
- return FALSE;
- }
- else
- {
- while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
- }
- }
- break;
-
- /* Match a single character repeatedly; different opcodes share code. */
-
- case OP_EXACT:
- min = max = GET2(ecode, 1);
- ecode += 3;
- goto REPEATCHAR;
-
- case OP_UPTO:
- case OP_MINUPTO:
- min = 0;
- max = GET2(ecode, 1);
- minimize = *ecode == OP_MINUPTO;
- ecode += 3;
- goto REPEATCHAR;
-
- case OP_STAR:
- case OP_MINSTAR:
- case OP_PLUS:
- case OP_MINPLUS:
- case OP_QUERY:
- case OP_MINQUERY:
- c = *ecode++ - OP_STAR;
- minimize = (c & 1) != 0;
- min = rep_min[c]; /* Pick up values from tables; */
- max = rep_max[c]; /* zero for max => infinity */
- if (max == 0) max = INT_MAX;
-
- /* Common code for all repeated single-character matches. We can give
- up quickly if there are fewer than the minimum number of characters left in
- the subject. */
-
- REPEATCHAR:
- if (min > md->end_subject - eptr) return FALSE;
- c = *ecode++;
-
- /* The code is duplicated for the caseless and caseful cases, for speed,
- since matching characters is likely to be quite common. First, ensure the
- minimum number of matches are present. If min = max, continue at the same
- level without recursing. Otherwise, if minimizing, keep trying the rest of
- the expression and advancing one matching character if failing, up to the
- maximum. Alternatively, if maximizing, find the maximum number of
- characters and work backwards. */
-
- DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
- max, eptr));
-
- if ((ims & PCRE_CASELESS) != 0)
- {
- c = md->lcc[c];
- for (i = 1; i <= min; i++)
- if (c != md->lcc[*eptr++]) return FALSE;
- if (min == max) continue;
- if (minimize)
- {
- for (i = min;; i++)
- {
- if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
- return TRUE;
- if (i >= max || eptr >= md->end_subject ||
- c != md->lcc[*eptr++])
- return FALSE;
- }
- /* Control never gets here */
- }
- else
- {
- const uschar *pp = eptr;
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || c != md->lcc[*eptr]) break;
- eptr++;
- }
- while (eptr >= pp)
- if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
- return TRUE;
- return FALSE;
- }
- /* Control never gets here */
- }
-
- /* Caseful comparisons */
-
- else
- {
- for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
- if (min == max) continue;
- if (minimize)
- {
- for (i = min;; i++)
- {
- if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
- return TRUE;
- if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
- }
- /* Control never gets here */
- }
- else
- {
- const uschar *pp = eptr;
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || c != *eptr) break;
- eptr++;
- }
- while (eptr >= pp)
- if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
- return TRUE;
- return FALSE;
- }
- }
- /* Control never gets here */
-
- /* Match a negated single character */
-
- case OP_NOT:
- if (eptr >= md->end_subject) return FALSE;
- ecode++;
- if ((ims & PCRE_CASELESS) != 0)
- {
- if (md->lcc[*ecode++] == md->lcc[*eptr++]) return FALSE;
- }
- else
- {
- if (*ecode++ == *eptr++) return FALSE;
- }
- break;
-
- /* Match a negated single character repeatedly. This is almost a repeat of
- the code for a repeated single character, but I haven't found a nice way of
- commoning these up that doesn't require a test of the positive/negative
- option for each character match. Maybe that wouldn't add very much to the
- time taken, but character matching *is* what this is all about... */
-
- case OP_NOTEXACT:
- min = max = GET2(ecode, 1);
- ecode += 3;
- goto REPEATNOTCHAR;
-
- case OP_NOTUPTO:
- case OP_NOTMINUPTO:
- min = 0;
- max = GET2(ecode, 1);
- minimize = *ecode == OP_NOTMINUPTO;
- ecode += 3;
- goto REPEATNOTCHAR;
-
- case OP_NOTSTAR:
- case OP_NOTMINSTAR:
- case OP_NOTPLUS:
- case OP_NOTMINPLUS:
- case OP_NOTQUERY:
- case OP_NOTMINQUERY:
- c = *ecode++ - OP_NOTSTAR;
- minimize = (c & 1) != 0;
- min = rep_min[c]; /* Pick up values from tables; */
- max = rep_max[c]; /* zero for max => infinity */
- if (max == 0) max = INT_MAX;
-
- /* Common code for all repeated negated single-character matches. We can
- give up quickly if there are fewer than the minimum number of characters
- left in the subject. */
-
- REPEATNOTCHAR:
- if (min > md->end_subject - eptr) return FALSE;
- c = *ecode++;
-
- /* The code is duplicated for the caseless and caseful cases, for speed,
- since matching characters is likely to be quite common. First, ensure the
- minimum number of matches are present. If min = max, continue at the same
- level without recursing. Otherwise, if minimizing, keep trying the rest of
- the expression and advancing one matching character if failing, up to the
- maximum. Alternatively, if maximizing, find the maximum number of
- characters and work backwards. */
-
- DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
- max, eptr));
-
- if ((ims & PCRE_CASELESS) != 0)
- {
- c = md->lcc[c];
- for (i = 1; i <= min; i++)
- if (c == md->lcc[*eptr++]) return FALSE;
- if (min == max) continue;
- if (minimize)
- {
- for (i = min;; i++)
- {
- if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
- return TRUE;
- if (i >= max || eptr >= md->end_subject ||
- c == md->lcc[*eptr++])
- return FALSE;
- }
- /* Control never gets here */
- }
- else
- {
- const uschar *pp = eptr;
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || c == md->lcc[*eptr]) break;
- eptr++;
- }
- while (eptr >= pp)
- if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
- return TRUE;
- return FALSE;
- }
- /* Control never gets here */
- }
-
- /* Caseful comparisons */
-
- else
- {
- for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
- if (min == max) continue;
- if (minimize)
- {
- for (i = min;; i++)
- {
- if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
- return TRUE;
- if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
- }
- /* Control never gets here */
- }
- else
- {
- const uschar *pp = eptr;
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || c == *eptr) break;
- eptr++;
- }
- while (eptr >= pp)
- if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
- return TRUE;
- return FALSE;
- }
- }
- /* Control never gets here */
-
- /* Match a single character type repeatedly; several different opcodes
- share code. This is very similar to the code for single characters, but we
- repeat it in the interests of efficiency. */
-
- case OP_TYPEEXACT:
- min = max = GET2(ecode, 1);
- minimize = TRUE;
- ecode += 3;
- goto REPEATTYPE;
-
- case OP_TYPEUPTO:
- case OP_TYPEMINUPTO:
- min = 0;
- max = GET2(ecode, 1);
- minimize = *ecode == OP_TYPEMINUPTO;
- ecode += 3;
- goto REPEATTYPE;
-
- case OP_TYPESTAR:
- case OP_TYPEMINSTAR:
- case OP_TYPEPLUS:
- case OP_TYPEMINPLUS:
- case OP_TYPEQUERY:
- case OP_TYPEMINQUERY:
- c = *ecode++ - OP_TYPESTAR;
- minimize = (c & 1) != 0;
- min = rep_min[c]; /* Pick up values from tables; */
- max = rep_max[c]; /* zero for max => infinity */
- if (max == 0) max = INT_MAX;
-
- /* Common code for all repeated single character type matches */
-
- REPEATTYPE:
- ctype = *ecode++; /* Code for the character type */
-
- /* First, ensure the minimum number of matches are present. Use inline
- code for maximizing the speed, and do the type test once at the start
- (i.e. keep it out of the loop). Also we can test that there are at least
- the minimum number of bytes before we start, except when doing '.' in
- UTF8 mode. Leave the test in in all cases; in the special case we have
- to test after each character. */
-
- if (min > md->end_subject - eptr) return FALSE;
- if (min > 0) switch(ctype)
- {
- case OP_ANY:
-#ifdef SUPPORT_UTF8
- if (md->utf8)
- {
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject ||
- (*eptr++ == NEWLINE && (ims & PCRE_DOTALL) == 0))
- return FALSE;
- while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
- }
- break;
- }
-#endif
- /* Non-UTF8 can be faster */
- if ((ims & PCRE_DOTALL) == 0)
- { for (i = 1; i <= min; i++) if (*eptr++ == NEWLINE) return FALSE; }
- else eptr += min;
- break;
-
- case OP_ANYBYTE:
- eptr += min;
- break;
-
- case OP_NOT_DIGIT:
- for (i = 1; i <= min; i++)
- if ((md->ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
- break;
-
- case OP_DIGIT:
- for (i = 1; i <= min; i++)
- if ((md->ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
- break;
-
- case OP_NOT_WHITESPACE:
- for (i = 1; i <= min; i++)
- if ((md->ctypes[*eptr++] & ctype_space) != 0) return FALSE;
- break;
-
- case OP_WHITESPACE:
- for (i = 1; i <= min; i++)
- if ((md->ctypes[*eptr++] & ctype_space) == 0) return FALSE;
- break;
-
- case OP_NOT_WORDCHAR:
- for (i = 1; i <= min; i++)
- if ((md->ctypes[*eptr++] & ctype_word) != 0)
- return FALSE;
- break;
-
- case OP_WORDCHAR:
- for (i = 1; i <= min; i++)
- if ((md->ctypes[*eptr++] & ctype_word) == 0)
- return FALSE;
- break;
- }
-
- /* If min = max, continue at the same level without recursing */
-
- if (min == max) continue;
-
- /* If minimizing, we have to test the rest of the pattern before each
- subsequent match. */
-
- if (minimize)
- {
- for (i = min;; i++)
- {
- if (match(eptr, ecode, offset_top, md, ims, eptrb, 0)) return TRUE;
- if (i >= max || eptr >= md->end_subject) return FALSE;
-
- c = *eptr++;
- switch(ctype)
- {
- case OP_ANY:
- if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) return FALSE;
-#ifdef SUPPORT_UTF8
- if (md->utf8)
- while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
-#endif
- break;
-
- case OP_ANYBYTE:
- break;
-
- case OP_NOT_DIGIT:
- if ((md->ctypes[c] & ctype_digit) != 0) return FALSE;
- break;
-
- case OP_DIGIT:
- if ((md->ctypes[c] & ctype_digit) == 0) return FALSE;
- break;
-
- case OP_NOT_WHITESPACE:
- if ((md->ctypes[c] & ctype_space) != 0) return FALSE;
- break;
-
- case OP_WHITESPACE:
- if ((md->ctypes[c] & ctype_space) == 0) return FALSE;
- break;
-
- case OP_NOT_WORDCHAR:
- if ((md->ctypes[c] & ctype_word) != 0) return FALSE;
- break;
-
- case OP_WORDCHAR:
- if ((md->ctypes[c] & ctype_word) == 0) return FALSE;
- break;
- }
- }
- /* Control never gets here */
- }
-
- /* If maximizing it is worth using inline code for speed, doing the type
- test once at the start (i.e. keep it out of the loop). */
-
- else
- {
- const uschar *pp = eptr;
- switch(ctype)
- {
- case OP_ANY:
-
- /* Special code is required for UTF8, but when the maximum is unlimited
- we don't need it. */
-
-#ifdef SUPPORT_UTF8
- if (md->utf8 && max < INT_MAX)
- {
- if ((ims & PCRE_DOTALL) == 0)
- {
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || *eptr++ == NEWLINE) break;
- while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
- }
- }
- else
- {
- for (i = min; i < max; i++)
- {
- eptr++;
- while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
- }
- }
- break;
- }
-#endif
- /* Non-UTF8 can be faster */
- if ((ims & PCRE_DOTALL) == 0)
- {
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || *eptr == NEWLINE) break;
- eptr++;
- }
- break;
- }
- /* For non-UTF8 DOTALL case, fall through and treat as \C */
-
- case OP_ANYBYTE:
- c = max - min;
- if (c > md->end_subject - eptr) c = md->end_subject - eptr;
- eptr += c;
- break;
-
- case OP_NOT_DIGIT:
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
- break;
- eptr++;
- }
- break;
-
- case OP_DIGIT:
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
- break;
- eptr++;
- }
- break;
-
- case OP_NOT_WHITESPACE:
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
- break;
- eptr++;
- }
- break;
-
- case OP_WHITESPACE:
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
- break;
- eptr++;
- }
- break;
-
- case OP_NOT_WORDCHAR:
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
- break;
- eptr++;
- }
- break;
-
- case OP_WORDCHAR:
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
- break;
- eptr++;
- }
- break;
- }
-
- while (eptr >= pp)
- {
- if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
- return TRUE;
-#ifdef SUPPORT_UTF8
- if (md->utf8)
- while (eptr > pp && (*eptr & 0xc0) == 0x80) eptr--;
-#endif
- }
- return FALSE;
- }
- /* Control never gets here */
-
- /* There's been some horrible disaster. */
-
- default:
- DPRINTF(("Unknown opcode %d\n", *ecode));
- md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
- return FALSE;
- }
-
- /* Do not stick any code in here without much thought; it is assumed
- that "continue" in the code above comes out to here to repeat the main
- loop. */
-
- } /* End of main loop */
-/* Control never reaches here */
-}
-
-
-
-
-/*************************************************
-* Execute a Regular Expression *
-*************************************************/
-
-/* This function applies a compiled re to a subject string and picks out
-portions of the string if it matches. Two elements in the vector are set for
-each substring: the offsets to the start and end of the substring.
-
-Arguments:
- external_re points to the compiled expression
- external_extra points to "hints" from pcre_study() or is NULL
- subject points to the subject string
- length length of subject string (may contain binary zeros)
- start_offset where to start in the subject string
- options option bits
- offsets points to a vector of ints to be filled in with offsets
- offsetcount the number of elements in the vector
-
-Returns: > 0 => success; value is the number of elements filled in
- = 0 => success, but offsets is not big enough
- -1 => failed to match
- < -1 => some kind of unexpected problem
-*/
-
-int
-pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
- const char *subject, int length, int start_offset, int options, int *offsets,
- int offsetcount)
-{
-int resetcount, ocount;
-int first_char = -1;
-int req_char = -1;
-int req_char2 = -1;
-unsigned long int ims = 0;
-match_data match_block;
-const uschar *start_bits = NULL;
-const uschar *start_match = (const uschar *)subject + start_offset;
-const uschar *end_subject;
-const uschar *req_char_ptr = start_match - 1;
-const real_pcre *re = (const real_pcre *)external_re;
-const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
-const uschar *codestart =
- (const uschar *)re + sizeof(real_pcre) + re->name_count * re->name_entry_size;
-BOOL using_temporary_offsets = FALSE;
-BOOL anchored;
-BOOL startline;
-BOOL first_char_caseless = FALSE;
-BOOL req_char_caseless = FALSE;
-
-if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
-
-if (re == NULL || subject == NULL ||
- (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
-if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
-
-anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
-startline = (re->options & PCRE_STARTLINE) != 0;
-
-match_block.start_code = codestart;
-match_block.start_subject = (const uschar *)subject;
-match_block.start_offset = start_offset;
-match_block.end_subject = match_block.start_subject + length;
-end_subject = match_block.end_subject;
-
-match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
-match_block.utf8 = (re->options & PCRE_UTF8) != 0;
-
-match_block.notbol = (options & PCRE_NOTBOL) != 0;
-match_block.noteol = (options & PCRE_NOTEOL) != 0;
-match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
-
-match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
-match_block.recursive = NULL; /* No recursion */
-
-match_block.lcc = re->tables + lcc_offset;
-match_block.ctypes = re->tables + ctypes_offset;
-
-/* The ims options can vary during the matching as a result of the presence
-of (?ims) items in the pattern. They are kept in a local variable so that
-restoring at the exit of a group is easy. */
-
-ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
-
-/* If the expression has got more back references than the offsets supplied can
-hold, we get a temporary bit of working store to use during the matching.
-Otherwise, we can use the vector supplied, rounding down its size to a multiple
-of 3. */
-
-ocount = offsetcount - (offsetcount % 3);
-
-if (re->top_backref > 0 && re->top_backref >= ocount/3)
- {
- ocount = re->top_backref * 3 + 3;
- match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
- if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
- using_temporary_offsets = TRUE;
- DPRINTF(("Got memory to hold back references\n"));
- }
-else match_block.offset_vector = offsets;
-
-match_block.offset_end = ocount;
-match_block.offset_max = (2*ocount)/3;
-match_block.offset_overflow = FALSE;
-match_block.capture_last = -1;
-
-/* Compute the minimum number of offsets that we need to reset each time. Doing
-this makes a huge difference to execution time when there aren't many brackets
-in the pattern. */
-
-resetcount = 2 + re->top_bracket * 2;
-if (resetcount > offsetcount) resetcount = ocount;
-
-/* Reset the working variable associated with each extraction. These should
-never be used unless previously set, but they get saved and restored, and so we
-initialize them to avoid reading uninitialized locations. */
-
-if (match_block.offset_vector != NULL)
- {
- register int *iptr = match_block.offset_vector + ocount;
- register int *iend = iptr - resetcount/2 + 1;
- while (--iptr >= iend) *iptr = -1;
- }
-
-/* Set up the first character to match, if available. The first_char value is
-never set for an anchored regular expression, but the anchoring may be forced
-at run time, so we have to test for anchoring. The first char may be unset for
-an unanchored pattern, of course. If there's no first char and the pattern was
-studied, there may be a bitmap of possible first characters. */
-
-if (!anchored)
- {
- if ((re->options & PCRE_FIRSTSET) != 0)
- {
- first_char = re->first_char & 255;
- if ((first_char_caseless = ((re->first_char & REQ_CASELESS) != 0)) == TRUE)
- first_char = match_block.lcc[first_char];
- }
- else
- if (!startline && extra != NULL &&
- (extra->options & PCRE_STUDY_MAPPED) != 0)
- start_bits = extra->start_bits;
- }
-
-/* For anchored or unanchored matches, there may be a "last known required
-character" set. */
-
-if ((re->options & PCRE_REQCHSET) != 0)
- {
- req_char = re->req_char & 255;
- req_char_caseless = (re->req_char & REQ_CASELESS) != 0;
- req_char2 = (re->tables + fcc_offset)[req_char]; /* case flipped */
- }
-
-/* Loop for handling unanchored repeated matching attempts; for anchored regexs
-the loop runs just once. */
-
-do
- {
- int rc;
- register int *iptr = match_block.offset_vector;
- register int *iend = iptr + resetcount;
-
- /* Reset the maximum number of extractions we might see. */
-
- while (iptr < iend) *iptr++ = -1;
-
- /* Advance to a unique first char if possible */
-
- if (first_char >= 0)
- {
- if (first_char_caseless)
- while (start_match < end_subject &&
- match_block.lcc[*start_match] != first_char)
- start_match++;
- else
- while (start_match < end_subject && *start_match != first_char)
- start_match++;
- }
-
- /* Or to just after \n for a multiline match if possible */
-
- else if (startline)
- {
- if (start_match > match_block.start_subject + start_offset)
- {
- while (start_match < end_subject && start_match[-1] != NEWLINE)
- start_match++;
- }
- }
-
- /* Or to a non-unique first char after study */
-
- else if (start_bits != NULL)
- {
- while (start_match < end_subject)
- {
- register int c = *start_match;
- if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
- }
- }
-
-#ifdef DEBUG /* Sigh. Some compilers never learn. */
- printf(">>>> Match against: ");
- pchars(start_match, end_subject - start_match, TRUE, &match_block);
- printf("\n");
-#endif
-
- /* If req_char is set, we know that that character must appear in the subject
- for the match to succeed. If the first character is set, req_char must be
- later in the subject; otherwise the test starts at the match point. This
- optimization can save a huge amount of backtracking in patterns with nested
- unlimited repeats that aren't going to match. Writing separate code for
- cased/caseless versions makes it go faster, as does using an autoincrement
- and backing off on a match. */
-
- if (req_char >= 0)
- {
- register const uschar *p = start_match + ((first_char >= 0)? 1 : 0);
-
- /* We don't need to repeat the search if we haven't yet reached the
- place we found it at last time. */
-
- if (p > req_char_ptr)
- {
- if (req_char_caseless)
- {
- while (p < end_subject)
- {
- register int pp = *p++;
- if (pp == req_char || pp == req_char2) { p--; break; }
- }
- }
- else
- {
- while (p < end_subject)
- {
- if (*p++ == req_char) { p--; break; }
- }
- }
-
- /* If we can't find the required character, break the matching loop */
-
- if (p >= end_subject) break;
-
- /* If we have found the required character, save the point where we
- found it, so that we don't search again next time round the loop if
- the start hasn't passed this character yet. */
-
- req_char_ptr = p;
- }
- }
-
- /* When a match occurs, substrings will be set for all internal extractions;
- we just need to set up the whole thing as substring 0 before returning. If
- there were too many extractions, set the return code to zero. In the case
- where we had to get some local store to hold offsets for backreferences, copy
- those back references that we can. In this case there need not be overflow
- if certain parts of the pattern were not used. */
-
- match_block.start_match = start_match;
- if (!match(start_match, codestart, 2, &match_block, ims, NULL, match_isgroup))
- continue;
-
- /* Copy the offset information from temporary store if necessary */
-
- if (using_temporary_offsets)
- {
- if (offsetcount >= 4)
- {
- memcpy(offsets + 2, match_block.offset_vector + 2,
- (offsetcount - 2) * sizeof(int));
- DPRINTF(("Copied offsets from temporary memory\n"));
- }
- if (match_block.end_offset_top > offsetcount)
- match_block.offset_overflow = TRUE;
-
- DPRINTF(("Freeing temporary memory\n"));
- (pcre_free)(match_block.offset_vector);
- }
-
- rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
-
- if (offsetcount < 2) rc = 0; else
- {
- offsets[0] = start_match - match_block.start_subject;
- offsets[1] = match_block.end_match_ptr - match_block.start_subject;
- }
-
- DPRINTF((">>>> returning %d\n", rc));
- return rc;
- }
-
-/* This "while" is the end of the "do" above */
-
-while (!anchored &&
- match_block.errorcode == PCRE_ERROR_NOMATCH &&
- start_match++ < end_subject);
-
-if (using_temporary_offsets)
- {
- DPRINTF(("Freeing temporary memory\n"));
- (pcre_free)(match_block.offset_vector);
- }
-
-DPRINTF((">>>> returning %d\n", match_block.errorcode));
-
-return match_block.errorcode;
-}
-
-/* End of pcre.c */
diff --git a/ext/pcre/pcrelib/pcre.def b/ext/pcre/pcrelib/pcre.def
deleted file mode 100644
index 4f6c4bff40..0000000000
--- a/ext/pcre/pcrelib/pcre.def
+++ /dev/null
@@ -1,22 +0,0 @@
-EXPORTS
-
-pcre_malloc DATA
-pcre_free DATA
-
-pcre_compile
-pcre_copy_substring
-pcre_exec
-pcre_get_substring
-pcre_get_substring_list
-pcre_free_substring
-pcre_free_substring_list
-pcre_info
-pcre_fullinfo
-pcre_maketables
-pcre_study
-pcre_version
-
-regcomp
-regexec
-regerror
-regfree
diff --git a/ext/pcre/pcrelib/pcre.h b/ext/pcre/pcrelib/pcre.h
deleted file mode 100644
index b33045c4f3..0000000000
--- a/ext/pcre/pcrelib/pcre.h
+++ /dev/null
@@ -1,150 +0,0 @@
-/*************************************************
-* Perl-Compatible Regular Expressions *
-*************************************************/
-
-/* Copyright (c) 1997-2003 University of Cambridge */
-
-#ifndef _PCRE_H
-#define _PCRE_H
-
-/* The file pcre.h is build by "configure". Do not edit it; instead
-make changes to pcre.in. */
-
-#include "php_compat.h"
-
-#define PCRE_MAJOR 3
-#define PCRE_MINOR 92
-#define PCRE_DATE 11-Sep-2002
-
-/* Win32 uses DLL by default */
-
-#ifdef _WIN32
-# ifdef PHP_EXPORTS
-# define PCRE_DL_IMPORT
-# else
-# define PCRE_DL_IMPORT __declspec(dllimport)
-# endif
-#else
-# define PCRE_DL_IMPORT
-#endif
-
-/* Have to include stdlib.h in order to ensure that size_t is defined;
-it is needed here for malloc. */
-
-#include <stdlib.h>
-
-/* Allow for C++ users */
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/* Options */
-
-#define PCRE_CASELESS 0x0001
-#define PCRE_MULTILINE 0x0002
-#define PCRE_DOTALL 0x0004
-#define PCRE_EXTENDED 0x0008
-#define PCRE_ANCHORED 0x0010
-#define PCRE_DOLLAR_ENDONLY 0x0020
-#define PCRE_EXTRA 0x0040
-#define PCRE_NOTBOL 0x0080
-#define PCRE_NOTEOL 0x0100
-#define PCRE_UNGREEDY 0x0200
-#define PCRE_NOTEMPTY 0x0400
-#define PCRE_UTF8 0x0800
-
-/* Exec-time and get-time error codes */
-
-#define PCRE_ERROR_NOMATCH (-1)
-#define PCRE_ERROR_NULL (-2)
-#define PCRE_ERROR_BADOPTION (-3)
-#define PCRE_ERROR_BADMAGIC (-4)
-#define PCRE_ERROR_UNKNOWN_NODE (-5)
-#define PCRE_ERROR_NOMEMORY (-6)
-#define PCRE_ERROR_NOSUBSTRING (-7)
-
-/* Request types for pcre_fullinfo() */
-
-#define PCRE_INFO_OPTIONS 0
-#define PCRE_INFO_SIZE 1
-#define PCRE_INFO_CAPTURECOUNT 2
-#define PCRE_INFO_BACKREFMAX 3
-#define PCRE_INFO_FIRSTCHAR 4
-#define PCRE_INFO_FIRSTTABLE 5
-#define PCRE_INFO_LASTLITERAL 6
-#define PCRE_INFO_NAMEENTRYSIZE 7
-#define PCRE_INFO_NAMECOUNT 8
-#define PCRE_INFO_NAMETABLE 9
-
-/* Types */
-
-struct real_pcre; /* declaration; the definition is private */
-struct real_pcre_extra; /* declaration; the definition is private */
-
-typedef struct real_pcre pcre;
-typedef struct real_pcre_extra pcre_extra;
-
-/* The structure for passing out data via the pcre_callout_function. We use a
-structure so that new fields can be added on the end in future versions,
-without changing the API of the function, thereby allowing old clients to work
-without modification. */
-
-typedef struct pcre_callout_block {
- int version; /* Identifies version of block */
- /* ------------------------ Version 0 ------------------------------- */
- int callout_number; /* Number compiled into pattern */
- int *offset_vector; /* The offset vector */
- const char *subject; /* The subject being matched */
- int subject_length; /* The length of the subject */
- int start_match; /* Offset to start of this match attempt */
- int current_position; /* Where we currently are */
- int capture_top; /* Max current capture */
- int capture_last; /* Most recently closed capture */
- /* ------------------------------------------------------------------ */
-} pcre_callout_block;
-
-/* Indirection for store get and free functions. These can be set to
-alternative malloc/free functions if required. There is also an optional
-callout function that is triggered by the (?) regex item. Some magic is
-required for Win32 DLL; it is null on other OS. For Virtual Pascal, these have
-to be different again. */
-
-#ifndef VPCOMPAT
-PCRE_DL_IMPORT extern void *(*pcre_malloc)(size_t);
-PCRE_DL_IMPORT extern void (*pcre_free)(void *);
-PCRE_DL_IMPORT extern int (*pcre_callout)(pcre_callout_block *);
-#else /* VPCOMPAT */
-extern void *pcre_malloc(size_t);
-extern void pcre_free(void *);
-extern int pcre_callout(pcre_callout_block *);
-#endif /* VPCOMPAT */
-
-/* Exported PCRE functions */
-
-PCRE_DL_IMPORT extern pcre *pcre_compile(const char *, int, const char **,
- int *, const unsigned char *);
-PCRE_DL_IMPORT extern int pcre_copy_substring(const char *, int *, int, int,
- char *, int);
-PCRE_DL_IMPORT extern int pcre_exec(const pcre *, const pcre_extra *,
- const char *, int, int, int, int *, int);
-PCRE_DL_IMPORT extern void pcre_free_substring(const char *);
-PCRE_DL_IMPORT extern void pcre_free_substring_list(const char **);
-PCRE_DL_IMPORT extern int pcre_get_substring(const char *, int *, int, int,
- const char **);
-PCRE_DL_IMPORT extern int pcre_get_substring_list(const char *, int *, int,
- const char ***);
-PCRE_DL_IMPORT extern int pcre_info(const pcre *, int *, int *);
-PCRE_DL_IMPORT extern int pcre_fullinfo(const pcre *, const pcre_extra *, int,
- void *);
-PCRE_DL_IMPORT extern const unsigned char *pcre_maketables(void);
-PCRE_DL_IMPORT extern pcre_extra *pcre_study(const pcre *, int, const char **);
-PCRE_DL_IMPORT extern const char *pcre_version(void);
-
-#undef PCRE_DL_IMPORT
-
-#ifdef __cplusplus
-} /* extern "C" */
-#endif
-
-#endif /* End of pcre.h */
diff --git a/ext/pcre/pcrelib/pcregrep.c b/ext/pcre/pcrelib/pcregrep.c
deleted file mode 100644
index 87bb65ccbd..0000000000
--- a/ext/pcre/pcrelib/pcregrep.c
+++ /dev/null
@@ -1,640 +0,0 @@
-/*************************************************
-* pcregrep program *
-*************************************************/
-
-/* This is a grep program that uses the PCRE regular expression library to do
-its pattern matching. On a Unix or Win32 system it can recurse into
-directories. */
-
-#include <ctype.h>
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-#include <errno.h>
-#include "config.h"
-#include "pcre.h"
-
-#define FALSE 0
-#define TRUE 1
-
-typedef int BOOL;
-
-#define VERSION "2.2 10-Sep-2003"
-#define MAX_PATTERN_COUNT 100
-
-
-/*************************************************
-* Global variables *
-*************************************************/
-
-static char *pattern_filename = NULL;
-static int pattern_count = 0;
-static pcre **pattern_list;
-static pcre_extra **hints_list;
-
-static BOOL count_only = FALSE;
-static BOOL filenames = TRUE;
-static BOOL filenames_only = FALSE;
-static BOOL invert = FALSE;
-static BOOL number = FALSE;
-static BOOL recurse = FALSE;
-static BOOL silent = FALSE;
-static BOOL whole_lines = FALSE;
-
-/* Structure for options and list of them */
-
-typedef struct option_item {
- int one_char;
- char *long_name;
- char *help_text;
-} option_item;
-
-static option_item optionlist[] = {
- { -1, "help", "display this help and exit" },
- { 'c', "count", "print only a count of matching lines per FILE" },
- { 'h', "no-filename", "suppress the prefixing filename on output" },
- { 'i', "ignore-case", "ignore case distinctions" },
- { 'l', "files-with-matches", "print only FILE names containing matches" },
- { 'n', "line-number", "print line number with output lines" },
- { 'r', "recursive", "recursively scan sub-directories" },
- { 's', "no-messages", "suppress error messages" },
- { 'V', "version", "print version information and exit" },
- { 'v', "invert-match", "select non-matching lines" },
- { 'x', "line-regex", "force PATTERN to match only whole lines" },
- { 'x', "line-regexp", "force PATTERN to match only whole lines" },
- { 0, NULL, NULL }
-};
-
-
-/*************************************************
-* Functions for directory scanning *
-*************************************************/
-
-/* These functions are defined so that they can be made system specific,
-although at present the only ones are for Unix, Win32, and for "no directory
-recursion support". */
-
-
-/************* Directory scanning in Unix ***********/
-
-#if IS_UNIX
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <dirent.h>
-
-typedef DIR directory_type;
-
-int
-isdirectory(char *filename)
-{
-struct stat statbuf;
-if (stat(filename, &statbuf) < 0)
- return 0; /* In the expectation that opening as a file will fail */
-return ((statbuf.st_mode & S_IFMT) == S_IFDIR)? '/' : 0;
-}
-
-directory_type *
-opendirectory(char *filename)
-{
-return opendir(filename);
-}
-
-char *
-readdirectory(directory_type *dir)
-{
-for (;;)
- {
- struct dirent *dent = readdir(dir);
- if (dent == NULL) return NULL;
- if (strcmp(dent->d_name, ".") != 0 && strcmp(dent->d_name, "..") != 0)
- return dent->d_name;
- }
-return NULL; /* Keep compiler happy; never executed */
-}
-
-void
-closedirectory(directory_type *dir)
-{
-closedir(dir);
-}
-
-
-/************* Directory scanning in Win32 ***********/
-
-/* I (Philip Hazel) have no means of testing this code. It was contributed by
-Lionel Fourquaux. */
-
-
-#elif HAVE_WIN32API
-
-#ifndef STRICT
-# define STRICT
-#endif
-#ifndef WIN32_LEAN_AND_MEAN
-# define WIN32_LEAN_AND_MEAN
-#endif
-#include <windows.h>
-
-typedef struct directory_type
-{
-HANDLE handle;
-BOOL first;
-WIN32_FIND_DATA data;
-} directory_type;
-
-int
-isdirectory(char *filename)
-{
-DWORD attr = GetFileAttributes(filename);
-if (attr == INVALID_FILE_ATTRIBUTES)
- return 0;
-return ((attr & FILE_ATTRIBUTE_DIRECTORY) != 0) ? '/' : 0;
-}
-
-directory_type *
-opendirectory(char *filename)
-{
-size_t len;
-char *pattern;
-directory_type *dir;
-DWORD err;
-len = strlen(filename);
-pattern = (char *) malloc(len + 3);
-dir = (directory_type *) malloc(sizeof(*dir));
-if ((pattern == NULL) || (dir == NULL))
- {
- fprintf(stderr, "pcregrep: malloc failed\n");
- exit(2);
- }
-memcpy(pattern, filename, len);
-memcpy(&(pattern[len]), "\\*", 3);
-dir->handle = FindFirstFile(pattern, &(dir->data));
-if (dir->handle != INVALID_HANDLE_VALUE)
- {
- free(pattern);
- dir->first = TRUE;
- return dir;
- }
-err = GetLastError();
-free(pattern);
-free(dir);
-errno = (err == ERROR_ACCESS_DENIED) ? EACCES : ENOENT;
-return NULL;
-}
-
-char *
-readdirectory(directory_type *dir)
-{
-for (;;)
- {
- if (!dir->first)
- {
- if (!FindNextFile(dir->handle, &(dir->data)))
- return NULL;
- }
- else
- {
- dir->first = FALSE;
- }
- if (strcmp(dir->data.cFileName, ".") != 0 && strcmp(dir->data.cFileName, "..") != 0)
- return dir->data.cFileName;
- }
-#ifndef _MSC_VER
-return NULL; /* Keep compiler happy; never executed */
-#endif
-}
-
-void
-closedirectory(directory_type *dir)
-{
-FindClose(dir->handle);
-free(dir);
-}
-
-
-/************* Directory scanning when we can't do it ***********/
-
-/* The type is void, and apart from isdirectory(), the functions do nothing. */
-
-#else
-
-typedef void directory_type;
-
-int isdirectory(char *filename) { return FALSE; }
-directory_type * opendirectory(char *filename) {}
-char *readdirectory(directory_type *dir) {}
-void closedirectory(directory_type *dir) {}
-
-#endif
-
-
-
-#if ! HAVE_STRERROR
-/*************************************************
-* Provide strerror() for non-ANSI libraries *
-*************************************************/
-
-/* Some old-fashioned systems still around (e.g. SunOS4) don't have strerror()
-in their libraries, but can provide the same facility by this simple
-alternative function. */
-
-extern int sys_nerr;
-extern char *sys_errlist[];
-
-char *
-strerror(int n)
-{
-if (n < 0 || n >= sys_nerr) return "unknown error number";
-return sys_errlist[n];
-}
-#endif /* HAVE_STRERROR */
-
-
-
-/*************************************************
-* Grep an individual file *
-*************************************************/
-
-static int
-pcregrep(FILE *in, char *name)
-{
-int rc = 1;
-int linenumber = 0;
-int count = 0;
-int offsets[99];
-char buffer[BUFSIZ];
-
-while (fgets(buffer, sizeof(buffer), in) != NULL)
- {
- BOOL match = FALSE;
- int i;
- int length = (int)strlen(buffer);
- if (length > 0 && buffer[length-1] == '\n') buffer[--length] = 0;
- linenumber++;
-
- for (i = 0; !match && i < pattern_count; i++)
- {
- match = pcre_exec(pattern_list[i], hints_list[i], buffer, length, 0, 0,
- offsets, 99) >= 0;
- if (match && whole_lines && offsets[1] != length) match = FALSE;
- }
-
- if (match != invert)
- {
- if (count_only) count++;
-
- else if (filenames_only)
- {
- fprintf(stdout, "%s\n", (name == NULL)? "<stdin>" : name);
- return 0;
- }
-
- else if (silent) return 0;
-
- else
- {
- if (name != NULL) fprintf(stdout, "%s:", name);
- if (number) fprintf(stdout, "%d:", linenumber);
- fprintf(stdout, "%s\n", buffer);
- }
-
- rc = 0;
- }
- }
-
-if (count_only)
- {
- if (name != NULL) fprintf(stdout, "%s:", name);
- fprintf(stdout, "%d\n", count);
- }
-
-return rc;
-}
-
-
-
-
-/*************************************************
-* Grep a file or recurse into a directory *
-*************************************************/
-
-static int
-grep_or_recurse(char *filename, BOOL recurse, BOOL show_filenames,
- BOOL only_one_at_top)
-{
-int rc = 1;
-int sep;
-FILE *in;
-
-/* If the file is a directory and we are recursing, scan each file within it.
-The scanning code is localized so it can be made system-specific. */
-
-if ((sep = isdirectory(filename)) != 0 && recurse)
- {
- char buffer[1024];
- char *nextfile;
- directory_type *dir = opendirectory(filename);
-
- if (dir == NULL)
- {
- fprintf(stderr, "pcregrep: Failed to open directory %s: %s\n", filename,
- strerror(errno));
- return 2;
- }
-
- while ((nextfile = readdirectory(dir)) != NULL)
- {
- int frc;
- sprintf(buffer, "%.512s%c%.128s", filename, sep, nextfile);
- frc = grep_or_recurse(buffer, recurse, TRUE, FALSE);
- if (frc == 0 && rc == 1) rc = 0;
- }
-
- closedirectory(dir);
- return rc;
- }
-
-/* If the file is not a directory, or we are not recursing, scan it. If this is
-the first and only argument at top level, we don't show the file name (unless
-we are only showing the file name). Otherwise, control is via the
-show_filenames variable. */
-
-in = fopen(filename, "r");
-if (in == NULL)
- {
- fprintf(stderr, "pcregrep: Failed to open %s: %s\n", filename, strerror(errno));
- return 2;
- }
-
-rc = pcregrep(in, (filenames_only || (show_filenames && !only_one_at_top))?
- filename : NULL);
-fclose(in);
-return rc;
-}
-
-
-
-
-/*************************************************
-* Usage function *
-*************************************************/
-
-static int
-usage(int rc)
-{
-fprintf(stderr, "Usage: pcregrep [-Vcfhilnrsvx] [long-options] [pattern] [file1 file2 ...]\n");
-fprintf(stderr, "Type `pcregrep --help' for more information.\n");
-return rc;
-}
-
-
-
-
-/*************************************************
-* Help function *
-*************************************************/
-
-static void
-help(void)
-{
-option_item *op;
-
-printf("Usage: pcregrep [OPTION]... [PATTERN] [FILE1 FILE2 ...]\n");
-printf("Search for PATTERN in each FILE or standard input.\n");
-printf("PATTERN must be present if -f is not used.\n");
-printf("Example: pcregrep -i 'hello.*world' menu.h main.c\n\n");
-
-printf("Options:\n");
-
-for (op = optionlist; op->one_char != 0; op++)
- {
- int n;
- char s[4];
- if (op->one_char > 0) sprintf(s, "-%c,", op->one_char); else strcpy(s, " ");
- printf(" %s --%s%n", s, op->long_name, &n);
- n = 30 - n;
- if (n < 1) n = 1;
- printf("%.*s%s\n", n, " ", op->help_text);
- }
-
-printf("\n -f<filename> or --file=<filename>\n");
-printf(" Read patterns from <filename> instead of using a command line option.\n");
-printf(" Trailing white space is removed; blanks lines are ignored.\n");
-printf(" There is a maximum of %d patterns.\n", MAX_PATTERN_COUNT);
-
-printf("\nWith no FILE, read standard input. If fewer than two FILEs given, assume -h.\n");
-printf("Exit status is 0 if any matches, 1 if no matches, and 2 if trouble.\n");
-}
-
-
-
-
-/*************************************************
-* Handle an option *
-*************************************************/
-
-static int
-handle_option(int letter, int options)
-{
-switch(letter)
- {
- case -1: help(); exit(0);
- case 'c': count_only = TRUE; break;
- case 'h': filenames = FALSE; break;
- case 'i': options |= PCRE_CASELESS; break;
- case 'l': filenames_only = TRUE;
- case 'n': number = TRUE; break;
- case 'r': recurse = TRUE; break;
- case 's': silent = TRUE; break;
- case 'v': invert = TRUE; break;
- case 'x': whole_lines = TRUE; options |= PCRE_ANCHORED; break;
-
- case 'V':
- fprintf(stderr, "pcregrep version %s using ", VERSION);
- fprintf(stderr, "PCRE version %s\n", pcre_version());
- exit(0);
- break;
-
- default:
- fprintf(stderr, "pcregrep: Unknown option -%c\n", letter);
- exit(usage(2));
- }
-
-return options;
-}
-
-
-
-
-/*************************************************
-* Main program *
-*************************************************/
-
-int
-main(int argc, char **argv)
-{
-int i, j;
-int rc = 1;
-int options = 0;
-int errptr;
-const char *error;
-BOOL only_one_at_top;
-
-/* Process the options */
-
-for (i = 1; i < argc; i++)
- {
- if (argv[i][0] != '-') break;
-
- /* Missing options */
-
- if (argv[i][1] == 0) exit(usage(2));
-
- /* Long name options */
-
- if (argv[i][1] == '-')
- {
- option_item *op;
-
- if (strncmp(argv[i]+2, "file=", 5) == 0)
- {
- pattern_filename = argv[i] + 7;
- continue;
- }
-
- for (op = optionlist; op->one_char != 0; op++)
- {
- if (strcmp(argv[i]+2, op->long_name) == 0)
- {
- options = handle_option(op->one_char, options);
- break;
- }
- }
- if (op->one_char == 0)
- {
- fprintf(stderr, "pcregrep: Unknown option %s\n", argv[i]);
- exit(usage(2));
- }
- }
-
- /* One-char options */
-
- else
- {
- char *s = argv[i] + 1;
- while (*s != 0)
- {
- if (*s == 'f')
- {
- pattern_filename = s + 1;
- if (pattern_filename[0] == 0)
- {
- if (i >= argc - 1)
- {
- fprintf(stderr, "pcregrep: File name missing after -f\n");
- exit(usage(2));
- }
- pattern_filename = argv[++i];
- }
- break;
- }
- else options = handle_option(*s++, options);
- }
- }
- }
-
-pattern_list = malloc(MAX_PATTERN_COUNT * sizeof(pcre *));
-hints_list = malloc(MAX_PATTERN_COUNT * sizeof(pcre_extra *));
-
-if (pattern_list == NULL || hints_list == NULL)
- {
- fprintf(stderr, "pcregrep: malloc failed\n");
- return 2;
- }
-
-/* Compile the regular expression(s). */
-
-if (pattern_filename != NULL)
- {
- FILE *f = fopen(pattern_filename, "r");
- char buffer[BUFSIZ];
- if (f == NULL)
- {
- fprintf(stderr, "pcregrep: Failed to open %s: %s\n", pattern_filename,
- strerror(errno));
- return 2;
- }
- while (fgets(buffer, sizeof(buffer), f) != NULL)
- {
- char *s = buffer + (int)strlen(buffer);
- if (pattern_count >= MAX_PATTERN_COUNT)
- {
- fprintf(stderr, "pcregrep: Too many patterns in file (max %d)\n",
- MAX_PATTERN_COUNT);
- return 2;
- }
- while (s > buffer && isspace((unsigned char)(s[-1]))) s--;
- if (s == buffer) continue;
- *s = 0;
- pattern_list[pattern_count] = pcre_compile(buffer, options, &error,
- &errptr, NULL);
- if (pattern_list[pattern_count++] == NULL)
- {
- fprintf(stderr, "pcregrep: Error in regex number %d at offset %d: %s\n",
- pattern_count, errptr, error);
- return 2;
- }
- }
- fclose(f);
- }
-
-/* If no file name, a single regex must be given inline */
-
-else
- {
- if (i >= argc) return usage(2);
- pattern_list[0] = pcre_compile(argv[i++], options, &error, &errptr, NULL);
- if (pattern_list[0] == NULL)
- {
- fprintf(stderr, "pcregrep: Error in regex at offset %d: %s\n", errptr,
- error);
- return 2;
- }
- pattern_count++;
- }
-
-/* Study the regular expressions, as we will be running them may times */
-
-for (j = 0; j < pattern_count; j++)
- {
- hints_list[j] = pcre_study(pattern_list[j], 0, &error);
- if (error != NULL)
- {
- char s[16];
- if (pattern_count == 1) s[0] = 0; else sprintf(s, " number %d", j);
- fprintf(stderr, "pcregrep: Error while studying regex%s: %s\n", s, error);
- return 2;
- }
- }
-
-/* If there are no further arguments, do the business on stdin and exit */
-
-if (i >= argc) return pcregrep(stdin, NULL);
-
-/* Otherwise, work through the remaining arguments as files or directories.
-Pass in the fact that there is only one argument at top level - this suppresses
-the file name if the argument is not a directory. */
-
-only_one_at_top = (i == argc - 1);
-if (filenames_only) filenames = TRUE;
-
-for (; i < argc; i++)
- {
- int frc = grep_or_recurse(argv[i], recurse, filenames, only_one_at_top);
- if (frc == 0 && rc == 1) rc = 0;
- }
-
-return rc;
-}
-
-/* End */
diff --git a/ext/pcre/pcrelib/pcreposix.c b/ext/pcre/pcrelib/pcreposix.c
deleted file mode 100644
index 0ed55bb248..0000000000
--- a/ext/pcre/pcrelib/pcreposix.c
+++ /dev/null
@@ -1,301 +0,0 @@
-/*************************************************
-* Perl-Compatible Regular Expressions *
-*************************************************/
-
-/*
-This is a library of functions to support regular expressions whose syntax
-and semantics are as close as possible to those of the Perl 5 language. See
-the file Tech.Notes for some information on the internals.
-
-This module is a wrapper that provides a POSIX API to the underlying PCRE
-functions.
-
-Written by: Philip Hazel <ph10@cam.ac.uk>
-
- Copyright (c) 1997-2003 University of Cambridge
-
------------------------------------------------------------------------------
-Permission is granted to anyone to use this software for any purpose on any
-computer system, and to redistribute it freely, subject to the following
-restrictions:
-
-1. This software is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-
-2. The origin of this software must not be misrepresented, either by
- explicit claim or by omission.
-
-3. Altered versions must be plainly marked as such, and must not be
- misrepresented as being the original software.
-
-4. If PCRE is embedded in any software that is released under the GNU
- General Purpose Licence (GPL), then the terms of that licence shall
- supersede any condition above with which it is incompatible.
------------------------------------------------------------------------------
-*/
-
-#include "internal.h"
-#include "pcreposix.h"
-#include "stdlib.h"
-
-
-
-/* Corresponding tables of PCRE error messages and POSIX error codes. */
-
-static const char *estring[] = {
- ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, ERR10,
- ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20,
- ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR29, ERR29, ERR30,
- ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR40,
- ERR41, ERR42, ERR43 };
-
-static int eint[] = { /* entry i is the POSIX code returned for estring[i] */
- REG_EESCAPE, /* "\\ at end of pattern" */
- REG_EESCAPE, /* "\\c at end of pattern" */
- REG_EESCAPE, /* "unrecognized character follows \\" */
- REG_BADBR, /* "numbers out of order in {} quantifier" */
- REG_BADBR, /* "number too big in {} quantifier" */
- REG_EBRACK, /* "missing terminating ] for character class" */
- REG_ECTYPE, /* "invalid escape sequence in character class" */
- REG_ERANGE, /* "range out of order in character class" */
- REG_BADRPT, /* "nothing to repeat" */
- REG_BADRPT, /* "operand of unlimited repeat could match the empty string" */
- REG_ASSERT, /* "internal error: unexpected repeat" */
- REG_BADPAT, /* "unrecognized character after (?" */
- REG_BADPAT, /* "POSIX named classes are supported only within a class" */
- REG_EPAREN, /* "missing )" */
- REG_ESUBREG, /* "reference to non-existent subpattern" */
- REG_INVARG, /* "erroffset passed as NULL" */
- REG_INVARG, /* "unknown option bit(s) set" */
- REG_EPAREN, /* "missing ) after comment" */
- REG_ESIZE, /* "parentheses nested too deeply" */
- REG_ESIZE, /* "regular expression too large" */
- REG_ESPACE, /* "failed to get memory" */
- REG_EPAREN, /* "unmatched brackets" */
- REG_ASSERT, /* "internal error: code overflow" */
- REG_BADPAT, /* "unrecognized character after (?<" */
- REG_BADPAT, /* "lookbehind assertion is not fixed length" */
- REG_BADPAT, /* "malformed number after (?(" */
- REG_BADPAT, /* "conditional group contains more than two branches" */
- REG_BADPAT, /* "assertion expected after (?(" */
- REG_BADPAT, /* "(?R or (?digits must be followed by )" */
- REG_ECTYPE, /* "unknown POSIX class name" */
- REG_BADPAT, /* "POSIX collating elements are not supported" */
- REG_INVARG, /* "this version of PCRE is not compiled with PCRE_UTF8 support" */
- REG_BADPAT, /* "characters with values > 255 are not yet supported in classes" */
- REG_BADPAT, /* "character value in \x{...} sequence is too large" */
- REG_BADPAT, /* "invalid condition (?(0)" */
- REG_BADPAT, /* "\\C not allowed in lookbehind assertion" */
- REG_EESCAPE, /* "PCRE does not support \\L, \\l, \\N, \\P, \\p, \\U, \\u, or \\X" */
- REG_BADPAT, /* "number after (?C is > 255" */
- REG_BADPAT, /* "closing ) for (?C expected" */
- REG_BADPAT, /* "recursive call could loop indefinitely" */
- REG_BADPAT, /* "unrecognized character after (?P" */
- REG_BADPAT, /* "syntax error after (?P" */
- REG_BADPAT /* "two named groups have the same name" */
-};
-
-/* Table of texts corresponding to POSIX error codes. regerror() indexes this
-table directly with the error code, so the order must match the REG_ enum. */
-static const char *pstring[] = {
- "", /* Dummy for value 0 */
- "internal error", /* REG_ASSERT */
- "invalid repeat counts in {}", /* BADBR */
- "pattern error", /* BADPAT */
- "? * + invalid", /* BADRPT */
- "unbalanced {}", /* EBRACE */
- "unbalanced []", /* EBRACK */
- "collation error - not relevant", /* ECOLLATE */
- "bad class", /* ECTYPE */
- "bad escape sequence", /* EESCAPE */
- "empty expression", /* EMPTY */
- "unbalanced ()", /* EPAREN */
- "bad range inside []", /* ERANGE */
- "expression too big", /* ESIZE */
- "failed to get memory", /* ESPACE */
- "bad back reference", /* ESUBREG */
- "bad argument", /* INVARG */
- "match failed" /* NOMATCH */
-};
-
-
-
-
-/*************************************************
-* Translate PCRE text code to int *
-*************************************************/
-
-/* PCRE reports compile-time errors as message strings that are defined as
-macros. Search the message table for the text and hand back the POSIX code
-stored at the same position; unrecognized text yields REG_ASSERT. */
-static int
-pcre_posix_error_code(const char *s)
-{
-const size_t count = sizeof(estring)/sizeof(estring[0]);
-size_t idx = 0;
-while (idx < count && strcmp(s, estring[idx]) != 0) idx++;
-return (idx < count)? eint[idx] : REG_ASSERT;
-}
-
-
-
-/*************************************************
-* Translate error code to string *
-*************************************************/
-/* Fills errbuf with the text for errcode and returns the space it needs. */
-size_t
-regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size)
-{
-const char *message, *addmessage;
-size_t length, addlength;
-/* Codes outside the table, including negative ones, get a generic text. */
-message = (errcode < 0 || errcode >= (int)(sizeof(pstring)/sizeof(char *)))?
- "unknown error code" : pstring[errcode];
-length = strlen(message) + 1;
-/* An offset suffix is wanted only if preg is given and its offset is not -1. */
-addmessage = " at offset ";
-addlength = (preg != NULL && (int)preg->re_erroffset != -1)?
- strlen(addmessage) + 6 : 0;
-/* Use the long form only if it fits; otherwise truncate the bare message. */
-if (errbuf_size > 0)
- {
- if (addlength > 0 && errbuf_size >= length + addlength)
- sprintf(errbuf, "%s%s%-6d", message, addmessage, (int)preg->re_erroffset);
- else
- {
- strncpy(errbuf, message, errbuf_size - 1);
- errbuf[errbuf_size-1] = 0;
- }
- }
-
-return length + addlength;
-}
-
-
-
-
-/*************************************************
-* Free store held by a regex *
-*************************************************/
-/* Hands the compiled pattern stored in preg->re_pcre to pcre_free. */
-void
-regfree(regex_t *preg)
-{
-(*pcre_free)(preg->re_pcre);
-}
-
-
-
-
-/*************************************************
-* Compile a regular expression *
-*************************************************/
-
-/* Maps REG_ICASE and REG_NEWLINE to PCRE options and compiles the pattern.
-Arguments:
- preg structure that receives the compiled expression and error offset
- pattern the pattern to compile
- cflags POSIX compilation flags
-
-Returns: 0 on success, after storing the subpattern count in preg
- a POSIX error code derived from the PCRE message on failure
-*/
-
-int
-regcomp(regex_t *preg, const char *pattern, int cflags)
-{
-int erroffset;
-const char *errorptr;
-int options = ((cflags & REG_ICASE) != 0)? PCRE_CASELESS : 0;
-if ((cflags & REG_NEWLINE) != 0) options |= PCRE_MULTILINE;
-
-preg->re_pcre = pcre_compile(pattern, options, &errorptr, &erroffset, NULL);
-preg->re_erroffset = erroffset;
-
-if (preg->re_pcre != NULL)
- {
- preg->re_nsub = pcre_info(preg->re_pcre, NULL, NULL);
- return 0;
- }
-return pcre_posix_error_code(errorptr);
-}
-
-
-
-
-/*************************************************
-* Match a regular expression *
-*************************************************/
-
-/* Unfortunately, PCRE requires 3 ints of working space for each captured
-substring, so we have to get and release working store instead of just using
-the POSIX structures as was done in earlier releases when PCRE needed only 2
-ints. However, if the number of possible capturing brackets is small, use a
-block of store on the stack, to reduce the use of malloc/free. The threshold is
-in a macro that can be changed at configure time. */
-/* Returns 0 on a match, REG_NOMATCH on none, or another REG_ error code. */
-int
-regexec(regex_t *preg, const char *string, size_t nmatch,
- regmatch_t pmatch[], int eflags)
-{
-int rc;
-int options = 0;
-int *ovector = NULL;
-int small_ovector[POSIX_MALLOC_THRESHOLD * 3];
-BOOL allocated_ovector = FALSE;
-/* Translate the POSIX execution flags into PCRE options. */
-if ((eflags & REG_NOTBOL) != 0) options |= PCRE_NOTBOL;
-if ((eflags & REG_NOTEOL) != 0) options |= PCRE_NOTEOL;
-
-preg->re_erroffset = (size_t)(-1); /* Only has meaning after compile */
-/* Use the stack vector for small nmatch; otherwise allocate one. */
-if (nmatch > 0)
- {
- if (nmatch <= POSIX_MALLOC_THRESHOLD)
- {
- ovector = &(small_ovector[0]);
- }
- else
- {
- ovector = (int *)malloc(sizeof(int) * nmatch * 3);
- if (ovector == NULL) return REG_ESPACE;
- allocated_ovector = TRUE;
- }
- }
-/* With nmatch == 0 the offsets vector is NULL and its size is 0. */
-rc = pcre_exec(preg->re_pcre, NULL, string, (int)strlen(string), 0, options,
- ovector, nmatch * 3);
-
-if (rc == 0) rc = nmatch; /* All captured slots were filled in */
-/* Success: copy out the captured pairs and mark the rest as unset. */
-if (rc >= 0)
- {
- size_t i;
- for (i = 0; i < (size_t)rc; i++)
- {
- pmatch[i].rm_so = ovector[i*2];
- pmatch[i].rm_eo = ovector[i*2+1];
- }
- if (allocated_ovector) free(ovector);
- for (; i < nmatch; i++) pmatch[i].rm_so = pmatch[i].rm_eo = -1;
- return 0;
- }
-
-else
- {
- if (allocated_ovector) free(ovector);
- switch(rc)
- {
- case PCRE_ERROR_NOMATCH: return REG_NOMATCH;
- case PCRE_ERROR_NULL: return REG_INVARG;
- case PCRE_ERROR_BADOPTION: return REG_INVARG;
- case PCRE_ERROR_BADMAGIC: return REG_INVARG;
- case PCRE_ERROR_UNKNOWN_NODE: return REG_ASSERT;
- case PCRE_ERROR_NOMEMORY: return REG_ESPACE;
- default: return REG_ASSERT;
- }
- }
-}
-
-/* End of pcreposix.c */
diff --git a/ext/pcre/pcrelib/pcreposix.h b/ext/pcre/pcrelib/pcreposix.h
deleted file mode 100644
index e70af2de84..0000000000
--- a/ext/pcre/pcrelib/pcreposix.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/*************************************************
-* Perl-Compatible Regular Expressions *
-*************************************************/
-
-/* Copyright (c) 1997-2001 University of Cambridge */
-
-#ifndef _PCREPOSIX_H
-#define _PCREPOSIX_H
-
-/* This is the header for the POSIX wrapper interface to the PCRE Perl-
-Compatible Regular Expression library. It defines the things POSIX says should
-be there. I hope. */
-
-/* Have to include stdlib.h in order to ensure that size_t is defined. */
-
-#include <stdlib.h>
-
-/* Allow for C++ users */
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/* Options defined by POSIX. */
-
-#define REG_ICASE 0x01
-#define REG_NEWLINE 0x02
-#define REG_NOTBOL 0x04
-#define REG_NOTEOL 0x08
-
-/* These are not used by PCRE, but by defining them we make it easier
-to slot PCRE into existing programs that make POSIX calls. */
-
-#define REG_EXTENDED 0
-#define REG_NOSUB 0
-
-/* Error values. Not all these are relevant or used by the wrapper. */
-
-enum {
- REG_ASSERT = 1, /* internal error ? */
- REG_BADBR, /* invalid repeat counts in {} */
- REG_BADPAT, /* pattern error */
- REG_BADRPT, /* ? * + invalid */
- REG_EBRACE, /* unbalanced {} */
- REG_EBRACK, /* unbalanced [] */
- REG_ECOLLATE, /* collation error - not relevant */
- REG_ECTYPE, /* bad class */
- REG_EESCAPE, /* bad escape sequence */
- REG_EMPTY, /* empty expression */
- REG_EPAREN, /* unbalanced () */
- REG_ERANGE, /* bad range inside [] */
- REG_ESIZE, /* expression too big */
- REG_ESPACE, /* failed to get memory */
- REG_ESUBREG, /* bad back reference */
- REG_INVARG, /* bad argument */
- REG_NOMATCH /* match failed */
-};
-
-
-/* The structure representing a compiled regular expression. */
-
-typedef struct {
- void *re_pcre; /* result of pcre_compile() in regcomp(); NULL if that failed */
- size_t re_nsub; /* value returned by pcre_info() after a successful compile */
- size_t re_erroffset; /* error offset from the compile; regexec() resets it to -1 */
-} regex_t;
-
-/* The structure in which a captured offset is returned. */
-
-typedef int regoff_t;
-
-typedef struct {
- regoff_t rm_so;
- regoff_t rm_eo;
-} regmatch_t;
-
-/* The functions */
-
-extern int regcomp(regex_t *, const char *, int);
-extern int regexec(regex_t *, const char *, size_t, regmatch_t *, int);
-extern size_t regerror(int, const regex_t *, char *, size_t);
-extern void regfree(regex_t *);
-
-#ifdef __cplusplus
-} /* extern "C" */
-#endif
-
-#endif /* End of pcreposix.h */
diff --git a/ext/pcre/pcrelib/pcretest.c b/ext/pcre/pcrelib/pcretest.c
deleted file mode 100644
index 8977ef7f66..0000000000
--- a/ext/pcre/pcrelib/pcretest.c
+++ /dev/null
@@ -1,1274 +0,0 @@
-/*************************************************
-* PCRE testing program *
-*************************************************/
-
-/* This program was hacked up as a tester for PCRE. I really should have
-written it more tidily in the first place. Will I ever learn? It has grown and
-been extended and consequently is now rather untidy in places. */
-
-#include <ctype.h>
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-#include <time.h>
-#include <locale.h>
-
-/* We need the internal info for displaying the results of pcre_study(). Also
-for getting the opcodes for showing compiled code. */
-
-#include "internal.h"
-
-/* It is possible to compile this test program without including support for
-testing the POSIX interface, though this is not available via the standard
-Makefile. */
-
-#if !defined NOPOSIX
-#include "pcreposix.h"
-#endif
-
-#ifndef CLOCKS_PER_SEC
-#ifdef CLK_TCK
-#define CLOCKS_PER_SEC CLK_TCK
-#else
-#define CLOCKS_PER_SEC 100
-#endif
-#endif
-
-#define LOOPREPEAT 50000
-
-
-static FILE *outfile;
-static int log_store = 0;
-static int callout_count;
-static int callout_extra;
-static int callout_fail_count;
-static int callout_fail_id;
-static int first_callout;
-static int utf8;
-static size_t gotten_store;
-
-
-
-static int utf8_table1[] = { /* largest value for a sequence of 1 ... 6 bytes */
- 0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff};
-
-static int utf8_table2[] = { /* marker bits for the first byte of 1 ... 6 bytes */
- 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
-
-static int utf8_table3[] = { /* data-bit mask of a first byte with 0 ... 5 more */
- 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
-
-
-
-/*************************************************
-* Print compiled regex *
-*************************************************/
-
-/* The code for doing this is held in a separate file that is also included in
-pcre.c when it is compiled with the debug switch. It defines a function called
-print_internals(), which uses a table of opcode lengths defined by the macro
-OP_LENGTHS, whose name must be OP_lengths. */
-
-static uschar OP_lengths[] = { OP_LENGTHS };
-
-#include "printint.c"
-
-
-
-/*************************************************
-* Read number from string *
-*************************************************/
-
-/* strtoul() is avoided because SunOS4 lacks it, and doing the conversion by
-hand is simpler than conditional compilation. The only caller is the handling
-of the -o argument, so leading white space and decimal digits are all that is
-understood; there is no overflow check.
-
-Arguments:
- str string to be converted
- endptr receives a pointer to the first character that was not consumed
-
-Returns: the converted value as an int (0 if there are no digits) */
-static int
-get_value(unsigned char *str, unsigned char **endptr)
-{
-unsigned char *cursor = str;
-int total = 0;
-for (; *cursor != 0 && isspace(*cursor); cursor++) continue;
-for (; isdigit(*cursor); cursor++) total = total * 10 + (int)(*cursor - '0');
-*endptr = cursor;
-return total;
-}
-
-
-
-/*************************************************
-* Convert character value to UTF-8 *
-*************************************************/
-
-/* Encodes an integer in the range 0 - 0x7fffffff as a UTF-8 character of
-1 to 6 bytes.
-
-Arguments:
- cvalue the character value
- buffer pointer to buffer for result - at least 6 bytes long
-
-Returns: number of bytes placed in the buffer
- -1 if input character is negative
- 0 if input character is positive but too big (only when
- int is longer than 32 bits) */
-static int
-ord2utf8(int cvalue, unsigned char *buffer)
-{
-const int table_size = (int)(sizeof(utf8_table1)/sizeof(int));
-int extra = 0; /* continuation bytes needed */
-unsigned char *out;
-
-while (extra < table_size && cvalue > utf8_table1[extra]) extra++;
-if (extra >= table_size) return 0;
-if (cvalue < 0) return -1;
-
-/* Fill the continuation bytes from the last one back towards the first. */
-for (out = buffer + extra; out > buffer; out--)
- {
- *out = 0x80 | (cvalue & 0x3f);
- cvalue >>= 6;
- }
-*buffer = utf8_table2[extra] | cvalue;
-return extra + 1;
-}
-
-
-/*************************************************
-* Convert UTF-8 string to value *
-*************************************************/
-
-/* This function takes one or more bytes that represents a UTF-8 character,
-and returns the value of the character.
-
-Argument:
- buffer a pointer to the byte vector
- vptr a pointer to an int to receive the value
-
-Returns: > 0 => the number of bytes consumed
- -6 to 0 => malformed UTF-8 character at offset = (-return)
-*/
-
-int
-utf82ord(unsigned char *buffer, int *vptr)
-{
-int c = *buffer++;
-int d = c;
-int i, j, s;
-/* Count the leading 1 bits of the first byte by shifting them out. */
-for (i = -1; i < 6; i++) /* i is number of additional bytes */
- {
- if ((d & 0x80) == 0) break;
- d <<= 1;
- }
-
-if (i == -1) { *vptr = c; return 1; } /* ascii character */
-if (i == 0 || i == 6) return 0; /* invalid UTF-8 */
-
-/* i now has a value in the range 1-5 */
-/* s is the shift that puts the first byte's data bits above all the others. */
-s = 6*i;
-d = (c & utf8_table3[i]) << s;
-/* Each following byte must look like 10xxxxxx and supplies 6 more bits. */
-for (j = 0; j < i; j++)
- {
- c = *buffer++;
- if ((c & 0xc0) != 0x80) return -(j+1);
- s -= 6;
- d |= (c & 0x3f) << s;
- }
-
-/* Check that encoding was the correct unique one */
-
-for (j = 0; j < sizeof(utf8_table1)/sizeof(int); j++)
- if (d <= utf8_table1[j]) break;
-if (j != i) return -(i+1);
-
-/* Valid value */
-
-*vptr = d;
-return i+1;
-}
-
-
-
-/*************************************************
-* Print character string *
-*************************************************/
-
-/* Character string printing function. Must handle UTF-8 strings in utf8
-mode. Yields number of characters printed. If handed a NULL file, just counts
-chars without printing; the width of an escape is therefore computed without
-relying on output having been written. */
-static int pchars(unsigned char *p, int length, FILE *f)
-{
-int c;
-int yield = 0;
-
-while (length-- > 0)
- {
- if (utf8)
- {
- int rc = utf82ord(p, &c);
- if (rc > 0 && rc <= length + 1) /* Mustn't run over the end */
- {
- length -= rc - 1;
- p += rc;
- if (c < 256 && isprint(c))
- {
- if (f != NULL) fprintf(f, "%c", c);
- yield++;
- }
- else
- {
- char hex[16]; /* room for "\x{7fffffff}" and the terminator */
- int n = sprintf(hex, "\\x{%02x}", c);
- if (f != NULL) fprintf(f, "%s", hex);
- yield += n;
- }
- continue;
- }
- }
-
- /* Not UTF-8, or malformed UTF-8 */
-
- if (isprint(c = *(p++)))
- {
- if (f != NULL) fprintf(f, "%c", c);
- yield++;
- }
- else
- {
- if (f != NULL) fprintf(f, "\\x%02x", c);
- yield += 4;
- }
- }
-
-return yield;
-}
-
-
-
-/*************************************************
-* Callout function *
-*************************************************/
-
-/* Called from PCRE as a result of the (?C) item. We print out where we are in
-the match. Yield OK unless more callouts than the fail count. */
-
-static int callout(pcre_callout_block *cb)
-{
-FILE *f = (first_callout | callout_extra)? outfile : NULL;
-int i, pre_start, post_start;
-/* With callout_extra set, first list the capture slots below capture_top. */
-if (callout_extra)
- {
- int i;
- fprintf(f, "Callout %d: last capture = %d\n",
- cb->callout_number, cb->capture_last);
-
- for (i = 0; i < cb->capture_top * 2; i += 2)
- {
- if (cb->offset_vector[i] < 0)
- fprintf(f, "%2d: <unset>\n", i/2);
- else
- {
- fprintf(f, "%2d: ", i/2);
- (void)pchars((unsigned char *)cb->subject + cb->offset_vector[i],
- cb->offset_vector[i+1] - cb->offset_vector[i], f);
- fprintf(f, "\n");
- }
- }
- }
-
-/* Re-print the subject in canonical form, the first time or if giving full
-details. On subsequent calls in the same match, we use pchars just to find the
-printed lengths of the substrings. */
-
-if (f != NULL) fprintf(f, "--->");
-
-pre_start = pchars((unsigned char *)cb->subject, cb->start_match, f);
-post_start = pchars((unsigned char *)(cb->subject + cb->start_match),
- cb->current_position - cb->start_match, f);
-
-(void)pchars((unsigned char *)(cb->subject + cb->current_position),
- cb->subject_length - cb->current_position, f);
-
-if (f != NULL) fprintf(f, "\n");
-
-/* Always print appropriate indicators, with callout number if not already
-shown */
-
-if (callout_extra) fprintf(outfile, " ");
- else fprintf(outfile, "%3d ", cb->callout_number);
-/* Mark the start of the match, and the current position if it is further on. */
-for (i = 0; i < pre_start; i++) fprintf(outfile, " ");
-fprintf(outfile, "^");
-
-if (post_start > 0)
- {
- for (i = 0; i < post_start - 1; i++) fprintf(outfile, " ");
- fprintf(outfile, "^");
- }
-
-fprintf(outfile, "\n");
-
-first_callout = 0;
-/* Yield 1 once the callout numbered callout_fail_id has been hit enough times. */
-return (cb->callout_number != callout_fail_id)? 0 :
- (++callout_count >= callout_fail_count)? 1 : 0;
-}
-
-
-/*************************************************
-* Local malloc function *
-*************************************************/
-
-/* Replacement for malloc that notes the size of the latest request, so that
-the amount of store obtained for a compiled re can be shown. */
-static void *new_malloc(size_t size)
-{
-void *block = malloc(size);
-gotten_store = size;
-return block;
-}
-
-
-
-/*************************************************
-* Call pcre_fullinfo() *
-*************************************************/
-
-/* Fetch one item of information via pcre_fullinfo(), writing a message to
-outfile if the call yields a negative result. */
-static void new_info(pcre *re, pcre_extra *study, int option, void *ptr)
-{
-int rc = pcre_fullinfo(re, study, option, ptr);
-if (rc >= 0) return;
-fprintf(outfile, "Error %d from pcre_fullinfo(%d)\n", rc, option);
-}
-
-
-
-/*************************************************
-* Main Program *
-*************************************************/
-
-/* Read lines from named file or stdin and write to named file or stdout; lines
-consist of a regular expression, in delimiters and optionally followed by
-options, followed by a set of test data, terminated by an empty line. */
-
-int main(int argc, char **argv)
-{
-FILE *infile = stdin;
-int options = 0;
-int study_options = 0;
-int op = 1;
-int timeit = 0;
-int showinfo = 0;
-int showstore = 0;
-int size_offsets = 45;
-int size_offsets_max;
-int *offsets;
-#if !defined NOPOSIX
-int posix = 0;
-#endif
-int debug = 0;
-int done = 0;
-unsigned char buffer[30000];
-unsigned char dbuffer[1024];
-
-/* Static so that new_malloc can use it. */
-
-outfile = stdout;
-
-/* Scan options */
-
-while (argc > 1 && argv[op][0] == '-')
- {
- unsigned char *endptr;
-
- if (strcmp(argv[op], "-s") == 0 || strcmp(argv[op], "-m") == 0)
- showstore = 1;
- else if (strcmp(argv[op], "-t") == 0) timeit = 1;
- else if (strcmp(argv[op], "-i") == 0) showinfo = 1;
- else if (strcmp(argv[op], "-d") == 0) showinfo = debug = 1;
- else if (strcmp(argv[op], "-o") == 0 && argc > 2 &&
- ((size_offsets = get_value(argv[op+1], &endptr)), *endptr == 0))
- {
- op++;
- argc--;
- }
-#if !defined NOPOSIX
- else if (strcmp(argv[op], "-p") == 0) posix = 1;
-#endif
- else
- {
- printf("** Unknown or malformed option %s\n", argv[op]);
- printf("Usage: pcretest [-d] [-i] [-o <n>] [-p] [-s] [-t] [<input> [<output>]]\n");
- printf(" -d debug: show compiled code; implies -i\n"
- " -i show information about compiled pattern\n"
- " -o <n> set size of offsets vector to <n>\n");
-#if !defined NOPOSIX
- printf(" -p use POSIX interface\n");
-#endif
- printf(" -s output store information\n"
- " -t time compilation and execution\n");
- return 1;
- }
- op++;
- argc--;
- }
-
-/* Get the store for the offsets vector, and remember what it was */
-
-size_offsets_max = size_offsets;
-offsets = malloc(size_offsets_max * sizeof(int));
-if (offsets == NULL)
- {
- printf("** Failed to get %d bytes of memory for offsets vector\n",
- size_offsets_max * sizeof(int));
- return 1;
- }
-
-/* Sort out the input and output files */
-
-if (argc > 1)
- {
- infile = fopen(argv[op], "r");
- if (infile == NULL)
- {
- printf("** Failed to open %s\n", argv[op]);
- return 1;
- }
- }
-
-if (argc > 2)
- {
- outfile = fopen(argv[op+1], "w");
- if (outfile == NULL)
- {
- printf("** Failed to open %s\n", argv[op+1]);
- return 1;
- }
- }
-
-/* Set alternative malloc function */
-
-pcre_malloc = new_malloc;
-
-/* Heading line, then prompt for first regex if stdin */
-
-fprintf(outfile, "PCRE version %s\n\n", pcre_version());
-
-/* Main loop */
-
-while (!done)
- {
- pcre *re = NULL;
- pcre_extra *extra = NULL;
-
-#if !defined NOPOSIX /* There are still compilers that require no indent */
- regex_t preg;
- int do_posix = 0;
-#endif
-
- const char *error;
- unsigned char *p, *pp, *ppp;
- const unsigned char *tables = NULL;
- int do_study = 0;
- int do_debug = debug;
- int do_G = 0;
- int do_g = 0;
- int do_showinfo = showinfo;
- int do_showrest = 0;
- int erroroffset, len, delimiter;
-
- utf8 = 0;
-
- if (infile == stdin) printf(" re> ");
- if (fgets((char *)buffer, sizeof(buffer), infile) == NULL) break;
- if (infile != stdin) fprintf(outfile, "%s", (char *)buffer);
- fflush(outfile);
-
- p = buffer;
- while (isspace(*p)) p++;
- if (*p == 0) continue;
-
- /* Get the delimiter and seek the end of the pattern; if it isn't
- complete, read more. */
-
- delimiter = *p++;
-
- if (isalnum(delimiter) || delimiter == '\\')
- {
- fprintf(outfile, "** Delimiter must not be alphameric or \\\n");
- goto SKIP_DATA;
- }
-
- pp = p;
-
- for(;;)
- {
- while (*pp != 0)
- {
- if (*pp == '\\' && pp[1] != 0) pp++;
- else if (*pp == delimiter) break;
- pp++;
- }
- if (*pp != 0) break;
-
- len = sizeof(buffer) - (pp - buffer);
- if (len < 256)
- {
- fprintf(outfile, "** Expression too long - missing delimiter?\n");
- goto SKIP_DATA;
- }
-
- if (infile == stdin) printf(" > ");
- if (fgets((char *)pp, len, infile) == NULL)
- {
- fprintf(outfile, "** Unexpected EOF\n");
- done = 1;
- goto CONTINUE;
- }
- if (infile != stdin) fprintf(outfile, "%s", (char *)pp);
- }
-
- /* If the first character after the delimiter is backslash, make
- the pattern end with backslash. This is purely to provide a way
- of testing for the error message when a pattern ends with backslash. */
-
- if (pp[1] == '\\') *pp++ = '\\';
-
- /* Terminate the pattern at the delimiter */
-
- *pp++ = 0;
-
- /* Look for options after final delimiter */
-
- options = 0;
- study_options = 0;
- log_store = showstore; /* default from command line */
-
- while (*pp != 0)
- {
- switch (*pp++)
- {
- case 'g': do_g = 1; break;
- case 'i': options |= PCRE_CASELESS; break;
- case 'm': options |= PCRE_MULTILINE; break;
- case 's': options |= PCRE_DOTALL; break;
- case 'x': options |= PCRE_EXTENDED; break;
-
- case '+': do_showrest = 1; break;
- case 'A': options |= PCRE_ANCHORED; break;
- case 'D': do_debug = do_showinfo = 1; break;
- case 'E': options |= PCRE_DOLLAR_ENDONLY; break;
- case 'G': do_G = 1; break;
- case 'I': do_showinfo = 1; break;
- case 'M': log_store = 1; break;
-
-#if !defined NOPOSIX
- case 'P': do_posix = 1; break;
-#endif
-
- case 'S': do_study = 1; break;
- case 'U': options |= PCRE_UNGREEDY; break;
- case 'X': options |= PCRE_EXTRA; break;
- case '8': options |= PCRE_UTF8; utf8 = 1; break;
-
- case 'L':
- ppp = pp;
- while (*ppp != '\n' && *ppp != ' ') ppp++;
- *ppp = 0;
- if (setlocale(LC_CTYPE, (const char *)pp) == NULL)
- {
- fprintf(outfile, "** Failed to set locale \"%s\"\n", pp);
- goto SKIP_DATA;
- }
- tables = pcre_maketables();
- pp = ppp;
- break;
-
- case '\n': case ' ': break;
- default:
- fprintf(outfile, "** Unknown option '%c'\n", pp[-1]);
- goto SKIP_DATA;
- }
- }
-
- /* Handle compiling via the POSIX interface, which doesn't support the
- timing, showing, or debugging options, nor the ability to pass over
- local character tables. */
-
-#if !defined NOPOSIX
- if (posix || do_posix)
- {
- int rc;
- int cflags = 0;
- if ((options & PCRE_CASELESS) != 0) cflags |= REG_ICASE;
- if ((options & PCRE_MULTILINE) != 0) cflags |= REG_NEWLINE;
- rc = regcomp(&preg, (char *)p, cflags);
-
- /* Compilation failed; go back for another re, skipping to blank line
- if non-interactive. */
-
- if (rc != 0)
- {
- (void)regerror(rc, &preg, (char *)buffer, sizeof(buffer));
- fprintf(outfile, "Failed: POSIX code %d: %s\n", rc, buffer);
- goto SKIP_DATA;
- }
- }
-
- /* Handle compiling via the native interface */
-
- else
-#endif /* !defined NOPOSIX */
-
- {
- if (timeit)
- {
- register int i;
- clock_t time_taken;
- clock_t start_time = clock();
- for (i = 0; i < LOOPREPEAT; i++)
- {
- re = pcre_compile((char *)p, options, &error, &erroroffset, tables);
- if (re != NULL) free(re);
- }
- time_taken = clock() - start_time;
- fprintf(outfile, "Compile time %.3f milliseconds\n",
- (((double)time_taken * 1000.0) / (double)LOOPREPEAT) /
- (double)CLOCKS_PER_SEC);
- }
-
- re = pcre_compile((char *)p, options, &error, &erroroffset, tables);
-
- /* Compilation failed; go back for another re, skipping to blank line
- if non-interactive. */
-
- if (re == NULL)
- {
- fprintf(outfile, "Failed: %s at offset %d\n", error, erroroffset);
- SKIP_DATA:
- if (infile != stdin)
- {
- for (;;)
- {
- if (fgets((char *)buffer, sizeof(buffer), infile) == NULL)
- {
- done = 1;
- goto CONTINUE;
- }
- len = (int)strlen((char *)buffer);
- while (len > 0 && isspace(buffer[len-1])) len--;
- if (len == 0) break;
- }
- fprintf(outfile, "\n");
- }
- goto CONTINUE;
- }
-
- /* Compilation succeeded; print data if required. There are now two
- info-returning functions. The old one has a limited interface and
- returns only limited data. Check that it agrees with the newer one. */
-
- if (log_store)
- fprintf(outfile, "Memory allocation (code space): %d\n",
- (int)(gotten_store -
- sizeof(real_pcre) -
- ((real_pcre *)re)->name_count * ((real_pcre *)re)->name_entry_size));
-
- if (do_showinfo)
- {
- unsigned long int get_options;
- int old_first_char, old_options, old_count;
- int count, backrefmax, first_char, need_char;
- int nameentrysize, namecount;
- const uschar *nametable;
- size_t size;
-
- if (do_debug)
- {
- fprintf(outfile, "------------------------------------------------------------------\n");
- print_internals(re, outfile);
- }
-
- new_info(re, NULL, PCRE_INFO_OPTIONS, &get_options);
- new_info(re, NULL, PCRE_INFO_SIZE, &size);
- new_info(re, NULL, PCRE_INFO_CAPTURECOUNT, &count);
- new_info(re, NULL, PCRE_INFO_BACKREFMAX, &backrefmax);
- new_info(re, NULL, PCRE_INFO_FIRSTCHAR, &first_char);
- new_info(re, NULL, PCRE_INFO_LASTLITERAL, &need_char);
- new_info(re, NULL, PCRE_INFO_NAMEENTRYSIZE, &nameentrysize);
- new_info(re, NULL, PCRE_INFO_NAMECOUNT, &namecount);
- new_info(re, NULL, PCRE_INFO_NAMETABLE, &nametable);
-
- old_count = pcre_info(re, &old_options, &old_first_char);
- if (count < 0) fprintf(outfile,
- "Error %d from pcre_info()\n", count);
- else
- {
- if (old_count != count) fprintf(outfile,
- "Count disagreement: pcre_fullinfo=%d pcre_info=%d\n", count,
- old_count);
-
- if (old_first_char != first_char) fprintf(outfile,
- "First char disagreement: pcre_fullinfo=%d pcre_info=%d\n",
- first_char, old_first_char);
-
- if (old_options != (int)get_options) fprintf(outfile,
- "Options disagreement: pcre_fullinfo=%ld pcre_info=%d\n",
- get_options, old_options);
- }
-
- if (size != gotten_store) fprintf(outfile,
- "Size disagreement: pcre_fullinfo=%d call to malloc for %d\n",
- size, gotten_store);
-
- fprintf(outfile, "Capturing subpattern count = %d\n", count);
- if (backrefmax > 0)
- fprintf(outfile, "Max back reference = %d\n", backrefmax);
-
- if (namecount > 0)
- {
- fprintf(outfile, "Named capturing subpatterns:\n");
- while (namecount-- > 0)
- {
- fprintf(outfile, " %s %*s%3d\n", nametable + 2,
- nameentrysize - 3 - (int)strlen((char *)nametable + 2), "",
- GET2(nametable, 0));
- nametable += nameentrysize;
- }
- }
-
- if (get_options == 0) fprintf(outfile, "No options\n");
- else fprintf(outfile, "Options:%s%s%s%s%s%s%s%s%s\n",
- ((get_options & PCRE_ANCHORED) != 0)? " anchored" : "",
- ((get_options & PCRE_CASELESS) != 0)? " caseless" : "",
- ((get_options & PCRE_EXTENDED) != 0)? " extended" : "",
- ((get_options & PCRE_MULTILINE) != 0)? " multiline" : "",
- ((get_options & PCRE_DOTALL) != 0)? " dotall" : "",
- ((get_options & PCRE_DOLLAR_ENDONLY) != 0)? " dollar_endonly" : "",
- ((get_options & PCRE_EXTRA) != 0)? " extra" : "",
- ((get_options & PCRE_UNGREEDY) != 0)? " ungreedy" : "",
- ((get_options & PCRE_UTF8) != 0)? " utf8" : "");
-
- if (((((real_pcre *)re)->options) & PCRE_ICHANGED) != 0)
- fprintf(outfile, "Case state changes\n");
-
- if (first_char == -1)
- {
- fprintf(outfile, "First char at start or follows \\n\n");
- }
- else if (first_char < 0)
- {
- fprintf(outfile, "No first char\n");
- }
- else
- {
- int ch = first_char & 255;
- char *caseless = ((first_char & REQ_CASELESS) == 0)?
- "" : " (caseless)";
- if (isprint(ch))
- fprintf(outfile, "First char = \'%c\'%s\n", ch, caseless);
- else
- fprintf(outfile, "First char = %d%s\n", ch, caseless);
- }
-
- if (need_char < 0)
- {
- fprintf(outfile, "No need char\n");
- }
- else
- {
- int ch = need_char & 255;
- char *caseless = ((need_char & REQ_CASELESS) == 0)?
- "" : " (caseless)";
- if (isprint(need_char))
- fprintf(outfile, "Need char = \'%c\'%s\n", ch, caseless);
- else
- fprintf(outfile, "Need char = %d%s\n", ch, caseless);
- }
- }
-
- /* If /S was present, study the regexp to generate additional info to
- help with the matching. */
-
- if (do_study)
- {
- if (timeit)
- {
- register int i;
- clock_t time_taken;
- clock_t start_time = clock();
- for (i = 0; i < LOOPREPEAT; i++)
- extra = pcre_study(re, study_options, &error);
- time_taken = clock() - start_time;
- if (extra != NULL) free(extra);
- fprintf(outfile, " Study time %.3f milliseconds\n",
- (((double)time_taken * 1000.0) / (double)LOOPREPEAT) /
- (double)CLOCKS_PER_SEC);
- }
-
- extra = pcre_study(re, study_options, &error);
- if (error != NULL)
- fprintf(outfile, "Failed to study: %s\n", error);
- else if (extra == NULL)
- fprintf(outfile, "Study returned NULL\n");
-
- else if (do_showinfo)
- {
- uschar *start_bits = NULL;
- new_info(re, extra, PCRE_INFO_FIRSTTABLE, &start_bits);
- if (start_bits == NULL)
- fprintf(outfile, "No starting character set\n");
- else
- {
- int i;
- int c = 24;
- fprintf(outfile, "Starting character set: ");
- for (i = 0; i < 256; i++)
- {
- if ((start_bits[i/8] & (1<<(i%8))) != 0)
- {
- if (c > 75)
- {
- fprintf(outfile, "\n ");
- c = 2;
- }
- if (isprint(i) && i != ' ')
- {
- fprintf(outfile, "%c ", i);
- c += 2;
- }
- else
- {
- fprintf(outfile, "\\x%02x ", i);
- c += 5;
- }
- }
- }
- fprintf(outfile, "\n");
- }
- }
- }
- }
-
- /* Read data lines and test them */
-
- for (;;)
- {
- unsigned char *q;
- unsigned char *bptr = dbuffer;
- int *use_offsets = offsets;
- int use_size_offsets = size_offsets;
- int count, c;
- int copystrings = 0;
- int getstrings = 0;
- int getlist = 0;
- int gmatched = 0;
- int start_offset = 0;
- int g_notempty = 0;
-
- options = 0;
-
- pcre_callout = callout;
- first_callout = 1;
- callout_extra = 0;
- callout_count = 0;
- callout_fail_count = 999999;
- callout_fail_id = -1;
-
- if (infile == stdin) printf("data> ");
- if (fgets((char *)buffer, sizeof(buffer), infile) == NULL)
- {
- done = 1;
- goto CONTINUE;
- }
- if (infile != stdin) fprintf(outfile, "%s", (char *)buffer);
-
- len = (int)strlen((char *)buffer);
- while (len > 0 && isspace(buffer[len-1])) len--;
- buffer[len] = 0;
- if (len == 0) break;
-
- p = buffer;
- while (isspace(*p)) p++;
-
- q = dbuffer;
- while ((c = *p++) != 0)
- {
- int i = 0;
- int n = 0;
-
- if (c == '\\') switch ((c = *p++))
- {
- case 'a': c = 7; break;
- case 'b': c = '\b'; break;
- case 'e': c = 27; break;
- case 'f': c = '\f'; break;
- case 'n': c = '\n'; break;
- case 'r': c = '\r'; break;
- case 't': c = '\t'; break;
- case 'v': c = '\v'; break;
-
- case '0': case '1': case '2': case '3':
- case '4': case '5': case '6': case '7':
- c -= '0';
- while (i++ < 2 && isdigit(*p) && *p != '8' && *p != '9')
- c = c * 8 + *p++ - '0';
- break;
-
- case 'x':
-
- /* Handle \x{..} specially - new Perl thing for utf8 */
-
- if (*p == '{')
- {
- unsigned char *pt = p;
- c = 0;
- while (isxdigit(*(++pt)))
- c = c * 16 + tolower(*pt) - ((isdigit(*pt))? '0' : 'W');
- if (*pt == '}')
- {
- unsigned char buffer[8];
- int ii, utn;
- utn = ord2utf8(c, buffer);
- for (ii = 0; ii < utn - 1; ii++) *q++ = buffer[ii];
- c = buffer[ii]; /* Last byte */
- p = pt + 1;
- break;
- }
- /* Not correct form; fall through */
- }
-
- /* Ordinary \x */
-
- c = 0;
- while (i++ < 2 && isxdigit(*p))
- {
- c = c * 16 + tolower(*p) - ((isdigit(*p))? '0' : 'W');
- p++;
- }
- break;
-
- case 0: /* Allows for an empty line */
- p--;
- continue;
-
- case 'A': /* Option setting */
- options |= PCRE_ANCHORED;
- continue;
-
- case 'B':
- options |= PCRE_NOTBOL;
- continue;
-
- case 'C':
- if (isdigit(*p)) /* Set copy string */
- {
- while(isdigit(*p)) n = n * 10 + *p++ - '0';
- copystrings |= 1 << n;
- }
- else if (*p == '+')
- {
- callout_extra = 1;
- p++;
- }
- else if (*p == '-')
- {
- pcre_callout = NULL;
- p++;
- }
- else if (*p == '!')
- {
- callout_fail_id = 0;
- p++;
- while(isdigit(*p))
- callout_fail_id = callout_fail_id * 10 + *p++ - '0';
- callout_fail_count = 0;
- if (*p == '!')
- {
- p++;
- while(isdigit(*p))
- callout_fail_count = callout_fail_count * 10 + *p++ - '0';
- }
- }
- continue;
-
- case 'G':
- while(isdigit(*p)) n = n * 10 + *p++ - '0';
- getstrings |= 1 << n;
- continue;
-
- case 'L':
- getlist = 1;
- continue;
-
- case 'N':
- options |= PCRE_NOTEMPTY;
- continue;
-
- case 'O':
- while(isdigit(*p)) n = n * 10 + *p++ - '0';
- if (n > size_offsets_max)
- {
- size_offsets_max = n;
- free(offsets);
- use_offsets = offsets = malloc(size_offsets_max * sizeof(int));
- if (offsets == NULL)
- {
- printf("** Failed to get %d bytes of memory for offsets vector\n",
- size_offsets_max * sizeof(int));
- return 1;
- }
- }
- use_size_offsets = n;
- if (n == 0) use_offsets = NULL; /* Ensures it can't write to it */
- continue;
-
- case 'Z':
- options |= PCRE_NOTEOL;
- continue;
- }
- *q++ = c;
- }
- *q = 0;
- len = q - dbuffer;
-
- /* Handle matching via the POSIX interface, which does not
- support timing. */
-
-#if !defined NOPOSIX
- if (posix || do_posix)
- {
- int rc;
- int eflags = 0;
- regmatch_t *pmatch = malloc(sizeof(regmatch_t) * use_size_offsets);
- if ((options & PCRE_NOTBOL) != 0) eflags |= REG_NOTBOL;
- if ((options & PCRE_NOTEOL) != 0) eflags |= REG_NOTEOL;
-
- rc = regexec(&preg, (const char *)bptr, use_size_offsets, pmatch, eflags);
-
- if (rc != 0)
- {
- (void)regerror(rc, &preg, (char *)buffer, sizeof(buffer));
- fprintf(outfile, "No match: POSIX code %d: %s\n", rc, buffer);
- }
- else
- {
- size_t i;
- for (i = 0; i < (size_t)use_size_offsets; i++)
- {
- if (pmatch[i].rm_so >= 0)
- {
- fprintf(outfile, "%2d: ", (int)i);
- (void)pchars(dbuffer + pmatch[i].rm_so,
- pmatch[i].rm_eo - pmatch[i].rm_so, outfile);
- fprintf(outfile, "\n");
- if (i == 0 && do_showrest)
- {
- fprintf(outfile, " 0+ ");
- (void)pchars(dbuffer + pmatch[i].rm_eo, len - pmatch[i].rm_eo,
- outfile);
- fprintf(outfile, "\n");
- }
- }
- }
- }
- free(pmatch);
- }
-
- /* Handle matching via the native interface - repeats for /g and /G */
-
- else
-#endif /* !defined NOPOSIX */
-
- for (;; gmatched++) /* Loop for /g or /G */
- {
- if (timeit)
- {
- register int i;
- clock_t time_taken;
- clock_t start_time = clock();
- for (i = 0; i < LOOPREPEAT; i++)
- count = pcre_exec(re, extra, (char *)bptr, len,
- start_offset, options | g_notempty, use_offsets, use_size_offsets);
- time_taken = clock() - start_time;
- fprintf(outfile, "Execute time %.3f milliseconds\n",
- (((double)time_taken * 1000.0) / (double)LOOPREPEAT) /
- (double)CLOCKS_PER_SEC);
- }
-
- count = pcre_exec(re, extra, (char *)bptr, len,
- start_offset, options | g_notempty, use_offsets, use_size_offsets);
-
- if (count == 0)
- {
- fprintf(outfile, "Matched, but too many substrings\n");
- count = use_size_offsets/3;
- }
-
- /* Matched */
-
- if (count >= 0)
- {
- int i;
- for (i = 0; i < count * 2; i += 2)
- {
- if (use_offsets[i] < 0)
- fprintf(outfile, "%2d: <unset>\n", i/2);
- else
- {
- fprintf(outfile, "%2d: ", i/2);
- (void)pchars(bptr + use_offsets[i],
- use_offsets[i+1] - use_offsets[i], outfile);
- fprintf(outfile, "\n");
- if (i == 0)
- {
- if (do_showrest)
- {
- fprintf(outfile, " 0+ ");
- (void)pchars(bptr + use_offsets[i+1], len - use_offsets[i+1],
- outfile);
- fprintf(outfile, "\n");
- }
- }
- }
- }
-
- for (i = 0; i < 32; i++)
- {
- if ((copystrings & (1 << i)) != 0)
- {
- char copybuffer[16];
- int rc = pcre_copy_substring((char *)bptr, use_offsets, count,
- i, copybuffer, sizeof(copybuffer));
- if (rc < 0)
- fprintf(outfile, "copy substring %d failed %d\n", i, rc);
- else
- fprintf(outfile, "%2dC %s (%d)\n", i, copybuffer, rc);
- }
- }
-
- for (i = 0; i < 32; i++)
- {
- if ((getstrings & (1 << i)) != 0)
- {
- const char *substring;
- int rc = pcre_get_substring((char *)bptr, use_offsets, count,
- i, &substring);
- if (rc < 0)
- fprintf(outfile, "get substring %d failed %d\n", i, rc);
- else
- {
- fprintf(outfile, "%2dG %s (%d)\n", i, substring, rc);
- /* free((void *)substring); */
- pcre_free_substring(substring);
- }
- }
- }
-
- if (getlist)
- {
- const char **stringlist;
- int rc = pcre_get_substring_list((char *)bptr, use_offsets, count,
- &stringlist);
- if (rc < 0)
- fprintf(outfile, "get substring list failed %d\n", rc);
- else
- {
- for (i = 0; i < count; i++)
- fprintf(outfile, "%2dL %s\n", i, stringlist[i]);
- if (stringlist[i] != NULL)
- fprintf(outfile, "string list not terminated by NULL\n");
- /* free((void *)stringlist); */
- pcre_free_substring_list(stringlist);
- }
- }
- }
-
- /* Failed to match. If this is a /g or /G loop and we previously set
- g_notempty after a null match, this is not necessarily the end.
- We want to advance the start offset, and continue. Fudge the offset
- values to achieve this. We won't be at the end of the string - that
- was checked before setting g_notempty. */
-
- else
- {
- if (g_notempty != 0)
- {
- use_offsets[0] = start_offset;
- use_offsets[1] = start_offset + 1;
- }
- else
- {
- if (gmatched == 0) /* Error if no previous matches */
- {
- if (count == -1) fprintf(outfile, "No match\n");
- else fprintf(outfile, "Error %d\n", count);
- }
- break; /* Out of the /g loop */
- }
- }
-
- /* If not /g or /G we are done */
-
- if (!do_g && !do_G) break;
-
- /* If we have matched an empty string, first check to see if we are at
- the end of the subject. If so, the /g loop is over. Otherwise, mimic
- what Perl's /g options does. This turns out to be rather cunning. First
- we set PCRE_NOTEMPTY and PCRE_ANCHORED and try the match again at the
- same point. If this fails (picked up above) we advance to the next
- character. */
-
- g_notempty = 0;
- if (use_offsets[0] == use_offsets[1])
- {
- if (use_offsets[0] == len) break;
- g_notempty = PCRE_NOTEMPTY | PCRE_ANCHORED;
- }
-
- /* For /g, update the start offset, leaving the rest alone */
-
- if (do_g) start_offset = use_offsets[1];
-
- /* For /G, update the pointer and length */
-
- else
- {
- bptr += use_offsets[1];
- len -= use_offsets[1];
- }
- } /* End of loop for /g and /G */
- } /* End of loop for data lines */
-
- CONTINUE:
-
-#if !defined NOPOSIX
- if (posix || do_posix) regfree(&preg);
-#endif
-
- if (re != NULL) free(re);
- if (extra != NULL) free(extra);
- if (tables != NULL)
- {
- free((void *)tables);
- setlocale(LC_CTYPE, "C");
- }
- }
-
-fprintf(outfile, "\n");
-return 0;
-}
-
-/* End */
diff --git a/ext/pcre/pcrelib/study.c b/ext/pcre/pcrelib/study.c
deleted file mode 100644
index 65771b9d74..0000000000
--- a/ext/pcre/pcrelib/study.c
+++ /dev/null
@@ -1,409 +0,0 @@
-/*************************************************
-* Perl-Compatible Regular Expressions *
-*************************************************/
-
-/*
-This is a library of functions to support regular expressions whose syntax
-and semantics are as close as possible to those of the Perl 5 language. See
-the file Tech.Notes for some information on the internals.
-
-Written by: Philip Hazel <ph10@cam.ac.uk>
-
- Copyright (c) 1997-2003 University of Cambridge
-
------------------------------------------------------------------------------
-Permission is granted to anyone to use this software for any purpose on any
-computer system, and to redistribute it freely, subject to the following
-restrictions:
-
-1. This software is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-
-2. The origin of this software must not be misrepresented, either by
- explicit claim or by omission.
-
-3. Altered versions must be plainly marked as such, and must not be
- misrepresented as being the original software.
-
-4. If PCRE is embedded in any software that is released under the GNU
- General Purpose Licence (GPL), then the terms of that licence shall
- supersede any condition above with which it is incompatible.
------------------------------------------------------------------------------
-*/
-
-
-/* Include the internals header, which itself includes Standard C headers plus
-the external pcre header. */
-
-#include "internal.h"
-
-
-
-/*************************************************
-* Set a bit and maybe its alternate case *
-*************************************************/
-
-/* Record a character in the starting-character bit map. When matching is
-caseless and the character is classed as a letter by the ctypes table, the
-bit for its other-case form (looked up in the fcc table) is recorded as well.
-
-Arguments:
-  start_bits    points to the bit map
-  c             is the character
-  caseless      the caseless flag
-  cd            the block with char table pointers
-
-Returns:        nothing
-*/
-
-static void
-set_bit(uschar *start_bits, int c, BOOL caseless, compile_data *cd)
-{
-int other;
-
-/* Byte c/8 of the map holds the bit for c; c&7 picks the bit in that byte. */
-
-start_bits[c/8] |= (1 << (c&7));
-
-/* Nothing more to do unless we are caseless and c is a letter. */
-
-if (!caseless || (cd->ctypes[c] & ctype_letter) == 0) return;
-
-other = cd->fcc[c];
-start_bits[other/8] |= (1 << (other&7));
-}
-
-
-
-/*************************************************
-* Create bitmap of starting chars *
-*************************************************/
-
-/* This function scans a compiled unanchored expression and attempts to build a
-bitmap of the set of initial characters. If it can't, it returns FALSE. As time
-goes by, we may be able to get more clever at doing this.
-
-Each alternative of the group at "code" is scanned in turn. Within a branch,
-items that may match zero characters contribute their bits and scanning moves
-on to the next item; the first item that must match at least one character
-contributes its bits and ends the scan of that branch. Any opcode that is not
-explicitly handled makes the whole function give up and return FALSE, because
-an incomplete map would cause valid match positions to be skipped.
-
-Arguments:
-  code         points to an expression
-  start_bits   points to a 32-byte table, initialized to 0
-  caseless     the current state of the caseless flag
-  cd           the block with char table pointers
-
-Returns:       TRUE if table built, FALSE otherwise
-*/
-
-static BOOL
-set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
-  compile_data *cd)
-{
-register int c;
-
-/* This next statement and the later reference to dummy are here in order to
-trick the optimizer of the IBM C compiler for OS/2 into generating correct
-code. Apparently IBM isn't going to fix the problem, and we would rather not
-disable optimization (in this module it actually makes a big difference, and
-the pcre module can use all the optimization it can get). */
-
-volatile int dummy;
-
-do
-  {
-  const uschar *tcode = code + 1 + LINK_SIZE;
-  BOOL try_next = TRUE;
-
-  while (try_next)
-    {
-    /* If a branch starts with a bracket or a positive lookahead assertion,
-    recurse to set bits from within them. That's all for this branch. */
-
-    if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
-      {
-      if (!set_start_bits(tcode, start_bits, caseless, cd))
-        return FALSE;
-      try_next = FALSE;
-      }
-
-    else switch(*tcode)
-      {
-      /* Anything not listed below cannot be analysed: give up. */
-
-      default:
-      return FALSE;
-
-      /* Skip over callout */
-
-      case OP_CALLOUT:
-      tcode += 2;
-      break;
-
-      /* Skip over extended extraction bracket number */
-
-      case OP_BRANUMBER:
-      tcode += 3;
-      break;
-
-      /* Skip over lookbehind and negative lookahead assertions */
-
-      case OP_ASSERT_NOT:
-      case OP_ASSERTBACK:
-      case OP_ASSERTBACK_NOT:
-      do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
-      tcode += 1+LINK_SIZE;
-      break;
-
-      /* Skip over an option setting, changing the caseless flag */
-
-      case OP_OPT:
-      caseless = (tcode[1] & PCRE_CASELESS) != 0;
-      tcode += 2;
-      break;
-
-      /* BRAZERO does the bracket, but carries on. */
-
-      case OP_BRAZERO:
-      case OP_BRAMINZERO:
-      if (!set_start_bits(++tcode, start_bits, caseless, cd))
-        return FALSE;
-      dummy = 1;
-      do tcode += GET(tcode,1); while (*tcode == OP_ALT);
-      tcode += 1+LINK_SIZE;
-      break;
-
-      /* Single-char * or ? sets the bit and tries the next item */
-
-      case OP_STAR:
-      case OP_MINSTAR:
-      case OP_QUERY:
-      case OP_MINQUERY:
-      set_bit(start_bits, tcode[1], caseless, cd);
-      tcode += 2;
-      break;
-
-      /* Single-char upto sets the bit and tries the next */
-
-      case OP_UPTO:
-      case OP_MINUPTO:
-      set_bit(start_bits, tcode[3], caseless, cd);
-      tcode += 4;
-      break;
-
-      /* At least one single char sets the bit and stops. The two increments
-      below step tcode forward so that tcode[1] is the character in every
-      case: two bytes on for OP_EXACT, one byte on for OP_CHARS. */
-
-      case OP_EXACT:       /* Fall through */
-      tcode++;
-
-      case OP_CHARS:       /* Fall through */
-      tcode++;
-
-      case OP_PLUS:
-      case OP_MINPLUS:
-      set_bit(start_bits, tcode[1], caseless, cd);
-      try_next = FALSE;
-      break;
-
-      /* Single character type sets the bits and stops */
-
-      case OP_NOT_DIGIT:
-      for (c = 0; c < 32; c++)
-        start_bits[c] |= ~cd->cbits[c+cbit_digit];
-      try_next = FALSE;
-      break;
-
-      case OP_DIGIT:
-      for (c = 0; c < 32; c++)
-        start_bits[c] |= cd->cbits[c+cbit_digit];
-      try_next = FALSE;
-      break;
-
-      case OP_NOT_WHITESPACE:
-      for (c = 0; c < 32; c++)
-        start_bits[c] |= ~cd->cbits[c+cbit_space];
-      try_next = FALSE;
-      break;
-
-      case OP_WHITESPACE:
-      for (c = 0; c < 32; c++)
-        start_bits[c] |= cd->cbits[c+cbit_space];
-      try_next = FALSE;
-      break;
-
-      case OP_NOT_WORDCHAR:
-      for (c = 0; c < 32; c++)
-        start_bits[c] |= ~cd->cbits[c+cbit_word];
-      try_next = FALSE;
-      break;
-
-      case OP_WORDCHAR:
-      for (c = 0; c < 32; c++)
-        start_bits[c] |= cd->cbits[c+cbit_word];
-      try_next = FALSE;
-      break;
-
-      /* One or more character type fudges the pointer and restarts, knowing
-      it will hit a single character type and stop there. */
-
-      case OP_TYPEPLUS:
-      case OP_TYPEMINPLUS:
-      tcode++;
-      break;
-
-      case OP_TYPEEXACT:
-      tcode += 3;
-      break;
-
-      /* Zero or more repeats of character types set the bits and then
-      try again. */
-
-      case OP_TYPEUPTO:
-      case OP_TYPEMINUPTO:
-      tcode += 2;               /* Fall through */
-
-      case OP_TYPESTAR:
-      case OP_TYPEMINSTAR:
-      case OP_TYPEQUERY:
-      case OP_TYPEMINQUERY:
-      switch(tcode[1])
-        {
-        /* A type with no table entry below (for example one that matches
-        any character) cannot be represented here. Carrying on without
-        setting any bits would leave the map incomplete, so give up, exactly
-        as the outer switch does when it meets such a type directly. */
-
-        default:
-        return FALSE;
-
-        case OP_NOT_DIGIT:
-        for (c = 0; c < 32; c++)
-          start_bits[c] |= ~cd->cbits[c+cbit_digit];
-        break;
-
-        case OP_DIGIT:
-        for (c = 0; c < 32; c++)
-          start_bits[c] |= cd->cbits[c+cbit_digit];
-        break;
-
-        case OP_NOT_WHITESPACE:
-        for (c = 0; c < 32; c++)
-          start_bits[c] |= ~cd->cbits[c+cbit_space];
-        break;
-
-        case OP_WHITESPACE:
-        for (c = 0; c < 32; c++)
-          start_bits[c] |= cd->cbits[c+cbit_space];
-        break;
-
-        case OP_NOT_WORDCHAR:
-        for (c = 0; c < 32; c++)
-          start_bits[c] |= ~cd->cbits[c+cbit_word];
-        break;
-
-        case OP_WORDCHAR:
-        for (c = 0; c < 32; c++)
-          start_bits[c] |= cd->cbits[c+cbit_word];
-        break;
-        }
-
-      tcode += 2;
-      break;
-
-      /* Character class: set the bits and either carry on or not,
-      according to the repeat count. */
-
-      case OP_CLASS:
-        {
-        tcode++;
-        for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
-        tcode += 32;
-        switch (*tcode)
-          {
-          case OP_CRSTAR:
-          case OP_CRMINSTAR:
-          case OP_CRQUERY:
-          case OP_CRMINQUERY:
-          tcode++;
-          break;
-
-          /* The two bytes after the opcode are read as a 16-bit value, high
-          byte first (presumably the minimum repeat count). Only when it is
-          zero can the class be skipped and the next item examined. */
-
-          case OP_CRRANGE:
-          case OP_CRMINRANGE:
-          if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
-            else try_next = FALSE;
-          break;
-
-          default:
-          try_next = FALSE;
-          break;
-          }
-        }
-      break; /* End of class handling */
-
-      }      /* End of switch */
-    }        /* End of try_next loop */
-
-  code += GET(code, 1);   /* Advance to next branch */
-  }
-while (*code == OP_ALT);
-return TRUE;
-}
-
-
-
-/*************************************************
-* Study a compiled expression *
-*************************************************/
-
-/* This function is handed a compiled expression that it must study to produce
-information that will speed up the matching. It returns a pcre_extra block
-which then gets handed back to pcre_exec(). The block is obtained from
-pcre_malloc and holds a copy of the 32-byte starting-character bit map.
-
-Arguments:
-  re        points to the compiled expression
-  options   contains option bits
-  errorptr  points to where to place error messages;
-            set NULL unless error; must not itself be NULL, because it is
-            written to unconditionally
-
-Returns:    pointer to a pcre_extra block,
-            NULL on error or if no optimization possible
-*/
-
-pcre_extra *
-pcre_study(const pcre *external_re, int options, const char **errorptr)
-{
-uschar start_bits[32];
-real_pcre_extra *extra;
-const real_pcre *re = (const real_pcre *)external_re;
-uschar *code;
-compile_data compile_block;
-
-*errorptr = NULL;
-
-/* Validate the argument before touching any of its fields, so that a NULL
-or foreign pointer is reported through errorptr rather than dereferenced. */
-
-if (re == NULL || re->magic_number != MAGIC_NUMBER)
-  {
-  *errorptr = "argument is not a compiled regular expression";
-  return NULL;
-  }
-
-if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
-  {
-  *errorptr = "unknown or incorrect option bit(s) set";
-  return NULL;
-  }
-
-/* For an anchored pattern, or an unanchored pattern that has a first char, or
-a multiline pattern that matches only at "line starts", no further processing
-at present. */
-
-if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
-  return NULL;
-
-/* The compiled code follows the fixed header and the name table. This is
-computed only now that re is known to be a valid compiled pattern. */
-
-code = (uschar *)re + sizeof(real_pcre) +
-  (re->name_count * re->name_entry_size);
-
-/* Set the character tables in the block which is passed around */
-
-compile_block.lcc = re->tables + lcc_offset;
-compile_block.fcc = re->tables + fcc_offset;
-compile_block.cbits = re->tables + cbits_offset;
-compile_block.ctypes = re->tables + ctypes_offset;
-
-/* See if we can find a fixed set of initial characters for the pattern. */
-
-memset(start_bits, 0, sizeof(start_bits));
-if (!set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
-  &compile_block)) return NULL;
-
-/* Get an "extra" block and put the information therein. */
-
-extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));
-
-if (extra == NULL)
-  {
-  *errorptr = "failed to get memory";
-  return NULL;
-  }
-
-extra->options = PCRE_STUDY_MAPPED;
-memcpy(extra->start_bits, start_bits, sizeof(start_bits));
-
-return (pcre_extra *)extra;
-}
-
-/* End of study.c */
diff --git a/ext/pcre/pcrelib/testdata/testinput1 b/ext/pcre/pcrelib/testdata/testinput1
deleted file mode 100644
index c64257685b..0000000000
--- a/ext/pcre/pcrelib/testdata/testinput1
+++ /dev/null
@@ -1,3808 +0,0 @@
-/the quick brown fox/
- the quick brown fox
- The quick brown FOX
- What do you know about the quick brown fox?
- What do you know about THE QUICK BROWN FOX?
-
-/The quick brown fox/i
- the quick brown fox
- The quick brown FOX
- What do you know about the quick brown fox?
- What do you know about THE QUICK BROWN FOX?
-
-/abcd\t\n\r\f\a\e\071\x3b\$\\\?caxyz/
- abcd\t\n\r\f\a\e9;\$\\?caxyz
-
-/a*abc?xyz+pqr{3}ab{2,}xy{4,5}pq{0,6}AB{0,}zz/
- abxyzpqrrrabbxyyyypqAzz
- abxyzpqrrrabbxyyyypqAzz
- aabxyzpqrrrabbxyyyypqAzz
- aaabxyzpqrrrabbxyyyypqAzz
- aaaabxyzpqrrrabbxyyyypqAzz
- abcxyzpqrrrabbxyyyypqAzz
- aabcxyzpqrrrabbxyyyypqAzz
- aaabcxyzpqrrrabbxyyyypAzz
- aaabcxyzpqrrrabbxyyyypqAzz
- aaabcxyzpqrrrabbxyyyypqqAzz
- aaabcxyzpqrrrabbxyyyypqqqAzz
- aaabcxyzpqrrrabbxyyyypqqqqAzz
- aaabcxyzpqrrrabbxyyyypqqqqqAzz
- aaabcxyzpqrrrabbxyyyypqqqqqqAzz
- aaaabcxyzpqrrrabbxyyyypqAzz
- abxyzzpqrrrabbxyyyypqAzz
- aabxyzzzpqrrrabbxyyyypqAzz
- aaabxyzzzzpqrrrabbxyyyypqAzz
- aaaabxyzzzzpqrrrabbxyyyypqAzz
- abcxyzzpqrrrabbxyyyypqAzz
- aabcxyzzzpqrrrabbxyyyypqAzz
- aaabcxyzzzzpqrrrabbxyyyypqAzz
- aaaabcxyzzzzpqrrrabbxyyyypqAzz
- aaaabcxyzzzzpqrrrabbbxyyyypqAzz
- aaaabcxyzzzzpqrrrabbbxyyyyypqAzz
- aaabcxyzpqrrrabbxyyyypABzz
- aaabcxyzpqrrrabbxyyyypABBzz
- >>>aaabxyzpqrrrabbxyyyypqAzz
- >aaaabxyzpqrrrabbxyyyypqAzz
- >>>>abcxyzpqrrrabbxyyyypqAzz
- *** Failers
- abxyzpqrrabbxyyyypqAzz
- abxyzpqrrrrabbxyyyypqAzz
- abxyzpqrrrabxyyyypqAzz
- aaaabcxyzzzzpqrrrabbbxyyyyyypqAzz
- aaaabcxyzzzzpqrrrabbbxyyypqAzz
- aaabcxyzpqrrrabbxyyyypqqqqqqqAzz
-
-/^(abc){1,2}zz/
- abczz
- abcabczz
- *** Failers
- zz
- abcabcabczz
- >>abczz
-
-/^(b+?|a){1,2}?c/
- bc
- bbc
- bbbc
- bac
- bbac
- aac
- abbbbbbbbbbbc
- bbbbbbbbbbbac
- *** Failers
- aaac
- abbbbbbbbbbbac
-
-/^(b+|a){1,2}c/
- bc
- bbc
- bbbc
- bac
- bbac
- aac
- abbbbbbbbbbbc
- bbbbbbbbbbbac
- *** Failers
- aaac
- abbbbbbbbbbbac
-
-/^(b+|a){1,2}?bc/
- bbc
-
-/^(b*|ba){1,2}?bc/
- babc
- bbabc
- bababc
- *** Failers
- bababbc
- babababc
-
-/^(ba|b*){1,2}?bc/
- babc
- bbabc
- bababc
- *** Failers
- bababbc
- babababc
-
-/^\ca\cA\c[\c{\c:/
- \x01\x01\e;z
-
-/^[ab\]cde]/
- athing
- bthing
- ]thing
- cthing
- dthing
- ething
- *** Failers
- fthing
- [thing
- \\thing
-
-/^[]cde]/
- ]thing
- cthing
- dthing
- ething
- *** Failers
- athing
- fthing
-
-/^[^ab\]cde]/
- fthing
- [thing
- \\thing
- *** Failers
- athing
- bthing
- ]thing
- cthing
- dthing
- ething
-
-/^[^]cde]/
- athing
- fthing
- *** Failers
- ]thing
- cthing
- dthing
- ething
-
-/^\/
-
-
-/^ÿ/
- ÿ
-
-/^[0-9]+$/
- 0
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 100
- *** Failers
- abc
-
-/^.*nter/
- enter
- inter
- uponter
-
-/^xxx[0-9]+$/
- xxx0
- xxx1234
- *** Failers
- xxx
-
-/^.+[0-9][0-9][0-9]$/
- x123
- xx123
- 123456
- *** Failers
- 123
- x1234
-
-/^.+?[0-9][0-9][0-9]$/
- x123
- xx123
- 123456
- *** Failers
- 123
- x1234
-
-/^([^!]+)!(.+)=apquxz\.ixr\.zzz\.ac\.uk$/
- abc!pqr=apquxz.ixr.zzz.ac.uk
- *** Failers
- !pqr=apquxz.ixr.zzz.ac.uk
- abc!=apquxz.ixr.zzz.ac.uk
- abc!pqr=apquxz:ixr.zzz.ac.uk
- abc!pqr=apquxz.ixr.zzz.ac.ukk
-
-/:/
- Well, we need a colon: somewhere
- *** Fail if we don't
-
-/([\da-f:]+)$/i
- 0abc
- abc
- fed
- E
- ::
- 5f03:12C0::932e
- fed def
- Any old stuff
- *** Failers
- 0zzz
- gzzz
- fed\x20
- Any old rubbish
-
-/^.*\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/
- .1.2.3
- A.12.123.0
- *** Failers
- .1.2.3333
- 1.2.3
- 1234.2.3
-
-/^(\d+)\s+IN\s+SOA\s+(\S+)\s+(\S+)\s*\(\s*$/
- 1 IN SOA non-sp1 non-sp2(
- 1 IN SOA non-sp1 non-sp2 (
- *** Failers
- 1IN SOA non-sp1 non-sp2(
-
-/^[a-zA-Z\d][a-zA-Z\d\-]*(\.[a-zA-Z\d][a-zA-z\d\-]*)*\.$/
- a.
- Z.
- 2.
- ab-c.pq-r.
- sxk.zzz.ac.uk.
- x-.y-.
- *** Failers
- -abc.peq.
-
-/^\*\.[a-z]([a-z\-\d]*[a-z\d]+)?(\.[a-z]([a-z\-\d]*[a-z\d]+)?)*$/
- *.a
- *.b0-a
- *.c3-b.c
- *.c-a.b-c
- *** Failers
- *.0
- *.a-
- *.a-b.c-
- *.c-a.0-c
-
-/^(?=ab(de))(abd)(e)/
- abde
-
-/^(?!(ab)de|x)(abd)(f)/
- abdf
-
-/^(?=(ab(cd)))(ab)/
- abcd
-
-/^[\da-f](\.[\da-f])*$/i
- a.b.c.d
- A.B.C.D
- a.b.c.1.2.3.C
-
-/^\".*\"\s*(;.*)?$/
- \"1234\"
- \"abcd\" ;
- \"\" ; rhubarb
- *** Failers
- \"1234\" : things
-
-/^$/
- \
- *** Failers
-
-/ ^ a (?# begins with a) b\sc (?# then b c) $ (?# then end)/x
- ab c
- *** Failers
- abc
- ab cde
-
-/(?x) ^ a (?# begins with a) b\sc (?# then b c) $ (?# then end)/
- ab c
- *** Failers
- abc
- ab cde
-
-/^ a\ b[c ]d $/x
- a bcd
- a b d
- *** Failers
- abcd
- ab d
-
-/^(a(b(c)))(d(e(f)))(h(i(j)))(k(l(m)))$/
- abcdefhijklm
-
-/^(?:a(b(c)))(?:d(e(f)))(?:h(i(j)))(?:k(l(m)))$/
- abcdefhijklm
-
-/^[\w][\W][\s][\S][\d][\D][\b][\n][\c]][\022]/
- a+ Z0+\x08\n\x1d\x12
-
-/^[.^$|()*+?{,}]+/
- .^\$(*+)|{?,?}
-
-/^a*\w/
- z
- az
- aaaz
- a
- aa
- aaaa
- a+
- aa+
-
-/^a*?\w/
- z
- az
- aaaz
- a
- aa
- aaaa
- a+
- aa+
-
-/^a+\w/
- az
- aaaz
- aa
- aaaa
- aa+
-
-/^a+?\w/
- az
- aaaz
- aa
- aaaa
- aa+
-
-/^\d{8}\w{2,}/
- 1234567890
- 12345678ab
- 12345678__
- *** Failers
- 1234567
-
-/^[aeiou\d]{4,5}$/
- uoie
- 1234
- 12345
- aaaaa
- *** Failers
- 123456
-
-/^[aeiou\d]{4,5}?/
- uoie
- 1234
- 12345
- aaaaa
- 123456
-
-/\A(abc|def)=(\1){2,3}\Z/
- abc=abcabc
- def=defdefdef
- *** Failers
- abc=defdef
-
-/^(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)\11*(\3\4)\1(?#)2$/
- abcdefghijkcda2
- abcdefghijkkkkcda2
-
-/(cat(a(ract|tonic)|erpillar)) \1()2(3)/
- cataract cataract23
- catatonic catatonic23
- caterpillar caterpillar23
-
-
-/^From +([^ ]+) +[a-zA-Z][a-zA-Z][a-zA-Z] +[a-zA-Z][a-zA-Z][a-zA-Z] +[0-9]?[0-9] +[0-9][0-9]:[0-9][0-9]/
- From abcd Mon Sep 01 12:33:02 1997
-
-/^From\s+\S+\s+([a-zA-Z]{3}\s+){2}\d{1,2}\s+\d\d:\d\d/
- From abcd Mon Sep 01 12:33:02 1997
- From abcd Mon Sep 1 12:33:02 1997
- *** Failers
- From abcd Sep 01 12:33:02 1997
-
-/^12.34/s
- 12\n34
- 12\r34
-
-/\w+(?=\t)/
- the quick brown\t fox
-
-/foo(?!bar)(.*)/
- foobar is foolish see?
-
-/(?:(?!foo)...|^.{0,2})bar(.*)/
- foobar crowbar etc
- barrel
- 2barrel
- A barrel
-
-/^(\D*)(?=\d)(?!123)/
- abc456
- *** Failers
- abc123
-
-/^1234(?# test newlines
- inside)/
- 1234
-
-/^1234 #comment in extended re
- /x
- 1234
-
-/#rhubarb
- abcd/x
- abcd
-
-/^abcd#rhubarb/x
- abcd
-
-/^(a)\1{2,3}(.)/
- aaab
- aaaab
- aaaaab
- aaaaaab
-
-/(?!^)abc/
- the abc
- *** Failers
- abc
-
-/(?=^)abc/
- abc
- *** Failers
- the abc
-
-/^[ab]{1,3}(ab*|b)/
- aabbbbb
-
-/^[ab]{1,3}?(ab*|b)/
- aabbbbb
-
-/^[ab]{1,3}?(ab*?|b)/
- aabbbbb
-
-/^[ab]{1,3}(ab*?|b)/
- aabbbbb
-
-/ (?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* # optional leading comment
-(?: (?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-|
-" (?: # opening quote...
-[^\\\x80-\xff\n\015"] # Anything except backslash and quote
-| # or
-\\ [^\x80-\xff] # Escaped something (something != CR)
-)* " # closing quote
-) # initial word
-(?: (?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* \. (?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* (?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-|
-" (?: # opening quote...
-[^\\\x80-\xff\n\015"] # Anything except backslash and quote
-| # or
-\\ [^\x80-\xff] # Escaped something (something != CR)
-)* " # closing quote
-) )* # further okay, if led by a period
-(?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* @ (?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* (?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-| \[ # [
-(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff
-\] # ]
-) # initial subdomain
-(?: #
-(?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* \. # if led by a period...
-(?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* (?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-| \[ # [
-(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff
-\] # ]
-) # ...further okay
-)*
-# address
-| # or
-(?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-|
-" (?: # opening quote...
-[^\\\x80-\xff\n\015"] # Anything except backslash and quote
-| # or
-\\ [^\x80-\xff] # Escaped something (something != CR)
-)* " # closing quote
-) # one word, optionally followed by....
-(?:
-[^()<>@,;:".\\\[\]\x80-\xff\000-\010\012-\037] | # atom and space parts, or...
-\(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) | # comments, or...
-
-" (?: # opening quote...
-[^\\\x80-\xff\n\015"] # Anything except backslash and quote
-| # or
-\\ [^\x80-\xff] # Escaped something (something != CR)
-)* " # closing quote
-# quoted strings
-)*
-< (?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* # leading <
-(?: @ (?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* (?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-| \[ # [
-(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff
-\] # ]
-) # initial subdomain
-(?: #
-(?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* \. # if led by a period...
-(?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* (?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-| \[ # [
-(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff
-\] # ]
-) # ...further okay
-)*
-
-(?: (?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* , (?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* @ (?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* (?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-| \[ # [
-(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff
-\] # ]
-) # initial subdomain
-(?: #
-(?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* \. # if led by a period...
-(?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* (?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-| \[ # [
-(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff
-\] # ]
-) # ...further okay
-)*
-)* # further okay, if led by comma
-: # closing colon
-(?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* )? # optional route
-(?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-|
-" (?: # opening quote...
-[^\\\x80-\xff\n\015"] # Anything except backslash and quote
-| # or
-\\ [^\x80-\xff] # Escaped something (something != CR)
-)* " # closing quote
-) # initial word
-(?: (?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* \. (?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* (?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-|
-" (?: # opening quote...
-[^\\\x80-\xff\n\015"] # Anything except backslash and quote
-| # or
-\\ [^\x80-\xff] # Escaped something (something != CR)
-)* " # closing quote
-) )* # further okay, if led by a period
-(?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* @ (?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* (?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-| \[ # [
-(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff
-\] # ]
-) # initial subdomain
-(?: #
-(?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* \. # if led by a period...
-(?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* (?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-| \[ # [
-(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff
-\] # ]
-) # ...further okay
-)*
-# address spec
-(?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* > # trailing >
-# name and address
-) (?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* # optional trailing comment
-/x
- Alan Other <user\@dom.ain>
- <user\@dom.ain>
- user\@dom.ain
- \"A. Other\" <user.1234\@dom.ain> (a comment)
- A. Other <user.1234\@dom.ain> (a comment)
- \"/s=user/ou=host/o=place/prmd=uu.yy/admd= /c=gb/\"\@x400-re.lay
- A missing angle <user\@some.where
- *** Failers
- The quick brown fox
-
-/[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-# optional leading comment
-(?:
-(?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-# Atom
-| # or
-" # "
-[^\\\x80-\xff\n\015"] * # normal
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015"] * )* # ( special normal* )*
-" # "
-# Quoted string
-)
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-(?:
-\.
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-(?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-# Atom
-| # or
-" # "
-[^\\\x80-\xff\n\015"] * # normal
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015"] * )* # ( special normal* )*
-" # "
-# Quoted string
-)
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-# additional words
-)*
-@
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-(?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-|
-\[ # [
-(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff
-\] # ]
-)
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-# optional trailing comments
-(?:
-\.
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-(?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-|
-\[ # [
-(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff
-\] # ]
-)
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-# optional trailing comments
-)*
-# address
-| # or
-(?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-# Atom
-| # or
-" # "
-[^\\\x80-\xff\n\015"] * # normal
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015"] * )* # ( special normal* )*
-" # "
-# Quoted string
-)
-# leading word
-[^()<>@,;:".\\\[\]\x80-\xff\000-\010\012-\037] * # "normal" atoms and or spaces
-(?:
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-|
-" # "
-[^\\\x80-\xff\n\015"] * # normal
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015"] * )* # ( special normal* )*
-" # "
-) # "special" comment or quoted string
-[^()<>@,;:".\\\[\]\x80-\xff\000-\010\012-\037] * # more "normal"
-)*
-<
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-# <
-(?:
-@
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-(?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-|
-\[ # [
-(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff
-\] # ]
-)
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-# optional trailing comments
-(?:
-\.
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-(?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-|
-\[ # [
-(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff
-\] # ]
-)
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-# optional trailing comments
-)*
-(?: ,
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-@
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-(?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-|
-\[ # [
-(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff
-\] # ]
-)
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-# optional trailing comments
-(?:
-\.
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-(?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-|
-\[ # [
-(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff
-\] # ]
-)
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-# optional trailing comments
-)*
-)* # additional domains
-:
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-# optional trailing comments
-)? # optional route
-(?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-# Atom
-| # or
-" # "
-[^\\\x80-\xff\n\015"] * # normal
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015"] * )* # ( special normal* )*
-" # "
-# Quoted string
-)
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-(?:
-\.
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-(?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-# Atom
-| # or
-" # "
-[^\\\x80-\xff\n\015"] * # normal
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015"] * )* # ( special normal* )*
-" # "
-# Quoted string
-)
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-# additional words
-)*
-@
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-(?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-|
-\[ # [
-(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff
-\] # ]
-)
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-# optional trailing comments
-(?:
-\.
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-(?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-|
-\[ # [
-(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff
-\] # ]
-)
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-# optional trailing comments
-)*
-# address spec
-> # >
-# name and address
-)
-/x
- Alan Other <user\@dom.ain>
- <user\@dom.ain>
- user\@dom.ain
- \"A. Other\" <user.1234\@dom.ain> (a comment)
- A. Other <user.1234\@dom.ain> (a comment)
- \"/s=user/ou=host/o=place/prmd=uu.yy/admd= /c=gb/\"\@x400-re.lay
- A missing angle <user\@some.where
- *** Failers
- The quick brown fox
-
-/abc\0def\00pqr\000xyz\0000AB/
- abc\0def\00pqr\000xyz\0000AB
- abc456 abc\0def\00pqr\000xyz\0000ABCDE
-
-/abc\x0def\x00pqr\x000xyz\x0000AB/
- abc\x0def\x00pqr\x000xyz\x0000AB
- abc456 abc\x0def\x00pqr\x000xyz\x0000ABCDE
-
-/^[\000-\037]/
- \0A
- \01B
- \037C
-
-/\0*/
- \0\0\0\0
-
-/A\x0{2,3}Z/
- The A\x0\x0Z
- An A\0\x0\0Z
- *** Failers
- A\0Z
- A\0\x0\0\x0Z
-
-/^(cow|)\1(bell)/
- cowcowbell
- bell
- *** Failers
- cowbell
-
-/^\s/
- \040abc
- \x0cabc
- \nabc
- \rabc
- \tabc
- *** Failers
- abc
-
-/^a b
-
- c/x
- abc
-
-/^(a|)\1*b/
- ab
- aaaab
- b
- *** Failers
- acb
-
-/^(a|)\1+b/
- aab
- aaaab
- b
- *** Failers
- ab
-
-/^(a|)\1?b/
- ab
- aab
- b
- *** Failers
- acb
-
-/^(a|)\1{2}b/
- aaab
- b
- *** Failers
- ab
- aab
- aaaab
-
-/^(a|)\1{2,3}b/
- aaab
- aaaab
- b
- *** Failers
- ab
- aab
- aaaaab
-
-/ab{1,3}bc/
- abbbbc
- abbbc
- abbc
- *** Failers
- abc
- abbbbbc
-
-/([^.]*)\.([^:]*):[T ]+(.*)/
- track1.title:TBlah blah blah
-
-/([^.]*)\.([^:]*):[T ]+(.*)/i
- track1.title:TBlah blah blah
-
-/([^.]*)\.([^:]*):[t ]+(.*)/i
- track1.title:TBlah blah blah
-
-/^[W-c]+$/
- WXY_^abc
- ***Failers
- wxy
-
-/^[W-c]+$/i
- WXY_^abc
- wxy_^ABC
-
-/^[\x3f-\x5F]+$/i
- WXY_^abc
- wxy_^ABC
-
-/^abc$/m
- abc
- qqq\nabc
- abc\nzzz
- qqq\nabc\nzzz
-
-/^abc$/
- abc
- *** Failers
- qqq\nabc
- abc\nzzz
- qqq\nabc\nzzz
-
-/\Aabc\Z/m
- abc
- abc\n
- *** Failers
- qqq\nabc
- abc\nzzz
- qqq\nabc\nzzz
-
-/\A(.)*\Z/s
- abc\ndef
-
-/\A(.)*\Z/m
- *** Failers
- abc\ndef
-
-/(?:b)|(?::+)/
- b::c
- c::b
-
-/[-az]+/
- az-
- *** Failers
- b
-
-/[az-]+/
- za-
- *** Failers
- b
-
-/[a\-z]+/
- a-z
- *** Failers
- b
-
-/[a-z]+/
- abcdxyz
-
-/[\d-]+/
- 12-34
- *** Failers
- aaa
-
-/[\d-z]+/
- 12-34z
- *** Failers
- aaa
-
-/\x5c/
- \\
-
-/\x20Z/
- the Zoo
- *** Failers
- Zulu
-
-/(abc)\1/i
- abcabc
- ABCabc
- abcABC
-
-/ab{3cd/
- ab{3cd
-
-/ab{3,cd/
- ab{3,cd
-
-/ab{3,4a}cd/
- ab{3,4a}cd
-
-/{4,5a}bc/
- {4,5a}bc
-
-/^a.b/
- a\rb
- *** Failers
- a\nb
-
-/abc$/
- abc
- abc\n
- *** Failers
- abc\ndef
-
-/(abc)\123/
- abc\x53
-
-/(abc)\223/
- abc\x93
-
-/(abc)\323/
- abc\xd3
-
-/(abc)\500/
- abc\x40
- abc\100
-
-/(abc)\5000/
- abc\x400
- abc\x40\x30
- abc\1000
- abc\100\x30
- abc\100\060
- abc\100\60
-
-/abc\81/
- abc\081
- abc\0\x38\x31
-
-/abc\91/
- abc\091
- abc\0\x39\x31
-
-/(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)\12\123/
- abcdefghijkllS
-
-/(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)\12\123/
- abcdefghijk\12S
-
-/ab\gdef/
- abgdef
-
-/a{0}bc/
- bc
-
-/(a|(bc)){0,0}?xyz/
- xyz
-
-/abc[\10]de/
- abc\010de
-
-/abc[\1]de/
- abc\1de
-
-/(abc)[\1]de/
- abc\1de
-
-/(?s)a.b/
- a\nb
-
-/^([^a])([^\b])([^c]*)([^d]{3,4})/
- baNOTccccd
- baNOTcccd
- baNOTccd
- bacccd
- *** Failers
- anything
- b\bc
- baccd
-
-/[^a]/
- Abc
-
-/[^a]/i
- Abc
-
-/[^a]+/
- AAAaAbc
-
-/[^a]+/i
- AAAaAbc
-
-/[^a]+/
- bbb\nccc
-
-/[^k]$/
- abc
- *** Failers
- abk
-
-/[^k]{2,3}$/
- abc
- kbc
- kabc
- *** Failers
- abk
- akb
- akk
-
-/^\d{8,}\@.+[^k]$/
- 12345678\@a.b.c.d
- 123456789\@x.y.z
- *** Failers
- 12345678\@x.y.uk
- 1234567\@a.b.c.d
-
-/(a)\1{8,}/
- aaaaaaaaa
- aaaaaaaaaa
- *** Failers
- aaaaaaa
-
-/[^a]/
- aaaabcd
- aaAabcd
-
-/[^a]/i
- aaaabcd
- aaAabcd
-
-/[^az]/
- aaaabcd
- aaAabcd
-
-/[^az]/i
- aaaabcd
- aaAabcd
-
-/\000\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037\040\041\042\043\044\045\046\047\050\051\052\053\054\055\056\057\060\061\062\063\064\065\066\067\070\071\072\073\074\075\076\077\100\101\102\103\104\105\106\107\110\111\112\113\114\115\116\117\120\121\122\123\124\125\126\127\130\131\132\133\134\135\136\137\140\141\142\143\144\145\146\147\150\151\152\153\154\155\156\157\160\161\162\163\164\165\166\167\170\171\172\173\174\175\176\177\200\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217\220\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237\240\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337\340\341\342\343\344\345\346\347\350\351\352\353\354\355\356\357\360\361\362\363\364\365\366\367\370\371\372\373\374\375\376\377/
- \000\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037\040\041\042\043\044\045\046\047\050\051\052\053\054\055\056\057\060\061\062\063\064\065\066\067\070\071\072\073\074\075\076\077\100\101\102\103\104\105\106\107\110\111\112\113\114\115\116\117\120\121\122\123\124\125\126\127\130\131\132\133\134\135\136\137\140\141\142\143\144\145\146\147\150\151\152\153\154\155\156\157\160\161\162\163\164\165\166\167\170\171\172\173\174\175\176\177\200\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217\220\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237\240\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337\340\341\342\343\344\345\346\347\350\351\352\353\354\355\356\357\360\361\362\363\364\365\366\367\370\371\372\373\374\375\376\377
-
-/P[^*]TAIRE[^*]{1,6}?LL/
- xxxxxxxxxxxPSTAIREISLLxxxxxxxxx
-
-/P[^*]TAIRE[^*]{1,}?LL/
- xxxxxxxxxxxPSTAIREISLLxxxxxxxxx
-
-/(\.\d\d[1-9]?)\d+/
- 1.230003938
- 1.875000282
- 1.235
-
-/(\.\d\d((?=0)|\d(?=\d)))/
- 1.230003938
- 1.875000282
- *** Failers
- 1.235
-
-/a(?)b/
- ab
-
-/\b(foo)\s+(\w+)/i
- Food is on the foo table
-
-/foo(.*)bar/
- The food is under the bar in the barn.
-
-/foo(.*?)bar/
- The food is under the bar in the barn.
-
-/(.*)(\d*)/
- I have 2 numbers: 53147
-
-/(.*)(\d+)/
- I have 2 numbers: 53147
-
-/(.*?)(\d*)/
- I have 2 numbers: 53147
-
-/(.*?)(\d+)/
- I have 2 numbers: 53147
-
-/(.*)(\d+)$/
- I have 2 numbers: 53147
-
-/(.*?)(\d+)$/
- I have 2 numbers: 53147
-
-/(.*)\b(\d+)$/
- I have 2 numbers: 53147
-
-/(.*\D)(\d+)$/
- I have 2 numbers: 53147
-
-/^\D*(?!123)/
- ABC123
-
-/^(\D*)(?=\d)(?!123)/
- ABC445
- *** Failers
- ABC123
-
-/^[W-]46]/
- W46]789
- -46]789
- *** Failers
- Wall
- Zebra
- 42
- [abcd]
- ]abcd[
-
-/^[W-\]46]/
- W46]789
- Wall
- Zebra
- Xylophone
- 42
- [abcd]
- ]abcd[
- \\backslash
- *** Failers
- -46]789
- well
-
-/\d\d\/\d\d\/\d\d\d\d/
- 01/01/2000
-
-/word (?:[a-zA-Z0-9]+ ){0,10}otherword/
- word cat dog elephant mussel cow horse canary baboon snake shark otherword
- word cat dog elephant mussel cow horse canary baboon snake shark
-
-/word (?:[a-zA-Z0-9]+ ){0,300}otherword/
- word cat dog elephant mussel cow horse canary baboon snake shark the quick brown fox and the lazy dog and several other words getting close to thirty by now I hope
-
-/^(a){0,0}/
- bcd
- abc
- aab
-
-/^(a){0,1}/
- bcd
- abc
- aab
-
-/^(a){0,2}/
- bcd
- abc
- aab
-
-/^(a){0,3}/
- bcd
- abc
- aab
- aaa
-
-/^(a){0,}/
- bcd
- abc
- aab
- aaa
- aaaaaaaa
-
-/^(a){1,1}/
- bcd
- abc
- aab
-
-/^(a){1,2}/
- bcd
- abc
- aab
-
-/^(a){1,3}/
- bcd
- abc
- aab
- aaa
-
-/^(a){1,}/
- bcd
- abc
- aab
- aaa
- aaaaaaaa
-
-/.*\.gif/
- borfle\nbib.gif\nno
-
-/.{0,}\.gif/
- borfle\nbib.gif\nno
-
-/.*\.gif/m
- borfle\nbib.gif\nno
-
-/.*\.gif/s
- borfle\nbib.gif\nno
-
-/.*\.gif/ms
- borfle\nbib.gif\nno
-
-/.*$/
- borfle\nbib.gif\nno
-
-/.*$/m
- borfle\nbib.gif\nno
-
-/.*$/s
- borfle\nbib.gif\nno
-
-/.*$/ms
- borfle\nbib.gif\nno
-
-/.*$/
- borfle\nbib.gif\nno\n
-
-/.*$/m
- borfle\nbib.gif\nno\n
-
-/.*$/s
- borfle\nbib.gif\nno\n
-
-/.*$/ms
- borfle\nbib.gif\nno\n
-
-/(.*X|^B)/
- abcde\n1234Xyz
- BarFoo
- *** Failers
- abcde\nBar
-
-/(.*X|^B)/m
- abcde\n1234Xyz
- BarFoo
- abcde\nBar
-
-/(.*X|^B)/s
- abcde\n1234Xyz
- BarFoo
- *** Failers
- abcde\nBar
-
-/(.*X|^B)/ms
- abcde\n1234Xyz
- BarFoo
- abcde\nBar
-
-/(?s)(.*X|^B)/
- abcde\n1234Xyz
- BarFoo
- *** Failers
- abcde\nBar
-
-/(?s:.*X|^B)/
- abcde\n1234Xyz
- BarFoo
- *** Failers
- abcde\nBar
-
-/^.*B/
- **** Failers
- abc\nB
-
-/(?s)^.*B/
- abc\nB
-
-/(?m)^.*B/
- abc\nB
-
-/(?ms)^.*B/
- abc\nB
-
-/(?ms)^B/
- abc\nB
-
-/(?s)B$/
- B\n
-
-/^[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]/
- 123456654321
-
-/^\d\d\d\d\d\d\d\d\d\d\d\d/
- 123456654321
-
-/^[\d][\d][\d][\d][\d][\d][\d][\d][\d][\d][\d][\d]/
- 123456654321
-
-/^[abc]{12}/
- abcabcabcabc
-
-/^[a-c]{12}/
- abcabcabcabc
-
-/^(a|b|c){12}/
- abcabcabcabc
-
-/^[abcdefghijklmnopqrstuvwxy0123456789]/
- n
- *** Failers
- z
-
-/abcde{0,0}/
- abcd
- *** Failers
- abce
-
-/ab[cd]{0,0}e/
- abe
- *** Failers
- abcde
-
-/ab(c){0,0}d/
- abd
- *** Failers
- abcd
-
-/a(b*)/
- a
- ab
- abbbb
- *** Failers
- bbbbb
-
-/ab\d{0}e/
- abe
- *** Failers
- ab1e
-
-/"([^\\"]+|\\.)*"/
- the \"quick\" brown fox
- \"the \\\"quick\\\" brown fox\"
-
-/.*?/g+
- abc
-
-/\b/g+
- abc
-
-/\b/+g
- abc
-
-//g
- abc
-
-/<tr([\w\W\s\d][^<>]{0,})><TD([\w\W\s\d][^<>]{0,})>([\d]{0,}\.)(.*)((<BR>([\w\W\s\d][^<>]{0,})|[\s]{0,}))<\/a><\/TD><TD([\w\W\s\d][^<>]{0,})>([\w\W\s\d][^<>]{0,})<\/TD><TD([\w\W\s\d][^<>]{0,})>([\w\W\s\d][^<>]{0,})<\/TD><\/TR>/is
- <TR BGCOLOR='#DBE9E9'><TD align=left valign=top>43.<a href='joblist.cfm?JobID=94 6735&Keyword='>Word Processor<BR>(N-1286)</a></TD><TD align=left valign=top>Lega lstaff.com</TD><TD align=left valign=top>CA - Statewide</TD></TR>
-
-/a[^a]b/
- acb
- a\nb
-
-/a.b/
- acb
- *** Failers
- a\nb
-
-/a[^a]b/s
- acb
- a\nb
-
-/a.b/s
- acb
- a\nb
-
-/^(b+?|a){1,2}?c/
- bac
- bbac
- bbbac
- bbbbac
- bbbbbac
-
-/^(b+|a){1,2}?c/
- bac
- bbac
- bbbac
- bbbbac
- bbbbbac
-
-/(?!\A)x/m
- x\nb\n
- a\bx\n
-
-/\x0{ab}/
- \0{ab}
-
-/(A|B)*?CD/
- CD
-
-/(A|B)*CD/
- CD
-
-/(AB)*?\1/
- ABABAB
-
-/(AB)*\1/
- ABABAB
-
-/(?<!bar)foo/
- foo
- catfood
- arfootle
- rfoosh
- *** Failers
- barfoo
- towbarfoo
-
-/\w{3}(?<!bar)foo/
- catfood
- *** Failers
- foo
- barfoo
- towbarfoo
-
-/(?<=(foo)a)bar/
- fooabar
- *** Failers
- bar
- foobbar
-
-/\Aabc\z/m
- abc
- *** Failers
- abc\n
- qqq\nabc
- abc\nzzz
- qqq\nabc\nzzz
-
-"(?>.*/)foo"
- /this/is/a/very/long/line/in/deed/with/very/many/slashes/in/it/you/see/
-
-"(?>.*/)foo"
- /this/is/a/very/long/line/in/deed/with/very/many/slashes/in/and/foo
-
-/(?>(\.\d\d[1-9]?))\d+/
- 1.230003938
- 1.875000282
- *** Failers
- 1.235
-
-/^((?>\w+)|(?>\s+))*$/
- now is the time for all good men to come to the aid of the party
- *** Failers
- this is not a line with only words and spaces!
-
-/(\d+)(\w)/
- 12345a
- 12345+
-
-/((?>\d+))(\w)/
- 12345a
- *** Failers
- 12345+
-
-/(?>a+)b/
- aaab
-
-/((?>a+)b)/
- aaab
-
-/(?>(a+))b/
- aaab
-
-/(?>b)+/
- aaabbbccc
-
-/(?>a+|b+|c+)*c/
- aaabbbbccccd
-
-/((?>[^()]+)|\([^()]*\))+/
- ((abc(ade)ufh()()x
-
-/\(((?>[^()]+)|\([^()]+\))+\)/
- (abc)
- (abc(def)xyz)
- *** Failers
- ((()aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-
-/a(?-i)b/i
- ab
- Ab
- *** Failers
- aB
- AB
-
-/(a (?x)b c)d e/
- a bcd e
- *** Failers
- a b cd e
- abcd e
- a bcde
-
-/(a b(?x)c d (?-x)e f)/
- a bcde f
- *** Failers
- abcdef
-
-/(a(?i)b)c/
- abc
- aBc
- *** Failers
- abC
- aBC
- Abc
- ABc
- ABC
- AbC
-
-/a(?i:b)c/
- abc
- aBc
- *** Failers
- ABC
- abC
- aBC
-
-/a(?i:b)*c/
- aBc
- aBBc
- *** Failers
- aBC
- aBBC
-
-/a(?=b(?i)c)\w\wd/
- abcd
- abCd
- *** Failers
- aBCd
- abcD
-
-/(?s-i:more.*than).*million/i
- more than million
- more than MILLION
- more \n than Million
- *** Failers
- MORE THAN MILLION
- more \n than \n million
-
-/(?:(?s-i)more.*than).*million/i
- more than million
- more than MILLION
- more \n than Million
- *** Failers
- MORE THAN MILLION
- more \n than \n million
-
-/(?>a(?i)b+)+c/
- abc
- aBbc
- aBBc
- *** Failers
- Abc
- abAb
- abbC
-
-/(?=a(?i)b)\w\wc/
- abc
- aBc
- *** Failers
- Ab
- abC
- aBC
-
-/(?<=a(?i)b)(\w\w)c/
- abxxc
- aBxxc
- *** Failers
- Abxxc
- ABxxc
- abxxC
-
-/(?:(a)|b)(?(1)A|B)/
- aA
- bB
- *** Failers
- aB
- bA
-
-/^(a)?(?(1)a|b)+$/
- aa
- b
- bb
- *** Failers
- ab
-
-/^(?(?=abc)\w{3}:|\d\d)$/
- abc:
- 12
- *** Failers
- 123
- xyz
-
-/^(?(?!abc)\d\d|\w{3}:)$/
- abc:
- 12
- *** Failers
- 123
- xyz
-
-/(?(?<=foo)bar|cat)/
- foobar
- cat
- fcat
- focat
- *** Failers
- foocat
-
-/(?(?<!foo)cat|bar)/
- foobar
- cat
- fcat
- focat
- *** Failers
- foocat
-
-/( \( )? [^()]+ (?(1) \) |) /x
- abcd
- (abcd)
- the quick (abcd) fox
- (abcd
-
-/( \( )? [^()]+ (?(1) \) ) /x
- abcd
- (abcd)
- the quick (abcd) fox
- (abcd
-
-/^(?(2)a|(1)(2))+$/
- 12
- 12a
- 12aa
- *** Failers
- 1234
-
-/((?i)blah)\s+\1/
- blah blah
- BLAH BLAH
- Blah Blah
- blaH blaH
- *** Failers
- blah BLAH
- Blah blah
- blaH blah
-
-/((?i)blah)\s+(?i:\1)/
- blah blah
- BLAH BLAH
- Blah Blah
- blaH blaH
- blah BLAH
- Blah blah
- blaH blah
-
-/(?>a*)*/
- a
- aa
- aaaa
-
-/(abc|)+/
- abc
- abcabc
- abcabcabc
- xyz
-
-/([a]*)*/
- a
- aaaaa
-
-/([ab]*)*/
- a
- b
- ababab
- aaaabcde
- bbbb
-
-/([^a]*)*/
- b
- bbbb
- aaa
-
-/([^ab]*)*/
- cccc
- abab
-
-/([a]*?)*/
- a
- aaaa
-
-/([ab]*?)*/
- a
- b
- abab
- baba
-
-/([^a]*?)*/
- b
- bbbb
- aaa
-
-/([^ab]*?)*/
- c
- cccc
- baba
-
-/(?>a*)*/
- a
- aaabcde
-
-/((?>a*))*/
- aaaaa
- aabbaa
-
-/((?>a*?))*/
- aaaaa
- aabbaa
-
-/(?(?=[^a-z]+[a-z]) \d{2}-[a-z]{3}-\d{2} | \d{2}-\d{2}-\d{2} ) /x
- 12-sep-98
- 12-09-98
- *** Failers
- sep-12-98
-
-/(?<=(foo))bar\1/
- foobarfoo
- foobarfootling
- *** Failers
- foobar
- barfoo
-
-/(?i:saturday|sunday)/
- saturday
- sunday
- Saturday
- Sunday
- SATURDAY
- SUNDAY
- SunDay
-
-/(a(?i)bc|BB)x/
- abcx
- aBCx
- bbx
- BBx
- *** Failers
- abcX
- aBCX
- bbX
- BBX
-
-/^([ab](?i)[cd]|[ef])/
- ac
- aC
- bD
- elephant
- Europe
- frog
- France
- *** Failers
- Africa
-
-/^(ab|a(?i)[b-c](?m-i)d|x(?i)y|z)/
- ab
- aBd
- xy
- xY
- zebra
- Zambesi
- *** Failers
- aCD
- XY
-
-/(?<=foo\n)^bar/m
- foo\nbar
- *** Failers
- bar
- baz\nbar
-
-/(?<=(?<!foo)bar)baz/
- barbaz
- barbarbaz
- koobarbaz
- *** Failers
- baz
- foobarbaz
-
-/The case of aaaaaa is missed out below because I think Perl 5.005_02 gets/
-/it wrong; it sets $1 to aaa rather than aa. Compare the following test,/
-/where it does set $1 to aa when matching aaaaaa./
-
-/^(a\1?){4}$/
- a
- aa
- aaa
- aaaa
- aaaaa
- aaaaaaa
- aaaaaaaa
- aaaaaaaaa
- aaaaaaaaaa
- aaaaaaaaaaa
- aaaaaaaaaaaa
- aaaaaaaaaaaaa
- aaaaaaaaaaaaaa
- aaaaaaaaaaaaaaa
- aaaaaaaaaaaaaaaa
-
-/^(a\1?)(a\1?)(a\2?)(a\3?)$/
- a
- aa
- aaa
- aaaa
- aaaaa
- aaaaaa
- aaaaaaa
- aaaaaaaa
- aaaaaaaaa
- aaaaaaaaaa
- aaaaaaaaaaa
- aaaaaaaaaaaa
- aaaaaaaaaaaaa
- aaaaaaaaaaaaaa
- aaaaaaaaaaaaaaa
- aaaaaaaaaaaaaaaa
-
-/The following tests are taken from the Perl 5.005 test suite; some of them/
-/are compatible with 5.004, but I'd rather not have to sort them out./
-
-/abc/
- abc
- xabcy
- ababc
- *** Failers
- xbc
- axc
- abx
-
-/ab*c/
- abc
-
-/ab*bc/
- abc
- abbc
- abbbbc
-
-/.{1}/
- abbbbc
-
-/.{3,4}/
- abbbbc
-
-/ab{0,}bc/
- abbbbc
-
-/ab+bc/
- abbc
- *** Failers
- abc
- abq
-
-/ab{1,}bc/
-
-/ab+bc/
- abbbbc
-
-/ab{1,}bc/
- abbbbc
-
-/ab{1,3}bc/
- abbbbc
-
-/ab{3,4}bc/
- abbbbc
-
-/ab{4,5}bc/
- *** Failers
- abq
- abbbbc
-
-/ab?bc/
- abbc
- abc
-
-/ab{0,1}bc/
- abc
-
-/ab?bc/
-
-/ab?c/
- abc
-
-/ab{0,1}c/
- abc
-
-/^abc$/
- abc
- *** Failers
- abbbbc
- abcc
-
-/^abc/
- abcc
-
-/^abc$/
-
-/abc$/
- aabc
- *** Failers
- aabc
- aabcd
-
-/^/
- abc
-
-/$/
- abc
-
-/a.c/
- abc
- axc
-
-/a.*c/
- axyzc
-
-/a[bc]d/
- abd
- *** Failers
- axyzd
- abc
-
-/a[b-d]e/
- ace
-
-/a[b-d]/
- aac
-
-/a[-b]/
- a-
-
-/a[b-]/
- a-
-
-/a]/
- a]
-
-/a[]]b/
- a]b
-
-/a[^bc]d/
- aed
- *** Failers
- abd
- abd
-
-/a[^-b]c/
- adc
-
-/a[^]b]c/
- adc
- *** Failers
- a-c
- a]c
-
-/\ba\b/
- a-
- -a
- -a-
-
-/\by\b/
- *** Failers
- xy
- yz
- xyz
-
-/\Ba\B/
- *** Failers
- a-
- -a
- -a-
-
-/\By\b/
- xy
-
-/\by\B/
- yz
-
-/\By\B/
- xyz
-
-/\w/
- a
-
-/\W/
- -
- *** Failers
- -
- a
-
-/a\sb/
- a b
-
-/a\Sb/
- a-b
- *** Failers
- a-b
- a b
-
-/\d/
- 1
-
-/\D/
- -
- *** Failers
- -
- 1
-
-/[\w]/
- a
-
-/[\W]/
- -
- *** Failers
- -
- a
-
-/a[\s]b/
- a b
-
-/a[\S]b/
- a-b
- *** Failers
- a-b
- a b
-
-/[\d]/
- 1
-
-/[\D]/
- -
- *** Failers
- -
- 1
-
-/ab|cd/
- abc
- abcd
-
-/()ef/
- def
-
-/$b/
-
-/a\(b/
- a(b
-
-/a\(*b/
- ab
- a((b
-
-/a\\b/
- a\b
-
-/((a))/
- abc
-
-/(a)b(c)/
- abc
-
-/a+b+c/
- aabbabc
-
-/a{1,}b{1,}c/
- aabbabc
-
-/a.+?c/
- abcabc
-
-/(a+|b)*/
- ab
-
-/(a+|b){0,}/
- ab
-
-/(a+|b)+/
- ab
-
-/(a+|b){1,}/
- ab
-
-/(a+|b)?/
- ab
-
-/(a+|b){0,1}/
- ab
-
-/[^ab]*/
- cde
-
-/abc/
- *** Failers
- b
-
-
-/a*/
-
-
-/([abc])*d/
- abbbcd
-
-/([abc])*bcd/
- abcd
-
-/a|b|c|d|e/
- e
-
-/(a|b|c|d|e)f/
- ef
-
-/abcd*efg/
- abcdefg
-
-/ab*/
- xabyabbbz
- xayabbbz
-
-/(ab|cd)e/
- abcde
-
-/[abhgefdc]ij/
- hij
-
-/^(ab|cd)e/
-
-/(abc|)ef/
- abcdef
-
-/(a|b)c*d/
- abcd
-
-/(ab|ab*)bc/
- abc
-
-/a([bc]*)c*/
- abc
-
-/a([bc]*)(c*d)/
- abcd
-
-/a([bc]+)(c*d)/
- abcd
-
-/a([bc]*)(c+d)/
- abcd
-
-/a[bcd]*dcdcde/
- adcdcde
-
-/a[bcd]+dcdcde/
- *** Failers
- abcde
- adcdcde
-
-/(ab|a)b*c/
- abc
-
-/((a)(b)c)(d)/
- abcd
-
-/[a-zA-Z_][a-zA-Z0-9_]*/
- alpha
-
-/^a(bc+|b[eh])g|.h$/
- abh
-
-/(bc+d$|ef*g.|h?i(j|k))/
- effgz
- ij
- reffgz
- *** Failers
- effg
- bcdd
-
-/((((((((((a))))))))))/
- a
-
-/((((((((((a))))))))))\10/
- aa
-
-/(((((((((a)))))))))/
- a
-
-/multiple words of text/
- *** Failers
- aa
- uh-uh
-
-/multiple words/
- multiple words, yeah
-
-/(.*)c(.*)/
- abcde
-
-/\((.*), (.*)\)/
- (a, b)
-
-/[k]/
-
-/abcd/
- abcd
-
-/a(bc)d/
- abcd
-
-/a[-]?c/
- ac
-
-/(abc)\1/
- abcabc
-
-/([a-c]*)\1/
- abcabc
-
-/(a)|\1/
- a
- *** Failers
- ab
- x
-
-/(([a-c])b*?\2)*/
- ababbbcbc
-
-/(([a-c])b*?\2){3}/
- ababbbcbc
-
-/((\3|b)\2(a)x)+/
- aaaxabaxbaaxbbax
-
-/((\3|b)\2(a)){2,}/
- bbaababbabaaaaabbaaaabba
-
-/abc/i
- ABC
- XABCY
- ABABC
- *** Failers
- aaxabxbaxbbx
- XBC
- AXC
- ABX
-
-/ab*c/i
- ABC
-
-/ab*bc/i
- ABC
- ABBC
-
-/ab*?bc/i
- ABBBBC
-
-/ab{0,}?bc/i
- ABBBBC
-
-/ab+?bc/i
- ABBC
-
-/ab+bc/i
- *** Failers
- ABC
- ABQ
-
-/ab{1,}bc/i
-
-/ab+bc/i
- ABBBBC
-
-/ab{1,}?bc/i
- ABBBBC
-
-/ab{1,3}?bc/i
- ABBBBC
-
-/ab{3,4}?bc/i
- ABBBBC
-
-/ab{4,5}?bc/i
- *** Failers
- ABQ
- ABBBBC
-
-/ab??bc/i
- ABBC
- ABC
-
-/ab{0,1}?bc/i
- ABC
-
-/ab??bc/i
-
-/ab??c/i
- ABC
-
-/ab{0,1}?c/i
- ABC
-
-/^abc$/i
- ABC
- *** Failers
- ABBBBC
- ABCC
-
-/^abc/i
- ABCC
-
-/^abc$/i
-
-/abc$/i
- AABC
-
-/^/i
- ABC
-
-/$/i
- ABC
-
-/a.c/i
- ABC
- AXC
-
-/a.*?c/i
- AXYZC
-
-/a.*c/i
- *** Failers
- AABC
- AXYZD
-
-/a[bc]d/i
- ABD
-
-/a[b-d]e/i
- ACE
- *** Failers
- ABC
- ABD
-
-/a[b-d]/i
- AAC
-
-/a[-b]/i
- A-
-
-/a[b-]/i
- A-
-
-/a]/i
- A]
-
-/a[]]b/i
- A]B
-
-/a[^bc]d/i
- AED
-
-/a[^-b]c/i
- ADC
- *** Failers
- ABD
- A-C
-
-/a[^]b]c/i
- ADC
-
-/ab|cd/i
- ABC
- ABCD
-
-/()ef/i
- DEF
-
-/$b/i
- *** Failers
- A]C
- B
-
-/a\(b/i
- A(B
-
-/a\(*b/i
- AB
- A((B
-
-/a\\b/i
- A\B
-
-/((a))/i
- ABC
-
-/(a)b(c)/i
- ABC
-
-/a+b+c/i
- AABBABC
-
-/a{1,}b{1,}c/i
- AABBABC
-
-/a.+?c/i
- ABCABC
-
-/a.*?c/i
- ABCABC
-
-/a.{0,5}?c/i
- ABCABC
-
-/(a+|b)*/i
- AB
-
-/(a+|b){0,}/i
- AB
-
-/(a+|b)+/i
- AB
-
-/(a+|b){1,}/i
- AB
-
-/(a+|b)?/i
- AB
-
-/(a+|b){0,1}/i
- AB
-
-/(a+|b){0,1}?/i
- AB
-
-/[^ab]*/i
- CDE
-
-/abc/i
-
-/a*/i
-
-
-/([abc])*d/i
- ABBBCD
-
-/([abc])*bcd/i
- ABCD
-
-/a|b|c|d|e/i
- E
-
-/(a|b|c|d|e)f/i
- EF
-
-/abcd*efg/i
- ABCDEFG
-
-/ab*/i
- XABYABBBZ
- XAYABBBZ
-
-/(ab|cd)e/i
- ABCDE
-
-/[abhgefdc]ij/i
- HIJ
-
-/^(ab|cd)e/i
- ABCDE
-
-/(abc|)ef/i
- ABCDEF
-
-/(a|b)c*d/i
- ABCD
-
-/(ab|ab*)bc/i
- ABC
-
-/a([bc]*)c*/i
- ABC
-
-/a([bc]*)(c*d)/i
- ABCD
-
-/a([bc]+)(c*d)/i
- ABCD
-
-/a([bc]*)(c+d)/i
- ABCD
-
-/a[bcd]*dcdcde/i
- ADCDCDE
-
-/a[bcd]+dcdcde/i
-
-/(ab|a)b*c/i
- ABC
-
-/((a)(b)c)(d)/i
- ABCD
-
-/[a-zA-Z_][a-zA-Z0-9_]*/i
- ALPHA
-
-/^a(bc+|b[eh])g|.h$/i
- ABH
-
-/(bc+d$|ef*g.|h?i(j|k))/i
- EFFGZ
- IJ
- REFFGZ
- *** Failers
- ADCDCDE
- EFFG
- BCDD
-
-/((((((((((a))))))))))/i
- A
-
-/((((((((((a))))))))))\10/i
- AA
-
-/(((((((((a)))))))))/i
- A
-
-/(?:(?:(?:(?:(?:(?:(?:(?:(?:(a))))))))))/i
- A
-
-/(?:(?:(?:(?:(?:(?:(?:(?:(?:(a|b|c))))))))))/i
- C
-
-/multiple words of text/i
- *** Failers
- AA
- UH-UH
-
-/multiple words/i
- MULTIPLE WORDS, YEAH
-
-/(.*)c(.*)/i
- ABCDE
-
-/\((.*), (.*)\)/i
- (A, B)
-
-/[k]/i
-
-/abcd/i
- ABCD
-
-/a(bc)d/i
- ABCD
-
-/a[-]?c/i
- AC
-
-/(abc)\1/i
- ABCABC
-
-/([a-c]*)\1/i
- ABCABC
-
-/a(?!b)./
- abad
-
-/a(?=d)./
- abad
-
-/a(?=c|d)./
- abad
-
-/a(?:b|c|d)(.)/
- ace
-
-/a(?:b|c|d)*(.)/
- ace
-
-/a(?:b|c|d)+?(.)/
- ace
- acdbcdbe
-
-/a(?:b|c|d)+(.)/
- acdbcdbe
-
-/a(?:b|c|d){2}(.)/
- acdbcdbe
-
-/a(?:b|c|d){4,5}(.)/
- acdbcdbe
-
-/a(?:b|c|d){4,5}?(.)/
- acdbcdbe
-
-/((foo)|(bar))*/
- foobar
-
-/a(?:b|c|d){6,7}(.)/
- acdbcdbe
-
-/a(?:b|c|d){6,7}?(.)/
- acdbcdbe
-
-/a(?:b|c|d){5,6}(.)/
- acdbcdbe
-
-/a(?:b|c|d){5,6}?(.)/
- acdbcdbe
-
-/a(?:b|c|d){5,7}(.)/
- acdbcdbe
-
-/a(?:b|c|d){5,7}?(.)/
- acdbcdbe
-
-/a(?:b|(c|e){1,2}?|d)+?(.)/
- ace
-
-/^(.+)?B/
- AB
-
-/^([^a-z])|(\^)$/
- .
-
-/^[<>]&/
- <&OUT
-
-/^(a\1?){4}$/
- aaaaaaaaaa
- *** Failers
- AB
- aaaaaaaaa
- aaaaaaaaaaa
-
-/^(a(?(1)\1)){4}$/
- aaaaaaaaaa
- *** Failers
- aaaaaaaaa
- aaaaaaaaaaa
-
-/(?:(f)(o)(o)|(b)(a)(r))*/
- foobar
-
-/(?<=a)b/
- ab
- *** Failers
- cb
- b
-
-/(?<!c)b/
- ab
- b
- b
-
-/(?:..)*a/
- aba
-
-/(?:..)*?a/
- aba
-
-/^(?:b|a(?=(.)))*\1/
- abc
-
-/^(){3,5}/
- abc
-
-/^(a+)*ax/
- aax
-
-/^((a|b)+)*ax/
- aax
-
-/^((a|bc)+)*ax/
- aax
-
-/(a|x)*ab/
- cab
-
-/(a)*ab/
- cab
-
-/(?:(?i)a)b/
- ab
-
-/((?i)a)b/
- ab
-
-/(?:(?i)a)b/
- Ab
-
-/((?i)a)b/
- Ab
-
-/(?:(?i)a)b/
- *** Failers
- cb
- aB
-
-/((?i)a)b/
-
-/(?i:a)b/
- ab
-
-/((?i:a))b/
- ab
-
-/(?i:a)b/
- Ab
-
-/((?i:a))b/
- Ab
-
-/(?i:a)b/
- *** Failers
- aB
- aB
-
-/((?i:a))b/
-
-/(?:(?-i)a)b/i
- ab
-
-/((?-i)a)b/i
- ab
-
-/(?:(?-i)a)b/i
- aB
-
-/((?-i)a)b/i
- aB
-
-/(?:(?-i)a)b/i
- *** Failers
- aB
- Ab
-
-/((?-i)a)b/i
-
-/(?:(?-i)a)b/i
- aB
-
-/((?-i)a)b/i
- aB
-
-/(?:(?-i)a)b/i
- *** Failers
- Ab
- AB
-
-/((?-i)a)b/i
-
-/(?-i:a)b/i
- ab
-
-/((?-i:a))b/i
- ab
-
-/(?-i:a)b/i
- aB
-
-/((?-i:a))b/i
- aB
-
-/(?-i:a)b/i
- *** Failers
- AB
- Ab
-
-/((?-i:a))b/i
-
-/(?-i:a)b/i
- aB
-
-/((?-i:a))b/i
- aB
-
-/(?-i:a)b/i
- *** Failers
- Ab
- AB
-
-/((?-i:a))b/i
-
-/((?-i:a.))b/i
- *** Failers
- AB
- a\nB
-
-/((?s-i:a.))b/i
- a\nB
-
-/(?:c|d)(?:)(?:a(?:)(?:b)(?:b(?:))(?:b(?:)(?:b)))/
- cabbbb
-
-/(?:c|d)(?:)(?:aaaaaaaa(?:)(?:bbbbbbbb)(?:bbbbbbbb(?:))(?:bbbbbbbb(?:)(?:bbbbbbbb)))/
- caaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
-
-/(ab)\d\1/i
- Ab4ab
- ab4Ab
-
-/foo\w*\d{4}baz/
- foobar1234baz
-
-/x(~~)*(?:(?:F)?)?/
- x~~
-
-/^a(?#xxx){3}c/
- aaac
-
-/^a (?#xxx) (?#yyy) {3}c/x
- aaac
-
-/(?<![cd])b/
- *** Failers
- B\nB
- dbcb
-
-/(?<![cd])[ab]/
- dbaacb
-
-/(?<!(c|d))b/
-
-/(?<!(c|d))[ab]/
- dbaacb
-
-/(?<!cd)[ab]/
- cdaccb
-
-/^(?:a?b?)*$/
- *** Failers
- dbcb
- a--
-
-/((?s)^a(.))((?m)^b$)/
- a\nb\nc\n
-
-/((?m)^b$)/
- a\nb\nc\n
-
-/(?m)^b/
- a\nb\n
-
-/(?m)^(b)/
- a\nb\n
-
-/((?m)^b)/
- a\nb\n
-
-/\n((?m)^b)/
- a\nb\n
-
-/((?s).)c(?!.)/
- a\nb\nc\n
- a\nb\nc\n
-
-/((?s)b.)c(?!.)/
- a\nb\nc\n
- a\nb\nc\n
-
-/^b/
-
-/()^b/
- *** Failers
- a\nb\nc\n
- a\nb\nc\n
-
-/((?m)^b)/
- a\nb\nc\n
-
-/(?(1)a|b)/
-
-/(?(1)b|a)/
- a
-
-/(x)?(?(1)a|b)/
- *** Failers
- a
- a
-
-/(x)?(?(1)b|a)/
- a
-
-/()?(?(1)b|a)/
- a
-
-/()(?(1)b|a)/
-
-/()?(?(1)a|b)/
- a
-
-/^(\()?blah(?(1)(\)))$/
- (blah)
- blah
- *** Failers
- a
- blah)
- (blah
-
-/^(\(+)?blah(?(1)(\)))$/
- (blah)
- blah
- *** Failers
- blah)
- (blah
-
-/(?(?!a)a|b)/
-
-/(?(?!a)b|a)/
- a
-
-/(?(?=a)b|a)/
- *** Failers
- a
- a
-
-/(?(?=a)a|b)/
- a
-
-/(?=(a+?))(\1ab)/
- aaab
-
-/^(?=(a+?))\1ab/
-
-/(\w+:)+/
- one:
-
-/$(?<=^(a))/
- a
-
-/(?=(a+?))(\1ab)/
- aaab
-
-/^(?=(a+?))\1ab/
- *** Failers
- aaab
- aaab
-
-/([\w:]+::)?(\w+)$/
- abcd
- xy:z:::abcd
-
-/^[^bcd]*(c+)/
- aexycd
-
-/(a*)b+/
- caab
-
-/([\w:]+::)?(\w+)$/
- abcd
- xy:z:::abcd
- *** Failers
- abcd:
- abcd:
-
-/^[^bcd]*(c+)/
- aexycd
-
-/(>a+)ab/
-
-/(?>a+)b/
- aaab
-
-/([[:]+)/
- a:[b]:
-
-/([[=]+)/
- a=[b]=
-
-/([[.]+)/
- a.[b].
-
-/((?>a+)b)/
- aaab
-
-/(?>(a+))b/
- aaab
-
-/((?>[^()]+)|\([^()]*\))+/
- ((abc(ade)ufh()()x
-
-/a\Z/
- *** Failers
- aaab
- a\nb\n
-
-/b\Z/
- a\nb\n
-
-/b\z/
-
-/b\Z/
- a\nb
-
-/b\z/
- a\nb
- *** Failers
-
-/^(?>(?(1)\.|())[^\W_](?>[a-z0-9-]*[^\W_])?)+$/
- a
- abc
- a-b
- 0-9
- a.b
- 5.6.7
- the.quick.brown.fox
- a100.b200.300c
- 12-ab.1245
- ***Failers
- \
- .a
- -a
- a-
- a.
- a_b
- a.-
- a..
- ab..bc
- the.quick.brown.fox-
- the.quick.brown.fox.
- the.quick.brown.fox_
- the.quick.brown.fox+
-
-/(?>.*)(?<=(abcd|wxyz))/
- alphabetabcd
- endingwxyz
- *** Failers
- a rather long string that doesn't end with one of them
-
-/word (?>(?:(?!otherword)[a-zA-Z0-9]+ ){0,30})otherword/
- word cat dog elephant mussel cow horse canary baboon snake shark otherword
- word cat dog elephant mussel cow horse canary baboon snake shark
-
-/word (?>[a-zA-Z0-9]+ ){0,30}otherword/
- word cat dog elephant mussel cow horse canary baboon snake shark the quick brown fox and the lazy dog and several other words getting close to thirty by now I hope
-
-/(?<=\d{3}(?!999))foo/
- 999foo
- 123999foo
- *** Failers
- 123abcfoo
-
-/(?<=(?!...999)\d{3})foo/
- 999foo
- 123999foo
- *** Failers
- 123abcfoo
-
-/(?<=\d{3}(?!999)...)foo/
- 123abcfoo
- 123456foo
- *** Failers
- 123999foo
-
-/(?<=\d{3}...)(?<!999)foo/
- 123abcfoo
- 123456foo
- *** Failers
- 123999foo
-
-/<a[\s]+href[\s]*=[\s]* # find <a href=
- ([\"\'])? # find single or double quote
- (?(1) (.*?)\1 | ([^\s]+)) # if quote found, match up to next matching
- # quote, otherwise match up to next space
-/isx
- <a href=abcd xyz
- <a href=\"abcd xyz pqr\" cats
- <a href=\'abcd xyz pqr\' cats
-
-/<a\s+href\s*=\s* # find <a href=
- (["'])? # find single or double quote
- (?(1) (.*?)\1 | (\S+)) # if quote found, match up to next matching
- # quote, otherwise match up to next space
-/isx
- <a href=abcd xyz
- <a href=\"abcd xyz pqr\" cats
- <a href = \'abcd xyz pqr\' cats
-
-/<a\s+href(?>\s*)=(?>\s*) # find <a href=
- (["'])? # find single or double quote
- (?(1) (.*?)\1 | (\S+)) # if quote found, match up to next matching
- # quote, otherwise match up to next space
-/isx
- <a href=abcd xyz
- <a href=\"abcd xyz pqr\" cats
- <a href = \'abcd xyz pqr\' cats
-
-/((Z)+|A)*/
- ZABCDEFG
-
-/(Z()|A)*/
- ZABCDEFG
-
-/(Z(())|A)*/
- ZABCDEFG
-
-/((?>Z)+|A)*/
- ZABCDEFG
-
-/((?>)+|A)*/
- ZABCDEFG
-
-/a*/g
- abbab
-
-/^[a-\d]/
- abcde
- -things
- 0digit
- *** Failers
- bcdef
-
-/^[\d-a]/
- abcde
- -things
- 0digit
- *** Failers
- bcdef
-
-/[[:space:]]+/
- > \x09\x0a\x0c\x0d\x0b<
-
-/[[:blank:]]+/
- > \x09\x0a\x0c\x0d\x0b<
-
-/[\s]+/
- > \x09\x0a\x0c\x0d\x0b<
-
-/\s+/
- > \x09\x0a\x0c\x0d\x0b<
-
-/a b/x
- ab
-
-/(?!\A)x/m
- a\nxb\n
-
-/(?!^)x/m
- a\nxb\n
-
-/abc\Qabc\Eabc/
- abcabcabc
-
-/abc\Q(*+|\Eabc/
- abc(*+|abc
-
-/ abc\Q abc\Eabc/x
- abc abcabc
- *** Failers
- abcabcabc
-
-/abc#comment
- \Q#not comment
- literal\E/x
- abc#not comment\n literal
-
-/abc#comment
- \Q#not comment
- literal/x
- abc#not comment\n literal
-
-/abc#comment
- \Q#not comment
- literal\E #more comment
- /x
- abc#not comment\n literal
-
-/abc#comment
- \Q#not comment
- literal\E #more comment/x
- abc#not comment\n literal
-
-/\Qabc\$xyz\E/
- abc\\\$xyz
-
-/\Qabc\E\$\Qxyz\E/
- abc\$xyz
-
-/\Gabc/
- abc
- *** Failers
- xyzabc
-
-/\Gabc./g
- abc1abc2xyzabc3
-
-/abc./g
- abc1abc2xyzabc3
-
-/a(?x: b c )d/
- XabcdY
- *** Failers
- Xa b c d Y
-
-/((?x)x y z | a b c)/
- XabcY
- AxyzB
-
-/(?i)AB(?-i)C/
- XabCY
- *** Failers
- XabcY
-
-/((?i)AB(?-i)C|D)E/
- abCE
- DE
- *** Failers
- abcE
- abCe
- dE
- De
-
-/(.*)\d+\1/
- abc123abc
- abc123bc
-
-/(.*)\d+\1/s
- abc123abc
- abc123bc
-
-/((.*))\d+\1/
- abc123abc
- abc123bc
-
-/-- This tests for an IPv6 address in the form where it can have up to --/
-/-- eight components, one and only one of which is empty. This must be --/
-/-- an internal component. --/
-
-/^(?!:) # colon disallowed at start
- (?: # start of item
- (?: [0-9a-f]{1,4} | # 1-4 hex digits or
- (?(1)0 | () ) ) # if null previously matched, fail; else null
- : # followed by colon
- ){1,7} # end item; 1-7 of them required
- [0-9a-f]{1,4} $ # final hex number at end of string
- (?(1)|.) # check that there was an empty component
- /xi
- a123::a123
- a123:b342::abcd
- a123:b342::324e:abcd
- a123:ddde:b342::324e:abcd
- a123:ddde:b342::324e:dcba:abcd
- a123:ddde:9999:b342::324e:dcba:abcd
- *** Failers
- 1:2:3:4:5:6:7:8
- a123:bce:ddde:9999:b342::324e:dcba:abcd
- a123::9999:b342::324e:dcba:abcd
- abcde:2:3:4:5:6:7:8
- ::1
- abcd:fee0:123::
- :1
- 1:
-
-/ End of testinput1 /
diff --git a/ext/pcre/pcrelib/testdata/testinput2 b/ext/pcre/pcrelib/testdata/testinput2
deleted file mode 100644
index 2dd498a713..0000000000
--- a/ext/pcre/pcrelib/testdata/testinput2
+++ /dev/null
@@ -1,1162 +0,0 @@
-/(a)b|/
-
-/abc/
- abc
- defabc
- \Aabc
- *** Failers
- \Adefabc
- ABC
-
-/^abc/
- abc
- \Aabc
- *** Failers
- defabc
- \Adefabc
-
-/a+bc/
-
-/a*bc/
-
-/a{3}bc/
-
-/(abc|a+z)/
-
-/^abc$/
- abc
- *** Failers
- def\nabc
-
-/ab\gdef/X
-
-/(?X)ab\gdef/X
-
-/x{5,4}/
-
-/z{65536}/
-
-/[abcd/
-
-/[\B]/
-
-/[z-a]/
-
-/^*/
-
-/(abc/
-
-/(?# abc/
-
-/(?z)abc/
-
-/.*b/
-
-/.*?b/
-
-/cat|dog|elephant/
- this sentence eventually mentions a cat
- this sentences rambles on and on for a while and then reaches elephant
-
-/cat|dog|elephant/S
- this sentence eventually mentions a cat
- this sentences rambles on and on for a while and then reaches elephant
-
-/cat|dog|elephant/iS
- this sentence eventually mentions a CAT cat
- this sentences rambles on and on for a while to elephant ElePhant
-
-/a|[bcd]/S
-
-/(a|[^\dZ])/S
-
-/(a|b)*[\s]/S
-
-/(ab\2)/
-
-/{4,5}abc/
-
-/(a)(b)(c)\2/
- abcb
- \O0abcb
- \O3abcb
- \O6abcb
- \O9abcb
- \O12abcb
-
-/(a)bc|(a)(b)\2/
- abc
- \O0abc
- \O3abc
- \O6abc
- aba
- \O0aba
- \O3aba
- \O6aba
- \O9aba
- \O12aba
-
-/abc$/E
- abc
- *** Failers
- abc\n
- abc\ndef
-
-/(a)(b)(c)(d)(e)\6/
-
-/the quick brown fox/
- the quick brown fox
- this is a line with the quick brown fox
-
-/the quick brown fox/A
- the quick brown fox
- *** Failers
- this is a line with the quick brown fox
-
-/ab(?z)cd/
-
-/^abc|def/
- abcdef
- abcdef\B
-
-/.*((abc)$|(def))/
- defabc
- \Zdefabc
-
-/abc/P
- abc
- *** Failers
-
-/^abc|def/P
- abcdef
- abcdef\B
-
-/.*((abc)$|(def))/P
- defabc
- \Zdefabc
-
-/the quick brown fox/P
- the quick brown fox
- *** Failers
- The Quick Brown Fox
-
-/the quick brown fox/Pi
- the quick brown fox
- The Quick Brown Fox
-
-/abc.def/P
- *** Failers
- abc\ndef
-
-/abc$/P
- abc
- abc\n
-
-/(abc)\2/P
-
-/(abc\1)/P
- abc
-
-/)/
-
-/a[]b/
-
-/[^aeiou ]{3,}/
- co-processors, and for
-
-/<.*>/
- abc<def>ghi<klm>nop
-
-/<.*?>/
- abc<def>ghi<klm>nop
-
-/<.*>/U
- abc<def>ghi<klm>nop
-
-/(?U)<.*>/
- abc<def>ghi<klm>nop
-
-/<.*?>/U
- abc<def>ghi<klm>nop
-
-/={3,}/U
- abc========def
-
-/(?U)={3,}?/
- abc========def
-
-/(?<!bar|cattle)foo/
- foo
- catfoo
- *** Failers
- the barfoo
- and cattlefoo
-
-/(?<=a+)b/
-
-/(?<=aaa|b{0,3})b/
-
-/(?<!(foo)a\1)bar/
-
-/(?i)abc/
-
-/(a|(?m)a)/
-
-/(?i)^1234/
-
-/(^b|(?i)^d)/
-
-/(?s).*/
-
-/[abcd]/S
-
-/(?i)[abcd]/S
-
-/(?m)[xy]|(b|c)/S
-
-/(^a|^b)/m
-
-/(?i)(^a|^b)/m
-
-/(a)(?(1)a|b|c)/
-
-/(?(?=a)a|b|c)/
-
-/(?(1a)/
-
-/(?(?i))/
-
-/(?(abc))/
-
-/(?(?<ab))/
-
-/((?s)blah)\s+\1/
-
-/((?i)blah)\s+\1/
-
-/((?i)b)/DS
-
-/(a*b|(?i:c*(?-i)d))/S
-
-/a$/
- a
- a\n
- *** Failers
- \Za
- \Za\n
-
-/a$/m
- a
- a\n
- \Za\n
- *** Failers
- \Za
-
-/\Aabc/m
-
-/^abc/m
-
-/^((a+)(?U)([ab]+)(?-U)([bc]+)(\w*))/
- aaaaabbbbbcccccdef
-
-/(?<=foo)[ab]/S
-
-/(?<!foo)(alpha|omega)/S
-
-/(?!alphabet)[ab]/S
-
-/(?<=foo\n)^bar/m
-
-/(?>^abc)/m
- abc
- def\nabc
- *** Failers
- defabc
-
-/(?<=ab(c+)d)ef/
-
-/(?<=ab(?<=c+)d)ef/
-
-/(?<=ab(c|de)f)g/
-
-/The next three are in testinput2 because they have variable length branches/
-
-/(?<=bullock|donkey)-cart/
- the bullock-cart
- a donkey-cart race
- *** Failers
- cart
- horse-and-cart
-
-/(?<=ab(?i)x|y|z)/
-
-/(?>.*)(?<=(abcd)|(xyz))/
- alphabetabcd
- endingxyz
-
-/(?<=ab(?i)x(?-i)y|(?i)z|b)ZZ/
- abxyZZ
- abXyZZ
- ZZZ
- zZZ
- bZZ
- BZZ
- *** Failers
- ZZ
- abXYZZ
- zzz
- bzz
-
-/(?<!(foo)a)bar/
- bar
- foobbar
- *** Failers
- fooabar
-
-/This one is here because Perl 5.005_02 doesn't fail it/
-
-/^(a)?(?(1)a|b)+$/
- *** Failers
- a
-
-/This one is here because I think Perl 5.005_02 gets the setting of $1 wrong/
-
-/^(a\1?){4}$/
- aaaaaa
-
-/These are syntax tests from Perl 5.005/
-
-/a[b-a]/
-
-/a[]b/
-
-/a[/
-
-/*a/
-
-/(*)b/
-
-/abc)/
-
-/(abc/
-
-/a**/
-
-/)(/
-
-/\1/
-
-/\2/
-
-/(a)|\2/
-
-/a[b-a]/i
-
-/a[]b/i
-
-/a[/i
-
-/*a/i
-
-/(*)b/i
-
-/abc)/i
-
-/(abc/i
-
-/a**/i
-
-/)(/i
-
-/:(?:/
-
-/(?<%)b/
-
-/a(?{)b/
-
-/a(?{{})b/
-
-/a(?{}})b/
-
-/a(?{"{"})b/
-
-/a(?{"{"}})b/
-
-/(?(1?)a|b)/
-
-/(?(1)a|b|c)/
-
-/[a[:xyz:/
-
-/(?<=x+)y/
-
-/a{37,17}/
-
-/abc/\
-
-/abc/\P
-
-/abc/\i
-
-/(a)bc(d)/
- abcd
- abcd\C2
- abcd\C5
-
-/(.{20})/
- abcdefghijklmnopqrstuvwxyz
- abcdefghijklmnopqrstuvwxyz\C1
- abcdefghijklmnopqrstuvwxyz\G1
-
-/(.{15})/
- abcdefghijklmnopqrstuvwxyz
- abcdefghijklmnopqrstuvwxyz\C1\G1
-
-/(.{16})/
- abcdefghijklmnopqrstuvwxyz
- abcdefghijklmnopqrstuvwxyz\C1\G1\L
-
-/^(a|(bc))de(f)/
- adef\G1\G2\G3\G4\L
- bcdef\G1\G2\G3\G4\L
- adefghijk\C0
-
-/^abc\00def/
- abc\00def\L\C0
-
-/word ((?:[a-zA-Z0-9]+ )((?:[a-zA-Z0-9]+ )((?:[a-zA-Z0-9]+ )((?:[a-zA-Z0-9]+
-)((?:[a-zA-Z0-9]+ )((?:[a-zA-Z0-9]+ )((?:[a-zA-Z0-9]+ )((?:[a-zA-Z0-9]+
-)?)?)?)?)?)?)?)?)?otherword/M
-
-/.*X/D
-
-/.*X/Ds
-
-/(.*X|^B)/D
-
-/(.*X|^B)/Ds
-
-/(?s)(.*X|^B)/D
-
-/(?s:.*X|^B)/D
-
-/\Biss\B/+
- Mississippi
-
-/\Biss\B/+P
- Mississippi
-
-/iss/G+
- Mississippi
-
-/\Biss\B/G+
- Mississippi
-
-/\Biss\B/g+
- Mississippi
- *** Failers
- Mississippi\A
-
-/(?<=[Ms])iss/g+
- Mississippi
-
-/(?<=[Ms])iss/G+
- Mississippi
-
-/^iss/g+
- ississippi
-
-/.*iss/g+
- abciss\nxyzisspqr
-
-/.i./+g
- Mississippi
- Mississippi\A
- Missouri river
- Missouri river\A
-
-/^.is/+g
- Mississippi
-
-/^ab\n/g+
- ab\nab\ncd
-
-/^ab\n/mg+
- ab\nab\ncd
-
-/abc/
-
-/abc|bac/
-
-/(abc|bac)/
-
-/(abc|(c|dc))/
-
-/(abc|(d|de)c)/
-
-/a*/
-
-/a+/
-
-/(baa|a+)/
-
-/a{0,3}/
-
-/baa{3,}/
-
-/"([^\\"]+|\\.)*"/
-
-/(abc|ab[cd])/
-
-/(a|.)/
-
-/a|ba|\w/
-
-/abc(?=pqr)/
-
-/...(?<=abc)/
-
-/abc(?!pqr)/
-
-/ab./
-
-/ab[xyz]/
-
-/abc*/
-
-/ab.c*/
-
-/a.c*/
-
-/.c*/
-
-/ac*/
-
-/(a.c*|b.c*)/
-
-/a.c*|aba/
-
-/.+a/
-
-/(?=abcda)a.*/
-
-/(?=a)a.*/
-
-/a(b)*/
-
-/a\d*/
-
-/ab\d*/
-
-/a(\d)*/
-
-/abcde{0,0}/
-
-/ab\d+/
-
-/a(?(1)b)/
-
-/a(?(1)bag|big)/
-
-/a(?(1)bag|big)*/
-
-/a(?(1)bag|big)+/
-
-/a(?(1)b..|b..)/
-
-/ab\d{0}e/
-
-/a?b?/
- a
- b
- ab
- \
- *** Failers
- \N
-
-/|-/
- abcd
- -abc
- \Nab-c
- *** Failers
- \Nabc
-
-/a*(b+)(z)(z)/P
- aaaabbbbzzzz
- aaaabbbbzzzz\O0
- aaaabbbbzzzz\O1
- aaaabbbbzzzz\O2
- aaaabbbbzzzz\O3
- aaaabbbbzzzz\O4
- aaaabbbbzzzz\O5
-
-/^.?abcd/S
-
-/\( # ( at start
- (?: # Non-capturing bracket
- (?>[^()]+) # Either a sequence of non-brackets (no backtracking)
- | # Or
- (?R) # Recurse - i.e. nested bracketed string
- )* # Zero or more contents
- \) # Closing )
- /x
- (abcd)
- (abcd)xyz
- xyz(abcd)
- (ab(xy)cd)pqr
- (ab(xycd)pqr
- () abc ()
- 12(abcde(fsh)xyz(foo(bar))lmno)89
- *** Failers
- abcd
- abcd)
- (abcd
-
-/\( ( (?>[^()]+) | (?R) )* \) /xg
- (ab(xy)cd)pqr
- 1(abcd)(x(y)z)pqr
-
-/\( (?: (?>[^()]+) | (?R) ) \) /x
- (abcd)
- (ab(xy)cd)
- (a(b(c)d)e)
- ((ab))
- *** Failers
- ()
-
-/\( (?: (?>[^()]+) | (?R) )? \) /x
- ()
- 12(abcde(fsh)xyz(foo(bar))lmno)89
-
-/\( ( (?>[^()]+) | (?R) )* \) /x
- (ab(xy)cd)
-
-/\( ( ( (?>[^()]+) | (?R) )* ) \) /x
- (ab(xy)cd)
-
-/\( (123)? ( ( (?>[^()]+) | (?R) )* ) \) /x
- (ab(xy)cd)
- (123ab(xy)cd)
-
-/\( ( (123)? ( (?>[^()]+) | (?R) )* ) \) /x
- (ab(xy)cd)
- (123ab(xy)cd)
-
-/\( (((((((((( ( (?>[^()]+) | (?R) )* )))))))))) \) /x
- (ab(xy)cd)
-
-/\( ( ( (?>[^()<>]+) | ((?>[^()]+)) | (?R) )* ) \) /x
- (abcd(xyz<p>qrs)123)
-
-/\( ( ( (?>[^()]+) | ((?R)) )* ) \) /x
- (ab(cd)ef)
- (ab(cd(ef)gh)ij)
-
-/^[[:alnum:]]/D
-
-/^[[:alpha:]]/D
-
-/^[[:ascii:]]/D
-
-/^[[:blank:]]/D
-
-/^[[:cntrl:]]/D
-
-/^[[:digit:]]/D
-
-/^[[:graph:]]/D
-
-/^[[:lower:]]/D
-
-/^[[:print:]]/D
-
-/^[[:punct:]]/D
-
-/^[[:space:]]/D
-
-/^[[:upper:]]/D
-
-/^[[:xdigit:]]/D
-
-/^[[:word:]]/D
-
-/^[[:^cntrl:]]/D
-
-/^[12[:^digit:]]/D
-
-/^[[:^blank:]]/D
-
-/[01[:alpha:]%]/D
-
-/[[.ch.]]/
-
-/[[=ch=]]/
-
-/[[:rhubarb:]]/
-
-/[[:upper:]]/i
- A
- a
-
-/[[:lower:]]/i
- A
- a
-
-/((?-i)[[:lower:]])[[:lower:]]/i
- ab
- aB
- *** Failers
- Ab
- AB
-
-/[\200-\410]/
-
-/^(?(0)f|b)oo/
-
-/This one's here because of the large output vector needed/
-
-/(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s
|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\w+)\s+(\270)/
- \O900 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 ABC ABC
-
-/This one's here because Perl does this differently and PCRE can't at present/
-
-/(main(O)?)+/
- mainmain
- mainOmain
-
-/These are all cases where Perl does it differently (nested captures)/
-
-/^(a(b)?)+$/
- aba
-
-/^(aa(bb)?)+$/
- aabbaa
-
-/^(aa|aa(bb))+$/
- aabbaa
-
-/^(aa(bb)??)+$/
- aabbaa
-
-/^(?:aa(bb)?)+$/
- aabbaa
-
-/^(aa(b(b))?)+$/
- aabbaa
-
-/^(?:aa(b(b))?)+$/
- aabbaa
-
-/^(?:aa(b(?:b))?)+$/
- aabbaa
-
-/^(?:aa(bb(?:b))?)+$/
- aabbbaa
-
-/^(?:aa(b(?:bb))?)+$/
- aabbbaa
-
-/^(?:aa(?:b(b))?)+$/
- aabbaa
-
-/^(?:aa(?:b(bb))?)+$/
- aabbbaa
-
-/^(aa(b(bb))?)+$/
- aabbbaa
-
-/^(aa(bb(bb))?)+$/
- aabbbbaa
-
-/--------------------------------------------------------------------/
-
-/#/xMD
-
-/a#/xMD
-
-/[\s]/D
-
-/[\S]/D
-
-/a(?i)b/D
- ab
- aB
- *** Failers
- AB
-
-/(a(?i)b)/D
- ab
- aB
- *** Failers
- AB
-
-/ (?i)abc/xD
-
-/#this is a comment
- (?i)abc/xD
-
-/123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890/D
-
-/\Q123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890/D
-
-/\Q\E/D
- \
-
-/\Q\Ex/D
-
-/ \Q\E/D
-
-/a\Q\E/D
- abc
- bca
- bac
-
-/a\Q\Eb/D
- abc
-
-/\Q\Eabc/D
-
-/x*+\w/D
- ****Failers
- xxxxx
-
-/x?+/D
-
-/x++/D
-
-/x{1,3}+/D
-
-/(x)*+/D
-
-/^(\w++|\s++)*$/
- now is the time for all good men to come to the aid of the party
- *** Failers
- this is not a line with only words and spaces!
-
-/(\d++)(\w)/
- 12345a
- *** Failers
- 12345+
-
-/a++b/
- aaab
-
-/(a++b)/
- aaab
-
-/(a++)b/
- aaab
-
-/([^()]++|\([^()]*\))+/
- ((abc(ade)ufh()()x
-
-/\(([^()]++|\([^()]+\))+\)/
- (abc)
- (abc(def)xyz)
- *** Failers
- ((()aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-
-/(abc){1,3}+/D
-
-/a+?+/
-
-/a{2,3}?+b/
-
-/(?U)a+?+/
-
-/a{2,3}?+b/U
-
-/x(?U)a++b/D
- xaaaab
-
-/(?U)xa++b/D
- xaaaab
-
-/^((a+)(?U)([ab]+)(?-U)([bc]+)(\w*))/D
-
-/^x(?U)a+b/D
-
-/^x(?U)(a+)b/D
-
-/[.x.]/
-
-/[=x=]/
-
-/[:x:]/
-
-/\l/
-
-/\L/
-
-/\N{name}/
-
-/\pP/
-
-/\PP/
-
-/\p{prop}/
-
-/\P{prop}/
-
-/\u/
-
-/\U/
-
-/\X/
-
-/[/
-
-/[a-/
-
-/[[:space:]/
-
-/[\s]/DM
-
-/[[:space:]]/DM
-
-/[[:space:]abcde]/DM
-
-/< (?: (?(R) \d++ | [^<>]*+) | (?R)) * >/x
- <>
- <abcd>
- <abc <123> hij>
- <abc <def> hij>
- <abc<>def>
- <abc<>
- *** Failers
- <abc
-
-|8J\$WE\<\.rX\+ix\[d1b\!H\#\?vV0vrK\:ZH1\=2M\>iV\;\?aPhFB\<\*vW\@QW\@sO9\}cfZA\-i\'w\%hKd6gt1UJP\,15_\#QY\$M\^Mss_U\/\]\&LK9\[5vQub\^w\[KDD\<EjmhUZ\?\.akp2dF\>qmj\;2\}YWFdYx\.Ap\]hjCPTP\(n28k\+3\;o\&WXqs\/gOXdr\$\:r\'do0\;b4c\(f_Gr\=\"\\4\)\[01T7ajQJvL\$W\~mL_sS\/4h\:x\*\[ZN\=KLs\&L5zX\/\/\>it\,o\:aU\(\;Z\>pW\&T7oP\'2K\^E\:x9\'c\[\%z\-\,64JQ5AeH_G\#KijUKghQw\^\\vea3a\?kka_G\$8\#\`\*kynsxzBLru\'\]k_\[7FrVx\}\^\=\$blx\>s\-N\%j\;D\*aZDnsw\:YKZ\%Q\.Kne9\#hP\?\+b3\(SOvL\,\^\;\&u5\@\?5C5Bhb\=m\-vEh_L15Jl\]U\)0RP6\{q\%L\^_z5E\'Dw6X\b|DM
-
-|\$\<\.X\+ix\[d1b\!H\#\?vV0vrK\:ZH1\=2M\>iV\;\?aPhFB\<\*vW\@QW\@sO9\}cfZA\-i\'w\%hKd6gt1UJP\,15_\#QY\$M\^Mss_U\/\]\&LK9\[5vQub\^w\[KDD\<EjmhUZ\?\.akp2dF\>qmj\;2\}YWFdYx\.Ap\]hjCPTP\(n28k\+3\;o\&WXqs\/gOXdr\$\:r\'do0\;b4c\(f_Gr\=\"\\4\)\[01T7ajQJvL\$W\~mL_sS\/4h\:x\*\[ZN\=KLs\&L5zX\/\/\>it\,o\:aU\(\;Z\>pW\&T7oP\'2K\^E\:x9\'c\[\%z\-\,64JQ5AeH_G\#KijUKghQw\^\\vea3a\?kka_G\$8\#\`\*kynsxzBLru\'\]k_\[7FrVx\}\^\=\$blx\>s\-N\%j\;D\*aZDnsw\:YKZ\%Q\.Kne9\#hP\?\+b3\(SOvL\,\^\;\&u5\@\?5C5Bhb\=m\-vEh_L15Jl\]U\)0RP6\{q\%L\^_z5E\'Dw6X\b|DM
-
-/(.*)\d+\1/I
-
-/(.*)\d+/I
-
-/(.*)\d+\1/Is
-
-/(.*)\d+/Is
-
-/(.*(xyz))\d+\2/I
-
-/((.*))\d+\1/I
- abc123bc
-
-/a[b]/I
-
-/(?=a).*/I
-
-/(?=abc).xyz/iI
-
-/(?=abc)(?i).xyz/I
-
-/(?=a)(?=b)/I
-
-/(?=.)a/I
-
-/((?=abcda)a)/I
-
-/((?=abcda)ab)/I
-
-/()a/I
-
-/(?(1)ab|ac)/I
-
-/(?(1)abz|acz)/I
-
-/(?(1)abz)/I
-
-/(?(1)abz)123/I
-
-/(a)+/I
-
-/(a){2,3}/I
-
-/(a)*/I
-
-/[a]/I
-
-/[ab]/I
-
-/[ab]/IS
-
-/[^a]/I
-
-/\d456/I
-
-/\d456/IS
-
-/a^b/I
-
-/^a/mI
- abcde
- xy\nabc
- *** Failers
- xyabc
-
-/c|abc/I
-
-/(?i)[ab]/IS
-
-/[ab](?i)cd/IS
-
-/abc(?C)def/
- abcdef
- 1234abcdef
- *** Failers
- abcxyz
- abcxyzf
-
-/abc(?C)de(?C1)f/
- 123abcdef
-
-/(?C1)\dabc(?C2)def/
- 1234abcdef
- *** Failers
- abcdef
-
-/(?C255)ab/
-
-/(?C256)ab/
-
-/(?Cab)xx/
-
-/(?C12vr)x/
-
-/abc(?C)def/
- *** Failers
- \x83\x0\x61bcdef
-
-/(abc)(?C)de(?C1)f/
- 123abcdef
- 123abcdef\C+
- 123abcdef\C-
- *** Failers
- 123abcdef\C!1
-
-/(?C0)(abc(?C1))*/
- abcabcabc
- abcabc\C!1!3
- *** Failers
- abcabcabc\C!1!3
-
-/(\d{3}(?C))*/
- 123\C+
- 123456\C+
- 123456789\C+
-
-/((xyz)(?C)p|(?C1)xyzabc)/
- xyzabc\C+
-
-/(X)((xyz)(?C)p|(?C1)xyzabc)/
- Xxyzabc\C+
-
-/(?=(abc))(?C)abcdef/
- abcdef\C+
-
-/(?!(abc)(?C1)d)(?C2)abcxyz/
- abcxyz\C+
-
-/(?<=(abc)(?C))xyz/
- abcxyz\C+
-
-/(?C)abc/
-
-/(?C)^abc/
-
-/(?C)a|b/S
-
-/(?R)/
-
-/(a|(?R))/
-
-/(ab|(bc|(de|(?R))))/
-
-/x(ab|(bc|(de|(?R))))/
- xab
- xbc
- xde
- xxab
- xxxab
- *** Failers
- xyab
-
-/(ab|(bc|(de|(?1))))/
-
-/x(ab|(bc|(de|(?1)x)x)x)/
-
-/^([^()]|\((?1)*\))*$/
- abc
- a(b)c
- a(b(c))d
- *** Failers)
- a(b(c)d
-
-/^>abc>([^()]|\((?1)*\))*<xyz<$/
- >abc>123<xyz<
- >abc>1(2)3<xyz<
- >abc>(1(2)3)<xyz<
-
-/(a(?1)b)/D
-
-/(a(?1)+b)/D
-
-/^\W*(?:((.)\W*(?1)\W*\2|)|((.)\W*(?3)\W*\4|\W*.\W*))\W*$/i
- 1221
- Satan, oscillate my metallic sonatas!
- A man, a plan, a canal: Panama!
- Able was I ere I saw Elba.
- *** Failers
- The quick brown fox
-
-/^(\d+|\((?1)([+*-])(?1)\)|-(?1))$/
- 12
- (((2+2)*-3)-7)
- -12
- *** Failers
- ((2+2)*-3)-7)
-
-/^(x(y|(?1){2})z)/
- xyz
- xxyzxyzz
- *** Failers
- xxyzz
- xxyzxyzxyzz
-
-/((< (?: (?(R) \d++ | [^<>]*+) | (?2)) * >))/x
- <>
- <abcd>
- <abc <123> hij>
- <abc <def> hij>
- <abc<>def>
- <abc<>
- *** Failers
- <abc
-
-/(?1)/
-
-/((?2)(abc)/
-
-/^(abc)def(?1)/
- abcdefabc
-
-/^(a|b|c)=(?1)+/
- a=a
- a=b
- a=bc
-
-/^(a|b|c)=((?1))+/
- a=a
- a=b
- a=bc
-
-/a(?P<name1>b|c)d(?P<longername2>e)/D
- abde
- acde
-
-/(?:a(?P<c>c(?P<d>d)))(?P<a>a)/D
-
-/(?P<a>a)...(?P=a)bbb(?P>a)d/D
-
-/ End of testinput2 /
diff --git a/ext/pcre/pcrelib/testdata/testinput3 b/ext/pcre/pcrelib/testdata/testinput3
deleted file mode 100644
index 391aa62069..0000000000
--- a/ext/pcre/pcrelib/testdata/testinput3
+++ /dev/null
@@ -1,65 +0,0 @@
-/^[\w]+/
- *** Failers
- École
-
-/^[\w]+/Lfr
- École
-
-/^[\w]+/
- *** Failers
- École
-
-/^[\W]+/
- École
-
-/^[\W]+/Lfr
- *** Failers
- École
-
-/[\b]/
- \b
- *** Failers
- a
-
-/[\b]/Lfr
- \b
- *** Failers
- a
-
-/^\w+/
- *** Failers
- École
-
-/^\w+/Lfr
- École
-
-/(.+)\b(.+)/
- École
-
-/(.+)\b(.+)/Lfr
- *** Failers
- École
-
-/École/i
- École
- *** Failers
- école
-
-/École/iLfr
- École
- école
-
-/\w/IS
-
-/\w/ISLfr
-
-/^[\xc8-\xc9]/iLfr
- École
- école
-
-/^[\xc8-\xc9]/Lfr
- École
- *** Failers
- école
-
-/ End of testinput3 /
diff --git a/ext/pcre/pcrelib/testdata/testinput4 b/ext/pcre/pcrelib/testdata/testinput4
deleted file mode 100644
index 51d6b97632..0000000000
--- a/ext/pcre/pcrelib/testdata/testinput4
+++ /dev/null
@@ -1,155 +0,0 @@
-/-- Do not use the \x{} construct except with patterns that have the --/
-/-- /8 option set, because PCRE doesn't recognize them as UTF-8 unless --/
-/-- that option is set. However, the latest Perls recognize them always. --/
-
-/a.b/8
- acb
- a\x7fb
- a\x{100}b
- *** Failers
- a\nb
-
-/a(.{3})b/8
- a\x{4000}xyb
- a\x{4000}\x7fyb
- a\x{4000}\x{100}yb
- *** Failers
- a\x{4000}b
- ac\ncb
-
-/a(.*?)(.)/
- a\xc0\x88b
-
-/a(.*?)(.)/8
- a\x{100}b
-
-/a(.*)(.)/
- a\xc0\x88b
-
-/a(.*)(.)/8
- a\x{100}b
-
-/a(.)(.)/
- a\xc0\x92bcd
-
-/a(.)(.)/8
- a\x{240}bcd
-
-/a(.?)(.)/
- a\xc0\x92bcd
-
-/a(.?)(.)/8
- a\x{240}bcd
-
-/a(.??)(.)/
- a\xc0\x92bcd
-
-/a(.??)(.)/8
- a\x{240}bcd
-
-/a(.{3})b/8
- a\x{1234}xyb
- a\x{1234}\x{4321}yb
- a\x{1234}\x{4321}\x{3412}b
- *** Failers
- a\x{1234}b
- ac\ncb
-
-/a(.{3,})b/8
- a\x{1234}xyb
- a\x{1234}\x{4321}yb
- a\x{1234}\x{4321}\x{3412}b
- axxxxbcdefghijb
- a\x{1234}\x{4321}\x{3412}\x{3421}b
- *** Failers
- a\x{1234}b
-
-/a(.{3,}?)b/8
- a\x{1234}xyb
- a\x{1234}\x{4321}yb
- a\x{1234}\x{4321}\x{3412}b
- axxxxbcdefghijb
- a\x{1234}\x{4321}\x{3412}\x{3421}b
- *** Failers
- a\x{1234}b
-
-/a(.{3,5})b/8
- a\x{1234}xyb
- a\x{1234}\x{4321}yb
- a\x{1234}\x{4321}\x{3412}b
- axxxxbcdefghijb
- a\x{1234}\x{4321}\x{3412}\x{3421}b
- axbxxbcdefghijb
- axxxxxbcdefghijb
- *** Failers
- a\x{1234}b
- axxxxxxbcdefghijb
-
-/a(.{3,5}?)b/8
- a\x{1234}xyb
- a\x{1234}\x{4321}yb
- a\x{1234}\x{4321}\x{3412}b
- axxxxbcdefghijb
- a\x{1234}\x{4321}\x{3412}\x{3421}b
- axbxxbcdefghijb
- axxxxxbcdefghijb
- *** Failers
- a\x{1234}b
- axxxxxxbcdefghijb
-
-/^[a\x{c0}]/8
- *** Failers
- \x{100}
-
-/(?<=aXb)cd/8
- aXbcd
-
-/(?<=a\x{100}b)cd/8
- a\x{100}bcd
-
-/(?<=a\x{100000}b)cd/8
- a\x{100000}bcd
-
-/(?:\x{100}){3}b/8
- \x{100}\x{100}\x{100}b
- *** Failers
- \x{100}\x{100}b
-
-/\x{ab}/8
- \x{ab}
- \xc2\xab
- *** Failers
- \x00{ab}
-
-/(?<=(.))X/8
- WXYZ
- \x{256}XYZ
- *** Failers
- XYZ
-
-/X(\C{3})/8
- X\x{1234}
-
-/X(\C{4})/8
- X\x{1234}YZ
-
-/X\C*/8
- XYZabcdce
-
-/X\C*?/8
- XYZabcde
-
-/X\C{3,5}/8
- Xabcdefg
- X\x{1234}
- X\x{1234}YZ
- X\x{1234}\x{512}
- X\x{1234}\x{512}YZ
-
-/X\C{3,5}?/8
- Xabcdefg
- X\x{1234}
- X\x{1234}YZ
- X\x{1234}\x{512}
-
-/ End of testinput4 /
diff --git a/ext/pcre/pcrelib/testdata/testinput5 b/ext/pcre/pcrelib/testdata/testinput5
deleted file mode 100644
index 81fe233e6b..0000000000
--- a/ext/pcre/pcrelib/testdata/testinput5
+++ /dev/null
@@ -1,91 +0,0 @@
-/\x{100}/8DM
-
-/\x{1000}/8DM
-
-/\x{10000}/8DM
-
-/\x{100000}/8DM
-
-/\x{1000000}/8DM
-
-/\x{4000000}/8DM
-
-/\x{7fffFFFF}/8DM
-
-/[\x{ff}]/8DM
-
-/[\x{100}]/8DM
-
-/\x{ffffffff}/8
-
-/\x{100000000}/8
-
-/^\x{100}a\x{1234}/8
- \x{100}a\x{1234}bcd
-
-/\x80/8D
-
-/\xff/8D
-
-/\x{0041}\x{2262}\x{0391}\x{002e}/D8
- \x{0041}\x{2262}\x{0391}\x{002e}
-
-/\x{D55c}\x{ad6d}\x{C5B4}/D8
- \x{D55c}\x{ad6d}\x{C5B4}
-
-/\x{65e5}\x{672c}\x{8a9e}/D8
- \x{65e5}\x{672c}\x{8a9e}
-
-/\x{80}/D8
-
-/\x{084}/D8
-
-/\x{104}/D8
-
-/\x{861}/D8
-
-/\x{212ab}/D8
-
-/.{3,5}X/D8
- \x{212ab}\x{212ab}\x{212ab}\x{861}X
-
-
-/.{3,5}?/D8
- \x{212ab}\x{212ab}\x{212ab}\x{861}
-
-/-- These tests are here rather than in testinput4 because Perl 5.6 has --/
-/-- some problems with UTF-8 support, in the area of \x{..} where the --/
-/-- value is < 255. It grumbles about invalid UTF-8 strings. --/
-
-/^[a\x{c0}]b/8
- \x{c0}b
-
-/^([a\x{c0}]*?)aa/8
- a\x{c0}aaaa/
-
-/^([a\x{c0}]*?)aa/8
- a\x{c0}aaaa/
- a\x{c0}a\x{c0}aaa/
-
-/^([a\x{c0}]*)aa/8
- a\x{c0}aaaa/
- a\x{c0}a\x{c0}aaa/
-
-/^([a\x{c0}]*)a\x{c0}/8
- a\x{c0}aaaa/
- a\x{c0}a\x{c0}aaa/
-
-/-- --/
-
-/(?<=\C)X/8
- Should produce an error diagnostic
-
-/-- This one is here not because it's different to Perl, but because the --/
-/-- way the captured single-byte is displayed. (In Perl it becomes a --/
-/-- character, and you can't tell the difference.) --/
-
-/X(\C)(.*)/8
- X\x{1234}
- X\nabc
-
-/ End of testinput5 /
diff --git a/ext/pcre/pcrelib/testdata/testinput6 b/ext/pcre/pcrelib/testdata/testinput6
deleted file mode 100644
index 00748513c6..0000000000
--- a/ext/pcre/pcrelib/testdata/testinput6
+++ /dev/null
@@ -1,78 +0,0 @@
-/\x{100}/8DM
-
-/\x{1000}/8DM
-
-/\x{10000}/8DM
-
-/\x{100000}/8DM
-
-/\x{1000000}/8DM
-
-/\x{4000000}/8DM
-
-/\x{7fffFFFF}/8DM
-
-/[\x{ff}]/8DM
-
-/[\x{100}]/8DM
-
-/\x{ffffffff}/8
-
-/\x{100000000}/8
-
-/^\x{100}a\x{1234}/8
- \x{100}a\x{1234}bcd
-
-/\x80/8D
-
-/\xff/8D
-
-/\x{0041}\x{2262}\x{0391}\x{002e}/D8
- \x{0041}\x{2262}\x{0391}\x{002e}
-
-/\x{D55c}\x{ad6d}\x{C5B4}/D8
- \x{D55c}\x{ad6d}\x{C5B4}
-
-/\x{65e5}\x{672c}\x{8a9e}/D8
- \x{65e5}\x{672c}\x{8a9e}
-
-/\x{80}/D8
-
-/\x{084}/D8
-
-/\x{104}/D8
-
-/\x{861}/D8
-
-/\x{212ab}/D8
-
-/.{3,5}X/D8
- \x{212ab}\x{212ab}\x{212ab}\x{861}X
-
-
-/.{3,5}?/D8
- \x{212ab}\x{212ab}\x{212ab}\x{861}
-
-/-- These tests are here rather than in testinput5 because Perl 5.6 has --/
-/-- some problems with UTF-8 support, in the area of \x{..} where the --/
-/-- value is < 255. It grumbles about invalid UTF-8 strings. --/
-
-/^[a\x{c0}]b/8
- \x{c0}b
-
-/^([a\x{c0}]*?)aa/8
- a\x{c0}aaaa/
-
-/^([a\x{c0}]*?)aa/8
- a\x{c0}aaaa/
- a\x{c0}a\x{c0}aaa/
-
-/^([a\x{c0}]*)aa/8
- a\x{c0}aaaa/
- a\x{c0}a\x{c0}aaa/
-
-/^([a\x{c0}]*)a\x{c0}/8
- a\x{c0}aaaa/
- a\x{c0}a\x{c0}aaa/
-
-/ End of testinput6 /
diff --git a/ext/pcre/pcrelib/testdata/testoutput1 b/ext/pcre/pcrelib/testdata/testoutput1
deleted file mode 100644
index 81bf6cef3d..0000000000
--- a/ext/pcre/pcrelib/testdata/testoutput1
+++ /dev/null
@@ -1,6222 +0,0 @@
-PCRE version 3.92 11-Sep-2002
-
-/the quick brown fox/
- the quick brown fox
- 0: the quick brown fox
- The quick brown FOX
-No match
- What do you know about the quick brown fox?
- 0: the quick brown fox
- What do you know about THE QUICK BROWN FOX?
-No match
-
-/The quick brown fox/i
- the quick brown fox
- 0: the quick brown fox
- The quick brown FOX
- 0: The quick brown FOX
- What do you know about the quick brown fox?
- 0: the quick brown fox
- What do you know about THE QUICK BROWN FOX?
- 0: THE QUICK BROWN FOX
-
-/abcd\t\n\r\f\a\e\071\x3b\$\\\?caxyz/
- abcd\t\n\r\f\a\e9;\$\\?caxyz
- 0: abcd\x09\x0a\x0d\x0c\x07\x1b9;$\?caxyz
-
-/a*abc?xyz+pqr{3}ab{2,}xy{4,5}pq{0,6}AB{0,}zz/
- abxyzpqrrrabbxyyyypqAzz
- 0: abxyzpqrrrabbxyyyypqAzz
- abxyzpqrrrabbxyyyypqAzz
- 0: abxyzpqrrrabbxyyyypqAzz
- aabxyzpqrrrabbxyyyypqAzz
- 0: aabxyzpqrrrabbxyyyypqAzz
- aaabxyzpqrrrabbxyyyypqAzz
- 0: aaabxyzpqrrrabbxyyyypqAzz
- aaaabxyzpqrrrabbxyyyypqAzz
- 0: aaaabxyzpqrrrabbxyyyypqAzz
- abcxyzpqrrrabbxyyyypqAzz
- 0: abcxyzpqrrrabbxyyyypqAzz
- aabcxyzpqrrrabbxyyyypqAzz
- 0: aabcxyzpqrrrabbxyyyypqAzz
- aaabcxyzpqrrrabbxyyyypAzz
- 0: aaabcxyzpqrrrabbxyyyypAzz
- aaabcxyzpqrrrabbxyyyypqAzz
- 0: aaabcxyzpqrrrabbxyyyypqAzz
- aaabcxyzpqrrrabbxyyyypqqAzz
- 0: aaabcxyzpqrrrabbxyyyypqqAzz
- aaabcxyzpqrrrabbxyyyypqqqAzz
- 0: aaabcxyzpqrrrabbxyyyypqqqAzz
- aaabcxyzpqrrrabbxyyyypqqqqAzz
- 0: aaabcxyzpqrrrabbxyyyypqqqqAzz
- aaabcxyzpqrrrabbxyyyypqqqqqAzz
- 0: aaabcxyzpqrrrabbxyyyypqqqqqAzz
- aaabcxyzpqrrrabbxyyyypqqqqqqAzz
- 0: aaabcxyzpqrrrabbxyyyypqqqqqqAzz
- aaaabcxyzpqrrrabbxyyyypqAzz
- 0: aaaabcxyzpqrrrabbxyyyypqAzz
- abxyzzpqrrrabbxyyyypqAzz
- 0: abxyzzpqrrrabbxyyyypqAzz
- aabxyzzzpqrrrabbxyyyypqAzz
- 0: aabxyzzzpqrrrabbxyyyypqAzz
- aaabxyzzzzpqrrrabbxyyyypqAzz
- 0: aaabxyzzzzpqrrrabbxyyyypqAzz
- aaaabxyzzzzpqrrrabbxyyyypqAzz
- 0: aaaabxyzzzzpqrrrabbxyyyypqAzz
- abcxyzzpqrrrabbxyyyypqAzz
- 0: abcxyzzpqrrrabbxyyyypqAzz
- aabcxyzzzpqrrrabbxyyyypqAzz
- 0: aabcxyzzzpqrrrabbxyyyypqAzz
- aaabcxyzzzzpqrrrabbxyyyypqAzz
- 0: aaabcxyzzzzpqrrrabbxyyyypqAzz
- aaaabcxyzzzzpqrrrabbxyyyypqAzz
- 0: aaaabcxyzzzzpqrrrabbxyyyypqAzz
- aaaabcxyzzzzpqrrrabbbxyyyypqAzz
- 0: aaaabcxyzzzzpqrrrabbbxyyyypqAzz
- aaaabcxyzzzzpqrrrabbbxyyyyypqAzz
- 0: aaaabcxyzzzzpqrrrabbbxyyyyypqAzz
- aaabcxyzpqrrrabbxyyyypABzz
- 0: aaabcxyzpqrrrabbxyyyypABzz
- aaabcxyzpqrrrabbxyyyypABBzz
- 0: aaabcxyzpqrrrabbxyyyypABBzz
- >>>aaabxyzpqrrrabbxyyyypqAzz
- 0: aaabxyzpqrrrabbxyyyypqAzz
- >aaaabxyzpqrrrabbxyyyypqAzz
- 0: aaaabxyzpqrrrabbxyyyypqAzz
- >>>>abcxyzpqrrrabbxyyyypqAzz
- 0: abcxyzpqrrrabbxyyyypqAzz
- *** Failers
-No match
- abxyzpqrrabbxyyyypqAzz
-No match
- abxyzpqrrrrabbxyyyypqAzz
-No match
- abxyzpqrrrabxyyyypqAzz
-No match
- aaaabcxyzzzzpqrrrabbbxyyyyyypqAzz
-No match
- aaaabcxyzzzzpqrrrabbbxyyypqAzz
-No match
- aaabcxyzpqrrrabbxyyyypqqqqqqqAzz
-No match
-
-/^(abc){1,2}zz/
- abczz
- 0: abczz
- 1: abc
- abcabczz
- 0: abcabczz
- 1: abc
- *** Failers
-No match
- zz
-No match
- abcabcabczz
-No match
- >>abczz
-No match
-
-/^(b+?|a){1,2}?c/
- bc
- 0: bc
- 1: b
- bbc
- 0: bbc
- 1: b
- bbbc
- 0: bbbc
- 1: bb
- bac
- 0: bac
- 1: a
- bbac
- 0: bbac
- 1: a
- aac
- 0: aac
- 1: a
- abbbbbbbbbbbc
- 0: abbbbbbbbbbbc
- 1: bbbbbbbbbbb
- bbbbbbbbbbbac
- 0: bbbbbbbbbbbac
- 1: a
- *** Failers
-No match
- aaac
-No match
- abbbbbbbbbbbac
-No match
-
-/^(b+|a){1,2}c/
- bc
- 0: bc
- 1: b
- bbc
- 0: bbc
- 1: bb
- bbbc
- 0: bbbc
- 1: bbb
- bac
- 0: bac
- 1: a
- bbac
- 0: bbac
- 1: a
- aac
- 0: aac
- 1: a
- abbbbbbbbbbbc
- 0: abbbbbbbbbbbc
- 1: bbbbbbbbbbb
- bbbbbbbbbbbac
- 0: bbbbbbbbbbbac
- 1: a
- *** Failers
-No match
- aaac
-No match
- abbbbbbbbbbbac
-No match
-
-/^(b+|a){1,2}?bc/
- bbc
- 0: bbc
- 1: b
-
-/^(b*|ba){1,2}?bc/
- babc
- 0: babc
- 1: ba
- bbabc
- 0: bbabc
- 1: ba
- bababc
- 0: bababc
- 1: ba
- *** Failers
-No match
- bababbc
-No match
- babababc
-No match
-
-/^(ba|b*){1,2}?bc/
- babc
- 0: babc
- 1: ba
- bbabc
- 0: bbabc
- 1: ba
- bababc
- 0: bababc
- 1: ba
- *** Failers
-No match
- bababbc
-No match
- babababc
-No match
-
-/^\ca\cA\c[\c{\c:/
- \x01\x01\e;z
- 0: \x01\x01\x1b;z
-
-/^[ab\]cde]/
- athing
- 0: a
- bthing
- 0: b
- ]thing
- 0: ]
- cthing
- 0: c
- dthing
- 0: d
- ething
- 0: e
- *** Failers
-No match
- fthing
-No match
- [thing
-No match
- \\thing
-No match
-
-/^[]cde]/
- ]thing
- 0: ]
- cthing
- 0: c
- dthing
- 0: d
- ething
- 0: e
- *** Failers
-No match
- athing
-No match
- fthing
-No match
-
-/^[^ab\]cde]/
- fthing
- 0: f
- [thing
- 0: [
- \\thing
- 0: \
- *** Failers
- 0: *
- athing
-No match
- bthing
-No match
- ]thing
-No match
- cthing
-No match
- dthing
-No match
- ething
-No match
-
-/^[^]cde]/
- athing
- 0: a
- fthing
- 0: f
- *** Failers
- 0: *
- ]thing
-No match
- cthing
-No match
- dthing
-No match
- ething
-No match
-
-/^\/
-
- 0: \x81
-
-/^ÿ/
- ÿ
- 0: \xff
-
-/^[0-9]+$/
- 0
- 0: 0
- 1
- 0: 1
- 2
- 0: 2
- 3
- 0: 3
- 4
- 0: 4
- 5
- 0: 5
- 6
- 0: 6
- 7
- 0: 7
- 8
- 0: 8
- 9
- 0: 9
- 10
- 0: 10
- 100
- 0: 100
- *** Failers
-No match
- abc
-No match
-
-/^.*nter/
- enter
- 0: enter
- inter
- 0: inter
- uponter
- 0: uponter
-
-/^xxx[0-9]+$/
- xxx0
- 0: xxx0
- xxx1234
- 0: xxx1234
- *** Failers
-No match
- xxx
-No match
-
-/^.+[0-9][0-9][0-9]$/
- x123
- 0: x123
- xx123
- 0: xx123
- 123456
- 0: 123456
- *** Failers
-No match
- 123
-No match
- x1234
- 0: x1234
-
-/^.+?[0-9][0-9][0-9]$/
- x123
- 0: x123
- xx123
- 0: xx123
- 123456
- 0: 123456
- *** Failers
-No match
- 123
-No match
- x1234
- 0: x1234
-
-/^([^!]+)!(.+)=apquxz\.ixr\.zzz\.ac\.uk$/
- abc!pqr=apquxz.ixr.zzz.ac.uk
- 0: abc!pqr=apquxz.ixr.zzz.ac.uk
- 1: abc
- 2: pqr
- *** Failers
-No match
- !pqr=apquxz.ixr.zzz.ac.uk
-No match
- abc!=apquxz.ixr.zzz.ac.uk
-No match
- abc!pqr=apquxz:ixr.zzz.ac.uk
-No match
- abc!pqr=apquxz.ixr.zzz.ac.ukk
-No match
-
-/:/
- Well, we need a colon: somewhere
- 0: :
- *** Fail if we don't
-No match
-
-/([\da-f:]+)$/i
- 0abc
- 0: 0abc
- 1: 0abc
- abc
- 0: abc
- 1: abc
- fed
- 0: fed
- 1: fed
- E
- 0: E
- 1: E
- ::
- 0: ::
- 1: ::
- 5f03:12C0::932e
- 0: 5f03:12C0::932e
- 1: 5f03:12C0::932e
- fed def
- 0: def
- 1: def
- Any old stuff
- 0: ff
- 1: ff
- *** Failers
-No match
- 0zzz
-No match
- gzzz
-No match
- fed\x20
-No match
- Any old rubbish
-No match
-
-/^.*\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/
- .1.2.3
- 0: .1.2.3
- 1: 1
- 2: 2
- 3: 3
- A.12.123.0
- 0: A.12.123.0
- 1: 12
- 2: 123
- 3: 0
- *** Failers
-No match
- .1.2.3333
-No match
- 1.2.3
-No match
- 1234.2.3
-No match
-
-/^(\d+)\s+IN\s+SOA\s+(\S+)\s+(\S+)\s*\(\s*$/
- 1 IN SOA non-sp1 non-sp2(
- 0: 1 IN SOA non-sp1 non-sp2(
- 1: 1
- 2: non-sp1
- 3: non-sp2
- 1 IN SOA non-sp1 non-sp2 (
- 0: 1 IN SOA non-sp1 non-sp2 (
- 1: 1
- 2: non-sp1
- 3: non-sp2
- *** Failers
-No match
- 1IN SOA non-sp1 non-sp2(
-No match
-
-/^[a-zA-Z\d][a-zA-Z\d\-]*(\.[a-zA-Z\d][a-zA-z\d\-]*)*\.$/
- a.
- 0: a.
- Z.
- 0: Z.
- 2.
- 0: 2.
- ab-c.pq-r.
- 0: ab-c.pq-r.
- 1: .pq-r
- sxk.zzz.ac.uk.
- 0: sxk.zzz.ac.uk.
- 1: .uk
- x-.y-.
- 0: x-.y-.
- 1: .y-
- *** Failers
-No match
- -abc.peq.
-No match
-
-/^\*\.[a-z]([a-z\-\d]*[a-z\d]+)?(\.[a-z]([a-z\-\d]*[a-z\d]+)?)*$/
- *.a
- 0: *.a
- *.b0-a
- 0: *.b0-a
- 1: 0-a
- *.c3-b.c
- 0: *.c3-b.c
- 1: 3-b
- 2: .c
- *.c-a.b-c
- 0: *.c-a.b-c
- 1: -a
- 2: .b-c
- 3: -c
- *** Failers
-No match
- *.0
-No match
- *.a-
-No match
- *.a-b.c-
-No match
- *.c-a.0-c
-No match
-
-/^(?=ab(de))(abd)(e)/
- abde
- 0: abde
- 1: de
- 2: abd
- 3: e
-
-/^(?!(ab)de|x)(abd)(f)/
- abdf
- 0: abdf
- 1: <unset>
- 2: abd
- 3: f
-
-/^(?=(ab(cd)))(ab)/
- abcd
- 0: ab
- 1: abcd
- 2: cd
- 3: ab
-
-/^[\da-f](\.[\da-f])*$/i
- a.b.c.d
- 0: a.b.c.d
- 1: .d
- A.B.C.D
- 0: A.B.C.D
- 1: .D
- a.b.c.1.2.3.C
- 0: a.b.c.1.2.3.C
- 1: .C
-
-/^\".*\"\s*(;.*)?$/
- \"1234\"
- 0: "1234"
- \"abcd\" ;
- 0: "abcd" ;
- 1: ;
- \"\" ; rhubarb
- 0: "" ; rhubarb
- 1: ; rhubarb
- *** Failers
-No match
- \"1234\" : things
-No match
-
-/^$/
- \
- 0:
- *** Failers
-No match
-
-/ ^ a (?# begins with a) b\sc (?# then b c) $ (?# then end)/x
- ab c
- 0: ab c
- *** Failers
-No match
- abc
-No match
- ab cde
-No match
-
-/(?x) ^ a (?# begins with a) b\sc (?# then b c) $ (?# then end)/
- ab c
- 0: ab c
- *** Failers
-No match
- abc
-No match
- ab cde
-No match
-
-/^ a\ b[c ]d $/x
- a bcd
- 0: a bcd
- a b d
- 0: a b d
- *** Failers
-No match
- abcd
-No match
- ab d
-No match
-
-/^(a(b(c)))(d(e(f)))(h(i(j)))(k(l(m)))$/
- abcdefhijklm
- 0: abcdefhijklm
- 1: abc
- 2: bc
- 3: c
- 4: def
- 5: ef
- 6: f
- 7: hij
- 8: ij
- 9: j
-10: klm
-11: lm
-12: m
-
-/^(?:a(b(c)))(?:d(e(f)))(?:h(i(j)))(?:k(l(m)))$/
- abcdefhijklm
- 0: abcdefhijklm
- 1: bc
- 2: c
- 3: ef
- 4: f
- 5: ij
- 6: j
- 7: lm
- 8: m
-
-/^[\w][\W][\s][\S][\d][\D][\b][\n][\c]][\022]/
- a+ Z0+\x08\n\x1d\x12
- 0: a+ Z0+\x08\x0a\x1d\x12
-
-/^[.^$|()*+?{,}]+/
- .^\$(*+)|{?,?}
- 0: .^$(*+)|{?,?}
-
-/^a*\w/
- z
- 0: z
- az
- 0: az
- aaaz
- 0: aaaz
- a
- 0: a
- aa
- 0: aa
- aaaa
- 0: aaaa
- a+
- 0: a
- aa+
- 0: aa
-
-/^a*?\w/
- z
- 0: z
- az
- 0: a
- aaaz
- 0: a
- a
- 0: a
- aa
- 0: a
- aaaa
- 0: a
- a+
- 0: a
- aa+
- 0: a
-
-/^a+\w/
- az
- 0: az
- aaaz
- 0: aaaz
- aa
- 0: aa
- aaaa
- 0: aaaa
- aa+
- 0: aa
-
-/^a+?\w/
- az
- 0: az
- aaaz
- 0: aa
- aa
- 0: aa
- aaaa
- 0: aa
- aa+
- 0: aa
-
-/^\d{8}\w{2,}/
- 1234567890
- 0: 1234567890
- 12345678ab
- 0: 12345678ab
- 12345678__
- 0: 12345678__
- *** Failers
-No match
- 1234567
-No match
-
-/^[aeiou\d]{4,5}$/
- uoie
- 0: uoie
- 1234
- 0: 1234
- 12345
- 0: 12345
- aaaaa
- 0: aaaaa
- *** Failers
-No match
- 123456
-No match
-
-/^[aeiou\d]{4,5}?/
- uoie
- 0: uoie
- 1234
- 0: 1234
- 12345
- 0: 1234
- aaaaa
- 0: aaaa
- 123456
- 0: 1234
-
-/\A(abc|def)=(\1){2,3}\Z/
- abc=abcabc
- 0: abc=abcabc
- 1: abc
- 2: abc
- def=defdefdef
- 0: def=defdefdef
- 1: def
- 2: def
- *** Failers
-No match
- abc=defdef
-No match
-
-/^(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)\11*(\3\4)\1(?#)2$/
- abcdefghijkcda2
- 0: abcdefghijkcda2
- 1: a
- 2: b
- 3: c
- 4: d
- 5: e
- 6: f
- 7: g
- 8: h
- 9: i
-10: j
-11: k
-12: cd
- abcdefghijkkkkcda2
- 0: abcdefghijkkkkcda2
- 1: a
- 2: b
- 3: c
- 4: d
- 5: e
- 6: f
- 7: g
- 8: h
- 9: i
-10: j
-11: k
-12: cd
-
-/(cat(a(ract|tonic)|erpillar)) \1()2(3)/
- cataract cataract23
- 0: cataract cataract23
- 1: cataract
- 2: aract
- 3: ract
- 4:
- 5: 3
- catatonic catatonic23
- 0: catatonic catatonic23
- 1: catatonic
- 2: atonic
- 3: tonic
- 4:
- 5: 3
- caterpillar caterpillar23
- 0: caterpillar caterpillar23
- 1: caterpillar
- 2: erpillar
- 3: <unset>
- 4:
- 5: 3
-
-
-/^From +([^ ]+) +[a-zA-Z][a-zA-Z][a-zA-Z] +[a-zA-Z][a-zA-Z][a-zA-Z] +[0-9]?[0-9] +[0-9][0-9]:[0-9][0-9]/
- From abcd Mon Sep 01 12:33:02 1997
- 0: From abcd Mon Sep 01 12:33
- 1: abcd
-
-/^From\s+\S+\s+([a-zA-Z]{3}\s+){2}\d{1,2}\s+\d\d:\d\d/
- From abcd Mon Sep 01 12:33:02 1997
- 0: From abcd Mon Sep 01 12:33
- 1: Sep
- From abcd Mon Sep 1 12:33:02 1997
- 0: From abcd Mon Sep 1 12:33
- 1: Sep
- *** Failers
-No match
- From abcd Sep 01 12:33:02 1997
-No match
-
-/^12.34/s
- 12\n34
- 0: 12\x0a34
- 12\r34
- 0: 12\x0d34
-
-/\w+(?=\t)/
- the quick brown\t fox
- 0: brown
-
-/foo(?!bar)(.*)/
- foobar is foolish see?
- 0: foolish see?
- 1: lish see?
-
-/(?:(?!foo)...|^.{0,2})bar(.*)/
- foobar crowbar etc
- 0: rowbar etc
- 1: etc
- barrel
- 0: barrel
- 1: rel
- 2barrel
- 0: 2barrel
- 1: rel
- A barrel
- 0: A barrel
- 1: rel
-
-/^(\D*)(?=\d)(?!123)/
- abc456
- 0: abc
- 1: abc
- *** Failers
-No match
- abc123
-No match
-
-/^1234(?# test newlines
- inside)/
- 1234
- 0: 1234
-
-/^1234 #comment in extended re
- /x
- 1234
- 0: 1234
-
-/#rhubarb
- abcd/x
- abcd
- 0: abcd
-
-/^abcd#rhubarb/x
- abcd
- 0: abcd
-
-/^(a)\1{2,3}(.)/
- aaab
- 0: aaab
- 1: a
- 2: b
- aaaab
- 0: aaaab
- 1: a
- 2: b
- aaaaab
- 0: aaaaa
- 1: a
- 2: a
- aaaaaab
- 0: aaaaa
- 1: a
- 2: a
-
-/(?!^)abc/
- the abc
- 0: abc
- *** Failers
-No match
- abc
-No match
-
-/(?=^)abc/
- abc
- 0: abc
- *** Failers
-No match
- the abc
-No match
-
-/^[ab]{1,3}(ab*|b)/
- aabbbbb
- 0: aabb
- 1: b
-
-/^[ab]{1,3}?(ab*|b)/
- aabbbbb
- 0: aabbbbb
- 1: abbbbb
-
-/^[ab]{1,3}?(ab*?|b)/
- aabbbbb
- 0: aa
- 1: a
-
-/^[ab]{1,3}(ab*?|b)/
- aabbbbb
- 0: aabb
- 1: b
-
-/ (?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* # optional leading comment
-(?: (?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-|
-" (?: # opening quote...
-[^\\\x80-\xff\n\015"] # Anything except backslash and quote
-| # or
-\\ [^\x80-\xff] # Escaped something (something != CR)
-)* " # closing quote
-) # initial word
-(?: (?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* \. (?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* (?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-|
-" (?: # opening quote...
-[^\\\x80-\xff\n\015"] # Anything except backslash and quote
-| # or
-\\ [^\x80-\xff] # Escaped something (something != CR)
-)* " # closing quote
-) )* # further okay, if led by a period
-(?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* @ (?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* (?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-| \[ # [
-(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff
-\] # ]
-) # initial subdomain
-(?: #
-(?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* \. # if led by a period...
-(?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* (?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-| \[ # [
-(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff
-\] # ]
-) # ...further okay
-)*
-# address
-| # or
-(?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-|
-" (?: # opening quote...
-[^\\\x80-\xff\n\015"] # Anything except backslash and quote
-| # or
-\\ [^\x80-\xff] # Escaped something (something != CR)
-)* " # closing quote
-) # one word, optionally followed by....
-(?:
-[^()<>@,;:".\\\[\]\x80-\xff\000-\010\012-\037] | # atom and space parts, or...
-\(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) | # comments, or...
-
-" (?: # opening quote...
-[^\\\x80-\xff\n\015"] # Anything except backslash and quote
-| # or
-\\ [^\x80-\xff] # Escaped something (something != CR)
-)* " # closing quote
-# quoted strings
-)*
-< (?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* # leading <
-(?: @ (?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* (?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-| \[ # [
-(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff
-\] # ]
-) # initial subdomain
-(?: #
-(?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* \. # if led by a period...
-(?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* (?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-| \[ # [
-(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff
-\] # ]
-) # ...further okay
-)*
-
-(?: (?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* , (?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* @ (?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* (?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-| \[ # [
-(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff
-\] # ]
-) # initial subdomain
-(?: #
-(?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* \. # if led by a period...
-(?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* (?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-| \[ # [
-(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff
-\] # ]
-) # ...further okay
-)*
-)* # further okay, if led by comma
-: # closing colon
-(?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* )? # optional route
-(?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-|
-" (?: # opening quote...
-[^\\\x80-\xff\n\015"] # Anything except backslash and quote
-| # or
-\\ [^\x80-\xff] # Escaped something (something != CR)
-)* " # closing quote
-) # initial word
-(?: (?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* \. (?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* (?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-|
-" (?: # opening quote...
-[^\\\x80-\xff\n\015"] # Anything except backslash and quote
-| # or
-\\ [^\x80-\xff] # Escaped something (something != CR)
-)* " # closing quote
-) )* # further okay, if led by a period
-(?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* @ (?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* (?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-| \[ # [
-(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff
-\] # ]
-) # initial subdomain
-(?: #
-(?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* \. # if led by a period...
-(?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* (?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-| \[ # [
-(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff
-\] # ]
-) # ...further okay
-)*
-# address spec
-(?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* > # trailing >
-# name and address
-) (?: [\040\t] | \(
-(?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] | \( (?: [^\\\x80-\xff\n\015()] | \\ [^\x80-\xff] )* \) )*
-\) )* # optional trailing comment
-/x
- Alan Other <user\@dom.ain>
- 0: Alan Other <user@dom.ain>
- <user\@dom.ain>
- 0: user@dom.ain
- user\@dom.ain
- 0: user@dom.ain
- \"A. Other\" <user.1234\@dom.ain> (a comment)
- 0: "A. Other" <user.1234@dom.ain> (a comment)
- A. Other <user.1234\@dom.ain> (a comment)
- 0: Other <user.1234@dom.ain> (a comment)
- \"/s=user/ou=host/o=place/prmd=uu.yy/admd= /c=gb/\"\@x400-re.lay
- 0: "/s=user/ou=host/o=place/prmd=uu.yy/admd= /c=gb/"@x400-re.lay
- A missing angle <user\@some.where
- 0: user@some.where
- *** Failers
-No match
- The quick brown fox
-No match
-
-/[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-# optional leading comment
-(?:
-(?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-# Atom
-| # or
-" # "
-[^\\\x80-\xff\n\015"] * # normal
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015"] * )* # ( special normal* )*
-" # "
-# Quoted string
-)
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-(?:
-\.
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-(?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-# Atom
-| # or
-" # "
-[^\\\x80-\xff\n\015"] * # normal
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015"] * )* # ( special normal* )*
-" # "
-# Quoted string
-)
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-# additional words
-)*
-@
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-(?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-|
-\[ # [
-(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff
-\] # ]
-)
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-# optional trailing comments
-(?:
-\.
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-(?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-|
-\[ # [
-(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff
-\] # ]
-)
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-# optional trailing comments
-)*
-# address
-| # or
-(?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-# Atom
-| # or
-" # "
-[^\\\x80-\xff\n\015"] * # normal
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015"] * )* # ( special normal* )*
-" # "
-# Quoted string
-)
-# leading word
-[^()<>@,;:".\\\[\]\x80-\xff\000-\010\012-\037] * # "normal" atoms and or spaces
-(?:
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-|
-" # "
-[^\\\x80-\xff\n\015"] * # normal
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015"] * )* # ( special normal* )*
-" # "
-) # "special" comment or quoted string
-[^()<>@,;:".\\\[\]\x80-\xff\000-\010\012-\037] * # more "normal"
-)*
-<
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-# <
-(?:
-@
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-(?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-|
-\[ # [
-(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff
-\] # ]
-)
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-# optional trailing comments
-(?:
-\.
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-(?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-|
-\[ # [
-(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff
-\] # ]
-)
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-# optional trailing comments
-)*
-(?: ,
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-@
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-(?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-|
-\[ # [
-(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff
-\] # ]
-)
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-# optional trailing comments
-(?:
-\.
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-(?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-|
-\[ # [
-(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff
-\] # ]
-)
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-# optional trailing comments
-)*
-)* # additional domains
-:
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-# optional trailing comments
-)? # optional route
-(?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-# Atom
-| # or
-" # "
-[^\\\x80-\xff\n\015"] * # normal
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015"] * )* # ( special normal* )*
-" # "
-# Quoted string
-)
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-(?:
-\.
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-(?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-# Atom
-| # or
-" # "
-[^\\\x80-\xff\n\015"] * # normal
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015"] * )* # ( special normal* )*
-" # "
-# Quoted string
-)
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-# additional words
-)*
-@
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-(?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-|
-\[ # [
-(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff
-\] # ]
-)
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-# optional trailing comments
-(?:
-\.
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-(?:
-[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+ # some number of atom characters...
-(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]) # ..not followed by something that could be part of an atom
-|
-\[ # [
-(?: [^\\\x80-\xff\n\015\[\]] | \\ [^\x80-\xff] )* # stuff
-\] # ]
-)
-[\040\t]* # Nab whitespace.
-(?:
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: # (
-(?: \\ [^\x80-\xff] |
-\( # (
-[^\\\x80-\xff\n\015()] * # normal*
-(?: \\ [^\x80-\xff] [^\\\x80-\xff\n\015()] * )* # (special normal*)*
-\) # )
-) # special
-[^\\\x80-\xff\n\015()] * # normal*
-)* # )*
-\) # )
-[\040\t]* )* # If comment found, allow more spaces.
-# optional trailing comments
-)*
-# address spec
-> # >
-# name and address
-)
-/x
- Alan Other <user\@dom.ain>
- 0: Alan Other <user@dom.ain>
- <user\@dom.ain>
- 0: user@dom.ain
- user\@dom.ain
- 0: user@dom.ain
- \"A. Other\" <user.1234\@dom.ain> (a comment)
- 0: "A. Other" <user.1234@dom.ain>
- A. Other <user.1234\@dom.ain> (a comment)
- 0: Other <user.1234@dom.ain>
- \"/s=user/ou=host/o=place/prmd=uu.yy/admd= /c=gb/\"\@x400-re.lay
- 0: "/s=user/ou=host/o=place/prmd=uu.yy/admd= /c=gb/"@x400-re.lay
- A missing angle <user\@some.where
- 0: user@some.where
- *** Failers
-No match
- The quick brown fox
-No match
-
-/abc\0def\00pqr\000xyz\0000AB/
- abc\0def\00pqr\000xyz\0000AB
- 0: abc\x00def\x00pqr\x00xyz\x000AB
- abc456 abc\0def\00pqr\000xyz\0000ABCDE
- 0: abc\x00def\x00pqr\x00xyz\x000AB
-
-/abc\x0def\x00pqr\x000xyz\x0000AB/
- abc\x0def\x00pqr\x000xyz\x0000AB
- 0: abc\x0def\x00pqr\x000xyz\x0000AB
- abc456 abc\x0def\x00pqr\x000xyz\x0000ABCDE
- 0: abc\x0def\x00pqr\x000xyz\x0000AB
-
-/^[\000-\037]/
- \0A
- 0: \x00
- \01B
- 0: \x01
- \037C
- 0: \x1f
-
-/\0*/
- \0\0\0\0
- 0: \x00\x00\x00\x00
-
-/A\x0{2,3}Z/
- The A\x0\x0Z
- 0: A\x00\x00Z
- An A\0\x0\0Z
- 0: A\x00\x00\x00Z
- *** Failers
-No match
- A\0Z
-No match
- A\0\x0\0\x0Z
-No match
-
-/^(cow|)\1(bell)/
- cowcowbell
- 0: cowcowbell
- 1: cow
- 2: bell
- bell
- 0: bell
- 1:
- 2: bell
- *** Failers
-No match
- cowbell
-No match
-
-/^\s/
- \040abc
- 0:
- \x0cabc
- 0: \x0c
- \nabc
- 0: \x0a
- \rabc
- 0: \x0d
- \tabc
- 0: \x09
- *** Failers
-No match
- abc
-No match
-
-/^a b
-
- c/x
- abc
- 0: abc
-
-/^(a|)\1*b/
- ab
- 0: ab
- 1: a
- aaaab
- 0: aaaab
- 1: a
- b
- 0: b
- 1:
- *** Failers
-No match
- acb
-No match
-
-/^(a|)\1+b/
- aab
- 0: aab
- 1: a
- aaaab
- 0: aaaab
- 1: a
- b
- 0: b
- 1:
- *** Failers
-No match
- ab
-No match
-
-/^(a|)\1?b/
- ab
- 0: ab
- 1: a
- aab
- 0: aab
- 1: a
- b
- 0: b
- 1:
- *** Failers
-No match
- acb
-No match
-
-/^(a|)\1{2}b/
- aaab
- 0: aaab
- 1: a
- b
- 0: b
- 1:
- *** Failers
-No match
- ab
-No match
- aab
-No match
- aaaab
-No match
-
-/^(a|)\1{2,3}b/
- aaab
- 0: aaab
- 1: a
- aaaab
- 0: aaaab
- 1: a
- b
- 0: b
- 1:
- *** Failers
-No match
- ab
-No match
- aab
-No match
- aaaaab
-No match
-
-/ab{1,3}bc/
- abbbbc
- 0: abbbbc
- abbbc
- 0: abbbc
- abbc
- 0: abbc
- *** Failers
-No match
- abc
-No match
- abbbbbc
-No match
-
-/([^.]*)\.([^:]*):[T ]+(.*)/
- track1.title:TBlah blah blah
- 0: track1.title:TBlah blah blah
- 1: track1
- 2: title
- 3: Blah blah blah
-
-/([^.]*)\.([^:]*):[T ]+(.*)/i
- track1.title:TBlah blah blah
- 0: track1.title:TBlah blah blah
- 1: track1
- 2: title
- 3: Blah blah blah
-
-/([^.]*)\.([^:]*):[t ]+(.*)/i
- track1.title:TBlah blah blah
- 0: track1.title:TBlah blah blah
- 1: track1
- 2: title
- 3: Blah blah blah
-
-/^[W-c]+$/
- WXY_^abc
- 0: WXY_^abc
- ***Failers
-No match
- wxy
-No match
-
-/^[W-c]+$/i
- WXY_^abc
- 0: WXY_^abc
- wxy_^ABC
- 0: wxy_^ABC
-
-/^[\x3f-\x5F]+$/i
- WXY_^abc
- 0: WXY_^abc
- wxy_^ABC
- 0: wxy_^ABC
-
-/^abc$/m
- abc
- 0: abc
- qqq\nabc
- 0: abc
- abc\nzzz
- 0: abc
- qqq\nabc\nzzz
- 0: abc
-
-/^abc$/
- abc
- 0: abc
- *** Failers
-No match
- qqq\nabc
-No match
- abc\nzzz
-No match
- qqq\nabc\nzzz
-No match
-
-/\Aabc\Z/m
- abc
- 0: abc
- abc\n
- 0: abc
- *** Failers
-No match
- qqq\nabc
-No match
- abc\nzzz
-No match
- qqq\nabc\nzzz
-No match
-
-/\A(.)*\Z/s
- abc\ndef
- 0: abc\x0adef
- 1: f
-
-/\A(.)*\Z/m
- *** Failers
- 0: *** Failers
- 1: s
- abc\ndef
-No match
-
-/(?:b)|(?::+)/
- b::c
- 0: b
- c::b
- 0: ::
-
-/[-az]+/
- az-
- 0: az-
- *** Failers
- 0: a
- b
-No match
-
-/[az-]+/
- za-
- 0: za-
- *** Failers
- 0: a
- b
-No match
-
-/[a\-z]+/
- a-z
- 0: a-z
- *** Failers
- 0: a
- b
-No match
-
-/[a-z]+/
- abcdxyz
- 0: abcdxyz
-
-/[\d-]+/
- 12-34
- 0: 12-34
- *** Failers
-No match
- aaa
-No match
-
-/[\d-z]+/
- 12-34z
- 0: 12-34z
- *** Failers
-No match
- aaa
-No match
-
-/\x5c/
- \\
- 0: \
-
-/\x20Z/
- the Zoo
- 0: Z
- *** Failers
-No match
- Zulu
-No match
-
-/(abc)\1/i
- abcabc
- 0: abcabc
- 1: abc
- ABCabc
- 0: ABCabc
- 1: ABC
- abcABC
- 0: abcABC
- 1: abc
-
-/ab{3cd/
- ab{3cd
- 0: ab{3cd
-
-/ab{3,cd/
- ab{3,cd
- 0: ab{3,cd
-
-/ab{3,4a}cd/
- ab{3,4a}cd
- 0: ab{3,4a}cd
-
-/{4,5a}bc/
- {4,5a}bc
- 0: {4,5a}bc
-
-/^a.b/
- a\rb
- 0: a\x0db
- *** Failers
-No match
- a\nb
-No match
-
-/abc$/
- abc
- 0: abc
- abc\n
- 0: abc
- *** Failers
-No match
- abc\ndef
-No match
-
-/(abc)\123/
- abc\x53
- 0: abcS
- 1: abc
-
-/(abc)\223/
- abc\x93
- 0: abc\x93
- 1: abc
-
-/(abc)\323/
- abc\xd3
- 0: abc\xd3
- 1: abc
-
-/(abc)\500/
- abc\x40
- 0: abc@
- 1: abc
- abc\100
- 0: abc@
- 1: abc
-
-/(abc)\5000/
- abc\x400
- 0: abc@0
- 1: abc
- abc\x40\x30
- 0: abc@0
- 1: abc
- abc\1000
- 0: abc@0
- 1: abc
- abc\100\x30
- 0: abc@0
- 1: abc
- abc\100\060
- 0: abc@0
- 1: abc
- abc\100\60
- 0: abc@0
- 1: abc
-
-/abc\81/
- abc\081
- 0: abc\x0081
- abc\0\x38\x31
- 0: abc\x0081
-
-/abc\91/
- abc\091
- 0: abc\x0091
- abc\0\x39\x31
- 0: abc\x0091
-
-/(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)\12\123/
- abcdefghijkllS
- 0: abcdefghijkllS
- 1: a
- 2: b
- 3: c
- 4: d
- 5: e
- 6: f
- 7: g
- 8: h
- 9: i
-10: j
-11: k
-12: l
-
-/(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)\12\123/
- abcdefghijk\12S
- 0: abcdefghijk\x0aS
- 1: a
- 2: b
- 3: c
- 4: d
- 5: e
- 6: f
- 7: g
- 8: h
- 9: i
-10: j
-11: k
-
-/ab\gdef/
- abgdef
- 0: abgdef
-
-/a{0}bc/
- bc
- 0: bc
-
-/(a|(bc)){0,0}?xyz/
- xyz
- 0: xyz
-
-/abc[\10]de/
- abc\010de
- 0: abc\x08de
-
-/abc[\1]de/
- abc\1de
- 0: abc\x01de
-
-/(abc)[\1]de/
- abc\1de
- 0: abc\x01de
- 1: abc
-
-/(?s)a.b/
- a\nb
- 0: a\x0ab
-
-/^([^a])([^\b])([^c]*)([^d]{3,4})/
- baNOTccccd
- 0: baNOTcccc
- 1: b
- 2: a
- 3: NOT
- 4: cccc
- baNOTcccd
- 0: baNOTccc
- 1: b
- 2: a
- 3: NOT
- 4: ccc
- baNOTccd
- 0: baNOTcc
- 1: b
- 2: a
- 3: NO
- 4: Tcc
- bacccd
- 0: baccc
- 1: b
- 2: a
- 3:
- 4: ccc
- *** Failers
- 0: *** Failers
- 1: *
- 2: *
- 3: * Fail
- 4: ers
- anything
-No match
- b\bc
-No match
- baccd
-No match
-
-/[^a]/
- Abc
- 0: A
-
-/[^a]/i
- Abc
- 0: b
-
-/[^a]+/
- AAAaAbc
- 0: AAA
-
-/[^a]+/i
- AAAaAbc
- 0: bc
-
-/[^a]+/
- bbb\nccc
- 0: bbb\x0accc
-
-/[^k]$/
- abc
- 0: c
- *** Failers
- 0: s
- abk
-No match
-
-/[^k]{2,3}$/
- abc
- 0: abc
- kbc
- 0: bc
- kabc
- 0: abc
- *** Failers
- 0: ers
- abk
-No match
- akb
-No match
- akk
-No match
-
-/^\d{8,}\@.+[^k]$/
- 12345678\@a.b.c.d
- 0: 12345678@a.b.c.d
- 123456789\@x.y.z
- 0: 123456789@x.y.z
- *** Failers
-No match
- 12345678\@x.y.uk
-No match
- 1234567\@a.b.c.d
-No match
-
-/(a)\1{8,}/
- aaaaaaaaa
- 0: aaaaaaaaa
- 1: a
- aaaaaaaaaa
- 0: aaaaaaaaaa
- 1: a
- *** Failers
-No match
- aaaaaaa
-No match
-
-/[^a]/
- aaaabcd
- 0: b
- aaAabcd
- 0: A
-
-/[^a]/i
- aaaabcd
- 0: b
- aaAabcd
- 0: b
-
-/[^az]/
- aaaabcd
- 0: b
- aaAabcd
- 0: A
-
-/[^az]/i
- aaaabcd
- 0: b
- aaAabcd
- 0: b
-
-/\000\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037\040\041\042\043\044\045\046\047\050\051\052\053\054\055\056\057\060\061\062\063\064\065\066\067\070\071\072\073\074\075\076\077\100\101\102\103\104\105\106\107\110\111\112\113\114\115\116\117\120\121\122\123\124\125\126\127\130\131\132\133\134\135\136\137\140\141\142\143\144\145\146\147\150\151\152\153\154\155\156\157\160\161\162\163\164\165\166\167\170\171\172\173\174\175\176\177\200\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217\220\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237\240\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337\340\341\342\343\344\345\346\347\350\351\352\353\354\355\356\357\360\361\362\363\364\365\366\367\370\371\372\373\374\375\376\377/
- \000\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037\040\041\042\043\044\045\046\047\050\051\052\053\054\055\056\057\060\061\062\063\064\065\066\067\070\071\072\073\074\075\076\077\100\101\102\103\104\105\106\107\110\111\112\113\114\115\116\117\120\121\122\123\124\125\126\127\130\131\132\133\134\135\136\137\140\141\142\143\144\145\146\147\150\151\152\153\154\155\156\157\160\161\162\163\164\165\166\167\170\171\172\173\174\175\176\177\200\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217\220\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237\240\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337\340\341\342\343\344\345\346\347\350\351\352\353\354\355\356\357\360\361\362\363\364\365\366\367\370\371\372\373\374\375\376\377
- 0: \x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff
-
-/P[^*]TAIRE[^*]{1,6}?LL/
- xxxxxxxxxxxPSTAIREISLLxxxxxxxxx
- 0: PSTAIREISLL
-
-/P[^*]TAIRE[^*]{1,}?LL/
- xxxxxxxxxxxPSTAIREISLLxxxxxxxxx
- 0: PSTAIREISLL
-
-/(\.\d\d[1-9]?)\d+/
- 1.230003938
- 0: .230003938
- 1: .23
- 1.875000282
- 0: .875000282
- 1: .875
- 1.235
- 0: .235
- 1: .23
-
-/(\.\d\d((?=0)|\d(?=\d)))/
- 1.230003938
- 0: .23
- 1: .23
- 2:
- 1.875000282
- 0: .875
- 1: .875
- 2: 5
- *** Failers
-No match
- 1.235
-No match
-
-/a(?)b/
- ab
- 0: ab
-
-/\b(foo)\s+(\w+)/i
- Food is on the foo table
- 0: foo table
- 1: foo
- 2: table
-
-/foo(.*)bar/
- The food is under the bar in the barn.
- 0: food is under the bar in the bar
- 1: d is under the bar in the
-
-/foo(.*?)bar/
- The food is under the bar in the barn.
- 0: food is under the bar
- 1: d is under the
-
-/(.*)(\d*)/
- I have 2 numbers: 53147
- 0: I have 2 numbers: 53147
- 1: I have 2 numbers: 53147
- 2:
-
-/(.*)(\d+)/
- I have 2 numbers: 53147
- 0: I have 2 numbers: 53147
- 1: I have 2 numbers: 5314
- 2: 7
-
-/(.*?)(\d*)/
- I have 2 numbers: 53147
- 0:
- 1:
- 2:
-
-/(.*?)(\d+)/
- I have 2 numbers: 53147
- 0: I have 2
- 1: I have
- 2: 2
-
-/(.*)(\d+)$/
- I have 2 numbers: 53147
- 0: I have 2 numbers: 53147
- 1: I have 2 numbers: 5314
- 2: 7
-
-/(.*?)(\d+)$/
- I have 2 numbers: 53147
- 0: I have 2 numbers: 53147
- 1: I have 2 numbers:
- 2: 53147
-
-/(.*)\b(\d+)$/
- I have 2 numbers: 53147
- 0: I have 2 numbers: 53147
- 1: I have 2 numbers:
- 2: 53147
-
-/(.*\D)(\d+)$/
- I have 2 numbers: 53147
- 0: I have 2 numbers: 53147
- 1: I have 2 numbers:
- 2: 53147
-
-/^\D*(?!123)/
- ABC123
- 0: AB
-
-/^(\D*)(?=\d)(?!123)/
- ABC445
- 0: ABC
- 1: ABC
- *** Failers
-No match
- ABC123
-No match
-
-/^[W-]46]/
- W46]789
- 0: W46]
- -46]789
- 0: -46]
- *** Failers
-No match
- Wall
-No match
- Zebra
-No match
- 42
-No match
- [abcd]
-No match
- ]abcd[
-No match
-
-/^[W-\]46]/
- W46]789
- 0: W
- Wall
- 0: W
- Zebra
- 0: Z
- Xylophone
- 0: X
- 42
- 0: 4
- [abcd]
- 0: [
- ]abcd[
- 0: ]
- \\backslash
- 0: \
- *** Failers
-No match
- -46]789
-No match
- well
-No match
-
-/\d\d\/\d\d\/\d\d\d\d/
- 01/01/2000
- 0: 01/01/2000
-
-/word (?:[a-zA-Z0-9]+ ){0,10}otherword/
- word cat dog elephant mussel cow horse canary baboon snake shark otherword
- 0: word cat dog elephant mussel cow horse canary baboon snake shark otherword
- word cat dog elephant mussel cow horse canary baboon snake shark
-No match
-
-/word (?:[a-zA-Z0-9]+ ){0,300}otherword/
- word cat dog elephant mussel cow horse canary baboon snake shark the quick brown fox and the lazy dog and several other words getting close to thirty by now I hope
-No match
-
-/^(a){0,0}/
- bcd
- 0:
- abc
- 0:
- aab
- 0:
-
-/^(a){0,1}/
- bcd
- 0:
- abc
- 0: a
- 1: a
- aab
- 0: a
- 1: a
-
-/^(a){0,2}/
- bcd
- 0:
- abc
- 0: a
- 1: a
- aab
- 0: aa
- 1: a
-
-/^(a){0,3}/
- bcd
- 0:
- abc
- 0: a
- 1: a
- aab
- 0: aa
- 1: a
- aaa
- 0: aaa
- 1: a
-
-/^(a){0,}/
- bcd
- 0:
- abc
- 0: a
- 1: a
- aab
- 0: aa
- 1: a
- aaa
- 0: aaa
- 1: a
- aaaaaaaa
- 0: aaaaaaaa
- 1: a
-
-/^(a){1,1}/
- bcd
-No match
- abc
- 0: a
- 1: a
- aab
- 0: a
- 1: a
-
-/^(a){1,2}/
- bcd
-No match
- abc
- 0: a
- 1: a
- aab
- 0: aa
- 1: a
-
-/^(a){1,3}/
- bcd
-No match
- abc
- 0: a
- 1: a
- aab
- 0: aa
- 1: a
- aaa
- 0: aaa
- 1: a
-
-/^(a){1,}/
- bcd
-No match
- abc
- 0: a
- 1: a
- aab
- 0: aa
- 1: a
- aaa
- 0: aaa
- 1: a
- aaaaaaaa
- 0: aaaaaaaa
- 1: a
-
-/.*\.gif/
- borfle\nbib.gif\nno
- 0: bib.gif
-
-/.{0,}\.gif/
- borfle\nbib.gif\nno
- 0: bib.gif
-
-/.*\.gif/m
- borfle\nbib.gif\nno
- 0: bib.gif
-
-/.*\.gif/s
- borfle\nbib.gif\nno
- 0: borfle\x0abib.gif
-
-/.*\.gif/ms
- borfle\nbib.gif\nno
- 0: borfle\x0abib.gif
-
-/.*$/
- borfle\nbib.gif\nno
- 0: no
-
-/.*$/m
- borfle\nbib.gif\nno
- 0: borfle
-
-/.*$/s
- borfle\nbib.gif\nno
- 0: borfle\x0abib.gif\x0ano
-
-/.*$/ms
- borfle\nbib.gif\nno
- 0: borfle\x0abib.gif\x0ano
-
-/.*$/
- borfle\nbib.gif\nno\n
- 0: no
-
-/.*$/m
- borfle\nbib.gif\nno\n
- 0: borfle
-
-/.*$/s
- borfle\nbib.gif\nno\n
- 0: borfle\x0abib.gif\x0ano\x0a
-
-/.*$/ms
- borfle\nbib.gif\nno\n
- 0: borfle\x0abib.gif\x0ano\x0a
-
-/(.*X|^B)/
- abcde\n1234Xyz
- 0: 1234X
- 1: 1234X
- BarFoo
- 0: B
- 1: B
- *** Failers
-No match
- abcde\nBar
-No match
-
-/(.*X|^B)/m
- abcde\n1234Xyz
- 0: 1234X
- 1: 1234X
- BarFoo
- 0: B
- 1: B
- abcde\nBar
- 0: B
- 1: B
-
-/(.*X|^B)/s
- abcde\n1234Xyz
- 0: abcde\x0a1234X
- 1: abcde\x0a1234X
- BarFoo
- 0: B
- 1: B
- *** Failers
-No match
- abcde\nBar
-No match
-
-/(.*X|^B)/ms
- abcde\n1234Xyz
- 0: abcde\x0a1234X
- 1: abcde\x0a1234X
- BarFoo
- 0: B
- 1: B
- abcde\nBar
- 0: B
- 1: B
-
-/(?s)(.*X|^B)/
- abcde\n1234Xyz
- 0: abcde\x0a1234X
- 1: abcde\x0a1234X
- BarFoo
- 0: B
- 1: B
- *** Failers
-No match
- abcde\nBar
-No match
-
-/(?s:.*X|^B)/
- abcde\n1234Xyz
- 0: abcde\x0a1234X
- BarFoo
- 0: B
- *** Failers
-No match
- abcde\nBar
-No match
-
-/^.*B/
- **** Failers
-No match
- abc\nB
-No match
-
-/(?s)^.*B/
- abc\nB
- 0: abc\x0aB
-
-/(?m)^.*B/
- abc\nB
- 0: B
-
-/(?ms)^.*B/
- abc\nB
- 0: abc\x0aB
-
-/(?ms)^B/
- abc\nB
- 0: B
-
-/(?s)B$/
- B\n
- 0: B
-
-/^[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]/
- 123456654321
- 0: 123456654321
-
-/^\d\d\d\d\d\d\d\d\d\d\d\d/
- 123456654321
- 0: 123456654321
-
-/^[\d][\d][\d][\d][\d][\d][\d][\d][\d][\d][\d][\d]/
- 123456654321
- 0: 123456654321
-
-/^[abc]{12}/
- abcabcabcabc
- 0: abcabcabcabc
-
-/^[a-c]{12}/
- abcabcabcabc
- 0: abcabcabcabc
-
-/^(a|b|c){12}/
- abcabcabcabc
- 0: abcabcabcabc
- 1: c
-
-/^[abcdefghijklmnopqrstuvwxy0123456789]/
- n
- 0: n
- *** Failers
-No match
- z
-No match
-
-/abcde{0,0}/
- abcd
- 0: abcd
- *** Failers
-No match
- abce
-No match
-
-/ab[cd]{0,0}e/
- abe
- 0: abe
- *** Failers
-No match
- abcde
-No match
-
-/ab(c){0,0}d/
- abd
- 0: abd
- *** Failers
-No match
- abcd
-No match
-
-/a(b*)/
- a
- 0: a
- 1:
- ab
- 0: ab
- 1: b
- abbbb
- 0: abbbb
- 1: bbbb
- *** Failers
- 0: a
- 1:
- bbbbb
-No match
-
-/ab\d{0}e/
- abe
- 0: abe
- *** Failers
-No match
- ab1e
-No match
-
-/"([^\\"]+|\\.)*"/
- the \"quick\" brown fox
- 0: "quick"
- 1: quick
- \"the \\\"quick\\\" brown fox\"
- 0: "the \"quick\" brown fox"
- 1: brown fox
-
-/.*?/g+
- abc
- 0:
- 0+ abc
- 0: a
- 0+ bc
- 0:
- 0+ bc
- 0: b
- 0+ c
- 0:
- 0+ c
- 0: c
- 0+
- 0:
- 0+
-
-/\b/g+
- abc
- 0:
- 0+ abc
- 0:
- 0+
-
-/\b/+g
- abc
- 0:
- 0+ abc
- 0:
- 0+
-
-//g
- abc
- 0:
- 0:
- 0:
- 0:
-
-/<tr([\w\W\s\d][^<>]{0,})><TD([\w\W\s\d][^<>]{0,})>([\d]{0,}\.)(.*)((<BR>([\w\W\s\d][^<>]{0,})|[\s]{0,}))<\/a><\/TD><TD([\w\W\s\d][^<>]{0,})>([\w\W\s\d][^<>]{0,})<\/TD><TD([\w\W\s\d][^<>]{0,})>([\w\W\s\d][^<>]{0,})<\/TD><\/TR>/is
- <TR BGCOLOR='#DBE9E9'><TD align=left valign=top>43.<a href='joblist.cfm?JobID=94 6735&Keyword='>Word Processor<BR>(N-1286)</a></TD><TD align=left valign=top>Lega lstaff.com</TD><TD align=left valign=top>CA - Statewide</TD></TR>
- 0: <TR BGCOLOR='#DBE9E9'><TD align=left valign=top>43.<a href='joblist.cfm?JobID=94 6735&Keyword='>Word Processor<BR>(N-1286)</a></TD><TD align=left valign=top>Lega lstaff.com</TD><TD align=left valign=top>CA - Statewide</TD></TR>
- 1: BGCOLOR='#DBE9E9'
- 2: align=left valign=top
- 3: 43.
- 4: <a href='joblist.cfm?JobID=94 6735&Keyword='>Word Processor<BR>(N-1286)
- 5:
- 6:
- 7: <unset>
- 8: align=left valign=top
- 9: Lega lstaff.com
-10: align=left valign=top
-11: CA - Statewide
-
-/a[^a]b/
- acb
- 0: acb
- a\nb
- 0: a\x0ab
-
-/a.b/
- acb
- 0: acb
- *** Failers
-No match
- a\nb
-No match
-
-/a[^a]b/s
- acb
- 0: acb
- a\nb
- 0: a\x0ab
-
-/a.b/s
- acb
- 0: acb
- a\nb
- 0: a\x0ab
-
-/^(b+?|a){1,2}?c/
- bac
- 0: bac
- 1: a
- bbac
- 0: bbac
- 1: a
- bbbac
- 0: bbbac
- 1: a
- bbbbac
- 0: bbbbac
- 1: a
- bbbbbac
- 0: bbbbbac
- 1: a
-
-/^(b+|a){1,2}?c/
- bac
- 0: bac
- 1: a
- bbac
- 0: bbac
- 1: a
- bbbac
- 0: bbbac
- 1: a
- bbbbac
- 0: bbbbac
- 1: a
- bbbbbac
- 0: bbbbbac
- 1: a
-
-/(?!\A)x/m
- x\nb\n
-No match
- a\bx\n
- 0: x
-
-/\x0{ab}/
- \0{ab}
- 0: \x00{ab}
-
-/(A|B)*?CD/
- CD
- 0: CD
-
-/(A|B)*CD/
- CD
- 0: CD
-
-/(AB)*?\1/
- ABABAB
- 0: ABAB
- 1: AB
-
-/(AB)*\1/
- ABABAB
- 0: ABABAB
- 1: AB
-
-/(?<!bar)foo/
- foo
- 0: foo
- catfood
- 0: foo
- arfootle
- 0: foo
- rfoosh
- 0: foo
- *** Failers
-No match
- barfoo
-No match
- towbarfoo
-No match
-
-/\w{3}(?<!bar)foo/
- catfood
- 0: catfoo
- *** Failers
-No match
- foo
-No match
- barfoo
-No match
- towbarfoo
-No match
-
-/(?<=(foo)a)bar/
- fooabar
- 0: bar
- 1: foo
- *** Failers
-No match
- bar
-No match
- foobbar
-No match
-
-/\Aabc\z/m
- abc
- 0: abc
- *** Failers
-No match
- abc\n
-No match
- qqq\nabc
-No match
- abc\nzzz
-No match
- qqq\nabc\nzzz
-No match
-
-"(?>.*/)foo"
- /this/is/a/very/long/line/in/deed/with/very/many/slashes/in/it/you/see/
-No match
-
-"(?>.*/)foo"
- /this/is/a/very/long/line/in/deed/with/very/many/slashes/in/and/foo
- 0: /this/is/a/very/long/line/in/deed/with/very/many/slashes/in/and/foo
-
-/(?>(\.\d\d[1-9]?))\d+/
- 1.230003938
- 0: .230003938
- 1: .23
- 1.875000282
- 0: .875000282
- 1: .875
- *** Failers
-No match
- 1.235
-No match
-
-/^((?>\w+)|(?>\s+))*$/
- now is the time for all good men to come to the aid of the party
- 0: now is the time for all good men to come to the aid of the party
- 1: party
- *** Failers
-No match
- this is not a line with only words and spaces!
-No match
-
-/(\d+)(\w)/
- 12345a
- 0: 12345a
- 1: 12345
- 2: a
- 12345+
- 0: 12345
- 1: 1234
- 2: 5
-
-/((?>\d+))(\w)/
- 12345a
- 0: 12345a
- 1: 12345
- 2: a
- *** Failers
-No match
- 12345+
-No match
-
-/(?>a+)b/
- aaab
- 0: aaab
-
-/((?>a+)b)/
- aaab
- 0: aaab
- 1: aaab
-
-/(?>(a+))b/
- aaab
- 0: aaab
- 1: aaa
-
-/(?>b)+/
- aaabbbccc
- 0: bbb
-
-/(?>a+|b+|c+)*c/
- aaabbbbccccd
- 0: aaabbbbc
-
-/((?>[^()]+)|\([^()]*\))+/
- ((abc(ade)ufh()()x
- 0: abc(ade)ufh()()x
- 1: x
-
-/\(((?>[^()]+)|\([^()]+\))+\)/
- (abc)
- 0: (abc)
- 1: abc
- (abc(def)xyz)
- 0: (abc(def)xyz)
- 1: xyz
- *** Failers
-No match
- ((()aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-No match
-
-/a(?-i)b/i
- ab
- 0: ab
- Ab
- 0: Ab
- *** Failers
-No match
- aB
-No match
- AB
-No match
-
-/(a (?x)b c)d e/
- a bcd e
- 0: a bcd e
- 1: a bc
- *** Failers
-No match
- a b cd e
-No match
- abcd e
-No match
- a bcde
-No match
-
-/(a b(?x)c d (?-x)e f)/
- a bcde f
- 0: a bcde f
- 1: a bcde f
- *** Failers
-No match
- abcdef
-No match
-
-/(a(?i)b)c/
- abc
- 0: abc
- 1: ab
- aBc
- 0: aBc
- 1: aB
- *** Failers
-No match
- abC
-No match
- aBC
-No match
- Abc
-No match
- ABc
-No match
- ABC
-No match
- AbC
-No match
-
-/a(?i:b)c/
- abc
- 0: abc
- aBc
- 0: aBc
- *** Failers
-No match
- ABC
-No match
- abC
-No match
- aBC
-No match
-
-/a(?i:b)*c/
- aBc
- 0: aBc
- aBBc
- 0: aBBc
- *** Failers
-No match
- aBC
-No match
- aBBC
-No match
-
-/a(?=b(?i)c)\w\wd/
- abcd
- 0: abcd
- abCd
- 0: abCd
- *** Failers
-No match
- aBCd
-No match
- abcD
-No match
-
-/(?s-i:more.*than).*million/i
- more than million
- 0: more than million
- more than MILLION
- 0: more than MILLION
- more \n than Million
- 0: more \x0a than Million
- *** Failers
-No match
- MORE THAN MILLION
-No match
- more \n than \n million
-No match
-
-/(?:(?s-i)more.*than).*million/i
- more than million
- 0: more than million
- more than MILLION
- 0: more than MILLION
- more \n than Million
- 0: more \x0a than Million
- *** Failers
-No match
- MORE THAN MILLION
-No match
- more \n than \n million
-No match
-
-/(?>a(?i)b+)+c/
- abc
- 0: abc
- aBbc
- 0: aBbc
- aBBc
- 0: aBBc
- *** Failers
-No match
- Abc
-No match
- abAb
-No match
- abbC
-No match
-
-/(?=a(?i)b)\w\wc/
- abc
- 0: abc
- aBc
- 0: aBc
- *** Failers
-No match
- Ab
-No match
- abC
-No match
- aBC
-No match
-
-/(?<=a(?i)b)(\w\w)c/
- abxxc
- 0: xxc
- 1: xx
- aBxxc
- 0: xxc
- 1: xx
- *** Failers
-No match
- Abxxc
-No match
- ABxxc
-No match
- abxxC
-No match
-
-/(?:(a)|b)(?(1)A|B)/
- aA
- 0: aA
- 1: a
- bB
- 0: bB
- *** Failers
-No match
- aB
-No match
- bA
-No match
-
-/^(a)?(?(1)a|b)+$/
- aa
- 0: aa
- 1: a
- b
- 0: b
- bb
- 0: bb
- *** Failers
-No match
- ab
-No match
-
-/^(?(?=abc)\w{3}:|\d\d)$/
- abc:
- 0: abc:
- 12
- 0: 12
- *** Failers
-No match
- 123
-No match
- xyz
-No match
-
-/^(?(?!abc)\d\d|\w{3}:)$/
- abc:
- 0: abc:
- 12
- 0: 12
- *** Failers
-No match
- 123
-No match
- xyz
-No match
-
-/(?(?<=foo)bar|cat)/
- foobar
- 0: bar
- cat
- 0: cat
- fcat
- 0: cat
- focat
- 0: cat
- *** Failers
-No match
- foocat
-No match
-
-/(?(?<!foo)cat|bar)/
- foobar
- 0: bar
- cat
- 0: cat
- fcat
- 0: cat
- focat
- 0: cat
- *** Failers
-No match
- foocat
-No match
-
-/( \( )? [^()]+ (?(1) \) |) /x
- abcd
- 0: abcd
- (abcd)
- 0: (abcd)
- 1: (
- the quick (abcd) fox
- 0: the quick
- (abcd
- 0: abcd
-
-/( \( )? [^()]+ (?(1) \) ) /x
- abcd
- 0: abcd
- (abcd)
- 0: (abcd)
- 1: (
- the quick (abcd) fox
- 0: the quick
- (abcd
- 0: abcd
-
-/^(?(2)a|(1)(2))+$/
- 12
- 0: 12
- 1: 1
- 2: 2
- 12a
- 0: 12a
- 1: 1
- 2: 2
- 12aa
- 0: 12aa
- 1: 1
- 2: 2
- *** Failers
-No match
- 1234
-No match
-
-/((?i)blah)\s+\1/
- blah blah
- 0: blah blah
- 1: blah
- BLAH BLAH
- 0: BLAH BLAH
- 1: BLAH
- Blah Blah
- 0: Blah Blah
- 1: Blah
- blaH blaH
- 0: blaH blaH
- 1: blaH
- *** Failers
-No match
- blah BLAH
-No match
- Blah blah
-No match
- blaH blah
-No match
-
-/((?i)blah)\s+(?i:\1)/
- blah blah
- 0: blah blah
- 1: blah
- BLAH BLAH
- 0: BLAH BLAH
- 1: BLAH
- Blah Blah
- 0: Blah Blah
- 1: Blah
- blaH blaH
- 0: blaH blaH
- 1: blaH
- blah BLAH
- 0: blah BLAH
- 1: blah
- Blah blah
- 0: Blah blah
- 1: Blah
- blaH blah
- 0: blaH blah
- 1: blaH
-
-/(?>a*)*/
- a
- 0: a
- aa
- 0: aa
- aaaa
- 0: aaaa
-
-/(abc|)+/
- abc
- 0: abc
- 1:
- abcabc
- 0: abcabc
- 1:
- abcabcabc
- 0: abcabcabc
- 1:
- xyz
- 0:
- 1:
-
-/([a]*)*/
- a
- 0: a
- 1:
- aaaaa
- 0: aaaaa
- 1:
-
-/([ab]*)*/
- a
- 0: a
- 1:
- b
- 0: b
- 1:
- ababab
- 0: ababab
- 1:
- aaaabcde
- 0: aaaab
- 1:
- bbbb
- 0: bbbb
- 1:
-
-/([^a]*)*/
- b
- 0: b
- 1:
- bbbb
- 0: bbbb
- 1:
- aaa
- 0:
- 1:
-
-/([^ab]*)*/
- cccc
- 0: cccc
- 1:
- abab
- 0:
- 1:
-
-/([a]*?)*/
- a
- 0:
- 1:
- aaaa
- 0:
- 1:
-
-/([ab]*?)*/
- a
- 0:
- 1:
- b
- 0:
- 1:
- abab
- 0:
- 1:
- baba
- 0:
- 1:
-
-/([^a]*?)*/
- b
- 0:
- 1:
- bbbb
- 0:
- 1:
- aaa
- 0:
- 1:
-
-/([^ab]*?)*/
- c
- 0:
- 1:
- cccc
- 0:
- 1:
- baba
- 0:
- 1:
-
-/(?>a*)*/
- a
- 0: a
- aaabcde
- 0: aaa
-
-/((?>a*))*/
- aaaaa
- 0: aaaaa
- 1:
- aabbaa
- 0: aa
- 1:
-
-/((?>a*?))*/
- aaaaa
- 0:
- 1:
- aabbaa
- 0:
- 1:
-
-/(?(?=[^a-z]+[a-z]) \d{2}-[a-z]{3}-\d{2} | \d{2}-\d{2}-\d{2} ) /x
- 12-sep-98
- 0: 12-sep-98
- 12-09-98
- 0: 12-09-98
- *** Failers
-No match
- sep-12-98
-No match
-
-/(?<=(foo))bar\1/
- foobarfoo
- 0: barfoo
- 1: foo
- foobarfootling
- 0: barfoo
- 1: foo
- *** Failers
-No match
- foobar
-No match
- barfoo
-No match
-
-/(?i:saturday|sunday)/
- saturday
- 0: saturday
- sunday
- 0: sunday
- Saturday
- 0: Saturday
- Sunday
- 0: Sunday
- SATURDAY
- 0: SATURDAY
- SUNDAY
- 0: SUNDAY
- SunDay
- 0: SunDay
-
-/(a(?i)bc|BB)x/
- abcx
- 0: abcx
- 1: abc
- aBCx
- 0: aBCx
- 1: aBC
- bbx
- 0: bbx
- 1: bb
- BBx
- 0: BBx
- 1: BB
- *** Failers
-No match
- abcX
-No match
- aBCX
-No match
- bbX
-No match
- BBX
-No match
-
-/^([ab](?i)[cd]|[ef])/
- ac
- 0: ac
- 1: ac
- aC
- 0: aC
- 1: aC
- bD
- 0: bD
- 1: bD
- elephant
- 0: e
- 1: e
- Europe
- 0: E
- 1: E
- frog
- 0: f
- 1: f
- France
- 0: F
- 1: F
- *** Failers
-No match
- Africa
-No match
-
-/^(ab|a(?i)[b-c](?m-i)d|x(?i)y|z)/
- ab
- 0: ab
- 1: ab
- aBd
- 0: aBd
- 1: aBd
- xy
- 0: xy
- 1: xy
- xY
- 0: xY
- 1: xY
- zebra
- 0: z
- 1: z
- Zambesi
- 0: Z
- 1: Z
- *** Failers
-No match
- aCD
-No match
- XY
-No match
-
-/(?<=foo\n)^bar/m
- foo\nbar
- 0: bar
- *** Failers
-No match
- bar
-No match
- baz\nbar
-No match
-
-/(?<=(?<!foo)bar)baz/
- barbaz
- 0: baz
- barbarbaz
- 0: baz
- koobarbaz
- 0: baz
- *** Failers
-No match
- baz
-No match
- foobarbaz
-No match
-
-/The case of aaaaaa is missed out below because I think Perl 5.005_02 gets/
-/it wrong; it sets $1 to aaa rather than aa. Compare the following test,/
-No match
-/where it does set $1 to aa when matching aaaaaa./
-No match
-
-/^(a\1?){4}$/
- a
-No match
- aa
-No match
- aaa
-No match
- aaaa
- 0: aaaa
- 1: a
- aaaaa
- 0: aaaaa
- 1: a
- aaaaaaa
- 0: aaaaaaa
- 1: a
- aaaaaaaa
-No match
- aaaaaaaaa
-No match
- aaaaaaaaaa
- 0: aaaaaaaaaa
- 1: aaaa
- aaaaaaaaaaa
-No match
- aaaaaaaaaaaa
-No match
- aaaaaaaaaaaaa
-No match
- aaaaaaaaaaaaaa
-No match
- aaaaaaaaaaaaaaa
-No match
- aaaaaaaaaaaaaaaa
-No match
-
-/^(a\1?)(a\1?)(a\2?)(a\3?)$/
- a
-No match
- aa
-No match
- aaa
-No match
- aaaa
- 0: aaaa
- 1: a
- 2: a
- 3: a
- 4: a
- aaaaa
- 0: aaaaa
- 1: a
- 2: aa
- 3: a
- 4: a
- aaaaaa
- 0: aaaaaa
- 1: a
- 2: aa
- 3: a
- 4: aa
- aaaaaaa
- 0: aaaaaaa
- 1: a
- 2: aa
- 3: aaa
- 4: a
- aaaaaaaa
-No match
- aaaaaaaaa
-No match
- aaaaaaaaaa
- 0: aaaaaaaaaa
- 1: a
- 2: aa
- 3: aaa
- 4: aaaa
- aaaaaaaaaaa
-No match
- aaaaaaaaaaaa
-No match
- aaaaaaaaaaaaa
-No match
- aaaaaaaaaaaaaa
-No match
- aaaaaaaaaaaaaaa
-No match
- aaaaaaaaaaaaaaaa
-No match
-
-/The following tests are taken from the Perl 5.005 test suite; some of them/
-/are compatible with 5.004, but I'd rather not have to sort them out./
-No match
-
-/abc/
- abc
- 0: abc
- xabcy
- 0: abc
- ababc
- 0: abc
- *** Failers
-No match
- xbc
-No match
- axc
-No match
- abx
-No match
-
-/ab*c/
- abc
- 0: abc
-
-/ab*bc/
- abc
- 0: abc
- abbc
- 0: abbc
- abbbbc
- 0: abbbbc
-
-/.{1}/
- abbbbc
- 0: a
-
-/.{3,4}/
- abbbbc
- 0: abbb
-
-/ab{0,}bc/
- abbbbc
- 0: abbbbc
-
-/ab+bc/
- abbc
- 0: abbc
- *** Failers
-No match
- abc
-No match
- abq
-No match
-
-/ab{1,}bc/
-
-/ab+bc/
- abbbbc
- 0: abbbbc
-
-/ab{1,}bc/
- abbbbc
- 0: abbbbc
-
-/ab{1,3}bc/
- abbbbc
- 0: abbbbc
-
-/ab{3,4}bc/
- abbbbc
- 0: abbbbc
-
-/ab{4,5}bc/
- *** Failers
-No match
- abq
-No match
- abbbbc
-No match
-
-/ab?bc/
- abbc
- 0: abbc
- abc
- 0: abc
-
-/ab{0,1}bc/
- abc
- 0: abc
-
-/ab?bc/
-
-/ab?c/
- abc
- 0: abc
-
-/ab{0,1}c/
- abc
- 0: abc
-
-/^abc$/
- abc
- 0: abc
- *** Failers
-No match
- abbbbc
-No match
- abcc
-No match
-
-/^abc/
- abcc
- 0: abc
-
-/^abc$/
-
-/abc$/
- aabc
- 0: abc
- *** Failers
-No match
- aabc
- 0: abc
- aabcd
-No match
-
-/^/
- abc
- 0:
-
-/$/
- abc
- 0:
-
-/a.c/
- abc
- 0: abc
- axc
- 0: axc
-
-/a.*c/
- axyzc
- 0: axyzc
-
-/a[bc]d/
- abd
- 0: abd
- *** Failers
-No match
- axyzd
-No match
- abc
-No match
-
-/a[b-d]e/
- ace
- 0: ace
-
-/a[b-d]/
- aac
- 0: ac
-
-/a[-b]/
- a-
- 0: a-
-
-/a[b-]/
- a-
- 0: a-
-
-/a]/
- a]
- 0: a]
-
-/a[]]b/
- a]b
- 0: a]b
-
-/a[^bc]d/
- aed
- 0: aed
- *** Failers
-No match
- abd
-No match
- abd
-No match
-
-/a[^-b]c/
- adc
- 0: adc
-
-/a[^]b]c/
- adc
- 0: adc
- *** Failers
-No match
- a-c
- 0: a-c
- a]c
-No match
-
-/\ba\b/
- a-
- 0: a
- -a
- 0: a
- -a-
- 0: a
-
-/\by\b/
- *** Failers
-No match
- xy
-No match
- yz
-No match
- xyz
-No match
-
-/\Ba\B/
- *** Failers
- 0: a
- a-
-No match
- -a
-No match
- -a-
-No match
-
-/\By\b/
- xy
- 0: y
-
-/\by\B/
- yz
- 0: y
-
-/\By\B/
- xyz
- 0: y
-
-/\w/
- a
- 0: a
-
-/\W/
- -
- 0: -
- *** Failers
- 0: *
- -
- 0: -
- a
-No match
-
-/a\sb/
- a b
- 0: a b
-
-/a\Sb/
- a-b
- 0: a-b
- *** Failers
-No match
- a-b
- 0: a-b
- a b
-No match
-
-/\d/
- 1
- 0: 1
-
-/\D/
- -
- 0: -
- *** Failers
- 0: *
- -
- 0: -
- 1
-No match
-
-/[\w]/
- a
- 0: a
-
-/[\W]/
- -
- 0: -
- *** Failers
- 0: *
- -
- 0: -
- a
-No match
-
-/a[\s]b/
- a b
- 0: a b
-
-/a[\S]b/
- a-b
- 0: a-b
- *** Failers
-No match
- a-b
- 0: a-b
- a b
-No match
-
-/[\d]/
- 1
- 0: 1
-
-/[\D]/
- -
- 0: -
- *** Failers
- 0: *
- -
- 0: -
- 1
-No match
-
-/ab|cd/
- abc
- 0: ab
- abcd
- 0: ab
-
-/()ef/
- def
- 0: ef
- 1:
-
-/$b/
-
-/a\(b/
- a(b
- 0: a(b
-
-/a\(*b/
- ab
- 0: ab
- a((b
- 0: a((b
-
-/a\\b/
- a\b
-No match
-
-/((a))/
- abc
- 0: a
- 1: a
- 2: a
-
-/(a)b(c)/
- abc
- 0: abc
- 1: a
- 2: c
-
-/a+b+c/
- aabbabc
- 0: abc
-
-/a{1,}b{1,}c/
- aabbabc
- 0: abc
-
-/a.+?c/
- abcabc
- 0: abc
-
-/(a+|b)*/
- ab
- 0: ab
- 1: b
-
-/(a+|b){0,}/
- ab
- 0: ab
- 1: b
-
-/(a+|b)+/
- ab
- 0: ab
- 1: b
-
-/(a+|b){1,}/
- ab
- 0: ab
- 1: b
-
-/(a+|b)?/
- ab
- 0: a
- 1: a
-
-/(a+|b){0,1}/
- ab
- 0: a
- 1: a
-
-/[^ab]*/
- cde
- 0: cde
-
-/abc/
- *** Failers
-No match
- b
-No match
-
-
-/a*/
-
-
-/([abc])*d/
- abbbcd
- 0: abbbcd
- 1: c
-
-/([abc])*bcd/
- abcd
- 0: abcd
- 1: a
-
-/a|b|c|d|e/
- e
- 0: e
-
-/(a|b|c|d|e)f/
- ef
- 0: ef
- 1: e
-
-/abcd*efg/
- abcdefg
- 0: abcdefg
-
-/ab*/
- xabyabbbz
- 0: ab
- xayabbbz
- 0: a
-
-/(ab|cd)e/
- abcde
- 0: cde
- 1: cd
-
-/[abhgefdc]ij/
- hij
- 0: hij
-
-/^(ab|cd)e/
-
-/(abc|)ef/
- abcdef
- 0: ef
- 1:
-
-/(a|b)c*d/
- abcd
- 0: bcd
- 1: b
-
-/(ab|ab*)bc/
- abc
- 0: abc
- 1: a
-
-/a([bc]*)c*/
- abc
- 0: abc
- 1: bc
-
-/a([bc]*)(c*d)/
- abcd
- 0: abcd
- 1: bc
- 2: d
-
-/a([bc]+)(c*d)/
- abcd
- 0: abcd
- 1: bc
- 2: d
-
-/a([bc]*)(c+d)/
- abcd
- 0: abcd
- 1: b
- 2: cd
-
-/a[bcd]*dcdcde/
- adcdcde
- 0: adcdcde
-
-/a[bcd]+dcdcde/
- *** Failers
-No match
- abcde
-No match
- adcdcde
-No match
-
-/(ab|a)b*c/
- abc
- 0: abc
- 1: ab
-
-/((a)(b)c)(d)/
- abcd
- 0: abcd
- 1: abc
- 2: a
- 3: b
- 4: d
-
-/[a-zA-Z_][a-zA-Z0-9_]*/
- alpha
- 0: alpha
-
-/^a(bc+|b[eh])g|.h$/
- abh
- 0: bh
-
-/(bc+d$|ef*g.|h?i(j|k))/
- effgz
- 0: effgz
- 1: effgz
- ij
- 0: ij
- 1: ij
- 2: j
- reffgz
- 0: effgz
- 1: effgz
- *** Failers
-No match
- effg
-No match
- bcdd
-No match
-
-/((((((((((a))))))))))/
- a
- 0: a
- 1: a
- 2: a
- 3: a
- 4: a
- 5: a
- 6: a
- 7: a
- 8: a
- 9: a
-10: a
-
-/((((((((((a))))))))))\10/
- aa
- 0: aa
- 1: a
- 2: a
- 3: a
- 4: a
- 5: a
- 6: a
- 7: a
- 8: a
- 9: a
-10: a
-
-/(((((((((a)))))))))/
- a
- 0: a
- 1: a
- 2: a
- 3: a
- 4: a
- 5: a
- 6: a
- 7: a
- 8: a
- 9: a
-
-/multiple words of text/
- *** Failers
-No match
- aa
-No match
- uh-uh
-No match
-
-/multiple words/
- multiple words, yeah
- 0: multiple words
-
-/(.*)c(.*)/
- abcde
- 0: abcde
- 1: ab
- 2: de
-
-/\((.*), (.*)\)/
- (a, b)
- 0: (a, b)
- 1: a
- 2: b
-
-/[k]/
-
-/abcd/
- abcd
- 0: abcd
-
-/a(bc)d/
- abcd
- 0: abcd
- 1: bc
-
-/a[-]?c/
- ac
- 0: ac
-
-/(abc)\1/
- abcabc
- 0: abcabc
- 1: abc
-
-/([a-c]*)\1/
- abcabc
- 0: abcabc
- 1: abc
-
-/(a)|\1/
- a
- 0: a
- 1: a
- *** Failers
- 0: a
- 1: a
- ab
- 0: a
- 1: a
- x
-No match
-
-/(([a-c])b*?\2)*/
- ababbbcbc
- 0: ababb
- 1: bb
- 2: b
-
-/(([a-c])b*?\2){3}/
- ababbbcbc
- 0: ababbbcbc
- 1: cbc
- 2: c
-
-/((\3|b)\2(a)x)+/
- aaaxabaxbaaxbbax
- 0: bbax
- 1: bbax
- 2: b
- 3: a
-
-/((\3|b)\2(a)){2,}/
- bbaababbabaaaaabbaaaabba
- 0: bbaaaabba
- 1: bba
- 2: b
- 3: a
-
-/abc/i
- ABC
- 0: ABC
- XABCY
- 0: ABC
- ABABC
- 0: ABC
- *** Failers
-No match
- aaxabxbaxbbx
-No match
- XBC
-No match
- AXC
-No match
- ABX
-No match
-
-/ab*c/i
- ABC
- 0: ABC
-
-/ab*bc/i
- ABC
- 0: ABC
- ABBC
- 0: ABBC
-
-/ab*?bc/i
- ABBBBC
- 0: ABBBBC
-
-/ab{0,}?bc/i
- ABBBBC
- 0: ABBBBC
-
-/ab+?bc/i
- ABBC
- 0: ABBC
-
-/ab+bc/i
- *** Failers
-No match
- ABC
-No match
- ABQ
-No match
-
-/ab{1,}bc/i
-
-/ab+bc/i
- ABBBBC
- 0: ABBBBC
-
-/ab{1,}?bc/i
- ABBBBC
- 0: ABBBBC
-
-/ab{1,3}?bc/i
- ABBBBC
- 0: ABBBBC
-
-/ab{3,4}?bc/i
- ABBBBC
- 0: ABBBBC
-
-/ab{4,5}?bc/i
- *** Failers
-No match
- ABQ
-No match
- ABBBBC
-No match
-
-/ab??bc/i
- ABBC
- 0: ABBC
- ABC
- 0: ABC
-
-/ab{0,1}?bc/i
- ABC
- 0: ABC
-
-/ab??bc/i
-
-/ab??c/i
- ABC
- 0: ABC
-
-/ab{0,1}?c/i
- ABC
- 0: ABC
-
-/^abc$/i
- ABC
- 0: ABC
- *** Failers
-No match
- ABBBBC
-No match
- ABCC
-No match
-
-/^abc/i
- ABCC
- 0: ABC
-
-/^abc$/i
-
-/abc$/i
- AABC
- 0: ABC
-
-/^/i
- ABC
- 0:
-
-/$/i
- ABC
- 0:
-
-/a.c/i
- ABC
- 0: ABC
- AXC
- 0: AXC
-
-/a.*?c/i
- AXYZC
- 0: AXYZC
-
-/a.*c/i
- *** Failers
-No match
- AABC
- 0: AABC
- AXYZD
-No match
-
-/a[bc]d/i
- ABD
- 0: ABD
-
-/a[b-d]e/i
- ACE
- 0: ACE
- *** Failers
-No match
- ABC
-No match
- ABD
-No match
-
-/a[b-d]/i
- AAC
- 0: AC
-
-/a[-b]/i
- A-
- 0: A-
-
-/a[b-]/i
- A-
- 0: A-
-
-/a]/i
- A]
- 0: A]
-
-/a[]]b/i
- A]B
- 0: A]B
-
-/a[^bc]d/i
- AED
- 0: AED
-
-/a[^-b]c/i
- ADC
- 0: ADC
- *** Failers
-No match
- ABD
-No match
- A-C
-No match
-
-/a[^]b]c/i
- ADC
- 0: ADC
-
-/ab|cd/i
- ABC
- 0: AB
- ABCD
- 0: AB
-
-/()ef/i
- DEF
- 0: EF
- 1:
-
-/$b/i
- *** Failers
-No match
- A]C
-No match
- B
-No match
-
-/a\(b/i
- A(B
- 0: A(B
-
-/a\(*b/i
- AB
- 0: AB
- A((B
- 0: A((B
-
-/a\\b/i
- A\B
-No match
-
-/((a))/i
- ABC
- 0: A
- 1: A
- 2: A
-
-/(a)b(c)/i
- ABC
- 0: ABC
- 1: A
- 2: C
-
-/a+b+c/i
- AABBABC
- 0: ABC
-
-/a{1,}b{1,}c/i
- AABBABC
- 0: ABC
-
-/a.+?c/i
- ABCABC
- 0: ABC
-
-/a.*?c/i
- ABCABC
- 0: ABC
-
-/a.{0,5}?c/i
- ABCABC
- 0: ABC
-
-/(a+|b)*/i
- AB
- 0: AB
- 1: B
-
-/(a+|b){0,}/i
- AB
- 0: AB
- 1: B
-
-/(a+|b)+/i
- AB
- 0: AB
- 1: B
-
-/(a+|b){1,}/i
- AB
- 0: AB
- 1: B
-
-/(a+|b)?/i
- AB
- 0: A
- 1: A
-
-/(a+|b){0,1}/i
- AB
- 0: A
- 1: A
-
-/(a+|b){0,1}?/i
- AB
- 0:
-
-/[^ab]*/i
- CDE
- 0: CDE
-
-/abc/i
-
-/a*/i
-
-
-/([abc])*d/i
- ABBBCD
- 0: ABBBCD
- 1: C
-
-/([abc])*bcd/i
- ABCD
- 0: ABCD
- 1: A
-
-/a|b|c|d|e/i
- E
- 0: E
-
-/(a|b|c|d|e)f/i
- EF
- 0: EF
- 1: E
-
-/abcd*efg/i
- ABCDEFG
- 0: ABCDEFG
-
-/ab*/i
- XABYABBBZ
- 0: AB
- XAYABBBZ
- 0: A
-
-/(ab|cd)e/i
- ABCDE
- 0: CDE
- 1: CD
-
-/[abhgefdc]ij/i
- HIJ
- 0: HIJ
-
-/^(ab|cd)e/i
- ABCDE
-No match
-
-/(abc|)ef/i
- ABCDEF
- 0: EF
- 1:
-
-/(a|b)c*d/i
- ABCD
- 0: BCD
- 1: B
-
-/(ab|ab*)bc/i
- ABC
- 0: ABC
- 1: A
-
-/a([bc]*)c*/i
- ABC
- 0: ABC
- 1: BC
-
-/a([bc]*)(c*d)/i
- ABCD
- 0: ABCD
- 1: BC
- 2: D
-
-/a([bc]+)(c*d)/i
- ABCD
- 0: ABCD
- 1: BC
- 2: D
-
-/a([bc]*)(c+d)/i
- ABCD
- 0: ABCD
- 1: B
- 2: CD
-
-/a[bcd]*dcdcde/i
- ADCDCDE
- 0: ADCDCDE
-
-/a[bcd]+dcdcde/i
-
-/(ab|a)b*c/i
- ABC
- 0: ABC
- 1: AB
-
-/((a)(b)c)(d)/i
- ABCD
- 0: ABCD
- 1: ABC
- 2: A
- 3: B
- 4: D
-
-/[a-zA-Z_][a-zA-Z0-9_]*/i
- ALPHA
- 0: ALPHA
-
-/^a(bc+|b[eh])g|.h$/i
- ABH
- 0: BH
-
-/(bc+d$|ef*g.|h?i(j|k))/i
- EFFGZ
- 0: EFFGZ
- 1: EFFGZ
- IJ
- 0: IJ
- 1: IJ
- 2: J
- REFFGZ
- 0: EFFGZ
- 1: EFFGZ
- *** Failers
-No match
- ADCDCDE
-No match
- EFFG
-No match
- BCDD
-No match
-
-/((((((((((a))))))))))/i
- A
- 0: A
- 1: A
- 2: A
- 3: A
- 4: A
- 5: A
- 6: A
- 7: A
- 8: A
- 9: A
-10: A
-
-/((((((((((a))))))))))\10/i
- AA
- 0: AA
- 1: A
- 2: A
- 3: A
- 4: A
- 5: A
- 6: A
- 7: A
- 8: A
- 9: A
-10: A
-
-/(((((((((a)))))))))/i
- A
- 0: A
- 1: A
- 2: A
- 3: A
- 4: A
- 5: A
- 6: A
- 7: A
- 8: A
- 9: A
-
-/(?:(?:(?:(?:(?:(?:(?:(?:(?:(a))))))))))/i
- A
- 0: A
- 1: A
-
-/(?:(?:(?:(?:(?:(?:(?:(?:(?:(a|b|c))))))))))/i
- C
- 0: C
- 1: C
-
-/multiple words of text/i
- *** Failers
-No match
- AA
-No match
- UH-UH
-No match
-
-/multiple words/i
- MULTIPLE WORDS, YEAH
- 0: MULTIPLE WORDS
-
-/(.*)c(.*)/i
- ABCDE
- 0: ABCDE
- 1: AB
- 2: DE
-
-/\((.*), (.*)\)/i
- (A, B)
- 0: (A, B)
- 1: A
- 2: B
-
-/[k]/i
-
-/abcd/i
- ABCD
- 0: ABCD
-
-/a(bc)d/i
- ABCD
- 0: ABCD
- 1: BC
-
-/a[-]?c/i
- AC
- 0: AC
-
-/(abc)\1/i
- ABCABC
- 0: ABCABC
- 1: ABC
-
-/([a-c]*)\1/i
- ABCABC
- 0: ABCABC
- 1: ABC
-
-/a(?!b)./
- abad
- 0: ad
-
-/a(?=d)./
- abad
- 0: ad
-
-/a(?=c|d)./
- abad
- 0: ad
-
-/a(?:b|c|d)(.)/
- ace
- 0: ace
- 1: e
-
-/a(?:b|c|d)*(.)/
- ace
- 0: ace
- 1: e
-
-/a(?:b|c|d)+?(.)/
- ace
- 0: ace
- 1: e
- acdbcdbe
- 0: acd
- 1: d
-
-/a(?:b|c|d)+(.)/
- acdbcdbe
- 0: acdbcdbe
- 1: e
-
-/a(?:b|c|d){2}(.)/
- acdbcdbe
- 0: acdb
- 1: b
-
-/a(?:b|c|d){4,5}(.)/
- acdbcdbe
- 0: acdbcdb
- 1: b
-
-/a(?:b|c|d){4,5}?(.)/
- acdbcdbe
- 0: acdbcd
- 1: d
-
-/((foo)|(bar))*/
- foobar
- 0: foobar
- 1: bar
- 2: foo
- 3: bar
-
-/a(?:b|c|d){6,7}(.)/
- acdbcdbe
- 0: acdbcdbe
- 1: e
-
-/a(?:b|c|d){6,7}?(.)/
- acdbcdbe
- 0: acdbcdbe
- 1: e
-
-/a(?:b|c|d){5,6}(.)/
- acdbcdbe
- 0: acdbcdbe
- 1: e
-
-/a(?:b|c|d){5,6}?(.)/
- acdbcdbe
- 0: acdbcdb
- 1: b
-
-/a(?:b|c|d){5,7}(.)/
- acdbcdbe
- 0: acdbcdbe
- 1: e
-
-/a(?:b|c|d){5,7}?(.)/
- acdbcdbe
- 0: acdbcdb
- 1: b
-
-/a(?:b|(c|e){1,2}?|d)+?(.)/
- ace
- 0: ace
- 1: c
- 2: e
-
-/^(.+)?B/
- AB
- 0: AB
- 1: A
-
-/^([^a-z])|(\^)$/
- .
- 0: .
- 1: .
-
-/^[<>]&/
- <&OUT
- 0: <&
-
-/^(a\1?){4}$/
- aaaaaaaaaa
- 0: aaaaaaaaaa
- 1: aaaa
- *** Failers
-No match
- AB
-No match
- aaaaaaaaa
-No match
- aaaaaaaaaaa
-No match
-
-/^(a(?(1)\1)){4}$/
- aaaaaaaaaa
- 0: aaaaaaaaaa
- 1: aaaa
- *** Failers
-No match
- aaaaaaaaa
-No match
- aaaaaaaaaaa
-No match
-
-/(?:(f)(o)(o)|(b)(a)(r))*/
- foobar
- 0: foobar
- 1: f
- 2: o
- 3: o
- 4: b
- 5: a
- 6: r
-
-/(?<=a)b/
- ab
- 0: b
- *** Failers
-No match
- cb
-No match
- b
-No match
-
-/(?<!c)b/
- ab
- 0: b
- b
- 0: b
- b
- 0: b
-
-/(?:..)*a/
- aba
- 0: aba
-
-/(?:..)*?a/
- aba
- 0: a
-
-/^(?:b|a(?=(.)))*\1/
- abc
- 0: ab
- 1: b
-
-/^(){3,5}/
- abc
- 0:
- 1:
-
-/^(a+)*ax/
- aax
- 0: aax
- 1: a
-
-/^((a|b)+)*ax/
- aax
- 0: aax
- 1: a
- 2: a
-
-/^((a|bc)+)*ax/
- aax
- 0: aax
- 1: a
- 2: a
-
-/(a|x)*ab/
- cab
- 0: ab
-
-/(a)*ab/
- cab
- 0: ab
-
-/(?:(?i)a)b/
- ab
- 0: ab
-
-/((?i)a)b/
- ab
- 0: ab
- 1: a
-
-/(?:(?i)a)b/
- Ab
- 0: Ab
-
-/((?i)a)b/
- Ab
- 0: Ab
- 1: A
-
-/(?:(?i)a)b/
- *** Failers
-No match
- cb
-No match
- aB
-No match
-
-/((?i)a)b/
-
-/(?i:a)b/
- ab
- 0: ab
-
-/((?i:a))b/
- ab
- 0: ab
- 1: a
-
-/(?i:a)b/
- Ab
- 0: Ab
-
-/((?i:a))b/
- Ab
- 0: Ab
- 1: A
-
-/(?i:a)b/
- *** Failers
-No match
- aB
-No match
- aB
-No match
-
-/((?i:a))b/
-
-/(?:(?-i)a)b/i
- ab
- 0: ab
-
-/((?-i)a)b/i
- ab
- 0: ab
- 1: a
-
-/(?:(?-i)a)b/i
- aB
- 0: aB
-
-/((?-i)a)b/i
- aB
- 0: aB
- 1: a
-
-/(?:(?-i)a)b/i
- *** Failers
-No match
- aB
- 0: aB
- Ab
-No match
-
-/((?-i)a)b/i
-
-/(?:(?-i)a)b/i
- aB
- 0: aB
-
-/((?-i)a)b/i
- aB
- 0: aB
- 1: a
-
-/(?:(?-i)a)b/i
- *** Failers
-No match
- Ab
-No match
- AB
-No match
-
-/((?-i)a)b/i
-
-/(?-i:a)b/i
- ab
- 0: ab
-
-/((?-i:a))b/i
- ab
- 0: ab
- 1: a
-
-/(?-i:a)b/i
- aB
- 0: aB
-
-/((?-i:a))b/i
- aB
- 0: aB
- 1: a
-
-/(?-i:a)b/i
- *** Failers
-No match
- AB
-No match
- Ab
-No match
-
-/((?-i:a))b/i
-
-/(?-i:a)b/i
- aB
- 0: aB
-
-/((?-i:a))b/i
- aB
- 0: aB
- 1: a
-
-/(?-i:a)b/i
- *** Failers
-No match
- Ab
-No match
- AB
-No match
-
-/((?-i:a))b/i
-
-/((?-i:a.))b/i
- *** Failers
-No match
- AB
-No match
- a\nB
-No match
-
-/((?s-i:a.))b/i
- a\nB
- 0: a\x0aB
- 1: a\x0a
-
-/(?:c|d)(?:)(?:a(?:)(?:b)(?:b(?:))(?:b(?:)(?:b)))/
- cabbbb
- 0: cabbbb
-
-/(?:c|d)(?:)(?:aaaaaaaa(?:)(?:bbbbbbbb)(?:bbbbbbbb(?:))(?:bbbbbbbb(?:)(?:bbbbbbbb)))/
- caaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
- 0: caaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
-
-/(ab)\d\1/i
- Ab4ab
- 0: Ab4ab
- 1: Ab
- ab4Ab
- 0: ab4Ab
- 1: ab
-
-/foo\w*\d{4}baz/
- foobar1234baz
- 0: foobar1234baz
-
-/x(~~)*(?:(?:F)?)?/
- x~~
- 0: x~~
- 1: ~~
-
-/^a(?#xxx){3}c/
- aaac
- 0: aaac
-
-/^a (?#xxx) (?#yyy) {3}c/x
- aaac
- 0: aaac
-
-/(?<![cd])b/
- *** Failers
-No match
- B\nB
-No match
- dbcb
-No match
-
-/(?<![cd])[ab]/
- dbaacb
- 0: a
-
-/(?<!(c|d))b/
-
-/(?<!(c|d))[ab]/
- dbaacb
- 0: a
-
-/(?<!cd)[ab]/
- cdaccb
- 0: b
-
-/^(?:a?b?)*$/
- *** Failers
-No match
- dbcb
-No match
- a--
-No match
-
-/((?s)^a(.))((?m)^b$)/
- a\nb\nc\n
- 0: a\x0ab
- 1: a\x0a
- 2: \x0a
- 3: b
-
-/((?m)^b$)/
- a\nb\nc\n
- 0: b
- 1: b
-
-/(?m)^b/
- a\nb\n
- 0: b
-
-/(?m)^(b)/
- a\nb\n
- 0: b
- 1: b
-
-/((?m)^b)/
- a\nb\n
- 0: b
- 1: b
-
-/\n((?m)^b)/
- a\nb\n
- 0: \x0ab
- 1: b
-
-/((?s).)c(?!.)/
- a\nb\nc\n
- 0: \x0ac
- 1: \x0a
- a\nb\nc\n
- 0: \x0ac
- 1: \x0a
-
-/((?s)b.)c(?!.)/
- a\nb\nc\n
- 0: b\x0ac
- 1: b\x0a
- a\nb\nc\n
- 0: b\x0ac
- 1: b\x0a
-
-/^b/
-
-/()^b/
- *** Failers
-No match
- a\nb\nc\n
-No match
- a\nb\nc\n
-No match
-
-/((?m)^b)/
- a\nb\nc\n
- 0: b
- 1: b
-
-/(?(1)a|b)/
-
-/(?(1)b|a)/
- a
- 0: a
-
-/(x)?(?(1)a|b)/
- *** Failers
-No match
- a
-No match
- a
-No match
-
-/(x)?(?(1)b|a)/
- a
- 0: a
-
-/()?(?(1)b|a)/
- a
- 0: a
-
-/()(?(1)b|a)/
-
-/()?(?(1)a|b)/
- a
- 0: a
- 1:
-
-/^(\()?blah(?(1)(\)))$/
- (blah)
- 0: (blah)
- 1: (
- 2: )
- blah
- 0: blah
- *** Failers
-No match
- a
-No match
- blah)
-No match
- (blah
-No match
-
-/^(\(+)?blah(?(1)(\)))$/
- (blah)
- 0: (blah)
- 1: (
- 2: )
- blah
- 0: blah
- *** Failers
-No match
- blah)
-No match
- (blah
-No match
-
-/(?(?!a)a|b)/
-
-/(?(?!a)b|a)/
- a
- 0: a
-
-/(?(?=a)b|a)/
- *** Failers
-No match
- a
-No match
- a
-No match
-
-/(?(?=a)a|b)/
- a
- 0: a
-
-/(?=(a+?))(\1ab)/
- aaab
- 0: aab
- 1: a
- 2: aab
-
-/^(?=(a+?))\1ab/
-
-/(\w+:)+/
- one:
- 0: one:
- 1: one:
-
-/$(?<=^(a))/
- a
- 0:
- 1: a
-
-/(?=(a+?))(\1ab)/
- aaab
- 0: aab
- 1: a
- 2: aab
-
-/^(?=(a+?))\1ab/
- *** Failers
-No match
- aaab
-No match
- aaab
-No match
-
-/([\w:]+::)?(\w+)$/
- abcd
- 0: abcd
- 1: <unset>
- 2: abcd
- xy:z:::abcd
- 0: xy:z:::abcd
- 1: xy:z:::
- 2: abcd
-
-/^[^bcd]*(c+)/
- aexycd
- 0: aexyc
- 1: c
-
-/(a*)b+/
- caab
- 0: aab
- 1: aa
-
-/([\w:]+::)?(\w+)$/
- abcd
- 0: abcd
- 1: <unset>
- 2: abcd
- xy:z:::abcd
- 0: xy:z:::abcd
- 1: xy:z:::
- 2: abcd
- *** Failers
- 0: Failers
- 1: <unset>
- 2: Failers
- abcd:
-No match
- abcd:
-No match
-
-/^[^bcd]*(c+)/
- aexycd
- 0: aexyc
- 1: c
-
-/(>a+)ab/
-
-/(?>a+)b/
- aaab
- 0: aaab
-
-/([[:]+)/
- a:[b]:
- 0: :[
- 1: :[
-
-/([[=]+)/
- a=[b]=
- 0: =[
- 1: =[
-
-/([[.]+)/
- a.[b].
- 0: .[
- 1: .[
-
-/((?>a+)b)/
- aaab
- 0: aaab
- 1: aaab
-
-/(?>(a+))b/
- aaab
- 0: aaab
- 1: aaa
-
-/((?>[^()]+)|\([^()]*\))+/
- ((abc(ade)ufh()()x
- 0: abc(ade)ufh()()x
- 1: x
-
-/a\Z/
- *** Failers
-No match
- aaab
-No match
- a\nb\n
-No match
-
-/b\Z/
- a\nb\n
- 0: b
-
-/b\z/
-
-/b\Z/
- a\nb
- 0: b
-
-/b\z/
- a\nb
- 0: b
- *** Failers
-No match
-
-/^(?>(?(1)\.|())[^\W_](?>[a-z0-9-]*[^\W_])?)+$/
- a
- 0: a
- 1:
- abc
- 0: abc
- 1:
- a-b
- 0: a-b
- 1:
- 0-9
- 0: 0-9
- 1:
- a.b
- 0: a.b
- 1:
- 5.6.7
- 0: 5.6.7
- 1:
- the.quick.brown.fox
- 0: the.quick.brown.fox
- 1:
- a100.b200.300c
- 0: a100.b200.300c
- 1:
- 12-ab.1245
- 0: 12-ab.1245
- 1:
- ***Failers
-No match
- \
-No match
- .a
-No match
- -a
-No match
- a-
-No match
- a.
-No match
- a_b
-No match
- a.-
-No match
- a..
-No match
- ab..bc
-No match
- the.quick.brown.fox-
-No match
- the.quick.brown.fox.
-No match
- the.quick.brown.fox_
-No match
- the.quick.brown.fox+
-No match
-
-/(?>.*)(?<=(abcd|wxyz))/
- alphabetabcd
- 0: alphabetabcd
- 1: abcd
- endingwxyz
- 0: endingwxyz
- 1: wxyz
- *** Failers
-No match
- a rather long string that doesn't end with one of them
-No match
-
-/word (?>(?:(?!otherword)[a-zA-Z0-9]+ ){0,30})otherword/
- word cat dog elephant mussel cow horse canary baboon snake shark otherword
- 0: word cat dog elephant mussel cow horse canary baboon snake shark otherword
- word cat dog elephant mussel cow horse canary baboon snake shark
-No match
-
-/word (?>[a-zA-Z0-9]+ ){0,30}otherword/
- word cat dog elephant mussel cow horse canary baboon snake shark the quick brown fox and the lazy dog and several other words getting close to thirty by now I hope
-No match
-
-/(?<=\d{3}(?!999))foo/
- 999foo
- 0: foo
- 123999foo
- 0: foo
- *** Failers
-No match
- 123abcfoo
-No match
-
-/(?<=(?!...999)\d{3})foo/
- 999foo
- 0: foo
- 123999foo
- 0: foo
- *** Failers
-No match
- 123abcfoo
-No match
-
-/(?<=\d{3}(?!999)...)foo/
- 123abcfoo
- 0: foo
- 123456foo
- 0: foo
- *** Failers
-No match
- 123999foo
-No match
-
-/(?<=\d{3}...)(?<!999)foo/
- 123abcfoo
- 0: foo
- 123456foo
- 0: foo
- *** Failers
-No match
- 123999foo
-No match
-
-/<a[\s]+href[\s]*=[\s]* # find <a href=
- ([\"\'])? # find single or double quote
- (?(1) (.*?)\1 | ([^\s]+)) # if quote found, match up to next matching
- # quote, otherwise match up to next space
-/isx
- <a href=abcd xyz
- 0: <a href=abcd
- 1: <unset>
- 2: <unset>
- 3: abcd
- <a href=\"abcd xyz pqr\" cats
- 0: <a href="abcd xyz pqr"
- 1: "
- 2: abcd xyz pqr
- <a href=\'abcd xyz pqr\' cats
- 0: <a href='abcd xyz pqr'
- 1: '
- 2: abcd xyz pqr
-
-/<a\s+href\s*=\s* # find <a href=
- (["'])? # find single or double quote
- (?(1) (.*?)\1 | (\S+)) # if quote found, match up to next matching
- # quote, otherwise match up to next space
-/isx
- <a href=abcd xyz
- 0: <a href=abcd
- 1: <unset>
- 2: <unset>
- 3: abcd
- <a href=\"abcd xyz pqr\" cats
- 0: <a href="abcd xyz pqr"
- 1: "
- 2: abcd xyz pqr
- <a href = \'abcd xyz pqr\' cats
- 0: <a href = 'abcd xyz pqr'
- 1: '
- 2: abcd xyz pqr
-
-/<a\s+href(?>\s*)=(?>\s*) # find <a href=
- (["'])? # find single or double quote
- (?(1) (.*?)\1 | (\S+)) # if quote found, match up to next matching
- # quote, otherwise match up to next space
-/isx
- <a href=abcd xyz
- 0: <a href=abcd
- 1: <unset>
- 2: <unset>
- 3: abcd
- <a href=\"abcd xyz pqr\" cats
- 0: <a href="abcd xyz pqr"
- 1: "
- 2: abcd xyz pqr
- <a href = \'abcd xyz pqr\' cats
- 0: <a href = 'abcd xyz pqr'
- 1: '
- 2: abcd xyz pqr
-
-/((Z)+|A)*/
- ZABCDEFG
- 0: ZA
- 1: A
- 2: Z
-
-/(Z()|A)*/
- ZABCDEFG
- 0: ZA
- 1: A
- 2:
-
-/(Z(())|A)*/
- ZABCDEFG
- 0: ZA
- 1: A
- 2:
- 3:
-
-/((?>Z)+|A)*/
- ZABCDEFG
- 0: ZA
- 1: A
-
-/((?>)+|A)*/
- ZABCDEFG
- 0:
- 1:
-
-/a*/g
- abbab
- 0: a
- 0:
- 0:
- 0: a
- 0:
- 0:
-
-/^[a-\d]/
- abcde
- 0: a
- -things
- 0: -
- 0digit
- 0: 0
- *** Failers
-No match
- bcdef
-No match
-
-/^[\d-a]/
- abcde
- 0: a
- -things
- 0: -
- 0digit
- 0: 0
- *** Failers
-No match
- bcdef
-No match
-
-/[[:space:]]+/
- > \x09\x0a\x0c\x0d\x0b<
- 0: \x09\x0a\x0c\x0d\x0b
-
-/[[:blank:]]+/
- > \x09\x0a\x0c\x0d\x0b<
- 0: \x09
-
-/[\s]+/
- > \x09\x0a\x0c\x0d\x0b<
- 0: \x09\x0a\x0c\x0d
-
-/\s+/
- > \x09\x0a\x0c\x0d\x0b<
- 0: \x09\x0a\x0c\x0d
-
-/a b/x
- ab
-No match
-
-/(?!\A)x/m
- a\nxb\n
- 0: x
-
-/(?!^)x/m
- a\nxb\n
-No match
-
-/abc\Qabc\Eabc/
- abcabcabc
- 0: abcabcabc
-
-/abc\Q(*+|\Eabc/
- abc(*+|abc
- 0: abc(*+|abc
-
-/ abc\Q abc\Eabc/x
- abc abcabc
- 0: abc abcabc
- *** Failers
-No match
- abcabcabc
-No match
-
-/abc#comment
- \Q#not comment
- literal\E/x
- abc#not comment\n literal
- 0: abc#not comment\x0a literal
-
-/abc#comment
- \Q#not comment
- literal/x
- abc#not comment\n literal
- 0: abc#not comment\x0a literal
-
-/abc#comment
- \Q#not comment
- literal\E #more comment
- /x
- abc#not comment\n literal
- 0: abc#not comment\x0a literal
-
-/abc#comment
- \Q#not comment
- literal\E #more comment/x
- abc#not comment\n literal
- 0: abc#not comment\x0a literal
-
-/\Qabc\$xyz\E/
- abc\\\$xyz
- 0: abc\$xyz
-
-/\Qabc\E\$\Qxyz\E/
- abc\$xyz
- 0: abc$xyz
-
-/\Gabc/
- abc
- 0: abc
- *** Failers
-No match
- xyzabc
-No match
-
-/\Gabc./g
- abc1abc2xyzabc3
- 0: abc1
- 0: abc2
-
-/abc./g
- abc1abc2xyzabc3
- 0: abc1
- 0: abc2
- 0: abc3
-
-/a(?x: b c )d/
- XabcdY
- 0: abcd
- *** Failers
-No match
- Xa b c d Y
-No match
-
-/((?x)x y z | a b c)/
- XabcY
- 0: abc
- 1: abc
- AxyzB
- 0: xyz
- 1: xyz
-
-/(?i)AB(?-i)C/
- XabCY
- 0: abC
- *** Failers
-No match
- XabcY
-No match
-
-/((?i)AB(?-i)C|D)E/
- abCE
- 0: abCE
- 1: abC
- DE
- 0: DE
- 1: D
- *** Failers
-No match
- abcE
-No match
- abCe
-No match
- dE
-No match
- De
-No match
-
-/(.*)\d+\1/
- abc123abc
- 0: abc123abc
- 1: abc
- abc123bc
- 0: bc123bc
- 1: bc
-
-/(.*)\d+\1/s
- abc123abc
- 0: abc123abc
- 1: abc
- abc123bc
- 0: bc123bc
- 1: bc
-
-/((.*))\d+\1/
- abc123abc
- 0: abc123abc
- 1: abc
- 2: abc
- abc123bc
- 0: bc123bc
- 1: bc
- 2: bc
-
-/-- This tests for an IPv6 address in the form where it can have up to --/
-/-- eight components, one and only one of which is empty. This must be --/
-No match
-/-- an internal component. --/
-No match
-
-/^(?!:) # colon disallowed at start
- (?: # start of item
- (?: [0-9a-f]{1,4} | # 1-4 hex digits or
- (?(1)0 | () ) ) # if null previously matched, fail; else null
- : # followed by colon
- ){1,7} # end item; 1-7 of them required
- [0-9a-f]{1,4} $ # final hex number at end of string
- (?(1)|.) # check that there was an empty component
- /xi
- a123::a123
- 0: a123::a123
- 1:
- a123:b342::abcd
- 0: a123:b342::abcd
- 1:
- a123:b342::324e:abcd
- 0: a123:b342::324e:abcd
- 1:
- a123:ddde:b342::324e:abcd
- 0: a123:ddde:b342::324e:abcd
- 1:
- a123:ddde:b342::324e:dcba:abcd
- 0: a123:ddde:b342::324e:dcba:abcd
- 1:
- a123:ddde:9999:b342::324e:dcba:abcd
- 0: a123:ddde:9999:b342::324e:dcba:abcd
- 1:
- *** Failers
-No match
- 1:2:3:4:5:6:7:8
-No match
- a123:bce:ddde:9999:b342::324e:dcba:abcd
-No match
- a123::9999:b342::324e:dcba:abcd
-No match
- abcde:2:3:4:5:6:7:8
-No match
- ::1
-No match
- abcd:fee0:123::
-No match
- :1
-No match
- 1:
-No match
-
-/ End of testinput1 /
-
diff --git a/ext/pcre/pcrelib/testdata/testoutput2 b/ext/pcre/pcrelib/testdata/testoutput2
deleted file mode 100644
index 2d1db0fb13..0000000000
--- a/ext/pcre/pcrelib/testdata/testoutput2
+++ /dev/null
@@ -1,4088 +0,0 @@
-PCRE version 3.92 11-Sep-2002
-
-/(a)b|/
-Capturing subpattern count = 1
-No options
-No first char
-No need char
-
-/abc/
-Capturing subpattern count = 0
-No options
-First char = 'a'
-Need char = 'c'
- abc
- 0: abc
- defabc
- 0: abc
- \Aabc
- 0: abc
- *** Failers
-No match
- \Adefabc
-No match
- ABC
-No match
-
-/^abc/
-Capturing subpattern count = 0
-Options: anchored
-No first char
-Need char = 'c'
- abc
- 0: abc
- \Aabc
- 0: abc
- *** Failers
-No match
- defabc
-No match
- \Adefabc
-No match
-
-/a+bc/
-Capturing subpattern count = 0
-No options
-First char = 'a'
-Need char = 'c'
-
-/a*bc/
-Capturing subpattern count = 0
-No options
-No first char
-Need char = 'c'
-
-/a{3}bc/
-Capturing subpattern count = 0
-No options
-First char = 'a'
-Need char = 'c'
-
-/(abc|a+z)/
-Capturing subpattern count = 1
-No options
-First char = 'a'
-No need char
-
-/^abc$/
-Capturing subpattern count = 0
-Options: anchored
-No first char
-Need char = 'c'
- abc
- 0: abc
- *** Failers
-No match
- def\nabc
-No match
-
-/ab\gdef/X
-Failed: unrecognized character follows \ at offset 3
-
-/(?X)ab\gdef/X
-Failed: unrecognized character follows \ at offset 7
-
-/x{5,4}/
-Failed: numbers out of order in {} quantifier at offset 5
-
-/z{65536}/
-Failed: number too big in {} quantifier at offset 7
-
-/[abcd/
-Failed: missing terminating ] for character class at offset 5
-
-/[\B]/
-Failed: invalid escape sequence in character class at offset 2
-
-/[z-a]/
-Failed: range out of order in character class at offset 3
-
-/^*/
-Failed: nothing to repeat at offset 1
-
-/(abc/
-Failed: missing ) at offset 4
-
-/(?# abc/
-Failed: missing ) after comment at offset 7
-
-/(?z)abc/
-Failed: unrecognized character after (? at offset 2
-
-/.*b/
-Capturing subpattern count = 0
-No options
-First char at start or follows \n
-Need char = 'b'
-
-/.*?b/
-Capturing subpattern count = 0
-No options
-First char at start or follows \n
-Need char = 'b'
-
-/cat|dog|elephant/
-Capturing subpattern count = 0
-No options
-No first char
-No need char
- this sentence eventually mentions a cat
- 0: cat
- this sentences rambles on and on for a while and then reaches elephant
- 0: elephant
-
-/cat|dog|elephant/S
-Capturing subpattern count = 0
-No options
-No first char
-No need char
-Starting character set: c d e
- this sentence eventually mentions a cat
- 0: cat
- this sentences rambles on and on for a while and then reaches elephant
- 0: elephant
-
-/cat|dog|elephant/iS
-Capturing subpattern count = 0
-Options: caseless
-No first char
-No need char
-Starting character set: C D E c d e
- this sentence eventually mentions a CAT cat
- 0: CAT
- this sentences rambles on and on for a while to elephant ElePhant
- 0: elephant
-
-/a|[bcd]/S
-Capturing subpattern count = 0
-No options
-No first char
-No need char
-Starting character set: a b c d
-
-/(a|[^\dZ])/S
-Capturing subpattern count = 1
-No options
-No first char
-No need char
-Starting character set: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a
- \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19
- \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / : ; < = >
- ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y [ \ ] ^ _ ` a b c d
- e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f \x80 \x81 \x82 \x83
- \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f \x90 \x91 \x92
- \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e \x9f \xa0 \xa1
- \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad \xae \xaf \xb0
- \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd \xbe \xbf
- \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce
- \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd
- \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec
- \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb
- \xfc \xfd \xfe \xff
-
-/(a|b)*[\s]/S
-Capturing subpattern count = 1
-No options
-No first char
-No need char
-Starting character set: \x09 \x0a \x0c \x0d \x20 a b
-
-/(ab\2)/
-Failed: reference to non-existent subpattern at offset 6
-
-/{4,5}abc/
-Failed: nothing to repeat at offset 4
-
-/(a)(b)(c)\2/
-Capturing subpattern count = 3
-Max back reference = 2
-No options
-First char = 'a'
-Need char = 'c'
- abcb
- 0: abcb
- 1: a
- 2: b
- 3: c
- \O0abcb
-Matched, but too many substrings
- \O3abcb
-Matched, but too many substrings
- 0: abcb
- \O6abcb
-Matched, but too many substrings
- 0: abcb
- 1: a
- \O9abcb
-Matched, but too many substrings
- 0: abcb
- 1: a
- 2: b
- \O12abcb
- 0: abcb
- 1: a
- 2: b
- 3: c
-
-/(a)bc|(a)(b)\2/
-Capturing subpattern count = 3
-Max back reference = 2
-No options
-First char = 'a'
-No need char
- abc
- 0: abc
- 1: a
- \O0abc
-Matched, but too many substrings
- \O3abc
-Matched, but too many substrings
- 0: abc
- \O6abc
- 0: abc
- 1: a
- aba
- 0: aba
- 1: <unset>
- 2: a
- 3: b
- \O0aba
-Matched, but too many substrings
- \O3aba
-Matched, but too many substrings
- 0: aba
- \O6aba
-Matched, but too many substrings
- 0: aba
- 1: <unset>
- \O9aba
-Matched, but too many substrings
- 0: aba
- 1: <unset>
- 2: a
- \O12aba
- 0: aba
- 1: <unset>
- 2: a
- 3: b
-
-/abc$/E
-Capturing subpattern count = 0
-Options: dollar_endonly
-First char = 'a'
-Need char = 'c'
- abc
- 0: abc
- *** Failers
-No match
- abc\n
-No match
- abc\ndef
-No match
-
-/(a)(b)(c)(d)(e)\6/
-Failed: reference to non-existent subpattern at offset 17
-
-/the quick brown fox/
-Capturing subpattern count = 0
-No options
-First char = 't'
-Need char = 'x'
- the quick brown fox
- 0: the quick brown fox
- this is a line with the quick brown fox
- 0: the quick brown fox
-
-/the quick brown fox/A
-Capturing subpattern count = 0
-Options: anchored
-No first char
-Need char = 'x'
- the quick brown fox
- 0: the quick brown fox
- *** Failers
-No match
- this is a line with the quick brown fox
-No match
-
-/ab(?z)cd/
-Failed: unrecognized character after (? at offset 4
-
-/^abc|def/
-Capturing subpattern count = 0
-No options
-No first char
-No need char
- abcdef
- 0: abc
- abcdef\B
- 0: def
-
-/.*((abc)$|(def))/
-Capturing subpattern count = 3
-No options
-First char at start or follows \n
-No need char
- defabc
- 0: defabc
- 1: abc
- 2: abc
- \Zdefabc
- 0: def
- 1: def
- 2: <unset>
- 3: def
-
-/abc/P
- abc
- 0: abc
- *** Failers
-No match: POSIX code 17: match failed
-
-/^abc|def/P
- abcdef
- 0: abc
- abcdef\B
- 0: def
-
-/.*((abc)$|(def))/P
- defabc
- 0: defabc
- 1: abc
- 2: abc
- \Zdefabc
- 0: def
- 1: def
- 3: def
-
-/the quick brown fox/P
- the quick brown fox
- 0: the quick brown fox
- *** Failers
-No match: POSIX code 17: match failed
- The Quick Brown Fox
-No match: POSIX code 17: match failed
-
-/the quick brown fox/Pi
- the quick brown fox
- 0: the quick brown fox
- The Quick Brown Fox
- 0: The Quick Brown Fox
-
-/abc.def/P
- *** Failers
-No match: POSIX code 17: match failed
- abc\ndef
-No match: POSIX code 17: match failed
-
-/abc$/P
- abc
- 0: abc
- abc\n
- 0: abc
-
-/(abc)\2/P
-Failed: POSIX code 15: bad back reference at offset 7
-
-/(abc\1)/P
- abc
-No match: POSIX code 17: match failed
-
-/)/
-Failed: unmatched parentheses at offset 0
-
-/a[]b/
-Failed: missing terminating ] for character class at offset 4
-
-/[^aeiou ]{3,}/
-Capturing subpattern count = 0
-No options
-No first char
-No need char
- co-processors, and for
- 0: -pr
-
-/<.*>/
-Capturing subpattern count = 0
-No options
-First char = '<'
-Need char = '>'
- abc<def>ghi<klm>nop
- 0: <def>ghi<klm>
-
-/<.*?>/
-Capturing subpattern count = 0
-No options
-First char = '<'
-Need char = '>'
- abc<def>ghi<klm>nop
- 0: <def>
-
-/<.*>/U
-Capturing subpattern count = 0
-Options: ungreedy
-First char = '<'
-Need char = '>'
- abc<def>ghi<klm>nop
- 0: <def>
-
-/(?U)<.*>/
-Capturing subpattern count = 0
-Options: ungreedy
-First char = '<'
-Need char = '>'
- abc<def>ghi<klm>nop
- 0: <def>
-
-/<.*?>/U
-Capturing subpattern count = 0
-Options: ungreedy
-First char = '<'
-Need char = '>'
- abc<def>ghi<klm>nop
- 0: <def>ghi<klm>
-
-/={3,}/U
-Capturing subpattern count = 0
-Options: ungreedy
-First char = '='
-Need char = '='
- abc========def
- 0: ===
-
-/(?U)={3,}?/
-Capturing subpattern count = 0
-Options: ungreedy
-First char = '='
-Need char = '='
- abc========def
- 0: ========
-
-/(?<!bar|cattle)foo/
-Capturing subpattern count = 0
-No options
-First char = 'f'
-Need char = 'o'
- foo
- 0: foo
- catfoo
- 0: foo
- *** Failers
-No match
- the barfoo
-No match
- and cattlefoo
-No match
-
-/(?<=a+)b/
-Failed: lookbehind assertion is not fixed length at offset 6
-
-/(?<=aaa|b{0,3})b/
-Failed: lookbehind assertion is not fixed length at offset 14
-
-/(?<!(foo)a\1)bar/
-Failed: lookbehind assertion is not fixed length at offset 12
-
-/(?i)abc/
-Capturing subpattern count = 0
-Options: caseless
-First char = 'a' (caseless)
-Need char = 'c' (caseless)
-
-/(a|(?m)a)/
-Capturing subpattern count = 1
-No options
-First char = 'a'
-No need char
-
-/(?i)^1234/
-Capturing subpattern count = 0
-Options: anchored caseless
-No first char
-Need char = '4'
-
-/(^b|(?i)^d)/
-Capturing subpattern count = 1
-Options: anchored
-Case state changes
-No first char
-No need char
-
-/(?s).*/
-Capturing subpattern count = 0
-Options: anchored dotall
-No first char
-No need char
-
-/[abcd]/S
-Capturing subpattern count = 0
-No options
-No first char
-No need char
-Starting character set: a b c d
-
-/(?i)[abcd]/S
-Capturing subpattern count = 0
-Options: caseless
-No first char
-No need char
-Starting character set: A B C D a b c d
-
-/(?m)[xy]|(b|c)/S
-Capturing subpattern count = 1
-Options: multiline
-No first char
-No need char
-Starting character set: b c x y
-
-/(^a|^b)/m
-Capturing subpattern count = 1
-Options: multiline
-First char at start or follows \n
-No need char
-
-/(?i)(^a|^b)/m
-Capturing subpattern count = 1
-Options: caseless multiline
-First char at start or follows \n
-No need char
-
-/(a)(?(1)a|b|c)/
-Failed: conditional group contains more than two branches at offset 13
-
-/(?(?=a)a|b|c)/
-Failed: conditional group contains more than two branches at offset 12
-
-/(?(1a)/
-Failed: malformed number after (?( at offset 4
-
-/(?(?i))/
-Failed: assertion expected after (?( at offset 3
-
-/(?(abc))/
-Failed: assertion expected after (?( at offset 3
-
-/(?(?<ab))/
-Failed: unrecognized character after (?< at offset 5
-
-/((?s)blah)\s+\1/
-Capturing subpattern count = 1
-Max back reference = 1
-No options
-First char = 'b'
-Need char = 'h'
-
-/((?i)blah)\s+\1/
-Capturing subpattern count = 1
-Max back reference = 1
-No options
-Case state changes
-First char = 'b' (caseless)
-Need char = 'h' (caseless)
-
-/((?i)b)/DS
-------------------------------------------------------------------
- 0 16 Bra 0
- 3 8 Bra 1
- 6 01 Opt
- 8 1 b
- 11 8 Ket
- 14 00 Opt
- 16 16 Ket
- 19 End
-------------------------------------------------------------------
-Capturing subpattern count = 1
-No options
-Case state changes
-First char = 'b' (caseless)
-No need char
-Study returned NULL
-
-/(a*b|(?i:c*(?-i)d))/S
-Capturing subpattern count = 1
-No options
-Case state changes
-No first char
-No need char
-Starting character set: C a b c d
-
-/a$/
-Capturing subpattern count = 0
-No options
-First char = 'a'
-No need char
- a
- 0: a
- a\n
- 0: a
- *** Failers
-No match
- \Za
-No match
- \Za\n
-No match
-
-/a$/m
-Capturing subpattern count = 0
-Options: multiline
-First char = 'a'
-No need char
- a
- 0: a
- a\n
- 0: a
- \Za\n
- 0: a
- *** Failers
-No match
- \Za
-No match
-
-/\Aabc/m
-Capturing subpattern count = 0
-Options: anchored multiline
-No first char
-Need char = 'c'
-
-/^abc/m
-Capturing subpattern count = 0
-Options: multiline
-First char at start or follows \n
-Need char = 'c'
-
-/^((a+)(?U)([ab]+)(?-U)([bc]+)(\w*))/
-Capturing subpattern count = 5
-Options: anchored
-No first char
-Need char = 'a'
- aaaaabbbbbcccccdef
- 0: aaaaabbbbbcccccdef
- 1: aaaaabbbbbcccccdef
- 2: aaaaa
- 3: b
- 4: bbbbccccc
- 5: def
-
-/(?<=foo)[ab]/S
-Capturing subpattern count = 0
-No options
-No first char
-No need char
-Starting character set: a b
-
-/(?<!foo)(alpha|omega)/S
-Capturing subpattern count = 1
-No options
-No first char
-Need char = 'a'
-Starting character set: a o
-
-/(?!alphabet)[ab]/S
-Capturing subpattern count = 0
-No options
-No first char
-No need char
-Starting character set: a b
-
-/(?<=foo\n)^bar/m
-Capturing subpattern count = 0
-Options: multiline
-First char at start or follows \n
-Need char = 'r'
-
-/(?>^abc)/m
-Capturing subpattern count = 0
-Options: multiline
-First char at start or follows \n
-Need char = 'c'
- abc
- 0: abc
- def\nabc
- 0: abc
- *** Failers
-No match
- defabc
-No match
-
-/(?<=ab(c+)d)ef/
-Failed: lookbehind assertion is not fixed length at offset 11
-
-/(?<=ab(?<=c+)d)ef/
-Failed: lookbehind assertion is not fixed length at offset 12
-
-/(?<=ab(c|de)f)g/
-Failed: lookbehind assertion is not fixed length at offset 13
-
-/The next three are in testinput2 because they have variable length branches/
-Capturing subpattern count = 0
-No options
-First char = 'T'
-Need char = 's'
-
-/(?<=bullock|donkey)-cart/
-Capturing subpattern count = 0
-No options
-First char = '-'
-Need char = 't'
- the bullock-cart
- 0: -cart
- a donkey-cart race
- 0: -cart
- *** Failers
-No match
- cart
-No match
- horse-and-cart
-No match
-
-/(?<=ab(?i)x|y|z)/
-Capturing subpattern count = 0
-No options
-Case state changes
-No first char
-No need char
-
-/(?>.*)(?<=(abcd)|(xyz))/
-Capturing subpattern count = 2
-No options
-First char at start or follows \n
-No need char
- alphabetabcd
- 0: alphabetabcd
- 1: abcd
- endingxyz
- 0: endingxyz
- 1: <unset>
- 2: xyz
-
-/(?<=ab(?i)x(?-i)y|(?i)z|b)ZZ/
-Capturing subpattern count = 0
-No options
-Case state changes
-First char = 'Z'
-Need char = 'Z'
- abxyZZ
- 0: ZZ
- abXyZZ
- 0: ZZ
- ZZZ
- 0: ZZ
- zZZ
- 0: ZZ
- bZZ
- 0: ZZ
- BZZ
- 0: ZZ
- *** Failers
-No match
- ZZ
-No match
- abXYZZ
-No match
- zzz
-No match
- bzz
-No match
-
-/(?<!(foo)a)bar/
-Capturing subpattern count = 1
-No options
-First char = 'b'
-Need char = 'r'
- bar
- 0: bar
- foobbar
- 0: bar
- *** Failers
-No match
- fooabar
-No match
-
-/This one is here because Perl 5.005_02 doesn't fail it/
-Capturing subpattern count = 0
-No options
-First char = 'T'
-Need char = 't'
-
-/^(a)?(?(1)a|b)+$/
-Capturing subpattern count = 1
-Options: anchored
-No first char
-No need char
- *** Failers
-No match
- a
-No match
-
-/This one is here because I think Perl 5.005_02 gets the setting of $1 wrong/
-Capturing subpattern count = 0
-No options
-First char = 'T'
-Need char = 'g'
-
-/^(a\1?){4}$/
-Capturing subpattern count = 1
-Max back reference = 1
-Options: anchored
-No first char
-Need char = 'a'
- aaaaaa
- 0: aaaaaa
- 1: aa
-
-/These are syntax tests from Perl 5.005/
-Capturing subpattern count = 0
-No options
-First char = 'T'
-Need char = '5'
-
-/a[b-a]/
-Failed: range out of order in character class at offset 4
-
-/a[]b/
-Failed: missing terminating ] for character class at offset 4
-
-/a[/
-Failed: missing terminating ] for character class at offset 2
-
-/*a/
-Failed: nothing to repeat at offset 0
-
-/(*)b/
-Failed: nothing to repeat at offset 1
-
-/abc)/
-Failed: unmatched parentheses at offset 3
-
-/(abc/
-Failed: missing ) at offset 4
-
-/a**/
-Failed: nothing to repeat at offset 2
-
-/)(/
-Failed: unmatched parentheses at offset 0
-
-/\1/
-Failed: reference to non-existent subpattern at offset 2
-
-/\2/
-Failed: reference to non-existent subpattern at offset 2
-
-/(a)|\2/
-Failed: reference to non-existent subpattern at offset 6
-
-/a[b-a]/i
-Failed: range out of order in character class at offset 4
-
-/a[]b/i
-Failed: missing terminating ] for character class at offset 4
-
-/a[/i
-Failed: missing terminating ] for character class at offset 2
-
-/*a/i
-Failed: nothing to repeat at offset 0
-
-/(*)b/i
-Failed: nothing to repeat at offset 1
-
-/abc)/i
-Failed: unmatched parentheses at offset 3
-
-/(abc/i
-Failed: missing ) at offset 4
-
-/a**/i
-Failed: nothing to repeat at offset 2
-
-/)(/i
-Failed: unmatched parentheses at offset 0
-
-/:(?:/
-Failed: missing ) at offset 4
-
-/(?<%)b/
-Failed: unrecognized character after (?< at offset 3
-
-/a(?{)b/
-Failed: unrecognized character after (? at offset 3
-
-/a(?{{})b/
-Failed: unrecognized character after (? at offset 3
-
-/a(?{}})b/
-Failed: unrecognized character after (? at offset 3
-
-/a(?{"{"})b/
-Failed: unrecognized character after (? at offset 3
-
-/a(?{"{"}})b/
-Failed: unrecognized character after (? at offset 3
-
-/(?(1?)a|b)/
-Failed: malformed number after (?( at offset 4
-
-/(?(1)a|b|c)/
-Failed: conditional group contains more than two branches at offset 10
-
-/[a[:xyz:/
-Failed: missing terminating ] for character class at offset 8
-
-/(?<=x+)y/
-Failed: lookbehind assertion is not fixed length at offset 6
-
-/a{37,17}/
-Failed: numbers out of order in {} quantifier at offset 7
-
-/abc/\
-Failed: \ at end of pattern at offset 4
-
-/abc/\P
-Failed: POSIX code 9: bad escape sequence at offset 4
-
-/abc/\i
-Failed: \ at end of pattern at offset 4
-
-/(a)bc(d)/
-Capturing subpattern count = 2
-No options
-First char = 'a'
-Need char = 'd'
- abcd
- 0: abcd
- 1: a
- 2: d
- abcd\C2
- 0: abcd
- 1: a
- 2: d
- 2C d (1)
- abcd\C5
- 0: abcd
- 1: a
- 2: d
-copy substring 5 failed -7
-
-/(.{20})/
-Capturing subpattern count = 1
-No options
-No first char
-No need char
- abcdefghijklmnopqrstuvwxyz
- 0: abcdefghijklmnopqrst
- 1: abcdefghijklmnopqrst
- abcdefghijklmnopqrstuvwxyz\C1
- 0: abcdefghijklmnopqrst
- 1: abcdefghijklmnopqrst
-copy substring 1 failed -6
- abcdefghijklmnopqrstuvwxyz\G1
- 0: abcdefghijklmnopqrst
- 1: abcdefghijklmnopqrst
- 1G abcdefghijklmnopqrst (20)
-
-/(.{15})/
-Capturing subpattern count = 1
-No options
-No first char
-No need char
- abcdefghijklmnopqrstuvwxyz
- 0: abcdefghijklmno
- 1: abcdefghijklmno
- abcdefghijklmnopqrstuvwxyz\C1\G1
- 0: abcdefghijklmno
- 1: abcdefghijklmno
- 1C abcdefghijklmno (15)
- 1G abcdefghijklmno (15)
-
-/(.{16})/
-Capturing subpattern count = 1
-No options
-No first char
-No need char
- abcdefghijklmnopqrstuvwxyz
- 0: abcdefghijklmnop
- 1: abcdefghijklmnop
- abcdefghijklmnopqrstuvwxyz\C1\G1\L
- 0: abcdefghijklmnop
- 1: abcdefghijklmnop
-copy substring 1 failed -6
- 1G abcdefghijklmnop (16)
- 0L abcdefghijklmnop
- 1L abcdefghijklmnop
-
-/^(a|(bc))de(f)/
-Capturing subpattern count = 3
-Options: anchored
-No first char
-Need char = 'f'
- adef\G1\G2\G3\G4\L
- 0: adef
- 1: a
- 2: <unset>
- 3: f
- 1G a (1)
- 2G (0)
- 3G f (1)
-get substring 4 failed -7
- 0L adef
- 1L a
- 2L
- 3L f
- bcdef\G1\G2\G3\G4\L
- 0: bcdef
- 1: bc
- 2: bc
- 3: f
- 1G bc (2)
- 2G bc (2)
- 3G f (1)
-get substring 4 failed -7
- 0L bcdef
- 1L bc
- 2L bc
- 3L f
- adefghijk\C0
- 0: adef
- 1: a
- 2: <unset>
- 3: f
- 0C adef (4)
-
-/^abc\00def/
-Capturing subpattern count = 0
-Options: anchored
-No first char
-Need char = 'f'
- abc\00def\L\C0
- 0: abc\x00def
- 0C abc (7)
- 0L abc
-
-/word ((?:[a-zA-Z0-9]+ )((?:[a-zA-Z0-9]+ )((?:[a-zA-Z0-9]+ )((?:[a-zA-Z0-9]+
-)((?:[a-zA-Z0-9]+ )((?:[a-zA-Z0-9]+ )((?:[a-zA-Z0-9]+ )((?:[a-zA-Z0-9]+
-)?)?)?)?)?)?)?)?)?otherword/M
-Memory allocation (code space): 428
-Capturing subpattern count = 8
-No options
-First char = 'w'
-Need char = 'd'
-
-/.*X/D
-------------------------------------------------------------------
- 0 8 Bra 0
- 3 Any*
- 5 1 X
- 8 8 Ket
- 11 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-No options
-First char at start or follows \n
-Need char = 'X'
-
-/.*X/Ds
-------------------------------------------------------------------
- 0 8 Bra 0
- 3 Any*
- 5 1 X
- 8 8 Ket
- 11 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: anchored dotall
-No first char
-Need char = 'X'
-
-/(.*X|^B)/D
-------------------------------------------------------------------
- 0 21 Bra 0
- 3 8 Bra 1
- 6 Any*
- 8 1 X
- 11 7 Alt
- 14 ^
- 15 1 B
- 18 15 Ket
- 21 21 Ket
- 24 End
-------------------------------------------------------------------
-Capturing subpattern count = 1
-No options
-First char at start or follows \n
-No need char
-
-/(.*X|^B)/Ds
-------------------------------------------------------------------
- 0 21 Bra 0
- 3 8 Bra 1
- 6 Any*
- 8 1 X
- 11 7 Alt
- 14 ^
- 15 1 B
- 18 15 Ket
- 21 21 Ket
- 24 End
-------------------------------------------------------------------
-Capturing subpattern count = 1
-Options: anchored dotall
-No first char
-No need char
-
-/(?s)(.*X|^B)/D
-------------------------------------------------------------------
- 0 21 Bra 0
- 3 8 Bra 1
- 6 Any*
- 8 1 X
- 11 7 Alt
- 14 ^
- 15 1 B
- 18 15 Ket
- 21 21 Ket
- 24 End
-------------------------------------------------------------------
-Capturing subpattern count = 1
-Options: anchored dotall
-No first char
-No need char
-
-/(?s:.*X|^B)/D
-------------------------------------------------------------------
- 0 27 Bra 0
- 3 10 Bra 0
- 6 04 Opt
- 8 Any*
- 10 1 X
- 13 9 Alt
- 16 04 Opt
- 18 ^
- 19 1 B
- 22 19 Ket
- 25 00 Opt
- 27 27 Ket
- 30 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-No options
-First char at start or follows \n
-No need char
-
-/\Biss\B/+
-Capturing subpattern count = 0
-No options
-First char = 'i'
-Need char = 's'
- Mississippi
- 0: iss
- 0+ issippi
-
-/\Biss\B/+P
- Mississippi
- 0: iss
- 0+ issippi
-
-/iss/G+
-Capturing subpattern count = 0
-No options
-First char = 'i'
-Need char = 's'
- Mississippi
- 0: iss
- 0+ issippi
- 0: iss
- 0+ ippi
-
-/\Biss\B/G+
-Capturing subpattern count = 0
-No options
-First char = 'i'
-Need char = 's'
- Mississippi
- 0: iss
- 0+ issippi
-
-/\Biss\B/g+
-Capturing subpattern count = 0
-No options
-First char = 'i'
-Need char = 's'
- Mississippi
- 0: iss
- 0+ issippi
- 0: iss
- 0+ ippi
- *** Failers
-No match
- Mississippi\A
-No match
-
-/(?<=[Ms])iss/g+
-Capturing subpattern count = 0
-No options
-First char = 'i'
-Need char = 's'
- Mississippi
- 0: iss
- 0+ issippi
- 0: iss
- 0+ ippi
-
-/(?<=[Ms])iss/G+
-Capturing subpattern count = 0
-No options
-First char = 'i'
-Need char = 's'
- Mississippi
- 0: iss
- 0+ issippi
-
-/^iss/g+
-Capturing subpattern count = 0
-Options: anchored
-No first char
-Need char = 's'
- ississippi
- 0: iss
- 0+ issippi
-
-/.*iss/g+
-Capturing subpattern count = 0
-No options
-First char at start or follows \n
-Need char = 's'
- abciss\nxyzisspqr
- 0: abciss
- 0+ \x0axyzisspqr
- 0: xyziss
- 0+ pqr
-
-/.i./+g
-Capturing subpattern count = 0
-No options
-No first char
-Need char = 'i'
- Mississippi
- 0: Mis
- 0+ sissippi
- 0: sis
- 0+ sippi
- 0: sip
- 0+ pi
- Mississippi\A
- 0: Mis
- 0+ sissippi
- 0: sis
- 0+ sippi
- 0: sip
- 0+ pi
- Missouri river
- 0: Mis
- 0+ souri river
- 0: ri
- 0+ river
- 0: riv
- 0+ er
- Missouri river\A
- 0: Mis
- 0+ souri river
-
-/^.is/+g
-Capturing subpattern count = 0
-Options: anchored
-No first char
-Need char = 's'
- Mississippi
- 0: Mis
- 0+ sissippi
-
-/^ab\n/g+
-Capturing subpattern count = 0
-Options: anchored
-No first char
-Need char = 10
- ab\nab\ncd
- 0: ab\x0a
- 0+ ab\x0acd
-
-/^ab\n/mg+
-Capturing subpattern count = 0
-Options: multiline
-First char at start or follows \n
-Need char = 10
- ab\nab\ncd
- 0: ab\x0a
- 0+ ab\x0acd
- 0: ab\x0a
- 0+ cd
-
-/abc/
-Capturing subpattern count = 0
-No options
-First char = 'a'
-Need char = 'c'
-
-/abc|bac/
-Capturing subpattern count = 0
-No options
-No first char
-Need char = 'c'
-
-/(abc|bac)/
-Capturing subpattern count = 1
-No options
-No first char
-Need char = 'c'
-
-/(abc|(c|dc))/
-Capturing subpattern count = 2
-No options
-No first char
-Need char = 'c'
-
-/(abc|(d|de)c)/
-Capturing subpattern count = 2
-No options
-No first char
-Need char = 'c'
-
-/a*/
-Capturing subpattern count = 0
-No options
-No first char
-No need char
-
-/a+/
-Capturing subpattern count = 0
-No options
-First char = 'a'
-No need char
-
-/(baa|a+)/
-Capturing subpattern count = 1
-No options
-No first char
-Need char = 'a'
-
-/a{0,3}/
-Capturing subpattern count = 0
-No options
-No first char
-No need char
-
-/baa{3,}/
-Capturing subpattern count = 0
-No options
-First char = 'b'
-Need char = 'a'
-
-/"([^\\"]+|\\.)*"/
-Capturing subpattern count = 1
-No options
-First char = '"'
-Need char = '"'
-
-/(abc|ab[cd])/
-Capturing subpattern count = 1
-No options
-First char = 'a'
-No need char
-
-/(a|.)/
-Capturing subpattern count = 1
-No options
-No first char
-No need char
-
-/a|ba|\w/
-Capturing subpattern count = 0
-No options
-No first char
-No need char
-
-/abc(?=pqr)/
-Capturing subpattern count = 0
-No options
-First char = 'a'
-Need char = 'r'
-
-/...(?<=abc)/
-Capturing subpattern count = 0
-No options
-No first char
-No need char
-
-/abc(?!pqr)/
-Capturing subpattern count = 0
-No options
-First char = 'a'
-Need char = 'c'
-
-/ab./
-Capturing subpattern count = 0
-No options
-First char = 'a'
-Need char = 'b'
-
-/ab[xyz]/
-Capturing subpattern count = 0
-No options
-First char = 'a'
-Need char = 'b'
-
-/abc*/
-Capturing subpattern count = 0
-No options
-First char = 'a'
-Need char = 'b'
-
-/ab.c*/
-Capturing subpattern count = 0
-No options
-First char = 'a'
-Need char = 'b'
-
-/a.c*/
-Capturing subpattern count = 0
-No options
-First char = 'a'
-No need char
-
-/.c*/
-Capturing subpattern count = 0
-No options
-No first char
-No need char
-
-/ac*/
-Capturing subpattern count = 0
-No options
-First char = 'a'
-No need char
-
-/(a.c*|b.c*)/
-Capturing subpattern count = 1
-No options
-No first char
-No need char
-
-/a.c*|aba/
-Capturing subpattern count = 0
-No options
-First char = 'a'
-No need char
-
-/.+a/
-Capturing subpattern count = 0
-No options
-No first char
-Need char = 'a'
-
-/(?=abcda)a.*/
-Capturing subpattern count = 0
-No options
-First char = 'a'
-Need char = 'a'
-
-/(?=a)a.*/
-Capturing subpattern count = 0
-No options
-First char = 'a'
-No need char
-
-/a(b)*/
-Capturing subpattern count = 1
-No options
-First char = 'a'
-No need char
-
-/a\d*/
-Capturing subpattern count = 0
-No options
-First char = 'a'
-No need char
-
-/ab\d*/
-Capturing subpattern count = 0
-No options
-First char = 'a'
-Need char = 'b'
-
-/a(\d)*/
-Capturing subpattern count = 1
-No options
-First char = 'a'
-No need char
-
-/abcde{0,0}/
-Capturing subpattern count = 0
-No options
-First char = 'a'
-Need char = 'd'
-
-/ab\d+/
-Capturing subpattern count = 0
-No options
-First char = 'a'
-Need char = 'b'
-
-/a(?(1)b)/
-Capturing subpattern count = 0
-No options
-First char = 'a'
-No need char
-
-/a(?(1)bag|big)/
-Capturing subpattern count = 0
-No options
-First char = 'a'
-Need char = 'g'
-
-/a(?(1)bag|big)*/
-Capturing subpattern count = 0
-No options
-First char = 'a'
-No need char
-
-/a(?(1)bag|big)+/
-Capturing subpattern count = 0
-No options
-First char = 'a'
-Need char = 'g'
-
-/a(?(1)b..|b..)/
-Capturing subpattern count = 0
-No options
-First char = 'a'
-Need char = 'b'
-
-/ab\d{0}e/
-Capturing subpattern count = 0
-No options
-First char = 'a'
-Need char = 'e'
-
-/a?b?/
-Capturing subpattern count = 0
-No options
-No first char
-No need char
- a
- 0: a
- b
- 0: b
- ab
- 0: ab
- \
- 0:
- *** Failers
- 0:
- \N
-No match
-
-/|-/
-Capturing subpattern count = 0
-No options
-No first char
-No need char
- abcd
- 0:
- -abc
- 0:
- \Nab-c
- 0: -
- *** Failers
- 0:
- \Nabc
-No match
-
-/a*(b+)(z)(z)/P
- aaaabbbbzzzz
- 0: aaaabbbbzz
- 1: bbbb
- 2: z
- 3: z
- aaaabbbbzzzz\O0
- aaaabbbbzzzz\O1
- 0: aaaabbbbzz
- aaaabbbbzzzz\O2
- 0: aaaabbbbzz
- 1: bbbb
- aaaabbbbzzzz\O3
- 0: aaaabbbbzz
- 1: bbbb
- 2: z
- aaaabbbbzzzz\O4
- 0: aaaabbbbzz
- 1: bbbb
- 2: z
- 3: z
- aaaabbbbzzzz\O5
- 0: aaaabbbbzz
- 1: bbbb
- 2: z
- 3: z
-
-/^.?abcd/S
-Capturing subpattern count = 0
-Options: anchored
-No first char
-Need char = 'd'
-Study returned NULL
-
-/\( # ( at start
- (?: # Non-capturing bracket
- (?>[^()]+) # Either a sequence of non-brackets (no backtracking)
- | # Or
- (?R) # Recurse - i.e. nested bracketed string
- )* # Zero or more contents
- \) # Closing )
- /x
-Capturing subpattern count = 0
-Options: extended
-First char = '('
-Need char = ')'
- (abcd)
- 0: (abcd)
- (abcd)xyz
- 0: (abcd)
- xyz(abcd)
- 0: (abcd)
- (ab(xy)cd)pqr
- 0: (ab(xy)cd)
- (ab(xycd)pqr
- 0: (xycd)
- () abc ()
- 0: ()
- 12(abcde(fsh)xyz(foo(bar))lmno)89
- 0: (abcde(fsh)xyz(foo(bar))lmno)
- *** Failers
-No match
- abcd
-No match
- abcd)
-No match
- (abcd
-No match
-
-/\( ( (?>[^()]+) | (?R) )* \) /xg
-Capturing subpattern count = 1
-Options: extended
-First char = '('
-Need char = ')'
- (ab(xy)cd)pqr
- 0: (ab(xy)cd)
- 1: cd
- 1(abcd)(x(y)z)pqr
- 0: (abcd)
- 1: abcd
- 0: (x(y)z)
- 1: z
-
-/\( (?: (?>[^()]+) | (?R) ) \) /x
-Capturing subpattern count = 0
-Options: extended
-First char = '('
-Need char = ')'
- (abcd)
- 0: (abcd)
- (ab(xy)cd)
- 0: (xy)
- (a(b(c)d)e)
- 0: (c)
- ((ab))
- 0: ((ab))
- *** Failers
-No match
- ()
-No match
-
-/\( (?: (?>[^()]+) | (?R) )? \) /x
-Capturing subpattern count = 0
-Options: extended
-First char = '('
-Need char = ')'
- ()
- 0: ()
- 12(abcde(fsh)xyz(foo(bar))lmno)89
- 0: (fsh)
-
-/\( ( (?>[^()]+) | (?R) )* \) /x
-Capturing subpattern count = 1
-Options: extended
-First char = '('
-Need char = ')'
- (ab(xy)cd)
- 0: (ab(xy)cd)
- 1: cd
-
-/\( ( ( (?>[^()]+) | (?R) )* ) \) /x
-Capturing subpattern count = 2
-Options: extended
-First char = '('
-Need char = ')'
- (ab(xy)cd)
- 0: (ab(xy)cd)
- 1: ab(xy)cd
- 2: cd
-
-/\( (123)? ( ( (?>[^()]+) | (?R) )* ) \) /x
-Capturing subpattern count = 3
-Options: extended
-First char = '('
-Need char = ')'
- (ab(xy)cd)
- 0: (ab(xy)cd)
- 1: <unset>
- 2: ab(xy)cd
- 3: cd
- (123ab(xy)cd)
- 0: (123ab(xy)cd)
- 1: 123
- 2: ab(xy)cd
- 3: cd
-
-/\( ( (123)? ( (?>[^()]+) | (?R) )* ) \) /x
-Capturing subpattern count = 3
-Options: extended
-First char = '('
-Need char = ')'
- (ab(xy)cd)
- 0: (ab(xy)cd)
- 1: ab(xy)cd
- 2: <unset>
- 3: cd
- (123ab(xy)cd)
- 0: (123ab(xy)cd)
- 1: 123ab(xy)cd
- 2: 123
- 3: cd
-
-/\( (((((((((( ( (?>[^()]+) | (?R) )* )))))))))) \) /x
-Capturing subpattern count = 11
-Options: extended
-First char = '('
-Need char = ')'
- (ab(xy)cd)
- 0: (ab(xy)cd)
- 1: ab(xy)cd
- 2: ab(xy)cd
- 3: ab(xy)cd
- 4: ab(xy)cd
- 5: ab(xy)cd
- 6: ab(xy)cd
- 7: ab(xy)cd
- 8: ab(xy)cd
- 9: ab(xy)cd
-10: ab(xy)cd
-11: cd
-
-/\( ( ( (?>[^()<>]+) | ((?>[^()]+)) | (?R) )* ) \) /x
-Capturing subpattern count = 3
-Options: extended
-First char = '('
-Need char = ')'
- (abcd(xyz<p>qrs)123)
- 0: (abcd(xyz<p>qrs)123)
- 1: abcd(xyz<p>qrs)123
- 2: 123
- 3: <unset>
-
-/\( ( ( (?>[^()]+) | ((?R)) )* ) \) /x
-Capturing subpattern count = 3
-Options: extended
-First char = '('
-Need char = ')'
- (ab(cd)ef)
- 0: (ab(cd)ef)
- 1: ab(cd)ef
- 2: ef
- 3: (cd)
- (ab(cd(ef)gh)ij)
- 0: (ab(cd(ef)gh)ij)
- 1: ab(cd(ef)gh)ij
- 2: ij
- 3: (cd(ef)gh)
-
-/^[[:alnum:]]/D
-------------------------------------------------------------------
- 0 37 Bra 0
- 3 ^
- 4 [0-9A-Za-z]
- 37 37 Ket
- 40 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: anchored
-No first char
-No need char
-
-/^[[:alpha:]]/D
-------------------------------------------------------------------
- 0 37 Bra 0
- 3 ^
- 4 [A-Za-z]
- 37 37 Ket
- 40 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: anchored
-No first char
-No need char
-
-/^[[:ascii:]]/D
-------------------------------------------------------------------
- 0 37 Bra 0
- 3 ^
- 4 [\x00-\x7f]
- 37 37 Ket
- 40 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: anchored
-No first char
-No need char
-
-/^[[:blank:]]/D
-------------------------------------------------------------------
- 0 37 Bra 0
- 3 ^
- 4 [\x09 ]
- 37 37 Ket
- 40 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: anchored
-No first char
-No need char
-
-/^[[:cntrl:]]/D
-------------------------------------------------------------------
- 0 37 Bra 0
- 3 ^
- 4 [\x00-\x1f\x7f]
- 37 37 Ket
- 40 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: anchored
-No first char
-No need char
-
-/^[[:digit:]]/D
-------------------------------------------------------------------
- 0 37 Bra 0
- 3 ^
- 4 [0-9]
- 37 37 Ket
- 40 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: anchored
-No first char
-No need char
-
-/^[[:graph:]]/D
-------------------------------------------------------------------
- 0 37 Bra 0
- 3 ^
- 4 [!-~]
- 37 37 Ket
- 40 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: anchored
-No first char
-No need char
-
-/^[[:lower:]]/D
-------------------------------------------------------------------
- 0 37 Bra 0
- 3 ^
- 4 [a-z]
- 37 37 Ket
- 40 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: anchored
-No first char
-No need char
-
-/^[[:print:]]/D
-------------------------------------------------------------------
- 0 37 Bra 0
- 3 ^
- 4 [ -~]
- 37 37 Ket
- 40 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: anchored
-No first char
-No need char
-
-/^[[:punct:]]/D
-------------------------------------------------------------------
- 0 37 Bra 0
- 3 ^
- 4 [!-/:-@[-`{-~]
- 37 37 Ket
- 40 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: anchored
-No first char
-No need char
-
-/^[[:space:]]/D
-------------------------------------------------------------------
- 0 37 Bra 0
- 3 ^
- 4 [\x09-\x0d ]
- 37 37 Ket
- 40 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: anchored
-No first char
-No need char
-
-/^[[:upper:]]/D
-------------------------------------------------------------------
- 0 37 Bra 0
- 3 ^
- 4 [A-Z]
- 37 37 Ket
- 40 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: anchored
-No first char
-No need char
-
-/^[[:xdigit:]]/D
-------------------------------------------------------------------
- 0 37 Bra 0
- 3 ^
- 4 [0-9A-Fa-f]
- 37 37 Ket
- 40 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: anchored
-No first char
-No need char
-
-/^[[:word:]]/D
-------------------------------------------------------------------
- 0 37 Bra 0
- 3 ^
- 4 [0-9A-Z_a-z]
- 37 37 Ket
- 40 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: anchored
-No first char
-No need char
-
-/^[[:^cntrl:]]/D
-------------------------------------------------------------------
- 0 37 Bra 0
- 3 ^
- 4 [ -~\x80-\xff]
- 37 37 Ket
- 40 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: anchored
-No first char
-No need char
-
-/^[12[:^digit:]]/D
-------------------------------------------------------------------
- 0 37 Bra 0
- 3 ^
- 4 [\x00-/1-2:-\xff]
- 37 37 Ket
- 40 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: anchored
-No first char
-No need char
-
-/^[[:^blank:]]/D
-------------------------------------------------------------------
- 0 37 Bra 0
- 3 ^
- 4 [\x00-\x08\x0a-\x1f!-\xff]
- 37 37 Ket
- 40 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: anchored
-No first char
-No need char
-
-/[01[:alpha:]%]/D
-------------------------------------------------------------------
- 0 36 Bra 0
- 3 [%0-1A-Za-z]
- 36 36 Ket
- 39 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-No options
-No first char
-No need char
-
-/[[.ch.]]/
-Failed: POSIX collating elements are not supported at offset 1
-
-/[[=ch=]]/
-Failed: POSIX collating elements are not supported at offset 1
-
-/[[:rhubarb:]]/
-Failed: unknown POSIX class name at offset 3
-
-/[[:upper:]]/i
-Capturing subpattern count = 0
-Options: caseless
-No first char
-No need char
- A
- 0: A
- a
- 0: a
-
-/[[:lower:]]/i
-Capturing subpattern count = 0
-Options: caseless
-No first char
-No need char
- A
- 0: A
- a
- 0: a
-
-/((?-i)[[:lower:]])[[:lower:]]/i
-Capturing subpattern count = 1
-Options: caseless
-Case state changes
-No first char
-No need char
- ab
- 0: ab
- 1: a
- aB
- 0: aB
- 1: a
- *** Failers
- 0: ai
- 1: a
- Ab
-No match
- AB
-No match
-
-/[\200-\410]/
-Failed: range out of order in character class at offset 9
-
-/^(?(0)f|b)oo/
-Failed: invalid condition (?(0) at offset 5
-
-/This one's here because of the large output vector needed/
-Capturing subpattern count = 0
-No options
-First char = 'T'
-Need char = 'd'
-
-/(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s
|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\d+(?:\s|$))(\w+)\s+(\270)/
-Capturing subpattern count = 271
-Max back reference = 270
-No options
-No first char
-No need char
- \O900 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 ABC ABC
- 0: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 ABC ABC
- 1: 1
- 2: 2
- 3: 3
- 4: 4
- 5: 5
- 6: 6
- 7: 7
- 8: 8
- 9: 9
-10: 10
-11: 11
-12: 12
-13: 13
-14: 14
-15: 15
-16: 16
-17: 17
-18: 18
-19: 19
-20: 20
-21: 21
-22: 22
-23: 23
-24: 24
-25: 25
-26: 26
-27: 27
-28: 28
-29: 29
-30: 30
-31: 31
-32: 32
-33: 33
-34: 34
-35: 35
-36: 36
-37: 37
-38: 38
-39: 39
-40: 40
-41: 41
-42: 42
-43: 43
-44: 44
-45: 45
-46: 46
-47: 47
-48: 48
-49: 49
-50: 50
-51: 51
-52: 52
-53: 53
-54: 54
-55: 55
-56: 56
-57: 57
-58: 58
-59: 59
-60: 60
-61: 61
-62: 62
-63: 63
-64: 64
-65: 65
-66: 66
-67: 67
-68: 68
-69: 69
-70: 70
-71: 71
-72: 72
-73: 73
-74: 74
-75: 75
-76: 76
-77: 77
-78: 78
-79: 79
-80: 80
-81: 81
-82: 82
-83: 83
-84: 84
-85: 85
-86: 86
-87: 87
-88: 88
-89: 89
-90: 90
-91: 91
-92: 92
-93: 93
-94: 94
-95: 95
-96: 96
-97: 97
-98: 98
-99: 99
-100: 100
-101: 101
-102: 102
-103: 103
-104: 104
-105: 105
-106: 106
-107: 107
-108: 108
-109: 109
-110: 110
-111: 111
-112: 112
-113: 113
-114: 114
-115: 115
-116: 116
-117: 117
-118: 118
-119: 119
-120: 120
-121: 121
-122: 122
-123: 123
-124: 124
-125: 125
-126: 126
-127: 127
-128: 128
-129: 129
-130: 130
-131: 131
-132: 132
-133: 133
-134: 134
-135: 135
-136: 136
-137: 137
-138: 138
-139: 139
-140: 140
-141: 141
-142: 142
-143: 143
-144: 144
-145: 145
-146: 146
-147: 147
-148: 148
-149: 149
-150: 150
-151: 151
-152: 152
-153: 153
-154: 154
-155: 155
-156: 156
-157: 157
-158: 158
-159: 159
-160: 160
-161: 161
-162: 162
-163: 163
-164: 164
-165: 165
-166: 166
-167: 167
-168: 168
-169: 169
-170: 170
-171: 171
-172: 172
-173: 173
-174: 174
-175: 175
-176: 176
-177: 177
-178: 178
-179: 179
-180: 180
-181: 181
-182: 182
-183: 183
-184: 184
-185: 185
-186: 186
-187: 187
-188: 188
-189: 189
-190: 190
-191: 191
-192: 192
-193: 193
-194: 194
-195: 195
-196: 196
-197: 197
-198: 198
-199: 199
-200: 200
-201: 201
-202: 202
-203: 203
-204: 204
-205: 205
-206: 206
-207: 207
-208: 208
-209: 209
-210: 210
-211: 211
-212: 212
-213: 213
-214: 214
-215: 215
-216: 216
-217: 217
-218: 218
-219: 219
-220: 220
-221: 221
-222: 222
-223: 223
-224: 224
-225: 225
-226: 226
-227: 227
-228: 228
-229: 229
-230: 230
-231: 231
-232: 232
-233: 233
-234: 234
-235: 235
-236: 236
-237: 237
-238: 238
-239: 239
-240: 240
-241: 241
-242: 242
-243: 243
-244: 244
-245: 245
-246: 246
-247: 247
-248: 248
-249: 249
-250: 250
-251: 251
-252: 252
-253: 253
-254: 254
-255: 255
-256: 256
-257: 257
-258: 258
-259: 259
-260: 260
-261: 261
-262: 262
-263: 263
-264: 264
-265: 265
-266: 266
-267: 267
-268: 268
-269: 269
-270: ABC
-271: ABC
-
-/This one's here because Perl does this differently and PCRE can't at present/
-Capturing subpattern count = 0
-No options
-First char = 'T'
-Need char = 't'
-
-/(main(O)?)+/
-Capturing subpattern count = 2
-No options
-First char = 'm'
-Need char = 'n'
- mainmain
- 0: mainmain
- 1: main
- mainOmain
- 0: mainOmain
- 1: main
- 2: O
-
-/These are all cases where Perl does it differently (nested captures)/
-Capturing subpattern count = 1
-No options
-First char = 'T'
-Need char = 's'
-
-/^(a(b)?)+$/
-Capturing subpattern count = 2
-Options: anchored
-No first char
-Need char = 'a'
- aba
- 0: aba
- 1: a
- 2: b
-
-/^(aa(bb)?)+$/
-Capturing subpattern count = 2
-Options: anchored
-No first char
-Need char = 'a'
- aabbaa
- 0: aabbaa
- 1: aa
- 2: bb
-
-/^(aa|aa(bb))+$/
-Capturing subpattern count = 2
-Options: anchored
-No first char
-Need char = 'a'
- aabbaa
- 0: aabbaa
- 1: aa
- 2: bb
-
-/^(aa(bb)??)+$/
-Capturing subpattern count = 2
-Options: anchored
-No first char
-Need char = 'a'
- aabbaa
- 0: aabbaa
- 1: aa
- 2: bb
-
-/^(?:aa(bb)?)+$/
-Capturing subpattern count = 1
-Options: anchored
-No first char
-Need char = 'a'
- aabbaa
- 0: aabbaa
- 1: bb
-
-/^(aa(b(b))?)+$/
-Capturing subpattern count = 3
-Options: anchored
-No first char
-Need char = 'a'
- aabbaa
- 0: aabbaa
- 1: aa
- 2: bb
- 3: b
-
-/^(?:aa(b(b))?)+$/
-Capturing subpattern count = 2
-Options: anchored
-No first char
-Need char = 'a'
- aabbaa
- 0: aabbaa
- 1: bb
- 2: b
-
-/^(?:aa(b(?:b))?)+$/
-Capturing subpattern count = 1
-Options: anchored
-No first char
-Need char = 'a'
- aabbaa
- 0: aabbaa
- 1: bb
-
-/^(?:aa(bb(?:b))?)+$/
-Capturing subpattern count = 1
-Options: anchored
-No first char
-Need char = 'a'
- aabbbaa
- 0: aabbbaa
- 1: bbb
-
-/^(?:aa(b(?:bb))?)+$/
-Capturing subpattern count = 1
-Options: anchored
-No first char
-Need char = 'a'
- aabbbaa
- 0: aabbbaa
- 1: bbb
-
-/^(?:aa(?:b(b))?)+$/
-Capturing subpattern count = 1
-Options: anchored
-No first char
-Need char = 'a'
- aabbaa
- 0: aabbaa
- 1: b
-
-/^(?:aa(?:b(bb))?)+$/
-Capturing subpattern count = 1
-Options: anchored
-No first char
-Need char = 'a'
- aabbbaa
- 0: aabbbaa
- 1: bb
-
-/^(aa(b(bb))?)+$/
-Capturing subpattern count = 3
-Options: anchored
-No first char
-Need char = 'a'
- aabbbaa
- 0: aabbbaa
- 1: aa
- 2: bbb
- 3: bb
-
-/^(aa(bb(bb))?)+$/
-Capturing subpattern count = 3
-Options: anchored
-No first char
-Need char = 'a'
- aabbbbaa
- 0: aabbbbaa
- 1: aa
- 2: bbbb
- 3: bb
-
-/--------------------------------------------------------------------/
-Capturing subpattern count = 0
-No options
-First char = '-'
-Need char = '-'
-
-/#/xMD
-Memory allocation (code space): 7
-------------------------------------------------------------------
- 0 3 Bra 0
- 3 3 Ket
- 6 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: extended
-No first char
-No need char
-
-/a#/xMD
-Memory allocation (code space): 13
-------------------------------------------------------------------
- 0 6 Bra 0
- 3 1 a
- 6 6 Ket
- 9 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: extended
-First char = 'a'
-No need char
-
-/[\s]/D
-------------------------------------------------------------------
- 0 36 Bra 0
- 3 [\x09-\x0a\x0c-\x0d ]
- 36 36 Ket
- 39 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-No options
-No first char
-No need char
-
-/[\S]/D
-------------------------------------------------------------------
- 0 36 Bra 0
- 3 [\x00-\x08\x0b\x0e-\x1f!-\xff]
- 36 36 Ket
- 39 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-No options
-No first char
-No need char
-
-/a(?i)b/D
-------------------------------------------------------------------
- 0 11 Bra 0
- 3 1 a
- 6 01 Opt
- 8 1 b
- 11 11 Ket
- 14 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-No options
-Case state changes
-First char = 'a'
-Need char = 'b' (caseless)
- ab
- 0: ab
- aB
- 0: aB
- *** Failers
-No match
- AB
-No match
-
-/(a(?i)b)/D
-------------------------------------------------------------------
- 0 19 Bra 0
- 3 11 Bra 1
- 6 1 a
- 9 01 Opt
- 11 1 b
- 14 11 Ket
- 17 00 Opt
- 19 19 Ket
- 22 End
-------------------------------------------------------------------
-Capturing subpattern count = 1
-No options
-Case state changes
-First char = 'a'
-Need char = 'b' (caseless)
- ab
- 0: ab
- 1: ab
- aB
- 0: aB
- 1: aB
- *** Failers
-No match
- AB
-No match
-
-/ (?i)abc/xD
-------------------------------------------------------------------
- 0 8 Bra 0
- 3 3 abc
- 8 8 Ket
- 11 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: caseless extended
-First char = 'a' (caseless)
-Need char = 'c' (caseless)
-
-/#this is a comment
- (?i)abc/xD
-------------------------------------------------------------------
- 0 8 Bra 0
- 3 3 abc
- 8 8 Ket
- 11 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: caseless extended
-First char = 'a' (caseless)
-Need char = 'c' (caseless)
-
-/123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890/D
-------------------------------------------------------------------
- 0 307 Bra 0
- 3 250 1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890
-255 50 12345678901234567890123456789012345678901234567890
-307 307 Ket
-310 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-No options
-First char = '1'
-Need char = '0'
-
-/\Q123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890/D
-------------------------------------------------------------------
- 0 307 Bra 0
- 3 250 1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890
-255 50 12345678901234567890123456789012345678901234567890
-307 307 Ket
-310 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-No options
-First char = '1'
-Need char = '0'
-
-/\Q\E/D
-------------------------------------------------------------------
- 0 3 Bra 0
- 3 3 Ket
- 6 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-No options
-No first char
-No need char
- \
- 0:
-
-/\Q\Ex/D
-------------------------------------------------------------------
- 0 6 Bra 0
- 3 1 x
- 6 6 Ket
- 9 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-No options
-First char = 'x'
-No need char
-
-/ \Q\E/D
-------------------------------------------------------------------
- 0 6 Bra 0
- 3 1
- 6 6 Ket
- 9 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-No options
-First char = ' '
-No need char
-
-/a\Q\E/D
-------------------------------------------------------------------
- 0 6 Bra 0
- 3 1 a
- 6 6 Ket
- 9 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-No options
-First char = 'a'
-No need char
- abc
- 0: a
- bca
- 0: a
- bac
- 0: a
-
-/a\Q\Eb/D
-------------------------------------------------------------------
- 0 9 Bra 0
- 3 1 a
- 6 1 b
- 9 9 Ket
- 12 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-No options
-First char = 'a'
-Need char = 'b'
- abc
- 0: ab
-
-/\Q\Eabc/D
-------------------------------------------------------------------
- 0 8 Bra 0
- 3 3 abc
- 8 8 Ket
- 11 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-No options
-First char = 'a'
-Need char = 'c'
-
-/x*+\w/D
-------------------------------------------------------------------
- 0 12 Bra 0
- 3 5 Once
- 6 x*
- 8 5 Ket
- 11 \w
- 12 12 Ket
- 15 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-No options
-No first char
-No need char
- ****Failers
- 0: F
- xxxxx
-No match
-
-/x?+/D
-------------------------------------------------------------------
- 0 11 Bra 0
- 3 5 Once
- 6 x?
- 8 5 Ket
- 11 11 Ket
- 14 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-No options
-No first char
-No need char
-
-/x++/D
-------------------------------------------------------------------
- 0 11 Bra 0
- 3 5 Once
- 6 x+
- 8 5 Ket
- 11 11 Ket
- 14 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-No options
-First char = 'x'
-No need char
-
-/x{1,3}+/D
-------------------------------------------------------------------
- 0 16 Bra 0
- 3 10 Once
- 6 1 x
- 9 x{,2}
- 13 10 Ket
- 16 16 Ket
- 19 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-No options
-First char = 'x'
-No need char
-
-/(x)*+/D
-------------------------------------------------------------------
- 0 19 Bra 0
- 3 13 Once
- 6 Brazero
- 7 6 Bra 1
- 10 1 x
- 13 6 KetRmax
- 16 13 Ket
- 19 19 Ket
- 22 End
-------------------------------------------------------------------
-Capturing subpattern count = 1
-No options
-No first char
-No need char
-
-/^(\w++|\s++)*$/
-Capturing subpattern count = 1
-Options: anchored
-No first char
-No need char
- now is the time for all good men to come to the aid of the party
- 0: now is the time for all good men to come to the aid of the party
- 1: party
- *** Failers
-No match
- this is not a line with only words and spaces!
-No match
-
-/(\d++)(\w)/
-Capturing subpattern count = 2
-No options
-No first char
-No need char
- 12345a
- 0: 12345a
- 1: 12345
- 2: a
- *** Failers
-No match
- 12345+
-No match
-
-/a++b/
-Capturing subpattern count = 0
-No options
-First char = 'a'
-Need char = 'b'
- aaab
- 0: aaab
-
-/(a++b)/
-Capturing subpattern count = 1
-No options
-First char = 'a'
-Need char = 'b'
- aaab
- 0: aaab
- 1: aaab
-
-/(a++)b/
-Capturing subpattern count = 1
-No options
-First char = 'a'
-Need char = 'b'
- aaab
- 0: aaab
- 1: aaa
-
-/([^()]++|\([^()]*\))+/
-Capturing subpattern count = 1
-No options
-No first char
-No need char
- ((abc(ade)ufh()()x
- 0: abc(ade)ufh()()x
- 1: x
-
-/\(([^()]++|\([^()]+\))+\)/
-Capturing subpattern count = 1
-No options
-First char = '('
-Need char = ')'
- (abc)
- 0: (abc)
- 1: abc
- (abc(def)xyz)
- 0: (abc(def)xyz)
- 1: xyz
- *** Failers
-No match
- ((()aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-No match
-
-/(abc){1,3}+/D
-------------------------------------------------------------------
- 0 50 Bra 0
- 3 44 Once
- 6 8 Bra 1
- 9 3 abc
- 14 8 Ket
- 17 Brazero
- 18 26 Bra 0
- 21 8 Bra 1
- 24 3 abc
- 29 8 Ket
- 32 Brazero
- 33 8 Bra 1
- 36 3 abc
- 41 8 Ket
- 44 26 Ket
- 47 44 Ket
- 50 50 Ket
- 53 End
-------------------------------------------------------------------
-Capturing subpattern count = 1
-No options
-First char = 'a'
-Need char = 'c'
-
-/a+?+/
-Failed: nothing to repeat at offset 3
-
-/a{2,3}?+b/
-Failed: nothing to repeat at offset 7
-
-/(?U)a+?+/
-Failed: nothing to repeat at offset 7
-
-/a{2,3}?+b/U
-Failed: nothing to repeat at offset 7
-
-/x(?U)a++b/D
-------------------------------------------------------------------
- 0 17 Bra 0
- 3 1 x
- 6 5 Once
- 9 a+
- 11 5 Ket
- 14 1 b
- 17 17 Ket
- 20 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-No options
-First char = 'x'
-Need char = 'b'
- xaaaab
- 0: xaaaab
-
-/(?U)xa++b/D
-------------------------------------------------------------------
- 0 17 Bra 0
- 3 1 x
- 6 5 Once
- 9 a+
- 11 5 Ket
- 14 1 b
- 17 17 Ket
- 20 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: ungreedy
-First char = 'x'
-Need char = 'b'
- xaaaab
- 0: xaaaab
-
-/^((a+)(?U)([ab]+)(?-U)([bc]+)(\w*))/D
-------------------------------------------------------------------
- 0 106 Bra 0
- 3 ^
- 4 99 Bra 1
- 7 5 Bra 2
- 10 a+
- 12 5 Ket
- 15 37 Bra 3
- 18 [a-b]+?
- 52 37 Ket
- 55 37 Bra 4
- 58 [b-c]+
- 92 37 Ket
- 95 5 Bra 5
- 98 \w*
-100 5 Ket
-103 99 Ket
-106 106 Ket
-109 End
-------------------------------------------------------------------
-Capturing subpattern count = 5
-Options: anchored
-No first char
-Need char = 'a'
-
-/^x(?U)a+b/D
-------------------------------------------------------------------
- 0 12 Bra 0
- 3 ^
- 4 1 x
- 7 a+?
- 9 1 b
- 12 12 Ket
- 15 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: anchored
-No first char
-Need char = 'b'
-
-/^x(?U)(a+)b/D
-------------------------------------------------------------------
- 0 18 Bra 0
- 3 ^
- 4 1 x
- 7 5 Bra 1
- 10 a+?
- 12 5 Ket
- 15 1 b
- 18 18 Ket
- 21 End
-------------------------------------------------------------------
-Capturing subpattern count = 1
-Options: anchored
-No first char
-Need char = 'b'
-
-/[.x.]/
-Failed: POSIX collating elements are not supported at offset 0
-
-/[=x=]/
-Failed: POSIX collating elements are not supported at offset 0
-
-/[:x:]/
-Failed: POSIX named classes are supported only within a class at offset 0
-
-/\l/
-Failed: PCRE does not support \L, \l, \N, \P, \p, \U, \u, or \X at offset 1
-
-/\L/
-Failed: PCRE does not support \L, \l, \N, \P, \p, \U, \u, or \X at offset 1
-
-/\N{name}/
-Failed: PCRE does not support \L, \l, \N, \P, \p, \U, \u, or \X at offset 1
-
-/\pP/
-Failed: PCRE does not support \L, \l, \N, \P, \p, \U, \u, or \X at offset 1
-
-/\PP/
-Failed: PCRE does not support \L, \l, \N, \P, \p, \U, \u, or \X at offset 1
-
-/\p{prop}/
-Failed: PCRE does not support \L, \l, \N, \P, \p, \U, \u, or \X at offset 1
-
-/\P{prop}/
-Failed: PCRE does not support \L, \l, \N, \P, \p, \U, \u, or \X at offset 1
-
-/\u/
-Failed: PCRE does not support \L, \l, \N, \P, \p, \U, \u, or \X at offset 1
-
-/\U/
-Failed: PCRE does not support \L, \l, \N, \P, \p, \U, \u, or \X at offset 1
-
-/\X/
-Failed: PCRE does not support \L, \l, \N, \P, \p, \U, \u, or \X at offset 1
-
-/[/
-Failed: missing terminating ] for character class at offset 1
-
-/[a-/
-Failed: missing terminating ] for character class at offset 3
-
-/[[:space:]/
-Failed: missing terminating ] for character class at offset 10
-
-/[\s]/DM
-Memory allocation (code space): 40
-------------------------------------------------------------------
- 0 36 Bra 0
- 3 [\x09-\x0a\x0c-\x0d ]
- 36 36 Ket
- 39 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-No options
-No first char
-No need char
-
-/[[:space:]]/DM
-Memory allocation (code space): 40
-------------------------------------------------------------------
- 0 36 Bra 0
- 3 [\x09-\x0d ]
- 36 36 Ket
- 39 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-No options
-No first char
-No need char
-
-/[[:space:]abcde]/DM
-Memory allocation (code space): 40
-------------------------------------------------------------------
- 0 36 Bra 0
- 3 [\x09-\x0d a-e]
- 36 36 Ket
- 39 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-No options
-No first char
-No need char
-
-/< (?: (?(R) \d++ | [^<>]*+) | (?R)) * >/x
-Capturing subpattern count = 0
-Options: extended
-First char = '<'
-Need char = '>'
- <>
- 0: <>
- <abcd>
- 0: <abcd>
- <abc <123> hij>
- 0: <abc <123> hij>
- <abc <def> hij>
- 0: <def>
- <abc<>def>
- 0: <abc<>def>
- <abc<>
- 0: <>
- *** Failers
-No match
- <abc
-No match
-
-|8J\$WE\<\.rX\+ix\[d1b\!H\#\?vV0vrK\:ZH1\=2M\>iV\;\?aPhFB\<\*vW\@QW\@sO9\}cfZA\-i\'w\%hKd6gt1UJP\,15_\#QY\$M\^Mss_U\/\]\&LK9\[5vQub\^w\[KDD\<EjmhUZ\?\.akp2dF\>qmj\;2\}YWFdYx\.Ap\]hjCPTP\(n28k\+3\;o\&WXqs\/gOXdr\$\:r\'do0\;b4c\(f_Gr\=\"\\4\)\[01T7ajQJvL\$W\~mL_sS\/4h\:x\*\[ZN\=KLs\&L5zX\/\/\>it\,o\:aU\(\;Z\>pW\&T7oP\'2K\^E\:x9\'c\[\%z\-\,64JQ5AeH_G\#KijUKghQw\^\\vea3a\?kka_G\$8\#\`\*kynsxzBLru\'\]k_\[7FrVx\}\^\=\$blx\>s\-N\%j\;D\*aZDnsw\:YKZ\%Q\.Kne9\#hP\?\+b3\(SOvL\,\^\;\&u5\@\?5C5Bhb\=m\-vEh_L15Jl\]U\)0RP6\{q\%L\^_z5E\'Dw6X\b|DM
-Memory allocation (code space): 421
-------------------------------------------------------------------
- 0 417 Bra 0
- 3 250 8J$WE<.rX+ix[d1b!H#?vV0vrK:ZH1=2M>iV;?aPhFB<*vW@QW@sO9}cfZA-i'w%hKd6gt1UJP,15_#QY$M^Mss_U/]&LK9[5vQub^w[KDD<EjmhUZ?.akp2dF>qmj;2}YWFdYx.Ap]hjCPTP(n28k+3;o&WXqs/gOXdr$:r'do0;b4c(f_Gr="\4)[01T7ajQJvL$W~mL_sS/4h:x*[ZN=KLs&L5zX//>it,o:aU(;Z>pW&T7oP'2K^E:
-255 159 x9'c[%z-,64JQ5AeH_G#KijUKghQw^\vea3a?kka_G$8#`*kynsxzBLru']k_[7FrVx}^=$blx>s-N%j;D*aZDnsw:YKZ%Q.Kne9#hP?+b3(SOvL,^;&u5@?5C5Bhb=m-vEh_L15Jl]U)0RP6{q%L^_z5E'Dw6X
-416 \b
-417 417 Ket
-420 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-No options
-First char = '8'
-Need char = 'X'
-
-|\$\<\.X\+ix\[d1b\!H\#\?vV0vrK\:ZH1\=2M\>iV\;\?aPhFB\<\*vW\@QW\@sO9\}cfZA\-i\'w\%hKd6gt1UJP\,15_\#QY\$M\^Mss_U\/\]\&LK9\[5vQub\^w\[KDD\<EjmhUZ\?\.akp2dF\>qmj\;2\}YWFdYx\.Ap\]hjCPTP\(n28k\+3\;o\&WXqs\/gOXdr\$\:r\'do0\;b4c\(f_Gr\=\"\\4\)\[01T7ajQJvL\$W\~mL_sS\/4h\:x\*\[ZN\=KLs\&L5zX\/\/\>it\,o\:aU\(\;Z\>pW\&T7oP\'2K\^E\:x9\'c\[\%z\-\,64JQ5AeH_G\#KijUKghQw\^\\vea3a\?kka_G\$8\#\`\*kynsxzBLru\'\]k_\[7FrVx\}\^\=\$blx\>s\-N\%j\;D\*aZDnsw\:YKZ\%Q\.Kne9\#hP\?\+b3\(SOvL\,\^\;\&u5\@\?5C5Bhb\=m\-vEh_L15Jl\]U\)0RP6\{q\%L\^_z5E\'Dw6X\b|DM
-Memory allocation (code space): 416
-------------------------------------------------------------------
- 0 412 Bra 0
- 3 250 $<.X+ix[d1b!H#?vV0vrK:ZH1=2M>iV;?aPhFB<*vW@QW@sO9}cfZA-i'w%hKd6gt1UJP,15_#QY$M^Mss_U/]&LK9[5vQub^w[KDD<EjmhUZ?.akp2dF>qmj;2}YWFdYx.Ap]hjCPTP(n28k+3;o&WXqs/gOXdr$:r'do0;b4c(f_Gr="\4)[01T7ajQJvL$W~mL_sS/4h:x*[ZN=KLs&L5zX//>it,o:aU(;Z>pW&T7oP'2K^E:x9'c[
-255 154 %z-,64JQ5AeH_G#KijUKghQw^\vea3a?kka_G$8#`*kynsxzBLru']k_[7FrVx}^=$blx>s-N%j;D*aZDnsw:YKZ%Q.Kne9#hP?+b3(SOvL,^;&u5@?5C5Bhb=m-vEh_L15Jl]U)0RP6{q%L^_z5E'Dw6X
-411 \b
-412 412 Ket
-415 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-No options
-First char = '$'
-Need char = 'X'
-
-/(.*)\d+\1/I
-Capturing subpattern count = 1
-Max back reference = 1
-No options
-No first char
-No need char
-
-/(.*)\d+/I
-Capturing subpattern count = 1
-No options
-First char at start or follows \n
-No need char
-
-/(.*)\d+\1/Is
-Capturing subpattern count = 1
-Max back reference = 1
-Options: dotall
-No first char
-No need char
-
-/(.*)\d+/Is
-Capturing subpattern count = 1
-Options: anchored dotall
-No first char
-No need char
-
-/(.*(xyz))\d+\2/I
-Capturing subpattern count = 2
-Max back reference = 2
-No options
-No first char
-Need char = 'z'
-
-/((.*))\d+\1/I
-Capturing subpattern count = 2
-Max back reference = 1
-No options
-No first char
-No need char
- abc123bc
- 0: bc123bc
- 1: bc
- 2: bc
-
-/a[b]/I
-Capturing subpattern count = 0
-No options
-First char = 'a'
-Need char = 'b'
-
-/(?=a).*/I
-Capturing subpattern count = 0
-No options
-First char = 'a'
-No need char
-
-/(?=abc).xyz/iI
-Capturing subpattern count = 0
-Options: caseless
-First char = 'a' (caseless)
-Need char = 'z' (caseless)
-
-/(?=abc)(?i).xyz/I
-Capturing subpattern count = 0
-No options
-Case state changes
-First char = 'a'
-Need char = 'z' (caseless)
-
-/(?=a)(?=b)/I
-Capturing subpattern count = 0
-No options
-First char = 'a'
-No need char
-
-/(?=.)a/I
-Capturing subpattern count = 0
-No options
-First char = 'a'
-No need char
-
-/((?=abcda)a)/I
-Capturing subpattern count = 1
-No options
-First char = 'a'
-Need char = 'a'
-
-/((?=abcda)ab)/I
-Capturing subpattern count = 1
-No options
-First char = 'a'
-Need char = 'b'
-
-/()a/I
-Capturing subpattern count = 1
-No options
-No first char
-Need char = 'a'
-
-/(?(1)ab|ac)/I
-Capturing subpattern count = 0
-No options
-First char = 'a'
-No need char
-
-/(?(1)abz|acz)/I
-Capturing subpattern count = 0
-No options
-First char = 'a'
-Need char = 'z'
-
-/(?(1)abz)/I
-Capturing subpattern count = 0
-No options
-No first char
-No need char
-
-/(?(1)abz)123/I
-Capturing subpattern count = 0
-No options
-No first char
-Need char = '3'
-
-/(a)+/I
-Capturing subpattern count = 1
-No options
-First char = 'a'
-No need char
-
-/(a){2,3}/I
-Capturing subpattern count = 1
-No options
-First char = 'a'
-Need char = 'a'
-
-/(a)*/I
-Capturing subpattern count = 1
-No options
-No first char
-No need char
-
-/[a]/I
-Capturing subpattern count = 0
-No options
-First char = 'a'
-No need char
-
-/[ab]/I
-Capturing subpattern count = 0
-No options
-No first char
-No need char
-
-/[ab]/IS
-Capturing subpattern count = 0
-No options
-No first char
-No need char
-Starting character set: a b
-
-/[^a]/I
-Capturing subpattern count = 0
-No options
-No first char
-No need char
-
-/\d456/I
-Capturing subpattern count = 0
-No options
-No first char
-Need char = '6'
-
-/\d456/IS
-Capturing subpattern count = 0
-No options
-No first char
-Need char = '6'
-Starting character set: 0 1 2 3 4 5 6 7 8 9
-
-/a^b/I
-Capturing subpattern count = 0
-No options
-First char = 'a'
-Need char = 'b'
-
-/^a/mI
-Capturing subpattern count = 0
-Options: multiline
-First char at start or follows \n
-Need char = 'a'
- abcde
- 0: a
- xy\nabc
- 0: a
- *** Failers
-No match
- xyabc
-No match
-
-/c|abc/I
-Capturing subpattern count = 0
-No options
-No first char
-Need char = 'c'
-
-/(?i)[ab]/IS
-Capturing subpattern count = 0
-Options: caseless
-No first char
-No need char
-Starting character set: A B a b
-
-/[ab](?i)cd/IS
-Capturing subpattern count = 0
-No options
-Case state changes
-No first char
-Need char = 'd' (caseless)
-Starting character set: a b
-
-/abc(?C)def/
-Capturing subpattern count = 0
-No options
-First char = 'a'
-Need char = 'f'
- abcdef
---->abcdef
- 0 ^ ^
- 0: abcdef
- 1234abcdef
---->1234abcdef
- 0 ^ ^
- 0: abcdef
- *** Failers
-No match
- abcxyz
-No match
- abcxyzf
---->abcxyzf
- 0 ^ ^
-No match
-
-/abc(?C)de(?C1)f/
-Capturing subpattern count = 0
-No options
-First char = 'a'
-Need char = 'f'
- 123abcdef
---->123abcdef
- 0 ^ ^
- 1 ^ ^
- 0: abcdef
-
-/(?C1)\dabc(?C2)def/
-Capturing subpattern count = 0
-No options
-No first char
-Need char = 'f'
- 1234abcdef
---->1234abcdef
- 1 ^
- 1 ^
- 1 ^
- 1 ^
- 2 ^ ^
- 0: 4abcdef
- *** Failers
-No match
- abcdef
---->abcdef
- 1 ^
- 1 ^
- 1 ^
- 1 ^
- 1 ^
- 1 ^
-No match
-
-/(?C255)ab/
-Capturing subpattern count = 0
-No options
-First char = 'a'
-Need char = 'b'
-
-/(?C256)ab/
-Failed: number after (?C is > 255 at offset 6
-
-/(?Cab)xx/
-Failed: closing ) for (?C expected at offset 3
-
-/(?C12vr)x/
-Failed: closing ) for (?C expected at offset 5
-
-/abc(?C)def/
-Capturing subpattern count = 0
-No options
-First char = 'a'
-Need char = 'f'
- *** Failers
-No match
- \x83\x0\x61bcdef
---->\x83\x00abcdef
- 0 ^ ^
- 0: abcdef
-
-/(abc)(?C)de(?C1)f/
-Capturing subpattern count = 1
-No options
-First char = 'a'
-Need char = 'f'
- 123abcdef
---->123abcdef
- 0 ^ ^
- 1 ^ ^
- 0: abcdef
- 1: abc
- 123abcdef\C+
-Callout 0: last capture = 1
- 0: <unset>
- 1: abc
---->123abcdef
- ^ ^
-Callout 1: last capture = 1
- 0: <unset>
- 1: abc
---->123abcdef
- ^ ^
- 0: abcdef
- 1: abc
- 123abcdef\C-
- 0: abcdef
- 1: abc
- *** Failers
-No match
- 123abcdef\C!1
---->123abcdef
- 0 ^ ^
- 1 ^ ^
-No match
-
-/(?C0)(abc(?C1))*/
-Capturing subpattern count = 1
-No options
-No first char
-No need char
- abcabcabc
---->abcabcabc
- 0 ^
- 1 ^ ^
- 1 ^ ^
- 1 ^ ^
- 0: abcabcabc
- 1: abc
- abcabc\C!1!3
---->abcabc
- 0 ^
- 1 ^ ^
- 1 ^ ^
- 0: abcabc
- 1: abc
- *** Failers
---->*** Failers
- 0 ^
- 0:
- abcabcabc\C!1!3
---->abcabcabc
- 0 ^
- 1 ^ ^
- 1 ^ ^
- 1 ^ ^
- 0: abcabc
- 1: abc
-
-/(\d{3}(?C))*/
-Capturing subpattern count = 1
-No options
-No first char
-No need char
- 123\C+
-Callout 0: last capture = -1
- 0: <unset>
---->123
- ^ ^
- 0: 123
- 1: 123
- 123456\C+
-Callout 0: last capture = -1
- 0: <unset>
---->123456
- ^ ^
-Callout 0: last capture = 1
- 0: <unset>
- 1: 123
---->123456
- ^ ^
- 0: 123456
- 1: 456
- 123456789\C+
-Callout 0: last capture = -1
- 0: <unset>
---->123456789
- ^ ^
-Callout 0: last capture = 1
- 0: <unset>
- 1: 123
---->123456789
- ^ ^
-Callout 0: last capture = 1
- 0: <unset>
- 1: 456
---->123456789
- ^ ^
- 0: 123456789
- 1: 789
-
-/((xyz)(?C)p|(?C1)xyzabc)/
-Capturing subpattern count = 2
-No options
-First char = 'x'
-No need char
- xyzabc\C+
-Callout 0: last capture = 2
- 0: <unset>
- 1: <unset>
- 2: xyz
---->xyzabc
- ^ ^
-Callout 1: last capture = -1
- 0: <unset>
---->xyzabc
- ^
- 0: xyzabc
- 1: xyzabc
-
-/(X)((xyz)(?C)p|(?C1)xyzabc)/
-Capturing subpattern count = 3
-No options
-First char = 'X'
-Need char = 'x'
- Xxyzabc\C+
-Callout 0: last capture = 3
- 0: <unset>
- 1: X
- 2: <unset>
- 3: xyz
---->Xxyzabc
- ^ ^
-Callout 1: last capture = 1
- 0: <unset>
- 1: X
---->Xxyzabc
- ^^
- 0: Xxyzabc
- 1: X
- 2: xyzabc
-
-/(?=(abc))(?C)abcdef/
-Capturing subpattern count = 1
-No options
-First char = 'a'
-Need char = 'f'
- abcdef\C+
-Callout 0: last capture = 1
- 0: <unset>
- 1: abc
---->abcdef
- ^
- 0: abcdef
- 1: abc
-
-/(?!(abc)(?C1)d)(?C2)abcxyz/
-Capturing subpattern count = 1
-No options
-First char = 'a'
-Need char = 'z'
- abcxyz\C+
-Callout 1: last capture = 1
- 0: <unset>
- 1: abc
---->abcxyz
- ^ ^
-Callout 2: last capture = -1
- 0: <unset>
---->abcxyz
- ^
- 0: abcxyz
-
-/(?<=(abc)(?C))xyz/
-Capturing subpattern count = 1
-No options
-First char = 'x'
-Need char = 'z'
- abcxyz\C+
-Callout 0: last capture = 1
- 0: <unset>
- 1: abc
---->abcxyz
- ^
- 0: xyz
- 1: abc
-
-/(?C)abc/
-Capturing subpattern count = 0
-No options
-First char = 'a'
-Need char = 'c'
-
-/(?C)^abc/
-Capturing subpattern count = 0
-Options: anchored
-No first char
-Need char = 'c'
-
-/(?C)a|b/S
-Capturing subpattern count = 0
-No options
-No first char
-No need char
-Starting character set: a b
-
-/(?R)/
-Failed: recursive call could loop indefinitely at offset 3
-
-/(a|(?R))/
-Failed: recursive call could loop indefinitely at offset 6
-
-/(ab|(bc|(de|(?R))))/
-Failed: recursive call could loop indefinitely at offset 15
-
-/x(ab|(bc|(de|(?R))))/
-Capturing subpattern count = 3
-No options
-First char = 'x'
-No need char
- xab
- 0: xab
- 1: ab
- xbc
- 0: xbc
- 1: bc
- 2: bc
- xde
- 0: xde
- 1: de
- 2: de
- 3: de
- xxab
- 0: xxab
- 1: xab
- 2: xab
- 3: xab
- xxxab
- 0: xxxab
- 1: xxab
- 2: xxab
- 3: xxab
- *** Failers
-No match
- xyab
-No match
-
-/(ab|(bc|(de|(?1))))/
-Failed: recursive call could loop indefinitely at offset 15
-
-/x(ab|(bc|(de|(?1)x)x)x)/
-Failed: recursive call could loop indefinitely at offset 16
-
-/^([^()]|\((?1)*\))*$/
-Capturing subpattern count = 1
-Options: anchored
-No first char
-No need char
- abc
- 0: abc
- 1: c
- a(b)c
- 0: a(b)c
- 1: c
- a(b(c))d
- 0: a(b(c))d
- 1: d
- *** Failers)
-No match
- a(b(c)d
-No match
-
-/^>abc>([^()]|\((?1)*\))*<xyz<$/
-Capturing subpattern count = 1
-Options: anchored
-No first char
-Need char = '<'
- >abc>123<xyz<
- 0: >abc>123<xyz<
- 1: 3
- >abc>1(2)3<xyz<
- 0: >abc>1(2)3<xyz<
- 1: 3
- >abc>(1(2)3)<xyz<
- 0: >abc>(1(2)3)<xyz<
- 1: (1(2)3)
-
-/(a(?1)b)/D
-------------------------------------------------------------------
- 0 18 Bra 0
- 3 12 Bra 1
- 6 1 a
- 9 3 Recurse
- 12 1 b
- 15 12 Ket
- 18 18 Ket
- 21 End
-------------------------------------------------------------------
-Capturing subpattern count = 1
-No options
-First char = 'a'
-Need char = 'b'
-
-/(a(?1)+b)/D
-------------------------------------------------------------------
- 0 24 Bra 0
- 3 18 Bra 1
- 6 1 a
- 9 6 Bra 0
- 12 3 Recurse
- 15 6 KetRmax
- 18 1 b
- 21 18 Ket
- 24 24 Ket
- 27 End
-------------------------------------------------------------------
-Capturing subpattern count = 1
-No options
-First char = 'a'
-Need char = 'b'
-
-/^\W*(?:((.)\W*(?1)\W*\2|)|((.)\W*(?3)\W*\4|\W*.\W*))\W*$/i
-Capturing subpattern count = 4
-Max back reference = 4
-Options: anchored caseless
-No first char
-No need char
- 1221
- 0: 1221
- 1: 1221
- 2: 1
- Satan, oscillate my metallic sonatas!
- 0: Satan, oscillate my metallic sonatas!
- 1: <unset>
- 2: <unset>
- 3: Satan, oscillate my metallic sonatas
- 4: S
- A man, a plan, a canal: Panama!
- 0: A man, a plan, a canal: Panama!
- 1: <unset>
- 2: <unset>
- 3: A man, a plan, a canal: Panama
- 4: A
- Able was I ere I saw Elba.
- 0: Able was I ere I saw Elba.
- 1: <unset>
- 2: <unset>
- 3: Able was I ere I saw Elba
- 4: A
- *** Failers
-No match
- The quick brown fox
-No match
-
-/^(\d+|\((?1)([+*-])(?1)\)|-(?1))$/
-Capturing subpattern count = 2
-Options: anchored
-No first char
-No need char
- 12
- 0: 12
- 1: 12
- (((2+2)*-3)-7)
- 0: (((2+2)*-3)-7)
- 1: (((2+2)*-3)-7)
- 2: -
- -12
- 0: -12
- 1: -12
- *** Failers
-No match
- ((2+2)*-3)-7)
-No match
-
-/^(x(y|(?1){2})z)/
-Capturing subpattern count = 2
-Options: anchored
-No first char
-Need char = 'z'
- xyz
- 0: xyz
- 1: xyz
- 2: y
- xxyzxyzz
- 0: xxyzxyzz
- 1: xxyzxyzz
- 2: xyzxyz
- *** Failers
-No match
- xxyzz
-No match
- xxyzxyzxyzz
-No match
-
-/((< (?: (?(R) \d++ | [^<>]*+) | (?2)) * >))/x
-Capturing subpattern count = 2
-Options: extended
-First char = '<'
-Need char = '>'
- <>
- 0: <>
- 1: <>
- 2: <>
- <abcd>
- 0: <abcd>
- 1: <abcd>
- 2: <abcd>
- <abc <123> hij>
- 0: <abc <123> hij>
- 1: <abc <123> hij>
- 2: <abc <123> hij>
- <abc <def> hij>
- 0: <def>
- 1: <def>
- 2: <def>
- <abc<>def>
- 0: <abc<>def>
- 1: <abc<>def>
- 2: <abc<>def>
- <abc<>
- 0: <>
- 1: <>
- 2: <>
- *** Failers
-No match
- <abc
-No match
-
-/(?1)/
-Failed: reference to non-existent subpattern at offset 3
-
-/((?2)(abc)/
-Failed: reference to non-existent subpattern at offset 4
-
-/^(abc)def(?1)/
-Capturing subpattern count = 1
-Options: anchored
-No first char
-Need char = 'f'
- abcdefabc
- 0: abcdefabc
- 1: abc
-
-/^(a|b|c)=(?1)+/
-Capturing subpattern count = 1
-Options: anchored
-No first char
-Need char = '='
- a=a
- 0: a=a
- 1: a
- a=b
- 0: a=b
- 1: a
- a=bc
- 0: a=bc
- 1: a
-
-/^(a|b|c)=((?1))+/
-Capturing subpattern count = 2
-Options: anchored
-No first char
-Need char = '='
- a=a
- 0: a=a
- 1: a
- 2: a
- a=b
- 0: a=b
- 1: a
- 2: b
- a=bc
- 0: a=bc
- 1: a
- 2: c
-
-/a(?P<name1>b|c)d(?P<longername2>e)/D
-------------------------------------------------------------------
- 0 33 Bra 0
- 3 1 a
- 6 6 Bra 1
- 9 1 b
- 12 6 Alt
- 15 1 c
- 18 12 Ket
- 21 1 d
- 24 6 Bra 2
- 27 1 e
- 30 6 Ket
- 33 33 Ket
- 36 End
-------------------------------------------------------------------
-Capturing subpattern count = 2
-Named capturing subpatterns:
- longername2 2
- name1 1
-No options
-First char = 'a'
-Need char = 'e'
- abde
- 0: abde
- 1: b
- 2: e
- acde
- 0: acde
- 1: c
- 2: e
-
-/(?:a(?P<c>c(?P<d>d)))(?P<a>a)/D
-------------------------------------------------------------------
- 0 39 Bra 0
- 3 24 Bra 0
- 6 1 a
- 9 15 Bra 1
- 12 1 c
- 15 6 Bra 2
- 18 1 d
- 21 6 Ket
- 24 15 Ket
- 27 24 Ket
- 30 6 Bra 3
- 33 1 a
- 36 6 Ket
- 39 39 Ket
- 42 End
-------------------------------------------------------------------
-Capturing subpattern count = 3
-Named capturing subpatterns:
- a 3
- c 1
- d 2
-No options
-First char = 'a'
-Need char = 'a'
-
-/(?P<a>a)...(?P=a)bbb(?P>a)d/D
-------------------------------------------------------------------
- 0 29 Bra 0
- 3 6 Bra 1
- 6 1 a
- 9 6 Ket
- 12 Any
- 13 Any
- 14 Any
- 15 \1
- 18 3 bbb
- 23 3 Recurse
- 26 1 d
- 29 29 Ket
- 32 End
-------------------------------------------------------------------
-Capturing subpattern count = 1
-Named capturing subpatterns:
- a 1
-No options
-First char = 'a'
-Need char = 'd'
-
-/ End of testinput2 /
-Capturing subpattern count = 0
-No options
-First char = ' '
-Need char = ' '
-
diff --git a/ext/pcre/pcrelib/testdata/testoutput3 b/ext/pcre/pcrelib/testdata/testoutput3
deleted file mode 100644
index 8cc3e8dc64..0000000000
--- a/ext/pcre/pcrelib/testdata/testoutput3
+++ /dev/null
@@ -1,116 +0,0 @@
-PCRE version 3.92 11-Sep-2002
-
-/^[\w]+/
- *** Failers
-No match
- École
-No match
-
-/^[\w]+/Lfr
- École
- 0: École
-
-/^[\w]+/
- *** Failers
-No match
- École
-No match
-
-/^[\W]+/
- École
- 0: \xc9
-
-/^[\W]+/Lfr
- *** Failers
- 0: ***
- École
-No match
-
-/[\b]/
- \b
- 0: \x08
- *** Failers
-No match
- a
-No match
-
-/[\b]/Lfr
- \b
- 0: \x08
- *** Failers
-No match
- a
-No match
-
-/^\w+/
- *** Failers
-No match
- École
-No match
-
-/^\w+/Lfr
- École
- 0: École
-
-/(.+)\b(.+)/
- École
- 0: \xc9cole
- 1: \xc9
- 2: cole
-
-/(.+)\b(.+)/Lfr
- *** Failers
- 0: *** Failers
- 1: ***
- 2: Failers
- École
-No match
-
-/École/i
- École
- 0: \xc9cole
- *** Failers
-No match
- école
-No match
-
-/École/iLfr
- École
- 0: École
- école
- 0: école
-
-/\w/IS
-Capturing subpattern count = 0
-No options
-No first char
-No need char
-Starting character set: 0 1 2 3 4 5 6 7 8 9 A B C D E F G H I J K L M N O P
- Q R S T U V W X Y Z _ a b c d e f g h i j k l m n o p q r s t u v w x y z
-
-/\w/ISLfr
-Capturing subpattern count = 0
-No options
-No first char
-No need char
-Starting character set: 0 1 2 3 4 5 6 7 8 9 A B C D E F G H I J K L M N O P
- Q R S T U V W X Y Z _ a b c d e f g h i j k l m n o p q r s t u v w x y z
- À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï Ð Ñ Ò Ó Ô Õ Ö Ø Ù Ú Û Ü Ý Þ ß à á â ã ä å
- æ ç è é ê ë ì í î ï ð ñ ò ó ô õ ö ø ù ú û ü ý þ ÿ
-
-/^[\xc8-\xc9]/iLfr
- École
- 0: É
- école
- 0: é
-
-/^[\xc8-\xc9]/Lfr
- École
- 0: É
- *** Failers
-No match
- école
-No match
-
-/ End of testinput3 /
-
diff --git a/ext/pcre/pcrelib/testdata/testoutput4 b/ext/pcre/pcrelib/testdata/testoutput4
deleted file mode 100644
index 3018c9baa7..0000000000
--- a/ext/pcre/pcrelib/testdata/testoutput4
+++ /dev/null
@@ -1,304 +0,0 @@
-PCRE version 3.92 11-Sep-2002
-
-/-- Do not use the \x{} construct except with patterns that have the --/
-/-- /8 option set, because PCRE doesn't recognize them as UTF-8 unless --/
-No match
-/-- that option is set. However, the latest Perls recognize them always. --/
-No match
-
-/a.b/8
- acb
- 0: acb
- a\x7fb
- 0: a\x{7f}b
- a\x{100}b
- 0: a\x{100}b
- *** Failers
-No match
- a\nb
-No match
-
-/a(.{3})b/8
- a\x{4000}xyb
- 0: a\x{4000}xyb
- 1: \x{4000}xy
- a\x{4000}\x7fyb
- 0: a\x{4000}\x{7f}yb
- 1: \x{4000}\x{7f}y
- a\x{4000}\x{100}yb
- 0: a\x{4000}\x{100}yb
- 1: \x{4000}\x{100}y
- *** Failers
-No match
- a\x{4000}b
-No match
- ac\ncb
-No match
-
-/a(.*?)(.)/
- a\xc0\x88b
- 0: a\xc0
- 1:
- 2: \xc0
-
-/a(.*?)(.)/8
- a\x{100}b
- 0: a\x{100}
- 1:
- 2: \x{100}
-
-/a(.*)(.)/
- a\xc0\x88b
- 0: a\xc0\x88b
- 1: \xc0\x88
- 2: b
-
-/a(.*)(.)/8
- a\x{100}b
- 0: a\x{100}b
- 1: \x{100}
- 2: b
-
-/a(.)(.)/
- a\xc0\x92bcd
- 0: a\xc0\x92
- 1: \xc0
- 2: \x92
-
-/a(.)(.)/8
- a\x{240}bcd
- 0: a\x{240}b
- 1: \x{240}
- 2: b
-
-/a(.?)(.)/
- a\xc0\x92bcd
- 0: a\xc0\x92
- 1: \xc0
- 2: \x92
-
-/a(.?)(.)/8
- a\x{240}bcd
- 0: a\x{240}b
- 1: \x{240}
- 2: b
-
-/a(.??)(.)/
- a\xc0\x92bcd
- 0: a\xc0
- 1:
- 2: \xc0
-
-/a(.??)(.)/8
- a\x{240}bcd
- 0: a\x{240}
- 1:
- 2: \x{240}
-
-/a(.{3})b/8
- a\x{1234}xyb
- 0: a\x{1234}xyb
- 1: \x{1234}xy
- a\x{1234}\x{4321}yb
- 0: a\x{1234}\x{4321}yb
- 1: \x{1234}\x{4321}y
- a\x{1234}\x{4321}\x{3412}b
- 0: a\x{1234}\x{4321}\x{3412}b
- 1: \x{1234}\x{4321}\x{3412}
- *** Failers
-No match
- a\x{1234}b
-No match
- ac\ncb
-No match
-
-/a(.{3,})b/8
- a\x{1234}xyb
- 0: a\x{1234}xyb
- 1: \x{1234}xy
- a\x{1234}\x{4321}yb
- 0: a\x{1234}\x{4321}yb
- 1: \x{1234}\x{4321}y
- a\x{1234}\x{4321}\x{3412}b
- 0: a\x{1234}\x{4321}\x{3412}b
- 1: \x{1234}\x{4321}\x{3412}
- axxxxbcdefghijb
- 0: axxxxbcdefghijb
- 1: xxxxbcdefghij
- a\x{1234}\x{4321}\x{3412}\x{3421}b
- 0: a\x{1234}\x{4321}\x{3412}\x{3421}b
- 1: \x{1234}\x{4321}\x{3412}\x{3421}
- *** Failers
-No match
- a\x{1234}b
-No match
-
-/a(.{3,}?)b/8
- a\x{1234}xyb
- 0: a\x{1234}xyb
- 1: \x{1234}xy
- a\x{1234}\x{4321}yb
- 0: a\x{1234}\x{4321}yb
- 1: \x{1234}\x{4321}y
- a\x{1234}\x{4321}\x{3412}b
- 0: a\x{1234}\x{4321}\x{3412}b
- 1: \x{1234}\x{4321}\x{3412}
- axxxxbcdefghijb
- 0: axxxxb
- 1: xxxx
- a\x{1234}\x{4321}\x{3412}\x{3421}b
- 0: a\x{1234}\x{4321}\x{3412}\x{3421}b
- 1: \x{1234}\x{4321}\x{3412}\x{3421}
- *** Failers
-No match
- a\x{1234}b
-No match
-
-/a(.{3,5})b/8
- a\x{1234}xyb
- 0: a\x{1234}xyb
- 1: \x{1234}xy
- a\x{1234}\x{4321}yb
- 0: a\x{1234}\x{4321}yb
- 1: \x{1234}\x{4321}y
- a\x{1234}\x{4321}\x{3412}b
- 0: a\x{1234}\x{4321}\x{3412}b
- 1: \x{1234}\x{4321}\x{3412}
- axxxxbcdefghijb
- 0: axxxxb
- 1: xxxx
- a\x{1234}\x{4321}\x{3412}\x{3421}b
- 0: a\x{1234}\x{4321}\x{3412}\x{3421}b
- 1: \x{1234}\x{4321}\x{3412}\x{3421}
- axbxxbcdefghijb
- 0: axbxxb
- 1: xbxx
- axxxxxbcdefghijb
- 0: axxxxxb
- 1: xxxxx
- *** Failers
-No match
- a\x{1234}b
-No match
- axxxxxxbcdefghijb
-No match
-
-/a(.{3,5}?)b/8
- a\x{1234}xyb
- 0: a\x{1234}xyb
- 1: \x{1234}xy
- a\x{1234}\x{4321}yb
- 0: a\x{1234}\x{4321}yb
- 1: \x{1234}\x{4321}y
- a\x{1234}\x{4321}\x{3412}b
- 0: a\x{1234}\x{4321}\x{3412}b
- 1: \x{1234}\x{4321}\x{3412}
- axxxxbcdefghijb
- 0: axxxxb
- 1: xxxx
- a\x{1234}\x{4321}\x{3412}\x{3421}b
- 0: a\x{1234}\x{4321}\x{3412}\x{3421}b
- 1: \x{1234}\x{4321}\x{3412}\x{3421}
- axbxxbcdefghijb
- 0: axbxxb
- 1: xbxx
- axxxxxbcdefghijb
- 0: axxxxxb
- 1: xxxxx
- *** Failers
-No match
- a\x{1234}b
-No match
- axxxxxxbcdefghijb
-No match
-
-/^[a\x{c0}]/8
- *** Failers
-No match
- \x{100}
-No match
-
-/(?<=aXb)cd/8
- aXbcd
- 0: cd
-
-/(?<=a\x{100}b)cd/8
- a\x{100}bcd
- 0: cd
-
-/(?<=a\x{100000}b)cd/8
- a\x{100000}bcd
- 0: cd
-
-/(?:\x{100}){3}b/8
- \x{100}\x{100}\x{100}b
- 0: \x{100}\x{100}\x{100}b
- *** Failers
-No match
- \x{100}\x{100}b
-No match
-
-/\x{ab}/8
- \x{ab}
- 0: \x{ab}
- \xc2\xab
- 0: \x{ab}
- *** Failers
-No match
- \x00{ab}
-No match
-
-/(?<=(.))X/8
- WXYZ
- 0: X
- 1: W
- \x{256}XYZ
- 0: X
- 1: \x{256}
- *** Failers
-No match
- XYZ
-No match
-
-/X(\C{3})/8
- X\x{1234}
- 0: X\x{1234}
- 1: \x{1234}
-
-/X(\C{4})/8
- X\x{1234}YZ
- 0: X\x{1234}Y
- 1: \x{1234}Y
-
-/X\C*/8
- XYZabcdce
- 0: XYZabcdce
-
-/X\C*?/8
- XYZabcde
- 0: X
-
-/X\C{3,5}/8
- Xabcdefg
- 0: Xabcde
- X\x{1234}
- 0: X\x{1234}
- X\x{1234}YZ
- 0: X\x{1234}YZ
- X\x{1234}\x{512}
- 0: X\x{1234}\x{512}
- X\x{1234}\x{512}YZ
- 0: X\x{1234}\x{512}
-
-/X\C{3,5}?/8
- Xabcdefg
- 0: Xabc
- X\x{1234}
- 0: X\x{1234}
- X\x{1234}YZ
- 0: X\x{1234}
- X\x{1234}\x{512}
- 0: X\x{1234}
-
-/ End of testinput4 /
-
diff --git a/ext/pcre/pcrelib/testdata/testoutput5 b/ext/pcre/pcrelib/testdata/testoutput5
deleted file mode 100644
index 01daca505e..0000000000
--- a/ext/pcre/pcrelib/testdata/testoutput5
+++ /dev/null
@@ -1,339 +0,0 @@
-PCRE version 3.92 11-Sep-2002
-
-/\x{100}/8DM
-Memory allocation (code space): 11
-------------------------------------------------------------------
- 0 7 Bra 0
- 3 2 \xc4\x80
- 7 7 Ket
- 10 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 196
-Need char = 128
-
-/\x{1000}/8DM
-Memory allocation (code space): 12
-------------------------------------------------------------------
- 0 8 Bra 0
- 3 3 \xe1\x80\x80
- 8 8 Ket
- 11 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 225
-Need char = 128
-
-/\x{10000}/8DM
-Memory allocation (code space): 13
-------------------------------------------------------------------
- 0 9 Bra 0
- 3 4 \xf0\x90\x80\x80
- 9 9 Ket
- 12 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 240
-Need char = 128
-
-/\x{100000}/8DM
-Memory allocation (code space): 13
-------------------------------------------------------------------
- 0 9 Bra 0
- 3 4 \xf4\x80\x80\x80
- 9 9 Ket
- 12 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 244
-Need char = 128
-
-/\x{1000000}/8DM
-Memory allocation (code space): 14
-------------------------------------------------------------------
- 0 10 Bra 0
- 3 5 \xf9\x80\x80\x80\x80
- 10 10 Ket
- 13 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 249
-Need char = 128
-
-/\x{4000000}/8DM
-Memory allocation (code space): 15
-------------------------------------------------------------------
- 0 11 Bra 0
- 3 6 \xfc\x84\x80\x80\x80\x80
- 11 11 Ket
- 14 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 252
-Need char = 128
-
-/\x{7fffFFFF}/8DM
-Memory allocation (code space): 15
-------------------------------------------------------------------
- 0 11 Bra 0
- 3 6 \xfd\xbf\xbf\xbf\xbf\xbf
- 11 11 Ket
- 14 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 253
-Need char = 191
-
-/[\x{ff}]/8DM
-Memory allocation (code space): 40
-------------------------------------------------------------------
- 0 6 Bra 0
- 3 1 \xff
- 6 6 Ket
- 9 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 255
-No need char
-
-/[\x{100}]/8DM
-Failed: characters with values > 255 are not yet supported in classes at offset 7
-
-/\x{ffffffff}/8
-Failed: character value in \x{...} sequence is too large at offset 11
-
-/\x{100000000}/8
-Failed: character value in \x{...} sequence is too large at offset 12
-
-/^\x{100}a\x{1234}/8
- \x{100}a\x{1234}bcd
- 0: \x{100}a\x{1234}
-
-/\x80/8D
-------------------------------------------------------------------
- 0 7 Bra 0
- 3 2 \xc2\x80
- 7 7 Ket
- 10 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 194
-Need char = 128
-
-/\xff/8D
-------------------------------------------------------------------
- 0 7 Bra 0
- 3 2 \xc3\xbf
- 7 7 Ket
- 10 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 195
-Need char = 191
-
-/\x{0041}\x{2262}\x{0391}\x{002e}/D8
-------------------------------------------------------------------
- 0 12 Bra 0
- 3 7 A\xe2\x89\xa2\xce\x91.
- 12 12 Ket
- 15 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 'A'
-Need char = '.'
- \x{0041}\x{2262}\x{0391}\x{002e}
- 0: A\x{2262}\x{391}.
-
-/\x{D55c}\x{ad6d}\x{C5B4}/D8
-------------------------------------------------------------------
- 0 14 Bra 0
- 3 9 \xed\x95\x9c\xea\xb5\xad\xec\x96\xb4
- 14 14 Ket
- 17 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 237
-Need char = 180
- \x{D55c}\x{ad6d}\x{C5B4}
- 0: \x{d55c}\x{ad6d}\x{c5b4}
-
-/\x{65e5}\x{672c}\x{8a9e}/D8
-------------------------------------------------------------------
- 0 14 Bra 0
- 3 9 \xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e
- 14 14 Ket
- 17 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 230
-Need char = 158
- \x{65e5}\x{672c}\x{8a9e}
- 0: \x{65e5}\x{672c}\x{8a9e}
-
-/\x{80}/D8
-------------------------------------------------------------------
- 0 7 Bra 0
- 3 2 \xc2\x80
- 7 7 Ket
- 10 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 194
-Need char = 128
-
-/\x{084}/D8
-------------------------------------------------------------------
- 0 7 Bra 0
- 3 2 \xc2\x84
- 7 7 Ket
- 10 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 194
-Need char = 132
-
-/\x{104}/D8
-------------------------------------------------------------------
- 0 7 Bra 0
- 3 2 \xc4\x84
- 7 7 Ket
- 10 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 196
-Need char = 132
-
-/\x{861}/D8
-------------------------------------------------------------------
- 0 8 Bra 0
- 3 3 \xe0\xa1\xa1
- 8 8 Ket
- 11 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 224
-Need char = 161
-
-/\x{212ab}/D8
-------------------------------------------------------------------
- 0 9 Bra 0
- 3 4 \xf0\xa1\x8a\xab
- 9 9 Ket
- 12 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 240
-Need char = 171
-
-/.{3,5}X/D8
-------------------------------------------------------------------
- 0 14 Bra 0
- 3 Any{3}
- 7 Any{0,2}
- 11 1 X
- 14 14 Ket
- 17 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-No first char
-Need char = 'X'
- \x{212ab}\x{212ab}\x{212ab}\x{861}X
- 0: \x{212ab}\x{212ab}\x{212ab}\x{861}X
-
-
-/.{3,5}?/D8
-------------------------------------------------------------------
- 0 11 Bra 0
- 3 Any{3}
- 7 Any{0,2}?
- 11 11 Ket
- 14 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-No first char
-No need char
- \x{212ab}\x{212ab}\x{212ab}\x{861}
- 0: \x{212ab}\x{212ab}\x{212ab}
-
-/-- These tests are here rather than in testinput4 because Perl 5.6 has --/
-/-- some problems with UTF-8 support, in the area of \x{..} where the --/
-No match
-/-- value is < 255. It grumbles about invalid UTF-8 strings. --/
-No match
-
-/^[a\x{c0}]b/8
- \x{c0}b
- 0: \x{c0}b
-
-/^([a\x{c0}]*?)aa/8
- a\x{c0}aaaa/
- 0: a\x{c0}aa
- 1: a\x{c0}
-
-/^([a\x{c0}]*?)aa/8
- a\x{c0}aaaa/
- 0: a\x{c0}aa
- 1: a\x{c0}
- a\x{c0}a\x{c0}aaa/
- 0: a\x{c0}a\x{c0}aa
- 1: a\x{c0}a\x{c0}
-
-/^([a\x{c0}]*)aa/8
- a\x{c0}aaaa/
- 0: a\x{c0}aaaa
- 1: a\x{c0}aa
- a\x{c0}a\x{c0}aaa/
- 0: a\x{c0}a\x{c0}aaa
- 1: a\x{c0}a\x{c0}a
-
-/^([a\x{c0}]*)a\x{c0}/8
- a\x{c0}aaaa/
- 0: a\x{c0}
- 1:
- a\x{c0}a\x{c0}aaa/
- 0: a\x{c0}a\x{c0}
- 1: a\x{c0}
-
-/-- --/
-
-/(?<=\C)X/8
-Failed: \C not allowed in lookbehind assertion at offset 6
-
-/-- This one is here not because it's different to Perl, but because the --/
-/-- way the captured single-byte is displayed. (In Perl it becomes a --/
-No match
-/-- character, and you can't tell the difference.) --/
-No match
-
-/X(\C)(.*)/8
- X\x{1234}
- 0: X\x{1234}
- 1: \xe1
- 2: \x88\xb4
- X\nabc
- 0: X\x{0a}abc
- 1: \x{0a}
- 2: abc
-
-/ End of testinput5 /
-
diff --git a/ext/pcre/pcrelib/testdata/testoutput6 b/ext/pcre/pcrelib/testdata/testoutput6
deleted file mode 100644
index fcf084670f..0000000000
--- a/ext/pcre/pcrelib/testdata/testoutput6
+++ /dev/null
@@ -1,319 +0,0 @@
-PCRE version 3.9 02-Jan-2002
-
-/\x{100}/8DM
-Memory allocation (code space): 11
-------------------------------------------------------------------
- 0 7 Bra 0
- 3 2 \xc4\x80
- 7 7 Ket
- 10 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 196
-Need char = 128
-
-/\x{1000}/8DM
-Memory allocation (code space): 12
-------------------------------------------------------------------
- 0 8 Bra 0
- 3 3 \xe1\x80\x80
- 8 8 Ket
- 11 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 225
-Need char = 128
-
-/\x{10000}/8DM
-Memory allocation (code space): 13
-------------------------------------------------------------------
- 0 9 Bra 0
- 3 4 \xf0\x90\x80\x80
- 9 9 Ket
- 12 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 240
-Need char = 128
-
-/\x{100000}/8DM
-Memory allocation (code space): 13
-------------------------------------------------------------------
- 0 9 Bra 0
- 3 4 \xf4\x80\x80\x80
- 9 9 Ket
- 12 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 244
-Need char = 128
-
-/\x{1000000}/8DM
-Memory allocation (code space): 14
-------------------------------------------------------------------
- 0 10 Bra 0
- 3 5 \xf9\x80\x80\x80\x80
- 10 10 Ket
- 13 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 249
-Need char = 128
-
-/\x{4000000}/8DM
-Memory allocation (code space): 15
-------------------------------------------------------------------
- 0 11 Bra 0
- 3 6 \xfc\x84\x80\x80\x80\x80
- 11 11 Ket
- 14 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 252
-Need char = 128
-
-/\x{7fffFFFF}/8DM
-Memory allocation (code space): 15
-------------------------------------------------------------------
- 0 11 Bra 0
- 3 6 \xfd\xbf\xbf\xbf\xbf\xbf
- 11 11 Ket
- 14 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 253
-Need char = 191
-
-/[\x{ff}]/8DM
-Memory allocation (code space): 40
-------------------------------------------------------------------
- 0 6 Bra 0
- 3 1 \xff
- 6 6 Ket
- 9 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 255
-No need char
-
-/[\x{100}]/8DM
-Memory allocation (code space): 40
-Failed: characters with values > 255 are not yet supported in classes at offset 7
-
-/\x{ffffffff}/8
-Failed: character value in \x{...} sequence is too large at offset 11
-
-/\x{100000000}/8
-Failed: character value in \x{...} sequence is too large at offset 12
-
-/^\x{100}a\x{1234}/8
- \x{100}a\x{1234}bcd
- 0: \x{100}a\x{1234}
-
-/\x80/8D
-------------------------------------------------------------------
- 0 7 Bra 0
- 3 2 \xc2\x80
- 7 7 Ket
- 10 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 194
-Need char = 128
-
-/\xff/8D
-------------------------------------------------------------------
- 0 7 Bra 0
- 3 2 \xc3\xbf
- 7 7 Ket
- 10 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 195
-Need char = 191
-
-/\x{0041}\x{2262}\x{0391}\x{002e}/D8
-------------------------------------------------------------------
- 0 12 Bra 0
- 3 7 A\xe2\x89\xa2\xce\x91.
- 12 12 Ket
- 15 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 'A'
-Need char = '.'
- \x{0041}\x{2262}\x{0391}\x{002e}
- 0: A\x{2262}\x{391}.
-
-/\x{D55c}\x{ad6d}\x{C5B4}/D8
-------------------------------------------------------------------
- 0 14 Bra 0
- 3 9 \xed\x95\x9c\xea\xb5\xad\xec\x96\xb4
- 14 14 Ket
- 17 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 237
-Need char = 180
- \x{D55c}\x{ad6d}\x{C5B4}
- 0: \x{d55c}\x{ad6d}\x{c5b4}
-
-/\x{65e5}\x{672c}\x{8a9e}/D8
-------------------------------------------------------------------
- 0 14 Bra 0
- 3 9 \xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e
- 14 14 Ket
- 17 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 230
-Need char = 158
- \x{65e5}\x{672c}\x{8a9e}
- 0: \x{65e5}\x{672c}\x{8a9e}
-
-/\x{80}/D8
-------------------------------------------------------------------
- 0 7 Bra 0
- 3 2 \xc2\x80
- 7 7 Ket
- 10 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 194
-Need char = 128
-
-/\x{084}/D8
-------------------------------------------------------------------
- 0 7 Bra 0
- 3 2 \xc2\x84
- 7 7 Ket
- 10 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 194
-Need char = 132
-
-/\x{104}/D8
-------------------------------------------------------------------
- 0 7 Bra 0
- 3 2 \xc4\x84
- 7 7 Ket
- 10 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 196
-Need char = 132
-
-/\x{861}/D8
-------------------------------------------------------------------
- 0 8 Bra 0
- 3 3 \xe0\xa1\xa1
- 8 8 Ket
- 11 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 224
-Need char = 161
-
-/\x{212ab}/D8
-------------------------------------------------------------------
- 0 9 Bra 0
- 3 4 \xf0\xa1\x8a\xab
- 9 9 Ket
- 12 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 240
-Need char = 171
-
-/.{3,5}X/D8
-------------------------------------------------------------------
- 0 14 Bra 0
- 3 Any{3}
- 7 Any{0,2}
- 11 1 X
- 14 14 Ket
- 17 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-No first char
-Need char = 'X'
- \x{212ab}\x{212ab}\x{212ab}\x{861}X
- 0: \x{212ab}\x{212ab}\x{212ab}\x{861}X
-
-
-/.{3,5}?/D8
-------------------------------------------------------------------
- 0 11 Bra 0
- 3 Any{3}
- 7 Any{0,2}?
- 11 11 Ket
- 14 End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-No first char
-No need char
- \x{212ab}\x{212ab}\x{212ab}\x{861}
- 0: \x{212ab}\x{212ab}\x{212ab}
-
-/-- These tests are here rather than in testinput5 because Perl 5.6 has --/
-/-- some problems with UTF-8 support, in the area of \x{..} where the --/
-No match
-/-- value is < 255. It grumbles about invalid UTF-8 strings. --/
-No match
-
-/^[a\x{c0}]b/8
- \x{c0}b
- 0: \x{c0}b
-
-/^([a\x{c0}]*?)aa/8
- a\x{c0}aaaa/
- 0: a\x{c0}aa
- 1: a\x{c0}
-
-/^([a\x{c0}]*?)aa/8
- a\x{c0}aaaa/
- 0: a\x{c0}aa
- 1: a\x{c0}
- a\x{c0}a\x{c0}aaa/
- 0: a\x{c0}a\x{c0}aa
- 1: a\x{c0}a\x{c0}
-
-/^([a\x{c0}]*)aa/8
- a\x{c0}aaaa/
- 0: a\x{c0}aaaa
- 1: a\x{c0}aa
- a\x{c0}a\x{c0}aaa/
- 0: a\x{c0}a\x{c0}aaa
- 1: a\x{c0}a\x{c0}a
-
-/^([a\x{c0}]*)a\x{c0}/8
- a\x{c0}aaaa/
- 0: a\x{c0}
- 1:
- a\x{c0}a\x{c0}aaa/
- 0: a\x{c0}a\x{c0}
- 1: a\x{c0}
-
-/ End of testinput6 /
-
diff --git a/ext/pcre/php_pcre.c b/ext/pcre/php_pcre.c
deleted file mode 100644
index 3a75f26318..0000000000
--- a/ext/pcre/php_pcre.c
+++ /dev/null
@@ -1,1508 +0,0 @@
-/*
- +----------------------------------------------------------------------+
- | PHP Version 4 |
- +----------------------------------------------------------------------+
- | Copyright (c) 1997-2003 The PHP Group |
- +----------------------------------------------------------------------+
- | This source file is subject to version 2.02 of the PHP license, |
- | that is bundled with this package in the file LICENSE, and is |
- | available at through the world-wide-web at |
- | http://www.php.net/license/2_02.txt. |
- | If you did not receive a copy of the PHP license and are unable to |
- | obtain it through the world-wide-web, please send a note to |
- | license@php.net so we can mail you a copy immediately. |
- +----------------------------------------------------------------------+
- | Author: Andrei Zmievski <andrei@php.net> |
- +----------------------------------------------------------------------+
- */
-
-/* $Id$ */
-
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
-
-#include "php.h"
-#include "php_globals.h"
-#include "php_pcre.h"
-#include "ext/standard/info.h"
-#include "ext/standard/php_smart_str.h"
-
-#if HAVE_PCRE || HAVE_BUNDLED_PCRE
-
-#include "ext/standard/php_string.h"
-
-#define PREG_PATTERN_ORDER 1
-#define PREG_SET_ORDER 2
-#define PREG_OFFSET_CAPTURE (1<<8)
-
-#define PREG_SPLIT_NO_EMPTY (1<<0)
-#define PREG_SPLIT_DELIM_CAPTURE (1<<1)
-#define PREG_SPLIT_OFFSET_CAPTURE (1<<2)
-
-#define PREG_REPLACE_EVAL (1<<0)
-
-#define PREG_GREP_INVERT (1<<0)
-
-
-ZEND_DECLARE_MODULE_GLOBALS(pcre)
-
-
-static void *php_pcre_malloc(size_t size)
-{
- return pemalloc(size, 1);
-}
-
-
-static void php_pcre_free(void *ptr)
-{
- pefree(ptr, 1);
-}
-
-
-static void php_free_pcre_cache(void *data)
-{
- pcre_cache_entry *pce = (pcre_cache_entry *) data;
- pefree(pce->re, 1);
-#if HAVE_SETLOCALE
- if ((void*)pce->tables) pefree((void*)pce->tables, 1);
- pefree(pce->locale, 1);
-#endif
-}
-
-
-static void php_pcre_init_globals(zend_pcre_globals *pcre_globals TSRMLS_DC)
-{
- zend_hash_init(&pcre_globals->pcre_cache, 0, NULL, php_free_pcre_cache, 1);
-}
-
-static void php_pcre_shutdown_globals(zend_pcre_globals *pcre_globals TSRMLS_DC)
-{
- zend_hash_destroy(&pcre_globals->pcre_cache);
-}
-
-
-static /* {{{ PHP_MINFO_FUNCTION(pcre) */
-PHP_MINFO_FUNCTION(pcre)
-{
- php_info_print_table_start();
- php_info_print_table_row(2, "PCRE (Perl Compatible Regular Expressions) Support", "enabled" );
- php_info_print_table_row(2, "PCRE Library Version", pcre_version() );
- php_info_print_table_end();
-}
-/* }}} */
-
-/* {{{ PHP_MINIT_FUNCTION(pcre) */
-static PHP_MINIT_FUNCTION(pcre)
-{
- ZEND_INIT_MODULE_GLOBALS(pcre, php_pcre_init_globals, php_pcre_shutdown_globals);
-
- REGISTER_LONG_CONSTANT("PREG_PATTERN_ORDER", PREG_PATTERN_ORDER, CONST_CS | CONST_PERSISTENT);
- REGISTER_LONG_CONSTANT("PREG_SET_ORDER", PREG_SET_ORDER, CONST_CS | CONST_PERSISTENT);
- REGISTER_LONG_CONSTANT("PREG_OFFSET_CAPTURE", PREG_OFFSET_CAPTURE, CONST_CS | CONST_PERSISTENT);
- REGISTER_LONG_CONSTANT("PREG_SPLIT_NO_EMPTY", PREG_SPLIT_NO_EMPTY, CONST_CS | CONST_PERSISTENT);
- REGISTER_LONG_CONSTANT("PREG_SPLIT_DELIM_CAPTURE", PREG_SPLIT_DELIM_CAPTURE, CONST_CS | CONST_PERSISTENT);
- REGISTER_LONG_CONSTANT("PREG_SPLIT_OFFSET_CAPTURE", PREG_SPLIT_OFFSET_CAPTURE, CONST_CS | CONST_PERSISTENT);
- REGISTER_LONG_CONSTANT("PREG_GREP_INVERT", PREG_GREP_INVERT, CONST_CS | CONST_PERSISTENT);
- return SUCCESS;
-}
-/* }}} */
-
-/* {{{ PHP_MSHUTDOWN_FUNCTION(pcre) */
-static PHP_MSHUTDOWN_FUNCTION(pcre)
-{
-#ifndef ZTS
- php_pcre_shutdown_globals(&pcre_globals TSRMLS_CC);
-#endif
-
- return SUCCESS;
-}
-/* }}} */
-
-/* {{{ PHP_RINIT_FUNCTION(pcre) */
-static PHP_RINIT_FUNCTION(pcre)
-{
- pcre_malloc = php_pcre_malloc;
- pcre_free = php_pcre_free;
-
- return SUCCESS;
-}
-/* }}} */
-
-/* {{{ pcre_get_compiled_regex
- */
-PHPAPI pcre* pcre_get_compiled_regex(char *regex, pcre_extra **extra, int *preg_options) {
- pcre *re = NULL;
- int coptions = 0;
- int soptions = 0;
- const char *error;
- int erroffset;
- char delimiter;
- char start_delimiter;
- char end_delimiter;
- char *p, *pp;
- char *pattern;
- int regex_len;
- int do_study = 0;
- int poptions = 0;
- unsigned const char *tables = NULL;
-#if HAVE_SETLOCALE
- char *locale = setlocale(LC_CTYPE, NULL);
-#endif
- pcre_cache_entry *pce;
- pcre_cache_entry new_entry;
- TSRMLS_FETCH();
-
- /* Try to lookup the cached regex entry, and if successful, just pass
- back the compiled pattern, otherwise go on and compile it. */
- regex_len = strlen(regex);
- if (zend_hash_find(&PCRE_G(pcre_cache), regex, regex_len+1, (void **)&pce) == SUCCESS) {
-#if HAVE_SETLOCALE
- if (!strcmp(pce->locale, locale)) {
-#endif
- *extra = pce->extra;
- *preg_options = pce->preg_options;
- return pce->re;
-#if HAVE_SETLOCALE
- }
-#endif
- }
-
- p = regex;
-
- /* Parse through the leading whitespace, and display a warning if we
- get to the end without encountering a delimiter. */
- while (isspace((int)*p)) p++;
- if (*p == 0) {
- php_error_docref(NULL TSRMLS_CC, E_WARNING, "Empty regular expression");
- return NULL;
- }
-
- /* Get the delimiter and display a warning if it is alphanumeric
- or a backslash. */
- delimiter = *p++;
- if (isalnum((int)delimiter) || delimiter == '\\') {
- php_error_docref(NULL TSRMLS_CC,E_WARNING, "Delimiter must not be alphanumeric or backslash");
- return NULL;
- }
-
- start_delimiter = delimiter;
- if ((pp = strchr("([{< )]}> )]}>", delimiter)))
- delimiter = pp[5];
- end_delimiter = delimiter;
-
- if (start_delimiter == end_delimiter) {
- /* We need to iterate through the pattern, searching for the ending delimiter,
- but skipping the backslashed delimiters. If the ending delimiter is not
- found, display a warning. */
- pp = p;
- while (*pp != 0) {
- if (*pp == '\\' && pp[1] != 0) pp++;
- else if (*pp == delimiter)
- break;
- pp++;
- }
- if (*pp == 0) {
- php_error_docref(NULL TSRMLS_CC,E_WARNING, "No ending delimiter '%c' found", delimiter);
- return NULL;
- }
- } else {
- /* We iterate through the pattern, searching for the matching ending
- * delimiter. For each matching starting delimiter, we increment nesting
- * level, and decrement it for each matching ending delimiter. If we
- * reach the end of the pattern without matching, display a warning.
- */
- int brackets = 1; /* brackets nesting level */
- pp = p;
- while (*pp != 0) {
- if (*pp == '\\' && pp[1] != 0) pp++;
- else if (*pp == end_delimiter && --brackets <= 0)
- break;
- else if (*pp == start_delimiter)
- brackets++;
- pp++;
- }
- if (*pp == 0) {
- php_error_docref(NULL TSRMLS_CC,E_WARNING, "No ending matching delimiter '%c' found", end_delimiter);
- return NULL;
- }
- }
-
- /* Make a copy of the actual pattern. */
- pattern = estrndup(p, pp-p);
-
- /* Move on to the options */
- pp++;
-
- /* Clear out preg options */
- *preg_options = 0;
-
- /* Parse through the options, setting appropriate flags. Display
- a warning if we encounter an unknown modifier. */
- while (*pp != 0) {
- switch (*pp++) {
- /* Perl compatible options */
- case 'i': coptions |= PCRE_CASELESS; break;
- case 'm': coptions |= PCRE_MULTILINE; break;
- case 's': coptions |= PCRE_DOTALL; break;
- case 'x': coptions |= PCRE_EXTENDED; break;
-
- /* PCRE specific options */
- case 'A': coptions |= PCRE_ANCHORED; break;
- case 'D': coptions |= PCRE_DOLLAR_ENDONLY;break;
- case 'S': do_study = 1; break;
- case 'U': coptions |= PCRE_UNGREEDY; break;
- case 'X': coptions |= PCRE_EXTRA; break;
- case 'u': coptions |= PCRE_UTF8; break;
-
- /* Custom preg options */
- case 'e': poptions |= PREG_REPLACE_EVAL; break;
-
- case ' ':
- case '\n':
- break;
-
- default:
- php_error_docref(NULL TSRMLS_CC,E_WARNING, "Unknown modifier '%c'", pp[-1]);
- efree(pattern);
- return NULL;
- }
- }
-
-#if HAVE_SETLOCALE
- if (strcmp(locale, "C"))
- tables = pcre_maketables();
-#endif
-
- /* Compile pattern and display a warning if compilation failed. */
- re = pcre_compile(pattern,
- coptions,
- &error,
- &erroffset,
- tables);
-
- if (re == NULL) {
- php_error_docref(NULL TSRMLS_CC,E_WARNING, "Compilation failed: %s at offset %d", error, erroffset);
- efree(pattern);
- return NULL;
- }
-
- /* If study option was specified, study the pattern and
- store the result in extra for passing to pcre_exec. */
- if (do_study) {
- *extra = pcre_study(re, soptions, &error);
- if (error != NULL) {
- php_error_docref(NULL TSRMLS_CC,E_WARNING, "Error while studying pattern");
- }
- }
-
- *preg_options = poptions;
-
- efree(pattern);
-
- /* Store the compiled pattern and extra info in the cache. */
- new_entry.re = re;
- new_entry.extra = *extra;
- new_entry.preg_options = poptions;
-#if HAVE_SETLOCALE
- new_entry.locale = pestrdup(locale, 1);
- new_entry.tables = tables;
-#endif
- zend_hash_update(&PCRE_G(pcre_cache), regex, regex_len+1, (void *)&new_entry,
- sizeof(pcre_cache_entry), NULL);
-
- return re;
-}
-/* }}} */
-
-/* {{{ add_offset_pair */
-static inline void add_offset_pair(zval *result, char *str, int len, int offset)
-{
- zval *match_pair;
-
- ALLOC_ZVAL(match_pair);
- array_init(match_pair);
- INIT_PZVAL(match_pair);
-
- /* Add (match, offset) to the return value */
- add_next_index_stringl(match_pair, str, len, 1);
- add_next_index_long(match_pair, offset);
-
- zend_hash_next_index_insert(Z_ARRVAL_P(result), &match_pair, sizeof(zval *), NULL);
-}
-/* }}} */
-
-/* {{{ php_pcre_match
- */
-static void php_pcre_match(INTERNAL_FUNCTION_PARAMETERS, int global)
-{
- zval **regex, /* Regular expression */
- **subject, /* String to match against */
- **subpats = NULL, /* Array for subpatterns */
- **flags, /* Match control flags */
- *result_set, /* Holds a set of subpatterns after
- a global match */
- **match_sets = NULL; /* An array of sets of matches for each
- subpattern after a global match */
- pcre *re = NULL; /* Compiled regular expression */
- pcre_extra *extra = NULL; /* Holds results of studying */
- int exoptions = 0; /* Execution options */
- int preg_options = 0; /* Custom preg options */
- int count = 0; /* Count of matched subpatterns */
- int *offsets; /* Array of subpattern offsets */
- int num_subpats; /* Number of captured subpatterns */
- int size_offsets; /* Size of the offsets array */
- int start_offset; /* Where the new search starts */
- int matched; /* Has anything matched */
- int i;
- int subpats_order = 0; /* Order of subpattern matches */
- int offset_capture = 0;/* Capture match offsets: yes/no */
- int g_notempty = 0; /* If the match should not be empty */
- const char **stringlist; /* Used to hold list of subpatterns */
- char *match; /* The current match */
-
-
- /* Get function parameters and do error-checking. */
- switch(ZEND_NUM_ARGS()) {
- case 2:
- if (global || zend_get_parameters_ex(2, &regex, &subject) == FAILURE) {
- WRONG_PARAM_COUNT;
- }
- break;
-
- case 3:
- if (zend_get_parameters_ex(3, &regex, &subject, &subpats) == FAILURE) {
- WRONG_PARAM_COUNT;
- }
- if (global)
- subpats_order = PREG_PATTERN_ORDER;
- break;
-
- case 4:
- if (zend_get_parameters_ex(4, &regex, &subject, &subpats, &flags) == FAILURE) {
- WRONG_PARAM_COUNT;
- }
-
- convert_to_long_ex(flags);
- offset_capture = Z_LVAL_PP(flags) & PREG_OFFSET_CAPTURE;
- subpats_order = Z_LVAL_PP(flags) & 0xff;
- if ((global && (subpats_order < PREG_PATTERN_ORDER || subpats_order > PREG_SET_ORDER)) ||
- (!global && subpats_order != 0)) {
- php_error_docref(NULL TSRMLS_CC,E_WARNING, "Wrong value for parameter 4");
- return;
- }
- break;
-
- default:
- WRONG_PARAM_COUNT;
- }
-
- /* Make sure we're dealing with strings. */
- convert_to_string_ex(regex);
- convert_to_string_ex(subject);
-
- /* Make sure to clean up the passed array and initialize it. */
- if (subpats != NULL) {
- zval_dtor(*subpats);
- array_init(*subpats);
- }
-
- /* Compile regex or get it from cache. */
- if ((re = pcre_get_compiled_regex(Z_STRVAL_PP(regex), &extra, &preg_options)) == NULL) {
- RETURN_FALSE;
- }
-
- /* Calculate the size of the offsets array, and allocate memory for it. */
- pcre_fullinfo(re, extra, PCRE_INFO_CAPTURECOUNT, &num_subpats);
- num_subpats++;
- size_offsets = num_subpats * 3;
- offsets = (int *)emalloc(size_offsets * sizeof(int));
-
- /* Allocate match sets array and initialize the values */
- if (global && subpats_order == PREG_PATTERN_ORDER) {
- match_sets = (zval **)emalloc(num_subpats * sizeof(zval *));
- for (i=0; i<num_subpats; i++) {
- ALLOC_ZVAL(match_sets[i]);
- array_init(match_sets[i]);
- INIT_PZVAL(match_sets[i]);
- }
- }
-
- /* Start from the beginning of the string */
- start_offset = 0;
- match = NULL;
- matched = 0;
-
- do {
- /* Execute the regular expression. */
- count = pcre_exec(re, extra, Z_STRVAL_PP(subject),
- Z_STRLEN_PP(subject), start_offset,
- exoptions|g_notempty, offsets, size_offsets);
-
- /* Check for too many substrings condition. */
- if (count == 0) {
- php_error_docref(NULL TSRMLS_CC,E_NOTICE, "Matched, but too many substrings");
- count = size_offsets/3;
- }
-
- /* If something has matched */
- if (count >= 0) {
- matched++;
- match = Z_STRVAL_PP(subject) + offsets[0];
-
- /* If subpatterns array has been passed, fill it in with values. */
- if (subpats != NULL) {
- /* Try to get the list of substrings and display a warning if failed. */
- if (pcre_get_substring_list(Z_STRVAL_PP(subject),
- offsets, count, &stringlist) < 0) {
- efree(offsets);
- efree(re);
- php_error_docref(NULL TSRMLS_CC,E_WARNING, "Get subpatterns list failed");
- return;
- }
-
- if (global) { /* global pattern matching */
- if (subpats_order == PREG_PATTERN_ORDER) {
- /* For each subpattern, insert it into the appropriate array. */
- for (i = 0; i < count; i++) {
- if (offset_capture) {
- add_offset_pair(match_sets[i], (char *)stringlist[i],
- offsets[(i<<1)+1] - offsets[i<<1], offsets[i<<1]);
- } else {
- add_next_index_stringl(match_sets[i], (char *)stringlist[i],
- offsets[(i<<1)+1] - offsets[i<<1], 1);
- }
- }
- /*
- * If the number of captured subpatterns on this run is
- * less than the total possible number, pad the result
- * arrays with empty strings.
- */
- if (count < num_subpats) {
- for (; i < num_subpats; i++) {
- add_next_index_string(match_sets[i], empty_string, 1);
- }
- }
- } else {
- /* Allocate the result set array */
- ALLOC_ZVAL(result_set);
- array_init(result_set);
- INIT_PZVAL(result_set);
-
- /* Add all the subpatterns to it */
- for (i = 0; i < count; i++) {
- if (offset_capture) {
- add_offset_pair(result_set, (char *)stringlist[i],
- offsets[(i<<1)+1] - offsets[i<<1], offsets[i<<1]);
- } else {
- add_next_index_stringl(result_set, (char *)stringlist[i],
- offsets[(i<<1)+1] - offsets[i<<1], 1);
- }
- }
- /* And add it to the output array */
- zend_hash_next_index_insert(Z_ARRVAL_PP(subpats), &result_set,
- sizeof(zval *), NULL);
- }
- } else { /* single pattern matching */
- /* For each subpattern, insert it into the subpatterns array. */
- for (i = 0; i < count; i++) {
- if (offset_capture) {
- add_offset_pair(*subpats, (char *)stringlist[i],
- offsets[(i<<1)+1] - offsets[i<<1], offsets[i<<1]);
- } else {
- add_next_index_stringl(*subpats, (char *)stringlist[i],
- offsets[(i<<1)+1] - offsets[i<<1], 1);
- }
- }
- }
-
- php_pcre_free((void *) stringlist);
- }
- }
- else { /* Failed to match */
- /* If we previously set PCRE_NOTEMPTY after a null match,
- this is not necessarily the end. We need to advance
- the start offset, and continue. Fudge the offset values
- to achieve this, unless we're already at the end of the string. */
- if (g_notempty != 0 && start_offset < Z_STRLEN_PP(subject)) {
- offsets[0] = start_offset;
- offsets[1] = start_offset + 1;
- } else
- break;
- }
-
- /* If we have matched an empty string, mimic what Perl's /g options does.
- This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
- the match again at the same point. If this fails (picked up above) we
- advance to the next character. */
- g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY | PCRE_ANCHORED : 0;
-
- /* Advance to the position right after the last full match */
- start_offset = offsets[1];
- } while (global);
-
- /* Add the match sets to the output array and clean up */
- if (global && subpats_order == PREG_PATTERN_ORDER) {
- for (i=0; i<num_subpats; i++) {
- zend_hash_next_index_insert(Z_ARRVAL_PP(subpats), &match_sets[i], sizeof(zval *), NULL);
- }
- efree(match_sets);
- }
-
- efree(offsets);
-
- RETVAL_LONG(matched);
-}
-/* }}} */
-
-/* {{{ proto int preg_match(string pattern, string subject [, array subpatterns [, int flags]])
- Perform a Perl-style regular expression match */
-PHP_FUNCTION(preg_match)
-{
- php_pcre_match(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
-}
-/* }}} */
-
-/* {{{ proto int preg_match_all(string pattern, string subject, array subpatterns [, int flags])
- Perform a Perl-style global regular expression match */
-PHP_FUNCTION(preg_match_all)
-{
- php_pcre_match(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
-}
-/* }}} */
-
-/* {{{ preg_get_backref
- */
-static int preg_get_backref(char **str, int *backref)
-{
- register char in_brace = 0;
- register char *walk = *str;
-
- if (walk[1] == 0)
- return 0;
-
- if (*walk == '$' && walk[1] == '{') {
- in_brace = 1;
- walk++;
- }
- walk++;
-
- if (*walk >= '0' && *walk <= '9') {
- *backref = *walk - '0';
- walk++;
- } else
- return 0;
-
- if (*walk && *walk >= '0' && *walk <= '9') {
- *backref = *backref * 10 + *walk - '0';
- walk++;
- }
-
- if (in_brace) {
- if (*walk == 0 || *walk != '}')
- return 0;
- else
- walk++;
- }
-
- *str = walk;
- return 1;
-}
-/* }}} */
-
-/* {{{ preg_do_repl_func
- */
-static int preg_do_repl_func(zval *function, char *subject, int *offsets, int count, char **result)
-{
- zval *retval_ptr; /* Function return value */
- zval **args[1]; /* Argument to pass to function */
- zval *subpats; /* Captured subpatterns */
- int result_len; /* Return value length */
- int i;
- TSRMLS_FETCH();
-
- MAKE_STD_ZVAL(subpats);
- array_init(subpats);
- for (i = 0; i < count; i++)
- add_next_index_stringl(subpats, &subject[offsets[i<<1]], offsets[(i<<1)+1] - offsets[i<<1], 1);
- args[0] = &subpats;
-
- if (call_user_function_ex(EG(function_table), NULL, function, &retval_ptr, 1, args, 0, NULL TSRMLS_CC) == SUCCESS && retval_ptr) {
- convert_to_string_ex(&retval_ptr);
- *result = estrndup(Z_STRVAL_P(retval_ptr), Z_STRLEN_P(retval_ptr));
- result_len = Z_STRLEN_P(retval_ptr);
- zval_ptr_dtor(&retval_ptr);
- } else {
- php_error_docref(NULL TSRMLS_CC, E_WARNING, "Unable to call custom replacement function");
- result_len = offsets[1] - offsets[0];
- *result = estrndup(&subject[offsets[0]], result_len);
- }
- zval_dtor(subpats);
- FREE_ZVAL(subpats);
-
- return result_len;
-}
-/* }}} */
-
-/* {{{ preg_do_eval
- */
-static int preg_do_eval(char *eval_str, int eval_str_len, char *subject,
- int *offsets, int count, char **result TSRMLS_DC)
-{
- zval retval; /* Return value from evaluation */
- char *eval_str_end, /* End of eval string */
- *match, /* Current match for a backref */
- *esc_match, /* Quote-escaped match */
- *walk, /* Used to walk the code string */
- *segment, /* Start of segment to append while walking */
- walk_last; /* Last walked character */
- int match_len; /* Length of the match */
- int esc_match_len; /* Length of the quote-escaped match */
- int result_len; /* Length of the result of the evaluation */
- int backref; /* Current backref */
- char *compiled_string_description;
- smart_str code = {0};
-
- eval_str_end = eval_str + eval_str_len;
- walk = segment = eval_str;
- walk_last = 0;
-
- while (walk < eval_str_end) {
- /* If found a backreference.. */
- if ('\\' == *walk || '$' == *walk) {
- smart_str_appendl(&code, segment, walk - segment);
- if (walk_last == '\\') {
- code.c[code.len-1] = *walk++;
- segment = walk;
- walk_last = 0;
- continue;
- }
- segment = walk;
- if (preg_get_backref(&walk, &backref)) {
- if (backref < count) {
- /* Find the corresponding string match and substitute it
- in instead of the backref */
- match = subject + offsets[backref<<1];
- match_len = offsets[(backref<<1)+1] - offsets[backref<<1];
- if (match_len)
- esc_match = php_addslashes(match, match_len, &esc_match_len, 0 TSRMLS_CC);
- else {
- esc_match = match;
- esc_match_len = 0;
- }
- } else {
- esc_match = empty_string;
- esc_match_len = 0;
- match_len = 0;
- }
- smart_str_appendl(&code, esc_match, esc_match_len);
-
- segment = walk;
-
- /* Clean up and reassign */
- if (esc_match_len)
- efree(esc_match);
- continue;
- }
- }
- walk++;
- walk_last = walk[-1];
- }
- smart_str_appendl(&code, segment, walk - segment);
- smart_str_0(&code);
-
- compiled_string_description = zend_make_compiled_string_description("regexp code" TSRMLS_CC);
- /* Run the code */
- if (zend_eval_string(code.c, &retval, compiled_string_description TSRMLS_CC) == FAILURE) {
- efree(compiled_string_description);
- php_error_docref(NULL TSRMLS_CC,E_ERROR, "Failed evaluating code:\n%s", code);
- /* zend_error() does not return in this case */
- }
- efree(compiled_string_description);
- convert_to_string(&retval);
-
- /* Save the return value and its length */
- *result = estrndup(Z_STRVAL(retval), Z_STRLEN(retval));
- result_len = Z_STRLEN(retval);
-
- /* Clean up */
- zval_dtor(&retval);
- smart_str_free(&code);
-
- return result_len;
-}
-/* }}} */
-
-/* {{{ php_pcre_replace
- */
-PHPAPI char *php_pcre_replace(char *regex, int regex_len,
- char *subject, int subject_len,
- zval *replace_val, int is_callable_replace,
- int *result_len, int limit TSRMLS_DC)
-{
- pcre *re = NULL; /* Compiled regular expression */
- pcre_extra *extra = NULL; /* Holds results of studying */
- int exoptions = 0; /* Execution options */
- int preg_options = 0; /* Custom preg options */
- int count = 0; /* Count of matched subpatterns */
- int *offsets; /* Array of subpattern offsets */
- int size_offsets; /* Size of the offsets array */
- int new_len; /* Length of needed storage */
- int alloc_len; /* Actual allocated length */
- int eval_result_len=0; /* Length of the eval'ed or
- function-returned string */
- int match_len; /* Length of the current match */
- int backref; /* Backreference number */
- int eval; /* If the replacement string should be eval'ed */
- int start_offset; /* Where the new search starts */
- int g_notempty=0; /* If the match should not be empty */
- int replace_len=0; /* Length of replacement string */
- char *result, /* Result of replacement */
- *replace=NULL, /* Replacement string */
- *new_buf, /* Temporary buffer for re-allocation */
- *walkbuf, /* Location of current replacement in the result */
- *walk, /* Used to walk the replacement string */
- *match, /* The current match */
- *piece, /* The current piece of subject */
- *replace_end=NULL, /* End of replacement string */
- *eval_result, /* Result of eval or custom function */
- walk_last; /* Last walked character */
-
- /* Compile regex or get it from cache. */
- if ((re = pcre_get_compiled_regex(regex, &extra, &preg_options)) == NULL) {
- return NULL;
- }
-
- eval = preg_options & PREG_REPLACE_EVAL;
- if (is_callable_replace) {
- if (eval) {
- php_error_docref(NULL TSRMLS_CC, E_WARNING, "/e modifier cannot be used with replacement callback");
- return NULL;
- }
- } else {
- replace = Z_STRVAL_P(replace_val);
- replace_len = Z_STRLEN_P(replace_val);
- replace_end = replace + replace_len;
- }
-
- /* Calculate the size of the offsets array, and allocate memory for it. */
- size_offsets = (pcre_info(re, NULL, NULL) + 1) * 3;
- offsets = (int *)emalloc(size_offsets * sizeof(int));
-
- alloc_len = 2 * subject_len + 1;
- result = emalloc(alloc_len * sizeof(char));
-
- /* Initialize */
- match = NULL;
- *result_len = 0;
- start_offset = 0;
-
- while (1) {
- /* Execute the regular expression. */
- count = pcre_exec(re, extra, subject, subject_len, start_offset,
- exoptions|g_notempty, offsets, size_offsets);
-
- /* Check for too many substrings condition. */
- if (count == 0) {
- php_error_docref(NULL TSRMLS_CC,E_NOTICE, "Matched, but too many substrings");
- count = size_offsets/3;
- }
-
- piece = subject + start_offset;
-
- if (count > 0 && (limit == -1 || limit > 0)) {
- /* Set the match location in subject */
- match = subject + offsets[0];
-
- new_len = *result_len + offsets[0] - start_offset; /* part before the match */
-
- /* If evaluating, do it and add the return string's length */
- if (eval) {
- eval_result_len = preg_do_eval(replace, replace_len, subject,
- offsets, count, &eval_result TSRMLS_CC);
- new_len += eval_result_len;
- } else if (is_callable_replace) {
- /* Use custom function to get replacement string and its length. */
- eval_result_len = preg_do_repl_func(replace_val, subject, offsets,
- count, &eval_result);
- new_len += eval_result_len;
- } else { /* do regular substitution */
- walk = replace;
- walk_last = 0;
- while (walk < replace_end) {
- if ('\\' == *walk || '$' == *walk) {
- if (walk_last == '\\') {
- walk++;
- walk_last = 0;
- continue;
- }
- if (preg_get_backref(&walk, &backref)) {
- if (backref < count)
- new_len += offsets[(backref<<1)+1] - offsets[backref<<1];
- continue;
- }
- }
- new_len++;
- walk++;
- walk_last = walk[-1];
- }
- }
-
- if (new_len + 1 > alloc_len) {
- alloc_len = 1 + alloc_len + 2 * new_len;
- new_buf = emalloc(alloc_len);
- memcpy(new_buf, result, *result_len);
- efree(result);
- result = new_buf;
- }
- /* copy the part of the string before the match */
- memcpy(&result[*result_len], piece, match-piece);
- *result_len += match-piece;
-
- /* copy replacement and backrefs */
- walkbuf = result + *result_len;
-
- /* If evaluating or using custom function, copy result to the buffer
- * and clean up. */
- if (eval || is_callable_replace) {
- memcpy(walkbuf, eval_result, eval_result_len);
- *result_len += eval_result_len;
- STR_FREE(eval_result);
- } else { /* do regular backreference copying */
- walk = replace;
- walk_last = 0;
- while (walk < replace_end) {
- if ('\\' == *walk || '$' == *walk) {
- if (walk_last == '\\') {
- *(walkbuf-1) = *walk++;
- walk_last = 0;
- continue;
- }
- if (preg_get_backref(&walk, &backref)) {
- if (backref < count) {
- match_len = offsets[(backref<<1)+1] - offsets[backref<<1];
- memcpy(walkbuf, subject + offsets[backref<<1], match_len);
- walkbuf += match_len;
- }
- continue;
- }
- }
- *walkbuf++ = *walk++;
- walk_last = walk[-1];
- }
- *walkbuf = '\0';
- /* increment the result length by how much we've added to the string */
- *result_len += walkbuf - (result + *result_len);
- }
-
- if (limit != -1)
- limit--;
-
- } else { /* Failed to match */
- /* If we previously set PCRE_NOTEMPTY after a null match,
- this is not necessarily the end. We need to advance
- the start offset, and continue. Fudge the offset values
- to achieve this, unless we're already at the end of the string. */
- if (g_notempty != 0 && start_offset < subject_len) {
- offsets[0] = start_offset;
- offsets[1] = start_offset + 1;
- memcpy(&result[*result_len], piece, 1);
- (*result_len)++;
- } else {
- new_len = *result_len + subject_len - start_offset;
- if (new_len + 1 > alloc_len) {
- alloc_len = new_len + 1; /* now we know exactly how long it is */
- new_buf = emalloc(alloc_len * sizeof(char));
- memcpy(new_buf, result, *result_len);
- efree(result);
- result = new_buf;
- }
- /* stick that last bit of string on our output */
- memcpy(&result[*result_len], piece, subject_len - start_offset);
- *result_len += subject_len - start_offset;
- result[*result_len] = '\0';
- break;
- }
- }
-
- /* If we have matched an empty string, mimic what Perl's /g options does.
- This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
- the match again at the same point. If this fails (picked up above) we
- advance to the next character. */
- g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY | PCRE_ANCHORED : 0;
-
- /* Advance to the next piece. */
- start_offset = offsets[1];
- }
-
- efree(offsets);
-
- return result;
-}
-/* }}} */
-
-/* {{{ php_replace_in_subject
- */
-static char *php_replace_in_subject(zval *regex, zval *replace, zval **subject, int *result_len, int limit, zend_bool is_callable_replace TSRMLS_DC)
-{
- zval **regex_entry,
- **replace_entry = NULL,
- *replace_value,
- empty_replace;
- char *subject_value,
- *result;
- int subject_len;
-
- /* Make sure we're dealing with strings. */
- convert_to_string_ex(subject);
- ZVAL_STRINGL(&empty_replace, empty_string, 0, 0);
-
- /* If regex is an array */
- if (Z_TYPE_P(regex) == IS_ARRAY) {
- /* Duplicate subject string for repeated replacement */
- subject_value = estrndup(Z_STRVAL_PP(subject), Z_STRLEN_PP(subject));
- subject_len = Z_STRLEN_PP(subject);
- *result_len = subject_len;
-
- zend_hash_internal_pointer_reset(Z_ARRVAL_P(regex));
-
- replace_value = replace;
- if (Z_TYPE_P(replace) == IS_ARRAY && !is_callable_replace)
- zend_hash_internal_pointer_reset(Z_ARRVAL_P(replace));
-
- /* For each entry in the regex array, get the entry */
- while (zend_hash_get_current_data(Z_ARRVAL_P(regex), (void **)&regex_entry) == SUCCESS) {
- /* Make sure we're dealing with strings. */
- convert_to_string_ex(regex_entry);
-
- /* If replace is an array and not a callable construct */
- if (Z_TYPE_P(replace) == IS_ARRAY && !is_callable_replace) {
- /* Get current entry */
- if (zend_hash_get_current_data(Z_ARRVAL_P(replace), (void **)&replace_entry) == SUCCESS) {
- if (!is_callable_replace) {
- convert_to_string_ex(replace_entry);
- }
- replace_value = *replace_entry;
- zend_hash_move_forward(Z_ARRVAL_P(replace));
- } else {
- /* We've run out of replacement strings, so use an empty one */
- replace_value = &empty_replace;
- }
- }
-
- /* Do the actual replacement and put the result back into subject_value
- for further replacements. */
- if ((result = php_pcre_replace(Z_STRVAL_PP(regex_entry),
- Z_STRLEN_PP(regex_entry),
- subject_value,
- subject_len,
- replace_value,
- is_callable_replace,
- result_len,
- limit TSRMLS_CC)) != NULL) {
- efree(subject_value);
- subject_value = result;
- subject_len = *result_len;
- }
-
- zend_hash_move_forward(Z_ARRVAL_P(regex));
- }
-
- return subject_value;
- } else {
- result = php_pcre_replace(Z_STRVAL_P(regex),
- Z_STRLEN_P(regex),
- Z_STRVAL_PP(subject),
- Z_STRLEN_PP(subject),
- replace,
- is_callable_replace,
- result_len,
- limit TSRMLS_CC);
- return result;
- }
-}
-/* }}} */
-
-/* {{{ preg_replace_impl
- * Shared implementation of preg_replace() and preg_replace_callback().
- *
- * Arguments read from the PHP call: regex, replace, subject and an optional
- * limit (defaults to -1).  When is_callable_replace is non-zero, `replace`
- * must be a valid callback; otherwise it is a replacement string or an
- * array of them.
- *
- * Result placed in return_value:
- *  - FALSE when a string pattern is paired with an array replacement
- *    (non-callback mode only);
- *  - a copy of the unmodified subject when the callback is not callable;
- *  - for an array subject, an array holding the replaced entries under
- *    their original keys (entries whose replacement yielded NULL are
- *    omitted);
- *  - for any other subject, the replaced string (return_value is left
- *    untouched if the replacement yielded NULL).
- */
-static void preg_replace_impl(INTERNAL_FUNCTION_PARAMETERS, zend_bool is_callable_replace)
-{
-	zval **regex,
-	     **replace,
-	     **subject,
-	     **limit,
-	     **subject_entry;
-	char *result;
-	int result_len;
-	int limit_val = -1;
-	char *string_key;
-	ulong num_key;
-	char *callback_name = NULL;
-
-	/* Get function parameters and do error-checking. */
-	if (ZEND_NUM_ARGS() < 3 || ZEND_NUM_ARGS() > 4 ||
-		zend_get_parameters_ex(ZEND_NUM_ARGS(), &regex, &replace, &subject, &limit) == FAILURE) {
-		WRONG_PARAM_COUNT;
-	}
-	if (!is_callable_replace && Z_TYPE_PP(replace) == IS_ARRAY && Z_TYPE_PP(regex) != IS_ARRAY) {
-		php_error_docref(NULL TSRMLS_CC, E_WARNING, "Parameter mismatch, pattern is a string while replacement in an array.");
-		RETURN_FALSE;
-	}
-
-	SEPARATE_ZVAL(replace);
-	if (Z_TYPE_PP(replace) != IS_ARRAY)
-		convert_to_string_ex(replace);
-	if (is_callable_replace) {
-		if (!zend_is_callable(*replace, 0, &callback_name)) {
-			php_error_docref(NULL TSRMLS_CC, E_WARNING, "requires argument 2, '%s', to be a valid callback", callback_name);
-			efree(callback_name);
-			/* Hand the subject back unchanged.  The struct copy also copies
-			   the subject's refcount and is_ref, so reset them afterwards:
-			   return_value must start out as a fresh, unshared zval. */
-			*return_value = **subject;
-			zval_copy_ctor(return_value);
-			INIT_PZVAL(return_value);
-			return;
-		}
-		efree(callback_name);
-	}
-
-	SEPARATE_ZVAL(regex);
-	SEPARATE_ZVAL(subject);
-
-	if (ZEND_NUM_ARGS() > 3) {
-		convert_to_long_ex(limit);
-		limit_val = Z_LVAL_PP(limit);
-	}
-
-	/* An array of patterns is handed on as-is; anything else is a string. */
-	if (Z_TYPE_PP(regex) != IS_ARRAY)
-		convert_to_string_ex(regex);
-
-	/* if subject is an array */
-	if (Z_TYPE_PP(subject) == IS_ARRAY) {
-		array_init(return_value);
-		zend_hash_internal_pointer_reset(Z_ARRVAL_PP(subject));
-
-		/* For each subject entry, convert it to string, then perform replacement
-		   and add the result to the return_value array. */
-		while (zend_hash_get_current_data(Z_ARRVAL_PP(subject), (void **)&subject_entry) == SUCCESS) {
-			if ((result = php_replace_in_subject(*regex, *replace, subject_entry, &result_len, limit_val, is_callable_replace TSRMLS_CC)) != NULL) {
-				/* Add to return array under the entry's original key; the
-				   result buffer is handed over without duplication. */
-				switch(zend_hash_get_current_key(Z_ARRVAL_PP(subject), &string_key, &num_key, 0))
-				{
-					case HASH_KEY_IS_STRING:
-						add_assoc_stringl(return_value, string_key, result, result_len, 0);
-						break;
-
-					case HASH_KEY_IS_LONG:
-						add_index_stringl(return_value, num_key, result, result_len, 0);
-						break;
-				}
-			}
-
-			zend_hash_move_forward(Z_ARRVAL_PP(subject));
-		}
-	}
-	else {	/* if subject is not an array */
-		if ((result = php_replace_in_subject(*regex, *replace, subject, &result_len, limit_val, is_callable_replace TSRMLS_CC)) != NULL) {
-			RETVAL_STRINGL(result, result_len, 0);
-		}
-	}
-}
-/* }}} */
-
-/* {{{ proto string preg_replace(mixed regex, mixed replace, mixed subject [, int limit])
- Perform Perl-style regular expression replacement. */
-PHP_FUNCTION(preg_replace)
-{
- /* Delegate to the shared implementation with is_callable_replace = 0,
-    i.e. `replace` is not treated as a callback. */
- preg_replace_impl(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
-}
-/* }}} */
-
-/* {{{ proto string preg_replace_callback(mixed regex, mixed callback, mixed subject [, int limit])
- Perform Perl-style regular expression replacement using replacement callback. */
-PHP_FUNCTION(preg_replace_callback)
-{
- /* Delegate to the shared implementation with is_callable_replace = 1,
-    so the second argument is validated as a callback. */
- preg_replace_impl(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
-}
-/* }}} */
-
-/* {{{ proto array preg_split(string pattern, string subject [, int limit [, int flags]])
-   Split string into an array using a perl-style regular expression as a delimiter.
-
-   limit: maximum number of pieces to return; 0 and -1 both mean "no limit".
-   flags: bitmask of PREG_SPLIT_NO_EMPTY (skip empty pieces),
-          PREG_SPLIT_DELIM_CAPTURE (also add the captured subpatterns of
-          each delimiter match) and PREG_SPLIT_OFFSET_CAPTURE (add every
-          piece through add_offset_pair() together with its offset).
-   Returns FALSE if the pattern cannot be compiled. */
-PHP_FUNCTION(preg_split)
-{
-	zval **regex,			/* Regular expression to split by */
-	     **subject,			/* Subject string to split */
-	     **limit,			/* Number of pieces to return */
-	     **flags;
-	pcre *re = NULL;		/* Compiled regular expression */
-	pcre_extra *extra = NULL;	/* Holds results of studying */
-	int *offsets;			/* Array of subpattern offsets */
-	int size_offsets;		/* Size of the offsets array */
-	int exoptions = 0;		/* Execution options */
-	int preg_options = 0;		/* Custom preg options */
-	int argc;			/* Argument count */
-	int limit_val = -1;		/* Integer value of limit */
-	int no_empty = 0;		/* If NO_EMPTY flag is set */
-	int delim_capture = 0;		/* If delimiters should be captured */
-	int offset_capture = 0;		/* If offsets should be captured */
-	int count = 0;			/* Count of matched subpatterns */
-	int start_offset;		/* Where the new search starts */
-	int next_offset;		/* End of the last delimiter match + 1 */
-	int g_notempty = 0;		/* If the match should not be empty */
-	char *match,			/* The current match */
-	     *last_match;		/* Location of last match */
-
-	/* Get function parameters and do error checking */
-	argc = ZEND_NUM_ARGS();
-	if (argc < 2 || argc > 4 || zend_get_parameters_ex(argc, &regex, &subject, &limit, &flags) == FAILURE) {
-		WRONG_PARAM_COUNT;
-	}
-
-	if (argc > 2) {
-		convert_to_long_ex(limit);
-		limit_val = Z_LVAL_PP(limit);
-		/* A limit of 0 is treated like -1: no limit. */
-		if (limit_val == 0)
-			limit_val = -1;
-
-		if (argc > 3) {
-			convert_to_long_ex(flags);
-			no_empty = Z_LVAL_PP(flags) & PREG_SPLIT_NO_EMPTY;
-			delim_capture = Z_LVAL_PP(flags) & PREG_SPLIT_DELIM_CAPTURE;
-			offset_capture = Z_LVAL_PP(flags) & PREG_SPLIT_OFFSET_CAPTURE;
-		}
-	}
-
-	/* Make sure we're dealing with strings */
-	convert_to_string_ex(regex);
-	convert_to_string_ex(subject);
-
-	/* Compile regex or get it from cache. */
-	if ((re = pcre_get_compiled_regex(Z_STRVAL_PP(regex), &extra, &preg_options)) == NULL) {
-		RETURN_FALSE;
-	}
-
-	/* Initialize return value */
-	array_init(return_value);
-
-	/* Calculate the size of the offsets array, and allocate memory for it. */
-	size_offsets = (pcre_info(re, NULL, NULL) + 1) * 3;
-	offsets = (int *)emalloc(size_offsets * sizeof(int));
-
-	/* Start at the beginning of the string */
-	start_offset = 0;
-	next_offset = 0;
-	last_match = Z_STRVAL_PP(subject);
-	match = NULL;
-
-	/* Get next piece if no limit or limit not yet reached and something matched*/
-	while ((limit_val == -1 || limit_val > 1)) {
-		count = pcre_exec(re, extra, Z_STRVAL_PP(subject),
-						  Z_STRLEN_PP(subject), start_offset,
-						  exoptions|g_notempty, offsets, size_offsets);
-
-		/* Check for too many substrings condition. */
-		if (count == 0) {
-			php_error_docref(NULL TSRMLS_CC,E_NOTICE, "Matched, but too many substrings");
-			count = size_offsets/3;
-		}
-
-		/* If something matched */
-		if (count > 0) {
-			match = Z_STRVAL_PP(subject) + offsets[0];
-
-			/* The piece runs from the end of the previous delimiter up to
-			   the start of this one; it is empty when the two coincide. */
-			if (!no_empty || match != last_match) {
-
-				if (offset_capture) {
-					/* Add (match, offset) pair to the return value */
-					add_offset_pair(return_value, last_match, match-last_match, next_offset);
-				} else {
-					/* Add the piece to the return value */
-					add_next_index_stringl(return_value, last_match,
-										   match-last_match, 1);
-				}
-
-				/* One less left to do */
-				if (limit_val != -1)
-					limit_val--;
-			}
-
-			last_match = &Z_STRVAL_PP(subject)[offsets[1]];
-			next_offset = offsets[1];
-
-			if (delim_capture) {
-				int i, match_len;
-				/* Subpattern i occupies offsets[2*i] .. offsets[2*i+1]. */
-				for (i = 1; i < count; i++) {
-					match_len = offsets[(i<<1)+1] - offsets[i<<1];
-					/* If we have matched a delimiter */
-					if (!no_empty || match_len > 0) {
-						if (offset_capture) {
-							add_offset_pair(return_value, &Z_STRVAL_PP(subject)[offsets[i<<1]], match_len, offsets[i<<1]);
-						} else {
-							add_next_index_stringl(return_value,
-												   &Z_STRVAL_PP(subject)[offsets[i<<1]],
-												   match_len, 1);
-						}
-					}
-				}
-			}
-		} else { /* Failed to match */
-			/* If we previously set PCRE_NOTEMPTY after a null match,
-			   this is not necessarily the end. We need to advance
-			   the start offset, and continue. Fudge the offset values
-			   to achieve this, unless we're already at the end of the string. */
-			if (g_notempty != 0 && start_offset < Z_STRLEN_PP(subject)) {
-				offsets[0] = start_offset;
-				offsets[1] = start_offset + 1;
-			} else
-				break;
-		}
-
-		/* If we have matched an empty string, mimic what Perl's /g options does.
-		   This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
-		   the match again at the same point. If this fails (picked up above) we
-		   advance to the next character. */
-		g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY | PCRE_ANCHORED : 0;
-
-		/* Advance to the position right after the last full match */
-		start_offset = offsets[1];
-	}
-
-	/* The start offset may have been advanced by the empty-match handling
-	   above without a further successful match, leaving it ahead of
-	   last_match.  The final piece always begins at last_match, so derive
-	   the offset from it before deciding whether and what to add. */
-	start_offset = last_match - Z_STRVAL_PP(subject);
-
-	if (!no_empty || start_offset < Z_STRLEN_PP(subject))
-	{
-		if (offset_capture) {
-			/* Add the last (match, offset) pair to the return value */
-			add_offset_pair(return_value, last_match, Z_STRLEN_PP(subject) - start_offset, start_offset);
-		} else {
-			/* Add the last piece to the return value */
-			add_next_index_stringl(return_value, last_match, Z_STRVAL_PP(subject) + Z_STRLEN_PP(subject) - last_match, 1);
-		}
-	}
-
-
-	/* Clean up */
-	efree(offsets);
-}
-/* }}} */
-
-/* {{{ proto string preg_quote(string str [, string delim_char])
-   Quote regular expression characters plus an optional character.
-
-   Each of the characters . \ + * ? [ ^ ] $ ( ) { } = ! > < | : found in str
-   is prefixed with a backslash, as is every occurrence of the first byte
-   of delim_char when a non-empty delim_char is given.  An empty str yields
-   an empty string. */
-PHP_FUNCTION(preg_quote)
-{
-	zval **in_str_arg;	/* Input string argument */
-	zval **delim;		/* Additional delimiter argument */
-	char *in_str,		/* Input string */
-	     *in_str_end,	/* End of the input string */
-	     *out_str,		/* Output string with quoted characters */
-	     *p,		/* Iterator for input string */
-	     *q,		/* Iterator for output string */
-	     delim_char=0,	/* Delimiter character to be quoted */
-	     c;			/* Current character */
-	zend_bool quote_delim = 0; /* Whether to quote additional delim char */
-
-	/* Get the arguments and check for errors */
-	if (ZEND_NUM_ARGS() < 1 || ZEND_NUM_ARGS() > 2 ||
-		zend_get_parameters_ex(ZEND_NUM_ARGS(), &in_str_arg, &delim) == FAILURE) {
-		WRONG_PARAM_COUNT;
-	}
-
-	/* Make sure we're working with strings */
-	convert_to_string_ex(in_str_arg);
-	in_str = Z_STRVAL_PP(in_str_arg);
-	in_str_end = Z_STRVAL_PP(in_str_arg) + Z_STRLEN_PP(in_str_arg);
-
-	/* Nothing to do if we got an empty string: return right away instead
-	   of falling through to the allocation and copy loop below. */
-	if (in_str == in_str_end) {
-		RETVAL_STRINGL(empty_string, 0, 0);
-		return;
-	}
-
-	if (ZEND_NUM_ARGS() == 2) {
-		convert_to_string_ex(delim);
-		/* Only the first byte of the delimiter argument is used. */
-		if (Z_STRLEN_PP(delim) > 0) {
-			delim_char = Z_STRVAL_PP(delim)[0];
-			quote_delim = 1;
-		}
-	}
-
-	/* Allocate enough memory so that even if each character
-	   is quoted, we won't run out of room */
-	out_str = emalloc(2 * Z_STRLEN_PP(in_str_arg) + 1);
-
-	/* Go through the string and quote necessary characters */
-	for(p = in_str, q = out_str; p != in_str_end; p++) {
-		c = *p;
-		switch(c) {
-			case '.':
-			case '\\':
-			case '+':
-			case '*':
-			case '?':
-			case '[':
-			case '^':
-			case ']':
-			case '$':
-			case '(':
-			case ')':
-			case '{':
-			case '}':
-			case '=':
-			case '!':
-			case '>':
-			case '<':
-			case '|':
-			case ':':
-				*q++ = '\\';
-				*q++ = c;
-				break;
-
-			default:
-				/* Not a regex metacharacter: quote it only if it is the
-				   caller-supplied delimiter. */
-				if (quote_delim && c == delim_char)
-					*q++ = '\\';
-				*q++ = c;
-				break;
-		}
-	}
-	*q = '\0';
-
-	/* Reallocate string and return it */
-	RETVAL_STRINGL(erealloc(out_str, q - out_str + 1), q - out_str, 0);
-}
-/* }}} */
-
-/* {{{ proto array preg_grep(string regex, array input [, int flags])
- Searches array and returns entries which match regex; with the
- PREG_GREP_INVERT flag, returns the entries which do not match instead.
- Entries keep their original keys. Returns FALSE if the regex cannot be
- compiled. */
-PHP_FUNCTION(preg_grep)
-{
- zval **regex, /* Regular expression */
- **input, /* Input array */
- **flags, /* Optional flags (PREG_GREP_INVERT) */
- **entry; /* An entry in the input array */
- pcre *re = NULL; /* Compiled regular expression */
- pcre_extra *extra = NULL; /* Holds results of studying */
- int preg_options = 0; /* Custom preg options */
- int *offsets; /* Array of subpattern offsets */
- int size_offsets; /* Size of the offsets array */
- int count = 0; /* Count of matched subpatterns */
- char *string_key;
- ulong num_key;
- zend_bool invert = 0; /* Whether to return non-matching
- entries */
-
- /* Get arguments and do error checking */
-
- if (ZEND_NUM_ARGS() < 2 || ZEND_NUM_ARGS() > 3 ||
- zend_get_parameters_ex(ZEND_NUM_ARGS(), &regex, &input, &flags) == FAILURE) {
- WRONG_PARAM_COUNT;
- }
-
- /* A non-array input only raises a warning; this path returns without
-    setting return_value. */
- if (Z_TYPE_PP(input) != IS_ARRAY) {
- php_error_docref(NULL TSRMLS_CC,E_WARNING, "Second argument to preg_grep() should be an array");
- return;
- }
-
- /* NOTE(review): presumably separated so that the string conversion of
-    entries below does not touch the caller's array - confirm */
- SEPARATE_ZVAL(input);
-
- /* Make sure regex is a string */
- convert_to_string_ex(regex);
-
- if (ZEND_NUM_ARGS() > 2) {
- convert_to_long_ex(flags);
- invert = (Z_LVAL_PP(flags) & PREG_GREP_INVERT) ? 1 : 0;
- }
-
- /* Compile regex or get it from cache. */
- if ((re = pcre_get_compiled_regex(Z_STRVAL_PP(regex), &extra, &preg_options)) == NULL) {
- RETURN_FALSE;
- }
-
- /* Calculate the size of the offsets array, and allocate memory for it. */
- size_offsets = (pcre_info(re, NULL, NULL) + 1) * 3;
- offsets = (int *)emalloc(size_offsets * sizeof(int));
-
- /* Initialize return array */
- array_init(return_value);
-
- /* Go through the input array */
- zend_hash_internal_pointer_reset(Z_ARRVAL_PP(input));
- while(zend_hash_get_current_data(Z_ARRVAL_PP(input), (void **)&entry) == SUCCESS) {
-
- convert_to_string_ex(entry);
-
- /* Perform the match, starting at offset 0 with no execution options */
- count = pcre_exec(re, extra, Z_STRVAL_PP(entry),
- Z_STRLEN_PP(entry), 0,
- 0, offsets, size_offsets);
-
- /* Check for too many substrings condition. */
- if (count == 0) {
- php_error_docref(NULL TSRMLS_CC, E_NOTICE, "Matched, but too many substrings");
- count = size_offsets/3;
- }
-
- /* If the entry fits our requirements: a match in normal mode, or any
-    negative pcre_exec() result in inverted mode */
- if ((count > 0 && !invert) ||
- (count < 0 && invert)) {
- /* The same zval is stored in the result array, so account for the
-    extra reference before inserting it. */
- (*entry)->refcount++;
-
- /* Add to return array */
- switch(zend_hash_get_current_key(Z_ARRVAL_PP(input), &string_key, &num_key, 0))
- {
- case HASH_KEY_IS_STRING:
- zend_hash_update(Z_ARRVAL_P(return_value), string_key,
- strlen(string_key)+1, entry, sizeof(zval *), NULL);
- break;
-
- case HASH_KEY_IS_LONG:
- zend_hash_index_update(Z_ARRVAL_P(return_value), num_key, entry,
- sizeof(zval *), NULL);
- break;
- }
- }
-
- zend_hash_move_forward(Z_ARRVAL_PP(input));
- }
-
- /* Clean up */
- efree(offsets);
-}
-/* }}} */
-
-/* {{{ module definition structures */
-
-/* Table of the PHP functions exported by this extension, terminated by an
-   all-NULL sentinel entry. */
-function_entry pcre_functions[] = {
- /* NOTE(review): third_arg_force_ref presumably makes the third argument
-    be passed by reference - confirm against its definition */
- PHP_FE(preg_match, third_arg_force_ref)
- PHP_FE(preg_match_all, third_arg_force_ref)
- PHP_FE(preg_replace, NULL)
- PHP_FE(preg_replace_callback, NULL)
- PHP_FE(preg_split, NULL)
- PHP_FE(preg_quote, NULL)
- PHP_FE(preg_grep, NULL)
- {NULL, NULL, NULL}
-};
-
-/* Module descriptor for the "pcre" extension: ties the function table to
-   the module/request init, shutdown and info handlers. */
-zend_module_entry pcre_module_entry = {
- STANDARD_MODULE_HEADER,
- "pcre",
- pcre_functions,
- PHP_MINIT(pcre),
- PHP_MSHUTDOWN(pcre),
- PHP_RINIT(pcre),
- NULL, /* NOTE(review): presumably the request-shutdown slot, left unused - confirm against zend_module_entry */
- PHP_MINFO(pcre),
- NO_VERSION_YET,
- STANDARD_MODULE_PROPERTIES
-};
-
-/* Emit the module lookup entry point only when COMPILE_DL_PCRE is defined. */
-#ifdef COMPILE_DL_PCRE
-ZEND_GET_MODULE(pcre)
-#endif
-
-/* }}} */
-
-#endif /* HAVE_PCRE || HAVE_BUNDLED_PCRE */
-
-/*
- * Local variables:
- * tab-width: 4
- * c-basic-offset: 4
- * End:
- * vim600: sw=4 ts=4 fdm=marker
- * vim<600: sw=4 ts=4
- */
diff --git a/ext/pcre/php_pcre.h b/ext/pcre/php_pcre.h
deleted file mode 100644
index d65f3412f2..0000000000
--- a/ext/pcre/php_pcre.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- +----------------------------------------------------------------------+
- | PHP Version 4 |
- +----------------------------------------------------------------------+
- | Copyright (c) 1997-2003 The PHP Group |
- +----------------------------------------------------------------------+
- | This source file is subject to version 2.02 of the PHP license, |
- | that is bundled with this package in the file LICENSE, and is |
- | available at through the world-wide-web at |
- | http://www.php.net/license/2_02.txt. |
- | If you did not receive a copy of the PHP license and are unable to |
- | obtain it through the world-wide-web, please send a note to |
- | license@php.net so we can mail you a copy immediately. |
- +----------------------------------------------------------------------+
- | Author: Andrei Zmievski <andrei@php.net> |
- +----------------------------------------------------------------------+
- */
-
-/* $Id$ */
-
-#ifndef PHP_PCRE_H
-#define PHP_PCRE_H
-
-/* Everything except the pcre_module_ptr fallback is compiled only when a
-   PCRE library (external or bundled) is available. */
-#if HAVE_PCRE || HAVE_BUNDLED_PCRE
-
-/* Pick the bundled header when the bundled library is used. */
-#if HAVE_BUNDLED_PCRE
-#include "pcrelib/pcre.h"
-#else
-#include "pcre.h"
-#endif
-
-#if HAVE_LOCALE_H
-#include <locale.h>
-#endif
-
-/* PHP-visible functions implemented by the extension. */
-PHP_FUNCTION(preg_match);
-PHP_FUNCTION(preg_match_all);
-PHP_FUNCTION(preg_replace);
-PHP_FUNCTION(preg_replace_callback);
-PHP_FUNCTION(preg_split);
-PHP_FUNCTION(preg_quote);
-PHP_FUNCTION(preg_grep);
-
-/* C-level API exported through PHPAPI. */
-PHPAPI char *php_pcre_replace(char *regex, int regex_len, char *subject, int subject_len, zval *replace_val, int is_callable_replace, int *result_len, int limit TSRMLS_DC);
-PHPAPI pcre* pcre_get_compiled_regex(char *regex, pcre_extra **extra, int *options);
-
-extern zend_module_entry pcre_module_entry;
-#define pcre_module_ptr &pcre_module_entry
-
-/* A compiled pattern together with its study data and preg options.
-   NOTE(review): presumably the value type stored in the pcre_cache hash
-   table declared below - confirm against php_pcre.c */
-typedef struct {
- pcre *re;
- pcre_extra *extra;
- int preg_options;
-#if HAVE_SETLOCALE
- /* NOTE(review): presumably the locale and character tables the pattern
-    was compiled with - confirm against php_pcre.c */
- char *locale;
- unsigned const char *tables;
-#endif
-} pcre_cache_entry;
-
-/* Module globals: a single hash table named pcre_cache. */
-ZEND_BEGIN_MODULE_GLOBALS(pcre)
- HashTable pcre_cache;
-ZEND_END_MODULE_GLOBALS(pcre)
-
-/* Accessor for a module global: goes through TSRMG and pcre_globals_id in
-   ZTS builds, and reads the plain pcre_globals struct otherwise. */
-#ifdef ZTS
-# define PCRE_G(v) TSRMG(pcre_globals_id, zend_pcre_globals *, v)
-#else
-# define PCRE_G(v) (pcre_globals.v)
-#endif
-
-#else
-
-/* No PCRE available: there is no module entry to point at. */
-#define pcre_module_ptr NULL
-
-#endif /* HAVE_PCRE || HAVE_BUNDLED_PCRE */
-
-#define phpext_pcre_ptr pcre_module_ptr
-
-#endif /* PHP_PCRE_H */
diff --git a/ext/pcre/tests/bug20528.phpt b/ext/pcre/tests/bug20528.phpt
deleted file mode 100644
index 8182fd9bfe..0000000000
--- a/ext/pcre/tests/bug20528.phpt
+++ /dev/null
@@ -1,24 +0,0 @@
---TEST--
-Bug #20528 (preg_split() drops characters (re-opens Bug #15413))
---FILE--
-<?php
- /* Split on the zero-width word-boundary assertion; the pieces listed in
-    the EXPECT section concatenate back to the full input, i.e. no
-    characters are dropped. */
- $data = '(#11/19/2002#)';
- var_dump(preg_split('/\b/', $data));
-?>
---EXPECT--
-array(7) {
- [0]=>
- string(2) "(#"
- [1]=>
- string(2) "11"
- [2]=>
- string(1) "/"
- [3]=>
- string(2) "19"
- [4]=>
- string(1) "/"
- [5]=>
- string(4) "2002"
- [6]=>
- string(2) "#)"
-}
diff --git a/ext/pcre/tests/bug21732.phpt b/ext/pcre/tests/bug21732.phpt
deleted file mode 100644
index 3dfc41e19f..0000000000
--- a/ext/pcre/tests/bug21732.phpt
+++ /dev/null
@@ -1,29 +0,0 @@
---TEST--
-Bug #21732 (preg_replace() segfaults with invalid parameters)
---INI--
-error_reporting=0
---FILE--
-<?php
-/* Callback holder: cb() dumps the match array it receives and returns the
-   replacement text. */
-class foo {
- function cb($param) {
- var_dump($param);
- return "yes!";
- }
-}
-
-/* String pattern paired with an array replacement: per the EXPECT section
-   this must yield false. */
-var_dump(preg_replace('', array(), ''));
-/* Object-method callback: per the EXPECT section cb() receives the full
-   match followed by the three captured groups, and its return value
-   becomes the result. */
-var_dump(preg_replace_callback("/(ab)(cd)(e)/", array(new foo(), "cb"), 'abcde'));
-?>
---EXPECT--
-bool(false)
-array(4) {
- [0]=>
- string(5) "abcde"
- [1]=>
- string(2) "ab"
- [2]=>
- string(2) "cd"
- [3]=>
- string(1) "e"
-}
-string(4) "yes!"