MFB

author: Andrei Zmievski <andrei@php.net> 2003-12-16 22:20:30 +0000
committer: Andrei Zmievski <andrei@php.net> 2003-12-16 22:20:30 +0000
commit: 9fc9e4b2cf6f71f130ad080ea8a0924ec3732b62 (patch)
tree: 50329fc541100f6beccfc10b36a748365cde7081 /ext/pcre/pcrelib/study.c
parent: e9fb9a7fa75b7e8c0381c85628741ec27f2874a9 (diff)
download: php-git-9fc9e4b2cf6f71f130ad080ea8a0924ec3732b62.tar.gz
1 files changed, 42 insertions, 8 deletions
diff --git a/ext/pcre/pcrelib/study.c b/ext/pcre/pcrelib/study.c
index 4320bd23d0..a40f721656 100644
--- a/ext/pcre/pcrelib/study.c
+++ b/ext/pcre/pcrelib/study.c
@@ -9,7 +9,7 @@ the file Tech.Notes for some information on the internals.
 
 Written by: Philip Hazel <ph10@cam.ac.uk>
 
-           Copyright (c) 1997-2002 University of Cambridge
+           Copyright (c) 1997-2003 University of Cambridge
 
 -----------------------------------------------------------------------------
 Permission is granted to anyone to use this software for any purpose on any
@@ -260,6 +260,9 @@ do
       case OP_TYPEMINQUERY:
       switch(tcode[1])
         {
+        case OP_ANY:
+        return FALSE;
+
         case OP_NOT_DIGIT:
         for (c = 0; c < 32; c++)
           start_bits[c] |= ~cd->cbits[c+cbit_digit];
@@ -297,19 +300,50 @@ do
       /* Character class where all the information is in a bit map: set the
       bits and either carry on or not, according to the repeat count. If it was
       a negative class, and we are operating with UTF-8 characters, any byte
-      with the top-bit set is a potentially valid starter because it may start
-      a character with a value > 255. (This is sub-optimal in that the
-      character may be in the range 128-255, and those characters might be
-      unwanted, but that's as far as we go for the moment.) */
+      with a value >= 0xc4 is a potentially valid starter because it starts a
+      character with a value > 255. */
 
       case OP_NCLASS:
-      if (utf8) memset(start_bits+16, 0xff, 16);
+      if (utf8)
+        {
+        start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
+        memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
+        }
       /* Fall through */
 
       case OP_CLASS:
         {
         tcode++;
-        for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
+
+        /* In UTF-8 mode, the bits in a bit map correspond to character
+        values, not to byte values. However, the bit map we are constructing is
+        for byte values. So we have to do a conversion for characters whose
+        value is > 127. In fact, there are only two possible starting bytes for
+        characters in the range 128 - 255. */
+
+        if (utf8)
+          {
+          for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
+          for (c = 128; c < 256; c++)
+            {
+            if ((tcode[c/8] && (1 << (c&7))) != 0)
+              {
+              int d = (c >> 6) | 0xc0;            /* Set bit for this starter */
+              start_bits[d/8] |= (1 << (d&7));    /* and then skip on to the */
+              c = (c & 0xc0) + 0x40 - 1;          /* next relevant character. */
+              }
+            }
+          }
+
+        /* In non-UTF-8 mode, the two bit maps are completely compatible. */
+
+        else
+          {
+          for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
+          }
+
+        /* Advance past the bit map, and act on what follows */
+
         tcode += 32;
         switch (*tcode)
           {
@@ -363,7 +397,7 @@ Returns:    pointer to a pcre_extra block, with study_data filled in and the
             NULL on error or if no optimization possible
 */
 
-pcre_extra *
+EXPORT pcre_extra *
 pcre_study(const pcre *external_re, int options, const char **errorptr)
 {
 uschar start_bits[32];
author	Andrei Zmievski <andrei@php.net>	2003-12-16 22:20:30 +0000
committer	Andrei Zmievski <andrei@php.net>	2003-12-16 22:20:30 +0000
commit	9fc9e4b2cf6f71f130ad080ea8a0924ec3732b62 (patch)
tree	50329fc541100f6beccfc10b36a748365cde7081 /ext/pcre/pcrelib/study.c
parent	e9fb9a7fa75b7e8c0381c85628741ec27f2874a9 (diff)
download	php-git-9fc9e4b2cf6f71f130ad080ea8a0924ec3732b62.tar.gz