diff options
| author | Fredrik Lundh <fredrik@pythonware.com> | 2006-05-23 18:44:25 +0000 | 
|---|---|---|
| committer | Fredrik Lundh <fredrik@pythonware.com> | 2006-05-23 18:44:25 +0000 | 
| commit | b63588c1881c350d209d389804dce4d8f5ca34d7 (patch) | |
| tree | d9a437d3c4231cc9175659337c540b23fcbe161a | |
| parent | 7298f270a7163b923e42de97c50f3c0c9c5922a5 (diff) | |
| download | cpython-git-b63588c1881c350d209d389804dce4d8f5ca34d7.tar.gz | |
needforspeed: use append+reverse for rsplit, use "bloom filters" to
speed up splitlines and strip with charsets; etc.  rsplit is now as
fast as split in all our tests (reverse takes no time at all), and
splitlines() is nearly as fast as a plain split("\n") in our tests.
and we're not done yet... ;-)
| -rw-r--r-- | Objects/unicodeobject.c | 144 | 
1 files changed, 101 insertions, 43 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 60b8cd91cd..714bdddae7 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -46,6 +46,18 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.  #include <windows.h>  #endif +#undef USE_INLINE /* XXX - set via configure? */ + +#if defined(_MSC_VER) /* this is taken from _sre.c */ +#pragma warning(disable: 4710) +/* fastest possible local call under MSVC */ +#define LOCAL(type) static __inline type __fastcall +#elif defined(USE_INLINE) +#define LOCAL(type) static inline type +#else +#define LOCAL(type) static type +#endif +  /* Limit for the Unicode object free list */  #define MAX_UNICODE_FREELIST_SIZE       1024 @@ -121,6 +133,51 @@ PyUnicode_GetMax(void)  #endif  } +/* --- Bloom Filters ----------------------------------------------------- */ + +/* stuff to implement simple "bloom filters" for Unicode characters. +   to keep things simple, we use a single bitmask, using the least 5 +   bits from each unicode characters as the bit index. */ + +/* the linebreak mask is set up by Unicode_Init below */ + +#define BLOOM_MASK unsigned long + +static BLOOM_MASK bloom_linebreak; + +#define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F)))) + +#define BLOOM_LINEBREAK(ch)\ +    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch))) + +LOCAL(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len) +{ +    /* calculate simple bloom-style bitmask for a given unicode string */ + +    long mask; +    Py_ssize_t i; + +    mask = 0; +    for (i = 0; i < len; i++) +        mask |= (1 << (ptr[i] & 0x1F)); + +    return mask; +} + +LOCAL(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen) +{ +    Py_ssize_t i; + +    for (i = 0; i < setlen; i++) +        if (set[i] == chr) +            return 1; + +    return -1; +} + +#define BLOOM_MEMBER(mask, chr, set, setlen)\ +    BLOOM(mask, chr) && unicode_member(chr, set, setlen) +  /* --- Unicode Object ----------------------------------------------------- */  static @@ -3791,8 +3848,7 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s,  /* --- Helpers ------------------------------------------------------------ */ -static -Py_ssize_t count(PyUnicodeObject *self, +static Py_ssize_t count(PyUnicodeObject *self,  		 Py_ssize_t start,  		 Py_ssize_t end,  		 PyUnicodeObject *substring) @@ -3850,8 +3906,7 @@ Py_ssize_t PyUnicode_Count(PyObject *str,      return result;  } -static -Py_ssize_t findstring(PyUnicodeObject *self, +static Py_ssize_t findstring(PyUnicodeObject *self,  	       PyUnicodeObject *substring,  	       Py_ssize_t start,  	       Py_ssize_t end, @@ -4332,17 +4387,6 @@ PyUnicodeObject *pad(PyUnicodeObject *self,          else								\              Py_DECREF(str); -#define SPLIT_INSERT(data, left, right)					\ -	str = PyUnicode_FromUnicode((data) + (left), (right) - (left));	\ -	if (!str)							\ -	    goto onError;						\ -	if (PyList_Insert(list, 0, str)) {				\ -	    Py_DECREF(str);						\ -	    goto onError;						\ -	}								\ -        else								\ -            Py_DECREF(str); -  static  PyObject *split_whitespace(PyUnicodeObject *self,  			   PyObject *list, @@ -4403,7 +4447,7 @@ PyObject *PyUnicode_Splitlines(PyObject *string,  	Py_ssize_t eol;  	/* Find a line and append it */ -	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i])) +	while (i < len && !BLOOM_LINEBREAK(data[i]))  	    i++;  	/* Skip the line break reading CRLF as one line break */ @@ -4514,15 +4558,17 @@ PyObject *rsplit_whitespace(PyUnicodeObject *self,  	if (j > i) {  	    if (maxcount-- <= 0)  		break; -	    SPLIT_INSERT(self->str, i + 1, j + 1); +	    SPLIT_APPEND(self->str, i + 1, j + 1);  	    while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))  		i--;  	    j = i;  	}      }      if (j >= 0) { -	SPLIT_INSERT(self->str, 0, j + 1); +	SPLIT_APPEND(self->str, 0, j + 1);      } +    if (PyList_Reverse(list) < 0) +        goto onError;      return list;   onError: @@ -4545,14 +4591,16 @@ PyObject *rsplit_char(PyUnicodeObject *self,  	if (self->str[i] == ch) {  	    if (maxcount-- <= 0)  		break; -	    SPLIT_INSERT(self->str, i + 1, j + 1); +	    SPLIT_APPEND(self->str, i + 1, j + 1);  	    j = i = i - 1;  	} else  	    i--;      }      if (j >= -1) { -	SPLIT_INSERT(self->str, 0, j + 1); +	SPLIT_APPEND(self->str, 0, j + 1);      } +    if (PyList_Reverse(list) < 0) +        goto onError;      return list;   onError: @@ -4576,15 +4624,17 @@ PyObject *rsplit_substring(PyUnicodeObject *self,  	if (Py_UNICODE_MATCH(self, i, substring)) {  	    if (maxcount-- <= 0)  		break; -	    SPLIT_INSERT(self->str, i + sublen, j); +	    SPLIT_APPEND(self->str, i + sublen, j);  	    j = i;  	    i -= sublen;  	} else  	    i--;      }      if (j >= 0) { -	SPLIT_INSERT(self->str, 0, j); +	SPLIT_APPEND(self->str, 0, j);      } +    if (PyList_Reverse(list) < 0) +        goto onError;      return list;   onError: @@ -4593,7 +4643,6 @@ PyObject *rsplit_substring(PyUnicodeObject *self,  }  #undef SPLIT_APPEND -#undef SPLIT_INSERT  static  PyObject *split(PyUnicodeObject *self, @@ -5703,16 +5752,6 @@ static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};  #define STRIPNAME(i) (stripformat[i]+3) -static const Py_UNICODE * -unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n) -{ -	size_t i; -	for (i = 0; i < n; ++i) -		if (s[i] == c) -			return s+i; -	return NULL; -} -  /* externally visible for str.strip(unicode) */  PyObject *  _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj) @@ -5723,27 +5762,29 @@ _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)  	Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);  	Py_ssize_t i, j; +        BLOOM_MASK sepmask = make_bloom_mask(sep, seplen); +  	i = 0;  	if (striptype != RIGHTSTRIP) { -		while (i < len && unicode_memchr(sep, s[i], seplen)) { -			i++; -		} +            while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) { +                i++; +            }  	}  	j = len;  	if (striptype != LEFTSTRIP) { -		do { -			j--; -		} while (j >= i && unicode_memchr(sep, s[j], seplen)); -		j++; +            do { +                j--; +            } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen)); +            j++;  	}  	if (i == 0 && j == len && PyUnicode_CheckExact(self)) { -		Py_INCREF(self); -		return (PyObject*)self; +            Py_INCREF(self); +            return (PyObject*)self;  	}  	else -		return PyUnicode_FromUnicode(s+i, j-i); +            return PyUnicode_FromUnicode(s+i, j-i);  } @@ -7387,6 +7428,18 @@ void _PyUnicode_Init(void)  {      int i; +    /* XXX - move this array to unicodectype.c ? */ +    Py_UNICODE linebreak[] = { +        0x000A, /* LINE FEED */ +        0x000D, /* CARRIAGE RETURN */ +        0x001C, /* FILE SEPARATOR */ +        0x001D, /* GROUP SEPARATOR */ +        0x001E, /* RECORD SEPARATOR */ +        0x0085, /* NEXT LINE */ +        0x2028, /* LINE SEPARATOR */ +        0x2029, /* PARAGRAPH SEPARATOR */ +    }; +      /* Init the implementation */      unicode_freelist = NULL;      unicode_freelist_size = 0; @@ -7396,6 +7449,11 @@ void _PyUnicode_Init(void)  	unicode_latin1[i] = NULL;      if (PyType_Ready(&PyUnicode_Type) < 0)  	Py_FatalError("Can't initialize 'unicode'"); + +    /* initialize the linebreak bloom filter */ +    bloom_linebreak = make_bloom_mask( +        linebreak, sizeof(linebreak) / sizeof(linebreak[0]) +        );  }  /* Finalize the Unicode implementation */  | 
