Diffstat (limited to 'numpy/core')
-rw-r--r-- | numpy/core/defchararray.py                  |  19
-rw-r--r-- | numpy/core/include/numpy/arrayscalars.h     |   8
-rw-r--r-- | numpy/core/src/common/ucsnarrow.c           | 116
-rw-r--r-- | numpy/core/src/common/ucsnarrow.h           |   6
-rw-r--r-- | numpy/core/src/multiarray/arraytypes.c.src  |  58
-rw-r--r-- | numpy/core/src/multiarray/buffer.c          |   5
-rw-r--r-- | numpy/core/src/multiarray/common.c          |  32
-rw-r--r-- | numpy/core/src/multiarray/scalarapi.c       |  70
-rw-r--r-- | numpy/core/src/multiarray/scalartypes.c.src |  66
-rw-r--r-- | numpy/core/tests/test_multiarray.py         |  28
-rw-r--r-- | numpy/core/tests/test_scalarbuffer.py       |  35
11 files changed, 175 insertions, 268 deletions
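The hunks below all lean on one invariant: NumPy's 'U' dtype always stores UCS4, i.e. 4 bytes per code point in native byte order, which is what the itemsize *= 4 and elsize >> 2 arithmetic in the C changes encodes. A minimal Python sketch of that storage convention follows (illustration only, written against the public NumPy API; it is not part of the commit):

    import sys
    import numpy as np

    # NumPy unicode arrays always store UCS4: itemsize is 4 bytes per code point.
    a = np.array(['a\U0001f4bbz'], dtype='U3')   # includes a non-BMP character
    assert a.dtype.itemsize == 3 * 4

    # The raw buffer is native-endian UTF-32 without a BOM, which is the layout
    # the C side now produces and consumes via PyUnicode_FromKindAndData and
    # PyUnicode_AsUCS4.
    codec = 'utf-32-le' if sys.byteorder == 'little' else 'utf-32-be'
    assert a.tobytes() == 'a\U0001f4bbz'.encode(codec)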
diff --git a/numpy/core/defchararray.py b/numpy/core/defchararray.py
index 942a698a9..b22d6b85e 100644
--- a/numpy/core/defchararray.py
+++ b/numpy/core/defchararray.py
@@ -2679,25 +2679,6 @@ def array(obj, itemsize=None, copy=True, unicode=None, order=None):
             itemsize = len(obj)
         shape = len(obj) // itemsize
 
-        if unicode:
-            if sys.maxunicode == 0xffff:
-                # On a narrow Python build, the buffer for Unicode
-                # strings is UCS2, which doesn't match the buffer for
-                # NumPy Unicode types, which is ALWAYS UCS4.
-                # Therefore, we need to convert the buffer.  On Python
-                # 2.6 and later, we can use the utf_32 codec.  Earlier
-                # versions don't have that codec, so we convert to a
-                # numerical array that matches the input buffer, and
-                # then use NumPy to convert it to UCS4.  All of this
-                # should happen in native endianness.
-                obj = obj.encode('utf_32')
-            else:
-                obj = str(obj)
-        else:
-            # Let the default Unicode -> string encoding (if any) take
-            # precedence.
-            obj = bytes(obj)
-
         return chararray(shape, itemsize=itemsize, unicode=unicode,
                          buffer=obj, order=order)
diff --git a/numpy/core/include/numpy/arrayscalars.h b/numpy/core/include/numpy/arrayscalars.h
index 64450e713..42a0df76a 100644
--- a/numpy/core/include/numpy/arrayscalars.h
+++ b/numpy/core/include/numpy/arrayscalars.h
@@ -135,7 +135,13 @@ typedef struct {
 } PyScalarObject;
 
 #define PyStringScalarObject PyStringObject
-#define PyUnicodeScalarObject PyUnicodeObject
+#define PyStringScalarObject PyStringObject
+typedef struct {
+        /* note that the PyObject_HEAD macro lives right here */
+        PyUnicodeObject base;
+        Py_UCS4 *obval;
+} PyUnicodeScalarObject;
+
 
 typedef struct {
         PyObject_VAR_HEAD
diff --git a/numpy/core/src/common/ucsnarrow.c b/numpy/core/src/common/ucsnarrow.c
index 946a72257..3ef5d6878 100644
--- a/numpy/core/src/common/ucsnarrow.c
+++ b/numpy/core/src/common/ucsnarrow.c
@@ -16,76 +16,12 @@
 #include "ctors.h"
 
 /*
- * Functions only needed on narrow builds of Python for converting back and
- * forth between the NumPy Unicode data-type (always 4-bytes) and the
- * Python Unicode scalar (2-bytes on a narrow build).
- */
-
-/*
- * The ucs2 buffer must be large enough to hold 2*ucs4length characters
- * due to the use of surrogate pairs.
+ * This file originally contained functions only needed on narrow builds of
+ * Python for converting back and forth between the NumPy Unicode data-type
+ * (always 4-bytes) and the Python Unicode scalar (2-bytes on a narrow build).
  *
- * The return value is the number of ucs2 bytes used-up which
- * is ucs4length + number of surrogate pairs found.
- *
- * Values above 0xffff are converted to surrogate pairs.
+ * This "narrow" interface is now deprecated in python and unused in NumPy.
  */
-NPY_NO_EXPORT int
-PyUCS2Buffer_FromUCS4(Py_UNICODE *ucs2, npy_ucs4 const *ucs4, int ucs4length)
-{
-    int i;
-    int numucs2 = 0;
-    npy_ucs4 chr;
-    for (i = 0; i < ucs4length; i++) {
-        chr = *ucs4++;
-        if (chr > 0xffff) {
-            numucs2++;
-            chr -= 0x10000L;
-            *ucs2++ = 0xD800 + (Py_UNICODE) (chr >> 10);
-            *ucs2++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
-        }
-        else {
-            *ucs2++ = (Py_UNICODE) chr;
-        }
-        numucs2++;
-    }
-    return numucs2;
-}
-
-
-/*
- * This converts a UCS2 buffer of the given length to UCS4 buffer.
- * It converts up to ucs4len characters of UCS2
- *
- * It returns the number of characters converted which can
- * be less than ucs2len if there are surrogate pairs in ucs2.
- *
- * The return value is the actual size of the used part of the ucs4 buffer.
- */
-NPY_NO_EXPORT int
-PyUCS2Buffer_AsUCS4(Py_UNICODE const *ucs2, npy_ucs4 *ucs4, int ucs2len, int ucs4len)
-{
-    int i;
-    npy_ucs4 chr;
-    Py_UNICODE ch;
-    int numchars=0;
-
-    for (i = 0; (i < ucs2len) && (numchars < ucs4len); i++) {
-        ch = *ucs2++;
-        if (ch >= 0xd800 && ch <= 0xdfff) {
-            /* surrogate pair */
-            chr = ((npy_ucs4)(ch-0xd800)) << 10;
-            chr += *ucs2++ + 0x2400;  /* -0xdc00 + 0x10000 */
-            i++;
-        }
-        else {
-            chr = (npy_ucs4) ch;
-        }
-        *ucs4++ = chr;
-        numchars++;
-    }
-    return numchars;
-}
 
 /*
  * Returns a PyUnicodeObject initialized from a buffer containing
@@ -112,14 +48,13 @@ PyUnicode_FromUCS4(char const *src_char, Py_ssize_t size, int swap, int align)
     Py_ssize_t ucs4len = size / sizeof(npy_ucs4);
     npy_ucs4 const *src = (npy_ucs4 const *)src_char;
     npy_ucs4 *buf = NULL;
-    PyUnicodeObject *ret;
 
     /* swap and align if needed */
     if (swap || align) {
         buf = (npy_ucs4 *)malloc(size);
         if (buf == NULL) {
             PyErr_NoMemory();
-            goto fail;
+            return NULL;
         }
         memcpy(buf, src, size);
         if (swap) {
@@ -132,43 +67,8 @@ PyUnicode_FromUCS4(char const *src_char, Py_ssize_t size, int swap, int align)
     while (ucs4len > 0 && src[ucs4len - 1] == 0) {
         ucs4len--;
     }
-
-    /* produce PyUnicode object */
-#ifdef Py_UNICODE_WIDE
-    {
-        ret = (PyUnicodeObject *)PyUnicode_FromUnicode((Py_UNICODE const*)src,
-                                                       (Py_ssize_t) ucs4len);
-        if (ret == NULL) {
-            goto fail;
-        }
-    }
-#else
-    {
-        Py_ssize_t tmpsiz = 2 * sizeof(Py_UNICODE) * ucs4len;
-        Py_ssize_t ucs2len;
-        Py_UNICODE *tmp;
-
-        if ((tmp = (Py_UNICODE *)malloc(tmpsiz)) == NULL) {
-            PyErr_NoMemory();
-            goto fail;
-        }
-        ucs2len = PyUCS2Buffer_FromUCS4(tmp, src, ucs4len);
-        ret = (PyUnicodeObject *)PyUnicode_FromUnicode(tmp, (Py_ssize_t) ucs2len);
-        free(tmp);
-        if (ret == NULL) {
-            goto fail;
-        }
-    }
-#endif
-
-    if (buf) {
-        free(buf);
-    }
+    PyUnicodeObject *ret = (PyUnicodeObject *)PyUnicode_FromKindAndData(
+        PyUnicode_4BYTE_KIND, src, ucs4len);
+    free(buf);
     return ret;
-
-fail:
-    if (buf) {
-        free(buf);
-    }
-    return NULL;
 }
diff --git a/numpy/core/src/common/ucsnarrow.h b/numpy/core/src/common/ucsnarrow.h
index fe31a5e25..c811e1f2c 100644
--- a/numpy/core/src/common/ucsnarrow.h
+++ b/numpy/core/src/common/ucsnarrow.h
@@ -1,12 +1,6 @@
 #ifndef _NPY_UCSNARROW_H_
 #define _NPY_UCSNARROW_H_
 
-NPY_NO_EXPORT int
-PyUCS2Buffer_FromUCS4(Py_UNICODE *ucs2, npy_ucs4 *ucs4, int ucs4length);
-
-NPY_NO_EXPORT int
-PyUCS2Buffer_AsUCS4(Py_UNICODE *ucs2, npy_ucs4 *ucs4, int ucs2len, int ucs4len);
-
 NPY_NO_EXPORT PyUnicodeObject *
 PyUnicode_FromUCS4(char *src, Py_ssize_t size, int swap, int align);
diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src
index ce288d62e..c16e0f311 100644
--- a/numpy/core/src/multiarray/arraytypes.c.src
+++ b/numpy/core/src/multiarray/arraytypes.c.src
@@ -450,12 +450,6 @@ static int
 UNICODE_setitem(PyObject *op, void *ov, void *vap)
 {
     PyArrayObject *ap = vap;
-    PyObject *temp;
-    Py_UNICODE *ptr;
-    int datalen;
-#ifndef Py_UNICODE_WIDE
-    char *buffer;
-#endif
 
     if (PyArray_IsZeroDim(op)) {
         return convert_to_scalar_and_retry(op, ov, vap, UNICODE_setitem);
@@ -466,6 +460,8 @@ UNICODE_setitem(PyObject *op, void *ov, void *vap)
                         "setting an array element with a sequence");
         return -1;
     }
+
+    PyObject *temp;
     if (PyBytes_Check(op)) {
         /* Try to decode from ASCII */
         temp = PyUnicode_FromEncodedObject(op, "ASCII", "strict");
@@ -476,18 +472,27 @@ UNICODE_setitem(PyObject *op, void *ov, void *vap)
     else if ((temp=PyObject_Str(op)) == NULL) {
         return -1;
     }
-    ptr = PyUnicode_AS_UNICODE(temp);
-    if ((ptr == NULL) || (PyErr_Occurred())) {
+
+    /* truncate if needed */
+    Py_ssize_t max_len = PyArray_DESCR(ap)->elsize >> 2;
+    Py_ssize_t actual_len = PyUnicode_GetLength(temp);
+    if (actual_len < 0) {
         Py_DECREF(temp);
         return -1;
     }
-    datalen = PyUnicode_GET_DATA_SIZE(temp);
+    if (actual_len > max_len) {
+        Py_SETREF(temp, PyUnicode_Substring(temp, 0, max_len));
+        if (temp == NULL) {
+            return -1;
+        }
+        actual_len = max_len;
+    }
 
-#ifdef Py_UNICODE_WIDE
-    memcpy(ov, ptr, PyArray_MIN(PyArray_DESCR(ap)->elsize, datalen));
-#else
+    Py_ssize_t num_bytes = actual_len * 4;
+
+    char *buffer;
     if (!PyArray_ISALIGNED(ap)) {
-        buffer = PyArray_malloc(PyArray_DESCR(ap)->elsize);
+        buffer = PyArray_malloc(num_bytes);
         if (buffer == NULL) {
             Py_DECREF(temp);
             PyErr_NoMemory();
@@ -497,20 +502,23 @@ UNICODE_setitem(PyObject *op, void *ov, void *vap)
     else {
         buffer = ov;
     }
-    datalen = PyUCS2Buffer_AsUCS4(ptr, (npy_ucs4 *)buffer,
-                                  datalen >> 1, PyArray_DESCR(ap)->elsize >> 2);
-    datalen <<= 2;
+    if (PyUnicode_AsUCS4(temp, (Py_UCS4 *)buffer, actual_len, 0) == NULL) {
+        PyArray_free(buffer);
+        Py_DECREF(temp);
+        return -1;
+    }
+
     if (!PyArray_ISALIGNED(ap)) {
-        memcpy(ov, buffer, datalen);
+        memcpy(ov, buffer, num_bytes);
         PyArray_free(buffer);
     }
-#endif
+
     /* Fill in the rest of the space with 0 */
-    if (PyArray_DESCR(ap)->elsize > datalen) {
-        memset((char*)ov + datalen, 0, (PyArray_DESCR(ap)->elsize - datalen));
+    if (PyArray_DESCR(ap)->elsize > num_bytes) {
+        memset((char*)ov + num_bytes, 0, (PyArray_DESCR(ap)->elsize - num_bytes));
     }
     if (PyArray_ISBYTESWAPPED(ap)) {
-        byte_swap_vector(ov, PyArray_DESCR(ap)->elsize >> 2, 4);
+        byte_swap_vector(ov, actual_len, 4);
     }
     Py_DECREF(temp);
     return 0;
@@ -2650,12 +2658,6 @@ STRING_nonzero (char *ip, PyArrayObject *ap)
     return nonz;
 }
 
-#ifdef Py_UNICODE_WIDE
-#define PyArray_UCS4_ISSPACE Py_UNICODE_ISSPACE
-#else
-#define PyArray_UCS4_ISSPACE(ch) Py_STRING_ISSPACE((char)ch)
-#endif
-
 static npy_bool
 UNICODE_nonzero (npy_ucs4 *ip, PyArrayObject *ap)
 {
@@ -2681,7 +2683,7 @@ UNICODE_nonzero (npy_ucs4 *ip, PyArrayObject *ap)
         if (*ip == '\0') {
             seen_null = NPY_TRUE;
         }
-        else if (seen_null || !PyArray_UCS4_ISSPACE(*ip)) {
+        else if (seen_null || !Py_UNICODE_ISSPACE(*ip)) {
             nonz = NPY_TRUE;
             break;
         }
diff --git a/numpy/core/src/multiarray/buffer.c b/numpy/core/src/multiarray/buffer.c
index 576186362..9a1f7b230 100644
--- a/numpy/core/src/multiarray/buffer.c
+++ b/numpy/core/src/multiarray/buffer.c
@@ -832,11 +832,6 @@ gentype_getbuffer(PyObject *self, Py_buffer *view, int flags)
     descr = PyArray_DescrFromScalar(self);
     view->buf = (void *)scalar_value(self, descr);
     elsize = descr->elsize;
-#ifndef Py_UNICODE_WIDE
-    if (descr->type_num == NPY_UNICODE) {
-        elsize >>= 1;
-    }
-#endif
     view->len = elsize;
     if (PyArray_IsScalar(self, Datetime) || PyArray_IsScalar(self, Timedelta)) {
         elsize = 1;  /* descr->elsize,char is 8,'M', but we return 1,'B' */
diff --git a/numpy/core/src/multiarray/common.c b/numpy/core/src/multiarray/common.c
index 3ee2cc6c6..0150ae10e 100644
--- a/numpy/core/src/multiarray/common.c
+++ b/numpy/core/src/multiarray/common.c
@@ -130,27 +130,34 @@ PyArray_DTypeFromObjectStringDiscovery(
         PyObject *obj, PyArray_Descr *last_dtype, int string_type)
 {
     int itemsize;
-    PyObject *temp;
 
     if (string_type == NPY_STRING) {
-        if ((temp = PyObject_Str(obj)) == NULL) {
+        PyObject *temp = PyObject_Str(obj);
+        if (temp == NULL) {
             return NULL;
         }
+        /* assume that when we do the encoding elsewhere we'll use ASCII */
         itemsize = PyUnicode_GetLength(temp);
+        Py_DECREF(temp);
+        if (itemsize < 0) {
+            return NULL;
+        }
     }
     else if (string_type == NPY_UNICODE) {
-        if ((temp = PyObject_Str(obj)) == NULL) {
+        PyObject *temp = PyObject_Str(obj);
+        if (temp == NULL) {
             return NULL;
         }
-        itemsize = PyUnicode_GET_DATA_SIZE(temp);
-#ifndef Py_UNICODE_WIDE
-        itemsize <<= 1;
-#endif
+        itemsize = PyUnicode_GetLength(temp);
+        Py_DECREF(temp);
+        if (itemsize < 0) {
+            return NULL;
+        }
+        itemsize *= 4;  /* convert UCS4 codepoints to bytes */
     }
     else {
         return NULL;
     }
-    Py_DECREF(temp);
 
     if (last_dtype != NULL &&
         last_dtype->type_num == string_type &&
         last_dtype->elsize >= itemsize) {
@@ -258,10 +265,11 @@ PyArray_DTypeFromObjectHelper(PyObject *obj, int maxdims,
 
     /* Check if it's a Unicode string */
     if (PyUnicode_Check(obj)) {
-        int itemsize = PyUnicode_GET_DATA_SIZE(obj);
-#ifndef Py_UNICODE_WIDE
-        itemsize <<= 1;
-#endif
+        int itemsize = PyUnicode_GetLength(obj);
+        if (itemsize < 0) {
+            goto fail;
+        }
+        itemsize *= 4;
 
         /*
          * If it's already a big enough unicode object,
diff --git a/numpy/core/src/multiarray/scalarapi.c b/numpy/core/src/multiarray/scalarapi.c
index 5c4332364..6d3276e18 100644
--- a/numpy/core/src/multiarray/scalarapi.c
+++ b/numpy/core/src/multiarray/scalarapi.c
@@ -71,7 +71,16 @@ scalar_value(PyObject *scalar, PyArray_Descr *descr)
         case NPY_STRING:
             return (void *)PyString_AS_STRING(scalar);
         case NPY_UNICODE:
-            return (void *)PyUnicode_AS_DATA(scalar);
+            /* lazy initialization, to reduce the memory used by string scalars */
+            if (PyArrayScalar_VAL(scalar, Unicode) == NULL) {
+                Py_UCS4 *raw_data = PyUnicode_AsUCS4Copy(scalar);
+                if (raw_data == NULL) {
+                    return NULL;
+                }
+                PyArrayScalar_VAL(scalar, Unicode) = raw_data;
+                return (void *)raw_data;
+            }
+            return PyArrayScalar_VAL(scalar, Unicode);
         case NPY_VOID:
             /* Note: no & needed here, so can't use CASE */
             return PyArrayScalar_VAL(scalar, Void);
@@ -319,21 +328,10 @@ PyArray_FromScalar(PyObject *scalar, PyArray_Descr *outcode)
 
     memptr = scalar_value(scalar, typecode);
 
-#ifndef Py_UNICODE_WIDE
-    if (typecode->type_num == NPY_UNICODE) {
-        PyUCS2Buffer_AsUCS4((Py_UNICODE *)memptr,
-                            (npy_ucs4 *)PyArray_DATA(r),
-                            PyUnicode_GET_SIZE(scalar),
-                            PyArray_ITEMSIZE(r) >> 2);
-    }
-    else
-#endif
-    {
-        memcpy(PyArray_DATA(r), memptr, PyArray_ITEMSIZE(r));
-        if (PyDataType_FLAGCHK(typecode, NPY_ITEM_HASOBJECT)) {
-            /* Need to INCREF just the PyObject portion */
-            PyArray_Item_INCREF(memptr, typecode);
-        }
+    memcpy(PyArray_DATA(r), memptr, PyArray_ITEMSIZE(r));
+    if (PyDataType_FLAGCHK(typecode, NPY_ITEM_HASOBJECT)) {
+        /* Need to INCREF just the PyObject portion */
+        PyArray_Item_INCREF(memptr, typecode);
     }
 
 finish:
@@ -568,10 +566,7 @@ PyArray_DescrFromScalar(PyObject *sc)
             descr->elsize = PyString_GET_SIZE(sc);
         }
         else if (type_num == NPY_UNICODE) {
-            descr->elsize = PyUnicode_GET_DATA_SIZE(sc);
-#ifndef Py_UNICODE_WIDE
-            descr->elsize <<= 1;
-#endif
+            descr->elsize = PyUnicode_GET_LENGTH(sc) * 4;
         }
         else {
             PyArray_Descr *dtype;
@@ -654,23 +649,30 @@ PyArray_Scalar(void *data, PyArray_Descr *descr, PyObject *base)
         }
     }
     if (type_num == NPY_UNICODE) {
-        PyObject *u, *args;
-        int byteorder;
-
-#if NPY_BYTE_ORDER == NPY_LITTLE_ENDIAN
-        byteorder = -1;
-#elif NPY_BYTE_ORDER == NPY_BIG_ENDIAN
-        byteorder = +1;
-#else
-        #error Endianness undefined ?
-#endif
-        if (swap) byteorder *= -1;
-
-        u = PyUnicode_DecodeUTF32(data, itemsize, NULL, &byteorder);
+        /* we need the full string length here, else copyswap will write too
+           many bytes */
+        void *buff = PyArray_malloc(descr->elsize);
+        if (buff == NULL) {
+            return PyErr_NoMemory();
+        }
+        /* copyswap needs an array object, but only actually cares about the
+         * dtype
+         */
+        PyArrayObject_fields dummy_arr;
+        if (base == NULL) {
+            dummy_arr.descr = descr;
+            base = (PyObject *)&dummy_arr;
+        }
+        copyswap(buff, data, swap, base);
+
+        /* truncation occurs here */
+        PyObject *u = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, buff, itemsize / 4);
+        PyArray_free(buff);
         if (u == NULL) {
             return NULL;
         }
-        args = Py_BuildValue("(O)", u);
+
+        PyObject *args = Py_BuildValue("(O)", u);
         if (args == NULL) {
             Py_DECREF(u);
             return NULL;
diff --git a/numpy/core/src/multiarray/scalartypes.c.src b/numpy/core/src/multiarray/scalartypes.c.src
index 7657e39ee..eafa13ff2 100644
--- a/numpy/core/src/multiarray/scalartypes.c.src
+++ b/numpy/core/src/multiarray/scalartypes.c.src
@@ -345,6 +345,10 @@ format_@name@(@type@ val, npy_bool scientific,
  * over-ride repr and str of array-scalar strings and unicode to
  * remove NULL bytes and then call the corresponding functions
  * of string and unicode.
+ *
+ * FIXME:
+ *   is this really a good idea?
+ *   stop using Py_UNICODE here.
  */
 
 /**begin repeat
@@ -1094,11 +1098,6 @@ gentype_itemsize_get(PyObject *self)
 
     typecode = PyArray_DescrFromScalar(self);
     elsize = typecode->elsize;
-#ifndef Py_UNICODE_WIDE
-    if (typecode->type_num == NPY_UNICODE) {
-        elsize >>= 1;
-    }
-#endif
     ret = PyInt_FromLong((long) elsize);
     Py_DECREF(typecode);
     return ret;
@@ -1658,12 +1657,7 @@ gentype_reduce(PyObject *self, PyObject *NPY_UNUSED(args))
         return NULL;
     }
 
-    if (PyArray_IsScalar(self, Unicode)) {
-        /* Unicode on Python 3 does not expose the buffer interface */
-        buffer = PyUnicode_AS_DATA(self);
-        buflen = PyUnicode_GET_DATA_SIZE(self);
-    }
-    else if (PyObject_GetBuffer(self, &view, PyBUF_SIMPLE) >= 0) {
+    if (PyObject_GetBuffer(self, &view, PyBUF_SIMPLE) >= 0) {
         buffer = view.buf;
         buflen = view.len;
         /*
@@ -1718,48 +1712,13 @@ gentype_reduce(PyObject *self, PyObject *NPY_UNUSED(args))
             PyTuple_SET_ITEM(ret, 1, tup);
         }
         else {
-#ifndef Py_UNICODE_WIDE
-            /*
-             * We need to expand the buffer so that we always write
-             * UCS4 to disk for pickle of unicode scalars.
-             *
-             * This could be in a unicode_reduce function, but
-             * that would require re-factoring.
-             */
-            int alloc = 0;
-            char *tmp;
-            int newlen;
-
-            if (PyArray_IsScalar(self, Unicode)) {
-                tmp = PyArray_malloc(buflen*2);
-                if (tmp == NULL) {
-                    Py_DECREF(ret);
-                    return PyErr_NoMemory();
-                }
-                alloc = 1;
-                newlen = PyUCS2Buffer_AsUCS4((Py_UNICODE *)buffer,
-                                             (npy_ucs4 *)tmp,
-                                             buflen / 2, buflen / 2);
-                buflen = newlen*4;
-                buffer = tmp;
-            }
-#endif
             mod = PyBytes_FromStringAndSize(buffer, buflen);
             if (mod == NULL) {
                 Py_DECREF(ret);
-#ifndef Py_UNICODE_WIDE
-                ret = NULL;
-                goto fail;
-#else
                 return NULL;
-#endif
             }
             PyTuple_SET_ITEM(ret, 1, Py_BuildValue("NN", obj, mod));
-#ifndef Py_UNICODE_WIDE
-fail:
-            if (alloc) PyArray_free((char *)buffer);
-#endif
         }
     return ret;
 }
@@ -2409,6 +2368,15 @@ object_arrtype_dealloc(PyObject *v)
     Py_TYPE(v)->tp_free(v);
 }
 
+static void
+unicode_arrtype_dealloc(PyObject *v)
+{
+    /* note: may be null if it was never requested */
+    PyMem_Free(PyArrayScalar_VAL(v, Unicode));
+    /* delegate to the base class */
+    PyUnicode_Type.tp_dealloc(v);
+}
+
 /**begin repeat
  * #name = byte, short, int, long, longlong, ubyte, ushort, uint, ulong,
  *         ulonglong, half, float, double, longdouble, cfloat, cdouble,
@@ -2444,6 +2412,9 @@ static PyObject *
             PyErr_Clear();
         }
         else {
+#if defined(_@TYPE@_IS_UNICODE)
+            PyArrayScalar_VAL(from_superclass, Unicode) = NULL;
+#endif
             return from_superclass;
         }
 #endif
@@ -3667,6 +3638,9 @@ initialize_numeric_types(void)
 
     /**end repeat**/
 
+    PyUnicodeArrType_Type.tp_dealloc = unicode_arrtype_dealloc;
+    PyUnicodeArrType_Type.tp_as_buffer = &gentype_as_buffer;
+
     /**begin repeat
      * #name = bool, byte, short, ubyte, ushort, uint, ulong, ulonglong,
      *         half, float, longdouble, cfloat, clongdouble, void, object,
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index ad38911cb..13244f3ba 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -7854,6 +7854,34 @@ class TestBytestringArrayNonzero:
         assert_(a)
 
 
+class TestUnicodeEncoding:
+    """
+    Tests for encoding related bugs, such as UCS2 vs UCS4, round-tripping
+    issues, etc
+    """
+    def test_round_trip(self):
+        """ Tests that GETITEM, SETITEM, and PyArray_Scalar roundtrip """
+        # gh-15363
+        arr = np.zeros(shape=(), dtype="U1")
+        for i in range(1, sys.maxunicode + 1):
+            expected = chr(i)
+            arr[()] = expected
+            assert arr[()] == expected
+            assert arr.item() == expected
+
+    def test_assign_scalar(self):
+        # gh-3258
+        l = np.array(['aa', 'bb'])
+        l[:] = np.unicode_('cc')
+        assert_equal(l, ['cc', 'cc'])
+
+    def test_fill_scalar(self):
+        # gh-7227
+        l = np.array(['aa', 'bb'])
+        l.fill(np.unicode_('cc'))
+        assert_equal(l, ['cc', 'cc'])
+
+
 class TestUnicodeArrayNonzero:
 
     def test_empty_ustring_array_is_falsey(self):
diff --git a/numpy/core/tests/test_scalarbuffer.py b/numpy/core/tests/test_scalarbuffer.py
index b8c6dd4aa..b1c1bbbb1 100644
--- a/numpy/core/tests/test_scalarbuffer.py
+++ b/numpy/core/tests/test_scalarbuffer.py
@@ -76,27 +76,44 @@ class TestScalarPEP3118:
         assert_equal(mv_x.itemsize, mv_a.itemsize)
         assert_equal(mv_x.format, mv_a.format)
 
+    def _as_dict(self, m):
+        return dict(strides=m.strides, shape=m.shape, itemsize=m.itemsize,
+                    ndim=m.ndim, format=m.format)
+
     def test_datetime_memoryview(self):
         # gh-11656
         # Values verified with v1.13.3, shape is not () as in test_scalar_dim
-        def as_dict(m):
-            return dict(strides=m.strides, shape=m.shape, itemsize=m.itemsize,
-                        ndim=m.ndim, format=m.format)
 
         dt1 = np.datetime64('2016-01-01')
         dt2 = np.datetime64('2017-01-01')
-        expected = {'strides': (1,), 'itemsize': 1, 'ndim': 1,
-                    'shape': (8,), 'format': 'B'}
+        expected = dict(strides=(1,), itemsize=1, ndim=1, shape=(8,),
+                        format='B')
         v = memoryview(dt1)
-        res = as_dict(v)
-        assert_equal(res, expected)
+        assert self._as_dict(v) == expected
 
         v = memoryview(dt2 - dt1)
-        res = as_dict(v)
-        assert_equal(res, expected)
+        assert self._as_dict(v) == expected
 
         dt = np.dtype([('a', 'uint16'), ('b', 'M8[s]')])
         a = np.empty(1, dt)
         # Fails to create a PEP 3118 valid buffer
         assert_raises((ValueError, BufferError), memoryview, a[0])
 
+    @pytest.mark.parametrize('s', [
+        pytest.param("\x32\x32", id="ascii"),
+        pytest.param("\uFE0F\uFE0F", id="basic multilingual"),
+        pytest.param("\U0001f4bb\U0001f4bb", id="non-BMP"),
+    ])
+    def test_str_ucs4(self, s):
+        s = np.str_(s)  # only our subclass implements the buffer protocol
+
+        # all the same, characters always encode as ucs4
+        expected = dict(strides=(), itemsize=8, ndim=0, shape=(), format='2w')
+
+        v = memoryview(s)
+        assert self._as_dict(v) == expected
+
+        # integers of the paltform-appropriate endianness
+        code_points = np.frombuffer(v, dtype='i4')
+
+        assert_equal(code_points, [ord(c) for c in s])
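A usage-level note on the new tests above: with tp_as_buffer installed on PyUnicodeArrType_Type, a np.str_ scalar exposes its UCS4 storage through the buffer protocol, and non-BMP characters round-trip through fixed-width unicode arrays. The sketch below mirrors what the tests exercise and assumes a NumPy build that already contains these changes; it is an illustration, not part of the commit:

    import numpy as np

    # np.str_ (unlike plain str) exports a read-only UCS4 buffer.
    s = np.str_("\U0001f4bb\U0001f4bb")
    v = memoryview(s)
    assert v.format == '2w'     # the PEP 3118 code NumPy uses for UCS4 text
    assert v.itemsize == 8      # 2 code points * 4 bytes each
    code_points = np.frombuffer(v, dtype='i4')
    assert list(code_points) == [ord(c) for c in s]

    # Round-trip of a non-BMP character through UNICODE_setitem and
    # PyArray_Scalar, the gh-15363 case covered by test_round_trip above.
    arr = np.zeros((), dtype='U1')
    arr[()] = '\U0001f4bb'
    assert arr[()] == '\U0001f4bb'
    assert arr.item() == '\U0001f4bb'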