Diffstat (limited to 'numpy/core')
-rw-r--r--  numpy/core/defchararray.py                    19
-rw-r--r--  numpy/core/include/numpy/arrayscalars.h        8
-rw-r--r--  numpy/core/src/common/ucsnarrow.c            116
-rw-r--r--  numpy/core/src/common/ucsnarrow.h              6
-rw-r--r--  numpy/core/src/multiarray/arraytypes.c.src    58
-rw-r--r--  numpy/core/src/multiarray/buffer.c             5
-rw-r--r--  numpy/core/src/multiarray/common.c            32
-rw-r--r--  numpy/core/src/multiarray/scalarapi.c         70
-rw-r--r--  numpy/core/src/multiarray/scalartypes.c.src   66
-rw-r--r--  numpy/core/tests/test_multiarray.py           28
-rw-r--r--  numpy/core/tests/test_scalarbuffer.py         35
11 files changed, 175 insertions, 268 deletions
diff --git a/numpy/core/defchararray.py b/numpy/core/defchararray.py
index 942a698a9..b22d6b85e 100644
--- a/numpy/core/defchararray.py
+++ b/numpy/core/defchararray.py
@@ -2679,25 +2679,6 @@ def array(obj, itemsize=None, copy=True, unicode=None, order=None):
itemsize = len(obj)
shape = len(obj) // itemsize
- if unicode:
- if sys.maxunicode == 0xffff:
- # On a narrow Python build, the buffer for Unicode
- # strings is UCS2, which doesn't match the buffer for
- # NumPy Unicode types, which is ALWAYS UCS4.
- # Therefore, we need to convert the buffer. On Python
- # 2.6 and later, we can use the utf_32 codec. Earlier
- # versions don't have that codec, so we convert to a
- # numerical array that matches the input buffer, and
- # then use NumPy to convert it to UCS4. All of this
- # should happen in native endianness.
- obj = obj.encode('utf_32')
- else:
- obj = str(obj)
- else:
- # Let the default Unicode -> string encoding (if any) take
- # precedence.
- obj = bytes(obj)
-
return chararray(shape, itemsize=itemsize, unicode=unicode,
buffer=obj, order=order)
diff --git a/numpy/core/include/numpy/arrayscalars.h b/numpy/core/include/numpy/arrayscalars.h
index 64450e713..42a0df76a 100644
--- a/numpy/core/include/numpy/arrayscalars.h
+++ b/numpy/core/include/numpy/arrayscalars.h
@@ -135,7 +135,13 @@ typedef struct {
} PyScalarObject;
#define PyStringScalarObject PyStringObject
-#define PyUnicodeScalarObject PyUnicodeObject
+#define PyStringScalarObject PyStringObject
+typedef struct {
+ /* note that the PyObject_HEAD macro lives right here */
+ PyUnicodeObject base;
+ Py_UCS4 *obval;
+} PyUnicodeScalarObject;
+
typedef struct {
PyObject_VAR_HEAD
diff --git a/numpy/core/src/common/ucsnarrow.c b/numpy/core/src/common/ucsnarrow.c
index 946a72257..3ef5d6878 100644
--- a/numpy/core/src/common/ucsnarrow.c
+++ b/numpy/core/src/common/ucsnarrow.c
@@ -16,76 +16,12 @@
#include "ctors.h"
/*
- * Functions only needed on narrow builds of Python for converting back and
- * forth between the NumPy Unicode data-type (always 4-bytes) and the
- * Python Unicode scalar (2-bytes on a narrow build).
- */
-
-/*
- * The ucs2 buffer must be large enough to hold 2*ucs4length characters
- * due to the use of surrogate pairs.
+ * This file originally contained functions only needed on narrow builds of
+ * Python for converting back and forth between the NumPy Unicode data-type
+ * (always 4-bytes) and the Python Unicode scalar (2-bytes on a narrow build).
*
- * The return value is the number of ucs2 bytes used-up which
- * is ucs4length + number of surrogate pairs found.
- *
- * Values above 0xffff are converted to surrogate pairs.
+ * This "narrow" interface is now deprecated in python and unused in NumPy.
*/
-NPY_NO_EXPORT int
-PyUCS2Buffer_FromUCS4(Py_UNICODE *ucs2, npy_ucs4 const *ucs4, int ucs4length)
-{
- int i;
- int numucs2 = 0;
- npy_ucs4 chr;
- for (i = 0; i < ucs4length; i++) {
- chr = *ucs4++;
- if (chr > 0xffff) {
- numucs2++;
- chr -= 0x10000L;
- *ucs2++ = 0xD800 + (Py_UNICODE) (chr >> 10);
- *ucs2++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
- }
- else {
- *ucs2++ = (Py_UNICODE) chr;
- }
- numucs2++;
- }
- return numucs2;
-}
-
-
-/*
- * This converts a UCS2 buffer of the given length to UCS4 buffer.
- * It converts up to ucs4len characters of UCS2
- *
- * It returns the number of characters converted which can
- * be less than ucs2len if there are surrogate pairs in ucs2.
- *
- * The return value is the actual size of the used part of the ucs4 buffer.
- */
-NPY_NO_EXPORT int
-PyUCS2Buffer_AsUCS4(Py_UNICODE const *ucs2, npy_ucs4 *ucs4, int ucs2len, int ucs4len)
-{
- int i;
- npy_ucs4 chr;
- Py_UNICODE ch;
- int numchars=0;
-
- for (i = 0; (i < ucs2len) && (numchars < ucs4len); i++) {
- ch = *ucs2++;
- if (ch >= 0xd800 && ch <= 0xdfff) {
- /* surrogate pair */
- chr = ((npy_ucs4)(ch-0xd800)) << 10;
- chr += *ucs2++ + 0x2400; /* -0xdc00 + 0x10000 */
- i++;
- }
- else {
- chr = (npy_ucs4) ch;
- }
- *ucs4++ = chr;
- numchars++;
- }
- return numchars;
-}
/*
* Returns a PyUnicodeObject initialized from a buffer containing
@@ -112,14 +48,13 @@ PyUnicode_FromUCS4(char const *src_char, Py_ssize_t size, int swap, int align)
Py_ssize_t ucs4len = size / sizeof(npy_ucs4);
npy_ucs4 const *src = (npy_ucs4 const *)src_char;
npy_ucs4 *buf = NULL;
- PyUnicodeObject *ret;
/* swap and align if needed */
if (swap || align) {
buf = (npy_ucs4 *)malloc(size);
if (buf == NULL) {
PyErr_NoMemory();
- goto fail;
+ return NULL;
}
memcpy(buf, src, size);
if (swap) {
@@ -132,43 +67,8 @@ PyUnicode_FromUCS4(char const *src_char, Py_ssize_t size, int swap, int align)
while (ucs4len > 0 && src[ucs4len - 1] == 0) {
ucs4len--;
}
-
- /* produce PyUnicode object */
-#ifdef Py_UNICODE_WIDE
- {
- ret = (PyUnicodeObject *)PyUnicode_FromUnicode((Py_UNICODE const*)src,
- (Py_ssize_t) ucs4len);
- if (ret == NULL) {
- goto fail;
- }
- }
-#else
- {
- Py_ssize_t tmpsiz = 2 * sizeof(Py_UNICODE) * ucs4len;
- Py_ssize_t ucs2len;
- Py_UNICODE *tmp;
-
- if ((tmp = (Py_UNICODE *)malloc(tmpsiz)) == NULL) {
- PyErr_NoMemory();
- goto fail;
- }
- ucs2len = PyUCS2Buffer_FromUCS4(tmp, src, ucs4len);
- ret = (PyUnicodeObject *)PyUnicode_FromUnicode(tmp, (Py_ssize_t) ucs2len);
- free(tmp);
- if (ret == NULL) {
- goto fail;
- }
- }
-#endif
-
- if (buf) {
- free(buf);
- }
+ PyUnicodeObject *ret = (PyUnicodeObject *)PyUnicode_FromKindAndData(
+ PyUnicode_4BYTE_KIND, src, ucs4len);
+ free(buf);
return ret;
-
-fail:
- if (buf) {
- free(buf);
- }
- return NULL;
}
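
For readers unfamiliar with the PEP 393 API: the slimmed-down PyUnicode_FromUCS4 above now leans entirely on PyUnicode_FromKindAndData. A minimal standalone sketch of the same idea (not part of the patch; the helper name is made up):

    #include <Python.h>

    /* Build a str from a raw UCS4 buffer, e.g. one element of a NumPy 'U'
     * array.  One PyUnicode_FromKindAndData call replaces the old
     * PyUCS2Buffer_FromUCS4 + PyUnicode_FromUnicode pair and behaves the
     * same on every CPython 3.3+ build. */
    static PyObject *
    str_from_ucs4(const Py_UCS4 *src, Py_ssize_t ucs4len)
    {
        /* fixed-width elements are NUL-padded; trim the padding first */
        while (ucs4len > 0 && src[ucs4len - 1] == 0) {
            ucs4len--;
        }
        return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, src, ucs4len);
    }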
diff --git a/numpy/core/src/common/ucsnarrow.h b/numpy/core/src/common/ucsnarrow.h
index fe31a5e25..c811e1f2c 100644
--- a/numpy/core/src/common/ucsnarrow.h
+++ b/numpy/core/src/common/ucsnarrow.h
@@ -1,12 +1,6 @@
#ifndef _NPY_UCSNARROW_H_
#define _NPY_UCSNARROW_H_
-NPY_NO_EXPORT int
-PyUCS2Buffer_FromUCS4(Py_UNICODE *ucs2, npy_ucs4 *ucs4, int ucs4length);
-
-NPY_NO_EXPORT int
-PyUCS2Buffer_AsUCS4(Py_UNICODE *ucs2, npy_ucs4 *ucs4, int ucs2len, int ucs4len);
-
NPY_NO_EXPORT PyUnicodeObject *
PyUnicode_FromUCS4(char *src, Py_ssize_t size, int swap, int align);
diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src
index ce288d62e..c16e0f311 100644
--- a/numpy/core/src/multiarray/arraytypes.c.src
+++ b/numpy/core/src/multiarray/arraytypes.c.src
@@ -450,12 +450,6 @@ static int
UNICODE_setitem(PyObject *op, void *ov, void *vap)
{
PyArrayObject *ap = vap;
- PyObject *temp;
- Py_UNICODE *ptr;
- int datalen;
-#ifndef Py_UNICODE_WIDE
- char *buffer;
-#endif
if (PyArray_IsZeroDim(op)) {
return convert_to_scalar_and_retry(op, ov, vap, UNICODE_setitem);
@@ -466,6 +460,8 @@ UNICODE_setitem(PyObject *op, void *ov, void *vap)
"setting an array element with a sequence");
return -1;
}
+
+ PyObject *temp;
if (PyBytes_Check(op)) {
/* Try to decode from ASCII */
temp = PyUnicode_FromEncodedObject(op, "ASCII", "strict");
@@ -476,18 +472,27 @@ UNICODE_setitem(PyObject *op, void *ov, void *vap)
else if ((temp=PyObject_Str(op)) == NULL) {
return -1;
}
- ptr = PyUnicode_AS_UNICODE(temp);
- if ((ptr == NULL) || (PyErr_Occurred())) {
+
+ /* truncate if needed */
+ Py_ssize_t max_len = PyArray_DESCR(ap)->elsize >> 2;
+ Py_ssize_t actual_len = PyUnicode_GetLength(temp);
+ if (actual_len < 0) {
Py_DECREF(temp);
return -1;
}
- datalen = PyUnicode_GET_DATA_SIZE(temp);
+ if (actual_len > max_len) {
+ Py_SETREF(temp, PyUnicode_Substring(temp, 0, max_len));
+ if (temp == NULL) {
+ return -1;
+ }
+ actual_len = max_len;
+ }
-#ifdef Py_UNICODE_WIDE
- memcpy(ov, ptr, PyArray_MIN(PyArray_DESCR(ap)->elsize, datalen));
-#else
+ Py_ssize_t num_bytes = actual_len * 4;
+
+ char *buffer;
if (!PyArray_ISALIGNED(ap)) {
- buffer = PyArray_malloc(PyArray_DESCR(ap)->elsize);
+ buffer = PyArray_malloc(num_bytes);
if (buffer == NULL) {
Py_DECREF(temp);
PyErr_NoMemory();
@@ -497,20 +502,23 @@ UNICODE_setitem(PyObject *op, void *ov, void *vap)
else {
buffer = ov;
}
- datalen = PyUCS2Buffer_AsUCS4(ptr, (npy_ucs4 *)buffer,
- datalen >> 1, PyArray_DESCR(ap)->elsize >> 2);
- datalen <<= 2;
+ if (PyUnicode_AsUCS4(temp, (Py_UCS4 *)buffer, actual_len, 0) == NULL) {
+ PyArray_free(buffer);
+ Py_DECREF(temp);
+ return -1;
+ }
+
if (!PyArray_ISALIGNED(ap)) {
- memcpy(ov, buffer, datalen);
+ memcpy(ov, buffer, num_bytes);
PyArray_free(buffer);
}
-#endif
+
/* Fill in the rest of the space with 0 */
- if (PyArray_DESCR(ap)->elsize > datalen) {
- memset((char*)ov + datalen, 0, (PyArray_DESCR(ap)->elsize - datalen));
+ if (PyArray_DESCR(ap)->elsize > num_bytes) {
+ memset((char*)ov + num_bytes, 0, (PyArray_DESCR(ap)->elsize - num_bytes));
}
if (PyArray_ISBYTESWAPPED(ap)) {
- byte_swap_vector(ov, PyArray_DESCR(ap)->elsize >> 2, 4);
+ byte_swap_vector(ov, actual_len, 4);
}
Py_DECREF(temp);
return 0;
@@ -2650,12 +2658,6 @@ STRING_nonzero (char *ip, PyArrayObject *ap)
return nonz;
}
-#ifdef Py_UNICODE_WIDE
-#define PyArray_UCS4_ISSPACE Py_UNICODE_ISSPACE
-#else
-#define PyArray_UCS4_ISSPACE(ch) Py_STRING_ISSPACE((char)ch)
-#endif
-
static npy_bool
UNICODE_nonzero (npy_ucs4 *ip, PyArrayObject *ap)
{
@@ -2681,7 +2683,7 @@ UNICODE_nonzero (npy_ucs4 *ip, PyArrayObject *ap)
if (*ip == '\0') {
seen_null = NPY_TRUE;
}
- else if (seen_null || !PyArray_UCS4_ISSPACE(*ip)) {
+ else if (seen_null || !Py_UNICODE_ISSPACE(*ip)) {
nonz = NPY_TRUE;
break;
}
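
The new truncation logic in UNICODE_setitem boils down to the pattern sketched below (copy_truncated_ucs4 is an illustrative helper, not a NumPy function):

    #include <Python.h>

    /* Copy at most max_len code points of `str` into a caller-provided UCS4
     * buffer.  Truncation goes through PyUnicode_Substring, which counts
     * code points, so a surrogate pair can never be cut in half the way a
     * raw UCS2 copy could. */
    static Py_ssize_t
    copy_truncated_ucs4(PyObject *str, Py_UCS4 *out, Py_ssize_t max_len)
    {
        Py_ssize_t len = PyUnicode_GetLength(str);
        if (len < 0) {
            return -1;
        }
        Py_INCREF(str);
        if (len > max_len) {
            Py_SETREF(str, PyUnicode_Substring(str, 0, max_len));
            if (str == NULL) {
                return -1;
            }
            len = max_len;
        }
        /* exact fit: `out` must hold at least `len` Py_UCS4 values */
        if (PyUnicode_AsUCS4(str, out, len, 0) == NULL) {
            Py_DECREF(str);
            return -1;
        }
        Py_DECREF(str);
        return len;
    }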
diff --git a/numpy/core/src/multiarray/buffer.c b/numpy/core/src/multiarray/buffer.c
index 576186362..9a1f7b230 100644
--- a/numpy/core/src/multiarray/buffer.c
+++ b/numpy/core/src/multiarray/buffer.c
@@ -832,11 +832,6 @@ gentype_getbuffer(PyObject *self, Py_buffer *view, int flags)
descr = PyArray_DescrFromScalar(self);
view->buf = (void *)scalar_value(self, descr);
elsize = descr->elsize;
-#ifndef Py_UNICODE_WIDE
- if (descr->type_num == NPY_UNICODE) {
- elsize >>= 1;
- }
-#endif
view->len = elsize;
if (PyArray_IsScalar(self, Datetime) || PyArray_IsScalar(self, Timedelta)) {
elsize = 1; /* descr->elsize,char is 8,'M', but we return 1,'B' */
diff --git a/numpy/core/src/multiarray/common.c b/numpy/core/src/multiarray/common.c
index 3ee2cc6c6..0150ae10e 100644
--- a/numpy/core/src/multiarray/common.c
+++ b/numpy/core/src/multiarray/common.c
@@ -130,27 +130,34 @@ PyArray_DTypeFromObjectStringDiscovery(
PyObject *obj, PyArray_Descr *last_dtype, int string_type)
{
int itemsize;
- PyObject *temp;
if (string_type == NPY_STRING) {
- if ((temp = PyObject_Str(obj)) == NULL) {
+ PyObject *temp = PyObject_Str(obj);
+ if (temp == NULL) {
return NULL;
}
+ /* assume that when we do the encoding elsewhere we'll use ASCII */
itemsize = PyUnicode_GetLength(temp);
+ Py_DECREF(temp);
+ if (itemsize < 0) {
+ return NULL;
+ }
}
else if (string_type == NPY_UNICODE) {
- if ((temp = PyObject_Str(obj)) == NULL) {
+ PyObject *temp = PyObject_Str(obj);
+ if (temp == NULL) {
return NULL;
}
- itemsize = PyUnicode_GET_DATA_SIZE(temp);
-#ifndef Py_UNICODE_WIDE
- itemsize <<= 1;
-#endif
+ itemsize = PyUnicode_GetLength(temp);
+ Py_DECREF(temp);
+ if (itemsize < 0) {
+ return NULL;
+ }
+ itemsize *= 4; /* convert UCS4 codepoints to bytes */
}
else {
return NULL;
}
- Py_DECREF(temp);
if (last_dtype != NULL &&
last_dtype->type_num == string_type &&
last_dtype->elsize >= itemsize) {
@@ -258,10 +265,11 @@ PyArray_DTypeFromObjectHelper(PyObject *obj, int maxdims,
/* Check if it's a Unicode string */
if (PyUnicode_Check(obj)) {
- int itemsize = PyUnicode_GET_DATA_SIZE(obj);
-#ifndef Py_UNICODE_WIDE
- itemsize <<= 1;
-#endif
+ int itemsize = PyUnicode_GetLength(obj);
+ if (itemsize < 0) {
+ goto fail;
+ }
+ itemsize *= 4;
/*
* If it's already a big enough unicode object,
diff --git a/numpy/core/src/multiarray/scalarapi.c b/numpy/core/src/multiarray/scalarapi.c
index 5c4332364..6d3276e18 100644
--- a/numpy/core/src/multiarray/scalarapi.c
+++ b/numpy/core/src/multiarray/scalarapi.c
@@ -71,7 +71,16 @@ scalar_value(PyObject *scalar, PyArray_Descr *descr)
case NPY_STRING:
return (void *)PyString_AS_STRING(scalar);
case NPY_UNICODE:
- return (void *)PyUnicode_AS_DATA(scalar);
+ /* lazy initialization, to reduce the memory used by string scalars */
+ if (PyArrayScalar_VAL(scalar, Unicode) == NULL) {
+ Py_UCS4 *raw_data = PyUnicode_AsUCS4Copy(scalar);
+ if (raw_data == NULL) {
+ return NULL;
+ }
+ PyArrayScalar_VAL(scalar, Unicode) = raw_data;
+ return (void *)raw_data;
+ }
+ return PyArrayScalar_VAL(scalar, Unicode);
case NPY_VOID:
/* Note: no & needed here, so can't use CASE */
return PyArrayScalar_VAL(scalar, Void);
@@ -319,21 +328,10 @@ PyArray_FromScalar(PyObject *scalar, PyArray_Descr *outcode)
memptr = scalar_value(scalar, typecode);
-#ifndef Py_UNICODE_WIDE
- if (typecode->type_num == NPY_UNICODE) {
- PyUCS2Buffer_AsUCS4((Py_UNICODE *)memptr,
- (npy_ucs4 *)PyArray_DATA(r),
- PyUnicode_GET_SIZE(scalar),
- PyArray_ITEMSIZE(r) >> 2);
- }
- else
-#endif
- {
- memcpy(PyArray_DATA(r), memptr, PyArray_ITEMSIZE(r));
- if (PyDataType_FLAGCHK(typecode, NPY_ITEM_HASOBJECT)) {
- /* Need to INCREF just the PyObject portion */
- PyArray_Item_INCREF(memptr, typecode);
- }
+ memcpy(PyArray_DATA(r), memptr, PyArray_ITEMSIZE(r));
+ if (PyDataType_FLAGCHK(typecode, NPY_ITEM_HASOBJECT)) {
+ /* Need to INCREF just the PyObject portion */
+ PyArray_Item_INCREF(memptr, typecode);
}
finish:
@@ -568,10 +566,7 @@ PyArray_DescrFromScalar(PyObject *sc)
descr->elsize = PyString_GET_SIZE(sc);
}
else if (type_num == NPY_UNICODE) {
- descr->elsize = PyUnicode_GET_DATA_SIZE(sc);
-#ifndef Py_UNICODE_WIDE
- descr->elsize <<= 1;
-#endif
+ descr->elsize = PyUnicode_GET_LENGTH(sc) * 4;
}
else {
PyArray_Descr *dtype;
@@ -654,23 +649,30 @@ PyArray_Scalar(void *data, PyArray_Descr *descr, PyObject *base)
}
}
if (type_num == NPY_UNICODE) {
- PyObject *u, *args;
- int byteorder;
-
-#if NPY_BYTE_ORDER == NPY_LITTLE_ENDIAN
- byteorder = -1;
-#elif NPY_BYTE_ORDER == NPY_BIG_ENDIAN
- byteorder = +1;
-#else
- #error Endianness undefined ?
-#endif
- if (swap) byteorder *= -1;
-
- u = PyUnicode_DecodeUTF32(data, itemsize, NULL, &byteorder);
+ /* we need the full string length here, else copyswap will write too
+ many bytes */
+ void *buff = PyArray_malloc(descr->elsize);
+ if (buff == NULL) {
+ return PyErr_NoMemory();
+ }
+ /* copyswap needs an array object, but only actually cares about the
+ * dtype
+ */
+ PyArrayObject_fields dummy_arr;
+ if (base == NULL) {
+ dummy_arr.descr = descr;
+ base = (PyObject *)&dummy_arr;
+ }
+ copyswap(buff, data, swap, base);
+
+ /* truncation occurs here */
+ PyObject *u = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, buff, itemsize / 4);
+ PyArray_free(buff);
if (u == NULL) {
return NULL;
}
- args = Py_BuildValue("(O)", u);
+
+ PyObject *args = Py_BuildValue("(O)", u);
if (args == NULL) {
Py_DECREF(u);
return NULL;
diff --git a/numpy/core/src/multiarray/scalartypes.c.src b/numpy/core/src/multiarray/scalartypes.c.src
index 7657e39ee..eafa13ff2 100644
--- a/numpy/core/src/multiarray/scalartypes.c.src
+++ b/numpy/core/src/multiarray/scalartypes.c.src
@@ -345,6 +345,10 @@ format_@name@(@type@ val, npy_bool scientific,
* over-ride repr and str of array-scalar strings and unicode to
* remove NULL bytes and then call the corresponding functions
* of string and unicode.
+ *
+ * FIXME:
+ * is this really a good idea?
+ * stop using Py_UNICODE here.
*/
/**begin repeat
@@ -1094,11 +1098,6 @@ gentype_itemsize_get(PyObject *self)
typecode = PyArray_DescrFromScalar(self);
elsize = typecode->elsize;
-#ifndef Py_UNICODE_WIDE
- if (typecode->type_num == NPY_UNICODE) {
- elsize >>= 1;
- }
-#endif
ret = PyInt_FromLong((long) elsize);
Py_DECREF(typecode);
return ret;
@@ -1658,12 +1657,7 @@ gentype_reduce(PyObject *self, PyObject *NPY_UNUSED(args))
return NULL;
}
- if (PyArray_IsScalar(self, Unicode)) {
- /* Unicode on Python 3 does not expose the buffer interface */
- buffer = PyUnicode_AS_DATA(self);
- buflen = PyUnicode_GET_DATA_SIZE(self);
- }
- else if (PyObject_GetBuffer(self, &view, PyBUF_SIMPLE) >= 0) {
+ if (PyObject_GetBuffer(self, &view, PyBUF_SIMPLE) >= 0) {
buffer = view.buf;
buflen = view.len;
/*
@@ -1718,48 +1712,13 @@ gentype_reduce(PyObject *self, PyObject *NPY_UNUSED(args))
PyTuple_SET_ITEM(ret, 1, tup);
}
else {
-#ifndef Py_UNICODE_WIDE
- /*
- * We need to expand the buffer so that we always write
- * UCS4 to disk for pickle of unicode scalars.
- *
- * This could be in a unicode_reduce function, but
- * that would require re-factoring.
- */
- int alloc = 0;
- char *tmp;
- int newlen;
-
- if (PyArray_IsScalar(self, Unicode)) {
- tmp = PyArray_malloc(buflen*2);
- if (tmp == NULL) {
- Py_DECREF(ret);
- return PyErr_NoMemory();
- }
- alloc = 1;
- newlen = PyUCS2Buffer_AsUCS4((Py_UNICODE *)buffer,
- (npy_ucs4 *)tmp,
- buflen / 2, buflen / 2);
- buflen = newlen*4;
- buffer = tmp;
- }
-#endif
mod = PyBytes_FromStringAndSize(buffer, buflen);
if (mod == NULL) {
Py_DECREF(ret);
-#ifndef Py_UNICODE_WIDE
- ret = NULL;
- goto fail;
-#else
return NULL;
-#endif
}
PyTuple_SET_ITEM(ret, 1,
Py_BuildValue("NN", obj, mod));
-#ifndef Py_UNICODE_WIDE
-fail:
- if (alloc) PyArray_free((char *)buffer);
-#endif
}
return ret;
}
@@ -2409,6 +2368,15 @@ object_arrtype_dealloc(PyObject *v)
Py_TYPE(v)->tp_free(v);
}
+static void
+unicode_arrtype_dealloc(PyObject *v)
+{
+ /* note: may be null if it was never requested */
+ PyMem_Free(PyArrayScalar_VAL(v, Unicode));
+ /* delegate to the base class */
+ PyUnicode_Type.tp_dealloc(v);
+}
+
/**begin repeat
* #name = byte, short, int, long, longlong, ubyte, ushort, uint, ulong,
* ulonglong, half, float, double, longdouble, cfloat, cdouble,
@@ -2444,6 +2412,9 @@ static PyObject *
PyErr_Clear();
}
else {
+#if defined(_@TYPE@_IS_UNICODE)
+ PyArrayScalar_VAL(from_superclass, Unicode) = NULL;
+#endif
return from_superclass;
}
#endif
@@ -3667,6 +3638,9 @@ initialize_numeric_types(void)
/**end repeat**/
+ PyUnicodeArrType_Type.tp_dealloc = unicode_arrtype_dealloc;
+ PyUnicodeArrType_Type.tp_as_buffer = &gentype_as_buffer;
+
/**begin repeat
* #name = bool, byte, short, ubyte, ushort, uint, ulong, ulonglong,
* half, float, longdouble, cfloat, clongdouble, void, object,
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index ad38911cb..13244f3ba 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -7854,6 +7854,34 @@ class TestBytestringArrayNonzero:
assert_(a)
+class TestUnicodeEncoding:
+ """
+ Tests for encoding related bugs, such as UCS2 vs UCS4, round-tripping
+ issues, etc
+ """
+ def test_round_trip(self):
+ """ Tests that GETITEM, SETITEM, and PyArray_Scalar roundtrip """
+ # gh-15363
+ arr = np.zeros(shape=(), dtype="U1")
+ for i in range(1, sys.maxunicode + 1):
+ expected = chr(i)
+ arr[()] = expected
+ assert arr[()] == expected
+ assert arr.item() == expected
+
+ def test_assign_scalar(self):
+ # gh-3258
+ l = np.array(['aa', 'bb'])
+ l[:] = np.unicode_('cc')
+ assert_equal(l, ['cc', 'cc'])
+
+ def test_fill_scalar(self):
+ # gh-7227
+ l = np.array(['aa', 'bb'])
+ l.fill(np.unicode_('cc'))
+ assert_equal(l, ['cc', 'cc'])
+
+
class TestUnicodeArrayNonzero:
def test_empty_ustring_array_is_falsey(self):
diff --git a/numpy/core/tests/test_scalarbuffer.py b/numpy/core/tests/test_scalarbuffer.py
index b8c6dd4aa..b1c1bbbb1 100644
--- a/numpy/core/tests/test_scalarbuffer.py
+++ b/numpy/core/tests/test_scalarbuffer.py
@@ -76,27 +76,44 @@ class TestScalarPEP3118:
assert_equal(mv_x.itemsize, mv_a.itemsize)
assert_equal(mv_x.format, mv_a.format)
+ def _as_dict(self, m):
+ return dict(strides=m.strides, shape=m.shape, itemsize=m.itemsize,
+ ndim=m.ndim, format=m.format)
+
def test_datetime_memoryview(self):
# gh-11656
# Values verified with v1.13.3, shape is not () as in test_scalar_dim
- def as_dict(m):
- return dict(strides=m.strides, shape=m.shape, itemsize=m.itemsize,
- ndim=m.ndim, format=m.format)
dt1 = np.datetime64('2016-01-01')
dt2 = np.datetime64('2017-01-01')
- expected = {'strides': (1,), 'itemsize': 1, 'ndim': 1,
- 'shape': (8,), 'format': 'B'}
+ expected = dict(strides=(1,), itemsize=1, ndim=1, shape=(8,),
+ format='B')
v = memoryview(dt1)
- res = as_dict(v)
- assert_equal(res, expected)
+ assert self._as_dict(v) == expected
v = memoryview(dt2 - dt1)
- res = as_dict(v)
- assert_equal(res, expected)
+ assert self._as_dict(v) == expected
dt = np.dtype([('a', 'uint16'), ('b', 'M8[s]')])
a = np.empty(1, dt)
# Fails to create a PEP 3118 valid buffer
assert_raises((ValueError, BufferError), memoryview, a[0])
+ @pytest.mark.parametrize('s', [
+ pytest.param("\x32\x32", id="ascii"),
+ pytest.param("\uFE0F\uFE0F", id="basic multilingual"),
+ pytest.param("\U0001f4bb\U0001f4bb", id="non-BMP"),
+ ])
+ def test_str_ucs4(self, s):
+ s = np.str_(s) # only our subclass implements the buffer protocol
+
+ # all the same, characters always encode as ucs4
+ expected = dict(strides=(), itemsize=8, ndim=0, shape=(), format='2w')
+
+ v = memoryview(s)
+ assert self._as_dict(v) == expected
+
+ # integers of the platform-appropriate endianness
+ code_points = np.frombuffer(v, dtype='i4')
+
+ assert_equal(code_points, [ord(c) for c in s])
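
As a closing illustration, a small self-contained C program (a sketch only: it assumes an embedded CPython 3 build compiled with the usual python3-config flags, nothing NumPy-specific) exercising the same UCS4 round trip that test_round_trip checks from Python:

    #include <Python.h>
    #include <stdio.h>

    int
    main(void)
    {
        Py_Initialize();

        /* one BMP and one non-BMP character, as UCS4 code points */
        Py_UCS4 data[2] = {0xFE0F, 0x1F4BB};

        PyObject *u = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, data, 2);
        if (u == NULL) {
            PyErr_Print();
            return 1;
        }

        /* back to UCS4: both directions survive non-BMP characters */
        Py_UCS4 *copy = PyUnicode_AsUCS4Copy(u);
        if (copy == NULL) {
            PyErr_Print();
            return 1;
        }
        printf("len=%zd first=%#x second=%#x\n",
               PyUnicode_GetLength(u), (unsigned)copy[0], (unsigned)copy[1]);

        PyMem_Free(copy);
        Py_DECREF(u);
        Py_FinalizeEx();
        return 0;
    }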