summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--doc/release/upcoming_changes/22863.new_feature.rst11
-rw-r--r--doc/release/upcoming_changes/22963.new_feature.rst7
-rw-r--r--numpy/core/defchararray.py77
-rw-r--r--numpy/core/src/multiarray/dtypemeta.h1
-rw-r--r--numpy/core/src/multiarray/multiarraymodule.c44
-rw-r--r--numpy/core/tests/test_defchararray.py14
6 files changed, 111 insertions, 43 deletions
diff --git a/doc/release/upcoming_changes/22863.new_feature.rst b/doc/release/upcoming_changes/22863.new_feature.rst
index 88ec3f641..3f45ed834 100644
--- a/doc/release/upcoming_changes/22863.new_feature.rst
+++ b/doc/release/upcoming_changes/22863.new_feature.rst
@@ -1,7 +1,4 @@
-String dtype instances can be created from the string abstract dtype classes
-----------------------------------------------------------------------------
-It is now possible to create a string dtype instance with a size without
-using the string name of the dtype. For example, ``type(np.dtype('U'))(8)``
-will create a dtype that is equivalent to ``np.dtype('U8')``. This feature
-is most useful when writing generic code dealing with string dtype
-classes.
+String functions in np.char are compatible with NEP 42 custom dtypes
+--------------------------------------------------------------------
+Custom dtypes that represent unicode strings or byte strings can now be
+passed to the string functions in np.char.
diff --git a/doc/release/upcoming_changes/22963.new_feature.rst b/doc/release/upcoming_changes/22963.new_feature.rst
new file mode 100644
index 000000000..88ec3f641
--- /dev/null
+++ b/doc/release/upcoming_changes/22963.new_feature.rst
@@ -0,0 +1,7 @@
+String dtype instances can be created from the string abstract dtype classes
+----------------------------------------------------------------------------
+It is now possible to create a string dtype instance with a size without
+using the string name of the dtype. For example, ``type(np.dtype('U'))(8)``
+will create a dtype that is equivalent to ``np.dtype('U8')``. This feature
+is most useful when writing generic code dealing with string dtype
+classes.
diff --git a/numpy/core/defchararray.py b/numpy/core/defchararray.py
index d312506ff..98db3d882 100644
--- a/numpy/core/defchararray.py
+++ b/numpy/core/defchararray.py
@@ -46,26 +46,29 @@ array_function_dispatch = functools.partial(
overrides.array_function_dispatch, module='numpy.char')
-def _use_unicode(*args):
- """
- Helper function for determining the output type of some string
- operations.
+def _is_unicode(arr):
+ """Returns True if arr is a string or a string array with a dtype that
+ represents a unicode string, otherwise returns False.
- For an operation on two ndarrays, if at least one is unicode, the
- result should be unicode.
"""
- for x in args:
- if (isinstance(x, str) or
- issubclass(numpy.asarray(x).dtype.type, unicode_)):
- return unicode_
- return string_
+ if (isinstance(arr, str) or
+ issubclass(numpy.asarray(arr).dtype.type, str)):
+ return True
+ return False
+
-def _to_string_or_unicode_array(result):
+def _to_string_or_unicode_array(result, output_dtype_like=None):
"""
- Helper function to cast a result back into a string or unicode array
- if an object array must be used as an intermediary.
+ Helper function to cast a result back into an array
+ with the appropriate dtype if an object array must be used
+ as an intermediary.
"""
- return numpy.asarray(result.tolist())
+ ret = numpy.asarray(result.tolist())
+ dtype = getattr(output_dtype_like, 'dtype', None)
+ if dtype is not None:
+ return ret.astype(type(dtype)(_get_num_chars(ret)), copy=False)
+ return ret
+
def _clean_args(*args):
"""
@@ -319,9 +322,19 @@ def add(x1, x2):
arr1 = numpy.asarray(x1)
arr2 = numpy.asarray(x2)
out_size = _get_num_chars(arr1) + _get_num_chars(arr2)
- dtype = _use_unicode(arr1, arr2)
- return _vec_string(arr1, (dtype, out_size), '__add__', (arr2,))
+ if type(arr1.dtype) != type(arr2.dtype):
+ # Enforce this for now. The solution will be to implement add as a
+ # ufunc. It never worked right on Python 3: bytes + unicode gave
+ # nonsense, unicode + bytes errored, and unicode + object used the
+ # object dtype itemsize as num chars (worked on short strings).
+ # bytes + void worked, but promoting void->bytes is also dubious.
+ raise TypeError(
+ "np.char.add() requires both arrays of the same dtype kind, but "
+ f"got dtypes: '{arr1.dtype}' and '{arr2.dtype}' (the few cases "
+ "where this used to work often lead to incorrect results).")
+
+ return _vec_string(arr1, type(arr1.dtype)(out_size), '__add__', (arr2,))
def _multiply_dispatcher(a, i):
return (a,)
@@ -371,7 +384,7 @@ def multiply(a, i):
raise ValueError("Can only multiply by integers")
out_size = _get_num_chars(a_arr) * max(int(i_arr.max()), 0)
return _vec_string(
- a_arr, (a_arr.dtype.type, out_size), '__mul__', (i_arr,))
+ a_arr, type(a_arr.dtype)(out_size), '__mul__', (i_arr,))
def _mod_dispatcher(a, values):
@@ -403,7 +416,7 @@ def mod(a, values):
"""
return _to_string_or_unicode_array(
- _vec_string(a, object_, '__mod__', (values,)))
+ _vec_string(a, object_, '__mod__', (values,)), a)
@array_function_dispatch(_unary_op_dispatcher)
@@ -499,7 +512,7 @@ def center(a, width, fillchar=' '):
if numpy.issubdtype(a_arr.dtype, numpy.string_):
fillchar = asbytes(fillchar)
return _vec_string(
- a_arr, (a_arr.dtype.type, size), 'center', (width_arr, fillchar))
+ a_arr, type(a_arr.dtype)(size), 'center', (width_arr, fillchar))
def _count_dispatcher(a, sub, start=None, end=None):
@@ -723,7 +736,7 @@ def expandtabs(a, tabsize=8):
"""
return _to_string_or_unicode_array(
- _vec_string(a, object_, 'expandtabs', (tabsize,)))
+ _vec_string(a, object_, 'expandtabs', (tabsize,)), a)
@array_function_dispatch(_count_dispatcher)
@@ -1043,7 +1056,7 @@ def join(sep, seq):
"""
return _to_string_or_unicode_array(
- _vec_string(sep, object_, 'join', (seq,)))
+ _vec_string(sep, object_, 'join', (seq,)), seq)
@@ -1084,7 +1097,7 @@ def ljust(a, width, fillchar=' '):
if numpy.issubdtype(a_arr.dtype, numpy.string_):
fillchar = asbytes(fillchar)
return _vec_string(
- a_arr, (a_arr.dtype.type, size), 'ljust', (width_arr, fillchar))
+ a_arr, type(a_arr.dtype)(size), 'ljust', (width_arr, fillchar))
@array_function_dispatch(_unary_op_dispatcher)
@@ -1218,7 +1231,7 @@ def partition(a, sep):
"""
return _to_string_or_unicode_array(
- _vec_string(a, object_, 'partition', (sep,)))
+ _vec_string(a, object_, 'partition', (sep,)), a)
def _replace_dispatcher(a, old, new, count=None):
@@ -1263,8 +1276,7 @@ def replace(a, old, new, count=None):
array(['The dwash was fresh', 'Thwas was it'], dtype='<U19')
"""
return _to_string_or_unicode_array(
- _vec_string(
- a, object_, 'replace', [old, new] + _clean_args(count)))
+ _vec_string(a, object_, 'replace', [old, new] + _clean_args(count)), a)
@array_function_dispatch(_count_dispatcher)
@@ -1363,7 +1375,7 @@ def rjust(a, width, fillchar=' '):
if numpy.issubdtype(a_arr.dtype, numpy.string_):
fillchar = asbytes(fillchar)
return _vec_string(
- a_arr, (a_arr.dtype.type, size), 'rjust', (width_arr, fillchar))
+ a_arr, type(a_arr.dtype)(size), 'rjust', (width_arr, fillchar))
@array_function_dispatch(_partition_dispatcher)
@@ -1399,7 +1411,7 @@ def rpartition(a, sep):
"""
return _to_string_or_unicode_array(
- _vec_string(a, object_, 'rpartition', (sep,)))
+ _vec_string(a, object_, 'rpartition', (sep,)), a)
def _split_dispatcher(a, sep=None, maxsplit=None):
@@ -1829,7 +1841,7 @@ def zfill(a, width):
width_arr = numpy.asarray(width)
size = int(numpy.max(width_arr.flat))
return _vec_string(
- a_arr, (a_arr.dtype.type, size), 'zfill', (width_arr,))
+ a_arr, type(a_arr.dtype)(size), 'zfill', (width_arr,))
@array_function_dispatch(_unary_op_dispatcher)
@@ -1864,7 +1876,7 @@ def isnumeric(a):
array([ True, False, False, False, False])
"""
- if _use_unicode(a) != unicode_:
+ if not _is_unicode(a):
raise TypeError("isnumeric is only available for Unicode strings and arrays")
return _vec_string(a, bool_, 'isnumeric')
@@ -1901,8 +1913,9 @@ def isdecimal(a):
array([ True, False, False, False])
"""
- if _use_unicode(a) != unicode_:
- raise TypeError("isnumeric is only available for Unicode strings and arrays")
+ if not _is_unicode(a):
+ raise TypeError(
+ "isdecimal is only available for Unicode strings and arrays")
return _vec_string(a, bool_, 'isdecimal')
diff --git a/numpy/core/src/multiarray/dtypemeta.h b/numpy/core/src/multiarray/dtypemeta.h
index 5df333584..ef702f923 100644
--- a/numpy/core/src/multiarray/dtypemeta.h
+++ b/numpy/core/src/multiarray/dtypemeta.h
@@ -81,6 +81,7 @@ typedef struct {
#define NPY_DT_is_legacy(dtype) (((dtype)->flags & NPY_DT_LEGACY) != 0)
#define NPY_DT_is_abstract(dtype) (((dtype)->flags & NPY_DT_ABSTRACT) != 0)
#define NPY_DT_is_parametric(dtype) (((dtype)->flags & NPY_DT_PARAMETRIC) != 0)
+#define NPY_DT_is_user_defined(dtype) (((dtype)->type_num == -1))
/*
* Macros for convenient classmethod calls, since these require
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index 5da3d66df..94fa2a909 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -3785,6 +3785,34 @@ format_longfloat(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds)
TrimMode_LeaveOneZero, -1, -1);
}
+/*
+ * Returns 1 if the array has a user-defined string dtype; otherwise sets
+ * a TypeError and returns 0.
+ */
+static int _is_user_defined_string_array(PyArrayObject* array)
+{
+ if (NPY_DT_is_user_defined(PyArray_DESCR(array))) {
+ PyTypeObject* scalar_type = NPY_DTYPE(PyArray_DESCR(array))->scalar_type;
+ if (PyType_IsSubtype(scalar_type, &PyBytes_Type) ||
+ PyType_IsSubtype(scalar_type, &PyUnicode_Type)) {
+ return 1;
+ }
+ else {
+ PyErr_SetString(
+ PyExc_TypeError,
+ "string comparisons are only allowed for dtypes with a "
+ "scalar type that is a subtype of str or bytes.");
+ return 0;
+ }
+ }
+ else {
+ PyErr_SetString(
+ PyExc_TypeError,
+ "string operation on non-string array");
+ return 0;
+ }
+}
+
/*
* The only purpose of this function is that it allows the "rstrip".
@@ -3861,6 +3889,9 @@ compare_chararrays(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds)
else {
PyErr_SetString(PyExc_TypeError,
"comparison of non-string arrays");
+ Py_DECREF(newarr);
+ Py_DECREF(newoth);
+ return NULL;
}
Py_DECREF(newarr);
Py_DECREF(newoth);
@@ -4061,10 +4092,15 @@ _vec_string(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *NPY_UNUSED(kw
method = PyObject_GetAttr((PyObject *)&PyUnicode_Type, method_name);
}
else {
- PyErr_SetString(PyExc_TypeError,
- "string operation on non-string array");
- Py_DECREF(type);
- goto err;
+ if (_is_user_defined_string_array(char_array)) {
+ PyTypeObject* scalar_type =
+ NPY_DTYPE(PyArray_DESCR(char_array))->scalar_type;
+ method = PyObject_GetAttr((PyObject*)scalar_type, method_name);
+ }
+ else {
+ Py_DECREF(type);
+ goto err;
+ }
}
if (method == NULL) {
Py_DECREF(type);
diff --git a/numpy/core/tests/test_defchararray.py b/numpy/core/tests/test_defchararray.py
index 22296604e..8d92d97f7 100644
--- a/numpy/core/tests/test_defchararray.py
+++ b/numpy/core/tests/test_defchararray.py
@@ -1,3 +1,5 @@
+import pytest
+
import numpy as np
from numpy.core.multiarray import _vec_string
from numpy.testing import (
@@ -670,3 +672,15 @@ def test_empty_indexing():
# empty chararray instead of a chararray with a single empty string in it.
s = np.chararray((4,))
assert_(s[[]].size == 0)
+
+
+@pytest.mark.parametrize(["dt1", "dt2"],
+ [("S", "U"), ("U", "S"), ("S", "O"), ("U", "O"),
+ ("S", "d"), ("S", "V")])
+def test_add_types(dt1, dt2):
+ arr1 = np.array([1234234], dtype=dt1)
+ # If the following fails, use e.g. a number and test "V" explicitly
+ arr2 = np.array([b"423"], dtype=dt2)
+ with pytest.raises(TypeError,
+ match=f".*same dtype kind.*{arr1.dtype}.*{arr2.dtype}"):
+ np.char.add(arr1, arr2)