summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSebastian Berg <sebastian@sipsolutions.net>2020-03-06 17:35:59 -0800
committerSebastian Berg <sebastian@sipsolutions.net>2020-07-08 18:13:05 -0500
commitb5dc1ed68147ec03ce7ba7cfde253fb4eabe5321 (patch)
tree56e3b283ef0dce02f5fbba70eb2b0380c9a6eaae
parent996e6419e44668a6daa28f58efcc0d2c07b3a9da (diff)
downloadnumpy-b5dc1ed68147ec03ce7ba7cfde253fb4eabe5321.tar.gz
WIP: Rework array coercion
-rw-r--r--numpy/core/src/multiarray/array_coercion.c356
-rw-r--r--numpy/core/src/multiarray/array_coercion.h27
2 files changed, 383 insertions, 0 deletions
diff --git a/numpy/core/src/multiarray/array_coercion.c b/numpy/core/src/multiarray/array_coercion.c
new file mode 100644
index 000000000..94e57b775
--- /dev/null
+++ b/numpy/core/src/multiarray/array_coercion.c
@@ -0,0 +1,356 @@
+#include "numpy/arrayobject.h"
+
+#include "descriptor.h"
+#include "convert_datatype.h"
+#include "dtypemeta.h"
+
+#include "array_coercion.h"
+#include "ctors.h"
+
+
+/*
+ * This file defines helpers for some of the ctors.c functions which
+ * create an array from Python sequences and types.
+ * When creating an array with ``np.array(...)`` we have to do two main things:
+ *
+ * 1. Find the exact shape of the resulting array
+ * 2. Find the correct dtype of the resulting array.
+ *
+ * In most cases these two things are can be done in a single processing step.
+ * There are in principle three different calls that should be distinguished:
+ *
+ * 1. The user calls ``np.array(..., dtype=np.dtype("<f8"))``
+ * 2. The user calls ``np.array(..., dtype="S")``
+ * 3. The user calls ``np.array(...)``
+ *
+ * In the first case, in principle only the shape needs to be found. In the
+ * second case, the DType class (e.g. string) is already known but the DType
+ * instance (e.g. length of the string) has to be found.
+ * In the last case the DType class needs to be found as well. Note that
+ * it is not necessary to find the DType class of the entire array, but
+ * the DType class needs to be found for each element before the actual
+ * dtype instance can be found.
+ *
+ * Further, there are a few other things to keep in mind when coercing arrays:
+ *
+ * * For UFunc promotion, Python scalars need to be handled specially to
+ * allow value based casting.
+ * * It is necessary to decide whether or not a sequence is an element.
+ * For example tuples are considered elements for structured dtypes, but
+ * otherwise are considered sequences.
+ * This means that if a dtype is given (either as a class or instance),
+ * it can effect the dimension discovery part.
+ *
+ * In the initial version of this implementation, it is assumed that dtype
+ * discovery can be implemented sufficiently fast, that it is not necessary
+ * to create fast paths that only find the correct shape e.g. when
+ * ``dtype=np.dtype("f8")`` is given.
+ *
+ * One design goal in this code is to avoid multiple conversions of nested
+ * array like objects and sequences. Thus a cache is created to store sequences
+ * for the internal API which in almost all cases will, after allocating the
+ * new array, iterate all objects a second time to fill that array.
+ */
+
+
+static int
+update_shape(int curr_ndim, int *max_ndim,
+ npy_intp out_shape[NPY_MAXDIMS], int new_ndim,
+ const npy_intp new_shape[NPY_MAXDIMS], npy_bool sequence)
+{
+ int success = 0; /* unsuccessful if array is ragged */
+ if (curr_ndim + new_ndim > *max_ndim) {
+ success = -1;
+ /* Only update check as many dims as possible, max_ndim is unchanged */
+ new_ndim = *max_ndim - curr_ndim;
+ }
+ else if (!sequence && (*max_ndim != curr_ndim + new_ndim)) {
+ /*
+ * Sequences do not update max_ndim, otherwise shrink and check.
+ * This is depth first, so if it is already set, `out_shape` is filled.
+ */
+ *max_ndim = curr_ndim + new_ndim;
+ /* If a shape was already set, this is also ragged */
+ if (out_shape[*max_ndim] >= 0) {
+ success = -1;
+ }
+ }
+ for (int i = 0; i < new_ndim; i++) {
+ npy_intp curr_dim = out_shape[curr_ndim + i];
+ npy_intp new_dim = new_shape[i];
+
+ if (curr_dim == -1) {
+ out_shape[curr_ndim + i] = new_dim;
+ }
+ else if (new_dim != curr_dim) {
+ /* The array is ragged, and this dimension is unusable already */
+ success = -1;
+ if (!sequence) {
+ /* Remove dimensions that we cannot use: */
+ *max_ndim -= new_ndim + i;
+ }
+ else {
+ assert(i == 0);
+ /* max_ndim is usually not updated for sequences, so set now: */
+ *max_ndim = curr_ndim;
+ }
+ break;
+ }
+ }
+ return success;
+}
+
+
+/*
+ * This cache is necessary for the simple case of an array input mostly.
+ * Since that is the main reason, this may be removed if the single array
+ * case is handled specifically up-front.
+ * This may be better as a different caching mechanism...
+ */
+NPY_NO_EXPORT int
+npy_new_coercion_cache(
+ PyObject *converted_obj, PyObject *arr_or_sequence, npy_bool sequence,
+ coercion_cache_obj ***next_ptr)
+{
+ coercion_cache_obj *cache = PyArray_malloc(sizeof(coercion_cache_obj));
+ if (cache == NULL) {
+ PyErr_NoMemory();
+ return -1;
+ }
+ cache->converted_obj = converted_obj;
+ Py_INCREF(arr_or_sequence);
+ cache->arr_or_sequence = arr_or_sequence;
+ cache->sequence = sequence;
+ cache->next = NULL;
+ **next_ptr = cache;
+ *next_ptr = &(cache->next);
+ return 0;
+}
+
+
+NPY_NO_EXPORT void
+npy_free_coercion_cache(coercion_cache_obj *next) {
+ /* We only need to check from the last used cache pos */
+ int cache_pos = 0;
+ while (next != NULL) {
+ coercion_cache_obj *current = next;
+ next = current->next;
+
+ Py_DECREF(current->arr_or_sequence);
+ PyArray_free(current);
+ }
+}
+
+
+
+static NPY_INLINE int
+handle_promotion(PyArray_Descr**out_descr, PyArray_Descr *descr)
+{
+ if (*out_descr == NULL) {
+ *out_descr = descr;
+ return 0;
+ }
+ PyArray_Descr *new_descr = PyArray_PromoteTypes(*out_descr, descr);
+ // TODO: Have to take care of the retry-with-string logic for now :(
+ Py_DECREF(descr);
+ if (new_descr == NULL) {
+ return -1;
+ }
+ Py_SETREF(*out_descr, new_descr);
+ return 0;
+
+}
+
+
+enum _dtype_discovery_flags {
+ IS_RAGGED_ARRAY = 1,
+};
+
+
+/**
+ * Discover the dtype and shape for a potentially nested sequence of scalars.
+ * Note that in the ufunc machinery, when value based casting is desired it
+ * is necessary to first check for the scalar case.
+ *
+ * @param obj The python object or nested sequence to convert
+ * @param max_dims The maximum number of dimensions.
+ * @param curr_dims The current number of dimensions (depth in the recursion)
+ * @param out_shape The discovered output shape, will be filled
+ * @param coercion_cache The coercion cache object to use.
+ * @param DType the DType class that should be used, or NULL, if not provided.
+ * @param requested_descr The dtype instance passed in by the user, this is
+ * is used only array-likes.
+ * @param flags used signal that this is a ragged array, used internally and
+ * can be expanded if necessary.
+ */
+NPY_NO_EXPORT int
+PyArray_DiscoverDTypeAndShape_Recursive(
+ PyObject *obj, int curr_dims, int max_dims, PyArray_Descr**out_descr,
+ npy_intp out_shape[NPY_MAXDIMS],
+ coercion_cache_obj ***coercion_cache_tail_ptr,
+ PyArray_DTypeMeta *fixed_DType, PyArray_Descr *requested_descr,
+ _dtype_discovery_flags *flags)
+{
+ /*
+ * The first step is to find the DType class if it was not provided
+ */
+ PyArray_DTypeMeta *DType;
+ if (fixed_DType == NULL) {
+ DType = discover_dtype_from_pytype(Py_TYPE(obj));
+ if (DType == NULL) {
+ // TODO: Is there a need for an error return here?
+ return -1;
+ }
+ }
+ else {
+ Py_INCREF(fixed_DType);
+ DType = fixed_DType;
+ }
+
+ /*
+ * The second step is to ask the DType class to handle the scalar cases
+ * or return NotImplemented to signal that this should be assumed to be
+ * an array-like or sequence.
+ * We do this even when the dtype was provided, to handle the dimension
+ * discovery (possibly a fastpath can be added for that at some point).
+ */
+ if (DType != (PyArray_DTypeMeta *)Py_NotImplemented) {
+ assert(Py_TYPE(DType) == &PyArrayDTypeMeta_Type);
+
+ PyArray_Descr *descr = DType->discover_descr_from_pyobject(obj);
+ Py_DECREF(DType);
+ if (descr == NULL) {
+ return -1;
+ }
+ else if (descr == (PyArray_Descr *) Py_NotImplemented) {
+ Py_DECREF(descr);
+ }
+ else {
+ /* This is a scalar */
+ if (update_shape(curr_dims, &max_dims, out_shape, 0, NULL, NPY_FALSE) < 0) {
+ goto ragged_array;
+ }
+ if (handle_promotion(out_descr, descr) < 0) {
+ Py_DECREF(descr);
+ return -1;
+ }
+ Py_DECREF(descr);
+ return max_dims;
+ }
+ }
+ else {
+ /* If no DType was found, this must be an array or a sequence */
+ assert(DType == (PyArray_DTypeMeta *)Py_NotImplemented);
+ Py_DECREF(DType);
+ }
+
+ /*
+ * The third step is to first check for any arrays or array-likes.
+ */
+
+ /* Check if it's an ndarray */
+ PyArrayObject *arr = NULL;
+ if (PyArray_Check(obj)) {
+ arr = (PyArrayObject *)obj;
+ Py_INCREF(arr);
+ }
+ else {
+ arr = _array_from_array_like(obj, requested_descr, 0, NULL);
+ if (arr == NULL) {
+ goto fail;
+ }
+ else if (arr == (PyArrayObject *)Py_NotImplemented) {
+ Py_DECREF(arr);
+ arr = NULL;
+ }
+ }
+ if (arr) {
+ /* This is an array object which will be added to the cache */
+ if (npy_new_coercion_cache(obj, arr, 0, coercion_cache_tail_ptr) < 0) {
+ goto fail;
+ }
+ if (update_shape(curr_dims, &max_dims, out_shape,
+ PyArray_NDIM(arr), PyArray_SHAPE(arr), NPY_FALSE) < 0) {
+ goto ragged_array;
+ }
+ if (handle_promotion(out_descr, PyArray_DESCR(arr)) < 0) {
+ return -1;
+ }
+ return max_dims;
+ }
+
+ /*
+ * The last step is to assume the input should be handled as a sequence
+ * and to handle it recursively.
+ */
+ // TODO: Is the PySequence check necessary or does PySequence_Fast suffice?
+ if (!PySequence_Check(obj) || PySequence_Size(obj) < 0) {
+ /* clear any PySequence_Size error which corrupts further calls */
+ PyErr_Clear();
+
+ /* This branch always leads to a ragged array */
+ update_shape(curr_dims, &max_dims, out_shape, 0, NULL, NPY_FALSE);
+ goto ragged_array;
+ }
+
+ /* Ensure we have a sequence (required for PyPy) */
+ PyObject *seq = PySequence_Fast(obj, "Could not convert object to sequence");
+ if (seq == NULL) {
+ /* Specifically do not fail on things that look like a dictionary */
+ if (PyErr_ExceptionMatches(PyExc_KeyError)) {
+ PyErr_Clear();
+ update_shape(curr_dims, &max_dims, out_shape, 0, NULL, NPY_FALSE);
+ goto ragged_array;
+ }
+ goto fail;
+ }
+ if (npy_new_coercion_cache(obj, seq, 1, coercion_cache_tail_ptr) < 0) {
+ Py_DECREF(seq);
+ goto fail;
+ }
+
+ npy_intp size = PySequence_Fast_GET_SIZE(seq);
+ PyObject **objects = PySequence_Fast_ITEMS(seq);
+
+ if (update_shape(curr_dims, &max_dims,
+ out_shape, 1, &size, NPY_TRUE) < 0) {
+ /* But do update, if there this is a ragged case */
+ Py_DECREF(seq);
+ goto ragged_array;
+ }
+ if (size == 0) {
+ Py_DECREF(seq);
+ /* If the sequence is empty, we have to assume thats it... */
+ return curr_dims+1;
+ }
+
+ /* Recursive call for each sequence item */
+ for (Py_ssize_t i = 0; i < size; ++i) {
+ max_dims = PyArray_DiscoverDTypeAndShape_Recursive(
+ objects[i], max_dims, curr_dims + 1,
+ out_dtype, out_shape, use_minimal,
+ single_or_no_element, requested_dtype, context,
+ stop_at_tuple, string_is_sequence,
+ prev_type, prev_dtype, coercion_cache_tail_ptr);
+ // NOTE: If there is a ragged array found (NPY_OBJECT) could break
+ if (max_dims < 0) {
+ Py_DECREF(seq);
+ goto fail;
+ }
+ }
+ Py_DECREF(seq);
+ return max_dims;
+
+
+
+ragged_array:
+ /*
+ * This is discovered as a ragged array, which means the dtype is
+ * guaranteed to be object. A warning will need to be given if an
+ * dtype[object] was not requested (checked outside to only warn once).
+ */
+ *flags &= IS_RAGGED_ARRAY;
+ Py_XDECREF(*out_descr);
+ *out_descr = PyArray_DescrFromType(NPY_OBJECT);
+ return max_dims;
+} \ No newline at end of file
diff --git a/numpy/core/src/multiarray/array_coercion.h b/numpy/core/src/multiarray/array_coercion.h
new file mode 100644
index 000000000..7402b4b45
--- /dev/null
+++ b/numpy/core/src/multiarray/array_coercion.h
@@ -0,0 +1,27 @@
+#ifndef _NPY_ARRAY_COERCION_H
+#define _NPY_ARRAY_COERCION_H
+
+#include <numpy/ndarraytypes.h>
+
+/*
+ * We do not want to coerce arrays many times unless absolutely necessary.
+ * The same goes for sequences, so everything we have seen, we will have
+ * to store somehow. This is a linked list of these objects.
+ */
+typedef struct coercion_cache_obj {
+ PyObject *converted_obj;
+ PyObject *arr_or_sequence;
+ struct coercion_cache_obj *next;
+ npy_bool sequence;
+} coercion_cache_obj;
+
+/* Create a new cache object */
+NPY_NO_EXPORT int npy_new_coercion_cache(
+ PyObject *converted_obj, PyObject *arr_or_sequence, npy_bool sequence,
+ coercion_cache_obj ***next_ptr);
+
+/* Frees the coercion cache object. */
+NPY_NO_EXPORT void npy_free_coercion_cache(coercion_cache_obj *first);
+
+
+#endif /* _NPY_ARRAY_COERCION_H */