path: root/numpy/lib/recfunctions.py
author    Eric Wieser <wieser.eric@gmail.com>    2019-06-08 16:17:17 -0700
committer Eric Wieser <wieser.eric@gmail.com>    2019-06-08 19:57:59 -0700
commit    b12a8690b6383e03573237b65fddd859afa1f282 (patch)
tree      f639320bd30b8b7bff5400592ab595c6fb5d6fb6 /numpy/lib/recfunctions.py
parent    24b2a2d36a7e8356310cd16dbe60abd9d0e682dc (diff)
parent    f07a38da97a6a36eb12b203f6c1ffa4bf2b2cb87 (diff)
Merge tag 'branch-points/1.16.x' into bit_shifts
Diffstat (limited to 'numpy/lib/recfunctions.py')
-rw-r--r-- numpy/lib/recfunctions.py | 745
1 file changed, 656 insertions(+), 89 deletions(-)
diff --git a/numpy/lib/recfunctions.py b/numpy/lib/recfunctions.py
index 4ae1079d2..fcc0d9a7a 100644
--- a/numpy/lib/recfunctions.py
+++ b/numpy/lib/recfunctions.py
@@ -14,8 +14,10 @@ import numpy.ma as ma
from numpy import ndarray, recarray
from numpy.ma import MaskedArray
from numpy.ma.mrecords import MaskedRecords
+from numpy.core.overrides import array_function_dispatch
from numpy.lib._iotools import _is_string_like
from numpy.compat import basestring
+from numpy.testing import suppress_warnings
if sys.version_info[0] < 3:
from future_builtins import zip
@@ -31,6 +33,11 @@ __all__ = [
]
+def _recursive_fill_fields_dispatcher(input, output):
+ return (input, output)
+
+
+@array_function_dispatch(_recursive_fill_fields_dispatcher)
def recursive_fill_fields(input, output):
"""
Fills fields from output with fields from input,
@@ -70,6 +77,37 @@ def recursive_fill_fields(input, output):
return output
+def get_fieldspec(dtype):
+ """
+ Produce a list of name/dtype pairs corresponding to the dtype fields
+
+ Similar to dtype.descr, but the second item of each tuple is a dtype, not a
+ string. As a result, this handles subarray dtypes
+
+ Can be passed to the dtype constructor to reconstruct the dtype, noting that
+ this (deliberately) discards field offsets.
+
+ Examples
+ --------
+ >>> dt = np.dtype([(('a', 'A'), np.int64), ('b', np.double, 3)])
+ >>> dt.descr
+ [(('a', 'A'), '<i8'), ('b', '<f8', (3,))]
+ >>> get_fieldspec(dt)
+ [(('a', 'A'), dtype('int64')), ('b', dtype(('<f8', (3,))))]
+
+ """
+ if dtype.names is None:
+ # .descr returns a nameless field, so we should too
+ return [('', dtype)]
+ else:
+ fields = ((name, dtype.fields[name]) for name in dtype.names)
+ # keep any titles, if present
+ return [
+ (name if len(f) == 2 else (f[2], name), f[0])
+ for name, f in fields
+ ]
+
+
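Since get_fieldspec deliberately discards offsets, round-tripping a padded dtype through the dtype constructor yields a packed layout. A minimal doctest-style sketch (dtype reprs assume a little-endian build):

>>> padded = np.dtype({'names': ['a', 'b'], 'formats': ['u1', 'i4'],
...                    'offsets': [0, 4]})
>>> np.dtype(get_fieldspec(padded))  # padding between 'a' and 'b' is dropped
dtype([('a', 'u1'), ('b', '<i4')])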
def get_names(adtype):
"""
Returns the field names of the input datatype as a tuple.
@@ -146,7 +184,7 @@ def flatten_descr(ndtype):
"""
names = ndtype.names
if names is None:
- return ndtype.descr
+ return (('', ndtype),)
else:
descr = []
for field in names:
@@ -158,6 +196,28 @@ def flatten_descr(ndtype):
return tuple(descr)
+def _zip_dtype_dispatcher(seqarrays, flatten=None):
+ return seqarrays
+
+
+@array_function_dispatch(_zip_dtype_dispatcher)
+def zip_dtype(seqarrays, flatten=False):
+ newdtype = []
+ if flatten:
+ for a in seqarrays:
+ newdtype.extend(flatten_descr(a.dtype))
+ else:
+ for a in seqarrays:
+ current = a.dtype
+ if current.names and len(current.names) <= 1:
+ # special case - dtypes of 0 or 1 field are flattened
+ newdtype.extend(get_fieldspec(current))
+ else:
+ newdtype.append(('', current))
+ return np.dtype(newdtype)
+
+
+@array_function_dispatch(_zip_dtype_dispatcher)
def zip_descr(seqarrays, flatten=False):
"""
Combine the dtype description of a series of arrays.
@@ -169,19 +229,7 @@ def zip_descr(seqarrays, flatten=False):
flatten : {boolean}, optional
Whether to collapse nested descriptions.
"""
- newdtype = []
- if flatten:
- for a in seqarrays:
- newdtype.extend(flatten_descr(a.dtype))
- else:
- for a in seqarrays:
- current = a.dtype
- names = current.names or ()
- if len(names) > 1:
- newdtype.append(('', current.descr))
- else:
- newdtype.extend(current.descr)
- return np.dtype(newdtype).descr
+ return zip_dtype(seqarrays, flatten=flatten).descr
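A sketch of the behaviour the new helper gives (illustrative; the '' name is auto-assigned 'f1' by the dtype constructor, and reprs assume little-endian):

>>> a = np.zeros(3, dtype=[('x', 'i4')])
>>> b = np.zeros(3, dtype=[('y', 'f4'), ('z', 'f8')])
>>> zip_dtype((a, b))              # the one-field dtype of `a` is flattened
dtype([('x', '<i4'), ('f1', [('y', '<f4'), ('z', '<f8')])])
>>> zip_dtype((a, b), flatten=True)
dtype([('x', '<i4'), ('y', '<f4'), ('z', '<f8')])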
def get_fieldstructure(adtype, lastname=None, parents=None,):
@@ -262,6 +310,11 @@ def _izip_fields(iterable):
yield element
+def _izip_records_dispatcher(seqarrays, fill_value=None, flatten=None):
+ return seqarrays
+
+
+@array_function_dispatch(_izip_records_dispatcher)
def izip_records(seqarrays, fill_value=None, flatten=True):
"""
Returns an iterator of concatenated items from a sequence of arrays.
@@ -275,24 +328,20 @@ def izip_records(seqarrays, fill_value=None, flatten=True):
flatten : {True, False},
Whether to flatten each record into a flat tuple of field values, or keep the nested structure.
"""
- # OK, that's a complete ripoff from Python2.6 itertools.izip_longest
- def sentinel(counter=([fill_value] * (len(seqarrays) - 1)).pop):
- "Yields the fill_value or raises IndexError"
- yield counter()
- #
- fillers = itertools.repeat(fill_value)
- iters = [itertools.chain(it, sentinel(), fillers) for it in seqarrays]
+
# Should we flatten the items, or just use a nested approach
if flatten:
zipfunc = _izip_fields_flat
else:
zipfunc = _izip_fields
- #
- try:
- for tup in zip(*iters):
- yield tuple(zipfunc(tup))
- except IndexError:
- pass
+
+ if sys.version_info[0] >= 3:
+ zip_longest = itertools.zip_longest
+ else:
+ zip_longest = itertools.izip_longest
+
+ for tup in zip_longest(*seqarrays, fillvalue=fill_value):
+ yield tuple(zipfunc(tup))
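The zip_longest-based rewrite pads shorter inputs with fill_value rather than chaining sentinels. A quick sketch of the iterator's output:

>>> a = np.array([(1, 1.), (2, 2.)], dtype=[('x', 'i4'), ('y', 'f8')])
>>> b = np.array([(10,)], dtype=[('z', 'i4')])
>>> list(izip_records((a, b), fill_value=-1))
[(1, 1.0, 10), (2, 2.0, -1)]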
def _fix_output(output, usemask=True, asrecarray=False):
@@ -326,6 +375,12 @@ def _fix_defaults(output, defaults=None):
return output
+def _merge_arrays_dispatcher(seqarrays, fill_value=None, flatten=None,
+ usemask=None, asrecarray=None):
+ return seqarrays
+
+
+@array_function_dispatch(_merge_arrays_dispatcher)
def merge_arrays(seqarrays, fill_value=-1, flatten=False,
usemask=False, asrecarray=False):
"""
@@ -366,12 +421,13 @@ def merge_arrays(seqarrays, fill_value=-1, flatten=False,
Notes
-----
* Without a mask, the missing value will be filled with something,
- * depending on what its corresponding type:
- -1 for integers
- -1.0 for floating point numbers
- '-' for characters
- '-1' for strings
- True for boolean values
+ depending on its corresponding type:
+
+ * ``-1`` for integers
+ * ``-1.0`` for floating point numbers
+ * ``'-'`` for characters
+ * ``'-1'`` for strings
+ * ``True`` for boolean values
* XXX: I just obtained these values empirically
"""
# Only one item in the input sequence ?
@@ -380,13 +436,12 @@ def merge_arrays(seqarrays, fill_value=-1, flatten=False,
# Do we have a single ndarray as input ?
if isinstance(seqarrays, (ndarray, np.void)):
seqdtype = seqarrays.dtype
- if (not flatten) or \
- (zip_descr((seqarrays,), flatten=True) == seqdtype.descr):
+ # Make sure we have named fields
+ if not seqdtype.names:
+ seqdtype = np.dtype([('', seqdtype)])
+ if not flatten or zip_dtype((seqarrays,), flatten=True) == seqdtype:
# Minimal processing needed: just make sure everything's a-ok
seqarrays = seqarrays.ravel()
- # Make sure we have named fields
- if not seqdtype.names:
- seqdtype = [('', seqdtype)]
# Find what type of array we must return
if usemask:
if asrecarray:
@@ -407,7 +462,7 @@ def merge_arrays(seqarrays, fill_value=-1, flatten=False,
sizes = tuple(a.size for a in seqarrays)
maxlength = max(sizes)
# Get the dtype of the output (flattening if needed)
- newdtype = zip_descr(seqarrays, flatten=flatten)
+ newdtype = zip_dtype(seqarrays, flatten=flatten)
# Initialize the sequences for data and mask
seqdata = []
seqmask = []
@@ -463,6 +518,11 @@ def merge_arrays(seqarrays, fill_value=-1, flatten=False,
return output
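The default fill values listed in the Notes apply when the input arrays have different lengths; for reference (the '<i8' repr assumes a platform where the default integer is 64-bit):

>>> merge_arrays((np.array([1, 2]), np.array([10., 20., 30.])))
array([( 1, 10.), ( 2, 20.), (-1, 30.)],
      dtype=[('f0', '<i8'), ('f1', '<f8')])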
+def _drop_fields_dispatcher(base, drop_names, usemask=None, asrecarray=None):
+ return (base,)
+
+
+@array_function_dispatch(_drop_fields_dispatcher)
def drop_fields(base, drop_names, usemask=True, asrecarray=False):
"""
Return a new array with fields in `drop_names` dropped.
@@ -499,7 +559,7 @@ def drop_fields(base, drop_names, usemask=True, asrecarray=False):
dtype=[('a', '<i4')])
"""
if _is_string_like(drop_names):
- drop_names = [drop_names, ]
+ drop_names = [drop_names]
else:
drop_names = set(drop_names)
@@ -527,6 +587,36 @@ def drop_fields(base, drop_names, usemask=True, asrecarray=False):
return _fix_output(output, usemask=usemask, asrecarray=asrecarray)
+def _keep_fields(base, keep_names, usemask=True, asrecarray=False):
+ """
+ Return a new array keeping only the fields in `keep_names`,
+ and preserving the order of those fields.
+
+ Parameters
+ ----------
+ base : array
+ Input array
+ keep_names : string or sequence
+ String or sequence of strings corresponding to the names of the
+ fields to keep. Order of the names will be preserved.
+ usemask : {False, True}, optional
+ Whether to return a masked array or not.
+ asrecarray : string or sequence, optional
+ Whether to return a recarray or a mrecarray (`asrecarray=True`) or
+ a plain ndarray or masked array with flexible dtype. The default
+ is False.
+ """
+ newdtype = [(n, base.dtype[n]) for n in keep_names]
+ output = np.empty(base.shape, dtype=newdtype)
+ output = recursive_fill_fields(base, output)
+ return _fix_output(output, usemask=usemask, asrecarray=asrecarray)
+
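A small sketch of the new private helper, which keeps only the named fields and honours their requested order:

>>> a = np.array([(1, 2., 3), (4, 5., 6)],
...              dtype=[('a', 'i4'), ('b', 'f8'), ('c', 'i4')])
>>> _keep_fields(a, ['c', 'a'], usemask=False)
array([(3, 1), (6, 4)], dtype=[('c', '<i4'), ('a', '<i4')])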
+
+def _rec_drop_fields_dispatcher(base, drop_names):
+ return (base,)
+
+
+@array_function_dispatch(_rec_drop_fields_dispatcher)
def rec_drop_fields(base, drop_names):
"""
Returns a new numpy.recarray with fields in `drop_names` dropped.
@@ -534,6 +624,11 @@ def rec_drop_fields(base, drop_names):
return drop_fields(base, drop_names, usemask=False, asrecarray=True)
+def _rename_fields_dispatcher(base, namemapper):
+ return (base,)
+
+
+@array_function_dispatch(_rename_fields_dispatcher)
def rename_fields(base, namemapper):
"""
Rename the fields from a flexible-datatype ndarray or recarray.
@@ -573,6 +668,14 @@ def rename_fields(base, namemapper):
return base.view(newdtype)
+def _append_fields_dispatcher(base, names, data, dtypes=None,
+ fill_value=None, usemask=None, asrecarray=None):
+ yield base
+ for d in data:
+ yield d
+
+
+@array_function_dispatch(_append_fields_dispatcher)
def append_fields(base, names, data, dtypes=None,
fill_value=-1, usemask=True, asrecarray=False):
"""
@@ -634,14 +737,22 @@ def append_fields(base, names, data, dtypes=None,
else:
data = data.pop()
#
- output = ma.masked_all(max(len(base), len(data)),
- dtype=base.dtype.descr + data.dtype.descr)
+ output = ma.masked_all(
+ max(len(base), len(data)),
+ dtype=get_fieldspec(base.dtype) + get_fieldspec(data.dtype))
output = recursive_fill_fields(base, output)
output = recursive_fill_fields(data, output)
#
return _fix_output(output, usemask=usemask, asrecarray=asrecarray)
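With get_fieldspec in place of dtype.descr, the output dtype now survives subarray and titled fields. Basic usage, for reference:

>>> base = np.array([(1,), (2,)], dtype=[('a', 'i4')])
>>> append_fields(base, 'b', np.array([10., 20.]), usemask=False)
array([(1, 10.), (2, 20.)], dtype=[('a', '<i4'), ('b', '<f8')])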
+def _rec_append_fields_dispatcher(base, names, data, dtypes=None):
+ yield base
+ for d in data:
+ yield d
+
+
+@array_function_dispatch(_rec_append_fields_dispatcher)
def rec_append_fields(base, names, data, dtypes=None):
"""
Add new fields to an existing array.
@@ -676,6 +787,435 @@ def rec_append_fields(base, names, data, dtypes=None):
asrecarray=True, usemask=False)
+def _repack_fields_dispatcher(a, align=None, recurse=None):
+ return (a,)
+
+
+@array_function_dispatch(_repack_fields_dispatcher)
+def repack_fields(a, align=False, recurse=False):
+ """
+ Re-pack the fields of a structured array or dtype in memory.
+
+ The memory layout of structured datatypes allows fields at arbitrary
+ byte offsets. This means the fields can be separated by padding bytes,
+ their offsets can be non-monotonically increasing, and they can overlap.
+
+ This method removes any overlaps and reorders the fields in memory so they
+ have increasing byte offsets, and adds or removes padding bytes depending
+ on the `align` option, which behaves like the `align` option to `np.dtype`.
+
+ If `align=False`, this method produces a "packed" memory layout in which
+ each field starts at the byte where the previous field ended, and any padding
+ bytes are removed.
+
+ If `align=True`, this method produces an "aligned" memory layout in which
+ each field's offset is a multiple of its alignment, and the total itemsize
+ is a multiple of the largest alignment, by adding padding bytes as needed.
+
+ Parameters
+ ----------
+ a : ndarray or dtype
+ array or dtype for which to repack the fields.
+ align : boolean
+ If true, use an "aligned" memory layout, otherwise use a "packed" layout.
+ recurse : boolean
+ If True, also repack nested structures.
+
+ Returns
+ -------
+ repacked : ndarray or dtype
+ Copy of `a` with fields repacked, or `a` itself if no repacking was
+ needed.
+
+ Examples
+ --------
+
+ >>> def print_offsets(d):
+ ... print("offsets:", [d.fields[name][1] for name in d.names])
+ ... print("itemsize:", d.itemsize)
+ ...
+ >>> dt = np.dtype('u1,i4,f8', align=True)
+ >>> dt
+ dtype({'names':['f0','f1','f2'], 'formats':['u1','<i4','<f8'], 'offsets':[0,4,8], 'itemsize':16}, align=True)
+ >>> print_offsets(dt)
+ offsets: [0, 4, 8]
+ itemsize: 16
+ >>> packed_dt = repack_fields(dt)
+ >>> packed_dt
+ dtype([('f0', 'u1'), ('f1', '<i4'), ('f2', '<f8')])
+ >>> print_offsets(packed_dt)
+ offsets: [0, 1, 5]
+ itemsize: 13
+
+ """
+ if not isinstance(a, np.dtype):
+ dt = repack_fields(a.dtype, align=align, recurse=recurse)
+ return a.astype(dt, copy=False)
+
+ if a.names is None:
+ return a
+
+ fieldinfo = []
+ for name in a.names:
+ tup = a.fields[name]
+ if recurse:
+ fmt = repack_fields(tup[0], align=align, recurse=True)
+ else:
+ fmt = tup[0]
+
+ if len(tup) == 3:
+ name = (tup[2], name)
+
+ fieldinfo.append((name, fmt))
+
+ dt = np.dtype(fieldinfo, align=align)
+ return np.dtype((a.type, dt))
+
+def _get_fields_and_offsets(dt, offset=0):
+ """
+ Returns a flat list of (dtype, count, offset) tuples of all the
+ scalar fields in the dtype "dt", including nested fields, in left
+ to right order.
+ """
+ fields = []
+ for name in dt.names:
+ field = dt.fields[name]
+ if field[0].names is None:
+ count = 1
+ for size in field[0].shape:
+ count *= size
+ fields.append((field[0], count, field[1] + offset))
+ else:
+ fields.extend(_get_fields_and_offsets(field[0], field[1] + offset))
+ return fields
+
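For a nested dtype the helper flattens everything down to scalar leaves, e.g. (reprs assume little-endian):

>>> dt = np.dtype([('a', 'i4'), ('b', [('x', 'f4'), ('y', 'u2')])])
>>> _get_fields_and_offsets(dt)
[(dtype('int32'), 1, 0), (dtype('float32'), 1, 4), (dtype('uint16'), 1, 8)]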
+
+def _structured_to_unstructured_dispatcher(arr, dtype=None, copy=None,
+ casting=None):
+ return (arr,)
+
+@array_function_dispatch(_structured_to_unstructured_dispatcher)
+def structured_to_unstructured(arr, dtype=None, copy=False, casting='unsafe'):
+ """
+ Converts an n-D structured array into an (n+1)-D unstructured array.
+
+ The new array will have a new last dimension equal in size to the
+ number of field-elements of the input array. If not supplied, the output
+ datatype is determined from the numpy type promotion rules applied to all
+ the field datatypes.
+
+ Nested fields, as well as each element of any subarray fields, all count
+ as a single field-element.
+
+ Parameters
+ ----------
+ arr : ndarray
+ Structured array or dtype to convert. Cannot contain object datatype.
+ dtype : dtype, optional
+ The dtype of the output unstructured array.
+ copy : bool, optional
+ See copy argument to `ndarray.astype`. If true, always return a copy.
+ If false, and `dtype` requirements are satisfied, a view is returned.
+ casting : {'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional
+ See casting argument of `ndarray.astype`. Controls what kind of data
+ casting may occur.
+
+ Returns
+ -------
+ unstructured : ndarray
+ Unstructured array with one more dimension.
+
+ Examples
+ --------
+
+ >>> a = np.zeros(4, dtype=[('a', 'i4'), ('b', 'f4,u2'), ('c', 'f4', 2)])
+ >>> a
+ array([(0, (0., 0), [0., 0.]), (0, (0., 0), [0., 0.]),
+ (0, (0., 0), [0., 0.]), (0, (0., 0), [0., 0.])],
+ dtype=[('a', '<i4'), ('b', [('f0', '<f4'), ('f1', '<u2')]), ('c', '<f4', (2,))])
+ >>> structured_to_unstructured(a)
+ array([[0., 0., 0., 0., 0.],
+ [0., 0., 0., 0., 0.],
+ [0., 0., 0., 0., 0.],
+ [0., 0., 0., 0., 0.]])
+
+ >>> b = np.array([(1, 2, 5), (4, 5, 7), (7, 8 ,11), (10, 11, 12)],
+ ... dtype=[('x', 'i4'), ('y', 'f4'), ('z', 'f8')])
+ >>> np.mean(structured_to_unstructured(b[['x', 'z']]), axis=-1)
+ array([ 3. , 5.5, 9. , 11. ])
+
+ """
+ if arr.dtype.names is None:
+ raise ValueError('arr must be a structured array')
+
+ fields = _get_fields_and_offsets(arr.dtype)
+ n_fields = len(fields)
+ dts, counts, offsets = zip(*fields)
+ names = ['f{}'.format(n) for n in range(n_fields)]
+
+ if dtype is None:
+ out_dtype = np.result_type(*[dt.base for dt in dts])
+ else:
+ out_dtype = dtype
+
+ # Use a series of views and casts to convert to an unstructured array:
+
+ # first view using flattened fields (doesn't work for object arrays)
+ # Note: dts may include a shape for subarrays
+ flattened_fields = np.dtype({'names': names,
+ 'formats': dts,
+ 'offsets': offsets,
+ 'itemsize': arr.dtype.itemsize})
+ with suppress_warnings() as sup: # until 1.16 (gh-12447)
+ sup.filter(FutureWarning, "Numpy has detected")
+ arr = arr.view(flattened_fields)
+
+ # next cast to a packed format with all fields converted to new dtype
+ packed_fields = np.dtype({'names': names,
+ 'formats': [(out_dtype, c) for c in counts]})
+ arr = arr.astype(packed_fields, copy=copy, casting=casting)
+
+ # finally it is safe to view the packed fields as the unstructured type
+ return arr.view((out_dtype, sum(counts)))
+
+def _unstructured_to_structured_dispatcher(arr, dtype=None, names=None,
+ align=None, copy=None, casting=None):
+ return (arr,)
+
+@array_function_dispatch(_unstructured_to_structured_dispatcher)
+def unstructured_to_structured(arr, dtype=None, names=None, align=False,
+ copy=False, casting='unsafe'):
+ """
+ Converts an n-D unstructured array into an (n-1)-D structured array.
+
+ The last dimension of the input array is converted into a structure, with
+ number of field-elements equal to the size of the last dimension of the
+ input array. By default all output fields have the input array's dtype, but
+ an output structured dtype with an equal number of field-elements can be
+ supplied instead.
+
+ Nested fields, as well as each element of any subarray fields, all count
+ towards the number of field-elements.
+
+ Parameters
+ ----------
+ arr : ndarray
+ Unstructured array or dtype to convert.
+ dtype : dtype, optional
+ The structured dtype of the output array
+ names : list of strings, optional
+ If dtype is not supplied, this specifies the field names for the output
+ dtype, in order. The field dtypes will be the same as the input array.
+ align : boolean, optional
+ Whether to create an aligned memory layout.
+ copy : bool, optional
+ See copy argument to `ndarray.astype`. If true, always return a copy.
+ If false, and `dtype` requirements are satisfied, a view is returned.
+ casting : {'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional
+ See casting argument of `ndarray.astype`. Controls what kind of data
+ casting may occur.
+
+ Returns
+ -------
+ structured : ndarray
+ Structured array with fewer dimensions.
+
+ Examples
+ --------
+
+ >>> dt = np.dtype([('a', 'i4'), ('b', 'f4,u2'), ('c', 'f4', 2)])
+ >>> a = np.arange(20).reshape((4,5))
+ >>> a
+ array([[ 0, 1, 2, 3, 4],
+ [ 5, 6, 7, 8, 9],
+ [10, 11, 12, 13, 14],
+ [15, 16, 17, 18, 19]])
+ >>> unstructured_to_structured(a, dt)
+ array([( 0, ( 1., 2), [ 3., 4.]), ( 5, ( 6., 7), [ 8., 9.]),
+ (10, (11., 12), [13., 14.]), (15, (16., 17), [18., 19.])],
+ dtype=[('a', '<i4'), ('b', [('f0', '<f4'), ('f1', '<u2')]), ('c', '<f4', (2,))])
+
+ """
+ if arr.shape == ():
+ raise ValueError('arr must have at least one dimension')
+ n_elem = arr.shape[-1]
+
+ if dtype is None:
+ if names is None:
+ names = ['f{}'.format(n) for n in range(n_elem)]
+ out_dtype = np.dtype([(n, arr.dtype) for n in names], align=align)
+ fields = _get_fields_and_offsets(out_dtype)
+ dts, counts, offsets = zip(*fields)
+ else:
+ if names is not None:
+ raise ValueError("don't supply both dtype and names")
+ # sanity check of the input dtype
+ fields = _get_fields_and_offsets(dtype)
+ dts, counts, offsets = zip(*fields)
+ if n_elem != sum(counts):
+ raise ValueError('The length of the last dimension of arr must '
+ 'be equal to the number of fields in dtype')
+ out_dtype = dtype
+ if align and not out_dtype.isalignedstruct:
+ raise ValueError("align was True but dtype is not aligned")
+
+ names = ['f{}'.format(n) for n in range(len(fields))]
+
+ # Use a series of views and casts to convert to a structured array:
+
+ # first view as a packed structured array of one dtype
+ packed_fields = np.dtype({'names': names,
+ 'formats': [(arr.dtype, c) for c in counts]})
+ arr = np.ascontiguousarray(arr).view(packed_fields)
+
+ # next cast to an unpacked but flattened format with varied dtypes
+ flattened_fields = np.dtype({'names': names,
+ 'formats': dts,
+ 'offsets': offsets,
+ 'itemsize': out_dtype.itemsize})
+ arr = arr.astype(flattened_fields, copy=copy, casting=casting)
+
+ # finally view as the final nested dtype and remove the last axis
+ return arr.view(out_dtype)[..., 0]
+
+def _apply_along_fields_dispatcher(func, arr):
+ return (arr,)
+
+@array_function_dispatch(_apply_along_fields_dispatcher)
+def apply_along_fields(func, arr):
+ """
+ Apply function 'func' as a reduction across fields of a structured array.
+
+ This is similar to `apply_along_axis`, but treats the fields of a
+ structured array as an extra axis. The fields are all first cast to a
+ common type following the type-promotion rules from `numpy.result_type`
+ applied to the field's dtypes.
+
+ Parameters
+ ----------
+ func : function
+ Function to apply on the "field" dimension. This function must
+ support an `axis` argument, like np.mean, np.sum, etc.
+ arr : ndarray
+ Structured array for which to apply func.
+
+ Returns
+ -------
+ out : ndarray
+ Result of the reduction operation
+
+ Examples
+ --------
+
+ >>> b = np.array([(1, 2, 5), (4, 5, 7), (7, 8 ,11), (10, 11, 12)],
+ ... dtype=[('x', 'i4'), ('y', 'f4'), ('z', 'f8')])
+ >>> apply_along_fields(np.mean, b)
+ array([ 2.66666667, 5.33333333, 8.66666667, 11. ])
+ >>> apply_along_fields(np.mean, b[['x', 'z']])
+ array([ 3. , 5.5, 9. , 11. ])
+
+ """
+ if arr.dtype.names is None:
+ raise ValueError('arr must be a structured array')
+
+ uarr = structured_to_unstructured(arr)
+ return func(uarr, axis=-1)
+ # works and avoids axis requirement, but very, very slow:
+ #return np.apply_along_axis(func, -1, uarr)
+
+def _assign_fields_by_name_dispatcher(dst, src, zero_unassigned=None):
+ return dst, src
+
+@array_function_dispatch(_assign_fields_by_name_dispatcher)
+def assign_fields_by_name(dst, src, zero_unassigned=True):
+ """
+ Assigns values from one structured array to another by field name.
+
+ Normally in numpy >= 1.14, assignment of one structured array to another
+ copies fields "by position", meaning that the first field from the src is
+ copied to the first field of the dst, and so on, regardless of field name.
+
+ This function instead copies "by field name", such that fields in the dst
+ are assigned from the identically named field in the src. This applies
+ recursively for nested structures. This is how structure assignment worked
+ in numpy versions 1.6 through 1.13.
+
+ Parameters
+ ----------
+ dst : ndarray
+ src : ndarray
+ The source and destination arrays during assignment.
+ zero_unassigned : bool, optional
+ If True, fields in the dst for which there was no matching
+ field in the src are filled with the value 0 (zero). This
+ was the behavior of numpy <= 1.13. If False, those fields
+ are not modified.
+ """
+
+ if dst.dtype.names is None:
+ dst[...] = src
+ return
+
+ for name in dst.dtype.names:
+ if name not in src.dtype.names:
+ if zero_unassigned:
+ dst[name] = 0
+ else:
+ assign_fields_by_name(dst[name], src[name],
+ zero_unassigned)
+
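A short sketch of by-name assignment where one destination field has no match in the source:

>>> dst = np.zeros(2, dtype=[('a', 'i4'), ('b', 'f8')])
>>> src = np.ones(2, dtype=[('b', 'f8'), ('c', 'i4')])
>>> assign_fields_by_name(dst, src)   # 'a' has no match and is zeroed
>>> dst
array([(0, 1.), (0, 1.)], dtype=[('a', '<i4'), ('b', '<f8')])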
+def _require_fields_dispatcher(array, required_dtype):
+ return (array,)
+
+@array_function_dispatch(_require_fields_dispatcher)
+def require_fields(array, required_dtype):
+ """
+ Casts a structured array to a new dtype using assignment by field-name.
+
+ This function assigns from the old to the new array by name, so the
+ value of a field in the output array is the value of the field with the
+ same name in the source array. This has the effect of creating a new
+ ndarray containing only the fields "required" by the required_dtype.
+
+ If a field name in the required_dtype does not exist in the
+ input array, that field is created and set to 0 in the output array.
+
+ Parameters
+ ----------
+ array : ndarray
+ array to cast
+ required_dtype : dtype
+ datatype for output array
+
+ Returns
+ -------
+ out : ndarray
+ array with the new dtype, with field values copied from the fields in
+ the input array with the same name
+
+ Examples
+ --------
+
+ >>> a = np.ones(4, dtype=[('a', 'i4'), ('b', 'f8'), ('c', 'u1')])
+ >>> require_fields(a, [('b', 'f4'), ('c', 'u1')])
+ array([(1., 1), (1., 1), (1., 1), (1., 1)],
+ dtype=[('b', '<f4'), ('c', 'u1')])
+ >>> require_fields(a, [('b', 'f4'), ('newf', 'u1')])
+ array([(1., 0), (1., 0), (1., 0), (1., 0)],
+ dtype=[('b', '<f4'), ('newf', 'u1')])
+
+ """
+ out = np.empty(array.shape, dtype=required_dtype)
+ assign_fields_by_name(out, array)
+ return out
+
+
+def _stack_arrays_dispatcher(arrays, defaults=None, usemask=None,
+ asrecarray=None, autoconvert=None):
+ return arrays
+
+
+@array_function_dispatch(_stack_arrays_dispatcher)
def stack_arrays(arrays, defaults=None, usemask=True, asrecarray=False,
autoconvert=False):
"""
@@ -725,25 +1265,21 @@ def stack_arrays(arrays, defaults=None, usemask=True, asrecarray=False,
fldnames = [d.names for d in ndtype]
#
dtype_l = ndtype[0]
- newdescr = dtype_l.descr
- names = [_[0] for _ in newdescr]
+ newdescr = get_fieldspec(dtype_l)
+ names = [n for n, d in newdescr]
for dtype_n in ndtype[1:]:
- for descr in dtype_n.descr:
- name = descr[0] or ''
- if name not in names:
- newdescr.append(descr)
- names.append(name)
+ for fname, fdtype in get_fieldspec(dtype_n):
+ if fname not in names:
+ newdescr.append((fname, fdtype))
+ names.append(fname)
else:
- nameidx = names.index(name)
- current_descr = newdescr[nameidx]
+ nameidx = names.index(fname)
+ _, cdtype = newdescr[nameidx]
if autoconvert:
- if np.dtype(descr[1]) > np.dtype(current_descr[-1]):
- current_descr = list(current_descr)
- current_descr[-1] = descr[1]
- newdescr[nameidx] = tuple(current_descr)
- elif descr[1] != current_descr[-1]:
+ newdescr[nameidx] = (fname, max(fdtype, cdtype))
+ elif fdtype != cdtype:
raise TypeError("Incompatible type '%s' <> '%s'" %
- (dict(newdescr)[name], descr[1]))
+ (cdtype, fdtype))
# Only one field: use concatenate
if len(newdescr) == 1:
output = ma.concatenate(seqarrays)
@@ -766,6 +1302,12 @@ def stack_arrays(arrays, defaults=None, usemask=True, asrecarray=False,
usemask=usemask, asrecarray=asrecarray)
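With autoconvert=True, the field-merging loop above promotes a colliding field to the larger dtype instead of raising. A sketch of the resulting dtype:

>>> x = np.array([(1, 10.)], dtype=[('a', 'i4'), ('b', 'f8')])
>>> y = np.array([(2.5,)], dtype=[('a', 'f8')])
>>> stack_arrays((x, y), autoconvert=True).dtype
dtype([('a', '<f8'), ('b', '<f8')])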
+def _find_duplicates_dispatcher(
+ a, key=None, ignoremask=None, return_index=None):
+ return (a,)
+
+
+@array_function_dispatch(_find_duplicates_dispatcher)
def find_duplicates(a, key=None, ignoremask=True, return_index=False):
"""
Find the duplicates in a structured array along a given key
@@ -820,8 +1362,15 @@ def find_duplicates(a, key=None, ignoremask=True, return_index=False):
return duplicates
+def _join_by_dispatcher(
+ key, r1, r2, jointype=None, r1postfix=None, r2postfix=None,
+ defaults=None, usemask=None, asrecarray=None):
+ return (r1, r2)
+
+
+@array_function_dispatch(_join_by_dispatcher)
def join_by(key, r1, r2, jointype='inner', r1postfix='1', r2postfix='2',
- defaults=None, usemask=True, asrecarray=False):
+ defaults=None, usemask=True, asrecarray=False):
"""
Join arrays `r1` and `r2` on key `key`.
@@ -881,11 +1430,14 @@ def join_by(key, r1, r2, jointype='inner', r1postfix='1', r2postfix='2',
key = (key,)
# Check the keys
+ if len(set(key)) != len(key):
+ dup = next(x for n,x in enumerate(key) if x in key[n+1:])
+ raise ValueError("duplicate join key %r" % dup)
for name in key:
if name not in r1.dtype.names:
- raise ValueError('r1 does not have key field %s' % name)
+ raise ValueError('r1 does not have key field %r' % name)
if name not in r2.dtype.names:
- raise ValueError('r2 does not have key field %s' % name)
+ raise ValueError('r2 does not have key field %r' % name)
# Make sure we work with ravelled arrays
r1 = r1.ravel()
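The new duplicate-key check fails fast, before any field processing. A sketch of the error it raises (r1 and r2 here stand for any structured arrays with an 'a' field):

>>> join_by(('a', 'a'), r1, r2)
Traceback (most recent call last):
    ...
ValueError: duplicate join key 'a'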
@@ -896,15 +1448,17 @@ def join_by(key, r1, r2, jointype='inner', r1postfix='1', r2postfix='2',
(r1names, r2names) = (r1.dtype.names, r2.dtype.names)
# Check the names for collision
- if (set.intersection(set(r1names), set(r2names)).difference(key) and
- not (r1postfix or r2postfix)):
+ collisions = (set(r1names) & set(r2names)) - set(key)
+ if collisions and not (r1postfix or r2postfix):
msg = "r1 and r2 contain common names, r1postfix and r2postfix "
- msg += "can't be empty"
+ msg += "can't both be empty"
raise ValueError(msg)
# Make temporary arrays of just the keys
- r1k = drop_fields(r1, [n for n in r1names if n not in key])
- r2k = drop_fields(r2, [n for n in r2names if n not in key])
+ # (use order of keys in `r1` for back-compatibility)
+ key1 = [ n for n in r1names if n in key ]
+ r1k = _keep_fields(r1, key1)
+ r2k = _keep_fields(r2, key1)
# Concatenate the two arrays for comparison
aux = ma.concatenate((r1k, r2k))
@@ -934,32 +1488,38 @@ def join_by(key, r1, r2, jointype='inner', r1postfix='1', r2postfix='2',
#
# Build the new description of the output array .......
# Start with the key fields
- ndtype = [list(_) for _ in r1k.dtype.descr]
- # Add the other fields
- ndtype.extend(list(_) for _ in r1.dtype.descr if _[0] not in key)
- # Find the new list of names (it may be different from r1names)
- names = list(_[0] for _ in ndtype)
- for desc in r2.dtype.descr:
- desc = list(desc)
- name = desc[0]
+ ndtype = get_fieldspec(r1k.dtype)
+
+ # Add the fields from r1
+ for fname, fdtype in get_fieldspec(r1.dtype):
+ if fname not in key:
+ ndtype.append((fname, fdtype))
+
+ # Add the fields from r2
+ for fname, fdtype in get_fieldspec(r2.dtype):
# Have we seen the current name already ?
- if name in names:
- nameidx = ndtype.index(desc)
- current = ndtype[nameidx]
- # The current field is part of the key: take the largest dtype
- if name in key:
- current[-1] = max(desc[1], current[-1])
- # The current field is not part of the key: add the suffixes
- else:
- current[0] += r1postfix
- desc[0] += r2postfix
- ndtype.insert(nameidx + 1, desc)
- #... we haven't: just add the description to the current list
+ # we need to rebuild this list every time
+ names = list(name for name, dtype in ndtype)
+ try:
+ nameidx = names.index(fname)
+ except ValueError:
+ #... we haven't: just add the description to the current list
+ ndtype.append((fname, fdtype))
else:
- names.extend(desc[0])
- ndtype.append(desc)
- # Revert the elements to tuples
- ndtype = [tuple(_) for _ in ndtype]
+ # collision
+ _, cdtype = ndtype[nameidx]
+ if fname in key:
+ # The current field is part of the key: take the largest dtype
+ ndtype[nameidx] = (fname, max(fdtype, cdtype))
+ else:
+ # The current field is not part of the key: add the suffixes,
+ # and place the new field adjacent to the old one
+ ndtype[nameidx:nameidx + 1] = [
+ (fname + r1postfix, cdtype),
+ (fname + r2postfix, fdtype)
+ ]
+ # Rebuild a dtype from the new fields
+ ndtype = np.dtype(ndtype)
# Find the largest nb of common fields :
# r1cmn and r2cmn should be equal, but...
cmn = max(r1cmn, r2cmn)
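A sketch of how the rebuilt loop suffixes colliding non-key fields and keeps them adjacent in the output dtype:

>>> r1 = np.array([(1, 10.)], dtype=[('k', 'i4'), ('v', 'f8')])
>>> r2 = np.array([(1, 20.)], dtype=[('k', 'i4'), ('v', 'f8')])
>>> join_by('k', r1, r2, usemask=False).dtype
dtype([('k', '<i4'), ('v1', '<f8'), ('v2', '<f8')])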
@@ -988,6 +1548,13 @@ def join_by(key, r1, r2, jointype='inner', r1postfix='1', r2postfix='2',
return _fix_output(_fix_defaults(output, defaults), **kwargs)
+def _rec_join_dispatcher(
+ key, r1, r2, jointype=None, r1postfix=None, r2postfix=None,
+ defaults=None):
+ return (r1, r2)
+
+
+@array_function_dispatch(_rec_join_dispatcher)
def rec_join(key, r1, r2, jointype='inner', r1postfix='1', r2postfix='2',
defaults=None):
"""