From 49e10732433c26d7c781e00a415fa33dada6ac90 Mon Sep 17 00:00:00 2001
From: Eric Wieser <wieser.eric@gmail.com>
Date: Sat, 1 Jul 2017 12:51:49 +0100
Subject: MAINT: use set operators for brevity

---
 numpy/lib/recfunctions.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'numpy/lib/recfunctions.py')

diff --git a/numpy/lib/recfunctions.py b/numpy/lib/recfunctions.py
index b9542e848..08faeee0e 100644
--- a/numpy/lib/recfunctions.py
+++ b/numpy/lib/recfunctions.py
@@ -920,10 +920,10 @@ def join_by(key, r1, r2, jointype='inner', r1postfix='1', r2postfix='2',
     (r1names, r2names) = (r1.dtype.names, r2.dtype.names)
 
     # Check the names for collision
-    if (set.intersection(set(r1names), set(r2names)).difference(key) and
-            not (r1postfix or r2postfix)):
+    collisions = (set(r1names) & set(r2names)) - set(key)
+    if collisions and not (r1postfix or r2postfix):
         msg = "r1 and r2 contain common names, r1postfix and r2postfix "
-        msg += "can't be empty"
+        msg += "can't both be empty"
         raise ValueError(msg)
 
     # Make temporary arrays of just the keys
-- 
cgit v1.2.1


From cd761d81b571525ac6c2cca36da6bd270bb8357d Mon Sep 17 00:00:00 2001
From: Eric Wieser <wieser.eric@gmail.com>
Date: Sat, 1 Jul 2017 13:05:15 +0100
Subject: BUG: recfunctions.join_by fails for colliding values with different
 dtypes

Fixes #9338
---
 numpy/lib/recfunctions.py | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

(limited to 'numpy/lib/recfunctions.py')

diff --git a/numpy/lib/recfunctions.py b/numpy/lib/recfunctions.py
index 08faeee0e..e42421786 100644
--- a/numpy/lib/recfunctions.py
+++ b/numpy/lib/recfunctions.py
@@ -963,27 +963,28 @@ def join_by(key, r1, r2, jointype='inner', r1postfix='1', r2postfix='2',
     ndtype = [list(_) for _ in r1k.dtype.descr]
     # Add the other fields
     ndtype.extend(list(_) for _ in r1.dtype.descr if _[0] not in key)
-    # Find the new list of names (it may be different from r1names)
-    names = list(_[0] for _ in ndtype)
+
     for desc in r2.dtype.descr:
         desc = list(desc)
-        name = desc[0]
         # Have we seen the current name already ?
-        if name in names:
-            nameidx = ndtype.index(desc)
+        name = desc[0]
+        names = list(_[0] for _ in ndtype)
+        try:
+            nameidx = names.index(name)
+        except ValueError:
+            #... we haven't: just add the description to the current list
+            ndtype.append(desc)
+        else:
             current = ndtype[nameidx]
-            # The current field is part of the key: take the largest dtype
             if name in key:
+                # The current field is part of the key: take the largest dtype
                 current[-1] = max(desc[1], current[-1])
-            # The current field is not part of the key: add the suffixes
             else:
+                # The current field is not part of the key: add the suffixes,
+                # and place the new field adjacent to the old one
                 current[0] += r1postfix
                 desc[0] += r2postfix
                 ndtype.insert(nameidx + 1, desc)
-        #... we haven't: just add the description to the current list
-        else:
-            names.extend(desc[0])
-            ndtype.append(desc)
     # Revert the elements to tuples
     ndtype = [tuple(_) for _ in ndtype]
     # Find the largest nb of common fields :
-- 
cgit v1.2.1


From bdbac02b0bddb265840cc00cc5dec0590c09b093 Mon Sep 17 00:00:00 2001
From: Eric Wieser <wieser.eric@gmail.com>
Date: Sat, 1 Jul 2017 14:25:21 +0100
Subject: BUG: recfunctions.join_by fails when key is a subdtype

It seems that working with .descr is a generally terrible idea.
Instead we introduce `get_fieldspec`, which returns a list of 2-tuples,
encapsulating subdtypes.

This also means that np.core.test_rational.rational survives a roundtrip - its
.descr is 'V8', which doesn't survive
---
 numpy/lib/recfunctions.py | 58 ++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 47 insertions(+), 11 deletions(-)

(limited to 'numpy/lib/recfunctions.py')

diff --git a/numpy/lib/recfunctions.py b/numpy/lib/recfunctions.py
index e42421786..a0a070547 100644
--- a/numpy/lib/recfunctions.py
+++ b/numpy/lib/recfunctions.py
@@ -70,6 +70,42 @@ def recursive_fill_fields(input, output):
     return output
 
 
+def get_fieldspec(dtype):
+    """
+    Produce a list of name/dtype pairs corresponding to the dtype fields
+
+    Similar to dtype.descr, but the second item of each tuple is a dtype, not a
+    string. As a result, this handles subarray dtypes
+
+    Can be passed to the dtype constructor to reconstruct the dtype, noting that
+    this (deliberately) discards field offsets.
+
+    Examples
+    --------
+    >>> dt = np.dtype([(('a', 'A'), int), ('b', float, 3)])
+    >>> dt.descr
+    [(('a', 'A'), '<i4'), ('b', '<f8', (3,))]
+    >>> get_fieldspec(dt)
+    [(('a', 'A'), dtype('int32')), ('b', dtype(('<f8', (3,))))]
+
+    """
+    if dtype.names is None:
+        # .descr returns a nameless field, so we should too
+        return [('', dtype)]
+    else:
+        # extract the titles of the fields
+        name_titles = {}
+        for d in dtype.descr:
+            name_title = d[0]
+            if isinstance(name_title, tuple):
+                name = name_title[1]
+            else:
+                name = name_title
+            name_titles[name] = name_title
+
+        return [(name_titles[name], dtype[name]) for name in dtype.names]
+
+
 def get_names(adtype):
     """
     Returns the field names of the input datatype as a tuple.
@@ -960,33 +996,33 @@ def join_by(key, r1, r2, jointype='inner', r1postfix='1', r2postfix='2',
     #
     # Build the new description of the output array .......
     # Start with the key fields
-    ndtype = [list(_) for _ in r1k.dtype.descr]
+    ndtype = [list(f) for f in get_fieldspec(r1k.dtype)]
     # Add the other fields
-    ndtype.extend(list(_) for _ in r1.dtype.descr if _[0] not in key)
+    ndtype.extend(list(f) for f in get_fieldspec(r1.dtype) if f[0] not in key)
 
-    for desc in r2.dtype.descr:
-        desc = list(desc)
+    for field in get_fieldspec(r2.dtype):
+        field = list(field)
         # Have we seen the current name already ?
-        name = desc[0]
+        name = field[0]
         names = list(_[0] for _ in ndtype)
         try:
             nameidx = names.index(name)
         except ValueError:
             #... we haven't: just add the description to the current list
-            ndtype.append(desc)
+            ndtype.append(field)
         else:
             current = ndtype[nameidx]
             if name in key:
                 # The current field is part of the key: take the largest dtype
-                current[-1] = max(desc[1], current[-1])
+                current[1] = max(field[1], current[1])
             else:
                 # The current field is not part of the key: add the suffixes,
                 # and place the new field adjacent to the old one
                 current[0] += r1postfix
-                desc[0] += r2postfix
-                ndtype.insert(nameidx + 1, desc)
-    # Revert the elements to tuples
-    ndtype = [tuple(_) for _ in ndtype]
+                field[0] += r2postfix
+                ndtype.insert(nameidx + 1, field)
+    # Rebuild a dtype from the new fields
+    ndtype = np.dtype([tuple(_) for _ in ndtype])
     # Find the largest nb of common fields :
     # r1cmn and r2cmn should be equal, but...
     cmn = max(r1cmn, r2cmn)
-- 
cgit v1.2.1


From 57225485fe72ca059e8c7d9fa17a07c3a31ba009 Mon Sep 17 00:00:00 2001
From: Eric Wieser <wieser.eric@gmail.com>
Date: Sat, 1 Jul 2017 15:20:01 +0100
Subject: BUG: stack_arrays fails for subdtypes

Again, fixed by not using descr
---
 numpy/lib/recfunctions.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'numpy/lib/recfunctions.py')

diff --git a/numpy/lib/recfunctions.py b/numpy/lib/recfunctions.py
index a0a070547..f66cfd32e 100644
--- a/numpy/lib/recfunctions.py
+++ b/numpy/lib/recfunctions.py
@@ -782,10 +782,10 @@ def stack_arrays(arrays, defaults=None, usemask=True, asrecarray=False,
     fldnames = [d.names for d in ndtype]
     #
     dtype_l = ndtype[0]
-    newdescr = dtype_l.descr
+    newdescr = get_fieldspec(dtype_l)
     names = [_[0] for _ in newdescr]
     for dtype_n in ndtype[1:]:
-        for descr in dtype_n.descr:
+        for descr in get_fieldspec(dtype_n):
             name = descr[0] or ''
             if name not in names:
                 newdescr.append(descr)
@@ -794,11 +794,11 @@ def stack_arrays(arrays, defaults=None, usemask=True, asrecarray=False,
                 nameidx = names.index(name)
                 current_descr = newdescr[nameidx]
                 if autoconvert:
-                    if np.dtype(descr[1]) > np.dtype(current_descr[-1]):
+                    if descr[1] > current_descr[1]:
                         current_descr = list(current_descr)
-                        current_descr[-1] = descr[1]
+                        current_descr[1] = descr[1]
                         newdescr[nameidx] = tuple(current_descr)
-                elif descr[1] != current_descr[-1]:
+                elif descr[1] != current_descr[1]:
                     raise TypeError("Incompatible type '%s' <> '%s'" %
                                     (dict(newdescr)[name], descr[1]))
     # Only one field: use concatenate
-- 
cgit v1.2.1


From b3d9ec77d4448f424449a9e9643df2d3cfd7701b Mon Sep 17 00:00:00 2001
From: Eric Wieser <wieser.eric@gmail.com>
Date: Sat, 1 Jul 2017 15:36:22 +0100
Subject: MAINT: Stop using .descr in recfunctions

This change shouldn't affect behaviour - all old uses were still correct.
---
 numpy/lib/recfunctions.py | 40 ++++++++++++++++++++++------------------
 1 file changed, 22 insertions(+), 18 deletions(-)

(limited to 'numpy/lib/recfunctions.py')

diff --git a/numpy/lib/recfunctions.py b/numpy/lib/recfunctions.py
index f66cfd32e..71672eae3 100644
--- a/numpy/lib/recfunctions.py
+++ b/numpy/lib/recfunctions.py
@@ -194,6 +194,22 @@ def flatten_descr(ndtype):
         return tuple(descr)
 
 
+def zip_dtype(seqarrays, flatten=False):
+    newdtype = []
+    if flatten:
+        for a in seqarrays:
+            newdtype.extend(flatten_descr(a.dtype))
+    else:
+        for a in seqarrays:
+            current = a.dtype
+            if current.names and len(current.names) <= 1:
+                # special case - dtypes of 0 or 1 field are flattened
+                newdtype.extend(get_fieldspec(current))
+            else:
+                newdtype.append(('', current))
+    return np.dtype(newdtype)
+
+
 def zip_descr(seqarrays, flatten=False):
     """
     Combine the dtype description of a series of arrays.
@@ -205,19 +221,7 @@ def zip_descr(seqarrays, flatten=False):
     flatten : {boolean}, optional
         Whether to collapse nested descriptions.
     """
-    newdtype = []
-    if flatten:
-        for a in seqarrays:
-            newdtype.extend(flatten_descr(a.dtype))
-    else:
-        for a in seqarrays:
-            current = a.dtype
-            names = current.names or ()
-            if len(names) > 1:
-                newdtype.append(('', current.descr))
-            else:
-                newdtype.extend(current.descr)
-    return np.dtype(newdtype).descr
+    return zip_dtype(seqarrays, flatten=flatten).descr
 
 
 def get_fieldstructure(adtype, lastname=None, parents=None,):
@@ -412,8 +416,7 @@ def merge_arrays(seqarrays, fill_value=-1, flatten=False,
     # Do we have a single ndarray as input ?
     if isinstance(seqarrays, (ndarray, np.void)):
         seqdtype = seqarrays.dtype
-        if (not flatten) or \
-           (zip_descr((seqarrays,), flatten=True) == seqdtype.descr):
+        if not flatten or zip_dtype((seqarrays,), flatten=True) == seqdtype:
             # Minimal processing needed: just make sure everythng's a-ok
             seqarrays = seqarrays.ravel()
             # Make sure we have named fields
@@ -439,7 +442,7 @@ def merge_arrays(seqarrays, fill_value=-1, flatten=False,
     sizes = tuple(a.size for a in seqarrays)
     maxlength = max(sizes)
     # Get the dtype of the output (flattening if needed)
-    newdtype = zip_descr(seqarrays, flatten=flatten)
+    newdtype = zip_dtype(seqarrays, flatten=flatten)
     # Initialize the sequences for data and mask
     seqdata = []
     seqmask = []
@@ -691,8 +694,9 @@ def append_fields(base, names, data, dtypes=None,
     else:
         data = data.pop()
     #
-    output = ma.masked_all(max(len(base), len(data)),
-                           dtype=base.dtype.descr + data.dtype.descr)
+    output = ma.masked_all(
+        max(len(base), len(data)),
+        dtype=get_fieldspec(base.dtype) + get_fieldspec(data.dtype))
     output = recursive_fill_fields(base, output)
     output = recursive_fill_fields(data, output)
     #
-- 
cgit v1.2.1


From 87c1b1f56af5fe2796cb78dd9bc76e92cb2e1f93 Mon Sep 17 00:00:00 2001
From: Eric Wieser <wieser.eric@gmail.com>
Date: Sat, 1 Jul 2017 15:40:10 +0100
Subject: BUG: flatten_descr returns string not dtype for scalar dtype

---
 numpy/lib/recfunctions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'numpy/lib/recfunctions.py')

diff --git a/numpy/lib/recfunctions.py b/numpy/lib/recfunctions.py
index 71672eae3..0a1a259d8 100644
--- a/numpy/lib/recfunctions.py
+++ b/numpy/lib/recfunctions.py
@@ -182,7 +182,7 @@ def flatten_descr(ndtype):
     """
     names = ndtype.names
     if names is None:
-        return ndtype.descr
+        return (('', ndtype),)
     else:
         descr = []
         for field in names:
-- 
cgit v1.2.1


From 1c76fed4aa3cbead721b90bef6ccbefbcc61dbd2 Mon Sep 17 00:00:00 2001
From: Eric Wieser <wieser.eric@gmail.com>
Date: Sat, 1 Jul 2017 15:42:46 +0100
Subject: MAINT: Shortcut for flat dtypes wasn't used for scalar dtypes

---
 numpy/lib/recfunctions.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'numpy/lib/recfunctions.py')

diff --git a/numpy/lib/recfunctions.py b/numpy/lib/recfunctions.py
index 0a1a259d8..2b89ee0a4 100644
--- a/numpy/lib/recfunctions.py
+++ b/numpy/lib/recfunctions.py
@@ -416,12 +416,12 @@ def merge_arrays(seqarrays, fill_value=-1, flatten=False,
     # Do we have a single ndarray as input ?
     if isinstance(seqarrays, (ndarray, np.void)):
         seqdtype = seqarrays.dtype
+        # Make sure we have named fields
+        if not seqdtype.names:
+            seqdtype = np.dtype([('', seqdtype)])
         if not flatten or zip_dtype((seqarrays,), flatten=True) == seqdtype:
             # Minimal processing needed: just make sure everythng's a-ok
             seqarrays = seqarrays.ravel()
-            # Make sure we have named fields
-            if not seqdtype.names:
-                seqdtype = [('', seqdtype)]
             # Find what type of array we must return
             if usemask:
                 if asrecarray:
-- 
cgit v1.2.1


From 908cd986a5e1dcefd68e37dce5ac14641e364e56 Mon Sep 17 00:00:00 2001
From: Eric Wieser <wieser.eric@gmail.com>
Date: Sat, 1 Jul 2017 19:58:06 +0100
Subject: MAINT: remove tuple<->list conversion dance

---
 numpy/lib/recfunctions.py | 61 ++++++++++++++++++++++++-----------------------
 1 file changed, 31 insertions(+), 30 deletions(-)

(limited to 'numpy/lib/recfunctions.py')

diff --git a/numpy/lib/recfunctions.py b/numpy/lib/recfunctions.py
index 2b89ee0a4..6e2d1726f 100644
--- a/numpy/lib/recfunctions.py
+++ b/numpy/lib/recfunctions.py
@@ -787,24 +787,20 @@ def stack_arrays(arrays, defaults=None, usemask=True, asrecarray=False,
     #
     dtype_l = ndtype[0]
     newdescr = get_fieldspec(dtype_l)
-    names = [_[0] for _ in newdescr]
+    names = [n for n, d in newdescr]
     for dtype_n in ndtype[1:]:
-        for descr in get_fieldspec(dtype_n):
-            name = descr[0] or ''
-            if name not in names:
-                newdescr.append(descr)
-                names.append(name)
+        for fname, fdtype in get_fieldspec(dtype_n):
+            if fname not in names:
+                newdescr.append((fname, fdtype))
+                names.append(fname)
             else:
-                nameidx = names.index(name)
-                current_descr = newdescr[nameidx]
+                nameidx = names.index(fname)
+                _, cdtype = newdescr[nameidx]
                 if autoconvert:
-                    if descr[1] > current_descr[1]:
-                        current_descr = list(current_descr)
-                        current_descr[1] = descr[1]
-                        newdescr[nameidx] = tuple(current_descr)
-                elif descr[1] != current_descr[1]:
+                    newdescr[nameidx] = (fname, max(fdtype, cdtype))
+                elif fdtype != cdtype:
                     raise TypeError("Incompatible type '%s' <> '%s'" %
-                                    (dict(newdescr)[name], descr[1]))
+                                    (cdtype, fdtype))
     # Only one field: use concatenate
     if len(newdescr) == 1:
         output = ma.concatenate(seqarrays)
@@ -1000,33 +996,38 @@ def join_by(key, r1, r2, jointype='inner', r1postfix='1', r2postfix='2',
     #
     # Build the new description of the output array .......
     # Start with the key fields
-    ndtype = [list(f) for f in get_fieldspec(r1k.dtype)]
-    # Add the other fields
-    ndtype.extend(list(f) for f in get_fieldspec(r1.dtype) if f[0] not in key)
+    ndtype = get_fieldspec(r1k.dtype)
 
-    for field in get_fieldspec(r2.dtype):
-        field = list(field)
+    # Add the fields from r1
+    for fname, fdtype in get_fieldspec(r1.dtype):
+        if fname not in key:
+            ndtype.append((fname, fdtype))
+
+    # Add the fields from r2
+    for fname, fdtype in get_fieldspec(r2.dtype):
         # Have we seen the current name already ?
-        name = field[0]
-        names = list(_[0] for _ in ndtype)
+        # we need to rebuild this list every time
+        names = list(name for name, dtype in ndtype)
         try:
-            nameidx = names.index(name)
+            nameidx = names.index(fname)
         except ValueError:
             #... we haven't: just add the description to the current list
-            ndtype.append(field)
+            ndtype.append((fname, fdtype))
         else:
-            current = ndtype[nameidx]
-            if name in key:
+            # collision
+            _, cdtype = ndtype[nameidx]
+            if fname in key:
                 # The current field is part of the key: take the largest dtype
-                current[1] = max(field[1], current[1])
+                ndtype[nameidx] = (fname, max(fdtype, cdtype))
             else:
                 # The current field is not part of the key: add the suffixes,
                 # and place the new field adjacent to the old one
-                current[0] += r1postfix
-                field[0] += r2postfix
-                ndtype.insert(nameidx + 1, field)
+                ndtype[nameidx:nameidx + 1] = [
+                    (fname + r1postfix, cdtype),
+                    (fname + r2postfix, fdtype)
+                ]
     # Rebuild a dtype from the new fields
-    ndtype = np.dtype([tuple(_) for _ in ndtype])
+    ndtype = np.dtype(ndtype)
     # Find the largest nb of common fields :
     # r1cmn and r2cmn should be equal, but...
     cmn = max(r1cmn, r2cmn)
-- 
cgit v1.2.1


From ae14f151d2534dfa1b632ed156fe8e7fc9753de2 Mon Sep 17 00:00:00 2001
From: Eric Wieser <wieser.eric@gmail.com>
Date: Sat, 1 Jul 2017 20:53:56 +0100
Subject: MAINT: Avoid one more use of descr

---
 numpy/lib/recfunctions.py | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

(limited to 'numpy/lib/recfunctions.py')

diff --git a/numpy/lib/recfunctions.py b/numpy/lib/recfunctions.py
index 6e2d1726f..e9ba38f46 100644
--- a/numpy/lib/recfunctions.py
+++ b/numpy/lib/recfunctions.py
@@ -93,17 +93,12 @@ def get_fieldspec(dtype):
         # .descr returns a nameless field, so we should too
         return [('', dtype)]
     else:
-        # extract the titles of the fields
-        name_titles = {}
-        for d in dtype.descr:
-            name_title = d[0]
-            if isinstance(name_title, tuple):
-                name = name_title[1]
-            else:
-                name = name_title
-            name_titles[name] = name_title
-
-        return [(name_titles[name], dtype[name]) for name in dtype.names]
+        fields = ((name, dtype.fields[name]) for name in dtype.names)
+        # keep any titles, if present
+        return [
+            (name if len(f) == 2 else (f[2], name), f[0]) 
+            for name, f in fields
+        ]
 
 
 def get_names(adtype):
-- 
cgit v1.2.1