105 files changed, 5335 insertions, 3254 deletions
diff --git a/numpy/core/_add_newdocs.py b/numpy/core/_add_newdocs.py
index 513415e09..0727a72fe 100644
--- a/numpy/core/_add_newdocs.py
+++ b/numpy/core/_add_newdocs.py
@@ -3178,7 +3178,7 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('itemset',
 
 add_newdoc('numpy.core.multiarray', 'ndarray', ('max',
     """
-    a.max(axis=None, out=None, keepdims=False)
+    a.max(axis=None, out=None, keepdims=False, initial=<no value>, where=True)
 
     Return the maximum along a given axis.
 
@@ -3208,7 +3208,7 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('mean',
 
 add_newdoc('numpy.core.multiarray', 'ndarray', ('min',
     """
-    a.min(axis=None, out=None, keepdims=False)
+    a.min(axis=None, out=None, keepdims=False, initial=<no value>, where=True)
 
     Return the minimum along a given axis.
 
@@ -3361,7 +3361,7 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('nonzero',
 
 add_newdoc('numpy.core.multiarray', 'ndarray', ('prod',
     """
-    a.prod(axis=None, dtype=None, out=None, keepdims=False)
+    a.prod(axis=None, dtype=None, out=None, keepdims=False, initial=1, where=True)
 
     Return the product of the array elements over the given axis
 
@@ -3930,7 +3930,7 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('std',
 
 add_newdoc('numpy.core.multiarray', 'ndarray', ('sum',
     """
-    a.sum(axis=None, dtype=None, out=None, keepdims=False)
+    a.sum(axis=None, dtype=None, out=None, keepdims=False, initial=0, where=True)
 
     Return the sum of the array elements over the given axis.
 
@@ -4017,10 +4017,14 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('tolist',
     """
     a.tolist()
 
-    Return the array as a (possibly nested) list.
+    Return the array as an ``a.ndim``-levels deep nested list of Python scalars.
 
     Return a copy of the array data as a (nested) Python list.
-    Data items are converted to the nearest compatible Python type.
+    Data items are converted to the nearest compatible builtin Python type, via
+    the `~numpy.ndarray.item` function.
+
+    If ``a.ndim`` is 0, then since the depth of the nested list is 0, it will
+    not be a list at all, but a simple Python scalar.
 
     Parameters
     ----------
@@ -4028,24 +4032,41 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('tolist',
 
     Returns
     -------
-    y : list
+    y : object, or list of object, or list of list of object, or ...
         The possibly nested list of array elements.
 
     Notes
     -----
-    The array may be recreated, ``a = np.array(a.tolist())``.
+    The array may be recreated via ``a = np.array(a.tolist())``, although this
+    may sometimes lose precision.
 
     Examples
     --------
+    For a 1D array, ``a.tolist()`` is almost the same as ``list(a)``:
+
     >>> a = np.array([1, 2])
+    >>> list(a)
+    [1, 2]
     >>> a.tolist()
     [1, 2]
+
+    However, for a 2D array, ``tolist`` applies recursively:
+
     >>> a = np.array([[1, 2], [3, 4]])
     >>> list(a)
     [array([1, 2]), array([3, 4])]
     >>> a.tolist()
     [[1, 2], [3, 4]]
 
+    The base case for this recursion is a 0D array:
+
+    >>> a = np.array(1)
+    >>> list(a)
+    Traceback (most recent call last):
+      ...
+    TypeError: iteration over a 0-d array
+    >>> a.tolist()
+    1
     """))
 
 
@@ -4876,7 +4897,7 @@ add_newdoc('numpy.core', 'ufunc', ('signature',
 
 add_newdoc('numpy.core', 'ufunc', ('reduce',
     """
-    reduce(a, axis=0, dtype=None, out=None, keepdims=False, initial)
+    reduce(a, axis=0, dtype=None, out=None, keepdims=False, initial=<no value>, where=True)
 
     Reduces `a`'s dimension by one, by applying ufunc along one axis.
 
@@ -4941,6 +4962,14 @@ add_newdoc('numpy.core', 'ufunc', ('reduce',
 
         .. versionadded:: 1.15.0
 
+    where : array_like of bool, optional
+        A boolean array which is broadcasted to match the dimensions
+        of `a`, and selects elements to include in the reduction. Note
+        that for ufuncs like ``minimum`` that do not have an identity
+        defined, one has to pass in also ``initial``.
+
+        .. versionadded:: 1.17.0
+
     Returns
     -------
     r : ndarray
@@ -4972,19 +5001,24 @@ add_newdoc('numpy.core', 'ufunc', ('reduce',
     array([[ 1,  5],
            [ 9, 13]])
 
-    You can use the ``initial`` keyword argument to initialize the reduction with a
-    different value.
+    You can use the ``initial`` keyword argument to initialize the reduction
+    with a different value, and ``where`` to select specific elements to include:
 
     >>> np.add.reduce([10], initial=5)
     15
     >>> np.add.reduce(np.ones((2, 2, 2)), axis=(0, 2), initial=10)
     array([14., 14.])
+    >>> a = np.array([10., np.nan, 10])
+    >>> np.add.reduce(a, where=~np.isnan(a))
+    20.0
 
     Allows reductions of empty arrays where they would normally fail, i.e.
     for ufuncs without an identity.
 
     >>> np.minimum.reduce([], initial=np.inf)
     inf
+    >>> np.minimum.reduce([[1., 2.], [3., 4.]], initial=10., where=[True, False])
+    array([ 1., 10.])
     >>> np.minimum.reduce([])
     Traceback (most recent call last):
         ...
@@ -6721,25 +6755,25 @@ add_newdoc('numpy.core.numerictypes', 'generic', ('view',
 add_newdoc('numpy.core.numerictypes', 'number',
     """
     Abstract base class of all numeric scalar types.
-    
+
     """)
 
 add_newdoc('numpy.core.numerictypes', 'integer',
     """
     Abstract base class of all integer scalar types.
-    
+
     """)
 
 add_newdoc('numpy.core.numerictypes', 'signedinteger',
     """
     Abstract base class of all signed integer scalar types.
-    
+
     """)
 
 add_newdoc('numpy.core.numerictypes', 'unsignedinteger',
     """
     Abstract base class of all unsigned integer scalar types.
-    
+
     """)
 
 add_newdoc('numpy.core.numerictypes', 'inexact',
@@ -6747,20 +6781,20 @@ add_newdoc('numpy.core.numerictypes', 'inexact',
     Abstract base class of all numeric scalar types with a (potentially)
     inexact representation of the values in its range, such as
     floating-point numbers.
-    
+
     """)
 
 add_newdoc('numpy.core.numerictypes', 'floating',
     """
     Abstract base class of all floating-point scalar types.
-    
+
     """)
 
 add_newdoc('numpy.core.numerictypes', 'complexfloating',
     """
     Abstract base class of all complex number scalar types that are made up of
     floating-point numbers.
-    
+
     """)
 
 add_newdoc('numpy.core.numerictypes', 'flexible',
@@ -6768,13 +6802,13 @@ add_newdoc('numpy.core.numerictypes', 'flexible',
     Abstract base class of all scalar types without predefined length.
     The actual size of these types depends on the specific `np.dtype`
     instantiation.
-    
+
     """)
 
 add_newdoc('numpy.core.numerictypes', 'character',
     """
     Abstract base class of all character string scalar types.
-    
+
     """)
 
 
diff --git a/numpy/core/_internal.py b/numpy/core/_internal.py
index 27a3deeda..1d3bb5584 100644
--- a/numpy/core/_internal.py
+++ b/numpy/core/_internal.py
@@ -830,6 +830,13 @@ def array_ufunc_errmsg_formatter(dummy, ufunc, method, *inputs, **kwargs):
             .format(ufunc, method, args_string, types_string))
 
 
+def array_function_errmsg_formatter(public_api, types):
+    """ Format the error message for when __array_ufunc__ gives up. """
+    func_name = '{}.{}'.format(public_api.__module__, public_api.__name__)
+    return ("no implementation found for '{}' on types that implement "
+            '__array_function__: {}'.format(func_name, list(types)))
+
+
 def _ufunc_doc_signature_formatter(ufunc):
     """
     Builds a signature string which resembles PEP 457
diff --git a/numpy/core/_methods.py b/numpy/core/_methods.py
index baeab6383..51362c761 100644
--- a/numpy/core/_methods.py
+++ b/numpy/core/_methods.py
@@ -24,20 +24,20 @@ umr_all = um.logical_and.reduce
 # avoid keyword arguments to speed up parsing, saves about 15%-20% for very
 # small reductions
 def _amax(a, axis=None, out=None, keepdims=False,
-          initial=_NoValue):
-    return umr_maximum(a, axis, None, out, keepdims, initial)
+          initial=_NoValue, where=True):
+    return umr_maximum(a, axis, None, out, keepdims, initial, where)
 
 def _amin(a, axis=None, out=None, keepdims=False,
-          initial=_NoValue):
-    return umr_minimum(a, axis, None, out, keepdims, initial)
+          initial=_NoValue, where=True):
+    return umr_minimum(a, axis, None, out, keepdims, initial, where)
 
 def _sum(a, axis=None, dtype=None, out=None, keepdims=False,
-         initial=_NoValue):
-    return umr_sum(a, axis, dtype, out, keepdims, initial)
+         initial=_NoValue, where=True):
+    return umr_sum(a, axis, dtype, out, keepdims, initial, where)
 
 def _prod(a, axis=None, dtype=None, out=None, keepdims=False,
-          initial=_NoValue):
-    return umr_prod(a, axis, dtype, out, keepdims, initial)
+          initial=_NoValue, where=True):
+    return umr_prod(a, axis, dtype, out, keepdims, initial, where)
 
 def _any(a, axis=None, dtype=None, out=None, keepdims=False):
     return umr_any(a, axis, dtype, out, keepdims)
@@ -154,15 +154,3 @@ def _ptp(a, axis=None, out=None, keepdims=False):
         umr_minimum(a, axis, None, None, keepdims),
         out
     )
-
-_NDARRAY_ARRAY_FUNCTION = mu.ndarray.__array_function__
-
-def _array_function(self, func, types, args, kwargs):
-    # TODO: rewrite this in C
-    # Cannot handle items that have __array_function__ other than our own.
-    for t in types:
-        if not issubclass(t, mu.ndarray) and hasattr(t, '__array_function__'):
-            return NotImplemented
-
-    # The regular implementation can handle this, so we call it directly.
-    return func.__wrapped__(*args, **kwargs)
diff --git a/numpy/core/code_generators/genapi.py b/numpy/core/code_generators/genapi.py
index 1d2cd25c8..4aca2373c 100644
--- a/numpy/core/code_generators/genapi.py
+++ b/numpy/core/code_generators/genapi.py
@@ -19,6 +19,7 @@ __docformat__ = 'restructuredtext'
 
 # The files under src/ that are scanned for API functions
 API_FILES = [join('multiarray', 'alloc.c'),
+             join('multiarray', 'arrayfunction_override.c'),
              join('multiarray', 'array_assign_array.c'),
              join('multiarray', 'array_assign_scalar.c'),
              join('multiarray', 'arrayobject.c'),
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index f5ee02c42..108fff631 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -315,7 +315,7 @@ defdict = {
           TD(intfltcmplx),
           [TypeDescription('m', FullTypeDescr, 'mq', 'm'),
            TypeDescription('m', FullTypeDescr, 'md', 'm'),
-           #TypeDescription('m', FullTypeDescr, 'mm', 'd'),
+           TypeDescription('m', FullTypeDescr, 'mm', 'q'),
           ],
           TD(O, f='PyNumber_FloorDivide'),
           ),
diff --git a/numpy/core/code_generators/ufunc_docstrings.py b/numpy/core/code_generators/ufunc_docstrings.py
index 267e63b2d..6dd6982df 100644
--- a/numpy/core/code_generators/ufunc_docstrings.py
+++ b/numpy/core/code_generators/ufunc_docstrings.py
@@ -32,6 +32,9 @@ subst = {
             For other keyword-only arguments, see the
             :ref:`ufunc docs <ufuncs.kwargs>`.
     """).strip(),
+    'BROADCASTABLE_2': ("If ``x1.shape != x2.shape``, they must be "
+                        "broadcastable to a common shape (which becomes the "
+                        "shape of the output)."),
     'OUT_SCALAR_1': "This is a scalar if `x` is a scalar.",
     'OUT_SCALAR_2': "This is a scalar if both `x1` and `x2` are scalars.",
 }
@@ -104,9 +107,7 @@ add_newdoc('numpy.core.umath', 'add',
     Parameters
     ----------
     x1, x2 : array_like
-        The arrays to be added.  If ``x1.shape != x2.shape``, they must be
-        broadcastable to a common shape (which may be the shape of one or
-        the other).
+        The arrays to be added. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -432,8 +433,7 @@ add_newdoc('numpy.core.umath', 'arctan2',
     x1 : array_like, real-valued
         `y`-coordinates.
     x2 : array_like, real-valued
-        `x`-coordinates. `x2` must be broadcastable to match the shape of
-        `x1` or vice versa.
+        `x`-coordinates. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -556,7 +556,7 @@ add_newdoc('numpy.core.umath', 'bitwise_and',
     Parameters
     ----------
     x1, x2 : array_like
-        Only integer and boolean types are handled.
+        Only integer and boolean types are handled. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -609,7 +609,7 @@ add_newdoc('numpy.core.umath', 'bitwise_or',
     Parameters
     ----------
     x1, x2 : array_like
-        Only integer and boolean types are handled.
+        Only integer and boolean types are handled. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -667,7 +667,7 @@ add_newdoc('numpy.core.umath', 'bitwise_xor',
     Parameters
     ----------
     x1, x2 : array_like
-        Only integer and boolean types are handled.
+        Only integer and boolean types are handled. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -970,7 +970,7 @@ add_newdoc('numpy.core.umath', 'heaviside',
     x1 : array_like
         Input values.
     x2 : array_like
-        The value of the function when x1 is 0.
+        The value of the function when x1 is 0. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -1005,7 +1005,7 @@ add_newdoc('numpy.core.umath', 'divide',
     x1 : array_like
         Dividend array.
     x2 : array_like
-        Divisor array.
+        Divisor array. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -1074,7 +1074,7 @@ add_newdoc('numpy.core.umath', 'equal',
     Parameters
     ----------
     x1, x2 : array_like
-        Input arrays of the same shape.
+        Input arrays. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -1321,7 +1321,7 @@ add_newdoc('numpy.core.umath', 'floor_divide',
     x1 : array_like
         Numerator.
     x2 : array_like
-        Denominator.
+        Denominator. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -1361,7 +1361,7 @@ add_newdoc('numpy.core.umath', 'fmod',
     x1 : array_like
         Dividend.
     x2 : array_like
-        Divisor.
+        Divisor. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -1411,9 +1411,7 @@ add_newdoc('numpy.core.umath', 'greater',
     Parameters
     ----------
     x1, x2 : array_like
-        Input arrays.  If ``x1.shape != x2.shape``, they must be
-        broadcastable to a common shape (which may be the shape of one or
-        the other).
+        Input arrays. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -1449,9 +1447,7 @@ add_newdoc('numpy.core.umath', 'greater_equal',
     Parameters
     ----------
     x1, x2 : array_like
-        Input arrays.  If ``x1.shape != x2.shape``, they must be
-        broadcastable to a common shape (which may be the shape of one or
-        the other).
+        Input arrays. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -1484,7 +1480,7 @@ add_newdoc('numpy.core.umath', 'hypot',
     Parameters
     ----------
     x1, x2 : array_like
-        Leg of the triangle(s).
+        Leg of the triangle(s). $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -1783,6 +1779,7 @@ add_newdoc('numpy.core.umath', 'left_shift',
         Input values.
     x2 : array_like of integer type
         Number of zeros to append to `x1`. Has to be non-negative.
+        $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -1818,9 +1815,7 @@ add_newdoc('numpy.core.umath', 'less',
     Parameters
     ----------
     x1, x2 : array_like
-        Input arrays.  If ``x1.shape != x2.shape``, they must be
-        broadcastable to a common shape (which may be the shape of one or
-        the other).
+        Input arrays. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -1848,9 +1843,7 @@ add_newdoc('numpy.core.umath', 'less_equal',
     Parameters
     ----------
     x1, x2 : array_like
-        Input arrays.  If ``x1.shape != x2.shape``, they must be
-        broadcastable to a common shape (which may be the shape of one or
-        the other).
+        Input arrays. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -2034,7 +2027,7 @@ add_newdoc('numpy.core.umath', 'logaddexp',
     Parameters
     ----------
     x1, x2 : array_like
-        Input values.
+        Input values. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -2076,7 +2069,7 @@ add_newdoc('numpy.core.umath', 'logaddexp2',
     Parameters
     ----------
     x1, x2 : array_like
-        Input values.
+        Input values. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -2167,14 +2160,14 @@ add_newdoc('numpy.core.umath', 'logical_and',
     Parameters
     ----------
     x1, x2 : array_like
-        Input arrays. `x1` and `x2` must be of the same shape.
+        Input arrays. $BROADCASTABLE_2
     $PARAMS
 
     Returns
     -------
     y : ndarray or bool
-        Boolean result with the same shape as `x1` and `x2` of the logical
-        AND operation on corresponding elements of `x1` and `x2`.
+        Boolean result of the logical OR operation applied to the elements
+        of `x1` and `x2`; the shape is determined by broadcasting.
         $OUT_SCALAR_2
 
     See Also
@@ -2237,14 +2230,14 @@ add_newdoc('numpy.core.umath', 'logical_or',
     ----------
     x1, x2 : array_like
         Logical OR is applied to the elements of `x1` and `x2`.
-        They have to be of the same shape.
+        $BROADCASTABLE_2
     $PARAMS
 
     Returns
     -------
     y : ndarray or bool
-        Boolean result with the same shape as `x1` and `x2` of the logical
-        OR operation on elements of `x1` and `x2`.
+        Boolean result of the logical OR operation applied to the elements
+        of `x1` and `x2`; the shape is determined by broadcasting.
         $OUT_SCALAR_2
 
     See Also
@@ -2272,16 +2265,14 @@ add_newdoc('numpy.core.umath', 'logical_xor',
     Parameters
     ----------
     x1, x2 : array_like
-        Logical XOR is applied to the elements of `x1` and `x2`.  They must
-        be broadcastable to the same shape.
+        Logical XOR is applied to the elements of `x1` and `x2`. $BROADCASTABLE_2
     $PARAMS
 
     Returns
     -------
     y : bool or ndarray of bool
         Boolean result of the logical XOR operation applied to the elements
-        of `x1` and `x2`; the shape is determined by whether or not
-        broadcasting of one or both arrays was required.
+        of `x1` and `x2`; the shape is determined by broadcasting.
         $OUT_SCALAR_2
 
     See Also
@@ -2321,8 +2312,7 @@ add_newdoc('numpy.core.umath', 'maximum',
     Parameters
     ----------
     x1, x2 : array_like
-        The arrays holding the elements to be compared. They must have
-        the same shape, or shapes that can be broadcast to a single shape.
+        The arrays holding the elements to be compared. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -2380,8 +2370,7 @@ add_newdoc('numpy.core.umath', 'minimum',
     Parameters
     ----------
     x1, x2 : array_like
-        The arrays holding the elements to be compared. They must have
-        the same shape, or shapes that can be broadcast to a single shape.
+        The arrays holding the elements to be compared. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -2439,8 +2428,7 @@ add_newdoc('numpy.core.umath', 'fmax',
     Parameters
     ----------
     x1, x2 : array_like
-        The arrays holding the elements to be compared. They must have
-        the same shape.
+        The arrays holding the elements to be compared. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -2497,8 +2485,7 @@ add_newdoc('numpy.core.umath', 'fmin',
     Parameters
     ----------
     x1, x2 : array_like
-        The arrays holding the elements to be compared. They must have
-        the same shape.
+        The arrays holding the elements to be compared. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -2711,7 +2698,7 @@ add_newdoc('numpy.core.umath', 'multiply',
     Parameters
     ----------
     x1, x2 : array_like
-        Input arrays to be multiplied.
+        Input arrays to be multiplied. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -2792,7 +2779,7 @@ add_newdoc('numpy.core.umath', 'not_equal',
     Parameters
     ----------
     x1, x2 : array_like
-        Input arrays.
+        Input arrays.  $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -2841,7 +2828,7 @@ add_newdoc('numpy.core.umath', 'power',
     x1 : array_like
         The bases.
     x2 : array_like
-        The exponents.
+        The exponents. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -2900,7 +2887,7 @@ add_newdoc('numpy.core.umath', 'float_power',
     x1 : array_like
         The bases.
     x2 : array_like
-        The exponents.
+        The exponents. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -3072,7 +3059,7 @@ add_newdoc('numpy.core.umath', 'remainder',
     x1 : array_like
         Dividend array.
     x2 : array_like
-        Divisor array.
+        Divisor array. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -3117,7 +3104,7 @@ add_newdoc('numpy.core.umath', 'divmod',
     x1 : array_like
         Dividend array.
     x2 : array_like
-        Divisor array.
+        Divisor array. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -3156,7 +3143,7 @@ add_newdoc('numpy.core.umath', 'right_shift',
     x1 : array_like, int
         Input values.
     x2 : array_like, int
-        Number of bits to remove at the right of `x1`.
+        Number of bits to remove at the right of `x1`. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -3283,16 +3270,14 @@ add_newdoc('numpy.core.umath', 'copysign',
     """
     Change the sign of x1 to that of x2, element-wise.
 
-    If both arguments are arrays or sequences, they have to be of the same
-    length. If `x2` is a scalar, its sign will be copied to all elements of
-    `x1`.
+    If `x2` is a scalar, its sign will be copied to all elements of `x1`.
 
     Parameters
     ----------
     x1 : array_like
         Values to change the sign of.
     x2 : array_like
-        The sign of `x2` is copied to `x1`.
+        The sign of `x2` is copied to `x1`. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -3327,6 +3312,7 @@ add_newdoc('numpy.core.umath', 'nextafter',
         Values to find the next representable value of.
     x2 : array_like
         The direction where to look for the next representable value of `x1`.
+        $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -3598,7 +3584,7 @@ add_newdoc('numpy.core.umath', 'subtract',
     Parameters
     ----------
     x1, x2 : array_like
-        The arrays to be subtracted from each other.
+        The arrays to be subtracted from each other. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -3739,7 +3725,7 @@ add_newdoc('numpy.core.umath', 'true_divide',
     x1 : array_like
         Dividend array.
     x2 : array_like
-        Divisor array.
+        Divisor array. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -3836,7 +3822,7 @@ add_newdoc('numpy.core.umath', 'ldexp',
     x1 : array_like
         Array of multipliers.
     x2 : array_like, int
-        Array of twos exponents.
+        Array of twos exponents. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -3874,7 +3860,7 @@ add_newdoc('numpy.core.umath', 'gcd',
     Parameters
     ----------
     x1, x2 : array_like, int
-        Arrays of values
+        Arrays of values. $BROADCASTABLE_2
 
     Returns
     -------
@@ -3904,7 +3890,7 @@ add_newdoc('numpy.core.umath', 'lcm',
     Parameters
     ----------
     x1, x2 : array_like, int
-        Arrays of values
+        Arrays of values. $BROADCASTABLE_2
 
     Returns
     -------
diff --git a/numpy/core/fromnumeric.py b/numpy/core/fromnumeric.py
index 240eac6ce..d94372986 100644
--- a/numpy/core/fromnumeric.py
+++ b/numpy/core/fromnumeric.py
@@ -243,7 +243,7 @@ def reshape(a, newshape, order='C'):
 
      # A transpose makes the array non-contiguous
      >>> b = a.T
-     
+
      # Taking a view makes it possible to modify the shape without modifying
      # the initial object.
      >>> c = b.view()
@@ -1452,7 +1452,7 @@ def diagonal(a, offset=0, axis1=0, axis2=1):
         same type as `a` is returned unless `a` is a `matrix`, in which case
         a 1-D array rather than a (2-D) `matrix` is returned in order to
         maintain backward compatibility.
-        
+
         If ``a.ndim > 2``, then the dimensions specified by `axis1` and `axis2`
         are removed, and a new axis inserted at the end corresponding to the
         diagonal.
@@ -1963,12 +1963,13 @@ def clip(a, a_min, a_max, out=None):
 
 
 def _sum_dispatcher(a, axis=None, dtype=None, out=None, keepdims=None,
-                    initial=None):
+                    initial=None, where=None):
     return (a, out)
 
 
 @array_function_dispatch(_sum_dispatcher)
-def sum(a, axis=None, dtype=None, out=None, keepdims=np._NoValue, initial=np._NoValue):
+def sum(a, axis=None, dtype=None, out=None, keepdims=np._NoValue,
+        initial=np._NoValue, where=np._NoValue):
     """
     Sum of array elements over a given axis.
 
@@ -2012,6 +2013,11 @@ def sum(a, axis=None, dtype=None, out=None, keepdims=np._NoValue, initial=np._No
 
         .. versionadded:: 1.15.0
 
+    where : array_like of bool, optional
+        Elements to include in the sum. See `~numpy.ufunc.reduce` for details.
+
+        .. versionadded:: 1.17.0
+
     Returns
     -------
     sum_along_axis : ndarray
@@ -2052,6 +2058,8 @@ def sum(a, axis=None, dtype=None, out=None, keepdims=np._NoValue, initial=np._No
     array([0, 6])
     >>> np.sum([[0, 1], [0, 5]], axis=1)
     array([1, 5])
+    >>> np.sum([[0, 1], [np.nan, 5]], where=[False, True], axis=1)
+    array([1., 5.])
 
     If the accumulator is too small, overflow occurs:
 
@@ -2077,7 +2085,7 @@ def sum(a, axis=None, dtype=None, out=None, keepdims=np._NoValue, initial=np._No
         return res
 
     return _wrapreduction(a, np.add, 'sum', axis, dtype, out, keepdims=keepdims,
-                          initial=initial)
+                          initial=initial, where=where)
 
 
 def _any_dispatcher(a, axis=None, out=None, keepdims=None):
@@ -2394,12 +2402,14 @@ def ptp(a, axis=None, out=None, keepdims=np._NoValue):
     return _methods._ptp(a, axis=axis, out=out, **kwargs)
 
 
-def _amax_dispatcher(a, axis=None, out=None, keepdims=None, initial=None):
+def _amax_dispatcher(a, axis=None, out=None, keepdims=None, initial=None,
+                     where=None):
     return (a, out)
 
 
 @array_function_dispatch(_amax_dispatcher)
-def amax(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue):
+def amax(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
+         where=np._NoValue):
     """
     Return the maximum of an array or maximum along an axis.
 
@@ -2437,6 +2447,11 @@ def amax(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue):
 
         .. versionadded:: 1.15.0
 
+    where : array_like of bool, optional
+        Elements to compare for the maximum. See `~numpy.ufunc.reduce`
+        for details.
+
+        .. versionadded:: 1.17.0
 
     Returns
     -------
@@ -2482,11 +2497,14 @@ def amax(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue):
     array([2, 3])
     >>> np.amax(a, axis=1)   # Maxima along the second axis
     array([1, 3])
-
+    >>> np.amax(a, where=[False, True], initial=-1, axis=0)
+    array([-1,  3])
     >>> b = np.arange(5, dtype=float)
     >>> b[2] = np.NaN
     >>> np.amax(b)
     nan
+    >>> np.amax(b, where=~np.isnan(b), initial=-1)
+    4.0
     >>> np.nanmax(b)
     4.0
 
@@ -2505,16 +2523,18 @@ def amax(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue):
     >>> max([5], default=6)
     5
     """
-    return _wrapreduction(a, np.maximum, 'max', axis, None, out, keepdims=keepdims,
-                          initial=initial)
+    return _wrapreduction(a, np.maximum, 'max', axis, None, out,
+                          keepdims=keepdims, initial=initial, where=where)
 
 
-def _amin_dispatcher(a, axis=None, out=None, keepdims=None, initial=None):
+def _amin_dispatcher(a, axis=None, out=None, keepdims=None, initial=None,
+                     where=None):
     return (a, out)
 
 
 @array_function_dispatch(_amin_dispatcher)
-def amin(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue):
+def amin(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
+         where=np._NoValue):
     """
     Return the minimum of an array or minimum along an axis.
 
@@ -2552,6 +2572,12 @@ def amin(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue):
 
         .. versionadded:: 1.15.0
 
+    where : array_like of bool, optional
+        Elements to compare for the minimum. See `~numpy.ufunc.reduce`
+        for details.
+
+        .. versionadded:: 1.17.0
+
     Returns
     -------
     amin : ndarray or scalar
@@ -2596,11 +2622,15 @@ def amin(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue):
     array([0, 1])
     >>> np.amin(a, axis=1)   # Minima along the second axis
     array([0, 2])
+    >>> np.amin(a, where=[False, True], initial=10, axis=0)
+    array([10,  1])
 
     >>> b = np.arange(5, dtype=float)
     >>> b[2] = np.NaN
     >>> np.amin(b)
     nan
+    >>> np.amin(b, where=~np.isnan(b), initial=10)
+    0.0
     >>> np.nanmin(b)
     0.0
 
@@ -2618,8 +2648,8 @@ def amin(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue):
     >>> min([6], default=5)
     6
     """
-    return _wrapreduction(a, np.minimum, 'min', axis, None, out, keepdims=keepdims,
-                          initial=initial)
+    return _wrapreduction(a, np.minimum, 'min', axis, None, out,
+                          keepdims=keepdims, initial=initial, where=where)
 
 
 def _alen_dispathcer(a):
@@ -2660,13 +2690,14 @@ def alen(a):
         return len(array(a, ndmin=1))
 
 
-def _prod_dispatcher(
-        a, axis=None, dtype=None, out=None, keepdims=None, initial=None):
+def _prod_dispatcher(a, axis=None, dtype=None, out=None, keepdims=None,
+                     initial=None, where=None):
     return (a, out)
 
 
 @array_function_dispatch(_prod_dispatcher)
-def prod(a, axis=None, dtype=None, out=None, keepdims=np._NoValue, initial=np._NoValue):
+def prod(a, axis=None, dtype=None, out=None, keepdims=np._NoValue,
+         initial=np._NoValue, where=np._NoValue):
     """
     Return the product of array elements over a given axis.
 
@@ -2711,6 +2742,11 @@ def prod(a, axis=None, dtype=None, out=None, keepdims=np._NoValue, initial=np._N
 
         .. versionadded:: 1.15.0
 
+    where : array_like of bool, optional
+        Elements to include in the product. See `~numpy.ufunc.reduce` for details.
+
+        .. versionadded:: 1.17.0
+
     Returns
     -------
     product_along_axis : ndarray, see `dtype` parameter above.
@@ -2753,6 +2789,11 @@ def prod(a, axis=None, dtype=None, out=None, keepdims=np._NoValue, initial=np._N
     >>> np.prod([[1.,2.],[3.,4.]], axis=1)
     array([  2.,  12.])
 
+    Or select specific elements to include:
+
+    >>> np.prod([1., np.nan, 3.], where=[True, False, True])
+    3.0
+
     If the type of `x` is unsigned, then the output type is
     the unsigned platform integer:
 
@@ -2772,8 +2813,8 @@ def prod(a, axis=None, dtype=None, out=None, keepdims=np._NoValue, initial=np._N
     >>> np.prod([1, 2], initial=5)
     10
     """
-    return _wrapreduction(a, np.multiply, 'prod', axis, dtype, out, keepdims=keepdims,
-                          initial=initial)
+    return _wrapreduction(a, np.multiply, 'prod', axis, dtype, out,
+                          keepdims=keepdims, initial=initial, where=where)
 
 
 def _cumprod_dispatcher(a, axis=None, dtype=None, out=None):
diff --git a/numpy/core/function_base.py b/numpy/core/function_base.py
index 762328173..f8800b83e 100644
--- a/numpy/core/function_base.py
+++ b/numpy/core/function_base.py
@@ -110,9 +110,6 @@ def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None,
 
     Graphical illustration:
 
-    >>> import matplotlib
-    >>> import matplotlib.pyplot
-    >>> matplotlib.pyplot.switch_backend('agg')
     >>> import matplotlib.pyplot as plt
     >>> N = 8
     >>> y = np.zeros(N)
@@ -263,9 +260,6 @@ def logspace(start, stop, num=50, endpoint=True, base=10.0, dtype=None,
 
     Graphical illustration:
 
-    >>> import matplotlib
-    >>> import matplotlib.pyplot
-    >>> matplotlib.pyplot.switch_backend('agg')
     >>> import matplotlib.pyplot as plt
     >>> N = 10
     >>> x1 = np.logspace(0.1, 1, N, endpoint=True)
@@ -379,8 +373,6 @@ def geomspace(start, stop, num=50, endpoint=True, dtype=None, axis=0):
 
     Graphical illustration of ``endpoint`` parameter:
 
-    >>> import matplotlib
-    >>> matplotlib.use('agg')
     >>> import matplotlib.pyplot as plt
     >>> N = 10
     >>> y = np.zeros(N)
diff --git a/numpy/core/include/numpy/ufuncobject.h b/numpy/core/include/numpy/ufuncobject.h
index 90d837a9b..15dcdf010 100644
--- a/numpy/core/include/numpy/ufuncobject.h
+++ b/numpy/core/include/numpy/ufuncobject.h
@@ -120,7 +120,11 @@ typedef struct _tagPyUFuncObject {
          */
         int nin, nout, nargs;
 
-        /* Identity for reduction, either PyUFunc_One or PyUFunc_Zero */
+        /*
+         * Identity for reduction, any of PyUFunc_One, PyUFunc_Zero
+         * PyUFunc_MinusOne, PyUFunc_None, PyUFunc_ReorderableNone,
+         * PyUFunc_IdentityValue.
+         */
         int identity;
 
         /* Array of one-dimensional core loops */
@@ -301,7 +305,7 @@ typedef struct _tagPyUFuncObject {
  */
 #define PyUFunc_ReorderableNone -2
 /*
- * UFunc unit is in identity_value, and the order of operations can be reordered
+ * UFunc unit is an identity_value, and the order of operations can be reordered
  * This case allows reduction with multiple axes at once.
  */
 #define PyUFunc_IdentityValue -3
diff --git a/numpy/core/multiarray.py b/numpy/core/multiarray.py
index 1b9c65782..4c2715892 100644
--- a/numpy/core/multiarray.py
+++ b/numpy/core/multiarray.py
@@ -211,9 +211,11 @@ def concatenate(arrays, axis=None, out=None):
            fill_value=999999)
 
     """
-    for array in arrays:
-        yield array
-    yield out
+    if out is not None:
+        # optimize for the typical case where only arrays is provided
+        arrays = list(arrays)
+        arrays.append(out)
+    return arrays
 
 
 @array_function_from_c_func_and_dispatcher(_multiarray_umath.inner)
diff --git a/numpy/core/numeric.py b/numpy/core/numeric.py
index 8a8efddf3..1b8f36c3e 100644
--- a/numpy/core/numeric.py
+++ b/numpy/core/numeric.py
@@ -12,6 +12,7 @@ import operator
 import sys
 import warnings
 import numbers
+import contextlib
 
 import numpy as np
 from . import multiarray
@@ -2990,7 +2991,7 @@ _Unspecified = _unspecified()
 
 
 @set_module('numpy')
-class errstate(object):
+class errstate(contextlib.ContextDecorator):
     """
     errstate(**kwargs)
 
@@ -3000,7 +3001,12 @@ class errstate(object):
     that context to execute with a known error handling behavior. Upon entering
     the context the error handling is set with `seterr` and `seterrcall`, and
     upon exiting it is reset to what it was before.
-
+    
+    ..  versionchanged:: 1.17.0
+        `errstate` is also usable as a function decorator, saving
+        a level of indentation if an entire function is wrapped.
+        See :py:class:`contextlib.ContextDecorator` for more information. 
+    
     Parameters
     ----------
     kwargs : {divide, over, under, invalid}
diff --git a/numpy/core/overrides.py b/numpy/core/overrides.py
index 0979858a1..c55174ecd 100644
--- a/numpy/core/overrides.py
+++ b/numpy/core/overrides.py
@@ -1,73 +1,23 @@
-"""Preliminary implementation of NEP-18
-
-TODO: rewrite this in C for performance.
-"""
+"""Implementation of __array_function__ overrides from NEP-18."""
 import collections
 import functools
 import os
 
-from numpy.core._multiarray_umath import add_docstring, ndarray
+from numpy.core._multiarray_umath import (
+    add_docstring, implement_array_function, _get_implementing_args)
 from numpy.compat._inspect import getargspec
 
 
-_NDARRAY_ARRAY_FUNCTION = ndarray.__array_function__
-_NDARRAY_ONLY = [ndarray]
-
 ENABLE_ARRAY_FUNCTION = bool(
     int(os.environ.get('NUMPY_EXPERIMENTAL_ARRAY_FUNCTION', 0)))
 
 
-def get_overloaded_types_and_args(relevant_args):
-    """Returns a list of arguments on which to call __array_function__.
-
-    Parameters
-    ----------
-    relevant_args : iterable of array-like
-        Iterable of array-like arguments to check for __array_function__
-        methods.
-
-    Returns
-    -------
-    overloaded_types : collection of types
-        Types of arguments from relevant_args with __array_function__ methods.
-    overloaded_args : list
-        Arguments from relevant_args on which to call __array_function__
-        methods, in the order in which they should be called.
+add_docstring(
+    implement_array_function,
     """
-    # Runtime is O(num_arguments * num_unique_types)
-    overloaded_types = []
-    overloaded_args = []
-    for arg in relevant_args:
-        arg_type = type(arg)
-        # We only collect arguments if they have a unique type, which ensures
-        # reasonable performance even with a long list of possibly overloaded
-        # arguments.
-        if (arg_type not in overloaded_types and
-                hasattr(arg_type, '__array_function__')):
-
-            # Create lists explicitly for the first type (usually the only one
-            # done) to avoid setting up the iterator for overloaded_args.
-            if overloaded_types:
-                overloaded_types.append(arg_type)
-                # By default, insert argument at the end, but if it is
-                # subclass of another argument, insert it before that argument.
-                # This ensures "subclasses before superclasses".
-                index = len(overloaded_args)
-                for i, old_arg in enumerate(overloaded_args):
-                    if issubclass(arg_type, type(old_arg)):
-                        index = i
-                        break
-                overloaded_args.insert(index, arg)
-            else:
-                overloaded_types = [arg_type]
-                overloaded_args = [arg]
-
-    return overloaded_types, overloaded_args
-
-
-def array_function_implementation_or_override(
-        implementation, public_api, relevant_args, args, kwargs):
-    """Implement a function with checks for __array_function__ overrides.
+    Implement a function with checks for __array_function__ overrides.
+
+    All arguments are required, and can only be passed by position.
 
     Arguments
     ---------
@@ -82,41 +32,37 @@ def array_function_implementation_or_override(
         Iterable of arguments to check for __array_function__ methods.
     args : tuple
         Arbitrary positional arguments originally passed into ``public_api``.
-    kwargs : tuple
+    kwargs : dict
         Arbitrary keyword arguments originally passed into ``public_api``.
 
     Returns
     -------
-    Result from calling `implementation()` or an `__array_function__`
+    Result from calling ``implementation()`` or an ``__array_function__``
     method, as appropriate.
 
     Raises
     ------
     TypeError : if no implementation is found.
+    """)
+
+
+# exposed for testing purposes; used internally by implement_array_function
+add_docstring(
+    _get_implementing_args,
     """
-    # Check for __array_function__ methods.
-    types, overloaded_args = get_overloaded_types_and_args(relevant_args)
-    # Short-cut for common cases: no overload or only ndarray overload
-    # (directly or with subclasses that do not override __array_function__).
-    if (not overloaded_args or types == _NDARRAY_ONLY or
-            all(type(arg).__array_function__ is _NDARRAY_ARRAY_FUNCTION
-                for arg in overloaded_args)):
-        return implementation(*args, **kwargs)
-
-    # Call overrides
-    for overloaded_arg in overloaded_args:
-        # Use `public_api` instead of `implemenation` so __array_function__
-        # implementations can do equality/identity comparisons.
-        result = overloaded_arg.__array_function__(
-            public_api, types, args, kwargs)
-
-        if result is not NotImplemented:
-            return result
-
-    func_name = '{}.{}'.format(public_api.__module__, public_api.__name__)
-    raise TypeError("no implementation found for '{}' on types that implement "
-                    '__array_function__: {}'
-                    .format(func_name, list(map(type, overloaded_args))))
+    Collect arguments on which to call __array_function__.
+
+    Parameters
+    ----------
+    relevant_args : iterable of array-like
+        Iterable of possibly array-like arguments to check for
+        __array_function__ methods.
+
+    Returns
+    -------
+    Sequence of arguments with __array_function__ methods, in the order in
+    which they should be called.
+    """)
 
 
 ArgSpec = collections.namedtuple('ArgSpec', 'args varargs keywords defaults')
@@ -215,7 +161,7 @@ def array_function_dispatch(dispatcher, module=None, verify=True,
         @functools.wraps(implementation)
         def public_api(*args, **kwargs):
             relevant_args = dispatcher(*args, **kwargs)
-            return array_function_implementation_or_override(
+            return implement_array_function(
                 implementation, public_api, relevant_args, args, kwargs)
 
         if module is not None:
diff --git a/numpy/core/records.py b/numpy/core/records.py
index d128f41e1..42aca5b60 100644
--- a/numpy/core/records.py
+++ b/numpy/core/records.py
@@ -167,10 +167,12 @@ class format_parser(object):
         if formats is None:
             raise ValueError("Need formats argument")
         if isinstance(formats, list):
-            if len(formats) < 2:
-                formats.append('')
-            formats = ','.join(formats)
-        dtype = sb.dtype(formats, aligned)
+            dtype = sb.dtype(
+                [('f{}'.format(i), format_) for i, format_ in enumerate(formats)],
+                aligned,
+            )
+        else:
+            dtype = sb.dtype(formats, aligned)
         fields = dtype.fields
         if fields is None:
             dtype = sb.dtype([('f1', dtype)], aligned)
@@ -611,7 +613,6 @@ def fromarrays(arrayList, dtype=None, shape=None, formats=None,
             if not isinstance(obj, ndarray):
                 raise ValueError("item in the array list must be an ndarray.")
             formats.append(obj.dtype.str)
-        formats = ','.join(formats)
 
     if dtype is not None:
         descr = sb.dtype(dtype)
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 467b590ac..9ccca629e 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -775,6 +775,7 @@ def configuration(parent_package='',top_path=None):
     multiarray_deps = [
             join('src', 'multiarray', 'arrayobject.h'),
             join('src', 'multiarray', 'arraytypes.h'),
+            join('src', 'multiarray', 'arrayfunction_override.h'),
             join('src', 'multiarray', 'buffer.h'),
             join('src', 'multiarray', 'calculation.h'),
             join('src', 'multiarray', 'common.h'),
@@ -827,6 +828,7 @@ def configuration(parent_package='',top_path=None):
             join('src', 'multiarray', 'arraytypes.c.src'),
             join('src', 'multiarray', 'array_assign_scalar.c'),
             join('src', 'multiarray', 'array_assign_array.c'),
+            join('src', 'multiarray', 'arrayfunction_override.c'),
             join('src', 'multiarray', 'buffer.c'),
             join('src', 'multiarray', 'calculation.c'),
             join('src', 'multiarray', 'compiled_base.c'),
diff --git a/numpy/core/shape_base.py b/numpy/core/shape_base.py
index 0378d3c1f..f8332c362 100644
--- a/numpy/core/shape_base.py
+++ b/numpy/core/shape_base.py
@@ -342,10 +342,11 @@ def hstack(tup):
 
 def _stack_dispatcher(arrays, axis=None, out=None):
     arrays = _arrays_for_stack_dispatcher(arrays, stacklevel=6)
-    for a in arrays:
-        yield a
     if out is not None:
-        yield out
+        # optimize for the typical case where only arrays is provided
+        arrays = list(arrays)
+        arrays.append(out)
+    return arrays
 
 
 @array_function_dispatch(_stack_dispatcher)
diff --git a/numpy/core/src/common/array_assign.c b/numpy/core/src/common/array_assign.c
index ac3fdbef7..02a423e3a 100644
--- a/numpy/core/src/common/array_assign.c
+++ b/numpy/core/src/common/array_assign.c
@@ -125,9 +125,13 @@ raw_array_is_aligned(int ndim, npy_intp *shape,
 
         return npy_is_aligned((void *)align_check, alignment);
     }
-    else {
+    else if (alignment == 1) {
         return 1;
     }
+    else {
+        /* always return false for alignment == 0, which means cannot-be-aligned */
+        return 0;
+    }
 }
 
 NPY_NO_EXPORT int
diff --git a/numpy/core/src/common/array_assign.h b/numpy/core/src/common/array_assign.h
index 07438c5e8..69ef56bb4 100644
--- a/numpy/core/src/common/array_assign.h
+++ b/numpy/core/src/common/array_assign.h
@@ -87,8 +87,10 @@ broadcast_strides(int ndim, npy_intp *shape,
 
 /*
  * Checks whether a data pointer + set of strides refers to a raw
- * array whose elements are all aligned to a given alignment.
- * alignment should be a power of two.
+ * array whose elements are all aligned to a given alignment. Returns
+ * 1 if data is aligned to alignment or 0 if not.
+ * alignment should be a power of two, or may be the sentinel value 0 to mean
+ * cannot-be-aligned, in which case 0 (false) is always returned.
  */
 NPY_NO_EXPORT int
 raw_array_is_aligned(int ndim, npy_intp *shape,
diff --git a/numpy/core/src/common/get_attr_string.h b/numpy/core/src/common/get_attr_string.h
index bec87c5ed..d458d9550 100644
--- a/numpy/core/src/common/get_attr_string.h
+++ b/numpy/core/src/common/get_attr_string.h
@@ -103,7 +103,6 @@ PyArray_LookupSpecial(PyObject *obj, char *name)
     if (_is_basic_python_type(tp)) {
         return NULL;
     }
-
     return maybe_get_attr((PyObject *)tp, name);
 }
 
diff --git a/numpy/core/src/multiarray/_multiarray_tests.c.src b/numpy/core/src/multiarray/_multiarray_tests.c.src
index 2a8275572..c26bd16ac 100644
--- a/numpy/core/src/multiarray/_multiarray_tests.c.src
+++ b/numpy/core/src/multiarray/_multiarray_tests.c.src
@@ -1871,11 +1871,14 @@ printf_float_g(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds)
 static PyObject *
 getset_numericops(PyObject* NPY_UNUSED(self), PyObject* NPY_UNUSED(args))
 {
-    PyObject * ops = PyArray_GetNumericOps();
+    PyObject *ret;
+    PyObject *ops = PyArray_GetNumericOps();
     if (ops == NULL) {
         return NULL;
     }
-    return PyLong_FromLong(PyArray_SetNumericOps(ops));
+    ret = PyLong_FromLong(PyArray_SetNumericOps(ops));
+    Py_DECREF(ops);
+    return ret;
 }
 
 static PyMethodDef Multiarray_TestsMethods[] = {
diff --git a/numpy/core/src/multiarray/array_assign_array.c b/numpy/core/src/multiarray/array_assign_array.c
index f692e0307..6e31fd3f7 100644
--- a/numpy/core/src/multiarray/array_assign_array.c
+++ b/numpy/core/src/multiarray/array_assign_array.c
@@ -25,6 +25,47 @@
 #include "array_assign.h"
 
 /*
+ * Check that array data is both uint-aligned and true-aligned for all array
+ * elements, as required by the copy/casting code in lowlevel_strided_loops.c
+ */
+NPY_NO_EXPORT int
+copycast_isaligned(int ndim, npy_intp *shape,
+        PyArray_Descr *dtype, char *data, npy_intp *strides)
+{
+    int aligned;
+    int big_aln, small_aln;
+
+    int uint_aln = npy_uint_alignment(dtype->elsize);
+    int true_aln = dtype->alignment;
+
+    /* uint alignment can be 0, meaning not uint alignable */
+    if (uint_aln == 0) {
+        return 0;
+    }
+
+    /*
+     * As an optimization, it is unnecessary to check the alignment to the
+     * smaller of (uint_aln, true_aln) if the data is aligned to the bigger of
+     * the two and the big is a multiple of the small aln. We check the bigger
+     * one first and only check the smaller if necessary.
+     */
+    if (true_aln >= uint_aln) {
+        big_aln = true_aln;
+        small_aln = uint_aln;
+    }
+    else {
+        big_aln = uint_aln;
+        small_aln = true_aln;
+    }
+
+    aligned = raw_array_is_aligned(ndim, shape, data, strides, big_aln);
+    if (aligned && big_aln % small_aln != 0) {
+        aligned = raw_array_is_aligned(ndim, shape, data, strides, small_aln);
+    }
+    return aligned;
+}
+
+/*
  * Assigns the array from 'src' to 'dst'. The strides must already have
  * been broadcast.
  *
@@ -48,11 +89,9 @@ raw_array_assign_array(int ndim, npy_intp *shape,
 
     NPY_BEGIN_THREADS_DEF;
 
-    /* Check alignment */
-    aligned = raw_array_is_aligned(ndim, shape, dst_data, dst_strides,
-                                   npy_uint_alignment(dst_dtype->elsize)) &&
-              raw_array_is_aligned(ndim, shape, src_data, src_strides,
-                                   npy_uint_alignment(src_dtype->elsize));
+    aligned =
+        copycast_isaligned(ndim, shape, dst_dtype, dst_data, dst_strides) &&
+        copycast_isaligned(ndim, shape, src_dtype, src_data, src_strides);
 
     /* Use raw iteration with no heap allocation */
     if (PyArray_PrepareTwoRawArrayIter(
@@ -133,11 +172,9 @@ raw_array_wheremasked_assign_array(int ndim, npy_intp *shape,
 
     NPY_BEGIN_THREADS_DEF;
 
-    /* Check alignment */
-    aligned = raw_array_is_aligned(ndim, shape, dst_data, dst_strides,
-                                   npy_uint_alignment(dst_dtype->elsize)) &&
-              raw_array_is_aligned(ndim, shape, src_data, src_strides,
-                                   npy_uint_alignment(src_dtype->elsize));
+    aligned =
+        copycast_isaligned(ndim, shape, dst_dtype, dst_data, dst_strides) &&
+        copycast_isaligned(ndim, shape, src_dtype, src_data, src_strides);
 
     /* Use raw iteration with no heap allocation */
     if (PyArray_PrepareThreeRawArrayIter(
diff --git a/numpy/core/src/multiarray/array_assign_scalar.c b/numpy/core/src/multiarray/array_assign_scalar.c
index 841a41850..ecb5be47b 100644
--- a/numpy/core/src/multiarray/array_assign_scalar.c
+++ b/numpy/core/src/multiarray/array_assign_scalar.c
@@ -45,10 +45,13 @@ raw_array_assign_scalar(int ndim, npy_intp *shape,
 
     NPY_BEGIN_THREADS_DEF;
 
-    /* Check alignment */
+    /* Check both uint and true alignment */
     aligned = raw_array_is_aligned(ndim, shape, dst_data, dst_strides,
                                    npy_uint_alignment(dst_dtype->elsize)) &&
-              npy_is_aligned(src_data, npy_uint_alignment(src_dtype->elsize));
+              raw_array_is_aligned(ndim, shape, dst_data, dst_strides,
+                                   dst_dtype->alignment) &&
+              npy_is_aligned(src_data, npy_uint_alignment(src_dtype->elsize) &&
+              npy_is_aligned(src_data, src_dtype->alignment));
 
     /* Use raw iteration with no heap allocation */
     if (PyArray_PrepareOneRawArrayIter(
@@ -116,10 +119,13 @@ raw_array_wheremasked_assign_scalar(int ndim, npy_intp *shape,
 
     NPY_BEGIN_THREADS_DEF;
 
-    /* Check alignment */
+    /* Check both uint and true alignment */
     aligned = raw_array_is_aligned(ndim, shape, dst_data, dst_strides,
                                    npy_uint_alignment(dst_dtype->elsize)) &&
-              npy_is_aligned(src_data, npy_uint_alignment(src_dtype->elsize));
+              raw_array_is_aligned(ndim, shape, dst_data, dst_strides,
+                                   dst_dtype->alignment) &&
+              npy_is_aligned(src_data, npy_uint_alignment(src_dtype->elsize) &&
+              npy_is_aligned(src_data, src_dtype->alignment));
 
     /* Use raw iteration with no heap allocation */
     if (PyArray_PrepareTwoRawArrayIter(
@@ -220,7 +226,8 @@ PyArray_AssignRawScalar(PyArrayObject *dst,
      * we also skip this if 'dst' has an object dtype.
      */
     if ((!PyArray_EquivTypes(PyArray_DESCR(dst), src_dtype) ||
-            !npy_is_aligned(src_data, npy_uint_alignment(src_dtype->elsize))) &&
+            !(npy_is_aligned(src_data, npy_uint_alignment(src_dtype->elsize)) &&
+              npy_is_aligned(src_data, src_dtype->alignment))) &&
                     PyArray_SIZE(dst) > 1 &&
                     !PyDataType_REFCHK(PyArray_DESCR(dst))) {
         char *tmp_src_data;
diff --git a/numpy/core/src/multiarray/arrayfunction_override.c b/numpy/core/src/multiarray/arrayfunction_override.c
new file mode 100644
index 000000000..e62b32ab2
--- /dev/null
+++ b/numpy/core/src/multiarray/arrayfunction_override.c
@@ -0,0 +1,376 @@
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+
+#include "npy_pycompat.h"
+#include "get_attr_string.h"
+#include "npy_import.h"
+#include "multiarraymodule.h"
+
+
+/* Return the ndarray.__array_function__ method. */
+static PyObject *
+get_ndarray_array_function(void)
+{
+    PyObject* method = PyObject_GetAttrString((PyObject *)&PyArray_Type,
+                                              "__array_function__");
+    assert(method != NULL);
+    return method;
+}
+
+
+/*
+ * Get an object's __array_function__ method in the fastest way possible.
+ * Never raises an exception. Returns NULL if the method doesn't exist.
+ */
+static PyObject *
+get_array_function(PyObject *obj)
+{
+    static PyObject *ndarray_array_function = NULL;
+
+    if (ndarray_array_function == NULL) {
+        ndarray_array_function = get_ndarray_array_function();
+    }
+
+    /* Fast return for ndarray */
+    if (PyArray_CheckExact(obj)) {
+        Py_INCREF(ndarray_array_function);
+        return ndarray_array_function;
+    }
+
+    return PyArray_LookupSpecial(obj, "__array_function__");
+}
+
+
+/*
+ * Like list.insert(), but for C arrays of PyObject*. Skips error checking.
+ */
+static void
+pyobject_array_insert(PyObject **array, int length, int index, PyObject *item)
+{
+    int j;
+
+    for (j = length; j > index; j--) {
+        array[j] = array[j - 1];
+    }
+    array[index] = item;
+}
+
+
+/*
+ * Collects arguments with __array_function__ and their corresponding methods
+ * in the order in which they should be tried (i.e., skipping redundant types).
+ * `relevant_args` is expected to have been produced by PySequence_Fast.
+ * Returns the number of arguments, or -1 on failure. 
+ */
+static int
+get_implementing_args_and_methods(PyObject *relevant_args,
+                                  PyObject **implementing_args,
+                                  PyObject **methods)
+{
+    int num_implementing_args = 0;
+    Py_ssize_t i;
+    int j;
+
+    PyObject **items = PySequence_Fast_ITEMS(relevant_args);
+    Py_ssize_t length = PySequence_Fast_GET_SIZE(relevant_args);
+
+    for (i = 0; i < length; i++) {
+        int new_class = 1;
+        PyObject *argument = items[i];
+
+        /* Have we seen this type before? */
+        for (j = 0; j < num_implementing_args; j++) {
+            if (Py_TYPE(argument) == Py_TYPE(implementing_args[j])) {
+                new_class = 0;
+                break;
+            }
+        }
+        if (new_class) {
+            PyObject *method = get_array_function(argument);
+
+            if (method != NULL) {
+                int arg_index;
+
+                if (num_implementing_args >= NPY_MAXARGS) {
+                    PyErr_Format(
+                        PyExc_TypeError,
+                        "maximum number (%d) of distinct argument types " \
+                        "implementing __array_function__ exceeded",
+                        NPY_MAXARGS);
+                    Py_DECREF(method);
+                    goto fail;
+                }
+
+                /* "subclasses before superclasses, otherwise left to right" */
+                arg_index = num_implementing_args;
+                for (j = 0; j < num_implementing_args; j++) {
+                    PyObject *other_type;
+                    other_type = (PyObject *)Py_TYPE(implementing_args[j]);
+                    if (PyObject_IsInstance(argument, other_type)) {
+                        arg_index = j;
+                        break;
+                    }
+                }
+                Py_INCREF(argument);
+                pyobject_array_insert(implementing_args, num_implementing_args,
+                                      arg_index, argument);
+                pyobject_array_insert(methods, num_implementing_args,
+                                      arg_index, method);
+                ++num_implementing_args;
+            }
+        }
+    }
+    return num_implementing_args;
+
+fail:
+    for (j = 0; j < num_implementing_args; j++) {
+        Py_DECREF(implementing_args[j]);
+        Py_DECREF(methods[j]);
+    }
+    return -1;
+}
+
+
+/*
+ * Is this object ndarray.__array_function__?
+ */
+static int
+is_default_array_function(PyObject *obj)
+{
+    static PyObject *ndarray_array_function = NULL;
+
+    if (ndarray_array_function == NULL) {
+        ndarray_array_function = get_ndarray_array_function();
+    }
+    return obj == ndarray_array_function;
+}
+
+
+/*
+ * Core implementation of ndarray.__array_function__. This is exposed
+ * separately so we can avoid the overhead of a Python method call from
+ * within `implement_array_function`.
+ */
+NPY_NO_EXPORT PyObject *
+array_function_method_impl(PyObject *func, PyObject *types, PyObject *args,
+                           PyObject *kwargs)
+{
+    Py_ssize_t j;
+    PyObject *implementation, *result;
+
+    PyObject **items = PySequence_Fast_ITEMS(types);
+    Py_ssize_t length = PySequence_Fast_GET_SIZE(types);
+
+    for (j = 0; j < length; j++) {
+        int is_subclass = PyObject_IsSubclass(
+            items[j], (PyObject *)&PyArray_Type);
+        if (is_subclass == -1) {
+            return NULL;
+        }
+        if (!is_subclass) {
+            Py_INCREF(Py_NotImplemented);
+            return Py_NotImplemented;
+        }
+    }
+
+    implementation = PyObject_GetAttr(func, npy_ma_str_wrapped);
+    if (implementation == NULL) {
+        return NULL;
+    }
+    result = PyObject_Call(implementation, args, kwargs);
+    Py_DECREF(implementation);
+    return result;
+}
+
+
+/*
+ * Calls __array_function__ on the provided argument, with a fast-path for
+ * ndarray.
+ */
+static PyObject *
+call_array_function(PyObject* argument, PyObject* method,
+                    PyObject* public_api, PyObject* types,
+                    PyObject* args, PyObject* kwargs)
+{
+    if (is_default_array_function(method)) {
+        return array_function_method_impl(public_api, types, args, kwargs);
+    }
+    else {
+        return PyObject_CallFunctionObjArgs(
+            method, argument, public_api, types, args, kwargs, NULL);
+    }
+}
+
+
+/*
+ * Implements the __array_function__ protocol for a function, as described in
+ * in NEP-18. See numpy.core.overrides for a full docstring.
+ */
+NPY_NO_EXPORT PyObject *
+array_implement_array_function(
+    PyObject *NPY_UNUSED(dummy), PyObject *positional_args)
+{
+    PyObject *implementation, *public_api, *relevant_args, *args, *kwargs;
+
+    PyObject *types = NULL;
+    PyObject *implementing_args[NPY_MAXARGS];
+    PyObject *array_function_methods[NPY_MAXARGS];
+
+    int j, any_overrides;
+    int num_implementing_args = 0;
+    PyObject *result = NULL;
+
+    static PyObject *errmsg_formatter = NULL;
+
+    if (!PyArg_UnpackTuple(
+            positional_args, "implement_array_function", 5, 5,
+            &implementation, &public_api, &relevant_args, &args, &kwargs)) {
+        return NULL;
+    }
+
+    relevant_args = PySequence_Fast(
+        relevant_args,
+        "dispatcher for __array_function__ did not return an iterable");
+    if (relevant_args == NULL) {
+        return NULL;
+    }
+
+    /* Collect __array_function__ implementations */
+    num_implementing_args = get_implementing_args_and_methods(
+        relevant_args, implementing_args, array_function_methods);
+    if (num_implementing_args == -1) {
+        goto cleanup;
+    }
+
+    /*
+     * Handle the typical case of no overrides. This is merely an optimization
+     * if some arguments are ndarray objects, but is also necessary if no
+     * arguments implement __array_function__ at all (e.g., if they are all
+     * built-in types).
+     */
+    any_overrides = 0;
+    for (j = 0; j < num_implementing_args; j++) {
+        if (!is_default_array_function(array_function_methods[j])) {
+            any_overrides = 1;
+            break;
+        }
+    }
+    if (!any_overrides) {
+        result = PyObject_Call(implementation, args, kwargs);
+        goto cleanup;
+    }
+
+    /*
+     * Create a Python object for types.
+     * We use a tuple, because it's the fastest Python collection to create
+     * and has the bonus of being immutable.
+     */
+    types = PyTuple_New(num_implementing_args);
+    if (types == NULL) {
+        goto cleanup;
+    }
+    for (j = 0; j < num_implementing_args; j++) {
+        PyObject *arg_type = (PyObject *)Py_TYPE(implementing_args[j]);
+        Py_INCREF(arg_type);
+        PyTuple_SET_ITEM(types, j, arg_type);
+    }
+
+    /* Call __array_function__ methods */
+    for (j = 0; j < num_implementing_args; j++) {
+        PyObject *argument = implementing_args[j];
+        PyObject *method = array_function_methods[j];
+
+        /*
+         * We use `public_api` instead of `implementation` here so
+         * __array_function__ implementations can do equality/identity
+         * comparisons.
+         */
+        result = call_array_function(
+            argument, method, public_api, types, args, kwargs);
+
+        if (result == Py_NotImplemented) {
+            /* Try the next one */
+            Py_DECREF(result);
+            result = NULL;
+        }
+        else {
+            /* Either a good result, or an exception was raised. */
+            goto cleanup;
+        }
+    }
+
+    /* No acceptable override found, raise TypeError. */
+    npy_cache_import("numpy.core._internal",
+                     "array_function_errmsg_formatter",
+                     &errmsg_formatter);
+    if (errmsg_formatter != NULL) {
+        PyObject *errmsg = PyObject_CallFunctionObjArgs(
+            errmsg_formatter, public_api, types, NULL);
+        if (errmsg != NULL) {
+            PyErr_SetObject(PyExc_TypeError, errmsg);
+            Py_DECREF(errmsg);
+        }
+    }
+
+cleanup:
+    for (j = 0; j < num_implementing_args; j++) {
+        Py_DECREF(implementing_args[j]);
+        Py_DECREF(array_function_methods[j]);
+    }
+    Py_XDECREF(types);
+    Py_DECREF(relevant_args);
+    return result;
+}
+
+
+/*
+ * Python wrapper for get_implementing_args_and_methods, for testing purposes.
+ */
+NPY_NO_EXPORT PyObject *
+array__get_implementing_args(
+    PyObject *NPY_UNUSED(dummy), PyObject *positional_args)
+{
+    PyObject *relevant_args;
+    int j;
+    int num_implementing_args = 0;
+    PyObject *implementing_args[NPY_MAXARGS];
+    PyObject *array_function_methods[NPY_MAXARGS];
+    PyObject *result = NULL;
+
+    if (!PyArg_ParseTuple(positional_args, "O:array__get_implementing_args",
+                          &relevant_args)) {
+        return NULL;
+    }
+
+    relevant_args = PySequence_Fast(
+        relevant_args,
+        "dispatcher for __array_function__ did not return an iterable");
+    if (relevant_args == NULL) {
+        return NULL;
+    }
+
+    num_implementing_args = get_implementing_args_and_methods(
+        relevant_args, implementing_args, array_function_methods);
+    if (num_implementing_args == -1) {
+        goto cleanup;
+    }
+
+    /* create a Python object for implementing_args */
+    result = PyList_New(num_implementing_args);
+    if (result == NULL) {
+        goto cleanup;
+    }
+    for (j = 0; j < num_implementing_args; j++) {
+        PyObject *argument = implementing_args[j];
+        Py_INCREF(argument);
+        PyList_SET_ITEM(result, j, argument);
+    }
+
+cleanup:
+    for (j = 0; j < num_implementing_args; j++) {
+        Py_DECREF(implementing_args[j]);
+        Py_DECREF(array_function_methods[j]);
+    }
+    Py_DECREF(relevant_args);
+    return result;
+}
diff --git a/numpy/core/src/multiarray/arrayfunction_override.h b/numpy/core/src/multiarray/arrayfunction_override.h
new file mode 100644
index 000000000..0d224e2b6
--- /dev/null
+++ b/numpy/core/src/multiarray/arrayfunction_override.h
@@ -0,0 +1,16 @@
+#ifndef _NPY_PRIVATE__ARRAYFUNCTION_OVERRIDE_H
+#define _NPY_PRIVATE__ARRAYFUNCTION_OVERRIDE_H
+
+NPY_NO_EXPORT PyObject *
+array_implement_array_function(
+    PyObject *NPY_UNUSED(dummy), PyObject *positional_args);
+
+NPY_NO_EXPORT PyObject *
+array__get_implementing_args(
+    PyObject *NPY_UNUSED(dummy), PyObject *positional_args);
+
+NPY_NO_EXPORT PyObject *
+array_function_method_impl(PyObject *func, PyObject *types, PyObject *args,
+                           PyObject *kwargs);
+
+#endif
diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src
index 823ee7115..ca5f5a47b 100644
--- a/numpy/core/src/multiarray/arraytypes.c.src
+++ b/numpy/core/src/multiarray/arraytypes.c.src
@@ -2205,15 +2205,19 @@ static void
 VOID_copyswapn (char *dst, npy_intp dstride, char *src, npy_intp sstride,
                 npy_intp n, int swap, PyArrayObject *arr)
 {
+    PyArray_Descr *descr;
+
     if (arr == NULL) {
         return;
     }
+
+    descr = PyArray_DESCR(arr);
+
     if (PyArray_HASFIELDS(arr)) {
         PyObject *key, *value;
-        PyArray_Descr *descr;
+
         Py_ssize_t pos = 0;
 
-        descr = PyArray_DESCR(arr);
         while (PyDict_Next(descr->fields, &pos, &key, &value)) {
             npy_intp offset;
             PyArray_Descr * new;
@@ -2236,14 +2240,28 @@ VOID_copyswapn (char *dst, npy_intp dstride, char *src, npy_intp sstride,
         ((PyArrayObject_fields *)arr)->descr = descr;
         return;
     }
-    if (swap && PyArray_DESCR(arr)->subarray != NULL) {
-        PyArray_Descr *descr, *new;
+    if (PyDataType_HASSUBARRAY(descr)) {
+        PyArray_Descr *new;
         npy_intp num;
         npy_intp i;
         int subitemsize;
         char *dstptr, *srcptr;
+        /*
+         * In certain cases subarray copy can be optimized. This is when
+         * swapping is unecessary and the subarrays data type can certainly
+         * be simply copied (no object, fields, subarray, and not a user dtype).
+         */
+        npy_bool can_optimize_subarray = (!swap &&
+                !PyDataType_HASFIELDS(descr->subarray->base) &&
+                !PyDataType_HASSUBARRAY(descr->subarray->base) &&
+                !PyDataType_REFCHK(descr->subarray->base) &&
+                (descr->subarray->base->type_num < NPY_NTYPES));
+
+        if (can_optimize_subarray) {
+            _basic_copyn(dst, dstride, src, sstride, n, descr->elsize);
+            return;
+        }
 
-        descr = PyArray_DESCR(arr);
         new = descr->subarray->base;
         /*
          * TODO: temporarily modifying the array like this
@@ -2253,6 +2271,10 @@ VOID_copyswapn (char *dst, npy_intp dstride, char *src, npy_intp sstride,
         dstptr = dst;
         srcptr = src;
         subitemsize = new->elsize;
+        if (subitemsize == 0) {
+            /* There cannot be any elements, so return */
+            return;
+        }
         num = descr->elsize / subitemsize;
         for (i = 0; i < n; i++) {
             new->f->copyswapn(dstptr, subitemsize, srcptr,
@@ -2265,22 +2287,26 @@ VOID_copyswapn (char *dst, npy_intp dstride, char *src, npy_intp sstride,
         ((PyArrayObject_fields *)arr)->descr = descr;
         return;
     }
-    _basic_copyn(dst, dstride, src, sstride, n, PyArray_DESCR(arr)->elsize);
+    /* Must be a naive Void type (e.g. a "V8") so simple copy is sufficient. */
+    _basic_copyn(dst, dstride, src, sstride, n, descr->elsize);
     return;
 }
 
 static void
 VOID_copyswap (char *dst, char *src, int swap, PyArrayObject *arr)
 {
+    PyArray_Descr *descr;
+
     if (arr == NULL) {
         return;
     }
+
+    descr = PyArray_DESCR(arr);
+
     if (PyArray_HASFIELDS(arr)) {
         PyObject *key, *value;
-        PyArray_Descr *descr;
         Py_ssize_t pos = 0;
 
-        descr = PyArray_DESCR(arr);
         while (PyDict_Next(descr->fields, &pos, &key, &value)) {
             npy_intp offset;
             PyArray_Descr * new;
@@ -2303,28 +2329,45 @@ VOID_copyswap (char *dst, char *src, int swap, PyArrayObject *arr)
         ((PyArrayObject_fields *)arr)->descr = descr;
         return;
     }
-    if (swap && PyArray_DESCR(arr)->subarray != NULL) {
-        PyArray_Descr *descr, *new;
+    if (PyDataType_HASSUBARRAY(descr)) {
+        PyArray_Descr *new;
         npy_intp num;
-        int itemsize;
+        int subitemsize;
+        /*
+         * In certain cases subarray copy can be optimized. This is when
+         * swapping is unecessary and the subarrays data type can certainly
+         * be simply copied (no object, fields, subarray, and not a user dtype).
+         */
+        npy_bool can_optimize_subarray = (!swap &&
+                !PyDataType_HASFIELDS(descr->subarray->base) &&
+                !PyDataType_HASSUBARRAY(descr->subarray->base) &&
+                !PyDataType_REFCHK(descr->subarray->base) &&
+                (descr->subarray->base->type_num < NPY_NTYPES));
+
+        if (can_optimize_subarray) {
+            _basic_copy(dst, src, descr->elsize);
+            return;
+        }
 
-        descr = PyArray_DESCR(arr);
         new = descr->subarray->base;
         /*
          * TODO: temporarily modifying the array like this
          *       is bad coding style, should be changed.
          */
         ((PyArrayObject_fields *)arr)->descr = new;
-        itemsize = new->elsize;
-        num = descr->elsize / itemsize;
-        new->f->copyswapn(dst, itemsize, src,
-                itemsize, num, swap, arr);
+        subitemsize = new->elsize;
+        if (subitemsize == 0) {
+            /* There cannot be any elements, so return */
+            return;
+        }
+        num = descr->elsize / subitemsize;
+        new->f->copyswapn(dst, subitemsize, src,
+                subitemsize, num, swap, arr);
         ((PyArrayObject_fields *)arr)->descr = descr;
         return;
     }
-
-    /* copy first if needed */
-    _basic_copy(dst, src, PyArray_DESCR(arr)->elsize);
+    /* Must be a naive Void type (e.g. a "V8") so simple copy is sufficient. */
+    _basic_copy(dst, src, descr->elsize);
     return;
 }
 
diff --git a/numpy/core/src/multiarray/buffer.c b/numpy/core/src/multiarray/buffer.c
index 2f66d7f2f..d8ad80266 100644
--- a/numpy/core/src/multiarray/buffer.c
+++ b/numpy/core/src/multiarray/buffer.c
@@ -509,6 +509,10 @@ _buffer_info_new(PyObject *obj)
     PyArray_Descr *descr = NULL;
     int err = 0;
 
+    /*
+     * Note that the buffer info is cached as pyints making them appear like
+     * unreachable lost memory to valgrind.
+     */
     info = malloc(sizeof(_buffer_info_t));
     if (info == NULL) {
         PyErr_NoMemory();
@@ -579,9 +583,11 @@ _buffer_info_new(PyObject *obj)
     err = _buffer_format_string(descr, &fmt, obj, NULL, NULL);
     Py_DECREF(descr);
     if (err != 0) {
+        free(info->shape);
         goto fail;
     }
     if (_append_char(&fmt, '\0') < 0) {
+        free(info->shape);
         goto fail;
     }
     info->format = fmt.s;
diff --git a/numpy/core/src/multiarray/common.c b/numpy/core/src/multiarray/common.c
index 2e51cee7e..addb67732 100644
--- a/numpy/core/src/multiarray/common.c
+++ b/numpy/core/src/multiarray/common.c
@@ -164,7 +164,7 @@ PyArray_DTypeFromObjectHelper(PyObject *obj, int maxdims,
 
             if (string_type == NPY_STRING) {
                 if ((temp = PyObject_Str(obj)) == NULL) {
-                    return -1;
+                    goto fail;
                 }
 #if defined(NPY_PY3K)
     #if PY_VERSION_HEX >= 0x03030000
@@ -182,7 +182,7 @@ PyArray_DTypeFromObjectHelper(PyObject *obj, int maxdims,
 #else
                 if ((temp = PyObject_Unicode(obj)) == NULL) {
 #endif
-                    return -1;
+                    goto fail;
                 }
                 itemsize = PyUnicode_GET_DATA_SIZE(temp);
 #ifndef Py_UNICODE_WIDE
@@ -216,7 +216,7 @@ PyArray_DTypeFromObjectHelper(PyObject *obj, int maxdims,
 
             if (string_type == NPY_STRING) {
                 if ((temp = PyObject_Str(obj)) == NULL) {
-                    return -1;
+                    goto fail;
                 }
 #if defined(NPY_PY3K)
     #if PY_VERSION_HEX >= 0x03030000
@@ -234,7 +234,7 @@ PyArray_DTypeFromObjectHelper(PyObject *obj, int maxdims,
 #else
                 if ((temp = PyObject_Unicode(obj)) == NULL) {
 #endif
-                    return -1;
+                    goto fail;
                 }
                 itemsize = PyUnicode_GET_DATA_SIZE(temp);
 #ifndef Py_UNICODE_WIDE
@@ -511,7 +511,7 @@ promote_types:
         PyArray_Descr *res_dtype = PyArray_PromoteTypes(dtype, *out_dtype);
         Py_DECREF(dtype);
         if (res_dtype == NULL) {
-            return -1;
+            goto fail;
         }
         if (!string_type &&
                 res_dtype->type_num == NPY_UNICODE &&
diff --git a/numpy/core/src/multiarray/common.h b/numpy/core/src/multiarray/common.h
index 2b8d3d3a4..0e162903d 100644
--- a/numpy/core/src/multiarray/common.h
+++ b/numpy/core/src/multiarray/common.h
@@ -182,6 +182,7 @@ check_and_adjust_axis(int *axis, int ndim)
 
 /* used for some alignment checks */
 #define _ALIGN(type) offsetof(struct {char c; type v;}, v)
+#define _UINT_ALIGN(type) npy_uint_alignment(sizeof(type))
 /*
  * Disable harmless compiler warning "4116: unnamed type definition in
  * parentheses" which is caused by the _ALIGN macro.
@@ -201,6 +202,7 @@ npy_is_aligned(const void * p, const npy_uintp alignment)
      * Assumes cast from pointer to uintp gives a sensible representation we
      * can use bitwise & on (not required by C standard, but used by glibc).
      * This test is faster than a direct modulo.
+     * Note alignment value of 0 is allowed and returns False.
      */
     return ((npy_uintp)(p) & ((alignment) - 1)) == 0;
 }
diff --git a/numpy/core/src/multiarray/compiled_base.c b/numpy/core/src/multiarray/compiled_base.c
index 10e3478e2..88924a860 100644
--- a/numpy/core/src/multiarray/compiled_base.c
+++ b/numpy/core/src/multiarray/compiled_base.c
@@ -1575,6 +1575,7 @@ pack_bits(PyObject *input, int axis)
     if (!PyArray_ISBOOL(inp) && !PyArray_ISINTEGER(inp)) {
         PyErr_SetString(PyExc_TypeError,
                 "Expected an input array of integer or boolean data type");
+        Py_DECREF(inp);
         goto fail;
     }
 
@@ -1682,6 +1683,7 @@ unpack_bits(PyObject *input, int axis)
     if (PyArray_TYPE(inp) != NPY_UBYTE) {
         PyErr_SetString(PyExc_TypeError,
                 "Expected an input array of unsigned byte data type");
+        Py_DECREF(inp);
         goto fail;
     }
 
diff --git a/numpy/core/src/multiarray/convert_datatype.c b/numpy/core/src/multiarray/convert_datatype.c
index 33a706412..c59979e75 100644
--- a/numpy/core/src/multiarray/convert_datatype.c
+++ b/numpy/core/src/multiarray/convert_datatype.c
@@ -47,11 +47,10 @@ PyArray_CastToType(PyArrayObject *arr, PyArray_Descr *dtype, int is_f_order)
     PyObject *out;
 
     /* If the requested dtype is flexible, adapt it */
-    PyArray_AdaptFlexibleDType((PyObject *)arr, PyArray_DESCR(arr), &dtype);
+    dtype = PyArray_AdaptFlexibleDType((PyObject *)arr, PyArray_DESCR(arr), dtype);
     if (dtype == NULL) {
         return NULL;
     }
-
     out = PyArray_NewFromDescr(Py_TYPE(arr), dtype,
                                PyArray_NDIM(arr),
                                PyArray_DIMS(arr),
@@ -128,9 +127,9 @@ PyArray_GetCastFunc(PyArray_Descr *descr, int type_num)
 }
 
 /*
- * This function calls Py_DECREF on flex_dtype, and replaces it with
- * a new dtype that has been adapted based on the values in data_dtype
- * and data_obj. If the flex_dtype is not flexible, it leaves it as is.
+ * This function returns a dtype based on flex_dtype and the values in
+ * data_dtype and data_obj. It also calls Py_DECREF on the flex_dtype. If the
+ * flex_dtype is not flexible, it returns it as-is.
  *
  * Usually, if data_obj is not an array, dtype should be the result
  * given by the PyArray_GetArrayParamsFromObject function.
@@ -138,40 +137,37 @@ PyArray_GetCastFunc(PyArray_Descr *descr, int type_num)
  * The data_obj may be NULL if just a dtype is known for the source.
  *
  * If *flex_dtype is NULL, returns immediately, without setting an
- * exception. This basically assumes an error was already set previously.
+ * exception, leaving any previous error handling intact.
  *
  * The current flexible dtypes include NPY_STRING, NPY_UNICODE, NPY_VOID,
  * and NPY_DATETIME with generic units.
  */
-NPY_NO_EXPORT void
+NPY_NO_EXPORT PyArray_Descr *
 PyArray_AdaptFlexibleDType(PyObject *data_obj, PyArray_Descr *data_dtype,
-                            PyArray_Descr **flex_dtype)
+                            PyArray_Descr *flex_dtype)
 {
     PyArray_DatetimeMetaData *meta;
+    PyArray_Descr *retval = NULL;
     int flex_type_num;
 
-    if (*flex_dtype == NULL) {
-        if (!PyErr_Occurred()) {
-            PyErr_SetString(PyExc_RuntimeError,
-                    "NumPy AdaptFlexibleDType was called with NULL flex_dtype "
-                    "but no error set");
-        }
-        return;
+    if (flex_dtype == NULL) {
+        return retval;
     }
 
-    flex_type_num = (*flex_dtype)->type_num;
+    flex_type_num = flex_dtype->type_num;
 
     /* Flexible types with expandable size */
-    if (PyDataType_ISUNSIZED(*flex_dtype)) {
+    if (PyDataType_ISUNSIZED(flex_dtype)) {
         /* First replace the flex_dtype */
-        PyArray_DESCR_REPLACE(*flex_dtype);
-        if (*flex_dtype == NULL) {
-            return;
+        retval = PyArray_DescrNew(flex_dtype);
+        Py_DECREF(flex_dtype);
+        if (retval == NULL) {
+            return retval;
         }
 
         if (data_dtype->type_num == flex_type_num ||
                                     flex_type_num == NPY_VOID) {
-            (*flex_dtype)->elsize = data_dtype->elsize;
+            (retval)->elsize = data_dtype->elsize;
         }
         else if (flex_type_num == NPY_STRING || flex_type_num == NPY_UNICODE) {
             npy_intp size = 8;
@@ -199,7 +195,7 @@ PyArray_AdaptFlexibleDType(PyObject *data_obj, PyArray_Descr *data_dtype,
                     }
                     else if (data_dtype->elsize > 8 ||
                              data_dtype->elsize < 0) {
-                        /* 
+                        /*
                          * Element size should never be greater than 8 or
                          * less than 0 for integer type, but just in case...
                          */
@@ -237,9 +233,8 @@ PyArray_AdaptFlexibleDType(PyObject *data_obj, PyArray_Descr *data_dtype,
                                 PyObject *s = PyObject_Str(list);
                                 if (s == NULL) {
                                     Py_DECREF(list);
-                                    Py_DECREF(*flex_dtype);
-                                    *flex_dtype = NULL;
-                                    return;
+                                    Py_DECREF(retval);
+                                    return NULL;
                                 }
                                 else {
                                     size = PyObject_Length(s);
@@ -262,9 +257,16 @@ PyArray_AdaptFlexibleDType(PyObject *data_obj, PyArray_Descr *data_dtype,
                             list = PyArray_ToList((PyArrayObject *)data_obj);
                             result = PyArray_GetArrayParamsFromObject(
                                     list,
-                                    *flex_dtype,
+                                    retval,
                                     0, &dtype,
                                     &ndim, dims, &arr, NULL);
+                            Py_DECREF(list);
+                            Py_XDECREF(arr);
+                            if (result < 0) {
+                                Py_XDECREF(dtype);
+                                Py_DECREF(retval);
+                                return NULL;
+                            }
                             if (result == 0 && dtype != NULL) {
                                 if (flex_type_num == NPY_UNICODE) {
                                     size = dtype->elsize / 4;
@@ -274,15 +276,12 @@ PyArray_AdaptFlexibleDType(PyObject *data_obj, PyArray_Descr *data_dtype,
                                 }
                             }
                             Py_XDECREF(dtype);
-                            Py_XDECREF(arr);
-                            Py_DECREF(list);
                         }
                         else if (PyArray_IsPythonScalar(data_obj)) {
                             PyObject *s = PyObject_Str(data_obj);
                             if (s == NULL) {
-                                Py_DECREF(*flex_dtype);
-                                *flex_dtype = NULL;
-                                return;
+                                Py_DECREF(retval);
+                                return NULL;
                             }
                             else {
                                 size = PyObject_Length(s);
@@ -301,9 +300,8 @@ PyArray_AdaptFlexibleDType(PyObject *data_obj, PyArray_Descr *data_dtype,
                 case NPY_DATETIME:
                     meta = get_datetime_metadata_from_dtype(data_dtype);
                     if (meta == NULL) {
-                        Py_DECREF(*flex_dtype);
-                        *flex_dtype = NULL;
-                        return;
+                        Py_DECREF(retval);
+                        return NULL;
                     }
                     size = get_datetime_iso_8601_strlen(0, meta->base);
                     break;
@@ -313,10 +311,10 @@ PyArray_AdaptFlexibleDType(PyObject *data_obj, PyArray_Descr *data_dtype,
             }
 
             if (flex_type_num == NPY_STRING) {
-                (*flex_dtype)->elsize = size;
+                retval->elsize = size;
             }
             else if (flex_type_num == NPY_UNICODE) {
-                (*flex_dtype)->elsize = size * 4;
+                retval->elsize = size * 4;
             }
         }
         else {
@@ -326,18 +324,17 @@ PyArray_AdaptFlexibleDType(PyObject *data_obj, PyArray_Descr *data_dtype,
              */
             PyErr_SetString(PyExc_TypeError,
                     "don't know how to adapt flex dtype");
-            *flex_dtype = NULL;
-            return;
+            Py_DECREF(retval);
+            return NULL;
         }
     }
     /* Flexible type with generic time unit that adapts */
     else if (flex_type_num == NPY_DATETIME ||
                 flex_type_num == NPY_TIMEDELTA) {
-        meta = get_datetime_metadata_from_dtype(*flex_dtype);
+        meta = get_datetime_metadata_from_dtype(flex_dtype);
+        retval = flex_dtype;
         if (meta == NULL) {
-            Py_DECREF(*flex_dtype);
-            *flex_dtype = NULL;
-            return;
+            return NULL;
         }
 
         if (meta->base == NPY_FR_GENERIC) {
@@ -345,22 +342,24 @@ PyArray_AdaptFlexibleDType(PyObject *data_obj, PyArray_Descr *data_dtype,
                     data_dtype->type_num == NPY_TIMEDELTA) {
                 meta = get_datetime_metadata_from_dtype(data_dtype);
                 if (meta == NULL) {
-                    Py_DECREF(*flex_dtype);
-                    *flex_dtype = NULL;
-                    return;
+                    return NULL;
                 }
 
-                Py_DECREF(*flex_dtype);
-                *flex_dtype = create_datetime_dtype(flex_type_num, meta);
+                retval = create_datetime_dtype(flex_type_num, meta);
+                Py_DECREF(flex_dtype);
             }
             else if (data_obj != NULL) {
                 /* Detect the unit from the input's data */
-                Py_DECREF(*flex_dtype);
-                *flex_dtype = find_object_datetime_type(data_obj,
+                retval = find_object_datetime_type(data_obj,
                                                     flex_type_num);
+                Py_DECREF(flex_dtype);
             }
         }
     }
+    else {
+        retval = flex_dtype;
+    }
+    return retval;
 }
 
 /*
@@ -518,7 +517,7 @@ PyArray_CanCastTo(PyArray_Descr *from, PyArray_Descr *to)
          * stringified value of the object.
          */
         else if (to_type_num == NPY_STRING || to_type_num == NPY_UNICODE) {
-            /* 
+            /*
              * Boolean value cast to string type is 5 characters max
              * for string 'False'.
              */
@@ -531,7 +530,7 @@ PyArray_CanCastTo(PyArray_Descr *from, PyArray_Descr *to)
             if (PyDataType_ISUNSIZED(to)) {
                 ret = 1;
             }
-            /* 
+            /*
              * Need at least 5 characters to convert from boolean
              * to 'True' or 'False'.
              */
@@ -1166,7 +1165,11 @@ PyArray_PromoteTypes(PyArray_Descr *type1, PyArray_Descr *type2)
                 PyArray_Descr *ret = NULL;
                 PyArray_Descr *temp = PyArray_DescrNew(type1);
                 PyDataType_MAKEUNSIZED(temp);
-                PyArray_AdaptFlexibleDType(NULL, type2, &temp);
+
+                temp = PyArray_AdaptFlexibleDType(NULL, type2, temp);
+                if (temp == NULL) {
+                    return NULL;
+                }
                 if (temp->elsize > type1->elsize) {
                     ret = ensure_dtype_nbo(temp);
                 }
@@ -1204,7 +1207,10 @@ PyArray_PromoteTypes(PyArray_Descr *type1, PyArray_Descr *type2)
                 PyArray_Descr *ret = NULL;
                 PyArray_Descr *temp = PyArray_DescrNew(type1);
                 PyDataType_MAKEUNSIZED(temp);
-                PyArray_AdaptFlexibleDType(NULL, type2, &temp);
+                temp = PyArray_AdaptFlexibleDType(NULL, type2, temp);
+                if (temp == NULL) {
+                    return NULL;
+                }
                 if (temp->elsize > type1->elsize) {
                     ret = ensure_dtype_nbo(temp);
                 }
@@ -1252,7 +1258,10 @@ PyArray_PromoteTypes(PyArray_Descr *type1, PyArray_Descr *type2)
                 PyArray_Descr *ret = NULL;
                 PyArray_Descr *temp = PyArray_DescrNew(type2);
                 PyDataType_MAKEUNSIZED(temp);
-                PyArray_AdaptFlexibleDType(NULL, type1, &temp);
+                temp = PyArray_AdaptFlexibleDType(NULL, type1, temp);
+                if (temp == NULL) {
+                    return NULL;
+                }
                 if (temp->elsize > type2->elsize) {
                     ret = ensure_dtype_nbo(temp);
                 }
@@ -1269,7 +1278,10 @@ PyArray_PromoteTypes(PyArray_Descr *type1, PyArray_Descr *type2)
                 PyArray_Descr *ret = NULL;
                 PyArray_Descr *temp = PyArray_DescrNew(type2);
                 PyDataType_MAKEUNSIZED(temp);
-                PyArray_AdaptFlexibleDType(NULL, type1, &temp);
+                temp = PyArray_AdaptFlexibleDType(NULL, type1, temp);
+                if (temp == NULL) {
+                    return NULL;
+                }
                 if (temp->elsize > type2->elsize) {
                     ret = ensure_dtype_nbo(temp);
                 }
diff --git a/numpy/core/src/multiarray/convert_datatype.h b/numpy/core/src/multiarray/convert_datatype.h
index bf77d699a..653557161 100644
--- a/numpy/core/src/multiarray/convert_datatype.h
+++ b/numpy/core/src/multiarray/convert_datatype.h
@@ -21,13 +21,21 @@ can_cast_scalar_to(PyArray_Descr *scal_type, char *scal_data,
 /*
  * This function calls Py_DECREF on flex_dtype, and replaces it with
  * a new dtype that has been adapted based on the values in data_dtype
- * and data_obj. If the flex_dtype is not flexible, it leaves it as is.
+ * and data_obj. If the flex_dtype is not flexible, it returns it as-is.
+ *
+ * Usually, if data_obj is not an array, dtype should be the result
+ * given by the PyArray_GetArrayParamsFromObject function.
+ *
+ * The data_obj may be NULL if just a dtype is known for the source.
+ *
+ * If *flex_dtype is NULL, returns immediately, without setting an
+ * exception, leaving any previous error handling intact.
  *
  * The current flexible dtypes include NPY_STRING, NPY_UNICODE, NPY_VOID,
  * and NPY_DATETIME with generic units.
  */
-NPY_NO_EXPORT void
+NPY_NO_EXPORT PyArray_Descr *
 PyArray_AdaptFlexibleDType(PyObject *data_obj, PyArray_Descr *data_dtype,
-                            PyArray_Descr **flex_dtype);
+                            PyArray_Descr *flex_dtype);
 
 #endif
diff --git a/numpy/core/src/multiarray/ctors.c b/numpy/core/src/multiarray/ctors.c
index f77e414da..63bf5377a 100644
--- a/numpy/core/src/multiarray/ctors.c
+++ b/numpy/core/src/multiarray/ctors.c
@@ -1410,6 +1410,7 @@ _array_from_buffer_3118(PyObject *memoryview)
          * dimensions, so the array is now 0d.
          */
         nd = 0;
+        Py_DECREF(descr);
         descr = (PyArray_Descr *)PyObject_CallFunctionObjArgs(
                 (PyObject *)&PyArrayDescr_Type, Py_TYPE(view->obj), NULL);
         if (descr == NULL) {
@@ -1811,9 +1812,12 @@ PyArray_FromAny(PyObject *op, PyArray_Descr *newtype, int min_depth,
 
     /* If the requested dtype is flexible, adapt it */
     if (newtype != NULL) {
-        PyArray_AdaptFlexibleDType(op,
+        newtype = PyArray_AdaptFlexibleDType(op,
                     (dtype == NULL) ? PyArray_DESCR(arr) : dtype,
-                    &newtype);
+                    newtype);
+        if (newtype == NULL) {
+            return NULL;
+        }
     }
 
     /* If we got dimensions and dtype instead of an array */
@@ -2128,12 +2132,15 @@ PyArray_FromArray(PyArrayObject *arr, PyArray_Descr *newtype, int flags)
              */
 
             /* 2017-Nov-10 1.14 */
-            if (DEPRECATE("NPY_ARRAY_UPDATEIFCOPY, NPY_ARRAY_INOUT_ARRAY, and "
-                "NPY_ARRAY_INOUT_FARRAY are deprecated, use NPY_WRITEBACKIFCOPY, "
-                "NPY_ARRAY_INOUT_ARRAY2, or NPY_ARRAY_INOUT_FARRAY2 respectively "
-                "instead, and call PyArray_ResolveWritebackIfCopy before the "
-                "array is deallocated, i.e. before the last call to Py_DECREF.") < 0)
+            if (DEPRECATE(
+                    "NPY_ARRAY_UPDATEIFCOPY, NPY_ARRAY_INOUT_ARRAY, and "
+                    "NPY_ARRAY_INOUT_FARRAY are deprecated, use NPY_WRITEBACKIFCOPY, "
+                    "NPY_ARRAY_INOUT_ARRAY2, or NPY_ARRAY_INOUT_FARRAY2 respectively "
+                    "instead, and call PyArray_ResolveWritebackIfCopy before the "
+                    "array is deallocated, i.e. before the last call to Py_DECREF.") < 0) {
+                Py_DECREF(ret);
                 return NULL;
+            }
             Py_INCREF(arr);
             if (PyArray_SetWritebackIfCopyBase(ret, arr) < 0) {
                 Py_DECREF(ret);
@@ -2160,14 +2167,12 @@ PyArray_FromArray(PyArrayObject *arr, PyArray_Descr *newtype, int flags)
 
         Py_DECREF(newtype);
         if (needview) {
-            PyArray_Descr *dtype = PyArray_DESCR(arr);
             PyTypeObject *subtype = NULL;
 
             if (flags & NPY_ARRAY_ENSUREARRAY) {
                 subtype = &PyArray_Type;
             }
 
-            Py_INCREF(dtype);
             ret = (PyArrayObject *)PyArray_View(arr, NULL, subtype);
             if (ret == NULL) {
                 return NULL;
@@ -2827,7 +2832,8 @@ PyArray_CopyAsFlat(PyArrayObject *dst, PyArrayObject *src, NPY_ORDER order)
      * contiguous strides, etc.
      */
     if (PyArray_GetDTypeTransferFunction(
-                    IsUintAligned(src) && IsUintAligned(dst),
+                    IsUintAligned(src) && IsAligned(src) &&
+                    IsUintAligned(dst) && IsAligned(dst),
                     src_stride, dst_stride,
                     PyArray_DESCR(src), PyArray_DESCR(dst),
                     0,
diff --git a/numpy/core/src/multiarray/datetime.c b/numpy/core/src/multiarray/datetime.c
index a8550d958..54d19d993 100644
--- a/numpy/core/src/multiarray/datetime.c
+++ b/numpy/core/src/multiarray/datetime.c
@@ -3822,18 +3822,26 @@ recursive_find_object_timedelta64_type(PyObject *obj,
                  * single object using [()], but not by using
                  * __getitem__(integer) approaches
                  */
-                PyObject *item, *meth, *args;
+                PyObject *item, *args;
 
-                meth = PyObject_GetAttrString(obj, "__getitem__");
-                args = Py_BuildValue("(())");
-                item = PyObject_CallObject(meth, args);
+                args = PyTuple_New(0);
+                if (args == NULL) {
+                    return 0;
+                }
+                item = PyObject_GetItem(obj, args);
+                Py_DECREF(args);
+                if (item == NULL) {
+                    return 0;
+                }
                 /*
                  * NOTE: may need other type checks here in the future
                  * for expanded 0 D datetime array conversions?
                  */
                 if (PyDelta_Check(item)) {
+                    Py_DECREF(item);
                     return delta_checker(meta);
                 }
+                Py_DECREF(item);
             }
         }
     }
diff --git a/numpy/core/src/multiarray/descriptor.c b/numpy/core/src/multiarray/descriptor.c
index 3038e4dea..0471a2a3e 100644
--- a/numpy/core/src/multiarray/descriptor.c
+++ b/numpy/core/src/multiarray/descriptor.c
@@ -515,6 +515,7 @@ _convert_from_array_descr(PyObject *obj, int align)
 #if defined(NPY_PY3K)
             Py_DECREF(name);
 #endif
+            Py_DECREF(conv);
             goto fail;
         }
         dtypeflags |= (conv->flags & NPY_FROM_FIELDS);
@@ -837,9 +838,11 @@ _use_inherit(PyArray_Descr *type, PyObject *newobj, int *errflag)
     else if (new->elsize != conv->elsize) {
         PyErr_SetString(PyExc_ValueError,
                 "mismatch in size of old and new data-descriptor");
+        Py_DECREF(new);
         goto fail;
     }
     else if (invalid_union_object_dtype(new, conv)) {
+        Py_DECREF(new);
         goto fail;
     }
 
@@ -1728,6 +1731,7 @@ PyArray_DescrNew(PyArray_Descr *base)
         newdescr->c_metadata = NPY_AUXDATA_CLONE(base->c_metadata);
         if (newdescr->c_metadata == NULL) {
             PyErr_NoMemory();
+            /* TODO: This seems wrong, as the old fields get decref'd? */
             Py_DECREF(newdescr);
             return NULL;
         }
@@ -3336,12 +3340,15 @@ static PyObject *
 _subscript_by_index(PyArray_Descr *self, Py_ssize_t i)
 {
     PyObject *name = PySequence_GetItem(self->names, i);
+    PyObject *ret;
     if (name == NULL) {
         PyErr_Format(PyExc_IndexError,
                      "Field index %zd out of range.", i);
         return NULL;
     }
-    return _subscript_by_name(self, name);
+    ret = _subscript_by_name(self, name);
+    Py_DECREF(name);
+    return ret;
 }
 
 static PyObject *
diff --git a/numpy/core/src/multiarray/dtype_transfer.c b/numpy/core/src/multiarray/dtype_transfer.c
index 2b29d4f8c..3ab07ad31 100644
--- a/numpy/core/src/multiarray/dtype_transfer.c
+++ b/numpy/core/src/multiarray/dtype_transfer.c
@@ -26,6 +26,7 @@
 #include "_datetime.h"
 #include "datetime_strings.h"
 #include "descriptor.h"
+#include "array_assign.h"
 
 #include "shape.h"
 #include "lowlevel_strided_loops.h"
@@ -1126,7 +1127,7 @@ get_datetime_to_unicode_transfer_function(int aligned,
 
     /* Get an ASCII string data type, adapted to match the UNICODE one */
     str_dtype = PyArray_DescrFromType(NPY_STRING);
-    PyArray_AdaptFlexibleDType(NULL, dst_dtype, &str_dtype);
+    str_dtype = PyArray_AdaptFlexibleDType(NULL, dst_dtype, str_dtype);
     if (str_dtype == NULL) {
         return NPY_FAIL;
     }
@@ -1248,7 +1249,7 @@ get_unicode_to_datetime_transfer_function(int aligned,
 
     /* Get an ASCII string data type, adapted to match the UNICODE one */
     str_dtype = PyArray_DescrFromType(NPY_STRING);
-    PyArray_AdaptFlexibleDType(NULL, src_dtype, &str_dtype);
+    str_dtype = PyArray_AdaptFlexibleDType(NULL, src_dtype, str_dtype);
     if (str_dtype == NULL) {
         return NPY_FAIL;
     }
@@ -1571,12 +1572,30 @@ get_cast_transfer_function(int aligned,
                                 src_dtype,
                                 &tobuffer, &todata);
 
-
-        /* Get the copy/swap operation to dst */
-        PyArray_GetDTypeCopySwapFn(aligned,
-                                dst_itemsize, dst_stride,
-                                dst_dtype,
-                                &frombuffer, &fromdata);
+        if (!PyDataType_REFCHK(dst_dtype)) {
+            /* Copying from buffer is a simple copy/swap operation */
+            PyArray_GetDTypeCopySwapFn(aligned,
+                                    dst_itemsize, dst_stride,
+                                    dst_dtype,
+                                    &frombuffer, &fromdata);
+        }
+        else {
+            /*
+             * Since the buffer is initialized to NULL, need to move the
+             * references in order to DECREF the existing data.
+             */
+             /* Object types cannot be byte swapped */
+            assert(PyDataType_ISNOTSWAPPED(dst_dtype));
+            /* The loop already needs the python api if this is reached */
+            assert(*out_needs_api);
+
+            if (PyArray_GetDTypeTransferFunction(
+                    aligned, dst_itemsize, dst_stride,
+                    dst_dtype, dst_dtype, 1,
+                    &frombuffer, &fromdata, out_needs_api) != NPY_SUCCEED) {
+                return NPY_FAIL;
+            }
+        }
 
         if (frombuffer == NULL || tobuffer == NULL) {
             NPY_AUXDATA_FREE(castdata);
@@ -2000,6 +2019,7 @@ typedef struct {
     _subarray_broadcast_offsetrun offsetruns;
 } _subarray_broadcast_data;
 
+
 /* transfer data free function */
 static void _subarray_broadcast_data_free(NpyAuxData *data)
 {
@@ -3765,11 +3785,15 @@ PyArray_CastRawArrays(npy_intp count,
         return NPY_SUCCEED;
     }
 
-    /* Check data alignment */
-    aligned = (((npy_intp)src | src_stride) &
-                                (src_dtype->alignment - 1)) == 0 &&
-              (((npy_intp)dst | dst_stride) &
-                                (dst_dtype->alignment - 1)) == 0;
+    /* Check data alignment, both uint and true */
+    aligned = raw_array_is_aligned(1, &count, dst, &dst_stride,
+                                   npy_uint_alignment(dst_dtype->elsize)) &&
+              raw_array_is_aligned(1, &count, dst, &dst_stride,
+                                   dst_dtype->alignment) &&
+              raw_array_is_aligned(1, &count, src, &src_stride,
+                                   npy_uint_alignment(src_dtype->elsize)) &&
+              raw_array_is_aligned(1, &count, src, &src_stride,
+                                   src_dtype->alignment);
 
     /* Get the function to do the casting */
     if (PyArray_GetDTypeTransferFunction(aligned,
diff --git a/numpy/core/src/multiarray/einsum.c.src b/numpy/core/src/multiarray/einsum.c.src
index 1765982a0..eb2b33870 100644
--- a/numpy/core/src/multiarray/einsum.c.src
+++ b/numpy/core/src/multiarray/einsum.c.src
@@ -1992,12 +1992,13 @@ parse_output_subscripts(char *subscripts, int length,
 
 
 /*
- * When there's just one operand and no reduction, we
- * can return a view into op.  This calculates the view
- * if possible.
+ * When there's just one operand and no reduction we can return a view
+ * into 'op'.  This calculates the view and stores it in 'ret', if
+ * possible.  Returns -1 on error, 0 otherwise.  Note that a 0 return
+ * does not mean that a view was successfully created.
  */
 static int
-get_single_op_view(PyArrayObject *op, int  iop, char *labels,
+get_single_op_view(PyArrayObject *op, char *labels,
                    int ndim_output, char *output_labels,
                    PyArrayObject **ret)
 {
@@ -2052,13 +2053,11 @@ get_single_op_view(PyArrayObject *op, int  iop, char *labels,
             }
             /* Update the dimensions and strides of the output */
             i = out_label - output_labels;
-            if (new_dims[i] != 0 &&
-                    new_dims[i] != PyArray_DIM(op, idim)) {
+            if (new_dims[i] != 0 && new_dims[i] != PyArray_DIM(op, idim)) {
                 PyErr_Format(PyExc_ValueError,
-                        "dimensions in operand %d for collapsing "
+                        "dimensions in single operand for collapsing "
                         "index '%c' don't match (%d != %d)",
-                        iop, label, (int)new_dims[i],
-                        (int)PyArray_DIM(op, idim));
+                        label, (int)new_dims[i], (int)PyArray_DIM(op, idim));
                 return -1;
             }
             new_dims[i] = PyArray_DIM(op, idim);
@@ -2086,80 +2085,107 @@ get_single_op_view(PyArrayObject *op, int  iop, char *labels,
     return 0;
 }
 
+
+/*
+ * The char type may be either signed or unsigned, we need it to be
+ * signed here.
+ */
+static int
+_any_labels_are_negative(signed char *labels, int ndim)
+{
+    int idim;
+
+    for (idim = 0; idim < ndim; ++idim) {
+        if (labels[idim] < 0) {
+            return 1;
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Given the labels for an operand array, returns a view of the array
+ * with all repeated labels collapsed into a single dimension along
+ * the corresponding diagonal. The labels are also updated to match
+ * the dimensions of the new array. If no label is repeated, the
+ * original array is reference increased and returned unchanged.
+ */
 static PyArrayObject *
 get_combined_dims_view(PyArrayObject *op, int iop, char *labels)
 {
     npy_intp new_strides[NPY_MAXDIMS];
     npy_intp new_dims[NPY_MAXDIMS];
-    int idim, ndim, icombine, combineoffset;
+    int idim, icombine;
     int icombinemap[NPY_MAXDIMS];
-
+    int ndim = PyArray_NDIM(op);
     PyArrayObject *ret = NULL;
 
-    ndim = PyArray_NDIM(op);
+    /* A fast path to avoid unnecessary calculations. */
+    if (!_any_labels_are_negative((signed char *)labels, ndim)) {
+        Py_INCREF(op);
 
-    /* Initialize the dimensions and strides to zero */
-    for (idim = 0; idim < ndim; ++idim) {
-        new_dims[idim] = 0;
-        new_strides[idim] = 0;
+        return op;
     }
 
-    /* Copy the dimensions and strides, except when collapsing */
+    /* Combine repeated labels. */
     icombine = 0;
-    for (idim = 0; idim < ndim; ++idim) {
+    for(idim = 0; idim < ndim; ++idim) {
         /*
          * The char type may be either signed or unsigned, we
          * need it to be signed here.
          */
         int label = (signed char)labels[idim];
-        /* If this label says to merge axes, get the actual label */
-        if (label < 0) {
-            combineoffset = label;
-            label = labels[idim+label];
-        }
-        else {
-            combineoffset = 0;
-            if (icombine != idim) {
-                labels[icombine] = labels[idim];
-            }
+        npy_intp dim = PyArray_DIM(op, idim);
+        npy_intp stride = PyArray_STRIDE(op, idim);
+
+        /* A label seen for the first time, add it to the op view. */
+        if (label >= 0) {
+            /*
+             * icombinemap maps dimensions in the original array to
+             * their position in the combined dimensions view.
+             */
             icombinemap[idim] = icombine;
+            new_dims[icombine] = dim;
+            new_strides[icombine] = stride;
+            ++icombine;
         }
-        /* If the label is 0, it's an unlabeled broadcast dimension */
-        if (label == 0) {
-            new_dims[icombine] = PyArray_DIM(op, idim);
-            new_strides[icombine] = PyArray_STRIDE(op, idim);
-        }
+        /* A repeated label, find the original one and merge them. */
         else {
-            /* Update the combined axis dimensions and strides */
-            int i = icombinemap[idim + combineoffset];
-            if (combineoffset < 0 && new_dims[i] != 0 &&
-                        new_dims[i] != PyArray_DIM(op, idim)) {
+            int i = icombinemap[idim + label];
+
+            icombinemap[idim] = -1;
+            if (new_dims[i] != dim) {
                 PyErr_Format(PyExc_ValueError,
-                        "dimensions in operand %d for collapsing "
-                        "index '%c' don't match (%d != %d)",
-                        iop, label, (int)new_dims[i],
-                        (int)PyArray_DIM(op, idim));
+                             "dimensions in operand %d for collapsing "
+                             "index '%c' don't match (%d != %d)",
+                             iop, label, (int)new_dims[i], (int)dim);
                 return NULL;
             }
-            new_dims[i] = PyArray_DIM(op, idim);
-            new_strides[i] += PyArray_STRIDE(op, idim);
+            new_strides[i] += stride;
         }
+    }
 
-        /* If the label didn't say to combine axes, increment dest i */
-        if (combineoffset == 0) {
-            icombine++;
+    /* Overwrite labels to match the new operand view. */
+    for (idim = 0; idim < ndim; ++idim) {
+        int i = icombinemap[idim];
+
+        if (i >= 0) {
+            labels[i] = labels[idim];
         }
     }
 
-    /* The compressed number of dimensions */
+    /* The number of dimensions of the combined view. */
     ndim = icombine;
 
+    /* Create a view of the operand with the compressed dimensions. */
     Py_INCREF(PyArray_DESCR(op));
     ret = (PyArrayObject *)PyArray_NewFromDescrAndBase(
             Py_TYPE(op), PyArray_DESCR(op),
             ndim, new_dims, new_strides, PyArray_DATA(op),
             PyArray_ISWRITEABLE(op) ? NPY_ARRAY_WRITEABLE : 0,
             (PyObject *)op, (PyObject *)op);
+
     return ret;
 }
 
@@ -2620,6 +2646,24 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop,
         return NULL;
     }
 
+    /*
+     * If there's just one operand and no output parameter,
+     * first try remapping the axes to the output to return
+     * a view instead of a copy.
+     */
+    if (nop == 1 && out == NULL) {
+        ret = NULL;
+
+        if (get_single_op_view(op_in[0], op_labels[0], ndim_output,
+                               output_labels, &ret) < 0) {
+            return NULL;
+        }
+
+        if (ret != NULL) {
+            return ret;
+        }
+    }
+
     /* Set all the op references to NULL */
     for (iop = 0; iop < nop; ++iop) {
         op[iop] = NULL;
@@ -2631,53 +2675,10 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop,
      */
     for (iop = 0; iop < nop; ++iop) {
         char *labels = op_labels[iop];
-        int combine, ndim;
-
-        ndim = PyArray_NDIM(op_in[iop]);
 
-        /*
-         * If there's just one operand and no output parameter,
-         * first try remapping the axes to the output to return
-         * a view instead of a copy.
-         */
-        if (iop == 0 && nop == 1 && out == NULL) {
-            ret = NULL;
-
-            if (get_single_op_view(op_in[iop], iop, labels,
-                                   ndim_output, output_labels,
-                                   &ret) < 0) {
-                return NULL;
-            }
-
-            if (ret != NULL) {
-                return ret;
-            }
-        }
-
-        /*
-         * Check whether any dimensions need to be combined
-         *
-         * The char type may be either signed or unsigned, we
-         * need it to be signed here.
-         */
-        combine = 0;
-        for (idim = 0; idim < ndim; ++idim) {
-            if ((signed char)labels[idim] < 0) {
-                combine = 1;
-            }
-        }
-
-        /* If any dimensions are combined, create a view which combines them */
-        if (combine) {
-            op[iop] = get_combined_dims_view(op_in[iop], iop, labels);
-            if (op[iop] == NULL) {
-                goto fail;
-            }
-        }
-        /* No combining needed */
-        else {
-            Py_INCREF(op_in[iop]);
-            op[iop] = op_in[iop];
+        op[iop] = get_combined_dims_view(op_in[iop], iop, labels);
+        if (op[iop] == NULL) {
+            goto fail;
         }
     }
 
diff --git a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
index 896e466c8..16bacf1ab 100644
--- a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
+++ b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
@@ -82,7 +82,7 @@
 /**begin repeat
  * #elsize = 1, 2, 4, 8, 16#
  * #elsize_half = 0, 1, 2, 4, 8#
- * #type = npy_uint8, npy_uint16, npy_uint32, npy_uint64, npy_uint128#
+ * #type = npy_uint8, npy_uint16, npy_uint32, npy_uint64, npy_uint64#
  */
 /**begin repeat1
  * #oper = strided_to_strided, strided_to_contig,
@@ -119,10 +119,10 @@ static void
                         npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                         NpyAuxData *NPY_UNUSED(data))
 {
-#if @is_aligned@ && @elsize@ != 16
+#if @is_aligned@
     /* sanity check */
-    assert(N == 0 || npy_is_aligned(dst, _ALIGN(@type@)));
-    assert(N == 0 || npy_is_aligned(src, _ALIGN(@type@)));
+    assert(N == 0 || npy_is_aligned(dst, _UINT_ALIGN(@type@)));
+    assert(N == 0 || npy_is_aligned(src, _UINT_ALIGN(@type@)));
 #endif
     /*printf("fn @prefix@_@oper@_size@elsize@\n");*/
     while (N > 0) {
@@ -201,8 +201,8 @@ static NPY_GCC_OPT_3 void
     }
 #if @is_aligned@ && @elsize@ != 16
     /* sanity check */
-    assert(N == 0 || npy_is_aligned(dst, _ALIGN(@type@)));
-    assert(N == 0 || npy_is_aligned(src, _ALIGN(@type@)));
+    assert(N == 0 || npy_is_aligned(dst, _UINT_ALIGN(@type@)));
+    assert(N == 0 || npy_is_aligned(src, _UINT_ALIGN(@type@)));
 #endif
 #if @elsize@ == 1 && @dst_contig@
     memset(dst, *src, N);
@@ -808,12 +808,8 @@ static NPY_GCC_OPT_3 void
 
 #if @aligned@
    /* sanity check */
-#  if !@is_complex1@
     assert(N == 0 || npy_is_aligned(src, _ALIGN(_TYPE1)));
-#  endif
-#  if !@is_complex2@
     assert(N == 0 || npy_is_aligned(dst, _ALIGN(_TYPE2)));
-#  endif
 #endif
 
     /*printf("@prefix@_cast_@name1@_to_@name2@\n");*/
@@ -1425,7 +1421,7 @@ mapiter_trivial_@name@(PyArrayObject *self, PyArrayObject *ind,
         while (itersize--) {
             char * self_ptr;
             npy_intp indval = *((npy_intp*)ind_ptr);
-            assert(npy_is_aligned(ind_ptr, _ALIGN(npy_intp)));
+            assert(npy_is_aligned(ind_ptr, _UINT_ALIGN(npy_intp)));
 #if @isget@
             if (check_and_adjust_index(&indval, fancy_dim, 0, _save) < 0 ) {
                 return -1;
@@ -1439,8 +1435,8 @@ mapiter_trivial_@name@(PyArrayObject *self, PyArrayObject *ind,
 
 #if @isget@
 #if @elsize@
-            assert(npy_is_aligned(result_ptr, _ALIGN(@copytype@)));
-            assert(npy_is_aligned(self_ptr, _ALIGN(@copytype@)));
+            assert(npy_is_aligned(result_ptr, _UINT_ALIGN(@copytype@)));
+            assert(npy_is_aligned(self_ptr, _UINT_ALIGN(@copytype@)));
             *(@copytype@ *)result_ptr = *(@copytype@ *)self_ptr;
 #else
             copyswap(result_ptr, self_ptr, 0, self);
@@ -1448,8 +1444,8 @@ mapiter_trivial_@name@(PyArrayObject *self, PyArrayObject *ind,
 
 #else /* !@isget@ */
 #if @elsize@
-            assert(npy_is_aligned(result_ptr, _ALIGN(@copytype@)));
-            assert(npy_is_aligned(self_ptr, _ALIGN(@copytype@)));
+            assert(npy_is_aligned(result_ptr, _UINT_ALIGN(@copytype@)));
+            assert(npy_is_aligned(self_ptr, _UINT_ALIGN(@copytype@)));
             *(@copytype@ *)self_ptr = *(@copytype@ *)result_ptr;
 #else
             copyswap(self_ptr, result_ptr, 0, self);
@@ -1571,7 +1567,7 @@ mapiter_@name@(PyArrayMapIterObject *mit)
                         for (i=0; i < @numiter@; i++) {
                             npy_intp indval = *((npy_intp*)outer_ptrs[i]);
                             assert(npy_is_aligned(outer_ptrs[i],
-                                                  _ALIGN(npy_intp)));
+                                                  _UINT_ALIGN(npy_intp)));
 
 #if @isget@ && @one_iter@
                             if (check_and_adjust_index(&indval, fancy_dims[i],
@@ -1591,16 +1587,20 @@ mapiter_@name@(PyArrayMapIterObject *mit)
 
 #if @isget@
 #if @elsize@
-                        assert(npy_is_aligned(outer_ptrs[i], _ALIGN(@copytype@)));
-                        assert(npy_is_aligned(self_ptr, _ALIGN(@copytype@)));
+                        assert(npy_is_aligned(outer_ptrs[i],
+                                              _UINT_ALIGN(@copytype@)));
+                        assert(npy_is_aligned(self_ptr,
+                                              _UINT_ALIGN(@copytype@)));
                         *(@copytype@ *)(outer_ptrs[i]) = *(@copytype@ *)self_ptr;
 #else
                         copyswap(outer_ptrs[i], self_ptr, 0, array);
 #endif
 #else /* !@isget@ */
 #if @elsize@
-                        assert(npy_is_aligned(outer_ptrs[i], _ALIGN(@copytype@)));
-                        assert(npy_is_aligned(self_ptr, _ALIGN(@copytype@)));
+                        assert(npy_is_aligned(outer_ptrs[i],
+                               _UINT_ALIGN(@copytype@)));
+                        assert(npy_is_aligned(self_ptr,
+                               _UINT_ALIGN(@copytype@)));
                         *(@copytype@ *)self_ptr = *(@copytype@ *)(outer_ptrs[i]);
 #else
                         copyswap(self_ptr, outer_ptrs[i], 0, array);
diff --git a/numpy/core/src/multiarray/mapping.c b/numpy/core/src/multiarray/mapping.c
index 1b05faeeb..17edd2bbf 100644
--- a/numpy/core/src/multiarray/mapping.c
+++ b/numpy/core/src/multiarray/mapping.c
@@ -1064,7 +1064,8 @@ array_boolean_subscript(PyArrayObject *self,
 
         /* Get a dtype transfer function */
         NpyIter_GetInnerFixedStrideArray(iter, fixed_strides);
-        if (PyArray_GetDTypeTransferFunction(IsUintAligned(self),
+        if (PyArray_GetDTypeTransferFunction(
+                        IsUintAligned(self) && IsAligned(self),
                         fixed_strides[0], itemsize,
                         dtype, dtype,
                         0,
@@ -1253,7 +1254,8 @@ array_assign_boolean_subscript(PyArrayObject *self,
         /* Get a dtype transfer function */
         NpyIter_GetInnerFixedStrideArray(iter, fixed_strides);
         if (PyArray_GetDTypeTransferFunction(
-                        IsUintAligned(self) && IsUintAligned(v),
+                        IsUintAligned(self) && IsAligned(self) &&
+                        IsUintAligned(v) && IsAligned(v),
                         v_stride, fixed_strides[0],
                         PyArray_DESCR(v), PyArray_DESCR(self),
                         0,
diff --git a/numpy/core/src/multiarray/methods.c b/numpy/core/src/multiarray/methods.c
index 7c814e6e6..6005b9751 100644
--- a/numpy/core/src/multiarray/methods.c
+++ b/numpy/core/src/multiarray/methods.c
@@ -8,6 +8,7 @@
 #include "numpy/arrayobject.h"
 #include "numpy/arrayscalars.h"
 
+#include "arrayfunction_override.h"
 #include "npy_config.h"
 #include "npy_pycompat.h"
 #include "npy_import.h"
@@ -823,8 +824,8 @@ array_astype(PyArrayObject *self, PyObject *args, PyObject *kwds)
         PyArrayObject *ret;
 
         /* If the requested dtype is flexible, adapt it */
-        PyArray_AdaptFlexibleDType((PyObject *)self, PyArray_DESCR(self),
-                                                                    &dtype);
+        dtype = PyArray_AdaptFlexibleDType((PyObject *)self,
+                                           PyArray_DESCR(self), dtype);
         if (dtype == NULL) {
             return NULL;
         }
@@ -1088,13 +1089,29 @@ cleanup:
     return result;
 }
 
-
 static PyObject *
-array_function(PyArrayObject *self, PyObject *args, PyObject *kwds)
+array_function(PyArrayObject *self, PyObject *c_args, PyObject *c_kwds)
 {
-    NPY_FORWARD_NDARRAY_METHOD("_array_function");
-}
+    PyObject *func, *types, *args, *kwargs, *result;
+    static char *kwlist[] = {"func", "types", "args", "kwargs", NULL};
+
+    if (!PyArg_ParseTupleAndKeywords(
+            c_args, c_kwds, "OOOO:__array_function__", kwlist,
+            &func, &types, &args, &kwargs)) {
+        return NULL;
+    }
+
+    types = PySequence_Fast(
+        types,
+        "types argument to ndarray.__array_function__ must be iterable");
+    if (types == NULL) {
+        return NULL;
+    }
 
+    result = array_function_method_impl(func, types, args, kwargs);
+    Py_DECREF(types);
+    return result;
+}
 
 static PyObject *
 array_copy(PyArrayObject *self, PyObject *args, PyObject *kwds)
@@ -1364,6 +1381,7 @@ array_argsort(PyArrayObject *self, PyObject *args, PyObject *kwds)
             return NULL;
         }
         newd = PyArray_DescrNew(saved);
+        Py_DECREF(newd->names);
         newd->names = new_name;
         ((PyArrayObject_fields *)self)->descr = newd;
     }
@@ -1418,6 +1436,7 @@ array_argpartition(PyArrayObject *self, PyObject *args, PyObject *kwds)
             return NULL;
         }
         newd = PyArray_DescrNew(saved);
+        Py_DECREF(newd->names);
         newd->names = new_name;
         ((PyArrayObject_fields *)self)->descr = newd;
     }
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index 8135769d9..ce6a3870f 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -34,6 +34,7 @@
 NPY_NO_EXPORT int NPY_NUMUSERTYPES = 0;
 
 /* Internal APIs */
+#include "arrayfunction_override.h"
 #include "arraytypes.h"
 #include "arrayobject.h"
 #include "hashdescr.h"
@@ -408,9 +409,12 @@ PyArray_ConcatenateArrays(int narrays, PyArrayObject **arrays, int axis,
         npy_intp *arr_shape;
 
         if (PyArray_NDIM(arrays[iarrays]) != ndim) {
-            PyErr_SetString(PyExc_ValueError,
-                            "all the input arrays must have same "
-                            "number of dimensions");
+            PyErr_Format(PyExc_ValueError,
+                         "all the input arrays must have same number of "
+                         "dimensions, but the array at index %d has %d "
+                         "dimension(s) and the array at index %d has %d "
+                         "dimension(s)",
+                         0, ndim, iarrays, PyArray_NDIM(arrays[iarrays]));
             return NULL;
         }
         arr_shape = PyArray_SHAPE(arrays[iarrays]);
@@ -422,10 +426,12 @@ PyArray_ConcatenateArrays(int narrays, PyArrayObject **arrays, int axis,
             }
             /* Validate that the rest of the dimensions match */
             else if (shape[idim] != arr_shape[idim]) {
-                PyErr_SetString(PyExc_ValueError,
-                                "all the input array dimensions "
-                                "except for the concatenation axis "
-                                "must match exactly");
+                PyErr_Format(PyExc_ValueError,
+                             "all the input array dimensions for the "
+                             "concatenation axis must match exactly, but "
+                             "along dimension %d, the array at index %d has "
+                             "size %d and the array at index %d has size %d",
+                             idim, 0, shape[idim], iarrays, arr_shape[idim]);
                 return NULL;
             }
         }
@@ -3254,6 +3260,7 @@ array_datetime_data(PyObject *NPY_UNUSED(dummy), PyObject *args)
     }
 
     meta = get_datetime_metadata_from_dtype(dtype);
+    Py_DECREF(dtype);    
     if (meta == NULL) {
         return NULL;
     }
@@ -3618,6 +3625,7 @@ _vec_string_with_args(PyArrayObject* char_array, PyArray_Descr* type,
     if (nargs == -1 || nargs > NPY_MAXARGS) {
         PyErr_Format(PyExc_ValueError,
                 "len(args) must be < %d", NPY_MAXARGS - 1);
+        Py_DECREF(type);
         goto err;
     }
 
@@ -3625,6 +3633,7 @@ _vec_string_with_args(PyArrayObject* char_array, PyArray_Descr* type,
     for (i = 1; i < nargs; i++) {
         PyObject* item = PySequence_GetItem(args, i-1);
         if (item == NULL) {
+            Py_DECREF(type);
             goto err;
         }
         broadcast_args[i] = item;
@@ -3633,6 +3642,7 @@ _vec_string_with_args(PyArrayObject* char_array, PyArray_Descr* type,
     in_iter = (PyArrayMultiIterObject*)PyArray_MultiIterFromObjects
         (broadcast_args, nargs, 0);
     if (in_iter == NULL) {
+        Py_DECREF(type);
         goto err;
     }
     n = in_iter->numiter;
@@ -3713,6 +3723,7 @@ _vec_string_no_args(PyArrayObject* char_array,
 
     in_iter = (PyArrayIterObject*)PyArray_IterNew((PyObject*)char_array);
     if (in_iter == NULL) {
+        Py_DECREF(type);
         goto err;
     }
 
@@ -3769,7 +3780,7 @@ static PyObject *
 _vec_string(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds)
 {
     PyArrayObject* char_array = NULL;
-    PyArray_Descr *type = NULL;
+    PyArray_Descr *type;
     PyObject* method_name;
     PyObject* args_seq = NULL;
 
@@ -3806,6 +3817,7 @@ _vec_string(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds)
         result = _vec_string_with_args(char_array, type, method, args_seq);
     }
     else {
+        Py_DECREF(type);
         PyErr_SetString(PyExc_TypeError,
                 "'args' must be a sequence of arguments");
         goto err;
@@ -4062,6 +4074,9 @@ normalize_axis_index(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwds)
 }
 
 static struct PyMethodDef array_module_methods[] = {
+    {"_get_implementing_args",
+        (PyCFunction)array__get_implementing_args,
+        METH_VARARGS, NULL},
     {"_get_ndarray_c_version",
         (PyCFunction)array__get_ndarray_c_version,
         METH_VARARGS|METH_KEYWORDS, NULL},
@@ -4224,6 +4239,9 @@ static struct PyMethodDef array_module_methods[] = {
         METH_VARARGS | METH_KEYWORDS, NULL},
     {"_monotonicity", (PyCFunction)arr__monotonicity,
         METH_VARARGS | METH_KEYWORDS, NULL},
+    {"implement_array_function",
+        (PyCFunction)array_implement_array_function,
+        METH_VARARGS, NULL},
     {"interp", (PyCFunction)arr_interp,
         METH_VARARGS | METH_KEYWORDS, NULL},
     {"interp_complex", (PyCFunction)arr_interp_complex,
@@ -4476,6 +4494,7 @@ NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_array_wrap = NULL;
 NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_array_finalize = NULL;
 NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_buffer = NULL;
 NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_ufunc = NULL;
+NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_wrapped = NULL;
 NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_order = NULL;
 NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_copy = NULL;
 NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_dtype = NULL;
@@ -4492,6 +4511,7 @@ intern_strings(void)
     npy_ma_str_array_finalize = PyUString_InternFromString("__array_finalize__");
     npy_ma_str_buffer = PyUString_InternFromString("__buffer__");
     npy_ma_str_ufunc = PyUString_InternFromString("__array_ufunc__");
+    npy_ma_str_wrapped = PyUString_InternFromString("__wrapped__");
     npy_ma_str_order = PyUString_InternFromString("order");
     npy_ma_str_copy = PyUString_InternFromString("copy");
     npy_ma_str_dtype = PyUString_InternFromString("dtype");
@@ -4501,7 +4521,7 @@ intern_strings(void)
 
     return npy_ma_str_array && npy_ma_str_array_prepare &&
            npy_ma_str_array_wrap && npy_ma_str_array_finalize &&
-           npy_ma_str_buffer && npy_ma_str_ufunc &&
+           npy_ma_str_buffer && npy_ma_str_ufunc && npy_ma_str_wrapped &&
            npy_ma_str_order && npy_ma_str_copy && npy_ma_str_dtype &&
            npy_ma_str_ndmin && npy_ma_str_axis1 && npy_ma_str_axis2;
 }
diff --git a/numpy/core/src/multiarray/multiarraymodule.h b/numpy/core/src/multiarray/multiarraymodule.h
index 3de68c549..60a3965c9 100644
--- a/numpy/core/src/multiarray/multiarraymodule.h
+++ b/numpy/core/src/multiarray/multiarraymodule.h
@@ -7,6 +7,7 @@ NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_array_wrap;
 NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_array_finalize;
 NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_buffer;
 NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_ufunc;
+NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_wrapped;
 NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_order;
 NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_copy;
 NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_dtype;
diff --git a/numpy/core/src/multiarray/nditer_constr.c b/numpy/core/src/multiarray/nditer_constr.c
index dbb24f26b..dc58b3a78 100644
--- a/numpy/core/src/multiarray/nditer_constr.c
+++ b/numpy/core/src/multiarray/nditer_constr.c
@@ -1101,8 +1101,8 @@ npyiter_prepare_one_operand(PyArrayObject **op,
             /* We just have a borrowed reference to op_request_dtype */
             Py_INCREF(op_request_dtype);
             /* If the requested dtype is flexible, adapt it */
-            PyArray_AdaptFlexibleDType((PyObject *)(*op), PyArray_DESCR(*op),
-                                        &op_request_dtype);
+            op_request_dtype = PyArray_AdaptFlexibleDType((PyObject *)(*op), PyArray_DESCR(*op),
+                                                          op_request_dtype);
             if (op_request_dtype == NULL) {
                 return 0;
             }
@@ -1132,7 +1132,7 @@ npyiter_prepare_one_operand(PyArrayObject **op,
         /* Check if the operand is aligned */
         if (op_flags & NPY_ITER_ALIGNED) {
             /* Check alignment */
-            if (!IsUintAligned(*op)) {
+            if (!IsAligned(*op)) {
                 NPY_IT_DBG_PRINT("Iterator: Setting NPY_OP_ITFLAG_CAST "
                                     "because of NPY_ITER_ALIGNED\n");
                 *op_itflags |= NPY_OP_ITFLAG_CAST;
@@ -1248,9 +1248,9 @@ npyiter_prepare_operands(int nop, PyArrayObject **op_in,
     return 1;
 
   fail_nop:
-    iop = nop;
+    iop = nop - 1;
   fail_iop:
-    for (i = 0; i < iop; ++i) {
+    for (i = 0; i < iop+1; ++i) {
         Py_XDECREF(op[i]);
         Py_XDECREF(op_dtype[i]);
     }
@@ -2851,8 +2851,14 @@ npyiter_allocate_arrays(NpyIter *iter,
             npyiter_replace_axisdata(iter, iop, op[iop], ondim,
                     PyArray_DATA(op[iop]), op_axes ? op_axes[iop] : NULL);
 
-            /* New arrays are aligned and need no cast */
-            op_itflags[iop] |= NPY_OP_ITFLAG_ALIGNED;
+            /*
+             * New arrays are guaranteed true-aligned, but copy/cast code
+             * needs uint-alignment in addition.
+             */
+            if (IsUintAligned(out)) {
+                op_itflags[iop] |= NPY_OP_ITFLAG_ALIGNED;
+            }
+            /* New arrays need no cast */
             op_itflags[iop] &= ~NPY_OP_ITFLAG_CAST;
         }
         /*
@@ -2888,11 +2894,17 @@ npyiter_allocate_arrays(NpyIter *iter,
                     PyArray_DATA(op[iop]), NULL);
 
             /*
-             * New arrays are aligned need no cast, and in the case
+             * New arrays are guaranteed true-aligned, but copy/cast code
+             * needs uint-alignment in addition.
+             */
+            if (IsUintAligned(temp)) {
+                op_itflags[iop] |= NPY_OP_ITFLAG_ALIGNED;
+            }
+            /*
+             * New arrays need no cast, and in the case
              * of scalars, always have stride 0 so never need buffering
              */
-            op_itflags[iop] |= (NPY_OP_ITFLAG_ALIGNED |
-                                  NPY_OP_ITFLAG_BUFNEVER);
+            op_itflags[iop] |= NPY_OP_ITFLAG_BUFNEVER;
             op_itflags[iop] &= ~NPY_OP_ITFLAG_CAST;
             if (itflags & NPY_ITFLAG_BUFFER) {
                 NBF_STRIDES(bufferdata)[iop] = 0;
@@ -2953,8 +2965,14 @@ npyiter_allocate_arrays(NpyIter *iter,
             npyiter_replace_axisdata(iter, iop, op[iop], ondim,
                     PyArray_DATA(op[iop]), op_axes ? op_axes[iop] : NULL);
 
-            /* The temporary copy is aligned and needs no cast */
-            op_itflags[iop] |= NPY_OP_ITFLAG_ALIGNED;
+            /*
+             * New arrays are guaranteed true-aligned, but copy/cast code
+             * additionally needs uint-alignment in addition.
+             */
+            if (IsUintAligned(temp)) {
+                op_itflags[iop] |= NPY_OP_ITFLAG_ALIGNED;
+            }
+            /* The temporary copy needs no cast */
             op_itflags[iop] &= ~NPY_OP_ITFLAG_CAST;
         }
         else {
@@ -3157,6 +3175,7 @@ npyiter_allocate_transfer_functions(NpyIter *iter)
                                         &stransfer,
                                         &transferdata,
                                         &needs_api) != NPY_SUCCEED) {
+                    iop -= 1;  /* This one cannot be cleaned up yet. */
                     goto fail;
                 }
                 readtransferfn[iop] = stransfer;
@@ -3250,7 +3269,7 @@ npyiter_allocate_transfer_functions(NpyIter *iter)
     return 1;
 
 fail:
-    for (i = 0; i < iop; ++i) {
+    for (i = 0; i < iop+1; ++i) {
         if (readtransferdata[iop] != NULL) {
             NPY_AUXDATA_FREE(readtransferdata[iop]);
             readtransferdata[iop] = NULL;
diff --git a/numpy/core/src/multiarray/nditer_pywrap.c b/numpy/core/src/multiarray/nditer_pywrap.c
index 5a9f3c5fa..30a81e0ca 100644
--- a/numpy/core/src/multiarray/nditer_pywrap.c
+++ b/numpy/core/src/multiarray/nditer_pywrap.c
@@ -2355,6 +2355,8 @@ npyiter_close(NewNpyArrayIterObject *self)
     }
     ret = NpyIter_Deallocate(iter);
     self->iter = NULL;
+    Py_XDECREF(self->nested_child);
+    self->nested_child = NULL;
     if (ret < 0) {
         return NULL;
     }
diff --git a/numpy/core/src/multiarray/number.c b/numpy/core/src/multiarray/number.c
index d153a8a64..420501ce2 100644
--- a/numpy/core/src/multiarray/number.c
+++ b/numpy/core/src/multiarray/number.c
@@ -599,15 +599,16 @@ array_positive(PyArrayObject *m1)
             PyErr_Restore(exc, val, tb);
             return NULL;
         }
+        Py_XDECREF(exc);
+        Py_XDECREF(val);
+        Py_XDECREF(tb);
+
         /* 2018-06-28, 1.16.0 */
         if (DEPRECATE("Applying '+' to a non-numerical array is "
                       "ill-defined. Returning a copy, but in the future "
                       "this will error.") < 0) {
             return NULL;
         }
-        Py_XDECREF(exc);
-        Py_XDECREF(val);
-        Py_XDECREF(tb);
         value = PyArray_Return((PyArrayObject *)PyArray_Copy(m1));
     }
     return value;
diff --git a/numpy/core/src/multiarray/refcount.c b/numpy/core/src/multiarray/refcount.c
index 4b018b056..b8230c81a 100644
--- a/numpy/core/src/multiarray/refcount.c
+++ b/numpy/core/src/multiarray/refcount.c
@@ -19,8 +19,12 @@
 static void
 _fillobject(char *optr, PyObject *obj, PyArray_Descr *dtype);
 
-/* Incref all objects found at this record */
+
 /*NUMPY_API
+ * XINCREF all objects in a single array item. This is complicated for
+ * structured datatypes where the position of objects needs to be extracted.
+ * The function is execute recursively for each nested field or subarrays dtype
+ * such as as `np.dtype([("field1", "O"), ("field2", "f,O", (3,2))])`
  */
 NPY_NO_EXPORT void
 PyArray_Item_INCREF(char *data, PyArray_Descr *descr)
@@ -51,11 +55,37 @@ PyArray_Item_INCREF(char *data, PyArray_Descr *descr)
             PyArray_Item_INCREF(data + offset, new);
         }
     }
+    else if (PyDataType_HASSUBARRAY(descr)) {
+        int size, i, inner_elsize;
+
+        inner_elsize = descr->subarray->base->elsize;
+        if (inner_elsize == 0) {
+            /* There cannot be any elements, so return */
+            return;
+        }
+        /* Subarrays are always contiguous in memory */
+        size = descr->elsize / inner_elsize;
+
+        for (i = 0; i < size; i++){
+            /* Recursively increment the reference count of subarray elements */
+            PyArray_Item_INCREF(data + i * inner_elsize,
+                                descr->subarray->base);
+        }
+    }
+    else {
+        /* This path should not be reachable. */
+        assert(0);
+    }
     return;
 }
 
-/* XDECREF all objects found at this record */
+
 /*NUMPY_API
+ *
+ * XDECREF all objects in a single array item. This is complicated for
+ * structured datatypes where the position of objects needs to be extracted.
+ * The function is execute recursively for each nested field or subarrays dtype
+ * such as as `np.dtype([("field1", "O"), ("field2", "f,O", (3,2))])`
  */
 NPY_NO_EXPORT void
 PyArray_Item_XDECREF(char *data, PyArray_Descr *descr)
@@ -87,6 +117,27 @@ PyArray_Item_XDECREF(char *data, PyArray_Descr *descr)
                 PyArray_Item_XDECREF(data + offset, new);
             }
         }
+    else if (PyDataType_HASSUBARRAY(descr)) {
+        int size, i, inner_elsize;
+
+        inner_elsize = descr->subarray->base->elsize;
+        if (inner_elsize == 0) {
+            /* There cannot be any elements, so return */
+            return;
+        }
+        /* Subarrays are always contiguous in memory */
+        size = descr->elsize / inner_elsize;
+
+        for (i = 0; i < size; i++){
+            /* Recursively decrement the reference count of subarray elements */
+            PyArray_Item_XDECREF(data + i * inner_elsize,
+                                 descr->subarray->base);
+        }
+    }
+    else {
+        /* This path should not be reachable. */
+        assert(0);
+    }
     return;
 }
 
@@ -258,6 +309,10 @@ _fillobject(char *optr, PyObject *obj, PyArray_Descr *dtype)
             Py_XDECREF(arr);
         }
     }
+    if (dtype->type_num == NPY_OBJECT) {
+        Py_XINCREF(obj);
+        NPY_COPY_PYOBJECT_PTR(optr, &obj);
+    }
     else if (PyDataType_HASFIELDS(dtype)) {
         PyObject *key, *value, *title = NULL;
         PyArray_Descr *new;
@@ -274,15 +329,26 @@ _fillobject(char *optr, PyObject *obj, PyArray_Descr *dtype)
             _fillobject(optr + offset, obj, new);
         }
     }
-    else {
-        npy_intp i;
-        npy_intp nsize = dtype->elsize / sizeof(obj);
+    else if (PyDataType_HASSUBARRAY(dtype)) {
+        int size, i, inner_elsize;
 
-        for (i = 0; i < nsize; i++) {
-            Py_XINCREF(obj);
-            NPY_COPY_PYOBJECT_PTR(optr, &obj);
-            optr += sizeof(obj);
+        inner_elsize = dtype->subarray->base->elsize;
+        if (inner_elsize == 0) {
+            /* There cannot be any elements, so return */
+            return;
+        }
+        /* Subarrays are always contiguous in memory */
+        size = dtype->elsize / inner_elsize;
+
+        /* Call _fillobject on each item recursively. */
+        for (i = 0; i < size; i++){
+            _fillobject(optr, obj, dtype->subarray->base);
+            optr += inner_elsize;
         }
-        return;
     }
+    else {
+        /* This path should not be reachable. */
+        assert(0);
+    }
+    return;
 }
diff --git a/numpy/core/src/multiarray/scalartypes.c.src b/numpy/core/src/multiarray/scalartypes.c.src
index 2f71c8ae9..52de31289 100644
--- a/numpy/core/src/multiarray/scalartypes.c.src
+++ b/numpy/core/src/multiarray/scalartypes.c.src
@@ -2599,6 +2599,8 @@ NPY_NO_EXPORT PyTypeObject PyGenericArrType_Type = {
 static void
 void_dealloc(PyVoidScalarObject *v)
 {
+    _dealloc_cached_buffer_info((PyObject *)v);
+
     if (v->flags & NPY_ARRAY_OWNDATA) {
         npy_free_cache(v->obval, Py_SIZE(v));
     }
diff --git a/numpy/core/src/multiarray/usertypes.c b/numpy/core/src/multiarray/usertypes.c
index 8e8090002..2e8fb514f 100644
--- a/numpy/core/src/multiarray/usertypes.c
+++ b/numpy/core/src/multiarray/usertypes.c
@@ -40,19 +40,27 @@ maintainer email:  oliphant.travis@ieee.org
 
 NPY_NO_EXPORT PyArray_Descr **userdescrs=NULL;
 
-static int *
-_append_new(int *types, int insert)
+static int
+_append_new(int **p_types, int insert)
 {
     int n = 0;
     int *newtypes;
+    int *types = *p_types;
 
     while (types[n] != NPY_NOTYPE) {
         n++;
     }
     newtypes = (int *)realloc(types, (n + 2)*sizeof(int));
+    if (newtypes == NULL) {
+        PyErr_NoMemory();
+        return -1;
+    }
     newtypes[n] = insert;
     newtypes[n + 1] = NPY_NOTYPE;
-    return newtypes;
+
+    /* Replace the passed-in pointer */
+    *p_types = newtypes;
+    return 0;
 }
 
 static npy_bool
@@ -247,10 +255,13 @@ PyArray_RegisterCanCast(PyArray_Descr *descr, int totype,
          */
         if (descr->f->cancastto == NULL) {
             descr->f->cancastto = (int *)malloc(1*sizeof(int));
+            if (descr->f->cancastto == NULL) {
+                PyErr_NoMemory();
+                return -1;
+            }
             descr->f->cancastto[0] = NPY_NOTYPE;
         }
-        descr->f->cancastto = _append_new(descr->f->cancastto,
-                                          totype);
+        return _append_new(&descr->f->cancastto, totype);
     }
     else {
         /* register with cancastscalarkindto */
@@ -258,6 +269,10 @@ PyArray_RegisterCanCast(PyArray_Descr *descr, int totype,
             int i;
             descr->f->cancastscalarkindto =
                 (int **)malloc(NPY_NSCALARKINDS* sizeof(int*));
+            if (descr->f->cancastscalarkindto == NULL) {
+                PyErr_NoMemory();
+                return -1;
+            }
             for (i = 0; i < NPY_NSCALARKINDS; i++) {
                 descr->f->cancastscalarkindto[i] = NULL;
             }
@@ -265,11 +280,13 @@ PyArray_RegisterCanCast(PyArray_Descr *descr, int totype,
         if (descr->f->cancastscalarkindto[scalar] == NULL) {
             descr->f->cancastscalarkindto[scalar] =
                 (int *)malloc(1*sizeof(int));
+            if (descr->f->cancastscalarkindto[scalar] == NULL) {
+                PyErr_NoMemory();
+                return -1;
+            }
             descr->f->cancastscalarkindto[scalar][0] =
                 NPY_NOTYPE;
         }
-        descr->f->cancastscalarkindto[scalar] =
-            _append_new(descr->f->cancastscalarkindto[scalar], totype);
+        return _append_new(&descr->f->cancastscalarkindto[scalar], totype);
     }
-    return 0;
 }
diff --git a/numpy/core/src/umath/_struct_ufunc_tests.c.src b/numpy/core/src/umath/_struct_ufunc_tests.c.src
index b831d5c2a..5c6e235e0 100644
--- a/numpy/core/src/umath/_struct_ufunc_tests.c.src
+++ b/numpy/core/src/umath/_struct_ufunc_tests.c.src
@@ -114,6 +114,7 @@ PyMODINIT_FUNC init_struct_ufunc_tests(void)
                                 dtypes,
                                 NULL);
 
+    Py_DECREF(dtype);
     d = PyModule_GetDict(m);
 
     PyDict_SetItemString(d, "add_triplet", add_triplet);
diff --git a/numpy/core/src/umath/_umath_tests.c.src b/numpy/core/src/umath/_umath_tests.c.src
index 8cb74f177..6c3bcce71 100644
--- a/numpy/core/src/umath/_umath_tests.c.src
+++ b/numpy/core/src/umath/_umath_tests.c.src
@@ -564,7 +564,7 @@ UMath_Tests_test_signature(PyObject *NPY_UNUSED(dummy), PyObject *args)
         core_dim_sizes = Py_None;
     }
     Py_DECREF(f);
-    return Py_BuildValue("iOOOO", core_enabled, core_num_dims,
+    return Py_BuildValue("iNNNN", core_enabled, core_num_dims,
                          core_dim_ixs, core_dim_flags, core_dim_sizes);
 
 fail:
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index ae3ece77b..6accf3065 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -566,16 +566,36 @@ NPY_NO_EXPORT void
 PyUFunc_O_O_method(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
 {
     char *meth = (char *)func;
+    PyObject *tup = PyTuple_New(0);
+    if (tup == NULL) {
+        return;
+    }
     UNARY_LOOP {
         PyObject *in1 = *(PyObject **)ip1;
         PyObject **out = (PyObject **)op1;
-        PyObject *ret = PyObject_CallMethod(in1 ? in1 : Py_None, meth, NULL);
+        PyObject *ret, *func;
+        func = PyObject_GetAttrString(in1 ? in1 : Py_None, meth);
+        if (func == NULL || !PyCallable_Check(func)) {
+            PyObject *exc, *val, *tb;
+            PyTypeObject *type = in1 ? Py_TYPE(in1) : Py_TYPE(Py_None);
+            PyErr_Fetch(&exc, &val, &tb);
+            PyErr_Format(PyExc_TypeError,
+                         "loop of ufunc does not support argument %d of "
+                         "type %s which has no callable %s method",
+                         i, type->tp_name, meth);
+            npy_PyErr_ChainExceptionsCause(exc, val, tb);
+            Py_DECREF(tup);
+            return;
+        }
+        ret = PyObject_Call(func, tup, NULL);
         if (ret == NULL) {
+            Py_DECREF(tup);
             return;
         }
         Py_XDECREF(*out);
         *out = ret;
     }
+    Py_DECREF(tup);
 }
 
 /*UFUNC_API*/
@@ -1619,6 +1639,31 @@ TIMEDELTA_mm_m_remainder(char **args, npy_intp *dimensions, npy_intp *steps, voi
     }
 }
 
+NPY_NO_EXPORT void
+TIMEDELTA_mm_q_floor_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_timedelta in1 = *(npy_timedelta *)ip1;
+        const npy_timedelta in2 = *(npy_timedelta *)ip2;
+        if (in1 == NPY_DATETIME_NAT || in2 == NPY_DATETIME_NAT) {
+            npy_set_floatstatus_invalid();
+            *((npy_timedelta *)op1) = 0;
+        }
+        else if (in2 == 0) {
+            npy_set_floatstatus_divbyzero();
+            *((npy_timedelta *)op1) = 0;
+        }
+        else {
+            if (((in1 > 0) != (in2 > 0)) && (in1 % in2 != 0)) {
+                *((npy_timedelta *)op1) = in1/in2 - 1;
+            }
+            else {
+                *((npy_timedelta *)op1) = in1/in2;
+            }
+        }
+    }
+}
+
 /*
  *****************************************************************************
  **                             FLOAT LOOPS                                 **
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 9b6327308..3c908121e 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -474,6 +474,9 @@ NPY_NO_EXPORT void
 TIMEDELTA_mm_d_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
 
 NPY_NO_EXPORT void
+TIMEDELTA_mm_q_floor_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
 TIMEDELTA_mm_m_remainder(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
 
 /* Special case equivalents to above functions */
diff --git a/numpy/core/src/umath/override.c b/numpy/core/src/umath/override.c
index 2ea23311b..8d67f96ac 100644
--- a/numpy/core/src/umath/override.c
+++ b/numpy/core/src/umath/override.c
@@ -226,14 +226,14 @@ normalize_reduce_args(PyUFuncObject *ufunc, PyObject *args,
     PyObject *obj;
     static PyObject *NoValue = NULL;
     static char *kwlist[] = {"array", "axis", "dtype", "out", "keepdims",
-        "initial"};
+                             "initial", "where"};
 
     npy_cache_import("numpy", "_NoValue", &NoValue);
     if (NoValue == NULL) return -1;
 
-    if (nargs < 1 || nargs > 6) {
+    if (nargs < 1 || nargs > 7) {
         PyErr_Format(PyExc_TypeError,
-                     "ufunc.reduce() takes from 1 to 6 positional "
+                     "ufunc.reduce() takes from 1 to 7 positional "
                      "arguments but %"NPY_INTP_FMT" were given", nargs);
         return -1;
     }
diff --git a/numpy/core/src/umath/reduction.c b/numpy/core/src/umath/reduction.c
index 6d04ce372..4174e69a8 100644
--- a/numpy/core/src/umath/reduction.c
+++ b/numpy/core/src/umath/reduction.c
@@ -186,7 +186,6 @@ conform_reduce_result(int ndim, npy_bool *axis_flags,
             return NULL;
         }
 
-        Py_INCREF(ret);
         if (PyArray_SetWritebackIfCopyBase(ret_copy, (PyArrayObject *)ret) < 0) {
             Py_DECREF(ret);
             Py_DECREF(ret_copy);
@@ -444,9 +443,9 @@ PyUFunc_ReduceWrapper(PyArrayObject *operand, PyArrayObject *out,
 
     /* Iterator parameters */
     NpyIter *iter = NULL;
-    PyArrayObject *op[2];
-    PyArray_Descr *op_dtypes[2];
-    npy_uint32 flags, op_flags[2];
+    PyArrayObject *op[3];
+    PyArray_Descr *op_dtypes[3];
+    npy_uint32 flags, op_flags[3];
 
     /* More than one axis means multiple orders are possible */
     if (!reorderable && count_axes(PyArray_NDIM(operand), axis_flags) > 1) {
@@ -456,13 +455,12 @@ PyUFunc_ReduceWrapper(PyArrayObject *operand, PyArrayObject *out,
                      funcname);
         return NULL;
     }
-
-
-    /* Validate that the parameters for future expansion are NULL */
-    if (wheremask != NULL) {
-        PyErr_SetString(PyExc_RuntimeError,
-                "Reduce operations in NumPy do not yet support "
-                "a where mask");
+    /* Can only use where with an initial ( from identity or argument) */
+    if (wheremask != NULL && identity == Py_None) {
+        PyErr_Format(PyExc_ValueError,
+                     "reduction operation '%s' does not have an identity, "
+                     "so to use a where mask one has to specify 'initial'",
+                     funcname);
         return NULL;
     }
 
@@ -524,8 +522,16 @@ PyUFunc_ReduceWrapper(PyArrayObject *operand, PyArrayObject *out,
                   NPY_ITER_NO_SUBTYPE;
     op_flags[1] = NPY_ITER_READONLY |
                   NPY_ITER_ALIGNED;
+    if (wheremask != NULL) {
+        op[2] = wheremask;
+        op_dtypes[2] = PyArray_DescrFromType(NPY_BOOL);
+        if (op_dtypes[2] == NULL) {
+            goto fail;
+        }
+        op_flags[2] = NPY_ITER_READONLY;
+    }
 
-    iter = NpyIter_AdvancedNew(2, op, flags,
+    iter = NpyIter_AdvancedNew(wheremask == NULL ? 2 : 3, op, flags,
                                NPY_KEEPORDER, casting,
                                op_flags,
                                op_dtypes,
@@ -568,7 +574,7 @@ PyUFunc_ReduceWrapper(PyArrayObject *operand, PyArrayObject *out,
             goto fail;
         }
     }
-    
+
     /* Check whether any errors occurred during the loop */
     if (PyErr_Occurred() ||
             _check_ufunc_fperr(errormask, NULL, "reduce") < 0) {
diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c
index ea0007a9d..ab986caa1 100644
--- a/numpy/core/src/umath/ufunc_object.c
+++ b/numpy/core/src/umath/ufunc_object.c
@@ -3063,8 +3063,10 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *ufunc,
     Py_XDECREF(axis);
     Py_XDECREF(full_args.in);
     Py_XDECREF(full_args.out);
+    PyArray_free(remap_axis_memory);
+    PyArray_free(remap_axis);
 
-    NPY_UF_DBG_PRINT1("Returning code %d\n", reval);
+    NPY_UF_DBG_PRINT1("Returning code %d\n", retval);
 
     return retval;
 
@@ -3466,12 +3468,15 @@ reduce_loop(NpyIter *iter, char **dataptrs, npy_intp *strides,
     PyUFuncObject *ufunc = (PyUFuncObject *)data;
     char *dataptrs_copy[3];
     npy_intp strides_copy[3];
+    npy_bool masked;
 
     /* The normal selected inner loop */
     PyUFuncGenericFunction innerloop = NULL;
     void *innerloopdata = NULL;
 
     NPY_BEGIN_THREADS_DEF;
+    /* Get the number of operands, to determine whether "where" is used */
+    masked = (NpyIter_GetNOp(iter) == 3);
 
     /* Get the inner loop */
     iter_dtypes = NpyIter_GetDescrArray(iter);
@@ -3531,8 +3536,36 @@ reduce_loop(NpyIter *iter, char **dataptrs, npy_intp *strides,
         strides_copy[0] = strides[0];
         strides_copy[1] = strides[1];
         strides_copy[2] = strides[0];
-        innerloop(dataptrs_copy, countptr,
-                    strides_copy, innerloopdata);
+
+        if (!masked) {
+            innerloop(dataptrs_copy, countptr,
+                      strides_copy, innerloopdata);
+        }
+        else {
+            npy_intp count = *countptr;
+            char *maskptr = dataptrs[2];
+            npy_intp mask_stride = strides[2];
+            /* Optimization for when the mask is broadcast */
+            npy_intp n = mask_stride == 0 ? count : 1;
+            while (count) {
+                char mask = *maskptr;
+                maskptr += mask_stride;
+                while (n < count && mask == *maskptr) {
+                    n++;
+                    maskptr += mask_stride;
+                }
+                /* If mask set, apply inner loop on this contiguous region */
+                if (mask) {
+                    innerloop(dataptrs_copy, &n,
+                              strides_copy, innerloopdata);
+                }
+                dataptrs_copy[0] += n * strides[0];
+                dataptrs_copy[1] += n * strides[1];
+                dataptrs_copy[2] = dataptrs_copy[0];
+                count -= n;
+                n = 1;
+            }
+        }
     } while (iternext(iter));
 
 finish_loop:
@@ -3561,7 +3594,7 @@ finish_loop:
 static PyArrayObject *
 PyUFunc_Reduce(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out,
         int naxes, int *axes, PyArray_Descr *odtype, int keepdims,
-        PyObject *initial)
+        PyObject *initial, PyArrayObject *wheremask)
 {
     int iaxes, ndim;
     npy_bool reorderable;
@@ -3627,7 +3660,7 @@ PyUFunc_Reduce(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out,
         return NULL;
     }
 
-    result = PyUFunc_ReduceWrapper(arr, out, NULL, dtype, dtype,
+    result = PyUFunc_ReduceWrapper(arr, out, wheremask, dtype, dtype,
                                    NPY_UNSAFE_CASTING,
                                    axis_flags, reorderable,
                                    keepdims, 0,
@@ -4384,7 +4417,7 @@ PyUFunc_GenericReduction(PyUFuncObject *ufunc, PyObject *args,
     int i, naxes=0, ndim;
     int axes[NPY_MAXDIMS];
     PyObject *axes_in = NULL;
-    PyArrayObject *mp = NULL, *ret = NULL;
+    PyArrayObject *mp = NULL, *wheremask = NULL, *ret = NULL;
     PyObject *op;
     PyObject *obj_ind, *context;
     PyArrayObject *indices = NULL;
@@ -4393,7 +4426,7 @@ PyUFunc_GenericReduction(PyUFuncObject *ufunc, PyObject *args,
     int keepdims = 0;
     PyObject *initial = NULL;
     static char *reduce_kwlist[] = {
-            "array", "axis", "dtype", "out", "keepdims", "initial", NULL};
+        "array", "axis", "dtype", "out", "keepdims", "initial", "where", NULL};
     static char *accumulate_kwlist[] = {
             "array", "axis", "dtype", "out", NULL};
     static char *reduceat_kwlist[] = {
@@ -4456,22 +4489,23 @@ PyUFunc_GenericReduction(PyUFuncObject *ufunc, PyObject *args,
     }
     else if (operation == UFUNC_ACCUMULATE) {
         if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|OO&O&:accumulate",
-                                        accumulate_kwlist,
-                                        &op,
-                                        &axes_in,
-                                        PyArray_DescrConverter2, &otype,
-                                        PyArray_OutputConverter, &out)) {
+                                         accumulate_kwlist,
+                                         &op,
+                                         &axes_in,
+                                         PyArray_DescrConverter2, &otype,
+                                         PyArray_OutputConverter, &out)) {
             goto fail;
         }
     }
     else {
-        if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|OO&O&iO:reduce",
-                                        reduce_kwlist,
-                                        &op,
-                                        &axes_in,
-                                        PyArray_DescrConverter2, &otype,
-                                        PyArray_OutputConverter, &out,
-                                        &keepdims, &initial)) {
+        if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|OO&O&iOO&:reduce",
+                                         reduce_kwlist,
+                                         &op,
+                                         &axes_in,
+                                         PyArray_DescrConverter2, &otype,
+                                         PyArray_OutputConverter, &out,
+                                         &keepdims, &initial,
+                                         _wheremask_converter, &wheremask)) {
             goto fail;
         }
     }
@@ -4602,7 +4636,8 @@ PyUFunc_GenericReduction(PyUFuncObject *ufunc, PyObject *args,
     switch(operation) {
     case UFUNC_REDUCE:
         ret = PyUFunc_Reduce(ufunc, mp, out, naxes, axes,
-                                          otype, keepdims, initial);
+                             otype, keepdims, initial, wheremask);
+        Py_XDECREF(wheremask);
         break;
     case UFUNC_ACCUMULATE:
         if (naxes != 1) {
@@ -4660,6 +4695,7 @@ PyUFunc_GenericReduction(PyUFuncObject *ufunc, PyObject *args,
 fail:
     Py_XDECREF(otype);
     Py_XDECREF(mp);
+    Py_XDECREF(wheremask);
     return NULL;
 }
 
@@ -4892,12 +4928,15 @@ PyUFunc_FromFuncAndDataAndSignatureAndIdentity(PyUFuncGenericFunction *func, voi
         return NULL;
     }
 
-    ufunc = PyArray_malloc(sizeof(PyUFuncObject));
+    ufunc = PyObject_GC_New(PyUFuncObject, &PyUFunc_Type);
+    /*
+     * We use GC_New here for ufunc->obj, but do not use GC_Track since
+     * ufunc->obj is still NULL at the end of this function.
+     * See ufunc_frompyfunc where ufunc->obj is set and GC_Track is called.
+     */
     if (ufunc == NULL) {
         return NULL;
     }
-    memset(ufunc, 0, sizeof(PyUFuncObject));
-    PyObject_Init((PyObject *)ufunc, &PyUFunc_Type);
 
     ufunc->nin = nin;
     ufunc->nout = nout;
@@ -4905,13 +4944,30 @@ PyUFunc_FromFuncAndDataAndSignatureAndIdentity(PyUFuncGenericFunction *func, voi
     ufunc->identity = identity;
     if (ufunc->identity == PyUFunc_IdentityValue) {
         Py_INCREF(identity_value);
+        ufunc->identity_value = identity_value;
+    }
+    else {
+        ufunc->identity_value = NULL;
     }
-    ufunc->identity_value = identity_value;
 
     ufunc->functions = func;
     ufunc->data = data;
     ufunc->types = types;
     ufunc->ntypes = ntypes;
+    ufunc->core_signature = NULL;
+    ufunc->core_enabled = 0;
+    ufunc->obj = NULL;
+    ufunc->core_num_dims = NULL;
+    ufunc->core_num_dim_ix = 0;
+    ufunc->core_offsets = NULL;
+    ufunc->core_dim_ixs = NULL;
+    ufunc->core_dim_sizes = NULL;
+    ufunc->core_dim_flags = NULL;
+    ufunc->userloops = NULL;
+    ufunc->ptr = NULL;
+    ufunc->reserved2 = NULL;
+    ufunc->reserved1 = 0;
+    ufunc->iter_flags = 0;
 
     /* Type resolution and inner loop selection functions */
     ufunc->type_resolver = &PyUFunc_DefaultTypeResolver;
@@ -5277,18 +5333,23 @@ PyUFunc_RegisterLoopForType(PyUFuncObject *ufunc,
 static void
 ufunc_dealloc(PyUFuncObject *ufunc)
 {
+    PyObject_GC_UnTrack((PyObject *)ufunc);
     PyArray_free(ufunc->core_num_dims);
     PyArray_free(ufunc->core_dim_ixs);
+    PyArray_free(ufunc->core_dim_sizes);
+    PyArray_free(ufunc->core_dim_flags);
     PyArray_free(ufunc->core_offsets);
     PyArray_free(ufunc->core_signature);
     PyArray_free(ufunc->ptr);
     PyArray_free(ufunc->op_flags);
     Py_XDECREF(ufunc->userloops);
-    Py_XDECREF(ufunc->obj);
     if (ufunc->identity == PyUFunc_IdentityValue) {
         Py_DECREF(ufunc->identity_value);
     }
-    PyArray_free(ufunc);
+    if (ufunc->obj != NULL) {
+        Py_DECREF(ufunc->obj);
+    }
+    PyObject_GC_Del(ufunc);
 }
 
 static PyObject *
@@ -5297,6 +5358,15 @@ ufunc_repr(PyUFuncObject *ufunc)
     return PyUString_FromFormat("<ufunc '%s'>", ufunc->name);
 }
 
+static int
+ufunc_traverse(PyUFuncObject *self, visitproc visit, void *arg)
+{
+    Py_VISIT(self->obj);
+    if (self->identity == PyUFunc_IdentityValue) {
+        Py_VISIT(self->identity_value);
+    }
+    return 0;
+}
 
 /******************************************************************************
  ***                          UFUNC METHODS                                 ***
@@ -6013,9 +6083,9 @@ NPY_NO_EXPORT PyTypeObject PyUFunc_Type = {
     0,                                          /* tp_getattro */
     0,                                          /* tp_setattro */
     0,                                          /* tp_as_buffer */
-    Py_TPFLAGS_DEFAULT,                         /* tp_flags */
+    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,    /* tp_flags */
     0,                                          /* tp_doc */
-    0,                                          /* tp_traverse */
+    (traverseproc)ufunc_traverse,               /* tp_traverse */
     0,                                          /* tp_clear */
     0,                                          /* tp_richcompare */
     0,                                          /* tp_weaklistoffset */
diff --git a/numpy/core/src/umath/ufunc_type_resolution.c b/numpy/core/src/umath/ufunc_type_resolution.c
index ec60d9cfd..3cf507258 100644
--- a/numpy/core/src/umath/ufunc_type_resolution.c
+++ b/numpy/core/src/umath/ufunc_type_resolution.c
@@ -1114,7 +1114,16 @@ PyUFunc_DivisionTypeResolver(PyUFuncObject *ufunc,
             }
             out_dtypes[1] = out_dtypes[0];
             Py_INCREF(out_dtypes[1]);
+
+            /*
+             * TODO: split function into truediv and floordiv resolvers
+             */
+            if (strcmp(ufunc->name, "floor_divide") == 0) {
+                out_dtypes[2] = PyArray_DescrFromType(NPY_LONGLONG);
+            }
+            else {
             out_dtypes[2] = PyArray_DescrFromType(NPY_DOUBLE);
+            }
             if (out_dtypes[2] == NULL) {
                 Py_DECREF(out_dtypes[0]);
                 out_dtypes[0] = NULL;
diff --git a/numpy/core/src/umath/umathmodule.c b/numpy/core/src/umath/umathmodule.c
index 5de19fec2..23e3ffcfc 100644
--- a/numpy/core/src/umath/umathmodule.c
+++ b/numpy/core/src/umath/umathmodule.c
@@ -161,6 +161,7 @@ ufunc_frompyfunc(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *NPY_UNUS
 
     self->type_resolver = &object_ufunc_type_resolver;
     self->legacy_inner_loop_selector = &object_ufunc_loop_selector;
+    PyObject_GC_Track(self);
 
     return (PyObject *)self;
 }
@@ -170,7 +171,7 @@ PyObject *
 add_newdoc_ufunc(PyObject *NPY_UNUSED(dummy), PyObject *args)
 {
     PyUFuncObject *ufunc;
-    PyObject *str;
+    PyObject *str, *tmp;
     char *docstr, *newdocstr;
 
 #if defined(NPY_PY3K)
@@ -178,7 +179,11 @@ add_newdoc_ufunc(PyObject *NPY_UNUSED(dummy), PyObject *args)
                                         &PyUnicode_Type, &str)) {
         return NULL;
     }
-    docstr = PyBytes_AS_STRING(PyUnicode_AsUTF8String(str));
+    tmp = PyUnicode_AsUTF8String(str);
+    if (tmp == NULL) {
+        return NULL;
+    }
+    docstr = PyBytes_AS_STRING(tmp);
 #else
     if (!PyArg_ParseTuple(args, "O!O!:_add_newdoc_ufunc", &PyUFunc_Type, &ufunc,
                                          &PyString_Type, &str)) {
@@ -190,6 +195,9 @@ add_newdoc_ufunc(PyObject *NPY_UNUSED(dummy), PyObject *args)
     if (NULL != ufunc->doc) {
         PyErr_SetString(PyExc_ValueError,
                 "Cannot change docstring of ufunc with non-NULL docstring");
+#if defined(NPY_PY3K)
+        Py_DECREF(tmp);
+#endif
         return NULL;
     }
 
@@ -203,6 +211,9 @@ add_newdoc_ufunc(PyObject *NPY_UNUSED(dummy), PyObject *args)
     strcpy(newdocstr, docstr);
     ufunc->doc = newdocstr;
 
+#if defined(NPY_PY3K)
+    Py_DECREF(tmp);
+#endif
     Py_RETURN_NONE;
 }
 
diff --git a/numpy/core/tests/test_arrayprint.py b/numpy/core/tests/test_arrayprint.py
index 7a858d2e2..f2b8fdca7 100644
--- a/numpy/core/tests/test_arrayprint.py
+++ b/numpy/core/tests/test_arrayprint.py
@@ -90,6 +90,7 @@ class TestArrayRepr(object):
         assert_equal(repr(x),
             'sub(sub(sub(..., dtype=object), dtype=object), dtype=object)')
         assert_equal(str(x), '...')
+        x[()] = 0  # resolve circular references for garbage collector
 
         # nested 0d-subclass-object
         x = sub(None)
@@ -124,11 +125,13 @@ class TestArrayRepr(object):
         arr0d[()] = arr0d
         assert_equal(repr(arr0d),
             'array(array(..., dtype=object), dtype=object)')
+        arr0d[()] = 0  # resolve recursion for garbage collector
 
         arr1d = np.array([None, None])
         arr1d[1] = arr1d
         assert_equal(repr(arr1d),
             'array([None, array(..., dtype=object)], dtype=object)')
+        arr1d[1] = 0  # resolve recursion for garbage collector
 
         first = np.array(None)
         second = np.array(None)
@@ -136,6 +139,7 @@ class TestArrayRepr(object):
         second[()] = first
         assert_equal(repr(first),
             'array(array(array(..., dtype=object), dtype=object), dtype=object)')
+        first[()] = 0  # resolve circular references for garbage collector
 
     def test_containing_list(self):
         # printing square brackets directly would be ambiguuous
diff --git a/numpy/core/tests/test_datetime.py b/numpy/core/tests/test_datetime.py
index b2ce0402a..cb7555a34 100644
--- a/numpy/core/tests/test_datetime.py
+++ b/numpy/core/tests/test_datetime.py
@@ -1081,6 +1081,86 @@ class TestDateTime(object):
                 check(np.timedelta64(0), f, nat)
                 check(nat, f, nat)
 
+    @pytest.mark.parametrize("op1, op2, exp", [
+        # m8 same units round down
+        (np.timedelta64(7, 's'),
+         np.timedelta64(4, 's'),
+         1),
+        # m8 same units round down with negative
+        (np.timedelta64(7, 's'),
+         np.timedelta64(-4, 's'),
+         -2),
+        # m8 same units negative no round down
+        (np.timedelta64(8, 's'),
+         np.timedelta64(-4, 's'),
+         -2),
+        # m8 different units
+        (np.timedelta64(1, 'm'),
+         np.timedelta64(31, 's'),
+         1),
+        # m8 generic units
+        (np.timedelta64(1890),
+         np.timedelta64(31),
+         60),
+        # Y // M works
+        (np.timedelta64(2, 'Y'),
+         np.timedelta64('13', 'M'),
+         1),
+        # handle 1D arrays
+        (np.array([1, 2, 3], dtype='m8'),
+         np.array([2], dtype='m8'),
+         np.array([0, 1, 1], dtype=np.int64)),
+        ])
+    def test_timedelta_floor_divide(self, op1, op2, exp):
+        assert_equal(op1 // op2, exp)
+
+    @pytest.mark.parametrize("op1, op2", [
+        # div by 0
+        (np.timedelta64(10, 'us'),
+         np.timedelta64(0, 'us')),
+        # div with NaT
+        (np.timedelta64('NaT'),
+         np.timedelta64(50, 'us')),
+        # special case for int64 min
+        # in integer floor division
+        (np.timedelta64(np.iinfo(np.int64).min),
+         np.timedelta64(-1)),
+        ])
+    def test_timedelta_floor_div_warnings(self, op1, op2):
+        with assert_warns(RuntimeWarning):
+            actual = op1 // op2
+            assert_equal(actual, 0)
+            assert_equal(actual.dtype, np.int64)
+
+    @pytest.mark.parametrize("val1, val2", [
+        # the smallest integer that can't be represented
+        # exactly in a double should be preserved if we avoid
+        # casting to double in floordiv operation
+        (9007199254740993, 1),
+        # stress the alternate floordiv code path where
+        # operand signs don't match and remainder isn't 0
+        (9007199254740999, -2),
+        ])
+    def test_timedelta_floor_div_precision(self, val1, val2):
+        op1 = np.timedelta64(val1)
+        op2 = np.timedelta64(val2)
+        actual = op1 // op2
+        # Python reference integer floor
+        expected = val1 // val2
+        assert_equal(actual, expected)
+
+    @pytest.mark.parametrize("val1, val2", [
+        # years and months sometimes can't be unambiguously
+        # divided for floor division operation
+        (np.timedelta64(7, 'Y'),
+         np.timedelta64(3, 's')),
+        (np.timedelta64(7, 'M'),
+         np.timedelta64(1, 'D')),
+        ])
+    def test_timedelta_floor_div_error(self, val1, val2):
+        with assert_raises_regex(TypeError, "common metadata divisor"):
+            val1 // val2
+
     def test_datetime_divide(self):
         for dta, tda, tdb, tdc, tdd in \
                     [
@@ -1111,8 +1191,6 @@ class TestDateTime(object):
             assert_equal(tda / tdd, 60.0)
             assert_equal(tdd / tda, 1.0 / 60.0)
 
-            # m8 // m8
-            assert_raises(TypeError, np.floor_divide, tda, tdb)
             # int / m8
             assert_raises(TypeError, np.divide, 2, tdb)
             # float / m8
diff --git a/numpy/core/tests/test_dtype.py b/numpy/core/tests/test_dtype.py
index c55751e3c..8f371197c 100644
--- a/numpy/core/tests/test_dtype.py
+++ b/numpy/core/tests/test_dtype.py
@@ -4,10 +4,12 @@ import sys
 import operator
 import pytest
 import ctypes
+import gc
 
 import numpy as np
 from numpy.core._rational_tests import rational
-from numpy.testing import assert_, assert_equal, assert_raises
+from numpy.testing import (
+    assert_, assert_equal, assert_array_equal, assert_raises, HAS_REFCOUNT)
 from numpy.core.numeric import pickle
 
 def assert_dtype_equal(a, b):
@@ -446,6 +448,173 @@ class TestSubarray(object):
         assert_equal(t1.alignment, t2.alignment)
 
 
+def iter_struct_object_dtypes():
+    """
+    Iterates over a few complex dtypes and object pattern which
+    fill the array with a given object (defaults to a singleton).
+
+    Yields
+    ------
+    dtype : dtype
+    pattern : tuple
+        Structured tuple for use with `np.array`.
+    count : int
+        Number of objects stored in the dtype.
+    singleton : object
+        A singleton object. The returned pattern is constructed so that
+        all objects inside the datatype are set to the singleton.
+    """
+    obj = object()
+
+    dt = np.dtype([('b', 'O', (2, 3))])
+    p = ([[obj] * 3] * 2,)
+    yield pytest.param(dt, p, 6, obj, id="<subarray>")
+
+    dt = np.dtype([('a', 'i4'), ('b', 'O', (2, 3))])
+    p = (0, [[obj] * 3] * 2)
+    yield pytest.param(dt, p, 6, obj, id="<subarray in field>")
+
+    dt = np.dtype([('a', 'i4'),
+                   ('b', [('ba', 'O'), ('bb', 'i1')], (2, 3))])
+    p = (0, [[(obj, 0)] * 3] * 2)
+    yield pytest.param(dt, p, 6, obj, id="<structured subarray 1>")
+
+    dt = np.dtype([('a', 'i4'),
+                   ('b', [('ba', 'O'), ('bb', 'O')], (2, 3))])
+    p = (0, [[(obj, obj)] * 3] * 2)
+    yield pytest.param(dt, p, 12, obj, id="<structured subarray 2>")
+
+
+@pytest.mark.skipif(not HAS_REFCOUNT, reason="Python lacks refcounts")
+class TestStructuredObjectRefcounting:
+    """These tests cover various uses of complicated structured types which
+    include objects and thus require reference counting.
+    """
+    @pytest.mark.parametrize(['dt', 'pat', 'count', 'singleton'],
+                             iter_struct_object_dtypes())
+    @pytest.mark.parametrize(["creation_func", "creation_obj"], [
+        pytest.param(np.empty, None,
+             # None is probably used for too many things
+             marks=pytest.mark.skip("unreliable due to python's behaviour")),
+        (np.ones, 1),
+        (np.zeros, 0)])
+    def test_structured_object_create_delete(self, dt, pat, count, singleton,
+                                             creation_func, creation_obj):
+        """Structured object reference counting in creation and deletion"""
+        # The test assumes that 0, 1, and None are singletons.
+        gc.collect()
+        before = sys.getrefcount(creation_obj)
+        arr = creation_func(3, dt)
+
+        now = sys.getrefcount(creation_obj)
+        assert now - before == count * 3
+        del arr
+        now = sys.getrefcount(creation_obj)
+        assert now == before
+
+    @pytest.mark.parametrize(['dt', 'pat', 'count', 'singleton'],
+                             iter_struct_object_dtypes())
+    def test_structured_object_item_setting(self, dt, pat, count, singleton):
+        """Structured object reference counting for simple item setting"""
+        one = 1
+
+        gc.collect()
+        before = sys.getrefcount(singleton)
+        arr = np.array([pat] * 3, dt)
+        assert sys.getrefcount(singleton) - before == count * 3
+        # Fill with `1` and check that it was replaced correctly:
+        before2 = sys.getrefcount(one)
+        arr[...] = one
+        after2 = sys.getrefcount(one)
+        assert after2 - before2 == count * 3
+        del arr
+        gc.collect()
+        assert sys.getrefcount(one) == before2
+        assert sys.getrefcount(singleton) == before
+
+    @pytest.mark.parametrize(['dt', 'pat', 'count', 'singleton'],
+                             iter_struct_object_dtypes())
+    @pytest.mark.parametrize(
+        ['shape', 'index', 'items_changed'],
+        [((3,), ([0, 2],), 2),
+         ((3, 2), ([0, 2], slice(None)), 4),
+         ((3, 2), ([0, 2], [1]), 2),
+         ((3,), ([True, False, True]), 2)])
+    def test_structured_object_indexing(self, shape, index, items_changed,
+                                        dt, pat, count, singleton):
+        """Structured object reference counting for advanced indexing."""
+        zero = 0
+        one = 1
+
+        arr = np.zeros(shape, dt)
+
+        gc.collect()
+        before_zero = sys.getrefcount(zero)
+        before_one = sys.getrefcount(one)
+        # Test item getting:
+        part = arr[index]
+        after_zero = sys.getrefcount(zero)
+        assert after_zero - before_zero == count * items_changed
+        del part
+        # Test item setting:
+        arr[index] = one
+        gc.collect()
+        after_zero = sys.getrefcount(zero)
+        after_one = sys.getrefcount(one)
+        assert before_zero - after_zero == count * items_changed
+        assert after_one - before_one == count * items_changed
+
+    @pytest.mark.parametrize(['dt', 'pat', 'count', 'singleton'],
+                             iter_struct_object_dtypes())
+    def test_structured_object_take_and_repeat(self, dt, pat, count, singleton):
+        """Structured object reference counting for specialized functions.
+        The older functions such as take and repeat use different code paths
+        then item setting (when writing this).
+        """
+        indices = [0, 1]
+
+        arr = np.array([pat] * 3, dt)
+        gc.collect()
+        before = sys.getrefcount(singleton)
+        res = arr.take(indices)
+        after = sys.getrefcount(singleton)
+        assert after - before == count * 2
+        new = res.repeat(10)
+        gc.collect()
+        after_repeat = sys.getrefcount(singleton)
+        assert after_repeat - after == count * 2 * 10
+
+
+class TestStructuredDtypeSparseFields(object):
+    """Tests subarray fields which contain sparse dtypes so that
+    not all memory is used by the dtype work. Such dtype's should
+    leave the underlying memory unchanged.
+    """
+    dtype = np.dtype([('a', {'names':['aa', 'ab'], 'formats':['f', 'f'],
+                             'offsets':[0, 4]}, (2, 3))])
+    sparse_dtype = np.dtype([('a', {'names':['ab'], 'formats':['f'],
+                                    'offsets':[4]}, (2, 3))])
+
+    @pytest.mark.xfail(reason="inaccessible data is changed see gh-12686.")
+    @pytest.mark.valgrind_error(reason="reads from unitialized buffers.")
+    def test_sparse_field_assignment(self):
+        arr = np.zeros(3, self.dtype)
+        sparse_arr = arr.view(self.sparse_dtype)
+
+        sparse_arr[...] = np.finfo(np.float32).max
+        # dtype is reduced when accessing the field, so shape is (3, 2, 3):
+        assert_array_equal(arr["a"]["aa"], np.zeros((3, 2, 3)))
+
+    def test_sparse_field_assignment_fancy(self):
+        # Fancy assignment goes to the copyswap function for comlex types:
+        arr = np.zeros(3, self.dtype)
+        sparse_arr = arr.view(self.sparse_dtype)
+
+        sparse_arr[[0, 1, 2]] = np.finfo(np.float32).max
+        # dtype is reduced when accessing the field, so shape is (3, 2, 3):
+        assert_array_equal(arr["a"]["aa"], np.zeros((3, 2, 3)))
+
+
 class TestMonsterType(object):
     """Test deeply nested subtypes."""
 
diff --git a/numpy/core/tests/test_errstate.py b/numpy/core/tests/test_errstate.py
index 670d485c1..0008c4cc8 100644
--- a/numpy/core/tests/test_errstate.py
+++ b/numpy/core/tests/test_errstate.py
@@ -39,3 +39,11 @@ class TestErrstate(object):
             with np.errstate(call=None):
                 assert_(np.geterrcall() is None, 'call is not None')
         assert_(np.geterrcall() is olderrcall, 'call is not olderrcall')
+
+    def test_errstate_decorator(self):
+        @np.errstate(all='ignore')
+        def foo():
+            a = -np.arange(3)
+            a // 0
+            
+        foo()
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index 951c01c6d..3dd45c5ce 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -54,7 +54,12 @@ else:
 
 
 def _aligned_zeros(shape, dtype=float, order="C", align=None):
-    """Allocate a new ndarray with aligned memory."""
+    """
+    Allocate a new ndarray with aligned memory.
+
+    The ndarray is guaranteed *not* aligned to twice the requested alignment.
+    Eg, if align=4, guarantees it is not aligned to 8. If align=None uses
+    dtype.alignment."""
     dtype = np.dtype(dtype)
     if dtype == np.dtype(object):
         # Can't do this, fall back to standard allocation (which
@@ -67,10 +72,15 @@ def _aligned_zeros(shape, dtype=float, order="C", align=None):
     if not hasattr(shape, '__len__'):
         shape = (shape,)
     size = functools.reduce(operator.mul, shape) * dtype.itemsize
-    buf = np.empty(size + align + 1, np.uint8)
-    offset = buf.__array_interface__['data'][0] % align
+    buf = np.empty(size + 2*align + 1, np.uint8)
+
+    ptr = buf.__array_interface__['data'][0]
+    offset = ptr % align
     if offset != 0:
         offset = align - offset
+    if (ptr % (2*align)) == 0:
+        offset += align
+
     # Note: slices producing 0-size arrays do not necessarily change
     # data pointer --- so we use and allocate size+1
     buf = buf[offset:offset+size+1][:-1]
@@ -3124,8 +3134,8 @@ class TestMethods(object):
         assert_equal(ac, np.conjugate(a))
 
         a = np.array([1-1j, 1, 2.0, 'f'], object)
-        assert_raises(AttributeError, lambda: a.conj())
-        assert_raises(AttributeError, lambda: a.conjugate())
+        assert_raises(TypeError, lambda: a.conj())
+        assert_raises(TypeError, lambda: a.conjugate())
 
     def test__complex__(self):
         dtypes = ['i1', 'i2', 'i4', 'i8',
@@ -7201,6 +7211,7 @@ class TestConversion(object):
         except NameError:
             Error = RuntimeError  # python < 3.5
         assert_raises(Error, bool, self_containing)  # previously stack overflow
+        self_containing[0] = None  # resolve circular reference
 
     def test_to_int_scalar(self):
         # gh-9972 means that these aren't always the same
@@ -8000,16 +8011,49 @@ class TestAlignment(object):
             raise ValueError()
 
     def test_various_alignments(self):
-        for align in [1, 2, 3, 4, 8, 16, 32, 64, None]:
+        for align in [1, 2, 3, 4, 8, 12, 16, 32, 64, None]:
             for n in [0, 1, 3, 11]:
                 for order in ["C", "F", None]:
-                    for dtype in np.typecodes["All"]:
+                    for dtype in list(np.typecodes["All"]) + ['i4,i4,i4']:
                         if dtype == 'O':
                             # object dtype can't be misaligned
                             continue
                         for shape in [n, (1, 2, 3, n)]:
                             self.check(shape, np.dtype(dtype), order, align)
 
+    def test_strided_loop_alignments(self):
+        # particularly test that complex64 and float128 use right alignment
+        # code-paths, since these are particularly problematic. It is useful to
+        # turn on USE_DEBUG for this test, so lowlevel-loop asserts are run.
+        for align in [1, 2, 4, 8, 12, 16, None]:
+            xf64 = _aligned_zeros(3, np.float64)
+
+            xc64 = _aligned_zeros(3, np.complex64, align=align)
+            xf128 = _aligned_zeros(3, np.longdouble, align=align)
+
+            # test casting, both to and from misaligned
+            with suppress_warnings() as sup:
+                sup.filter(np.ComplexWarning, "Casting complex values")
+                xc64.astype('f8')
+            xf64.astype(np.complex64)
+            test = xc64 + xf64
+
+            xf128.astype('f8')
+            xf64.astype(np.longdouble)
+            test = xf128 + xf64
+
+            test = xf128 + xc64
+
+            # test copy, both to and from misaligned
+            # contig copy
+            xf64[:] = xf64.copy()
+            xc64[:] = xc64.copy()
+            xf128[:] = xf128.copy()
+            # strided copy
+            xf64[::2] = xf64[::2].copy()
+            xc64[::2] = xc64[::2].copy()
+            xf128[::2] = xf128[::2].copy()
+
 def test_getfield():
     a = np.arange(32, dtype='uint16')
     if sys.byteorder == 'little':
diff --git a/numpy/core/tests/test_overrides.py b/numpy/core/tests/test_overrides.py
index 62b2a3e53..8f1c16539 100644
--- a/numpy/core/tests/test_overrides.py
+++ b/numpy/core/tests/test_overrides.py
@@ -7,7 +7,7 @@ import numpy as np
 from numpy.testing import (
     assert_, assert_equal, assert_raises, assert_raises_regex)
 from numpy.core.overrides import (
-    get_overloaded_types_and_args, array_function_dispatch,
+    _get_implementing_args, array_function_dispatch,
     verify_matching_signatures, ENABLE_ARRAY_FUNCTION)
 from numpy.core.numeric import pickle
 import pytest
@@ -18,11 +18,6 @@ requires_array_function = pytest.mark.skipif(
     reason="__array_function__ dispatch not enabled.")
 
 
-def _get_overloaded_args(relevant_args):
-    types, args = get_overloaded_types_and_args(relevant_args)
-    return args
-
-
 def _return_not_implemented(self, *args, **kwargs):
     return NotImplemented
 
@@ -41,26 +36,21 @@ def dispatched_two_arg(array1, array2):
 
 
 @requires_array_function
-class TestGetOverloadedTypesAndArgs(object):
+class TestGetImplementingArgs(object):
 
     def test_ndarray(self):
         array = np.array(1)
 
-        types, args = get_overloaded_types_and_args([array])
-        assert_equal(set(types), {np.ndarray})
+        args = _get_implementing_args([array])
         assert_equal(list(args), [array])
 
-        types, args = get_overloaded_types_and_args([array, array])
-        assert_equal(len(types), 1)
-        assert_equal(set(types), {np.ndarray})
+        args = _get_implementing_args([array, array])
         assert_equal(list(args), [array])
 
-        types, args = get_overloaded_types_and_args([array, 1])
-        assert_equal(set(types), {np.ndarray})
+        args = _get_implementing_args([array, 1])
         assert_equal(list(args), [array])
 
-        types, args = get_overloaded_types_and_args([1, array])
-        assert_equal(set(types), {np.ndarray})
+        args = _get_implementing_args([1, array])
         assert_equal(list(args), [array])
 
     def test_ndarray_subclasses(self):
@@ -75,17 +65,14 @@ class TestGetOverloadedTypesAndArgs(object):
         override_sub = np.array(1).view(OverrideSub)
         no_override_sub = np.array(1).view(NoOverrideSub)
 
-        types, args = get_overloaded_types_and_args([array, override_sub])
-        assert_equal(set(types), {np.ndarray, OverrideSub})
+        args = _get_implementing_args([array, override_sub])
         assert_equal(list(args), [override_sub, array])
 
-        types, args = get_overloaded_types_and_args([array, no_override_sub])
-        assert_equal(set(types), {np.ndarray, NoOverrideSub})
+        args = _get_implementing_args([array, no_override_sub])
         assert_equal(list(args), [no_override_sub, array])
 
-        types, args = get_overloaded_types_and_args(
+        args = _get_implementing_args(
             [override_sub, no_override_sub])
-        assert_equal(set(types), {OverrideSub, NoOverrideSub})
         assert_equal(list(args), [override_sub, no_override_sub])
 
     def test_ndarray_and_duck_array(self):
@@ -96,12 +83,10 @@ class TestGetOverloadedTypesAndArgs(object):
         array = np.array(1)
         other = Other()
 
-        types, args = get_overloaded_types_and_args([other, array])
-        assert_equal(set(types), {np.ndarray, Other})
+        args = _get_implementing_args([other, array])
         assert_equal(list(args), [other, array])
 
-        types, args = get_overloaded_types_and_args([array, other])
-        assert_equal(set(types), {np.ndarray, Other})
+        args = _get_implementing_args([array, other])
         assert_equal(list(args), [array, other])
 
     def test_ndarray_subclass_and_duck_array(self):
@@ -116,9 +101,9 @@ class TestGetOverloadedTypesAndArgs(object):
         subarray = np.array(1).view(OverrideSub)
         other = Other()
 
-        assert_equal(_get_overloaded_args([array, subarray, other]),
+        assert_equal(_get_implementing_args([array, subarray, other]),
                      [subarray, array, other])
-        assert_equal(_get_overloaded_args([array, other, subarray]),
+        assert_equal(_get_implementing_args([array, other, subarray]),
                      [subarray, array, other])
 
     def test_many_duck_arrays(self):
@@ -140,15 +125,26 @@ class TestGetOverloadedTypesAndArgs(object):
         c = C()
         d = D()
 
-        assert_equal(_get_overloaded_args([1]), [])
-        assert_equal(_get_overloaded_args([a]), [a])
-        assert_equal(_get_overloaded_args([a, 1]), [a])
-        assert_equal(_get_overloaded_args([a, a, a]), [a])
-        assert_equal(_get_overloaded_args([a, d, a]), [a, d])
-        assert_equal(_get_overloaded_args([a, b]), [b, a])
-        assert_equal(_get_overloaded_args([b, a]), [b, a])
-        assert_equal(_get_overloaded_args([a, b, c]), [b, c, a])
-        assert_equal(_get_overloaded_args([a, c, b]), [c, b, a])
+        assert_equal(_get_implementing_args([1]), [])
+        assert_equal(_get_implementing_args([a]), [a])
+        assert_equal(_get_implementing_args([a, 1]), [a])
+        assert_equal(_get_implementing_args([a, a, a]), [a])
+        assert_equal(_get_implementing_args([a, d, a]), [a, d])
+        assert_equal(_get_implementing_args([a, b]), [b, a])
+        assert_equal(_get_implementing_args([b, a]), [b, a])
+        assert_equal(_get_implementing_args([a, b, c]), [b, c, a])
+        assert_equal(_get_implementing_args([a, c, b]), [c, b, a])
+
+    def test_too_many_duck_arrays(self):
+        namespace = dict(__array_function__=_return_not_implemented)
+        types = [type('A' + str(i), (object,), namespace) for i in range(33)]
+        relevant_args = [t() for t in types]
+
+        actual = _get_implementing_args(relevant_args[:32])
+        assert_equal(actual, relevant_args[:32])
+
+        with assert_raises_regex(TypeError, 'distinct argument types'):
+            _get_implementing_args(relevant_args)
 
 
 @requires_array_function
@@ -201,6 +197,14 @@ class TestNDArrayArrayFunction(object):
         result = np.concatenate((array, override_sub))
         assert_equal(result, expected.view(OverrideSub))
 
+    def test_no_wrapper(self):
+        array = np.array(1)
+        func = dispatched_one_arg.__wrapped__
+        with assert_raises_regex(AttributeError, '__wrapped__'):
+            array.__array_function__(func=func,
+                                     types=(np.ndarray,),
+                                     args=(array,), kwargs={})
+
 
 @requires_array_function
 class TestArrayFunctionDispatch(object):
diff --git a/numpy/core/tests/test_regression.py b/numpy/core/tests/test_regression.py
index 2421a1161..17c48987f 100644
--- a/numpy/core/tests/test_regression.py
+++ b/numpy/core/tests/test_regression.py
@@ -46,7 +46,7 @@ class TestRegression(object):
             assert_array_equal(a, b)
 
     def test_typeNA(self):
-        # Issue gh-515 
+        # Issue gh-515
         with suppress_warnings() as sup:
             sup.filter(np.VisibleDeprecationWarning)
             assert_equal(np.typeNA[np.int64], 'Int64')
@@ -2415,3 +2415,11 @@ class TestRegression(object):
         # gh-11993
         arr = np.array(['AAAAA', 18465886.0, 18465886.0], dtype=object)
         assert_raises(TypeError, arr.astype, 'c8')
+
+    def test_eff1d_casting(self):
+        # gh-12711
+        x = np.array([1, 2, 4, 7, 0], dtype=np.int16)
+        res = np.ediff1d(x, to_begin=-99, to_end=np.array([88, 99]))
+        assert_equal(res, [-99,   1,   2,   3,  -7,  88,  99])
+        assert_raises(ValueError, np.ediff1d, x, to_begin=(1<<20))
+        assert_raises(ValueError, np.ediff1d, x, to_end=(1<<20))
diff --git a/numpy/core/tests/test_shape_base.py b/numpy/core/tests/test_shape_base.py
index ef5c118ec..53d272fc5 100644
--- a/numpy/core/tests/test_shape_base.py
+++ b/numpy/core/tests/test_shape_base.py
@@ -224,13 +224,27 @@ class TestConcatenate(object):
         assert_raises(ValueError, concatenate, (0,))
         assert_raises(ValueError, concatenate, (np.array(0),))
 
+        # dimensionality must match
+        assert_raises_regex(
+            ValueError,
+            r"all the input arrays must have same number of dimensions, but "
+            r"the array at index 0 has 1 dimension\(s\) and the array at "
+            r"index 1 has 2 dimension\(s\)",
+            np.concatenate, (np.zeros(1), np.zeros((1, 1))))
+
         # test shapes must match except for concatenation axis
         a = np.ones((1, 2, 3))
         b = np.ones((2, 2, 3))
         axis = list(range(3))
         for i in range(3):
             np.concatenate((a, b), axis=axis[0])  # OK
-            assert_raises(ValueError, np.concatenate, (a, b), axis=axis[1])
+            assert_raises_regex(
+                ValueError,
+                "all the input array dimensions for the concatenation axis "
+                "must match exactly, but along dimension {}, the array at "
+                "index 0 has size 1 and the array at index 1 has size 2"
+                .format(i),
+                np.concatenate, (a, b), axis=axis[1])
             assert_raises(ValueError, np.concatenate, (a, b), axis=axis[2])
             a = np.moveaxis(a, -1, 0)
             b = np.moveaxis(b, -1, 0)
@@ -373,6 +387,10 @@ def test_stack():
     # empty arrays
     assert_(stack([[], [], []]).shape == (3, 0))
     assert_(stack([[], [], []], axis=1).shape == (0, 3))
+    # out
+    out = np.zeros_like(r1)
+    np.stack((a, b), out=out)
+    assert_array_equal(out, r1)
     # edge cases
     assert_raises_regex(ValueError, 'need at least one array', stack, [])
     assert_raises_regex(ValueError, 'must have the same shape',
diff --git a/numpy/core/tests/test_ufunc.py b/numpy/core/tests/test_ufunc.py
index b83b8ccff..fa62767f6 100644
--- a/numpy/core/tests/test_ufunc.py
+++ b/numpy/core/tests/test_ufunc.py
@@ -3,6 +3,8 @@ from __future__ import division, absolute_import, print_function
 import warnings
 import itertools
 
+import pytest
+
 import numpy as np
 import numpy.core._umath_tests as umt
 import numpy.linalg._umath_linalg as uml
@@ -596,6 +598,12 @@ class TestUfunc(object):
         assert_equal(np.sum(np.ones((2, 3, 5), dtype=np.int64), axis=(0, 2), initial=2),
                      [12, 12, 12])
 
+    def test_sum_where(self):
+        # More extensive tests done in test_reduction_with_where.
+        assert_equal(np.sum([[1., 2.], [3., 4.]], where=[True, False]), 4.)
+        assert_equal(np.sum([[1., 2.], [3., 4.]], axis=0, initial=5.,
+                            where=[True, False]), [9., 5.])
+
     def test_inner1d(self):
         a = np.arange(6).reshape((2, 3))
         assert_array_equal(umt.inner1d(a, a), np.sum(a*a, axis=-1))
@@ -1162,6 +1170,8 @@ class TestUfunc(object):
         assert_equal(np.array([[1]], dtype=object).sum(), 1)
         assert_equal(np.array([[[1, 2]]], dtype=object).sum((0, 1)), [1, 2])
         assert_equal(np.array([1], dtype=object).sum(initial=1), 2)
+        assert_equal(np.array([[1], [2, 3]], dtype=object)
+                     .sum(initial=[0], where=[False, True]), [0, 2, 3])
 
     def test_object_array_accumulate_inplace(self):
         # Checks that in-place accumulates work, see also gh-7402
@@ -1396,6 +1406,44 @@ class TestUfunc(object):
         res = np.add.reduce(a, initial=5)
         assert_equal(res, 15)
 
+    @pytest.mark.parametrize('axis', (0, 1, None))
+    @pytest.mark.parametrize('where', (np.array([False, True, True]),
+                                       np.array([[True], [False], [True]]),
+                                       np.array([[True, False, False],
+                                                 [False, True, False],
+                                                 [False, True, True]])))
+    def test_reduction_with_where(self, axis, where):
+        a = np.arange(9.).reshape(3, 3)
+        a_copy = a.copy()
+        a_check = np.zeros_like(a)
+        np.positive(a, out=a_check, where=where)
+
+        res = np.add.reduce(a, axis=axis, where=where)
+        check = a_check.sum(axis)
+        assert_equal(res, check)
+        # Check we do not overwrite elements of a internally.
+        assert_array_equal(a, a_copy)
+
+    @pytest.mark.parametrize(('axis', 'where'),
+                             ((0, np.array([True, False, True])),
+                              (1, [True, True, False]),
+                              (None, True)))
+    @pytest.mark.parametrize('initial', (-np.inf, 5.))
+    def test_reduction_with_where_and_initial(self, axis, where, initial):
+        a = np.arange(9.).reshape(3, 3)
+        a_copy = a.copy()
+        a_check = np.full(a.shape, -np.inf)
+        np.positive(a, out=a_check, where=where)
+
+        res = np.maximum.reduce(a, axis=axis, where=where, initial=initial)
+        check = a_check.max(axis, initial=initial)
+        assert_equal(res, check)
+
+    def test_reduction_where_initial_needed(self):
+        a = np.arange(9.).reshape(3, 3)
+        m = [False, True, False]
+        assert_raises(ValueError, np.maximum.reduce, a, where=m)
+
     def test_identityless_reduction_nonreorderable(self):
         a = np.array([[8.0, 2.0, 2.0], [1.0, 0.5, 0.25]])
 
@@ -1749,16 +1797,19 @@ class TestUfunc(object):
         assert_equal(f(d, 0, None, None, True), r.reshape((1,) + r.shape))
         assert_equal(f(d, 0, None, None, False, 0), r)
         assert_equal(f(d, 0, None, None, False, initial=0), r)
+        assert_equal(f(d, 0, None, None, False, 0, True), r)
+        assert_equal(f(d, 0, None, None, False, 0, where=True), r)
         # multiple keywords
         assert_equal(f(d, axis=0, dtype=None, out=None, keepdims=False), r)
         assert_equal(f(d, 0, dtype=None, out=None, keepdims=False), r)
         assert_equal(f(d, 0, None, out=None, keepdims=False), r)
-        assert_equal(f(d, 0, None, out=None, keepdims=False, initial=0), r)
+        assert_equal(f(d, 0, None, out=None, keepdims=False, initial=0,
+                       where=True), r)
 
         # too little
         assert_raises(TypeError, f)
         # too much
-        assert_raises(TypeError, f, d, 0, None, None, False, 0, 1)
+        assert_raises(TypeError, f, d, 0, None, None, False, 0, True, 1)
         # invalid axis
         assert_raises(TypeError, f, d, "invalid")
         assert_raises(TypeError, f, d, axis="invalid")
@@ -1857,3 +1908,9 @@ class TestUfunc(object):
     def test_no_doc_string(self):
         # gh-9337
         assert_('\n' not in umt.inner1d_no_doc.__doc__)
+
+    def test_invalid_args(self):
+        # gh-7961
+        exc = pytest.raises(TypeError, np.sqrt, None)
+        # minimally check the exception text
+        assert 'loop of ufunc does not support' in str(exc)
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 2f8edebc0..21097244f 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -1894,7 +1894,8 @@ class TestSpecialMethods(object):
 
         # reduce, kwargs
         res = np.multiply.reduce(a, axis='axis0', dtype='dtype0', out='out0',
-                                 keepdims='keep0', initial='init0')
+                                 keepdims='keep0', initial='init0',
+                                 where='where0')
         assert_equal(res[0], a)
         assert_equal(res[1], np.multiply)
         assert_equal(res[2], 'reduce')
@@ -1903,7 +1904,8 @@ class TestSpecialMethods(object):
                               'out': ('out0',),
                               'keepdims': 'keep0',
                               'axis': 'axis0',
-                              'initial': 'init0'})
+                              'initial': 'init0',
+                              'where': 'where0'})
 
         # reduce, output equal to None removed, but not other explicit ones,
         # even if they are at their default value.
@@ -1913,14 +1915,18 @@ class TestSpecialMethods(object):
         assert_equal(res[4], {'axis': 0, 'keepdims': True})
         res = np.multiply.reduce(a, None, out=(None,), dtype=None)
         assert_equal(res[4], {'axis': None, 'dtype': None})
-        res = np.multiply.reduce(a, 0, None, None, False, 2)
-        assert_equal(res[4], {'axis': 0, 'dtype': None, 'keepdims': False, 'initial': 2})
-        # np._NoValue ignored for initial.
-        res = np.multiply.reduce(a, 0, None, None, False, np._NoValue)
-        assert_equal(res[4], {'axis': 0, 'dtype': None, 'keepdims': False})
-        # None kept for initial.
-        res = np.multiply.reduce(a, 0, None, None, False, None)
-        assert_equal(res[4], {'axis': 0, 'dtype': None, 'keepdims': False, 'initial': None})
+        res = np.multiply.reduce(a, 0, None, None, False, 2, True)
+        assert_equal(res[4], {'axis': 0, 'dtype': None, 'keepdims': False,
+                              'initial': 2, 'where': True})
+        # np._NoValue ignored for initial
+        res = np.multiply.reduce(a, 0, None, None, False,
+                                 np._NoValue, True)
+        assert_equal(res[4], {'axis': 0, 'dtype': None, 'keepdims': False,
+                              'where': True})
+        # None kept for initial, True for where.
+        res = np.multiply.reduce(a, 0, None, None, False, None, True)
+        assert_equal(res[4], {'axis': 0, 'dtype': None, 'keepdims': False,
+                              'initial': None, 'where': True})
 
         # reduce, wrong args
         assert_raises(ValueError, np.multiply.reduce, a, out=())
diff --git a/numpy/core/tests/test_umath_complex.py b/numpy/core/tests/test_umath_complex.py
index 785ae8c57..1f5b4077f 100644
--- a/numpy/core/tests/test_umath_complex.py
+++ b/numpy/core/tests/test_umath_complex.py
@@ -5,7 +5,8 @@ import platform
 import pytest
 
 import numpy as np
-import numpy.core.umath as ncu
+# import the c-extension module directly since _arg is not exported via umath
+import numpy.core._multiarray_umath as ncu
 from numpy.testing import (
     assert_raises, assert_equal, assert_array_equal, assert_almost_equal
     )
diff --git a/numpy/core/umath.py b/numpy/core/umath.py
index a0e8ad427..f3b26ab72 100644
--- a/numpy/core/umath.py
+++ b/numpy/core/umath.py
@@ -9,7 +9,7 @@ by importing from the extension module.
 from . import _multiarray_umath
 from numpy.core._multiarray_umath import *
 from numpy.core._multiarray_umath import (
-    _UFUNC_API, _add_newdoc_ufunc, _arg, _ones_like
+    _UFUNC_API, _add_newdoc_ufunc, _ones_like
     )
 
 __all__ = [
@@ -18,7 +18,7 @@ __all__ = [
     'FPE_DIVIDEBYZERO', 'FPE_INVALID', 'FPE_OVERFLOW', 'FPE_UNDERFLOW', 'NAN',
     'NINF', 'NZERO', 'PINF', 'PZERO', 'SHIFT_DIVIDEBYZERO', 'SHIFT_INVALID',
     'SHIFT_OVERFLOW', 'SHIFT_UNDERFLOW', 'UFUNC_BUFSIZE_DEFAULT',
-    'UFUNC_PYVALS_NAME', '_add_newdoc_ufunc', '_arg', 'absolute', 'add',
+    'UFUNC_PYVALS_NAME', '_add_newdoc_ufunc', 'absolute', 'add',
     'arccos', 'arccosh', 'arcsin', 'arcsinh', 'arctan', 'arctan2', 'arctanh',
     'bitwise_and', 'bitwise_or', 'bitwise_xor', 'cbrt', 'ceil', 'conj',
     'conjugate', 'copysign', 'cos', 'cosh', 'deg2rad', 'degrees', 'divide',
diff --git a/numpy/ctypeslib.py b/numpy/ctypeslib.py
index 2e9781286..02c3bd211 100644
--- a/numpy/ctypeslib.py
+++ b/numpy/ctypeslib.py
@@ -346,27 +346,157 @@ def ndpointer(dtype=None, ndim=None, shape=None, flags=None):
     return klass
 
 
-def _get_typecodes():
-    """ Return a dictionary mapping __array_interface__ formats to ctypes types """
-    ct = ctypes
-    simple_types = [
-        ct.c_byte, ct.c_short, ct.c_int, ct.c_long, ct.c_longlong,
-        ct.c_ubyte, ct.c_ushort, ct.c_uint, ct.c_ulong, ct.c_ulonglong,
-        ct.c_float, ct.c_double,
-    ]
+if ctypes is not None:
+    def _ctype_ndarray(element_type, shape):
+        """ Create an ndarray of the given element type and shape """
+        for dim in shape[::-1]:
+            element_type = dim * element_type
+            # prevent the type name include np.ctypeslib
+            element_type.__module__ = None
+        return element_type
 
-    return {_dtype(ctype).str: ctype for ctype in simple_types}
 
+    def _get_scalar_type_map():
+        """
+        Return a dictionary mapping native endian scalar dtype to ctypes types
+        """
+        ct = ctypes
+        simple_types = [
+            ct.c_byte, ct.c_short, ct.c_int, ct.c_long, ct.c_longlong,
+            ct.c_ubyte, ct.c_ushort, ct.c_uint, ct.c_ulong, ct.c_ulonglong,
+            ct.c_float, ct.c_double,
+            ct.c_bool,
+        ]
+        return {_dtype(ctype): ctype for ctype in simple_types}
 
-def _ctype_ndarray(element_type, shape):
-    """ Create an ndarray of the given element type and shape """
-    for dim in shape[::-1]:
-        element_type = element_type * dim
-    return element_type
 
+    _scalar_type_map = _get_scalar_type_map()
+
+
+    def _ctype_from_dtype_scalar(dtype):
+        # swapping twice ensure that `=` is promoted to <, >, or |
+        dtype_with_endian = dtype.newbyteorder('S').newbyteorder('S')
+        dtype_native = dtype.newbyteorder('=')
+        try:
+            ctype = _scalar_type_map[dtype_native]
+        except KeyError:
+            raise NotImplementedError(
+                "Converting {!r} to a ctypes type".format(dtype)
+            )
+
+        if dtype_with_endian.byteorder == '>':
+            ctype = ctype.__ctype_be__
+        elif dtype_with_endian.byteorder == '<':
+            ctype = ctype.__ctype_le__
+
+        return ctype
+
+
+    def _ctype_from_dtype_subarray(dtype):
+        element_dtype, shape = dtype.subdtype
+        ctype = _ctype_from_dtype(element_dtype)
+        return _ctype_ndarray(ctype, shape)
+
+
+    def _ctype_from_dtype_structured(dtype):
+        # extract offsets of each field
+        field_data = []
+        for name in dtype.names:
+            field_dtype, offset = dtype.fields[name][:2]
+            field_data.append((offset, name, _ctype_from_dtype(field_dtype)))
+
+        # ctypes doesn't care about field order
+        field_data = sorted(field_data, key=lambda f: f[0])
+
+        if len(field_data) > 1 and all(offset == 0 for offset, name, ctype in field_data):
+            # union, if multiple fields all at address 0
+            size = 0
+            _fields_ = []
+            for offset, name, ctype in field_data:
+                _fields_.append((name, ctype))
+                size = max(size, ctypes.sizeof(ctype))
+
+            # pad to the right size
+            if dtype.itemsize != size:
+                _fields_.append(('', ctypes.c_char * dtype.itemsize))
+
+            # we inserted manual padding, so always `_pack_`
+            return type('union', (ctypes.Union,), dict(
+                _fields_=_fields_,
+                _pack_=1,
+                __module__=None,
+            ))
+        else:
+            last_offset = 0
+            _fields_ = []
+            for offset, name, ctype in field_data:
+                padding = offset - last_offset
+                if padding < 0:
+                    raise NotImplementedError("Overlapping fields")
+                if padding > 0:
+                    _fields_.append(('', ctypes.c_char * padding))
+
+                _fields_.append((name, ctype))
+                last_offset = offset + ctypes.sizeof(ctype)
+
+
+            padding = dtype.itemsize - last_offset
+            if padding > 0:
+                _fields_.append(('', ctypes.c_char * padding))
+
+            # we inserted manual padding, so always `_pack_`
+            return type('struct', (ctypes.Structure,), dict(
+                _fields_=_fields_,
+                _pack_=1,
+                __module__=None,
+            ))
+
+
+    def _ctype_from_dtype(dtype):
+        if dtype.fields is not None:
+            return _ctype_from_dtype_structured(dtype)
+        elif dtype.subdtype is not None:
+            return _ctype_from_dtype_subarray(dtype)
+        else:
+            return _ctype_from_dtype_scalar(dtype)
+
+
+    def as_ctypes_type(dtype):
+        """
+        Convert a dtype into a ctypes type.
+
+        Parameters
+        ----------
+        dtype : dtype
+            The dtype to convert
+
+        Returns
+        -------
+        ctypes
+            A ctype scalar, union, array, or struct
+
+        Raises
+        ------
+        NotImplementedError
+            If the conversion is not possible
+
+        Notes
+        -----
+        This function does not losslessly round-trip in either direction.
+
+        ``np.dtype(as_ctypes_type(dt))`` will:
+         - insert padding fields
+         - reorder fields to be sorted by offset
+         - discard field titles
+
+        ``as_ctypes_type(np.dtype(ctype))`` will:
+         - discard the class names of ``Structure``s and ``Union``s
+         - convert single-element ``Union``s into single-element ``Structure``s
+         - insert padding fields
+
+        """
+        return _ctype_from_dtype(_dtype(dtype))
 
-if ctypes is not None:
-    _typecodes = _get_typecodes()
 
     def as_array(obj, shape=None):
         """
@@ -388,6 +518,7 @@ if ctypes is not None:
 
         return array(obj, copy=False)
 
+
     def as_ctypes(obj):
         """Create and return a ctypes object from a numpy array.  Actually
         anything that exposes the __array_interface__ is accepted."""
@@ -399,7 +530,8 @@ if ctypes is not None:
         addr, readonly = ai["data"]
         if readonly:
             raise TypeError("readonly arrays unsupported")
-        tp = _ctype_ndarray(_typecodes[ai["typestr"]], ai["shape"])
-        result = tp.from_address(addr)
+
+        dtype = _dtype((ai["typestr"], ai["shape"]))
+        result = as_ctypes_type(dtype).from_address(addr)
         result.__keep = obj
         return result
diff --git a/numpy/distutils/fcompiler/environment.py b/numpy/distutils/fcompiler/environment.py
index 489784580..4238f35cb 100644
--- a/numpy/distutils/fcompiler/environment.py
+++ b/numpy/distutils/fcompiler/environment.py
@@ -1,6 +1,7 @@
 from __future__ import division, absolute_import, print_function
 
 import os
+import warnings
 from distutils.dist import Distribution
 
 __metaclass__ = type
@@ -54,8 +55,18 @@ class EnvironmentConfig(object):
         if envvar is not None:
             envvar_contents = os.environ.get(envvar)
             if envvar_contents is not None:
-                if var and append and os.environ.get('NPY_DISTUTILS_APPEND_FLAGS', '0') == '1':
-                    var = var + [envvar_contents]
+                if var and append:
+                    if os.environ.get('NPY_DISTUTILS_APPEND_FLAGS', '0') == '1':
+                        var = var + [envvar_contents]
+                    else:
+                        var = envvar_contents
+                        if 'NPY_DISTUTILS_APPEND_FLAGS' not in os.environ.keys():
+                            msg = "{} is used as is, not appended ".format(envvar) + \
+                                  "to flags already defined " + \
+                                  "by numpy.distutils! Use NPY_DISTUTILS_APPEND_FLAGS=1 " + \
+                                  "to obtain appending behavior instead (this " + \
+                                  "behavior will become default in a future release)."
+                            warnings.warn(msg, UserWarning, stacklevel=3)
                 else:
                     var = envvar_contents
         if confvar is not None and self._conf:
diff --git a/numpy/distutils/tests/test_fcompiler.py b/numpy/distutils/tests/test_fcompiler.py
index 95e44b051..ba19a97ea 100644
--- a/numpy/distutils/tests/test_fcompiler.py
+++ b/numpy/distutils/tests/test_fcompiler.py
@@ -1,6 +1,8 @@
 from __future__ import division, absolute_import, print_function
 
-from numpy.testing import assert_
+import pytest
+
+from numpy.testing import assert_, suppress_warnings
 import numpy.distutils.fcompiler
 
 customizable_flags = [
@@ -25,6 +27,7 @@ def test_fcompiler_flags(monkeypatch):
 
         monkeypatch.setenv(envvar, new_flag)
         new_flags = getattr(flag_vars, opt)
+
         monkeypatch.delenv(envvar)
         assert_(new_flags == [new_flag])
 
@@ -33,12 +36,46 @@ def test_fcompiler_flags(monkeypatch):
     for opt, envvar in customizable_flags:
         new_flag = '-dummy-{}-flag'.format(opt)
         prev_flags = getattr(flag_vars, opt)
-
         monkeypatch.setenv(envvar, new_flag)
         new_flags = getattr(flag_vars, opt)
+
         monkeypatch.delenv(envvar)
         if prev_flags is None:
             assert_(new_flags == [new_flag])
         else:
             assert_(new_flags == prev_flags + [new_flag])
 
+
+def test_fcompiler_flags_append_warning(monkeypatch):
+    # Test to check that the warning for append behavior changing in future
+    # is triggered.  Need to use a real compiler instance so that we have
+    # non-empty flags to start with (otherwise the "if var and append" check
+    # will always be false).
+    try:
+        with suppress_warnings() as sup:
+            sup.record()
+            fc = numpy.distutils.fcompiler.new_fcompiler(compiler='gnu95')
+            fc.customize()
+    except numpy.distutils.fcompiler.CompilerNotFound:
+        pytest.skip("gfortran not found, so can't execute this test")
+
+    # Ensure NPY_DISTUTILS_APPEND_FLAGS not defined
+    monkeypatch.delenv('NPY_DISTUTILS_APPEND_FLAGS', raising=False)
+
+    for opt, envvar in customizable_flags:
+        new_flag = '-dummy-{}-flag'.format(opt)
+        with suppress_warnings() as sup:
+            sup.record()
+            prev_flags = getattr(fc.flag_vars, opt)
+
+        monkeypatch.setenv(envvar, new_flag)
+        with suppress_warnings() as sup:
+            sup.record()
+            new_flags = getattr(fc.flag_vars, opt)
+            if prev_flags:
+                # Check that warning was issued
+                assert len(sup.log) == 1
+
+        monkeypatch.delenv(envvar)
+        assert_(new_flags == [new_flag])
+
diff --git a/numpy/doc/structured_arrays.py b/numpy/doc/structured_arrays.py
index e92a06124..da3a74bd6 100644
--- a/numpy/doc/structured_arrays.py
+++ b/numpy/doc/structured_arrays.py
@@ -13,8 +13,8 @@ datatypes organized as a sequence of named :term:`fields <field>`. For example,
  >>> x = np.array([('Rex', 9, 81.0), ('Fido', 3, 27.0)],
  ...              dtype=[('name', 'U10'), ('age', 'i4'), ('weight', 'f4')])
  >>> x
- array([('Rex', 9, 81.0), ('Fido', 3, 27.0)],
-       dtype=[('name', 'S10'), ('age', '<i4'), ('weight', '<f4')])
+ array([('Rex', 9, 81.), ('Fido', 3, 27.)],
+       dtype=[('name', 'U10'), ('age', '<i4'), ('weight', '<f4')])
 
 Here ``x`` is a one-dimensional array of length two whose datatype is a
 structure with three fields: 1. A string of length 10 or less named 'name', 2.
@@ -32,8 +32,8 @@ with the field name::
  array([9, 3], dtype=int32)
  >>> x['age'] = 5
  >>> x
- array([('Rex', 5, 81.0), ('Fido', 5, 27.0)],
-       dtype=[('name', 'S10'), ('age', '<i4'), ('weight', '<f4')])
+ array([('Rex', 5, 81.), ('Fido', 5, 27.)],
+       dtype=[('name', 'U10'), ('age', '<i4'), ('weight', '<f4')])
 
 Structured datatypes are designed to be able to mimic 'structs' in the C
 language, and share a similar memory layout. They are meant for interfacing with
@@ -79,14 +79,14 @@ summary they are:
      convertible to a datatype, and ``shape`` is a tuple of integers specifying
      subarray shape.
 
-      >>> np.dtype([('x', 'f4'), ('y', np.float32), ('z', 'f4', (2,2))])
-      dtype=[('x', '<f4'), ('y', '<f4'), ('z', '<f4', (2, 2))])
+      >>> np.dtype([('x', 'f4'), ('y', np.float32), ('z', 'f4', (2, 2))])
+      dtype([('x', '<f4'), ('y', '<f4'), ('z', '<f4', (2, 2))])
 
      If ``fieldname`` is the empty string ``''``, the field will be given a
      default name of the form ``f#``, where ``#`` is the integer index of the
      field, counting from 0 from the left::
 
-      >>> np.dtype([('x', 'f4'),('', 'i4'),('z', 'i8')])
+      >>> np.dtype([('x', 'f4'), ('', 'i4'), ('z', 'i8')])
       dtype([('x', '<f4'), ('f1', '<i4'), ('z', '<i8')])
 
      The byte offsets of the fields within the structure and the total
@@ -100,10 +100,10 @@ summary they are:
      automatically, and the field names are given the default names ``f0``,
      ``f1``, etc. ::
 
-      >>> np.dtype('i8,f4,S3')
+      >>> np.dtype('i8, f4, S3')
       dtype([('f0', '<i8'), ('f1', '<f4'), ('f2', 'S3')])
-      >>> np.dtype('3int8, float32, (2,3)float64')
-      dtype([('f0', 'i1', 3), ('f1', '<f4'), ('f2', '<f8', (2, 3))])
+      >>> np.dtype('3int8, float32, (2, 3)float64')
+      dtype([('f0', 'i1', (3,)), ('f1', '<f4'), ('f2', '<f8', (2, 3))])
 
 3.   A dictionary of field parameter arrays
 
@@ -121,10 +121,10 @@ summary they are:
      enough to contain all the fields.
      ::
 
-      >>> np.dtype({'names': ['col1', 'col2'], 'formats': ['i4','f4']})
+      >>> np.dtype({'names': ['col1', 'col2'], 'formats': ['i4', 'f4']})
       dtype([('col1', '<i4'), ('col2', '<f4')])
       >>> np.dtype({'names': ['col1', 'col2'],
-      ...           'formats': ['i4','f4'],
+      ...           'formats': ['i4', 'f4'],
       ...           'offsets': [0, 4],
       ...           'itemsize': 12})
       dtype({'names':['col1','col2'], 'formats':['<i4','<f4'], 'offsets':[0,4], 'itemsize':12})
@@ -149,8 +149,8 @@ summary they are:
      because older numpy code may use it. The keys of the dictionary are the
      field names and the values are tuples specifying type and offset::
 
-      >>> np.dtype=({'col1': ('i1',0), 'col2': ('f4',1)})
-      dtype([(('col1'), 'i1'), (('col2'), '>f4')])
+      >>> np.dtype({'col1': ('i1', 0), 'col2': ('f4', 1)})
+      dtype([('col1', 'i1'), ('col2', '<f4')])
 
      This form is discouraged because Python dictionaries do not preserve order
      in Python versions before Python 3.6, and the order of the fields in a
@@ -202,7 +202,7 @@ are contiguous in memory. ::
  >>> def print_offsets(d):
  ...     print("offsets:", [d.fields[name][1] for name in d.names])
  ...     print("itemsize:", d.itemsize)
- >>> print_offsets(np.dtype('u1,u1,i4,u1,i8,u2'))
+ >>> print_offsets(np.dtype('u1, u1, i4, u1, i8, u2'))
  offsets: [0, 1, 2, 6, 7, 15]
  itemsize: 17
 
@@ -215,7 +215,7 @@ in bytes for simple datatypes, see :c:member:`PyArray_Descr.alignment`.  The
 structure will also have trailing padding added so that its itemsize is a
 multiple of the largest field's alignment. ::
 
- >>> print_offsets(np.dtype('u1,u1,i4,u1,i8,u2', align=True))
+ >>> print_offsets(np.dtype('u1, u1, i4, u1, i8, u2', align=True))
  offsets: [0, 1, 4, 8, 16, 24]
  itemsize: 32
 
@@ -255,6 +255,7 @@ string, which will be the field's title and field name respectively. For
 example::
 
  >>> np.dtype([(('my title', 'name'), 'f4')])
+ dtype([(('my title', 'name'), '<f4')])
 
 When using the first form of dictionary-based specification, the titles may be
 supplied as an extra ``'titles'`` key as described above. When using the second
@@ -263,6 +264,7 @@ providing a 3-element tuple ``(datatype, offset, title)`` instead of the usual
 2-element tuple::
 
  >>> np.dtype({'name': ('i4', 0, 'my title')})
+ dtype([(('my title', 'name'), '<i4')])
 
 The ``dtype.fields`` dictionary will contain :term:`titles` as keys, if any
 titles are used.  This means effectively that a field with a title will be
@@ -275,6 +277,8 @@ in::
 
  >>> for name in d.names:
  ...     print(d.fields[name][:2])
+ (dtype('int64'), 0)
+ (dtype('float32'), 8)
 
 Union types
 -----------
@@ -305,8 +309,8 @@ in the array, and not a list or array as these will trigger numpy's
 broadcasting rules. The tuple's elements are assigned to the successive fields
 of the array, from left to right::
 
- >>> x = np.array([(1,2,3),(4,5,6)], dtype='i8,f4,f8')
- >>> x[1] = (7,8,9)
+ >>> x = np.array([(1, 2, 3), (4, 5, 6)], dtype='i8, f4, f8')
+ >>> x[1] = (7, 8, 9)
  >>> x
  array([(1, 2., 3.), (7, 8., 9.)],
       dtype=[('f0', '<i8'), ('f1', '<f4'), ('f2', '<f8')])
@@ -318,14 +322,14 @@ A scalar assigned to a structured element will be assigned to all fields. This
 happens when a scalar is assigned to a structured array, or when an
 unstructured array is assigned to a structured array::
 
- >>> x = np.zeros(2, dtype='i8,f4,?,S1')
+ >>> x = np.zeros(2, dtype='i8, f4, ?, S1')
  >>> x[:] = 3
  >>> x
- array([(3, 3.0, True, b'3'), (3, 3.0, True, b'3')],
+ array([(3, 3., True, b'3'), (3, 3., True, b'3')],
        dtype=[('f0', '<i8'), ('f1', '<f4'), ('f2', '?'), ('f3', 'S1')])
  >>> x[:] = np.arange(2)
  >>> x
- array([(0, 0.0, False, b'0'), (1, 1.0, True, b'1')],
+ array([(0, 0., False, b'0'), (1, 1., True, b'1')],
        dtype=[('f0', '<i8'), ('f1', '<f4'), ('f2', '?'), ('f3', 'S1')])
 
 Structured arrays can also be assigned to unstructured arrays, but only if the
@@ -335,6 +339,8 @@ structured datatype has just a single field::
  >>> onefield = np.zeros(2, dtype=[('A', 'i4')])
  >>> nostruct = np.zeros(2, dtype='i4')
  >>> nostruct[:] = twofield
+ Traceback (most recent call last):
+    File "<stdin>", line 1, in <module>
  ValueError: Can't cast from structure to non-structure, except if the structure only has a single field.
  >>> nostruct[:] = onefield
  >>> nostruct
@@ -355,7 +361,7 @@ included in any of the fields are unaffected. ::
  >>> b = np.ones(3, dtype=[('x', 'f4'), ('y', 'S3'), ('z', 'O')])
  >>> b[:] = a
  >>> b
- array([(0.0, b'0.0', b''), (0.0, b'0.0', b''), (0.0, b'0.0', b'')],
+ array([(0., b'0.0', b''), (0., b'0.0', b''), (0., b'0.0', b'')],
        dtype=[('x', '<f4'), ('y', 'S3'), ('z', 'O')])
 
 
@@ -374,7 +380,7 @@ Accessing Individual Fields
 Individual fields of a structured array may be accessed and modified by indexing
 the array with the field name. ::
 
- >>> x = np.array([(1,2),(3,4)], dtype=[('foo', 'i8'), ('bar', 'f4')])
+ >>> x = np.array([(1, 2), (3, 4)], dtype=[('foo', 'i8'), ('bar', 'f4')])
  >>> x['foo']
  array([1, 3])
  >>> x['foo'] = 10
@@ -386,9 +392,9 @@ The resulting array is a view into the original array. It shares the same
 memory locations and writing to the view will modify the original array. ::
 
  >>> y = x['bar']
- >>> y[:] = 10
+ >>> y[:] = 11
  >>> x
- array([(10, 5.), (10, 5.)],
+ array([(10, 11.), (10, 11.)],
        dtype=[('foo', '<i8'), ('bar', '<f4')])
 
 This view has the same dtype and itemsize as the indexed field, so it is
@@ -400,7 +406,7 @@ typically a non-structured array, except in the case of nested structures.
 If the accessed field is a subarray, the dimensions of the subarray
 are appended to the shape of the result::
 
-   >>> x = np.zeros((2,2), dtype=[('a', np.int32), ('b', np.float64, (3,3))])
+   >>> x = np.zeros((2, 2), dtype=[('a', np.int32), ('b', np.float64, (3, 3))])
    >>> x['a'].shape
    (2, 2)
    >>> x['b'].shape
@@ -438,8 +444,9 @@ same offsets as in the original array, and unindexed fields are merely missing.
     code which depends on the data having a "packed" layout. For instance code
     such as::
 
-     >>> a = np.zeros(3, dtype=[('a', 'i4'), ('b', 'i4'), ('c', 'f4')])
-     >>> a[['a','c']].view('i8')  # Fails in Numpy 1.16
+     >>> a[['a', 'c']].view('i8')  # Fails in Numpy 1.16
+     Traceback (most recent call last):
+        File "<stdin>", line 1, in <module>
      ValueError: When changing to a smaller dtype, its size must be a divisor of the size of original dtype
 
     will need to be changed. This code has raised a ``FutureWarning`` since
@@ -459,7 +466,8 @@ same offsets as in the original array, and unindexed fields are merely missing.
     used to reproduce the old behavior, as it will return a packed copy of the
     structured array. The code above, for example, can be replaced with:
 
-     >>> repack_fields(a[['a','c']]).view('i8')  # supported in 1.16
+     >>> from numpy.lib.recfunctions import repack_fields
+     >>> repack_fields(a[['a', 'c']]).view('i8')  # supported in 1.16
      array([0, 0, 0])
 
     Furthermore, numpy now provides a new function
@@ -470,12 +478,14 @@ same offsets as in the original array, and unindexed fields are merely missing.
     account padding, often avoids a copy, and also casts the datatypes
     as needed, unlike the view. Code such as:
 
-     >>> a = np.zeros(3, dtype=[('x', 'f4'), ('y', 'f4'), ('z', 'f4')])
-     >>> a[['x', 'z']].view('f4')
+     >>> b = np.zeros(3, dtype=[('x', 'f4'), ('y', 'f4'), ('z', 'f4')])
+     >>> b[['x', 'z']].view('f4')
+     array([0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)
 
     can be made safer by replacing with:
 
-     >>> structured_to_unstructured(a[['x', 'z']])
+     >>> from numpy.lib.recfunctions import structured_to_unstructured
+     >>> structured_to_unstructured(b[['x', 'z']])
      array([0, 0, 0])
 
 
@@ -483,8 +493,8 @@ Assignment to an array with a multi-field index modifies the original array::
 
  >>> a[['a', 'c']] = (2, 3)
  >>> a
- array([(2, 0, 3.0), (2, 0, 3.0), (2, 0, 3.0)],
-       dtype=[('a', '<i8'), ('b', '<i4'), ('c', '<f8')])
+ array([(2, 0, 3.), (2, 0, 3.), (2, 0, 3.)],
+       dtype=[('a', '<i4'), ('b', '<i4'), ('c', '<f4')])
 
 This obeys the structured array assignment rules described above. For example,
 this means that one can swap the values of two fields using appropriate
@@ -498,19 +508,19 @@ Indexing with an Integer to get a Structured Scalar
 Indexing a single element of a structured array (with an integer index) returns
 a structured scalar::
 
- >>> x = np.array([(1, 2., 3.)], dtype='i,f,f')
+ >>> x = np.array([(1, 2., 3.)], dtype='i, f, f')
  >>> scalar = x[0]
  >>> scalar
  (1, 2., 3.)
  >>> type(scalar)
- numpy.void
+ <class 'numpy.void'>
 
 Unlike other numpy scalars, structured scalars are mutable and act like views
 into the original array, such that modifying the scalar will modify the
 original array. Structured scalars also support access and assignment by field
 name::
 
- >>> x = np.array([(1,2),(3,4)], dtype=[('foo', 'i8'), ('bar', 'f4')])
+ >>> x = np.array([(1, 2), (3, 4)], dtype=[('foo', 'i8'), ('bar', 'f4')])
  >>> s = x[0]
  >>> s['bar'] = 100
  >>> x
@@ -519,7 +529,7 @@ name::
 
 Similarly to tuples, structured scalars can also be indexed with an integer::
 
- >>> scalar = np.array([(1, 2., 3.)], dtype='i,f,f')[0]
+ >>> scalar = np.array([(1, 2., 3.)], dtype='i, f, f')[0]
  >>> scalar[0]
  1
  >>> scalar[1] = 4
@@ -530,7 +540,7 @@ numpy's integer types. Structured scalars may be converted to a tuple by
 calling :func:`ndarray.item`::
 
  >>> scalar.item(), type(scalar.item())
- ((1, 2.0, 3.0), tuple)
+ ((1, 4.0, 3.0), <class 'tuple'>)
 
 Viewing Structured Arrays Containing Objects
 --------------------------------------------
@@ -574,24 +584,24 @@ structured scalars obtained from the array.
 
 The simplest way to create a record array is with :func:`numpy.rec.array`::
 
- >>> recordarr = np.rec.array([(1,2.,'Hello'),(2,3.,"World")],
+ >>> recordarr = np.rec.array([(1, 2., 'Hello'), (2, 3., "World")],
  ...                    dtype=[('foo', 'i4'),('bar', 'f4'), ('baz', 'S10')])
  >>> recordarr.bar
  array([ 2.,  3.], dtype=float32)
  >>> recordarr[1:2]
- rec.array([(2, 3.0, 'World')],
+ rec.array([(2, 3., b'World')],
        dtype=[('foo', '<i4'), ('bar', '<f4'), ('baz', 'S10')])
  >>> recordarr[1:2].foo
  array([2], dtype=int32)
  >>> recordarr.foo[1:2]
  array([2], dtype=int32)
  >>> recordarr[1].baz
- 'World'
+ b'World'
 
 :func:`numpy.rec.array` can convert a wide variety of arguments into record
 arrays, including structured arrays::
 
- >>> arr = array([(1,2.,'Hello'),(2,3.,"World")],
+ >>> arr = np.array([(1, 2., 'Hello'), (2, 3., "World")],
  ...             dtype=[('foo', 'i4'), ('bar', 'f4'), ('baz', 'S10')])
  >>> recordarr = np.rec.array(arr)
 
@@ -602,9 +612,9 @@ creating record arrays, see :ref:`record array creation routines
 A record array representation of a structured array can be obtained using the
 appropriate :ref:`view`::
 
- >>> arr = np.array([(1,2.,'Hello'),(2,3.,"World")],
+ >>> arr = np.array([(1, 2., 'Hello'), (2, 3., "World")],
  ...                dtype=[('foo', 'i4'),('bar', 'f4'), ('baz', 'a10')])
- >>> recordarr = arr.view(dtype=dtype((np.record, arr.dtype)),
+ >>> recordarr = arr.view(dtype=np.dtype((np.record, arr.dtype)),
  ...                      type=np.recarray)
 
 For convenience, viewing an ndarray as type :class:`np.recarray` will
@@ -624,12 +634,12 @@ recordarr was not a structured type::
 Record array fields accessed by index or by attribute are returned as a record
 array if the field has a structured type but as a plain ndarray otherwise. ::
 
- >>> recordarr = np.rec.array([('Hello', (1,2)),("World", (3,4))],
+ >>> recordarr = np.rec.array([('Hello', (1, 2)), ("World", (3, 4))],
  ...                 dtype=[('foo', 'S6'),('bar', [('A', int), ('B', int)])])
  >>> type(recordarr.foo)
- <type 'numpy.ndarray'>
+ <class 'numpy.ndarray'>
  >>> type(recordarr.bar)
- <class 'numpy.core.records.recarray'>
+ <class 'numpy.recarray'>
 
 Note that if a field has the same name as an ndarray attribute, the ndarray
 attribute takes precedence. Such fields will be inaccessible by attribute but
diff --git a/numpy/dual.py b/numpy/dual.py
index 3a16a8ec5..651e845bb 100644
--- a/numpy/dual.py
+++ b/numpy/dual.py
@@ -51,14 +51,14 @@ _restore_dict = {}
 
 def register_func(name, func):
     if name not in __all__:
-        raise ValueError("%s not a dual function." % name)
+        raise ValueError("{} not a dual function.".format(name))
     f = sys._getframe(0).f_globals
     _restore_dict[name] = f[name]
     f[name] = func
 
 def restore_func(name):
     if name not in __all__:
-        raise ValueError("%s not a dual function." % name)
+        raise ValueError("{} not a dual function.".format(name))
     try:
         val = _restore_dict[name]
     except KeyError:
diff --git a/numpy/f2py/crackfortran.py b/numpy/f2py/crackfortran.py
index 2620fc9b7..c4a650585 100755
--- a/numpy/f2py/crackfortran.py
+++ b/numpy/f2py/crackfortran.py
@@ -2399,7 +2399,7 @@ def _selected_real_kind_func(p, r=0, radix=0):
     if p < 16:
         return 8
     machine = platform.machine().lower()
-    if machine.startswith(('aarch64', 'power', 'ppc64', 's390x')):
+    if machine.startswith(('aarch64', 'power', 'ppc64', 's390x', 'sparc')):
         if p <= 20:
             return 16
     else:
diff --git a/numpy/fft/README.md b/numpy/fft/README.md
new file mode 100644
index 000000000..7040a2e9b
--- /dev/null
+++ b/numpy/fft/README.md
@@ -0,0 +1,53 @@
+PocketFFT
+---------
+
+This is a heavily modified implementation of FFTPack [1,2], with the following
+advantages:
+
+- strictly C99 compliant
+- more accurate twiddle factor computation
+- very fast plan generation
+- worst case complexity for transform sizes with large prime factors is
+  `N*log(N)`, because Bluestein's algorithm [3] is used for these cases.
+
+License
+-------
+
+3-clause BSD (see LICENSE.md)
+
+
+Some code details
+-----------------
+
+Twiddle factor computation:
+
+- making use of symmetries to reduce number of sin/cos evaluations
+- all angles are reduced to the range `[0; pi/4]` for higher accuracy
+- an adapted implementation of `sincospi()` is used, which actually computes
+  `sin(x)` and `(cos(x)-1)`.
+- if `n` sin/cos pairs are required, the adjusted `sincospi()` is only called
+  `2*sqrt(n)` times; the remaining values are obtained by evaluating the
+  angle addition theorems in a numerically accurate way.
+
+Parallel invocation:
+
+- Plans only contain read-only data; all temporary arrays are allocated and
+  deallocated during an individual FFT execution. This means that a single plan
+  can be used in several threads at the same time.
+
+Efficient codelets are available for the factors:
+
+- 2, 3, 4, 5, 7, 11 for complex-valued FFTs
+- 2, 3, 4, 5 for real-valued FFTs
+
+Larger prime factors are handled by somewhat less efficient, generic routines.
+
+For lengths with very large prime factors, Bluestein's algorithm is used, and
+instead of an FFT of length `n`, a convolution of length `n2 >= 2*n-1`
+is performed, where `n2` is chosen to be highly composite.
+
+
+[1] Swarztrauber, P. 1982, Vectorizing the Fast Fourier Transforms
+    (New York: Academic Press), 51
+[2] https://www.netlib.org/fftpack/
+[3] https://en.wikipedia.org/wiki/Chirp_Z-transform
diff --git a/numpy/fft/__init__.py b/numpy/fft/__init__.py
index 44243b483..64b35bc19 100644
--- a/numpy/fft/__init__.py
+++ b/numpy/fft/__init__.py
@@ -3,7 +3,7 @@ from __future__ import division, absolute_import, print_function
 # To get sub-modules
 from .info import __doc__
 
-from .fftpack import *
+from .pocketfft import *
 from .helper import *
 
 from numpy._pytesttester import PytestTester
diff --git a/numpy/fft/fftpack.c b/numpy/fft/fftpack.c
deleted file mode 100644
index 07fa2bf4c..000000000
--- a/numpy/fft/fftpack.c
+++ /dev/null
@@ -1,1536 +0,0 @@
-/*
- * fftpack.c : A set of FFT routines in C.
- * Algorithmically based on Fortran-77 FFTPACK by Paul N. Swarztrauber (Version 4, 1985).
-*/
-#define NPY_NO_DEPRECATED_API NPY_API_VERSION
-
-#include <Python.h>
-#include <math.h>
-#include <stdio.h>
-#include <numpy/ndarraytypes.h>
-
-#define DOUBLE
-#ifdef DOUBLE
-#define Treal double
-#else
-#define Treal float
-#endif
-
-#define ref(u,a) u[a]
-
-/* Macros for accurate calculation of the twiddle factors. */
-#define TWOPI 6.283185307179586476925286766559005768391
-#define cos2pi(m, n) cos((TWOPI * (m)) / (n))
-#define sin2pi(m, n) sin((TWOPI * (m)) / (n))
-
-#define MAXFAC 13    /* maximum number of factors in factorization of n */
-#define NSPECIAL 4   /* number of factors for which we have special-case routines */
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-static void sincos2pi(int m, int n, Treal* si, Treal* co)
-/* Calculates sin(2pi * m/n) and cos(2pi * m/n). It is more accurate
- * than the naive calculation as the fraction m/n is reduced to [0, 1/8) first.
- * Due to the symmetry of sin(x) and cos(x) the values for all x can be
- * determined from the function values of the reduced argument in the first
- * octant.
- */
-    {
-        int n8, m8, octant;
-        n8 = 8 * n;
-        m8 = (8 * m) % n8;
-        octant = m8 / n;
-        m8 = m8 % n;
-        switch(octant) {
-            case 0:
-                *co = cos2pi(m8, n8);
-                *si = sin2pi(m8, n8);
-                break;
-            case 1:
-                *co = sin2pi(n-m8, n8);
-                *si = cos2pi(n-m8, n8);
-                break;
-            case 2:
-                *co = -sin2pi(m8, n8);
-                *si = cos2pi(m8, n8);
-                break;
-            case 3:
-                *co = -cos2pi(n-m8, n8);
-                *si = sin2pi(n-m8, n8);
-                break;
-            case 4:
-                *co = -cos2pi(m8, n8);
-                *si = -sin2pi(m8, n8);
-                break;
-            case 5:
-                *co = -sin2pi(n-m8, n8);
-                *si = -cos2pi(n-m8, n8);
-                break;
-            case 6:
-                *co = sin2pi(m8, n8);
-                *si = -cos2pi(m8, n8);
-                break;
-            case 7:
-                *co = cos2pi(n-m8, n8);
-                *si = -sin2pi(n-m8, n8);
-                break;
-        }
-    }
-
-/* ----------------------------------------------------------------------
-   passf2, passf3, passf4, passf5, passf. Complex FFT passes fwd and bwd.
------------------------------------------------------------------------ */
-
-static void passf2(int ido, int l1, const Treal cc[], Treal ch[], const Treal wa1[], int isign)
-  /* isign==+1 for backward transform */
-  {
-    int i, k, ah, ac;
-    Treal ti2, tr2;
-    if (ido <= 2) {
-      for (k=0; k<l1; k++) {
-        ah = k*ido;
-        ac = 2*k*ido;
-        ch[ah]              = ref(cc,ac) + ref(cc,ac + ido);
-        ch[ah + ido*l1]     = ref(cc,ac) - ref(cc,ac + ido);
-        ch[ah+1]            = ref(cc,ac+1) + ref(cc,ac + ido + 1);
-        ch[ah + ido*l1 + 1] = ref(cc,ac+1) - ref(cc,ac + ido + 1);
-      }
-    } else {
-      for (k=0; k<l1; k++) {
-        for (i=0; i<ido-1; i+=2) {
-          ah = i + k*ido;
-          ac = i + 2*k*ido;
-          ch[ah]   = ref(cc,ac) + ref(cc,ac + ido);
-          tr2      = ref(cc,ac) - ref(cc,ac + ido);
-          ch[ah+1] = ref(cc,ac+1) + ref(cc,ac + 1 + ido);
-          ti2      = ref(cc,ac+1) - ref(cc,ac + 1 + ido);
-          ch[ah+l1*ido+1] = wa1[i]*ti2 + isign*wa1[i+1]*tr2;
-          ch[ah+l1*ido]   = wa1[i]*tr2 - isign*wa1[i+1]*ti2;
-        }
-      }
-    }
-  } /* passf2 */
-
-
-static void passf3(int ido, int l1, const Treal cc[], Treal ch[],
-      const Treal wa1[], const Treal wa2[], int isign)
-  /* isign==+1 for backward transform */
-  {
-    static const Treal taur = -0.5;
-    static const Treal taui = 0.86602540378443864676;
-    int i, k, ac, ah;
-    Treal ci2, ci3, di2, di3, cr2, cr3, dr2, dr3, ti2, tr2;
-    if (ido == 2) {
-      for (k=1; k<=l1; k++) {
-        ac = (3*k - 2)*ido;
-        tr2 = ref(cc,ac) + ref(cc,ac + ido);
-        cr2 = ref(cc,ac - ido) + taur*tr2;
-        ah = (k - 1)*ido;
-        ch[ah] = ref(cc,ac - ido) + tr2;
-
-        ti2 = ref(cc,ac + 1) + ref(cc,ac + ido + 1);
-        ci2 = ref(cc,ac - ido + 1) + taur*ti2;
-        ch[ah + 1] = ref(cc,ac - ido + 1) + ti2;
-
-        cr3 = isign*taui*(ref(cc,ac) - ref(cc,ac + ido));
-        ci3 = isign*taui*(ref(cc,ac + 1) - ref(cc,ac + ido + 1));
-        ch[ah + l1*ido] = cr2 - ci3;
-        ch[ah + 2*l1*ido] = cr2 + ci3;
-        ch[ah + l1*ido + 1] = ci2 + cr3;
-        ch[ah + 2*l1*ido + 1] = ci2 - cr3;
-      }
-    } else {
-      for (k=1; k<=l1; k++) {
-        for (i=0; i<ido-1; i+=2) {
-          ac = i + (3*k - 2)*ido;
-          tr2 = ref(cc,ac) + ref(cc,ac + ido);
-          cr2 = ref(cc,ac - ido) + taur*tr2;
-          ah = i + (k-1)*ido;
-          ch[ah] = ref(cc,ac - ido) + tr2;
-          ti2 = ref(cc,ac + 1) + ref(cc,ac + ido + 1);
-          ci2 = ref(cc,ac - ido + 1) + taur*ti2;
-          ch[ah + 1] = ref(cc,ac - ido + 1) + ti2;
-          cr3 = isign*taui*(ref(cc,ac) - ref(cc,ac + ido));
-          ci3 = isign*taui*(ref(cc,ac + 1) - ref(cc,ac + ido + 1));
-          dr2 = cr2 - ci3;
-          dr3 = cr2 + ci3;
-          di2 = ci2 + cr3;
-          di3 = ci2 - cr3;
-          ch[ah + l1*ido + 1] = wa1[i]*di2 + isign*wa1[i+1]*dr2;
-          ch[ah + l1*ido] = wa1[i]*dr2 - isign*wa1[i+1]*di2;
-          ch[ah + 2*l1*ido + 1] = wa2[i]*di3 + isign*wa2[i+1]*dr3;
-          ch[ah + 2*l1*ido] = wa2[i]*dr3 - isign*wa2[i+1]*di3;
-        }
-      }
-    }
-  } /* passf3 */
-
-
-static void passf4(int ido, int l1, const Treal cc[], Treal ch[],
-      const Treal wa1[], const Treal wa2[], const Treal wa3[], int isign)
-  /* isign == -1 for forward transform and +1 for backward transform */
-  {
-    int i, k, ac, ah;
-    Treal ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
-    if (ido == 2) {
-      for (k=0; k<l1; k++) {
-        ac = 4*k*ido + 1;
-        ti1 = ref(cc,ac) - ref(cc,ac + 2*ido);
-        ti2 = ref(cc,ac) + ref(cc,ac + 2*ido);
-        tr4 = ref(cc,ac + 3*ido) - ref(cc,ac + ido);
-        ti3 = ref(cc,ac + ido) + ref(cc,ac + 3*ido);
-        tr1 = ref(cc,ac - 1) - ref(cc,ac + 2*ido - 1);
-        tr2 = ref(cc,ac - 1) + ref(cc,ac + 2*ido - 1);
-        ti4 = ref(cc,ac + ido - 1) - ref(cc,ac + 3*ido - 1);
-        tr3 = ref(cc,ac + ido - 1) + ref(cc,ac + 3*ido - 1);
-        ah = k*ido;
-        ch[ah] = tr2 + tr3;
-        ch[ah + 2*l1*ido] = tr2 - tr3;
-        ch[ah + 1] = ti2 + ti3;
-        ch[ah + 2*l1*ido + 1] = ti2 - ti3;
-        ch[ah + l1*ido] = tr1 + isign*tr4;
-        ch[ah + 3*l1*ido] = tr1 - isign*tr4;
-        ch[ah + l1*ido + 1] = ti1 + isign*ti4;
-        ch[ah + 3*l1*ido + 1] = ti1 - isign*ti4;
-      }
-    } else {
-      for (k=0; k<l1; k++) {
-        for (i=0; i<ido-1; i+=2) {
-          ac = i + 1 + 4*k*ido;
-          ti1 = ref(cc,ac) - ref(cc,ac + 2*ido);
-          ti2 = ref(cc,ac) + ref(cc,ac + 2*ido);
-          ti3 = ref(cc,ac + ido) + ref(cc,ac + 3*ido);
-          tr4 = ref(cc,ac + 3*ido) - ref(cc,ac + ido);
-          tr1 = ref(cc,ac - 1) - ref(cc,ac + 2*ido - 1);
-          tr2 = ref(cc,ac - 1) + ref(cc,ac + 2*ido - 1);
-          ti4 = ref(cc,ac + ido - 1) - ref(cc,ac + 3*ido - 1);
-          tr3 = ref(cc,ac + ido - 1) + ref(cc,ac + 3*ido - 1);
-          ah = i + k*ido;
-          ch[ah] = tr2 + tr3;
-          cr3 = tr2 - tr3;
-          ch[ah + 1] = ti2 + ti3;
-          ci3 = ti2 - ti3;
-          cr2 = tr1 + isign*tr4;
-          cr4 = tr1 - isign*tr4;
-          ci2 = ti1 + isign*ti4;
-          ci4 = ti1 - isign*ti4;
-          ch[ah + l1*ido] = wa1[i]*cr2 - isign*wa1[i + 1]*ci2;
-          ch[ah + l1*ido + 1] = wa1[i]*ci2 + isign*wa1[i + 1]*cr2;
-          ch[ah + 2*l1*ido] = wa2[i]*cr3 - isign*wa2[i + 1]*ci3;
-          ch[ah + 2*l1*ido + 1] = wa2[i]*ci3 + isign*wa2[i + 1]*cr3;
-          ch[ah + 3*l1*ido] = wa3[i]*cr4 -isign*wa3[i + 1]*ci4;
-          ch[ah + 3*l1*ido + 1] = wa3[i]*ci4 + isign*wa3[i + 1]*cr4;
-        }
-      }
-    }
-  } /* passf4 */
-
-
-static void passf5(int ido, int l1, const Treal cc[], Treal ch[],
-      const Treal wa1[], const Treal wa2[], const Treal wa3[], const Treal wa4[], int isign)
-  /* isign == -1 for forward transform and +1 for backward transform */
-  {
-    static const Treal tr11 = 0.3090169943749474241;
-    static const Treal ti11 = 0.95105651629515357212;
-    static const Treal tr12 = -0.8090169943749474241;
-    static const Treal ti12 = 0.58778525229247312917;
-    int i, k, ac, ah;
-    Treal ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3,
-        ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;
-    if (ido == 2) {
-      for (k = 1; k <= l1; ++k) {
-        ac = (5*k - 4)*ido + 1;
-        ti5 = ref(cc,ac) - ref(cc,ac + 3*ido);
-        ti2 = ref(cc,ac) + ref(cc,ac + 3*ido);
-        ti4 = ref(cc,ac + ido) - ref(cc,ac + 2*ido);
-        ti3 = ref(cc,ac + ido) + ref(cc,ac + 2*ido);
-        tr5 = ref(cc,ac - 1) - ref(cc,ac + 3*ido - 1);
-        tr2 = ref(cc,ac - 1) + ref(cc,ac + 3*ido - 1);
-        tr4 = ref(cc,ac + ido - 1) - ref(cc,ac + 2*ido - 1);
-        tr3 = ref(cc,ac + ido - 1) + ref(cc,ac + 2*ido - 1);
-        ah = (k - 1)*ido;
-        ch[ah] = ref(cc,ac - ido - 1) + tr2 + tr3;
-        ch[ah + 1] = ref(cc,ac - ido) + ti2 + ti3;
-        cr2 = ref(cc,ac - ido - 1) + tr11*tr2 + tr12*tr3;
-        ci2 = ref(cc,ac - ido) + tr11*ti2 + tr12*ti3;
-        cr3 = ref(cc,ac - ido - 1) + tr12*tr2 + tr11*tr3;
-        ci3 = ref(cc,ac - ido) + tr12*ti2 + tr11*ti3;
-        cr5 = isign*(ti11*tr5 + ti12*tr4);
-        ci5 = isign*(ti11*ti5 + ti12*ti4);
-        cr4 = isign*(ti12*tr5 - ti11*tr4);
-        ci4 = isign*(ti12*ti5 - ti11*ti4);
-        ch[ah + l1*ido] = cr2 - ci5;
-        ch[ah + 4*l1*ido] = cr2 + ci5;
-        ch[ah + l1*ido + 1] = ci2 + cr5;
-        ch[ah + 2*l1*ido + 1] = ci3 + cr4;
-        ch[ah + 2*l1*ido] = cr3 - ci4;
-        ch[ah + 3*l1*ido] = cr3 + ci4;
-        ch[ah + 3*l1*ido + 1] = ci3 - cr4;
-        ch[ah + 4*l1*ido + 1] = ci2 - cr5;
-      }
-    } else {
-      for (k=1; k<=l1; k++) {
-        for (i=0; i<ido-1; i+=2) {
-          ac = i + 1 + (k*5 - 4)*ido;
-          ti5 = ref(cc,ac) - ref(cc,ac + 3*ido);
-          ti2 = ref(cc,ac) + ref(cc,ac + 3*ido);
-          ti4 = ref(cc,ac + ido) - ref(cc,ac + 2*ido);
-          ti3 = ref(cc,ac + ido) + ref(cc,ac + 2*ido);
-          tr5 = ref(cc,ac - 1) - ref(cc,ac + 3*ido - 1);
-          tr2 = ref(cc,ac - 1) + ref(cc,ac + 3*ido - 1);
-          tr4 = ref(cc,ac + ido - 1) - ref(cc,ac + 2*ido - 1);
-          tr3 = ref(cc,ac + ido - 1) + ref(cc,ac + 2*ido - 1);
-          ah = i + (k - 1)*ido;
-          ch[ah] = ref(cc,ac - ido - 1) + tr2 + tr3;
-          ch[ah + 1] = ref(cc,ac - ido) + ti2 + ti3;
-          cr2 = ref(cc,ac - ido - 1) + tr11*tr2 + tr12*tr3;
-
-          ci2 = ref(cc,ac - ido) + tr11*ti2 + tr12*ti3;
-          cr3 = ref(cc,ac - ido - 1) + tr12*tr2 + tr11*tr3;
-
-          ci3 = ref(cc,ac - ido) + tr12*ti2 + tr11*ti3;
-          cr5 = isign*(ti11*tr5 + ti12*tr4);
-          ci5 = isign*(ti11*ti5 + ti12*ti4);
-          cr4 = isign*(ti12*tr5 - ti11*tr4);
-          ci4 = isign*(ti12*ti5 - ti11*ti4);
-          dr3 = cr3 - ci4;
-          dr4 = cr3 + ci4;
-          di3 = ci3 + cr4;
-          di4 = ci3 - cr4;
-          dr5 = cr2 + ci5;
-          dr2 = cr2 - ci5;
-          di5 = ci2 - cr5;
-          di2 = ci2 + cr5;
-          ch[ah + l1*ido] = wa1[i]*dr2 - isign*wa1[i+1]*di2;
-          ch[ah + l1*ido + 1] = wa1[i]*di2 + isign*wa1[i+1]*dr2;
-          ch[ah + 2*l1*ido] = wa2[i]*dr3 - isign*wa2[i+1]*di3;
-          ch[ah + 2*l1*ido + 1] = wa2[i]*di3 + isign*wa2[i+1]*dr3;
-          ch[ah + 3*l1*ido] = wa3[i]*dr4 - isign*wa3[i+1]*di4;
-          ch[ah + 3*l1*ido + 1] = wa3[i]*di4 + isign*wa3[i+1]*dr4;
-          ch[ah + 4*l1*ido] = wa4[i]*dr5 - isign*wa4[i+1]*di5;
-          ch[ah + 4*l1*ido + 1] = wa4[i]*di5 + isign*wa4[i+1]*dr5;
-        }
-      }
-    }
-  } /* passf5 */
-
-
-static void passf(int *nac, int ido, int ip, int l1, int idl1,
-      Treal cc[], Treal ch[],
-      const Treal wa[], int isign)
-  /* isign is -1 for forward transform and +1 for backward transform */
-  {
-    int idij, idlj, idot, ipph, i, j, k, l, jc, lc, ik, idj, idl, inc,idp;
-    Treal wai, war;
-
-    idot = ido / 2;
-    /* nt = ip*idl1;*/
-    ipph = (ip + 1) / 2;
-    idp = ip*ido;
-    if (ido >= l1) {
-      for (j=1; j<ipph; j++) {
-        jc = ip - j;
-        for (k=0; k<l1; k++) {
-          for (i=0; i<ido; i++) {
-            ch[i + (k + j*l1)*ido] =
-                ref(cc,i + (j + k*ip)*ido) + ref(cc,i + (jc + k*ip)*ido);
-            ch[i + (k + jc*l1)*ido] =
-                ref(cc,i + (j + k*ip)*ido) - ref(cc,i + (jc + k*ip)*ido);
-          }
-        }
-      }
-      for (k=0; k<l1; k++)
-        for (i=0; i<ido; i++)
-          ch[i + k*ido] = ref(cc,i + k*ip*ido);
-    } else {
-      for (j=1; j<ipph; j++) {
-        jc = ip - j;
-        for (i=0; i<ido; i++) {
-          for (k=0; k<l1; k++) {
-            ch[i + (k + j*l1)*ido] = ref(cc,i + (j + k*ip)*ido) + ref(cc,i + (jc + k*
-                ip)*ido);
-            ch[i + (k + jc*l1)*ido] = ref(cc,i + (j + k*ip)*ido) - ref(cc,i + (jc + k*
-                ip)*ido);
-          }
-        }
-      }
-      for (i=0; i<ido; i++)
-        for (k=0; k<l1; k++)
-          ch[i + k*ido] = ref(cc,i + k*ip*ido);
-    }
-
-    idl = 2 - ido;
-    inc = 0;
-    for (l=1; l<ipph; l++) {
-      lc = ip - l;
-      idl += ido;
-      for (ik=0; ik<idl1; ik++) {
-        cc[ik + l*idl1] = ch[ik] + wa[idl - 2]*ch[ik + idl1];
-        cc[ik + lc*idl1] = isign*wa[idl-1]*ch[ik + (ip-1)*idl1];
-      }
-      idlj = idl;
-      inc += ido;
-      for (j=2; j<ipph; j++) {
-        jc = ip - j;
-        idlj += inc;
-        if (idlj > idp) idlj -= idp;
-        war = wa[idlj - 2];
-        wai = wa[idlj-1];
-        for (ik=0; ik<idl1; ik++) {
-          cc[ik + l*idl1] += war*ch[ik + j*idl1];
-          cc[ik + lc*idl1] += isign*wai*ch[ik + jc*idl1];
-        }
-      }
-    }
-    for (j=1; j<ipph; j++)
-      for (ik=0; ik<idl1; ik++)
-        ch[ik] += ch[ik + j*idl1];
-    for (j=1; j<ipph; j++) {
-      jc = ip - j;
-      for (ik=1; ik<idl1; ik+=2) {
-        ch[ik - 1 + j*idl1] = cc[ik - 1 + j*idl1] - cc[ik + jc*idl1];
-        ch[ik - 1 + jc*idl1] = cc[ik - 1 + j*idl1] + cc[ik + jc*idl1];
-        ch[ik + j*idl1] = cc[ik + j*idl1] + cc[ik - 1 + jc*idl1];
-        ch[ik + jc*idl1] = cc[ik + j*idl1] - cc[ik - 1 + jc*idl1];
-      }
-    }
-    *nac = 1;
-    if (ido == 2) return;
-    *nac = 0;
-    for (ik=0; ik<idl1; ik++)
-      cc[ik] = ch[ik];
-    for (j=1; j<ip; j++) {
-      for (k=0; k<l1; k++) {
-        cc[(k + j*l1)*ido + 0] = ch[(k + j*l1)*ido + 0];
-        cc[(k + j*l1)*ido + 1] = ch[(k + j*l1)*ido + 1];
-      }
-    }
-    if (idot <= l1) {
-      idij = 0;
-      for (j=1; j<ip; j++) {
-        idij += 2;
-        for (i=3; i<ido; i+=2) {
-          idij += 2;
-          for (k=0; k<l1; k++) {
-            cc[i - 1 + (k + j*l1)*ido] =
-                wa[idij - 2]*ch[i - 1 + (k + j*l1)*ido] -
-                isign*wa[idij-1]*ch[i + (k + j*l1)*ido];
-            cc[i + (k + j*l1)*ido] =
-                wa[idij - 2]*ch[i + (k + j*l1)*ido] +
-                isign*wa[idij-1]*ch[i - 1 + (k + j*l1)*ido];
-          }
-        }
-      }
-    } else {
-      idj = 2 - ido;
-      for (j=1; j<ip; j++) {
-        idj += ido;
-        for (k = 0; k < l1; k++) {
-          idij = idj;
-          for (i=3; i<ido; i+=2) {
-            idij += 2;
-            cc[i - 1 + (k + j*l1)*ido] =
-                wa[idij - 2]*ch[i - 1 + (k + j*l1)*ido] -
-                isign*wa[idij-1]*ch[i + (k + j*l1)*ido];
-            cc[i + (k + j*l1)*ido] =
-                wa[idij - 2]*ch[i + (k + j*l1)*ido] +
-                isign*wa[idij-1]*ch[i - 1 + (k + j*l1)*ido];
-          }
-        }
-      }
-    }
-  } /* passf */
-
-
-  /* ----------------------------------------------------------------------
-radf2,radb2, radf3,radb3, radf4,radb4, radf5,radb5, radfg,radbg.
-Treal FFT passes fwd and bwd.
----------------------------------------------------------------------- */
-
-static void radf2(int ido, int l1, const Treal cc[], Treal ch[], const Treal wa1[])
-  {
-    int i, k, ic;
-    Treal ti2, tr2;
-    for (k=0; k<l1; k++) {
-      ch[2*k*ido] =
-          ref(cc,k*ido) + ref(cc,(k + l1)*ido);
-      ch[(2*k+1)*ido + ido-1] =
-          ref(cc,k*ido) - ref(cc,(k + l1)*ido);
-    }
-    if (ido < 2) return;
-    if (ido != 2) {
-      for (k=0; k<l1; k++) {
-        for (i=2; i<ido; i+=2) {
-          ic = ido - i;
-          tr2 = wa1[i - 2]*ref(cc, i-1 + (k + l1)*ido) + wa1[i - 1]*ref(cc, i + (k + l1)*ido);
-          ti2 = wa1[i - 2]*ref(cc, i + (k + l1)*ido) - wa1[i - 1]*ref(cc, i-1 + (k + l1)*ido);
-          ch[i + 2*k*ido] = ref(cc,i + k*ido) + ti2;
-          ch[ic + (2*k+1)*ido] = ti2 - ref(cc,i + k*ido);
-          ch[i - 1 + 2*k*ido] = ref(cc,i - 1 + k*ido) + tr2;
-          ch[ic - 1 + (2*k+1)*ido] = ref(cc,i - 1 + k*ido) - tr2;
-        }
-      }
-      if (ido % 2 == 1) return;
-    }
-    for (k=0; k<l1; k++) {
-      ch[(2*k+1)*ido] = -ref(cc,ido-1 + (k + l1)*ido);
-      ch[ido-1 + 2*k*ido] = ref(cc,ido-1 + k*ido);
-    }
-  } /* radf2 */
-
-
-static void radb2(int ido, int l1, const Treal cc[], Treal ch[], const Treal wa1[])
-  {
-    int i, k, ic;
-    Treal ti2, tr2;
-    for (k=0; k<l1; k++) {
-      ch[k*ido] =
-          ref(cc,2*k*ido) + ref(cc,ido-1 + (2*k+1)*ido);
-      ch[(k + l1)*ido] =
-          ref(cc,2*k*ido) - ref(cc,ido-1 + (2*k+1)*ido);
-    }
-    if (ido < 2) return;
-    if (ido != 2) {
-      for (k = 0; k < l1; ++k) {
-        for (i = 2; i < ido; i += 2) {
-          ic = ido - i;
-          ch[i-1 + k*ido] =
-              ref(cc,i-1 + 2*k*ido) + ref(cc,ic-1 + (2*k+1)*ido);
-          tr2 = ref(cc,i-1 + 2*k*ido) - ref(cc,ic-1 + (2*k+1)*ido);
-          ch[i + k*ido] =
-              ref(cc,i + 2*k*ido) - ref(cc,ic + (2*k+1)*ido);
-          ti2 = ref(cc,i + (2*k)*ido) + ref(cc,ic + (2*k+1)*ido);
-          ch[i-1 + (k + l1)*ido] =
-              wa1[i - 2]*tr2 - wa1[i - 1]*ti2;
-          ch[i + (k + l1)*ido] =
-              wa1[i - 2]*ti2 + wa1[i - 1]*tr2;
-        }
-      }
-      if (ido % 2 == 1) return;
-    }
-    for (k = 0; k < l1; k++) {
-      ch[ido-1 + k*ido] = 2*ref(cc,ido-1 + 2*k*ido);
-      ch[ido-1 + (k + l1)*ido] = -2*ref(cc,(2*k+1)*ido);
-    }
-  } /* radb2 */
-
-
-static void radf3(int ido, int l1, const Treal cc[], Treal ch[],
-      const Treal wa1[], const Treal wa2[])
-  {
-    static const Treal taur = -0.5;
-    static const Treal taui = 0.86602540378443864676;
-    int i, k, ic;
-    Treal ci2, di2, di3, cr2, dr2, dr3, ti2, ti3, tr2, tr3;
-    for (k=0; k<l1; k++) {
-      cr2 = ref(cc,(k + l1)*ido) + ref(cc,(k + 2*l1)*ido);
-      ch[3*k*ido] = ref(cc,k*ido) + cr2;
-      ch[(3*k+2)*ido] = taui*(ref(cc,(k + l1*2)*ido) - ref(cc,(k + l1)*ido));
-      ch[ido-1 + (3*k + 1)*ido] = ref(cc,k*ido) + taur*cr2;
-    }
-    if (ido == 1) return;
-    for (k=0; k<l1; k++) {
-      for (i=2; i<ido; i+=2) {
-        ic = ido - i;
-        dr2 = wa1[i - 2]*ref(cc,i - 1 + (k + l1)*ido) +
-            wa1[i - 1]*ref(cc,i + (k + l1)*ido);
-        di2 = wa1[i - 2]*ref(cc,i + (k + l1)*ido) - wa1[i - 1]*ref(cc,i - 1 + (k + l1)*ido);
-        dr3 = wa2[i - 2]*ref(cc,i - 1 + (k + l1*2)*ido) + wa2[i - 1]*ref(cc,i + (k + l1*2)*ido);
-        di3 = wa2[i - 2]*ref(cc,i + (k + l1*2)*ido) - wa2[i - 1]*ref(cc,i - 1 + (k + l1*2)*ido);
-        cr2 = dr2 + dr3;
-        ci2 = di2 + di3;
-        ch[i - 1 + 3*k*ido] = ref(cc,i - 1 + k*ido) + cr2;
-        ch[i + 3*k*ido] = ref(cc,i + k*ido) + ci2;
-        tr2 = ref(cc,i - 1 + k*ido) + taur*cr2;
-        ti2 = ref(cc,i + k*ido) + taur*ci2;
-        tr3 = taui*(di2 - di3);
-        ti3 = taui*(dr3 - dr2);
-        ch[i - 1 + (3*k + 2)*ido] = tr2 + tr3;
-        ch[ic - 1 + (3*k + 1)*ido] = tr2 - tr3;
-        ch[i + (3*k + 2)*ido] = ti2 + ti3;
-        ch[ic + (3*k + 1)*ido] = ti3 - ti2;
-      }
-    }
-  } /* radf3 */
-
-
-static void radb3(int ido, int l1, const Treal cc[], Treal ch[],
-      const Treal wa1[], const Treal wa2[])
-  {
-    static const Treal taur = -0.5;
-    static const Treal taui = 0.86602540378443864676;
-    int i, k, ic;
-    Treal ci2, ci3, di2, di3, cr2, cr3, dr2, dr3, ti2, tr2;
-    for (k=0; k<l1; k++) {
-      tr2 = 2*ref(cc,ido-1 + (3*k + 1)*ido);
-      cr2 = ref(cc,3*k*ido) + taur*tr2;
-      ch[k*ido] = ref(cc,3*k*ido) + tr2;
-      ci3 = 2*taui*ref(cc,(3*k + 2)*ido);
-      ch[(k + l1)*ido] = cr2 - ci3;
-      ch[(k + 2*l1)*ido] = cr2 + ci3;
-    }
-    if (ido == 1) return;
-    for (k=0; k<l1; k++) {
-      for (i=2; i<ido; i+=2) {
-        ic = ido - i;
-        tr2 = ref(cc,i - 1 + (3*k + 2)*ido) + ref(cc,ic - 1 + (3*k + 1)*ido);
-        cr2 = ref(cc,i - 1 + 3*k*ido) + taur*tr2;
-        ch[i - 1 + k*ido] = ref(cc,i - 1 + 3*k*ido) + tr2;
-        ti2 = ref(cc,i + (3*k + 2)*ido) - ref(cc,ic + (3*k + 1)*ido);
-        ci2 = ref(cc,i + 3*k*ido) + taur*ti2;
-        ch[i + k*ido] = ref(cc,i + 3*k*ido) + ti2;
-        cr3 = taui*(ref(cc,i - 1 + (3*k + 2)*ido) - ref(cc,ic - 1 + (3*k + 1)*ido));
-        ci3 = taui*(ref(cc,i + (3*k + 2)*ido) + ref(cc,ic + (3*k + 1)*ido));
-        dr2 = cr2 - ci3;
-        dr3 = cr2 + ci3;
-        di2 = ci2 + cr3;
-        di3 = ci2 - cr3;
-        ch[i - 1 + (k + l1)*ido] = wa1[i - 2]*dr2 - wa1[i - 1]*di2;
-        ch[i + (k + l1)*ido] = wa1[i - 2]*di2 + wa1[i - 1]*dr2;
-        ch[i - 1 + (k + 2*l1)*ido] = wa2[i - 2]*dr3 - wa2[i - 1]*di3;
-        ch[i + (k + 2*l1)*ido] = wa2[i - 2]*di3 + wa2[i - 1]*dr3;
-      }
-    }
-  } /* radb3 */
-
-
-static void radf4(int ido, int l1, const Treal cc[], Treal ch[],
-      const Treal wa1[], const Treal wa2[], const Treal wa3[])
-  {
-    static const Treal hsqt2 = 0.70710678118654752440;
-    int i, k, ic;
-    Treal ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
-    for (k=0; k<l1; k++) {
-      tr1 = ref(cc,(k + l1)*ido) + ref(cc,(k + 3*l1)*ido);
-      tr2 = ref(cc,k*ido) + ref(cc,(k + 2*l1)*ido);
-      ch[4*k*ido] = tr1 + tr2;
-      ch[ido-1 + (4*k + 3)*ido] = tr2 - tr1;
-      ch[ido-1 + (4*k + 1)*ido] = ref(cc,k*ido) - ref(cc,(k + 2*l1)*ido);
-      ch[(4*k + 2)*ido] = ref(cc,(k + 3*l1)*ido) - ref(cc,(k + l1)*ido);
-    }
-    if (ido < 2) return;
-    if (ido != 2) {
-      for (k=0; k<l1; k++) {
-        for (i=2; i<ido; i += 2) {
-          ic = ido - i;
-          cr2 = wa1[i - 2]*ref(cc,i - 1 + (k + l1)*ido) + wa1[i - 1]*ref(cc,i + (k + l1)*ido);
-          ci2 = wa1[i - 2]*ref(cc,i + (k + l1)*ido) - wa1[i - 1]*ref(cc,i - 1 + (k + l1)*ido);
-          cr3 = wa2[i - 2]*ref(cc,i - 1 + (k + 2*l1)*ido) + wa2[i - 1]*ref(cc,i + (k + 2*l1)*
-              ido);
-          ci3 = wa2[i - 2]*ref(cc,i + (k + 2*l1)*ido) - wa2[i - 1]*ref(cc,i - 1 + (k + 2*l1)*
-              ido);
-          cr4 = wa3[i - 2]*ref(cc,i - 1 + (k + 3*l1)*ido) + wa3[i - 1]*ref(cc,i + (k + 3*l1)*
-              ido);
-          ci4 = wa3[i - 2]*ref(cc,i + (k + 3*l1)*ido) - wa3[i - 1]*ref(cc,i - 1 + (k + 3*l1)*
-              ido);
-          tr1 = cr2 + cr4;
-          tr4 = cr4 - cr2;
-          ti1 = ci2 + ci4;
-          ti4 = ci2 - ci4;
-          ti2 = ref(cc,i + k*ido) + ci3;
-          ti3 = ref(cc,i + k*ido) - ci3;
-          tr2 = ref(cc,i - 1 + k*ido) + cr3;
-          tr3 = ref(cc,i - 1 + k*ido) - cr3;
-          ch[i - 1 + 4*k*ido] = tr1 + tr2;
-          ch[ic - 1 + (4*k + 3)*ido] = tr2 - tr1;
-          ch[i + 4*k*ido] = ti1 + ti2;
-          ch[ic + (4*k + 3)*ido] = ti1 - ti2;
-          ch[i - 1 + (4*k + 2)*ido] = ti4 + tr3;
-          ch[ic - 1 + (4*k + 1)*ido] = tr3 - ti4;
-          ch[i + (4*k + 2)*ido] = tr4 + ti3;
-          ch[ic + (4*k + 1)*ido] = tr4 - ti3;
-        }
-      }
-      if (ido % 2 == 1) return;
-    }
-    for (k=0; k<l1; k++) {
-      ti1 = -hsqt2*(ref(cc,ido-1 + (k + l1)*ido) + ref(cc,ido-1 + (k + 3*l1)*ido));
-      tr1 = hsqt2*(ref(cc,ido-1 + (k + l1)*ido) - ref(cc,ido-1 + (k + 3*l1)*ido));
-      ch[ido-1 + 4*k*ido] = tr1 + ref(cc,ido-1 + k*ido);
-      ch[ido-1 + (4*k + 2)*ido] = ref(cc,ido-1 + k*ido) - tr1;
-      ch[(4*k + 1)*ido] = ti1 - ref(cc,ido-1 + (k + 2*l1)*ido);
-      ch[(4*k + 3)*ido] = ti1 + ref(cc,ido-1 + (k + 2*l1)*ido);
-    }
-  } /* radf4 */
-
-
-static void radb4(int ido, int l1, const Treal cc[], Treal ch[],
-      const Treal wa1[], const Treal wa2[], const Treal wa3[])
-  {
-    static const Treal sqrt2 = 1.41421356237309504880;
-    int i, k, ic;
-    Treal ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
-    for (k = 0; k < l1; k++) {
-      tr1 = ref(cc,4*k*ido) - ref(cc,ido-1 + (4*k + 3)*ido);
-      tr2 = ref(cc,4*k*ido) + ref(cc,ido-1 + (4*k + 3)*ido);
-      tr3 = ref(cc,ido-1 + (4*k + 1)*ido) + ref(cc,ido-1 + (4*k + 1)*ido);
-      tr4 = ref(cc,(4*k + 2)*ido) + ref(cc,(4*k + 2)*ido);
-      ch[k*ido] = tr2 + tr3;
-      ch[(k + l1)*ido] = tr1 - tr4;
-      ch[(k + 2*l1)*ido] = tr2 - tr3;
-      ch[(k + 3*l1)*ido] = tr1 + tr4;
-    }
-    if (ido < 2) return;
-    if (ido != 2) {
-      for (k = 0; k < l1; ++k) {
-        for (i = 2; i < ido; i += 2) {
-          ic = ido - i;
-          ti1 = ref(cc,i + 4*k*ido) + ref(cc,ic + (4*k + 3)*ido);
-          ti2 = ref(cc,i + 4*k*ido) - ref(cc,ic + (4*k + 3)*ido);
-          ti3 = ref(cc,i + (4*k + 2)*ido) - ref(cc,ic + (4*k + 1)*ido);
-          tr4 = ref(cc,i + (4*k + 2)*ido) + ref(cc,ic + (4*k + 1)*ido);
-          tr1 = ref(cc,i - 1 + 4*k*ido) - ref(cc,ic - 1 + (4*k + 3)*ido);
-          tr2 = ref(cc,i - 1 + 4*k*ido) + ref(cc,ic - 1 + (4*k + 3)*ido);
-          ti4 = ref(cc,i - 1 + (4*k + 2)*ido) - ref(cc,ic - 1 + (4*k + 1)*ido);
-          tr3 = ref(cc,i - 1 + (4*k + 2)*ido) + ref(cc,ic - 1 + (4*k + 1)*ido);
-          ch[i - 1 + k*ido] = tr2 + tr3;
-          cr3 = tr2 - tr3;
-          ch[i + k*ido] = ti2 + ti3;
-          ci3 = ti2 - ti3;
-          cr2 = tr1 - tr4;
-          cr4 = tr1 + tr4;
-          ci2 = ti1 + ti4;
-          ci4 = ti1 - ti4;
-          ch[i - 1 + (k + l1)*ido] = wa1[i - 2]*cr2 - wa1[i - 1]*ci2;
-          ch[i + (k + l1)*ido] = wa1[i - 2]*ci2 + wa1[i - 1]*cr2;
-          ch[i - 1 + (k + 2*l1)*ido] = wa2[i - 2]*cr3 - wa2[i - 1]*ci3;
-          ch[i + (k + 2*l1)*ido] = wa2[i - 2]*ci3 + wa2[i - 1]*cr3;
-          ch[i - 1 + (k + 3*l1)*ido] = wa3[i - 2]*cr4 - wa3[i - 1]*ci4;
-          ch[i + (k + 3*l1)*ido] = wa3[i - 2]*ci4 + wa3[i - 1]*cr4;
-        }
-      }
-      if (ido % 2 == 1) return;
-    }
-    for (k = 0; k < l1; k++) {
-      ti1 = ref(cc,(4*k + 1)*ido) + ref(cc,(4*k + 3)*ido);
-      ti2 = ref(cc,(4*k + 3)*ido) - ref(cc,(4*k + 1)*ido);
-      tr1 = ref(cc,ido-1 + 4*k*ido) - ref(cc,ido-1 + (4*k + 2)*ido);
-      tr2 = ref(cc,ido-1 + 4*k*ido) + ref(cc,ido-1 + (4*k + 2)*ido);
-      ch[ido-1 + k*ido] = tr2 + tr2;
-      ch[ido-1 + (k + l1)*ido] = sqrt2*(tr1 - ti1);
-      ch[ido-1 + (k + 2*l1)*ido] = ti2 + ti2;
-      ch[ido-1 + (k + 3*l1)*ido] = -sqrt2*(tr1 + ti1);
-    }
-  } /* radb4 */
-
-
-static void radf5(int ido, int l1, const Treal cc[], Treal ch[],
-      const Treal wa1[], const Treal wa2[], const Treal wa3[], const Treal wa4[])
-  {
-    static const Treal tr11 = 0.3090169943749474241;
-    static const Treal ti11 = 0.95105651629515357212;
-    static const Treal tr12 = -0.8090169943749474241;
-    static const Treal ti12 = 0.58778525229247312917;
-    int i, k, ic;
-    Treal ci2, di2, ci4, ci5, di3, di4, di5, ci3, cr2, cr3, dr2, dr3, dr4, dr5,
-        cr5, cr4, ti2, ti3, ti5, ti4, tr2, tr3, tr4, tr5;
-    for (k = 0; k < l1; k++) {
-      cr2 = ref(cc,(k + 4*l1)*ido) + ref(cc,(k + l1)*ido);
-      ci5 = ref(cc,(k + 4*l1)*ido) - ref(cc,(k + l1)*ido);
-      cr3 = ref(cc,(k + 3*l1)*ido) + ref(cc,(k + 2*l1)*ido);
-      ci4 = ref(cc,(k + 3*l1)*ido) - ref(cc,(k + 2*l1)*ido);
-      ch[5*k*ido] = ref(cc,k*ido) + cr2 + cr3;
-      ch[ido-1 + (5*k + 1)*ido] = ref(cc,k*ido) + tr11*cr2 + tr12*cr3;
-      ch[(5*k + 2)*ido] = ti11*ci5 + ti12*ci4;
-      ch[ido-1 + (5*k + 3)*ido] = ref(cc,k*ido) + tr12*cr2 + tr11*cr3;
-      ch[(5*k + 4)*ido] = ti12*ci5 - ti11*ci4;
-    }
-    if (ido == 1) return;
-    for (k = 0; k < l1; ++k) {
-      for (i = 2; i < ido; i += 2) {
-        ic = ido - i;
-        dr2 = wa1[i - 2]*ref(cc,i - 1 + (k + l1)*ido) + wa1[i - 1]*ref(cc,i + (k + l1)*ido);
-        di2 = wa1[i - 2]*ref(cc,i + (k + l1)*ido) - wa1[i - 1]*ref(cc,i - 1 + (k + l1)*ido);
-        dr3 = wa2[i - 2]*ref(cc,i - 1 + (k + 2*l1)*ido) + wa2[i - 1]*ref(cc,i + (k + 2*l1)*ido);
-        di3 = wa2[i - 2]*ref(cc,i + (k + 2*l1)*ido) - wa2[i - 1]*ref(cc,i - 1 + (k + 2*l1)*ido);
-        dr4 = wa3[i - 2]*ref(cc,i - 1 + (k + 3*l1)*ido) + wa3[i - 1]*ref(cc,i + (k + 3*l1)*ido);
-        di4 = wa3[i - 2]*ref(cc,i + (k + 3*l1)*ido) - wa3[i - 1]*ref(cc,i - 1 + (k + 3*l1)*ido);
-        dr5 = wa4[i - 2]*ref(cc,i - 1 + (k + 4*l1)*ido) + wa4[i - 1]*ref(cc,i + (k + 4*l1)*ido);
-        di5 = wa4[i - 2]*ref(cc,i + (k + 4*l1)*ido) - wa4[i - 1]*ref(cc,i - 1 + (k + 4*l1)*ido);
-        cr2 = dr2 + dr5;
-        ci5 = dr5 - dr2;
-        cr5 = di2 - di5;
-        ci2 = di2 + di5;
-        cr3 = dr3 + dr4;
-        ci4 = dr4 - dr3;
-        cr4 = di3 - di4;
-        ci3 = di3 + di4;
-        ch[i - 1 + 5*k*ido] = ref(cc,i - 1 + k*ido) + cr2 + cr3;
-        ch[i + 5*k*ido] = ref(cc,i + k*ido) + ci2 + ci3;
-        tr2 = ref(cc,i - 1 + k*ido) + tr11*cr2 + tr12*cr3;
-        ti2 = ref(cc,i + k*ido) + tr11*ci2 + tr12*ci3;
-        tr3 = ref(cc,i - 1 + k*ido) + tr12*cr2 + tr11*cr3;
-        ti3 = ref(cc,i + k*ido) + tr12*ci2 + tr11*ci3;
-        tr5 = ti11*cr5 + ti12*cr4;
-        ti5 = ti11*ci5 + ti12*ci4;
-        tr4 = ti12*cr5 - ti11*cr4;
-        ti4 = ti12*ci5 - ti11*ci4;
-        ch[i - 1 + (5*k + 2)*ido] = tr2 + tr5;
-        ch[ic - 1 + (5*k + 1)*ido] = tr2 - tr5;
-        ch[i + (5*k + 2)*ido] = ti2 + ti5;
-        ch[ic + (5*k + 1)*ido] = ti5 - ti2;
-        ch[i - 1 + (5*k + 4)*ido] = tr3 + tr4;
-        ch[ic - 1 + (5*k + 3)*ido] = tr3 - tr4;
-        ch[i + (5*k + 4)*ido] = ti3 + ti4;
-        ch[ic + (5*k + 3)*ido] = ti4 - ti3;
-      }
-    }
-  } /* radf5 */
-
-
-static void radb5(int ido, int l1, const Treal cc[], Treal ch[],
-      const Treal wa1[], const Treal wa2[], const Treal wa3[], const Treal wa4[])
-  {
-    static const Treal tr11 = 0.3090169943749474241;
-    static const Treal ti11 = 0.95105651629515357212;
-    static const Treal tr12 = -0.8090169943749474241;
-    static const Treal ti12 = 0.58778525229247312917;
-    int i, k, ic;
-    Treal ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3,
-        ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;
-    for (k = 0; k < l1; k++) {
-      ti5 = 2*ref(cc,(5*k + 2)*ido);
-      ti4 = 2*ref(cc,(5*k + 4)*ido);
-      tr2 = 2*ref(cc,ido-1 + (5*k + 1)*ido);
-      tr3 = 2*ref(cc,ido-1 + (5*k + 3)*ido);
-      ch[k*ido] = ref(cc,5*k*ido) + tr2 + tr3;
-      cr2 = ref(cc,5*k*ido) + tr11*tr2 + tr12*tr3;
-      cr3 = ref(cc,5*k*ido) + tr12*tr2 + tr11*tr3;
-      ci5 = ti11*ti5 + ti12*ti4;
-      ci4 = ti12*ti5 - ti11*ti4;
-      ch[(k + l1)*ido] = cr2 - ci5;
-      ch[(k + 2*l1)*ido] = cr3 - ci4;
-      ch[(k + 3*l1)*ido] = cr3 + ci4;
-      ch[(k + 4*l1)*ido] = cr2 + ci5;
-    }
-    if (ido == 1) return;
-    for (k = 0; k < l1; ++k) {
-      for (i = 2; i < ido; i += 2) {
-        ic = ido - i;
-        ti5 = ref(cc,i + (5*k + 2)*ido) + ref(cc,ic + (5*k + 1)*ido);
-        ti2 = ref(cc,i + (5*k + 2)*ido) - ref(cc,ic + (5*k + 1)*ido);
-        ti4 = ref(cc,i + (5*k + 4)*ido) + ref(cc,ic + (5*k + 3)*ido);
-        ti3 = ref(cc,i + (5*k + 4)*ido) - ref(cc,ic + (5*k + 3)*ido);
-        tr5 = ref(cc,i - 1 + (5*k + 2)*ido) - ref(cc,ic - 1 + (5*k + 1)*ido);
-        tr2 = ref(cc,i - 1 + (5*k + 2)*ido) + ref(cc,ic - 1 + (5*k + 1)*ido);
-        tr4 = ref(cc,i - 1 + (5*k + 4)*ido) - ref(cc,ic - 1 + (5*k + 3)*ido);
-        tr3 = ref(cc,i - 1 + (5*k + 4)*ido) + ref(cc,ic - 1 + (5*k + 3)*ido);
-        ch[i - 1 + k*ido] = ref(cc,i - 1 + 5*k*ido) + tr2 + tr3;
-        ch[i + k*ido] = ref(cc,i + 5*k*ido) + ti2 + ti3;
-        cr2 = ref(cc,i - 1 + 5*k*ido) + tr11*tr2 + tr12*tr3;
-
-        ci2 = ref(cc,i + 5*k*ido) + tr11*ti2 + tr12*ti3;
-        cr3 = ref(cc,i - 1 + 5*k*ido) + tr12*tr2 + tr11*tr3;
-
-        ci3 = ref(cc,i + 5*k*ido) + tr12*ti2 + tr11*ti3;
-        cr5 = ti11*tr5 + ti12*tr4;
-        ci5 = ti11*ti5 + ti12*ti4;
-        cr4 = ti12*tr5 - ti11*tr4;
-        ci4 = ti12*ti5 - ti11*ti4;
-        dr3 = cr3 - ci4;
-        dr4 = cr3 + ci4;
-        di3 = ci3 + cr4;
-        di4 = ci3 - cr4;
-        dr5 = cr2 + ci5;
-        dr2 = cr2 - ci5;
-        di5 = ci2 - cr5;
-        di2 = ci2 + cr5;
-        ch[i - 1 + (k + l1)*ido] = wa1[i - 2]*dr2 - wa1[i - 1]*di2;
-        ch[i + (k + l1)*ido] = wa1[i - 2]*di2 + wa1[i - 1]*dr2;
-        ch[i - 1 + (k + 2*l1)*ido] = wa2[i - 2]*dr3 - wa2[i - 1]*di3;
-        ch[i + (k + 2*l1)*ido] = wa2[i - 2]*di3 + wa2[i - 1]*dr3;
-        ch[i - 1 + (k + 3*l1)*ido] = wa3[i - 2]*dr4 - wa3[i - 1]*di4;
-        ch[i + (k + 3*l1)*ido] = wa3[i - 2]*di4 + wa3[i - 1]*dr4;
-        ch[i - 1 + (k + 4*l1)*ido] = wa4[i - 2]*dr5 - wa4[i - 1]*di5;
-        ch[i + (k + 4*l1)*ido] = wa4[i - 2]*di5 + wa4[i - 1]*dr5;
-      }
-    }
-  } /* radb5 */
-
-
-static void radfg(int ido, int ip, int l1, int idl1,
-      Treal cc[], Treal ch[], const Treal wa[])
-  {
-    int idij, ipph, i, j, k, l, j2, ic, jc, lc, ik, is, nbd;    
-    Treal dc2, ai1, ai2, ar1, ar2, ds2, dcp, dsp, ar1h, ar2h;
-    sincos2pi(1, ip, &dsp, &dcp);
-    ipph = (ip + 1) / 2;
-    nbd = (ido - 1) / 2;
-    if (ido != 1) {
-      for (ik=0; ik<idl1; ik++) ch[ik] = cc[ik];
-      for (j=1; j<ip; j++)
-        for (k=0; k<l1; k++)
-          ch[(k + j*l1)*ido] = cc[(k + j*l1)*ido];
-      if (nbd <= l1) {
-        is = -ido;
-        for (j=1; j<ip; j++) {
-          is += ido;
-          idij = is-1;
-          for (i=2; i<ido; i+=2) {
-            idij += 2;
-            for (k=0; k<l1; k++) {
-              ch[i - 1 + (k + j*l1)*ido] =
-                  wa[idij - 1]*cc[i - 1 + (k + j*l1)*ido] + wa[idij]*cc[i + (k + j*l1)*ido];
-              ch[i + (k + j*l1)*ido] =
-                  wa[idij - 1]*cc[i + (k + j*l1)*ido] - wa[idij]*cc[i - 1 + (k + j*l1)*ido];
-            }
-          }
-        }
-      } else {
-        is = -ido;
-        for (j=1; j<ip; j++) {
-          is += ido;
-          for (k=0; k<l1; k++) {
-            idij = is-1;
-            for (i=2; i<ido; i+=2) {
-              idij += 2;
-              ch[i - 1 + (k + j*l1)*ido] =
-                  wa[idij - 1]*cc[i - 1 + (k + j*l1)*ido] + wa[idij]*cc[i + (k + j*l1)*ido];
-              ch[i + (k + j*l1)*ido] =
-                  wa[idij - 1]*cc[i + (k + j*l1)*ido] - wa[idij]*cc[i - 1 + (k + j*l1)*ido];
-            }
-          }
-        }
-      }
-      if (nbd >= l1) {
-        for (j=1; j<ipph; j++) {
-          jc = ip - j;
-          for (k=0; k<l1; k++) {
-            for (i=2; i<ido; i+=2) {
-              cc[i - 1 + (k + j*l1)*ido] = ch[i - 1 + (k + j*l1)*ido] + ch[i - 1 + (k + jc*l1)*ido];
-              cc[i - 1 + (k + jc*l1)*ido] = ch[i + (k + j*l1)*ido] - ch[i + (k + jc*l1)*ido];
-              cc[i + (k + j*l1)*ido] = ch[i + (k + j*l1)*ido] + ch[i + (k + jc*l1)*ido];
-              cc[i + (k + jc*l1)*ido] = ch[i - 1 + (k + jc*l1)*ido] - ch[i - 1 + (k + j*l1)*ido];
-            }
-          }
-        }
-      } else {
-        for (j=1; j<ipph; j++) {
-          jc = ip - j;
-          for (i=2; i<ido; i+=2) {
-            for (k=0; k<l1; k++) {
-              cc[i - 1 + (k + j*l1)*ido] =
-                  ch[i - 1 + (k + j*l1)*ido] + ch[i - 1 + (k + jc*l1)*ido];
-              cc[i - 1 + (k + jc*l1)*ido] = ch[i + (k + j*l1)*ido] - ch[i + (k + jc*l1)*ido];
-              cc[i + (k + j*l1)*ido] = ch[i + (k + j*l1)*ido] + ch[i + (k + jc*l1)*ido];
-              cc[i + (k + jc*l1)*ido] = ch[i - 1 + (k + jc*l1)*ido] - ch[i - 1 + (k + j*l1)*ido];
-            }
-          }
-        }
-      }
-    } else {  /* now ido == 1 */
-      for (ik=0; ik<idl1; ik++) cc[ik] = ch[ik];
-    }
-    for (j=1; j<ipph; j++) {
-      jc = ip - j;
-      for (k=0; k<l1; k++) {
-        cc[(k + j*l1)*ido] = ch[(k + j*l1)*ido] + ch[(k + jc*l1)*ido];
-        cc[(k + jc*l1)*ido] = ch[(k + jc*l1)*ido] - ch[(k + j*l1)*ido];
-      }
-    }
-
-    ar1 = 1;
-    ai1 = 0;    
-    for (l=1; l<ipph; l++) {
-      lc = ip - l;
-      ar1h = dcp*ar1 - dsp*ai1;
-      ai1 = dcp*ai1 + dsp*ar1;
-      ar1 = ar1h;
-      for (ik=0; ik<idl1; ik++) {
-        ch[ik + l*idl1] = cc[ik] + ar1*cc[ik + idl1];
-        ch[ik + lc*idl1] = ai1*cc[ik + (ip-1)*idl1];
-      }
-      dc2 = ar1;
-      ds2 = ai1;
-      ar2 = ar1;
-      ai2 = ai1;
-      for (j=2; j<ipph; j++) {
-        jc = ip - j;
-        ar2h = dc2*ar2 - ds2*ai2;
-        ai2 = dc2*ai2 + ds2*ar2;
-        ar2 = ar2h;
-        for (ik=0; ik<idl1; ik++) {
-          ch[ik + l*idl1] += ar2*cc[ik + j*idl1];
-          ch[ik + lc*idl1] += ai2*cc[ik + jc*idl1];
-        }
-      }
-    }
-    
-    for (j=1; j<ipph; j++)
-      for (ik=0; ik<idl1; ik++)
-        ch[ik] += cc[ik + j*idl1];
-
-    if (ido >= l1) {
-      for (k=0; k<l1; k++) {
-        for (i=0; i<ido; i++) {
-          ref(cc,i + k*ip*ido) = ch[i + k*ido];
-        }
-      }
-    } else {
-      for (i=0; i<ido; i++) {
-        for (k=0; k<l1; k++) {
-          ref(cc,i + k*ip*ido) = ch[i + k*ido];
-        }
-      }
-    }
-    for (j=1; j<ipph; j++) {
-      jc = ip - j;
-      j2 = 2*j;
-      for (k=0; k<l1; k++) {
-        ref(cc,ido-1 + (j2 - 1 + k*ip)*ido) =
-            ch[(k + j*l1)*ido];
-        ref(cc,(j2 + k*ip)*ido) =
-            ch[(k + jc*l1)*ido];
-      }
-    }
-    if (ido == 1) return;
-    if (nbd >= l1) {
-      for (j=1; j<ipph; j++) {
-        jc = ip - j;
-        j2 = 2*j;
-        for (k=0; k<l1; k++) {
-          for (i=2; i<ido; i+=2) {
-            ic = ido - i;
-            ref(cc,i - 1 + (j2 + k*ip)*ido) = ch[i - 1 + (k + j*l1)*ido] + ch[i - 1 + (k + jc*l1)*ido];
-            ref(cc,ic - 1 + (j2 - 1 + k*ip)*ido) = ch[i - 1 + (k + j*l1)*ido] - ch[i - 1 + (k + jc*l1)*ido];
-            ref(cc,i + (j2 + k*ip)*ido) = ch[i + (k + j*l1)*ido] + ch[i + (k + jc*l1)*ido];
-            ref(cc,ic + (j2 - 1 + k*ip)*ido) = ch[i + (k + jc*l1)*ido] - ch[i + (k + j*l1)*ido];
-          }
-        }
-      }
-    } else {
-      for (j=1; j<ipph; j++) {
-        jc = ip - j;
-        j2 = 2*j;
-        for (i=2; i<ido; i+=2) {
-          ic = ido - i;
-          for (k=0; k<l1; k++) {
-            ref(cc,i - 1 + (j2 + k*ip)*ido) = ch[i - 1 + (k + j*l1)*ido] + ch[i - 1 + (k + jc*l1)*ido];
-            ref(cc,ic - 1 + (j2 - 1 + k*ip)*ido) = ch[i - 1 + (k + j*l1)*ido] - ch[i - 1 + (k + jc*l1)*ido];
-            ref(cc,i + (j2 + k*ip)*ido) = ch[i + (k + j*l1)*ido] + ch[i + (k + jc*l1)*ido];
-            ref(cc,ic + (j2 - 1 + k*ip)*ido) = ch[i + (k + jc*l1)*ido] - ch[i + (k + j*l1)*ido];
-          }
-        }
-      }
-    }
-  } /* radfg */
-
-
-static void radbg(int ido, int ip, int l1, int idl1,
-      Treal cc[], Treal ch[], const Treal wa[])
-  {
-    int idij, ipph, i, j, k, l, j2, ic, jc, lc, ik, is;
-    Treal dc2, ai1, ai2, ar1, ar2, ds2;
-    int nbd;
-    Treal dcp, dsp, ar1h, ar2h;
-    sincos2pi(1, ip, &dsp, &dcp);
-    nbd = (ido - 1) / 2;
-    ipph = (ip + 1) / 2;
-    if (ido >= l1) {
-      for (k=0; k<l1; k++) {
-        for (i=0; i<ido; i++) {
-          ch[i + k*ido] = ref(cc,i + k*ip*ido);
-        }
-      }
-    } else {
-      for (i=0; i<ido; i++) {
-        for (k=0; k<l1; k++) {
-          ch[i + k*ido] = ref(cc,i + k*ip*ido);
-        }
-      }
-    }
-    for (j=1; j<ipph; j++) {
-      jc = ip - j;
-      j2 = 2*j;
-      for (k=0; k<l1; k++) {
-        ch[(k + j*l1)*ido] = ref(cc,ido-1 + (j2 - 1 + k*ip)*ido) + ref(cc,ido-1 + (j2 - 1 + k*ip)*
-            ido);
-        ch[(k + jc*l1)*ido] = ref(cc,(j2 + k*ip)*ido) + ref(cc,(j2 + k*ip)*ido);
-      }
-    }
-
-    if (ido != 1) {
-      if (nbd >= l1) {
-        for (j=1; j<ipph; j++) {
-          jc = ip - j;
-          for (k=0; k<l1; k++) {
-            for (i=2; i<ido; i+=2) {
-              ic = ido - i;
-              ch[i - 1 + (k + j*l1)*ido] = ref(cc,i - 1 + (2*j + k*ip)*ido) + ref(cc,
-                  ic - 1 + (2*j - 1 + k*ip)*ido);
-              ch[i - 1 + (k + jc*l1)*ido] = ref(cc,i - 1 + (2*j + k*ip)*ido) -
-                  ref(cc,ic - 1 + (2*j - 1 + k*ip)*ido);
-              ch[i + (k + j*l1)*ido] = ref(cc,i + (2*j + k*ip)*ido) - ref(cc,ic
-                  + (2*j - 1 + k*ip)*ido);
-              ch[i + (k + jc*l1)*ido] = ref(cc,i + (2*j + k*ip)*ido) + ref(cc,ic
-                  + (2*j - 1 + k*ip)*ido);
-            }
-          }
-        }
-      } else {
-        for (j=1; j<ipph; j++) {
-          jc = ip - j;
-          for (i=2; i<ido; i+=2) {
-            ic = ido - i;
-            for (k=0; k<l1; k++) {
-              ch[i - 1 + (k + j*l1)*ido] = ref(cc,i - 1 + (2*j + k*ip)*ido) + ref(cc,
-                  ic - 1 + (2*j - 1 + k*ip)*ido);
-              ch[i - 1 + (k + jc*l1)*ido] = ref(cc,i - 1 + (2*j + k*ip)*ido) -
-                  ref(cc,ic - 1 + (2*j - 1 + k*ip)*ido);
-              ch[i + (k + j*l1)*ido] = ref(cc,i + (2*j + k*ip)*ido) - ref(cc,ic
-                  + (2*j - 1 + k*ip)*ido);
-              ch[i + (k + jc*l1)*ido] = ref(cc,i + (2*j + k*ip)*ido) + ref(cc,ic
-                  + (2*j - 1 + k*ip)*ido);
-            }
-          }
-        }
-      }
-    }
-
-    ar1 = 1;
-    ai1 = 0;
-    for (l=1; l<ipph; l++) {
-      lc = ip - l;
-      ar1h = dcp*ar1 - dsp*ai1;
-      ai1 = dcp*ai1 + dsp*ar1;
-      ar1 = ar1h;
-      for (ik=0; ik<idl1; ik++) {
-        cc[ik + l*idl1] = ch[ik] + ar1*ch[ik + idl1];
-        cc[ik + lc*idl1] = ai1*ch[ik + (ip-1)*idl1];
-      }
-      dc2 = ar1;
-      ds2 = ai1;
-      ar2 = ar1;
-      ai2 = ai1;
-      for (j=2; j<ipph; j++) {
-        jc = ip - j;
-        ar2h = dc2*ar2 - ds2*ai2;
-        ai2 = dc2*ai2 + ds2*ar2;
-        ar2 = ar2h;
-        for (ik=0; ik<idl1; ik++) {
-          cc[ik + l*idl1] += ar2*ch[ik + j*idl1];
-          cc[ik + lc*idl1] += ai2*ch[ik + jc*idl1];
-        }
-      }
-    }
-    for (j=1; j<ipph; j++) {
-      for (ik=0; ik<idl1; ik++) {
-        ch[ik] += ch[ik + j*idl1];
-      }
-    }
-    for (j=1; j<ipph; j++) {
-      jc = ip - j;
-      for (k=0; k<l1; k++) {
-        ch[(k + j*l1)*ido] = cc[(k + j*l1)*ido] - cc[(k + jc*l1)*ido];
-        ch[(k + jc*l1)*ido] = cc[(k + j*l1)*ido] + cc[(k + jc*l1)*ido];
-      }
-    }
-
-    if (ido == 1) return;
-    if (nbd >= l1) {
-      for (j=1; j<ipph; j++) {
-        jc = ip - j;
-        for (k=0; k<l1; k++) {
-          for (i=2; i<ido; i+=2) {
-            ch[i - 1 + (k + j*l1)*ido] = cc[i - 1 + (k + j*l1)*ido] - cc[i + (k + jc*l1)*ido];
-            ch[i - 1 + (k + jc*l1)*ido] = cc[i - 1 + (k + j*l1)*ido] + cc[i + (k + jc*l1)*ido];
-            ch[i + (k + j*l1)*ido] = cc[i + (k + j*l1)*ido] + cc[i - 1 + (k + jc*l1)*ido];
-            ch[i + (k + jc*l1)*ido] = cc[i + (k + j*l1)*ido] - cc[i - 1 + (k + jc*l1)*ido];
-          }
-        }
-      }
-    } else {
-      for (j=1; j<ipph; j++) {
-        jc = ip - j;
-        for (i=2; i<ido; i+=2) {
-          for (k=0; k<l1; k++) {
-            ch[i - 1 + (k + j*l1)*ido] = cc[i - 1 + (k + j*l1)*ido] - cc[i + (k + jc*l1)*ido];
-            ch[i - 1 + (k + jc*l1)*ido] = cc[i - 1 + (k + j *l1)*ido] + cc[i + (k + jc*l1)*ido];
-            ch[i + (k + j*l1)*ido] = cc[i + (k + j*l1)*ido] + cc[i - 1 + (k + jc*l1)*ido];
-            ch[i + (k + jc*l1)*ido] = cc[i + (k + j*l1)*ido] - cc[i - 1 + (k + jc*l1)*ido];
-          }
-        }
-      }
-    }
-    for (ik=0; ik<idl1; ik++) cc[ik] = ch[ik];
-    for (j=1; j<ip; j++)
-      for (k=0; k<l1; k++)
-        cc[(k + j*l1)*ido] = ch[(k + j*l1)*ido];
-    if (nbd <= l1) {
-      is = -ido;
-      for (j=1; j<ip; j++) {
-        is += ido;
-        idij = is-1;
-        for (i=2; i<ido; i+=2) {
-          idij += 2;
-          for (k=0; k<l1; k++) {
-            cc[i - 1 + (k + j*l1)*ido] = wa[idij - 1]*ch[i - 1 + (k + j*l1)*ido] - wa[idij]*
-                ch[i + (k + j*l1)*ido];
-            cc[i + (k + j*l1)*ido] = wa[idij - 1]*ch[i + (k + j*l1)*ido] + wa[idij]*ch[i - 1 + (k + j*l1)*ido];
-          }
-        }
-      }
-    } else {
-      is = -ido;
-      for (j=1; j<ip; j++) {
-        is += ido;
-        for (k=0; k<l1; k++) {
-          idij = is - 1;
-          for (i=2; i<ido; i+=2) {
-            idij += 2;
-            cc[i - 1 + (k + j*l1)*ido] = wa[idij-1]*ch[i - 1 + (k + j*l1)*ido] - wa[idij]*
-                ch[i + (k + j*l1)*ido];
-            cc[i + (k + j*l1)*ido] = wa[idij-1]*ch[i + (k + j*l1)*ido] + wa[idij]*ch[i - 1 + (k + j*l1)*ido];
-          }
-        }
-      }
-    }
-  } /* radbg */
-
-  /* ------------------------------------------------------------
-cfftf1, npy_cfftf, npy_cfftb, cffti1, npy_cffti. Complex FFTs.
---------------------------------------------------------------- */
-
-static void cfftf1(int n, Treal c[], Treal ch[], const Treal wa[], const int ifac[MAXFAC+2], int isign)
-  {
-    int idot, i;
-    int k1, l1, l2;
-    int na, nf, ip, iw, ix2, ix3, ix4, nac, ido, idl1;
-    Treal *cinput, *coutput;
-    nf = ifac[1];
-    na = 0;
-    l1 = 1;
-    iw = 0;
-    for (k1=2; k1<=nf+1; k1++) {
-      ip = ifac[k1];
-      l2 = ip*l1;
-      ido = n / l2;
-      idot = ido + ido;
-      idl1 = idot*l1;
-      if (na) {
-        cinput = ch;
-        coutput = c;
-      } else {
-        cinput = c;
-        coutput = ch;
-      }
-      switch (ip) {
-      case 4:
-        ix2 = iw + idot;
-        ix3 = ix2 + idot;
-        passf4(idot, l1, cinput, coutput, &wa[iw], &wa[ix2], &wa[ix3], isign);
-        na = !na;
-        break;
-      case 2:
-        passf2(idot, l1, cinput, coutput, &wa[iw], isign);
-        na = !na;
-        break;
-      case 3:
-        ix2 = iw + idot;
-        passf3(idot, l1, cinput, coutput, &wa[iw], &wa[ix2], isign);
-        na = !na;
-        break;
-      case 5:
-        ix2 = iw + idot;
-        ix3 = ix2 + idot;
-        ix4 = ix3 + idot;
-        passf5(idot, l1, cinput, coutput, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
-        na = !na;
-        break;
-      default:
-        passf(&nac, idot, ip, l1, idl1, cinput, coutput, &wa[iw], isign);
-        if (nac != 0) na = !na;
-      }
-      l1 = l2;
-      iw += (ip - 1)*idot;
-    }
-    if (na == 0) return;
-    for (i=0; i<2*n; i++) c[i] = ch[i];
-  } /* cfftf1 */
-
-
-NPY_VISIBILITY_HIDDEN void npy_cfftf(int n, Treal c[], Treal wsave[])
-  {
-    int iw1, iw2;
-    if (n == 1) return;
-    iw1 = 2*n;
-    iw2 = iw1 + 2*n;
-    cfftf1(n, c, wsave, wsave+iw1, (int*)(wsave+iw2), -1);
-  } /* npy_cfftf */
-
-
-NPY_VISIBILITY_HIDDEN void npy_cfftb(int n, Treal c[], Treal wsave[])
-  {
-    int iw1, iw2;
-    if (n == 1) return;
-    iw1 = 2*n;
-    iw2 = iw1 + 2*n;
-    cfftf1(n, c, wsave, wsave+iw1, (int*)(wsave+iw2), +1);
-  } /* npy_cfftb */
-
-
-static void factorize(int n, int ifac[MAXFAC+2], const int ntryh[NSPECIAL])
-  /* Factorize n in factors in ntryh and rest. On exit,
-ifac[0] contains n and ifac[1] contains number of factors,
-the factors start from ifac[2]. */
-  {
-    int ntry=3, i, j=0, ib, nf=0, nl=n, nq, nr;
-startloop:
-    if (j < NSPECIAL)
-      ntry = ntryh[j];
-    else
-      ntry+= 2;
-    j++;
-    do {
-      nq = nl / ntry;
-      nr = nl - ntry*nq;
-      if (nr != 0) goto startloop;
-      nf++;
-      ifac[nf + 1] = ntry;
-      nl = nq;
-      if (ntry == 2 && nf != 1) {
-        for (i=2; i<=nf; i++) {
-          ib = nf - i + 2;
-          ifac[ib + 1] = ifac[ib];
-        }
-        ifac[2] = 2;
-      }
-    } while (nl != 1);
-    ifac[0] = n;
-    ifac[1] = nf;
-  }
-
-
-static void cffti1(int n, Treal wa[], int ifac[MAXFAC+2])
-  {
-    int fi, idot, i, j;
-    int i1, k1, l1, l2;
-    int ld, ii, nf, ip;
-    int ido, ipm;
-
-    static const int ntryh[NSPECIAL] = {
-      3,4,2,5    }; /* Do not change the order of these. */
-
-    factorize(n,ifac,ntryh);
-    nf = ifac[1];
-    i = 1;
-    l1 = 1;
-    for (k1=1; k1<=nf; k1++) {
-      ip = ifac[k1+1];
-      ld = 0;
-      l2 = l1*ip;
-      ido = n / l2;
-      idot = ido + ido + 2;
-      ipm = ip - 1;
-      for (j=1; j<=ipm; j++) {
-        i1 = i;
-        wa[i-1] = 1;
-        wa[i] = 0;
-        ld += l1;
-        fi = 0;
-        for (ii=4; ii<=idot; ii+=2) {
-          i+= 2;
-          fi+= 1;
-          sincos2pi(fi*ld, n, wa+i, wa+i-1);
-        }
-        if (ip > 5) {
-          wa[i1-1] = wa[i-1];
-          wa[i1] = wa[i];
-        }
-      }
-      l1 = l2;
-    }
-  } /* cffti1 */
-
-
-NPY_VISIBILITY_HIDDEN void npy_cffti(int n, Treal wsave[])
- {
-    int iw1, iw2;
-    if (n == 1) return;
-    iw1 = 2*n;
-    iw2 = iw1 + 2*n;
-    cffti1(n, wsave+iw1, (int*)(wsave+iw2));
-  } /* npy_cffti */
-
-  /* -------------------------------------------------------------------
-rfftf1, rfftb1, npy_rfftf, npy_rfftb, rffti1, npy_rffti. Treal FFTs.
----------------------------------------------------------------------- */
-
-static void rfftf1(int n, Treal c[], Treal ch[], const Treal wa[], const int ifac[MAXFAC+2])
-  {
-    int i;
-    int k1, l1, l2, na, kh, nf, ip, iw, ix2, ix3, ix4, ido, idl1;
-    Treal *cinput, *coutput;
-    nf = ifac[1];
-    na = 1;
-    l2 = n;
-    iw = n-1;
-    for (k1 = 1; k1 <= nf; ++k1) {
-      kh = nf - k1;
-      ip = ifac[kh + 2];
-      l1 = l2 / ip;
-      ido = n / l2;
-      idl1 = ido*l1;
-      iw -= (ip - 1)*ido;
-      na = !na;
-      if (na) {
-        cinput = ch;
-        coutput = c;
-      } else {
-        cinput = c;
-        coutput = ch;
-      }
-      switch (ip) {
-      case 4:
-        ix2 = iw + ido;
-        ix3 = ix2 + ido;
-        radf4(ido, l1, cinput, coutput, &wa[iw], &wa[ix2], &wa[ix3]);
-        break;
-      case 2:
-        radf2(ido, l1, cinput, coutput, &wa[iw]);
-        break;
-      case 3:
-        ix2 = iw + ido;
-        radf3(ido, l1, cinput, coutput, &wa[iw], &wa[ix2]);
-        break;
-      case 5:
-        ix2 = iw + ido;
-        ix3 = ix2 + ido;
-        ix4 = ix3 + ido;
-        radf5(ido, l1, cinput, coutput, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]);
-        break;
-      default:
-        if (ido == 1)
-          na = !na;
-        if (na == 0) {
-          radfg(ido, ip, l1, idl1, c, ch, &wa[iw]);
-          na = 1;
-        } else {
-          radfg(ido, ip, l1, idl1, ch, c, &wa[iw]);
-          na = 0;
-        }
-      }
-      l2 = l1;
-    }
-    if (na == 1) return;
-    for (i = 0; i < n; i++) c[i] = ch[i];
-  } /* rfftf1 */
-
-
-static void rfftb1(int n, Treal c[], Treal ch[], const Treal wa[], const int ifac[MAXFAC+2])
-  {
-    int i;
-    int k1, l1, l2, na, nf, ip, iw, ix2, ix3, ix4, ido, idl1;
-    Treal *cinput, *coutput;
-    nf = ifac[1];
-    na = 0;
-    l1 = 1;
-    iw = 0;
-    for (k1=1; k1<=nf; k1++) {
-      ip = ifac[k1 + 1];
-      l2 = ip*l1;
-      ido = n / l2;
-      idl1 = ido*l1;
-      if (na) {
-        cinput = ch;
-        coutput = c;
-      } else {
-        cinput = c;
-        coutput = ch;
-      }
-      switch (ip) {
-      case 4:
-        ix2 = iw + ido;
-        ix3 = ix2 + ido;
-        radb4(ido, l1, cinput, coutput, &wa[iw], &wa[ix2], &wa[ix3]);
-        na = !na;
-        break;
-      case 2:
-        radb2(ido, l1, cinput, coutput, &wa[iw]);
-        na = !na;
-        break;
-      case 3:
-        ix2 = iw + ido;
-        radb3(ido, l1, cinput, coutput, &wa[iw], &wa[ix2]);
-        na = !na;
-        break;
-      case 5:
-        ix2 = iw + ido;
-        ix3 = ix2 + ido;
-        ix4 = ix3 + ido;
-        radb5(ido, l1, cinput, coutput, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]);
-        na = !na;
-        break;
-      default:
-        radbg(ido, ip, l1, idl1, cinput, coutput, &wa[iw]);
-        if (ido == 1) na = !na;
-      }
-      l1 = l2;
-      iw += (ip - 1)*ido;
-    }
-    if (na == 0) return;
-    for (i=0; i<n; i++) c[i] = ch[i];
-  } /* rfftb1 */
-
-
-NPY_VISIBILITY_HIDDEN void npy_rfftf(int n, Treal r[], Treal wsave[])
-  {
-    if (n == 1) return;
-    rfftf1(n, r, wsave, wsave+n, (int*)(wsave+2*n));
-  } /* npy_rfftf */
-
-
-NPY_VISIBILITY_HIDDEN void npy_rfftb(int n, Treal r[], Treal wsave[])
-  {
-    if (n == 1) return;
-    rfftb1(n, r, wsave, wsave+n, (int*)(wsave+2*n));
-  } /* npy_rfftb */
-
-
-static void rffti1(int n, Treal wa[], int ifac[MAXFAC+2])
-  {
-    int fi, i, j;
-    int k1, l1, l2;
-    int ld, ii, nf, ip, is;
-    int ido, ipm, nfm1;
-    static const int ntryh[NSPECIAL] = {
-      4,2,3,5    }; /* Do not change the order of these. */
-    factorize(n,ifac,ntryh);
-    nf = ifac[1];
-    is = 0;
-    nfm1 = nf - 1;
-    l1 = 1;
-    if (nfm1 == 0) return;
-    for (k1 = 1; k1 <= nfm1; k1++) {
-      ip = ifac[k1 + 1];
-      ld = 0;
-      l2 = l1*ip;
-      ido = n / l2;
-      ipm = ip - 1;
-      for (j = 1; j <= ipm; ++j) {
-        ld += l1;
-        i = is;
-        fi = 0;
-        for (ii = 3; ii <= ido; ii += 2) {
-          i += 2;
-          fi += 1;
-          sincos2pi(fi*ld, n, wa+i-1, wa+i-2);
-        }
-        is += ido;
-      }
-      l1 = l2;
-    }
-  } /* rffti1 */
-
-
-NPY_VISIBILITY_HIDDEN void npy_rffti(int n, Treal wsave[])
-  {
-    if (n == 1) return;
-    rffti1(n, wsave+n, (int*)(wsave+2*n));
-  } /* npy_rffti */
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/numpy/fft/fftpack.h b/numpy/fft/fftpack.h
deleted file mode 100644
index 5e8f4631c..000000000
--- a/numpy/fft/fftpack.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * This file is part of tela the Tensor Language.
- * Copyright (c) 1994-1995 Pekka Janhunen
- */
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define DOUBLE
-
-#ifdef DOUBLE
-#define Treal double
-#else
-#define Treal float
-#endif
-
-extern NPY_VISIBILITY_HIDDEN void npy_cfftf(int N, Treal data[], const Treal wrk[]);
-extern NPY_VISIBILITY_HIDDEN void npy_cfftb(int N, Treal data[], const Treal wrk[]);
-extern NPY_VISIBILITY_HIDDEN void npy_cffti(int N, Treal wrk[]);
-
-extern NPY_VISIBILITY_HIDDEN void npy_rfftf(int N, Treal data[], const Treal wrk[]);
-extern NPY_VISIBILITY_HIDDEN void npy_rfftb(int N, Treal data[], const Treal wrk[]);
-extern NPY_VISIBILITY_HIDDEN void npy_rffti(int N, Treal wrk[]);
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/numpy/fft/fftpack_litemodule.c b/numpy/fft/fftpack_litemodule.c
deleted file mode 100644
index bd6cfc120..000000000
--- a/numpy/fft/fftpack_litemodule.c
+++ /dev/null
@@ -1,366 +0,0 @@
-#define NPY_NO_DEPRECATED_API NPY_API_VERSION
-
-#include "Python.h"
-#include "numpy/arrayobject.h"
-#include "fftpack.h"
-
-static PyObject *ErrorObject;
-
-static const char fftpack_cfftf__doc__[] = "";
-
-static PyObject *
-fftpack_cfftf(PyObject *NPY_UNUSED(self), PyObject *args)
-{
-    PyObject *op1, *op2;
-    PyArrayObject *data;
-    PyArray_Descr *descr;
-    double *wsave, *dptr;
-    npy_intp nsave;
-    int npts, nrepeats, i;
-
-    if(!PyArg_ParseTuple(args, "OO:cfftf", &op1, &op2)) {
-        return NULL;
-    }
-    data = (PyArrayObject *)PyArray_CopyFromObject(op1,
-            NPY_CDOUBLE, 1, 0);
-    if (data == NULL) {
-        return NULL;
-    }
-    descr = PyArray_DescrFromType(NPY_DOUBLE);
-    if (PyArray_AsCArray(&op2, (void *)&wsave, &nsave, 1, descr) == -1) {
-        goto fail;
-    }
-    if (data == NULL) {
-        goto fail;
-    }
-
-    npts = PyArray_DIM(data, PyArray_NDIM(data) - 1);
-    if (nsave != npts*4 + 15) {
-        PyErr_SetString(ErrorObject, "invalid work array for fft size");
-        goto fail;
-    }
-
-    nrepeats = PyArray_SIZE(data)/npts;
-    dptr = (double *)PyArray_DATA(data);
-    Py_BEGIN_ALLOW_THREADS;
-    NPY_SIGINT_ON;
-    for (i = 0; i < nrepeats; i++) {
-        npy_cfftf(npts, dptr, wsave);
-        dptr += npts*2;
-    }
-    NPY_SIGINT_OFF;
-    Py_END_ALLOW_THREADS;
-    PyArray_Free(op2, (char *)wsave);
-    return (PyObject *)data;
-
-fail:
-    PyArray_Free(op2, (char *)wsave);
-    Py_DECREF(data);
-    return NULL;
-}
-
-static const char fftpack_cfftb__doc__[] = "";
-
-static PyObject *
-fftpack_cfftb(PyObject *NPY_UNUSED(self), PyObject *args)
-{
-    PyObject *op1, *op2;
-    PyArrayObject *data;
-    PyArray_Descr *descr;
-    double *wsave, *dptr;
-    npy_intp nsave;
-    int npts, nrepeats, i;
-
-    if(!PyArg_ParseTuple(args, "OO:cfftb", &op1, &op2)) {
-        return NULL;
-    }
-    data = (PyArrayObject *)PyArray_CopyFromObject(op1,
-            NPY_CDOUBLE, 1, 0);
-    if (data == NULL) {
-        return NULL;
-    }
-    descr = PyArray_DescrFromType(NPY_DOUBLE);
-    if (PyArray_AsCArray(&op2, (void *)&wsave, &nsave, 1, descr) == -1) {
-        goto fail;
-    }
-    if (data == NULL) {
-        goto fail;
-    }
-
-    npts = PyArray_DIM(data, PyArray_NDIM(data) - 1);
-    if (nsave != npts*4 + 15) {
-        PyErr_SetString(ErrorObject, "invalid work array for fft size");
-        goto fail;
-    }
-
-    nrepeats = PyArray_SIZE(data)/npts;
-    dptr = (double *)PyArray_DATA(data);
-    Py_BEGIN_ALLOW_THREADS;
-    NPY_SIGINT_ON;
-    for (i = 0; i < nrepeats; i++) {
-        npy_cfftb(npts, dptr, wsave);
-        dptr += npts*2;
-    }
-    NPY_SIGINT_OFF;
-    Py_END_ALLOW_THREADS;
-    PyArray_Free(op2, (char *)wsave);
-    return (PyObject *)data;
-
-fail:
-    PyArray_Free(op2, (char *)wsave);
-    Py_DECREF(data);
-    return NULL;
-}
-
-static const char fftpack_cffti__doc__[] = "";
-
-static PyObject *
-fftpack_cffti(PyObject *NPY_UNUSED(self), PyObject *args)
-{
-    PyArrayObject *op;
-    npy_intp dim;
-    long n;
-
-    if (!PyArg_ParseTuple(args, "l:cffti", &n)) {
-        return NULL;
-    }
-    /*Magic size needed by npy_cffti*/
-    dim = 4*n + 15;
-    /*Create a 1 dimensional array of dimensions of type double*/
-    op = (PyArrayObject *)PyArray_SimpleNew(1, &dim, NPY_DOUBLE);
-    if (op == NULL) {
-        return NULL;
-    }
-
-    Py_BEGIN_ALLOW_THREADS;
-    NPY_SIGINT_ON;
-    npy_cffti(n, (double *)PyArray_DATA((PyArrayObject*)op));
-    NPY_SIGINT_OFF;
-    Py_END_ALLOW_THREADS;
-
-    return (PyObject *)op;
-}
-
-static const char fftpack_rfftf__doc__[] = "";
-
-static PyObject *
-fftpack_rfftf(PyObject *NPY_UNUSED(self), PyObject *args)
-{
-    PyObject *op1, *op2;
-    PyArrayObject *data, *ret;
-    PyArray_Descr *descr;
-    double *wsave = NULL, *dptr, *rptr;
-    npy_intp nsave;
-    int npts, nrepeats, i, rstep;
-
-    if(!PyArg_ParseTuple(args, "OO:rfftf", &op1, &op2)) {
-        return NULL;
-    }
-    data = (PyArrayObject *)PyArray_ContiguousFromObject(op1,
-            NPY_DOUBLE, 1, 0);
-    if (data == NULL) {
-        return NULL;
-    }
-    /* FIXME, direct access changing contents of data->dimensions */
-    npts = PyArray_DIM(data, PyArray_NDIM(data) - 1);
-    PyArray_DIMS(data)[PyArray_NDIM(data) - 1] = npts/2 + 1;
-    ret = (PyArrayObject *)PyArray_Zeros(PyArray_NDIM(data),
-            PyArray_DIMS(data), PyArray_DescrFromType(NPY_CDOUBLE), 0);
-    if (ret == NULL) {
-        goto fail;
-    }
-    PyArray_DIMS(data)[PyArray_NDIM(data) - 1] = npts;
-    rstep = PyArray_DIM(ret, PyArray_NDIM(ret) - 1)*2;
-
-    descr = PyArray_DescrFromType(NPY_DOUBLE);
-    if (PyArray_AsCArray(&op2, (void *)&wsave, &nsave, 1, descr) == -1) {
-        goto fail;
-    }
-    if (data == NULL || ret == NULL) {
-        goto fail;
-    }
-    if (nsave != npts*2+15) {
-        PyErr_SetString(ErrorObject, "invalid work array for fft size");
-        goto fail;
-    }
-
-    nrepeats = PyArray_SIZE(data)/npts;
-    rptr = (double *)PyArray_DATA(ret);
-    dptr = (double *)PyArray_DATA(data);
-
-    Py_BEGIN_ALLOW_THREADS;
-    NPY_SIGINT_ON;
-    for (i = 0; i < nrepeats; i++) {
-        memcpy((char *)(rptr+1), dptr, npts*sizeof(double));
-        npy_rfftf(npts, rptr+1, wsave);
-        rptr[0] = rptr[1];
-        rptr[1] = 0.0;
-        rptr += rstep;
-        dptr += npts;
-    }
-    NPY_SIGINT_OFF;
-    Py_END_ALLOW_THREADS;
-    PyArray_Free(op2, (char *)wsave);
-    Py_DECREF(data);
-    return (PyObject *)ret;
-
-fail:
-    PyArray_Free(op2, (char *)wsave);
-    Py_XDECREF(data);
-    Py_XDECREF(ret);
-    return NULL;
-}
-
-static const char fftpack_rfftb__doc__[] = "";
-
-static PyObject *
-fftpack_rfftb(PyObject *NPY_UNUSED(self), PyObject *args)
-{
-    PyObject *op1, *op2;
-    PyArrayObject *data, *ret;
-    PyArray_Descr *descr;
-    double *wsave, *dptr, *rptr;
-    npy_intp nsave;
-    int npts, nrepeats, i;
-
-    if(!PyArg_ParseTuple(args, "OO:rfftb", &op1, &op2)) {
-        return NULL;
-    }
-    data = (PyArrayObject *)PyArray_ContiguousFromObject(op1,
-            NPY_CDOUBLE, 1, 0);
-    if (data == NULL) {
-        return NULL;
-    }
-    npts = PyArray_DIM(data, PyArray_NDIM(data) - 1);
-    ret = (PyArrayObject *)PyArray_Zeros(PyArray_NDIM(data), PyArray_DIMS(data),
-            PyArray_DescrFromType(NPY_DOUBLE), 0);
-
-    descr = PyArray_DescrFromType(NPY_DOUBLE);
-    if (PyArray_AsCArray(&op2, (void *)&wsave, &nsave, 1, descr) == -1) {
-        goto fail;
-    }
-    if (data == NULL || ret == NULL) {
-        goto fail;
-    }
-    if (nsave != npts*2 + 15) {
-        PyErr_SetString(ErrorObject, "invalid work array for fft size");
-        goto fail;
-    }
-
-    nrepeats = PyArray_SIZE(ret)/npts;
-    rptr = (double *)PyArray_DATA(ret);
-    dptr = (double *)PyArray_DATA(data);
-
-    Py_BEGIN_ALLOW_THREADS;
-    NPY_SIGINT_ON;
-    for (i = 0; i < nrepeats; i++) {
-        memcpy((char *)(rptr + 1), (dptr + 2), (npts - 1)*sizeof(double));
-        rptr[0] = dptr[0];
-        npy_rfftb(npts, rptr, wsave);
-        rptr += npts;
-        dptr += npts*2;
-    }
-    NPY_SIGINT_OFF;
-    Py_END_ALLOW_THREADS;
-    PyArray_Free(op2, (char *)wsave);
-    Py_DECREF(data);
-    return (PyObject *)ret;
-
-fail:
-    PyArray_Free(op2, (char *)wsave);
-    Py_XDECREF(data);
-    Py_XDECREF(ret);
-    return NULL;
-}
-
-static const char fftpack_rffti__doc__[] = "";
-
-static PyObject *
-fftpack_rffti(PyObject *NPY_UNUSED(self), PyObject *args)
-{
-  PyArrayObject *op;
-  npy_intp dim;
-  long n;
-
-  if (!PyArg_ParseTuple(args, "l:rffti", &n)) {
-      return NULL;
-  }
-  /*Magic size needed by npy_rffti*/
-  dim = 2*n + 15;
-  /*Create a 1 dimensional array of dimensions of type double*/
-  op = (PyArrayObject *)PyArray_SimpleNew(1, &dim, NPY_DOUBLE);
-  if (op == NULL) {
-      return NULL;
-  }
-  Py_BEGIN_ALLOW_THREADS;
-  NPY_SIGINT_ON;
-  npy_rffti(n, (double *)PyArray_DATA((PyArrayObject*)op));
-  NPY_SIGINT_OFF;
-  Py_END_ALLOW_THREADS;
-
-  return (PyObject *)op;
-}
-
-
-/* List of methods defined in the module */
-
-static struct PyMethodDef fftpack_methods[] = {
-    {"cfftf",   fftpack_cfftf,  1,      fftpack_cfftf__doc__},
-    {"cfftb",   fftpack_cfftb,  1,      fftpack_cfftb__doc__},
-    {"cffti",   fftpack_cffti,  1,      fftpack_cffti__doc__},
-    {"rfftf",   fftpack_rfftf,  1,      fftpack_rfftf__doc__},
-    {"rfftb",   fftpack_rfftb,  1,      fftpack_rfftb__doc__},
-    {"rffti",   fftpack_rffti,  1,      fftpack_rffti__doc__},
-    {NULL, NULL, 0, NULL}          /* sentinel */
-};
-
-#if PY_MAJOR_VERSION >= 3
-static struct PyModuleDef moduledef = {
-        PyModuleDef_HEAD_INIT,
-        "fftpack_lite",
-        NULL,
-        -1,
-        fftpack_methods,
-        NULL,
-        NULL,
-        NULL,
-        NULL
-};
-#endif
-
-/* Initialization function for the module */
-#if PY_MAJOR_VERSION >= 3
-#define RETVAL(x) x
-PyMODINIT_FUNC PyInit_fftpack_lite(void)
-#else
-#define RETVAL(x)
-PyMODINIT_FUNC
-initfftpack_lite(void)
-#endif
-{
-    PyObject *m,*d;
-#if PY_MAJOR_VERSION >= 3
-    m = PyModule_Create(&moduledef);
-#else
-    static const char fftpack_module_documentation[] = "";
-
-    m = Py_InitModule4("fftpack_lite", fftpack_methods,
-            fftpack_module_documentation,
-            (PyObject*)NULL,PYTHON_API_VERSION);
-#endif
-    if (m == NULL) {
-        return RETVAL(NULL);
-    }
-
-    /* Import the array object */
-    import_array();
-
-    /* Add some symbolic constants to the module */
-    d = PyModule_GetDict(m);
-    ErrorObject = PyErr_NewException("fftpack.error", NULL, NULL);
-    PyDict_SetItemString(d, "error", ErrorObject);
-
-    /* XXXX Add constants here */
-
-    return RETVAL(m);
-}
diff --git a/numpy/fft/helper.py b/numpy/fft/helper.py
index 9985f6d4c..a920a4ac0 100644
--- a/numpy/fft/helper.py
+++ b/numpy/fft/helper.py
@@ -4,11 +4,6 @@ Discrete Fourier Transforms - helper.py
 """
 from __future__ import division, absolute_import, print_function
 
-import collections
-try:
-    import threading
-except ImportError:
-    import dummy_threading as threading
 from numpy.compat import integer_types
 from numpy.core import integer, empty, arange, asarray, roll
 from numpy.core.overrides import array_function_dispatch, set_module
@@ -227,99 +222,3 @@ def rfftfreq(n, d=1.0):
     N = n//2 + 1
     results = arange(0, N, dtype=int)
     return results * val
-
-
-class _FFTCache(object):
-    """
-    Cache for the FFT twiddle factors as an LRU (least recently used) cache.
-
-    Parameters
-    ----------
-    max_size_in_mb : int
-        Maximum memory usage of the cache before items are being evicted.
-    max_item_count : int
-        Maximum item count of the cache before items are being evicted.
-
-    Notes
-    -----
-    Items will be evicted if either limit has been reached upon getting and
-    setting. The maximum memory usages is not strictly the given
-    ``max_size_in_mb`` but rather
-    ``max(max_size_in_mb, 1.5 * size_of_largest_item)``. Thus the cache will
-    never be completely cleared - at least one item will remain and a single
-    large item can cause the cache to retain several smaller items even if the
-    given maximum cache size has been exceeded.
-    """
-    def __init__(self, max_size_in_mb, max_item_count):
-        self._max_size_in_bytes = max_size_in_mb * 1024 ** 2
-        self._max_item_count = max_item_count
-        self._dict = collections.OrderedDict()
-        self._lock = threading.Lock()
-
-    def put_twiddle_factors(self, n, factors):
-        """
-        Store twiddle factors for an FFT of length n in the cache.
-
-        Putting multiple twiddle factors for a certain n will store it multiple
-        times.
-
-        Parameters
-        ----------
-        n : int
-            Data length for the FFT.
-        factors : ndarray
-            The actual twiddle values.
-        """
-        with self._lock:
-            # Pop + later add to move it to the end for LRU behavior.
-            # Internally everything is stored in a dictionary whose values are
-            # lists.
-            try:
-                value = self._dict.pop(n)
-            except KeyError:
-                value = []
-            value.append(factors)
-            self._dict[n] = value
-            self._prune_cache()
-
-    def pop_twiddle_factors(self, n):
-        """
-        Pop twiddle factors for an FFT of length n from the cache.
-
-        Will return None if the requested twiddle factors are not available in
-        the cache.
-
-        Parameters
-        ----------
-        n : int
-            Data length for the FFT.
-
-        Returns
-        -------
-        out : ndarray or None
-            The retrieved twiddle factors if available, else None.
-        """
-        with self._lock:
-            if n not in self._dict or not self._dict[n]:
-                return None
-            # Pop + later add to move it to the end for LRU behavior.
-            all_values = self._dict.pop(n)
-            value = all_values.pop()
-            # Only put pack if there are still some arrays left in the list.
-            if all_values:
-                self._dict[n] = all_values
-            return value
-
-    def _prune_cache(self):
-        # Always keep at least one item.
-        while len(self._dict) > 1 and (
-                len(self._dict) > self._max_item_count or self._check_size()):
-            self._dict.popitem(last=False)
-
-    def _check_size(self):
-        item_sizes = [sum(_j.nbytes for _j in _i)
-                      for _i in self._dict.values() if _i]
-        if not item_sizes:
-            return False
-        max_size = max(self._max_size_in_bytes, 1.5 * max(item_sizes))
-        return sum(item_sizes) > max_size
diff --git a/numpy/fft/pocketfft.c b/numpy/fft/pocketfft.c
new file mode 100644
index 000000000..9d1218e6b
--- /dev/null
+++ b/numpy/fft/pocketfft.c
@@ -0,0 +1,2406 @@
+/*
+ * This file is part of pocketfft.
+ * Licensed under a 3-clause BSD style license - see LICENSE.md
+ */
+
+/*
+ *  Main implementation file.
+ *
+ *  Copyright (C) 2004-2018 Max-Planck-Society
+ *  \author Martin Reinecke
+ */
+
+#include <math.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include "npy_config.h"
+#define restrict NPY_RESTRICT
+
+#define RALLOC(type,num) \
+  ((type *)malloc((num)*sizeof(type)))
+#define DEALLOC(ptr) \
+  do { free(ptr); (ptr)=NULL; } while(0)
+
+#define SWAP(a,b,type) \
+  do { type tmp_=(a); (a)=(b); (b)=tmp_; } while(0)
+
+#ifdef __GNUC__
+#define NOINLINE __attribute__((noinline))
+#define WARN_UNUSED_RESULT __attribute__ ((warn_unused_result))
+#else
+#define NOINLINE
+#define WARN_UNUSED_RESULT
+#endif
+
+struct cfft_plan_i;
+typedef struct cfft_plan_i * cfft_plan;
+struct rfft_plan_i;
+typedef struct rfft_plan_i * rfft_plan;
+
+// adapted from https://stackoverflow.com/questions/42792939/
+// CAUTION: this function only works for arguments in the range [-0.25; 0.25]!
+static void my_sincosm1pi (double a, double *restrict res)
+  {
+  double s = a * a;
+  /* Approximate cos(pi*x)-1 for x in [-0.25,0.25] */
+  double r =     -1.0369917389758117e-4;
+  r = fma (r, s,  1.9294935641298806e-3);
+  r = fma (r, s, -2.5806887942825395e-2);
+  r = fma (r, s,  2.3533063028328211e-1);
+  r = fma (r, s, -1.3352627688538006e+0);
+  r = fma (r, s,  4.0587121264167623e+0);
+  r = fma (r, s, -4.9348022005446790e+0);
+  double c = r*s;
+  /* Approximate sin(pi*x) for x in [-0.25,0.25] */
+  r =             4.6151442520157035e-4;
+  r = fma (r, s, -7.3700183130883555e-3);
+  r = fma (r, s,  8.2145868949323936e-2);
+  r = fma (r, s, -5.9926452893214921e-1);
+  r = fma (r, s,  2.5501640398732688e+0);
+  r = fma (r, s, -5.1677127800499516e+0);
+  s = s * a;
+  r = r * s;
+  s = fma (a, 3.1415926535897931e+0, r);
+  res[0] = c;
+  res[1] = s;
+  }
+
+NOINLINE static void calc_first_octant(size_t den, double * restrict res)
+  {
+  size_t n = (den+4)>>3;
+  if (n==0) return;
+  res[0]=1.; res[1]=0.;
+  if (n==1) return;
+  size_t l1=(size_t)sqrt(n);
+  for (size_t i=1; i<l1; ++i)
+    my_sincosm1pi((2.*i)/den,&res[2*i]);
+  size_t start=l1;
+  while(start<n)
+    {
+    double cs[2];
+    my_sincosm1pi((2.*start)/den,cs);
+    res[2*start] = cs[0]+1.;
+    res[2*start+1] = cs[1];
+    size_t end = l1;
+    if (start+end>n) end = n-start;
+    for (size_t i=1; i<end; ++i)
+      {
+      double csx[2]={res[2*i], res[2*i+1]};
+      res[2*(start+i)] = ((cs[0]*csx[0] - cs[1]*csx[1] + cs[0]) + csx[0]) + 1.;
+      res[2*(start+i)+1] = (cs[0]*csx[1] + cs[1]*csx[0]) + cs[1] + csx[1];
+      }
+    start += l1;
+    }
+  for (size_t i=1; i<l1; ++i)
+    res[2*i] += 1.;
+  }
+
+NOINLINE static void calc_first_quadrant(size_t n, double * restrict res)
+  {
+  double * restrict p = res+n;
+  calc_first_octant(n<<1, p);
+  size_t ndone=(n+2)>>2;
+  size_t i=0, idx1=0, idx2=2*ndone-2;
+  for (; i+1<ndone; i+=2, idx1+=2, idx2-=2)
+    {
+    res[idx1]   = p[2*i];
+    res[idx1+1] = p[2*i+1];
+    res[idx2]   = p[2*i+3];
+    res[idx2+1] = p[2*i+2];
+    }
+  if (i!=ndone)
+    {
+    res[idx1  ] = p[2*i];
+    res[idx1+1] = p[2*i+1];
+    }
+  }
+
+NOINLINE static void calc_first_half(size_t n, double * restrict res)
+  {
+  int ndone=(n+1)>>1;
+  double * p = res+n-1;
+  calc_first_octant(n<<2, p);
+  int i4=0, in=n, i=0;
+  for (; i4<=in-i4; ++i, i4+=4) // octant 0
+    {
+    res[2*i] = p[2*i4]; res[2*i+1] = p[2*i4+1];
+    }
+  for (; i4-in <= 0; ++i, i4+=4) // octant 1
+    {
+    int xm = in-i4;
+    res[2*i] = p[2*xm+1]; res[2*i+1] = p[2*xm];
+    }
+  for (; i4<=3*in-i4; ++i, i4+=4) // octant 2
+    {
+    int xm = i4-in;
+    res[2*i] = -p[2*xm+1]; res[2*i+1] = p[2*xm];
+    }
+  for (; i<ndone; ++i, i4+=4) // octant 3
+    {
+    int xm = 2*in-i4;
+    res[2*i] = -p[2*xm]; res[2*i+1] = p[2*xm+1];
+    }
+  }
+
+NOINLINE static void fill_first_quadrant(size_t n, double * restrict res)
+  {
+  const double hsqt2 = 0.707106781186547524400844362104849;
+  size_t quart = n>>2;
+  if ((n&7)==0)
+    res[quart] = res[quart+1] = hsqt2;
+  for (size_t i=2, j=2*quart-2; i<quart; i+=2, j-=2)
+    {
+    res[j  ] = res[i+1];
+    res[j+1] = res[i  ];
+    }
+  }
+
+NOINLINE static void fill_first_half(size_t n, double * restrict res)
+  {
+  size_t half = n>>1;
+  if ((n&3)==0)
+    for (size_t i=0; i<half; i+=2)
+      {
+      res[i+half]   = -res[i+1];
+      res[i+half+1] =  res[i  ];
+      }
+  else
+    for (size_t i=2, j=2*half-2; i<half; i+=2, j-=2)
+      {
+      res[j  ] = -res[i  ];
+      res[j+1] =  res[i+1];
+      }
+  }
+
+NOINLINE static void fill_second_half(size_t n, double * restrict res)
+  {
+  if ((n&1)==0)
+    for (size_t i=0; i<n; ++i)
+      res[i+n] = -res[i];
+  else
+    for (size_t i=2, j=2*n-2; i<n; i+=2, j-=2)
+      {
+      res[j  ] =  res[i  ];
+      res[j+1] = -res[i+1];
+      }
+  }
+
+NOINLINE static void sincos_2pibyn_half(size_t n, double * restrict res)
+  {
+  if ((n&3)==0)
+    {
+    calc_first_octant(n, res);
+    fill_first_quadrant(n, res);
+    fill_first_half(n, res);
+    }
+  else if ((n&1)==0)
+    {
+    calc_first_quadrant(n, res);
+    fill_first_half(n, res);
+    }
+  else
+    calc_first_half(n, res);
+  }
+
+NOINLINE static void sincos_2pibyn(size_t n, double * restrict res)
+  {
+  sincos_2pibyn_half(n, res);
+  fill_second_half(n, res);
+  }
+
+NOINLINE static size_t largest_prime_factor (size_t n)
+  {
+  size_t res=1;
+  size_t tmp;
+  while (((tmp=(n>>1))<<1)==n)
+    { res=2; n=tmp; }
+
+  size_t limit=(size_t)sqrt(n+0.01);
+  for (size_t x=3; x<=limit; x+=2)
+  while (((tmp=(n/x))*x)==n)
+    {
+    res=x;
+    n=tmp;
+    limit=(size_t)sqrt(n+0.01);
+    }
+  if (n>1) res=n;
+
+  return res;
+  }
+
+NOINLINE static double cost_guess (size_t n)
+  {
+  const double lfp=1.1; // penalty for non-hardcoded larger factors
+  size_t ni=n;
+  double result=0.;
+  size_t tmp;
+  while (((tmp=(n>>1))<<1)==n)
+    { result+=2; n=tmp; }
+
+  size_t limit=(size_t)sqrt(n+0.01);
+  for (size_t x=3; x<=limit; x+=2)
+  while ((tmp=(n/x))*x==n)
+    {
+    result+= (x<=5) ? x : lfp*x; // penalize larger prime factors
+    n=tmp;
+    limit=(size_t)sqrt(n+0.01);
+    }
+  if (n>1) result+=(n<=5) ? n : lfp*n;
+
+  return result*ni;
+  }
+
+/* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */
+NOINLINE static size_t good_size(size_t n)
+  {
+  if (n<=6) return n;
+
+  size_t bestfac=2*n;
+  for (size_t f2=1; f2<bestfac; f2*=2)
+    for (size_t f23=f2; f23<bestfac; f23*=3)
+      for (size_t f235=f23; f235<bestfac; f235*=5)
+        for (size_t f2357=f235; f2357<bestfac; f2357*=7)
+          for (size_t f235711=f2357; f235711<bestfac; f235711*=11)
+            if (f235711>=n) bestfac=f235711;
+  return bestfac;
+  }
+
+typedef struct cmplx {
+  double r,i;
+} cmplx;
+
+#define NFCT 25
+typedef struct cfftp_fctdata
+  {
+  size_t fct;
+  cmplx *tw, *tws;
+  } cfftp_fctdata;
+
+typedef struct cfftp_plan_i
+  {
+  size_t length, nfct;
+  cmplx *mem;
+  cfftp_fctdata fct[NFCT];
+  } cfftp_plan_i;
+typedef struct cfftp_plan_i * cfftp_plan;
+
+#define PMC(a,b,c,d) { a.r=c.r+d.r; a.i=c.i+d.i; b.r=c.r-d.r; b.i=c.i-d.i; }
+#define ADDC(a,b,c) { a.r=b.r+c.r; a.i=b.i+c.i; }
+#define SCALEC(a,b) { a.r*=b; a.i*=b; }
+#define ROT90(a) { double tmp_=a.r; a.r=-a.i; a.i=tmp_; }
+#define ROTM90(a) { double tmp_=-a.r; a.r=a.i; a.i=tmp_; }
+#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
+#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
+#define WA(x,i) wa[(i)-1+(x)*(ido-1)]
+/* a = b*c */
+#define A_EQ_B_MUL_C(a,b,c) { a.r=b.r*c.r-b.i*c.i; a.i=b.r*c.i+b.i*c.r; }
+/* a = conj(b)*c*/
+#define A_EQ_CB_MUL_C(a,b,c) { a.r=b.r*c.r+b.i*c.i; a.i=b.r*c.i-b.i*c.r; }
+
+#define PMSIGNC(a,b,c,d) { a.r=c.r+sign*d.r; a.i=c.i+sign*d.i; b.r=c.r-sign*d.r; b.i=c.i-sign*d.i; }
+/* a = b*c */
+#define MULPMSIGNC(a,b,c) { a.r=b.r*c.r-sign*b.i*c.i; a.i=b.r*c.i+sign*b.i*c.r; }
+/* a *= b */
+#define MULPMSIGNCEQ(a,b) { double xtmp=a.r; a.r=b.r*a.r-sign*b.i*a.i; a.i=b.r*a.i+sign*b.i*xtmp; }
+
+NOINLINE static void pass2b (size_t ido, size_t l1, const cmplx * restrict cc,
+  cmplx * restrict ch, const cmplx * restrict wa)
+  {
+  const size_t cdim=2;
+
+  if (ido==1)
+    for (size_t k=0; k<l1; ++k)
+      PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
+  else
+    for (size_t k=0; k<l1; ++k)
+      {
+      PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
+      for (size_t i=1; i<ido; ++i)
+        {
+        cmplx t;
+        PMC (CH(i,k,0),t,CC(i,0,k),CC(i,1,k))
+        A_EQ_B_MUL_C (CH(i,k,1),WA(0,i),t)
+        }
+      }
+  }
+
+NOINLINE static void pass2f (size_t ido, size_t l1, const cmplx * restrict cc,
+  cmplx * restrict ch, const cmplx * restrict wa)
+  {
+  const size_t cdim=2;
+
+  if (ido==1)
+    for (size_t k=0; k<l1; ++k)
+      PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
+  else
+    for (size_t k=0; k<l1; ++k)
+      {
+      PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
+      for (size_t i=1; i<ido; ++i)
+        {
+        cmplx t;
+        PMC (CH(i,k,0),t,CC(i,0,k),CC(i,1,k))
+        A_EQ_CB_MUL_C (CH(i,k,1),WA(0,i),t)
+        }
+      }
+  }
+
+#define PREP3(idx) \
+        cmplx t0 = CC(idx,0,k), t1, t2; \
+        PMC (t1,t2,CC(idx,1,k),CC(idx,2,k)) \
+        CH(idx,k,0).r=t0.r+t1.r; \
+        CH(idx,k,0).i=t0.i+t1.i;
+#define PARTSTEP3a(u1,u2,twr,twi) \
+        { \
+        cmplx ca,cb; \
+        ca.r=t0.r+twr*t1.r; \
+        ca.i=t0.i+twr*t1.i; \
+        cb.i=twi*t2.r; \
+        cb.r=-(twi*t2.i); \
+        PMC(CH(0,k,u1),CH(0,k,u2),ca,cb) \
+        }
+
+#define PARTSTEP3b(u1,u2,twr,twi) \
+        { \
+        cmplx ca,cb,da,db; \
+        ca.r=t0.r+twr*t1.r; \
+        ca.i=t0.i+twr*t1.i; \
+        cb.i=twi*t2.r; \
+        cb.r=-(twi*t2.i); \
+        PMC(da,db,ca,cb) \
+        A_EQ_B_MUL_C (CH(i,k,u1),WA(u1-1,i),da) \
+        A_EQ_B_MUL_C (CH(i,k,u2),WA(u2-1,i),db) \
+        }
+NOINLINE static void pass3b (size_t ido, size_t l1, const cmplx * restrict cc,
+  cmplx * restrict ch, const cmplx * restrict wa)
+  {
+  const size_t cdim=3;
+  const double tw1r=-0.5, tw1i= 0.86602540378443864676;
+
+  if (ido==1)
+    for (size_t k=0; k<l1; ++k)
+      {
+      PREP3(0)
+      PARTSTEP3a(1,2,tw1r,tw1i)
+      }
+  else
+    for (size_t k=0; k<l1; ++k)
+      {
+      {
+      PREP3(0)
+      PARTSTEP3a(1,2,tw1r,tw1i)
+      }
+      for (size_t i=1; i<ido; ++i)
+        {
+        PREP3(i)
+        PARTSTEP3b(1,2,tw1r,tw1i)
+        }
+      }
+  }
+#define PARTSTEP3f(u1,u2,twr,twi) \
+        { \
+        cmplx ca,cb,da,db; \
+        ca.r=t0.r+twr*t1.r; \
+        ca.i=t0.i+twr*t1.i; \
+        cb.i=twi*t2.r; \
+        cb.r=-(twi*t2.i); \
+        PMC(da,db,ca,cb) \
+        A_EQ_CB_MUL_C (CH(i,k,u1),WA(u1-1,i),da) \
+        A_EQ_CB_MUL_C (CH(i,k,u2),WA(u2-1,i),db) \
+        }
+NOINLINE static void pass3f (size_t ido, size_t l1, const cmplx * restrict cc,
+  cmplx * restrict ch, const cmplx * restrict wa)
+  {
+  const size_t cdim=3;
+  const double tw1r=-0.5, tw1i= -0.86602540378443864676;
+
+  if (ido==1)
+    for (size_t k=0; k<l1; ++k)
+      {
+      PREP3(0)
+      PARTSTEP3a(1,2,tw1r,tw1i)
+      }
+  else
+    for (size_t k=0; k<l1; ++k)
+      {
+      {
+      PREP3(0)
+      PARTSTEP3a(1,2,tw1r,tw1i)
+      }
+      for (size_t i=1; i<ido; ++i)
+        {
+        PREP3(i)
+        PARTSTEP3f(1,2,tw1r,tw1i)
+        }
+      }
+  }
+
+NOINLINE static void pass4b (size_t ido, size_t l1, const cmplx * restrict cc,
+  cmplx * restrict ch, const cmplx * restrict wa)
+  {
+  const size_t cdim=4;
+
+  if (ido==1)
+    for (size_t k=0; k<l1; ++k)
+      {
+      cmplx t1, t2, t3, t4;
+      PMC(t2,t1,CC(0,0,k),CC(0,2,k))
+      PMC(t3,t4,CC(0,1,k),CC(0,3,k))
+      ROT90(t4)
+      PMC(CH(0,k,0),CH(0,k,2),t2,t3)
+      PMC(CH(0,k,1),CH(0,k,3),t1,t4)
+      }
+  else
+    for (size_t k=0; k<l1; ++k)
+      {
+      {
+      cmplx t1, t2, t3, t4;
+      PMC(t2,t1,CC(0,0,k),CC(0,2,k))
+      PMC(t3,t4,CC(0,1,k),CC(0,3,k))
+      ROT90(t4)
+      PMC(CH(0,k,0),CH(0,k,2),t2,t3)
+      PMC(CH(0,k,1),CH(0,k,3),t1,t4)
+      }
+      for (size_t i=1; i<ido; ++i)
+        {
+        cmplx c2, c3, c4, t1, t2, t3, t4;
+        cmplx cc0=CC(i,0,k), cc1=CC(i,1,k),cc2=CC(i,2,k),cc3=CC(i,3,k);
+        PMC(t2,t1,cc0,cc2)
+        PMC(t3,t4,cc1,cc3)
+        ROT90(t4)
+        cmplx wa0=WA(0,i), wa1=WA(1,i),wa2=WA(2,i);
+        PMC(CH(i,k,0),c3,t2,t3)
+        PMC(c2,c4,t1,t4)
+        A_EQ_B_MUL_C (CH(i,k,1),wa0,c2)
+        A_EQ_B_MUL_C (CH(i,k,2),wa1,c3)
+        A_EQ_B_MUL_C (CH(i,k,3),wa2,c4)
+        }
+      }
+  }
+NOINLINE static void pass4f (size_t ido, size_t l1, const cmplx * restrict cc,
+  cmplx * restrict ch, const cmplx * restrict wa)
+  {
+  const size_t cdim=4;
+
+  if (ido==1)
+    for (size_t k=0; k<l1; ++k)
+      {
+      cmplx t1, t2, t3, t4;
+      PMC(t2,t1,CC(0,0,k),CC(0,2,k))
+      PMC(t3,t4,CC(0,1,k),CC(0,3,k))
+      ROTM90(t4)
+      PMC(CH(0,k,0),CH(0,k,2),t2,t3)
+      PMC(CH(0,k,1),CH(0,k,3),t1,t4)
+      }
+  else
+    for (size_t k=0; k<l1; ++k)
+      {
+      {
+      cmplx t1, t2, t3, t4;
+      PMC(t2,t1,CC(0,0,k),CC(0,2,k))
+      PMC(t3,t4,CC(0,1,k),CC(0,3,k))
+      ROTM90(t4)
+      PMC(CH(0,k,0),CH(0,k,2),t2,t3)
+      PMC (CH(0,k,1),CH(0,k,3),t1,t4)
+      }
+      for (size_t i=1; i<ido; ++i)
+        {
+        cmplx c2, c3, c4, t1, t2, t3, t4;
+        cmplx cc0=CC(i,0,k), cc1=CC(i,1,k),cc2=CC(i,2,k),cc3=CC(i,3,k);
+        PMC(t2,t1,cc0,cc2)
+        PMC(t3,t4,cc1,cc3)
+        ROTM90(t4)
+        cmplx wa0=WA(0,i), wa1=WA(1,i),wa2=WA(2,i);
+        PMC(CH(i,k,0),c3,t2,t3)
+        PMC(c2,c4,t1,t4)
+        A_EQ_CB_MUL_C (CH(i,k,1),wa0,c2)
+        A_EQ_CB_MUL_C (CH(i,k,2),wa1,c3)
+        A_EQ_CB_MUL_C (CH(i,k,3),wa2,c4)
+        }
+      }
+  }
+
+#define PREP5(idx) \
+        cmplx t0 = CC(idx,0,k), t1, t2, t3, t4; \
+        PMC (t1,t4,CC(idx,1,k),CC(idx,4,k)) \
+        PMC (t2,t3,CC(idx,2,k),CC(idx,3,k)) \
+        CH(idx,k,0).r=t0.r+t1.r+t2.r; \
+        CH(idx,k,0).i=t0.i+t1.i+t2.i;
+
+#define PARTSTEP5a(u1,u2,twar,twbr,twai,twbi) \
+        { \
+        cmplx ca,cb; \
+        ca.r=t0.r+twar*t1.r+twbr*t2.r; \
+        ca.i=t0.i+twar*t1.i+twbr*t2.i; \
+        cb.i=twai*t4.r twbi*t3.r; \
+        cb.r=-(twai*t4.i twbi*t3.i); \
+        PMC(CH(0,k,u1),CH(0,k,u2),ca,cb) \
+        }
+
+#define PARTSTEP5b(u1,u2,twar,twbr,twai,twbi) \
+        { \
+        cmplx ca,cb,da,db; \
+        ca.r=t0.r+twar*t1.r+twbr*t2.r; \
+        ca.i=t0.i+twar*t1.i+twbr*t2.i; \
+        cb.i=twai*t4.r twbi*t3.r; \
+        cb.r=-(twai*t4.i twbi*t3.i); \
+        PMC(da,db,ca,cb) \
+        A_EQ_B_MUL_C (CH(i,k,u1),WA(u1-1,i),da) \
+        A_EQ_B_MUL_C (CH(i,k,u2),WA(u2-1,i),db) \
+        }
+NOINLINE static void pass5b (size_t ido, size_t l1, const cmplx * restrict cc,
+  cmplx * restrict ch, const cmplx * restrict wa)
+  {
+  const size_t cdim=5;
+  const double tw1r= 0.3090169943749474241,
+               tw1i= 0.95105651629515357212,
+               tw2r= -0.8090169943749474241,
+               tw2i= 0.58778525229247312917;
+
+  if (ido==1)
+    for (size_t k=0; k<l1; ++k)
+      {
+      PREP5(0)
+      PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
+      PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
+      }
+  else
+    for (size_t k=0; k<l1; ++k)
+      {
+      {
+      PREP5(0)
+      PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
+      PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
+      }
+      for (size_t i=1; i<ido; ++i)
+        {
+        PREP5(i)
+        PARTSTEP5b(1,4,tw1r,tw2r,+tw1i,+tw2i)
+        PARTSTEP5b(2,3,tw2r,tw1r,+tw2i,-tw1i)
+        }
+      }
+  }
+#define PARTSTEP5f(u1,u2,twar,twbr,twai,twbi) \
+        { \
+        cmplx ca,cb,da,db; \
+        ca.r=t0.r+twar*t1.r+twbr*t2.r; \
+        ca.i=t0.i+twar*t1.i+twbr*t2.i; \
+        cb.i=twai*t4.r twbi*t3.r; \
+        cb.r=-(twai*t4.i twbi*t3.i); \
+        PMC(da,db,ca,cb) \
+        A_EQ_CB_MUL_C (CH(i,k,u1),WA(u1-1,i),da) \
+        A_EQ_CB_MUL_C (CH(i,k,u2),WA(u2-1,i),db) \
+        }
+NOINLINE static void pass5f (size_t ido, size_t l1, const cmplx * restrict cc,
+  cmplx * restrict ch, const cmplx * restrict wa)
+  {
+  const size_t cdim=5;
+  const double tw1r= 0.3090169943749474241,
+               tw1i= -0.95105651629515357212,
+               tw2r= -0.8090169943749474241,
+               tw2i= -0.58778525229247312917;
+
+  if (ido==1)
+    for (size_t k=0; k<l1; ++k)
+      {
+      PREP5(0)
+      PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
+      PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
+      }
+  else
+    for (size_t k=0; k<l1; ++k)
+      {
+      {
+      PREP5(0)
+      PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
+      PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
+      }
+      for (size_t i=1; i<ido; ++i)
+        {
+        PREP5(i)
+        PARTSTEP5f(1,4,tw1r,tw2r,+tw1i,+tw2i)
+        PARTSTEP5f(2,3,tw2r,tw1r,+tw2i,-tw1i)
+        }
+      }
+  }
+
+#define PREP7(idx) \
+        cmplx t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7; \
+        PMC (t2,t7,CC(idx,1,k),CC(idx,6,k)) \
+        PMC (t3,t6,CC(idx,2,k),CC(idx,5,k)) \
+        PMC (t4,t5,CC(idx,3,k),CC(idx,4,k)) \
+        CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r; \
+        CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i;
+
+#define PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,out1,out2) \
+        { \
+        cmplx ca,cb; \
+        ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r; \
+        ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i; \
+        cb.i=y1*t7.r y2*t6.r y3*t5.r; \
+        cb.r=-(y1*t7.i y2*t6.i y3*t5.i); \
+        PMC(out1,out2,ca,cb) \
+        }
+#define PARTSTEP7a(u1,u2,x1,x2,x3,y1,y2,y3) \
+        PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,CH(0,k,u1),CH(0,k,u2))
+#define PARTSTEP7(u1,u2,x1,x2,x3,y1,y2,y3) \
+        { \
+        cmplx da,db; \
+        PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,da,db) \
+        MULPMSIGNC (CH(i,k,u1),WA(u1-1,i),da) \
+        MULPMSIGNC (CH(i,k,u2),WA(u2-1,i),db) \
+        }
+
+NOINLINE static void pass7(size_t ido, size_t l1, const cmplx * restrict cc,
+  cmplx * restrict ch, const cmplx * restrict wa, const int sign)
+  {
+  const size_t cdim=7;
+  const double tw1r= 0.623489801858733530525,
+               tw1i= sign * 0.7818314824680298087084,
+               tw2r= -0.222520933956314404289,
+               tw2i= sign * 0.9749279121818236070181,
+               tw3r= -0.9009688679024191262361,
+               tw3i= sign * 0.4338837391175581204758;
+
+  if (ido==1)
+    for (size_t k=0; k<l1; ++k)
+      {
+      PREP7(0)
+      PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
+      PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
+      PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
+      }
+  else
+    for (size_t k=0; k<l1; ++k)
+      {
+      {
+      PREP7(0)
+      PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
+      PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
+      PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
+      }
+      for (size_t i=1; i<ido; ++i)
+        {
+        PREP7(i)
+        PARTSTEP7(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
+        PARTSTEP7(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
+        PARTSTEP7(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
+        }
+      }
+  }
+
+#define PREP11(idx) \
+        cmplx t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7, t8, t9, t10, t11; \
+        PMC (t2,t11,CC(idx,1,k),CC(idx,10,k)) \
+        PMC (t3,t10,CC(idx,2,k),CC(idx, 9,k)) \
+        PMC (t4,t9 ,CC(idx,3,k),CC(idx, 8,k)) \
+        PMC (t5,t8 ,CC(idx,4,k),CC(idx, 7,k)) \
+        PMC (t6,t7 ,CC(idx,5,k),CC(idx, 6,k)) \
+        CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r+t5.r+t6.r; \
+        CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i+t5.i+t6.i;
+
+#define PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,out1,out2) \
+        { \
+        cmplx ca,cb; \
+        ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r+x4*t5.r+x5*t6.r; \
+        ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i+x4*t5.i+x5*t6.i; \
+        cb.i=y1*t11.r y2*t10.r y3*t9.r y4*t8.r y5*t7.r; \
+        cb.r=-(y1*t11.i y2*t10.i y3*t9.i y4*t8.i y5*t7.i ); \
+        PMC(out1,out2,ca,cb) \
+        }
+#define PARTSTEP11a(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \
+        PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,CH(0,k,u1),CH(0,k,u2))
+#define PARTSTEP11(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \
+        { \
+        cmplx da,db; \
+        PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,da,db) \
+        MULPMSIGNC (CH(i,k,u1),WA(u1-1,i),da) \
+        MULPMSIGNC (CH(i,k,u2),WA(u2-1,i),db) \
+        }
+
+NOINLINE static void pass11 (size_t ido, size_t l1, const cmplx * restrict cc,
+  cmplx * restrict ch, const cmplx * restrict wa, const int sign)
+  {
+  const size_t cdim=11;
+  const double tw1r =        0.8412535328311811688618,
+               tw1i = sign * 0.5406408174555975821076,
+               tw2r =        0.4154150130018864255293,
+               tw2i = sign * 0.9096319953545183714117,
+               tw3r =       -0.1423148382732851404438,
+               tw3i = sign * 0.9898214418809327323761,
+               tw4r =       -0.6548607339452850640569,
+               tw4i = sign * 0.755749574354258283774,
+               tw5r =       -0.9594929736144973898904,
+               tw5i = sign * 0.2817325568414296977114;
+
+  if (ido==1)
+    for (size_t k=0; k<l1; ++k)
+      {
+      PREP11(0)
+      PARTSTEP11a(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
+      PARTSTEP11a(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
+      PARTSTEP11a(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
+      PARTSTEP11a(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
+      PARTSTEP11a(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
+      }
+  else
+    for (size_t k=0; k<l1; ++k)
+      {
+      {
+      PREP11(0)
+      PARTSTEP11a(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
+      PARTSTEP11a(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
+      PARTSTEP11a(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
+      PARTSTEP11a(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
+      PARTSTEP11a(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
+      }
+      for (size_t i=1; i<ido; ++i)
+        {
+        PREP11(i)
+        PARTSTEP11(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
+        PARTSTEP11(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
+        PARTSTEP11(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
+        PARTSTEP11(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
+        PARTSTEP11(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
+        }
+      }
+  }
+
+#define CX(a,b,c) cc[(a)+ido*((b)+l1*(c))]
+#define CX2(a,b) cc[(a)+idl1*(b)]
+#define CH2(a,b) ch[(a)+idl1*(b)]
+
+NOINLINE static int passg (size_t ido, size_t ip, size_t l1,
+  cmplx * restrict cc, cmplx * restrict ch, const cmplx * restrict wa,
+  const cmplx * restrict csarr, const int sign)
+  {
+  const size_t cdim=ip;
+  size_t ipph = (ip+1)/2;
+  size_t idl1 = ido*l1;
+
+  cmplx * restrict wal=RALLOC(cmplx,ip);
+  if (!wal) return -1;
+  wal[0]=(cmplx){1.,0.};
+  for (size_t i=1; i<ip; ++i)
+    wal[i]=(cmplx){csarr[i].r,sign*csarr[i].i};
+
+  for (size_t k=0; k<l1; ++k)
+    for (size_t i=0; i<ido; ++i)
+      CH(i,k,0) = CC(i,0,k);
+  for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)
+    for (size_t k=0; k<l1; ++k)
+      for (size_t i=0; i<ido; ++i)
+        PMC(CH(i,k,j),CH(i,k,jc),CC(i,j,k),CC(i,jc,k))
+  for (size_t k=0; k<l1; ++k)
+    for (size_t i=0; i<ido; ++i)
+      {
+      cmplx tmp = CH(i,k,0);
+      for (size_t j=1; j<ipph; ++j)
+        ADDC(tmp,tmp,CH(i,k,j))
+      CX(i,k,0) = tmp;
+      }
+  for (size_t l=1, lc=ip-1; l<ipph; ++l, --lc)
+    {
+    // j=0
+    for (size_t ik=0; ik<idl1; ++ik)
+      {
+      CX2(ik,l).r = CH2(ik,0).r+wal[l].r*CH2(ik,1).r+wal[2*l].r*CH2(ik,2).r;
+      CX2(ik,l).i = CH2(ik,0).i+wal[l].r*CH2(ik,1).i+wal[2*l].r*CH2(ik,2).i;
+      CX2(ik,lc).r=-wal[l].i*CH2(ik,ip-1).i-wal[2*l].i*CH2(ik,ip-2).i;
+      CX2(ik,lc).i=wal[l].i*CH2(ik,ip-1).r+wal[2*l].i*CH2(ik,ip-2).r;
+      }
+
+    size_t iwal=2*l;
+    size_t j=3, jc=ip-3;
+    for (; j<ipph-1; j+=2, jc-=2)
+      {
+      iwal+=l; if (iwal>ip) iwal-=ip;
+      cmplx xwal=wal[iwal];
+      iwal+=l; if (iwal>ip) iwal-=ip;
+      cmplx xwal2=wal[iwal];
+      for (size_t ik=0; ik<idl1; ++ik)
+        {
+        CX2(ik,l).r += CH2(ik,j).r*xwal.r+CH2(ik,j+1).r*xwal2.r;
+        CX2(ik,l).i += CH2(ik,j).i*xwal.r+CH2(ik,j+1).i*xwal2.r;
+        CX2(ik,lc).r -= CH2(ik,jc).i*xwal.i+CH2(ik,jc-1).i*xwal2.i;
+        CX2(ik,lc).i += CH2(ik,jc).r*xwal.i+CH2(ik,jc-1).r*xwal2.i;
+        }
+      }
+    for (; j<ipph; ++j, --jc)
+      {
+      iwal+=l; if (iwal>ip) iwal-=ip;
+      cmplx xwal=wal[iwal];
+      for (size_t ik=0; ik<idl1; ++ik)
+        {
+        CX2(ik,l).r += CH2(ik,j).r*xwal.r;
+        CX2(ik,l).i += CH2(ik,j).i*xwal.r;
+        CX2(ik,lc).r -= CH2(ik,jc).i*xwal.i;
+        CX2(ik,lc).i += CH2(ik,jc).r*xwal.i;
+        }
+      }
+    }
+  DEALLOC(wal);
+
+  // shuffling and twiddling
+  if (ido==1)
+    for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)
+      for (size_t ik=0; ik<idl1; ++ik)
+        {
+        cmplx t1=CX2(ik,j), t2=CX2(ik,jc);
+        PMC(CX2(ik,j),CX2(ik,jc),t1,t2)
+        }
+  else
+    {
+    for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)
+      for (size_t k=0; k<l1; ++k)
+        {
+        cmplx t1=CX(0,k,j), t2=CX(0,k,jc);
+        PMC(CX(0,k,j),CX(0,k,jc),t1,t2)
+        for (size_t i=1; i<ido; ++i)
+          {
+          cmplx x1, x2;
+          PMC(x1,x2,CX(i,k,j),CX(i,k,jc))
+          size_t idij=(j-1)*(ido-1)+i-1;
+          MULPMSIGNC (CX(i,k,j),wa[idij],x1)
+          idij=(jc-1)*(ido-1)+i-1;
+          MULPMSIGNC (CX(i,k,jc),wa[idij],x2)
+          }
+        }
+    }
+  return 0;
+  }
+
+#undef CH2
+#undef CX2
+#undef CX
+
+NOINLINE WARN_UNUSED_RESULT static int pass_all(cfftp_plan plan, cmplx c[], double fct,
+  const int sign)
+  {
+  if (plan->length==1) return 0;
+  size_t len=plan->length;
+  size_t l1=1, nf=plan->nfct;
+  cmplx *ch = RALLOC(cmplx, len);
+  if (!ch) return -1;
+  cmplx *p1=c, *p2=ch;
+
+  for(size_t k1=0; k1<nf; k1++)
+    {
+    size_t ip=plan->fct[k1].fct;
+    size_t l2=ip*l1;
+    size_t ido = len/l2;
+    if     (ip==4)
+      sign>0 ? pass4b (ido, l1, p1, p2, plan->fct[k1].tw)
+             : pass4f (ido, l1, p1, p2, plan->fct[k1].tw);
+    else if(ip==2)
+      sign>0 ? pass2b (ido, l1, p1, p2, plan->fct[k1].tw)
+             : pass2f (ido, l1, p1, p2, plan->fct[k1].tw);
+    else if(ip==3)
+      sign>0 ? pass3b (ido, l1, p1, p2, plan->fct[k1].tw)
+             : pass3f (ido, l1, p1, p2, plan->fct[k1].tw);
+    else if(ip==5)
+      sign>0 ? pass5b (ido, l1, p1, p2, plan->fct[k1].tw)
+             : pass5f (ido, l1, p1, p2, plan->fct[k1].tw);
+    else if(ip==7)  pass7 (ido, l1, p1, p2, plan->fct[k1].tw, sign);
+    else if(ip==11) pass11(ido, l1, p1, p2, plan->fct[k1].tw, sign);
+    else
+      {
+      if (passg(ido, ip, l1, p1, p2, plan->fct[k1].tw, plan->fct[k1].tws, sign))
+        { DEALLOC(ch); return -1; }
+      SWAP(p1,p2,cmplx *);
+      }
+    SWAP(p1,p2,cmplx *);
+    l1=l2;
+    }
+  if (p1!=c)
+    {
+    if (fct!=1.)
+      for (size_t i=0; i<len; ++i)
+        {
+        c[i].r = ch[i].r*fct;
+        c[i].i = ch[i].i*fct;
+        }
+    else
+      memcpy (c,p1,len*sizeof(cmplx));
+    }
+  else
+    if (fct!=1.)
+      for (size_t i=0; i<len; ++i)
+        {
+        c[i].r *= fct;
+        c[i].i *= fct;
+        }
+  DEALLOC(ch);
+  return 0;
+  }
+
+#undef PMSIGNC
+#undef A_EQ_B_MUL_C
+#undef A_EQ_CB_MUL_C
+#undef MULPMSIGNC
+#undef MULPMSIGNCEQ
+
+#undef WA
+#undef CC
+#undef CH
+#undef ROT90
+#undef SCALEC
+#undef ADDC
+#undef PMC
+
+NOINLINE WARN_UNUSED_RESULT
+static int cfftp_forward(cfftp_plan plan, double c[], double fct)
+  { return pass_all(plan,(cmplx *)c, fct, -1); }
+
+NOINLINE WARN_UNUSED_RESULT
+static int cfftp_backward(cfftp_plan plan, double c[], double fct)
+  { return pass_all(plan,(cmplx *)c, fct, 1); }
+
+NOINLINE WARN_UNUSED_RESULT
+static int cfftp_factorize (cfftp_plan plan)
+  {
+  size_t length=plan->length;
+  size_t nfct=0;
+  while ((length%4)==0)
+    { if (nfct>=NFCT) return -1; plan->fct[nfct++].fct=4; length>>=2; }
+  if ((length%2)==0)
+    {
+    length>>=1;
+    // factor 2 should be at the front of the factor list
+    if (nfct>=NFCT) return -1;
+    plan->fct[nfct++].fct=2;
+    SWAP(plan->fct[0].fct, plan->fct[nfct-1].fct,size_t);
+    }
+  size_t maxl=(size_t)(sqrt((double)length))+1;
+  for (size_t divisor=3; (length>1)&&(divisor<maxl); divisor+=2)
+    if ((length%divisor)==0)
+      {
+      while ((length%divisor)==0)
+        {
+        if (nfct>=NFCT) return -1;
+        plan->fct[nfct++].fct=divisor;
+        length/=divisor;
+        }
+      maxl=(size_t)(sqrt((double)length))+1;
+      }
+  if (length>1) plan->fct[nfct++].fct=length;
+  plan->nfct=nfct;
+  return 0;
+  }
+
+NOINLINE static size_t cfftp_twsize (cfftp_plan plan)
+  {
+  size_t twsize=0, l1=1;
+  for (size_t k=0; k<plan->nfct; ++k)
+    {
+    size_t ip=plan->fct[k].fct, ido= plan->length/(l1*ip);
+    twsize+=(ip-1)*(ido-1);
+    if (ip>11)
+      twsize+=ip;
+    l1*=ip;
+    }
+  return twsize;
+  }
+
+NOINLINE WARN_UNUSED_RESULT static int cfftp_comp_twiddle (cfftp_plan plan)
+  {
+  size_t length=plan->length;
+  double *twid = RALLOC(double, 2*length);
+  if (!twid) return -1;
+  sincos_2pibyn(length, twid);
+  size_t l1=1;
+  size_t memofs=0;
+  for (size_t k=0; k<plan->nfct; ++k)
+    {
+    size_t ip=plan->fct[k].fct, ido= length/(l1*ip);
+    plan->fct[k].tw=plan->mem+memofs;
+    memofs+=(ip-1)*(ido-1);
+    for (size_t j=1; j<ip; ++j)
+      for (size_t i=1; i<ido; ++i)
+        {
+        plan->fct[k].tw[(j-1)*(ido-1)+i-1].r = twid[2*j*l1*i];
+        plan->fct[k].tw[(j-1)*(ido-1)+i-1].i = twid[2*j*l1*i+1];
+        }
+    if (ip>11)
+      {
+      plan->fct[k].tws=plan->mem+memofs;
+      memofs+=ip;
+      for (size_t j=0; j<ip; ++j)
+        {
+        plan->fct[k].tws[j].r = twid[2*j*l1*ido];
+        plan->fct[k].tws[j].i = twid[2*j*l1*ido+1];
+        }
+      }
+    l1*=ip;
+    }
+  DEALLOC(twid);
+  return 0;
+  }
+
+static cfftp_plan make_cfftp_plan (size_t length)
+  {
+  if (length==0) return NULL;
+  cfftp_plan plan = RALLOC(cfftp_plan_i,1);
+  if (!plan) return NULL;
+  plan->length=length;
+  plan->nfct=0;
+  for (size_t i=0; i<NFCT; ++i)
+    plan->fct[i]=(cfftp_fctdata){0,0,0};
+  plan->mem=0;
+  if (length==1) return plan;
+  if (cfftp_factorize(plan)!=0) { DEALLOC(plan); return NULL; }
+  size_t tws=cfftp_twsize(plan);
+  plan->mem=RALLOC(cmplx,tws);
+  if (!plan->mem) { DEALLOC(plan); return NULL; }
+  if (cfftp_comp_twiddle(plan)!=0)
+    { DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
+  return plan;
+  }
+
+static void destroy_cfftp_plan (cfftp_plan plan)
+  {
+  DEALLOC(plan->mem);
+  DEALLOC(plan);
+  }
+
+typedef struct rfftp_fctdata
+  {
+  size_t fct;
+  double *tw, *tws;
+  } rfftp_fctdata;
+
+typedef struct rfftp_plan_i
+  {
+  size_t length, nfct;
+  double *mem;
+  rfftp_fctdata fct[NFCT];
+  } rfftp_plan_i;
+typedef struct rfftp_plan_i * rfftp_plan;
+
+#define WA(x,i) wa[(i)+(x)*(ido-1)]
+#define PM(a,b,c,d) { a=c+d; b=c-d; }
+/* (a+ib) = conj(c+id) * (e+if) */
+#define MULPM(a,b,c,d,e,f) { a=c*e+d*f; b=c*f-d*e; }
+
+#define CC(a,b,c) cc[(a)+ido*((b)+l1*(c))]
+#define CH(a,b,c) ch[(a)+ido*((b)+cdim*(c))]
+
+NOINLINE static void radf2 (size_t ido, size_t l1, const double * restrict cc,
+  double * restrict ch, const double * restrict wa)
+  {
+  const size_t cdim=2;
+
+  for (size_t k=0; k<l1; k++)
+    PM (CH(0,0,k),CH(ido-1,1,k),CC(0,k,0),CC(0,k,1))
+  if ((ido&1)==0)
+    for (size_t k=0; k<l1; k++)
+      {
+      CH(    0,1,k) = -CC(ido-1,k,1);
+      CH(ido-1,0,k) =  CC(ido-1,k,0);
+      }
+  if (ido<=2) return;
+  for (size_t k=0; k<l1; k++)
+    for (size_t i=2; i<ido; i+=2)
+      {
+      size_t ic=ido-i;
+      double tr2, ti2;
+      MULPM (tr2,ti2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1))
+      PM (CH(i-1,0,k),CH(ic-1,1,k),CC(i-1,k,0),tr2)
+      PM (CH(i  ,0,k),CH(ic  ,1,k),ti2,CC(i  ,k,0))
+      }
+  }
+
+NOINLINE static void radf3(size_t ido, size_t l1, const double * restrict cc,
+  double * restrict ch, const double * restrict wa)
+  {
+  const size_t cdim=3;
+  static const double taur=-0.5, taui=0.86602540378443864676;
+
+  for (size_t k=0; k<l1; k++)
+    {
+    double cr2=CC(0,k,1)+CC(0,k,2);
+    CH(0,0,k) = CC(0,k,0)+cr2;
+    CH(0,2,k) = taui*(CC(0,k,2)-CC(0,k,1));
+    CH(ido-1,1,k) = CC(0,k,0)+taur*cr2;
+    }
+  if (ido==1) return;
+  for (size_t k=0; k<l1; k++)
+    for (size_t i=2; i<ido; i+=2)
+      {
+      size_t ic=ido-i;
+      double di2, di3, dr2, dr3;
+      MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1)) // d2=conj(WA0)*CC1
+      MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2)) // d3=conj(WA1)*CC2
+      double cr2=dr2+dr3; // c add
+      double ci2=di2+di3;
+      CH(i-1,0,k) = CC(i-1,k,0)+cr2; // c add
+      CH(i  ,0,k) = CC(i  ,k,0)+ci2;
+      double tr2 = CC(i-1,k,0)+taur*cr2; // c add
+      double ti2 = CC(i  ,k,0)+taur*ci2;
+      double tr3 = taui*(di2-di3);  // t3 = taui*i*(d3-d2)?
+      double ti3 = taui*(dr3-dr2);
+      PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr3) // PM(i) = t2+t3
+      PM(CH(i  ,2,k),CH(ic  ,1,k),ti3,ti2) // PM(ic) = conj(t2-t3)
+      }
+  }
+
+NOINLINE static void radf4(size_t ido, size_t l1, const double * restrict cc,
+  double * restrict ch, const double * restrict wa)
+  {
+  const size_t cdim=4;
+  static const double hsqt2=0.70710678118654752440;
+
+  for (size_t k=0; k<l1; k++)
+    {
+    double tr1,tr2;
+    PM (tr1,CH(0,2,k),CC(0,k,3),CC(0,k,1))
+    PM (tr2,CH(ido-1,1,k),CC(0,k,0),CC(0,k,2))
+    PM (CH(0,0,k),CH(ido-1,3,k),tr2,tr1)
+    }
+  if ((ido&1)==0)
+    for (size_t k=0; k<l1; k++)
+      {
+      double ti1=-hsqt2*(CC(ido-1,k,1)+CC(ido-1,k,3));
+      double tr1= hsqt2*(CC(ido-1,k,1)-CC(ido-1,k,3));
+      PM (CH(ido-1,0,k),CH(ido-1,2,k),CC(ido-1,k,0),tr1)
+      PM (CH(    0,3,k),CH(    0,1,k),ti1,CC(ido-1,k,2))
+      }
+  if (ido<=2) return;
+  for (size_t k=0; k<l1; k++)
+    for (size_t i=2; i<ido; i+=2)
+      {
+      size_t ic=ido-i;
+      double ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
+      MULPM(cr2,ci2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1))
+      MULPM(cr3,ci3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2))
+      MULPM(cr4,ci4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3))
+      PM(tr1,tr4,cr4,cr2)
+      PM(ti1,ti4,ci2,ci4)
+      PM(tr2,tr3,CC(i-1,k,0),cr3)
+      PM(ti2,ti3,CC(i  ,k,0),ci3)
+      PM(CH(i-1,0,k),CH(ic-1,3,k),tr2,tr1)
+      PM(CH(i  ,0,k),CH(ic  ,3,k),ti1,ti2)
+      PM(CH(i-1,2,k),CH(ic-1,1,k),tr3,ti4)
+      PM(CH(i  ,2,k),CH(ic  ,1,k),tr4,ti3)
+      }
+  }
+
+NOINLINE static void radf5(size_t ido, size_t l1, const double * restrict cc,
+  double * restrict ch, const double * restrict wa)
+  {
+  const size_t cdim=5;
+  static const double tr11= 0.3090169943749474241, ti11=0.95105651629515357212,
+                      tr12=-0.8090169943749474241, ti12=0.58778525229247312917;
+
+  for (size_t k=0; k<l1; k++)
+    {
+    double cr2, cr3, ci4, ci5;
+    PM (cr2,ci5,CC(0,k,4),CC(0,k,1))
+    PM (cr3,ci4,CC(0,k,3),CC(0,k,2))
+    CH(0,0,k)=CC(0,k,0)+cr2+cr3;
+    CH(ido-1,1,k)=CC(0,k,0)+tr11*cr2+tr12*cr3;
+    CH(0,2,k)=ti11*ci5+ti12*ci4;
+    CH(ido-1,3,k)=CC(0,k,0)+tr12*cr2+tr11*cr3;
+    CH(0,4,k)=ti12*ci5-ti11*ci4;
+    }
+  if (ido==1) return;
+  for (size_t k=0; k<l1;++k)
+    for (size_t i=2; i<ido; i+=2)
+      {
+      double ci2, di2, ci4, ci5, di3, di4, di5, ci3, cr2, cr3, dr2, dr3,
+         dr4, dr5, cr5, cr4, ti2, ti3, ti5, ti4, tr2, tr3, tr4, tr5;
+      size_t ic=ido-i;
+      MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1))
+      MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2))
+      MULPM (dr4,di4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3))
+      MULPM (dr5,di5,WA(3,i-2),WA(3,i-1),CC(i-1,k,4),CC(i,k,4))
+      PM(cr2,ci5,dr5,dr2)
+      PM(ci2,cr5,di2,di5)
+      PM(cr3,ci4,dr4,dr3)
+      PM(ci3,cr4,di3,di4)
+      CH(i-1,0,k)=CC(i-1,k,0)+cr2+cr3;
+      CH(i  ,0,k)=CC(i  ,k,0)+ci2+ci3;
+      tr2=CC(i-1,k,0)+tr11*cr2+tr12*cr3;
+      ti2=CC(i  ,k,0)+tr11*ci2+tr12*ci3;
+      tr3=CC(i-1,k,0)+tr12*cr2+tr11*cr3;
+      ti3=CC(i  ,k,0)+tr12*ci2+tr11*ci3;
+      MULPM(tr5,tr4,cr5,cr4,ti11,ti12)
+      MULPM(ti5,ti4,ci5,ci4,ti11,ti12)
+      PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr5)
+      PM(CH(i  ,2,k),CH(ic  ,1,k),ti5,ti2)
+      PM(CH(i-1,4,k),CH(ic-1,3,k),tr3,tr4)
+      PM(CH(i  ,4,k),CH(ic  ,3,k),ti4,ti3)
+      }
+  }
+
+#undef CC
+#undef CH
+#define C1(a,b,c) cc[(a)+ido*((b)+l1*(c))]
+#define C2(a,b) cc[(a)+idl1*(b)]
+#define CH2(a,b) ch[(a)+idl1*(b)]
+#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
+#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
+NOINLINE static void radfg(size_t ido, size_t ip, size_t l1,
+  double * restrict cc, double * restrict ch, const double * restrict wa,
+  const double * restrict csarr)
+  {
+  const size_t cdim=ip;
+  size_t ipph=(ip+1)/2;
+  size_t idl1 = ido*l1;
+
+  if (ido>1)
+    {
+    for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)              // 114
+      {
+      size_t is=(j-1)*(ido-1),
+             is2=(jc-1)*(ido-1);
+      for (size_t k=0; k<l1; ++k)                            // 113
+        {
+        size_t idij=is;
+        size_t idij2=is2;
+        for (size_t i=1; i<=ido-2; i+=2)                      // 112
+          {
+          double t1=C1(i,k,j ), t2=C1(i+1,k,j ),
+                 t3=C1(i,k,jc), t4=C1(i+1,k,jc);
+          double x1=wa[idij]*t1 + wa[idij+1]*t2,
+                 x2=wa[idij]*t2 - wa[idij+1]*t1,
+                 x3=wa[idij2]*t3 + wa[idij2+1]*t4,
+                 x4=wa[idij2]*t4 - wa[idij2+1]*t3;
+          C1(i  ,k,j ) = x1+x3;
+          C1(i  ,k,jc) = x2-x4;
+          C1(i+1,k,j ) = x2+x4;
+          C1(i+1,k,jc) = x3-x1;
+          idij+=2;
+          idij2+=2;
+          }
+        }
+      }
+    }
+
+  for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)                // 123
+    for (size_t k=0; k<l1; ++k)                              // 122
+      {
+      double t1=C1(0,k,j), t2=C1(0,k,jc);
+      C1(0,k,j ) = t1+t2;
+      C1(0,k,jc) = t2-t1;
+      }
+
+//everything in C
+//memset(ch,0,ip*l1*ido*sizeof(double));
+
+  for (size_t l=1,lc=ip-1; l<ipph; ++l,--lc)                 // 127
+    {
+    for (size_t ik=0; ik<idl1; ++ik)                         // 124
+      {
+      CH2(ik,l ) = C2(ik,0)+csarr[2*l]*C2(ik,1)+csarr[4*l]*C2(ik,2);
+      CH2(ik,lc) = csarr[2*l+1]*C2(ik,ip-1)+csarr[4*l+1]*C2(ik,ip-2);
+      }
+    size_t iang = 2*l;
+    size_t j=3, jc=ip-3;
+    for (; j<ipph-3; j+=4,jc-=4)              // 126
+      {
+      iang+=l; if (iang>=ip) iang-=ip;
+      double ar1=csarr[2*iang], ai1=csarr[2*iang+1];
+      iang+=l; if (iang>=ip) iang-=ip;
+      double ar2=csarr[2*iang], ai2=csarr[2*iang+1];
+      iang+=l; if (iang>=ip) iang-=ip;
+      double ar3=csarr[2*iang], ai3=csarr[2*iang+1];
+      iang+=l; if (iang>=ip) iang-=ip;
+      double ar4=csarr[2*iang], ai4=csarr[2*iang+1];
+      for (size_t ik=0; ik<idl1; ++ik)                       // 125
+        {
+        CH2(ik,l ) += ar1*C2(ik,j )+ar2*C2(ik,j +1)
+                     +ar3*C2(ik,j +2)+ar4*C2(ik,j +3);
+        CH2(ik,lc) += ai1*C2(ik,jc)+ai2*C2(ik,jc-1)
+                     +ai3*C2(ik,jc-2)+ai4*C2(ik,jc-3);
+        }
+      }
+    for (; j<ipph-1; j+=2,jc-=2)              // 126
+      {
+      iang+=l; if (iang>=ip) iang-=ip;
+      double ar1=csarr[2*iang], ai1=csarr[2*iang+1];
+      iang+=l; if (iang>=ip) iang-=ip;
+      double ar2=csarr[2*iang], ai2=csarr[2*iang+1];
+      for (size_t ik=0; ik<idl1; ++ik)                       // 125
+        {
+        CH2(ik,l ) += ar1*C2(ik,j )+ar2*C2(ik,j +1);
+        CH2(ik,lc) += ai1*C2(ik,jc)+ai2*C2(ik,jc-1);
+        }
+      }
+    for (; j<ipph; ++j,--jc)              // 126
+      {
+      iang+=l; if (iang>=ip) iang-=ip;
+      double ar=csarr[2*iang], ai=csarr[2*iang+1];
+      for (size_t ik=0; ik<idl1; ++ik)                       // 125
+        {
+        CH2(ik,l ) += ar*C2(ik,j );
+        CH2(ik,lc) += ai*C2(ik,jc);
+        }
+      }
+    }
+  for (size_t ik=0; ik<idl1; ++ik)                         // 101
+    CH2(ik,0) = C2(ik,0);
+  for (size_t j=1; j<ipph; ++j)                              // 129
+    for (size_t ik=0; ik<idl1; ++ik)                         // 128
+      CH2(ik,0) += C2(ik,j);
+
+// everything in CH at this point!
+//memset(cc,0,ip*l1*ido*sizeof(double));
+
+  for (size_t k=0; k<l1; ++k)                                // 131
+    for (size_t i=0; i<ido; ++i)                             // 130
+      CC(i,0,k) = CH(i,k,0);
+
+  for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)                // 137
+    {
+    size_t j2=2*j-1;
+    for (size_t k=0; k<l1; ++k)                              // 136
+      {
+      CC(ido-1,j2,k) = CH(0,k,j);
+      CC(0,j2+1,k) = CH(0,k,jc);
+      }
+    }
+
+  if (ido==1) return;
+
+  for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)                // 140
+    {
+    size_t j2=2*j-1;
+    for(size_t k=0; k<l1; ++k)                               // 139
+      for(size_t i=1, ic=ido-i-2; i<=ido-2; i+=2, ic-=2)      // 138
+        {
+        CC(i   ,j2+1,k) = CH(i  ,k,j )+CH(i  ,k,jc);
+        CC(ic  ,j2  ,k) = CH(i  ,k,j )-CH(i  ,k,jc);
+        CC(i+1 ,j2+1,k) = CH(i+1,k,j )+CH(i+1,k,jc);
+        CC(ic+1,j2  ,k) = CH(i+1,k,jc)-CH(i+1,k,j );
+        }
+    }
+  }
+#undef C1
+#undef C2
+#undef CH2
+
+#undef CH
+#undef CC
+#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
+#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
+
+NOINLINE static void radb2(size_t ido, size_t l1, const double * restrict cc,
+  double * restrict ch, const double * restrict wa)
+  {
+  const size_t cdim=2;
+
+  for (size_t k=0; k<l1; k++)
+    PM (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(ido-1,1,k))
+  if ((ido&1)==0)
+    for (size_t k=0; k<l1; k++)
+      {
+      CH(ido-1,k,0) = 2.*CC(ido-1,0,k);
+      CH(ido-1,k,1) =-2.*CC(0    ,1,k);
+      }
+  if (ido<=2) return;
+  for (size_t k=0; k<l1;++k)
+    for (size_t i=2; i<ido; i+=2)
+      {
+      size_t ic=ido-i;
+      double ti2, tr2;
+      PM (CH(i-1,k,0),tr2,CC(i-1,0,k),CC(ic-1,1,k))
+      PM (ti2,CH(i  ,k,0),CC(i  ,0,k),CC(ic  ,1,k))
+      MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ti2,tr2)
+      }
+  }
+
+NOINLINE static void radb3(size_t ido, size_t l1, const double * restrict cc,
+  double * restrict ch, const double * restrict wa)
+  {
+  const size_t cdim=3;
+  static const double taur=-0.5, taui=0.86602540378443864676;
+
+  for (size_t k=0; k<l1; k++)
+    {
+    double tr2=2.*CC(ido-1,1,k);
+    double cr2=CC(0,0,k)+taur*tr2;
+    CH(0,k,0)=CC(0,0,k)+tr2;
+    double ci3=2.*taui*CC(0,2,k);
+    PM (CH(0,k,2),CH(0,k,1),cr2,ci3);
+    }
+  if (ido==1) return;
+  for (size_t k=0; k<l1; k++)
+    for (size_t i=2; i<ido; i+=2)
+      {
+      size_t ic=ido-i;
+      double tr2=CC(i-1,2,k)+CC(ic-1,1,k); // t2=CC(I) + conj(CC(ic))
+      double ti2=CC(i  ,2,k)-CC(ic  ,1,k);
+      double cr2=CC(i-1,0,k)+taur*tr2;     // c2=CC +taur*t2
+      double ci2=CC(i  ,0,k)+taur*ti2;
+      CH(i-1,k,0)=CC(i-1,0,k)+tr2;         // CH=CC+t2
+      CH(i  ,k,0)=CC(i  ,0,k)+ti2;
+      double cr3=taui*(CC(i-1,2,k)-CC(ic-1,1,k));// c3=taui*(CC(i)-conj(CC(ic)))
+      double ci3=taui*(CC(i  ,2,k)+CC(ic  ,1,k));
+      double di2, di3, dr2, dr3;
+      PM(dr3,dr2,cr2,ci3) // d2= (cr2-ci3, ci2+cr3) = c2+i*c3
+      PM(di2,di3,ci2,cr3) // d3= (cr2+ci3, ci2-cr3) = c2-i*c3
+      MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2) // ch = WA*d2
+      MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3)
+      }
+  }
+
+NOINLINE static void radb4(size_t ido, size_t l1, const double * restrict cc,
+  double * restrict ch, const double * restrict wa)
+  {
+  const size_t cdim=4;
+  static const double sqrt2=1.41421356237309504880;
+
+  for (size_t k=0; k<l1; k++)
+    {
+    double tr1, tr2;
+    PM (tr2,tr1,CC(0,0,k),CC(ido-1,3,k))
+    double tr3=2.*CC(ido-1,1,k);
+    double tr4=2.*CC(0,2,k);
+    PM (CH(0,k,0),CH(0,k,2),tr2,tr3)
+    PM (CH(0,k,3),CH(0,k,1),tr1,tr4)
+    }
+  if ((ido&1)==0)
+    for (size_t k=0; k<l1; k++)
+      {
+      double tr1,tr2,ti1,ti2;
+      PM (ti1,ti2,CC(0    ,3,k),CC(0    ,1,k))
+      PM (tr2,tr1,CC(ido-1,0,k),CC(ido-1,2,k))
+      CH(ido-1,k,0)=tr2+tr2;
+      CH(ido-1,k,1)=sqrt2*(tr1-ti1);
+      CH(ido-1,k,2)=ti2+ti2;
+      CH(ido-1,k,3)=-sqrt2*(tr1+ti1);
+      }
+  if (ido<=2) return;
+  for (size_t k=0; k<l1;++k)
+    for (size_t i=2; i<ido; i+=2)
+      {
+      double ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
+      size_t ic=ido-i;
+      PM (tr2,tr1,CC(i-1,0,k),CC(ic-1,3,k))
+      PM (ti1,ti2,CC(i  ,0,k),CC(ic  ,3,k))
+      PM (tr4,ti3,CC(i  ,2,k),CC(ic  ,1,k))
+      PM (tr3,ti4,CC(i-1,2,k),CC(ic-1,1,k))
+      PM (CH(i-1,k,0),cr3,tr2,tr3)
+      PM (CH(i  ,k,0),ci3,ti2,ti3)
+      PM (cr4,cr2,tr1,tr4)
+      PM (ci2,ci4,ti1,ti4)
+      MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ci2,cr2)
+      MULPM (CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),ci3,cr3)
+      MULPM (CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),ci4,cr4)
+      }
+  }
+
+NOINLINE static void radb5(size_t ido, size_t l1, const double * restrict cc,
+  double * restrict ch, const double * restrict wa)
+  {
+  const size_t cdim=5;
+  static const double tr11= 0.3090169943749474241, ti11=0.95105651629515357212,
+                      tr12=-0.8090169943749474241, ti12=0.58778525229247312917;
+
+  for (size_t k=0; k<l1; k++)
+    {
+    double ti5=CC(0,2,k)+CC(0,2,k);
+    double ti4=CC(0,4,k)+CC(0,4,k);
+    double tr2=CC(ido-1,1,k)+CC(ido-1,1,k);
+    double tr3=CC(ido-1,3,k)+CC(ido-1,3,k);
+    CH(0,k,0)=CC(0,0,k)+tr2+tr3;
+    double cr2=CC(0,0,k)+tr11*tr2+tr12*tr3;
+    double cr3=CC(0,0,k)+tr12*tr2+tr11*tr3;
+    double ci4, ci5;
+    MULPM(ci5,ci4,ti5,ti4,ti11,ti12)
+    PM(CH(0,k,4),CH(0,k,1),cr2,ci5)
+    PM(CH(0,k,3),CH(0,k,2),cr3,ci4)
+    }
+  if (ido==1) return;
+  for (size_t k=0; k<l1;++k)
+    for (size_t i=2; i<ido; i+=2)
+      {
+      size_t ic=ido-i;
+      double tr2, tr3, tr4, tr5, ti2, ti3, ti4, ti5;
+      PM(tr2,tr5,CC(i-1,2,k),CC(ic-1,1,k))
+      PM(ti5,ti2,CC(i  ,2,k),CC(ic  ,1,k))
+      PM(tr3,tr4,CC(i-1,4,k),CC(ic-1,3,k))
+      PM(ti4,ti3,CC(i  ,4,k),CC(ic  ,3,k))
+      CH(i-1,k,0)=CC(i-1,0,k)+tr2+tr3;
+      CH(i  ,k,0)=CC(i  ,0,k)+ti2+ti3;
+      double cr2=CC(i-1,0,k)+tr11*tr2+tr12*tr3;
+      double ci2=CC(i  ,0,k)+tr11*ti2+tr12*ti3;
+      double cr3=CC(i-1,0,k)+tr12*tr2+tr11*tr3;
+      double ci3=CC(i  ,0,k)+tr12*ti2+tr11*ti3;
+      double ci4, ci5, cr5, cr4;
+      MULPM(cr5,cr4,tr5,tr4,ti11,ti12)
+      MULPM(ci5,ci4,ti5,ti4,ti11,ti12)
+      double dr2, dr3, dr4, dr5, di2, di3, di4, di5;
+      PM(dr4,dr3,cr3,ci4)
+      PM(di3,di4,ci3,cr4)
+      PM(dr5,dr2,cr2,ci5)
+      PM(di2,di5,ci2,cr5)
+      MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2)
+      MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3)
+      MULPM(CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),di4,dr4)
+      MULPM(CH(i,k,4),CH(i-1,k,4),WA(3,i-2),WA(3,i-1),di5,dr5)
+      }
+  }
+
+#undef CC
+#undef CH
+#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
+#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
+#define C1(a,b,c) cc[(a)+ido*((b)+l1*(c))]
+#define C2(a,b) cc[(a)+idl1*(b)]
+#define CH2(a,b) ch[(a)+idl1*(b)]
+
+NOINLINE static void radbg(size_t ido, size_t ip, size_t l1,
+  double * restrict cc, double * restrict ch, const double * restrict wa,
+  const double * restrict csarr)
+  {
+  const size_t cdim=ip;
+  size_t ipph=(ip+1)/ 2;
+  size_t idl1 = ido*l1;
+
+  for (size_t k=0; k<l1; ++k)        // 102
+    for (size_t i=0; i<ido; ++i)     // 101
+      CH(i,k,0) = CC(i,0,k);
+  for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)   // 108
+    {
+    size_t j2=2*j-1;
+    for (size_t k=0; k<l1; ++k)
+      {
+      CH(0,k,j ) = 2*CC(ido-1,j2,k);
+      CH(0,k,jc) = 2*CC(0,j2+1,k);
+      }
+    }
+
+  if (ido!=1)
+    {
+    for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)   // 111
+      {
+      size_t j2=2*j-1;
+      for (size_t k=0; k<l1; ++k)
+        for (size_t i=1, ic=ido-i-2; i<=ido-2; i+=2, ic-=2)      // 109
+          {
+          CH(i  ,k,j ) = CC(i  ,j2+1,k)+CC(ic  ,j2,k);
+          CH(i  ,k,jc) = CC(i  ,j2+1,k)-CC(ic  ,j2,k);
+          CH(i+1,k,j ) = CC(i+1,j2+1,k)-CC(ic+1,j2,k);
+          CH(i+1,k,jc) = CC(i+1,j2+1,k)+CC(ic+1,j2,k);
+          }
+      }
+    }
+  for (size_t l=1,lc=ip-1; l<ipph; ++l,--lc)
+    {
+    for (size_t ik=0; ik<idl1; ++ik)
+      {
+      C2(ik,l ) = CH2(ik,0)+csarr[2*l]*CH2(ik,1)+csarr[4*l]*CH2(ik,2);
+      C2(ik,lc) = csarr[2*l+1]*CH2(ik,ip-1)+csarr[4*l+1]*CH2(ik,ip-2);
+      }
+    size_t iang=2*l;
+    size_t j=3,jc=ip-3;
+    for(; j<ipph-3; j+=4,jc-=4)
+      {
+      iang+=l; if(iang>ip) iang-=ip;
+      double ar1=csarr[2*iang], ai1=csarr[2*iang+1];
+      iang+=l; if(iang>ip) iang-=ip;
+      double ar2=csarr[2*iang], ai2=csarr[2*iang+1];
+      iang+=l; if(iang>ip) iang-=ip;
+      double ar3=csarr[2*iang], ai3=csarr[2*iang+1];
+      iang+=l; if(iang>ip) iang-=ip;
+      double ar4=csarr[2*iang], ai4=csarr[2*iang+1];
+      for (size_t ik=0; ik<idl1; ++ik)
+        {
+        C2(ik,l ) += ar1*CH2(ik,j )+ar2*CH2(ik,j +1)
+                    +ar3*CH2(ik,j +2)+ar4*CH2(ik,j +3);
+        C2(ik,lc) += ai1*CH2(ik,jc)+ai2*CH2(ik,jc-1)
+                    +ai3*CH2(ik,jc-2)+ai4*CH2(ik,jc-3);
+        }
+      }
+    for(; j<ipph-1; j+=2,jc-=2)
+      {
+      iang+=l; if(iang>ip) iang-=ip;
+      double ar1=csarr[2*iang], ai1=csarr[2*iang+1];
+      iang+=l; if(iang>ip) iang-=ip;
+      double ar2=csarr[2*iang], ai2=csarr[2*iang+1];
+      for (size_t ik=0; ik<idl1; ++ik)
+        {
+        C2(ik,l ) += ar1*CH2(ik,j )+ar2*CH2(ik,j +1);
+        C2(ik,lc) += ai1*CH2(ik,jc)+ai2*CH2(ik,jc-1);
+        }
+      }
+    for(; j<ipph; ++j,--jc)
+      {
+      iang+=l; if(iang>ip) iang-=ip;
+      double war=csarr[2*iang], wai=csarr[2*iang+1];
+      for (size_t ik=0; ik<idl1; ++ik)
+        {
+        C2(ik,l ) += war*CH2(ik,j );
+        C2(ik,lc) += wai*CH2(ik,jc);
+        }
+      }
+    }
+  for (size_t j=1; j<ipph; ++j)
+    for (size_t ik=0; ik<idl1; ++ik)
+      CH2(ik,0) += CH2(ik,j);
+  for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)   // 124
+    for (size_t k=0; k<l1; ++k)
+      {
+      CH(0,k,j ) = C1(0,k,j)-C1(0,k,jc);
+      CH(0,k,jc) = C1(0,k,j)+C1(0,k,jc);
+      }
+
+  if (ido==1) return;
+
+  for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)  // 127
+    for (size_t k=0; k<l1; ++k)
+      for (size_t i=1; i<=ido-2; i+=2)
+        {
+        CH(i  ,k,j ) = C1(i  ,k,j)-C1(i+1,k,jc);
+        CH(i  ,k,jc) = C1(i  ,k,j)+C1(i+1,k,jc);
+        CH(i+1,k,j ) = C1(i+1,k,j)+C1(i  ,k,jc);
+        CH(i+1,k,jc) = C1(i+1,k,j)-C1(i  ,k,jc);
+        }
+
+// All in CH
+
+  for (size_t j=1; j<ip; ++j)
+    {
+    size_t is = (j-1)*(ido-1);
+    for (size_t k=0; k<l1; ++k)
+      {
+      size_t idij = is;
+      for (size_t i=1; i<=ido-2; i+=2)
+        {
+        double t1=CH(i,k,j), t2=CH(i+1,k,j);
+        CH(i  ,k,j) = wa[idij]*t1-wa[idij+1]*t2;
+        CH(i+1,k,j) = wa[idij]*t2+wa[idij+1]*t1;
+        idij+=2;
+        }
+      }
+    }
+  }
+#undef C1
+#undef C2
+#undef CH2
+
+#undef CC
+#undef CH
+#undef PM
+#undef MULPM
+#undef WA
+
+static void copy_and_norm(double *c, double *p1, size_t n, double fct)
+  {
+  if (p1!=c)
+    {
+    if (fct!=1.)
+      for (size_t i=0; i<n; ++i)
+        c[i] = fct*p1[i];
+    else
+      memcpy (c,p1,n*sizeof(double));
+    }
+  else
+    if (fct!=1.)
+      for (size_t i=0; i<n; ++i)
+        c[i] *= fct;
+  }
+
+WARN_UNUSED_RESULT
+static int rfftp_forward(rfftp_plan plan, double c[], double fct)
+  {
+  if (plan->length==1) return 0;
+  size_t n=plan->length;
+  size_t l1=n, nf=plan->nfct;
+  double *ch = RALLOC(double, n);
+  if (!ch) return -1;
+  double *p1=c, *p2=ch;
+
+  for(size_t k1=0; k1<nf;++k1)
+    {
+    size_t k=nf-k1-1;
+    size_t ip=plan->fct[k].fct;
+    size_t ido=n / l1;
+    l1 /= ip;
+    if(ip==4)
+      radf4(ido, l1, p1, p2, plan->fct[k].tw);
+    else if(ip==2)
+      radf2(ido, l1, p1, p2, plan->fct[k].tw);
+    else if(ip==3)
+      radf3(ido, l1, p1, p2, plan->fct[k].tw);
+    else if(ip==5)
+      radf5(ido, l1, p1, p2, plan->fct[k].tw);
+    else
+      {
+      radfg(ido, ip, l1, p1, p2, plan->fct[k].tw, plan->fct[k].tws);
+      SWAP (p1,p2,double *);
+      }
+    SWAP (p1,p2,double *);
+    }
+  copy_and_norm(c,p1,n,fct);
+  DEALLOC(ch);
+  return 0;
+  }
+
+WARN_UNUSED_RESULT
+static int rfftp_backward(rfftp_plan plan, double c[], double fct)
+  {
+  if (plan->length==1) return 0;
+  size_t n=plan->length;
+  size_t l1=1, nf=plan->nfct;
+  double *ch = RALLOC(double, n);
+  if (!ch) return -1;
+  double *p1=c, *p2=ch;
+
+  for(size_t k=0; k<nf; k++)
+    {
+    size_t ip = plan->fct[k].fct,
+           ido= n/(ip*l1);
+    if(ip==4)
+      radb4(ido, l1, p1, p2, plan->fct[k].tw);
+    else if(ip==2)
+      radb2(ido, l1, p1, p2, plan->fct[k].tw);
+    else if(ip==3)
+      radb3(ido, l1, p1, p2, plan->fct[k].tw);
+    else if(ip==5)
+      radb5(ido, l1, p1, p2, plan->fct[k].tw);
+    else
+      radbg(ido, ip, l1, p1, p2, plan->fct[k].tw, plan->fct[k].tws);
+    SWAP (p1,p2,double *);
+    l1*=ip;
+    }
+  copy_and_norm(c,p1,n,fct);
+  DEALLOC(ch);
+  return 0;
+  }
+
+WARN_UNUSED_RESULT
+static int rfftp_factorize (rfftp_plan plan)
+  {
+  size_t length=plan->length;
+  size_t nfct=0;
+  while ((length%4)==0)
+    { if (nfct>=NFCT) return -1; plan->fct[nfct++].fct=4; length>>=2; }
+  if ((length%2)==0)
+    {
+    length>>=1;
+    // factor 2 should be at the front of the factor list
+    if (nfct>=NFCT) return -1;
+    plan->fct[nfct++].fct=2;
+    SWAP(plan->fct[0].fct, plan->fct[nfct-1].fct,size_t);
+    }
+  size_t maxl=(size_t)(sqrt((double)length))+1;
+  for (size_t divisor=3; (length>1)&&(divisor<maxl); divisor+=2)
+    if ((length%divisor)==0)
+      {
+      while ((length%divisor)==0)
+        {
+        if (nfct>=NFCT) return -1;
+        plan->fct[nfct++].fct=divisor;
+        length/=divisor;
+        }
+      maxl=(size_t)(sqrt((double)length))+1;
+      }
+  if (length>1) plan->fct[nfct++].fct=length;
+  plan->nfct=nfct;
+  return 0;
+  }
+
+static size_t rfftp_twsize(rfftp_plan plan)
+  {
+  size_t twsize=0, l1=1;
+  for (size_t k=0; k<plan->nfct; ++k)
+    {
+    size_t ip=plan->fct[k].fct, ido= plan->length/(l1*ip);
+    twsize+=(ip-1)*(ido-1);
+    if (ip>5) twsize+=2*ip;
+    l1*=ip;
+    }
+  return twsize;
+  return 0;
+  }
+
+WARN_UNUSED_RESULT NOINLINE static int rfftp_comp_twiddle (rfftp_plan plan)
+  {
+  size_t length=plan->length;
+  double *twid = RALLOC(double, 2*length);
+  if (!twid) return -1;
+  sincos_2pibyn_half(length, twid);
+  size_t l1=1;
+  double *ptr=plan->mem;
+  for (size_t k=0; k<plan->nfct; ++k)
+    {
+    size_t ip=plan->fct[k].fct, ido=length/(l1*ip);
+    if (k<plan->nfct-1) // last factor doesn't need twiddles
+      {
+      plan->fct[k].tw=ptr; ptr+=(ip-1)*(ido-1);
+      for (size_t j=1; j<ip; ++j)
+        for (size_t i=1; i<=(ido-1)/2; ++i)
+          {
+          plan->fct[k].tw[(j-1)*(ido-1)+2*i-2] = twid[2*j*l1*i];
+          plan->fct[k].tw[(j-1)*(ido-1)+2*i-1] = twid[2*j*l1*i+1];
+          }
+      }
+    if (ip>5) // special factors required by *g functions
+      {
+      plan->fct[k].tws=ptr; ptr+=2*ip;
+      plan->fct[k].tws[0] = 1.;
+      plan->fct[k].tws[1] = 0.;
+      for (size_t i=1; i<=(ip>>1); ++i)
+        {
+        plan->fct[k].tws[2*i  ] = twid[2*i*(length/ip)];
+        plan->fct[k].tws[2*i+1] = twid[2*i*(length/ip)+1];
+        plan->fct[k].tws[2*(ip-i)  ] = twid[2*i*(length/ip)];
+        plan->fct[k].tws[2*(ip-i)+1] = -twid[2*i*(length/ip)+1];
+        }
+      }
+    l1*=ip;
+    }
+  DEALLOC(twid);
+  return 0;
+  }
+
+NOINLINE static rfftp_plan make_rfftp_plan (size_t length)
+  {
+  if (length==0) return NULL;
+  rfftp_plan plan = RALLOC(rfftp_plan_i,1);
+  if (!plan) return NULL;
+  plan->length=length;
+  plan->nfct=0;
+  plan->mem=NULL;
+  for (size_t i=0; i<NFCT; ++i)
+    plan->fct[i]=(rfftp_fctdata){0,0,0};
+  if (length==1) return plan;
+  if (rfftp_factorize(plan)!=0) { DEALLOC(plan); return NULL; }
+  size_t tws=rfftp_twsize(plan);
+  plan->mem=RALLOC(double,tws);
+  if (!plan->mem) { DEALLOC(plan); return NULL; }
+  if (rfftp_comp_twiddle(plan)!=0)
+    { DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
+  return plan;
+  }
+
+NOINLINE static void destroy_rfftp_plan (rfftp_plan plan)
+  {
+  DEALLOC(plan->mem);
+  DEALLOC(plan);
+  }
+
+typedef struct fftblue_plan_i
+  {
+  size_t n, n2;
+  cfftp_plan plan;
+  double *mem;
+  double *bk, *bkf;
+  } fftblue_plan_i;
+typedef struct fftblue_plan_i * fftblue_plan;
+
+NOINLINE static fftblue_plan make_fftblue_plan (size_t length)
+  {
+  fftblue_plan plan = RALLOC(fftblue_plan_i,1);
+  if (!plan) return NULL;
+  plan->n = length;
+  plan->n2 = good_size(plan->n*2-1);
+  plan->mem = RALLOC(double, 2*plan->n+2*plan->n2);
+  if (!plan->mem) { DEALLOC(plan); return NULL; }
+  plan->bk  = plan->mem;
+  plan->bkf = plan->bk+2*plan->n;
+
+/* initialize b_k */
+  double *tmp = RALLOC(double,4*plan->n);
+  if (!tmp) { DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
+  sincos_2pibyn(2*plan->n,tmp);
+  plan->bk[0] = 1;
+  plan->bk[1] = 0;
+
+  size_t coeff=0;
+  for (size_t m=1; m<plan->n; ++m)
+    {
+    coeff+=2*m-1;
+    if (coeff>=2*plan->n) coeff-=2*plan->n;
+    plan->bk[2*m  ] = tmp[2*coeff  ];
+    plan->bk[2*m+1] = tmp[2*coeff+1];
+    }
+
+  /* initialize the zero-padded, Fourier transformed b_k. Add normalisation. */
+  double xn2 = 1./plan->n2;
+  plan->bkf[0] = plan->bk[0]*xn2;
+  plan->bkf[1] = plan->bk[1]*xn2;
+  for (size_t m=2; m<2*plan->n; m+=2)
+    {
+    plan->bkf[m]   = plan->bkf[2*plan->n2-m]   = plan->bk[m]   *xn2;
+    plan->bkf[m+1] = plan->bkf[2*plan->n2-m+1] = plan->bk[m+1] *xn2;
+    }
+  for (size_t m=2*plan->n;m<=(2*plan->n2-2*plan->n+1);++m)
+    plan->bkf[m]=0.;
+  plan->plan=make_cfftp_plan(plan->n2);
+  if (!plan->plan)
+    { DEALLOC(tmp); DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
+  if (cfftp_forward(plan->plan,plan->bkf,1.)!=0)
+    { DEALLOC(tmp); DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
+  DEALLOC(tmp);
+
+  return plan;
+  }
+
+NOINLINE static void destroy_fftblue_plan (fftblue_plan plan)
+  {
+  DEALLOC(plan->mem);
+  destroy_cfftp_plan(plan->plan);
+  DEALLOC(plan);
+  }
+
+NOINLINE WARN_UNUSED_RESULT
+static int fftblue_fft(fftblue_plan plan, double c[], int isign, double fct)
+  {
+  size_t n=plan->n;
+  size_t n2=plan->n2;
+  double *bk  = plan->bk;
+  double *bkf = plan->bkf;
+  double *akf = RALLOC(double, 2*n2);
+  if (!akf) return -1;
+
+/* initialize a_k and FFT it */
+  if (isign>0)
+    for (size_t m=0; m<2*n; m+=2)
+      {
+      akf[m]   = c[m]*bk[m]   - c[m+1]*bk[m+1];
+      akf[m+1] = c[m]*bk[m+1] + c[m+1]*bk[m];
+      }
+  else
+    for (size_t m=0; m<2*n; m+=2)
+      {
+      akf[m]   = c[m]*bk[m]   + c[m+1]*bk[m+1];
+      akf[m+1] =-c[m]*bk[m+1] + c[m+1]*bk[m];
+      }
+  for (size_t m=2*n; m<2*n2; ++m)
+    akf[m]=0;
+
+  if (cfftp_forward (plan->plan,akf,fct)!=0)
+    { DEALLOC(akf); return -1; }
+
+/* do the convolution */
+  if (isign>0)
+    for (size_t m=0; m<2*n2; m+=2)
+      {
+      double im = -akf[m]*bkf[m+1] + akf[m+1]*bkf[m];
+      akf[m  ]  =  akf[m]*bkf[m]   + akf[m+1]*bkf[m+1];
+      akf[m+1]  = im;
+      }
+  else
+    for (size_t m=0; m<2*n2; m+=2)
+      {
+      double im = akf[m]*bkf[m+1] + akf[m+1]*bkf[m];
+      akf[m  ]  = akf[m]*bkf[m]   - akf[m+1]*bkf[m+1];
+      akf[m+1]  = im;
+      }
+
+/* inverse FFT */
+  if (cfftp_backward (plan->plan,akf,1.)!=0)
+    { DEALLOC(akf); return -1; }
+
+/* multiply by b_k */
+  if (isign>0)
+    for (size_t m=0; m<2*n; m+=2)
+      {
+      c[m]   = bk[m]  *akf[m] - bk[m+1]*akf[m+1];
+      c[m+1] = bk[m+1]*akf[m] + bk[m]  *akf[m+1];
+      }
+  else
+    for (size_t m=0; m<2*n; m+=2)
+      {
+      c[m]   = bk[m]  *akf[m] + bk[m+1]*akf[m+1];
+      c[m+1] =-bk[m+1]*akf[m] + bk[m]  *akf[m+1];
+      }
+  DEALLOC(akf);
+  return 0;
+  }
+
+WARN_UNUSED_RESULT
+static int cfftblue_backward(fftblue_plan plan, double c[], double fct)
+  { return fftblue_fft(plan,c,1,fct); }
+
+WARN_UNUSED_RESULT
+static int cfftblue_forward(fftblue_plan plan, double c[], double fct)
+  { return fftblue_fft(plan,c,-1,fct); }
+
+WARN_UNUSED_RESULT
+static int rfftblue_backward(fftblue_plan plan, double c[], double fct)
+  {
+  size_t n=plan->n;
+  double *tmp = RALLOC(double,2*n);
+  if (!tmp) return -1;
+  tmp[0]=c[0];
+  tmp[1]=0.;
+  memcpy (tmp+2,c+1, (n-1)*sizeof(double));
+  if ((n&1)==0) tmp[n+1]=0.;
+  for (size_t m=2; m<n; m+=2)
+    {
+    tmp[2*n-m]=tmp[m];
+    tmp[2*n-m+1]=-tmp[m+1];
+    }
+  if (fftblue_fft(plan,tmp,1,fct)!=0)
+    { DEALLOC(tmp); return -1; }
+  for (size_t m=0; m<n; ++m)
+    c[m] = tmp[2*m];
+  DEALLOC(tmp);
+  return 0;
+  }
+
+WARN_UNUSED_RESULT
+static int rfftblue_forward(fftblue_plan plan, double c[], double fct)
+  {
+  size_t n=plan->n;
+  double *tmp = RALLOC(double,2*n);
+  if (!tmp) return -1;
+  for (size_t m=0; m<n; ++m)
+    {
+    tmp[2*m] = c[m];
+    tmp[2*m+1] = 0.;
+    }
+  if (fftblue_fft(plan,tmp,-1,fct)!=0)
+    { DEALLOC(tmp); return -1; }
+  c[0] = tmp[0];
+  memcpy (c+1, tmp+2, (n-1)*sizeof(double));
+  DEALLOC(tmp);
+  return 0;
+  }
+
+typedef struct cfft_plan_i
+  {
+  cfftp_plan packplan;
+  fftblue_plan blueplan;
+  } cfft_plan_i;
+
+static cfft_plan make_cfft_plan (size_t length)
+  {
+  if (length==0) return NULL;
+  cfft_plan plan = RALLOC(cfft_plan_i,1);
+  if (!plan) return NULL;
+  plan->blueplan=0;
+  plan->packplan=0;
+  if ((length<50) || (largest_prime_factor(length)<=sqrt(length)))
+    {
+    plan->packplan=make_cfftp_plan(length);
+    if (!plan->packplan) { DEALLOC(plan); return NULL; }
+    return plan;
+    }
+  double comp1 = cost_guess(length);
+  double comp2 = 2*cost_guess(good_size(2*length-1));
+  comp2*=1.5; /* fudge factor that appears to give good overall performance */
+  if (comp2<comp1) // use Bluestein
+    {
+    plan->blueplan=make_fftblue_plan(length);
+    if (!plan->blueplan) { DEALLOC(plan); return NULL; }
+    }
+  else
+    {
+    plan->packplan=make_cfftp_plan(length);
+    if (!plan->packplan) { DEALLOC(plan); return NULL; }
+    }
+  return plan;
+  }
+
+static void destroy_cfft_plan (cfft_plan plan)
+  {
+  if (plan->blueplan)
+    destroy_fftblue_plan(plan->blueplan);
+  if (plan->packplan)
+    destroy_cfftp_plan(plan->packplan);
+  DEALLOC(plan);
+  }
+
+WARN_UNUSED_RESULT static int cfft_backward(cfft_plan plan, double c[], double fct)
+  {
+  if (plan->packplan)
+    return cfftp_backward(plan->packplan,c,fct);
+  // if (plan->blueplan)
+  return cfftblue_backward(plan->blueplan,c,fct);
+  }
+
+WARN_UNUSED_RESULT static int cfft_forward(cfft_plan plan, double c[], double fct)
+  {
+  if (plan->packplan)
+    return cfftp_forward(plan->packplan,c,fct);
+  // if (plan->blueplan)
+  return cfftblue_forward(plan->blueplan,c,fct);
+  }
+
+typedef struct rfft_plan_i
+  {
+  rfftp_plan packplan;
+  fftblue_plan blueplan;
+  } rfft_plan_i;
+
+static rfft_plan make_rfft_plan (size_t length)
+  {
+  if (length==0) return NULL;
+  rfft_plan plan = RALLOC(rfft_plan_i,1);
+  if (!plan) return NULL;
+  plan->blueplan=0;
+  plan->packplan=0;
+  if ((length<50) || (largest_prime_factor(length)<=sqrt(length)))
+    {
+    plan->packplan=make_rfftp_plan(length);
+    if (!plan->packplan) { DEALLOC(plan); return NULL; }
+    return plan;
+    }
+  double comp1 = 0.5*cost_guess(length);
+  double comp2 = 2*cost_guess(good_size(2*length-1));
+  comp2*=1.5; /* fudge factor that appears to give good overall performance */
+  if (comp2<comp1) // use Bluestein
+    {
+    plan->blueplan=make_fftblue_plan(length);
+    if (!plan->blueplan) { DEALLOC(plan); return NULL; }
+    }
+  else
+    {
+    plan->packplan=make_rfftp_plan(length);
+    if (!plan->packplan) { DEALLOC(plan); return NULL; }
+    }
+  return plan;
+  }
+
+static void destroy_rfft_plan (rfft_plan plan)
+  {
+  if (plan->blueplan)
+    destroy_fftblue_plan(plan->blueplan);
+  if (plan->packplan)
+    destroy_rfftp_plan(plan->packplan);
+  DEALLOC(plan);
+  }
+
+WARN_UNUSED_RESULT static int rfft_backward(rfft_plan plan, double c[], double fct)
+  {
+  if (plan->packplan)
+    return rfftp_backward(plan->packplan,c,fct);
+  else // if (plan->blueplan)
+    return rfftblue_backward(plan->blueplan,c,fct);
+  }
+
+WARN_UNUSED_RESULT static int rfft_forward(rfft_plan plan, double c[], double fct)
+  {
+  if (plan->packplan)
+    return rfftp_forward(plan->packplan,c,fct);
+  else // if (plan->blueplan)
+    return rfftblue_forward(plan->blueplan,c,fct);
+  }
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "Python.h"
+#include "numpy/arrayobject.h"
+
+static PyObject *
+execute_complex(PyObject *a1, int is_forward, double fct)
+{
+    PyArrayObject *data = (PyArrayObject *)PyArray_FromAny(a1,
+            PyArray_DescrFromType(NPY_CDOUBLE), 1, 0,
+            NPY_ARRAY_ENSURECOPY | NPY_ARRAY_DEFAULT |
+            NPY_ARRAY_ENSUREARRAY | NPY_ARRAY_FORCECAST,
+            NULL);
+    if (!data) return NULL;
+
+    int npts = PyArray_DIM(data, PyArray_NDIM(data) - 1);
+    cfft_plan plan=NULL;
+
+    int nrepeats = PyArray_SIZE(data)/npts;
+    double *dptr = (double *)PyArray_DATA(data);
+    int fail=0;
+    Py_BEGIN_ALLOW_THREADS;
+    NPY_SIGINT_ON;
+    plan = make_cfft_plan(npts);
+    if (!plan) fail=1;
+    if (!fail)
+      for (int i = 0; i < nrepeats; i++) {
+          int res = is_forward ?
+            cfft_forward(plan, dptr, fct) : cfft_backward(plan, dptr, fct);
+          if (res!=0) { fail=1; break; }
+          dptr += npts*2;
+      }
+    if (plan) destroy_cfft_plan(plan);
+    NPY_SIGINT_OFF;
+    Py_END_ALLOW_THREADS;
+    if (fail) {
+      Py_XDECREF(data);
+      return PyErr_NoMemory();
+    }
+    return (PyObject *)data;
+}
+
+static PyObject *
+execute_real_forward(PyObject *a1, double fct)
+{
+    rfft_plan plan=NULL;
+    int fail = 0;
+    PyArrayObject *data = (PyArrayObject *)PyArray_FromAny(a1,
+            PyArray_DescrFromType(NPY_DOUBLE), 1, 0,
+            NPY_ARRAY_DEFAULT | NPY_ARRAY_ENSUREARRAY | NPY_ARRAY_FORCECAST,
+            NULL);
+    if (!data) return NULL;
+
+    int ndim = PyArray_NDIM(data);
+    const npy_intp *odim = PyArray_DIMS(data);
+    int npts = odim[ndim - 1];
+    npy_intp *tdim=(npy_intp *)malloc(ndim*sizeof(npy_intp));
+    if (!tdim)
+      { Py_XDECREF(data); return NULL; }
+    for (int d=0; d<ndim-1; ++d)
+      tdim[d] = odim[d];
+    tdim[ndim-1] = npts/2 + 1;
+    PyArrayObject *ret = (PyArrayObject *)PyArray_Empty(ndim,
+            tdim, PyArray_DescrFromType(NPY_CDOUBLE), 0);
+    free(tdim);
+    if (!ret) fail=1;
+    if (!fail) {
+      int rstep = PyArray_DIM(ret, PyArray_NDIM(ret) - 1)*2;
+
+      int nrepeats = PyArray_SIZE(data)/npts;
+      double *rptr = (double *)PyArray_DATA(ret),
+             *dptr = (double *)PyArray_DATA(data);
+
+      Py_BEGIN_ALLOW_THREADS;
+      NPY_SIGINT_ON;
+      plan = make_rfft_plan(npts);
+      if (!plan) fail=1;
+      if (!fail)
+        for (int i = 0; i < nrepeats; i++) {
+            rptr[rstep-1] = 0.0;
+            memcpy((char *)(rptr+1), dptr, npts*sizeof(double));
+            if (rfft_forward(plan, rptr+1, fct)!=0) {fail=1; break;}
+            rptr[0] = rptr[1];
+            rptr[1] = 0.0;
+            rptr += rstep;
+            dptr += npts;
+      }
+      if (plan) destroy_rfft_plan(plan);
+      NPY_SIGINT_OFF;
+      Py_END_ALLOW_THREADS;
+    }
+    if (fail) {
+      Py_XDECREF(data);
+      Py_XDECREF(ret);
+      return PyErr_NoMemory();
+    }
+    Py_DECREF(data);
+    return (PyObject *)ret;
+}
+static PyObject *
+execute_real_backward(PyObject *a1, double fct)
+{
+    rfft_plan plan=NULL;
+    PyArrayObject *data = (PyArrayObject *)PyArray_FromAny(a1,
+            PyArray_DescrFromType(NPY_CDOUBLE), 1, 0,
+            NPY_ARRAY_DEFAULT | NPY_ARRAY_ENSUREARRAY | NPY_ARRAY_FORCECAST,
+            NULL);
+    if (!data) return NULL;
+    int npts = PyArray_DIM(data, PyArray_NDIM(data) - 1);
+    PyArrayObject *ret = (PyArrayObject *)PyArray_Empty(PyArray_NDIM(data),
+            PyArray_DIMS(data), PyArray_DescrFromType(NPY_DOUBLE), 0);
+    int fail = 0;
+    if (!ret) fail=1;
+    if (!fail) {
+      int nrepeats = PyArray_SIZE(ret)/npts;
+      double *rptr = (double *)PyArray_DATA(ret),
+             *dptr = (double *)PyArray_DATA(data);
+
+      Py_BEGIN_ALLOW_THREADS;
+      NPY_SIGINT_ON;
+      plan = make_rfft_plan(npts);
+      if (!plan) fail=1;
+      if (!fail) {
+        for (int i = 0; i < nrepeats; i++) {
+          memcpy((char *)(rptr + 1), (dptr + 2), (npts - 1)*sizeof(double));
+          rptr[0] = dptr[0];
+          if (rfft_backward(plan, rptr, fct)!=0) {fail=1; break;}
+          rptr += npts;
+          dptr += npts*2;
+        }
+      }
+      if (plan) destroy_rfft_plan(plan);
+      NPY_SIGINT_OFF;
+      Py_END_ALLOW_THREADS;
+    }
+    if (fail) {
+      Py_XDECREF(data);
+      Py_XDECREF(ret);
+      return PyErr_NoMemory();
+    }
+    Py_DECREF(data);
+    return (PyObject *)ret;
+}
+
+static PyObject *
+execute_real(PyObject *a1, int is_forward, double fct)
+{
+    return is_forward ? execute_real_forward(a1, fct)
+                      : execute_real_backward(a1, fct);
+}
+
+static const char execute__doc__[] = "";
+
+static PyObject *
+execute(PyObject *NPY_UNUSED(self), PyObject *args)
+{
+    PyObject *a1;
+    int is_real, is_forward;
+    double fct;
+
+    if(!PyArg_ParseTuple(args, "Oiid:execute", &a1, &is_real, &is_forward, &fct)) {
+        return NULL;
+    }
+
+    return is_real ? execute_real(a1, is_forward, fct)
+                   : execute_complex(a1, is_forward, fct);
+}
+
+/* List of methods defined in the module */
+
+static struct PyMethodDef methods[] = {
+    {"execute",   execute,   1, execute__doc__},
+    {NULL, NULL, 0, NULL}          /* sentinel */
+};
+
+#if PY_MAJOR_VERSION >= 3
+static struct PyModuleDef moduledef = {
+        PyModuleDef_HEAD_INIT,
+        "pocketfft_internal",
+        NULL,
+        -1,
+        methods,
+        NULL,
+        NULL,
+        NULL,
+        NULL
+};
+#endif
+
+/* Initialization function for the module */
+#if PY_MAJOR_VERSION >= 3
+#define RETVAL(x) x
+PyMODINIT_FUNC PyInit_pocketfft_internal(void)
+#else
+#define RETVAL(x)
+PyMODINIT_FUNC
+initpocketfft_internal(void)
+#endif
+{
+    PyObject *m;
+#if PY_MAJOR_VERSION >= 3
+    m = PyModule_Create(&moduledef);
+#else
+    static const char module_documentation[] = "";
+
+    m = Py_InitModule4("pocketfft_internal", methods,
+            module_documentation,
+            (PyObject*)NULL,PYTHON_API_VERSION);
+#endif
+    if (m == NULL) {
+        return RETVAL(NULL);
+    }
+
+    /* Import the array object */
+    import_array();
+
+    /* XXXX Add constants here */
+
+    return RETVAL(m);
+}
diff --git a/numpy/fft/fftpack.py b/numpy/fft/pocketfft.py
index d0df6fb48..45dc162f6 100644
--- a/numpy/fft/fftpack.py
+++ b/numpy/fft/pocketfft.py
@@ -26,9 +26,6 @@ n = n-dimensional transform
 (Note: 2D routines are just nD routines with different default
 behavior.)
 
-The underlying code for these functions is an f2c-translated and modified
-version of the FFTPACK routines.
-
 """
 from __future__ import division, absolute_import, print_function
 
@@ -37,26 +34,18 @@ __all__ = ['fft', 'ifft', 'rfft', 'irfft', 'hfft', 'ihfft', 'rfftn',
 
 import functools
 
-from numpy.core import (array, asarray, zeros, swapaxes, shape, conjugate,
-                        take, sqrt)
+from numpy.core import asarray, zeros, swapaxes, conjugate, take, sqrt
+from . import pocketfft_internal as pfi
 from numpy.core.multiarray import normalize_axis_index
 from numpy.core import overrides
-from . import fftpack_lite as fftpack
-from .helper import _FFTCache
-
-_fft_cache = _FFTCache(max_size_in_mb=100, max_item_count=32)
-_real_fft_cache = _FFTCache(max_size_in_mb=100, max_item_count=32)
 
 
 array_function_dispatch = functools.partial(
     overrides.array_function_dispatch, module='numpy.fft')
 
 
-def _raw_fft(a, n=None, axis=-1, init_function=fftpack.cffti,
-             work_function=fftpack.cfftf, fft_cache=_fft_cache):
-    a = asarray(a)
+def _raw_fft(a, n, axis, is_real, is_forward, fct):
     axis = normalize_axis_index(axis, a.ndim)
-
     if n is None:
         n = a.shape[axis]
 
@@ -64,15 +53,6 @@ def _raw_fft(a, n=None, axis=-1, init_function=fftpack.cffti,
         raise ValueError("Invalid number of FFT data points (%d) specified."
                          % n)
 
-    # We have to ensure that only a single thread can access a wsave array
-    # at any given time. Thus we remove it from the cache and insert it
-    # again after it has been used. Multiple threads might create multiple
-    # copies of the wsave array. This is intentional and a limitation of
-    # the current C code.
-    wsave = fft_cache.pop_twiddle_factors(n)
-    if wsave is None:
-        wsave = init_function(n)
-
     if a.shape[axis] != n:
         s = list(a.shape)
         if s[axis] > n:
@@ -87,25 +67,22 @@ def _raw_fft(a, n=None, axis=-1, init_function=fftpack.cffti,
             z[tuple(index)] = a
             a = z
 
-    if axis != a.ndim - 1:
+    if axis == a.ndim-1:
+        r = pfi.execute(a, is_real, is_forward, fct)
+    else:
         a = swapaxes(a, axis, -1)
-    r = work_function(a, wsave)
-    if axis != a.ndim - 1:
+        r = pfi.execute(a, is_real, is_forward, fct)
         r = swapaxes(r, axis, -1)
-
-    # As soon as we put wsave back into the cache, another thread could pick it
-    # up and start using it, so we must not do this until after we're
-    # completely done using it ourselves.
-    fft_cache.put_twiddle_factors(n, wsave)
-
     return r
 
 
 def _unitary(norm):
-    if norm not in (None, "ortho"):
-        raise ValueError("Invalid norm value %s, should be None or \"ortho\"."
-                         % norm)
-    return norm is not None
+    if norm is None:
+        return False
+    if norm=="ortho":
+        return True
+    raise ValueError("Invalid norm value %s, should be None or \"ortho\"."
+                     % norm)
 
 
 def _fft_dispatcher(a, n=None, axis=None, norm=None):
@@ -186,8 +163,6 @@ def fft(a, n=None, axis=-1, norm=None):
     in the real part and anti-symmetric in the imaginary part, as described in
     the `numpy.fft` documentation:
 
-    >>> import matplotlib
-    >>> matplotlib.use('Agg')
     >>> import matplotlib.pyplot as plt
     >>> t = np.arange(256)
     >>> sp = np.fft.fft(np.sin(t))
@@ -198,12 +173,13 @@ def fft(a, n=None, axis=-1, norm=None):
 
     """
 
-    a = asarray(a).astype(complex, copy=False)
+    a = asarray(a)
     if n is None:
         n = a.shape[axis]
-    output = _raw_fft(a, n, axis, fftpack.cffti, fftpack.cfftf, _fft_cache)
-    if _unitary(norm):
-        output *= 1 / sqrt(n)
+    fct = 1
+    if norm is not None and _unitary(norm):
+        fct = 1 / sqrt(n)
+    output = _raw_fft(a, n, axis, False, True, fct)
     return output
 
 
@@ -280,8 +256,6 @@ def ifft(a, n=None, axis=-1, norm=None):
 
     Create and plot a band-limited signal with random phases:
 
-    >>> import matplotlib
-    >>> matplotlib.use('agg')
     >>> import matplotlib.pyplot as plt
     >>> t = np.arange(400)
     >>> n = np.zeros((400,), dtype=complex)
@@ -294,13 +268,14 @@ def ifft(a, n=None, axis=-1, norm=None):
     >>> plt.show()
 
     """
-    # The copy may be required for multithreading.
-    a = array(a, copy=True, dtype=complex)
+    a = asarray(a)
     if n is None:
         n = a.shape[axis]
-    unitary = _unitary(norm)
-    output = _raw_fft(a, n, axis, fftpack.cffti, fftpack.cfftb, _fft_cache)
-    return output * (1 / (sqrt(n) if unitary else n))
+    fct = 1/n
+    if norm is not None and _unitary(norm):
+        fct = 1/sqrt(n)
+    output = _raw_fft(a, n, axis, False, False, fct)
+    return output
 
 
 
@@ -383,14 +358,13 @@ def rfft(a, n=None, axis=-1, norm=None):
     exploited to compute only the non-negative frequency terms.
 
     """
-    # The copy may be required for multithreading.
-    a = array(a, copy=True, dtype=float)
-    output = _raw_fft(a, n, axis, fftpack.rffti, fftpack.rfftf,
-                      _real_fft_cache)
-    if _unitary(norm):
+    a = asarray(a)
+    fct = 1
+    if norm is not None and _unitary(norm):
         if n is None:
             n = a.shape[axis]
-        output *= 1 / sqrt(n)
+        fct = 1/sqrt(n)
+    output = _raw_fft(a, n, axis, True, True, fct)
     return output
 
 
@@ -475,14 +449,14 @@ def irfft(a, n=None, axis=-1, norm=None):
     specified, and the output array is purely real.
 
     """
-    # The copy may be required for multithreading.
-    a = array(a, copy=True, dtype=complex)
+    a = asarray(a)
     if n is None:
         n = (a.shape[axis] - 1) * 2
-    unitary = _unitary(norm)
-    output = _raw_fft(a, n, axis, fftpack.rffti, fftpack.rfftb,
-                      _real_fft_cache)
-    return output * (1 / (sqrt(n) if unitary else n))
+    fct = 1/n
+    if norm is not None and _unitary(norm):
+        fct = 1/sqrt(n)
+    output = _raw_fft(a, n, axis, True, False, fct)
+    return output
 
 
 @array_function_dispatch(_fft_dispatcher)
@@ -560,8 +534,7 @@ def hfft(a, n=None, axis=-1, norm=None):
            [ 2., -2.]])
 
     """
-    # The copy may be required for multithreading.
-    a = array(a, copy=True, dtype=complex)
+    a = asarray(a)
     if n is None:
         n = (a.shape[axis] - 1) * 2
     unitary = _unitary(norm)
@@ -621,8 +594,7 @@ def ihfft(a, n=None, axis=-1, norm=None):
     array([ 1.-0.j,  2.-0.j,  3.-0.j,  4.-0.j]) # may vary
 
     """
-    # The copy may be required for multithreading.
-    a = array(a, copy=True, dtype=float)
+    a = asarray(a)
     if n is None:
         n = a.shape[axis]
     unitary = _unitary(norm)
@@ -1122,8 +1094,7 @@ def rfftn(a, s=None, axes=None, norm=None):
             [0.+0.j,  0.+0.j]]])
 
     """
-    # The copy may be required for multithreading.
-    a = array(a, copy=True, dtype=float)
+    a = asarray(a)
     s, axes = _cook_nd_args(a, s, axes)
     a = rfft(a, s[-1], axes[-1], norm)
     for ii in range(len(axes)-1):
@@ -1255,8 +1226,7 @@ def irfftn(a, s=None, axes=None, norm=None):
             [1.,  1.]]])
 
     """
-    # The copy may be required for multithreading.
-    a = array(a, copy=True, dtype=complex)
+    a = asarray(a)
     s, axes = _cook_nd_args(a, s, axes, invreal=1)
     for ii in range(len(axes)-1):
         a = ifft(a, s[ii], axes[ii], norm)
diff --git a/numpy/fft/setup.py b/numpy/fft/setup.py
index cd99a82d7..6c3548b65 100644
--- a/numpy/fft/setup.py
+++ b/numpy/fft/setup.py
@@ -7,9 +7,9 @@ def configuration(parent_package='',top_path=None):
 
     config.add_data_dir('tests')
 
-    # Configure fftpack_lite
-    config.add_extension('fftpack_lite',
-                         sources=['fftpack_litemodule.c', 'fftpack.c']
+    # Configure pocketfft_internal
+    config.add_extension('pocketfft_internal',
+                         sources=['pocketfft.c']
                          )
 
     return config
diff --git a/numpy/fft/tests/test_helper.py b/numpy/fft/tests/test_helper.py
index 8d315fa02..6613c8002 100644
--- a/numpy/fft/tests/test_helper.py
+++ b/numpy/fft/tests/test_helper.py
@@ -7,7 +7,6 @@ from __future__ import division, absolute_import, print_function
 import numpy as np
 from numpy.testing import assert_array_almost_equal, assert_equal
 from numpy import fft, pi
-from numpy.fft.helper import _FFTCache
 
 
 class TestFFTShift(object):
@@ -168,81 +167,3 @@ class TestIRFFTN(object):
 
         # Should not raise error
         fft.irfftn(a, axes=axes)
-
-
-class TestFFTCache(object):
-
-    def test_basic_behaviour(self):
-        c = _FFTCache(max_size_in_mb=1, max_item_count=4)
-
-        # Put
-        c.put_twiddle_factors(1, np.ones(2, dtype=np.float32))
-        c.put_twiddle_factors(2, np.zeros(2, dtype=np.float32))
-
-        # Get
-        assert_array_almost_equal(c.pop_twiddle_factors(1),
-                                  np.ones(2, dtype=np.float32))
-        assert_array_almost_equal(c.pop_twiddle_factors(2),
-                                  np.zeros(2, dtype=np.float32))
-
-        # Nothing should be left.
-        assert_equal(len(c._dict), 0)
-
-        # Now put everything in twice so it can be retrieved once and each will
-        # still have one item left.
-        for _ in range(2):
-            c.put_twiddle_factors(1, np.ones(2, dtype=np.float32))
-            c.put_twiddle_factors(2, np.zeros(2, dtype=np.float32))
-        assert_array_almost_equal(c.pop_twiddle_factors(1),
-                                  np.ones(2, dtype=np.float32))
-        assert_array_almost_equal(c.pop_twiddle_factors(2),
-                                  np.zeros(2, dtype=np.float32))
-        assert_equal(len(c._dict), 2)
-
-    def test_automatic_pruning(self):
-        # That's around 2600 single precision samples.
-        c = _FFTCache(max_size_in_mb=0.01, max_item_count=4)
-
-        c.put_twiddle_factors(1, np.ones(200, dtype=np.float32))
-        c.put_twiddle_factors(2, np.ones(200, dtype=np.float32))
-        assert_equal(list(c._dict.keys()), [1, 2])
-
-        # This is larger than the limit but should still be kept.
-        c.put_twiddle_factors(3, np.ones(3000, dtype=np.float32))
-        assert_equal(list(c._dict.keys()), [1, 2, 3])
-        # Add one more.
-        c.put_twiddle_factors(4, np.ones(3000, dtype=np.float32))
-        # The other three should no longer exist.
-        assert_equal(list(c._dict.keys()), [4])
-
-        # Now test the max item count pruning.
-        c = _FFTCache(max_size_in_mb=0.01, max_item_count=2)
-        c.put_twiddle_factors(2, np.empty(2))
-        c.put_twiddle_factors(1, np.empty(2))
-        # Can still be accessed.
-        assert_equal(list(c._dict.keys()), [2, 1])
-
-        c.put_twiddle_factors(3, np.empty(2))
-        # 1 and 3 can still be accessed - c[2] has been touched least recently
-        # and is thus evicted.
-        assert_equal(list(c._dict.keys()), [1, 3])
-
-        # One last test. We will add a single large item that is slightly
-        # bigger then the cache size. Some small items can still be added.
-        c = _FFTCache(max_size_in_mb=0.01, max_item_count=5)
-        c.put_twiddle_factors(1, np.ones(3000, dtype=np.float32))
-        c.put_twiddle_factors(2, np.ones(2, dtype=np.float32))
-        c.put_twiddle_factors(3, np.ones(2, dtype=np.float32))
-        c.put_twiddle_factors(4, np.ones(2, dtype=np.float32))
-        assert_equal(list(c._dict.keys()), [1, 2, 3, 4])
-
-        # One more big item. This time it is 6 smaller ones but they are
-        # counted as one big item.
-        for _ in range(6):
-            c.put_twiddle_factors(5, np.ones(500, dtype=np.float32))
-        # '1' no longer in the cache. Rest still in the cache.
-        assert_equal(list(c._dict.keys()), [2, 3, 4, 5])
-
-        # Another big item - should now be the only item in the cache.
-        c.put_twiddle_factors(6, np.ones(4000, dtype=np.float32))
-        assert_equal(list(c._dict.keys()), [6])
diff --git a/numpy/fft/tests/test_fftpack.py b/numpy/fft/tests/test_pocketfft.py
index 8d6cd8407..08f80076a 100644
--- a/numpy/fft/tests/test_fftpack.py
+++ b/numpy/fft/tests/test_pocketfft.py
@@ -1,6 +1,7 @@
 from __future__ import division, absolute_import, print_function
 
 import numpy as np
+import pytest
 from numpy.random import random
 from numpy.testing import (
         assert_array_almost_equal, assert_array_equal, assert_raises,
@@ -28,6 +29,16 @@ class TestFFTShift(object):
 
 class TestFFT1D(object):
 
+    def test_identity(self):
+        maxlen = 512
+        x = random(maxlen) + 1j*random(maxlen)
+        xr = random(maxlen)
+        for i in range(1,maxlen):
+            assert_array_almost_equal(np.fft.ifft(np.fft.fft(x[0:i])), x[0:i],
+                                      decimal=12)
+            assert_array_almost_equal(np.fft.irfft(np.fft.rfft(xr[0:i]),i),
+                                      xr[0:i], decimal=12)
+
     def test_fft(self):
         x = random(30) + 1j*random(30)
         assert_array_almost_equal(fft1(x), np.fft.fft(x))
@@ -146,6 +157,53 @@ class TestFFT1D(object):
                     assert_array_almost_equal(x_norm,
                                               np.linalg.norm(tmp))
 
+    @pytest.mark.parametrize("dtype", [np.half, np.single, np.double,
+                                       np.longdouble])
+    def test_dtypes(self, dtype):
+        # make sure that all input precisions are accepted and internally
+        # converted to 64bit
+        x = random(30).astype(dtype)
+        assert_array_almost_equal(np.fft.ifft(np.fft.fft(x)), x)
+        assert_array_almost_equal(np.fft.irfft(np.fft.rfft(x)), x)
+
+
+@pytest.mark.parametrize(
+        "dtype",
+        [np.float32, np.float64, np.complex64, np.complex128])
+@pytest.mark.parametrize("order", ["F", 'non-contiguous'])
+@pytest.mark.parametrize(
+        "fft",
+        [np.fft.fft, np.fft.fft2, np.fft.fftn,
+         np.fft.ifft, np.fft.ifft2, np.fft.ifftn])
+def test_fft_with_order(dtype, order, fft):
+    # Check that FFT/IFFT produces identical results for C, Fortran and
+    # non contiguous arrays
+    rng = np.random.RandomState(42)
+    X = rng.rand(8, 7, 13).astype(dtype, copy=False)
+    if order == 'F':
+        Y = np.asfortranarray(X)
+    else:
+        # Make a non contiguous array
+        Y = X[::-1]
+        X = np.ascontiguousarray(X[::-1])
+
+    if fft.__name__.endswith('fft'):
+        for axis in range(3):
+            X_res = fft(X, axis=axis)
+            Y_res = fft(Y, axis=axis)
+            assert_array_almost_equal(X_res, Y_res)
+    elif fft.__name__.endswith(('fft2', 'fftn')):
+        axes = [(0, 1), (1, 2), (0, 2)]
+        if fft.__name__.endswith('fftn'):
+            axes.extend([(0,), (1,), (2,), None])
+        for ax in axes:
+            X_res = fft(X, axes=ax)
+            Y_res = fft(Y, axes=ax)
+            assert_array_almost_equal(X_res, Y_res)
+    else:
+        raise ValueError
+
+
 class TestFFTThreadSafe(object):
     threads = 16
     input_shape = (800, 200)
diff --git a/numpy/lib/_datasource.py b/numpy/lib/_datasource.py
index 3a0e67f60..816f7624e 100644
--- a/numpy/lib/_datasource.py
+++ b/numpy/lib/_datasource.py
@@ -547,6 +547,11 @@ class DataSource(object):
         is accessible if it exists in either location.
 
         """
+
+        # First test for local path
+        if os.path.exists(path):
+            return True
+
         # We import this here because importing urllib2 is slow and
         # a significant fraction of numpy's total import time.
         if sys.version_info[0] >= 3:
@@ -556,10 +561,6 @@ class DataSource(object):
             from urllib2 import urlopen
             from urllib2 import URLError
 
-        # Test local path
-        if os.path.exists(path):
-            return True
-
         # Test cached url
         upath = self.abspath(path)
         if os.path.exists(upath):
diff --git a/numpy/lib/arraysetops.py b/numpy/lib/arraysetops.py
index 558150e48..b53d8c03f 100644
--- a/numpy/lib/arraysetops.py
+++ b/numpy/lib/arraysetops.py
@@ -94,8 +94,7 @@ def ediff1d(ary, to_end=None, to_begin=None):
     # force a 1d array
     ary = np.asanyarray(ary).ravel()
 
-    # we have unit tests enforcing
-    # propagation of the dtype of input
+    # enforce propagation of the dtype of input
     # ary to returned result
     dtype_req = ary.dtype
 
@@ -106,23 +105,22 @@ def ediff1d(ary, to_end=None, to_begin=None):
     if to_begin is None:
         l_begin = 0
     else:
-        to_begin = np.asanyarray(to_begin)
-        if not np.can_cast(to_begin, dtype_req):
-            raise TypeError("dtype of to_begin must be compatible "
-                            "with input ary")
-
-        to_begin = to_begin.ravel()
+        _to_begin = np.asanyarray(to_begin, dtype=dtype_req)
+        if not np.all(_to_begin == to_begin):
+            raise ValueError("cannot convert 'to_begin' to array with dtype "
+                            "'%r' as required for input ary" % dtype_req)
+        to_begin = _to_begin.ravel()
         l_begin = len(to_begin)
 
     if to_end is None:
         l_end = 0
     else:
-        to_end = np.asanyarray(to_end)
-        if not np.can_cast(to_end, dtype_req):
-            raise TypeError("dtype of to_end must be compatible "
-                            "with input ary")
-
-        to_end = to_end.ravel()
+        _to_end = np.asanyarray(to_end, dtype=dtype_req)
+        # check that casting has not overflowed
+        if not np.all(_to_end == to_end):
+            raise ValueError("cannot convert 'to_end' to array with dtype "
+                             "'%r' as required for input ary" % dtype_req)
+        to_end = _to_end.ravel()
         l_end = len(to_end)
 
     # do the calculation in place and copy to_begin and to_end
diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py
index 1ead375de..b61a64b8e 100644
--- a/numpy/lib/function_base.py
+++ b/numpy/lib/function_base.py
@@ -1150,7 +1150,7 @@ def diff(a, n=1, axis=-1, prepend=np._NoValue, append=np._NoValue):
     """
     Calculate the n-th discrete difference along the given axis.
 
-    The first difference is given by ``out[n] = a[n+1] - a[n]`` along
+    The first difference is given by ``out[i] = a[i+1] - a[i]`` along
     the given axis, higher differences are calculated by using `diff`
     recursively.
 
@@ -2593,8 +2593,6 @@ def blackman(M):
 
     Examples
     --------
-    >>> import matplotlib
-    >>> matplotlib.use('agg')
     >>> import matplotlib.pyplot as plt
     >>> np.blackman(12)
     array([-1.38777878e-17,   3.26064346e-02,   1.59903635e-01, # may vary
@@ -2811,9 +2809,6 @@ def hanning(M):
 
     Plot the window and its frequency response:
 
-    >>> import matplotlib
-    >>> import matplotlib.pyplot
-    >>> matplotlib.pyplot.switch_backend('agg')
     >>> import matplotlib.pyplot as plt
     >>> from numpy.fft import fft, fftshift
     >>> window = np.hanning(51)
@@ -2914,9 +2909,6 @@ def hamming(M):
 
     Plot the window and the frequency response:
 
-    >>> import matplotlib
-    >>> import matplotlib.pyplot
-    >>> matplotlib.pyplot.switch_backend('agg')
     >>> import matplotlib.pyplot as plt
     >>> from numpy.fft import fft, fftshift
     >>> window = np.hamming(51)
@@ -3192,8 +3184,6 @@ def kaiser(M, beta):
 
     Examples
     --------
-    >>> import matplotlib
-    >>> matplotlib.use('agg')
     >>> import matplotlib.pyplot as plt
     >>> np.kaiser(12, 14)
      array([7.72686684e-06, 3.46009194e-03, 4.65200189e-02, # may vary
@@ -3288,7 +3278,6 @@ def sinc(x):
 
     Examples
     --------
-    >>> import matplotlib
     >>> import matplotlib.pyplot as plt
     >>> x = np.linspace(-4, 4, 41)
     >>> np.sinc(x)
@@ -3967,8 +3956,6 @@ def _quantile_ureduce_func(a, q, axis=None, out=None, overwrite_input=False,
             r = add(x1, x2)
 
     if np.any(n):
-        warnings.warn("Invalid value encountered in percentile",
-                      RuntimeWarning, stacklevel=3)
         if zerod:
             if ap.ndim == 1:
                 if out is not None:
@@ -4241,7 +4228,7 @@ def delete(arr, obj, axis=None):
     arr : array_like
       Input array.
     obj : slice, int or array of ints
-      Indicate which sub-arrays to remove.
+      Indicate indices of sub-arrays to remove along the specified axis.
     axis : int, optional
       The axis along which to delete the subarray defined by `obj`.
       If `axis` is None, `obj` is applied to the flattened array.
diff --git a/numpy/lib/nanfunctions.py b/numpy/lib/nanfunctions.py
index b3bf1880b..77c851fcf 100644
--- a/numpy/lib/nanfunctions.py
+++ b/numpy/lib/nanfunctions.py
@@ -40,6 +40,33 @@ __all__ = [
     ]
 
 
+def _nan_mask(a, out=None):
+    """
+    Parameters
+    ----------
+    a : array-like
+        Input array with at least 1 dimension.
+    out : ndarray, optional
+        Alternate output array in which to place the result.  The default
+        is ``None``; if provided, it must have the same shape as the
+        expected output and will prevent the allocation of a new array.
+
+    Returns
+    -------
+    y : bool ndarray or True
+        A bool array where ``np.nan`` positions are marked with ``False``
+        and other positions are marked with ``True``. If the type of ``a``
+        is such that it can't possibly contain ``np.nan``, returns ``True``.
+    """
+    # we assume that a is an array for this private function
+
+    if a.dtype.kind not in 'fc':
+        return True
+
+    y = np.isnan(a, out=out)
+    y = np.invert(y, out=y)
+    return y
+
 def _replace_nan(a, val):
     """
     If `a` is of inexact type, make a copy of `a`, replace NaNs with
diff --git a/numpy/lib/polynomial.py b/numpy/lib/polynomial.py
index 7904092ed..b55764b5d 100644
--- a/numpy/lib/polynomial.py
+++ b/numpy/lib/polynomial.py
@@ -704,6 +704,8 @@ def polyval(p, x):
     for polynomials of high degree the values may be inaccurate due to
     rounding errors. Use carefully.
 
+    If `x` is a subtype of `ndarray` the return value will be of the same type.
+
     References
     ----------
     .. [1] I. N. Bronshtein, K. A. Semendyayev, and K. A. Hirsch (Eng.
@@ -726,7 +728,7 @@ def polyval(p, x):
     if isinstance(x, poly1d):
         y = 0
     else:
-        x = NX.asarray(x)
+        x = NX.asanyarray(x)
         y = NX.zeros_like(x)
     for i in range(len(p)):
         y = y * x + p[i]
diff --git a/numpy/lib/tests/test_arraysetops.py b/numpy/lib/tests/test_arraysetops.py
index a17fc66e5..93d4b279f 100644
--- a/numpy/lib/tests/test_arraysetops.py
+++ b/numpy/lib/tests/test_arraysetops.py
@@ -136,8 +136,8 @@ class TestSetOps(object):
          np.nan),
         # should fail because attempting
         # to downcast to smaller int type:
-        (np.array([1, 2, 3], dtype=np.int32),
-         np.array([5, 7, 2], dtype=np.int64),
+        (np.array([1, 2, 3], dtype=np.int16),
+         np.array([5, 1<<20, 2], dtype=np.int32),
          None),
         # should fail because attempting to cast
         # two special floating point values
@@ -152,8 +152,8 @@ class TestSetOps(object):
         # specifically, raise an appropriate
         # Exception when attempting to append or
         # prepend with an incompatible type
-        msg = 'must be compatible'
-        with assert_raises_regex(TypeError, msg):
+        msg = 'cannot convert'
+        with assert_raises_regex(ValueError, msg):
             ediff1d(ary=ary,
                     to_end=append,
                     to_begin=prepend)
diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py
index 3d4b0e3b2..d9a97db1b 100644
--- a/numpy/lib/tests/test_function_base.py
+++ b/numpy/lib/tests/test_function_base.py
@@ -4,6 +4,7 @@ import operator
 import warnings
 import sys
 import decimal
+import types
 import pytest
 
 import numpy as np
@@ -24,6 +25,7 @@ from numpy.lib import (
 
 from numpy.compat import long
 
+PY2 = sys.version_info[0] == 2
 
 def get_mat(n):
     data = np.arange(n)
@@ -353,9 +355,9 @@ class TestAverage(object):
         assert_equal(type(np.average(a, weights=w)), subclass)
 
     def test_upcasting(self):
-        types = [('i4', 'i4', 'f8'), ('i4', 'f4', 'f8'), ('f4', 'i4', 'f8'),
+        typs = [('i4', 'i4', 'f8'), ('i4', 'f4', 'f8'), ('f4', 'i4', 'f8'),
                  ('f4', 'f4', 'f4'), ('f4', 'f8', 'f8')]
-        for at, wt, rt in types:
+        for at, wt, rt in typs:
             a = np.array([[1,2],[3,4]], dtype=at)
             w = np.array([[1,2],[3,4]], dtype=wt)
             assert_equal(np.average(a, weights=w).dtype, np.dtype(rt))
@@ -1498,6 +1500,49 @@ class TestVectorize(object):
             f(x)
 
 
+class TestLeaks(object):
+    class A(object):
+        iters = 20
+
+        def bound(self, *args):
+            return 0
+
+        @staticmethod
+        def unbound(*args):
+            return 0
+
+    @pytest.mark.skipif(not HAS_REFCOUNT, reason="Python lacks refcounts")
+    @pytest.mark.parametrize('name, incr', [
+            ('bound', A.iters),
+            ('unbound', 0),
+            ])
+    def test_frompyfunc_leaks(self, name, incr):
+        # exposed in gh-11867 as np.vectorized, but the problem stems from
+        # frompyfunc.
+        # class.attribute = np.frompyfunc(<method>) creates a
+        # reference cycle if <method> is a bound class method. It requires a
+        # gc collection cycle to break the cycle (on CPython 3)
+        import gc
+        A_func = getattr(self.A, name)
+        gc.disable()
+        try:
+            refcount = sys.getrefcount(A_func)
+            for i in range(self.A.iters):
+                a = self.A()
+                a.f = np.frompyfunc(getattr(a, name), 1, 1)
+                out = a.f(np.arange(10))
+            a = None
+            if PY2:
+                assert_equal(sys.getrefcount(A_func), refcount)
+            else:
+                # A.func is part of a reference cycle if incr is non-zero
+                assert_equal(sys.getrefcount(A_func), refcount + incr)
+            for i in range(5):
+                gc.collect()
+            assert_equal(sys.getrefcount(A_func), refcount)
+        finally:
+            gc.enable()
+
 class TestDigitize(object):
 
     def test_forward(self):
@@ -2391,11 +2436,8 @@ class TestPercentile(object):
         assert_equal(np.percentile(x, 100), 3.5)
         assert_equal(np.percentile(x, 50), 1.75)
         x[1] = np.nan
-        with warnings.catch_warnings(record=True) as w:
-            warnings.filterwarnings('always', '', RuntimeWarning)
-            assert_equal(np.percentile(x, 0), np.nan)
-            assert_equal(np.percentile(x, 0, interpolation='nearest'), np.nan)
-            assert_(w[0].category is RuntimeWarning)
+        assert_equal(np.percentile(x, 0), np.nan)
+        assert_equal(np.percentile(x, 0, interpolation='nearest'), np.nan)
 
     def test_api(self):
         d = np.ones(5)
@@ -2733,85 +2775,63 @@ class TestPercentile(object):
     def test_nan_behavior(self):
         a = np.arange(24, dtype=float)
         a[2] = np.nan
-        with warnings.catch_warnings(record=True) as w:
-            warnings.filterwarnings('always', '', RuntimeWarning)
-            assert_equal(np.percentile(a, 0.3), np.nan)
-            assert_equal(np.percentile(a, 0.3, axis=0), np.nan)
-            assert_equal(np.percentile(a, [0.3, 0.6], axis=0),
-                         np.array([np.nan] * 2))
-            assert_(w[0].category is RuntimeWarning)
-            assert_(w[1].category is RuntimeWarning)
-            assert_(w[2].category is RuntimeWarning)
+        assert_equal(np.percentile(a, 0.3), np.nan)
+        assert_equal(np.percentile(a, 0.3, axis=0), np.nan)
+        assert_equal(np.percentile(a, [0.3, 0.6], axis=0),
+                     np.array([np.nan] * 2))
 
         a = np.arange(24, dtype=float).reshape(2, 3, 4)
         a[1, 2, 3] = np.nan
         a[1, 1, 2] = np.nan
 
         # no axis
-        with warnings.catch_warnings(record=True) as w:
-            warnings.filterwarnings('always', '', RuntimeWarning)
-            assert_equal(np.percentile(a, 0.3), np.nan)
-            assert_equal(np.percentile(a, 0.3).ndim, 0)
-            assert_(w[0].category is RuntimeWarning)
+        assert_equal(np.percentile(a, 0.3), np.nan)
+        assert_equal(np.percentile(a, 0.3).ndim, 0)
 
         # axis0 zerod
         b = np.percentile(np.arange(24, dtype=float).reshape(2, 3, 4), 0.3, 0)
         b[2, 3] = np.nan
         b[1, 2] = np.nan
-        with warnings.catch_warnings(record=True) as w:
-            warnings.filterwarnings('always', '', RuntimeWarning)
-            assert_equal(np.percentile(a, 0.3, 0), b)
+        assert_equal(np.percentile(a, 0.3, 0), b)
 
         # axis0 not zerod
         b = np.percentile(np.arange(24, dtype=float).reshape(2, 3, 4),
                           [0.3, 0.6], 0)
         b[:, 2, 3] = np.nan
         b[:, 1, 2] = np.nan
-        with warnings.catch_warnings(record=True) as w:
-            warnings.filterwarnings('always', '', RuntimeWarning)
-            assert_equal(np.percentile(a, [0.3, 0.6], 0), b)
+        assert_equal(np.percentile(a, [0.3, 0.6], 0), b)
 
         # axis1 zerod
         b = np.percentile(np.arange(24, dtype=float).reshape(2, 3, 4), 0.3, 1)
         b[1, 3] = np.nan
         b[1, 2] = np.nan
-        with warnings.catch_warnings(record=True) as w:
-            warnings.filterwarnings('always', '', RuntimeWarning)
-            assert_equal(np.percentile(a, 0.3, 1), b)
+        assert_equal(np.percentile(a, 0.3, 1), b)
         # axis1 not zerod
         b = np.percentile(
             np.arange(24, dtype=float).reshape(2, 3, 4), [0.3, 0.6], 1)
         b[:, 1, 3] = np.nan
         b[:, 1, 2] = np.nan
-        with warnings.catch_warnings(record=True) as w:
-            warnings.filterwarnings('always', '', RuntimeWarning)
-            assert_equal(np.percentile(a, [0.3, 0.6], 1), b)
+        assert_equal(np.percentile(a, [0.3, 0.6], 1), b)
 
         # axis02 zerod
         b = np.percentile(
             np.arange(24, dtype=float).reshape(2, 3, 4), 0.3, (0, 2))
         b[1] = np.nan
         b[2] = np.nan
-        with warnings.catch_warnings(record=True) as w:
-            warnings.filterwarnings('always', '', RuntimeWarning)
-            assert_equal(np.percentile(a, 0.3, (0, 2)), b)
+        assert_equal(np.percentile(a, 0.3, (0, 2)), b)
         # axis02 not zerod
         b = np.percentile(np.arange(24, dtype=float).reshape(2, 3, 4),
                           [0.3, 0.6], (0, 2))
         b[:, 1] = np.nan
         b[:, 2] = np.nan
-        with warnings.catch_warnings(record=True) as w:
-            warnings.filterwarnings('always', '', RuntimeWarning)
-            assert_equal(np.percentile(a, [0.3, 0.6], (0, 2)), b)
+        assert_equal(np.percentile(a, [0.3, 0.6], (0, 2)), b)
         # axis02 not zerod with nearest interpolation
         b = np.percentile(np.arange(24, dtype=float).reshape(2, 3, 4),
                           [0.3, 0.6], (0, 2), interpolation='nearest')
         b[:, 1] = np.nan
         b[:, 2] = np.nan
-        with warnings.catch_warnings(record=True) as w:
-            warnings.filterwarnings('always', '', RuntimeWarning)
-            assert_equal(np.percentile(
-                a, [0.3, 0.6], (0, 2), interpolation='nearest'), b)
+        assert_equal(np.percentile(
+            a, [0.3, 0.6], (0, 2), interpolation='nearest'), b)
 
 
 class TestQuantile(object):
@@ -2858,10 +2878,7 @@ class TestMedian(object):
         # check array scalar result
         assert_equal(np.median(a).ndim, 0)
         a[1] = np.nan
-        with warnings.catch_warnings(record=True) as w:
-            warnings.filterwarnings('always', '', RuntimeWarning)
-            assert_equal(np.median(a).ndim, 0)
-            assert_(w[0].category is RuntimeWarning)
+        assert_equal(np.median(a).ndim, 0)
 
     def test_axis_keyword(self):
         a3 = np.array([[2, 3],
@@ -2960,58 +2977,43 @@ class TestMedian(object):
     def test_nan_behavior(self):
         a = np.arange(24, dtype=float)
         a[2] = np.nan
-        with warnings.catch_warnings(record=True) as w:
-            warnings.filterwarnings('always', '', RuntimeWarning)
-            assert_equal(np.median(a), np.nan)
-            assert_equal(np.median(a, axis=0), np.nan)
-            assert_(w[0].category is RuntimeWarning)
-            assert_(w[1].category is RuntimeWarning)
+        assert_equal(np.median(a), np.nan)
+        assert_equal(np.median(a, axis=0), np.nan)
 
         a = np.arange(24, dtype=float).reshape(2, 3, 4)
         a[1, 2, 3] = np.nan
         a[1, 1, 2] = np.nan
 
         # no axis
-        with warnings.catch_warnings(record=True) as w:
-            warnings.filterwarnings('always', '', RuntimeWarning)
-            assert_equal(np.median(a), np.nan)
-            assert_equal(np.median(a).ndim, 0)
-            assert_(w[0].category is RuntimeWarning)
+        assert_equal(np.median(a), np.nan)
+        assert_equal(np.median(a).ndim, 0)
 
         # axis0
         b = np.median(np.arange(24, dtype=float).reshape(2, 3, 4), 0)
         b[2, 3] = np.nan
         b[1, 2] = np.nan
-        with warnings.catch_warnings(record=True) as w:
-            warnings.filterwarnings('always', '', RuntimeWarning)
-            assert_equal(np.median(a, 0), b)
-            assert_equal(len(w), 1)
+        assert_equal(np.median(a, 0), b)
 
         # axis1
         b = np.median(np.arange(24, dtype=float).reshape(2, 3, 4), 1)
         b[1, 3] = np.nan
         b[1, 2] = np.nan
-        with warnings.catch_warnings(record=True) as w:
-            warnings.filterwarnings('always', '', RuntimeWarning)
-            assert_equal(np.median(a, 1), b)
-            assert_equal(len(w), 1)
+        assert_equal(np.median(a, 1), b)
 
         # axis02
         b = np.median(np.arange(24, dtype=float).reshape(2, 3, 4), (0, 2))
         b[1] = np.nan
         b[2] = np.nan
-        with warnings.catch_warnings(record=True) as w:
-            warnings.filterwarnings('always', '', RuntimeWarning)
-            assert_equal(np.median(a, (0, 2)), b)
-            assert_equal(len(w), 1)
+        assert_equal(np.median(a, (0, 2)), b)
 
     def test_empty(self):
-        # empty arrays
+        # mean(empty array) emits two warnings: empty slice and divide by 0
         a = np.array([], dtype=float)
         with warnings.catch_warnings(record=True) as w:
             warnings.filterwarnings('always', '', RuntimeWarning)
             assert_equal(np.median(a), np.nan)
             assert_(w[0].category is RuntimeWarning)
+            assert_equal(len(w), 2)
 
         # multiple dimensions
         a = np.array([], dtype=float, ndmin=3)
diff --git a/numpy/lib/tests/test_nanfunctions.py b/numpy/lib/tests/test_nanfunctions.py
index 504372faf..b7261c63f 100644
--- a/numpy/lib/tests/test_nanfunctions.py
+++ b/numpy/lib/tests/test_nanfunctions.py
@@ -1,8 +1,10 @@
 from __future__ import division, absolute_import, print_function
 
 import warnings
+import pytest
 
 import numpy as np
+from numpy.lib.nanfunctions import _nan_mask
 from numpy.testing import (
     assert_, assert_equal, assert_almost_equal, assert_no_warnings,
     assert_raises, assert_array_equal, suppress_warnings
@@ -925,3 +927,29 @@ class TestNanFunctions_Quantile(object):
         p = p.tolist()
         np.nanquantile(np.arange(100.), p, interpolation="midpoint")
         assert_array_equal(p, p0)
+
+@pytest.mark.parametrize("arr, expected", [
+    # array of floats with some nans
+    (np.array([np.nan, 5.0, np.nan, np.inf]),
+     np.array([False, True, False, True])),
+    # int64 array that can't possibly have nans
+    (np.array([1, 5, 7, 9], dtype=np.int64),
+     True),
+    # bool array that can't possibly have nans
+    (np.array([False, True, False, True]),
+     True),
+    # 2-D complex array with nans
+    (np.array([[np.nan, 5.0],
+               [np.nan, np.inf]], dtype=np.complex64),
+     np.array([[False, True],
+               [False, True]])),
+    ])
+def test__nan_mask(arr, expected):
+    for out in [None, np.empty(arr.shape, dtype=np.bool_)]:
+        actual = _nan_mask(arr, out=out)
+        assert_equal(actual, expected)
+        # the above won't distinguish between True proper
+        # and an array of True values; we want True proper
+        # for types that can't possibly contain NaN
+        if type(expected) is not np.ndarray:
+            assert actual is True
diff --git a/numpy/lib/tests/test_recfunctions.py b/numpy/lib/tests/test_recfunctions.py
index 11f8a5afa..069693613 100644
--- a/numpy/lib/tests/test_recfunctions.py
+++ b/numpy/lib/tests/test_recfunctions.py
@@ -221,9 +221,9 @@ class TestRecFunctions(object):
                          ( 5, ( 6.,  7), [ 8.,  9.]),
                          (10, (11., 12), [13., 14.]),
                          (15, (16., 17), [18., 19.])],
-                     dtype=[('a', '<i4'),
-                            ('b', [('f0', '<f4'), ('f1', '<u2')]),
-                            ('c', '<f4', (2,))])
+                     dtype=[('a', 'i4'),
+                            ('b', [('f0', 'f4'), ('f1', 'u2')]),
+                            ('c', 'f4', (2,))])
         assert_equal(out, want)
 
         d = np.array([(1, 2, 5), (4, 5, 7), (7, 8 ,11), (10, 11, 12)],
diff --git a/numpy/lib/twodim_base.py b/numpy/lib/twodim_base.py
index 54d0240ef..e165c9b02 100644
--- a/numpy/lib/twodim_base.py
+++ b/numpy/lib/twodim_base.py
@@ -644,10 +644,7 @@ def histogram2d(x, y, bins=10, range=None, normed=None, weights=None,
 
     Examples
     --------
-    >>> import matplotlib
-    >>> import matplotlib.pyplot
-    >>> matplotlib.pyplot.switch_backend('agg')
-    >>> import matplotlib as mpl
+    >>> from matplotlib.image import NonUniformImage
     >>> import matplotlib.pyplot as plt
 
     Construct a 2-D histogram with variable bin width. First define the bin
@@ -684,7 +681,7 @@ def histogram2d(x, y, bins=10, range=None, normed=None, weights=None,
 
     >>> ax = fig.add_subplot(133, title='NonUniformImage: interpolated',
     ...         aspect='equal', xlim=xedges[[0, -1]], ylim=yedges[[0, -1]])
-    >>> im = mpl.image.NonUniformImage(ax, interpolation='bilinear')
+    >>> im = NonUniformImage(ax, interpolation='bilinear')
     >>> xcenters = (xedges[:-1] + xedges[1:]) / 2
     >>> ycenters = (yedges[:-1] + yedges[1:]) / 2
     >>> im.set_data(xcenters, ycenters, H)
diff --git a/numpy/lib/utils.py b/numpy/lib/utils.py
index 5a4cae235..6b112f37a 100644
--- a/numpy/lib/utils.py
+++ b/numpy/lib/utils.py
@@ -1140,17 +1140,12 @@ def _median_nancheck(data, result, axis, out):
         n = n.filled(False)
     if result.ndim == 0:
         if n == True:
-            warnings.warn("Invalid value encountered in median",
-                          RuntimeWarning, stacklevel=3)
             if out is not None:
                 out[...] = data.dtype.type(np.nan)
                 result = out
             else:
                 result = data.dtype.type(np.nan)
     elif np.count_nonzero(n.ravel()) > 0:
-        warnings.warn("Invalid value encountered in median for" +
-                      " %d results" % np.count_nonzero(n.ravel()),
-                      RuntimeWarning, stacklevel=3)
         result[n] = np.nan
     return result
 
diff --git a/numpy/linalg/linalg.py b/numpy/linalg/linalg.py
index 92fa6cb73..17e84be2d 100644
--- a/numpy/linalg/linalg.py
+++ b/numpy/linalg/linalg.py
@@ -26,7 +26,7 @@ from numpy.core import (
     add, multiply, sqrt, fastCopyAndTranspose, sum, isfinite,
     finfo, errstate, geterrobj, moveaxis, amin, amax, product, abs,
     atleast_2d, intp, asanyarray, object_, matmul,
-    swapaxes, divide, count_nonzero, isnan
+    swapaxes, divide, count_nonzero, isnan, sign
 )
 from numpy.core.multiarray import normalize_axis_index
 from numpy.core.overrides import set_module
@@ -1461,12 +1461,12 @@ def eigh(a, UPLO='L'):
 
 # Singular value decomposition
 
-def _svd_dispatcher(a, full_matrices=None, compute_uv=None):
+def _svd_dispatcher(a, full_matrices=None, compute_uv=None, hermitian=None):
     return (a,)
 
 
 @array_function_dispatch(_svd_dispatcher)
-def svd(a, full_matrices=True, compute_uv=True):
+def svd(a, full_matrices=True, compute_uv=True, hermitian=False):
     """
     Singular Value Decomposition.
 
@@ -1504,6 +1504,12 @@ def svd(a, full_matrices=True, compute_uv=True):
         size as those of the input `a`. The size of the last two dimensions
         depends on the value of `full_matrices`. Only returned when
         `compute_uv` is True.
+    hermitian : bool, optional
+        If True, `a` is assumed to be Hermitian (symmetric if real-valued),
+        enabling a more efficient method for finding singular values.
+        Defaults to False.
+
+        ..versionadded:: 1.17.0
 
     Raises
     ------
@@ -1590,6 +1596,24 @@ def svd(a, full_matrices=True, compute_uv=True):
 
     """
     a, wrap = _makearray(a)
+
+    if hermitian:
+        # note: lapack returns eigenvalues in reverse order to our contract.
+        # reversing is cheap by design in numpy, so we do so to be consistent
+        if compute_uv:
+            s, u = eigh(a)
+            s = s[..., ::-1]
+            u = u[..., ::-1]
+            # singular values are unsigned, move the sign into v
+            vt = transpose(u * sign(s)[..., None, :]).conjugate()
+            s = abs(s)
+            return wrap(u), s, wrap(vt)
+        else:
+            s = eigvalsh(a)
+            s = s[..., ::-1]
+            s = abs(s)
+            return s
+
     _assertRankAtLeast2(a)
     t, result_t = _commonType(a)
 
@@ -1844,10 +1868,7 @@ def matrix_rank(M, tol=None, hermitian=False):
     M = asarray(M)
     if M.ndim < 2:
         return int(not all(M==0))
-    if hermitian:
-        S = abs(eigvalsh(M))
-    else:
-        S = svd(M, compute_uv=False)
+    S = svd(M, compute_uv=False, hermitian=hermitian)
     if tol is None:
         tol = S.max(axis=-1, keepdims=True) * max(M.shape[-2:]) * finfo(S.dtype).eps
     else:
@@ -1857,12 +1878,12 @@ def matrix_rank(M, tol=None, hermitian=False):
 
 # Generalized inverse
 
-def _pinv_dispatcher(a, rcond=None):
+def _pinv_dispatcher(a, rcond=None, hermitian=None):
     return (a,)
 
 
 @array_function_dispatch(_pinv_dispatcher)
-def pinv(a, rcond=1e-15):
+def pinv(a, rcond=1e-15, hermitian=False):
     """
     Compute the (Moore-Penrose) pseudo-inverse of a matrix.
 
@@ -1882,6 +1903,12 @@ def pinv(a, rcond=1e-15):
         Singular values smaller (in modulus) than
         `rcond` * largest_singular_value (again, in modulus)
         are set to zero. Broadcasts against the stack of matrices
+    hermitian : bool, optional
+        If True, `a` is assumed to be Hermitian (symmetric if real-valued),
+        enabling a more efficient method for finding singular values.
+        Defaults to False.
+
+        ..versionadded:: 1.17.0
 
     Returns
     -------
@@ -1935,7 +1962,7 @@ def pinv(a, rcond=1e-15):
         res = empty(a.shape[:-2] + (n, m), dtype=a.dtype)
         return wrap(res)
     a = a.conjugate()
-    u, s, vt = svd(a, full_matrices=False)
+    u, s, vt = svd(a, full_matrices=False, hermitian=hermitian)
 
     # discard small singular values
     cutoff = rcond[..., newaxis] * amax(s, axis=-1, keepdims=True)
diff --git a/numpy/linalg/tests/test_linalg.py b/numpy/linalg/tests/test_linalg.py
index 235488c6e..831c059d0 100644
--- a/numpy/linalg/tests/test_linalg.py
+++ b/numpy/linalg/tests/test_linalg.py
@@ -633,6 +633,20 @@ class TestEig(EigCases):
         assert_(isinstance(a, np.ndarray))
 
 
+class SVDBaseTests(object):
+    hermitian = False
+
+    @pytest.mark.parametrize('dtype', [single, double, csingle, cdouble])
+    def test_types(self, dtype):
+        x = np.array([[1, 0.5], [0.5, 1]], dtype=dtype)
+        u, s, vh = linalg.svd(x)
+        assert_equal(u.dtype, dtype)
+        assert_equal(s.dtype, get_real_dtype(dtype))
+        assert_equal(vh.dtype, dtype)
+        s = linalg.svd(x, compute_uv=False, hermitian=self.hermitian)
+        assert_equal(s.dtype, get_real_dtype(dtype))
+
+
 class SVDCases(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase):
 
     def do(self, a, b, tags):
@@ -644,32 +658,37 @@ class SVDCases(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase):
         assert_(consistent_subclass(vt, a))
 
 
-class TestSVD(SVDCases):
-    @pytest.mark.parametrize('dtype', [single, double, csingle, cdouble])
-    def test_types(self, dtype):
-        x = np.array([[1, 0.5], [0.5, 1]], dtype=dtype)
-        u, s, vh = linalg.svd(x)
-        assert_equal(u.dtype, dtype)
-        assert_equal(s.dtype, get_real_dtype(dtype))
-        assert_equal(vh.dtype, dtype)
-        s = linalg.svd(x, compute_uv=False)
-        assert_equal(s.dtype, get_real_dtype(dtype))
-
+class TestSVD(SVDCases, SVDBaseTests):
     def test_empty_identity(self):
         """ Empty input should put an identity matrix in u or vh """
         x = np.empty((4, 0))
-        u, s, vh = linalg.svd(x, compute_uv=True)
+        u, s, vh = linalg.svd(x, compute_uv=True, hermitian=self.hermitian)
         assert_equal(u.shape, (4, 4))
         assert_equal(vh.shape, (0, 0))
         assert_equal(u, np.eye(4))
 
         x = np.empty((0, 4))
-        u, s, vh = linalg.svd(x, compute_uv=True)
+        u, s, vh = linalg.svd(x, compute_uv=True, hermitian=self.hermitian)
         assert_equal(u.shape, (0, 0))
         assert_equal(vh.shape, (4, 4))
         assert_equal(vh, np.eye(4))
 
 
+class SVDHermitianCases(HermitianTestCase, HermitianGeneralizedTestCase):
+
+    def do(self, a, b, tags):
+        u, s, vt = linalg.svd(a, 0, hermitian=True)
+        assert_allclose(a, dot_generalized(np.asarray(u) * np.asarray(s)[..., None, :],
+                                           np.asarray(vt)),
+                        rtol=get_rtol(u.dtype))
+        assert_(consistent_subclass(u, a))
+        assert_(consistent_subclass(vt, a))
+
+
+class TestSVDHermitian(SVDHermitianCases, SVDBaseTests):
+    hermitian = True
+
+
 class CondCases(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase):
     # cond(x, p) for p in (None, 2, -2)
 
@@ -797,6 +816,20 @@ class TestPinv(PinvCases):
     pass
 
 
+class PinvHermitianCases(HermitianTestCase, HermitianGeneralizedTestCase):
+
+    def do(self, a, b, tags):
+        a_ginv = linalg.pinv(a, hermitian=True)
+        # `a @ a_ginv == I` does not hold if a is singular
+        dot = dot_generalized
+        assert_almost_equal(dot(dot(a, a_ginv), a), a, single_decimal=5, double_decimal=11)
+        assert_(consistent_subclass(a_ginv, a))
+
+
+class TestPinvHermitian(PinvHermitianCases):
+    pass
+
+
 class DetCases(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase):
 
     def do(self, a, b, tags):
@@ -1962,3 +1995,10 @@ class TestTensorinv(object):
         ainv = linalg.tensorinv(a, ind=1)
         b = np.ones(24)
         assert_allclose(np.tensordot(ainv, b, 1), np.linalg.tensorsolve(a, b))
+
+
+def test_unsupported_commontype():
+    # linalg gracefully handles unsupported type
+    arr = np.array([[1, -2], [2, 5]], dtype='float16')
+    with assert_raises_regex(TypeError, "unsupported in linalg"):
+        linalg.cholesky(arr)
diff --git a/numpy/ma/tests/test_extras.py b/numpy/ma/tests/test_extras.py
index 5243cf714..afcfd126e 100644
--- a/numpy/ma/tests/test_extras.py
+++ b/numpy/ma/tests/test_extras.py
@@ -891,61 +891,51 @@ class TestMedian(object):
                            expected)
 
     def test_nan(self):
-        with suppress_warnings() as w:
-            w.record(RuntimeWarning)
-            for mask in (False, np.zeros(6, dtype=bool)):
-                dm = np.ma.array([[1, np.nan, 3], [1, 2, 3]])
-                dm.mask = mask
-
-                # scalar result
-                r = np.ma.median(dm, axis=None)
-                assert_(np.isscalar(r))
-                assert_array_equal(r, np.nan)
-                r = np.ma.median(dm.ravel(), axis=0)
-                assert_(np.isscalar(r))
-                assert_array_equal(r, np.nan)
-
-                r = np.ma.median(dm, axis=0)
-                assert_equal(type(r), MaskedArray)
-                assert_array_equal(r, [1, np.nan, 3])
-                r = np.ma.median(dm, axis=1)
-                assert_equal(type(r), MaskedArray)
-                assert_array_equal(r, [np.nan, 2])
-                r = np.ma.median(dm, axis=-1)
-                assert_equal(type(r), MaskedArray)
-                assert_array_equal(r, [np.nan, 2])
-
+        for mask in (False, np.zeros(6, dtype=bool)):
             dm = np.ma.array([[1, np.nan, 3], [1, 2, 3]])
-            dm[:, 2] = np.ma.masked
-            assert_array_equal(np.ma.median(dm, axis=None), np.nan)
-            assert_array_equal(np.ma.median(dm, axis=0), [1, np.nan, 3])
-            assert_array_equal(np.ma.median(dm, axis=1), [np.nan, 1.5])
-            assert_equal([x.category is RuntimeWarning for x in w.log],
-                         [True]*13)
+            dm.mask = mask
+
+            # scalar result
+            r = np.ma.median(dm, axis=None)
+            assert_(np.isscalar(r))
+            assert_array_equal(r, np.nan)
+            r = np.ma.median(dm.ravel(), axis=0)
+            assert_(np.isscalar(r))
+            assert_array_equal(r, np.nan)
+
+            r = np.ma.median(dm, axis=0)
+            assert_equal(type(r), MaskedArray)
+            assert_array_equal(r, [1, np.nan, 3])
+            r = np.ma.median(dm, axis=1)
+            assert_equal(type(r), MaskedArray)
+            assert_array_equal(r, [np.nan, 2])
+            r = np.ma.median(dm, axis=-1)
+            assert_equal(type(r), MaskedArray)
+            assert_array_equal(r, [np.nan, 2])
+
+        dm = np.ma.array([[1, np.nan, 3], [1, 2, 3]])
+        dm[:, 2] = np.ma.masked
+        assert_array_equal(np.ma.median(dm, axis=None), np.nan)
+        assert_array_equal(np.ma.median(dm, axis=0), [1, np.nan, 3])
+        assert_array_equal(np.ma.median(dm, axis=1), [np.nan, 1.5])
 
     def test_out_nan(self):
-        with warnings.catch_warnings(record=True):
-            warnings.filterwarnings('always', '', RuntimeWarning)
-            o = np.ma.masked_array(np.zeros((4,)))
-            d = np.ma.masked_array(np.ones((3, 4)))
-            d[2, 1] = np.nan
-            d[2, 2] = np.ma.masked
-            assert_equal(np.ma.median(d, 0, out=o), o)
-            o = np.ma.masked_array(np.zeros((3,)))
-            assert_equal(np.ma.median(d, 1, out=o), o)
-            o = np.ma.masked_array(np.zeros(()))
-            assert_equal(np.ma.median(d, out=o), o)
+        o = np.ma.masked_array(np.zeros((4,)))
+        d = np.ma.masked_array(np.ones((3, 4)))
+        d[2, 1] = np.nan
+        d[2, 2] = np.ma.masked
+        assert_equal(np.ma.median(d, 0, out=o), o)
+        o = np.ma.masked_array(np.zeros((3,)))
+        assert_equal(np.ma.median(d, 1, out=o), o)
+        o = np.ma.masked_array(np.zeros(()))
+        assert_equal(np.ma.median(d, out=o), o)
 
     def test_nan_behavior(self):
         a = np.ma.masked_array(np.arange(24, dtype=float))
         a[::3] = np.ma.masked
         a[2] = np.nan
-        with suppress_warnings() as w:
-            w.record(RuntimeWarning)
-            assert_array_equal(np.ma.median(a), np.nan)
-            assert_array_equal(np.ma.median(a, axis=0), np.nan)
-            assert_(w.log[0].category is RuntimeWarning)
-            assert_(w.log[1].category is RuntimeWarning)
+        assert_array_equal(np.ma.median(a), np.nan)
+        assert_array_equal(np.ma.median(a, axis=0), np.nan)
 
         a = np.ma.masked_array(np.arange(24, dtype=float).reshape(2, 3, 4))
         a.mask = np.arange(a.size) % 2 == 1
@@ -954,39 +944,26 @@ class TestMedian(object):
         a[1, 1, 2] = np.nan
 
         # no axis
-        with suppress_warnings() as w:
-            w.record(RuntimeWarning)
-            warnings.filterwarnings('always', '', RuntimeWarning)
-            assert_array_equal(np.ma.median(a), np.nan)
-            assert_(np.isscalar(np.ma.median(a)))
-            assert_(w.log[0].category is RuntimeWarning)
+        assert_array_equal(np.ma.median(a), np.nan)
+        assert_(np.isscalar(np.ma.median(a)))
 
         # axis0
         b = np.ma.median(aorig, axis=0)
         b[2, 3] = np.nan
         b[1, 2] = np.nan
-        with warnings.catch_warnings(record=True) as w:
-            warnings.filterwarnings('always', '', RuntimeWarning)
-            assert_equal(np.ma.median(a, 0), b)
-            assert_equal(len(w), 1)
+        assert_equal(np.ma.median(a, 0), b)
 
         # axis1
         b = np.ma.median(aorig, axis=1)
         b[1, 3] = np.nan
         b[1, 2] = np.nan
-        with warnings.catch_warnings(record=True) as w:
-            warnings.filterwarnings('always', '', RuntimeWarning)
-            assert_equal(np.ma.median(a, 1), b)
-            assert_equal(len(w), 1)
+        assert_equal(np.ma.median(a, 1), b)
 
         # axis02
         b = np.ma.median(aorig, axis=(0, 2))
         b[1] = np.nan
         b[2] = np.nan
-        with warnings.catch_warnings(record=True) as w:
-            warnings.filterwarnings('always', '', RuntimeWarning)
-            assert_equal(np.ma.median(a, (0, 2)), b)
-            assert_equal(len(w), 1)
+        assert_equal(np.ma.median(a, (0, 2)), b)
 
     def test_ambigous_fill(self):
         # 255 is max value, used as filler for sort
diff --git a/numpy/polynomial/tests/test_polynomial.py b/numpy/polynomial/tests/test_polynomial.py
index 0c93be278..562aa904d 100644
--- a/numpy/polynomial/tests/test_polynomial.py
+++ b/numpy/polynomial/tests/test_polynomial.py
@@ -9,7 +9,7 @@ import numpy as np
 import numpy.polynomial.polynomial as poly
 from numpy.testing import (
     assert_almost_equal, assert_raises, assert_equal, assert_,
-    )
+    assert_array_equal)
 
 
 def trim(x):
@@ -147,6 +147,19 @@ class TestEvaluation(object):
             assert_equal(poly.polyval(x, [1, 0]).shape, dims)
             assert_equal(poly.polyval(x, [1, 0, 0]).shape, dims)
 
+        #check masked arrays are processed correctly
+        mask = [False, True, False]
+        mx = np.ma.array([1, 2, 3], mask=mask)
+        res = np.polyval([7, 5, 3], mx)
+        assert_array_equal(res.mask, mask)
+
+        #check subtypes of ndarray are preserved
+        class C(np.ndarray):
+            pass
+
+        cx = np.array([1, 2, 3]).view(C)
+        assert_equal(type(np.polyval([2, 3, 4], cx)), C)
+
     def test_polyvalfromroots(self):
         # check exception for broadcasting x values over root array with
         # too few dimensions
diff --git a/numpy/random/mtrand/mtrand.pyx b/numpy/random/mtrand/mtrand.pyx
index e4a401b24..f922c59a5 100644
--- a/numpy/random/mtrand/mtrand.pyx
+++ b/numpy/random/mtrand/mtrand.pyx
@@ -1141,9 +1141,12 @@ cdef class RandomState:
                 raise ValueError("'p' must be 1-dimensional")
             if p.size != pop_size:
                 raise ValueError("'a' and 'p' must have same size")
+            p_sum = kahan_sum(pix, d)
+            if np.isnan(p_sum):
+                raise ValueError("probabilities contain NaN")
             if np.logical_or.reduce(p < 0):
                 raise ValueError("probabilities are not non-negative")
-            if abs(kahan_sum(pix, d) - 1.) > atol:
+            if abs(p_sum - 1.) > atol:
                 raise ValueError("probabilities do not sum to 1")
 
         shape = size
@@ -2168,7 +2171,6 @@ cdef class RandomState:
         >>> NF = np.histogram(nc_vals, bins=50, density=True)
         >>> c_vals = np.random.f(dfnum, dfden, 1000000)
         >>> F = np.histogram(c_vals, bins=50, density=True)
-        >>> import matplotlib
         >>> import matplotlib.pyplot as plt
         >>> plt.plot(F[1][1:], F[0])
         >>> plt.plot(NF[1][1:], NF[0])
@@ -2447,7 +2449,6 @@ cdef class RandomState:
         --------
         Draw samples and plot the distribution:
 
-        >>> import matplotlib
         >>> import matplotlib.pyplot as plt
         >>> s = np.random.standard_cauchy(1000000)
         >>> s = s[(s>-25) & (s<25)]  # truncate distribution so it plots well
@@ -3285,7 +3286,6 @@ cdef class RandomState:
 
         >>> loc, scale = 10, 1
         >>> s = np.random.logistic(loc, scale, 10000)
-        >>> import matplotlib
         >>> import matplotlib.pyplot as plt
         >>> count, bins, ignored = plt.hist(s, bins=50)
 
@@ -3487,7 +3487,6 @@ cdef class RandomState:
         --------
         Draw values from the distribution and plot the histogram
 
-        >>> import matplotlib
         >>> from matplotlib.pyplot import hist
         >>> values = hist(np.random.rayleigh(3, 100000), bins=200, density=True)
 
@@ -4243,7 +4242,6 @@ cdef class RandomState:
         >>> ngood, nbad, nsamp = 100, 2, 10
         # number of good, number of bad, and number of samples
         >>> s = np.random.hypergeometric(ngood, nbad, nsamp, 1000)
-        >>> import matplotlib
         >>> from matplotlib.pyplot import hist
         >>> hist(s)
         #   note that it is very unlikely to grab both bad items
@@ -4354,7 +4352,6 @@ cdef class RandomState:
 
         >>> a = .6
         >>> s = np.random.logseries(a, 10000)
-        >>> import matplotlib
         >>> import matplotlib.pyplot as plt
         >>> count, bins, ignored = plt.hist(s)
 
@@ -4673,8 +4670,9 @@ cdef class RandomState:
 
         Draw `size` samples of dimension k from a Dirichlet distribution. A
         Dirichlet-distributed random variable can be seen as a multivariate
-        generalization of a Beta distribution. Dirichlet pdf is the conjugate
-        prior of a multinomial in Bayesian inference.
+        generalization of a Beta distribution. The Dirichlet distribution
+        is a conjugate prior of a multinomial distribution in Bayesian 
+        inference.
 
         Parameters
         ----------
@@ -4698,13 +4696,24 @@ cdef class RandomState:
 
         Notes
         -----
-        .. math:: X \\approx \\prod_{i=1}^{k}{x^{\\alpha_i-1}_i}
 
-        Uses the following property for computation: for each dimension,
-        draw a random sample y_i from a standard gamma generator of shape
-        `alpha_i`, then
-        :math:`X = \\frac{1}{\\sum_{i=1}^k{y_i}} (y_1, \\ldots, y_n)` is
-        Dirichlet distributed.
+        The Dirichlet distribution is a distribution over vectors 
+        :math:`x` that fulfil the conditions :math:`x_i>0` and 
+        :math:`\\sum_{i=1}^k x_i = 1`.
+
+        The probability density function :math:`p` of a 
+        Dirichlet-distributed random vector :math:`X` is 
+        proportional to
+
+        .. math:: p(x) \\propto \\prod_{i=1}^{k}{x^{\\alpha_i-1}_i},
+
+        where :math:`\\alpha` is a vector containing the positive 
+        concentration parameters.
+
+        The method uses the following property for computation: let :math:`Y`
+        be a random vector which has components that follow a standard gamma 
+        distribution, then :math:`X = \\frac{1}{\\sum_{i=1}^k{Y_i}} Y` 
+        is Dirichlet-distributed
 
         References
         ----------
@@ -4724,7 +4733,6 @@ cdef class RandomState:
 
         >>> s = np.random.dirichlet((10, 5, 3), 20).transpose()
 
-        >>> import matplotlib
         >>> import matplotlib.pyplot as plt
         >>> plt.barh(range(20), s[0])
         >>> plt.barh(range(20), s[1], left=s[0], color='g')
diff --git a/numpy/random/tests/test_random.py b/numpy/random/tests/test_random.py
index d0bb92a73..d4721bc62 100644
--- a/numpy/random/tests/test_random.py
+++ b/numpy/random/tests/test_random.py
@@ -448,6 +448,11 @@ class TestRandomDist(object):
         assert_equal(np.random.choice(['a', 'b'], size=(3, 0, 4)).shape, (3, 0, 4))
         assert_raises(ValueError, np.random.choice, [], 10)
 
+    def test_choice_nan_probabilities(self):
+        a = np.array([42, 1, 2])
+        p = [None, None, None]
+        assert_raises(ValueError, np.random.choice, a, p=p)
+
     def test_bytes(self):
         np.random.seed(self.seed)
         actual = np.random.bytes(10)
diff --git a/numpy/testing/_private/nosetester.py b/numpy/testing/_private/nosetester.py
index 1728d9d1f..19569a509 100644
--- a/numpy/testing/_private/nosetester.py
+++ b/numpy/testing/_private/nosetester.py
@@ -92,7 +92,7 @@ def run_module_suite(file_to_run=None, argv=None):
 
     Alternatively, calling::
 
-    >>> run_module_suite(file_to_run="numpy/tests/test_matlib.py")
+    >>> run_module_suite(file_to_run="numpy/tests/test_matlib.py")  # doctest: +SKIP
 
     from an interpreter will run all the test routine in 'test_matlib.py'.
     """
diff --git a/numpy/testing/_private/utils.py b/numpy/testing/_private/utils.py
index 4059f6ee6..1f7b516b3 100644
--- a/numpy/testing/_private/utils.py
+++ b/numpy/testing/_private/utils.py
@@ -318,8 +318,9 @@ def assert_equal(actual, desired, err_msg='', verbose=True):
     Examples
     --------
     >>> np.testing.assert_equal([4,5], [4,6])
-    ...
-    <type 'exceptions.AssertionError'>:
+    Traceback (most recent call last):
+        ...
+    AssertionError:
     Items are not equal:
     item=1
      ACTUAL: 5
@@ -510,20 +511,24 @@ def assert_almost_equal(actual,desired,decimal=7,err_msg='',verbose=True):
     >>> import numpy.testing as npt
     >>> npt.assert_almost_equal(2.3333333333333, 2.33333334)
     >>> npt.assert_almost_equal(2.3333333333333, 2.33333334, decimal=10)
-    ...
-    <type 'exceptions.AssertionError'>:
-    Items are not equal:
-     ACTUAL: 2.3333333333333002
-     DESIRED: 2.3333333399999998
+    Traceback (most recent call last):
+        ...
+    AssertionError:
+    Arrays are not almost equal to 10 decimals
+     ACTUAL: 2.3333333333333
+     DESIRED: 2.33333334
 
     >>> npt.assert_almost_equal(np.array([1.0,2.3333333333333]),
     ...                         np.array([1.0,2.33333334]), decimal=9)
-    ...
-    <type 'exceptions.AssertionError'>:
-    Arrays are not almost equal
-    (mismatch 50.0%)
-     x: array([ 1.        ,  2.33333333])
-     y: array([ 1.        ,  2.33333334])
+    Traceback (most recent call last):
+        ...
+    AssertionError:
+    Arrays are not almost equal to 9 decimals
+    Mismatch: 50%
+    Max absolute difference: 6.66669964e-09
+    Max relative difference: 2.85715698e-09
+     x: array([1.         , 2.333333333])
+     y: array([1.        , 2.33333334])
 
     """
     __tracebackhide__ = True  # Hide traceback for py.test
@@ -625,14 +630,15 @@ def assert_approx_equal(actual,desired,significant=7,err_msg='',verbose=True):
     --------
     >>> np.testing.assert_approx_equal(0.12345677777777e-20, 0.1234567e-20)
     >>> np.testing.assert_approx_equal(0.12345670e-20, 0.12345671e-20,
-                                       significant=8)
+    ...                                significant=8)
     >>> np.testing.assert_approx_equal(0.12345670e-20, 0.12345672e-20,
-                                       significant=8)
-    ...
-    <type 'exceptions.AssertionError'>:
+    ...                                significant=8)
+    Traceback (most recent call last):
+        ...
+    AssertionError:
     Items are not equal to 8 significant digits:
-     ACTUAL: 1.234567e-021
-     DESIRED: 1.2345672000000001e-021
+     ACTUAL: 1.234567e-21
+     DESIRED: 1.2345672e-21
 
     the evaluated condition that raises the exception is
 
@@ -659,10 +665,10 @@ def assert_approx_equal(actual,desired,significant=7,err_msg='',verbose=True):
         sc_actual = actual/scale
     except ZeroDivisionError:
         sc_actual = 0.0
-    msg = build_err_msg([actual, desired], err_msg,
-                header='Items are not equal to %d significant digits:' %
-                                 significant,
-                verbose=verbose)
+    msg = build_err_msg(
+        [actual, desired], err_msg,
+        header='Items are not equal to %d significant digits:' % significant,
+        verbose=verbose)
     try:
         # If one of desired/actual is not finite, handle it specially here:
         # check that both are nan if any is a nan, and test for equality
@@ -685,7 +691,7 @@ def assert_array_compare(comparison, x, y, err_msg='', verbose=True,
                          header='', precision=6, equal_nan=True,
                          equal_inf=True):
     __tracebackhide__ = True  # Hide traceback for py.test
-    from numpy.core import array, isnan, inf, bool_
+    from numpy.core import array, array2string, isnan, inf, bool_, errstate
 
     x = array(x, copy=False, subok=True)
     y = array(y, copy=False, subok=True)
@@ -781,15 +787,31 @@ def assert_array_compare(comparison, x, y, err_msg='', verbose=True,
             reduced = val.ravel()
             cond = reduced.all()
             reduced = reduced.tolist()
+
         # The below comparison is a hack to ensure that fully masked
         # results, for which val.ravel().all() returns np.ma.masked,
         # do not trigger a failure (np.ma.masked != True evaluates as
         # np.ma.masked, which is falsy).
         if cond != True:
             mismatch = 100.0 * reduced.count(0) / ox.size
-            msg = build_err_msg([ox, oy],
-                                err_msg
-                                + '\n(mismatch %s%%)' % (mismatch,),
+            remarks = ['Mismatch: {:.3g}%'.format(mismatch)]
+
+            with errstate(invalid='ignore', divide='ignore'):
+                # ignore errors for non-numeric types
+                with contextlib.suppress(TypeError):
+                    error = abs(x - y)
+                    max_abs_error = error.max()
+                    remarks.append('Max absolute difference: '
+                                   + array2string(max_abs_error))
+
+                    # note: this definition of relative error matches that one
+                    # used by assert_allclose (found in np.isclose)
+                    max_rel_error = (error / abs(y)).max()
+                    remarks.append('Max relative difference: '
+                                   + array2string(max_rel_error))
+
+            err_msg += '\n' + '\n'.join(remarks)
+            msg = build_err_msg([ox, oy], err_msg,
                                 verbose=verbose, header=header,
                                 names=('x', 'y'), precision=precision)
             raise AssertionError(msg)
@@ -849,13 +871,15 @@ def assert_array_equal(x, y, err_msg='', verbose=True):
 
     >>> np.testing.assert_array_equal([1.0,np.pi,np.nan],
     ...                               [1, np.sqrt(np.pi)**2, np.nan])
-    ...
-    <type 'exceptions.ValueError'>:
+    Traceback (most recent call last):
+        ...
     AssertionError:
     Arrays are not equal
-    (mismatch 50.0%)
-     x: array([ 1.        ,  3.14159265,         NaN])
-     y: array([ 1.        ,  3.14159265,         NaN])
+    Mismatch: 33.3%
+    Max absolute difference: 4.4408921e-16
+    Max relative difference: 1.41357986e-16
+     x: array([1.      , 3.141593,      nan])
+     y: array([1.      , 3.141593,      nan])
 
     Use `assert_allclose` or one of the nulp (number of floating point values)
     functions for these cases instead:
@@ -920,25 +944,29 @@ def assert_array_almost_equal(x, y, decimal=6, err_msg='', verbose=True):
     the first assert does not raise an exception
 
     >>> np.testing.assert_array_almost_equal([1.0,2.333,np.nan],
-                                             [1.0,2.333,np.nan])
+    ...                                      [1.0,2.333,np.nan])
 
     >>> np.testing.assert_array_almost_equal([1.0,2.33333,np.nan],
     ...                                      [1.0,2.33339,np.nan], decimal=5)
-    ...
-    <type 'exceptions.AssertionError'>:
+    Traceback (most recent call last):
+        ...
     AssertionError:
-    Arrays are not almost equal
-    (mismatch 50.0%)
-     x: array([ 1.     ,  2.33333,      NaN])
-     y: array([ 1.     ,  2.33339,      NaN])
+    Arrays are not almost equal to 5 decimals
+    Mismatch: 33.3%
+    Max absolute difference: 6.e-05
+    Max relative difference: 2.57136612e-05
+     x: array([1.     , 2.33333,     nan])
+     y: array([1.     , 2.33339,     nan])
 
     >>> np.testing.assert_array_almost_equal([1.0,2.33333,np.nan],
     ...                                      [1.0,2.33333, 5], decimal=5)
-    <type 'exceptions.ValueError'>:
-    ValueError:
-    Arrays are not almost equal
-     x: array([ 1.     ,  2.33333,      NaN])
-     y: array([ 1.     ,  2.33333,  5.     ])
+    Traceback (most recent call last):
+        ...
+    AssertionError:
+    Arrays are not almost equal to 5 decimals
+    x and y nan location mismatch:
+     x: array([1.     , 2.33333,     nan])
+     y: array([1.     , 2.33333, 5.     ])
 
     """
     __tracebackhide__ = True  # Hide traceback for py.test
@@ -1019,27 +1047,34 @@ def assert_array_less(x, y, err_msg='', verbose=True):
     --------
     >>> np.testing.assert_array_less([1.0, 1.0, np.nan], [1.1, 2.0, np.nan])
     >>> np.testing.assert_array_less([1.0, 1.0, np.nan], [1, 2.0, np.nan])
-    ...
-    <type 'exceptions.ValueError'>:
+    Traceback (most recent call last):
+        ...
+    AssertionError:
     Arrays are not less-ordered
-    (mismatch 50.0%)
-     x: array([  1.,   1.,  NaN])
-     y: array([  1.,   2.,  NaN])
+    Mismatch: 33.3%
+    Max absolute difference: 1.
+    Max relative difference: 0.5
+     x: array([ 1.,  1., nan])
+     y: array([ 1.,  2., nan])
 
     >>> np.testing.assert_array_less([1.0, 4.0], 3)
-    ...
-    <type 'exceptions.ValueError'>:
+    Traceback (most recent call last):
+        ...
+    AssertionError:
     Arrays are not less-ordered
-    (mismatch 50.0%)
-     x: array([ 1.,  4.])
+    Mismatch: 50%
+    Max absolute difference: 2.
+    Max relative difference: 0.66666667
+     x: array([1., 4.])
      y: array(3)
 
     >>> np.testing.assert_array_less([1.0, 2.0, 3.0], [4])
-    ...
-    <type 'exceptions.ValueError'>:
+    Traceback (most recent call last):
+        ...
+    AssertionError:
     Arrays are not less-ordered
     (shapes (3,), (1,) mismatch)
-     x: array([ 1.,  2.,  3.])
+     x: array([1., 2., 3.])
      y: array([4])
 
     """
@@ -1144,7 +1179,7 @@ def rundocs(filename=None, raise_on_error=True):
     argument to the ``test()`` call. For example, to run all tests (including
     doctests) for `numpy.lib`:
 
-    >>> np.lib.test(doctests=True) #doctest: +SKIP
+    >>> np.lib.test(doctests=True)  # doctest: +SKIP
     """
     from numpy.compat import npy_load_module
     import doctest
@@ -1326,7 +1361,7 @@ def decorate_methods(cls, decorator, testmatch=None):
     return
 
 
-def measure(code_str,times=1,label=None):
+def measure(code_str, times=1, label=None):
     """
     Return elapsed time for executing code in the namespace of the caller.
 
@@ -1353,9 +1388,9 @@ def measure(code_str,times=1,label=None):
 
     Examples
     --------
-    >>> etime = np.testing.measure('for i in range(1000): np.sqrt(i**2)',
-    ...                            times=times)
-    >>> print("Time for a single execution : ", etime / times, "s")
+    >>> times = 10
+    >>> etime = np.testing.measure('for i in range(1000): np.sqrt(i**2)', times=times)
+    >>> print("Time for a single execution : ", etime / times, "s")  # doctest: +SKIP
     Time for a single execution :  0.005 s
 
     """
@@ -1440,7 +1475,7 @@ def assert_allclose(actual, desired, rtol=1e-7, atol=0, equal_nan=True,
     --------
     >>> x = [1e-5, 1e-3, 1e-1]
     >>> y = np.arccos(np.cos(x))
-    >>> assert_allclose(x, y, rtol=1e-5, atol=0)
+    >>> np.testing.assert_allclose(x, y, rtol=1e-5, atol=0)
 
     """
     __tracebackhide__ = True  # Hide traceback for py.test
@@ -1894,7 +1929,8 @@ class clear_and_catch_warnings(warnings.catch_warnings):
     Examples
     --------
     >>> import warnings
-    >>> with clear_and_catch_warnings(modules=[np.core.fromnumeric]):
+    >>> with np.testing.clear_and_catch_warnings(
+    ...         modules=[np.core.fromnumeric]):
     ...     warnings.simplefilter('always')
     ...     warnings.filterwarnings('ignore', module='np.core.fromnumeric')
     ...     # do something that raises a warning but ignore those in
@@ -1975,25 +2011,28 @@ class suppress_warnings(object):
 
     Examples
     --------
-    >>> with suppress_warnings() as sup:
-    ...     sup.filter(DeprecationWarning, "Some text")
-    ...     sup.filter(module=np.ma.core)
-    ...     log = sup.record(FutureWarning, "Does this occur?")
-    ...     command_giving_warnings()
-    ...     # The FutureWarning was given once, the filtered warnings were
-    ...     # ignored. All other warnings abide outside settings (may be
-    ...     # printed/error)
-    ...     assert_(len(log) == 1)
-    ...     assert_(len(sup.log) == 1)  # also stored in log attribute
-
-    Or as a decorator:
-
-    >>> sup = suppress_warnings()
-    >>> sup.filter(module=np.ma.core)  # module must match exact
-    >>> @sup
-    >>> def some_function():
-    ...     # do something which causes a warning in np.ma.core
-    ...     pass
+
+    With a context manager::
+
+        with np.testing.suppress_warnings() as sup:
+            sup.filter(DeprecationWarning, "Some text")
+            sup.filter(module=np.ma.core)
+            log = sup.record(FutureWarning, "Does this occur?")
+            command_giving_warnings()
+            # The FutureWarning was given once, the filtered warnings were
+            # ignored. All other warnings abide outside settings (may be
+            # printed/error)
+            assert_(len(log) == 1)
+            assert_(len(sup.log) == 1)  # also stored in log attribute
+
+    Or as a decorator::
+
+        sup = np.testing.suppress_warnings()
+        sup.filter(module=np.ma.core)  # module must match exactly
+        @sup
+        def some_function():
+            # do something which causes a warning in np.ma.core
+            pass
     """
     def __init__(self, forwarding_rule="always"):
         self._entered = False
diff --git a/numpy/testing/tests/test_utils.py b/numpy/testing/tests/test_utils.py
index 43afafaa8..c376a3852 100644
--- a/numpy/testing/tests/test_utils.py
+++ b/numpy/testing/tests/test_utils.py
@@ -327,24 +327,22 @@ class TestEqual(TestArrayEqual):
         self._test_not_equal(x, y)
 
     def test_error_message(self):
-        try:
+        with pytest.raises(AssertionError) as exc_info:
             self._assert_func(np.array([1, 2]), np.array([[1, 2]]))
-        except AssertionError as e:
-            msg = str(e)
-            msg2 = msg.replace("shapes (2L,), (1L, 2L)", "shapes (2,), (1, 2)")
-            msg_reference = textwrap.dedent("""\
+        msg = str(exc_info.value)
+        msg2 = msg.replace("shapes (2L,), (1L, 2L)", "shapes (2,), (1, 2)")
+        msg_reference = textwrap.dedent("""\
 
-            Arrays are not equal
+        Arrays are not equal
 
-            (shapes (2,), (1, 2) mismatch)
-             x: array([1, 2])
-             y: array([[1, 2]])""")
-            try:
-                assert_equal(msg, msg_reference)
-            except AssertionError:
-                assert_equal(msg2, msg_reference)
-        else:
-            raise AssertionError("Did not raise")
+        (shapes (2,), (1, 2) mismatch)
+         x: array([1, 2])
+         y: array([[1, 2]])""")
+
+        try:
+            assert_equal(msg, msg_reference)
+        except AssertionError:
+            assert_equal(msg2, msg_reference)
 
 
 class TestArrayAlmostEqual(_GenericTest):
@@ -509,38 +507,53 @@ class TestAlmostEqual(_GenericTest):
         x = np.array([1.00000000001, 2.00000000002, 3.00003])
         y = np.array([1.00000000002, 2.00000000003, 3.00004])
 
-        # test with a different amount of decimal digits
-        # note that we only check for the formatting of the arrays themselves
-        b = ('x: array([1.00000000001, 2.00000000002, 3.00003     '
-             ' ])\n y: array([1.00000000002, 2.00000000003, 3.00004      ])')
-        try:
+        # Test with a different amount of decimal digits
+        with pytest.raises(AssertionError) as exc_info:
             self._assert_func(x, y, decimal=12)
-        except AssertionError as e:
-            # remove anything that's not the array string
-            assert_equal(str(e).split('%)\n ')[1], b)
-
-        # with the default value of decimal digits, only the 3rd element differs
-        # note that we only check for the formatting of the arrays themselves
-        b = ('x: array([1.     , 2.     , 3.00003])\n y: array([1.     , '
-             '2.     , 3.00004])')
-        try:
+        msgs = str(exc_info.value).split('\n')
+        assert_equal(msgs[3], 'Mismatch: 100%')
+        assert_equal(msgs[4], 'Max absolute difference: 1.e-05')
+        assert_equal(msgs[5], 'Max relative difference: 3.33328889e-06')
+        assert_equal(
+            msgs[6],
+            ' x: array([1.00000000001, 2.00000000002, 3.00003      ])')
+        assert_equal(
+            msgs[7],
+            ' y: array([1.00000000002, 2.00000000003, 3.00004      ])')
+
+        # With the default value of decimal digits, only the 3rd element
+        # differs. Note that we only check for the formatting of the arrays
+        # themselves.
+        with pytest.raises(AssertionError) as exc_info:
             self._assert_func(x, y)
-        except AssertionError as e:
-            # remove anything that's not the array string
-            assert_equal(str(e).split('%)\n ')[1], b)
-
-        # Check the error message when input includes inf or nan
+        msgs = str(exc_info.value).split('\n')
+        assert_equal(msgs[3], 'Mismatch: 33.3%')
+        assert_equal(msgs[4], 'Max absolute difference: 1.e-05')
+        assert_equal(msgs[5], 'Max relative difference: 3.33328889e-06')
+        assert_equal(msgs[6], ' x: array([1.     , 2.     , 3.00003])')
+        assert_equal(msgs[7], ' y: array([1.     , 2.     , 3.00004])')
+
+        # Check the error message when input includes inf
         x = np.array([np.inf, 0])
         y = np.array([np.inf, 1])
-        try:
+        with pytest.raises(AssertionError) as exc_info:
+            self._assert_func(x, y)
+        msgs = str(exc_info.value).split('\n')
+        assert_equal(msgs[3], 'Mismatch: 50%')
+        assert_equal(msgs[4], 'Max absolute difference: 1.')
+        assert_equal(msgs[5], 'Max relative difference: 1.')
+        assert_equal(msgs[6], ' x: array([inf,  0.])')
+        assert_equal(msgs[7], ' y: array([inf,  1.])')
+
+        # Check the error message when dividing by zero
+        x = np.array([1, 2])
+        y = np.array([0, 0])
+        with pytest.raises(AssertionError) as exc_info:
             self._assert_func(x, y)
-        except AssertionError as e:
-            msgs = str(e).split('\n')
-            # assert error percentage is 50%
-            assert_equal(msgs[3], '(mismatch 50.0%)')
-            # assert output array contains inf
-            assert_equal(msgs[4], ' x: array([inf,  0.])')
-            assert_equal(msgs[5], ' y: array([inf,  1.])')
+        msgs = str(exc_info.value).split('\n')
+        assert_equal(msgs[3], 'Mismatch: 100%')
+        assert_equal(msgs[4], 'Max absolute difference: 2')
+        assert_equal(msgs[5], 'Max relative difference: inf')
 
     def test_subclass_that_cannot_be_bool(self):
         # While we cannot guarantee testing functions will always work for
@@ -829,12 +842,12 @@ class TestAssertAllclose(object):
     def test_report_fail_percentage(self):
         a = np.array([1, 1, 1, 1])
         b = np.array([1, 1, 1, 2])
-        try:
+
+        with pytest.raises(AssertionError) as exc_info:
             assert_allclose(a, b)
-            msg = ''
-        except AssertionError as exc:
-            msg = exc.args[0]
-        assert_("mismatch 25.0%" in msg)
+        msg = str(exc_info.value)
+        assert_('Mismatch: 25%\nMax absolute difference: 1\n'
+                'Max relative difference: 0.5' in msg)
 
     def test_equal_nan(self):
         a = np.array([np.nan])
@@ -1117,12 +1130,10 @@ class TestStringEqual(object):
         assert_string_equal("hello", "hello")
         assert_string_equal("hello\nmultiline", "hello\nmultiline")
 
-        try:
+        with pytest.raises(AssertionError) as exc_info:
             assert_string_equal("foo\nbar", "hello\nbar")
-        except AssertionError as exc:
-            assert_equal(str(exc), "Differences in strings:\n- foo\n+ hello")
-        else:
-            raise AssertionError("exception not raised")
+        msg = str(exc_info.value)
+        assert_equal(msg, "Differences in strings:\n- foo\n+ hello")
 
         assert_raises(AssertionError,
                       lambda: assert_string_equal("foo", "hello"))
diff --git a/numpy/tests/test_ctypeslib.py b/numpy/tests/test_ctypeslib.py
index d389b37a8..521208c36 100644
--- a/numpy/tests/test_ctypeslib.py
+++ b/numpy/tests/test_ctypeslib.py
@@ -273,3 +273,95 @@ class TestAsArray(object):
 
         # check we avoid the segfault
         c_arr[0][0][0]
+
+
+@pytest.mark.skipif(ctypes is None,
+                    reason="ctypes not available on this python installation")
+class TestAsCtypesType(object):
+    """ Test conversion from dtypes to ctypes types """
+    def test_scalar(self):
+        dt = np.dtype('<u2')
+        ct = np.ctypeslib.as_ctypes_type(dt)
+        assert_equal(ct, ctypes.c_uint16.__ctype_le__)
+
+        dt = np.dtype('>u2')
+        ct = np.ctypeslib.as_ctypes_type(dt)
+        assert_equal(ct, ctypes.c_uint16.__ctype_be__)
+
+        dt = np.dtype('u2')
+        ct = np.ctypeslib.as_ctypes_type(dt)
+        assert_equal(ct, ctypes.c_uint16)
+
+    def test_subarray(self):
+        dt = np.dtype((np.int32, (2, 3)))
+        ct = np.ctypeslib.as_ctypes_type(dt)
+        assert_equal(ct, 2 * (3 * ctypes.c_int32))
+
+    def test_structure(self):
+        dt = np.dtype([
+            ('a', np.uint16),
+            ('b', np.uint32),
+        ])
+
+        ct = np.ctypeslib.as_ctypes_type(dt)
+        assert_(issubclass(ct, ctypes.Structure))
+        assert_equal(ctypes.sizeof(ct), dt.itemsize)
+        assert_equal(ct._fields_, [
+            ('a', ctypes.c_uint16),
+            ('b', ctypes.c_uint32),
+        ])
+
+    def test_structure_aligned(self):
+        dt = np.dtype([
+            ('a', np.uint16),
+            ('b', np.uint32),
+        ], align=True)
+
+        ct = np.ctypeslib.as_ctypes_type(dt)
+        assert_(issubclass(ct, ctypes.Structure))
+        assert_equal(ctypes.sizeof(ct), dt.itemsize)
+        assert_equal(ct._fields_, [
+            ('a', ctypes.c_uint16),
+            ('', ctypes.c_char * 2),  # padding
+            ('b', ctypes.c_uint32),
+        ])
+
+    def test_union(self):
+        dt = np.dtype(dict(
+            names=['a', 'b'],
+            offsets=[0, 0],
+            formats=[np.uint16, np.uint32]
+        ))
+
+        ct = np.ctypeslib.as_ctypes_type(dt)
+        assert_(issubclass(ct, ctypes.Union))
+        assert_equal(ctypes.sizeof(ct), dt.itemsize)
+        assert_equal(ct._fields_, [
+            ('a', ctypes.c_uint16),
+            ('b', ctypes.c_uint32),
+        ])
+
+    def test_padded_union(self):
+        dt = np.dtype(dict(
+            names=['a', 'b'],
+            offsets=[0, 0],
+            formats=[np.uint16, np.uint32],
+            itemsize=5,
+        ))
+
+        ct = np.ctypeslib.as_ctypes_type(dt)
+        assert_(issubclass(ct, ctypes.Union))
+        assert_equal(ctypes.sizeof(ct), dt.itemsize)
+        assert_equal(ct._fields_, [
+            ('a', ctypes.c_uint16),
+            ('b', ctypes.c_uint32),
+            ('', ctypes.c_char * 5),  # padding
+        ])
+
+    def test_overlapping(self):
+        dt = np.dtype(dict(
+            names=['a', 'b'],
+            offsets=[0, 2],
+            formats=[np.uint32, np.uint32]
+        ))
+        assert_raises(NotImplementedError, np.ctypeslib.as_ctypes_type, dt)