29 files changed, 935 insertions, 376 deletions
diff --git a/numpy/core/_add_newdocs.py b/numpy/core/_add_newdocs.py
index dbe3d226f..b60edd1df 100644
--- a/numpy/core/_add_newdocs.py
+++ b/numpy/core/_add_newdocs.py
@@ -1326,9 +1326,9 @@ add_newdoc('numpy.core.multiarray', 'arange',
 
     See Also
     --------
-    linspace : Evenly spaced numbers with careful handling of endpoints.
-    ogrid: Arrays of evenly spaced numbers in N-dimensions.
-    mgrid: Grid-shaped arrays of evenly spaced numbers in N-dimensions.
+    numpy.linspace : Evenly spaced numbers with careful handling of endpoints.
+    numpy.ogrid: Arrays of evenly spaced numbers in N-dimensions.
+    numpy.mgrid: Grid-shaped arrays of evenly spaced numbers in N-dimensions.
 
     Examples
     --------
@@ -3706,10 +3706,10 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('sort',
     See Also
     --------
     numpy.sort : Return a sorted copy of an array.
-    argsort : Indirect sort.
-    lexsort : Indirect stable sort on multiple keys.
-    searchsorted : Find elements in sorted array.
-    partition: Partial sort.
+    numpy.argsort : Indirect sort.
+    numpy.lexsort : Indirect stable sort on multiple keys.
+    numpy.searchsorted : Find elements in sorted array.
+    numpy.partition: Partial sort.
 
     Notes
     -----
@@ -4497,7 +4497,7 @@ add_newdoc('numpy.core', 'ufunc',
         Alternate array object(s) in which to put the result; if provided, it
         must have a shape that the inputs broadcast to. A tuple of arrays
         (possible only as a keyword argument) must have length equal to the
-        number of outputs; use `None` for uninitialized outputs to be
+        number of outputs; use None for uninitialized outputs to be
         allocated by the ufunc.
     where : array_like, optional
         This condition is broadcast over the input. At locations where the
@@ -4691,7 +4691,7 @@ add_newdoc('numpy.core', 'ufunc', ('signature',
     -----
     Generalized ufuncs are used internally in many linalg functions, and in
     the testing suite; the examples below are taken from these.
-    For ufuncs that operate on scalars, the signature is `None`, which is
+    For ufuncs that operate on scalars, the signature is None, which is
     equivalent to '()' for every argument.
 
     Examples
@@ -4742,7 +4742,7 @@ add_newdoc('numpy.core', 'ufunc', ('reduce',
 
         .. versionadded:: 1.7.0
 
-        If this is `None`, a reduction is performed over all the axes.
+        If this is None, a reduction is performed over all the axes.
         If this is a tuple of ints, a reduction is performed on multiple
         axes, instead of a single axis or all the axes as before.
 
@@ -4755,7 +4755,7 @@ add_newdoc('numpy.core', 'ufunc', ('reduce',
         to the data-type of the output array if this is provided, or
         the data-type of the input array if no output array is provided.
     out : ndarray, None, or tuple of ndarray and None, optional
-        A location into which the result is stored. If not provided or `None`,
+        A location into which the result is stored. If not provided or None,
         a freshly-allocated array is returned. For consistency with
         ``ufunc.__call__``, if given as a keyword, this may be wrapped in a
         1-element tuple.
@@ -4872,7 +4872,7 @@ add_newdoc('numpy.core', 'ufunc', ('accumulate',
         to the data-type of the output array if such is provided, or the
         the data-type of the input array if no output array is provided.
     out : ndarray, None, or tuple of ndarray and None, optional
-        A location into which the result is stored. If not provided or `None`,
+        A location into which the result is stored. If not provided or None,
         a freshly-allocated array is returned. For consistency with
         ``ufunc.__call__``, if given as a keyword, this may be wrapped in a
         1-element tuple.
@@ -4954,7 +4954,7 @@ add_newdoc('numpy.core', 'ufunc', ('reduceat',
         to the data type of the output array if this is provided, or
         the data type of the input array if no output array is provided.
     out : ndarray, None, or tuple of ndarray and None, optional
-        A location into which the result is stored. If not provided or `None`,
+        A location into which the result is stored. If not provided or None,
         a freshly-allocated array is returned. For consistency with
         ``ufunc.__call__``, if given as a keyword, this may be wrapped in a
         1-element tuple.
diff --git a/numpy/core/_internal.py b/numpy/core/_internal.py
index b0ea603e1..05e401e0b 100644
--- a/numpy/core/_internal.py
+++ b/numpy/core/_internal.py
@@ -247,55 +247,13 @@ class _missing_ctypes(object):
             self.value = ptr
 
 
-class _unsafe_first_element_pointer(object):
-    """
-    Helper to allow viewing an array as a ctypes pointer to the first element
-
-    This avoids:
-      * dealing with strides
-      * `.view` rejecting object-containing arrays
-      * `memoryview` not supporting overlapping fields
-    """
-    def __init__(self, arr):
-        self.base = arr
-
-    @property
-    def __array_interface__(self):
-        i = dict(
-            shape=(),
-            typestr='|V0',
-            data=(self.base.__array_interface__['data'][0], False),
-            strides=(),
-            version=3,
-        )
-        return i
-
-
-def _get_void_ptr(arr):
-    """
-    Get a `ctypes.c_void_p` to arr.data, that keeps a reference to the array
-    """
-    import numpy as np
-    # convert to a 0d array that has a data pointer referrign to the start
-    # of arr. This holds a reference to arr.
-    simple_arr = np.asarray(_unsafe_first_element_pointer(arr))
-
-    # create a `char[0]` using the same memory.
-    c_arr = (ctypes.c_char * 0).from_buffer(simple_arr)
-
-    # finally cast to void*
-    return ctypes.cast(ctypes.pointer(c_arr), ctypes.c_void_p)
-
-
 class _ctypes(object):
     def __init__(self, array, ptr=None):
         self._arr = array
 
         if ctypes:
             self._ctypes = ctypes
-            # get a void pointer to the buffer, which keeps the array alive
-            self._data = _get_void_ptr(array)
-            assert self._data.value == ptr
+            self._data = self._ctypes.c_void_p(ptr)
         else:
             # fake a pointer-like object that holds onto the reference
             self._ctypes = _missing_ctypes()
@@ -317,7 +275,14 @@ class _ctypes(object):
 
         The returned pointer will keep a reference to the array.
         """
-        return self._ctypes.cast(self._data, obj)
+        # _ctypes.cast function causes a circular reference of self._data in
+        # self._data._objects. Attributes of self._data cannot be released
+        # until gc.collect is called. Make a copy of the pointer first then let
+        # it hold the array reference. This is a workaround to circumvent the
+        # CPython bug https://bugs.python.org/issue12836
+        ptr = self._ctypes.cast(self._data, obj)
+        ptr._arr = self._arr
+        return ptr
 
     def shape_as(self, obj):
         """
@@ -348,7 +313,7 @@ class _ctypes(object):
         crashing. User Beware! The value of this attribute is exactly the same
         as ``self._array_interface_['data'][0]``.
 
-        Note that unlike `data_as`, a reference will not be kept to the array:
+        Note that unlike ``data_as``, a reference will not be kept to the array:
         code like ``ctypes.c_void_p((a + b).ctypes.data)`` will result in a
         pointer to a deallocated array, and should be spelt
         ``(a + b).ctypes.data_as(ctypes.c_void_p)``
@@ -385,7 +350,7 @@ class _ctypes(object):
 
         Enables `c_func(some_array.ctypes)`
         """
-        return self._data
+        return self.data_as(ctypes.c_void_p)
 
     # kept for compatibility
     get_data = data.fget
diff --git a/numpy/core/arrayprint.py b/numpy/core/arrayprint.py
index 233d139fd..401018015 100644
--- a/numpy/core/arrayprint.py
+++ b/numpy/core/arrayprint.py
@@ -111,7 +111,7 @@ def set_printoptions(precision=None, threshold=None, edgeitems=None,
     ----------
     precision : int or None, optional
         Number of digits of precision for floating point output (default 8).
-        May be `None` if `floatmode` is not `fixed`, to print as many digits as
+        May be None if `floatmode` is not `fixed`, to print as many digits as
         necessary to uniquely specify the value.
     threshold : int, optional
         Total number of array elements which trigger summarization
@@ -1479,7 +1479,11 @@ def array_repr(arr, max_line_width=None, precision=None, suppress_small=None):
         arr, max_line_width, precision, suppress_small)
 
 
-_guarded_str = _recursive_guard()(str)
+@_recursive_guard()
+def _guarded_repr_or_str(v):
+    if isinstance(v, bytes):
+        return repr(v)
+    return str(v)
 
 
 def _array_str_implementation(
@@ -1497,7 +1501,7 @@ def _array_str_implementation(
         # obtain a scalar and call str on it, avoiding problems for subclasses
         # for which indexing with () returns a 0d instead of a scalar by using
         # ndarray's getindex. Also guard against recursive 0d object arrays.
-        return _guarded_str(np.ndarray.__getitem__(a, ()))
+        return _guarded_repr_or_str(np.ndarray.__getitem__(a, ()))
 
     return array2string(a, max_line_width, precision, suppress_small, ' ', "")
 
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index 6729fe197..e0b6a654c 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -287,7 +287,7 @@ defdict = {
     Ufunc(2, 1, None, # Zero is only a unit to the right, not the left
           docstrings.get('numpy.core.umath.subtract'),
           'PyUFunc_SubtractionTypeResolver',
-          TD(notimes_or_obj, simd=[('avx2', ints)]),
+          TD(ints + inexact, simd=[('avx2', ints)]),
           [TypeDescription('M', FullTypeDescr, 'Mm', 'M'),
            TypeDescription('m', FullTypeDescr, 'mm', 'm'),
            TypeDescription('M', FullTypeDescr, 'MM', 'm'),
@@ -358,14 +358,14 @@ defdict = {
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.square'),
           None,
-          TD(ints+inexact, simd=[('avx2', ints)]),
+          TD(ints+inexact, simd=[('avx2', ints), ('fma', 'fd'), ('avx512f', 'fd')]),
           TD(O, f='Py_square'),
           ),
 'reciprocal':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.reciprocal'),
           None,
-          TD(ints+inexact, simd=[('avx2', ints)]),
+          TD(ints+inexact, simd=[('avx2', ints), ('fma', 'fd'), ('avx512f','fd')]),
           TD(O, f='Py_reciprocal'),
           ),
 # This is no longer used as numpy.ones_like, however it is
@@ -395,7 +395,7 @@ defdict = {
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.absolute'),
           'PyUFunc_AbsoluteTypeResolver',
-          TD(bints+flts+timedeltaonly),
+          TD(bints+flts+timedeltaonly, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
           TD(cmplx, out=('f', 'd', 'g')),
           TD(O, f='PyNumber_Absolute'),
           ),
@@ -409,7 +409,7 @@ defdict = {
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.negative'),
           'PyUFunc_NegativeTypeResolver',
-          TD(bints+flts+timedeltaonly, simd=[('avx2', ints)]),
+          TD(ints+flts+timedeltaonly, simd=[('avx2', ints)]),
           TD(cmplx, f='neg'),
           TD(O, f='PyNumber_Negative'),
           ),
@@ -762,7 +762,7 @@ defdict = {
           docstrings.get('numpy.core.umath.sqrt'),
           None,
           TD('e', f='sqrt', astype={'e':'f'}),
-          TD(inexactvec),
+          TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
           TD('fdg' + cmplx, f='sqrt'),
           TD(P, f='sqrt'),
           ),
@@ -777,14 +777,18 @@ defdict = {
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.ceil'),
           None,
-          TD(flts, f='ceil', astype={'e':'f'}),
+          TD('e', f='ceil', astype={'e':'f'}),
+          TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
+          TD('fdg', f='ceil'),
           TD(O, f='npy_ObjectCeil'),
           ),
 'trunc':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.trunc'),
           None,
-          TD(flts, f='trunc', astype={'e':'f'}),
+          TD('e', f='trunc', astype={'e':'f'}),
+          TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
+          TD('fdg', f='trunc'),
           TD(O, f='npy_ObjectTrunc'),
           ),
 'fabs':
@@ -798,14 +802,18 @@ defdict = {
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.floor'),
           None,
-          TD(flts, f='floor', astype={'e':'f'}),
+          TD('e', f='floor', astype={'e':'f'}),
+          TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
+          TD('fdg', f='floor'),
           TD(O, f='npy_ObjectFloor'),
           ),
 'rint':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.rint'),
           None,
-          TD(inexact, f='rint', astype={'e':'f'}),
+          TD('e', f='rint', astype={'e':'f'}),
+          TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
+          TD('fdg' + cmplx, f='rint'),
           TD(P, f='rint'),
           ),
 'arctan2':
diff --git a/numpy/core/code_generators/ufunc_docstrings.py b/numpy/core/code_generators/ufunc_docstrings.py
index 1ac477b54..4dec73505 100644
--- a/numpy/core/code_generators/ufunc_docstrings.py
+++ b/numpy/core/code_generators/ufunc_docstrings.py
@@ -22,7 +22,7 @@ subst = {
     'PARAMS': textwrap.dedent("""
         out : ndarray, None, or tuple of ndarray and None, optional
             A location into which the result is stored. If provided, it must have
-            a shape that the inputs broadcast to. If not provided or `None`,
+            a shape that the inputs broadcast to. If not provided or None,
             a freshly-allocated array is returned. A tuple (possible only as a
             keyword argument) must have length equal to the number of outputs.
         where : array_like, optional
@@ -2596,7 +2596,7 @@ add_newdoc('numpy.core.umath', 'matmul',
     out : ndarray, optional
         A location into which the result is stored. If provided, it must have
         a shape that matches the signature `(n,k),(k,m)->(n,m)`. If not
-        provided or `None`, a freshly-allocated array is returned.
+        provided or None, a freshly-allocated array is returned.
     **kwargs
         For other keyword-only arguments, see the
         :ref:`ufunc docs <ufuncs.kwargs>`.
diff --git a/numpy/core/defchararray.py b/numpy/core/defchararray.py
index a941c5b81..2d89d6fe0 100644
--- a/numpy/core/defchararray.py
+++ b/numpy/core/defchararray.py
@@ -82,7 +82,7 @@ def _clean_args(*args):
 
     Many of the Python string operations that have optional arguments
     do not use 'None' to indicate a default value.  In these cases,
-    we need to remove all `None` arguments, and those following them.
+    we need to remove all None arguments, and those following them.
     """
     newargs = []
     for chk in args:
@@ -1333,7 +1333,7 @@ def rsplit(a, sep=None, maxsplit=None):
     a : array_like of str or unicode
 
     sep : str or unicode, optional
-        If `sep` is not specified or `None`, any whitespace string
+        If `sep` is not specified or None, any whitespace string
         is a separator.
     maxsplit : int, optional
         If `maxsplit` is given, at most `maxsplit` splits are done,
@@ -1417,7 +1417,7 @@ def split(a, sep=None, maxsplit=None):
     a : array_like of str or unicode
 
     sep : str or unicode, optional
-       If `sep` is not specified or `None`, any whitespace string is a
+       If `sep` is not specified or None, any whitespace string is a
        separator.
 
     maxsplit : int, optional
@@ -2659,7 +2659,7 @@ def array(obj, itemsize=None, copy=True, unicode=None, order=None):
     unicode : bool, optional
         When true, the resulting `chararray` can contain Unicode
         characters, when false only 8-bit characters.  If unicode is
-        `None` and `obj` is one of the following:
+        None and `obj` is one of the following:
 
           - a `chararray`,
           - an ndarray of type `str` or `unicode`
@@ -2799,7 +2799,7 @@ def asarray(obj, itemsize=None, unicode=None, order=None):
     unicode : bool, optional
         When true, the resulting `chararray` can contain Unicode
         characters, when false only 8-bit characters.  If unicode is
-        `None` and `obj` is one of the following:
+        None and `obj` is one of the following:
 
           - a `chararray`,
           - an ndarray of type `str` or 'unicode`
diff --git a/numpy/core/fromnumeric.py b/numpy/core/fromnumeric.py
index 6c0b9cde9..5f7716455 100644
--- a/numpy/core/fromnumeric.py
+++ b/numpy/core/fromnumeric.py
@@ -1409,7 +1409,7 @@ def squeeze(a, axis=None):
     Raises
     ------
     ValueError
-        If `axis` is not `None`, and an axis being squeezed is not of length 1
+        If `axis` is not None, and an axis being squeezed is not of length 1
 
     See Also
     --------
@@ -1945,7 +1945,7 @@ def compress(condition, a, axis=None, out=None):
     take, choose, diag, diagonal, select
     ndarray.compress : Equivalent method in ndarray
     np.extract: Equivalent method when working on 1-D arrays
-    numpy.doc.ufuncs : Section "Output arguments"
+    ufuncs-output-type
 
     Examples
     --------
@@ -1995,14 +1995,14 @@ def clip(a, a_min, a_max, out=None, **kwargs):
     ----------
     a : array_like
         Array containing elements to clip.
-    a_min : scalar or array_like or `None`
-        Minimum value. If `None`, clipping is not performed on lower
+    a_min : scalar or array_like or None
+        Minimum value. If None, clipping is not performed on lower
         interval edge. Not more than one of `a_min` and `a_max` may be
-        `None`.
-    a_max : scalar or array_like or `None`
-        Maximum value. If `None`, clipping is not performed on upper
+        None.
+    a_max : scalar or array_like or None
+        Maximum value. If None, clipping is not performed on upper
         interval edge. Not more than one of `a_min` and `a_max` may be
-        `None`. If `a_min` or `a_max` are array_like, then the three
+        None. If `a_min` or `a_max` are array_like, then the three
         arrays will be broadcasted to match their shapes.
     out : ndarray, optional
         The results will be placed in this array. It may be the input
@@ -2023,7 +2023,7 @@ def clip(a, a_min, a_max, out=None, **kwargs):
 
     See Also
     --------
-    numpy.doc.ufuncs : Section "Output arguments"
+    ufuncs-output-type
 
     Examples
     --------
@@ -2206,7 +2206,7 @@ def any(a, axis=None, out=None, keepdims=np._NoValue):
         Input array or object that can be converted to an array.
     axis : None or int or tuple of ints, optional
         Axis or axes along which a logical OR reduction is performed.
-        The default (`axis` = `None`) is to perform a logical OR over all
+        The default (``axis=None``) is to perform a logical OR over all
         the dimensions of the input array. `axis` may be negative, in
         which case it counts from the last to the first axis.
 
@@ -2219,7 +2219,7 @@ def any(a, axis=None, out=None, keepdims=np._NoValue):
         the same shape as the expected output and its type is preserved
         (e.g., if it is of type float, then it will remain so, returning
         1.0 for True and 0.0 for False, regardless of the type of `a`).
-        See `doc.ufuncs` (Section "Output arguments") for details.
+        See `ufuncs-output-type` for more details.
 
     keepdims : bool, optional
         If this is set to True, the axes which are reduced are left
@@ -2292,7 +2292,7 @@ def all(a, axis=None, out=None, keepdims=np._NoValue):
         Input array or object that can be converted to an array.
     axis : None or int or tuple of ints, optional
         Axis or axes along which a logical AND reduction is performed.
-        The default (`axis` = `None`) is to perform a logical AND over all
+        The default (``axis=None``) is to perform a logical AND over all
         the dimensions of the input array. `axis` may be negative, in
         which case it counts from the last to the first axis.
 
@@ -2304,8 +2304,8 @@ def all(a, axis=None, out=None, keepdims=np._NoValue):
         Alternate output array in which to place the result.
         It must have the same shape as the expected output and its
         type is preserved (e.g., if ``dtype(out)`` is float, the result
-        will consist of 0.0's and 1.0's).  See `doc.ufuncs` (Section
-        "Output arguments") for more details.
+        will consist of 0.0's and 1.0's). See `ufuncs-output-type` for more
+        details.
 
     keepdims : bool, optional
         If this is set to True, the axes which are reduced are left
@@ -2383,8 +2383,8 @@ def cumsum(a, axis=None, dtype=None, out=None):
     out : ndarray, optional
         Alternative output array in which to place the result. It must
         have the same shape and buffer length as the expected output
-        but the type will be cast if necessary. See `doc.ufuncs`
-        (Section "Output arguments") for more details.
+        but the type will be cast if necessary. See `ufuncs-output-type` for
+        more details.
 
     Returns
     -------
@@ -2529,7 +2529,7 @@ def amax(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
     out : ndarray, optional
         Alternative output array in which to place the result.  Must
         be of the same shape and buffer length as the expected output.
-        See `doc.ufuncs` (Section "Output arguments") for more details.
+        See `ufuncs-output-type` for more details.
 
     keepdims : bool, optional
         If this is set to True, the axes which are reduced are left
@@ -2654,7 +2654,7 @@ def amin(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
     out : ndarray, optional
         Alternative output array in which to place the result.  Must
         be of the same shape and buffer length as the expected output.
-        See `doc.ufuncs` (Section "Output arguments") for more details.
+        See `ufuncs-output-type` for more details.
 
     keepdims : bool, optional
         If this is set to True, the axes which are reduced are left
@@ -2861,7 +2861,7 @@ def prod(a, axis=None, dtype=None, out=None, keepdims=np._NoValue,
     See Also
     --------
     ndarray.prod : equivalent method
-    numpy.doc.ufuncs : Section "Output arguments"
+    ufuncs-output-type
 
     Notes
     -----
@@ -2957,7 +2957,7 @@ def cumprod(a, axis=None, dtype=None, out=None):
 
     See Also
     --------
-    numpy.doc.ufuncs : Section "Output arguments"
+    ufuncs-output-type
 
     Notes
     -----
@@ -3103,8 +3103,8 @@ def around(a, decimals=0, out=None):
     out : ndarray, optional
         Alternative output array in which to place the result. It must have
         the same shape as the expected output, but the type of the output
-        values will be cast if necessary. See `doc.ufuncs` (Section
-        "Output arguments") for details.
+        values will be cast if necessary. See `ufuncs-output-type` for more
+        details.
 
     Returns
     -------
@@ -3218,7 +3218,7 @@ def mean(a, axis=None, dtype=None, out=None, keepdims=np._NoValue):
         Alternate output array in which to place the result.  The default
         is ``None``; if provided, it must have the same shape as the
         expected output, but the type will be cast if necessary.
-        See `doc.ufuncs` for details.
+        See `ufuncs-output-type` for more details.
 
     keepdims : bool, optional
         If this is set to True, the axes which are reduced are left
@@ -3353,7 +3353,7 @@ def std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=np._NoValue):
     See Also
     --------
     var, mean, nanmean, nanstd, nanvar
-    numpy.doc.ufuncs : Section "Output arguments"
+    ufuncs-output-type
 
     Notes
     -----
@@ -3478,7 +3478,7 @@ def var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=np._NoValue):
     See Also
     --------
     std, mean, nanmean, nanstd, nanvar
-    numpy.doc.ufuncs : Section "Output arguments"
+    ufuncs-output-type
 
     Notes
     -----
diff --git a/numpy/core/numeric.py b/numpy/core/numeric.py
index 6d25f864b..1e011e2e7 100644
--- a/numpy/core/numeric.py
+++ b/numpy/core/numeric.py
@@ -292,7 +292,7 @@ def full(shape, fill_value, dtype=None, order='C'):
     fill_value : scalar
         Fill value.
     dtype : data-type, optional
-        The desired data-type for the array  The default, `None`, means
+        The desired data-type for the array  The default, None, means
          `np.array(fill_value).dtype`.
     order : {'C', 'F'}, optional
         Whether to store multidimensional data in C- or Fortran-contiguous
@@ -960,6 +960,9 @@ def tensordot(a, b, axes=2):
     two sequences of the same length, with the first axis to sum over given
     first in both sequences, the second axis second, and so forth.
 
+    The shape of the result consists of the non-contracted axes of the
+    first tensor, followed by the non-contracted axes of the second.
+
     Examples
     --------
     A "traditional" example:
@@ -1781,19 +1784,19 @@ def _frombuffer(buf, dtype, shape, order):
 
 
 @set_module('numpy')
-def isscalar(num):
+def isscalar(element):
     """
-    Returns True if the type of `num` is a scalar type.
+    Returns True if the type of `element` is a scalar type.
 
     Parameters
     ----------
-    num : any
+    element : any
         Input argument, can be of any type and shape.
 
     Returns
     -------
     val : bool
-        True if `num` is a scalar type, False if it is not.
+        True if `element` is a scalar type, False if it is not.
 
     See Also
     --------
@@ -1801,10 +1804,14 @@ def isscalar(num):
 
     Notes
     -----
-    In almost all cases ``np.ndim(x) == 0`` should be used instead of this
-    function, as that will also return true for 0d arrays. This is how
-    numpy overloads functions in the style of the ``dx`` arguments to `gradient`
-    and the ``bins`` argument to `histogram`. Some key differences:
+    If you need a stricter way to identify a *numerical* scalar, use
+    ``isinstance(x, numbers.Number)``, as that returns ``False`` for most
+    non-numerical elements such as strings.
+
+    In most cases ``np.ndim(x) == 0`` should be used instead of this function,
+    as that will also return true for 0d arrays. This is how numpy overloads
+    functions in the style of the ``dx`` arguments to `gradient` and the ``bins``
+    argument to `histogram`. Some key differences:
 
     +--------------------------------------+---------------+-------------------+
     | x                                    |``isscalar(x)``|``np.ndim(x) == 0``|
@@ -1852,9 +1859,9 @@ def isscalar(num):
     True
 
     """
-    return (isinstance(num, generic)
-            or type(num) in ScalarType
-            or isinstance(num, numbers.Number))
+    return (isinstance(element, generic)
+            or type(element) in ScalarType
+            or isinstance(element, numbers.Number))
 
 
 @set_module('numpy')
@@ -2091,9 +2098,9 @@ def allclose(a, b, rtol=1.e-5, atol=1.e-8, equal_nan=False):
     `atol` are added together to compare against the absolute difference
     between `a` and `b`.
 
-    If either array contains one or more NaNs, False is returned.
-    Infs are treated as equal if they are in the same place and of the same
-    sign in both arrays.
+    NaNs are treated as equal if they are in the same place and if
+    ``equal_nan=True``.  Infs are treated as equal if they are in the same
+    place and of the same sign in both arrays.
 
     Parameters
     ----------
@@ -2105,7 +2112,7 @@ def allclose(a, b, rtol=1.e-5, atol=1.e-8, equal_nan=False):
         The absolute tolerance parameter (see Notes).
     equal_nan : bool
         Whether to compare NaN's as equal.  If True, NaN's in `a` will be
-        considered equal to NaN's in `b`.
+        considered equal to NaN's in `b` in the output array.
 
         .. versionadded:: 1.10.0
 
diff --git a/numpy/core/shape_base.py b/numpy/core/shape_base.py
index d7e769e62..369d956fb 100644
--- a/numpy/core/shape_base.py
+++ b/numpy/core/shape_base.py
@@ -472,7 +472,7 @@ def _block_check_depths_match(arrays, parent_index=[]):
     first_index : list of int
         The full index of an element from the bottom of the nesting in
         `arrays`. If any element at the bottom is an empty list, this will
-        refer to it, and the last index along the empty axis will be `None`.
+        refer to it, and the last index along the empty axis will be None.
     max_arr_ndim : int
         The maximum of the ndims of the arrays nested in `arrays`.
     final_size: int
diff --git a/numpy/core/src/multiarray/arrayobject.c b/numpy/core/src/multiarray/arrayobject.c
index 4e229e321..5ed5b7635 100644
--- a/numpy/core/src/multiarray/arrayobject.c
+++ b/numpy/core/src/multiarray/arrayobject.c
@@ -557,7 +557,7 @@ PyArray_DebugPrint(PyArrayObject *obj)
     printf(" ndim   : %d\n", fobj->nd);
     printf(" shape  :");
     for (i = 0; i < fobj->nd; ++i) {
-        printf(" %d", (int)fobj->dimensions[i]);
+        printf(" %" NPY_INTP_FMT, fobj->dimensions[i]);
     }
     printf("\n");
 
@@ -567,7 +567,7 @@ PyArray_DebugPrint(PyArrayObject *obj)
     printf(" data   : %p\n", fobj->data);
     printf(" strides:");
     for (i = 0; i < fobj->nd; ++i) {
-        printf(" %d", (int)fobj->strides[i]);
+        printf(" %" NPY_INTP_FMT, fobj->strides[i]);
     }
     printf("\n");
 
diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src
index 5d9e990e8..152a2be9c 100644
--- a/numpy/core/src/multiarray/arraytypes.c.src
+++ b/numpy/core/src/multiarray/arraytypes.c.src
@@ -3078,6 +3078,7 @@ BOOL_argmax(npy_bool *ip, npy_intp n, npy_intp *max_ind,
  * #le = _LESS_THAN_OR_EQUAL*10, npy_half_le, _LESS_THAN_OR_EQUAL*8#
  * #iscomplex = 0*14, 1*3, 0*2#
  * #incr = ip++*14, ip+=2*3, ip++*2#
+ * #isdatetime = 0*17, 1*2#
  */
 static int
 @fname@_argmax(@type@ *ip, npy_intp n, npy_intp *max_ind,
@@ -3103,6 +3104,12 @@ static int
         return 0;
     }
 #endif
+#if @isdatetime@
+    if (mp == NPY_DATETIME_NAT) {
+        /* NaT encountered, it's maximal */
+        return 0;
+    }
+#endif
 
     for (i = 1; i < n; i++) {
         @incr@;
@@ -3122,6 +3129,13 @@ static int
             }
         }
 #else
+#if @isdatetime@
+        if (*ip == NPY_DATETIME_NAT) {
+            /* NaT encountered, it's maximal */
+            *max_ind = i;
+            break;
+        }
+#endif
         if (!@le@(*ip, mp)) {  /* negated, for correct nan handling */
             mp = *ip;
             *max_ind = i;
@@ -3158,16 +3172,19 @@ BOOL_argmin(npy_bool *ip, npy_intp n, npy_intp *min_ind,
  * #fname = BYTE, UBYTE, SHORT, USHORT, INT, UINT,
  *          LONG, ULONG, LONGLONG, ULONGLONG,
  *          HALF, FLOAT, DOUBLE, LONGDOUBLE,
- *          CFLOAT, CDOUBLE, CLONGDOUBLE#
+ *          CFLOAT, CDOUBLE, CLONGDOUBLE,
+ *          DATETIME, TIMEDELTA#
  * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
  *         npy_long, npy_ulong, npy_longlong, npy_ulonglong,
  *         npy_half, npy_float, npy_double, npy_longdouble,
- *         npy_float, npy_double, npy_longdouble#
- * #isfloat = 0*10, 1*7#
- * #isnan = nop*10, npy_half_isnan, npy_isnan*6#
- * #le = _LESS_THAN_OR_EQUAL*10, npy_half_le, _LESS_THAN_OR_EQUAL*6#
- * #iscomplex = 0*14, 1*3#
- * #incr = ip++*14, ip+=2*3#
+ *         npy_float, npy_double, npy_longdouble,
+ *         npy_datetime, npy_timedelta#
+ * #isfloat = 0*10, 1*7, 0*2#
+ * #isnan = nop*10, npy_half_isnan, npy_isnan*6, nop*2#
+ * #le = _LESS_THAN_OR_EQUAL*10, npy_half_le, _LESS_THAN_OR_EQUAL*8#
+ * #iscomplex = 0*14, 1*3, 0*2#
+ * #incr = ip++*14, ip+=2*3, ip++*2#
+ * #isdatetime = 0*17, 1*2#
  */
 static int
 @fname@_argmin(@type@ *ip, npy_intp n, npy_intp *min_ind,
@@ -3193,6 +3210,12 @@ static int
         return 0;
     }
 #endif
+#if @isdatetime@
+    if (mp == NPY_DATETIME_NAT) {
+        /* NaT encountered, it's minimal */
+        return 0;
+    }
+#endif
 
     for (i = 1; i < n; i++) {
         @incr@;
@@ -3212,6 +3235,13 @@ static int
             }
         }
 #else
+#if @isdatetime@
+        if (*ip == NPY_DATETIME_NAT) {
+            /* NaT encountered, it's minimal */
+            *min_ind = i;
+            break;
+        }
+#endif 
         if (!@le@(mp, *ip)) {  /* negated, for correct nan handling */
             mp = *ip;
             *min_ind = i;
@@ -3231,43 +3261,6 @@ static int
 
 #undef _LESS_THAN_OR_EQUAL
 
-/**begin repeat
- *
- * #fname = DATETIME, TIMEDELTA#
- * #type = npy_datetime, npy_timedelta#
- */
-static int
-@fname@_argmin(@type@ *ip, npy_intp n, npy_intp *min_ind,
-        PyArrayObject *NPY_UNUSED(aip))
-{
-    /* NPY_DATETIME_NAT is smaller than every other value, we skip
-     * it for consistency with min().
-     */
-    npy_intp i;
-    @type@ mp = NPY_DATETIME_NAT;
-
-    i = 0;
-    while (i < n && mp == NPY_DATETIME_NAT) {
-        mp = ip[i];
-        i++;
-    }
-    if (i == n) {
-        /* All NaTs: return 0 */
-        *min_ind = 0;
-        return 0;
-    }
-    *min_ind = i - 1;
-    for (; i < n; i++) {
-        if (mp > ip[i] && ip[i] != NPY_DATETIME_NAT) {
-            mp = ip[i];
-            *min_ind = i;
-        }
-    }
-    return 0;
-}
-
-/**end repeat**/
-
 static int
 OBJECT_argmax(PyObject **ip, npy_intp n, npy_intp *max_ind,
               PyArrayObject *NPY_UNUSED(aip))
diff --git a/numpy/core/src/multiarray/conversion_utils.c b/numpy/core/src/multiarray/conversion_utils.c
index 4baa02052..5f0ad5817 100644
--- a/numpy/core/src/multiarray/conversion_utils.c
+++ b/numpy/core/src/multiarray/conversion_utils.c
@@ -667,8 +667,8 @@ PyArray_ConvertClipmodeSequence(PyObject *object, NPY_CLIPMODE *modes, int n)
     if (object && (PyTuple_Check(object) || PyList_Check(object))) {
         if (PySequence_Size(object) != n) {
             PyErr_Format(PyExc_ValueError,
-                    "list of clipmodes has wrong length (%d instead of %d)",
-                    (int)PySequence_Size(object), n);
+                    "list of clipmodes has wrong length (%zd instead of %d)",
+                    PySequence_Size(object), n);
             return NPY_FAIL;
         }
 
diff --git a/numpy/core/src/multiarray/ctors.c b/numpy/core/src/multiarray/ctors.c
index 5174bd889..9b6f59e3a 100644
--- a/numpy/core/src/multiarray/ctors.c
+++ b/numpy/core/src/multiarray/ctors.c
@@ -544,8 +544,8 @@ setArrayFromSequence(PyArrayObject *a, PyObject *s,
      */
     if (slen != PyArray_DIMS(a)[dim] && slen != 1) {
         PyErr_Format(PyExc_ValueError,
-                 "cannot copy sequence with size %d to array axis "
-                 "with dimension %d", (int)slen, (int)PyArray_DIMS(a)[dim]);
+                 "cannot copy sequence with size %zd to array axis "
+                 "with dimension %" NPY_INTP_FMT, slen, PyArray_DIMS(a)[dim]);
         goto fail;
     }
 
@@ -2894,8 +2894,8 @@ PyArray_CopyAsFlat(PyArrayObject *dst, PyArrayObject *src, NPY_ORDER order)
     src_size = PyArray_SIZE(src);
     if (dst_size != src_size) {
         PyErr_Format(PyExc_ValueError,
-                "cannot copy from array of size %d into an array "
-                "of size %d", (int)src_size, (int)dst_size);
+                "cannot copy from array of size %" NPY_INTP_FMT " into an array "
+                "of size %" NPY_INTP_FMT, src_size, dst_size);
         return -1;
     }
 
diff --git a/numpy/core/src/multiarray/datetime.c b/numpy/core/src/multiarray/datetime.c
index d21bb9776..72a3df89c 100644
--- a/numpy/core/src/multiarray/datetime.c
+++ b/numpy/core/src/multiarray/datetime.c
@@ -758,8 +758,8 @@ parse_datetime_extended_unit_from_string(char *str, Py_ssize_t len,
 bad_input:
     if (metastr != NULL) {
         PyErr_Format(PyExc_TypeError,
-                "Invalid datetime metadata string \"%s\" at position %d",
-                metastr, (int)(substr-metastr));
+                "Invalid datetime metadata string \"%s\" at position %zd",
+                metastr, substr-metastr);
     }
     else {
         PyErr_Format(PyExc_TypeError,
@@ -820,8 +820,8 @@ parse_datetime_metadata_from_metastr(char *metastr, Py_ssize_t len,
 bad_input:
     if (substr != metastr) {
         PyErr_Format(PyExc_TypeError,
-                "Invalid datetime metadata string \"%s\" at position %d",
-                metastr, (int)(substr-metastr));
+                "Invalid datetime metadata string \"%s\" at position %zd",
+                metastr, substr - metastr);
     }
     else {
         PyErr_Format(PyExc_TypeError,
@@ -2273,15 +2273,15 @@ convert_pydatetime_to_datetimestruct(PyObject *obj, npy_datetimestruct *out,
 
 invalid_date:
     PyErr_Format(PyExc_ValueError,
-            "Invalid date (%d,%d,%d) when converting to NumPy datetime",
-            (int)out->year, (int)out->month, (int)out->day);
+            "Invalid date (%" NPY_INT64_FMT ",%" NPY_INT32_FMT ",%" NPY_INT32_FMT ") when converting to NumPy datetime",
+            out->year, out->month, out->day);
     return -1;
 
 invalid_time:
     PyErr_Format(PyExc_ValueError,
-            "Invalid time (%d,%d,%d,%d) when converting "
+            "Invalid time (%" NPY_INT32_FMT ",%" NPY_INT32_FMT ",%" NPY_INT32_FMT ",%" NPY_INT32_FMT ") when converting "
             "to NumPy datetime",
-            (int)out->hour, (int)out->min, (int)out->sec, (int)out->us);
+            out->hour, out->min, out->sec, out->us);
     return -1;
 }
 
@@ -3221,18 +3221,6 @@ NPY_NO_EXPORT PyArrayObject *
 datetime_arange(PyObject *start, PyObject *stop, PyObject *step,
                 PyArray_Descr *dtype)
 {
-    PyArray_DatetimeMetaData meta;
-    /*
-     * Both datetime and timedelta are stored as int64, so they can
-     * share value variables.
-     */
-    npy_int64 values[3];
-    PyObject *objs[3];
-    int type_nums[3];
-
-    npy_intp i, length;
-    PyArrayObject *ret;
-    npy_int64 *ret_data;
 
     /*
      * First normalize the input parameters so there is no Py_None,
@@ -3265,6 +3253,8 @@ datetime_arange(PyObject *start, PyObject *stop, PyObject *step,
     /* Check if the units of the given dtype are generic, in which
      * case we use the code path that detects the units
      */
+    int type_nums[3];
+    PyArray_DatetimeMetaData meta;
     if (dtype != NULL) {
         PyArray_DatetimeMetaData *meta_tmp;
 
@@ -3313,6 +3303,7 @@ datetime_arange(PyObject *start, PyObject *stop, PyObject *step,
     }
 
     /* Set up to convert the objects to a common datetime unit metadata */
+    PyObject *objs[3];
     objs[0] = start;
     objs[1] = stop;
     objs[2] = step;
@@ -3333,11 +3324,22 @@ datetime_arange(PyObject *start, PyObject *stop, PyObject *step,
         type_nums[2] = NPY_TIMEDELTA;
     }
 
-    /* Convert all the arguments */
+    /* Convert all the arguments
+     *
+     * Both datetime and timedelta are stored as int64, so they can
+     * share value variables.
+     */
+    npy_int64 values[3];
     if (convert_pyobjects_to_datetimes(3, objs, type_nums,
                                 NPY_SAME_KIND_CASTING, values, &meta) < 0) {
         return NULL;
     }
+    /* If no start was provided, default to 0 */
+    if (start == NULL) {
+        /* enforced above */
+        assert(type_nums[0] == NPY_TIMEDELTA);
+        values[0] = 0;
+    }
 
     /* If no step was provided, default to 1 */
     if (step == NULL) {
@@ -3362,6 +3364,7 @@ datetime_arange(PyObject *start, PyObject *stop, PyObject *step,
     }
 
     /* Calculate the array length */
+    npy_intp length;
     if (values[2] > 0 && values[1] > values[0]) {
         length = (values[1] - values[0] + (values[2] - 1)) / values[2];
     }
@@ -3389,19 +3392,20 @@ datetime_arange(PyObject *start, PyObject *stop, PyObject *step,
     }
 
     /* Create the result array */
-    ret = (PyArrayObject *)PyArray_NewFromDescr(
-                            &PyArray_Type, dtype, 1, &length, NULL,
-                            NULL, 0, NULL);
+    PyArrayObject *ret = (PyArrayObject *)PyArray_NewFromDescr(
+            &PyArray_Type, dtype, 1, &length, NULL,
+            NULL, 0, NULL);
+
     if (ret == NULL) {
         return NULL;
     }
 
     if (length > 0) {
         /* Extract the data pointer */
-        ret_data = (npy_int64 *)PyArray_DATA(ret);
+        npy_int64 *ret_data = (npy_int64 *)PyArray_DATA(ret);
 
         /* Create the timedeltas or datetimes */
-        for (i = 0; i < length; ++i) {
+        for (npy_intp i = 0; i < length; ++i) {
             *ret_data = values[0];
             values[0] += values[2];
             ret_data++;
diff --git a/numpy/core/src/multiarray/datetime_strings.c b/numpy/core/src/multiarray/datetime_strings.c
index 95b7bb3dc..dfc01494f 100644
--- a/numpy/core/src/multiarray/datetime_strings.c
+++ b/numpy/core/src/multiarray/datetime_strings.c
@@ -743,8 +743,8 @@ finish:
 
 parse_error:
     PyErr_Format(PyExc_ValueError,
-            "Error parsing datetime string \"%s\" at position %d",
-            str, (int)(substr-str));
+            "Error parsing datetime string \"%s\" at position %zd",
+            str, substr - str);
     return -1;
 
 error:
diff --git a/numpy/core/src/multiarray/descriptor.c b/numpy/core/src/multiarray/descriptor.c
index 734255a9d..522b69307 100644
--- a/numpy/core/src/multiarray/descriptor.c
+++ b/numpy/core/src/multiarray/descriptor.c
@@ -1149,8 +1149,8 @@ _convert_from_dict(PyObject *obj, int align)
             }
             Py_DECREF(off);
             if (offset < 0) {
-                PyErr_Format(PyExc_ValueError, "offset %d cannot be negative",
-                             (int)offset);
+                PyErr_Format(PyExc_ValueError, "offset %ld cannot be negative",
+                             offset);
                 Py_DECREF(tup);
                 Py_DECREF(ind);
                 goto fail;
@@ -1164,10 +1164,10 @@ _convert_from_dict(PyObject *obj, int align)
             /* If align=True, enforce field alignment */
             if (align && offset % newdescr->alignment != 0) {
                 PyErr_Format(PyExc_ValueError,
-                        "offset %d for NumPy dtype with fields is "
+                        "offset %ld for NumPy dtype with fields is "
                         "not divisible by the field alignment %d "
                         "with align=True",
-                        (int)offset, (int)newdescr->alignment);
+                        offset, newdescr->alignment);
                 ret = NPY_FAIL;
             }
             else if (offset + newdescr->elsize > totalsize) {
@@ -1286,7 +1286,7 @@ _convert_from_dict(PyObject *obj, int align)
             PyErr_Format(PyExc_ValueError,
                     "NumPy dtype descriptor requires %d bytes, "
                     "cannot override to smaller itemsize of %d",
-                    (int)new->elsize, (int)itemsize);
+                    new->elsize, itemsize);
             Py_DECREF(new);
             goto fail;
         }
@@ -1295,7 +1295,7 @@ _convert_from_dict(PyObject *obj, int align)
             PyErr_Format(PyExc_ValueError,
                     "NumPy dtype descriptor requires alignment of %d bytes, "
                     "which is not divisible into the specified itemsize %d",
-                    (int)new->alignment, (int)itemsize);
+                    new->alignment, itemsize);
             Py_DECREF(new);
             goto fail;
         }
@@ -1385,7 +1385,6 @@ NPY_NO_EXPORT int
 PyArray_DescrConverter(PyObject *obj, PyArray_Descr **at)
 {
     int check_num = NPY_NOTYPE + 10;
-    PyObject *item;
     int elsize = 0;
     char endian = '=';
 
@@ -1664,16 +1663,22 @@ finish:
         PyErr_Clear();
         /* Now check to see if the object is registered in typeDict */
         if (typeDict != NULL) {
-            item = PyDict_GetItem(typeDict, obj);
+            PyObject *item = NULL;
 #if defined(NPY_PY3K)
-            if (!item && PyBytes_Check(obj)) {
+            if (PyBytes_Check(obj)) {
                 PyObject *tmp;
                 tmp = PyUnicode_FromEncodedObject(obj, "ascii", "strict");
-                if (tmp != NULL) {
-                    item = PyDict_GetItem(typeDict, tmp);
-                    Py_DECREF(tmp);
+                if (tmp == NULL) {
+                    goto fail;
                 }
+                item = PyDict_GetItem(typeDict, tmp);
+                Py_DECREF(tmp);
+            }
+            else {
+                item = PyDict_GetItem(typeDict, obj);
             }
+#else
+            item = PyDict_GetItem(typeDict, obj);
 #endif
             if (item) {
                 /* Check for a deprecated Numeric-style typecode */
diff --git a/numpy/core/src/multiarray/mapping.c b/numpy/core/src/multiarray/mapping.c
index 247864775..8dcd28c84 100644
--- a/numpy/core/src/multiarray/mapping.c
+++ b/numpy/core/src/multiarray/mapping.c
@@ -1198,9 +1198,9 @@ array_assign_boolean_subscript(PyArrayObject *self,
         if (size != PyArray_DIMS(v)[0]) {
             PyErr_Format(PyExc_ValueError,
                     "NumPy boolean array indexing assignment "
-                    "cannot assign %d input values to "
-                    "the %d output values where the mask is true",
-                    (int)PyArray_DIMS(v)[0], (int)size);
+                    "cannot assign %" NPY_INTP_FMT " input values to "
+                    "the %" NPY_INTP_FMT " output values where the mask is true",
+                    PyArray_DIMS(v)[0], size);
             return -1;
         }
         v_stride = PyArray_STRIDES(v)[0];
diff --git a/numpy/core/src/multiarray/nditer_api.c b/numpy/core/src/multiarray/nditer_api.c
index db0bfcece..e7fe0fa50 100644
--- a/numpy/core/src/multiarray/nditer_api.c
+++ b/numpy/core/src/multiarray/nditer_api.c
@@ -371,8 +371,8 @@ NpyIter_ResetToIterIndexRange(NpyIter *iter,
         }
         if (errmsg == NULL) {
             PyErr_Format(PyExc_ValueError,
-                    "Out-of-bounds range [%d, %d) passed to "
-                    "ResetToIterIndexRange", (int)istart, (int)iend);
+                    "Out-of-bounds range [%" NPY_INTP_FMT ", %" NPY_INTP_FMT ") passed to "
+                    "ResetToIterIndexRange", istart, iend);
         }
         else {
             *errmsg = "Out-of-bounds range passed to ResetToIterIndexRange";
@@ -382,8 +382,8 @@ NpyIter_ResetToIterIndexRange(NpyIter *iter,
     else if (iend < istart) {
         if (errmsg == NULL) {
             PyErr_Format(PyExc_ValueError,
-                    "Invalid range [%d, %d) passed to ResetToIterIndexRange",
-                    (int)istart, (int)iend);
+                    "Invalid range [%" NPY_INTP_FMT ", %" NPY_INTP_FMT ") passed to ResetToIterIndexRange",
+                    istart, iend);
         }
         else {
             *errmsg = "Invalid range passed to ResetToIterIndexRange";
@@ -1429,8 +1429,8 @@ NpyIter_DebugPrint(NpyIter *iter)
         printf("REUSE_REDUCE_LOOPS ");
 
     printf("\n");
-    printf("| NDim: %d\n", (int)ndim);
-    printf("| NOp: %d\n", (int)nop);
+    printf("| NDim: %d\n", ndim);
+    printf("| NOp: %d\n", nop);
     if (NIT_MASKOP(iter) >= 0) {
         printf("| MaskOp: %d\n", (int)NIT_MASKOP(iter));
     }
diff --git a/numpy/core/src/multiarray/nditer_constr.c b/numpy/core/src/multiarray/nditer_constr.c
index d40836dc2..5e770338d 100644
--- a/numpy/core/src/multiarray/nditer_constr.c
+++ b/numpy/core/src/multiarray/nditer_constr.c
@@ -154,7 +154,7 @@ NpyIter_AdvancedNew(int nop, PyArrayObject **op_in, npy_uint32 flags,
     if (nop > NPY_MAXARGS) {
         PyErr_Format(PyExc_ValueError,
             "Cannot construct an iterator with more than %d operands "
-            "(%d were requested)", (int)NPY_MAXARGS, (int)nop);
+            "(%d were requested)", NPY_MAXARGS, nop);
         return NULL;
     }
 
@@ -810,7 +810,7 @@ npyiter_check_op_axes(int nop, int oa_ndim, int **op_axes,
         PyErr_Format(PyExc_ValueError,
                 "Cannot construct an iterator with more than %d dimensions "
                 "(%d were requested for op_axes)",
-                (int)NPY_MAXDIMS, oa_ndim);
+                NPY_MAXDIMS, oa_ndim);
         return 0;
     }
     if (op_axes == NULL) {
@@ -826,14 +826,14 @@ npyiter_check_op_axes(int nop, int oa_ndim, int **op_axes,
         if (axes != NULL) {
             memset(axes_dupcheck, 0, NPY_MAXDIMS);
             for (idim = 0; idim < oa_ndim; ++idim) {
-                npy_intp i = axes[idim];
+                int i = axes[idim];
                 if (i >= 0) {
                     if (i >= NPY_MAXDIMS) {
                         PyErr_Format(PyExc_ValueError,
                                 "The 'op_axes' provided to the iterator "
                                 "constructor for operand %d "
                                 "contained invalid "
-                                "values %d", (int)iop, (int)i);
+                                "values %d", iop, i);
                         return 0;
                     }
                     else if (axes_dupcheck[i] == 1) {
@@ -841,7 +841,7 @@ npyiter_check_op_axes(int nop, int oa_ndim, int **op_axes,
                                 "The 'op_axes' provided to the iterator "
                                 "constructor for operand %d "
                                 "contained duplicate "
-                                "value %d", (int)iop, (int)i);
+                                "value %d", iop, i);
                         return 0;
                     }
                     else {
@@ -1311,7 +1311,7 @@ npyiter_check_casting(int nop, PyArrayObject **op,
                 PyObject *errmsg;
                 errmsg = PyUString_FromFormat(
                         "Iterator operand %d dtype could not be cast from ",
-                        (int)iop);
+                        iop);
                 PyUString_ConcatAndDel(&errmsg,
                         PyObject_Repr((PyObject *)PyArray_DESCR(op[iop])));
                 PyUString_ConcatAndDel(&errmsg,
@@ -1342,7 +1342,7 @@ npyiter_check_casting(int nop, PyArrayObject **op,
                 PyUString_ConcatAndDel(&errmsg,
                         PyUString_FromFormat(", the operand %d dtype, "
                                 "according to the rule %s",
-                                (int)iop,
+                                iop,
                                 npyiter_casting_to_string(casting)));
                 PyErr_SetObject(PyExc_TypeError, errmsg);
                 Py_DECREF(errmsg);
@@ -1500,8 +1500,8 @@ npyiter_fill_axisdata(NpyIter *iter, npy_uint32 flags, npyiter_opitflags *op_itf
                                     "Iterator input op_axes[%d][%d] (==%d) "
                                     "is not a valid axis of op[%d], which "
                                     "has %d dimensions ",
-                                    (int)iop, (int)(ndim-idim-1), (int)i,
-                                    (int)iop, (int)ondim);
+                                    iop, (ndim-idim-1), i,
+                                    iop, ondim);
                             return 0;
                         }
                     }
diff --git a/numpy/core/src/multiarray/nditer_pywrap.c b/numpy/core/src/multiarray/nditer_pywrap.c
index 4b9d41aa4..246f9d382 100644
--- a/numpy/core/src/multiarray/nditer_pywrap.c
+++ b/numpy/core/src/multiarray/nditer_pywrap.c
@@ -2016,7 +2016,7 @@ npyiter_seq_item(NewNpyArrayIterObject *self, Py_ssize_t i)
 
     if (i < 0 || i >= nop) {
         PyErr_Format(PyExc_IndexError,
-                "Iterator operand index %d is out of bounds", (int)i_orig);
+                "Iterator operand index %zd is out of bounds", i_orig);
         return NULL;
     }
 
@@ -2030,7 +2030,7 @@ npyiter_seq_item(NewNpyArrayIterObject *self, Py_ssize_t i)
      */
     if (!self->readflags[i]) {
         PyErr_Format(PyExc_RuntimeError,
-                "Iterator operand %d is write-only", (int)i);
+                "Iterator operand %zd is write-only", i);
         return NULL;
     }
 #endif
@@ -2147,12 +2147,12 @@ npyiter_seq_ass_item(NewNpyArrayIterObject *self, Py_ssize_t i, PyObject *v)
 
     if (i < 0 || i >= nop) {
         PyErr_Format(PyExc_IndexError,
-                "Iterator operand index %d is out of bounds", (int)i_orig);
+                "Iterator operand index %zd is out of bounds", i_orig);
         return -1;
     }
     if (!self->writeflags[i]) {
         PyErr_Format(PyExc_RuntimeError,
-                "Iterator operand %d is not writeable", (int)i_orig);
+                "Iterator operand %zd is not writeable", i_orig);
         return -1;
     }
 
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 5443223ab..e6d8eca0d 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1294,10 +1294,10 @@ NPY_NO_EXPORT void
         const @type@ in1 = *(@type@ *)ip1;
         const @type@ in2 = *(@type@ *)ip2;
         if (in1 == NPY_DATETIME_NAT) {
-            *((@type@ *)op1) = in2;
+            *((@type@ *)op1) = in1;
         }
         else if (in2 == NPY_DATETIME_NAT) {
-            *((@type@ *)op1) = in1;
+            *((@type@ *)op1) = in2;
         }
         else {
             *((@type@ *)op1) = (in1 @OP@ in2) ? in1 : in2;
@@ -1635,6 +1635,30 @@ NPY_NO_EXPORT void
 /**end repeat**/
 
 /**begin repeat
+ *  #func = rint, ceil, floor, trunc#
+ *  #scalarf = npy_rint, npy_ceil, npy_floor, npy_trunc#
+ */
+
+/**begin repeat1
+*  #TYPE = FLOAT, DOUBLE#
+*  #type = npy_float, npy_double#
+*  #typesub = f, #
+*/
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+@TYPE@_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+    UNARY_LOOP {
+        const @type@ in1 = *(@type@ *)ip1;
+        *(@type@ *)op1 = @scalarf@@typesub@(in1);
+    }
+}
+
+
+/**end repeat1**/
+/**end repeat**/
+
+/**begin repeat
  *  #func = sin, cos, exp, log#
  *  #scalarf = npy_sinf, npy_cosf, npy_expf, npy_logf#
  */
@@ -1657,6 +1681,78 @@ FLOAT_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSE
  */
 
 /**begin repeat1
+ *  #TYPE = FLOAT, DOUBLE#
+ *  #type = npy_float, npy_double#
+ *  #typesub = f, #
+ */
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+@TYPE@_sqrt_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+    if (!run_unary_@isa@_sqrt_@TYPE@(args, dimensions, steps)) {
+        UNARY_LOOP {
+            const @type@ in1 = *(@type@ *)ip1;
+            *(@type@ *)op1 = npy_sqrt@typesub@(in1);
+        }
+    }
+}
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+@TYPE@_absolute_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+    if (!run_unary_@isa@_absolute_@TYPE@(args, dimensions, steps)) {
+        UNARY_LOOP {
+            const @type@ in1 = *(@type@ *)ip1;
+            const @type@ tmp = in1 > 0 ? in1 : -in1;
+            /* add 0 to clear -0.0 */
+            *((@type@ *)op1) = tmp + 0;
+        }
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+@TYPE@_square_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+    if (!run_unary_@isa@_square_@TYPE@(args, dimensions, steps)) {
+        UNARY_LOOP {
+            const @type@ in1 = *(@type@ *)ip1;
+            *(@type@ *)op1 = in1*in1;
+        }
+    }
+}
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+@TYPE@_reciprocal_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+    if (!run_unary_@isa@_reciprocal_@TYPE@(args, dimensions, steps)) {
+        UNARY_LOOP {
+            const @type@ in1 = *(@type@ *)ip1;
+            *(@type@ *)op1 = 1.0f/in1;
+        }
+    }
+}
+
+/**begin repeat2
+ *  #func = rint, ceil, floor, trunc#
+ *  #scalarf = npy_rint, npy_ceil, npy_floor, npy_trunc#
+ */
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+@TYPE@_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+    if (!run_unary_@isa@_@func@_@TYPE@(args, dimensions, steps)) {
+        UNARY_LOOP {
+            const @type@ in1 = *(@type@ *)ip1;
+            *(@type@ *)op1 = @scalarf@@typesub@(in1);
+        }
+    }
+}
+
+/**end repeat2**/
+/**end repeat1**/
+
+/**begin repeat1
  *  #func = exp, log#
  *  #scalarf = npy_expf, npy_logf#
  */
@@ -1706,10 +1802,9 @@ FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY
 }
 
 /**end repeat1**/
-
-
 /**end repeat**/
 
+
 /**begin repeat
  * Float types
  *  #type = npy_float, npy_double, npy_longdouble, npy_float#
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 5070ab38b..0ef14a809 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -7,14 +7,12 @@
 #define _NPY_UMATH_LOOPS_H_
 
 #define BOOL_invert BOOL_logical_not
-#define BOOL_negative BOOL_logical_not
 #define BOOL_add BOOL_logical_or
 #define BOOL_bitwise_and BOOL_logical_and
 #define BOOL_bitwise_or BOOL_logical_or
 #define BOOL_logical_xor BOOL_not_equal
 #define BOOL_bitwise_xor BOOL_logical_xor
 #define BOOL_multiply BOOL_logical_and
-#define BOOL_subtract BOOL_logical_xor
 #define BOOL_maximum BOOL_logical_or
 #define BOOL_minimum BOOL_logical_and
 #define BOOL_fmax BOOL_maximum
@@ -175,6 +173,19 @@ NPY_NO_EXPORT void
  */
 NPY_NO_EXPORT void
 @TYPE@_sqrt(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+/**begin repeat1
+ * #isa = avx512f, fma#
+ */
+
+/**begin repeat2
+ * #func = sqrt, absolute, square, reciprocal#
+ */
+NPY_NO_EXPORT void
+@TYPE@_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+/**end repeat2**/
+/**end repeat1**/
 /**end repeat**/
 
 /**begin repeat
@@ -194,6 +205,26 @@ FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY
 /**end repeat**/
 
 /**begin repeat
+ *  #func = rint, ceil, floor, trunc#
+ */
+
+/**begin repeat1
+*  #TYPE = FLOAT, DOUBLE#
+*/
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+@TYPE@_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data));
+
+/**begin repeat2
+ * #isa = avx512f, fma#
+ */
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+@TYPE@_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data));
+/**end repeat2**/
+/**end repeat1**/
+/**end repeat**/
+
+/**begin repeat
  * Float types
  *  #TYPE = HALF, FLOAT, DOUBLE, LONGDOUBLE#
  *  #c = f, f, , l#
diff --git a/numpy/core/src/umath/override.c b/numpy/core/src/umath/override.c
index 8d67f96ac..43bed425c 100644
--- a/numpy/core/src/umath/override.c
+++ b/numpy/core/src/umath/override.c
@@ -494,32 +494,18 @@ PyUFunc_CheckOverride(PyUFuncObject *ufunc, char *method,
             }
             else {
                 /* not a tuple */
-                if (nout > 1 && DEPRECATE("passing a single argument to the "
-                                          "'out' keyword argument of a "
-                                          "ufunc with\n"
-                                          "more than one output will "
-                                          "result in an error in the "
-                                          "future") < 0) {
-                    /*
-                     * If the deprecation is removed, also remove the loop
-                     * below setting tuple items to None (but keep this future
-                     * error message.)
-                     */
+                if (nout > 1) {
                     PyErr_SetString(PyExc_TypeError,
                                     "'out' must be a tuple of arguments");
                     goto fail;
                 }
                 if (out != Py_None) {
                     /* not already a tuple and not None */
-                    PyObject *out_tuple = PyTuple_New(nout);
+                    PyObject *out_tuple = PyTuple_New(1);
 
                     if (out_tuple == NULL) {
                         goto fail;
                     }
-                    for (i = 1; i < nout; i++) {
-                        Py_INCREF(Py_None);
-                        PyTuple_SET_ITEM(out_tuple, i, Py_None);
-                    }
                     /* out was borrowed ref; make it permanent */
                     Py_INCREF(out);
                     /* steals reference */
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 88e5e1f1b..74f52cc9d 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -139,6 +139,37 @@ abs_ptrdiff(char *a, char *b)
 /* prototypes */
 
 /**begin repeat1
+ * #type = npy_float, npy_double#
+ * #TYPE = FLOAT, DOUBLE#
+ */
+
+/**begin repeat2
+ *  #func = sqrt, absolute, square, reciprocal, rint, floor, ceil, trunc#
+ */
+
+#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
+static NPY_INLINE NPY_GCC_TARGET_@ISA@ void
+@ISA@_@func@_@TYPE@(@type@ *, @type@ *, const npy_intp n, const npy_intp stride);
+#endif
+
+static NPY_INLINE int
+run_unary_@isa@_@func@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
+{
+#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
+    if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(@type@), @REGISTER_SIZE@)) {
+        @ISA@_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0], steps[0]);
+        return 1;
+    }
+    else
+        return 0;
+#endif
+    return 0;
+}
+
+/**end repeat2**/
+/**end repeat1**/
+
+/**begin repeat1
  * #func = exp, log#
  */
 
@@ -185,7 +216,6 @@ run_unary_@isa@_sincos_FLOAT(char **args, npy_intp *dimensions, npy_intp *steps,
 /**end repeat**/
 
 
-
 /**begin repeat
  * Float types
  *  #type = npy_float, npy_double, npy_longdouble#
@@ -1144,41 +1174,76 @@ sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n)
 
 #if defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_get_full_load_mask(void)
+fma_get_full_load_mask_ps(void)
 {
     return _mm256_set1_ps(-1.0);
 }
 
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
+fma_get_full_load_mask_pd(void)
+{
+    return _mm256_castpd_si256(_mm256_set1_pd(-1.0));
+}
+
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_get_partial_load_mask(const npy_int num_lanes, const npy_int total_elem)
+fma_get_partial_load_mask_ps(const npy_int num_elem, const npy_int num_lanes)
 {
     float maskint[16] = {-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,
                             1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
-    float* addr = maskint + total_elem - num_lanes;
+    float* addr = maskint + num_lanes - num_elem;
     return _mm256_loadu_ps(addr);
 }
 
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
+fma_get_partial_load_mask_pd(const npy_int num_elem, const npy_int num_lanes)
+{
+    npy_int maskint[16] = {-1,-1,-1,-1,-1,-1,-1,-1,1,1,1,1,1,1,1,1};
+    npy_int* addr = maskint + 2*num_lanes - 2*num_elem;
+    return _mm256_loadu_si256((__m256i*) addr);
+}
+
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_masked_gather(__m256 src,
-                   npy_float* addr,
-                   __m256i vindex,
-                   __m256 mask)
+fma_masked_gather_ps(__m256 src,
+                     npy_float* addr,
+                     __m256i vindex,
+                     __m256 mask)
 {
     return _mm256_mask_i32gather_ps(src, addr, vindex, mask, 4);
 }
 
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
+fma_masked_gather_pd(__m256d src,
+                     npy_double* addr,
+                     __m128i vindex,
+                     __m256d mask)
+{
+    return _mm256_mask_i32gather_pd(src, addr, vindex, mask, 8);
+}
+
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_masked_load(__m256 mask, npy_float* addr)
+fma_masked_load_ps(__m256 mask, npy_float* addr)
 {
     return _mm256_maskload_ps(addr, _mm256_cvtps_epi32(mask));
 }
 
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
+fma_masked_load_pd(__m256i mask, npy_double* addr)
+{
+    return _mm256_maskload_pd(addr, mask);
+}
+
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_set_masked_lanes(__m256 x, __m256 val, __m256 mask)
+fma_set_masked_lanes_ps(__m256 x, __m256 val, __m256 mask)
 {
     return _mm256_blendv_ps(x, val, mask);
 }
 
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
+fma_set_masked_lanes_pd(__m256d x, __m256d val, __m256d mask)
+{
+    return _mm256_blendv_pd(x, val, mask);
+}
+
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
 fma_blend(__m256 x, __m256 y, __m256 ymask)
 {
@@ -1186,6 +1251,18 @@ fma_blend(__m256 x, __m256 y, __m256 ymask)
 }
 
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_invert_mask_ps(__m256 ymask)
+{
+    return _mm256_andnot_ps(ymask, _mm256_set1_ps(-1.0));
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
+fma_invert_mask_pd(__m256i ymask)
+{
+    return _mm256_andnot_si256(ymask, _mm256_set1_epi32(0xFFFFFFFF));
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
 fma_should_calculate_sine(__m256i k, __m256i andop, __m256i cmp)
 {
    return _mm256_cvtepi32_ps(
@@ -1290,42 +1367,115 @@ fma_scalef_ps(__m256 poly, __m256 quadrant)
      }
 }
 
+/**begin repeat
+ *  #vsub = ps, pd#
+ *  #vtype = __m256, __m256d#
+ */
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
+fma_abs_@vsub@(@vtype@ x)
+{
+    return _mm256_andnot_@vsub@(_mm256_set1_@vsub@(-0.0), x);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
+fma_reciprocal_@vsub@(@vtype@ x)
+{
+    return _mm256_div_@vsub@(_mm256_set1_@vsub@(1.0f), x);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
+fma_rint_@vsub@(@vtype@ x)
+{
+    return _mm256_round_@vsub@(x, _MM_FROUND_TO_NEAREST_INT);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
+fma_floor_@vsub@(@vtype@ x)
+{
+    return _mm256_round_@vsub@(x, _MM_FROUND_TO_NEG_INF);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
+fma_ceil_@vsub@(@vtype@ x)
+{
+    return _mm256_round_@vsub@(x, _MM_FROUND_TO_POS_INF);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
+fma_trunc_@vsub@(@vtype@ x)
+{
+    return _mm256_round_@vsub@(x, _MM_FROUND_TO_ZERO);
+}
+/**end repeat**/
 #endif
 
 #if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
-avx512_get_full_load_mask(void)
+avx512_get_full_load_mask_ps(void)
 {
     return 0xFFFF;
 }
 
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
+avx512_get_full_load_mask_pd(void)
+{
+    return 0xFF;
+}
+
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
-avx512_get_partial_load_mask(const npy_int num_elem, const npy_int total_elem)
+avx512_get_partial_load_mask_ps(const npy_int num_elem, const npy_int total_elem)
 {
     return (0x0001 << num_elem) - 0x0001;
 }
 
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
+avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem)
+{
+    return (0x01 << num_elem) - 0x01;
+}
+
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
-avx512_masked_gather(__m512 src,
-                     npy_float* addr,
-                     __m512i vindex,
-                     __mmask16 kmask)
+avx512_masked_gather_ps(__m512 src,
+                        npy_float* addr,
+                        __m512i vindex,
+                        __mmask16 kmask)
 {
     return _mm512_mask_i32gather_ps(src, kmask, vindex, addr, 4);
 }
 
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
+avx512_masked_gather_pd(__m512d src,
+                        npy_double* addr,
+                        __m256i vindex,
+                        __mmask8 kmask)
+{
+    return _mm512_mask_i32gather_pd(src, kmask, vindex, addr, 8);
+}
+
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
-avx512_masked_load(__mmask16 mask, npy_float* addr)
+avx512_masked_load_ps(__mmask16 mask, npy_float* addr)
 {
     return _mm512_maskz_loadu_ps(mask, (__m512 *)addr);
 }
 
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
+avx512_masked_load_pd(__mmask8 mask, npy_double* addr)
+{
+    return _mm512_maskz_loadu_pd(mask, (__m512d *)addr);
+}
+
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
-avx512_set_masked_lanes(__m512 x, __m512 val, __mmask16 mask)
+avx512_set_masked_lanes_ps(__m512 x, __m512 val, __mmask16 mask)
 {
     return _mm512_mask_blend_ps(mask, x, val);
 }
 
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
+avx512_set_masked_lanes_pd(__m512d x, __m512d val, __mmask8 mask)
+{
+    return _mm512_mask_blend_pd(mask, x, val);
+}
+
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
 avx512_blend(__m512 x, __m512 y, __mmask16 ymask)
 {
@@ -1333,6 +1483,18 @@ avx512_blend(__m512 x, __m512 y, __mmask16 ymask)
 }
 
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
+avx512_invert_mask_ps(__mmask16 ymask)
+{
+    return _mm512_knot(ymask);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
+avx512_invert_mask_pd(__mmask8 ymask)
+{
+    return _mm512_knot(ymask);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
 avx512_should_calculate_sine(__m512i k, __m512i andop, __m512i cmp)
 {
     return _mm512_cmpeq_epi32_mask(_mm512_and_epi32(k, andop), cmp);
@@ -1361,6 +1523,49 @@ avx512_scalef_ps(__m512 poly, __m512 quadrant)
 {
     return _mm512_scalef_ps(poly, quadrant);
 }
+/**begin repeat
+ *  #vsub  = ps, pd#
+ *  #epi_vsub  = epi32, epi64#
+ *  #vtype = __m512, __m512d#
+ *  #and_const = 0x7fffffff, 0x7fffffffffffffffLL#
+ */
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+avx512_abs_@vsub@(@vtype@ x)
+{
+    return (@vtype@) _mm512_and_@epi_vsub@((__m512i) x,
+				    _mm512_set1_@epi_vsub@ (@and_const@));
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+avx512_reciprocal_@vsub@(@vtype@ x)
+{
+    return _mm512_div_@vsub@(_mm512_set1_@vsub@(1.0f), x);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+avx512_rint_@vsub@(@vtype@ x)
+{
+    return _mm512_roundscale_@vsub@(x, 0x08);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+avx512_floor_@vsub@(@vtype@ x)
+{
+    return _mm512_roundscale_@vsub@(x, 0x09);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+avx512_ceil_@vsub@(@vtype@ x)
+{
+    return _mm512_roundscale_@vsub@(x, 0x0A);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
+avx512_trunc_@vsub@(@vtype@ x)
+{
+    return _mm512_roundscale_@vsub@(x, 0x0B);
+}
+/**end repeat**/
 #endif
 
 /**begin repeat
@@ -1438,7 +1643,187 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
     sin = @fmadd@(sin, x, x);
     return sin;
 }
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
+@isa@_sqrt_ps(@vtype@ x)
+{
+    return _mm@vsize@_sqrt_ps(x);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
+@isa@_sqrt_pd(@vtype@d x)
+{
+    return _mm@vsize@_sqrt_pd(x);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
+@isa@_square_ps(@vtype@ x)
+{
+    return _mm@vsize@_mul_ps(x,x);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
+@isa@_square_pd(@vtype@d x)
+{
+    return _mm@vsize@_mul_pd(x,x);
+}
+
+#endif
+/**end repeat**/
+
+
+/**begin repeat
+ * #ISA = FMA, AVX512F#
+ * #isa = fma, avx512#
+ * #vsize = 256, 512#
+ * #BYTES = 32, 64#
+ * #cvtps_epi32 = _mm256_cvtps_epi32, #
+ * #mask = __m256, __mmask16#
+ * #vsub = , _mask#
+ * #vtype = __m256, __m512#
+ * #cvtps_epi32 = _mm256_cvtps_epi32, #
+ * #masked_store = _mm256_maskstore_ps, _mm512_mask_storeu_ps#
+ * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
+ */
+
+/**begin repeat1
+ *  #func = sqrt, absolute, square, reciprocal, rint, ceil, floor, trunc#
+ *  #vectorf = sqrt, abs, square, reciprocal, rint, ceil, floor, trunc#
+ *  #replace_0_with_1 = 0, 0, 0, 1, 0, 0, 0, 0#
+ */
+
+#if defined @CHK@
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
+@ISA@_@func@_FLOAT(npy_float* op,
+                   npy_float* ip,
+                   const npy_intp array_size,
+                   const npy_intp steps)
+{
+    const npy_intp stride = steps/sizeof(npy_float);
+    const npy_int num_lanes = @BYTES@/sizeof(npy_float);
+    npy_intp num_remaining_elements = array_size;
+    @vtype@ ones_f = _mm@vsize@_set1_ps(1.0f);
+    @mask@ load_mask = @isa@_get_full_load_mask_ps();
+#if @replace_0_with_1@
+    @mask@ inv_load_mask = @isa@_invert_mask_ps(load_mask);
+#endif
+    npy_int indexarr[16];
+    for (npy_int ii = 0; ii < 16; ii++) {
+        indexarr[ii] = ii*stride;
+    }
+    @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]);
+
+    while (num_remaining_elements > 0) {
+        if (num_remaining_elements < num_lanes) {
+            load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements,
+                                                       num_lanes);
+#if @replace_0_with_1@
+            inv_load_mask = @isa@_invert_mask_ps(load_mask);
+#endif
+        }
+        @vtype@ x;
+        if (stride == 1) {
+            x = @isa@_masked_load_ps(load_mask, ip);
+#if @replace_0_with_1@
+            /*
+             * Replace masked elements with 1.0f to avoid divide by zero fp
+             * exception in reciprocal
+             */
+            x = @isa@_set_masked_lanes_ps(x, ones_f, inv_load_mask);
+#endif
+        }
+        else {
+            x = @isa@_masked_gather_ps(ones_f, ip, vindex, load_mask);
+        }
+        @vtype@ out = @isa@_@vectorf@_ps(x);
+        @masked_store@(op, @cvtps_epi32@(load_mask), out);
+
+        ip += num_lanes*stride;
+        op += num_lanes;
+        num_remaining_elements -= num_lanes;
+    }
+}
+#endif
+/**end repeat1**/
+/**end repeat**/
+
+/**begin repeat
+ * #ISA = FMA, AVX512F#
+ * #isa = fma, avx512#
+ * #vsize = 256, 512#
+ * #BYTES = 32, 64#
+ * #cvtps_epi32 = _mm256_cvtps_epi32, #
+ * #mask = __m256i, __mmask8#
+ * #vsub = , _mask#
+ * #vtype = __m256d, __m512d#
+ * #vindextype = __m128i, __m256i#
+ * #vindexsize = 128, 256#
+ * #vindexload = _mm_loadu_si128, _mm256_loadu_si256#
+ * #cvtps_epi32 = _mm256_cvtpd_epi32, #
+ * #castmask = _mm256_castsi256_pd, #
+ * #masked_store = _mm256_maskstore_pd, _mm512_mask_storeu_pd#
+ * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
+ */
+
+/**begin repeat1
+ *  #func = sqrt, absolute, square, reciprocal, rint, ceil, floor, trunc#
+ *  #vectorf = sqrt, abs, square, reciprocal, rint, ceil, floor, trunc#
+ *  #replace_0_with_1 = 0, 0, 0, 1, 0, 0, 0, 0#
+ */
+
+#if defined @CHK@
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
+@ISA@_@func@_DOUBLE(npy_double* op,
+                    npy_double* ip,
+                    const npy_intp array_size,
+                    const npy_intp steps)
+{
+    const npy_intp stride = steps/sizeof(npy_double);
+    const npy_int num_lanes = @BYTES@/sizeof(npy_double);
+    npy_intp num_remaining_elements = array_size;
+    @mask@ load_mask = @isa@_get_full_load_mask_pd();
+#if @replace_0_with_1@
+    @mask@ inv_load_mask = @isa@_invert_mask_pd(load_mask);
+#endif
+    @vtype@ ones_d = _mm@vsize@_set1_pd(1.0f);
+    npy_int indexarr[8];
+    for (npy_int ii = 0; ii < 8; ii++) {
+        indexarr[ii] = ii*stride;
+    }
+    @vindextype@ vindex = @vindexload@((@vindextype@*)&indexarr[0]);
+
+    while (num_remaining_elements > 0) {
+        if (num_remaining_elements < num_lanes) {
+            load_mask = @isa@_get_partial_load_mask_pd(num_remaining_elements,
+                                                       num_lanes);
+#if @replace_0_with_1@
+            inv_load_mask = @isa@_invert_mask_pd(load_mask);
 #endif
+        }
+        @vtype@ x;
+        if (stride == 1) {
+            x = @isa@_masked_load_pd(load_mask, ip);
+#if @replace_0_with_1@
+            /*
+             * Replace masked elements with 1.0f to avoid divide by zero fp
+             * exception in reciprocal
+             */
+            x = @isa@_set_masked_lanes_pd(x, ones_d, @castmask@(inv_load_mask));
+#endif
+        }
+        else {
+            x = @isa@_masked_gather_pd(ones_d, ip, vindex, @castmask@(load_mask));
+        }
+        @vtype@ out = @isa@_@vectorf@_pd(x);
+        @masked_store@(op, load_mask, out);
+
+        ip += num_lanes*stride;
+        op += num_lanes;
+        num_remaining_elements -= num_lanes;
+    }
+}
+#endif
+/**end repeat1**/
 /**end repeat**/
 
 /**begin repeat
@@ -1460,7 +1845,6 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
  * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
  */
 
-
 /*
  * Vectorized approximate sine/cosine algorithms: The following code is a
  * vectorized version of the algorithm presented here:
@@ -1519,7 +1903,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
     @vtype@ quadrant, reduced_x, reduced_x2, cos, sin;
     @vtype@i iquadrant;
     @mask@ nan_mask, glibc_mask, sine_mask, negate_mask;
-    @mask@ load_mask = @isa@_get_full_load_mask();
+    @mask@ load_mask = @isa@_get_full_load_mask_ps();
     npy_intp num_remaining_elements = array_size;
     npy_int indexarr[16];
     for (npy_int ii = 0; ii < 16; ii++) {
@@ -1530,16 +1914,16 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
     while (num_remaining_elements > 0) {
 
         if (num_remaining_elements < num_lanes) {
-            load_mask = @isa@_get_partial_load_mask(num_remaining_elements,
+            load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements,
                                                          num_lanes);
         }
 
         @vtype@ x;
         if (stride == 1) {
-            x = @isa@_masked_load(load_mask, ip);
+            x = @isa@_masked_load_ps(load_mask, ip);
         }
         else {
-            x = @isa@_masked_gather(zero_f, ip, vindex, load_mask);
+            x = @isa@_masked_gather_ps(zero_f, ip, vindex, load_mask);
         }
 
         /*
@@ -1551,7 +1935,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
         glibc_mask = @isa@_in_range_mask(x, large_number,-large_number);
         glibc_mask = @and_masks@(load_mask, glibc_mask);
         nan_mask = _mm@vsize@_cmp_ps@vsub@(x, x, _CMP_NEQ_UQ);
-        x = @isa@_set_masked_lanes(x, zero_f, @or_masks@(nan_mask, glibc_mask));
+        x = @isa@_set_masked_lanes_ps(x, zero_f, @or_masks@(nan_mask, glibc_mask));
         npy_int iglibc_mask = @mask_to_int@(glibc_mask);
 
         if (iglibc_mask != @full_mask@) {
@@ -1584,7 +1968,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
             /* multiply by -1 for appropriate elements */
             negate_mask = @isa@_should_negate(iquadrant, twos, twos);
             cos = @isa@_blend(cos, _mm@vsize@_sub_ps(zero_f, cos), negate_mask);
-            cos = @isa@_set_masked_lanes(cos, _mm@vsize@_set1_ps(NPY_NANF), nan_mask);
+            cos = @isa@_set_masked_lanes_ps(cos, _mm@vsize@_set1_ps(NPY_NANF), nan_mask);
 
             @masked_store@(op, @cvtps_epi32@(load_mask), cos);
         }
@@ -1662,27 +2046,27 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
     @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]);
 
     @mask@ xmax_mask, xmin_mask, nan_mask, inf_mask;
-    @mask@ overflow_mask = @isa@_get_partial_load_mask(0, num_lanes);
-    @mask@ load_mask = @isa@_get_full_load_mask();
+    @mask@ overflow_mask = @isa@_get_partial_load_mask_ps(0, num_lanes);
+    @mask@ load_mask = @isa@_get_full_load_mask_ps();
     npy_intp num_remaining_elements = array_size;
 
     while (num_remaining_elements > 0) {
 
         if (num_remaining_elements < num_lanes) {
-            load_mask = @isa@_get_partial_load_mask(num_remaining_elements,
-                                                    num_lanes);
+            load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements,
+                                                       num_lanes);
         }
 
         @vtype@ x;
         if (stride == 1) {
-            x = @isa@_masked_load(load_mask, ip);
+            x = @isa@_masked_load_ps(load_mask, ip);
         }
         else {
-            x = @isa@_masked_gather(zeros_f, ip, vindex, load_mask);
+            x = @isa@_masked_gather_ps(zeros_f, ip, vindex, load_mask);
         }
 
         nan_mask = _mm@vsize@_cmp_ps@vsub@(x, x, _CMP_NEQ_UQ);
-        x = @isa@_set_masked_lanes(x, zeros_f, nan_mask);
+        x = @isa@_set_masked_lanes_ps(x, zeros_f, nan_mask);
 
         xmax_mask = _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(xmax), _CMP_GE_OQ);
         xmin_mask = _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(xmin), _CMP_LE_OQ);
@@ -1690,7 +2074,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
         overflow_mask = @or_masks@(overflow_mask,
                                     @xor_masks@(xmax_mask, inf_mask));
 
-        x = @isa@_set_masked_lanes(x, zeros_f, @or_masks@(
+        x = @isa@_set_masked_lanes_ps(x, zeros_f, @or_masks@(
                                     @or_masks@(nan_mask, xmin_mask), xmax_mask));
 
         quadrant = _mm@vsize@_mul_ps(x, log2e);
@@ -1723,9 +2107,9 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
          * elem < xmin; return 0.0f
          * elem = +/- nan, return nan
          */
-        poly = @isa@_set_masked_lanes(poly, _mm@vsize@_set1_ps(NPY_NANF), nan_mask);
-        poly = @isa@_set_masked_lanes(poly, inf, xmax_mask);
-        poly = @isa@_set_masked_lanes(poly, zeros_f, xmin_mask);
+        poly = @isa@_set_masked_lanes_ps(poly, _mm@vsize@_set1_ps(NPY_NANF), nan_mask);
+        poly = @isa@_set_masked_lanes_ps(poly, inf, xmax_mask);
+        poly = @isa@_set_masked_lanes_ps(poly, zeros_f, xmin_mask);
 
         @masked_store@(op, @cvtps_epi32@(load_mask), poly);
 
@@ -1790,24 +2174,24 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
     @vtype@ poly, num_poly, denom_poly, exponent;
 
     @mask@ inf_mask, nan_mask, sqrt2_mask, zero_mask, negx_mask;
-    @mask@ invalid_mask = @isa@_get_partial_load_mask(0, num_lanes);
+    @mask@ invalid_mask = @isa@_get_partial_load_mask_ps(0, num_lanes);
     @mask@ divide_by_zero_mask = invalid_mask;
-    @mask@ load_mask = @isa@_get_full_load_mask();
+    @mask@ load_mask = @isa@_get_full_load_mask_ps();
     npy_intp num_remaining_elements = array_size;
 
     while (num_remaining_elements > 0) {
 
         if (num_remaining_elements < num_lanes) {
-            load_mask = @isa@_get_partial_load_mask(num_remaining_elements,
-                                                    num_lanes);
+            load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements,
+                                                       num_lanes);
         }
 
         @vtype@ x_in;
         if (stride == 1) {
-            x_in = @isa@_masked_load(load_mask, ip);
+            x_in = @isa@_masked_load_ps(load_mask, ip);
         }
         else {
-            x_in  = @isa@_masked_gather(zeros_f, ip, vindex, load_mask);
+            x_in  = @isa@_masked_gather_ps(zeros_f, ip, vindex, load_mask);
         }
 
         negx_mask = _mm@vsize@_cmp_ps@vsub@(x_in, zeros_f, _CMP_LT_OQ);
@@ -1818,7 +2202,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
                                         @and_masks@(zero_mask, load_mask));
         invalid_mask = @or_masks@(invalid_mask, negx_mask);
 
-        @vtype@ x = @isa@_set_masked_lanes(x_in, zeros_f, negx_mask);
+        @vtype@ x = @isa@_set_masked_lanes_ps(x_in, zeros_f, negx_mask);
 
         /* set x = normalized mantissa */
         exponent = @isa@_get_exponent(x);
@@ -1852,10 +2236,10 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
          * x = +/- NAN; return NAN
          * x = 0.0f; return -INF
          */
-        poly = @isa@_set_masked_lanes(poly, nan, nan_mask);
-        poly = @isa@_set_masked_lanes(poly, neg_nan, negx_mask);
-        poly = @isa@_set_masked_lanes(poly, neg_inf, zero_mask);
-        poly = @isa@_set_masked_lanes(poly, inf, inf_mask);
+        poly = @isa@_set_masked_lanes_ps(poly, nan, nan_mask);
+        poly = @isa@_set_masked_lanes_ps(poly, neg_nan, negx_mask);
+        poly = @isa@_set_masked_lanes_ps(poly, neg_inf, zero_mask);
+        poly = @isa@_set_masked_lanes_ps(poly, inf, inf_mask);
 
         @masked_store@(op, @cvtps_epi32@(load_mask), poly);
 
diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c
index c36680ed2..1dc581977 100644
--- a/numpy/core/src/umath/ufunc_object.c
+++ b/numpy/core/src/umath/ufunc_object.c
@@ -1193,34 +1193,11 @@ get_ufunc_arguments(PyUFuncObject *ufunc,
                 }
             }
             else {
-                /*
-                 * If the deprecated behavior is ever removed,
-                 * keep only the else branch of this if-else
-                 */
-                if (PyArray_Check(out_kwd) || out_kwd == Py_None) {
-                    if (DEPRECATE("passing a single array to the "
-                                  "'out' keyword argument of a "
-                                  "ufunc with\n"
-                                  "more than one output will "
-                                  "result in an error in the "
-                                  "future") < 0) {
-                        /* The future error message */
-                        PyErr_SetString(PyExc_TypeError,
-                                        "'out' must be a tuple of arrays");
-                        goto fail;
-                    }
-                    if (_set_out_array(out_kwd, out_op+nin) < 0) {
-                        goto fail;
-                    }
-                }
-                else {
-                    PyErr_SetString(PyExc_TypeError,
-                                    nout > 1 ? "'out' must be a tuple "
-                                    "of arrays" :
-                                    "'out' must be an array or a "
-                                    "tuple of a single array");
-                    goto fail;
-                }
+                PyErr_SetString(PyExc_TypeError,
+                        nout > 1 ? "'out' must be a tuple of arrays" :
+                                   "'out' must be an array or a tuple with "
+                                   "a single array");
+                goto fail;
             }
         }
         /*
@@ -4081,8 +4058,8 @@ PyUFunc_Reduceat(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *ind,
     for (i = 0; i < ind_size; ++i) {
         if (reduceat_ind[i] < 0 || reduceat_ind[i] >= red_axis_size) {
             PyErr_Format(PyExc_IndexError,
-                "index %d out-of-bounds in %s.%s [0, %d)",
-                (int)reduceat_ind[i], ufunc_name, opname, (int)red_axis_size);
+                "index %" NPY_INTP_FMT " out-of-bounds in %s.%s [0, %" NPY_INTP_FMT ")",
+                reduceat_ind[i], ufunc_name, opname, red_axis_size);
             return NULL;
         }
     }
diff --git a/numpy/core/src/umath/ufunc_type_resolution.c b/numpy/core/src/umath/ufunc_type_resolution.c
index 9be7b63a0..f93d8229e 100644
--- a/numpy/core/src/umath/ufunc_type_resolution.c
+++ b/numpy/core/src/umath/ufunc_type_resolution.c
@@ -883,7 +883,7 @@ PyUFunc_SubtractionTypeResolver(PyUFuncObject *ufunc,
         /* The type resolver would have upcast already */
         if (out_dtypes[0]->type_num == NPY_BOOL) {
             PyErr_Format(PyExc_TypeError,
-                "numpy boolean subtract, the `-` operator, is deprecated, "
+                "numpy boolean subtract, the `-` operator, is not supported, "
                 "use the bitwise_xor, the `^` operator, or the logical_xor "
                 "function instead.");
             return -1;
diff --git a/numpy/core/tests/test_datetime.py b/numpy/core/tests/test_datetime.py
index f99c0f72b..11f900c5f 100644
--- a/numpy/core/tests/test_datetime.py
+++ b/numpy/core/tests/test_datetime.py
@@ -1333,10 +1333,10 @@ class TestDateTime(object):
         # Interaction with NaT
         a = np.array('1999-03-12T13', dtype='M8[2m]')
         dtnat = np.array('NaT', dtype='M8[h]')
-        assert_equal(np.minimum(a, dtnat), a)
-        assert_equal(np.minimum(dtnat, a), a)
-        assert_equal(np.maximum(a, dtnat), a)
-        assert_equal(np.maximum(dtnat, a), a)
+        assert_equal(np.minimum(a, dtnat), dtnat)
+        assert_equal(np.minimum(dtnat, a), dtnat)
+        assert_equal(np.maximum(a, dtnat), dtnat)
+        assert_equal(np.maximum(dtnat, a), dtnat)
 
         # Also do timedelta
         a = np.array(3, dtype='m8[h]')
@@ -1831,7 +1831,7 @@ class TestDateTime(object):
     def test_timedelta_arange_no_dtype(self):
         d = np.array(5, dtype="m8[D]")
         assert_equal(np.arange(d, d + 1), d)
-        assert_raises(ValueError, np.arange, d)
+        assert_equal(np.arange(d), np.arange(0, d))
 
     def test_datetime_maximum_reduce(self):
         a = np.array(['2010-01-02', '1999-03-14', '1833-03'], dtype='M8[D]')
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index 9b124f603..c699a9bc1 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -3602,10 +3602,10 @@ class TestBinop(object):
         assert_equal(np.modf(dummy, out=(None, a)), (1,))
         assert_equal(np.modf(dummy, out=(dummy, a)), (1,))
         assert_equal(np.modf(a, out=(dummy, a)), 0)
-        with warnings.catch_warnings(record=True) as w:
-            warnings.filterwarnings('always', '', DeprecationWarning)
-            assert_equal(np.modf(dummy, out=a), (0,))
-            assert_(w[0].category is DeprecationWarning)
+        with assert_raises(TypeError):
+            # Out argument must be tuple, since there are multiple outputs
+            np.modf(dummy, out=a)
+
         assert_raises(ValueError, np.modf, dummy, out=(a,))
 
         # 2 inputs, 1 output
@@ -4105,17 +4105,17 @@ class TestArgmax(object):
           np.datetime64('2010-01-03T05:14:12'),
           np.datetime64('NaT'),
           np.datetime64('2015-09-23T10:10:13'),
-          np.datetime64('1932-10-10T03:50:30')], 4),
+          np.datetime64('1932-10-10T03:50:30')], 0),
         ([np.datetime64('2059-03-14T12:43:12'),
           np.datetime64('1996-09-21T14:43:15'),
           np.datetime64('NaT'),
           np.datetime64('2022-12-25T16:02:16'),
           np.datetime64('1963-10-04T03:14:12'),
-          np.datetime64('2013-05-08T18:15:23')], 0),
+          np.datetime64('2013-05-08T18:15:23')], 2),
         ([np.timedelta64(2, 's'),
           np.timedelta64(1, 's'),
           np.timedelta64('NaT', 's'),
-          np.timedelta64(3, 's')], 3),
+          np.timedelta64(3, 's')], 2),
         ([np.timedelta64('NaT', 's')] * 3, 0),
 
         ([timedelta(days=5, seconds=14), timedelta(days=2, seconds=35),
@@ -4240,17 +4240,17 @@ class TestArgmin(object):
           np.datetime64('2010-01-03T05:14:12'),
           np.datetime64('NaT'),
           np.datetime64('2015-09-23T10:10:13'),
-          np.datetime64('1932-10-10T03:50:30')], 5),
+          np.datetime64('1932-10-10T03:50:30')], 0),
         ([np.datetime64('2059-03-14T12:43:12'),
           np.datetime64('1996-09-21T14:43:15'),
           np.datetime64('NaT'),
           np.datetime64('2022-12-25T16:02:16'),
           np.datetime64('1963-10-04T03:14:12'),
-          np.datetime64('2013-05-08T18:15:23')], 4),
+          np.datetime64('2013-05-08T18:15:23')], 2),
         ([np.timedelta64(2, 's'),
           np.timedelta64(1, 's'),
           np.timedelta64('NaT', 's'),
-          np.timedelta64(3, 's')], 1),
+          np.timedelta64(3, 's')], 2),
         ([np.timedelta64('NaT', 's')] * 3, 0),
 
         ([timedelta(days=5, seconds=14), timedelta(days=2, seconds=35),
@@ -4366,18 +4366,14 @@ class TestMinMax(object):
         assert_equal(np.amax([[1, 2, 3]], axis=1), 3)
 
     def test_datetime(self):
-        # NaTs are ignored
+        # Do not ignore NaT
         for dtype in ('m8[s]', 'm8[Y]'):
             a = np.arange(10).astype(dtype)
-            a[3] = 'NaT'
             assert_equal(np.amin(a), a[0])
             assert_equal(np.amax(a), a[9])
-            a[0] = 'NaT'
-            assert_equal(np.amin(a), a[1])
-            assert_equal(np.amax(a), a[9])
-            a.fill('NaT')
-            assert_equal(np.amin(a), a[0])
-            assert_equal(np.amax(a), a[0])
+            a[3] = 'NaT'
+            assert_equal(np.amin(a), a[3])
+            assert_equal(np.amax(a), a[3])
 
 
 class TestNewaxis(object):
@@ -7975,6 +7971,8 @@ class TestFormat(object):
                 dst = object.__format__(a, '30')
                 assert_equal(res, dst)
 
+from numpy.testing import IS_PYPY
+
 class TestCTypes(object):
 
     def test_ctypes_is_available(self):
@@ -8041,7 +8039,29 @@ class TestCTypes(object):
 
         # but when the `ctypes_ptr` object dies, so should `arr`
         del ctypes_ptr
+        if IS_PYPY:
+            # Pypy does not recycle arr objects immediately. Trigger gc to
+            # release arr. Cpython uses refcounts. An explicit call to gc
+            # should not be needed here.
+            break_cycles()
+        assert_(arr_ref() is None, "unknowable whether ctypes pointer holds a reference")
+
+    def test_ctypes_as_parameter_holds_reference(self):
+        arr = np.array([None]).copy()
+
+        arr_ref = weakref.ref(arr)
+
+        ctypes_ptr = arr.ctypes._as_parameter_
+
+        # `ctypes_ptr` should hold onto `arr`
+        del arr
         break_cycles()
+        assert_(arr_ref() is not None, "ctypes pointer did not hold onto a reference")
+
+        # but when the `ctypes_ptr` object dies, so should `arr`
+        del ctypes_ptr
+        if IS_PYPY:
+            break_cycles()
         assert_(arr_ref() is None, "unknowable whether ctypes pointer holds a reference")
 
 
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index ef48fed05..9b4ce9e47 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -75,11 +75,9 @@ class TestOut(object):
             assert_(r1 is o1)
             assert_(r2 is o2)
 
-            with warnings.catch_warnings(record=True) as w:
-                warnings.filterwarnings('always', '', DeprecationWarning)
+            with assert_raises(TypeError):
+                # Out argument must be tuple, since there are multiple outputs.
                 r1, r2 = np.frexp(d, out=o1, subok=subok)
-                assert_(r1 is o1)
-                assert_(w[0].category is DeprecationWarning)
 
             assert_raises(ValueError, np.add, a, 2, o, o, subok=subok)
             assert_raises(ValueError, np.add, a, 2, o, out=o, subok=subok)
@@ -165,14 +163,9 @@ class TestOut(object):
             else:
                 assert_(type(r1) == np.ndarray)
 
-            with warnings.catch_warnings(record=True) as w:
-                warnings.filterwarnings('always', '', DeprecationWarning)
+            with assert_raises(TypeError):
+                # Out argument must be tuple, since there are multiple outputs.
                 r1, r2 = np.frexp(d, out=o1, subok=subok)
-                if subok:
-                    assert_(isinstance(r2, ArrayWrap))
-                else:
-                    assert_(type(r2) == np.ndarray)
-                assert_(w[0].category is DeprecationWarning)
 
 
 class TestComparisons(object):
@@ -694,8 +687,96 @@ class TestSpecialFloats(object):
             assert_raises(FloatingPointError, np.cos, np.float32(-np.inf))
             assert_raises(FloatingPointError, np.cos, np.float32(np.inf))
 
+    def test_sqrt_values(self):
+        with np.errstate(all='ignore'):
+            x = [np.nan,  np.nan, np.inf, np.nan, 0.]
+            y = [np.nan, -np.nan, np.inf, -np.inf, 0.]
+            for dt in ['f', 'd', 'g']:
+                xf = np.array(x, dtype=dt)
+                yf = np.array(y, dtype=dt)
+                assert_equal(np.sqrt(yf), xf)
+
+        #with np.errstate(invalid='raise'):
+        #    for dt in ['f', 'd', 'g']:
+        #        assert_raises(FloatingPointError, np.sqrt, np.array(-100., dtype=dt))
 
-class TestSIMDFloat32(object):
+    def test_abs_values(self):
+        x = [np.nan,  np.nan, np.inf, np.inf, 0., 0., 1.0, 1.0]
+        y = [np.nan, -np.nan, np.inf, -np.inf, 0., -0., -1.0, 1.0]
+        for dt in ['f', 'd', 'g']:
+            xf = np.array(x, dtype=dt)
+            yf = np.array(y, dtype=dt)
+            assert_equal(np.abs(yf), xf)
+
+    def test_square_values(self):
+        x = [np.nan,  np.nan, np.inf, np.inf]
+        y = [np.nan, -np.nan, np.inf, -np.inf]
+        with np.errstate(all='ignore'):
+            for dt in ['f', 'd', 'g']:
+                xf = np.array(x, dtype=dt)
+                yf = np.array(y, dtype=dt)
+                assert_equal(np.square(yf), xf)
+
+        with np.errstate(over='raise'):
+            assert_raises(FloatingPointError, np.square, np.array(1E32,  dtype='f'))
+            assert_raises(FloatingPointError, np.square, np.array(1E200, dtype='d'))
+
+    def test_reciprocal_values(self):
+        with np.errstate(all='ignore'):
+            x = [np.nan,  np.nan, 0.0, -0.0, np.inf, -np.inf]
+            y = [np.nan, -np.nan, np.inf, -np.inf, 0., -0.]
+            for dt in ['f', 'd', 'g']:
+                xf = np.array(x, dtype=dt)
+                yf = np.array(y, dtype=dt)
+                assert_equal(np.reciprocal(yf), xf)
+
+        with np.errstate(divide='raise'):
+            for dt in ['f', 'd', 'g']:
+                assert_raises(FloatingPointError, np.reciprocal, np.array(-0.0, dtype=dt))
+
+# func : [maxulperror, low, high]
+avx_ufuncs = {'sqrt'        :[1,  0.,   100.],
+              'absolute'    :[0, -100., 100.],
+              'reciprocal'  :[1,  1.,   100.],
+              'square'      :[1, -100., 100.],
+              'rint'        :[0, -100., 100.],
+              'floor'       :[0, -100., 100.],
+              'ceil'        :[0, -100., 100.],
+              'trunc'       :[0, -100., 100.]}
+
+class TestAVXUfuncs(object):
+    def test_avx_based_ufunc(self):
+        strides = np.array([-4,-3,-2,-1,1,2,3,4])
+        np.random.seed(42)
+        for func, prop in avx_ufuncs.items():
+            maxulperr = prop[0]
+            minval = prop[1]
+            maxval = prop[2]
+            # various array sizes to ensure masking in AVX is tested
+            for size in range(1,32):
+                myfunc = getattr(np, func)
+                x_f32 = np.float32(np.random.uniform(low=minval, high=maxval,
+                    size=size))
+                x_f64 = np.float64(x_f32)
+                x_f128 = np.longdouble(x_f32)
+                y_true128 = myfunc(x_f128)
+                if maxulperr == 0:
+                    assert_equal(myfunc(x_f32), np.float32(y_true128))
+                    assert_equal(myfunc(x_f64), np.float64(y_true128))
+                else:
+                    assert_array_max_ulp(myfunc(x_f32), np.float32(y_true128),
+                            maxulp=maxulperr)
+                    assert_array_max_ulp(myfunc(x_f64), np.float64(y_true128),
+                            maxulp=maxulperr)
+                # various strides to test gather instruction
+                if size > 1:
+                    y_true32 = myfunc(x_f32)
+                    y_true64 = myfunc(x_f64)
+                    for jj in strides:
+                        assert_equal(myfunc(x_f64[::jj]), y_true64[::jj])
+                        assert_equal(myfunc(x_f32[::jj]), y_true32[::jj])
+
+class TestAVXFloat32Transcendental(object):
     def test_exp_float32(self):
         np.random.seed(42)
         x_f32 = np.float32(np.random.uniform(low=0.0,high=88.1,size=1000000))
@@ -722,8 +803,8 @@ class TestSIMDFloat32(object):
 
     def test_strided_float32(self):
         np.random.seed(42)
-        strides = np.random.randint(low=-100, high=100, size=100)
-        sizes = np.random.randint(low=1, high=2000, size=100)
+        strides = np.array([-4,-3,-2,-1,1,2,3,4])
+        sizes = np.arange(2,100)
         for ii in sizes:
             x_f32 = np.float32(np.random.uniform(low=0.01,high=88.1,size=ii))
             exp_true = np.exp(x_f32)
@@ -2161,10 +2242,9 @@ class TestSpecialMethods(object):
         assert_(np.modf(a, None) == {})
         assert_(np.modf(a, None, None) == {})
         assert_(np.modf(a, out=(None, None)) == {})
-        with warnings.catch_warnings(record=True) as w:
-            warnings.filterwarnings('always', '', DeprecationWarning)
-            assert_(np.modf(a, out=None) == {})
-            assert_(w[0].category is DeprecationWarning)
+        with assert_raises(TypeError):
+            # Out argument must be tuple, since there are multiple outputs.
+            np.modf(a, out=None)
 
         # don't give positional and output argument, or too many arguments.
         # wrong number of arguments in the tuple is an error too.