summaryrefslogtreecommitdiff
path: root/numpy/core
diff options
context:
space:
mode:
Diffstat (limited to 'numpy/core')
-rw-r--r--numpy/core/_add_newdocs.py6
-rw-r--r--numpy/core/_methods.py75
-rw-r--r--numpy/core/code_generators/generate_umath.py19
-rw-r--r--numpy/core/code_generators/ufunc_docstrings.py50
-rw-r--r--numpy/core/defchararray.py2
-rw-r--r--numpy/core/fromnumeric.py30
-rw-r--r--numpy/core/multiarray.py20
-rw-r--r--numpy/core/numeric.py3
-rw-r--r--numpy/core/overrides.py26
-rw-r--r--numpy/core/setup.py3
-rw-r--r--numpy/core/shape_base.py2
-rw-r--r--numpy/core/src/common/lowlevel_strided_loops.h3
-rw-r--r--numpy/core/src/common/npy_longdouble.c82
-rw-r--r--numpy/core/src/common/npy_longdouble.h10
-rw-r--r--numpy/core/src/common/npy_sort.h.src11
-rw-r--r--numpy/core/src/multiarray/arrayfunction_override.c2
-rw-r--r--numpy/core/src/multiarray/arraytypes.c.src197
-rw-r--r--numpy/core/src/multiarray/calculation.c21
-rw-r--r--numpy/core/src/multiarray/compiled_base.c321
-rw-r--r--numpy/core/src/multiarray/conversion_utils.c13
-rw-r--r--numpy/core/src/multiarray/item_selection.c12
-rw-r--r--numpy/core/src/multiarray/iterators.c33
-rw-r--r--numpy/core/src/multiarray/methods.c16
-rw-r--r--numpy/core/src/multiarray/multiarraymodule.c7
-rw-r--r--numpy/core/src/multiarray/multiarraymodule.h2
-rw-r--r--numpy/core/src/multiarray/number.c2
-rw-r--r--numpy/core/src/multiarray/number.h1
-rw-r--r--numpy/core/src/npysort/radixsort.c.src229
-rw-r--r--numpy/core/src/umath/clip.c.src119
-rw-r--r--numpy/core/src/umath/clip.h.src18
-rw-r--r--numpy/core/src/umath/fast_loop_macros.h8
-rw-r--r--numpy/core/src/umath/funcs.inc.src11
-rw-r--r--numpy/core/src/umath/loops.c.src48
-rw-r--r--numpy/core/src/umath/loops.h.src3
-rw-r--r--numpy/core/src/umath/matmul.c.src79
-rw-r--r--numpy/core/src/umath/matmul.h.src2
-rw-r--r--numpy/core/src/umath/simd.inc.src22
-rw-r--r--numpy/core/tests/test_longdouble.py26
-rw-r--r--numpy/core/tests/test_multiarray.py118
-rw-r--r--numpy/core/tests/test_numeric.py168
-rw-r--r--numpy/core/tests/test_overrides.py62
-rw-r--r--numpy/core/tests/test_ufunc.py53
42 files changed, 1442 insertions, 493 deletions
diff --git a/numpy/core/_add_newdocs.py b/numpy/core/_add_newdocs.py
index 52ab9c994..e5e7f6667 100644
--- a/numpy/core/_add_newdocs.py
+++ b/numpy/core/_add_newdocs.py
@@ -2614,7 +2614,7 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('argmin',
add_newdoc('numpy.core.multiarray', 'ndarray', ('argsort',
"""
- a.argsort(axis=-1, kind='quicksort', order=None)
+ a.argsort(axis=-1, kind=None, order=None)
Returns the indices that would sort this array.
@@ -2771,7 +2771,7 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('choose',
add_newdoc('numpy.core.multiarray', 'ndarray', ('clip',
"""
- a.clip(min=None, max=None, out=None)
+ a.clip(min=None, max=None, out=None, **kwargs)
Return an array whose values are limited to ``[min, max]``.
One of max or min must be given.
@@ -3800,7 +3800,7 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('setflags',
add_newdoc('numpy.core.multiarray', 'ndarray', ('sort',
"""
- a.sort(axis=-1, kind='quicksort', order=None)
+ a.sort(axis=-1, kind=None, order=None)
Sort an array in-place. Refer to `numpy.sort` for full documentation.
diff --git a/numpy/core/_methods.py b/numpy/core/_methods.py
index 3ab64f7a1..ba6f7d111 100644
--- a/numpy/core/_methods.py
+++ b/numpy/core/_methods.py
@@ -11,6 +11,7 @@ from numpy.core import multiarray as mu
from numpy.core import umath as um
from numpy.core._asarray import asanyarray
from numpy.core import numerictypes as nt
+from numpy.core import _exceptions
from numpy._globals import _NoValue
# save those O(100) nanoseconds!
@@ -55,6 +56,80 @@ def _count_reduce_items(arr, axis):
items *= arr.shape[ax]
return items
+# Numpy 1.17.0, 2019-02-24
+# Various clip behavior deprecations, marked with _clip_dep as a prefix.
+
+def _clip_dep_is_scalar_nan(a):
+ # guarded to protect circular imports
+ from numpy.core.fromnumeric import ndim
+ if ndim(a) != 0:
+ return False
+ try:
+ return um.isnan(a)
+ except TypeError:
+ return False
+
+def _clip_dep_is_byte_swapped(a):
+ if isinstance(a, mu.ndarray):
+ return not a.dtype.isnative
+ return False
+
+def _clip_dep_invoke_with_casting(ufunc, *args, out=None, casting=None, **kwargs):
+ # normal path
+ if casting is not None:
+ return ufunc(*args, out=out, casting=casting, **kwargs)
+
+ # try to deal with broken casting rules
+ try:
+ return ufunc(*args, out=out, **kwargs)
+ except _exceptions._UFuncOutputCastingError as e:
+ # Numpy 1.17.0, 2019-02-24
+ warnings.warn(
+ "Converting the output of clip from {!r} to {!r} is deprecated. "
+ "Pass `casting=\"unsafe\"` explicitly to silence this warning, or "
+ "correct the type of the variables.".format(e.from_, e.to),
+ DeprecationWarning,
+ stacklevel=2
+ )
+ return ufunc(*args, out=out, casting="unsafe", **kwargs)
+
+def _clip(a, min=None, max=None, out=None, *, casting=None, **kwargs):
+ if min is None and max is None:
+ raise ValueError("One of max or min must be given")
+
+ # Numpy 1.17.0, 2019-02-24
+ # This deprecation probably incurs a substantial slowdown for small arrays,
+ # it will be good to get rid of it.
+ if not _clip_dep_is_byte_swapped(a) and not _clip_dep_is_byte_swapped(out):
+ using_deprecated_nan = False
+ if _clip_dep_is_scalar_nan(min):
+ min = -float('inf')
+ using_deprecated_nan = True
+ if _clip_dep_is_scalar_nan(max):
+ max = float('inf')
+ using_deprecated_nan = True
+ if using_deprecated_nan:
+ warnings.warn(
+ "Passing `np.nan` to mean no clipping in np.clip has always "
+ "been unreliable, and is now deprecated. "
+ "In future, this will always return nan, like it already does "
+ "when min or max are arrays that contain nan. "
+ "To skip a bound, pass either None or an np.inf of an "
+ "appropriate sign.",
+ DeprecationWarning,
+ stacklevel=2
+ )
+
+ if min is None:
+ return _clip_dep_invoke_with_casting(
+ um.minimum, a, max, out=out, casting=casting, **kwargs)
+ elif max is None:
+ return _clip_dep_invoke_with_casting(
+ um.maximum, a, min, out=out, casting=casting, **kwargs)
+ else:
+ return _clip_dep_invoke_with_casting(
+ um.clip, a, min, max, out=out, casting=casting, **kwargs)
+
def _mean(a, axis=None, dtype=None, out=None, keepdims=False):
arr = asanyarray(a)
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index e17523451..bf1747272 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -511,6 +511,13 @@ defdict = {
TD(noobj),
TD(O, f='npy_ObjectMin')
),
+'clip':
+ Ufunc(3, 1, ReorderableNone,
+ docstrings.get('numpy.core.umath.clip'),
+ 'PyUFunc_SimpleUniformOperationTypeResolver',
+ TD(noobj),
+ [TypeDescription('O', 'npy_ObjectClip', 'OOO', 'O')]
+ ),
'fmax':
Ufunc(2, 1, ReorderableNone,
docstrings.get('numpy.core.umath.fmax'),
@@ -697,6 +704,7 @@ defdict = {
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.exp'),
None,
+ TD('e', f='exp', astype={'e':'f'}),
TD('f', simd=[('avx2', 'f'), ('avx512f', 'f')]),
TD(inexact, f='exp', astype={'e':'f'}),
TD(P, f='exp'),
@@ -719,6 +727,7 @@ defdict = {
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.log'),
None,
+ TD('e', f='log', astype={'e':'f'}),
TD('f', simd=[('avx2', 'f'), ('avx512f', 'f')]),
TD(inexact, f='log', astype={'e':'f'}),
TD(P, f='log'),
@@ -922,6 +931,7 @@ defdict = {
docstrings.get('numpy.core.umath.matmul'),
"PyUFunc_SimpleUniformOperationTypeResolver",
TD(notimes_or_obj),
+ TD(O),
signature='(n?,k),(k,m?)->(n?,m?)',
),
}
@@ -961,6 +971,9 @@ arity_lookup = {
'O': 'OO_O',
'P': 'OO_O_method',
},
+ (3, 1): {
+ 'O': 'OOO_O',
+ }
}
#for each name
@@ -1137,6 +1150,7 @@ def make_code(funcdict, filename):
#include "ufunc_type_resolution.h"
#include "loops.h"
#include "matmul.h"
+ #include "clip.h"
%s
static int
@@ -1154,7 +1168,6 @@ def make_code(funcdict, filename):
if __name__ == "__main__":
filename = __file__
- fid = open('__umath_generated.c', 'w')
code = make_code(defdict, filename)
- fid.write(code)
- fid.close()
+ with open('__umath_generated.c', 'w') as fid:
+ fid.write(code)
diff --git a/numpy/core/code_generators/ufunc_docstrings.py b/numpy/core/code_generators/ufunc_docstrings.py
index 8b1a5a3db..6a5def4f2 100644
--- a/numpy/core/code_generators/ufunc_docstrings.py
+++ b/numpy/core/code_generators/ufunc_docstrings.py
@@ -42,14 +42,20 @@ subst = {
def add_newdoc(place, name, doc):
doc = textwrap.dedent(doc).strip()
- if name[0] != '_' and name != 'matmul':
- # matmul is special, it does not use the OUT_SCALAR replacement strings
+ skip = (
+ # gufuncs do not use the OUT_SCALAR replacement strings
+ 'matmul',
+ # clip has 3 inputs, which is not handled by this
+ 'clip',
+ )
+ if name[0] != '_' and name not in skip:
if '\nx :' in doc:
assert '$OUT_SCALAR_1' in doc, "in {}".format(name)
elif '\nx2 :' in doc or '\nx1, x2 :' in doc:
assert '$OUT_SCALAR_2' in doc, "in {}".format(name)
else:
assert False, "Could not detect number of inputs in {}".format(name)
+
for k, v in subst.items():
doc = doc.replace('$' + k, v)
@@ -2535,6 +2541,46 @@ add_newdoc('numpy.core.umath', 'fmin',
""")
+add_newdoc('numpy.core.umath', 'clip',
+ """
+ Clip (limit) the values in an array.
+
+ Given an interval, values outside the interval are clipped to
+ the interval edges. For example, if an interval of ``[0, 1]``
+ is specified, values smaller than 0 become 0, and values larger
+ than 1 become 1.
+
+ Equivalent to but faster than ``np.minimum(np.maximum(a, a_min), a_max)``.
+
+ Parameters
+ ----------
+ a : array_like
+ Array containing elements to clip.
+ a_min : array_like
+ Minimum value.
+ a_max : array_like
+ Maximum value.
+ out : ndarray, optional
+ The results will be placed in this array. It may be the input
+ array for in-place clipping. `out` must be of the right shape
+ to hold the output. Its type is preserved.
+ $PARAMS
+
+ See Also
+ --------
+ numpy.clip :
+ Wrapper that makes the ``a_min`` and ``a_max`` arguments optional,
+ dispatching to one of `~numpy.core.umath.clip`,
+ `~numpy.core.umath.minimum`, and `~numpy.core.umath.maximum`.
+
+ Returns
+ -------
+ clipped_array : ndarray
+ An array with the elements of `a`, but where values
+ < `a_min` are replaced with `a_min`, and those > `a_max`
+ with `a_max`.
+ """)
+
add_newdoc('numpy.core.umath', 'matmul',
"""
Matrix product of two arrays.
diff --git a/numpy/core/defchararray.py b/numpy/core/defchararray.py
index 3fd7d14c4..d7ecce1b4 100644
--- a/numpy/core/defchararray.py
+++ b/numpy/core/defchararray.py
@@ -2124,7 +2124,7 @@ class chararray(ndarray):
def __rmod__(self, other):
return NotImplemented
- def argsort(self, axis=-1, kind='quicksort', order=None):
+ def argsort(self, axis=-1, kind=None, order=None):
"""
Return the indices that sort the array lexicographically.
diff --git a/numpy/core/fromnumeric.py b/numpy/core/fromnumeric.py
index b4d721940..58da8a54b 100644
--- a/numpy/core/fromnumeric.py
+++ b/numpy/core/fromnumeric.py
@@ -824,7 +824,7 @@ def _sort_dispatcher(a, axis=None, kind=None, order=None):
@array_function_dispatch(_sort_dispatcher)
-def sort(a, axis=-1, kind='quicksort', order=None):
+def sort(a, axis=-1, kind=None, order=None):
"""
Return a sorted copy of an array.
@@ -837,8 +837,8 @@ def sort(a, axis=-1, kind='quicksort', order=None):
sorting. The default is -1, which sorts along the last axis.
kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, optional
Sorting algorithm. The default is 'quicksort'. Note that both 'stable'
- and 'mergesort' use timsort under the covers and, in general, the
- actual implementation will vary with data type. The 'mergesort' option
+ and 'mergesort' use timsort or radix sort under the covers and, in general,
+ the actual implementation will vary with data type. The 'mergesort' option
is retained for backwards compatibility.
.. versionchanged:: 1.15.0.
@@ -914,7 +914,8 @@ def sort(a, axis=-1, kind='quicksort', order=None):
'stable' automatically choses the best stable sorting algorithm
for the data type being sorted. It, along with 'mergesort' is
- currently mapped to timsort. API forward compatibility currently limits the
+ currently mapped to timsort or radix sort depending on the
+ data type. API forward compatibility currently limits the
ability to select the implementation and it is hardwired for the different
data types.
@@ -925,7 +926,8 @@ def sort(a, axis=-1, kind='quicksort', order=None):
mergesort. It is now used for stable sort while quicksort is still the
default sort if none is chosen. For details of timsort, refer to
`CPython listsort.txt <https://github.com/python/cpython/blob/3.7/Objects/listsort.txt>`_.
-
+ 'mergesort' and 'stable' are mapped to radix sort for integer data types. Radix sort is an
+ O(n) sort instead of O(n log n).
Examples
--------
@@ -974,7 +976,7 @@ def _argsort_dispatcher(a, axis=None, kind=None, order=None):
@array_function_dispatch(_argsort_dispatcher)
-def argsort(a, axis=-1, kind='quicksort', order=None):
+def argsort(a, axis=-1, kind=None, order=None):
"""
Returns the indices that would sort an array.
@@ -997,8 +999,6 @@ def argsort(a, axis=-1, kind='quicksort', order=None):
.. versionchanged:: 1.15.0.
The 'stable' option was added.
-
-
order : str or list of str, optional
When `a` is an array with fields defined, this argument specifies
which fields to compare first, second, etc. A single field can
@@ -1961,12 +1961,12 @@ def compress(condition, a, axis=None, out=None):
return _wrapfunc(a, 'compress', condition, axis=axis, out=out)
-def _clip_dispatcher(a, a_min, a_max, out=None):
+def _clip_dispatcher(a, a_min, a_max, out=None, **kwargs):
return (a, a_min, a_max)
@array_function_dispatch(_clip_dispatcher)
-def clip(a, a_min, a_max, out=None):
+def clip(a, a_min, a_max, out=None, **kwargs):
"""
Clip (limit) the values in an array.
@@ -1975,6 +1975,9 @@ def clip(a, a_min, a_max, out=None):
is specified, values smaller than 0 become 0, and values larger
than 1 become 1.
+ Equivalent to but faster than ``np.maximum(a_min, np.minimum(a, a_max))``.
+ No check is performed to ensure ``a_min < a_max``.
+
Parameters
----------
a : array_like
@@ -1992,6 +1995,11 @@ def clip(a, a_min, a_max, out=None):
The results will be placed in this array. It may be the input
array for in-place clipping. `out` must be of the right shape
to hold the output. Its type is preserved.
+ **kwargs
+ For other keyword-only arguments, see the
+ :ref:`ufunc docs <ufuncs.kwargs>`.
+
+ .. versionadded:: 1.17.0
Returns
-------
@@ -2020,7 +2028,7 @@ def clip(a, a_min, a_max, out=None):
array([3, 4, 2, 3, 4, 5, 6, 7, 8, 8])
"""
- return _wrapfunc(a, 'clip', a_min, a_max, out=out)
+ return _wrapfunc(a, 'clip', a_min, a_max, out=out, **kwargs)
def _sum_dispatcher(a, axis=None, dtype=None, out=None, keepdims=None,
diff --git a/numpy/core/multiarray.py b/numpy/core/multiarray.py
index a839ae402..78fec1aab 100644
--- a/numpy/core/multiarray.py
+++ b/numpy/core/multiarray.py
@@ -8,6 +8,7 @@ by importing from the extension module.
import functools
import warnings
+import sys
from . import overrides
from . import _multiarray_umath
@@ -1113,7 +1114,7 @@ def putmask(a, mask, values):
@array_function_from_c_func_and_dispatcher(_multiarray_umath.packbits)
-def packbits(a, axis=None):
+def packbits(a, axis=None, bitorder='big'):
"""
packbits(a, axis=None)
@@ -1129,6 +1130,13 @@ def packbits(a, axis=None):
axis : int, optional
The dimension over which bit-packing is done.
``None`` implies packing the flattened array.
+ bitorder : {'big', 'little'}, optional
+ The order of the input bits. 'big' will mimic bin(val),
+ ``[0, 0, 0, 0, 0, 0, 1, 1] => 3 = 0b00000011 => ``, 'little' will
+ reverse the order so ``[1, 1, 0, 0, 0, 0, 0, 0] => 3``.
+ Defaults to 'big'.
+
+ .. versionadded:: 1.17.0
Returns
-------
@@ -1164,7 +1172,7 @@ def packbits(a, axis=None):
@array_function_from_c_func_and_dispatcher(_multiarray_umath.unpackbits)
-def unpackbits(a, axis=None, count=None):
+def unpackbits(a, axis=None, count=None, bitorder='big'):
"""
unpackbits(a, axis=None, count=None)
@@ -1194,6 +1202,14 @@ def unpackbits(a, axis=None, count=None):
.. versionadded:: 1.17.0
+ bitorder : {'big', 'little'}, optional
+ The order of the returned bits. 'big' will mimic bin(val),
+ ``3 = 0b00000011 => [0, 0, 0, 0, 0, 0, 1, 1]``, 'little' will reverse
+ the order to ``[1, 1, 0, 0, 0, 0, 0, 0]``.
+ Defaults to 'big'.
+
+ .. versionadded:: 1.17.0
+
Returns
-------
unpacked : ndarray, uint8 type
diff --git a/numpy/core/numeric.py b/numpy/core/numeric.py
index 4b59d730d..e72ab3012 100644
--- a/numpy/core/numeric.py
+++ b/numpy/core/numeric.py
@@ -2002,7 +2002,8 @@ def load(file):
"np.core.numeric.load is deprecated, use pickle.load instead",
DeprecationWarning, stacklevel=2)
if isinstance(file, type("")):
- file = open(file, "rb")
+ with open(file, "rb") as file_pointer:
+ return pickle.load(file_pointer)
return pickle.load(file)
diff --git a/numpy/core/overrides.py b/numpy/core/overrides.py
index 9f91adc83..ad4d1c721 100644
--- a/numpy/core/overrides.py
+++ b/numpy/core/overrides.py
@@ -1,6 +1,7 @@
"""Implementation of __array_function__ overrides from NEP-18."""
import collections
import functools
+import textwrap
from numpy.core._multiarray_umath import (
add_docstring, implement_array_function, _get_implementing_args)
@@ -143,15 +144,36 @@ def array_function_dispatch(dispatcher, module=None, verify=True,
if docs_from_dispatcher:
add_docstring(implementation, dispatcher.__doc__)
+ # Equivalently, we could define this function directly instead of using
+ # exec. This version has the advantage of giving the helper function a
+ # more interpettable name. Otherwise, the original function does not
+ # show up at all in many cases, e.g., if it's written in C or if the
+ # dispatcher gets an invalid keyword argument.
+ source = textwrap.dedent("""
@functools.wraps(implementation)
- def public_api(*args, **kwargs):
+ def {name}(*args, **kwargs):
relevant_args = dispatcher(*args, **kwargs)
return implement_array_function(
- implementation, public_api, relevant_args, args, kwargs)
+ implementation, {name}, relevant_args, args, kwargs)
+ """).format(name=implementation.__name__)
+
+ source_object = compile(
+ source, filename='<__array_function__ internals>', mode='exec')
+ scope = {
+ 'implementation': implementation,
+ 'dispatcher': dispatcher,
+ 'functools': functools,
+ 'implement_array_function': implement_array_function,
+ }
+ exec(source_object, scope)
+
+ public_api = scope[implementation.__name__]
if module is not None:
public_api.__module__ = module
+ public_api.__skip_array_function__ = implementation
+
return public_api
return decorator
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 45b4fb3c7..62147d22b 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -710,6 +710,7 @@ def configuration(parent_package='',top_path=None):
join('src', 'npysort', 'mergesort.c.src'),
join('src', 'npysort', 'timsort.c.src'),
join('src', 'npysort', 'heapsort.c.src'),
+ join('src', 'npysort', 'radixsort.c.src'),
join('src', 'common', 'npy_partition.h.src'),
join('src', 'npysort', 'selection.c.src'),
join('src', 'common', 'npy_binsearch.h.src'),
@@ -905,6 +906,8 @@ def configuration(parent_package='',top_path=None):
join('src', 'umath', 'loops.c.src'),
join('src', 'umath', 'matmul.h.src'),
join('src', 'umath', 'matmul.c.src'),
+ join('src', 'umath', 'clip.h.src'),
+ join('src', 'umath', 'clip.c.src'),
join('src', 'umath', 'ufunc_object.c'),
join('src', 'umath', 'extobj.c'),
join('src', 'umath', 'cpuid.c'),
diff --git a/numpy/core/shape_base.py b/numpy/core/shape_base.py
index 0eac772e8..3a037e7d2 100644
--- a/numpy/core/shape_base.py
+++ b/numpy/core/shape_base.py
@@ -832,7 +832,7 @@ def block(arrays):
# Theses helper functions are mostly used for testing.
# They allow us to write tests that directly call `_block_slicing`
-# or `_block_concatenate` wtihout blocking large arrays to forse the wisdom
+# or `_block_concatenate` without blocking large arrays to forse the wisdom
# to trigger the desired path.
def _block_setup(arrays):
"""
diff --git a/numpy/core/src/common/lowlevel_strided_loops.h b/numpy/core/src/common/lowlevel_strided_loops.h
index 5f139cffb..bacd27473 100644
--- a/numpy/core/src/common/lowlevel_strided_loops.h
+++ b/numpy/core/src/common/lowlevel_strided_loops.h
@@ -4,6 +4,9 @@
#include <npy_config.h>
#include "mem_overlap.h"
+/* For PyArray_ macros used below */
+#include "numpy/ndarrayobject.h"
+
/*
* NOTE: This API should remain private for the time being, to allow
* for further refinement. I think the 'aligned' mechanism
diff --git a/numpy/core/src/common/npy_longdouble.c b/numpy/core/src/common/npy_longdouble.c
index 561f4b825..c580e0cce 100644
--- a/numpy/core/src/common/npy_longdouble.c
+++ b/numpy/core/src/common/npy_longdouble.c
@@ -6,6 +6,7 @@
#include "numpy/ndarraytypes.h"
#include "numpy/npy_math.h"
#include "npy_pycompat.h"
+#include "numpyos.h"
/*
* Heavily derived from PyLong_FromDouble
@@ -94,3 +95,84 @@ done:
Py_DECREF(l_chunk_size);
return v;
}
+
+/* Helper function to get unicode(PyLong).encode('utf8') */
+static PyObject *
+_PyLong_Bytes(PyObject *long_obj) {
+ PyObject *bytes;
+#if defined(NPY_PY3K)
+ PyObject *unicode = PyObject_Str(long_obj);
+ if (unicode == NULL) {
+ return NULL;
+ }
+ bytes = PyUnicode_AsUTF8String(unicode);
+ Py_DECREF(unicode);
+#else
+ bytes = PyObject_Str(long_obj);
+#endif
+ return bytes;
+}
+
+
+/**
+ * TODO: currently a hack that converts the long through a string. This is
+ * correct, but slow.
+ *
+ * Another approach would be to do this numerically, in a similar way to
+ * PyLong_AsDouble.
+ * However, in order to respect rounding modes correctly, this needs to know
+ * the size of the mantissa, which is platform-dependent.
+ */
+NPY_VISIBILITY_HIDDEN npy_longdouble
+npy_longdouble_from_PyLong(PyObject *long_obj) {
+ npy_longdouble result = 1234;
+ char *end;
+ char *cstr;
+ PyObject *bytes;
+
+ /* convert the long to a string */
+ bytes = _PyLong_Bytes(long_obj);
+ if (bytes == NULL) {
+ return -1;
+ }
+
+ cstr = PyBytes_AsString(bytes);
+ if (cstr == NULL) {
+ goto fail;
+ }
+ end = NULL;
+
+ /* convert the string to a long double and capture errors */
+ errno = 0;
+ result = NumPyOS_ascii_strtold(cstr, &end);
+ if (errno == ERANGE) {
+ /* strtold returns INFINITY of the correct sign. */
+ if (PyErr_Warn(PyExc_RuntimeWarning,
+ "overflow encountered in conversion from python long") < 0) {
+ goto fail;
+ }
+ }
+ else if (errno) {
+ PyErr_Format(PyExc_RuntimeError,
+ "Could not parse python long as longdouble: %s (%s)",
+ cstr,
+ strerror(errno));
+ goto fail;
+ }
+
+ /* Extra characters at the end of the string, or nothing parsed */
+ if (end == cstr || *end != '\0') {
+ PyErr_Format(PyExc_RuntimeError,
+ "Could not parse long as longdouble: %s",
+ cstr);
+ goto fail;
+ }
+
+ /* finally safe to decref now that we're done with `end` */
+ Py_DECREF(bytes);
+ return result;
+
+fail:
+ Py_DECREF(bytes);
+ return -1;
+}
diff --git a/numpy/core/src/common/npy_longdouble.h b/numpy/core/src/common/npy_longdouble.h
index 036b53070..01db06de7 100644
--- a/numpy/core/src/common/npy_longdouble.h
+++ b/numpy/core/src/common/npy_longdouble.h
@@ -14,4 +14,14 @@
NPY_VISIBILITY_HIDDEN PyObject *
npy_longdouble_to_PyLong(npy_longdouble ldval);
+/* Convert a python `long` integer to a npy_longdouble
+ *
+ * This performs the same task as PyLong_AsDouble, but for long doubles
+ * which have a greater range.
+ *
+ * Returns -1 if an error occurs.
+ */
+NPY_VISIBILITY_HIDDEN npy_longdouble
+npy_longdouble_from_PyLong(PyObject *long_obj);
+
#endif
diff --git a/numpy/core/src/common/npy_sort.h.src b/numpy/core/src/common/npy_sort.h.src
index 521f0fee5..16a105499 100644
--- a/numpy/core/src/common/npy_sort.h.src
+++ b/numpy/core/src/common/npy_sort.h.src
@@ -44,6 +44,17 @@ int atimsort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *null);
/**end repeat**/
+/**begin repeat
+ *
+ * #suff = bool, byte, ubyte, short, ushort, int, uint, long, ulong,
+ * longlong, ulonglong#
+ */
+
+int radixsort_@suff@(void *vec, npy_intp cnt, void *null);
+int aradixsort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+
+/**end repeat**/
+
/*
diff --git a/numpy/core/src/multiarray/arrayfunction_override.c b/numpy/core/src/multiarray/arrayfunction_override.c
index e62b32ab2..02078306c 100644
--- a/numpy/core/src/multiarray/arrayfunction_override.c
+++ b/numpy/core/src/multiarray/arrayfunction_override.c
@@ -173,7 +173,7 @@ array_function_method_impl(PyObject *func, PyObject *types, PyObject *args,
}
}
- implementation = PyObject_GetAttr(func, npy_ma_str_wrapped);
+ implementation = PyObject_GetAttr(func, npy_ma_str_skip_array_function);
if (implementation == NULL) {
return NULL;
}
diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src
index 49819ca4a..5f7bcb8f7 100644
--- a/numpy/core/src/multiarray/arraytypes.c.src
+++ b/numpy/core/src/multiarray/arraytypes.c.src
@@ -30,6 +30,7 @@
#include <emmintrin.h>
#endif
+#include "npy_longdouble.h"
#include "numpyos.h"
#include <string.h>
@@ -328,6 +329,17 @@ string_to_long_double(PyObject*op)
npy_longdouble temp;
PyObject* b;
+ /* Convert python long objects to a longdouble, without precision or range
+ * loss via a double.
+ */
+ if ((PyLong_Check(op) && !PyBool_Check(op))
+#if !defined(NPY_PY3K)
+ || (PyInt_Check(op) && !PyBool_Check(op))
+#endif
+ ) {
+ return npy_longdouble_from_PyLong(op);
+ }
+
if (PyUnicode_Check(op)) {
b = PyUnicode_AsUTF8String(op);
if (!b) {
@@ -3798,176 +3810,6 @@ static void
/*
*****************************************************************************
- ** FASTCLIP **
- *****************************************************************************
- */
-
-#define _LESS_THAN(a, b) ((a) < (b))
-#define _GREATER_THAN(a, b) ((a) > (b))
-
-/*
- * In fastclip, 'b' was already checked for NaN, so the half comparison
- * only needs to check 'a' for NaN.
- */
-
-#define _HALF_LESS_THAN(a, b) (!npy_half_isnan(a) && npy_half_lt_nonan(a, b))
-#define _HALF_GREATER_THAN(a, b) (!npy_half_isnan(a) && npy_half_lt_nonan(b, a))
-
-/**begin repeat
- *
- * #name = BOOL,
- * BYTE, UBYTE, SHORT, USHORT, INT, UINT,
- * LONG, ULONG, LONGLONG, ULONGLONG,
- * HALF, FLOAT, DOUBLE, LONGDOUBLE,
- * DATETIME, TIMEDELTA#
- * #type = npy_bool,
- * npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
- * npy_long, npy_ulong, npy_longlong, npy_ulonglong,
- * npy_half, npy_float, npy_double, npy_longdouble,
- * npy_datetime, npy_timedelta#
- * #isfloat = 0*11, 1*4, 0*2#
- * #isnan = nop*11, npy_half_isnan, npy_isnan*3, nop*2#
- * #lt = _LESS_THAN*11, _HALF_LESS_THAN, _LESS_THAN*5#
- * #gt = _GREATER_THAN*11, _HALF_GREATER_THAN, _GREATER_THAN*5#
- */
-static void
-@name@_fastclip(@type@ *in, npy_intp ni, @type@ *min, @type@ *max, @type@ *out)
-{
- npy_intp i;
- @type@ max_val = 0, min_val = 0;
-
- if (max != NULL) {
- max_val = *max;
-#if @isfloat@
- /* NaNs result in no clipping, so optimize the case away */
- if (@isnan@(max_val)) {
- if (min == NULL) {
- memmove(out, in, ni * sizeof(@type@));
- return;
- }
- max = NULL;
- }
-#endif
- }
- if (min != NULL) {
- min_val = *min;
-#if @isfloat@
- if (@isnan@(min_val)) {
- if (max == NULL) {
- memmove(out, in, ni * sizeof(@type@));
- return;
- }
- min = NULL;
- }
-#endif
- }
- if (max == NULL) {
- for (i = 0; i < ni; i++) {
- if (@lt@(in[i], min_val)) {
- out[i] = min_val;
- }
- else {
- out[i] = in[i];
- }
- }
- }
- else if (min == NULL) {
- for (i = 0; i < ni; i++) {
- if (@gt@(in[i], max_val)) {
- out[i] = max_val;
- }
- else {
- out[i] = in[i];
- }
- }
- }
- else {
- /*
- * Visual Studio 2015 loop vectorizer handles NaN in an unexpected
- * manner, see: https://github.com/numpy/numpy/issues/7601
- */
- #if (_MSC_VER == 1900)
- #pragma loop( no_vector )
- #endif
- for (i = 0; i < ni; i++) {
- if (@lt@(in[i], min_val)) {
- out[i] = min_val;
- }
- else if (@gt@(in[i], max_val)) {
- out[i] = max_val;
- }
- else {
- out[i] = in[i];
- }
- }
- }
-}
-/**end repeat**/
-
-#undef _LESS_THAN
-#undef _GREATER_THAN
-#undef _HALF_LESS_THAN
-#undef _HALF_GREATER_THAN
-
-/**begin repeat
- *
- * #name = CFLOAT, CDOUBLE, CLONGDOUBLE#
- * #type = npy_cfloat, npy_cdouble, npy_clongdouble#
- */
-static void
-@name@_fastclip(@type@ *in, npy_intp ni, @type@ *min, @type@ *max, @type@ *out)
-{
- npy_intp i;
- @type@ max_val, min_val;
-
- if (max != NULL) {
- max_val = *max;
- }
- if (min != NULL) {
- min_val = *min;
- }
- if (max == NULL) {
- for (i = 0; i < ni; i++) {
- if (PyArray_CLT(in[i],min_val)) {
- out[i] = min_val;
- }
- else {
- out[i] = in[i];
- }
- }
- }
- else if (min == NULL) {
- for (i = 0; i < ni; i++) {
- if (PyArray_CGT(in[i], max_val)) {
- out[i] = max_val;
- }
- else {
- out[i] = in[i];
- }
- }
- }
- else {
- for (i = 0; i < ni; i++) {
- if (PyArray_CLT(in[i], min_val)) {
- out[i] = min_val;
- }
- else if (PyArray_CGT(in[i], max_val)) {
- out[i] = max_val;
- }
- else {
- out[i] = in[i];
- }
- }
- }
-}
-
-/**end repeat**/
-
-#define OBJECT_fastclip NULL
-
-
-/*
- *****************************************************************************
** FASTPUTMASK **
*****************************************************************************
*/
@@ -4405,6 +4247,7 @@ static PyArray_Descr @from@_Descr = {
* npy_half, npy_float, npy_double, npy_longdouble,
* npy_cfloat, npy_cdouble, npy_clongdouble,
* PyObject *, npy_datetime, npy_timedelta#
+ * #rsort = 1*5, 0*16#
* #NAME = Bool,
* Byte, UByte, Short, UShort, Int, UInt,
* Long, ULong, LongLong, ULongLong,
@@ -4461,12 +4304,20 @@ static PyArray_ArrFuncs _Py@NAME@_ArrFuncs = {
{
quicksort_@suff@,
heapsort_@suff@,
- timsort_@suff@
+ #if @rsort@
+ radixsort_@suff@
+ #else
+ timsort_@suff@
+ #endif
},
{
aquicksort_@suff@,
aheapsort_@suff@,
- atimsort_@suff@
+ #if @rsort@
+ aradixsort_@suff@
+ #else
+ atimsort_@suff@
+ #endif
},
#else
{
@@ -4480,7 +4331,7 @@ static PyArray_ArrFuncs _Py@NAME@_ArrFuncs = {
(PyArray_ScalarKindFunc*)NULL,
NULL,
NULL,
- (PyArray_FastClipFunc*)@from@_fastclip,
+ (PyArray_FastClipFunc*)NULL,
(PyArray_FastPutmaskFunc*)@from@_fastputmask,
(PyArray_FastTakeFunc*)@from@_fasttake,
(PyArray_ArgFunc*)@from@_argmin
diff --git a/numpy/core/src/multiarray/calculation.c b/numpy/core/src/multiarray/calculation.c
index 90ee2c5b2..1d72a5227 100644
--- a/numpy/core/src/multiarray/calculation.c
+++ b/numpy/core/src/multiarray/calculation.c
@@ -918,6 +918,27 @@ PyArray_Clip(PyArrayObject *self, PyObject *min, PyObject *max, PyArrayObject *o
}
func = PyArray_DESCR(self)->f->fastclip;
+ if (func == NULL) {
+ if (min == NULL) {
+ return PyObject_CallFunctionObjArgs(n_ops.minimum, self, max, out, NULL);
+ }
+ else if (max == NULL) {
+ return PyObject_CallFunctionObjArgs(n_ops.maximum, self, min, out, NULL);
+ }
+ else {
+ return PyObject_CallFunctionObjArgs(n_ops.clip, self, min, max, out, NULL);
+ }
+ }
+
+ /* NumPy 1.17.0, 2019-02-24 */
+ if (DEPRECATE(
+ "->f->fastclip is deprecated. Use PyUFunc_RegisterLoopForDescr to "
+ "attach a custom loop to np.core.umath.clip, np.minimum, and "
+ "np.maximum") < 0) {
+ return NULL;
+ }
+ /* everything below can be removed once this deprecation completes */
+
if (func == NULL
|| (min != NULL && !PyArray_CheckAnyScalar(min))
|| (max != NULL && !PyArray_CheckAnyScalar(max))
diff --git a/numpy/core/src/multiarray/compiled_base.c b/numpy/core/src/multiarray/compiled_base.c
index d80d16529..25dc6951c 100644
--- a/numpy/core/src/multiarray/compiled_base.c
+++ b/numpy/core/src/multiarray/compiled_base.c
@@ -921,7 +921,7 @@ fail:
return -1;
}
-/* Inner loop for unravel_index */
+/* Inner loop for ravel_multi_index */
static int
ravel_multi_index_loop(int ravel_ndim, npy_intp *ravel_dims,
npy_intp *ravel_strides,
@@ -1135,67 +1135,44 @@ fail:
return NULL;
}
-/* C-order inner loop for unravel_index */
-static int
-unravel_index_loop_corder(int unravel_ndim, npy_intp *unravel_dims,
- npy_intp unravel_size, npy_intp count,
- char *indices, npy_intp indices_stride,
- npy_intp *coords)
-{
- int i;
- char invalid;
- npy_intp val;
- NPY_BEGIN_ALLOW_THREADS;
- invalid = 0;
- while (count--) {
- val = *(npy_intp *)indices;
- if (val < 0 || val >= unravel_size) {
- invalid = 1;
- break;
- }
- for (i = unravel_ndim-1; i >= 0; --i) {
- coords[i] = val % unravel_dims[i];
- val /= unravel_dims[i];
- }
- coords += unravel_ndim;
- indices += indices_stride;
- }
- NPY_END_ALLOW_THREADS;
- if (invalid) {
- PyErr_Format(PyExc_ValueError,
- "index %" NPY_INTP_FMT " is out of bounds for array with size "
- "%" NPY_INTP_FMT,
- val, unravel_size
- );
- return NPY_FAIL;
- }
- return NPY_SUCCEED;
-}
-
-/* Fortran-order inner loop for unravel_index */
+/*
+ * Inner loop for unravel_index
+ * order must be NPY_CORDER or NPY_FORTRANORDER
+ */
static int
-unravel_index_loop_forder(int unravel_ndim, npy_intp *unravel_dims,
- npy_intp unravel_size, npy_intp count,
- char *indices, npy_intp indices_stride,
- npy_intp *coords)
+unravel_index_loop(int unravel_ndim, npy_intp *unravel_dims,
+ npy_intp unravel_size, npy_intp count,
+ char *indices, npy_intp indices_stride,
+ npy_intp *coords, NPY_ORDER order)
{
- int i;
- char invalid;
- npy_intp val;
+ int i, idx;
+ int idx_start = (order == NPY_CORDER) ? unravel_ndim - 1: 0;
+ int idx_step = (order == NPY_CORDER) ? -1 : 1;
+ char invalid = 0;
+ npy_intp val = 0;
NPY_BEGIN_ALLOW_THREADS;
- invalid = 0;
+ /* NPY_KEEPORDER or NPY_ANYORDER have no meaning in this setting */
+ assert(order == NPY_CORDER || order == NPY_FORTRANORDER);
while (count--) {
val = *(npy_intp *)indices;
if (val < 0 || val >= unravel_size) {
invalid = 1;
break;
}
+ idx = idx_start;
for (i = 0; i < unravel_ndim; ++i) {
- *coords++ = val % unravel_dims[i];
- val /= unravel_dims[i];
+ /*
+ * Using a local seems to enable single-divide optimization
+ * but only if the / precedes the %
+ */
+ npy_intp tmp = val / unravel_dims[idx];
+ coords[idx] = val % unravel_dims[idx];
+ val = tmp;
+ idx += idx_step;
}
+ coords += unravel_ndim;
indices += indices_stride;
}
NPY_END_ALLOW_THREADS;
@@ -1214,11 +1191,12 @@ unravel_index_loop_forder(int unravel_ndim, npy_intp *unravel_dims,
NPY_NO_EXPORT PyObject *
arr_unravel_index(PyObject *self, PyObject *args, PyObject *kwds)
{
- PyObject *indices0 = NULL, *ret_tuple = NULL;
+ PyObject *indices0 = NULL;
+ PyObject *ret_tuple = NULL;
PyArrayObject *ret_arr = NULL;
PyArrayObject *indices = NULL;
PyArray_Descr *dtype = NULL;
- PyArray_Dims dimensions={0,0};
+ PyArray_Dims dimensions = {0, 0};
NPY_ORDER order = NPY_CORDER;
npy_intp unravel_size;
@@ -1261,7 +1239,13 @@ arr_unravel_index(PyObject *self, PyObject *args, PyObject *kwds)
goto fail;
}
- unravel_size = PyArray_MultiplyList(dimensions.ptr, dimensions.len);
+ unravel_size = PyArray_OverflowMultiplyList(dimensions.ptr, dimensions.len);
+ if (unravel_size == -1) {
+ PyErr_SetString(PyExc_ValueError,
+ "dimensions are too large; arrays and shapes with "
+ "a total size greater than 'intp' are not supported.");
+ goto fail;
+ }
indices = astype_anyint(indices0);
if (indices == NULL) {
@@ -1315,64 +1299,35 @@ arr_unravel_index(PyObject *self, PyObject *args, PyObject *kwds)
goto fail;
}
- if (order == NPY_CORDER) {
- if (NpyIter_GetIterSize(iter) != 0) {
- NpyIter_IterNextFunc *iternext;
- char **dataptr;
- npy_intp *strides;
- npy_intp *countptr, count;
- npy_intp *coordsptr = (npy_intp *)PyArray_DATA(ret_arr);
+ if (order != NPY_CORDER && order != NPY_FORTRANORDER) {
+ PyErr_SetString(PyExc_ValueError,
+ "only 'C' or 'F' order is permitted");
+ goto fail;
+ }
+ if (NpyIter_GetIterSize(iter) != 0) {
+ NpyIter_IterNextFunc *iternext;
+ char **dataptr;
+ npy_intp *strides;
+ npy_intp *countptr, count;
+ npy_intp *coordsptr = (npy_intp *)PyArray_DATA(ret_arr);
- iternext = NpyIter_GetIterNext(iter, NULL);
- if (iternext == NULL) {
- goto fail;
- }
- dataptr = NpyIter_GetDataPtrArray(iter);
- strides = NpyIter_GetInnerStrideArray(iter);
- countptr = NpyIter_GetInnerLoopSizePtr(iter);
-
- do {
- count = *countptr;
- if (unravel_index_loop_corder(dimensions.len, dimensions.ptr,
- unravel_size, count, *dataptr, *strides,
- coordsptr) != NPY_SUCCEED) {
- goto fail;
- }
- coordsptr += count*dimensions.len;
- } while(iternext(iter));
+ iternext = NpyIter_GetIterNext(iter, NULL);
+ if (iternext == NULL) {
+ goto fail;
}
- }
- else if (order == NPY_FORTRANORDER) {
- if (NpyIter_GetIterSize(iter) != 0) {
- NpyIter_IterNextFunc *iternext;
- char **dataptr;
- npy_intp *strides;
- npy_intp *countptr, count;
- npy_intp *coordsptr = (npy_intp *)PyArray_DATA(ret_arr);
+ dataptr = NpyIter_GetDataPtrArray(iter);
+ strides = NpyIter_GetInnerStrideArray(iter);
+ countptr = NpyIter_GetInnerLoopSizePtr(iter);
- iternext = NpyIter_GetIterNext(iter, NULL);
- if (iternext == NULL) {
+ do {
+ count = *countptr;
+ if (unravel_index_loop(dimensions.len, dimensions.ptr,
+ unravel_size, count, *dataptr, *strides,
+ coordsptr, order) != NPY_SUCCEED) {
goto fail;
}
- dataptr = NpyIter_GetDataPtrArray(iter);
- strides = NpyIter_GetInnerStrideArray(iter);
- countptr = NpyIter_GetInnerLoopSizePtr(iter);
-
- do {
- count = *countptr;
- if (unravel_index_loop_forder(dimensions.len, dimensions.ptr,
- unravel_size, count, *dataptr, *strides,
- coordsptr) != NPY_SUCCEED) {
- goto fail;
- }
- coordsptr += count*dimensions.len;
- } while(iternext(iter));
- }
- }
- else {
- PyErr_SetString(PyExc_ValueError,
- "only 'C' or 'F' order is permitted");
- goto fail;
+ coordsptr += count * dimensions.len;
+ } while (iternext(iter));
}
@@ -1556,7 +1511,8 @@ pack_inner(const char *inptr,
npy_intp in_stride,
char *outptr,
npy_intp n_out,
- npy_intp out_stride)
+ npy_intp out_stride,
+ char order)
{
/*
* Loop through the elements of inptr.
@@ -1576,9 +1532,13 @@ pack_inner(const char *inptr,
vn_out -= (vn_out & 1);
for (index = 0; index < vn_out; index += 2) {
unsigned int r;
- /* swap as packbits is "big endian", note x86 can load unaligned */
- npy_uint64 a = npy_bswap8(*(npy_uint64*)inptr);
- npy_uint64 b = npy_bswap8(*(npy_uint64*)(inptr + 8));
+ npy_uint64 a = *(npy_uint64*)inptr;
+ npy_uint64 b = *(npy_uint64*)(inptr + 8);
+ if (order == 'b') {
+ a = npy_bswap8(a);
+ b = npy_bswap8(b);
+ }
+ /* note x86 can load unaligned */
__m128i v = _mm_set_epi64(_m_from_int64(b), _m_from_int64(a));
/* false -> 0x00 and true -> 0xFF (there is no cmpneq) */
v = _mm_cmpeq_epi8(v, zero);
@@ -1598,30 +1558,45 @@ pack_inner(const char *inptr,
if (remain == 0) { /* assumes n_in > 0 */
remain = 8;
}
- /* don't reset index to handle remainder of above block */
+ /* Don't reset index. Just handle remainder of above block */
for (; index < n_out; index++) {
- char build = 0;
+ unsigned char build = 0;
int i, maxi;
npy_intp j;
maxi = (index == n_out - 1) ? remain : 8;
- for (i = 0; i < maxi; i++) {
- build <<= 1;
- for (j = 0; j < element_size; j++) {
- build |= (inptr[j] != 0);
+ if (order == 'b') {
+ for (i = 0; i < maxi; i++) {
+ build <<= 1;
+ for (j = 0; j < element_size; j++) {
+ build |= (inptr[j] != 0);
+ }
+ inptr += in_stride;
+ }
+ if (index == n_out - 1) {
+ build <<= 8 - remain;
}
- inptr += in_stride;
}
- if (index == n_out - 1) {
- build <<= 8 - remain;
+ else
+ {
+ for (i = 0; i < maxi; i++) {
+ build >>= 1;
+ for (j = 0; j < element_size; j++) {
+ build |= (inptr[j] != 0) ? 128 : 0;
+ }
+ inptr += in_stride;
+ }
+ if (index == n_out - 1) {
+ build >>= 8 - remain;
+ }
}
- *outptr = build;
+ *outptr = (char)build;
outptr += out_stride;
}
}
static PyObject *
-pack_bits(PyObject *input, int axis)
+pack_bits(PyObject *input, int axis, char order)
{
PyArrayObject *inp;
PyArrayObject *new = NULL;
@@ -1706,7 +1681,7 @@ pack_bits(PyObject *input, int axis)
pack_inner(PyArray_ITER_DATA(it), PyArray_ITEMSIZE(new),
PyArray_DIM(new, axis), PyArray_STRIDE(new, axis),
PyArray_ITER_DATA(ot), PyArray_DIM(out, axis),
- PyArray_STRIDE(out, axis));
+ PyArray_STRIDE(out, axis), order);
PyArray_ITER_NEXT(it);
PyArray_ITER_NEXT(ot);
}
@@ -1726,10 +1701,8 @@ fail:
}
static PyObject *
-unpack_bits(PyObject *input, int axis, PyObject *count_obj)
+unpack_bits(PyObject *input, int axis, PyObject *count_obj, char order)
{
- static int unpack_init = 0;
- static char unpack_lookup[256][8];
PyArrayObject *inp;
PyArrayObject *new = NULL;
PyArrayObject *out = NULL;
@@ -1815,28 +1788,6 @@ unpack_bits(PyObject *input, int axis, PyObject *count_obj)
goto fail;
}
- /* setup lookup table under GIL, big endian 0..256 as bytes */
- if (unpack_init == 0) {
- npy_uint64 j;
- npy_uint64 * unpack_lookup_64 = (npy_uint64 *)unpack_lookup;
- for (j=0; j < 256; j++) {
- npy_uint64 v = 0;
- v |= (npy_uint64)((j & 1) == 1);
- v |= (npy_uint64)((j & 2) == 2) << 8;
- v |= (npy_uint64)((j & 4) == 4) << 16;
- v |= (npy_uint64)((j & 8) == 8) << 24;
- v |= (npy_uint64)((j & 16) == 16) << 32;
- v |= (npy_uint64)((j & 32) == 32) << 40;
- v |= (npy_uint64)((j & 64) == 64) << 48;
- v |= (npy_uint64)((j & 128) == 128) << 56;
-#if NPY_BYTE_ORDER == NPY_LITTLE_ENDIAN
- v = npy_bswap8(v);
-#endif
- unpack_lookup_64[j] = v;
- }
- unpack_init = 1;
- }
-
count = PyArray_DIM(new, axis) * 8;
if (outdims[axis] > count) {
in_n = count / 8;
@@ -1859,45 +1810,40 @@ unpack_bits(PyObject *input, int axis, PyObject *count_obj)
unsigned const char *inptr = PyArray_ITER_DATA(it);
char *outptr = PyArray_ITER_DATA(ot);
- if (out_stride == 1) {
- /* for unity stride we can just copy out of the lookup table */
+ if (order == 'b') {
for (index = 0; index < in_n; index++) {
- memcpy(outptr, unpack_lookup[*inptr], 8);
- outptr += 8;
+ for (i = 0; i < 8; i++) {
+ *outptr = ((*inptr & (128 >> i)) != 0);
+ outptr += out_stride;
+ }
inptr += in_stride;
}
/* Clean up the tail portion */
- if (in_tail) {
- memcpy(outptr, unpack_lookup[*inptr], in_tail);
- }
- /* Add padding */
- else if (out_pad) {
- memset(outptr, 0, out_pad);
+ for (i = 0; i < in_tail; i++) {
+ *outptr = ((*inptr & (128 >> i)) != 0);
+ outptr += out_stride;
}
}
else {
- unsigned char mask;
-
for (index = 0; index < in_n; index++) {
- for (mask = 128; mask; mask >>= 1) {
- *outptr = ((mask & (*inptr)) != 0);
+ for (i = 0; i < 8; i++) {
+ *outptr = ((*inptr & (1 << i)) != 0);
outptr += out_stride;
}
inptr += in_stride;
}
/* Clean up the tail portion */
- mask = 128;
for (i = 0; i < in_tail; i++) {
- *outptr = ((mask & (*inptr)) != 0);
- outptr += out_stride;
- mask >>= 1;
- }
- /* Add padding */
- for (index = 0; index < out_pad; index++) {
- *outptr = 0;
+ *outptr = ((*inptr & (1 << i)) != 0);
outptr += out_stride;
}
}
+ /* Add padding */
+ for (index = 0; index < out_pad; index++) {
+ *outptr = 0;
+ outptr += out_stride;
+ }
+
PyArray_ITER_NEXT(it);
PyArray_ITER_NEXT(ot);
}
@@ -1921,13 +1867,26 @@ io_pack(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwds)
{
PyObject *obj;
int axis = NPY_MAXDIMS;
- static char *kwlist[] = {"a", "axis", NULL};
+ static char *kwlist[] = {"in", "axis", "bitorder", NULL};
+ char c = 'b';
+ const char * order_str = NULL;
- if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|O&:pack" , kwlist,
- &obj, PyArray_AxisConverter, &axis)) {
+ if (!PyArg_ParseTupleAndKeywords( args, kwds, "O|O&s:pack" , kwlist,
+ &obj, PyArray_AxisConverter, &axis, &order_str)) {
return NULL;
}
- return pack_bits(obj, axis);
+ if (order_str != NULL) {
+ if (strncmp(order_str, "little", 6) == 0)
+ c = 'l';
+ else if (strncmp(order_str, "big", 3) == 0)
+ c = 'b';
+ else {
+ PyErr_SetString(PyExc_ValueError,
+ "'order' must be either 'little' or 'big'");
+ return NULL;
+ }
+ }
+ return pack_bits(obj, axis, c);
}
@@ -1937,12 +1896,20 @@ io_unpack(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwds)
PyObject *obj;
int axis = NPY_MAXDIMS;
PyObject *count = Py_None;
- static char *kwlist[] = {"a", "axis", "count", NULL};
+ static char *kwlist[] = {"in", "axis", "count", "bitorder", NULL};
+ const char * c = NULL;
- if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|O&O:unpack" , kwlist,
- &obj, PyArray_AxisConverter, &axis,
- &count)) {
+ if (!PyArg_ParseTupleAndKeywords( args, kwds, "O|O&Os:unpack" , kwlist,
+ &obj, PyArray_AxisConverter, &axis, &count, &c)) {
+ return NULL;
+ }
+ if (c == NULL) {
+ c = "b";
+ }
+ if (c[0] != 'l' && c[0] != 'b') {
+ PyErr_SetString(PyExc_ValueError,
+ "'order' must begin with 'l' or 'b'");
return NULL;
}
- return unpack_bits(obj, axis, count);
+ return unpack_bits(obj, axis, count, c[0]);
}
diff --git a/numpy/core/src/multiarray/conversion_utils.c b/numpy/core/src/multiarray/conversion_utils.c
index fa8de8b37..a370874a6 100644
--- a/numpy/core/src/multiarray/conversion_utils.c
+++ b/numpy/core/src/multiarray/conversion_utils.c
@@ -116,8 +116,8 @@ PyArray_IntpConverter(PyObject *obj, PyArray_Dims *seq)
return NPY_FAIL;
}
if (len > NPY_MAXDIMS) {
- PyErr_Format(PyExc_ValueError, "sequence too large; "
- "cannot be greater than %d", NPY_MAXDIMS);
+        PyErr_Format(PyExc_ValueError, "maximum supported dimension for an ndarray is %d"
+                     ", found %zd", NPY_MAXDIMS, len);
return NPY_FAIL;
}
if (len > 0) {
@@ -393,6 +393,11 @@ PyArray_SortkindConverter(PyObject *obj, NPY_SORTKIND *sortkind)
char *str;
PyObject *tmp = NULL;
+ if (obj == Py_None) {
+ *sortkind = NPY_QUICKSORT;
+ return NPY_SUCCEED;
+ }
+
if (PyUnicode_Check(obj)) {
obj = tmp = PyUnicode_AsASCIIString(obj);
if (obj == NULL) {
@@ -401,6 +406,8 @@ PyArray_SortkindConverter(PyObject *obj, NPY_SORTKIND *sortkind)
}
*sortkind = NPY_QUICKSORT;
+
+
str = PyBytes_AsString(obj);
if (!str) {
Py_XDECREF(tmp);
@@ -424,7 +431,7 @@ PyArray_SortkindConverter(PyObject *obj, NPY_SORTKIND *sortkind)
* That maintains backwards compatibility while
* allowing other types of stable sorts to be used.
*/
- *sortkind = NPY_STABLESORT;
+ *sortkind = NPY_MERGESORT;
}
else if (str[0] == 's' || str[0] == 'S') {
/*
diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c
index 6065c1df4..f4d2513ca 100644
--- a/numpy/core/src/multiarray/item_selection.c
+++ b/numpy/core/src/multiarray/item_selection.c
@@ -1126,7 +1126,7 @@ fail:
NPY_NO_EXPORT int
PyArray_Sort(PyArrayObject *op, int axis, NPY_SORTKIND which)
{
- PyArray_SortFunc *sort;
+ PyArray_SortFunc *sort = NULL;
int n = PyArray_NDIM(op);
if (check_and_adjust_axis(&axis, n) < 0) {
@@ -1143,6 +1143,7 @@ PyArray_Sort(PyArrayObject *op, int axis, NPY_SORTKIND which)
}
sort = PyArray_DESCR(op)->f->sort[which];
+
if (sort == NULL) {
if (PyArray_DESCR(op)->f->compare) {
switch (which) {
@@ -1284,16 +1285,11 @@ NPY_NO_EXPORT PyObject *
PyArray_ArgSort(PyArrayObject *op, int axis, NPY_SORTKIND which)
{
PyArrayObject *op2;
- PyArray_ArgSortFunc *argsort;
+ PyArray_ArgSortFunc *argsort = NULL;
PyObject *ret;
- if (which < 0 || which >= NPY_NSORTS) {
- PyErr_SetString(PyExc_ValueError,
- "not a valid sort kind");
- return NULL;
- }
-
argsort = PyArray_DESCR(op)->f->argsort[which];
+
if (argsort == NULL) {
if (PyArray_DESCR(op)->f->compare) {
switch (which) {
diff --git a/numpy/core/src/multiarray/iterators.c b/numpy/core/src/multiarray/iterators.c
index 9fcdc91b2..029671e4a 100644
--- a/numpy/core/src/multiarray/iterators.c
+++ b/numpy/core/src/multiarray/iterators.c
@@ -1262,10 +1262,14 @@ PyArray_MultiIterFromObjects(PyObject **mps, int n, int nadd, ...)
int i, ntot, err=0;
ntot = n + nadd;
- if (ntot < 1 || ntot > NPY_MAXARGS) {
+ if (ntot < 0) {
PyErr_Format(PyExc_ValueError,
- "Need at least 1 and at most %d "
- "array objects.", NPY_MAXARGS);
+ "n and nadd arguments must be non-negative", NPY_MAXARGS);
+ return NULL;
+ }
+ if (ntot > NPY_MAXARGS) {
+ PyErr_Format(PyExc_ValueError,
+ "At most %d array objects are supported.", NPY_MAXARGS);
return NULL;
}
multi = PyArray_malloc(sizeof(PyArrayMultiIterObject));
@@ -1328,10 +1332,14 @@ PyArray_MultiIterNew(int n, ...)
int i, err = 0;
- if (n < 1 || n > NPY_MAXARGS) {
+ if (n < 0) {
+        PyErr_Format(PyExc_ValueError,
+                "n argument must be non-negative");
+ return NULL;
+ }
+ if (n > NPY_MAXARGS) {
PyErr_Format(PyExc_ValueError,
- "Need at least 1 and at most %d "
- "array objects.", NPY_MAXARGS);
+ "At most %d array objects are supported.", NPY_MAXARGS);
return NULL;
}
@@ -1388,7 +1396,7 @@ arraymultiter_new(PyTypeObject *NPY_UNUSED(subtype), PyObject *args, PyObject *k
PyArrayMultiIterObject *multi;
PyObject *arr;
- if (kwds != NULL) {
+ if (kwds != NULL && PyDict_Size(kwds) > 0) {
PyErr_SetString(PyExc_ValueError,
"keyword arguments not accepted.");
return NULL;
@@ -1409,13 +1417,12 @@ arraymultiter_new(PyTypeObject *NPY_UNUSED(subtype), PyObject *args, PyObject *k
++n;
}
}
- if (n < 1 || n > NPY_MAXARGS) {
- if (PyErr_Occurred()) {
- return NULL;
- }
+ if (PyErr_Occurred()) {
+ return NULL;
+ }
+ if (n > NPY_MAXARGS) {
PyErr_Format(PyExc_ValueError,
- "Need at least 1 and at most %d "
- "array objects.", NPY_MAXARGS);
+ "At most %d array objects are supported.", NPY_MAXARGS);
return NULL;
}
diff --git a/numpy/core/src/multiarray/methods.c b/numpy/core/src/multiarray/methods.c
index 0ddec2995..0d30db07e 100644
--- a/numpy/core/src/multiarray/methods.c
+++ b/numpy/core/src/multiarray/methods.c
@@ -2399,21 +2399,7 @@ array_trace(PyArrayObject *self, PyObject *args, PyObject *kwds)
static PyObject *
array_clip(PyArrayObject *self, PyObject *args, PyObject *kwds)
{
- PyObject *min = NULL, *max = NULL;
- PyArrayObject *out = NULL;
- static char *kwlist[] = {"min", "max", "out", NULL};
-
- if (!PyArg_ParseTupleAndKeywords(args, kwds, "|OOO&:clip", kwlist,
- &min,
- &max,
- PyArray_OutputConverter, &out)) {
- return NULL;
- }
- if (max == NULL && min == NULL) {
- PyErr_SetString(PyExc_ValueError, "One of max or min must be given.");
- return NULL;
- }
- return PyArray_Return((PyArrayObject *)PyArray_Clip(self, min, max, out));
+ NPY_FORWARD_NDARRAY_METHOD("_clip");
}
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index 52412b827..e15ab5172 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -4498,7 +4498,7 @@ NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_array_prepare = NULL;
NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_array_wrap = NULL;
NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_array_finalize = NULL;
NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_ufunc = NULL;
-NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_wrapped = NULL;
+NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_skip_array_function = NULL;
NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_order = NULL;
NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_copy = NULL;
NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_dtype = NULL;
@@ -4514,7 +4514,8 @@ intern_strings(void)
npy_ma_str_array_wrap = PyUString_InternFromString("__array_wrap__");
npy_ma_str_array_finalize = PyUString_InternFromString("__array_finalize__");
npy_ma_str_ufunc = PyUString_InternFromString("__array_ufunc__");
- npy_ma_str_wrapped = PyUString_InternFromString("__wrapped__");
+ npy_ma_str_skip_array_function = PyUString_InternFromString(
+ "__skip_array_function__");
npy_ma_str_order = PyUString_InternFromString("order");
npy_ma_str_copy = PyUString_InternFromString("copy");
npy_ma_str_dtype = PyUString_InternFromString("dtype");
@@ -4524,7 +4525,7 @@ intern_strings(void)
return npy_ma_str_array && npy_ma_str_array_prepare &&
npy_ma_str_array_wrap && npy_ma_str_array_finalize &&
- npy_ma_str_ufunc && npy_ma_str_wrapped &&
+ npy_ma_str_ufunc && npy_ma_str_skip_array_function &&
npy_ma_str_order && npy_ma_str_copy && npy_ma_str_dtype &&
npy_ma_str_ndmin && npy_ma_str_axis1 && npy_ma_str_axis2;
}
diff --git a/numpy/core/src/multiarray/multiarraymodule.h b/numpy/core/src/multiarray/multiarraymodule.h
index 6d33c3295..5cf082fbb 100644
--- a/numpy/core/src/multiarray/multiarraymodule.h
+++ b/numpy/core/src/multiarray/multiarraymodule.h
@@ -6,7 +6,7 @@ NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_array_prepare;
NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_array_wrap;
NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_array_finalize;
NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_ufunc;
-NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_wrapped;
+NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_skip_array_function;
NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_order;
NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_copy;
NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_dtype;
diff --git a/numpy/core/src/multiarray/number.c b/numpy/core/src/multiarray/number.c
index 420501ce2..0ceb994ef 100644
--- a/numpy/core/src/multiarray/number.c
+++ b/numpy/core/src/multiarray/number.c
@@ -113,6 +113,7 @@ _PyArray_SetNumericOps(PyObject *dict)
SET(rint);
SET(conjugate);
SET(matmul);
+ SET(clip);
return 0;
}
@@ -179,6 +180,7 @@ _PyArray_GetNumericOps(void)
GET(rint);
GET(conjugate);
GET(matmul);
+ GET(clip);
return dict;
fail:
diff --git a/numpy/core/src/multiarray/number.h b/numpy/core/src/multiarray/number.h
index 33a7cf872..643241b3d 100644
--- a/numpy/core/src/multiarray/number.h
+++ b/numpy/core/src/multiarray/number.h
@@ -40,6 +40,7 @@ typedef struct {
PyObject *rint;
PyObject *conjugate;
PyObject *matmul;
+ PyObject *clip;
} NumericOps;
extern NPY_NO_EXPORT NumericOps n_ops;
diff --git a/numpy/core/src/npysort/radixsort.c.src b/numpy/core/src/npysort/radixsort.c.src
new file mode 100644
index 000000000..c1435bd96
--- /dev/null
+++ b/numpy/core/src/npysort/radixsort.c.src
@@ -0,0 +1,229 @@
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "npy_sort.h"
+#include "npysort_common.h"
+#include <stdlib.h>
+
+/*
+ *****************************************************************************
+ ** INTEGER SORTS **
+ *****************************************************************************
+ */
+
+
+/**begin repeat
+ *
+ * #TYPE = BOOL, BYTE, UBYTE, SHORT, USHORT, INT, UINT, LONG, ULONG,
+ * LONGLONG, ULONGLONG#
+ * #suff = bool, byte, ubyte, short, ushort, int, uint, long, ulong,
+ * longlong, ulonglong#
+ * #type = npy_ubyte, npy_ubyte, npy_ubyte, npy_ushort, npy_ushort, npy_uint,
+ * npy_uint, npy_ulong, npy_ulong, npy_ulonglong, npy_ulonglong#
+ * #sign = 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0#
+ * #floating = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0#
+ */
+
+// Reference: https://github.com/eloj/radix-sorting#-key-derivation
+#if @sign@
+ // Floating-point is currently disabled.
+ // Floating-point tests succeed for double and float on macOS but not on Windows/Linux.
+ // Basic sorting tests succeed but others relying on sort fail.
+ // Possibly related to floating-point normalisation or multiple NaN reprs? Not sure.
+ #if @floating@
+ // For floats, we invert the key if the sign bit is set, else we invert the sign bit.
+ #define KEY_OF(x) ((x) ^ (-((x) >> (sizeof(@type@) * 8 - 1)) | ((@type@)1 << (sizeof(@type@) * 8 - 1))))
+ #else
+ // For signed ints, we flip the sign bit so the negatives are below the positives.
+ #define KEY_OF(x) ((x) ^ ((@type@)1 << (sizeof(@type@) * 8 - 1)))
+ #endif
+#else
+ // For unsigned ints, the key is as-is
+ #define KEY_OF(x) (x)
+#endif
+
+static inline npy_ubyte
+nth_byte_@suff@(@type@ key, npy_intp l) {
+ return (key >> (l << 3)) & 0xFF;
+}
+
+@type@*
+radixsort0_@suff@(@type@ *arr, @type@ *aux, npy_intp num)
+{
+ npy_intp cnt[sizeof(@type@)][1 << 8] = { { 0 } };
+ npy_intp i, l;
+ @type@ key0 = KEY_OF(arr[0]);
+ npy_intp ncols = 0;
+ npy_ubyte cols[sizeof(@type@)];
+
+ for (i = 0; i < num; i++) {
+ @type@ k = KEY_OF(arr[i]);
+
+ for (l = 0; l < sizeof(@type@); l++) {
+ cnt[l][nth_byte_@suff@(k, l)]++;
+ }
+ }
+
+ for (l = 0; l < sizeof(@type@); l++) {
+ if (cnt[l][nth_byte_@suff@(key0, l)] != num) {
+ cols[ncols++] = l;
+ }
+ }
+
+ for (l = 0; l < ncols; l++) {
+ npy_intp a = 0;
+ for (i = 0; i < 256; i++) {
+ npy_intp b = cnt[cols[l]][i];
+ cnt[cols[l]][i] = a;
+ a += b;
+ }
+ }
+
+ for (l = 0; l < ncols; l++) {
+ @type@* temp;
+ for (i = 0; i < num; i++) {
+ @type@ k = KEY_OF(arr[i]);
+ npy_intp dst = cnt[cols[l]][nth_byte_@suff@(k, cols[l])]++;
+ aux[dst] = arr[i];
+ }
+
+ temp = aux;
+ aux = arr;
+ arr = temp;
+ }
+
+ return arr;
+}
+
+int
+radixsort_@suff@(void *start, npy_intp num, void *NPY_UNUSED(varr))
+{
+ void *sorted;
+ @type@ *aux;
+ @type@ *arr = start;
+ @type@ k1, k2;
+ npy_bool all_sorted = 1;
+
+ if (num < 2) {
+ return 0;
+ }
+
+ k1 = KEY_OF(arr[0]);
+ for (npy_intp i = 1; i < num; i++) {
+ k2 = KEY_OF(arr[i]);
+ if (k1 > k2) {
+ all_sorted = 0;
+ break;
+ }
+ k1 = k2;
+ }
+
+ if (all_sorted) {
+ return 0;
+ }
+
+ aux = malloc(num * sizeof(@type@));
+ if (aux == NULL) {
+ return -NPY_ENOMEM;
+ }
+
+ sorted = radixsort0_@suff@(start, aux, num);
+ if (sorted != start) {
+ memcpy(start, sorted, num * sizeof(@type@));
+ }
+
+ free(aux);
+ return 0;
+}
+
+npy_intp*
+aradixsort0_@suff@(@type@ *arr, npy_intp *aux, npy_intp *tosort, npy_intp num)
+{
+ npy_intp cnt[sizeof(@type@)][1 << 8] = { { 0 } };
+ npy_intp i, l;
+ @type@ key0 = KEY_OF(arr[0]);
+ npy_intp ncols = 0;
+ npy_ubyte cols[sizeof(@type@)];
+
+ for (i = 0; i < num; i++) {
+ @type@ k = KEY_OF(arr[i]);
+
+ for (l = 0; l < sizeof(@type@); l++) {
+ cnt[l][nth_byte_@suff@(k, l)]++;
+ }
+ }
+
+ for (l = 0; l < sizeof(@type@); l++) {
+ if (cnt[l][nth_byte_@suff@(key0, l)] != num) {
+ cols[ncols++] = l;
+ }
+ }
+
+ for (l = 0; l < ncols; l++) {
+ npy_intp a = 0;
+ for (i = 0; i < 256; i++) {
+ npy_intp b = cnt[cols[l]][i];
+ cnt[cols[l]][i] = a;
+ a += b;
+ }
+ }
+
+ for (l = 0; l < ncols; l++) {
+ npy_intp* temp;
+ for (i = 0; i < num; i++) {
+ @type@ k = KEY_OF(arr[tosort[i]]);
+ npy_intp dst = cnt[cols[l]][nth_byte_@suff@(k, cols[l])]++;
+ aux[dst] = tosort[i];
+ }
+
+ temp = aux;
+ aux = tosort;
+ tosort = temp;
+ }
+
+ return tosort;
+}
+
+int
+aradixsort_@suff@(void *start, npy_intp* tosort, npy_intp num, void *NPY_UNUSED(varr))
+{
+ npy_intp *sorted;
+ npy_intp *aux;
+ @type@ *arr = start;
+ @type@ k1, k2;
+ npy_bool all_sorted = 1;
+
+ if (num < 2) {
+ return 0;
+ }
+
+ k1 = KEY_OF(arr[0]);
+ for (npy_intp i = 1; i < num; i++) {
+ k2 = KEY_OF(arr[i]);
+ if (k1 > k2) {
+ all_sorted = 0;
+ break;
+ }
+ k1 = k2;
+ }
+
+ if (all_sorted) {
+ return 0;
+ }
+
+ aux = malloc(num * sizeof(npy_intp));
+ if (aux == NULL) {
+ return -NPY_ENOMEM;
+ }
+
+ sorted = aradixsort0_@suff@(start, aux, tosort, num);
+ if (sorted != tosort) {
+ memcpy(tosort, sorted, num * sizeof(npy_intp));
+ }
+
+ free(aux);
+ return 0;
+}
+
+#undef KEY_OF
+
+/**end repeat**/
diff --git a/numpy/core/src/umath/clip.c.src b/numpy/core/src/umath/clip.c.src
new file mode 100644
index 000000000..30fa3d2b3
--- /dev/null
+++ b/numpy/core/src/umath/clip.c.src
@@ -0,0 +1,119 @@
+/**
+ * This module provides the inner loops for the clip ufunc
+ */
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "Python.h"
+
+#include "numpy/halffloat.h"
+#include "numpy/npy_math.h"
+#include "numpy/ndarraytypes.h"
+#include "numpy/npy_common.h"
+#include "numpy/utils.h"
+#include "fast_loop_macros.h"
+
+/*
+ * Produce macros that perform nan/nat-propagating min and max
+ */
+
+/**begin repeat
+ * #name = BOOL,
+ * BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ * LONG, ULONG, LONGLONG, ULONGLONG#
+ */
+#define _NPY_@name@_MIN(a, b) PyArray_MIN(a, b)
+#define _NPY_@name@_MAX(a, b) PyArray_MAX(a, b)
+/**end repeat**/
+
+#define _NPY_HALF_MIN(a, b) (npy_half_isnan(a) || npy_half_le(a, b) ? (a) : (b))
+#define _NPY_HALF_MAX(a, b) (npy_half_isnan(a) || npy_half_ge(a, b) ? (a) : (b))
+
+/**begin repeat
+ * #name = FLOAT, DOUBLE, LONGDOUBLE#
+ */
+#define _NPY_@name@_MIN(a, b) (npy_isnan(a) ? (a) : PyArray_MIN(a, b))
+#define _NPY_@name@_MAX(a, b) (npy_isnan(a) ? (a) : PyArray_MAX(a, b))
+/**end repeat**/
+
+/**begin repeat
+ * #name = CFLOAT, CDOUBLE, CLONGDOUBLE#
+ */
+#define _NPY_@name@_MIN(a, b) (npy_isnan((a).real) || npy_isnan((a).imag) || PyArray_CLT(a, b) ? (a) : (b))
+#define _NPY_@name@_MAX(a, b) (npy_isnan((a).real) || npy_isnan((a).imag) || PyArray_CGT(a, b) ? (a) : (b))
+/**end repeat**/
+
+/**begin repeat
+ * #name = DATETIME, TIMEDELTA#
+ */
+#define _NPY_@name@_MIN(a, b) ( \
+ (a) == NPY_DATETIME_NAT ? (a) : \
+ (b) == NPY_DATETIME_NAT ? (b) : \
+ (a) < (b) ? (a) : (b) \
+)
+#define _NPY_@name@_MAX(a, b) ( \
+ (a) == NPY_DATETIME_NAT ? (a) : \
+ (b) == NPY_DATETIME_NAT ? (b) : \
+ (a) > (b) ? (a) : (b) \
+)
+/**end repeat**/
+
+/**begin repeat
+ *
+ * #name = BOOL,
+ * BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ * LONG, ULONG, LONGLONG, ULONGLONG,
+ * HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE,
+ * DATETIME, TIMEDELTA#
+ * #type = npy_bool,
+ * npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ * npy_long, npy_ulong, npy_longlong, npy_ulonglong,
+ * npy_half, npy_float, npy_double, npy_longdouble,
+ * npy_cfloat, npy_cdouble, npy_clongdouble,
+ * npy_datetime, npy_timedelta#
+ */
+
+#define _NPY_CLIP(x, min, max) \
+ _NPY_@name@_MIN(_NPY_@name@_MAX((x), (min)), (max))
+
+NPY_NO_EXPORT void
+@name@_clip(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ if (steps[1] == 0 && steps[2] == 0) {
+ /* min and max are constant throughout the loop, the most common case */
+ /* NOTE: it may be possible to optimize these checks for nan */
+ @type@ min_val = *(@type@ *)args[1];
+ @type@ max_val = *(@type@ *)args[2];
+
+ char *ip1 = args[0], *op1 = args[3];
+ npy_intp is1 = steps[0], os1 = steps[3];
+ npy_intp n = dimensions[0];
+
+ /* contiguous, branch to let the compiler optimize */
+ if (is1 == sizeof(@type@) && os1 == sizeof(@type@)) {
+ for(npy_intp i = 0; i < n; i++, ip1 += is1, op1 += os1) {
+ *(@type@ *)op1 = _NPY_CLIP(*(@type@ *)ip1, min_val, max_val);
+ }
+ }
+ else {
+ for(npy_intp i = 0; i < n; i++, ip1 += is1, op1 += os1) {
+ *(@type@ *)op1 = _NPY_CLIP(*(@type@ *)ip1, min_val, max_val);
+ }
+ }
+ }
+ else {
+ TERNARY_LOOP {
+ *(@type@ *)op1 = _NPY_CLIP(*(@type@ *)ip1, *(@type@ *)ip2, *(@type@ *)ip3);
+ }
+ }
+ npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+// clean up the macros we defined above
+#undef _NPY_CLIP
+#undef _NPY_@name@_MAX
+#undef _NPY_@name@_MIN
+
+/**end repeat**/
diff --git a/numpy/core/src/umath/clip.h.src b/numpy/core/src/umath/clip.h.src
new file mode 100644
index 000000000..d77971ad7
--- /dev/null
+++ b/numpy/core/src/umath/clip.h.src
@@ -0,0 +1,18 @@
+#ifndef _NPY_UMATH_CLIP_H_
+#define _NPY_UMATH_CLIP_H_
+
+
+/**begin repeat
+ *
+ * #name = BOOL,
+ * BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ * LONG, ULONG, LONGLONG, ULONGLONG,
+ * HALF, FLOAT, DOUBLE, LONGDOUBLE,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE,
+ * DATETIME, TIMEDELTA#
+ */
+NPY_NO_EXPORT void
+@name@_clip(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+/**end repeat**/
+
+#endif
diff --git a/numpy/core/src/umath/fast_loop_macros.h b/numpy/core/src/umath/fast_loop_macros.h
index 7a1ed66bc..ae6d69a3e 100644
--- a/numpy/core/src/umath/fast_loop_macros.h
+++ b/numpy/core/src/umath/fast_loop_macros.h
@@ -58,6 +58,14 @@
npy_intp i;\
for(i = 0; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1, op2 += os2)
+/** (ip1, ip2, ip3) -> (op1) */
+#define TERNARY_LOOP\
+ char *ip1 = args[0], *ip2 = args[1], *ip3 = args[2], *op1 = args[3];\
+ npy_intp is1 = steps[0], is2 = steps[1], is3 = steps[2], os1 = steps[3];\
+ npy_intp n = dimensions[0];\
+ npy_intp i;\
+ for(i = 0; i < n; i++, ip1 += is1, ip2 += is2, ip3 += is3, op1 += os1)
+
/** @} */
/* unary loop input and output contiguous */
diff --git a/numpy/core/src/umath/funcs.inc.src b/numpy/core/src/umath/funcs.inc.src
index 2acae3c37..c2732f925 100644
--- a/numpy/core/src/umath/funcs.inc.src
+++ b/numpy/core/src/umath/funcs.inc.src
@@ -259,6 +259,17 @@ npy_ObjectLCM(PyObject *i1, PyObject *i2)
return PyNumber_Absolute(tmp);
}
+
+static PyObject *
+npy_ObjectClip(PyObject *arr, PyObject *min, PyObject *max) {
+ PyObject *o = npy_ObjectMax(arr, min);
+ if (o == NULL) {
+ return NULL;
+ }
+ Py_SETREF(o, npy_ObjectMin(o, max));
+ return o;
+}
+
/*
*****************************************************************************
** COMPLEX FUNCTIONS **
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index ff3b36428..a2649ed93 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -425,6 +425,28 @@ PyUFunc_OO_O(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
}
}
+NPY_NO_EXPORT void
+PyUFunc_OOO_O(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
+{
+ ternaryfunc f = (ternaryfunc)func;
+ TERNARY_LOOP {
+ PyObject *in1 = *(PyObject **)ip1;
+ PyObject *in2 = *(PyObject **)ip2;
+ PyObject *in3 = *(PyObject **)ip3;
+ PyObject **out = (PyObject **)op1;
+ PyObject *ret = f(
+ in1 ? in1 : Py_None,
+ in2 ? in2 : Py_None,
+ in3 ? in3 : Py_None
+ );
+ if (ret == NULL) {
+ return;
+ }
+ Py_XDECREF(*out);
+ *out = ret;
+ }
+}
+
/*UFUNC_API*/
NPY_NO_EXPORT void
PyUFunc_OO_O_method(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
@@ -1599,21 +1621,23 @@ FLOAT_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSE
NPY_NO_EXPORT NPY_GCC_OPT_3 void
FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
{
+ if (!run_unary_@isa@_@func@_FLOAT(args, dimensions, steps)) {
+ UNARY_LOOP {
+ /*
+ * We use the AVX function to compute exp/log for scalar elements as well.
+ * This is needed to ensure the output of strided and non-strided
+ * cases match. But this worsens the performance of strided arrays.
+ * There is plan to fix this in a subsequent patch by using gather
+ * instructions for strided arrays in the AVX function.
+ */
#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
- @ISA@_@func@_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0]);
+ @ISA@_@func@_FLOAT((npy_float *)op1, (npy_float *)ip1, 1);
#else
- /*
- * This is the path it would take if ISA was runtime detected, but not
- * compiled for. It fixes the error on clang6.0 which fails to compile
- * AVX512F version. Not sure if I like this idea, if during runtime it
- * detects AXV512F, it will end up running the scalar version instead
- * of AVX2.
- */
- UNARY_LOOP {
- const npy_float in1 = *(npy_float *)ip1;
- *(npy_float *)op1 = @scalarf@(in1);
- }
+ const npy_float in1 = *(npy_float *)ip1;
+ *(npy_float *)op1 = @scalarf@(in1);
#endif
+ }
+ }
}
/**end repeat1**/
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 8dd3170e3..7f05a693a 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -549,6 +549,9 @@ OBJECT@suffix@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *
NPY_NO_EXPORT void
OBJECT_sign(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+NPY_NO_EXPORT void
+PyUFunc_OOO_O(char **args, npy_intp *dimensions, npy_intp *steps, void *func);
+
/*
*****************************************************************************
** END LOOPS **
diff --git a/numpy/core/src/umath/matmul.c.src b/numpy/core/src/umath/matmul.c.src
index 0cb3c82ad..480c0c72f 100644
--- a/numpy/core/src/umath/matmul.c.src
+++ b/numpy/core/src/umath/matmul.c.src
@@ -267,19 +267,88 @@ NPY_NO_EXPORT void
/**end repeat**/
+
+NPY_NO_EXPORT void
+OBJECT_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n,
+ void *_ip2, npy_intp is2_n, npy_intp is2_p,
+ void *_op, npy_intp os_m, npy_intp os_p,
+ npy_intp dm, npy_intp dn, npy_intp dp)
+{
+ char *ip1 = (char *)_ip1, *ip2 = (char *)_ip2, *op = (char *)_op;
+
+ npy_intp ib1_n = is1_n * dn;
+ npy_intp ib2_n = is2_n * dn;
+ npy_intp ib2_p = is2_p * dp;
+ npy_intp ob_p = os_p * dp;
+
+ PyObject *product, *sum_of_products = NULL;
+
+ for (npy_intp m = 0; m < dm; m++) {
+ for (npy_intp p = 0; p < dp; p++) {
+ if ( 0 == dn ) {
+ sum_of_products = PyLong_FromLong(0);
+ if (sum_of_products == NULL) {
+ return;
+ }
+ }
+
+ for (npy_intp n = 0; n < dn; n++) {
+ PyObject *obj1 = *(PyObject**)ip1, *obj2 = *(PyObject**)ip2;
+ if (obj1 == NULL) {
+ obj1 = Py_None;
+ }
+ if (obj2 == NULL) {
+ obj2 = Py_None;
+ }
+
+ product = PyNumber_Multiply(obj1, obj2);
+ if (product == NULL) {
+ Py_XDECREF(sum_of_products);
+ return;
+ }
+
+ if (n == 0) {
+ sum_of_products = product;
+ }
+ else {
+ Py_SETREF(sum_of_products, PyNumber_Add(sum_of_products, product));
+ Py_DECREF(product);
+ if (sum_of_products == NULL) {
+ return;
+ }
+ }
+
+ ip2 += is2_n;
+ ip1 += is1_n;
+ }
+
+ *((PyObject **)op) = sum_of_products;
+ ip1 -= ib1_n;
+ ip2 -= ib2_n;
+ op += os_p;
+ ip2 += is2_p;
+ }
+ op -= ob_p;
+ ip2 -= ib2_p;
+ ip1 += is1_m;
+ op += os_m;
+ }
+}
+
+
/**begin repeat
* #TYPE = FLOAT, DOUBLE, LONGDOUBLE, HALF,
* CFLOAT, CDOUBLE, CLONGDOUBLE,
* UBYTE, USHORT, UINT, ULONG, ULONGLONG,
* BYTE, SHORT, INT, LONG, LONGLONG,
- * BOOL#
+ * BOOL, OBJECT#
* #typ = npy_float,npy_double,npy_longdouble, npy_half,
* npy_cfloat, npy_cdouble, npy_clongdouble,
* npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
* npy_byte, npy_short, npy_int, npy_long, npy_longlong,
- * npy_bool#
- * #IS_COMPLEX = 0, 0, 0, 0, 1, 1, 1, 0*11#
- * #USEBLAS = 1, 1, 0, 0, 1, 1, 0*12#
+ * npy_bool,npy_object#
+ * #IS_COMPLEX = 0, 0, 0, 0, 1, 1, 1, 0*12#
+ * #USEBLAS = 1, 1, 0, 0, 1, 1, 0*13#
*/
@@ -398,5 +467,3 @@ NPY_NO_EXPORT void
}
/**end repeat**/
-
-
diff --git a/numpy/core/src/umath/matmul.h.src b/numpy/core/src/umath/matmul.h.src
index 16be7675b..a664b1b4e 100644
--- a/numpy/core/src/umath/matmul.h.src
+++ b/numpy/core/src/umath/matmul.h.src
@@ -3,7 +3,7 @@
* CFLOAT, CDOUBLE, CLONGDOUBLE,
* UBYTE, USHORT, UINT, ULONG, ULONGLONG,
* BYTE, SHORT, INT, LONG, LONGLONG,
- * BOOL#
+ * BOOL, OBJECT#
**/
NPY_NO_EXPORT void
@TYPE@_matmul(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index b94a5a0f7..1c6ac4426 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -122,20 +122,36 @@ abs_ptrdiff(char *a, char *b)
/**begin repeat
* #ISA = AVX2, AVX512F#
+ * #isa = avx2, avx512f#
+ * #REGISTER_SIZE = 32, 64#
*/
/* prototypes */
-#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
/**begin repeat1
* #func = exp, log#
*/
-static void
+#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+static NPY_INLINE void
@ISA@_@func@_FLOAT(npy_float *, npy_float *, const npy_intp n);
+#endif
-/**end repeat1**/
+static NPY_INLINE int
+run_unary_@isa@_@func@_FLOAT(char **args, npy_intp *dimensions, npy_intp *steps)
+{
+#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+ if (IS_BLOCKABLE_UNARY(sizeof(npy_float), @REGISTER_SIZE@)) {
+ @ISA@_@func@_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0]);
+ return 1;
+ }
+ else
+ return 0;
#endif
+ return 0;
+}
+
+/**end repeat1**/
/**end repeat**/
diff --git a/numpy/core/tests/test_longdouble.py b/numpy/core/tests/test_longdouble.py
index cf50d5d5c..ee4197f8f 100644
--- a/numpy/core/tests/test_longdouble.py
+++ b/numpy/core/tests/test_longdouble.py
@@ -1,5 +1,6 @@
from __future__ import division, absolute_import, print_function
+import warnings
import pytest
import numpy as np
@@ -205,3 +206,28 @@ class TestCommaDecimalPointLocale(CommaDecimalPointLocale):
def test_fromstring_foreign_value(self):
b = np.fromstring("1,234", dtype=np.longdouble, sep=" ")
assert_array_equal(b[0], 1)
+
+@pytest.mark.parametrize("int_val", [
+ # cases discussed in gh-10723
+ # and gh-9968
+ 2 ** 1024, 0])
+def test_longdouble_from_int(int_val):
+ # for issue gh-9968
+ str_val = str(int_val)
+ # we'll expect a RuntimeWarning on platforms
+ # with np.longdouble equivalent to np.double
+ # for large integer input
+ with warnings.catch_warnings(record=True) as w:
+ warnings.filterwarnings('always', '', RuntimeWarning)
+ # can be inf==inf on some platforms
+ assert np.longdouble(int_val) == np.longdouble(str_val)
+ # we can't directly compare the int and
+ # max longdouble value on all platforms
+ if np.allclose(np.finfo(np.longdouble).max,
+ np.finfo(np.double).max) and w:
+ assert w[0].category is RuntimeWarning
+
+@pytest.mark.parametrize("bool_val", [
+ True, False])
+def test_longdouble_from_bool(bool_val):
+ assert np.longdouble(bool_val) == np.longdouble(int(bool_val))
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index b29daa675..a2dd47c92 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -1616,16 +1616,31 @@ class TestMethods(object):
# of sorted items must be greater than ~50 to check the actual
# algorithm because quick and merge sort fall over to insertion
# sort for small arrays.
- a = np.arange(101)
- b = a[::-1].copy()
- for kind in self.sort_kinds:
- msg = "scalar sort, kind=%s" % kind
- c = a.copy()
- c.sort(kind=kind)
- assert_equal(c, a, msg)
- c = b.copy()
- c.sort(kind=kind)
- assert_equal(c, a, msg)
+ # Test unsigned dtypes and nonnegative numbers
+ for dtype in [np.uint8, np.uint16, np.uint32, np.uint64, np.float16, np.float32, np.float64, np.longdouble]:
+ a = np.arange(101, dtype=dtype)
+ b = a[::-1].copy()
+ for kind in self.sort_kinds:
+ msg = "scalar sort, kind=%s, dtype=%s" % (kind, dtype)
+ c = a.copy()
+ c.sort(kind=kind)
+ assert_equal(c, a, msg)
+ c = b.copy()
+ c.sort(kind=kind)
+ assert_equal(c, a, msg)
+
+ # Test signed dtypes and negative numbers as well
+ for dtype in [np.int8, np.int16, np.int32, np.int64, np.float16, np.float32, np.float64, np.longdouble]:
+ a = np.arange(-50, 51, dtype=dtype)
+ b = a[::-1].copy()
+ for kind in self.sort_kinds:
+ msg = "scalar sort, kind=%s, dtype=%s" % (kind, dtype)
+ c = a.copy()
+ c.sort(kind=kind)
+ assert_equal(c, a, msg)
+ c = b.copy()
+ c.sort(kind=kind)
+ assert_equal(c, a, msg)
# test complex sorts. These use the same code as the scalars
# but the compare function differs.
@@ -1881,12 +1896,14 @@ class TestMethods(object):
# of sorted items must be greater than ~50 to check the actual
# algorithm because quick and merge sort fall over to insertion
# sort for small arrays.
- a = np.arange(101)
- b = a[::-1].copy()
- for kind in self.sort_kinds:
- msg = "scalar argsort, kind=%s" % kind
- assert_equal(a.copy().argsort(kind=kind), a, msg)
- assert_equal(b.copy().argsort(kind=kind), b, msg)
+
+ for dtype in [np.int32, np.uint32, np.float32]:
+ a = np.arange(101, dtype=dtype)
+ b = a[::-1].copy()
+ for kind in self.sort_kinds:
+ msg = "scalar argsort, kind=%s, dtype=%s" % (kind, dtype)
+ assert_equal(a.copy().argsort(kind=kind), a, msg)
+ assert_equal(b.copy().argsort(kind=kind), b, msg)
# test complex argsorts. These use the same code as the scalars
# but the compare function differs.
@@ -4270,7 +4287,11 @@ class TestClip(object):
x = (np.random.random(1000) * array_max).astype(dtype)
if inplace:
- x.clip(clip_min, clip_max, x)
+ # The tests that call us pass clip_min and clip_max that
+ # might not fit in the destination dtype. They were written
+ # assuming the previous unsafe casting, which now must be
+ # passed explicitly to avoid a warning.
+ x.clip(clip_min, clip_max, x, casting='unsafe')
else:
x = x.clip(clip_min, clip_max)
byteorder = '='
@@ -4289,7 +4310,7 @@ class TestClip(object):
'float', 1024, 0, 0, inplace=inplace)
self._clip_type(
- 'int', 1024, -120, 100.5, inplace=inplace)
+ 'int', 1024, -120, 100, inplace=inplace)
self._clip_type(
'int', 1024, 0, 0, inplace=inplace)
@@ -5735,7 +5756,7 @@ class MatmulCommon(object):
"""
# Should work with these types. Will want to add
# "O" at some point
- types = "?bhilqBHILQefdgFDG"
+ types = "?bhilqBHILQefdgFDGO"
def test_exceptions(self):
dims = [
@@ -5786,8 +5807,9 @@ class MatmulCommon(object):
assert_(res.dtype == dt)
# vector vector returns scalars
- res = self.matmul(v, v)
- assert_(type(res) is np.dtype(dt).type)
+ if dt != "O":
+ res = self.matmul(v, v)
+ assert_(type(res) is np.dtype(dt).type)
def test_scalar_output(self):
vec1 = np.array([2])
@@ -6038,7 +6060,52 @@ class TestMatmul(MatmulCommon):
r3 = np.matmul(args[0].copy(), args[1].copy())
assert_equal(r1, r3)
-
+
+ def test_matmul_object(self):
+ import fractions
+
+ f = np.vectorize(fractions.Fraction)
+ def random_ints():
+ return np.random.randint(1, 1000, size=(10, 3, 3))
+ M1 = f(random_ints(), random_ints())
+ M2 = f(random_ints(), random_ints())
+
+ M3 = self.matmul(M1, M2)
+
+ [N1, N2, N3] = [a.astype(float) for a in [M1, M2, M3]]
+
+ assert_allclose(N3, self.matmul(N1, N2))
+
+ def test_matmul_object_type_scalar(self):
+ from fractions import Fraction as F
+ v = np.array([F(2,3), F(5,7)])
+ res = self.matmul(v, v)
+ assert_(type(res) is F)
+
+ def test_matmul_empty(self):
+ a = np.empty((3, 0), dtype=object)
+ b = np.empty((0, 3), dtype=object)
+ c = np.zeros((3, 3))
+ assert_array_equal(np.matmul(a, b), c)
+
+ def test_matmul_exception_multiply(self):
+ # test that matmul fails if `__mul__` is missing
+ class add_not_multiply():
+ def __add__(self, other):
+ return self
+ a = np.full((3,3), add_not_multiply())
+ with assert_raises(TypeError):
+ b = np.matmul(a, a)
+
+ def test_matmul_exception_add(self):
+ # test that matmul fails if `__add__` is missing
+ class multiply_not_add():
+ def __mul__(self, other):
+ return self
+ a = np.full((3,3), multiply_not_add())
+ with assert_raises(TypeError):
+ b = np.matmul(a, a)
+
if sys.version_info[:2] >= (3, 5):
@@ -7776,13 +7843,6 @@ class TestWritebackIfCopy(object):
res = np.argmin(mat, 0, out=out)
assert_equal(res, range(5))
- def test_clip_with_out(self):
- mat = np.eye(5)
- out = np.eye(5, dtype='i2')
- res = np.clip(mat, a_min=-10, a_max=0, out=out)
- assert_(res is out)
- assert_equal(np.sum(out), 0)
-
def test_insert_noncontiguous(self):
a = np.arange(6).reshape(2,3).T # force non-c-contiguous
# uses arr_insert
diff --git a/numpy/core/tests/test_numeric.py b/numpy/core/tests/test_numeric.py
index 1c53f9372..9f3b71221 100644
--- a/numpy/core/tests/test_numeric.py
+++ b/numpy/core/tests/test_numeric.py
@@ -13,7 +13,7 @@ from numpy.random import rand, randint, randn
from numpy.testing import (
assert_, assert_equal, assert_raises, assert_raises_regex,
assert_array_equal, assert_almost_equal, assert_array_almost_equal,
- HAS_REFCOUNT
+ assert_warns, HAS_REFCOUNT
)
@@ -1336,11 +1336,17 @@ class TestClip(object):
self.nr = 5
self.nc = 3
- def fastclip(self, a, m, M, out=None):
+ def fastclip(self, a, m, M, out=None, casting=None):
if out is None:
- return a.clip(m, M)
+ if casting is None:
+ return a.clip(m, M)
+ else:
+ return a.clip(m, M, casting=casting)
else:
- return a.clip(m, M, out)
+ if casting is None:
+ return a.clip(m, M, out)
+ else:
+ return a.clip(m, M, out, casting=casting)
def clip(self, a, m, M, out=None):
# use slow-clip
@@ -1378,6 +1384,20 @@ class TestClip(object):
return (10 * rand(n, m)).astype(np.int32)
# Now the real test cases
+
+ @pytest.mark.parametrize("dtype", '?bhilqpBHILQPefdgFDGO')
+ def test_ones_pathological(self, dtype):
+ # for preservation of behavior described in
+ # gh-12519; amin > amax behavior may still change
+ # in the future
+ arr = np.ones(10, dtype=dtype)
+ expected = np.zeros(10, dtype=dtype)
+ actual = np.clip(arr, 1, 0)
+ if dtype == 'O':
+ assert actual.tolist() == expected.tolist()
+ else:
+ assert_equal(actual, expected)
+
def test_simple_double(self):
# Test native double input with scalar min/max.
a = self._generate_data(self.nr, self.nc)
@@ -1476,14 +1496,21 @@ class TestClip(object):
self.clip(a, m, M, act)
assert_array_strict_equal(ac, act)
- def test_simple_int32_inout(self):
+ @pytest.mark.parametrize("casting", [None, "unsafe"])
+ def test_simple_int32_inout(self, casting):
# Test native int32 input with double min/max and int32 out.
a = self._generate_int32_data(self.nr, self.nc)
m = np.float64(0)
M = np.float64(2)
ac = np.zeros(a.shape, dtype=np.int32)
act = ac.copy()
- self.fastclip(a, m, M, ac)
+ if casting is None:
+ with assert_warns(DeprecationWarning):
+ # NumPy 1.17.0, 2018-02-24 - casting is unsafe
+ self.fastclip(a, m, M, ac, casting=casting)
+ else:
+ # explicitly passing "unsafe" will silence warning
+ self.fastclip(a, m, M, ac, casting=casting)
self.clip(a, m, M, act)
assert_array_strict_equal(ac, act)
@@ -1505,7 +1532,9 @@ class TestClip(object):
M = np.float64(1)
ac = np.zeros(a.shape, dtype=np.int32)
act = ac.copy()
- self.fastclip(a, m, M, ac)
+ with assert_warns(DeprecationWarning):
+ # NumPy 1.17.0, 2018-02-24 - casting is unsafe
+ self.fastclip(a, m, M, ac)
self.clip(a, m, M, act)
assert_array_strict_equal(ac, act)
@@ -1516,7 +1545,9 @@ class TestClip(object):
M = 2.0
ac = np.zeros(a.shape, dtype=np.int32)
act = ac.copy()
- self.fastclip(a, m, M, ac)
+ with assert_warns(DeprecationWarning):
+ # NumPy 1.17.0, 2018-02-24 - casting is unsafe
+ self.fastclip(a, m, M, ac)
self.clip(a, m, M, act)
assert_array_strict_equal(ac, act)
@@ -1692,7 +1723,9 @@ class TestClip(object):
M = np.float64(2)
ac = np.zeros(a.shape, dtype=np.int32)
act = ac.copy()
- self.fastclip(a, m, M, ac)
+ with assert_warns(DeprecationWarning):
+ # NumPy 1.17.0, 2018-02-24 - casting is unsafe
+ self.fastclip(a, m, M, ac)
self.clip(a, m, M, act)
assert_array_strict_equal(ac, act)
@@ -1714,7 +1747,9 @@ class TestClip(object):
M = np.float64(1)
ac = np.zeros(a.shape, dtype=np.int32)
act = ac.copy()
- self.fastclip(a, m, M, ac)
+ with assert_warns(DeprecationWarning):
+ # NumPy 1.17.0, 2018-02-24 - casting is unsafe
+ self.fastclip(a, m, M, ac)
self.clip(a, m, M, act)
assert_array_strict_equal(ac, act)
@@ -1725,7 +1760,9 @@ class TestClip(object):
M = 2.0
ac = np.zeros(a.shape, dtype=np.int32)
act = ac.copy()
- self.fastclip(a, m, M, ac)
+ with assert_warns(DeprecationWarning):
+ # NumPy 1.17.0, 2018-02-24 - casting is unsafe
+ self.fastclip(a, m, M, ac)
self.clip(a, m, M, act)
assert_array_strict_equal(ac, act)
@@ -1778,11 +1815,94 @@ class TestClip(object):
def test_clip_nan(self):
d = np.arange(7.)
- assert_equal(d.clip(min=np.nan), d)
- assert_equal(d.clip(max=np.nan), d)
- assert_equal(d.clip(min=np.nan, max=np.nan), d)
- assert_equal(d.clip(min=-2, max=np.nan), d)
- assert_equal(d.clip(min=np.nan, max=10), d)
+ with assert_warns(DeprecationWarning):
+ assert_equal(d.clip(min=np.nan), d)
+ with assert_warns(DeprecationWarning):
+ assert_equal(d.clip(max=np.nan), d)
+ with assert_warns(DeprecationWarning):
+ assert_equal(d.clip(min=np.nan, max=np.nan), d)
+ with assert_warns(DeprecationWarning):
+ assert_equal(d.clip(min=-2, max=np.nan), d)
+ with assert_warns(DeprecationWarning):
+ assert_equal(d.clip(min=np.nan, max=10), d)
+
+ def test_object_clip(self):
+ a = np.arange(10, dtype=object)
+ actual = np.clip(a, 1, 5)
+ expected = np.array([1, 1, 2, 3, 4, 5, 5, 5, 5, 5])
+ assert actual.tolist() == expected.tolist()
+
+ def test_clip_all_none(self):
+ a = np.arange(10, dtype=object)
+ with assert_raises_regex(ValueError, 'max or min'):
+ np.clip(a, None, None)
+
+ def test_clip_invalid_casting(self):
+ a = np.arange(10, dtype=object)
+ with assert_raises_regex(ValueError,
+ 'casting must be one of'):
+ self.fastclip(a, 1, 8, casting="garbage")
+
+ @pytest.mark.parametrize("amin, amax", [
+ # two scalars
+ (1, 0),
+ # mix scalar and array
+ (1, np.zeros(10)),
+ # two arrays
+ (np.ones(10), np.zeros(10)),
+ ])
+ def test_clip_value_min_max_flip(self, amin, amax):
+ a = np.arange(10, dtype=np.int64)
+ # requirement from ufunc_docstrings.py
+ expected = np.minimum(np.maximum(a, amin), amax)
+ actual = np.clip(a, amin, amax)
+ assert_equal(actual, expected)
+
+ @pytest.mark.parametrize("arr, amin, amax, exp", [
+ # for a bug in npy_ObjectClip, based on a
+ # case produced by hypothesis
+ (np.zeros(10, dtype=np.int64),
+ 0,
+ -2**64+1,
+ np.full(10, -2**64+1, dtype=object)),
+ # for bugs in NPY_TIMEDELTA_MAX, based on a case
+ # produced by hypothesis
+ (np.zeros(10, dtype='m8') - 1,
+ 0,
+ 0,
+ np.zeros(10, dtype='m8')),
+ ])
+ def test_clip_problem_cases(self, arr, amin, amax, exp):
+ actual = np.clip(arr, amin, amax)
+ assert_equal(actual, exp)
+
+ @pytest.mark.xfail(reason="no scalar nan propagation yet")
+ @pytest.mark.parametrize("arr, amin, amax", [
+ # problematic scalar nan case from hypothesis
+ (np.zeros(10, dtype=np.int64),
+ np.array(np.nan),
+ np.zeros(10, dtype=np.int32)),
+ ])
+ def test_clip_scalar_nan_propagation(self, arr, amin, amax):
+ # enforcement of scalar nan propagation for comparisons
+ # called through clip()
+        expected = np.minimum(np.maximum(arr, amin), amax)
+ with assert_warns(DeprecationWarning):
+ actual = np.clip(arr, amin, amax)
+ assert_equal(actual, expected)
+
+ @pytest.mark.xfail(reason="propagation doesn't match spec")
+ @pytest.mark.parametrize("arr, amin, amax", [
+ (np.array([1] * 10, dtype='m8'),
+ np.timedelta64('NaT'),
+ np.zeros(10, dtype=np.int32)),
+ ])
+ def test_NaT_propagation(self, arr, amin, amax):
+ # NOTE: the expected function spec doesn't
+ # propagate NaT, but clip() now does
+        expected = np.minimum(np.maximum(arr, amin), amax)
+ actual = np.clip(arr, amin, amax)
+ assert_equal(actual, expected)
class TestAllclose(object):
@@ -2736,6 +2856,8 @@ class TestBroadcast(object):
arrs = [np.empty((6, 7)), np.empty((5, 6, 1)), np.empty((7,)),
np.empty((5, 1, 7))]
mits = [np.broadcast(*arrs),
+ np.broadcast(np.broadcast(*arrs[:0]), np.broadcast(*arrs[0:])),
+ np.broadcast(np.broadcast(*arrs[:1]), np.broadcast(*arrs[1:])),
np.broadcast(np.broadcast(*arrs[:2]), np.broadcast(*arrs[2:])),
np.broadcast(arrs[0], np.broadcast(*arrs[1:-1]), arrs[-1])]
for mit in mits:
@@ -2760,12 +2882,24 @@ class TestBroadcast(object):
arr = np.empty((5,))
for j in range(35):
arrs = [arr] * j
- if j < 1 or j > 32:
+ if j > 32:
assert_raises(ValueError, np.broadcast, *arrs)
else:
mit = np.broadcast(*arrs)
assert_equal(mit.numiter, j)
+ def test_broadcast_error_kwargs(self):
+ #gh-13455
+ arrs = [np.empty((5, 6, 7))]
+ mit = np.broadcast(*arrs)
+ mit2 = np.broadcast(*arrs, **{})
+ assert_equal(mit.shape, mit2.shape)
+ assert_equal(mit.ndim, mit2.ndim)
+ assert_equal(mit.nd, mit2.nd)
+ assert_equal(mit.numiter, mit2.numiter)
+ assert_(mit.iters[0].base is mit2.iters[0].base)
+
+ assert_raises(ValueError, np.broadcast, 1, **{'x': 1})
class TestKeepdims(object):
diff --git a/numpy/core/tests/test_overrides.py b/numpy/core/tests/test_overrides.py
index 6e9610fff..7f02399b2 100644
--- a/numpy/core/tests/test_overrides.py
+++ b/numpy/core/tests/test_overrides.py
@@ -2,6 +2,7 @@ from __future__ import division, absolute_import, print_function
import inspect
import sys
+from unittest import mock
import numpy as np
from numpy.testing import (
@@ -10,7 +11,6 @@ from numpy.core.overrides import (
_get_implementing_args, array_function_dispatch,
verify_matching_signatures)
from numpy.compat import pickle
-import pytest
def _return_not_implemented(self, *args, **kwargs):
@@ -190,12 +190,18 @@ class TestNDArrayArrayFunction(object):
result = np.concatenate((array, override_sub))
assert_equal(result, expected.view(OverrideSub))
+ def test_skip_array_function(self):
+ assert_(dispatched_one_arg.__skip_array_function__
+ is dispatched_one_arg.__wrapped__)
+
def test_no_wrapper(self):
+ # This shouldn't happen unless a user intentionally calls
+ # __array_function__ with invalid arguments, but check that we raise
+ # an appropriate error all the same.
array = np.array(1)
- func = dispatched_one_arg.__wrapped__
- with assert_raises_regex(AttributeError, '__wrapped__'):
- array.__array_function__(func=func,
- types=(np.ndarray,),
+ func = dispatched_one_arg.__skip_array_function__
+ with assert_raises_regex(AttributeError, '__skip_array_function__'):
+ array.__array_function__(func=func, types=(np.ndarray,),
args=(array,), kwargs={})
@@ -378,3 +384,49 @@ class TestNumPyFunctions(object):
return 'yes'
assert_equal(np.sum(MyArray()), 'yes')
+
+ def test_sum_implementation_on_list(self):
+ assert_equal(np.sum.__skip_array_function__([1, 2, 3]), 6)
+
+ def test_sum_on_mock_array(self):
+
+ # We need a proxy for mocks because __array_function__ is only looked
+ # up in the class dict
+ class ArrayProxy:
+ def __init__(self, value):
+ self.value = value
+ def __array_function__(self, *args, **kwargs):
+ return self.value.__array_function__(*args, **kwargs)
+ def __array__(self, *args, **kwargs):
+ return self.value.__array__(*args, **kwargs)
+
+ proxy = ArrayProxy(mock.Mock(spec=ArrayProxy))
+ proxy.value.__array_function__.return_value = 1
+ result = np.sum(proxy)
+ assert_equal(result, 1)
+ proxy.value.__array_function__.assert_called_once_with(
+ np.sum, (ArrayProxy,), (proxy,), {})
+ proxy.value.__array__.assert_not_called()
+
+ proxy = ArrayProxy(mock.Mock(spec=ArrayProxy))
+ proxy.value.__array__.return_value = np.array(2)
+ result = np.sum.__skip_array_function__(proxy)
+ assert_equal(result, 2)
+ # TODO: switch to proxy.value.__array__.assert_called() and
+ # proxy.value.__array_function__.assert_not_called() once we drop
+ # Python 3.5 support.
+ ((called_method_name, _, _),) = proxy.value.mock_calls
+ assert_equal(called_method_name, '__array__')
+
+ def test_sum_forwarding_implementation(self):
+
+ class MyArray(object):
+
+ def sum(self, axis, out):
+ return 'summed'
+
+ def __array_function__(self, func, types, args, kwargs):
+ return func.__skip_array_function__(*args, **kwargs)
+
+ # note: the internal implementation of np.sum() calls the .sum() method
+ assert_equal(np.sum(MyArray()), 'summed')
diff --git a/numpy/core/tests/test_ufunc.py b/numpy/core/tests/test_ufunc.py
index b6b68d922..caeea39f4 100644
--- a/numpy/core/tests/test_ufunc.py
+++ b/numpy/core/tests/test_ufunc.py
@@ -1936,3 +1936,56 @@ class TestUfunc(object):
assert not np.isinf(nat)
except TypeError:
pass # ok, just not implemented
+
+
+@pytest.mark.parametrize('ufunc', [getattr(np, x) for x in dir(np)
+ if isinstance(getattr(np, x), np.ufunc)])
+def test_ufunc_types(ufunc):
+ '''
+ Check all ufuncs that the correct type is returned. Avoid
+    object and boolean types since many operations are not defined
+    for them.
+
+ Choose the shape so even dot and matmul will succeed
+ '''
+ for typ in ufunc.types:
+ # types is a list of strings like ii->i
+ if 'O' in typ or '?' in typ:
+ continue
+ inp, out = typ.split('->')
+ args = [np.ones((3, 3), t) for t in inp]
+ with warnings.catch_warnings(record=True):
+ warnings.filterwarnings("always")
+ res = ufunc(*args)
+ if isinstance(res, tuple):
+ outs = tuple(out)
+ assert len(res) == len(outs)
+ for r, t in zip(res, outs):
+ assert r.dtype == np.dtype(t)
+ else:
+ assert res.dtype == np.dtype(out)
+
+@pytest.mark.parametrize('ufunc', [getattr(np, x) for x in dir(np)
+ if isinstance(getattr(np, x), np.ufunc)])
+def test_ufunc_noncontiguous(ufunc):
+ '''
+ Check that contiguous and non-contiguous calls to ufuncs
+    have the same results for values in range(1, 7)
+ '''
+ for typ in ufunc.types:
+ # types is a list of strings like ii->i
+ if any(set('O?mM') & set(typ)):
+ # bool, object, datetime are too irregular for this simple test
+ continue
+ inp, out = typ.split('->')
+ args_c = [np.empty(6, t) for t in inp]
+ args_n = [np.empty(18, t)[::3] for t in inp]
+ for a in args_c:
+ a.flat = range(1,7)
+ for a in args_n:
+ a.flat = range(1,7)
+ with warnings.catch_warnings(record=True):
+ warnings.filterwarnings("always")
+ res_c = ufunc(*args_c)
+ res_n = ufunc(*args_n)
+ assert_equal(res_c, res_n)