Diffstat (limited to 'numpy/core')
47 files changed, 1919 insertions, 1197 deletions
diff --git a/numpy/core/_add_newdocs.py b/numpy/core/_add_newdocs.py index ea472f1b3..bc034c3e9 100644 --- a/numpy/core/_add_newdocs.py +++ b/numpy/core/_add_newdocs.py @@ -1072,6 +1072,43 @@ add_newdoc('numpy.core.multiarray', 'fromstring', """) +add_newdoc('numpy.core.multiarray', 'compare_chararrays', + """ + compare_chararrays(a, b, cmp_op, rstrip) + + Performs element-wise comparison of two string arrays using the + comparison operator specified by `cmp_op`. + + Parameters + ---------- + a, b : array_like + Arrays to be compared. + cmp_op : {"<", "<=", "==", ">=", ">", "!="} + Type of comparison. + rstrip : Boolean + If True, the spaces at the end of Strings are removed before the comparison. + + Returns + ------- + out : ndarray + The output array of type Boolean with the same shape as a and b. + + Raises + ------ + ValueError + If `cmp_op` is not valid. + TypeError + If at least one of `a` or `b` is a non-string array + + Examples + -------- + >>> a = np.array(["a", "b", "cde"]) + >>> b = np.array(["a", "a", "dec"]) + >>> np.compare_chararrays(a, b, ">", True) + array([False, True, False]) + + """) + add_newdoc('numpy.core.multiarray', 'fromiter', """ fromiter(iterable, dtype, count=-1) @@ -1320,6 +1357,12 @@ add_newdoc('numpy.core.multiarray', 'set_numeric_ops', Set numerical operators for array objects. + .. deprecated:: 1.16 + + For the general case, use :c:func:`PyUFunc_ReplaceLoopBySignature`. + For ndarray subclasses, define the ``__array_ufunc__`` method and + override the relevant ufunc. + Parameters ---------- op1, op2, ... : callable @@ -1597,7 +1640,7 @@ add_newdoc('numpy.core.multiarray', 'c_einsum', """ c_einsum(subscripts, *operands, out=None, dtype=None, order='K', casting='safe') - + *This documentation shadows that of the native python implementation of the `einsum` function, except all references and examples related to the `optimize` argument (v 0.12.0) have been removed.* @@ -2113,7 +2156,7 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('ctypes', ----- Below are the public attributes of this object which were documented in "Guide to NumPy" (we have omitted undocumented public attributes, - as well as documented private attributes): + as well as documented private attributes): .. autoattribute:: numpy.core._internal._ctypes.data @@ -2455,7 +2498,7 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('size', Notes ----- - `a.size` returns a standard arbitrary precision Python integer. This + `a.size` returns a standard arbitrary precision Python integer. This may not be the case with other methods of obtaining the same value (like the suggested ``np.prod(a.shape)``, which returns an instance of ``np.int_``), and may be relevant if the value is used further in @@ -4090,7 +4133,7 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('tofile', machines with different endianness. Some of these problems can be overcome by outputting the data as text files, at the expense of speed and file size. - + When fid is a file object, array contents are directly written to the file, bypassing the file object's ``write`` method. As a result, tofile cannot be used with files objects supporting compression (e.g., GzipFile) @@ -4546,184 +4589,6 @@ add_newdoc('numpy.core.umath', 'seterrobj', # ############################################################################## -add_newdoc('numpy.core.multiarray', 'bincount', - """ - bincount(x, weights=None, minlength=0) - - Count number of occurrences of each value in array of non-negative ints. 
- - The number of bins (of size 1) is one larger than the largest value in - `x`. If `minlength` is specified, there will be at least this number - of bins in the output array (though it will be longer if necessary, - depending on the contents of `x`). - Each bin gives the number of occurrences of its index value in `x`. - If `weights` is specified the input array is weighted by it, i.e. if a - value ``n`` is found at position ``i``, ``out[n] += weight[i]`` instead - of ``out[n] += 1``. - - Parameters - ---------- - x : array_like, 1 dimension, nonnegative ints - Input array. - weights : array_like, optional - Weights, array of the same shape as `x`. - minlength : int, optional - A minimum number of bins for the output array. - - .. versionadded:: 1.6.0 - - Returns - ------- - out : ndarray of ints - The result of binning the input array. - The length of `out` is equal to ``np.amax(x)+1``. - - Raises - ------ - ValueError - If the input is not 1-dimensional, or contains elements with negative - values, or if `minlength` is negative. - TypeError - If the type of the input is float or complex. - - See Also - -------- - histogram, digitize, unique - - Examples - -------- - >>> np.bincount(np.arange(5)) - array([1, 1, 1, 1, 1]) - >>> np.bincount(np.array([0, 1, 1, 3, 2, 1, 7])) - array([1, 3, 1, 1, 0, 0, 0, 1]) - - >>> x = np.array([0, 1, 1, 3, 2, 1, 7, 23]) - >>> np.bincount(x).size == np.amax(x)+1 - True - - The input array needs to be of integer dtype, otherwise a - TypeError is raised: - - >>> np.bincount(np.arange(5, dtype=float)) - Traceback (most recent call last): - File "<stdin>", line 1, in <module> - TypeError: array cannot be safely cast to required type - - A possible use of ``bincount`` is to perform sums over - variable-size chunks of an array, using the ``weights`` keyword. - - >>> w = np.array([0.3, 0.5, 0.2, 0.7, 1., -0.6]) # weights - >>> x = np.array([0, 1, 1, 2, 2, 2]) - >>> np.bincount(x, weights=w) - array([ 0.3, 0.7, 1.1]) - - """) - -add_newdoc('numpy.core.multiarray', 'ravel_multi_index', - """ - ravel_multi_index(multi_index, dims, mode='raise', order='C') - - Converts a tuple of index arrays into an array of flat - indices, applying boundary modes to the multi-index. - - Parameters - ---------- - multi_index : tuple of array_like - A tuple of integer arrays, one array for each dimension. - dims : tuple of ints - The shape of array into which the indices from ``multi_index`` apply. - mode : {'raise', 'wrap', 'clip'}, optional - Specifies how out-of-bounds indices are handled. Can specify - either one mode or a tuple of modes, one mode per index. - - * 'raise' -- raise an error (default) - * 'wrap' -- wrap around - * 'clip' -- clip to the range - - In 'clip' mode, a negative index which would normally - wrap will clip to 0 instead. - order : {'C', 'F'}, optional - Determines whether the multi-index should be viewed as - indexing in row-major (C-style) or column-major - (Fortran-style) order. - - Returns - ------- - raveled_indices : ndarray - An array of indices into the flattened version of an array - of dimensions ``dims``. - - See Also - -------- - unravel_index - - Notes - ----- - .. 
versionadded:: 1.6.0 - - Examples - -------- - >>> arr = np.array([[3,6,6],[4,5,1]]) - >>> np.ravel_multi_index(arr, (7,6)) - array([22, 41, 37]) - >>> np.ravel_multi_index(arr, (7,6), order='F') - array([31, 41, 13]) - >>> np.ravel_multi_index(arr, (4,6), mode='clip') - array([22, 23, 19]) - >>> np.ravel_multi_index(arr, (4,4), mode=('clip','wrap')) - array([12, 13, 13]) - - >>> np.ravel_multi_index((3,1,4,1), (6,7,8,9)) - 1621 - """) - -add_newdoc('numpy.core.multiarray', 'unravel_index', - """ - unravel_index(indices, shape, order='C') - - Converts a flat index or array of flat indices into a tuple - of coordinate arrays. - - Parameters - ---------- - indices : array_like - An integer array whose elements are indices into the flattened - version of an array of dimensions ``shape``. Before version 1.6.0, - this function accepted just one index value. - shape : tuple of ints - The shape of the array to use for unraveling ``indices``. - - .. versionchanged:: 1.16.0 - Renamed from ``dims`` to ``shape``. - - order : {'C', 'F'}, optional - Determines whether the indices should be viewed as indexing in - row-major (C-style) or column-major (Fortran-style) order. - - .. versionadded:: 1.6.0 - - Returns - ------- - unraveled_coords : tuple of ndarray - Each array in the tuple has the same shape as the ``indices`` - array. - - See Also - -------- - ravel_multi_index - - Examples - -------- - >>> np.unravel_index([22, 41, 37], (7,6)) - (array([3, 6, 6]), array([4, 5, 1])) - >>> np.unravel_index([31, 41, 13], (7,6), order='F') - (array([3, 6, 6]), array([4, 5, 1])) - - >>> np.unravel_index(1621, (6,7,8,9)) - (3, 1, 4, 1) - - """) - add_newdoc('numpy.core.multiarray', 'add_docstring', """ add_docstring(obj, docstring) @@ -5200,7 +5065,7 @@ add_newdoc('numpy.core', 'ufunc', ('reduce', to None - otherwise it defaults to ufunc.identity. If ``None`` is given, the first element of the reduction is used, and an error is thrown if the reduction is empty. - + .. versionadded:: 1.15.0 Returns @@ -5233,18 +5098,18 @@ add_newdoc('numpy.core', 'ufunc', ('reduce', >>> np.add.reduce(X, 2) array([[ 1, 5], [ 9, 13]]) - + You can use the ``initial`` keyword argument to initialize the reduction with a different value. - + >>> np.add.reduce([10], initial=5) 15 >>> np.add.reduce(np.ones((2, 2, 2)), axis=(0, 2), initializer=10) array([14., 14.]) - + Allows reductions of empty arrays where they would normally fail, i.e. for ufuncs without an identity. - + >>> np.minimum.reduce([], initial=np.inf) inf >>> np.minimum.reduce([]) @@ -5714,13 +5579,13 @@ add_newdoc('numpy.core.multiarray', 'dtype', ('char', add_newdoc('numpy.core.multiarray', 'dtype', ('descr', """ - PEP3118 interface description of the data-type. + `__array_interface__` description of the data-type. The format is that required by the 'descr' key in the - PEP3118 `__array_interface__` attribute. + `__array_interface__` attribute. - Warning: This attribute exists specifically for PEP3118 compliance, and - is not a datatype description compatible with `np.dtype`. + Warning: This attribute exists specifically for `__array_interface__`, + and is not a datatype description compatible with `np.dtype`. 
""")) add_newdoc('numpy.core.multiarray', 'dtype', ('fields', diff --git a/numpy/core/_dtype_ctypes.py b/numpy/core/_dtype_ctypes.py index f10b4e99f..0852b1ef2 100644 --- a/numpy/core/_dtype_ctypes.py +++ b/numpy/core/_dtype_ctypes.py @@ -33,17 +33,65 @@ def _from_ctypes_array(t): def _from_ctypes_structure(t): - # TODO: gh-10533, gh-10532 - fields = [] for item in t._fields_: if len(item) > 2: raise TypeError( "ctypes bitfields have no dtype equivalent") - fname, ftyp = item - fields.append((fname, dtype_from_ctypes_type(ftyp))) - # by default, ctypes structs are aligned - return np.dtype(fields, align=True) + if hasattr(t, "_pack_"): + formats = [] + offsets = [] + names = [] + current_offset = 0 + for fname, ftyp in t._fields_: + names.append(fname) + formats.append(dtype_from_ctypes_type(ftyp)) + # Each type has a default offset, this is platform dependent for some types. + effective_pack = min(t._pack_, ctypes.alignment(ftyp)) + current_offset = ((current_offset + effective_pack - 1) // effective_pack) * effective_pack + offsets.append(current_offset) + current_offset += ctypes.sizeof(ftyp) + + return np.dtype(dict( + formats=formats, + offsets=offsets, + names=names, + itemsize=ctypes.sizeof(t))) + else: + fields = [] + for fname, ftyp in t._fields_: + fields.append((fname, dtype_from_ctypes_type(ftyp))) + + # by default, ctypes structs are aligned + return np.dtype(fields, align=True) + + +def _from_ctypes_scalar(t): + """ + Return the dtype type with endianness included if it's the case + """ + if getattr(t, '__ctype_be__', None) is t: + return np.dtype('>' + t._type_) + elif getattr(t, '__ctype_le__', None) is t: + return np.dtype('<' + t._type_) + else: + return np.dtype(t._type_) + + +def _from_ctypes_union(t): + formats = [] + offsets = [] + names = [] + for fname, ftyp in t._fields_: + names.append(fname) + formats.append(dtype_from_ctypes_type(ftyp)) + offsets.append(0) # Union fields are offset to 0 + + return np.dtype(dict( + formats=formats, + offsets=offsets, + names=names, + itemsize=ctypes.sizeof(t))) def dtype_from_ctypes_type(t): @@ -57,12 +105,9 @@ def dtype_from_ctypes_type(t): elif issubclass(t, _ctypes.Structure): return _from_ctypes_structure(t) elif issubclass(t, _ctypes.Union): - # TODO - raise NotImplementedError( - "conversion from ctypes.Union types like {} to dtype" - .format(t.__name__)) - elif isinstance(t._type_, str): - return np.dtype(t._type_) + return _from_ctypes_union(t) + elif isinstance(getattr(t, '_type_', None), str): + return _from_ctypes_scalar(t) else: raise NotImplementedError( "Unknown ctypes type {}".format(t.__name__)) diff --git a/numpy/core/_internal.py b/numpy/core/_internal.py index 30069f0ca..59da60253 100644 --- a/numpy/core/_internal.py +++ b/numpy/core/_internal.py @@ -10,6 +10,7 @@ import re import sys from numpy.compat import unicode +from numpy.core.overrides import set_module from .multiarray import dtype, array, ndarray try: import ctypes @@ -482,6 +483,12 @@ _pep3118_standard_map = { } _pep3118_standard_typechars = ''.join(_pep3118_standard_map.keys()) +_pep3118_unsupported_map = { + 'u': 'UCS-2 strings', + '&': 'pointers', + 't': 'bitfields', + 'X': 'function pointers', +} class _Stream(object): def __init__(self, s): @@ -593,6 +600,11 @@ def __dtype_from_pep3118(stream, is_subdtype): stream.byteorder, stream.byteorder) value = dtype(numpy_byteorder + dtypechar) align = value.alignment + elif stream.next in _pep3118_unsupported_map: + desc = _pep3118_unsupported_map[stream.next] + raise NotImplementedError( + 
"Unrepresentable PEP 3118 data type {!r} ({})" + .format(stream.next, desc)) else: raise ValueError("Unknown PEP 3118 data type specifier %r" % stream.s) @@ -718,9 +730,11 @@ def _lcm(a, b): return a // _gcd(a, b) * b # Exception used in shares_memory() +@set_module('numpy') class TooHardError(RuntimeError): pass +@set_module('numpy') class AxisError(ValueError, IndexError): """ Axis supplied was invalid. """ def __init__(self, axis, ndim=None, msg_prefix=None): diff --git a/numpy/core/arrayprint.py b/numpy/core/arrayprint.py index ccc1468c4..075d75340 100644 --- a/numpy/core/arrayprint.py +++ b/numpy/core/arrayprint.py @@ -48,7 +48,7 @@ from .fromnumeric import ravel, any from .numeric import concatenate, asarray, errstate from .numerictypes import (longlong, intc, int_, float_, complex_, bool_, flexible) -from .overrides import array_function_dispatch +from .overrides import array_function_dispatch, set_module import warnings import contextlib @@ -89,6 +89,8 @@ def _make_options_dict(precision=None, threshold=None, edgeitems=None, return options + +@set_module('numpy') def set_printoptions(precision=None, threshold=None, edgeitems=None, linewidth=None, suppress=None, nanstr=None, infstr=None, formatter=None, sign=None, floatmode=None, **kwarg): @@ -250,6 +252,7 @@ def set_printoptions(precision=None, threshold=None, edgeitems=None, set_legacy_print_mode(0) +@set_module('numpy') def get_printoptions(): """ Return the current print options. @@ -279,6 +282,7 @@ def get_printoptions(): return _format_options.copy() +@set_module('numpy') @contextlib.contextmanager def printoptions(*args, **kwargs): """Context manager for setting print options. @@ -976,6 +980,8 @@ class LongFloatFormat(FloatingFormat): DeprecationWarning, stacklevel=2) super(LongFloatFormat, self).__init__(*args, **kwargs) + +@set_module('numpy') def format_float_scientific(x, precision=None, unique=True, trim='k', sign=False, pad_left=None, exp_digits=None): """ @@ -1043,6 +1049,8 @@ def format_float_scientific(x, precision=None, unique=True, trim='k', trim=trim, sign=sign, pad_left=pad_left, exp_digits=exp_digits) + +@set_module('numpy') def format_float_positional(x, precision=None, unique=True, fractional=True, trim='k', sign=False, pad_left=None, pad_right=None): @@ -1547,10 +1555,12 @@ def array_str(a, max_line_width=None, precision=None, suppress_small=None): a, max_line_width, precision, suppress_small) +# needed if __array_function__ is disabled +_array2string_impl = getattr(array2string, '__wrapped__', array2string) _default_array_str = functools.partial(_array_str_implementation, - array2string=array2string.__wrapped__) + array2string=_array2string_impl) _default_array_repr = functools.partial(_array_repr_implementation, - array2string=array2string.__wrapped__) + array2string=_array2string_impl) def set_string_function(f, repr=True): diff --git a/numpy/core/code_generators/cversions.txt b/numpy/core/code_generators/cversions.txt index c8b998bfc..00f10df57 100644 --- a/numpy/core/code_generators/cversions.txt +++ b/numpy/core/code_generators/cversions.txt @@ -39,9 +39,12 @@ 0x0000000b = edb1ba83730c650fd9bc5772a919cda7 # Version 12 (NumPy 1.14) Added PyArray_ResolveWritebackIfCopy, -# Version 12 (NumPy 1.15) No change. # PyArray_SetWritebackIfCopyBase and deprecated PyArray_SetUpdateIfCopyBase. +# Version 12 (NumPy 1.15) No change. 
0x0000000c = a1bc756c5782853ec2e3616cf66869d8 -# Version 13 (Numpy 1.16) Added fields core_dim_flags and core_dim_sizes to PyUFuncObject -0x0000000d = a1bc756c5782853ec2e3616cf66869d8 +# Version 13 (NumPy 1.16) +# Deprecate PyArray_SetNumericOps and PyArray_GetNumericOps, +# Add fields core_dim_flags and core_dim_sizes to PyUFuncObject. +# Add PyUFunc_FromFuncAndDataAndSignatureAndIdentity to ufunc_funcs_api. +0x0000000d = 5b0e8bbded00b166125974fc71e80a33 diff --git a/numpy/core/code_generators/generate_numpy_api.py b/numpy/core/code_generators/generate_numpy_api.py index 7f2541667..a883ee469 100644 --- a/numpy/core/code_generators/generate_numpy_api.py +++ b/numpy/core/code_generators/generate_numpy_api.py @@ -50,7 +50,6 @@ _import_array(void) PyObject *c_api = NULL; if (numpy == NULL) { - PyErr_SetString(PyExc_ImportError, "numpy.core._multiarray_umath failed to import"); return -1; } c_api = PyObject_GetAttrString(numpy, "_ARRAY_API"); diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py index 199ad831b..9d4e72c0e 100644 --- a/numpy/core/code_generators/generate_umath.py +++ b/numpy/core/code_generators/generate_umath.py @@ -10,11 +10,14 @@ sys.path.insert(0, os.path.dirname(__file__)) import ufunc_docstrings as docstrings sys.path.pop(0) -Zero = "PyUFunc_Zero" -One = "PyUFunc_One" -None_ = "PyUFunc_None" -AllOnes = "PyUFunc_MinusOne" -ReorderableNone = "PyUFunc_ReorderableNone" +Zero = "PyInt_FromLong(0)" +One = "PyInt_FromLong(1)" +True_ = "(Py_INCREF(Py_True), Py_True)" +False_ = "(Py_INCREF(Py_False), Py_False)" +None_ = object() +AllOnes = "PyInt_FromLong(-1)" +MinusInfinity = 'PyFloat_FromDouble(-NPY_INFINITY)' +ReorderableNone = "(Py_INCREF(Py_None), Py_None)" # Sentinel value to specify using the full type description in the # function name @@ -458,7 +461,7 @@ defdict = { [TypeDescription('O', FullTypeDescr, 'OO', 'O')], ), 'logical_and': - Ufunc(2, 1, One, + Ufunc(2, 1, True_, docstrings.get('numpy.core.umath.logical_and'), 'PyUFunc_SimpleBinaryComparisonTypeResolver', TD(nodatetime_or_obj, out='?', simd=[('avx2', ints)]), @@ -472,14 +475,14 @@ defdict = { TD(O, f='npy_ObjectLogicalNot'), ), 'logical_or': - Ufunc(2, 1, Zero, + Ufunc(2, 1, False_, docstrings.get('numpy.core.umath.logical_or'), 'PyUFunc_SimpleBinaryComparisonTypeResolver', TD(nodatetime_or_obj, out='?', simd=[('avx2', ints)]), TD(O, f='npy_ObjectLogicalOr'), ), 'logical_xor': - Ufunc(2, 1, Zero, + Ufunc(2, 1, False_, docstrings.get('numpy.core.umath.logical_xor'), 'PyUFunc_SimpleBinaryComparisonTypeResolver', TD(nodatetime_or_obj, out='?'), @@ -514,7 +517,7 @@ defdict = { TD(O, f='npy_ObjectMin') ), 'logaddexp': - Ufunc(2, 1, None, + Ufunc(2, 1, MinusInfinity, docstrings.get('numpy.core.umath.logaddexp'), None, TD(flts, f="logaddexp", astype={'e':'f'}) @@ -1048,18 +1051,38 @@ def make_ufuncs(funcdict): # do not play well with \n docstring = '\\n\"\"'.join(docstring.split(r"\n")) fmt = textwrap.dedent("""\ - f = PyUFunc_FromFuncAndData( + identity = {identity_expr}; + if ({has_identity} && identity == NULL) {{ + return -1; + }} + f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity( {name}_functions, {name}_data, {name}_signatures, {nloops}, {nin}, {nout}, {identity}, "{name}", - "{doc}", 0 + "{doc}", 0, NULL, identity ); + if ({has_identity}) {{ + Py_DECREF(identity); + }} if (f == NULL) {{ return -1; - }}""") - mlist.append(fmt.format( + }} + """) + args = dict( name=name, nloops=len(uf.type_descriptions), - nin=uf.nin, nout=uf.nout, identity=uf.identity, 
doc=docstring - )) + nin=uf.nin, nout=uf.nout, + has_identity='0' if uf.identity is None_ else '1', + identity='PyUFunc_IdentityValue', + identity_expr=uf.identity, + doc=docstring + ) + + # Only PyUFunc_None means don't reorder - we pass this using the old + # argument + if uf.identity is None_: + args['identity'] = 'PyUFunc_None' + args['identity_expr'] = 'NULL' + + mlist.append(fmt.format(**args)) if uf.typereso is not None: mlist.append( r"((PyUFuncObject *)f)->type_resolver = &%s;" % uf.typereso) @@ -1087,7 +1110,7 @@ def make_code(funcdict, filename): static int InitOperators(PyObject *dictionary) { - PyObject *f; + PyObject *f, *identity; %s %s diff --git a/numpy/core/code_generators/numpy_api.py b/numpy/core/code_generators/numpy_api.py index d8a9ee6b4..a71c236fd 100644 --- a/numpy/core/code_generators/numpy_api.py +++ b/numpy/core/code_generators/numpy_api.py @@ -402,6 +402,8 @@ ufunc_funcs_api = { # End 1.7 API 'PyUFunc_RegisterLoopForDescr': (41,), # End 1.8 API + 'PyUFunc_FromFuncAndDataAndSignatureAndIdentity': (42,), + # End 1.16 API } # List of all the dicts which define the C API diff --git a/numpy/core/defchararray.py b/numpy/core/defchararray.py index e86086012..12ba3f02e 100644 --- a/numpy/core/defchararray.py +++ b/numpy/core/defchararray.py @@ -23,6 +23,7 @@ from .numerictypes import string_, unicode_, integer, object_, bool_, character from .numeric import ndarray, compare_chararrays from .numeric import array as narray from numpy.core.multiarray import _vec_string +from numpy.core.overrides import set_module from numpy.core import overrides from numpy.compat import asbytes, long import numpy @@ -1820,6 +1821,7 @@ def isdecimal(a): return _vec_string(a, bool_, 'isdecimal') +@set_module('numpy') class chararray(ndarray): """ chararray(shape, itemsize=1, unicode=False, buffer=None, offset=0, diff --git a/numpy/core/function_base.py b/numpy/core/function_base.py index 799b1418d..b3dd313cf 100644 --- a/numpy/core/function_base.py +++ b/numpy/core/function_base.py @@ -7,6 +7,7 @@ from . import numeric as _nx from .numeric import (result_type, NaN, shares_memory, MAY_SHARE_BOUNDS, TooHardError,asanyarray) from numpy.core.multiarray import add_docstring +from numpy.core.overrides import set_module __all__ = ['logspace', 'linspace', 'geomspace'] @@ -23,6 +24,7 @@ def _index_deprecate(i, stacklevel=2): return i +@set_module('numpy') def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None): """ Return evenly spaced numbers over a specified interval. @@ -154,6 +156,7 @@ def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None): return y.astype(dtype, copy=False) +@set_module('numpy') def logspace(start, stop, num=50, endpoint=True, base=10.0, dtype=None): """ Return numbers spaced evenly on a log scale. @@ -238,6 +241,7 @@ def logspace(start, stop, num=50, endpoint=True, base=10.0, dtype=None): return _nx.power(base, y).astype(dtype) +@set_module('numpy') def geomspace(start, stop, num=50, endpoint=True, dtype=None): """ Return numbers spaced evenly on a log scale (a geometric progression). diff --git a/numpy/core/getlimits.py b/numpy/core/getlimits.py index 0e3c58793..544b8b35f 100644 --- a/numpy/core/getlimits.py +++ b/numpy/core/getlimits.py @@ -8,6 +8,7 @@ __all__ = ['finfo', 'iinfo'] import warnings from .machar import MachAr +from .overrides import set_module from . import numeric from . 
import numerictypes as ntypes from .numeric import array, inf @@ -30,6 +31,32 @@ def _fr1(a): a.shape = () return a +class MachArLike(object): + """ Object to simulate MachAr instance """ + + def __init__(self, + ftype, + **kwargs): + params = _MACHAR_PARAMS[ftype] + float_conv = lambda v: array([v], ftype) + float_to_float = lambda v : _fr1(float_conv(v)) + float_to_str = lambda v: (params['fmt'] % array(_fr0(v)[0], ftype)) + + self.title = params['title'] + # Parameter types same as for discovered MachAr object. + self.epsilon = self.eps = float_to_float(kwargs.pop('eps')) + self.epsneg = float_to_float(kwargs.pop('epsneg')) + self.xmax = self.huge = float_to_float(kwargs.pop('huge')) + self.xmin = self.tiny = float_to_float(kwargs.pop('tiny')) + self.ibeta = params['itype'](kwargs.pop('ibeta')) + self.__dict__.update(kwargs) + self.precision = int(-log10(self.eps)) + self.resolution = float_to_float(float_conv(10) ** (-self.precision)) + self._str_eps = float_to_str(self.eps) + self._str_epsneg = float_to_str(self.epsneg) + self._str_xmin = float_to_str(self.xmin) + self._str_xmax = float_to_str(self.xmax) + self._str_resolution = float_to_str(self.resolution) _convert_to_float = { ntypes.csingle: ntypes.single, @@ -37,7 +64,6 @@ _convert_to_float = { ntypes.clongfloat: ntypes.longfloat } - # Parameters for creating MachAr / MachAr-like objects _title_fmt = 'numpy {} precision floating point number' _MACHAR_PARAMS = { @@ -58,194 +84,156 @@ _MACHAR_PARAMS = { fmt = '%12.5e', title = _title_fmt.format('half'))} - -class MachArLike(object): - """ Object to simulate MachAr instance """ - - def __init__(self, - ftype, - **kwargs): - params = _MACHAR_PARAMS[ftype] - float_conv = lambda v: array([v], ftype) - float_to_float = lambda v : _fr1(float_conv(v)) - self._float_to_str = lambda v: (params['fmt'] % - array(_fr0(v)[0], ftype)) - self.title = params['title'] - # Parameter types same as for discovered MachAr object. - self.epsilon = self.eps = float_to_float(kwargs.pop('eps')) - self.epsneg = float_to_float(kwargs.pop('epsneg')) - self.xmax = self.huge = float_to_float(kwargs.pop('huge')) - self.xmin = self.tiny = float_to_float(kwargs.pop('tiny')) - self.ibeta = params['itype'](kwargs.pop('ibeta')) - self.__dict__.update(kwargs) - self.precision = int(-log10(self.eps)) - self.resolution = float_to_float(float_conv(10) ** (-self.precision)) - - # Properties below to delay need for float_to_str, and thus avoid circular - # imports during early numpy module loading. - # See: https://github.com/numpy/numpy/pull/8983#discussion_r115838683 - - @property - def _str_eps(self): - return self._float_to_str(self.eps) - - @property - def _str_epsneg(self): - return self._float_to_str(self.epsneg) - - @property - def _str_xmin(self): - return self._float_to_str(self.xmin) - - @property - def _str_xmax(self): - return self._float_to_str(self.xmax) - - @property - def _str_resolution(self): - return self._float_to_str(self.resolution) - - -# Known parameters for float16 -# See docstring of MachAr class for description of parameters. 
-_f16 = ntypes.float16 -_float16_ma = MachArLike(_f16, - machep=-10, - negep=-11, - minexp=-14, - maxexp=16, - it=10, - iexp=5, - ibeta=2, - irnd=5, - ngrd=0, - eps=exp2(_f16(-10)), - epsneg=exp2(_f16(-11)), - huge=_f16(65504), - tiny=_f16(2 ** -14)) - -# Known parameters for float32 -_f32 = ntypes.float32 -_float32_ma = MachArLike(_f32, - machep=-23, - negep=-24, - minexp=-126, - maxexp=128, - it=23, - iexp=8, - ibeta=2, - irnd=5, - ngrd=0, - eps=exp2(_f32(-23)), - epsneg=exp2(_f32(-24)), - huge=_f32((1 - 2 ** -24) * 2**128), - tiny=exp2(_f32(-126))) - -# Known parameters for float64 -_f64 = ntypes.float64 -_epsneg_f64 = 2.0 ** -53.0 -_tiny_f64 = 2.0 ** -1022.0 -_float64_ma = MachArLike(_f64, - machep=-52, - negep=-53, - minexp=-1022, - maxexp=1024, - it=52, - iexp=11, - ibeta=2, - irnd=5, - ngrd=0, - eps=2.0 ** -52.0, - epsneg=_epsneg_f64, - huge=(1.0 - _epsneg_f64) / _tiny_f64 * _f64(4), - tiny=_tiny_f64) - -# Known parameters for IEEE 754 128-bit binary float -_ld = ntypes.longdouble -_epsneg_f128 = exp2(_ld(-113)) -_tiny_f128 = exp2(_ld(-16382)) -# Ignore runtime error when this is not f128 -with numeric.errstate(all='ignore'): - _huge_f128 = (_ld(1) - _epsneg_f128) / _tiny_f128 * _ld(4) -_float128_ma = MachArLike(_ld, - machep=-112, - negep=-113, - minexp=-16382, - maxexp=16384, - it=112, - iexp=15, - ibeta=2, - irnd=5, - ngrd=0, - eps=exp2(_ld(-112)), - epsneg=_epsneg_f128, - huge=_huge_f128, - tiny=_tiny_f128) - -# Known parameters for float80 (Intel 80-bit extended precision) -_epsneg_f80 = exp2(_ld(-64)) -_tiny_f80 = exp2(_ld(-16382)) -# Ignore runtime error when this is not f80 -with numeric.errstate(all='ignore'): - _huge_f80 = (_ld(1) - _epsneg_f80) / _tiny_f80 * _ld(4) -_float80_ma = MachArLike(_ld, - machep=-63, - negep=-64, - minexp=-16382, - maxexp=16384, - it=63, - iexp=15, - ibeta=2, - irnd=5, - ngrd=0, - eps=exp2(_ld(-63)), - epsneg=_epsneg_f80, - huge=_huge_f80, - tiny=_tiny_f80) - -# Guessed / known parameters for double double; see: -# https://en.wikipedia.org/wiki/Quadruple-precision_floating-point_format#Double-double_arithmetic -# These numbers have the same exponent range as float64, but extended number of -# digits in the significand. -_huge_dd = (umath.nextafter(_ld(inf), _ld(0)) - if hasattr(umath, 'nextafter') # Missing on some platforms? - else _float64_ma.huge) -_float_dd_ma = MachArLike(_ld, - machep=-105, - negep=-106, - minexp=-1022, - maxexp=1024, - it=105, - iexp=11, - ibeta=2, - irnd=5, - ngrd=0, - eps=exp2(_ld(-105)), - epsneg= exp2(_ld(-106)), - huge=_huge_dd, - tiny=exp2(_ld(-1022))) - - # Key to identify the floating point type. Key is result of # ftype('-0.1').newbyteorder('<').tobytes() # See: # https://perl5.git.perl.org/perl.git/blob/3118d7d684b56cbeb702af874f4326683c45f045:/Configure -_KNOWN_TYPES = { - b'\x9a\x99\x99\x99\x99\x99\xb9\xbf' : _float64_ma, - b'\xcd\xcc\xcc\xbd' : _float32_ma, - b'f\xae' : _float16_ma, +_KNOWN_TYPES = {} +def _register_type(machar, bytepat): + _KNOWN_TYPES[bytepat] = machar +_float_ma = {} + +def _register_known_types(): + # Known parameters for float16 + # See docstring of MachAr class for description of parameters. 
+ f16 = ntypes.float16 + float16_ma = MachArLike(f16, + machep=-10, + negep=-11, + minexp=-14, + maxexp=16, + it=10, + iexp=5, + ibeta=2, + irnd=5, + ngrd=0, + eps=exp2(f16(-10)), + epsneg=exp2(f16(-11)), + huge=f16(65504), + tiny=f16(2 ** -14)) + _register_type(float16_ma, b'f\xae') + _float_ma[16] = float16_ma + + # Known parameters for float32 + f32 = ntypes.float32 + float32_ma = MachArLike(f32, + machep=-23, + negep=-24, + minexp=-126, + maxexp=128, + it=23, + iexp=8, + ibeta=2, + irnd=5, + ngrd=0, + eps=exp2(f32(-23)), + epsneg=exp2(f32(-24)), + huge=f32((1 - 2 ** -24) * 2**128), + tiny=exp2(f32(-126))) + _register_type(float32_ma, b'\xcd\xcc\xcc\xbd') + _float_ma[32] = float32_ma + + # Known parameters for float64 + f64 = ntypes.float64 + epsneg_f64 = 2.0 ** -53.0 + tiny_f64 = 2.0 ** -1022.0 + float64_ma = MachArLike(f64, + machep=-52, + negep=-53, + minexp=-1022, + maxexp=1024, + it=52, + iexp=11, + ibeta=2, + irnd=5, + ngrd=0, + eps=2.0 ** -52.0, + epsneg=epsneg_f64, + huge=(1.0 - epsneg_f64) / tiny_f64 * f64(4), + tiny=tiny_f64) + _register_type(float64_ma, b'\x9a\x99\x99\x99\x99\x99\xb9\xbf') + _float_ma[64] = float64_ma + + # Known parameters for IEEE 754 128-bit binary float + ld = ntypes.longdouble + epsneg_f128 = exp2(ld(-113)) + tiny_f128 = exp2(ld(-16382)) + # Ignore runtime error when this is not f128 + with numeric.errstate(all='ignore'): + huge_f128 = (ld(1) - epsneg_f128) / tiny_f128 * ld(4) + float128_ma = MachArLike(ld, + machep=-112, + negep=-113, + minexp=-16382, + maxexp=16384, + it=112, + iexp=15, + ibeta=2, + irnd=5, + ngrd=0, + eps=exp2(ld(-112)), + epsneg=epsneg_f128, + huge=huge_f128, + tiny=tiny_f128) + # IEEE 754 128-bit binary float + _register_type(float128_ma, + b'\x9a\x99\x99\x99\x99\x99\x99\x99\x99\x99\x99\x99\x99\x99\xfb\xbf') + _register_type(float128_ma, + b'\x9a\x99\x99\x99\x99\x99\x99\x99\x99\x99\x99\x99\x99\x99\xfb\xbf') + _float_ma[128] = float128_ma + + # Known parameters for float80 (Intel 80-bit extended precision) + epsneg_f80 = exp2(ld(-64)) + tiny_f80 = exp2(ld(-16382)) + # Ignore runtime error when this is not f80 + with numeric.errstate(all='ignore'): + huge_f80 = (ld(1) - epsneg_f80) / tiny_f80 * ld(4) + float80_ma = MachArLike(ld, + machep=-63, + negep=-64, + minexp=-16382, + maxexp=16384, + it=63, + iexp=15, + ibeta=2, + irnd=5, + ngrd=0, + eps=exp2(ld(-63)), + epsneg=epsneg_f80, + huge=huge_f80, + tiny=tiny_f80) # float80, first 10 bytes containing actual storage - b'\xcd\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xfb\xbf' : _float80_ma, + _register_type(float80_ma, b'\xcd\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xfb\xbf') + _float_ma[80] = float80_ma + + # Guessed / known parameters for double double; see: + # https://en.wikipedia.org/wiki/Quadruple-precision_floating-point_format#Double-double_arithmetic + # These numbers have the same exponent range as float64, but extended number of + # digits in the significand. + huge_dd = (umath.nextafter(ld(inf), ld(0)) + if hasattr(umath, 'nextafter') # Missing on some platforms? + else float64_ma.huge) + float_dd_ma = MachArLike(ld, + machep=-105, + negep=-106, + minexp=-1022, + maxexp=1024, + it=105, + iexp=11, + ibeta=2, + irnd=5, + ngrd=0, + eps=exp2(ld(-105)), + epsneg= exp2(ld(-106)), + huge=huge_dd, + tiny=exp2(ld(-1022))) # double double; low, high order (e.g. PPC 64) - b'\x9a\x99\x99\x99\x99\x99Y<\x9a\x99\x99\x99\x99\x99\xb9\xbf' : - _float_dd_ma, + _register_type(float_dd_ma, + b'\x9a\x99\x99\x99\x99\x99Y<\x9a\x99\x99\x99\x99\x99\xb9\xbf') # double double; high, low order (e.g. 
PPC 64 le) - b'\x9a\x99\x99\x99\x99\x99\xb9\xbf\x9a\x99\x99\x99\x99\x99Y<' : - _float_dd_ma, - # IEEE 754 128-bit binary float - b'\x9a\x99\x99\x99\x99\x99\x99\x99\x99\x99\x99\x99\x99\x99\xfb\xbf' : - _float128_ma, -} + _register_type(float_dd_ma, + b'\x9a\x99\x99\x99\x99\x99\xb9\xbf\x9a\x99\x99\x99\x99\x99Y<') + _float_ma['dd'] = float_dd_ma def _get_machar(ftype): @@ -302,6 +290,7 @@ def _discovered_machar(ftype): params['title']) +@set_module('numpy') class finfo(object): """ finfo(dtype) @@ -452,6 +441,7 @@ class finfo(object): " max=%(_str_max)s, dtype=%(dtype)s)") % d) +@set_module('numpy') class iinfo(object): """ iinfo(type) diff --git a/numpy/core/include/numpy/npy_1_7_deprecated_api.h b/numpy/core/include/numpy/npy_1_7_deprecated_api.h index 76b57b748..a6ee21219 100644 --- a/numpy/core/include/numpy/npy_1_7_deprecated_api.h +++ b/numpy/core/include/numpy/npy_1_7_deprecated_api.h @@ -5,6 +5,8 @@ #error "Should never include npy_*_*_deprecated_api directly." #endif +/* Emit a warning if the user did not specifically request the old API */ +#ifndef NPY_NO_DEPRECATED_API #if defined(_WIN32) #define _WARN___STR2__(x) #x #define _WARN___STR1__(x) _WARN___STR2__(x) @@ -16,6 +18,7 @@ "#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION" #endif /* TODO: How to do this warning message for other compilers? */ +#endif /* * This header exists to collect all dangerous/deprecated NumPy API diff --git a/numpy/core/include/numpy/npy_3kcompat.h b/numpy/core/include/numpy/npy_3kcompat.h index a3c69f44e..832bc0599 100644 --- a/numpy/core/include/numpy/npy_3kcompat.h +++ b/numpy/core/include/numpy/npy_3kcompat.h @@ -219,6 +219,7 @@ npy_PyFile_Dup2(PyObject *file, char *mode, npy_off_t *orig_pos) if (handle == NULL) { PyErr_SetString(PyExc_IOError, "Getting a FILE* from a Python file object failed"); + return NULL; } /* Record the original raw file handle position */ @@ -383,6 +384,36 @@ npy_PyFile_CloseFile(PyObject *file) } +/* This is a copy of _PyErr_ChainExceptions + */ +static NPY_INLINE void +npy_PyErr_ChainExceptions(PyObject *exc, PyObject *val, PyObject *tb) +{ + if (exc == NULL) + return; + + if (PyErr_Occurred()) { + /* only py3 supports this anyway */ + #ifdef NPY_PY3K + PyObject *exc2, *val2, *tb2; + PyErr_Fetch(&exc2, &val2, &tb2); + PyErr_NormalizeException(&exc, &val, &tb); + if (tb != NULL) { + PyException_SetTraceback(val, tb); + Py_DECREF(tb); + } + Py_DECREF(exc); + PyErr_NormalizeException(&exc2, &val2, &tb2); + PyException_SetContext(val2, val); + PyErr_Restore(exc2, val2, tb2); + #endif + } + else { + PyErr_Restore(exc, val, tb); + } +} + + /* This is a copy of _PyErr_ChainExceptions, with: * - a minimal implementation for python 2 * - __cause__ used instead of __context__ diff --git a/numpy/core/include/numpy/ufuncobject.h b/numpy/core/include/numpy/ufuncobject.h index 85f8a6c08..90d837a9b 100644 --- a/numpy/core/include/numpy/ufuncobject.h +++ b/numpy/core/include/numpy/ufuncobject.h @@ -223,7 +223,8 @@ typedef struct _tagPyUFuncObject { */ npy_uint32 *core_dim_flags; - + /* Identity for reduction, when identity == PyUFunc_IdentityValue */ + PyObject *identity_value; } PyUFuncObject; @@ -299,6 +300,12 @@ typedef struct _tagPyUFuncObject { * This case allows reduction with multiple axes at once. */ #define PyUFunc_ReorderableNone -2 +/* + * UFunc unit is in identity_value, and the order of operations can be reordered + * This case allows reduction with multiple axes at once. 
+ */ +#define PyUFunc_IdentityValue -3 + #define UFUNC_REDUCE 0 #define UFUNC_ACCUMULATE 1 diff --git a/numpy/core/machar.py b/numpy/core/machar.py index 7578544fe..91fb4eda8 100644 --- a/numpy/core/machar.py +++ b/numpy/core/machar.py @@ -11,9 +11,11 @@ __all__ = ['MachAr'] from numpy.core.fromnumeric import any from numpy.core.numeric import errstate +from numpy.core.overrides import set_module # Need to speed this up...especially for longfloat +@set_module('numpy') class MachAr(object): """ Diagnosing machine parameters. diff --git a/numpy/core/memmap.py b/numpy/core/memmap.py index 8269f537f..82bc4707c 100644 --- a/numpy/core/memmap.py +++ b/numpy/core/memmap.py @@ -3,8 +3,9 @@ from __future__ import division, absolute_import, print_function import numpy as np from .numeric import uint8, ndarray, dtype from numpy.compat import ( - long, basestring, is_pathlib_path, contextlib_nullcontext + long, basestring, os_fspath, contextlib_nullcontext, is_pathlib_path ) +from numpy.core.overrides import set_module __all__ = ['memmap'] @@ -19,6 +20,8 @@ mode_equivalents = { "write":"w+" } + +@set_module('numpy') class memmap(ndarray): """Create a memory-map to an array stored in a *binary* file on disk. @@ -218,10 +221,8 @@ class memmap(ndarray): if hasattr(filename, 'read'): f_ctx = contextlib_nullcontext(filename) - elif is_pathlib_path(filename): - f_ctx = filename.open(('r' if mode == 'c' else mode)+'b') else: - f_ctx = open(filename, ('r' if mode == 'c' else mode)+'b') + f_ctx = open(os_fspath(filename), ('r' if mode == 'c' else mode)+'b') with f_ctx as fid: fid.seek(0, 2) @@ -268,14 +269,13 @@ class memmap(ndarray): self.offset = offset self.mode = mode - if isinstance(filename, basestring): - self.filename = os.path.abspath(filename) - elif is_pathlib_path(filename): + if is_pathlib_path(filename): + # special case - if we were constructed with a pathlib.path, + # then filename is a path object, not a string self.filename = filename.resolve() - # py3 returns int for TemporaryFile().name - elif (hasattr(filename, "name") and - isinstance(filename.name, basestring)): - self.filename = os.path.abspath(filename.name) + elif hasattr(fid, "name") and isinstance(fid.name, basestring): + # py3 returns int for TemporaryFile().name + self.filename = os.path.abspath(fid.name) # same as memmap copies (e.g. memmap + 1) else: self.filename = None diff --git a/numpy/core/multiarray.py b/numpy/core/multiarray.py index 25debd2f8..78963b0aa 100644 --- a/numpy/core/multiarray.py +++ b/numpy/core/multiarray.py @@ -7,6 +7,7 @@ by importing from the extension module. """ import functools +import warnings from . import overrides from . 
import _multiarray_umath @@ -39,6 +40,26 @@ __all__ = [ 'tracemalloc_domain', 'typeinfo', 'unpackbits', 'unravel_index', 'vdot', 'where', 'zeros'] + +arange.__module__ = 'numpy' +array.__module__ = 'numpy' +datetime_data.__module__ = 'numpy' +empty.__module__ = 'numpy' +frombuffer.__module__ = 'numpy' +fromfile.__module__ = 'numpy' +fromiter.__module__ = 'numpy' +frompyfunc.__module__ = 'numpy' +fromstring.__module__ = 'numpy' +geterrobj.__module__ = 'numpy' +matmul.__module__ = 'numpy' +may_share_memory.__module__ = 'numpy' +nested_iters.__module__ = 'numpy' +promote_types.__module__ = 'numpy' +set_numeric_ops.__module__ = 'numpy' +seterrobj.__module__ = 'numpy' +zeros.__module__ = 'numpy' + + array_function_dispatch = functools.partial( overrides.array_function_dispatch, module='numpy') @@ -832,6 +853,474 @@ def vdot(a, b): return _multiarray_umath.vdot(a, b) +def _bincount_dispatcher(x, weights=None, minlength=None): + return (x, weights) + + +@array_function_dispatch(_bincount_dispatcher) +def bincount(x, weights=None, minlength=0): + """ + Count number of occurrences of each value in array of non-negative ints. + + The number of bins (of size 1) is one larger than the largest value in + `x`. If `minlength` is specified, there will be at least this number + of bins in the output array (though it will be longer if necessary, + depending on the contents of `x`). + Each bin gives the number of occurrences of its index value in `x`. + If `weights` is specified the input array is weighted by it, i.e. if a + value ``n`` is found at position ``i``, ``out[n] += weight[i]`` instead + of ``out[n] += 1``. + + Parameters + ---------- + x : array_like, 1 dimension, nonnegative ints + Input array. + weights : array_like, optional + Weights, array of the same shape as `x`. + minlength : int, optional + A minimum number of bins for the output array. + + .. versionadded:: 1.6.0 + + Returns + ------- + out : ndarray of ints + The result of binning the input array. + The length of `out` is equal to ``np.amax(x)+1``. + + Raises + ------ + ValueError + If the input is not 1-dimensional, or contains elements with negative + values, or if `minlength` is negative. + TypeError + If the type of the input is float or complex. + + See Also + -------- + histogram, digitize, unique + + Examples + -------- + >>> np.bincount(np.arange(5)) + array([1, 1, 1, 1, 1]) + >>> np.bincount(np.array([0, 1, 1, 3, 2, 1, 7])) + array([1, 3, 1, 1, 0, 0, 0, 1]) + + >>> x = np.array([0, 1, 1, 3, 2, 1, 7, 23]) + >>> np.bincount(x).size == np.amax(x)+1 + True + + The input array needs to be of integer dtype, otherwise a + TypeError is raised: + + >>> np.bincount(np.arange(5, dtype=float)) + Traceback (most recent call last): + File "<stdin>", line 1, in <module> + TypeError: array cannot be safely cast to required type + + A possible use of ``bincount`` is to perform sums over + variable-size chunks of an array, using the ``weights`` keyword. + + >>> w = np.array([0.3, 0.5, 0.2, 0.7, 1., -0.6]) # weights + >>> x = np.array([0, 1, 1, 2, 2, 2]) + >>> np.bincount(x, weights=w) + array([ 0.3, 0.7, 1.1]) + + """ + return _multiarray_umath.bincount(x, weights=weights, minlength=minlength) + + +def _ravel_multi_index_dispatcher(multi_index, dims, mode=None, order=None): + return multi_index + + +@array_function_dispatch(_ravel_multi_index_dispatcher) +def ravel_multi_index(multi_index, dims, mode='raise', order='C'): + """ + Converts a tuple of index arrays into an array of flat + indices, applying boundary modes to the multi-index. 
+ + Parameters + ---------- + multi_index : tuple of array_like + A tuple of integer arrays, one array for each dimension. + dims : tuple of ints + The shape of array into which the indices from ``multi_index`` apply. + mode : {'raise', 'wrap', 'clip'}, optional + Specifies how out-of-bounds indices are handled. Can specify + either one mode or a tuple of modes, one mode per index. + + * 'raise' -- raise an error (default) + * 'wrap' -- wrap around + * 'clip' -- clip to the range + + In 'clip' mode, a negative index which would normally + wrap will clip to 0 instead. + order : {'C', 'F'}, optional + Determines whether the multi-index should be viewed as + indexing in row-major (C-style) or column-major + (Fortran-style) order. + + Returns + ------- + raveled_indices : ndarray + An array of indices into the flattened version of an array + of dimensions ``dims``. + + See Also + -------- + unravel_index + + Notes + ----- + .. versionadded:: 1.6.0 + + Examples + -------- + >>> arr = np.array([[3,6,6],[4,5,1]]) + >>> np.ravel_multi_index(arr, (7,6)) + array([22, 41, 37]) + >>> np.ravel_multi_index(arr, (7,6), order='F') + array([31, 41, 13]) + >>> np.ravel_multi_index(arr, (4,6), mode='clip') + array([22, 23, 19]) + >>> np.ravel_multi_index(arr, (4,4), mode=('clip','wrap')) + array([12, 13, 13]) + + >>> np.ravel_multi_index((3,1,4,1), (6,7,8,9)) + 1621 + """ + return _multiarray_umath.ravel_multi_index( + multi_index, dims, mode=mode, order=order) + + +def _deprecate_dims(shape, dims): + if dims is not None: + warnings.warn("'shape' argument should be used instead of 'dims'", + DeprecationWarning, stacklevel=3) + shape = dims + return shape + + +def _unravel_index_dispatcher(indices, shape=None, order=None, dims=None): + shape = _deprecate_dims(shape, dims) + return (indices,) + + +@array_function_dispatch(_unravel_index_dispatcher) +def unravel_index(indices, shape=None, order='C', dims=None): + """ + Converts a flat index or array of flat indices into a tuple + of coordinate arrays. + + Parameters + ---------- + indices : array_like + An integer array whose elements are indices into the flattened + version of an array of dimensions ``shape``. Before version 1.6.0, + this function accepted just one index value. + shape : tuple of ints + The shape of the array to use for unraveling ``indices``. + + .. versionchanged:: 1.16.0 + Renamed from ``dims`` to ``shape``. + + order : {'C', 'F'}, optional + Determines whether the indices should be viewed as indexing in + row-major (C-style) or column-major (Fortran-style) order. + + .. versionadded:: 1.6.0 + + Returns + ------- + unraveled_coords : tuple of ndarray + Each array in the tuple has the same shape as the ``indices`` + array. + + See Also + -------- + ravel_multi_index + + Examples + -------- + >>> np.unravel_index([22, 41, 37], (7,6)) + (array([3, 6, 6]), array([4, 5, 1])) + >>> np.unravel_index([31, 41, 13], (7,6), order='F') + (array([3, 6, 6]), array([4, 5, 1])) + + >>> np.unravel_index(1621, (6,7,8,9)) + (3, 1, 4, 1) + + """ + shape = _deprecate_dims(shape, dims) + return _multiarray_umath.unravel_index(indices, shape, order=order) + + +def _copyto_dispatcher(dst, src, casting=None, where=None): + return (dst, src, where) + + +@array_function_dispatch(_copyto_dispatcher) +def copyto(dst, src, casting='same_kind', where=True): + """ + Copies values from one array to another, broadcasting as necessary. + + Raises a TypeError if the `casting` rule is violated, and if + `where` is provided, it selects which elements to copy. + + .. 
versionadded:: 1.7.0 + + Parameters + ---------- + dst : ndarray + The array into which values are copied. + src : array_like + The array from which values are copied. + casting : {'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional + Controls what kind of data casting may occur when copying. + + * 'no' means the data types should not be cast at all. + * 'equiv' means only byte-order changes are allowed. + * 'safe' means only casts which can preserve values are allowed. + * 'same_kind' means only safe casts or casts within a kind, + like float64 to float32, are allowed. + * 'unsafe' means any data conversions may be done. + where : array_like of bool, optional + A boolean array which is broadcasted to match the dimensions + of `dst`, and selects elements to copy from `src` to `dst` + wherever it contains the value True. + """ + return _multiarray_umath.copyto(dst, src, casting=casting, where=where) + + +def _putmask_dispatcher(a, mask, values): + return (a, mask, values) + + +@array_function_dispatch(_putmask_dispatcher) +def putmask(a, mask, values): + """ + Changes elements of an array based on conditional and input values. + + Sets ``a.flat[n] = values[n]`` for each n where ``mask.flat[n]==True``. + + If `values` is not the same size as `a` and `mask` then it will repeat. + This gives behavior different from ``a[mask] = values``. + + Parameters + ---------- + a : array_like + Target array. + mask : array_like + Boolean mask array. It has to be the same shape as `a`. + values : array_like + Values to put into `a` where `mask` is True. If `values` is smaller + than `a` it will be repeated. + + See Also + -------- + place, put, take, copyto + + Examples + -------- + >>> x = np.arange(6).reshape(2, 3) + >>> np.putmask(x, x>2, x**2) + >>> x + array([[ 0, 1, 2], + [ 9, 16, 25]]) + + If `values` is smaller than `a` it is repeated: + + >>> x = np.arange(5) + >>> np.putmask(x, x>1, [-33, -44]) + >>> x + array([ 0, 1, -33, -44, -33]) + + """ + return _multiarray_umath.putmask(a, mask, values) + + +def _packbits_and_unpackbits_dispatcher(myarray, axis=None): + return (myarray,) + + +@array_function_dispatch(_packbits_and_unpackbits_dispatcher) +def packbits(myarray, axis=None): + """ + Packs the elements of a binary-valued array into bits in a uint8 array. + + The result is padded to full bytes by inserting zero bits at the end. + + Parameters + ---------- + myarray : array_like + An array of integers or booleans whose elements should be packed to + bits. + axis : int, optional + The dimension over which bit-packing is done. + ``None`` implies packing the flattened array. + + Returns + ------- + packed : ndarray + Array of type uint8 whose elements represent bits corresponding to the + logical (0 or nonzero) value of the input elements. The shape of + `packed` has the same number of dimensions as the input (unless `axis` + is None, in which case the output is 1-D). + + See Also + -------- + unpackbits: Unpacks elements of a uint8 array into a binary-valued output + array. + + Examples + -------- + >>> a = np.array([[[1,0,1], + ... [0,1,0]], + ... [[1,1,0], + ... [0,0,1]]]) + >>> b = np.packbits(a, axis=-1) + >>> b + array([[[160],[64]],[[192],[32]]], dtype=uint8) + + Note that in binary 160 = 1010 0000, 64 = 0100 0000, 192 = 1100 0000, + and 32 = 0010 0000. + + """ + return _multiarray_umath.packbits(myarray, axis) + + +@array_function_dispatch(_packbits_and_unpackbits_dispatcher) +def unpackbits(myarray, axis=None): + """ + Unpacks elements of a uint8 array into a binary-valued output array. 
+ + Each element of `myarray` represents a bit-field that should be unpacked + into a binary-valued output array. The shape of the output array is either + 1-D (if `axis` is None) or the same shape as the input array with unpacking + done along the axis specified. + + Parameters + ---------- + myarray : ndarray, uint8 type + Input array. + axis : int, optional + The dimension over which bit-unpacking is done. + ``None`` implies unpacking the flattened array. + + Returns + ------- + unpacked : ndarray, uint8 type + The elements are binary-valued (0 or 1). + + See Also + -------- + packbits : Packs the elements of a binary-valued array into bits in a uint8 + array. + + Examples + -------- + >>> a = np.array([[2], [7], [23]], dtype=np.uint8) + >>> a + array([[ 2], + [ 7], + [23]], dtype=uint8) + >>> b = np.unpackbits(a, axis=1) + >>> b + array([[0, 0, 0, 0, 0, 0, 1, 0], + [0, 0, 0, 0, 0, 1, 1, 1], + [0, 0, 0, 1, 0, 1, 1, 1]], dtype=uint8) + + """ + return _multiarray_umath.unpackbits(myarray, axis) + + +def _shares_memory_dispatcher(a, b, max_work=None): + return (a, b) + + +@array_function_dispatch(_shares_memory_dispatcher) +def shares_memory(a, b, max_work=None): + """ + Determine if two arrays share memory + + Parameters + ---------- + a, b : ndarray + Input arrays + max_work : int, optional + Effort to spend on solving the overlap problem (maximum number + of candidate solutions to consider). The following special + values are recognized: + + max_work=MAY_SHARE_EXACT (default) + The problem is solved exactly. In this case, the function returns + True only if there is an element shared between the arrays. + max_work=MAY_SHARE_BOUNDS + Only the memory bounds of a and b are checked. + + Raises + ------ + numpy.TooHardError + Exceeded max_work. + + Returns + ------- + out : bool + + See Also + -------- + may_share_memory + + Examples + -------- + >>> np.may_share_memory(np.array([1,2]), np.array([5,8,9])) + False + + """ + return _multiarray_umath.shares_memory(a, b, max_work=max_work) + + +@array_function_dispatch(_shares_memory_dispatcher) +def may_share_memory(a, b, max_work=None): + """ + Determine if two arrays might share memory + + A return of True does not necessarily mean that the two arrays + share any element. It just means that they *might*. + + Only the memory bounds of a and b are checked by default. + + Parameters + ---------- + a, b : ndarray + Input arrays + max_work : int, optional + Effort to spend on solving the overlap problem. See + `shares_memory` for details. Default for ``may_share_memory`` + is to do a bounds check. + + Returns + ------- + out : bool + + See Also + -------- + shares_memory + + Examples + -------- + >>> np.may_share_memory(np.array([1,2]), np.array([5,8,9])) + False + >>> x = np.zeros([3, 4]) + >>> np.may_share_memory(x[:,0], x[:,1]) + True + + """ + return _multiarray_umath.may_share_memory(a, b, max_work=max_work) + + def _is_busday_dispatcher( dates, weekmask=None, holidays=None, busdaycal=None, out=None): return (dates, weekmask, holidays, out) @@ -1156,3 +1645,4 @@ def datetime_as_string(arr, unit=None, timezone='naive', casting='same_kind'): datetime with units 'm' according to the rule 'safe' """ return _multiarray_umath.datetime_as_string(arr, unit, timezone, casting) + diff --git a/numpy/core/numeric.py b/numpy/core/numeric.py index 5d82bbd8d..aa5be1af3 100644 --- a/numpy/core/numeric.py +++ b/numpy/core/numeric.py @@ -30,6 +30,7 @@ if sys.version_info[0] < 3: from . import overrides from . 
import umath +from .overrides import set_module from .umath import (multiply, invert, sin, UFUNC_BUFSIZE_DEFAULT, ERR_IGNORE, ERR_WARN, ERR_RAISE, ERR_CALL, ERR_PRINT, ERR_LOG, ERR_DEFAULT, PINF, NAN) @@ -92,6 +93,7 @@ if sys.version_info[0] < 3: __all__.extend(['getbuffer', 'newbuffer']) +@set_module('numpy') class ComplexWarning(RuntimeWarning): """ The warning raised when casting a complex dtype to a real dtype. @@ -170,6 +172,7 @@ def zeros_like(a, dtype=None, order='K', subok=True): return res +@set_module('numpy') def ones(shape, dtype=None, order='C'): """ Return a new array of given shape and type, filled with ones. @@ -287,6 +290,7 @@ def ones_like(a, dtype=None, order='K', subok=True): return res +@set_module('numpy') def full(shape, fill_value, dtype=None, order='C'): """ Return a new array of given shape and type, filled with `fill_value`. @@ -462,6 +466,7 @@ def count_nonzero(a, axis=None): return a_bool.sum(axis=axis, dtype=np.intp) +@set_module('numpy') def asarray(a, dtype=None, order=None): """Convert the input to an array. @@ -533,6 +538,7 @@ def asarray(a, dtype=None, order=None): return array(a, dtype, copy=False, order=order) +@set_module('numpy') def asanyarray(a, dtype=None, order=None): """Convert the input to an ndarray, but pass ndarray subclasses through. @@ -585,9 +591,10 @@ def asanyarray(a, dtype=None, order=None): return array(a, dtype, copy=False, order=order, subok=True) +@set_module('numpy') def ascontiguousarray(a, dtype=None): """ - Return a contiguous array in memory (C order). + Return a contiguous array (ndim >= 1) in memory (C order). Parameters ---------- @@ -618,13 +625,17 @@ def ascontiguousarray(a, dtype=None): >>> x.flags['C_CONTIGUOUS'] True + Note: This function returns an array with at least one-dimension (1-d) + so it will not preserve 0-d arrays. + """ return array(a, dtype, copy=False, order='C', ndmin=1) +@set_module('numpy') def asfortranarray(a, dtype=None): """ - Return an array laid out in Fortran order in memory. + Return an array (ndim >= 1) laid out in Fortran order in memory. Parameters ---------- @@ -655,10 +666,14 @@ def asfortranarray(a, dtype=None): >>> y.flags['F_CONTIGUOUS'] True + Note: This function returns an array with at least one-dimension (1-d) + so it will not preserve 0-d arrays. + """ return array(a, dtype, copy=False, order='F', ndmin=1) +@set_module('numpy') def require(a, dtype=None, requirements=None): """ Return an ndarray of the provided type that satisfies requirements. @@ -757,6 +772,7 @@ def require(a, dtype=None, requirements=None): return arr +@set_module('numpy') def isfortran(a): """ Returns True if the array is Fortran contiguous but *not* C contiguous. @@ -1883,6 +1899,7 @@ def cross(a, b, axisa=-1, axisb=-1, axisc=-1, axis=None): little_endian = (sys.byteorder == 'little') +@set_module('numpy') def indices(dimensions, dtype=int): """ Return an array representing the indices of a grid. @@ -1954,6 +1971,7 @@ def indices(dimensions, dtype=int): return res +@set_module('numpy') def fromfunction(function, shape, **kwargs): """ Construct an array by executing a function over each coordinate. @@ -2014,6 +2032,7 @@ def _frombuffer(buf, dtype, shape, order): return frombuffer(buf, dtype=dtype).reshape(shape, order=order) +@set_module('numpy') def isscalar(num): """ Returns True if the type of `num` is a scalar type. 
@@ -2090,6 +2109,7 @@ def isscalar(num): or isinstance(num, numbers.Number)) +@set_module('numpy') def binary_repr(num, width=None): """ Return the binary representation of the input number as a string. @@ -2200,6 +2220,7 @@ def binary_repr(num, width=None): return '1' * (outwidth - binwidth) + binary +@set_module('numpy') def base_repr(number, base=2, padding=0): """ Return a string representation of a number in the given base system. @@ -2294,6 +2315,7 @@ def _maketup(descr, val): return tuple(res) +@set_module('numpy') def identity(n, dtype=None): """ Return the identity array. @@ -2634,6 +2656,7 @@ for key in _errdict.keys(): del key +@set_module('numpy') def seterr(all=None, divide=None, over=None, under=None, invalid=None): """ Set how floating-point errors are handled. @@ -2735,6 +2758,7 @@ def seterr(all=None, divide=None, over=None, under=None, invalid=None): return old +@set_module('numpy') def geterr(): """ Get the current way of handling floating-point errors. @@ -2786,6 +2810,7 @@ def geterr(): return res +@set_module('numpy') def setbufsize(size): """ Set the size of the buffer used in ufuncs. @@ -2810,6 +2835,7 @@ def setbufsize(size): return old +@set_module('numpy') def getbufsize(): """ Return the size of the buffer used in ufuncs. @@ -2823,6 +2849,7 @@ def getbufsize(): return umath.geterrobj()[0] +@set_module('numpy') def seterrcall(func): """ Set the floating-point error callback function or log object. @@ -2915,6 +2942,7 @@ def seterrcall(func): return old +@set_module('numpy') def geterrcall(): """ Return the current callback function used on floating-point errors. @@ -2967,6 +2995,7 @@ class _unspecified(object): _Unspecified = _unspecified() +@set_module('numpy') class errstate(object): """ errstate(**kwargs) diff --git a/numpy/core/numerictypes.py b/numpy/core/numerictypes.py index 2fb841f7c..f00f92286 100644 --- a/numpy/core/numerictypes.py +++ b/numpy/core/numerictypes.py @@ -92,6 +92,7 @@ from numpy.core.multiarray import ( datetime_as_string, busday_offset, busday_count, is_busday, busdaycalendar ) +from numpy.core.overrides import set_module # we add more at the bottom __all__ = ['sctypeDict', 'sctypeNA', 'typeDict', 'typeNA', 'sctypes', @@ -187,6 +188,8 @@ def maximum_sctype(t): else: return t + +@set_module('numpy') def issctype(rep): """ Determines whether the given object represents a scalar data-type. @@ -231,6 +234,8 @@ def issctype(rep): except Exception: return False + +@set_module('numpy') def obj2sctype(rep, default=None): """ Return the scalar dtype or NumPy equivalent of Python type of an object. @@ -285,6 +290,7 @@ def obj2sctype(rep, default=None): return res.type +@set_module('numpy') def issubclass_(arg1, arg2): """ Determine if a class is a subclass of a second class. @@ -323,6 +329,8 @@ def issubclass_(arg1, arg2): except TypeError: return False + +@set_module('numpy') def issubsctype(arg1, arg2): """ Determine if the first argument is a subclass of the second argument. @@ -353,6 +361,8 @@ def issubsctype(arg1, arg2): """ return issubclass(obj2sctype(arg1), obj2sctype(arg2)) + +@set_module('numpy') def issubdtype(arg1, arg2): """ Returns True if first argument is a typecode lower/equal in type hierarchy. @@ -446,6 +456,8 @@ def _construct_lookups(): _construct_lookups() + +@set_module('numpy') def sctype2char(sctype): """ Return the string representation of a scalar dtype. 
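[Editor's note, not part of the patch: the ``@set_module('numpy')`` decorator applied throughout these hunks only rewrites a function's ``__module__`` so introspection and documentation tools report the public ``numpy`` namespace rather than the private defining module (e.g. ``numpy.core.numeric``). A small standalone sketch of the idea; the decorator name and sample function below are illustrative, not taken from the patch:]

>>> import numpy as np
>>> np.isscalar.__module__        # with this patch applied
'numpy'
>>> def set_module(module):
...     # minimal sketch mirroring the decorator added in numpy/core/overrides.py
...     def decorator(func):
...         if module is not None:
...             func.__module__ = module
...         return func
...     return decorator
>>> @set_module('example.public')
... def helper():
...     pass
>>> helper.__module__
'example.public'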
@@ -586,6 +598,8 @@ def _register_types(): _register_types() + +@set_module('numpy') def find_common_type(array_types, scalar_types): """ Determine common type following standard coercion rules. diff --git a/numpy/core/overrides.py b/numpy/core/overrides.py index 85a8c32bb..1cc1ff8d8 100644 --- a/numpy/core/overrides.py +++ b/numpy/core/overrides.py @@ -4,12 +4,17 @@ TODO: rewrite this in C for performance. """ import collections import functools +import os from numpy.core._multiarray_umath import ndarray from numpy.compat._inspect import getargspec _NDARRAY_ARRAY_FUNCTION = ndarray.__array_function__ +_NDARRAY_ONLY = [ndarray] + +ENABLE_ARRAY_FUNCTION = bool( + int(os.environ.get('NUMPY_EXPERIMENTAL_ARRAY_FUNCTION', 0))) def get_overloaded_types_and_args(relevant_args): @@ -40,17 +45,26 @@ def get_overloaded_types_and_args(relevant_args): if (arg_type not in overloaded_types and hasattr(arg_type, '__array_function__')): - overloaded_types.append(arg_type) - - # By default, insert this argument at the end, but if it is - # subclass of another argument, insert it before that argument. - # This ensures "subclasses before superclasses". - index = len(overloaded_args) - for i, old_arg in enumerate(overloaded_args): - if issubclass(arg_type, type(old_arg)): - index = i - break - overloaded_args.insert(index, arg) + # Create lists explicitly for the first type (usually the only one + # done) to avoid setting up the iterator for overloaded_args. + if overloaded_types: + overloaded_types.append(arg_type) + # By default, insert argument at the end, but if it is + # subclass of another argument, insert it before that argument. + # This ensures "subclasses before superclasses". + index = len(overloaded_args) + for i, old_arg in enumerate(overloaded_args): + if issubclass(arg_type, type(old_arg)): + index = i + break + overloaded_args.insert(index, arg) + else: + overloaded_types = [arg_type] + overloaded_args = [arg] + + # Short-cut for the common case of only ndarray. + if overloaded_types == _NDARRAY_ONLY: + return overloaded_types, [] # Special handling for ndarray.__array_function__ overloaded_args = [ @@ -136,12 +150,57 @@ def verify_matching_signatures(implementation, dispatcher): 'default argument values') +def set_module(module): + """Decorator for overriding __module__ on a function or class. + + Example usage:: + + @set_module('numpy') + def example(): + pass + + assert example.__module__ == 'numpy' + """ + def decorator(func): + if module is not None: + func.__module__ = module + return func + return decorator + + def array_function_dispatch(dispatcher, module=None, verify=True): - """Decorator for adding dispatch with the __array_function__ protocol.""" + """Decorator for adding dispatch with the __array_function__ protocol. + + See NEP-18 for example usage. + + Parameters + ---------- + dispatcher : callable + Function that when called like ``dispatcher(*args, **kwargs)`` with + arguments from the NumPy function call returns an iterable of + array-like arguments to check for ``__array_function__``. + module : str, optional + __module__ attribute to set on new function, e.g., ``module='numpy'``. + By default, module is copied from the decorated function. + verify : bool, optional + If True, verify the that the signature of the dispatcher and decorated + function signatures match exactly: all required and optional arguments + should appear in order with the same names, but the default values for + all optional arguments should be ``None``. 
Only disable verification + if the dispatcher's signature needs to deviate for some particular + reason, e.g., because the function has a signature like + ``func(*args, **kwargs)``. + + Returns + ------- + Function suitable for decorating the implementation of a NumPy function. + """ + + if not ENABLE_ARRAY_FUNCTION: + # __array_function__ requires an explicit opt-in for now + return set_module(module) + def decorator(implementation): - # TODO: only do this check when the appropriate flag is enabled or for - # a dev install. We want this check for testing but don't want to - # slow down all numpy imports. if verify: verify_matching_signatures(implementation, dispatcher) diff --git a/numpy/core/records.py b/numpy/core/records.py index a483871ba..6fc282500 100644 --- a/numpy/core/records.py +++ b/numpy/core/records.py @@ -42,7 +42,8 @@ import warnings from . import numeric as sb from . import numerictypes as nt -from numpy.compat import isfileobj, bytes, long, unicode +from numpy.compat import isfileobj, bytes, long, unicode, os_fspath +from numpy.core.overrides import set_module from .arrayprint import get_printoptions # All of the functions allow formats to be a dtype @@ -82,6 +83,8 @@ def find_duplicate(list): dup.append(list[i]) return dup + +@set_module('numpy') class format_parser(object): """ Class to convert formats, names, titles description to a dtype. @@ -737,9 +740,9 @@ def fromfile(fd, dtype=None, shape=None, offset=0, formats=None, names=None, titles=None, aligned=False, byteorder=None): """Create an array from binary file data - If file is a string then that file is opened, else it is assumed - to be a file object. The file object must support random access - (i.e. it must have tell and seek methods). + If file is a string or a path-like object then that file is opened, + else it is assumed to be a file object. The file object must + support random access (i.e. it must have tell and seek methods). >>> from tempfile import TemporaryFile >>> a = np.empty(10,dtype='f8,i4,a5') @@ -763,10 +766,14 @@ def fromfile(fd, dtype=None, shape=None, offset=0, formats=None, elif isinstance(shape, (int, long)): shape = (shape,) - name = 0 - if isinstance(fd, str): + if isfileobj(fd): + # file already opened + name = 0 + else: + # open file + fd = open(os_fspath(fd), 'rb') name = 1 - fd = open(fd, 'rb') + if (offset > 0): fd.seek(offset, 1) size = get_remaining_size(fd) diff --git a/numpy/core/setup.py b/numpy/core/setup.py index a4429cee2..23a9e268b 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -379,8 +379,9 @@ def check_mathlib(config_cmd): def visibility_define(config): """Return the define value to use for NPY_VISIBILITY_HIDDEN (may be empty string).""" - if config.check_compiler_gcc4(): - return '__attribute__((visibility("hidden")))' + hide = '__attribute__((visibility("hidden")))' + if config.check_gcc_function_attribute(hide, 'hideme'): + return hide else: return '' @@ -677,7 +678,7 @@ def configuration(parent_package='',top_path=None): join('src', 'npymath', 'npy_math_complex.c.src'), join('src', 'npymath', 'halffloat.c') ] - + # Must be true for CRT compilers but not MinGW/cygwin. See gh-9977. 
is_msvc = platform.system() == 'Windows' config.add_installed_library('npymath', @@ -697,7 +698,8 @@ def configuration(parent_package='',top_path=None): ####################################################################### # This library is created for the build but it is not installed - npysort_sources = [join('src', 'npysort', 'quicksort.c.src'), + npysort_sources = [join('src', 'common', 'npy_sort.h.src'), + join('src', 'npysort', 'quicksort.c.src'), join('src', 'npysort', 'mergesort.c.src'), join('src', 'npysort', 'heapsort.c.src'), join('src', 'common', 'npy_partition.h.src'), @@ -903,14 +905,15 @@ def configuration(parent_package='',top_path=None): join('include', 'numpy', 'npy_math.h'), join('include', 'numpy', 'halffloat.h'), join('src', 'multiarray', 'common.h'), + join('src', 'multiarray', 'number.h'), join('src', 'common', 'templ_common.h.src'), join('src', 'umath', 'simd.inc.src'), join('src', 'umath', 'override.h'), join(codegen_dir, 'generate_ufunc_api.py'), - ] + ] config.add_extension('_multiarray_umath', - sources=multiarray_src + umath_src + + sources=multiarray_src + umath_src + npymath_sources + common_src + [generate_config_h, generate_numpyconfig_h, @@ -920,7 +923,7 @@ def configuration(parent_package='',top_path=None): generate_umath_c, generate_ufunc_api, ], - depends=deps + multiarray_deps + umath_deps + + depends=deps + multiarray_deps + umath_deps + common_deps, libraries=['npymath', 'npysort'], extra_info=extra_info) diff --git a/numpy/core/shape_base.py b/numpy/core/shape_base.py index 3edf0824e..6d234e527 100644 --- a/numpy/core/shape_base.py +++ b/numpy/core/shape_base.py @@ -217,6 +217,11 @@ def _arrays_for_stack_dispatcher(arrays, stacklevel=4): return arrays +def _warn_for_nonsequence(arrays): + if not overrides.ENABLE_ARRAY_FUNCTION: + _arrays_for_stack_dispatcher(arrays, stacklevel=4) + + def _vhstack_dispatcher(tup): return _arrays_for_stack_dispatcher(tup) @@ -274,6 +279,7 @@ def vstack(tup): [4]]) """ + _warn_for_nonsequence(tup) return _nx.concatenate([atleast_2d(_m) for _m in tup], 0) @@ -325,6 +331,7 @@ def hstack(tup): [3, 4]]) """ + _warn_for_nonsequence(tup) arrs = [atleast_1d(_m) for _m in tup] # As a special case, dimension 0 of 1-dimensional arrays is "horizontal" if arrs and arrs[0].ndim == 1: @@ -398,6 +405,7 @@ def stack(arrays, axis=0, out=None): [3, 4]]) """ + _warn_for_nonsequence(arrays) arrays = [asanyarray(arr) for arr in arrays] if not arrays: raise ValueError('need at least one array to stack') diff --git a/numpy/core/src/common/npy_sort.h b/numpy/core/src/common/npy_sort.h deleted file mode 100644 index 8c6f05623..000000000 --- a/numpy/core/src/common/npy_sort.h +++ /dev/null @@ -1,204 +0,0 @@ -#ifndef __NPY_SORT_H__ -#define __NPY_SORT_H__ - -/* Python include is for future object sorts */ -#include <Python.h> -#include <numpy/npy_common.h> -#include <numpy/ndarraytypes.h> - -#define NPY_ENOMEM 1 -#define NPY_ECOMP 2 - -static NPY_INLINE int npy_get_msb(npy_uintp unum) -{ - int depth_limit = 0; - while (unum >>= 1) { - depth_limit++; - } - return depth_limit; -} - -int quicksort_bool(void *vec, npy_intp cnt, void *null); -int heapsort_bool(void *vec, npy_intp cnt, void *null); -int mergesort_bool(void *vec, npy_intp cnt, void *null); -int aquicksort_bool(void *vec, npy_intp *ind, npy_intp cnt, void *null); -int aheapsort_bool(void *vec, npy_intp *ind, npy_intp cnt, void *null); -int amergesort_bool(void *vec, npy_intp *ind, npy_intp cnt, void *null); - - -int quicksort_byte(void *vec, npy_intp cnt, void *null); -int 
heapsort_byte(void *vec, npy_intp cnt, void *null); -int mergesort_byte(void *vec, npy_intp cnt, void *null); -int aquicksort_byte(void *vec, npy_intp *ind, npy_intp cnt, void *null); -int aheapsort_byte(void *vec, npy_intp *ind, npy_intp cnt, void *null); -int amergesort_byte(void *vec, npy_intp *ind, npy_intp cnt, void *null); - - -int quicksort_ubyte(void *vec, npy_intp cnt, void *null); -int heapsort_ubyte(void *vec, npy_intp cnt, void *null); -int mergesort_ubyte(void *vec, npy_intp cnt, void *null); -int aquicksort_ubyte(void *vec, npy_intp *ind, npy_intp cnt, void *null); -int aheapsort_ubyte(void *vec, npy_intp *ind, npy_intp cnt, void *null); -int amergesort_ubyte(void *vec, npy_intp *ind, npy_intp cnt, void *null); - - -int quicksort_short(void *vec, npy_intp cnt, void *null); -int heapsort_short(void *vec, npy_intp cnt, void *null); -int mergesort_short(void *vec, npy_intp cnt, void *null); -int aquicksort_short(void *vec, npy_intp *ind, npy_intp cnt, void *null); -int aheapsort_short(void *vec, npy_intp *ind, npy_intp cnt, void *null); -int amergesort_short(void *vec, npy_intp *ind, npy_intp cnt, void *null); - - -int quicksort_ushort(void *vec, npy_intp cnt, void *null); -int heapsort_ushort(void *vec, npy_intp cnt, void *null); -int mergesort_ushort(void *vec, npy_intp cnt, void *null); -int aquicksort_ushort(void *vec, npy_intp *ind, npy_intp cnt, void *null); -int aheapsort_ushort(void *vec, npy_intp *ind, npy_intp cnt, void *null); -int amergesort_ushort(void *vec, npy_intp *ind, npy_intp cnt, void *null); - - -int quicksort_int(void *vec, npy_intp cnt, void *null); -int heapsort_int(void *vec, npy_intp cnt, void *null); -int mergesort_int(void *vec, npy_intp cnt, void *null); -int aquicksort_int(void *vec, npy_intp *ind, npy_intp cnt, void *null); -int aheapsort_int(void *vec, npy_intp *ind, npy_intp cnt, void *null); -int amergesort_int(void *vec, npy_intp *ind, npy_intp cnt, void *null); - - -int quicksort_uint(void *vec, npy_intp cnt, void *null); -int heapsort_uint(void *vec, npy_intp cnt, void *null); -int mergesort_uint(void *vec, npy_intp cnt, void *null); -int aquicksort_uint(void *vec, npy_intp *ind, npy_intp cnt, void *null); -int aheapsort_uint(void *vec, npy_intp *ind, npy_intp cnt, void *null); -int amergesort_uint(void *vec, npy_intp *ind, npy_intp cnt, void *null); - - -int quicksort_long(void *vec, npy_intp cnt, void *null); -int heapsort_long(void *vec, npy_intp cnt, void *null); -int mergesort_long(void *vec, npy_intp cnt, void *null); -int aquicksort_long(void *vec, npy_intp *ind, npy_intp cnt, void *null); -int aheapsort_long(void *vec, npy_intp *ind, npy_intp cnt, void *null); -int amergesort_long(void *vec, npy_intp *ind, npy_intp cnt, void *null); - - -int quicksort_ulong(void *vec, npy_intp cnt, void *null); -int heapsort_ulong(void *vec, npy_intp cnt, void *null); -int mergesort_ulong(void *vec, npy_intp cnt, void *null); -int aquicksort_ulong(void *vec, npy_intp *ind, npy_intp cnt, void *null); -int aheapsort_ulong(void *vec, npy_intp *ind, npy_intp cnt, void *null); -int amergesort_ulong(void *vec, npy_intp *ind, npy_intp cnt, void *null); - - -int quicksort_longlong(void *vec, npy_intp cnt, void *null); -int heapsort_longlong(void *vec, npy_intp cnt, void *null); -int mergesort_longlong(void *vec, npy_intp cnt, void *null); -int aquicksort_longlong(void *vec, npy_intp *ind, npy_intp cnt, void *null); -int aheapsort_longlong(void *vec, npy_intp *ind, npy_intp cnt, void *null); -int amergesort_longlong(void *vec, npy_intp *ind, npy_intp cnt, void 
*null); - - -int quicksort_ulonglong(void *vec, npy_intp cnt, void *null); -int heapsort_ulonglong(void *vec, npy_intp cnt, void *null); -int mergesort_ulonglong(void *vec, npy_intp cnt, void *null); -int aquicksort_ulonglong(void *vec, npy_intp *ind, npy_intp cnt, void *null); -int aheapsort_ulonglong(void *vec, npy_intp *ind, npy_intp cnt, void *null); -int amergesort_ulonglong(void *vec, npy_intp *ind, npy_intp cnt, void *null); - - -int quicksort_half(void *vec, npy_intp cnt, void *null); -int heapsort_half(void *vec, npy_intp cnt, void *null); -int mergesort_half(void *vec, npy_intp cnt, void *null); -int aquicksort_half(void *vec, npy_intp *ind, npy_intp cnt, void *null); -int aheapsort_half(void *vec, npy_intp *ind, npy_intp cnt, void *null); -int amergesort_half(void *vec, npy_intp *ind, npy_intp cnt, void *null); - - -int quicksort_float(void *vec, npy_intp cnt, void *null); -int heapsort_float(void *vec, npy_intp cnt, void *null); -int mergesort_float(void *vec, npy_intp cnt, void *null); -int aquicksort_float(void *vec, npy_intp *ind, npy_intp cnt, void *null); -int aheapsort_float(void *vec, npy_intp *ind, npy_intp cnt, void *null); -int amergesort_float(void *vec, npy_intp *ind, npy_intp cnt, void *null); - - -int quicksort_double(void *vec, npy_intp cnt, void *null); -int heapsort_double(void *vec, npy_intp cnt, void *null); -int mergesort_double(void *vec, npy_intp cnt, void *null); -int aquicksort_double(void *vec, npy_intp *ind, npy_intp cnt, void *null); -int aheapsort_double(void *vec, npy_intp *ind, npy_intp cnt, void *null); -int amergesort_double(void *vec, npy_intp *ind, npy_intp cnt, void *null); - - -int quicksort_longdouble(void *vec, npy_intp cnt, void *null); -int heapsort_longdouble(void *vec, npy_intp cnt, void *null); -int mergesort_longdouble(void *vec, npy_intp cnt, void *null); -int aquicksort_longdouble(void *vec, npy_intp *ind, npy_intp cnt, void *null); -int aheapsort_longdouble(void *vec, npy_intp *ind, npy_intp cnt, void *null); -int amergesort_longdouble(void *vec, npy_intp *ind, npy_intp cnt, void *null); - - -int quicksort_cfloat(void *vec, npy_intp cnt, void *null); -int heapsort_cfloat(void *vec, npy_intp cnt, void *null); -int mergesort_cfloat(void *vec, npy_intp cnt, void *null); -int aquicksort_cfloat(void *vec, npy_intp *ind, npy_intp cnt, void *null); -int aheapsort_cfloat(void *vec, npy_intp *ind, npy_intp cnt, void *null); -int amergesort_cfloat(void *vec, npy_intp *ind, npy_intp cnt, void *null); - - -int quicksort_cdouble(void *vec, npy_intp cnt, void *null); -int heapsort_cdouble(void *vec, npy_intp cnt, void *null); -int mergesort_cdouble(void *vec, npy_intp cnt, void *null); -int aquicksort_cdouble(void *vec, npy_intp *ind, npy_intp cnt, void *null); -int aheapsort_cdouble(void *vec, npy_intp *ind, npy_intp cnt, void *null); -int amergesort_cdouble(void *vec, npy_intp *ind, npy_intp cnt, void *null); - - -int quicksort_clongdouble(void *vec, npy_intp cnt, void *null); -int heapsort_clongdouble(void *vec, npy_intp cnt, void *null); -int mergesort_clongdouble(void *vec, npy_intp cnt, void *null); -int aquicksort_clongdouble(void *vec, npy_intp *ind, npy_intp cnt, void *null); -int aheapsort_clongdouble(void *vec, npy_intp *ind, npy_intp cnt, void *null); -int amergesort_clongdouble(void *vec, npy_intp *ind, npy_intp cnt, void *null); - - -int quicksort_string(void *vec, npy_intp cnt, void *arr); -int heapsort_string(void *vec, npy_intp cnt, void *arr); -int mergesort_string(void *vec, npy_intp cnt, void *arr); -int 
aquicksort_string(void *vec, npy_intp *ind, npy_intp cnt, void *arr); -int aheapsort_string(void *vec, npy_intp *ind, npy_intp cnt, void *arr); -int amergesort_string(void *vec, npy_intp *ind, npy_intp cnt, void *arr); - - -int quicksort_unicode(void *vec, npy_intp cnt, void *arr); -int heapsort_unicode(void *vec, npy_intp cnt, void *arr); -int mergesort_unicode(void *vec, npy_intp cnt, void *arr); -int aquicksort_unicode(void *vec, npy_intp *ind, npy_intp cnt, void *arr); -int aheapsort_unicode(void *vec, npy_intp *ind, npy_intp cnt, void *arr); -int amergesort_unicode(void *vec, npy_intp *ind, npy_intp cnt, void *arr); - - -int quicksort_datetime(void *vec, npy_intp cnt, void *null); -int heapsort_datetime(void *vec, npy_intp cnt, void *null); -int mergesort_datetime(void *vec, npy_intp cnt, void *null); -int aquicksort_datetime(void *vec, npy_intp *ind, npy_intp cnt, void *null); -int aheapsort_datetime(void *vec, npy_intp *ind, npy_intp cnt, void *null); -int amergesort_datetime(void *vec, npy_intp *ind, npy_intp cnt, void *null); - - -int quicksort_timedelta(void *vec, npy_intp cnt, void *null); -int heapsort_timedelta(void *vec, npy_intp cnt, void *null); -int mergesort_timedelta(void *vec, npy_intp cnt, void *null); -int aquicksort_timedelta(void *vec, npy_intp *ind, npy_intp cnt, void *null); -int aheapsort_timedelta(void *vec, npy_intp *ind, npy_intp cnt, void *null); -int amergesort_timedelta(void *vec, npy_intp *ind, npy_intp cnt, void *null); - - -int npy_quicksort(void *vec, npy_intp cnt, void *arr); -int npy_heapsort(void *vec, npy_intp cnt, void *arr); -int npy_mergesort(void *vec, npy_intp cnt, void *arr); -int npy_aquicksort(void *vec, npy_intp *ind, npy_intp cnt, void *arr); -int npy_aheapsort(void *vec, npy_intp *ind, npy_intp cnt, void *arr); -int npy_amergesort(void *vec, npy_intp *ind, npy_intp cnt, void *arr); - -#endif diff --git a/numpy/core/src/common/npy_sort.h.src b/numpy/core/src/common/npy_sort.h.src new file mode 100644 index 000000000..c31a82764 --- /dev/null +++ b/numpy/core/src/common/npy_sort.h.src @@ -0,0 +1,83 @@ +#ifndef __NPY_SORT_H__ +#define __NPY_SORT_H__ + +/* Python include is for future object sorts */ +#include <Python.h> +#include <numpy/npy_common.h> +#include <numpy/ndarraytypes.h> + +#define NPY_ENOMEM 1 +#define NPY_ECOMP 2 + +static NPY_INLINE int npy_get_msb(npy_uintp unum) +{ + int depth_limit = 0; + while (unum >>= 1) { + depth_limit++; + } + return depth_limit; +} + + +/* + ***************************************************************************** + ** NUMERIC SORTS ** + ***************************************************************************** + */ + + +/**begin repeat + * + * #suff = bool, byte, ubyte, short, ushort, int, uint, long, ulong, + * longlong, ulonglong, half, float, double, longdouble, + * cfloat, cdouble, clongdouble, datetime, timedelta# + */ + +int quicksort_@suff@(void *vec, npy_intp cnt, void *null); +int heapsort_@suff@(void *vec, npy_intp cnt, void *null); +int mergesort_@suff@(void *vec, npy_intp cnt, void *null); +int aquicksort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *null); +int aheapsort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *null); +int amergesort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *null); + +/**end repeat**/ + + + +/* + ***************************************************************************** + ** STRING SORTS ** + ***************************************************************************** + */ + + +/**begin repeat + * + * #suff = string, unicode# + */ 
+ +int quicksort_@suff@(void *vec, npy_intp cnt, void *arr); +int heapsort_@suff@(void *vec, npy_intp cnt, void *arr); +int mergesort_@suff@(void *vec, npy_intp cnt, void *arr); +int aquicksort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *arr); +int aheapsort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *arr); +int amergesort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *arr); + +/**end repeat**/ + + +/* + ***************************************************************************** + ** GENERIC SORT ** + ***************************************************************************** + */ + + +int npy_quicksort(void *vec, npy_intp cnt, void *arr); +int npy_heapsort(void *vec, npy_intp cnt, void *arr); +int npy_mergesort(void *vec, npy_intp cnt, void *arr); +int npy_aquicksort(void *vec, npy_intp *ind, npy_intp cnt, void *arr); +int npy_aheapsort(void *vec, npy_intp *ind, npy_intp cnt, void *arr); +int npy_amergesort(void *vec, npy_intp *ind, npy_intp cnt, void *arr); + +#endif diff --git a/numpy/core/src/common/ufunc_override.c b/numpy/core/src/common/ufunc_override.c index 33b54c665..b67422132 100644 --- a/numpy/core/src/common/ufunc_override.c +++ b/numpy/core/src/common/ufunc_override.c @@ -1,10 +1,9 @@ #define NPY_NO_DEPRECATED_API NPY_API_VERSION -#define NO_IMPORT_ARRAY +#define _MULTIARRAYMODULE #include "npy_pycompat.h" #include "get_attr_string.h" #include "npy_import.h" - #include "ufunc_override.h" /* @@ -12,45 +11,39 @@ * is not the default, i.e., the object is not an ndarray, and its * __array_ufunc__ is not the same as that of ndarray. * - * Returns a new reference, the value of type(obj).__array_ufunc__ - * - * If the __array_ufunc__ matches that of ndarray, or does not exist, return - * NULL. - * - * Note that since this module is used with both multiarray and umath, we do - * not have access to PyArray_Type and therewith neither to PyArray_CheckExact - * nor to the default __array_ufunc__ method, so instead we import locally. - * TODO: Can this really not be done more smartly? + * Returns a new reference, the value of type(obj).__array_ufunc__ if it + * exists and is different from that of ndarray, and NULL otherwise. */ NPY_NO_EXPORT PyObject * -get_non_default_array_ufunc(PyObject *obj) +PyUFuncOverride_GetNonDefaultArrayUfunc(PyObject *obj) { - static PyObject *ndarray = NULL; static PyObject *ndarray_array_ufunc = NULL; PyObject *cls_array_ufunc; - /* on first entry, import and cache ndarray and its __array_ufunc__ */ - if (ndarray == NULL) { - npy_cache_import("numpy.core.multiarray", "ndarray", &ndarray); - ndarray_array_ufunc = PyObject_GetAttrString(ndarray, + /* On first entry, cache ndarray's __array_ufunc__ */ + if (ndarray_array_ufunc == NULL) { + ndarray_array_ufunc = PyObject_GetAttrString((PyObject *)&PyArray_Type, "__array_ufunc__"); } /* Fast return for ndarray */ - if ((PyObject *)Py_TYPE(obj) == ndarray) { + if (PyArray_CheckExact(obj)) { return NULL; } - /* does the class define __array_ufunc__? */ + /* + * Does the class define __array_ufunc__? (Note that LookupSpecial has fast + * return for basic python types, so no need to worry about those here) + */ cls_array_ufunc = PyArray_LookupSpecial(obj, "__array_ufunc__"); if (cls_array_ufunc == NULL) { return NULL; } - /* is it different from ndarray.__array_ufunc__? 
*/ - if (cls_array_ufunc != ndarray_array_ufunc) { - return cls_array_ufunc; + /* Ignore if the same as ndarray.__array_ufunc__ */ + if (cls_array_ufunc == ndarray_array_ufunc) { + Py_DECREF(cls_array_ufunc); + return NULL; } - Py_DECREF(cls_array_ufunc); - return NULL; + return cls_array_ufunc; } /* @@ -62,9 +55,9 @@ get_non_default_array_ufunc(PyObject *obj) */ NPY_NO_EXPORT int -has_non_default_array_ufunc(PyObject * obj) +PyUFunc_HasOverride(PyObject * obj) { - PyObject *method = get_non_default_array_ufunc(obj); + PyObject *method = PyUFuncOverride_GetNonDefaultArrayUfunc(obj); if (method) { Py_DECREF(method); return 1; @@ -80,17 +73,17 @@ has_non_default_array_ufunc(PyObject * obj) * The out argument itself is returned in out_kwd_obj, and the outputs * in the out_obj array (all as borrowed references). * - * Returns -1 if kwds is not a dict, 0 if no outputs found. + * Returns 0 if no outputs found, -1 if kwds is not a dict (with an error set). */ -static int -get_out_objects(PyObject *kwds, PyObject **out_kwd_obj, PyObject ***out_objs) +NPY_NO_EXPORT int +PyUFuncOverride_GetOutObjects(PyObject *kwds, PyObject **out_kwd_obj, PyObject ***out_objs) { if (kwds == NULL) { return 0; } if (!PyDict_CheckExact(kwds)) { PyErr_SetString(PyExc_TypeError, - "Internal Numpy error: call to PyUFunc_WithOverride " + "Internal Numpy error: call to PyUFuncOverride_GetOutObjects " "with non-dict kwds"); return -1; } @@ -108,134 +101,3 @@ get_out_objects(PyObject *kwds, PyObject **out_kwd_obj, PyObject ***out_objs) return 1; } } - -/* - * For each positional argument and each argument in a possible "out" - * keyword, look for overrides of the standard ufunc behaviour, i.e., - * non-default __array_ufunc__ methods. - * - * Returns the number of overrides, setting corresponding objects - * in PyObject array ``with_override`` and the corresponding - * __array_ufunc__ methods in ``methods`` (both using new references). - * - * Only the first override for a given class is returned. - * - * returns -1 on failure. - */ -NPY_NO_EXPORT int -PyUFunc_WithOverride(PyObject *args, PyObject *kwds, - PyObject **with_override, PyObject **methods) -{ - int i; - int num_override_args = 0; - int narg, nout = 0; - PyObject *out_kwd_obj; - PyObject **arg_objs, **out_objs; - - narg = PyTuple_Size(args); - if (narg < 0) { - return -1; - } - arg_objs = PySequence_Fast_ITEMS(args); - - nout = get_out_objects(kwds, &out_kwd_obj, &out_objs); - if (nout < 0) { - return -1; - } - - for (i = 0; i < narg + nout; ++i) { - PyObject *obj; - int j; - int new_class = 1; - - if (i < narg) { - obj = arg_objs[i]; - } - else { - obj = out_objs[i - narg]; - } - /* - * Have we seen this class before? If so, ignore. - */ - for (j = 0; j < num_override_args; j++) { - new_class = (Py_TYPE(obj) != Py_TYPE(with_override[j])); - if (!new_class) { - break; - } - } - if (new_class) { - /* - * Now see if the object provides an __array_ufunc__. However, we should - * ignore the base ndarray.__ufunc__, so we skip any ndarray as well as - * any ndarray subclass instances that did not override __array_ufunc__. 
- */ - PyObject *method = get_non_default_array_ufunc(obj); - if (method == NULL) { - continue; - } - if (method == Py_None) { - PyErr_Format(PyExc_TypeError, - "operand '%.200s' does not support ufuncs " - "(__array_ufunc__=None)", - obj->ob_type->tp_name); - Py_DECREF(method); - goto fail; - } - Py_INCREF(obj); - with_override[num_override_args] = obj; - methods[num_override_args] = method; - ++num_override_args; - } - } - return num_override_args; - -fail: - for (i = 0; i < num_override_args; i++) { - Py_DECREF(with_override[i]); - Py_DECREF(methods[i]); - } - return -1; -} - -/* - * Check whether any of a set of input and output args have a non-default - * __array_ufunc__ method. Return 1 if so, 0 if not. - * - * This function primarily exists to help ndarray.__array_ufunc__ determine - * whether it can support a ufunc (which is the case only if none of the - * operands have an override). Thus, unlike in PyUFunc_CheckOverride, the - * actual overrides are not needed and one can stop looking once one is found. - * - * TODO: move this function and has_non_default_array_ufunc closer to ndarray. - */ -NPY_NO_EXPORT int -PyUFunc_HasOverride(PyObject *args, PyObject *kwds) -{ - int i; - int nin, nout; - PyObject *out_kwd_obj; - PyObject **in_objs, **out_objs; - - /* check inputs */ - nin = PyTuple_Size(args); - if (nin < 0) { - return -1; - } - in_objs = PySequence_Fast_ITEMS(args); - for (i = 0; i < nin; ++i) { - if (has_non_default_array_ufunc(in_objs[i])) { - return 1; - } - } - /* check outputs, if any */ - nout = get_out_objects(kwds, &out_kwd_obj, &out_objs); - if (nout < 0) { - return -1; - } - for (i = 0; i < nout; i++) { - if (has_non_default_array_ufunc(out_objs[i])) { - return 1; - } - } - return 0; -} diff --git a/numpy/core/src/common/ufunc_override.h b/numpy/core/src/common/ufunc_override.h index 5b269d270..cc39166b3 100644 --- a/numpy/core/src/common/ufunc_override.h +++ b/numpy/core/src/common/ufunc_override.h @@ -8,18 +8,11 @@ * is not the default, i.e., the object is not an ndarray, and its * __array_ufunc__ is not the same as that of ndarray. * - * Returns a new reference, the value of type(obj).__array_ufunc__ - * - * If the __array_ufunc__ matches that of ndarray, or does not exist, return - * NULL. - * - * Note that since this module is used with both multiarray and umath, we do - * not have access to PyArray_Type and therewith neither to PyArray_CheckExact - * nor to the default __array_ufunc__ method, so instead we import locally. - * TODO: Can this really not be done more smartly? + * Returns a new reference, the value of type(obj).__array_ufunc__ if it + * exists and is different from that of ndarray, and NULL otherwise. */ NPY_NO_EXPORT PyObject * -get_non_default_array_ufunc(PyObject *obj); +PyUFuncOverride_GetNonDefaultArrayUfunc(PyObject *obj); /* * Check whether an object has __array_ufunc__ defined on its class and it @@ -29,18 +22,16 @@ get_non_default_array_ufunc(PyObject *obj); * Returns 1 if this is the case, 0 if not. */ NPY_NO_EXPORT int -has_non_default_array_ufunc(PyObject * obj); +PyUFunc_HasOverride(PyObject *obj); /* - * Check whether a set of input and output args have a non-default - * `__array_ufunc__` method. Returns the number of overrides, setting - * corresponding objects in PyObject array with_override (if not NULL). - * returns -1 on failure. + * Get possible out argument from kwds, and returns the number of outputs + * contained within it: if a tuple, the number of elements in it, 1 otherwise. 
+ * The out argument itself is returned in out_kwd_obj, and the outputs + * in the out_obj array (all as borrowed references). + * + * Returns 0 if no outputs found, -1 if kwds is not a dict (with an error set). */ NPY_NO_EXPORT int -PyUFunc_WithOverride(PyObject *args, PyObject *kwds, - PyObject **with_override, PyObject **methods); - -NPY_NO_EXPORT int -PyUFunc_HasOverride(PyObject *args, PyObject *kwds); +PyUFuncOverride_GetOutObjects(PyObject *kwds, PyObject **out_kwd_obj, PyObject ***out_objs); #endif diff --git a/numpy/core/src/multiarray/_multiarray_tests.c.src b/numpy/core/src/multiarray/_multiarray_tests.c.src index 6c4d49bd1..2a8275572 100644 --- a/numpy/core/src/multiarray/_multiarray_tests.c.src +++ b/numpy/core/src/multiarray/_multiarray_tests.c.src @@ -11,6 +11,13 @@ #include "npy_extint128.h" #include "common.h" + +#if defined(MS_WIN32) || defined(__CYGWIN__) +#define EXPORT(x) __declspec(dllexport) x +#else +#define EXPORT(x) x +#endif + #define ARRAY_SIZE(a) (sizeof(a)/sizeof(a[0])) /* test PyArray_IsPythonScalar, before including private py3 compat header */ @@ -31,6 +38,12 @@ IsPythonScalar(PyObject * dummy, PyObject *args) #include "npy_pycompat.h" +/** Function to test calling via ctypes */ +EXPORT(void*) forward_pointer(void *x) +{ + return x; +} + /* * TODO: * - Handle mode @@ -1855,6 +1868,16 @@ printf_float_g(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds) return PrintFloat_Printf_g(obj, precision); } +static PyObject * +getset_numericops(PyObject* NPY_UNUSED(self), PyObject* NPY_UNUSED(args)) +{ + PyObject * ops = PyArray_GetNumericOps(); + if (ops == NULL) { + return NULL; + } + return PyLong_FromLong(PyArray_SetNumericOps(ops)); +} + static PyMethodDef Multiarray_TestsMethods[] = { {"IsPythonScalar", IsPythonScalar, @@ -1963,6 +1986,9 @@ static PyMethodDef Multiarray_TestsMethods[] = { {"get_fpu_mode", get_fpu_mode, METH_VARARGS, get_fpu_mode_doc}, + {"getset_numericops", + getset_numericops, + METH_NOARGS, NULL}, /**begin repeat * #name = cabs, carg# */ @@ -2040,3 +2066,9 @@ init_multiarray_tests(void) } return RETVAL; } + +NPY_NO_EXPORT int +test_not_exported(void) +{ + return 1; +} diff --git a/numpy/core/src/multiarray/compiled_base.c b/numpy/core/src/multiarray/compiled_base.c index e8380e3bc..17d8baf7b 100644 --- a/numpy/core/src/multiarray/compiled_base.c +++ b/numpy/core/src/multiarray/compiled_base.c @@ -1158,26 +1158,6 @@ arr_unravel_index(PyObject *self, PyObject *args, PyObject *kwds) char *kwlist[] = {"indices", "shape", "order", NULL}; - /* Continue to support the older "dims" argument in place - * of the "shape" argument. 
Issue an appropriate warning - * if "dims" is detected in keywords, then replace it with - * the new "shape" argument and continue processing as usual */ - - - if (kwds) { - PyObject *dims_item, *shape_item; - dims_item = PyDict_GetItemString(kwds, "dims"); - shape_item = PyDict_GetItemString(kwds, "shape"); - if (dims_item != NULL && shape_item == NULL) { - if (DEPRECATE("'shape' argument should be" - " used instead of 'dims'") < 0) { - return NULL; - } - PyDict_SetItemString(kwds, "shape", dims_item); - PyDict_DelItemString(kwds, "dims"); - } - } - if (!PyArg_ParseTupleAndKeywords(args, kwds, "OO&|O&:unravel_index", kwlist, &indices0, diff --git a/numpy/core/src/multiarray/dtype_transfer.c b/numpy/core/src/multiarray/dtype_transfer.c index 97d899ce0..2b29d4f8c 100644 --- a/numpy/core/src/multiarray/dtype_transfer.c +++ b/numpy/core/src/multiarray/dtype_transfer.c @@ -51,6 +51,20 @@ #endif /**********************************************/ +#if NPY_DT_DBG_TRACING +/* + * Thin wrapper around print that ignores exceptions + */ +static void +_safe_print(PyObject *obj) +{ + if (PyObject_Print(obj, stdout, 0) < 0) { + PyErr_Clear(); + printf("<error during print>"); + } +} +#endif + /* * Returns a transfer function which DECREFs any references in src_type. * @@ -1042,9 +1056,9 @@ get_nbo_cast_datetime_transfer_function(int aligned, #if NPY_DT_DBG_TRACING printf("Dtype transfer from "); - PyObject_Print((PyObject *)src_dtype, stdout, 0); + _safe_print((PyObject *)src_dtype); printf(" to "); - PyObject_Print((PyObject *)dst_dtype, stdout, 0); + _safe_print((PyObject *)dst_dtype); printf("\n"); printf("has conversion fraction %lld/%lld\n", num, denom); #endif @@ -1089,9 +1103,9 @@ get_nbo_datetime_to_string_transfer_function(int aligned, #if NPY_DT_DBG_TRACING printf("Dtype transfer from "); - PyObject_Print((PyObject *)src_dtype, stdout, 0); + _safe_print((PyObject *)src_dtype); printf(" to "); - PyObject_Print((PyObject *)dst_dtype, stdout, 0); + _safe_print((PyObject *)dst_dtype); printf("\n"); #endif @@ -1211,9 +1225,9 @@ get_nbo_string_to_datetime_transfer_function(int aligned, #if NPY_DT_DBG_TRACING printf("Dtype transfer from "); - PyObject_Print((PyObject *)src_dtype, stdout, 0); + _safe_print((PyObject *)src_dtype); printf(" to "); - PyObject_Print((PyObject *)dst_dtype, stdout, 0); + _safe_print((PyObject *)dst_dtype); printf("\n"); #endif @@ -3421,9 +3435,13 @@ PyArray_GetDTypeTransferFunction(int aligned, #if NPY_DT_DBG_TRACING printf("Calculating dtype transfer from "); - PyObject_Print((PyObject *)src_dtype, stdout, 0); + if (PyObject_Print((PyObject *)src_dtype, stdout, 0) < 0) { + return NPY_FAIL; + } printf(" to "); - PyObject_Print((PyObject *)dst_dtype, stdout, 0); + if (PyObject_Print((PyObject *)dst_dtype, stdout, 0) < 0) { + return NPY_FAIL; + } printf("\n"); #endif diff --git a/numpy/core/src/multiarray/methods.c b/numpy/core/src/multiarray/methods.c index 23b0bfd24..231bd86dc 100644 --- a/numpy/core/src/multiarray/methods.c +++ b/numpy/core/src/multiarray/methods.c @@ -988,8 +988,49 @@ array_getarray(PyArrayObject *self, PyObject *args) } } +/* + * Check whether any of a set of input and output args have a non-default + * __array_ufunc__ method. Return 1 if so, 0 if not, and -1 on error. + * + * This function primarily exists to help ndarray.__array_ufunc__ determine + * whether it can support a ufunc (which is the case only if none of the + * operands have an override). 
Thus, unlike in umath/override.c, the + * actual overrides are not needed and one can stop looking once one is found. + */ +static int +any_array_ufunc_overrides(PyObject *args, PyObject *kwds) +{ + int i; + int nin, nout; + PyObject *out_kwd_obj; + PyObject **in_objs, **out_objs; -static PyObject * + /* check inputs */ + nin = PyTuple_Size(args); + if (nin < 0) { + return -1; + } + in_objs = PySequence_Fast_ITEMS(args); + for (i = 0; i < nin; ++i) { + if (PyUFunc_HasOverride(in_objs[i])) { + return 1; + } + } + /* check outputs, if any */ + nout = PyUFuncOverride_GetOutObjects(kwds, &out_kwd_obj, &out_objs); + if (nout < 0) { + return -1; + } + for (i = 0; i < nout; i++) { + if (PyUFunc_HasOverride(out_objs[i])) { + return 1; + } + } + return 0; +} + + +NPY_NO_EXPORT PyObject * array_ufunc(PyArrayObject *self, PyObject *args, PyObject *kwds) { PyObject *ufunc, *method_name, *normal_args, *ufunc_method; @@ -1009,7 +1050,7 @@ array_ufunc(PyArrayObject *self, PyObject *args, PyObject *kwds) return NULL; } /* ndarray cannot handle overrides itself */ - has_override = PyUFunc_HasOverride(normal_args, kwds); + has_override = any_array_ufunc_overrides(normal_args, kwds); if (has_override < 0) { goto cleanup; } diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c index 8f782cff6..909a24359 100644 --- a/numpy/core/src/multiarray/multiarraymodule.c +++ b/numpy/core/src/multiarray/multiarraymodule.c @@ -833,7 +833,10 @@ PyArray_InnerProduct(PyObject *op1, PyObject *op2) typenum = PyArray_ObjectType(op2, typenum); typec = PyArray_DescrFromType(typenum); if (typec == NULL) { - PyErr_SetString(PyExc_TypeError, "Cannot find a common data type."); + if (!PyErr_Occurred()) { + PyErr_SetString(PyExc_TypeError, + "Cannot find a common data type."); + } goto fail; } @@ -919,7 +922,10 @@ PyArray_MatrixProduct2(PyObject *op1, PyObject *op2, PyArrayObject* out) typenum = PyArray_ObjectType(op2, typenum); typec = PyArray_DescrFromType(typenum); if (typec == NULL) { - PyErr_SetString(PyExc_TypeError, "Cannot find a common data type."); + if (!PyErr_Occurred()) { + PyErr_SetString(PyExc_TypeError, + "Cannot find a common data type."); + } return NULL; } @@ -2044,6 +2050,7 @@ static PyObject * array_fromfile(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *keywds) { PyObject *file = NULL, *ret; + PyObject *err_type = NULL, *err_value = NULL, *err_traceback = NULL; char *sep = ""; Py_ssize_t nin = -1; static char *kwlist[] = {"file", "dtype", "count", "sep", NULL}; @@ -2079,18 +2086,26 @@ array_fromfile(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *keywds) } ret = PyArray_FromFile(fp, type, (npy_intp) nin, sep); + /* If an exception is thrown in the call to PyArray_FromFile + * we need to clear it, and restore it later to ensure that + * we can cleanup the duplicated file descriptor properly. 
+ */ + PyErr_Fetch(&err_type, &err_value, &err_traceback); if (npy_PyFile_DupClose2(file, fp, orig_pos) < 0) { + npy_PyErr_ChainExceptions(err_type, err_value, err_traceback); goto fail; } if (own && npy_PyFile_CloseFile(file) < 0) { + npy_PyErr_ChainExceptions(err_type, err_value, err_traceback); goto fail; } + PyErr_Restore(err_type, err_value, err_traceback); Py_DECREF(file); return ret; fail: Py_DECREF(file); - Py_DECREF(ret); + Py_XDECREF(ret); return NULL; } @@ -2349,7 +2364,10 @@ array_matmul(PyObject *NPY_UNUSED(m), PyObject *args, PyObject* kwds) dtype = PyArray_DescrFromObject(in1, NULL); dtype = PyArray_DescrFromObject(in2, dtype); if (dtype == NULL) { - PyErr_SetString(PyExc_ValueError, "Cannot find a common data type."); + if (!PyErr_Occurred()) { + PyErr_SetString(PyExc_ValueError, + "Cannot find a common data type."); + } return NULL; } typenum = dtype->type_num; @@ -2990,7 +3008,7 @@ array_set_ops_function(PyObject *NPY_UNUSED(self), PyObject *NPY_UNUSED(args), { PyObject *oldops = NULL; - if ((oldops = PyArray_GetNumericOps()) == NULL) { + if ((oldops = _PyArray_GetNumericOps()) == NULL) { return NULL; } /* @@ -3000,8 +3018,10 @@ array_set_ops_function(PyObject *NPY_UNUSED(self), PyObject *NPY_UNUSED(args), */ if (kwds && PyArray_SetNumericOps(kwds) == -1) { Py_DECREF(oldops); - PyErr_SetString(PyExc_ValueError, + if (PyErr_Occurred() == NULL) { + PyErr_SetString(PyExc_ValueError, "one or more objects not callable"); + } return NULL; } return oldops; diff --git a/numpy/core/src/multiarray/number.c b/numpy/core/src/multiarray/number.c index dabbae064..5ee536d4f 100644 --- a/numpy/core/src/multiarray/number.c +++ b/numpy/core/src/multiarray/number.c @@ -71,12 +71,8 @@ array_inplace_power(PyArrayObject *a1, PyObject *o2, PyObject *NPY_UNUSED(modulo n_ops.op = temp; \ } - -/*NUMPY_API - *Set internal structure with number functions that all arrays will use - */ NPY_NO_EXPORT int -PyArray_SetNumericOps(PyObject *dict) +_PyArray_SetNumericOps(PyObject *dict) { PyObject *temp = NULL; SET(add); @@ -119,16 +115,28 @@ PyArray_SetNumericOps(PyObject *dict) return 0; } +/*NUMPY_API + *Set internal structure with number functions that all arrays will use + */ +NPY_NO_EXPORT int +PyArray_SetNumericOps(PyObject *dict) +{ + /* 2018-09-09, 1.16 */ + if (DEPRECATE("PyArray_SetNumericOps is deprecated. 
Use " + "PyUFunc_ReplaceLoopBySignature to replace ufunc inner loop functions " + "instead.") < 0) { + return -1; + } + return _PyArray_SetNumericOps(dict); +} + /* Note - macro contains goto */ #define GET(op) if (n_ops.op && \ (PyDict_SetItemString(dict, #op, n_ops.op)==-1)) \ goto fail; -/*NUMPY_API - Get dictionary showing number functions that all arrays will use -*/ NPY_NO_EXPORT PyObject * -PyArray_GetNumericOps(void) +_PyArray_GetNumericOps(void) { PyObject *dict; if ((dict = PyDict_New())==NULL) @@ -176,6 +184,19 @@ PyArray_GetNumericOps(void) return NULL; } +/*NUMPY_API + Get dictionary showing number functions that all arrays will use +*/ +NPY_NO_EXPORT PyObject * +PyArray_GetNumericOps(void) +{ + /* 2018-09-09, 1.16 */ + if (DEPRECATE("PyArray_GetNumericOps is deprecated.") < 0) { + return NULL; + } + return _PyArray_GetNumericOps(); +} + static PyObject * _get_keywords(int rtype, PyArrayObject *out) { @@ -578,7 +599,7 @@ array_positive(PyArrayObject *m1) */ PyObject *exc, *val, *tb; PyErr_Fetch(&exc, &val, &tb); - if (has_non_default_array_ufunc((PyObject *)m1)) { + if (PyUFunc_HasOverride((PyObject *)m1)) { PyErr_Restore(exc, val, tb); return NULL; } diff --git a/numpy/core/src/multiarray/number.h b/numpy/core/src/multiarray/number.h index 99a2a722b..fbdfe6f94 100644 --- a/numpy/core/src/multiarray/number.h +++ b/numpy/core/src/multiarray/number.h @@ -48,10 +48,10 @@ NPY_NO_EXPORT PyObject * array_int(PyArrayObject *v); NPY_NO_EXPORT int -PyArray_SetNumericOps(PyObject *dict); +_PyArray_SetNumericOps(PyObject *dict); NPY_NO_EXPORT PyObject * -PyArray_GetNumericOps(void); +_PyArray_GetNumericOps(void); NPY_NO_EXPORT PyObject * PyArray_GenericBinaryFunction(PyArrayObject *m1, PyObject *m2, PyObject *op); diff --git a/numpy/core/src/multiarray/temp_elide.c b/numpy/core/src/multiarray/temp_elide.c index 3d2f976f2..09b948218 100644 --- a/numpy/core/src/multiarray/temp_elide.c +++ b/numpy/core/src/multiarray/temp_elide.c @@ -166,7 +166,7 @@ check_callers(int * cannot) return 0; } /* get multiarray base address */ - if (dladdr(&PyArray_SetNumericOps, &info)) { + if (dladdr(&PyArray_INCREF, &info)) { pos_ma_start = info.dli_fbase; pos_ma_end = info.dli_fbase; } diff --git a/numpy/core/src/umath/override.c b/numpy/core/src/umath/override.c index 4a381ba12..c56f43fa2 100644 --- a/numpy/core/src/umath/override.c +++ b/numpy/core/src/umath/override.c @@ -5,8 +5,96 @@ #include "numpy/ufuncobject.h" #include "npy_import.h" -#include "ufunc_override.h" #include "override.h" +#include "ufunc_override.h" + +/* + * For each positional argument and each argument in a possible "out" + * keyword, look for overrides of the standard ufunc behaviour, i.e., + * non-default __array_ufunc__ methods. + * + * Returns the number of overrides, setting corresponding objects + * in PyObject array ``with_override`` and the corresponding + * __array_ufunc__ methods in ``methods`` (both using new references). + * + * Only the first override for a given class is returned. + * + * Returns -1 on failure. 
+ */ +static int +get_array_ufunc_overrides(PyObject *args, PyObject *kwds, + PyObject **with_override, PyObject **methods) +{ + int i; + int num_override_args = 0; + int narg, nout = 0; + PyObject *out_kwd_obj; + PyObject **arg_objs, **out_objs; + + narg = PyTuple_Size(args); + if (narg < 0) { + return -1; + } + arg_objs = PySequence_Fast_ITEMS(args); + + nout = PyUFuncOverride_GetOutObjects(kwds, &out_kwd_obj, &out_objs); + if (nout < 0) { + return -1; + } + + for (i = 0; i < narg + nout; ++i) { + PyObject *obj; + int j; + int new_class = 1; + + if (i < narg) { + obj = arg_objs[i]; + } + else { + obj = out_objs[i - narg]; + } + /* + * Have we seen this class before? If so, ignore. + */ + for (j = 0; j < num_override_args; j++) { + new_class = (Py_TYPE(obj) != Py_TYPE(with_override[j])); + if (!new_class) { + break; + } + } + if (new_class) { + /* + * Now see if the object provides an __array_ufunc__. However, we should + * ignore the base ndarray.__ufunc__, so we skip any ndarray as well as + * any ndarray subclass instances that did not override __array_ufunc__. + */ + PyObject *method = PyUFuncOverride_GetNonDefaultArrayUfunc(obj); + if (method == NULL) { + continue; + } + if (method == Py_None) { + PyErr_Format(PyExc_TypeError, + "operand '%.200s' does not support ufuncs " + "(__array_ufunc__=None)", + obj->ob_type->tp_name); + Py_DECREF(method); + goto fail; + } + Py_INCREF(obj); + with_override[num_override_args] = obj; + methods[num_override_args] = method; + ++num_override_args; + } + } + return num_override_args; + +fail: + for (i = 0; i < num_override_args; i++) { + Py_DECREF(with_override[i]); + Py_DECREF(methods[i]); + } + return -1; +} /* * The following functions normalize ufunc arguments. The work done is similar @@ -359,7 +447,7 @@ PyUFunc_CheckOverride(PyUFuncObject *ufunc, char *method, /* * Check inputs for overrides */ - num_override_args = PyUFunc_WithOverride( + num_override_args = get_array_ufunc_overrides( args, kwds, with_override, array_ufunc_methods); if (num_override_args == -1) { goto fail; diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index da0713b2b..a3e00b5c1 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -32,6 +32,14 @@ #include <float.h> #include <string.h> /* for memcpy */ +#if defined __AVX512F__ +#define VECTOR_SIZE_BYTES 64 +#elif defined __AVX2__ +#define VECTOR_SIZE_BYTES 32 +#else +#define VECTOR_SIZE_BYTES 16 +#endif + static NPY_INLINE npy_uintp abs_ptrdiff(char *a, char *b) { @@ -144,7 +152,7 @@ static NPY_INLINE int run_@name@_simd_@func@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps) { #if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS - if (@check@(sizeof(@type@), 16)) { + if (@check@(sizeof(@type@), VECTOR_SIZE_BYTES)) { sse2_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0]); return 1; } @@ -183,16 +191,16 @@ run_binary_simd_@kind@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps @type@ * op = (@type@ *)args[2]; npy_intp n = dimensions[0]; /* argument one scalar */ - if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), 16)) { + if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), VECTOR_SIZE_BYTES)) { sse2_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n); return 1; } /* argument two scalar */ - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), 16)) { + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), VECTOR_SIZE_BYTES)) { sse2_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n); return 1; } - else if (IS_BLOCKABLE_BINARY(sizeof(@type@), 16)) 
{ + else if (IS_BLOCKABLE_BINARY(sizeof(@type@), VECTOR_SIZE_BYTES)) { sse2_binary_@kind@_@TYPE@(op, ip1, ip2, n); return 1; } @@ -232,16 +240,16 @@ run_binary_simd_@kind@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps npy_bool * op = (npy_bool *)args[2]; npy_intp n = dimensions[0]; /* argument one scalar */ - if (IS_BLOCKABLE_BINARY_SCALAR1_BOOL(sizeof(@type@), 16)) { + if (IS_BLOCKABLE_BINARY_SCALAR1_BOOL(sizeof(@type@), VECTOR_SIZE_BYTES)) { sse2_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n); return 1; } /* argument two scalar */ - else if (IS_BLOCKABLE_BINARY_SCALAR2_BOOL(sizeof(@type@), 16)) { + else if (IS_BLOCKABLE_BINARY_SCALAR2_BOOL(sizeof(@type@), VECTOR_SIZE_BYTES)) { sse2_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n); return 1; } - else if (IS_BLOCKABLE_BINARY_BOOL(sizeof(@type@), 16)) { + else if (IS_BLOCKABLE_BINARY_BOOL(sizeof(@type@), VECTOR_SIZE_BYTES)) { sse2_binary_@kind@_@TYPE@(op, ip1, ip2, n); return 1; } @@ -302,7 +310,8 @@ static NPY_INLINE int run_binary_simd_@kind@_BOOL(char **args, npy_intp *dimensions, npy_intp *steps) { #if defined NPY_HAVE_SSE2_INTRINSICS - if (sizeof(npy_bool) == 1 && IS_BLOCKABLE_BINARY(sizeof(npy_bool), 16)) { + if (sizeof(npy_bool) == 1 && + IS_BLOCKABLE_BINARY(sizeof(npy_bool), VECTOR_SIZE_BYTES)) { sse2_binary_@kind@_BOOL((npy_bool*)args[2], (npy_bool*)args[0], (npy_bool*)args[1], dimensions[0]); return 1; @@ -316,7 +325,8 @@ static NPY_INLINE int run_reduce_simd_@kind@_BOOL(char **args, npy_intp *dimensions, npy_intp *steps) { #if defined NPY_HAVE_SSE2_INTRINSICS - if (sizeof(npy_bool) == 1 && IS_BLOCKABLE_REDUCE(sizeof(npy_bool), 16)) { + if (sizeof(npy_bool) == 1 && + IS_BLOCKABLE_REDUCE(sizeof(npy_bool), VECTOR_SIZE_BYTES)) { sse2_reduce_@kind@_BOOL((npy_bool*)args[0], (npy_bool*)args[1], dimensions[0]); return 1; @@ -340,7 +350,8 @@ static NPY_INLINE int run_unary_simd_@kind@_BOOL(char **args, npy_intp *dimensions, npy_intp *steps) { #if defined NPY_HAVE_SSE2_INTRINSICS - if (sizeof(npy_bool) == 1 && IS_BLOCKABLE_UNARY(sizeof(npy_bool), 16)) { + if (sizeof(npy_bool) == 1 && + IS_BLOCKABLE_UNARY(sizeof(npy_bool), VECTOR_SIZE_BYTES)) { sse2_@kind@_BOOL((npy_bool*)args[1], (npy_bool*)args[0], dimensions[0]); return 1; } @@ -416,19 +427,19 @@ static void sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) { #ifdef __AVX512F__ - LOOP_BLOCK_ALIGN_VAR(op, @type@, 64) + LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES) op[i] = ip1[i] @OP@ ip2[i]; /* lots of specializations, to squeeze out max performance */ - if (npy_is_aligned(&ip1[i], 64) && npy_is_aligned(&ip2[i], 64)) { + if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES) && npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) { if (ip1 == ip2) { - LOOP_BLOCKED(@type@, 64) { + LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]); @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, a); @vpre512@_store_@vsuf@(&op[i], c); } } else { - LOOP_BLOCKED(@type@, 64) { + LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]); @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]); @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); @@ -436,16 +447,16 @@ sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) } } } - else if (npy_is_aligned(&ip1[i], 64)) { - LOOP_BLOCKED(@type@, 64) { + else if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES)) { + LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]); @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]); @vtype512@ c = 
@vpre512@_@VOP@_@vsuf@(a, b); @vpre512@_store_@vsuf@(&op[i], c); } } - else if (npy_is_aligned(&ip2[i], 64)) { - LOOP_BLOCKED(@type@, 64) { + else if (npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) { + LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]); @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]); @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); @@ -454,14 +465,14 @@ sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) } else { if (ip1 == ip2) { - LOOP_BLOCKED(@type@, 64) { + LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]); @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, a); @vpre512@_store_@vsuf@(&op[i], c); } } else { - LOOP_BLOCKED(@type@, 64) { + LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]); @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]); @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); @@ -470,19 +481,20 @@ sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) } } #elif __AVX2__ - LOOP_BLOCK_ALIGN_VAR(op, @type@, 32) + LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES) op[i] = ip1[i] @OP@ ip2[i]; /* lots of specializations, to squeeze out max performance */ - if (npy_is_aligned(&ip1[i], 32) && npy_is_aligned(&ip2[i], 32)) { + if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES) && + npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) { if (ip1 == ip2) { - LOOP_BLOCKED(@type@, 32) { + LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]); @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, a); @vpre256@_store_@vsuf@(&op[i], c); } } else { - LOOP_BLOCKED(@type@, 32) { + LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]); @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]); @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); @@ -490,16 +502,16 @@ sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) } } } - else if (npy_is_aligned(&ip1[i], 32)) { - LOOP_BLOCKED(@type@, 32) { + else if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES)) { + LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]); @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]); @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); @vpre256@_store_@vsuf@(&op[i], c); } } - else if (npy_is_aligned(&ip2[i], 32)) { - LOOP_BLOCKED(@type@, 32) { + else if (npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) { + LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]); @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]); @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); @@ -508,14 +520,14 @@ sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) } else { if (ip1 == ip2) { - LOOP_BLOCKED(@type@, 32) { + LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]); @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, a); @vpre256@_store_@vsuf@(&op[i], c); } } else { - LOOP_BLOCKED(@type@, 32) { + LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]); @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]); @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); @@ -524,19 +536,20 @@ sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) } } #else - LOOP_BLOCK_ALIGN_VAR(op, @type@, 16) + LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES) op[i] = ip1[i] @OP@ ip2[i]; /* lots of specializations, to squeeze out max performance */ - if (npy_is_aligned(&ip1[i], 16) && npy_is_aligned(&ip2[i], 16)) { + if 
(npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES) && + npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) { if (ip1 == ip2) { - LOOP_BLOCKED(@type@, 16) { + LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]); @vtype@ c = @vpre@_@VOP@_@vsuf@(a, a); @vpre@_store_@vsuf@(&op[i], c); } } else { - LOOP_BLOCKED(@type@, 16) { + LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]); @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]); @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); @@ -544,16 +557,16 @@ sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) } } } - else if (npy_is_aligned(&ip1[i], 16)) { - LOOP_BLOCKED(@type@, 16) { + else if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES)) { + LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]); @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]); @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); @vpre@_store_@vsuf@(&op[i], c); } } - else if (npy_is_aligned(&ip2[i], 16)) { - LOOP_BLOCKED(@type@, 16) { + else if (npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) { + LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]); @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]); @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); @@ -562,14 +575,14 @@ sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) } else { if (ip1 == ip2) { - LOOP_BLOCKED(@type@, 16) { + LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]); @vtype@ c = @vpre@_@VOP@_@vsuf@(a, a); @vpre@_store_@vsuf@(&op[i], c); } } else { - LOOP_BLOCKED(@type@, 16) { + LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]); @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]); @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); @@ -589,17 +602,17 @@ sse2_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_i { #ifdef __AVX512F__ const @vtype512@ a = @vpre512@_set1_@vsuf@(ip1[0]); - LOOP_BLOCK_ALIGN_VAR(op, @type@, 64) + LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES) op[i] = ip1[0] @OP@ ip2[i]; - if (npy_is_aligned(&ip2[i], 64)) { - LOOP_BLOCKED(@type@, 64) { + if (npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) { + LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]); @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); @vpre512@_store_@vsuf@(&op[i], c); } } else { - LOOP_BLOCKED(@type@, 64) { + LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]); @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); @vpre512@_store_@vsuf@(&op[i], c); @@ -609,17 +622,17 @@ sse2_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_i #elif __AVX2__ const @vtype256@ a = @vpre256@_set1_@vsuf@(ip1[0]); - LOOP_BLOCK_ALIGN_VAR(op, @type@, 32) + LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES) op[i] = ip1[0] @OP@ ip2[i]; - if (npy_is_aligned(&ip2[i], 32)) { - LOOP_BLOCKED(@type@, 32) { + if (npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) { + LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]); @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); @vpre256@_store_@vsuf@(&op[i], c); } } else { - LOOP_BLOCKED(@type@, 32) { + LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]); @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); @vpre256@_store_@vsuf@(&op[i], c); @@ -627,17 +640,17 @@ sse2_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_i } #else const @vtype@ a = @vpre@_set1_@vsuf@(ip1[0]); - LOOP_BLOCK_ALIGN_VAR(op, @type@, 16) + 
LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES) op[i] = ip1[0] @OP@ ip2[i]; - if (npy_is_aligned(&ip2[i], 16)) { - LOOP_BLOCKED(@type@, 16) { + if (npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) { + LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]); @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); @vpre@_store_@vsuf@(&op[i], c); } } else { - LOOP_BLOCKED(@type@, 16) { + LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]); @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); @vpre@_store_@vsuf@(&op[i], c); @@ -655,17 +668,17 @@ sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_i { #ifdef __AVX512F__ const @vtype512@ b = @vpre512@_set1_@vsuf@(ip2[0]); - LOOP_BLOCK_ALIGN_VAR(op, @type@, 64) + LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES) op[i] = ip1[i] @OP@ ip2[0]; - if (npy_is_aligned(&ip1[i], 64)) { - LOOP_BLOCKED(@type@, 64) { + if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES)) { + LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]); @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); @vpre512@_store_@vsuf@(&op[i], c); } } else { - LOOP_BLOCKED(@type@, 64) { + LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]); @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); @vpre512@_store_@vsuf@(&op[i], c); @@ -674,17 +687,17 @@ sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_i #elif __AVX2__ const @vtype256@ b = @vpre256@_set1_@vsuf@(ip2[0]); - LOOP_BLOCK_ALIGN_VAR(op, @type@, 32) + LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES) op[i] = ip1[i] @OP@ ip2[0]; - if (npy_is_aligned(&ip1[i], 32)) { - LOOP_BLOCKED(@type@, 32) { + if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES)) { + LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]); @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); @vpre256@_store_@vsuf@(&op[i], c); } } else { - LOOP_BLOCKED(@type@, 32) { + LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]); @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); @vpre256@_store_@vsuf@(&op[i], c); @@ -692,17 +705,17 @@ sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_i } #else const @vtype@ b = @vpre@_set1_@vsuf@(ip2[0]); - LOOP_BLOCK_ALIGN_VAR(op, @type@, 16) + LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES) op[i] = ip1[i] @OP@ ip2[0]; - if (npy_is_aligned(&ip1[i], 16)) { - LOOP_BLOCKED(@type@, 16) { + if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES)) { + LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]); @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); @vpre@_store_@vsuf@(&op[i], c); } } else { - LOOP_BLOCKED(@type@, 16) { + LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]); @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); @vpre@_store_@vsuf@(&op[i], c); @@ -742,10 +755,10 @@ sse2_compress4_to_byte_@TYPE@(@vtype@ r1, @vtype@ r2, @vtype@ r3, @vtype@ * r4, static void sse2_signbit_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n) { - LOOP_BLOCK_ALIGN_VAR(ip1, @type@, 16) { + LOOP_BLOCK_ALIGN_VAR(ip1, @type@, VECTOR_SIZE_BYTES) { op[i] = npy_signbit(ip1[i]) != 0; } - LOOP_BLOCKED(@type@, 16) { + LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]); int r = @vpre@_movemask_@vsuf@(a); if (sizeof(@type@) == 8) { @@ -783,14 +796,14 @@ sse2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n) const @vtype@ fltmax = @vpre@_set1_@vsuf@(FLT_MAX); #endif #endif - LOOP_BLOCK_ALIGN_VAR(ip1, 
@type@, 16) { + LOOP_BLOCK_ALIGN_VAR(ip1, @type@, VECTOR_SIZE_BYTES) { op[i] = npy_@kind@(ip1[i]) != 0; } - LOOP_BLOCKED(@type@, 64) { - @vtype@ a = @vpre@_load_@vsuf@(&ip1[i + 0 * 16 / sizeof(@type@)]); - @vtype@ b = @vpre@_load_@vsuf@(&ip1[i + 1 * 16 / sizeof(@type@)]); - @vtype@ c = @vpre@_load_@vsuf@(&ip1[i + 2 * 16 / sizeof(@type@)]); - @vtype@ d = @vpre@_load_@vsuf@(&ip1[i + 3 * 16 / sizeof(@type@)]); + LOOP_BLOCKED(@type@, 4 * VECTOR_SIZE_BYTES) { + @vtype@ a = @vpre@_load_@vsuf@(&ip1[i + 0 * VECTOR_SIZE_BYTES / sizeof(@type@)]); + @vtype@ b = @vpre@_load_@vsuf@(&ip1[i + 1 * VECTOR_SIZE_BYTES / sizeof(@type@)]); + @vtype@ c = @vpre@_load_@vsuf@(&ip1[i + 2 * VECTOR_SIZE_BYTES / sizeof(@type@)]); + @vtype@ d = @vpre@_load_@vsuf@(&ip1[i + 3 * VECTOR_SIZE_BYTES / sizeof(@type@)]); @vtype@ r1, r2, r3, r4; #if @var@ != 0 /* isinf/isfinite */ /* fabs via masking of sign bit */ @@ -853,18 +866,18 @@ sse2_ordered_cmp_@kind@_@TYPE@(const @type@ a, const @type@ b) static void sse2_binary_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy_intp n) { - LOOP_BLOCK_ALIGN_VAR(ip1, @type@, 16) { + LOOP_BLOCK_ALIGN_VAR(ip1, @type@, VECTOR_SIZE_BYTES) { op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[i]); } - LOOP_BLOCKED(@type@, 64) { - @vtype@ a1 = @vpre@_load_@vsuf@(&ip1[i + 0 * 16 / sizeof(@type@)]); - @vtype@ b1 = @vpre@_load_@vsuf@(&ip1[i + 1 * 16 / sizeof(@type@)]); - @vtype@ c1 = @vpre@_load_@vsuf@(&ip1[i + 2 * 16 / sizeof(@type@)]); - @vtype@ d1 = @vpre@_load_@vsuf@(&ip1[i + 3 * 16 / sizeof(@type@)]); - @vtype@ a2 = @vpre@_loadu_@vsuf@(&ip2[i + 0 * 16 / sizeof(@type@)]); - @vtype@ b2 = @vpre@_loadu_@vsuf@(&ip2[i + 1 * 16 / sizeof(@type@)]); - @vtype@ c2 = @vpre@_loadu_@vsuf@(&ip2[i + 2 * 16 / sizeof(@type@)]); - @vtype@ d2 = @vpre@_loadu_@vsuf@(&ip2[i + 3 * 16 / sizeof(@type@)]); + LOOP_BLOCKED(@type@, 4 * VECTOR_SIZE_BYTES) { + @vtype@ a1 = @vpre@_load_@vsuf@(&ip1[i + 0 * VECTOR_SIZE_BYTES / sizeof(@type@)]); + @vtype@ b1 = @vpre@_load_@vsuf@(&ip1[i + 1 * VECTOR_SIZE_BYTES / sizeof(@type@)]); + @vtype@ c1 = @vpre@_load_@vsuf@(&ip1[i + 2 * VECTOR_SIZE_BYTES / sizeof(@type@)]); + @vtype@ d1 = @vpre@_load_@vsuf@(&ip1[i + 3 * VECTOR_SIZE_BYTES / sizeof(@type@)]); + @vtype@ a2 = @vpre@_loadu_@vsuf@(&ip2[i + 0 * VECTOR_SIZE_BYTES / sizeof(@type@)]); + @vtype@ b2 = @vpre@_loadu_@vsuf@(&ip2[i + 1 * VECTOR_SIZE_BYTES / sizeof(@type@)]); + @vtype@ c2 = @vpre@_loadu_@vsuf@(&ip2[i + 2 * VECTOR_SIZE_BYTES / sizeof(@type@)]); + @vtype@ d2 = @vpre@_loadu_@vsuf@(&ip2[i + 3 * VECTOR_SIZE_BYTES / sizeof(@type@)]); @vtype@ r1 = @vpre@_@VOP@_@vsuf@(a1, a2); @vtype@ r2 = @vpre@_@VOP@_@vsuf@(b1, b2); @vtype@ r3 = @vpre@_@VOP@_@vsuf@(c1, c2); @@ -881,14 +894,14 @@ static void sse2_binary_scalar1_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy_intp n) { @vtype@ s = @vpre@_set1_@vsuf@(ip1[0]); - LOOP_BLOCK_ALIGN_VAR(ip2, @type@, 16) { + LOOP_BLOCK_ALIGN_VAR(ip2, @type@, VECTOR_SIZE_BYTES) { op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[0], ip2[i]); } - LOOP_BLOCKED(@type@, 64) { - @vtype@ a = @vpre@_load_@vsuf@(&ip2[i + 0 * 16 / sizeof(@type@)]); - @vtype@ b = @vpre@_load_@vsuf@(&ip2[i + 1 * 16 / sizeof(@type@)]); - @vtype@ c = @vpre@_load_@vsuf@(&ip2[i + 2 * 16 / sizeof(@type@)]); - @vtype@ d = @vpre@_load_@vsuf@(&ip2[i + 3 * 16 / sizeof(@type@)]); + LOOP_BLOCKED(@type@, 4 * VECTOR_SIZE_BYTES) { + @vtype@ a = @vpre@_load_@vsuf@(&ip2[i + 0 * VECTOR_SIZE_BYTES / sizeof(@type@)]); + @vtype@ b = @vpre@_load_@vsuf@(&ip2[i + 1 * VECTOR_SIZE_BYTES / sizeof(@type@)]); + @vtype@ c = 
@vpre@_load_@vsuf@(&ip2[i + 2 * VECTOR_SIZE_BYTES / sizeof(@type@)]); + @vtype@ d = @vpre@_load_@vsuf@(&ip2[i + 3 * VECTOR_SIZE_BYTES / sizeof(@type@)]); @vtype@ r1 = @vpre@_@VOP@_@vsuf@(s, a); @vtype@ r2 = @vpre@_@VOP@_@vsuf@(s, b); @vtype@ r3 = @vpre@_@VOP@_@vsuf@(s, c); @@ -905,14 +918,14 @@ static void sse2_binary_scalar2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy_intp n) { @vtype@ s = @vpre@_set1_@vsuf@(ip2[0]); - LOOP_BLOCK_ALIGN_VAR(ip1, @type@, 16) { + LOOP_BLOCK_ALIGN_VAR(ip1, @type@, VECTOR_SIZE_BYTES) { op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[0]); } - LOOP_BLOCKED(@type@, 64) { - @vtype@ a = @vpre@_load_@vsuf@(&ip1[i + 0 * 16 / sizeof(@type@)]); - @vtype@ b = @vpre@_load_@vsuf@(&ip1[i + 1 * 16 / sizeof(@type@)]); - @vtype@ c = @vpre@_load_@vsuf@(&ip1[i + 2 * 16 / sizeof(@type@)]); - @vtype@ d = @vpre@_load_@vsuf@(&ip1[i + 3 * 16 / sizeof(@type@)]); + LOOP_BLOCKED(@type@, 4 * VECTOR_SIZE_BYTES) { + @vtype@ a = @vpre@_load_@vsuf@(&ip1[i + 0 * VECTOR_SIZE_BYTES / sizeof(@type@)]); + @vtype@ b = @vpre@_load_@vsuf@(&ip1[i + 1 * VECTOR_SIZE_BYTES / sizeof(@type@)]); + @vtype@ c = @vpre@_load_@vsuf@(&ip1[i + 2 * VECTOR_SIZE_BYTES / sizeof(@type@)]); + @vtype@ d = @vpre@_load_@vsuf@(&ip1[i + 3 * VECTOR_SIZE_BYTES / sizeof(@type@)]); @vtype@ r1 = @vpre@_@VOP@_@vsuf@(a, s); @vtype@ r2 = @vpre@_@VOP@_@vsuf@(b, s); @vtype@ r3 = @vpre@_@VOP@_@vsuf@(c, s); @@ -928,19 +941,20 @@ sse2_binary_scalar2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy static void sse2_sqrt_@TYPE@(@type@ * op, @type@ * ip, const npy_intp n) { - /* align output to 16 bytes */ - LOOP_BLOCK_ALIGN_VAR(op, @type@, 16) { + /* align output to VECTOR_SIZE_BYTES bytes */ + LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES) { op[i] = @scalarf@(ip[i]); } - assert(n < (16 / sizeof(@type@)) || npy_is_aligned(&op[i], 16)); - if (npy_is_aligned(&ip[i], 16)) { - LOOP_BLOCKED(@type@, 16) { + assert(n < (VECTOR_SIZE_BYTES / sizeof(@type@)) || + npy_is_aligned(&op[i], VECTOR_SIZE_BYTES)); + if (npy_is_aligned(&ip[i], VECTOR_SIZE_BYTES)) { + LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { @vtype@ d = @vpre@_load_@vsuf@(&ip[i]); @vpre@_store_@vsuf@(&op[i], @vpre@_sqrt_@vsuf@(d)); } } else { - LOOP_BLOCKED(@type@, 16) { + LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { @vtype@ d = @vpre@_loadu_@vsuf@(&ip[i]); @vpre@_store_@vsuf@(&op[i], @vpre@_sqrt_@vsuf@(d)); } @@ -979,19 +993,20 @@ sse2_@kind@_@TYPE@(@type@ * op, @type@ * ip, const npy_intp n) */ const @vtype@ mask = @vpre@_set1_@vsuf@(-0.@c@); - /* align output to 16 bytes */ - LOOP_BLOCK_ALIGN_VAR(op, @type@, 16) { + /* align output to VECTOR_SIZE_BYTES bytes */ + LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES) { op[i] = @scalar@_@type@(ip[i]); } - assert(n < (16 / sizeof(@type@)) || npy_is_aligned(&op[i], 16)); - if (npy_is_aligned(&ip[i], 16)) { - LOOP_BLOCKED(@type@, 16) { + assert(n < (VECTOR_SIZE_BYTES / sizeof(@type@)) || + npy_is_aligned(&op[i], VECTOR_SIZE_BYTES)); + if (npy_is_aligned(&ip[i], VECTOR_SIZE_BYTES)) { + LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { @vtype@ a = @vpre@_load_@vsuf@(&ip[i]); @vpre@_store_@vsuf@(&op[i], @vpre@_@VOP@_@vsuf@(mask, a)); } } else { - LOOP_BLOCKED(@type@, 16) { + LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { @vtype@ a = @vpre@_loadu_@vsuf@(&ip[i]); @vpre@_store_@vsuf@(&op[i], @vpre@_@VOP@_@vsuf@(mask, a)); } @@ -1012,11 +1027,11 @@ sse2_@kind@_@TYPE@(@type@ * op, @type@ * ip, const npy_intp n) static void sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n) { - const npy_intp stride = 16 / 
(npy_intp)sizeof(@type@); - LOOP_BLOCK_ALIGN_VAR(ip, @type@, 16) { + const npy_intp stride = VECTOR_SIZE_BYTES / (npy_intp)sizeof(@type@); + LOOP_BLOCK_ALIGN_VAR(ip, @type@, VECTOR_SIZE_BYTES) { *op = (npy_isnan(*op) || *op @OP@ ip[i]) ? *op : ip[i]; } - assert(n < (stride) || npy_is_aligned(&ip[i], 16)); + assert(n < (stride) || npy_is_aligned(&ip[i], VECTOR_SIZE_BYTES)); if (i + 3 * stride <= n) { /* load the first elements */ @vtype@ c1 = @vpre@_load_@vsuf@((@type@*)&ip[i]); @@ -1025,7 +1040,7 @@ sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n) /* minps/minpd will set invalid flag if nan is encountered */ npy_clear_floatstatus_barrier((char*)&c1); - LOOP_BLOCKED(@type@, 32) { + LOOP_BLOCKED(@type@, 2 * VECTOR_SIZE_BYTES) { @vtype@ v1 = @vpre@_load_@vsuf@((@type@*)&ip[i]); @vtype@ v2 = @vpre@_load_@vsuf@((@type@*)&ip[i + stride]); c1 = @vpre@_@VOP@_@vsuf@(c1, v1); @@ -1090,9 +1105,9 @@ static NPY_INLINE @vtype@ byte_to_true(@vtype@ v) static void sse2_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2, npy_intp n) { - LOOP_BLOCK_ALIGN_VAR(op, @type@, 16) + LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES) op[i] = ip1[i] @op@ ip2[i]; - LOOP_BLOCKED(@type@, 16) { + LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { @vtype@ a = @vloadu@((@vtype@*)&ip1[i]); @vtype@ b = @vloadu@((@vtype@*)&ip2[i]); #if @and@ @@ -1117,16 +1132,16 @@ static void sse2_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, const npy_intp n) { const @vtype@ zero = @vpre@_setzero_@vsuf@(); - LOOP_BLOCK_ALIGN_VAR(ip, npy_bool, 16) { + LOOP_BLOCK_ALIGN_VAR(ip, npy_bool, VECTOR_SIZE_BYTES) { *op = *op @op@ ip[i]; if (*op @sc@ 0) { return; } } /* unrolled once to replace a slow movmsk with a fast pmaxb */ - LOOP_BLOCKED(npy_bool, 32) { + LOOP_BLOCKED(npy_bool, 2 * VECTOR_SIZE_BYTES) { @vtype@ v = @vload@((@vtype@*)&ip[i]); - @vtype@ v2 = @vload@((@vtype@*)&ip[i + 16]); + @vtype@ v2 = @vload@((@vtype@*)&ip[i + VECTOR_SIZE_BYTES]); v = @vpre@_cmpeq_epi8(v, zero); v2 = @vpre@_cmpeq_epi8(v2, zero); #if @and@ @@ -1164,9 +1179,9 @@ sse2_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, const npy_intp n) static void sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n) { - LOOP_BLOCK_ALIGN_VAR(op, @type@, 16) + LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES) op[i] = (ip[i] @op@ 0); - LOOP_BLOCKED(@type@, 16) { + LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { @vtype@ a = @vloadu@((@vtype@*)&ip[i]); #if @not@ const @vtype@ zero = @vpre@_setzero_@vsuf@(); @@ -1187,6 +1202,8 @@ sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n) /**end repeat**/ +#undef VECTOR_SIZE_BYTES + #endif /* NPY_HAVE_SSE2_INTRINSICS */ #endif diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c index 8fb731fb7..1fe8745a0 100644 --- a/numpy/core/src/umath/ufunc_object.c +++ b/numpy/core/src/umath/ufunc_object.c @@ -308,6 +308,49 @@ _find_array_prepare(ufunc_full_args args, return; } +#define NPY_UFUNC_DEFAULT_INPUT_FLAGS \ + NPY_ITER_READONLY | \ + NPY_ITER_ALIGNED | \ + NPY_ITER_OVERLAP_ASSUME_ELEMENTWISE + +#define NPY_UFUNC_DEFAULT_OUTPUT_FLAGS \ + NPY_ITER_ALIGNED | \ + NPY_ITER_ALLOCATE | \ + NPY_ITER_NO_BROADCAST | \ + NPY_ITER_NO_SUBTYPE | \ + NPY_ITER_OVERLAP_ASSUME_ELEMENTWISE +/* + * Set per-operand flags according to desired input or output flags. + * op_flags[i] for i in input (as determined by ufunc->nin) will be + * merged with op_in_flags, perhaps overriding per-operand flags set + * in previous stages. 
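[Editorial aside on the simd.inc.src hunks above: the change is mechanical, replacing the literal block sizes (64, 32 and 16 bytes in the AVX512, AVX2 and SSE2 branches) with a single VECTOR_SIZE_BYTES constant per build variant, which is #undef'd at the end of the file. Below is a minimal standalone sketch of the blocked-loop pattern those LOOP_BLOCK_ALIGN_VAR/LOOP_BLOCKED macros express, written with plain SSE2 intrinsics; the function and constant names are illustrative only, not the actual NumPy macros.]

/* Sketch only: scalar prologue until the output is vector-aligned,
 * a vector body that advances by VECTOR_SIZE_BYTES, and a scalar tail. */
#include <emmintrin.h>
#include <stddef.h>
#include <stdint.h>

#define VECTOR_SIZE_BYTES 16   /* one SSE register */

static void
add_floats_blocked(float *op, const float *ip1, const float *ip2, size_t n)
{
    const size_t step = VECTOR_SIZE_BYTES / sizeof(float);
    size_t i = 0;

    /* peel elements until &op[i] is VECTOR_SIZE_BYTES-aligned */
    while (i < n && ((uintptr_t)&op[i] % VECTOR_SIZE_BYTES) != 0) {
        op[i] = ip1[i] + ip2[i];
        i++;
    }
    /* main blocked loop: one vector per iteration */
    for (; i + step <= n; i += step) {
        __m128 a = _mm_loadu_ps(&ip1[i]);   /* unaligned loads keep the sketch simple */
        __m128 b = _mm_loadu_ps(&ip2[i]);
        _mm_store_ps(&op[i], _mm_add_ps(a, b));  /* output alignment guaranteed by the prologue */
    }
    /* scalar tail for the remainder */
    for (; i < n; i++) {
        op[i] = ip1[i] + ip2[i];
    }
}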
+ * op_flags[i] for i in output will be set to op_out_flags only if previously + * unset. + * The input flag behavior preserves backward compatibility, while the + * output flag behaviour is the "correct" one for maximum flexibility. + */ +NPY_NO_EXPORT void +_ufunc_setup_flags(PyUFuncObject *ufunc, npy_uint32 op_in_flags, + npy_uint32 op_out_flags, npy_uint32 *op_flags) +{ + int nin = ufunc->nin; + int nout = ufunc->nout; + int nop = nin + nout, i; + /* Set up the flags */ + for (i = 0; i < nin; ++i) { + op_flags[i] = ufunc->op_flags[i] | op_in_flags; + /* + * If READWRITE flag has been set for this operand, + * then clear default READONLY flag + */ + if (op_flags[i] & (NPY_ITER_READWRITE | NPY_ITER_WRITEONLY)) { + op_flags[i] &= ~NPY_ITER_READONLY; + } + } + for (i = nin; i < nop; ++i) { + op_flags[i] = ufunc->op_flags[i] ? ufunc->op_flags[i] : op_out_flags; + } +} /* * This function analyzes the input arguments @@ -1394,11 +1437,11 @@ iterator_loop(PyUFuncObject *ufunc, PyObject **arr_prep, ufunc_full_args full_args, PyUFuncGenericFunction innerloop, - void *innerloopdata) + void *innerloopdata, + npy_uint32 *op_flags) { npy_intp i, nin = ufunc->nin, nout = ufunc->nout; npy_intp nop = nin + nout; - npy_uint32 op_flags[NPY_MAXARGS]; NpyIter *iter; char *baseptrs[NPY_MAXARGS]; @@ -1412,29 +1455,6 @@ iterator_loop(PyUFuncObject *ufunc, NPY_BEGIN_THREADS_DEF; - /* Set up the flags */ - for (i = 0; i < nin; ++i) { - op_flags[i] = NPY_ITER_READONLY | - NPY_ITER_ALIGNED | - NPY_ITER_OVERLAP_ASSUME_ELEMENTWISE; - /* - * If READWRITE flag has been set for this operand, - * then clear default READONLY flag - */ - op_flags[i] |= ufunc->op_flags[i]; - if (op_flags[i] & (NPY_ITER_READWRITE | NPY_ITER_WRITEONLY)) { - op_flags[i] &= ~NPY_ITER_READONLY; - } - } - for (i = nin; i < nop; ++i) { - op_flags[i] = NPY_ITER_WRITEONLY | - NPY_ITER_ALIGNED | - NPY_ITER_ALLOCATE | - NPY_ITER_NO_BROADCAST | - NPY_ITER_NO_SUBTYPE | - NPY_ITER_OVERLAP_ASSUME_ELEMENTWISE; - } - iter_flags = ufunc->iter_flags | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK | @@ -1538,15 +1558,15 @@ iterator_loop(PyUFuncObject *ufunc, } /* + * ufunc - the ufunc to call * trivial_loop_ok - 1 if no alignment, data conversion, etc required - * nin - number of inputs - * nout - number of outputs - * op - the operands (nin + nout of them) + * op - the operands (ufunc->nin + ufunc->nout of them) + * dtypes - the dtype of each operand * order - the loop execution order/output memory order * buffersize - how big of a buffer to use * arr_prep - the __array_prepare__ functions for the outputs - * innerloop - the inner loop function - * innerloopdata - data to pass to the inner loop + * full_args - the original input, output PyObject * + * op_flags - per-operand flags, a combination of NPY_ITER_* constants */ static int execute_legacy_ufunc_loop(PyUFuncObject *ufunc, @@ -1556,7 +1576,8 @@ execute_legacy_ufunc_loop(PyUFuncObject *ufunc, NPY_ORDER order, npy_intp buffersize, PyObject **arr_prep, - ufunc_full_args full_args) + ufunc_full_args full_args, + npy_uint32 *op_flags) { npy_intp nin = ufunc->nin, nout = ufunc->nout; PyUFuncGenericFunction innerloop; @@ -1691,7 +1712,7 @@ execute_legacy_ufunc_loop(PyUFuncObject *ufunc, NPY_UF_DBG_PRINT("iterator loop\n"); if (iterator_loop(ufunc, op, dtypes, order, buffersize, arr_prep, full_args, - innerloop, innerloopdata) < 0) { + innerloop, innerloopdata, op_flags) < 0) { return -1; } @@ -1717,14 +1738,13 @@ execute_fancy_ufunc_loop(PyUFuncObject *ufunc, NPY_ORDER order, npy_intp buffersize, PyObject 
**arr_prep, - ufunc_full_args full_args) + ufunc_full_args full_args, + npy_uint32 *op_flags) { int i, nin = ufunc->nin, nout = ufunc->nout; int nop = nin + nout; - npy_uint32 op_flags[NPY_MAXARGS]; NpyIter *iter; int needs_api; - npy_intp default_op_in_flags = 0, default_op_out_flags = 0; NpyIter_IterNextFunc *iternext; char **dataptr; @@ -1734,48 +1754,10 @@ execute_fancy_ufunc_loop(PyUFuncObject *ufunc, PyArrayObject **op_it; npy_uint32 iter_flags; - if (wheremask != NULL) { - if (nop + 1 > NPY_MAXARGS) { - PyErr_SetString(PyExc_ValueError, - "Too many operands when including where= parameter"); - return -1; - } - op[nop] = wheremask; - dtypes[nop] = NULL; - default_op_out_flags |= NPY_ITER_WRITEMASKED; - } - - /* Set up the flags */ - for (i = 0; i < nin; ++i) { - op_flags[i] = default_op_in_flags | - NPY_ITER_READONLY | - NPY_ITER_ALIGNED | - NPY_ITER_OVERLAP_ASSUME_ELEMENTWISE; - /* - * If READWRITE flag has been set for this operand, - * then clear default READONLY flag - */ - op_flags[i] |= ufunc->op_flags[i]; - if (op_flags[i] & (NPY_ITER_READWRITE | NPY_ITER_WRITEONLY)) { - op_flags[i] &= ~NPY_ITER_READONLY; - } - } for (i = nin; i < nop; ++i) { - /* - * We don't write to all elements, and the iterator may make - * UPDATEIFCOPY temporary copies. The output arrays (unless they are - * allocated by the iterator itself) must be considered READWRITE by the - * iterator, so that the elements we don't write to are copied to the - * possible temporary array. - */ - op_flags[i] = default_op_out_flags | - (op[i] != NULL ? NPY_ITER_READWRITE : NPY_ITER_WRITEONLY) | - NPY_ITER_ALIGNED | - NPY_ITER_ALLOCATE | - NPY_ITER_NO_BROADCAST | - NPY_ITER_NO_SUBTYPE | - NPY_ITER_OVERLAP_ASSUME_ELEMENTWISE; + op_flags[i] |= (op[i] != NULL ? NPY_ITER_READWRITE : NPY_ITER_WRITEONLY); } + if (wheremask != NULL) { op_flags[nop] = NPY_ITER_READONLY | NPY_ITER_ARRAYMASK; } @@ -2471,6 +2453,11 @@ _get_identity(PyUFuncObject *ufunc, npy_bool *reorderable) { *reorderable = 0; Py_RETURN_NONE; + case PyUFunc_IdentityValue: + *reorderable = 1; + Py_INCREF(ufunc->identity_value); + return ufunc->identity_value; + default: PyErr_Format(PyExc_ValueError, "ufunc %s has an invalid identity", ufunc_get_name_cstr(ufunc)); @@ -2785,6 +2772,18 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *ufunc, if (retval < 0) { goto fail; } + /* + * We don't write to all elements, and the iterator may make + * UPDATEIFCOPY temporary copies. The output arrays (unless they are + * allocated by the iterator itself) must be considered READWRITE by the + * iterator, so that the elements we don't write to are copied to the + * possible temporary array. + */ + _ufunc_setup_flags(ufunc, NPY_ITER_COPY | NPY_UFUNC_DEFAULT_INPUT_FLAGS, + NPY_ITER_UPDATEIFCOPY | + NPY_ITER_READWRITE | + NPY_UFUNC_DEFAULT_OUTPUT_FLAGS, + op_flags); /* For the generalized ufunc, we get the loop right away too */ retval = ufunc->legacy_inner_loop_selector(ufunc, dtypes, &innerloop, &innerloopdata, &needs_api); @@ -2827,28 +2826,6 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *ufunc, * Set up the iterator per-op flags. For generalized ufuncs, we * can't do buffering, so must COPY or UPDATEIFCOPY. 
*/ - for (i = 0; i < nin; ++i) { - op_flags[i] = NPY_ITER_READONLY | - NPY_ITER_COPY | - NPY_ITER_ALIGNED | - NPY_ITER_OVERLAP_ASSUME_ELEMENTWISE; - /* - * If READWRITE flag has been set for this operand, - * then clear default READONLY flag - */ - op_flags[i] |= ufunc->op_flags[i]; - if (op_flags[i] & (NPY_ITER_READWRITE | NPY_ITER_WRITEONLY)) { - op_flags[i] &= ~NPY_ITER_READONLY; - } - } - for (i = nin; i < nop; ++i) { - op_flags[i] = NPY_ITER_READWRITE| - NPY_ITER_UPDATEIFCOPY| - NPY_ITER_ALIGNED| - NPY_ITER_ALLOCATE| - NPY_ITER_NO_BROADCAST| - NPY_ITER_OVERLAP_ASSUME_ELEMENTWISE; - } iter_flags = ufunc->iter_flags | NPY_ITER_MULTI_INDEX | @@ -3097,7 +3074,8 @@ PyUFunc_GenericFunction(PyUFuncObject *ufunc, int i, nop; const char *ufunc_name; int retval = -1, subok = 1; - int need_fancy = 0; + npy_uint32 op_flags[NPY_MAXARGS]; + npy_intp default_op_out_flags; PyArray_Descr *dtypes[NPY_MAXARGS]; @@ -3156,13 +3134,6 @@ PyUFunc_GenericFunction(PyUFuncObject *ufunc, return retval; } - /* - * Use the masked loop if a wheremask was specified. - */ - if (wheremask != NULL) { - need_fancy = 1; - } - /* Get the buffersize and errormask */ if (_get_bufsize_errmask(extobj, ufunc_name, &buffersize, &errormask) < 0) { retval = -1; @@ -3177,16 +3148,20 @@ PyUFunc_GenericFunction(PyUFuncObject *ufunc, goto fail; } - /* Only do the trivial loop check for the unmasked version. */ - if (!need_fancy) { - /* - * This checks whether a trivial loop is ok, making copies of - * scalar and one dimensional operands if that will help. - */ - trivial_loop_ok = check_for_trivial_loop(ufunc, op, dtypes, buffersize); - if (trivial_loop_ok < 0) { - goto fail; - } + if (wheremask != NULL) { + /* Set up the flags. */ + default_op_out_flags = NPY_ITER_NO_SUBTYPE | + NPY_ITER_WRITEMASKED | + NPY_UFUNC_DEFAULT_OUTPUT_FLAGS; + _ufunc_setup_flags(ufunc, NPY_UFUNC_DEFAULT_INPUT_FLAGS, + default_op_out_flags, op_flags); + } + else { + /* Set up the flags. */ + default_op_out_flags = NPY_ITER_WRITEONLY | + NPY_UFUNC_DEFAULT_OUTPUT_FLAGS; + _ufunc_setup_flags(ufunc, NPY_UFUNC_DEFAULT_INPUT_FLAGS, + default_op_out_flags, op_flags); } #if NPY_UF_DBG_TRACING @@ -3214,23 +3189,46 @@ PyUFunc_GenericFunction(PyUFuncObject *ufunc, _find_array_prepare(full_args, arr_prep, nin, nout); } - /* Start with the floating-point exception flags cleared */ - npy_clear_floatstatus_barrier((char*)&ufunc); /* Do the ufunc loop */ - if (need_fancy) { + if (wheremask != NULL) { NPY_UF_DBG_PRINT("Executing fancy inner loop\n"); + if (nop + 1 > NPY_MAXARGS) { + PyErr_SetString(PyExc_ValueError, + "Too many operands when including where= parameter"); + return -1; + } + op[nop] = wheremask; + dtypes[nop] = NULL; + + /* Set up the flags */ + + npy_clear_floatstatus_barrier((char*)&ufunc); retval = execute_fancy_ufunc_loop(ufunc, wheremask, op, dtypes, order, - buffersize, arr_prep, full_args); + buffersize, arr_prep, full_args, op_flags); } else { NPY_UF_DBG_PRINT("Executing legacy inner loop\n"); + /* + * This checks whether a trivial loop is ok, making copies of + * scalar and one dimensional operands if that will help. 
+ * Since it requires dtypes, it can only be called after + * ufunc->type_resolver + */ + trivial_loop_ok = check_for_trivial_loop(ufunc, op, dtypes, buffersize); + if (trivial_loop_ok < 0) { + goto fail; + } + + /* check_for_trivial_loop on half-floats can overflow */ + npy_clear_floatstatus_barrier((char*)&ufunc); + retval = execute_legacy_ufunc_loop(ufunc, trivial_loop_ok, op, dtypes, order, - buffersize, arr_prep, full_args); + buffersize, arr_prep, full_args, op_flags); } if (retval < 0) { goto fail; @@ -4840,6 +4838,20 @@ PyUFunc_FromFuncAndDataAndSignature(PyUFuncGenericFunction *func, void **data, const char *name, const char *doc, int unused, const char *signature) { + return PyUFunc_FromFuncAndDataAndSignatureAndIdentity( + func, data, types, ntypes, nin, nout, identity, name, doc, + unused, signature, NULL); +} + +/*UFUNC_API*/ +NPY_NO_EXPORT PyObject * +PyUFunc_FromFuncAndDataAndSignatureAndIdentity(PyUFuncGenericFunction *func, void **data, + char *types, int ntypes, + int nin, int nout, int identity, + const char *name, const char *doc, + int unused, const char *signature, + PyObject *identity_value) +{ PyUFuncObject *ufunc; if (nin + nout > NPY_MAXARGS) { PyErr_Format(PyExc_ValueError, @@ -4860,6 +4872,10 @@ PyUFunc_FromFuncAndDataAndSignature(PyUFuncGenericFunction *func, void **data, ufunc->nout = nout; ufunc->nargs = nin+nout; ufunc->identity = identity; + if (ufunc->identity == PyUFunc_IdentityValue) { + Py_INCREF(identity_value); + } + ufunc->identity_value = identity_value; ufunc->functions = func; ufunc->data = data; @@ -4881,6 +4897,7 @@ PyUFunc_FromFuncAndDataAndSignature(PyUFuncGenericFunction *func, void **data, ufunc->op_flags = PyArray_malloc(sizeof(npy_uint32)*ufunc->nargs); if (ufunc->op_flags == NULL) { + Py_DECREF(ufunc); return PyErr_NoMemory(); } memset(ufunc->op_flags, 0, sizeof(npy_uint32)*ufunc->nargs); @@ -5237,6 +5254,9 @@ ufunc_dealloc(PyUFuncObject *ufunc) PyArray_free(ufunc->op_flags); Py_XDECREF(ufunc->userloops); Py_XDECREF(ufunc->obj); + if (ufunc->identity == PyUFunc_IdentityValue) { + Py_DECREF(ufunc->identity_value); + } PyArray_free(ufunc); } diff --git a/numpy/core/src/umath/umathmodule.c b/numpy/core/src/umath/umathmodule.c index 20bd2b0a8..8277ad6cc 100644 --- a/numpy/core/src/umath/umathmodule.c +++ b/numpy/core/src/umath/umathmodule.c @@ -29,6 +29,7 @@ #include "abstract.h" #include "numpy/npy_math.h" +#include "number.h" static PyUFuncGenericFunction pyfunc_functions[] = {PyUFunc_On_Om}; @@ -325,7 +326,7 @@ int initumath(PyObject *m) s2 = PyDict_GetItemString(d, "remainder"); /* Setup the array object's numerical structures with appropriate ufuncs in d*/ - PyArray_SetNumericOps(d); + _PyArray_SetNumericOps(d); PyDict_SetItemString(d, "conj", s); PyDict_SetItemString(d, "mod", s2); diff --git a/numpy/core/tests/test_deprecations.py b/numpy/core/tests/test_deprecations.py index 10ef16800..7e6e256fe 100644 --- a/numpy/core/tests/test_deprecations.py +++ b/numpy/core/tests/test_deprecations.py @@ -523,3 +523,14 @@ class TestFromstring(_DeprecationTestCase): # 2017-10-19, 1.14 def test_fromstring(self): self.assert_deprecated(np.fromstring, args=('\x00'*80,)) + +class Test_GetSet_NumericOps(_DeprecationTestCase): + # 2018-09-20, 1.16.0 + def test_get_numeric_ops(self): + from numpy.core._multiarray_tests import getset_numericops + self.assert_deprecated(getset_numericops, num=2) + + # empty kwargs prevents any state actually changing which would break + # other tests. 
+ self.assert_deprecated(np.set_numeric_ops, kwargs={}) + assert_raises(ValueError, np.set_numeric_ops, add='abc') diff --git a/numpy/core/tests/test_dtype.py b/numpy/core/tests/test_dtype.py index ecb51f72d..8cde19612 100644 --- a/numpy/core/tests/test_dtype.py +++ b/numpy/core/tests/test_dtype.py @@ -807,9 +807,9 @@ class TestFromCTypes(object): p_uint8 = ctypes.POINTER(ctypes.c_uint8) assert_raises(TypeError, np.dtype, p_uint8) - @pytest.mark.xfail( - reason="Unions are not implemented", - raises=NotImplementedError) + def test_void_pointer(self): + self.check(ctypes.c_void_p, np.uintp) + def test_union(self): class Union(ctypes.Union): _fields_ = [ @@ -824,7 +824,52 @@ class TestFromCTypes(object): )) self.check(Union, expected) - @pytest.mark.xfail(reason="_pack_ is ignored - see gh-11651") + def test_union_with_struct_packed(self): + class Struct(ctypes.Structure): + _pack_ = 1 + _fields_ = [ + ('one', ctypes.c_uint8), + ('two', ctypes.c_uint32) + ] + + class Union(ctypes.Union): + _fields_ = [ + ('a', ctypes.c_uint8), + ('b', ctypes.c_uint16), + ('c', ctypes.c_uint32), + ('d', Struct), + ] + expected = np.dtype(dict( + names=['a', 'b', 'c', 'd'], + formats=['u1', np.uint16, np.uint32, [('one', 'u1'), ('two', np.uint32)]], + offsets=[0, 0, 0, 0], + itemsize=ctypes.sizeof(Union) + )) + self.check(Union, expected) + + def test_union_packed(self): + class Struct(ctypes.Structure): + _fields_ = [ + ('one', ctypes.c_uint8), + ('two', ctypes.c_uint32) + ] + _pack_ = 1 + class Union(ctypes.Union): + _pack_ = 1 + _fields_ = [ + ('a', ctypes.c_uint8), + ('b', ctypes.c_uint16), + ('c', ctypes.c_uint32), + ('d', Struct), + ] + expected = np.dtype(dict( + names=['a', 'b', 'c', 'd'], + formats=['u1', np.uint16, np.uint32, [('one', 'u1'), ('two', np.uint32)]], + offsets=[0, 0, 0, 0], + itemsize=ctypes.sizeof(Union) + )) + self.check(Union, expected) + def test_packed_structure(self): class PackedStructure(ctypes.Structure): _pack_ = 1 @@ -838,8 +883,45 @@ class TestFromCTypes(object): ]) self.check(PackedStructure, expected) - @pytest.mark.xfail(sys.byteorder != 'little', - reason="non-native endianness does not work - see gh-10533") + def test_large_packed_structure(self): + class PackedStructure(ctypes.Structure): + _pack_ = 2 + _fields_ = [ + ('a', ctypes.c_uint8), + ('b', ctypes.c_uint16), + ('c', ctypes.c_uint8), + ('d', ctypes.c_uint16), + ('e', ctypes.c_uint32), + ('f', ctypes.c_uint32), + ('g', ctypes.c_uint8) + ] + expected = np.dtype(dict( + formats=[np.uint8, np.uint16, np.uint8, np.uint16, np.uint32, np.uint32, np.uint8 ], + offsets=[0, 2, 4, 6, 8, 12, 16], + names=['a', 'b', 'c', 'd', 'e', 'f', 'g'], + itemsize=18)) + self.check(PackedStructure, expected) + + def test_big_endian_structure_packed(self): + class BigEndStruct(ctypes.BigEndianStructure): + _fields_ = [ + ('one', ctypes.c_uint8), + ('two', ctypes.c_uint32) + ] + _pack_ = 1 + expected = np.dtype([('one', 'u1'), ('two', '>u4')]) + self.check(BigEndStruct, expected) + + def test_little_endian_structure_packed(self): + class LittleEndStruct(ctypes.LittleEndianStructure): + _fields_ = [ + ('one', ctypes.c_uint8), + ('two', ctypes.c_uint32) + ] + _pack_ = 1 + expected = np.dtype([('one', 'u1'), ('two', '<u4')]) + self.check(LittleEndStruct, expected) + def test_little_endian_structure(self): class PaddedStruct(ctypes.LittleEndianStructure): _fields_ = [ @@ -852,8 +934,6 @@ class TestFromCTypes(object): ], align=True) self.check(PaddedStruct, expected) - @pytest.mark.xfail(sys.byteorder != 'big', - reason="non-native endianness 
does not work - see gh-10533") def test_big_endian_structure(self): class PaddedStruct(ctypes.BigEndianStructure): _fields_ = [ @@ -865,3 +945,9 @@ class TestFromCTypes(object): ('b', '>H') ], align=True) self.check(PaddedStruct, expected) + + def test_simple_endian_types(self): + self.check(ctypes.c_uint16.__ctype_le__, np.dtype('<u2')) + self.check(ctypes.c_uint16.__ctype_be__, np.dtype('>u2')) + self.check(ctypes.c_uint8.__ctype_le__, np.dtype('u1')) + self.check(ctypes.c_uint8.__ctype_be__, np.dtype('u1')) diff --git a/numpy/core/tests/test_getlimits.py b/numpy/core/tests/test_getlimits.py index ca8093c62..2f6648183 100644 --- a/numpy/core/tests/test_getlimits.py +++ b/numpy/core/tests/test_getlimits.py @@ -7,10 +7,7 @@ import numpy as np from numpy.core import finfo, iinfo from numpy import half, single, double, longdouble from numpy.testing import assert_equal, assert_, assert_raises -from numpy.core.getlimits import ( - _discovered_machar, _float16_ma, _float32_ma, _float64_ma, _float128_ma, - _float80_ma - ) +from numpy.core.getlimits import _discovered_machar, _float_ma ################################################## @@ -101,9 +98,9 @@ def assert_ma_equal(discovered, ma_like): def test_known_types(): # Test we are correctly compiling parameters for known types - for ftype, ma_like in ((np.float16, _float16_ma), - (np.float32, _float32_ma), - (np.float64, _float64_ma)): + for ftype, ma_like in ((np.float16, _float_ma[16]), + (np.float32, _float_ma[32]), + (np.float64, _float_ma[64])): assert_ma_equal(_discovered_machar(ftype), ma_like) # Suppress warning for broken discovery of double double on PPC with np.errstate(all='ignore'): @@ -111,10 +108,10 @@ def test_known_types(): bytes = np.dtype(np.longdouble).itemsize if (ld_ma.it, ld_ma.maxexp) == (63, 16384) and bytes in (12, 16): # 80-bit extended precision - assert_ma_equal(ld_ma, _float80_ma) + assert_ma_equal(ld_ma, _float_ma[80]) elif (ld_ma.it, ld_ma.maxexp) == (112, 16384) and bytes == 16: # IEE 754 128-bit - assert_ma_equal(ld_ma, _float128_ma) + assert_ma_equal(ld_ma, _float_ma[128]) def test_plausible_finfo(): diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py index 4b2a38990..51fe6e9ef 100644 --- a/numpy/core/tests/test_multiarray.py +++ b/numpy/core/tests/test_multiarray.py @@ -2727,6 +2727,17 @@ class TestMethods(object): np.dot(a, b, out=out) np.matmul(a, b, out=out) + def test_dot_matmul_inner_array_casting_fails(self): + + class A(object): + def __array__(self, *args, **kwargs): + raise NotImplementedError + + # Don't override the error from calling __array__() + assert_raises(NotImplementedError, np.dot, A(), A()) + assert_raises(NotImplementedError, np.matmul, A(), A()) + assert_raises(NotImplementedError, np.inner, A(), A()) + def test_diagonal(self): a = np.arange(12).reshape((3, 4)) assert_equal(a.diagonal(), [0, 5, 10]) @@ -4541,6 +4552,19 @@ class TestIO(object): f.close() assert_equal(pos, 10, err_msg=err_msg) + def test_load_object_array_fromfile(self): + # gh-12300 + with open(self.filename, 'w') as f: + # Ensure we have a file with consistent contents + pass + + with open(self.filename, 'rb') as f: + assert_raises_regex(ValueError, "Cannot read into object array", + np.fromfile, f, dtype=object) + + assert_raises_regex(ValueError, "Cannot read into object array", + np.fromfile, self.filename, dtype=object) + def _check_from(self, s, value, **kw): if 'sep' not in kw: y = np.frombuffer(s, **kw) @@ -6775,7 +6799,7 @@ class TestNewBufferProtocol(object): ValueError, 
"format string", np.array, m) - def test_error_message(self): + def test_error_message_unsupported(self): # wchar has no corresponding numpy type - if this changes in future, we # need a better way to construct an invalid memoryview format. t = ctypes.c_wchar * 4 @@ -6784,7 +6808,10 @@ class TestNewBufferProtocol(object): exc = cm.exception if sys.version_info.major > 2: - with assert_raises_regex(ValueError, "Unknown .* specifier 'u'"): + with assert_raises_regex( + NotImplementedError, + r"Unrepresentable .* 'u' \(UCS-2 strings\)" + ): raise exc.__cause__ def test_ctypes_integer_via_memoryview(self): diff --git a/numpy/core/tests/test_overrides.py b/numpy/core/tests/test_overrides.py index ee6d5da4a..7db551801 100644 --- a/numpy/core/tests/test_overrides.py +++ b/numpy/core/tests/test_overrides.py @@ -1,5 +1,6 @@ from __future__ import division, absolute_import, print_function +import inspect import sys import numpy as np @@ -7,8 +8,14 @@ from numpy.testing import ( assert_, assert_equal, assert_raises, assert_raises_regex) from numpy.core.overrides import ( get_overloaded_types_and_args, array_function_dispatch, - verify_matching_signatures) + verify_matching_signatures, ENABLE_ARRAY_FUNCTION) from numpy.core.numeric import pickle +import pytest + + +requires_array_function = pytest.mark.skipif( + not ENABLE_ARRAY_FUNCTION, + reason="__array_function__ dispatch not enabled.") def _get_overloaded_args(relevant_args): @@ -165,6 +172,7 @@ def dispatched_one_arg(array): return 'original' +@requires_array_function class TestArrayFunctionDispatch(object): def test_pickle(self): @@ -204,6 +212,7 @@ class TestArrayFunctionDispatch(object): dispatched_one_arg(array) +@requires_array_function class TestVerifyMatchingSignatures(object): def test_verify_matching_signatures(self): @@ -256,6 +265,7 @@ def _new_duck_type_and_implements(): return (MyArray, implements) +@requires_array_function class TestArrayFunctionImplementation(object): def test_one_arg(self): @@ -302,6 +312,7 @@ class TestArrayFunctionImplementation(object): array = np.array(1) assert_(func(array) is array) + assert_equal(func.__module__, 'my') with assert_raises_regex( TypeError, "no implementation found for 'my.func'"): @@ -322,15 +333,21 @@ class TestNDArrayMethods(object): assert_equal(repr(array), 'MyArray(1)') assert_equal(str(array), '1') - + class TestNumPyFunctions(object): - def test_module(self): + def test_set_module(self): assert_equal(np.sum.__module__, 'numpy') assert_equal(np.char.equal.__module__, 'numpy.char') assert_equal(np.fft.fft.__module__, 'numpy.fft') assert_equal(np.linalg.solve.__module__, 'numpy.linalg') + @pytest.mark.skipif(sys.version_info[0] < 3, reason="Python 3 only") + def test_inspect_sum(self): + signature = inspect.signature(np.sum) + assert_('axis' in signature.parameters) + + @requires_array_function def test_override_sum(self): MyArray, implements = _new_duck_type_and_implements() diff --git a/numpy/core/tests/test_records.py b/numpy/core/tests/test_records.py index a77eef404..08d8865a0 100644 --- a/numpy/core/tests/test_records.py +++ b/numpy/core/tests/test_records.py @@ -13,9 +13,10 @@ from os import path import pytest import numpy as np +from numpy.compat import Path from numpy.testing import ( assert_, assert_equal, assert_array_equal, assert_array_almost_equal, - assert_raises, assert_warns + assert_raises, assert_warns, temppath ) from numpy.core.numeric import pickle @@ -325,6 +326,23 @@ class TestFromrecords(object): assert_equal(rec['f1'], [b'', b'', b'']) +@pytest.mark.skipif(Path 
is None, reason="No pathlib.Path") +class TestPathUsage(object): + # Test that pathlib.Path can be used + def test_tofile_fromfile(self): + with temppath(suffix='.bin') as path: + path = Path(path) + np.random.seed(123) + a = np.random.rand(10).astype('f8,i4,a5') + a[5] = (0.5,10,'abcde') + with path.open("wb") as fd: + a.tofile(fd) + x = np.core.records.fromfile(path, + formats='f8,i4,a5', + shape=10) + assert_array_equal(x, a) + + class TestRecord(object): def setup(self): self.data = np.rec.fromrecords([(1, 2, 3), (4, 5, 6)], diff --git a/numpy/core/tests/test_shape_base.py b/numpy/core/tests/test_shape_base.py index b2c610da6..416bd18db 100644 --- a/numpy/core/tests/test_shape_base.py +++ b/numpy/core/tests/test_shape_base.py @@ -1,5 +1,6 @@ from __future__ import division, absolute_import, print_function +import pytest import warnings import sys import numpy as np @@ -391,39 +392,32 @@ def test_stack(): assert_array_equal(result, np.array([0, 1, 2])) -# See for more information on how to parametrize a whole class -# https://docs.pytest.org/en/latest/example/parametrize.html#parametrizing-test-methods-through-per-class-configuration -def pytest_generate_tests(metafunc): - # called once per each test function - if hasattr(metafunc.cls, 'params'): - arglist = metafunc.cls.params - argnames = sorted(arglist[0]) - metafunc.parametrize(argnames, - [[funcargs[name] for name in argnames] - for funcargs in arglist]) - - -# blocking small arrays and large arrays go through different paths. -# the algorithm is triggered depending on the number of element -# copies required. -# We define a test fixture that forces most tests to go through -# both code paths. -# Ultimately, this should be removed if a single algorithm is found -# to be faster for both small and large arrays.s -def _block_force_concatenate(arrays): - arrays, list_ndim, result_ndim, _ = _block_setup(arrays) - return _block_concatenate(arrays, list_ndim, result_ndim) - - -def _block_force_slicing(arrays): - arrays, list_ndim, result_ndim, _ = _block_setup(arrays) - return _block_slicing(arrays, list_ndim, result_ndim) - - class TestBlock(object): - params = [dict(block=block), - dict(block=_block_force_concatenate), - dict(block=_block_force_slicing)] + @pytest.fixture(params=['block', 'force_concatenate', 'force_slicing']) + def block(self, request): + # blocking small arrays and large arrays go through different paths. + # the algorithm is triggered depending on the number of element + # copies required. + # We define a test fixture that forces most tests to go through + # both code paths. + # Ultimately, this should be removed if a single algorithm is found + # to be faster for both small and large arrays. + def _block_force_concatenate(arrays): + arrays, list_ndim, result_ndim, _ = _block_setup(arrays) + return _block_concatenate(arrays, list_ndim, result_ndim) + + def _block_force_slicing(arrays): + arrays, list_ndim, result_ndim, _ = _block_setup(arrays) + return _block_slicing(arrays, list_ndim, result_ndim) + + if request.param == 'force_concatenate': + return _block_force_concatenate + elif request.param == 'force_slicing': + return _block_force_slicing + elif request.param == 'block': + return block + else: + raise ValueError('Unknown blocking request. 
There is a typo in the tests.') def test_returns_copy(self, block): a = np.eye(3) diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py index bd7985dfb..fe23c922b 100644 --- a/numpy/core/tests/test_umath.py +++ b/numpy/core/tests/test_umath.py @@ -685,6 +685,10 @@ class TestLogAddExp(_FilterInvalids): assert_(np.isnan(np.logaddexp(0, np.nan))) assert_(np.isnan(np.logaddexp(np.nan, np.nan))) + def test_reduce(self): + assert_equal(np.logaddexp.identity, -np.inf) + assert_equal(np.logaddexp.reduce([]), -np.inf) + class TestLog1p(object): def test_log1p(self): @@ -2444,11 +2448,6 @@ class TestRationalFunctions(object): assert_equal(np.gcd(2**100, 3**100), 1) -def is_longdouble_finfo_bogus(): - info = np.finfo(np.longcomplex) - return not np.isfinite(np.log10(info.tiny/info.eps)) - - class TestComplexFunctions(object): funcs = [np.arcsin, np.arccos, np.arctan, np.arcsinh, np.arccosh, np.arctanh, np.sin, np.cos, np.tan, np.exp, @@ -2544,7 +2543,8 @@ class TestComplexFunctions(object): b = cfunc(p) assert_(abs(a - b) < atol, "%s %s: %s; cmath: %s" % (fname, p, a, b)) - def check_loss_of_precision(self, dtype): + @pytest.mark.parametrize('dtype', [np.complex64, np.complex_, np.longcomplex]) + def test_loss_of_precision(self, dtype): """Check loss of precision in complex arc* functions""" # Check against known-good functions @@ -2586,10 +2586,11 @@ class TestComplexFunctions(object): # It's not guaranteed that the system-provided arc functions # are accurate down to a few epsilons. (Eg. on Linux 64-bit) # So, give more leeway for long complex tests here: - check(x_series, 50*eps) + # Can use 2.1 for > Ubuntu LTS Trusty (2014), glibc = 2.19. + check(x_series, 50.0*eps) else: check(x_series, 2.1*eps) - check(x_basic, 2*eps/1e-3) + check(x_basic, 2.0*eps/1e-3) # Check a few points @@ -2629,15 +2630,6 @@ class TestComplexFunctions(object): check(func, pts, 1j) check(func, pts, 1+1j) - def test_loss_of_precision(self): - for dtype in [np.complex64, np.complex_]: - self.check_loss_of_precision(dtype) - - @pytest.mark.skipif(is_longdouble_finfo_bogus(), - reason="Bogus long double finfo") - def test_loss_of_precision_longcomplex(self): - self.check_loss_of_precision(np.longcomplex) - class TestAttributes(object): def test_attributes(self): |
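[Editorial aside: the new logaddexp test above exercises the identity machinery end to end: np.logaddexp.identity is -inf and reducing an empty array returns it. On the C side this is driven by the PyUFunc_IdentityValue case in _get_identity and the PyUFunc_FromFuncAndDataAndSignatureAndIdentity constructor added earlier in this diff. A hypothetical usage sketch follows; the loop body, type list and names are made up, only the constructor signature and refcount behaviour come from the diff.]

/* Assumes import_array() and import_umath() have already run. */
#include <Python.h>
#include "numpy/arrayobject.h"
#include "numpy/ufuncobject.h"

static void
example_loop(char **args, npy_intp *dimensions, npy_intp *steps, void *data)
{
    npy_intp i, n = dimensions[0];
    char *in1 = args[0], *in2 = args[1], *out = args[2];
    for (i = 0; i < n; i++) {
        *(double *)out = *(double *)in1 + *(double *)in2;  /* placeholder body */
        in1 += steps[0]; in2 += steps[1]; out += steps[2];
    }
}

static PyUFuncGenericFunction funcs[] = {example_loop};
static char types[] = {NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE};
static void *data[] = {NULL};

static PyObject *
make_ufunc_with_identity(void)
{
    /* an arbitrary Python object as identity, e.g. -inf as logaddexp now reports */
    PyObject *identity = PyFloat_FromDouble(-Py_HUGE_VAL);
    PyObject *uf;
    if (identity == NULL) {
        return NULL;
    }
    uf = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
            funcs, data, types, 1 /* ntypes */, 2 /* nin */, 1 /* nout */,
            PyUFunc_IdentityValue, "myfunc", "example docstring", 0,
            NULL,            /* no gufunc signature */
            identity);
    Py_DECREF(identity);     /* the constructor takes its own reference */
    return uf;
}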