53 files changed, 1244 insertions, 325 deletions
diff --git a/.github/workflows/build_test.yml b/.github/workflows/build_test.yml
index 239a18602..86fb094c6 100644
--- a/.github/workflows/build_test.yml
+++ b/.github/workflows/build_test.yml
@@ -277,3 +277,35 @@ jobs:
         docker run --rm --interactive -v $(pwd):/numpy the_build /bin/bash -c "
         cd /numpy && python3 runtests.py -n -v -- -k test_simd
         "
+
+  sde_simd_avx512_test:
+    # Intel Software Development Emulator (SDE) is used to run a given program
+    # on a specific instruction set architecture and capture various performance details.
+    # see https://www.intel.com/content/www/us/en/developer/articles/tool/software-development-emulator.html
+    needs: [smoke_test]
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v2
+      with:
+        submodules: recursive
+        fetch-depth: 0
+    - uses: actions/setup-python@v2
+      with:
+        python-version: ${{ env.PYTHON_VERSION }}
+    - name: Install Intel SDE
+      run: |
+        curl -o /tmp/sde.tar.bz2 https://www.intel.com/content/dam/develop/external/us/en/documents/downloads/sde-external-8.69.1-2021-07-18-lin.tar.bz2
+        mkdir /tmp/sde && tar -xvf /tmp/sde.tar.bz2 -C /tmp/sde/
+        sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde
+    - name: Install dependencies
+      run: python -m pip install -r test_requirements.txt
+    - name: Build
+      run: python setup.py build
+           --simd-test="\$werror AVX512F AVX512_KNL AVX512_KNM AVX512_SKX AVX512_CLX AVX512_CNL AVX512_ICL"
+           install
+    # KNM implies KNL
+    - name: Run SIMD tests (Xeon PHI)
+      run: sde -knm -- python runtests.py -n -v -- -k test_simd
+    # ICL implies SKX, CLX and CNL
+    - name: Run SIMD tests (Ice Lake)
+      run: sde -icl -- python runtests.py -n -v -- -k test_simd
diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index f919debaa..3c382f8b3 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -40,7 +40,7 @@ jobs:
     name: Build wheel for cp${{ matrix.python }}-${{ matrix.platform }}
     needs: get_commit_message
     if: >-
-      contains(needs.get_commit_message.outputs.message, '[cd build]') ||
+      contains(needs.get_commit_message.outputs.message, '[wheel build]') ||
       github.event_name == 'schedule' ||
       github.event_name == 'workflow_dispatch'
     runs-on: ${{ matrix.os }}
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 7c21087e1..9d2973b59 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -109,7 +109,7 @@ stages:
       # the docs even though i.e., numba uses another in their
       # azure config for mac os -- Microsoft has indicated
      # they will patch this issue
-      vmImage: 'macOS-10.14'
+      vmImage: 'macOS-10.15'
     strategy:
       maxParallel: 3
       matrix:
diff --git a/doc/neps/nep-0047-array-api-standard.rst b/doc/neps/nep-0047-array-api-standard.rst
index 3e63602cc..53b8e35b0 100644
--- a/doc/neps/nep-0047-array-api-standard.rst
+++ b/doc/neps/nep-0047-array-api-standard.rst
@@ -338,9 +338,10 @@ the options already present in NumPy are:
 
 Adding support for DLPack to NumPy entails:
 
-- Adding a ``ndarray.__dlpack__`` method.
-- Adding a ``from_dlpack`` function, which takes as input an object
-  supporting ``__dlpack__``, and returns an ``ndarray``.
+- Adding a ``ndarray.__dlpack__()`` method which returns a ``dlpack`` C
+  structure wrapped in a ``PyCapsule``.
+- Adding a ``np._from_dlpack(obj)`` function, where ``obj`` supports
+  ``__dlpack__()``, and returns an ``ndarray``.
 
 DLPack is currently a ~200 LoC header, and is meant to be included directly, so
 no external dependency is needed. Implementation should be straightforward.
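As orientation for reviewers, here is a minimal sketch of the interchange flow the NEP text above describes, using the names this PR introduces (`ndarray.__dlpack__`, `ndarray.__dlpack_device__`, `np._from_dlpack`); the shared-buffer assertion reflects the view semantics implemented in `dlpack.c` further down in this diff:

    import numpy as np

    x = np.arange(5)

    # __dlpack__() wraps a DLManagedTensor C struct in a PyCapsule named "dltensor".
    capsule = x.__dlpack__()

    # __dlpack_device__() reports (DLDeviceType, device_id); CPU arrays give (1, 0).
    assert x.__dlpack_device__() == (1, 0)

    # _from_dlpack() accepts any object implementing __dlpack__() and returns an
    # ndarray viewing the exporter's memory rather than copying it.
    y = np._from_dlpack(x)
    y[0] = 42
    assert x[0] == 42  # x and y share one data buffer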
diff --git a/doc/neps/nep-0049.rst b/doc/neps/nep-0049.rst
index 4758edb35..3bd1d102c 100644
--- a/doc/neps/nep-0049.rst
+++ b/doc/neps/nep-0049.rst
@@ -109,6 +109,15 @@
 The name of the handler will be exposed on the python level via a
 ``numpy.core.multiarray.get_handler_name()`` it will return the name of the
 handler that will be used to allocate data for the next new `ndarray`.
 
+The version of the handler will be exposed on the python level via a
+``numpy.core.multiarray.get_handler_version(arr)`` function. If called as
+``numpy.core.multiarray.get_handler_version()`` it will return the version of
+the handler that will be used to allocate data for the next new `ndarray`.
+
+The version, currently 1, allows for future enhancements to the
+``PyDataMemAllocator``. If fields are added, they must be added to the end.
+
+
 NumPy C-API functions
 =====================
 
@@ -119,7 +128,8 @@ NumPy C-API functions
     .. code-block:: c
 
         typedef struct {
-            char name[128];  /* multiple of 64 to keep the struct aligned */
+            char name[127];  /* multiple of 64 to keep the struct aligned */
+            uint8_t version; /* currently 1 */
             PyDataMemAllocator allocator;
         } PyDataMem_Handler;
 
@@ -279,6 +289,7 @@ the ``sz`` argument is correct.
 
         static PyDataMem_Handler new_handler = {
             "secret_data_allocator",
+            1,
             {
                 &new_handler_ctx,
                 shift_alloc,      /* malloc */
diff --git a/doc/release/upcoming_changes/19083.new_feature.rst b/doc/release/upcoming_changes/19083.new_feature.rst
new file mode 100644
index 000000000..92f00c0d6
--- /dev/null
+++ b/doc/release/upcoming_changes/19083.new_feature.rst
@@ -0,0 +1,6 @@
+Add NEP 47-compatible dlpack support
+------------------------------------
+
+Add a ``ndarray.__dlpack__()`` method which returns a ``dlpack`` C structure
+wrapped in a ``PyCapsule``. Also add a ``np._from_dlpack(obj)`` function, where
+``obj`` supports ``__dlpack__()``, and returns an ``ndarray``.
diff --git a/doc/source/reference/c-api/data_memory.rst b/doc/source/reference/c-api/data_memory.rst
index 11a37adc4..b779026b4 100644
--- a/doc/source/reference/c-api/data_memory.rst
+++ b/doc/source/reference/c-api/data_memory.rst
@@ -62,7 +62,8 @@ reallocate or free the data memory of the instance.
 .. code-block:: c
 
    typedef struct {
-       char name[128];  /* multiple of 64 to keep the struct aligned */
+       char name[127];  /* multiple of 64 to keep the struct aligned */
+       uint8_t version; /* currently 1 */
        PyDataMemAllocator allocator;
    } PyDataMem_Handler;
 
diff --git a/numpy/__init__.pyi b/numpy/__init__.pyi
index 637f8578a..b2e9eec77 100644
--- a/numpy/__init__.pyi
+++ b/numpy/__init__.pyi
@@ -1413,6 +1413,7 @@ _SupportsBuffer = Union[
 
 _T = TypeVar("_T")
 _T_co = TypeVar("_T_co", covariant=True)
+_T_contra = TypeVar("_T_contra", contravariant=True)
 _2Tuple = Tuple[_T, _T]
 _CastingKind = L["no", "equiv", "safe", "same_kind", "unsafe"]
 
@@ -1432,6 +1433,10 @@ _ArrayTD64_co = NDArray[Union[bool_, integer[Any], timedelta64]]
 # Introduce an alias for `dtype` to avoid naming conflicts.
 _dtype = dtype
 
+# `builtins.PyCapsule` unfortunately lacks annotations as of the moment;
+# use `Any` as a stopgap measure
+_PyCapsule = Any
+
 class _SupportsItem(Protocol[_T_co]):
     def item(self, args: Any, /) -> _T_co: ...
 
@@ -2439,6 +2444,12 @@ class ndarray(_ArrayOrScalarCommon, Generic[_ShapeType, _DType_co]):
     def __ior__(self: NDArray[signedinteger[_NBit1]], other: _ArrayLikeInt_co) -> NDArray[signedinteger[_NBit1]]: ...
     @overload
     def __ior__(self: NDArray[object_], other: Any) -> NDArray[object_]: ...
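A sketch of how the Python-level version query described in the NEP text above is expected to behave, mirroring the existing `get_handler_name()`; the values shown in comments are illustrative and assume the default allocator is active:

    import numpy as np
    from numpy.core.multiarray import get_handler_name, get_handler_version

    a = np.empty(10)

    # Without an argument: inspect the handler that will serve the *next*
    # allocation in the current context.
    print(get_handler_name())      # 'default_allocator'
    print(get_handler_version())   # 1

    # With an ndarray: inspect the handler that allocated its data. This
    # returns None when `a` does not own its memory (then check a.base).
    print(get_handler_version(a))  # 1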
+    @overload
+    def __ior__(self: NDArray[_ScalarType], other: _RecursiveSequence) -> NDArray[_ScalarType]: ...
+    @overload
+    def __dlpack__(self: NDArray[number[Any]], *, stream: None = ...) -> _PyCapsule: ...
+    @overload
+    def __dlpack_device__(self) -> Tuple[int, L[0]]: ...
 
     # Keep `dtype` at the bottom to avoid name conflicts with `np.dtype`
     @property
@@ -4320,3 +4331,9 @@ class chararray(ndarray[_ShapeType, _CharDType]):
 
 # NOTE: Deprecated
 # class MachAr: ...
+
+class _SupportsDLPack(Protocol[_T_contra]):
+    def __dlpack__(self, *, stream: None | _T_contra = ...) -> _PyCapsule: ...
+
+def _from_dlpack(__obj: _SupportsDLPack[None]) -> NDArray[Any]: ...
+
diff --git a/numpy/array_api/__init__.py b/numpy/array_api/__init__.py
index d8b29057e..36e3f3ed5 100644
--- a/numpy/array_api/__init__.py
+++ b/numpy/array_api/__init__.py
@@ -169,6 +169,7 @@ __all__ += [
 ]
 
 from ._data_type_functions import (
+    astype,
     broadcast_arrays,
     broadcast_to,
     can_cast,
@@ -178,6 +179,7 @@
 )
 
 __all__ += [
+    "astype",
     "broadcast_arrays",
     "broadcast_to",
     "can_cast",
@@ -358,9 +360,9 @@ from ._searching_functions import argmax, argmin, nonzero, where
 
 __all__ += ["argmax", "argmin", "nonzero", "where"]
 
-from ._set_functions import unique
+from ._set_functions import unique_all, unique_counts, unique_inverse, unique_values
 
-__all__ += ["unique"]
+__all__ += ["unique_all", "unique_counts", "unique_inverse", "unique_values"]
 
 from ._sorting_functions import argsort, sort
 
diff --git a/numpy/array_api/_array_object.py b/numpy/array_api/_array_object.py
index ef66c5efd..dc74bb8c5 100644
--- a/numpy/array_api/_array_object.py
+++ b/numpy/array_api/_array_object.py
@@ -32,7 +32,7 @@ from ._dtypes import (
 from typing import TYPE_CHECKING, Optional, Tuple, Union, Any
 
 if TYPE_CHECKING:
-    from ._typing import PyCapsule, Device, Dtype
+    from ._typing import Any, PyCapsule, Device, Dtype
 
 import numpy as np
 
@@ -99,9 +99,13 @@ class Array:
         """
         Performs the operation __repr__.
         """
-        prefix = "Array("
         suffix = f", dtype={self.dtype.name})"
-        mid = np.array2string(self._array, separator=', ', prefix=prefix, suffix=suffix)
+        if 0 in self.shape:
+            prefix = "empty("
+            mid = str(self.shape)
+        else:
+            prefix = "Array("
+            mid = np.array2string(self._array, separator=', ', prefix=prefix, suffix=suffix)
         return prefix + mid + suffix
 
     # These are various helper functions to make the array behavior match the
@@ -244,6 +248,10 @@ class Array:
         The following cases are allowed by NumPy, but not specified by the array
         API specification:
 
+        - Indices must not include an implicit ellipsis at the end. That is,
+          every axis of an array must be explicitly indexed or an ellipsis
+          included.
+
         - The start and stop of a slice may not be out of bounds. In
           particular, for a slice ``i:j:k`` on an axis of size ``n``, only the
          following are allowed:
@@ -270,6 +278,10 @@
                 return key
             if shape == ():
                 return key
+            if len(shape) > 1:
+                raise IndexError(
+                    "Multidimensional arrays must include an index for every axis or use an ellipsis"
+                )
             size = shape[0]
             # Ensure invalid slice entries are passed through.
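The new `_validate_index` checks above make the `numpy.array_api` namespace stricter than NumPy proper. A short sketch of the user-visible effect; the failing cases match the ones exercised in `test_validate_index` later in this diff:

    import numpy.array_api as xp  # emits a UserWarning; the module is experimental

    a = xp.ones((3, 4))

    a[0, :]    # OK: one index per axis
    a[0, ...]  # OK: trailing axes covered by an explicit ellipsis

    # Plain NumPy accepts these (implicit trailing ellipsis), but the
    # array_api Array now raises IndexError for each of them:
    for bad in (lambda: a[0], lambda: a[:], lambda: a[()]):
        try:
            bad()
        except IndexError:
            pass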
if key.start is not None: @@ -277,7 +289,7 @@ class Array: operator.index(key.start) except TypeError: return key - if not (-size <= key.start <= max(0, size - 1)): + if not (-size <= key.start <= size): raise IndexError( "Slices with out-of-bounds start are not allowed in the array API namespace" ) @@ -322,6 +334,10 @@ class Array: zip(key[:ellipsis_i:-1], shape[:ellipsis_i:-1]) ): Array._validate_index(idx, (size,)) + if n_ellipsis == 0 and len(key) < len(shape): + raise IndexError( + "Multidimensional arrays must include an index for every axis or use an ellipsis" + ) return key elif isinstance(key, bool): return key @@ -339,7 +355,12 @@ class Array: "newaxis indices are not allowed in the array API namespace" ) try: - return operator.index(key) + key = operator.index(key) + if shape is not None and len(shape) > 1: + raise IndexError( + "Multidimensional arrays must include an index for every axis or use an ellipsis" + ) + return key except TypeError: # Note: This also omits boolean arrays that are not already in # Array() form, like a list of booleans. @@ -403,16 +424,14 @@ class Array: """ Performs the operation __dlpack__. """ - res = self._array.__dlpack__(stream=stream) - return self.__class__._new(res) + return self._array.__dlpack__(stream=stream) def __dlpack_device__(self: Array, /) -> Tuple[IntEnum, int]: """ Performs the operation __dlpack_device__. """ # Note: device support is required for this - res = self._array.__dlpack_device__() - return self.__class__._new(res) + return self._array.__dlpack_device__() def __eq__(self: Array, other: Union[int, float, bool, Array], /) -> Array: """ @@ -527,13 +546,6 @@ class Array: res = self._array.__le__(other._array) return self.__class__._new(res) - # Note: __len__ may end up being removed from the array API spec. - def __len__(self, /) -> int: - """ - Performs the operation __len__. - """ - return self._array.__len__() - def __lshift__(self: Array, other: Union[int, Array], /) -> Array: """ Performs the operation __lshift__. 
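The slice-bound change above widens the allowed `start` range from `-size <= start <= max(0, size - 1)` to `-size <= start <= size`. A tiny sketch of the boundary case this fixes, consistent with the updated `test_validate_index` expectations later in the diff (the shape here is illustrative):

    import numpy.array_api as xp

    a = xp.ones((4,))

    a[4:]      # now valid: start == size yields an empty slice, as in plain NumPy
    try:
        a[5:]  # still rejected: start is out of bounds for the array API
    except IndexError:
        pass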
@@ -995,7 +1007,9 @@ class Array: res = self._array.__rxor__(other._array) return self.__class__._new(res) - def to_device(self: Array, device: Device, /) -> Array: + def to_device(self: Array, device: Device, /, stream: None = None) -> Array: + if stream is not None: + raise ValueError("The stream argument to to_device() is not supported") if device == 'cpu': return self raise ValueError(f"Unsupported device {device!r}") diff --git a/numpy/array_api/_creation_functions.py b/numpy/array_api/_creation_functions.py index e36807468..23beec444 100644 --- a/numpy/array_api/_creation_functions.py +++ b/numpy/array_api/_creation_functions.py @@ -9,7 +9,6 @@ if TYPE_CHECKING: Device, Dtype, NestedSequence, - SupportsDLPack, SupportsBufferProtocol, ) from collections.abc import Sequence @@ -36,7 +35,6 @@ def asarray( int, float, NestedSequence[bool | int | float], - SupportsDLPack, SupportsBufferProtocol, ], /, @@ -60,7 +58,9 @@ def asarray( if copy is False: # Note: copy=False is not yet implemented in np.asarray raise NotImplementedError("copy=False is not yet implemented") - if isinstance(obj, Array) and (dtype is None or obj.dtype == dtype): + if isinstance(obj, Array): + if dtype is not None and obj.dtype != dtype: + copy = True if copy is True: return Array._new(np.array(obj._array, copy=True, dtype=dtype)) return obj @@ -152,8 +152,9 @@ def eye( def from_dlpack(x: object, /) -> Array: - # Note: dlpack support is not yet implemented on Array - raise NotImplementedError("DLPack support is not yet implemented") + from ._array_object import Array + + return Array._new(np._from_dlpack(x)) def full( @@ -240,6 +241,12 @@ def meshgrid(*arrays: Array, indexing: str = "xy") -> List[Array]: """ from ._array_object import Array + # Note: unlike np.meshgrid, only inputs with all the same dtype are + # allowed + + if len({a.dtype for a in arrays}) > 1: + raise ValueError("meshgrid inputs must all have the same dtype") + return [ Array._new(array) for array in np.meshgrid(*[a._array for a in arrays], indexing=indexing) diff --git a/numpy/array_api/_data_type_functions.py b/numpy/array_api/_data_type_functions.py index 7ccbe9469..e4d6db61b 100644 --- a/numpy/array_api/_data_type_functions.py +++ b/numpy/array_api/_data_type_functions.py @@ -13,6 +13,13 @@ if TYPE_CHECKING: import numpy as np +# Note: astype is a function, not an array method as in NumPy. +def astype(x: Array, dtype: Dtype, /, *, copy: bool = True) -> Array: + if not copy and dtype == x.dtype: + return x + return Array._new(x._array.astype(dtype=dtype, copy=copy)) + + def broadcast_arrays(*arrays: Array) -> List[Array]: """ Array API compatible wrapper for :py:func:`np.broadcast_arrays <numpy.broadcast_arrays>`. 
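Because the array API spells casting as a standalone function rather than an `ndarray` method, here is a brief sketch of the `astype` semantics added in the `_data_type_functions.py` hunk above:

    import numpy.array_api as xp

    a = xp.asarray([1, 2, 3], dtype=xp.int32)

    b = xp.astype(a, xp.float64)             # copies by default
    c = xp.astype(a, xp.int32, copy=False)   # same dtype and copy=False: no copy
    assert c is a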
diff --git a/numpy/array_api/_searching_functions.py b/numpy/array_api/_searching_functions.py index 3dcef61c3..40f5a4d2e 100644 --- a/numpy/array_api/_searching_functions.py +++ b/numpy/array_api/_searching_functions.py @@ -43,4 +43,5 @@ def where(condition: Array, x1: Array, x2: Array, /) -> Array: """ # Call result type here just to raise on disallowed type combinations _result_type(x1.dtype, x2.dtype) + x1, x2 = Array._normalize_two_args(x1, x2) return Array._new(np.where(condition._array, x1._array, x2._array)) diff --git a/numpy/array_api/_set_functions.py b/numpy/array_api/_set_functions.py index 357f238f5..05ee7e555 100644 --- a/numpy/array_api/_set_functions.py +++ b/numpy/array_api/_set_functions.py @@ -2,19 +2,82 @@ from __future__ import annotations from ._array_object import Array -from typing import Tuple, Union +from typing import NamedTuple import numpy as np +# Note: np.unique() is split into four functions in the array API: +# unique_all, unique_counts, unique_inverse, and unique_values (this is done +# to remove polymorphic return types). -def unique( - x: Array, - /, - *, - return_counts: bool = False, - return_index: bool = False, - return_inverse: bool = False, -) -> Union[Array, Tuple[Array, ...]]: +# Note: The various unique() functions are supposed to return multiple NaNs. +# This does not match the NumPy behavior, however, this is currently left as a +# TODO in this implementation as this behavior may be reverted in np.unique(). +# See https://github.com/numpy/numpy/issues/20326. + +# Note: The functions here return a namedtuple (np.unique() returns a normal +# tuple). + +class UniqueAllResult(NamedTuple): + values: Array + indices: Array + inverse_indices: Array + counts: Array + + +class UniqueCountsResult(NamedTuple): + values: Array + counts: Array + + +class UniqueInverseResult(NamedTuple): + values: Array + inverse_indices: Array + + +def unique_all(x: Array, /) -> UniqueAllResult: + """ + Array API compatible wrapper for :py:func:`np.unique <numpy.unique>`. + + See its docstring for more information. + """ + res = np.unique( + x._array, + return_counts=True, + return_index=True, + return_inverse=True, + ) + + return UniqueAllResult(*[Array._new(i) for i in res]) + + +def unique_counts(x: Array, /) -> UniqueCountsResult: + res = np.unique( + x._array, + return_counts=True, + return_index=False, + return_inverse=False, + ) + + return UniqueCountsResult(*[Array._new(i) for i in res]) + + +def unique_inverse(x: Array, /) -> UniqueInverseResult: + """ + Array API compatible wrapper for :py:func:`np.unique <numpy.unique>`. + + See its docstring for more information. + """ + res = np.unique( + x._array, + return_counts=False, + return_index=False, + return_inverse=True, + ) + return UniqueInverseResult(*[Array._new(i) for i in res]) + + +def unique_values(x: Array, /) -> Array: """ Array API compatible wrapper for :py:func:`np.unique <numpy.unique>`. 
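To make the removal of polymorphic return types concrete, a hedged sketch of the four `unique_*` functions defined in the `_set_functions.py` hunk above (the results shown in comments are illustrative):

    import numpy.array_api as xp

    x = xp.asarray([1, 2, 2, 3, 3, 3])

    xp.unique_values(x)         # Array([1, 2, 3])

    res = xp.unique_counts(x)   # a namedtuple, unlike np.unique's plain tuple
    res.values                  # Array([1, 2, 3])
    res.counts                  # Array([1, 2, 3])

    full = xp.unique_all(x)     # fields: values, indices, inverse_indices, counts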
@@ -22,10 +85,8 @@ def unique( """ res = np.unique( x._array, - return_counts=return_counts, - return_index=return_index, - return_inverse=return_inverse, + return_counts=False, + return_index=False, + return_inverse=False, ) - if isinstance(res, tuple): - return tuple(Array._new(i) for i in res) return Array._new(res) diff --git a/numpy/array_api/tests/test_array_object.py b/numpy/array_api/tests/test_array_object.py index fb42cf621..12479d765 100644 --- a/numpy/array_api/tests/test_array_object.py +++ b/numpy/array_api/tests/test_array_object.py @@ -3,7 +3,7 @@ import operator from numpy.testing import assert_raises import numpy as np -from .. import ones, asarray, result_type +from .. import ones, asarray, result_type, all, equal from .._dtypes import ( _all_dtypes, _boolean_dtypes, @@ -39,18 +39,18 @@ def test_validate_index(): assert_raises(IndexError, lambda: a[:-4]) assert_raises(IndexError, lambda: a[:3:-1]) assert_raises(IndexError, lambda: a[:-5:-1]) - assert_raises(IndexError, lambda: a[3:]) + assert_raises(IndexError, lambda: a[4:]) assert_raises(IndexError, lambda: a[-4:]) - assert_raises(IndexError, lambda: a[3::-1]) + assert_raises(IndexError, lambda: a[4::-1]) assert_raises(IndexError, lambda: a[-4::-1]) assert_raises(IndexError, lambda: a[...,:5]) assert_raises(IndexError, lambda: a[...,:-5]) - assert_raises(IndexError, lambda: a[...,:4:-1]) + assert_raises(IndexError, lambda: a[...,:5:-1]) assert_raises(IndexError, lambda: a[...,:-6:-1]) - assert_raises(IndexError, lambda: a[...,4:]) + assert_raises(IndexError, lambda: a[...,5:]) assert_raises(IndexError, lambda: a[...,-5:]) - assert_raises(IndexError, lambda: a[...,4::-1]) + assert_raises(IndexError, lambda: a[...,5::-1]) assert_raises(IndexError, lambda: a[...,-5::-1]) # Boolean indices cannot be part of a larger tuple index @@ -74,6 +74,11 @@ def test_validate_index(): assert_raises(IndexError, lambda: a[None, ...]) assert_raises(IndexError, lambda: a[..., None]) + # Multiaxis indices must contain exactly as many indices as dimensions + assert_raises(IndexError, lambda: a[()]) + assert_raises(IndexError, lambda: a[0,]) + assert_raises(IndexError, lambda: a[0]) + assert_raises(IndexError, lambda: a[:]) def test_operators(): # For every operator, we test that it works for the required type @@ -291,8 +296,8 @@ def test_device_property(): a = ones((3, 4)) assert a.device == 'cpu' - assert np.array_equal(a.to_device('cpu'), a) + assert all(equal(a.to_device('cpu'), a)) assert_raises(ValueError, lambda: a.to_device('gpu')) - assert np.array_equal(asarray(a, device='cpu'), a) + assert all(equal(asarray(a, device='cpu'), a)) assert_raises(ValueError, lambda: asarray(a, device='gpu')) diff --git a/numpy/array_api/tests/test_creation_functions.py b/numpy/array_api/tests/test_creation_functions.py index 7b633eaf1..ebbb6aab3 100644 --- a/numpy/array_api/tests/test_creation_functions.py +++ b/numpy/array_api/tests/test_creation_functions.py @@ -11,11 +11,13 @@ from .._creation_functions import ( full, full_like, linspace, + meshgrid, ones, ones_like, zeros, zeros_like, ) +from .._dtypes import float32, float64 from .._array_object import Array @@ -124,3 +126,11 @@ def test_zeros_like_errors(): assert_raises(ValueError, lambda: zeros_like(asarray(1), device="gpu")) assert_raises(ValueError, lambda: zeros_like(asarray(1), dtype=int)) assert_raises(ValueError, lambda: zeros_like(asarray(1), dtype="i")) + +def test_meshgrid_dtype_errors(): + # Doesn't raise + meshgrid() + meshgrid(asarray([1.], dtype=float32)) + meshgrid(asarray([1.], 
dtype=float32), asarray([1.], dtype=float32)) + + assert_raises(ValueError, lambda: meshgrid(asarray([1.], dtype=float32), asarray([1.], dtype=float64))) diff --git a/numpy/core/_add_newdocs.py b/numpy/core/_add_newdocs.py index c8a24db0c..078c58976 100644 --- a/numpy/core/_add_newdocs.py +++ b/numpy/core/_add_newdocs.py @@ -1573,6 +1573,19 @@ add_newdoc('numpy.core.multiarray', 'frombuffer', array_function_like_doc, )) +add_newdoc('numpy.core.multiarray', '_from_dlpack', + """ + _from_dlpack(x, /) + + Create a NumPy array from an object implementing the ``__dlpack__`` + protocol. + + See Also + -------- + `Array API documentation + <https://data-apis.org/array-api/latest/design_topics/data_interchange.html#syntax-for-data-interchange-with-dlpack>`_ + """) + add_newdoc('numpy.core', 'fastCopyAndTranspose', """_fastCopyAndTranspose(a)""") @@ -2263,6 +2276,15 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('__array_priority__', add_newdoc('numpy.core.multiarray', 'ndarray', ('__array_struct__', """Array protocol: C-struct side.""")) +add_newdoc('numpy.core.multiarray', 'ndarray', ('__dlpack__', + """a.__dlpack__(*, stream=None) + + DLPack Protocol: Part of the Array API.""")) + +add_newdoc('numpy.core.multiarray', 'ndarray', ('__dlpack_device__', + """a.__dlpack_device__() + + DLPack Protocol: Part of the Array API.""")) add_newdoc('numpy.core.multiarray', 'ndarray', ('base', """ @@ -4737,6 +4759,16 @@ add_newdoc('numpy.core.multiarray', 'get_handler_name', memory, in which case you can traverse ``a.base`` for a memory handler. """) +add_newdoc('numpy.core.multiarray', 'get_handler_version', + """ + get_handler_version(a: ndarray) -> int,None + + Return the version of the memory handler used by `a`. If not provided, + return the version of the memory handler that will be used to allocate data + for the next `ndarray` in this context. May return None if `a` does not own + its memory, in which case you can traverse ``a.base`` for a memory handler. 
+ """) + add_newdoc('numpy.core.multiarray', '_set_madvise_hugepage', """ _set_madvise_hugepage(enabled: bool) -> bool diff --git a/numpy/core/code_generators/genapi.py b/numpy/core/code_generators/genapi.py index c2458c2b5..b401ee6a5 100644 --- a/numpy/core/code_generators/genapi.py +++ b/numpy/core/code_generators/genapi.py @@ -41,6 +41,7 @@ API_FILES = [join('multiarray', 'alloc.c'), join('multiarray', 'datetime_busdaycal.c'), join('multiarray', 'datetime_strings.c'), join('multiarray', 'descriptor.c'), + join('multiarray', 'dlpack.c'), join('multiarray', 'dtypemeta.c'), join('multiarray', 'einsum.c.src'), join('multiarray', 'flagsobject.c'), diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py index 3a27a34cd..292d9e0d3 100644 --- a/numpy/core/code_generators/generate_umath.py +++ b/numpy/core/code_generators/generate_umath.py @@ -827,7 +827,7 @@ defdict = { docstrings.get('numpy.core.umath.ceil'), None, TD('e', f='ceil', astype={'e': 'f'}), - TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]), + TD(inexactvec, dispatch=[('loops_unary_fp', 'fd')]), TD('fdg', f='ceil'), TD(O, f='npy_ObjectCeil'), ), diff --git a/numpy/core/include/numpy/ndarraytypes.h b/numpy/core/include/numpy/ndarraytypes.h index 80177e2bb..2607fb732 100644 --- a/numpy/core/include/numpy/ndarraytypes.h +++ b/numpy/core/include/numpy/ndarraytypes.h @@ -674,10 +674,15 @@ typedef struct { void* (*calloc) (void *ctx, size_t nelem, size_t elsize); void* (*realloc) (void *ctx, void *ptr, size_t new_size); void (*free) (void *ctx, void *ptr, size_t size); + /* + * This is the end of the version=1 struct. Only add new fields after + * this line + */ } PyDataMemAllocator; typedef struct { - char name[128]; /* multiple of 64 to keep the struct aligned */ + char name[127]; /* multiple of 64 to keep the struct aligned */ + uint8_t version; /* currently 1 */ PyDataMemAllocator allocator; } PyDataMem_Handler; diff --git a/numpy/core/include/numpy/ufuncobject.h b/numpy/core/include/numpy/ufuncobject.h index 3f184bd45..1d7050bbe 100644 --- a/numpy/core/include/numpy/ufuncobject.h +++ b/numpy/core/include/numpy/ufuncobject.h @@ -173,11 +173,8 @@ typedef struct _tagPyUFuncObject { * but this was never implemented. (This is also why the above * selector is called the "legacy" selector.) */ - #if PY_VERSION_HEX >= 0x03080000 vectorcallfunc vectorcall; - #else - void *reserved2; - #endif + /* Was previously the `PyUFunc_MaskedInnerLoopSelectionFunc` */ void *_always_null_previously_masked_innerloop_selector; diff --git a/numpy/core/multiarray.py b/numpy/core/multiarray.py index 351cd3a1b..f88d75978 100644 --- a/numpy/core/multiarray.py +++ b/numpy/core/multiarray.py @@ -14,8 +14,9 @@ from ._multiarray_umath import * # noqa: F403 # do not change them. 
issue gh-15518 # _get_ndarray_c_version is semi-public, on purpose not added to __all__ from ._multiarray_umath import ( - _fastCopyAndTranspose, _flagdict, _insert, _reconstruct, _vec_string, - _ARRAY_API, _monotonicity, _get_ndarray_c_version, _set_madvise_hugepage, + _fastCopyAndTranspose, _flagdict, _from_dlpack, _insert, _reconstruct, + _vec_string, _ARRAY_API, _monotonicity, _get_ndarray_c_version, + _set_madvise_hugepage, ) __all__ = [ @@ -23,29 +24,30 @@ __all__ = [ 'ITEM_HASOBJECT', 'ITEM_IS_POINTER', 'LIST_PICKLE', 'MAXDIMS', 'MAY_SHARE_BOUNDS', 'MAY_SHARE_EXACT', 'NEEDS_INIT', 'NEEDS_PYAPI', 'RAISE', 'USE_GETITEM', 'USE_SETITEM', 'WRAP', '_fastCopyAndTranspose', - '_flagdict', '_insert', '_reconstruct', '_vec_string', '_monotonicity', - 'add_docstring', 'arange', 'array', 'asarray', 'asanyarray', - 'ascontiguousarray', 'asfortranarray', 'bincount', 'broadcast', - 'busday_count', 'busday_offset', 'busdaycalendar', 'can_cast', + '_flagdict', '_from_dlpack', '_insert', '_reconstruct', '_vec_string', + '_monotonicity', 'add_docstring', 'arange', 'array', 'asarray', + 'asanyarray', 'ascontiguousarray', 'asfortranarray', 'bincount', + 'broadcast', 'busday_count', 'busday_offset', 'busdaycalendar', 'can_cast', 'compare_chararrays', 'concatenate', 'copyto', 'correlate', 'correlate2', 'count_nonzero', 'c_einsum', 'datetime_as_string', 'datetime_data', 'dot', 'dragon4_positional', 'dragon4_scientific', 'dtype', 'empty', 'empty_like', 'error', 'flagsobj', 'flatiter', 'format_longfloat', - 'frombuffer', 'fromfile', 'fromiter', 'fromstring', 'get_handler_name', - 'inner', 'interp', 'interp_complex', 'is_busday', 'lexsort', - 'matmul', 'may_share_memory', 'min_scalar_type', 'ndarray', 'nditer', - 'nested_iters', 'normalize_axis_index', 'packbits', - 'promote_types', 'putmask', 'ravel_multi_index', 'result_type', 'scalar', - 'set_datetimeparse_function', 'set_legacy_print_mode', 'set_numeric_ops', - 'set_string_function', 'set_typeDict', 'shares_memory', - 'tracemalloc_domain', 'typeinfo', 'unpackbits', 'unravel_index', 'vdot', - 'where', 'zeros'] + 'frombuffer', 'fromfile', 'fromiter', 'fromstring', + 'get_handler_name', 'get_handler_version', 'inner', 'interp', + 'interp_complex', 'is_busday', 'lexsort', 'matmul', 'may_share_memory', + 'min_scalar_type', 'ndarray', 'nditer', 'nested_iters', + 'normalize_axis_index', 'packbits', 'promote_types', 'putmask', + 'ravel_multi_index', 'result_type', 'scalar', 'set_datetimeparse_function', + 'set_legacy_print_mode', 'set_numeric_ops', 'set_string_function', + 'set_typeDict', 'shares_memory', 'tracemalloc_domain', 'typeinfo', + 'unpackbits', 'unravel_index', 'vdot', 'where', 'zeros'] # For backward compatibility, make sure pickle imports these functions from here _reconstruct.__module__ = 'numpy.core.multiarray' scalar.__module__ = 'numpy.core.multiarray' +_from_dlpack.__module__ = 'numpy' arange.__module__ = 'numpy' array.__module__ = 'numpy' asarray.__module__ = 'numpy' diff --git a/numpy/core/numeric.py b/numpy/core/numeric.py index 1654e8364..344d40d93 100644 --- a/numpy/core/numeric.py +++ b/numpy/core/numeric.py @@ -13,8 +13,8 @@ from .multiarray import ( WRAP, arange, array, asarray, asanyarray, ascontiguousarray, asfortranarray, broadcast, can_cast, compare_chararrays, concatenate, copyto, dot, dtype, empty, - empty_like, flatiter, frombuffer, fromfile, fromiter, fromstring, - inner, lexsort, matmul, may_share_memory, + empty_like, flatiter, frombuffer, _from_dlpack, fromfile, fromiter, + fromstring, inner, lexsort, matmul, may_share_memory, 
min_scalar_type, ndarray, nditer, nested_iters, promote_types, putmask, result_type, set_numeric_ops, shares_memory, vdot, where, zeros, normalize_axis_index) @@ -41,7 +41,7 @@ __all__ = [ 'newaxis', 'ndarray', 'flatiter', 'nditer', 'nested_iters', 'ufunc', 'arange', 'array', 'asarray', 'asanyarray', 'ascontiguousarray', 'asfortranarray', 'zeros', 'count_nonzero', 'empty', 'broadcast', 'dtype', - 'fromstring', 'fromfile', 'frombuffer', 'where', + 'fromstring', 'fromfile', 'frombuffer', '_from_dlpack', 'where', 'argwhere', 'copyto', 'concatenate', 'fastCopyAndTranspose', 'lexsort', 'set_numeric_ops', 'can_cast', 'promote_types', 'min_scalar_type', 'result_type', 'isfortran', 'empty_like', 'zeros_like', 'ones_like', diff --git a/numpy/core/setup.py b/numpy/core/setup.py index 3e1ed4c9b..2c99060ec 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -740,6 +740,7 @@ def configuration(parent_package='',top_path=None): ####################################################################### common_deps = [ + join('src', 'common', 'dlpack', 'dlpack.h'), join('src', 'common', 'array_assign.h'), join('src', 'common', 'binop_override.h'), join('src', 'common', 'cblasfuncs.h'), @@ -749,6 +750,7 @@ def configuration(parent_package='',top_path=None): join('src', 'common', 'npy_cblas.h'), join('src', 'common', 'npy_config.h'), join('src', 'common', 'npy_ctypes.h'), + join('src', 'common', 'npy_dlpack.h'), join('src', 'common', 'npy_extint128.h'), join('src', 'common', 'npy_import.h'), join('src', 'common', 'npy_hashtable.h'), @@ -881,6 +883,7 @@ def configuration(parent_package='',top_path=None): join('src', 'multiarray', 'datetime_busday.c'), join('src', 'multiarray', 'datetime_busdaycal.c'), join('src', 'multiarray', 'descriptor.c'), + join('src', 'multiarray', 'dlpack.c'), join('src', 'multiarray', 'dtypemeta.c'), join('src', 'multiarray', 'dragon4.c'), join('src', 'multiarray', 'dtype_transfer.c'), diff --git a/numpy/core/src/common/dlpack/dlpack.h b/numpy/core/src/common/dlpack/dlpack.h new file mode 100644 index 000000000..29209aee1 --- /dev/null +++ b/numpy/core/src/common/dlpack/dlpack.h @@ -0,0 +1,201 @@ +// Taken from: +// https://github.com/dmlc/dlpack/blob/9b6176fdecb55e9bf39b16f08b96913ed3f275b4/include/dlpack/dlpack.h +/*! + * Copyright (c) 2017 by Contributors + * \file dlpack.h + * \brief The common header of DLPack. + */ +#ifndef DLPACK_DLPACK_H_ +#define DLPACK_DLPACK_H_ + +#ifdef __cplusplus +#define DLPACK_EXTERN_C extern "C" +#else +#define DLPACK_EXTERN_C +#endif + +/*! \brief The current version of dlpack */ +#define DLPACK_VERSION 050 + +/*! \brief DLPACK_DLL prefix for windows */ +#ifdef _WIN32 +#ifdef DLPACK_EXPORTS +#define DLPACK_DLL __declspec(dllexport) +#else +#define DLPACK_DLL __declspec(dllimport) +#endif +#else +#define DLPACK_DLL +#endif + +#include <stdint.h> +#include <stddef.h> + +#ifdef __cplusplus +extern "C" { +#endif +/*! + * \brief The device type in DLDevice. + */ +typedef enum { + /*! \brief CPU device */ + kDLCPU = 1, + /*! \brief CUDA GPU device */ + kDLCUDA = 2, + /*! + * \brief Pinned CUDA CPU memory by cudaMallocHost + */ + kDLCUDAHost = 3, + /*! \brief OpenCL devices. */ + kDLOpenCL = 4, + /*! \brief Vulkan buffer for next generation graphics. */ + kDLVulkan = 7, + /*! \brief Metal for Apple GPU. */ + kDLMetal = 8, + /*! \brief Verilog simulator buffer */ + kDLVPI = 9, + /*! \brief ROCm GPUs for AMD GPUs */ + kDLROCM = 10, + /*! + * \brief Pinned ROCm CPU memory allocated by hipMallocHost + */ + kDLROCMHost = 11, + /*! 
+ * \brief Reserved extension device type, + * used for quickly test extension device + * The semantics can differ depending on the implementation. + */ + kDLExtDev = 12, + /*! + * \brief CUDA managed/unified memory allocated by cudaMallocManaged + */ + kDLCUDAManaged = 13, +} DLDeviceType; + +/*! + * \brief A Device for Tensor and operator. + */ +typedef struct { + /*! \brief The device type used in the device. */ + DLDeviceType device_type; + /*! + * \brief The device index. + * For vanilla CPU memory, pinned memory, or managed memory, this is set to 0. + */ + int device_id; +} DLDevice; + +/*! + * \brief The type code options DLDataType. + */ +typedef enum { + /*! \brief signed integer */ + kDLInt = 0U, + /*! \brief unsigned integer */ + kDLUInt = 1U, + /*! \brief IEEE floating point */ + kDLFloat = 2U, + /*! + * \brief Opaque handle type, reserved for testing purposes. + * Frameworks need to agree on the handle data type for the exchange to be well-defined. + */ + kDLOpaqueHandle = 3U, + /*! \brief bfloat16 */ + kDLBfloat = 4U, + /*! + * \brief complex number + * (C/C++/Python layout: compact struct per complex number) + */ + kDLComplex = 5U, +} DLDataTypeCode; + +/*! + * \brief The data type the tensor can hold. + * + * Examples + * - float: type_code = 2, bits = 32, lanes=1 + * - float4(vectorized 4 float): type_code = 2, bits = 32, lanes=4 + * - int8: type_code = 0, bits = 8, lanes=1 + * - std::complex<float>: type_code = 5, bits = 64, lanes = 1 + */ +typedef struct { + /*! + * \brief Type code of base types. + * We keep it uint8_t instead of DLDataTypeCode for minimal memory + * footprint, but the value should be one of DLDataTypeCode enum values. + * */ + uint8_t code; + /*! + * \brief Number of bits, common choices are 8, 16, 32. + */ + uint8_t bits; + /*! \brief Number of lanes in the type, used for vector types. */ + uint16_t lanes; +} DLDataType; + +/*! + * \brief Plain C Tensor object, does not manage memory. + */ +typedef struct { + /*! + * \brief The opaque data pointer points to the allocated data. This will be + * CUDA device pointer or cl_mem handle in OpenCL. This pointer is always + * aligned to 256 bytes as in CUDA. + * + * For given DLTensor, the size of memory required to store the contents of + * data is calculated as follows: + * + * \code{.c} + * static inline size_t GetDataSize(const DLTensor* t) { + * size_t size = 1; + * for (tvm_index_t i = 0; i < t->ndim; ++i) { + * size *= t->shape[i]; + * } + * size *= (t->dtype.bits * t->dtype.lanes + 7) / 8; + * return size; + * } + * \endcode + */ + void* data; + /*! \brief The device of the tensor */ + DLDevice device; + /*! \brief Number of dimensions */ + int ndim; + /*! \brief The data type of the pointer*/ + DLDataType dtype; + /*! \brief The shape of the tensor */ + int64_t* shape; + /*! + * \brief strides of the tensor (in number of elements, not bytes) + * can be NULL, indicating tensor is compact and row-majored. + */ + int64_t* strides; + /*! \brief The offset in bytes to the beginning pointer to data */ + uint64_t byte_offset; +} DLTensor; + +/*! + * \brief C Tensor object, manage memory of DLTensor. This data structure is + * intended to facilitate the borrowing of DLTensor by another framework. It is + * not meant to transfer the tensor. When the borrowing framework doesn't need + * the tensor, it should call the deleter to notify the host that the resource + * is no longer needed. + */ +typedef struct DLManagedTensor { + /*! \brief DLTensor which is being memory managed */ + DLTensor dl_tensor; + /*! 
\brief the context of the original host framework of DLManagedTensor in + * which DLManagedTensor is used in the framework. It can also be NULL. + */ + void * manager_ctx; + /*! \brief Destructor signature void (*)(void*) - this should be called + * to destruct manager_ctx which holds the DLManagedTensor. It can be NULL + * if there is no way for the caller to provide a reasonable destructor. + * The destructors deletes the argument self as well. + */ + void (*deleter)(struct DLManagedTensor * self); +} DLManagedTensor; +#ifdef __cplusplus +} // DLPACK_EXTERN_C +#endif +#endif // DLPACK_DLPACK_H_ diff --git a/numpy/core/src/common/npy_dlpack.h b/numpy/core/src/common/npy_dlpack.h new file mode 100644 index 000000000..14ca352c0 --- /dev/null +++ b/numpy/core/src/common/npy_dlpack.h @@ -0,0 +1,28 @@ +#include "Python.h" +#include "dlpack/dlpack.h" + +#ifndef NPY_DLPACK_H +#define NPY_DLPACK_H + +// Part of the Array API specification. +#define NPY_DLPACK_CAPSULE_NAME "dltensor" +#define NPY_DLPACK_USED_CAPSULE_NAME "used_dltensor" + +// Used internally by NumPy to store a base object +// as it has to release a reference to the original +// capsule. +#define NPY_DLPACK_INTERNAL_CAPSULE_NAME "numpy_dltensor" + +PyObject * +array_dlpack(PyArrayObject *self, PyObject *const *args, Py_ssize_t len_args, + PyObject *kwnames); + + +PyObject * +array_dlpack_device(PyArrayObject *self, PyObject *NPY_UNUSED(args)); + + +NPY_NO_EXPORT PyObject * +_from_dlpack(PyObject *NPY_UNUSED(self), PyObject *obj); + +#endif diff --git a/numpy/core/src/multiarray/alloc.c b/numpy/core/src/multiarray/alloc.c index e4756264d..d1173410d 100644 --- a/numpy/core/src/multiarray/alloc.c +++ b/numpy/core/src/multiarray/alloc.c @@ -370,6 +370,7 @@ default_free(void *NPY_UNUSED(ctx), void *ptr, size_t size) /* Memory handler global default */ PyDataMem_Handler default_handler = { "default_allocator", + 1, { NULL, /* ctx */ default_malloc, /* malloc */ @@ -395,7 +396,6 @@ PyDataMem_UserNEW(size_t size, PyObject *mem_handler) if (handler == NULL) { return NULL; } - assert(size != 0); result = handler->allocator.malloc(handler->allocator.ctx, size); if (_PyDataMem_eventhook != NULL) { @@ -639,3 +639,40 @@ get_handler_name(PyObject *NPY_UNUSED(self), PyObject *args) Py_DECREF(mem_handler); return name; } + +NPY_NO_EXPORT PyObject * +get_handler_version(PyObject *NPY_UNUSED(self), PyObject *args) +{ + PyObject *arr=NULL; + if (!PyArg_ParseTuple(args, "|O:get_handler_version", &arr)) { + return NULL; + } + if (arr != NULL && !PyArray_Check(arr)) { + PyErr_SetString(PyExc_ValueError, "if supplied, argument must be an ndarray"); + return NULL; + } + PyObject *mem_handler; + PyDataMem_Handler *handler; + PyObject *version; + if (arr != NULL) { + mem_handler = PyArray_HANDLER((PyArrayObject *) arr); + if (mem_handler == NULL) { + Py_RETURN_NONE; + } + Py_INCREF(mem_handler); + } + else { + mem_handler = PyDataMem_GetHandler(); + if (mem_handler == NULL) { + return NULL; + } + } + handler = (PyDataMem_Handler *) PyCapsule_GetPointer(mem_handler, "mem_handler"); + if (handler == NULL) { + Py_DECREF(mem_handler); + return NULL; + } + version = PyLong_FromLong(handler->version); + Py_DECREF(mem_handler); + return version; +} diff --git a/numpy/core/src/multiarray/alloc.h b/numpy/core/src/multiarray/alloc.h index 4f7df1f84..f1ccf0bcd 100644 --- a/numpy/core/src/multiarray/alloc.h +++ b/numpy/core/src/multiarray/alloc.h @@ -47,5 +47,7 @@ extern PyDataMem_Handler default_handler; NPY_NO_EXPORT PyObject * get_handler_name(PyObject 
*NPY_UNUSED(self), PyObject *obj); +NPY_NO_EXPORT PyObject * +get_handler_version(PyObject *NPY_UNUSED(self), PyObject *obj); #endif /* NUMPY_CORE_SRC_MULTIARRAY_ALLOC_H_ */ diff --git a/numpy/core/src/multiarray/array_coercion.c b/numpy/core/src/multiarray/array_coercion.c index 847bdafc3..3c44d312c 100644 --- a/numpy/core/src/multiarray/array_coercion.c +++ b/numpy/core/src/multiarray/array_coercion.c @@ -555,6 +555,7 @@ npy_new_coercion_cache( cache = PyMem_Malloc(sizeof(coercion_cache_obj)); } if (cache == NULL) { + Py_DECREF(arr_or_sequence); PyErr_NoMemory(); return -1; } diff --git a/numpy/core/src/multiarray/compiled_base.c b/numpy/core/src/multiarray/compiled_base.c index 9910fffe6..5853e068b 100644 --- a/numpy/core/src/multiarray/compiled_base.c +++ b/numpy/core/src/multiarray/compiled_base.c @@ -1393,7 +1393,7 @@ arr_add_docstring(PyObject *NPY_UNUSED(dummy), PyObject *args) { PyObject *obj; PyObject *str; - #if PY_VERSION_HEX >= 0x030700A2 && (!defined(PYPY_VERSION_NUM) || PYPY_VERSION_NUM > 0x07030300) + #if !defined(PYPY_VERSION_NUM) || PYPY_VERSION_NUM > 0x07030300 const char *docstr; #else char *docstr; diff --git a/numpy/core/src/multiarray/dlpack.c b/numpy/core/src/multiarray/dlpack.c new file mode 100644 index 000000000..291e60a22 --- /dev/null +++ b/numpy/core/src/multiarray/dlpack.c @@ -0,0 +1,408 @@ +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE + +#define PY_SSIZE_T_CLEAN +#include <Python.h> +#include <dlpack/dlpack.h> + +#include "numpy/arrayobject.h" +#include "common/npy_argparse.h" + +#include "common/dlpack/dlpack.h" +#include "common/npy_dlpack.h" + +static void +array_dlpack_deleter(DLManagedTensor *self) +{ + PyArrayObject *array = (PyArrayObject *)self->manager_ctx; + // This will also free the strides as it's one allocation. + PyMem_Free(self->dl_tensor.shape); + PyMem_Free(self); + Py_XDECREF(array); +} + +/* This is exactly as mandated by dlpack */ +static void dlpack_capsule_deleter(PyObject *self) { + if (PyCapsule_IsValid(self, NPY_DLPACK_USED_CAPSULE_NAME)) { + return; + } + + /* an exception may be in-flight, we must save it in case we create another one */ + PyObject *type, *value, *traceback; + PyErr_Fetch(&type, &value, &traceback); + + DLManagedTensor *managed = + (DLManagedTensor *)PyCapsule_GetPointer(self, NPY_DLPACK_CAPSULE_NAME); + if (managed == NULL) { + PyErr_WriteUnraisable(self); + goto done; + } + /* + * the spec says the deleter can be NULL if there is no way for the caller + * to provide a reasonable destructor. + */ + if (managed->deleter) { + managed->deleter(managed); + /* TODO: is the deleter allowed to set a python exception? */ + assert(!PyErr_Occurred()); + } + +done: + PyErr_Restore(type, value, traceback); +} + +/* used internally, almost identical to dlpack_capsule_deleter() */ +static void array_dlpack_internal_capsule_deleter(PyObject *self) +{ + /* an exception may be in-flight, we must save it in case we create another one */ + PyObject *type, *value, *traceback; + PyErr_Fetch(&type, &value, &traceback); + + DLManagedTensor *managed = + (DLManagedTensor *)PyCapsule_GetPointer(self, NPY_DLPACK_INTERNAL_CAPSULE_NAME); + if (managed == NULL) { + PyErr_WriteUnraisable(self); + goto done; + } + /* + * the spec says the deleter can be NULL if there is no way for the caller + * to provide a reasonable destructor. + */ + if (managed->deleter) { + managed->deleter(managed); + /* TODO: is the deleter allowed to set a python exception? 
*/ + assert(!PyErr_Occurred()); + } + +done: + PyErr_Restore(type, value, traceback); +} + + +// This function cannot return NULL, but it can fail, +// So call PyErr_Occurred to check if it failed after +// calling it. +static DLDevice +array_get_dl_device(PyArrayObject *self) { + DLDevice ret; + ret.device_type = kDLCPU; + ret.device_id = 0; + PyObject *base = PyArray_BASE(self); + // The outer if is due to the fact that NumPy arrays are on the CPU + // by default (if not created from DLPack). + if (PyCapsule_IsValid(base, NPY_DLPACK_INTERNAL_CAPSULE_NAME)) { + DLManagedTensor *managed = PyCapsule_GetPointer( + base, NPY_DLPACK_INTERNAL_CAPSULE_NAME); + if (managed == NULL) { + return ret; + } + return managed->dl_tensor.device; + } + return ret; +} + + +PyObject * +array_dlpack(PyArrayObject *self, + PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames) +{ + PyObject *stream = Py_None; + NPY_PREPARE_ARGPARSER; + if (npy_parse_arguments("__dlpack__", args, len_args, kwnames, + "$stream", NULL, &stream, NULL, NULL, NULL)) { + return NULL; + } + + if (stream != Py_None) { + PyErr_SetString(PyExc_RuntimeError, "NumPy only supports " + "stream=None."); + return NULL; + } + + if ( !(PyArray_FLAGS(self) & NPY_ARRAY_WRITEABLE)) { + PyErr_SetString(PyExc_TypeError, "NumPy currently only supports " + "dlpack for writeable arrays"); + return NULL; + } + + npy_intp itemsize = PyArray_ITEMSIZE(self); + int ndim = PyArray_NDIM(self); + npy_intp *strides = PyArray_STRIDES(self); + npy_intp *shape = PyArray_SHAPE(self); + + if (!PyArray_IS_C_CONTIGUOUS(self) && PyArray_SIZE(self) != 1) { + for (int i = 0; i < ndim; ++i) { + if (strides[i] % itemsize != 0) { + PyErr_SetString(PyExc_RuntimeError, + "DLPack only supports strides which are a multiple of " + "itemsize."); + return NULL; + } + } + } + + DLDataType managed_dtype; + PyArray_Descr *dtype = PyArray_DESCR(self); + + if (PyDataType_ISBYTESWAPPED(dtype)) { + PyErr_SetString(PyExc_TypeError, "DLPack only supports native " + "byte swapping."); + return NULL; + } + + managed_dtype.bits = 8 * itemsize; + managed_dtype.lanes = 1; + + if (PyDataType_ISSIGNED(dtype)) { + managed_dtype.code = kDLInt; + } + else if (PyDataType_ISUNSIGNED(dtype)) { + managed_dtype.code = kDLUInt; + } + else if (PyDataType_ISFLOAT(dtype)) { + // We can't be sure that the dtype is + // IEEE or padded. + if (itemsize > 8) { + PyErr_SetString(PyExc_TypeError, "DLPack only supports IEEE " + "floating point types without padding."); + return NULL; + } + managed_dtype.code = kDLFloat; + } + else if (PyDataType_ISCOMPLEX(dtype)) { + // We can't be sure that the dtype is + // IEEE or padded. + if (itemsize > 16) { + PyErr_SetString(PyExc_TypeError, "DLPack only supports IEEE " + "complex point types without padding."); + return NULL; + } + managed_dtype.code = kDLComplex; + } + else { + PyErr_SetString(PyExc_TypeError, + "DLPack only supports signed/unsigned integers, float " + "and complex dtypes."); + return NULL; + } + + DLDevice device = array_get_dl_device(self); + if (PyErr_Occurred()) { + return NULL; + } + + DLManagedTensor *managed = PyMem_Malloc(sizeof(DLManagedTensor)); + if (managed == NULL) { + PyErr_NoMemory(); + return NULL; + } + + /* + * Note: the `dlpack.h` header suggests/standardizes that `data` must be + * 256-byte aligned. 
We ignore this intentionally, because `__dlpack__` + * standardizes that `byte_offset` must be 0 (for now) to not break pytorch: + * https://github.com/data-apis/array-api/issues/293#issuecomment-964111413 + * + * We further assume that exporting fully unaligned data is OK even without + * `byte_offset` since the standard does not reject it. + * Presumably, pytorch will support importing `byte_offset != 0` and NumPy + * can choose to use it starting about 2023. At that point, it may be + * that NumPy MUST use `byte_offset` to adhere to the standard (as + * specified in the header)! + */ + managed->dl_tensor.data = PyArray_DATA(self); + managed->dl_tensor.byte_offset = 0; + managed->dl_tensor.device = device; + managed->dl_tensor.dtype = managed_dtype; + + int64_t *managed_shape_strides = PyMem_Malloc(sizeof(int64_t) * ndim * 2); + if (managed_shape_strides == NULL) { + PyErr_NoMemory(); + PyMem_Free(managed); + return NULL; + } + + int64_t *managed_shape = managed_shape_strides; + int64_t *managed_strides = managed_shape_strides + ndim; + for (int i = 0; i < ndim; ++i) { + managed_shape[i] = shape[i]; + // Strides in DLPack are items; in NumPy are bytes. + managed_strides[i] = strides[i] / itemsize; + } + + managed->dl_tensor.ndim = ndim; + managed->dl_tensor.shape = managed_shape; + managed->dl_tensor.strides = NULL; + if (PyArray_SIZE(self) != 1 && !PyArray_IS_C_CONTIGUOUS(self)) { + managed->dl_tensor.strides = managed_strides; + } + managed->dl_tensor.byte_offset = 0; + managed->manager_ctx = self; + managed->deleter = array_dlpack_deleter; + + PyObject *capsule = PyCapsule_New(managed, NPY_DLPACK_CAPSULE_NAME, + dlpack_capsule_deleter); + if (capsule == NULL) { + PyMem_Free(managed); + PyMem_Free(managed_shape_strides); + return NULL; + } + + // the capsule holds a reference + Py_INCREF(self); + return capsule; +} + +PyObject * +array_dlpack_device(PyArrayObject *self, PyObject *NPY_UNUSED(args)) +{ + DLDevice device = array_get_dl_device(self); + if (PyErr_Occurred()) { + return NULL; + } + return Py_BuildValue("ii", device.device_type, device.device_id); +} + +NPY_NO_EXPORT PyObject * +_from_dlpack(PyObject *NPY_UNUSED(self), PyObject *obj) { + PyObject *capsule = PyObject_CallMethod((PyObject *)obj->ob_type, + "__dlpack__", "O", obj); + if (capsule == NULL) { + return NULL; + } + + DLManagedTensor *managed = + (DLManagedTensor *)PyCapsule_GetPointer(capsule, + NPY_DLPACK_CAPSULE_NAME); + + if (managed == NULL) { + Py_DECREF(capsule); + return NULL; + } + + const int ndim = managed->dl_tensor.ndim; + if (ndim > NPY_MAXDIMS) { + PyErr_SetString(PyExc_RuntimeError, + "maxdims of DLPack tensor is higher than the supported " + "maxdims."); + Py_DECREF(capsule); + return NULL; + } + + DLDeviceType device_type = managed->dl_tensor.device.device_type; + if (device_type != kDLCPU && + device_type != kDLCUDAHost && + device_type != kDLROCMHost && + device_type != kDLCUDAManaged) { + PyErr_SetString(PyExc_RuntimeError, + "Unsupported device in DLTensor."); + Py_DECREF(capsule); + return NULL; + } + + if (managed->dl_tensor.dtype.lanes != 1) { + PyErr_SetString(PyExc_RuntimeError, + "Unsupported lanes in DLTensor dtype."); + Py_DECREF(capsule); + return NULL; + } + + int typenum = -1; + const uint8_t bits = managed->dl_tensor.dtype.bits; + const npy_intp itemsize = bits / 8; + switch (managed->dl_tensor.dtype.code) { + case kDLInt: + switch (bits) + { + case 8: typenum = NPY_INT8; break; + case 16: typenum = NPY_INT16; break; + case 32: typenum = NPY_INT32; break; + case 64: typenum = 
NPY_INT64; break; + } + break; + case kDLUInt: + switch (bits) + { + case 8: typenum = NPY_UINT8; break; + case 16: typenum = NPY_UINT16; break; + case 32: typenum = NPY_UINT32; break; + case 64: typenum = NPY_UINT64; break; + } + break; + case kDLFloat: + switch (bits) + { + case 16: typenum = NPY_FLOAT16; break; + case 32: typenum = NPY_FLOAT32; break; + case 64: typenum = NPY_FLOAT64; break; + } + break; + case kDLComplex: + switch (bits) + { + case 64: typenum = NPY_COMPLEX64; break; + case 128: typenum = NPY_COMPLEX128; break; + } + break; + } + + if (typenum == -1) { + PyErr_SetString(PyExc_RuntimeError, + "Unsupported dtype in DLTensor."); + Py_DECREF(capsule); + return NULL; + } + + npy_intp shape[NPY_MAXDIMS]; + npy_intp strides[NPY_MAXDIMS]; + + for (int i = 0; i < ndim; ++i) { + shape[i] = managed->dl_tensor.shape[i]; + // DLPack has elements as stride units, NumPy has bytes. + if (managed->dl_tensor.strides != NULL) { + strides[i] = managed->dl_tensor.strides[i] * itemsize; + } + } + + char *data = (char *)managed->dl_tensor.data + + managed->dl_tensor.byte_offset; + + PyArray_Descr *descr = PyArray_DescrFromType(typenum); + if (descr == NULL) { + Py_DECREF(capsule); + return NULL; + } + + PyObject *ret = PyArray_NewFromDescr(&PyArray_Type, descr, ndim, shape, + managed->dl_tensor.strides != NULL ? strides : NULL, data, 0, NULL); + if (ret == NULL) { + Py_DECREF(capsule); + return NULL; + } + + PyObject *new_capsule = PyCapsule_New(managed, + NPY_DLPACK_INTERNAL_CAPSULE_NAME, + array_dlpack_internal_capsule_deleter); + if (new_capsule == NULL) { + Py_DECREF(capsule); + Py_DECREF(ret); + return NULL; + } + + if (PyArray_SetBaseObject((PyArrayObject *)ret, new_capsule) < 0) { + Py_DECREF(capsule); + Py_DECREF(ret); + return NULL; + } + + if (PyCapsule_SetName(capsule, NPY_DLPACK_USED_CAPSULE_NAME) < 0) { + Py_DECREF(capsule); + Py_DECREF(ret); + return NULL; + } + + Py_DECREF(capsule); + return ret; +} + + diff --git a/numpy/core/src/multiarray/methods.c b/numpy/core/src/multiarray/methods.c index 2d66c77dc..2ca8d9288 100644 --- a/numpy/core/src/multiarray/methods.c +++ b/numpy/core/src/multiarray/methods.c @@ -26,6 +26,7 @@ #include "shape.h" #include "strfuncs.h" #include "array_assign.h" +#include "npy_dlpack.h" #include "methods.h" #include "alloc.h" @@ -1821,22 +1822,8 @@ array_reduce_ex_picklebuffer(PyArrayObject *self, int protocol) descr = PyArray_DESCR(self); - /* if the python version is below 3.8, the pickle module does not provide - * built-in support for protocol 5. 
We try importing the pickle5 - * backport instead */ -#if PY_VERSION_HEX >= 0x03080000 /* we expect protocol 5 to be available in Python 3.8 */ pickle_module = PyImport_ImportModule("pickle"); -#else - pickle_module = PyImport_ImportModule("pickle5"); - if (pickle_module == NULL) { - /* for protocol 5, raise a clear ImportError if pickle5 is not found - */ - PyErr_SetString(PyExc_ImportError, "Using pickle protocol 5 " - "requires the pickle5 module for Python >=3.6 and <3.8"); - return NULL; - } -#endif if (pickle_module == NULL){ return NULL; } @@ -2989,5 +2976,13 @@ NPY_NO_EXPORT PyMethodDef array_methods[] = { {"view", (PyCFunction)array_view, METH_FASTCALL | METH_KEYWORDS, NULL}, + // For data interchange between libraries + {"__dlpack__", + (PyCFunction)array_dlpack, + METH_FASTCALL | METH_KEYWORDS, NULL}, + + {"__dlpack_device__", + (PyCFunction)array_dlpack_device, + METH_NOARGS, NULL}, {NULL, NULL, 0, NULL} /* sentinel */ }; diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c index dea828ed9..a854bcb3b 100644 --- a/numpy/core/src/multiarray/multiarraymodule.c +++ b/numpy/core/src/multiarray/multiarraymodule.c @@ -70,6 +70,8 @@ NPY_NO_EXPORT int NPY_NUMUSERTYPES = 0; #include "get_attr_string.h" #include "experimental_public_dtype_api.h" /* _get_experimental_dtype_api */ +#include "npy_dlpack.h" + /* ***************************************************************************** ** INCLUDE GENERATED CODE ** @@ -4231,7 +4233,6 @@ _reload_guard(PyObject *NPY_UNUSED(self)) { Py_RETURN_NONE; } - static struct PyMethodDef array_module_methods[] = { {"_get_implementing_args", (PyCFunction)array__get_implementing_args, @@ -4436,6 +4437,9 @@ static struct PyMethodDef array_module_methods[] = { {"get_handler_name", (PyCFunction) get_handler_name, METH_VARARGS, NULL}, + {"get_handler_version", + (PyCFunction) get_handler_version, + METH_VARARGS, NULL}, {"_add_newdoc_ufunc", (PyCFunction)add_newdoc_ufunc, METH_VARARGS, NULL}, {"_get_sfloat_dtype", @@ -4445,6 +4449,8 @@ static struct PyMethodDef array_module_methods[] = { {"_reload_guard", (PyCFunction)_reload_guard, METH_NOARGS, "Give a warning on reload and big warning in sub-interpreters."}, + {"_from_dlpack", (PyCFunction)_from_dlpack, + METH_O, NULL}, {NULL, NULL, 0, NULL} /* sentinel */ }; @@ -4675,14 +4681,14 @@ PyMODINIT_FUNC PyInit__multiarray_umath(void) { PyObject *m, *d, *s; PyObject *c_api; - /* Initialize CPU features */ - if (npy_cpu_init() < 0) { - goto err; - } - /* Create the module and add the functions */ m = PyModule_Create(&moduledef); if (!m) { + return NULL; + } + + /* Initialize CPU features */ + if (npy_cpu_init() < 0) { goto err; } @@ -4934,5 +4940,6 @@ PyMODINIT_FUNC PyInit__multiarray_umath(void) { PyErr_SetString(PyExc_RuntimeError, "cannot load multiarray module."); } + Py_DECREF(m); return NULL; } diff --git a/numpy/core/src/multiarray/scalarapi.c b/numpy/core/src/multiarray/scalarapi.c index e409e9874..564352f1f 100644 --- a/numpy/core/src/multiarray/scalarapi.c +++ b/numpy/core/src/multiarray/scalarapi.c @@ -233,8 +233,12 @@ PyArray_CastScalarToCtype(PyObject *scalar, void *ctypeptr, PyArray_VectorUnaryFunc* castfunc; descr = PyArray_DescrFromScalar(scalar); + if (descr == NULL) { + return -1; + } castfunc = PyArray_GetCastFunc(descr, outcode->type_num); if (castfunc == NULL) { + Py_DECREF(descr); return -1; } if (PyTypeNum_ISEXTENDED(descr->type_num) || @@ -254,6 +258,7 @@ PyArray_CastScalarToCtype(PyObject *scalar, void *ctypeptr, NPY_ARRAY_CARRAY, NULL); if 
(aout == NULL) { Py_DECREF(ain); + Py_DECREF(descr); return -1; } castfunc(PyArray_DATA(ain), PyArray_DATA(aout), 1, ain, aout); diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 7c0710819..aaa694f34 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -1532,8 +1532,8 @@ TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const * */ /**begin repeat - * #func = rint, ceil, floor, trunc# - * #scalarf = npy_rint, npy_ceil, npy_floor, npy_trunc# + * #func = rint, floor, trunc# + * #scalarf = npy_rint, npy_floor, npy_trunc# */ /**begin repeat1 @@ -1568,8 +1568,8 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void */ /**begin repeat2 - * #func = rint, ceil, floor, trunc# - * #scalarf = npy_rint, npy_ceil, npy_floor, npy_trunc# + * #func = rint, floor, trunc# + * #scalarf = npy_rint, npy_floor, npy_trunc# */ NPY_NO_EXPORT NPY_GCC_OPT_3 void diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src index 0938cd050..081ca9957 100644 --- a/numpy/core/src/umath/loops.h.src +++ b/numpy/core/src/umath/loops.h.src @@ -187,7 +187,7 @@ NPY_NO_EXPORT void * #TYPE = FLOAT, DOUBLE# */ /**begin repeat1 - * #kind = sqrt, absolute, square, reciprocal# + * #kind = ceil, sqrt, absolute, square, reciprocal# */ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) @@ -228,7 +228,7 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@func@, /**end repeat**/ /**begin repeat - * #func = sin, cos# + * #func = sin, cos# */ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_@func@, @@ -275,7 +275,7 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, ( /**end repeat**/ /**begin repeat - * #func = rint, ceil, floor, trunc# + * #func = rint, floor, trunc# */ /**begin repeat1 diff --git a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src index 2d5917282..789733fb6 100644 --- a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src +++ b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src @@ -1,6 +1,8 @@ /*@targets ** $maxopt baseline - ** sse2 vsx2 neon + ** sse2 sse41 + ** vsx2 + ** neon asimd **/ /** * Force use SSE only on x86, even if AVX2 or AVX512F are enabled @@ -65,6 +67,9 @@ NPY_FINLINE double c_square_f64(double a) #define c_sqrt_f64 npy_sqrt #endif +#define c_ceil_f32 npy_ceilf +#define c_ceil_f64 npy_ceil + /******************************************************************************** ** Defining the SIMD kernels ********************************************************************************/ @@ -134,10 +139,10 @@ NPY_FINLINE double c_square_f64(double a) */ #if @VCHK@ /**begin repeat1 - * #kind = sqrt, absolute, square, reciprocal# - * #intr = sqrt, abs, square, recip# - * #repl_0w1 = 0, 0, 0, 1# - * #RECIP_WORKAROUND = 0, 0, 0, WORKAROUND_CLANG_RECIPROCAL_BUG# + * #kind = ceil, sqrt, absolute, square, reciprocal# + * #intr = ceil, sqrt, abs, square, recip# + * #repl_0w1 = 0, 0, 0, 0, 1# + * #RECIP_WORKAROUND = 0, 0, 0, 0, WORKAROUND_CLANG_RECIPROCAL_BUG# */ /**begin repeat2 * #STYPE = CONTIG, NCONTIG, CONTIG, NCONTIG# @@ -245,9 +250,9 @@ static void simd_@TYPE@_@kind@_@STYPE@_@DTYPE@ * #VCHK = NPY_SIMD, NPY_SIMD_F64# */ /**begin repeat1 - * #kind = sqrt, absolute, square, reciprocal# - * #intr = sqrt, abs, square, recip# - * #clear = 0, 1, 0, 0# + * #kind = ceil, sqrt, absolute, square, reciprocal# + * #intr = ceil, sqrt, abs, square, recip# + * #clear = 
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 0938cd050..081ca9957 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -187,7 +187,7 @@ NPY_NO_EXPORT void
  * #TYPE = FLOAT, DOUBLE#
  */
 /**begin repeat1
- * #kind = sqrt, absolute, square, reciprocal#
+ * #kind = ceil, sqrt, absolute, square, reciprocal#
  */
 NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
     (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
@@ -228,7 +228,7 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@func@,
 /**end repeat**/
 
 /**begin repeat
- * #func = sin, cos#
+ * #func = sin, cos#
  */
 NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_@func@,
@@ -275,7 +275,7 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, (
 /**end repeat**/
 
 /**begin repeat
- * #func = rint, ceil, floor, trunc#
+ * #func = rint, floor, trunc#
  */
 
 /**begin repeat1
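`ceil` drops out of the generated generic loops and reappears here among the CPU-dispatched declarations; the implementation follows in `loops_unary_fp.dispatch.c.src` below. The move has to stay bit-for-bit compatible with `npy_ceil`/`npy_ceilf`, including signed zeros and non-finite values. A consistency check one could run against such a build (a sketch, not part of the patch):

    import math
    import numpy as np

    # IEEE 754 edge cases a vectorized ceil must preserve:
    x = np.array([-0.5, -0.0, 1.5, np.inf, -np.inf, np.nan], dtype=np.float32)
    y = np.ceil(x)

    assert math.copysign(1, y[0]) == -1.0   # ceil(-0.5) is -0.0, not +0.0
    assert math.copysign(1, y[1]) == -1.0   # ceil(-0.0) keeps the sign
    assert y[2] == 2.0
    assert np.isinf(y[3]) and np.isinf(y[4])
    assert np.isnan(y[5])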
diff --git a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
index 2d5917282..789733fb6 100644
--- a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
+++ b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
@@ -1,6 +1,8 @@
 /*@targets
 ** $maxopt baseline
- ** sse2 vsx2 neon
+ ** sse2 sse41
+ ** vsx2
+ ** neon asimd
 **/
 /**
  * Force use SSE only on x86, even if AVX2 or AVX512F are enabled
@@ -65,6 +67,9 @@ NPY_FINLINE double c_square_f64(double a)
 #define c_sqrt_f64 npy_sqrt
 #endif
 
+#define c_ceil_f32 npy_ceilf
+#define c_ceil_f64 npy_ceil
+
 /********************************************************************************
 ** Defining the SIMD kernels
 ********************************************************************************/
@@ -134,10 +139,10 @@ NPY_FINLINE double c_square_f64(double a)
  */
 #if @VCHK@
 /**begin repeat1
- * #kind = sqrt, absolute, square, reciprocal#
- * #intr = sqrt, abs, square, recip#
- * #repl_0w1 = 0, 0, 0, 1#
- * #RECIP_WORKAROUND = 0, 0, 0, WORKAROUND_CLANG_RECIPROCAL_BUG#
+ * #kind = ceil, sqrt, absolute, square, reciprocal#
+ * #intr = ceil, sqrt, abs, square, recip#
+ * #repl_0w1 = 0, 0, 0, 0, 1#
+ * #RECIP_WORKAROUND = 0, 0, 0, 0, WORKAROUND_CLANG_RECIPROCAL_BUG#
  */
 /**begin repeat2
  * #STYPE = CONTIG, NCONTIG, CONTIG, NCONTIG#
@@ -245,9 +250,9 @@ static void simd_@TYPE@_@kind@_@STYPE@_@DTYPE@
  * #VCHK = NPY_SIMD, NPY_SIMD_F64#
  */
 /**begin repeat1
- * #kind = sqrt, absolute, square, reciprocal#
- * #intr = sqrt, abs, square, recip#
- * #clear = 0, 1, 0, 0#
+ * #kind = ceil, sqrt, absolute, square, reciprocal#
+ * #intr = ceil, sqrt, abs, square, recip#
+ * #clear = 0, 0, 1, 0, 0#
  */
 NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
 (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index d47be9a30..0e2c1ab8b 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -169,7 +169,7 @@ run_@func@_avx512_skx_@TYPE@(char **args, npy_intp const *dimensions, npy_intp c
  */
 
 /**begin repeat2
- * #func = rint, floor, ceil, trunc#
+ * #func = rint, floor, trunc#
  */
 
 #if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
@@ -850,12 +850,6 @@ fma_floor_@vsub@(@vtype@ x)
 }
 
 NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
-fma_ceil_@vsub@(@vtype@ x)
-{
-    return _mm256_round_@vsub@(x, _MM_FROUND_TO_POS_INF);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
 fma_trunc_@vsub@(@vtype@ x)
 {
     return _mm256_round_@vsub@(x, _MM_FROUND_TO_ZERO);
@@ -988,12 +982,6 @@ avx512_floor_@vsub@(@vtype@ x)
 }
 
 NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_ceil_@vsub@(@vtype@ x)
-{
-    return _mm512_roundscale_@vsub@(x, 0x0A);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
 avx512_trunc_@vsub@(@vtype@ x)
 {
     return _mm512_roundscale_@vsub@(x, 0x0B);
@@ -1327,8 +1315,8 @@ AVX512F_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *s
  */
 
 /**begin repeat1
- * #func = rint, ceil, floor, trunc#
- * #vectorf = rint, ceil, floor, trunc#
+ * #func = rint, floor, trunc#
+ * #vectorf = rint, floor, trunc#
  */
 
 #if defined @CHK@
@@ -1398,8 +1386,8 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
  */
 
 /**begin repeat1
- * #func = rint, ceil, floor, trunc#
- * #vectorf = rint, ceil, floor, trunc#
+ * #func = rint, floor, trunc#
+ * #vectorf = rint, floor, trunc#
  */
 
 #if defined @CHK@
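The widened `@targets` lists above make the build emit several variants of each kernel (for example SSE2 and SSE41 on x86, NEON and ASIMD on ARM) and select one when the module is imported. Which variants were built and which the host CPU can actually run is visible from Python; a hedged sketch using the introspection attributes that `numpy.core._multiarray_umath` exposes in this generation of NumPy:

    from numpy.core._multiarray_umath import (
        __cpu_baseline__,   # features every code path may assume
        __cpu_dispatch__,   # features compiled as runtime-selected variants
        __cpu_features__,   # feature name -> supported on this CPU
    )

    print("baseline:", __cpu_baseline__)
    print("usable:", [f for f in __cpu_dispatch__ if __cpu_features__[f]])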
diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c
index 237af81b2..186f18a62 100644
--- a/numpy/core/src/umath/ufunc_object.c
+++ b/numpy/core/src/umath/ufunc_object.c
@@ -4926,65 +4926,6 @@ fail:
 
 /*
- * TODO: The implementation below can be replaced with PyVectorcall_Call
- * when available (should be Python 3.8+).
- */
-static PyObject *
-ufunc_generic_call(
-        PyUFuncObject *ufunc, PyObject *args, PyObject *kwds)
-{
-    Py_ssize_t len_args = PyTuple_GET_SIZE(args);
-    /*
-     * Wrapper for tp_call to tp_fastcall, to support both on older versions
-     * of Python. (and generally simplifying support of both versions in the
-     * same codebase.
-     */
-    if (kwds == NULL) {
-        return ufunc_generic_fastcall(ufunc,
-                PySequence_Fast_ITEMS(args), len_args, NULL, NPY_FALSE);
-    }
-
-    PyObject *new_args[NPY_MAXARGS];
-    Py_ssize_t len_kwds = PyDict_Size(kwds);
-
-    if (NPY_UNLIKELY(len_args + len_kwds > NPY_MAXARGS)) {
-        /*
-         * We do not have enough scratch-space, so we have to abort;
-         * In practice this error should not be seen by users.
-         */
-        PyErr_Format(PyExc_ValueError,
-                "%s() takes from %d to %d positional arguments but "
-                "%zd were given",
-                ufunc_get_name_cstr(ufunc) , ufunc->nin, ufunc->nargs, len_args);
-        return NULL;
-    }
-
-    /* Copy args into the scratch space */
-    for (Py_ssize_t i = 0; i < len_args; i++) {
-        new_args[i] = PyTuple_GET_ITEM(args, i);
-    }
-
-    PyObject *kwnames = PyTuple_New(len_kwds);
-
-    PyObject *key, *value;
-    Py_ssize_t pos = 0;
-    Py_ssize_t i = 0;
-    while (PyDict_Next(kwds, &pos, &key, &value)) {
-        Py_INCREF(key);
-        PyTuple_SET_ITEM(kwnames, i, key);
-        new_args[i + len_args] = value;
-        i++;
-    }
-
-    PyObject *res = ufunc_generic_fastcall(ufunc,
-            new_args, len_args, kwnames, NPY_FALSE);
-    Py_DECREF(kwnames);
-    return res;
-}
-
-
-#if PY_VERSION_HEX >= 0x03080000
-/*
  * Implement vectorcallfunc which should be defined with Python 3.8+.
  * In principle this could be backported, but the speed gain seems moderate
  * since ufunc calls often do not have keyword arguments and always have
@@ -5001,7 +4942,6 @@ ufunc_generic_vectorcall(PyObject *ufunc,
     return ufunc_generic_fastcall((PyUFuncObject *)ufunc,
             args, PyVectorcall_NARGS(len_args), kwnames, NPY_FALSE);
 }
-#endif /* PY_VERSION_HEX >= 0x03080000 */
 
 
 NPY_NO_EXPORT PyObject *
@@ -5178,11 +5118,7 @@ PyUFunc_FromFuncAndDataAndSignatureAndIdentity(PyUFuncGenericFunction *func, voi
     ufunc->core_dim_flags = NULL;
     ufunc->userloops = NULL;
     ufunc->ptr = NULL;
-#if PY_VERSION_HEX >= 0x03080000
     ufunc->vectorcall = &ufunc_generic_vectorcall;
-#else
-    ufunc->reserved2 = NULL;
-#endif
     ufunc->reserved1 = 0;
     ufunc->iter_flags = 0;
@@ -6437,19 +6373,15 @@ NPY_NO_EXPORT PyTypeObject PyUFunc_Type = {
     .tp_basicsize = sizeof(PyUFuncObject),
     .tp_dealloc = (destructor)ufunc_dealloc,
     .tp_repr = (reprfunc)ufunc_repr,
-    .tp_call = (ternaryfunc)ufunc_generic_call,
+    .tp_call = &PyVectorcall_Call,
     .tp_str = (reprfunc)ufunc_repr,
     .tp_flags = Py_TPFLAGS_DEFAULT |
-#if PY_VERSION_HEX >= 0x03080000
                 _Py_TPFLAGS_HAVE_VECTORCALL |
-#endif
                 Py_TPFLAGS_HAVE_GC,
     .tp_traverse = (traverseproc)ufunc_traverse,
     .tp_methods = ufunc_methods,
     .tp_getset = ufunc_getset,
-#if PY_VERSION_HEX >= 0x03080000
     .tp_vectorcall_offset = offsetof(PyUFuncObject, vectorcall),
-#endif
 };
 
 /* End of code for ufunc objects */
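With `ufunc_generic_call` deleted, `tp_call` becomes CPython's stock `PyVectorcall_Call`, which forwards a classic `obj(*args, **kwargs)` invocation to the function stored at `tp_vectorcall_offset`. The removed scratch-space copy (and its NPY_MAXARGS ceiling) existed only to translate tuple-and-dict calls into the fastcall convention, and CPython now performs that translation itself. From Python nothing changes except overhead; both call shapes below reach the same fastcall entry point:

    import numpy as np

    a = np.arange(10.0)
    out = np.empty_like(a)

    np.add(a, a)           # positional-only call
    np.add(a, a, out=out)  # keyword call, no dict repacking anymore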
diff --git a/numpy/core/tests/test_dlpack.py b/numpy/core/tests/test_dlpack.py
new file mode 100644
index 000000000..f848b2008
--- /dev/null
+++ b/numpy/core/tests/test_dlpack.py
@@ -0,0 +1,109 @@
+import sys
+import pytest
+
+import numpy as np
+from numpy.testing import assert_array_equal, IS_PYPY
+
+
+class TestDLPack:
+    @pytest.mark.skipif(IS_PYPY, reason="PyPy can't get refcounts.")
+    def test_dunder_dlpack_refcount(self):
+        x = np.arange(5)
+        y = x.__dlpack__()
+        assert sys.getrefcount(x) == 3
+        del y
+        assert sys.getrefcount(x) == 2
+
+    def test_dunder_dlpack_stream(self):
+        x = np.arange(5)
+        x.__dlpack__(stream=None)
+
+        with pytest.raises(RuntimeError):
+            x.__dlpack__(stream=1)
+
+    def test_strides_not_multiple_of_itemsize(self):
+        dt = np.dtype([('int', np.int32), ('char', np.int8)])
+        y = np.zeros((5,), dtype=dt)
+        z = y['int']
+
+        with pytest.raises(RuntimeError):
+            np._from_dlpack(z)
+
+    @pytest.mark.skipif(IS_PYPY, reason="PyPy can't get refcounts.")
+    def test_from_dlpack_refcount(self):
+        x = np.arange(5)
+        y = np._from_dlpack(x)
+        assert sys.getrefcount(x) == 3
+        del y
+        assert sys.getrefcount(x) == 2
+
+    @pytest.mark.parametrize("dtype", [
+        np.int8, np.int16, np.int32, np.int64,
+        np.uint8, np.uint16, np.uint32, np.uint64,
+        np.float16, np.float32, np.float64,
+        np.complex64, np.complex128
+    ])
+    def test_dtype_passthrough(self, dtype):
+        x = np.arange(5, dtype=dtype)
+        y = np._from_dlpack(x)
+
+        assert y.dtype == x.dtype
+        assert_array_equal(x, y)
+
+    def test_invalid_dtype(self):
+        x = np.asarray(np.datetime64('2021-05-27'))
+
+        with pytest.raises(TypeError):
+            np._from_dlpack(x)
+
+    def test_invalid_byte_swapping(self):
+        dt = np.dtype('=i8').newbyteorder()
+        x = np.arange(5, dtype=dt)
+
+        with pytest.raises(TypeError):
+            np._from_dlpack(x)
+
+    def test_non_contiguous(self):
+        x = np.arange(25).reshape((5, 5))
+
+        y1 = x[0]
+        assert_array_equal(y1, np._from_dlpack(y1))
+
+        y2 = x[:, 0]
+        assert_array_equal(y2, np._from_dlpack(y2))
+
+        y3 = x[1, :]
+        assert_array_equal(y3, np._from_dlpack(y3))
+
+        y4 = x[1]
+        assert_array_equal(y4, np._from_dlpack(y4))
+
+        y5 = np.diagonal(x).copy()
+        assert_array_equal(y5, np._from_dlpack(y5))
+
+    @pytest.mark.parametrize("ndim", range(33))
+    def test_higher_dims(self, ndim):
+        shape = (1,) * ndim
+        x = np.zeros(shape, dtype=np.float64)
+
+        assert shape == np._from_dlpack(x).shape
+
+    def test_dlpack_device(self):
+        x = np.arange(5)
+        assert x.__dlpack_device__() == (1, 0)
+        assert np._from_dlpack(x).__dlpack_device__() == (1, 0)
+
+    def dlpack_deleter_exception(self):
+        x = np.arange(5)
+        _ = x.__dlpack__()
+        raise RuntimeError
+
+    def test_dlpack_destructor_exception(self):
+        with pytest.raises(RuntimeError):
+            self.dlpack_deleter_exception()
+
+    def test_readonly(self):
+        x = np.arange(5)
+        x.flags.writeable = False
+        with pytest.raises(TypeError):
+            x.__dlpack__()
diff --git a/numpy/core/tests/test_mem_policy.py b/numpy/core/tests/test_mem_policy.py
index 7fec8897f..abf340062 100644
--- a/numpy/core/tests/test_mem_policy.py
+++ b/numpy/core/tests/test_mem_policy.py
@@ -179,6 +179,7 @@ def get_module(tmp_path):
                  };
                  static PyDataMem_Handler secret_data_handler = {
                      "secret_data_allocator",
+                     1,
                      {
                          &secret_data_handler_ctx,  /* ctx */
                          shift_alloc,               /* malloc */
@@ -212,17 +213,22 @@ def get_module(tmp_path):
 
 def test_set_policy(get_module):
 
     get_handler_name = np.core.multiarray.get_handler_name
+    get_handler_version = np.core.multiarray.get_handler_version
     orig_policy_name = get_handler_name()
 
     a = np.arange(10).reshape((2, 5))  # a doesn't own its own data
     assert get_handler_name(a) is None
+    assert get_handler_version(a) is None
     assert get_handler_name(a.base) == orig_policy_name
+    assert get_handler_version(a.base) == 1
 
     orig_policy = get_module.set_secret_data_policy()
 
     b = np.arange(10).reshape((2, 5))  # b doesn't own its own data
     assert get_handler_name(b) is None
+    assert get_handler_version(b) is None
     assert get_handler_name(b.base) == 'secret_data_allocator'
+    assert get_handler_version(b.base) == 1
 
     if orig_policy_name == 'default_allocator':
         get_module.set_old_policy(None)  # tests PyDataMem_SetHandler(NULL)
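The new integer at the top of the handler struct (the `1,`) is a version field: it lets `PyDataMem_Handler` grow new members later without breaking compiled consumers, and `get_handler_version` exposes it next to `get_handler_name`. A sketch of the Python-level pairing, assuming a stock build where the default allocator is active:

    import numpy as np

    get_name = np.core.multiarray.get_handler_name
    get_version = np.core.multiarray.get_handler_version

    a = np.arange(10)         # owns its data
    print(get_name(a))        # 'default_allocator' on a stock build
    print(get_version(a))     # 1, the current struct version

    v = a[1:]                 # a view owns nothing
    assert get_name(v) is None and get_version(v) is None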
diff --git a/numpy/distutils/ccompiler_opt.py b/numpy/distutils/ccompiler_opt.py
index d7df386fe..85bf5d0d1 100644
--- a/numpy/distutils/ccompiler_opt.py
+++ b/numpy/distutils/ccompiler_opt.py
@@ -196,7 +196,6 @@ class _Config:
         native = '-march=native',
         opt = '-O3',
         werror = '-Werror',
-        cxx = '-std=c++11',
     ),
     clang = dict(
         native = '-march=native',
@@ -207,25 +206,21 @@ class _Config:
         # "unused arguments" warnings.
         # see https://github.com/numpy/numpy/issues/19624
         werror = '-Werror=switch -Werror',
-        cxx = '-std=c++11',
     ),
     icc = dict(
         native = '-xHost',
         opt = '-O3',
         werror = '-Werror',
-        cxx = '-std=c++11',
     ),
     iccw = dict(
         native = '/QxHost',
         opt = '/O3',
         werror = '/Werror',
-        cxx = '-std=c++11',
     ),
     msvc = dict(
         native = None,
         opt = '/O2',
         werror = '/WX',
-        cxx = '-std=c++11',
     )
 )
 conf_min_features = dict(
diff --git a/numpy/distutils/mingw32ccompiler.py b/numpy/distutils/mingw32ccompiler.py
index 82d296434..fbe3655c9 100644
--- a/numpy/distutils/mingw32ccompiler.py
+++ b/numpy/distutils/mingw32ccompiler.py
@@ -24,7 +24,6 @@ from numpy.distutils import log
 # 3. Force windows to use g77
 
 import distutils.cygwinccompiler
-from distutils.version import StrictVersion
 from distutils.unixccompiler import UnixCCompiler
 from distutils.msvccompiler import get_build_version as get_build_msvc_version
 from distutils.errors import UnknownFileError
@@ -62,35 +61,6 @@ class Mingw32CCompiler(distutils.cygwinccompiler.CygwinCCompiler):
         distutils.cygwinccompiler.CygwinCCompiler.__init__ (self, verbose,
                                                             dry_run, force)
 
-        # we need to support 3.2 which doesn't match the standard
-        # get_versions methods regex
-        if self.gcc_version is None:
-            try:
-                out_string = subprocess.check_output(['gcc', '-dumpversion'])
-            except (OSError, CalledProcessError):
-                out_string = ""  # ignore failures to match old behavior
-            result = re.search(r'(\d+\.\d+)', out_string)
-            if result:
-                self.gcc_version = StrictVersion(result.group(1))
-
-        # A real mingw32 doesn't need to specify a different entry point,
-        # but cygwin 2.91.57 in no-cygwin-mode needs it.
-        if self.gcc_version <= "2.91.57":
-            entry_point = '--entry _DllMain@12'
-        else:
-            entry_point = ''
-
-        if self.linker_dll == 'dllwrap':
-            # Commented out '--driver-name g++' part that fixes weird
-            # g++.exe: g++: No such file or directory
-            # error (mingw 1.0 in Enthon24 tree, gcc-3.4.5).
-            # If the --driver-name part is required for some environment
-            # then make the inclusion of this part specific to that
-            # environment.
-            self.linker = 'dllwrap'  # --driver-name g++'
-        elif self.linker_dll == 'gcc':
-            self.linker = 'g++'
-
         # **changes: eric jones 4/11/01
         # 1. Check for import library on Windows.  Build if it doesn't exist.
 
@@ -113,42 +83,18 @@ class Mingw32CCompiler(distutils.cygwinccompiler.CygwinCCompiler):
         # kind of bad consequences, like using Py_ModuleInit4 instead of
         # Py_ModuleInit4_64, etc... So we add it here
         if get_build_architecture() == 'AMD64':
-            if self.gcc_version < "4.0":
-                self.set_executables(
-                    compiler='gcc -g -DDEBUG -DMS_WIN64 -mno-cygwin -O0 -Wall',
-                    compiler_so='gcc -g -DDEBUG -DMS_WIN64 -mno-cygwin -O0'
-                                ' -Wall -Wstrict-prototypes',
-                    linker_exe='gcc -g -mno-cygwin',
-                    linker_so='gcc -g -mno-cygwin -shared')
-            else:
-                # gcc-4 series releases do not support -mno-cygwin option
-                self.set_executables(
-                    compiler='gcc -g -DDEBUG -DMS_WIN64 -O0 -Wall',
-                    compiler_so='gcc -g -DDEBUG -DMS_WIN64 -O0 -Wall -Wstrict-prototypes',
-                    linker_exe='gcc -g',
-                    linker_so='gcc -g -shared')
+            self.set_executables(
+                compiler='gcc -g -DDEBUG -DMS_WIN64 -O0 -Wall',
+                compiler_so='gcc -g -DDEBUG -DMS_WIN64 -O0 -Wall '
+                            '-Wstrict-prototypes',
+                linker_exe='gcc -g',
+                linker_so='gcc -g -shared')
         else:
-            if self.gcc_version <= "3.0.0":
-                self.set_executables(
-                    compiler='gcc -mno-cygwin -O2 -w',
-                    compiler_so='gcc -mno-cygwin -mdll -O2 -w'
-                                ' -Wstrict-prototypes',
-                    linker_exe='g++ -mno-cygwin',
-                    linker_so='%s -mno-cygwin -mdll -static %s' %
-                              (self.linker, entry_point))
-            elif self.gcc_version < "4.0":
-                self.set_executables(
-                    compiler='gcc -mno-cygwin -O2 -Wall',
-                    compiler_so='gcc -mno-cygwin -O2 -Wall'
-                                ' -Wstrict-prototypes',
-                    linker_exe='g++ -mno-cygwin',
-                    linker_so='g++ -mno-cygwin -shared')
-            else:
-                # gcc-4 series releases do not support -mno-cygwin option
-                self.set_executables(compiler='gcc -O2 -Wall',
-                                     compiler_so='gcc -O2 -Wall -Wstrict-prototypes',
-                                     linker_exe='g++ ',
-                                     linker_so='g++ -shared')
+            self.set_executables(
+                compiler='gcc -O2 -Wall',
+                compiler_so='gcc -O2 -Wall -Wstrict-prototypes',
+                linker_exe='g++ ',
+                linker_so='g++ -shared')
         # added for python2.3 support
         # we can't pass it through set_executables because pre 2.2 would fail
         self.compiler_cxx = ['g++']
@@ -198,10 +144,7 @@ class Mingw32CCompiler(distutils.cygwinccompiler.CygwinCCompiler):
                 extra_postargs, build_temp, target_lang)
-        if self.gcc_version < "3.0.0":
-            func = distutils.cygwinccompiler.CygwinCCompiler.link
-        else:
-            func = UnixCCompiler.link
+        func = UnixCCompiler.link
         func(*args[:func.__code__.co_argcount])
         return
diff --git a/numpy/distutils/unixccompiler.py b/numpy/distutils/unixccompiler.py
index 733a9fc50..4884960fd 100644
--- a/numpy/distutils/unixccompiler.py
+++ b/numpy/distutils/unixccompiler.py
@@ -5,6 +5,7 @@ unixccompiler - can handle very long argument lists for ar.
 import os
 import sys
 import subprocess
+import shlex
 
 from distutils.errors import CompileError, DistutilsExecError, LibError
 from distutils.unixccompiler import UnixCCompiler
@@ -30,15 +31,15 @@ def UnixCCompiler__compile(self, obj, src, ext, cc_args, extra_postargs, pp_opts
     if 'OPT' in os.environ:
         # XXX who uses this?
         from sysconfig import get_config_vars
-        opt = " ".join(os.environ['OPT'].split())
-        gcv_opt = " ".join(get_config_vars('OPT')[0].split())
-        ccomp_s = " ".join(self.compiler_so)
+        opt = shlex.join(shlex.split(os.environ['OPT']))
+        gcv_opt = shlex.join(shlex.split(get_config_vars('OPT')[0]))
+        ccomp_s = shlex.join(self.compiler_so)
         if opt not in ccomp_s:
             ccomp_s = ccomp_s.replace(gcv_opt, opt)
-            self.compiler_so = ccomp_s.split()
-        llink_s = " ".join(self.linker_so)
+            self.compiler_so = shlex.split(ccomp_s)
+        llink_s = shlex.join(self.linker_so)
         if opt not in llink_s:
-            self.linker_so = llink_s.split() + opt.split()
+            self.linker_so = self.linker_so + shlex.split(opt)
 
     display = '%s: %s' % (os.path.basename(self.compiler_so[0]),
                           src)
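`shlex.join` is new in Python 3.8, which NumPy can now require, and together with `shlex.split` it round-trips arguments that contain spaces or quotes; the old `" ".join(...)`/`.split()` pair silently broke such flags apart. A small illustration:

    import shlex

    flags = ['-I/opt/my include dir', '-O2']

    # Plain join/split loses the grouping of the first flag:
    assert " ".join(flags).split() == ['-I/opt/my', 'include', 'dir', '-O2']

    # shlex quotes on the way out and honours the quotes on the way back:
    assert shlex.split(shlex.join(flags)) == flags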
diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py
index 4d57acdb2..3c9983edf 100644
--- a/numpy/lib/function_base.py
+++ b/numpy/lib/function_base.py
@@ -67,7 +67,7 @@ __all__ = [
 # fix_gamma : Callable
 #     A function used for discret methods to force the index to a specific value.
 _QuantileInterpolation = dict(
-    # --- HYNDMAN and FAN methods
+    # --- HYNDMAN AND FAN METHODS
     # Discrete methods
     inverted_cdf=dict(
         get_virtual_index=lambda n, quantiles: _inverted_cdf(n, quantiles),
@@ -102,10 +102,12 @@ _QuantileInterpolation = dict(
         _compute_virtual_index(n, quantiles, 0, 0),
         fix_gamma=lambda gamma, _: gamma,
     ),
-    # Default value
+    # Default method.
+    # To avoid some rounding issues, `(n-1) * quantiles` is preferred to
+    # `_compute_virtual_index(n, quantiles, 1, 1)`.
+    # They are mathematically equivalent.
     linear=dict(
-        get_virtual_index=lambda n, quantiles:
-        _compute_virtual_index(n, quantiles, 1, 1),
+        get_virtual_index=lambda n, quantiles: (n - 1) * quantiles,
         fix_gamma=lambda gamma, _: gamma,
     ),
     median_unbiased=dict(
@@ -118,7 +120,7 @@ _QuantileInterpolation = dict(
         _compute_virtual_index(n, quantiles, 3 / 8.0, 3 / 8.0),
         fix_gamma=lambda gamma, _: gamma,
     ),
-    # --- OTHER METHODS fixme add deprecated ?
+    # --- OTHER METHODS
     lower=dict(
         get_virtual_index=lambda n, quantiles: np.floor(
             (n - 1) * quantiles).astype(np.intp),
@@ -3901,7 +3903,7 @@ def percentile(a,
         * (NPY 2): 'higher',
         * (NPY 3): 'midpoint'
         * (NPY 4): 'nearest'
-        * (NPY 5): 'linear', aliased with 'inclusive' (default)
+        * (NPY 5): 'linear'
 
         New options:
 
@@ -3911,7 +3913,7 @@ def percentile(a,
         * (H&F 4): 'interpolated_inverted_cdf'
         * (H&F 5): 'hazen'
         * (H&F 6): 'weibull'
-        * (H&F 7): 'inclusive', aliased with 'linear' (default)
+        * (H&F 7): 'linear' (default)
         * (H&F 8): 'median_unbiased'
        * (H&F 9): 'normal_unbiased'
 
@@ -4007,8 +4009,7 @@ def percentile(a,
         * alpha = 0
         * beta = 0
 
-    inclusive:
-        Default method, aliased with "linear".
+    linear:
         method 7 of H&F [1]_.
         This method give continuous results using:
         * alpha = 1
@@ -4164,7 +4165,7 @@ def quantile(a,
         * (NPY 2): 'higher',
         * (NPY 3): 'midpoint'
         * (NPY 4): 'nearest'
-        * (NPY 5): 'linear', aliased with 'inclusive' (default)
+        * (NPY 5): 'linear'
 
         New options:
 
@@ -4174,7 +4175,7 @@ def quantile(a,
         * (H&F 4): 'interpolated_inverted_cdf'
         * (H&F 5): 'hazen'
         * (H&F 6): 'weibull'
-        * (H&F 7): 'inclusive', aliased with 'linear' (default)
+        * (H&F 7): 'linear' (default)
         * (H&F 8): 'median_unbiased'
         * (H&F 9): 'normal_unbiased'
 
@@ -4261,8 +4262,7 @@ def quantile(a,
         * alpha = 0
         * beta = 0
 
-    inclusive:
-        Default method, aliased with "linear".
+    linear:
         method 7 of H&F [1]_.
         This method give continuous results using:
         * alpha = 1
diff --git a/numpy/lib/function_base.pyi b/numpy/lib/function_base.pyi
index 4bbd873a3..82c625fed 100644
--- a/numpy/lib/function_base.pyi
+++ b/numpy/lib/function_base.pyi
@@ -514,7 +514,6 @@ _InterpolationKind = L[
     "higher",
     "midpoint",
     "nearest",
-    "inclusive",
 ]
 
 @overload
diff --git a/numpy/lib/nanfunctions.py b/numpy/lib/nanfunctions.py
index 9fab77f45..7e953be03 100644
--- a/numpy/lib/nanfunctions.py
+++ b/numpy/lib/nanfunctions.py
@@ -1277,7 +1277,7 @@ def nanpercentile(
         * (NPY 2): 'higher',
         * (NPY 3): 'midpoint'
         * (NPY 4): 'nearest'
-        * (NPY 5): 'linear', aliased with 'inclusive' (default)
+        * (NPY 5): 'linear' (default)
 
         New options:
 
@@ -1287,7 +1287,7 @@ def nanpercentile(
         * (H&F 4): 'interpolated_inverted_cdf'
         * (H&F 5): 'hazen'
         * (H&F 6): 'weibull'
-        * (H&F 7): 'inclusive', aliased with 'linear' (default)
+        * (H&F 7): 'linear' (default)
         * (H&F 8): 'median_unbiased'
         * (H&F 9): 'normal_unbiased'
 
@@ -1418,7 +1418,7 @@ def nanquantile(
         * (NPY 2): 'higher',
         * (NPY 3): 'midpoint'
         * (NPY 4): 'nearest'
-        * (NPY 5): 'linear', aliased with 'inclusive' (default)
+        * (NPY 5): 'linear' (default)
 
         New options:
 
@@ -1428,7 +1428,7 @@ def nanquantile(
         * (H&F 4): 'interpolated_inverted_cdf'
         * (H&F 5): 'hazen'
         * (H&F 6): 'weibull'
-        * (H&F 7): 'inclusive', aliased with 'linear' (default)
+        * (H&F 7): 'linear' (default)
         * (H&F 8): 'median_unbiased'
         * (H&F 9): 'normal_unbiased'
diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py
index d5fa012f1..1c274afae 100644
--- a/numpy/lib/tests/test_function_base.py
+++ b/numpy/lib/tests/test_function_base.py
@@ -3356,6 +3356,14 @@ class TestPercentile:
 class TestQuantile:
     # most of this is already tested by TestPercentile
 
+    def test_max_ulp(self):
+        x = [0.0, 0.2, 0.4]
+        a = np.quantile(x, 0.45)
+        # The default linear method would result in 0 + 0.2 * (0.45/2) = 0.18.
+        # 0.18 is not exactly representable and the formula leads to a 1 ULP
+        # different result. Ensure it is this exact within 1 ULP, see gh-20331.
+        np.testing.assert_array_max_ulp(a, 0.18, maxulp=1)
+
     def test_basic(self):
         x = np.arange(8) * 0.5
         assert_equal(np.quantile(x, 0), 0.)
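With alpha = beta = 1 the general Hyndman and Fan virtual index, q*n + alpha + q*(1 - alpha - beta) - 1, reduces algebraically to q*(n - 1), so the new lambda computes the same index with one fewer floating-point operation; that extra rounding step is what produced the 1 ULP drift pinned down by `test_max_ulp` above. The test case, spelled out:

    import numpy as np

    x = [0.0, 0.2, 0.4]
    q = 0.45
    n = len(x)

    # 'linear' (H&F 7): virtual index = q * (n - 1) = 0.9, i.e. interpolate
    # 90% of the way from x[0] to x[1]: 0.0 + 0.9 * (0.2 - 0.0) = 0.18.
    np.testing.assert_array_max_ulp(np.quantile(x, q), 0.18, maxulp=1)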
diff --git a/numpy/random/_generator.pyx b/numpy/random/_generator.pyx
index 5347ea125..7087b6e1d 100644
--- a/numpy/random/_generator.pyx
+++ b/numpy/random/_generator.pyx
@@ -3107,7 +3107,7 @@ cdef class Generator:
         `a` > 1.
 
         The Zipf distribution (also known as the zeta distribution) is a
-        continuous probability distribution that satisfies Zipf's law: the
+        discrete probability distribution that satisfies Zipf's law: the
         frequency of an item is inversely proportional to its rank in a
         frequency table.
 
@@ -3135,9 +3135,10 @@ cdef class Generator:
         -----
         The probability density for the Zipf distribution is
 
-        .. math:: p(x) = \\frac{x^{-a}}{\\zeta(a)},
+        .. math:: p(k) = \\frac{k^{-a}}{\\zeta(a)},
 
-        where :math:`\\zeta` is the Riemann Zeta function.
+        for integers :math:`k \geq 1`, where :math:`\\zeta` is the Riemann Zeta
+        function.
 
         It is named for the American linguist George Kingsley Zipf, who noted
         that the frequency of any word in a sample of a language is inversely
@@ -3153,22 +3154,29 @@ cdef class Generator:
         --------
         Draw samples from the distribution:
 
-        >>> a = 2. # parameter
-        >>> s = np.random.default_rng().zipf(a, 1000)
+        >>> a = 4.0
+        >>> n = 20000
+        >>> s = np.random.default_rng().zipf(a, size=n)
 
         Display the histogram of the samples, along with
-        the probability density function:
+        the expected histogram based on the probability
+        density function:
 
         >>> import matplotlib.pyplot as plt
-        >>> from scipy import special  # doctest: +SKIP
+        >>> from scipy.special import zeta  # doctest: +SKIP
+
+        `bincount` provides a fast histogram for small integers.
 
-        Truncate s values at 50 so plot is interesting:
+        >>> count = np.bincount(s)
+        >>> k = np.arange(1, s.max() + 1)
 
-        >>> count, bins, ignored = plt.hist(s[s<50],
-        ...         50, density=True)
-        >>> x = np.arange(1., 50.)
-        >>> y = x**(-a) / special.zetac(a)  # doctest: +SKIP
-        >>> plt.plot(x, y/max(y), linewidth=2, color='r')  # doctest: +SKIP
+        >>> plt.bar(k, count[1:], alpha=0.5, label='sample count')
+        >>> plt.plot(k, n*(k**-a)/zeta(a), 'k.-', alpha=0.5,
+        ...          label='expected count')  # doctest: +SKIP
+        >>> plt.semilogy()
+        >>> plt.grid(alpha=0.4)
+        >>> plt.legend()
+        >>> plt.title(f'Zipf sample, a={a}, size={n}')
         >>> plt.show()
 
         """
diff --git a/numpy/random/mtrand.pyx b/numpy/random/mtrand.pyx
index 81a526ab4..3e13503d0 100644
--- a/numpy/random/mtrand.pyx
+++ b/numpy/random/mtrand.pyx
@@ -3609,7 +3609,7 @@ cdef class RandomState:
         `a` > 1.
 
         The Zipf distribution (also known as the zeta distribution) is a
-        continuous probability distribution that satisfies Zipf's law: the
+        discrete probability distribution that satisfies Zipf's law: the
         frequency of an item is inversely proportional to its rank in a
         frequency table.
 
@@ -3642,9 +3642,10 @@ cdef class RandomState:
         -----
         The probability density for the Zipf distribution is
 
-        .. math:: p(x) = \\frac{x^{-a}}{\\zeta(a)},
+        .. math:: p(k) = \\frac{k^{-a}}{\\zeta(a)},
 
-        where :math:`\\zeta` is the Riemann Zeta function.
+        for integers :math:`k \geq 1`, where :math:`\\zeta` is the Riemann Zeta
+        function.
 
         It is named for the American linguist George Kingsley Zipf, who noted
         that the frequency of any word in a sample of a language is inversely
@@ -3660,21 +3661,29 @@ cdef class RandomState:
         --------
         Draw samples from the distribution:
 
-        >>> a = 2. # parameter
-        >>> s = np.random.zipf(a, 1000)
+        >>> a = 4.0
+        >>> n = 20000
+        >>> s = np.random.zipf(a, n)
 
         Display the histogram of the samples, along with
-        the probability density function:
+        the expected histogram based on the probability
+        density function:
 
         >>> import matplotlib.pyplot as plt
-        >>> from scipy import special  # doctest: +SKIP
+        >>> from scipy.special import zeta  # doctest: +SKIP
+
+        `bincount` provides a fast histogram for small integers.
 
-        Truncate s values at 50 so plot is interesting:
+        >>> count = np.bincount(s)
+        >>> k = np.arange(1, s.max() + 1)
 
-        >>> count, bins, ignored = plt.hist(s[s<50], 50, density=True)
-        >>> x = np.arange(1., 50.)
-        >>> y = x**(-a) / special.zetac(a)  # doctest: +SKIP
-        >>> plt.plot(x, y/max(y), linewidth=2, color='r')  # doctest: +SKIP
+        >>> plt.bar(k, count[1:], alpha=0.5, label='sample count')
+        >>> plt.plot(k, n*(k**-a)/zeta(a), 'k.-', alpha=0.5,
+        ...          label='expected count')  # doctest: +SKIP
+        >>> plt.semilogy()
+        >>> plt.grid(alpha=0.4)
+        >>> plt.legend()
+        >>> plt.title(f'Zipf sample, a={a}, size={n}')
         >>> plt.show()
 
         """
diff --git a/numpy/typing/_generic_alias.py b/numpy/typing/_generic_alias.py
index 932f12dd0..1eb2c8c05 100644
--- a/numpy/typing/_generic_alias.py
+++ b/numpy/typing/_generic_alias.py
@@ -185,6 +185,8 @@ class _GenericAlias:
         "__mro_entries__",
         "__reduce__",
         "__reduce_ex__",
+        "__copy__",
+        "__deepcopy__",
     })
 
     def __getattribute__(self, name: str) -> Any:
diff --git a/numpy/typing/tests/test_generic_alias.py b/numpy/typing/tests/test_generic_alias.py
index 3021d9859..39343420b 100644
--- a/numpy/typing/tests/test_generic_alias.py
+++ b/numpy/typing/tests/test_generic_alias.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import sys
+import copy
 import types
 import pickle
 import weakref
@@ -80,6 +81,21 @@ class TestGenericAlias:
             value_ref = func(NDArray_ref)
             assert value == value_ref
 
+    @pytest.mark.parametrize("name,func", [
+        ("__copy__", lambda n: n == copy.copy(n)),
+        ("__deepcopy__", lambda n: n == copy.deepcopy(n)),
+    ])
+    def test_copy(self, name: str, func: FuncType) -> None:
+        value = func(NDArray)
+
+        # xref bpo-45167
+        GE_398 = (
+            sys.version_info[:2] == (3, 9) and sys.version_info >= (3, 9, 8)
+        )
+        if GE_398 or sys.version_info >= (3, 10, 1):
+            value_ref = func(NDArray_ref)
+            assert value == value_ref
+
     def test_weakref(self) -> None:
         """Test ``__weakref__``."""
         value = weakref.ref(NDArray)()
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -30,8 +30,7 @@ import re
 
 # Python supported version checks. Keep right after stdlib imports to ensure we
 # get a sensible error for older Python versions
-# This needs to be changed to 3.8 for 1.22 release, but 3.7 is needed for LGTM.
-if sys.version_info[:2] < (3, 7):
+if sys.version_info[:2] < (3, 8):
     raise RuntimeError("Python version >= 3.8 required.")
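Adding `__copy__` and `__deepcopy__` to the passthrough set brings the `_GenericAlias` backport in line with `types.GenericAlias` after bpo-45167 (fixed upstream in CPython 3.9.8 and 3.10.1): copying an alias yields an equivalent alias, as `test_copy` above exercises. A quick check, assuming a NumPy build with this patch on a new enough CPython:

    import copy
    import numpy as np
    import numpy.typing as npt

    alias = npt.NDArray[np.float64]

    # Shallow and deep copies preserve the parameterized alias.
    assert copy.copy(alias) == alias
    assert copy.deepcopy(alias) == alias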
