104 files changed, 5985 insertions, 488 deletions
diff --git a/numpy/__init__.py b/numpy/__init__.py
index e6a24f0d1..550fb1772 100644
--- a/numpy/__init__.py
+++ b/numpy/__init__.py
@@ -136,6 +136,9 @@ else:
     __all__ = ['ModuleDeprecationWarning',
                'VisibleDeprecationWarning']
 
+    # mapping of {name: (value, deprecation_msg)}
+    __deprecated_attrs__ = {}
+
     # Allow distributors to run custom init code
     from . import _distributor_init
 
@@ -156,11 +159,35 @@ else:
     from . import matrixlib as _mat
     from .matrixlib import *
 
-    # Make these accessible from numpy name-space
-    # but not imported in from numpy import *
-    # TODO[gh-6103]: Deprecate these
-    from builtins import bool, int, float, complex, object, str
-    from .compat import long, unicode
+    # Deprecations introduced in NumPy 1.20.0, 2020-06-06
+    import builtins as _builtins
+    __deprecated_attrs__.update({
+        n: (
+            getattr(_builtins, n),
+            "`np.{n}` is a deprecated alias for the builtin `{n}`. "
+            "Use `{n}` by itself, which is identical in behavior, to silence "
+            "this warning. "
+            "If you specifically wanted the numpy scalar type, use `np.{n}_` "
+            "here."
+            .format(n=n)
+        )
+        for n in ["bool", "int", "float", "complex", "object", "str"]
+    })
+    __deprecated_attrs__.update({
+        n: (
+            getattr(compat, n),
+            "`np.{n}` is a deprecated alias for `np.compat.{n}`. "
+            "Use `np.compat.{n}` by itself, which is identical in behavior, "
+            "to silence this warning. "
+            "In the likely event your code does not need to work on Python 2 "
+            "you can use the builtin ``{n2}`` for which ``np.compat.{n}`` is "
+            "itself an alias. "
+            "If you specifically wanted the numpy scalar type, use `np.{n2}_` "
+            "here."
+            .format(n=n, n2=n2)
+        )
+        for n, n2 in [("long", "int"), ("unicode", "str")]
+    })
 
     from .core import round, abs, max, min
     # now that numpy modules are imported, can initialize limits
@@ -172,8 +199,10 @@ else:
     __all__.extend(lib.__all__)
     __all__.extend(['linalg', 'fft', 'random', 'ctypeslib', 'ma'])
 
-    # These are added by `from .core import *` and `core.__all__`, but we
-    # overwrite them above with builtins we do _not_ want to export.
+    # These are exported by np.core, but are replaced by the builtins below.
+    # Remove them to ensure that we don't end up with `np.long == np.int_`,
+    # which would be a breaking change.
+    del long, unicode
     __all__.remove('long')
     __all__.remove('unicode')
 
@@ -196,25 +225,33 @@ else:
     numarray = 'removed'
 
     if sys.version_info[:2] >= (3, 7):
-        # Importing Tester requires importing all of UnitTest which is not a
-        # cheap import Since it is mainly used in test suits, we lazy import it
-        # here to save on the order of 10 ms of import time for most users
-        #
-        # The previous way Tester was imported also had a side effect of adding
-        # the full `numpy.testing` namespace
-        #
         # module level getattr is only supported in 3.7 onwards
         # https://www.python.org/dev/peps/pep-0562/
         def __getattr__(attr):
+            # Emit warnings for deprecated attributes
+            try:
+                val, msg = __deprecated_attrs__[attr]
+            except KeyError:
+                pass
+            else:
+                warnings.warn(msg, DeprecationWarning, stacklevel=2)
+                return val
+
+            # Importing Tester requires importing all of UnitTest which is not a
+            # cheap import. Since it is mainly used in test suites, we lazy import
+            # it here to save on the order of 10 ms of import time for most users.
+            #
+            # The previous way Tester was imported also had a side effect of adding
+            # the full `numpy.testing` namespace
             if attr == 'testing':
                 import numpy.testing as testing
                 return testing
             elif attr == 'Tester':
                 from .testing import Tester
                 return Tester
-            else:
-                raise AttributeError("module {!r} has no attribute "
-                                     "{!r}".format(__name__, attr))
+
+            raise AttributeError("module {!r} has no attribute "
+                                 "{!r}".format(__name__, attr))
 
         def __dir__():
             return list(globals().keys() | {'Tester', 'testing'})
@@ -224,6 +261,13 @@ else:
         # no-one else in the world is using it (though I hope not)
         from .testing import Tester
 
+        # We weren't able to emit a warning about these, so keep them around
+        globals().update({
+            k: v
+            for k, (v, msg) in __deprecated_attrs__.items()
+        })
+
+
     # Pytest testing
     from numpy._pytesttester import PytestTester
     test = PytestTester(__name__)
@@ -279,12 +323,11 @@ else:
                 error_message = "{}: {}".format(w[-1].category.__name__, str(w[-1].message))
                 msg = (
                     "Polyfit sanity test emitted a warning, most likely due "
-                    "to using a buggy Accelerate backend. "
-                    "If you compiled yourself, "
-                    "see site.cfg.example for information. "
+                    "to using a buggy Accelerate backend. If you compiled "
+                    "yourself, more information is available at "
+                    "https://numpy.org/doc/stable/user/building.html#accelerated-blas-lapack-libraries "
                     "Otherwise report this to the vendor "
-                    "that provided NumPy.\n{}\n".format(
-                        error_message))
+                    "that provided NumPy.\n{}\n".format(error_message))
                 raise RuntimeError(msg)
     del _mac_os_check
 
diff --git a/numpy/__init__.pyi b/numpy/__init__.pyi
index 5031893ed..f9218391e 100644
--- a/numpy/__init__.pyi
+++ b/numpy/__init__.pyi
@@ -51,7 +51,12 @@ _NdArraySubClass = TypeVar("_NdArraySubClass", bound=ndarray)
 
 class dtype:
     names: Optional[Tuple[str, ...]]
-    def __init__(self, obj: DtypeLike, align: bool = ..., copy: bool = ...) -> None: ...
+    def __init__(
+        self,
+        dtype: DtypeLike,
+        align: bool = ...,
+        copy: bool = ...,
+    ) -> None: ...
     def __eq__(self, other: DtypeLike) -> bool: ...
     def __ne__(self, other: DtypeLike) -> bool: ...
     def __gt__(self, other: DtypeLike) -> bool: ...
@@ -382,18 +387,18 @@ class _real_generic(generic):  # type: ignore
 class number(generic): ...  # type: ignore
 
 class bool_(_real_generic):
-    def __init__(self, value: object = ...) -> None: ...
+    def __init__(self, __value: object = ...) -> None: ...
 
 class object_(generic):
-    def __init__(self, value: object = ...) -> None: ...
+    def __init__(self, __value: object = ...) -> None: ...
 
 class datetime64:
     @overload
     def __init__(
-        self, _data: Union[datetime64, str, dt.datetime] = ..., _format: str = ...
+        self, __value: Union[datetime64, str, dt.datetime] = ..., __format: str = ...
     ) -> None: ...
     @overload
-    def __init__(self, _data: int, _format: str) -> None: ...
+    def __init__(self, __value: int, __format: str) -> None: ...
     def __add__(self, other: Union[timedelta64, int]) -> datetime64: ...
     def __sub__(self, other: Union[timedelta64, datetime64, int]) -> timedelta64: ...
 
@@ -401,19 +406,19 @@ class integer(number, _real_generic): ...  # type: ignore
 class signedinteger(integer): ...  # type: ignore
 
 class int8(signedinteger):
-    def __init__(self, value: SupportsInt = ...) -> None: ...
+    def __init__(self, __value: SupportsInt = ...) -> None: ...
 
 class int16(signedinteger):
-    def __init__(self, value: SupportsInt = ...) -> None: ...
+    def __init__(self, __value: SupportsInt = ...) -> None: ...
 
 class int32(signedinteger):
-    def __init__(self, value: SupportsInt = ...) -> None: ...
+    def __init__(self, __value: SupportsInt = ...) -> None: ...
 
 class int64(signedinteger):
-    def __init__(self, value: SupportsInt = ...) -> None: ...
+    def __init__(self, __value: SupportsInt = ...) -> None: ...
 
 class timedelta64(signedinteger):
-    def __init__(self, _data: Any = ..., _format: str = ...) -> None: ...
+    def __init__(self, __value: Any = ..., __format: str = ...) -> None: ...
     @overload
     def __add__(self, other: Union[timedelta64, int]) -> timedelta64: ...
     @overload
@@ -433,34 +438,34 @@ class timedelta64(signedinteger):
 class unsignedinteger(integer): ...  # type: ignore
 
 class uint8(unsignedinteger):
-    def __init__(self, value: SupportsInt = ...) -> None: ...
+    def __init__(self, __value: SupportsInt = ...) -> None: ...
 
 class uint16(unsignedinteger):
-    def __init__(self, value: SupportsInt = ...) -> None: ...
+    def __init__(self, __value: SupportsInt = ...) -> None: ...
 
 class uint32(unsignedinteger):
-    def __init__(self, value: SupportsInt = ...) -> None: ...
+    def __init__(self, __value: SupportsInt = ...) -> None: ...
 
 class uint64(unsignedinteger):
-    def __init__(self, value: SupportsInt = ...) -> None: ...
+    def __init__(self, __value: SupportsInt = ...) -> None: ...
 
 class inexact(number): ...  # type: ignore
 class floating(inexact, _real_generic): ...  # type: ignore
 
 class float16(floating):
-    def __init__(self, value: SupportsFloat = ...) -> None: ...
+    def __init__(self, __value: SupportsFloat = ...) -> None: ...
 
 class float32(floating):
-    def __init__(self, value: SupportsFloat = ...) -> None: ...
+    def __init__(self, __value: SupportsFloat = ...) -> None: ...
 
 class float64(floating):
-    def __init__(self, value: SupportsFloat = ...) -> None: ...
+    def __init__(self, __value: SupportsFloat = ...) -> None: ...
 
 class complexfloating(inexact): ...  # type: ignore
 
 class complex64(complexfloating):
     def __init__(
-        self, value: Union[SupportsInt, SupportsFloat, SupportsComplex] = ...
+        self, __value: Union[SupportsInt, SupportsFloat, SupportsComplex] = ...
     ) -> None: ...
     @property
     def real(self) -> float32: ...
@@ -469,7 +474,7 @@ class complex64(complexfloating):
 
 class complex128(complexfloating):
     def __init__(
-        self, value: Union[SupportsInt, SupportsFloat, SupportsComplex] = ...
+        self, __value: Union[SupportsInt, SupportsFloat, SupportsComplex] = ...
     ) -> None: ...
     @property
     def real(self) -> float64: ...
@@ -479,24 +484,24 @@ class complex128(complexfloating):
 class flexible(_real_generic): ...  # type: ignore
 
 class void(flexible):
-    def __init__(self, value: Union[int, integer, bool_, bytes, bytes_]): ...
+    def __init__(self, __value: Union[int, integer, bool_, bytes, bytes_]): ...
 
 class character(_real_generic): ...  # type: ignore
 
 class bytes_(character):
     @overload
-    def __init__(self, value: object = ...) -> None: ...
+    def __init__(self, __value: object = ...) -> None: ...
     @overload
     def __init__(
-        self, value: object, encoding: str = ..., errors: str = ...
+        self, __value: Union[str, str_], encoding: str = ..., errors: str = ...
     ) -> None: ...
 
 class str_(character):
     @overload
-    def __init__(self, value: object = ...) -> None: ...
+    def __init__(self, __value: object = ...) -> None: ...
     @overload
     def __init__(
-        self, value: object, encoding: str = ..., errors: str = ...
+        self, __value: Union[bytes, bytes_], encoding: str = ..., errors: str = ...
     ) -> None: ...
 
 # TODO(alan): Platform dependent types
@@ -936,21 +941,18 @@ def reshape(a: ArrayLike, newshape: _ShapeLike, order: _Order = ...) -> ndarray:
 @overload
 def choose(
     a: _ScalarIntOrBool,
-    choices: Union[Sequence[ArrayLike], ndarray],
+    choices: ArrayLike,
     out: Optional[ndarray] = ...,
     mode: _Mode = ...,
 ) -> _ScalarIntOrBool: ...
 @overload
 def choose(
-    a: _IntOrBool,
-    choices: Union[Sequence[ArrayLike], ndarray],
-    out: Optional[ndarray] = ...,
-    mode: _Mode = ...,
+    a: _IntOrBool, choices: ArrayLike, out: Optional[ndarray] = ..., mode: _Mode = ...
 ) -> Union[integer, bool_]: ...
 @overload
 def choose(
     a: _ArrayLikeIntOrBool,
-    choices: Union[Sequence[ArrayLike], ndarray],
+    choices: ArrayLike,
     out: Optional[ndarray] = ...,
     mode: _Mode = ...,
 ) -> ndarray: ...
@@ -960,9 +962,7 @@ def repeat(
 def put(
     a: ndarray, ind: _ArrayLikeIntOrBool, v: ArrayLike, mode: _Mode = ...
 ) -> None: ...
-def swapaxes(
-    a: Union[Sequence[ArrayLike], ndarray], axis1: int, axis2: int
-) -> ndarray: ...
+def swapaxes(a: ArrayLike, axis1: int, axis2: int) -> ndarray: ...
 def transpose(
     a: ArrayLike, axes: Union[None, Sequence[int], ndarray] = ...
 ) -> ndarray: ...
@@ -998,54 +998,42 @@ def argpartition(
     order: Union[None, str, Sequence[str]] = ...,
 ) -> ndarray: ...
 def sort(
-    a: Union[Sequence[ArrayLike], ndarray],
+    a: ArrayLike,
     axis: Optional[int] = ...,
     kind: Optional[_SortKind] = ...,
     order: Union[None, str, Sequence[str]] = ...,
 ) -> ndarray: ...
 def argsort(
-    a: Union[Sequence[ArrayLike], ndarray],
+    a: ArrayLike,
     axis: Optional[int] = ...,
     kind: Optional[_SortKind] = ...,
     order: Union[None, str, Sequence[str]] = ...,
 ) -> ndarray: ...
 @overload
-def argmax(
-    a: Union[Sequence[ArrayLike], ndarray],
-    axis: None = ...,
-    out: Optional[ndarray] = ...,
-) -> integer: ...
+def argmax(a: ArrayLike, axis: None = ..., out: Optional[ndarray] = ...) -> integer: ...
 @overload
 def argmax(
-    a: Union[Sequence[ArrayLike], ndarray],
-    axis: int = ...,
-    out: Optional[ndarray] = ...,
+    a: ArrayLike, axis: int = ..., out: Optional[ndarray] = ...
 ) -> Union[integer, ndarray]: ...
 @overload
-def argmin(
-    a: Union[Sequence[ArrayLike], ndarray],
-    axis: None = ...,
-    out: Optional[ndarray] = ...,
-) -> integer: ...
+def argmin(a: ArrayLike, axis: None = ..., out: Optional[ndarray] = ...) -> integer: ...
 @overload
 def argmin(
-    a: Union[Sequence[ArrayLike], ndarray],
-    axis: int = ...,
-    out: Optional[ndarray] = ...,
+    a: ArrayLike, axis: int = ..., out: Optional[ndarray] = ...
 ) -> Union[integer, ndarray]: ...
 @overload
 def searchsorted(
-    a: Union[Sequence[ArrayLike], ndarray],
+    a: ArrayLike,
     v: _Scalar,
     side: _Side = ...,
-    sorter: Union[None, Sequence[_IntOrBool], ndarray] = ...,  # 1D int array
+    sorter: Optional[_ArrayLikeIntOrBool] = ...,  # 1D int array
 ) -> integer: ...
 @overload
 def searchsorted(
-    a: Union[Sequence[ArrayLike], ndarray],
+    a: ArrayLike,
     v: ArrayLike,
     side: _Side = ...,
-    sorter: Union[None, Sequence[_IntOrBool], ndarray] = ...,  # 1D int array
+    sorter: Optional[_ArrayLikeIntOrBool] = ...,  # 1D int array
 ) -> ndarray: ...
 def resize(a: ArrayLike, new_shape: _ShapeLike) -> ndarray: ...
 @overload
@@ -1053,13 +1041,10 @@ def squeeze(a: _ScalarGeneric, axis: Optional[_ShapeLike] = ...) -> _ScalarGener
 @overload
 def squeeze(a: ArrayLike, axis: Optional[_ShapeLike] = ...) -> ndarray: ...
 def diagonal(
-    a: Union[Sequence[Sequence[ArrayLike]], ndarray],  # >= 2D array
-    offset: int = ...,
-    axis1: int = ...,
-    axis2: int = ...,
+    a: ArrayLike, offset: int = ..., axis1: int = ..., axis2: int = ...  # >= 2D array
 ) -> ndarray: ...
 def trace(
-    a: Union[Sequence[Sequence[ArrayLike]], ndarray],  # >= 2D array
+    a: ArrayLike,  # >= 2D array
     offset: int = ...,
     axis1: int = ...,
     axis2: int = ...,
@@ -1070,7 +1055,7 @@ def ravel(a: ArrayLike, order: _Order = ...) -> ndarray: ...
 def nonzero(a: ArrayLike) -> Tuple[ndarray, ...]: ...
 def shape(a: ArrayLike) -> _Shape: ...
 def compress(
-    condition: Union[Sequence[_Bool], ndarray],  # 1D bool array
+    condition: ArrayLike,  # 1D bool array
     a: ArrayLike,
     axis: Optional[int] = ...,
     out: Optional[ndarray] = ...,
diff --git a/numpy/_pytesttester.py b/numpy/_pytesttester.py
index ca86aeb22..1c32367f3 100644
--- a/numpy/_pytesttester.py
+++ b/numpy/_pytesttester.py
@@ -35,12 +35,27 @@ __all__ = ['PytestTester']
 
 
 def _show_numpy_info():
+    from numpy.core._multiarray_umath import (
+        __cpu_features__, __cpu_baseline__, __cpu_dispatch__
+    )
     import numpy as np
 
     print("NumPy version %s" % np.__version__)
     relaxed_strides = np.ones((10, 1), order="C").flags.f_contiguous
     print("NumPy relaxed strides checking option:", relaxed_strides)
 
+    if len(__cpu_baseline__) == 0 and len(__cpu_dispatch__) == 0:
+        enabled_features = "nothing enabled"
+    else:
+        enabled_features = ' '.join(__cpu_baseline__)
+        for feature in __cpu_dispatch__:
+            if __cpu_features__[feature]:
+                enabled_features += " %s*" % feature
+            else:
+                enabled_features += " %s?" % feature
+    print("NumPy CPU features:", enabled_features)
+
+
 
 class PytestTester:
     """
diff --git a/numpy/core/_add_newdocs.py b/numpy/core/_add_newdocs.py
index 688238af3..33c1f08b1 100644
--- a/numpy/core/_add_newdocs.py
+++ b/numpy/core/_add_newdocs.py
@@ -1307,14 +1307,14 @@ add_newdoc('numpy.core.multiarray', 'arange',
 
     Parameters
     ----------
-    start : number, optional
+    start : integer or real, optional
         Start of interval.  The interval includes this value.  The default
         start value is 0.
-    stop : number
+    stop : integer or real
         End of interval.  The interval does not include this value, except
         in some cases where `step` is not an integer and floating point
         round-off affects the length of `out`.
-    step : number, optional
+    step : integer or real, optional
         Spacing between values.  For any output `out`, this is the distance
         between two adjacent values, ``out[i+1] - out[i]``.  The default
         step size is 1.  If `step` is specified as a position argument,
@@ -1525,7 +1525,7 @@ add_newdoc('numpy.core.multiarray', 'c_einsum',
         Controls the memory layout of the output. 'C' means it should
         be C contiguous. 'F' means it should be Fortran contiguous,
         'A' means it should be 'F' if the inputs are all 'F', 'C' otherwise.
-        'K' means it should be as close to the layout as the inputs as
+        'K' means it should be as close to the layout of the inputs as
         is possible, including arbitrarily permuted axes.
         Default is 'K'.
     casting : {'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional
@@ -3936,18 +3936,17 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('tobytes', """
     Construct Python bytes containing the raw data bytes in the array.
 
     Constructs Python bytes showing a copy of the raw contents of
-    data memory. The bytes object can be produced in either 'C' or 'Fortran',
-    or 'Any' order (the default is 'C'-order). 'Any' order means C-order
-    unless the F_CONTIGUOUS flag in the array is set, in which case it
-    means 'Fortran' order.
+    data memory. The bytes object is produced in C-order by default.
+    This behavior is controlled by the ``order`` parameter.
 
     .. versionadded:: 1.9.0
 
     Parameters
     ----------
-    order : {'C', 'F', None}, optional
-        Order of the data for multidimensional arrays:
-        C, Fortran, or the same as for the original array.
+    order : {'C', 'F', 'A'}, optional
+        Controls the memory layout of the bytes object. 'C' means C-order,
+        'F' means F-order, 'A' (short for *Any*) means 'F' if `a` is
+        Fortran contiguous, 'C' otherwise. Default is 'C'.
 
     Returns
     -------
@@ -5142,7 +5141,7 @@ add_newdoc('numpy.core', 'ufunc', ('at',
 
 add_newdoc('numpy.core.multiarray', 'dtype',
     """
-    dtype(obj, align=False, copy=False)
+    dtype(dtype, align=False, copy=False)
 
     Create a data type object.
 
@@ -5152,7 +5151,7 @@ add_newdoc('numpy.core.multiarray', 'dtype',
 
     Parameters
     ----------
-    obj
+    dtype
         Object to be converted to a data type object.
     align : bool, optional
         Add padding to the fields to match what a C compiler would output
diff --git a/numpy/core/_type_aliases.py b/numpy/core/_type_aliases.py
index c26431443..de90fd818 100644
--- a/numpy/core/_type_aliases.py
+++ b/numpy/core/_type_aliases.py
@@ -11,40 +11,19 @@ and sometimes other mappings too.
 .. data:: sctypeDict
     Similar to `allTypes`, but maps a broader set of aliases to their types.
 
-.. data:: sctypeNA
-    NumArray-compatible names for the scalar types. Contains not only
-    ``name: type`` mappings, but ``char: name`` mappings too.
-
-    .. deprecated:: 1.16
-
 .. data:: sctypes
     A dictionary keyed by a "type group" string, providing a list of types
     under that group.
 
 """
-import warnings
 
 from numpy.compat import unicode
-from numpy._globals import VisibleDeprecationWarning
-from numpy.core._string_helpers import english_lower, english_capitalize
+from numpy.core._string_helpers import english_lower
 from numpy.core.multiarray import typeinfo, dtype
 from numpy.core._dtype import _kind_name
 
 
 sctypeDict = {}      # Contains all leaf-node scalar types with aliases
-class TypeNADict(dict):
-    def __getitem__(self, key):
-        # 2018-06-24, 1.16
-        warnings.warn('sctypeNA and typeNA will be removed in v1.18 '
-                      'of numpy', VisibleDeprecationWarning, stacklevel=2)
-        return dict.__getitem__(self, key)
-    def get(self, key, default=None):
-        # 2018-06-24, 1.16
-        warnings.warn('sctypeNA and typeNA will be removed in v1.18 '
-                      'of numpy', VisibleDeprecationWarning, stacklevel=2)
-        return dict.get(self, key, default)
-
-sctypeNA = TypeNADict()  # Contails all leaf-node types -> numarray type equivalences
 allTypes = {}            # Collect the types we will add to the module
 
 
@@ -127,27 +106,24 @@ def _add_aliases():
         if name in ('longdouble', 'clongdouble') and myname in allTypes:
             continue
 
-        base_capitalize = english_capitalize(base)
-        if base == 'complex':
-            na_name = '%s%d' % (base_capitalize, bit//2)
-        elif base == 'bool':
-            na_name = base_capitalize
-        else:
-            na_name = "%s%d" % (base_capitalize, bit)
-
         allTypes[myname] = info.type
 
         # add mapping for both the bit name and the numarray name
         sctypeDict[myname] = info.type
-        sctypeDict[na_name] = info.type
 
         # add forward, reverse, and string mapping to numarray
-        sctypeNA[na_name] = info.type
-        sctypeNA[info.type] = na_name
-        sctypeNA[info.char] = na_name
-
         sctypeDict[char] = info.type
-        sctypeNA[char] = na_name
+
+    # Add deprecated numeric-style type aliases manually. At some point
+    # we may want to deprecate the lower case "bytes0" version as well.
+    for name in ["Bytes0", "Datetime64", "Str0", "Uint32", "Uint64"]:
+        if english_lower(name) not in allTypes:
+            # Only one of Uint32 or Uint64, aliases of `np.uintp`, was (and is)
+            # defined. Note that this is not UInt32/UInt64 (capital i), which is removed.
+            continue
+        allTypes[name] = allTypes[english_lower(name)]
+        sctypeDict[name] = sctypeDict[english_lower(name)]
+
 _add_aliases()
 
 def _add_integer_aliases():
@@ -157,20 +133,15 @@ def _add_integer_aliases():
         u_info = _concrete_typeinfo[u_ctype]
         bits = i_info.bits  # same for both
 
-        for info, charname, intname, Intname in [
-                (i_info,'i%d' % (bits//8,), 'int%d' % bits, 'Int%d' % bits),
-                (u_info,'u%d' % (bits//8,), 'uint%d' % bits, 'UInt%d' % bits)]:
+        for info, charname, intname in [
+                (i_info,'i%d' % (bits//8,), 'int%d' % bits),
+                (u_info,'u%d' % (bits//8,), 'uint%d' % bits)]:
             if bits not in seen_bits:
                 # sometimes two different types have the same number of bits
                 # if so, the one iterated over first takes precedence
                 allTypes[intname] = info.type
                 sctypeDict[intname] = info.type
-                sctypeDict[Intname] = info.type
                 sctypeDict[charname] = info.type
-                sctypeNA[Intname] = info.type
-                sctypeNA[charname] = info.type
-            sctypeNA[info.type] = Intname
-            sctypeNA[info.char] = Intname
 
         seen_bits.add(bits)
 
diff --git a/numpy/core/fromnumeric.py b/numpy/core/fromnumeric.py
index 2b88ccedf..412d9fe6a 100644
--- a/numpy/core/fromnumeric.py
+++ b/numpy/core/fromnumeric.py
@@ -649,6 +649,10 @@ def transpose(a, axes=None):
     >>> np.transpose(x, (1, 0, 2)).shape
     (2, 1, 3)
 
+    >>> x = np.ones((2, 3, 4, 5))
+    >>> np.transpose(x).shape
+    (5, 4, 3, 2)
+
     """
     return _wrapfunc(a, 'transpose', axes)
 
diff --git a/numpy/core/function_base.py b/numpy/core/function_base.py
index 9e46f0ea5..f57e95742 100644
--- a/numpy/core/function_base.py
+++ b/numpy/core/function_base.py
@@ -52,8 +52,10 @@ def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None,
         If True, return (`samples`, `step`), where `step` is the spacing
         between samples.
     dtype : dtype, optional
-        The type of the output array.  If `dtype` is not given, infer the data
-        type from the other input arguments.
+        The type of the output array.  If `dtype` is not given, the data type
+        is inferred from `start` and `stop`. The inferred dtype will never be
+        an integer; `float` is chosen even if the arguments would produce an
+        array of integers.
 
         .. versionadded:: 1.9.0
 
@@ -202,8 +204,10 @@ def logspace(start, stop, num=50, endpoint=True, base=10.0, dtype=None,
         ``ln(samples) / ln(base)`` (or ``log_base(samples)``) is uniform.
         Default is 10.0.
     dtype : dtype
-        The type of the output array.  If `dtype` is not given, infer the data
-        type from the other input arguments.
+        The type of the output array.  If `dtype` is not given, the data type
+        is inferred from `start` and `stop`. The inferred dtype will never be
+        an integer; `float` is chosen even if the arguments would produce an
+        array of integers.
     axis : int, optional
         The axis in the result to store the samples.  Relevant only if start
         or stop are array-like.  By default (0), the samples will be along a
@@ -297,8 +301,10 @@ def geomspace(start, stop, num=50, endpoint=True, dtype=None, axis=0):
         If true, `stop` is the last sample. Otherwise, it is not included.
         Default is True.
     dtype : dtype
-        The type of the output array.  If `dtype` is not given, infer the data
-        type from the other input arguments.
+        The type of the output array.  If `dtype` is not given, the data type
+        is inferred from `start` and `stop`. The inferred dtype will never be
+        an integer; `float` is chosen even if the arguments would produce an
+        array of integers.
     axis : int, optional
         The axis in the result to store the samples.  Relevant only if start
         or stop are array-like.  By default (0), the samples will be along a
@@ -408,8 +414,18 @@ def geomspace(start, stop, num=50, endpoint=True, dtype=None, axis=0):
 
     log_start = _nx.log10(start)
     log_stop = _nx.log10(stop)
-    result = out_sign * logspace(log_start, log_stop, num=num,
-                                 endpoint=endpoint, base=10.0, dtype=dtype)
+    result = logspace(log_start, log_stop, num=num,
+                      endpoint=endpoint, base=10.0, dtype=dtype)
+
+    # Make sure the endpoints match the start and stop arguments. This is
+    # necessary because 10**np.log10(x) is not necessarily equal to x.
+    if num > 0:
+        result[0] = start
+        if num > 1 and endpoint:
+            result[-1] = stop
+
+    result = out_sign * result
+
     if axis != 0:
         result = _nx.moveaxis(result, 0, axis)
 
diff --git a/numpy/core/include/numpy/ndarraytypes.h b/numpy/core/include/numpy/ndarraytypes.h
index 1b61899fa..275bb336b 100644
--- a/numpy/core/include/numpy/ndarraytypes.h
+++ b/numpy/core/include/numpy/ndarraytypes.h
@@ -341,9 +341,6 @@ struct NpyAuxData_tag {
 #define NPY_ERR(str) fprintf(stderr, #str); fflush(stderr);
 #define NPY_ERR2(str) fprintf(stderr, str); fflush(stderr);
 
-#define NPY_STRINGIFY(x) #x
-#define NPY_TOSTRING(x) NPY_STRINGIFY(x)
-
   /*
    * Macros to define how array, and dimension/strides data is
    * allocated.
diff --git a/numpy/core/include/numpy/utils.h b/numpy/core/include/numpy/utils.h
index 32218b8c7..e251a5201 100644
--- a/numpy/core/include/numpy/utils.h
+++ b/numpy/core/include/numpy/utils.h
@@ -2,20 +2,36 @@
 #define __NUMPY_UTILS_HEADER__
 
 #ifndef __COMP_NPY_UNUSED
-        #if defined(__GNUC__)
-                #define __COMP_NPY_UNUSED __attribute__ ((__unused__))
-        # elif defined(__ICC)
-                #define __COMP_NPY_UNUSED __attribute__ ((__unused__))
-        # elif defined(__clang__)
-                #define __COMP_NPY_UNUSED __attribute__ ((unused))
-        #else
-                #define __COMP_NPY_UNUSED
-        #endif
+    #if defined(__GNUC__)
+        #define __COMP_NPY_UNUSED __attribute__ ((__unused__))
+    #elif defined(__ICC)
+        #define __COMP_NPY_UNUSED __attribute__ ((__unused__))
+    #elif defined(__clang__)
+        #define __COMP_NPY_UNUSED __attribute__ ((unused))
+    #else
+        #define __COMP_NPY_UNUSED
+    #endif
+#endif
+
+#if defined(__GNUC__) || defined(__ICC) || defined(__clang__)
+    #define NPY_DECL_ALIGNED(x) __attribute__ ((aligned (x)))
+#elif defined(_MSC_VER)
+    #define NPY_DECL_ALIGNED(x) __declspec(align(x))
+#else
+    #define NPY_DECL_ALIGNED(x)
 #endif
 
 /* Use this to tag a variable as not used. It will remove unused variable
  * warning on support platforms (see __COM_NPY_UNUSED) and mangle the variable
  * to avoid accidental use */
 #define NPY_UNUSED(x) (__NPY_UNUSED_TAGGED ## x) __COMP_NPY_UNUSED
+#define NPY_EXPAND(x) x
+
+#define NPY_STRINGIFY(x) #x
+#define NPY_TOSTRING(x) NPY_STRINGIFY(x)
+
+#define NPY_CAT__(a, b) a ## b
+#define NPY_CAT_(a, b) NPY_CAT__(a, b)
+#define NPY_CAT(a, b) NPY_CAT_(a, b)
 
 #endif
diff --git a/numpy/core/numerictypes.py b/numpy/core/numerictypes.py
index aac741612..2a015f48f 100644
--- a/numpy/core/numerictypes.py
+++ b/numpy/core/numerictypes.py
@@ -91,7 +91,7 @@ from numpy.core.multiarray import (
 from numpy.core.overrides import set_module
 
 # we add more at the bottom
-__all__ = ['sctypeDict', 'sctypeNA', 'typeDict', 'typeNA', 'sctypes',
+__all__ = ['sctypeDict', 'typeDict', 'sctypes',
            'ScalarType', 'obj2sctype', 'cast', 'nbytes', 'sctype2char',
            'maximum_sctype', 'issctype', 'typecodes', 'find_common_type',
            'issubdtype', 'datetime_data', 'datetime_as_string',
@@ -106,7 +106,6 @@ from ._string_helpers import (
 
 from ._type_aliases import (
     sctypeDict,
-    sctypeNA,
     allTypes,
     bitname,
     sctypes,
@@ -512,7 +511,6 @@ typecodes = {'Character':'c',
 
 # backwards compatibility --- deprecated name
 typeDict = sctypeDict
-typeNA = sctypeNA
 
 # b -> boolean
 # u -> unsigned integer
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 5351b30bf..549860179 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -738,6 +738,7 @@ def configuration(parent_package='',top_path=None):
             join('src', 'common', 'ufunc_override.h'),
             join('src', 'common', 'umathmodule.h'),
             join('src', 'common', 'numpyos.h'),
+            join('src', 'common', 'npy_cpu_dispatch.h'),
             ]
 
     common_src = [
@@ -939,8 +940,11 @@ def configuration(parent_package='',top_path=None):
     #                        umath_tests module                           #
     #######################################################################
 
-    config.add_extension('_umath_tests',
-                    sources=[join('src', 'umath', '_umath_tests.c.src')])
+    config.add_extension('_umath_tests', sources=[
+        join('src', 'umath', '_umath_tests.c.src'),
+        join('src', 'umath', '_umath_tests.dispatch.c'),
+        join('src', 'common', 'npy_cpu_features.c.src'),
+    ])
 
     #######################################################################
     #                   custom rational dtype module                      #
diff --git a/numpy/core/src/common/npy_config.h b/numpy/core/src/common/npy_config.h
index aebe241a5..4493409bb 100644
--- a/numpy/core/src/common/npy_config.h
+++ b/numpy/core/src/common/npy_config.h
@@ -3,6 +3,7 @@
 
 #include "config.h"
 #include "npy_cpu_features.h"
+#include "npy_cpu_dispatch.h"
 #include "numpy/numpyconfig.h"
 #include "numpy/npy_cpu.h"
 #include "numpy/npy_os.h"
diff --git a/numpy/core/src/common/npy_cpu_dispatch.h b/numpy/core/src/common/npy_cpu_dispatch.h
new file mode 100644
index 000000000..846d1ebb9
--- /dev/null
+++ b/numpy/core/src/common/npy_cpu_dispatch.h
@@ -0,0 +1,260 @@
+#ifndef NPY_CPU_DISPATCH_H_
+#define NPY_CPU_DISPATCH_H_
+/**
+ * This file is part of the NumPy CPU dispatcher. Please have a look at doc/reference/simd-optimizations.html
+ * to get a better understanding of the mechanism behind it.
+ */
+#include "npy_cpu_features.h" // NPY_CPU_HAVE
+#include "numpy/utils.h" // NPY_EXPAND, NPY_CAT
+/**
+ * Bringing in the main configuration header '_cpu_dispatch.h'.
+ *
+ * This header is generated by the distutils module 'ccompiler_opt',
+ * and contains all the #definitions and headers of instruction-sets,
+ * that had been configured through command arguments '--cpu-baseline' and '--cpu-dispatch'.
+ *
+ * It also contains extra C #definitions and macros that are used for implementing
+ * NumPy module's attributes `__cpu_baseline__` and `__cpu_dispatch__`.
+ */
+/**
+ * Note: Always guard the generated headers within 'NPY_DISABLE_OPTIMIZATION',
+ * due to the nature of command argument '--disable-optimization',
+ * which explicitly disables the module ccompiler_opt.
+ */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #if defined(__powerpc64__) && !defined(__cplusplus) && defined(bool)
+        /**
+         * "altivec.h" header contains the definitions(bool, vector, pixel),
+         * usually in c++ we undefine them after including the header.
+         * It's better anyway to take them off and use built-in types(__vector, __pixel, __bool) instead,
+         * since c99 supports bool variables which may lead to ambiguous errors.
+        */
+        // backup 'bool' before including '_cpu_dispatch.h', since it may not be defined as a compiler token.
+        #define NPY__DISPATCH_DEFBOOL
+        typedef bool npy__dispatch_bkbool;
+    #endif
+    #include "_cpu_dispatch.h"
+    #ifdef NPY_HAVE_VSX
+        #undef bool
+        #undef vector
+        #undef pixel
+        #ifdef NPY__DISPATCH_DEFBOOL
+            #define bool npy__dispatch_bkbool
+        #endif
+    #endif
+#endif // !NPY_DISABLE_OPTIMIZATION
+/**
+ * Macro NPY_CPU_DISPATCH_CURFX(NAME)
+ *
+ * Returns @NAME suffixed with "_" + "the current target" while compiling
+ * the wrapped sources that are generated from the dispatch-able sources according
+ * to the provided configuration statements.
+ *
+ * It also returns @NAME as-is without any suffix when it comes to the baseline or
+ * in case the optimization is disabled.
+ *
+ * The idea behind this Macro is to allow exporting certain symbols and to
+ * avoid linking duplications due to the nature of the dispatch-able sources.
+ *
+ * Example:
+ *    @targets baseline avx avx512_skx vsx3 asimdhp // configuration statements
+ *
+ *    void NPY_CPU_DISPATCH_CURFX(dispatch_me)(const int *src, int *dst)
+ *    {
+ *       // the kernel
+ *    }
+ *
+ * By assuming the required optimizations are enabled via '--cpu-dispatch' and
+ * the compiler supported them too, then the generated symbols will be named as follows:
+ *
+ * - x86:
+ *      dispatch_me(const int*, int*) // baseline
+ *      dispatch_me_AVX(const int*, int*)
+ *      dispatch_me_AVX512_SKX(const int*, int*)
+ *
+ * - ppc64:
+ *      dispatch_me(const int*, int*)
+ *      dispatch_me_VSX3(const int*, int*)
+ *
+ * - ARM:
+ *      dispatch_me(const int*, int*)
+ *      dispatch_me_ASIMDHP(const int*, int*)
+ *
+ * - unsupported arch or when optimization is disabled:
+ *      dispatch_me(const int*, int*)
+ *
+ * For forward declarations, see 'NPY_CPU_DISPATCH_DECLARE'.
+ */
+#ifdef NPY__CPU_TARGET_CURRENT
+    // 'NPY__CPU_TARGET_CURRENT': only defined by the dispatch-able sources
+    #define NPY_CPU_DISPATCH_CURFX(NAME) NPY_CAT(NPY_CAT(NAME, _), NPY__CPU_TARGET_CURRENT)
+#else
+    #define NPY_CPU_DISPATCH_CURFX(NAME) NPY_EXPAND(NAME)
+#endif
+/**
+ * Defining the default behavior for the configurable macros of dispatch-able sources,
+ * 'NPY__CPU_DISPATCH_CALL(...)' and 'NPY__CPU_DISPATCH_BASELINE_CALL(...)'
+ *
+ * These macros are defined inside the generated config files that have been derived from
+ * the configuration statements of the dispatch-able sources.
+ *
+ * The generated config file takes the same name of the dispatch-able source with replacing
+ * the extension to '.h' instead of '.c', and it should be treated as a header template.
+ *
+ * For more clarification, please have a look at doc/reference/simd-optimizations.html.
+ */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #define NPY__CPU_DISPATCH_BASELINE_CALL(CB, ...) \
+        &&"Expected config header of the dispatch-able source";
+    #define NPY__CPU_DISPATCH_CALL(CHK, CB, ...) \
+        &&"Expected config header of the dispatch-able source";
+#else
+    /**
+     * We assume by default that all configuration statements contain the 'baseline' option; however,
+     * if the dispatch-able source doesn't require it, then the dispatch-able source and the following macros
+     * need to be guarded with '#ifndef NPY_DISABLE_OPTIMIZATION'
+     */
+    #define NPY__CPU_DISPATCH_BASELINE_CALL(CB, ...) \
+        NPY_EXPAND(CB(__VA_ARGS__))
+    #define NPY__CPU_DISPATCH_CALL(CHK, CB, ...)
+#endif // !NPY_DISABLE_OPTIMIZATION
+/**
+ * Macro NPY_CPU_DISPATCH_DECLARE(LEFT, ...) is used to provide forward
+ * declarations for the exported variables and functions that defined inside
+ * the dispatch-able sources.
+ *
+ * The first argument should end with the exported function or variable name,
+ * while the macro pastes the extra arguments after it.
+ *
+ * Examples:
+ *    #ifndef NPY_DISABLE_OPTIMIZATION
+ *       #include "dispatchable_source_name.dispatch.h"
+ *    #endif
+ *
+ *    NPY_CPU_DISPATCH_DECLARE(void dispatch_me, (const int*, int*))
+ *    NPY_CPU_DISPATCH_DECLARE(extern cb_type callback_tab, [TAB_SIZE])
+ *
+ * Assuming the provided config header is derived from a dispatch-able source
+ * that is configured with "@targets baseline sse41 vsx3 asimdhp", and that these targets
+ * are supported by the compiler and enabled via '--cpu-dispatch',
+ * then the prototype declarations in the above example will be equivalent to the following:
+ *
+ * - x86:
+ *      void dispatch_me(const int*, int*); // baseline
+ *      void dispatch_me_SSE41(const int*, int*);
+ *
+ *      extern cb_type callback_tab[TAB_SIZE];
+ *      extern cb_type callback_tab_SSE41[TAB_SIZE];
+ *
+ * - ppc64:
+ *      void dispatch_me(const int*, int*);
+ *      void dispatch_me_VSX3(const int*, int*);
+ *
+ *      extern cb_type callback_tab[TAB_SIZE];
+ *      extern cb_type callback_tab_VSX3[TAB_SIZE];
+ *
+ * - ARM:
+ *     void dispatch_me(const int*, int*);
+ *     void dispatch_me_ASIMDHP(const int*, int*);
+ *
+ *     extern cb_type callback_tab[TAB_SIZE];
+ *     extern cb_type callback_tab_ASIMDHP[TAB_SIZE];
+ *
+ * - unsupported arch or when optimization is disabled:
+ *     void dispatch_me(const int*, int*);
+ *     extern cb_type callback_tab[TAB_SIZE];
+ *
+ * For runtime dispatching, see 'NPY_CPU_DISPATCH_CALL'
+ */
+#define NPY_CPU_DISPATCH_DECLARE(...) \
+    NPY__CPU_DISPATCH_CALL(NPY_CPU_DISPATCH_DECLARE_CHK_, NPY_CPU_DISPATCH_DECLARE_CB_, __VA_ARGS__) \
+    NPY__CPU_DISPATCH_BASELINE_CALL(NPY_CPU_DISPATCH_DECLARE_BASE_CB_, __VA_ARGS__)
+// Preprocessor callbacks
+#define NPY_CPU_DISPATCH_DECLARE_CB_(DUMMY, TARGET_NAME, LEFT, ...) \
+    NPY_CAT(NPY_CAT(LEFT, _), TARGET_NAME) __VA_ARGS__;
+#define NPY_CPU_DISPATCH_DECLARE_BASE_CB_(LEFT, ...) \
+    LEFT __VA_ARGS__;
+// Dummy CPU runtime checking
+#define NPY_CPU_DISPATCH_DECLARE_CHK_(FEATURE)
+/**
+ * Macro NPY_CPU_DISPATCH_DECLARE_XB(LEFT, ...)
+ *
+ * Same as `NPY_CPU_DISPATCH_DECLARE` but excludes the baseline declaration even
+ * if it was provided within the configuration statements.
+ */
+#define NPY_CPU_DISPATCH_DECLARE_XB(...) \
+    NPY__CPU_DISPATCH_CALL(NPY_CPU_DISPATCH_DECLARE_CHK_, NPY_CPU_DISPATCH_DECLARE_CB_, __VA_ARGS__)
+/**
+ * Macro NPY_CPU_DISPATCH_CALL(LEFT, ...) is used for runtime dispatching
+ * of the exported functions and variables within the dispatch-able sources
+ * according to the highest interested CPU features that are supported by the
+ * running machine, depending on the required optimizations.
+ *
+ * The first argument should end with the exported function or variable name,
+ * while the macro pastes the extra arguments after it.
+ *
+ * Example:
+ *  Assume we have a dispatch-able source exporting the following function:
+ *
+ *    @targets baseline avx2 avx512_skx // configuration statements
+ *
+ *    void NPY_CPU_DISPATCH_CURFX(dispatch_me)(const int *src, int *dst)
+ *    {
+ *       // the kernel
+ *    }
+ *
+ *  In order to call or to assign the pointer of it from outside the dispatch-able source,
+ *  you have to use this Macro as follows:
+ *
+ *    // bring the generated config header of the dispatch-able source
+ *    #ifndef NPY_DISABLE_OPTIMIZATION
+ *        #include "dispatchable_source_name.dispatch.h"
+ *    #endif
+ *    // forward declaration
+ *    NPY_CPU_DISPATCH_DECLARE(void dispatch_me, (const int *src, int *dst))
+ *
+ *    typedef void(*func_type)(const int*, int*);
+ *    func_type the_callee(const int *src, int *dst, func_type *cb)
+ *    {
+ *        // direct call
+ *        NPY_CPU_DISPATCH_CALL(dispatch_me, (src, dst))
+ *        // assign the pointer
+ *        NPY_CPU_DISPATCH_CALL(*cb = dispatch_me)
+ *        // return the pointer
+ *        NPY_CPU_DISPATCH_CALL(return dispatch_me)
+ *    }
+ */
+#define NPY_CPU_DISPATCH_CALL(...) \
+    if (0) {/*DUMMY*/} \
+    NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, NPY_CPU_DISPATCH_CALL_CB_, __VA_ARGS__) \
+    NPY__CPU_DISPATCH_BASELINE_CALL(NPY_CPU_DISPATCH_CALL_BASE_CB_, __VA_ARGS__)
+// Preprocessor callbacks
+#define NPY_CPU_DISPATCH_CALL_CB_(TESTED_FEATURES, TARGET_NAME, LEFT, ...) \
+    else if (TESTED_FEATURES) { NPY_CAT(NPY_CAT(LEFT, _), TARGET_NAME) __VA_ARGS__; }
+#define NPY_CPU_DISPATCH_CALL_BASE_CB_(LEFT, ...) \
+    else { LEFT __VA_ARGS__; }
+/**
+ * Macro NPY_CPU_DISPATCH_CALL_XB(LEFT, ...)
+ *
+ * Same as `NPY_CPU_DISPATCH_CALL` but excludes the baseline call even
+ * if it was provided within the configuration statements.
+ */
+#define NPY_CPU_DISPATCH_CALL_XB(...) \
+    if (0) {/*DUMMY*/} \
+    NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, NPY_CPU_DISPATCH_CALL_CB_, __VA_ARGS__)
+/**
+ * Macro NPY_CPU_DISPATCH_CALL_ALL(LEFT, ...)
+ *
+ * Same as `NPY_CPU_DISPATCH_CALL` but dispatching all the required optimizations for
+ * the exported functions and variables instead of only the highest interested one.
+ */
+#define NPY_CPU_DISPATCH_CALL_ALL(...) \
+    NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, NPY_CPU_DISPATCH_CALL_ALL_CB_, __VA_ARGS__) \
+    NPY__CPU_DISPATCH_BASELINE_CALL(NPY_CPU_DISPATCH_CALL_ALL_BASE_CB_, __VA_ARGS__)
+// Preprocessor callbacks
+#define NPY_CPU_DISPATCH_CALL_ALL_CB_(TESTED_FEATURES, TARGET_NAME, LEFT, ...) \
+    if (TESTED_FEATURES) { NPY_CAT(NPY_CAT(LEFT, _), TARGET_NAME) __VA_ARGS__; }
+#define NPY_CPU_DISPATCH_CALL_ALL_BASE_CB_(LEFT, ...) \
+    { LEFT __VA_ARGS__; }
+
+#endif // NPY_CPU_DISPATCH_H_
diff --git a/numpy/core/src/common/npy_cpu_features.c.src b/numpy/core/src/common/npy_cpu_features.c.src
index d35199760..facd27f3c 100644
--- a/numpy/core/src/common/npy_cpu_features.c.src
+++ b/numpy/core/src/common/npy_cpu_features.c.src
@@ -1,6 +1,7 @@
 #include "npy_cpu_features.h"
+#include "npy_cpu_dispatch.h" // To guarantee the CPU baseline definitions are in scope.
 #include "numpy/npy_common.h" // for NPY_INLINE
-#include "numpy/npy_cpu.h" // To guarantee of having CPU definitions in scope.
+#include "numpy/npy_cpu.h" // To guarantee the CPU definitions are in scope.
 
 /******************** Private Definitions *********************/
 
@@ -55,6 +56,44 @@ npy_cpu_features_dict(void)
     return dict;
 }
 
+#define NPY__CPU_PYLIST_APPEND_CB(FEATURE, LIST) \
+    item = PyUnicode_FromString(NPY_TOSTRING(FEATURE)); \
+    if (item == NULL) { \
+        Py_DECREF(LIST); \
+        return NULL; \
+    } \
+    PyList_SET_ITEM(LIST, index++, item);
+
+NPY_VISIBILITY_HIDDEN PyObject *
+npy_cpu_baseline_list(void)
+{
+#if !defined(NPY_DISABLE_OPTIMIZATION) && NPY_WITH_CPU_BASELINE_N > 0
+    PyObject *list = PyList_New(NPY_WITH_CPU_BASELINE_N), *item;
+    int index = 0;
+    if (list != NULL) {
+        NPY_WITH_CPU_BASELINE_CALL(NPY__CPU_PYLIST_APPEND_CB, list)
+    }
+    return list;
+#else
+    return PyList_New(0);
+#endif
+}
+
+NPY_VISIBILITY_HIDDEN PyObject *
+npy_cpu_dispatch_list(void)
+{
+#if !defined(NPY_DISABLE_OPTIMIZATION) && NPY_WITH_CPU_DISPATCH_N > 0
+    PyObject *list = PyList_New(NPY_WITH_CPU_DISPATCH_N), *item;
+    int index = 0;
+    if (list != NULL) {
+        NPY_WITH_CPU_DISPATCH_CALL(NPY__CPU_PYLIST_APPEND_CB, list)
+    }
+    return list;
+#else
+    return PyList_New(0);
+#endif
+}
+
 /****************************************************************
  * This section is reserved to defining @npy__cpu_init_features
  * for each CPU architecture, please try to keep it clean. Ty
@@ -366,7 +405,7 @@ npy__cpu_init_features(void)
         return;
 #endif
     // We have nothing else todo
-#if defined(NPY_HAVE_NEON_ARM8) || defined(__aarch64__) || (defined(__ARM_ARCH) && __ARM_ARCH >= 8)
+#if defined(NPY_HAVE_ASIMD) || defined(__aarch64__) || (defined(__ARM_ARCH) && __ARM_ARCH >= 8)
     #if defined(NPY_HAVE_FPHP) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
     npy__cpu_have[NPY_CPU_FEATURE_FPHP] = 1;
     #endif
diff --git a/numpy/core/src/common/npy_cpu_features.h b/numpy/core/src/common/npy_cpu_features.h
index 0e8901328..fffdef38e 100644
--- a/numpy/core/src/common/npy_cpu_features.h
+++ b/numpy/core/src/common/npy_cpu_features.h
@@ -109,6 +109,48 @@ npy_cpu_have(NPY_CPU_FEATURE_##FEATURE_NAME)
  */
 NPY_VISIBILITY_HIDDEN PyObject *
 npy_cpu_features_dict(void);
+/*
+ * Return a new Python list containing the minimal set of required optimizations
+ * that are supported by the compiler and platform according to the values
+ * specified for the command argument '--cpu-baseline'.
+ *
+ * This function is mainly used to implement umath's attribute '__cpu_baseline__',
+ * and the items are sorted from the lowest to highest interest.
+ *
+ * For example, according to the default build configuration and assuming the compiler
+ * supports all the involved optimizations, the returned list should be equivalent to:
+ *
+ * On x86: ['SSE', 'SSE2']
+ * On x64: ['SSE', 'SSE2', 'SSE3']
+ * On armhf: []
+ * On aarch64: ['NEON', 'NEON_FP16', 'NEON_VFPV4', 'ASIMD']
+ * On ppc64: []
+ * On ppc64le: ['VSX', 'VSX2']
+ * On any other arch or if the optimization is disabled: []
+ */
+NPY_VISIBILITY_HIDDEN PyObject *
+npy_cpu_baseline_list(void);
+/*
+ * Return a new Python list containing the dispatched set of additional optimizations
+ * that are supported by the compiler and platform according to the values
+ * specified for the command argument '--cpu-dispatch'.
+ *
+ * This function is mainly used to implement umath's attribute '__cpu_dispatch__',
+ * and the items are sorted from the lowest to highest interest.
+ *
+ * For example, according to the default build configuration and assuming the compiler
+ * supports all the involved optimizations, the returned list should be equivalent to:
+ *
+ * On x86: ['SSE3', 'SSSE3', 'SSE41', 'POPCNT', 'SSE42', 'AVX', 'F16C', 'FMA3', 'AVX2', 'AVX512F', ...]
+ * On x64: ['SSSE3', 'SSE41', 'POPCNT', 'SSE42', 'AVX', 'F16C', 'FMA3', 'AVX2', 'AVX512F', ...]
+ * On armhf: ['NEON', 'NEON_FP16', 'NEON_VFPV4', 'ASIMD', 'ASIMDHP', 'ASIMDDP', 'ASIMDFHM']
+ * On aarch64: ['ASIMDHP', 'ASIMDDP', 'ASIMDFHM']
+ * On ppc64:  ['VSX', 'VSX2', 'VSX3']
+ * On ppc64le: ['VSX3']
+ * On any other arch or if the optimization is disabled: []
+ */
+NPY_VISIBILITY_HIDDEN PyObject *
+npy_cpu_dispatch_list(void);
 
 #ifdef __cplusplus
 }
diff --git a/numpy/core/src/multiarray/buffer.c b/numpy/core/src/multiarray/buffer.c
index 232176011..8b482dc03 100644
--- a/numpy/core/src/multiarray/buffer.c
+++ b/numpy/core/src/multiarray/buffer.c
@@ -64,7 +64,7 @@ _append_char(_tmp_string_t *s, char c)
         char *p;
         size_t to_alloc = (s->allocated == 0) ? INIT_SIZE : (2 * s->allocated);
 
-        p = realloc(s->s, to_alloc);
+        p = PyObject_Realloc(s->s, to_alloc);
         if (p == NULL) {
             PyErr_SetString(PyExc_MemoryError, "memory allocation failed");
             return -1;
@@ -135,12 +135,25 @@ fail:
  * AND, the descr element size is a multiple of the alignment,
  * AND, the array data is positioned to alignment granularity.
  */
-static int
+static NPY_INLINE int
 _is_natively_aligned_at(PyArray_Descr *descr,
                         PyArrayObject *arr, Py_ssize_t offset)
 {
     int k;
 
+    if (NPY_LIKELY(descr == PyArray_DESCR(arr))) {
+        /*
+         * If the descriptor is the array's descriptor we can assume the
+         * array's alignment is correct.
+         */
+        assert(offset == 0);
+        if (PyArray_ISALIGNED(arr)) {
+            assert(descr->elsize % descr->alignment == 0);
+            return 1;
+        }
+        return 0;
+    }
+
     if ((Py_ssize_t)(PyArray_DATA(arr)) % descr->alignment != 0) {
         return 0;
     }
@@ -297,8 +310,6 @@ _buffer_format_string(PyArray_Descr *descr, _tmp_string_t *str,
                 descr->type_num == NPY_ULONGLONG);
         }
 
-        *offset += descr->elsize;
-
         if (PyArray_IsScalar(obj, Generic)) {
             /* scalars are always natively aligned */
             is_natively_aligned = 1;
@@ -308,6 +319,8 @@ _buffer_format_string(PyArray_Descr *descr, _tmp_string_t *str,
                                               (PyArrayObject*)obj, *offset);
         }
 
+        *offset += descr->elsize;
+
         if (descr->byteorder == '=' && is_natively_aligned) {
             /* Prefer native types, to cater for Cython */
             is_standard_size = 0;
@@ -445,49 +458,22 @@ static PyObject *_buffer_info_cache = NULL;
 static _buffer_info_t*
 _buffer_info_new(PyObject *obj)
 {
+    /*
+     * Note that the buffer info is cached as PyLongObjects making them appear
+     * like unreachable lost memory to valgrind.
+     */
     _buffer_info_t *info;
     _tmp_string_t fmt = {NULL, 0, 0};
     int k;
     PyArray_Descr *descr = NULL;
     int err = 0;
 
-    /*
-     * Note that the buffer info is cached as pyints making them appear like
-     * unreachable lost memory to valgrind.
-     */
-    info = malloc(sizeof(_buffer_info_t));
-    if (info == NULL) {
-        PyErr_NoMemory();
-        goto fail;
-    }
-
-    if (PyArray_IsScalar(obj, Datetime) || PyArray_IsScalar(obj, Timedelta)) {
-        /*
-         * Special case datetime64 scalars to remain backward compatible.
-         * This will change in a future version.
-         * Note arrays of datetime64 and structured arrays with datetime64
-         * fields will not hit this code path and are currently unsupported
-         * in _buffer_format_string.
-         */
-        if (_append_char(&fmt, 'B') < 0) {
-            goto fail;
-        }
-        if (_append_char(&fmt, '\0') < 0) {
-            goto fail;
-        }
-        info->ndim = 1;
-        info->shape = malloc(sizeof(Py_ssize_t) * 2);
-        if (info->shape == NULL) {
+    if (PyArray_IsScalar(obj, Void)) {
+        info = PyObject_Malloc(sizeof(_buffer_info_t));
+        if (info == NULL) {
             PyErr_NoMemory();
             goto fail;
         }
-        info->strides = info->shape + info->ndim;
-        info->shape[0] = 8;
-        info->strides[0] = 1;
-        info->format = fmt.s;
-        return info;
-    }
-    else if (PyArray_IsScalar(obj, Generic)) {
         descr = PyArray_DescrFromScalar(obj);
         if (descr == NULL) {
             goto fail;
@@ -497,8 +483,16 @@ _buffer_info_new(PyObject *obj)
         info->strides = NULL;
     }
     else {
+        assert(PyArray_Check(obj));
         PyArrayObject * arr = (PyArrayObject *)obj;
         descr = PyArray_DESCR(arr);
+
+        info = PyObject_Malloc(sizeof(_buffer_info_t) +
+                               sizeof(Py_ssize_t) * PyArray_NDIM(arr) * 2);
+        if (info == NULL) {
+            PyErr_NoMemory();
+            goto fail;
+        }
         /* Fill in shape and strides */
         info->ndim = PyArray_NDIM(arr);
 
@@ -507,11 +501,8 @@ _buffer_info_new(PyObject *obj)
             info->strides = NULL;
         }
         else {
-            info->shape = malloc(sizeof(Py_ssize_t) * PyArray_NDIM(arr) * 2 + 1);
-            if (info->shape == NULL) {
-                PyErr_NoMemory();
-                goto fail;
-            }
+            info->shape = (npy_intp *)((char *)info + sizeof(_buffer_info_t));
+            assert((size_t)info->shape % sizeof(npy_intp) == 0);
             info->strides = info->shape + PyArray_NDIM(arr);
             for (k = 0; k < PyArray_NDIM(arr); ++k) {
                 info->shape[k] = PyArray_DIMS(arr)[k];
@@ -525,11 +516,9 @@ _buffer_info_new(PyObject *obj)
     err = _buffer_format_string(descr, &fmt, obj, NULL, NULL);
     Py_DECREF(descr);
     if (err != 0) {
-        free(info->shape);
         goto fail;
     }
     if (_append_char(&fmt, '\0') < 0) {
-        free(info->shape);
         goto fail;
     }
     info->format = fmt.s;
@@ -537,8 +526,8 @@ _buffer_info_new(PyObject *obj)
     return info;
 
 fail:
-    free(fmt.s);
-    free(info);
+    PyObject_Free(fmt.s);
+    PyObject_Free(info);
     return NULL;
 }
 
@@ -569,12 +558,9 @@ static void
 _buffer_info_free(_buffer_info_t *info)
 {
     if (info->format) {
-        free(info->format);
-    }
-    if (info->shape) {
-        free(info->shape);
+        PyObject_Free(info->format);
     }
-    free(info);
+    PyObject_Free(info);
 }
 
 /* Get buffer info from the global dictionary */
diff --git a/numpy/core/src/multiarray/convert_datatype.c b/numpy/core/src/multiarray/convert_datatype.c
index 0390c92fc..7bd088677 100644
--- a/numpy/core/src/multiarray/convert_datatype.c
+++ b/numpy/core/src/multiarray/convert_datatype.c
@@ -1419,8 +1419,7 @@ PyArray_PromoteTypes(PyArray_Descr *type1, PyArray_Descr *type2)
             }
             break;
         case NPY_TIMEDELTA:
-            if (PyTypeNum_ISINTEGER(type_num1) ||
-                            PyTypeNum_ISFLOAT(type_num1)) {
+            if (PyTypeNum_ISSIGNED(type_num1)) {
                 return ensure_dtype_nbo(type2);
             }
             break;
diff --git a/numpy/core/src/multiarray/ctors.c b/numpy/core/src/multiarray/ctors.c
index 2c2c457ac..ccebe9da6 100644
--- a/numpy/core/src/multiarray/ctors.c
+++ b/numpy/core/src/multiarray/ctors.c
@@ -3580,7 +3580,7 @@ array_from_text(PyArray_Descr *dtype, npy_intp num, char const *sep, size_t *nre
     npy_intp i;
     char *dptr, *clean_sep, *tmp;
     int err = 0;
-    int stop_reading_flag;  /* -1 indicates end reached; -2 a parsing error */
+    int stop_reading_flag = 0;  /* -1 means end reached; -2 a parsing error */
     npy_intp thisbuf = 0;
     npy_intp size;
     npy_intp bytes, totalbytes;
diff --git a/numpy/core/src/multiarray/descriptor.c b/numpy/core/src/multiarray/descriptor.c
index 5b4a94aa4..4e37b9628 100644
--- a/numpy/core/src/multiarray/descriptor.c
+++ b/numpy/core/src/multiarray/descriptor.c
@@ -1678,14 +1678,14 @@ _convert_from_str(PyObject *obj, int align)
         }
 
         /* Check for a deprecated Numeric-style typecode */
-        char *dep_tps[] = {"Bool", "Complex", "Float", "Int",
-                           "Object0", "String0", "Timedelta64",
-                           "Unicode0", "UInt", "Void0"};
+        /* `Uint` has deliberately weird uppercasing */
+        char *dep_tps[] = {"Bytes", "Datetime64", "Str", "Uint"};
         int ndep_tps = sizeof(dep_tps) / sizeof(dep_tps[0]);
         for (int i = 0; i < ndep_tps; ++i) {
             char *dep_tp = dep_tps[i];
 
             if (strncmp(type, dep_tp, strlen(dep_tp)) == 0) {
+                /* Deprecated 2020-06-09, NumPy 1.20 */
                 if (DEPRECATE("Numeric-style type codes are "
                               "deprecated and will result in "
                               "an error in the future.") < 0) {
diff --git a/numpy/core/src/multiarray/einsum.c.src b/numpy/core/src/multiarray/einsum.c.src
index b914e5bb3..2538e05c6 100644
--- a/numpy/core/src/multiarray/einsum.c.src
+++ b/numpy/core/src/multiarray/einsum.c.src
@@ -31,9 +31,6 @@
 #define EINSUM_USE_SSE1 0
 #endif
 
-/*
- * TODO: Only some SSE2 for float64 is implemented.
- */
 #ifdef NPY_HAVE_SSE2_INTRINSICS
 #define EINSUM_USE_SSE2 1
 #else
@@ -276,6 +273,8 @@ static void
 
 #if EINSUM_USE_SSE1 && @float32@
     __m128 a, b;
+#elif EINSUM_USE_SSE2 && @float64@
+    __m128d a, b;
 #endif
 
     NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_two (%d)\n",
@@ -319,6 +318,29 @@ finish_after_unrolled_loop:
         /* Finish off the loop */
         goto finish_after_unrolled_loop;
     }
+#elif EINSUM_USE_SSE2 && @float64@
+    /* Use aligned instructions if possible */
+    if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1) &&
+        EINSUM_IS_SSE_ALIGNED(data_out)) {
+        /* Unroll the loop by 8 */
+        while (count >= 8) {
+            count -= 8;
+
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+            a = _mm_mul_pd(_mm_load_pd(data0+@i@), _mm_load_pd(data1+@i@));
+            b = _mm_add_pd(a, _mm_load_pd(data_out+@i@));
+            _mm_store_pd(data_out+@i@, b);
+/**end repeat2**/
+            data0 += 8;
+            data1 += 8;
+            data_out += 8;
+        }
+
+        /* Finish off the loop */
+        goto finish_after_unrolled_loop;
+    }
 #endif
 
     /* Unroll the loop by 8 */
@@ -333,6 +355,14 @@ finish_after_unrolled_loop:
         b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@));
         _mm_storeu_ps(data_out+@i@, b);
 /**end repeat2**/
+#elif EINSUM_USE_SSE2 && @float64@
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+        a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), _mm_loadu_pd(data1+@i@));
+        b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@));
+        _mm_storeu_pd(data_out+@i@, b);
+/**end repeat2**/
 #else
 /**begin repeat2
  * #i = 0, 1, 2, 3, 4, 5, 6, 7#
@@ -491,6 +521,8 @@ static void
 
 #if EINSUM_USE_SSE1 && @float32@
     __m128 a, b, value1_sse;
+#elif EINSUM_USE_SSE2 && @float64@
+    __m128d a, b, value1_sse;
 #endif
 
     NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outcontig_two (%d)\n",
@@ -534,6 +566,29 @@ finish_after_unrolled_loop:
         /* Finish off the loop */
         goto finish_after_unrolled_loop;
     }
+#elif EINSUM_USE_SSE2 && @float64@
+    value1_sse = _mm_set1_pd(value1);
+
+    /* Use aligned instructions if possible */
+    if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data_out)) {
+        /* Unroll the loop by 8 */
+        while (count >= 8) {
+            count -= 8;
+
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+            a = _mm_mul_pd(_mm_load_pd(data0+@i@), value1_sse);
+            b = _mm_add_pd(a, _mm_load_pd(data_out+@i@));
+            _mm_store_pd(data_out+@i@, b);
+/**end repeat2**/
+            data0 += 8;
+            data_out += 8;
+        }
+
+        /* Finish off the loop */
+        goto finish_after_unrolled_loop;
+    }
 #endif
 
     /* Unroll the loop by 8 */
@@ -548,6 +603,14 @@ finish_after_unrolled_loop:
         b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@));
         _mm_storeu_ps(data_out+@i@, b);
 /**end repeat2**/
+#elif EINSUM_USE_SSE2 && @float64@
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+        a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), value1_sse);
+        b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@));
+        _mm_storeu_pd(data_out+@i@, b);
+/**end repeat2**/
 #else
 /**begin repeat2
  * #i = 0, 1, 2, 3, 4, 5, 6, 7#
@@ -735,6 +798,8 @@ static void
 
 #if EINSUM_USE_SSE1 && @float32@
     __m128 a, accum_sse = _mm_setzero_ps();
+#elif EINSUM_USE_SSE2 && @float64@
+    __m128d a, accum_sse = _mm_setzero_pd();
 #endif
 
     NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outstride0_two (%d)\n",
@@ -772,15 +837,38 @@ finish_after_unrolled_loop:
 /**end repeat2**/
             data1 += 8;
         }
-
-#if EINSUM_USE_SSE1 && @float32@
         /* Add the four SSE values and put in accum */
         a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
         accum_sse = _mm_add_ps(a, accum_sse);
         a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
         accum_sse = _mm_add_ps(a, accum_sse);
         _mm_store_ss(&accum, accum_sse);
-#endif
+
+        /* Finish off the loop */
+        goto finish_after_unrolled_loop;
+    }
+#elif EINSUM_USE_SSE2 && @float64@
+    /* Use aligned instructions if possible */
+    if (EINSUM_IS_SSE_ALIGNED(data1)) {
+        /* Unroll the loop by 8 */
+        while (count >= 8) {
+            count -= 8;
+
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+            /*
+             * NOTE: This accumulation changes the order, so will likely
+             *       produce slightly different results.
+             */
+            accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data1+@i@));
+/**end repeat2**/
+            data1 += 8;
+        }
+        /* Add the two SSE2 values and put in accum */
+        a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
+        accum_sse = _mm_add_pd(a, accum_sse);
+        _mm_store_sd(&accum, accum_sse);
 
         /* Finish off the loop */
         goto finish_after_unrolled_loop;
@@ -801,6 +889,16 @@ finish_after_unrolled_loop:
          */
         accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data1+@i@));
 /**end repeat2**/
+#elif EINSUM_USE_SSE2 && @float64@
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+        /*
+         * NOTE: This accumulation changes the order, so will likely
+         *       produce slightly different results.
+         */
+        accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data1+@i@));
+/**end repeat2**/
 #else
 /**begin repeat2
  * #i = 0, 1, 2, 3, 4, 5, 6, 7#
@@ -818,6 +916,11 @@ finish_after_unrolled_loop:
     a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
     accum_sse = _mm_add_ps(a, accum_sse);
     _mm_store_ss(&accum, accum_sse);
+#elif EINSUM_USE_SSE2 && @float64@
+    /* Add the two SSE2 values and put in accum */
+    a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
+    accum_sse = _mm_add_pd(a, accum_sse);
+    _mm_store_sd(&accum, accum_sse);
 #endif
 
     /* Finish off the loop */
@@ -834,6 +937,8 @@ static void
 
 #if EINSUM_USE_SSE1 && @float32@
     __m128 a, accum_sse = _mm_setzero_ps();
+#elif EINSUM_USE_SSE2 && @float64@
+    __m128d a, accum_sse = _mm_setzero_pd();
 #endif
 
     NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outstride0_two (%d)\n",
@@ -871,16 +976,37 @@ finish_after_unrolled_loop:
 /**end repeat2**/
             data0 += 8;
         }
-
-#if EINSUM_USE_SSE1 && @float32@
         /* Add the four SSE values and put in accum */
         a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
         accum_sse = _mm_add_ps(a, accum_sse);
         a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
         accum_sse = _mm_add_ps(a, accum_sse);
         _mm_store_ss(&accum, accum_sse);
-#endif
+        /* Finish off the loop */
+        goto finish_after_unrolled_loop;
+    }
+#elif EINSUM_USE_SSE2 && @float64@
+    /* Use aligned instructions if possible */
+    if (EINSUM_IS_SSE_ALIGNED(data0)) {
+        /* Unroll the loop by 8 */
+        while (count >= 8) {
+            count -= 8;
 
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+            /*
+             * NOTE: This accumulation changes the order, so will likely
+             *       produce slightly different results.
+             */
+            accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+@i@));
+/**end repeat2**/
+            data0 += 8;
+        }
+        /* Add the two SSE2 values and put in accum */
+        a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
+        accum_sse = _mm_add_pd(a, accum_sse);
+        _mm_store_sd(&accum, accum_sse);
         /* Finish off the loop */
         goto finish_after_unrolled_loop;
     }
@@ -900,6 +1026,16 @@ finish_after_unrolled_loop:
          */
         accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@));
 /**end repeat2**/
+#elif EINSUM_USE_SSE2 && @float64@
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+        /*
+         * NOTE: This accumulation changes the order, so will likely
+         *       produce slightly different results.
+         */
+        accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+@i@));
+/**end repeat2**/
 #else
 /**begin repeat2
  * #i = 0, 1, 2, 3, 4, 5, 6, 7#
@@ -917,6 +1053,11 @@ finish_after_unrolled_loop:
     a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
     accum_sse = _mm_add_ps(a, accum_sse);
     _mm_store_ss(&accum, accum_sse);
+#elif EINSUM_USE_SSE2 && @float64@
+    /* Add the two SSE2 values and put in accum */
+    a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
+    accum_sse = _mm_add_pd(a, accum_sse);
+    _mm_store_sd(&accum, accum_sse);
 #endif
 
     /* Finish off the loop */
diff --git a/numpy/core/src/multiarray/mapping.c b/numpy/core/src/multiarray/mapping.c
index 7aefbfc38..f73cb48d9 100644
--- a/numpy/core/src/multiarray/mapping.c
+++ b/numpy/core/src/multiarray/mapping.c
@@ -2480,8 +2480,6 @@ PyArray_MapIterCheckIndices(PyArrayMapIterObject *mit)
     int i;
     NPY_BEGIN_THREADS_DEF;
 
-    intp_type = PyArray_DescrFromType(NPY_INTP);
-
     if (NpyIter_GetIterSize(mit->outer) == 0) {
         /*
          * When the outer iteration is empty, the indices broadcast to an
@@ -2493,6 +2491,8 @@ PyArray_MapIterCheckIndices(PyArrayMapIterObject *mit)
         return 0;
     }
 
+    intp_type = PyArray_DescrFromType(NPY_INTP);
+
     NPY_BEGIN_THREADS;
 
     for (i=0; i < mit->numiter; i++) {
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index 84c22ba65..4190c53bd 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -4542,6 +4542,26 @@ PyMODINIT_FUNC PyInit__multiarray_umath(void) {
     }
     Py_DECREF(s);
 
+    s = npy_cpu_baseline_list();
+    if (s == NULL) {
+        goto err;
+    }
+    if (PyDict_SetItemString(d, "__cpu_baseline__", s) < 0) {
+        Py_DECREF(s);
+        goto err;
+    }
+    Py_DECREF(s);
+
+    s = npy_cpu_dispatch_list();
+    if (s == NULL) {
+        goto err;
+    }
+    if (PyDict_SetItemString(d, "__cpu_dispatch__", s) < 0) {
+        Py_DECREF(s);
+        goto err;
+    }
+    Py_DECREF(s);
+
     s = NpyCapsule_FromVoidPtr((void *)_datetime_strings, NULL);
     if (s == NULL) {
         goto err;
diff --git a/numpy/core/src/multiarray/scalarapi.c b/numpy/core/src/multiarray/scalarapi.c
index f3c440dc6..154f4d637 100644
--- a/numpy/core/src/multiarray/scalarapi.c
+++ b/numpy/core/src/multiarray/scalarapi.c
@@ -286,14 +286,10 @@ PyArray_CastScalarDirect(PyObject *scalar, PyArray_Descr *indescr,
 NPY_NO_EXPORT PyObject *
 PyArray_FromScalar(PyObject *scalar, PyArray_Descr *outcode)
 {
-    PyArray_Descr *typecode;
-    PyArrayObject *r;
-    char *memptr;
-    PyObject *ret;
-
     /* convert to 0-dim array of scalar typecode */
-    typecode = PyArray_DescrFromScalar(scalar);
+    PyArray_Descr *typecode = PyArray_DescrFromScalar(scalar);
     if (typecode == NULL) {
+        Py_XDECREF(outcode);
         return NULL;
     }
     if ((typecode->type_num == NPY_VOID) &&
@@ -307,49 +303,53 @@ PyArray_FromScalar(PyObject *scalar, PyArray_Descr *outcode)
                 NULL, (PyObject *)scalar);
     }
 
-    /* Need to INCREF typecode because PyArray_NewFromDescr steals a
-     * reference below and we still need to access typecode afterwards. */
-    Py_INCREF(typecode);
-    r = (PyArrayObject *)PyArray_NewFromDescr(&PyArray_Type,
+    PyArrayObject *r = (PyArrayObject *)PyArray_NewFromDescr(&PyArray_Type,
             typecode,
             0, NULL,
             NULL, NULL, 0, NULL);
-    if (r==NULL) {
-        Py_DECREF(typecode); Py_XDECREF(outcode);
+    if (r == NULL) {
+        Py_XDECREF(outcode);
         return NULL;
     }
+    /* the dtype used by the array may be different to the one requested */
+    typecode = PyArray_DESCR(r);
     if (PyDataType_FLAGCHK(typecode, NPY_USE_SETITEM)) {
         if (typecode->f->setitem(scalar, PyArray_DATA(r), r) < 0) {
-            Py_DECREF(typecode); Py_XDECREF(outcode); Py_DECREF(r);
+            Py_DECREF(r);
+            Py_XDECREF(outcode);
             return NULL;
         }
-        goto finish;
     }
+    else {
+        char *memptr = scalar_value(scalar, typecode);
 
-    memptr = scalar_value(scalar, typecode);
-
-    memcpy(PyArray_DATA(r), memptr, PyArray_ITEMSIZE(r));
-    if (PyDataType_FLAGCHK(typecode, NPY_ITEM_HASOBJECT)) {
-        /* Need to INCREF just the PyObject portion */
-        PyArray_Item_INCREF(memptr, typecode);
+        memcpy(PyArray_DATA(r), memptr, PyArray_ITEMSIZE(r));
+        if (PyDataType_FLAGCHK(typecode, NPY_ITEM_HASOBJECT)) {
+            /* Need to INCREF just the PyObject portion */
+            PyArray_Item_INCREF(memptr, typecode);
+        }
     }
 
-finish:
     if (outcode == NULL) {
-        Py_DECREF(typecode);
         return (PyObject *)r;
     }
     if (PyArray_EquivTypes(outcode, typecode)) {
         if (!PyTypeNum_ISEXTENDED(typecode->type_num)
                 || (outcode->elsize == typecode->elsize)) {
-            Py_DECREF(typecode); Py_DECREF(outcode);
+            /*
+             * Since the type is equivalent, and we haven't handed the array
+             * to anyone yet, let's fix the dtype to be what was requested,
+             * even if it is equivalent to what was passed in.
+             */
+            Py_SETREF(((PyArrayObject_fields *)r)->descr, outcode);
+
             return (PyObject *)r;
         }
     }
 
     /* cast if necessary to desired output typecode */
-    ret = PyArray_CastToType((PyArrayObject *)r, outcode, 0);
-    Py_DECREF(typecode); Py_DECREF(r);
+    PyObject *ret = PyArray_CastToType(r, outcode, 0);
+    Py_DECREF(r);
     return ret;
 }
 
diff --git a/numpy/core/src/umath/_umath_tests.c.src b/numpy/core/src/umath/_umath_tests.c.src
index abc8d78c4..d08aabd64 100644
--- a/numpy/core/src/umath/_umath_tests.c.src
+++ b/numpy/core/src/umath/_umath_tests.c.src
@@ -576,6 +576,65 @@ fail:
     return NULL;
 }
 
+// Testing the utilities of the CPU dispatcher
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "_umath_tests.dispatch.h"
+#endif
+NPY_CPU_DISPATCH_DECLARE(extern const char *_umath_tests_dispatch_var)
+NPY_CPU_DISPATCH_DECLARE(const char *_umath_tests_dispatch_func, (void))
+NPY_CPU_DISPATCH_DECLARE(void _umath_tests_dispatch_attach, (PyObject *list))
+
+/*
+ * Report what the CPU dispatcher resolves to, packed into a dict.
+ *
+ * Keys "func", "var", "func_xb" and "var_xb" hold the strings assigned via
+ * NPY_CPU_DISPATCH_CALL / NPY_CPU_DISPATCH_CALL_XB (the "_xb" values are
+ * initialized to "nobase" beforehand); key "all" holds the list handed to
+ * _umath_tests_dispatch_attach through NPY_CPU_DISPATCH_CALL_ALL.
+ *
+ * Returns a new reference, or NULL with an exception set on failure.
+ */
+static PyObject *
+UMath_Tests_test_dispatch(PyObject *NPY_UNUSED(dummy), PyObject *NPY_UNUSED(dummy2))
+{
+    const char *highest_func, *highest_var;
+    NPY_CPU_DISPATCH_CALL(highest_func = _umath_tests_dispatch_func, ())
+    NPY_CPU_DISPATCH_CALL(highest_var  = _umath_tests_dispatch_var)
+    const char *highest_func_xb = "nobase", *highest_var_xb = "nobase";
+    NPY_CPU_DISPATCH_CALL_XB(highest_func_xb = _umath_tests_dispatch_func, ())
+    NPY_CPU_DISPATCH_CALL_XB(highest_var_xb  = _umath_tests_dispatch_var)
+
+    PyObject *dict = PyDict_New(), *item;
+    if (dict == NULL) {
+        return NULL;
+    }
+    /**begin repeat
+     * #str = func, var, func_xb, var_xb#
+    */
+    item = PyUnicode_FromString(highest_@str@);
+    if (item == NULL || PyDict_SetItemString(dict, "@str@", item) < 0) {
+        goto err;
+    }
+    /* PyDict_SetItemString() does not steal `item`, release our reference */
+    Py_DECREF(item);
+    /**end repeat**/
+    item = PyList_New(0);
+    if (item == NULL || PyDict_SetItemString(dict, "all", item) < 0) {
+        goto err;
+    }
+    NPY_CPU_DISPATCH_CALL_ALL(_umath_tests_dispatch_attach, (item))
+    /* the dict now owns the list; clear so `err` cannot decref it again */
+    Py_CLEAR(item);
+    if (PyErr_Occurred()) {
+        goto err;
+    }
+    return dict;
+err:
+    Py_XDECREF(item);
+    Py_DECREF(dict);
+    return NULL;
+}
+
 static PyMethodDef UMath_TestsMethods[] = {
     {"test_signature",  UMath_Tests_test_signature, METH_VARARGS,
      "Test signature parsing of ufunc. \n"
@@ -583,6 +642,7 @@ static PyMethodDef UMath_TestsMethods[] = {
      "If fails, it returns NULL. Otherwise it returns a tuple of ufunc "
      "internals. \n",
      },
+    {"test_dispatch", UMath_Tests_test_dispatch, METH_NOARGS, NULL},
     {NULL, NULL, 0, NULL}        /* Sentinel */
 };
 
@@ -604,6 +664,11 @@ PyMODINIT_FUNC PyInit__umath_tests(void) {
     PyObject *d;
     PyObject *version;
 
+    // Initialize CPU features
+    if (npy_cpu_init() < 0) {
+        return NULL;
+    }
+
     m = PyModule_Create(&moduledef);
     if (m == NULL) {
         return NULL;
@@ -632,6 +697,5 @@ PyMODINIT_FUNC PyInit__umath_tests(void) {
                         "cannot load _umath_tests module.");
         return NULL;
     }
-
     return m;
 }
diff --git a/numpy/core/src/umath/_umath_tests.dispatch.c b/numpy/core/src/umath/_umath_tests.dispatch.c
new file mode 100644
index 000000000..d86a54411
--- /dev/null
+++ b/numpy/core/src/umath/_umath_tests.dispatch.c
@@ -0,0 +1,39 @@
+/**
+ * Testing the utilities of the CPU dispatcher
+ *
+ * @targets $werror baseline
+ * SSE2 SSE41 AVX2
+ * VSX VSX2 VSX3
+ * NEON ASIMD ASIMDHP
+ */
+#include <Python.h>
+#include "npy_cpu_dispatch.h"
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "_umath_tests.dispatch.h"
+#endif
+
+NPY_CPU_DISPATCH_DECLARE(const char *_umath_tests_dispatch_func, (void))
+NPY_CPU_DISPATCH_DECLARE(extern const char *_umath_tests_dispatch_var)
+NPY_CPU_DISPATCH_DECLARE(void _umath_tests_dispatch_attach, (PyObject *list))
+
+const char *NPY_CPU_DISPATCH_CURFX(_umath_tests_dispatch_var) = NPY_TOSTRING(NPY_CPU_DISPATCH_CURFX(var));
+const char *NPY_CPU_DISPATCH_CURFX(_umath_tests_dispatch_func)(void)
+{
+    static const char *current = NPY_TOSTRING(NPY_CPU_DISPATCH_CURFX(func));
+    return current;
+}
+
+/*
+ * Append the stringified NPY_CPU_DISPATCH_CURFX(func) name to `list`.
+ * Nothing is returned; a failure leaves the Python exception set.
+ */
+void NPY_CPU_DISPATCH_CURFX(_umath_tests_dispatch_attach)(PyObject *list)
+{
+    PyObject *item = PyUnicode_FromString(NPY_TOSTRING(NPY_CPU_DISPATCH_CURFX(func)));
+    if (item) {
+        PyList_Append(list, item);
+        /* PyList_Append() takes its own reference, release ours */
+        Py_DECREF(item);
+    }
+}
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 48e89915c..e6414e29e 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -2698,17 +2698,17 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
 
         /* process elements using glibc for large elements */
         if (my_trig_op == npy_compute_cos) {
-            for (int ii = 0; iglibc_mask != 0; ii++) {
+            for (int ii = 0, jj = 0; iglibc_mask != 0; ii++, jj += stride) {
                 if (iglibc_mask & 0x01) {
-                    op[ii] = npy_cosf(ip[ii]);
+                    op[ii] = npy_cosf(ip[jj]);
                 }
                 iglibc_mask  = iglibc_mask >> 1;
             }
         }
         else {
-            for (int ii = 0; iglibc_mask != 0; ii++) {
+            for (int ii = 0, jj = 0; iglibc_mask != 0; ii++, jj += stride) {
                 if (iglibc_mask & 0x01) {
-                    op[ii] = npy_sinf(ip[ii]);
+                    op[ii] = npy_sinf(ip[jj]);
                 }
                 iglibc_mask  = iglibc_mask >> 1;
             }
diff --git a/numpy/core/tests/test_api.py b/numpy/core/tests/test_api.py
index 2600d409a..5d079d9d2 100644
--- a/numpy/core/tests/test_api.py
+++ b/numpy/core/tests/test_api.py
@@ -291,7 +291,7 @@ def test_array_astype_warning(t):
 
 @pytest.mark.parametrize(["dtype", "out_dtype"],
         [(np.bytes_, np.bool_),
-         (np.unicode, np.bool_),
+         (np.unicode_, np.bool_),
          (np.dtype("S10,S9"), np.dtype("?,?"))])
 def test_string_to_boolean_cast(dtype, out_dtype):
     """
@@ -305,7 +305,7 @@ def test_string_to_boolean_cast(dtype, out_dtype):
 
 @pytest.mark.parametrize(["dtype", "out_dtype"],
         [(np.bytes_, np.bool_),
-         (np.unicode, np.bool_),
+         (np.unicode_, np.bool_),
          (np.dtype("S10,S9"), np.dtype("?,?"))])
 def test_string_to_boolean_cast_errors(dtype, out_dtype):
     """
diff --git a/numpy/core/tests/test_array_coercion.py b/numpy/core/tests/test_array_coercion.py
new file mode 100644
index 000000000..b8d4b5cdf
--- /dev/null
+++ b/numpy/core/tests/test_array_coercion.py
@@ -0,0 +1,577 @@
+"""
+Tests for array coercion, mainly through testing `np.array` results directly.
+Note that other such tests exist e.g. in `test_api.py` and many corner-cases
+are tested (sometimes indirectly) elsewhere.
+"""
+
+import pytest
+from pytest import param
+
+from itertools import product
+
+import numpy as np
+from numpy.core._rational_tests import rational
+
+from numpy.testing import (
+    assert_array_equal, assert_warns, IS_PYPY)
+
+
+def arraylikes():
+    """
+    Generator for functions converting an array into various array-likes.
+    Yields the identity, an ndarray subclass view, ``__array__``,
+    ``memoryview``, ``__array_interface__`` and ``__array_struct__`` wrappers.
+    """
+    # base array:
+    def ndarray(a):
+        return a
+
+    yield param(ndarray, id="ndarray")
+
+    # subclass:
+    class MyArr(np.ndarray):
+        pass
+
+    def subclass(a):
+        return a.view(MyArr)
+
+    yield subclass
+
+    # __array__ protocol
+    class ArrayDunder:
+        def __init__(self, a):
+            self.a = a
+
+        def __array__(self, dtype=None):
+            return self.a
+
+    yield param(ArrayDunder, id="__array__")
+
+    # memory-view
+    yield param(memoryview, id="memoryview")
+
+    # Array-interface
+    class ArrayInterface:
+        def __init__(self, a):
+            self.a = a  # need to hold on to keep interface valid
+            self.__array_interface__ = a.__array_interface__
+
+    yield param(ArrayInterface, id="__array_interface__")
+
+    # Array-Struct
+    class ArrayStruct:
+        def __init__(self, a):
+            self.a = a  # need to hold on to keep struct valid
+            self.__array_struct__ = a.__array_struct__
+
+    yield param(ArrayStruct, id="__array_struct__")
+
+
+def scalar_instances(times=True, extended_precision=True, user_dtype=True):
+    # Hard-coded list of scalar instances.
+    # Floats:
+    yield param(np.sqrt(np.float16(5)), id="float16")
+    yield param(np.sqrt(np.float32(5)), id="float32")
+    yield param(np.sqrt(np.float64(5)), id="float64")
+    if extended_precision:
+        yield param(np.sqrt(np.longdouble(5)), id="longdouble")
+
+    # Complex:
+    yield param(np.sqrt(np.complex64(2+3j)), id="complex64")
+    yield param(np.sqrt(np.complex128(2+3j)), id="complex128")
+    if extended_precision:
+        yield param(np.sqrt(np.longcomplex(2+3j)), id="clongdouble")
+
+    # Bool:
+    # XFAIL: Bool should be added, but has some bad properties when it
+    # comes to strings, see also gh-9875
+    # yield param(np.bool_(0), id="bool")
+
+    # Integers:
+    yield param(np.int8(2), id="int8")
+    yield param(np.int16(2), id="int16")
+    yield param(np.int32(2), id="int32")
+    yield param(np.int64(2), id="int64")
+
+    yield param(np.uint8(2), id="uint8")
+    yield param(np.uint16(2), id="uint16")
+    yield param(np.uint32(2), id="uint32")
+    yield param(np.uint64(2), id="uint64")
+
+    # Rational:
+    if user_dtype:
+        yield param(rational(1, 2), id="rational")
+
+    # Cannot create a structured void scalar directly:
+    structured = np.array([(1, 3)], "i,i")[0]
+    assert isinstance(structured, np.void)
+    assert structured.dtype == np.dtype("i,i")
+    yield param(structured, id="structured")
+
+    if times:
+        # Datetimes and timedelta
+        yield param(np.timedelta64(2), id="timedelta64[generic]")
+        yield param(np.timedelta64(23, "s"), id="timedelta64[s]")
+        yield param(np.timedelta64("NaT", "s"), id="timedelta64[s](NaT)")
+
+        yield param(np.datetime64("NaT"), id="datetime64[generic](NaT)")
+        yield param(np.datetime64("2020-06-07 12:43", "ms"), id="datetime64[ms]")
+
+    # Strings and unstructured void:
+    yield param(np.bytes_(b"1234"), id="bytes")
+    yield param(np.unicode_("2345"), id="unicode")
+    yield param(np.void(b"4321"), id="unstructured_void")
+
+
+def is_parametric_dtype(dtype):
+    """Returns True if the dtype is a parametric legacy dtype (itemsize
+    is 0, or a datetime without units)
+    """
+    if dtype.itemsize == 0:
+        return True
+    if issubclass(dtype.type, (np.datetime64, np.timedelta64)):
+        if dtype.name.endswith("64"):
+            # Generic time units
+            return True
+    return False
+
+
+class TestStringDiscovery:
+    @pytest.mark.parametrize("obj",
+            [object(), 1.2, 10**43, None, "string"],
+            ids=["object", "1.2", "10**43", "None", "string"])
+    def test_basic_stringlength(self, obj):
+        if not isinstance(obj, (str, int)):
+            pytest.xfail(
+                "The Single object (first assert) uses a different branch "
+                "and thus gives a different result (either wrong or longer "
+                "string than normally discovered).")
+
+        length = len(str(obj))
+        expected = np.dtype(f"S{length}")
+
+        assert np.array(obj, dtype="S").dtype == expected
+        assert np.array([obj], dtype="S").dtype == expected
+
+        # A nested array is also discovered correctly
+        arr = np.array(obj, dtype="O")
+        assert np.array(arr, dtype="S").dtype == expected
+
+    @pytest.mark.xfail(reason="Only single array unpacking is supported")
+    @pytest.mark.parametrize("obj",
+            [object(), 1.2, 10**43, None, "string"],
+            ids=["object", "1.2", "10**43", "None", "string"])
+    def test_nested_arrays_stringlength(self, obj):
+        length = len(str(obj))
+        expected = np.dtype(f"S{length}")
+        arr = np.array(obj, dtype="O")
+        assert np.array([arr, arr], dtype="S").dtype == expected
+
+    @pytest.mark.xfail(reason="Only single array unpacking is supported")
+    @pytest.mark.parametrize("arraylike", arraylikes())
+    def test_unpack_first_level(self, arraylike):
+        # We unpack exactly one level of array likes
+        obj = np.array([None])
+        obj[0] = np.array(1.2)
+        # the length of the included item, not of the float dtype
+        length = len(str(obj[0]))
+        expected = np.dtype(f"S{length}")
+
+        obj = arraylike(obj)
+        # casting to string usually calls str(obj)
+        arr = np.array([obj], dtype="S")
+        assert arr.shape == (1, 1)
+        assert arr.dtype == expected
+
+
+class TestScalarDiscovery:
+    def test_void_special_case(self):
+        # Void dtypes with structures discover tuples as elements
+        arr = np.array((1, 2, 3), dtype="i,i,i")
+        assert arr.shape == ()
+        arr = np.array([(1, 2, 3)], dtype="i,i,i")
+        assert arr.shape == (1,)
+
+    def test_char_special_case(self):
+        arr = np.array("string", dtype="c")
+        assert arr.shape == (6,)
+        assert arr.dtype.char == "c"
+        arr = np.array(["string"], dtype="c")
+        assert arr.shape == (1, 6)
+        assert arr.dtype.char == "c"
+
+    def test_char_special_case_deep(self):
+        # Check that the character special case errors correctly if the
+        # array is too deep:
+        nested = ["string"]  # 2 dimensions (due to string being sequence)
+        for i in range(np.MAXDIMS - 2):
+            nested = [nested]
+
+        arr = np.array(nested, dtype='c')
+        assert arr.shape == (1,) * (np.MAXDIMS - 1) + (6,)
+        with pytest.raises(ValueError):
+            np.array([nested], dtype="c")
+
+    def test_unknown_object(self):
+        arr = np.array(object())
+        assert arr.shape == ()
+        assert arr.dtype == np.dtype("O")
+
+    @pytest.mark.parametrize("scalar", scalar_instances())
+    def test_scalar(self, scalar):
+        arr = np.array(scalar)
+        assert arr.shape == ()
+        assert arr.dtype == scalar.dtype
+
+        if type(scalar) is np.bytes_:
+            pytest.xfail("Nested bytes use len(str(scalar)) currently.")
+
+        arr = np.array([[scalar, scalar]])
+        assert arr.shape == (1, 2)
+        assert arr.dtype == scalar.dtype
+
+    # Additionally to string this test also runs into a corner case
+    # with datetime promotion (the difference is the promotion order).
+    @pytest.mark.xfail(reason="Coercion to string is not symmetric")
+    def test_scalar_promotion(self):
+        for sc1, sc2 in product(scalar_instances(), scalar_instances()):
+            sc1, sc2 = sc1.values[0], sc2.values[0]
+            # test all combinations:
+            arr = np.array([sc1, sc2])
+            assert arr.shape == (2,)
+            try:
+                dt1, dt2 = sc1.dtype, sc2.dtype
+                expected_dtype = np.promote_types(dt1, dt2)
+                assert arr.dtype == expected_dtype
+            except TypeError as e:
+                # Will currently always go to object dtype
+                assert arr.dtype == np.dtype("O")
+
+    @pytest.mark.parametrize("scalar", scalar_instances())
+    def test_scalar_coercion(self, scalar):
+        # This tests various scalar coercion paths, mainly for the numerical
+        # types.  It includes some paths not directly related to `np.array`
+        if isinstance(scalar, np.inexact):
+            # Ensure we have a full-precision number if available
+            scalar = type(scalar)((scalar * 2)**0.5)
+
+        if is_parametric_dtype(scalar.dtype) or type(scalar) is rational:
+            # datetime with unit will be named "datetime64[unit]"
+            # Rational generally fails due to a missing cast. In the future
+            # object casts should automatically be defined based on `setitem`.
+            pytest.xfail("0-D object array to a unit-less datetime cast fails")
+
+        # Use casting from object:
+        arr = np.array(scalar, dtype=object).astype(scalar.dtype)
+
+        # Test various ways to create an array containing this scalar:
+        arr1 = np.array(scalar).reshape(1)
+        arr2 = np.array([scalar])
+        arr3 = np.empty(1, dtype=scalar.dtype)
+        arr3[0] = scalar
+        arr4 = np.empty(1, dtype=scalar.dtype)
+        arr4[:] = [scalar]
+        # All of these methods should yield the same results
+        assert_array_equal(arr, arr1)
+        assert_array_equal(arr, arr2)
+        assert_array_equal(arr, arr3)
+        assert_array_equal(arr, arr4)
+
+    @pytest.mark.xfail(IS_PYPY, reason="`int(np.complex128(3))` fails on PyPy")
+    @pytest.mark.filterwarnings("ignore::numpy.ComplexWarning")
+    # After change, can enable times here, and below and it will work,
+    # Right now times are too complex, so map out some details below.
+    @pytest.mark.parametrize("cast_to", scalar_instances(times=False))
+    def test_scalar_coercion_same_as_cast_and_assignment(self, cast_to):
+        """
+        Test that in most cases:
+           * `np.array(scalar, dtype=dtype)`
+           * `np.empty((), dtype=dtype)[()] = scalar`
+           * `np.array(scalar).astype(dtype)`
+        should behave the same.  The only exceptions are parametric dtypes
+        (mainly datetime/timedelta without unit) and void without fields.
+        """
+        dtype = cast_to.dtype  # use to parametrize only the target dtype
+
+        # XFAIL: Some extended precision tests fail, because assigning to
+        #        complex256 will use float(float128). Rational fails currently.
+        for scalar in scalar_instances(
+                times=False, extended_precision=False, user_dtype=False):
+            scalar = scalar.values[0]
+
+            if dtype.type == np.void:
+               if scalar.dtype.fields is not None and dtype.fields is None:
+                    # Here, coercion to "V6" works, but the cast fails.
+                    # Since the types are identical, SETITEM takes care of
+                    # this, but has different rules than the cast.
+                    with pytest.raises(TypeError):
+                        np.array(scalar).astype(dtype)
+                    # XFAIL: np.array(scalar, dtype=dtype)
+                    np.array([scalar], dtype=dtype)
+                    continue
+
+            # The main test, we first try to use casting and if it succeeds
+            # continue below testing that things are the same, otherwise
+            # test that the alternative paths at least also fail.
+            try:
+                cast = np.array(scalar).astype(dtype)
+            except (TypeError, ValueError, RuntimeError):
+                # coercion should also raise (error type may change)
+                with pytest.raises(Exception):
+                    np.array(scalar, dtype=dtype)
+                # assignment should also raise
+                res = np.zeros((), dtype=dtype)
+                with pytest.raises(Exception):
+                    res[()] = scalar
+
+                return
+
+            # Non error path:
+            arr = np.array(scalar, dtype=dtype)
+            assert_array_equal(arr, cast)
+            # assignment behaves the same
+            ass = np.zeros((), dtype=dtype)
+            ass[()] = scalar
+            assert_array_equal(ass, cast)
+
+
+class TestTimeScalars:
+    @pytest.mark.parametrize("dtype", [np.int64, np.float32])
+    @pytest.mark.parametrize("scalar",
+            [param(np.timedelta64("NaT", "s"), id="timedelta64[s](NaT)"),
+             param(np.timedelta64(123, "s"), id="timedelta64[s]"),
+             param(np.datetime64("NaT", "generic"), id="datetime64[generic](NaT)"),
+             param(np.datetime64(1, "D"), id="datetime64[D]")],)
+    @pytest.mark.xfail(
+            reason="This uses int(scalar) or float(scalar) to assign, which "
+                   "fails.  However, casting currently does not fail.")
+    def test_coercion_basic(self, dtype, scalar):
+        arr = np.array(scalar, dtype=dtype)
+        cast = np.array(scalar).astype(dtype)
+        ass = np.ones((), dtype=dtype)
+        ass[()] = scalar  # raises, as would np.array([scalar], dtype=dtype)
+
+        assert_array_equal(arr, cast)
+        assert_array_equal(ass, cast)
+
+    @pytest.mark.parametrize("dtype", [np.int64, np.float32])
+    @pytest.mark.parametrize("scalar",
+            [param(np.timedelta64(123, "ns"), id="timedelta64[ns]"),
+             param(np.timedelta64(12, "generic"), id="timedelta64[generic]")])
+    def test_coercion_timedelta_convert_to_number(self, dtype, scalar):
+        # Only "ns" and "generic" timedeltas can be converted to numbers
+        # so these are slightly special.
+        arr = np.array(scalar, dtype=dtype)
+        cast = np.array(scalar).astype(dtype)
+        ass = np.ones((), dtype=dtype)
+        ass[()] = scalar  # expected to succeed for these convertible units
+
+        assert_array_equal(arr, cast)
+        assert_array_equal(ass, cast)
+
+    @pytest.mark.parametrize(["val", "unit"],
+            [param(123, "s", id="[s]"), param(123, "D", id="[D]")])
+    @pytest.mark.parametrize("scalar_type", [np.datetime64, np.timedelta64])
+    @pytest.mark.xfail(reason="Error not raised for assignment")
+    def test_coercion_assignment_times(self, scalar_type, val, unit):
+        scalar = scalar_type(val, unit)
+
+        # The error type is not ideal, fails because string is too short:
+        with pytest.raises(RuntimeError):
+            np.array(scalar, dtype="S6")
+        with pytest.raises(RuntimeError):
+            cast = np.array(scalar).astype("S6")
+        ass = np.ones((), dtype="S6")
+        with pytest.raises(RuntimeError):
+            ass[()] = scalar
+
+
+class TestNested:
+    @pytest.mark.xfail(reason="No deprecation warning given.")
+    def test_nested_simple(self):
+        initial = [1.2]
+        nested = initial
+        for i in range(np.MAXDIMS - 1):
+            nested = [nested]
+
+        arr = np.array(nested, dtype="float64")
+        assert arr.shape == (1,) * np.MAXDIMS
+        with pytest.raises(ValueError):
+            np.array([nested], dtype="float64")
+
+        # We discover object automatically at this time:
+        with assert_warns(np.VisibleDeprecationWarning):
+            arr = np.array([nested])
+        assert arr.dtype == np.dtype("O")
+        assert arr.shape == (1,) * np.MAXDIMS
+        assert arr.item() is initial
+
+    def test_pathological_self_containing(self):
+        # Test that this also works for two nested sequences
+        l = []
+        l.append(l)
+        arr = np.array([l, l, l], dtype=object)
+        assert arr.shape == (3,) + (1,) * (np.MAXDIMS - 1)
+
+        # Also check a ragged case:
+        arr = np.array([l, [None], l], dtype=object)
+        assert arr.shape == (3, 1)
+
+    @pytest.mark.xfail(
+            reason="For arrays and memoryview, this used to not complain "
+                   "and assign to a too small array instead. For other "
+                   "array-likes the error is different because fewer (only "
+                   "MAXDIM-1) dimensions are found, failing the last test.")
+    @pytest.mark.parametrize("arraylike", arraylikes())
+    def test_nested_arraylikes(self, arraylike):
+        # We try storing an array like into an array, but the array-like
+        # will have too many dimensions.  This means the shape discovery
+        # decides that the array-like must be treated as an object (a special
+        # case of ragged discovery).  The result will be an array with one
+        # dimension less than the maximum dimensions, and the array being
+        # assigned to it (which does work for object or if `float(arraylike)`
+        # works).
+        initial = arraylike(np.ones((1, 1)))
+        #if not isinstance(initial, (np.ndarray, memoryview)):
+        #    pytest.xfail(
+        #        "When coercing to object, these cases currently discover "
+        #        "fewer dimensions than ndarray failing the second part.")
+
+        nested = initial
+        for i in range(np.MAXDIMS - 1):
+            nested = [nested]
+
+        with pytest.raises(ValueError):
+            # It will refuse to assign the array into
+            np.array(nested, dtype="float64")
+
+        # If this is object, we end up assigning a (1, 1) array into (1,)
+        # (due to running out of dimensions), this is currently supported but
+        # a special case which is not ideal.
+        arr = np.array(nested, dtype=object)
+        assert arr.shape == (1,) * np.MAXDIMS
+        assert arr.item() == np.array(initial).item()
+
+    @pytest.mark.parametrize("arraylike", arraylikes())
+    def test_uneven_depth_ragged(self, arraylike):
+        arr = np.arange(4).reshape((2, 2))
+        arr = arraylike(arr)
+
+        # Array is ragged in the second dimension already:
+        out = np.array([arr, [arr]], dtype=object)
+        assert out.shape == (2,)
+        assert out[0] is arr
+        assert type(out[1]) is list
+
+        if not isinstance(arr, (np.ndarray, memoryview)):
+            pytest.xfail(
+                "does not raise ValueError below, because it discovers "
+                "the dimension as (2,) and not (2, 2, 2)")
+
+        # Array is ragged in the third dimension:
+        with pytest.raises(ValueError):
+            # This is a broadcast error during assignment, because
+            # the array shape would be (2, 2, 2) but `arr[0, 0] = arr` fails.
+            np.array([arr, [arr, arr]], dtype=object)
+
+    def test_empty_sequence(self):
+        arr = np.array([[], [1], [[1]]], dtype=object)
+        assert arr.shape == (3,)
+
+        # The empty sequence stops further dimension discovery, so the
+        # result shape will be (0,) which leads to an error during:
+        with pytest.raises(ValueError):
+            np.array([[], np.empty((0, 1))], dtype=object)
+
+
+class TestBadSequences:
+    # These are tests for bad objects passed into `np.array`, in general
+    # these have undefined behaviour.  In the old code they partially worked,
+    # whereas now they will fail.  We could (and maybe should) create a copy
+    # of all sequences to be safe against bad-actors.
+
+    def test_growing_list(self):
+        # List to coerce, `mylist` will append to it during coercion
+        obj = []
+        class mylist(list):
+            def __len__(self):
+                obj.append([1, 2])
+                return super().__len__()
+
+        obj.append(mylist([1, 2]))
+
+        with pytest.raises(ValueError):  # changes to RuntimeError
+            np.array(obj)
+
+    # Note: We do not test a shrinking list.  These do very evil things
+    #       and the only way to fix them would be to copy all sequences.
+    #       (which may be a real option in the future).
+
+    def test_mutated_list(self):
+        # List to coerce, `mylist` will mutate the first element
+        obj = []
+        class mylist(list):
+            def __len__(self):
+                obj[0] = [2, 3]  # replace with a different list.
+                return super().__len__()
+
+        obj.append([2, 3])
+        obj.append(mylist([1, 2]))
+        #with pytest.raises(RuntimeError):  # Will error in the future
+        np.array(obj)
+
+    def test_replace_0d_array(self):
+        # List to coerce, `baditem` will replace the first element of the inner list
+        obj = []
+        class baditem:
+            def __len__(self):
+                obj[0][0] = 2  # replace with a different list.
+                raise ValueError("not actually a sequence!")
+
+            def __getitem__(self):
+                pass
+
+        # Runs into a corner case in the new code, the `array(2)` is cached
+        # so replacing it invalidates the cache.
+        obj.append([np.array(2), baditem()])
+        # with pytest.raises(RuntimeError):  # Will error in the future
+        np.array(obj)
+
+
+class TestArrayLikes:
+    @pytest.mark.parametrize("arraylike", arraylikes())
+    def test_0d_object_special_case(self, arraylike):
+        arr = np.array(0.)
+        obj = arraylike(arr)
+        # A single array-like is always converted:
+        res = np.array(obj, dtype=object)
+        assert_array_equal(arr, res)
+
+        # But a single 0-D nested array-like never:
+        res = np.array([obj], dtype=object)
+        assert res[0] is obj
+
+    def test_0d_generic_special_case(self):
+        class ArraySubclass(np.ndarray):
+            def __float__(self):
+                raise TypeError("e.g. quantities raise on this")
+
+        arr = np.array(0.)
+        obj = arr.view(ArraySubclass)
+        res = np.array(obj)
+        # The subclass is simply cast:
+        assert_array_equal(arr, res)
+
+        # If the 0-D array-like is included, __float__ is currently
+        # guaranteed to be used.  We may want to change that, quantities
+        # and masked arrays half make use of this.
+        with pytest.raises(TypeError):
+            np.array([obj])
+
+        # The same holds for memoryview:
+        obj = memoryview(arr)
+        res = np.array(obj)
+        assert_array_equal(arr, res)
+        with pytest.raises(ValueError):
+            # The error type does not matter much here.
+            np.array([obj])
diff --git a/numpy/core/tests/test_cpu_dispatcher.py b/numpy/core/tests/test_cpu_dispatcher.py
new file mode 100644
index 000000000..8712dee1a
--- /dev/null
+++ b/numpy/core/tests/test_cpu_dispatcher.py
@@ -0,0 +1,42 @@
+from numpy.core._multiarray_umath import __cpu_features__, __cpu_baseline__, __cpu_dispatch__
+from numpy.core import _umath_tests
+from numpy.testing import assert_equal
+
+def test_dispatcher():
+    """
+    Testing the utilities of the CPU dispatcher
+    """
+    targets = (
+        "SSE2", "SSE41", "AVX2",
+        "VSX", "VSX2", "VSX3",
+        "NEON", "ASIMD", "ASIMDHP"
+    )
+    highest_sfx = "" # no suffix for the baseline
+    all_sfx = []
+    for feature in reversed(targets):
+        # skip baseline features; by default `CCompilerOpt` does not generate separate objects
+        # for the baseline, just one object that combines all of them via the 'baseline' option
+        # within the configuration statements.
+        if feature in __cpu_baseline__:
+            continue
+        # check compiler and running machine support
+        if feature not in __cpu_dispatch__ or not __cpu_features__[feature]:
+            continue
+
+        if not highest_sfx:
+            highest_sfx = "_" + feature
+        all_sfx.append("func" + "_" + feature)
+
+    test = _umath_tests.test_dispatch()
+    assert_equal(test["func"], "func" + highest_sfx)
+    assert_equal(test["var"], "var"  + highest_sfx)
+
+    if highest_sfx:
+        assert_equal(test["func_xb"], "func" + highest_sfx)
+        assert_equal(test["var_xb"], "var"  + highest_sfx)
+    else:
+        assert_equal(test["func_xb"], "nobase")
+        assert_equal(test["var_xb"], "nobase")
+
+    all_sfx.append("func") # add the baseline
+    assert_equal(test["all"], all_sfx)
diff --git a/numpy/core/tests/test_datetime.py b/numpy/core/tests/test_datetime.py
index 438d52f97..fef1e24d8 100644
--- a/numpy/core/tests/test_datetime.py
+++ b/numpy/core/tests/test_datetime.py
@@ -775,6 +775,12 @@ class TestDateTime:
                             np.dtype('m8[Y]'), np.dtype('m8[D]'))
         assert_raises(TypeError, np.promote_types,
                             np.dtype('m8[M]'), np.dtype('m8[W]'))
+        # timedelta and float cannot be safely cast with each other
+        assert_raises(TypeError, np.promote_types, "float32", "m8")
+        assert_raises(TypeError, np.promote_types, "m8", "float32")
+        assert_raises(TypeError, np.promote_types, "uint64", "m8")
+        assert_raises(TypeError, np.promote_types, "m8", "uint64")
+
         # timedelta <op> timedelta may overflow with big unit ranges
         assert_raises(OverflowError, np.promote_types,
                             np.dtype('m8[W]'), np.dtype('m8[fs]'))
diff --git a/numpy/core/tests/test_deprecations.py b/numpy/core/tests/test_deprecations.py
index 01924410f..239d20c9d 100644
--- a/numpy/core/tests/test_deprecations.py
+++ b/numpy/core/tests/test_deprecations.py
@@ -9,6 +9,7 @@ import warnings
 import pytest
 import tempfile
 import re
+import sys
 
 import numpy as np
 from numpy.testing import (
@@ -313,19 +314,14 @@ class TestBinaryReprInsufficientWidthParameterForRepresentation(_DeprecationTest
 
 class TestNumericStyleTypecodes(_DeprecationTestCase):
     """
-    Deprecate the old numeric-style dtypes, which are especially
-    confusing for complex types, e.g. Complex32 -> complex64. When the
-    deprecation cycle is complete, the check for the strings should be
-    removed from PyArray_DescrConverter in descriptor.c, and the
-    deprecated keys should not be added as capitalized aliases in
-    _add_aliases in numerictypes.py.
+    Most numeric style typecodes were previously deprecated (and removed)
+    in 1.20. This also deprecates the remaining ones.
     """
+    # 2020-06-09, NumPy 1.20
     def test_all_dtypes(self):
-        deprecated_types = [
-            'Bool', 'Complex32', 'Complex64', 'Float16', 'Float32', 'Float64',
-            'Int8', 'Int16', 'Int32', 'Int64', 'Object0', 'Timedelta64',
-            'UInt8', 'UInt16', 'UInt32', 'UInt64', 'Void0'
-            ]
+        deprecated_types = ['Bytes0', 'Datetime64', 'Str0']
+        # Depending on intp size, either Uint32 or Uint64 is defined:
+        deprecated_types.append(f"U{np.dtype(np.intp).name}")
         for dt in deprecated_types:
             self.assert_deprecated(np.dtype, exceptions=(TypeError,),
                                    args=(dt,))
@@ -438,14 +434,6 @@ class TestGeneratorSum(_DeprecationTestCase):
         self.assert_deprecated(np.sum, args=((i for i in range(5)),))
 
 
-class TestSctypeNA(_VisibleDeprecationTestCase):
-    # 2018-06-24, 1.16
-    def test_sctypeNA(self):
-        self.assert_deprecated(lambda: np.sctypeNA['?'])
-        self.assert_deprecated(lambda: np.typeNA['?'])
-        self.assert_deprecated(lambda: np.typeNA.get('?'))
-
-
 class TestPositiveOnNonNumerical(_DeprecationTestCase):
     # 2018-06-28, 1.16.0
     def test_positive_on_non_number(self):
@@ -655,3 +643,22 @@ class TestNonExactMatchDeprecation(_DeprecationTestCase):
         self.assert_deprecated(lambda: np.ravel_multi_index(arr, (7, 6), mode='Cilp'))
         # using completely different word with first character as R
         self.assert_deprecated(lambda: np.searchsorted(arr[0], 4, side='Random'))
+
+
+class TestDeprecatedGlobals(_DeprecationTestCase):
+    # 2020-06-06
+    @pytest.mark.skipif(
+        sys.version_info < (3, 7),
+        reason='module-level __getattr__ not supported')
+    def test_type_aliases(self):
+        # from builtins
+        self.assert_deprecated(lambda: np.bool)
+        self.assert_deprecated(lambda: np.int)
+        self.assert_deprecated(lambda: np.float)
+        self.assert_deprecated(lambda: np.complex)
+        self.assert_deprecated(lambda: np.object)
+        self.assert_deprecated(lambda: np.str)
+
+        # from np.compat
+        self.assert_deprecated(lambda: np.long)
+        self.assert_deprecated(lambda: np.unicode)
diff --git a/numpy/core/tests/test_dtype.py b/numpy/core/tests/test_dtype.py
index 73aa01de6..2e2b0dbe2 100644
--- a/numpy/core/tests/test_dtype.py
+++ b/numpy/core/tests/test_dtype.py
@@ -86,6 +86,15 @@ class TestBuiltin:
             assert_raises(TypeError, np.dtype, 'q8')
             assert_raises(TypeError, np.dtype, 'Q8')
 
+    @pytest.mark.parametrize("dtype",
+             ['Bool', 'Complex32', 'Complex64', 'Float16', 'Float32', 'Float64',
+              'Int8', 'Int16', 'Int32', 'Int64', 'Object0', 'Timedelta64',
+              'UInt8', 'UInt16', 'UInt32', 'UInt64', 'Void0',
+              "Float128", "Complex128"])
+    def test_numeric_style_types_are_invalid(self, dtype):
+        with assert_raises(TypeError):
+            np.dtype(dtype)
+
     @pytest.mark.parametrize(
         'value',
         ['m8', 'M8', 'datetime64', 'timedelta64',
@@ -1047,6 +1056,11 @@ def test_invalid_dtype_string():
     assert_raises(TypeError, np.dtype, u'Fl\xfcgel')
 
 
+def test_keyword_argument():
+    # test for https://github.com/numpy/numpy/pull/16574#issuecomment-642660971
+    assert np.dtype(dtype=np.float64) == np.dtype(np.float64)
+
+
 class TestFromDTypeAttribute:
     def test_simple(self):
         class dt:
@@ -1324,4 +1338,3 @@ class TestFromCTypes:
         pair_type = np.dtype('{},{}'.format(*pair))
         expected = np.dtype([('f0', pair[0]), ('f1', pair[1])])
         assert_equal(pair_type, expected)
-
diff --git a/numpy/core/tests/test_function_base.py b/numpy/core/tests/test_function_base.py
index 2197ef0cd..62a9772c8 100644
--- a/numpy/core/tests/test_function_base.py
+++ b/numpy/core/tests/test_function_base.py
@@ -1,6 +1,6 @@
 from numpy import (
     logspace, linspace, geomspace, dtype, array, sctypes, arange, isnan,
-    ndarray, sqrt, nextafter, stack
+    ndarray, sqrt, nextafter, stack, errstate
     )
 from numpy.testing import (
     assert_, assert_equal, assert_raises, assert_array_equal, assert_allclose,
@@ -113,6 +113,40 @@ class TestGeomspace:
         assert_array_equal(y, [-100, -10, -1])
         assert_array_equal(y.imag, 0)
 
+    def test_boundaries_match_start_and_stop_exactly(self):
+        # make sure that the boundaries of the returned array exactly
+        # equal 'start' and 'stop' - this isn't obvious because
+        # np.exp(np.log(x)) isn't necessarily exactly equal to x
+        start = 0.3
+        stop = 20.3
+
+        y = geomspace(start, stop, num=1)
+        assert_equal(y[0], start)
+
+        y = geomspace(start, stop, num=1, endpoint=False)
+        assert_equal(y[0], start)
+
+        y = geomspace(start, stop, num=3)
+        assert_equal(y[0], start)
+        assert_equal(y[-1], stop)
+
+        y = geomspace(start, stop, num=3, endpoint=False)
+        assert_equal(y[0], start)
+
+    def test_nan_interior(self):
+        with errstate(invalid='ignore'):
+            y = geomspace(-3, 3, num=4)
+
+        assert_equal(y[0], -3.0)
+        assert_(isnan(y[1:-1]).all())
+        assert_equal(y[3], 3.0)
+
+        with errstate(invalid='ignore'):
+            y = geomspace(-3, 3, num=4, endpoint=False)
+
+        assert_equal(y[0], -3.0)
+        assert_(isnan(y[1:]).all())
+
     def test_complex(self):
         # Purely imaginary
         y = geomspace(1j, 16j, num=5)
diff --git a/numpy/core/tests/test_indexing.py b/numpy/core/tests/test_indexing.py
index 4bb5cb11a..f6e263774 100644
--- a/numpy/core/tests/test_indexing.py
+++ b/numpy/core/tests/test_indexing.py
@@ -370,6 +370,20 @@ class TestIndexing:
         a[...] = s
         assert_((a == 1).all())
 
+    def test_array_like_values(self):
+        # Similar to the above test, but use a memoryview instead
+        a = np.zeros((5, 5))
+        s = np.arange(25, dtype=np.float64).reshape(5, 5)
+
+        a[[0, 1, 2, 3, 4], :] = memoryview(s)
+        assert_array_equal(a, s)
+
+        a[:, [0, 1, 2, 3, 4]] = memoryview(s)
+        assert_array_equal(a, s)
+
+        a[...] = memoryview(s)
+        assert_array_equal(a, s)
+
     def test_subclass_writeable(self):
         d = np.rec.array([('NGC1001', 11), ('NGC1002', 1.), ('NGC1003', 1.)],
                          dtype=[('target', 'S20'), ('V_mag', '>f4')])
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index e116077f9..09adddf6d 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -4702,6 +4702,10 @@ class TestIO:
         e = np.array([-25041670086757, 104783749223640], dtype=np.int64)
         assert_array_equal(d, e)
 
+    def test_fromstring_count0(self):
+        d = np.fromstring("1,2", sep=",", dtype=np.int64, count=0)
+        assert d.shape == (0,)
+
     def test_empty_files_binary(self):
         with open(self.filename, 'w') as f:
             pass
diff --git a/numpy/core/tests/test_regression.py b/numpy/core/tests/test_regression.py
index 96a6d810f..cf18a5d93 100644
--- a/numpy/core/tests/test_regression.py
+++ b/numpy/core/tests/test_regression.py
@@ -42,13 +42,6 @@ class TestRegression:
                 b = pickle.load(f)
             assert_array_equal(a, b)
 
-    def test_typeNA(self):
-        # Issue gh-515
-        with suppress_warnings() as sup:
-            sup.filter(np.VisibleDeprecationWarning)
-            assert_equal(np.typeNA[np.int64], 'Int64')
-            assert_equal(np.typeNA[np.uint64], 'UInt64')
-
     def test_dtype_names(self):
         # Ticket #35
         # Should succeed
diff --git a/numpy/core/tests/test_scalar_ctors.py b/numpy/core/tests/test_scalar_ctors.py
index 7645a0853..7e933537d 100644
--- a/numpy/core/tests/test_scalar_ctors.py
+++ b/numpy/core/tests/test_scalar_ctors.py
@@ -65,7 +65,7 @@ class TestExtraArgs:
 
     def test_bool(self):
         with pytest.raises(TypeError):
-            np.bool(False, garbage=True)
+            np.bool_(False, garbage=True)
 
     def test_void(self):
         with pytest.raises(TypeError):
@@ -79,3 +79,37 @@ class TestFromInt:
 
     def test_uint64_from_negative(self):
         assert_equal(np.uint64(-2), np.uint64(18446744073709551614))
+
+
+int_types = [np.byte, np.short, np.intc, np.int_, np.longlong]
+uint_types = [np.ubyte, np.ushort, np.uintc, np.uint, np.ulonglong]
+float_types = [np.half, np.single, np.double, np.longdouble]
+cfloat_types = [np.csingle, np.cdouble, np.clongdouble]
+
+
+class TestArrayFromScalar:
+    """ gh-15467 """
+
+    def _do_test(self, t1, t2):
+        x = t1(2)
+        arr = np.array(x, dtype=t2)
+        # type should be preserved exactly
+        if t2 is None:
+            assert arr.dtype.type is t1
+        else:
+            assert arr.dtype.type is t2
+
+    @pytest.mark.parametrize('t1', int_types + uint_types)
+    @pytest.mark.parametrize('t2', int_types + uint_types + [None])
+    def test_integers(self, t1, t2):
+        return self._do_test(t1, t2)
+
+    @pytest.mark.parametrize('t1', float_types)
+    @pytest.mark.parametrize('t2', float_types + [None])
+    def test_reals(self, t1, t2):
+        return self._do_test(t1, t2)
+
+    @pytest.mark.parametrize('t1', cfloat_types)
+    @pytest.mark.parametrize('t2', cfloat_types + [None])
+    def test_complex(self, t1, t2):
+        return self._do_test(t1, t2)
diff --git a/numpy/core/tests/test_scalarbuffer.py b/numpy/core/tests/test_scalarbuffer.py
index b1c1bbbb1..574c56864 100644
--- a/numpy/core/tests/test_scalarbuffer.py
+++ b/numpy/core/tests/test_scalarbuffer.py
@@ -2,6 +2,7 @@
 Test scalar buffer interface adheres to PEP 3118
 """
 import numpy as np
+from numpy.core._rational_tests import rational
 import pytest
 
 from numpy.testing import assert_, assert_equal, assert_raises
@@ -117,3 +118,8 @@ class TestScalarPEP3118:
         code_points = np.frombuffer(v, dtype='i4')
 
         assert_equal(code_points, [ord(c) for c in s])
+
+    def test_user_scalar_fails_buffer(self):
+        r = rational(1)
+        with assert_raises(TypeError):
+            memoryview(r)
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 91acd6ac3..f836af168 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -890,15 +890,17 @@ class TestAVXFloat32Transcendental:
         sizes = np.arange(2,100)
         for ii in sizes:
             x_f32 = np.float32(np.random.uniform(low=0.01,high=88.1,size=ii))
+            x_f32_large = x_f32.copy()
+            x_f32_large[3:-1:4] = 120000.0
             exp_true = np.exp(x_f32)
             log_true = np.log(x_f32)
-            sin_true = np.sin(x_f32)
-            cos_true = np.cos(x_f32)
+            sin_true = np.sin(x_f32_large)
+            cos_true = np.cos(x_f32_large)
             for jj in strides:
                 assert_array_almost_equal_nulp(np.exp(x_f32[::jj]), exp_true[::jj], nulp=2)
                 assert_array_almost_equal_nulp(np.log(x_f32[::jj]), log_true[::jj], nulp=2)
-                assert_array_almost_equal_nulp(np.sin(x_f32[::jj]), sin_true[::jj], nulp=2)
-                assert_array_almost_equal_nulp(np.cos(x_f32[::jj]), cos_true[::jj], nulp=2)
+                assert_array_almost_equal_nulp(np.sin(x_f32_large[::jj]), sin_true[::jj], nulp=2)
+                assert_array_almost_equal_nulp(np.cos(x_f32_large[::jj]), cos_true[::jj], nulp=2)
 
 class TestLogAddExp(_FilterInvalids):
     def test_logaddexp_values(self):
diff --git a/numpy/core/tests/test_umath_accuracy.py b/numpy/core/tests/test_umath_accuracy.py
index e3c2eb025..33080edbb 100644
--- a/numpy/core/tests/test_umath_accuracy.py
+++ b/numpy/core/tests/test_umath_accuracy.py
@@ -57,9 +57,3 @@ class TestAccuracy:
                         outval = outval[perm]
                         maxulperr = data_subset['ulperr'].max()
                         assert_array_max_ulp(npfunc(inval), outval, maxulperr)
-
-    def test_ignore_nan_ulperror(self):
-        # Ignore ULP differences between various NAN's
-        nan1_f32 = np.array(str_to_float('0xffffffff'), dtype=np.float32)
-        nan2_f32 = np.array(str_to_float('0x7fddbfbf'), dtype=np.float32)
-        assert_array_max_ulp(nan1_f32, nan2_f32, 0)
diff --git a/numpy/core/tests/test_umath_complex.py b/numpy/core/tests/test_umath_complex.py
index a21158420..a626219c5 100644
--- a/numpy/core/tests/test_umath_complex.py
+++ b/numpy/core/tests/test_umath_complex.py
@@ -545,25 +545,25 @@ class TestSpecialComplexAVX(object):
     @pytest.mark.parametrize("stride", [-4,-2,-1,1,2,4])
     @pytest.mark.parametrize("astype", [np.complex64, np.complex128])
     def test_array(self, stride, astype):
-        arr = np.array([np.complex(np.nan , np.nan),
-                        np.complex(np.nan , np.inf),
-                        np.complex(np.inf , np.nan),
-                        np.complex(np.inf , np.inf),
-                        np.complex(0.     , np.inf),
-                        np.complex(np.inf , 0.),
-                        np.complex(0.     , 0.),
-                        np.complex(0.     , np.nan),
-                        np.complex(np.nan , 0.)], dtype=astype)
+        arr = np.array([complex(np.nan , np.nan),
+                        complex(np.nan , np.inf),
+                        complex(np.inf , np.nan),
+                        complex(np.inf , np.inf),
+                        complex(0.     , np.inf),
+                        complex(np.inf , 0.),
+                        complex(0.     , 0.),
+                        complex(0.     , np.nan),
+                        complex(np.nan , 0.)], dtype=astype)
         abs_true = np.array([np.nan, np.inf, np.inf, np.inf, np.inf, np.inf, 0., np.nan, np.nan], dtype=arr.real.dtype)
-        sq_true = np.array([np.complex(np.nan,  np.nan),
-                            np.complex(np.nan,  np.nan),
-                            np.complex(np.nan,  np.nan),
-                            np.complex(np.nan,  np.inf),
-                            np.complex(-np.inf, np.nan),
-                            np.complex(np.inf,  np.nan),
-                            np.complex(0.,     0.),
-                            np.complex(np.nan, np.nan),
-                            np.complex(np.nan, np.nan)], dtype=astype)
+        sq_true = np.array([complex(np.nan,  np.nan),
+                            complex(np.nan,  np.nan),
+                            complex(np.nan,  np.nan),
+                            complex(np.nan,  np.inf),
+                            complex(-np.inf, np.nan),
+                            complex(np.inf,  np.nan),
+                            complex(0.,     0.),
+                            complex(np.nan, np.nan),
+                            complex(np.nan, np.nan)], dtype=astype)
         assert_equal(np.abs(arr[::stride]), abs_true[::stride])
         with np.errstate(invalid='ignore'):
             assert_equal(np.square(arr[::stride]), sq_true[::stride])
diff --git a/numpy/distutils/ccompiler_opt.py b/numpy/distutils/ccompiler_opt.py
new file mode 100644
index 000000000..edf6c1ba0
--- /dev/null
+++ b/numpy/distutils/ccompiler_opt.py
@@ -0,0 +1,2463 @@
+"""Provides the `CCompilerOpt` class, used for handling the CPU/hardware
+optimization, starting from parsing the command arguments, to managing the
+relation between the CPU baseline and dispatch-able features,
+also generating the required C headers and ending with compiling
+the sources with the proper compiler flags.
+
+`CCompilerOpt` doesn't provide runtime detection for the CPU features,
+instead only focuses on the compiler side, but it creates abstract C headers
+that can be used later for the final runtime dispatching process."""
+
+import sys, io, os, re, textwrap, pprint, inspect, atexit, subprocess
+
+class _Config:
+    """An abstract class holds all configurable attributes of `CCompilerOpt`,
+    these class attributes can be used to change the default behavior
+    of `CCompilerOpt` in order to fit other requirements.
+
+    Attributes
+    ----------
+    conf_nocache : bool
+        Set True to disable memory and file cache.
+        Default is False.
+
+    conf_noopt : bool
+        Set True to force the optimization to be disabled,
+        in this case `CCompilerOpt` tends to generate all
+        expected headers in order to 'not' break the build.
+        Default is False.
+
+    conf_cache_factors : list
+        Add extra factors to the primary caching factors. The caching factors
+        are utilized to determine whether changes have happened that
+        require discarding the cache and re-updating it. The primary factors
+        are the arguments of `CCompilerOpt` and `CCompiler`'s properties(type, flags, etc).
+        Default is list of two items, containing the time of last modification
+        of `ccompiler_opt` and value of attribute "conf_noopt"
+
+    conf_tmp_path : str,
+        The path of temporary directory. Default is auto-created
+        temporary directory via ``tempfile.mkdtemp()``.
+
+    conf_check_path : str
+        The path of testing files. Each added CPU feature must have a
+        **C** source file that contains at least one intrinsic or instruction
+        related to this feature, so it can be tested against the compiler.
+        Default is ``./distutils/checks``.
+
+    conf_target_groups : dict
+        Extra tokens that can be reached from dispatch-able sources through
+        the special mark ``@targets``. Default is an empty dictionary.
+
+        **Notes**:
+            - case-insensitive for tokens and group names
+            - sign '#' must stick in the begin of group name and only within ``@targets``
+
+        **Example**:
+            .. code-block:: console
+
+                $ "@targets #avx_group other_tokens" > group_inside.c
+
+            >>> CCompilerOpt.conf_target_groups["avx_group"] = \\
+            "$werror $maxopt avx2 avx512f avx512_skx"
+            >>> cco = CCompilerOpt(cc_instance)
+            >>> cco.try_dispatch(["group_inside.c"])
+
+    conf_c_prefix : str
+        The prefix of public C definitions. Default is ``"NPY_"``.
+
+    conf_c_prefix_ : str
+        The prefix of internal C definitions. Default is ``"NPY__"``.
+
+    conf_cc_flags : dict
+        Nested dictionaries defining several compiler flags
+        that linked to some major functions, the main key
+        represent the compiler name and sub-keys represent
+        flags names. Default is already covers all supported
+        **C** compilers.
+
+        Sub-keys explained as follows:
+
+        "native": str or None
+            used by argument option `native`, to detect the current
+            machine support via the compiler.
+        "werror": str or None
+            utilized to treat warning as errors during testing CPU features
+            against the compiler and also for target's policy `$werror`
+            via dispatch-able sources.
+        "maxopt": str or None
+            utilized for target's policy '$maxopt' and the value should
+            contains the maximum acceptable optimization by the compiler.
+            e.g. in gcc `'-O3'`
+
+        **Notes**:
+            * case-sensitive for compiler names and flags
+            * use space to separate multiple flags
+            * any flag will be tested against the compiler and it will be skipped
+              if it's not applicable.
+
+    conf_min_features : dict
+        A dictionary defines the used CPU features for
+        argument option `'min'`, the key represent the CPU architecture
+        name e.g. `'x86'`. Default values provide the best effort
+        on wide range of users platforms.
+
+        **Note**: case-sensitive for architecture names.
+
+    conf_features : dict
+        Nested dictionaries used for identifying the CPU features.
+        the primary key is represented as a feature name or group name
+        that gathers several features. Default values covers all
+        supported features but without the major options like "flags",
+        these undefined options handle it by method `conf_features_partial()`.
+        Default value is covers almost all CPU features for *X86*, *IBM/Power64*
+        and *ARM 7/8*.
+
+        Sub-keys explained as follows:
+
+        "implies" : str or list, optional,
+            List of CPU feature names to be implied by it,
+            the feature name must be defined within `conf_features`.
+            Default is None.
+
+        "flags": str or list, optional
+            List of compiler flags. Default is None.
+
+        "detect": str or list, optional
+            List of CPU feature names that required to be detected
+            in runtime. By default, its the feature name or features
+            in "group" if its specified.
+
+        "implies_detect": bool, optional
+            If True, all "detect" of implied features will be combined.
+            Default is True. see `feature_detect()`.
+
+        "group": str or list, optional
+            Same as "implies" but doesn't require the feature name to be
+            defined within `conf_features`.
+
+        "interest": int, required
+            a key for sorting CPU features
+
+        "headers": str or list, optional
+            intrinsics C header file
+
+        "disable": str, optional
+            force disable feature, the string value should contains the
+            reason of disabling.
+
+        "autovec": bool or None, optional
+            True or False to declare that CPU feature can be auto-vectorized
+            by the compiler.
+            By default(None), treated as True if the feature contains at
+            least one applicable flag. see `feature_can_autovec()`
+
+        **NOTES**:
+            * space can be used as separator with options that supports "str or list"
+            * case-sensitive for all values and feature name must be in upper-case.
+            * if flags aren't applicable, they will be skipped rather than disabling the
+              CPU feature
+            * the CPU feature will be disabled if the compiler fails to compile
+              the test file
+    """
+    conf_nocache = False
+    conf_noopt = False
+    conf_cache_factors = None
+    conf_tmp_path = None
+    conf_check_path = os.path.join(
+        os.path.dirname(os.path.realpath(__file__)), "checks"
+    )
+    conf_target_groups = {}
+    conf_c_prefix = 'NPY_'
+    conf_c_prefix_ = 'NPY__'
+    conf_cc_flags = dict(
+        gcc = dict(
+            # native should always fail on arm and ppc64,
+            # native usually works only with x86
+            native = '-march=native',
+            opt = '-O3',
+            werror = '-Werror'
+        ),
+        clang = dict(
+            native = '-march=native',
+            opt = "-O3",
+            werror = '-Werror'
+        ),
+        icc = dict(
+            native = '-xHost',
+            opt = '-O3',
+            werror = '-Werror'
+        ),
+        iccw = dict(
+            native = '/QxHost',
+            opt = '/O3',
+            werror = '/Werror'
+        ),
+        msvc = dict(
+            native = None,
+            opt = '/O2',
+            werror = '/WX'
+        )
+    )
+    # Space-separated CPU feature names per architecture; the keys are the
+    # same architecture names that `_CCompiler.__init__()` stores in
+    # `cc_march`.
+    # NOTE(review): presumably the minimal set of features enabled by
+    # default -- the consumer isn't visible here; confirm.
+    conf_min_features = dict(
+        x86 = "SSE SSE2",
+        x64 = "SSE SSE2 SSE3",
+        ppc64 = '', # play it safe
+        ppc64le = "VSX VSX2",
+        armhf = '', # play it safe
+        aarch64 = "NEON NEON_FP16 NEON_VFPV4 ASIMD"
+    )
+    # Default options of every known CPU feature, keyed by the upper-case
+    # feature name; the meaning of each option is described in the class
+    # docstring. Options that depend on the compiler or the platform
+    # (e.g. "flags") are supplied by `conf_features_partial()`, whose values
+    # override the ones defined here.
+    conf_features = dict(
+        # X86
+        SSE = dict(
+            interest=1, headers="xmmintrin.h",
+            # enabling SSE without SSE2 is useless also
+            # it's non-optional for x86_64
+            implies="SSE2"
+        ),
+        SSE2   = dict(interest=2, implies="SSE", headers="emmintrin.h"),
+        SSE3   = dict(interest=3, implies="SSE2", headers="pmmintrin.h"),
+        SSSE3  = dict(interest=4, implies="SSE3", headers="tmmintrin.h"),
+        SSE41  = dict(interest=5, implies="SSSE3", headers="smmintrin.h"),
+        POPCNT = dict(interest=6, implies="SSE41", headers="popcntintrin.h"),
+        SSE42  = dict(interest=7, implies="POPCNT"),
+        AVX    = dict(
+            interest=8, implies="SSE42", headers="immintrin.h",
+            implies_detect=False
+        ),
+        XOP    = dict(interest=9, implies="AVX", headers="x86intrin.h"),
+        FMA4   = dict(interest=10, implies="AVX", headers="x86intrin.h"),
+        F16C   = dict(interest=11, implies="AVX"),
+        FMA3   = dict(interest=12, implies="F16C"),
+        AVX2   = dict(interest=13, implies="F16C"),
+        AVX512F = dict(interest=20, implies="FMA3 AVX2", implies_detect=False),
+        AVX512CD = dict(interest=21, implies="AVX512F"),
+        # The AVX512_* entries below bundle several AVX512 extensions
+        # (listed in "group") under a single feature name.
+        AVX512_KNL = dict(
+            interest=40, implies="AVX512CD", group="AVX512ER AVX512PF",
+            detect="AVX512_KNL", implies_detect=False
+        ),
+        AVX512_KNM = dict(
+            interest=41, implies="AVX512_KNL",
+            group="AVX5124FMAPS AVX5124VNNIW AVX512VPOPCNTDQ",
+            detect="AVX512_KNM", implies_detect=False
+        ),
+        AVX512_SKX = dict(
+            interest=42, implies="AVX512CD", group="AVX512VL AVX512BW AVX512DQ",
+            detect="AVX512_SKX", implies_detect=False
+        ),
+        AVX512_CLX = dict(
+            interest=43, implies="AVX512_SKX", group="AVX512VNNI",
+            detect="AVX512_CLX"
+        ),
+        AVX512_CNL = dict(
+            interest=44, implies="AVX512_SKX", group="AVX512IFMA AVX512VBMI",
+            detect="AVX512_CNL", implies_detect=False
+        ),
+        AVX512_ICL = dict(
+            interest=45, implies="AVX512_CLX AVX512_CNL",
+            group="AVX512VBMI2 AVX512BITALG AVX512VPOPCNTDQ",
+            detect="AVX512_ICL", implies_detect=False
+        ),
+        # IBM/Power
+        ## Power7/ISA 2.06
+        VSX = dict(interest=1, headers="altivec.h"),
+        ## Power8/ISA 2.07
+        VSX2 = dict(interest=2, implies="VSX", implies_detect=False),
+        ## Power9/ISA 3.00
+        VSX3 = dict(interest=3, implies="VSX2", implies_detect=False),
+        # ARM
+        NEON  = dict(interest=1, headers="arm_neon.h"),
+        NEON_FP16 = dict(interest=2, implies="NEON"),
+        ## FMA
+        NEON_VFPV4 = dict(interest=3, implies="NEON_FP16"),
+        ## Advanced SIMD
+        ASIMD = dict(interest=4, implies="NEON_FP16 NEON_VFPV4", implies_detect=False),
+        ## ARMv8.2 half-precision & vector arithm
+        ASIMDHP = dict(interest=5, implies="ASIMD"),
+        ## ARMv8.2 dot product
+        ASIMDDP = dict(interest=6, implies="ASIMD"),
+        ## ARMv8.2 Single & half-precision Multiply
+        ASIMDFHM = dict(interest=7, implies="ASIMDHP"),
+    )
+    def conf_features_partial(self):
+        """Return a dictionary of the CPU features supported by the
+        platform, which accumulates the options that are left undefined
+        in `conf_features`.
+
+        The returned dict follows the same rules and notes as the class
+        attribute `conf_features`, and it overrides any option that has
+        been set in `conf_features`.
+
+        An empty dict is returned when optimization is disabled
+        (`cc_noopt`) or when none of the tables below matches the detected
+        architecture and compiler.
+        """
+        if self.cc_noopt:
+            # optimization is disabled
+            return {}
+
+        on_x86 = self.cc_on_x86 or self.cc_on_x64
+        # GCC and Clang share the tables below; note that an undetected
+        # compiler is treated as GCC by `_CCompiler.__init__()`.
+        is_unix = self.cc_is_gcc or self.cc_is_clang
+
+        # x86 / x86_64 with GCC or Clang
+        if on_x86 and is_unix: return dict(
+            SSE    = dict(flags="-msse"),
+            SSE2   = dict(flags="-msse2"),
+            SSE3   = dict(flags="-msse3"),
+            SSSE3  = dict(flags="-mssse3"),
+            SSE41  = dict(flags="-msse4.1"),
+            POPCNT = dict(flags="-mpopcnt"),
+            SSE42  = dict(flags="-msse4.2"),
+            AVX    = dict(flags="-mavx"),
+            F16C   = dict(flags="-mf16c"),
+            XOP    = dict(flags="-mxop"),
+            FMA4   = dict(flags="-mfma4"),
+            FMA3   = dict(flags="-mfma"),
+            AVX2   = dict(flags="-mavx2"),
+            AVX512F = dict(flags="-mavx512f"),
+            AVX512CD = dict(flags="-mavx512cd"),
+            AVX512_KNL = dict(flags="-mavx512er -mavx512pf"),
+            AVX512_KNM = dict(
+                flags="-mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq"
+            ),
+            AVX512_SKX = dict(flags="-mavx512vl -mavx512bw -mavx512dq"),
+            AVX512_CLX = dict(flags="-mavx512vnni"),
+            AVX512_CNL = dict(flags="-mavx512ifma -mavx512vbmi"),
+            AVX512_ICL = dict(
+                flags="-mavx512vbmi2 -mavx512bitalg -mavx512vpopcntdq"
+            )
+        )
+        # x86 / x86_64 with the Intel compiler (unix-like command line);
+        # an empty dict means that no option is overridden for the feature
+        if on_x86 and self.cc_is_icc: return dict(
+            SSE    = dict(flags="-msse"),
+            SSE2   = dict(flags="-msse2"),
+            SSE3   = dict(flags="-msse3"),
+            SSSE3  = dict(flags="-mssse3"),
+            SSE41  = dict(flags="-msse4.1"),
+            POPCNT = {},
+            SSE42  = dict(flags="-msse4.2"),
+            AVX    = dict(flags="-mavx"),
+            F16C   = {},
+            XOP    = dict(disable="Intel Compiler doesn't support it"),
+            FMA4   = dict(disable="Intel Compiler doesn't support it"),
+            # Intel Compiler doesn't support AVX2 or FMA3 independently
+            FMA3 = dict(
+                implies="F16C AVX2", flags="-march=core-avx2"
+            ),
+            AVX2 = dict(implies="FMA3", flags="-march=core-avx2"),
+            # Intel Compiler doesn't support AVX512F or AVX512CD independently
+            AVX512F = dict(
+                implies="AVX2 AVX512CD", flags="-march=common-avx512"
+            ),
+            AVX512CD = dict(
+                implies="AVX2 AVX512F", flags="-march=common-avx512"
+            ),
+            AVX512_KNL = dict(flags="-xKNL"),
+            AVX512_KNM = dict(flags="-xKNM"),
+            AVX512_SKX = dict(flags="-xSKYLAKE-AVX512"),
+            AVX512_CLX = dict(flags="-xCASCADELAKE"),
+            AVX512_CNL = dict(flags="-xCANNONLAKE"),
+            AVX512_ICL = dict(flags="-xICELAKE-CLIENT"),
+        )
+        # x86 / x86_64 with the Intel compiler (msvc-like command line)
+        if on_x86 and self.cc_is_iccw: return dict(
+            SSE    = dict(flags="/arch:SSE"),
+            SSE2   = dict(flags="/arch:SSE2"),
+            SSE3   = dict(flags="/arch:SSE3"),
+            SSSE3  = dict(flags="/arch:SSSE3"),
+            SSE41  = dict(flags="/arch:SSE4.1"),
+            POPCNT = {},
+            SSE42  = dict(flags="/arch:SSE4.2"),
+            AVX    = dict(flags="/arch:AVX"),
+            F16C   = {},
+            XOP    = dict(disable="Intel Compiler doesn't support it"),
+            FMA4   = dict(disable="Intel Compiler doesn't support it"),
+            # Intel Compiler doesn't support FMA3 or AVX2 independently
+            FMA3 = dict(
+                implies="F16C AVX2", flags="/arch:CORE-AVX2"
+            ),
+            AVX2 = dict(
+                implies="FMA3", flags="/arch:CORE-AVX2"
+            ),
+            # Intel Compiler doesn't support AVX512F or AVX512CD independently
+            AVX512F = dict(
+                implies="AVX2 AVX512CD", flags="/Qx:COMMON-AVX512"
+            ),
+            AVX512CD = dict(
+                implies="AVX2 AVX512F", flags="/Qx:COMMON-AVX512"
+            ),
+            AVX512_KNL = dict(flags="/Qx:KNL"),
+            AVX512_KNM = dict(flags="/Qx:KNM"),
+            AVX512_SKX = dict(flags="/Qx:SKYLAKE-AVX512"),
+            AVX512_CLX = dict(flags="/Qx:CASCADELAKE"),
+            AVX512_CNL = dict(flags="/Qx:CANNONLAKE"),
+            AVX512_ICL = dict(flags="/Qx:ICELAKE-CLIENT")
+        )
+        # x86 / x86_64 with MSVC
+        if on_x86 and self.cc_is_msvc: return dict(
+            SSE    = dict(flags="/arch:SSE"),
+            SSE2   = dict(flags="/arch:SSE2"),
+            SSE3   = {},
+            SSSE3  = {},
+            SSE41  = {},
+            POPCNT = dict(headers="nmmintrin.h"),
+            SSE42  = {},
+            AVX    = dict(flags="/arch:AVX"),
+            F16C   = {},
+            XOP    = dict(headers="ammintrin.h"),
+            FMA4   = dict(headers="ammintrin.h"),
+            # MSVC doesn't support FMA3 or AVX2 independently
+            FMA3 = dict(
+                implies="F16C AVX2", flags="/arch:AVX2"
+            ),
+            AVX2 = dict(
+                implies="F16C FMA3", flags="/arch:AVX2"
+            ),
+            # MSVC doesn't support AVX512F or AVX512CD independently,
+            # always generate instructions belong to (VL/VW/DQ)
+            AVX512F = dict(
+                implies="AVX2 AVX512CD AVX512_SKX", flags="/arch:AVX512"
+            ),
+            AVX512CD = dict(
+                implies="AVX512F AVX512_SKX", flags="/arch:AVX512"
+            ),
+            AVX512_KNL = dict(
+                disable="MSVC compiler doesn't support it"
+            ),
+            AVX512_KNM = dict(
+                disable="MSVC compiler doesn't support it"
+            ),
+            AVX512_SKX = dict(flags="/arch:AVX512"),
+            AVX512_CLX = {},
+            AVX512_CNL = {},
+            AVX512_ICL = {}
+        )
+
+        # 64-bit PowerPC, both endiannesses and any compiler
+        on_power = self.cc_on_ppc64le or self.cc_on_ppc64
+        if on_power:
+            partial = dict(
+                VSX = dict(
+                    # only on little-endian does VSX imply VSX2
+                    implies=("VSX2" if self.cc_on_ppc64le else ""),
+                    flags="-mvsx"
+                ),
+                VSX2 = dict(
+                    flags="-mcpu=power8", implies_detect=False
+                ),
+                VSX3 = dict(
+                    flags="-mcpu=power9 -mtune=power9", implies_detect=False
+                )
+            )
+            # Clang gets a different set of flags than the default above
+            if self.cc_is_clang:
+                partial["VSX"]["flags"]  = "-maltivec -mvsx"
+                partial["VSX2"]["flags"] = "-mpower8-vector"
+                partial["VSX3"]["flags"] = "-mpower9-vector"
+
+            return partial
+
+        # AArch64 with GCC or Clang: the first four features imply each
+        # other and define no flags
+        if self.cc_on_aarch64 and is_unix: return dict(
+            NEON = dict(
+                implies="NEON_FP16 NEON_VFPV4 ASIMD", autovec=True
+            ),
+            NEON_FP16 = dict(
+                implies="NEON NEON_VFPV4 ASIMD", autovec=True
+            ),
+            NEON_VFPV4 = dict(
+                implies="NEON NEON_FP16 ASIMD", autovec=True
+            ),
+            ASIMD = dict(
+                implies="NEON NEON_FP16 NEON_VFPV4", autovec=True
+            ),
+            ASIMDHP = dict(
+                flags="-march=armv8.2-a+fp16"
+            ),
+            ASIMDDP = dict(
+                flags="-march=armv8.2-a+dotprod"
+            ),
+            ASIMDFHM = dict(
+                flags="-march=armv8.2-a+fp16fml"
+            ),
+        )
+        # 32-bit ARM with GCC or Clang
+        if self.cc_on_armhf and is_unix: return dict(
+            NEON = dict(
+                flags="-mfpu=neon"
+            ),
+            NEON_FP16 = dict(
+                flags="-mfpu=neon-fp16 -mfp16-format=ieee"
+            ),
+            NEON_VFPV4 = dict(
+                flags="-mfpu=neon-vfpv4",
+            ),
+            ASIMD = dict(
+                flags="-mfpu=neon-fp-armv8 -march=armv8-a+simd",
+            ),
+            ASIMDHP = dict(
+                flags="-march=armv8.2-a+fp16"
+            ),
+            ASIMDDP = dict(
+                flags="-march=armv8.2-a+dotprod",
+            ),
+            ASIMDFHM = dict(
+                flags="-march=armv8.2-a+fp16fml"
+            )
+        )
+        # TODO: ARM MSVC
+        return {}
+
+    def __init__(self):
+        """Fill in the configuration values that were left as None.
+
+        * `conf_tmp_path`: a fresh temporary directory is created and
+          scheduled for removal at interpreter exit.
+        * `conf_cache_factors`: the modification time of this file plus
+          the value of `conf_nocache`.
+        """
+        if self.conf_tmp_path is None:
+            import shutil
+            import tempfile
+
+            def _remove_tmp_dir(path):
+                # best-effort cleanup, a failure here must not break exit
+                try:
+                    shutil.rmtree(path)
+                except OSError:
+                    pass
+
+            tmp_dir = tempfile.mkdtemp()
+            atexit.register(_remove_tmp_dir, tmp_dir)
+            self.conf_tmp_path = tmp_dir
+
+        if self.conf_cache_factors is None:
+            mtime = os.path.getmtime(__file__)
+            self.conf_cache_factors = [mtime, self.conf_nocache]
+
+class _Distutils:
+    """A helper class that provides a collection of fundamental methods
+    implemented on top of Python and NumPy Distutils.
+
+    The idea behind this class is to gather all the methods that may
+    need to be overridden when 'CCompilerOpt' is reused in an environment
+    different from the one NumPy has.
+
+    Parameters
+    ----------
+    ccompiler : `CCompiler`
+        The instance returned from `distutils.ccompiler.new_compiler()`.
+    """
+    def __init__(self, ccompiler):
+        self._ccompiler = ccompiler
+
+    def dist_compile(self, sources, flags, **kwargs):
+        """Wrap CCompiler.compile().
+
+        `flags` are appended to the 'extra_postargs' keyword argument,
+        if any; the remaining keyword arguments are passed through.
+        """
+        assert(isinstance(sources, list))
+        assert(isinstance(flags, list))
+        flags = kwargs.pop("extra_postargs", []) + flags
+        return self._ccompiler.compile(
+            sources, extra_postargs=flags, **kwargs
+        )
+
+    def dist_test(self, source, flags):
+        """Return True if 'CCompiler.compile()' is able to compile
+        a source file with certain flags.
+
+        While testing, the 'spawn' method of the compiler (if it has one)
+        is temporarily replaced so the compiler output can be inspected;
+        the original method is always restored afterwards.
+        """
+        assert(isinstance(source, str))
+        from distutils.errors import CompileError
+        cc = self._ccompiler
+        bk_spawn = getattr(cc, 'spawn', None)
+        if bk_spawn:
+            cc_type = getattr(self._ccompiler, "compiler_type", "")
+            if cc_type in ("msvc",):
+                setattr(cc, 'spawn', self._dist_test_spawn_paths)
+            else:
+                setattr(cc, 'spawn', self._dist_test_spawn)
+        test = False
+        try:
+            self.dist_compile(
+                [source], flags, output_dir=self.conf_tmp_path
+            )
+            test = True
+        except CompileError as e:
+            self.dist_log(str(e), stderr=True)
+        finally:
+            # restore the original 'spawn' even when an unexpected
+            # exception escapes, otherwise the compiler instance would
+            # keep using the testing wrapper for regular builds
+            if bk_spawn:
+                setattr(cc, 'spawn', bk_spawn)
+        return test
+
+    def dist_info(self):
+        """Return a string containing all environment information, required
+        by the abstract class '_CCompiler' to discover the platform
+        environment, also used as a cache factor in order to detect
+        any changes from outside.
+
+        The string is built once and memoized in `self._dist_info`; it
+        concatenates the compiler command, the architecture name, the
+        compiler type and the content of the environment variable CFLAGS.
+        """
+        if hasattr(self, "_dist_info"):
+            return self._dist_info
+        # play it safe
+        cc_info = ""
+        compiler = getattr(self._ccompiler, "compiler", None)
+        if compiler is not None:
+            if isinstance(compiler, str):
+                cc_info += compiler
+            elif hasattr(compiler, "__iter__"):
+                cc_info += ' '.join(compiler)
+        # in case if 'compiler' attribute doesn't provide anything
+        cc_type = getattr(self._ccompiler, "compiler_type", "")
+        if cc_type in ("intelem", "intelemw", "mingw64"):
+            cc_info += "x86_64"
+        elif cc_type in ("intel", "intelw", "intele"):
+            cc_info += "x86"
+        elif cc_type in ("msvc", "mingw32"):
+            import platform
+            if platform.architecture()[0] == "32bit":
+                cc_info += "x86"
+            else:
+                cc_info += "x86_64"
+        else:
+            # the last hope, too bad for cross-compiling
+            import platform
+            cc_info += platform.machine()
+
+        cc_info += cc_type
+        cflags = os.environ.get("CFLAGS", "")
+        if cflags not in cc_info:
+            cc_info += cflags
+
+        self._dist_info = cc_info
+        return cc_info
+
+    @staticmethod
+    def dist_error(*args):
+        """Raise a compiler error"""
+        from distutils.errors import CompileError
+        raise CompileError(_Distutils._dist_str(*args))
+
+    @staticmethod
+    def dist_fatal(*args):
+        """Raise a distutils error"""
+        from distutils.errors import DistutilsError
+        raise DistutilsError(_Distutils._dist_str(*args))
+
+    @staticmethod
+    def dist_log(*args, stderr=False):
+        """Print a console message, as a warning when `stderr` is True
+        and as an info message otherwise."""
+        from numpy.distutils import log
+        out = _Distutils._dist_str(*args)
+        if stderr:
+            log.warn(out)
+        else:
+            log.info(out)
+
+    @staticmethod
+    def dist_load_module(name, path):
+        """Load a module from file, required by the abstract class '_Cache'.
+
+        Returns None, after logging the error, if the module can't be loaded.
+        """
+        from numpy.compat import npy_load_module
+        try:
+            return npy_load_module(name, path)
+        except Exception as e:
+            _Distutils.dist_log(e, stderr=True)
+        return None
+
+    @staticmethod
+    def _dist_str(*args):
+        """Return a string to print by log and errors.
+
+        The arguments are joined by spaces, non-string iterables are
+        rendered recursively between parentheses, and the result is
+        prefixed with the name and the line number of the function that
+        called the log/error method.
+        """
+        def to_str(arg):
+            if not isinstance(arg, str) and hasattr(arg, '__iter__'):
+                return '(' + ' '.join([to_str(a) for a in arg]) + ')'
+            return str(arg)
+
+        # frame 0 is this function, frame 1 is dist_log/dist_error/
+        # dist_fatal and frame 2 is the code that called them
+        stack = inspect.stack()[2]
+        start = "CCompilerOpt.%s[%d] : " % (stack.function, stack.lineno)
+        out = ' '.join([to_str(a) for a in args])
+        return start + out
+
+    def _dist_test_spawn_paths(self, cmd, display=None):
+        """
+        Fix msvc SDK ENV path same as distutils do
+        without it we get c1: fatal error C1356: unable to find mspdbcore.dll
+        """
+        if not hasattr(self._ccompiler, "_paths"):
+            self._dist_test_spawn(cmd)
+            return
+        old_path = os.environ.get("path")
+        try:
+            os.environ["path"] = self._ccompiler._paths
+            self._dist_test_spawn(cmd)
+        finally:
+            # the variable may have been unset before; assigning None to
+            # os.environ would raise TypeError, so remove it instead
+            if old_path is None:
+                os.environ.pop("path", None)
+            else:
+                os.environ["path"] = old_path
+
+    _dist_warn_regex = re.compile(
+        # intel and msvc compilers don't raise
+        # fatal errors when flags are wrong or unsupported
+        ".*("
+        "ignoring unknown option|" # msvc
+        "invalid argument for option" # intel
+        ").*"
+    )
+    @staticmethod
+    def _dist_test_spawn(cmd, display=None):
+        """Run `cmd` and raise a compiler error (see `dist_error()`) if
+        the command fails, can't be executed, or succeeds while printing
+        one of the warnings in `_dist_warn_regex`.
+
+        `display` is accepted for compatibility with the signature of
+        'CCompiler.spawn()' and it's ignored.
+        """
+        try:
+            o = subprocess.check_output(cmd, stderr=subprocess.STDOUT)
+        except subprocess.CalledProcessError as exc:
+            o = exc.output
+            s = exc.returncode
+        except OSError:
+            o = b''
+            s = 127
+        else:
+            if isinstance(o, bytes):
+                # the output of the compiler isn't guaranteed to be UTF-8
+                o = o.decode(errors="replace")
+            # search rather than match: '.' doesn't cross newlines, so
+            # matching from the start only inspects the first output line
+            if o and _Distutils._dist_warn_regex.search(o):
+                _Distutils.dist_error(
+                    "Flags in command", cmd ,"aren't supported by the compiler"
+                    ", output -> \n%s" % o
+                )
+            return None
+        if isinstance(o, bytes):
+            o = o.decode(errors="replace")
+        _Distutils.dist_error(
+            "Command", cmd, "failed with exit status %d output -> \n%s" % (
+            s, o
+        ))
+
+# in-memory cache: maps the hash of the caching factors to the last
+# `_Cache` instance that has been created with these factors
+_share_cache = {}
+class _Cache:
+    """An abstract class that handles the caching functionality, it
+    provides two levels of caching: in-memory, by sharing the attributes
+    of the instances among each other, and on disk, by storing the
+    attributes into files.
+
+    **Note**:
+        any attributes that start with ``_`` or ``conf_`` will be ignored.
+
+    Parameters
+    ----------
+    cache_path: str or None
+        The path of cache file, if None then cache in file will disabled.
+
+    *factors:
+        The caching factors that need to utilize next to `conf_cache_factors`.
+
+    Attributes
+    ----------
+    cache_private: set
+        Hold the attributes that need be skipped from "in-memory cache".
+
+    cache_infile: bool
+        Utilized during initializing this class, to determine if the cache was able
+        to loaded from the specified cache path in 'cache_path'.
+    """
+
+    # skip attributes from cache
+    _cache_ignore = re.compile("^(_|conf_)")
+
+    def __init__(self, cache_path=None, *factors):
+        # results of the methods decorated by `me()`
+        self.cache_me = {}
+        self.cache_private = set()
+        self.cache_infile = False
+
+        if self.conf_nocache:
+            self.dist_log("cache is disabled by `Config`")
+            return
+
+        chash = self.cache_hash(*factors, *self.conf_cache_factors)
+        if cache_path:
+            if os.path.exists(cache_path):
+                self.dist_log("load cache from file ->", cache_path)
+                cache_mod = self.dist_load_module("cache", cache_path)
+                if not cache_mod:
+                    self.dist_log(
+                        "unable to load the cache file as a module",
+                        stderr=True
+                    )
+                elif not hasattr(cache_mod, "hash") or \
+                     not hasattr(cache_mod, "data"):
+                    self.dist_log("invalid cache file", stderr=True)
+                elif chash == cache_mod.hash:
+                    # the file has been written with the same factors
+                    self.dist_log("hit the file cache")
+                    for attr, val in cache_mod.data.items():
+                        setattr(self, attr, val)
+                    self.cache_infile = True
+                else:
+                    self.dist_log("miss the file cache")
+
+            # the file is (re)written at interpreter exit
+            atexit.register(self._cache_write, cache_path, chash)
+
+        if not self.cache_infile:
+            # fall back to the attributes of a previous instance that
+            # has been created with the same factors
+            other_cache = _share_cache.get(chash)
+            if other_cache:
+                self.dist_log("hit the memory cache")
+                for attr, val in other_cache.__dict__.items():
+                    if attr in other_cache.cache_private or \
+                               re.match(self._cache_ignore, attr):
+                        continue
+                    setattr(self, attr, val)
+
+        _share_cache[chash] = self
+
+    def __del__(self):
+        # TODO: remove the cache form share on del
+        pass
+
+    def _cache_write(self, cache_path, cache_hash):
+        """Write the instance attributes, except the ignored ones
+        (see `_cache_ignore`), into the file `cache_path` as a Python
+        module that defines `hash` and `data`.
+        """
+        # TODO: don't write if the cache doesn't change
+        self.dist_log("write cache to path ->", cache_path)
+        # filter into a new dict rather than popping the ignored
+        # attributes off the live instance
+        cache_data = {
+            attr: val for attr, val in self.__dict__.items()
+            if not re.match(self._cache_ignore, attr)
+        }
+
+        d = os.path.dirname(cache_path)
+        # 'd' is empty when the path has no directory part, in which case
+        # there is nothing to create and os.makedirs('') would raise
+        if d:
+            os.makedirs(d, exist_ok=True)
+
+        repr_dict = pprint.pformat(cache_data, compact=True)
+        with open(cache_path, "w") as f:
+            f.write(textwrap.dedent("""\
+            # AUTOGENERATED DON'T EDIT
+            # Please make changes to the code generator \
+            (distutils/ccompiler_opt.py)
+            hash = {}
+            data = \\
+            """).format(cache_hash))
+            f.write(repr_dict)
+
+    def cache_hash(self, *factors):
+        """Return a 32-bit integer hash (sdbm algorithm) of the string
+        representation of all `factors`."""
+        # is there a built-in non-crypto hash?
+        # sdbm
+        chash = 0
+        for f in factors:
+            for char in str(f):
+                chash  = ord(char) + (chash << 6) + (chash << 16) - chash
+                chash &= 0xFFFFFFFF
+        return chash
+
+    @staticmethod
+    def me(cb):
+        """
+        A static method that can be treated as a decorator to
+        dynamically cache certain methods.
+
+        The results are stored in the attribute `cache_me` of the
+        instance, keyed by the string representation of the method name
+        and the arguments of the call.
+        """
+        import functools
+
+        # keep the name and the docstring of the decorated method
+        @functools.wraps(cb)
+        def cache_wrap_me(self, *args, **kwargs):
+            # good for normal args
+            cache_key = str((
+                cb.__name__, *args, *kwargs.keys(), *kwargs.values()
+            ))
+            if cache_key in self.cache_me:
+                return self.cache_me[cache_key]
+            ccb = cb(self, *args, **kwargs)
+            self.cache_me[cache_key] = ccb
+            return ccb
+        return cache_wrap_me
+
+class _CCompiler(object):
+    """A helper class for `CCompilerOpt` containing all utilities that
+    related to the fundamental compiler's functions.
+
+    Attributes
+    ----------
+    cc_on_x86 : bool
+        True when the target architecture is 32-bit x86
+    cc_on_x64 : bool
+        True when the target architecture is 64-bit x86
+    cc_on_ppc64le : bool
+        True when the target architecture is 64-bit little-endian PowerPC
+    cc_on_ppc64 : bool
+        True when the target architecture is 64-bit big-endian PowerPC
+    cc_on_armhf : bool
+        True when the target architecture is 32-bit ARMv7+
+    cc_on_aarch64 : bool
+        True when the target architecture is 64-bit Armv8-a+
+    cc_on_noarch : bool
+        True when the target architecture is unknown or not supported
+    cc_is_gcc : bool
+        True if the compiler is GNU or
+        if the compiler is unknown
+    cc_is_clang : bool
+        True if the compiler is Clang
+    cc_is_icc : bool
+        True if the compiler is Intel compiler (unix like)
+    cc_is_iccw : bool
+        True if the compiler is Intel compiler (msvc like)
+    cc_is_msvc : bool
+        True if the compiler is MSVC
+    cc_is_nocc : bool
+        True if the compiler isn't supported directly,
+        Note: that causes a fall-back to gcc
+    cc_has_debug : bool
+        True if the compiler has debug flags
+    cc_has_native : bool
+        True if the compiler has native flags
+    cc_noopt : bool
+        True if the compiler has definition 'DISABLE_OPT*',
+        or 'cc_on_noarch' is True
+    cc_march : str
+        The target architecture name, or "unknown" if
+        the architecture isn't supported
+    cc_name : str
+        The compiler name, or "unknown" if the compiler isn't supported
+    cc_flags : dict
+        Dictionary containing the initialized flags of `_Config.conf_cc_flags`
+    """
+    def __init__(self):
+        """Detect the target architecture, the compiler and the supported
+        general flags from the string returned by `dist_info()`.
+
+        Nothing is done when the attributes already exist, e.g. after they
+        have been loaded from the cache (marked by `cc_is_cached`).
+        """
+        if hasattr(self, "cc_is_cached"):
+            return
+        # Each group is scanned in order and only the first attribute whose
+        # regex matches is set to True; an empty regex is the fallback.
+        detect_table = (
+            #        attr                regex
+            (
+                ("cc_on_x64",      "^(x|x86_|amd)64"),
+                ("cc_on_x86",      "^(x86|i386|i686)"),
+                ("cc_on_ppc64le",  "^(powerpc|ppc)64(el|le)"),
+                ("cc_on_ppc64",    "^(powerpc|ppc)64"),
+                ("cc_on_armhf",    "^arm"),
+                ("cc_on_aarch64",  "^aarch64"),
+                # priority is given to first of string
+                # if it fail we search in the rest, due
+                # to append platform.machine() at the end,
+                # check method 'dist_info()' for more clarification.
+                ("cc_on_x64",      ".*(x|x86_|amd)64.*"),
+                ("cc_on_x86",      ".*(x86|i386|i686).*"),
+                ("cc_on_ppc64le",  ".*(powerpc|ppc)64(el|le).*"),
+                ("cc_on_ppc64",    ".*(powerpc|ppc)64.*"),
+                ("cc_on_armhf",    ".*arm.*"),
+                ("cc_on_aarch64",  ".*aarch64.*"),
+                # undefined platform
+                ("cc_on_noarch",    ""),
+            ),
+            (
+                ("cc_is_gcc",     r".*(gcc|gnu\-g).*"),
+                ("cc_is_clang",    ".*clang.*"),
+                # intel msvc like
+                ("cc_is_iccw",     ".*(intelw|intelemw|iccw).*"),
+                # intel unix like
+                ("cc_is_icc",      ".*(intel|icc).*"),
+                ("cc_is_msvc",     ".*msvc.*"),
+                ("cc_is_nocc",     ""),
+            ),
+            (
+                ("cc_has_debug",  ".*(O0|Od|ggdb|coverage|debug:full).*"),
+            ),
+            (
+                ("cc_has_native", ".*(-march=native|-xHost|/QxHost).*"),
+            ),
+            # in case if the class run with -DNPY_DISABLE_OPTIMIZATION
+            (
+                ("cc_noopt", ".*DISABLE_OPT.*"),
+            ),
+        )
+        # every detectable attribute starts as False
+        for group in detect_table:
+            for attr_name, _ in group:
+                setattr(self, attr_name, False)
+
+        dist_info = self.dist_info()
+        for group in detect_table:
+            for attr_name, pattern in group:
+                if not pattern or re.match(pattern, dist_info, re.IGNORECASE):
+                    setattr(self, attr_name, True)
+                    break
+
+        if self.cc_on_noarch:
+            self.dist_log(
+                "unable to detect CPU arch via compiler info, "
+                "optimization is disabled \ninfo << %s >> " % dist_info,
+                stderr=True
+            )
+            self.cc_noopt = True
+
+        if self.conf_noopt:
+            self.dist_log("Optimization is disabled by the Config", stderr=True)
+            self.cc_noopt = True
+
+        if self.cc_is_nocc:
+            # mingw can be treated as a gcc, and also xlc even if it based
+            # on clang, but still has the same gcc optimization flags.
+            self.dist_log(
+                "unable to detect compiler name via info <<\n%s\n>> "
+                "treating it as a gcc" % dist_info,
+                stderr=True
+            )
+            self.cc_is_gcc = True
+
+        # the name of the first detected architecture / compiler
+        known_archs = ("x86", "x64", "ppc64", "ppc64le", "armhf", "aarch64")
+        self.cc_march = next(
+            (arch for arch in known_archs if getattr(self, "cc_on_" + arch)),
+            "unknown"
+        )
+        known_compilers = ("gcc", "clang", "iccw", "icc", "msvc")
+        self.cc_name = next(
+            (cc for cc in known_compilers if getattr(self, "cc_is_" + cc)),
+            "unknown"
+        )
+
+        self.cc_flags = {}
+        compiler_flags = self.conf_cc_flags.get(self.cc_name)
+        if compiler_flags is None:
+            self.dist_fatal(
+                "undefined flag for compiler '%s', "
+                "leave an empty dict instead" % self.cc_name
+            )
+        # keep only the flags that the compiler accepts
+        for name, flags in compiler_flags.items():
+            supported = []
+            if flags:
+                assert(isinstance(flags, str))
+                supported = [
+                    f for f in flags.split() if self.cc_test_flags([f])
+                ]
+            self.cc_flags[name] = supported
+
+        self.cc_is_cached = True
+
+    @_Cache.me
+    def cc_test_flags(self, flags):
+        """
+        Check whether the compiler accepts the list 'flags' by compiling
+        the source "test_flags.c" from `conf_check_path` with them.
+
+        Returns the result of `dist_test()`, True when the flags are
+        supported; a failure is logged as a warning.
+        """
+        assert(isinstance(flags, list))
+        self.dist_log("testing flags", flags)
+        source = os.path.join(self.conf_check_path, "test_flags.c")
+        supported = self.dist_test(source, flags)
+        if supported:
+            return supported
+        self.dist_log("testing failed", stderr=True)
+        return supported
+
+    def cc_normalize_flags(self, flags):
+        """
+        Remove the conflicts caused by gathering the flags of implied
+        features.
+
+        Parameters
+        ----------
+        'flags' list, compiler flags
+            flags should be sorted from the lowest to the highest interest.
+
+        Returns
+        -------
+        list, filtered from any conflicts; the flags are returned as they
+        are when the compiler is neither unix-like nor msvc-like.
+
+        Examples
+        --------
+        >>> self.cc_normalize_flags(['-march=armv8.2-a+fp16', '-march=armv8.2-a+dotprod'])
+        ['-march=armv8.2-a+fp16+dotprod']
+
+        >>> self.cc_normalize_flags(
+        ...     ['-msse', '-msse2', '-msse3', '-mssse3', '-msse4.1', '-msse4.2', '-mavx', '-march=core-avx2']
+        ... )
+        ['-march=core-avx2']
+        """
+        assert(isinstance(flags, list))
+        unix_like = self.cc_is_gcc or self.cc_is_clang or self.cc_is_icc
+        if unix_like:
+            return self._cc_normalize_unix(flags)
+
+        win_like = self.cc_is_msvc or self.cc_is_iccw
+        if not win_like:
+            return flags
+        return self._cc_normalize_win(flags)
+
+    # Matches the flags that begin with "-mcpu=", "-march=" or with "-x"
+    # followed by an upper-case letter, a digit or a dash.
+    _cc_normalize_unix_mrgx = re.compile(
+        # 1- to check the highest of
+        r"^(-mcpu=|-march=|-x[A-Z0-9\-])"
+    )
+    # Matches only the flags that are kept among the ones located before
+    # the highest flag, e.g. "-O3" or "-mfpu=neon" are kept while
+    # "-march=core-avx2" and "-msse4.1" are dropped.
+    _cc_normalize_unix_frgx = re.compile(
+        # 2- to remove any flags starts with
+        # -march, -mcpu, -x(INTEL) and '-m' without '='
+        r"^(?!(-mcpu=|-march=|-x[A-Z0-9\-]))(?!-m[a-z0-9\-\.]*.$)"
+    )
+    # Matches the flags that begin with "-mfpu" or "-mtune".
+    _cc_normalize_unix_krgx = re.compile(
+        # 3- keep only the highest of
+        r"^(-mfpu|-mtune)"
+    )
+    # Matches a single digit or dot, used to extract the version number
+    # out of a flag.
+    _cc_normalize_arch_ver = re.compile(
+        r"[0-9.]"
+    )
+    def _cc_normalize_unix(self, flags):
+        """Remove the conflicts among flags of unix-like compilers.
+
+        1. The last flag that matches `_cc_normalize_unix_mrgx` is located,
+           and from the flags before it only the ones that match
+           `_cc_normalize_unix_frgx` are kept.
+        2. If that flag has a non-zero version number and '+' sub-flags,
+           the sub-flags of every earlier flag that has the same version
+           number are merged into it.
+        3. For each prefix in `_cc_normalize_unix_krgx`, only the last flag
+           that starts with it is kept.
+
+        A list with less than two flags is returned as it is.
+        """
+        def ver_flags(f):
+            # Split a flag into (version number, flag without sub-flags,
+            # list of sub-flags); the version number is built from all
+            # the digits and dots of the part before the first '+',
+            # 0.0 when there are none.
+            #        arch ver  subflag
+            # -march=armv8.2-a+fp16fml
+            tokens = f.split('+')
+            ver = float('0' + ''.join(
+                re.findall(self._cc_normalize_arch_ver, tokens[0])
+            ))
+            return ver, tokens[0], tokens[1:]
+
+        if len(flags) <= 1:
+            return flags
+        # get the highest matched flag
+        for i, cur_flag in enumerate(reversed(flags)):
+            if not re.match(self._cc_normalize_unix_mrgx, cur_flag):
+                continue
+            # the flags located before the matched one
+            lower_flags = flags[:-(i+1)]
+            # the flags located after the matched one; only valid when
+            # i > 0 since flags[-0:] is the whole list, see the guard below
+            upper_flags = flags[-i:]
+            filterd = list(filter(
+                self._cc_normalize_unix_frgx.search, lower_flags
+            ))
+            # gather subflags
+            ver, arch, subflags = ver_flags(cur_flag)
+            if ver > 0 and len(subflags) > 0:
+                for xflag in lower_flags:
+                    xver, _, xsubflags = ver_flags(xflag)
+                    if ver == xver:
+                        # the sub-flags of the lower flags come first
+                        subflags = xsubflags + subflags
+                cur_flag = arch + '+' + '+'.join(subflags)
+
+            flags = filterd + [cur_flag]
+            if i > 0:
+                flags += upper_flags
+            break
+
+        # to remove overridable flags
+        final_flags = []
+        matched = set()
+        # walk backward so the last occurrence of each prefix is the one
+        # that is kept, insert at the front to preserve the original order
+        for f in reversed(flags):
+            match = re.match(self._cc_normalize_unix_krgx, f)
+            if not match:
+                pass
+            elif match[0] in matched:
+                continue
+            else:
+                matched.add(match[0])
+            final_flags.insert(0, f)
+        return final_flags
+
+    # flags below the winner survive only when they do not start with
+    _cc_normalize_win_frgx = re.compile(r"^(?!(/arch\:|/Qx\:))")
+    # the winning flag is the highest one that starts with
+    _cc_normalize_win_mrgx = re.compile(r"^(/arch|/Qx:)")
+
+    def _cc_normalize_win(self, flags):
+        """
+        Normalize flags of msvc-like command lines.
+
+        The highest flag matching `_cc_normalize_win_mrgx` and every flag
+        after it are kept; the flags before it are filtered through
+        `_cc_normalize_win_frgx`. Without such a flag, 'flags' is
+        returned as-is.
+        """
+        for pos in range(len(flags) - 1, -1, -1):
+            if self._cc_normalize_win_mrgx.match(flags[pos]) is None:
+                continue
+            lower = [
+                f for f in flags[:pos]
+                if self._cc_normalize_win_frgx.search(f)
+            ]
+            return lower + flags[pos:]
+        return flags
+
+class _Feature:
+    """A helper class for `CCompilerOpt` that manages CPU features.
+
+    Attributes
+    ----------
+    feature_supported : dict
+        Dictionary containing all CPU features that are supported
+        by the platform, according to the specified values in attribute
+        `_Config.conf_features` and `_Config.conf_features_partial()`
+
+    feature_min : set
+        The minimum support of CPU features, according to
+        the specified values in attribute `_Config.conf_min_features`.
+    """
+    def __init__(self):
+        """
+        Initialize `feature_supported` and `feature_min`.
+
+        Nothing is done when the attribute `feature_is_cached` already
+        exists on the instance.
+        """
+        if hasattr(self, "feature_is_cached"):
+            return
+        self.feature_supported = pfeatures = self.conf_features_partial()
+        for feature_name in list(pfeatures.keys()):
+            feature  = pfeatures[feature_name]
+            cfeature = self.conf_features[feature_name]
+            # options of the partial definition win, `conf_features`
+            # only fills in the options that are missing
+            feature.update({
+                k:v for k,v in cfeature.items() if k not in feature
+            })
+            disabled = feature.get("disable")
+            if disabled is not None:
+                pfeatures.pop(feature_name)
+                self.dist_log(
+                    "feature '%s' is disabled," % feature_name,
+                    disabled, stderr=True
+                )
+                continue
+            # list is used internally for these options
+            for option in (
+                "implies", "group", "detect", "headers", "flags"
+            ) :
+                oval = feature.get(option)
+                if isinstance(oval, str):
+                    feature[option] = oval.split()
+
+        self.feature_min = set()
+        min_f = self.conf_min_features.get(self.cc_march, "")
+        for F in min_f.upper().split():
+            if F in self.feature_supported:
+                self.feature_min.add(F)
+
+        self.feature_is_cached = True
+
+    def feature_names(self, names=None, force_flags=None):
+        """
+        Returns a set of CPU feature names that are supported by the
+        platform and the **C** compiler.
+
+        Parameters
+        ----------
+        'names': sequence or None, optional
+            Specify certain CPU features to test it against the **C** compiler.
+            if None(default), it will test all current supported features.
+            **Note**: feature names must be in upper-case.
+
+        'force_flags': list or None, optional
+            If None(default), default compiler flags for every CPU feature will be used
+            during the test.
+
+        Returns
+        -------
+        set, the names for which `feature_is_supported()` is true.
+        """
+        assert(
+            names is None or (
+                not isinstance(names, str) and
+                hasattr(names, "__iter__")
+            )
+        )
+        assert(force_flags is None or isinstance(force_flags, list))
+        if names is None:
+            names = self.feature_supported.keys()
+        supported_names = set()
+        for f in names:
+            if self.feature_is_supported(f, force_flags=force_flags):
+                supported_names.add(f)
+        return supported_names
+
+    def feature_is_exist(self, name):
+        """
+        Returns True if a certain feature exists and is covered within
+        `_Config.conf_features`.
+
+        Parameters
+        ----------
+        'name': str
+            feature name in uppercase.
+        """
+        assert(name.isupper())
+        return name in self.conf_features
+
+    def feature_sorted(self, names, reverse=False):
+        """
+        Sort a list of CPU features ordered by the lowest interest.
+
+        Parameters
+        ----------
+        'names': sequence
+            sequence of supported feature names in uppercase. An item
+            may also be a sequence of names (multiple features), which
+            is ranked by its highest interest plus its length minus one.
+        'reverse': bool, optional
+            If true, the sorted features is reversed. (highest interest)
+
+        Returns
+        -------
+        list, sorted CPU features
+        """
+        def sort_cb(k):
+            if isinstance(k, str):
+                return self.feature_supported[k]["interest"]
+            # multiple features
+            rank = max([self.feature_supported[f]["interest"] for f in k])
+            # FIXME: that's not a safe way to increase the rank for
+            # multi targets
+            rank += len(k) -1
+            return rank
+        return sorted(names, reverse=reverse, key=sort_cb)
+
+    def feature_implies(self, names, keep_origins=False):
+        """
+        Return a set of CPU features that implied by 'names'
+
+        Parameters
+        ----------
+        names: str or sequence of str
+            CPU feature name(s) in uppercase.
+
+        keep_origins: bool
+            if False(default) then the returned set will not contain any
+            features from 'names'. This case happens only when two features
+            imply each other.
+
+        Examples
+        --------
+        >>> self.feature_implies("SSE3")
+        {'SSE', 'SSE2'}
+        >>> self.feature_implies("SSE2")
+        {'SSE'}
+        >>> self.feature_implies("SSE2", keep_origins=True)
+        # 'SSE2' found here since 'SSE' and 'SSE2' imply each other
+        {'SSE', 'SSE2'}
+        """
+        # names whose implied features were already walked during this
+        # call; an explicit set replaces a mutable default argument
+        visited = set()
+        def get_implies(name):
+            implies = set()
+            d = self.feature_supported[name]
+            for i in d.get("implies", []):
+                implies.add(i)
+                if i in visited:
+                    # infinity recursive guard since
+                    # features can imply each other
+                    continue
+                visited.add(name)
+                implies = implies.union(get_implies(i))
+            return implies
+
+        if isinstance(names, str):
+            implies = get_implies(names)
+            names = [names]
+        else:
+            assert(hasattr(names, "__iter__"))
+            implies = set()
+            for n in names:
+                implies = implies.union(get_implies(n))
+        if not keep_origins:
+            implies.difference_update(names)
+        return implies
+
+    def feature_implies_c(self, names):
+        """same as feature_implies() but combining 'names'"""
+        if isinstance(names, str):
+            names = {names}
+        else:
+            names = set(names)
+        return names.union(self.feature_implies(names))
+
+    def feature_ahead(self, names):
+        """
+        Return list of features in 'names' after remove any
+        implied features and keep the origins.
+
+        Parameters
+        ----------
+        'names': sequence
+            sequence of CPU feature names in uppercase.
+
+        Returns
+        -------
+        list of CPU features sorted as-is 'names'
+
+        Examples
+        --------
+        >>> self.feature_ahead(["SSE2", "SSE3", "SSE41"])
+        ["SSE41"]
+        # assume AVX2 and FMA3 implies each other and AVX2
+        # is the highest interest
+        >>> self.feature_ahead(["SSE2", "SSE3", "SSE41", "AVX2", "FMA3"])
+        ["AVX2"]
+        # assume AVX2 and FMA3 don't implies each other
+        >>> self.feature_ahead(["SSE2", "SSE3", "SSE41", "AVX2", "FMA3"])
+        ["AVX2", "FMA3"]
+        """
+        assert(
+            not isinstance(names, str)
+            and hasattr(names, '__iter__')
+        )
+        implies = self.feature_implies(names, keep_origins=True)
+        ahead = [n for n in names if n not in implies]
+        if len(ahead) == 0:
+            # return the highest interested feature
+            # if all features imply each other
+            ahead = self.feature_sorted(names, reverse=True)[:1]
+        return ahead
+
+    def feature_untied(self, names):
+        """
+        same as 'feature_ahead()' but only drops a feature when it and
+        another one imply each other, keeping the highest interest.
+
+        Parameters
+        ----------
+        'names': sequence
+            sequence of CPU feature names in uppercase.
+
+        Returns
+        -------
+        list of CPU features sorted as-is 'names'
+
+        Examples
+        --------
+        >>> self.feature_untied(["SSE2", "SSE3", "SSE41"])
+        ["SSE2", "SSE3", "SSE41"]
+        # assume AVX2 and FMA3 implies each other
+        >>> self.feature_untied(["SSE2", "SSE3", "SSE41", "FMA3", "AVX2"])
+        ["SSE2", "SSE3", "SSE41", "AVX2"]
+        """
+        assert(
+            not isinstance(names, str)
+            and hasattr(names, '__iter__')
+        )
+        final = []
+        for n in names:
+            implies = self.feature_implies(n)
+            # already accepted features that form a cycle with 'n'
+            tied = [
+                nn for nn in final
+                if nn in implies and n in self.feature_implies(nn)
+            ]
+            if tied:
+                tied = self.feature_sorted(tied + [n])
+                if n not in tied[1:]:
+                    # 'n' has the lowest interest of the tie, skip it
+                    continue
+                final.remove(tied[0])
+            final.append(n)
+        return final
+
+    def feature_get_til(self, names, keyisfalse):
+        """
+        same as `feature_implies_c()` but stop collecting implied
+        features when feature's option that provided through
+        parameter 'keyisfalse' is False, also sorting the returned
+        features.
+        """
+        def til(tnames):
+            # sort from highest to lowest interest then cut if "key" is False
+            tnames = self.feature_implies_c(tnames)
+            tnames = self.feature_sorted(tnames, reverse=True)
+            for i, n in enumerate(tnames):
+                if not self.feature_supported[n].get(keyisfalse, True):
+                    tnames = tnames[:i+1]
+                    break
+            return tnames
+
+        if isinstance(names, str) or len(names) <= 1:
+            names = til(names)
+            # normalize the sort
+            names.reverse()
+            return names
+
+        names = self.feature_ahead(names)
+        names = {t for n in names for t in til(n)}
+        return self.feature_sorted(names)
+
+    def feature_detect(self, names):
+        """
+        Return a list of CPU features that required to be detected
+        sorted from the lowest to highest interest.
+
+        For every feature the option "detect" is used; it falls back
+        to the option "group" and then to the feature name itself.
+        """
+        names = self.feature_get_til(names, "implies_detect")
+        detect = []
+        for n in names:
+            d = self.feature_supported[n]
+            detect += d.get("detect", d.get("group", [n]))
+        return detect
+
+    @_Cache.me
+    def feature_flags(self, names):
+        """
+        Return a list of CPU features flags sorted from the lowest
+        to highest interest.
+
+        Flags of a feature are left out when the feature has none or
+        when `cc_test_flags()` rejects them; the gathered flags go
+        through `cc_normalize_flags()`.
+        """
+        names = self.feature_sorted(self.feature_implies_c(names))
+        flags = []
+        for n in names:
+            d = self.feature_supported[n]
+            f = d.get("flags", [])
+            if not f or not self.cc_test_flags(f):
+                continue
+            flags += f
+        return self.cc_normalize_flags(flags)
+
+    @_Cache.me
+    def feature_test(self, name, force_flags=None):
+        """
+        Test a certain CPU feature against the compiler through its own
+        check file.
+
+        Parameters
+        ----------
+        'name': str
+            Supported CPU feature name.
+
+        'force_flags': list or None, optional
+            If None(default), the returned flags from `feature_flags()`
+            will be used.
+
+        Returns
+        -------
+        The result of `dist_test()` on the check file "cpu_<name>.c"
+        (name in lower-case) found in `conf_check_path`.
+        """
+        if force_flags is None:
+            force_flags = self.feature_flags(name)
+
+        self.dist_log(
+            "testing feature '%s' with flags (%s)" % (
+            name, ' '.join(force_flags)
+        ))
+        # Each CPU feature must have C source code contains at
+        # least one intrinsic or instruction related to this feature.
+        test_path = os.path.join(
+            self.conf_check_path, "cpu_%s.c" % name.lower()
+        )
+        if not os.path.exists(test_path):
+            # report the path that was looked up; the name `path` used
+            # here before was never defined and raised NameError
+            self.dist_fatal("feature test file is not exist", test_path)
+
+        test = self.dist_test(test_path, force_flags + self.cc_flags["werror"])
+        if not test:
+            self.dist_log("testing failed", stderr=True)
+        return test
+
+    @_Cache.me
+    def feature_is_supported(self, name, force_flags=None):
+        """
+        Check if a certain CPU feature is supported by the platform and compiler.
+
+        A feature is supported when it is part of `feature_supported`
+        and `feature_test()` passes for it and for every feature it
+        implies.
+
+        Parameters
+        ----------
+        'name': str
+            CPU feature name in uppercase.
+
+        'force_flags': list or None, optional
+            If None(default), default compiler flags for every CPU feature will be used
+            during test.
+        """
+        assert(name.isupper())
+        assert(force_flags is None or isinstance(force_flags, list))
+
+        supported = name in self.feature_supported
+        if supported:
+            for impl in self.feature_implies(name):
+                if not self.feature_test(impl, force_flags):
+                    return False
+            if not self.feature_test(name, force_flags):
+                return False
+        return supported
+
+    @_Cache.me
+    def feature_can_autovec(self, name):
+        """
+        check if the feature can be auto-vectorized by the compiler
+
+        The option "autovec" of the feature decides when it is set,
+        otherwise at least one of the feature flags has to pass
+        `cc_test_flags()` on its own.
+        """
+        assert(isinstance(name, str))
+        d = self.feature_supported[name]
+        can = d.get("autovec", None)
+        if can is None:
+            valid_flags = [
+                self.cc_test_flags([f]) for f in d.get("flags", [])
+            ]
+            can = valid_flags and any(valid_flags)
+        return can
+
+    def feature_c_preprocessor(self, feature_name, tabs=0):
+        """
+        Generate C preprocessor definitions and include headers of a CPU feature.
+
+        Parameters
+        ----------
+        'feature_name': str
+            CPU feature name in uppercase.
+        'tabs': int
+            if > 0, align the generated strings to the right depend on number of tabs.
+
+        Returns
+        -------
+        str, generated C preprocessor
+
+        Examples
+        --------
+        >>> self.feature_c_preprocessor("SSE3")
+        /** SSE3 **/
+        #define NPY_HAVE_SSE3 1
+        #include <pmmintrin.h>
+        """
+        assert(feature_name.isupper())
+        feature = self.feature_supported.get(feature_name)
+        assert(feature is not None)
+
+        prepr = [
+            "/** %s **/" % feature_name,
+            "#define %sHAVE_%s 1" % (self.conf_c_prefix, feature_name)
+        ]
+        prepr += [
+            "#include <%s>" % h for h in feature.get("headers", [])
+        ]
+        group = feature.get("group", [])
+        for f in group:
+            # Guard features in case of duplicate definitions
+            prepr += [
+                "#ifndef %sHAVE_%s" % (self.conf_c_prefix, f),
+                "\t#define %sHAVE_%s 1" % (self.conf_c_prefix, f),
+                "#endif",
+            ]
+        if tabs > 0:
+            prepr = [('\t'*tabs) + l for l in prepr]
+        return '\n'.join(prepr)
+
+class _Parse:
+    """A helper class that parsing main arguments of `CCompilerOpt`,
+    also parsing configuration statements in dispatch-able sources.
+
+    Parameters
+    ----------
+    cpu_baseline: str or None
+        minimal set of required CPU features or special options.
+
+    cpu_dispatch: str or None
+        dispatched set of additional CPU features or special options.
+
+    Special options can be:
+        - **MIN**: Enables the minimum CPU features that utilized via `_Config.conf_min_features`
+        - **MAX**: Enables all supported CPU features by the Compiler and platform.
+        - **NATIVE**: Enables all CPU features that supported by the current machine.
+        - **NONE**: Enables nothing
+        - **Operand +/-**: remove or add features, useful with options **MAX**, **MIN** and **NATIVE**.
+            NOTE: operand + is only added for nominal reason.
+
+    NOTES:
+        - Case-insensitive among all CPU features and special options.
+        - Comma or space can be used as a separator.
+        - If the CPU feature is not supported by the user platform or compiler,
+          it will be skipped rather than raising a fatal error.
+        - Any specified CPU features to 'cpu_dispatch' will be skipped if its part of CPU baseline features
+        - 'cpu_baseline' force enables implied features.
+
+    Attributes
+    ----------
+    parse_baseline_names : list
+        Final CPU baseline's feature names(sorted from low to high)
+    parse_baseline_flags : list
+        Compiler flags of baseline features
+    parse_dispatch_names : list
+        Final CPU dispatch-able feature names(sorted from low to high)
+    parse_target_groups : dict
+        Dictionary containing initialized target groups that configured
+        through class attribute `conf_target_groups`.
+
+        The key is represent the group name and value is a tuple
+        contains three items :
+            - bool, True if group has the 'baseline' option.
+            - list, list of CPU features.
+            - list, list of extra compiler flags.
+
+    """
+    def __init__(self, cpu_baseline, cpu_dispatch):
+        """
+        Parse the arguments 'cpu_baseline' and 'cpu_dispatch', then
+        initialize the target groups of `conf_target_groups`.
+
+        The policies table is always rebuilt; everything else is skipped
+        when the attribute `parse_is_cached` already exists.
+        """
+        self._parse_policies = {
+            # POLICY NAME: (HAVE, NOT HAVE, [DEB])
+            "KEEP_BASELINE": (
+                None, self._parse_policy_not_keepbase, []
+            ),
+            "KEEP_SORT": (
+                self._parse_policy_keepsort,
+                self._parse_policy_not_keepsort, []
+            ),
+            "MAXOPT": (self._parse_policy_maxopt, None, []),
+            "WERROR": (self._parse_policy_werror, None, []),
+            "AUTOVEC": (self._parse_policy_autovec, None, ["MAXOPT"]),
+        }
+        if hasattr(self, "parse_is_cached"):
+            return
+
+        self.parse_baseline_names = []
+        self.parse_baseline_flags = []
+        self.parse_dispatch_names = []
+        self.parse_target_groups = {}
+
+        if self.cc_noopt:
+            # baseline and dispatch args are ignored, target groups
+            # are still parsed
+            cpu_baseline = cpu_dispatch = None
+
+        self.dist_log("check requested baseline")
+        if cpu_baseline is not None:
+            requested = self._parse_arg_features("cpu_baseline", cpu_baseline)
+            supported = self.feature_names(requested)
+            self.parse_baseline_flags = self.feature_flags(supported)
+            with_implied = self.feature_implies_c(supported)
+            self.parse_baseline_names = self.feature_sorted(with_implied)
+
+        self.dist_log("check requested dispatch-able features")
+        if cpu_dispatch is not None:
+            requested = self._parse_arg_features("cpu_dispatch", cpu_dispatch)
+            remaining = {
+                name for name in requested
+                if name not in self.parse_baseline_names
+            }
+            in_baseline = requested.difference(remaining)
+            supported = self.feature_names(remaining)
+            self.parse_dispatch_names = self.feature_sorted(supported)
+            if len(in_baseline) > 0:
+                self.dist_log(
+                    "skip features", in_baseline, "since its part of baseline"
+                )
+
+        self.dist_log("initialize targets groups")
+        for group_name, tokens in self.conf_target_groups.items():
+            self.dist_log("parse target group", group_name)
+            GROUP_NAME = group_name.upper()
+            if tokens and tokens.strip():
+                has_baseline, features, extra_flags = \
+                    self._parse_target_tokens(tokens)
+                group = (has_baseline, features, extra_flags)
+            else:
+                # empty groups are allowed, handy for disabling a group
+                # since '_parse_target_tokens()' requires at least one
+                # valid target
+                group = (False, [], [])
+            self.parse_target_groups[GROUP_NAME] = group
+
+        self.parse_is_cached = True
+
+    def parse_targets(self, source):
+        """
+        Read the configuration statements of a dispatch-able source and
+        parse them.
+
+        The statements are expected near the top of the source, inside a
+        **C** comment, starting right after the mark **@targets** and
+        running until the comment closer. They are keywords naming CPU
+        features, target groups and policies, and are handed over to
+        `_parse_target_tokens()`.
+
+        Parameters
+        ----------
+        source: str
+            the path of **C** source file.
+
+        Returns
+        -------
+        - bool, True if group has the 'baseline' option
+        - list, list of CPU features
+        - list, list of extra compiler flags
+        """
+        self.dist_log("looking for '@targets' inside -> ", source)
+        start_with = "@targets"
+        end_with = "*/"
+        max_to_reach = 1000 # good enough, isn't?
+        start_pos = end_pos = -1
+        collected = [] # lines from the one holding the mark onwards
+        consumed = 0   # length of the collected lines before the current one
+        # get lines between /*@targets and */
+        with open(source) as fd:
+            for current_line, line in enumerate(fd):
+                if current_line == max_to_reach:
+                    self.dist_fatal("reached the max of lines")
+                    break
+                if start_pos == -1:
+                    found = line.find(start_with)
+                    if found == -1:
+                        continue
+                    start_pos = found + len(start_with)
+                collected.append(line)
+                closer = line.find(end_with)
+                if closer != -1:
+                    # make the position relative to the collected text
+                    end_pos = closer + consumed
+                    break
+                consumed += len(line)
+
+        if start_pos == -1:
+            self.dist_fatal("expected to find '%s' within a C comment" % start_with)
+        if end_pos == -1:
+            self.dist_fatal("expected to end with '%s'" % end_with)
+
+        tokens = "".join(collected)
+        return self._parse_target_tokens(tokens[start_pos:end_pos])
+
+    # separators are white-spaces and commas, the operands +/- are kept
+    _parse_regex_arg = re.compile(r'\s|[,]|([+-])')
+    def _parse_arg_features(self, arg_name, req_features):
+        """
+        Parse the value of argument 'cpu_baseline' or 'cpu_dispatch'.
+
+        Parameters
+        ----------
+        arg_name: str
+            name of the argument, used in error messages.
+        req_features: str
+            feature names and special options (NONE, NATIVE, MAX, MIN)
+            separated by white-spaces or commas; a name may be preceded
+            by the operand '+' (add, the default) or '-' (remove).
+
+        Returns
+        -------
+        set, the requested feature names in upper-case. Known features
+        that aren't part of `feature_supported` are silently left out.
+        """
+        if not isinstance(req_features, str):
+            self.dist_fatal("expected a string in '%s'" % arg_name)
+
+        final_features = set()
+        # space and comma can be used as a separator
+        tokens = [
+            tok for tok in self._parse_regex_arg.split(req_features) if tok
+        ]
+        append = True # append is the default
+        for tok in tokens:
+            if tok[0] in ("#", "$"):
+                self.dist_fatal(
+                    arg_name, "target groups and policies "
+                    "aren't allowed from arguments, "
+                    "only from dispatch-able sources"
+                )
+            if tok in ('+', '-'):
+                # the operand applies to the next name only
+                append = tok == '+'
+                continue
+
+            TOK = tok.upper() # we use upper-case internally
+            features_to = set()
+            if TOK == "NONE":
+                pass
+            elif TOK == "NATIVE":
+                native = self.cc_flags["native"]
+                if not native:
+                    self.dist_fatal(arg_name,
+                        "native option isn't supported by the compiler"
+                    )
+                features_to = self.feature_names(force_flags=native)
+            elif TOK == "MAX":
+                features_to = self.feature_supported.keys()
+            elif TOK == "MIN":
+                features_to = self.feature_min
+            elif TOK in self.feature_supported:
+                features_to.add(TOK)
+            elif not self.feature_is_exist(TOK):
+                self.dist_fatal(arg_name,
+                    ", '%s' isn't a known feature or option" % tok
+                )
+
+            if append:
+                combine = final_features.union
+            else:
+                combine = final_features.difference
+            final_features = combine(features_to)
+
+            append = True # back to default
+
+        return final_features
+
+    # tokens are separated by white-spaces, '*', ',' and '/',
+    # parentheses are kept as tokens of their own
+    _parse_regex_target = re.compile(r'\s|[*,/]|([()])')
+    def _parse_target_tokens(self, tokens):
+        """
+        Parse the configuration statements of a target group or of
+        a '@targets' comment.
+
+        A token can be:
+            - '$NAME', a policy
+            - '#NAME', a target group
+            - '(' or ')', opening and closing a multi-target
+            - 'BASELINE' (case-insensitive)
+            - anything else is taken as a CPU feature name
+
+        Parameters
+        ----------
+        tokens: str
+            the statements, at least one token is expected.
+
+        Returns
+        -------
+        - bool, True if the 'baseline' option was met
+        - list, targets; a multi-target that still holds more than one
+          feature is stored as a tuple
+        - list, extra compiler flags
+
+        All three values are passed through the callbacks of the policies
+        before they are returned.
+        """
+        assert(isinstance(tokens, str))
+        final_targets = [] # to keep it sorted as specified
+        extra_flags = []
+        has_baseline = False
+
+        skipped  = set()
+        policies = set()
+        # None when outside parentheses, otherwise the set of names
+        # gathered since the opening '('
+        multi_target = None
+
+        tokens = list(filter(None, re.split(self._parse_regex_target, tokens)))
+        if not tokens:
+            self.dist_fatal("expected one token at least")
+
+        for tok in tokens:
+            TOK = tok.upper()
+            ch = tok[0]
+            if ch in ('+', '-'):
+                self.dist_fatal(
+                    "+/- are 'not' allowed from target's groups or @targets, "
+                    "only from cpu_baseline and cpu_dispatch parms"
+                )
+            elif ch == '$':
+                if multi_target is not None:
+                    self.dist_fatal(
+                        "policies aren't allowed inside multi-target '()'"
+                        ", only CPU features"
+                    )
+                policies.add(self._parse_token_policy(TOK))
+            elif ch == '#':
+                if multi_target is not None:
+                    self.dist_fatal(
+                        "target groups aren't allowed inside multi-target '()'"
+                        ", only CPU features"
+                    )
+                has_baseline, final_targets, extra_flags = \
+                self._parse_token_group(TOK, has_baseline, final_targets, extra_flags)
+            elif ch == '(':
+                if multi_target is not None:
+                    self.dist_fatal("unclosed multi-target, missing ')'")
+                multi_target = set()
+            elif ch == ')':
+                if multi_target is None:
+                    self.dist_fatal("multi-target opener '(' wasn't found")
+                targets = self._parse_multi_target(multi_target)
+                if targets is None:
+                    skipped.add(tuple(multi_target))
+                else:
+                    # a multi-target of a single feature is stored as
+                    # a plain feature name
+                    if len(targets) == 1:
+                        targets = targets[0]
+                    if targets and targets not in final_targets:
+                        final_targets.append(targets)
+                multi_target = None # back to default
+            else:
+                if TOK == "BASELINE":
+                    if multi_target is not None:
+                        self.dist_fatal("baseline isn't allowed inside multi-target '()'")
+                    has_baseline = True
+                    continue
+
+                if multi_target is not None:
+                    # validated later, once the closing ')' is met
+                    multi_target.add(TOK)
+                    continue
+
+                if not self.feature_is_exist(TOK):
+                    self.dist_fatal("invalid target name '%s'" % TOK)
+
+                is_enabled = (
+                    TOK in self.parse_baseline_names or
+                    TOK in self.parse_dispatch_names
+                )
+                if  is_enabled:
+                    if TOK not in final_targets:
+                        final_targets.append(TOK)
+                    continue
+
+                # a known feature that is neither part of the baseline
+                # nor of the dispatch-able features
+                skipped.add(TOK)
+
+        if multi_target is not None:
+            self.dist_fatal("unclosed multi-target, missing ')'")
+        if skipped:
+            self.dist_log(
+                "skip targets", skipped,
+                "not part of baseline or dispatch-able features"
+            )
+
+        final_targets = self.feature_untied(final_targets)
+
+        # add policies dependencies
+        for p in list(policies):
+            _, _, deps = self._parse_policies[p]
+            for d in deps:
+                if d in policies:
+                    continue
+                self.dist_log(
+                    "policy '%s' force enables '%s'" % (
+                    p, d
+                ))
+                policies.add(d)
+
+        # run the callbacks of the policies: 'have' for the requested
+        # policies and 'nhave' for the rest, when one is defined
+        for p, (have, nhave, _) in self._parse_policies.items():
+            func = None
+            if p in policies:
+                func = have
+                self.dist_log("policy '%s' is ON" % p)
+            else:
+                func = nhave
+            if not func:
+                continue
+            has_baseline, final_targets, extra_flags = func(
+                has_baseline, final_targets, extra_flags
+            )
+
+        return has_baseline, final_targets, extra_flags
+
+    def _parse_token_policy(self, token):
+        """
+        Validate a policy token.
+
+        'token' is the policy name preceded by its mark; the name
+        without the mark is returned when it is a key of
+        `_parse_policies`.
+        """
+        too_short = len(token) <= 1
+        if too_short or token[-1] == token[0]:
+            self.dist_fatal("'$' must stuck in the begin of policy name")
+        name = token[1:]
+        if name in self._parse_policies:
+            return name
+        self.dist_fatal(
+            "'%s' is an invalid policy name, available policies are" % name,
+            self._parse_policies.keys()
+        )
+        return name
+
+    def _parse_token_group(self, token, has_baseline, final_targets, extra_flags):
+        """
+        Validate a target group token and merge the group into the
+        current state.
+
+        'token' is the group name preceded by its mark. The lists
+        'final_targets' and 'extra_flags' are extended in place with the
+        group items they don't hold yet, and 'has_baseline' becomes True
+        when the group has the baseline option.
+        """
+        too_short = len(token) <= 1
+        if too_short or token[-1] == token[0]:
+            self.dist_fatal("'#' must stuck in the begin of group name")
+
+        name = token[1:]
+        unknown = (False, None, [])
+        group_baseline, group_targets, group_flags = \
+            self.parse_target_groups.get(name, unknown)
+        if group_targets is None:
+            self.dist_fatal(
+                "'%s' is an invalid target group name, " % name +
+                "available target groups are",
+                self.parse_target_groups.keys()
+            )
+        has_baseline = True if group_baseline else has_baseline
+        # always keep sorting as specified
+        new_targets = [t for t in group_targets if t not in final_targets]
+        final_targets.extend(new_targets)
+        new_flags = [f for f in group_flags if f not in extra_flags]
+        extra_flags.extend(new_flags)
+        return has_baseline, final_targets, extra_flags
+
+    def _parse_multi_target(self, targets):
+        """Validate and normalize a multi-target written between parentheses.
+
+        Returns a sorted tuple of the remaining features, or None when the
+        multi-target has to be skipped.
+        """
+        if not targets:
+            self.dist_fatal("empty multi-target '()'")
+        exists = [self.feature_is_exist(tar) for tar in targets]
+        if not all(exists):
+            self.dist_fatal("invalid target name in multi-target", targets)
+        # skipped (not fatal) when a member is neither baseline nor dispatch
+        enabled = [
+            tar in self.parse_baseline_names or tar in self.parse_dispatch_names
+            for tar in targets
+        ]
+        if not all(enabled):
+            return None
+        # remove any implied features and keep the origins
+        origins = self.feature_ahead(targets)
+        if not origins:
+            return None
+        # sort so equal multi-targets compare equal; a tuple is hashable
+        return tuple(self.feature_sorted(origins))
+
+    def _parse_policy_not_keepbase(self, has_baseline, final_targets, extra_flags):
+        """Policy filter: remove the targets that are baseline features.
+
+        `final_targets` is modified in place; the other two arguments are
+        returned untouched.
+        """
+        baseline = self.parse_baseline_names
+
+        def in_baseline(tar):
+            # a multi-target counts only when all of its members are baseline
+            if isinstance(tar, str):
+                return tar in baseline
+            return all([f in baseline for f in tar])
+
+        skipped = [tar for tar in final_targets if in_baseline(tar)]
+        for tar in skipped:
+            final_targets.remove(tar)
+        if skipped:
+            self.dist_log("skip baseline features", skipped)
+
+        return has_baseline, final_targets, extra_flags
+
+    def _parse_policy_keepsort(self, has_baseline, final_targets, extra_flags):
+        """Log that '$keep_sort' is on; the three arguments are returned as-is."""
+        self.dist_log(
+            "policy 'keep_sort' is on, dispatch-able targets", final_targets, "\n"
+            "are 'not' sorted depend on the highest interest but "
+            "as specified in the dispatch-able source or the extra group"
+        )
+        return has_baseline, final_targets, extra_flags
+
+    def _parse_policy_not_keepsort(self, has_baseline, final_targets, extra_flags):
+        """Re-order the targets with a reversed feature sort (highest interest)."""
+        by_interest = self.feature_sorted(final_targets, reverse=True)
+        return has_baseline, by_interest, extra_flags
+
+    def _parse_policy_maxopt(self, has_baseline, final_targets, extra_flags):
+        """Add the compiler's optimization flags unless the policy is skipped."""
+        skip_reason = None
+        if self.cc_has_debug:
+            skip_reason = "debug mode is detected, policy 'maxopt' is skipped."
+        elif self.cc_noopt:
+            skip_reason = "optimization is disabled, policy 'maxopt' is skipped."
+        if skip_reason:
+            self.dist_log(skip_reason)
+        elif self.cc_flags["opt"]:
+            extra_flags += self.cc_flags["opt"]
+        else:
+            self.dist_log(
+                "current compiler doesn't support optimization flags, "
+                "policy 'maxopt' is skipped", stderr=True)
+        return has_baseline, final_targets, extra_flags
+
+    def _parse_policy_werror(self, has_baseline, final_targets, extra_flags):
+        """Add the compiler's 'warnings as errors' flags when it has any."""
+        werror = self.cc_flags["werror"]
+        if werror:
+            self.dist_log("compiler warnings are treated as errors")
+            extra_flags += werror
+            return has_baseline, final_targets, extra_flags
+        self.dist_log(
+            "current compiler doesn't support werror flags, "
+            "warnings will 'not' treated as errors", stderr=True
+        )
+        return has_baseline, final_targets, extra_flags
+
+    def _parse_policy_autovec(self, has_baseline, final_targets, extra_flags):
+        """Policy filter: drop targets without auto-vectorization support.
+
+        A multi-target is kept only when every member is supported.
+        `final_targets` is modified in place.
+        """
+
+        def can_autovec(tar):
+            members = [tar] if isinstance(tar, str) else tar
+            return all([self.feature_can_autovec(t) for t in members])
+
+        skipped = [tar for tar in final_targets if not can_autovec(tar)]
+        for tar in skipped:
+            final_targets.remove(tar)
+
+        if skipped:
+            self.dist_log("skip non auto-vectorized features", skipped)
+
+        return has_baseline, final_targets, extra_flags
+
+class CCompilerOpt(_Config, _Distutils, _Cache, _CCompiler, _Feature, _Parse):
+    """
+    A helper class for `CCompiler` that provides extra build options to
+    control the compiler optimizations that are directly related to
+    CPU features.
+    """
+    def __init__(self, ccompiler, cpu_baseline="min", cpu_dispatch="max", cache_path=None):
+        """Initialize every base class in order, then the report/cache state."""
+        _Config.__init__(self)
+        _Distutils.__init__(self, ccompiler)
+        _Cache.__init__(self, cache_path, self.dist_info(), cpu_baseline, cpu_dispatch)
+        _CCompiler.__init__(self)
+        _Feature.__init__(self)
+        if not self.cc_noopt and self.cc_has_native:
+            self.dist_log(
+                "native flag is specified through environment variables. "
+                "force cpu-baseline='native'"
+            )
+            cpu_baseline = "native"
+        _Parse.__init__(self, cpu_baseline, cpu_dispatch)
+        # kept for report and trace purposes (baseline may be the forced 'native')
+        self._requested_baseline = cpu_baseline
+        self._requested_dispatch = cpu_dispatch
+        # maps a dispatch-able source to a tuple of two items:
+        # (has_baseline[boolean], dispatched-features[list])
+        self.sources_status = getattr(self, "sources_status", {})
+        # every instance should have a separate one
+        self.cache_private.add("sources_status")
+        # set it at the end so the cache is written after this class is initialized;
+        # True only if the attribute already exists (presumably cache-restored)
+        self.hit_cache = hasattr(self, "hit_cache")
+
+    def is_cached(self):
+        """
+        Return a truthy value if this instance was loaded from the cache file.
+        """
+        return self.cache_infile and self.hit_cache
+
+    def cpu_baseline_flags(self):
+        """
+        Return a list of the final CPU baseline compiler flags.
+        """
+        return self.parse_baseline_flags
+
+    def cpu_baseline_names(self):
+        """
+        Return a list of the final CPU baseline feature names.
+        """
+        return self.parse_baseline_names
+
+    def cpu_dispatch_names(self):
+        """
+        Return a list of the final CPU dispatch feature names.
+        """
+        return self.parse_dispatch_names
+
+    def try_dispatch(self, sources, src_dir=None, **kwargs):
+        """
+        Compile one or more dispatch-able sources and generate object files,
+        also generate the abstract C config headers and macros that are
+        used later for the final runtime dispatching process.
+
+        Each file in 'sources' is branched into several files according to the
+        configuration statements declared at its top (the targeted CPU features),
+        then every branch is compiled with the proper compiler flags.
+
+        Parameters
+        ----------
+        sources : list
+            Must be a list of dispatch-able sources file paths, and configuration
+            statements must be declared inside each file.
+
+        src_dir : str
+            Path of parent directory for the generated headers and wrapped sources.
+            If None(default) the files will generated in-place, otherwise every
+            output directory is also appended to the 'include_dirs' argument.
+
+        **kwargs : any
+            Arguments to pass on to the `CCompiler.compile()`
+
+        Returns
+        -------
+        list : generated object files
+
+        Raises
+        ------
+        CompileError
+            Raises by `CCompiler.compile()` on compiling failure.
+        DistutilsError
+            Some errors during checking the sanity of configuration statements.
+
+        See Also
+        --------
+        parse_targets() :
+            Parsing the configuration statements of dispatch-able sources.
+        """
+        to_compile = {}
+        baseline_flags = self.cpu_baseline_flags()
+        include_dirs = kwargs.setdefault("include_dirs", [])
+
+        for src in sources:
+            output_dir = os.path.dirname(src)
+            if src_dir:
+                if not output_dir.startswith(src_dir):
+                    output_dir = os.path.join(src_dir, output_dir)
+                # expose the generated headers even for sources under 'src_dir'
+                if output_dir not in include_dirs:
+                    include_dirs.append(output_dir)
+
+            has_baseline, targets, extra_flags = self.parse_targets(src)
+            nochange = self._generate_config(output_dir, src, targets, has_baseline)
+            for tar in targets:
+                tar_src = self._wrap_target(output_dir, src, tar, nochange=nochange)
+                flags = tuple(extra_flags + self.feature_flags(tar))
+                to_compile.setdefault(flags, []).append(tar_src)
+
+            if has_baseline:
+                flags = tuple(extra_flags + baseline_flags)
+                to_compile.setdefault(flags, []).append(src)
+
+            self.sources_status[src] = (has_baseline, targets)
+
+        # For these reasons, the sources are compiled in a separate loop:
+        # - Gathering all sources with the same flags to benefit from
+        #   the parallel compiling as much as possible.
+        # - To generate all config headers of the dispatchable sources,
+        #   before the compilation in case if there are dependency relationships
+        #   among them.
+        objects = []
+        for flags, srcs in to_compile.items():
+            objects += self.dist_compile(srcs, list(flags), **kwargs)
+        return objects
+
+    def generate_dispatch_header(self, header_path):
+        """
+        Write to `header_path` the dispatch header, which contains all the
+        definitions and headers of instruction-sets for the enabled CPU
+        baseline and dispatch-able features.
+
+        It's highly recommended to take a look at the generated header and
+        at the source files generated via `try_dispatch()` in order to get
+        the full picture.
+        """
+        self.dist_log("generate CPU dispatch header: (%s)" % header_path)
+
+        baseline_names = self.cpu_baseline_names()
+        dispatch_names = self.cpu_dispatch_names()
+        baseline_len = len(baseline_names)
+        dispatch_len = len(dispatch_names)
+        # NOTE: 'f' inside the comprehensions below is local to them (Python 3)
+        with open(header_path, 'w') as f:
+            baseline_calls = ' \\\n'.join([
+                (
+                    "\t%sWITH_CPU_EXPAND_(MACRO_TO_CALL(%s, __VA_ARGS__))"
+                ) % (self.conf_c_prefix, f)
+                for f in baseline_names
+            ])
+            dispatch_calls = ' \\\n'.join([
+                (
+                    "\t%sWITH_CPU_EXPAND_(MACRO_TO_CALL(%s, __VA_ARGS__))"
+                ) % (self.conf_c_prefix, f)
+                for f in dispatch_names
+            ])
+            f.write(textwrap.dedent("""\
+                /*
+                 * AUTOGENERATED DON'T EDIT
+                 * Please make changes to the code generator (distutils/ccompiler_opt.py)
+                */
+                #define {pfx}WITH_CPU_BASELINE  "{baseline_str}"
+                #define {pfx}WITH_CPU_DISPATCH  "{dispatch_str}"
+                #define {pfx}WITH_CPU_BASELINE_N {baseline_len}
+                #define {pfx}WITH_CPU_DISPATCH_N {dispatch_len}
+                #define {pfx}WITH_CPU_EXPAND_(X) X
+                #define {pfx}WITH_CPU_BASELINE_CALL(MACRO_TO_CALL, ...) \\
+                {baseline_calls}
+                #define {pfx}WITH_CPU_DISPATCH_CALL(MACRO_TO_CALL, ...) \\
+                {dispatch_calls}
+            """).format(
+                pfx=self.conf_c_prefix, baseline_str=" ".join(baseline_names),
+                dispatch_str=" ".join(dispatch_names), baseline_len=baseline_len,
+                dispatch_len=dispatch_len, baseline_calls=baseline_calls,
+                dispatch_calls=dispatch_calls
+            ))
+            baseline_pre = ''
+            for name in baseline_names:
+                baseline_pre += self.feature_c_preprocessor(name, tabs=1) + '\n'
+
+            dispatch_pre = ''
+            for name in dispatch_names:
+                dispatch_pre += textwrap.dedent("""\
+                #ifdef {pfx}CPU_TARGET_{name}
+                {pre}
+                #endif /*{pfx}CPU_TARGET_{name}*/
+                """).format(
+                    pfx=self.conf_c_prefix_, name=name, pre=self.feature_c_preprocessor(
+                    name, tabs=1
+                ))
+
+            f.write(textwrap.dedent("""\
+            /******* baseline features *******/
+            {baseline_pre}
+            /******* dispatch features *******/
+            {dispatch_pre}
+            """).format(
+                pfx=self.conf_c_prefix_, baseline_pre=baseline_pre,
+                dispatch_pre=dispatch_pre
+            ))
+
+    def report(self, full=False):
+        """Return a text table of the enabled baseline and dispatch features;
+        with `full`, each generated target also lists its flags and sources."""
+        report = []
+        baseline_rows = []
+        dispatch_rows = []
+        report.append(("CPU baseline", baseline_rows))
+        report.append(("", ""))
+        report.append(("CPU dispatch", dispatch_rows))
+
+        ########## baseline ##########
+        if self.cc_noopt:
+            baseline_rows.append((
+                "Requested", "optimization disabled %s" % (
+                    "(unsupported arch)" if self.cc_on_noarch else ""
+                )
+            ))
+        else:
+            baseline_rows.append(("Requested", repr(self._requested_baseline)))
+
+        baseline_names = self.cpu_baseline_names()
+        baseline_rows.append((
+            "Enabled", (' '.join(baseline_names) if baseline_names else "none")
+        ))
+        baseline_flags = self.cpu_baseline_flags()
+        baseline_rows.append((
+            "Flags", (' '.join(baseline_flags) if baseline_flags else "none")
+        ))
+
+        ########## dispatch ##########
+        if self.cc_noopt:
+            dispatch_rows.append((
+                "Requested", "optimization disabled %s" % (
+                    "(unsupported arch)" if self.cc_on_noarch else ""
+                )
+            ))
+        else:
+            dispatch_rows.append(("Requested", repr(self._requested_dispatch)))
+
+        dispatch_names = self.cpu_dispatch_names()
+        dispatch_rows.append((
+            "Enabled", (' '.join(dispatch_names) if dispatch_names else "none")
+        ))
+        ########## Generated ##########
+        # TODO:
+        # - collect object names from 'try_dispatch()', then get and print
+        #   the size of each object
+        # - give more details about the features that are not generated
+        #   due to compiler support, and find a better design for the output
+        target_sources = {}
+        for source, (_, targets) in self.sources_status.items():
+            for tar in targets:
+                target_sources.setdefault(tar, []).append(source)
+
+        if not full or not target_sources:
+            generated = ""
+            for tar in self.feature_sorted(target_sources):
+                sources = target_sources[tar]
+                name = tar if isinstance(tar, str) else '(%s)' % ' '.join(tar)
+                generated += name + "[%d] " % len(sources)
+            dispatch_rows.append(("Generated", generated[:-1] if generated else "none"))
+        else:
+            dispatch_rows.append(("Generated", ''))
+            for tar in self.feature_sorted(target_sources):
+                sources = target_sources[tar]
+                name = tar if isinstance(tar, str) else '(%s)' % ' '.join(tar)
+                flags = ' '.join(self.feature_flags(tar))
+                implies = ' '.join(self.feature_sorted(self.feature_implies(tar)))
+                detect = ' '.join(self.feature_detect(tar))
+                dispatch_rows.append(('', ''))
+                dispatch_rows.append((name, implies))
+                dispatch_rows.append(("Flags", flags))
+                dispatch_rows.append(("Detect", detect))
+                for src in sources:
+                    dispatch_rows.append(("", src))
+
+        ###############################
+        # TODO: add support for 'markdown' format
+        text = []
+        secs_len = [len(secs) for secs, _ in report]
+        cols_len = [len(col) for _, rows in report for col, _ in rows]
+        tab = ' ' * 2
+        pad =  max(max(secs_len), max(cols_len))
+        for sec, rows in report:
+            if not sec:
+                text.append("") # empty line
+                continue
+            sec += ' ' * (pad - len(sec))
+            text.append(sec + tab + ': ')
+            for col, val in rows:
+                col += ' ' * (pad - len(col))
+                text.append(tab + col + ': ' + val)
+
+        return '\n'.join(text)
+
+    def _wrap_target(self, output_dir, dispatch_src, target, nochange=False):
+        """Write (if needed) the wrapper source for `target`; return its path."""
+        assert(isinstance(target, (str, tuple)))
+        if isinstance(target, str):
+            ext_name = target_name = target
+        else:  # multi-target
+            ext_name = '.'.join(target)
+            target_name = '__'.join(target)
+
+        wrap_path = os.path.join(output_dir, os.path.basename(dispatch_src))
+        wrap_path = "{0}.{2}{1}".format(*os.path.splitext(wrap_path), ext_name.lower())
+        if nochange and os.path.exists(wrap_path):
+            return wrap_path
+
+        self.dist_log("wrap dispatch-able target -> ", wrap_path)
+        # sorting for readability
+        features = self.feature_sorted(self.feature_implies_c(target))
+        target_join = "#define %sCPU_TARGET_" % self.conf_c_prefix_
+        target_defs = [target_join + f for f in features]
+        target_defs = '\n'.join(target_defs)
+
+        with open(wrap_path, "w") as fd:
+            fd.write(textwrap.dedent("""\
+            /**
+             * AUTOGENERATED DON'T EDIT
+             * Please make changes to the code generator \
+             (distutils/ccompiler_opt.py)
+             */
+            #define {pfx}CPU_TARGET_MODE
+            #define {pfx}CPU_TARGET_CURRENT {target_name}
+            {target_defs}
+            #include "{path}"
+            """).format(
+                pfx=self.conf_c_prefix_, target_name=target_name,
+                path=os.path.abspath(dispatch_src), target_defs=target_defs
+            ))
+        return wrap_path
+
+    def _generate_config(self, output_dir, dispatch_src, targets, has_baseline=False):
+        """
+        Write the config header ('<source name>.h') of `dispatch_src` into
+        `output_dir`; return True if an up-to-date one exists, else False.
+        """
+        config_path = os.path.splitext(os.path.basename(dispatch_src))[0] + ".h"
+        config_path = os.path.join(output_dir, config_path)
+        # check if targets didn't change to avoid recompiling
+        cache_hash = self.cache_hash(targets, has_baseline)
+        try:
+            with open(config_path) as f:
+                last_hash = f.readline().split("cache_hash:")
+                if len(last_hash) == 2 and int(last_hash[1]) == cache_hash:
+                    return True
+        except (IOError, ValueError):  # no header yet, or a garbled hash line
+            pass
+
+        self.dist_log("generate dispatched config -> ", config_path)
+        dispatch_calls = []
+        for tar in targets:
+            target_name = tar if isinstance(tar, str) else '__'.join(tar)
+            req_detect = '&&'.join([
+                "CHK(%s)" % f for f in self.feature_detect(tar)
+            ])
+            dispatch_calls.append(
+                "\t%sCPU_DISPATCH_EXPAND_(CB((%s), %s, __VA_ARGS__))" % (
+                self.conf_c_prefix_, req_detect, target_name
+            ))
+        dispatch_calls = ' \\\n'.join(dispatch_calls)
+
+        if has_baseline:
+            baseline_calls = (
+                "\t%sCPU_DISPATCH_EXPAND_(CB(__VA_ARGS__))"
+            ) % self.conf_c_prefix_
+        else:
+            baseline_calls = ''
+
+        with open(config_path, "w") as fd:
+            fd.write(textwrap.dedent("""\
+            // cache_hash:{cache_hash}
+            /**
+             * AUTOGENERATED DON'T EDIT
+             * Please make changes to the code generator (distutils/ccompiler_opt.py)
+             */
+            #ifndef {pfx}CPU_DISPATCH_EXPAND_
+                #define {pfx}CPU_DISPATCH_EXPAND_(X) X
+            #endif
+            #undef {pfx}CPU_DISPATCH_BASELINE_CALL
+            #undef {pfx}CPU_DISPATCH_CALL
+            #define {pfx}CPU_DISPATCH_BASELINE_CALL(CB, ...) \\
+            {baseline_calls}
+            #define {pfx}CPU_DISPATCH_CALL(CHK, CB, ...) \\
+            {dispatch_calls}
+            """).format(
+                pfx=self.conf_c_prefix_, baseline_calls=baseline_calls,
+                dispatch_calls=dispatch_calls, cache_hash=cache_hash
+            ))
+        return False
+
+def new_ccompiler_opt(compiler, **kwargs):
+    """
+    Create a new instance of 'CCompilerOpt' and, unless the header exists
+    and the instance is cached, generate the dispatch header inside NumPy
+    source dir.
+
+    Parameters
+    ----------
+    'compiler' : CCompiler instance
+    '**kwargs': passed as-is to `CCompilerOpt(...)`
+
+    Returns: new instance of CCompilerOpt
+    """
+    opt = CCompilerOpt(compiler, **kwargs)
+    npy_path = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
+    header_dir = os.path.join(npy_path, "core", "src", "common")
+    header_path = os.path.join(header_dir, "_cpu_dispatch.h")
+    if os.path.exists(header_path) and opt.is_cached():
+        return opt
+    if not os.path.exists(header_dir):
+        opt.dist_log(
+            "dispatch header dir '%s' isn't exist, creating it" % header_dir,
+            stderr=True
+        )
+        os.makedirs(header_dir)
+    opt.generate_dispatch_header(header_path)
+    return opt
diff --git a/numpy/distutils/checks/cpu_asimd.c b/numpy/distutils/checks/cpu_asimd.c
new file mode 100644
index 000000000..8df556b6c
--- /dev/null
+++ b/numpy/distutils/checks/cpu_asimd.c
@@ -0,0 +1,25 @@
+#ifdef _MSC_VER
+    #include <Intrin.h>
+#endif
+#include <arm_neon.h>
+
+int main(void)
+{
+    float32x4_t v1 = vdupq_n_f32(1.0f), v2 = vdupq_n_f32(2.0f);
+    /* MAXMIN: vmaxnmq/vminnmq on float32 lanes */
+    int ret  = (int)vgetq_lane_f32(vmaxnmq_f32(v1, v2), 0);
+        ret += (int)vgetq_lane_f32(vminnmq_f32(v1, v2), 0);
+    /* ROUNDING: vrndq on float32 lanes */
+    ret += (int)vgetq_lane_f32(vrndq_f32(v1), 0);
+#ifdef __aarch64__ /* the float64 forms are built for AArch64 only */
+    {
+        float64x2_t vd1 = vdupq_n_f64(1.0), vd2 = vdupq_n_f64(2.0);
+        /* MAXMIN: vmaxnmq/vminnmq on float64 lanes */
+        ret += (int)vgetq_lane_f64(vmaxnmq_f64(vd1, vd2), 0);
+        ret += (int)vgetq_lane_f64(vminnmq_f64(vd1, vd2), 0);
+        /* ROUNDING: vrndq on float64 lanes */
+        ret += (int)vgetq_lane_f64(vrndq_f64(vd1), 0);
+    }
+#endif
+    return ret; /* every intrinsic result above feeds the return value */
+}
diff --git a/numpy/distutils/checks/cpu_asimddp.c b/numpy/distutils/checks/cpu_asimddp.c
new file mode 100644
index 000000000..0158d1354
--- /dev/null
+++ b/numpy/distutils/checks/cpu_asimddp.c
@@ -0,0 +1,15 @@
+#ifdef _MSC_VER
+    #include <Intrin.h>
+#endif
+#include <arm_neon.h>
+
+int main(void)
+{
+    uint8x16_t v1 = vdupq_n_u8((unsigned char)1), v2 = vdupq_n_u8((unsigned char)2);
+    uint32x4_t va = vdupq_n_u32(3); /* first (32-bit) operand of both vdot calls */
+    int ret = (int)vgetq_lane_u32(vdotq_u32(va, v1, v2), 0);
+#ifdef __aarch64__ /* the by-lane form is built for AArch64 only */
+    ret += (int)vgetq_lane_u32(vdotq_laneq_u32(va, v1, v2, 0), 0);
+#endif
+    return ret; /* the vdot results feed the return value */
+}
diff --git a/numpy/distutils/checks/cpu_asimdfhm.c b/numpy/distutils/checks/cpu_asimdfhm.c
new file mode 100644
index 000000000..bb437aa40
--- /dev/null
+++ b/numpy/distutils/checks/cpu_asimdfhm.c
@@ -0,0 +1,17 @@
+#ifdef _MSC_VER
+    #include <Intrin.h>
+#endif
+#include <arm_neon.h>
+
+int main(void)
+{
+    float16x8_t vhp  = vdupq_n_f16((float16_t)1);
+    float16x4_t vlhp = vdup_n_f16((float16_t)1);
+    float32x4_t vf   = vdupq_n_f32(1.0f);
+    float32x2_t vlf  = vdup_n_f32(1.0f);
+    /* ACLE names end in _f16; the older _u32 spellings are not accepted by current compilers */
+    int ret  = (int)vget_lane_f32(vfmlal_low_f16(vlf, vlhp, vlhp), 0);
+        ret += (int)vgetq_lane_f32(vfmlslq_high_f16(vf, vhp, vhp), 0);
+
+    return ret;
+}
diff --git a/numpy/distutils/checks/cpu_asimdhp.c b/numpy/distutils/checks/cpu_asimdhp.c
new file mode 100644
index 000000000..80b94000f
--- /dev/null
+++ b/numpy/distutils/checks/cpu_asimdhp.c
@@ -0,0 +1,14 @@
+#ifdef _MSC_VER
+    #include <Intrin.h>
+#endif
+#include <arm_neon.h>
+
+int main(void)
+{
+    float16x8_t vhp  = vdupq_n_f16((float16_t)-1);
+    float16x4_t vlhp = vdup_n_f16((float16_t)-1);
+    /* vabd on the 8-lane and on the 4-lane half-float vectors */
+    int ret  =  (int)vgetq_lane_f16(vabdq_f16(vhp, vhp), 0);
+        ret  += (int)vget_lane_f16(vabd_f16(vlhp, vlhp), 0);
+    return ret;
+}
diff --git a/numpy/distutils/checks/cpu_avx.c b/numpy/distutils/checks/cpu_avx.c
new file mode 100644
index 000000000..737c0d2e9
--- /dev/null
+++ b/numpy/distutils/checks/cpu_avx.c
@@ -0,0 +1,7 @@
+#include <immintrin.h>
+
+int main(void)
+{
+    __m256 a = _mm256_add_ps(_mm256_setzero_ps(), _mm256_setzero_ps()); /* 256-bit float add */
+    return (int)_mm_cvtss_f32(_mm256_castps256_ps128(a)); /* lowest lane of `a` is the exit value */
+}
diff --git a/numpy/distutils/checks/cpu_avx2.c b/numpy/distutils/checks/cpu_avx2.c
new file mode 100644
index 000000000..dfb11fd79
--- /dev/null
+++ b/numpy/distutils/checks/cpu_avx2.c
@@ -0,0 +1,7 @@
+#include <immintrin.h>
+
+int main(void)
+{
+    __m256i a = _mm256_abs_epi16(_mm256_setzero_si256()); /* 256-bit integer op */
+    return _mm_cvtsi128_si32(_mm256_castsi256_si128(a)); /* lowest 32 bits of `a` are the exit value */
+}
diff --git a/numpy/distutils/checks/cpu_avx512_clx.c b/numpy/distutils/checks/cpu_avx512_clx.c
new file mode 100644
index 000000000..71dad83a7
--- /dev/null
+++ b/numpy/distutils/checks/cpu_avx512_clx.c
@@ -0,0 +1,8 @@
+#include <immintrin.h>
+
+int main(void)
+{
+    /* VNNI: _mm512_dpbusd_epi32 */
+    __m512i a = _mm512_dpbusd_epi32(_mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512());
+    return _mm_cvtsi128_si32(_mm512_castsi512_si128(a)); /* lowest 32 bits of `a` are the exit value */
+}
diff --git a/numpy/distutils/checks/cpu_avx512_cnl.c b/numpy/distutils/checks/cpu_avx512_cnl.c
new file mode 100644
index 000000000..dfab4436d
--- /dev/null
+++ b/numpy/distutils/checks/cpu_avx512_cnl.c
@@ -0,0 +1,10 @@
+#include <immintrin.h>
+
+int main(void)
+{
+    /* IFMA */
+    __m512i a = _mm512_madd52hi_epu64(_mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512());
+    /* VBMI */
+    a = _mm512_permutex2var_epi8(a, _mm512_setzero_si512(), _mm512_setzero_si512());
+    return _mm_cvtsi128_si32(_mm512_castsi512_si128(a));
+}
diff --git a/numpy/distutils/checks/cpu_avx512_icl.c b/numpy/distutils/checks/cpu_avx512_icl.c
new file mode 100644
index 000000000..cf2706b3b
--- /dev/null
+++ b/numpy/distutils/checks/cpu_avx512_icl.c
@@ -0,0 +1,12 @@
+#include <immintrin.h>
+
+int main(void)
+{
+    /* VBMI2 */
+    __m512i a = _mm512_shrdv_epi64(_mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512());
+    /* BITALG */
+    a = _mm512_popcnt_epi8(a);
+    /* VPOPCNTDQ */
+    a = _mm512_popcnt_epi64(a);
+    return _mm_cvtsi128_si32(_mm512_castsi512_si128(a));
+}
diff --git a/numpy/distutils/checks/cpu_avx512_knl.c b/numpy/distutils/checks/cpu_avx512_knl.c
new file mode 100644
index 000000000..0699f37a6
--- /dev/null
+++ b/numpy/distutils/checks/cpu_avx512_knl.c
@@ -0,0 +1,11 @@
+#include <immintrin.h>
+
+int main(void)
+{
+    int base[128] = {0}; /* initialized: base[0] is read by the return below */
+    /* ER */
+    __m512i a = _mm512_castpd_si512(_mm512_exp2a23_pd(_mm512_setzero_pd()));
+    /* PF */
+    _mm512_mask_prefetch_i64scatter_pd(base, _mm512_cmpeq_epi64_mask(a, a), a, 1, _MM_HINT_T1);
+    return base[0];
+}
diff --git a/numpy/distutils/checks/cpu_avx512_knm.c b/numpy/distutils/checks/cpu_avx512_knm.c
new file mode 100644
index 000000000..db61b4bfa
--- /dev/null
+++ b/numpy/distutils/checks/cpu_avx512_knm.c
@@ -0,0 +1,17 @@
+#include <immintrin.h>
+
+int main(void)
+{
+    __m512i a = _mm512_setzero_si512();
+    __m512 b = _mm512_setzero_ps();
+    /* NOTE(review): NULL is used below with only <immintrin.h> included - confirm it is defined */
+    /* 4FMAPS */
+    b = _mm512_4fmadd_ps(b, b, b, b, b, NULL);
+    /* 4VNNIW */
+    a = _mm512_4dpwssd_epi32(a, a, a, a, a, NULL);
+    /* VPOPCNTDQ */
+    a = _mm512_popcnt_epi64(a);
+    /* fold the float result into `a` so both values reach the return */
+    a = _mm512_add_epi32(a, _mm512_castps_si512(b));
+    return _mm_cvtsi128_si32(_mm512_castsi512_si128(a));
+}
diff --git a/numpy/distutils/checks/cpu_avx512_skx.c b/numpy/distutils/checks/cpu_avx512_skx.c
new file mode 100644
index 000000000..1d5e15b5e
--- /dev/null
+++ b/numpy/distutils/checks/cpu_avx512_skx.c
@@ -0,0 +1,12 @@
+#include <immintrin.h>
+
+int main(void)
+{
+    /* VL: an operation on a 256-bit vector */
+    __m256i a = _mm256_abs_epi64(_mm256_setzero_si256());
+    /* DQ: broadcast of the 256-bit value into a 512-bit vector */
+    __m512i b = _mm512_broadcast_i32x8(a);
+    /* BW: 16-bit lanes */
+    b = _mm512_abs_epi16(b);
+    return _mm_cvtsi128_si32(_mm512_castsi512_si128(b));
+}
diff --git a/numpy/distutils/checks/cpu_avx512cd.c b/numpy/distutils/checks/cpu_avx512cd.c
new file mode 100644
index 000000000..61bef6b82
--- /dev/null
+++ b/numpy/distutils/checks/cpu_avx512cd.c
@@ -0,0 +1,7 @@
+#include <immintrin.h>
+
+int main(void)
+{
+    __m512i a = _mm512_lzcnt_epi32(_mm512_setzero_si512()); /* per-lane leading-zero count */
+    return _mm_cvtsi128_si32(_mm512_castsi512_si128(a)); /* lowest 32 bits of `a` are the exit value */
+}
diff --git a/numpy/distutils/checks/cpu_avx512f.c b/numpy/distutils/checks/cpu_avx512f.c
new file mode 100644
index 000000000..f60cc09dd
--- /dev/null
+++ b/numpy/distutils/checks/cpu_avx512f.c
@@ -0,0 +1,7 @@
+#include <immintrin.h>
+
+int main(void)
+{
+    __m512i a = _mm512_abs_epi32(_mm512_setzero_si512()); /* 512-bit integer op */
+    return _mm_cvtsi128_si32(_mm512_castsi512_si128(a)); /* lowest 32 bits of `a` are the exit value */
+}
diff --git a/numpy/distutils/checks/cpu_f16c.c b/numpy/distutils/checks/cpu_f16c.c
new file mode 100644
index 000000000..a5a343e2d
--- /dev/null
+++ b/numpy/distutils/checks/cpu_f16c.c
@@ -0,0 +1,9 @@
+#include <emmintrin.h>
+#include <immintrin.h>
+
+int main(void)
+{
+    __m128 a  = _mm_cvtph_ps(_mm_setzero_si128());    /* half -> float, 128-bit result */
+    __m256 a8 = _mm256_cvtph_ps(_mm_setzero_si128()); /* half -> float, 256-bit result */
+    return (int)(_mm_cvtss_f32(a) + _mm_cvtss_f32(_mm256_castps256_ps128(a8))); /* both results are used */
+}
diff --git a/numpy/distutils/checks/cpu_fma3.c b/numpy/distutils/checks/cpu_fma3.c
new file mode 100644
index 000000000..cf34c6cb1
--- /dev/null
+++ b/numpy/distutils/checks/cpu_fma3.c
@@ -0,0 +1,8 @@
+#include <xmmintrin.h>
+#include <immintrin.h>
+
+int main(void)
+{
+    __m256 a = _mm256_fmadd_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps()); /* fused multiply-add */
+    return (int)_mm_cvtss_f32(_mm256_castps256_ps128(a)); /* lowest lane of `a` is the exit value */
+}
diff --git a/numpy/distutils/checks/cpu_fma4.c b/numpy/distutils/checks/cpu_fma4.c
new file mode 100644
index 000000000..1ad717033
--- /dev/null
+++ b/numpy/distutils/checks/cpu_fma4.c
@@ -0,0 +1,12 @@
+#include <immintrin.h>
+#ifdef _MSC_VER
+    #include <ammintrin.h>
+#else
+    #include <x86intrin.h>
+#endif
+
+int main(void)
+{
+    __m256 a = _mm256_macc_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps()); /* multiply-accumulate */
+    return (int)_mm_cvtss_f32(_mm256_castps256_ps128(a)); /* lowest lane of `a` is the exit value */
+}
diff --git a/numpy/distutils/checks/cpu_neon.c b/numpy/distutils/checks/cpu_neon.c
new file mode 100644
index 000000000..4eab1f384
--- /dev/null
+++ b/numpy/distutils/checks/cpu_neon.c
@@ -0,0 +1,15 @@
+#ifdef _MSC_VER
+    #include <Intrin.h>
+#endif
+#include <arm_neon.h>
+
+int main(void)
+{
+    float32x4_t v1 = vdupq_n_f32(1.0f), v2 = vdupq_n_f32(2.0f);
+    int ret = (int)vgetq_lane_f32(vmulq_f32(v1, v2), 0); /* float32 multiply */
+#ifdef __aarch64__ /* the float64 form is built for AArch64 only */
+    float64x2_t vd1 = vdupq_n_f64(1.0), vd2 = vdupq_n_f64(2.0);
+    ret += (int)vgetq_lane_f64(vmulq_f64(vd1, vd2), 0);
+#endif
+    return ret;
+}
diff --git a/numpy/distutils/checks/cpu_neon_fp16.c b/numpy/distutils/checks/cpu_neon_fp16.c
new file mode 100644
index 000000000..745d2e793
--- /dev/null
+++ b/numpy/distutils/checks/cpu_neon_fp16.c
@@ -0,0 +1,11 @@
+#ifdef _MSC_VER
+    #include <Intrin.h>
+#endif
+#include <arm_neon.h>
+
+int main(void)
+{
+    short z4[] = {0, 0, 0, 0, 0, 0, 0, 0};
+    float32x4_t v_z4 = vcvt_f32_f16((float16x4_t)vld1_s16((const short*)z4));
+    return (int)vgetq_lane_f32(v_z4, 0);
+}
diff --git a/numpy/distutils/checks/cpu_neon_vfpv4.c b/numpy/distutils/checks/cpu_neon_vfpv4.c
new file mode 100644
index 000000000..45f7b5d69
--- /dev/null
+++ b/numpy/distutils/checks/cpu_neon_vfpv4.c
@@ -0,0 +1,19 @@
+#ifdef _MSC_VER
+    #include <Intrin.h>
+#endif
+#include <arm_neon.h>
+
+int main(void)
+{
+    float32x4_t v1 = vdupq_n_f32(1.0f);
+    float32x4_t v2 = vdupq_n_f32(2.0f);
+    float32x4_t v3 = vdupq_n_f32(3.0f);
+    int ret = (int)vgetq_lane_f32(vfmaq_f32(v1, v2, v3), 0);
+#ifdef __aarch64__
+    float64x2_t vd1 = vdupq_n_f64(1.0);
+    float64x2_t vd2 = vdupq_n_f64(2.0);
+    float64x2_t vd3 = vdupq_n_f64(3.0);
+    ret += (int)vgetq_lane_f64(vfmaq_f64(vd1, vd2, vd3), 0);
+#endif
+    return ret;
+}
diff --git a/numpy/distutils/checks/cpu_popcnt.c b/numpy/distutils/checks/cpu_popcnt.c
new file mode 100644
index 000000000..e6a80fb40
--- /dev/null
+++ b/numpy/distutils/checks/cpu_popcnt.c
@@ -0,0 +1,23 @@
+#ifdef _MSC_VER
+    #include <nmmintrin.h>
+#else
+    #include <popcntintrin.h>
+#endif
+
+int main(void)
+{
+    long long a = 0;
+    int b;
+#ifdef _MSC_VER
+    #ifdef _M_X64
+    a = _mm_popcnt_u64(1);
+    #endif
+    b = _mm_popcnt_u32(1);
+#else
+    #ifdef __x86_64__
+    a = __builtin_popcountll(1);
+    #endif
+    b = __builtin_popcount(1);
+#endif
+    return (int)a + b;
+}
diff --git a/numpy/distutils/checks/cpu_sse.c b/numpy/distutils/checks/cpu_sse.c
new file mode 100644
index 000000000..bb98bf63c
--- /dev/null
+++ b/numpy/distutils/checks/cpu_sse.c
@@ -0,0 +1,7 @@
+#include <xmmintrin.h>
+
+int main(void)
+{
+    __m128 a = _mm_add_ps(_mm_setzero_ps(), _mm_setzero_ps());
+    return (int)_mm_cvtss_f32(a);
+}
diff --git a/numpy/distutils/checks/cpu_sse2.c b/numpy/distutils/checks/cpu_sse2.c
new file mode 100644
index 000000000..658afc9b4
--- /dev/null
+++ b/numpy/distutils/checks/cpu_sse2.c
@@ -0,0 +1,7 @@
+#include <emmintrin.h>
+
+int main(void)
+{
+    __m128i a = _mm_add_epi16(_mm_setzero_si128(), _mm_setzero_si128());
+    return _mm_cvtsi128_si32(a);
+}
diff --git a/numpy/distutils/checks/cpu_sse3.c b/numpy/distutils/checks/cpu_sse3.c
new file mode 100644
index 000000000..aece1e601
--- /dev/null
+++ b/numpy/distutils/checks/cpu_sse3.c
@@ -0,0 +1,7 @@
+#include <pmmintrin.h>
+
+int main(void)
+{
+    __m128 a = _mm_hadd_ps(_mm_setzero_ps(), _mm_setzero_ps());
+    return (int)_mm_cvtss_f32(a);
+}
diff --git a/numpy/distutils/checks/cpu_sse41.c b/numpy/distutils/checks/cpu_sse41.c
new file mode 100644
index 000000000..bfdb9feac
--- /dev/null
+++ b/numpy/distutils/checks/cpu_sse41.c
@@ -0,0 +1,7 @@
+#include <smmintrin.h>
+
+int main(void)
+{
+    __m128 a = _mm_floor_ps(_mm_setzero_ps());
+    return (int)_mm_cvtss_f32(a);
+}
diff --git a/numpy/distutils/checks/cpu_sse42.c b/numpy/distutils/checks/cpu_sse42.c
new file mode 100644
index 000000000..24f5d93fe
--- /dev/null
+++ b/numpy/distutils/checks/cpu_sse42.c
@@ -0,0 +1,7 @@
+#include <nmmintrin.h> /* SSE4.2 intrinsics header */
+
+int main(void)
+{
+    unsigned int crc = _mm_crc32_u32(0, 1); /* CRC32 is an SSE4.2-only instruction */
+    return (int)crc;
+}
diff --git a/numpy/distutils/checks/cpu_ssse3.c b/numpy/distutils/checks/cpu_ssse3.c
new file mode 100644
index 000000000..ad0abc1e6
--- /dev/null
+++ b/numpy/distutils/checks/cpu_ssse3.c
@@ -0,0 +1,7 @@
+#include <tmmintrin.h>
+
+int main(void)
+{
+    __m128i a = _mm_hadd_epi16(_mm_setzero_si128(), _mm_setzero_si128());
+    return (int)_mm_cvtsi128_si32(a);
+}
diff --git a/numpy/distutils/checks/cpu_vsx.c b/numpy/distutils/checks/cpu_vsx.c
new file mode 100644
index 000000000..0b3f30d6a
--- /dev/null
+++ b/numpy/distutils/checks/cpu_vsx.c
@@ -0,0 +1,21 @@
+#ifndef __VSX__
+    #error "VSX is not supported"
+#endif
+#include <altivec.h>
+
+#if (defined(__GNUC__) && !defined(vec_xl)) || (defined(__clang__) && !defined(__IBMC__))
+    #define vsx_ld  vec_vsx_ld
+    #define vsx_st  vec_vsx_st
+#else
+    #define vsx_ld  vec_xl
+    #define vsx_st  vec_xst
+#endif
+
+int main(void)
+{
+    unsigned int zout[4];
+    unsigned int z4[] = {0, 0, 0, 0};
+    __vector unsigned int v_z4 = vsx_ld(0, z4);
+    vsx_st(v_z4, 0, zout);
+    return zout[0];
+}
diff --git a/numpy/distutils/checks/cpu_vsx2.c b/numpy/distutils/checks/cpu_vsx2.c
new file mode 100644
index 000000000..410fb29d6
--- /dev/null
+++ b/numpy/distutils/checks/cpu_vsx2.c
@@ -0,0 +1,13 @@
+#ifndef __VSX__
+    #error "VSX is not supported"
+#endif
+#include <altivec.h>
+
+typedef __vector unsigned long long v_uint64x2;
+
+int main(void)
+{
+    v_uint64x2 z2 = (v_uint64x2){0, 0};
+    z2 = (v_uint64x2)vec_cmpeq(z2, z2);
+    return (int)vec_extract(z2, 0);
+}
diff --git a/numpy/distutils/checks/cpu_vsx3.c b/numpy/distutils/checks/cpu_vsx3.c
new file mode 100644
index 000000000..857526535
--- /dev/null
+++ b/numpy/distutils/checks/cpu_vsx3.c
@@ -0,0 +1,13 @@
+#ifndef __VSX__
+    #error "VSX is not supported"
+#endif
+#include <altivec.h>
+
+typedef __vector unsigned int v_uint32x4;
+
+int main(void)
+{
+    v_uint32x4 z4 = (v_uint32x4){0, 0, 0, 0};
+    z4 = vec_absd(z4, z4);
+    return (int)vec_extract(z4, 0);
+}
diff --git a/numpy/distutils/checks/cpu_xop.c b/numpy/distutils/checks/cpu_xop.c
new file mode 100644
index 000000000..51d70cf2b
--- /dev/null
+++ b/numpy/distutils/checks/cpu_xop.c
@@ -0,0 +1,12 @@
+#include <immintrin.h>
+#ifdef _MSC_VER
+    #include <ammintrin.h>
+#else
+    #include <x86intrin.h>
+#endif
+
+int main(void)
+{
+    __m128i a = _mm_comge_epu32(_mm_setzero_si128(), _mm_setzero_si128());
+    return _mm_cvtsi128_si32(a);
+}
diff --git a/numpy/distutils/checks/test_flags.c b/numpy/distutils/checks/test_flags.c
new file mode 100644
index 000000000..4cd09d42a
--- /dev/null
+++ b/numpy/distutils/checks/test_flags.c
@@ -0,0 +1 @@
+int test_flags;
diff --git a/numpy/distutils/command/build.py b/numpy/distutils/command/build.py
index a156a7c6e..60ba4c917 100644
--- a/numpy/distutils/command/build.py
+++ b/numpy/distutils/command/build.py
@@ -16,6 +16,12 @@ class build(old_build):
          "specify the Fortran compiler type"),
         ('warn-error', None,
          "turn all warnings into errors (-Werror)"),
+        ('cpu-baseline=', None,
+         "specify a list of enabled baseline CPU optimizations"),
+        ('cpu-dispatch=', None,
+         "specify a list of dispatched CPU optimizations"),
+        ('disable-optimization', None,
+         "disable CPU optimized code(dispatch,simd,fast...)"),
         ]
 
     help_options = old_build.help_options + [
@@ -27,6 +33,9 @@ class build(old_build):
         old_build.initialize_options(self)
         self.fcompiler = None
         self.warn_error = False
+        self.cpu_baseline = "min"
+        self.cpu_dispatch = "max -xop -fma4" # drop AMD legacy features by default
+        self.disable_optimization = False
 
     def finalize_options(self):
         build_scripts = self.build_scripts
diff --git a/numpy/distutils/command/build_clib.py b/numpy/distutils/command/build_clib.py
index f6a84e351..87345adbc 100644
--- a/numpy/distutils/command/build_clib.py
+++ b/numpy/distutils/command/build_clib.py
@@ -13,6 +13,7 @@ from numpy.distutils.misc_util import (
     filter_sources, get_lib_source_files, get_numpy_include_dirs,
     has_cxx_sources, has_f_sources, is_sequence
 )
+from numpy.distutils.ccompiler_opt import new_ccompiler_opt
 
 # Fix Python distutils bug sf #1718574:
 _l = old_build_clib.user_options
@@ -34,9 +35,16 @@ class build_clib(old_build_clib):
          "number of parallel jobs"),
         ('warn-error', None,
          "turn all warnings into errors (-Werror)"),
+        ('cpu-baseline=', None,
+         "specify a list of enabled baseline CPU optimizations"),
+        ('cpu-dispatch=', None,
+         "specify a list of dispatched CPU optimizations"),
+        ('disable-optimization', None,
+         "disable CPU optimized code(dispatch,simd,fast...)"),
     ]
 
-    boolean_options = old_build_clib.boolean_options + ['inplace', 'warn-error']
+    boolean_options = old_build_clib.boolean_options + \
+    ['inplace', 'warn-error', 'disable-optimization']
 
     def initialize_options(self):
         old_build_clib.initialize_options(self)
@@ -44,6 +52,10 @@ class build_clib(old_build_clib):
         self.inplace = 0
         self.parallel = None
         self.warn_error = None
+        self.cpu_baseline = None
+        self.cpu_dispatch = None
+        self.disable_optimization = None
+
 
     def finalize_options(self):
         if self.parallel:
@@ -55,6 +67,9 @@ class build_clib(old_build_clib):
         self.set_undefined_options('build',
                                         ('parallel', 'parallel'),
                                         ('warn_error', 'warn_error'),
+                                        ('cpu_baseline', 'cpu_baseline'),
+                                        ('cpu_dispatch', 'cpu_dispatch'),
+                                        ('disable_optimization', 'disable_optimization')
                                   )
 
     def have_f_sources(self):
@@ -102,6 +117,25 @@ class build_clib(old_build_clib):
 
         self.compiler.show_customization()
 
+        if not self.disable_optimization:
+            opt_cache_path = os.path.abspath(
+                os.path.join(self.build_temp, 'ccompiler_opt_cache_clib.py'
+            ))
+            self.compiler_opt = new_ccompiler_opt(
+                compiler=self.compiler, cpu_baseline=self.cpu_baseline,
+                cpu_dispatch=self.cpu_dispatch, cache_path=opt_cache_path
+            )
+            if not self.compiler_opt.is_cached():
+                log.info("Detected changes on compiler optimizations, force rebuilding")
+                self.force = True
+
+            import atexit
+            def report():
+                log.info("\n########### CLIB COMPILER OPTIMIZATION ###########")
+                log.info(self.compiler_opt.report(full=True))
+
+            atexit.register(report)
+
         if self.have_f_sources():
             from numpy.distutils.fcompiler import new_fcompiler
             self._f_compiler = new_fcompiler(compiler=self.fcompiler,
@@ -211,6 +245,8 @@ class build_clib(old_build_clib):
                 'extra_f90_compile_args') or []
 
         macros = build_info.get('macros')
+        if macros is None:
+            macros = []
         include_dirs = build_info.get('include_dirs')
         if include_dirs is None:
             include_dirs = []
@@ -223,6 +259,31 @@ class build_clib(old_build_clib):
         if requiref90:
             self.mkpath(module_build_dir)
 
+        dispatch_objects = []
+        if not self.disable_optimization:
+            dispatch_sources  = [
+                c_sources.pop(c_sources.index(src))
+                for src in c_sources[:] if src.endswith(".dispatch.c")
+            ]
+            if dispatch_sources:
+                if not self.inplace:
+                    build_src = self.get_finalized_command("build_src").build_src
+                else:
+                    build_src = None
+                dispatch_objects = self.compiler_opt.try_dispatch(
+                    dispatch_sources,
+                    output_dir=self.build_temp,
+                    src_dir=build_src,
+                    macros=macros,
+                    include_dirs=include_dirs,
+                    debug=self.debug,
+                    extra_postargs=extra_postargs
+                )
+            extra_args_baseopt = extra_postargs + self.compiler_opt.cpu_baseline_flags()
+        else:
+            extra_args_baseopt = extra_postargs
+            macros.append(("NPY_DISABLE_OPTIMIZATION", 1))
+
         if compiler.compiler_type == 'msvc':
             # this hack works around the msvc compiler attributes
             # problem, msvc uses its own convention :(
@@ -237,7 +298,8 @@ class build_clib(old_build_clib):
                                        macros=macros,
                                        include_dirs=include_dirs,
                                        debug=self.debug,
-                                       extra_postargs=extra_postargs)
+                                       extra_postargs=extra_args_baseopt)
+        objects.extend(dispatch_objects)
 
         if cxx_sources:
             log.info("compiling C++ sources")
diff --git a/numpy/distutils/command/build_ext.py b/numpy/distutils/command/build_ext.py
index d53285c92..b6557fcf6 100644
--- a/numpy/distutils/command/build_ext.py
+++ b/numpy/distutils/command/build_ext.py
@@ -19,7 +19,7 @@ from numpy.distutils.misc_util import (
     has_cxx_sources, has_f_sources, is_sequence
 )
 from numpy.distutils.command.config_compiler import show_fortran_compilers
-
+from numpy.distutils.ccompiler_opt import new_ccompiler_opt
 
 
 class build_ext (old_build_ext):
@@ -33,6 +33,12 @@ class build_ext (old_build_ext):
          "number of parallel jobs"),
         ('warn-error', None,
          "turn all warnings into errors (-Werror)"),
+        ('cpu-baseline=', None,
+         "specify a list of enabled baseline CPU optimizations"),
+        ('cpu-dispatch=', None,
+         "specify a list of dispatched CPU optimizations"),
+        ('disable-optimization', None,
+         "disable CPU optimized code(dispatch,simd,fast...)"),
     ]
 
     help_options = old_build_ext.help_options + [
@@ -40,13 +46,16 @@ class build_ext (old_build_ext):
          show_fortran_compilers),
     ]
 
-    boolean_options = old_build_ext.boolean_options + ['warn-error']
+    boolean_options = old_build_ext.boolean_options + ['warn-error', 'disable-optimization']
 
     def initialize_options(self):
         old_build_ext.initialize_options(self)
         self.fcompiler = None
         self.parallel = None
         self.warn_error = None
+        self.cpu_baseline = None
+        self.cpu_dispatch = None
+        self.disable_optimization = None
 
     def finalize_options(self):
         if self.parallel:
@@ -75,6 +84,9 @@ class build_ext (old_build_ext):
         self.set_undefined_options('build',
                                         ('parallel', 'parallel'),
                                         ('warn_error', 'warn_error'),
+                                        ('cpu_baseline', 'cpu_baseline'),
+                                        ('cpu_dispatch', 'cpu_dispatch'),
+                                        ('disable_optimization', 'disable_optimization'),
                                   )
 
     def run(self):
@@ -129,6 +141,22 @@ class build_ext (old_build_ext):
 
         self.compiler.show_customization()
 
+        if not self.disable_optimization:
+            opt_cache_path = os.path.abspath(os.path.join(self.build_temp, 'ccompiler_opt_cache_ext.py'))
+            self.compiler_opt = new_ccompiler_opt(compiler=self.compiler,
+                                                  cpu_baseline=self.cpu_baseline,
+                                                  cpu_dispatch=self.cpu_dispatch,
+                                                  cache_path=opt_cache_path)
+            if not self.compiler_opt.is_cached():
+                log.info("Detected changes on compiler optimizations, force rebuilding")
+                self.force = True
+
+            import atexit
+            def report():
+                log.info("\n########### EXT COMPILER OPTIMIZATION ###########")
+                log.info(self.compiler_opt.report(full=True))
+            atexit.register(report)
+
         # Setup directory for storing generated extra DLL files on Windows
         self.extra_dll_dir = os.path.join(self.build_temp, '.libs')
         if not os.path.isdir(self.extra_dll_dir):
@@ -378,6 +406,32 @@ class build_ext (old_build_ext):
 
         include_dirs = ext.include_dirs + get_numpy_include_dirs()
 
+        dispatch_objects = []
+        if not self.disable_optimization:
+            dispatch_sources  = [
+                c_sources.pop(c_sources.index(src))
+                for src in c_sources[:] if src.endswith(".dispatch.c")
+            ]
+            if dispatch_sources:
+                if not self.inplace:
+                    build_src = self.get_finalized_command("build_src").build_src
+                else:
+                    build_src = None
+                dispatch_objects = self.compiler_opt.try_dispatch(
+                    dispatch_sources,
+                    output_dir=output_dir,
+                    src_dir=build_src,
+                    macros=macros,
+                    include_dirs=include_dirs,
+                    debug=self.debug,
+                    extra_postargs=extra_args,
+                    **kws
+                )
+            extra_args_baseopt = extra_args + self.compiler_opt.cpu_baseline_flags()
+        else:
+            extra_args_baseopt = extra_args
+            macros.append(("NPY_DISABLE_OPTIMIZATION", 1))
+
         c_objects = []
         if c_sources:
             log.info("compiling C sources")
@@ -386,8 +440,9 @@ class build_ext (old_build_ext):
                                               macros=macros,
                                               include_dirs=include_dirs,
                                               debug=self.debug,
-                                              extra_postargs=extra_args,
+                                              extra_postargs=extra_args_baseopt,
                                               **kws)
+        c_objects.extend(dispatch_objects)
 
         if cxx_sources:
             log.info("compiling C++ sources")
diff --git a/numpy/distutils/setup.py b/numpy/distutils/setup.py
index 88cd1a160..798c3686f 100644
--- a/numpy/distutils/setup.py
+++ b/numpy/distutils/setup.py
@@ -7,6 +7,7 @@ def configuration(parent_package='',top_path=None):
     config.add_subpackage('tests')
     config.add_data_files('site.cfg')
     config.add_data_files('mingw/gfortran_vs2003_hack.c')
+    config.add_data_dir('checks')
     config.make_config_py()
     return config
 
diff --git a/numpy/distutils/tests/test_ccompiler_opt.py b/numpy/distutils/tests/test_ccompiler_opt.py
new file mode 100644
index 000000000..a789be1ea
--- /dev/null
+++ b/numpy/distutils/tests/test_ccompiler_opt.py
@@ -0,0 +1,787 @@
+import re, textwrap, os
+from os import sys, path
+from distutils.errors import DistutilsError
+
+is_standalone = __name__ == '__main__' and __package__ is None
+if is_standalone:
+    import unittest, contextlib, tempfile, shutil
+    sys.path.append(path.abspath(path.join(path.dirname(__file__), "..")))
+    from ccompiler_opt import CCompilerOpt
+
+    # from numpy/testing/_private/utils.py
+    @contextlib.contextmanager
+    def tempdir(*args, **kwargs):
+        tmpdir = tempfile.mkdtemp(*args, **kwargs)
+        try:
+            yield tmpdir
+        finally:
+            shutil.rmtree(tmpdir)
+
+    def assert_(expr, msg=''):
+        if not expr:
+            raise AssertionError(msg)
+else:
+    from numpy.distutils.ccompiler_opt import CCompilerOpt
+    from numpy.testing import assert_, tempdir
+
+# architectures and compilers to test
+arch_compilers = dict(
+    x86 = ("gcc", "clang", "icc", "iccw", "msvc"),
+    x64 = ("gcc", "clang", "icc", "iccw", "msvc"),
+    ppc64 = ("gcc", "clang"),
+    ppc64le = ("gcc", "clang"),
+    armhf = ("gcc", "clang"),
+    aarch64 = ("gcc", "clang"),
+    noarch = ("gcc",)
+)
+
+class FakeCCompilerOpt(CCompilerOpt):
+    fake_info = ""
+    def __init__(self, trap_files="", trap_flags="", *args, **kwargs):
+        self.fake_trap_files = trap_files
+        self.fake_trap_flags = trap_flags
+        CCompilerOpt.__init__(self, None, **kwargs)
+
+    def __repr__(self):
+        return textwrap.dedent("""\
+            <<<<
+            march    : {}
+            compiler : {}
+            ----------------
+            {}
+            >>>>
+        """).format(self.cc_march, self.cc_name, self.report())
+
+    def dist_compile(self, sources, flags, **kwargs):
+        assert(isinstance(sources, list))
+        assert(isinstance(flags, list))
+        if self.fake_trap_files:
+            for src in sources:
+                if re.match(self.fake_trap_files, src):
+                    self.dist_error("source is trapped by a fake interface")
+        if self.fake_trap_flags:
+            for f in flags:
+                if re.match(self.fake_trap_flags, f):
+                    self.dist_error("flag is trapped by a fake interface")
+        # fake objects
+        return zip(sources, [' '.join(flags)] * len(sources))
+
+    def dist_info(self):
+        return FakeCCompilerOpt.fake_info
+
+    @staticmethod
+    def dist_log(*args, stderr=False):
+        pass
+
+class _Test_CCompilerOpt(object):
+    arch = None # x86_64
+    cc   = None # gcc
+
+    def setup(self):
+        FakeCCompilerOpt.conf_nocache = True
+        self._opt = None
+
+    def nopt(self, *args, **kwargs):
+        FakeCCompilerOpt.fake_info = self.arch + '_' + self.cc
+        return FakeCCompilerOpt(*args, **kwargs)
+
+    def opt(self):
+        if not self._opt:
+            self._opt = self.nopt()
+        return self._opt
+
+    def march(self):
+        return self.opt().cc_march
+
+    def cc_name(self):
+        return self.opt().cc_name
+
+    def get_targets(self, targets, groups, **kwargs):
+        FakeCCompilerOpt.conf_target_groups = groups
+        opt = self.nopt(
+            cpu_baseline=kwargs.get("baseline", "min"),
+            cpu_dispatch=kwargs.get("dispatch", "max"),
+            trap_files=kwargs.get("trap_files", ""),
+            trap_flags=kwargs.get("trap_flags", "")
+        )
+        with tempdir() as tmpdir:
+            file = os.path.join(tmpdir, "test_targets.c")
+            with open(file, 'w') as f:
+                f.write(targets)
+            gtargets = []
+            gflags = {}
+            fake_objects = opt.try_dispatch([file])
+            for source, flags in fake_objects:
+                gtar = source.split('.')[1:-1]
+                glen = len(gtar)
+                if glen == 0:
+                    gtar = "baseline"
+                elif glen == 1:
+                    gtar = gtar[0].upper()
+                else:
+                    # converting multi-target into parentheses str format to be equivalent
+                    # to the configuration statements syntax.
+                    gtar = ('('+' '.join(gtar)+')').upper()
+                gtargets.append(gtar)
+                gflags[gtar] = flags
+
+        has_baseline, targets = opt.sources_status[file]
+        targets = targets + ["baseline"] if has_baseline else targets
+        # convert tuple that represent multi-target into parentheses str format
+        targets = [
+            '('+' '.join(tar)+')' if isinstance(tar, tuple) else tar
+            for tar in targets
+        ]
+        if len(targets) != len(gtargets) or not all(t in gtargets for t in targets):
+            raise AssertionError(
+                "'sources_status' returns different targets than the compiled targets\n"
+                "%s != %s" % (targets, gtargets)
+            )
+        # return targets from 'sources_status' since the order matters
+        return targets, gflags
+
+    def arg_regex(self, **kwargs):
+        map2origin = dict(
+            x64 = "x86",
+            ppc64le = "ppc64",
+            aarch64 = "armhf",
+            clang = "gcc",
+        )
+        march = self.march(); cc_name = self.cc_name()
+        map_march = map2origin.get(march, march)
+        map_cc = map2origin.get(cc_name, cc_name)
+        for key in (
+            march, cc_name, map_march, map_cc,
+            march + '_' + cc_name,
+            map_march + '_' + cc_name,
+            march + '_' + map_cc,
+            map_march + '_' + map_cc,
+        ) :
+            regex = kwargs.pop(key, None)
+            if regex is not None:
+                break
+        if regex:
+            if isinstance(regex, dict):
+                for k, v in regex.items():
+                    if v[-1:] not in ')}$?\\.+*':
+                        regex[k] = v + '$'
+            else:
+                assert(isinstance(regex, str))
+                if regex[-1:] not in ')}$?\\.+*':
+                    regex += '$'
+        return regex
+
+    def expect(self, dispatch, baseline="", **kwargs):
+        match = self.arg_regex(**kwargs)
+        if match is None:
+            return
+        opt = self.nopt(
+            cpu_baseline=baseline, cpu_dispatch=dispatch,
+            trap_files=kwargs.get("trap_files", ""),
+            trap_flags=kwargs.get("trap_flags", "")
+        )
+        features = ' '.join(opt.cpu_dispatch_names())
+        if not match:
+            if len(features) != 0:
+                raise AssertionError(
+                    'expected empty features, not "%s"' % features
+                )
+            return
+        if not re.match(match, features, re.IGNORECASE):
+            raise AssertionError(
+                'dispatch features "%s" not match "%s"' % (features, match)
+            )
+
+    def expect_baseline(self, baseline, dispatch="", **kwargs):
+        match = self.arg_regex(**kwargs)
+        if match is None:
+            return
+        opt = self.nopt(
+            cpu_baseline=baseline, cpu_dispatch=dispatch,
+            trap_files=kwargs.get("trap_files", ""),
+            trap_flags=kwargs.get("trap_flags", "")
+        )
+        features = ' '.join(opt.cpu_baseline_names())
+        if not match:
+            if len(features) != 0:
+                raise AssertionError(
+                    'expected empty features, not "%s"' % features
+                )
+            return
+        if not re.match(match, features, re.IGNORECASE):
+            raise AssertionError(
+                'baseline features "%s" not match "%s"' % (features, match)
+            )
+
+    def expect_flags(self, baseline, dispatch="", **kwargs):
+        match = self.arg_regex(**kwargs)
+        if match is None:
+            return
+        opt = self.nopt(
+            cpu_baseline=baseline, cpu_dispatch=dispatch,
+            trap_files=kwargs.get("trap_files", ""),
+            trap_flags=kwargs.get("trap_flags", "")
+        )
+        flags = ' '.join(opt.cpu_baseline_flags())
+        if not match:
+            if len(flags) != 0:
+                raise AssertionError(
+                    'expected empty flags not "%s"' % flags
+                )
+            return
+        if not re.match(match, flags):
+            raise AssertionError(
+                'flags "%s" not match "%s"' % (flags, match)
+            )
+
+    def expect_targets(self, targets, groups={}, **kwargs):
+        match = self.arg_regex(**kwargs)
+        if match is None:
+            return
+        targets, _ = self.get_targets(targets=targets, groups=groups, **kwargs)
+        targets = ' '.join(targets)
+        if not match:
+            if len(targets) != 0:
+                raise AssertionError(
+                    'expected empty targets, not "%s"' % targets
+                )
+            return
+        if not re.match(match, targets, re.IGNORECASE):
+            raise AssertionError(
+                'targets "%s" not match "%s"' % (targets, match)
+            )
+
+    def expect_target_flags(self, targets, groups={}, **kwargs):
+        match_dict = self.arg_regex(**kwargs)
+        if match_dict is None:
+            return
+        assert(isinstance(match_dict, dict))
+        _, tar_flags = self.get_targets(targets=targets, groups=groups)
+
+        for match_tar, match_flags in match_dict.items():
+            if match_tar not in tar_flags:
+                raise AssertionError(
+                    'expected to find target "%s"' % match_tar
+                )
+            flags = tar_flags[match_tar]
+            if not match_flags:
+                if len(flags) != 0:
+                    raise AssertionError(
+                        'expected to find empty flags in target "%s"' % match_tar
+                    )
+            if not re.match(match_flags, flags):
+                raise AssertionError(
+                    '"%s" flags "%s" not match "%s"' % (match_tar, flags, match_flags)
+                )
+
+    def test_interface(self):
+        wrong_arch = "ppc64" if self.arch != "ppc64" else "x86"
+        wrong_cc   = "clang" if self.cc   != "clang" else "icc"
+        opt = self.opt()
+        assert_(getattr(opt, "cc_on_" + self.arch))
+        assert_(not getattr(opt, "cc_on_" + wrong_arch))
+        assert_(getattr(opt, "cc_is_" + self.cc))
+        assert_(not getattr(opt, "cc_is_" + wrong_cc))
+
+    def test_args_empty(self):
+        for baseline, dispatch in (
+            ("", "none"),
+            (None, ""),
+            ("none +none", "none - none"),
+            ("none -max", "min - max"),
+            ("+vsx2 -VSX2", "vsx avx2 avx512f -max"),
+            ("max -vsx - avx + avx512f neon -MAX ",
+             "min -min + max -max -vsx + avx2 -avx2 +NONE")
+        ) :
+            opt = self.nopt(cpu_baseline=baseline, cpu_dispatch=dispatch)
+            assert(len(opt.cpu_baseline_names()) == 0)
+            assert(len(opt.cpu_dispatch_names()) == 0)
+
+    def test_args_validation(self):
+        if self.march() == "unknown":
+            return
+        # check sanity of argument's validation
+        for baseline, dispatch in (
+            ("unkown_feature - max +min", "unknown max min"), # unknown features
+            ("#avx2", "$vsx") # groups and policies aren't acceptable
+        ) :
+            try:
+                self.nopt(cpu_baseline=baseline, cpu_dispatch=dispatch)
+                raise AssertionError("excepted an exception for invalid arguments")
+            except DistutilsError:
+                pass
+
+    def test_skip(self):
+        # only take what the platform supports and skip the others
+        # without causing exceptions
+        self.expect(
+            "sse vsx neon",
+            x86="sse", ppc64="vsx", armhf="neon", unknown=""
+        )
+        self.expect(
+            "sse41 avx avx2 vsx2 vsx3 neon_vfpv4 asimd",
+            x86   = "sse41 avx avx2",
+            ppc64 = "vsx2 vsx3",
+            armhf = "neon_vfpv4 asimd",
+            unknown = ""
+        )
+        # any feature in cpu_dispatch must be ignored if it's part of the baseline
+        self.expect(
+            "sse neon vsx", baseline="sse neon vsx",
+            x86="", ppc64="", armhf=""
+        )
+        self.expect(
+            "avx2 vsx3 asimdhp", baseline="avx2 vsx3 asimdhp",
+            x86="", ppc64="", armhf=""
+        )
+
+    def test_implies(self):
+        # the baseline combines implied features, so we rely
+        # on it instead of testing 'feature_implies()' directly
+        self.expect_baseline(
+            "fma3 avx2 asimd vsx3",
+            # .* between two spaces can validate features in between
+            x86   = "sse .* sse41 .* fma3.*avx2",
+            ppc64 = "vsx vsx2 vsx3",
+            armhf = "neon neon_fp16 neon_vfpv4 asimd"
+        )
+        """
+        special cases
+        """
+        # in icc and msvc, FMA3 and AVX2 can't be separated
+        # both need to imply each other, same for avx512f & avx512cd
+        for f0, f1 in (
+            ("fma3",    "avx2"),
+            ("avx512f", "avx512cd"),
+        ):
+            diff = ".* sse42 .* %s .*%s$" % (f0, f1)
+            self.expect_baseline(f0,
+                x86_gcc=".* sse42 .* %s$" % f0,
+                x86_icc=diff, x86_iccw=diff
+            )
+            self.expect_baseline(f1,
+                x86_gcc=".* avx .* %s$" % f1,
+                x86_icc=diff, x86_iccw=diff
+            )
+        # in msvc, following features can't be separated too
+        for f in (("fma3", "avx2"), ("avx512f", "avx512cd", "avx512_skx")):
+            for ff in f:
+                self.expect_baseline(ff,
+                    x86_msvc=".*%s" % ' '.join(f)
+                )
+
+        # in ppc64le VSX and VSX2 can't be separated
+        self.expect_baseline("vsx", ppc64le="vsx vsx2")
+        # in aarch64 following features can't be separated
+        for f in ("neon", "neon_fp16", "neon_vfpv4", "asimd"):
+            self.expect_baseline(f, aarch64="neon neon_fp16 neon_vfpv4 asimd")
+
+    def test_args_options(self):
+        """Exercise the special option names "max", "native" and "min"."""
+        # max & native
+        for o in ("max", "native"):
+            if o == "native" and self.cc_name() == "msvc":
+                continue
+            self.expect(o,
+                trap_files=".*cpu_(sse|vsx|neon).c",
+                x86="", ppc64="", armhf=""
+            )
+            self.expect(o,
+                trap_files=".*cpu_(sse3|vsx2|neon_vfpv4).c",
+                x86="sse sse2", ppc64="vsx", armhf="neon neon_fp16",
+                aarch64="", ppc64le=""
+            )
+            self.expect(o,
+                trap_files=".*cpu_(popcnt|vsx3).c",
+                x86="sse .* sse41", ppc64="vsx vsx2",
+                armhf="neon neon_fp16 .* asimd .*"
+            )
+            self.expect(o,
+                x86_gcc=".* xop fma4 .* avx512f .* avx512_knl avx512_knm avx512_skx .*",
+                # in icc, xop and fma4 aren't supported
+                x86_icc=".* avx512f .* avx512_knl avx512_knm avx512_skx .*",
+                x86_iccw=".* avx512f .* avx512_knl avx512_knm avx512_skx .*",
+                # in msvc, avx512_knl avx512_knm aren't supported
+                x86_msvc=".* xop fma4 .* avx512f .* avx512_skx .*",
+                armhf=".* asimd asimdhp asimddp .*",
+                ppc64="vsx vsx2 vsx3.*"
+            )
+        # min
+        self.expect("min",
+            x86="sse sse2", x64="sse sse2 sse3",
+            armhf="", aarch64="neon neon_fp16 .* asimd",
+            ppc64="", ppc64le="vsx vsx2"
+        )
+        self.expect("min",
+            trap_files=".*cpu_(sse2|vsx2).c", x86="", ppc64le=""
+        )
+        # an exception must be triggered if the native flag isn't supported
+        # when option "native" is activated through the args
+        try:
+            self.expect("native",
+                trap_flags=".*(-march=native|-xHost|/QxHost).*",
+                x86=".*", ppc64=".*", armhf=".*"
+            )
+            if self.march() != "unknown":
+                raise AssertionError(
+                    "expected an exception for %s" % self.march()
+                )
+        except DistutilsError:
+            if self.march() == "unknown":
+                raise AssertionError("expected no exceptions")
+
+    def test_flags(self):
+        self.expect_flags(
+            "sse sse2 vsx vsx2 neon neon_fp16",
+            x86_gcc="-msse -msse2", x86_icc="-msse -msse2",
+            x86_iccw="/arch:SSE2", x86_msvc="/arch:SSE2",
+            ppc64_gcc= "-mcpu=power8",
+            ppc64_clang="-maltivec -mvsx -mpower8-vector",
+            armhf_gcc="-mfpu=neon-fp16 -mfp16-format=ieee",
+            aarch64=""
+        )
+        # testing normalize -march
+        self.expect_flags(
+            "asimd",
+            aarch64="",
+            armhf_gcc=r"-mfp16-format=ieee -mfpu=neon-fp-armv8 -march=armv8-a\+simd"
+        )
+        self.expect_flags(
+            "asimdhp",
+            aarch64_gcc=r"-march=armv8.2-a\+fp16",
+            armhf_gcc=r"-mfp16-format=ieee -mfpu=neon-fp-armv8 -march=armv8.2-a\+fp16"
+        )
+        self.expect_flags(
+            "asimddp", aarch64_gcc=r"-march=armv8.2-a\+dotprod"
+        )
+        self.expect_flags(
+            # asimdfhm implies asimdhp
+            "asimdfhm", aarch64_gcc=r"-march=armv8.2-a\+fp16\+fp16fml"
+        )
+        self.expect_flags(
+            "asimddp asimdhp asimdfhm",
+            aarch64_gcc=r"-march=armv8.2-a\+dotprod\+fp16\+fp16fml"
+        )
+
+    def test_targets_exceptions(self):
+        """Malformed '@targets' text must raise unless march() is 'unknown'."""
+        for targets in (
+            "bla bla", "/*@targets",
+            "/*@targets */",
+            "/*@targets unknown */",
+            "/*@targets $unknown_policy avx2 */",
+            "/*@targets #unknown_group avx2 */",
+            "/*@targets $ */",
+            "/*@targets # vsx */",
+            "/*@targets #$ vsx */",
+            "/*@targets vsx avx2 ) */",
+            "/*@targets vsx avx2 (avx2 */",
+            "/*@targets vsx avx2 () */",
+            "/*@targets vsx avx2 ($autovec) */", # no features
+            "/*@targets vsx avx2 (xxx) */",
+            "/*@targets vsx avx2 (baseline) */",
+        ):
+            try:
+                self.expect_targets(
+                    targets, x86="", armhf="", ppc64=""
+                )
+                if self.march() != "unknown":
+                    raise AssertionError(
+                        "expected an exception for %s" % self.march()
+                    )
+            except DistutilsError:
+                if self.march() == "unknown":
+                    raise AssertionError("expected no exceptions")
+
+    def test_targets_syntax(self):
+        for targets in (
+            "/*@targets $keep_baseline sse vsx neon*/",
+            "/*@targets,$keep_baseline,sse,vsx,neon*/",
+            "/*@targets*$keep_baseline*sse*vsx*neon*/",
+            """
+            /*
+            ** @targets
+            ** $keep_baseline, sse vsx,neon
+            */
+            """,
+            """
+            /*
+            ************@targets*************
+            ** $keep_baseline, sse vsx, neon
+            *********************************
+            */
+            """,
+            """
+            /*
+            /////////////@targets/////////////////
+            //$keep_baseline//sse//vsx//neon
+            /////////////////////////////////////
+            */
+            """,
+            """
+            /*
+            @targets
+            $keep_baseline
+            SSE VSX NEON*/
+            """
+        ) :
+            self.expect_targets(targets,
+                x86="sse", ppc64="vsx", armhf="neon", unknown=""
+            )
+
+    def test_targets(self):
+        # test skipping baseline features
+        self.expect_targets(
+            """
+            /*@targets
+                sse sse2 sse41 avx avx2 avx512f
+                vsx vsx2 vsx3
+                neon neon_fp16 asimdhp asimddp
+            */
+            """,
+            baseline="avx vsx2 asimd",
+            x86="avx512f avx2", armhf="asimddp asimdhp", ppc64="vsx3"
+        )
+        # test skipping non-dispatch features
+        self.expect_targets(
+            """
+            /*@targets
+                sse41 avx avx2 avx512f
+                vsx2 vsx3
+                asimd asimdhp asimddp
+            */
+            """,
+            baseline="", dispatch="sse41 avx2 vsx2 asimd asimddp",
+            x86="avx2 sse41", armhf="asimddp asimd", ppc64="vsx2"
+        )
+        # test skipping features that are not supported
+        self.expect_targets(
+            """
+            /*@targets
+                sse2 sse41 avx2 avx512f
+                vsx2 vsx3
+                neon asimdhp asimddp
+            */
+            """,
+            baseline="",
+            trap_files=".*(avx2|avx512f|vsx3|asimddp).c",
+            x86="sse41 sse2", ppc64="vsx2", armhf="asimdhp neon"
+        )
+        # test skipping features that imply each other
+        self.expect_targets(
+            """
+            /*@targets
+                sse sse2 avx fma3 avx2 avx512f avx512cd
+                vsx vsx2 vsx3
+                neon neon_vfpv4 neon_fp16 neon_fp16 asimd asimdhp
+                asimddp asimdfhm
+            */
+            """,
+            baseline="",
+            x86_gcc="avx512cd avx512f avx2 fma3 avx sse2",
+            x86_msvc="avx512cd avx2 avx sse2",
+            x86_icc="avx512cd avx2 avx sse2",
+            x86_iccw="avx512cd avx2 avx sse2",
+            ppc64="vsx3 vsx2 vsx",
+            ppc64le="vsx3 vsx2",
+            armhf="asimdfhm asimddp asimdhp asimd neon_vfpv4 neon_fp16 neon",
+            aarch64="asimdfhm asimddp asimdhp asimd"
+        )
+
+    def test_targets_policies(self):
+        # 'keep_baseline', generate objects for baseline features
+        self.expect_targets(
+            """
+            /*@targets
+                $keep_baseline
+                sse2 sse42 avx2 avx512f
+                vsx2 vsx3
+                neon neon_vfpv4 asimd asimddp
+            */
+            """,
+            baseline="sse41 avx2 vsx2 asimd vsx3",
+            x86="avx512f avx2 sse42 sse2",
+            ppc64="vsx3 vsx2",
+            armhf="asimddp asimd neon_vfpv4 neon",
+            # neon, neon_vfpv4, asimd implies each other
+            aarch64="asimddp asimd"
+        )
+        # 'keep_sort', leave the sort as-is
+        self.expect_targets(
+            """
+            /*@targets
+                $keep_baseline $keep_sort
+                avx512f sse42 avx2 sse2
+                vsx2 vsx3
+                asimd neon neon_vfpv4 asimddp
+            */
+            """,
+            x86="avx512f sse42 avx2 sse2",
+            ppc64="vsx2 vsx3",
+            armhf="asimd neon neon_vfpv4 asimddp",
+            # neon, neon_vfpv4, asimd implies each other
+            aarch64="asimd asimddp"
+        )
+        # 'autovec', skipping features that can't be
+        # vectorized by the compiler
+        self.expect_targets(
+            """
+            /*@targets
+                $keep_baseline $keep_sort $autovec
+                avx512f avx2 sse42 sse41 sse2
+                vsx3 vsx2
+                asimddp asimd neon_vfpv4 neon
+            */
+            """,
+            x86_gcc="avx512f avx2 sse42 sse41 sse2",
+            x86_icc="avx512f avx2 sse42 sse41 sse2",
+            x86_iccw="avx512f avx2 sse42 sse41 sse2",
+            x86_msvc="avx512f avx2 sse2",
+            ppc64="vsx3 vsx2",
+            armhf="asimddp asimd neon_vfpv4 neon",
+            # neon, neon_vfpv4, asimd implies each other
+            aarch64="asimddp asimd"
+        )
+        for policy in ("$maxopt", "$autovec"):
+            # 'maxopt' and autovec set the max acceptable optimization flags
+            self.expect_target_flags(
+                "/*@targets baseline %s */" % policy,
+                gcc={"baseline":".*-O3.*"}, icc={"baseline":".*-O3.*"},
+                iccw={"baseline":".*/O3.*"}, msvc={"baseline":".*/O2.*"},
+                unknown={"baseline":".*"}
+            )
+
+        # 'werror', force compilers to treat warnings as errors
+        self.expect_target_flags(
+            "/*@targets baseline $werror */",
+            gcc={"baseline":".*-Werror.*"}, icc={"baseline":".*-Werror.*"},
+            iccw={"baseline":".*/Werror.*"}, msvc={"baseline":".*/WX.*"},
+            unknown={"baseline":".*"}
+        )
+
+    def test_targets_groups(self):
+        self.expect_targets(
+            """
+            /*@targets $keep_baseline baseline #test_group */
+            """,
+            groups=dict(
+                test_group=("""
+                    $keep_baseline
+                    asimddp sse2 vsx2 avx2 vsx3
+                    avx512f asimdhp
+                """)
+            ),
+            x86="avx512f avx2 sse2 baseline",
+            ppc64="vsx3 vsx2 baseline",
+            armhf="asimddp asimdhp baseline"
+        )
+        # test skip duplicating and sorting
+        self.expect_targets(
+            """
+            /*@targets
+             * sse42 avx avx512f
+             * #test_group_1
+             * vsx2
+             * #test_group_2
+             * asimddp asimdfhm
+            */
+            """,
+            groups=dict(
+                test_group_1=("""
+                    VSX2 vsx3 asimd avx2 SSE41
+                """),
+                test_group_2=("""
+                    vsx2 vsx3 asImd aVx2 sse41
+                """)
+            ),
+            x86="avx512f avx2 avx sse42 sse41",
+            ppc64="vsx3 vsx2",
+            # vsx2 part of the default baseline of ppc64le, option ("min")
+            ppc64le="vsx3",
+            armhf="asimdfhm asimddp asimd",
+            # asimd part of the default baseline of aarch64, option ("min")
+            aarch64="asimdfhm asimddp"
+        )
+
+    def test_targets_multi(self):
+        self.expect_targets(
+            """
+            /*@targets
+                (avx512_clx avx512_cnl) (asimdhp asimddp)
+            */
+            """,
+            x86=r"\(avx512_clx avx512_cnl\)",
+            armhf=r"\(asimdhp asimddp\)",
+        )
+        # test skipping implied features and auto-sort
+        self.expect_targets(
+            """
+            /*@targets
+                f16c (sse41 avx sse42) (sse3 avx2 avx512f)
+                vsx2 (vsx vsx3 vsx2)
+                (neon neon_vfpv4 asimd asimdhp asimddp)
+            */
+            """,
+            x86="avx512f f16c avx",
+            ppc64="vsx3 vsx2",
+            ppc64le="vsx3", # vsx2 part of baseline
+            armhf=r"\(asimdhp asimddp\)",
+        )
+        # test skipping implied features and keep sort
+        self.expect_targets(
+            """
+            /*@targets $keep_sort
+                (sse41 avx sse42) (sse3 avx2 avx512f)
+                (vsx vsx3 vsx2)
+                (asimddp neon neon_vfpv4 asimd asimdhp)
+            */
+            """,
+            x86="avx avx512f",
+            ppc64="vsx3",
+            armhf=r"\(asimdhp asimddp\)",
+        )
+        # test compiler variety and avoiding duplicating
+        self.expect_targets(
+            """
+            /*@targets $keep_sort
+                fma3 avx2 (fma3 avx2) (avx2 fma3) avx2 fma3
+            */
+            """,
+            x86_gcc=r"fma3 avx2 \(fma3 avx2\)",
+            x86_icc="avx2", x86_iccw="avx2",
+            x86_msvc="avx2"
+        )
+
+def new_test(arch, cc):
+    if is_standalone: return textwrap.dedent("""\
+    class TestCCompilerOpt_{class_name}(_Test_CCompilerOpt, unittest.TestCase):
+        arch = '{arch}'
+        cc   = '{cc}'
+        def __init__(self, methodName="runTest"):
+            unittest.TestCase.__init__(self, methodName)
+            self.setup()
+    """).format(
+        class_name=arch + '_' + cc, arch=arch, cc=cc
+    )
+    return textwrap.dedent("""\
+    class TestCCompilerOpt_{class_name}(_Test_CCompilerOpt):
+        arch = '{arch}'
+        cc   = '{cc}'
+    """).format(
+        class_name=arch + '_' + cc, arch=arch, cc=cc
+    )
+"""
+if 1 and is_standalone:
+    FakeCCompilerOpt.fake_info = "x86_icc"
+    cco = FakeCCompilerOpt(None, cpu_baseline="avx2")
+    print(' '.join(cco.cpu_baseline_names()))
+    print(cco.cpu_baseline_flags())
+    unittest.main()
+    sys.exit()
+"""
+for arch, compilers in arch_compilers.items():
+    for cc in compilers:
+        exec(new_test(arch, cc))
+
+if is_standalone:
+    unittest.main()
diff --git a/numpy/distutils/tests/test_ccompiler_opt_conf.py b/numpy/distutils/tests/test_ccompiler_opt_conf.py
new file mode 100644
index 000000000..2f83a59e0
--- /dev/null
+++ b/numpy/distutils/tests/test_ccompiler_opt_conf.py
@@ -0,0 +1,169 @@
+import unittest
+from os import sys, path
+
+is_standalone = __name__ == '__main__' and __package__ is None
+if is_standalone:
+    sys.path.append(path.abspath(path.join(path.dirname(__file__), "..")))
+    from ccompiler_opt import CCompilerOpt
+else:
+    from numpy.distutils.ccompiler_opt import CCompilerOpt
+
+arch_compilers = dict(
+    x86 = ("gcc", "clang", "icc", "iccw", "msvc"),
+    x64 = ("gcc", "clang", "icc", "iccw", "msvc"),
+    ppc64 = ("gcc", "clang"),
+    ppc64le = ("gcc", "clang"),
+    armhf = ("gcc", "clang"),
+    aarch64 = ("gcc", "clang"),
+    narch = ("gcc",)
+)
+
+class FakeCCompilerOpt(CCompilerOpt):
+    fake_info = ""
+    def __init__(self, *args, **kwargs):
+        CCompilerOpt.__init__(self, None, **kwargs)
+    def dist_compile(self, sources, flags, **kwargs):
+        return sources
+    def dist_info(self):
+        return FakeCCompilerOpt.fake_info
+    @staticmethod
+    def dist_log(*args, stderr=False):
+        pass
+
+class _TestConfFeatures(FakeCCompilerOpt):
+    """A hook to check the sanity of configured features
+    before it is called by the abstract class '_Feature'
+    """
+
+    def conf_features_partial(self):
+        conf_all = self.conf_features
+        for feature_name, feature in conf_all.items():
+            self.test_feature(
+                "attribute conf_features",
+                conf_all, feature_name, feature
+            )
+
+        conf_partial = FakeCCompilerOpt.conf_features_partial(self)
+        for feature_name, feature in conf_partial.items():
+            self.test_feature(
+                "conf_features_partial()",
+                conf_partial, feature_name, feature
+            )
+        return conf_partial
+
+    def test_feature(self, log, search_in, feature_name, feature_dict):
+        error_msg = (
+            "during validate '{}' within feature '{}', "
+            "march '{}' and compiler '{}'\n>> "
+        ).format(log, feature_name, self.cc_march, self.cc_name)
+
+        if not feature_name.isupper():
+            raise AssertionError(error_msg + "feature name must be in uppercase")
+
+        for option, val in feature_dict.items():
+            self.test_option_types(error_msg, option, val)
+            self.test_duplicates(error_msg, option, val)
+
+        self.test_implies(error_msg, search_in, feature_name, feature_dict)
+        self.test_group(error_msg, search_in, feature_name, feature_dict)
+
+    def test_option_types(self, error_msg, option, val):
+        """Check that `option` is a known name and `val` has a valid type."""
+        expected = None
+        for tp, available in (
+            ((str, list), (
+                "implies", "headers", "flags", "group", "detect"
+            )),
+            ((str,),  ("disable",)),
+            ((int,),  ("interest",)),
+            ((bool,), ("implies_detect",)),
+            ((bool, type(None)), ("autovec",)),
+        ):
+            if option in available:
+                expected = tp
+                break
+        if expected is None:
+            raise AssertionError(
+                error_msg + "invalid option name '%s'" % option
+            )
+        if isinstance(val, expected):
+            return
+        type_names = ' or '.join([t.__name__ for t in expected])
+        raise AssertionError(
+            error_msg + "expected '%s' type for option '%s' not '%s'" % (
+                type_names, option, type(val).__name__
+        ))
+
+    def test_duplicates(self, error_msg, option, val):
+        """Raise AssertionError if a multi-value option repeats a value."""
+        multi_value = ("implies", "headers", "flags", "group", "detect")
+        if option not in multi_value:
+            return
+        # string values are whitespace-separated lists
+        items = val.split() if isinstance(val, str) else val
+        if len(items) == len(set(items)):
+            return
+        raise AssertionError(
+            error_msg + "duplicated values in option '%s'" % option
+        )
+
+    def test_implies(self, error_msg, search_in, feature_name, feature_dict):
+        """Validate option 'implies' of a single feature.
+
+        Raises AssertionError, prefixed with `error_msg`, when the feature
+        implies itself, a disabled feature, or a name missing in `search_in`.
+        Features that are themselves disabled are skipped.
+        """
+        # the option key is "disable"; "disabled" never matched anything
+        if feature_dict.get("disable") is not None:
+            return
+        implies = feature_dict.get("implies", "")
+        if not implies:
+            return
+        if isinstance(implies, str):
+            implies = implies.split()
+        if feature_name in implies:
+            raise AssertionError(error_msg + "feature implies itself")
+        for impl in implies:
+            impl_dict = search_in.get(impl)
+            if impl_dict is None:
+                raise AssertionError(
+                    error_msg + "implies non-exist feature '%s'" % impl)
+            if "disable" in impl_dict:
+                raise AssertionError(
+                    error_msg + "implies disabled feature '%s'" % impl)
+
+    def test_group(self, error_msg, search_in, feature_name, feature_dict):
+        """Reject 'group' entries that clash with a non-disabled feature."""
+        # the option key is "disable"; "disabled" never matched anything
+        if feature_dict.get("disable") is not None:
+            return
+        group = feature_dict.get("group", "")
+        if not group:
+            return
+        if isinstance(group, str):
+            group = group.split()
+        for f in group:
+            impl_dict = search_in.get(f)
+            if not impl_dict or "disable" in impl_dict:
+                continue
+            # 'option' was undefined here; the option is always 'group'
+            raise AssertionError(error_msg +
+                "in option 'group', '%s' already exists as a feature name" % f)
+
+class TestConfFeatures(unittest.TestCase):
+    def __init__(self, methodName="runTest"):
+        unittest.TestCase.__init__(self, methodName)
+        self.setup()
+
+    def setup(self):
+        FakeCCompilerOpt.conf_nocache = True
+
+    def test_features(self):
+        for arch, compilers in arch_compilers.items():
+            for cc in compilers:
+                FakeCCompilerOpt.fake_info = arch + cc
+                _TestConfFeatures()
+
+if is_standalone:
+    unittest.main()
diff --git a/numpy/f2py/rules.py b/numpy/f2py/rules.py
index 7b25b545a..56f2033ff 100755
--- a/numpy/f2py/rules.py
+++ b/numpy/f2py/rules.py
@@ -55,6 +55,9 @@ __version__ = "$Revision: 1.129 $"[10:-1]
 from . import __version__
 f2py_version = __version__.version
 
+from .. import version as _numpy_version
+numpy_version = _numpy_version.version
+
 import os
 import time
 import copy
@@ -206,6 +209,9 @@ PyMODINIT_FUNC PyInit_#modulename#(void) {
 \t\t\"This module '#modulename#' is auto-generated with f2py (version:#f2py_version#).\\nFunctions:\\n\"\n#docs#\".\");
 \tPyDict_SetItemString(d, \"__doc__\", s);
 \tPy_DECREF(s);
+\ts = PyUnicode_FromString(\"""" + numpy_version + """\");
+\tPyDict_SetItemString(d, \"__f2py_numpy_version__\", s);
+\tPy_DECREF(s);
 \t#modulename#_error = PyErr_NewException (\"#modulename#.error\", NULL, NULL);
 \t/*
 \t * Store the error object inside the dict, so that it could get deallocated.
diff --git a/numpy/f2py/tests/test_regression.py b/numpy/f2py/tests/test_regression.py
index 67e00f1f7..a1b772069 100644
--- a/numpy/f2py/tests/test_regression.py
+++ b/numpy/f2py/tests/test_regression.py
@@ -2,7 +2,7 @@ import os
 import pytest
 
 import numpy as np
-from numpy.testing import assert_raises, assert_equal
+from numpy.testing import assert_, assert_raises, assert_equal, assert_string_equal
 
 from . import util
 
@@ -25,3 +25,23 @@ class TestIntentInOut(util.F2PyTest):
         x = np.arange(3, dtype=np.float32)
         self.module.foo(x)
         assert_equal(x, [3, 1, 2])
+ 
+
+class TestNumpyVersionAttribute(util.F2PyTest):
+    # Check that the attribute __f2py_numpy_version__ is present
+    # in the compiled module and that it has the value np.__version__.
+    sources = [_path('src', 'regression', 'inout.f90')]
+    
+    @pytest.mark.slow
+    def test_numpy_version_attribute(self):
+        
+        # Check that self.module has an attribute named "__f2py_numpy_version__"
+        assert_(hasattr(self.module, "__f2py_numpy_version__"), 
+                msg="Fortran module does not have __f2py_numpy_version__")
+        
+        # Check that the attribute __f2py_numpy_version__ is a string
+        assert_(isinstance(self.module.__f2py_numpy_version__, str),
+                msg="__f2py_numpy_version__ is not a string")
+        
+        # Check that __f2py_numpy_version__ has the value numpy.__version__
+        assert_string_equal(np.__version__, self.module.__f2py_numpy_version__)
diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py
index f5a548433..6d6222d3e 100644
--- a/numpy/lib/npyio.py
+++ b/numpy/lib/npyio.py
@@ -784,6 +784,7 @@ def _getconv(dtype):
     else:
         return asstr
 
+
 # amount of lines loadtxt reads in one chunk, can be overridden for testing
 _loadtxt_chunksize = 50000
 
@@ -914,68 +915,10 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
            [ 19.22,  64.31],
            [-17.57,  63.94]])
     """
-    # Type conversions for Py3 convenience
-    if comments is not None:
-        if isinstance(comments, (str, bytes)):
-            comments = [comments]
-        comments = [_decode_line(x) for x in comments]
-        # Compile regex for comments beforehand
-        comments = (re.escape(comment) for comment in comments)
-        regex_comments = re.compile('|'.join(comments))
-
-    if delimiter is not None:
-        delimiter = _decode_line(delimiter)
-
-    user_converters = converters
-
-    if encoding == 'bytes':
-        encoding = None
-        byte_converters = True
-    else:
-        byte_converters = False
-
-    if usecols is not None:
-        # Allow usecols to be a single int or a sequence of ints
-        try:
-            usecols_as_list = list(usecols)
-        except TypeError:
-            usecols_as_list = [usecols]
-        for col_idx in usecols_as_list:
-            try:
-                opindex(col_idx)
-            except TypeError as e:
-                e.args = (
-                    "usecols must be an int or a sequence of ints but "
-                    "it contains at least one element of type %s" %
-                    type(col_idx),
-                    )
-                raise
-        # Fall back to existing code
-        usecols = usecols_as_list
-
-    fown = False
-    try:
-        if isinstance(fname, os_PathLike):
-            fname = os_fspath(fname)
-        if _is_string_like(fname):
-            fh = np.lib._datasource.open(fname, 'rt', encoding=encoding)
-            fencoding = getattr(fh, 'encoding', 'latin1')
-            fh = iter(fh)
-            fown = True
-        else:
-            fh = iter(fname)
-            fencoding = getattr(fname, 'encoding', 'latin1')
-    except TypeError:
-        raise ValueError('fname must be a string, file handle, or generator')
 
-    # input may be a python2 io stream
-    if encoding is not None:
-        fencoding = encoding
-    # we must assume local encoding
-    # TODO emit portability warning?
-    elif fencoding is None:
-        import locale
-        fencoding = locale.getpreferredencoding()
+    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+    # Nested functions used by loadtxt.
+    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 
     # not to be confused with the flatten_dtype we import...
     @recursive
@@ -1075,11 +1018,84 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
         if X:
             yield X
 
+    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+    # Main body of loadtxt.
+    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+    # Check correctness of the values of `ndmin`
+    if ndmin not in [0, 1, 2]:
+        raise ValueError('Illegal value of ndmin keyword: %s' % ndmin)
+
+    # Type conversions for Py3 convenience
+    if comments is not None:
+        if isinstance(comments, (str, bytes)):
+            comments = [comments]
+        comments = [_decode_line(x) for x in comments]
+        # Compile regex for comments beforehand
+        comments = (re.escape(comment) for comment in comments)
+        regex_comments = re.compile('|'.join(comments))
+
+    if delimiter is not None:
+        delimiter = _decode_line(delimiter)
+
+    user_converters = converters
+
+    if encoding == 'bytes':
+        encoding = None
+        byte_converters = True
+    else:
+        byte_converters = False
+
+    if usecols is not None:
+        # Allow usecols to be a single int or a sequence of ints
+        try:
+            usecols_as_list = list(usecols)
+        except TypeError:
+            usecols_as_list = [usecols]
+        for col_idx in usecols_as_list:
+            try:
+                opindex(col_idx)
+            except TypeError as e:
+                e.args = (
+                    "usecols must be an int or a sequence of ints but "
+                    "it contains at least one element of type %s" %
+                    type(col_idx),
+                    )
+                raise
+        # Fall back to existing code
+        usecols = usecols_as_list
+
+    # Make sure we're dealing with a proper dtype
+    dtype = np.dtype(dtype)
+    defconv = _getconv(dtype)
+
+    dtype_types, packing = flatten_dtype_internal(dtype)
+
+    fown = False
     try:
-        # Make sure we're dealing with a proper dtype
-        dtype = np.dtype(dtype)
-        defconv = _getconv(dtype)
+        if isinstance(fname, os_PathLike):
+            fname = os_fspath(fname)
+        if _is_string_like(fname):
+            fh = np.lib._datasource.open(fname, 'rt', encoding=encoding)
+            fencoding = getattr(fh, 'encoding', 'latin1')
+            fh = iter(fh)
+            fown = True
+        else:
+            fh = iter(fname)
+            fencoding = getattr(fname, 'encoding', 'latin1')
+    except TypeError:
+        raise ValueError('fname must be a string, file handle, or generator')
 
+    # input may be a python2 io stream
+    if encoding is not None:
+        fencoding = encoding
+    # we must assume local encoding
+    # TODO emit portability warning?
+    elif fencoding is None:
+        import locale
+        fencoding = locale.getpreferredencoding()
+
+    try:
         # Skip the first `skiprows` lines
         for i in range(skiprows):
             next(fh)
@@ -1095,10 +1111,12 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
             # End of lines reached
             first_line = ''
             first_vals = []
-            warnings.warn('loadtxt: Empty input file: "%s"' % fname, stacklevel=2)
+            warnings.warn('loadtxt: Empty input file: "%s"' % fname,
+                          stacklevel=2)
         N = len(usecols or first_vals)
 
-        dtype_types, packing = flatten_dtype_internal(dtype)
+        # Now that we know N, create the default converters list, and
+        # set packing, if necessary.
         if len(dtype_types) > 1:
             # We're dealing with a structured array, each field of
             # the dtype matches a column
@@ -1118,8 +1136,9 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
                     # Unused converter specified
                     continue
             if byte_converters:
-                # converters may use decode to workaround numpy's old behaviour,
-                # so encode the string again before passing to the user converter
+                # converters may use decode to workaround numpy's old
+                # behaviour, so encode the string again before passing to
+                # the user converter
                 def tobytes_first(x, conv):
                     if type(x) is bytes:
                         return conv(x)
@@ -1158,9 +1177,6 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
         X.shape = (1, -1)
 
     # Verify that the array has at least dimensions `ndmin`.
-    # Check correctness of the values of `ndmin`
-    if ndmin not in [0, 1, 2]:
-        raise ValueError('Illegal value of ndmin keyword: %s' % ndmin)
     # Tweak the size and shape of the arrays - remove extraneous dimensions
     if X.ndim > ndmin:
         X = np.squeeze(X)
diff --git a/numpy/lib/tests/test_nanfunctions.py b/numpy/lib/tests/test_nanfunctions.py
index db563e30c..e0f723a3c 100644
--- a/numpy/lib/tests/test_nanfunctions.py
+++ b/numpy/lib/tests/test_nanfunctions.py
@@ -957,7 +957,7 @@ def test__replace_nan():
     """ Test that _replace_nan returns the original array if there are no
     NaNs, not a copy.
     """
-    for dtype in [np.bool, np.int32, np.int64]:
+    for dtype in [np.bool_, np.int32, np.int64]:
         arr = np.array([0, 1], dtype=dtype)
         result, mask = _replace_nan(arr, 0)
         assert mask is None
diff --git a/numpy/ma/core.py b/numpy/ma/core.py
index 8d612b8ed..b5371f51a 100644
--- a/numpy/ma/core.py
+++ b/numpy/ma/core.py
@@ -21,6 +21,7 @@ Released for unlimited redistribution.
 """
 # pylint: disable-msg=E1002
 import builtins
+import inspect
 import operator
 import warnings
 import textwrap
@@ -122,15 +123,8 @@ def doc_note(initialdoc, note):
     if note is None:
         return initialdoc
 
-    notesplit = re.split(r'\n\s*?Notes\n\s*?-----', initialdoc)
-
-    notedoc = """\
-Notes
-    -----
-    %s""" % note
-
-    if len(notesplit) > 1:
-        notedoc = '\n\n    ' + notedoc + '\n'
+    notesplit = re.split(r'\n\s*?Notes\n\s*?-----', inspect.cleandoc(initialdoc))
+    notedoc = "\n\nNotes\n-----\n%s\n" % inspect.cleandoc(note)
 
     return ''.join(notesplit[:1] + [notedoc] + notesplit[1:])
 
diff --git a/numpy/ma/extras.py b/numpy/ma/extras.py
index f86ebf551..8ede29da1 100644
--- a/numpy/ma/extras.py
+++ b/numpy/ma/extras.py
@@ -244,11 +244,6 @@ class _fromnxfunction:
         the new masked array version of the function. A note on application
         of the function to the mask is appended.
 
-        .. warning::
-          If the function docstring already contained a Notes section, the
-          new docstring will have two Notes sections instead of appending a note
-          to the existing section.
-
         Parameters
         ----------
         None
@@ -258,9 +253,9 @@ class _fromnxfunction:
         doc = getattr(npfunc, '__doc__', None)
         if doc:
             sig = self.__name__ + ma.get_object_signature(npfunc)
-            locdoc = "Notes\n-----\nThe function is applied to both the _data"\
-                     " and the _mask, if any."
-            return '\n'.join((sig, doc, locdoc))
+            doc = ma.doc_note(doc, "The function is applied to both the _data "
+                                   "and the _mask, if any.")
+            return '\n\n'.join((sig, doc))
         return
 
     def __call__(self, *args, **params):
diff --git a/numpy/ma/tests/test_core.py b/numpy/ma/tests/test_core.py
index 6f34144bb..76a92f5ca 100644
--- a/numpy/ma/tests/test_core.py
+++ b/numpy/ma/tests/test_core.py
@@ -34,8 +34,8 @@ from numpy.ma.core import (
     MAError, MaskError, MaskType, MaskedArray, abs, absolute, add, all,
     allclose, allequal, alltrue, angle, anom, arange, arccos, arccosh, arctan2,
     arcsin, arctan, argsort, array, asarray, choose, concatenate,
-    conjugate, cos, cosh, count, default_fill_value, diag, divide, empty,
-    empty_like, equal, exp, flatten_mask, filled, fix_invalid,
+    conjugate, cos, cosh, count, default_fill_value, diag, divide, doc_note,
+    empty, empty_like, equal, exp, flatten_mask, filled, fix_invalid,
     flatten_structured_array, fromflex, getmask, getmaskarray, greater,
     greater_equal, identity, inner, isMaskedArray, less, less_equal, log,
     log10, make_mask, make_mask_descr, mask_or, masked, masked_array,
@@ -5283,3 +5283,33 @@ def test_mask_shape_assignment_does_not_break_masked():
     b = np.ma.array(1, mask=a.mask)
     b.shape = (1,)
     assert_equal(a.mask.shape, ())
+
+@pytest.mark.skipif(sys.flags.optimize > 1,
+                    reason="no docstrings present to inspect when PYTHONOPTIMIZE/Py_OptimizeFlag > 1")
+def test_doc_note():
+    def method(self):
+        """This docstring
+
+        Has multiple lines
+
+        And notes
+
+        Notes
+        -----
+        original note
+        """
+        pass
+
+    expected_doc = """This docstring
+
+Has multiple lines
+
+And notes
+
+Notes
+-----
+note
+
+original note"""
+
+    assert_equal(np.ma.core.doc_note(method.__doc__, "note"), expected_doc)
diff --git a/numpy/polynomial/_polybase.py b/numpy/polynomial/_polybase.py
index 30887b670..b5341ba37 100644
--- a/numpy/polynomial/_polybase.py
+++ b/numpy/polynomial/_polybase.py
@@ -919,10 +919,8 @@ class ABCPolyBase(abc.ABC):
         ----------
         x : array_like, shape (M,)
             x-coordinates of the M sample points ``(x[i], y[i])``.
-        y : array_like, shape (M,) or (M, K)
-            y-coordinates of the sample points. Several data sets of sample
-            points sharing the same x-coordinates can be fitted at once by
-            passing in a 2D-array that contains one dataset per column.
+        y : array_like, shape (M,)
+            y-coordinates of the M sample points ``(x[i], y[i])``.
         deg : int or 1-D array_like
             Degree(s) of the fitting polynomials. If `deg` is a single integer
             all terms up to and including the `deg`'th term are included in the
diff --git a/numpy/random/bit_generator.pyx b/numpy/random/bit_generator.pyx
index f145ec13d..3c52a9933 100644
--- a/numpy/random/bit_generator.pyx
+++ b/numpy/random/bit_generator.pyx
@@ -382,13 +382,22 @@ cdef class SeedSequence():
         -------
         entropy_array : 1D uint32 array
         """
-        # Convert run-entropy, program-entropy, and the spawn key into uint32
+        # Convert run-entropy and the spawn key into uint32
         # arrays and concatenate them.
 
         # We MUST have at least some run-entropy. The others are optional.
         assert self.entropy is not None
         run_entropy = _coerce_to_uint32_array(self.entropy)
         spawn_entropy = _coerce_to_uint32_array(self.spawn_key)
+        if len(spawn_entropy) > 0 and len(run_entropy) < self.pool_size:
+            # Explicitly fill out the entropy with 0s to the pool size to avoid
+            # conflict with spawn keys. We changed this in 1.19.0 to fix
+            # gh-16539. In order to preserve stream-compatibility with
+            # unspawned SeedSequences with small entropy inputs, we only do
+            # this when a spawn_key is specified.
+            diff = self.pool_size - len(run_entropy)
+            run_entropy = np.concatenate(
+                [run_entropy, np.zeros(diff, dtype=np.uint32)])
         entropy_array = np.concatenate([run_entropy, spawn_entropy])
         return entropy_array
 
diff --git a/numpy/random/tests/test_seed_sequence.py b/numpy/random/tests/test_seed_sequence.py
index fe23680ed..f08cf80fa 100644
--- a/numpy/random/tests/test_seed_sequence.py
+++ b/numpy/random/tests/test_seed_sequence.py
@@ -1,5 +1,5 @@
 import numpy as np
-from numpy.testing import assert_array_equal
+from numpy.testing import assert_array_equal, assert_array_compare
 
 from numpy.random import SeedSequence
 
@@ -52,3 +52,29 @@ def test_reference_data():
         assert_array_equal(state, expected)
         state64 = ss.generate_state(len(expected64), dtype=np.uint64)
         assert_array_equal(state64, expected64)
+
+
+def test_zero_padding():
+    """ Ensure that the implicit zero-padding does not cause problems.
+    """
+    # Ensure that large integers are inserted in little-endian fashion to avoid
+    # trailing 0s.
+    ss0 = SeedSequence(42)
+    ss1 = SeedSequence(42 << 32)
+    assert_array_compare(
+        np.not_equal,
+        ss0.generate_state(4),
+        ss1.generate_state(4))
+
+    # Ensure backwards compatibility with the original 1.17 release for small
+    # integers and no spawn key.
+    expected42 = np.array([3444837047, 2669555309, 2046530742, 3581440988],
+                          dtype=np.uint32)
+    assert_array_equal(SeedSequence(42).generate_state(4), expected42)
+
+    # Regression test for gh-16539 to ensure that the implicit 0s don't
+    # conflict with spawn keys.
+    assert_array_compare(
+        np.not_equal,
+        SeedSequence(42, spawn_key=(0,)).generate_state(4),
+        expected42)
diff --git a/numpy/setup.py b/numpy/setup.py
index c6498d101..cbf633504 100644
--- a/numpy/setup.py
+++ b/numpy/setup.py
@@ -17,6 +17,7 @@ def configuration(parent_package='',top_path=None):
     config.add_subpackage('polynomial')
     config.add_subpackage('random')
     config.add_subpackage('testing')
+    config.add_subpackage('typing')
     config.add_data_dir('doc')
     config.add_data_files('py.typed')
     config.add_data_files('*.pyi')
diff --git a/numpy/testing/_private/utils.py b/numpy/testing/_private/utils.py
index ef623255b..3827b7505 100644
--- a/numpy/testing/_private/utils.py
+++ b/numpy/testing/_private/utils.py
@@ -719,6 +719,8 @@ def assert_array_compare(comparison, x, y, err_msg='', verbose=True,
         at the same locations.
 
         """
+        __tracebackhide__ = True  # Hide traceback for py.test
+
         x_id = func(x)
         y_id = func(y)
         # We include work-arounds here to handle three types of slightly
diff --git a/numpy/testing/tests/test_utils.py b/numpy/testing/tests/test_utils.py
index b899e94f4..6a6cc664a 100644
--- a/numpy/testing/tests/test_utils.py
+++ b/numpy/testing/tests/test_utils.py
@@ -941,6 +941,17 @@ class TestArrayAlmostEqualNulp:
         assert_raises(AssertionError, assert_array_almost_equal_nulp,
                       x, y, nulp)
 
+    def test_float64_ignore_nan(self):
+        # Ignore ULP differences between various NaNs
+        # Note that MIPS may reverse quiet and signaling nans
+        # so we use the builtin version as a base.
+        offset = np.uint64(0xffffffff)
+        nan1_i64 = np.array(np.nan, dtype=np.float64).view(np.uint64)
+        nan2_i64 = nan1_i64 ^ offset  # nan payload on MIPS is all ones.
+        nan1_f64 = nan1_i64.view(np.float64)
+        nan2_f64 = nan2_i64.view(np.float64)
+        assert_array_max_ulp(nan1_f64, nan2_f64, 0)
+
     def test_float32_pass(self):
         nulp = 5
         x = np.linspace(-20, 20, 50, dtype=np.float32)
@@ -971,6 +982,17 @@ class TestArrayAlmostEqualNulp:
         assert_raises(AssertionError, assert_array_almost_equal_nulp,
                       x, y, nulp)
 
+    def test_float32_ignore_nan(self):
+        # Ignore ULP differences between various NaNs
+        # Note that MIPS may reverse quiet and signaling nans
+        # so we use the builtin version as a base.
+        offset = np.uint32(0xffff)
+        nan1_i32 = np.array(np.nan, dtype=np.float32).view(np.uint32)
+        nan2_i32 = nan1_i32 ^ offset  # nan payload on MIPS is all ones.
+        nan1_f32 = nan1_i32.view(np.float32)
+        nan2_f32 = nan2_i32.view(np.float32)
+        assert_array_max_ulp(nan1_f32, nan2_f32, 0)
+
     def test_float16_pass(self):
         nulp = 5
         x = np.linspace(-4, 4, 10, dtype=np.float16)
@@ -1001,6 +1023,17 @@ class TestArrayAlmostEqualNulp:
         assert_raises(AssertionError, assert_array_almost_equal_nulp,
                       x, y, nulp)
 
+    def test_float16_ignore_nan(self):
+        # Ignore ULP differences between various NaNs
+        # Note that MIPS may reverse quiet and signaling nans
+        # so we use the builtin version as a base.
+        offset = np.uint16(0xff)
+        nan1_i16 = np.array(np.nan, dtype=np.float16).view(np.uint16)
+        nan2_i16 = nan1_i16 ^ offset  # nan payload on MIPS is all ones.
+        nan1_f16 = nan1_i16.view(np.float16)
+        nan2_f16 = nan2_i16.view(np.float16)
+        assert_array_max_ulp(nan1_f16, nan2_f16, 0)
+
     def test_complex128_pass(self):
         nulp = 5
         x = np.linspace(-20, 20, 50, dtype=np.float64)
diff --git a/numpy/tests/test_public_api.py b/numpy/tests/test_public_api.py
index 7ce74bc43..cc4c5d8c5 100644
--- a/numpy/tests/test_public_api.py
+++ b/numpy/tests/test_public_api.py
@@ -54,18 +54,22 @@ def test_numpy_namespace():
         'show_config': 'numpy.__config__.show',
         'who': 'numpy.lib.utils.who',
     }
-    # These built-in types are re-exported by numpy.
-    builtins = {
-        'bool': 'builtins.bool',
-        'complex': 'builtins.complex',
-        'float': 'builtins.float',
-        'int': 'builtins.int',
-        'long': 'builtins.int',
-        'object': 'builtins.object',
-        'str': 'builtins.str',
-        'unicode': 'builtins.str',
-    }
-    whitelist = dict(undocumented, **builtins)
+    if sys.version_info < (3, 7):
+        # These built-in types are re-exported by numpy.
+        builtins = {
+            'bool': 'builtins.bool',
+            'complex': 'builtins.complex',
+            'float': 'builtins.float',
+            'int': 'builtins.int',
+            'long': 'builtins.int',
+            'object': 'builtins.object',
+            'str': 'builtins.str',
+            'unicode': 'builtins.str',
+        }
+        whitelist = dict(undocumented, **builtins)
+    else:
+        # on Python 3.7 and later, we override dir to not show these members
+        whitelist = undocumented
     bad_results = check_dir(np)
     # pytest gives better error messages with the builtin assert than with
     # assert_equal
@@ -98,7 +102,7 @@ def test_dir_testing():
     """Assert that output of dir has only one "testing/tester"
     attribute without duplicate"""
     assert len(dir(np)) == len(set(dir(np)))
-    
+
 
 def test_numpy_linalg():
     bad_results = check_dir(np.linalg)
@@ -176,6 +180,7 @@ PUBLIC_MODULES = ['numpy.' + s for s in [
     "polynomial.polyutils",
     "random",
     "testing",
+    "typing",
     "version",
 ]]
 
@@ -209,6 +214,7 @@ PRIVATE_BUT_PRESENT_MODULES = ['numpy.' + s for s in [
     "core.umath",
     "core.umath_tests",
     "distutils.ccompiler",
+    'distutils.ccompiler_opt',
     "distutils.command",
     "distutils.command.autodist",
     "distutils.command.bdist_rpm",
diff --git a/numpy/tests/typing/fail/array_like.py b/numpy/tests/typing/fail/array_like.py
index a5ef5795f..a97e72dc7 100644
--- a/numpy/tests/typing/fail/array_like.py
+++ b/numpy/tests/typing/fail/array_like.py
@@ -1,11 +1,5 @@
-from typing import Any, TYPE_CHECKING
-
 import numpy as np
-
-if TYPE_CHECKING:
-    from numpy.typing import ArrayLike
-else:
-    ArrayLike = Any
+from numpy.typing import ArrayLike
 
 
 class A:
diff --git a/numpy/tests/typing/fail/fromnumeric.py b/numpy/tests/typing/fail/fromnumeric.py
index f158a1071..7455ce722 100644
--- a/numpy/tests/typing/fail/fromnumeric.py
+++ b/numpy/tests/typing/fail/fromnumeric.py
@@ -22,11 +22,9 @@ np.choose(A, mode="bob")  # E: No overload variant of "choose" matches argument
 np.repeat(a, None)  # E: Argument 2 to "repeat" has incompatible type
 np.repeat(A, 1, axis=1.0)  # E: Argument "axis" to "repeat" has incompatible type
 
-np.swapaxes(a, 0, 0)  # E: Argument 1 to "swapaxes" has incompatible type
 np.swapaxes(A, None, 1)  # E: Argument 2 to "swapaxes" has incompatible type
 np.swapaxes(A, 1, [0])  # E: Argument 3 to "swapaxes" has incompatible type
 
-np.transpose(a, axes=1)  # E: Argument "axes" to "transpose" has incompatible type
 np.transpose(A, axes=1.0)  # E: Argument "axes" to "transpose" has incompatible type
 
 np.partition(a, None)  # E: Argument 2 to "partition" has incompatible type
@@ -53,25 +51,20 @@ np.argpartition(
     A, 0, order=range(5)  # E: Argument "order" to "argpartition" has incompatible type
 )
 
-np.sort(a)  # E: Argument 1 to "sort" has incompatible type
 np.sort(A, axis="bob")  # E: Argument "axis" to "sort" has incompatible type
 np.sort(A, kind="bob")  # E: Argument "kind" to "sort" has incompatible type
 np.sort(A, order=range(5))  # E: Argument "order" to "sort" has incompatible type
 
-np.argsort(a)  # E: Argument 1 to "argsort" has incompatible type
 np.argsort(A, axis="bob")  # E: Argument "axis" to "argsort" has incompatible type
 np.argsort(A, kind="bob")  # E: Argument "kind" to "argsort" has incompatible type
 np.argsort(A, order=range(5))  # E: Argument "order" to "argsort" has incompatible type
 
-np.argmax(a)  # E: No overload variant of "argmax" matches argument type
 np.argmax(A, axis="bob")  # E: No overload variant of "argmax" matches argument type
 np.argmax(A, kind="bob")  # E: No overload variant of "argmax" matches argument type
 
-np.argmin(a)  # E: No overload variant of "argmin" matches argument type
 np.argmin(A, axis="bob")  # E: No overload variant of "argmin" matches argument type
 np.argmin(A, kind="bob")  # E: No overload variant of "argmin" matches argument type
 
-np.searchsorted(a, 0)  # E: No overload variant of "searchsorted" matches argument type
 np.searchsorted(  # E: No overload variant of "searchsorted" matches argument type
     A[0], 0, side="bob"
 )
@@ -83,19 +76,16 @@ np.resize(A, 1.0)  # E: Argument 2 to "resize" has incompatible type
 
 np.squeeze(A, 1.0)  # E: No overload variant of "squeeze" matches argument type
 
-np.diagonal(a)  # E: Argument 1 to "diagonal" has incompatible type
 np.diagonal(A, offset=None)  # E: Argument "offset" to "diagonal" has incompatible type
 np.diagonal(A, axis1="bob")  # E: Argument "axis1" to "diagonal" has incompatible type
 np.diagonal(A, axis2=[])  # E: Argument "axis2" to "diagonal" has incompatible type
 
-np.trace(a)  # E: Argument 1 to "trace" has incompatible type
 np.trace(A, offset=None)  # E: Argument "offset" to "trace" has incompatible type
 np.trace(A, axis1="bob")  # E: Argument "axis1" to "trace" has incompatible type
 np.trace(A, axis2=[])  # E: Argument "axis2" to "trace" has incompatible type
 
 np.ravel(a, order="bob")  # E: Argument "order" to "ravel" has incompatible type
 
-np.compress(True, A)  # E: Argument 1 to "compress" has incompatible type
 np.compress(
     [True], A, axis=1.0  # E: Argument "axis" to "compress" has incompatible type
 )
diff --git a/numpy/tests/typing/fail/scalars.py b/numpy/tests/typing/fail/scalars.py
index 0dfc55124..5d7221895 100644
--- a/numpy/tests/typing/fail/scalars.py
+++ b/numpy/tests/typing/fail/scalars.py
@@ -65,3 +65,17 @@ np.floating(1)  # E: Cannot instantiate abstract class
 np.complexfloating(1)  # E: Cannot instantiate abstract class
 np.character("test")  # E: Cannot instantiate abstract class
 np.flexible(b"test")  # E: Cannot instantiate abstract class
+
+np.float64(value=0.0)  # E: Unexpected keyword argument
+np.int64(value=0)  # E: Unexpected keyword argument
+np.uint64(value=0)  # E: Unexpected keyword argument
+np.complex128(value=0.0j)  # E: Unexpected keyword argument
+np.str_(value='bob')  # E: No overload variant
+np.bytes_(value=b'test')  # E: No overload variant
+np.void(value=b'test')  # E: Unexpected keyword argument
+np.bool_(value=True)  # E: Unexpected keyword argument
+np.datetime64(value="2019")  # E: No overload variant
+np.timedelta64(value=0)  # E: Unexpected keyword argument
+
+np.bytes_(b"hello", encoding='utf-8')  # E: No overload variant
+np.str_("hello", encoding='utf-8')  # E: No overload variant
diff --git a/numpy/tests/typing/pass/array_like.py b/numpy/tests/typing/pass/array_like.py
index 098149c4b..e668b4963 100644
--- a/numpy/tests/typing/pass/array_like.py
+++ b/numpy/tests/typing/pass/array_like.py
@@ -1,13 +1,7 @@
-from typing import Any, List, Optional, TYPE_CHECKING
+from typing import Any, List, Optional
 
 import numpy as np
-
-if TYPE_CHECKING:
-    from numpy.typing import ArrayLike, DtypeLike, _SupportsArray
-else:
-    ArrayLike = Any
-    DtypeLike = Any
-    _SupportsArray = Any
+from numpy.typing import ArrayLike, DtypeLike, _SupportsArray
 
 x1: ArrayLike = True
 x2: ArrayLike = 5
diff --git a/numpy/tests/typing/pass/dtype.py b/numpy/tests/typing/pass/dtype.py
new file mode 100644
index 000000000..f954fdd44
--- /dev/null
+++ b/numpy/tests/typing/pass/dtype.py
@@ -0,0 +1,3 @@
+import numpy as np
+
+np.dtype(dtype=np.int64)
diff --git a/numpy/tests/typing/pass/scalars.py b/numpy/tests/typing/pass/scalars.py
index bd055673b..7de182626 100644
--- a/numpy/tests/typing/pass/scalars.py
+++ b/numpy/tests/typing/pass/scalars.py
@@ -34,7 +34,11 @@ np.float32(16)
 np.float64(3.0)
 
 np.bytes_(b"hello")
+np.bytes_("hello", 'utf-8')
+np.bytes_("hello", encoding='utf-8')
 np.str_("hello")
+np.str_(b"hello", 'utf-8')
+np.str_(b"hello", encoding='utf-8')
 
 # Protocols
 float(np.int8(4))
diff --git a/numpy/typing/__init__.py b/numpy/typing/__init__.py
new file mode 100644
index 000000000..f2000823f
--- /dev/null
+++ b/numpy/typing/__init__.py
@@ -0,0 +1,81 @@
+"""
+============================
+Typing (:mod:`numpy.typing`)
+============================
+
+.. warning::
+
+  Some of the types in this module rely on features only present in
+  the standard library in Python 3.8 and greater. If you want to use
+  these types in earlier versions of Python, you should install the
+  typing-extensions_ package.
+
+Large parts of the NumPy API have PEP-484-style type annotations. In
+addition, the following type aliases are available for users.
+
+- ``typing.ArrayLike``: objects that can be converted to arrays
+- ``typing.DtypeLike``: objects that can be converted to dtypes
+
+Roughly speaking, ``typing.ArrayLike`` is "objects that can be used as
+inputs to ``np.array``" and ``typing.DtypeLike`` is "objects that can
+be used as inputs to ``np.dtype``".
+
+.. _typing-extensions: https://pypi.org/project/typing-extensions/
+
+Differences from the runtime NumPy API
+--------------------------------------
+
+NumPy is very flexible. Trying to describe the full range of
+possibilities statically would result in types that are not very
+helpful. For that reason, the typed NumPy API is often stricter than
+the runtime NumPy API. This section describes some notable
+differences.
+
+ArrayLike
+~~~~~~~~~
+
+The ``ArrayLike`` type tries to avoid creating object arrays. For
+example,
+
+.. code-block:: python
+
+    >>> np.array(x**2 for x in range(10))
+    array(<generator object <genexpr> at 0x10c004cd0>, dtype=object)
+
+is valid NumPy code which will create a 0-dimensional object
+array. Type checkers will complain about the above example when using
+the NumPy types however. If you really intended to do the above, then
+you can either use a ``# type: ignore`` comment:
+
+.. code-block:: python
+
+    >>> np.array(x**2 for x in range(10))  # type: ignore
+
+or explicitly type the array like object as ``Any``:
+
+.. code-block:: python
+
+    >>> from typing import Any
+    >>> array_like: Any = (x**2 for x in range(10))
+    >>> np.array(array_like)
+    array(<generator object <genexpr> at 0x1192741d0>, dtype=object)
+
+ndarray
+~~~~~~~
+
+It's possible to mutate the dtype of an array at runtime. For example,
+the following code is valid:
+
+.. code-block:: python
+
+    x = np.array([1, 2])
+    x.dtype = np.bool_
+
+This sort of mutation is not allowed by the types. Users who want to
+write statically typed code should instead use the `numpy.ndarray.view`
+method to create a view of the array with a different dtype.
+
+"""
+from ._array_like import _SupportsArray, ArrayLike
+from ._shape import _Shape, _ShapeLike
+from ._dtype_like import DtypeLike
diff --git a/numpy/typing/_array_like.py b/numpy/typing/_array_like.py
new file mode 100644
index 000000000..76c0c839c
--- /dev/null
+++ b/numpy/typing/_array_like.py
@@ -0,0 +1,34 @@
+import sys
+from typing import Any, overload, Sequence, TYPE_CHECKING, Union
+
+from numpy import ndarray
+from ._dtype_like import DtypeLike
+
+if sys.version_info >= (3, 8):
+    from typing import Protocol
+    HAVE_PROTOCOL = True
+else:
+    try:
+        from typing_extensions import Protocol
+    except ImportError:
+        HAVE_PROTOCOL = False
+    else:
+        HAVE_PROTOCOL = True
+
+if TYPE_CHECKING or HAVE_PROTOCOL:
+    class _SupportsArray(Protocol):
+        @overload
+        def __array__(self, __dtype: DtypeLike = ...) -> ndarray: ...
+        @overload
+        def __array__(self, dtype: DtypeLike = ...) -> ndarray: ...
+else:
+    _SupportsArray = Any
+
+# TODO: support buffer protocols once
+#
+# https://bugs.python.org/issue27501
+#
+# is resolved. See also the mypy issue:
+#
+# https://github.com/python/typing/issues/593
+ArrayLike = Union[bool, int, float, complex, _SupportsArray, Sequence]
diff --git a/numpy/typing.pyi b/numpy/typing/_dtype_like.py
index f5705192a..b9df0af04 100644
--- a/numpy/typing.pyi
+++ b/numpy/typing/_dtype_like.py
@@ -1,17 +1,7 @@
-import sys
-from typing import Any, Dict, List, overload, Sequence, Text, Tuple, Union
+from typing import Any, Dict, List, Sequence, Tuple, Union
 
-from numpy import dtype, ndarray
-
-if sys.version_info >= (3, 8):
-    from typing import Protocol
-else:
-    from typing_extensions import Protocol
-
-_Shape = Tuple[int, ...]
-
-# Anything that can be coerced to a shape tuple
-_ShapeLike = Union[int, Sequence[int]]
+from numpy import dtype
+from ._shape import _ShapeLike
 
 _DtypeLikeNested = Any  # TODO: wait for support for recursive types
 
@@ -45,7 +35,7 @@ DtypeLike = Union[
             Sequence[str],  # names
             Sequence[_DtypeLikeNested],  # formats
             Sequence[int],  # offsets
-            Sequence[Union[bytes, Text, None]],  # titles
+            Sequence[Union[bytes, str, None]],  # titles
             int,  # itemsize
         ],
     ],
@@ -54,11 +44,3 @@ DtypeLike = Union[
     # (base_dtype, new_dtype)
     Tuple[_DtypeLikeNested, _DtypeLikeNested],
 ]
-
-class _SupportsArray(Protocol):
-    @overload
-    def __array__(self, __dtype: DtypeLike = ...) -> ndarray: ...
-    @overload
-    def __array__(self, dtype: DtypeLike = ...) -> ndarray: ...
-
-ArrayLike = Union[bool, int, float, complex, _SupportsArray, Sequence]
diff --git a/numpy/typing/_shape.py b/numpy/typing/_shape.py
new file mode 100644
index 000000000..4629046ea
--- /dev/null
+++ b/numpy/typing/_shape.py
@@ -0,0 +1,6 @@
+from typing import Sequence, Tuple, Union
+
+_Shape = Tuple[int, ...]
+
+# Anything that can be coerced to a shape tuple
+_ShapeLike = Union[int, Sequence[int]]