51 files changed, 375 insertions, 371 deletions
diff --git a/.gitpod.yml b/.gitpod.yml index f9c35fd9b..c46752f10 100644 --- a/.gitpod.yml +++ b/.gitpod.yml @@ -6,18 +6,20 @@ image: numpy/numpy-gitpod:latest tasks: - - name: Prepare development + - name: Prepare development environment init: | mkdir -p .vscode cp tools/gitpod/settings.json .vscode/settings.json + rm -f /workspace/numpy/.git/shallow.lock conda activate numpy-dev + git pull --unshallow # need to force this else the prebuild fails + git fetch --tags python setup.py build_ext --inplace echo "🛠Completed rebuilding NumPy!! 🛠" echo "📖 Building docs 📖 " cd doc make html echo "✨ Pre-build complete! You can close this terminal ✨ " - # -------------------------------------------------------- # exposing ports for liveserve @@ -60,3 +62,4 @@ github: addBadge: false # add a label once the prebuild is ready to pull requests (defaults to false) addLabel: false +
\ No newline at end of file @@ -13,7 +13,7 @@ []( https://numfocus.org) -[]( +[]( https://pypi.org/project/numpy/) []( https://anaconda.org/conda-forge/numpy) diff --git a/doc/changelog/1.12.0-changelog.rst b/doc/changelog/1.12.0-changelog.rst index 2e91f510f..052714374 100644 --- a/doc/changelog/1.12.0-changelog.rst +++ b/doc/changelog/1.12.0-changelog.rst @@ -283,7 +283,7 @@ A total of 418 pull requests were merged for this release. * `#7373 <https://github.com/numpy/numpy/pull/7373>`__: ENH: Add bitwise_and identity * `#7378 <https://github.com/numpy/numpy/pull/7378>`__: added NumPy logo and separator * `#7382 <https://github.com/numpy/numpy/pull/7382>`__: MAINT: cleanup np.average -* `#7385 <https://github.com/numpy/numpy/pull/7385>`__: DOC: note about wheels / windows wheels for pypi +* `#7385 <https://github.com/numpy/numpy/pull/7385>`__: DOC: note about wheels / windows wheels for PyPI * `#7386 <https://github.com/numpy/numpy/pull/7386>`__: Added label icon to Travis status * `#7397 <https://github.com/numpy/numpy/pull/7397>`__: BUG: incorrect type for objects whose __len__ fails * `#7398 <https://github.com/numpy/numpy/pull/7398>`__: DOC: fix typo diff --git a/doc/changelog/1.20.0-changelog.rst b/doc/changelog/1.20.0-changelog.rst index f06bd8a8d..f2af4a7de 100644 --- a/doc/changelog/1.20.0-changelog.rst +++ b/doc/changelog/1.20.0-changelog.rst @@ -714,7 +714,7 @@ A total of 716 pull requests were merged for this release. * `#17440 <https://github.com/numpy/numpy/pull/17440>`__: DOC: Cleaner template for PRs * `#17442 <https://github.com/numpy/numpy/pull/17442>`__: MAINT: fix exception chaining in format.py * `#17443 <https://github.com/numpy/numpy/pull/17443>`__: ENH: Warn on unsupported Python 3.10+ -* `#17444 <https://github.com/numpy/numpy/pull/17444>`__: ENH: Add ``Typing :: Typed`` to the PyPi classifier +* `#17444 <https://github.com/numpy/numpy/pull/17444>`__: ENH: Add ``Typing :: Typed`` to the PyPI classifier * `#17445 <https://github.com/numpy/numpy/pull/17445>`__: DOC: Fix the references for macros * `#17447 <https://github.com/numpy/numpy/pull/17447>`__: NEP: update NEP 42 with discussion of type hinting applications * `#17448 <https://github.com/numpy/numpy/pull/17448>`__: DOC: Remove CoC pages from Sphinx diff --git a/doc/neps/nep-0017-split-out-maskedarray.rst b/doc/neps/nep-0017-split-out-maskedarray.rst index 5cb1c0c39..faad68828 100644 --- a/doc/neps/nep-0017-split-out-maskedarray.rst +++ b/doc/neps/nep-0017-split-out-maskedarray.rst @@ -123,7 +123,7 @@ References and Footnotes .. [1] Subclassing ndarray, https://docs.scipy.org/doc/numpy/user/basics.subclassing.html -.. [2] PyPi: maskedarray, https://pypi.org/project/maskedarray/ +.. [2] PyPI: maskedarray, https://pypi.org/project/maskedarray/ Copyright --------- diff --git a/doc/source/dev/development_workflow.rst b/doc/source/dev/development_workflow.rst index 457bcf34a..38f047ec0 100644 --- a/doc/source/dev/development_workflow.rst +++ b/doc/source/dev/development_workflow.rst @@ -204,10 +204,10 @@ fragments in your commit message:: ``[skip travis]``: skip TravisCI jobs ``[skip azurepipelines]``: skip Azure jobs -*Note: unfortunately not all CI systems implement this feature well, or at all. +*Note*: unfortunately not all CI systems implement this feature well, or at all. CircleCI supports ``ci skip`` but has no command to skip only CircleCI. Azure chooses to still run jobs with skip commands on PRs, the jobs only get -skipped on merging to master.* +skipped on merging to master. .. 
_workflow_mailing_list: diff --git a/doc/source/reference/random/index.rst b/doc/source/reference/random/index.rst index aaabc9b39..674799d47 100644 --- a/doc/source/reference/random/index.rst +++ b/doc/source/reference/random/index.rst @@ -185,7 +185,7 @@ What's New or Different methods which are 2-10 times faster than NumPy's Box-Muller or inverse CDF implementations. * Optional ``dtype`` argument that accepts ``np.float32`` or ``np.float64`` - to produce either single or double prevision uniform random variables for + to produce either single or double precision uniform random variables for select distributions * Optional ``out`` argument that allows existing arrays to be filled for select distributions diff --git a/doc/source/reference/random/new-or-different.rst b/doc/source/reference/random/new-or-different.rst index a81543926..7a206a2ce 100644 --- a/doc/source/reference/random/new-or-different.rst +++ b/doc/source/reference/random/new-or-different.rst @@ -84,7 +84,7 @@ And in more detail: * The bit generators can be used in downstream projects via Cython. * Optional ``dtype`` argument that accepts ``np.float32`` or ``np.float64`` - to produce either single or double prevision uniform random variables for + to produce either single or double precision uniform random variables for select distributions * Uniforms (`~.Generator.random` and `~.Generator.integers`) diff --git a/doc/source/reference/simd/generated_tables/cpu_features.inc b/doc/source/reference/simd/generated_tables/cpu_features.inc index 17d1b4951..7782172d2 100644 --- a/doc/source/reference/simd/generated_tables/cpu_features.inc +++ b/doc/source/reference/simd/generated_tables/cpu_features.inc @@ -36,26 +36,28 @@ On IBM/POWER big-endian .. table:: :align: left - ======== ================ - Name Implies - ======== ================ - ``VSX`` - ``VSX2`` ``VSX`` - ``VSX3`` ``VSX`` ``VSX2`` - ======== ================ + ======== ========================= + Name Implies + ======== ========================= + ``VSX`` + ``VSX2`` ``VSX`` + ``VSX3`` ``VSX`` ``VSX2`` + ``VSX4`` ``VSX`` ``VSX2`` ``VSX3`` + ======== ========================= On IBM/POWER little-endian ~~~~~~~~~~~~~~~~~~~~~~~~~~ .. table:: :align: left - ======== ================ - Name Implies - ======== ================ - ``VSX`` ``VSX2`` - ``VSX2`` ``VSX`` - ``VSX3`` ``VSX`` ``VSX2`` - ======== ================ + ======== ========================= + Name Implies + ======== ========================= + ``VSX`` ``VSX2`` + ``VSX2`` ``VSX`` + ``VSX3`` ``VSX`` ``VSX2`` + ``VSX4`` ``VSX`` ``VSX2`` ``VSX3`` + ======== ========================= On ARMv7/A32 ~~~~~~~~~~~~ diff --git a/doc/source/release/1.10.3-notes.rst b/doc/source/release/1.10.3-notes.rst index 0d4df4ce6..9172f7663 100644 --- a/doc/source/release/1.10.3-notes.rst +++ b/doc/source/release/1.10.3-notes.rst @@ -2,4 +2,4 @@ NumPy 1.10.3 Release Notes ========================== -N/A this release did not happen due to various screwups involving PyPi. +N/A this release did not happen due to various screwups involving PyPI. diff --git a/doc/source/release/1.11.1-notes.rst b/doc/source/release/1.11.1-notes.rst index 6303c32f0..a196502cf 100644 --- a/doc/source/release/1.11.1-notes.rst +++ b/doc/source/release/1.11.1-notes.rst @@ -4,7 +4,7 @@ NumPy 1.11.1 Release Notes Numpy 1.11.1 supports Python 2.6 - 2.7 and 3.2 - 3.5. It fixes bugs and regressions found in Numpy 1.11.0 and includes several build related -improvements. Wheels for Linux, Windows, and OSX can be found on pypi. +improvements. 
Wheels for Linux, Windows, and OSX can be found on PyPI. Fixes Merged ============ diff --git a/doc/source/release/1.12.1-notes.rst b/doc/source/release/1.12.1-notes.rst index f67dab108..09a2e6738 100644 --- a/doc/source/release/1.12.1-notes.rst +++ b/doc/source/release/1.12.1-notes.rst @@ -4,7 +4,7 @@ NumPy 1.12.1 Release Notes NumPy 1.12.1 supports Python 2.7 and 3.4 - 3.6 and fixes bugs and regressions found in NumPy 1.12.0. In particular, the regression in f2py constant parsing -is fixed. Wheels for Linux, Windows, and OSX can be found on pypi, +is fixed. Wheels for Linux, Windows, and OSX can be found on PyPI, Bugs Fixed ========== diff --git a/doc/source/user/absolute_beginners.rst b/doc/source/user/absolute_beginners.rst index 2c6882905..cf11c6745 100644 --- a/doc/source/user/absolute_beginners.rst +++ b/doc/source/user/absolute_beginners.rst @@ -1588,7 +1588,7 @@ If you created this array "a" :: .. for doctests The continuous integration truncates dataframe display without this setting. - >>> pd.set_option('max_columns', 10) + >>> pd.set_option('display.max_columns', 10) You could create a Pandas dataframe :: diff --git a/environment.yml b/environment.yml index 024bee2c7..214a75352 100644 --- a/environment.yml +++ b/environment.yml @@ -7,7 +7,7 @@ name: numpy-dev channels: - conda-forge dependencies: - - python + - python=3.9 #need to pin to avoid issues with builds - cython - compilers - openblas @@ -21,13 +21,14 @@ dependencies: # For type annotations - mypy=0.931 # For building docs - - sphinx=4.1.1 + - sphinx=4.2.0 + - sphinx-panels - numpydoc=1.1.0 - ipython - scipy - pandas - matplotlib - - pydata-sphinx-theme + - pydata-sphinx-theme=0.7.2 - breathe # For linting - pycodestyle=2.7.0 diff --git a/numpy/__init__.pyi b/numpy/__init__.pyi index 451a29a02..92f98a801 100644 --- a/numpy/__init__.pyi +++ b/numpy/__init__.pyi @@ -3673,6 +3673,8 @@ class memmap(ndarray[_ShapeType, _DType_co]): ) -> Any: ... def flush(self) -> None: ... +# TODO: Add a mypy plugin for managing functions whose output type is dependant +# on the literal value of some sort of signature (e.g. `einsum` and `vectorize`) class vectorize: pyfunc: Callable[..., Any] cache: bool @@ -3689,7 +3691,7 @@ class vectorize: cache: bool = ..., signature: None | str = ..., ) -> None: ... - def __call__(self, *args: Any, **kwargs: Any) -> NDArray[Any]: ... + def __call__(self, *args: Any, **kwargs: Any) -> Any: ... class poly1d: @property diff --git a/numpy/core/_add_newdocs.py b/numpy/core/_add_newdocs.py index 7081f9a59..1bbacad45 100644 --- a/numpy/core/_add_newdocs.py +++ b/numpy/core/_add_newdocs.py @@ -2943,7 +2943,7 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('any', add_newdoc('numpy.core.multiarray', 'ndarray', ('argmax', """ - a.argmax(axis=None, out=None) + a.argmax(axis=None, out=None, *, keepdims=False) Return indices of the maximum values along the given axis. @@ -2958,7 +2958,7 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('argmax', add_newdoc('numpy.core.multiarray', 'ndarray', ('argmin', """ - a.argmin(axis=None, out=None) + a.argmin(axis=None, out=None, *, keepdims=False) Return indices of the minimum values along the given axis. 
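Editorial aside (not part of the diff): the two docstring hunks above bring ndarray.argmax / ndarray.argmin in line with their actual signatures, which accept a keyword-only keepdims flag in recent NumPy (1.22+). A minimal sketch of what that flag does, reusing the small array from the np.mean example elsewhere in this diff; the printed values are what that assumption implies, not output taken from the patch:

    import numpy as np

    a = np.array([[5, 9, 13],
                  [14, 10, 12],
                  [11, 15, 19]])

    # Default: the reduced axis is dropped from the result shape.
    print(a.argmax(axis=1))                    # [2 0 2]           shape (3,)

    # keepdims=True keeps the reduced axis with length 1, so the index
    # array broadcasts directly against the original array.
    idx = a.argmax(axis=1, keepdims=True)      # shape (3, 1)
    print(np.take_along_axis(a, idx, axis=1))  # [[13] [14] [19]]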
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py index 054150b28..b11504c03 100644 --- a/numpy/core/code_generators/generate_umath.py +++ b/numpy/core/code_generators/generate_umath.py @@ -844,7 +844,7 @@ defdict = { docstrings.get('numpy.core.umath.trunc'), None, TD('e', f='trunc', astype={'e': 'f'}), - TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]), + TD(inexactvec, dispatch=[('loops_unary_fp', 'fd')]), TD('fdg', f='trunc'), TD(O, f='npy_ObjectTrunc'), ), @@ -860,7 +860,7 @@ defdict = { docstrings.get('numpy.core.umath.floor'), None, TD('e', f='floor', astype={'e': 'f'}), - TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]), + TD(inexactvec, dispatch=[('loops_unary_fp', 'fd')]), TD('fdg', f='floor'), TD(O, f='npy_ObjectFloor'), ), @@ -869,7 +869,7 @@ defdict = { docstrings.get('numpy.core.umath.rint'), None, TD('e', f='rint', astype={'e': 'f'}), - TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]), + TD(inexactvec, dispatch=[('loops_unary_fp', 'fd')]), TD('fdg' + cmplx, f='rint'), TD(P, f='rint'), ), diff --git a/numpy/core/fromnumeric.py b/numpy/core/fromnumeric.py index f26f306fa..0ec4f7687 100644 --- a/numpy/core/fromnumeric.py +++ b/numpy/core/fromnumeric.py @@ -3408,6 +3408,7 @@ def mean(a, axis=None, dtype=None, out=None, keepdims=np._NoValue, *, 0.55000000074505806 # may vary Specifying a where argument: + >>> a = np.array([[5, 9, 13], [14, 10, 12], [11, 15, 19]]) >>> np.mean(a) 12.0 diff --git a/numpy/core/include/numpy/ndarraytypes.h b/numpy/core/include/numpy/ndarraytypes.h index 47d063178..35d82ec03 100644 --- a/numpy/core/include/numpy/ndarraytypes.h +++ b/numpy/core/include/numpy/ndarraytypes.h @@ -87,7 +87,7 @@ enum NPY_TYPES { NPY_BOOL=0, /* The number of types not including the new 1.6 types */ NPY_NTYPES_ABI_COMPATIBLE=21 }; -#ifdef _MSC_VER +#if defined(_MSC_VER) && !defined(__clang__) #pragma deprecated(NPY_CHAR) #endif diff --git a/numpy/core/include/numpy/npy_common.h b/numpy/core/include/numpy/npy_common.h index 88794ca07..2bcc45e4f 100644 --- a/numpy/core/include/numpy/npy_common.h +++ b/numpy/core/include/numpy/npy_common.h @@ -131,9 +131,10 @@ #endif #endif -#if defined(_MSC_VER) - #define NPY_INLINE __inline -#elif defined(__GNUC__) +#if defined(_MSC_VER) && !defined(__clang__) + #define NPY_INLINE __inline +/* clang included here to handle clang-cl on Windows */ +#elif defined(__GNUC__) || defined(__clang__) #if defined(__STRICT_ANSI__) #define NPY_INLINE __inline__ #else @@ -180,12 +181,6 @@ defined(__MINGW32__) || defined(__MINGW64__) #include <io.h> -/* mingw based on 3.4.5 has lseek but not ftell/fseek */ -#if defined(__MINGW32__) || defined(__MINGW64__) -extern int __cdecl _fseeki64(FILE *, long long, int); -extern long long __cdecl _ftelli64(FILE *); -#endif - #define npy_fseek _fseeki64 #define npy_ftell _ftelli64 #define npy_lseek _lseeki64 diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src index 84de9a059..fabec069c 100644 --- a/numpy/core/src/_simd/_simd.dispatch.c.src +++ b/numpy/core/src/_simd/_simd.dispatch.c.src @@ -381,7 +381,7 @@ SIMD_IMPL_INTRIN_1(sumup_@sfx@, @esfx@, v@sfx@) ***************************/ #if @fp_only@ /**begin repeat1 - * #intrin = sqrt, recip, abs, square, ceil, trunc# + * #intrin = sqrt, recip, abs, square, rint, ceil, trunc, floor# */ SIMD_IMPL_INTRIN_1(@intrin@_@sfx@, v@sfx@, v@sfx@) /**end repeat1**/ @@ -615,7 +615,7 @@ SIMD_INTRIN_DEF(sumup_@sfx@) ***************************/ #if @fp_only@ /**begin 
repeat1 - * #intrin = sqrt, recip, abs, square, ceil, trunc# + * #intrin = sqrt, recip, abs, square, rint, ceil, trunc, floor# */ SIMD_INTRIN_DEF(@intrin@_@sfx@) /**end repeat1**/ diff --git a/numpy/core/src/common/npy_cpu_features.c.src b/numpy/core/src/common/npy_cpu_features.c.src index 1385220f9..ff4f9f60a 100644 --- a/numpy/core/src/common/npy_cpu_features.c.src +++ b/numpy/core/src/common/npy_cpu_features.c.src @@ -61,7 +61,7 @@ npy_cpu_features_dict(void) * AVX512VPOPCNTDQ, AVX512VL, AVX512BW, AVX512DQ, AVX512VNNI, * AVX512IFMA, AVX512VBMI, AVX512VBMI2, AVX512BITALG, * AVX512_KNL, AVX512_KNM, AVX512_SKX, AVX512_CLX, AVX512_CNL, AVX512_ICL, - * VSX, VSX2, VSX3, + * VSX, VSX2, VSX3, VSX4, * VX, VXE, VXE2, * NEON, NEON_FP16, NEON_VFPV4, ASIMD, FPHP, ASIMDHP, ASIMDDP, ASIMDFHM# */ @@ -474,9 +474,15 @@ npy__cpu_init_features(void) #ifndef AT_HWCAP2 #define AT_HWCAP2 26 #endif + #ifndef PPC_FEATURE2_ARCH_2_07 + #define PPC_FEATURE2_ARCH_2_07 0x80000000 + #endif #ifndef PPC_FEATURE2_ARCH_3_00 #define PPC_FEATURE2_ARCH_3_00 0x00800000 #endif + #ifndef PPC_FEATURE2_ARCH_3_1 + #define PPC_FEATURE2_ARCH_3_1 0x00040000 + #endif #endif static void @@ -489,15 +495,18 @@ npy__cpu_init_features(void) return; hwcap = getauxval(AT_HWCAP2); - if (hwcap & PPC_FEATURE2_ARCH_3_00) + if (hwcap & PPC_FEATURE2_ARCH_3_1) { npy__cpu_have[NPY_CPU_FEATURE_VSX] = npy__cpu_have[NPY_CPU_FEATURE_VSX2] = - npy__cpu_have[NPY_CPU_FEATURE_VSX3] = 1; + npy__cpu_have[NPY_CPU_FEATURE_VSX3] = + npy__cpu_have[NPY_CPU_FEATURE_VSX4] = 1; return; } - npy__cpu_have[NPY_CPU_FEATURE_VSX2] = (hwcap & PPC_FEATURE2_ARCH_2_07) != 0; npy__cpu_have[NPY_CPU_FEATURE_VSX] = 1; + npy__cpu_have[NPY_CPU_FEATURE_VSX2] = (hwcap & PPC_FEATURE2_ARCH_2_07) != 0; + npy__cpu_have[NPY_CPU_FEATURE_VSX3] = (hwcap & PPC_FEATURE2_ARCH_3_00) != 0; + npy__cpu_have[NPY_CPU_FEATURE_VSX4] = (hwcap & PPC_FEATURE2_ARCH_3_1) != 0; // TODO: AIX, FreeBSD #else npy__cpu_have[NPY_CPU_FEATURE_VSX] = 1; @@ -507,6 +516,9 @@ npy__cpu_init_features(void) #ifdef NPY_HAVE_VSX3 npy__cpu_have[NPY_CPU_FEATURE_VSX3] = 1; #endif + #ifdef NPY_HAVE_VSX4 + npy__cpu_have[NPY_CPU_FEATURE_VSX4] = 1; + #endif #endif } diff --git a/numpy/core/src/common/npy_cpu_features.h b/numpy/core/src/common/npy_cpu_features.h index 1f52a445d..3d5f2e75c 100644 --- a/numpy/core/src/common/npy_cpu_features.h +++ b/numpy/core/src/common/npy_cpu_features.h @@ -65,6 +65,8 @@ enum npy_cpu_features NPY_CPU_FEATURE_VSX2 = 201, // POWER9 NPY_CPU_FEATURE_VSX3 = 202, + // POWER10 + NPY_CPU_FEATURE_VSX4 = 203, // ARM NPY_CPU_FEATURE_NEON = 300, @@ -167,8 +169,8 @@ npy_cpu_baseline_list(void); * On x64: ['SSSE3', 'SSE41', 'POPCNT', 'SSE42', 'AVX', 'F16C', 'FMA3', 'AVX2', 'AVX512F', ...] 
* On armhf: ['NEON', 'NEON_FP16', 'NEON_VPFV4', 'ASIMD', 'ASIMDHP', 'ASIMDDP', 'ASIMDFHM'] * On aarch64: ['ASIMDHP', 'ASIMDDP', 'ASIMDFHM'] - * On ppc64: ['VSX', 'VSX2', 'VSX3'] - * On ppc64le: ['VSX3'] + * On ppc64: ['VSX', 'VSX2', 'VSX3', 'VSX4'] + * On ppc64le: ['VSX3', 'VSX4'] * On s390x: ['VX', 'VXE', VXE2] * On any other arch or if the optimization is disabled: [] */ diff --git a/numpy/core/src/common/simd/avx2/math.h b/numpy/core/src/common/simd/avx2/math.h index ec15e50e1..deaf4ad11 100644 --- a/numpy/core/src/common/simd/avx2/math.h +++ b/numpy/core/src/common/simd/avx2/math.h @@ -42,7 +42,7 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a) #define npyv_max_f64 _mm256_max_pd // Maximum, supports IEEE floating-point arithmetic (IEC 60559), // - If one of the two vectors contains NaN, the equivalent element of the other vector is set -// - Only if both corresponded elements are NaN, NaN is set. +// - Only if both corresponded elements are NaN, NaN is set. NPY_FINLINE npyv_f32 npyv_maxp_f32(npyv_f32 a, npyv_f32 b) { __m256 nn = _mm256_cmp_ps(b, b, _CMP_ORD_Q); @@ -76,7 +76,7 @@ NPY_FINLINE npyv_s64 npyv_max_s64(npyv_s64 a, npyv_s64 b) #define npyv_min_f64 _mm256_min_pd // Minimum, supports IEEE floating-point arithmetic (IEC 60559), // - If one of the two vectors contains NaN, the equivalent element of the other vector is set -// - Only if both corresponded elements are NaN, NaN is set. +// - Only if both corresponded elements are NaN, NaN is set. NPY_FINLINE npyv_f32 npyv_minp_f32(npyv_f32 a, npyv_f32 b) { __m256 nn = _mm256_cmp_ps(b, b, _CMP_ORD_Q); @@ -105,6 +105,10 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b) return _mm256_blendv_epi8(a, b, _mm256_cmpgt_epi64(a, b)); } +// round to nearest intger even +#define npyv_rint_f32(A) _mm256_round_ps(A, _MM_FROUND_TO_NEAREST_INT) +#define npyv_rint_f64(A) _mm256_round_pd(A, _MM_FROUND_TO_NEAREST_INT) + // ceil #define npyv_ceil_f32 _mm256_ceil_ps #define npyv_ceil_f64 _mm256_ceil_pd @@ -113,4 +117,8 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b) #define npyv_trunc_f32(A) _mm256_round_ps(A, _MM_FROUND_TO_ZERO) #define npyv_trunc_f64(A) _mm256_round_pd(A, _MM_FROUND_TO_ZERO) +// floor +#define npyv_floor_f32 _mm256_floor_ps +#define npyv_floor_f64 _mm256_floor_pd + #endif // _NPY_SIMD_AVX2_MATH_H diff --git a/numpy/core/src/common/simd/avx512/math.h b/numpy/core/src/common/simd/avx512/math.h index f30e50ad0..5a6cb6dcd 100644 --- a/numpy/core/src/common/simd/avx512/math.h +++ b/numpy/core/src/common/simd/avx512/math.h @@ -51,7 +51,7 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a) #define npyv_max_f64 _mm512_max_pd // Maximum, supports IEEE floating-point arithmetic (IEC 60559), // - If one of the two vectors contains NaN, the equivalent element of the other vector is set -// - Only if both corresponded elements are NaN, NaN is set. +// - Only if both corresponded elements are NaN, NaN is set. NPY_FINLINE npyv_f32 npyv_maxp_f32(npyv_f32 a, npyv_f32 b) { __mmask16 nn = _mm512_cmp_ps_mask(b, b, _CMP_ORD_Q); @@ -84,7 +84,7 @@ NPY_FINLINE npyv_f64 npyv_maxp_f64(npyv_f64 a, npyv_f64 b) #define npyv_min_f64 _mm512_min_pd // Minimum, supports IEEE floating-point arithmetic (IEC 60559), // - If one of the two vectors contains NaN, the equivalent element of the other vector is set -// - Only if both corresponded elements are NaN, NaN is set. +// - Only if both corresponded elements are NaN, NaN is set. 
NPY_FINLINE npyv_f32 npyv_minp_f32(npyv_f32 a, npyv_f32 b) { __mmask16 nn = _mm512_cmp_ps_mask(b, b, _CMP_ORD_Q); @@ -112,6 +112,10 @@ NPY_FINLINE npyv_f64 npyv_minp_f64(npyv_f64 a, npyv_f64 b) #define npyv_min_u64 _mm512_min_epu64 #define npyv_min_s64 _mm512_min_epi64 +// round to nearest integer even +#define npyv_rint_f32(A) _mm512_roundscale_ps(A, _MM_FROUND_TO_NEAREST_INT) +#define npyv_rint_f64(A) _mm512_roundscale_pd(A, _MM_FROUND_TO_NEAREST_INT) + // ceil #define npyv_ceil_f32(A) _mm512_roundscale_ps(A, _MM_FROUND_TO_POS_INF) #define npyv_ceil_f64(A) _mm512_roundscale_pd(A, _MM_FROUND_TO_POS_INF) @@ -120,4 +124,8 @@ NPY_FINLINE npyv_f64 npyv_minp_f64(npyv_f64 a, npyv_f64 b) #define npyv_trunc_f32(A) _mm512_roundscale_ps(A, _MM_FROUND_TO_ZERO) #define npyv_trunc_f64(A) _mm512_roundscale_pd(A, _MM_FROUND_TO_ZERO) +// floor +#define npyv_floor_f32(A) _mm512_roundscale_ps(A, _MM_FROUND_TO_NEG_INF) +#define npyv_floor_f64(A) _mm512_roundscale_pd(A, _MM_FROUND_TO_NEG_INF) + #endif // _NPY_SIMD_AVX512_MATH_H diff --git a/numpy/core/src/common/simd/intdiv.h b/numpy/core/src/common/simd/intdiv.h index a7a461721..42f022c55 100644 --- a/numpy/core/src/common/simd/intdiv.h +++ b/numpy/core/src/common/simd/intdiv.h @@ -136,7 +136,7 @@ NPY_FINLINE npy_uint64 npyv__divh128_u64(npy_uint64 high, npy_uint64 divisor) { assert(divisor > 1); npy_uint64 quotient; -#if defined(_M_X64) && defined(_MSC_VER) && _MSC_VER >= 1920 +#if defined(_M_X64) && defined(_MSC_VER) && _MSC_VER >= 1920 && !defined(__clang__) npy_uint64 remainder; quotient = _udiv128(high, 0, divisor, &remainder); (void)remainder; diff --git a/numpy/core/src/common/simd/neon/math.h b/numpy/core/src/common/simd/neon/math.h index 19e5cd846..4607d6f27 100644 --- a/numpy/core/src/common/simd/neon/math.h +++ b/numpy/core/src/common/simd/neon/math.h @@ -153,6 +153,33 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b) return vbslq_s64(npyv_cmplt_s64(a, b), a, b); } +// round to nearest integer even +NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a) +{ +#ifdef NPY_HAVE_ASIMD + return vrndnq_f32(a); +#else + // ARMv7 NEON only supports fp to int truncate conversion. + // a magic trick of adding 1.5 * 2**23 is used for rounding + // to nearest even and then substract this magic number to get + // the integer. 
+ const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f)); + const npyv_f32 magic = vdupq_n_f32(12582912.0f); // 1.5 * 2**23 + npyv_f32 round = vsubq_f32(vaddq_f32(a, magic), magic); + npyv_b32 overflow = vcleq_f32(vabsq_f32(a), vreinterpretq_f32_u32(vdupq_n_u32(0x4b000000))); + round = vbslq_f32(overflow, round, a); + // signed zero + round = vreinterpretq_f32_s32(vorrq_s32( + vreinterpretq_s32_f32(round), + vandq_s32(vreinterpretq_s32_f32(a), szero) + )); + return round; +#endif +} +#if NPY_SIMD_F64 + #define npyv_rint_f64 vrndnq_f64 +#endif // NPY_SIMD_F64 + // ceil #ifdef NPY_HAVE_ASIMD #define npyv_ceil_f32 vrndpq_f32 @@ -223,4 +250,36 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b) #define npyv_trunc_f64 vrndq_f64 #endif // NPY_SIMD_F64 +// floor +#ifdef NPY_HAVE_ASIMD + #define npyv_floor_f32 vrndmq_f32 +#else + NPY_FINLINE npyv_f32 npyv_floor_f32(npyv_f32 a) + { + const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f)); + const npyv_u32 one = vreinterpretq_u32_f32(vdupq_n_f32(1.0f)); + const npyv_s32 max_int = vdupq_n_s32(0x7fffffff); + + npyv_s32 roundi = vcvtq_s32_f32(a); + npyv_f32 round = vcvtq_f32_s32(roundi); + npyv_f32 floor = vsubq_f32(round, vreinterpretq_f32_u32( + vandq_u32(vcgtq_f32(round, a), one) + )); + // respect signed zero + npyv_f32 rzero = vreinterpretq_f32_s32(vorrq_s32( + vreinterpretq_s32_f32(floor), + vandq_s32(vreinterpretq_s32_f32(a), szero) + )); + npyv_u32 nnan = npyv_notnan_f32(a); + npyv_u32 overflow = vorrq_u32( + vceqq_s32(roundi, szero), vceqq_s32(roundi, max_int) + ); + + return vbslq_f32(vbicq_u32(nnan, overflow), rzero, a); + } +#endif // NPY_HAVE_ASIMD +#if NPY_SIMD_F64 + #define npyv_floor_f64 vrndmq_f64 +#endif // NPY_SIMD_F64 + #endif // _NPY_SIMD_NEON_MATH_H diff --git a/numpy/core/src/common/simd/sse/math.h b/numpy/core/src/common/simd/sse/math.h index 5daf7711e..e4b77b671 100644 --- a/numpy/core/src/common/simd/sse/math.h +++ b/numpy/core/src/common/simd/sse/math.h @@ -42,7 +42,7 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a) #define npyv_max_f64 _mm_max_pd // Maximum, supports IEEE floating-point arithmetic (IEC 60559), // - If one of the two vectors contains NaN, the equivalent element of the other vector is set -// - Only if both corresponded elements are NaN, NaN is set. +// - Only if both corresponded elements are NaN, NaN is set. NPY_FINLINE npyv_f32 npyv_maxp_f32(npyv_f32 a, npyv_f32 b) { __m128 nn = _mm_cmpord_ps(b, b); @@ -95,7 +95,7 @@ NPY_FINLINE npyv_s64 npyv_max_s64(npyv_s64 a, npyv_s64 b) #define npyv_min_f64 _mm_min_pd // Minimum, supports IEEE floating-point arithmetic (IEC 60559), // - If one of the two vectors contains NaN, the equivalent element of the other vector is set -// - Only if both corresponded elements are NaN, NaN is set. +// - Only if both corresponded elements are NaN, NaN is set. 
NPY_FINLINE npyv_f32 npyv_minp_f32(npyv_f32 a, npyv_f32 b) { __m128 nn = _mm_cmpord_ps(b, b); @@ -143,6 +143,38 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b) return npyv_select_s64(npyv_cmplt_s64(a, b), a, b); } +// round to nearest integer even +NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a) +{ +#ifdef NPY_HAVE_SSE41 + return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); +#else + const npyv_f32 szero = _mm_set1_ps(-0.0f); + __m128i roundi = _mm_cvtps_epi32(a); + __m128i overflow = _mm_cmpeq_epi32(roundi, _mm_castps_si128(szero)); + __m128 r = _mm_cvtepi32_ps(roundi); + // respect sign of zero + r = _mm_or_ps(r, _mm_and_ps(a, szero)); + return npyv_select_f32(overflow, a, r); +#endif +} + +// round to nearest integer even +NPY_FINLINE npyv_f64 npyv_rint_f64(npyv_f64 a) +{ +#ifdef NPY_HAVE_SSE41 + return _mm_round_pd(a, _MM_FROUND_TO_NEAREST_INT); +#else + const npyv_f64 szero = _mm_set1_pd(-0.0); + const npyv_f64 two_power_52 = _mm_set1_pd(0x10000000000000); + npyv_f64 sign_two52 = _mm_or_pd(two_power_52, _mm_and_pd(a, szero)); + // round by add magic number 2^52 + npyv_f64 round = _mm_sub_pd(_mm_add_pd(a, sign_two52), sign_two52); + // respect signed zero, e.g. -0.5 -> -0.0 + return _mm_or_pd(round, _mm_and_pd(a, szero)); +#endif +} + // ceil #ifdef NPY_HAVE_SSE41 #define npyv_ceil_f32 _mm_ceil_ps @@ -202,4 +234,23 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b) } #endif +// floor +#ifdef NPY_HAVE_SSE41 + #define npyv_floor_f32 _mm_floor_ps + #define npyv_floor_f64 _mm_floor_pd +#else + NPY_FINLINE npyv_f32 npyv_floor_f32(npyv_f32 a) + { + const npyv_f32 one = _mm_set1_ps(1.0f); + npyv_f32 round = npyv_rint_f32(a); + return _mm_sub_ps(round, _mm_and_ps(_mm_cmpgt_ps(round, a), one)); + } + NPY_FINLINE npyv_f64 npyv_floor_f64(npyv_f64 a) + { + const npyv_f64 one = _mm_set1_pd(1.0); + npyv_f64 round = npyv_rint_f64(a); + return _mm_sub_pd(round, _mm_and_pd(_mm_cmpgt_pd(round, a), one)); + } +#endif // NPY_HAVE_SSE41 + #endif // _NPY_SIMD_SSE_MATH_H diff --git a/numpy/core/src/common/simd/vsx/math.h b/numpy/core/src/common/simd/vsx/math.h index d138cae8a..444bc9e54 100644 --- a/numpy/core/src/common/simd/vsx/math.h +++ b/numpy/core/src/common/simd/vsx/math.h @@ -38,7 +38,7 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a) #define npyv_max_f64 vec_max // Maximum, supports IEEE floating-point arithmetic (IEC 60559), // - If one of the two vectors contains NaN, the equivalent element of the other vector is set -// - Only if both corresponded elements are NaN, NaN is set. +// - Only if both corresponded elements are NaN, NaN is set. #define npyv_maxp_f32 vec_max #define npyv_maxp_f64 vec_max // Maximum, integer operations @@ -56,7 +56,7 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a) #define npyv_min_f64 vec_min // Minimum, supports IEEE floating-point arithmetic (IEC 60559), // - If one of the two vectors contains NaN, the equivalent element of the other vector is set -// - Only if both corresponded elements are NaN, NaN is set. +// - Only if both corresponded elements are NaN, NaN is set. 
#define npyv_minp_f32 vec_min #define npyv_minp_f64 vec_min // Minimum, integer operations @@ -69,6 +69,10 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a) #define npyv_min_u64 vec_min #define npyv_min_s64 vec_min +// round to nearest int even +#define npyv_rint_f32 vec_rint +#define npyv_rint_f64 vec_rint + // ceil #define npyv_ceil_f32 vec_ceil #define npyv_ceil_f64 vec_ceil @@ -77,4 +81,8 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a) #define npyv_trunc_f32 vec_trunc #define npyv_trunc_f64 vec_trunc +// floor +#define npyv_floor_f32 vec_floor +#define npyv_floor_f64 vec_floor + #endif // _NPY_SIMD_VSX_MATH_H diff --git a/numpy/core/src/multiarray/common_dtype.c b/numpy/core/src/multiarray/common_dtype.c index ca80b1ed7..3561a905a 100644 --- a/numpy/core/src/multiarray/common_dtype.c +++ b/numpy/core/src/multiarray/common_dtype.c @@ -41,7 +41,7 @@ * @param dtype2 Second DType class. * @return The common DType or NULL with an error set */ -NPY_NO_EXPORT NPY_INLINE PyArray_DTypeMeta * +NPY_NO_EXPORT PyArray_DTypeMeta * PyArray_CommonDType(PyArray_DTypeMeta *dtype1, PyArray_DTypeMeta *dtype2) { if (dtype1 == dtype2) { diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 5f054d0a9..7f084ac39 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -1506,62 +1506,6 @@ TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const * */ /**begin repeat - * #func = rint, floor, trunc# - * #scalarf = npy_rint, npy_floor, npy_trunc# - */ - -/**begin repeat1 -* #TYPE = FLOAT, DOUBLE# -* #type = npy_float, npy_double# -* #typesub = f, # -*/ - -NPY_NO_EXPORT NPY_GCC_OPT_3 void -@TYPE@_@func@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) -{ - UNARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - *(@type@ *)op1 = @scalarf@@typesub@(in1); - } -} - - -/**end repeat1**/ -/**end repeat**/ - -/**begin repeat - * #isa = avx512f, fma# - * #ISA = AVX512F, FMA# - * #CHK = HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS# - */ - -/**begin repeat1 - * #TYPE = FLOAT, DOUBLE# - * #type = npy_float, npy_double# - * #typesub = f, # - */ - -/**begin repeat2 - * #func = rint, floor, trunc# - * #scalarf = npy_rint, npy_floor, npy_trunc# - */ - -NPY_NO_EXPORT NPY_GCC_OPT_3 void -@TYPE@_@func@_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) -{ - if (!run_unary_@isa@_@func@_@TYPE@(args, dimensions, steps)) { - UNARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - *(@type@ *)op1 = @scalarf@@typesub@(in1); - } - } -} - -/**end repeat2**/ -/**end repeat1**/ -/**end repeat**/ - -/**begin repeat * Float types * #type = npy_float, npy_double, npy_longdouble# * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src index 3eafbdf66..e5235b464 100644 --- a/numpy/core/src/umath/loops.h.src +++ b/numpy/core/src/umath/loops.h.src @@ -186,7 +186,7 @@ NPY_NO_EXPORT void * #TYPE = FLOAT, DOUBLE# */ /**begin repeat1 - * #kind = ceil, sqrt, absolute, square, reciprocal# + * #kind = rint, floor, trunc, ceil, sqrt, absolute, square, reciprocal# */ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) @@ -274,26 +274,6 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, ( /**end repeat**/ /**begin repeat - * #func = rint, floor, trunc# - */ - -/**begin repeat1 -* #TYPE 
= FLOAT, DOUBLE# -*/ - -NPY_NO_EXPORT NPY_GCC_OPT_3 void -@TYPE@_@func@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); - -/**begin repeat2 - * #isa = avx512f, fma# - */ -NPY_NO_EXPORT NPY_GCC_OPT_3 void -@TYPE@_@func@_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); -/**end repeat2**/ -/**end repeat1**/ -/**end repeat**/ - -/**begin repeat * Float types * #TYPE = HALF, FLOAT, DOUBLE, LONGDOUBLE# * #c = f, f, , l# diff --git a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src index 93761b98c..78e231965 100644 --- a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src +++ b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src @@ -70,6 +70,15 @@ NPY_FINLINE double c_square_f64(double a) #define c_ceil_f32 npy_ceilf #define c_ceil_f64 npy_ceil +#define c_trunc_f32 npy_truncf +#define c_trunc_f64 npy_trunc + +#define c_floor_f32 npy_floorf +#define c_floor_f64 npy_floor + +#define c_rint_f32 npy_rintf +#define c_rint_f64 npy_rint + /******************************************************************************** ** Defining the SIMD kernels ********************************************************************************/ @@ -119,6 +128,9 @@ NPY_FINLINE double c_square_f64(double a) #if __clang_major__ < 10 // Clang before v10 #define WORKAROUND_CLANG_RECIPROCAL_BUG 1 + #elif defined(_MSC_VER) + // clang-cl has the same bug + #define WORKAROUND_CLANG_RECIPROCAL_BUG 1 #elif defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64) // Clang v10+, targeting i386 or x86_64 #define WORKAROUND_CLANG_RECIPROCAL_BUG 0 @@ -139,10 +151,10 @@ NPY_FINLINE double c_square_f64(double a) */ #if @VCHK@ /**begin repeat1 - * #kind = ceil, sqrt, absolute, square, reciprocal# - * #intr = ceil, sqrt, abs, square, recip# - * #repl_0w1 = 0, 0, 0, 0, 1# - * #RECIP_WORKAROUND = 0, 0, 0, 0, WORKAROUND_CLANG_RECIPROCAL_BUG# + * #kind = rint, floor, ceil, trunc, sqrt, absolute, square, reciprocal# + * #intr = rint, floor, ceil, trunc, sqrt, abs, square, recip# + * #repl_0w1 = 0*7, 1# + * #RECIP_WORKAROUND = 0*7, WORKAROUND_CLANG_RECIPROCAL_BUG# */ /**begin repeat2 * #STYPE = CONTIG, NCONTIG, CONTIG, NCONTIG# @@ -250,9 +262,9 @@ static void simd_@TYPE@_@kind@_@STYPE@_@DTYPE@ * #VCHK = NPY_SIMD, NPY_SIMD_F64# */ /**begin repeat1 - * #kind = ceil, sqrt, absolute, square, reciprocal# - * #intr = ceil, sqrt, abs, square, recip# - * #clear = 0, 0, 1, 0, 0# + * #kind = rint, floor, ceil, trunc, sqrt, absolute, square, reciprocal# + * #intr = rint, floor, ceil, trunc, sqrt, abs, square, recip# + * #clear = 0, 0, 0, 0, 0, 1, 0, 0# */ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index 8b833ee56..b477027b3 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -123,47 +123,6 @@ run_@func@_avx512_skx_@TYPE@(char **args, npy_intp const *dimensions, npy_intp c /**end repeat**/ /**begin repeat - * #ISA = FMA, AVX512F# - * #isa = fma, avx512f# - * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS# - * #REGISTER_SIZE = 32, 64# - */ - -/* prototypes */ - -/**begin repeat1 - * #type = npy_float, npy_double# - * #TYPE = FLOAT, DOUBLE# - */ - -/**begin repeat2 - * #func = rint, floor, trunc# - */ - -#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS 
-static NPY_INLINE NPY_GCC_TARGET_@ISA@ void -@ISA@_@func@_@TYPE@(@type@ *, @type@ *, const npy_intp n, const npy_intp stride); -#endif - -static NPY_INLINE int -run_unary_@isa@_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps) -{ -#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS - if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(@type@), sizeof(@type@), @REGISTER_SIZE@)) { - @ISA@_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0], steps[0]); - return 1; - } - else - return 0; -#endif - return 0; -} - -/**end repeat2**/ -/**end repeat1**/ -/**end repeat**/ - -/**begin repeat * Float types * #type = npy_float, npy_double, npy_longdouble# * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# @@ -1119,144 +1078,6 @@ AVX512_SKX_@func@_@TYPE@(npy_bool* op, @type@* ip, const npy_intp array_size, co /**end repeat**/ /**begin repeat - * #ISA = FMA, AVX512F# - * #isa = fma, avx512# - * #vsize = 256, 512# - * #BYTES = 32, 64# - * #cvtps_epi32 = _mm256_cvtps_epi32, # - * #mask = __m256, __mmask16# - * #vsub = , _mask# - * #vtype = __m256, __m512# - * #cvtps_epi32 = _mm256_cvtps_epi32, # - * #masked_store = _mm256_maskstore_ps, _mm512_mask_storeu_ps# - * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS# - */ - -/**begin repeat1 - * #func = rint, floor, trunc# - * #vectorf = rint, floor, trunc# - */ - -#if defined @CHK@ -static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void -@ISA@_@func@_FLOAT(npy_float* op, - npy_float* ip, - const npy_intp array_size, - const npy_intp steps) -{ - const npy_intp stride = steps/(npy_intp)sizeof(npy_float); - const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_float); - npy_intp num_remaining_elements = array_size; - @vtype@ ones_f = _mm@vsize@_set1_ps(1.0f); - @mask@ load_mask = @isa@_get_full_load_mask_ps(); - /* - * Note: while generally indices are npy_intp, we ensure that our maximum index - * will fit in an int32 as a precondition for this function via - * IS_OUTPUT_BLOCKABLE_UNARY - */ - - npy_int32 indexarr[16]; - for (npy_int32 ii = 0; ii < 16; ii++) { - indexarr[ii] = ii*stride; - } - @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]); - - while (num_remaining_elements > 0) { - if (num_remaining_elements < num_lanes) { - load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements, - num_lanes); - } - @vtype@ x; - if (stride == 1) { - x = @isa@_masked_load_ps(load_mask, ip); - } - else { - x = @isa@_masked_gather_ps(ones_f, ip, vindex, load_mask); - } - @vtype@ out = @isa@_@vectorf@_ps(x); - @masked_store@(op, @cvtps_epi32@(load_mask), out); - - ip += num_lanes*stride; - op += num_lanes; - num_remaining_elements -= num_lanes; - } -} -#endif -/**end repeat1**/ -/**end repeat**/ - -/**begin repeat - * #ISA = FMA, AVX512F# - * #isa = fma, avx512# - * #vsize = 256, 512# - * #BYTES = 32, 64# - * #cvtps_epi32 = _mm256_cvtps_epi32, # - * #mask = __m256i, __mmask8# - * #vsub = , _mask# - * #vtype = __m256d, __m512d# - * #vindextype = __m128i, __m256i# - * #vindexsize = 128, 256# - * #vindexload = _mm_loadu_si128, _mm256_loadu_si256# - * #cvtps_epi32 = _mm256_cvtpd_epi32, # - * #castmask = _mm256_castsi256_pd, # - * #masked_store = _mm256_maskstore_pd, _mm512_mask_storeu_pd# - * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS# - */ - -/**begin repeat1 - * #func = rint, floor, trunc# - * #vectorf = rint, floor, trunc# - */ - -#if defined @CHK@ -static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void 
-@ISA@_@func@_DOUBLE(npy_double* op, - npy_double* ip, - const npy_intp array_size, - const npy_intp steps) -{ - const npy_intp stride = steps/(npy_intp)sizeof(npy_double); - const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_double); - npy_intp num_remaining_elements = array_size; - @mask@ load_mask = @isa@_get_full_load_mask_pd(); - @vtype@ ones_d = _mm@vsize@_set1_pd(1.0f); - - /* - * Note: while generally indices are npy_intp, we ensure that our maximum index - * will fit in an int32 as a precondition for this function via - * IS_OUTPUT_BLOCKABLE_UNARY - */ - npy_int32 indexarr[8]; - for (npy_int32 ii = 0; ii < 8; ii++) { - indexarr[ii] = ii*stride; - } - @vindextype@ vindex = @vindexload@((@vindextype@*)&indexarr[0]); - - while (num_remaining_elements > 0) { - if (num_remaining_elements < num_lanes) { - load_mask = @isa@_get_partial_load_mask_pd(num_remaining_elements, - num_lanes); - } - @vtype@ x; - if (stride == 1) { - x = @isa@_masked_load_pd(load_mask, ip); - } - else { - x = @isa@_masked_gather_pd(ones_d, ip, vindex, @castmask@(load_mask)); - } - @vtype@ out = @isa@_@vectorf@_pd(x); - @masked_store@(op, load_mask, out); - - ip += num_lanes*stride; - op += num_lanes; - num_remaining_elements -= num_lanes; - } -} -#endif -/**end repeat1**/ -/**end repeat**/ - -/**begin repeat * #TYPE = CFLOAT, CDOUBLE# * #type = npy_float, npy_double# * #num_lanes = 16, 8# @@ -1535,3 +1356,4 @@ sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n) #undef VECTOR_SIZE_BYTES #endif /* NPY_HAVE_SSE2_INTRINSICS */ #endif + diff --git a/numpy/core/tests/test_cpu_features.py b/numpy/core/tests/test_cpu_features.py index 706cf7a7e..1a76897e2 100644 --- a/numpy/core/tests/test_cpu_features.py +++ b/numpy/core/tests/test_cpu_features.py @@ -140,8 +140,8 @@ class Test_X86_Features(AbstractTest): is_power = re.match("^(powerpc|ppc)64", machine, re.IGNORECASE) @pytest.mark.skipif(not is_linux or not is_power, reason="Only for Linux and Power") class Test_POWER_Features(AbstractTest): - features = ["VSX", "VSX2", "VSX3"] - features_map = dict(VSX2="ARCH_2_07", VSX3="ARCH_3_00") + features = ["VSX", "VSX2", "VSX3", "VSX4"] + features_map = dict(VSX2="ARCH_2_07", VSX3="ARCH_3_00", VSX4="ARCH_3_1") def load_flags(self): self.load_flags_auxv() diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py index 12a67c44d..605baefe6 100644 --- a/numpy/core/tests/test_simd.py +++ b/numpy/core/tests/test_simd.py @@ -330,16 +330,18 @@ class _SIMD_FP(_Test_Utility): square = self.square(vdata) assert square == data_square - @pytest.mark.parametrize("intrin, func", [("self.ceil", math.ceil), - ("self.trunc", math.trunc)]) + @pytest.mark.parametrize("intrin, func", [("ceil", math.ceil), + ("trunc", math.trunc), ("floor", math.floor), ("rint", round)]) def test_rounding(self, intrin, func): """ Test intrinsics: + npyv_rint_##SFX npyv_ceil_##SFX npyv_trunc_##SFX + npyv_floor##SFX """ intrin_name = intrin - intrin = eval(intrin) + intrin = getattr(self, intrin) pinf, ninf, nan = self._pinfinity(), self._ninfinity(), self._nan() # special cases round_cases = ((nan, nan), (pinf, pinf), (ninf, ninf)) @@ -347,20 +349,25 @@ class _SIMD_FP(_Test_Utility): data_round = [desired]*self.nlanes _round = intrin(self.setall(case)) assert _round == pytest.approx(data_round, nan_ok=True) + for x in range(0, 2**20, 256**2): for w in (-1.05, -1.10, -1.15, 1.05, 1.10, 1.15): - data = [x*w+a for a in range(self.nlanes)] - vdata = self.load(data) + data = self.load([(x+a)*w for a in range(self.nlanes)]) data_round = 
[func(x) for x in data] - _round = intrin(vdata) + _round = intrin(data) assert _round == data_round + # signed zero - if "ceil" in intrin_name or "trunc" in intrin_name: - for w in (-0.25, -0.30, -0.45): - _round = self._to_unsigned(intrin(self.setall(w))) - data_round = self._to_unsigned(self.setall(-0.0)) - assert _round == data_round - + if intrin_name == "floor": + data_szero = (-0.0,) + else: + data_szero = (-0.0, -0.25, -0.30, -0.45, -0.5) + + for w in data_szero: + _round = self._to_unsigned(intrin(self.setall(w))) + data_round = self._to_unsigned(self.setall(-0.0)) + assert _round == data_round + def test_max(self): """ Test intrinsics: diff --git a/numpy/distutils/ccompiler_opt.py b/numpy/distutils/ccompiler_opt.py index f1d024b94..854584998 100644 --- a/numpy/distutils/ccompiler_opt.py +++ b/numpy/distutils/ccompiler_opt.py @@ -294,6 +294,9 @@ class _Config: VSX2 = dict(interest=2, implies="VSX", implies_detect=False), ## Power9/ISA 3.00 VSX3 = dict(interest=3, implies="VSX2", implies_detect=False), + ## Power10/ISA 3.1 + VSX4 = dict(interest=4, implies="VSX3", implies_detect=False, + extra_checks="VSX4_MMA"), # IBM/Z ## VX(z13) support VX = dict(interest=1, headers="vecintrin.h"), @@ -471,12 +474,16 @@ class _Config: ), VSX3 = dict( flags="-mcpu=power9 -mtune=power9", implies_detect=False + ), + VSX4 = dict( + flags="-mcpu=power10 -mtune=power10", implies_detect=False ) ) if self.cc_is_clang: partial["VSX"]["flags"] = "-maltivec -mvsx" partial["VSX2"]["flags"] = "-mpower8-vector" partial["VSX3"]["flags"] = "-mpower9-vector" + partial["VSX4"]["flags"] = "-mpower10-vector" return partial diff --git a/numpy/distutils/checks/cpu_vsx4.c b/numpy/distutils/checks/cpu_vsx4.c new file mode 100644 index 000000000..a6acc7384 --- /dev/null +++ b/numpy/distutils/checks/cpu_vsx4.c @@ -0,0 +1,14 @@ +#ifndef __VSX__ + #error "VSX is not supported" +#endif +#include <altivec.h> + +typedef __vector unsigned int v_uint32x4; + +int main(void) +{ + v_uint32x4 v1 = (v_uint32x4){2, 4, 8, 16}; + v_uint32x4 v2 = (v_uint32x4){2, 2, 2, 2}; + v_uint32x4 v3 = vec_mod(v1, v2); + return (int)vec_extractm(v3); +} diff --git a/numpy/distutils/checks/extra_vsx4_mma.c b/numpy/distutils/checks/extra_vsx4_mma.c new file mode 100644 index 000000000..a70b2a9f6 --- /dev/null +++ b/numpy/distutils/checks/extra_vsx4_mma.c @@ -0,0 +1,21 @@ +#ifndef __VSX__ + #error "VSX is not supported" +#endif +#include <altivec.h> + +typedef __vector float fv4sf_t; +typedef __vector unsigned char vec_t; + +int main(void) +{ + __vector_quad acc0; + float a[4] = {0,1,2,3}; + float b[4] = {0,1,2,3}; + vec_t *va = (vec_t *) a; + vec_t *vb = (vec_t *) b; + __builtin_mma_xvf32ger(&acc0, va[0], vb[0]); + fv4sf_t result[4]; + __builtin_mma_disassemble_acc((void *)result, &acc0); + fv4sf_t c0 = result[0]; + return (int)((float*)&c0)[0]; +} diff --git a/numpy/distutils/command/build.py b/numpy/distutils/command/build.py index dc1ab3b9b..80830d559 100644 --- a/numpy/distutils/command/build.py +++ b/numpy/distutils/command/build.py @@ -47,8 +47,8 @@ class build(old_build): - not part of dispatch-able features(--cpu-dispatch) - not supported by compiler or platform """ - self.simd_test = "BASELINE SSE2 SSE42 XOP FMA4 (FMA3 AVX2) AVX512F" \ - " AVX512_SKX VSX VSX2 VSX3 NEON ASIMD VX VXE VXE2" + self.simd_test = "BASELINE SSE2 SSE42 XOP FMA4 (FMA3 AVX2) AVX512F " \ + "AVX512_SKX VSX VSX2 VSX3 VSX4 NEON ASIMD VX VXE VXE2" def finalize_options(self): build_scripts = self.build_scripts diff --git a/numpy/distutils/tests/test_ccompiler_opt.py 
b/numpy/distutils/tests/test_ccompiler_opt.py index 6f9970c75..1ca8bc09b 100644 --- a/numpy/distutils/tests/test_ccompiler_opt.py +++ b/numpy/distutils/tests/test_ccompiler_opt.py @@ -405,7 +405,7 @@ class _Test_CCompilerOpt: # in msvc, avx512_knl avx512_knm aren't supported x86_msvc=".* xop fma4 .* avx512f .* avx512_skx .*", armhf=".* asimd asimdhp asimddp .*", - ppc64="vsx vsx2 vsx3.*", + ppc64="vsx vsx2 vsx3 vsx4.*", s390x="vx vxe vxe2.*" ) # min @@ -544,13 +544,13 @@ class _Test_CCompilerOpt: """ /*@targets sse sse2 sse41 avx avx2 avx512f - vsx vsx2 vsx3 + vsx vsx2 vsx3 vsx4 neon neon_fp16 asimdhp asimddp vx vxe vxe2 */ """, baseline="avx vsx2 asimd vx vxe", - x86="avx512f avx2", armhf="asimddp asimdhp", ppc64="vsx3", + x86="avx512f avx2", armhf="asimddp asimdhp", ppc64="vsx4 vsx3", s390x="vxe2" ) # test skipping non-dispatch features @@ -558,7 +558,7 @@ class _Test_CCompilerOpt: """ /*@targets sse41 avx avx2 avx512f - vsx2 vsx3 + vsx2 vsx3 vsx4 asimd asimdhp asimddp vx vxe vxe2 */ @@ -571,13 +571,13 @@ class _Test_CCompilerOpt: """ /*@targets sse2 sse41 avx2 avx512f - vsx2 vsx3 + vsx2 vsx3 vsx4 neon asimdhp asimddp vx vxe vxe2 */ """, baseline="", - trap_files=".*(avx2|avx512f|vsx3|asimddp|vxe2).c", + trap_files=".*(avx2|avx512f|vsx3|vsx4|asimddp|vxe2).c", x86="sse41 sse2", ppc64="vsx2", armhf="asimdhp neon", s390x="vxe vx" ) diff --git a/numpy/f2py/capi_maps.py b/numpy/f2py/capi_maps.py index 581f946e5..b4fab71f9 100644 --- a/numpy/f2py/capi_maps.py +++ b/numpy/f2py/capi_maps.py @@ -504,7 +504,8 @@ def sign2map(a, var): varname,ctype,atype init,init.r,init.i,pytype vardebuginfo,vardebugshowvalue,varshowvalue - varrfromat + varrformat + intent """ out_a = a diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py index d4abde425..d5b130b72 100644 --- a/numpy/lib/function_base.py +++ b/numpy/lib/function_base.py @@ -4907,7 +4907,7 @@ def meshgrid(*xi, copy=True, sparse=False, indexing='xy'): >>> x = np.linspace(-5, 5, 101) >>> y = np.linspace(-5, 5, 101) - >>> # full coorindate arrays + >>> # full coordinate arrays >>> xx, yy = np.meshgrid(x, y) >>> zz = np.sqrt(xx**2 + yy**2) >>> xx.shape, yy.shape, zz.shape diff --git a/numpy/lib/polynomial.py b/numpy/lib/polynomial.py index f824c4c5e..6aa708861 100644 --- a/numpy/lib/polynomial.py +++ b/numpy/lib/polynomial.py @@ -686,7 +686,7 @@ def polyfit(x, y, deg, rcond=None, full=False, w=None, cov=False): "to scale the covariance matrix") # note, this used to be: fac = resids / (len(x) - order - 2.0) # it was deciced that the "- 2" (originally justified by "Bayesian - # uncertainty analysis") is not was the user expects + # uncertainty analysis") is not what the user expects # (see gh-11196 and gh-11197) fac = resids / (len(x) - order) if y.ndim == 1: diff --git a/numpy/ma/core.py b/numpy/ma/core.py index e0e5403a9..9c9dfac68 100644 --- a/numpy/ma/core.py +++ b/numpy/ma/core.py @@ -3542,15 +3542,17 @@ class MaskedArray(ndarray): def harden_mask(self): """ - Force the mask to hard. + Force the mask to hard, preventing unmasking by assignment. Whether the mask of a masked array is hard or soft is determined by its `~ma.MaskedArray.hardmask` property. `harden_mask` sets - `~ma.MaskedArray.hardmask` to ``True``. + `~ma.MaskedArray.hardmask` to ``True`` (and returns the modified + self). See Also -------- ma.MaskedArray.hardmask + ma.MaskedArray.soften_mask """ self._hardmask = True @@ -3558,15 +3560,17 @@ class MaskedArray(ndarray): def soften_mask(self): """ - Force the mask to soft. 
+ Force the mask to soft (default), allowing unmasking by assignment. Whether the mask of a masked array is hard or soft is determined by its `~ma.MaskedArray.hardmask` property. `soften_mask` sets - `~ma.MaskedArray.hardmask` to ``False``. + `~ma.MaskedArray.hardmask` to ``False`` (and returns the modified + self). See Also -------- ma.MaskedArray.hardmask + ma.MaskedArray.harden_mask """ self._hardmask = False @@ -3574,16 +3578,55 @@ class MaskedArray(ndarray): @property def hardmask(self): - """ Hardness of the mask """ + """ + Specifies whether values can be unmasked through assignments. + + By default, assigning definite values to masked array entries will + unmask them. When `hardmask` is ``True``, the mask will not change + through assignments. + + See Also + -------- + ma.MaskedArray.harden_mask + ma.MaskedArray.soften_mask + + Examples + -------- + >>> x = np.arange(10) + >>> m = np.ma.masked_array(x, x>5) + >>> assert not m.hardmask + + Since `m` has a soft mask, assigning an element value unmasks that + element: + + >>> m[8] = 42 + >>> m + masked_array(data=[0, 1, 2, 3, 4, 5, --, --, 42, --], + mask=[False, False, False, False, False, False, + True, True, False, True], + fill_value=999999) + + After hardening, the mask is not affected by assignments: + + >>> hardened = np.ma.harden_mask(m) + >>> assert m.hardmask and hardened is m + >>> m[:] = 23 + >>> m + masked_array(data=[23, 23, 23, 23, 23, 23, --, --, 23, --], + mask=[False, False, False, False, False, False, + True, True, False, True], + fill_value=999999) + + """ return self._hardmask def unshare_mask(self): """ - Copy the mask and set the sharedmask flag to False. + Copy the mask and set the `sharedmask` flag to ``False``. Whether the mask is shared between masked arrays can be seen from - the `sharedmask` property. `unshare_mask` ensures the mask is not shared. - A copy of the mask is only made if it was shared. + the `sharedmask` property. `unshare_mask` ensures the mask is not + shared. A copy of the mask is only made if it was shared. See Also -------- diff --git a/numpy/random/mtrand.pyx b/numpy/random/mtrand.pyx index 8bf74aa5d..38b5484bc 100644 --- a/numpy/random/mtrand.pyx +++ b/numpy/random/mtrand.pyx @@ -4341,7 +4341,7 @@ cdef class RandomState: The drawn samples, of shape ``(size, k)``. 
Raises - ------- + ------ ValueError If any value in ``alpha`` is less than or equal to zero diff --git a/numpy/testing/_private/extbuild.py b/numpy/testing/_private/extbuild.py index 940e2f7d7..b7a071e7f 100644 --- a/numpy/testing/_private/extbuild.py +++ b/numpy/testing/_private/extbuild.py @@ -8,8 +8,6 @@ import os import pathlib import sys import sysconfig -from numpy.distutils.ccompiler import new_compiler -from distutils.errors import CompileError __all__ = ['build_and_import_extension', 'compile_extension_module'] @@ -53,6 +51,7 @@ def build_and_import_extension( >>> assert not mod.test_bytes(u'abc') >>> assert mod.test_bytes(b'abc') """ + from distutils.errors import CompileError body = prologue + _make_methods(functions, modname) init = """PyObject *mod = PyModule_Create(&moduledef); @@ -221,6 +220,7 @@ def _c_compile(cfile, outputfilename, include_dirs=[], libraries=[], def build(cfile, outputfilename, compile_extra, link_extra, include_dirs, libraries, library_dirs): "cd into the directory where the cfile is, use distutils to build" + from numpy.distutils.ccompiler import new_compiler compiler = new_compiler(force=1, verbose=2) compiler.customize('') diff --git a/numpy/typing/tests/data/reveal/lib_function_base.pyi b/numpy/typing/tests/data/reveal/lib_function_base.pyi index c559eb295..eebe9fbfd 100644 --- a/numpy/typing/tests/data/reveal/lib_function_base.pyi +++ b/numpy/typing/tests/data/reveal/lib_function_base.pyi @@ -26,7 +26,7 @@ reveal_type(vectorized_func.signature) # E: Union[None, builtins.str] reveal_type(vectorized_func.otypes) # E: Union[None, builtins.str] reveal_type(vectorized_func.excluded) # E: set[Union[builtins.int, builtins.str]] reveal_type(vectorized_func.__doc__) # E: Union[None, builtins.str] -reveal_type(vectorized_func([1])) # E: ndarray[Any, dtype[Any]] +reveal_type(vectorized_func([1])) # E: Any reveal_type(np.vectorize(int)) # E: vectorize reveal_type(np.vectorize( # E: vectorize int, otypes="i", doc="doc", excluded=(), cache=True, signature=None @@ -291,7 +291,7 @@ def parse_setuppy_commands(): - `pip install .` (from a git repo or downloaded source release) - - `pip install numpy` (last NumPy release on PyPi) + - `pip install numpy` (last NumPy release on PyPI) """)) return True @@ -303,7 +303,7 @@ def parse_setuppy_commands(): To install NumPy from here with reliable uninstall, we recommend that you use `pip install .`. To install the latest NumPy release - from PyPi, use `pip install numpy`. + from PyPI, use `pip install numpy`. For help with build/installation issues, please ask on the numpy-discussion mailing list. If you are sure that you have run @@ -371,7 +371,7 @@ def get_docs_url(): if 'dev' in VERSION: return "https://numpy.org/devdocs" else: - # For releases, this URL ends up on pypi. + # For releases, this URL ends up on PyPI. # By pinning the version, users looking at old PyPI releases can get # to the associated docs easily. 
return "https://numpy.org/doc/{}.{}".format(MAJOR, MINOR) diff --git a/tools/gitpod/Dockerfile b/tools/gitpod/Dockerfile index e2e0e1bc9..592a5ee0a 100644 --- a/tools/gitpod/Dockerfile +++ b/tools/gitpod/Dockerfile @@ -27,7 +27,7 @@ # OS/ARCH: linux/amd64 FROM gitpod/workspace-base:latest -ARG MAMBAFORGE_VERSION="4.10.0-0" +ARG MAMBAFORGE_VERSION="4.11.0-0" ARG CONDA_ENV=numpy-dev diff --git a/tools/openblas_support.py b/tools/openblas_support.py index 4eb72dbc9..c89cb9284 100644 --- a/tools/openblas_support.py +++ b/tools/openblas_support.py @@ -13,8 +13,9 @@ from tempfile import mkstemp, gettempdir from urllib.request import urlopen, Request from urllib.error import HTTPError -OPENBLAS_V = '0.3.18' -OPENBLAS_LONG = 'v0.3.18' +# 0.3.19 fails AVX512_SKX tests, issue 20654, comments in PR 20660 +OPENBLAS_V = '0.3.19.dev' +OPENBLAS_LONG = 'v0.3.19-22-g5188aede' BASE_LOC = 'https://anaconda.org/multibuild-wheels-staging/openblas-libs' BASEURL = f'{BASE_LOC}/{OPENBLAS_LONG}/download' SUPPORTED_PLATFORMS = [ |
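Editorial aside (not part of the diff): the NEON npyv_rint_f32 fallback added above describes rounding to nearest-even by adding and then subtracting the magic constant 1.5 * 2**23. At that magnitude a float32 has no fractional bits left, so the addition itself performs the round-to-nearest-even step in hardware. A minimal NumPy sketch of the same trick, as an illustration only and not the SIMD code path:

    import numpy as np

    magic = np.float32(1.5 * 2**23)        # 12582912.0; float32 spacing here is 1.0

    x = np.array([0.5, 1.5, 2.5, -1.2, 3.7], dtype=np.float32)
    rounded = (x + magic) - magic          # stays in float32 throughout

    print(rounded)        # [ 0.  2.  2. -1.  4.]  -- ties round to even
    print(np.rint(x))     # identical for |x| well below 2**23

The actual kernel in the patch additionally handles overflow and the sign of zero, which this sketch omits.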
