-rw-r--r--  numpy/__init__.py                          |  9
-rw-r--r--  numpy/core/setup.py                        | 36
-rw-r--r--  numpy/core/setup_common.py                 | 53
-rw-r--r--  numpy/core/src/multiarray/nditer_constr.c  | 96
-rw-r--r--  numpy/core/src/umath/loops.c.src           | 12
-rw-r--r--  numpy/core/tests/test_nditer.py            | 29
6 files changed, 132 insertions(+), 103 deletions(-)
diff --git a/numpy/__init__.py b/numpy/__init__.py
index 3260046d6..22c90677e 100644
--- a/numpy/__init__.py
+++ b/numpy/__init__.py
@@ -85,10 +85,11 @@ __version__
 
 Viewing documentation using IPython
 -----------------------------------
-Start IPython with the NumPy profile (``ipython -p numpy``), which will
-import `numpy` under the alias ``np``. Then, use the ``cpaste`` command to
-paste examples into the shell. To see which functions are available in
-`numpy`, type ``np.<TAB>`` (where ``<TAB>`` refers to the TAB key), or use
+
+Start IPython and import `numpy` usually under the alias ``np``: `import
+numpy as np`. Then, directly past or use the ``%cpaste`` magic to paste
+examples into the shell. To see which functions are available in `numpy`,
+type ``np.<TAB>`` (where ``<TAB>`` refers to the TAB key), or use
 ``np.*cos*?<ENTER>`` (where ``<ENTER>`` refers to the ENTER key) to narrow
 down the list. To view the docstring for a function, use
 ``np.cos?<ENTER>`` (to view the docstring) and ``np.cos??<ENTER>`` (to view
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 17dc8438e..10b8c093e 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -177,6 +177,16 @@ def check_math_capabilities(config, ext, moredefs, mathlibs):
         else:
             return 1
 
+    # GH-14787: Work around GCC<8.4 bug when compiling with AVX512
+    # support on Windows-based platforms
+    def check_gh14787(fn):
+        if fn == 'attribute_target_avx512f':
+            if (sys.platform in ('win32', 'cygwin') and
+                    config.check_compiler_gcc() and
+                    not config.check_gcc_version_at_least(8, 4)):
+                ext.extra_compile_args.extend(
+                        ['-ffixed-xmm%s' % n for n in range(16, 32)])
+
     #use_msvc = config.check_decl("_MSC_VER")
     if not check_funcs_once(MANDATORY_FUNCS, add_to_moredefs=False):
         raise SystemError("One of the required function to build numpy is not"
@@ -227,19 +237,19 @@ def check_math_capabilities(config, ext, moredefs, mathlibs):
     for dec, fn in OPTIONAL_FUNCTION_ATTRIBUTES:
         if config.check_gcc_function_attribute(dec, fn):
             moredefs.append((fname2def(fn), 1))
-            if fn == 'attribute_target_avx512f':
-                # GH-14787: Work around GCC<8.4 bug when compiling with AVX512
-                # support on Windows-based platforms
-                if (sys.platform in ('win32', 'cygwin') and
-                        config.check_compiler_gcc() and
-                        not config.check_gcc_version_at_least(8, 4)):
-                    ext.extra_compile_args.extend(
-                            ['-ffixed-xmm%s' % n for n in range(16, 32)])
-
-    for dec, fn, code, header in OPTIONAL_FUNCTION_ATTRIBUTES_WITH_INTRINSICS:
-        if config.check_gcc_function_attribute_with_intrinsics(dec, fn, code,
-                                                               header):
-            moredefs.append((fname2def(fn), 1))
+            check_gh14787(fn)
+
+    platform = sysconfig.get_platform()
+    if ("x86_64" in platform):
+        for dec, fn in OPTIONAL_FUNCTION_ATTRIBUTES_AVX:
+            if config.check_gcc_function_attribute(dec, fn):
+                moredefs.append((fname2def(fn), 1))
+                check_gh14787(fn)
+        for dec, fn, code, header in (
+                OPTIONAL_FUNCTION_ATTRIBUTES_WITH_INTRINSICS_AVX):
+            if config.check_gcc_function_attribute_with_intrinsics(
+                    dec, fn, code, header):
+                moredefs.append((fname2def(fn), 1))
 
     for fn in OPTIONAL_VARIABLE_ATTRIBUTES:
         if config.check_gcc_variable_attribute(fn):
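The GH-14787 workaround only fires for the avx512f attribute check, on win32/cygwin, with a GCC older than 8.4; it then pins xmm16 through xmm31 so the buggy register allocator never touches them. A rough standalone sketch of that gate (the numpy.distutils compiler checks are stubbed out here as plain arguments, purely for illustration):

    import sys

    def gh14787_extra_args(fn, is_gcc, gcc_version):
        # ``is_gcc`` and ``gcc_version`` stand in for the
        # config.check_compiler_gcc() / check_gcc_version_at_least()
        # calls that setup.py actually makes.
        if fn != 'attribute_target_avx512f':
            return []
        if sys.platform not in ('win32', 'cygwin'):
            return []
        if is_gcc and gcc_version < (8, 4):
            # Keep xmm16..xmm31 out of the allocator's hands (GCC < 8.4 bug).
            return ['-ffixed-xmm%d' % n for n in range(16, 32)]
        return []

    # On win32 with e.g. GCC 7.3 this yields the 16 '-ffixed-xmm*' flags;
    # on other platforms or newer compilers it yields nothing.
    print(gh14787_extra_args('attribute_target_avx512f', True, (7, 3)))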
diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py
index a8497fe75..55daa8648 100644
--- a/numpy/core/setup_common.py
+++ b/numpy/core/setup_common.py
@@ -209,16 +209,18 @@ OPTIONAL_FUNCTION_ATTRIBUTES = [('__attribute__((optimize("unroll-loops")))',
                                 'attribute_optimize_opt_2'),
                                 ('__attribute__((nonnull (1)))',
                                 'attribute_nonnull'),
-                                ('__attribute__((target ("avx")))',
-                                'attribute_target_avx'),
-                                ('__attribute__((target ("avx2")))',
-                                'attribute_target_avx2'),
-                                ('__attribute__((target ("avx512f")))',
-                                'attribute_target_avx512f'),
-                                ('__attribute__((target ("avx512f,avx512dq,avx512bw,avx512vl,avx512cd")))',
-                                'attribute_target_avx512_skx'),
                                 ]
 
+OPTIONAL_FUNCTION_ATTRIBUTES_AVX = [('__attribute__((target ("avx")))',
+                                'attribute_target_avx'),
+                                ('__attribute__((target ("avx2")))',
+                                'attribute_target_avx2'),
+                                ('__attribute__((target ("avx512f")))',
+                                'attribute_target_avx512f'),
+                                ('__attribute__((target ("avx512f,avx512dq,avx512bw,avx512vl,avx512cd")))',
+                                'attribute_target_avx512_skx'),
+                                ]
+
 # function attributes with intrinsics
 # To ensure your compiler can compile avx intrinsics with just the attributes
 # gcc 4.8.4 support attributes but not with intrisics
@@ -227,23 +229,24 @@ OPTIONAL_FUNCTION_ATTRIBUTES = [('__attribute__((optimize("unroll-loops")))',
 # The _mm512_castps_si512 instruction is specific check for AVX-512F support
 # in gcc-4.9 which is missing a subset of intrinsics. See
 # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61878
-OPTIONAL_FUNCTION_ATTRIBUTES_WITH_INTRINSICS = [('__attribute__((target("avx2,fma")))',
-                                'attribute_target_avx2_with_intrinsics',
-                                '__m256 temp = _mm256_set1_ps(1.0); temp = \
-                                _mm256_fmadd_ps(temp, temp, temp)',
-                                'immintrin.h'),
-                                ('__attribute__((target("avx512f")))',
-                                'attribute_target_avx512f_with_intrinsics',
-                                '__m512i temp = _mm512_castps_si512(_mm512_set1_ps(1.0))',
-                                'immintrin.h'),
-                                ('__attribute__((target ("avx512f,avx512dq,avx512bw,avx512vl,avx512cd")))',
-                                'attribute_target_avx512_skx_with_intrinsics',
-                                '__mmask8 temp = _mm512_fpclass_pd_mask(_mm512_set1_pd(1.0), 0x01);\
-                                __m512i unused_temp = \
-                                    _mm512_castps_si512(_mm512_set1_ps(1.0));\
-                                _mm_mask_storeu_epi8(NULL, 0xFF, _mm_broadcastmb_epi64(temp))',
-                                'immintrin.h'),
-                                ]
+OPTIONAL_FUNCTION_ATTRIBUTES_WITH_INTRINSICS_AVX = [
+                                ('__attribute__((target("avx2,fma")))',
+                                'attribute_target_avx2_with_intrinsics',
+                                '__m256 temp = _mm256_set1_ps(1.0); temp = \
+                                _mm256_fmadd_ps(temp, temp, temp)',
+                                'immintrin.h'),
+                                ('__attribute__((target("avx512f")))',
+                                'attribute_target_avx512f_with_intrinsics',
+                                '__m512i temp = _mm512_castps_si512(_mm512_set1_ps(1.0))',
+                                'immintrin.h'),
+                                ('__attribute__((target ("avx512f,avx512dq,avx512bw,avx512vl,avx512cd")))',
+                                'attribute_target_avx512_skx_with_intrinsics',
+                                '__mmask8 temp = _mm512_fpclass_pd_mask(_mm512_set1_pd(1.0), 0x01);\
+                                __m512i unused_temp = \
+                                    _mm512_castps_si512(_mm512_set1_ps(1.0));\
+                                _mm_mask_storeu_epi8(NULL, 0xFF, _mm_broadcastmb_epi64(temp))',
+                                'immintrin.h'),
+                                ]
 
 def fname2def(name):
     return "HAVE_%s" % name.upper()
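The practical effect of splitting out the *_AVX lists is that the AVX attribute probes only run on x86_64 builds, and each probe that succeeds still becomes a HAVE_* macro via fname2def. A small illustration of that mapping, using sysconfig.get_platform() the same way setup.py now does (the real build additionally runs the compiler checks first, which are omitted here):

    import sysconfig

    def fname2def(name):
        # Same helper as in setup_common.py.
        return "HAVE_%s" % name.upper()

    OPTIONAL_FUNCTION_ATTRIBUTES_AVX = [
        ('__attribute__((target ("avx")))', 'attribute_target_avx'),
        ('__attribute__((target ("avx2")))', 'attribute_target_avx2'),
        ('__attribute__((target ("avx512f")))', 'attribute_target_avx512f'),
    ]

    # Only x86_64 platforms even look at the AVX attribute list.
    if "x86_64" in sysconfig.get_platform():
        for dec, fn in OPTIONAL_FUNCTION_ATTRIBUTES_AVX:
            # setup.py first verifies each entry with
            # config.check_gcc_function_attribute(dec, fn); here we only
            # show the macro a successful probe would define.
            print(fname2def(fn))    # e.g. HAVE_ATTRIBUTE_TARGET_AVX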
diff --git a/numpy/core/src/multiarray/nditer_constr.c b/numpy/core/src/multiarray/nditer_constr.c
index b6acce570..248397196 100644
--- a/numpy/core/src/multiarray/nditer_constr.c
+++ b/numpy/core/src/multiarray/nditer_constr.c
@@ -1410,9 +1410,9 @@ check_mask_for_writemasked_reduction(NpyIter *iter, int iop)
 static int
 npyiter_check_reduce_ok_and_set_flags(
         NpyIter *iter, npy_uint32 flags, npyiter_opitflags *op_itflags,
-        int dim) {
+        int iop, int maskop, int dim) {
     /* If it's writeable, this means a reduction */
-    if (*op_itflags & NPY_OP_ITFLAG_WRITE) {
+    if (op_itflags[iop] & NPY_OP_ITFLAG_WRITE) {
         if (!(flags & NPY_ITER_REDUCE_OK)) {
             PyErr_Format(PyExc_ValueError,
                     "output operand requires a reduction along dimension %d, "
@@ -1420,17 +1420,35 @@ npyiter_check_reduce_ok_and_set_flags(
                     "does not match the expected output shape.", dim);
             return 0;
         }
-        if (!(*op_itflags & NPY_OP_ITFLAG_READ)) {
+        if (!(op_itflags[iop] & NPY_OP_ITFLAG_READ)) {
             PyErr_SetString(PyExc_ValueError,
                     "output operand requires a reduction, but is flagged as "
                     "write-only, not read-write");
             return 0;
         }
+        /*
+         * The ARRAYMASK can't be a reduction, because
+         * it would be possible to write back to the
+         * array once when the ARRAYMASK says 'True',
+         * then have the reduction on the ARRAYMASK
+         * later flip to 'False', indicating that the
+         * write back should never have been done,
+         * and violating the strict masking semantics
+         */
+        if (iop == maskop) {
+            PyErr_SetString(PyExc_ValueError,
+                    "output operand requires a "
+                    "reduction, but is flagged as "
+                    "the ARRAYMASK operand which "
+                    "is not permitted to be the "
+                    "result of a reduction");
+            return 0;
+        }
 
         NPY_IT_DBG_PRINT("Iterator: Indicating that a reduction is"
                          "occurring\n");
         NIT_ITFLAGS(iter) |= NPY_ITFLAG_REDUCE;
-        *op_itflags |= NPY_OP_ITFLAG_REDUCE;
+        op_itflags[iop] |= NPY_OP_ITFLAG_REDUCE;
     }
     return 1;
 }
@@ -1613,42 +1631,9 @@ npyiter_fill_axisdata(NpyIter *iter, npy_uint32 flags, npyiter_opitflags *op_itf
                         goto operand_different_than_broadcast;
                     }
                     /* If it's writeable, this means a reduction */
-                    if (op_itflags[iop] & NPY_OP_ITFLAG_WRITE) {
-                        if (!(flags & NPY_ITER_REDUCE_OK)) {
-                            PyErr_SetString(PyExc_ValueError,
-                                    "output operand requires a "
-                                    "reduction, but reduction is "
-                                    "not enabled");
-                            return 0;
-                        }
-                        if (!(op_itflags[iop] & NPY_OP_ITFLAG_READ)) {
-                            PyErr_SetString(PyExc_ValueError,
-                                    "output operand requires a "
-                                    "reduction, but is flagged as "
-                                    "write-only, not read-write");
-                            return 0;
-                        }
-                        /*
-                         * The ARRAYMASK can't be a reduction, because
-                         * it would be possible to write back to the
-                         * array once when the ARRAYMASK says 'True',
-                         * then have the reduction on the ARRAYMASK
-                         * later flip to 'False', indicating that the
-                         * write back should never have been done,
-                         * and violating the strict masking semantics
-                         */
-                        if (iop == maskop) {
-                            PyErr_SetString(PyExc_ValueError,
-                                    "output operand requires a "
-                                    "reduction, but is flagged as "
-                                    "the ARRAYMASK operand which "
-                                    "is not permitted to be the "
-                                    "result of a reduction");
-                            return 0;
-                        }
-
-                        NIT_ITFLAGS(iter) |= NPY_ITFLAG_REDUCE;
-                        op_itflags[iop] |= NPY_OP_ITFLAG_REDUCE;
+                    if (!npyiter_check_reduce_ok_and_set_flags(
+                            iter, flags, op_itflags, iop, maskop, idim)) {
+                        return 0;
                     }
                 }
                 else {
@@ -1697,7 +1682,7 @@ npyiter_fill_axisdata(NpyIter *iter, npy_uint32 flags, npyiter_opitflags *op_itf
                             goto operand_different_than_broadcast;
                         }
                         if (!npyiter_check_reduce_ok_and_set_flags(
-                                iter, flags, &op_itflags[iop], i)) {
+                                iter, flags, op_itflags, iop, maskop, i)) {
                             return 0;
                         }
                     }
@@ -1707,8 +1692,14 @@ npyiter_fill_axisdata(NpyIter *iter, npy_uint32 flags, npyiter_opitflags *op_itf
                     }
                     else {
                         strides[iop] = 0;
+                        /*
+                         * If deleting this axis produces a reduction, but
+                         * reduction wasn't enabled, throw an error.
+                         * NOTE: We currently always allow new-axis if the iteration
+                         * size is 1 (thus allowing broadcasting sometimes).
+                         */
                         if (!npyiter_check_reduce_ok_and_set_flags(
-                                iter, flags, &op_itflags[iop], i)) {
+                                iter, flags, op_itflags, iop, maskop, i)) {
                             return 0;
                         }
                     }
@@ -2545,6 +2536,11 @@ npyiter_new_temp_array(NpyIter *iter, PyTypeObject *subtype,
             i = npyiter_undo_iter_axis_perm(idim, ndim, perm, NULL);
             i = npyiter_get_op_axis(op_axes[i], &reduction_axis);
 
+            /*
+             * If i < 0, this is a new axis (the operand does not have it)
+             * so we can ignore it here. The iterator setup will have
+             * ensured already that a potential reduction/broadcast is valid
+             */
             if (i >= 0) {
                 NPY_IT_DBG_PRINT3("Iterator: Setting allocated stride %d "
                                   "for iterator dimension %d to %d\n", (int)i,
@@ -2575,22 +2571,6 @@ npyiter_new_temp_array(NpyIter *iter, PyTypeObject *subtype,
                     stride *= shape[i];
                 }
             }
-            else {
-                if (shape == NULL) {
-                    /*
-                     * If deleting this axis produces a reduction, but
-                     * reduction wasn't enabled, throw an error.
-                     * NOTE: We currently always allow new-axis if the iteration
-                     * size is 1 (thus allowing broadcasting sometimes).
-                     */
-                    if (!reduction_axis && NAD_SHAPE(axisdata) != 1) {
-                        if (!npyiter_check_reduce_ok_and_set_flags(
-                                iter, flags, op_itflags, i)) {
-                            return NULL;
-                        }
-                    }
-                }
-            }
         }
     }
     else {
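From Python, the consolidated helper is what now raises when the ARRAYMASK operand would itself have to be reduced. A small reproduction in the spirit of the new test added below (shapes chosen for illustration): the (1, 4) mask would need a reduction along the first axis to cover every row of the (3, 4) output, which the iterator refuses.

    import numpy as np

    arr = np.zeros((3, 4))
    mask = np.zeros((1, 4), dtype=bool)   # would have to be reduced over axis 0

    try:
        np.nditer((mask, arr),
                  flags=["reduce_ok"],
                  op_flags=[["arraymask", "readwrite", "allocate"],
                            ["writeonly", "writemasked"]])
    except ValueError as exc:
        print("rejected:", exc)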
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index e5104db81..fe5aa9374 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -571,7 +571,6 @@ NPY_NO_EXPORT void
 
 /**begin repeat1
  * #isa = , _avx2#
- * #ISA = , AVX2#
  * #CHK = 1, defined(HAVE_ATTRIBUTE_TARGET_AVX2)#
  * #ATTR = , NPY_GCC_TARGET_AVX2#
  */
@@ -658,6 +657,7 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
 #define INT_left_shift_needs_clear_floatstatus
 #define UINT_left_shift_needs_clear_floatstatus
 
+#if @CHK@
 NPY_NO_EXPORT NPY_GCC_OPT_3 void
 @TYPE@_left_shift@isa@(char **args, npy_intp const *dimensions,
                        npy_intp const *steps, void *NPY_UNUSED(func))
@@ -670,10 +670,12 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void
     npy_clear_floatstatus_barrier((char*)dimensions);
 #endif
 }
+#endif
 
 #undef INT_left_shift_needs_clear_floatstatus
 #undef UINT_left_shift_needs_clear_floatstatus
 
+#if @CHK@
 NPY_NO_EXPORT
 #ifndef NPY_DO_NOT_OPTIMIZE_@TYPE@_right_shift
 NPY_GCC_OPT_3
@@ -684,7 +686,7 @@ void
 {
     BINARY_LOOP_FAST(@type@, @type@, *out = npy_rshift@c@(in1, in2));
 }
-
+#endif
 
 /**begin repeat2
  * #kind = logical_and, logical_or#
@@ -1448,7 +1450,10 @@ NPY_NO_EXPORT void
 /**begin repeat2
  * #ISA = , _avx512_skx#
  * #isa = simd, avx512_skx#
+ * #CHK = 1, defined(HAVE_ATTRIBUTE_TARGET_AVX512_SKX)#
  **/
+
+#if @CHK@
 NPY_NO_EXPORT void
 @TYPE@_@kind@@ISA@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
@@ -1460,6 +1465,7 @@ NPY_NO_EXPORT void
     }
     npy_clear_floatstatus_barrier((char*)dimensions);
 }
+#endif
 /**end repeat2**/
 
 /**end repeat1**/
@@ -2289,7 +2295,7 @@ NPY_NO_EXPORT void
     }
 }
 
-#if @SIMD@
+#if @SIMD@ && defined(HAVE_ATTRIBUTE_TARGET_AVX512F)
 /**begin repeat1
  * arithmetic
  * #kind = conjugate, square, absolute#
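These guards work through the .c.src template expansion: each name declared in a /**begin repeat*/ block is substituted once per listed value, so the second instantiation of every loop ends up wrapped in #if defined(HAVE_ATTRIBUTE_TARGET_AVX2) and compiles to nothing when the build did not detect the attribute. A toy Python substitution, far simpler than numpy's real conv_template.py, just to show the expansion:

    TEMPLATE = """\
    #if @CHK@
    NPY_NO_EXPORT void
    INT_left_shift@isa@(char **args)
    {
        /* loop body */
    }
    #endif
    """

    # One dict per repeat value: the baseline loop and the AVX2 variant.
    SUBSTITUTIONS = [
        {"@isa@": "", "@CHK@": "1"},
        {"@isa@": "_avx2", "@CHK@": "defined(HAVE_ATTRIBUTE_TARGET_AVX2)"},
    ]

    for subs in SUBSTITUTIONS:
        expanded = TEMPLATE
        for key, value in subs.items():
            expanded = expanded.replace(key, value)
        print(expanded)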
diff --git a/numpy/core/tests/test_nditer.py b/numpy/core/tests/test_nditer.py
index b43bc50e9..08f44568c 100644
--- a/numpy/core/tests/test_nditer.py
+++ b/numpy/core/tests/test_nditer.py
@@ -2728,6 +2728,7 @@ def test_iter_writemasked_badinput():
                  op_dtypes=['f4', None],
                  casting='same_kind')
 
+
 def _is_buffered(iterator):
     try:
         iterator.itviews
@@ -2803,6 +2804,34 @@ def test_iter_writemasked(a):
     # were copied back
     assert_equal(a, np.broadcast_to([3, 3, 2.5] * reps, shape))
 
+
+@pytest.mark.parametrize(["mask", "mask_axes"], [
+        # Allocated operand (only broadcasts with -1)
+        (None, [-1, 0]),
+        # Reduction along the first dimension (with and without op_axes)
+        (np.zeros((1, 4), dtype="bool"), [0, 1]),
+        (np.zeros((1, 4), dtype="bool"), None),
+        # Test 0-D and -1 op_axes
+        (np.zeros(4, dtype="bool"), [-1, 0]),
+        (np.zeros((), dtype="bool"), [-1, -1]),
+        (np.zeros((), dtype="bool"), None)])
+def test_iter_writemasked_broadcast_error(mask, mask_axes):
+    # This assumes that a readwrite mask makes sense. This is likely not the
+    # case and should simply be deprecated.
+    arr = np.zeros((3, 4))
+    itflags = ["reduce_ok"]
+    mask_flags = ["arraymask", "readwrite", "allocate"]
+    a_flags = ["writeonly", "writemasked"]
+    if mask_axes is None:
+        op_axes = None
+    else:
+        op_axes = [mask_axes, [0, 1]]
+
+    with assert_raises(ValueError):
+        np.nditer((mask, arr), flags=itflags, op_flags=[mask_flags, a_flags],
+                  op_axes=op_axes)
+
+
 def test_iter_writemasked_decref():
     # force casting (to make it interesting) by using a structured dtype.
     arr = np.arange(10000).astype(">i,O")
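For context on the semantics the new test protects: the arraymask is only enforced at buffer copy-back time, so masking an output visibly requires buffering plus a cast. A short usage sketch in the spirit of the existing test_iter_writemasked (values chosen for illustration):

    import numpy as np

    a = np.array([1.0, 1.0, 1.0])
    mask = np.array([True, True, False])

    # Buffering plus the f8 -> i8 cast forces a copy back from the buffer,
    # and only elements selected by the arraymask are written to ``a``.
    it = np.nditer([a, mask], flags=["buffered"],
                   op_flags=[["readwrite", "writemasked"],
                             ["readonly", "arraymask"]],
                   op_dtypes=["i8", None], casting="unsafe")
    with it:
        for x, m in it:
            x[...] = 3

    print(a)    # [3. 3. 1.] -- the unmasked element is left untouched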
