12 files changed, 385 insertions, 68 deletions
diff --git a/doc/release/2.0.0-notes.rst b/doc/release/2.0.0-notes.rst
index 3f4b62096..5cabf245c 100644
--- a/doc/release/2.0.0-notes.rst
+++ b/doc/release/2.0.0-notes.rst
@@ -44,6 +44,8 @@ What doesn't work with NA:
     * UFunc.accumulate, UFunc.reduceat.
     * np.logical_and, np.logical_or, np.all, and np.any don't satisfy the
       rules NA | True == True and NA & False == False yet.
+    * The reduction methods like <ndarray>.sum, etc. do not yet support
+      the new extended axis=, skipna= and keepdims= parameters.
     * Array methods:
        + ndarray.argmax, ndarray.argmin,
        + numpy.repeat, numpy.delete (relies on fancy indexing),
diff --git a/numpy/core/code_generators/numpy_api.py b/numpy/core/code_generators/numpy_api.py
index 042688fc2..76e006c65 100644
--- a/numpy/core/code_generators/numpy_api.py
+++ b/numpy/core/code_generators/numpy_api.py
@@ -318,24 +318,25 @@ multiarray_funcs_api = {
     'PyArray_ConvertClipmodeSequence':      279,
     'PyArray_MatrixProduct2':               280,
     # End 1.6 API
-    'PyArray_SetBaseObject':                282,
-    'PyArray_HasNASupport':                 283,
-    'PyArray_ContainsNA':                   284,
-    'PyArray_AllocateMaskNA':               285,
-    'NpyIter_GetFirstMaskNAOp':             286,
-    'NpyIter_GetMaskNAIndexArray':          287,
-    'PyArray_ReduceMaskNAArray':            288,
-    'PyArray_CreateSortedStridePerm':       289,
-    'PyArray_AssignZero':                   290,
-    'PyArray_AssignOne':                    291,
-    'PyArray_AssignNA':                     292,
-    'PyArray_AssignMaskNA':                 293,
-    'PyArray_AssignRawScalar':              294,
-    'PyArray_AssignArray':                  295,
-    'PyArray_CreateReduceResult':           296,
-    'PyArray_InitializeReduceResult':       297,
-    'PyArray_RemoveAxesInPlace':            298,
-    'PyArray_DebugPrint':                   299,
+    'NpyIter_GetFirstMaskNAOp':             282,
+    'NpyIter_GetMaskNAIndexArray':          283,
+    'NpyIter_IsFirstVisit':                 284,
+    'PyArray_SetBaseObject':                285,
+    'PyArray_HasNASupport':                 286,
+    'PyArray_ContainsNA':                   287,
+    'PyArray_AllocateMaskNA':               288,
+    'PyArray_ReduceMaskNAArray':            289,
+    'PyArray_CreateSortedStridePerm':       290,
+    'PyArray_AssignZero':                   291,
+    'PyArray_AssignOne':                    292,
+    'PyArray_AssignNA':                     293,
+    'PyArray_AssignMaskNA':                 294,
+    'PyArray_AssignRawScalar':              295,
+    'PyArray_AssignArray':                  296,
+    'PyArray_CreateReduceResult':           297,
+    'PyArray_InitializeReduceResult':       298,
+    'PyArray_RemoveAxesInPlace':            299,
+    'PyArray_DebugPrint':                   300,
 }
 
 ufunc_types_api = {
diff --git a/numpy/core/src/multiarray/array_assign_array.c b/numpy/core/src/multiarray/array_assign_array.c
index db4bdbf52..6734d845a 100644
--- a/numpy/core/src/multiarray/array_assign_array.c
+++ b/numpy/core/src/multiarray/array_assign_array.c
@@ -483,7 +483,7 @@ PyArray_AssignArray(PyArrayObject *dst, PyArrayObject *src,
         /* TODO: add 'wheremask' as a parameter to ContainsNA */
         if (PyArray_ContainsNA(src)) {
             PyErr_SetString(PyExc_ValueError,
-                    "Cannot assign NA value to an array which "
+                    "Cannot assign NA to an array which "
                     "does not support NAs");
             goto fail;
         }
@@ -659,7 +659,7 @@ PyArray_AssignArray(PyArrayObject *dst, PyArrayObject *src,
         if (PyArray_ContainsNA(wheremask)) {
             if (!dst_has_maskna) {
                 PyErr_SetString(PyExc_ValueError,
-                        "Cannot assign NA value to an array which "
+                        "Cannot assign NA to an array which "
                         "does not support NAs");
                 goto fail;
             }
diff --git a/numpy/core/src/multiarray/array_assign_scalar.c b/numpy/core/src/multiarray/array_assign_scalar.c
index c81d75a0b..9200c856e 100644
--- a/numpy/core/src/multiarray/array_assign_scalar.c
+++ b/numpy/core/src/multiarray/array_assign_scalar.c
@@ -435,7 +435,7 @@ PyArray_AssignRawScalar(PyArrayObject *dst,
         if (PyArray_ContainsNA(wheremask)) {
             if (!dst_has_maskna) {
                 PyErr_SetString(PyExc_ValueError,
-                        "Cannot assign NA value to an array which "
+                        "Cannot assign NA to an array which "
                         "does not support NAs");
                 goto fail;
             }
diff --git a/numpy/core/src/multiarray/ctors.c b/numpy/core/src/multiarray/ctors.c
index 186da0b9c..03c035615 100644
--- a/numpy/core/src/multiarray/ctors.c
+++ b/numpy/core/src/multiarray/ctors.c
@@ -2672,7 +2672,7 @@ PyArray_CopyAsFlat(PyArrayObject *dst, PyArrayObject *src, NPY_ORDER order)
         else {
             if (PyArray_ContainsNA(src)) {
                 PyErr_SetString(PyExc_ValueError,
-                        "Cannot assign NA value to an array which "
+                        "Cannot assign NA to an array which "
                         "does not support NAs");
                 return -1;
             }
diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c
index ec0b1aa46..5b03fdb0c 100644
--- a/numpy/core/src/multiarray/item_selection.c
+++ b/numpy/core/src/multiarray/item_selection.c
@@ -125,7 +125,7 @@ PyArray_TakeFrom(PyArrayObject *self0, PyObject *indices0, int axis,
             }
             else if (PyArray_ContainsNA(self)) {
                 PyErr_SetString(PyExc_ValueError,
-                        "Cannot assign NA value to an array which "
+                        "Cannot assign NA to an array which "
                         "does not support NAs");
                 goto fail;
             }
@@ -1898,13 +1898,14 @@ assign_reduce_unit_zero(PyArrayObject *result, int preservena, void *data)
     return PyArray_AssignZero(result, NULL, preservena, NULL);
 }
 
-static void
+static int
 reduce_count_nonzero_inner_loop(NpyIter *iter,
                                             char **dataptr,
                                             npy_intp *strideptr,
                                             npy_intp *countptr,
                                             NpyIter_IterNextFunc *iternext,
                                             int needs_api,
+                                            npy_intp skip_first_count,
                                             void *data)
 {
     PyArray_NonzeroFunc *nonzero = (PyArray_NonzeroFunc *)data;
@@ -1933,15 +1934,18 @@ reduce_count_nonzero_inner_loop(NpyIter *iter,
     if (!needs_api) {
         NPY_END_THREADS;
     }
+
+    return (needs_api && PyErr_Occurred()) ? -1 : 0;
 }
 
-static void
+static int
 reduce_count_nonzero_masked_inner_loop(NpyIter *iter,
                                             char **dataptr,
                                             npy_intp *strideptr,
                                             npy_intp *countptr,
                                             NpyIter_IterNextFunc *iternext,
                                             int needs_api,
+                                            npy_intp skip_first_count,
                                             void *data)
 {
     PyArray_NonzeroFunc *nonzero = (PyArray_NonzeroFunc *)data;
@@ -1973,6 +1977,8 @@ reduce_count_nonzero_masked_inner_loop(NpyIter *iter,
     if (!needs_api) {
         NPY_END_THREADS;
     }
+
+    return (needs_api && PyErr_Occurred()) ? -1 : 0;
 }
 
 
diff --git a/numpy/core/src/multiarray/mapping.c b/numpy/core/src/multiarray/mapping.c
index aea41fa86..b1dd6b8bb 100644
--- a/numpy/core/src/multiarray/mapping.c
+++ b/numpy/core/src/multiarray/mapping.c
@@ -996,7 +996,7 @@ array_ass_boolean_subscript(PyArrayObject *self,
     if (v_has_maskna && !self_has_maskna) {
         if (PyArray_ContainsNA(v)) {
             PyErr_SetString(PyExc_ValueError,
-                    "Cannot assign NA value to an array which "
+                    "Cannot assign NA to an array which "
                     "does not support NAs");
             return -1;
         }
diff --git a/numpy/core/src/multiarray/nditer_api.c b/numpy/core/src/multiarray/nditer_api.c
index 6db3279d6..2343461cb 100644
--- a/numpy/core/src/multiarray/nditer_api.c
+++ b/numpy/core/src/multiarray/nditer_api.c
@@ -693,6 +693,81 @@ NpyIter_HasIndex(NpyIter *iter)
 }
 
 /*NUMPY_API
+ * Checks to see whether this is the first time the elements
+ * of the specified reduction operand which the iterator points at are
+ * being seen for the first time. The function returns
+ * a reasonable answer for reduction operands and when buffering is
+ * disabled. The answer may be incorrect for buffered non-reduction
+ * operands.
+ *
+ * If this function returns true, the caller should also
+ * check the inner loop stride of the operand, because if
+ * that stride is 0, then only the first element of the innermost
+ * external loop is being visited for the first time.
+ *
+ * This function does not check that 'iop' is within bounds, because
+ * it is intended to be called within the iteration loop. If there
+ * is a good way to do the iteration without calling this function,
+ * that is preferable since this is a non-trivial property to check.
+ */
+NPY_NO_EXPORT npy_bool
+NpyIter_IsFirstVisit(NpyIter *iter, int iop)
+{
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    int idim, ndim = NIT_NDIM(iter);
+    int nop = NIT_NOP(iter);
+
+    NpyIter_AxisData *axisdata;
+    npy_intp sizeof_axisdata;
+
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
+    axisdata = NIT_AXISDATA(iter);
+
+    for (idim = 0; idim < ndim; ++idim) {
+        npy_intp coord = NAD_INDEX(axisdata);
+        npy_intp stride = NAD_STRIDES(axisdata)[iop];
+
+        /*
+         * If this is a reduction dimension and the coordinate
+         * is not at the start, it's definitely not the first visit
+         */
+        if (stride == 0 && coord != 0) {
+            return 0;
+        }
+
+        NIT_ADVANCE_AXISDATA(axisdata, 1);
+    }
+
+    /*
+     * In reduction buffering mode, there's a double loop being
+     * tracked in the buffer part of the iterator data structure.
+     * We need to check the two levels of that loop as well.
+     */
+    if (itflags&NPY_ITFLAG_BUFFER) {
+        NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter);
+        /* The outer reduce loop */
+        if (NBF_REDUCE_OUTERSTRIDES(bufferdata)[iop] == 0 &&
+                NBF_REDUCE_POS(bufferdata) != 0) {
+            return 0;
+        }
+        /*
+         * The inner reduce loop, when there's no external loop. This
+         * requires a more complicated check for the second "coord != 0"
+         * check, because this part of the loop is operating based on
+         * a global iterindex instead of a local coordinate like the rest.
+         */
+        if (!(itflags&NPY_ITFLAG_EXLOOP) &&
+                NBF_STRIDES(bufferdata)[iop] == 0 &&
+                NIT_ITERINDEX(iter) !=
+                        NBF_BUFITEREND(bufferdata) - NBF_SIZE(bufferdata)) {
+            return 0;
+        }
+    }
+
+    return 1;
+}
+
+/*NUMPY_API
  * Whether the iteration could be done with no buffering.
  */
 NPY_NO_EXPORT npy_bool
diff --git a/numpy/core/src/multiarray/reduction.c b/numpy/core/src/multiarray/reduction.c
index 78caa3674..4684085d1 100644
--- a/numpy/core/src/multiarray/reduction.c
+++ b/numpy/core/src/multiarray/reduction.c
@@ -288,6 +288,9 @@ PyArray_CreateReduceResult(PyArrayObject *operand, PyArrayObject *out,
  * skipna  : If True, indicates that the reduction is being calculated
  *           as if the NA values are being dropped from the computation
  *           instead of accumulating into an NA result.
+ * out_skip_first_count : This gets populated with the number of first-visit
+ *                        elements that should be skipped during the
+ *                        iteration loop.
  * funcname : The name of the reduction operation, for the purpose of
  *            better quality error messages. For example, "numpy.max"
  *            would be a good name for NumPy's max function.
@@ -298,14 +301,18 @@ PyArray_CreateReduceResult(PyArrayObject *operand, PyArrayObject *out,
 NPY_NO_EXPORT PyArrayObject *
 PyArray_InitializeReduceResult(
                     PyArrayObject *result, PyArrayObject *operand,
-                    npy_bool *axis_flags, int skipna, const char *funcname)
+                    npy_bool *axis_flags, int skipna,
+                    npy_intp *out_skip_first_count, const char *funcname)
 {
     npy_intp *strides, *shape, shape_orig[NPY_MAXDIMS], shape0;
     PyArrayObject *op_view = NULL;
-    int idim, ndim;
+    int idim, ndim, nreduce_axes;
 
     ndim = PyArray_NDIM(operand);
 
+    /* Default to no skipping first-visit elements in the iteration */
+    *out_skip_first_count = 0;
+
     /*
      * If 'skipna' is False, or 'operand' has no NA mask in which
      * case the 'skipna' flag does nothing.
@@ -389,13 +396,16 @@ PyArray_InitializeReduceResult(
      * then return a view to the rest.
      *
      * Adjust the shape to only look at the first element along
-     * any of the reduction axes.
+     * any of the reduction axes. We count the number of reduction axes
+     * at the same time.
      */
     shape = PyArray_SHAPE(op_view);
+    nreduce_axes = 0;
     memcpy(shape_orig, shape, ndim * sizeof(npy_intp));
     for (idim = 0; idim < ndim; ++idim) {
         if (axis_flags[idim]) {
             shape[idim] = 1;
+            ++nreduce_axes;
         }
     }
 
@@ -410,23 +420,39 @@ PyArray_InitializeReduceResult(
         return NULL;
     }
 
-    /* Adjust the shape to only look at the remaining elements */
-    strides = PyArray_STRIDES(op_view);
-    for (idim = 0; idim < ndim; ++idim) {
-        if (axis_flags[idim]) {
-            shape[idim] = shape_orig[idim] - 1;
-            ((PyArrayObject_fieldaccess *)op_view)->data += strides[idim];
-        }
-    }
-    if (PyArray_HASMASKNA(op_view)) {
-        strides = PyArray_MASKNA_STRIDES(op_view);
+    /*
+     * If there are zero or one reduction axes, adjust the view's
+     * shape to only look at the remaining elements
+     */
+    if (nreduce_axes <= 1) {
+        strides = PyArray_STRIDES(op_view);
         for (idim = 0; idim < ndim; ++idim) {
             if (axis_flags[idim]) {
-                ((PyArrayObject_fieldaccess *)op_view)->maskna_data +=
-                                                            strides[idim];
+                shape[idim] = shape_orig[idim] - 1;
+                ((PyArrayObject_fieldaccess *)op_view)->data += strides[idim];
+            }
+        }
+        if (PyArray_HASMASKNA(op_view)) {
+            strides = PyArray_MASKNA_STRIDES(op_view);
+            for (idim = 0; idim < ndim; ++idim) {
+                if (axis_flags[idim]) {
+                    ((PyArrayObject_fieldaccess *)op_view)->maskna_data +=
+                                                                strides[idim];
+                }
             }
         }
     }
+    /*
+     * Otherwise iterate over the whole operand, but tell the inner loop
+     * to skip the elements we already copied by setting the skip_first_count.
+     */
+    else {
+        *out_skip_first_count = PyArray_SIZE(result);
+
+        Py_DECREF(op_view);
+        Py_INCREF(operand);
+        op_view = operand;
+    }
 
     return op_view;
 }
@@ -465,6 +491,7 @@ PyArray_ReduceWrapper(PyArrayObject *operand, PyArrayObject *out,
 {
     int use_maskna;
     PyArrayObject *result = NULL, *op_view = NULL;
+    npy_intp skip_first_count = 0;
 
     /* Iterator parameters */
     NpyIter *iter = NULL;
@@ -481,7 +508,7 @@ PyArray_ReduceWrapper(PyArrayObject *operand, PyArrayObject *out,
     if (use_maskna && !skipna && out != NULL && !PyArray_HASMASKNA(out)) {
         if (PyArray_ContainsNA(operand)) {
             PyErr_SetString(PyExc_ValueError,
-                    "Cannot assign NA value to an array which "
+                    "Cannot assign NA to an array which "
                     "does not support NAs");
             goto fail;
         }
@@ -533,7 +560,7 @@ PyArray_ReduceWrapper(PyArrayObject *operand, PyArrayObject *out,
     }
     else {
         op_view = PyArray_InitializeReduceResult(result, operand,
-                            axis_flags, skipna, funcname);
+                            axis_flags, skipna, &skip_first_count, funcname);
         if (op_view == NULL) {
             Py_DECREF(result);
             return NULL;
@@ -623,8 +650,11 @@ PyArray_ReduceWrapper(PyArrayObject *operand, PyArrayObject *out,
                 goto fail;
             }
 
-            inner_loop(iter, dataptr, strideptr, countptr,
-                                                iternext, needs_api, data);
+            if (inner_loop(iter, dataptr, strideptr, countptr,
+                            iternext, needs_api, skip_first_count, data) < 0) {
+
+                goto fail;
+            }
         }
         /* Masked reduction */
         else {
@@ -635,12 +665,10 @@ PyArray_ReduceWrapper(PyArrayObject *operand, PyArrayObject *out,
                 goto fail;
             }
 
-            masked_inner_loop(iter, dataptr, strideptr, countptr,
-                                                iternext, needs_api, data);
-        }
-
-        if (PyErr_Occurred()) {
-            goto fail;
+            if (masked_inner_loop(iter, dataptr, strideptr, countptr,
+                            iternext, needs_api, skip_first_count, data) < 0) {
+                goto fail;
+            }
         }
     }
 
diff --git a/numpy/core/src/multiarray/reduction.h b/numpy/core/src/multiarray/reduction.h
index e2a9806c1..9de44de61 100644
--- a/numpy/core/src/multiarray/reduction.h
+++ b/numpy/core/src/multiarray/reduction.h
@@ -23,12 +23,57 @@ typedef int (PyArray_AssignReduceUnitFunc)(PyArrayObject *result,
  * the inner loop, such as when the iternext() function never calls
  * a function which could raise a Python exception.
  *
+ * Ths skip_first_count parameter indicates how many elements need to be
+ * skipped based on NpyIter_IsFirstVisit checks. This can only be positive
+ * when the 'assign_unit' parameter was NULL when calling
+ * PyArray_ReduceWrapper.
+ *
  * The unmasked inner loop gets two data pointers and two strides, and should
  * look roughly like this:
+ *  {
  *      NPY_BEGIN_THREADS_DEF;
  *      if (!needs_api) {
  *          NPY_BEGIN_THREADS;
  *      }
+ *      // This first-visit loop can be skipped if 'assign_unit' was non-NULL
+ *      if (skip_first_count > 0) {
+ *          do {
+ *              char *data0 = dataptr[0], *data1 = dataptr[1];
+ *              npy_intp stride0 = strideptr[0], stride1 = strideptr[1];
+ *              npy_intp count = *countptr;
+ *
+ *              // Skip any first-visit elements
+ *              if (NpyIter_IsFirstVisit(iter, 1)) {
+ *                  if (stride0 == 0) {
+ *                      --count;
+ *                      --skip_first_count;
+ *                      data1 += stride1;
+ *                      //data2 += stride2; // In masked loop
+ *                  }
+ *                  else {
+ *                      skip_first_count -= count;
+ *                      count = 0;
+ *                  }
+ *              }
+ *
+ *              while (count--) {
+ *                  *(result_t *)data0 = my_reduce_op(*(result_t *)data0,
+ *                                                    *(operand_t *)data1);
+ *                  data0 += stride0;
+ *                  data1 += stride1;
+ *              }
+ *
+ *              // Jump to the faster loop when skipping is done
+ *              if (skip_first_count == 0) {
+ *                  if (iternext(iter)) {
+ *                      break;
+ *                  }
+ *                  else {
+ *                      goto finish_loop;
+ *                  }
+ *              }
+ *          } while (iternext(iter));
+ *      }
  *      do {
  *          char *data0 = dataptr[0], *data1 = dataptr[1];
  *          npy_intp stride0 = strideptr[0], stride1 = strideptr[1];
@@ -41,22 +86,24 @@ typedef int (PyArray_AssignReduceUnitFunc)(PyArrayObject *result,
  *              data1 += stride1;
  *          }
  *      } while (iternext(iter));
+ *  finish_loop:
  *      if (!needs_api) {
  *          NPY_END_THREADS;
  *      }
+ *      return (needs_api && PyErr_Occurred()) ? -1 : 0;
+ *  }
  *
  * The masked inner loop gets three data pointers and three strides, and
- * should look roughly like this:
- *      NPY_BEGIN_THREADS_DEF;
- *      if (!needs_api) {
- *          NPY_BEGIN_THREADS;
- *      }
+ * looks identical except for the iteration inner loops which should be
+ * like this:
  *      do {
  *          char *data0 = dataptr[0], *data1 = dataptr[1], *data2 = dataptr[2];
  *          npy_intp stride0 = strideptr[0], stride1 = strideptr[1],
  *                      stride2 = strideptr[2];
  *          npy_intp count = *countptr;
  *
+ *          // Skipping first visits would go here
+ *
  *          while (count--) {
  *              if (NpyMaskValue_IsExposed((npy_mask)*data2)) {
  *                  *(result_t *)data0 = my_reduce_op(*(result_t *)data0,
@@ -66,21 +113,22 @@ typedef int (PyArray_AssignReduceUnitFunc)(PyArrayObject *result,
  *              data1 += stride1;
  *              data2 += stride2;
  *          }
+ *
+ *          // Jumping to the faster loop would go here
+ *
  *      } while (iternext(iter));
- *      if (!needs_api) {
- *          NPY_END_THREADS;
- *      }
  *
- * Once the inner loop is finished, PyArray_ReduceWrapper calls
- * PyErr_Occurred to check if any exception was raised during the
- * computation.
+ * If needs_api is True, this function should call PyErr_Occurred()
+ * to check if an error occurred during processing, and return -1 for
+ * error, 0 for success.
  */
-typedef void (PyArray_ReduceInnerLoopFunc)(NpyIter *iter,
+typedef int (PyArray_ReduceInnerLoopFunc)(NpyIter *iter,
                                             char **dataptr,
                                             npy_intp *strideptr,
                                             npy_intp *countptr,
                                             NpyIter_IterNextFunc *iternext,
                                             int needs_api,
+                                            npy_intp skip_first_count,
                                             void *data);
 
 /*
diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c
index 8c35207a4..a86c67c2d 100644
--- a/numpy/core/src/umath/ufunc_object.c
+++ b/numpy/core/src/umath/ufunc_object.c
@@ -2569,9 +2569,11 @@ get_masked_binary_op_function(PyUFuncObject *self, PyArrayObject *arr,
 static PyArrayObject *
 initialize_reduce_result(int identity, PyArrayObject *result,
                         npy_bool *axis_flags, PyArrayObject *arr,
-                        int skipna, char *ufunc_name)
+                        int skipna, npy_intp *out_skip_first_count,
+                        char *ufunc_name)
 {
     if (identity == PyUFunc_One) {
+        *out_skip_first_count = 0;
         if (PyArray_AssignOne(result, NULL, !skipna, NULL) < 0) {
             return NULL;
         }
@@ -2579,6 +2581,7 @@ initialize_reduce_result(int identity, PyArrayObject *result,
         return arr;
     }
     else if (identity == PyUFunc_Zero) {
+        *out_skip_first_count = 0;
         if (PyArray_AssignZero(result, NULL, !skipna, NULL) < 0) {
             return NULL;
         }
@@ -2586,8 +2589,8 @@ initialize_reduce_result(int identity, PyArrayObject *result,
         return arr;
     }
     else {
-        return PyArray_InitializeReduceResult(
-                        result, arr, axis_flags, skipna, ufunc_name);
+        return PyArray_InitializeReduceResult(result, arr, axis_flags,
+                                skipna, out_skip_first_count, ufunc_name);
     }
 }
 
@@ -2616,6 +2619,7 @@ PyUFunc_Reduce(PyUFuncObject *self, PyArrayObject *arr, PyArrayObject *out,
     PyArray_Descr *otype_dtype = NULL;
     npy_bool axis_flags[NPY_MAXDIMS];
     PyArrayObject *arr_view = NULL, *result = NULL;
+    npy_intp skip_first_count = 0;
 
     /* The normal selected inner loop */
     PyUFuncGenericFunction innerloop = NULL;
@@ -2786,7 +2790,8 @@ PyUFunc_Reduce(PyUFuncObject *self, PyArrayObject *arr, PyArrayObject *out,
      * all the elements to reduce into 'result'.
      */
     arr_view = initialize_reduce_result(self->identity, result,
-                                        axis_flags, arr, skipna, ufunc_name);
+                                        axis_flags, arr, skipna,
+                                        &skip_first_count, ufunc_name);
     if (arr_view == NULL) {
         goto fail;
     }
@@ -2871,6 +2876,44 @@ PyUFunc_Reduce(PyUFuncObject *self, PyArrayObject *arr, PyArrayObject *out,
 
         /* Straightforward reduction */
         if (!use_maskna) {
+            if (skip_first_count > 0) {
+                do {
+                    npy_intp count = *count_ptr;
+
+                    /* Skip any first-visit elements */
+                    if (NpyIter_IsFirstVisit(iter, 1)) {
+                        if (stride[0] == 0) {
+                            --count;
+                            --skip_first_count;
+                            dataptr[1] += stride[1];
+                        }
+                        else {
+                            skip_first_count -= count;
+                            count = 0;
+                        }
+                    }
+
+                    /* Turn the two items into three for the inner loop */
+                    dataptr_copy[0] = dataptr[0];
+                    dataptr_copy[1] = dataptr[1];
+                    dataptr_copy[2] = dataptr[0];
+                    stride_copy[0] = stride[0];
+                    stride_copy[1] = stride[1];
+                    stride_copy[2] = stride[0];
+                    innerloop(dataptr_copy, &count,
+                                stride_copy, innerloopdata);
+
+                    /* Jump to the faster loop when skipping is done */
+                    if (skip_first_count == 0) {
+                        if (iternext(iter)) {
+                            break;
+                        }
+                        else {
+                            goto finish_loop;
+                        }
+                    }
+                } while (iternext(iter));
+            }
             do {
                 /* Turn the two items into three for the inner loop */
                 dataptr_copy[0] = dataptr[0];
@@ -2885,6 +2928,49 @@ PyUFunc_Reduce(PyUFuncObject *self, PyArrayObject *arr, PyArrayObject *out,
         }
         /* Masked reduction */
         else {
+            if (skip_first_count > 0) {
+                do {
+                    npy_intp count = *count_ptr;
+
+                    /* Skip any first-visit elements */
+                    if (NpyIter_IsFirstVisit(iter, 1)) {
+                        if (stride[0] == 0) {
+                            --count;
+                            --skip_first_count;
+                            dataptr[1] += stride[1];
+                            dataptr[2] += stride[2];
+                        }
+                        else {
+                            skip_first_count -= count;
+                            count = 0;
+                        }
+                    }
+
+                    /* Turn the two items into three for the inner loop */
+                    dataptr_copy[0] = dataptr[0];
+                    dataptr_copy[1] = dataptr[1];
+                    dataptr_copy[2] = dataptr[0];
+                    stride_copy[0] = stride[0];
+                    stride_copy[1] = stride[1];
+                    stride_copy[2] = stride[0];
+                    /*
+                     * If skipna=True, this masks based on the mask in 'arr',
+                     * otherwise it masks based on the mask in 'result'
+                     */
+                    maskedinnerloop(dataptr_copy, dataptr[2], &count,
+                                stride_copy, stride[2], maskedinnerloopdata);
+
+                    /* Jump to the faster loop when skipping is done */
+                    if (skip_first_count == 0) {
+                        if (iternext(iter)) {
+                            break;
+                        }
+                        else {
+                            goto finish_loop;
+                        }
+                    }
+                } while (iternext(iter));
+            }
             do {
                 /* Turn the two items into three for the inner loop */
                 dataptr_copy[0] = dataptr[0];
@@ -2901,7 +2987,7 @@ PyUFunc_Reduce(PyUFuncObject *self, PyArrayObject *arr, PyArrayObject *out,
                             stride_copy, stride[2], maskedinnerloopdata);
             } while (iternext(iter));
         }
-
+finish_loop:
         if (!needs_api) {
             NPY_END_THREADS;
         }
diff --git a/numpy/core/tests/test_ufunc.py b/numpy/core/tests/test_ufunc.py
index be7dae8eb..2db3fa029 100644
--- a/numpy/core/tests/test_ufunc.py
+++ b/numpy/core/tests/test_ufunc.py
@@ -560,5 +560,76 @@ class TestUfunc(TestCase):
         np.add(a, b, out=c, where=[1,0,0,1,0,0,1,1,1,0])
         assert_equal(c, [2,1.5,1.5,2,1.5,1.5,2,2,2,1.5])
 
+    def check_unitless_reduction(self, a):
+        # np.minimum.reduce is a unitless reduction
+
+        # Verify that it sees the zero at various positions
+        a[...] = 1
+        a[1,0,0] = 0
+        assert_equal(np.minimum.reduce(a, axis=None), 0)
+        assert_equal(np.minimum.reduce(a, axis=(0,1)), [0,1,1,1])
+        assert_equal(np.minimum.reduce(a, axis=(0,2)), [0,1,1])
+        assert_equal(np.minimum.reduce(a, axis=(1,2)), [1,0])
+        assert_equal(np.minimum.reduce(a, axis=0),
+                                    [[0,1,1,1], [1,1,1,1], [1,1,1,1]])
+        assert_equal(np.minimum.reduce(a, axis=1),
+                                    [[1,1,1,1], [0,1,1,1]])
+        assert_equal(np.minimum.reduce(a, axis=2),
+                                    [[1,1,1], [0,1,1]])
+        assert_equal(np.minimum.reduce(a, axis=()), a)
+
+        a[...] = 1
+        a[0,1,0] = 0
+        assert_equal(np.minimum.reduce(a, axis=None), 0)
+        assert_equal(np.minimum.reduce(a, axis=(0,1)), [0,1,1,1])
+        assert_equal(np.minimum.reduce(a, axis=(0,2)), [1,0,1])
+        assert_equal(np.minimum.reduce(a, axis=(1,2)), [0,1])
+        assert_equal(np.minimum.reduce(a, axis=0),
+                                    [[1,1,1,1], [0,1,1,1], [1,1,1,1]])
+        assert_equal(np.minimum.reduce(a, axis=1),
+                                    [[0,1,1,1], [1,1,1,1]])
+        assert_equal(np.minimum.reduce(a, axis=2),
+                                    [[1,0,1], [1,1,1]])
+        assert_equal(np.minimum.reduce(a, axis=()), a)
+
+        a[...] = 1
+        a[0,0,1] = 0
+        assert_equal(np.minimum.reduce(a, axis=None), 0)
+        assert_equal(np.minimum.reduce(a, axis=(0,1)), [1,0,1,1])
+        assert_equal(np.minimum.reduce(a, axis=(0,2)), [0,1,1])
+        assert_equal(np.minimum.reduce(a, axis=(1,2)), [0,1])
+        assert_equal(np.minimum.reduce(a, axis=0),
+                                    [[1,0,1,1], [1,1,1,1], [1,1,1,1]])
+        assert_equal(np.minimum.reduce(a, axis=1),
+                                    [[1,0,1,1], [1,1,1,1]])
+        assert_equal(np.minimum.reduce(a, axis=2),
+                                    [[0,1,1], [1,1,1]])
+        assert_equal(np.minimum.reduce(a, axis=()), a)
+
+    def test_unitless_reduction_corder(self):
+        a = np.empty((2,3,4), order='C')
+        self.check_unitless_reduction(a)
+
+    def test_unitless_reduction_forder(self):
+        a = np.empty((2,3,4), order='F')
+        self.check_unitless_reduction(a)
+
+    def test_unitless_reduction_otherorder(self):
+        a = np.empty((2,4,3), order='C').swapaxes(1,2)
+        self.check_unitless_reduction(a)
+
+    def test_unitless_reduction_noncontig(self):
+        a = np.empty((3,5,4), order='C').swapaxes(1,2)
+        a = a[1:, 1:, 1:]
+        self.check_unitless_reduction(a)
+
+    def test_unitless_reduction_noncontig_unaligned(self):
+        a = np.empty((3*4*5*8 + 1,), dtype='i1')
+        a = a[1:].view(dtype='f8')
+        a.shape = (3,4,5)
+        a = a[1:, 1:, 1:]
+        self.check_unitless_reduction(a)
+
+
 if __name__ == "__main__":
     run_module_suite()