-rw-r--r--  doc/source/f2py/getting-started.rst    9
-rw-r--r--  numpy/core/fromnumeric.py              4
-rw-r--r--  numpy/core/src/umath/simd.inc.src     54
-rw-r--r--  numpy/core/tests/test_numeric.py       5
-rw-r--r--  numpy/core/tests/test_umath.py         8
-rw-r--r--  numpy/fft/_pocketfft.py                3
-rw-r--r--  numpy/lib/shape_base.py                4
7 files changed, 63 insertions, 24 deletions
diff --git a/doc/source/f2py/getting-started.rst b/doc/source/f2py/getting-started.rst
index c600eee01..27ddbb005 100644
--- a/doc/source/f2py/getting-started.rst
+++ b/doc/source/f2py/getting-started.rst
@@ -29,13 +29,12 @@ either by just in one command or step-by-step, some steps can be
omitted or combined with others.
Below I'll describe three typical approaches of using F2PY.
-The following `example Fortran 77 code`__ will be used for
-illustration:
+The following example Fortran 77 code will be used for
+illustration; save it as ``fib1.f``:
.. include:: fib1.f
:literal:
-__ fib1.f
The quick way
==============
@@ -242,14 +241,12 @@ directive defines special comment lines (starting with ``Cf2py``, for
example) which are ignored by Fortran compilers but F2PY interprets
them as normal lines.
-Here is shown a `modified version of the example Fortran code`__, saved
+Below is a modified version of the previous Fortran code; save it
as ``fib3.f``:
.. include:: fib3.f
:literal:
-__ fib3.f
-
Building the extension module can now be carried out in one command::
python -m numpy.f2py -c -m fib3 fib3.f
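
For reference, a minimal sketch of calling the resulting extension module from Python. This assumes the conventional ``fib`` example from the F2PY documentation, where the ``Cf2py intent`` directives turn the result array into a return value; the exact call depends on the directives actually used in ``fib3.f``::

    import fib3            # extension module built by the command above

    # fib(n) returns the first n Fibonacci numbers as a float64 array.
    a = fib3.fib(8)
    print(a)               # [ 0.  1.  1.  2.  3.  5.  8. 13.]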
diff --git a/numpy/core/fromnumeric.py b/numpy/core/fromnumeric.py
index ab45ddfe8..acd2d2bea 100644
--- a/numpy/core/fromnumeric.py
+++ b/numpy/core/fromnumeric.py
@@ -2039,8 +2039,8 @@ def clip(a, a_min, a_max, out=None, **kwargs):
is specified, values smaller than 0 become 0, and values larger
than 1 become 1.
- Equivalent to but faster than ``np.maximum(a_min, np.minimum(a, a_max))``
- assuming ``a_min < a_max``.
+ Equivalent to but faster than ``np.minimum(a_max, np.maximum(a, a_min))``.
+
No check is performed to ensure ``a_min < a_max``.
Parameters
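
As a quick illustration of the corrected equivalence stated in the docstring above (a minimal sketch, not part of the patch):

    import numpy as np

    a = np.arange(10)
    # clip applies the lower bound first, then the upper bound,
    # i.e. minimum(a_max, maximum(a, a_min)).
    assert np.array_equal(np.clip(a, 3, 7),
                          np.minimum(7, np.maximum(a, 3)))
    # Since no check enforces a_min < a_max, swapped bounds simply follow
    # the same order and every element ends up equal to a_max:
    print(np.clip(a, 8, 2))   # [2 2 2 2 2 2 2 2 2 2]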
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 7ec90f9c8..4265476b5 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -55,6 +55,37 @@ abs_ptrdiff(char *a, char *b)
return (a > b) ? (a - b) : (b - a);
}
+/*
+ * nomemoverlap - returns true if the two strided arrays have no overlapping
+ * region in memory. ip_size/op_size = byte size of each array; a negative
+ * value indicates the array is traversed with a negative step.
+ */
+static NPY_INLINE npy_bool
+nomemoverlap(char *ip,
+ npy_intp ip_size,
+ char *op,
+ npy_intp op_size)
+{
+ char *ip_start, *ip_end, *op_start, *op_end;
+ if (ip_size < 0) {
+ ip_start = ip + ip_size;
+ ip_end = ip;
+ }
+ else {
+ ip_start = ip;
+ ip_end = ip + ip_size;
+ }
+ if (op_size < 0) {
+ op_start = op + op_size;
+ op_end = op;
+ }
+ else {
+ op_start = op;
+ op_end = op + op_size;
+ }
+ return (ip_start > op_end) | (op_start > ip_end);
+}
+
#define IS_BINARY_STRIDE_ONE(esize, vsize) \
((steps[0] == esize) && \
(steps[1] == esize) && \
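
The interval test implemented by ``nomemoverlap`` above can be sketched in Python (illustrative only; the real function works on raw byte pointers and signed byte sizes):

    def nomemoverlap(ip, ip_size, op, op_size):
        # Normalize each (base, size) pair to an ordered byte interval;
        # a negative size means the array is traversed with a negative step.
        ip_start, ip_end = (ip + ip_size, ip) if ip_size < 0 else (ip, ip + ip_size)
        op_start, op_end = (op + op_size, op) if op_size < 0 else (op, op + op_size)
        # Disjoint exactly when one interval lies entirely past the other.
        return ip_start > op_end or op_start > ip_end

    # Two 40-byte regions whose bases are 8 bytes apart do overlap:
    assert not nomemoverlap(0, 40, 8, 40)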
@@ -83,22 +114,25 @@ abs_ptrdiff(char *a, char *b)
* cross page boundaries.
*
* We instead rely on i32gather/scatter_ps instructions which use a 32-bit index
- * element. The index needs to be < INT_MAX to avoid overflow. MAX_STEP_SIZE ensures this.
+ * element. The index needs to be < INT_MAX to avoid overflow. MAX_STEP_SIZE
+ * ensures this. The condition also requires that the input and output arrays
+ * have no overlap in memory.
*/
-#define IS_BINARY_SMALL_STEPS \
+#define IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP \
((abs(steps[0]) < MAX_STEP_SIZE) && \
(abs(steps[1]) < MAX_STEP_SIZE) && \
- (abs(steps[2]) < MAX_STEP_SIZE))
+ (abs(steps[2]) < MAX_STEP_SIZE) && \
+ (nomemoverlap(args[0], steps[0] * dimensions[0], args[2], steps[2] * dimensions[0])) && \
+ (nomemoverlap(args[1], steps[1] * dimensions[0], args[2], steps[2] * dimensions[0])))
/*
- * output should be contiguous, can handle strided input data
- * Input step should be smaller than MAX_STEP_SIZE for performance
+ * 1) Output should be contiguous, can handle strided input data
+ * 2) Input step should be smaller than MAX_STEP_SIZE for performance
+ * 3) Input and output arrays should have no overlap in memory
*/
#define IS_OUTPUT_BLOCKABLE_UNARY(esize, vsize) \
(steps[1] == (esize) && abs(steps[0]) < MAX_STEP_SIZE && \
- (npy_is_aligned(args[0], esize) && npy_is_aligned(args[1], esize)) && \
- ((abs_ptrdiff(args[1], args[0]) >= (vsize)) || \
- ((abs_ptrdiff(args[1], args[0]) == 0))))
+ (nomemoverlap(args[1], steps[1] * dimensions[0], args[0], steps[0] * dimensions[0])))
#define IS_BLOCKABLE_REDUCE(esize, vsize) \
(steps[1] == (esize) && abs_ptrdiff(args[1], args[0]) >= (vsize) && \
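
The ``steps[i] * dimensions[0]`` arguments above are the total byte extents each operand covers over the inner loop; a small self-contained sketch (hypothetical values) of how an aliased output fails the check:

    n = 5          # dimensions[0]
    itemsize = 8   # e.g. np.float64

    # In-place operation: an input and the output share the same buffer.
    in_base = out_base = 1000
    in_extent = out_extent = itemsize * n   # steps[i] * dimensions[0]

    # Disjointness test from nomemoverlap(): true only if one region
    # starts past the end of the other.
    disjoint = (in_base > out_base + out_extent) or (out_base > in_base + in_extent)
    print(disjoint)   # False, so the AVX-512 gather/scatter path is skipped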
@@ -253,7 +287,7 @@ static NPY_INLINE int
run_binary_avx512f_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
- if (IS_BINARY_SMALL_STEPS) {
+ if (IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP) {
AVX512F_@func@_@TYPE@(args, dimensions, steps);
return 1;
}
@@ -1943,7 +1977,7 @@ AVX512F_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *s
/*
* Note: while generally indices are npy_intp, we ensure that our maximum index
* will fit in an int32 as a precondition for this function via
- * IS_BINARY_SMALL_STEPS
+ * IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP
*/
npy_int32 index_ip1[@num_lanes@], index_ip2[@num_lanes@], index_op[@num_lanes@];
diff --git a/numpy/core/tests/test_numeric.py b/numpy/core/tests/test_numeric.py
index 135acc51d..05f59d9dc 100644
--- a/numpy/core/tests/test_numeric.py
+++ b/numpy/core/tests/test_numeric.py
@@ -2000,12 +2000,12 @@ class TestClip:
np.array(np.nan),
np.zeros(10, dtype=np.int32)),
])
+ @pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_clip_scalar_nan_propagation(self, arr, amin, amax):
# enforcement of scalar nan propagation for comparisons
# called through clip()
expected = np.minimum(np.maximum(arr, amin), amax)
- with assert_warns(DeprecationWarning):
- actual = np.clip(arr, amin, amax)
+ actual = np.clip(arr, amin, amax)
assert_equal(actual, expected)
@pytest.mark.xfail(reason="propagation doesn't match spec")
@@ -2014,6 +2014,7 @@ class TestClip:
np.timedelta64('NaT'),
np.zeros(10, dtype=np.int32)),
])
+ @pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_NaT_propagation(self, arr, amin, amax):
# NOTE: the expected function spec doesn't
# propagate NaT, but clip() now does
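
A minimal sketch of the scalar-NaN propagation that the first test above pins down (values are illustrative; the broad DeprecationWarning filter covers version-dependent warnings raised by these comparisons):

    import numpy as np

    arr = np.zeros(10, dtype=np.int32)
    # With a NaN bound, clip follows minimum(maximum(arr, amin), amax),
    # so the NaN propagates and the result is an all-NaN float array.
    expected = np.minimum(np.maximum(arr, -1), np.nan)
    actual = np.clip(arr, -1, np.nan)
    np.testing.assert_equal(actual, expected)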
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index d1d4467d6..233a0b1d6 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -3157,6 +3157,14 @@ def test_rint_big_int():
# Rint should not change the value
assert_equal(val, np.rint(val))
+@pytest.mark.parametrize('ftype', [np.float32, np.float64])
+def test_memoverlap_accumulate(ftype):
+ # Reproduces bug https://github.com/numpy/numpy/issues/15597
+ arr = np.array([0.61, 0.60, 0.77, 0.41, 0.19], dtype=ftype)
+ out_max = np.array([0.61, 0.61, 0.77, 0.77, 0.77], dtype=ftype)
+ out_min = np.array([0.61, 0.60, 0.60, 0.41, 0.19], dtype=ftype)
+ assert_equal(np.maximum.accumulate(arr), out_max)
+ assert_equal(np.minimum.accumulate(arr), out_min)
def test_signaling_nan_exceptions():
with assert_no_warnings():
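
The underlying failure mode is in-place accumulation: in the ufunc machinery, ``accumulate`` feeds the running result back in as one operand of the binary inner loop, so one input and the output overlap in memory and the new ``nomemoverlap`` check must route such calls to the scalar loop. A minimal sketch of the expected results, mirroring the test arrays above:

    import numpy as np

    arr = np.array([0.61, 0.60, 0.77, 0.41, 0.19], dtype=np.float32)
    print(np.maximum.accumulate(arr))   # [0.61 0.61 0.77 0.77 0.77]
    print(np.minimum.accumulate(arr))   # [0.61 0.6  0.6  0.41 0.19]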
diff --git a/numpy/fft/_pocketfft.py b/numpy/fft/_pocketfft.py
index f2510a6c2..3eab242e5 100644
--- a/numpy/fft/_pocketfft.py
+++ b/numpy/fft/_pocketfft.py
@@ -59,12 +59,11 @@ def _raw_fft(a, n, axis, is_real, is_forward, inv_norm):
if a.shape[axis] != n:
s = list(a.shape)
+ index = [slice(None)]*len(s)
if s[axis] > n:
- index = [slice(None)]*len(s)
index[axis] = slice(0, n)
a = a[tuple(index)]
else:
- index = [slice(None)]*len(s)
index[axis] = slice(0, s[axis])
s[axis] = n
z = zeros(s, a.dtype.char)
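
The hoisted ``index`` list drives a simple trim-or-pad of the transform axis to length ``n``; a standalone sketch of that logic (hypothetical helper name, NumPy-only):

    import numpy as np

    def _resize_axis(a, n, axis):
        # Build a full-slice index once, then narrow it along `axis`.
        index = [slice(None)] * a.ndim
        if a.shape[axis] > n:
            # Too long: keep only the first n entries along `axis`.
            index[axis] = slice(0, n)
            return a[tuple(index)]
        # Too short: copy into a zero-padded array of the target shape.
        s = list(a.shape)
        index[axis] = slice(0, a.shape[axis])
        s[axis] = n
        z = np.zeros(s, a.dtype.char)
        z[tuple(index)] = a
        return z

    x = np.arange(3.0)
    print(_resize_axis(x, 5, 0))   # [0. 1. 2. 0. 0.]
    print(_resize_axis(x, 2, 0))   # [0. 1.]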
diff --git a/numpy/lib/shape_base.py b/numpy/lib/shape_base.py
index 7634af010..b7f1f16f2 100644
--- a/numpy/lib/shape_base.py
+++ b/numpy/lib/shape_base.py
@@ -269,8 +269,8 @@ def apply_along_axis(func1d, axis, arr, *args, **kwargs):
"""
Apply a function to 1-D slices along the given axis.
- Execute `func1d(a, *args)` where `func1d` operates on 1-D arrays and `a`
- is a 1-D slice of `arr` along `axis`.
+ Execute `func1d(a, *args, **kwargs)` where `func1d` operates on 1-D arrays
+ and `a` is a 1-D slice of `arr` along `axis`.
This is equivalent to (but faster than) the following use of `ndindex` and
`s_`, which sets each of ``ii``, ``jj``, and ``kk`` to a tuple of indices::
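
A minimal usage sketch of the extended signature documented above, showing positional and keyword arguments being forwarded to ``func1d`` (hypothetical function and data, separate from the docstring's ``ndindex`` expansion):

    import numpy as np

    def clipped_sum(v, lo, hi=None):
        # v is a 1-D slice of the input; lo and hi come from the outer call.
        return np.clip(v, lo, hi).sum()

    b = np.arange(12).reshape(3, 4)
    # lo is passed positionally, hi as a keyword, exactly as
    # func1d(a, *args, **kwargs) describes.
    print(np.apply_along_axis(clipped_sum, 1, b, 2, hi=8))   # [ 9 22 32]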