author     Sturla Molden <sturla@molden.no>            2014-10-27 07:45:57 +0100
committer  Charles Harris <charlesr.harris@gmail.com>  2015-05-03 12:58:26 -0600
commit     cf3077696dd76d88e77173546a0d969eda3a7aa0 (patch)
tree       4880f6a00c5a3b16ad3bbb575b8338d8a47a442f
parent     3782f7e0bfd9ac230b672a1295df580a40e9e109 (diff)
download   numpy-cf3077696dd76d88e77173546a0d969eda3a7aa0.tar.gz
BUG: Workaround segfault in Apple Accelerate framework SGEMV
SGEMV in Accelerate framework will segfault on MacOS X version 10.9
(aka Mavericks) if arrays are not aligned to 32-byte boundaries and the
CPU supports AVX instructions. This can produce segfaults in np.dot.
This patch overshadows the symbols cblas_sgemv, sgemv_ and sgemv
exported by Accelerate to produce the correct behavior. The MacOS X
version and CPU specs are checked on module import. If Mavericks and AVX
are detected, the call to SGEMV is emulated with a call to SGEMM if the
arrays are not 32-byte aligned. If the exported symbols cannot be
overshadowed on module import, a fatal error is produced and the process
aborts. All the fixes are in a self-contained C file and do not alter
the multiarray C code. The patch is not applied unless NumPy is
configured to link with Apple's Accelerate framework.
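For readers who want to see what "not 32-byte aligned" means in practice, the condition is observable from pure Python via the array's data pointer. The sketch below is illustrative only and is not part of the patch; the helper name offset_array and its parameters are made up here, but the allocation trick mirrors the aligned_array helper in the test added by this commit:

import numpy as np

def offset_array(shape, align=32, offset=4, dtype=np.float32):
    # Hypothetical helper: allocate extra bytes, then slice so the data
    # pointer sits `offset` bytes past an `align`-byte boundary.
    nbytes = int(np.prod(shape)) * np.dtype(dtype).itemsize
    buf = np.zeros(nbytes + align + offset, dtype=np.uint8)
    addr = buf.__array_interface__["data"][0]
    shift = (-addr) % align + offset   # first position that is misaligned
    return buf[shift:shift + nbytes].view(dtype).reshape(shape)

A = offset_array((100, 100))
x = offset_array((100,))
print(A.__array_interface__["data"][0] % 32)  # nonzero: misaligned
# On an AVX-capable Mac running 10.9 with an unpatched Accelerate,
# this np.dot could segfault; with the workaround it must not.
np.dot(A, x)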
-rw-r--r--  numpy/build_utils/apple_accelerate.py    |  21
-rw-r--r--  numpy/build_utils/src/apple_sgemv_fix.c  | 229
-rw-r--r--  numpy/core/setup.py                      |   3
-rw-r--r--  numpy/core/tests/test_multiarray.py      |  67
4 files changed, 320 insertions, 0 deletions
diff --git a/numpy/build_utils/apple_accelerate.py b/numpy/build_utils/apple_accelerate.py
new file mode 100644
index 000000000..d7351f4c5
--- /dev/null
+++ b/numpy/build_utils/apple_accelerate.py
@@ -0,0 +1,21 @@
+import os
+import sys
+import re
+
+__all__ = ['uses_accelerate_framework', 'get_sgemv_fix']
+
+def uses_accelerate_framework(info):
+    """ Returns True if Accelerate framework is used for BLAS/LAPACK """
+    if sys.platform != "darwin":
+        return False
+    r_accelerate = re.compile("Accelerate")
+    extra_link_args = info.get('extra_link_args', '')
+    for arg in extra_link_args:
+        if r_accelerate.search(arg):
+            return True
+    return False
+
+def get_sgemv_fix():
+    """ Returns source file needed to correct SGEMV """
+    path = os.path.abspath(os.path.dirname(__file__))
+    return [os.path.join(path, 'src', 'apple_sgemv_fix.c')]
diff --git a/numpy/build_utils/src/apple_sgemv_fix.c b/numpy/build_utils/src/apple_sgemv_fix.c
new file mode 100644
index 000000000..558343477
--- /dev/null
+++ b/numpy/build_utils/src/apple_sgemv_fix.c
@@ -0,0 +1,229 @@
+/* This is a collection of ugly hacks to circumvent a bug in
+ * Apple Accelerate framework's SGEMV subroutine.
+ *
+ * See: https://github.com/numpy/numpy/issues/4007
+ *
+ * SGEMV in Accelerate framework will segfault on MacOS X version 10.9
+ * (aka Mavericks) if arrays are not aligned to 32 byte boundaries
+ * and the CPU supports AVX instructions. This can produce segfaults
+ * in np.dot.
+ *
+ * This patch overshadows the symbols cblas_sgemv, sgemv_ and sgemv
+ * exported by Accelerate to produce the correct behavior. The MacOS X
+ * version and CPU specs are checked on module import. If Mavericks and
+ * AVX are detected the call to SGEMV is emulated with a call to SGEMM
+ * if the arrays are not 32 byte aligned. If the exported symbols cannot
+ * be overshadowed on module import, a fatal error is produced and the
+ * process aborts. All the fixes are in a self-contained C file
+ * and do not alter the multiarray C code. The patch is not applied
+ * unless NumPy is configured to link with Apple's Accelerate
+ * framework.
+ *
+ */
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#include "Python.h"
+#include "numpy/arrayobject.h"
+
+#include <string.h>
+#include <dlfcn.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+/* ----------------------------------------------------------------- */
+/* Original cblas_sgemv */
+
+#define VECLIB_FILE "/System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/vecLib"
+
+enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102};
+enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113};
+extern void cblas_xerbla(int info, const char *rout, const char *form, ...);
+
+typedef void cblas_sgemv_t(const enum CBLAS_ORDER order,
+                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+                 const float alpha, const float *A, const int lda,
+                 const float *X, const int incX,
+                 const float beta, float *Y, const int incY);
+
+typedef void cblas_sgemm_t(const enum CBLAS_ORDER order,
+                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
+                 const int M, const int N, const int K,
+                 const float alpha, const float *A, const int lda,
+                 const float *B, const int ldb,
+                 const float beta, float *C, const int incC);
+
+typedef void fortran_sgemv_t( const char* trans, const int* m, const int* n,
+             const float* alpha, const float* A, const int* ldA,
+             const float* X, const int* incX,
+             const float* beta, float* Y, const int* incY );
+
+static void *veclib = NULL;
+static cblas_sgemv_t *accelerate_cblas_sgemv = NULL;
+static cblas_sgemm_t *accelerate_cblas_sgemm = NULL;
+static fortran_sgemv_t *accelerate_sgemv = NULL;
+static int AVX_and_10_9 = 0;
+
+/* Dynamic check for AVX support
+ * __builtin_cpu_supports("avx") is available in gcc 4.8,
+ * but clang and icc do not currently support it. */
+#define cpu_supports_avx()\
+(system("sysctl -n machdep.cpu.features | grep -q AVX") == 0)
+
+/* Check if we are using MacOS X version 10.9 */
+#define using_mavericks()\
+(system("sw_vers -productVersion | grep -q 10\\.9\\.") == 0)
+
+__attribute__((destructor))
+static void unloadlib(void)
+{
+    if (veclib) dlclose(veclib);
+}
+
+__attribute__((constructor))
+static void loadlib()
+/* automatically executed on module import */
+{
+    char errormsg[1024];
+    int AVX, MAVERICKS;
+    memset((void*)errormsg, 0, sizeof(errormsg));
+    /* check if the CPU supports AVX */
+    AVX = cpu_supports_avx();
+    /* check if the OS is MacOS X Mavericks */
+    MAVERICKS = using_mavericks();
+    /* we need the workaround when the CPU supports
+     * AVX and the OS version is Mavericks */
+    AVX_and_10_9 = AVX && MAVERICKS;
+    /* load vecLib */
+    veclib = dlopen(VECLIB_FILE, RTLD_LOCAL | RTLD_FIRST);
+    if (!veclib) {
+        veclib = NULL;
+        sprintf(errormsg,"Failed to open vecLib from location '%s'.", VECLIB_FILE);
+        Py_FatalError(errormsg); /* calls abort() and dumps core */
+    }
+    /* resolve Fortran SGEMV from Accelerate */
+    accelerate_sgemv = (fortran_sgemv_t*) dlsym(veclib, "sgemv_");
+    if (!accelerate_sgemv) {
+        unloadlib();
+        sprintf(errormsg,"Failed to resolve symbol 'sgemv_'.");
+        Py_FatalError(errormsg);
+    }
+    /* resolve cblas_sgemv from Accelerate */
+    accelerate_cblas_sgemv = (cblas_sgemv_t*) dlsym(veclib, "cblas_sgemv");
+    if (!accelerate_cblas_sgemv) {
+        unloadlib();
+        sprintf(errormsg,"Failed to resolve symbol 'cblas_sgemv'.");
+        Py_FatalError(errormsg);
+    }
+    /* resolve cblas_sgemm from Accelerate */
+    accelerate_cblas_sgemm = (cblas_sgemm_t*) dlsym(veclib, "cblas_sgemm");
+    if (!accelerate_cblas_sgemm) {
+        unloadlib();
+        sprintf(errormsg,"Failed to resolve symbol 'cblas_sgemm'.");
+        Py_FatalError(errormsg);
+    }
+}
+
+/* ----------------------------------------------------------------- */
+/* Fortran SGEMV override */
+
+void sgemv_( const char* trans, const int* m, const int* n,
+             const float* alpha, const float* A, const int* ldA,
+             const float* X, const int* incX,
+             const float* beta, float* Y, const int* incY )
+{
+    /* It is safe to use the original SGEMV if we are not using AVX on Mavericks
+     * or the input arrays A, X and Y are all aligned on 32 byte boundaries. */
+    #define BADARRAY(x) (((npy_intp)(void*)x) % 32)
+    const int use_sgemm = AVX_and_10_9 && (BADARRAY(A) || BADARRAY(X) || BADARRAY(Y));
+    if (!use_sgemm) {
+        accelerate_sgemv(trans,m,n,alpha,A,ldA,X,incX,beta,Y,incY);
+        return;
+    }
+
+    /* Arrays are misaligned, the CPU supports AVX, and we are running
+     * Mavericks.
+     *
+     * Emulation of SGEMV with SGEMM:
+     *
+     * SGEMV allows vectors to be strided. SGEMM requires all arrays to be
+     * contiguous along the leading dimension. To emulate striding in SGEMV
+     * with the leading dimension arguments in SGEMM we compute
+     *
+     *    Y = alpha * op(A) @ X + beta * Y
+     *
+     * as
+     *
+     *    Y.T = alpha * X.T @ op(A).T + beta * Y.T
+     *
+     * Because Fortran uses column major order and X.T and Y.T are row vectors,
+     * the leading dimensions of X.T and Y.T in SGEMM become equal to the
+     * strides of the column vectors X and Y in SGEMV. */
+
+    switch (*trans) {
+        case 'T':
+        case 't':
+        case 'C':
+        case 'c':
+            accelerate_cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
+                1, *n, *m, *alpha, X, *incX, A, *ldA, *beta, Y, *incY );
+            break;
+        case 'N':
+        case 'n':
+            accelerate_cblas_sgemm( CblasColMajor, CblasNoTrans, CblasTrans,
+                1, *m, *n, *alpha, X, *incX, A, *ldA, *beta, Y, *incY );
+            break;
+        default:
+            cblas_xerbla(1, "SGEMV", "Illegal transpose setting: %c\n", *trans);
+    }
+}
+
+/* ----------------------------------------------------------------- */
+/* Override for an alias symbol for sgemv_ in Accelerate */
+
+void sgemv (char *trans,
+            const int *m, const int *n,
+            const float *alpha,
+            const float *A, const int *lda,
+            const float *B, const int *incB,
+            const float *beta,
+            float *C, const int *incC)
+{
+    sgemv_(trans,m,n,alpha,A,lda,B,incB,beta,C,incC);
+}
+
+/* ----------------------------------------------------------------- */
+/* cblas_sgemv override, based on Netlib CBLAS code */
+
+void cblas_sgemv(const enum CBLAS_ORDER order,
+                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+                 const float alpha, const float *A, const int lda,
+                 const float *X, const int incX, const float beta,
+                 float *Y, const int incY)
+{
+    char TA;
+    if (order == CblasColMajor)
+    {
+        if (TransA == CblasNoTrans) TA = 'N';
+        else if (TransA == CblasTrans) TA = 'T';
+        else if (TransA == CblasConjTrans) TA = 'C';
+        else
+        {
+            cblas_xerbla(2, "cblas_sgemv","Illegal TransA setting, %d\n", TransA);
+        }
+        sgemv_(&TA, &M, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
+    }
+    else if (order == CblasRowMajor)
+    {
+        if (TransA == CblasNoTrans) TA = 'T';
+        else if (TransA == CblasTrans) TA = 'N';
+        else if (TransA == CblasConjTrans) TA = 'N';
+        else
+        {
+            cblas_xerbla(2, "cblas_sgemv", "Illegal TransA setting, %d\n", TransA);
+            return;
+        }
+        sgemv_(&TA, &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
+    }
+    else
+        cblas_xerbla(1, "cblas_sgemv", "Illegal Order setting, %d\n", order);
+}
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 11b443cf8..7a82f1e35 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -12,6 +12,7 @@ from os.path import join
 from numpy.distutils import log
 from distutils.dep_util import newer
 from distutils.sysconfig import get_config_var
+from numpy.build_utils.apple_accelerate import uses_accelerate_framework, get_sgemv_fix
 
 from setup_common import *
@@ -838,6 +839,8 @@ def configuration(parent_package='',top_path=None):
             multiarray_src.extend([join('src', 'multiarray', 'cblasfuncs.c'),
                                    join('src', 'multiarray', 'python_xerbla.c'),
                                    ])
+            if uses_accelerate_framework(blas_info):
+                multiarray_src.extend(get_sgemv_fix())
         else:
             extra_info = {}
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index 314adf4d1..dfa7880f9 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -3885,6 +3885,73 @@ class TestDot(TestCase):
         assert_raises(TypeError, np.dot, b, c)
         assert_raises(TypeError, c.dot, b)
 
+    def test_accelerate_framework_sgemv_fix(self):
+        from itertools import product
+        if sys.platform != 'darwin':
+            return
+
+        def aligned_array(shape, align, dtype, order='C'):
+            d = dtype()
+            N = np.prod(shape)
+            tmp = np.zeros(N * d.nbytes + align, dtype=np.uint8)
+            address = tmp.__array_interface__["data"][0]
+            for offset in range(align):
+                if (address + offset) % align == 0:
+                    break
+            tmp = tmp[offset:offset+N*d.nbytes].view(dtype=dtype)
+            return tmp.reshape(shape, order=order)
+
+        def as_aligned(arr, align, dtype, order='C'):
+            aligned = aligned_array(arr.shape, align, dtype, order)
+            aligned[:] = arr[:]
+            return aligned
+
+        def assert_dot_close(A, X, desired):
+            assert_allclose(np.dot(A, X), desired, rtol=1e-5, atol=1e-7)
+
+        m = aligned_array(100, 15, np.float32)
+        s = aligned_array((100, 100), 15, np.float32)
+        np.dot(s, m)  # this will always segfault if the bug is present
+
+        testdata = product((15,32), (10000,), (200,89), ('C','F'))
+        for align, m, n, a_order in testdata:
+            # Calculation in double precision
+            A_d = np.random.rand(m, n)
+            X_d = np.random.rand(n)
+            desired = np.dot(A_d, X_d)
+            # Calculation with aligned single precision
+            A_f = as_aligned(A_d, align, np.float32, order=a_order)
+            X_f = as_aligned(X_d, align, np.float32)
+            assert_dot_close(A_f, X_f, desired)
+            # Strided A rows
+            A_d_2 = A_d[::2]
+            desired = np.dot(A_d_2, X_d)
+            A_f_2 = A_f[::2]
+            assert_dot_close(A_f_2, X_f, desired)
+            # Strided A columns, strided X vector
+            A_d_22 = A_d_2[:, ::2]
+            X_d_2 = X_d[::2]
+            desired = np.dot(A_d_22, X_d_2)
+            A_f_22 = A_f_2[:, ::2]
+            X_f_2 = X_f[::2]
+            assert_dot_close(A_f_22, X_f_2, desired)
+            # Check the strides are as expected
+            if a_order == 'F':
+                assert_equal(A_f_22.strides, (8, 8 * m))
+            else:
+                assert_equal(A_f_22.strides, (8 * n, 8))
+            assert_equal(X_f_2.strides, (8,))
+            # Strides in A rows + cols only
+            X_f_2c = as_aligned(X_f_2, align, np.float32)
+            assert_dot_close(A_f_22, X_f_2c, desired)
+            # Strides just in A cols
+            A_d_12 = A_d[:, ::2]
+            desired = np.dot(A_d_12, X_d_2)
+            A_f_12 = A_f[:, ::2]
+            assert_dot_close(A_f_12, X_f_2c, desired)
+            # Strides in A cols and X
+            assert_dot_close(A_f_12, X_f_2, desired)
+
 
 class TestInner(TestCase):
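The transpose identity the C file relies on (Y.T = alpha * X.T @ op(A).T + beta * Y.T) is easy to check numerically. The following sketch is an illustration of that identity in NumPy terms only, not code from the patch; the patch itself performs the equivalent 1-row GEMM through cblas_sgemm:

import numpy as np

rng = np.random.RandomState(0)
m, n = 5, 3
alpha, beta = 1.5, 0.5
A = rng.rand(m, n).astype(np.float32)
x = rng.rand(n).astype(np.float32)
y = rng.rand(m).astype(np.float32)

# SGEMV with trans='N': y <- alpha * A @ x + beta * y
gemv = alpha * np.dot(A, x) + beta * y

# Same result phrased as a 1-row GEMM, the form the workaround
# hands to SGEMM: y.T <- alpha * x.T @ A.T + beta * y.T
gemm = alpha * np.dot(x[None, :], A.T) + beta * y[None, :]

assert np.allclose(gemv, gemm[0], rtol=1e-5, atol=1e-6)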