ENH: core: use symbol-suffixed 64-bit CBLAS in cblasfuncs, when available

When Numpy is compiled with BLAS64_ enabled, use the 64-bit CBLAS routines to provide cblasfuncs, and dot() et al.
author: Pauli Virtanen <pav@iki.fi> 2019-11-29 21:32:44 +0200
committer: Pauli Virtanen <pav@iki.fi> 2019-11-29 21:34:57 +0200
commit: acf8bcb04408811e6c0c72c4a4101505a14e59db (patch)
tree: 4667fe2c5d8a9c29a610e1363c814d4897a43b7b /numpy
parent: 2d75dec5ae997c2b089232b96e4a64b1065e54f5 (diff)
download: numpy-acf8bcb04408811e6c0c72c4a4101505a14e59db.tar.gz
1 files changed, 50 insertions, 36 deletions
diff --git a/numpy/core/src/common/cblasfuncs.c b/numpy/core/src/common/cblasfuncs.c
index 39572fed4..14d13a6c7 100644
--- a/numpy/core/src/common/cblasfuncs.c
+++ b/numpy/core/src/common/cblasfuncs.c
@@ -10,10 +10,20 @@
 #include <assert.h>
 #include <numpy/arrayobject.h>
 #include "npy_cblas.h"
+#include "npy_cblas64_.h"
 #include "arraytypes.h"
 #include "common.h"
 
 
+/*
+ * If 64-bit CBLAS with symbol suffix '64_' is available, use it.
+ */
+#ifdef HAVE_CBLAS64_
+#define CBLAS_FUNC(name) name ## 64_
+#else
+#define CBLAS_FUNC(name) name
+#endif
+
 static const double oneD[2] = {1.0, 0.0}, zeroD[2] = {0.0, 0.0};
 static const float oneF[2] = {1.0, 0.0}, zeroF[2] = {0.0, 0.0};
 
@@ -24,28 +34,28 @@ static const float oneF[2] = {1.0, 0.0}, zeroF[2] = {0.0, 0.0};
 static void
 gemm(int typenum, enum CBLAS_ORDER order,
      enum CBLAS_TRANSPOSE transA, enum CBLAS_TRANSPOSE transB,
-     int m, int n, int k,
-     PyArrayObject *A, int lda, PyArrayObject *B, int ldb, PyArrayObject *R)
+     npy_intp m, npy_intp n, npy_intp k,
+     PyArrayObject *A, npy_intp lda, PyArrayObject *B, npy_intp ldb, PyArrayObject *R)
 {
     const void *Adata = PyArray_DATA(A), *Bdata = PyArray_DATA(B);
     void *Rdata = PyArray_DATA(R);
-    int ldc = PyArray_DIM(R, 1) > 1 ? PyArray_DIM(R, 1) : 1;
+    npy_intp ldc = PyArray_DIM(R, 1) > 1 ? PyArray_DIM(R, 1) : 1;
 
     switch (typenum) {
         case NPY_DOUBLE:
-            cblas_dgemm(order, transA, transB, m, n, k, 1.,
+            CBLAS_FUNC(cblas_dgemm)(order, transA, transB, m, n, k, 1.,
                         Adata, lda, Bdata, ldb, 0., Rdata, ldc);
             break;
         case NPY_FLOAT:
-            cblas_sgemm(order, transA, transB, m, n, k, 1.f,
+            CBLAS_FUNC(cblas_sgemm)(order, transA, transB, m, n, k, 1.f,
                         Adata, lda, Bdata, ldb, 0.f, Rdata, ldc);
             break;
         case NPY_CDOUBLE:
-            cblas_zgemm(order, transA, transB, m, n, k, oneD,
+            CBLAS_FUNC(cblas_zgemm)(order, transA, transB, m, n, k, oneD,
                         Adata, lda, Bdata, ldb, zeroD, Rdata, ldc);
             break;
         case NPY_CFLOAT:
-            cblas_cgemm(order, transA, transB, m, n, k, oneF,
+            CBLAS_FUNC(cblas_cgemm)(order, transA, transB, m, n, k, oneF,
                         Adata, lda, Bdata, ldb, zeroF, Rdata, ldc);
             break;
     }
@@ -57,29 +67,29 @@ gemm(int typenum, enum CBLAS_ORDER order,
  */
 static void
 gemv(int typenum, enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans,
-     PyArrayObject *A, int lda, PyArrayObject *X, int incX,
+     PyArrayObject *A, npy_intp lda, PyArrayObject *X, npy_intp incX,
      PyArrayObject *R)
 {
     const void *Adata = PyArray_DATA(A), *Xdata = PyArray_DATA(X);
     void *Rdata = PyArray_DATA(R);
 
-    int m = PyArray_DIM(A, 0), n = PyArray_DIM(A, 1);
+    npy_intp m = PyArray_DIM(A, 0), n = PyArray_DIM(A, 1);
 
     switch (typenum) {
         case NPY_DOUBLE:
-            cblas_dgemv(order, trans, m, n, 1., Adata, lda, Xdata, incX,
+            CBLAS_FUNC(cblas_dgemv)(order, trans, m, n, 1., Adata, lda, Xdata, incX,
                         0., Rdata, 1);
             break;
         case NPY_FLOAT:
-            cblas_sgemv(order, trans, m, n, 1.f, Adata, lda, Xdata, incX,
+            CBLAS_FUNC(cblas_sgemv)(order, trans, m, n, 1.f, Adata, lda, Xdata, incX,
                         0.f, Rdata, 1);
             break;
         case NPY_CDOUBLE:
-            cblas_zgemv(order, trans, m, n, oneD, Adata, lda, Xdata, incX,
+            CBLAS_FUNC(cblas_zgemv)(order, trans, m, n, oneD, Adata, lda, Xdata, incX,
                         zeroD, Rdata, 1);
             break;
         case NPY_CFLOAT:
-            cblas_cgemv(order, trans, m, n, oneF, Adata, lda, Xdata, incX,
+            CBLAS_FUNC(cblas_cgemv)(order, trans, m, n, oneF, Adata, lda, Xdata, incX,
                         zeroF, Rdata, 1);
             break;
     }
@@ -91,19 +101,19 @@ gemv(int typenum, enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans,
  */
 static void
 syrk(int typenum, enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans,
-     int n, int k,
-     PyArrayObject *A, int lda, PyArrayObject *R)
+     npy_intp n, npy_intp k,
+     PyArrayObject *A, npy_intp lda, PyArrayObject *R)
 {
     const void *Adata = PyArray_DATA(A);
     void *Rdata = PyArray_DATA(R);
-    int ldc = PyArray_DIM(R, 1) > 1 ? PyArray_DIM(R, 1) : 1;
+    npy_intp ldc = PyArray_DIM(R, 1) > 1 ? PyArray_DIM(R, 1) : 1;
 
     npy_intp i;
     npy_intp j;
 
     switch (typenum) {
         case NPY_DOUBLE:
-            cblas_dsyrk(order, CblasUpper, trans, n, k, 1.,
+            CBLAS_FUNC(cblas_dsyrk)(order, CblasUpper, trans, n, k, 1.,
                         Adata, lda, 0., Rdata, ldc);
 
             for (i = 0; i < n; i++) {
@@ -114,7 +124,7 @@ syrk(int typenum, enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans,
             }
             break;
         case NPY_FLOAT:
-            cblas_ssyrk(order, CblasUpper, trans, n, k, 1.f,
+            CBLAS_FUNC(cblas_ssyrk)(order, CblasUpper, trans, n, k, 1.f,
                         Adata, lda, 0.f, Rdata, ldc);
 
             for (i = 0; i < n; i++) {
@@ -125,7 +135,7 @@ syrk(int typenum, enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans,
             }
             break;
         case NPY_CDOUBLE:
-            cblas_zsyrk(order, CblasUpper, trans, n, k, oneD,
+            CBLAS_FUNC(cblas_zsyrk)(order, CblasUpper, trans, n, k, oneD,
                         Adata, lda, zeroD, Rdata, ldc);
 
             for (i = 0; i < n; i++) {
@@ -136,7 +146,7 @@ syrk(int typenum, enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans,
             }
             break;
         case NPY_CFLOAT:
-            cblas_csyrk(order, CblasUpper, trans, n, k, oneF,
+            CBLAS_FUNC(cblas_csyrk)(order, CblasUpper, trans, n, k, oneF,
                         Adata, lda, zeroF, Rdata, ldc);
 
             for (i = 0; i < n; i++) {
@@ -222,7 +232,7 @@ cblas_matrixproduct(int typenum, PyArrayObject *ap1, PyArrayObject *ap2,
                     PyArrayObject *out)
 {
     PyArrayObject *result = NULL, *out_buf = NULL;
-    int j, lda, ldb;
+    npy_intp j, lda, ldb;
     npy_intp l;
     int nd;
     npy_intp ap1stride = 0;
@@ -385,14 +395,15 @@ cblas_matrixproduct(int typenum, PyArrayObject *ap1, PyArrayObject *ap2,
                                                  *((double *)PyArray_DATA(ap1));
             }
             else if (ap1shape != _matrix) {
-                cblas_daxpy(l,
+                CBLAS_FUNC(cblas_daxpy)(l,
                             *((double *)PyArray_DATA(ap2)),
                             (double *)PyArray_DATA(ap1),
                             ap1stride/sizeof(double),
                             (double *)PyArray_DATA(out_buf), 1);
             }
             else {
-                int maxind, oind, i, a1s, outs;
+                int maxind, oind;
+                npy_intp i, a1s, outs;
                 char *ptr, *optr;
                 double val;
 
@@ -405,7 +416,7 @@ cblas_matrixproduct(int typenum, PyArrayObject *ap1, PyArrayObject *ap2,
                 a1s = PyArray_STRIDE(ap1, maxind) / sizeof(double);
                 outs = PyArray_STRIDE(out_buf, maxind) / sizeof(double);
                 for (i = 0; i < PyArray_DIM(ap1, oind); i++) {
-                    cblas_daxpy(l, val, (double *)ptr, a1s,
+                    CBLAS_FUNC(cblas_daxpy)(l, val, (double *)ptr, a1s,
                                 (double *)optr, outs);
                     ptr += PyArray_STRIDE(ap1, oind);
                     optr += PyArray_STRIDE(out_buf, oind);
@@ -423,14 +434,15 @@ cblas_matrixproduct(int typenum, PyArrayObject *ap1, PyArrayObject *ap2,
                 res->imag = ptr1->real * ptr2->imag + ptr1->imag * ptr2->real;
             }
             else if (ap1shape != _matrix) {
-                cblas_zaxpy(l,
+                CBLAS_FUNC(cblas_zaxpy)(l,
                             (double *)PyArray_DATA(ap2),
                             (double *)PyArray_DATA(ap1),
                             ap1stride/sizeof(npy_cdouble),
                             (double *)PyArray_DATA(out_buf), 1);
             }
             else {
-                int maxind, oind, i, a1s, outs;
+                int maxind, oind;
+                npy_intp i, a1s, outs;
                 char *ptr, *optr;
                 double *pval;
 
@@ -443,7 +455,7 @@ cblas_matrixproduct(int typenum, PyArrayObject *ap1, PyArrayObject *ap2,
                 a1s = PyArray_STRIDE(ap1, maxind) / sizeof(npy_cdouble);
                 outs = PyArray_STRIDE(out_buf, maxind) / sizeof(npy_cdouble);
                 for (i = 0; i < PyArray_DIM(ap1, oind); i++) {
-                    cblas_zaxpy(l, pval, (double *)ptr, a1s,
+                    CBLAS_FUNC(cblas_zaxpy)(l, pval, (double *)ptr, a1s,
                                 (double *)optr, outs);
                     ptr += PyArray_STRIDE(ap1, oind);
                     optr += PyArray_STRIDE(out_buf, oind);
@@ -456,14 +468,15 @@ cblas_matrixproduct(int typenum, PyArrayObject *ap1, PyArrayObject *ap2,
                     *((float *)PyArray_DATA(ap1));
             }
             else if (ap1shape != _matrix) {
-                cblas_saxpy(l,
+                CBLAS_FUNC(cblas_saxpy)(l,
                             *((float *)PyArray_DATA(ap2)),
                             (float *)PyArray_DATA(ap1),
                             ap1stride/sizeof(float),
                             (float *)PyArray_DATA(out_buf), 1);
             }
             else {
-                int maxind, oind, i, a1s, outs;
+                int maxind, oind;
+                npy_intp i, a1s, outs;
                 char *ptr, *optr;
                 float val;
 
@@ -476,7 +489,7 @@ cblas_matrixproduct(int typenum, PyArrayObject *ap1, PyArrayObject *ap2,
                 a1s = PyArray_STRIDE(ap1, maxind) / sizeof(float);
                 outs = PyArray_STRIDE(out_buf, maxind) / sizeof(float);
                 for (i = 0; i < PyArray_DIM(ap1, oind); i++) {
-                    cblas_saxpy(l, val, (float *)ptr, a1s,
+                    CBLAS_FUNC(cblas_saxpy)(l, val, (float *)ptr, a1s,
                                 (float *)optr, outs);
                     ptr += PyArray_STRIDE(ap1, oind);
                     optr += PyArray_STRIDE(out_buf, oind);
@@ -494,14 +507,15 @@ cblas_matrixproduct(int typenum, PyArrayObject *ap1, PyArrayObject *ap2,
                 res->imag = ptr1->real * ptr2->imag + ptr1->imag * ptr2->real;
             }
             else if (ap1shape != _matrix) {
-                cblas_caxpy(l,
+                CBLAS_FUNC(cblas_caxpy)(l,
                             (float *)PyArray_DATA(ap2),
                             (float *)PyArray_DATA(ap1),
                             ap1stride/sizeof(npy_cfloat),
                             (float *)PyArray_DATA(out_buf), 1);
             }
             else {
-                int maxind, oind, i, a1s, outs;
+                int maxind, oind;
+                npy_intp i, a1s, outs;
                 char *ptr, *optr;
                 float *pval;
 
@@ -514,7 +528,7 @@ cblas_matrixproduct(int typenum, PyArrayObject *ap1, PyArrayObject *ap2,
                 a1s = PyArray_STRIDE(ap1, maxind) / sizeof(npy_cfloat);
                 outs = PyArray_STRIDE(out_buf, maxind) / sizeof(npy_cfloat);
                 for (i = 0; i < PyArray_DIM(ap1, oind); i++) {
-                    cblas_caxpy(l, pval, (float *)ptr, a1s,
+                    CBLAS_FUNC(cblas_caxpy)(l, pval, (float *)ptr, a1s,
                                 (float *)optr, outs);
                     ptr += PyArray_STRIDE(ap1, oind);
                     optr += PyArray_STRIDE(out_buf, oind);
@@ -537,7 +551,7 @@ cblas_matrixproduct(int typenum, PyArrayObject *ap1, PyArrayObject *ap2,
         /* Matrix vector multiplication -- Level 2 BLAS */
         /* lda must be MAX(M,1) */
         enum CBLAS_ORDER Order;
-        int ap2s;
+        npy_intp ap2s;
 
         if (!PyArray_ISONESEGMENT(ap1)) {
             PyObject *new;
@@ -564,7 +578,7 @@ cblas_matrixproduct(int typenum, PyArrayObject *ap1, PyArrayObject *ap2,
     else if (ap1shape != _matrix && ap2shape == _matrix) {
         /* Vector matrix multiplication -- Level 2 BLAS */
         enum CBLAS_ORDER Order;
-        int ap1s;
+        npy_intp ap1s;
 
         if (!PyArray_ISONESEGMENT(ap2)) {
             PyObject *new;
@@ -601,7 +615,7 @@ cblas_matrixproduct(int typenum, PyArrayObject *ap1, PyArrayObject *ap2,
          */
         enum CBLAS_ORDER Order;
         enum CBLAS_TRANSPOSE Trans1, Trans2;
-        int M, N, L;
+        npy_intp M, N, L;
 
         /* Optimization possible: */
         /*
author	Pauli Virtanen <pav@iki.fi>	2019-11-29 21:32:44 +0200
committer	Pauli Virtanen <pav@iki.fi>	2019-11-29 21:34:57 +0200
commit	acf8bcb04408811e6c0c72c4a4101505a14e59db (patch)
tree	4667fe2c5d8a9c29a610e1363c814d4897a43b7b /numpy
parent	2d75dec5ae997c2b089232b96e4a64b1065e54f5 (diff)
download	numpy-acf8bcb04408811e6c0c72c4a4101505a14e59db.tar.gz