diff options
| -rw-r--r-- | numpy/linalg/umath_linalg.cpp.src | 2151 |
1 files changed, 1138 insertions, 1013 deletions
diff --git a/numpy/linalg/umath_linalg.cpp.src b/numpy/linalg/umath_linalg.cpp.src index 44e1bef88..944f46ba7 100644 --- a/numpy/linalg/umath_linalg.cpp.src +++ b/numpy/linalg/umath_linalg.cpp.src @@ -22,10 +22,16 @@ #include <stdio.h> #include <assert.h> #include <math.h> +#include <utility> static const char* umath_linalg_version_string = "0.1.5"; +struct scalar_trait {}; +struct complex_trait {}; +template<typename typ> +using dispatch_scalar = typename std::conditional<std::is_scalar<typ>::value, scalar_trait, complex_trait>::type; + /* **************************************************************************** * Debugging support * @@ -427,80 +433,92 @@ set_fp_invalid_or_clear(int error_occurred) #define UMATH_LINALG_MODULE_NAME "_umath_linalg" -typedef union { - fortran_complex f; - npy_cfloat npy; - float array[2]; -} COMPLEX_t; - -typedef union { - fortran_doublecomplex f; - npy_cdouble npy; - double array[2]; -} DOUBLECOMPLEX_t; - -static float s_one; -static float s_zero; -static float s_minus_one; -static float s_ninf; -static float s_nan; -static double d_one; -static double d_zero; -static double d_minus_one; -static double d_ninf; -static double d_nan; -static COMPLEX_t c_one; -static COMPLEX_t c_zero; -static COMPLEX_t c_minus_one; -static COMPLEX_t c_ninf; -static COMPLEX_t c_nan; -static DOUBLECOMPLEX_t z_one; -static DOUBLECOMPLEX_t z_zero; -static DOUBLECOMPLEX_t z_minus_one; -static DOUBLECOMPLEX_t z_ninf; -static DOUBLECOMPLEX_t z_nan; - -static void init_constants(void) -{ - /* - this is needed as NPY_INFINITY and NPY_NAN macros - can't be used as initializers. I prefer to just set - all the constants the same way. - */ - s_one = 1.0f; - s_zero = 0.0f; - s_minus_one = -1.0f; - s_ninf = -NPY_INFINITYF; - s_nan = NPY_NANF; - - d_one = 1.0; - d_zero = 0.0; - d_minus_one = -1.0; - d_ninf = -NPY_INFINITY; - d_nan = NPY_NAN; - - c_one.array[0] = 1.0f; - c_one.array[1] = 0.0f; - c_zero.array[0] = 0.0f; - c_zero.array[1] = 0.0f; - c_minus_one.array[0] = -1.0f; - c_minus_one.array[1] = 0.0f; - c_ninf.array[0] = -NPY_INFINITYF; - c_ninf.array[1] = 0.0f; - c_nan.array[0] = NPY_NANF; - c_nan.array[1] = NPY_NANF; - - z_one.array[0] = 1.0; - z_one.array[1] = 0.0; - z_zero.array[0] = 0.0; - z_zero.array[1] = 0.0; - z_minus_one.array[0] = -1.0; - z_minus_one.array[1] = 0.0; - z_ninf.array[0] = -NPY_INFINITY; - z_ninf.array[1] = 0.0; - z_nan.array[0] = NPY_NAN; - z_nan.array[1] = NPY_NAN; -} +template<typename T> +struct numeric_limits; + +template<> +struct numeric_limits<float> { +static constexpr float one = 1.0f; +static constexpr float zero = 0.0f; +static constexpr float minus_one = -1.0f; +static const float ninf; +static const float nan; +}; +constexpr float numeric_limits<float>::one; +constexpr float numeric_limits<float>::zero; +constexpr float numeric_limits<float>::minus_one; +const float numeric_limits<float>::ninf = -NPY_INFINITYF; +const float numeric_limits<float>::nan = NPY_NANF; + +template<> +struct numeric_limits<double> { +static constexpr double one = 1.0; +static constexpr double zero = 0.0; +static constexpr double minus_one = -1.0; +static const double ninf; +static const double nan; +}; +constexpr double numeric_limits<double>::one; +constexpr double numeric_limits<double>::zero; +constexpr double numeric_limits<double>::minus_one; +const double numeric_limits<double>::ninf = -NPY_INFINITY; +const double numeric_limits<double>::nan = NPY_NAN; + +template<> +struct numeric_limits<npy_cfloat> { +static constexpr npy_cfloat one = {1.0f, 0.0f}; +static constexpr npy_cfloat zero = {0.0f, 0.0f}; +static constexpr npy_cfloat minus_one = {-1.0f, 0.0f}; +static const npy_cfloat ninf; +static const npy_cfloat nan; +}; +constexpr npy_cfloat numeric_limits<npy_cfloat>::one; +constexpr npy_cfloat numeric_limits<npy_cfloat>::zero; +constexpr npy_cfloat numeric_limits<npy_cfloat>::minus_one; +const npy_cfloat numeric_limits<npy_cfloat>::ninf = {-NPY_INFINITYF, 0.0f}; +const npy_cfloat numeric_limits<npy_cfloat>::nan = {NPY_NANF, NPY_NANF}; + +template<> +struct numeric_limits<f2c_complex> { +static constexpr f2c_complex one = {1.0f, 0.0f}; +static constexpr f2c_complex zero = {0.0f, 0.0f}; +static constexpr f2c_complex minus_one = {-1.0f, 0.0f}; +static const f2c_complex ninf; +static const f2c_complex nan; +}; +constexpr f2c_complex numeric_limits<f2c_complex>::one; +constexpr f2c_complex numeric_limits<f2c_complex>::zero; +constexpr f2c_complex numeric_limits<f2c_complex>::minus_one; +const f2c_complex numeric_limits<f2c_complex>::ninf = {-NPY_INFINITYF, 0.0f}; +const f2c_complex numeric_limits<f2c_complex>::nan = {NPY_NANF, NPY_NANF}; + +template<> +struct numeric_limits<npy_cdouble> { +static constexpr npy_cdouble one = {1.0, 0.0}; +static constexpr npy_cdouble zero = {0.0, 0.0}; +static constexpr npy_cdouble minus_one = {-1.0, 0.0}; +static const npy_cdouble ninf; +static const npy_cdouble nan; +}; +constexpr npy_cdouble numeric_limits<npy_cdouble>::one; +constexpr npy_cdouble numeric_limits<npy_cdouble>::zero; +constexpr npy_cdouble numeric_limits<npy_cdouble>::minus_one; +const npy_cdouble numeric_limits<npy_cdouble>::ninf = {-NPY_INFINITY, 0.0}; +const npy_cdouble numeric_limits<npy_cdouble>::nan = {NPY_NAN, NPY_NAN}; + +template<> +struct numeric_limits<f2c_doublecomplex> { +static constexpr f2c_doublecomplex one = {1.0, 0.0}; +static constexpr f2c_doublecomplex zero = {0.0, 0.0}; +static constexpr f2c_doublecomplex minus_one = {-1.0, 0.0}; +static const f2c_doublecomplex ninf; +static const f2c_doublecomplex nan; +}; +constexpr f2c_doublecomplex numeric_limits<f2c_doublecomplex>::one; +constexpr f2c_doublecomplex numeric_limits<f2c_doublecomplex>::zero; +constexpr f2c_doublecomplex numeric_limits<f2c_doublecomplex>::minus_one; +const f2c_doublecomplex numeric_limits<f2c_doublecomplex>::ninf = {-NPY_INFINITY, 0.0}; +const f2c_doublecomplex numeric_limits<f2c_doublecomplex>::nan = {NPY_NAN, NPY_NAN}; /* ***************************************************************************** @@ -590,36 +608,33 @@ dump_linearize_data(const char* name, const LINEARIZE_DATA_t* params) } static NPY_INLINE void -print_FLOAT(npy_float s) +print(npy_float s) { TRACE_TXT(" %8.4f", s); } static NPY_INLINE void -print_DOUBLE(npy_double d) +print(npy_double d) { TRACE_TXT(" %10.6f", d); } static NPY_INLINE void -print_CFLOAT(npy_cfloat c) +print(npy_cfloat c) { float* c_parts = (float*)&c; TRACE_TXT("(%8.4f, %8.4fj)", c_parts[0], c_parts[1]); } static NPY_INLINE void -print_CDOUBLE(npy_cdouble z) +print(npy_cdouble z) { double* z_parts = (double*)&z; TRACE_TXT("(%8.4f, %8.4fj)", z_parts[0], z_parts[1]); } -/**begin repeat - #TYPE = FLOAT, DOUBLE, CFLOAT, CDOUBLE# - #typ = npy_float, npy_double, npy_cfloat, npy_cdouble# - */ +template<typename typ> static NPY_INLINE void -dump_@TYPE@_matrix(const char* name, +dump_matrix(const char* name, size_t rows, size_t columns, - const @typ@* ptr) + const typ* ptr) { size_t i, j; @@ -629,13 +644,12 @@ dump_@TYPE@_matrix(const char* name, TRACE_TXT("| "); for (j = 0; j < columns; j++) { - print_@TYPE@(ptr[j*rows + i]); + print(ptr[j*rows + i]); TRACE_TXT(", "); } TRACE_TXT(" |\n"); } } -/**end repeat**/ /* @@ -754,46 +768,100 @@ update_pointers(npy_uint8** bases, ptrdiff_t* offsets, size_t count) /* ***************************************************************************** + ** DISPATCHER FUNCS ** + ***************************************************************************** + */ +static fortran_int copy(fortran_int *n, + float *sx, fortran_int *incx, + float *sy, fortran_int *incy) { return FNAME(scopy)(n, sx, incx, + sy, incy); +} +static fortran_int copy(fortran_int *n, + double *sx, fortran_int *incx, + double *sy, fortran_int *incy) { return FNAME(dcopy)(n, sx, incx, + sy, incy); +} +static fortran_int copy(fortran_int *n, + f2c_complex *sx, fortran_int *incx, + f2c_complex *sy, fortran_int *incy) { return FNAME(ccopy)(n, sx, incx, + sy, incy); +} +static fortran_int copy(fortran_int *n, + f2c_doublecomplex *sx, fortran_int *incx, + f2c_doublecomplex *sy, fortran_int *incy) { return FNAME(zcopy)(n, sx, incx, + sy, incy); +} + +static fortran_int getrf(fortran_int *m, fortran_int *n, float a[], fortran_int +*lda, fortran_int ipiv[], fortran_int *info) { + return LAPACK(sgetrf)(m, n, a, lda, ipiv, info); +} +static fortran_int getrf(fortran_int *m, fortran_int *n, double a[], fortran_int +*lda, fortran_int ipiv[], fortran_int *info) { + return LAPACK(dgetrf)(m, n, a, lda, ipiv, info); +} +static fortran_int getrf(fortran_int *m, fortran_int *n, f2c_complex a[], fortran_int +*lda, fortran_int ipiv[], fortran_int *info) { + return LAPACK(cgetrf)(m, n, a, lda, ipiv, info); +} +static fortran_int getrf(fortran_int *m, fortran_int *n, f2c_doublecomplex a[], fortran_int +*lda, fortran_int ipiv[], fortran_int *info) { + return LAPACK(zgetrf)(m, n, a, lda, ipiv, info); +} + +/* + ***************************************************************************** ** HELPER FUNCS ** ***************************************************************************** */ +template<typename T> +struct fortran_type { +using type = T; +}; + +template<> struct fortran_type<npy_cfloat> { using type = f2c_complex;}; +template<> struct fortran_type<npy_cdouble> { using type = f2c_doublecomplex;}; +template<typename T> +using fortran_type_t = typename fortran_type<T>::type; + +template<typename T> +struct basetype { +using type = T; +}; +template<> struct basetype<npy_cfloat> { using type = npy_float;}; +template<> struct basetype<npy_cdouble> { using type = npy_double;}; +template<> struct basetype<f2c_complex> { using type = fortran_real;}; +template<> struct basetype<f2c_doublecomplex> { using type = fortran_doublereal;}; +template<typename T> +using basetype_t = typename basetype<T>::type; /* rearranging of 2D matrices using blas */ -/**begin repeat - #TYPE = FLOAT, DOUBLE, CFLOAT, CDOUBLE# - #typ = float, double, COMPLEX_t, DOUBLECOMPLEX_t# - #ftyp = float, double, f2c_complex, f2c_doublecomplex# - #copy = scopy, dcopy, ccopy, zcopy# - #nan = s_nan, d_nan, c_nan, z_nan# - #zero = s_zero, d_zero, c_zero, z_zero# - */ +template<typename typ> static NPY_INLINE void * -linearize_@TYPE@_matrix(void *dst_in, - void *src_in, +linearize_matrix(typ *dst, + typ *src, const LINEARIZE_DATA_t* data) { - @typ@ *src = (@typ@ *) src_in; - @typ@ *dst = (@typ@ *) dst_in; - + using ftyp = fortran_type_t<typ>; if (dst) { int i, j; - @typ@* rv = dst; + typ* rv = dst; fortran_int columns = (fortran_int)data->columns; fortran_int column_strides = - (fortran_int)(data->column_strides/sizeof(@typ@)); + (fortran_int)(data->column_strides/sizeof(typ)); fortran_int one = 1; for (i = 0; i < data->rows; i++) { if (column_strides > 0) { - FNAME(@copy@)(&columns, - (@ftyp@*)src, &column_strides, - (@ftyp@*)dst, &one); + copy(&columns, + (ftyp*)src, &column_strides, + (ftyp*)dst, &one); } else if (column_strides < 0) { - FNAME(@copy@)(&columns, - ((@ftyp@*)src + (columns-1)*column_strides), + copy(&columns, + ((ftyp*)src + (columns-1)*column_strides), &column_strides, - (@ftyp@*)dst, &one); + (ftyp*)dst, &one); } else { /* @@ -802,10 +870,10 @@ linearize_@TYPE@_matrix(void *dst_in, * manually */ for (j = 0; j < columns; ++j) { - memcpy((@typ@*)dst + j, (@typ@*)src, sizeof(@typ@)); + memcpy(dst + j, src, sizeof(typ)); } } - src += data->row_strides/sizeof(@typ@); + src += data->row_strides/sizeof(typ); dst += data->output_lead_dim; } return rv; @@ -814,31 +882,31 @@ linearize_@TYPE@_matrix(void *dst_in, } } +template<typename typ> static NPY_INLINE void * -delinearize_@TYPE@_matrix(void *dst_in, - void *src_in, +delinearize_matrix(typ *dst, + typ *src, const LINEARIZE_DATA_t* data) { - @typ@ *src = (@typ@ *) src_in; - @typ@ *dst = (@typ@ *) dst_in; +using ftyp = fortran_type_t<typ>; if (src) { int i; - @typ@ *rv = src; + typ *rv = src; fortran_int columns = (fortran_int)data->columns; fortran_int column_strides = - (fortran_int)(data->column_strides/sizeof(@typ@)); + (fortran_int)(data->column_strides/sizeof(typ)); fortran_int one = 1; for (i = 0; i < data->rows; i++) { if (column_strides > 0) { - FNAME(@copy@)(&columns, - (@ftyp@*)src, &one, - (@ftyp@*)dst, &column_strides); + copy(&columns, + (ftyp*)src, &one, + (ftyp*)dst, &column_strides); } else if (column_strides < 0) { - FNAME(@copy@)(&columns, - (@ftyp@*)src, &one, - ((@ftyp@*)dst + (columns-1)*column_strides), + copy(&columns, + (ftyp*)src, &one, + ((ftyp*)dst + (columns-1)*column_strides), &column_strides); } else { @@ -848,13 +916,13 @@ delinearize_@TYPE@_matrix(void *dst_in, * manually */ if (columns > 0) { - memcpy((@typ@*)dst, - (@typ@*)src + (columns-1), - sizeof(@typ@)); + memcpy(dst, + src + (columns-1), + sizeof(typ)); } } src += data->output_lead_dim; - dst += data->row_strides/sizeof(@typ@); + dst += data->row_strides/sizeof(typ); } return rv; @@ -863,116 +931,97 @@ delinearize_@TYPE@_matrix(void *dst_in, } } +template<typename typ> static NPY_INLINE void -nan_@TYPE@_matrix(void *dst_in, const LINEARIZE_DATA_t* data) +nan_matrix(typ *dst, const LINEARIZE_DATA_t* data) { - @typ@ *dst = (@typ@ *) dst_in; - int i, j; for (i = 0; i < data->rows; i++) { - @typ@ *cp = dst; - ptrdiff_t cs = data->column_strides/sizeof(@typ@); + typ *cp = dst; + ptrdiff_t cs = data->column_strides/sizeof(typ); for (j = 0; j < data->columns; ++j) { - *cp = @nan@; + *cp = numeric_limits<typ>::nan; cp += cs; } - dst += data->row_strides/sizeof(@typ@); + dst += data->row_strides/sizeof(typ); } } +template<typename typ> static NPY_INLINE void -zero_@TYPE@_matrix(void *dst_in, const LINEARIZE_DATA_t* data) +zero_matrix(typ *dst, const LINEARIZE_DATA_t* data) { - @typ@ *dst = (@typ@ *) dst_in; - int i, j; for (i = 0; i < data->rows; i++) { - @typ@ *cp = dst; - ptrdiff_t cs = data->column_strides/sizeof(@typ@); + typ *cp = dst; + ptrdiff_t cs = data->column_strides/sizeof(typ); for (j = 0; j < data->columns; ++j) { - *cp = @zero@; + *cp = numeric_limits<typ>::zero; cp += cs; } - dst += data->row_strides/sizeof(@typ@); + dst += data->row_strides/sizeof(typ); } } -/**end repeat**/ - /* identity square matrix generation */ -/**begin repeat - #TYPE = FLOAT, DOUBLE, CFLOAT, CDOUBLE# - #typ = float, double, COMPLEX_t, DOUBLECOMPLEX_t# - #cblas_type = s, d, c, z# - */ +template<typename typ> static NPY_INLINE void -identity_@TYPE@_matrix(void *ptr, size_t n) +identity_matrix(typ *matrix, size_t n) { size_t i; - @typ@ *matrix = (@typ@*) ptr; /* in IEEE floating point, zeroes are represented as bitwise 0 */ - memset(matrix, 0, n*n*sizeof(@typ@)); + memset(matrix, 0, n*n*sizeof(typ)); for (i = 0; i < n; ++i) { - *matrix = @cblas_type@_one; + *matrix = numeric_limits<typ>::one; matrix += n+1; } } -/**end repeat**/ /* lower/upper triangular matrix using blas (in place) */ -/**begin repeat - - #TYPE = FLOAT, DOUBLE, CFLOAT, CDOUBLE# - #typ = float, double, COMPLEX_t, DOUBLECOMPLEX_t# - #cblas_type = s, d, c, z# - */ +template<typename typ> static NPY_INLINE void -triu_@TYPE@_matrix(void *ptr, size_t n) +triu_matrix(typ *matrix, size_t n) { size_t i, j; - @typ@ *matrix = (@typ@*)ptr; matrix += n; for (i = 1; i < n; ++i) { for (j = 0; j < i; ++j) { - matrix[j] = @cblas_type@_zero; + matrix[j] = numeric_limits<typ>::zero; } matrix += n; } } -/**end repeat**/ /* -------------------------------------------------------------------------- */ /* Determinants */ -/**begin repeat - #TYPE = FLOAT, DOUBLE# - #typ = npy_float, npy_double# - #log_func = npy_logf, npy_log# - #exp_func = npy_expf, npy_exp# - #zero = 0.0f, 0.0# -*/ +static npy_float npylog(npy_float f) { return npy_logf(f);} +static npy_double npylog(npy_double d) { return npy_log(d);} +static npy_float npyexp(npy_float f) { return npy_expf(f);} +static npy_double npyexp(npy_double d) { return npy_exp(d);} +template<typename typ> static NPY_INLINE void -@TYPE@_slogdet_from_factored_diagonal(@typ@* src, +slogdet_from_factored_diagonal(typ* src, fortran_int m, - @typ@ *sign, - @typ@ *logdet) + typ *sign, + typ *logdet) { - @typ@ acc_sign = *sign; - @typ@ acc_logdet = @zero@; + typ acc_sign = *sign; + typ acc_logdet = numeric_limits<typ>::zero; int i; for (i = 0; i < m; i++) { - @typ@ abs_element = *src; - if (abs_element < @zero@) { + typ abs_element = *src; + if (abs_element < numeric_limits<typ>::zero) { acc_sign = -acc_sign; abs_element = -abs_element; } - acc_logdet += @log_func@(abs_element); + acc_logdet += npylog(abs_element); src += m+1; } @@ -980,32 +1029,26 @@ static NPY_INLINE void *logdet = acc_logdet; } -static NPY_INLINE @typ@ -@TYPE@_det_from_slogdet(@typ@ sign, @typ@ logdet) +template<typename typ> +static NPY_INLINE typ +det_from_slogdet(typ sign, typ logdet) { - @typ@ result = sign * @exp_func@(logdet); + typ result = sign * npyexp(logdet); return result; } -/**end repeat**/ +npy_float npyabs(npy_cfloat z) { return npy_cabsf(z);} +npy_double npyabs(npy_cdouble z) { return npy_cabs(z);} -/**begin repeat - #TYPE = CFLOAT, CDOUBLE# - #typ = npy_cfloat, npy_cdouble# - #basetyp = npy_float, npy_double# - #abs_func = npy_cabsf, npy_cabs# - #log_func = npy_logf, npy_log# - #exp_func = npy_expf, npy_exp# - #zero = 0.0f, 0.0# -*/ -#define RE(COMPLEX) (((@basetyp@*)(&COMPLEX))[0]) -#define IM(COMPLEX) (((@basetyp@*)(&COMPLEX))[1]) +#define RE(COMPLEX) (COMPLEX).real +#define IM(COMPLEX) (COMPLEX).imag -static NPY_INLINE @typ@ -@TYPE@_mult(@typ@ op1, @typ@ op2) +template<typename typ> +static NPY_INLINE typ +mult(typ op1, typ op2) { - @typ@ rv; + typ rv; RE(rv) = RE(op1)*RE(op2) - IM(op1)*IM(op2); IM(rv) = RE(op1)*IM(op2) + IM(op1)*RE(op2); @@ -1014,25 +1057,26 @@ static NPY_INLINE @typ@ } +template<typename typ, typename basetyp> static NPY_INLINE void -@TYPE@_slogdet_from_factored_diagonal(@typ@* src, +slogdet_from_factored_diagonal(typ* src, fortran_int m, - @typ@ *sign, - @basetyp@ *logdet) + typ *sign, + basetyp *logdet) { int i; - @typ@ sign_acc = *sign; - @basetyp@ logdet_acc = @zero@; + typ sign_acc = *sign; + basetyp logdet_acc = numeric_limits<basetyp>::zero; for (i = 0; i < m; i++) { - @basetyp@ abs_element = @abs_func@(*src); - @typ@ sign_element; + basetyp abs_element = npyabs(*src); + typ sign_element; RE(sign_element) = RE(*src) / abs_element; IM(sign_element) = IM(*src) / abs_element; - sign_acc = @TYPE@_mult(sign_acc, sign_element); - logdet_acc += @log_func@(abs_element); + sign_acc = mult(sign_acc, sign_element); + logdet_acc += npylog(abs_element); src += m + 1; } @@ -1040,17 +1084,17 @@ static NPY_INLINE void *logdet = logdet_acc; } -static NPY_INLINE @typ@ -@TYPE@_det_from_slogdet(@typ@ sign, @basetyp@ logdet) +template<typename typ, typename basetyp> +static NPY_INLINE typ +det_from_slogdet(typ sign, basetyp logdet) { - @typ@ tmp; - RE(tmp) = @exp_func@(logdet); - IM(tmp) = @zero@; - return @TYPE@_mult(sign, tmp); + typ tmp; + RE(tmp) = npyexp(logdet); + IM(tmp) = numeric_limits<basetyp>::zero; + return mult(sign, tmp); } #undef RE #undef IM -/**end repeat**/ /* As in the linalg package, the determinant is computed via LU factorization @@ -1058,27 +1102,20 @@ static NPY_INLINE @typ@ * slogdet computes sign + log(determinant). * det computes sign * exp(slogdet). */ -/**begin repeat - - #TYPE = FLOAT, DOUBLE, CFLOAT, CDOUBLE# - #typ = npy_float, npy_double, npy_cfloat, npy_cdouble# - #ftyp = float, double, f2c_complex, f2c_doublecomplex# - #basetyp = npy_float, npy_double, npy_float, npy_double# - #cblas_type = s, d, c, z# -*/ - +template<typename typ, typename basetyp> static NPY_INLINE void -@TYPE@_slogdet_single_element(fortran_int m, - void* src, +slogdet_single_element(fortran_int m, + typ* src, fortran_int* pivots, - @typ@ *sign, - @basetyp@ *logdet) + typ *sign, + basetyp *logdet) { +using ftyp = fortran_type_t<typ>; fortran_int info = 0; fortran_int lda = fortran_int_max(m, 1); int i; /* note: done in place */ - LAPACK(@cblas_type@getrf)(&m, &m, (@ftyp@*)src, &lda, pivots, &info); + getrf(&m, &m, (ftyp*)src, &lda, pivots, &info); if (info == 0) { int change_sign = 0; @@ -1088,23 +1125,20 @@ static NPY_INLINE void change_sign += (pivots[i] != (i+1)); } - memcpy(sign, - (change_sign % 2)? - &@cblas_type@_minus_one : - &@cblas_type@_one - , sizeof(*sign)); - @TYPE@_slogdet_from_factored_diagonal((@typ@*)src, m, sign, logdet); + *sign = (change_sign % 2)?numeric_limits<typ>::minus_one:numeric_limits<typ>::one; + slogdet_from_factored_diagonal(src, m, sign, logdet); } else { /* if getrf fails, use 0 as sign and -inf as logdet */ - memcpy(sign, &@cblas_type@_zero, sizeof(*sign)); - memcpy(logdet, &@cblas_type@_ninf, sizeof(*logdet)); + *sign = numeric_limits<typ>::zero; + *logdet = numeric_limits<basetyp>::ninf; } } +template<typename typ, typename basetyp> static void -@TYPE@_slogdet(char **args, +slogdet(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) @@ -1125,7 +1159,7 @@ static void INIT_OUTER_LOOP_3 m = (fortran_int) dimensions[0]; safe_m = m; - matrix_size = safe_m * safe_m * sizeof(@typ@); + matrix_size = safe_m * safe_m * sizeof(typ); pivot_size = safe_m * sizeof(fortran_int); tmp_buff = (npy_uint8 *)malloc(matrix_size + pivot_size); @@ -1134,20 +1168,21 @@ static void /* swapped steps to get matrix in FORTRAN order */ init_linearize_data(&lin_data, m, m, steps[1], steps[0]); BEGIN_OUTER_LOOP_3 - linearize_@TYPE@_matrix(tmp_buff, args[0], &lin_data); - @TYPE@_slogdet_single_element(m, - tmp_buff, + linearize_matrix((typ*)tmp_buff, (typ*)args[0], &lin_data); + slogdet_single_element(m, + (typ*)tmp_buff, (fortran_int*)(tmp_buff+matrix_size), - (@typ@*)args[1], - (@basetyp@*)args[2]); + (typ*)args[1], + (basetyp*)args[2]); END_OUTER_LOOP free(tmp_buff); } } +template<typename typ, typename basetyp> static void -@TYPE@_det(char **args, +det(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) @@ -1168,42 +1203,42 @@ static void INIT_OUTER_LOOP_2 m = (fortran_int) dimensions[0]; safe_m = m; - matrix_size = safe_m * safe_m * sizeof(@typ@); + matrix_size = safe_m * safe_m * sizeof(typ); pivot_size = safe_m * sizeof(fortran_int); tmp_buff = (npy_uint8 *)malloc(matrix_size + pivot_size); if (tmp_buff) { LINEARIZE_DATA_t lin_data; - @typ@ sign; - @basetyp@ logdet; + typ sign; + basetyp logdet; /* swapped steps to get matrix in FORTRAN order */ init_linearize_data(&lin_data, m, m, steps[1], steps[0]); BEGIN_OUTER_LOOP_2 - linearize_@TYPE@_matrix(tmp_buff, args[0], &lin_data); - @TYPE@_slogdet_single_element(m, - tmp_buff, + linearize_matrix((typ*)tmp_buff, (typ*)args[0], &lin_data); + slogdet_single_element(m, + (typ*)tmp_buff, (fortran_int*)(tmp_buff + matrix_size), &sign, &logdet); - *(@typ@ *)args[1] = @TYPE@_det_from_slogdet(sign, logdet); + *(typ *)args[1] = det_from_slogdet(sign, logdet); END_OUTER_LOOP free(tmp_buff); } } -/**end repeat**/ /* -------------------------------------------------------------------------- */ /* Eigh family */ -typedef struct eigh_params_struct { - void *A; /* matrix */ - void *W; /* eigenvalue vector */ - void *WORK; /* main work buffer */ - void *RWORK; /* secondary work buffer (for complex versions) */ - void *IWORK; +template<typename typ> +struct EIGH_PARAMS_t { + typ *A; /* matrix */ + basetype_t<typ> *W; /* eigenvalue vector */ + typ *WORK; /* main work buffer */ + basetype_t<typ> *RWORK; /* secondary work buffer (for complex versions) */ + fortran_int *IWORK; fortran_int N; fortran_int LWORK; fortran_int LRWORK; @@ -1211,34 +1246,40 @@ typedef struct eigh_params_struct { char JOBZ; char UPLO; fortran_int LDA; -} EIGH_PARAMS_t; - -/**begin repeat - #TYPE = FLOAT, DOUBLE# - #typ = npy_float, npy_double# - #ftyp = fortran_real, fortran_doublereal# - #lapack_func = ssyevd, dsyevd# -*/ +} ; static NPY_INLINE fortran_int -call_@lapack_func@(EIGH_PARAMS_t *params) +call_evd(EIGH_PARAMS_t<npy_float> *params) +{ + fortran_int rv; + LAPACK(ssyevd)(¶ms->JOBZ, ¶ms->UPLO, ¶ms->N, + params->A, ¶ms->LDA, params->W, + params->WORK, ¶ms->LWORK, + params->IWORK, ¶ms->LIWORK, + &rv); + return rv; +} +static NPY_INLINE fortran_int +call_evd(EIGH_PARAMS_t<npy_double> *params) { fortran_int rv; - LAPACK(@lapack_func@)(¶ms->JOBZ, ¶ms->UPLO, ¶ms->N, - (@typ@*)params->A, ¶ms->LDA, (@typ@*)params->W, - (@typ@*)params->WORK, ¶ms->LWORK, - (fortran_int*)params->IWORK, (fortran_int*)¶ms->LIWORK, + LAPACK(dsyevd)(¶ms->JOBZ, ¶ms->UPLO, ¶ms->N, + params->A, ¶ms->LDA, params->W, + params->WORK, ¶ms->LWORK, + params->IWORK, ¶ms->LIWORK, &rv); return rv; } + /* * Initialize the parameters to use in for the lapack function _syevd * Handles buffer allocation */ +template<typename typ> static NPY_INLINE int -init_@lapack_func@(EIGH_PARAMS_t* params, char JOBZ, char UPLO, - fortran_int N) +init_evd(EIGH_PARAMS_t<typ>* params, char JOBZ, char UPLO, + fortran_int N, scalar_trait) { npy_uint8 *mem_buff = NULL; npy_uint8 *mem_buff2 = NULL; @@ -1246,7 +1287,7 @@ init_@lapack_func@(EIGH_PARAMS_t* params, char JOBZ, char UPLO, fortran_int liwork; npy_uint8 *a, *w, *work, *iwork; size_t safe_N = N; - size_t alloc_size = safe_N * (safe_N + 1) * sizeof(@typ@); + size_t alloc_size = safe_N * (safe_N + 1) * sizeof(typ); fortran_int lda = fortran_int_max(N, 1); mem_buff = (npy_uint8 *)malloc(alloc_size); @@ -1255,10 +1296,10 @@ init_@lapack_func@(EIGH_PARAMS_t* params, char JOBZ, char UPLO, goto error; } a = mem_buff; - w = mem_buff + safe_N * safe_N * sizeof(@typ@); + w = mem_buff + safe_N * safe_N * sizeof(typ); - params->A = a; - params->W = w; + params->A = (typ*)a; + params->W = (typ*)w; params->RWORK = NULL; /* unused */ params->N = N; params->LRWORK = 0; /* unused */ @@ -1268,7 +1309,7 @@ init_@lapack_func@(EIGH_PARAMS_t* params, char JOBZ, char UPLO, /* Work size query */ { - @typ@ query_work_size; + typ query_work_size; fortran_int query_iwork_size; params->LWORK = -1; @@ -1276,7 +1317,7 @@ init_@lapack_func@(EIGH_PARAMS_t* params, char JOBZ, char UPLO, params->WORK = &query_work_size; params->IWORK = &query_iwork_size; - if (call_@lapack_func@(params) != 0) { + if (call_evd(params) != 0) { goto error; } @@ -1284,18 +1325,18 @@ init_@lapack_func@(EIGH_PARAMS_t* params, char JOBZ, char UPLO, liwork = query_iwork_size; } - mem_buff2 = (npy_uint8 *)malloc(lwork*sizeof(@typ@) + liwork*sizeof(fortran_int)); + mem_buff2 = (npy_uint8 *)malloc(lwork*sizeof(typ) + liwork*sizeof(fortran_int)); if (!mem_buff2) { goto error; } work = mem_buff2; - iwork = mem_buff2 + lwork*sizeof(@typ@); + iwork = mem_buff2 + lwork*sizeof(typ); params->LWORK = lwork; - params->WORK = work; + params->WORK = (typ*)work; params->LIWORK = liwork; - params->IWORK = iwork; + params->IWORK = (fortran_int*)iwork; return 1; @@ -1307,40 +1348,44 @@ init_@lapack_func@(EIGH_PARAMS_t* params, char JOBZ, char UPLO, return 0; } -/**end repeat**/ -/**begin repeat - #TYPE = CFLOAT, CDOUBLE# - #typ = npy_cfloat, npy_cdouble# - #basetyp = npy_float, npy_double# - #ftyp = fortran_complex, fortran_doublecomplex# - #fbasetyp = fortran_real, fortran_doublereal# - #lapack_func = cheevd, zheevd# -*/ static NPY_INLINE fortran_int -call_@lapack_func@(EIGH_PARAMS_t *params) +call_evd(EIGH_PARAMS_t<npy_cfloat> *params) { fortran_int rv; - LAPACK(@lapack_func@)(¶ms->JOBZ, ¶ms->UPLO, ¶ms->N, - (@ftyp@*)params->A, ¶ms->LDA, (@fbasetyp@*)params->W, - (@ftyp@*)params->WORK, ¶ms->LWORK, - (@fbasetyp@*)params->RWORK, ¶ms->LRWORK, - (fortran_int*)params->IWORK, (fortran_int*)¶ms->LIWORK, + LAPACK(cheevd)(¶ms->JOBZ, ¶ms->UPLO, ¶ms->N, + (fortran_type_t<npy_cfloat>*)params->A, ¶ms->LDA, params->W, + (fortran_type_t<npy_cfloat>*)params->WORK, ¶ms->LWORK, + params->RWORK, ¶ms->LRWORK, + params->IWORK, ¶ms->LIWORK, &rv); return rv; } -/* - * Initialize the parameters to use in for the lapack function _heev - * Handles buffer allocation - */ +static NPY_INLINE fortran_int +call_evd(EIGH_PARAMS_t<npy_cdouble> *params) +{ + fortran_int rv; + LAPACK(zheevd)(¶ms->JOBZ, ¶ms->UPLO, ¶ms->N, + (fortran_type_t<npy_cdouble>*)params->A, ¶ms->LDA, params->W, + (fortran_type_t<npy_cdouble>*)params->WORK, ¶ms->LWORK, + params->RWORK, ¶ms->LRWORK, + params->IWORK, ¶ms->LIWORK, + &rv); + return rv; +} + +template<typename typ> static NPY_INLINE int -init_@lapack_func@(EIGH_PARAMS_t *params, +init_evd(EIGH_PARAMS_t<typ> *params, char JOBZ, char UPLO, - fortran_int N) + fortran_int N, complex_trait) { + using basetyp = basetype_t<typ>; +using ftyp = fortran_type_t<typ>; +using fbasetyp = fortran_type_t<basetyp>; npy_uint8 *mem_buff = NULL; npy_uint8 *mem_buff2 = NULL; fortran_int lwork; @@ -1350,16 +1395,16 @@ init_@lapack_func@(EIGH_PARAMS_t *params, size_t safe_N = N; fortran_int lda = fortran_int_max(N, 1); - mem_buff = (npy_uint8 *)malloc(safe_N * safe_N * sizeof(@typ@) + - safe_N * sizeof(@basetyp@)); + mem_buff = (npy_uint8 *)malloc(safe_N * safe_N * sizeof(typ) + + safe_N * sizeof(basetyp)); if (!mem_buff) { goto error; } a = mem_buff; - w = mem_buff + safe_N * safe_N * sizeof(@typ@); + w = mem_buff + safe_N * safe_N * sizeof(typ); - params->A = a; - params->W = w; + params->A = (typ*)a; + params->W = (basetyp*)w; params->N = N; params->JOBZ = JOBZ; params->UPLO = UPLO; @@ -1367,40 +1412,40 @@ init_@lapack_func@(EIGH_PARAMS_t *params, /* Work size query */ { - @ftyp@ query_work_size; - @fbasetyp@ query_rwork_size; + ftyp query_work_size; + fbasetyp query_rwork_size; fortran_int query_iwork_size; params->LWORK = -1; params->LRWORK = -1; params->LIWORK = -1; - params->WORK = &query_work_size; - params->RWORK = &query_rwork_size; + params->WORK = (typ*)&query_work_size; + params->RWORK = (basetyp*)&query_rwork_size; params->IWORK = &query_iwork_size; - if (call_@lapack_func@(params) != 0) { + if (call_evd(params) != 0) { goto error; } - lwork = (fortran_int)*(@fbasetyp@*)&query_work_size; + lwork = (fortran_int)*(fbasetyp*)&query_work_size; lrwork = (fortran_int)query_rwork_size; liwork = query_iwork_size; } - mem_buff2 = (npy_uint8 *)malloc(lwork*sizeof(@typ@) + - lrwork*sizeof(@basetyp@) + + mem_buff2 = (npy_uint8 *)malloc(lwork*sizeof(typ) + + lrwork*sizeof(basetyp) + liwork*sizeof(fortran_int)); if (!mem_buff2) { goto error; } work = mem_buff2; - rwork = work + lwork*sizeof(@typ@); - iwork = rwork + lrwork*sizeof(@basetyp@); + rwork = work + lwork*sizeof(typ); + iwork = rwork + lrwork*sizeof(basetyp); - params->WORK = work; - params->RWORK = rwork; - params->IWORK = iwork; + params->WORK = (typ*)work; + params->RWORK = (basetyp*)rwork; + params->IWORK = (fortran_int*)iwork; params->LWORK = lwork; params->LRWORK = lrwork; params->LIWORK = liwork; @@ -1415,16 +1460,7 @@ error: return 0; } -/**end repeat**/ - -/**begin repeat - #TYPE = FLOAT, DOUBLE, CFLOAT, CDOUBLE# - #BASETYPE = FLOAT, DOUBLE, FLOAT, DOUBLE# - #typ = npy_float, npy_double, npy_cfloat, npy_cdouble# - #basetyp = npy_float, npy_double, npy_float, npy_double# - #lapack_func = ssyevd, dsyevd, cheevd, zheevd# -**/ /* * (M, M)->(M,)(M, M) * dimensions[1] -> M @@ -1433,8 +1469,9 @@ error: * args[2] -> A[out] */ +template<typename typ> static NPY_INLINE void -release_@lapack_func@(EIGH_PARAMS_t *params) +release_evd(EIGH_PARAMS_t<typ> *params) { /* allocated memory in A and WORK */ free(params->A); @@ -1443,18 +1480,20 @@ release_@lapack_func@(EIGH_PARAMS_t *params) } +template<typename typ> static NPY_INLINE void -@TYPE@_eigh_wrapper(char JOBZ, +eigh_wrapper(char JOBZ, char UPLO, char**args, npy_intp const *dimensions, npy_intp const *steps) { + using basetyp = basetype_t<typ>; ptrdiff_t outer_steps[3]; size_t iter; size_t outer_dim = *dimensions++; size_t op_count = (JOBZ=='N')?2:3; - EIGH_PARAMS_t eigh_params; + EIGH_PARAMS_t<typ> eigh_params; int error_occurred = get_fp_invalid_and_clear(); for (iter = 0; iter < op_count; ++iter) { @@ -1462,10 +1501,10 @@ static NPY_INLINE void } steps += op_count; - if (init_@lapack_func@(&eigh_params, + if (init_evd(&eigh_params, JOBZ, UPLO, - (fortran_int)dimensions[0])) { + (fortran_int)dimensions[0], dispatch_scalar<typ>())) { LINEARIZE_DATA_t matrix_in_ld; LINEARIZE_DATA_t eigenvectors_out_ld; LINEARIZE_DATA_t eigenvalues_out_ld; @@ -1485,137 +1524,167 @@ static NPY_INLINE void for (iter = 0; iter < outer_dim; ++iter) { int not_ok; /* copy the matrix in */ - linearize_@TYPE@_matrix(eigh_params.A, args[0], &matrix_in_ld); - not_ok = call_@lapack_func@(&eigh_params); + linearize_matrix((typ*)eigh_params.A, (typ*)args[0], &matrix_in_ld); + not_ok = call_evd(&eigh_params); if (!not_ok) { /* lapack ok, copy result out */ - delinearize_@BASETYPE@_matrix(args[1], - eigh_params.W, + delinearize_matrix((basetyp*)args[1], + (basetyp*)eigh_params.W, &eigenvalues_out_ld); if ('V' == eigh_params.JOBZ) { - delinearize_@TYPE@_matrix(args[2], - eigh_params.A, + delinearize_matrix((typ*)args[2], + (typ*)eigh_params.A, &eigenvectors_out_ld); } } else { /* lapack fail, set result to nan */ error_occurred = 1; - nan_@BASETYPE@_matrix(args[1], &eigenvalues_out_ld); + nan_matrix((basetyp*)args[1], &eigenvalues_out_ld); if ('V' == eigh_params.JOBZ) { - nan_@TYPE@_matrix(args[2], &eigenvectors_out_ld); + nan_matrix((typ*)args[2], &eigenvectors_out_ld); } } update_pointers((npy_uint8**)args, outer_steps, op_count); } - release_@lapack_func@(&eigh_params); + release_evd(&eigh_params); } set_fp_invalid_or_clear(error_occurred); } -/**end repeat**/ -/**begin repeat - #TYPE = FLOAT, DOUBLE, CFLOAT, CDOUBLE# - */ +template<typename typ> static void -@TYPE@_eighlo(char **args, +eighlo(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { - @TYPE@_eigh_wrapper('V', 'L', args, dimensions, steps); + eigh_wrapper<typ>('V', 'L', args, dimensions, steps); } +template<typename typ> static void -@TYPE@_eighup(char **args, +eighup(char **args, npy_intp const *dimensions, npy_intp const *steps, void* NPY_UNUSED(func)) { - @TYPE@_eigh_wrapper('V', 'U', args, dimensions, steps); + eigh_wrapper<typ>('V', 'U', args, dimensions, steps); } +template<typename typ> static void -@TYPE@_eigvalshlo(char **args, +eigvalshlo(char **args, npy_intp const *dimensions, npy_intp const *steps, void* NPY_UNUSED(func)) { - @TYPE@_eigh_wrapper('N', 'L', args, dimensions, steps); + eigh_wrapper<typ>('N', 'L', args, dimensions, steps); } +template<typename typ> static void -@TYPE@_eigvalshup(char **args, +eigvalshup(char **args, npy_intp const *dimensions, npy_intp const *steps, void* NPY_UNUSED(func)) { - @TYPE@_eigh_wrapper('N', 'U', args, dimensions, steps); + eigh_wrapper<typ>('N', 'U', args, dimensions, steps); } -/**end repeat**/ /* -------------------------------------------------------------------------- */ /* Solve family (includes inv) */ -typedef struct gesv_params_struct +template<typename typ> +struct GESV_PARAMS_t { - void *A; /* A is (N, N) of base type */ - void *B; /* B is (N, NRHS) of base type */ + typ *A; /* A is (N, N) of base type */ + typ *B; /* B is (N, NRHS) of base type */ fortran_int * IPIV; /* IPIV is (N) */ fortran_int N; fortran_int NRHS; fortran_int LDA; fortran_int LDB; -} GESV_PARAMS_t; - -/**begin repeat - #TYPE = FLOAT, DOUBLE, CFLOAT, CDOUBLE# - #typ = npy_float, npy_double, npy_cfloat, npy_cdouble# - #ftyp = fortran_real, fortran_doublereal, - fortran_complex, fortran_doublecomplex# - #lapack_func = sgesv, dgesv, cgesv, zgesv# -*/ +}; + +static NPY_INLINE fortran_int +call_gesv(GESV_PARAMS_t<fortran_real> *params) +{ + fortran_int rv; + LAPACK(sgesv)(¶ms->N, ¶ms->NRHS, + params->A, ¶ms->LDA, + params->IPIV, + params->B, ¶ms->LDB, + &rv); + return rv; +} + +static NPY_INLINE fortran_int +call_gesv(GESV_PARAMS_t<fortran_doublereal> *params) +{ + fortran_int rv; + LAPACK(dgesv)(¶ms->N, ¶ms->NRHS, + params->A, ¶ms->LDA, + params->IPIV, + params->B, ¶ms->LDB, + &rv); + return rv; +} + +static NPY_INLINE fortran_int +call_gesv(GESV_PARAMS_t<fortran_complex> *params) +{ + fortran_int rv; + LAPACK(cgesv)(¶ms->N, ¶ms->NRHS, + params->A, ¶ms->LDA, + params->IPIV, + params->B, ¶ms->LDB, + &rv); + return rv; +} static NPY_INLINE fortran_int -call_@lapack_func@(GESV_PARAMS_t *params) +call_gesv(GESV_PARAMS_t<fortran_doublecomplex> *params) { fortran_int rv; - LAPACK(@lapack_func@)(¶ms->N, ¶ms->NRHS, - (@ftyp@*)params->A, ¶ms->LDA, + LAPACK(zgesv)(¶ms->N, ¶ms->NRHS, + params->A, ¶ms->LDA, params->IPIV, - (@ftyp@*)params->B, ¶ms->LDB, + params->B, ¶ms->LDB, &rv); return rv; } + /* * Initialize the parameters to use in for the lapack function _heev * Handles buffer allocation */ +template<typename ftyp> static NPY_INLINE int -init_@lapack_func@(GESV_PARAMS_t *params, fortran_int N, fortran_int NRHS) +init_gesv(GESV_PARAMS_t<ftyp> *params, fortran_int N, fortran_int NRHS) { npy_uint8 *mem_buff = NULL; npy_uint8 *a, *b, *ipiv; size_t safe_N = N; size_t safe_NRHS = NRHS; fortran_int ld = fortran_int_max(N, 1); - mem_buff = (npy_uint8 *)malloc(safe_N * safe_N * sizeof(@ftyp@) + - safe_N * safe_NRHS*sizeof(@ftyp@) + + mem_buff = (npy_uint8 *)malloc(safe_N * safe_N * sizeof(ftyp) + + safe_N * safe_NRHS*sizeof(ftyp) + safe_N * sizeof(fortran_int)); if (!mem_buff) { goto error; } a = mem_buff; - b = a + safe_N * safe_N * sizeof(@ftyp@); - ipiv = b + safe_N * safe_NRHS * sizeof(@ftyp@); + b = a + safe_N * safe_N * sizeof(ftyp); + ipiv = b + safe_N * safe_NRHS * sizeof(ftyp); - params->A = a; - params->B = b; + params->A = (ftyp*)a; + params->B = (ftyp*)b; params->IPIV = (fortran_int*)ipiv; params->N = N; params->NRHS = NRHS; @@ -1630,26 +1699,29 @@ init_@lapack_func@(GESV_PARAMS_t *params, fortran_int N, fortran_int NRHS) return 0; } +template<typename ftyp> static NPY_INLINE void -release_@lapack_func@(GESV_PARAMS_t *params) +release_gesv(GESV_PARAMS_t<ftyp> *params) { /* memory block base is in A */ free(params->A); memset(params, 0, sizeof(*params)); } +template<typename typ> static void -@TYPE@_solve(char **args, npy_intp const *dimensions, npy_intp const *steps, +solve(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { - GESV_PARAMS_t params; +using ftyp = fortran_type_t<typ>; + GESV_PARAMS_t<ftyp> params; fortran_int n, nrhs; int error_occurred = get_fp_invalid_and_clear(); INIT_OUTER_LOOP_3 n = (fortran_int)dimensions[0]; nrhs = (fortran_int)dimensions[1]; - if (init_@lapack_func@(¶ms, n, nrhs)) { + if (init_gesv(¶ms, n, nrhs)) { LINEARIZE_DATA_t a_in, b_in, r_out; init_linearize_data(&a_in, n, n, steps[1], steps[0]); @@ -1658,34 +1730,37 @@ static void BEGIN_OUTER_LOOP_3 int not_ok; - linearize_@TYPE@_matrix(params.A, args[0], &a_in); - linearize_@TYPE@_matrix(params.B, args[1], &b_in); - not_ok =call_@lapack_func@(¶ms); + linearize_matrix((typ*)params.A, (typ*)args[0], &a_in); + linearize_matrix((typ*)params.B, (typ*)args[1], &b_in); + not_ok =call_gesv(¶ms); if (!not_ok) { - delinearize_@TYPE@_matrix(args[2], params.B, &r_out); + delinearize_matrix((typ*)args[2], (typ*)params.B, &r_out); } else { error_occurred = 1; - nan_@TYPE@_matrix(args[2], &r_out); + nan_matrix((typ*)args[2], &r_out); } END_OUTER_LOOP - release_@lapack_func@(¶ms); + release_gesv(¶ms); } set_fp_invalid_or_clear(error_occurred); } + +template<typename typ> static void -@TYPE@_solve1(char **args, npy_intp const *dimensions, npy_intp const *steps, +solve1(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { - GESV_PARAMS_t params; +using ftyp = fortran_type_t<typ>; + GESV_PARAMS_t<ftyp> params; int error_occurred = get_fp_invalid_and_clear(); fortran_int n; INIT_OUTER_LOOP_3 n = (fortran_int)dimensions[0]; - if (init_@lapack_func@(¶ms, n, 1)) { + if (init_gesv(¶ms, n, 1)) { LINEARIZE_DATA_t a_in, b_in, r_out; init_linearize_data(&a_in, n, n, steps[1], steps[0]); init_linearize_data(&b_in, 1, n, 1, steps[2]); @@ -1693,105 +1768,130 @@ static void BEGIN_OUTER_LOOP_3 int not_ok; - linearize_@TYPE@_matrix(params.A, args[0], &a_in); - linearize_@TYPE@_matrix(params.B, args[1], &b_in); - not_ok = call_@lapack_func@(¶ms); + linearize_matrix((typ*)params.A, (typ*)args[0], &a_in); + linearize_matrix((typ*)params.B, (typ*)args[1], &b_in); + not_ok = call_gesv(¶ms); if (!not_ok) { - delinearize_@TYPE@_matrix(args[2], params.B, &r_out); + delinearize_matrix((typ*)args[2], (typ*)params.B, &r_out); } else { error_occurred = 1; - nan_@TYPE@_matrix(args[2], &r_out); + nan_matrix((typ*)args[2], &r_out); } END_OUTER_LOOP - release_@lapack_func@(¶ms); + release_gesv(¶ms); } set_fp_invalid_or_clear(error_occurred); } +template<typename typ> static void -@TYPE@_inv(char **args, npy_intp const *dimensions, npy_intp const *steps, +inv(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { - GESV_PARAMS_t params; +using ftyp = fortran_type_t<typ>; + GESV_PARAMS_t<ftyp> params; fortran_int n; int error_occurred = get_fp_invalid_and_clear(); INIT_OUTER_LOOP_2 n = (fortran_int)dimensions[0]; - if (init_@lapack_func@(¶ms, n, n)) { + if (init_gesv(¶ms, n, n)) { LINEARIZE_DATA_t a_in, r_out; init_linearize_data(&a_in, n, n, steps[1], steps[0]); init_linearize_data(&r_out, n, n, steps[3], steps[2]); BEGIN_OUTER_LOOP_2 int not_ok; - linearize_@TYPE@_matrix(params.A, args[0], &a_in); - identity_@TYPE@_matrix(params.B, n); - not_ok = call_@lapack_func@(¶ms); + linearize_matrix((typ*)params.A, (typ*)args[0], &a_in); + identity_matrix((typ*)params.B, n); + not_ok = call_gesv(¶ms); if (!not_ok) { - delinearize_@TYPE@_matrix(args[1], params.B, &r_out); + delinearize_matrix((typ*)args[1], (typ*)params.B, &r_out); } else { error_occurred = 1; - nan_@TYPE@_matrix(args[1], &r_out); + nan_matrix((typ*)args[1], &r_out); } END_OUTER_LOOP - release_@lapack_func@(¶ms); + release_gesv(¶ms); } set_fp_invalid_or_clear(error_occurred); } -/**end repeat**/ - /* -------------------------------------------------------------------------- */ /* Cholesky decomposition */ -typedef struct potr_params_struct +template<typename typ> +struct POTR_PARAMS_t { - void *A; + typ *A; fortran_int N; fortran_int LDA; char UPLO; -} POTR_PARAMS_t; +}; -/**begin repeat - #TYPE = FLOAT, DOUBLE, CFLOAT, CDOUBLE# - #ftyp = fortran_real, fortran_doublereal, - fortran_complex, fortran_doublecomplex# - #lapack_func = spotrf, dpotrf, cpotrf, zpotrf# - */ +static NPY_INLINE fortran_int +call_potrf(POTR_PARAMS_t<fortran_real> *params) +{ + fortran_int rv; + LAPACK(spotrf)(¶ms->UPLO, + ¶ms->N, params->A, ¶ms->LDA, + &rv); + return rv; +} + +static NPY_INLINE fortran_int +call_potrf(POTR_PARAMS_t<fortran_doublereal> *params) +{ + fortran_int rv; + LAPACK(dpotrf)(¶ms->UPLO, + ¶ms->N, params->A, ¶ms->LDA, + &rv); + return rv; +} static NPY_INLINE fortran_int -call_@lapack_func@(POTR_PARAMS_t *params) +call_potrf(POTR_PARAMS_t<fortran_complex> *params) { fortran_int rv; - LAPACK(@lapack_func@)(¶ms->UPLO, - ¶ms->N, (@ftyp@*)params->A, ¶ms->LDA, + LAPACK(cpotrf)(¶ms->UPLO, + ¶ms->N, params->A, ¶ms->LDA, &rv); return rv; } +static NPY_INLINE fortran_int +call_potrf(POTR_PARAMS_t<fortran_doublecomplex> *params) +{ + fortran_int rv; + LAPACK(zpotrf)(¶ms->UPLO, + ¶ms->N, params->A, ¶ms->LDA, + &rv); + return rv; +} + +template<typename ftyp> static NPY_INLINE int -init_@lapack_func@(POTR_PARAMS_t *params, char UPLO, fortran_int N) +init_potrf(POTR_PARAMS_t<ftyp> *params, char UPLO, fortran_int N) { npy_uint8 *mem_buff = NULL; npy_uint8 *a; size_t safe_N = N; fortran_int lda = fortran_int_max(N, 1); - mem_buff = (npy_uint8 *)malloc(safe_N * safe_N * sizeof(@ftyp@)); + mem_buff = (npy_uint8 *)malloc(safe_N * safe_N * sizeof(ftyp)); if (!mem_buff) { goto error; } a = mem_buff; - params->A = a; + params->A = (ftyp*)a; params->N = N; params->LDA = lda; params->UPLO = UPLO; @@ -1804,18 +1904,21 @@ init_@lapack_func@(POTR_PARAMS_t *params, char UPLO, fortran_int N) return 0; } +template<typename ftyp> static NPY_INLINE void -release_@lapack_func@(POTR_PARAMS_t *params) +release_potrf(POTR_PARAMS_t<ftyp> *params) { /* memory block base in A */ free(params->A); memset(params, 0, sizeof(*params)); } +template<typename typ> static void -@TYPE@_cholesky(char uplo, char **args, npy_intp const *dimensions, npy_intp const *steps) +cholesky(char uplo, char **args, npy_intp const *dimensions, npy_intp const *steps) { - POTR_PARAMS_t params; + using ftyp = fortran_type_t<typ>; + POTR_PARAMS_t<ftyp> params; int error_occurred = get_fp_invalid_and_clear(); fortran_int n; INIT_OUTER_LOOP_2 @@ -1823,50 +1926,50 @@ static void assert(uplo == 'L'); n = (fortran_int)dimensions[0]; - if (init_@lapack_func@(¶ms, uplo, n)) { + if (init_potrf(¶ms, uplo, n)) { LINEARIZE_DATA_t a_in, r_out; init_linearize_data(&a_in, n, n, steps[1], steps[0]); init_linearize_data(&r_out, n, n, steps[3], steps[2]); BEGIN_OUTER_LOOP_2 int not_ok; - linearize_@TYPE@_matrix(params.A, args[0], &a_in); - not_ok = call_@lapack_func@(¶ms); + linearize_matrix(params.A, (ftyp*)args[0], &a_in); + not_ok = call_potrf(¶ms); if (!not_ok) { - triu_@TYPE@_matrix(params.A, params.N); - delinearize_@TYPE@_matrix(args[1], params.A, &r_out); + triu_matrix(params.A, params.N); + delinearize_matrix((ftyp*)args[1], params.A, &r_out); } else { error_occurred = 1; - nan_@TYPE@_matrix(args[1], &r_out); + nan_matrix((ftyp*)args[1], &r_out); } END_OUTER_LOOP - release_@lapack_func@(¶ms); + release_potrf(¶ms); } set_fp_invalid_or_clear(error_occurred); } +template<typename typ> static void -@TYPE@_cholesky_lo(char **args, npy_intp const *dimensions, npy_intp const *steps, +cholesky_lo(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { - @TYPE@_cholesky('L', args, dimensions, steps); + cholesky<typ>('L', args, dimensions, steps); } -/**end repeat**/ - /* -------------------------------------------------------------------------- */ /* eig family */ -typedef struct geev_params_struct { - void *A; - void *WR; /* RWORK in complex versions, REAL W buffer for (sd)geev*/ - void *WI; - void *VLR; /* REAL VL buffers for _geev where _ is s, d */ - void *VRR; /* REAL VR buffers for _geev where _ is s, d */ - void *WORK; - void *W; /* final w */ - void *VL; /* final vl */ - void *VR; /* final vr */ +template<typename typ> +struct GEEV_PARAMS_t { + typ *A; + basetype_t<typ> *WR; /* RWORK in complex versions, REAL W buffer for (sd)geev*/ + typ *WI; + typ *VLR; /* REAL VL buffers for _geev where _ is s, d */ + typ *VRR; /* REAL VR buffers for _geev where _ is s, d */ + typ *WORK; + typ *W; /* final w */ + typ *VL; /* final vl */ + typ *VR; /* final vr */ fortran_int N; fortran_int LDA; @@ -1876,10 +1979,11 @@ typedef struct geev_params_struct { char JOBVL; char JOBVR; -} GEEV_PARAMS_t; +}; +template<typename typ> static NPY_INLINE void -dump_geev_params(const char *name, GEEV_PARAMS_t* params) +dump_geev_params(const char *name, GEEV_PARAMS_t<typ>* params) { TRACE_TXT("\n%s\n" @@ -1924,41 +2028,49 @@ dump_geev_params(const char *name, GEEV_PARAMS_t* params) "JOBVR", params->JOBVR); } -/**begin repeat - #TYPE = FLOAT, DOUBLE# - #CTYPE = CFLOAT, CDOUBLE# - #typ = float, double# - #complextyp = COMPLEX_t, DOUBLECOMPLEX_t# - #lapack_func = sgeev, dgeev# - #zero = 0.0f, 0.0# -*/ +static NPY_INLINE fortran_int +call_geev(GEEV_PARAMS_t<float>* params) +{ + fortran_int rv; + LAPACK(sgeev)(¶ms->JOBVL, ¶ms->JOBVR, + ¶ms->N, params->A, ¶ms->LDA, + params->WR, params->WI, + params->VLR, ¶ms->LDVL, + params->VRR, ¶ms->LDVR, + params->WORK, ¶ms->LWORK, + &rv); + return rv; +} static NPY_INLINE fortran_int -call_@lapack_func@(GEEV_PARAMS_t* params) +call_geev(GEEV_PARAMS_t<double>* params) { fortran_int rv; - LAPACK(@lapack_func@)(¶ms->JOBVL, ¶ms->JOBVR, - ¶ms->N, (@typ@*)params->A, ¶ms->LDA, - (@typ@*)params->WR, (@typ@*)params->WI, - (@typ@*)params->VLR, ¶ms->LDVL, - (@typ@*)params->VRR, ¶ms->LDVR, - (@typ@*)params->WORK, ¶ms->LWORK, + LAPACK(dgeev)(¶ms->JOBVL, ¶ms->JOBVR, + ¶ms->N, params->A, ¶ms->LDA, + params->WR, params->WI, + params->VLR, ¶ms->LDVL, + params->VRR, ¶ms->LDVR, + params->WORK, ¶ms->LWORK, &rv); return rv; } + +template<typename typ> static NPY_INLINE int -init_@lapack_func@(GEEV_PARAMS_t *params, char jobvl, char jobvr, fortran_int n) +init_geev(GEEV_PARAMS_t<typ> *params, char jobvl, char jobvr, fortran_int n, +scalar_trait) { npy_uint8 *mem_buff = NULL; npy_uint8 *mem_buff2 = NULL; npy_uint8 *a, *wr, *wi, *vlr, *vrr, *work, *w, *vl, *vr; size_t safe_n = n; - size_t a_size = safe_n * safe_n * sizeof(@typ@); - size_t wr_size = safe_n * sizeof(@typ@); - size_t wi_size = safe_n * sizeof(@typ@); - size_t vlr_size = jobvl=='V' ? safe_n * safe_n * sizeof(@typ@) : 0; - size_t vrr_size = jobvr=='V' ? safe_n * safe_n * sizeof(@typ@) : 0; + size_t a_size = safe_n * safe_n * sizeof(typ); + size_t wr_size = safe_n * sizeof(typ); + size_t wi_size = safe_n * sizeof(typ); + size_t vlr_size = jobvl=='V' ? safe_n * safe_n * sizeof(typ) : 0; + size_t vrr_size = jobvr=='V' ? safe_n * safe_n * sizeof(typ) : 0; size_t w_size = wr_size*2; size_t vl_size = vlr_size*2; size_t vr_size = vrr_size*2; @@ -1982,14 +2094,14 @@ init_@lapack_func@(GEEV_PARAMS_t *params, char jobvl, char jobvr, fortran_int n) vl = w + w_size; vr = vl + vl_size; - params->A = a; - params->WR = wr; - params->WI = wi; - params->VLR = vlr; - params->VRR = vrr; - params->W = w; - params->VL = vl; - params->VR = vr; + params->A = (typ*)a; + params->WR = (typ*)wr; + params->WI = (typ*)wi; + params->VLR = (typ*)vlr; + params->VRR = (typ*)vrr; + params->W = (typ*)w; + params->VL = (typ*)vl; + params->VR = (typ*)vr; params->N = n; params->LDA = ld; params->LDVL = ld; @@ -1999,26 +2111,26 @@ init_@lapack_func@(GEEV_PARAMS_t *params, char jobvl, char jobvr, fortran_int n) /* Work size query */ { - @typ@ work_size_query; + typ work_size_query; params->LWORK = -1; params->WORK = &work_size_query; - if (call_@lapack_func@(params) != 0) { + if (call_geev(params) != 0) { goto error; } work_count = (size_t)work_size_query; } - mem_buff2 = (npy_uint8 *)malloc(work_count*sizeof(@typ@)); + mem_buff2 = (npy_uint8 *)malloc(work_count*sizeof(typ)); if (!mem_buff2) { goto error; } work = mem_buff2; params->LWORK = (fortran_int)work_count; - params->WORK = work; + params->WORK = (typ*)work; return 1; error: @@ -2029,42 +2141,45 @@ init_@lapack_func@(GEEV_PARAMS_t *params, char jobvl, char jobvr, fortran_int n) return 0; } +template<typename complextyp, typename typ> static NPY_INLINE void -mk_@TYPE@_complex_array_from_real(@complextyp@ *c, const @typ@ *re, size_t n) +mk_complex_array_from_real(complextyp *c, const typ *re, size_t n) { size_t iter; for (iter = 0; iter < n; ++iter) { - c[iter].array[0] = re[iter]; - c[iter].array[1] = @zero@; + c[iter].r = re[iter]; + c[iter].i = numeric_limits<typ>::zero; } } +template<typename complextyp, typename typ> static NPY_INLINE void -mk_@TYPE@_complex_array(@complextyp@ *c, - const @typ@ *re, - const @typ@ *im, +mk_complex_array(complextyp *c, + const typ *re, + const typ *im, size_t n) { size_t iter; for (iter = 0; iter < n; ++iter) { - c[iter].array[0] = re[iter]; - c[iter].array[1] = im[iter]; + c[iter].r = re[iter]; + c[iter].i = im[iter]; } } +template<typename complextyp, typename typ> static NPY_INLINE void -mk_@TYPE@_complex_array_conjugate_pair(@complextyp@ *c, - const @typ@ *r, +mk_complex_array_conjugate_pair(complextyp *c, + const typ *r, size_t n) { size_t iter; for (iter = 0; iter < n; ++iter) { - @typ@ re = r[iter]; - @typ@ im = r[iter+n]; - c[iter].array[0] = re; - c[iter].array[1] = im; - c[iter+n].array[0] = re; - c[iter+n].array[1] = -im; + typ re = r[iter]; + typ im = r[iter+n]; + c[iter].r = re; + c[iter].i = im; + c[iter+n].r = re; + c[iter+n].i = -im; } } @@ -2075,24 +2190,25 @@ mk_@TYPE@_complex_array_conjugate_pair(@complextyp@ *c, * i is the eigenvalue imaginary part produced by sgeev/zgeev * n is so that the order of the matrix is n by n */ +template<typename complextyp, typename typ> static NPY_INLINE void -mk_@lapack_func@_complex_eigenvectors(@complextyp@ *c, - const @typ@ *r, - const @typ@ *i, +mk_geev_complex_eigenvectors(complextyp *c, + const typ *r, + const typ *i, size_t n) { size_t iter = 0; while (iter < n) { - if (i[iter] == @zero@) { + if (i[iter] == numeric_limits<typ>::zero) { /* eigenvalue was real, eigenvectors as well... */ - mk_@TYPE@_complex_array_from_real(c, r, n); + mk_complex_array_from_real(c, r, n); c += n; r += n; iter ++; } else { /* eigenvalue was complex, generate a pair of eigenvectors */ - mk_@TYPE@_complex_array_conjugate_pair(c, r, n); + mk_complex_array_conjugate_pair(c, r, n); c += 2*n; r += 2*n; iter += 2; @@ -2101,68 +2217,76 @@ mk_@lapack_func@_complex_eigenvectors(@complextyp@ *c, } +template<typename complextyp, typename typ> static NPY_INLINE void -process_@lapack_func@_results(GEEV_PARAMS_t *params) +process_geev_results(GEEV_PARAMS_t<typ> *params, scalar_trait) { /* REAL versions of geev need the results to be translated * into complex versions. This is the way to deal with imaginary * results. In our gufuncs we will always return complex arrays! */ - mk_@TYPE@_complex_array((@complextyp@*)params->W, (@typ@*)params->WR, (@typ@*)params->WI, params->N); + mk_complex_array((complextyp*)params->W, (typ*)params->WR, (typ*)params->WI, params->N); /* handle the eigenvectors */ if ('V' == params->JOBVL) { - mk_@lapack_func@_complex_eigenvectors((@complextyp@*)params->VL, (@typ@*)params->VLR, - (@typ@*)params->WI, params->N); + mk_geev_complex_eigenvectors((complextyp*)params->VL, (typ*)params->VLR, + (typ*)params->WI, params->N); } if ('V' == params->JOBVR) { - mk_@lapack_func@_complex_eigenvectors((@complextyp@*)params->VR, (@typ@*)params->VRR, - (@typ@*)params->WI, params->N); + mk_geev_complex_eigenvectors((complextyp*)params->VR, (typ*)params->VRR, + (typ*)params->WI, params->N); } } -/**end repeat**/ +static NPY_INLINE fortran_int +call_geev(GEEV_PARAMS_t<fortran_complex>* params) +{ + fortran_int rv; -/**begin repeat - #TYPE = CFLOAT, CDOUBLE# - #typ = COMPLEX_t, DOUBLECOMPLEX_t# - #ftyp = fortran_complex, fortran_doublecomplex# - #realtyp = float, double# - #lapack_func = cgeev, zgeev# - */ - + LAPACK(cgeev)(¶ms->JOBVL, ¶ms->JOBVR, + ¶ms->N, params->A, ¶ms->LDA, + params->W, + params->VL, ¶ms->LDVL, + params->VR, ¶ms->LDVR, + params->WORK, ¶ms->LWORK, + params->WR, /* actually RWORK */ + &rv); + return rv; +} static NPY_INLINE fortran_int -call_@lapack_func@(GEEV_PARAMS_t* params) +call_geev(GEEV_PARAMS_t<fortran_doublecomplex>* params) { fortran_int rv; - LAPACK(@lapack_func@)(¶ms->JOBVL, ¶ms->JOBVR, - ¶ms->N, (@ftyp@*)params->A, ¶ms->LDA, - (@ftyp@*)params->W, - (@ftyp@*)params->VL, ¶ms->LDVL, - (@ftyp@*)params->VR, ¶ms->LDVR, - (@ftyp@*)params->WORK, ¶ms->LWORK, - (@realtyp@*)params->WR, /* actually RWORK */ + LAPACK(zgeev)(¶ms->JOBVL, ¶ms->JOBVR, + ¶ms->N, params->A, ¶ms->LDA, + params->W, + params->VL, ¶ms->LDVL, + params->VR, ¶ms->LDVR, + params->WORK, ¶ms->LWORK, + params->WR, /* actually RWORK */ &rv); return rv; } +template<typename ftyp> static NPY_INLINE int -init_@lapack_func@(GEEV_PARAMS_t* params, +init_geev(GEEV_PARAMS_t<ftyp>* params, char jobvl, char jobvr, - fortran_int n) + fortran_int n, complex_trait) { +using realtyp = basetype_t<ftyp>; npy_uint8 *mem_buff = NULL; npy_uint8 *mem_buff2 = NULL; npy_uint8 *a, *w, *vl, *vr, *work, *rwork; size_t safe_n = n; - size_t a_size = safe_n * safe_n * sizeof(@ftyp@); - size_t w_size = safe_n * sizeof(@ftyp@); - size_t vl_size = jobvl=='V'? safe_n * safe_n * sizeof(@ftyp@) : 0; - size_t vr_size = jobvr=='V'? safe_n * safe_n * sizeof(@ftyp@) : 0; - size_t rwork_size = 2 * safe_n * sizeof(@realtyp@); + size_t a_size = safe_n * safe_n * sizeof(ftyp); + size_t w_size = safe_n * sizeof(ftyp); + size_t vl_size = jobvl=='V'? safe_n * safe_n * sizeof(ftyp) : 0; + size_t vr_size = jobvr=='V'? safe_n * safe_n * sizeof(ftyp) : 0; + size_t rwork_size = 2 * safe_n * sizeof(realtyp); size_t work_count = 0; size_t total_size = a_size + w_size + vl_size + vr_size + rwork_size; fortran_int ld = fortran_int_max(n, 1); @@ -2178,14 +2302,14 @@ init_@lapack_func@(GEEV_PARAMS_t* params, vr = vl + vl_size; rwork = vr + vr_size; - params->A = a; - params->WR = rwork; + params->A = (ftyp*)a; + params->WR = (realtyp*)rwork; params->WI = NULL; params->VLR = NULL; params->VRR = NULL; - params->VL = vl; - params->VR = vr; - params->W = w; + params->VL = (ftyp*)vl; + params->VR = (ftyp*)vr; + params->W = (ftyp*)w; params->N = n; params->LDA = ld; params->LDVL = ld; @@ -2195,21 +2319,21 @@ init_@lapack_func@(GEEV_PARAMS_t* params, /* Work size query */ { - @typ@ work_size_query; + ftyp work_size_query; params->LWORK = -1; params->WORK = &work_size_query; - if (call_@lapack_func@(params) != 0) { + if (call_geev(params) != 0) { goto error; } - work_count = (size_t) work_size_query.array[0]; + work_count = (size_t) work_size_query.r; /* Fix a bug in lapack 3.0.0 */ if(work_count == 0) work_count = 1; } - mem_buff2 = (npy_uint8 *)malloc(work_count*sizeof(@ftyp@)); + mem_buff2 = (npy_uint8 *)malloc(work_count*sizeof(ftyp)); if (!mem_buff2) { goto error; } @@ -2217,7 +2341,7 @@ init_@lapack_func@(GEEV_PARAMS_t* params, work = mem_buff2; params->LWORK = (fortran_int)work_count; - params->WORK = work; + params->WORK = (ftyp*)work; return 1; error: @@ -2228,32 +2352,27 @@ init_@lapack_func@(GEEV_PARAMS_t* params, return 0; } - +template<typename complextyp, typename typ> static NPY_INLINE void -process_@lapack_func@_results(GEEV_PARAMS_t *NPY_UNUSED(params)) +process_geev_results(GEEV_PARAMS_t<typ> *NPY_UNUSED(params), complex_trait) { /* nothing to do here, complex versions are ready to copy out */ } -/**end repeat**/ -/**begin repeat - #TYPE = FLOAT, DOUBLE, CDOUBLE# - #COMPLEXTYPE = CFLOAT, CDOUBLE, CDOUBLE# - #ftype = fortran_real, fortran_doublereal, fortran_doublecomplex# - #lapack_func = sgeev, dgeev, zgeev# - */ +template<typename typ> static NPY_INLINE void -release_@lapack_func@(GEEV_PARAMS_t *params) +release_geev(GEEV_PARAMS_t<typ> *params) { free(params->WORK); free(params->A); memset(params, 0, sizeof(*params)); } +template<typename fctype, typename ftype> static NPY_INLINE void -@TYPE@_eig_wrapper(char JOBVL, +eig_wrapper(char JOBVL, char JOBVR, char**args, npy_intp const *dimensions, @@ -2264,7 +2383,7 @@ static NPY_INLINE void size_t outer_dim = *dimensions++; size_t op_count = 2; int error_occurred = get_fp_invalid_and_clear(); - GEEV_PARAMS_t geev_params; + GEEV_PARAMS_t<ftype> geev_params; assert(JOBVL == 'N'); @@ -2277,9 +2396,9 @@ static NPY_INLINE void } steps += op_count; - if (init_@lapack_func@(&geev_params, + if (init_geev(&geev_params, JOBVL, JOBVR, - (fortran_int)dimensions[0])) { + (fortran_int)dimensions[0], dispatch_scalar<ftype>())) { LINEARIZE_DATA_t a_in; LINEARIZE_DATA_t w_out; LINEARIZE_DATA_t vl_out; @@ -2309,78 +2428,81 @@ static NPY_INLINE void int not_ok; char **arg_iter = args; /* copy the matrix in */ - linearize_@TYPE@_matrix(geev_params.A, *arg_iter++, &a_in); - not_ok = call_@lapack_func@(&geev_params); + linearize_matrix((ftype*)geev_params.A, (ftype*)*arg_iter++, &a_in); + not_ok = call_geev(&geev_params); if (!not_ok) { - process_@lapack_func@_results(&geev_params); - delinearize_@COMPLEXTYPE@_matrix(*arg_iter++, - geev_params.W, + process_geev_results<fctype>(&geev_params, +dispatch_scalar<ftype>{}); + delinearize_matrix((fctype*)*arg_iter++, + (fctype*)geev_params.W, &w_out); if ('V' == geev_params.JOBVL) { - delinearize_@COMPLEXTYPE@_matrix(*arg_iter++, - geev_params.VL, + delinearize_matrix((fctype*)*arg_iter++, + (fctype*)geev_params.VL, &vl_out); } if ('V' == geev_params.JOBVR) { - delinearize_@COMPLEXTYPE@_matrix(*arg_iter++, - geev_params.VR, + delinearize_matrix((fctype*)*arg_iter++, + (fctype*)geev_params.VR, &vr_out); } } else { /* geev failed */ error_occurred = 1; - nan_@COMPLEXTYPE@_matrix(*arg_iter++, &w_out); + nan_matrix((fctype*)*arg_iter++, &w_out); if ('V' == geev_params.JOBVL) { - nan_@COMPLEXTYPE@_matrix(*arg_iter++, &vl_out); + nan_matrix((fctype*)*arg_iter++, &vl_out); } if ('V' == geev_params.JOBVR) { - nan_@COMPLEXTYPE@_matrix(*arg_iter++, &vr_out); + nan_matrix((fctype*)*arg_iter++, &vr_out); } } update_pointers((npy_uint8**)args, outer_steps, op_count); } - release_@lapack_func@(&geev_params); + release_geev(&geev_params); } set_fp_invalid_or_clear(error_occurred); } +template<typename fctype, typename ftype> static void -@TYPE@_eig(char **args, +eig(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { - @TYPE@_eig_wrapper('N', 'V', args, dimensions, steps); + eig_wrapper<fctype, ftype>('N', 'V', args, dimensions, steps); } +template<typename fctype, typename ftype> static void -@TYPE@_eigvals(char **args, +eigvals(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { - @TYPE@_eig_wrapper('N', 'N', args, dimensions, steps); + eig_wrapper<fctype, ftype>('N', 'N', args, dimensions, steps); } -/**end repeat**/ /* -------------------------------------------------------------------------- */ /* singular value decomposition */ -typedef struct gessd_params_struct +template<typename ftyp> +struct GESDD_PARAMS_t { - void *A; - void *S; - void *U; - void *VT; - void *WORK; - void *RWORK; - void *IWORK; + ftyp *A; + basetype_t<ftyp> *S; + ftyp *U; + ftyp *VT; + ftyp *WORK; + basetype_t<ftyp> *RWORK; + fortran_int *IWORK; fortran_int M; fortran_int N; @@ -2389,12 +2511,13 @@ typedef struct gessd_params_struct fortran_int LDVT; fortran_int LWORK; char JOBZ; -} GESDD_PARAMS_t; +} ; +template<typename ftyp> static NPY_INLINE void dump_gesdd_params(const char *name, - GESDD_PARAMS_t *params) + GESDD_PARAMS_t<ftyp> *params) { TRACE_TXT("\n%s:\n"\ @@ -2464,43 +2587,51 @@ compute_urows_vtcolumns(char jobz, return 1; } - -/**begin repeat - #TYPE = FLOAT, DOUBLE# - #lapack_func = sgesdd, dgesdd# - #ftyp = fortran_real, fortran_doublereal# - */ - static NPY_INLINE fortran_int -call_@lapack_func@(GESDD_PARAMS_t *params) +call_gesdd(GESDD_PARAMS_t<fortran_real> *params) +{ + fortran_int rv; + LAPACK(sgesdd)(¶ms->JOBZ, ¶ms->M, ¶ms->N, + params->A, ¶ms->LDA, + params->S, + params->U, ¶ms->LDU, + params->VT, ¶ms->LDVT, + params->WORK, ¶ms->LWORK, + (fortran_int*)params->IWORK, + &rv); + return rv; +} +static NPY_INLINE fortran_int +call_gesdd(GESDD_PARAMS_t<fortran_doublereal> *params) { fortran_int rv; - LAPACK(@lapack_func@)(¶ms->JOBZ, ¶ms->M, ¶ms->N, - (@ftyp@*)params->A, ¶ms->LDA, - (@ftyp@*)params->S, - (@ftyp@*)params->U, ¶ms->LDU, - (@ftyp@*)params->VT, ¶ms->LDVT, - (@ftyp@*)params->WORK, ¶ms->LWORK, + LAPACK(dgesdd)(¶ms->JOBZ, ¶ms->M, ¶ms->N, + params->A, ¶ms->LDA, + params->S, + params->U, ¶ms->LDU, + params->VT, ¶ms->LDVT, + params->WORK, ¶ms->LWORK, (fortran_int*)params->IWORK, &rv); return rv; } +template<typename ftyp> static NPY_INLINE int -init_@lapack_func@(GESDD_PARAMS_t *params, +init_gesdd(GESDD_PARAMS_t<ftyp> *params, char jobz, fortran_int m, - fortran_int n) + fortran_int n, scalar_trait) { npy_uint8 *mem_buff = NULL; npy_uint8 *mem_buff2 = NULL; npy_uint8 *a, *s, *u, *vt, *work, *iwork; size_t safe_m = m; size_t safe_n = n; - size_t a_size = safe_m * safe_n * sizeof(@ftyp@); + size_t a_size = safe_m * safe_n * sizeof(ftyp); fortran_int min_m_n = fortran_int_min(m, n); size_t safe_min_m_n = min_m_n; - size_t s_size = safe_min_m_n * sizeof(@ftyp@); + size_t s_size = safe_min_m_n * sizeof(ftyp); fortran_int u_row_count, vt_column_count; size_t safe_u_row_count, safe_vt_column_count; size_t u_size, vt_size; @@ -2516,8 +2647,8 @@ init_@lapack_func@(GESDD_PARAMS_t *params, safe_u_row_count = u_row_count; safe_vt_column_count = vt_column_count; - u_size = safe_u_row_count * safe_m * sizeof(@ftyp@); - vt_size = safe_n * safe_vt_column_count * sizeof(@ftyp@); + u_size = safe_u_row_count * safe_m * sizeof(ftyp); + vt_size = safe_n * safe_vt_column_count * sizeof(ftyp); mem_buff = (npy_uint8 *)malloc(a_size + s_size + u_size + vt_size + iwork_size); @@ -2536,12 +2667,12 @@ init_@lapack_func@(GESDD_PARAMS_t *params, params->M = m; params->N = n; - params->A = a; - params->S = s; - params->U = u; - params->VT = vt; + params->A = (ftyp*)a; + params->S = (ftyp*)s; + params->U = (ftyp*)u; + params->VT = (ftyp*)vt; params->RWORK = NULL; - params->IWORK = iwork; + params->IWORK = (fortran_int*)iwork; params->LDA = ld; params->LDU = ld; params->LDVT = vt_column_count; @@ -2549,19 +2680,19 @@ init_@lapack_func@(GESDD_PARAMS_t *params, /* Work size query */ { - @ftyp@ work_size_query; + ftyp work_size_query; params->LWORK = -1; params->WORK = &work_size_query; - if (call_@lapack_func@(params) != 0) { + if (call_gesdd(params) != 0) { goto error; } work_count = (fortran_int)work_size_query; /* Fix a bug in lapack 3.0.0 */ if(work_count == 0) work_count = 1; - work_size = (size_t)work_count * sizeof(@ftyp@); + work_size = (size_t)work_count * sizeof(ftyp); } mem_buff2 = (npy_uint8 *)malloc(work_size); @@ -2572,7 +2703,7 @@ init_@lapack_func@(GESDD_PARAMS_t *params, work = mem_buff2; params->LWORK = work_count; - params->WORK = work; + params->WORK = (ftyp*)work; return 1; error: @@ -2584,38 +2715,45 @@ init_@lapack_func@(GESDD_PARAMS_t *params, return 0; } -/**end repeat**/ - -/**begin repeat - #TYPE = CFLOAT, CDOUBLE# - #ftyp = fortran_complex, fortran_doublecomplex# - #frealtyp = fortran_real, fortran_doublereal# - #typ = COMPLEX_t, DOUBLECOMPLEX_t# - #lapack_func = cgesdd, zgesdd# - */ - static NPY_INLINE fortran_int -call_@lapack_func@(GESDD_PARAMS_t *params) +call_gesdd(GESDD_PARAMS_t<fortran_complex> *params) { fortran_int rv; - LAPACK(@lapack_func@)(¶ms->JOBZ, ¶ms->M, ¶ms->N, - (@ftyp@*)params->A, ¶ms->LDA, - (@frealtyp@*)params->S, - (@ftyp@*)params->U, ¶ms->LDU, - (@ftyp@*)params->VT, ¶ms->LDVT, - (@ftyp@*)params->WORK, ¶ms->LWORK, - (@frealtyp@*)params->RWORK, - (fortran_int*)params->IWORK, + LAPACK(cgesdd)(¶ms->JOBZ, ¶ms->M, ¶ms->N, + params->A, ¶ms->LDA, + params->S, + params->U, ¶ms->LDU, + params->VT, ¶ms->LDVT, + params->WORK, ¶ms->LWORK, + params->RWORK, + params->IWORK, + &rv); + return rv; +} +static NPY_INLINE fortran_int +call_gesdd(GESDD_PARAMS_t<fortran_doublecomplex> *params) +{ + fortran_int rv; + LAPACK(zgesdd)(¶ms->JOBZ, ¶ms->M, ¶ms->N, + params->A, ¶ms->LDA, + params->S, + params->U, ¶ms->LDU, + params->VT, ¶ms->LDVT, + params->WORK, ¶ms->LWORK, + params->RWORK, + params->IWORK, &rv); return rv; } +template<typename ftyp> static NPY_INLINE int -init_@lapack_func@(GESDD_PARAMS_t *params, +init_gesdd(GESDD_PARAMS_t<ftyp> *params, char jobz, fortran_int m, - fortran_int n) + fortran_int n, complex_trait) { +using frealtyp = basetype_t<ftyp>; npy_uint8 *mem_buff = NULL, *mem_buff2 = NULL; npy_uint8 *a,*s, *u, *vt, *work, *rwork, *iwork; size_t a_size, s_size, u_size, vt_size, work_size, rwork_size, iwork_size; @@ -2634,14 +2772,14 @@ init_@lapack_func@(GESDD_PARAMS_t *params, safe_u_row_count = u_row_count; safe_vt_column_count = vt_column_count; - a_size = safe_m * safe_n * sizeof(@ftyp@); - s_size = safe_min_m_n * sizeof(@frealtyp@); - u_size = safe_u_row_count * safe_m * sizeof(@ftyp@); - vt_size = safe_n * safe_vt_column_count * sizeof(@ftyp@); + a_size = safe_m * safe_n * sizeof(ftyp); + s_size = safe_min_m_n * sizeof(frealtyp); + u_size = safe_u_row_count * safe_m * sizeof(ftyp); + vt_size = safe_n * safe_vt_column_count * sizeof(ftyp); rwork_size = 'N'==jobz? (7 * safe_min_m_n) : (5*safe_min_m_n * safe_min_m_n + 5*safe_min_m_n); - rwork_size *= sizeof(@ftyp@); + rwork_size *= sizeof(ftyp); iwork_size = 8 * safe_min_m_n* sizeof(fortran_int); mem_buff = (npy_uint8 *)malloc(a_size + @@ -2664,12 +2802,12 @@ init_@lapack_func@(GESDD_PARAMS_t *params, /* fix vt_column_count so that it is a valid lapack parameter (0 is not) */ vt_column_count = fortran_int_max(1, vt_column_count); - params->A = a; - params->S = s; - params->U = u; - params->VT = vt; - params->RWORK = rwork; - params->IWORK = iwork; + params->A = (ftyp*)a; + params->S = (frealtyp*)s; + params->U = (ftyp*)u; + params->VT = (ftyp*)vt; + params->RWORK = (frealtyp*)rwork; + params->IWORK = (fortran_int*)iwork; params->M = m; params->N = n; params->LDA = ld; @@ -2679,19 +2817,19 @@ init_@lapack_func@(GESDD_PARAMS_t *params, /* Work size query */ { - @ftyp@ work_size_query; + ftyp work_size_query; params->LWORK = -1; params->WORK = &work_size_query; - if (call_@lapack_func@(params) != 0) { + if (call_gesdd(params) != 0) { goto error; } - work_count = (fortran_int)((@typ@*)&work_size_query)->array[0]; + work_count = (fortran_int)(*(frealtyp*)&work_size_query); /* Fix a bug in lapack 3.0.0 */ if(work_count == 0) work_count = 1; - work_size = (size_t)work_count * sizeof(@ftyp@); + work_size = (size_t)work_count * sizeof(ftyp); } mem_buff2 = (npy_uint8 *)malloc(work_size); @@ -2702,7 +2840,7 @@ init_@lapack_func@(GESDD_PARAMS_t *params, work = mem_buff2; params->LWORK = work_count; - params->WORK = work; + params->WORK = (ftyp*)work; return 1; error: @@ -2713,16 +2851,10 @@ init_@lapack_func@(GESDD_PARAMS_t *params, return 0; } -/**end repeat**/ - -/**begin repeat - #TYPE = FLOAT, DOUBLE, CFLOAT, CDOUBLE# - #REALTYPE = FLOAT, DOUBLE, FLOAT, DOUBLE# - #lapack_func = sgesdd, dgesdd, cgesdd, zgesdd# - */ +template<typename typ> static NPY_INLINE void -release_@lapack_func@(GESDD_PARAMS_t* params) +release_gesdd(GESDD_PARAMS_t<typ>* params) { /* A and WORK contain allocated blocks */ free(params->A); @@ -2730,28 +2862,31 @@ release_@lapack_func@(GESDD_PARAMS_t* params) memset(params, 0, sizeof(*params)); } +template<typename typ> static NPY_INLINE void -@TYPE@_svd_wrapper(char JOBZ, +svd_wrapper(char JOBZ, char **args, npy_intp const *dimensions, npy_intp const *steps) { +using basetyp = basetype_t<typ>; ptrdiff_t outer_steps[4]; int error_occurred = get_fp_invalid_and_clear(); size_t iter; size_t outer_dim = *dimensions++; size_t op_count = (JOBZ=='N')?2:4; - GESDD_PARAMS_t params; + GESDD_PARAMS_t<typ> params; for (iter = 0; iter < op_count; ++iter) { outer_steps[iter] = (ptrdiff_t) steps[iter]; } steps += op_count; - if (init_@lapack_func@(¶ms, - JOBZ, - (fortran_int)dimensions[0], - (fortran_int)dimensions[1])) { + if (init_gesdd(¶ms, + JOBZ, + (fortran_int)dimensions[0], + (fortran_int)dimensions[1], +dispatch_scalar<typ>())) { LINEARIZE_DATA_t a_in, u_out, s_out, v_out; fortran_int min_m_n = params.M < params.N ? params.M : params.N; @@ -2782,97 +2917,95 @@ static NPY_INLINE void for (iter = 0; iter < outer_dim; ++iter) { int not_ok; /* copy the matrix in */ - linearize_@TYPE@_matrix(params.A, args[0], &a_in); - not_ok = call_@lapack_func@(¶ms); + linearize_matrix((typ*)params.A, (typ*)args[0], &a_in); + not_ok = call_gesdd(¶ms); if (!not_ok) { if ('N' == params.JOBZ) { - delinearize_@REALTYPE@_matrix(args[1], params.S, &s_out); + delinearize_matrix((basetyp*)args[1], (basetyp*)params.S, &s_out); } else { if ('A' == params.JOBZ && min_m_n == 0) { /* Lapack has betrayed us and left these uninitialized, * so produce an identity matrix for whichever of u * and v is not empty. */ - identity_@TYPE@_matrix(params.U, params.M); - identity_@TYPE@_matrix(params.VT, params.N); + identity_matrix((typ*)params.U, params.M); + identity_matrix((typ*)params.VT, params.N); } - delinearize_@TYPE@_matrix(args[1], params.U, &u_out); - delinearize_@REALTYPE@_matrix(args[2], params.S, &s_out); - delinearize_@TYPE@_matrix(args[3], params.VT, &v_out); + delinearize_matrix((typ*)args[1], (typ*)params.U, &u_out); + delinearize_matrix((basetyp*)args[2], (basetyp*)params.S, &s_out); + delinearize_matrix((typ*)args[3], (typ*)params.VT, &v_out); } } else { error_occurred = 1; if ('N' == params.JOBZ) { - nan_@REALTYPE@_matrix(args[1], &s_out); + nan_matrix((basetyp*)args[1], &s_out); } else { - nan_@TYPE@_matrix(args[1], &u_out); - nan_@REALTYPE@_matrix(args[2], &s_out); - nan_@TYPE@_matrix(args[3], &v_out); + nan_matrix((typ*)args[1], &u_out); + nan_matrix((basetyp*)args[2], &s_out); + nan_matrix((typ*)args[3], &v_out); } } update_pointers((npy_uint8**)args, outer_steps, op_count); } - release_@lapack_func@(¶ms); + release_gesdd(¶ms); } set_fp_invalid_or_clear(error_occurred); } -/**end repeat*/ -/* svd gufunc entry points */ -/**begin repeat - #TYPE = FLOAT, DOUBLE, CFLOAT, CDOUBLE# - */ +template<typename typ> static void -@TYPE@_svd_N(char **args, +svd_N(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { - @TYPE@_svd_wrapper('N', args, dimensions, steps); + svd_wrapper<fortran_type_t<typ>>('N', args, dimensions, steps); } +template<typename typ> static void -@TYPE@_svd_S(char **args, +svd_S(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { - @TYPE@_svd_wrapper('S', args, dimensions, steps); + svd_wrapper<fortran_type_t<typ>>('S', args, dimensions, steps); } +template<typename typ> static void -@TYPE@_svd_A(char **args, +svd_A(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { - @TYPE@_svd_wrapper('A', args, dimensions, steps); + svd_wrapper<fortran_type_t<typ>>('A', args, dimensions, steps); } -/**end repeat**/ - /* -------------------------------------------------------------------------- */ /* qr (modes - r, raw) */ -typedef struct geqfr_params_struct +template<typename typ> +struct GEQRF_PARAMS_t { fortran_int M; fortran_int N; - void *A; + typ *A; fortran_int LDA; - void* TAU; - void *WORK; + typ* TAU; + typ *WORK; fortran_int LWORK; -} GEQRF_PARAMS_t; +}; +template<typename typ> static inline void dump_geqrf_params(const char *name, - GEQRF_PARAMS_t *params) + GEQRF_PARAMS_t<typ> *params) { TRACE_TXT("\n%s:\n"\ @@ -2896,35 +3029,36 @@ dump_geqrf_params(const char *name, "LWORK", (int)params->LWORK); } -/**begin repeat - #lapack_func=dgeqrf,zgeqrf# - #typ=double,f2c_doublecomplex# - */ - static inline fortran_int -call_@lapack_func@(GEQRF_PARAMS_t *params) +call_geqrf(GEQRF_PARAMS_t<double> *params) +{ + fortran_int rv; + LAPACK(dgeqrf)(¶ms->M, ¶ms->N, + params->A, ¶ms->LDA, + params->TAU, + params->WORK, ¶ms->LWORK, + &rv); + return rv; +} +static inline fortran_int +call_geqrf(GEQRF_PARAMS_t<f2c_doublecomplex> *params) { fortran_int rv; - LAPACK(@lapack_func@)(¶ms->M, ¶ms->N, - (@typ@*)params->A, ¶ms->LDA, - (@typ@*)params->TAU, - (@typ@*)params->WORK, ¶ms->LWORK, + LAPACK(zgeqrf)(¶ms->M, ¶ms->N, + params->A, ¶ms->LDA, + params->TAU, + params->WORK, ¶ms->LWORK, &rv); return rv; } -/**end repeat**/ -/**begin repeat - #TYPE=DOUBLE# - #lapack_func=dgeqrf# - #ftyp=fortran_doublereal# - */ static inline int -init_@lapack_func@(GEQRF_PARAMS_t *params, +init_geqrf(GEQRF_PARAMS_t<fortran_doublereal> *params, fortran_int m, fortran_int n) { +using ftyp = fortran_doublereal; npy_uint8 *mem_buff = NULL; npy_uint8 *mem_buff2 = NULL; npy_uint8 *a, *tau, *work; @@ -2933,8 +3067,8 @@ init_@lapack_func@(GEQRF_PARAMS_t *params, size_t safe_m = m; size_t safe_n = n; - size_t a_size = safe_m * safe_n * sizeof(@ftyp@); - size_t tau_size = safe_min_m_n * sizeof(@ftyp@); + size_t a_size = safe_m * safe_n * sizeof(ftyp); + size_t tau_size = safe_min_m_n * sizeof(ftyp); fortran_int work_count; size_t work_size; @@ -2952,35 +3086,35 @@ init_@lapack_func@(GEQRF_PARAMS_t *params, params->M = m; params->N = n; - params->A = a; - params->TAU = tau; + params->A = (ftyp*)a; + params->TAU = (ftyp*)tau; params->LDA = lda; { /* compute optimal work size */ - @ftyp@ work_size_query; + ftyp work_size_query; params->WORK = &work_size_query; params->LWORK = -1; - if (call_@lapack_func@(params) != 0) + if (call_geqrf(params) != 0) goto error; - work_count = (fortran_int) *(@ftyp@*) params->WORK; + work_count = (fortran_int) *(ftyp*) params->WORK; } params->LWORK = fortran_int_max(fortran_int_max(1, n), work_count); - work_size = (size_t) params->LWORK * sizeof(@ftyp@); + work_size = (size_t) params->LWORK * sizeof(ftyp); mem_buff2 = (npy_uint8 *)malloc(work_size); if (!mem_buff2) goto error; work = mem_buff2; - params->WORK = work; + params->WORK = (ftyp*)work; return 1; error: @@ -2992,18 +3126,13 @@ init_@lapack_func@(GEQRF_PARAMS_t *params, return 0; } -/**end repeat**/ -/**begin repeat - #TYPE=CDOUBLE# - #lapack_func=zgeqrf# - #ftyp=fortran_doublecomplex# - */ static inline int -init_@lapack_func@(GEQRF_PARAMS_t *params, +init_geqrf(GEQRF_PARAMS_t<fortran_doublecomplex> *params, fortran_int m, fortran_int n) { +using ftyp = fortran_doublecomplex; npy_uint8 *mem_buff = NULL; npy_uint8 *mem_buff2 = NULL; npy_uint8 *a, *tau, *work; @@ -3012,8 +3141,8 @@ init_@lapack_func@(GEQRF_PARAMS_t *params, size_t safe_m = m; size_t safe_n = n; - size_t a_size = safe_m * safe_n * sizeof(@ftyp@); - size_t tau_size = safe_min_m_n * sizeof(@ftyp@); + size_t a_size = safe_m * safe_n * sizeof(ftyp); + size_t tau_size = safe_min_m_n * sizeof(ftyp); fortran_int work_count; size_t work_size; @@ -3031,29 +3160,29 @@ init_@lapack_func@(GEQRF_PARAMS_t *params, params->M = m; params->N = n; - params->A = a; - params->TAU = tau; + params->A = (ftyp*)a; + params->TAU = (ftyp*)tau; params->LDA = lda; { /* compute optimal work size */ - @ftyp@ work_size_query; + ftyp work_size_query; params->WORK = &work_size_query; params->LWORK = -1; - if (call_@lapack_func@(params) != 0) + if (call_geqrf(params) != 0) goto error; - work_count = (fortran_int) ((@ftyp@*)params->WORK)->r; + work_count = (fortran_int) ((ftyp*)params->WORK)->r; } params->LWORK = fortran_int_max(fortran_int_max(1, n), work_count); - work_size = (size_t) params->LWORK * sizeof(@ftyp@); + work_size = (size_t) params->LWORK * sizeof(ftyp); mem_buff2 = (npy_uint8 *)malloc(work_size); if (!mem_buff2) @@ -3061,7 +3190,7 @@ init_@lapack_func@(GEQRF_PARAMS_t *params, work = mem_buff2; - params->WORK = work; + params->WORK = (ftyp*)work; return 1; error: @@ -3073,13 +3202,10 @@ init_@lapack_func@(GEQRF_PARAMS_t *params, return 0; } -/**end repeat**/ -/**begin repeat - #lapack_func=dgeqrf,zgeqrf# - */ +template<typename ftyp> static inline void -release_@lapack_func@(GEQRF_PARAMS_t* params) +release_geqrf(GEQRF_PARAMS_t<ftyp>* params) { /* A and WORK contain allocated blocks */ free(params->A); @@ -3087,22 +3213,14 @@ release_@lapack_func@(GEQRF_PARAMS_t* params) memset(params, 0, sizeof(*params)); } -/**end repeat**/ - -/**begin repeat - #TYPE=DOUBLE,CDOUBLE# - #REALTYPE=DOUBLE,DOUBLE# - #lapack_func=dgeqrf,zgeqrf# - #typ = npy_double,npy_cdouble# - #basetyp = npy_double,npy_double# - #ftyp = fortran_doublereal,fortran_doublecomplex# - #cmplx = 0, 1# - */ +template<typename typ> static void -@TYPE@_qr_r_raw(char **args, npy_intp const *dimensions, npy_intp const *steps, +qr_r_raw(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { - GEQRF_PARAMS_t params; +using ftyp = fortran_type_t<typ>; + + GEQRF_PARAMS_t<ftyp> params; int error_occurred = get_fp_invalid_and_clear(); fortran_int n, m; @@ -3111,7 +3229,7 @@ static void m = (fortran_int)dimensions[0]; n = (fortran_int)dimensions[1]; - if (init_@lapack_func@(¶ms, m, n)) { + if (init_geqrf(¶ms, m, n)) { LINEARIZE_DATA_t a_in, tau_out; init_linearize_data(&a_in, n, m, steps[1], steps[0]); @@ -3119,70 +3237,71 @@ static void BEGIN_OUTER_LOOP_2 int not_ok; - linearize_@TYPE@_matrix(params.A, args[0], &a_in); - not_ok = call_@lapack_func@(¶ms); + linearize_matrix((typ*)params.A, (typ*)args[0], &a_in); + not_ok = call_geqrf(¶ms); if (!not_ok) { - delinearize_@TYPE@_matrix(args[0], params.A, &a_in); - delinearize_@TYPE@_matrix(args[1], params.TAU, &tau_out); + delinearize_matrix((typ*)args[0], (typ*)params.A, &a_in); + delinearize_matrix((typ*)args[1], (typ*)params.TAU, &tau_out); } else { error_occurred = 1; - nan_@TYPE@_matrix(args[1], &tau_out); + nan_matrix((typ*)args[1], &tau_out); } END_OUTER_LOOP - release_@lapack_func@(¶ms); + release_geqrf(¶ms); } set_fp_invalid_or_clear(error_occurred); } -/**end repeat**/ /* -------------------------------------------------------------------------- */ /* qr common code (modes - reduced and complete) */ -typedef struct gqr_params_struct +template<typename typ> +struct GQR_PARAMS_t { fortran_int M; fortran_int MC; fortran_int MN; void* A; - void *Q; + typ *Q; fortran_int LDA; - void* TAU; - void *WORK; + typ* TAU; + typ *WORK; fortran_int LWORK; -} GQR_PARAMS_t; - -/**begin repeat - #lapack_func=dorgqr,zungqr# - #typ=double, f2c_doublecomplex# - */ +} ; static inline fortran_int -call_@lapack_func@(GQR_PARAMS_t *params) +call_gqr(GQR_PARAMS_t<double> *params) { fortran_int rv; - LAPACK(@lapack_func@)(¶ms->M, ¶ms->MC, ¶ms->MN, - (@typ@*)params->Q, ¶ms->LDA, - (@typ@*)params->TAU, - (@typ@*)params->WORK, ¶ms->LWORK, + LAPACK(dorgqr)(¶ms->M, ¶ms->MC, ¶ms->MN, + params->Q, ¶ms->LDA, + params->TAU, + params->WORK, ¶ms->LWORK, + &rv); + return rv; +} +static inline fortran_int +call_gqr(GQR_PARAMS_t<f2c_doublecomplex> *params) +{ + fortran_int rv; + LAPACK(zungqr)(¶ms->M, ¶ms->MC, ¶ms->MN, + params->Q, ¶ms->LDA, + params->TAU, + params->WORK, ¶ms->LWORK, &rv); return rv; } -/**end repeat**/ - -/**begin repeat - #lapack_func=dorgqr# - #ftyp=fortran_doublereal# - */ static inline int -init_@lapack_func@_common(GQR_PARAMS_t *params, +init_gqr_common(GQR_PARAMS_t<fortran_doublereal> *params, fortran_int m, fortran_int n, fortran_int mc) { +using ftyp = fortran_doublereal; npy_uint8 *mem_buff = NULL; npy_uint8 *mem_buff2 = NULL; npy_uint8 *a, *q, *tau, *work; @@ -3191,9 +3310,9 @@ init_@lapack_func@_common(GQR_PARAMS_t *params, size_t safe_min_m_n = min_m_n; size_t safe_m = m; size_t safe_n = n; - size_t a_size = safe_m * safe_n * sizeof(@ftyp@); - size_t q_size = safe_m * safe_mc * sizeof(@ftyp@); - size_t tau_size = safe_min_m_n * sizeof(@ftyp@); + size_t a_size = safe_m * safe_n * sizeof(ftyp); + size_t q_size = safe_m * safe_mc * sizeof(ftyp); + size_t tau_size = safe_min_m_n * sizeof(ftyp); fortran_int work_count; size_t work_size; @@ -3213,27 +3332,27 @@ init_@lapack_func@_common(GQR_PARAMS_t *params, params->MC = mc; params->MN = min_m_n; params->A = a; - params->Q = q; - params->TAU = tau; + params->Q = (ftyp*)q; + params->TAU = (ftyp*)tau; params->LDA = lda; { /* compute optimal work size */ - @ftyp@ work_size_query; + ftyp work_size_query; params->WORK = &work_size_query; params->LWORK = -1; - if (call_@lapack_func@(params) != 0) + if (call_gqr(params) != 0) goto error; - work_count = (fortran_int) *(@ftyp@*) params->WORK; + work_count = (fortran_int) *(ftyp*) params->WORK; } params->LWORK = fortran_int_max(fortran_int_max(1, n), work_count); - work_size = (size_t) params->LWORK * sizeof(@ftyp@); + work_size = (size_t) params->LWORK * sizeof(ftyp); mem_buff2 = (npy_uint8 *)malloc(work_size); if (!mem_buff2) @@ -3241,7 +3360,7 @@ init_@lapack_func@_common(GQR_PARAMS_t *params, work = mem_buff2; - params->WORK = work; + params->WORK = (ftyp*)work; return 1; error: @@ -3253,18 +3372,14 @@ init_@lapack_func@_common(GQR_PARAMS_t *params, return 0; } -/**end repeat**/ -/**begin repeat - #lapack_func=zungqr# - #ftyp=fortran_doublecomplex# - */ static inline int -init_@lapack_func@_common(GQR_PARAMS_t *params, +init_gqr_common(GQR_PARAMS_t<fortran_doublecomplex> *params, fortran_int m, fortran_int n, fortran_int mc) { +using ftyp=fortran_doublecomplex; npy_uint8 *mem_buff = NULL; npy_uint8 *mem_buff2 = NULL; npy_uint8 *a, *q, *tau, *work; @@ -3274,9 +3389,9 @@ init_@lapack_func@_common(GQR_PARAMS_t *params, size_t safe_m = m; size_t safe_n = n; - size_t a_size = safe_m * safe_n * sizeof(@ftyp@); - size_t q_size = safe_m * safe_mc * sizeof(@ftyp@); - size_t tau_size = safe_min_m_n * sizeof(@ftyp@); + size_t a_size = safe_m * safe_n * sizeof(ftyp); + size_t q_size = safe_m * safe_mc * sizeof(ftyp); + size_t tau_size = safe_min_m_n * sizeof(ftyp); fortran_int work_count; size_t work_size; @@ -3296,28 +3411,28 @@ init_@lapack_func@_common(GQR_PARAMS_t *params, params->MC = mc; params->MN = min_m_n; params->A = a; - params->Q = q; - params->TAU = tau; + params->Q = (ftyp*)q; + params->TAU = (ftyp*)tau; params->LDA = lda; { /* compute optimal work size */ - @ftyp@ work_size_query; + ftyp work_size_query; params->WORK = &work_size_query; params->LWORK = -1; - if (call_@lapack_func@(params) != 0) + if (call_gqr(params) != 0) goto error; - work_count = (fortran_int) ((@ftyp@*)params->WORK)->r; + work_count = (fortran_int) ((ftyp*)params->WORK)->r; } params->LWORK = fortran_int_max(fortran_int_max(1, n), work_count); - work_size = (size_t) params->LWORK * sizeof(@ftyp@); + work_size = (size_t) params->LWORK * sizeof(ftyp); mem_buff2 = (npy_uint8 *)malloc(work_size); if (!mem_buff2) @@ -3325,7 +3440,7 @@ init_@lapack_func@_common(GQR_PARAMS_t *params, work = mem_buff2; - params->WORK = work; + params->WORK = (ftyp*)work; params->LWORK = work_count; return 1; @@ -3338,15 +3453,14 @@ init_@lapack_func@_common(GQR_PARAMS_t *params, return 0; } -/**end repeat**/ - /* -------------------------------------------------------------------------- */ /* qr (modes - reduced) */ +template<typename typ> static inline void dump_gqr_params(const char *name, - GQR_PARAMS_t *params) + GQR_PARAMS_t<typ> *params) { TRACE_TXT("\n%s:\n"\ @@ -3372,27 +3486,21 @@ dump_gqr_params(const char *name, "LWORK", (int)params->LWORK); } -/**begin repeat - #lapack_func=dorgqr,zungqr# - #ftyp=fortran_doublereal,fortran_doublecomplex# - */ +template<typename ftyp> static inline int -init_@lapack_func@(GQR_PARAMS_t *params, +init_gqr(GQR_PARAMS_t<ftyp> *params, fortran_int m, fortran_int n) { - return init_@lapack_func@_common( + return init_gqr_common( params, m, n, fortran_int_min(m, n)); } -/**end repeat**/ -/**begin repeat - #lapack_func=dorgqr,zungqr# - */ +template<typename typ> static inline void -release_@lapack_func@(GQR_PARAMS_t* params) +release_gqr(GQR_PARAMS_t<typ>* params) { /* A and WORK contain allocated blocks */ free(params->Q); @@ -3400,22 +3508,13 @@ release_@lapack_func@(GQR_PARAMS_t* params) memset(params, 0, sizeof(*params)); } -/**end repeat**/ - -/**begin repeat - #TYPE=DOUBLE,CDOUBLE# - #REALTYPE=DOUBLE,DOUBLE# - #lapack_func=dorgqr,zungqr# - #typ = npy_double, npy_cdouble# - #basetyp = npy_double, npy_double# - #ftyp = fortran_doublereal,fortran_doublecomplex# - #cmplx = 0, 1# - */ +template<typename typ> static void -@TYPE@_qr_reduced(char **args, npy_intp const *dimensions, npy_intp const *steps, +qr_reduced(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { - GQR_PARAMS_t params; +using ftyp = fortran_type_t<typ>; + GQR_PARAMS_t<ftyp> params; int error_occurred = get_fp_invalid_and_clear(); fortran_int n, m; @@ -3424,7 +3523,7 @@ static void m = (fortran_int)dimensions[0]; n = (fortran_int)dimensions[1]; - if (init_@lapack_func@(¶ms, m, n)) { + if (init_gqr(¶ms, m, n)) { LINEARIZE_DATA_t a_in, tau_in, q_out; init_linearize_data(&a_in, n, m, steps[1], steps[0]); @@ -3433,57 +3532,44 @@ static void BEGIN_OUTER_LOOP_3 int not_ok; - linearize_@TYPE@_matrix(params.A, args[0], &a_in); - linearize_@TYPE@_matrix(params.Q, args[0], &a_in); - linearize_@TYPE@_matrix(params.TAU, args[1], &tau_in); - not_ok = call_@lapack_func@(¶ms); + linearize_matrix((typ*)params.A, (typ*)args[0], &a_in); + linearize_matrix((typ*)params.Q, (typ*)args[0], &a_in); + linearize_matrix((typ*)params.TAU, (typ*)args[1], &tau_in); + not_ok = call_gqr(¶ms); if (!not_ok) { - delinearize_@TYPE@_matrix(args[2], params.Q, &q_out); + delinearize_matrix((typ*)args[2], (typ*)params.Q, &q_out); } else { error_occurred = 1; - nan_@TYPE@_matrix(args[2], &q_out); + nan_matrix((typ*)args[2], &q_out); } END_OUTER_LOOP - release_@lapack_func@(¶ms); + release_gqr(¶ms); } set_fp_invalid_or_clear(error_occurred); } -/**end repeat**/ - /* -------------------------------------------------------------------------- */ /* qr (modes - complete) */ -/**begin repeat - #lapack_func=dorgqr,zungqr# - #ftyp=fortran_doublereal,fortran_doublecomplex# - */ +template<typename ftyp> static inline int -init_@lapack_func@_complete(GQR_PARAMS_t *params, +init_gqr_complete(GQR_PARAMS_t<ftyp> *params, fortran_int m, fortran_int n) { - return init_@lapack_func@_common(params, m, n, m); + return init_gqr_common(params, m, n, m); } -/**end repeat**/ -/**begin repeat - #TYPE=DOUBLE,CDOUBLE# - #REALTYPE=DOUBLE,DOUBLE# - #lapack_func=dorgqr,zungqr# - #typ = npy_double,npy_cdouble# - #basetyp = npy_double,npy_double# - #ftyp = fortran_doublereal,fortran_doublecomplex# - #cmplx = 0, 1# - */ +template<typename typ> static void -@TYPE@_qr_complete(char **args, npy_intp const *dimensions, npy_intp const *steps, +qr_complete(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { - GQR_PARAMS_t params; +using ftyp = fortran_type_t<typ>; + GQR_PARAMS_t<ftyp> params; int error_occurred = get_fp_invalid_and_clear(); fortran_int n, m; @@ -3493,7 +3579,7 @@ static void n = (fortran_int)dimensions[1]; - if (init_@lapack_func@_complete(¶ms, m, n)) { + if (init_gqr_complete(¶ms, m, n)) { LINEARIZE_DATA_t a_in, tau_in, q_out; init_linearize_data(&a_in, n, m, steps[1], steps[0]); @@ -3502,51 +3588,50 @@ static void BEGIN_OUTER_LOOP_3 int not_ok; - linearize_@TYPE@_matrix(params.A, args[0], &a_in); - linearize_@TYPE@_matrix(params.Q, args[0], &a_in); - linearize_@TYPE@_matrix(params.TAU, args[1], &tau_in); - not_ok = call_@lapack_func@(¶ms); + linearize_matrix((typ*)params.A, (typ*)args[0], &a_in); + linearize_matrix((typ*)params.Q, (typ*)args[0], &a_in); + linearize_matrix((typ*)params.TAU, (typ*)args[1], &tau_in); + not_ok = call_gqr(¶ms); if (!not_ok) { - delinearize_@TYPE@_matrix(args[2], params.Q, &q_out); + delinearize_matrix((typ*)args[2], (typ*)params.Q, &q_out); } else { error_occurred = 1; - nan_@TYPE@_matrix(args[2], &q_out); + nan_matrix((typ*)args[2], &q_out); } END_OUTER_LOOP - release_@lapack_func@(¶ms); + release_gqr(¶ms); } set_fp_invalid_or_clear(error_occurred); } -/**end repeat**/ - /* -------------------------------------------------------------------------- */ /* least squares */ -typedef struct gelsd_params_struct +template<typename typ> +struct GELSD_PARAMS_t { fortran_int M; fortran_int N; fortran_int NRHS; - void *A; + typ *A; fortran_int LDA; - void *B; + typ *B; fortran_int LDB; - void *S; - void *RCOND; + basetype_t<typ> *S; + basetype_t<typ> *RCOND; fortran_int RANK; - void *WORK; + typ *WORK; fortran_int LWORK; - void *RWORK; - void *IWORK; -} GELSD_PARAMS_t; - + basetype_t<typ> *RWORK; + fortran_int *IWORK; +}; +template<typename typ> static inline void dump_gelsd_params(const char *name, - GELSD_PARAMS_t *params) + GELSD_PARAMS_t<typ> *params) { TRACE_TXT("\n%s:\n"\ @@ -3587,33 +3672,45 @@ dump_gelsd_params(const char *name, "RCOND", params->RCOND); } +static inline fortran_int +call_gelsd(GELSD_PARAMS_t<fortran_real> *params) +{ + fortran_int rv; + LAPACK(sgelsd)(¶ms->M, ¶ms->N, ¶ms->NRHS, + params->A, ¶ms->LDA, + params->B, ¶ms->LDB, + params->S, + params->RCOND, ¶ms->RANK, + params->WORK, ¶ms->LWORK, + params->IWORK, + &rv); + return rv; +} -/**begin repeat - #TYPE=FLOAT,DOUBLE# - #lapack_func=sgelsd,dgelsd# - #ftyp=fortran_real,fortran_doublereal# - */ static inline fortran_int -call_@lapack_func@(GELSD_PARAMS_t *params) +call_gelsd(GELSD_PARAMS_t<fortran_doublereal> *params) { fortran_int rv; - LAPACK(@lapack_func@)(¶ms->M, ¶ms->N, ¶ms->NRHS, - (@ftyp@*)params->A, ¶ms->LDA, - (@ftyp@*)params->B, ¶ms->LDB, - (@ftyp@*)params->S, - (@ftyp@*)params->RCOND, ¶ms->RANK, - (@ftyp@*)params->WORK, ¶ms->LWORK, - (fortran_int*)params->IWORK, + LAPACK(dgelsd)(¶ms->M, ¶ms->N, ¶ms->NRHS, + params->A, ¶ms->LDA, + params->B, ¶ms->LDB, + params->S, + params->RCOND, ¶ms->RANK, + params->WORK, ¶ms->LWORK, + params->IWORK, &rv); return rv; } + +template<typename ftyp> static inline int -init_@lapack_func@(GELSD_PARAMS_t *params, +init_gelsd(GELSD_PARAMS_t<ftyp> *params, fortran_int m, fortran_int n, - fortran_int nrhs) + fortran_int nrhs, +scalar_trait) { npy_uint8 *mem_buff = NULL; npy_uint8 *mem_buff2 = NULL; @@ -3626,9 +3723,9 @@ init_@lapack_func@(GELSD_PARAMS_t *params, size_t safe_n = n; size_t safe_nrhs = nrhs; - size_t a_size = safe_m * safe_n * sizeof(@ftyp@); - size_t b_size = safe_max_m_n * safe_nrhs * sizeof(@ftyp@); - size_t s_size = safe_min_m_n * sizeof(@ftyp@); + size_t a_size = safe_m * safe_n * sizeof(ftyp); + size_t b_size = safe_max_m_n * safe_nrhs * sizeof(ftyp); + size_t s_size = safe_min_m_n * sizeof(ftyp); fortran_int work_count; size_t work_size; @@ -3649,15 +3746,15 @@ init_@lapack_func@(GELSD_PARAMS_t *params, params->M = m; params->N = n; params->NRHS = nrhs; - params->A = a; - params->B = b; - params->S = s; + params->A = (ftyp*)a; + params->B = (ftyp*)b; + params->S = (ftyp*)s; params->LDA = lda; params->LDB = ldb; { /* compute optimal work size */ - @ftyp@ work_size_query; + ftyp work_size_query; fortran_int iwork_size_query; params->WORK = &work_size_query; @@ -3665,12 +3762,12 @@ init_@lapack_func@(GELSD_PARAMS_t *params, params->RWORK = NULL; params->LWORK = -1; - if (call_@lapack_func@(params) != 0) + if (call_gelsd(params) != 0) goto error; work_count = (fortran_int)work_size_query; - work_size = (size_t) work_size_query * sizeof(@ftyp@); + work_size = (size_t) work_size_query * sizeof(ftyp); iwork_size = (size_t)iwork_size_query * sizeof(fortran_int); } @@ -3681,9 +3778,9 @@ init_@lapack_func@(GELSD_PARAMS_t *params, work = mem_buff2; iwork = work + work_size; - params->WORK = work; + params->WORK = (ftyp*)work; params->RWORK = NULL; - params->IWORK = iwork; + params->IWORK = (fortran_int*)iwork; params->LWORK = work_count; return 1; @@ -3696,37 +3793,46 @@ init_@lapack_func@(GELSD_PARAMS_t *params, return 0; } -/**end repeat**/ - -/**begin repeat - #TYPE=CFLOAT,CDOUBLE# - #ftyp=fortran_complex,fortran_doublecomplex# - #frealtyp=fortran_real,fortran_doublereal# - #typ=COMPLEX_t,DOUBLECOMPLEX_t# - #lapack_func=cgelsd,zgelsd# - */ +static inline fortran_int +call_gelsd(GELSD_PARAMS_t<fortran_complex> *params) +{ + fortran_int rv; + LAPACK(cgelsd)(¶ms->M, ¶ms->N, ¶ms->NRHS, + params->A, ¶ms->LDA, + params->B, ¶ms->LDB, + params->S, + params->RCOND, ¶ms->RANK, + params->WORK, ¶ms->LWORK, + params->RWORK, (fortran_int*)params->IWORK, + &rv); + return rv; +} static inline fortran_int -call_@lapack_func@(GELSD_PARAMS_t *params) +call_gelsd(GELSD_PARAMS_t<fortran_doublecomplex> *params) { fortran_int rv; - LAPACK(@lapack_func@)(¶ms->M, ¶ms->N, ¶ms->NRHS, - (@ftyp@*)params->A, ¶ms->LDA, - (@ftyp@*)params->B, ¶ms->LDB, - (@frealtyp@*)params->S, - (@frealtyp@*)params->RCOND, ¶ms->RANK, - (@ftyp@*)params->WORK, ¶ms->LWORK, - (@frealtyp@*)params->RWORK, (fortran_int*)params->IWORK, - (fortran_int*)&rv); + LAPACK(zgelsd)(¶ms->M, ¶ms->N, ¶ms->NRHS, + params->A, ¶ms->LDA, + params->B, ¶ms->LDB, + params->S, + params->RCOND, ¶ms->RANK, + params->WORK, ¶ms->LWORK, + params->RWORK, (fortran_int*)params->IWORK, + &rv); return rv; } + +template<typename ftyp> static inline int -init_@lapack_func@(GELSD_PARAMS_t *params, +init_gelsd(GELSD_PARAMS_t<ftyp> *params, fortran_int m, fortran_int n, - fortran_int nrhs) + fortran_int nrhs, +complex_trait) { +using frealtyp = basetype_t<ftyp>; npy_uint8 *mem_buff = NULL; npy_uint8 *mem_buff2 = NULL; npy_uint8 *a, *b, *s, *work, *iwork, *rwork; @@ -3738,9 +3844,9 @@ init_@lapack_func@(GELSD_PARAMS_t *params, size_t safe_n = n; size_t safe_nrhs = nrhs; - size_t a_size = safe_m * safe_n * sizeof(@ftyp@); - size_t b_size = safe_max_m_n * safe_nrhs * sizeof(@ftyp@); - size_t s_size = safe_min_m_n * sizeof(@frealtyp@); + size_t a_size = safe_m * safe_n * sizeof(ftyp); + size_t b_size = safe_max_m_n * safe_nrhs * sizeof(ftyp); + size_t s_size = safe_min_m_n * sizeof(frealtyp); fortran_int work_count; size_t work_size, rwork_size, iwork_size; @@ -3760,16 +3866,16 @@ init_@lapack_func@(GELSD_PARAMS_t *params, params->M = m; params->N = n; params->NRHS = nrhs; - params->A = a; - params->B = b; - params->S = s; + params->A = (ftyp*)a; + params->B = (ftyp*)b; + params->S = (frealtyp*)s; params->LDA = lda; params->LDB = ldb; { /* compute optimal work size */ - @ftyp@ work_size_query; - @frealtyp@ rwork_size_query; + ftyp work_size_query; + frealtyp rwork_size_query; fortran_int iwork_size_query; params->WORK = &work_size_query; @@ -3777,13 +3883,13 @@ init_@lapack_func@(GELSD_PARAMS_t *params, params->RWORK = &rwork_size_query; params->LWORK = -1; - if (call_@lapack_func@(params) != 0) + if (call_gelsd(params) != 0) goto error; work_count = (fortran_int)work_size_query.r; - work_size = (size_t )work_size_query.r * sizeof(@ftyp@); - rwork_size = (size_t)rwork_size_query * sizeof(@frealtyp@); + work_size = (size_t )work_size_query.r * sizeof(ftyp); + rwork_size = (size_t)rwork_size_query * sizeof(frealtyp); iwork_size = (size_t)iwork_size_query * sizeof(fortran_int); } @@ -3795,9 +3901,9 @@ init_@lapack_func@(GELSD_PARAMS_t *params, rwork = work + work_size; iwork = rwork + rwork_size; - params->WORK = work; - params->RWORK = rwork; - params->IWORK = iwork; + params->WORK = (ftyp*)work; + params->RWORK = (frealtyp*)rwork; + params->IWORK = (fortran_int*)iwork; params->LWORK = work_count; return 1; @@ -3810,22 +3916,9 @@ init_@lapack_func@(GELSD_PARAMS_t *params, return 0; } -/**end repeat**/ - - -/**begin repeat - #TYPE=FLOAT,DOUBLE,CFLOAT,CDOUBLE# - #REALTYPE=FLOAT,DOUBLE,FLOAT,DOUBLE# - #lapack_func=sgelsd,dgelsd,cgelsd,zgelsd# - #dot_func=sdot,ddot,cdotc,zdotc# - #typ = npy_float, npy_double, npy_cfloat, npy_cdouble# - #basetyp = npy_float, npy_double, npy_float, npy_double# - #ftyp = fortran_real, fortran_doublereal, - fortran_complex, fortran_doublecomplex# - #cmplx = 0, 0, 1, 1# - */ +template<typename ftyp> static inline void -release_@lapack_func@(GELSD_PARAMS_t* params) +release_gelsd(GELSD_PARAMS_t<ftyp>* params) { /* A and WORK contain allocated blocks */ free(params->A); @@ -3834,26 +3927,38 @@ release_@lapack_func@(GELSD_PARAMS_t* params) } /** Compute the squared l2 norm of a contiguous vector */ -static @basetyp@ -@TYPE@_abs2(@typ@ *p, npy_intp n) { +template<typename typ> +static basetype_t<typ> +abs2(typ *p, npy_intp n, scalar_trait) { npy_intp i; - @basetyp@ res = 0; + basetype_t<typ> res = 0; for (i = 0; i < n; i++) { - @typ@ el = p[i]; -#if @cmplx@ - res += el.real*el.real + el.imag*el.imag; -#else + typ el = p[i]; res += el*el; -#endif + } + return res; +} +template<typename typ> +static basetype_t<typ> +abs2(typ *p, npy_intp n, complex_trait) { + npy_intp i; + basetype_t<typ> res = 0; + for (i = 0; i < n; i++) { + typ el = p[i]; + res += el.real*el.real + el.imag*el.imag; } return res; } + +template<typename typ> static void -@TYPE@_lstsq(char **args, npy_intp const *dimensions, npy_intp const *steps, +lstsq(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { - GELSD_PARAMS_t params; +using ftyp = fortran_type_t<typ>; +using basetyp = basetype_t<typ>; + GELSD_PARAMS_t<ftyp> params; int error_occurred = get_fp_invalid_and_clear(); fortran_int n, m, nrhs; fortran_int excess; @@ -3865,7 +3970,7 @@ static void nrhs = (fortran_int)dimensions[2]; excess = m - n; - if (init_@lapack_func@(¶ms, m, n, nrhs)) { + if (init_gelsd(¶ms, m, n, nrhs, dispatch_scalar<ftyp>{})) { LINEARIZE_DATA_t a_in, b_in, x_out, s_out, r_out; init_linearize_data(&a_in, n, m, steps[1], steps[0]); @@ -3876,52 +3981,51 @@ static void BEGIN_OUTER_LOOP_7 int not_ok; - linearize_@TYPE@_matrix(params.A, args[0], &a_in); - linearize_@TYPE@_matrix(params.B, args[1], &b_in); - params.RCOND = args[2]; - not_ok = call_@lapack_func@(¶ms); + linearize_matrix((typ*)params.A, (typ*)args[0], &a_in); + linearize_matrix((typ*)params.B, (typ*)args[1], &b_in); + params.RCOND = (basetyp*)args[2]; + not_ok = call_gelsd(¶ms); if (!not_ok) { - delinearize_@TYPE@_matrix(args[3], params.B, &x_out); + delinearize_matrix((typ*)args[3], (typ*)params.B, &x_out); *(npy_int*) args[5] = params.RANK; - delinearize_@REALTYPE@_matrix(args[6], params.S, &s_out); + delinearize_matrix((basetyp*)args[6], (basetyp*)params.S, &s_out); /* Note that linalg.lstsq discards this when excess == 0 */ if (excess >= 0 && params.RANK == n) { /* Compute the residuals as the square sum of each column */ int i; char *resid = args[4]; - @ftyp@ *components = (@ftyp@ *)params.B + n; + ftyp *components = (ftyp *)params.B + n; for (i = 0; i < nrhs; i++) { - @ftyp@ *vector = components + i*m; + ftyp *vector = components + i*m; /* Numpy and fortran floating types are the same size, * so this cast is safe */ - @basetyp@ abs2 = @TYPE@_abs2((@typ@ *)vector, excess); + basetyp abs = abs2((typ *)vector, excess, +dispatch_scalar<typ>{}); memcpy( resid + i*r_out.column_strides, - &abs2, sizeof(abs2)); + &abs, sizeof(abs)); } } else { /* Note that this is always discarded by linalg.lstsq */ - nan_@REALTYPE@_matrix(args[4], &r_out); + nan_matrix((basetyp*)args[4], &r_out); } } else { error_occurred = 1; - nan_@TYPE@_matrix(args[3], &x_out); - nan_@REALTYPE@_matrix(args[4], &r_out); + nan_matrix((typ*)args[3], &x_out); + nan_matrix((basetyp*)args[4], &r_out); *(npy_int*) args[5] = -1; - nan_@REALTYPE@_matrix(args[6], &s_out); + nan_matrix((basetyp*)args[6], &s_out); } END_OUTER_LOOP - release_@lapack_func@(¶ms); + release_gelsd(¶ms); } set_fp_invalid_or_clear(error_occurred); } -/**end repeat**/ - #pragma GCC diagnostic pop /* -------------------------------------------------------------------------- */ @@ -3966,6 +4070,22 @@ static void *array_of_nulls[] = { CFLOAT_ ## NAME, \ CDOUBLE_ ## NAME \ } +#define GUFUNC_FUNC_ARRAY_REAL_COMPLEX_(NAME) \ + static PyUFuncGenericFunction \ + FUNC_ARRAY_NAME(NAME)[] = { \ + NAME<npy_float, npy_float>, \ + NAME<npy_double, npy_double>, \ + NAME<npy_cfloat, npy_float>, \ + NAME<npy_cdouble, npy_double> \ + } +#define GUFUNC_FUNC_ARRAY_REAL_COMPLEX__(NAME) \ + static PyUFuncGenericFunction \ + FUNC_ARRAY_NAME(NAME)[] = { \ + NAME<npy_float>, \ + NAME<npy_double>, \ + NAME<npy_cfloat>, \ + NAME<npy_cdouble> \ + } /* There are problems with eig in complex single precision. * That kernel is disabled @@ -3973,9 +4093,9 @@ static void *array_of_nulls[] = { #define GUFUNC_FUNC_ARRAY_EIG(NAME) \ static PyUFuncGenericFunction \ FUNC_ARRAY_NAME(NAME)[] = { \ - FLOAT_ ## NAME, \ - DOUBLE_ ## NAME, \ - CDOUBLE_ ## NAME \ + NAME<fortran_complex,fortran_real>, \ + NAME<fortran_doublecomplex,fortran_doublereal>, \ + NAME<fortran_doublecomplex,fortran_doublecomplex> \ } /* The single precision functions are not used at all, @@ -3988,25 +4108,31 @@ static void *array_of_nulls[] = { DOUBLE_ ## NAME, \ CDOUBLE_ ## NAME \ } +#define GUFUNC_FUNC_ARRAY_QR__(NAME) \ + static PyUFuncGenericFunction \ + FUNC_ARRAY_NAME(NAME)[] = { \ + NAME<npy_double>, \ + NAME<npy_cdouble> \ + } -GUFUNC_FUNC_ARRAY_REAL_COMPLEX(slogdet); -GUFUNC_FUNC_ARRAY_REAL_COMPLEX(det); -GUFUNC_FUNC_ARRAY_REAL_COMPLEX(eighlo); -GUFUNC_FUNC_ARRAY_REAL_COMPLEX(eighup); -GUFUNC_FUNC_ARRAY_REAL_COMPLEX(eigvalshlo); -GUFUNC_FUNC_ARRAY_REAL_COMPLEX(eigvalshup); -GUFUNC_FUNC_ARRAY_REAL_COMPLEX(solve); -GUFUNC_FUNC_ARRAY_REAL_COMPLEX(solve1); -GUFUNC_FUNC_ARRAY_REAL_COMPLEX(inv); -GUFUNC_FUNC_ARRAY_REAL_COMPLEX(cholesky_lo); -GUFUNC_FUNC_ARRAY_REAL_COMPLEX(svd_N); -GUFUNC_FUNC_ARRAY_REAL_COMPLEX(svd_S); -GUFUNC_FUNC_ARRAY_REAL_COMPLEX(svd_A); -GUFUNC_FUNC_ARRAY_QR(qr_r_raw); -GUFUNC_FUNC_ARRAY_QR(qr_reduced); -GUFUNC_FUNC_ARRAY_QR(qr_complete); -GUFUNC_FUNC_ARRAY_REAL_COMPLEX(lstsq); +GUFUNC_FUNC_ARRAY_REAL_COMPLEX_(slogdet); +GUFUNC_FUNC_ARRAY_REAL_COMPLEX_(det); +GUFUNC_FUNC_ARRAY_REAL_COMPLEX__(eighlo); +GUFUNC_FUNC_ARRAY_REAL_COMPLEX__(eighup); +GUFUNC_FUNC_ARRAY_REAL_COMPLEX__(eigvalshlo); +GUFUNC_FUNC_ARRAY_REAL_COMPLEX__(eigvalshup); +GUFUNC_FUNC_ARRAY_REAL_COMPLEX__(solve); +GUFUNC_FUNC_ARRAY_REAL_COMPLEX__(solve1); +GUFUNC_FUNC_ARRAY_REAL_COMPLEX__(inv); +GUFUNC_FUNC_ARRAY_REAL_COMPLEX__(cholesky_lo); +GUFUNC_FUNC_ARRAY_REAL_COMPLEX__(svd_N); +GUFUNC_FUNC_ARRAY_REAL_COMPLEX__(svd_S); +GUFUNC_FUNC_ARRAY_REAL_COMPLEX__(svd_A); +GUFUNC_FUNC_ARRAY_QR__(qr_r_raw); +GUFUNC_FUNC_ARRAY_QR__(qr_reduced); +GUFUNC_FUNC_ARRAY_QR__(qr_complete); +GUFUNC_FUNC_ARRAY_REAL_COMPLEX__(lstsq); GUFUNC_FUNC_ARRAY_EIG(eig); GUFUNC_FUNC_ARRAY_EIG(eigvals); @@ -4406,7 +4532,6 @@ PyMODINIT_FUNC PyInit__umath_linalg(void) PyObject *d; PyObject *version; - init_constants(); m = PyModule_Create(&moduledef); if (m == NULL) { return NULL; |
