/* -*- c -*- */ /* ***************************************************************************** ** INCLUDES ** ***************************************************************************** */ #define NPY_NO_DEPRECATED_API NPY_API_VERSION #include "Python.h" #include "numpy/arrayobject.h" #include "numpy/ufuncobject.h" #include "npy_pycompat.h" #include "npy_config.h" #include "npy_cblas.h" #include #include #include #include static const char* umath_linalg_version_string = "0.1.5"; /* **************************************************************************** * Debugging support * **************************************************************************** */ #define TRACE_TXT(...) do { fprintf (stderr, __VA_ARGS__); } while (0) #define STACK_TRACE do {} while (0) #define TRACE\ do { \ fprintf (stderr, \ "%s:%d:%s\n", \ __FILE__, \ __LINE__, \ __FUNCTION__); \ STACK_TRACE; \ } while (0) #if 0 #include void dbg_stack_trace() { void *trace[32]; size_t size; size = backtrace(trace, sizeof(trace)/sizeof(trace[0])); backtrace_symbols_fd(trace, size, 1); } #undef STACK_TRACE #define STACK_TRACE do { dbg_stack_trace(); } while (0) #endif /* ***************************************************************************** * BLAS/LAPACK calling macros * ***************************************************************************** */ #define FNAME(x) BLAS_FUNC(x) typedef CBLAS_INT fortran_int; typedef struct { float r, i; } f2c_complex; typedef struct { double r, i; } f2c_doublecomplex; /* typedef long int (*L_fp)(); */ typedef float fortran_real; typedef double fortran_doublereal; typedef f2c_complex fortran_complex; typedef f2c_doublecomplex fortran_doublecomplex; extern fortran_int FNAME(sgeev)(char *jobvl, char *jobvr, fortran_int *n, float a[], fortran_int *lda, float wr[], float wi[], float vl[], fortran_int *ldvl, float vr[], fortran_int *ldvr, float work[], fortran_int lwork[], fortran_int *info); extern fortran_int FNAME(dgeev)(char *jobvl, char *jobvr, fortran_int *n, double a[], fortran_int *lda, double wr[], double wi[], double vl[], fortran_int *ldvl, double vr[], fortran_int *ldvr, double work[], fortran_int lwork[], fortran_int *info); extern fortran_int FNAME(cgeev)(char *jobvl, char *jobvr, fortran_int *n, f2c_doublecomplex a[], fortran_int *lda, f2c_doublecomplex w[], f2c_doublecomplex vl[], fortran_int *ldvl, f2c_doublecomplex vr[], fortran_int *ldvr, f2c_doublecomplex work[], fortran_int *lwork, double rwork[], fortran_int *info); extern fortran_int FNAME(zgeev)(char *jobvl, char *jobvr, fortran_int *n, f2c_doublecomplex a[], fortran_int *lda, f2c_doublecomplex w[], f2c_doublecomplex vl[], fortran_int *ldvl, f2c_doublecomplex vr[], fortran_int *ldvr, f2c_doublecomplex work[], fortran_int *lwork, double rwork[], fortran_int *info); extern fortran_int FNAME(ssyevd)(char *jobz, char *uplo, fortran_int *n, float a[], fortran_int *lda, float w[], float work[], fortran_int *lwork, fortran_int iwork[], fortran_int *liwork, fortran_int *info); extern fortran_int FNAME(dsyevd)(char *jobz, char *uplo, fortran_int *n, double a[], fortran_int *lda, double w[], double work[], fortran_int *lwork, fortran_int iwork[], fortran_int *liwork, fortran_int *info); extern fortran_int FNAME(cheevd)(char *jobz, char *uplo, fortran_int *n, f2c_complex a[], fortran_int *lda, float w[], f2c_complex work[], fortran_int *lwork, float rwork[], fortran_int *lrwork, fortran_int iwork[], fortran_int *liwork, fortran_int *info); extern fortran_int FNAME(zheevd)(char *jobz, char *uplo, fortran_int *n, f2c_doublecomplex a[], fortran_int *lda, double w[], f2c_doublecomplex work[], fortran_int *lwork, double rwork[], fortran_int *lrwork, fortran_int iwork[], fortran_int *liwork, fortran_int *info); extern fortran_int FNAME(sgelsd)(fortran_int *m, fortran_int *n, fortran_int *nrhs, float a[], fortran_int *lda, float b[], fortran_int *ldb, float s[], float *rcond, fortran_int *rank, float work[], fortran_int *lwork, fortran_int iwork[], fortran_int *info); extern fortran_int FNAME(dgelsd)(fortran_int *m, fortran_int *n, fortran_int *nrhs, double a[], fortran_int *lda, double b[], fortran_int *ldb, double s[], double *rcond, fortran_int *rank, double work[], fortran_int *lwork, fortran_int iwork[], fortran_int *info); extern fortran_int FNAME(cgelsd)(fortran_int *m, fortran_int *n, fortran_int *nrhs, f2c_complex a[], fortran_int *lda, f2c_complex b[], fortran_int *ldb, float s[], float *rcond, fortran_int *rank, f2c_complex work[], fortran_int *lwork, float rwork[], fortran_int iwork[], fortran_int *info); extern fortran_int FNAME(zgelsd)(fortran_int *m, fortran_int *n, fortran_int *nrhs, f2c_doublecomplex a[], fortran_int *lda, f2c_doublecomplex b[], fortran_int *ldb, double s[], double *rcond, fortran_int *rank, f2c_doublecomplex work[], fortran_int *lwork, double rwork[], fortran_int iwork[], fortran_int *info); extern fortran_int FNAME(sgesv)(fortran_int *n, fortran_int *nrhs, float a[], fortran_int *lda, fortran_int ipiv[], float b[], fortran_int *ldb, fortran_int *info); extern fortran_int FNAME(dgesv)(fortran_int *n, fortran_int *nrhs, double a[], fortran_int *lda, fortran_int ipiv[], double b[], fortran_int *ldb, fortran_int *info); extern fortran_int FNAME(cgesv)(fortran_int *n, fortran_int *nrhs, f2c_complex a[], fortran_int *lda, fortran_int ipiv[], f2c_complex b[], fortran_int *ldb, fortran_int *info); extern fortran_int FNAME(zgesv)(fortran_int *n, fortran_int *nrhs, f2c_doublecomplex a[], fortran_int *lda, fortran_int ipiv[], f2c_doublecomplex b[], fortran_int *ldb, fortran_int *info); extern fortran_int FNAME(sgetrf)(fortran_int *m, fortran_int *n, float a[], fortran_int *lda, fortran_int ipiv[], fortran_int *info); extern fortran_int FNAME(dgetrf)(fortran_int *m, fortran_int *n, double a[], fortran_int *lda, fortran_int ipiv[], fortran_int *info); extern fortran_int FNAME(cgetrf)(fortran_int *m, fortran_int *n, f2c_complex a[], fortran_int *lda, fortran_int ipiv[], fortran_int *info); extern fortran_int FNAME(zgetrf)(fortran_int *m, fortran_int *n, f2c_doublecomplex a[], fortran_int *lda, fortran_int ipiv[], fortran_int *info); extern fortran_int FNAME(spotrf)(char *uplo, fortran_int *n, float a[], fortran_int *lda, fortran_int *info); extern fortran_int FNAME(dpotrf)(char *uplo, fortran_int *n, double a[], fortran_int *lda, fortran_int *info); extern fortran_int FNAME(cpotrf)(char *uplo, fortran_int *n, f2c_complex a[], fortran_int *lda, fortran_int *info); extern fortran_int FNAME(zpotrf)(char *uplo, fortran_int *n, f2c_doublecomplex a[], fortran_int *lda, fortran_int *info); extern fortran_int FNAME(sgesdd)(char *jobz, fortran_int *m, fortran_int *n, float a[], fortran_int *lda, float s[], float u[], fortran_int *ldu, float vt[], fortran_int *ldvt, float work[], fortran_int *lwork, fortran_int iwork[], fortran_int *info); extern fortran_int FNAME(dgesdd)(char *jobz, fortran_int *m, fortran_int *n, double a[], fortran_int *lda, double s[], double u[], fortran_int *ldu, double vt[], fortran_int *ldvt, double work[], fortran_int *lwork, fortran_int iwork[], fortran_int *info); extern fortran_int FNAME(cgesdd)(char *jobz, fortran_int *m, fortran_int *n, f2c_complex a[], fortran_int *lda, float s[], f2c_complex u[], fortran_int *ldu, f2c_complex vt[], fortran_int *ldvt, f2c_complex work[], fortran_int *lwork, float rwork[], fortran_int iwork[], fortran_int *info); extern fortran_int FNAME(zgesdd)(char *jobz, fortran_int *m, fortran_int *n, f2c_doublecomplex a[], fortran_int *lda, double s[], f2c_doublecomplex u[], fortran_int *ldu, f2c_doublecomplex vt[], fortran_int *ldvt, f2c_doublecomplex work[], fortran_int *lwork, double rwork[], fortran_int iwork[], fortran_int *info); extern fortran_int FNAME(spotrs)(char *uplo, fortran_int *n, fortran_int *nrhs, float a[], fortran_int *lda, float b[], fortran_int *ldb, fortran_int *info); extern fortran_int FNAME(dpotrs)(char *uplo, fortran_int *n, fortran_int *nrhs, double a[], fortran_int *lda, double b[], fortran_int *ldb, fortran_int *info); extern fortran_int FNAME(cpotrs)(char *uplo, fortran_int *n, fortran_int *nrhs, f2c_complex a[], fortran_int *lda, f2c_complex b[], fortran_int *ldb, fortran_int *info); extern fortran_int FNAME(zpotrs)(char *uplo, fortran_int *n, fortran_int *nrhs, f2c_doublecomplex a[], fortran_int *lda, f2c_doublecomplex b[], fortran_int *ldb, fortran_int *info); extern fortran_int FNAME(spotri)(char *uplo, fortran_int *n, float a[], fortran_int *lda, fortran_int *info); extern fortran_int FNAME(dpotri)(char *uplo, fortran_int *n, double a[], fortran_int *lda, fortran_int *info); extern fortran_int FNAME(cpotri)(char *uplo, fortran_int *n, f2c_complex a[], fortran_int *lda, fortran_int *info); extern fortran_int FNAME(zpotri)(char *uplo, fortran_int *n, f2c_doublecomplex a[], fortran_int *lda, fortran_int *info); extern fortran_int FNAME(scopy)(fortran_int *n, float *sx, fortran_int *incx, float *sy, fortran_int *incy); extern fortran_int FNAME(dcopy)(fortran_int *n, double *sx, fortran_int *incx, double *sy, fortran_int *incy); extern fortran_int FNAME(ccopy)(fortran_int *n, f2c_complex *sx, fortran_int *incx, f2c_complex *sy, fortran_int *incy); extern fortran_int FNAME(zcopy)(fortran_int *n, f2c_doublecomplex *sx, fortran_int *incx, f2c_doublecomplex *sy, fortran_int *incy); extern float FNAME(sdot)(fortran_int *n, float *sx, fortran_int *incx, float *sy, fortran_int *incy); extern double FNAME(ddot)(fortran_int *n, double *sx, fortran_int *incx, double *sy, fortran_int *incy); extern void FNAME(cdotu)(f2c_complex *ret, fortran_int *n, f2c_complex *sx, fortran_int *incx, f2c_complex *sy, fortran_int *incy); extern void FNAME(zdotu)(f2c_doublecomplex *ret, fortran_int *n, f2c_doublecomplex *sx, fortran_int *incx, f2c_doublecomplex *sy, fortran_int *incy); extern void FNAME(cdotc)(f2c_complex *ret, fortran_int *n, f2c_complex *sx, fortran_int *incx, f2c_complex *sy, fortran_int *incy); extern void FNAME(zdotc)(f2c_doublecomplex *ret, fortran_int *n, f2c_doublecomplex *sx, fortran_int *incx, f2c_doublecomplex *sy, fortran_int *incy); extern fortran_int FNAME(sgemm)(char *transa, char *transb, fortran_int *m, fortran_int *n, fortran_int *k, float *alpha, float *a, fortran_int *lda, float *b, fortran_int *ldb, float *beta, float *c, fortran_int *ldc); extern fortran_int FNAME(dgemm)(char *transa, char *transb, fortran_int *m, fortran_int *n, fortran_int *k, double *alpha, double *a, fortran_int *lda, double *b, fortran_int *ldb, double *beta, double *c, fortran_int *ldc); extern fortran_int FNAME(cgemm)(char *transa, char *transb, fortran_int *m, fortran_int *n, fortran_int *k, f2c_complex *alpha, f2c_complex *a, fortran_int *lda, f2c_complex *b, fortran_int *ldb, f2c_complex *beta, f2c_complex *c, fortran_int *ldc); extern fortran_int FNAME(zgemm)(char *transa, char *transb, fortran_int *m, fortran_int *n, fortran_int *k, f2c_doublecomplex *alpha, f2c_doublecomplex *a, fortran_int *lda, f2c_doublecomplex *b, fortran_int *ldb, f2c_doublecomplex *beta, f2c_doublecomplex *c, fortran_int *ldc); #define LAPACK_T(FUNC) \ TRACE_TXT("Calling LAPACK ( " # FUNC " )\n"); \ FNAME(FUNC) #define BLAS(FUNC) \ FNAME(FUNC) #define LAPACK(FUNC) \ FNAME(FUNC) /* ***************************************************************************** ** Some handy functions ** ***************************************************************************** */ static NPY_INLINE int get_fp_invalid_and_clear(void) { int status; status = npy_clear_floatstatus_barrier((char*)&status); return !!(status & NPY_FPE_INVALID); } static NPY_INLINE void set_fp_invalid_or_clear(int error_occurred) { if (error_occurred) { npy_set_floatstatus_invalid(); } else { npy_clear_floatstatus_barrier((char*)&error_occurred); } } /* ***************************************************************************** ** Some handy constants ** ***************************************************************************** */ #define UMATH_LINALG_MODULE_NAME "_umath_linalg" typedef union { fortran_complex f; npy_cfloat npy; float array[2]; } COMPLEX_t; typedef union { fortran_doublecomplex f; npy_cdouble npy; double array[2]; } DOUBLECOMPLEX_t; static float s_one; static float s_zero; static float s_minus_one; static float s_ninf; static float s_nan; static double d_one; static double d_zero; static double d_minus_one; static double d_ninf; static double d_nan; static COMPLEX_t c_one; static COMPLEX_t c_zero; static COMPLEX_t c_minus_one; static COMPLEX_t c_ninf; static COMPLEX_t c_nan; static DOUBLECOMPLEX_t z_one; static DOUBLECOMPLEX_t z_zero; static DOUBLECOMPLEX_t z_minus_one; static DOUBLECOMPLEX_t z_ninf; static DOUBLECOMPLEX_t z_nan; static void init_constants(void) { /* this is needed as NPY_INFINITY and NPY_NAN macros can't be used as initializers. I prefer to just set all the constants the same way. */ s_one = 1.0f; s_zero = 0.0f; s_minus_one = -1.0f; s_ninf = -NPY_INFINITYF; s_nan = NPY_NANF; d_one = 1.0; d_zero = 0.0; d_minus_one = -1.0; d_ninf = -NPY_INFINITY; d_nan = NPY_NAN; c_one.array[0] = 1.0f; c_one.array[1] = 0.0f; c_zero.array[0] = 0.0f; c_zero.array[1] = 0.0f; c_minus_one.array[0] = -1.0f; c_minus_one.array[1] = 0.0f; c_ninf.array[0] = -NPY_INFINITYF; c_ninf.array[1] = 0.0f; c_nan.array[0] = NPY_NANF; c_nan.array[1] = NPY_NANF; z_one.array[0] = 1.0; z_one.array[1] = 0.0; z_zero.array[0] = 0.0; z_zero.array[1] = 0.0; z_minus_one.array[0] = -1.0; z_minus_one.array[1] = 0.0; z_ninf.array[0] = -NPY_INFINITY; z_ninf.array[1] = 0.0; z_nan.array[0] = NPY_NAN; z_nan.array[1] = NPY_NAN; } /* ***************************************************************************** ** Structs used for data rearrangement ** ***************************************************************************** */ /* * this struct contains information about how to linearize a matrix in a local * buffer so that it can be used by blas functions. All strides are specified * in bytes and are converted to elements later in type specific functions. * * rows: number of rows in the matrix * columns: number of columns in the matrix * row_strides: the number bytes between consecutive rows. * column_strides: the number of bytes between consecutive columns. * output_lead_dim: BLAS/LAPACK-side leading dimension, in elements */ typedef struct linearize_data_struct { npy_intp rows; npy_intp columns; npy_intp row_strides; npy_intp column_strides; npy_intp output_lead_dim; } LINEARIZE_DATA_t; static NPY_INLINE void init_linearize_data_ex(LINEARIZE_DATA_t *lin_data, npy_intp rows, npy_intp columns, npy_intp row_strides, npy_intp column_strides, npy_intp output_lead_dim) { lin_data->rows = rows; lin_data->columns = columns; lin_data->row_strides = row_strides; lin_data->column_strides = column_strides; lin_data->output_lead_dim = output_lead_dim; } static NPY_INLINE void init_linearize_data(LINEARIZE_DATA_t *lin_data, npy_intp rows, npy_intp columns, npy_intp row_strides, npy_intp column_strides) { init_linearize_data_ex( lin_data, rows, columns, row_strides, column_strides, columns); } static NPY_INLINE void dump_ufunc_object(PyUFuncObject* ufunc) { TRACE_TXT("\n\n%s '%s' (%d input(s), %d output(s), %d specialization(s).\n", ufunc->core_enabled? "generalized ufunc" : "scalar ufunc", ufunc->name, ufunc->nin, ufunc->nout, ufunc->ntypes); if (ufunc->core_enabled) { int arg; int dim; TRACE_TXT("\t%s (%d dimension(s) detected).\n", ufunc->core_signature, ufunc->core_num_dim_ix); for (arg = 0; arg < ufunc->nargs; arg++){ int * arg_dim_ix = ufunc->core_dim_ixs + ufunc->core_offsets[arg]; TRACE_TXT("\t\targ %d (%s) has %d dimension(s): (", arg, arg < ufunc->nin? "INPUT" : "OUTPUT", ufunc->core_num_dims[arg]); for (dim = 0; dim < ufunc->core_num_dims[arg]; dim ++) { TRACE_TXT(" %d", arg_dim_ix[dim]); } TRACE_TXT(" )\n"); } } } static NPY_INLINE void dump_linearize_data(const char* name, const LINEARIZE_DATA_t* params) { TRACE_TXT("\n\t%s rows: %zd columns: %zd"\ "\n\t\trow_strides: %td column_strides: %td"\ "\n", name, params->rows, params->columns, params->row_strides, params->column_strides); } static NPY_INLINE void print_FLOAT(npy_float s) { TRACE_TXT(" %8.4f", s); } static NPY_INLINE void print_DOUBLE(npy_double d) { TRACE_TXT(" %10.6f", d); } static NPY_INLINE void print_CFLOAT(npy_cfloat c) { float* c_parts = (float*)&c; TRACE_TXT("(%8.4f, %8.4fj)", c_parts[0], c_parts[1]); } static NPY_INLINE void print_CDOUBLE(npy_cdouble z) { double* z_parts = (double*)&z; TRACE_TXT("(%8.4f, %8.4fj)", z_parts[0], z_parts[1]); } /**begin repeat #TYPE = FLOAT, DOUBLE, CFLOAT, CDOUBLE# #typ = npy_float, npy_double, npy_cfloat, npy_cdouble# */ static NPY_INLINE void dump_@TYPE@_matrix(const char* name, size_t rows, size_t columns, const @typ@* ptr) { size_t i, j; TRACE_TXT("\n%s %p (%zd, %zd)\n", name, ptr, rows, columns); for (i = 0; i < rows; i++) { TRACE_TXT("| "); for (j = 0; j < columns; j++) { print_@TYPE@(ptr[j*rows + i]); TRACE_TXT(", "); } TRACE_TXT(" |\n"); } } /**end repeat**/ /* ***************************************************************************** ** Basics ** ***************************************************************************** */ static NPY_INLINE fortran_int fortran_int_min(fortran_int x, fortran_int y) { return x < y ? x : y; } static NPY_INLINE fortran_int fortran_int_max(fortran_int x, fortran_int y) { return x > y ? x : y; } #define INIT_OUTER_LOOP_1 \ npy_intp dN = *dimensions++;\ npy_intp N_;\ npy_intp s0 = *steps++; #define INIT_OUTER_LOOP_2 \ INIT_OUTER_LOOP_1\ npy_intp s1 = *steps++; #define INIT_OUTER_LOOP_3 \ INIT_OUTER_LOOP_2\ npy_intp s2 = *steps++; #define INIT_OUTER_LOOP_4 \ INIT_OUTER_LOOP_3\ npy_intp s3 = *steps++; #define INIT_OUTER_LOOP_5 \ INIT_OUTER_LOOP_4\ npy_intp s4 = *steps++; #define INIT_OUTER_LOOP_6 \ INIT_OUTER_LOOP_5\ npy_intp s5 = *steps++; #define INIT_OUTER_LOOP_7 \ INIT_OUTER_LOOP_6\ npy_intp s6 = *steps++; #define BEGIN_OUTER_LOOP_2 \ for (N_ = 0;\ N_ < dN;\ N_++, args[0] += s0,\ args[1] += s1) { #define BEGIN_OUTER_LOOP_3 \ for (N_ = 0;\ N_ < dN;\ N_++, args[0] += s0,\ args[1] += s1,\ args[2] += s2) { #define BEGIN_OUTER_LOOP_4 \ for (N_ = 0;\ N_ < dN;\ N_++, args[0] += s0,\ args[1] += s1,\ args[2] += s2,\ args[3] += s3) { #define BEGIN_OUTER_LOOP_5 \ for (N_ = 0;\ N_ < dN;\ N_++, args[0] += s0,\ args[1] += s1,\ args[2] += s2,\ args[3] += s3,\ args[4] += s4) { #define BEGIN_OUTER_LOOP_6 \ for (N_ = 0;\ N_ < dN;\ N_++, args[0] += s0,\ args[1] += s1,\ args[2] += s2,\ args[3] += s3,\ args[4] += s4,\ args[5] += s5) { #define BEGIN_OUTER_LOOP_7 \ for (N_ = 0;\ N_ < dN;\ N_++, args[0] += s0,\ args[1] += s1,\ args[2] += s2,\ args[3] += s3,\ args[4] += s4,\ args[5] += s5,\ args[6] += s6) { #define END_OUTER_LOOP } static NPY_INLINE void update_pointers(npy_uint8** bases, ptrdiff_t* offsets, size_t count) { size_t i; for (i = 0; i < count; ++i) { bases[i] += offsets[i]; } } /* disable -Wmaybe-uninitialized as there is some code that generate false positives with this warning */ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" /* ***************************************************************************** ** HELPER FUNCS ** ***************************************************************************** */ /* rearranging of 2D matrices using blas */ /**begin repeat #TYPE = FLOAT, DOUBLE, CFLOAT, CDOUBLE# #typ = float, double, COMPLEX_t, DOUBLECOMPLEX_t# #copy = scopy, dcopy, ccopy, zcopy# #nan = s_nan, d_nan, c_nan, z_nan# #zero = s_zero, d_zero, c_zero, z_zero# */ static NPY_INLINE void * linearize_@TYPE@_matrix(void *dst_in, void *src_in, const LINEARIZE_DATA_t* data) { @typ@ *src = (@typ@ *) src_in; @typ@ *dst = (@typ@ *) dst_in; if (dst) { int i, j; @typ@* rv = dst; fortran_int columns = (fortran_int)data->columns; fortran_int column_strides = (fortran_int)(data->column_strides/sizeof(@typ@)); fortran_int one = 1; for (i = 0; i < data->rows; i++) { if (column_strides > 0) { FNAME(@copy@)(&columns, (void*)src, &column_strides, (void*)dst, &one); } else if (column_strides < 0) { FNAME(@copy@)(&columns, (void*)((@typ@*)src + (columns-1)*column_strides), &column_strides, (void*)dst, &one); } else { /* * Zero stride has undefined behavior in some BLAS * implementations (e.g. OSX Accelerate), so do it * manually */ for (j = 0; j < columns; ++j) { memcpy((@typ@*)dst + j, (@typ@*)src, sizeof(@typ@)); } } src += data->row_strides/sizeof(@typ@); dst += data->output_lead_dim; } return rv; } else { return src; } } static NPY_INLINE void * delinearize_@TYPE@_matrix(void *dst_in, void *src_in, const LINEARIZE_DATA_t* data) { @typ@ *src = (@typ@ *) src_in; @typ@ *dst = (@typ@ *) dst_in; if (src) { int i; @typ@ *rv = src; fortran_int columns = (fortran_int)data->columns; fortran_int column_strides = (fortran_int)(data->column_strides/sizeof(@typ@)); fortran_int one = 1; for (i = 0; i < data->rows; i++) { if (column_strides > 0) { FNAME(@copy@)(&columns, (void*)src, &one, (void*)dst, &column_strides); } else if (column_strides < 0) { FNAME(@copy@)(&columns, (void*)src, &one, (void*)((@typ@*)dst + (columns-1)*column_strides), &column_strides); } else { /* * Zero stride has undefined behavior in some BLAS * implementations (e.g. OSX Accelerate), so do it * manually */ if (columns > 0) { memcpy((@typ@*)dst, (@typ@*)src + (columns-1), sizeof(@typ@)); } } src += data->output_lead_dim; dst += data->row_strides/sizeof(@typ@); } return rv; } else { return src; } } static NPY_INLINE void nan_@TYPE@_matrix(void *dst_in, const LINEARIZE_DATA_t* data) { @typ@ *dst = (@typ@ *) dst_in; int i, j; for (i = 0; i < data->rows; i++) { @typ@ *cp = dst; ptrdiff_t cs = data->column_strides/sizeof(@typ@); for (j = 0; j < data->columns; ++j) { *cp = @nan@; cp += cs; } dst += data->row_strides/sizeof(@typ@); } } static NPY_INLINE void zero_@TYPE@_matrix(void *dst_in, const LINEARIZE_DATA_t* data) { @typ@ *dst = (@typ@ *) dst_in; int i, j; for (i = 0; i < data->rows; i++) { @typ@ *cp = dst; ptrdiff_t cs = data->column_strides/sizeof(@typ@); for (j = 0; j < data->columns; ++j) { *cp = @zero@; cp += cs; } dst += data->row_strides/sizeof(@typ@); } } /**end repeat**/ /* identity square matrix generation */ /**begin repeat #TYPE = FLOAT, DOUBLE, CFLOAT, CDOUBLE# #typ = float, double, COMPLEX_t, DOUBLECOMPLEX_t# #cblas_type = s, d, c, z# */ static NPY_INLINE void identity_@TYPE@_matrix(void *ptr, size_t n) { size_t i; @typ@ *matrix = (@typ@*) ptr; /* in IEEE floating point, zeroes are represented as bitwise 0 */ memset(matrix, 0, n*n*sizeof(@typ@)); for (i = 0; i < n; ++i) { *matrix = @cblas_type@_one; matrix += n+1; } } /**end repeat**/ /* lower/upper triangular matrix using blas (in place) */ /**begin repeat #TYPE = FLOAT, DOUBLE, CFLOAT, CDOUBLE# #typ = float, double, COMPLEX_t, DOUBLECOMPLEX_t# #cblas_type = s, d, c, z# */ static NPY_INLINE void triu_@TYPE@_matrix(void *ptr, size_t n) { size_t i, j; @typ@ *matrix = (@typ@*)ptr; matrix += n; for (i = 1; i < n; ++i) { for (j = 0; j < i; ++j) { matrix[j] = @cblas_type@_zero; } matrix += n; } } /**end repeat**/ /* -------------------------------------------------------------------------- */ /* Determinants */ /**begin repeat #TYPE = FLOAT, DOUBLE# #typ = npy_float, npy_double# #log_func = npy_logf, npy_log# #exp_func = npy_expf, npy_exp# #zero = 0.0f, 0.0# */ static NPY_INLINE void @TYPE@_slogdet_from_factored_diagonal(@typ@* src, fortran_int m, @typ@ *sign, @typ@ *logdet) { @typ@ acc_sign = *sign; @typ@ acc_logdet = @zero@; int i; for (i = 0; i < m; i++) { @typ@ abs_element = *src; if (abs_element < @zero@) { acc_sign = -acc_sign; abs_element = -abs_element; } acc_logdet += @log_func@(abs_element); src += m+1; } *sign = acc_sign; *logdet = acc_logdet; } static NPY_INLINE @typ@ @TYPE@_det_from_slogdet(@typ@ sign, @typ@ logdet) { @typ@ result = sign * @exp_func@(logdet); return result; } /**end repeat**/ /**begin repeat #TYPE = CFLOAT, CDOUBLE# #typ = npy_cfloat, npy_cdouble# #basetyp = npy_float, npy_double# #abs_func = npy_cabsf, npy_cabs# #log_func = npy_logf, npy_log# #exp_func = npy_expf, npy_exp# #zero = 0.0f, 0.0# */ #define RE(COMPLEX) (((@basetyp@*)(&COMPLEX))[0]) #define IM(COMPLEX) (((@basetyp@*)(&COMPLEX))[1]) static NPY_INLINE @typ@ @TYPE@_mult(@typ@ op1, @typ@ op2) { @typ@ rv; RE(rv) = RE(op1)*RE(op2) - IM(op1)*IM(op2); IM(rv) = RE(op1)*IM(op2) + IM(op1)*RE(op2); return rv; } static NPY_INLINE void @TYPE@_slogdet_from_factored_diagonal(@typ@* src, fortran_int m, @typ@ *sign, @basetyp@ *logdet) { int i; @typ@ sign_acc = *sign; @basetyp@ logdet_acc = @zero@; for (i = 0; i < m; i++) { @basetyp@ abs_element = @abs_func@(*src); @typ@ sign_element; RE(sign_element) = RE(*src) / abs_element; IM(sign_element) = IM(*src) / abs_element; sign_acc = @TYPE@_mult(sign_acc, sign_element); logdet_acc += @log_func@(abs_element); src += m + 1; } *sign = sign_acc; *logdet = logdet_acc; } static NPY_INLINE @typ@ @TYPE@_det_from_slogdet(@typ@ sign, @basetyp@ logdet) { @typ@ tmp; RE(tmp) = @exp_func@(logdet); IM(tmp) = @zero@; return @TYPE@_mult(sign, tmp); } #undef RE #undef IM /**end repeat**/ /* As in the linalg package, the determinant is computed via LU factorization * using LAPACK. * slogdet computes sign + log(determinant). * det computes sign * exp(slogdet). */ /**begin repeat #TYPE = FLOAT, DOUBLE, CFLOAT, CDOUBLE# #typ = npy_float, npy_double, npy_cfloat, npy_cdouble# #basetyp = npy_float, npy_double, npy_float, npy_double# #cblas_type = s, d, c, z# */ static NPY_INLINE void @TYPE@_slogdet_single_element(fortran_int m, void* src, fortran_int* pivots, @typ@ *sign, @basetyp@ *logdet) { fortran_int info = 0; fortran_int lda = fortran_int_max(m, 1); int i; /* note: done in place */ LAPACK(@cblas_type@getrf)(&m, &m, (void *)src, &lda, pivots, &info); if (info == 0) { int change_sign = 0; /* note: fortran uses 1 based indexing */ for (i = 0; i < m; i++) { change_sign += (pivots[i] != (i+1)); } memcpy(sign, (change_sign % 2)? &@cblas_type@_minus_one : &@cblas_type@_one , sizeof(*sign)); @TYPE@_slogdet_from_factored_diagonal(src, m, sign, logdet); } else { /* if getrf fails, use 0 as sign and -inf as logdet */ memcpy(sign, &@cblas_type@_zero, sizeof(*sign)); memcpy(logdet, &@cblas_type@_ninf, sizeof(*logdet)); } } static void @TYPE@_slogdet(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { fortran_int m; npy_uint8 *tmp_buff = NULL; size_t matrix_size; size_t pivot_size; size_t safe_m; /* notes: * matrix will need to be copied always, as factorization in lapack is * made inplace * matrix will need to be in column-major order, as expected by lapack * code (fortran) * always a square matrix * need to allocate memory for both, matrix_buffer and pivot buffer */ INIT_OUTER_LOOP_3 m = (fortran_int) dimensions[0]; safe_m = m; matrix_size = safe_m * safe_m * sizeof(@typ@); pivot_size = safe_m * sizeof(fortran_int); tmp_buff = (npy_uint8 *)malloc(matrix_size + pivot_size); if (tmp_buff) { LINEARIZE_DATA_t lin_data; /* swapped steps to get matrix in FORTRAN order */ init_linearize_data(&lin_data, m, m, steps[1], steps[0]); BEGIN_OUTER_LOOP_3 linearize_@TYPE@_matrix(tmp_buff, args[0], &lin_data); @TYPE@_slogdet_single_element(m, (void*)tmp_buff, (fortran_int*)(tmp_buff+matrix_size), (@typ@*)args[1], (@basetyp@*)args[2]); END_OUTER_LOOP free(tmp_buff); } } static void @TYPE@_det(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { fortran_int m; npy_uint8 *tmp_buff; size_t matrix_size; size_t pivot_size; size_t safe_m; /* notes: * matrix will need to be copied always, as factorization in lapack is * made inplace * matrix will need to be in column-major order, as expected by lapack * code (fortran) * always a square matrix * need to allocate memory for both, matrix_buffer and pivot buffer */ INIT_OUTER_LOOP_2 m = (fortran_int) dimensions[0]; safe_m = m; matrix_size = safe_m * safe_m * sizeof(@typ@); pivot_size = safe_m * sizeof(fortran_int); tmp_buff = (npy_uint8 *)malloc(matrix_size + pivot_size); if (tmp_buff) { LINEARIZE_DATA_t lin_data; @typ@ sign; @basetyp@ logdet; /* swapped steps to get matrix in FORTRAN order */ init_linearize_data(&lin_data, m, m, steps[1], steps[0]); BEGIN_OUTER_LOOP_2 linearize_@TYPE@_matrix(tmp_buff, args[0], &lin_data); @TYPE@_slogdet_single_element(m, (void*)tmp_buff, (fortran_int*)(tmp_buff + matrix_size), &sign, &logdet); *(@typ@ *)args[1] = @TYPE@_det_from_slogdet(sign, logdet); END_OUTER_LOOP free(tmp_buff); } } /**end repeat**/ /* -------------------------------------------------------------------------- */ /* Eigh family */ typedef struct eigh_params_struct { void *A; /* matrix */ void *W; /* eigenvalue vector */ void *WORK; /* main work buffer */ void *RWORK; /* secondary work buffer (for complex versions) */ void *IWORK; fortran_int N; fortran_int LWORK; fortran_int LRWORK; fortran_int LIWORK; char JOBZ; char UPLO; fortran_int LDA; } EIGH_PARAMS_t; /**begin repeat #TYPE = FLOAT, DOUBLE# #typ = npy_float, npy_double# #ftyp = fortran_real, fortran_doublereal# #lapack_func = ssyevd, dsyevd# */ static NPY_INLINE fortran_int call_@lapack_func@(EIGH_PARAMS_t *params) { fortran_int rv; LAPACK(@lapack_func@)(¶ms->JOBZ, ¶ms->UPLO, ¶ms->N, params->A, ¶ms->LDA, params->W, params->WORK, ¶ms->LWORK, params->IWORK, ¶ms->LIWORK, &rv); return rv; } /* * Initialize the parameters to use in for the lapack function _syevd * Handles buffer allocation */ static NPY_INLINE int init_@lapack_func@(EIGH_PARAMS_t* params, char JOBZ, char UPLO, fortran_int N) { npy_uint8 *mem_buff = NULL; npy_uint8 *mem_buff2 = NULL; fortran_int lwork; fortran_int liwork; npy_uint8 *a, *w, *work, *iwork; size_t safe_N = N; size_t alloc_size = safe_N * (safe_N + 1) * sizeof(@typ@); fortran_int lda = fortran_int_max(N, 1); mem_buff = malloc(alloc_size); if (!mem_buff) { goto error; } a = mem_buff; w = mem_buff + safe_N * safe_N * sizeof(@typ@); params->A = a; params->W = w; params->RWORK = NULL; /* unused */ params->N = N; params->LRWORK = 0; /* unused */ params->JOBZ = JOBZ; params->UPLO = UPLO; params->LDA = lda; /* Work size query */ { @typ@ query_work_size; fortran_int query_iwork_size; params->LWORK = -1; params->LIWORK = -1; params->WORK = &query_work_size; params->IWORK = &query_iwork_size; if (call_@lapack_func@(params) != 0) { goto error; } lwork = (fortran_int)query_work_size; liwork = query_iwork_size; } mem_buff2 = malloc(lwork*sizeof(@typ@) + liwork*sizeof(fortran_int)); if (!mem_buff2) { goto error; } work = mem_buff2; iwork = mem_buff2 + lwork*sizeof(@typ@); params->LWORK = lwork; params->WORK = work; params->LIWORK = liwork; params->IWORK = iwork; return 1; error: /* something failed */ memset(params, 0, sizeof(*params)); free(mem_buff2); free(mem_buff); return 0; } /**end repeat**/ /**begin repeat #TYPE = CFLOAT, CDOUBLE# #typ = npy_cfloat, npy_cdouble# #basetyp = npy_float, npy_double# #ftyp = fortran_complex, fortran_doublecomplex# #fbasetyp = fortran_real, fortran_doublereal# #lapack_func = cheevd, zheevd# */ static NPY_INLINE fortran_int call_@lapack_func@(EIGH_PARAMS_t *params) { fortran_int rv; LAPACK(@lapack_func@)(¶ms->JOBZ, ¶ms->UPLO, ¶ms->N, params->A, ¶ms->LDA, params->W, params->WORK, ¶ms->LWORK, params->RWORK, ¶ms->LRWORK, params->IWORK, ¶ms->LIWORK, &rv); return rv; } /* * Initialize the parameters to use in for the lapack function _heev * Handles buffer allocation */ static NPY_INLINE int init_@lapack_func@(EIGH_PARAMS_t *params, char JOBZ, char UPLO, fortran_int N) { npy_uint8 *mem_buff = NULL; npy_uint8 *mem_buff2 = NULL; fortran_int lwork; fortran_int lrwork; fortran_int liwork; npy_uint8 *a, *w, *work, *rwork, *iwork; size_t safe_N = N; fortran_int lda = fortran_int_max(N, 1); mem_buff = malloc(safe_N * safe_N * sizeof(@typ@) + safe_N * sizeof(@basetyp@)); if (!mem_buff) { goto error; } a = mem_buff; w = mem_buff + safe_N * safe_N * sizeof(@typ@); params->A = a; params->W = w; params->N = N; params->JOBZ = JOBZ; params->UPLO = UPLO; params->LDA = lda; /* Work size query */ { @ftyp@ query_work_size; @fbasetyp@ query_rwork_size; fortran_int query_iwork_size; params->LWORK = -1; params->LRWORK = -1; params->LIWORK = -1; params->WORK = &query_work_size; params->RWORK = &query_rwork_size; params->IWORK = &query_iwork_size; if (call_@lapack_func@(params) != 0) { goto error; } lwork = (fortran_int)*(@fbasetyp@*)&query_work_size; lrwork = (fortran_int)query_rwork_size; liwork = query_iwork_size; } mem_buff2 = malloc(lwork*sizeof(@typ@) + lrwork*sizeof(@basetyp@) + liwork*sizeof(fortran_int)); if (!mem_buff2) { goto error; } work = mem_buff2; rwork = work + lwork*sizeof(@typ@); iwork = rwork + lrwork*sizeof(@basetyp@); params->WORK = work; params->RWORK = rwork; params->IWORK = iwork; params->LWORK = lwork; params->LRWORK = lrwork; params->LIWORK = liwork; return 1; /* something failed */ error: memset(params, 0, sizeof(*params)); free(mem_buff2); free(mem_buff); return 0; } /**end repeat**/ /**begin repeat #TYPE = FLOAT, DOUBLE, CFLOAT, CDOUBLE# #BASETYPE = FLOAT, DOUBLE, FLOAT, DOUBLE# #typ = npy_float, npy_double, npy_cfloat, npy_cdouble# #basetyp = npy_float, npy_double, npy_float, npy_double# #lapack_func = ssyevd, dsyevd, cheevd, zheevd# **/ /* * (M, M)->(M,)(M, M) * dimensions[1] -> M * args[0] -> A[in] * args[1] -> W * args[2] -> A[out] */ static NPY_INLINE void release_@lapack_func@(EIGH_PARAMS_t *params) { /* allocated memory in A and WORK */ free(params->A); free(params->WORK); memset(params, 0, sizeof(*params)); } static NPY_INLINE void @TYPE@_eigh_wrapper(char JOBZ, char UPLO, char**args, npy_intp const *dimensions, npy_intp const *steps) { ptrdiff_t outer_steps[3]; size_t iter; size_t outer_dim = *dimensions++; size_t op_count = (JOBZ=='N')?2:3; EIGH_PARAMS_t eigh_params; int error_occurred = get_fp_invalid_and_clear(); for (iter = 0; iter < op_count; ++iter) { outer_steps[iter] = (ptrdiff_t) steps[iter]; } steps += op_count; if (init_@lapack_func@(&eigh_params, JOBZ, UPLO, (fortran_int)dimensions[0])) { LINEARIZE_DATA_t matrix_in_ld; LINEARIZE_DATA_t eigenvectors_out_ld; LINEARIZE_DATA_t eigenvalues_out_ld; init_linearize_data(&matrix_in_ld, eigh_params.N, eigh_params.N, steps[1], steps[0]); init_linearize_data(&eigenvalues_out_ld, 1, eigh_params.N, 0, steps[2]); if ('V' == eigh_params.JOBZ) { init_linearize_data(&eigenvectors_out_ld, eigh_params.N, eigh_params.N, steps[4], steps[3]); } for (iter = 0; iter < outer_dim; ++iter) { int not_ok; /* copy the matrix in */ linearize_@TYPE@_matrix(eigh_params.A, args[0], &matrix_in_ld); not_ok = call_@lapack_func@(&eigh_params); if (!not_ok) { /* lapack ok, copy result out */ delinearize_@BASETYPE@_matrix(args[1], eigh_params.W, &eigenvalues_out_ld); if ('V' == eigh_params.JOBZ) { delinearize_@TYPE@_matrix(args[2], eigh_params.A, &eigenvectors_out_ld); } } else { /* lapack fail, set result to nan */ error_occurred = 1; nan_@BASETYPE@_matrix(args[1], &eigenvalues_out_ld); if ('V' == eigh_params.JOBZ) { nan_@TYPE@_matrix(args[2], &eigenvectors_out_ld); } } update_pointers((npy_uint8**)args, outer_steps, op_count); } release_@lapack_func@(&eigh_params); } set_fp_invalid_or_clear(error_occurred); } /**end repeat**/ /**begin repeat #TYPE = FLOAT, DOUBLE, CFLOAT, CDOUBLE# */ static void @TYPE@_eighlo(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { @TYPE@_eigh_wrapper('V', 'L', args, dimensions, steps); } static void @TYPE@_eighup(char **args, npy_intp const *dimensions, npy_intp const *steps, void* NPY_UNUSED(func)) { @TYPE@_eigh_wrapper('V', 'U', args, dimensions, steps); } static void @TYPE@_eigvalshlo(char **args, npy_intp const *dimensions, npy_intp const *steps, void* NPY_UNUSED(func)) { @TYPE@_eigh_wrapper('N', 'L', args, dimensions, steps); } static void @TYPE@_eigvalshup(char **args, npy_intp const *dimensions, npy_intp const *steps, void* NPY_UNUSED(func)) { @TYPE@_eigh_wrapper('N', 'U', args, dimensions, steps); } /**end repeat**/ /* -------------------------------------------------------------------------- */ /* Solve family (includes inv) */ typedef struct gesv_params_struct { void *A; /* A is (N, N) of base type */ void *B; /* B is (N, NRHS) of base type */ fortran_int * IPIV; /* IPIV is (N) */ fortran_int N; fortran_int NRHS; fortran_int LDA; fortran_int LDB; } GESV_PARAMS_t; /**begin repeat #TYPE = FLOAT, DOUBLE, CFLOAT, CDOUBLE# #typ = npy_float, npy_double, npy_cfloat, npy_cdouble# #ftyp = fortran_real, fortran_doublereal, fortran_complex, fortran_doublecomplex# #lapack_func = sgesv, dgesv, cgesv, zgesv# */ static NPY_INLINE fortran_int call_@lapack_func@(GESV_PARAMS_t *params) { fortran_int rv; LAPACK(@lapack_func@)(¶ms->N, ¶ms->NRHS, params->A, ¶ms->LDA, params->IPIV, params->B, ¶ms->LDB, &rv); return rv; } /* * Initialize the parameters to use in for the lapack function _heev * Handles buffer allocation */ static NPY_INLINE int init_@lapack_func@(GESV_PARAMS_t *params, fortran_int N, fortran_int NRHS) { npy_uint8 *mem_buff = NULL; npy_uint8 *a, *b, *ipiv; size_t safe_N = N; size_t safe_NRHS = NRHS; fortran_int ld = fortran_int_max(N, 1); mem_buff = malloc(safe_N * safe_N * sizeof(@ftyp@) + safe_N * safe_NRHS*sizeof(@ftyp@) + safe_N * sizeof(fortran_int)); if (!mem_buff) { goto error; } a = mem_buff; b = a + safe_N * safe_N * sizeof(@ftyp@); ipiv = b + safe_N * safe_NRHS * sizeof(@ftyp@); params->A = a; params->B = b; params->IPIV = (fortran_int*)ipiv; params->N = N; params->NRHS = NRHS; params->LDA = ld; params->LDB = ld; return 1; error: free(mem_buff); memset(params, 0, sizeof(*params)); return 0; } static NPY_INLINE void release_@lapack_func@(GESV_PARAMS_t *params) { /* memory block base is in A */ free(params->A); memset(params, 0, sizeof(*params)); } static void @TYPE@_solve(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { GESV_PARAMS_t params; fortran_int n, nrhs; int error_occurred = get_fp_invalid_and_clear(); INIT_OUTER_LOOP_3 n = (fortran_int)dimensions[0]; nrhs = (fortran_int)dimensions[1]; if (init_@lapack_func@(¶ms, n, nrhs)) { LINEARIZE_DATA_t a_in, b_in, r_out; init_linearize_data(&a_in, n, n, steps[1], steps[0]); init_linearize_data(&b_in, nrhs, n, steps[3], steps[2]); init_linearize_data(&r_out, nrhs, n, steps[5], steps[4]); BEGIN_OUTER_LOOP_3 int not_ok; linearize_@TYPE@_matrix(params.A, args[0], &a_in); linearize_@TYPE@_matrix(params.B, args[1], &b_in); not_ok =call_@lapack_func@(¶ms); if (!not_ok) { delinearize_@TYPE@_matrix(args[2], params.B, &r_out); } else { error_occurred = 1; nan_@TYPE@_matrix(args[2], &r_out); } END_OUTER_LOOP release_@lapack_func@(¶ms); } set_fp_invalid_or_clear(error_occurred); } static void @TYPE@_solve1(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { GESV_PARAMS_t params; int error_occurred = get_fp_invalid_and_clear(); fortran_int n; INIT_OUTER_LOOP_3 n = (fortran_int)dimensions[0]; if (init_@lapack_func@(¶ms, n, 1)) { LINEARIZE_DATA_t a_in, b_in, r_out; init_linearize_data(&a_in, n, n, steps[1], steps[0]); init_linearize_data(&b_in, 1, n, 1, steps[2]); init_linearize_data(&r_out, 1, n, 1, steps[3]); BEGIN_OUTER_LOOP_3 int not_ok; linearize_@TYPE@_matrix(params.A, args[0], &a_in); linearize_@TYPE@_matrix(params.B, args[1], &b_in); not_ok = call_@lapack_func@(¶ms); if (!not_ok) { delinearize_@TYPE@_matrix(args[2], params.B, &r_out); } else { error_occurred = 1; nan_@TYPE@_matrix(args[2], &r_out); } END_OUTER_LOOP release_@lapack_func@(¶ms); } set_fp_invalid_or_clear(error_occurred); } static void @TYPE@_inv(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { GESV_PARAMS_t params; fortran_int n; int error_occurred = get_fp_invalid_and_clear(); INIT_OUTER_LOOP_2 n = (fortran_int)dimensions[0]; if (init_@lapack_func@(¶ms, n, n)) { LINEARIZE_DATA_t a_in, r_out; init_linearize_data(&a_in, n, n, steps[1], steps[0]); init_linearize_data(&r_out, n, n, steps[3], steps[2]); BEGIN_OUTER_LOOP_2 int not_ok; linearize_@TYPE@_matrix(params.A, args[0], &a_in); identity_@TYPE@_matrix(params.B, n); not_ok = call_@lapack_func@(¶ms); if (!not_ok) { delinearize_@TYPE@_matrix(args[1], params.B, &r_out); } else { error_occurred = 1; nan_@TYPE@_matrix(args[1], &r_out); } END_OUTER_LOOP release_@lapack_func@(¶ms); } set_fp_invalid_or_clear(error_occurred); } /**end repeat**/ /* -------------------------------------------------------------------------- */ /* Cholesky decomposition */ typedef struct potr_params_struct { void *A; fortran_int N; fortran_int LDA; char UPLO; } POTR_PARAMS_t; /**begin repeat #TYPE = FLOAT, DOUBLE, CFLOAT, CDOUBLE# #ftyp = fortran_real, fortran_doublereal, fortran_complex, fortran_doublecomplex# #lapack_func = spotrf, dpotrf, cpotrf, zpotrf# */ static NPY_INLINE fortran_int call_@lapack_func@(POTR_PARAMS_t *params) { fortran_int rv; LAPACK(@lapack_func@)(¶ms->UPLO, ¶ms->N, params->A, ¶ms->LDA, &rv); return rv; } static NPY_INLINE int init_@lapack_func@(POTR_PARAMS_t *params, char UPLO, fortran_int N) { npy_uint8 *mem_buff = NULL; npy_uint8 *a; size_t safe_N = N; fortran_int lda = fortran_int_max(N, 1); mem_buff = malloc(safe_N * safe_N * sizeof(@ftyp@)); if (!mem_buff) { goto error; } a = mem_buff; params->A = a; params->N = N; params->LDA = lda; params->UPLO = UPLO; return 1; error: free(mem_buff); memset(params, 0, sizeof(*params)); return 0; } static NPY_INLINE void release_@lapack_func@(POTR_PARAMS_t *params) { /* memory block base in A */ free(params->A); memset(params, 0, sizeof(*params)); } static void @TYPE@_cholesky(char uplo, char **args, npy_intp const *dimensions, npy_intp const *steps) { POTR_PARAMS_t params; int error_occurred = get_fp_invalid_and_clear(); fortran_int n; INIT_OUTER_LOOP_2 assert(uplo == 'L'); n = (fortran_int)dimensions[0]; if (init_@lapack_func@(¶ms, uplo, n)) { LINEARIZE_DATA_t a_in, r_out; init_linearize_data(&a_in, n, n, steps[1], steps[0]); init_linearize_data(&r_out, n, n, steps[3], steps[2]); BEGIN_OUTER_LOOP_2 int not_ok; linearize_@TYPE@_matrix(params.A, args[0], &a_in); not_ok = call_@lapack_func@(¶ms); if (!not_ok) { triu_@TYPE@_matrix(params.A, params.N); delinearize_@TYPE@_matrix(args[1], params.A, &r_out); } else { error_occurred = 1; nan_@TYPE@_matrix(args[1], &r_out); } END_OUTER_LOOP release_@lapack_func@(¶ms); } set_fp_invalid_or_clear(error_occurred); } static void @TYPE@_cholesky_lo(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { @TYPE@_cholesky('L', args, dimensions, steps); } /**end repeat**/ /* -------------------------------------------------------------------------- */ /* eig family */ typedef struct geev_params_struct { void *A; void *WR; /* RWORK in complex versions, REAL W buffer for (sd)geev*/ void *WI; void *VLR; /* REAL VL buffers for _geev where _ is s, d */ void *VRR; /* REAL VR buffers for _geev where _ is s, d */ void *WORK; void *W; /* final w */ void *VL; /* final vl */ void *VR; /* final vr */ fortran_int N; fortran_int LDA; fortran_int LDVL; fortran_int LDVR; fortran_int LWORK; char JOBVL; char JOBVR; } GEEV_PARAMS_t; static NPY_INLINE void dump_geev_params(const char *name, GEEV_PARAMS_t* params) { TRACE_TXT("\n%s\n" "\t%10s: %p\n"\ "\t%10s: %p\n"\ "\t%10s: %p\n"\ "\t%10s: %p\n"\ "\t%10s: %p\n"\ "\t%10s: %p\n"\ "\t%10s: %p\n"\ "\t%10s: %p\n"\ "\t%10s: %p\n"\ "\t%10s: %d\n"\ "\t%10s: %d\n"\ "\t%10s: %d\n"\ "\t%10s: %d\n"\ "\t%10s: %d\n"\ "\t%10s: %c\n"\ "\t%10s: %c\n", name, "A", params->A, "WR", params->WR, "WI", params->WI, "VLR", params->VLR, "VRR", params->VRR, "WORK", params->WORK, "W", params->W, "VL", params->VL, "VR", params->VR, "N", (int)params->N, "LDA", (int)params->LDA, "LDVL", (int)params->LDVL, "LDVR", (int)params->LDVR, "LWORK", (int)params->LWORK, "JOBVL", params->JOBVL, "JOBVR", params->JOBVR); } /**begin repeat #TYPE = FLOAT, DOUBLE# #CTYPE = CFLOAT, CDOUBLE# #typ = float, double# #complextyp = COMPLEX_t, DOUBLECOMPLEX_t# #lapack_func = sgeev, dgeev# #zero = 0.0f, 0.0# */ static NPY_INLINE fortran_int call_@lapack_func@(GEEV_PARAMS_t* params) { fortran_int rv; LAPACK(@lapack_func@)(¶ms->JOBVL, ¶ms->JOBVR, ¶ms->N, params->A, ¶ms->LDA, params->WR, params->WI, params->VLR, ¶ms->LDVL, params->VRR, ¶ms->LDVR, params->WORK, ¶ms->LWORK, &rv); return rv; } static NPY_INLINE int init_@lapack_func@(GEEV_PARAMS_t *params, char jobvl, char jobvr, fortran_int n) { npy_uint8 *mem_buff = NULL; npy_uint8 *mem_buff2 = NULL; npy_uint8 *a, *wr, *wi, *vlr, *vrr, *work, *w, *vl, *vr; size_t safe_n = n; size_t a_size = safe_n * safe_n * sizeof(@typ@); size_t wr_size = safe_n * sizeof(@typ@); size_t wi_size = safe_n * sizeof(@typ@); size_t vlr_size = jobvl=='V' ? safe_n * safe_n * sizeof(@typ@) : 0; size_t vrr_size = jobvr=='V' ? safe_n * safe_n * sizeof(@typ@) : 0; size_t w_size = wr_size*2; size_t vl_size = vlr_size*2; size_t vr_size = vrr_size*2; size_t work_count = 0; fortran_int ld = fortran_int_max(n, 1); /* allocate data for known sizes (all but work) */ mem_buff = malloc(a_size + wr_size + wi_size + vlr_size + vrr_size + w_size + vl_size + vr_size); if (!mem_buff) { goto error; } a = mem_buff; wr = a + a_size; wi = wr + wr_size; vlr = wi + wi_size; vrr = vlr + vlr_size; w = vrr + vrr_size; vl = w + w_size; vr = vl + vl_size; params->A = a; params->WR = wr; params->WI = wi; params->VLR = vlr; params->VRR = vrr; params->W = w; params->VL = vl; params->VR = vr; params->N = n; params->LDA = ld; params->LDVL = ld; params->LDVR = ld; params->JOBVL = jobvl; params->JOBVR = jobvr; /* Work size query */ { @typ@ work_size_query; params->LWORK = -1; params->WORK = &work_size_query; if (call_@lapack_func@(params) != 0) { goto error; } work_count = (size_t)work_size_query; } mem_buff2 = malloc(work_count*sizeof(@typ@)); if (!mem_buff2) { goto error; } work = mem_buff2; params->LWORK = (fortran_int)work_count; params->WORK = work; return 1; error: free(mem_buff2); free(mem_buff); memset(params, 0, sizeof(*params)); return 0; } static NPY_INLINE void mk_@TYPE@_complex_array_from_real(@complextyp@ *c, const @typ@ *re, size_t n) { size_t iter; for (iter = 0; iter < n; ++iter) { c[iter].array[0] = re[iter]; c[iter].array[1] = @zero@; } } static NPY_INLINE void mk_@TYPE@_complex_array(@complextyp@ *c, const @typ@ *re, const @typ@ *im, size_t n) { size_t iter; for (iter = 0; iter < n; ++iter) { c[iter].array[0] = re[iter]; c[iter].array[1] = im[iter]; } } static NPY_INLINE void mk_@TYPE@_complex_array_conjugate_pair(@complextyp@ *c, const @typ@ *r, size_t n) { size_t iter; for (iter = 0; iter < n; ++iter) { @typ@ re = r[iter]; @typ@ im = r[iter+n]; c[iter].array[0] = re; c[iter].array[1] = im; c[iter+n].array[0] = re; c[iter+n].array[1] = -im; } } /* * make the complex eigenvectors from the real array produced by sgeev/zgeev. * c is the array where the results will be left. * r is the source array of reals produced by sgeev/zgeev * i is the eigenvalue imaginary part produced by sgeev/zgeev * n is so that the order of the matrix is n by n */ static NPY_INLINE void mk_@lapack_func@_complex_eigenvectors(@complextyp@ *c, const @typ@ *r, const @typ@ *i, size_t n) { size_t iter = 0; while (iter < n) { if (i[iter] == @zero@) { /* eigenvalue was real, eigenvectors as well... */ mk_@TYPE@_complex_array_from_real(c, r, n); c += n; r += n; iter ++; } else { /* eigenvalue was complex, generate a pair of eigenvectors */ mk_@TYPE@_complex_array_conjugate_pair(c, r, n); c += 2*n; r += 2*n; iter += 2; } } } static NPY_INLINE void process_@lapack_func@_results(GEEV_PARAMS_t *params) { /* REAL versions of geev need the results to be translated * into complex versions. This is the way to deal with imaginary * results. In our gufuncs we will always return complex arrays! */ mk_@TYPE@_complex_array(params->W, params->WR, params->WI, params->N); /* handle the eigenvectors */ if ('V' == params->JOBVL) { mk_@lapack_func@_complex_eigenvectors(params->VL, params->VLR, params->WI, params->N); } if ('V' == params->JOBVR) { mk_@lapack_func@_complex_eigenvectors(params->VR, params->VRR, params->WI, params->N); } } /**end repeat**/ /**begin repeat #TYPE = CFLOAT, CDOUBLE# #typ = COMPLEX_t, DOUBLECOMPLEX_t# #ftyp = fortran_complex, fortran_doublecomplex# #realtyp = float, double# #lapack_func = cgeev, zgeev# */ static NPY_INLINE fortran_int call_@lapack_func@(GEEV_PARAMS_t* params) { fortran_int rv; LAPACK(@lapack_func@)(¶ms->JOBVL, ¶ms->JOBVR, ¶ms->N, params->A, ¶ms->LDA, params->W, params->VL, ¶ms->LDVL, params->VR, ¶ms->LDVR, params->WORK, ¶ms->LWORK, params->WR, /* actually RWORK */ &rv); return rv; } static NPY_INLINE int init_@lapack_func@(GEEV_PARAMS_t* params, char jobvl, char jobvr, fortran_int n) { npy_uint8 *mem_buff = NULL; npy_uint8 *mem_buff2 = NULL; npy_uint8 *a, *w, *vl, *vr, *work, *rwork; size_t safe_n = n; size_t a_size = safe_n * safe_n * sizeof(@ftyp@); size_t w_size = safe_n * sizeof(@ftyp@); size_t vl_size = jobvl=='V'? safe_n * safe_n * sizeof(@ftyp@) : 0; size_t vr_size = jobvr=='V'? safe_n * safe_n * sizeof(@ftyp@) : 0; size_t rwork_size = 2 * safe_n * sizeof(@realtyp@); size_t work_count = 0; size_t total_size = a_size + w_size + vl_size + vr_size + rwork_size; fortran_int ld = fortran_int_max(n, 1); mem_buff = malloc(total_size); if (!mem_buff) { goto error; } a = mem_buff; w = a + a_size; vl = w + w_size; vr = vl + vl_size; rwork = vr + vr_size; params->A = a; params->WR = rwork; params->WI = NULL; params->VLR = NULL; params->VRR = NULL; params->VL = vl; params->VR = vr; params->W = w; params->N = n; params->LDA = ld; params->LDVL = ld; params->LDVR = ld; params->JOBVL = jobvl; params->JOBVR = jobvr; /* Work size query */ { @typ@ work_size_query; params->LWORK = -1; params->WORK = &work_size_query; if (call_@lapack_func@(params) != 0) { goto error; } work_count = (size_t) work_size_query.array[0]; /* Fix a bug in lapack 3.0.0 */ if(work_count == 0) work_count = 1; } mem_buff2 = malloc(work_count*sizeof(@ftyp@)); if (!mem_buff2) { goto error; } work = mem_buff2; params->LWORK = (fortran_int)work_count; params->WORK = work; return 1; error: free(mem_buff2); free(mem_buff); memset(params, 0, sizeof(*params)); return 0; } static NPY_INLINE void process_@lapack_func@_results(GEEV_PARAMS_t *NPY_UNUSED(params)) { /* nothing to do here, complex versions are ready to copy out */ } /**end repeat**/ /**begin repeat #TYPE = FLOAT, DOUBLE, CDOUBLE# #COMPLEXTYPE = CFLOAT, CDOUBLE, CDOUBLE# #ftype = fortran_real, fortran_doublereal, fortran_doublecomplex# #lapack_func = sgeev, dgeev, zgeev# */ static NPY_INLINE void release_@lapack_func@(GEEV_PARAMS_t *params) { free(params->WORK); free(params->A); memset(params, 0, sizeof(*params)); } static NPY_INLINE void @TYPE@_eig_wrapper(char JOBVL, char JOBVR, char**args, npy_intp const *dimensions, npy_intp const *steps) { ptrdiff_t outer_steps[4]; size_t iter; size_t outer_dim = *dimensions++; size_t op_count = 2; int error_occurred = get_fp_invalid_and_clear(); GEEV_PARAMS_t geev_params; assert(JOBVL == 'N'); STACK_TRACE; op_count += 'V'==JOBVL?1:0; op_count += 'V'==JOBVR?1:0; for (iter = 0; iter < op_count; ++iter) { outer_steps[iter] = (ptrdiff_t) steps[iter]; } steps += op_count; if (init_@lapack_func@(&geev_params, JOBVL, JOBVR, (fortran_int)dimensions[0])) { LINEARIZE_DATA_t a_in; LINEARIZE_DATA_t w_out; LINEARIZE_DATA_t vl_out; LINEARIZE_DATA_t vr_out; init_linearize_data(&a_in, geev_params.N, geev_params.N, steps[1], steps[0]); steps += 2; init_linearize_data(&w_out, 1, geev_params.N, 0, steps[0]); steps += 1; if ('V' == geev_params.JOBVL) { init_linearize_data(&vl_out, geev_params.N, geev_params.N, steps[1], steps[0]); steps += 2; } if ('V' == geev_params.JOBVR) { init_linearize_data(&vr_out, geev_params.N, geev_params.N, steps[1], steps[0]); } for (iter = 0; iter < outer_dim; ++iter) { int not_ok; char **arg_iter = args; /* copy the matrix in */ linearize_@TYPE@_matrix(geev_params.A, *arg_iter++, &a_in); not_ok = call_@lapack_func@(&geev_params); if (!not_ok) { process_@lapack_func@_results(&geev_params); delinearize_@COMPLEXTYPE@_matrix(*arg_iter++, geev_params.W, &w_out); if ('V' == geev_params.JOBVL) { delinearize_@COMPLEXTYPE@_matrix(*arg_iter++, geev_params.VL, &vl_out); } if ('V' == geev_params.JOBVR) { delinearize_@COMPLEXTYPE@_matrix(*arg_iter++, geev_params.VR, &vr_out); } } else { /* geev failed */ error_occurred = 1; nan_@COMPLEXTYPE@_matrix(*arg_iter++, &w_out); if ('V' == geev_params.JOBVL) { nan_@COMPLEXTYPE@_matrix(*arg_iter++, &vl_out); } if ('V' == geev_params.JOBVR) { nan_@COMPLEXTYPE@_matrix(*arg_iter++, &vr_out); } } update_pointers((npy_uint8**)args, outer_steps, op_count); } release_@lapack_func@(&geev_params); } set_fp_invalid_or_clear(error_occurred); } static void @TYPE@_eig(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { @TYPE@_eig_wrapper('N', 'V', args, dimensions, steps); } static void @TYPE@_eigvals(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { @TYPE@_eig_wrapper('N', 'N', args, dimensions, steps); } /**end repeat**/ /* -------------------------------------------------------------------------- */ /* singular value decomposition */ typedef struct gessd_params_struct { void *A; void *S; void *U; void *VT; void *WORK; void *RWORK; void *IWORK; fortran_int M; fortran_int N; fortran_int LDA; fortran_int LDU; fortran_int LDVT; fortran_int LWORK; char JOBZ; } GESDD_PARAMS_t; static NPY_INLINE void dump_gesdd_params(const char *name, GESDD_PARAMS_t *params) { TRACE_TXT("\n%s:\n"\ "%14s: %18p\n"\ "%14s: %18p\n"\ "%14s: %18p\n"\ "%14s: %18p\n"\ "%14s: %18p\n"\ "%14s: %18p\n"\ "%14s: %18p\n"\ "%14s: %18d\n"\ "%14s: %18d\n"\ "%14s: %18d\n"\ "%14s: %18d\n"\ "%14s: %18d\n"\ "%14s: %18d\n"\ "%14s: %15c'%c'\n", name, "A", params->A, "S", params->S, "U", params->U, "VT", params->VT, "WORK", params->WORK, "RWORK", params->RWORK, "IWORK", params->IWORK, "M", (int)params->M, "N", (int)params->N, "LDA", (int)params->LDA, "LDU", (int)params->LDU, "LDVT", (int)params->LDVT, "LWORK", (int)params->LWORK, "JOBZ", ' ', params->JOBZ); } static NPY_INLINE int compute_urows_vtcolumns(char jobz, fortran_int m, fortran_int n, fortran_int *urows, fortran_int *vtcolumns) { fortran_int min_m_n = fortran_int_min(m, n); switch(jobz) { case 'N': *urows = 0; *vtcolumns = 0; break; case 'A': *urows = m; *vtcolumns = n; break; case 'S': { *urows = min_m_n; *vtcolumns = min_m_n; } break; default: return 0; } return 1; } /**begin repeat #TYPE = FLOAT, DOUBLE# #lapack_func = sgesdd, dgesdd# #ftyp = fortran_real, fortran_doublereal# */ static NPY_INLINE fortran_int call_@lapack_func@(GESDD_PARAMS_t *params) { fortran_int rv; LAPACK(@lapack_func@)(¶ms->JOBZ, ¶ms->M, ¶ms->N, params->A, ¶ms->LDA, params->S, params->U, ¶ms->LDU, params->VT, ¶ms->LDVT, params->WORK, ¶ms->LWORK, params->IWORK, &rv); return rv; } static NPY_INLINE int init_@lapack_func@(GESDD_PARAMS_t *params, char jobz, fortran_int m, fortran_int n) { npy_uint8 *mem_buff = NULL; npy_uint8 *mem_buff2 = NULL; npy_uint8 *a, *s, *u, *vt, *work, *iwork; size_t safe_m = m; size_t safe_n = n; size_t a_size = safe_m * safe_n * sizeof(@ftyp@); fortran_int min_m_n = fortran_int_min(m, n); size_t safe_min_m_n = min_m_n; size_t s_size = safe_min_m_n * sizeof(@ftyp@); fortran_int u_row_count, vt_column_count; size_t safe_u_row_count, safe_vt_column_count; size_t u_size, vt_size; fortran_int work_count; size_t work_size; size_t iwork_size = 8 * safe_min_m_n * sizeof(fortran_int); fortran_int ld = fortran_int_max(m, 1); if (!compute_urows_vtcolumns(jobz, m, n, &u_row_count, &vt_column_count)) { goto error; } safe_u_row_count = u_row_count; safe_vt_column_count = vt_column_count; u_size = safe_u_row_count * safe_m * sizeof(@ftyp@); vt_size = safe_n * safe_vt_column_count * sizeof(@ftyp@); mem_buff = malloc(a_size + s_size + u_size + vt_size + iwork_size); if (!mem_buff) { goto error; } a = mem_buff; s = a + a_size; u = s + s_size; vt = u + u_size; iwork = vt + vt_size; /* fix vt_column_count so that it is a valid lapack parameter (0 is not) */ vt_column_count = fortran_int_max(1, vt_column_count); params->M = m; params->N = n; params->A = a; params->S = s; params->U = u; params->VT = vt; params->RWORK = NULL; params->IWORK = iwork; params->LDA = ld; params->LDU = ld; params->LDVT = vt_column_count; params->JOBZ = jobz; /* Work size query */ { @ftyp@ work_size_query; params->LWORK = -1; params->WORK = &work_size_query; if (call_@lapack_func@(params) != 0) { goto error; } work_count = (fortran_int)work_size_query; /* Fix a bug in lapack 3.0.0 */ if(work_count == 0) work_count = 1; work_size = (size_t)work_count * sizeof(@ftyp@); } mem_buff2 = malloc(work_size); if (!mem_buff2) { goto error; } work = mem_buff2; params->LWORK = work_count; params->WORK = work; return 1; error: TRACE_TXT("%s failed init\n", __FUNCTION__); free(mem_buff); free(mem_buff2); memset(params, 0, sizeof(*params)); return 0; } /**end repeat**/ /**begin repeat #TYPE = CFLOAT, CDOUBLE# #ftyp = fortran_complex, fortran_doublecomplex# #frealtyp = fortran_real, fortran_doublereal# #typ = COMPLEX_t, DOUBLECOMPLEX_t# #lapack_func = cgesdd, zgesdd# */ static NPY_INLINE fortran_int call_@lapack_func@(GESDD_PARAMS_t *params) { fortran_int rv; LAPACK(@lapack_func@)(¶ms->JOBZ, ¶ms->M, ¶ms->N, params->A, ¶ms->LDA, params->S, params->U, ¶ms->LDU, params->VT, ¶ms->LDVT, params->WORK, ¶ms->LWORK, params->RWORK, params->IWORK, &rv); return rv; } static NPY_INLINE int init_@lapack_func@(GESDD_PARAMS_t *params, char jobz, fortran_int m, fortran_int n) { npy_uint8 *mem_buff = NULL, *mem_buff2 = NULL; npy_uint8 *a,*s, *u, *vt, *work, *rwork, *iwork; size_t a_size, s_size, u_size, vt_size, work_size, rwork_size, iwork_size; size_t safe_u_row_count, safe_vt_column_count; fortran_int u_row_count, vt_column_count, work_count; size_t safe_m = m; size_t safe_n = n; fortran_int min_m_n = fortran_int_min(m, n); size_t safe_min_m_n = min_m_n; fortran_int ld = fortran_int_max(m, 1); if (!compute_urows_vtcolumns(jobz, m, n, &u_row_count, &vt_column_count)) { goto error; } safe_u_row_count = u_row_count; safe_vt_column_count = vt_column_count; a_size = safe_m * safe_n * sizeof(@ftyp@); s_size = safe_min_m_n * sizeof(@frealtyp@); u_size = safe_u_row_count * safe_m * sizeof(@ftyp@); vt_size = safe_n * safe_vt_column_count * sizeof(@ftyp@); rwork_size = 'N'==jobz? (7 * safe_min_m_n) : (5*safe_min_m_n * safe_min_m_n + 5*safe_min_m_n); rwork_size *= sizeof(@ftyp@); iwork_size = 8 * safe_min_m_n* sizeof(fortran_int); mem_buff = malloc(a_size + s_size + u_size + vt_size + rwork_size + iwork_size); if (!mem_buff) { goto error; } a = mem_buff; s = a + a_size; u = s + s_size; vt = u + u_size; rwork = vt + vt_size; iwork = rwork + rwork_size; /* fix vt_column_count so that it is a valid lapack parameter (0 is not) */ vt_column_count = fortran_int_max(1, vt_column_count); params->A = a; params->S = s; params->U = u; params->VT = vt; params->RWORK = rwork; params->IWORK = iwork; params->M = m; params->N = n; params->LDA = ld; params->LDU = ld; params->LDVT = vt_column_count; params->JOBZ = jobz; /* Work size query */ { @ftyp@ work_size_query; params->LWORK = -1; params->WORK = &work_size_query; if (call_@lapack_func@(params) != 0) { goto error; } work_count = (fortran_int)((@typ@*)&work_size_query)->array[0]; /* Fix a bug in lapack 3.0.0 */ if(work_count == 0) work_count = 1; work_size = (size_t)work_count * sizeof(@ftyp@); } mem_buff2 = malloc(work_size); if (!mem_buff2) { goto error; } work = mem_buff2; params->LWORK = work_count; params->WORK = work; return 1; error: TRACE_TXT("%s failed init\n", __FUNCTION__); free(mem_buff2); free(mem_buff); memset(params, 0, sizeof(*params)); return 0; } /**end repeat**/ /**begin repeat #TYPE = FLOAT, DOUBLE, CFLOAT, CDOUBLE# #REALTYPE = FLOAT, DOUBLE, FLOAT, DOUBLE# #lapack_func = sgesdd, dgesdd, cgesdd, zgesdd# */ static NPY_INLINE void release_@lapack_func@(GESDD_PARAMS_t* params) { /* A and WORK contain allocated blocks */ free(params->A); free(params->WORK); memset(params, 0, sizeof(*params)); } static NPY_INLINE void @TYPE@_svd_wrapper(char JOBZ, char **args, npy_intp const *dimensions, npy_intp const *steps) { ptrdiff_t outer_steps[4]; int error_occurred = get_fp_invalid_and_clear(); size_t iter; size_t outer_dim = *dimensions++; size_t op_count = (JOBZ=='N')?2:4; GESDD_PARAMS_t params; for (iter = 0; iter < op_count; ++iter) { outer_steps[iter] = (ptrdiff_t) steps[iter]; } steps += op_count; if (init_@lapack_func@(¶ms, JOBZ, (fortran_int)dimensions[0], (fortran_int)dimensions[1])) { LINEARIZE_DATA_t a_in, u_out, s_out, v_out; fortran_int min_m_n = params.M < params.N ? params.M : params.N; init_linearize_data(&a_in, params.N, params.M, steps[1], steps[0]); if ('N' == params.JOBZ) { /* only the singular values are wanted */ init_linearize_data(&s_out, 1, min_m_n, 0, steps[2]); } else { fortran_int u_columns, v_rows; if ('S' == params.JOBZ) { u_columns = min_m_n; v_rows = min_m_n; } else { /* JOBZ == 'A' */ u_columns = params.M; v_rows = params.N; } init_linearize_data(&u_out, u_columns, params.M, steps[3], steps[2]); init_linearize_data(&s_out, 1, min_m_n, 0, steps[4]); init_linearize_data(&v_out, params.N, v_rows, steps[6], steps[5]); } for (iter = 0; iter < outer_dim; ++iter) { int not_ok; /* copy the matrix in */ linearize_@TYPE@_matrix(params.A, args[0], &a_in); not_ok = call_@lapack_func@(¶ms); if (!not_ok) { if ('N' == params.JOBZ) { delinearize_@REALTYPE@_matrix(args[1], params.S, &s_out); } else { if ('A' == params.JOBZ && min_m_n == 0) { /* Lapack has betrayed us and left these uninitialized, * so produce an identity matrix for whichever of u * and v is not empty. */ identity_@TYPE@_matrix(params.U, params.M); identity_@TYPE@_matrix(params.VT, params.N); } delinearize_@TYPE@_matrix(args[1], params.U, &u_out); delinearize_@REALTYPE@_matrix(args[2], params.S, &s_out); delinearize_@TYPE@_matrix(args[3], params.VT, &v_out); } } else { error_occurred = 1; if ('N' == params.JOBZ) { nan_@REALTYPE@_matrix(args[1], &s_out); } else { nan_@TYPE@_matrix(args[1], &u_out); nan_@REALTYPE@_matrix(args[2], &s_out); nan_@TYPE@_matrix(args[3], &v_out); } } update_pointers((npy_uint8**)args, outer_steps, op_count); } release_@lapack_func@(¶ms); } set_fp_invalid_or_clear(error_occurred); } /**end repeat*/ /* svd gufunc entry points */ /**begin repeat #TYPE = FLOAT, DOUBLE, CFLOAT, CDOUBLE# */ static void @TYPE@_svd_N(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { @TYPE@_svd_wrapper('N', args, dimensions, steps); } static void @TYPE@_svd_S(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { @TYPE@_svd_wrapper('S', args, dimensions, steps); } static void @TYPE@_svd_A(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { @TYPE@_svd_wrapper('A', args, dimensions, steps); } /**end repeat**/ /* -------------------------------------------------------------------------- */ /* least squares */ typedef struct gelsd_params_struct { fortran_int M; fortran_int N; fortran_int NRHS; void *A; fortran_int LDA; void *B; fortran_int LDB; void *S; void *RCOND; fortran_int RANK; void *WORK; fortran_int LWORK; void *RWORK; void *IWORK; } GELSD_PARAMS_t; static inline void dump_gelsd_params(const char *name, GELSD_PARAMS_t *params) { TRACE_TXT("\n%s:\n"\ "%14s: %18p\n"\ "%14s: %18p\n"\ "%14s: %18p\n"\ "%14s: %18p\n"\ "%14s: %18p\n"\ "%14s: %18p\n"\ "%14s: %18d\n"\ "%14s: %18d\n"\ "%14s: %18d\n"\ "%14s: %18d\n"\ "%14s: %18d\n"\ "%14s: %18d\n"\ "%14s: %18d\n"\ "%14s: %18p\n", name, "A", params->A, "B", params->B, "S", params->S, "WORK", params->WORK, "RWORK", params->RWORK, "IWORK", params->IWORK, "M", (int)params->M, "N", (int)params->N, "NRHS", (int)params->NRHS, "LDA", (int)params->LDA, "LDB", (int)params->LDB, "LWORK", (int)params->LWORK, "RANK", (int)params->RANK, "RCOND", params->RCOND); } /**begin repeat #TYPE=FLOAT,DOUBLE# #lapack_func=sgelsd,dgelsd# #ftyp=fortran_real,fortran_doublereal# */ static inline fortran_int call_@lapack_func@(GELSD_PARAMS_t *params) { fortran_int rv; LAPACK(@lapack_func@)(¶ms->M, ¶ms->N, ¶ms->NRHS, params->A, ¶ms->LDA, params->B, ¶ms->LDB, params->S, params->RCOND, ¶ms->RANK, params->WORK, ¶ms->LWORK, params->IWORK, &rv); return rv; } static inline int init_@lapack_func@(GELSD_PARAMS_t *params, fortran_int m, fortran_int n, fortran_int nrhs) { npy_uint8 *mem_buff = NULL; npy_uint8 *mem_buff2 = NULL; npy_uint8 *a, *b, *s, *work, *iwork; fortran_int min_m_n = fortran_int_min(m, n); fortran_int max_m_n = fortran_int_max(m, n); size_t safe_min_m_n = min_m_n; size_t safe_max_m_n = max_m_n; size_t safe_m = m; size_t safe_n = n; size_t safe_nrhs = nrhs; size_t a_size = safe_m * safe_n * sizeof(@ftyp@); size_t b_size = safe_max_m_n * safe_nrhs * sizeof(@ftyp@); size_t s_size = safe_min_m_n * sizeof(@ftyp@); fortran_int work_count; size_t work_size; size_t iwork_size; fortran_int lda = fortran_int_max(1, m); fortran_int ldb = fortran_int_max(1, fortran_int_max(m,n)); mem_buff = malloc(a_size + b_size + s_size); if (!mem_buff) goto error; a = mem_buff; b = a + a_size; s = b + b_size; params->M = m; params->N = n; params->NRHS = nrhs; params->A = a; params->B = b; params->S = s; params->LDA = lda; params->LDB = ldb; { /* compute optimal work size */ @ftyp@ work_size_query; fortran_int iwork_size_query; params->WORK = &work_size_query; params->IWORK = &iwork_size_query; params->RWORK = NULL; params->LWORK = -1; if (call_@lapack_func@(params) != 0) goto error; work_count = (fortran_int)work_size_query; work_size = (size_t) work_size_query * sizeof(@ftyp@); iwork_size = (size_t)iwork_size_query * sizeof(fortran_int); } mem_buff2 = malloc(work_size + iwork_size); if (!mem_buff2) goto error; work = mem_buff2; iwork = work + work_size; params->WORK = work; params->RWORK = NULL; params->IWORK = iwork; params->LWORK = work_count; return 1; error: TRACE_TXT("%s failed init\n", __FUNCTION__); free(mem_buff); free(mem_buff2); memset(params, 0, sizeof(*params)); return 0; } /**end repeat**/ /**begin repeat #TYPE=CFLOAT,CDOUBLE# #ftyp=fortran_complex,fortran_doublecomplex# #frealtyp=fortran_real,fortran_doublereal# #typ=COMPLEX_t,DOUBLECOMPLEX_t# #lapack_func=cgelsd,zgelsd# */ static inline fortran_int call_@lapack_func@(GELSD_PARAMS_t *params) { fortran_int rv; LAPACK(@lapack_func@)(¶ms->M, ¶ms->N, ¶ms->NRHS, params->A, ¶ms->LDA, params->B, ¶ms->LDB, params->S, params->RCOND, ¶ms->RANK, params->WORK, ¶ms->LWORK, params->RWORK, params->IWORK, &rv); return rv; } static inline int init_@lapack_func@(GELSD_PARAMS_t *params, fortran_int m, fortran_int n, fortran_int nrhs) { npy_uint8 *mem_buff = NULL; npy_uint8 *mem_buff2 = NULL; npy_uint8 *a, *b, *s, *work, *iwork, *rwork; fortran_int min_m_n = fortran_int_min(m, n); fortran_int max_m_n = fortran_int_max(m, n); size_t safe_min_m_n = min_m_n; size_t safe_max_m_n = max_m_n; size_t safe_m = m; size_t safe_n = n; size_t safe_nrhs = nrhs; size_t a_size = safe_m * safe_n * sizeof(@ftyp@); size_t b_size = safe_max_m_n * safe_nrhs * sizeof(@ftyp@); size_t s_size = safe_min_m_n * sizeof(@frealtyp@); fortran_int work_count; size_t work_size, rwork_size, iwork_size; fortran_int lda = fortran_int_max(1, m); fortran_int ldb = fortran_int_max(1, fortran_int_max(m,n)); mem_buff = malloc(a_size + b_size + s_size); if (!mem_buff) goto error; a = mem_buff; b = a + a_size; s = b + b_size; params->M = m; params->N = n; params->NRHS = nrhs; params->A = a; params->B = b; params->S = s; params->LDA = lda; params->LDB = ldb; { /* compute optimal work size */ @ftyp@ work_size_query; @frealtyp@ rwork_size_query; fortran_int iwork_size_query; params->WORK = &work_size_query; params->IWORK = &iwork_size_query; params->RWORK = &rwork_size_query; params->LWORK = -1; if (call_@lapack_func@(params) != 0) goto error; work_count = (fortran_int)work_size_query.r; work_size = (size_t )work_size_query.r * sizeof(@ftyp@); rwork_size = (size_t)rwork_size_query * sizeof(@frealtyp@); iwork_size = (size_t)iwork_size_query * sizeof(fortran_int); } mem_buff2 = malloc(work_size + rwork_size + iwork_size); if (!mem_buff2) goto error; work = mem_buff2; rwork = work + work_size; iwork = rwork + rwork_size; params->WORK = work; params->RWORK = rwork; params->IWORK = iwork; params->LWORK = work_count; return 1; error: TRACE_TXT("%s failed init\n", __FUNCTION__); free(mem_buff); free(mem_buff2); memset(params, 0, sizeof(*params)); return 0; } /**end repeat**/ /**begin repeat #TYPE=FLOAT,DOUBLE,CFLOAT,CDOUBLE# #REALTYPE=FLOAT,DOUBLE,FLOAT,DOUBLE# #lapack_func=sgelsd,dgelsd,cgelsd,zgelsd# #dot_func=sdot,ddot,cdotc,zdotc# #typ = npy_float, npy_double, npy_cfloat, npy_cdouble# #basetyp = npy_float, npy_double, npy_float, npy_double# #ftyp = fortran_real, fortran_doublereal, fortran_complex, fortran_doublecomplex# #cmplx = 0, 0, 1, 1# */ static inline void release_@lapack_func@(GELSD_PARAMS_t* params) { /* A and WORK contain allocated blocks */ free(params->A); free(params->WORK); memset(params, 0, sizeof(*params)); } /** Compute the squared l2 norm of a contiguous vector */ static @basetyp@ @TYPE@_abs2(@typ@ *p, npy_intp n) { npy_intp i; @basetyp@ res = 0; for (i = 0; i < n; i++) { @typ@ el = p[i]; #if @cmplx@ res += el.real*el.real + el.imag*el.imag; #else res += el*el; #endif } return res; } static void @TYPE@_lstsq(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { GELSD_PARAMS_t params; int error_occurred = get_fp_invalid_and_clear(); fortran_int n, m, nrhs; fortran_int excess; INIT_OUTER_LOOP_7 m = (fortran_int)dimensions[0]; n = (fortran_int)dimensions[1]; nrhs = (fortran_int)dimensions[2]; excess = m - n; if (init_@lapack_func@(¶ms, m, n, nrhs)) { LINEARIZE_DATA_t a_in, b_in, x_out, s_out, r_out; init_linearize_data(&a_in, n, m, steps[1], steps[0]); init_linearize_data_ex(&b_in, nrhs, m, steps[3], steps[2], fortran_int_max(n, m)); init_linearize_data_ex(&x_out, nrhs, n, steps[5], steps[4], fortran_int_max(n, m)); init_linearize_data(&r_out, 1, nrhs, 1, steps[6]); init_linearize_data(&s_out, 1, fortran_int_min(n, m), 1, steps[7]); BEGIN_OUTER_LOOP_7 int not_ok; linearize_@TYPE@_matrix(params.A, args[0], &a_in); linearize_@TYPE@_matrix(params.B, args[1], &b_in); params.RCOND = args[2]; not_ok = call_@lapack_func@(¶ms); if (!not_ok) { delinearize_@TYPE@_matrix(args[3], params.B, &x_out); *(npy_int*) args[5] = params.RANK; delinearize_@REALTYPE@_matrix(args[6], params.S, &s_out); /* Note that linalg.lstsq discards this when excess == 0 */ if (excess >= 0 && params.RANK == n) { /* Compute the residuals as the square sum of each column */ int i; char *resid = args[4]; @ftyp@ *components = (@ftyp@ *)params.B + n; for (i = 0; i < nrhs; i++) { @ftyp@ *vector = components + i*m; /* Numpy and fortran floating types are the same size, * so this cast is safe */ @basetyp@ abs2 = @TYPE@_abs2((@typ@ *)vector, excess); memcpy( resid + i*r_out.column_strides, &abs2, sizeof(abs2)); } } else { /* Note that this is always discarded by linalg.lstsq */ nan_@REALTYPE@_matrix(args[4], &r_out); } } else { error_occurred = 1; nan_@TYPE@_matrix(args[3], &x_out); nan_@REALTYPE@_matrix(args[4], &r_out); *(npy_int*) args[5] = -1; nan_@REALTYPE@_matrix(args[6], &s_out); } END_OUTER_LOOP release_@lapack_func@(¶ms); } set_fp_invalid_or_clear(error_occurred); } /**end repeat**/ #pragma GCC diagnostic pop /* -------------------------------------------------------------------------- */ /* gufunc registration */ static void *array_of_nulls[] = { (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL }; #define FUNC_ARRAY_NAME(NAME) NAME ## _funcs #define GUFUNC_FUNC_ARRAY_REAL(NAME) \ static PyUFuncGenericFunction \ FUNC_ARRAY_NAME(NAME)[] = { \ FLOAT_ ## NAME, \ DOUBLE_ ## NAME \ } #define GUFUNC_FUNC_ARRAY_REAL_COMPLEX(NAME) \ static PyUFuncGenericFunction \ FUNC_ARRAY_NAME(NAME)[] = { \ FLOAT_ ## NAME, \ DOUBLE_ ## NAME, \ CFLOAT_ ## NAME, \ CDOUBLE_ ## NAME \ } /* There are problems with eig in complex single precision. * That kernel is disabled */ #define GUFUNC_FUNC_ARRAY_EIG(NAME) \ static PyUFuncGenericFunction \ FUNC_ARRAY_NAME(NAME)[] = { \ FLOAT_ ## NAME, \ DOUBLE_ ## NAME, \ CDOUBLE_ ## NAME \ } GUFUNC_FUNC_ARRAY_REAL_COMPLEX(slogdet); GUFUNC_FUNC_ARRAY_REAL_COMPLEX(det); GUFUNC_FUNC_ARRAY_REAL_COMPLEX(eighlo); GUFUNC_FUNC_ARRAY_REAL_COMPLEX(eighup); GUFUNC_FUNC_ARRAY_REAL_COMPLEX(eigvalshlo); GUFUNC_FUNC_ARRAY_REAL_COMPLEX(eigvalshup); GUFUNC_FUNC_ARRAY_REAL_COMPLEX(solve); GUFUNC_FUNC_ARRAY_REAL_COMPLEX(solve1); GUFUNC_FUNC_ARRAY_REAL_COMPLEX(inv); GUFUNC_FUNC_ARRAY_REAL_COMPLEX(cholesky_lo); GUFUNC_FUNC_ARRAY_REAL_COMPLEX(svd_N); GUFUNC_FUNC_ARRAY_REAL_COMPLEX(svd_S); GUFUNC_FUNC_ARRAY_REAL_COMPLEX(svd_A); GUFUNC_FUNC_ARRAY_REAL_COMPLEX(lstsq); GUFUNC_FUNC_ARRAY_EIG(eig); GUFUNC_FUNC_ARRAY_EIG(eigvals); static char equal_2_types[] = { NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_CFLOAT, NPY_CFLOAT, NPY_CDOUBLE, NPY_CDOUBLE }; static char equal_3_types[] = { NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_CFLOAT, NPY_CFLOAT, NPY_CFLOAT, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CDOUBLE }; /* second result is logdet, that will always be a REAL */ static char slogdet_types[] = { NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_CFLOAT, NPY_CFLOAT, NPY_FLOAT, NPY_CDOUBLE, NPY_CDOUBLE, NPY_DOUBLE }; static char eigh_types[] = { NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_CFLOAT, NPY_FLOAT, NPY_CFLOAT, NPY_CDOUBLE, NPY_DOUBLE, NPY_CDOUBLE }; static char eighvals_types[] = { NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_CFLOAT, NPY_FLOAT, NPY_CDOUBLE, NPY_DOUBLE }; static char eig_types[] = { NPY_FLOAT, NPY_CFLOAT, NPY_CFLOAT, NPY_DOUBLE, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CDOUBLE }; static char eigvals_types[] = { NPY_FLOAT, NPY_CFLOAT, NPY_DOUBLE, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CDOUBLE }; static char svd_1_1_types[] = { NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_CFLOAT, NPY_FLOAT, NPY_CDOUBLE, NPY_DOUBLE }; static char svd_1_3_types[] = { NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_CFLOAT, NPY_CFLOAT, NPY_FLOAT, NPY_CFLOAT, NPY_CDOUBLE, NPY_CDOUBLE, NPY_DOUBLE, NPY_CDOUBLE }; /* A, b, rcond, x, resid, rank, s, */ static char lstsq_types[] = { NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_INT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_INT, NPY_DOUBLE, NPY_CFLOAT, NPY_CFLOAT, NPY_FLOAT, NPY_CFLOAT, NPY_FLOAT, NPY_INT, NPY_FLOAT, NPY_CDOUBLE, NPY_CDOUBLE, NPY_DOUBLE, NPY_CDOUBLE, NPY_DOUBLE, NPY_INT, NPY_DOUBLE, }; typedef struct gufunc_descriptor_struct { char *name; char *signature; char *doc; int ntypes; int nin; int nout; PyUFuncGenericFunction *funcs; char *types; } GUFUNC_DESCRIPTOR_t; GUFUNC_DESCRIPTOR_t gufunc_descriptors [] = { { "slogdet", "(m,m)->(),()", "slogdet on the last two dimensions and broadcast on the rest. \n"\ "Results in two arrays, one with sign and the other with log of the"\ " determinants. \n"\ " \"(m,m)->(),()\" \n", 4, 1, 2, FUNC_ARRAY_NAME(slogdet), slogdet_types }, { "det", "(m,m)->()", "det of the last two dimensions and broadcast on the rest. \n"\ " \"(m,m)->()\" \n", 4, 1, 1, FUNC_ARRAY_NAME(det), equal_2_types }, { "eigh_lo", "(m,m)->(m),(m,m)", "eigh on the last two dimension and broadcast to the rest, using"\ " lower triangle \n"\ "Results in a vector of eigenvalues and a matrix with the"\ "eigenvectors. \n"\ " \"(m,m)->(m),(m,m)\" \n", 4, 1, 2, FUNC_ARRAY_NAME(eighlo), eigh_types }, { "eigh_up", "(m,m)->(m),(m,m)", "eigh on the last two dimension and broadcast to the rest, using"\ " upper triangle. \n"\ "Results in a vector of eigenvalues and a matrix with the"\ " eigenvectors. \n"\ " \"(m,m)->(m),(m,m)\" \n", 4, 1, 2, FUNC_ARRAY_NAME(eighup), eigh_types }, { "eigvalsh_lo", "(m,m)->(m)", "eigh on the last two dimension and broadcast to the rest, using"\ " lower triangle. \n"\ "Results in a vector of eigenvalues and a matrix with the"\ "eigenvectors. \n"\ " \"(m,m)->(m)\" \n", 4, 1, 1, FUNC_ARRAY_NAME(eigvalshlo), eighvals_types }, { "eigvalsh_up", "(m,m)->(m)", "eigvalsh on the last two dimension and broadcast to the rest,"\ " using upper triangle. \n"\ "Results in a vector of eigenvalues and a matrix with the"\ "eigenvectors.\n"\ " \"(m,m)->(m)\" \n", 4, 1, 1, FUNC_ARRAY_NAME(eigvalshup), eighvals_types }, { "solve", "(m,m),(m,n)->(m,n)", "solve the system a x = b, on the last two dimensions, broadcast"\ " to the rest. \n"\ "Results in a matrices with the solutions. \n"\ " \"(m,m),(m,n)->(m,n)\" \n", 4, 2, 1, FUNC_ARRAY_NAME(solve), equal_3_types }, { "solve1", "(m,m),(m)->(m)", "solve the system a x = b, for b being a vector, broadcast in"\ " the outer dimensions. \n"\ "Results in vectors with the solutions. \n"\ " \"(m,m),(m)->(m)\" \n", 4, 2, 1, FUNC_ARRAY_NAME(solve1), equal_3_types }, { "inv", "(m, m)->(m, m)", "compute the inverse of the last two dimensions and broadcast"\ " to the rest. \n"\ "Results in the inverse matrices. \n"\ " \"(m,m)->(m,m)\" \n", 4, 1, 1, FUNC_ARRAY_NAME(inv), equal_2_types }, { "cholesky_lo", "(m,m)->(m,m)", "cholesky decomposition of hermitian positive-definite matrices. \n"\ "Broadcast to all outer dimensions. \n"\ " \"(m,m)->(m,m)\" \n", 4, 1, 1, FUNC_ARRAY_NAME(cholesky_lo), equal_2_types }, { "svd_m", "(m,n)->(m)", "svd when n>=m. ", 4, 1, 1, FUNC_ARRAY_NAME(svd_N), svd_1_1_types }, { "svd_n", "(m,n)->(n)", "svd when n<=m", 4, 1, 1, FUNC_ARRAY_NAME(svd_N), svd_1_1_types }, { "svd_m_s", "(m,n)->(m,m),(m),(m,n)", "svd when m<=n", 4, 1, 3, FUNC_ARRAY_NAME(svd_S), svd_1_3_types }, { "svd_n_s", "(m,n)->(m,n),(n),(n,n)", "svd when m>=n", 4, 1, 3, FUNC_ARRAY_NAME(svd_S), svd_1_3_types }, { "svd_m_f", "(m,n)->(m,m),(m),(n,n)", "svd when m<=n", 4, 1, 3, FUNC_ARRAY_NAME(svd_A), svd_1_3_types }, { "svd_n_f", "(m,n)->(m,m),(n),(n,n)", "svd when m>=n", 4, 1, 3, FUNC_ARRAY_NAME(svd_A), svd_1_3_types }, { "eig", "(m,m)->(m),(m,m)", "eig on the last two dimension and broadcast to the rest. \n"\ "Results in a vector with the eigenvalues and a matrix with the"\ " eigenvectors. \n"\ " \"(m,m)->(m),(m,m)\" \n", 3, 1, 2, FUNC_ARRAY_NAME(eig), eig_types }, { "eigvals", "(m,m)->(m)", "eigvals on the last two dimension and broadcast to the rest. \n"\ "Results in a vector of eigenvalues. \n", 3, 1, 1, FUNC_ARRAY_NAME(eigvals), eigvals_types }, { "lstsq_m", "(m,n),(m,nrhs),()->(n,nrhs),(nrhs),(),(m)", "least squares on the last two dimensions and broadcast to the rest. \n"\ "For m <= n. \n", 4, 3, 4, FUNC_ARRAY_NAME(lstsq), lstsq_types }, { "lstsq_n", "(m,n),(m,nrhs),()->(n,nrhs),(nrhs),(),(n)", "least squares on the last two dimensions and broadcast to the rest. \n"\ "For m >= n, meaning that residuals are produced. \n", 4, 3, 4, FUNC_ARRAY_NAME(lstsq), lstsq_types } }; static int addUfuncs(PyObject *dictionary) { PyObject *f; int i; const int gufunc_count = sizeof(gufunc_descriptors)/ sizeof(gufunc_descriptors[0]); for (i = 0; i < gufunc_count; i++) { GUFUNC_DESCRIPTOR_t* d = &gufunc_descriptors[i]; f = PyUFunc_FromFuncAndDataAndSignature(d->funcs, array_of_nulls, d->types, d->ntypes, d->nin, d->nout, PyUFunc_None, d->name, d->doc, 0, d->signature); if (f == NULL) { return -1; } #if 0 dump_ufunc_object((PyUFuncObject*) f); #endif int ret = PyDict_SetItemString(dictionary, d->name, f); Py_DECREF(f); if (ret < 0) { return -1; } } return 0; } /* -------------------------------------------------------------------------- */ /* Module initialization stuff */ static PyMethodDef UMath_LinAlgMethods[] = { {NULL, NULL, 0, NULL} /* Sentinel */ }; static struct PyModuleDef moduledef = { PyModuleDef_HEAD_INIT, UMATH_LINALG_MODULE_NAME, NULL, -1, UMath_LinAlgMethods, NULL, NULL, NULL, NULL }; PyObject *PyInit__umath_linalg(void) { PyObject *m; PyObject *d; PyObject *version; init_constants(); m = PyModule_Create(&moduledef); if (m == NULL) { return NULL; } import_array(); import_ufunc(); d = PyModule_GetDict(m); if (d == NULL) { return NULL; } version = PyUnicode_FromString(umath_linalg_version_string); if (version == NULL) { return NULL; } int ret = PyDict_SetItemString(d, "__version__", version); Py_DECREF(version); if (ret < 0) { return NULL; } /* Load the ufunc operators into the module's namespace */ if (addUfuncs(d) < 0) { return NULL; } return m; }