Merge branch 'numpy:main' into doc-improveGenShuffleDoc

author: Bhavuk kalra <bhavukkalra1786@gmail.com> 2022-04-03 23:55:56 +0530
committer: GitHub <noreply@github.com> 2022-04-03 23:55:56 +0530
commit: 6aa2fcaecf8e56b61143f34269d04fc55f8cabf1 (patch)
tree: 00276e2b56888f48a6e2a6756cd4af52fef46862
parent: 65f4baf28a5f8314ea7e3b9059f0f844efa129a0 (diff)
parent: 37bbfc9012ea46427143c0aa865d4cc7d8c94b9a (diff)
download: numpy-6aa2fcaecf8e56b61143f34269d04fc55f8cabf1.tar.gz
28 files changed, 2301 insertions, 1180 deletions
diff --git a/.gitpod.yml b/.gitpod.yml
index c46752f10..096483fd6 100644
--- a/.gitpod.yml
+++ b/.gitpod.yml
@@ -19,6 +19,8 @@ tasks:
       echo "📖 Building docs 📖 "
       cd doc
       make html
+      echo "Installing dependencies for documentation Live-Preview"
+      pip install esbonio
       echo "✨ Pre-build complete! You can close this terminal ✨ "
 
 # --------------------------------------------------------
diff --git a/doc/release/upcoming_changes/21154.improvement.rst b/doc/release/upcoming_changes/21154.improvement.rst
new file mode 100644
index 000000000..38630b47b
--- /dev/null
+++ b/doc/release/upcoming_changes/21154.improvement.rst
@@ -0,0 +1,7 @@
+Math C library feature detection now uses correct signatures
+------------------------------------------------------------
+Compiling is preceded by a detection phase to determine whether the
+underlying libc supports certain math operations. Previously this code
+did not respect the proper signatures. Fixing this enables compilation
+for the ``wasm-ld`` backend (compilation for web assembly) and reduces
+the number of warnings.
diff --git a/doc/source/dev/index.rst b/doc/source/dev/index.rst
index a8c969267..46ff3a5b2 100644
--- a/doc/source/dev/index.rst
+++ b/doc/source/dev/index.rst
@@ -27,8 +27,10 @@ the `numpy-discussion mailing list <https://mail.python.org/mailman/listinfo/num
 or on `GitHub <https://github.com/numpy/numpy>`__ (open an issue or comment on a
 relevant issue). These are our preferred communication channels (open source is open
 by nature!), however if you prefer to discuss in private first, please reach out to
-our community coordinators at `numpy-team@googlegroups.com` or `numpy-team.slack.com`
-(send an email to `numpy-team@googlegroups.com` for an invite the first time).
+our community coordinators at `numpy-team@googlegroups.com
+<mailto://numpy-team@googlegroups.com>`_ or `numpy-team.slack.com
+<https://numpy-team.slack.com>`__ (send an email to `numpy-team@googlegroups.com`_ for an
+invite the first time).
 
 Development process - summary
 =============================
@@ -53,7 +55,7 @@ Here's the short summary, complete TOC links are below:
 
       git remote add upstream https://github.com/numpy/numpy.git
 
-   * Now, `git remote -v` will show two remote repositories named:
+   * Now, ``git remote -v`` will show two remote repositories named:
 
      - ``upstream``, which refers to the ``numpy`` repository
      - ``origin``, which refers to your personal fork
diff --git a/environment.yml b/environment.yml
index f2be74f62..922fb55dc 100644
--- a/environment.yml
+++ b/environment.yml
@@ -29,7 +29,8 @@ dependencies:
   - pandas
   - matplotlib
   - pydata-sphinx-theme=0.7.2
-  - breathe
+  # NOTE: breathe 4.33.0 collides with sphinx.ext.graphviz
+  - breathe!=4.33.0
   # For linting
   - pycodestyle=2.7.0
   - gitpython
diff --git a/numpy/core/feature_detection_locale.h b/numpy/core/feature_detection_locale.h
new file mode 100644
index 000000000..0af1d6e7e
--- /dev/null
+++ b/numpy/core/feature_detection_locale.h
@@ -0,0 +1 @@
+long double strtold_l(const char*, char**, locale_t);
diff --git a/numpy/core/feature_detection_math.h b/numpy/core/feature_detection_math.h
new file mode 100644
index 000000000..27ad7bcf0
--- /dev/null
+++ b/numpy/core/feature_detection_math.h
@@ -0,0 +1,107 @@
+double expm1(double);
+double log1p(double);
+double acosh(double);
+double asinh(double);
+double atanh(double);
+double rint(double);
+double trunc(double);
+double exp2(double);
+double log2(double);
+double hypot(double, double);
+double atan2(double, double);
+double pow(double, double);
+double copysign(double, double);
+double nextafter(double, double);
+long long strtoll(const char*, char**, int);
+unsigned long long strtoull(const char*, char**, int);
+double cbrt(double);
+long double sinl(long double);
+long double cosl(long double);
+long double tanl(long double);
+long double sinhl(long double);
+long double coshl(long double);
+long double tanhl(long double);
+long double fabsl(long double);
+long double floorl(long double);
+long double ceill(long double);
+long double rintl(long double);
+long double truncl(long double);
+long double sqrtl(long double);
+long double log10l(long double);
+long double logl(long double);
+long double log1pl(long double);
+long double expl(long double);
+long double expm1l(long double);
+long double asinl(long double);
+long double acosl(long double);
+long double atanl(long double);
+long double asinhl(long double);
+long double acoshl(long double);
+long double atanhl(long double);
+long double hypotl(long double, long double);
+long double atan2l(long double, long double);
+long double powl(long double, long double);
+long double fmodl(long double, long double);
+long double modfl(long double, long double*);
+long double frexpl(long double, int*);
+long double ldexpl(long double, int);
+long double exp2l(long double);
+long double log2l(long double);
+long double copysignl(long double, long double);
+long double nextafterl(long double, long double);
+long double cbrtl(long double);
+float sinf(float);
+float cosf(float);
+float tanf(float);
+float sinhf(float);
+float coshf(float);
+float tanhf(float);
+float fabsf(float);
+float floorf(float);
+float ceilf(float);
+float rintf(float);
+float truncf(float);
+float sqrtf(float);
+float log10f(float);
+float logf(float);
+float log1pf(float);
+float expf(float);
+float expm1f(float);
+float asinf(float);
+float acosf(float);
+float atanf(float);
+float asinhf(float);
+float acoshf(float);
+float atanhf(float);
+float hypotf(float, float);
+float atan2f(float, float);
+float powf(float, float);
+float fmodf(float, float);
+float modff(float, float*);
+float frexpf(float, int*);
+float ldexpf(float, int);
+float exp2f(float);
+float log2f(float);
+float copysignf(float, float);
+float nextafterf(float, float);
+float cbrtf(float);
+double sin(double);
+double cos(double);
+double tan(double);
+double sinh(double);
+double cosh(double);
+double tanh(double);
+double fabs(double);
+double floor(double);
+double ceil(double);
+double sqrt(double);
+double log10(double);
+double log(double);
+double exp(double);
+double asin(double);
+double acos(double);
+double atan(double);
+double fmod(double, double);
+double modf(double, double*);
+double frexp(double, int*);
+double ldexp(double, int);
diff --git a/numpy/core/feature_detection_misc.h b/numpy/core/feature_detection_misc.h
new file mode 100644
index 000000000..f58cf4b62
--- /dev/null
+++ b/numpy/core/feature_detection_misc.h
@@ -0,0 +1,4 @@
+#include <stddef.h>
+
+int backtrace(void **, int);
+int madvise(void *, size_t, int);
diff --git a/numpy/core/feature_detection_stdio.h b/numpy/core/feature_detection_stdio.h
new file mode 100644
index 000000000..bc14d16d0
--- /dev/null
+++ b/numpy/core/feature_detection_stdio.h
@@ -0,0 +1,6 @@
+#include <stdio.h>
+#include <fcntl.h>
+
+off_t ftello(FILE *stream);
+int fseeko(FILE *stream, off_t offset, int whence);
+int fallocate(int, int, off_t, off_t);
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index c4222d7c0..5b946d0d4 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -129,32 +129,48 @@ def win32_checks(deflist):
         deflist.append('FORCE_NO_LONG_DOUBLE_FORMATTING')
 
 def check_math_capabilities(config, ext, moredefs, mathlibs):
-    def check_func(func_name):
-        return config.check_func(func_name, libraries=mathlibs,
-                                 decl=True, call=True)
-
-    def check_funcs_once(funcs_name):
-        decl = dict([(f, True) for f in funcs_name])
-        st = config.check_funcs_once(funcs_name, libraries=mathlibs,
-                                     decl=decl, call=decl)
+    def check_func(
+        func_name,
+        decl=False,
+        headers=["feature_detection_math.h"],
+    ):
+        return config.check_func(
+            func_name,
+            libraries=mathlibs,
+            decl=decl,
+            call=True,
+            call_args=FUNC_CALL_ARGS[func_name],
+            headers=headers,
+        )
+
+    def check_funcs_once(funcs_name, headers=["feature_detection_math.h"]):
+        call = dict([(f, True) for f in funcs_name])
+        call_args = dict([(f, FUNC_CALL_ARGS[f]) for f in funcs_name])
+        st = config.check_funcs_once(
+            funcs_name,
+            libraries=mathlibs,
+            decl=False,
+            call=call,
+            call_args=call_args,
+            headers=headers,
+        )
         if st:
             moredefs.extend([(fname2def(f), 1) for f in funcs_name])
         return st
 
-    def check_funcs(funcs_name):
+    def check_funcs(funcs_name, headers=["feature_detection_math.h"]):
         # Use check_funcs_once first, and if it does not work, test func per
         # func. Return success only if all the functions are available
-        if not check_funcs_once(funcs_name):
+        if not check_funcs_once(funcs_name, headers=headers):
             # Global check failed, check func per func
             for f in funcs_name:
-                if check_func(f):
+                if check_func(f, headers=headers):
                     moredefs.append((fname2def(f), 1))
             return 0
         else:
             return 1
 
     #use_msvc = config.check_decl("_MSC_VER")
-
     if not check_funcs_once(MANDATORY_FUNCS):
         raise SystemError("One of the required function to build numpy is not"
                 " available (the list is %s)." % str(MANDATORY_FUNCS))
@@ -169,15 +185,34 @@ def check_math_capabilities(config, ext, moredefs, mathlibs):
     for f in OPTIONAL_STDFUNCS_MAYBE:
         if config.check_decl(fname2def(f),
                     headers=["Python.h", "math.h"]):
-            OPTIONAL_STDFUNCS.remove(f)
+            if f in OPTIONAL_STDFUNCS:
+                OPTIONAL_STDFUNCS.remove(f)
+            else:
+                OPTIONAL_FILE_FUNCS.remove(f)
+
 
     check_funcs(OPTIONAL_STDFUNCS)
+    check_funcs(OPTIONAL_FILE_FUNCS, headers=["feature_detection_stdio.h"])
+    check_funcs(OPTIONAL_MISC_FUNCS, headers=["feature_detection_misc.h"])
+    
+
 
     for h in OPTIONAL_HEADERS:
         if config.check_func("", decl=False, call=False, headers=[h]):
             h = h.replace(".", "_").replace(os.path.sep, "_")
             moredefs.append((fname2def(h), 1))
 
+    # Try with both "locale.h" and "xlocale.h"
+    locale_headers = [
+        "stdlib.h",
+        "xlocale.h",
+        "feature_detection_locale.h",
+    ]
+    if not check_funcs(OPTIONAL_LOCALE_FUNCS, headers=locale_headers):
+        # It didn't work with xlocale.h, maybe it will work with locale.h?
+        locale_headers[1] = "locale.h"
+        check_funcs(OPTIONAL_LOCALE_FUNCS, headers=locale_headers)
+
     for tup in OPTIONAL_INTRINSICS:
         headers = None
         if len(tup) == 2:
@@ -398,20 +433,28 @@ def check_types(config_cmd, ext, build_dir):
 def check_mathlib(config_cmd):
     # Testing the C math library
     mathlibs = []
-    mathlibs_choices = [[], ['m'], ['cpml']]
-    mathlib = os.environ.get('MATHLIB')
+    mathlibs_choices = [[], ["m"], ["cpml"]]
+    mathlib = os.environ.get("MATHLIB")
     if mathlib:
-        mathlibs_choices.insert(0, mathlib.split(','))
+        mathlibs_choices.insert(0, mathlib.split(","))
     for libs in mathlibs_choices:
-        if config_cmd.check_func("exp", libraries=libs, decl=True, call=True):
+        if config_cmd.check_func(
+            "log",
+            libraries=libs,
+            call_args="0",
+            decl="double log(double);",
+            call=True
+        ):
             mathlibs = libs
             break
     else:
         raise RuntimeError(
             "math library missing; rerun setup.py after setting the "
-            "MATHLIB env variable")
+            "MATHLIB env variable"
+        )
     return mathlibs
 
+
 def visibility_define(config):
     """Return the define value to use for NPY_VISIBILITY_HIDDEN (may be empty
     string)."""
diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py
index 20d44f4ec..1143872b1 100644
--- a/numpy/core/setup_common.py
+++ b/numpy/core/setup_common.py
@@ -1,8 +1,9 @@
 # Code common to build tools
-import sys
-import warnings
 import copy
+import pathlib
+import sys
 import textwrap
+import warnings
 
 from numpy.distutils.misc_util import mingw32
 
@@ -91,6 +92,32 @@ def check_api_version(apiversion, codegen_dir):
         warnings.warn(msg % (apiversion, curapi_hash, apiversion, api_hash,
                              __file__),
                       MismatchCAPIWarning, stacklevel=2)
+
+
+FUNC_CALL_ARGS = {}
+
+def set_sig(sig):
+    prefix, _, args = sig.partition("(")
+    args = args.rpartition(")")[0]
+    funcname = prefix.rpartition(" ")[-1]
+    args = [arg.strip() for arg in args.split(",")]
+    FUNC_CALL_ARGS[funcname] = ", ".join("(%s) 0" % arg for arg in args)
+
+
+for file in [
+    "feature_detection_locale.h",
+    "feature_detection_math.h",
+    "feature_detection_misc.h",
+    "feature_detection_stdio.h",
+]:
+    with open(pathlib.Path(__file__).parent / file) as f:
+        for line in f:
+            if line.startswith("#"):
+                continue
+            if not line.strip():
+                continue
+            set_sig(line)
+
 # Mandatory functions: if not found, fail the build
 MANDATORY_FUNCS = ["sin", "cos", "tan", "sinh", "cosh", "tanh", "fabs",
         "floor", "ceil", "sqrt", "log10", "log", "exp", "asin",
@@ -100,9 +127,11 @@ MANDATORY_FUNCS = ["sin", "cos", "tan", "sinh", "cosh", "tanh", "fabs",
 # replacement implementation. Note that some of these are C99 functions.
 OPTIONAL_STDFUNCS = ["expm1", "log1p", "acosh", "asinh", "atanh",
         "rint", "trunc", "exp2", "log2", "hypot", "atan2", "pow",
-        "copysign", "nextafter", "ftello", "fseeko",
-        "strtoll", "strtoull", "cbrt", "strtold_l", "fallocate",
-        "backtrace", "madvise"]
+        "copysign", "nextafter", "strtoll", "strtoull", "cbrt"]
+
+OPTIONAL_LOCALE_FUNCS = ["strtold_l"]
+OPTIONAL_FILE_FUNCS = ["ftello", "fseeko", "fallocate"]
+OPTIONAL_MISC_FUNCS = ["backtrace", "madvise"]
 
 
 OPTIONAL_HEADERS = [
diff --git a/numpy/core/src/multiarray/textreading/tokenize.cpp b/numpy/core/src/multiarray/textreading/tokenize.cpp
index b6d9f882b..b09fc3356 100644
--- a/numpy/core/src/multiarray/textreading/tokenize.cpp
+++ b/numpy/core/src/multiarray/textreading/tokenize.cpp
@@ -40,7 +40,7 @@
 
 
 template <typename UCS>
-static NPY_INLINE int
+static inline int
 copy_to_field_buffer(tokenizer_state *ts,
         const UCS *chunk_start, const UCS *chunk_end)
 {
@@ -73,7 +73,7 @@ copy_to_field_buffer(tokenizer_state *ts,
 }
 
 
-static NPY_INLINE int
+static inline int
 add_field(tokenizer_state *ts)
 {
     /* The previous field is done, advance to keep a NUL byte at the end */
@@ -109,7 +109,7 @@ add_field(tokenizer_state *ts)
 
 
 template <typename UCS>
-static NPY_INLINE int
+static inline int
 tokenizer_core(tokenizer_state *ts, parser_config *const config)
 {
     UCS *pos = (UCS *)ts->pos;
diff --git a/numpy/core/src/npymath/npy_math_private.h b/numpy/core/src/npymath/npy_math_private.h
index 7ca0c5ba0..a474b3de3 100644
--- a/numpy/core/src/npymath/npy_math_private.h
+++ b/numpy/core/src/npymath/npy_math_private.h
@@ -514,7 +514,7 @@ typedef union {
 typedef union {
         npy_cdouble npy_z;
 #ifdef __cplusplus
-        std::complex<double> c99z;
+        std::complex<double> c99_z;
 #else
         complex double c99_z;
 #endif
@@ -523,7 +523,7 @@ typedef union {
 typedef union {
         npy_cfloat npy_z;
 #ifdef __cplusplus
-        std::complex<float> c99z;
+        std::complex<float> c99_z;
 #else
         complex float c99_z;
 #endif
diff --git a/numpy/core/src/npysort/binsearch.cpp b/numpy/core/src/npysort/binsearch.cpp
index 8dd72c094..98d305910 100644
--- a/numpy/core/src/npysort/binsearch.cpp
+++ b/numpy/core/src/npysort/binsearch.cpp
@@ -344,7 +344,7 @@ constexpr std::array<typename binsearch_t<arg>::value_type,
         binsearch_t<arg>::map;
 
 template <arg_t arg>
-static NPY_INLINE typename binsearch_t<arg>::function_type
+static inline typename binsearch_t<arg>::function_type
 _get_binsearch_func(PyArray_Descr *dtype, NPY_SEARCHSIDE side)
 {
     using binsearch = binsearch_t<arg>;
diff --git a/numpy/core/src/npysort/radixsort.cpp b/numpy/core/src/npysort/radixsort.cpp
index 5393869ee..0e1a41c69 100644
--- a/numpy/core/src/npysort/radixsort.cpp
+++ b/numpy/core/src/npysort/radixsort.cpp
@@ -4,7 +4,7 @@
 #include "npysort_common.h"
 
 #include "../common/numpy_tag.h"
-#include <stdlib.h>
+#include <cstdlib>
 #include <type_traits>
 
 /*
diff --git a/numpy/core/src/npysort/selection.cpp b/numpy/core/src/npysort/selection.cpp
index 7fd04a660..ebe188b71 100644
--- a/numpy/core/src/npysort/selection.cpp
+++ b/numpy/core/src/npysort/selection.cpp
@@ -39,7 +39,7 @@ introselect_(type *v, npy_intp *tosort, npy_intp num, npy_intp kth,
  *****************************************************************************
  */
 
-static NPY_INLINE void
+static inline void
 store_pivot(npy_intp pivot, npy_intp kth, npy_intp *pivots, npy_intp *npiv)
 {
     if (pivots == NULL) {
@@ -104,7 +104,7 @@ inexact()
  * for efficient partitioning, see unguarded_partition
  */
 template <typename Tag, bool arg, typename type>
-static NPY_INLINE void
+static inline void
 median3_swap_(type *v, npy_intp *tosort, npy_intp low, npy_intp mid,
               npy_intp high)
 {
@@ -170,7 +170,7 @@ median5_(type *v, npy_intp *tosort)
  * lower-than-pivot [x x x x] larger-than-pivot
  */
 template <typename Tag, bool arg, typename type>
-static NPY_INLINE void
+static inline void
 unguarded_partition_(type *v, npy_intp *tosort, const type pivot, npy_intp *ll,
                      npy_intp *hh)
 {
@@ -442,7 +442,7 @@ struct partition_t {
 };
 constexpr std::array<arg_map, partition_t::taglist::size> partition_t::map;
 
-static NPY_INLINE PyArray_PartitionFunc *
+static inline PyArray_PartitionFunc *
 _get_partition_func(int type, NPY_SELECTKIND which)
 {
     npy_intp i;
@@ -459,7 +459,7 @@ _get_partition_func(int type, NPY_SELECTKIND which)
     return NULL;
 }
 
-static NPY_INLINE PyArray_ArgPartitionFunc *
+static inline PyArray_ArgPartitionFunc *
 _get_argpartition_func(int type, NPY_SELECTKIND which)
 {
     npy_intp i;
diff --git a/numpy/core/src/npysort/timsort.cpp b/numpy/core/src/npysort/timsort.cpp
index 8bcd90061..27294af0c 100644
--- a/numpy/core/src/npysort/timsort.cpp
+++ b/numpy/core/src/npysort/timsort.cpp
@@ -67,7 +67,7 @@ typedef struct {
 } buffer_intp;
 
 /* buffer method */
-static NPY_INLINE int
+static inline int
 resize_buffer_intp(buffer_intp *buffer, npy_intp new_size)
 {
     if (new_size <= buffer->size) {
@@ -105,7 +105,7 @@ struct buffer_ {
 };
 
 template <typename Tag>
-static NPY_INLINE int
+static inline int
 resize_buffer_(buffer_<Tag> *buffer, npy_intp new_size)
 {
     using type = typename Tag::type;
@@ -978,7 +978,7 @@ struct string_buffer_ {
 };
 
 template <typename Tag>
-static NPY_INLINE int
+static inline int
 resize_buffer_(string_buffer_<Tag> *buffer, npy_intp new_size)
 {
     using type = typename Tag::type;
@@ -1873,7 +1873,7 @@ typedef struct {
     size_t len;
 } buffer_char;
 
-static NPY_INLINE int
+static inline int
 resize_buffer_char(buffer_char *buffer, npy_intp new_size)
 {
     if (new_size <= buffer->size) {
diff --git a/numpy/core/src/npysort/x86-qsort.dispatch.cpp b/numpy/core/src/npysort/x86-qsort.dispatch.cpp
index 4b01e3528..66261ad0a 100644
--- a/numpy/core/src/npysort/x86-qsort.dispatch.cpp
+++ b/numpy/core/src/npysort/x86-qsort.dispatch.cpp
@@ -72,7 +72,7 @@ heapsort_(type *start, npy_intp n);
     _mm256_or_si256(_mm256_slli_epi64((x), (k)),               \
                     _mm256_srli_epi64((x), 64 - (k)))
 
-static NPY_INLINE __m256i
+static inline __m256i
 vnext(__m256i *s0, __m256i *s1)
 {
     *s1 = _mm256_xor_si256(*s0, *s1); /* modify vectors s1 and s0 */
@@ -83,7 +83,7 @@ vnext(__m256i *s0, __m256i *s1)
 }
 
 /* transform random numbers to the range between 0 and bound - 1 */
-static NPY_INLINE __m256i
+static inline __m256i
 rnd_epu32(__m256i rnd_vec, __m256i bound)
 {
     __m256i even = _mm256_srli_epi64(_mm256_mul_epu32(rnd_vec, bound), 32);
@@ -282,7 +282,7 @@ COEX(mm_t &a, mm_t &b)
 }
 
 template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-static NPY_INLINE zmm_t
+static inline zmm_t
 cmp_merge(zmm_t in1, zmm_t in2, __mmask16 mask)
 {
     zmm_t min = vtype::min(in2, in1);
@@ -295,7 +295,7 @@ cmp_merge(zmm_t in1, zmm_t in2, __mmask16 mask)
  * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg
  */
 template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-static NPY_INLINE zmm_t
+static inline zmm_t
 sort_zmm(zmm_t zmm)
 {
     zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(zmm),
@@ -323,7 +323,7 @@ sort_zmm(zmm_t zmm)
 
 // Assumes zmm is bitonic and performs a recursive half cleaner
 template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-static NPY_INLINE zmm_t
+static inline zmm_t
 bitonic_merge_zmm(zmm_t zmm)
 {
     // 1) half_cleaner[16]: compare 1-9, 2-10, 3-11 etc ..
@@ -343,7 +343,7 @@ bitonic_merge_zmm(zmm_t zmm)
 
 // Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner
 template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-static NPY_INLINE void
+static inline void
 bitonic_merge_two_zmm(zmm_t *zmm1, zmm_t *zmm2)
 {
     // 1) First step of a merging network: coex of zmm1 and zmm2 reversed
@@ -358,7 +358,7 @@ bitonic_merge_two_zmm(zmm_t *zmm1, zmm_t *zmm2)
 // Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive
 // half cleaner
 template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-static NPY_INLINE void
+static inline void
 bitonic_merge_four_zmm(zmm_t *zmm)
 {
     zmm_t zmm2r = vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm[2]);
@@ -380,7 +380,7 @@ bitonic_merge_four_zmm(zmm_t *zmm)
 }
 
 template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-static NPY_INLINE void
+static inline void
 bitonic_merge_eight_zmm(zmm_t *zmm)
 {
     zmm_t zmm4r = vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm[4]);
@@ -418,7 +418,7 @@ bitonic_merge_eight_zmm(zmm_t *zmm)
 }
 
 template <typename vtype, typename type_t>
-static NPY_INLINE void
+static inline void
 sort_16(type_t *arr, npy_int N)
 {
     __mmask16 load_mask = (0x0001 << N) - 0x0001;
@@ -428,7 +428,7 @@ sort_16(type_t *arr, npy_int N)
 }
 
 template <typename vtype, typename type_t>
-static NPY_INLINE void
+static inline void
 sort_32(type_t *arr, npy_int N)
 {
     if (N <= 16) {
@@ -447,7 +447,7 @@ sort_32(type_t *arr, npy_int N)
 }
 
 template <typename vtype, typename type_t>
-static NPY_INLINE void
+static inline void
 sort_64(type_t *arr, npy_int N)
 {
     if (N <= 32) {
@@ -482,7 +482,7 @@ sort_64(type_t *arr, npy_int N)
 }
 
 template <typename vtype, typename type_t>
-static NPY_INLINE void
+static inline void
 sort_128(type_t *arr, npy_int N)
 {
     if (N <= 64) {
@@ -545,7 +545,7 @@ sort_128(type_t *arr, npy_int N)
 }
 
 template <typename type_t>
-static NPY_INLINE void
+static inline void
 swap(type_t *arr, npy_intp ii, npy_intp jj)
 {
     type_t temp = arr[ii];
@@ -555,7 +555,7 @@ swap(type_t *arr, npy_intp ii, npy_intp jj)
 
 // Median of 3 strategy
 // template<typename type_t>
-// static NPY_INLINE
+// static inline
 // npy_intp get_pivot_index(type_t *arr, const npy_intp left, const npy_intp
 // right) {
 //    return (rand() % (right + 1 - left)) + left;
@@ -574,7 +574,7 @@ swap(type_t *arr, npy_intp ii, npy_intp jj)
  */
 
 template <typename vtype, typename type_t>
-static NPY_INLINE type_t
+static inline type_t
 get_pivot(type_t *arr, const npy_intp left, const npy_intp right)
 {
     /* seeds for vectorized random number generator */
@@ -641,7 +641,7 @@ get_pivot(type_t *arr, const npy_intp left, const npy_intp right)
  * last element that is less than equal to the pivot.
  */
 template <typename vtype, typename type_t, typename zmm_t>
-static NPY_INLINE npy_int
+static inline npy_int
 partition_vec(type_t *arr, npy_intp left, npy_intp right, const zmm_t curr_vec,
               const zmm_t pivot_vec, zmm_t *smallest_vec, zmm_t *biggest_vec)
 {
@@ -661,7 +661,7 @@ partition_vec(type_t *arr, npy_intp left, npy_intp right, const zmm_t curr_vec,
  * last element that is less than equal to the pivot.
  */
 template <typename vtype, typename type_t>
-static NPY_INLINE npy_intp
+static inline npy_intp
 partition_avx512(type_t *arr, npy_intp left, npy_intp right, type_t pivot,
                  type_t *smallest, type_t *biggest)
 {
@@ -742,7 +742,7 @@ partition_avx512(type_t *arr, npy_intp left, npy_intp right, type_t pivot,
 }
 
 template <typename vtype, typename type_t>
-static NPY_INLINE void
+static inline void
 qsort_(type_t *arr, npy_intp left, npy_intp right, npy_int max_iters)
 {
     /*
@@ -771,7 +771,7 @@ qsort_(type_t *arr, npy_intp left, npy_intp right, npy_int max_iters)
         qsort_<vtype>(arr, pivot_index, right, max_iters - 1);
 }
 
-static NPY_INLINE npy_intp
+static inline npy_intp
 replace_nan_with_inf(npy_float *arr, npy_intp arrsize)
 {
     npy_intp nan_count = 0;
@@ -790,7 +790,7 @@ replace_nan_with_inf(npy_float *arr, npy_intp arrsize)
     return nan_count;
 }
 
-static NPY_INLINE void
+static inline void
 replace_inf_with_nan(npy_float *arr, npy_intp arrsize, npy_intp nan_count)
 {
     for (npy_intp ii = arrsize - 1; nan_count > 0; --ii) {
diff --git a/numpy/core/tests/data/generate_umath_validation_data.cpp b/numpy/core/tests/data/generate_umath_validation_data.cpp
index 418eae670..51ee12501 100644
--- a/numpy/core/tests/data/generate_umath_validation_data.cpp
+++ b/numpy/core/tests/data/generate_umath_validation_data.cpp
@@ -1,10 +1,10 @@
 #include <algorithm>
 #include <fstream>
 #include <iostream>
-#include <math.h>
+#include <cmath>
 #include <random>
-#include <stdio.h>
-#include <time.h>
+#include <cstdio>
+#include <ctime>
 #include <vector>
 
 struct ufunc {
diff --git a/numpy/f2py/crackfortran.py b/numpy/f2py/crackfortran.py
index 0374ae8d7..eb9ae03c6 100755
--- a/numpy/f2py/crackfortran.py
+++ b/numpy/f2py/crackfortran.py
@@ -2689,7 +2689,7 @@ def analyzevars(block):
             n_checks = []
             n_is_input = l_or(isintent_in, isintent_inout,
                               isintent_inplace)(vars[n])
-            if 'dimension' in vars[n]:  # n is array
+            if isarray(vars[n]):  # n is array
                 for i, d in enumerate(vars[n]['dimension']):
                     coeffs_and_deps = dimension_exprs.get(d)
                     if coeffs_and_deps is None:
@@ -2700,6 +2700,13 @@ def analyzevars(block):
                         # may define variables used in dimension
                         # specifications.
                         for v, (solver, deps) in coeffs_and_deps.items():
+                            def compute_deps(v, deps):
+                                for v1 in coeffs_and_deps.get(v, [None, []])[1]:
+                                    if v1 not in deps:
+                                        deps.add(v1)
+                                        compute_deps(v1, deps)
+                            all_deps = set()
+                            compute_deps(v, all_deps)
                             if ((v in n_deps
                                  or '=' in vars[v]
                                  or 'depend' in vars[v])):
@@ -2708,7 +2715,7 @@ def analyzevars(block):
                                 # - has user-defined initialization expression
                                 # - has user-defined dependencies
                                 continue
-                            if solver is not None:
+                            if solver is not None and v not in all_deps:
                                 # v can be solved from d, hence, we
                                 # make it an optional argument with
                                 # initialization expression:
diff --git a/numpy/f2py/rules.py b/numpy/f2py/rules.py
index eaa559528..feb181bcb 100755
--- a/numpy/f2py/rules.py
+++ b/numpy/f2py/rules.py
@@ -50,7 +50,7 @@ $Date: 2005/08/30 08:58:42 $
 Pearu Peterson
 
 """
-import os
+import os, sys
 import time
 import copy
 
@@ -1202,8 +1202,8 @@ def buildmodule(m, um):
                     break
 
         if not nb:
-            errmess(
-                'buildmodule: Could not found the body of interfaced routine "%s". Skipping.\n' % (n))
+            print(
+                'buildmodule: Could not find the body of interfaced routine "%s". Skipping.\n' % (n), file=sys.stderr)
             continue
         nb_list = [nb]
         if 'entry' in nb:
diff --git a/numpy/f2py/tests/src/cli/hi77.f b/numpy/f2py/tests/src/cli/hi77.f
new file mode 100644
index 000000000..8b916ebe0
--- /dev/null
+++ b/numpy/f2py/tests/src/cli/hi77.f
@@ -0,0 +1,3 @@
+      SUBROUTINE HI
+        PRINT*, "HELLO WORLD"
+      END SUBROUTINE
diff --git a/numpy/f2py/tests/src/cli/hiworld.f90 b/numpy/f2py/tests/src/cli/hiworld.f90
new file mode 100644
index 000000000..981f87754
--- /dev/null
+++ b/numpy/f2py/tests/src/cli/hiworld.f90
@@ -0,0 +1,3 @@
+function hi()
+  print*, "Hello World"
+end function
diff --git a/numpy/f2py/tests/src/negative_bounds/issue_20853.f90 b/numpy/f2py/tests/src/negative_bounds/issue_20853.f90
new file mode 100644
index 000000000..bf1fa9285
--- /dev/null
+++ b/numpy/f2py/tests/src/negative_bounds/issue_20853.f90
@@ -0,0 +1,7 @@
+subroutine foo(is_, ie_, arr, tout)
+ implicit none
+ integer :: is_,ie_
+ real, intent(in) :: arr(is_:ie_)
+ real, intent(out) :: tout(is_:ie_)
+ tout = arr
+end
diff --git a/numpy/f2py/tests/test_f2py2e.py b/numpy/f2py/tests/test_f2py2e.py
new file mode 100644
index 000000000..9de043d73
--- /dev/null
+++ b/numpy/f2py/tests/test_f2py2e.py
@@ -0,0 +1,748 @@
+import textwrap, re, sys, subprocess, shlex
+from pathlib import Path
+from collections import namedtuple
+
+import pytest
+
+from . import util
+from numpy.f2py.f2py2e import main as f2pycli
+
+#########################
+# CLI utils and classes #
+#########################
+
+PPaths = namedtuple("PPaths", "finp, f90inp, pyf, wrap77, wrap90, cmodf")
+
+
+def get_io_paths(fname_inp, mname="untitled"):
+    """Takes in a temporary file for testing and returns the expected output and input paths
+
+    Here expected output is essentially one of any of the possible generated
+    files.
+
+    ..note::
+
+         Since this does not actually run f2py, none of these are guaranteed to
+         exist, and module names are typically incorrect
+
+    Parameters
+    ----------
+    fname_inp : str
+                The input filename
+    mname : str, optional
+                The name of the module, untitled by default
+
+    Returns
+    -------
+    genp : NamedTuple PPaths
+            The possible paths which are generated, not all of which exist
+    """
+    bpath = Path(fname_inp)
+    return PPaths(
+        finp=bpath.with_suffix(".f"),
+        f90inp=bpath.with_suffix(".f90"),
+        pyf=bpath.with_suffix(".pyf"),
+        wrap77=bpath.with_name(f"{mname}-f2pywrappers.f"),
+        wrap90=bpath.with_name(f"{mname}-f2pywrappers2.f90"),
+        cmodf=bpath.with_name(f"{mname}module.c"),
+    )
+
+
+##############
+# CLI Fixtures and Tests #
+#############
+
+
+@pytest.fixture(scope="session")
+def hello_world_f90(tmpdir_factory):
+    """Generates a single f90 file for testing"""
+    fdat = util.getpath("tests", "src", "cli", "hiworld.f90").read_text()
+    fn = tmpdir_factory.getbasetemp() / "hello.f90"
+    fn.write_text(fdat, encoding="ascii")
+    return fn
+
+
+@pytest.fixture(scope="session")
+def hello_world_f77(tmpdir_factory):
+    """Generates a single f77 file for testing"""
+    fdat = util.getpath("tests", "src", "cli", "hi77.f").read_text()
+    fn = tmpdir_factory.getbasetemp() / "hello.f"
+    fn.write_text(fdat, encoding="ascii")
+    return fn
+
+
+@pytest.fixture(scope="session")
+def retreal_f77(tmpdir_factory):
+    """Generates a single f77 file for testing"""
+    fdat = util.getpath("tests", "src", "return_real", "foo77.f").read_text()
+    fn = tmpdir_factory.getbasetemp() / "foo.f"
+    fn.write_text(fdat, encoding="ascii")
+    return fn
+
+
+def test_gen_pyf(capfd, hello_world_f90, monkeypatch):
+    """Ensures that a signature file is generated via the CLI
+    CLI :: -h
+    """
+    ipath = Path(hello_world_f90)
+    opath = Path(hello_world_f90).stem + ".pyf"
+    monkeypatch.setattr(sys, "argv", f'f2py -h {opath} {ipath}'.split())
+
+    with util.switchdir(ipath.parent):
+        f2pycli()  # Generate wrappers
+        out, _ = capfd.readouterr()
+        assert "Saving signatures to file" in out
+        assert Path(f'{opath}').exists()
+
+
+def test_gen_pyf_stdout(capfd, hello_world_f90, monkeypatch):
+    """Ensures that a signature file can be dumped to stdout
+    CLI :: -h
+    """
+    ipath = Path(hello_world_f90)
+    monkeypatch.setattr(sys, "argv", f'f2py -h stdout {ipath}'.split())
+    with util.switchdir(ipath.parent):
+        f2pycli()
+        out, _ = capfd.readouterr()
+        assert "Saving signatures to file" in out
+
+
+def test_gen_pyf_no_overwrite(capfd, hello_world_f90, monkeypatch):
+    """Ensures that the CLI refuses to overwrite signature files
+    CLI :: -h without --overwrite-signature
+    """
+    ipath = Path(hello_world_f90)
+    monkeypatch.setattr(sys, "argv", f'f2py -h faker.pyf {ipath}'.split())
+
+    with util.switchdir(ipath.parent):
+        Path("faker.pyf").write_text("Fake news", encoding="ascii")
+        with pytest.raises(SystemExit):
+            f2pycli()  # Refuse to overwrite
+            _, err = capfd.readouterr()
+            assert "Use --overwrite-signature to overwrite" in err
+
+
+@pytest.mark.xfail
+def test_f2py_skip(capfd, retreal_f77, monkeypatch):
+    """Tests that functions can be skipped
+    CLI :: skip:
+    """
+    foutl = get_io_paths(retreal_f77, mname="test")
+    ipath = foutl.finp
+    toskip = "t0 t4 t8 sd s8 s4"
+    remaining = "td s0"
+    monkeypatch.setattr(
+        sys, "argv",
+        f'f2py {ipath} -m test skip: {toskip}'.split())
+
+    with util.switchdir(ipath.parent):
+        f2pycli()
+        out, err = capfd.readouterr()
+        for skey in toskip.split():
+            assert (
+                f'buildmodule: Could not found the body of interfaced routine "{skey}". Skipping.'
+                in err)
+        for rkey in remaining.split():
+            assert f'Constructing wrapper function "{rkey}"' in out
+
+
+def test_f2py_only(capfd, retreal_f77, monkeypatch):
+    """Test that functions can be kept by only:
+    CLI :: only:
+    """
+    foutl = get_io_paths(retreal_f77, mname="test")
+    ipath = foutl.finp
+    toskip = "t0 t4 t8 sd s8 s4"
+    tokeep = "td s0"
+    monkeypatch.setattr(
+        sys, "argv",
+        f'f2py {ipath} -m test only: {tokeep}'.split())
+
+    with util.switchdir(ipath.parent):
+        f2pycli()
+        out, err = capfd.readouterr()
+        for skey in toskip.split():
+            assert (
+                f'buildmodule: Could not find the body of interfaced routine "{skey}". Skipping.'
+                in err)
+        for rkey in tokeep.split():
+            assert f'Constructing wrapper function "{rkey}"' in out
+
+
+def test_file_processing_switch(capfd, hello_world_f90, retreal_f77,
+                                monkeypatch):
+    """Tests that it is possible to return to file processing mode
+    CLI :: :
+    BUG: numpy-gh #20520
+    """
+    foutl = get_io_paths(retreal_f77, mname="test")
+    ipath = foutl.finp
+    toskip = "t0 t4 t8 sd s8 s4"
+    ipath2 = Path(hello_world_f90)
+    tokeep = "td s0 hi"  # hi is in ipath2
+    mname = "blah"
+    monkeypatch.setattr(
+        sys,
+        "argv",
+        f'f2py {ipath} -m {mname} only: {tokeep} : {ipath2}'.split(
+        ),
+    )
+
+    with util.switchdir(ipath.parent):
+        f2pycli()
+        out, err = capfd.readouterr()
+        for skey in toskip.split():
+            assert (
+                f'buildmodule: Could not find the body of interfaced routine "{skey}". Skipping.'
+                in err)
+        for rkey in tokeep.split():
+            assert f'Constructing wrapper function "{rkey}"' in out
+
+
+def test_mod_gen_f77(capfd, hello_world_f90, monkeypatch):
+    """Checks the generation of files based on a module name
+    CLI :: -m
+    """
+    MNAME = "hi"
+    foutl = get_io_paths(hello_world_f90, mname=MNAME)
+    ipath = foutl.f90inp
+    monkeypatch.setattr(sys, "argv", f'f2py {ipath} -m {MNAME}'.split())
+    with util.switchdir(ipath.parent):
+        f2pycli()
+
+    # Always generate C module
+    assert Path.exists(foutl.cmodf)
+    # File contains a function, check for F77 wrappers
+    assert Path.exists(foutl.wrap77)
+
+
+def test_lower_cmod(capfd, hello_world_f77, monkeypatch):
+    """Lowers cases by flag or when -h is present
+
+    CLI :: --[no-]lower
+    """
+    foutl = get_io_paths(hello_world_f77, mname="test")
+    ipath = foutl.finp
+    capshi = re.compile(r"HI\(\)")
+    capslo = re.compile(r"hi\(\)")
+    # Case I: --lower is passed
+    monkeypatch.setattr(sys, "argv", f'f2py {ipath} -m test --lower'.split())
+    with util.switchdir(ipath.parent):
+        f2pycli()
+        out, _ = capfd.readouterr()
+        assert capslo.search(out) is not None
+        assert capshi.search(out) is None
+    # Case II: --no-lower is passed
+    monkeypatch.setattr(sys, "argv",
+                        f'f2py {ipath} -m test --no-lower'.split())
+    with util.switchdir(ipath.parent):
+        f2pycli()
+        out, _ = capfd.readouterr()
+        assert capslo.search(out) is None
+        assert capshi.search(out) is not None
+
+
+def test_lower_sig(capfd, hello_world_f77, monkeypatch):
+    """Lowers cases in signature files by flag or when -h is present
+
+    CLI :: --[no-]lower -h
+    """
+    foutl = get_io_paths(hello_world_f77, mname="test")
+    ipath = foutl.finp
+    # Signature files
+    capshi = re.compile(r"Block: HI")
+    capslo = re.compile(r"Block: hi")
+    # Case I: --lower is implied by -h
+    # TODO: Clean up to prevent passing --overwrite-signature
+    monkeypatch.setattr(
+        sys,
+        "argv",
+        f'f2py {ipath} -h {foutl.pyf} -m test --overwrite-signature'.split(),
+    )
+
+    with util.switchdir(ipath.parent):
+        f2pycli()
+        out, _ = capfd.readouterr()
+        assert capslo.search(out) is not None
+        assert capshi.search(out) is None
+
+    # Case II: --no-lower overrides -h
+    monkeypatch.setattr(
+        sys,
+        "argv",
+        f'f2py {ipath} -h {foutl.pyf} -m test --overwrite-signature --no-lower'
+        .split(),
+    )
+
+    with util.switchdir(ipath.parent):
+        f2pycli()
+        out, _ = capfd.readouterr()
+        assert capslo.search(out) is None
+        assert capshi.search(out) is not None
+
+
+def test_build_dir(capfd, hello_world_f90, monkeypatch):
+    """Ensures that the build directory can be specified
+
+    CLI :: --build-dir
+    """
+    ipath = Path(hello_world_f90)
+    mname = "blah"
+    odir = "tttmp"
+    monkeypatch.setattr(sys, "argv",
+                        f'f2py -m {mname} {ipath} --build-dir {odir}'.split())
+
+    with util.switchdir(ipath.parent):
+        f2pycli()
+        out, _ = capfd.readouterr()
+        assert f"Wrote C/API module \"{mname}\"" in out
+
+
+def test_overwrite(capfd, hello_world_f90, monkeypatch):
+    """Ensures that the build directory can be specified
+
+    CLI :: --overwrite-signature
+    """
+    ipath = Path(hello_world_f90)
+    monkeypatch.setattr(
+        sys, "argv",
+        f'f2py -h faker.pyf {ipath} --overwrite-signature'.split())
+
+    with util.switchdir(ipath.parent):
+        Path("faker.pyf").write_text("Fake news", encoding="ascii")
+        f2pycli()
+        out, _ = capfd.readouterr()
+        assert "Saving signatures to file" in out
+
+
+def test_latexdoc(capfd, hello_world_f90, monkeypatch):
+    """Ensures that TeX documentation is written out
+
+    CLI :: --latex-doc
+    """
+    ipath = Path(hello_world_f90)
+    mname = "blah"
+    monkeypatch.setattr(sys, "argv",
+                        f'f2py -m {mname} {ipath} --latex-doc'.split())
+
+    with util.switchdir(ipath.parent):
+        f2pycli()
+        out, _ = capfd.readouterr()
+        assert "Documentation is saved to file" in out
+        with Path(f"{mname}module.tex").open() as otex:
+            assert "\\documentclass" in otex.read()
+
+
+def test_nolatexdoc(capfd, hello_world_f90, monkeypatch):
+    """Ensures that TeX documentation is written out
+
+    CLI :: --no-latex-doc
+    """
+    ipath = Path(hello_world_f90)
+    mname = "blah"
+    monkeypatch.setattr(sys, "argv",
+                        f'f2py -m {mname} {ipath} --no-latex-doc'.split())
+
+    with util.switchdir(ipath.parent):
+        f2pycli()
+        out, _ = capfd.readouterr()
+        assert "Documentation is saved to file" not in out
+
+
+def test_shortlatex(capfd, hello_world_f90, monkeypatch):
+    """Ensures that truncated documentation is written out
+
+    TODO: Test to ensure this has no effect without --latex-doc
+    CLI :: --latex-doc --short-latex
+    """
+    ipath = Path(hello_world_f90)
+    mname = "blah"
+    monkeypatch.setattr(
+        sys,
+        "argv",
+        f'f2py -m {mname} {ipath} --latex-doc --short-latex'.split(),
+    )
+
+    with util.switchdir(ipath.parent):
+        f2pycli()
+        out, _ = capfd.readouterr()
+        assert "Documentation is saved to file" in out
+        with Path(f"./{mname}module.tex").open() as otex:
+            assert "\\documentclass" not in otex.read()
+
+
+def test_restdoc(capfd, hello_world_f90, monkeypatch):
+    """Ensures that RsT documentation is written out
+
+    CLI :: --rest-doc
+    """
+    ipath = Path(hello_world_f90)
+    mname = "blah"
+    monkeypatch.setattr(sys, "argv",
+                        f'f2py -m {mname} {ipath} --rest-doc'.split())
+
+    with util.switchdir(ipath.parent):
+        f2pycli()
+        out, _ = capfd.readouterr()
+        assert "ReST Documentation is saved to file" in out
+        with Path(f"./{mname}module.rest").open() as orst:
+            assert r".. -*- rest -*-" in orst.read()
+
+
+def test_norestexdoc(capfd, hello_world_f90, monkeypatch):
+    """Ensures that TeX documentation is written out
+
+    CLI :: --no-rest-doc
+    """
+    ipath = Path(hello_world_f90)
+    mname = "blah"
+    monkeypatch.setattr(sys, "argv",
+                        f'f2py -m {mname} {ipath} --no-rest-doc'.split())
+
+    with util.switchdir(ipath.parent):
+        f2pycli()
+        out, _ = capfd.readouterr()
+        assert "ReST Documentation is saved to file" not in out
+
+
+def test_debugcapi(capfd, hello_world_f90, monkeypatch):
+    """Ensures that debugging wrappers are written
+
+    CLI :: --debug-capi
+    """
+    ipath = Path(hello_world_f90)
+    mname = "blah"
+    monkeypatch.setattr(sys, "argv",
+                        f'f2py -m {mname} {ipath} --debug-capi'.split())
+
+    with util.switchdir(ipath.parent):
+        f2pycli()
+        with Path(f"./{mname}module.c").open() as ocmod:
+            assert r"#define DEBUGCFUNCS" in ocmod.read()
+
+
+@pytest.mark.xfail(reason="Consistently fails on CI.")
+def test_debugcapi_bld(hello_world_f90, monkeypatch):
+    """Ensures that debugging wrappers work
+
+    CLI :: --debug-capi -c
+    """
+    ipath = Path(hello_world_f90)
+    mname = "blah"
+    monkeypatch.setattr(sys, "argv",
+                        f'f2py -m {mname} {ipath} -c --debug-capi'.split())
+
+    with util.switchdir(ipath.parent):
+        f2pycli()
+        cmd_run = shlex.split("python3 -c \"import blah; blah.hi()\"")
+        rout = subprocess.run(cmd_run, capture_output=True, encoding='UTF-8')
+        eout = ' Hello World\n'
+        eerr = textwrap.dedent("""\
+debug-capi:Python C/API function blah.hi()
+debug-capi:float hi=:output,hidden,scalar
+debug-capi:hi=0
+debug-capi:Fortran subroutine `f2pywraphi(&hi)'
+debug-capi:hi=0
+debug-capi:Building return value.
+debug-capi:Python C/API function blah.hi: successful.
+debug-capi:Freeing memory.
+        """)
+        assert rout.stdout == eout
+        assert rout.stderr == eerr
+
+
+def test_wrapfunc_def(capfd, hello_world_f90, monkeypatch):
+    """Ensures that fortran subroutine wrappers for F77 are included by default
+
+    CLI :: --[no]-wrap-functions
+    """
+    # Implied
+    ipath = Path(hello_world_f90)
+    mname = "blah"
+    monkeypatch.setattr(sys, "argv", f'f2py -m {mname} {ipath}'.split())
+
+    with util.switchdir(ipath.parent):
+        f2pycli()
+    out, _ = capfd.readouterr()
+    assert r"Fortran 77 wrappers are saved to" in out
+
+    # Explicit
+    monkeypatch.setattr(sys, "argv",
+                        f'f2py -m {mname} {ipath} --wrap-functions'.split())
+
+    with util.switchdir(ipath.parent):
+        f2pycli()
+        out, _ = capfd.readouterr()
+        assert r"Fortran 77 wrappers are saved to" in out
+
+
+def test_nowrapfunc(capfd, hello_world_f90, monkeypatch):
+    """Ensures that fortran subroutine wrappers for F77 can be disabled
+
+    CLI :: --no-wrap-functions
+    """
+    ipath = Path(hello_world_f90)
+    mname = "blah"
+    monkeypatch.setattr(sys, "argv",
+                        f'f2py -m {mname} {ipath} --no-wrap-functions'.split())
+
+    with util.switchdir(ipath.parent):
+        f2pycli()
+        out, _ = capfd.readouterr()
+        assert r"Fortran 77 wrappers are saved to" not in out
+
+
+def test_inclheader(capfd, hello_world_f90, monkeypatch):
+    """Add to the include directories
+
+    CLI :: -include
+    TODO: Document this in the help string
+    """
+    ipath = Path(hello_world_f90)
+    mname = "blah"
+    monkeypatch.setattr(
+        sys,
+        "argv",
+        f'f2py -m {mname} {ipath} -include<stdbool.h> -include<stdio.h> '.
+        split(),
+    )
+
+    with util.switchdir(ipath.parent):
+        f2pycli()
+        with Path(f"./{mname}module.c").open() as ocmod:
+            ocmr = ocmod.read()
+            assert "#include <stdbool.h>" in ocmr
+            assert "#include <stdio.h>" in ocmr
+
+
+def test_inclpath():
+    """Add to the include directories
+
+    CLI :: --include-paths
+    """
+    # TODO: populate
+    pass
+
+
+def test_hlink():
+    """Add to the include directories
+
+    CLI :: --help-link
+    """
+    # TODO: populate
+    pass
+
+
+def test_f2cmap():
+    """Check that Fortran-to-Python KIND specs can be passed
+
+    CLI :: --f2cmap
+    """
+    # TODO: populate
+    pass
+
+
+def test_quiet(capfd, hello_world_f90, monkeypatch):
+    """Reduce verbosity
+
+    CLI :: --quiet
+    """
+    ipath = Path(hello_world_f90)
+    monkeypatch.setattr(sys, "argv", f'f2py -m blah {ipath} --quiet'.split())
+
+    with util.switchdir(ipath.parent):
+        f2pycli()
+        out, _ = capfd.readouterr()
+        assert len(out) == 0
+
+
+def test_verbose(capfd, hello_world_f90, monkeypatch):
+    """Increase verbosity
+
+    CLI :: --verbose
+    """
+    ipath = Path(hello_world_f90)
+    monkeypatch.setattr(sys, "argv", f'f2py -m blah {ipath} --verbose'.split())
+
+    with util.switchdir(ipath.parent):
+        f2pycli()
+        out, _ = capfd.readouterr()
+        assert "analyzeline" in out
+
+
+def test_version(capfd, monkeypatch):
+    """Ensure version
+
+    CLI :: -v
+    """
+    monkeypatch.setattr(sys, "argv", 'f2py -v'.split())
+    # TODO: f2py2e should not call sys.exit() after printing the version
+    with pytest.raises(SystemExit):
+        f2pycli()
+        out, _ = capfd.readouterr()
+        import numpy as np
+        assert np.__version__ == out.strip()
+
+
+@pytest.mark.xfail(reason="Consistently fails on CI.")
+def test_npdistop(hello_world_f90, monkeypatch):
+    """
+    CLI :: -c
+    """
+    ipath = Path(hello_world_f90)
+    monkeypatch.setattr(sys, "argv", f'f2py -m blah {ipath} -c'.split())
+
+    with util.switchdir(ipath.parent):
+        f2pycli()
+        cmd_run = shlex.split("python -c \"import blah; blah.hi()\"")
+        rout = subprocess.run(cmd_run, capture_output=True, encoding='UTF-8')
+        eout = ' Hello World\n'
+        assert rout.stdout == eout
+
+
+# Numpy distutils flags
+# TODO: These should be tested separately
+
+
+def test_npd_fcompiler():
+    """
+    CLI :: -c --fcompiler
+    """
+    # TODO: populate
+    pass
+
+
+def test_npd_compiler():
+    """
+    CLI :: -c --compiler
+    """
+    # TODO: populate
+    pass
+
+
+def test_npd_help_fcompiler():
+    """
+    CLI :: -c --help-fcompiler
+    """
+    # TODO: populate
+    pass
+
+
+def test_npd_f77exec():
+    """
+    CLI :: -c --f77exec
+    """
+    # TODO: populate
+    pass
+
+
+def test_npd_f90exec():
+    """
+    CLI :: -c --f90exec
+    """
+    # TODO: populate
+    pass
+
+
+def test_npd_f77flags():
+    """
+    CLI :: -c --f77flags
+    """
+    # TODO: populate
+    pass
+
+
+def test_npd_f90flags():
+    """
+    CLI :: -c --f90flags
+    """
+    # TODO: populate
+    pass
+
+
+def test_npd_opt():
+    """
+    CLI :: -c --opt
+    """
+    # TODO: populate
+    pass
+
+
+def test_npd_arch():
+    """
+    CLI :: -c --arch
+    """
+    # TODO: populate
+    pass
+
+
+def test_npd_noopt():
+    """
+    CLI :: -c --noopt
+    """
+    # TODO: populate
+    pass
+
+
+def test_npd_noarch():
+    """
+    CLI :: -c --noarch
+    """
+    # TODO: populate
+    pass
+
+
+def test_npd_debug():
+    """
+    CLI :: -c --debug
+    """
+    # TODO: populate
+    pass
+
+
+def test_npd_link_auto():
+    """
+    CLI :: -c --link-<resource>
+    """
+    # TODO: populate
+    pass
+
+
+def test_npd_lib():
+    """
+    CLI :: -c -L/path/to/lib/ -l<libname>
+    """
+    # TODO: populate
+    pass
+
+
+def test_npd_define():
+    """
+    CLI :: -D<define>
+    """
+    # TODO: populate
+    pass
+
+
+def test_npd_undefine():
+    """
+    CLI :: -U<name>
+    """
+    # TODO: populate
+    pass
+
+
+def test_npd_incl():
+    """
+    CLI :: -I/path/to/include/
+    """
+    # TODO: populate
+    pass
+
+
+def test_npd_linker():
+    """
+    CLI :: <filename>.o <filename>.so <filename>.a
+    """
+    # TODO: populate
+    pass
diff --git a/numpy/f2py/tests/test_regression.py b/numpy/f2py/tests/test_regression.py
index 40b9d4327..044f952f2 100644
--- a/numpy/f2py/tests/test_regression.py
+++ b/numpy/f2py/tests/test_regression.py
@@ -22,6 +22,25 @@ class TestIntentInOut(util.F2PyTest):
         assert np.allclose(x, [3, 1, 2])
 
 
+class TestNegativeBounds(util.F2PyTest):
+    # Check that negative bounds work correctly
+    sources = [util.getpath("tests", "src", "negative_bounds", "issue_20853.f90")]
+
+    @pytest.mark.slow
+    def test_negbound(self):
+        xvec = np.arange(12)
+        xlow = -6
+        xhigh = 4
+        # Calculate the upper bound,
+        # Keeping the 1 index in mind
+        def ubound(xl, xh):
+            return xh - xl + 1
+        rval = self.module.foo(is_=xlow, ie_=xhigh,
+                        arr=xvec[:ubound(xlow, xhigh)])
+        expval = np.arange(11, dtype = np.float32)
+        assert np.allclose(rval, expval)
+
+
 class TestNumpyVersionAttribute(util.F2PyTest):
     # Check that th attribute __f2py_numpy_version__ is present
     # in the compiled module and that has the value np.__version__.
diff --git a/numpy/linalg/setup.py b/numpy/linalg/setup.py
index 94536bb2c..82c688d91 100644
--- a/numpy/linalg/setup.py
+++ b/numpy/linalg/setup.py
@@ -69,9 +69,13 @@ def configuration(parent_package='', top_path=None):
     # umath_linalg module
     config.add_extension(
         '_umath_linalg',
-        sources=['umath_linalg.c.src', get_lapack_lite_sources],
+        sources=['umath_linalg.cpp', get_lapack_lite_sources],
         depends=['lapack_lite/f2c.h'],
         extra_info=lapack_info,
+        extra_cxx_compile_args=['-std=c++11',
+                                '-D__STDC_VERSION__=0',
+                                '-fno-exceptions',
+                                '-fno-rtti'],
         libraries=['npymath'],
     )
     config.add_data_files('*.pyi')
diff --git a/numpy/linalg/umath_linalg.c.src b/numpy/linalg/umath_linalg.cpp
index f8a154445..bbeb37906 100644
--- a/numpy/linalg/umath_linalg.c.src
+++ b/numpy/linalg/umath_linalg.cpp
@@ -18,14 +18,20 @@
 
 #include "npy_cblas.h"
 
-#include <stddef.h>
-#include <stdio.h>
-#include <assert.h>
-#include <math.h>
+#include <cstddef>
+#include <cstdio>
+#include <cassert>
+#include <cmath>
+#include <utility>
 
 
 static const char* umath_linalg_version_string = "0.1.5";
 
+struct scalar_trait {};
+struct complex_trait {};
+template<typename typ>
+using dispatch_scalar = typename std::conditional<std::is_scalar<typ>::value, scalar_trait, complex_trait>::type;
+
 /*
  ****************************************************************************
  *                        Debugging support                                 *
@@ -78,28 +84,28 @@ typedef double            fortran_doublereal;
 typedef f2c_complex       fortran_complex;
 typedef f2c_doublecomplex fortran_doublecomplex;
 
-extern fortran_int
+extern "C" fortran_int
 FNAME(sgeev)(char *jobvl, char *jobvr, fortran_int *n,
              float a[], fortran_int *lda, float wr[], float wi[],
              float vl[], fortran_int *ldvl, float vr[], fortran_int *ldvr,
              float work[], fortran_int lwork[],
              fortran_int *info);
-extern fortran_int
+extern "C" fortran_int
 FNAME(dgeev)(char *jobvl, char *jobvr, fortran_int *n,
              double a[], fortran_int *lda, double wr[], double wi[],
              double vl[], fortran_int *ldvl, double vr[], fortran_int *ldvr,
              double work[], fortran_int lwork[],
              fortran_int *info);
-extern fortran_int
+extern "C" fortran_int
 FNAME(cgeev)(char *jobvl, char *jobvr, fortran_int *n,
-             f2c_doublecomplex a[], fortran_int *lda,
-             f2c_doublecomplex w[],
-             f2c_doublecomplex vl[], fortran_int *ldvl,
-             f2c_doublecomplex vr[], fortran_int *ldvr,
-             f2c_doublecomplex work[], fortran_int *lwork,
-             double rwork[],
+             f2c_complex a[], fortran_int *lda,
+             f2c_complex w[],
+             f2c_complex vl[], fortran_int *ldvl,
+             f2c_complex vr[], fortran_int *ldvr,
+             f2c_complex work[], fortran_int *lwork,
+             float rwork[],
              fortran_int *info);
-extern fortran_int
+extern "C" fortran_int
 FNAME(zgeev)(char *jobvl, char *jobvr, fortran_int *n,
              f2c_doublecomplex a[], fortran_int *lda,
              f2c_doublecomplex w[],
@@ -109,24 +115,24 @@ FNAME(zgeev)(char *jobvl, char *jobvr, fortran_int *n,
              double rwork[],
              fortran_int *info);
 
-extern fortran_int
+extern "C" fortran_int
 FNAME(ssyevd)(char *jobz, char *uplo, fortran_int *n,
               float a[], fortran_int *lda, float w[], float work[],
               fortran_int *lwork, fortran_int iwork[], fortran_int *liwork,
               fortran_int *info);
-extern fortran_int
+extern "C" fortran_int
 FNAME(dsyevd)(char *jobz, char *uplo, fortran_int *n,
               double a[], fortran_int *lda, double w[], double work[],
               fortran_int *lwork, fortran_int iwork[], fortran_int *liwork,
               fortran_int *info);
-extern fortran_int
+extern "C" fortran_int
 FNAME(cheevd)(char *jobz, char *uplo, fortran_int *n,
               f2c_complex a[], fortran_int *lda,
               float w[], f2c_complex work[],
               fortran_int *lwork, float rwork[], fortran_int *lrwork, fortran_int iwork[],
               fortran_int *liwork,
               fortran_int *info);
-extern fortran_int
+extern "C" fortran_int
 FNAME(zheevd)(char *jobz, char *uplo, fortran_int *n,
               f2c_doublecomplex a[], fortran_int *lda,
               double w[], f2c_doublecomplex work[],
@@ -134,19 +140,19 @@ FNAME(zheevd)(char *jobz, char *uplo, fortran_int *n,
               fortran_int *liwork,
               fortran_int *info);
 
-extern fortran_int
+extern "C" fortran_int
 FNAME(sgelsd)(fortran_int *m, fortran_int *n, fortran_int *nrhs,
               float a[], fortran_int *lda, float b[], fortran_int *ldb,
               float s[], float *rcond, fortran_int *rank,
               float work[], fortran_int *lwork, fortran_int iwork[],
               fortran_int *info);
-extern fortran_int
+extern "C" fortran_int
 FNAME(dgelsd)(fortran_int *m, fortran_int *n, fortran_int *nrhs,
               double a[], fortran_int *lda, double b[], fortran_int *ldb,
               double s[], double *rcond, fortran_int *rank,
               double work[], fortran_int *lwork, fortran_int iwork[],
               fortran_int *info);
-extern fortran_int
+extern "C" fortran_int
 FNAME(cgelsd)(fortran_int *m, fortran_int *n, fortran_int *nrhs,
               f2c_complex a[], fortran_int *lda,
               f2c_complex b[], fortran_int *ldb,
@@ -154,7 +160,7 @@ FNAME(cgelsd)(fortran_int *m, fortran_int *n, fortran_int *nrhs,
               f2c_complex work[], fortran_int *lwork,
               float rwork[], fortran_int iwork[],
               fortran_int *info);
-extern fortran_int
+extern "C" fortran_int
 FNAME(zgelsd)(fortran_int *m, fortran_int *n, fortran_int *nrhs,
               f2c_doublecomplex a[], fortran_int *lda,
               f2c_doublecomplex b[], fortran_int *ldb,
@@ -163,105 +169,105 @@ FNAME(zgelsd)(fortran_int *m, fortran_int *n, fortran_int *nrhs,
               double rwork[], fortran_int iwork[],
               fortran_int *info);
 
-extern fortran_int
+extern "C" fortran_int
 FNAME(dgeqrf)(fortran_int *m, fortran_int *n, double a[], fortran_int *lda,
               double tau[], double work[],
               fortran_int *lwork, fortran_int *info);
-extern fortran_int
+extern "C" fortran_int
 FNAME(zgeqrf)(fortran_int *m, fortran_int *n, f2c_doublecomplex a[], fortran_int *lda,
               f2c_doublecomplex tau[], f2c_doublecomplex work[],
               fortran_int *lwork, fortran_int *info);
 
-extern fortran_int
+extern "C" fortran_int
 FNAME(dorgqr)(fortran_int *m, fortran_int *n, fortran_int *k, double a[], fortran_int *lda,
               double tau[], double work[],
               fortran_int *lwork, fortran_int *info);
-extern fortran_int
+extern "C" fortran_int
 FNAME(zungqr)(fortran_int *m, fortran_int *n, fortran_int *k, f2c_doublecomplex a[],
               fortran_int *lda, f2c_doublecomplex tau[],
               f2c_doublecomplex work[], fortran_int *lwork, fortran_int *info);
 
-extern fortran_int
+extern "C" fortran_int
 FNAME(sgesv)(fortran_int *n, fortran_int *nrhs,
              float a[], fortran_int *lda,
              fortran_int ipiv[],
              float b[], fortran_int *ldb,
              fortran_int *info);
-extern fortran_int
+extern "C" fortran_int
 FNAME(dgesv)(fortran_int *n, fortran_int *nrhs,
              double a[], fortran_int *lda,
              fortran_int ipiv[],
              double b[], fortran_int *ldb,
              fortran_int *info);
-extern fortran_int
+extern "C" fortran_int
 FNAME(cgesv)(fortran_int *n, fortran_int *nrhs,
              f2c_complex a[], fortran_int *lda,
              fortran_int ipiv[],
              f2c_complex b[], fortran_int *ldb,
              fortran_int *info);
-extern fortran_int
+extern "C" fortran_int
 FNAME(zgesv)(fortran_int *n, fortran_int *nrhs,
              f2c_doublecomplex a[], fortran_int *lda,
              fortran_int ipiv[],
              f2c_doublecomplex b[], fortran_int *ldb,
              fortran_int *info);
 
-extern fortran_int
+extern "C" fortran_int
 FNAME(sgetrf)(fortran_int *m, fortran_int *n,
               float a[], fortran_int *lda,
               fortran_int ipiv[],
               fortran_int *info);
-extern fortran_int
+extern "C" fortran_int
 FNAME(dgetrf)(fortran_int *m, fortran_int *n,
               double a[], fortran_int *lda,
               fortran_int ipiv[],
               fortran_int *info);
-extern fortran_int
+extern "C" fortran_int
 FNAME(cgetrf)(fortran_int *m, fortran_int *n,
               f2c_complex a[], fortran_int *lda,
               fortran_int ipiv[],
               fortran_int *info);
-extern fortran_int
+extern "C" fortran_int
 FNAME(zgetrf)(fortran_int *m, fortran_int *n,
               f2c_doublecomplex a[], fortran_int *lda,
               fortran_int ipiv[],
               fortran_int *info);
 
-extern fortran_int
+extern "C" fortran_int
 FNAME(spotrf)(char *uplo, fortran_int *n,
               float a[], fortran_int *lda,
               fortran_int *info);
-extern fortran_int
+extern "C" fortran_int
 FNAME(dpotrf)(char *uplo, fortran_int *n,
               double a[], fortran_int *lda,
               fortran_int *info);
-extern fortran_int
+extern "C" fortran_int
 FNAME(cpotrf)(char *uplo, fortran_int *n,
               f2c_complex a[], fortran_int *lda,
               fortran_int *info);
-extern fortran_int
+extern "C" fortran_int
 FNAME(zpotrf)(char *uplo, fortran_int *n,
               f2c_doublecomplex a[], fortran_int *lda,
               fortran_int *info);
 
-extern fortran_int
+extern "C" fortran_int
 FNAME(sgesdd)(char *jobz, fortran_int *m, fortran_int *n,
               float a[], fortran_int *lda, float s[], float u[],
               fortran_int *ldu, float vt[], fortran_int *ldvt, float work[],
               fortran_int *lwork, fortran_int iwork[], fortran_int *info);
-extern fortran_int
+extern "C" fortran_int
 FNAME(dgesdd)(char *jobz, fortran_int *m, fortran_int *n,
               double a[], fortran_int *lda, double s[], double u[],
               fortran_int *ldu, double vt[], fortran_int *ldvt, double work[],
               fortran_int *lwork, fortran_int iwork[], fortran_int *info);
-extern fortran_int
+extern "C" fortran_int
 FNAME(cgesdd)(char *jobz, fortran_int *m, fortran_int *n,
               f2c_complex a[], fortran_int *lda,
               float s[], f2c_complex u[], fortran_int *ldu,
               f2c_complex vt[], fortran_int *ldvt,
               f2c_complex work[], fortran_int *lwork,
               float rwork[], fortran_int iwork[], fortran_int *info);
-extern fortran_int
+extern "C" fortran_int
 FNAME(zgesdd)(char *jobz, fortran_int *m, fortran_int *n,
               f2c_doublecomplex a[], fortran_int *lda,
               double s[], f2c_doublecomplex u[], fortran_int *ldu,
@@ -269,87 +275,87 @@ FNAME(zgesdd)(char *jobz, fortran_int *m, fortran_int *n,
               f2c_doublecomplex work[], fortran_int *lwork,
               double rwork[], fortran_int iwork[], fortran_int *info);
 
-extern fortran_int
+extern "C" fortran_int
 FNAME(spotrs)(char *uplo, fortran_int *n, fortran_int *nrhs,
               float a[], fortran_int *lda,
               float b[], fortran_int *ldb,
               fortran_int *info);
-extern fortran_int
+extern "C" fortran_int
 FNAME(dpotrs)(char *uplo, fortran_int *n, fortran_int *nrhs,
               double a[], fortran_int *lda,
               double b[], fortran_int *ldb,
               fortran_int *info);
-extern fortran_int
+extern "C" fortran_int
 FNAME(cpotrs)(char *uplo, fortran_int *n, fortran_int *nrhs,
               f2c_complex a[], fortran_int *lda,
               f2c_complex b[], fortran_int *ldb,
               fortran_int *info);
-extern fortran_int
+extern "C" fortran_int
 FNAME(zpotrs)(char *uplo, fortran_int *n, fortran_int *nrhs,
               f2c_doublecomplex a[], fortran_int *lda,
               f2c_doublecomplex b[], fortran_int *ldb,
               fortran_int *info);
 
-extern fortran_int
+extern "C" fortran_int
 FNAME(spotri)(char *uplo, fortran_int *n,
               float a[], fortran_int *lda,
               fortran_int *info);
-extern fortran_int
+extern "C" fortran_int
 FNAME(dpotri)(char *uplo, fortran_int *n,
               double a[], fortran_int *lda,
               fortran_int *info);
-extern fortran_int
+extern "C" fortran_int
 FNAME(cpotri)(char *uplo, fortran_int *n,
               f2c_complex a[], fortran_int *lda,
               fortran_int *info);
-extern fortran_int
+extern "C" fortran_int
 FNAME(zpotri)(char *uplo, fortran_int *n,
               f2c_doublecomplex a[], fortran_int *lda,
               fortran_int *info);
 
-extern fortran_int
+extern "C" fortran_int
 FNAME(scopy)(fortran_int *n,
              float *sx, fortran_int *incx,
              float *sy, fortran_int *incy);
-extern fortran_int
+extern "C" fortran_int
 FNAME(dcopy)(fortran_int *n,
              double *sx, fortran_int *incx,
              double *sy, fortran_int *incy);
-extern fortran_int
+extern "C" fortran_int
 FNAME(ccopy)(fortran_int *n,
              f2c_complex *sx, fortran_int *incx,
              f2c_complex *sy, fortran_int *incy);
-extern fortran_int
+extern "C" fortran_int
 FNAME(zcopy)(fortran_int *n,
              f2c_doublecomplex *sx, fortran_int *incx,
              f2c_doublecomplex *sy, fortran_int *incy);
 
-extern float
+extern "C" float
 FNAME(sdot)(fortran_int *n,
             float *sx, fortran_int *incx,
             float *sy, fortran_int *incy);
-extern double
+extern "C" double
 FNAME(ddot)(fortran_int *n,
             double *sx, fortran_int *incx,
             double *sy, fortran_int *incy);
-extern void
+extern "C" void
 FNAME(cdotu)(f2c_complex *ret, fortran_int *n,
              f2c_complex *sx, fortran_int *incx,
              f2c_complex *sy, fortran_int *incy);
-extern void
+extern "C" void
 FNAME(zdotu)(f2c_doublecomplex *ret, fortran_int *n,
              f2c_doublecomplex *sx, fortran_int *incx,
              f2c_doublecomplex *sy, fortran_int *incy);
-extern void
+extern "C" void
 FNAME(cdotc)(f2c_complex *ret, fortran_int *n,
              f2c_complex *sx, fortran_int *incx,
              f2c_complex *sy, fortran_int *incy);
-extern void
+extern "C" void
 FNAME(zdotc)(f2c_doublecomplex *ret, fortran_int *n,
              f2c_doublecomplex *sx, fortran_int *incx,
              f2c_doublecomplex *sy, fortran_int *incy);
 
-extern fortran_int
+extern "C" fortran_int
 FNAME(sgemm)(char *transa, char *transb,
              fortran_int *m, fortran_int *n, fortran_int *k,
              float *alpha,
@@ -357,7 +363,7 @@ FNAME(sgemm)(char *transa, char *transb,
              float *b, fortran_int *ldb,
              float *beta,
              float *c, fortran_int *ldc);
-extern fortran_int
+extern "C" fortran_int
 FNAME(dgemm)(char *transa, char *transb,
              fortran_int *m, fortran_int *n, fortran_int *k,
              double *alpha,
@@ -365,7 +371,7 @@ FNAME(dgemm)(char *transa, char *transb,
              double *b, fortran_int *ldb,
              double *beta,
              double *c, fortran_int *ldc);
-extern fortran_int
+extern "C" fortran_int
 FNAME(cgemm)(char *transa, char *transb,
              fortran_int *m, fortran_int *n, fortran_int *k,
              f2c_complex *alpha,
@@ -373,7 +379,7 @@ FNAME(cgemm)(char *transa, char *transb,
              f2c_complex *b, fortran_int *ldb,
              f2c_complex *beta,
              f2c_complex *c, fortran_int *ldc);
-extern fortran_int
+extern "C" fortran_int
 FNAME(zgemm)(char *transa, char *transb,
              fortran_int *m, fortran_int *n, fortran_int *k,
              f2c_doublecomplex *alpha,
@@ -400,7 +406,7 @@ FNAME(zgemm)(char *transa, char *transb,
  *****************************************************************************
  */
 
-static NPY_INLINE int
+static inline int
 get_fp_invalid_and_clear(void)
 {
     int status;
@@ -408,7 +414,7 @@ get_fp_invalid_and_clear(void)
     return !!(status & NPY_FPE_INVALID);
 }
 
-static NPY_INLINE void
+static inline void
 set_fp_invalid_or_clear(int error_occurred)
 {
     if (error_occurred) {
@@ -427,80 +433,92 @@ set_fp_invalid_or_clear(int error_occurred)
 
 #define UMATH_LINALG_MODULE_NAME "_umath_linalg"
 
-typedef union {
-    fortran_complex f;
-    npy_cfloat npy;
-    float array[2];
-} COMPLEX_t;
-
-typedef union {
-    fortran_doublecomplex f;
-    npy_cdouble npy;
-    double array[2];
-} DOUBLECOMPLEX_t;
-
-static float s_one;
-static float s_zero;
-static float s_minus_one;
-static float s_ninf;
-static float s_nan;
-static double d_one;
-static double d_zero;
-static double d_minus_one;
-static double d_ninf;
-static double d_nan;
-static COMPLEX_t c_one;
-static COMPLEX_t c_zero;
-static COMPLEX_t c_minus_one;
-static COMPLEX_t c_ninf;
-static COMPLEX_t c_nan;
-static DOUBLECOMPLEX_t z_one;
-static DOUBLECOMPLEX_t z_zero;
-static DOUBLECOMPLEX_t z_minus_one;
-static DOUBLECOMPLEX_t z_ninf;
-static DOUBLECOMPLEX_t z_nan;
-
-static void init_constants(void)
-{
-    /*
-       this is needed as NPY_INFINITY and NPY_NAN macros
-       can't be used as initializers. I prefer to just set
-       all the constants the same way.
-    */
-    s_one  = 1.0f;
-    s_zero = 0.0f;
-    s_minus_one = -1.0f;
-    s_ninf = -NPY_INFINITYF;
-    s_nan = NPY_NANF;
-
-    d_one  = 1.0;
-    d_zero = 0.0;
-    d_minus_one = -1.0;
-    d_ninf = -NPY_INFINITY;
-    d_nan = NPY_NAN;
-
-    c_one.array[0]  = 1.0f;
-    c_one.array[1]  = 0.0f;
-    c_zero.array[0] = 0.0f;
-    c_zero.array[1] = 0.0f;
-    c_minus_one.array[0] = -1.0f;
-    c_minus_one.array[1] = 0.0f;
-    c_ninf.array[0] = -NPY_INFINITYF;
-    c_ninf.array[1] = 0.0f;
-    c_nan.array[0] = NPY_NANF;
-    c_nan.array[1] = NPY_NANF;
-
-    z_one.array[0]  = 1.0;
-    z_one.array[1]  = 0.0;
-    z_zero.array[0] = 0.0;
-    z_zero.array[1] = 0.0;
-    z_minus_one.array[0] = -1.0;
-    z_minus_one.array[1] = 0.0;
-    z_ninf.array[0] = -NPY_INFINITY;
-    z_ninf.array[1] = 0.0;
-    z_nan.array[0] = NPY_NAN;
-    z_nan.array[1] = NPY_NAN;
-}
+template<typename T>
+struct numeric_limits;
+
+template<>
+struct numeric_limits<float> {
+static constexpr float one = 1.0f;
+static constexpr float zero = 0.0f;
+static constexpr float minus_one = -1.0f;
+static const float ninf;
+static const float nan;
+};
+constexpr float numeric_limits<float>::one;
+constexpr float numeric_limits<float>::zero;
+constexpr float numeric_limits<float>::minus_one;
+const float numeric_limits<float>::ninf = -NPY_INFINITYF;
+const float numeric_limits<float>::nan = NPY_NANF;
+
+template<>
+struct numeric_limits<double> {
+static constexpr double one = 1.0;
+static constexpr double zero = 0.0;
+static constexpr double minus_one = -1.0;
+static const double ninf;
+static const double nan;
+};
+constexpr double numeric_limits<double>::one;
+constexpr double numeric_limits<double>::zero;
+constexpr double numeric_limits<double>::minus_one;
+const double numeric_limits<double>::ninf = -NPY_INFINITY;
+const double numeric_limits<double>::nan = NPY_NAN;
+
+template<>
+struct numeric_limits<npy_cfloat> {
+static constexpr npy_cfloat one = {1.0f, 0.0f};
+static constexpr npy_cfloat zero = {0.0f, 0.0f};
+static constexpr npy_cfloat minus_one = {-1.0f, 0.0f};
+static const npy_cfloat ninf;
+static const npy_cfloat nan;
+};
+constexpr npy_cfloat numeric_limits<npy_cfloat>::one;
+constexpr npy_cfloat numeric_limits<npy_cfloat>::zero;
+constexpr npy_cfloat numeric_limits<npy_cfloat>::minus_one;
+const npy_cfloat numeric_limits<npy_cfloat>::ninf = {-NPY_INFINITYF, 0.0f};
+const npy_cfloat numeric_limits<npy_cfloat>::nan = {NPY_NANF, NPY_NANF};
+
+template<>
+struct numeric_limits<f2c_complex> {
+static constexpr f2c_complex one = {1.0f, 0.0f};
+static constexpr f2c_complex zero = {0.0f, 0.0f};
+static constexpr f2c_complex minus_one = {-1.0f, 0.0f};
+static const f2c_complex ninf;
+static const f2c_complex nan;
+};
+constexpr f2c_complex numeric_limits<f2c_complex>::one;
+constexpr f2c_complex numeric_limits<f2c_complex>::zero;
+constexpr f2c_complex numeric_limits<f2c_complex>::minus_one;
+const f2c_complex numeric_limits<f2c_complex>::ninf = {-NPY_INFINITYF, 0.0f};
+const f2c_complex numeric_limits<f2c_complex>::nan = {NPY_NANF, NPY_NANF};
+
+template<>
+struct numeric_limits<npy_cdouble> {
+static constexpr npy_cdouble one = {1.0, 0.0};
+static constexpr npy_cdouble zero = {0.0, 0.0};
+static constexpr npy_cdouble minus_one = {-1.0, 0.0};
+static const npy_cdouble ninf;
+static const npy_cdouble nan;
+};
+constexpr npy_cdouble numeric_limits<npy_cdouble>::one;
+constexpr npy_cdouble numeric_limits<npy_cdouble>::zero;
+constexpr npy_cdouble numeric_limits<npy_cdouble>::minus_one;
+const npy_cdouble numeric_limits<npy_cdouble>::ninf = {-NPY_INFINITY, 0.0};
+const npy_cdouble numeric_limits<npy_cdouble>::nan = {NPY_NAN, NPY_NAN};
+
+template<>
+struct numeric_limits<f2c_doublecomplex> {
+static constexpr f2c_doublecomplex one = {1.0, 0.0};
+static constexpr f2c_doublecomplex zero = {0.0, 0.0};
+static constexpr f2c_doublecomplex minus_one = {-1.0, 0.0};
+static const f2c_doublecomplex ninf;
+static const f2c_doublecomplex nan;
+};
+constexpr f2c_doublecomplex numeric_limits<f2c_doublecomplex>::one;
+constexpr f2c_doublecomplex numeric_limits<f2c_doublecomplex>::zero;
+constexpr f2c_doublecomplex numeric_limits<f2c_doublecomplex>::minus_one;
+const f2c_doublecomplex numeric_limits<f2c_doublecomplex>::ninf = {-NPY_INFINITY, 0.0};
+const f2c_doublecomplex numeric_limits<f2c_doublecomplex>::nan = {NPY_NAN, NPY_NAN};
 
 /*
  *****************************************************************************
@@ -529,7 +547,7 @@ typedef struct linearize_data_struct
   npy_intp output_lead_dim;
 } LINEARIZE_DATA_t;
 
-static NPY_INLINE void
+static inline void
 init_linearize_data_ex(LINEARIZE_DATA_t *lin_data,
                        npy_intp rows,
                        npy_intp columns,
@@ -544,7 +562,7 @@ init_linearize_data_ex(LINEARIZE_DATA_t *lin_data,
     lin_data->output_lead_dim = output_lead_dim;
 }
 
-static NPY_INLINE void
+static inline void
 init_linearize_data(LINEARIZE_DATA_t *lin_data,
                     npy_intp rows,
                     npy_intp columns,
@@ -555,7 +573,7 @@ init_linearize_data(LINEARIZE_DATA_t *lin_data,
         lin_data, rows, columns, row_strides, column_strides, columns);
 }
 
-static NPY_INLINE void
+static inline void
 dump_ufunc_object(PyUFuncObject* ufunc)
 {
     TRACE_TXT("\n\n%s '%s' (%d input(s), %d output(s), %d specialization(s).\n",
@@ -580,7 +598,7 @@ dump_ufunc_object(PyUFuncObject* ufunc)
     }
 }
 
-static NPY_INLINE void
+static inline void
 dump_linearize_data(const char* name, const LINEARIZE_DATA_t* params)
 {
     TRACE_TXT("\n\t%s rows: %zd columns: %zd"\
@@ -589,37 +607,34 @@ dump_linearize_data(const char* name, const LINEARIZE_DATA_t* params)
               params->row_strides, params->column_strides);
 }
 
-static NPY_INLINE void
-print_FLOAT(npy_float s)
+static inline void
+print(npy_float s)
 {
     TRACE_TXT(" %8.4f", s);
 }
-static NPY_INLINE void
-print_DOUBLE(npy_double d)
+static inline void
+print(npy_double d)
 {
     TRACE_TXT(" %10.6f", d);
 }
-static NPY_INLINE void
-print_CFLOAT(npy_cfloat c)
+static inline void
+print(npy_cfloat c)
 {
     float* c_parts = (float*)&c;
     TRACE_TXT("(%8.4f, %8.4fj)", c_parts[0], c_parts[1]);
 }
-static NPY_INLINE void
-print_CDOUBLE(npy_cdouble z)
+static inline void
+print(npy_cdouble z)
 {
     double* z_parts = (double*)&z;
     TRACE_TXT("(%8.4f, %8.4fj)", z_parts[0], z_parts[1]);
 }
 
-/**begin repeat
-   #TYPE = FLOAT, DOUBLE, CFLOAT, CDOUBLE#
-   #typ = npy_float, npy_double, npy_cfloat, npy_cdouble#
- */
-static NPY_INLINE void
-dump_@TYPE@_matrix(const char* name,
+template<typename typ>
+static inline void
+dump_matrix(const char* name,
                    size_t rows, size_t columns,
-                   const @typ@* ptr)
+                  const typ* ptr)
 {
     size_t i, j;
 
@@ -629,13 +644,12 @@ dump_@TYPE@_matrix(const char* name,
         TRACE_TXT("| ");
         for (j = 0; j < columns; j++)
         {
-            print_@TYPE@(ptr[j*rows + i]);
+            print(ptr[j*rows + i]);
             TRACE_TXT(", ");
         }
         TRACE_TXT(" |\n");
     }
 }
-/**end repeat**/
 
 
 /*
@@ -644,12 +658,12 @@ dump_@TYPE@_matrix(const char* name,
  *****************************************************************************
  */
 
-static NPY_INLINE fortran_int
+static inline fortran_int
 fortran_int_min(fortran_int x, fortran_int y) {
     return x < y ? x : y;
 }
 
-static NPY_INLINE fortran_int
+static inline fortran_int
 fortran_int_max(fortran_int x, fortran_int y) {
     return x > y ? x : y;
 }
@@ -736,7 +750,7 @@ fortran_int_max(fortran_int x, fortran_int y) {
 
 #define END_OUTER_LOOP  }
 
-static NPY_INLINE void
+static inline void
 update_pointers(npy_uint8** bases, ptrdiff_t* offsets, size_t count)
 {
     size_t i;
@@ -754,45 +768,100 @@ update_pointers(npy_uint8** bases, ptrdiff_t* offsets, size_t count)
 
 /*
  *****************************************************************************
+ **                             DISPATCHER FUNCS                            **
+ *****************************************************************************
+ */
+static fortran_int copy(fortran_int *n,
+        float *sx, fortran_int *incx,
+        float *sy, fortran_int *incy) { return FNAME(scopy)(n, sx, incx,
+            sy, incy);
+}
+static fortran_int copy(fortran_int *n,
+        double *sx, fortran_int *incx,
+        double *sy, fortran_int *incy) { return FNAME(dcopy)(n, sx, incx,
+            sy, incy);
+}
+static fortran_int copy(fortran_int *n,
+        f2c_complex *sx, fortran_int *incx,
+        f2c_complex *sy, fortran_int *incy) { return FNAME(ccopy)(n, sx, incx,
+            sy, incy);
+}
+static fortran_int copy(fortran_int *n,
+        f2c_doublecomplex *sx, fortran_int *incx,
+        f2c_doublecomplex *sy, fortran_int *incy) { return FNAME(zcopy)(n, sx, incx,
+            sy, incy);
+}
+
+static fortran_int getrf(fortran_int *m, fortran_int *n, float a[], fortran_int
+*lda, fortran_int ipiv[], fortran_int *info) {
+ return LAPACK(sgetrf)(m, n, a, lda, ipiv, info);
+}
+static fortran_int getrf(fortran_int *m, fortran_int *n, double a[], fortran_int
+*lda, fortran_int ipiv[], fortran_int *info) {
+ return LAPACK(dgetrf)(m, n, a, lda, ipiv, info);
+}
+static fortran_int getrf(fortran_int *m, fortran_int *n, f2c_complex a[], fortran_int
+*lda, fortran_int ipiv[], fortran_int *info) {
+ return LAPACK(cgetrf)(m, n, a, lda, ipiv, info);
+}
+static fortran_int getrf(fortran_int *m, fortran_int *n, f2c_doublecomplex a[], fortran_int
+*lda, fortran_int ipiv[], fortran_int *info) {
+ return LAPACK(zgetrf)(m, n, a, lda, ipiv, info);
+}
+
+/*
+ *****************************************************************************
  **                             HELPER FUNCS                                **
  *****************************************************************************
  */
+template<typename T>
+struct fortran_type {
+using type = T;
+};
+
+template<> struct fortran_type<npy_cfloat> { using type = f2c_complex;};
+template<> struct fortran_type<npy_cdouble> { using type = f2c_doublecomplex;};
+template<typename T>
+using fortran_type_t = typename fortran_type<T>::type;
+
+template<typename T>
+struct basetype {
+using type = T;
+};
+template<> struct basetype<npy_cfloat> { using type = npy_float;};
+template<> struct basetype<npy_cdouble> { using type = npy_double;};
+template<> struct basetype<f2c_complex> { using type = fortran_real;};
+template<> struct basetype<f2c_doublecomplex> { using type = fortran_doublereal;};
+template<typename T>
+using basetype_t = typename basetype<T>::type;
 
              /* rearranging of 2D matrices using blas */
 
-/**begin repeat
-    #TYPE = FLOAT, DOUBLE, CFLOAT, CDOUBLE#
-    #typ = float, double, COMPLEX_t, DOUBLECOMPLEX_t#
-    #copy = scopy, dcopy, ccopy, zcopy#
-    #nan = s_nan, d_nan, c_nan, z_nan#
-    #zero = s_zero, d_zero, c_zero, z_zero#
- */
-static NPY_INLINE void *
-linearize_@TYPE@_matrix(void *dst_in,
-                        void *src_in,
+template<typename typ>
+static inline void *
+linearize_matrix(typ *dst,
+                        typ *src,
                         const LINEARIZE_DATA_t* data)
 {
-    @typ@ *src = (@typ@ *) src_in;
-    @typ@ *dst = (@typ@ *) dst_in;
-
+    using ftyp = fortran_type_t<typ>;
     if (dst) {
         int i, j;
-        @typ@* rv = dst;
+        typ* rv = dst;
         fortran_int columns = (fortran_int)data->columns;
         fortran_int column_strides =
-            (fortran_int)(data->column_strides/sizeof(@typ@));
+            (fortran_int)(data->column_strides/sizeof(typ));
         fortran_int one = 1;
         for (i = 0; i < data->rows; i++) {
             if (column_strides > 0) {
-                FNAME(@copy@)(&columns,
-                              (void*)src, &column_strides,
-                              (void*)dst, &one);
+                copy(&columns,
+                              (ftyp*)src, &column_strides,
+                              (ftyp*)dst, &one);
             }
             else if (column_strides < 0) {
-                FNAME(@copy@)(&columns,
-                              (void*)((@typ@*)src + (columns-1)*column_strides),
+                copy(&columns,
+                              ((ftyp*)src + (columns-1)*column_strides),
                               &column_strides,
-                              (void*)dst, &one);
+                              (ftyp*)dst, &one);
             }
             else {
                 /*
@@ -801,10 +870,10 @@ linearize_@TYPE@_matrix(void *dst_in,
                  * manually
                  */
                 for (j = 0; j < columns; ++j) {
-                    memcpy((@typ@*)dst + j, (@typ@*)src, sizeof(@typ@));
+                    memcpy(dst + j, src, sizeof(typ));
                 }
             }
-            src += data->row_strides/sizeof(@typ@);
+            src += data->row_strides/sizeof(typ);
             dst += data->output_lead_dim;
         }
         return rv;
@@ -813,31 +882,31 @@ linearize_@TYPE@_matrix(void *dst_in,
     }
 }
 
-static NPY_INLINE void *
-delinearize_@TYPE@_matrix(void *dst_in,
-                          void *src_in,
+template<typename typ>
+static inline void *
+delinearize_matrix(typ *dst,
+                          typ *src,
                           const LINEARIZE_DATA_t* data)
 {
-    @typ@ *src = (@typ@ *) src_in;
-    @typ@ *dst = (@typ@ *) dst_in;
+using ftyp = fortran_type_t<typ>;
 
     if (src) {
         int i;
-        @typ@ *rv = src;
+        typ *rv = src;
         fortran_int columns = (fortran_int)data->columns;
         fortran_int column_strides =
-            (fortran_int)(data->column_strides/sizeof(@typ@));
+            (fortran_int)(data->column_strides/sizeof(typ));
         fortran_int one = 1;
         for (i = 0; i < data->rows; i++) {
             if (column_strides > 0) {
-                FNAME(@copy@)(&columns,
-                              (void*)src, &one,
-                              (void*)dst, &column_strides);
+                copy(&columns,
+                              (ftyp*)src, &one,
+                              (ftyp*)dst, &column_strides);
             }
             else if (column_strides < 0) {
-                FNAME(@copy@)(&columns,
-                              (void*)src, &one,
-                              (void*)((@typ@*)dst + (columns-1)*column_strides),
+                copy(&columns,
+                              (ftyp*)src, &one,
+                              ((ftyp*)dst + (columns-1)*column_strides),
                               &column_strides);
             }
             else {
@@ -847,13 +916,13 @@ delinearize_@TYPE@_matrix(void *dst_in,
                  * manually
                  */
                 if (columns > 0) {
-                    memcpy((@typ@*)dst,
-                           (@typ@*)src + (columns-1),
-                           sizeof(@typ@));
+                    memcpy(dst,
+                           src + (columns-1),
+                           sizeof(typ));
                 }
             }
             src += data->output_lead_dim;
-            dst += data->row_strides/sizeof(@typ@);
+            dst += data->row_strides/sizeof(typ);
         }
 
         return rv;
@@ -862,116 +931,97 @@ delinearize_@TYPE@_matrix(void *dst_in,
     }
 }
 
-static NPY_INLINE void
-nan_@TYPE@_matrix(void *dst_in, const LINEARIZE_DATA_t* data)
+template<typename typ>
+static inline void
+nan_matrix(typ *dst, const LINEARIZE_DATA_t* data)
 {
-    @typ@ *dst = (@typ@ *) dst_in;
-
     int i, j;
     for (i = 0; i < data->rows; i++) {
-        @typ@ *cp = dst;
-        ptrdiff_t cs = data->column_strides/sizeof(@typ@);
+        typ *cp = dst;
+        ptrdiff_t cs = data->column_strides/sizeof(typ);
         for (j = 0; j < data->columns; ++j) {
-            *cp = @nan@;
+            *cp = numeric_limits<typ>::nan;
             cp += cs;
         }
-        dst += data->row_strides/sizeof(@typ@);
+        dst += data->row_strides/sizeof(typ);
     }
 }
 
-static NPY_INLINE void
-zero_@TYPE@_matrix(void *dst_in, const LINEARIZE_DATA_t* data)
+template<typename typ>
+static inline void
+zero_matrix(typ *dst, const LINEARIZE_DATA_t* data)
 {
-    @typ@ *dst = (@typ@ *) dst_in;
-
     int i, j;
     for (i = 0; i < data->rows; i++) {
-        @typ@ *cp = dst;
-        ptrdiff_t cs = data->column_strides/sizeof(@typ@);
+        typ *cp = dst;
+        ptrdiff_t cs = data->column_strides/sizeof(typ);
         for (j = 0; j < data->columns; ++j) {
-            *cp = @zero@;
+            *cp = numeric_limits<typ>::zero;
             cp += cs;
         }
-        dst += data->row_strides/sizeof(@typ@);
+        dst += data->row_strides/sizeof(typ);
     }
 }
 
-/**end repeat**/
-
                /* identity square matrix generation */
-/**begin repeat
-   #TYPE = FLOAT, DOUBLE, CFLOAT, CDOUBLE#
-   #typ = float, double, COMPLEX_t, DOUBLECOMPLEX_t#
-   #cblas_type = s, d, c, z#
- */
-static NPY_INLINE void
-identity_@TYPE@_matrix(void *ptr, size_t n)
+template<typename typ>
+static inline void
+identity_matrix(typ *matrix, size_t n)
 {
     size_t i;
-    @typ@ *matrix = (@typ@*) ptr;
     /* in IEEE floating point, zeroes are represented as bitwise 0 */
-    memset(matrix, 0, n*n*sizeof(@typ@));
+    memset(matrix, 0, n*n*sizeof(typ));
 
     for (i = 0; i < n; ++i)
     {
-        *matrix = @cblas_type@_one;
+        *matrix = numeric_limits<typ>::one;
         matrix += n+1;
     }
 }
-/**end repeat**/
 
          /* lower/upper triangular matrix using blas (in place) */
-/**begin repeat
 
-   #TYPE = FLOAT, DOUBLE, CFLOAT, CDOUBLE#
-   #typ = float, double, COMPLEX_t, DOUBLECOMPLEX_t#
-   #cblas_type = s, d, c, z#
- */
-
-static NPY_INLINE void
-triu_@TYPE@_matrix(void *ptr, size_t n)
+template<typename typ>
+static inline void
+triu_matrix(typ *matrix, size_t n)
 {
     size_t i, j;
-    @typ@ *matrix = (@typ@*)ptr;
     matrix += n;
     for (i = 1; i < n; ++i) {
         for (j = 0; j < i; ++j) {
-            matrix[j] = @cblas_type@_zero;
+            matrix[j] = numeric_limits<typ>::zero;
         }
         matrix += n;
     }
 }
-/**end repeat**/
 
 
 /* -------------------------------------------------------------------------- */
                           /* Determinants */
 
-/**begin repeat
-   #TYPE = FLOAT, DOUBLE#
-   #typ = npy_float, npy_double#
-   #log_func = npy_logf, npy_log#
-   #exp_func = npy_expf, npy_exp#
-   #zero = 0.0f, 0.0#
-*/
+static npy_float npylog(npy_float f) { return npy_logf(f);}
+static npy_double npylog(npy_double d) { return npy_log(d);}
+static npy_float npyexp(npy_float f) { return npy_expf(f);}
+static npy_double npyexp(npy_double d) { return npy_exp(d);}
 
-static NPY_INLINE void
-@TYPE@_slogdet_from_factored_diagonal(@typ@* src,
+template<typename typ>
+static inline void
+slogdet_from_factored_diagonal(typ* src,
                                       fortran_int m,
-                                      @typ@ *sign,
-                                      @typ@ *logdet)
+                                      typ *sign,
+                                      typ *logdet)
 {
-    @typ@ acc_sign = *sign;
-    @typ@ acc_logdet = @zero@;
+    typ acc_sign = *sign;
+    typ acc_logdet = numeric_limits<typ>::zero;
     int i;
     for (i = 0; i < m; i++) {
-        @typ@ abs_element = *src;
-        if (abs_element < @zero@) {
+        typ abs_element = *src;
+        if (abs_element < numeric_limits<typ>::zero) {
             acc_sign = -acc_sign;
             abs_element = -abs_element;
         }
 
-        acc_logdet += @log_func@(abs_element);
+        acc_logdet += npylog(abs_element);
         src += m+1;
     }
 
@@ -979,32 +1029,26 @@ static NPY_INLINE void
     *logdet = acc_logdet;
 }
 
-static NPY_INLINE @typ@
-@TYPE@_det_from_slogdet(@typ@ sign, @typ@ logdet)
+template<typename typ>
+static inline typ
+det_from_slogdet(typ sign, typ logdet)
 {
-    @typ@ result = sign * @exp_func@(logdet);
+    typ result = sign * npyexp(logdet);
     return result;
 }
 
-/**end repeat**/
 
+npy_float npyabs(npy_cfloat z) { return npy_cabsf(z);}
+npy_double npyabs(npy_cdouble z) { return npy_cabs(z);}
 
-/**begin repeat
-   #TYPE = CFLOAT, CDOUBLE#
-   #typ = npy_cfloat, npy_cdouble#
-   #basetyp = npy_float, npy_double#
-   #abs_func = npy_cabsf, npy_cabs#
-   #log_func = npy_logf, npy_log#
-   #exp_func = npy_expf, npy_exp#
-   #zero = 0.0f, 0.0#
-*/
-#define RE(COMPLEX) (((@basetyp@*)(&COMPLEX))[0])
-#define IM(COMPLEX) (((@basetyp@*)(&COMPLEX))[1])
+#define RE(COMPLEX) (COMPLEX).real
+#define IM(COMPLEX) (COMPLEX).imag
 
-static NPY_INLINE @typ@
-@TYPE@_mult(@typ@ op1, @typ@ op2)
+template<typename typ>
+static inline typ
+mult(typ op1, typ op2)
 {
-    @typ@ rv;
+    typ rv;
 
     RE(rv) = RE(op1)*RE(op2) - IM(op1)*IM(op2);
     IM(rv) = RE(op1)*IM(op2) + IM(op1)*RE(op2);
@@ -1013,25 +1057,26 @@ static NPY_INLINE @typ@
 }
 
 
-static NPY_INLINE void
-@TYPE@_slogdet_from_factored_diagonal(@typ@* src,
+template<typename typ, typename basetyp>
+static inline void
+slogdet_from_factored_diagonal(typ* src,
                                       fortran_int m,
-                                      @typ@ *sign,
-                                      @basetyp@ *logdet)
+                                      typ *sign,
+                                      basetyp *logdet)
 {
     int i;
-    @typ@ sign_acc = *sign;
-    @basetyp@ logdet_acc = @zero@;
+    typ sign_acc = *sign;
+    basetyp logdet_acc = numeric_limits<basetyp>::zero;
 
     for (i = 0; i < m; i++)
     {
-        @basetyp@ abs_element = @abs_func@(*src);
-        @typ@ sign_element;
+        basetyp abs_element = npyabs(*src);
+        typ sign_element;
         RE(sign_element) = RE(*src) / abs_element;
         IM(sign_element) = IM(*src) / abs_element;
 
-        sign_acc = @TYPE@_mult(sign_acc, sign_element);
-        logdet_acc += @log_func@(abs_element);
+        sign_acc = mult(sign_acc, sign_element);
+        logdet_acc += npylog(abs_element);
         src += m + 1;
     }
 
@@ -1039,17 +1084,17 @@ static NPY_INLINE void
     *logdet = logdet_acc;
 }
 
-static NPY_INLINE @typ@
-@TYPE@_det_from_slogdet(@typ@ sign, @basetyp@ logdet)
+template<typename typ, typename basetyp>
+static inline typ
+det_from_slogdet(typ sign, basetyp logdet)
 {
-    @typ@ tmp;
-    RE(tmp) = @exp_func@(logdet);
-    IM(tmp) = @zero@;
-    return @TYPE@_mult(sign, tmp);
+    typ tmp;
+    RE(tmp) = npyexp(logdet);
+    IM(tmp) = numeric_limits<basetyp>::zero;
+    return mult(sign, tmp);
 }
 #undef RE
 #undef IM
-/**end repeat**/
 
 
 /* As in the linalg package, the determinant is computed via LU factorization
@@ -1057,26 +1102,20 @@ static NPY_INLINE @typ@
  * slogdet computes sign + log(determinant).
  * det computes sign * exp(slogdet).
  */
-/**begin repeat
-
-   #TYPE = FLOAT, DOUBLE, CFLOAT, CDOUBLE#
-   #typ = npy_float, npy_double, npy_cfloat, npy_cdouble#
-   #basetyp = npy_float, npy_double, npy_float, npy_double#
-   #cblas_type = s, d, c, z#
-*/
-
-static NPY_INLINE void
-@TYPE@_slogdet_single_element(fortran_int m,
-                              void* src,
+template<typename typ, typename basetyp>
+static inline void
+slogdet_single_element(fortran_int m,
+                              typ* src,
                               fortran_int* pivots,
-                              @typ@ *sign,
-                              @basetyp@ *logdet)
+                              typ *sign,
+                              basetyp *logdet)
 {
+using ftyp = fortran_type_t<typ>;
     fortran_int info = 0;
     fortran_int lda = fortran_int_max(m, 1);
     int i;
     /* note: done in place */
-    LAPACK(@cblas_type@getrf)(&m, &m, (void *)src, &lda, pivots, &info);
+    getrf(&m, &m, (ftyp*)src, &lda, pivots, &info);
 
     if (info == 0) {
         int change_sign = 0;
@@ -1086,23 +1125,20 @@ static NPY_INLINE void
             change_sign += (pivots[i] != (i+1));
         }
 
-        memcpy(sign,
-               (change_sign % 2)?
-                   &@cblas_type@_minus_one :
-                   &@cblas_type@_one
-               , sizeof(*sign));
-        @TYPE@_slogdet_from_factored_diagonal(src, m, sign, logdet);
+        *sign = (change_sign % 2)?numeric_limits<typ>::minus_one:numeric_limits<typ>::one;
+        slogdet_from_factored_diagonal(src, m, sign, logdet);
     } else {
         /*
           if getrf fails, use 0 as sign and -inf as logdet
         */
-        memcpy(sign, &@cblas_type@_zero, sizeof(*sign));
-        memcpy(logdet, &@cblas_type@_ninf, sizeof(*logdet));
+        *sign = numeric_limits<typ>::zero;
+        *logdet = numeric_limits<basetyp>::ninf;
     }
 }
 
+template<typename typ, typename basetyp>
 static void
-@TYPE@_slogdet(char **args,
+slogdet(char **args,
                npy_intp const *dimensions,
                npy_intp const *steps,
                void *NPY_UNUSED(func))
@@ -1123,7 +1159,7 @@ static void
     INIT_OUTER_LOOP_3
     m = (fortran_int) dimensions[0];
     safe_m = m;
-    matrix_size = safe_m * safe_m * sizeof(@typ@);
+    matrix_size = safe_m * safe_m * sizeof(typ);
     pivot_size = safe_m * sizeof(fortran_int);
     tmp_buff = (npy_uint8 *)malloc(matrix_size + pivot_size);
 
@@ -1132,20 +1168,21 @@ static void
         /* swapped steps to get matrix in FORTRAN order */
         init_linearize_data(&lin_data, m, m, steps[1], steps[0]);
         BEGIN_OUTER_LOOP_3
-            linearize_@TYPE@_matrix(tmp_buff, args[0], &lin_data);
-            @TYPE@_slogdet_single_element(m,
-                                          (void*)tmp_buff,
+            linearize_matrix((typ*)tmp_buff, (typ*)args[0], &lin_data);
+            slogdet_single_element(m,
+                                   (typ*)tmp_buff,
                                           (fortran_int*)(tmp_buff+matrix_size),
-                                          (@typ@*)args[1],
-                                          (@basetyp@*)args[2]);
+                                          (typ*)args[1],
+                                          (basetyp*)args[2]);
         END_OUTER_LOOP
 
         free(tmp_buff);
     }
 }
 
+template<typename typ, typename basetyp>
 static void
-@TYPE@_det(char **args,
+det(char **args,
            npy_intp const *dimensions,
            npy_intp const *steps,
            void *NPY_UNUSED(func))
@@ -1166,42 +1203,42 @@ static void
     INIT_OUTER_LOOP_2
     m = (fortran_int) dimensions[0];
     safe_m = m;
-    matrix_size = safe_m * safe_m * sizeof(@typ@);
+    matrix_size = safe_m * safe_m * sizeof(typ);
     pivot_size = safe_m * sizeof(fortran_int);
     tmp_buff = (npy_uint8 *)malloc(matrix_size + pivot_size);
 
     if (tmp_buff) {
         LINEARIZE_DATA_t lin_data;
-        @typ@ sign;
-        @basetyp@ logdet;
+        typ sign;
+        basetyp logdet;
         /* swapped steps to get matrix in FORTRAN order */
         init_linearize_data(&lin_data, m, m, steps[1], steps[0]);
 
         BEGIN_OUTER_LOOP_2
-            linearize_@TYPE@_matrix(tmp_buff, args[0], &lin_data);
-            @TYPE@_slogdet_single_element(m,
-                                          (void*)tmp_buff,
+            linearize_matrix((typ*)tmp_buff, (typ*)args[0], &lin_data);
+            slogdet_single_element(m,
+                                         (typ*)tmp_buff,
                                           (fortran_int*)(tmp_buff + matrix_size),
                                           &sign,
                                           &logdet);
-            *(@typ@ *)args[1] = @TYPE@_det_from_slogdet(sign, logdet);
+            *(typ *)args[1] = det_from_slogdet(sign, logdet);
         END_OUTER_LOOP
 
         free(tmp_buff);
     }
 }
-/**end repeat**/
 
 
 /* -------------------------------------------------------------------------- */
                           /* Eigh family */
 
-typedef struct eigh_params_struct {
-    void *A;     /* matrix */
-    void *W;     /* eigenvalue vector */
-    void *WORK;  /* main work buffer */
-    void *RWORK; /* secondary work buffer (for complex versions) */
-    void *IWORK;
+template<typename typ>
+struct EIGH_PARAMS_t {
+    typ *A;     /* matrix */
+    basetype_t<typ> *W;     /* eigenvalue vector */
+    typ *WORK;  /* main work buffer */
+    basetype_t<typ> *RWORK; /* secondary work buffer (for complex versions) */
+    fortran_int *IWORK;
     fortran_int N;
     fortran_int LWORK;
     fortran_int LRWORK;
@@ -1209,20 +1246,24 @@ typedef struct eigh_params_struct {
     char JOBZ;
     char UPLO;
     fortran_int LDA;
-} EIGH_PARAMS_t;
+} ;
 
-/**begin repeat
-   #TYPE = FLOAT, DOUBLE#
-   #typ = npy_float, npy_double#
-   #ftyp = fortran_real, fortran_doublereal#
-   #lapack_func = ssyevd, dsyevd#
-*/
-
-static NPY_INLINE fortran_int
-call_@lapack_func@(EIGH_PARAMS_t *params)
+static inline fortran_int
+call_evd(EIGH_PARAMS_t<npy_float> *params)
+{
+    fortran_int rv;
+    LAPACK(ssyevd)(&params->JOBZ, &params->UPLO, &params->N,
+                          params->A, &params->LDA, params->W,
+                          params->WORK, &params->LWORK,
+                          params->IWORK, &params->LIWORK,
+                          &rv);
+    return rv;
+}
+static inline fortran_int
+call_evd(EIGH_PARAMS_t<npy_double> *params)
 {
     fortran_int rv;
-    LAPACK(@lapack_func@)(&params->JOBZ, &params->UPLO, &params->N,
+    LAPACK(dsyevd)(&params->JOBZ, &params->UPLO, &params->N,
                           params->A, &params->LDA, params->W,
                           params->WORK, &params->LWORK,
                           params->IWORK, &params->LIWORK,
@@ -1230,13 +1271,15 @@ call_@lapack_func@(EIGH_PARAMS_t *params)
     return rv;
 }
 
+
 /*
  * Initialize the parameters to use in for the lapack function _syevd
  * Handles buffer allocation
  */
-static NPY_INLINE int
-init_@lapack_func@(EIGH_PARAMS_t* params, char JOBZ, char UPLO,
-                   fortran_int N)
+template<typename typ>
+static inline int
+init_evd(EIGH_PARAMS_t<typ>* params, char JOBZ, char UPLO,
+                   fortran_int N, scalar_trait)
 {
     npy_uint8 *mem_buff = NULL;
     npy_uint8 *mem_buff2 = NULL;
@@ -1244,19 +1287,19 @@ init_@lapack_func@(EIGH_PARAMS_t* params, char JOBZ, char UPLO,
     fortran_int liwork;
     npy_uint8 *a, *w, *work, *iwork;
     size_t safe_N = N;
-    size_t alloc_size = safe_N * (safe_N + 1) * sizeof(@typ@);
+    size_t alloc_size = safe_N * (safe_N + 1) * sizeof(typ);
     fortran_int lda = fortran_int_max(N, 1);
 
-    mem_buff = malloc(alloc_size);
+    mem_buff = (npy_uint8 *)malloc(alloc_size);
 
     if (!mem_buff) {
         goto error;
     }
     a = mem_buff;
-    w = mem_buff + safe_N * safe_N * sizeof(@typ@);
+    w = mem_buff + safe_N * safe_N * sizeof(typ);
 
-    params->A = a;
-    params->W = w;
+    params->A = (typ*)a;
+    params->W = (typ*)w;
     params->RWORK = NULL; /* unused */
     params->N = N;
     params->LRWORK = 0; /* unused */
@@ -1266,7 +1309,7 @@ init_@lapack_func@(EIGH_PARAMS_t* params, char JOBZ, char UPLO,
 
     /* Work size query */
     {
-        @typ@ query_work_size;
+        typ query_work_size;
         fortran_int query_iwork_size;
 
         params->LWORK = -1;
@@ -1274,7 +1317,7 @@ init_@lapack_func@(EIGH_PARAMS_t* params, char JOBZ, char UPLO,
         params->WORK = &query_work_size;
         params->IWORK = &query_iwork_size;
 
-        if (call_@lapack_func@(params) != 0) {
+        if (call_evd(params) != 0) {
             goto error;
         }
 
@@ -1282,18 +1325,18 @@ init_@lapack_func@(EIGH_PARAMS_t* params, char JOBZ, char UPLO,
         liwork = query_iwork_size;
     }
 
-    mem_buff2 = malloc(lwork*sizeof(@typ@) + liwork*sizeof(fortran_int));
+    mem_buff2 = (npy_uint8 *)malloc(lwork*sizeof(typ) + liwork*sizeof(fortran_int));
     if (!mem_buff2) {
         goto error;
     }
 
     work = mem_buff2;
-    iwork = mem_buff2 + lwork*sizeof(@typ@);
+    iwork = mem_buff2 + lwork*sizeof(typ);
 
     params->LWORK = lwork;
-    params->WORK = work;
+    params->WORK = (typ*)work;
     params->LIWORK = liwork;
-    params->IWORK = iwork;
+    params->IWORK = (fortran_int*)iwork;
 
     return 1;
 
@@ -1305,40 +1348,44 @@ init_@lapack_func@(EIGH_PARAMS_t* params, char JOBZ, char UPLO,
 
     return 0;
 }
-/**end repeat**/
 
 
-/**begin repeat
-   #TYPE = CFLOAT, CDOUBLE#
-   #typ = npy_cfloat, npy_cdouble#
-   #basetyp = npy_float, npy_double#
-   #ftyp = fortran_complex, fortran_doublecomplex#
-   #fbasetyp = fortran_real, fortran_doublereal#
-   #lapack_func = cheevd, zheevd#
-*/
-static NPY_INLINE fortran_int
-call_@lapack_func@(EIGH_PARAMS_t *params)
+static inline fortran_int
+call_evd(EIGH_PARAMS_t<npy_cfloat> *params)
 {
     fortran_int rv;
-    LAPACK(@lapack_func@)(&params->JOBZ, &params->UPLO, &params->N,
-                          params->A, &params->LDA, params->W,
-                          params->WORK, &params->LWORK,
+    LAPACK(cheevd)(&params->JOBZ, &params->UPLO, &params->N,
+                          (fortran_type_t<npy_cfloat>*)params->A, &params->LDA, params->W,
+                          (fortran_type_t<npy_cfloat>*)params->WORK, &params->LWORK,
                           params->RWORK, &params->LRWORK,
                           params->IWORK, &params->LIWORK,
                           &rv);
     return rv;
 }
 
-/*
- * Initialize the parameters to use in for the lapack function _heev
- * Handles buffer allocation
- */
-static NPY_INLINE int
-init_@lapack_func@(EIGH_PARAMS_t *params,
+static inline fortran_int
+call_evd(EIGH_PARAMS_t<npy_cdouble> *params)
+{
+    fortran_int rv;
+    LAPACK(zheevd)(&params->JOBZ, &params->UPLO, &params->N,
+                          (fortran_type_t<npy_cdouble>*)params->A, &params->LDA, params->W,
+                          (fortran_type_t<npy_cdouble>*)params->WORK, &params->LWORK,
+                          params->RWORK, &params->LRWORK,
+                          params->IWORK, &params->LIWORK,
+                          &rv);
+    return rv;
+}
+
+template<typename typ>
+static inline int
+init_evd(EIGH_PARAMS_t<typ> *params,
                    char JOBZ,
                    char UPLO,
-                   fortran_int N)
+                   fortran_int N, complex_trait)
 {
+    using basetyp = basetype_t<typ>;
+using ftyp = fortran_type_t<typ>;
+using fbasetyp = fortran_type_t<basetyp>;
     npy_uint8 *mem_buff = NULL;
     npy_uint8 *mem_buff2 = NULL;
     fortran_int lwork;
@@ -1348,16 +1395,16 @@ init_@lapack_func@(EIGH_PARAMS_t *params,
     size_t safe_N = N;
     fortran_int lda = fortran_int_max(N, 1);
 
-    mem_buff = malloc(safe_N * safe_N * sizeof(@typ@) +
-                      safe_N * sizeof(@basetyp@));
+    mem_buff = (npy_uint8 *)malloc(safe_N * safe_N * sizeof(typ) +
+                      safe_N * sizeof(basetyp));
     if (!mem_buff) {
         goto error;
     }
     a = mem_buff;
-    w = mem_buff + safe_N * safe_N * sizeof(@typ@);
+    w = mem_buff + safe_N * safe_N * sizeof(typ);
 
-    params->A = a;
-    params->W = w;
+    params->A = (typ*)a;
+    params->W = (basetyp*)w;
     params->N = N;
     params->JOBZ = JOBZ;
     params->UPLO = UPLO;
@@ -1365,40 +1412,40 @@ init_@lapack_func@(EIGH_PARAMS_t *params,
 
     /* Work size query */
     {
-        @ftyp@ query_work_size;
-        @fbasetyp@ query_rwork_size;
+        ftyp query_work_size;
+        fbasetyp query_rwork_size;
         fortran_int query_iwork_size;
 
         params->LWORK = -1;
         params->LRWORK = -1;
         params->LIWORK = -1;
-        params->WORK = &query_work_size;
-        params->RWORK = &query_rwork_size;
+        params->WORK = (typ*)&query_work_size;
+        params->RWORK = (basetyp*)&query_rwork_size;
         params->IWORK = &query_iwork_size;
 
-        if (call_@lapack_func@(params) != 0) {
+        if (call_evd(params) != 0) {
             goto error;
         }
 
-        lwork = (fortran_int)*(@fbasetyp@*)&query_work_size;
+        lwork = (fortran_int)*(fbasetyp*)&query_work_size;
         lrwork = (fortran_int)query_rwork_size;
         liwork = query_iwork_size;
     }
 
-    mem_buff2 = malloc(lwork*sizeof(@typ@) +
-                       lrwork*sizeof(@basetyp@) +
+    mem_buff2 = (npy_uint8 *)malloc(lwork*sizeof(typ) +
+                       lrwork*sizeof(basetyp) +
                        liwork*sizeof(fortran_int));
     if (!mem_buff2) {
         goto error;
     }
 
     work = mem_buff2;
-    rwork = work + lwork*sizeof(@typ@);
-    iwork = rwork + lrwork*sizeof(@basetyp@);
+    rwork = work + lwork*sizeof(typ);
+    iwork = rwork + lrwork*sizeof(basetyp);
 
-    params->WORK = work;
-    params->RWORK = rwork;
-    params->IWORK = iwork;
+    params->WORK = (typ*)work;
+    params->RWORK = (basetyp*)rwork;
+    params->IWORK = (fortran_int*)iwork;
     params->LWORK = lwork;
     params->LRWORK = lrwork;
     params->LIWORK = liwork;
@@ -1413,16 +1460,7 @@ error:
 
     return 0;
 }
-/**end repeat**/
 
-
-/**begin repeat
-   #TYPE = FLOAT, DOUBLE, CFLOAT, CDOUBLE#
-   #BASETYPE = FLOAT, DOUBLE, FLOAT, DOUBLE#
-   #typ = npy_float, npy_double, npy_cfloat, npy_cdouble#
-   #basetyp = npy_float, npy_double, npy_float, npy_double#
-   #lapack_func = ssyevd, dsyevd, cheevd, zheevd#
-**/
 /*
  * (M, M)->(M,)(M, M)
  * dimensions[1] -> M
@@ -1431,8 +1469,9 @@ error:
  * args[2] -> A[out]
  */
 
-static NPY_INLINE void
-release_@lapack_func@(EIGH_PARAMS_t *params)
+template<typename typ>
+static inline void
+release_evd(EIGH_PARAMS_t<typ> *params)
 {
     /* allocated memory in A and WORK */
     free(params->A);
@@ -1441,18 +1480,20 @@ release_@lapack_func@(EIGH_PARAMS_t *params)
 }
 
 
-static NPY_INLINE void
-@TYPE@_eigh_wrapper(char JOBZ,
+template<typename typ>
+static inline void
+eigh_wrapper(char JOBZ,
                     char UPLO,
                     char**args,
                     npy_intp const *dimensions,
                     npy_intp const *steps)
 {
+    using basetyp = basetype_t<typ>;
     ptrdiff_t outer_steps[3];
     size_t iter;
     size_t outer_dim = *dimensions++;
     size_t op_count = (JOBZ=='N')?2:3;
-    EIGH_PARAMS_t eigh_params;
+    EIGH_PARAMS_t<typ> eigh_params;
     int error_occurred = get_fp_invalid_and_clear();
 
     for (iter = 0; iter < op_count; ++iter) {
@@ -1460,10 +1501,10 @@ static NPY_INLINE void
     }
     steps += op_count;
 
-    if (init_@lapack_func@(&eigh_params,
+    if (init_evd(&eigh_params,
                            JOBZ,
                            UPLO,
-                           (fortran_int)dimensions[0])) {
+                           (fortran_int)dimensions[0], dispatch_scalar<typ>())) {
         LINEARIZE_DATA_t matrix_in_ld;
         LINEARIZE_DATA_t eigenvectors_out_ld;
         LINEARIZE_DATA_t eigenvalues_out_ld;
@@ -1483,106 +1524,98 @@ static NPY_INLINE void
         for (iter = 0; iter < outer_dim; ++iter) {
             int not_ok;
             /* copy the matrix in */
-            linearize_@TYPE@_matrix(eigh_params.A, args[0], &matrix_in_ld);
-            not_ok = call_@lapack_func@(&eigh_params);
+            linearize_matrix((typ*)eigh_params.A, (typ*)args[0], &matrix_in_ld);
+            not_ok = call_evd(&eigh_params);
             if (!not_ok) {
                 /* lapack ok, copy result out */
-                delinearize_@BASETYPE@_matrix(args[1],
-                                              eigh_params.W,
+                delinearize_matrix((basetyp*)args[1],
+                                              (basetyp*)eigh_params.W,
                                               &eigenvalues_out_ld);
 
                 if ('V' == eigh_params.JOBZ) {
-                    delinearize_@TYPE@_matrix(args[2],
-                                              eigh_params.A,
+                    delinearize_matrix((typ*)args[2],
+                                              (typ*)eigh_params.A,
                                               &eigenvectors_out_ld);
                 }
             } else {
                 /* lapack fail, set result to nan */
                 error_occurred = 1;
-                nan_@BASETYPE@_matrix(args[1], &eigenvalues_out_ld);
+                nan_matrix((basetyp*)args[1], &eigenvalues_out_ld);
                 if ('V' == eigh_params.JOBZ) {
-                    nan_@TYPE@_matrix(args[2], &eigenvectors_out_ld);
+                    nan_matrix((typ*)args[2], &eigenvectors_out_ld);
                 }
             }
             update_pointers((npy_uint8**)args, outer_steps, op_count);
         }
 
-        release_@lapack_func@(&eigh_params);
+        release_evd(&eigh_params);
     }
 
     set_fp_invalid_or_clear(error_occurred);
 }
-/**end repeat**/
 
 
-/**begin repeat
-   #TYPE = FLOAT, DOUBLE, CFLOAT, CDOUBLE#
- */
+template<typename typ>
 static void
-@TYPE@_eighlo(char **args,
+eighlo(char **args,
               npy_intp const *dimensions,
               npy_intp const *steps,
               void *NPY_UNUSED(func))
 {
-    @TYPE@_eigh_wrapper('V', 'L', args, dimensions, steps);
+    eigh_wrapper<typ>('V', 'L', args, dimensions, steps);
 }
 
+template<typename typ>
 static void
-@TYPE@_eighup(char **args,
+eighup(char **args,
               npy_intp const *dimensions,
               npy_intp const *steps,
               void* NPY_UNUSED(func))
 {
-    @TYPE@_eigh_wrapper('V', 'U', args, dimensions, steps);
+    eigh_wrapper<typ>('V', 'U', args, dimensions, steps);
 }
 
+template<typename typ>
 static void
-@TYPE@_eigvalshlo(char **args,
+eigvalshlo(char **args,
                   npy_intp const *dimensions,
                   npy_intp const *steps,
                   void* NPY_UNUSED(func))
 {
-    @TYPE@_eigh_wrapper('N', 'L', args, dimensions, steps);
+    eigh_wrapper<typ>('N', 'L', args, dimensions, steps);
 }
 
+template<typename typ>
 static void
-@TYPE@_eigvalshup(char **args,
+eigvalshup(char **args,
                   npy_intp const *dimensions,
                   npy_intp const *steps,
                   void* NPY_UNUSED(func))
 {
-    @TYPE@_eigh_wrapper('N', 'U', args, dimensions, steps);
+    eigh_wrapper<typ>('N', 'U', args, dimensions, steps);
 }
-/**end repeat**/
 
 /* -------------------------------------------------------------------------- */
                   /* Solve family (includes inv) */
 
-typedef struct gesv_params_struct
+template<typename typ>
+struct GESV_PARAMS_t
 {
-    void *A; /* A is (N, N) of base type */
-    void *B; /* B is (N, NRHS) of base type */
+    typ *A; /* A is (N, N) of base type */
+    typ *B; /* B is (N, NRHS) of base type */
     fortran_int * IPIV; /* IPIV is (N) */
 
     fortran_int N;
     fortran_int NRHS;
     fortran_int LDA;
     fortran_int LDB;
-} GESV_PARAMS_t;
-
-/**begin repeat
-   #TYPE = FLOAT, DOUBLE, CFLOAT, CDOUBLE#
-   #typ = npy_float, npy_double, npy_cfloat, npy_cdouble#
-   #ftyp = fortran_real, fortran_doublereal,
-           fortran_complex, fortran_doublecomplex#
-   #lapack_func = sgesv, dgesv, cgesv, zgesv#
-*/
+};
 
-static NPY_INLINE fortran_int
-call_@lapack_func@(GESV_PARAMS_t *params)
+static inline fortran_int
+call_gesv(GESV_PARAMS_t<fortran_real> *params)
 {
     fortran_int rv;
-    LAPACK(@lapack_func@)(&params->N, &params->NRHS,
+    LAPACK(sgesv)(&params->N, &params->NRHS,
                           params->A, &params->LDA,
                           params->IPIV,
                           params->B, &params->LDB,
@@ -1590,30 +1623,68 @@ call_@lapack_func@(GESV_PARAMS_t *params)
     return rv;
 }
 
+static inline fortran_int
+call_gesv(GESV_PARAMS_t<fortran_doublereal> *params)
+{
+    fortran_int rv;
+    LAPACK(dgesv)(&params->N, &params->NRHS,
+                          params->A, &params->LDA,
+                          params->IPIV,
+                          params->B, &params->LDB,
+                          &rv);
+    return rv;
+}
+
+static inline fortran_int
+call_gesv(GESV_PARAMS_t<fortran_complex> *params)
+{
+    fortran_int rv;
+    LAPACK(cgesv)(&params->N, &params->NRHS,
+                          params->A, &params->LDA,
+                          params->IPIV,
+                          params->B, &params->LDB,
+                          &rv);
+    return rv;
+}
+
+static inline fortran_int
+call_gesv(GESV_PARAMS_t<fortran_doublecomplex> *params)
+{
+    fortran_int rv;
+    LAPACK(zgesv)(&params->N, &params->NRHS,
+                          params->A, &params->LDA,
+                          params->IPIV,
+                          params->B, &params->LDB,
+                          &rv);
+    return rv;
+}
+
+
 /*
  * Initialize the parameters to use in for the lapack function _heev
  * Handles buffer allocation
  */
-static NPY_INLINE int
-init_@lapack_func@(GESV_PARAMS_t *params, fortran_int N, fortran_int NRHS)
+template<typename ftyp>
+static inline int
+init_gesv(GESV_PARAMS_t<ftyp> *params, fortran_int N, fortran_int NRHS)
 {
     npy_uint8 *mem_buff = NULL;
     npy_uint8 *a, *b, *ipiv;
     size_t safe_N = N;
     size_t safe_NRHS = NRHS;
     fortran_int ld = fortran_int_max(N, 1);
-    mem_buff = malloc(safe_N * safe_N * sizeof(@ftyp@) +
-                      safe_N * safe_NRHS*sizeof(@ftyp@) +
+    mem_buff = (npy_uint8 *)malloc(safe_N * safe_N * sizeof(ftyp) +
+                      safe_N * safe_NRHS*sizeof(ftyp) +
                       safe_N * sizeof(fortran_int));
     if (!mem_buff) {
         goto error;
     }
     a = mem_buff;
-    b = a + safe_N * safe_N * sizeof(@ftyp@);
-    ipiv = b + safe_N * safe_NRHS * sizeof(@ftyp@);
+    b = a + safe_N * safe_N * sizeof(ftyp);
+    ipiv = b + safe_N * safe_NRHS * sizeof(ftyp);
 
-    params->A = a;
-    params->B = b;
+    params->A = (ftyp*)a;
+    params->B = (ftyp*)b;
     params->IPIV = (fortran_int*)ipiv;
     params->N = N;
     params->NRHS = NRHS;
@@ -1628,26 +1699,29 @@ init_@lapack_func@(GESV_PARAMS_t *params, fortran_int N, fortran_int NRHS)
     return 0;
 }
 
-static NPY_INLINE void
-release_@lapack_func@(GESV_PARAMS_t *params)
+template<typename ftyp>
+static inline void
+release_gesv(GESV_PARAMS_t<ftyp> *params)
 {
     /* memory block base is in A */
     free(params->A);
     memset(params, 0, sizeof(*params));
 }
 
+template<typename typ>
 static void
-@TYPE@_solve(char **args, npy_intp const *dimensions, npy_intp const *steps,
+solve(char **args, npy_intp const *dimensions, npy_intp const *steps,
              void *NPY_UNUSED(func))
 {
-    GESV_PARAMS_t params;
+using ftyp = fortran_type_t<typ>;
+    GESV_PARAMS_t<ftyp> params;
     fortran_int n, nrhs;
     int error_occurred = get_fp_invalid_and_clear();
     INIT_OUTER_LOOP_3
 
     n = (fortran_int)dimensions[0];
     nrhs = (fortran_int)dimensions[1];
-    if (init_@lapack_func@(&params, n, nrhs)) {
+    if (init_gesv(&params, n, nrhs)) {
         LINEARIZE_DATA_t a_in, b_in, r_out;
 
         init_linearize_data(&a_in, n, n, steps[1], steps[0]);
@@ -1656,34 +1730,37 @@ static void
 
         BEGIN_OUTER_LOOP_3
             int not_ok;
-            linearize_@TYPE@_matrix(params.A, args[0], &a_in);
-            linearize_@TYPE@_matrix(params.B, args[1], &b_in);
-            not_ok =call_@lapack_func@(&params);
+            linearize_matrix((typ*)params.A, (typ*)args[0], &a_in);
+            linearize_matrix((typ*)params.B, (typ*)args[1], &b_in);
+            not_ok =call_gesv(&params);
             if (!not_ok) {
-                delinearize_@TYPE@_matrix(args[2], params.B, &r_out);
+                delinearize_matrix((typ*)args[2], (typ*)params.B, &r_out);
             } else {
                 error_occurred = 1;
-                nan_@TYPE@_matrix(args[2], &r_out);
+                nan_matrix((typ*)args[2], &r_out);
             }
         END_OUTER_LOOP
 
-        release_@lapack_func@(&params);
+        release_gesv(&params);
     }
 
     set_fp_invalid_or_clear(error_occurred);
 }
 
+
+template<typename typ>
 static void
-@TYPE@_solve1(char **args, npy_intp const *dimensions, npy_intp const *steps,
+solve1(char **args, npy_intp const *dimensions, npy_intp const *steps,
               void *NPY_UNUSED(func))
 {
-    GESV_PARAMS_t params;
+using ftyp = fortran_type_t<typ>;
+    GESV_PARAMS_t<ftyp> params;
     int error_occurred = get_fp_invalid_and_clear();
     fortran_int n;
     INIT_OUTER_LOOP_3
 
     n = (fortran_int)dimensions[0];
-    if (init_@lapack_func@(&params, n, 1)) {
+    if (init_gesv(&params, n, 1)) {
         LINEARIZE_DATA_t a_in, b_in, r_out;
         init_linearize_data(&a_in, n, n, steps[1], steps[0]);
         init_linearize_data(&b_in, 1, n, 1, steps[2]);
@@ -1691,105 +1768,130 @@ static void
 
         BEGIN_OUTER_LOOP_3
             int not_ok;
-            linearize_@TYPE@_matrix(params.A, args[0], &a_in);
-            linearize_@TYPE@_matrix(params.B, args[1], &b_in);
-            not_ok = call_@lapack_func@(&params);
+            linearize_matrix((typ*)params.A, (typ*)args[0], &a_in);
+            linearize_matrix((typ*)params.B, (typ*)args[1], &b_in);
+            not_ok = call_gesv(&params);
             if (!not_ok) {
-                delinearize_@TYPE@_matrix(args[2], params.B, &r_out);
+                delinearize_matrix((typ*)args[2], (typ*)params.B, &r_out);
             } else {
                 error_occurred = 1;
-                nan_@TYPE@_matrix(args[2], &r_out);
+                nan_matrix((typ*)args[2], &r_out);
             }
         END_OUTER_LOOP
 
-        release_@lapack_func@(&params);
+        release_gesv(&params);
     }
 
     set_fp_invalid_or_clear(error_occurred);
 }
 
+template<typename typ>
 static void
-@TYPE@_inv(char **args, npy_intp const *dimensions, npy_intp const *steps,
+inv(char **args, npy_intp const *dimensions, npy_intp const *steps,
            void *NPY_UNUSED(func))
 {
-    GESV_PARAMS_t params;
+using ftyp = fortran_type_t<typ>;
+    GESV_PARAMS_t<ftyp> params;
     fortran_int n;
     int error_occurred = get_fp_invalid_and_clear();
     INIT_OUTER_LOOP_2
 
     n = (fortran_int)dimensions[0];
-    if (init_@lapack_func@(&params, n, n)) {
+    if (init_gesv(&params, n, n)) {
         LINEARIZE_DATA_t a_in, r_out;
         init_linearize_data(&a_in, n, n, steps[1], steps[0]);
         init_linearize_data(&r_out, n, n, steps[3], steps[2]);
 
         BEGIN_OUTER_LOOP_2
             int not_ok;
-            linearize_@TYPE@_matrix(params.A, args[0], &a_in);
-            identity_@TYPE@_matrix(params.B, n);
-            not_ok = call_@lapack_func@(&params);
+            linearize_matrix((typ*)params.A, (typ*)args[0], &a_in);
+            identity_matrix((typ*)params.B, n);
+            not_ok = call_gesv(&params);
             if (!not_ok) {
-                delinearize_@TYPE@_matrix(args[1], params.B, &r_out);
+                delinearize_matrix((typ*)args[1], (typ*)params.B, &r_out);
             } else {
                 error_occurred = 1;
-                nan_@TYPE@_matrix(args[1], &r_out);
+                nan_matrix((typ*)args[1], &r_out);
             }
         END_OUTER_LOOP
 
-        release_@lapack_func@(&params);
+        release_gesv(&params);
     }
 
     set_fp_invalid_or_clear(error_occurred);
 }
 
-/**end repeat**/
-
 
 /* -------------------------------------------------------------------------- */
                      /* Cholesky decomposition */
 
-typedef struct potr_params_struct
+template<typename typ>
+struct POTR_PARAMS_t
 {
-    void *A;
+    typ *A;
     fortran_int N;
     fortran_int LDA;
     char UPLO;
-} POTR_PARAMS_t;
+};
 
-/**begin repeat
 
-   #TYPE = FLOAT, DOUBLE, CFLOAT, CDOUBLE#
-   #ftyp = fortran_real, fortran_doublereal,
-           fortran_complex, fortran_doublecomplex#
-   #lapack_func = spotrf, dpotrf, cpotrf, zpotrf#
- */
+static inline fortran_int
+call_potrf(POTR_PARAMS_t<fortran_real> *params)
+{
+    fortran_int rv;
+    LAPACK(spotrf)(&params->UPLO,
+                          &params->N, params->A, &params->LDA,
+                          &rv);
+    return rv;
+}
+
+static inline fortran_int
+call_potrf(POTR_PARAMS_t<fortran_doublereal> *params)
+{
+    fortran_int rv;
+    LAPACK(dpotrf)(&params->UPLO,
+                          &params->N, params->A, &params->LDA,
+                          &rv);
+    return rv;
+}
+
+static inline fortran_int
+call_potrf(POTR_PARAMS_t<fortran_complex> *params)
+{
+    fortran_int rv;
+    LAPACK(cpotrf)(&params->UPLO,
+                          &params->N, params->A, &params->LDA,
+                          &rv);
+    return rv;
+}
 
-static NPY_INLINE fortran_int
-call_@lapack_func@(POTR_PARAMS_t *params)
+static inline fortran_int
+call_potrf(POTR_PARAMS_t<fortran_doublecomplex> *params)
 {
     fortran_int rv;
-    LAPACK(@lapack_func@)(&params->UPLO,
+    LAPACK(zpotrf)(&params->UPLO,
                           &params->N, params->A, &params->LDA,
                           &rv);
     return rv;
 }
 
-static NPY_INLINE int
-init_@lapack_func@(POTR_PARAMS_t *params, char UPLO, fortran_int N)
+template<typename ftyp>
+static inline int
+init_potrf(POTR_PARAMS_t<ftyp> *params, char UPLO, fortran_int N)
 {
     npy_uint8 *mem_buff = NULL;
     npy_uint8 *a;
     size_t safe_N = N;
     fortran_int lda = fortran_int_max(N, 1);
 
-    mem_buff = malloc(safe_N * safe_N * sizeof(@ftyp@));
+    mem_buff = (npy_uint8 *)malloc(safe_N * safe_N * sizeof(ftyp));
     if (!mem_buff) {
         goto error;
     }
 
     a = mem_buff;
 
-    params->A = a;
+    params->A = (ftyp*)a;
     params->N = N;
     params->LDA = lda;
     params->UPLO = UPLO;
@@ -1802,18 +1904,21 @@ init_@lapack_func@(POTR_PARAMS_t *params, char UPLO, fortran_int N)
     return 0;
 }
 
-static NPY_INLINE void
-release_@lapack_func@(POTR_PARAMS_t *params)
+template<typename ftyp>
+static inline void
+release_potrf(POTR_PARAMS_t<ftyp> *params)
 {
     /* memory block base in A */
     free(params->A);
     memset(params, 0, sizeof(*params));
 }
 
+template<typename typ>
 static void
-@TYPE@_cholesky(char uplo, char **args, npy_intp const *dimensions, npy_intp const *steps)
+cholesky(char uplo, char **args, npy_intp const *dimensions, npy_intp const *steps)
 {
-    POTR_PARAMS_t params;
+    using ftyp = fortran_type_t<typ>;
+    POTR_PARAMS_t<ftyp> params;
     int error_occurred = get_fp_invalid_and_clear();
     fortran_int n;
     INIT_OUTER_LOOP_2
@@ -1821,50 +1926,50 @@ static void
     assert(uplo == 'L');
 
     n = (fortran_int)dimensions[0];
-    if (init_@lapack_func@(&params, uplo, n)) {
+    if (init_potrf(&params, uplo, n)) {
         LINEARIZE_DATA_t a_in, r_out;
         init_linearize_data(&a_in, n, n, steps[1], steps[0]);
         init_linearize_data(&r_out, n, n, steps[3], steps[2]);
         BEGIN_OUTER_LOOP_2
             int not_ok;
-            linearize_@TYPE@_matrix(params.A, args[0], &a_in);
-            not_ok = call_@lapack_func@(&params);
+            linearize_matrix(params.A, (ftyp*)args[0], &a_in);
+            not_ok = call_potrf(&params);
             if (!not_ok) {
-                triu_@TYPE@_matrix(params.A, params.N);
-                delinearize_@TYPE@_matrix(args[1], params.A, &r_out);
+                triu_matrix(params.A, params.N);
+                delinearize_matrix((ftyp*)args[1], params.A, &r_out);
             } else {
                 error_occurred = 1;
-                nan_@TYPE@_matrix(args[1], &r_out);
+                nan_matrix((ftyp*)args[1], &r_out);
             }
         END_OUTER_LOOP
-        release_@lapack_func@(&params);
+        release_potrf(&params);
     }
 
     set_fp_invalid_or_clear(error_occurred);
 }
 
+template<typename typ>
 static void
-@TYPE@_cholesky_lo(char **args, npy_intp const *dimensions, npy_intp const *steps,
+cholesky_lo(char **args, npy_intp const *dimensions, npy_intp const *steps,
                 void *NPY_UNUSED(func))
 {
-    @TYPE@_cholesky('L', args, dimensions, steps);
+    cholesky<typ>('L', args, dimensions, steps);
 }
 
-/**end repeat**/
-
 /* -------------------------------------------------------------------------- */
                           /* eig family  */
 
-typedef struct geev_params_struct {
-    void *A;
-    void *WR; /* RWORK in complex versions, REAL W buffer for (sd)geev*/
-    void *WI;
-    void *VLR; /* REAL VL buffers for _geev where _ is s, d */
-    void *VRR; /* REAL VR buffers for _geev where _ is s, d */
-    void *WORK;
-    void *W;  /* final w */
-    void *VL; /* final vl */
-    void *VR; /* final vr */
+template<typename typ>
+struct GEEV_PARAMS_t {
+    typ *A;
+    basetype_t<typ> *WR; /* RWORK in complex versions, REAL W buffer for (sd)geev*/
+    typ *WI;
+    typ *VLR; /* REAL VL buffers for _geev where _ is s, d */
+    typ *VRR; /* REAL VR buffers for _geev where _ is s, d */
+    typ *WORK;
+    typ *W;  /* final w */
+    typ *VL; /* final vl */
+    typ *VR; /* final vr */
 
     fortran_int N;
     fortran_int LDA;
@@ -1874,10 +1979,11 @@ typedef struct geev_params_struct {
 
     char JOBVL;
     char JOBVR;
-} GEEV_PARAMS_t;
+};
 
-static NPY_INLINE void
-dump_geev_params(const char *name, GEEV_PARAMS_t* params)
+template<typename typ>
+static inline void
+dump_geev_params(const char *name, GEEV_PARAMS_t<typ>* params)
 {
     TRACE_TXT("\n%s\n"
 
@@ -1922,20 +2028,25 @@ dump_geev_params(const char *name, GEEV_PARAMS_t* params)
               "JOBVR", params->JOBVR);
 }
 
-/**begin repeat
-   #TYPE = FLOAT, DOUBLE#
-   #CTYPE = CFLOAT, CDOUBLE#
-   #typ = float, double#
-   #complextyp = COMPLEX_t, DOUBLECOMPLEX_t#
-   #lapack_func = sgeev, dgeev#
-   #zero = 0.0f, 0.0#
-*/
+static inline fortran_int
+call_geev(GEEV_PARAMS_t<float>* params)
+{
+    fortran_int rv;
+    LAPACK(sgeev)(&params->JOBVL, &params->JOBVR,
+                          &params->N, params->A, &params->LDA,
+                          params->WR, params->WI,
+                          params->VLR, &params->LDVL,
+                          params->VRR, &params->LDVR,
+                          params->WORK, &params->LWORK,
+                          &rv);
+    return rv;
+}
 
-static NPY_INLINE fortran_int
-call_@lapack_func@(GEEV_PARAMS_t* params)
+static inline fortran_int
+call_geev(GEEV_PARAMS_t<double>* params)
 {
     fortran_int rv;
-    LAPACK(@lapack_func@)(&params->JOBVL, &params->JOBVR,
+    LAPACK(dgeev)(&params->JOBVL, &params->JOBVR,
                           &params->N, params->A, &params->LDA,
                           params->WR, params->WI,
                           params->VLR, &params->LDVL,
@@ -1945,18 +2056,21 @@ call_@lapack_func@(GEEV_PARAMS_t* params)
     return rv;
 }
 
-static NPY_INLINE int
-init_@lapack_func@(GEEV_PARAMS_t *params, char jobvl, char jobvr, fortran_int n)
+
+template<typename typ>
+static inline int
+init_geev(GEEV_PARAMS_t<typ> *params, char jobvl, char jobvr, fortran_int n,
+scalar_trait)
 {
     npy_uint8 *mem_buff = NULL;
     npy_uint8 *mem_buff2 = NULL;
     npy_uint8 *a, *wr, *wi, *vlr, *vrr, *work, *w, *vl, *vr;
     size_t safe_n = n;
-    size_t a_size = safe_n * safe_n * sizeof(@typ@);
-    size_t wr_size = safe_n * sizeof(@typ@);
-    size_t wi_size = safe_n * sizeof(@typ@);
-    size_t vlr_size = jobvl=='V' ? safe_n * safe_n * sizeof(@typ@) : 0;
-    size_t vrr_size = jobvr=='V' ? safe_n * safe_n * sizeof(@typ@) : 0;
+    size_t a_size = safe_n * safe_n * sizeof(typ);
+    size_t wr_size = safe_n * sizeof(typ);
+    size_t wi_size = safe_n * sizeof(typ);
+    size_t vlr_size = jobvl=='V' ? safe_n * safe_n * sizeof(typ) : 0;
+    size_t vrr_size = jobvr=='V' ? safe_n * safe_n * sizeof(typ) : 0;
     size_t w_size = wr_size*2;
     size_t vl_size = vlr_size*2;
     size_t vr_size = vrr_size*2;
@@ -1964,7 +2078,7 @@ init_@lapack_func@(GEEV_PARAMS_t *params, char jobvl, char jobvr, fortran_int n)
     fortran_int ld = fortran_int_max(n, 1);
 
     /* allocate data for known sizes (all but work) */
-    mem_buff = malloc(a_size + wr_size + wi_size +
+    mem_buff = (npy_uint8 *)malloc(a_size + wr_size + wi_size +
                       vlr_size + vrr_size +
                       w_size + vl_size + vr_size);
     if (!mem_buff) {
@@ -1980,14 +2094,14 @@ init_@lapack_func@(GEEV_PARAMS_t *params, char jobvl, char jobvr, fortran_int n)
     vl = w + w_size;
     vr = vl + vl_size;
 
-    params->A = a;
-    params->WR = wr;
-    params->WI = wi;
-    params->VLR = vlr;
-    params->VRR = vrr;
-    params->W = w;
-    params->VL = vl;
-    params->VR = vr;
+    params->A = (typ*)a;
+    params->WR = (typ*)wr;
+    params->WI = (typ*)wi;
+    params->VLR = (typ*)vlr;
+    params->VRR = (typ*)vrr;
+    params->W = (typ*)w;
+    params->VL = (typ*)vl;
+    params->VR = (typ*)vr;
     params->N = n;
     params->LDA = ld;
     params->LDVL = ld;
@@ -1997,26 +2111,26 @@ init_@lapack_func@(GEEV_PARAMS_t *params, char jobvl, char jobvr, fortran_int n)
 
     /* Work size query */
     {
-        @typ@ work_size_query;
+        typ work_size_query;
 
         params->LWORK = -1;
         params->WORK = &work_size_query;
 
-        if (call_@lapack_func@(params) != 0) {
+        if (call_geev(params) != 0) {
             goto error;
         }
 
         work_count = (size_t)work_size_query;
     }
 
-    mem_buff2 = malloc(work_count*sizeof(@typ@));
+    mem_buff2 = (npy_uint8 *)malloc(work_count*sizeof(typ));
     if (!mem_buff2) {
         goto error;
     }
     work = mem_buff2;
 
     params->LWORK = (fortran_int)work_count;
-    params->WORK = work;
+    params->WORK = (typ*)work;
 
     return 1;
  error:
@@ -2027,42 +2141,45 @@ init_@lapack_func@(GEEV_PARAMS_t *params, char jobvl, char jobvr, fortran_int n)
     return 0;
 }
 
-static NPY_INLINE void
-mk_@TYPE@_complex_array_from_real(@complextyp@ *c, const @typ@ *re, size_t n)
+template<typename complextyp, typename typ>
+static inline void
+mk_complex_array_from_real(complextyp *c, const typ *re, size_t n)
 {
     size_t iter;
     for (iter = 0; iter < n; ++iter) {
-        c[iter].array[0] = re[iter];
-        c[iter].array[1] = @zero@;
+        c[iter].r = re[iter];
+        c[iter].i = numeric_limits<typ>::zero;
     }
 }
 
-static NPY_INLINE void
-mk_@TYPE@_complex_array(@complextyp@ *c,
-                        const @typ@ *re,
-                        const @typ@ *im,
+template<typename complextyp, typename typ>
+static inline void
+mk_complex_array(complextyp *c,
+                        const typ *re,
+                        const typ *im,
                         size_t n)
 {
     size_t iter;
     for (iter = 0; iter < n; ++iter) {
-        c[iter].array[0] = re[iter];
-        c[iter].array[1] = im[iter];
+        c[iter].r = re[iter];
+        c[iter].i = im[iter];
     }
 }
 
-static NPY_INLINE void
-mk_@TYPE@_complex_array_conjugate_pair(@complextyp@ *c,
-                                       const @typ@ *r,
+template<typename complextyp, typename typ>
+static inline void
+mk_complex_array_conjugate_pair(complextyp *c,
+                                       const typ *r,
                                        size_t n)
 {
     size_t iter;
     for (iter = 0; iter < n; ++iter) {
-        @typ@ re = r[iter];
-        @typ@ im = r[iter+n];
-        c[iter].array[0] = re;
-        c[iter].array[1] = im;
-        c[iter+n].array[0] = re;
-        c[iter+n].array[1] = -im;
+        typ re = r[iter];
+        typ im = r[iter+n];
+        c[iter].r = re;
+        c[iter].i = im;
+        c[iter+n].r = re;
+        c[iter+n].i = -im;
     }
 }
 
@@ -2073,24 +2190,25 @@ mk_@TYPE@_complex_array_conjugate_pair(@complextyp@ *c,
  * i is the eigenvalue imaginary part produced by sgeev/zgeev
  * n is so that the order of the matrix is n by n
  */
-static NPY_INLINE void
-mk_@lapack_func@_complex_eigenvectors(@complextyp@ *c,
-                                      const @typ@ *r,
-                                      const @typ@ *i,
+template<typename complextyp, typename typ>
+static inline void
+mk_geev_complex_eigenvectors(complextyp *c,
+                                      const typ *r,
+                                      const typ *i,
                                       size_t n)
 {
     size_t iter = 0;
     while (iter < n)
     {
-        if (i[iter] ==  @zero@) {
+        if (i[iter] ==  numeric_limits<typ>::zero) {
             /* eigenvalue was real, eigenvectors as well...  */
-            mk_@TYPE@_complex_array_from_real(c, r, n);
+            mk_complex_array_from_real(c, r, n);
             c += n;
             r += n;
             iter ++;
         } else {
             /* eigenvalue was complex, generate a pair of eigenvectors */
-            mk_@TYPE@_complex_array_conjugate_pair(c, r, n);
+            mk_complex_array_conjugate_pair(c, r, n);
             c += 2*n;
             r += 2*n;
             iter += 2;
@@ -2099,43 +2217,49 @@ mk_@lapack_func@_complex_eigenvectors(@complextyp@ *c,
 }
 
 
-static NPY_INLINE void
-process_@lapack_func@_results(GEEV_PARAMS_t *params)
+template<typename complextyp, typename typ>
+static inline void
+process_geev_results(GEEV_PARAMS_t<typ> *params, scalar_trait)
 {
     /* REAL versions of geev need the results to be translated
      * into complex versions. This is the way to deal with imaginary
      * results. In our gufuncs we will always return complex arrays!
      */
-    mk_@TYPE@_complex_array(params->W, params->WR, params->WI, params->N);
+    mk_complex_array((complextyp*)params->W, (typ*)params->WR, (typ*)params->WI, params->N);
 
     /* handle the eigenvectors */
     if ('V' == params->JOBVL) {
-        mk_@lapack_func@_complex_eigenvectors(params->VL, params->VLR,
-                                              params->WI, params->N);
+        mk_geev_complex_eigenvectors((complextyp*)params->VL, (typ*)params->VLR,
+                                              (typ*)params->WI, params->N);
     }
     if ('V' == params->JOBVR) {
-        mk_@lapack_func@_complex_eigenvectors(params->VR, params->VRR,
-                                              params->WI, params->N);
+        mk_geev_complex_eigenvectors((complextyp*)params->VR, (typ*)params->VRR,
+                                              (typ*)params->WI, params->N);
     }
 }
 
-/**end repeat**/
 
+static inline fortran_int
+call_geev(GEEV_PARAMS_t<fortran_complex>* params)
+{
+    fortran_int rv;
 
-/**begin repeat
-   #TYPE = CFLOAT, CDOUBLE#
-   #typ = COMPLEX_t, DOUBLECOMPLEX_t#
-   #ftyp = fortran_complex, fortran_doublecomplex#
-   #realtyp = float, double#
-   #lapack_func = cgeev, zgeev#
- */
-
-static NPY_INLINE fortran_int
-call_@lapack_func@(GEEV_PARAMS_t* params)
+    LAPACK(cgeev)(&params->JOBVL, &params->JOBVR,
+                          &params->N, params->A, &params->LDA,
+                          params->W,
+                          params->VL, &params->LDVL,
+                          params->VR, &params->LDVR,
+                          params->WORK, &params->LWORK,
+                          params->WR, /* actually RWORK */
+                          &rv);
+    return rv;
+}
+static inline fortran_int
+call_geev(GEEV_PARAMS_t<fortran_doublecomplex>* params)
 {
     fortran_int rv;
 
-    LAPACK(@lapack_func@)(&params->JOBVL, &params->JOBVR,
+    LAPACK(zgeev)(&params->JOBVL, &params->JOBVR,
                           &params->N, params->A, &params->LDA,
                           params->W,
                           params->VL, &params->LDVL,
@@ -2146,26 +2270,28 @@ call_@lapack_func@(GEEV_PARAMS_t* params)
     return rv;
 }
 
-static NPY_INLINE int
-init_@lapack_func@(GEEV_PARAMS_t* params,
+template<typename ftyp>
+static inline int
+init_geev(GEEV_PARAMS_t<ftyp>* params,
                    char jobvl,
                    char jobvr,
-                   fortran_int n)
+                   fortran_int n, complex_trait)
 {
+using realtyp = basetype_t<ftyp>;
     npy_uint8 *mem_buff = NULL;
     npy_uint8 *mem_buff2 = NULL;
     npy_uint8 *a, *w, *vl, *vr, *work, *rwork;
     size_t safe_n = n;
-    size_t a_size = safe_n * safe_n * sizeof(@ftyp@);
-    size_t w_size = safe_n * sizeof(@ftyp@);
-    size_t vl_size = jobvl=='V'? safe_n * safe_n * sizeof(@ftyp@) : 0;
-    size_t vr_size = jobvr=='V'? safe_n * safe_n * sizeof(@ftyp@) : 0;
-    size_t rwork_size = 2 * safe_n * sizeof(@realtyp@);
+    size_t a_size = safe_n * safe_n * sizeof(ftyp);
+    size_t w_size = safe_n * sizeof(ftyp);
+    size_t vl_size = jobvl=='V'? safe_n * safe_n * sizeof(ftyp) : 0;
+    size_t vr_size = jobvr=='V'? safe_n * safe_n * sizeof(ftyp) : 0;
+    size_t rwork_size = 2 * safe_n * sizeof(realtyp);
     size_t work_count = 0;
     size_t total_size = a_size + w_size + vl_size + vr_size + rwork_size;
     fortran_int ld = fortran_int_max(n, 1);
 
-    mem_buff = malloc(total_size);
+    mem_buff = (npy_uint8 *)malloc(total_size);
     if (!mem_buff) {
         goto error;
     }
@@ -2176,14 +2302,14 @@ init_@lapack_func@(GEEV_PARAMS_t* params,
     vr = vl + vl_size;
     rwork = vr + vr_size;
 
-    params->A = a;
-    params->WR = rwork;
+    params->A = (ftyp*)a;
+    params->WR = (realtyp*)rwork;
     params->WI = NULL;
     params->VLR = NULL;
     params->VRR = NULL;
-    params->VL = vl;
-    params->VR = vr;
-    params->W = w;
+    params->VL = (ftyp*)vl;
+    params->VR = (ftyp*)vr;
+    params->W = (ftyp*)w;
     params->N = n;
     params->LDA = ld;
     params->LDVL = ld;
@@ -2193,21 +2319,21 @@ init_@lapack_func@(GEEV_PARAMS_t* params,
 
     /* Work size query */
     {
-        @typ@ work_size_query;
+        ftyp work_size_query;
 
         params->LWORK = -1;
         params->WORK = &work_size_query;
 
-        if (call_@lapack_func@(params) != 0) {
+        if (call_geev(params) != 0) {
             goto error;
         }
 
-        work_count = (size_t) work_size_query.array[0];
+        work_count = (size_t) work_size_query.r;
         /* Fix a bug in lapack 3.0.0 */
         if(work_count == 0) work_count = 1;
     }
 
-    mem_buff2 = malloc(work_count*sizeof(@ftyp@));
+    mem_buff2 = (npy_uint8 *)malloc(work_count*sizeof(ftyp));
     if (!mem_buff2) {
         goto error;
     }
@@ -2215,7 +2341,7 @@ init_@lapack_func@(GEEV_PARAMS_t* params,
     work = mem_buff2;
 
     params->LWORK = (fortran_int)work_count;
-    params->WORK = work;
+    params->WORK = (ftyp*)work;
 
     return 1;
  error:
@@ -2226,32 +2352,27 @@ init_@lapack_func@(GEEV_PARAMS_t* params,
     return 0;
 }
 
-
-static NPY_INLINE void
-process_@lapack_func@_results(GEEV_PARAMS_t *NPY_UNUSED(params))
+template<typename complextyp, typename typ>
+static inline void
+process_geev_results(GEEV_PARAMS_t<typ> *NPY_UNUSED(params), complex_trait)
 {
     /* nothing to do here, complex versions are ready to copy out */
 }
-/**end repeat**/
 
 
-/**begin repeat
-   #TYPE = FLOAT, DOUBLE, CDOUBLE#
-   #COMPLEXTYPE = CFLOAT, CDOUBLE, CDOUBLE#
-   #ftype = fortran_real, fortran_doublereal, fortran_doublecomplex#
-   #lapack_func = sgeev, dgeev, zgeev#
- */
 
-static NPY_INLINE void
-release_@lapack_func@(GEEV_PARAMS_t *params)
+template<typename typ>
+static inline void
+release_geev(GEEV_PARAMS_t<typ> *params)
 {
     free(params->WORK);
     free(params->A);
     memset(params, 0, sizeof(*params));
 }
 
-static NPY_INLINE void
-@TYPE@_eig_wrapper(char JOBVL,
+template<typename fctype, typename ftype>
+static inline void
+eig_wrapper(char JOBVL,
                    char JOBVR,
                    char**args,
                    npy_intp const *dimensions,
@@ -2262,7 +2383,7 @@ static NPY_INLINE void
     size_t outer_dim = *dimensions++;
     size_t op_count = 2;
     int error_occurred = get_fp_invalid_and_clear();
-    GEEV_PARAMS_t geev_params;
+    GEEV_PARAMS_t<ftype> geev_params;
 
     assert(JOBVL == 'N');
 
@@ -2275,9 +2396,9 @@ static NPY_INLINE void
     }
     steps += op_count;
 
-    if (init_@lapack_func@(&geev_params,
+    if (init_geev(&geev_params,
                            JOBVL, JOBVR,
-                           (fortran_int)dimensions[0])) {
+                           (fortran_int)dimensions[0], dispatch_scalar<ftype>())) {
         LINEARIZE_DATA_t a_in;
         LINEARIZE_DATA_t w_out;
         LINEARIZE_DATA_t vl_out;
@@ -2307,78 +2428,81 @@ static NPY_INLINE void
             int not_ok;
             char **arg_iter = args;
             /* copy the matrix in */
-            linearize_@TYPE@_matrix(geev_params.A, *arg_iter++, &a_in);
-            not_ok = call_@lapack_func@(&geev_params);
+            linearize_matrix((ftype*)geev_params.A, (ftype*)*arg_iter++, &a_in);
+            not_ok = call_geev(&geev_params);
 
             if (!not_ok) {
-                process_@lapack_func@_results(&geev_params);
-                delinearize_@COMPLEXTYPE@_matrix(*arg_iter++,
-                                                 geev_params.W,
+                process_geev_results<fctype>(&geev_params,
+dispatch_scalar<ftype>{});
+                delinearize_matrix((fctype*)*arg_iter++,
+                                                 (fctype*)geev_params.W,
                                                  &w_out);
 
                 if ('V' == geev_params.JOBVL) {
-                    delinearize_@COMPLEXTYPE@_matrix(*arg_iter++,
-                                                     geev_params.VL,
+                    delinearize_matrix((fctype*)*arg_iter++,
+                                                     (fctype*)geev_params.VL,
                                                      &vl_out);
                 }
                 if ('V' == geev_params.JOBVR) {
-                    delinearize_@COMPLEXTYPE@_matrix(*arg_iter++,
-                                                     geev_params.VR,
+                    delinearize_matrix((fctype*)*arg_iter++,
+                                                     (fctype*)geev_params.VR,
                                                      &vr_out);
                 }
             } else {
                 /* geev failed */
                 error_occurred = 1;
-                nan_@COMPLEXTYPE@_matrix(*arg_iter++, &w_out);
+                nan_matrix((fctype*)*arg_iter++, &w_out);
                 if ('V' == geev_params.JOBVL) {
-                    nan_@COMPLEXTYPE@_matrix(*arg_iter++, &vl_out);
+                    nan_matrix((fctype*)*arg_iter++, &vl_out);
                 }
                 if ('V' == geev_params.JOBVR) {
-                    nan_@COMPLEXTYPE@_matrix(*arg_iter++, &vr_out);
+                    nan_matrix((fctype*)*arg_iter++, &vr_out);
                 }
             }
             update_pointers((npy_uint8**)args, outer_steps, op_count);
         }
 
-        release_@lapack_func@(&geev_params);
+        release_geev(&geev_params);
     }
 
     set_fp_invalid_or_clear(error_occurred);
 }
 
+template<typename fctype, typename ftype>
 static void
-@TYPE@_eig(char **args,
+eig(char **args,
            npy_intp const *dimensions,
            npy_intp const *steps,
            void *NPY_UNUSED(func))
 {
-    @TYPE@_eig_wrapper('N', 'V', args, dimensions, steps);
+    eig_wrapper<fctype, ftype>('N', 'V', args, dimensions, steps);
 }
 
+template<typename fctype, typename ftype>
 static void
-@TYPE@_eigvals(char **args,
+eigvals(char **args,
                npy_intp const *dimensions,
                npy_intp const *steps,
                void *NPY_UNUSED(func))
 {
-    @TYPE@_eig_wrapper('N', 'N', args, dimensions, steps);
+    eig_wrapper<fctype, ftype>('N', 'N', args, dimensions, steps);
 }
 
-/**end repeat**/
 
 
 /* -------------------------------------------------------------------------- */
                  /* singular value decomposition  */
 
-typedef struct gessd_params_struct
+template<typename ftyp>
+struct GESDD_PARAMS_t
 {
-    void *A;
-    void *S;
-    void *U;
-    void *VT;
-    void *WORK;
-    void *RWORK;
-    void *IWORK;
+    ftyp *A;
+    basetype_t<ftyp> *S;
+    ftyp *U;
+    ftyp *VT;
+    ftyp *WORK;
+    basetype_t<ftyp> *RWORK;
+    fortran_int *IWORK;
 
     fortran_int M;
     fortran_int N;
@@ -2387,12 +2511,13 @@ typedef struct gessd_params_struct
     fortran_int LDVT;
     fortran_int LWORK;
     char JOBZ;
-} GESDD_PARAMS_t;
+} ;
 
 
-static NPY_INLINE void
+template<typename ftyp>
+static inline void
 dump_gesdd_params(const char *name,
-                  GESDD_PARAMS_t *params)
+                  GESDD_PARAMS_t<ftyp> *params)
 {
     TRACE_TXT("\n%s:\n"\
 
@@ -2433,7 +2558,7 @@ dump_gesdd_params(const char *name,
               "JOBZ", ' ', params->JOBZ);
 }
 
-static NPY_INLINE int
+static inline int
 compute_urows_vtcolumns(char jobz,
                         fortran_int m, fortran_int n,
                         fortran_int *urows, fortran_int *vtcolumns)
@@ -2462,43 +2587,51 @@ compute_urows_vtcolumns(char jobz,
     return 1;
 }
 
-
-/**begin repeat
-   #TYPE = FLOAT, DOUBLE#
-   #lapack_func = sgesdd, dgesdd#
-   #ftyp = fortran_real, fortran_doublereal#
- */
-
-static NPY_INLINE fortran_int
-call_@lapack_func@(GESDD_PARAMS_t *params)
+static inline fortran_int
+call_gesdd(GESDD_PARAMS_t<fortran_real> *params)
 {
     fortran_int rv;
-    LAPACK(@lapack_func@)(&params->JOBZ, &params->M, &params->N,
+    LAPACK(sgesdd)(&params->JOBZ, &params->M, &params->N,
                           params->A, &params->LDA,
                           params->S,
                           params->U, &params->LDU,
                           params->VT, &params->LDVT,
                           params->WORK, &params->LWORK,
-                          params->IWORK,
+                          (fortran_int*)params->IWORK,
+                          &rv);
+    return rv;
+}
+static inline fortran_int
+call_gesdd(GESDD_PARAMS_t<fortran_doublereal> *params)
+{
+    fortran_int rv;
+    LAPACK(dgesdd)(&params->JOBZ, &params->M, &params->N,
+                          params->A, &params->LDA,
+                          params->S,
+                          params->U, &params->LDU,
+                          params->VT, &params->LDVT,
+                          params->WORK, &params->LWORK,
+                          (fortran_int*)params->IWORK,
                           &rv);
     return rv;
 }
 
-static NPY_INLINE int
-init_@lapack_func@(GESDD_PARAMS_t *params,
+template<typename ftyp>
+static inline int
+init_gesdd(GESDD_PARAMS_t<ftyp> *params,
                    char jobz,
                    fortran_int m,
-                   fortran_int n)
+                   fortran_int n, scalar_trait)
 {
     npy_uint8 *mem_buff = NULL;
     npy_uint8 *mem_buff2 = NULL;
     npy_uint8 *a, *s, *u, *vt, *work, *iwork;
     size_t safe_m = m;
     size_t safe_n = n;
-    size_t a_size = safe_m * safe_n * sizeof(@ftyp@);
+    size_t a_size = safe_m * safe_n * sizeof(ftyp);
     fortran_int min_m_n = fortran_int_min(m, n);
     size_t safe_min_m_n = min_m_n;
-    size_t s_size = safe_min_m_n * sizeof(@ftyp@);
+    size_t s_size = safe_min_m_n * sizeof(ftyp);
     fortran_int u_row_count, vt_column_count;
     size_t safe_u_row_count, safe_vt_column_count;
     size_t u_size, vt_size;
@@ -2514,10 +2647,10 @@ init_@lapack_func@(GESDD_PARAMS_t *params,
     safe_u_row_count = u_row_count;
     safe_vt_column_count = vt_column_count;
 
-    u_size = safe_u_row_count * safe_m * sizeof(@ftyp@);
-    vt_size = safe_n * safe_vt_column_count * sizeof(@ftyp@);
+    u_size = safe_u_row_count * safe_m * sizeof(ftyp);
+    vt_size = safe_n * safe_vt_column_count * sizeof(ftyp);
 
-    mem_buff = malloc(a_size + s_size + u_size + vt_size + iwork_size);
+    mem_buff = (npy_uint8 *)malloc(a_size + s_size + u_size + vt_size + iwork_size);
 
     if (!mem_buff) {
         goto error;
@@ -2534,12 +2667,12 @@ init_@lapack_func@(GESDD_PARAMS_t *params,
 
     params->M = m;
     params->N = n;
-    params->A = a;
-    params->S = s;
-    params->U = u;
-    params->VT = vt;
+    params->A = (ftyp*)a;
+    params->S = (ftyp*)s;
+    params->U = (ftyp*)u;
+    params->VT = (ftyp*)vt;
     params->RWORK = NULL;
-    params->IWORK = iwork;
+    params->IWORK = (fortran_int*)iwork;
     params->LDA = ld;
     params->LDU = ld;
     params->LDVT = vt_column_count;
@@ -2547,22 +2680,22 @@ init_@lapack_func@(GESDD_PARAMS_t *params,
 
     /* Work size query */
     {
-        @ftyp@ work_size_query;
+        ftyp work_size_query;
 
         params->LWORK = -1;
         params->WORK = &work_size_query;
 
-        if (call_@lapack_func@(params) != 0) {
+        if (call_gesdd(params) != 0) {
             goto error;
         }
 
         work_count = (fortran_int)work_size_query;
         /* Fix a bug in lapack 3.0.0 */
         if(work_count == 0) work_count = 1;
-        work_size = (size_t)work_count * sizeof(@ftyp@);
+        work_size = (size_t)work_count * sizeof(ftyp);
     }
 
-    mem_buff2 = malloc(work_size);
+    mem_buff2 = (npy_uint8 *)malloc(work_size);
     if (!mem_buff2) {
         goto error;
     }
@@ -2570,7 +2703,7 @@ init_@lapack_func@(GESDD_PARAMS_t *params,
     work = mem_buff2;
 
     params->LWORK = work_count;
-    params->WORK = work;
+    params->WORK = (ftyp*)work;
 
     return 1;
  error:
@@ -2582,21 +2715,26 @@ init_@lapack_func@(GESDD_PARAMS_t *params,
     return 0;
 }
 
-/**end repeat**/
-
-/**begin repeat
-   #TYPE = CFLOAT, CDOUBLE#
-   #ftyp = fortran_complex, fortran_doublecomplex#
-   #frealtyp = fortran_real, fortran_doublereal#
-   #typ = COMPLEX_t, DOUBLECOMPLEX_t#
-   #lapack_func = cgesdd, zgesdd#
- */
-
-static NPY_INLINE fortran_int
-call_@lapack_func@(GESDD_PARAMS_t *params)
+static inline fortran_int
+call_gesdd(GESDD_PARAMS_t<fortran_complex> *params)
+{
+    fortran_int rv;
+    LAPACK(cgesdd)(&params->JOBZ, &params->M, &params->N,
+                          params->A, &params->LDA,
+                          params->S,
+                          params->U, &params->LDU,
+                          params->VT, &params->LDVT,
+                          params->WORK, &params->LWORK,
+                          params->RWORK,
+                          params->IWORK,
+                          &rv);
+    return rv;
+}
+static inline fortran_int
+call_gesdd(GESDD_PARAMS_t<fortran_doublecomplex> *params)
 {
     fortran_int rv;
-    LAPACK(@lapack_func@)(&params->JOBZ, &params->M, &params->N,
+    LAPACK(zgesdd)(&params->JOBZ, &params->M, &params->N,
                           params->A, &params->LDA,
                           params->S,
                           params->U, &params->LDU,
@@ -2608,12 +2746,14 @@ call_@lapack_func@(GESDD_PARAMS_t *params)
     return rv;
 }
 
-static NPY_INLINE int
-init_@lapack_func@(GESDD_PARAMS_t *params,
+template<typename ftyp>
+static inline int
+init_gesdd(GESDD_PARAMS_t<ftyp> *params,
                    char jobz,
                    fortran_int m,
-                   fortran_int n)
+                   fortran_int n, complex_trait)
 {
+using frealtyp = basetype_t<ftyp>;
     npy_uint8 *mem_buff = NULL, *mem_buff2 = NULL;
     npy_uint8 *a,*s, *u, *vt, *work, *rwork, *iwork;
     size_t a_size, s_size, u_size, vt_size, work_size, rwork_size, iwork_size;
@@ -2632,17 +2772,17 @@ init_@lapack_func@(GESDD_PARAMS_t *params,
     safe_u_row_count = u_row_count;
     safe_vt_column_count = vt_column_count;
 
-    a_size = safe_m * safe_n * sizeof(@ftyp@);
-    s_size = safe_min_m_n * sizeof(@frealtyp@);
-    u_size = safe_u_row_count * safe_m * sizeof(@ftyp@);
-    vt_size = safe_n * safe_vt_column_count * sizeof(@ftyp@);
+    a_size = safe_m * safe_n * sizeof(ftyp);
+    s_size = safe_min_m_n * sizeof(frealtyp);
+    u_size = safe_u_row_count * safe_m * sizeof(ftyp);
+    vt_size = safe_n * safe_vt_column_count * sizeof(ftyp);
     rwork_size = 'N'==jobz?
         (7 * safe_min_m_n) :
         (5*safe_min_m_n * safe_min_m_n + 5*safe_min_m_n);
-    rwork_size *= sizeof(@ftyp@);
+    rwork_size *= sizeof(ftyp);
     iwork_size = 8 * safe_min_m_n* sizeof(fortran_int);
 
-    mem_buff = malloc(a_size +
+    mem_buff = (npy_uint8 *)malloc(a_size +
                       s_size +
                       u_size +
                       vt_size +
@@ -2662,12 +2802,12 @@ init_@lapack_func@(GESDD_PARAMS_t *params,
     /* fix vt_column_count so that it is a valid lapack parameter (0 is not) */
     vt_column_count = fortran_int_max(1, vt_column_count);
 
-    params->A = a;
-    params->S = s;
-    params->U = u;
-    params->VT = vt;
-    params->RWORK = rwork;
-    params->IWORK = iwork;
+    params->A = (ftyp*)a;
+    params->S = (frealtyp*)s;
+    params->U = (ftyp*)u;
+    params->VT = (ftyp*)vt;
+    params->RWORK = (frealtyp*)rwork;
+    params->IWORK = (fortran_int*)iwork;
     params->M = m;
     params->N = n;
     params->LDA = ld;
@@ -2677,22 +2817,22 @@ init_@lapack_func@(GESDD_PARAMS_t *params,
 
     /* Work size query */
     {
-        @ftyp@ work_size_query;
+        ftyp work_size_query;
 
         params->LWORK = -1;
         params->WORK = &work_size_query;
 
-        if (call_@lapack_func@(params) != 0) {
+        if (call_gesdd(params) != 0) {
             goto error;
         }
 
-        work_count = (fortran_int)((@typ@*)&work_size_query)->array[0];
+        work_count = (fortran_int)(*(frealtyp*)&work_size_query);
         /* Fix a bug in lapack 3.0.0 */
         if(work_count == 0) work_count = 1;
-        work_size = (size_t)work_count * sizeof(@ftyp@);
+        work_size = (size_t)work_count * sizeof(ftyp);
     }
 
-    mem_buff2 = malloc(work_size);
+    mem_buff2 = (npy_uint8 *)malloc(work_size);
     if (!mem_buff2) {
         goto error;
     }
@@ -2700,7 +2840,7 @@ init_@lapack_func@(GESDD_PARAMS_t *params,
     work = mem_buff2;
 
     params->LWORK = work_count;
-    params->WORK = work;
+    params->WORK = (ftyp*)work;
 
     return 1;
  error:
@@ -2711,16 +2851,10 @@ init_@lapack_func@(GESDD_PARAMS_t *params,
 
     return 0;
 }
-/**end repeat**/
 
-
-/**begin repeat
-   #TYPE = FLOAT, DOUBLE, CFLOAT, CDOUBLE#
-   #REALTYPE = FLOAT, DOUBLE, FLOAT, DOUBLE#
-   #lapack_func = sgesdd, dgesdd, cgesdd, zgesdd#
- */
-static NPY_INLINE void
-release_@lapack_func@(GESDD_PARAMS_t* params)
+template<typename typ>
+static inline void
+release_gesdd(GESDD_PARAMS_t<typ>* params)
 {
     /* A and WORK contain allocated blocks */
     free(params->A);
@@ -2728,28 +2862,31 @@ release_@lapack_func@(GESDD_PARAMS_t* params)
     memset(params, 0, sizeof(*params));
 }
 
-static NPY_INLINE void
-@TYPE@_svd_wrapper(char JOBZ,
+template<typename typ>
+static inline void
+svd_wrapper(char JOBZ,
                    char **args,
                    npy_intp const *dimensions,
                    npy_intp const *steps)
 {
+using basetyp = basetype_t<typ>;
     ptrdiff_t outer_steps[4];
     int error_occurred = get_fp_invalid_and_clear();
     size_t iter;
     size_t outer_dim = *dimensions++;
     size_t op_count = (JOBZ=='N')?2:4;
-    GESDD_PARAMS_t params;
+    GESDD_PARAMS_t<typ> params;
 
     for (iter = 0; iter < op_count; ++iter) {
         outer_steps[iter] = (ptrdiff_t) steps[iter];
     }
     steps += op_count;
 
-    if (init_@lapack_func@(&params,
-                           JOBZ,
-                           (fortran_int)dimensions[0],
-                           (fortran_int)dimensions[1])) {
+    if (init_gesdd(&params,
+                   JOBZ,
+                   (fortran_int)dimensions[0],
+                   (fortran_int)dimensions[1],
+dispatch_scalar<typ>())) {
         LINEARIZE_DATA_t a_in, u_out, s_out, v_out;
         fortran_int min_m_n = params.M < params.N ? params.M : params.N;
 
@@ -2780,97 +2917,95 @@ static NPY_INLINE void
         for (iter = 0; iter < outer_dim; ++iter) {
             int not_ok;
             /* copy the matrix in */
-            linearize_@TYPE@_matrix(params.A, args[0], &a_in);
-            not_ok = call_@lapack_func@(&params);
+            linearize_matrix((typ*)params.A, (typ*)args[0], &a_in);
+            not_ok = call_gesdd(&params);
             if (!not_ok) {
                 if ('N' == params.JOBZ) {
-                    delinearize_@REALTYPE@_matrix(args[1], params.S, &s_out);
+                    delinearize_matrix((basetyp*)args[1], (basetyp*)params.S, &s_out);
                 } else {
                     if ('A' == params.JOBZ && min_m_n == 0) {
                         /* Lapack has betrayed us and left these uninitialized,
                          * so produce an identity matrix for whichever of u
                          * and v is not empty.
                          */
-                        identity_@TYPE@_matrix(params.U, params.M);
-                        identity_@TYPE@_matrix(params.VT, params.N);
+                        identity_matrix((typ*)params.U, params.M);
+                        identity_matrix((typ*)params.VT, params.N);
                     }
 
-                    delinearize_@TYPE@_matrix(args[1], params.U, &u_out);
-                    delinearize_@REALTYPE@_matrix(args[2], params.S, &s_out);
-                    delinearize_@TYPE@_matrix(args[3], params.VT, &v_out);
+                    delinearize_matrix((typ*)args[1], (typ*)params.U, &u_out);
+                    delinearize_matrix((basetyp*)args[2], (basetyp*)params.S, &s_out);
+                    delinearize_matrix((typ*)args[3], (typ*)params.VT, &v_out);
                 }
             } else {
                 error_occurred = 1;
                 if ('N' == params.JOBZ) {
-                    nan_@REALTYPE@_matrix(args[1], &s_out);
+                    nan_matrix((basetyp*)args[1], &s_out);
                 } else {
-                    nan_@TYPE@_matrix(args[1], &u_out);
-                    nan_@REALTYPE@_matrix(args[2], &s_out);
-                    nan_@TYPE@_matrix(args[3], &v_out);
+                    nan_matrix((typ*)args[1], &u_out);
+                    nan_matrix((basetyp*)args[2], &s_out);
+                    nan_matrix((typ*)args[3], &v_out);
                 }
             }
             update_pointers((npy_uint8**)args, outer_steps, op_count);
         }
 
-        release_@lapack_func@(&params);
+        release_gesdd(&params);
     }
 
     set_fp_invalid_or_clear(error_occurred);
 }
-/**end repeat*/
 
 
-/* svd gufunc entry points */
-/**begin repeat
-   #TYPE = FLOAT, DOUBLE, CFLOAT, CDOUBLE#
- */
+template<typename typ>
 static void
-@TYPE@_svd_N(char **args,
+svd_N(char **args,
              npy_intp const *dimensions,
              npy_intp const *steps,
              void *NPY_UNUSED(func))
 {
-    @TYPE@_svd_wrapper('N', args, dimensions, steps);
+    svd_wrapper<fortran_type_t<typ>>('N', args, dimensions, steps);
 }
 
+template<typename typ>
 static void
-@TYPE@_svd_S(char **args,
+svd_S(char **args,
              npy_intp const *dimensions,
              npy_intp const *steps,
              void *NPY_UNUSED(func))
 {
-    @TYPE@_svd_wrapper('S', args, dimensions, steps);
+    svd_wrapper<fortran_type_t<typ>>('S', args, dimensions, steps);
 }
 
+template<typename typ>
 static void
-@TYPE@_svd_A(char **args,
+svd_A(char **args,
              npy_intp const *dimensions,
              npy_intp const *steps,
              void *NPY_UNUSED(func))
 {
-    @TYPE@_svd_wrapper('A', args, dimensions, steps);
+    svd_wrapper<fortran_type_t<typ>>('A', args, dimensions, steps);
 }
 
-/**end repeat**/
-
 /* -------------------------------------------------------------------------- */
                  /* qr (modes - r, raw) */
 
-typedef struct geqfr_params_struct
+template<typename typ>
+struct GEQRF_PARAMS_t
 {
     fortran_int M;
     fortran_int N;
-    void *A;
+    typ *A;
     fortran_int LDA;
-    void* TAU;
-    void *WORK;
+    typ* TAU;
+    typ *WORK;
     fortran_int LWORK;
-} GEQRF_PARAMS_t;
+};
 
 
+template<typename typ>
 static inline void
 dump_geqrf_params(const char *name,
-                  GEQRF_PARAMS_t *params)
+                  GEQRF_PARAMS_t<typ> *params)
 {
     TRACE_TXT("\n%s:\n"\
 
@@ -2894,15 +3029,22 @@ dump_geqrf_params(const char *name,
               "LWORK", (int)params->LWORK);
 }
 
-/**begin repeat
-   #lapack_func=dgeqrf,zgeqrf#
- */
-
 static inline fortran_int
-call_@lapack_func@(GEQRF_PARAMS_t *params)
+call_geqrf(GEQRF_PARAMS_t<double> *params)
+{
+    fortran_int rv;
+    LAPACK(dgeqrf)(&params->M, &params->N,
+                          params->A, &params->LDA,
+                          params->TAU,
+                          params->WORK, &params->LWORK,
+                          &rv);
+    return rv;
+}
+static inline fortran_int
+call_geqrf(GEQRF_PARAMS_t<f2c_doublecomplex> *params)
 {
     fortran_int rv;
-    LAPACK(@lapack_func@)(&params->M, &params->N,
+    LAPACK(zgeqrf)(&params->M, &params->N,
                           params->A, &params->LDA,
                           params->TAU,
                           params->WORK, &params->LWORK,
@@ -2910,18 +3052,13 @@ call_@lapack_func@(GEQRF_PARAMS_t *params)
     return rv;
 }
 
-/**end repeat**/
 
-/**begin repeat
-   #TYPE=DOUBLE#
-   #lapack_func=dgeqrf#
-   #ftyp=fortran_doublereal#
- */
 static inline int
-init_@lapack_func@(GEQRF_PARAMS_t *params,
+init_geqrf(GEQRF_PARAMS_t<fortran_doublereal> *params,
                    fortran_int m,
                    fortran_int n)
 {
+using ftyp = fortran_doublereal;
     npy_uint8 *mem_buff = NULL;
     npy_uint8 *mem_buff2 = NULL;
     npy_uint8 *a, *tau, *work;
@@ -2930,14 +3067,14 @@ init_@lapack_func@(GEQRF_PARAMS_t *params,
     size_t safe_m = m;
     size_t safe_n = n;
 
-    size_t a_size = safe_m * safe_n * sizeof(@ftyp@);
-    size_t tau_size = safe_min_m_n * sizeof(@ftyp@);
+    size_t a_size = safe_m * safe_n * sizeof(ftyp);
+    size_t tau_size = safe_min_m_n * sizeof(ftyp);
 
     fortran_int work_count;
     size_t work_size;
     fortran_int lda = fortran_int_max(1, m);
 
-    mem_buff = malloc(a_size + tau_size);
+    mem_buff = (npy_uint8 *)malloc(a_size + tau_size);
 
     if (!mem_buff)
         goto error;
@@ -2949,35 +3086,35 @@ init_@lapack_func@(GEQRF_PARAMS_t *params,
 
     params->M = m;
     params->N = n;
-    params->A = a;
-    params->TAU = tau;
+    params->A = (ftyp*)a;
+    params->TAU = (ftyp*)tau;
     params->LDA = lda;
 
     {
         /* compute optimal work size */
 
-        @ftyp@ work_size_query;
+        ftyp work_size_query;
 
         params->WORK = &work_size_query;
         params->LWORK = -1;
 
-        if (call_@lapack_func@(params) != 0)
+        if (call_geqrf(params) != 0)
             goto error;
 
-        work_count = (fortran_int) *(@ftyp@*) params->WORK;
+        work_count = (fortran_int) *(ftyp*) params->WORK;
 
     }
 
     params->LWORK = fortran_int_max(fortran_int_max(1, n), work_count);
 
-    work_size = (size_t) params->LWORK * sizeof(@ftyp@);
-    mem_buff2 = malloc(work_size);
+    work_size = (size_t) params->LWORK * sizeof(ftyp);
+    mem_buff2 = (npy_uint8 *)malloc(work_size);
     if (!mem_buff2)
         goto error;
 
     work = mem_buff2;
 
-    params->WORK = work;
+    params->WORK = (ftyp*)work;
 
     return 1;
  error:
@@ -2989,18 +3126,13 @@ init_@lapack_func@(GEQRF_PARAMS_t *params,
     return 0;
 }
 
-/**end repeat**/
 
-/**begin repeat
-   #TYPE=CDOUBLE#
-   #lapack_func=zgeqrf#
-   #ftyp=fortran_doublecomplex#
- */
 static inline int
-init_@lapack_func@(GEQRF_PARAMS_t *params,
+init_geqrf(GEQRF_PARAMS_t<fortran_doublecomplex> *params,
                    fortran_int m,
                    fortran_int n)
 {
+using ftyp = fortran_doublecomplex;
     npy_uint8 *mem_buff = NULL;
     npy_uint8 *mem_buff2 = NULL;
     npy_uint8 *a, *tau, *work;
@@ -3009,14 +3141,14 @@ init_@lapack_func@(GEQRF_PARAMS_t *params,
     size_t safe_m = m;
     size_t safe_n = n;
 
-    size_t a_size = safe_m * safe_n * sizeof(@ftyp@);
-    size_t tau_size = safe_min_m_n * sizeof(@ftyp@);
+    size_t a_size = safe_m * safe_n * sizeof(ftyp);
+    size_t tau_size = safe_min_m_n * sizeof(ftyp);
 
     fortran_int work_count;
     size_t work_size;
     fortran_int lda = fortran_int_max(1, m);
 
-    mem_buff = malloc(a_size + tau_size);
+    mem_buff = (npy_uint8 *)malloc(a_size + tau_size);
 
     if (!mem_buff)
         goto error;
@@ -3028,37 +3160,37 @@ init_@lapack_func@(GEQRF_PARAMS_t *params,
 
     params->M = m;
     params->N = n;
-    params->A = a;
-    params->TAU = tau;
+    params->A = (ftyp*)a;
+    params->TAU = (ftyp*)tau;
     params->LDA = lda;
 
     {
         /* compute optimal work size */
 
-        @ftyp@ work_size_query;
+        ftyp work_size_query;
 
         params->WORK = &work_size_query;
         params->LWORK = -1;
 
-        if (call_@lapack_func@(params) != 0)
+        if (call_geqrf(params) != 0)
             goto error;
 
-        work_count = (fortran_int) ((@ftyp@*)params->WORK)->r;
+        work_count = (fortran_int) ((ftyp*)params->WORK)->r;
 
     }
 
     params->LWORK = fortran_int_max(fortran_int_max(1, n),
                                     work_count);
 
-    work_size = (size_t) params->LWORK * sizeof(@ftyp@);
+    work_size = (size_t) params->LWORK * sizeof(ftyp);
 
-    mem_buff2 = malloc(work_size);
+    mem_buff2 = (npy_uint8 *)malloc(work_size);
     if (!mem_buff2)
         goto error;
 
     work = mem_buff2;
 
-    params->WORK = work;
+    params->WORK = (ftyp*)work;
 
     return 1;
  error:
@@ -3070,13 +3202,10 @@ init_@lapack_func@(GEQRF_PARAMS_t *params,
     return 0;
 }
 
-/**end repeat**/
 
-/**begin repeat
-   #lapack_func=dgeqrf,zgeqrf#
- */
+template<typename ftyp>
 static inline void
-release_@lapack_func@(GEQRF_PARAMS_t* params)
+release_geqrf(GEQRF_PARAMS_t<ftyp>* params)
 {
     /* A and WORK contain allocated blocks */
     free(params->A);
@@ -3084,22 +3213,14 @@ release_@lapack_func@(GEQRF_PARAMS_t* params)
     memset(params, 0, sizeof(*params));
 }
 
-/**end repeat**/
-
-/**begin repeat
-   #TYPE=DOUBLE,CDOUBLE#
-   #REALTYPE=DOUBLE,DOUBLE#
-   #lapack_func=dgeqrf,zgeqrf#
-   #typ     = npy_double,npy_cdouble#
-   #basetyp = npy_double,npy_double#
-   #ftyp = fortran_doublereal,fortran_doublecomplex#
-   #cmplx = 0, 1#
- */
+template<typename typ>
 static void
-@TYPE@_qr_r_raw(char **args, npy_intp const *dimensions, npy_intp const *steps,
+qr_r_raw(char **args, npy_intp const *dimensions, npy_intp const *steps,
           void *NPY_UNUSED(func))
 {
-    GEQRF_PARAMS_t params;
+using ftyp = fortran_type_t<typ>;
+
+    GEQRF_PARAMS_t<ftyp> params;
     int error_occurred = get_fp_invalid_and_clear();
     fortran_int n, m;
 
@@ -3108,7 +3229,7 @@ static void
     m = (fortran_int)dimensions[0];
     n = (fortran_int)dimensions[1];
 
-    if (init_@lapack_func@(&params, m, n)) {
+    if (init_geqrf(&params, m, n)) {
         LINEARIZE_DATA_t a_in, tau_out;
 
         init_linearize_data(&a_in, n, m, steps[1], steps[0]);
@@ -3116,50 +3237,57 @@ static void
 
         BEGIN_OUTER_LOOP_2
             int not_ok;
-            linearize_@TYPE@_matrix(params.A, args[0], &a_in);
-            not_ok = call_@lapack_func@(&params);
+            linearize_matrix((typ*)params.A, (typ*)args[0], &a_in);
+            not_ok = call_geqrf(&params);
             if (!not_ok) {
-                delinearize_@TYPE@_matrix(args[0], params.A, &a_in);
-                delinearize_@TYPE@_matrix(args[1], params.TAU, &tau_out);
+                delinearize_matrix((typ*)args[0], (typ*)params.A, &a_in);
+                delinearize_matrix((typ*)args[1], (typ*)params.TAU, &tau_out);
             } else {
                 error_occurred = 1;
-                nan_@TYPE@_matrix(args[1], &tau_out);
+                nan_matrix((typ*)args[1], &tau_out);
             }
         END_OUTER_LOOP
 
-        release_@lapack_func@(&params);
+        release_geqrf(&params);
     }
 
     set_fp_invalid_or_clear(error_occurred);
 }
 
-/**end repeat**/
 
 /* -------------------------------------------------------------------------- */
                  /* qr common code (modes - reduced and complete) */ 
 
-typedef struct gqr_params_struct
+template<typename typ>
+struct GQR_PARAMS_t
 {
     fortran_int M;
     fortran_int MC;
     fortran_int MN;
     void* A;
-    void *Q;
+    typ *Q;
     fortran_int LDA;
-    void* TAU;
-    void *WORK;
+    typ* TAU;
+    typ *WORK;
     fortran_int LWORK;
-} GQR_PARAMS_t;
-
-/**begin repeat
-   #lapack_func=dorgqr,zungqr#
- */
+} ;
 
 static inline fortran_int
-call_@lapack_func@(GQR_PARAMS_t *params)
+call_gqr(GQR_PARAMS_t<double> *params)
 {
     fortran_int rv;
-    LAPACK(@lapack_func@)(&params->M, &params->MC, &params->MN,
+    LAPACK(dorgqr)(&params->M, &params->MC, &params->MN,
+                          params->Q, &params->LDA,
+                          params->TAU,
+                          params->WORK, &params->LWORK,
+                          &rv);
+    return rv;
+}
+static inline fortran_int
+call_gqr(GQR_PARAMS_t<f2c_doublecomplex> *params)
+{
+    fortran_int rv;
+    LAPACK(zungqr)(&params->M, &params->MC, &params->MN,
                           params->Q, &params->LDA,
                           params->TAU,
                           params->WORK, &params->LWORK,
@@ -3167,18 +3295,13 @@ call_@lapack_func@(GQR_PARAMS_t *params)
     return rv;
 }
 
-/**end repeat**/
-
-/**begin repeat
-   #lapack_func=dorgqr#
-   #ftyp=fortran_doublereal#
- */
 static inline int
-init_@lapack_func@_common(GQR_PARAMS_t *params,
+init_gqr_common(GQR_PARAMS_t<fortran_doublereal> *params,
                           fortran_int m,
                           fortran_int n,
                           fortran_int mc)
 {
+using ftyp = fortran_doublereal;
     npy_uint8 *mem_buff = NULL;
     npy_uint8 *mem_buff2 = NULL;
     npy_uint8 *a, *q, *tau, *work;
@@ -3187,15 +3310,15 @@ init_@lapack_func@_common(GQR_PARAMS_t *params,
     size_t safe_min_m_n = min_m_n;
     size_t safe_m = m;
     size_t safe_n = n;
-    size_t a_size = safe_m * safe_n * sizeof(@ftyp@);
-    size_t q_size = safe_m * safe_mc * sizeof(@ftyp@);
-    size_t tau_size = safe_min_m_n * sizeof(@ftyp@);
+    size_t a_size = safe_m * safe_n * sizeof(ftyp);
+    size_t q_size = safe_m * safe_mc * sizeof(ftyp);
+    size_t tau_size = safe_min_m_n * sizeof(ftyp);
 
     fortran_int work_count;
     size_t work_size;
     fortran_int lda = fortran_int_max(1, m);
 
-    mem_buff = malloc(q_size + tau_size + a_size);
+    mem_buff = (npy_uint8 *)malloc(q_size + tau_size + a_size);
 
     if (!mem_buff)
         goto error;
@@ -3209,35 +3332,35 @@ init_@lapack_func@_common(GQR_PARAMS_t *params,
     params->MC = mc;
     params->MN = min_m_n;
     params->A = a;
-    params->Q = q;
-    params->TAU = tau;
+    params->Q = (ftyp*)q;
+    params->TAU = (ftyp*)tau;
     params->LDA = lda;
 
     {
         /* compute optimal work size */
-        @ftyp@ work_size_query;
+        ftyp work_size_query;
 
         params->WORK = &work_size_query;
         params->LWORK = -1;
 
-        if (call_@lapack_func@(params) != 0)
+        if (call_gqr(params) != 0)
             goto error;
 
-        work_count = (fortran_int) *(@ftyp@*) params->WORK;
+        work_count = (fortran_int) *(ftyp*) params->WORK;
 
     }
 
     params->LWORK = fortran_int_max(fortran_int_max(1, n), work_count);
 
-    work_size = (size_t) params->LWORK * sizeof(@ftyp@);
+    work_size = (size_t) params->LWORK * sizeof(ftyp);
 
-    mem_buff2 = malloc(work_size);
+    mem_buff2 = (npy_uint8 *)malloc(work_size);
     if (!mem_buff2)
         goto error;
 
     work = mem_buff2;
 
-    params->WORK = work;
+    params->WORK = (ftyp*)work;
 
     return 1;
  error:
@@ -3249,18 +3372,14 @@ init_@lapack_func@_common(GQR_PARAMS_t *params,
     return 0;
 }
 
-/**end repeat**/
 
-/**begin repeat
-   #lapack_func=zungqr#
-   #ftyp=fortran_doublecomplex#
- */
 static inline int
-init_@lapack_func@_common(GQR_PARAMS_t *params,
+init_gqr_common(GQR_PARAMS_t<fortran_doublecomplex> *params,
                           fortran_int m,
                           fortran_int n,
                           fortran_int mc)
 {
+using ftyp=fortran_doublecomplex;
     npy_uint8 *mem_buff = NULL;
     npy_uint8 *mem_buff2 = NULL;
     npy_uint8 *a, *q, *tau, *work;
@@ -3270,15 +3389,15 @@ init_@lapack_func@_common(GQR_PARAMS_t *params,
     size_t safe_m = m;
     size_t safe_n = n;
 
-    size_t a_size = safe_m * safe_n * sizeof(@ftyp@);
-    size_t q_size = safe_m * safe_mc * sizeof(@ftyp@);
-    size_t tau_size = safe_min_m_n * sizeof(@ftyp@);
+    size_t a_size = safe_m * safe_n * sizeof(ftyp);
+    size_t q_size = safe_m * safe_mc * sizeof(ftyp);
+    size_t tau_size = safe_min_m_n * sizeof(ftyp);
 
     fortran_int work_count;
     size_t work_size;
     fortran_int lda = fortran_int_max(1, m);
 
-    mem_buff = malloc(q_size + tau_size + a_size);
+    mem_buff = (npy_uint8 *)malloc(q_size + tau_size + a_size);
 
     if (!mem_buff)
         goto error;
@@ -3292,36 +3411,36 @@ init_@lapack_func@_common(GQR_PARAMS_t *params,
     params->MC = mc;
     params->MN = min_m_n;
     params->A = a;
-    params->Q = q;
-    params->TAU = tau;
+    params->Q = (ftyp*)q;
+    params->TAU = (ftyp*)tau;
     params->LDA = lda;
 
     {
         /* compute optimal work size */
-        @ftyp@ work_size_query;
+        ftyp work_size_query;
 
         params->WORK = &work_size_query;
         params->LWORK = -1;
 
-        if (call_@lapack_func@(params) != 0)
+        if (call_gqr(params) != 0)
             goto error;
 
-        work_count = (fortran_int) ((@ftyp@*)params->WORK)->r;
+        work_count = (fortran_int) ((ftyp*)params->WORK)->r;
 
     }
 
     params->LWORK = fortran_int_max(fortran_int_max(1, n),
                                     work_count);
 
-    work_size = (size_t) params->LWORK * sizeof(@ftyp@);
+    work_size = (size_t) params->LWORK * sizeof(ftyp);
 
-    mem_buff2 = malloc(work_size);
+    mem_buff2 = (npy_uint8 *)malloc(work_size);
     if (!mem_buff2)
         goto error;
 
     work = mem_buff2;
 
-    params->WORK = work;
+    params->WORK = (ftyp*)work;
     params->LWORK = work_count;
 
     return 1;
@@ -3334,15 +3453,14 @@ init_@lapack_func@_common(GQR_PARAMS_t *params,
     return 0;
 }
 
-/**end repeat**/
-
 /* -------------------------------------------------------------------------- */
                  /* qr (modes - reduced) */
 
 
+template<typename typ>
 static inline void
 dump_gqr_params(const char *name,
-                GQR_PARAMS_t *params)
+                GQR_PARAMS_t<typ> *params)
 {
     TRACE_TXT("\n%s:\n"\
 
@@ -3368,27 +3486,21 @@ dump_gqr_params(const char *name,
               "LWORK", (int)params->LWORK);
 }
 
-/**begin repeat
-   #lapack_func=dorgqr,zungqr#
-   #ftyp=fortran_doublereal,fortran_doublecomplex#
- */
+template<typename ftyp>
 static inline int
-init_@lapack_func@(GQR_PARAMS_t *params,
+init_gqr(GQR_PARAMS_t<ftyp> *params,
                    fortran_int m,
                    fortran_int n)
 {
-    return init_@lapack_func@_common(
+    return init_gqr_common(
         params, m, n, 
         fortran_int_min(m, n));
 }
 
-/**end repeat**/
 
-/**begin repeat
-   #lapack_func=dorgqr,zungqr#
- */
+template<typename typ>
 static inline void
-release_@lapack_func@(GQR_PARAMS_t* params)
+release_gqr(GQR_PARAMS_t<typ>* params)
 {
     /* A and WORK contain allocated blocks */
     free(params->Q);
@@ -3396,22 +3508,13 @@ release_@lapack_func@(GQR_PARAMS_t* params)
     memset(params, 0, sizeof(*params));
 }
 
-/**end repeat**/
-
-/**begin repeat
-   #TYPE=DOUBLE,CDOUBLE#
-   #REALTYPE=DOUBLE,DOUBLE#
-   #lapack_func=dorgqr,zungqr#
-   #typ     = npy_double, npy_cdouble#
-   #basetyp = npy_double, npy_double#
-   #ftyp = fortran_doublereal,fortran_doublecomplex#
-   #cmplx = 0, 1#
- */
+template<typename typ>
 static void
-@TYPE@_qr_reduced(char **args, npy_intp const *dimensions, npy_intp const *steps,
+qr_reduced(char **args, npy_intp const *dimensions, npy_intp const *steps,
                   void *NPY_UNUSED(func))
 {
-    GQR_PARAMS_t params;
+using ftyp = fortran_type_t<typ>;
+    GQR_PARAMS_t<ftyp> params;
     int error_occurred = get_fp_invalid_and_clear();
     fortran_int n, m;
 
@@ -3420,7 +3523,7 @@ static void
     m = (fortran_int)dimensions[0];
     n = (fortran_int)dimensions[1];
 
-    if (init_@lapack_func@(&params, m, n)) {
+    if (init_gqr(&params, m, n)) {
         LINEARIZE_DATA_t a_in, tau_in, q_out;
 
         init_linearize_data(&a_in, n, m, steps[1], steps[0]);
@@ -3429,57 +3532,44 @@ static void
 
         BEGIN_OUTER_LOOP_3
             int not_ok;
-            linearize_@TYPE@_matrix(params.A, args[0], &a_in);
-            linearize_@TYPE@_matrix(params.Q, args[0], &a_in);
-            linearize_@TYPE@_matrix(params.TAU, args[1], &tau_in);
-            not_ok = call_@lapack_func@(&params);
+            linearize_matrix((typ*)params.A, (typ*)args[0], &a_in);
+            linearize_matrix((typ*)params.Q, (typ*)args[0], &a_in);
+            linearize_matrix((typ*)params.TAU, (typ*)args[1], &tau_in);
+            not_ok = call_gqr(&params);
             if (!not_ok) {
-                delinearize_@TYPE@_matrix(args[2], params.Q, &q_out);
+                delinearize_matrix((typ*)args[2], (typ*)params.Q, &q_out);
             } else {
                 error_occurred = 1;
-                nan_@TYPE@_matrix(args[2], &q_out);
+                nan_matrix((typ*)args[2], &q_out);
             }
         END_OUTER_LOOP
 
-        release_@lapack_func@(&params);
+        release_gqr(&params);
     }
 
     set_fp_invalid_or_clear(error_occurred);
 }
 
-/**end repeat**/
-
 /* -------------------------------------------------------------------------- */
                  /* qr (modes - complete) */
 
-/**begin repeat
-   #lapack_func=dorgqr,zungqr#
-   #ftyp=fortran_doublereal,fortran_doublecomplex#
- */
+template<typename ftyp>
 static inline int
-init_@lapack_func@_complete(GQR_PARAMS_t *params,
+init_gqr_complete(GQR_PARAMS_t<ftyp> *params,
                             fortran_int m,
                             fortran_int n)
 {
-    return init_@lapack_func@_common(params, m, n, m);
+    return init_gqr_common(params, m, n, m);
 }
 
-/**end repeat**/
 
-/**begin repeat
-   #TYPE=DOUBLE,CDOUBLE#
-   #REALTYPE=DOUBLE,DOUBLE#
-   #lapack_func=dorgqr,zungqr#
-   #typ     = npy_double,npy_cdouble#
-   #basetyp = npy_double,npy_double#
-   #ftyp = fortran_doublereal,fortran_doublecomplex#
-   #cmplx = 0, 1#
- */
+template<typename typ>
 static void
-@TYPE@_qr_complete(char **args, npy_intp const *dimensions, npy_intp const *steps,
+qr_complete(char **args, npy_intp const *dimensions, npy_intp const *steps,
                   void *NPY_UNUSED(func))
 {
-    GQR_PARAMS_t params;
+using ftyp = fortran_type_t<typ>;
+    GQR_PARAMS_t<ftyp> params;
     int error_occurred = get_fp_invalid_and_clear();
     fortran_int n, m;
 
@@ -3489,7 +3579,7 @@ static void
     n = (fortran_int)dimensions[1];
 
 
-    if (init_@lapack_func@_complete(&params, m, n)) {
+    if (init_gqr_complete(&params, m, n)) {
         LINEARIZE_DATA_t a_in, tau_in, q_out;
 
         init_linearize_data(&a_in, n, m, steps[1], steps[0]);
@@ -3498,51 +3588,50 @@ static void
 
         BEGIN_OUTER_LOOP_3
             int not_ok;
-            linearize_@TYPE@_matrix(params.A, args[0], &a_in);
-            linearize_@TYPE@_matrix(params.Q, args[0], &a_in);
-            linearize_@TYPE@_matrix(params.TAU, args[1], &tau_in);
-            not_ok = call_@lapack_func@(&params);
+            linearize_matrix((typ*)params.A, (typ*)args[0], &a_in);
+            linearize_matrix((typ*)params.Q, (typ*)args[0], &a_in);
+            linearize_matrix((typ*)params.TAU, (typ*)args[1], &tau_in);
+            not_ok = call_gqr(&params);
             if (!not_ok) {
-                delinearize_@TYPE@_matrix(args[2], params.Q, &q_out);
+                delinearize_matrix((typ*)args[2], (typ*)params.Q, &q_out);
             } else {
                 error_occurred = 1;
-                nan_@TYPE@_matrix(args[2], &q_out);
+                nan_matrix((typ*)args[2], &q_out);
             }
         END_OUTER_LOOP
 
-        release_@lapack_func@(&params);
+        release_gqr(&params);
     }
 
     set_fp_invalid_or_clear(error_occurred);
 }
 
-/**end repeat**/
-
 /* -------------------------------------------------------------------------- */
                  /* least squares */
 
-typedef struct gelsd_params_struct
+template<typename typ>
+struct GELSD_PARAMS_t
 {
     fortran_int M;
     fortran_int N;
     fortran_int NRHS;
-    void *A;
+    typ *A;
     fortran_int LDA;
-    void *B;
+    typ *B;
     fortran_int LDB;
-    void *S;
-    void *RCOND;
+    basetype_t<typ> *S;
+    basetype_t<typ> *RCOND;
     fortran_int RANK;
-    void *WORK;
+    typ *WORK;
     fortran_int LWORK;
-    void *RWORK;
-    void *IWORK;
-} GELSD_PARAMS_t;
-
+    basetype_t<typ> *RWORK;
+    fortran_int *IWORK;
+};
 
+template<typename typ>
 static inline void
 dump_gelsd_params(const char *name,
-                  GELSD_PARAMS_t *params)
+                  GELSD_PARAMS_t<typ> *params)
 {
     TRACE_TXT("\n%s:\n"\
 
@@ -3583,18 +3672,27 @@ dump_gelsd_params(const char *name,
               "RCOND", params->RCOND);
 }
 
+static inline fortran_int
+call_gelsd(GELSD_PARAMS_t<fortran_real> *params)
+{
+    fortran_int rv;
+    LAPACK(sgelsd)(&params->M, &params->N, &params->NRHS,
+                          params->A, &params->LDA,
+                          params->B, &params->LDB,
+                          params->S,
+                          params->RCOND, &params->RANK,
+                          params->WORK, &params->LWORK,
+                          params->IWORK,
+                          &rv);
+    return rv;
+}
 
-/**begin repeat
-   #TYPE=FLOAT,DOUBLE#
-   #lapack_func=sgelsd,dgelsd#
-   #ftyp=fortran_real,fortran_doublereal#
- */
 
 static inline fortran_int
-call_@lapack_func@(GELSD_PARAMS_t *params)
+call_gelsd(GELSD_PARAMS_t<fortran_doublereal> *params)
 {
     fortran_int rv;
-    LAPACK(@lapack_func@)(&params->M, &params->N, &params->NRHS,
+    LAPACK(dgelsd)(&params->M, &params->N, &params->NRHS,
                           params->A, &params->LDA,
                           params->B, &params->LDB,
                           params->S,
@@ -3605,11 +3703,14 @@ call_@lapack_func@(GELSD_PARAMS_t *params)
     return rv;
 }
 
+
+template<typename ftyp>
 static inline int
-init_@lapack_func@(GELSD_PARAMS_t *params,
+init_gelsd(GELSD_PARAMS_t<ftyp> *params,
                    fortran_int m,
                    fortran_int n,
-                   fortran_int nrhs)
+                   fortran_int nrhs,
+scalar_trait)
 {
     npy_uint8 *mem_buff = NULL;
     npy_uint8 *mem_buff2 = NULL;
@@ -3622,9 +3723,9 @@ init_@lapack_func@(GELSD_PARAMS_t *params,
     size_t safe_n = n;
     size_t safe_nrhs = nrhs;
 
-    size_t a_size = safe_m * safe_n * sizeof(@ftyp@);
-    size_t b_size = safe_max_m_n * safe_nrhs * sizeof(@ftyp@);
-    size_t s_size = safe_min_m_n * sizeof(@ftyp@);
+    size_t a_size = safe_m * safe_n * sizeof(ftyp);
+    size_t b_size = safe_max_m_n * safe_nrhs * sizeof(ftyp);
+    size_t s_size = safe_min_m_n * sizeof(ftyp);
 
     fortran_int work_count;
     size_t work_size;
@@ -3632,7 +3733,7 @@ init_@lapack_func@(GELSD_PARAMS_t *params,
     fortran_int lda = fortran_int_max(1, m);
     fortran_int ldb = fortran_int_max(1, fortran_int_max(m,n));
 
-    mem_buff = malloc(a_size + b_size + s_size);
+    mem_buff = (npy_uint8 *)malloc(a_size + b_size + s_size);
 
     if (!mem_buff)
         goto error;
@@ -3645,15 +3746,15 @@ init_@lapack_func@(GELSD_PARAMS_t *params,
     params->M = m;
     params->N = n;
     params->NRHS = nrhs;
-    params->A = a;
-    params->B = b;
-    params->S = s;
+    params->A = (ftyp*)a;
+    params->B = (ftyp*)b;
+    params->S = (ftyp*)s;
     params->LDA = lda;
     params->LDB = ldb;
 
     {
         /* compute optimal work size */
-        @ftyp@ work_size_query;
+        ftyp work_size_query;
         fortran_int iwork_size_query;
 
         params->WORK = &work_size_query;
@@ -3661,25 +3762,25 @@ init_@lapack_func@(GELSD_PARAMS_t *params,
         params->RWORK = NULL;
         params->LWORK = -1;
 
-        if (call_@lapack_func@(params) != 0)
+        if (call_gelsd(params) != 0)
             goto error;
 
         work_count = (fortran_int)work_size_query;
 
-        work_size  = (size_t) work_size_query * sizeof(@ftyp@);
+        work_size  = (size_t) work_size_query * sizeof(ftyp);
         iwork_size = (size_t)iwork_size_query * sizeof(fortran_int);
     }
 
-    mem_buff2 = malloc(work_size + iwork_size);
+    mem_buff2 = (npy_uint8 *)malloc(work_size + iwork_size);
     if (!mem_buff2)
         goto error;
 
     work = mem_buff2;
     iwork = work + work_size;
 
-    params->WORK = work;
+    params->WORK = (ftyp*)work;
     params->RWORK = NULL;
-    params->IWORK = iwork;
+    params->IWORK = (fortran_int*)iwork;
     params->LWORK = work_count;
 
     return 1;
@@ -3692,37 +3793,46 @@ init_@lapack_func@(GELSD_PARAMS_t *params,
     return 0;
 }
 
-/**end repeat**/
-
-/**begin repeat
-   #TYPE=CFLOAT,CDOUBLE#
-   #ftyp=fortran_complex,fortran_doublecomplex#
-   #frealtyp=fortran_real,fortran_doublereal#
-   #typ=COMPLEX_t,DOUBLECOMPLEX_t#
-   #lapack_func=cgelsd,zgelsd#
- */
+static inline fortran_int
+call_gelsd(GELSD_PARAMS_t<fortran_complex> *params)
+{
+    fortran_int rv;
+    LAPACK(cgelsd)(&params->M, &params->N, &params->NRHS,
+                          params->A, &params->LDA,
+                          params->B, &params->LDB,
+                          params->S,
+                          params->RCOND, &params->RANK,
+                          params->WORK, &params->LWORK,
+                          params->RWORK, (fortran_int*)params->IWORK,
+                          &rv);
+    return rv;
+}
 
 static inline fortran_int
-call_@lapack_func@(GELSD_PARAMS_t *params)
+call_gelsd(GELSD_PARAMS_t<fortran_doublecomplex> *params)
 {
     fortran_int rv;
-    LAPACK(@lapack_func@)(&params->M, &params->N, &params->NRHS,
+    LAPACK(zgelsd)(&params->M, &params->N, &params->NRHS,
                           params->A, &params->LDA,
                           params->B, &params->LDB,
                           params->S,
                           params->RCOND, &params->RANK,
                           params->WORK, &params->LWORK,
-                          params->RWORK, params->IWORK,
+                          params->RWORK, (fortran_int*)params->IWORK,
                           &rv);
     return rv;
 }
 
+
+template<typename ftyp>
 static inline int
-init_@lapack_func@(GELSD_PARAMS_t *params,
+init_gelsd(GELSD_PARAMS_t<ftyp> *params,
                    fortran_int m,
                    fortran_int n,
-                   fortran_int nrhs)
+                   fortran_int nrhs,
+complex_trait)
 {
+using frealtyp = basetype_t<ftyp>;
     npy_uint8 *mem_buff = NULL;
     npy_uint8 *mem_buff2 = NULL;
     npy_uint8 *a, *b, *s, *work, *iwork, *rwork;
@@ -3734,16 +3844,16 @@ init_@lapack_func@(GELSD_PARAMS_t *params,
     size_t safe_n = n;
     size_t safe_nrhs = nrhs;
 
-    size_t a_size = safe_m * safe_n * sizeof(@ftyp@);
-    size_t b_size = safe_max_m_n * safe_nrhs * sizeof(@ftyp@);
-    size_t s_size = safe_min_m_n * sizeof(@frealtyp@);
+    size_t a_size = safe_m * safe_n * sizeof(ftyp);
+    size_t b_size = safe_max_m_n * safe_nrhs * sizeof(ftyp);
+    size_t s_size = safe_min_m_n * sizeof(frealtyp);
 
     fortran_int work_count;
     size_t work_size, rwork_size, iwork_size;
     fortran_int lda = fortran_int_max(1, m);
     fortran_int ldb = fortran_int_max(1, fortran_int_max(m,n));
 
-    mem_buff = malloc(a_size + b_size + s_size);
+    mem_buff = (npy_uint8 *)malloc(a_size + b_size + s_size);
 
     if (!mem_buff)
         goto error;
@@ -3756,16 +3866,16 @@ init_@lapack_func@(GELSD_PARAMS_t *params,
     params->M = m;
     params->N = n;
     params->NRHS = nrhs;
-    params->A = a;
-    params->B = b;
-    params->S = s;
+    params->A = (ftyp*)a;
+    params->B = (ftyp*)b;
+    params->S = (frealtyp*)s;
     params->LDA = lda;
     params->LDB = ldb;
 
     {
         /* compute optimal work size */
-        @ftyp@ work_size_query;
-        @frealtyp@ rwork_size_query;
+        ftyp work_size_query;
+        frealtyp rwork_size_query;
         fortran_int iwork_size_query;
 
         params->WORK = &work_size_query;
@@ -3773,17 +3883,17 @@ init_@lapack_func@(GELSD_PARAMS_t *params,
         params->RWORK = &rwork_size_query;
         params->LWORK = -1;
 
-        if (call_@lapack_func@(params) != 0)
+        if (call_gelsd(params) != 0)
             goto error;
 
         work_count = (fortran_int)work_size_query.r;
 
-        work_size  = (size_t )work_size_query.r * sizeof(@ftyp@);
-        rwork_size = (size_t)rwork_size_query * sizeof(@frealtyp@);
+        work_size  = (size_t )work_size_query.r * sizeof(ftyp);
+        rwork_size = (size_t)rwork_size_query * sizeof(frealtyp);
         iwork_size = (size_t)iwork_size_query * sizeof(fortran_int);
     }
 
-    mem_buff2 = malloc(work_size + rwork_size + iwork_size);
+    mem_buff2 = (npy_uint8 *)malloc(work_size + rwork_size + iwork_size);
     if (!mem_buff2)
         goto error;
 
@@ -3791,9 +3901,9 @@ init_@lapack_func@(GELSD_PARAMS_t *params,
     rwork = work + work_size;
     iwork = rwork + rwork_size;
 
-    params->WORK = work;
-    params->RWORK = rwork;
-    params->IWORK = iwork;
+    params->WORK = (ftyp*)work;
+    params->RWORK = (frealtyp*)rwork;
+    params->IWORK = (fortran_int*)iwork;
     params->LWORK = work_count;
 
     return 1;
@@ -3806,22 +3916,9 @@ init_@lapack_func@(GELSD_PARAMS_t *params,
     return 0;
 }
 
-/**end repeat**/
-
-
-/**begin repeat
-   #TYPE=FLOAT,DOUBLE,CFLOAT,CDOUBLE#
-   #REALTYPE=FLOAT,DOUBLE,FLOAT,DOUBLE#
-   #lapack_func=sgelsd,dgelsd,cgelsd,zgelsd#
-   #dot_func=sdot,ddot,cdotc,zdotc#
-   #typ     = npy_float, npy_double, npy_cfloat, npy_cdouble#
-   #basetyp = npy_float, npy_double, npy_float,  npy_double#
-   #ftyp = fortran_real, fortran_doublereal,
-           fortran_complex, fortran_doublecomplex#
-   #cmplx = 0, 0, 1, 1#
- */
+template<typename ftyp>
 static inline void
-release_@lapack_func@(GELSD_PARAMS_t* params)
+release_gelsd(GELSD_PARAMS_t<ftyp>* params)
 {
     /* A and WORK contain allocated blocks */
     free(params->A);
@@ -3830,26 +3927,38 @@ release_@lapack_func@(GELSD_PARAMS_t* params)
 }
 
 /** Compute the squared l2 norm of a contiguous vector */
-static @basetyp@
-@TYPE@_abs2(@typ@ *p, npy_intp n) {
+template<typename typ>
+static basetype_t<typ>
+abs2(typ *p, npy_intp n, scalar_trait) {
     npy_intp i;
-    @basetyp@ res = 0;
+    basetype_t<typ> res = 0;
     for (i = 0; i < n; i++) {
-        @typ@ el = p[i];
-#if @cmplx@
-        res += el.real*el.real + el.imag*el.imag;
-#else
+        typ el = p[i];
         res += el*el;
-#endif
+    }
+    return res;
+}
+template<typename typ>
+static basetype_t<typ>
+abs2(typ *p, npy_intp n, complex_trait) {
+    npy_intp i;
+    basetype_t<typ> res = 0;
+    for (i = 0; i < n; i++) {
+        typ el = p[i];
+        res += el.real*el.real + el.imag*el.imag;
     }
     return res;
 }
 
+
+template<typename typ>
 static void
-@TYPE@_lstsq(char **args, npy_intp const *dimensions, npy_intp const *steps,
+lstsq(char **args, npy_intp const *dimensions, npy_intp const *steps,
              void *NPY_UNUSED(func))
 {
-    GELSD_PARAMS_t params;
+using ftyp = fortran_type_t<typ>;
+using basetyp = basetype_t<typ>;
+    GELSD_PARAMS_t<ftyp> params;
     int error_occurred = get_fp_invalid_and_clear();
     fortran_int n, m, nrhs;
     fortran_int excess;
@@ -3861,7 +3970,7 @@ static void
     nrhs = (fortran_int)dimensions[2];
     excess = m - n;
 
-    if (init_@lapack_func@(&params, m, n, nrhs)) {
+    if (init_gelsd(&params, m, n, nrhs, dispatch_scalar<ftyp>{})) {
         LINEARIZE_DATA_t a_in, b_in, x_out, s_out, r_out;
 
         init_linearize_data(&a_in, n, m, steps[1], steps[0]);
@@ -3872,52 +3981,51 @@ static void
 
         BEGIN_OUTER_LOOP_7
             int not_ok;
-            linearize_@TYPE@_matrix(params.A, args[0], &a_in);
-            linearize_@TYPE@_matrix(params.B, args[1], &b_in);
-            params.RCOND = args[2];
-            not_ok = call_@lapack_func@(&params);
+            linearize_matrix((typ*)params.A, (typ*)args[0], &a_in);
+            linearize_matrix((typ*)params.B, (typ*)args[1], &b_in);
+            params.RCOND = (basetyp*)args[2];
+            not_ok = call_gelsd(&params);
             if (!not_ok) {
-                delinearize_@TYPE@_matrix(args[3], params.B, &x_out);
+                delinearize_matrix((typ*)args[3], (typ*)params.B, &x_out);
                 *(npy_int*) args[5] = params.RANK;
-                delinearize_@REALTYPE@_matrix(args[6], params.S, &s_out);
+                delinearize_matrix((basetyp*)args[6], (basetyp*)params.S, &s_out);
 
                 /* Note that linalg.lstsq discards this when excess == 0 */
                 if (excess >= 0 && params.RANK == n) {
                     /* Compute the residuals as the square sum of each column */
                     int i;
                     char *resid = args[4];
-                    @ftyp@ *components = (@ftyp@ *)params.B + n;
+                    ftyp *components = (ftyp *)params.B + n;
                     for (i = 0; i < nrhs; i++) {
-                        @ftyp@ *vector = components + i*m;
+                        ftyp *vector = components + i*m;
                         /* Numpy and fortran floating types are the same size,
                          * so this cast is safe */
-                        @basetyp@ abs2 = @TYPE@_abs2((@typ@ *)vector, excess);
+                        basetyp abs = abs2((typ *)vector, excess,
+dispatch_scalar<typ>{});
                         memcpy(
                             resid + i*r_out.column_strides,
-                            &abs2, sizeof(abs2));
+                            &abs, sizeof(abs));
                     }
                 }
                 else {
                     /* Note that this is always discarded by linalg.lstsq */
-                    nan_@REALTYPE@_matrix(args[4], &r_out);
+                    nan_matrix((basetyp*)args[4], &r_out);
                 }
             } else {
                 error_occurred = 1;
-                nan_@TYPE@_matrix(args[3], &x_out);
-                nan_@REALTYPE@_matrix(args[4], &r_out);
+                nan_matrix((typ*)args[3], &x_out);
+                nan_matrix((basetyp*)args[4], &r_out);
                 *(npy_int*) args[5] = -1;
-                nan_@REALTYPE@_matrix(args[6], &s_out);
+                nan_matrix((basetyp*)args[6], &s_out);
             }
         END_OUTER_LOOP
 
-        release_@lapack_func@(&params);
+        release_gelsd(&params);
     }
 
     set_fp_invalid_or_clear(error_occurred);
 }
 
-/**end repeat**/
-
 #pragma GCC diagnostic pop
 
 /* -------------------------------------------------------------------------- */
@@ -3962,6 +4070,22 @@ static void *array_of_nulls[] = {
         CFLOAT_ ## NAME,                                \
         CDOUBLE_ ## NAME                                \
     }
+#define GUFUNC_FUNC_ARRAY_REAL_COMPLEX_(NAME)            \
+    static PyUFuncGenericFunction                       \
+    FUNC_ARRAY_NAME(NAME)[] = {                         \
+        NAME<npy_float, npy_float>,                                 \
+        NAME<npy_double, npy_double>,                                \
+        NAME<npy_cfloat, npy_float>,                                \
+        NAME<npy_cdouble, npy_double>                                \
+    }
+#define GUFUNC_FUNC_ARRAY_REAL_COMPLEX__(NAME)            \
+    static PyUFuncGenericFunction                       \
+    FUNC_ARRAY_NAME(NAME)[] = {                         \
+        NAME<npy_float>,                                 \
+        NAME<npy_double>,                                \
+        NAME<npy_cfloat>,                                \
+        NAME<npy_cdouble>                                \
+    }
 
 /* There are problems with eig in complex single precision.
  * That kernel is disabled
@@ -3969,9 +4093,9 @@ static void *array_of_nulls[] = {
 #define GUFUNC_FUNC_ARRAY_EIG(NAME)                     \
     static PyUFuncGenericFunction                       \
     FUNC_ARRAY_NAME(NAME)[] = {                         \
-        FLOAT_ ## NAME,                                 \
-        DOUBLE_ ## NAME,                                \
-        CDOUBLE_ ## NAME                                \
+        NAME<fortran_complex,fortran_real>,                                 \
+        NAME<fortran_doublecomplex,fortran_doublereal>,                                \
+        NAME<fortran_doublecomplex,fortran_doublecomplex>                                \
     }
 
 /* The single precision functions are not used at all,
@@ -3984,25 +4108,31 @@ static void *array_of_nulls[] = {
         DOUBLE_ ## NAME,                                \
         CDOUBLE_ ## NAME                                \
     }
+#define GUFUNC_FUNC_ARRAY_QR__(NAME)                      \
+    static PyUFuncGenericFunction                       \
+    FUNC_ARRAY_NAME(NAME)[] = {                         \
+        NAME<npy_double>,                                \
+        NAME<npy_cdouble>                                \
+    }
 
 
-GUFUNC_FUNC_ARRAY_REAL_COMPLEX(slogdet);
-GUFUNC_FUNC_ARRAY_REAL_COMPLEX(det);
-GUFUNC_FUNC_ARRAY_REAL_COMPLEX(eighlo);
-GUFUNC_FUNC_ARRAY_REAL_COMPLEX(eighup);
-GUFUNC_FUNC_ARRAY_REAL_COMPLEX(eigvalshlo);
-GUFUNC_FUNC_ARRAY_REAL_COMPLEX(eigvalshup);
-GUFUNC_FUNC_ARRAY_REAL_COMPLEX(solve);
-GUFUNC_FUNC_ARRAY_REAL_COMPLEX(solve1);
-GUFUNC_FUNC_ARRAY_REAL_COMPLEX(inv);
-GUFUNC_FUNC_ARRAY_REAL_COMPLEX(cholesky_lo);
-GUFUNC_FUNC_ARRAY_REAL_COMPLEX(svd_N);
-GUFUNC_FUNC_ARRAY_REAL_COMPLEX(svd_S);
-GUFUNC_FUNC_ARRAY_REAL_COMPLEX(svd_A);
-GUFUNC_FUNC_ARRAY_QR(qr_r_raw);
-GUFUNC_FUNC_ARRAY_QR(qr_reduced);
-GUFUNC_FUNC_ARRAY_QR(qr_complete);
-GUFUNC_FUNC_ARRAY_REAL_COMPLEX(lstsq);
+GUFUNC_FUNC_ARRAY_REAL_COMPLEX_(slogdet);
+GUFUNC_FUNC_ARRAY_REAL_COMPLEX_(det);
+GUFUNC_FUNC_ARRAY_REAL_COMPLEX__(eighlo);
+GUFUNC_FUNC_ARRAY_REAL_COMPLEX__(eighup);
+GUFUNC_FUNC_ARRAY_REAL_COMPLEX__(eigvalshlo);
+GUFUNC_FUNC_ARRAY_REAL_COMPLEX__(eigvalshup);
+GUFUNC_FUNC_ARRAY_REAL_COMPLEX__(solve);
+GUFUNC_FUNC_ARRAY_REAL_COMPLEX__(solve1);
+GUFUNC_FUNC_ARRAY_REAL_COMPLEX__(inv);
+GUFUNC_FUNC_ARRAY_REAL_COMPLEX__(cholesky_lo);
+GUFUNC_FUNC_ARRAY_REAL_COMPLEX__(svd_N);
+GUFUNC_FUNC_ARRAY_REAL_COMPLEX__(svd_S);
+GUFUNC_FUNC_ARRAY_REAL_COMPLEX__(svd_A);
+GUFUNC_FUNC_ARRAY_QR__(qr_r_raw);
+GUFUNC_FUNC_ARRAY_QR__(qr_reduced);
+GUFUNC_FUNC_ARRAY_QR__(qr_complete);
+GUFUNC_FUNC_ARRAY_REAL_COMPLEX__(lstsq);
 GUFUNC_FUNC_ARRAY_EIG(eig);
 GUFUNC_FUNC_ARRAY_EIG(eigvals);
 
@@ -4095,9 +4225,9 @@ static char lstsq_types[] = {
 };
 
 typedef struct gufunc_descriptor_struct {
-    char *name;
-    char *signature;
-    char *doc;
+    const char *name;
+    const char *signature;
+    const char *doc;
     int ntypes;
     int nin;
     int nout;
@@ -4402,7 +4532,6 @@ PyMODINIT_FUNC PyInit__umath_linalg(void)
     PyObject *d;
     PyObject *version;
 
-    init_constants();
     m = PyModule_Create(&moduledef);
     if (m == NULL) {
         return NULL;
diff --git a/tools/gitpod/settings.json b/tools/gitpod/settings.json
index ea0775f68..50296336d 100644
--- a/tools/gitpod/settings.json
+++ b/tools/gitpod/settings.json
@@ -1,9 +1,8 @@
 {
-    "restructuredtext.languageServer.disabled": true,
-    "restructuredtext.builtDocumentationPath": "${workspaceRoot}/doc/build/html",
-    "restructuredtext.confPath": "",
     "restructuredtext.updateOnTextChanged": "true",
     "restructuredtext.updateDelay": 300,
-    "restructuredtext.linter.disabled": true,
-    "python.defaultInterpreterPath": "/home/gitpod/mambaforge3/envs/numpy-dev/bin/python"
+    "restructuredtext.linter.disabledLinters": ["doc8","rst-lint", "rstcheck"],
+    "python.defaultInterpreterPath": "/home/gitpod/mambaforge3/envs/numpy-dev/bin/python",
+    "esbonio.sphinx.buildDir": "${workspaceRoot}/doc/build/html",
+    "esbonio.sphinx.confDir": ""
 }
 \ No newline at end of file
author	Bhavuk kalra <bhavukkalra1786@gmail.com>	2022-04-03 23:55:56 +0530
committer	GitHub <noreply@github.com>	2022-04-03 23:55:56 +0530
commit	6aa2fcaecf8e56b61143f34269d04fc55f8cabf1 (patch)
tree	00276e2b56888f48a6e2a6756cd4af52fef46862
parent	65f4baf28a5f8314ea7e3b9059f0f844efa129a0 (diff)
parent	37bbfc9012ea46427143c0aa865d4cc7d8c94b9a (diff)
download	numpy-6aa2fcaecf8e56b61143f34269d04fc55f8cabf1.tar.gz