3 files changed, 839 insertions, 541 deletions
diff --git a/numpy/core/src/multiarray/dragon4.c b/numpy/core/src/multiarray/dragon4.c
index 6706fc495..8f7dc0e2a 100644
--- a/numpy/core/src/multiarray/dragon4.c
+++ b/numpy/core/src/multiarray/dragon4.c
@@ -114,6 +114,17 @@ LogBase2_64(npy_uint64 val)
     return LogBase2_32((npy_uint32)val);
 }
 
+#if defined(HAVE_LDOUBLE_IEEE_QUAD_LE)
+static npy_uint32
+LogBase2_128(npy_uint64 hi, npy_uint64 lo)
+{
+    if (hi) {
+        return 64 + LogBase2_64(hi);
+    }
+
+    return LogBase2_64(lo);
+}
+#endif /* HAVE_LDOUBLE_IEEE_QUAD_LE */
 
 /*
  * Maximum number of 32 bit blocks needed in high precision arithmetic to print
@@ -134,6 +145,45 @@ typedef struct BigInt {
     npy_uint32 blocks[c_BigInt_MaxBlocks];
 } BigInt;
 
+/*
+ * Dummy implementation of a memory manager for BigInts. Currently, only
+ * supports a single call to Dragon4, but that is OK because Dragon4
+ * does not release the GIL.
+ *
+ * We try to raise an error anyway if dragon4 re-enters, and this code serves
+ * as a placeholder if we want to make it re-entrant in the future.
+ *
+ * Each call to dragon4 uses 7 BigInts.
+ */
+#define BIGINT_DRAGON4_GROUPSIZE 7
+typedef struct {
+    BigInt bigints[BIGINT_DRAGON4_GROUPSIZE];
+    char repr[16384];
+} Dragon4_Scratch;
+
+static int _bigint_static_in_use = 0;
+static Dragon4_Scratch _bigint_static;
+
+static Dragon4_Scratch*
+get_dragon4_bigint_scratch(void) {
+    /* this test+set is not threadsafe, but no matter because we have GIL */
+    if (_bigint_static_in_use) {
+        PyErr_SetString(PyExc_RuntimeError,
+            "numpy float printing code is not re-entrant. "
+            "Ping the devs to fix it.");
+        return NULL;
+    }
+    _bigint_static_in_use = 1;
+
+    /* in this dummy implementation we only return the static allocation */
+    return &_bigint_static;
+}
+
+static void
+free_dragon4_bigint_scratch(Dragon4_Scratch *mem){
+    _bigint_static_in_use = 0;
+}
+
 /* Copy integer */
 static void
 BigInt_Copy(BigInt *dst, const BigInt *src)
@@ -165,12 +215,46 @@ BigInt_Set_uint64(BigInt *i, npy_uint64 val)
     }
 }
 
+#if defined(HAVE_LDOUBLE_IEEE_QUAD_LE)
+static void
+BigInt_Set_2x_uint64(BigInt *i, npy_uint64 hi, npy_uint64 lo)
+{
+    if (hi > bitmask_u64(32)) {
+        i->length = 4;
+    }
+    else if (hi != 0) {
+        i->length = 3;
+    }
+    else if (lo > bitmask_u64(32)) {
+        i->length = 2;
+    }
+    else if (lo != 0) {
+        i->length = 1;
+    }
+    else {
+        i->length = 0;
+    }
+
+    /* Note deliberate fallthrough in this switch */
+    switch (i->length) {
+        case 4:
+            i->blocks[3] = (hi >> 32) & bitmask_u64(32);
+        case 3:
+            i->blocks[2] = hi & bitmask_u64(32);
+        case 2:
+            i->blocks[1] = (lo >> 32) & bitmask_u64(32);
+        case 1:
+            i->blocks[0] = lo & bitmask_u64(32);
+    }
+}
+#endif /* QUAD */
+
 static void
 BigInt_Set_uint32(BigInt *i, npy_uint32 val)
 {
     if (val != 0) {
         i->blocks[0] = val;
-        i->length = (val != 0);
+        i->length = 1;
     }
     else {
         i->length = 0;
@@ -178,6 +262,24 @@ BigInt_Set_uint32(BigInt *i, npy_uint32 val)
 }
 
 /*
+ * Returns 1 if the value is zero
+ */
+static int
+BigInt_IsZero(const BigInt *i)
+{
+    return i->length == 0;
+}
+
+/*
+ * Returns 1 if the value is even
+ */
+static int
+BigInt_IsEven(const BigInt *i)
+{
+    return (i->length == 0) || ( (i->blocks[0] % 2) == 0);
+}
+
+/*
  * Returns 0 if (lhs = rhs), negative if (lhs < rhs), positive if (lhs > rhs)
  */
 static npy_int32
@@ -649,13 +751,11 @@ static BigInt g_PowerOf10_Big[] =
 
 /* result = 10^exponent */
 static void
-BigInt_Pow10(BigInt *result, npy_uint32 exponent)
+BigInt_Pow10(BigInt *result, npy_uint32 exponent, BigInt *temp)
 {
-    /* create two temporary values to reduce large integer copy operations */
-    BigInt temp1;
-    BigInt temp2;
-    BigInt *curTemp = &temp1;
-    BigInt *pNextTemp = &temp2;
+    /* use two temporary values to reduce large integer copy operations */
+    BigInt *curTemp = result;
+    BigInt *pNextTemp = temp;
     npy_uint32 smallExponent;
     npy_uint32 tableIdx = 0;
 
@@ -693,19 +793,17 @@ BigInt_Pow10(BigInt *result, npy_uint32 exponent)
     }
 
     /* output the result */
-    BigInt_Copy(result, curTemp);
+    if (curTemp != result) {
+        BigInt_Copy(result, curTemp);
+    }
 }
 
-/* result = in * 10^exponent */
+/* in = in * 10^exponent */
 static void
-BigInt_MultiplyPow10(BigInt *result, const BigInt *in, npy_uint32 exponent)
+BigInt_MultiplyPow10(BigInt *in, npy_uint32 exponent, BigInt *temp)
 {
-
-    /* create two temporary values to reduce large integer copy operations */
-    BigInt temp1;
-    BigInt temp2;
-    BigInt *curTemp = &temp1;
-    BigInt *pNextTemp = &temp2;
+    /* use two temporary values to reduce large integer copy operations */
+    BigInt *curTemp, *pNextTemp;
     npy_uint32 smallExponent;
     npy_uint32 tableIdx = 0;
 
@@ -718,10 +816,13 @@ BigInt_MultiplyPow10(BigInt *result, const BigInt *in, npy_uint32 exponent)
      */
     smallExponent = exponent & bitmask_u32(3);
     if (smallExponent != 0) {
-        BigInt_Multiply_int(curTemp, in, g_PowerOf10_U32[smallExponent]);
+        BigInt_Multiply_int(temp, in, g_PowerOf10_U32[smallExponent]);
+        curTemp = temp;
+        pNextTemp = in;
     }
     else {
-        BigInt_Copy(curTemp, in);
+        curTemp = in;
+        pNextTemp = temp;
     }
 
     /* remove the low bits that we used for the 32-bit lookup table */
@@ -736,7 +837,7 @@ BigInt_MultiplyPow10(BigInt *result, const BigInt *in, npy_uint32 exponent)
             /* multiply into the next temporary */
             BigInt_Multiply(pNextTemp, curTemp, &g_PowerOf10_Big[tableIdx]);
 
-            // swap to the next temporary
+            /* swap to the next temporary */
             pSwap = curTemp;
             curTemp = pNextTemp;
             pNextTemp = pSwap;
@@ -748,7 +849,9 @@ BigInt_MultiplyPow10(BigInt *result, const BigInt *in, npy_uint32 exponent)
     }
 
     /* output the result */
-    BigInt_Copy(result, curTemp);
+    if (curTemp != in){
+        BigInt_Copy(in, curTemp);
+    }
 }
 
 /* result = 2^exponent */
@@ -1005,12 +1108,20 @@ BigInt_ShiftLeft(BigInt *result, npy_uint32 shift)
  * There is some more documentation of these changes on Ryan Juckett's website
  * at http://www.ryanjuckett.com/programming/printing-floating-point-numbers/
  *
- * Ryan Juckett's implementation did not implement "IEEE unbiased rounding",
- * except in the last digit. This has been added back, following the Burger &
- * Dybvig code, using the isEven variable.
+ * This code also has a few implementation differences from Ryan Juckett's
+ * version:
+ *  1. fixed overflow problems when mantissa was 64 bits (in float128 types),
+ *     by replacing multiplication by 2 or 4 by BigInt_ShiftLeft calls.
+ *  2. Increased c_BigInt_MaxBlocks, for 128-bit floats
+ *  3. Added more entries to the g_PowerOf10_Big table, for 128-bit floats.
+ *  4. Added unbiased rounding calculation with isEven. Ryan Juckett's
+ *     implementation did not implement "IEEE unbiased rounding", except in the
+ *     last digit. This has been added back, following the Burger & Dybvig
+ *     code, using the isEven variable.
  *
  * Arguments:
- *   * mantissa - value significand
+ *   * bigints - memory to store all bigints needed (7) for dragon4 computation.
+ *               The first BigInt should be filled in with the mantissa.
  *   * exponent - value exponent in base 2
  *   * mantissaBit - index of the highest set mantissa bit
  *   * hasUnequalMargins - is the high margin twice as large as the low margin
@@ -1019,9 +1130,11 @@ BigInt_ShiftLeft(BigInt *result, npy_uint32 shift)
  *   * pOutBuffer - buffer to output into
  *   * bufferSize - maximum characters that can be printed to pOutBuffer
  *   * pOutExponent - the base 10 exponent of the first digit
+ *
+ * Returns the number of digits written to the output buffer.
  */
 static npy_uint32
-Dragon4(const npy_uint64 mantissa, const npy_int32 exponent,
+Dragon4(BigInt *bigints, const npy_int32 exponent,
         const npy_uint32 mantissaBit, const npy_bool hasUnequalMargins,
         const DigitMode digitMode, const CutoffMode cutoffMode,
         npy_int32 cutoffNumber, char *pOutBuffer,
@@ -1037,21 +1150,24 @@ Dragon4(const npy_uint64 mantissa, const npy_int32 exponent,
      * Here, marginLow and marginHigh represent 1/2 of the distance to the next
      * floating point value above/below the mantissa.
      *
-     * scaledMarginHigh is a pointer so that it can point to scaledMarginLow in
-     * the case they must be equal to each other, otherwise it will point to
-     * optionalMarginHigh.
+     * scaledMarginHigh will point to scaledMarginLow in the case they must be
+     * equal to each other, otherwise it will point to optionalMarginHigh.
      */
-    BigInt scale;
-    BigInt scaledValue;
-    BigInt scaledMarginLow;
+    BigInt *mantissa = &bigints[0];  /* the only initialized bigint */
+    BigInt *scale = &bigints[1];
+    BigInt *scaledValue = &bigints[2];
+    BigInt *scaledMarginLow = &bigints[3];
     BigInt *scaledMarginHigh;
-    BigInt optionalMarginHigh;
+    BigInt *optionalMarginHigh = &bigints[4];
+
+    BigInt *temp1 = &bigints[5];
+    BigInt *temp2 = &bigints[6];
 
     const npy_float64 log10_2 = 0.30102999566398119521373889472449;
     npy_int32 digitExponent, cutoffExponent, hiBlock;
     npy_uint32 outputDigit;    /* current digit being output */
     npy_uint32 outputLen;
-    npy_bool isEven = (mantissa % 2) == 0;
+    npy_bool isEven = BigInt_IsEven(mantissa);
     npy_int32 cmp;
 
     /* values used to determine how to round */
@@ -1060,12 +1176,14 @@ Dragon4(const npy_uint64 mantissa, const npy_int32 exponent,
     DEBUG_ASSERT(bufferSize > 0);
 
     /* if the mantissa is zero, the value is zero regardless of the exponent */
-    if (mantissa == 0) {
+    if (BigInt_IsZero(mantissa)) {
         *curDigit = '0';
         *pOutExponent = 0;
         return 1;
     }
 
+    BigInt_Copy(scaledValue, mantissa);
+
     if (hasUnequalMargins) {
         /* if we have no fractional component */
         if (exponent > 0) {
@@ -1079,17 +1197,13 @@ Dragon4(const npy_uint64 mantissa, const npy_int32 exponent,
              */
 
             /* scaledValue      = 2 * 2 * mantissa*2^exponent */
-            BigInt_Set_uint64(&scaledValue, mantissa);
-            BigInt_ShiftLeft(&scaledValue, exponent + 2);
-
+            BigInt_ShiftLeft(scaledValue, exponent + 2);
             /* scale            = 2 * 2 * 1 */
-            BigInt_Set_uint32(&scale,  4);
-
+            BigInt_Set_uint32(scale,  4);
             /* scaledMarginLow  = 2 * 2^(exponent-1) */
-            BigInt_Pow2(&scaledMarginLow, exponent);
-
+            BigInt_Pow2(scaledMarginLow, exponent);
             /* scaledMarginHigh = 2 * 2 * 2^(exponent-1) */
-            BigInt_Pow2(&optionalMarginHigh, exponent + 1);
+            BigInt_Pow2(optionalMarginHigh, exponent + 1);
         }
         /* else we have a fractional exponent */
         else {
@@ -1099,34 +1213,27 @@ Dragon4(const npy_uint64 mantissa, const npy_int32 exponent,
              */
 
             /* scaledValue      = 2 * 2 * mantissa */
-            BigInt_Set_uint64(&scaledValue, mantissa);
-            BigInt_ShiftLeft(&scaledValue, 2);
-
+            BigInt_ShiftLeft(scaledValue, 2);
             /* scale            = 2 * 2 * 2^(-exponent) */
-            BigInt_Pow2(&scale, -exponent + 2);
-
+            BigInt_Pow2(scale, -exponent + 2);
             /* scaledMarginLow  = 2 * 2^(-1) */
-            BigInt_Set_uint32(&scaledMarginLow, 1);
-
+            BigInt_Set_uint32(scaledMarginLow, 1);
             /* scaledMarginHigh = 2 * 2 * 2^(-1) */
-            BigInt_Set_uint32(&optionalMarginHigh, 2);
+            BigInt_Set_uint32(optionalMarginHigh, 2);
         }
 
         /* the high and low margins are different */
-        scaledMarginHigh = &optionalMarginHigh;
+        scaledMarginHigh = optionalMarginHigh;
     }
     else {
         /* if we have no fractional component */
         if (exponent > 0) {
             /* scaledValue     = 2 * mantissa*2^exponent */
-            BigInt_Set_uint64(&scaledValue, mantissa);
-            BigInt_ShiftLeft(&scaledValue, exponent + 1);
-
+            BigInt_ShiftLeft(scaledValue, exponent + 1);
             /* scale           = 2 * 1 */
-            BigInt_Set_uint32(&scale, 2);
-
+            BigInt_Set_uint32(scale, 2);
             /* scaledMarginLow = 2 * 2^(exponent-1) */
-            BigInt_Pow2(&scaledMarginLow, exponent);
+            BigInt_Pow2(scaledMarginLow, exponent);
         }
         /* else we have a fractional exponent */
         else {
@@ -1136,18 +1243,15 @@ Dragon4(const npy_uint64 mantissa, const npy_int32 exponent,
              */
 
             /* scaledValue     = 2 * mantissa */
-            BigInt_Set_uint64(&scaledValue, mantissa);
-            BigInt_ShiftLeft(&scaledValue, 1);
-
+            BigInt_ShiftLeft(scaledValue, 1);
             /* scale           = 2 * 2^(-exponent) */
-            BigInt_Pow2(&scale, -exponent + 1);
-
+            BigInt_Pow2(scale, -exponent + 1);
             /* scaledMarginLow = 2 * 2^(-1) */
-            BigInt_Set_uint32(&scaledMarginLow, 1);
+            BigInt_Set_uint32(scaledMarginLow, 1);
         }
 
         /* the high and low margins are equal */
-        scaledMarginHigh = &scaledMarginLow;
+        scaledMarginHigh = scaledMarginLow;
     }
 
     /*
@@ -1170,6 +1274,9 @@ Dragon4(const npy_uint64 mantissa, const npy_int32 exponent,
      *                                                 <= log10(v) + log10(2)
      *  floor(log10(v)) < ceil((mantissaBit + exponent) * log10(2))
      *                                                 <= floor(log10(v)) + 1
+     *
+     *  Warning: This calculation assumes npy_float64 is an IEEE-binary64
+     *  float. This line may need to be updated if this is not the case.
      */
     digitExponent = (npy_int32)(
        ceil((npy_float64)((npy_int32)mantissaBit + exponent) * log10_2 - 0.69));
@@ -1191,31 +1298,29 @@ Dragon4(const npy_uint64 mantissa, const npy_int32 exponent,
     /* Divide value by 10^digitExponent. */
     if (digitExponent > 0) {
         /* A positive exponent creates a division so we multiply the scale. */
-        BigInt temp;
-        BigInt_MultiplyPow10(&temp, &scale, digitExponent);
-        BigInt_Copy(&scale, &temp);
+        BigInt_MultiplyPow10(scale, digitExponent, temp1);
     }
     else if (digitExponent < 0) {
         /*
          * A negative exponent creates a multiplication so we multiply up the
          * scaledValue, scaledMarginLow and scaledMarginHigh.
          */
-        BigInt pow10, temp;
-        BigInt_Pow10(&pow10, -digitExponent);
+        BigInt *temp=temp1, *pow10=temp2;
+        BigInt_Pow10(pow10, -digitExponent, temp);
 
-        BigInt_Multiply(&temp, &scaledValue, &pow10);
-        BigInt_Copy(&scaledValue, &temp);
+        BigInt_Multiply(temp, scaledValue, pow10);
+        BigInt_Copy(scaledValue, temp);
 
-        BigInt_Multiply(&temp, &scaledMarginLow, &pow10);
-        BigInt_Copy(&scaledMarginLow, &temp);
+        BigInt_Multiply(temp, scaledMarginLow, pow10);
+        BigInt_Copy(scaledMarginLow, temp);
 
-        if (scaledMarginHigh != &scaledMarginLow) {
-            BigInt_Multiply2(scaledMarginHigh, &scaledMarginLow);
+        if (scaledMarginHigh != scaledMarginLow) {
+            BigInt_Multiply2(scaledMarginHigh, scaledMarginLow);
         }
     }
 
     /* If (value >= 1), our estimate for digitExponent was too low */
-    if (BigInt_Compare(&scaledValue, &scale) >= 0) {
+    if (BigInt_Compare(scaledValue, scale) >= 0) {
         /*
          * The exponent estimate was incorrect.
          * Increment the exponent and don't perform the premultiply needed
@@ -1229,10 +1334,10 @@ Dragon4(const npy_uint64 mantissa, const npy_int32 exponent,
          * Multiply larger by the output base to prepare for the first loop
          * iteration.
          */
-        BigInt_Multiply10(&scaledValue);
-        BigInt_Multiply10(&scaledMarginLow);
-        if (scaledMarginHigh != &scaledMarginLow) {
-            BigInt_Multiply2(scaledMarginHigh, &scaledMarginLow);
+        BigInt_Multiply10(scaledValue);
+        BigInt_Multiply10(scaledMarginLow);
+        if (scaledMarginHigh != scaledMarginLow) {
+            BigInt_Multiply2(scaledMarginHigh, scaledMarginLow);
         }
     }
 
@@ -1273,8 +1378,8 @@ Dragon4(const npy_uint64 mantissa, const npy_int32 exponent,
      * to be less than or equal to 429496729 which is the highest number that
      * can be multiplied by 10 without overflowing to a new block.
      */
-    DEBUG_ASSERT(scale.length > 0);
-    hiBlock = scale.blocks[scale.length - 1];
+    DEBUG_ASSERT(scale->length > 0);
+    hiBlock = scale->blocks[scale->length - 1];
     if (hiBlock < 8 || hiBlock > 429496729) {
         npy_uint32 hiBlockLog2, shift;
 
@@ -1292,11 +1397,11 @@ Dragon4(const npy_uint64 mantissa, const npy_int32 exponent,
         DEBUG_ASSERT(hiBlockLog2 < 3 || hiBlockLog2 > 27);
         shift = (32 + 27 - hiBlockLog2) % 32;
 
-        BigInt_ShiftLeft(&scale, shift);
-        BigInt_ShiftLeft(&scaledValue, shift);
-        BigInt_ShiftLeft(&scaledMarginLow, shift);
-        if (scaledMarginHigh != &scaledMarginLow) {
-            BigInt_Multiply2(scaledMarginHigh, &scaledMarginLow);
+        BigInt_ShiftLeft(scale, shift);
+        BigInt_ShiftLeft(scaledValue, shift);
+        BigInt_ShiftLeft(scaledMarginLow, shift);
+        if (scaledMarginHigh != scaledMarginLow) {
+            BigInt_Multiply2(scaledMarginHigh, scaledMarginLow);
         }
     }
 
@@ -1308,25 +1413,25 @@ Dragon4(const npy_uint64 mantissa, const npy_int32 exponent,
          * terminate early.
          */
         for (;;) {
-            BigInt scaledValueHigh;
+            BigInt *scaledValueHigh = temp1;
 
             digitExponent = digitExponent-1;
 
             /* divide out the scale to extract the digit */
             outputDigit =
-                BigInt_DivideWithRemainder_MaxQuotient9(&scaledValue, &scale);
+                BigInt_DivideWithRemainder_MaxQuotient9(scaledValue, scale);
             DEBUG_ASSERT(outputDigit < 10);
 
             /* update the high end of the value */
-            BigInt_Add(&scaledValueHigh, &scaledValue, scaledMarginHigh);
+            BigInt_Add(scaledValueHigh, scaledValue, scaledMarginHigh);
 
             /*
              * stop looping if we are far enough away from our neighboring
              * values or if we have reached the cutoff digit
              */
-            cmp = BigInt_Compare(&scaledValue, &scaledMarginLow);
+            cmp = BigInt_Compare(scaledValue, scaledMarginLow);
             low = isEven ? (cmp <= 0) : (cmp < 0);
-            cmp = BigInt_Compare(&scaledValueHigh, &scale);
+            cmp = BigInt_Compare(scaledValueHigh, scale);
             high = isEven ? (cmp >= 0) : (cmp > 0);
             if (low | high | (digitExponent == cutoffExponent))
                 break;
@@ -1336,10 +1441,10 @@ Dragon4(const npy_uint64 mantissa, const npy_int32 exponent,
             ++curDigit;
 
             /* multiply larger by the output base */
-            BigInt_Multiply10(&scaledValue);
-            BigInt_Multiply10(&scaledMarginLow);
-            if (scaledMarginHigh != &scaledMarginLow) {
-                BigInt_Multiply2(scaledMarginHigh, &scaledMarginLow);
+            BigInt_Multiply10(scaledValue);
+            BigInt_Multiply10(scaledMarginLow);
+            if (scaledMarginHigh != scaledMarginLow) {
+                BigInt_Multiply2(scaledMarginHigh, scaledMarginLow);
             }
         }
     }
@@ -1357,10 +1462,11 @@ Dragon4(const npy_uint64 mantissa, const npy_int32 exponent,
 
             /* divide out the scale to extract the digit */
             outputDigit =
-                BigInt_DivideWithRemainder_MaxQuotient9(&scaledValue, &scale);
+                BigInt_DivideWithRemainder_MaxQuotient9(scaledValue, scale);
             DEBUG_ASSERT(outputDigit < 10);
 
-            if ((scaledValue.length == 0) | (digitExponent == cutoffExponent)) {
+            if ((scaledValue->length == 0) |
+                    (digitExponent == cutoffExponent)) {
                 break;
             }
 
@@ -1369,7 +1475,7 @@ Dragon4(const npy_uint64 mantissa, const npy_int32 exponent,
             ++curDigit;
 
             /* multiply larger by the output base */
-            BigInt_Multiply10(&scaledValue);
+            BigInt_Multiply10(scaledValue);
         }
     }
 
@@ -1387,8 +1493,8 @@ Dragon4(const npy_uint64 mantissa, const npy_int32 exponent,
          *  compare( scale * value, scale * 0.5 )
          *  compare( 2 * scale * value, scale )
          */
-        BigInt_Multiply2_inplace(&scaledValue);
-        compare = BigInt_Compare(&scaledValue, &scale);
+        BigInt_Multiply2_inplace(scaledValue);
+        compare = BigInt_Compare(scaledValue, scale);
         roundDown = compare < 0;
 
         /*
@@ -1443,134 +1549,53 @@ Dragon4(const npy_uint64 mantissa, const npy_int32 exponent,
 
 
 /*
- * Helper union to decompose a 16-bit IEEE float.
- * sign:      1 bit
- * exponent:  5 bits
- * mantissa: 10 bits
- */
-typedef union FloatUnion16
-{
-    npy_uint16 integer;
-} FloatUnion16;
-
-npy_bool   IsNegative_F16(FloatUnion16 *v) { return (v->integer >> 15) != 0; }
-npy_uint32 GetExponent_F16(FloatUnion16 *v) { return (v->integer >> 10) & bitmask_u32(5);}
-npy_uint32 GetMantissa_F16(FloatUnion16 *v) { return v->integer & bitmask_u32(10); }
-
-
-/*
- * Helper union to decompose a 32-bit IEEE float.
- * sign:      1 bit
- * exponent:  8 bits
- * mantissa: 23 bits
- */
-typedef union FloatUnion32
-{
-    npy_float32 floatingPoint;
-    npy_uint32 integer;
-} FloatUnion32;
-
-npy_bool   IsNegative_F32(FloatUnion32 *v) { return (v->integer >> 31) != 0; }
-npy_uint32 GetExponent_F32(FloatUnion32 *v) { return (v->integer >> 23) & bitmask_u32(8);}
-npy_uint32 GetMantissa_F32(FloatUnion32 *v) { return v->integer & bitmask_u32(23); }
-
-/*
- * Helper union to decompose a 64-bit IEEE float.
- * sign:      1 bit
- * exponent: 11 bits
- * mantissa: 52 bits
- */
-typedef union FloatUnion64
-{
-    npy_float64 floatingPoint;
-    npy_uint64 integer;
-} FloatUnion64;
-npy_bool   IsNegative_F64(FloatUnion64 *v) { return (v->integer >> 63) != 0; }
-npy_uint32 GetExponent_F64(FloatUnion64 *v) { return (v->integer >> 52) & bitmask_u32(11); }
-npy_uint64 GetMantissa_F64(FloatUnion64 *v) { return v->integer & bitmask_u64(52); }
-
-/*
- * Helper unions and datatype to decompose a 80-bit IEEE float
- * sign:      1 bit,  second u64
- * exponent: 15 bits, second u64
- * intbit     1 bit,  first u64
- * mantissa: 63 bits, first u64
- */
-
-/*
- * Since systems have different types of long doubles, and may not necessarily
- * have a 128-byte format we can use to pass values around, here we create
- * our own 128-bit storage type for convenience.
- */
-typedef struct FloatVal128 {
-    npy_uint64 integer[2];
-} FloatVal128;
-npy_bool   IsNegative_F128(FloatVal128 *v) {
-    return ((v->integer[1] >> 15) & 0x1) != 0;
-}
-npy_uint32 GetExponent_F128(FloatVal128 *v) { return v->integer[1] & bitmask_u32(15); }
-npy_uint64 GetMantissa_F128(FloatVal128 *v) {
-    return v->integer[0] & bitmask_u64(63);
-}
-
-/*
- * then for each different definition of long double, we create a union to
- * unpack the float data safely. We can then copy these integers to a
- * FloatVal128.
+ * The FormatPositional and FormatScientific functions have been more
+ * significantly rewritten relative to Ryan Juckett's code.
+ *
+ * The binary16 and the various 128-bit float functions are new, and adapted
+ * from the 64 bit version. The python interface functions are new.
  */
-#ifdef NPY_FLOAT128
-typedef union FloatUnion128
-{
-    npy_float128 floatingPoint;
-    struct {
-        npy_uint64 a;
-        npy_uint16 b;
-    } integer;
-} FloatUnion128;
-#endif
-
-#ifdef NPY_FLOAT96
-typedef union FloatUnion96
-{
-    npy_float96 floatingPoint;
-    struct {
-        npy_uint64 a;
-        npy_uint32 b;
-    } integer;
-} FloatUnion96;
-#endif
-
-#ifdef NPY_FLOAT80
-typedef union FloatUnion80
-{
-    npy_float80 floatingPoint;
-    struct {
-        npy_uint64 a;
-        npy_uint16 b;
-    } integer;
-} FloatUnion80;
-#endif
 
 
-/*
- * The main changes above this point, relative to Ryan Juckett's code, are:
- *  1. fixed overflow problems when mantissa was 64 bits (in float128 types),
- *     by replacing multiplication by 2 or 4 by BigInt_ShiftLeft calls.
- *  2. Increased c_BigInt_MaxBlocks
- *  3. Added more entries to the g_PowerOf10_Big table
- *  4. Added unbiased rounding calculation with isEven
+/* Options struct for easy passing of Dragon4 options.
  *
- * Below this point, the FormatPositional and FormatScientific functions have
- * been more significantly rewritten. The Dragon4_PrintFloat16 and
- * Dragon4_PrintFloat128 functions are new, and were adapted from the 64 and 32
- * bit versions. The python interfacing functions (in the header) are new.
+ *   scientific - boolean controlling whether scientific notation is used
+ *   digit_mode - whether to use unique or fixed fracional output
+ *   cutoff_mode - whether 'precision' refers to toal digits, or digits past
+ *                 the decimal point.
+ *   precision - When negative, prints as many digits as needed for a unique
+ *               number. When positive specifies the maximum number of
+ *               significant digits to print.
+ *   sign - whether to always show sign
+ *   trim_mode - how to treat trailing 0s and '.'. See TrimMode comments.
+ *   digits_left - pad characters to left of decimal point. -1 for no padding
+ *   digits_right - pad characters to right of decimal point. -1 for no padding.
+ *                  Padding adds whitespace until there are the specified
+ *                  number characters to sides of decimal point. Applies after
+ *                  trim_mode characters were removed. If digits_right is
+ *                  positive and the decimal point was trimmed, decimal point
+ *                  will be replaced by a whitespace character.
+ *   exp_digits - Only affects scientific output. If positive, pads the
+ *                exponent with 0s until there are this many digits. If
+ *                negative, only use sufficient digits.
  */
-
+typedef struct Dragon4_Options {
+    npy_bool scientific;
+    DigitMode digit_mode;
+    CutoffMode cutoff_mode;
+    npy_int32 precision;
+    npy_bool sign;
+    TrimMode trim_mode;
+    npy_int32 digits_left;
+    npy_int32 digits_right;
+    npy_int32 exp_digits;
+} Dragon4_Options;
 
 /*
  * Outputs the positive number with positional notation: ddddd.dddd
  * The output is always NUL terminated and the output length (not including the
  * NUL) is returned.
+ *
  * Arguments:
  *    buffer - buffer to output into
  *    bufferSize - maximum characters that can be printed to buffer
@@ -1579,20 +1604,11 @@ typedef union FloatUnion80
  *    signbit - value of the sign position. Should be '+', '-' or ''
  *    mantissaBit - index of the highest set mantissa bit
  *    hasUnequalMargins - is the high margin twice as large as the low margin
- *    precision - Negative prints as many digits as are needed for a unique
- *                number. Positive specifies the maximum number of significant
- *                digits to print past the decimal point.
- *    trim_mode - how to treat trailing 0s and '.'. See TrimMode comments.
- *    digits_left - pad characters to left of decimal point. -1 for no padding
- *    digits_right - pad characters to right of decimal point. -1 for no padding
- *                   padding adds whitespace until there are the specified
- *                   number characters to sides of decimal point. Applies after
- *                   trim_mode characters were removed. If digits_right is
- *                   positive and the decimal point was trimmed, decimal point
- *                   will be replaced by a whitespace character.
+ *
+ * See Dragon4_Options for description of remaining arguments.
  */
 static npy_uint32
-FormatPositional(char *buffer, npy_uint32 bufferSize, npy_uint64 mantissa,
+FormatPositional(char *buffer, npy_uint32 bufferSize, BigInt *mantissa,
                  npy_int32 exponent, char signbit, npy_uint32 mantissaBit,
                  npy_bool hasUnequalMargins, DigitMode digit_mode,
                  CutoffMode cutoff_mode, npy_int32 precision,
@@ -1719,7 +1735,7 @@ FormatPositional(char *buffer, npy_uint32 bufferSize, npy_uint64 mantissa,
 
     /* always add decimal point, except for DprZeros mode */
     if (trim_mode != TrimMode_DptZeros && numFractionDigits == 0 &&
-            pos < maxPrintLen){
+            pos < maxPrintLen) {
         buffer[pos++] = '.';
     }
 
@@ -1757,7 +1773,7 @@ FormatPositional(char *buffer, npy_uint32 bufferSize, npy_uint64 mantissa,
      * when rounding, we may still end up with trailing zeros. Remove them
      * depending on trim settings.
      */
-    if (precision >= 0 && trim_mode != TrimMode_None && numFractionDigits > 0){
+    if (precision >= 0 && trim_mode != TrimMode_None && numFractionDigits > 0) {
         while (buffer[pos-1] == '0') {
             pos--;
             numFractionDigits--;
@@ -1791,7 +1807,7 @@ FormatPositional(char *buffer, npy_uint32 bufferSize, npy_uint64 mantissa,
         npy_int32 shift = digits_left - (numWholeDigits + has_sign);
         npy_int32 count = pos;
 
-        if (count + shift > maxPrintLen){
+        if (count + shift > maxPrintLen) {
             count = maxPrintLen - shift;
         }
 
@@ -1815,6 +1831,7 @@ FormatPositional(char *buffer, npy_uint32 bufferSize, npy_uint64 mantissa,
  * Outputs the positive number with scientific notation: d.dddde[sign]ddd
  * The output is always NUL terminated and the output length (not including the
  * NUL) is returned.
+ *
  * Arguments:
  *    buffer - buffer to output into
  *    bufferSize - maximum characters that can be printed to buffer
@@ -1823,15 +1840,11 @@ FormatPositional(char *buffer, npy_uint32 bufferSize, npy_uint64 mantissa,
  *    signbit - value of the sign position. Should be '+', '-' or ''
  *    mantissaBit - index of the highest set mantissa bit
  *    hasUnequalMargins - is the high margin twice as large as the low margin
- *    precision - Negative prints as many digits as are needed for a unique
- *                number. Positive specifies the maximum number of significant
- *                digits to print past the decimal point.
- *    trim_mode - how to treat trailing 0s and '.'. See TrimMode comments.
- *    digits_left - pad characters to left of decimal point. -1 for no padding
- *    exp_digits - pads exponent with zeros until it has this many digits
+ *
+ * See Dragon4_Options for description of remaining arguments.
  */
 static npy_uint32
-FormatScientific (char *buffer, npy_uint32 bufferSize, npy_uint64 mantissa,
+FormatScientific (char *buffer, npy_uint32 bufferSize, BigInt *mantissa,
                   npy_int32 exponent, char signbit, npy_uint32 mantissaBit,
                   npy_bool hasUnequalMargins, DigitMode digit_mode,
                   npy_int32 precision, TrimMode trim_mode,
@@ -1856,7 +1869,7 @@ FormatScientific (char *buffer, npy_uint32 bufferSize, npy_uint64 mantissa,
     leftchars = 1 + (signbit == '-' || signbit == '+');
     if (digits_left > leftchars) {
         int i;
-        for (i = 0; i < digits_left - leftchars && bufferSize > 1; i++){
+        for (i = 0; i < digits_left - leftchars && bufferSize > 1; i++) {
             *pCurOut = ' ';
             pCurOut++;
             --bufferSize;
@@ -1904,7 +1917,7 @@ FormatScientific (char *buffer, npy_uint32 bufferSize, npy_uint64 mantissa,
 
     /* always add decimal point, except for DprZeros mode */
     if (trim_mode != TrimMode_DptZeros && numFractionDigits == 0 &&
-            bufferSize > 1){
+            bufferSize > 1) {
         *pCurOut = '.';
         ++pCurOut;
         --bufferSize;
@@ -1943,7 +1956,7 @@ FormatScientific (char *buffer, npy_uint32 bufferSize, npy_uint64 mantissa,
      * when rounding, we may still end up with trailing zeros. Remove them
      * depending on trim settings.
      */
-    if (precision >= 0 && trim_mode != TrimMode_None && numFractionDigits > 0){
+    if (precision >= 0 && trim_mode != TrimMode_None && numFractionDigits > 0) {
         --pCurOut;
         while (*pCurOut == '0') {
             --pCurOut;
@@ -1984,7 +1997,7 @@ FormatScientific (char *buffer, npy_uint32 bufferSize, npy_uint64 mantissa,
         DEBUG_ASSERT(printExponent < 100000);
 
         /* get exp digits */
-        for (i = 0; i < 5; i++){
+        for (i = 0; i < 5; i++) {
             digits[i] = printExponent % 10;
             printExponent /= 10;
         }
@@ -1993,7 +2006,7 @@ FormatScientific (char *buffer, npy_uint32 bufferSize, npy_uint64 mantissa,
         }
         exp_size = i;
         /* write remaining digits to tmp buf */
-        for (i = exp_size; i > 0; i--){
+        for (i = exp_size; i > 0; i--) {
             exponentBuffer[2 + (exp_size-i)] = (char)('0' + digits[i-1]);
         }
 
@@ -2069,12 +2082,12 @@ PrintInfNan(char *buffer, npy_uint32 bufferSize, npy_uint64 mantissa,
 
         /* only print sign for inf values (though nan can have a sign set) */
         if (signbit == '+') {
-            if (pos < maxPrintLen-1){
+            if (pos < maxPrintLen-1) {
                 buffer[pos++] = '+';
             }
         }
         else if (signbit == '-') {
-            if (pos < maxPrintLen-1){
+            if (pos < maxPrintLen-1) {
                 buffer[pos++] = '-';
             }
         }
@@ -2092,7 +2105,9 @@ PrintInfNan(char *buffer, npy_uint32 bufferSize, npy_uint64 mantissa,
         buffer[pos + printLen] = '\0';
 
         /*
-         * // XXX: Should we change this for numpy?
+         *  For numpy we ignore unusual mantissa values for nan, but keep this
+         *  code in case we change our mind later.
+         *
          * // append HEX value
          * if (maxPrintLen > 3) {
          *     printLen += PrintHex(buffer+3, bufferSize-3, mantissa,
@@ -2105,34 +2120,63 @@ PrintInfNan(char *buffer, npy_uint32 bufferSize, npy_uint64 mantissa,
 }
 
 /*
- * These functions print a floating-point number as a decimal string.
- * The output string is always NUL terminated and the string length (not
- * including the NUL) is returned.
+ * The functions below format a floating-point numbers stored in particular
+ * formats,  as a decimal string.  The output string is always NUL terminated
+ * and the string length (not including the NUL) is returned.
+ *
+ * For 16, 32 and 64 bit floats we assume they are the IEEE 754 type.
+ * For 128 bit floats we account for different definitions.
  *
  * Arguments are:
  *   buffer - buffer to output into
  *   bufferSize - maximum characters that can be printed to buffer
- *   value - value significand
- *   scientific - boolean controlling whether scientific notation is used
- *   precision - If positive, specifies the number of decimals to show after
- *               decimal point. If negative, sufficient digits to uniquely
- *               specify the float will be output.
- *   trim_mode - how to treat trailing zeros and decimal point. See TrimMode.
- *   digits_right - pad the result with '' on the right past the decimal point
- *   digits_left - pad the result with '' on the right past the decimal point
- *   exp_digits - Only affects scientific output. If positive, pads the
- *                exponent with 0s until there are this many digits. If
- *                negative, only use sufficient digits.
+ *   value - value to print
+ *   opt - Dragon4 options, see above
+ */
+
+/*
+ * Helper function that takes Dragon4 parameters and options and
+ * calls Dragon4.
+ */
+static npy_uint32
+Format_floatbits(char *buffer, npy_uint32 bufferSize, BigInt *mantissa,
+                 npy_int32 exponent, char signbit, npy_uint32 mantissaBit,
+                 npy_bool hasUnequalMargins, Dragon4_Options *opt)
+{
+    /* format the value */
+    if (opt->scientific) {
+        return FormatScientific(buffer, bufferSize, mantissa, exponent,
+                                signbit, mantissaBit, hasUnequalMargins,
+                                opt->digit_mode, opt->precision,
+                                opt->trim_mode, opt->digits_left,
+                                opt->exp_digits);
+    }
+    else {
+        return FormatPositional(buffer, bufferSize, mantissa, exponent,
+                                signbit, mantissaBit, hasUnequalMargins,
+                                opt->digit_mode, opt->cutoff_mode,
+                                opt->precision, opt->trim_mode,
+                                opt->digits_left, opt->digits_right);
+    }
+}
+
+/*
+ * IEEE binary16 floating-point format
+ *
+ * sign:      1 bit
+ * exponent:  5 bits
+ * mantissa: 10 bits
  */
 static npy_uint32
-Dragon4_PrintFloat16(char *buffer, npy_uint32 bufferSize, npy_uint16 value,
-                     npy_bool scientific, DigitMode digit_mode,
-                     CutoffMode cutoff_mode, npy_int32 precision,
-                     npy_bool sign, TrimMode trim_mode, npy_int32 digits_left,
-                     npy_int32 digits_right, npy_int32 exp_digits)
+Dragon4_PrintFloat_IEEE_binary16(
+        Dragon4_Scratch *scratch, npy_half *value, Dragon4_Options *opt)
 {
-    FloatUnion16 floatUnion;
-    npy_uint32 floatExponent, floatMantissa;
+    char *buffer = scratch->repr;
+    npy_uint32 bufferSize = sizeof(scratch->repr);
+    BigInt *bigints = scratch->bigints;
+
+    npy_uint16 val = *value;
+    npy_uint32 floatExponent, floatMantissa, floatSign;
 
     npy_uint32 mantissa;
     npy_int32 exponent;
@@ -2150,15 +2194,15 @@ Dragon4_PrintFloat16(char *buffer, npy_uint32 bufferSize, npy_uint16 value,
     }
 
     /* deconstruct the floating point value */
-    floatUnion.integer = value;
-    floatExponent = GetExponent_F16(&floatUnion);
-    floatMantissa = GetMantissa_F16(&floatUnion);
+    floatMantissa = val & bitmask_u32(10);
+    floatExponent = (val >> 10) & bitmask_u32(5);
+    floatSign = val >> 15;
 
     /* output the sign */
-    if (IsNegative_F16(&floatUnion)) {
+    if (floatSign != 0) {
         signbit = '-';
     }
-    else if (sign) {
+    else if (opt->sign) {
         signbit = '+';
     }
 
@@ -2207,29 +2251,33 @@ Dragon4_PrintFloat16(char *buffer, npy_uint32 bufferSize, npy_uint16 value,
         hasUnequalMargins  = NPY_FALSE;
     }
 
-    /* format the value */
-    if (scientific) {
-        return FormatScientific(buffer, bufferSize, mantissa, exponent, signbit,
-                                mantissaBit, hasUnequalMargins, digit_mode,
-                                precision, trim_mode, digits_left, exp_digits);
-    }
-    else {
-        return FormatPositional(buffer, bufferSize, mantissa, exponent, signbit,
-                                mantissaBit, hasUnequalMargins, digit_mode,
-                                cutoff_mode, precision, trim_mode,
-                                digits_left, digits_right);
-    }
+    BigInt_Set_uint32(&bigints[0], mantissa);
+    return Format_floatbits(buffer, bufferSize, bigints, exponent,
+                            signbit, mantissaBit, hasUnequalMargins, opt);
 }
 
+/*
+ * IEEE binary32 floating-point format
+ *
+ * sign:      1 bit
+ * exponent:  8 bits
+ * mantissa: 23 bits
+ */
 static npy_uint32
-Dragon4_PrintFloat32(char *buffer, npy_uint32 bufferSize, npy_float32 value,
-                     npy_bool scientific, DigitMode digit_mode,
-                     CutoffMode cutoff_mode, npy_int32 precision,
-                     npy_bool sign, TrimMode trim_mode, npy_int32 digits_left,
-                     npy_int32 digits_right, npy_int32 exp_digits)
+Dragon4_PrintFloat_IEEE_binary32(
+        Dragon4_Scratch *scratch, npy_float32 *value,
+        Dragon4_Options *opt)
 {
-    FloatUnion32 floatUnion;
-    npy_uint32 floatExponent, floatMantissa;
+    char *buffer = scratch->repr;
+    npy_uint32 bufferSize = sizeof(scratch->repr);
+    BigInt *bigints = scratch->bigints;
+
+    union
+    {
+        npy_float32 floatingPoint;
+        npy_uint32 integer;
+    } floatUnion;
+    npy_uint32 floatExponent, floatMantissa, floatSign;
 
     npy_uint32 mantissa;
     npy_int32 exponent;
@@ -2247,15 +2295,16 @@ Dragon4_PrintFloat32(char *buffer, npy_uint32 bufferSize, npy_float32 value,
     }
 
     /* deconstruct the floating point value */
-    floatUnion.floatingPoint = value;
-    floatExponent = GetExponent_F32(&floatUnion);
-    floatMantissa = GetMantissa_F32(&floatUnion);
+    floatUnion.floatingPoint = *value;
+    floatMantissa = floatUnion.integer & bitmask_u32(23);
+    floatExponent = (floatUnion.integer >> 23) & bitmask_u32(8);
+    floatSign = floatUnion.integer >> 31;
 
     /* output the sign */
-    if (IsNegative_F32(&floatUnion)) {
+    if (floatSign != 0) {
         signbit = '-';
     }
-    else if (sign) {
+    else if (opt->sign) {
         signbit = '+';
     }
 
@@ -2304,29 +2353,32 @@ Dragon4_PrintFloat32(char *buffer, npy_uint32 bufferSize, npy_float32 value,
         hasUnequalMargins  = NPY_FALSE;
     }
 
-    /* format the value */
-    if (scientific) {
-        return FormatScientific(buffer, bufferSize, mantissa, exponent, signbit,
-                                mantissaBit, hasUnequalMargins, digit_mode,
-                                precision, trim_mode, digits_left, exp_digits);
-    }
-    else {
-        return FormatPositional(buffer, bufferSize, mantissa, exponent, signbit,
-                                mantissaBit, hasUnequalMargins, digit_mode,
-                                cutoff_mode, precision, trim_mode,
-                                digits_left, digits_right);
-    }
+    BigInt_Set_uint32(&bigints[0], mantissa);
+    return Format_floatbits(buffer, bufferSize, bigints, exponent,
+                           signbit, mantissaBit, hasUnequalMargins, opt);
 }
 
+/*
+ * IEEE binary64 floating-point format
+ *
+ * sign:      1 bit
+ * exponent: 11 bits
+ * mantissa: 52 bits
+ */
 static npy_uint32
-Dragon4_PrintFloat64(char *buffer, npy_uint32 bufferSize, npy_float64 value,
-                     npy_bool scientific, DigitMode digit_mode,
-                     CutoffMode cutoff_mode, npy_int32 precision,
-                     npy_bool sign, TrimMode trim_mode, npy_int32 digits_left,
-                     npy_int32 digits_right, npy_int32 exp_digits)
+Dragon4_PrintFloat_IEEE_binary64(
+        Dragon4_Scratch *scratch, npy_float64 *value, Dragon4_Options *opt)
 {
-    FloatUnion64 floatUnion;
-    npy_uint32 floatExponent;
+    char *buffer = scratch->repr;
+    npy_uint32 bufferSize = sizeof(scratch->repr);
+    BigInt *bigints = scratch->bigints;
+
+    union
+    {
+        npy_float64 floatingPoint;
+        npy_uint64 integer;
+    } floatUnion;
+    npy_uint32 floatExponent, floatSign;
     npy_uint64 floatMantissa;
 
     npy_uint64 mantissa;
@@ -2345,15 +2397,16 @@ Dragon4_PrintFloat64(char *buffer, npy_uint32 bufferSize, npy_float64 value,
     }
 
     /* deconstruct the floating point value */
-    floatUnion.floatingPoint = value;
-    floatExponent = GetExponent_F64(&floatUnion);
-    floatMantissa = GetMantissa_F64(&floatUnion);
+    floatUnion.floatingPoint = *value;
+    floatMantissa = floatUnion.integer & bitmask_u64(52);
+    floatExponent = (floatUnion.integer >> 52) & bitmask_u32(11);
+    floatSign = floatUnion.integer >> 63;
 
     /* output the sign */
-    if (IsNegative_F64(&floatUnion)) {
+    if (floatSign != 0) {
         signbit = '-';
     }
-    else if (sign) {
+    else if (opt->sign) {
         signbit = '+';
     }
 
@@ -2402,28 +2455,48 @@ Dragon4_PrintFloat64(char *buffer, npy_uint32 bufferSize, npy_float64 value,
         hasUnequalMargins   = NPY_FALSE;
     }
 
-    /* format the value */
-    if (scientific) {
-        return FormatScientific(buffer, bufferSize, mantissa, exponent, signbit,
-                                mantissaBit, hasUnequalMargins, digit_mode,
-                                precision, trim_mode, digits_left, exp_digits);
-    }
-    else {
-        return FormatPositional(buffer, bufferSize, mantissa, exponent, signbit,
-                                mantissaBit, hasUnequalMargins, digit_mode,
-                                cutoff_mode, precision, trim_mode,
-                                digits_left, digits_right);
-    }
+    BigInt_Set_uint64(&bigints[0], mantissa);
+    return Format_floatbits(buffer, bufferSize, bigints, exponent,
+                            signbit, mantissaBit, hasUnequalMargins, opt);
 }
 
+
+/*
+ * Since systems have different types of long doubles, and may not necessarily
+ * have a 128-byte format we can use to pass values around, here we create
+ * our own 128-bit storage type for convenience.
+ */
+typedef struct FloatVal128 {
+    npy_uint64 hi, lo;
+} FloatVal128;
+
+#if defined(HAVE_LDOUBLE_INTEL_EXTENDED_10_BYTES_LE) || \
+    defined(HAVE_LDOUBLE_INTEL_EXTENDED_12_BYTES_LE) || \
+    defined(HAVE_LDOUBLE_INTEL_EXTENDED_16_BYTES_LE) || \
+    defined(HAVE_LDOUBLE_MOTOROLA_EXTENDED_12_BYTES_BE)
+/*
+ * Intel's 80-bit IEEE extended precision floating-point format
+ *
+ * "long doubles" with this format are stored as 96 or 128 bits, but
+ * are equivalent to the 80 bit type with some zero padding on the high bits.
+ * This method expects the user to pass in the value using a 128-bit
+ * FloatVal128, so can support 80, 96, or 128 bit storage formats,
+ * and is endian-independent.
+ *
+ * sign:      1 bit,  second u64
+ * exponent: 15 bits, second u64
+ * intbit     1 bit,  first u64
+ * mantissa: 63 bits, first u64
+ */
 static npy_uint32
-Dragon4_PrintFloat128(char *buffer, npy_uint32 bufferSize, FloatVal128 value,
-                      npy_bool scientific, DigitMode digit_mode,
-                      CutoffMode cutoff_mode, npy_int32 precision,
-                      npy_bool sign, TrimMode trim_mode, npy_int32 digits_left,
-                      npy_int32 digits_right, npy_int32 exp_digits)
+Dragon4_PrintFloat_Intel_extended(
+    Dragon4_Scratch *scratch, FloatVal128 value, Dragon4_Options *opt)
 {
-    npy_uint32 floatExponent;
+    char *buffer = scratch->repr;
+    npy_uint32 bufferSize = sizeof(scratch->repr);
+    BigInt *bigints = scratch->bigints;
+
+    npy_uint32 floatExponent, floatSign;
     npy_uint64 floatMantissa;
 
     npy_uint64 mantissa;
@@ -2441,20 +2514,27 @@ Dragon4_PrintFloat128(char *buffer, npy_uint32 bufferSize, FloatVal128 value,
         return 0;
     }
 
-    /* deconstruct the floating point value */
-    floatExponent = GetExponent_F128(&value);
-    floatMantissa = GetMantissa_F128(&value);
+    /* deconstruct the floating point value (we ignore the intbit) */
+    floatMantissa = value.lo & bitmask_u64(63);
+    floatExponent = value.hi & bitmask_u32(15);
+    floatSign = (value.hi >> 15) & 0x1;
 
     /* output the sign */
-    if (IsNegative_F128(&value)) {
+    if (floatSign != 0) {
         signbit = '-';
     }
-    else if (sign) {
+    else if (opt->sign) {
         signbit = '+';
     }
 
     /* if this is a special value */
     if (floatExponent == bitmask_u32(15)) {
+        /*
+         * Note: Technically there are other special extended values defined if
+         * the intbit is 0, like Pseudo-Infinity, Pseudo-Nan, Quiet-NaN. We
+         * ignore all of these since they are not generated on modern
+         * processors. We treat Quiet-Nan as simply Nan.
+         */
         return PrintInfNan(buffer, bufferSize, floatMantissa, 16, signbit);
     }
     /* else this is a number */
@@ -2498,247 +2578,417 @@ Dragon4_PrintFloat128(char *buffer, npy_uint32 bufferSize, FloatVal128 value,
         hasUnequalMargins   = NPY_FALSE;
     }
 
-    /* format the value */
-    if (scientific) {
-        return FormatScientific(buffer, bufferSize, mantissa, exponent, signbit,
-                                mantissaBit, hasUnequalMargins, digit_mode,
-                                precision, trim_mode, digits_left, exp_digits);
-    }
-    else {
-        return FormatPositional(buffer, bufferSize, mantissa, exponent, signbit,
-                                mantissaBit, hasUnequalMargins, digit_mode,
-                                cutoff_mode, precision, trim_mode,
-                                digits_left, digits_right);
-    }
+    BigInt_Set_uint64(&bigints[0], mantissa);
+    return Format_floatbits(buffer, bufferSize, bigints, exponent,
+                            signbit, mantissaBit, hasUnequalMargins, opt);
+
 }
 
-PyObject *
-Dragon4_Positional_AnySize(void *val, size_t size, DigitMode digit_mode,
-                           CutoffMode cutoff_mode, int precision, int sign,
-                           TrimMode trim, int pad_left, int pad_right)
+#endif /* INTEL_EXTENDED group */
+
+
+#ifdef HAVE_LDOUBLE_INTEL_EXTENDED_10_BYTES_LE
+/*
+ * Intel's 80-bit IEEE extended precision format, 80-bit storage
+ *
+ * Note: It is not clear if a long double with 10-byte storage exists on any
+ * system. But numpy defines NPY_FLOAT80, so if we come across it, assume it is
+ * an Intel extended format.
+ */
+static npy_uint32
+Dragon4_PrintFloat_Intel_extended80(
+    Dragon4_Scratch *scratch, npy_float80 *value, Dragon4_Options *opt)
 {
-    /*
-     * Use a very large buffer in case anyone tries to output a large numberG.
-     * 16384 should be enough to uniquely print any float128, which goes up
-     * to about 10^4932 */
-    static char repr[16384];
     FloatVal128 val128;
-#ifdef NPY_FLOAT80
-    FloatUnion80 buf80;;
-#endif
-#ifdef NPY_FLOAT96
-    FloatUnion96 buf96;
-#endif
+    union {
+        npy_float80 floatingPoint;
+        struct {
+            npy_uint64 a;
+            npy_uint16 b;
+        } integer;
+    } buf80;
+
+    buf80.floatingPoint = *value;
+    /* Intel is little-endian */
+    val128.lo = buf80.integer.a;
+    val128.hi = buf80.integer.b;
+
+    return Dragon4_PrintFloat_Intel_extended(scratch, val128, opt);
+}
+#endif /* HAVE_LDOUBLE_INTEL_EXTENDED_10_BYTES_LE */
+
+#ifdef HAVE_LDOUBLE_INTEL_EXTENDED_12_BYTES_LE
+/* Intel's 80-bit IEEE extended precision format, 96-bit storage */
+static npy_uint32
+Dragon4_PrintFloat_Intel_extended96(
+    Dragon4_Scratch *scratch, npy_float96 *value, Dragon4_Options *opt)
+{
+    FloatVal128 val128;
+    union {
+        npy_float96 floatingPoint;
+        struct {
+            npy_uint64 a;
+            npy_uint32 b;
+        } integer;
+    } buf96;
+
+    buf96.floatingPoint = *value;
+    /* Intel is little-endian */
+    val128.lo = buf96.integer.a;
+    val128.hi = buf96.integer.b;
+
+    return Dragon4_PrintFloat_Intel_extended(scratch, val128, opt);
+}
+#endif /* HAVE_LDOUBLE_INTEL_EXTENDED_12_BYTES_LE */
+
+#ifdef HAVE_LDOUBLE_MOTOROLA_EXTENDED_12_BYTES_BE
+/* Motorola Big-endian equivalent of the Intel-extended 96 fp format */
+static npy_uint32
+Dragon4_PrintFloat_Motorola_extended96(
+    Dragon4_Scratch *scratch, npy_float96 *value, Dragon4_Options *opt)
+{
+    FloatVal128 val128;
+    union {
+        npy_float96 floatingPoint;
+        struct {
+            npy_uint64 a;
+            npy_uint32 b;
+        } integer;
+    } buf96;
+
+    buf96.floatingPoint = *value;
+    /* Motorola is big-endian */
+    val128.lo = buf96.integer.b;
+    val128.hi = buf96.integer.a >> 16;
+    /* once again we assume the int has same endianness as the float */
+
+    return Dragon4_PrintFloat_Intel_extended(scratch, val128, opt);
+}
+#endif /* HAVE_LDOUBLE_MOTOROLA_EXTENDED_12_BYTES_BE */
+
+
 #ifdef NPY_FLOAT128
+
+typedef union FloatUnion128
+{
+    npy_float128 floatingPoint;
+    struct {
+        npy_uint64 a;
+        npy_uint64 b;
+    } integer;
+} FloatUnion128;
+
+#ifdef HAVE_LDOUBLE_INTEL_EXTENDED_16_BYTES_LE
+/* Intel's 80-bit IEEE extended precision format, 128-bit storage */
+static npy_uint32
+Dragon4_PrintFloat_Intel_extended128(
+    Dragon4_Scratch *scratch, npy_float128 *value, Dragon4_Options *opt)
+{
+    FloatVal128 val128;
     FloatUnion128 buf128;
-#endif
 
-    switch (size) {
-        case 2:
-            Dragon4_PrintFloat16(repr, sizeof(repr), *(npy_float16*)val,
-                                 0, digit_mode, cutoff_mode, precision,
-                                 sign, trim, pad_left, pad_right, -1);
-            break;
-        case 4:
-            Dragon4_PrintFloat32(repr, sizeof(repr), *(npy_float32*)val,
-                                 0, digit_mode, cutoff_mode, precision,
-                                 sign, trim, pad_left, pad_right, -1);
-            break;
-        case 8:
-            Dragon4_PrintFloat64(repr, sizeof(repr), *(npy_float64*)val,
-                                 0, digit_mode, cutoff_mode, precision,
-                                 sign, trim, pad_left, pad_right, -1);
-            break;
-#ifdef NPY_FLOAT80
-        case 10:
-            buf80.floatingPoint = *(npy_float80*)val;
-            val128.integer[0] = buf80.integer.a;
-            val128.integer[1] = buf80.integer.b;
-            Dragon4_PrintFloat128(repr, sizeof(repr), val128,
-                                  0, digit_mode, cutoff_mode, precision,
-                                  sign, trim, pad_left, pad_right, -1);
-            break;
-#endif
-#ifdef NPY_FLOAT96
-        case 12:
-            buf96.floatingPoint = *(npy_float96*)val;
-            val128.integer[0] = buf96.integer.a;
-            val128.integer[1] = buf96.integer.b;
-            Dragon4_PrintFloat128(repr, sizeof(repr), val128,
-                                  0, digit_mode, cutoff_mode, precision,
-                                  sign, trim, pad_left, pad_right, -1);
-            break;
-#endif
-#ifdef NPY_FLOAT128
-        case 16:
-            buf128.floatingPoint = *(npy_float128*)val;
-            val128.integer[0] = buf128.integer.a;
-            val128.integer[1] = buf128.integer.b;
-            Dragon4_PrintFloat128(repr, sizeof(repr), val128,
-                                  0, digit_mode, cutoff_mode, precision,
-                                  sign, trim, pad_left, pad_right, -1);
-            break;
-#endif
-        default:
-            PyErr_Format(PyExc_ValueError, "unexpected itemsize %zu", size);
-            return NULL;
+    buf128.floatingPoint = *value;
+    /* Intel is little-endian */
+    val128.lo = buf128.integer.a;
+    val128.hi = buf128.integer.b;
+
+    return Dragon4_PrintFloat_Intel_extended(scratch, val128, opt);
+}
+#endif /* HAVE_LDOUBLE_INTEL_EXTENDED_16_BYTES_LE */
+
+#if defined(HAVE_LDOUBLE_IEEE_QUAD_LE)
+/*
+ * IEEE binary128 floating-point format
+ *
+ * sign:       1 bit
+ * exponent:  15 bits
+ * mantissa: 112 bits
+ *
+ * Currently binary128 format exists on only a few CPUs, such as on the POWER9
+ * arch. Because of this, this code has not been tested. I am not sure if the
+ * arch also supports uint128, and C does not seem to support int128 literals.
+ * So we use uint64 to do manipulation. Unfortunately this means we are endian
+ * dependent. Assume little-endian for now, can fix later once binary128
+ * becomes more common.
+ */
+static npy_uint32
+Dragon4_PrintFloat_IEEE_binary128(
+    Dragon4_Scratch *scratch, npy_float128 *value, Dragon4_Options *opt)
+{
+    FloatUnion128 buf128;
+
+    char *buffer = scratch->repr;
+    npy_uint32 bufferSize = sizeof(scratch->repr);
+    BigInt *bigints = scratch->bigints;
+
+    npy_uint32 floatExponent, floatSign;
+
+    npy_uint64 mantissa_hi, mantissa_lo;
+    npy_int32 exponent;
+    npy_uint32 mantissaBit;
+    npy_bool hasUnequalMargins;
+    char signbit = '\0';
+
+    buf128.floatingPoint = *value;
+
+    if (bufferSize == 0) {
+        return 0;
+    }
+
+    if (bufferSize == 1) {
+        buffer[0] = '\0';
+        return 0;
+    }
+
+    /* Assumes little-endian !!! */
+    mantissa_hi = buf128.integer.a & bitmask_u64(48);
+    mantissa_lo = buf128.integer.b;
+    floatExponent = (buf128.integer.a >> 48) & bitmask_u32(15);
+    floatSign = buf128.integer.a >> 63;
+
+    /* output the sign */
+    if (floatSign != 0) {
+        signbit = '-';
+    }
+    else if (opt->sign) {
+        signbit = '+';
+    }
+
+    /* if this is a special value */
+    if (floatExponent == bitmask_u32(15)) {
+        npy_uint64 mantissa_zero = mantissa_hi == 0 && mantissa_lo == 0;
+        return PrintInfNan(buffer, bufferSize, !mantissa_zero, 16, signbit);
     }
+    /* else this is a number */
 
-    return PyUString_FromString(repr);
+    /* factor the value into its parts */
+    if (floatExponent != 0) {
+        /*
+         * normal
+         * The floating point equation is:
+         *  value = (1 + mantissa/2^112) * 2 ^ (exponent-16383)
+         * We convert the integer equation by factoring a 2^112 out of the
+         * exponent
+         *  value = (1 + mantissa/2^112) * 2^112 * 2 ^ (exponent-16383-112)
+         *  value = (2^112 + mantissa) * 2 ^ (exponent-16383-112)
+         * Because of the implied 1 in front of the mantissa we have 112 bits of
+         * precision.
+         *   m = (2^112 + mantissa)
+         *   e = (exponent-16383+1-112)
+         *
+         *   Adding 2^112 to the mantissa is the same as adding 2^48 to the hi
+         *   64 bit part.
+         */
+        mantissa_hi         = (1ull << 48) | mantissa_hi;
+        /* mantissa_lo is unchanged */
+        exponent            = floatExponent - 16383 - 112;
+        mantissaBit         = 112;
+        hasUnequalMargins   = (floatExponent != 1) && (mantissa_hi == 0 &&
+                                                       mantissa_lo == 0);
+    }
+    else {
+        /*
+         * subnormal
+         * The floating point equation is:
+         *  value = (mantissa/2^112) * 2 ^ (1-16383)
+         * We convert the integer equation by factoring a 2^112 out of the
+         * exponent
+         *  value = (mantissa/2^112) * 2^112 * 2 ^ (1-16383-112)
+         *  value = mantissa * 2 ^ (1-16383-112)
+         * We have up to 112 bits of precision.
+         *   m = (mantissa)
+         *   e = (1-16383-112)
+         */
+        exponent            = 1 - 16383 - 112;
+        mantissaBit         = LogBase2_128(mantissa_hi, mantissa_lo);
+        hasUnequalMargins   = NPY_FALSE;
+    }
+
+    BigInt_Set_2x_uint64(&bigints[0], mantissa_hi, mantissa_lo);
+    return Format_floatbits(buffer, bufferSize, bigints, exponent,
+                            signbit, mantissaBit, hasUnequalMargins, opt);
 }
+#endif /* HAVE_LDOUBLE_IEEE_QUAD_LE */
+
+#endif /* NPY_FLOAT128 */
+
+
+/*
+ * Here we define two Dragon4 entry functions for each type. One of them
+ * accepts the args in a Dragon4_Options struct for convenience, the
+ * other enumerates only the necessary parameters.
+ *
+ * Use a very large string buffer in case anyone tries to output a large number.
+ * 16384 should be enough to exactly print the integer part of any float128,
+ * which goes up to about 10^4932. The Dragon4_scratch struct provides a string
+ * buffer of this size.
+ */
+#define make_dragon4_typefuncs_inner(Type, npy_type, format) \
+\
+PyObject *\
+Dragon4_Positional_##Type##_opt(npy_type *val, Dragon4_Options *opt)\
+{\
+    PyObject *ret;\
+    Dragon4_Scratch *scratch = get_dragon4_bigint_scratch();\
+    if (scratch == NULL) {\
+        return NULL;\
+    }\
+    if (Dragon4_PrintFloat_##format(scratch, val, opt) < 0) {\
+        free_dragon4_bigint_scratch(scratch);\
+        return NULL;\
+    }\
+    ret = PyUString_FromString(scratch->repr);\
+    free_dragon4_bigint_scratch(scratch);\
+    return ret;\
+}\
+\
+PyObject *\
+Dragon4_Positional_##Type(npy_type *val, DigitMode digit_mode,\
+                   CutoffMode cutoff_mode, int precision,\
+                   int sign, TrimMode trim, int pad_left, int pad_right)\
+{\
+    Dragon4_Options opt;\
+    \
+    opt.scientific = 0;\
+    opt.digit_mode = digit_mode;\
+    opt.cutoff_mode = cutoff_mode;\
+    opt.precision = precision;\
+    opt.sign = sign;\
+    opt.trim_mode = trim;\
+    opt.digits_left = pad_left;\
+    opt.digits_right = pad_right;\
+    opt.exp_digits = -1;\
+\
+    return Dragon4_Positional_##Type##_opt(val, &opt);\
+}\
+\
+PyObject *\
+Dragon4_Scientific_##Type##_opt(npy_type *val, Dragon4_Options *opt)\
+{\
+    PyObject *ret;\
+    Dragon4_Scratch *scratch = get_dragon4_bigint_scratch();\
+    if (scratch == NULL) {\
+        return NULL;\
+    }\
+    if (Dragon4_PrintFloat_##format(scratch, val, opt) < 0) {\
+        free_dragon4_bigint_scratch(scratch);\
+        return NULL;\
+    }\
+    ret = PyUString_FromString(scratch->repr);\
+    free_dragon4_bigint_scratch(scratch);\
+    return ret;\
+}\
+PyObject *\
+Dragon4_Scientific_##Type(npy_type *val, DigitMode digit_mode, int precision,\
+                   int sign, TrimMode trim, int pad_left, int exp_digits)\
+{\
+    Dragon4_Options opt;\
+\
+    opt.scientific = 1;\
+    opt.digit_mode = digit_mode;\
+    opt.cutoff_mode = CutoffMode_TotalLength;\
+    opt.precision = precision;\
+    opt.sign = sign;\
+    opt.trim_mode = trim;\
+    opt.digits_left = pad_left;\
+    opt.digits_right = -1;\
+    opt.exp_digits = exp_digits;\
+\
+    return Dragon4_Scientific_##Type##_opt(val, &opt);\
+}
+
+#define make_dragon4_typefuncs(Type, npy_type, format) \
+        make_dragon4_typefuncs_inner(Type, npy_type, format)
+
+make_dragon4_typefuncs(Half, npy_half, NPY_HALF_BINFMT_NAME)
+make_dragon4_typefuncs(Float, npy_float, NPY_FLOAT_BINFMT_NAME)
+make_dragon4_typefuncs(Double, npy_double, NPY_DOUBLE_BINFMT_NAME)
+make_dragon4_typefuncs(LongDouble, npy_longdouble, NPY_LONGDOUBLE_BINFMT_NAME)
+
+#undef make_dragon4_typefuncs
+#undef make_dragon4_typefuncs_inner
 
 PyObject *
 Dragon4_Positional(PyObject *obj, DigitMode digit_mode, CutoffMode cutoff_mode,
                    int precision, int sign, TrimMode trim, int pad_left,
                    int pad_right)
 {
-    double val;
+    npy_double val;
+    Dragon4_Options opt;
+
+    opt.scientific = 0;
+    opt.digit_mode = digit_mode;
+    opt.cutoff_mode = cutoff_mode;
+    opt.precision = precision;
+    opt.sign = sign;
+    opt.trim_mode = trim;
+    opt.digits_left = pad_left;
+    opt.digits_right = pad_right;
+    opt.exp_digits = -1;
 
     if (PyArray_IsScalar(obj, Half)) {
         npy_half x = ((PyHalfScalarObject *)obj)->obval;
-        return Dragon4_Positional_AnySize(&x, sizeof(npy_half),
-                                          digit_mode, cutoff_mode, precision,
-                                          sign, trim, pad_left, pad_right);
+        return Dragon4_Positional_Half_opt(&x, &opt);
     }
     else if (PyArray_IsScalar(obj, Float)) {
         npy_float x = ((PyFloatScalarObject *)obj)->obval;
-        return Dragon4_Positional_AnySize(&x, sizeof(npy_float),
-                                          digit_mode, cutoff_mode, precision,
-                                          sign, trim, pad_left, pad_right);
+        return Dragon4_Positional_Float_opt(&x, &opt);
     }
     else if (PyArray_IsScalar(obj, Double)) {
         npy_double x = ((PyDoubleScalarObject *)obj)->obval;
-        return Dragon4_Positional_AnySize(&x, sizeof(npy_double),
-                                          digit_mode, cutoff_mode, precision,
-                                          sign, trim, pad_left, pad_right);
+        return Dragon4_Positional_Double_opt(&x, &opt);
     }
     else if (PyArray_IsScalar(obj, LongDouble)) {
         npy_longdouble x = ((PyLongDoubleScalarObject *)obj)->obval;
-        return Dragon4_Positional_AnySize(&x, sizeof(npy_longdouble),
-                                          digit_mode, cutoff_mode, precision,
-                                          sign, trim, pad_left, pad_right);
+        return Dragon4_Positional_LongDouble_opt(&x, &opt);
     }
 
     val = PyFloat_AsDouble(obj);
     if (PyErr_Occurred()) {
         return NULL;
     }
-    return Dragon4_Positional_AnySize(&val, sizeof(double),
-                                      digit_mode, cutoff_mode, precision,
-                                      sign, trim, pad_left, pad_right);
-}
-
-PyObject *
-Dragon4_Scientific_AnySize(void *val, size_t size, DigitMode digit_mode,
-                           int precision, int sign, TrimMode trim,
-                           int pad_left, int exp_digits)
-{
-    /* use a very large buffer in case anyone tries to output a large precision */
-    static char repr[4096];
-    FloatVal128 val128;
-#ifdef NPY_FLOAT80
-    FloatUnion80 buf80;;
-#endif
-#ifdef NPY_FLOAT96
-    FloatUnion96 buf96;
-#endif
-#ifdef NPY_FLOAT128
-    FloatUnion128 buf128;
-#endif
-
-    /* dummy, is ignored in scientific mode */
-    CutoffMode cutoff_mode = CutoffMode_TotalLength;
-
-    switch (size) {
-        case 2:
-            Dragon4_PrintFloat16(repr, sizeof(repr), *(npy_float16*)val,
-                                 1, digit_mode, cutoff_mode, precision, sign,
-                                 trim, pad_left, -1, exp_digits);
-            break;
-        case 4:
-            Dragon4_PrintFloat32(repr, sizeof(repr), *(npy_float32*)val,
-                                 1, digit_mode, cutoff_mode, precision, sign,
-                                 trim, pad_left, -1, exp_digits);
-            break;
-        case 8:
-            Dragon4_PrintFloat64(repr, sizeof(repr), *(npy_float64*)val,
-                                 1, digit_mode, cutoff_mode, precision, sign,
-                                 trim, pad_left, -1, exp_digits);
-            break;
-#ifdef NPY_FLOAT80
-        case 10:
-            buf80.floatingPoint = *(npy_float80*)val;
-            val128.integer[0] = buf80.integer.a;
-            val128.integer[1] = buf80.integer.b;
-            Dragon4_PrintFloat128(repr, sizeof(repr), val128,
-                                  1, digit_mode, cutoff_mode, precision, sign,
-                                  trim, pad_left, -1, exp_digits);
-            break;
-#endif
-#ifdef NPY_FLOAT96
-        case 12:
-            buf96.floatingPoint = *(npy_float96*)val;
-            val128.integer[0] = buf96.integer.a;
-            val128.integer[1] = buf96.integer.b;
-            Dragon4_PrintFloat128(repr, sizeof(repr), val128,
-                                  1, digit_mode, cutoff_mode, precision, sign,
-                                  trim, pad_left, -1, exp_digits);
-            break;
-#endif
-#ifdef NPY_FLOAT128
-        case 16:
-            buf128.floatingPoint = *(npy_float128*)val;
-            val128.integer[0] = buf128.integer.a;
-            val128.integer[1] = buf128.integer.b;
-            Dragon4_PrintFloat128(repr, sizeof(repr), val128,
-                                  1, digit_mode, cutoff_mode, precision, sign,
-                                  trim, pad_left, -1, exp_digits);
-            break;
-#endif
-        default:
-            PyErr_Format(PyExc_ValueError, "unexpected itemsize %zu", size);
-            return NULL;
-    }
-
-    return PyUString_FromString(repr);
+    return Dragon4_Positional_Double_opt(&val, &opt);
 }
 
 PyObject *
 Dragon4_Scientific(PyObject *obj, DigitMode digit_mode, int precision,
                    int sign, TrimMode trim, int pad_left, int exp_digits)
 {
-    double val;
+    npy_double val;
+    Dragon4_Options opt;
+
+    opt.scientific = 1;
+    opt.digit_mode = digit_mode;
+    opt.cutoff_mode = CutoffMode_TotalLength;
+    opt.precision = precision;
+    opt.sign = sign;
+    opt.trim_mode = trim;
+    opt.digits_left = pad_left;
+    opt.digits_right = -1;
+    opt.exp_digits = exp_digits;
 
     if (PyArray_IsScalar(obj, Half)) {
         npy_half x = ((PyHalfScalarObject *)obj)->obval;
-        return Dragon4_Scientific_AnySize(&x, sizeof(npy_half),
-                                          digit_mode, precision,
-                                          sign, trim, pad_left, exp_digits);
+        return Dragon4_Scientific_Half_opt(&x, &opt);
     }
     else if (PyArray_IsScalar(obj, Float)) {
         npy_float x = ((PyFloatScalarObject *)obj)->obval;
-        return Dragon4_Scientific_AnySize(&x, sizeof(npy_float),
-                                          digit_mode, precision,
-                                          sign, trim, pad_left, exp_digits);
+        return Dragon4_Scientific_Float_opt(&x, &opt);
     }
     else if (PyArray_IsScalar(obj, Double)) {
         npy_double x = ((PyDoubleScalarObject *)obj)->obval;
-        return Dragon4_Scientific_AnySize(&x, sizeof(npy_double),
-                                          digit_mode, precision,
-                                          sign, trim, pad_left, exp_digits);
+        return Dragon4_Scientific_Double_opt(&x, &opt);
     }
     else if (PyArray_IsScalar(obj, LongDouble)) {
         npy_longdouble x = ((PyLongDoubleScalarObject *)obj)->obval;
-        return Dragon4_Scientific_AnySize(&x, sizeof(npy_longdouble),
-                                          digit_mode, precision,
-                                          sign, trim, pad_left, exp_digits);
+        return Dragon4_Scientific_LongDouble_opt(&x, &opt);
     }
 
     val = PyFloat_AsDouble(obj);
     if (PyErr_Occurred()) {
         return NULL;
     }
-    return Dragon4_Scientific_AnySize(&val, sizeof(double),
-                                      digit_mode, precision,
-                                      sign, trim, pad_left, exp_digits);
+    return Dragon4_Scientific_Double_opt(&val, &opt);
 }
+
+#undef DEBUG_ASSERT
diff --git a/numpy/core/src/multiarray/dragon4.h b/numpy/core/src/multiarray/dragon4.h
index 5559c5157..176b79f69 100644
--- a/numpy/core/src/multiarray/dragon4.h
+++ b/numpy/core/src/multiarray/dragon4.h
@@ -40,6 +40,45 @@
 #include "npy_pycompat.h"
 #include "numpy/arrayscalars.h"
 
+/* Half binary format */
+#define NPY_HALF_BINFMT_NAME IEEE_binary16
+
+/* Float binary format */
+#if NPY_BITSOF_FLOAT == 32
+    #define NPY_FLOAT_BINFMT_NAME IEEE_binary32
+#elif NPY_BITSOF_FLOAT == 64
+    #define NPY_FLOAT_BINFMT_NAME IEEE_binary64
+#else
+    #error No float representation defined
+#endif
+
+/* Double binary format */
+#if NPY_BITSOF_DOUBLE == 32
+    #define NPY_DOUBLE_BINFMT_NAME IEEE_binary32
+#elif NPY_BITSOF_DOUBLE == 64
+    #define NPY_DOUBLE_BINFMT_NAME IEEE_binary64
+#else
+    #error No double representation defined
+#endif
+
+/* LongDouble binary format */
+#if defined(HAVE_LDOUBLE_IEEE_QUAD_BE)
+    #define NPY_LONGDOUBLE_BINFMT_NAME IEEE_binary128_be
+#elif defined(HAVE_LDOUBLE_IEEE_QUAD_LE)
+    #define NPY_LONGDOUBLE_BINFMT_NAME IEEE_binary128_le
+#elif (defined(HAVE_LDOUBLE_IEEE_DOUBLE_LE) || \
+       defined(HAVE_LDOUBLE_IEEE_DOUBLE_BE))
+    #define NPY_LONGDOUBLE_BINFMT_NAME IEEE_binary64
+#elif defined(HAVE_LDOUBLE_INTEL_EXTENDED_12_BYTES_LE)
+    #define NPY_LONGDOUBLE_BINFMT_NAME Intel_extended96
+#elif defined(HAVE_LDOUBLE_INTEL_EXTENDED_16_BYTES_LE)
+    #define NPY_LONGDOUBLE_BINFMT_NAME Intel_extended128
+#elif defined(HAVE_LDOUBLE_MOTOROLA_EXTENDED_12_BYTES_BE)
+    #define NPY_LONGDOUBLE_BINFMT_NAME Motorola_extended96
+#else
+    #error No long double representation defined
+#endif
+
 typedef enum DigitMode
 {
     /* Round digits to print shortest uniquely identifiable number. */
@@ -64,15 +103,23 @@ typedef enum TrimMode
     TrimMode_DptZeros,     /* trim trailing zeros & trailing decimal point */
 } TrimMode;
 
-PyObject *
-Dragon4_Positional_AnySize(void *val, size_t size, DigitMode digit_mode,
-                           CutoffMode cutoff_mode, int precision, int sign,
-                           TrimMode trim, int pad_left, int pad_right);
+#define make_dragon4_typedecl(Type, npy_type) \
+    PyObject *\
+    Dragon4_Positional_##Type(npy_type *val, DigitMode digit_mode,\
+                              CutoffMode cutoff_mode, int precision,\
+                              int sign, TrimMode trim, int pad_left,\
+                              int pad_right);\
+    PyObject *\
+    Dragon4_Scientific_##Type(npy_type *val, DigitMode digit_mode,\
+                              int precision, int sign, TrimMode trim,\
+                              int pad_left, int exp_digits);
 
-PyObject *
-Dragon4_Scientific_AnySize(void *val, size_t size, DigitMode digit_mode,
-                           int precision, int sign, TrimMode trim,
-                           int pad_left, int pad_right);
+make_dragon4_typedecl(Half, npy_half)
+make_dragon4_typedecl(Float, npy_float)
+make_dragon4_typedecl(Double, npy_double)
+make_dragon4_typedecl(LongDouble, npy_longdouble)
+
+#undef make_dragon4_typedecl
 
 PyObject *
 Dragon4_Positional(PyObject *obj, DigitMode digit_mode, CutoffMode cutoff_mode,
diff --git a/numpy/core/src/multiarray/scalartypes.c.src b/numpy/core/src/multiarray/scalartypes.c.src
index 6dc7e5a3e..25e0668ed 100644
--- a/numpy/core/src/multiarray/scalartypes.c.src
+++ b/numpy/core/src/multiarray/scalartypes.c.src
@@ -423,6 +423,7 @@ gentype_format(PyObject *self, PyObject *args)
 
 /**begin repeat
  * #name = half, float, double, longdouble#
+ * #Name = Half, Float, Double, LongDouble#
  * #NAME = HALF, FLOAT, DOUBLE, LONGDOUBLE#
  * #type = npy_half, npy_float, npy_double, npy_longdouble#
  * #suff = h, f, d, l#
@@ -434,12 +435,12 @@ format_@name@(@type@ val, npy_bool scientific,
               int pad_left, int pad_right, int exp_digits)
 {
     if (scientific) {
-        return Dragon4_Scientific_AnySize(&val, sizeof(@type@),
+        return Dragon4_Scientific_@Name@(&val,
                         DigitMode_Unique, precision,
                         sign, trim, pad_left, exp_digits);
     }
     else {
-        return Dragon4_Positional_AnySize(&val, sizeof(@type@),
+        return Dragon4_Positional_@Name@(&val,
                         DigitMode_Unique, CutoffMode_TotalLength, precision,
                         sign, trim, pad_left, pad_right);
     }