Recognize POSIX nan/inf representations in NumPyOS_strtod.

Collect some NaN and Inf constants to a single place.
author: Pauli Virtanen <pav@iki.fi> 2009-01-12 21:22:51 +0000
committer: Pauli Virtanen <pav@iki.fi> 2009-01-12 21:22:51 +0000
commit: 2dd3ad531b59ee15ef54aeb10b261956d7407d94 (patch)
tree: 354632e303aef59ce12a38a9f1ca90203a1f65b4 /numpy/core/src/numpyos.c
parent: 8f9ab09976f4f10801eff1e0cb5a8bbe6ba8f8ae (diff)
download: numpy-2dd3ad531b59ee15ef54aeb10b261956d7407d94.tar.gz
1 files changed, 627 insertions, 0 deletions
diff --git a/numpy/core/src/numpyos.c b/numpy/core/src/numpyos.c
new file mode 100644
index 000000000..8a4a8681c
--- /dev/null
+++ b/numpy/core/src/numpyos.c
@@ -0,0 +1,627 @@
+#include <locale.h>
+#include <stdio.h>
+
+/* From the C99 standard, section 7.19.6: The exponent always contains at least
+   two digits, and only as many more digits as necessary to represent the
+   exponent.
+*/
+/* We force 3 digits on Windows for Python < 2.6 for compatibility reasons */
+#if defined(MS_WIN32) && (PY_VERSION_HEX < 0x02060000)
+#define MIN_EXPONENT_DIGITS 3
+#else
+#define MIN_EXPONENT_DIGITS 2
+#endif
+
+/* Ensure that any exponent, if present, is at least MIN_EXPONENT_DIGITS
+   in length.
+
+   buffer:   NUL-terminated formatted number; it is modified in place.
+   buf_size: total size of buffer in bytes; it is only consulted when
+             zeros have to be inserted.
+
+   Only an exponent introduced by 'e' or 'E' that is immediately followed
+   by an explicit '+' or '-' sign is processed; otherwise the buffer is
+   left untouched. */
+static void
+_ensure_minimum_exponent_length(char* buffer, size_t buf_size)
+{
+    char *p = strpbrk(buffer, "eE");
+    if (p && (*(p + 1) == '-' || *(p + 1) == '+')) {
+        char *start = p + 2;
+        int exponent_digit_cnt = 0;
+        int leading_zero_cnt = 0;
+        int in_leading_zeros = 1;
+        int significant_digit_cnt;
+
+        /* Skip over the exponent marker ('e' or 'E') and the sign. */
+        p += 2;
+
+        /* Find the end of the exponent, keeping track of leading
+           zeros.  A zero only counts as "leading" while no non-zero
+           digit has been seen yet. */
+        while (*p && isdigit(Py_CHARMASK(*p))) {
+            if (in_leading_zeros && *p == '0')
+                ++leading_zero_cnt;
+            if (*p != '0')
+                in_leading_zeros = 0;
+            ++p;
+            ++exponent_digit_cnt;
+        }
+
+        significant_digit_cnt = exponent_digit_cnt - leading_zero_cnt;
+        if (exponent_digit_cnt == MIN_EXPONENT_DIGITS) {
+            /* If there are exactly MIN_EXPONENT_DIGITS digits, we're
+               done, regardless of what they contain */
+        }
+        else if (exponent_digit_cnt > MIN_EXPONENT_DIGITS) {
+            int extra_zeros_cnt;
+
+            /* There are more than MIN_EXPONENT_DIGITS digits in the
+               exponent.  See if we can delete some of the leading zeros,
+               while never going below MIN_EXPONENT_DIGITS digits */
+            if (significant_digit_cnt < MIN_EXPONENT_DIGITS)
+                significant_digit_cnt = MIN_EXPONENT_DIGITS;
+
+            extra_zeros_cnt = exponent_digit_cnt - significant_digit_cnt;
+
+            /* Delete extra_zeros_cnt worth of characters from the
+               front of the exponent */
+            assert(extra_zeros_cnt >= 0);
+
+            /* Add one to significant_digit_cnt to copy the
+               byte that follows the kept digits (the terminating
+               0 byte when the exponent ends the string), thus
+               setting the length */
+            memmove(start, start + extra_zeros_cnt, significant_digit_cnt + 1);
+        }
+        else {
+            /* If there are fewer than MIN_EXPONENT_DIGITS digits, add
+               zeros until there are enough, if there's enough room;
+               when the buffer is too small the exponent is left as is */
+            int zeros = MIN_EXPONENT_DIGITS - exponent_digit_cnt;
+            if (start + zeros + exponent_digit_cnt + 1 < buffer + buf_size) {
+                memmove(start + zeros, start, exponent_digit_cnt + 1);
+                memset(start, '0', zeros);
+            }
+        }
+    }
+}
+
+/* Make sure the number in buffer contains a '.' followed by at least one
+   digit.  The inserted separator is always '.', never the locale's.
+
+   buffer:   NUL-terminated formatted number; it is modified in place.
+   buf_size: total size of buffer in bytes.  When the extra characters
+             would not fit, the buffer is silently left unchanged. */
+static void
+_ensure_decimal_point(char* buffer, size_t buf_size)
+{
+    const char *insertion;
+    size_t insertion_len;
+    size_t cur_len;
+    char *pos = buffer;
+
+    /* Step over an optional sign and the integer digits. */
+    if (*pos == '-' || *pos == '+') {
+        ++pos;
+    }
+    while (*pos && isdigit(Py_CHARMASK(*pos))) {
+        ++pos;
+    }
+
+    if (*pos != '.') {
+        /* No decimal point at all: insert ".0" right here. */
+        insertion = ".0";
+        insertion_len = 2;
+    }
+    else if (!isdigit(Py_CHARMASK(*(pos + 1)))) {
+        /* A decimal point without a following digit: insert a zero
+           after it. */
+        ++pos;
+        insertion = "0";
+        insertion_len = 1;
+    }
+    else {
+        /* Already of the form "<digits>.<digit>": nothing to do. */
+        return;
+    }
+
+    cur_len = strlen(buffer);
+    if (cur_len + insertion_len + 1 >= buf_size) {
+        /* Not enough room for the additional text; not worth
+           generating an error over. */
+        return;
+    }
+
+    /* Shift the tail (including the terminating NUL) to open a gap,
+       then fill the gap. */
+    memmove(pos + insertion_len, pos, (size_t)(buffer + cur_len - pos) + 1);
+    memcpy(pos, insertion, insertion_len);
+}
+
+/* see FORMATBUFLEN in unicodeobject.c */
+#define FLOAT_FORMATBUFLEN 120
+
+/* Replace the current locale's decimal separator in buffer with a plain
+   '.'.  The separator is only looked for directly after an optional sign
+   and a run of digits.  The string can only stay the same length or
+   shrink, so no buffer size parameter is needed. */
+static void
+_change_decimal_from_locale_to_dot(char* buffer)
+{
+    const char *sep = localeconv()->decimal_point;
+    size_t sep_len;
+    char *tail;
+
+    /* The locale already uses "." -- nothing to translate. */
+    if (sep[0] == '.' && sep[1] == '\0') {
+        return;
+    }
+    sep_len = strlen(sep);
+
+    if (*buffer == '+' || *buffer == '-') {
+        ++buffer;
+    }
+    while (isdigit(Py_CHARMASK(*buffer))) {
+        ++buffer;
+    }
+    if (strncmp(buffer, sep, sep_len) != 0) {
+        return;
+    }
+
+    *buffer++ = '.';
+    if (sep_len > 1) {
+        /* Multi-byte separator: close the gap left behind the dot,
+           moving the terminating NUL along with the tail. */
+        tail = buffer + (sep_len - 1);
+        memmove(buffer, tail, strlen(tail) + 1);
+    }
+}
+
+/*
+ * Check that the format string is a valid one for NumPyOS_ascii_format*
+ *
+ *      - format: NUL-terminated printf()-style format string
+ *
+ * Return value: 0 if the format is accepted, -1 otherwise.
+ *
+ * A format is accepted when it starts with '%', contains no single quote,
+ * lowercase 'l' or further '%' after the first character, and ends with
+ * one of the conversion characters 'e', 'E', 'f', 'F', 'g', 'G'.
+ */
+static int
+_check_ascii_format(const char *format)
+{
+    char format_char;
+    size_t format_len;
+
+    /* Checking the first character before anything else also rejects the
+       empty string, so format_len below is always >= 1 and the index
+       format_len - 1 cannot wrap around. */
+    if (format[0] != '%') {
+        return -1;
+    }
+
+    /* Reject a single quote, a lowercase l, or a second percent anywhere
+       after the leading '%'. */
+    if (strpbrk(format + 1, "'l%")) {
+        return -1;
+    }
+
+    /* The last character in the format string must be the format char */
+    format_len = strlen(format);
+    format_char = format[format_len - 1];
+
+    /* Also curious about this function is that it accepts format strings
+       like "%xg", which are invalid for floats.  In general, the
+       interface to this function is not very good, but changing it is
+       difficult because it's a public API. */
+
+    if (!(format_char == 'e' || format_char == 'E' ||
+          format_char == 'f' || format_char == 'F' ||
+          format_char == 'g' || format_char == 'G')) {
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Post-process a freshly formatted number in buf (of size buflen), in
+ * three steps:
+ *   1. turn the locale's decimal separator back into '.',
+ *   2. normalise the exponent to MIN_EXPONENT_DIGITS digits where possible,
+ *   3. if decimal != 0, guarantee a '.' followed by at least one digit
+ *      (same effect as the 'Z' format in PyOS_ascii_formatd).
+ *
+ * Returns buf.
+ */
+static char*
+_fix_ascii_format(char* buf, size_t buflen, int decimal)
+{
+    /* Step 1: locale separator -> '.' */
+    _change_decimal_from_locale_to_dot(buf);
+
+    /* Step 2: pad a too-short exponent with zeros if the buffer has room,
+       and strip superfluous leading zeros from a too-long one. */
+    _ensure_minimum_exponent_length(buf, buflen);
+
+    /* Step 3: optional ".0" completion. */
+    if (decimal) {
+        _ensure_decimal_point(buf, buflen);
+    }
+
+    return buf;
+}
+
+/*
+ * NumPyOS_ascii_format*:
+ *      - buffer: A buffer to place the resulting string in
+ *      - buf_size: The length of the buffer.
+ *      - format: The printf()-style format to use for converting.
+ *      - val: The value to convert
+ *      - decimal: if != 0, always has a decimal point, and at least one
+ *        digit after it. This has the same effect as passing 'Z' in the
+ *        original PyOS_ascii_formatd
+ *
+ * This is similar to PyOS_ascii_formatd in python > 2.6, except that it does
+ * not handle 'n', and handles nan / inf.
+ *
+ * Converts a floating-point value to a string, using '.' as the decimal
+ * point. To format the number you pass in a printf()-style format string.
+ * Allowed conversion specifiers are 'e', 'E', 'f', 'F', 'g', 'G'.
+ *
+ * The macro below generates one function per value type: suffix f takes a
+ * float, d a double and l a long double.  When
+ * FORCE_NO_LONG_DOUBLE_FORMATTING is defined, the l variant casts the
+ * value to double before printing.
+ *
+ * Non-finite values do not go through the format string (it is not even
+ * validated for them) and the decimal argument is ignored: they are written
+ * as "nan", "inf" or "-inf".
+ *
+ * Return value: The pointer to the buffer with the converted string, or
+ * NULL if a finite value is given with a format rejected by
+ * _check_ascii_format, or if the buffer is too small to hold the
+ * "nan" / "inf" / "-inf" text.
+ */
+#define _ASCII_FORMAT(type, suffix, print_type)                         \
+    static char*                                                        \
+    NumPyOS_ascii_format ## suffix(char *buffer, size_t buf_size,       \
+                                   const char *format,                  \
+                                   type val, int decimal)               \
+    {                                                                   \
+	if (isfinite(val)) {                                            \
+            if(_check_ascii_format(format)) {                           \
+                return NULL;                                            \
+            }                                                           \
+            PyOS_snprintf(buffer, buf_size, format, (print_type)val);   \
+            return _fix_ascii_format(buffer, buf_size, decimal);        \
+	}                                                               \
+        else if (isnan(val)){                                           \
+            if (buf_size < 4) {                                         \
+                return NULL;                                            \
+            }                                                           \
+            strcpy(buffer, "nan");                                      \
+	}                                                               \
+        else {                                                          \
+            if (signbit(val)) {                                         \
+                if (buf_size < 5) {                                     \
+                    return NULL;                                        \
+                }                                                       \
+                strcpy(buffer, "-inf");                                 \
+            }                                                           \
+            else {                                                      \
+                if (buf_size < 4) {                                     \
+                    return NULL;                                        \
+                }                                                       \
+                strcpy(buffer, "inf");                                  \
+            }                                                           \
+	}                                                               \
+	return buffer;                                                  \
+    }
+
+_ASCII_FORMAT(float, f, float)
+_ASCII_FORMAT(double, d, double)
+#ifndef FORCE_NO_LONG_DOUBLE_FORMATTING
+_ASCII_FORMAT(long double, l, long double)
+#else
+_ASCII_FORMAT(long double, l, double)
+#endif
+
+
+static double NumPyOS_PINF;  /* Positive infinity */
+static double NumPyOS_PZERO; /* +0 */
+static double NumPyOS_NAN;   /* NaN */
+
+/* NumPyOS_init:
+ *
+ * initialize floating-point constants
+ *
+ * Fills in NumPyOS_PINF, NumPyOS_PZERO and NumPyOS_NAN at run time by
+ * iterating a multiplication (resp. division) until the value stops
+ * changing, then dividing the resulting infinity by itself for the NaN.
+ * The three constants are plain zero-initialized statics until this
+ * function has run.
+ */
+static void
+NumPyOS_init(void) {
+    double mul = 1e100;
+    double div = 1e10;
+    double tmp, c;
+
+    /* Multiply until the product reaches a fixed point (overflow to +inf).
+       tmp always holds the previous value; it is seeded with the start
+       value so that the first comparison never reads an indeterminate
+       variable. */
+    c = mul;
+    tmp = c;
+    for (;;) {
+        c *= mul;
+        if (c == tmp) break;
+        tmp = c;
+    }
+    NumPyOS_PINF = c;
+
+    /* Divide until the quotient reaches a fixed point (underflow to +0). */
+    c = div;
+    tmp = c;
+    for (;;) {
+        c /= div;
+        if (c == tmp) break;
+        tmp = c;
+    }
+    NumPyOS_PZERO = c;
+
+    NumPyOS_NAN = NumPyOS_PINF / NumPyOS_PINF;
+}
+
+
+/* NumPyOS_ascii_isspace:
+ *
+ * Locale-independent isspace: returns 1 for the six "C" locale whitespace
+ * characters (space, \f, \n, \r, \t, \v) and 0 for everything else.
+ */
+static int
+NumPyOS_ascii_isspace(char c)
+{
+    switch (c) {
+    case ' ':
+    case '\f':
+    case '\n':
+    case '\r':
+    case '\t':
+    case '\v':
+        return 1;
+    default:
+        return 0;
+    }
+}
+
+
+/* NumPyOS_ascii_isalpha:
+ *
+ * Locale-independent isalpha: returns 1 for 'A'..'Z' and 'a'..'z',
+ * 0 otherwise.
+ */
+static int
+NumPyOS_ascii_isalpha(char c)
+{
+    if (c >= 'a' && c <= 'z') {
+        return 1;
+    }
+    if (c >= 'A' && c <= 'Z') {
+        return 1;
+    }
+    return 0;
+}
+
+
+/* NumPyOS_ascii_isdigit:
+ *
+ * Locale-independent isdigit: returns 1 for '0'..'9', 0 otherwise.
+ */
+static int
+NumPyOS_ascii_isdigit(char c)
+{
+    /* Outside the range on either side means "not a digit". */
+    return !(c < '0' || c > '9');
+}
+
+
+/* NumPyOS_ascii_isalnum:
+ *
+ * Locale-independent isalnum: returns 1 when c is an ASCII digit or an
+ * ASCII letter, 0 otherwise.
+ */
+static int
+NumPyOS_ascii_isalnum(char c)
+{
+    if (NumPyOS_ascii_isdigit(c)) {
+        return 1;
+    }
+    return NumPyOS_ascii_isalpha(c) != 0;
+}
+
+
+/* NumPyOS_ascii_tolower:
+ *
+ * Locale-independent tolower: maps 'A'..'Z' to 'a'..'z' and returns every
+ * other character unchanged.
+ */
+static char
+NumPyOS_ascii_tolower(char c)
+{
+    const int is_upper = (c >= 'A' && c <= 'Z');
+
+    return is_upper ? c + ('a'-'A') : c;
+}
+
+
+/* NumPyOS_ascii_strncasecmp:
+ *
+ * Locale-independent strncasecmp: compares at most len characters of s1
+ * and s2, ignoring ASCII case.  Returns 0 when they are equal over that
+ * span, otherwise the difference of the first mismatching pair.
+ */
+static int
+NumPyOS_ascii_strncasecmp(const char* s1, const char* s2, size_t len)
+{
+    int delta;
+
+    for (; len > 0; ++s1, ++s2, --len) {
+        if (*s1 == '\0' || *s2 == '\0') {
+            /* One string ended before len characters were compared:
+               the raw difference decides (0 only if both ended). */
+            return ((int)*s1) - ((int)*s2);
+        }
+        delta = ((int)NumPyOS_ascii_tolower(*s1)) -
+            ((int)NumPyOS_ascii_tolower(*s2));
+        if (delta != 0) {
+            return delta;
+        }
+    }
+    return 0;
+}
+
+
+/* NumPyOS_ascii_strtod:
+ *
+ * Work around bugs in PyOS_ascii_strtod
+ *
+ *      - s: NUL-terminated input; leading ASCII whitespace is skipped
+ *      - endptr: if not NULL, receives a pointer to the first character
+ *        after the parsed number
+ *
+ * Return value: the parsed double.  For "nan" / "inf" input the value
+ * comes from NumPyOS_NAN / NumPyOS_PINF, so NumPyOS_init() must have been
+ * called beforehand for those to be meaningful.
+ */
+static double
+NumPyOS_ascii_strtod(const char *s, char** endptr)
+{
+    struct lconv *locale_data = localeconv();
+    const char *decimal_point = locale_data->decimal_point;
+    size_t decimal_point_len = strlen(decimal_point);
+
+    char buffer[FLOAT_FORMATBUFLEN+1];
+    const char *p;   /* read-only cursor into s */
+    char *end;       /* end position reported by PyOS_ascii_strtod */
+    size_t n;
+    double result;
+
+    while (NumPyOS_ascii_isspace(*s)) {
+        ++s;
+    }
+
+    /* ##1
+     *
+     * Recognize POSIX inf/nan representations on all platforms.
+     * result carries the sign (+1.0 / -1.0); it is applied to infinities
+     * only, a sign in front of "nan" is accepted but ignored.
+     */
+    p = s;
+    result = 1.0;
+    if (*p == '-') {
+        result = -1.0;
+        ++p;
+    }
+    else if (*p == '+') {
+        ++p;
+    }
+    if (NumPyOS_ascii_strncasecmp(p, "nan", 3) == 0) {
+        p += 3;
+        /* optional "(alnum_or_underscore*)" suffix; the closing
+           parenthesis is consumed only if present */
+        if (*p == '(') {
+            ++p;
+            while (NumPyOS_ascii_isalnum(*p) || *p == '_') ++p;
+            if (*p == ')') ++p;
+        }
+        /* endptr is a char** by strtod convention; the cast only drops
+           the const qualifier of the caller's own string. */
+        if (endptr != NULL) *endptr = (char *)p;
+        return NumPyOS_NAN;
+    }
+    else if (NumPyOS_ascii_strncasecmp(p, "inf", 3) == 0) {
+        p += 3;
+        if (NumPyOS_ascii_strncasecmp(p, "inity", 5) == 0)
+            p += 5;
+        if (endptr != NULL) *endptr = (char *)p;
+        return result*NumPyOS_PINF;
+    }
+    /* End of ##1 */
+
+    /* ## 2
+     *
+     * At least Python versions <= 2.5.2 and <= 2.6.1
+     *
+     * Fails to do best-efforts parsing of strings of the form "1<DP>234"
+     * where <DP> is the decimal point under the foreign locale.
+     *
+     * In that case only the part in front of <DP> (truncated to
+     * FLOAT_FORMATBUFLEN characters) is copied out and parsed.
+     */
+    if (decimal_point[0] != '.' || decimal_point[1] != 0) {
+        p = s;
+        if (*p == '+' || *p == '-')
+            ++p;
+        while (*p >= '0' && *p <= '9')
+            ++p;
+        if (strncmp(p, decimal_point, decimal_point_len) == 0) {
+            n = (size_t)(p - s);
+            if (n > FLOAT_FORMATBUFLEN)
+                n = FLOAT_FORMATBUFLEN;
+            memcpy(buffer, s, n);
+            buffer[n] = '\0';
+            result = PyOS_ascii_strtod(buffer, &end);
+            if (endptr != NULL) {
+                /* translate the offset inside buffer back into s */
+                *endptr = (char *)s + (end - buffer);
+            }
+            return result;
+        }
+    }
+    /* End of ##2 */
+
+    return PyOS_ascii_strtod(s, endptr);
+}
+
+
+/*
+ * NumPyOS_ascii_ftolf:
+ *      * fp: FILE pointer
+ *      * value: Place to store the value read
+ *
+ * Similar to PyOS_ascii_strtod, except that it reads input from a file.
+ *
+ * Similarly to fscanf, this function always consumes leading whitespace,
+ * and any text that could be the leading part in valid input.  At most
+ * FLOAT_FORMATBUFLEN characters are collected, and the first character
+ * that is not consumed is pushed back with ungetc.
+ *
+ * Return value: similar to fscanf.
+ *      * 0 if no number read,
+ *      * 1 if a number read,
+ *      * EOF if end-of-file met before reading anything (that is, when
+ *        the very first getc returns EOF).
+ *
+ * Note that *value is assigned in the 0 and 1 cases alike; it is left
+ * untouched only when EOF is returned.
+ */
+static int
+NumPyOS_ascii_ftolf(FILE *fp, double *value)
+{
+    char buffer[FLOAT_FORMATBUFLEN+1];
+    char *endp;
+    char *p;
+    int c;
+    int ok;
+
+    /*
+     * Pass on to NumPyOS_ascii_strtod the leftmost matching part in regexp
+     *
+     *     \s*[+-]? ( [0-9]*(\.[0-9]+)?([eE][+-]?[0-9]+)?
+     *              | nan  (  \([:alphanum:_]*\) )?
+     *              | inf(inity)?
+     *              )
+     *
+     * case-insensitively.  Input that stops matching part-way is still
+     * consumed up to that point and handed over as is.
+     *
+     * The "do { ... } while (0)" wrapping in most macros ensures that they
+     * behave properly eg. in "if ... else" structures.  MATCH_ZERO_OR_MORE
+     * is a bare "while" and must only be used as a full statement.
+     *
+     * All macros work on the locals c (current look-ahead character),
+     * endp (write position in buffer), p and ok, and leave via
+     * "goto buffer_filled" when the match ends.
+     */
+
+#define END_MATCH()                                                         \
+        goto buffer_filled
+
+#define NEXT_CHAR()                                                         \
+        do {                                                                \
+            if (c == EOF || endp >= buffer + FLOAT_FORMATBUFLEN)            \
+                END_MATCH();                                                \
+            *endp++ = (char)c;                                              \
+            c = getc(fp);                                                   \
+        } while (0)
+
+#define MATCH_ALPHA_STRING_NOCASE(string)                                   \
+        do {                                                                \
+            for (p=(string); *p!='\0' && (c==*p || c+('a'-'A')==*p); ++p)   \
+                NEXT_CHAR();                                                \
+            if (*p != '\0') END_MATCH();                                    \
+        } while (0)
+
+#define MATCH_ONE_OR_NONE(condition)                                        \
+        do { if (condition) NEXT_CHAR(); } while (0)
+
+#define MATCH_ONE_OR_MORE(condition)                                        \
+        do {                                                                \
+            ok = 0;                                                         \
+            while (condition) { NEXT_CHAR(); ok = 1; }                      \
+            if (!ok) END_MATCH();                                           \
+        } while (0)
+
+#define MATCH_ZERO_OR_MORE(condition)                                       \
+        while (condition) { NEXT_CHAR(); }
+    
+    /* 1. emulate fscanf EOF handling */
+    c = getc(fp);
+    if (c == EOF)
+        return EOF;
+
+    /* 2. consume leading whitespace unconditionally */
+    while (NumPyOS_ascii_isspace(c)) {
+        c = getc(fp);
+    }
+
+    /* 3. start reading matching input to buffer */
+    endp = buffer;
+
+    /* 4.1 sign (optional) */
+    MATCH_ONE_OR_NONE(c == '+' || c == '-');
+
+    /* 4.2 nan, inf, infinity; [case-insensitive] */
+    if (c == 'n' || c == 'N') {
+        NEXT_CHAR();
+        MATCH_ALPHA_STRING_NOCASE("an");
+
+        /* accept nan([:alphanum:_]*), similarly to strtod */
+        if (c == '(') {
+            NEXT_CHAR();
+            MATCH_ZERO_OR_MORE(NumPyOS_ascii_isalnum(c) || c == '_');
+            if (c == ')') NEXT_CHAR();
+        }
+        END_MATCH();
+    }
+    else if (c == 'i' || c == 'I') {
+        NEXT_CHAR();
+        MATCH_ALPHA_STRING_NOCASE("nfinity");
+        END_MATCH();
+    }
+
+    /* 4.3 mantissa */
+    MATCH_ZERO_OR_MORE(NumPyOS_ascii_isdigit(c));
+
+    if (c == '.') {
+        NEXT_CHAR();
+        MATCH_ONE_OR_MORE(NumPyOS_ascii_isdigit(c));
+    }
+
+    /* 4.4 exponent */
+    if (c == 'e' || c == 'E') {
+        NEXT_CHAR();
+        MATCH_ONE_OR_NONE(c == '+' || c == '-');
+        MATCH_ONE_OR_MORE(NumPyOS_ascii_isdigit(c));
+    }
+
+    END_MATCH();
+
+buffer_filled:
+
+    ungetc(c, fp);
+    *endp = '\0';
+
+    /* 5. try to convert buffer. */
+
+    *value = NumPyOS_ascii_strtod(buffer, &p);
+
+    return (buffer == p) ? 0 : 1; /* if something was read */
+}
+
+#undef END_MATCH
+#undef NEXT_CHAR
+#undef MATCH_ALPHA_STRING_NOCASE
+#undef MATCH_ONE_OR_NONE
+#undef MATCH_ONE_OR_MORE
+#undef MATCH_ZERO_OR_MORE
author	Pauli Virtanen <pav@iki.fi>	2009-01-12 21:22:51 +0000
committer	Pauli Virtanen <pav@iki.fi>	2009-01-12 21:22:51 +0000
commit	2dd3ad531b59ee15ef54aeb10b261956d7407d94 (patch)
tree	354632e303aef59ce12a38a9f1ca90203a1f65b4 /numpy/core/src/numpyos.c
parent	8f9ab09976f4f10801eff1e0cb5a8bbe6ba8f8ae (diff)
download	numpy-2dd3ad531b59ee15ef54aeb10b261956d7407d94.tar.gz