BUG: genfromtxt gave OverflowError for large integers

Fix StringConverter to avoid an OverflowError in genfromtxt. Before, int(2**66) would work (and return a ‘long’), but np.array([2**66], dtype=np.integer) would then fail and raise an OverflowError, which would propagate to genfromtxt. This commit fixes this by testing in advance whether an OverflowError will occur. In addition, this adds an explicit np.int64 entry on systems where integer means int32. Values larger than 2**63-1 will be cast as float. This includes a regression test and adds an entry to the release notes.
author: Thomas Robitaille <thomas.robitaille@gmail.com> 2015-03-05 23:44:30 +0100
committer: Thomas Robitaille <thomas.robitaille@gmail.com> 2015-03-06 20:59:09 +0100
commit: 2c3ef4cbe8af6fddd82d7e90433f92b23cbcdd37 (patch)
tree: 88756495a1d32e4a55cd7edc943cf4cf8d32d399
parent: 4cba5310c7b8d1a3aab7202209d238f569a8f9ff (diff)
download: numpy-2c3ef4cbe8af6fddd82d7e90433f92b23cbcdd37.tar.gz
4 files changed, 76 insertions, 10 deletions
diff --git a/doc/release/1.10.0-notes.rst b/doc/release/1.10.0-notes.rst
index a07dca80f..36b0c38fb 100644
--- a/doc/release/1.10.0-notes.rst
+++ b/doc/release/1.10.0-notes.rst
@@ -162,6 +162,12 @@ what was provided by *np.allclose*.
 compare NaNs as equal by setting ``equal_nan=True``. Subclasses, such as
 *np.ma.MaskedArray*, are also preserved now.
 
+*np.genfromtxt* now handles large integers correctly
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*np.genfromtxt* now correctly handles integers larger than ``2**31-1`` on
+32-bit systems and larger than ``2**63-1`` on 64-bit systems (it previously
+crashed with an ``OverflowError`` in these cases). Integers larger than
+``2**63-1`` are converted to floating-point values.
 
 Changes
 =======
diff --git a/numpy/lib/_iotools.py b/numpy/lib/_iotools.py
index 54ed96490..44bd48df7 100644
--- a/numpy/lib/_iotools.py
+++ b/numpy/lib/_iotools.py
@@ -518,12 +518,18 @@ class StringConverter(object):
     """
     #
     _mapper = [(nx.bool_, str2bool, False),
-               (nx.integer, int, -1),
-               (nx.floating, float, nx.nan),
-               (complex, _bytes_to_complex, nx.nan + 0j),
-               (nx.string_, bytes, asbytes('???'))]
+               (nx.integer, int, -1)]
+
+    # On 32-bit systems, we need to make sure that we explicitly include
+    # nx.int64 since nx.integer is nx.int32.
+    if nx.dtype(nx.integer).itemsize < nx.dtype(nx.int64).itemsize:
+        _mapper.append((nx.int64, int, -1))
+
+    _mapper.extend([(nx.floating, float, nx.nan),
+                    (complex, _bytes_to_complex, nx.nan + 0j),
+                    (nx.string_, bytes, asbytes('???'))])
+
     (_defaulttype, _defaultfunc, _defaultfill) = zip(*_mapper)
-    #
 
     @classmethod
     def _getdtype(cls, val):
@@ -677,7 +683,22 @@ class StringConverter(object):
 
     def _strict_call(self, value):
         try:
-            return self.func(value)
+
+            # We check if we can convert the value using the current function
+            new_value = self.func(value)
+
+            # In addition to having to check whether func can convert the
+            # value, we also have to make sure that we don't get overflow
+            # errors for integers.
+            if self.func is int:
+                try:
+                    np.array(value, dtype=self.type)
+                except OverflowError:
+                    raise ValueError
+
+            # We're still here so we can now return the new value
+            return new_value
+
         except ValueError:
             if value.strip() in self.missing_values:
                 if not self._status:
diff --git a/numpy/lib/tests/test__iotools.py b/numpy/lib/tests/test__iotools.py
index 060f815d5..e0a917a21 100644
--- a/numpy/lib/tests/test__iotools.py
+++ b/numpy/lib/tests/test__iotools.py
@@ -152,17 +152,31 @@ class TestStringConverter(TestCase):
 
     def test_upgrade(self):
         "Tests the upgrade method."
+
         converter = StringConverter()
         assert_equal(converter._status, 0)
+
         # test int
         assert_equal(converter.upgrade(asbytes('0')), 0)
         assert_equal(converter._status, 1)
+
+        # On systems where integer defaults to 32-bit, the statuses will be
+        # offset by one, so we check for this here.
+        import numpy.core.numeric as nx
+        status_offset = int(nx.dtype(nx.integer).itemsize < nx.dtype(nx.int64).itemsize)
+
+        # test int > 2**32
+        assert_equal(converter.upgrade(asbytes('17179869184')), 17179869184)
+        assert_equal(converter._status, 1 + status_offset)
+
         # test float
         assert_allclose(converter.upgrade(asbytes('0.')), 0.0)
-        assert_equal(converter._status, 2)
+        assert_equal(converter._status, 2 + status_offset)
+
         # test complex
         assert_equal(converter.upgrade(asbytes('0j')), complex('0j'))
-        assert_equal(converter._status, 3)
+        assert_equal(converter._status, 3 + status_offset)
+
         # test str
         assert_equal(converter.upgrade(asbytes('a')), asbytes('a'))
         assert_equal(converter._status, len(converter._mapper) - 1)
diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py
index 7054ab1fe..2598a6cfb 100644
--- a/numpy/lib/tests/test_io.py
+++ b/numpy/lib/tests/test_io.py
@@ -18,7 +18,7 @@ from numpy.lib._iotools import (ConverterError, ConverterLockError,
 from numpy.compat import asbytes, asbytes_nested, bytes, asstr
 from nose import SkipTest
 from numpy.ma.testutils import (
-    TestCase, assert_equal, assert_array_equal,
+    TestCase, assert_equal, assert_array_equal, assert_allclose,
     assert_raises, assert_raises_regex, run_module_suite
 )
 from numpy.testing import assert_warns, assert_, build_err_msg
@@ -216,7 +216,7 @@ class TestSavezLoad(RoundtripTest, TestCase):
         l = np.load(c)
         assert_equal(a, l['file_a'])
         assert_equal(b, l['file_b'])
-    
+
     def test_BagObj(self):
         a = np.array([[1, 2], [3, 4]], float)
         b = np.array([[1 + 2j, 2 + 7j], [3 - 6j, 4 + 12j]], complex)
@@ -1762,6 +1762,31 @@ M   33  21.99
         res = np.genfromtxt(count())
         assert_array_equal(res, np.arange(10))
 
+    def test_auto_dtype_largeint(self):
+        """
+        Regression test for numpy/numpy#5635 whereby large integers could
+        cause OverflowErrors.
+        """
+        "Test the automatic definition of the output dtype"
+
+        # 2**66 = 73786976294838206464 => should convert to float
+        # 2**34 = 17179869184 => should convert to int64
+        # 2**10 = 1024 => should convert to int (int32 on 32-bit systems,
+        #                 int64 on 64-bit systems)
+
+        data = TextIO('73786976294838206464 17179869184 1024')
+
+        test = np.ndfromtxt(data, dtype=None)
+
+        assert_equal(test.dtype.names, ['f0', 'f1', 'f2'])
+
+        assert test.dtype['f0'] == np.float
+        assert test.dtype['f1'] == np.int64
+        assert test.dtype['f2'] == np.integer
+
+        assert_allclose(test['f0'], 73786976294838206464.)
+        assert_equal(test['f1'], 17179869184)
+        assert_equal(test['f2'], 1024)
 
 def test_gzip_load():
     a = np.random.random((5, 5))
author	Thomas Robitaille <thomas.robitaille@gmail.com>	2015-03-05 23:44:30 +0100
committer	Thomas Robitaille <thomas.robitaille@gmail.com>	2015-03-06 20:59:09 +0100
commit	2c3ef4cbe8af6fddd82d7e90433f92b23cbcdd37 (patch)
tree	88756495a1d32e4a55cd7edc943cf4cf8d32d399
parent	4cba5310c7b8d1a3aab7202209d238f569a8f9ff (diff)
download	numpy-2c3ef4cbe8af6fddd82d7e90433f92b23cbcdd37.tar.gz