3K: lib: fix more str vs bytes issues in lib/io loadtxt, savetxt and genfromtxt

author: Pauli Virtanen <pav@iki.fi> 2010-02-20 18:18:18 +0000
committer: Pauli Virtanen <pav@iki.fi> 2010-02-20 18:18:18 +0000
commit: 9c77c439698e34656d21f5e13bdf12210f659735 (patch)
tree: 0a73fe08e4c31ddf9fc066c0b95486412915b097 /numpy
parent: fe8b7034708ffdf0d2efec268c9852162da56078 (diff)
download: numpy-9c77c439698e34656d21f5e13bdf12210f659735.tar.gz
5 files changed, 104 insertions, 67 deletions
diff --git a/numpy/compat/py3k.py b/numpy/compat/py3k.py
index 7af73c3d0..7357bacff 100644
--- a/numpy/compat/py3k.py
+++ b/numpy/compat/py3k.py
@@ -21,7 +21,7 @@ if sys.version_info[0] >= 3:
     def asstr(s):
         if isinstance(s, str):
             return s
-        return bytes.decode('latin1')
+        return s.decode('latin1')
     def isfileobj(f):
         return isinstance(f, io.FileIO)
     strchar = 'U'
diff --git a/numpy/lib/_iotools.py b/numpy/lib/_iotools.py
index a19852ac6..5eb4c0005 100644
--- a/numpy/lib/_iotools.py
+++ b/numpy/lib/_iotools.py
@@ -6,7 +6,7 @@ import numpy as np
 import numpy.core.numeric as nx
 from __builtin__ import bool, int, long, float, complex, object, unicode, str
 
-from numpy.compat import asbytes, bytes
+from numpy.compat import asbytes, bytes, asbytes_nested
 
 if sys.version_info[0] >= 3:
     def _bytes_to_complex(s):
@@ -542,6 +542,11 @@ class StringConverter:
     #
     def __init__(self, dtype_or_func=None, default=None, missing_values=None,
                  locked=False):
+        # Convert unicode (for Py3)
+        if isinstance(missing_values, unicode):
+            missing_values = asbytes(missing_values)
+        elif isinstance(missing_values, (list, tuple)):
+            missing_values = asbytes_nested(missing_values)
         # Defines a lock for upgrade
         self._locked = bool(locked)
         # No input dtype: minimal initialization
@@ -566,7 +571,7 @@ class StringConverter:
                 # If we don't have a default, try to guess it or set it to None
                 if default is None:
                     try:
-                        default = self.func('0')
+                        default = self.func(asbytes('0'))
                     except ValueError:
                         default = None
                 ttype = self._getsubdtype(default)
@@ -729,7 +734,7 @@ class StringConverter:
             self.type = self._getsubdtype(default)
         else:
             try:
-                tester = func('1')
+                tester = func(asbytes('1'))
             except (TypeError, ValueError):
                 tester = None
             self.type = self._getsubdtype(tester)
diff --git a/numpy/lib/io.py b/numpy/lib/io.py
index f57231c56..8233fc7a0 100644
--- a/numpy/lib/io.py
+++ b/numpy/lib/io.py
@@ -8,7 +8,7 @@ __all__ = ['savetxt', 'loadtxt',
 
 import numpy as np
 import format
-import cStringIO
+import sys
 import os
 import sys
 import itertools
@@ -24,7 +24,13 @@ from _iotools import LineSplitter, NameValidator, StringConverter, \
                      _is_string_like, has_nested_fields, flatten_dtype, \
                      easy_dtype, _bytes_to_name
 
-from numpy.compat import asbytes, asstr
+from numpy.compat import asbytes, asstr, asbytes_nested
+
+if sys.version_info[0] >= 3:
+    import io
+    BytesIO = io.BytesIO
+else:
+    from cStringIO import StringIO as BytesIO
 
 _file = open
 _string_like = _is_string_like
@@ -34,7 +40,7 @@ def seek_gzip_factory(f):
     import on gzip.
 
     """
-    import gzip, new
+    import gzip
 
     def seek(self, offset, whence=0):
         # figure out new position (we can only seek forwards)
@@ -58,8 +64,14 @@ def seek_gzip_factory(f):
     if isinstance(f, str):
         f = gzip.GzipFile(f)
 
-    f.seek = new.instancemethod(seek, f)
-    f.tell = new.instancemethod(tell, f)
+    if sys.version_info[0] >= 3:
+        import types
+        f.seek = types.MethodType(seek, f)
+        f.tell = types.MethodType(tell, f)
+    else:
+        import new
+        f.seek = new.instancemethod(seek, f)
+        f.tell = new.instancemethod(tell, f)
 
     return f
 
@@ -180,7 +192,7 @@ class NpzFile(object):
         if member:
             bytes = self.zip.read(key)
             if bytes.startswith(format.MAGIC_PREFIX):
-                value = cStringIO.StringIO(bytes)
+                value = BytesIO(bytes)
                 return format.read_array(value)
             else:
                 return bytes
@@ -474,12 +486,14 @@ def _getconv(dtype):
         return float
     elif issubclass(typ, np.complex):
         return complex
+    elif issubclass(typ, np.bytes_):
+        return bytes
     else:
         return str
 
 
 
-def loadtxt(fname, dtype=float, comments=asbytes('#'), delimiter=None,
+def loadtxt(fname, dtype=float, comments='#', delimiter=None,
             converters=None, skiprows=0, usecols=None, unpack=False):
     """
     Load data from a text file.
@@ -555,6 +569,11 @@ def loadtxt(fname, dtype=float, comments=asbytes('#'), delimiter=None,
     array([ 2.,  4.])
 
     """
+    # Type conversions for Py3 convenience
+    comments = asbytes(comments)
+    if delimiter is not None:
+        delimiter = asbytes(delimiter)
+
     user_converters = converters
 
     if usecols is not None:
@@ -768,9 +787,9 @@ def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n'):
     """
 
     # Py3 conversions first
-    if isinstance(format, bytes):
-        format = asstr(format)
-    delimiter = asbytes(delimiter)
+    if isinstance(fmt, bytes):
+        fmt = asstr(fmt)
+    delimiter = asstr(delimiter)
 
     if _is_string_like(fname):
         if fname.endswith('.gz'):
@@ -877,9 +896,9 @@ def fromregex(file, regexp, dtype):
 
     """
     if not hasattr(file, "read"):
-        file = open(file, 'r')
+        file = open(file, 'rb')
     if not hasattr(regexp, 'match'):
-        regexp = re.compile(regexp)
+        regexp = re.compile(asbytes(regexp))
     if not isinstance(dtype, np.dtype):
         dtype = np.dtype(dtype)
 
@@ -905,9 +924,9 @@ def fromregex(file, regexp, dtype):
 
 
 
-def genfromtxt(fname, dtype=float, comments=asbytes('#'), delimiter=None,
+def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
                skiprows=0, skip_header=0, skip_footer=0, converters=None,
-               missing=asbytes(''), missing_values=None, filling_values=None,
+               missing='', missing_values=None, filling_values=None,
                usecols=None, names=None, excludelist=None, deletechars=None,
                autostrip=False, case_sensitive=True, defaultfmt="f%i",
                unpack=None, usemask=False, loose=True, invalid_raise=True):
@@ -1042,6 +1061,15 @@ def genfromtxt(fname, dtype=float, comments=asbytes('#'), delimiter=None,
           dtype=[('intvar', '<i8'), ('fltvar', '<f8'), ('strvar', '|S5')])
 
     """
+    # Py3 data conversions to bytes, for convenience
+    comments = asbytes(comments)
+    if isinstance(delimiter, unicode):
+        delimiter = asbytes(delimiter)
+    if isinstance(missing, unicode):
+        missing = asbytes(missing)
+    if isinstance(missing_values, (unicode, list, tuple)):
+        missing_values = asbytes_nested(missing_values)
+
     #
     if usemask:
         from numpy.ma import MaskedArray, make_mask_descr
@@ -1182,7 +1210,7 @@ def genfromtxt(fname, dtype=float, comments=asbytes('#'), delimiter=None,
                 entry.append(value)
     # We have a string : apply it to all entries
     elif isinstance(user_missing_values, basestring):
-        user_value = user_missing_values.split(",")
+        user_value = user_missing_values.split(asbytes(","))
         for entry in missing_values:
             entry.extend(user_value)
     # We have something else: apply it to all entries
@@ -1195,7 +1223,7 @@ def genfromtxt(fname, dtype=float, comments=asbytes('#'), delimiter=None,
         warnings.warn("The use of `missing` is deprecated.\n"\
                       "Please use `missing_values` instead.",
                       DeprecationWarning)
-        values = [str(_) for _ in missing.split(",")]
+        values = [str(_) for _ in missing.split(asbytes(","))]
         for entry in missing_values:
             entry.extend(values)
 
diff --git a/numpy/lib/tests/test__iotools.py b/numpy/lib/tests/test__iotools.py
index d105cf835..7c45b3527 100644
--- a/numpy/lib/tests/test__iotools.py
+++ b/numpy/lib/tests/test__iotools.py
@@ -6,6 +6,9 @@ if sys.version_info[0] >= 3:
 else:
     from StringIO import StringIO
 
+from datetime import date
+import time
+
 import numpy as np
 from numpy.lib._iotools import LineSplitter, NameValidator, StringConverter,\
                                has_nested_fields, easy_dtype
@@ -130,6 +133,12 @@ class TestNameValidator(TestCase):
 
 #-------------------------------------------------------------------------------
 
+def _bytes_to_date(s):
+    if sys.version_info[0] >= 3:
+        return date(*time.strptime(s.decode('latin1'), "%Y-%m-%d")[:3])
+    else:
+        return date(*time.strptime(s, "%Y-%m-%d")[:3])
+
 class TestStringConverter(TestCase):
     "Test StringConverter"
     #
@@ -168,27 +177,19 @@ class TestStringConverter(TestCase):
     #
     def test_upgrademapper(self):
         "Tests updatemapper"
-        from datetime import date
-        import time
-        if sys.version_info[0] >= 3:
-            dateparser = lambda s : date(*time.strptime(s.decode('latin1'),
-                                                        "%Y-%m-%d")[:3])
-        else:
-            dateparser = lambda s : date(*time.strptime(s, "%Y-%m-%d")[:3])
+        dateparser = _bytes_to_date
         StringConverter.upgrade_mapper(dateparser, date(2000,1,1))
         convert = StringConverter(dateparser, date(2000, 1, 1))
-        test = convert('2001-01-01')
+        test = convert(asbytes('2001-01-01'))
         assert_equal(test, date(2001, 01, 01))
-        test = convert('2009-01-01')
+        test = convert(asbytes('2009-01-01'))
         assert_equal(test, date(2009, 01, 01))
-        test = convert('')
+        test = convert(asbytes(''))
         assert_equal(test, date(2000, 01, 01))
     #
     def test_string_to_object(self):
         "Make sure that string-to-object functions are properly recognized"
-        from datetime import date
-        import time
-        conv = StringConverter(lambda s: date(*(time.strptime(s)[:3])))
+        conv = StringConverter(_bytes_to_date)
         assert_equal(conv._mapper[-2][0](0), 0j)
         assert(hasattr(conv, 'default'))
     #
diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py
index dd1bfbad8..2b4d542c7 100644
--- a/numpy/lib/tests/test_io.py
+++ b/numpy/lib/tests/test_io.py
@@ -15,7 +15,7 @@ from datetime import datetime
 
 from numpy.lib._iotools import ConverterError, ConverterLockError, \
                                ConversionWarning
-from numpy.compat import asbytes
+from numpy.compat import asbytes, asbytes_nested
 
 if sys.version_info[0] >= 3:
     from io import BytesIO
@@ -31,7 +31,10 @@ def strptime(s, fmt=None):
     from Python >= 2.5.
 
     """
-    return datetime(*time.strptime(s, fmt)[:3])
+    if sys.version_info[0] >= 3:
+        return datetime(*time.strptime(s.decode('latin1'), fmt)[:3])
+    else:
+        return datetime(*time.strptime(s, fmt)[:3])
 
 class RoundtripTest(object):
     def roundtrip(self, save_func, *args, **kwargs):
@@ -175,7 +178,7 @@ class TestSaveTxt(TestCase):
         c = StringIO()
         np.savetxt(c, a, fmt='%d')
         c.seek(0)
-        assert_equal(c.readlines(), ['1 2\n', '3 4\n'])
+        assert_equal(c.readlines(), asbytes_nested(['1 2\n', '3 4\n']))
 
     def test_1D(self):
         a = np.array([1, 2, 3, 4], int)
@@ -190,7 +193,7 @@ class TestSaveTxt(TestCase):
         c = StringIO()
         np.savetxt(c, a, fmt='%d')
         c.seek(0)
-        assert_equal(c.readlines(), ['1 2\n', '3 4\n'])
+        assert_equal(c.readlines(), asbytes_nested(['1 2\n', '3 4\n']))
 
     def test_delimiter(self):
         a = np.array([[1., 2.], [3., 4.]])
@@ -205,34 +208,34 @@ class TestSaveTxt(TestCase):
         # Sequence of formats
         np.savetxt(c, a, fmt=['%02d', '%3.1f'])
         c.seek(0)
-        assert_equal(c.readlines(), ['01 2.0\n', '03 4.0\n'])
+        assert_equal(c.readlines(), asbytes_nested(['01 2.0\n', '03 4.0\n']))
 
         # A single multiformat string
         c = StringIO()
         np.savetxt(c, a, fmt='%02d : %3.1f')
         c.seek(0)
         lines = c.readlines()
-        assert_equal(lines, ['01 : 2.0\n', '03 : 4.0\n'])
+        assert_equal(lines, asbytes_nested(['01 : 2.0\n', '03 : 4.0\n']))
 
         # Specify delimiter, should be overiden
         c = StringIO()
         np.savetxt(c, a, fmt='%02d : %3.1f', delimiter=',')
         c.seek(0)
         lines = c.readlines()
-        assert_equal(lines, ['01 : 2.0\n', '03 : 4.0\n'])
+        assert_equal(lines, asbytes_nested(['01 : 2.0\n', '03 : 4.0\n']))
 
 
 class TestLoadTxt(TestCase):
     def test_record(self):
         c = StringIO()
-        c.write('1 2\n3 4')
+        c.write(asbytes('1 2\n3 4'))
         c.seek(0)
         x = np.loadtxt(c, dtype=[('x', np.int32), ('y', np.int32)])
         a = np.array([(1, 2), (3, 4)], dtype=[('x', 'i4'), ('y', 'i4')])
         assert_array_equal(x, a)
 
         d = StringIO()
-        d.write('M 64.0 75.0\nF 25.0 60.0')
+        d.write(asbytes('M 64.0 75.0\nF 25.0 60.0'))
         d.seek(0)
         mydescriptor = {'names': ('gender', 'age', 'weight'),
                         'formats': ('S1',
@@ -244,7 +247,7 @@ class TestLoadTxt(TestCase):
 
     def test_array(self):
         c = StringIO()
-        c.write('1 2\n3 4')
+        c.write(asbytes('1 2\n3 4'))
 
         c.seek(0)
         x = np.loadtxt(c, dtype=int)
@@ -258,14 +261,14 @@ class TestLoadTxt(TestCase):
 
     def test_1D(self):
         c = StringIO()
-        c.write('1\n2\n3\n4\n')
+        c.write(asbytes('1\n2\n3\n4\n'))
         c.seek(0)
         x = np.loadtxt(c, dtype=int)
         a = np.array([1, 2, 3, 4], int)
         assert_array_equal(x, a)
 
         c = StringIO()
-        c.write('1,2,3,4\n')
+        c.write(asbytes('1,2,3,4\n'))
         c.seek(0)
         x = np.loadtxt(c, dtype=int, delimiter=',')
         a = np.array([1, 2, 3, 4], int)
@@ -273,7 +276,7 @@ class TestLoadTxt(TestCase):
 
     def test_missing(self):
         c = StringIO()
-        c.write('1,2,3,,5\n')
+        c.write(asbytes('1,2,3,,5\n'))
         c.seek(0)
         x = np.loadtxt(c, dtype=int, delimiter=',', \
             converters={3:lambda s: int(s or - 999)})
@@ -282,7 +285,7 @@ class TestLoadTxt(TestCase):
 
     def test_converters_with_usecols(self):
         c = StringIO()
-        c.write('1,2,3,,5\n6,7,8,9,10\n')
+        c.write(asbytes('1,2,3,,5\n6,7,8,9,10\n'))
         c.seek(0)
         x = np.loadtxt(c, dtype=int, delimiter=',', \
             converters={3:lambda s: int(s or - 999)}, \
@@ -292,7 +295,7 @@ class TestLoadTxt(TestCase):
 
     def test_comments(self):
         c = StringIO()
-        c.write('# comment\n1,2,3,5\n')
+        c.write(asbytes('# comment\n1,2,3,5\n'))
         c.seek(0)
         x = np.loadtxt(c, dtype=int, delimiter=',', \
             comments='#')
@@ -301,7 +304,7 @@ class TestLoadTxt(TestCase):
 
     def test_skiprows(self):
         c = StringIO()
-        c.write('comment\n1,2,3,5\n')
+        c.write(asbytes('comment\n1,2,3,5\n'))
         c.seek(0)
         x = np.loadtxt(c, dtype=int, delimiter=',', \
             skiprows=1)
@@ -309,7 +312,7 @@ class TestLoadTxt(TestCase):
         assert_array_equal(x, a)
 
         c = StringIO()
-        c.write('# comment\n1,2,3,5\n')
+        c.write(asbytes('# comment\n1,2,3,5\n'))
         c.seek(0)
         x = np.loadtxt(c, dtype=int, delimiter=',', \
             skiprows=1)
@@ -344,12 +347,12 @@ class TestLoadTxt(TestCase):
         names = ['stid', 'temp']
         dtypes = ['S4', 'f8']
         arr = np.loadtxt(c, usecols=(0, 2), dtype=zip(names, dtypes))
-        assert_equal(arr['stid'], ["JOE", "BOB"])
+        assert_equal(arr['stid'], asbytes_nested(["JOE", "BOB"]))
         assert_equal(arr['temp'], [25.3, 27.9])
 
     def test_fancy_dtype(self):
         c = StringIO()
-        c.write('1,2,3.0\n4,5,6.0\n')
+        c.write(asbytes('1,2,3.0\n4,5,6.0\n'))
         c.seek(0)
         dt = np.dtype([('x', int), ('y', [('t', int), ('s', float)])])
         x = np.loadtxt(c, dtype=dt, delimiter=',')
@@ -371,7 +374,7 @@ class TestLoadTxt(TestCase):
 
     def test_unused_converter(self):
         c = StringIO()
-        c.writelines(['1 21\n', '3 42\n'])
+        c.writelines([asbytes('1 21\n'), asbytes('3 42\n')])
         c.seek(0)
         data = np.loadtxt(c, usecols=(1,),
                           converters={0: lambda s: int(s, 16)})
@@ -404,7 +407,7 @@ class TestLoadTxt(TestCase):
 class Testfromregex(TestCase):
     def test_record(self):
         c = StringIO()
-        c.write('1.312 foo\n1.534 bar\n4.444 qux')
+        c.write(asbytes('1.312 foo\n1.534 bar\n4.444 qux'))
         c.seek(0)
 
         dt = [('num', np.float64), ('val', 'S3')]
@@ -415,7 +418,7 @@ class Testfromregex(TestCase):
 
     def test_record_2(self):
         c = StringIO()
-        c.write('1312 foo\n1534 bar\n4444 qux')
+        c.write(asbytes('1312 foo\n1534 bar\n4444 qux'))
         c.seek(0)
 
         dt = [('num', np.int32), ('val', 'S3')]
@@ -426,7 +429,7 @@ class Testfromregex(TestCase):
 
     def test_record_3(self):
         c = StringIO()
-        c.write('1312 foo\n1534 bar\n4444 qux')
+        c.write(asbytes('1312 foo\n1534 bar\n4444 qux'))
         c.seek(0)
 
         dt = [('num', np.float64)]
@@ -521,7 +524,7 @@ class TestFromTxt(TestCase):
         "Test retrieving a header"
         data = StringIO('gender age weight\nM 64.0 75.0\nF 25.0 60.0')
         test = np.ndfromtxt(data, dtype=None, names=True)
-        control = {'gender': np.array(['M', 'F']),
+        control = {'gender': np.array(asbytes_nested(['M', 'F'])),
                    'age': np.array([64.0, 25.0]),
                    'weight': np.array([75.0, 60.0])}
         assert_equal(test['gender'], control['gender'])
@@ -532,7 +535,7 @@ class TestFromTxt(TestCase):
         "Test the automatic definition of the output dtype"
         data = StringIO('A 64 75.0 3+4j True\nBCD 25 60.0 5+6j False')
         test = np.ndfromtxt(data, dtype=None)
-        control = [np.array(['A', 'BCD']),
+        control = [np.array(asbytes_nested(['A', 'BCD'])),
                    np.array([64, 25]),
                    np.array([75.0, 60.0]),
                    np.array([3 + 4j, 5 + 6j]),
@@ -649,10 +652,10 @@ M   33  21.99
 
 
     def test_invalid_converter(self):
-        strip_rand = lambda x : float(('r' in x.lower() and x.split()[-1]) or
-                                      (not 'r' in x.lower() and x.strip() or 0.0))
-        strip_per = lambda x : float(('%' in x.lower() and x.split()[0]) or
-                                     (not '%' in x.lower() and x.strip() or 0.0))
+        strip_rand = lambda x : float((asbytes('r') in x.lower() and x.split()[-1]) or
+                                      (not asbytes('r') in x.lower() and x.strip() or 0.0))
+        strip_per = lambda x : float((asbytes('%') in x.lower() and x.split()[0]) or
+                                     (not asbytes('%') in x.lower() and x.strip() or 0.0))
         s = StringIO("D01N01,10/1/2003 ,1 %,R 75,400,600\r\n" \
                               "L24U05,12/5/2003, 2 %,1,300, 150.5\r\n"
                               "D02N03,10/10/2004,R 1,,7,145.55")
@@ -678,10 +681,10 @@ M   33  21.99
         "Test using an explicit dtype with an object"
         from datetime import date
         import time
-        data = """
+        data = asbytes("""
         1; 2001-01-01
         2; 2002-01-31
-        """
+        """)
         ndtype = [('idx', int), ('code', np.object)]
         func = lambda s: strptime(s.strip(), "%Y-%m-%d")
         converters = {1: func}
@@ -775,7 +778,7 @@ M   33  21.99
         names = ['stid', 'temp']
         dtypes = ['S4', 'f8']
         test = np.ndfromtxt(data, usecols=(0, 2), dtype=zip(names, dtypes))
-        assert_equal(test['stid'], ["JOE", "BOB"])
+        assert_equal(test['stid'], asbytes_nested(["JOE", "BOB"]))
         assert_equal(test['temp'], [25.3, 27.9])
 
     def test_usecols_with_integer(self):
@@ -1153,7 +1156,7 @@ def test_gzip_loadtxt():
     # which is then read from by the loadtxt function
     s = StringIO()
     g = gzip.GzipFile(fileobj=s, mode='w')
-    g.write('1 2 3\n')
+    g.write(asbytes('1 2 3\n'))
     g.close()
     s.seek(0)
 
@@ -1169,7 +1172,7 @@ def test_gzip_loadtxt():
 def test_gzip_loadtxt_from_string():
     s = StringIO()
     f = gzip.GzipFile(fileobj=s, mode="w")
-    f.write('1 2 3\n')
+    f.write(asbytes('1 2 3\n'))
     f.close()
     s.seek(0)
author	Pauli Virtanen <pav@iki.fi>	2010-02-20 18:18:18 +0000
committer	Pauli Virtanen <pav@iki.fi>	2010-02-20 18:18:18 +0000
commit	9c77c439698e34656d21f5e13bdf12210f659735 (patch)
tree	0a73fe08e4c31ddf9fc066c0b95486412915b097 /numpy
parent	fe8b7034708ffdf0d2efec268c9852162da56078 (diff)
download	numpy-9c77c439698e34656d21f5e13bdf12210f659735.tar.gz