| author | pierregm <pierregm@localhost> | 2009-10-12 04:45:31 +0000 |
|---|---|---|
| committer | pierregm <pierregm@localhost> | 2009-10-12 04:45:31 +0000 |
| commit | 80851e34d2955a331cecb9f50d2287d33618dd3e (patch) | |
| tree | a1c2ea17ef9f9192b1ad7839c31d7f73bc579bfe | |
| parent | 295e24af415bb49f12a632f0e27fb9e2a1099ea2 (diff) | |
| download | numpy-80851e34d2955a331cecb9f50d2287d33618dd3e.tar.gz | |
* _iotools.StringConverter
- prevent an explicit default from being overwritten during upgrade
* io.genfromtxt (usage sketch below)
- deprecate `skiprows` in favor of `skip_header`
- deprecate `missing` in favor of `missing_values`
- `missing_values` can now be a sequence
- add support for `filling_values`
* fixed ticket #1257
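Below is a minimal usage sketch of the genfromtxt changes listed above (skip_header, missing_values as a string/sequence/dict, filling_values). The data and values are made up for illustration; the sketch follows the Python 2 / StringIO idiom used in the tests in this patch, and the exact result dtype depends on what dtype=None infers from the data.

```python
import StringIO
import numpy as np

# Made-up input: one junk line to skip, a header row, and two holes
# ('N/A' in column b, an empty trailing field in column c).
data = "junk to skip\na, b, c\n1, 2, 3\n4, N/A, 6\n7, 8, \n"

test = np.genfromtxt(StringIO.StringIO(data),
                     delimiter=",", names=True, dtype=None,
                     skip_header=1,         # replaces the deprecated `skiprows`
                     missing_values="N/A",  # replaces the deprecated `missing`;
                                            # may also be a sequence or a dict
                     filling_values=-999)   # substituted wherever a value is missing
```

The masked variants (np.mafromtxt, or usemask=True) report the same cells as masked instead of filled, which is what the reworked test_user_missing_values test below checks.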
| mode | path | lines changed |
|---|---|---|
| -rw-r--r-- | numpy/lib/_iotools.py | 29 |
| -rw-r--r-- | numpy/lib/io.py | 180 |
| -rw-r--r-- | numpy/lib/tests/test__iotools.py | 12 |
| -rw-r--r-- | numpy/lib/tests/test_io.py | 62 |

4 files changed, 208 insertions, 75 deletions
```diff
diff --git a/numpy/lib/_iotools.py b/numpy/lib/_iotools.py
index 9e8bcce2a..398ed07a4 100644
--- a/numpy/lib/_iotools.py
+++ b/numpy/lib/_iotools.py
@@ -557,6 +557,7 @@ class StringConverter:
         self._callingfunction = self._strict_call
         self.type = ttype
         self._checked = False
+        self._initial_default = default
     #
     def _loose_call(self, value):
         try:
@@ -608,12 +609,18 @@ class StringConverter:
             raise ConverterLockError(errmsg)
         _statusmax = len(self._mapper)
         # Complains if we try to upgrade by the maximum
-        if self._status == _statusmax:
+        _status = self._status
+        if _status == _statusmax:
             errmsg = "Could not find a valid conversion function"
             raise ConverterError(errmsg)
-        elif self._status < _statusmax - 1:
-            self._status += 1
-            (self.type, self.func, self.default) = self._mapper[self._status]
+        elif _status < _statusmax - 1:
+            _status += 1
+            (self.type, self.func, default) = self._mapper[_status]
+            self._status = _status
+            if self._initial_default is not None:
+                self.default = self._initial_default
+            else:
+                self.default = default
             self.upgrade(value)

     def iterupgrade(self, value):
@@ -630,11 +637,17 @@ class StringConverter:
             raise ConverterLockError(errmsg)
         _statusmax = len(self._mapper)
         # Complains if we try to upgrade by the maximum
-        if self._status == _statusmax:
+        _status = self._status
+        if _status == _statusmax:
             raise ConverterError("Could not find a valid conversion function")
-        elif self._status < _statusmax - 1:
-            self._status += 1
-            (self.type, self.func, self.default) = self._mapper[self._status]
+        elif _status < _statusmax - 1:
+            _status += 1
+            (self.type, self.func, default) = self._mapper[_status]
+            if self._initial_default is not None:
+                self.default = self._initial_default
+            else:
+                self.default = default
+            self._status = _status
             self.iterupgrade(value)

     def update(self, func, default=None, missing_values='', locked=False):
diff --git a/numpy/lib/io.py b/numpy/lib/io.py
index 239d0808e..3618e1111 100644
--- a/numpy/lib/io.py
+++ b/numpy/lib/io.py
@@ -870,11 +870,12 @@ def fromregex(file, regexp, dtype):



-def genfromtxt(fname, dtype=float, comments='#', delimiter=None, skiprows=0,
-               converters=None, missing='', missing_values=None, usecols=None,
-               names=None, excludelist=None, deletechars=None, autostrip=False,
-               case_sensitive=True, defaultfmt="f%i", unpack=None,
-               usemask=False, loose=True, invalid_raise=True):
+def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
+               skiprows=0, skip_header=0, skip_footer=0, converters=None,
+               missing='', missing_values=None, filling_values=None,
+               usecols=None, names=None, excludelist=None, deletechars=None,
+               autostrip=False, case_sensitive=True, defaultfmt="f%i",
+               unpack=None, usemask=False, loose=True, invalid_raise=True):
     """
     Load data from a text file, with missing values handled as specified.

@@ -1021,13 +1022,6 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, skiprows=0,
         errmsg = "The input argument 'converter' should be a valid dictionary "\
                  "(got '%s' instead)"
         raise TypeError(errmsg % type(user_converters))
-    # Check the input dictionary of missing values
-    user_missing_values = missing_values or {}
-    if not isinstance(user_missing_values, dict):
-        errmsg = "The input argument 'missing_values' should be a valid "\
-                 "dictionary (got '%s' instead)"
-        raise TypeError(errmsg % type(missing_values))
-    defmissing = [_.strip() for _ in missing.split(',')] + ['']

     # Initialize the filehandle, the LineSplitter and the NameValidator
     if isinstance(fname, basestring):
@@ -1043,18 +1037,26 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, skiprows=0,
                           deletechars=deletechars,
                           case_sensitive=case_sensitive)
-    # Get the first valid lines after the first skiprows ones
-    for i in xrange(skiprows):
+    # Get the first valid lines after the first skiprows ones ..
+    if skiprows:
+        warnings.warn("The use of `skiprows` is deprecated.\n"\
+                      "Please use `skip_header` instead.",
+                      DeprecationWarning)
+        skip_header = skiprows
+    # Skip the first `skip_header` rows
+    for i in xrange(skip_header):
         fhd.readline()
+    # Keep on until we find the first valid values
     first_values = None
     while not first_values:
         first_line = fhd.readline()
         if first_line == '':
             raise IOError('End-of-file reached before encountering data.')
         if names is True:
-            first_values = first_line.strip().split(delimiter)
-        else:
-            first_values = split_line(first_line)
+            if comments in first_line:
+                first_line = ''.join(first_line.split(comments)[1])
+        first_values = split_line(first_line)
+
     # Should we take the first values as names ?
     if names is True:
         fval = first_values[0].strip()
         if fval in comments:
@@ -1073,6 +1075,8 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, skiprows=0,
         names = validate_names([_.strip() for _ in names.split(',')])
     elif names:
         names = validate_names(names)
+    if names is not None:
+        names = list(names)
     # Get the dtype
     if dtype is not None:
         dtype = easy_dtype(dtype, defaultfmt=defaultfmt, names=names)
@@ -1083,48 +1087,114 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, skiprows=0,
             if _is_string_like(current):
                 usecols[i] = names.index(current)

-    # If user_missing_values has names as keys, transform them to indices
-    missing = {}
-    for (key, val) in user_missing_values.iteritems():
-        # If val is a list, flatten it. In any case, add missing &'' to the list
-        if isinstance(val, (list, tuple)):
-            val = [str(_) for _ in val]
-        else:
-            val = [str(val), ]
-        val.extend(defmissing)
-        if _is_string_like(key):
-            try:
-                missing[names.index(key)] = val
-            except ValueError:
-                pass
+    # Process the missing values ...............................
+    # Rename missing_values for convenience
+    user_missing_values = missing_values or ()
+
+    # Define the list of missing_values (one column: one list)
+    missing_values = [list(['']) for _ in range(nbcols)]
+
+    # We have a dictionary: process it field by field
+    if isinstance(user_missing_values, dict):
+        # Loop on the items
+        for (key, val) in user_missing_values.items():
+            # Make sure the key is an index
+            if _is_string_like(key):
+                key = names.index(key)
+            # Redefine the key as needed if it's a column number
+            if usecols:
+                try:
+                    key = usecols.index(key)
+                except ValueError:
+                    pass
+            # Transform the value as a list of string
+            if isinstance(val, (list, tuple)):
+                val = [str(_) for _ in val]
+            else:
+                val = [str(val),]
+            # Add the value(s) to the current list of missing
+            if key is None:
+                # None acts as default
+                for miss in missing_values:
+                    miss.extend(val)
+            else:
+                missing_values[key].extend(val)
+    # We have a sequence : each item matches a column
+    elif isinstance(user_missing_values, (list, tuple)):
+        for (value, entry) in zip(user_missing_values, missing_values):
+            value = str(value)
+            if value not in entry:
+                entry.append(value)
+    # We have a string : apply it to all entries
+    elif isinstance(user_missing_values, basestring):
+        user_value = user_missing_values.split(",")
+        for entry in missing_values:
+            entry.extend(user_value)
+    # We have something else: apply it to all entries
+    else:
+        for entry in missing_values:
+            entry.extend([str(user_missing_values)])
+
+    # Process the deprecated `missing`
+    if missing != '':
+        warnings.warn("The use of `missing` is deprecated.\n"\
+                      "Please use `missing_values` instead.",
+                      DeprecationWarning)
+        values = [str(_) for _ in missing.split(",")]
+        for entry in missing_values:
+            entry.extend(values)
+
+    # Process the filling_values ...............................
+    # Rename the input for convenience
+    user_filling_values = filling_values or []
+    # Define the default
+    filling_values = [None] * nbcols
+    # We have a dictionary : update each entry individually
+    if isinstance(user_filling_values, dict):
+        for (key, val) in user_filling_values.items():
+            # Make sure the key is an index
+            if _is_string_like(key):
+                key = names.index(key)
+            # Redefine the key if it's a column number and usecols is defined
+            if usecols:
+                try:
+                    key = usecols.index(key)
+                except ValueError:
+                    pass
+            # Add the value to the list
+            filling_values[key] = val
+    # We have a sequence : update on a one-to-one basis
+    elif isinstance(user_filling_values, (list, tuple)):
+        n = len(user_filling_values)
+        if (n <= nbcols):
+            filling_values[:n] = user_filling_values
         else:
-            missing[key] = val
-
+            filling_values = user_filling_values[:nbcols]
+    # We have something else : use it for all entries
+    else:
+        filling_values = [user_filling_values] * nbcols

-    # Initialize the default converters
+    # Initialize the converters ................................
     if dtype is None:
         # Note: we can't use a [...]*nbcols, as we would have 3 times the same
         # ... converter, instead of 3 different converters.
-        converters = [StringConverter(None,
-                                      missing_values=missing.get(_, defmissing))
-                      for _ in range(nbcols)]
+        converters = [StringConverter(None, missing_values=miss, default=fill)
+                      for (miss, fill) in zip(missing_values, filling_values)]
     else:
         dtype_flat = flatten_dtype(dtype, flatten_base=True)
         # Initialize the converters
         if len(dtype_flat) > 1:
             # Flexible type : get a converter from each dtype
-            converters = [StringConverter(dt,
-                                          missing_values=missing.get(i, defmissing),
-                                          locked=True)
-                          for (i, dt) in enumerate(dtype_flat)]
+            zipit = zip(dtype_flat, missing_values, filling_values)
+            converters = [StringConverter(dt, locked=True,
+                                          missing_values=miss, default=fill)
+                          for (dt, miss, fill) in zipit]
         else:
             # Set to a default converter (but w/ different missing values)
-            converters = [StringConverter(dtype,
-                                          missing_values=missing.get(_, defmissing),
-                                          locked=True)
-                          for _ in range(nbcols)]
-    missing = [_.missing_values for _ in converters]
-
+            zipit = zip(missing_values, filling_values)
+            converters = [StringConverter(dtype, locked=True,
+                                          missing_values=miss, default=fill)
+                          for (miss, fill) in zipit]
     # Update the converters to use the user-defined ones
     uc_update = []
     for (i, conv) in user_converters.items():
@@ -1137,13 +1207,15 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, skiprows=0,
         except ValueError:
             # Unused converter specified
             continue
-        converters[i].update(conv, default=None,
-                             missing_values=missing[i],
-                             locked=True)
+        converters[i].update(conv, locked=True,
+                             default=filling_values[i],
+                             missing_values=missing_values[i],)
         uc_update.append((i, conv))
     # Make sure we have the corrected keys in user_converters...
     user_converters.update(uc_update)

+    miss_chars = [_.missing_values for _ in converters]
+
     # Reset the names to match the usecols
     if (not first_line) and usecols:
         names = [names[_] for _ in usecols]
@@ -1180,8 +1252,8 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, skiprows=0,
             # Store the values
             append_to_rows(tuple(values))
             if usemask:
-                append_to_masks(tuple([val.strip() in mss
-                                       for (val, mss) in zip(values, missing)]))
+                append_to_masks(tuple([v.strip() in m
+                                       for (v, m) in zip(values, missing_values)]))

     # Upgrade the converters (if needed)
     if dtype is None:
@@ -1197,7 +1269,7 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, skiprows=0,
                     converter.upgrade(value)
                 except (ConverterError, ValueError):
                     errmsg += "(occurred line #%i for value '%s')"
-                    errmsg %= (j + 1 + skiprows, value)
+                    errmsg %= (j + 1 + skip_header, value)
                     raise ConverterError(errmsg)

     # Check that we don't have invalid values
@@ -1271,7 +1343,7 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, skiprows=0,
         if 'O' in (_.char for _ in dtype_flat):
             if has_nested_fields(dtype):
                 errmsg = "Nested fields involving objects "\
-                        "are not supported..."
+                         "are not supported..."
                 raise NotImplementedError(errmsg)
             else:
                 output = np.array(data, dtype=dtype)
diff --git a/numpy/lib/tests/test__iotools.py b/numpy/lib/tests/test__iotools.py
index c16491aee..ed0f4dc63 100644
--- a/numpy/lib/tests/test__iotools.py
+++ b/numpy/lib/tests/test__iotools.py
@@ -179,6 +179,18 @@ class TestStringConverter(TestCase):
         conv = StringConverter(lambda s: date(*(time.strptime(s)[:3])))
         assert_equal(conv._mapper[-2][0](0), 0j)
         assert(hasattr(conv, 'default'))
+    #
+    def test_keep_default(self):
+        "Make sure we don't lose an explicit default"
+        converter = StringConverter(None, missing_values='', default=-999)
+        converter.upgrade('3.14159265')
+        assert_equal(converter.default, -999)
+        assert_equal(converter.type, np.dtype(float))
+        #
+        converter = StringConverter(None, missing_values='', default=0)
+        converter.upgrade('3.14159265')
+        assert_equal(converter.default, 0)
+        assert_equal(converter.type, np.dtype(float))


#-------------------------------------------------------------------------------
diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py
index 5e0d666c2..ce62a83db 100644
--- a/numpy/lib/tests/test_io.py
+++ b/numpy/lib/tests/test_io.py
@@ -781,20 +781,18 @@ M 33 21.99


     def test_user_missing_values(self):
-        datastr ="A, B, C\n0, 0., 0j\n1, N/A, 1j\n-9, 2.2, N/A\n3, -99, 3j"
-        data = StringIO.StringIO(datastr)
-        basekwargs = dict(dtype=None, delimiter=',', names=True, missing='N/A')
+        data = "A, B, C\n0, 0., 0j\n1, N/A, 1j\n-9, 2.2, N/A\n3, -99, 3j"
+        basekwargs = dict(dtype=None, delimiter=",", names=True, missing="N/A")
         mdtype = [('A', int), ('B', float), ('C', complex)]
         #
-        test = np.mafromtxt(data, **basekwargs)
+        test = np.mafromtxt(StringIO.StringIO(data), **basekwargs)
         control = ma.array([( 0, 0.0, 0j), (1, -999, 1j),
                             ( -9, 2.2, -999j), (3, -99, 3j)],
                            mask=[(0, 0, 0), (0, 1, 0), (0, 0, 1), (0, 0, 0)],
                            dtype=mdtype)
         assert_equal(test, control)
         #
-        data.seek(0)
-        test = np.mafromtxt(data,
+        test = np.mafromtxt(StringIO.StringIO(data),
                             missing_values={0:-9, 1:-99, 2:-999j}, **basekwargs)
         control = ma.array([( 0, 0.0, 0j), (1, -999, 1j),
                             ( -9, 2.2, -999j), (3, -99, 3j)],
@@ -802,8 +800,7 @@ M 33 21.99
                            dtype=mdtype)
         assert_equal(test, control)
         #
-        data.seek(0)
-        test = np.mafromtxt(data,
+        test = np.mafromtxt(StringIO.StringIO(data),
                             missing_values={0:-9, 'B':-99, 'C':-999j},
                             **basekwargs)
         control = ma.array([( 0, 0.0, 0j), (1, -999, 1j),
@@ -885,7 +882,7 @@ M 33 21.99
         data = ["1, 1, 1, 1, -1.1"] * 50
         mdata = StringIO.StringIO("\n".join(data))

-        converters = {4: lambda x:np.sqrt(float(x))}
+        converters = {4: lambda x:"(%s)" % x}
         kwargs = dict(delimiter=",", converters=converters,
                       dtype=[(_, int) for _ in 'abcde'],)
         assert_raises(TypeError, np.genfromtxt, mdata, **kwargs)
@@ -948,6 +945,44 @@ M 33 21.99
                         dtype=[('f0', '|S10'), ('f1', float), ('f2', '|S5')])
         assert_equal(mtest, ctrl)

+    def test_incomplete_names(self):
+        "Test w/ incomplete names"
+        data = "A,,C\n0,1,2\n3,4,5"
+        kwargs = dict(delimiter=",", names=True)
+        # w/ dtype=None
+        ctrl = np.array([(0, 1, 2), (3, 4, 5)],
+                        dtype=[(_, int) for _ in ('A', 'f0', 'C')])
+        test = np.ndfromtxt(StringIO.StringIO(data), dtype=None, **kwargs)
+        assert_equal(test, ctrl)
+        # w/ default dtype
+        ctrl = np.array([(0, 1, 2), (3, 4, 5)],
+                        dtype=[(_, float) for _ in ('A', 'f0', 'C')])
+        test = np.ndfromtxt(StringIO.StringIO(data), **kwargs)
+
+
+    def test_fixed_width_names(self):
+        "Test fix-width w/ names"
+        data = "    A    B   C\n    0    1 2.3\n   45   67   9."
+        kwargs = dict(delimiter=(5, 5, 4), names=True, dtype=None)
+        ctrl = np.array([(0, 1, 2.3), (45, 67, 9.)],
+                        dtype=[('A', int), ('B', int), ('C', float)])
+        test = np.ndfromtxt(StringIO.StringIO(data), **kwargs)
+        assert_equal(test, ctrl)
+        #
+        kwargs = dict(delimiter=5, names=True, dtype=None)
+        ctrl = np.array([(0, 1, 2.3), (45, 67, 9.)],
+                        dtype=[('A', int), ('B', int), ('C', float)])
+        test = np.ndfromtxt(StringIO.StringIO(data), **kwargs)
+        assert_equal(test, ctrl)
+
+    def test_filling_values(self):
+        "Test missing values"
+        data = "1, 2, 3\n1, , 5\n0, 6, \n"
+        kwargs = dict(delimiter=",", dtype=None, filling_values=-999)
+        ctrl = np.array([[1, 2, 3], [1, -999, 5], [0, 6, -999]], dtype=int)
+        test = np.ndfromtxt(StringIO.StringIO(data), **kwargs)
+        assert_equal(test, ctrl)
+

     def test_recfromtxt(self):
         #
@@ -972,16 +1007,15 @@ M 33 21.99
     def test_recfromcsv(self):
         #
         data = StringIO.StringIO('A,B\n0,1\n2,3')
-        test = np.recfromcsv(data, missing='N/A',
-                             names=True, case_sensitive=True)
+        kwargs = dict(missing='N/A', names=True, case_sensitive=True)
+        test = np.recfromcsv(data, dtype=None, **kwargs)
         control = np.array([(0, 1), (2, 3)],
                            dtype=[('A', np.int), ('B', np.int)])
         self.failUnless(isinstance(test, np.recarray))
         assert_equal(test, control)
         #
         data = StringIO.StringIO('A,B\n0,1\n2,N/A')
-        test = np.recfromcsv(data, dtype=None, missing='N/A',
-                             names=True, case_sensitive=True, usemask=True)
+        test = np.recfromcsv(data, dtype=None, usemask=True, **kwargs)
         control = ma.array([(0, 1), (2, -1)],
                            mask=[(False, False), (False, True)],
                            dtype=[('A', np.int), ('B', np.int)])
@@ -996,6 +1030,7 @@ M 33 21.99
         self.failUnless(isinstance(test, np.recarray))
         assert_equal(test, control)


+
 def test_gzip_load():
     a = np.random.random((5, 5))
@@ -1009,6 +1044,7 @@ def test_gzip_load():
     f = gzip.GzipFile(fileobj=s, mode="r")
     assert_array_equal(np.load(f), a)

+
 def test_gzip_loadtxt():
     # Thanks to another windows brokeness, we can't use
     # NamedTemporaryFile: a file created from this function cannot be
```
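For the StringConverter side of the change, here is a small sketch mirroring the new test_keep_default test above. StringConverter is a private helper in numpy.lib._iotools, so treat this as an illustration rather than public API.

```python
import numpy as np
from numpy.lib._iotools import StringConverter

# Before the patch, upgrading the converter overwrote an explicit default
# with the generic default of the new type; it is now preserved.
conv = StringConverter(None, missing_values='', default=-999)
conv.upgrade('3.14159265')           # upgrades the conversion type to float
assert conv.type == np.dtype(float)  # the type was upgraded
assert conv.default == -999          # the explicit default survived
```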