| author | pierregm <pierregm@localhost> | 2009-10-12 04:45:31 +0000 |
|---|---|---|
| committer | pierregm <pierregm@localhost> | 2009-10-12 04:45:31 +0000 |
| commit | 80851e34d2955a331cecb9f50d2287d33618dd3e (patch) | |
| tree | a1c2ea17ef9f9192b1ad7839c31d7f73bc579bfe | |
| parent | 295e24af415bb49f12a632f0e27fb9e2a1099ea2 (diff) | |
| download | numpy-80851e34d2955a331cecb9f50d2287d33618dd3e.tar.gz | |
* _iotools.StringConverter
- prevent an explicit default from being overwritten during upgrade
* io.genfromtxt (usage sketch below)
- deprecate `skiprows` in favor of `skip_header`
- deprecate `missing` in favor of `missing_values`
- `missing_values` can now be a sequence
- add support for `filling_values`
* fixed ticket #1257
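Below is a minimal usage sketch of the genfromtxt changes listed above (skip_header, missing_values as a string/sequence/dict, filling_values). The data and values are made up for illustration; the sketch follows the Python 2 / StringIO idiom used in the tests in this patch, and the exact result dtype depends on what dtype=None infers from the data.

```python
import StringIO
import numpy as np

# Made-up input: one junk line to skip, a header row, and two holes
# ('N/A' in column b, an empty trailing field in column c).
data = "junk to skip\na, b, c\n1, 2, 3\n4, N/A, 6\n7, 8, \n"

test = np.genfromtxt(StringIO.StringIO(data),
                     delimiter=",", names=True, dtype=None,
                     skip_header=1,         # replaces the deprecated `skiprows`
                     missing_values="N/A",  # replaces the deprecated `missing`;
                                            # may also be a sequence or a dict
                     filling_values=-999)   # substituted wherever a value is missing
```

The masked variants (np.mafromtxt, or usemask=True) report the same cells as masked instead of filled, which is what the reworked test_user_missing_values test below checks.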
| mode | path | lines changed |
|---|---|---|
| -rw-r--r-- | numpy/lib/_iotools.py | 29 |
| -rw-r--r-- | numpy/lib/io.py | 180 |
| -rw-r--r-- | numpy/lib/tests/test__iotools.py | 12 |
| -rw-r--r-- | numpy/lib/tests/test_io.py | 62 |

4 files changed, 208 insertions, 75 deletions
```diff
diff --git a/numpy/lib/_iotools.py b/numpy/lib/_iotools.py
index 9e8bcce2a..398ed07a4 100644
--- a/numpy/lib/_iotools.py
+++ b/numpy/lib/_iotools.py
@@ -557,6 +557,7 @@ class StringConverter:
         self._callingfunction = self._strict_call
         self.type = ttype
         self._checked = False
+        self._initial_default = default
     #
     def _loose_call(self, value):
         try:
@@ -608,12 +609,18 @@ class StringConverter:
             raise ConverterLockError(errmsg)
         _statusmax = len(self._mapper)
         # Complains if we try to upgrade by the maximum
-        if self._status == _statusmax:
+        _status = self._status
+        if _status == _statusmax:
             errmsg = "Could not find a valid conversion function"
             raise ConverterError(errmsg)
-        elif self._status < _statusmax - 1:
-            self._status += 1
-            (self.type, self.func, self.default) = self._mapper[self._status]
+        elif _status < _statusmax - 1:
+            _status += 1
+            (self.type, self.func, default) = self._mapper[_status]
+            self._status = _status
+            if self._initial_default is not None:
+                self.default = self._initial_default
+            else:
+                self.default = default
             self.upgrade(value)

     def iterupgrade(self, value):
@@ -630,11 +637,17 @@ class StringConverter:
             raise ConverterLockError(errmsg)
         _statusmax = len(self._mapper)
         # Complains if we try to upgrade by the maximum
-        if self._status == _statusmax:
+        _status = self._status
+        if _status == _statusmax:
             raise ConverterError("Could not find a valid conversion function")
-        elif self._status < _statusmax - 1:
-            self._status += 1
-            (self.type, self.func, self.default) = self._mapper[self._status]
+        elif _status < _statusmax - 1:
+            _status += 1
+            (self.type, self.func, default) = self._mapper[_status]
+            if self._initial_default is not None:
+                self.default = self._initial_default
+            else:
+                self.default = default
+            self._status = _status
             self.iterupgrade(value)

     def update(self, func, default=None, missing_values='', locked=False):
diff --git a/numpy/lib/io.py b/numpy/lib/io.py
index 239d0808e..3618e1111 100644
--- a/numpy/lib/io.py
+++ b/numpy/lib/io.py
@@ -870,11 +870,12 @@ def fromregex(file, regexp, dtype):



-def genfromtxt(fname, dtype=float, comments='#', delimiter=None, skiprows=0,
-               converters=None, missing='', missing_values=None, usecols=None,
-               names=None, excludelist=None, deletechars=None, autostrip=False,
-               case_sensitive=True, defaultfmt="f%i", unpack=None,
-               usemask=False, loose=True, invalid_raise=True):
+def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
+               skiprows=0, skip_header=0, skip_footer=0, converters=None,
+               missing='', missing_values=None, filling_values=None,
+               usecols=None, names=None, excludelist=None, deletechars=None,
+               autostrip=False, case_sensitive=True, defaultfmt="f%i",
+               unpack=None, usemask=False, loose=True, invalid_raise=True):
     """
     Load data from a text file, with missing values handled as specified.

@@ -1021,13 +1022,6 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, skiprows=0,
         errmsg = "The input argument 'converter' should be a valid dictionary "\
                  "(got '%s' instead)"
         raise TypeError(errmsg % type(user_converters))
-    # Check the input dictionary of missing values
-    user_missing_values = missing_values or {}
-    if not isinstance(user_missing_values, dict):
-        errmsg = "The input argument 'missing_values' should be a valid "\
-                 "dictionary (got '%s' instead)"
-        raise TypeError(errmsg % type(missing_values))
-    defmissing = [_.strip() for _ in missing.split(',')] + ['']

     # Initialize the filehandle, the LineSplitter and the NameValidator
     if isinstance(fname, basestring):
@@ -1043,18 +1037,26 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, skiprows=0,
                           deletechars=deletechars,
                           case_sensitive=case_sensitive)
-    # Get the first valid lines after the first skiprows ones
-    for i in xrange(skiprows):
+    # Get the first valid lines after the first skiprows ones ..
+    if skiprows:
+        warnings.warn("The use of `skiprows` is deprecated.\n"\
+                      "Please use `skip_header` instead.",
+                      DeprecationWarning)
+        skip_header = skiprows
+    # Skip the first `skip_header` rows
+    for i in xrange(skip_header):
         fhd.readline()
+    # Keep on until we find the first valid values
     first_values = None
     while not first_values:
         first_line = fhd.readline()
         if first_line == '':
             raise IOError('End-of-file reached before encountering data.')
         if names is True:
-            first_values = first_line.strip().split(delimiter)
-        else:
-            first_values = split_line(first_line)
+            if comments in first_line:
+                first_line = ''.join(first_line.split(comments)[1])
+        first_values = split_line(first_line)
+
     # Should we take the first values as names ?
     if names is True:
         fval = first_values[0].strip()
         if fval in comments:
@@ -1073,6 +1075,8 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, skiprows=0,
         names = validate_names([_.strip() for _ in names.split(',')])
     elif names:
         names = validate_names(names)
+    if names is not None:
+        names = list(names)
     # Get the dtype
     if dtype is not None:
         dtype = easy_dtype(dtype, defaultfmt=defaultfmt, names=names)
@@ -1083,48 +1087,114 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, skiprows=0,
             if _is_string_like(current):
                 usecols[i] = names.index(current)

-    # If user_missing_values has names as keys, transform them to indices
-    missing = {}
-    for (key, val) in user_missing_values.iteritems():
-        # If val is a list, flatten it. In any case, add missing &'' to the list
-        if isinstance(val, (list, tuple)):
-            val = [str(_) for _ in val]
-        else:
-            val = [str(val), ]
-        val.extend(defmissing)
-        if _is_string_like(key):
-            try:
-                missing[names.index(key)] = val
-            except ValueError:
-                pass
+    # Process the missing values ...............................
+    # Rename missing_values for convenience
+    user_missing_values = missing_values or ()
+
+    # Define the list of missing_values (one column: one list)
+    missing_values = [list(['']) for _ in range(nbcols)]
+
+    # We have a dictionary: process it field by field
+    if isinstance(user_missing_values, dict):
+        # Loop on the items
+        for (key, val) in user_missing_values.items():
+            # Make sure the key is an index
+            if _is_string_like(key):
+                key = names.index(key)
+            # Redefine the key as needed if it's a column number
+            if usecols:
+                try:
+                    key = usecols.index(key)
+                except ValueError:
+                    pass
+            # Transform the value as a list of string
+            if isinstance(val, (list, tuple)):
+                val = [str(_) for _ in val]
+            else:
+                val = [str(val),]
+            # Add the value(s) to the current list of missing
+            if key is None:
+                # None acts as default
+                for miss in missing_values:
+                    miss.extend(val)
+            else:
+                missing_values[key].extend(val)
+    # We have a sequence : each item matches a column
+    elif isinstance(user_missing_values, (list, tuple)):
+        for (value, entry) in zip(user_missing_values, missing_values):
+            value = str(value)
+            if value not in entry:
+                entry.append(value)
+    # We have a string : apply it to all entries
+    elif isinstance(user_missing_values, basestring):
+        user_value = user_missing_values.split(",")
+        for entry in missing_values:
+            entry.extend(user_value)
+    # We have something else: apply it to all entries
+    else:
+        for entry in missing_values:
+            entry.extend([str(user_missing_values)])
+
+    # Process the deprecated `missing`
+    if missing != '':
+        warnings.warn("The use of `missing` is deprecated.\n"\
+                      "Please use `missing_values` instead.",
+                      DeprecationWarning)
+        values = [str(_) for _ in missing.split(",")]
+        for entry in missing_values:
+            entry.extend(values)
+
+    # Process the filling_values ...............................
+    # Rename the input for convenience
+    user_filling_values = filling_values or []
+    # Define the default
+    filling_values = [None] * nbcols
+    # We have a dictionary : update each entry individually
+    if isinstance(user_filling_values, dict):
+        for (key, val) in user_filling_values.items():
+            # Make sure the key is an index
+            if _is_string_like(key):
+                key = names.index(key)
+            # Redefine the key if it's a column number and usecols is defined
+            if usecols:
+                try:
+                    key = usecols.index(key)
+                except ValueError:
+                    pass
+            # Add the value to the list
+            filling_values[key] = val
+    # We have a sequence : update on a one-to-one basis
+    elif isinstance(user_filling_values, (list, tuple)):
+        n = len(user_filling_values)
+        if (n <= nbcols):
+            filling_values[:n] = user_filling_values
         else:
-            missing[key] = val
-
+            filling_values = user_filling_values[:nbcols]
+    # We have something else : use it for all entries
+    else:
+        filling_values = [user_filling_values] * nbcols

-    # Initialize the default converters
+    # Initialize the converters ................................
     if dtype is None:
         # Note: we can't use a [...]*nbcols, as we would have 3 times the same
         # ... converter, instead of 3 different converters.
-        converters = [StringConverter(None,
-                                      missing_values=missing.get(_, defmissing))
-                      for _ in range(nbcols)]
+        converters = [StringConverter(None, missing_values=miss, default=fill)
+                      for (miss, fill) in zip(missing_values, filling_values)]
     else:
         dtype_flat = flatten_dtype(dtype, flatten_base=True)
         # Initialize the converters
         if len(dtype_flat) > 1:
             # Flexible type : get a converter from each dtype
-            converters = [StringConverter(dt,
-                                          missing_values=missing.get(i, defmissing),
-                                          locked=True)
-                          for (i, dt) in enumerate(dtype_flat)]
+            zipit = zip(dtype_flat, missing_values, filling_values)
+            converters = [StringConverter(dt, locked=True,
+                                          missing_values=miss, default=fill)
+                          for (dt, miss, fill) in zipit]
         else:
             # Set to a default converter (but w/ different missing values)
-            converters = [StringConverter(dtype,
-                                          missing_values=missing.get(_, defmissing),
-                                          locked=True)
-                          for _ in range(nbcols)]
-    missing = [_.missing_values for _ in converters]
-
+            zipit = zip(missing_values, filling_values)
+            converters = [StringConverter(dtype, locked=True,
+                                          missing_values=miss, default=fill)
+                          for (miss, fill) in zipit]
     # Update the converters to use the user-defined ones
     uc_update = []
     for (i, conv) in user_converters.items():
@@ -1137,13 +1207,15 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, skiprows=0,
         except ValueError:
             # Unused converter specified
             continue
-        converters[i].update(conv, default=None,
-                             missing_values=missing[i],
-                             locked=True)
+        converters[i].update(conv, locked=True,
+                             default=filling_values[i],
+                             missing_values=missing_values[i],)
         uc_update.append((i, conv))
     # Make sure we have the corrected keys in user_converters...
     user_converters.update(uc_update)

+    miss_chars = [_.missing_values for _ in converters]
+
     # Reset the names to match the usecols
     if (not first_line) and usecols:
         names = [names[_] for _ in usecols]
@@ -1180,8 +1252,8 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, skiprows=0,
             # Store the values
             append_to_rows(tuple(values))
             if usemask:
-                append_to_masks(tuple([val.strip() in mss
-                                       for (val, mss) in zip(values, missing)]))
+                append_to_masks(tuple([v.strip() in m
+                                       for (v, m) in zip(values, missing_values)]))

     # Upgrade the converters (if needed)
     if dtype is None:
@@ -1197,7 +1269,7 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, skiprows=0,
                     converter.upgrade(value)
                 except (ConverterError, ValueError):
                     errmsg += "(occurred line #%i for value '%s')"
-                    errmsg %= (j + 1 + skiprows, value)
+                    errmsg %= (j + 1 + skip_header, value)
                     raise ConverterError(errmsg)

     # Check that we don't have invalid values
@@ -1271,7 +1343,7 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, skiprows=0,
         if 'O' in (_.char for _ in dtype_flat):
             if has_nested_fields(dtype):
                 errmsg = "Nested fields involving objects "\
-                        "are not supported..."
+                         "are not supported..."
                 raise NotImplementedError(errmsg)
             else:
                 output = np.array(data, dtype=dtype)
diff --git a/numpy/lib/tests/test__iotools.py b/numpy/lib/tests/test__iotools.py
index c16491aee..ed0f4dc63 100644
--- a/numpy/lib/tests/test__iotools.py
+++ b/numpy/lib/tests/test__iotools.py
@@ -179,6 +179,18 @@ class TestStringConverter(TestCase):
         conv = StringConverter(lambda s: date(*(time.strptime(s)[:3])))
         assert_equal(conv._mapper[-2][0](0), 0j)
         assert(hasattr(conv, 'default'))
+    #
+    def test_keep_default(self):
+        "Make sure we don't lose an explicit default"
+        converter = StringConverter(None, missing_values='', default=-999)
+        converter.upgrade('3.14159265')
+        assert_equal(converter.default, -999)
+        assert_equal(converter.type, np.dtype(float))
+        #
+        converter = StringConverter(None, missing_values='', default=0)
+        converter.upgrade('3.14159265')
+        assert_equal(converter.default, 0)
+        assert_equal(converter.type, np.dtype(float))


#-------------------------------------------------------------------------------
diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py
index 5e0d666c2..ce62a83db 100644
--- a/numpy/lib/tests/test_io.py
+++ b/numpy/lib/tests/test_io.py
@@ -781,20 +781,18 @@ M 33 21.99


     def test_user_missing_values(self):
-        datastr ="A, B, C\n0, 0., 0j\n1, N/A, 1j\n-9, 2.2, N/A\n3, -99, 3j"
-        data = StringIO.StringIO(datastr)
-        basekwargs = dict(dtype=None, delimiter=',', names=True, missing='N/A')
+        data = "A, B, C\n0, 0., 0j\n1, N/A, 1j\n-9, 2.2, N/A\n3, -99, 3j"
+        basekwargs = dict(dtype=None, delimiter=",", names=True, missing="N/A")
         mdtype = [('A', int), ('B', float), ('C', complex)]
         #
-        test = np.mafromtxt(data, **basekwargs)
+        test = np.mafromtxt(StringIO.StringIO(data), **basekwargs)
         control = ma.array([( 0, 0.0, 0j), (1, -999, 1j),
                             ( -9, 2.2, -999j), (3, -99, 3j)],
                            mask=[(0, 0, 0), (0, 1, 0), (0, 0, 1), (0, 0, 0)],
                            dtype=mdtype)
         assert_equal(test, control)
         #
-        data.seek(0)
-        test = np.mafromtxt(data,
+        test = np.mafromtxt(StringIO.StringIO(data),
                             missing_values={0:-9, 1:-99, 2:-999j}, **basekwargs)
         control = ma.array([( 0, 0.0, 0j), (1, -999, 1j),
                             ( -9, 2.2, -999j), (3, -99, 3j)],
@@ -802,8 +800,7 @@ M 33 21.99
                            dtype=mdtype)
         assert_equal(test, control)
         #
-        data.seek(0)
-        test = np.mafromtxt(data,
+        test = np.mafromtxt(StringIO.StringIO(data),
                             missing_values={0:-9, 'B':-99, 'C':-999j},
                             **basekwargs)
         control = ma.array([( 0, 0.0, 0j), (1, -999, 1j),
@@ -885,7 +882,7 @@ M 33 21.99
         data = ["1, 1, 1, 1, -1.1"] * 50
         mdata = StringIO.StringIO("\n".join(data))

-        converters = {4: lambda x:np.sqrt(float(x))}
+        converters = {4: lambda x:"(%s)" % x}
         kwargs = dict(delimiter=",", converters=converters,
                       dtype=[(_, int) for _ in 'abcde'],)
         assert_raises(TypeError, np.genfromtxt, mdata, **kwargs)
@@ -948,6 +945,44 @@ M 33 21.99
                         dtype=[('f0', '|S10'), ('f1', float), ('f2', '|S5')])
         assert_equal(mtest, ctrl)

+    def test_incomplete_names(self):
+        "Test w/ incomplete names"
+        data = "A,,C\n0,1,2\n3,4,5"
+        kwargs = dict(delimiter=",", names=True)
+        # w/ dtype=None
+        ctrl = np.array([(0, 1, 2), (3, 4, 5)],
+                        dtype=[(_, int) for _ in ('A', 'f0', 'C')])
+        test = np.ndfromtxt(StringIO.StringIO(data), dtype=None, **kwargs)
+        assert_equal(test, ctrl)
+        # w/ default dtype
+        ctrl = np.array([(0, 1, 2), (3, 4, 5)],
+                        dtype=[(_, float) for _ in ('A', 'f0', 'C')])
+        test = np.ndfromtxt(StringIO.StringIO(data), **kwargs)
+
+
+    def test_fixed_width_names(self):
+        "Test fix-width w/ names"
+        data = "    A    B   C\n    0    1 2.3\n   45   67   9."
+        kwargs = dict(delimiter=(5, 5, 4), names=True, dtype=None)
+        ctrl = np.array([(0, 1, 2.3), (45, 67, 9.)],
+                        dtype=[('A', int), ('B', int), ('C', float)])
+        test = np.ndfromtxt(StringIO.StringIO(data), **kwargs)
+        assert_equal(test, ctrl)
+        #
+        kwargs = dict(delimiter=5, names=True, dtype=None)
+        ctrl = np.array([(0, 1, 2.3), (45, 67, 9.)],
+                        dtype=[('A', int), ('B', int), ('C', float)])
+        test = np.ndfromtxt(StringIO.StringIO(data), **kwargs)
+        assert_equal(test, ctrl)
+
+    def test_filling_values(self):
+        "Test missing values"
+        data = "1, 2, 3\n1, , 5\n0, 6, \n"
+        kwargs = dict(delimiter=",", dtype=None, filling_values=-999)
+        ctrl = np.array([[1, 2, 3], [1, -999, 5], [0, 6, -999]], dtype=int)
+        test = np.ndfromtxt(StringIO.StringIO(data), **kwargs)
+        assert_equal(test, ctrl)
+

     def test_recfromtxt(self):
         #
@@ -972,16 +1007,15 @@ M 33 21.99
     def test_recfromcsv(self):
         #
         data = StringIO.StringIO('A,B\n0,1\n2,3')
-        test = np.recfromcsv(data, missing='N/A',
-                             names=True, case_sensitive=True)
+        kwargs = dict(missing='N/A', names=True, case_sensitive=True)
+        test = np.recfromcsv(data, dtype=None, **kwargs)
         control = np.array([(0, 1), (2, 3)],
                            dtype=[('A', np.int), ('B', np.int)])
         self.failUnless(isinstance(test, np.recarray))
         assert_equal(test, control)
         #
         data = StringIO.StringIO('A,B\n0,1\n2,N/A')
-        test = np.recfromcsv(data, dtype=None, missing='N/A',
-                             names=True, case_sensitive=True, usemask=True)
+        test = np.recfromcsv(data, dtype=None, usemask=True, **kwargs)
         control = ma.array([(0, 1), (2, -1)],
                            mask=[(False, False), (False, True)],
                            dtype=[('A', np.int), ('B', np.int)])
@@ -996,6 +1030,7 @@ M 33 21.99
         self.failUnless(isinstance(test, np.recarray))
         assert_equal(test, control)


+
 def test_gzip_load():
     a = np.random.random((5, 5))
@@ -1009,6 +1044,7 @@ def test_gzip_load():
     f = gzip.GzipFile(fileobj=s, mode="r")
     assert_array_equal(np.load(f), a)

+
 def test_gzip_loadtxt():
     # Thanks to another windows brokeness, we can't use
     # NamedTemporaryFile: a file created from this function cannot be
```
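For the StringConverter side of the change, here is a small sketch mirroring the new test_keep_default test above. StringConverter is a private helper in numpy.lib._iotools, so treat this as an illustration rather than public API.

```python
import numpy as np
from numpy.lib._iotools import StringConverter

# Before the patch, upgrading the converter overwrote an explicit default
# with the generic default of the new type; it is now preserved.
conv = StringConverter(None, missing_values='', default=-999)
conv.upgrade('3.14159265')           # upgrades the conversion type to float
assert conv.type == np.dtype(float)  # the type was upgraded
assert conv.default == -999          # the explicit default survived
```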