diff options
author | pierregm <pierregm@localhost> | 2009-10-12 04:45:31 +0000 |
---|---|---|
committer | pierregm <pierregm@localhost> | 2009-10-12 04:45:31 +0000 |
commit | 80851e34d2955a331cecb9f50d2287d33618dd3e (patch) | |
tree | a1c2ea17ef9f9192b1ad7839c31d7f73bc579bfe /numpy/lib/io.py | |
parent | 295e24af415bb49f12a632f0e27fb9e2a1099ea2 (diff) | |
download | numpy-80851e34d2955a331cecb9f50d2287d33618dd3e.tar.gz |
* _iotools.StringConverter
- prevents an explicit default to be overwritten during upgrade
* io.genfromtxt
- deprecate `skiprows` for `skip_header`
- deprecate `missing` for `missing_values`
- `missing_values` can now be a sequence
- add support for `filling_values`
* fixed ticket #1257
Diffstat (limited to 'numpy/lib/io.py')
-rw-r--r-- | numpy/lib/io.py | 180 |
1 files changed, 126 insertions, 54 deletions
diff --git a/numpy/lib/io.py b/numpy/lib/io.py index 239d0808e..3618e1111 100644 --- a/numpy/lib/io.py +++ b/numpy/lib/io.py @@ -870,11 +870,12 @@ def fromregex(file, regexp, dtype): -def genfromtxt(fname, dtype=float, comments='#', delimiter=None, skiprows=0, - converters=None, missing='', missing_values=None, usecols=None, - names=None, excludelist=None, deletechars=None, autostrip=False, - case_sensitive=True, defaultfmt="f%i", unpack=None, - usemask=False, loose=True, invalid_raise=True): +def genfromtxt(fname, dtype=float, comments='#', delimiter=None, + skiprows=0, skip_header=0, skip_footer=0, converters=None, + missing='', missing_values=None, filling_values=None, + usecols=None, names=None, excludelist=None, deletechars=None, + autostrip=False, case_sensitive=True, defaultfmt="f%i", + unpack=None, usemask=False, loose=True, invalid_raise=True): """ Load data from a text file, with missing values handled as specified. @@ -1021,13 +1022,6 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, skiprows=0, errmsg = "The input argument 'converter' should be a valid dictionary "\ "(got '%s' instead)" raise TypeError(errmsg % type(user_converters)) - # Check the input dictionary of missing values - user_missing_values = missing_values or {} - if not isinstance(user_missing_values, dict): - errmsg = "The input argument 'missing_values' should be a valid "\ - "dictionary (got '%s' instead)" - raise TypeError(errmsg % type(missing_values)) - defmissing = [_.strip() for _ in missing.split(',')] + [''] # Initialize the filehandle, the LineSplitter and the NameValidator if isinstance(fname, basestring): @@ -1043,18 +1037,26 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, skiprows=0, deletechars=deletechars, case_sensitive=case_sensitive) - # Get the first valid lines after the first skiprows ones - for i in xrange(skiprows): + # Get the first valid lines after the first skiprows ones .. + if skiprows: + warnings.warn("The use of `skiprows` is deprecated.\n"\ + "Please use `skip_header` instead.", + DeprecationWarning) + skip_header = skiprows + # Skip the first `skip_header` rows + for i in xrange(skip_header): fhd.readline() + # Keep on until we find the first valid values first_values = None while not first_values: first_line = fhd.readline() if first_line == '': raise IOError('End-of-file reached before encountering data.') if names is True: - first_values = first_line.strip().split(delimiter) - else: - first_values = split_line(first_line) + if comments in first_line: + first_line = ''.join(first_line.split(comments)[1]) + first_values = split_line(first_line) + # Should we take the first values as names ? if names is True: fval = first_values[0].strip() if fval in comments: @@ -1073,6 +1075,8 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, skiprows=0, names = validate_names([_.strip() for _ in names.split(',')]) elif names: names = validate_names(names) + if names is not None: + names = list(names) # Get the dtype if dtype is not None: dtype = easy_dtype(dtype, defaultfmt=defaultfmt, names=names) @@ -1083,48 +1087,114 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, skiprows=0, if _is_string_like(current): usecols[i] = names.index(current) - # If user_missing_values has names as keys, transform them to indices - missing = {} - for (key, val) in user_missing_values.iteritems(): - # If val is a list, flatten it. In any case, add missing &'' to the list - if isinstance(val, (list, tuple)): - val = [str(_) for _ in val] - else: - val = [str(val), ] - val.extend(defmissing) - if _is_string_like(key): - try: - missing[names.index(key)] = val - except ValueError: - pass + # Process the missing values ............................... + # Rename missing_values for convenience + user_missing_values = missing_values or () + + # Define the list of missing_values (one column: one list) + missing_values = [list(['']) for _ in range(nbcols)] + + # We have a dictionary: process it field by field + if isinstance(user_missing_values, dict): + # Loop on the items + for (key, val) in user_missing_values.items(): + # Make sure the key is an index + if _is_string_like(key): + key = names.index(key) + # Redefine the key as needed if it's a column number + if usecols: + try: + key = usecols.index(key) + except ValueError: + pass + # Transform the value as a list of string + if isinstance(val, (list, tuple)): + val = [str(_) for _ in val] + else: + val = [str(val),] + # Add the value(s) to the current list of missing + if key is None: + # None acts as default + for miss in missing_values: + miss.extend(val) + else: + missing_values[key].extend(val) + # We have a sequence : each item matches a column + elif isinstance(user_missing_values, (list, tuple)): + for (value, entry) in zip(user_missing_values, missing_values): + value = str(value) + if value not in entry: + entry.append(value) + # We have a string : apply it to all entries + elif isinstance(user_missing_values, basestring): + user_value = user_missing_values.split(",") + for entry in missing_values: + entry.extend(user_value) + # We have something else: apply it to all entries + else: + for entry in missing_values: + entry.extend([str(user_missing_values)]) + + # Process the deprecated `missing` + if missing != '': + warnings.warn("The use of `missing` is deprecated.\n"\ + "Please use `missing_values` instead.", + DeprecationWarning) + values = [str(_) for _ in missing.split(",")] + for entry in missing_values: + entry.extend(values) + + # Process the filling_values ............................... + # Rename the input for convenience + user_filling_values = filling_values or [] + # Define the default + filling_values = [None] * nbcols + # We have a dictionary : update each entry individually + if isinstance(user_filling_values, dict): + for (key, val) in user_filling_values.items(): + # Make sure the key is an index + if _is_string_like(key): + key = names.index(key) + # Redefine the key if it's a column number and usecols is defined + if usecols: + try: + key = usecols.index(key) + except ValueError: + pass + # Add the value to the list + filling_values[key] = val + # We have a sequence : update on a one-to-one basis + elif isinstance(user_filling_values, (list, tuple)): + n = len(user_filling_values) + if (n <= nbcols): + filling_values[:n] = user_filling_values else: - missing[key] = val - + filling_values = user_filling_values[:nbcols] + # We have something else : use it for all entries + else: + filling_values = [user_filling_values] * nbcols - # Initialize the default converters + # Initialize the converters ................................ if dtype is None: # Note: we can't use a [...]*nbcols, as we would have 3 times the same # ... converter, instead of 3 different converters. - converters = [StringConverter(None, - missing_values=missing.get(_, defmissing)) - for _ in range(nbcols)] + converters = [StringConverter(None, missing_values=miss, default=fill) + for (miss, fill) in zip(missing_values, filling_values)] else: dtype_flat = flatten_dtype(dtype, flatten_base=True) # Initialize the converters if len(dtype_flat) > 1: # Flexible type : get a converter from each dtype - converters = [StringConverter(dt, - missing_values=missing.get(i, defmissing), - locked=True) - for (i, dt) in enumerate(dtype_flat)] + zipit = zip(dtype_flat, missing_values, filling_values) + converters = [StringConverter(dt, locked=True, + missing_values=miss, default=fill) + for (dt, miss, fill) in zipit] else: # Set to a default converter (but w/ different missing values) - converters = [StringConverter(dtype, - missing_values=missing.get(_, defmissing), - locked=True) - for _ in range(nbcols)] - missing = [_.missing_values for _ in converters] - + zipit = zip(missing_values, filling_values) + converters = [StringConverter(dtype, locked=True, + missing_values=miss, default=fill) + for (miss, fill) in zipit] # Update the converters to use the user-defined ones uc_update = [] for (i, conv) in user_converters.items(): @@ -1137,13 +1207,15 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, skiprows=0, except ValueError: # Unused converter specified continue - converters[i].update(conv, default=None, - missing_values=missing[i], - locked=True) + converters[i].update(conv, locked=True, + default=filling_values[i], + missing_values=missing_values[i],) uc_update.append((i, conv)) # Make sure we have the corrected keys in user_converters... user_converters.update(uc_update) + miss_chars = [_.missing_values for _ in converters] + # Reset the names to match the usecols if (not first_line) and usecols: names = [names[_] for _ in usecols] @@ -1180,8 +1252,8 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, skiprows=0, # Store the values append_to_rows(tuple(values)) if usemask: - append_to_masks(tuple([val.strip() in mss - for (val, mss) in zip(values, missing)])) + append_to_masks(tuple([v.strip() in m + for (v, m) in zip(values, missing_values)])) # Upgrade the converters (if needed) if dtype is None: @@ -1197,7 +1269,7 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, skiprows=0, converter.upgrade(value) except (ConverterError, ValueError): errmsg += "(occurred line #%i for value '%s')" - errmsg %= (j + 1 + skiprows, value) + errmsg %= (j + 1 + skip_header, value) raise ConverterError(errmsg) # Check that we don't have invalid values @@ -1271,7 +1343,7 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, skiprows=0, if 'O' in (_.char for _ in dtype_flat): if has_nested_fields(dtype): errmsg = "Nested fields involving objects "\ - "are not supported..." + "are not supported..." raise NotImplementedError(errmsg) else: output = np.array(data, dtype=dtype) |