diff options
author | pierregm <pierregm@localhost> | 2009-01-19 21:22:52 +0000 |
---|---|---|
committer | pierregm <pierregm@localhost> | 2009-01-19 21:22:52 +0000 |
commit | 8bd6c70d47e16fd81c8e3aefd4b2ec6dd90f38d6 (patch) | |
tree | d3042373968e32f0daffa3cdef6049d7c34b029a /numpy/lib/io.py | |
parent | 065626fa64df3eb51f6cb5eafcb466818ebb621d (diff) | |
download | numpy-8bd6c70d47e16fd81c8e3aefd4b2ec6dd90f38d6.tar.gz |
* lib : introduced _iotools
* lib.io : introduced genfromtxt, ndfromtxt, mafromtxt, recfromtxt, recfromcsv.
Diffstat (limited to 'numpy/lib/io.py')
-rw-r--r-- | numpy/lib/io.py | 476 |
1 file changed, 470 insertions, 6 deletions
diff --git a/numpy/lib/io.py b/numpy/lib/io.py index e9a012db1..303796d5f 100644 --- a/numpy/lib/io.py +++ b/numpy/lib/io.py @@ -1,4 +1,5 @@ __all__ = ['savetxt', 'loadtxt', + 'genfromtxt', 'ndfromtxt', 'mafromtxt', 'recfromtxt', 'recfromcsv', 'load', 'loads', 'save', 'savez', 'packbits', 'unpackbits', @@ -15,7 +16,11 @@ from cPickle import load as _cload, loads from _datasource import DataSource from _compiled_base import packbits, unpackbits +from _iotools import LineSplitter, NameValidator, StringConverter, \ + _is_string_like, flatten_dtype + _file = file +_string_like = _is_string_like class BagObj(object): """A simple class that converts attribute lookups to @@ -264,10 +269,6 @@ def _getconv(dtype): return str -def _string_like(obj): - try: obj + '' - except (TypeError, ValueError): return 0 - return 1 def loadtxt(fname, dtype=float, comments='#', delimiter=None, converters=None, skiprows=0, usecols=None, unpack=False): @@ -342,7 +343,7 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, converters=None, if usecols is not None: usecols = list(usecols) - if _string_like(fname): + if _is_string_like(fname): if fname.endswith('.gz'): import gzip fh = gzip.open(fname) @@ -520,7 +521,7 @@ def savetxt(fname, X, fmt='%.18e',delimiter=' '): """ - if _string_like(fname): + if _is_string_like(fname): if fname.endswith('.gz'): import gzip fh = gzip.open(fname,'wb') @@ -608,3 +609,466 @@ def fromregex(file, regexp, dtype): seq = [(x,) for x in seq] output = np.array(seq, dtype=dtype) return output + + + + +#####-------------------------------------------------------------------------- +#---- --- ASCII functions --- +#####-------------------------------------------------------------------------- + + + +def genfromtxt(fname, dtype=float, comments='#', delimiter=None, skiprows=0, + converters=None, missing='', missing_values=None, usecols=None, + names=None, excludelist=None, deletechars=None, + case_sensitive=True, unpack=None, usemask=False, loose=True): + 
""" + Load data from a text file. + + Each line past the first `skiprows` ones is split at the `delimiter` + character, and characters following the `comments` character are discarded. + + + + Parameters + ---------- + fname : file or string + File or filename to read. If the filename extension is `.gz` or `.bz2`, + the file is first decompressed. + dtype : data-type + Data type of the resulting array. If this is a flexible data-type, + the resulting array will be 1-dimensional, and each row will be + interpreted as an element of the array. In this case, the number + of columns used must match the number of fields in the data-type, + and the names of each field will be set by the corresponding name + of the dtype. + If None, the dtypes will be determined by the contents of each + column, individually. + comments : {string}, optional + The character used to indicate the start of a comment. + All the characters occurring on a line after a comment are discarded + delimiter : {string}, optional + The string used to separate values. By default, any consecutive + whitespace act as delimiter. + skiprows : {int}, optional + Numbers of lines to skip at the beginning of the file. + converters : {None, dictionary}, optional + A dictionary mapping column number to a function that will convert + values in the column to a number. Converters can also be used to + provide a default value for missing data: + ``converters = {3: lambda s: float(s or 0)}``. + missing : {string}, optional + A string representing a missing value, irrespective of the column where + it appears (e.g., `'missing'` or `'unused'`). + missing_values : {None, dictionary}, optional + A dictionary mapping a column number to a string indicating whether the + corresponding field should be masked. + usecols : {None, sequence}, optional + Which columns to read, with 0 being the first. For example, + ``usecols = (1,4,5)`` will extract the 2nd, 5th and 6th columns. 
+ names : {None, True, string, sequence}, optional + If `names` is True, the field names are read from the first valid line + after the first `skiprows` lines. + If `names` is a sequence or a single-string of comma-separated names, + the names will be used to define the field names in a flexible dtype. + If `names` is None, the names of the dtype fields will be used, if any. + excludelist : {sequence}, optional + A list of names to exclude. This list is appended to the default list + ['return','file','print']. Excluded names are appended an underscore: + for example, `file` would become `file_`. + deletechars : {string}, optional + A string combining invalid characters that must be deleted from the names. + case_sensitive : {True, False, 'upper', 'lower'}, optional + If True, field names are case_sensitive. + If False or 'upper', field names are converted to upper case. + If 'lower', field names are converted to lower case. + unpack : {bool}, optional + If True, the returned array is transposed, so that arguments may be + unpacked using ``x, y, z = loadtxt(...)`` + usemask : {bool}, optional + If True, returns a masked array. + If False, return a regular standard array. + + Returns + ------- + out : MaskedArray + Data read from the text file. + + Notes + -------- + * When spaces are used as delimiters, or when no delimiter has been given + as input, there should not be any missing data between two fields. + * When the variable are named (either by a flexible dtype or with `names`, + there must not be any header in the file (else a :exc:ValueError exception + is raised). + + See Also + -------- + numpy.loadtxt : equivalent function when no data is missing. 
+ + """ + # + if usemask: + from numpy.ma import MaskedArray, make_mask_descr + # Check the input dictionary of converters + user_converters = converters or {} + if not isinstance(user_converters, dict): + errmsg = "The input argument 'converter' should be a valid dictionary "\ + "(got '%s' instead)" + raise TypeError(errmsg % type(user_converters)) + # Check the input dictionary of missing values + user_missing_values = missing_values or {} + if not isinstance(user_missing_values, dict): + errmsg = "The input argument 'missing_values' should be a valid "\ + "dictionary (got '%s' instead)" + raise TypeError(errmsg % type(missing_values)) + defmissing = [_.strip() for _ in missing.split(',')] + [''] + + # Initialize the filehandle, the LineSplitter and the NameValidator +# fhd = _to_filehandle(fname) + if isinstance(fname, basestring): + fhd = np.lib._datasource.open(fname) + elif not hasattr(fname, 'read'): + raise TypeError("The input should be a string or a filehandle. "\ + "(got %s instead)" % type(fname)) + else: + fhd = fname + split_line = LineSplitter(delimiter=delimiter, comments=comments, + autostrip=False)._handyman + validate_names = NameValidator(excludelist=excludelist, + deletechars=deletechars, + case_sensitive=case_sensitive) + + # Get the first valid lines after the first skiprows ones + for i in xrange(skiprows): + fhd.readline() + first_values = None + while not first_values: + first_line = fhd.readline() + if first_line == '': + raise IOError('End-of-file reached before encountering data.') + first_values = split_line(first_line) + + # Check the columns to use + if usecols is not None: + usecols = list(usecols) + nbcols = len(usecols or first_values) + + # Check the names and overwrite the dtype.names if needed + if dtype is not None: + dtype = np.dtype(dtype) + dtypenames = getattr(dtype, 'names', None) + if names is True: + names = validate_names([_.strip() for _ in first_values]) + first_line ='' + elif _is_string_like(names): + names = 
validate_names([_.strip() for _ in names.split(',')]) + elif names: + names = validate_names(names) + elif dtypenames: + dtype.names = validate_names(dtypenames) + if names and dtypenames: + dtype.names = names + + # If usecols is a list of names, convert to a list of indices + if usecols: + for (i, current) in enumerate(usecols): + if _is_string_like(current): + usecols[i] = names.index(current) + + # If user_missing_values has names as keys, transform them to indices + missing_values = {} + for (key, val) in user_missing_values.iteritems(): + # If val is a list, flatten it. In any case, add missing &'' to the list + if isinstance(val, (list, tuple)): + val = [str(_) for _ in val] + else: + val = [str(val),] + val.extend(defmissing) + if _is_string_like(key): + try: + missing_values[names.index(key)] = val + except ValueError: + pass + else: + missing_values[key] = val + + + # Initialize the default converters + if dtype is None: + # Note: we can't use a [...]*nbcols, as we would have 3 times the same + # ... converter, instead of 3 different converters. 
+ converters = [StringConverter(None, + missing_values=missing_values.get(_, defmissing)) + for _ in range(nbcols)] + else: + flatdtypes = flatten_dtype(dtype) + # Initialize the converters + if len(flatdtypes) > 1: + # Flexible type : get a converter from each dtype + converters = [StringConverter(dt, + missing_values=missing_values.get(i, defmissing), + locked=True) + for (i, dt) in enumerate(flatdtypes)] + else: + # Set to a default converter (but w/ different missing values) + converters = [StringConverter(dtype, + missing_values=missing_values.get(_, defmissing), + locked=True) + for _ in range(nbcols)] + missing_values = [_.missing_values for _ in converters] + + # Update the converters to use the user-defined ones + for (i, conv) in user_converters.iteritems(): + # If the converter is specified by column names, use the index instead + if _is_string_like(i): + i = names.index(i) + if usecols: + try: + i = usecols.index(i) + except ValueError: + # Unused converter specified + continue + converters[i].update(conv, default=None, + missing_values=missing_values[i], + locked=True) + + # Reset the names to match the usecols + if (not first_line) and usecols: + names = [names[_] for _ in usecols] + + rows = [] + append_to_rows = rows.append + if usemask: + masks = [] + append_to_masks = masks.append + # Parse each line + for line in itertools.chain([first_line,], fhd): + values = split_line(line) + # Skip an empty line + if len(values) == 0: + continue + # Select only the columns we need + if usecols: + values = [values[_] for _ in usecols] + # Check whether we need to update the converter + if dtype is None: + for (converter, item) in zip(converters, values): + converter.upgrade(item) + # Store the values + append_to_rows(tuple(values)) + if usemask: + append_to_masks(tuple([val.strip() in mss + for (val, mss) in zip(values, + missing_values)])) + + # Convert each value according to the converter: + # We want to modify the list in place to avoid creating a new 
one... + if loose: + conversionfuncs = [conv._loose_call for conv in converters] + else: + conversionfuncs = [conv._strict_call for conv in converters] + for (i, vals) in enumerate(rows): + rows[i] = tuple([convert(val) + for (convert, val) in zip(conversionfuncs, vals)]) + + # Reset the dtype + data = rows + if dtype is None: + # Get the dtypes from the first row + coldtypes = [np.array(val).dtype for val in data[0]] + # Find the columns with strings, and take the largest number of chars. + strcolidx = [i for (i, v) in enumerate(coldtypes) if v.char == 'S'] + for i in strcolidx: + coldtypes[i] = "|S%i" % max(len(row[i]) for row in data) + # + if names is None: + # If the dtype is uniform, don't define names, else use '' + base = coldtypes[0] + if np.all([(dt == base) for dt in coldtypes]): + (ddtype, mdtype) = (base, np.bool) + else: + ddtype = [('', dt) for dt in coldtypes] + mdtype = [('', np.bool) for dt in coldtypes] + else: + ddtype = zip(names, coldtypes) + mdtype = zip(names, [np.bool] * len(coldtypes)) + output = np.array(data, dtype=ddtype) + if usemask: + outputmask = np.array(masks, dtype=mdtype) + else: + # Overwrite the initial dtype names if needed + if names and dtype.names: + dtype.names = names + flatdtypes = flatten_dtype(dtype) + # Case 1. We have a structured type + if len(flatdtypes) > 1: + # Nested dtype, eg [('a', int), ('b', [('b0', int), ('b1', 'f4')])] + # First, create the array using a flattened dtype: + # [('a', int), ('b1', int), ('b2', float)] + # Then, view the array using the specified dtype. + rows = np.array(data, dtype=[('', t) for t in flatdtypes]) + output = rows.view(dtype) + # Now, process the rowmasks the same way + if usemask: + rowmasks = np.array(masks, + dtype=np.dtype([('', np.bool) + for t in flatdtypes])) + # Construct the new dtype + mdtype = make_mask_descr(dtype) + outputmask = rowmasks.view(mdtype) + # Case #2. 
We have a basic dtype + else: + # We used some user-defined converters + if user_converters: + ishomogeneous = True + descr = [] + for (i, ttype) in enumerate([conv.type for conv in converters]): + # Keep the dtype of the current converter + if i in user_converters: + ishomogeneous &= (ttype == dtype.type) + if ttype == np.string_: + ttype = "|S%i" % max(len(row[i]) for row in data) + descr.append(('', ttype)) + else: + descr.append(('', dtype)) + if not ishomogeneous: + dtype = np.dtype(descr) + # + output = np.array(data, dtype) + if usemask: + if dtype.names: + mdtype = [(_, np.bool) for _ in dtype.names] + else: + mdtype = np.bool + outputmask = np.array(masks, dtype=mdtype) + # Try to take care of the missing data we missed + if usemask and output.dtype.names: + for (name, conv) in zip(names or (), converters): + missing_values = [conv(_) for _ in conv.missing_values if _ != ''] + for mval in missing_values: + outputmask[name] |= (output[name] == mval) + # Construct the final array + if usemask: + output = output.view(MaskedArray) + output._mask = outputmask + if unpack: + return output.squeeze().T + return output.squeeze() + + + +def ndfromtxt(fname, dtype=float, comments='#', delimiter=None, skiprows=0, + converters=None, missing='', missing_values=None, + usecols=None, unpack=None, names=None, + excludelist=None, deletechars=None, case_sensitive=True,): + """ + Load ASCII data stored in fname and returns a ndarray. + + Complete description of all the optional input parameters is available in + the docstring of the `genfromtxt` function. + + See Also + -------- + numpy.genfromtxt : generic function. 
+ + """ + kwargs = dict(dtype=dtype, comments=comments, delimiter=delimiter, + skiprows=skiprows, converters=converters, + missing=missing, missing_values=missing_values, + usecols=usecols, unpack=unpack, names=names, + excludelist=excludelist, deletechars=deletechars, + case_sensitive=case_sensitive, usemask=False) + return genfromtxt(fname, **kwargs) + +def mafromtxt(fname, dtype=float, comments='#', delimiter=None, skiprows=0, + converters=None, missing='', missing_values=None, + usecols=None, unpack=None, names=None, + excludelist=None, deletechars=None, case_sensitive=True,): + """ + Load ASCII data stored in fname and returns a MaskedArray. + + Complete description of all the optional input parameters is available in + the docstring of the `genfromtxt` function. + + See Also + -------- + numpy.genfromtxt : generic function. + """ + kwargs = dict(dtype=dtype, comments=comments, delimiter=delimiter, + skiprows=skiprows, converters=converters, + missing=missing, missing_values=missing_values, + usecols=usecols, unpack=unpack, names=names, + excludelist=excludelist, deletechars=deletechars, + case_sensitive=case_sensitive, + usemask=True) + return genfromtxt(fname, **kwargs) + + +def recfromtxt(fname, dtype=None, comments='#', delimiter=None, skiprows=0, + converters=None, missing='', missing_values=None, + usecols=None, unpack=None, names=None, + excludelist=None, deletechars=None, case_sensitive=True, + usemask=False): + """ + Load ASCII data stored in fname and returns a standard recarray (if + `usemask=False`) or a MaskedRecords (if `usemask=True`). + + Complete description of all the optional input parameters is available in + the docstring of the `genfromtxt` function. + + See Also + -------- + numpy.genfromtxt : generic function + + Warnings + -------- + * by default, `dtype=None`, which means that the dtype of the output array + will be determined from the data. 
+ """ + kwargs = dict(dtype=dtype, comments=comments, delimiter=delimiter, + skiprows=skiprows, converters=converters, + missing=missing, missing_values=missing_values, + usecols=usecols, unpack=unpack, names=names, + excludelist=excludelist, deletechars=deletechars, + case_sensitive=case_sensitive, usemask=usemask) + output = genfromtxt(fname, **kwargs) + if usemask: + from numpy.ma.mrecords import MaskedRecords + output = output.view(MaskedRecords) + else: + output = output.view(np.recarray) + return output + + +def recfromcsv(fname, dtype=None, comments='#', skiprows=0, + converters=None, missing='', missing_values=None, + usecols=None, unpack=None, names=True, + excludelist=None, deletechars=None, case_sensitive='lower', + usemask=False): + """ + Load ASCII data stored in comma-separated file and returns a recarray (if + `usemask=False`) or a MaskedRecords (if `usemask=True`). + + Complete description of all the optional input parameters is available in + the docstring of the `genfromtxt` function. + + See Also + -------- + numpy.genfromtxt : generic function + """ + kwargs = dict(dtype=dtype, comments=comments, delimiter=",", + skiprows=skiprows, converters=converters, + missing=missing, missing_values=missing_values, + usecols=usecols, unpack=unpack, names=names, + excludelist=excludelist, deletechars=deletechars, + case_sensitive=case_sensitive, usemask=usemask) + output = genfromtxt(fname, **kwargs) + if usemask: + from numpy.ma.mrecords import MaskedRecords + output = output.view(MaskedRecords) + else: + output = output.view(np.recarray) + return output + |