diff options
Diffstat (limited to 'numpy/lib/_iotools.py')
-rw-r--r-- | numpy/lib/_iotools.py | 141 |
1 files changed, 85 insertions, 56 deletions
diff --git a/numpy/lib/_iotools.py b/numpy/lib/_iotools.py index 67e21aa0c..0ebd39b8c 100644 --- a/numpy/lib/_iotools.py +++ b/numpy/lib/_iotools.py @@ -8,7 +8,7 @@ __docformat__ = "restructuredtext en" import sys import numpy as np import numpy.core.numeric as nx -from numpy.compat import asbytes, bytes, asbytes_nested, basestring +from numpy.compat import asbytes, asunicode, bytes, basestring if sys.version_info[0] >= 3: from builtins import bool, int, float, complex, object, str @@ -17,15 +17,30 @@ else: from __builtin__ import bool, int, float, complex, object, unicode, str -if sys.version_info[0] >= 3: - def _bytes_to_complex(s): - return complex(s.decode('ascii')) +def _decode_line(line, encoding=None): + """Decode bytes from binary input streams. - def _bytes_to_name(s): - return s.decode('ascii') -else: - _bytes_to_complex = complex - _bytes_to_name = str + Defaults to decoding from 'latin1'. That differs from the behavior of + np.compat.asunicode that decodes from 'ascii'. + + Parameters + ---------- + line : str or bytes + Line to be decoded. + + Returns + ------- + decoded_line : unicode + Unicode in Python 2, a str (unicode) in Python 3. + + """ + if type(line) is bytes: + if encoding is None: + line = line.decode('latin1') + else: + line = line.decode(encoding) + + return line def _is_string_like(obj): @@ -44,7 +59,7 @@ def _is_bytes_like(obj): Check whether obj behaves like a bytes object. """ try: - obj + asbytes('') + obj + b'' except (TypeError, ValueError): return False return True @@ -122,19 +137,26 @@ def flatten_dtype(ndtype, flatten_base=False): ---------- ndtype : dtype The datatype to collapse - flatten_base : {False, True}, optional - Whether to transform a field with a shape into several fields or not. + flatten_base : bool, optional + If True, transform a field with a shape into several fields. Default is + False. Examples -------- >>> dt = np.dtype([('name', 'S4'), ('x', float), ('y', float), ... ('block', int, (2, 3))]) >>> np.lib._iotools.flatten_dtype(dt) - [dtype('|S4'), dtype('float64'), dtype('float64'), dtype('int32')] + [dtype('S4'), dtype('float64'), dtype('float64'), dtype('int64')] >>> np.lib._iotools.flatten_dtype(dt, flatten_base=True) - [dtype('|S4'), dtype('float64'), dtype('float64'), dtype('int32'), - dtype('int32'), dtype('int32'), dtype('int32'), dtype('int32'), - dtype('int32')] + [dtype('S4'), + dtype('float64'), + dtype('float64'), + dtype('int64'), + dtype('int64'), + dtype('int64'), + dtype('int64'), + dtype('int64'), + dtype('int64')] """ names = ndtype.names @@ -188,12 +210,14 @@ class LineSplitter(object): return lambda input: [_.strip() for _ in method(input)] # - def __init__(self, delimiter=None, comments=asbytes('#'), autostrip=True): + def __init__(self, delimiter=None, comments='#', autostrip=True, encoding=None): + delimiter = _decode_line(delimiter) + comments = _decode_line(comments) + self.comments = comments + # Delimiter is a character - if isinstance(delimiter, unicode): - delimiter = delimiter.encode('ascii') - if (delimiter is None) or _is_bytes_like(delimiter): + if (delimiter is None) or isinstance(delimiter, basestring): delimiter = delimiter or None _handyman = self._delimited_splitter # Delimiter is a list of field widths @@ -212,12 +236,14 @@ class LineSplitter(object): self._handyman = self.autostrip(_handyman) else: self._handyman = _handyman + self.encoding = encoding # def _delimited_splitter(self, line): + """Chop off comments, strip, and split at delimiter. """ if self.comments is not None: line = line.split(self.comments)[0] - line = line.strip(asbytes(" \r\n")) + line = line.strip(" \r\n") if not line: return [] return line.split(self.delimiter) @@ -226,7 +252,7 @@ class LineSplitter(object): def _fixedwidth_splitter(self, line): if self.comments is not None: line = line.split(self.comments)[0] - line = line.strip(asbytes("\r\n")) + line = line.strip("\r\n") if not line: return [] fixed = self.delimiter @@ -244,7 +270,7 @@ class LineSplitter(object): # def __call__(self, line): - return self._handyman(line) + return self._handyman(_decode_line(line, self.encoding)) class NameValidator(object): @@ -289,13 +315,13 @@ class NameValidator(object): -------- >>> validator = np.lib._iotools.NameValidator() >>> validator(['file', 'field2', 'with space', 'CaSe']) - ['file_', 'field2', 'with_space', 'CaSe'] + ('file_', 'field2', 'with_space', 'CaSe') >>> validator = np.lib._iotools.NameValidator(excludelist=['excl'], - deletechars='q', - case_sensitive='False') + ... deletechars='q', + ... case_sensitive=False) >>> validator(['excl', 'field2', 'no_q', 'with space', 'CaSe']) - ['excl_', 'field2', 'no_', 'with_space', 'case'] + ('EXCL', 'FIELD2', 'NO_Q', 'WITH_SPACE', 'CASE') """ # @@ -433,9 +459,9 @@ def str2bool(value): """ value = value.upper() - if value == asbytes('TRUE'): + if value == 'TRUE': return True - elif value == asbytes('FALSE'): + elif value == 'FALSE': return False else: raise ValueError("Invalid boolean") @@ -509,8 +535,10 @@ class StringConverter(object): Value to return by default, that is, when the string to be converted is flagged as missing. If not given, `StringConverter` tries to supply a reasonable default value. - missing_values : sequence of str, optional - Sequence of strings indicating a missing value. + missing_values : {None, sequence of str}, optional + ``None`` or sequence of strings indicating a missing value. If ``None`` + then missing values are indicated by empty entries. The default is + ``None``. locked : bool, optional Whether the StringConverter should be locked to prevent automatic upgrade or not. Default is False. @@ -526,9 +554,10 @@ class StringConverter(object): _mapper.append((nx.int64, int, -1)) _mapper.extend([(nx.floating, float, nx.nan), - (complex, _bytes_to_complex, nx.nan + 0j), + (nx.complexfloating, complex, nx.nan + 0j), (nx.longdouble, nx.longdouble, nx.nan), - (nx.string_, bytes, asbytes('???'))]) + (nx.unicode_, asunicode, '???'), + (nx.string_, asbytes, '???')]) (_defaulttype, _defaultfunc, _defaultfill) = zip(*_mapper) @@ -576,7 +605,7 @@ class StringConverter(object): -------- >>> import dateutil.parser >>> import datetime - >>> dateparser = datetustil.parser.parse + >>> dateparser = dateutil.parser.parse >>> defaultdate = datetime.date(2000, 1, 1) >>> StringConverter.upgrade_mapper(dateparser, default=defaultdate) """ @@ -600,11 +629,6 @@ class StringConverter(object): def __init__(self, dtype_or_func=None, default=None, missing_values=None, locked=False): - # Convert unicode (for Py3) - if isinstance(missing_values, unicode): - missing_values = asbytes(missing_values) - elif isinstance(missing_values, (list, tuple)): - missing_values = asbytes_nested(missing_values) # Defines a lock for upgrade self._locked = bool(locked) # No input dtype: minimal initialization @@ -630,7 +654,7 @@ class StringConverter(object): # None if default is None: try: - default = self.func(asbytes('0')) + default = self.func('0') except ValueError: default = None dtype = self._getdtype(default) @@ -675,11 +699,11 @@ class StringConverter(object): self.func = lambda x: int(float(x)) # Store the list of strings corresponding to missing values. if missing_values is None: - self.missing_values = set([asbytes('')]) + self.missing_values = {''} else: - if isinstance(missing_values, bytes): - missing_values = missing_values.split(asbytes(",")) - self.missing_values = set(list(missing_values) + [asbytes('')]) + if isinstance(missing_values, basestring): + missing_values = missing_values.split(",") + self.missing_values = set(list(missing_values) + ['']) # self._callingfunction = self._strict_call self.type = self._dtypeortype(dtype) @@ -800,7 +824,7 @@ class StringConverter(object): self.iterupgrade(value) def update(self, func, default=None, testing_value=None, - missing_values=asbytes(''), locked=False): + missing_values='', locked=False): """ Set StringConverter attributes directly. @@ -816,8 +840,9 @@ class StringConverter(object): A string representing a standard input value of the converter. This string is used to help defining a reasonable default value. - missing_values : sequence of str, optional - Sequence of strings indicating a missing value. + missing_values : {sequence of str, None}, optional + Sequence of strings indicating a missing value. If ``None``, then + the existing `missing_values` are cleared. The default is `''`. locked : bool, optional Whether the StringConverter should be locked to prevent automatic upgrade or not. Default is False. @@ -831,25 +856,29 @@ class StringConverter(object): """ self.func = func self._locked = locked + # Don't reset the default to None if we can avoid it if default is not None: self.default = default self.type = self._dtypeortype(self._getdtype(default)) else: try: - tester = func(testing_value or asbytes('1')) + tester = func(testing_value or '1') except (TypeError, ValueError): tester = None self.type = self._dtypeortype(self._getdtype(tester)) - # Add the missing values to the existing set - if missing_values is not None: - if _is_bytes_like(missing_values): - self.missing_values.add(missing_values) - elif hasattr(missing_values, '__iter__'): - for val in missing_values: - self.missing_values.add(val) + + # Add the missing values to the existing set or clear it. + if missing_values is None: + # Clear all missing values even though the ctor initializes it to + # set(['']) when the argument is None. + self.missing_values = set() else: - self.missing_values = [] + if not np.iterable(missing_values): + missing_values = [missing_values] + if not all(isinstance(v, basestring) for v in missing_values): + raise TypeError("missing_values must be strings or unicode") + self.missing_values.update(missing_values) def easy_dtype(ndtype, names=None, defaultfmt="f%i", **validationargs): |