diff options
author | Julian Taylor <jtaylor.debian@googlemail.com> | 2017-04-03 14:20:36 +0200 |
---|---|---|
committer | Charles Harris <charlesr.harris@gmail.com> | 2017-11-21 10:16:00 -0700 |
commit | d8edc62e8c9e69280fb8a171c7678b2fea929696 (patch) | |
tree | aa6813116f4f72bf7270be2fdb1537abe8776f02 /numpy/lib/_iotools.py | |
parent | b6044d88cab21d7ebe274bcd79bc430a57c520e6 (diff) | |
download | numpy-d8edc62e8c9e69280fb8a171c7678b2fea929696.tar.gz |
ENH: Add encoding option to numpy text IO.
This modifies loadtxt and genfromtxt in several ways intended to add
unicode support for text files by adding an `encoding` keyword to
np.loadtxt, np.genfromtxt, np.savetxt, and np.fromregex. The original
treatment of the relevant files was to open them as byte
files, whereas they are now opened as text files with an encoding. When
read, they are decoded to unicode strings for Python3 compatibility,
and when written, they are encoded as specified. For backward
compatibility, the default encoding in both cases is latin1.
Diffstat (limited to 'numpy/lib/_iotools.py')
-rw-r--r-- | numpy/lib/_iotools.py | 62 |
1 files changed, 29 insertions, 33 deletions
diff --git a/numpy/lib/_iotools.py b/numpy/lib/_iotools.py index 1874c2e97..8e091d42d 100644 --- a/numpy/lib/_iotools.py +++ b/numpy/lib/_iotools.py @@ -8,7 +8,7 @@ __docformat__ = "restructuredtext en" import sys import numpy as np import numpy.core.numeric as nx -from numpy.compat import asbytes, bytes, asbytes_nested, basestring +from numpy.compat import asbytes, asunicode, bytes, asbytes_nested, basestring if sys.version_info[0] >= 3: from builtins import bool, int, float, complex, object, str @@ -17,15 +17,15 @@ else: from __builtin__ import bool, int, float, complex, object, unicode, str -if sys.version_info[0] >= 3: - def _bytes_to_complex(s): - return complex(s.decode('ascii')) +def _decode_line(line, encoding=None): + """ decode bytes from binary input streams, default to latin1 """ + if type(line) is bytes: + if encoding is None: + line = line.decode('latin1') + else: + line = line.decode(encoding) - def _bytes_to_name(s): - return s.decode('ascii') -else: - _bytes_to_complex = complex - _bytes_to_name = str + return line def _is_string_like(obj): @@ -189,12 +189,10 @@ class LineSplitter(object): return lambda input: [_.strip() for _ in method(input)] # - def __init__(self, delimiter=None, comments=b'#', autostrip=True): + def __init__(self, delimiter=None, comments='#', autostrip=True, encoding=None): self.comments = comments # Delimiter is a character - if isinstance(delimiter, unicode): - delimiter = delimiter.encode('ascii') - if (delimiter is None) or _is_bytes_like(delimiter): + if (delimiter is None) or isinstance(delimiter, basestring): delimiter = delimiter or None _handyman = self._delimited_splitter # Delimiter is a list of field widths @@ -213,12 +211,14 @@ class LineSplitter(object): self._handyman = self.autostrip(_handyman) else: self._handyman = _handyman + self.encoding = encoding # def _delimited_splitter(self, line): + """Chop off comments, strip, and split at delimiter. 
""" if self.comments is not None: line = line.split(self.comments)[0] - line = line.strip(b" \r\n") + line = line.strip(" \r\n") if not line: return [] return line.split(self.delimiter) @@ -227,7 +227,7 @@ class LineSplitter(object): def _fixedwidth_splitter(self, line): if self.comments is not None: line = line.split(self.comments)[0] - line = line.strip(b"\r\n") + line = line.strip("\r\n") if not line: return [] fixed = self.delimiter @@ -245,7 +245,7 @@ class LineSplitter(object): # def __call__(self, line): - return self._handyman(line) + return self._handyman(_decode_line(line, self.encoding)) class NameValidator(object): @@ -434,9 +434,9 @@ def str2bool(value): """ value = value.upper() - if value == b'TRUE': + if value == 'TRUE': return True - elif value == b'FALSE': + elif value == 'FALSE': return False else: raise ValueError("Invalid boolean") @@ -527,9 +527,10 @@ class StringConverter(object): _mapper.append((nx.int64, int, -1)) _mapper.extend([(nx.floating, float, nx.nan), - (nx.complexfloating, _bytes_to_complex, nx.nan + 0j), + (nx.complexfloating, complex, nx.nan + 0j), (nx.longdouble, nx.longdouble, nx.nan), - (nx.string_, bytes, b'???')]) + (nx.unicode_, asunicode, '???'), + (nx.string_, asbytes, '???')]) (_defaulttype, _defaultfunc, _defaultfill) = zip(*_mapper) @@ -601,11 +602,6 @@ class StringConverter(object): def __init__(self, dtype_or_func=None, default=None, missing_values=None, locked=False): - # Convert unicode (for Py3) - if isinstance(missing_values, unicode): - missing_values = asbytes(missing_values) - elif isinstance(missing_values, (list, tuple)): - missing_values = asbytes_nested(missing_values) # Defines a lock for upgrade self._locked = bool(locked) # No input dtype: minimal initialization @@ -631,7 +627,7 @@ class StringConverter(object): # None if default is None: try: - default = self.func(b'0') + default = self.func('0') except ValueError: default = None dtype = self._getdtype(default) @@ -676,11 +672,11 @@ class 
StringConverter(object): self.func = lambda x: int(float(x)) # Store the list of strings corresponding to missing values. if missing_values is None: - self.missing_values = set([b'']) + self.missing_values = set(['']) else: - if isinstance(missing_values, bytes): - missing_values = missing_values.split(b",") - self.missing_values = set(list(missing_values) + [b'']) + if isinstance(missing_values, basestring): + missing_values = missing_values.split(",") + self.missing_values = set(list(missing_values) + ['']) # self._callingfunction = self._strict_call self.type = self._dtypeortype(dtype) @@ -801,7 +797,7 @@ class StringConverter(object): self.iterupgrade(value) def update(self, func, default=None, testing_value=None, - missing_values=b'', locked=False): + missing_values='', locked=False): """ Set StringConverter attributes directly. @@ -838,13 +834,13 @@ class StringConverter(object): self.type = self._dtypeortype(self._getdtype(default)) else: try: - tester = func(testing_value or b'1') + tester = func(testing_value or '1') except (TypeError, ValueError): tester = None self.type = self._dtypeortype(self._getdtype(tester)) # Add the missing values to the existing set if missing_values is not None: - if _is_bytes_like(missing_values): + if isinstance(missing_values, basestring): self.missing_values.add(missing_values) elif hasattr(missing_values, '__iter__'): for val in missing_values: |