summary | refs | log | tree | commit | diff
path: root/numpy/lib/_iotools.py
diff options
context:
space:
mode:
author: Julian Taylor <jtaylor.debian@googlemail.com> 2017-04-03 14:20:36 +0200
committer: Charles Harris <charlesr.harris@gmail.com> 2017-11-21 10:16:00 -0700
commit: d8edc62e8c9e69280fb8a171c7678b2fea929696 (patch)
tree: aa6813116f4f72bf7270be2fdb1537abe8776f02 /numpy/lib/_iotools.py
parent: b6044d88cab21d7ebe274bcd79bc430a57c520e6 (diff)
download: numpy-d8edc62e8c9e69280fb8a171c7678b2fea929696.tar.gz
ENH: Add encoding option to numpy text IO.
This modifies loadtxt and genfromtxt in several ways intended to add unicode support for text files by adding an `encoding` keyword to np.loadtxt, np.genfromtxt, np.savetxt, and np.fromregex. The original treatment of the relevant files was to open them as byte files, whereas they are now opened as text files with an encoding. When read, they are decoded to unicode strings for Python3 compatibility, and when written, they are encoded as specified. For backward compatibility, the default encoding in both cases is latin1.
Diffstat (limited to 'numpy/lib/_iotools.py')
-rw-r--r-- numpy/lib/_iotools.py 62
1 files changed, 29 insertions, 33 deletions
diff --git a/numpy/lib/_iotools.py b/numpy/lib/_iotools.py
index 1874c2e97..8e091d42d 100644
--- a/numpy/lib/_iotools.py
+++ b/numpy/lib/_iotools.py
@@ -8,7 +8,7 @@ __docformat__ = "restructuredtext en"
import sys
import numpy as np
import numpy.core.numeric as nx
-from numpy.compat import asbytes, bytes, asbytes_nested, basestring
+from numpy.compat import asbytes, asunicode, bytes, asbytes_nested, basestring
if sys.version_info[0] >= 3:
from builtins import bool, int, float, complex, object, str
@@ -17,15 +17,15 @@ else:
from __builtin__ import bool, int, float, complex, object, unicode, str
-if sys.version_info[0] >= 3:
- def _bytes_to_complex(s):
- return complex(s.decode('ascii'))
+def _decode_line(line, encoding=None):
+ """ decode bytes from binary input streams, default to latin1 """
+ if type(line) is bytes:
+ if encoding is None:
+ line = line.decode('latin1')
+ else:
+ line = line.decode(encoding)
- def _bytes_to_name(s):
- return s.decode('ascii')
-else:
- _bytes_to_complex = complex
- _bytes_to_name = str
+ return line
def _is_string_like(obj):
@@ -189,12 +189,10 @@ class LineSplitter(object):
return lambda input: [_.strip() for _ in method(input)]
#
- def __init__(self, delimiter=None, comments=b'#', autostrip=True):
+ def __init__(self, delimiter=None, comments='#', autostrip=True, encoding=None):
self.comments = comments
# Delimiter is a character
- if isinstance(delimiter, unicode):
- delimiter = delimiter.encode('ascii')
- if (delimiter is None) or _is_bytes_like(delimiter):
+ if (delimiter is None) or isinstance(delimiter, basestring):
delimiter = delimiter or None
_handyman = self._delimited_splitter
# Delimiter is a list of field widths
@@ -213,12 +211,14 @@ class LineSplitter(object):
self._handyman = self.autostrip(_handyman)
else:
self._handyman = _handyman
+ self.encoding = encoding
#
def _delimited_splitter(self, line):
+ """Chop off comments, strip, and split at delimiter. """
if self.comments is not None:
line = line.split(self.comments)[0]
- line = line.strip(b" \r\n")
+ line = line.strip(" \r\n")
if not line:
return []
return line.split(self.delimiter)
@@ -227,7 +227,7 @@ class LineSplitter(object):
def _fixedwidth_splitter(self, line):
if self.comments is not None:
line = line.split(self.comments)[0]
- line = line.strip(b"\r\n")
+ line = line.strip("\r\n")
if not line:
return []
fixed = self.delimiter
@@ -245,7 +245,7 @@ class LineSplitter(object):
#
def __call__(self, line):
- return self._handyman(line)
+ return self._handyman(_decode_line(line, self.encoding))
class NameValidator(object):
@@ -434,9 +434,9 @@ def str2bool(value):
"""
value = value.upper()
- if value == b'TRUE':
+ if value == 'TRUE':
return True
- elif value == b'FALSE':
+ elif value == 'FALSE':
return False
else:
raise ValueError("Invalid boolean")
@@ -527,9 +527,10 @@ class StringConverter(object):
_mapper.append((nx.int64, int, -1))
_mapper.extend([(nx.floating, float, nx.nan),
- (nx.complexfloating, _bytes_to_complex, nx.nan + 0j),
+ (nx.complexfloating, complex, nx.nan + 0j),
(nx.longdouble, nx.longdouble, nx.nan),
- (nx.string_, bytes, b'???')])
+ (nx.unicode_, asunicode, '???'),
+ (nx.string_, asbytes, '???')])
(_defaulttype, _defaultfunc, _defaultfill) = zip(*_mapper)
@@ -601,11 +602,6 @@ class StringConverter(object):
def __init__(self, dtype_or_func=None, default=None, missing_values=None,
locked=False):
- # Convert unicode (for Py3)
- if isinstance(missing_values, unicode):
- missing_values = asbytes(missing_values)
- elif isinstance(missing_values, (list, tuple)):
- missing_values = asbytes_nested(missing_values)
# Defines a lock for upgrade
self._locked = bool(locked)
# No input dtype: minimal initialization
@@ -631,7 +627,7 @@ class StringConverter(object):
# None
if default is None:
try:
- default = self.func(b'0')
+ default = self.func('0')
except ValueError:
default = None
dtype = self._getdtype(default)
@@ -676,11 +672,11 @@ class StringConverter(object):
self.func = lambda x: int(float(x))
# Store the list of strings corresponding to missing values.
if missing_values is None:
- self.missing_values = set([b''])
+ self.missing_values = set([''])
else:
- if isinstance(missing_values, bytes):
- missing_values = missing_values.split(b",")
- self.missing_values = set(list(missing_values) + [b''])
+ if isinstance(missing_values, basestring):
+ missing_values = missing_values.split(",")
+ self.missing_values = set(list(missing_values) + [''])
#
self._callingfunction = self._strict_call
self.type = self._dtypeortype(dtype)
@@ -801,7 +797,7 @@ class StringConverter(object):
self.iterupgrade(value)
def update(self, func, default=None, testing_value=None,
- missing_values=b'', locked=False):
+ missing_values='', locked=False):
"""
Set StringConverter attributes directly.
@@ -838,13 +834,13 @@ class StringConverter(object):
self.type = self._dtypeortype(self._getdtype(default))
else:
try:
- tester = func(testing_value or b'1')
+ tester = func(testing_value or '1')
except (TypeError, ValueError):
tester = None
self.type = self._dtypeortype(self._getdtype(tester))
# Add the missing values to the existing set
if missing_values is not None:
- if _is_bytes_like(missing_values):
+ if isinstance(missing_values, basestring):
self.missing_values.add(missing_values)
elif hasattr(missing_values, '__iter__'):
for val in missing_values: