summaryrefslogtreecommitdiff
path: root/numpy/lib/_iotools.py
diff options
context:
space:
mode:
Diffstat (limited to 'numpy/lib/_iotools.py')
-rw-r--r--numpy/lib/_iotools.py141
1 files changed, 85 insertions, 56 deletions
diff --git a/numpy/lib/_iotools.py b/numpy/lib/_iotools.py
index 67e21aa0c..0ebd39b8c 100644
--- a/numpy/lib/_iotools.py
+++ b/numpy/lib/_iotools.py
@@ -8,7 +8,7 @@ __docformat__ = "restructuredtext en"
import sys
import numpy as np
import numpy.core.numeric as nx
-from numpy.compat import asbytes, bytes, asbytes_nested, basestring
+from numpy.compat import asbytes, asunicode, bytes, basestring
if sys.version_info[0] >= 3:
from builtins import bool, int, float, complex, object, str
@@ -17,15 +17,30 @@ else:
from __builtin__ import bool, int, float, complex, object, unicode, str
-if sys.version_info[0] >= 3:
- def _bytes_to_complex(s):
- return complex(s.decode('ascii'))
+def _decode_line(line, encoding=None):
+ """Decode bytes from binary input streams.
- def _bytes_to_name(s):
- return s.decode('ascii')
-else:
- _bytes_to_complex = complex
- _bytes_to_name = str
+ Defaults to decoding from 'latin1'. That differs from the behavior of
+ np.compat.asunicode that decodes from 'ascii'.
+
+ Parameters
+ ----------
+ line : str or bytes
+ Line to be decoded.
+
+ Returns
+ -------
+ decoded_line : unicode
+ Unicode in Python 2, a str (unicode) in Python 3.
+
+ """
+ if type(line) is bytes:
+ if encoding is None:
+ line = line.decode('latin1')
+ else:
+ line = line.decode(encoding)
+
+ return line
def _is_string_like(obj):
@@ -44,7 +59,7 @@ def _is_bytes_like(obj):
Check whether obj behaves like a bytes object.
"""
try:
- obj + asbytes('')
+ obj + b''
except (TypeError, ValueError):
return False
return True
@@ -122,19 +137,26 @@ def flatten_dtype(ndtype, flatten_base=False):
----------
ndtype : dtype
The datatype to collapse
- flatten_base : {False, True}, optional
- Whether to transform a field with a shape into several fields or not.
+ flatten_base : bool, optional
+ If True, transform a field with a shape into several fields. Default is
+ False.
Examples
--------
>>> dt = np.dtype([('name', 'S4'), ('x', float), ('y', float),
... ('block', int, (2, 3))])
>>> np.lib._iotools.flatten_dtype(dt)
- [dtype('|S4'), dtype('float64'), dtype('float64'), dtype('int32')]
+ [dtype('S4'), dtype('float64'), dtype('float64'), dtype('int64')]
>>> np.lib._iotools.flatten_dtype(dt, flatten_base=True)
- [dtype('|S4'), dtype('float64'), dtype('float64'), dtype('int32'),
- dtype('int32'), dtype('int32'), dtype('int32'), dtype('int32'),
- dtype('int32')]
+ [dtype('S4'),
+ dtype('float64'),
+ dtype('float64'),
+ dtype('int64'),
+ dtype('int64'),
+ dtype('int64'),
+ dtype('int64'),
+ dtype('int64'),
+ dtype('int64')]
"""
names = ndtype.names
@@ -188,12 +210,14 @@ class LineSplitter(object):
return lambda input: [_.strip() for _ in method(input)]
#
- def __init__(self, delimiter=None, comments=asbytes('#'), autostrip=True):
+ def __init__(self, delimiter=None, comments='#', autostrip=True, encoding=None):
+ delimiter = _decode_line(delimiter)
+ comments = _decode_line(comments)
+
self.comments = comments
+
# Delimiter is a character
- if isinstance(delimiter, unicode):
- delimiter = delimiter.encode('ascii')
- if (delimiter is None) or _is_bytes_like(delimiter):
+ if (delimiter is None) or isinstance(delimiter, basestring):
delimiter = delimiter or None
_handyman = self._delimited_splitter
# Delimiter is a list of field widths
@@ -212,12 +236,14 @@ class LineSplitter(object):
self._handyman = self.autostrip(_handyman)
else:
self._handyman = _handyman
+ self.encoding = encoding
#
def _delimited_splitter(self, line):
+ """Chop off comments, strip, and split at delimiter. """
if self.comments is not None:
line = line.split(self.comments)[0]
- line = line.strip(asbytes(" \r\n"))
+ line = line.strip(" \r\n")
if not line:
return []
return line.split(self.delimiter)
@@ -226,7 +252,7 @@ class LineSplitter(object):
def _fixedwidth_splitter(self, line):
if self.comments is not None:
line = line.split(self.comments)[0]
- line = line.strip(asbytes("\r\n"))
+ line = line.strip("\r\n")
if not line:
return []
fixed = self.delimiter
@@ -244,7 +270,7 @@ class LineSplitter(object):
#
def __call__(self, line):
- return self._handyman(line)
+ return self._handyman(_decode_line(line, self.encoding))
class NameValidator(object):
@@ -289,13 +315,13 @@ class NameValidator(object):
--------
>>> validator = np.lib._iotools.NameValidator()
>>> validator(['file', 'field2', 'with space', 'CaSe'])
- ['file_', 'field2', 'with_space', 'CaSe']
+ ('file_', 'field2', 'with_space', 'CaSe')
>>> validator = np.lib._iotools.NameValidator(excludelist=['excl'],
- deletechars='q',
- case_sensitive='False')
+ ... deletechars='q',
+ ... case_sensitive=False)
>>> validator(['excl', 'field2', 'no_q', 'with space', 'CaSe'])
- ['excl_', 'field2', 'no_', 'with_space', 'case']
+ ('EXCL', 'FIELD2', 'NO_Q', 'WITH_SPACE', 'CASE')
"""
#
@@ -433,9 +459,9 @@ def str2bool(value):
"""
value = value.upper()
- if value == asbytes('TRUE'):
+ if value == 'TRUE':
return True
- elif value == asbytes('FALSE'):
+ elif value == 'FALSE':
return False
else:
raise ValueError("Invalid boolean")
@@ -509,8 +535,10 @@ class StringConverter(object):
Value to return by default, that is, when the string to be
converted is flagged as missing. If not given, `StringConverter`
tries to supply a reasonable default value.
- missing_values : sequence of str, optional
- Sequence of strings indicating a missing value.
+ missing_values : {None, sequence of str}, optional
+ ``None`` or sequence of strings indicating a missing value. If ``None``
+ then missing values are indicated by empty entries. The default is
+ ``None``.
locked : bool, optional
Whether the StringConverter should be locked to prevent automatic
upgrade or not. Default is False.
@@ -526,9 +554,10 @@ class StringConverter(object):
_mapper.append((nx.int64, int, -1))
_mapper.extend([(nx.floating, float, nx.nan),
- (complex, _bytes_to_complex, nx.nan + 0j),
+ (nx.complexfloating, complex, nx.nan + 0j),
(nx.longdouble, nx.longdouble, nx.nan),
- (nx.string_, bytes, asbytes('???'))])
+ (nx.unicode_, asunicode, '???'),
+ (nx.string_, asbytes, '???')])
(_defaulttype, _defaultfunc, _defaultfill) = zip(*_mapper)
@@ -576,7 +605,7 @@ class StringConverter(object):
--------
>>> import dateutil.parser
>>> import datetime
- >>> dateparser = datetustil.parser.parse
+ >>> dateparser = dateutil.parser.parse
>>> defaultdate = datetime.date(2000, 1, 1)
>>> StringConverter.upgrade_mapper(dateparser, default=defaultdate)
"""
@@ -600,11 +629,6 @@ class StringConverter(object):
def __init__(self, dtype_or_func=None, default=None, missing_values=None,
locked=False):
- # Convert unicode (for Py3)
- if isinstance(missing_values, unicode):
- missing_values = asbytes(missing_values)
- elif isinstance(missing_values, (list, tuple)):
- missing_values = asbytes_nested(missing_values)
# Defines a lock for upgrade
self._locked = bool(locked)
# No input dtype: minimal initialization
@@ -630,7 +654,7 @@ class StringConverter(object):
# None
if default is None:
try:
- default = self.func(asbytes('0'))
+ default = self.func('0')
except ValueError:
default = None
dtype = self._getdtype(default)
@@ -675,11 +699,11 @@ class StringConverter(object):
self.func = lambda x: int(float(x))
# Store the list of strings corresponding to missing values.
if missing_values is None:
- self.missing_values = set([asbytes('')])
+ self.missing_values = {''}
else:
- if isinstance(missing_values, bytes):
- missing_values = missing_values.split(asbytes(","))
- self.missing_values = set(list(missing_values) + [asbytes('')])
+ if isinstance(missing_values, basestring):
+ missing_values = missing_values.split(",")
+ self.missing_values = set(list(missing_values) + [''])
#
self._callingfunction = self._strict_call
self.type = self._dtypeortype(dtype)
@@ -800,7 +824,7 @@ class StringConverter(object):
self.iterupgrade(value)
def update(self, func, default=None, testing_value=None,
- missing_values=asbytes(''), locked=False):
+ missing_values='', locked=False):
"""
Set StringConverter attributes directly.
@@ -816,8 +840,9 @@ class StringConverter(object):
A string representing a standard input value of the converter.
This string is used to help defining a reasonable default
value.
- missing_values : sequence of str, optional
- Sequence of strings indicating a missing value.
+ missing_values : {sequence of str, None}, optional
+ Sequence of strings indicating a missing value. If ``None``, then
+ the existing `missing_values` are cleared. The default is `''`.
locked : bool, optional
Whether the StringConverter should be locked to prevent
automatic upgrade or not. Default is False.
@@ -831,25 +856,29 @@ class StringConverter(object):
"""
self.func = func
self._locked = locked
+
# Don't reset the default to None if we can avoid it
if default is not None:
self.default = default
self.type = self._dtypeortype(self._getdtype(default))
else:
try:
- tester = func(testing_value or asbytes('1'))
+ tester = func(testing_value or '1')
except (TypeError, ValueError):
tester = None
self.type = self._dtypeortype(self._getdtype(tester))
- # Add the missing values to the existing set
- if missing_values is not None:
- if _is_bytes_like(missing_values):
- self.missing_values.add(missing_values)
- elif hasattr(missing_values, '__iter__'):
- for val in missing_values:
- self.missing_values.add(val)
+
+ # Add the missing values to the existing set or clear it.
+ if missing_values is None:
+ # Clear all missing values even though the ctor initializes it to
+ # set(['']) when the argument is None.
+ self.missing_values = set()
else:
- self.missing_values = []
+ if not np.iterable(missing_values):
+ missing_values = [missing_values]
+ if not all(isinstance(v, basestring) for v in missing_values):
+ raise TypeError("missing_values must be strings or unicode")
+ self.missing_values.update(missing_values)
def easy_dtype(ndtype, names=None, defaultfmt="f%i", **validationargs):