1 files changed, 85 insertions, 56 deletions
diff --git a/numpy/lib/_iotools.py b/numpy/lib/_iotools.py
index 67e21aa0c..0ebd39b8c 100644
--- a/numpy/lib/_iotools.py
+++ b/numpy/lib/_iotools.py
@@ -8,7 +8,7 @@ __docformat__ = "restructuredtext en"
 import sys
 import numpy as np
 import numpy.core.numeric as nx
-from numpy.compat import asbytes, bytes, asbytes_nested, basestring
+from numpy.compat import asbytes, asunicode, bytes, basestring
 
 if sys.version_info[0] >= 3:
     from builtins import bool, int, float, complex, object, str
@@ -17,15 +17,30 @@ else:
     from __builtin__ import bool, int, float, complex, object, unicode, str
 
 
-if sys.version_info[0] >= 3:
-    def _bytes_to_complex(s):
-        return complex(s.decode('ascii'))
+def _decode_line(line, encoding=None):
+    """Decode bytes from binary input streams.
 
-    def _bytes_to_name(s):
-        return s.decode('ascii')
-else:
-    _bytes_to_complex = complex
-    _bytes_to_name = str
+    Defaults to decoding from 'latin1'. That differs from the behavior of
+    np.compat.asunicode that decodes from 'ascii'.
+
+    Parameters
+    ----------
+    line : str or bytes
+         Line to be decoded.
+
+    Returns
+    -------
+    decoded_line : unicode
+         Unicode in Python 2, a str (unicode) in Python 3.
+
+    """
+    if type(line) is bytes:
+        if encoding is None:
+            line = line.decode('latin1')
+        else:
+            line = line.decode(encoding)
+
+    return line
 
 
 def _is_string_like(obj):
@@ -44,7 +59,7 @@ def _is_bytes_like(obj):
     Check whether obj behaves like a bytes object.
     """
     try:
-        obj + asbytes('')
+        obj + b''
     except (TypeError, ValueError):
         return False
     return True
@@ -122,19 +137,26 @@ def flatten_dtype(ndtype, flatten_base=False):
     ----------
     ndtype : dtype
         The datatype to collapse
-    flatten_base : {False, True}, optional
-        Whether to transform a field with a shape into several fields or not.
+    flatten_base : bool, optional
+       If True, transform a field with a shape into several fields. Default is
+       False.
 
     Examples
     --------
     >>> dt = np.dtype([('name', 'S4'), ('x', float), ('y', float),
     ...                ('block', int, (2, 3))])
     >>> np.lib._iotools.flatten_dtype(dt)
-    [dtype('|S4'), dtype('float64'), dtype('float64'), dtype('int32')]
+    [dtype('S4'), dtype('float64'), dtype('float64'), dtype('int64')]
     >>> np.lib._iotools.flatten_dtype(dt, flatten_base=True)
-    [dtype('|S4'), dtype('float64'), dtype('float64'), dtype('int32'),
-     dtype('int32'), dtype('int32'), dtype('int32'), dtype('int32'),
-     dtype('int32')]
+    [dtype('S4'),
+     dtype('float64'),
+     dtype('float64'),
+     dtype('int64'),
+     dtype('int64'),
+     dtype('int64'),
+     dtype('int64'),
+     dtype('int64'),
+     dtype('int64')]
 
     """
     names = ndtype.names
@@ -188,12 +210,14 @@ class LineSplitter(object):
         return lambda input: [_.strip() for _ in method(input)]
     #
 
-    def __init__(self, delimiter=None, comments=asbytes('#'), autostrip=True):
+    def __init__(self, delimiter=None, comments='#', autostrip=True, encoding=None):
+        delimiter = _decode_line(delimiter)
+        comments = _decode_line(comments)
+
         self.comments = comments
+
         # Delimiter is a character
-        if isinstance(delimiter, unicode):
-            delimiter = delimiter.encode('ascii')
-        if (delimiter is None) or _is_bytes_like(delimiter):
+        if (delimiter is None) or isinstance(delimiter, basestring):
             delimiter = delimiter or None
             _handyman = self._delimited_splitter
         # Delimiter is a list of field widths
@@ -212,12 +236,14 @@ class LineSplitter(object):
             self._handyman = self.autostrip(_handyman)
         else:
             self._handyman = _handyman
+        self.encoding = encoding
     #
 
     def _delimited_splitter(self, line):
+        """Chop off comments, strip, and split at delimiter. """
         if self.comments is not None:
             line = line.split(self.comments)[0]
-        line = line.strip(asbytes(" \r\n"))
+        line = line.strip(" \r\n")
         if not line:
             return []
         return line.split(self.delimiter)
@@ -226,7 +252,7 @@ class LineSplitter(object):
     def _fixedwidth_splitter(self, line):
         if self.comments is not None:
             line = line.split(self.comments)[0]
-        line = line.strip(asbytes("\r\n"))
+        line = line.strip("\r\n")
         if not line:
             return []
         fixed = self.delimiter
@@ -244,7 +270,7 @@ class LineSplitter(object):
     #
 
     def __call__(self, line):
-        return self._handyman(line)
+        return self._handyman(_decode_line(line, self.encoding))
 
 
 class NameValidator(object):
@@ -289,13 +315,13 @@ class NameValidator(object):
     --------
     >>> validator = np.lib._iotools.NameValidator()
     >>> validator(['file', 'field2', 'with space', 'CaSe'])
-    ['file_', 'field2', 'with_space', 'CaSe']
+    ('file_', 'field2', 'with_space', 'CaSe')
 
     >>> validator = np.lib._iotools.NameValidator(excludelist=['excl'],
-                                                  deletechars='q',
-                                                  case_sensitive='False')
+    ...                                           deletechars='q',
+    ...                                           case_sensitive=False)
     >>> validator(['excl', 'field2', 'no_q', 'with space', 'CaSe'])
-    ['excl_', 'field2', 'no_', 'with_space', 'case']
+    ('EXCL', 'FIELD2', 'NO_Q', 'WITH_SPACE', 'CASE')
 
     """
     #
@@ -433,9 +459,9 @@ def str2bool(value):
 
     """
     value = value.upper()
-    if value == asbytes('TRUE'):
+    if value == 'TRUE':
         return True
-    elif value == asbytes('FALSE'):
+    elif value == 'FALSE':
         return False
     else:
         raise ValueError("Invalid boolean")
@@ -509,8 +535,10 @@ class StringConverter(object):
         Value to return by default, that is, when the string to be
         converted is flagged as missing. If not given, `StringConverter`
         tries to supply a reasonable default value.
-    missing_values : sequence of str, optional
-        Sequence of strings indicating a missing value.
+    missing_values : {None, sequence of str}, optional
+        ``None`` or sequence of strings indicating a missing value. If ``None``
+        then missing values are indicated by empty entries. The default is
+        ``None``.
     locked : bool, optional
         Whether the StringConverter should be locked to prevent automatic
         upgrade or not. Default is False.
@@ -526,9 +554,10 @@ class StringConverter(object):
         _mapper.append((nx.int64, int, -1))
 
     _mapper.extend([(nx.floating, float, nx.nan),
-                    (complex, _bytes_to_complex, nx.nan + 0j),
+                    (nx.complexfloating, complex, nx.nan + 0j),
                     (nx.longdouble, nx.longdouble, nx.nan),
-                    (nx.string_, bytes, asbytes('???'))])
+                    (nx.unicode_, asunicode, '???'),
+                    (nx.string_, asbytes, '???')])
 
     (_defaulttype, _defaultfunc, _defaultfill) = zip(*_mapper)
 
@@ -576,7 +605,7 @@ class StringConverter(object):
     --------
     >>> import dateutil.parser
     >>> import datetime
-    >>> dateparser = datetustil.parser.parse
+    >>> dateparser = dateutil.parser.parse
     >>> defaultdate = datetime.date(2000, 1, 1)
     >>> StringConverter.upgrade_mapper(dateparser, default=defaultdate)
         """
@@ -600,11 +629,6 @@ class StringConverter(object):
 
     def __init__(self, dtype_or_func=None, default=None, missing_values=None,
                  locked=False):
-        # Convert unicode (for Py3)
-        if isinstance(missing_values, unicode):
-            missing_values = asbytes(missing_values)
-        elif isinstance(missing_values, (list, tuple)):
-            missing_values = asbytes_nested(missing_values)
         # Defines a lock for upgrade
         self._locked = bool(locked)
         # No input dtype: minimal initialization
@@ -630,7 +654,7 @@ class StringConverter(object):
                 # None
                 if default is None:
                     try:
-                        default = self.func(asbytes('0'))
+                        default = self.func('0')
                     except ValueError:
                         default = None
                 dtype = self._getdtype(default)
@@ -675,11 +699,11 @@ class StringConverter(object):
                     self.func = lambda x: int(float(x))
         # Store the list of strings corresponding to missing values.
         if missing_values is None:
-            self.missing_values = set([asbytes('')])
+            self.missing_values = {''}
         else:
-            if isinstance(missing_values, bytes):
-                missing_values = missing_values.split(asbytes(","))
-            self.missing_values = set(list(missing_values) + [asbytes('')])
+            if isinstance(missing_values, basestring):
+                missing_values = missing_values.split(",")
+            self.missing_values = set(list(missing_values) + [''])
         #
         self._callingfunction = self._strict_call
         self.type = self._dtypeortype(dtype)
@@ -800,7 +824,7 @@ class StringConverter(object):
             self.iterupgrade(value)
 
     def update(self, func, default=None, testing_value=None,
-               missing_values=asbytes(''), locked=False):
+               missing_values='', locked=False):
         """
         Set StringConverter attributes directly.
 
@@ -816,8 +840,9 @@ class StringConverter(object):
             A string representing a standard input value of the converter.
             This string is used to help defining a reasonable default
             value.
-        missing_values : sequence of str, optional
-            Sequence of strings indicating a missing value.
+        missing_values : {sequence of str, None}, optional
+            Sequence of strings indicating a missing value. If ``None``, then
+            the existing `missing_values` are cleared. The default is `''`.
         locked : bool, optional
             Whether the StringConverter should be locked to prevent
             automatic upgrade or not. Default is False.
@@ -831,25 +856,29 @@ class StringConverter(object):
         """
         self.func = func
         self._locked = locked
+
         # Don't reset the default to None if we can avoid it
         if default is not None:
             self.default = default
             self.type = self._dtypeortype(self._getdtype(default))
         else:
             try:
-                tester = func(testing_value or asbytes('1'))
+                tester = func(testing_value or '1')
             except (TypeError, ValueError):
                 tester = None
             self.type = self._dtypeortype(self._getdtype(tester))
-        # Add the missing values to the existing set
-        if missing_values is not None:
-            if _is_bytes_like(missing_values):
-                self.missing_values.add(missing_values)
-            elif hasattr(missing_values, '__iter__'):
-                for val in missing_values:
-                    self.missing_values.add(val)
+
+        # Add the missing values to the existing set or clear it.
+        if missing_values is None:
+            # Clear all missing values even though the ctor initializes it to
+            # set(['']) when the argument is None.
+            self.missing_values = set()
         else:
-            self.missing_values = []
+            if not np.iterable(missing_values):
+                missing_values = [missing_values]
+            if not all(isinstance(v, basestring) for v in missing_values):
+                raise TypeError("missing_values must be strings or unicode")
+            self.missing_values.update(missing_values)
 
 
 def easy_dtype(ndtype, names=None, defaultfmt="f%i", **validationargs):