1 files changed, 246 insertions, 100 deletions
diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py
index 6de5940d7..fe2aa436b 100644
--- a/numpy/lib/npyio.py
+++ b/numpy/lib/npyio.py
@@ -1,5 +1,6 @@
 from __future__ import division, absolute_import, print_function
 
+import io
 import sys
 import os
 import re
@@ -15,11 +16,12 @@ from numpy.core.multiarray import packbits, unpackbits
 from ._iotools import (
     LineSplitter, NameValidator, StringConverter, ConverterError,
     ConverterLockError, ConversionWarning, _is_string_like,
-    has_nested_fields, flatten_dtype, easy_dtype, _bytes_to_name
+    has_nested_fields, flatten_dtype, easy_dtype, _decode_line
     )
 
 from numpy.compat import (
-    asbytes, asstr, asbytes_nested, bytes, basestring, unicode, is_pathlib_path
+    asbytes, asstr, asunicode, asbytes_nested, bytes, basestring, unicode,
+    is_pathlib_path
     )
 
 if sys.version_info[0] >= 3:
@@ -731,7 +733,7 @@ def _getconv(dtype):
 
     def floatconv(x):
         x.lower()
-        if b'0x' in x:
+        if '0x' in x:
             return float.fromhex(asstr(x))
         return float(x)
 
@@ -752,13 +754,17 @@ def _getconv(dtype):
         return lambda x: complex(asstr(x))
     elif issubclass(typ, np.bytes_):
         return asbytes
+    elif issubclass(typ, np.unicode_):
+        return asunicode
     else:
         return asstr
 
+# amount of lines loadtxt reads in one chunk, can be overriden for testing
+_loadtxt_chunksize = 50000
 
 def loadtxt(fname, dtype=float, comments='#', delimiter=None,
             converters=None, skiprows=0, usecols=None, unpack=False,
-            ndmin=0):
+            ndmin=0, encoding='bytes'):
     """
     Load data from a text file.
 
@@ -813,6 +819,15 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
         Legal values: 0 (default), 1 or 2.
 
         .. versionadded:: 1.6.0
+    encoding: string, optional
+        Encoding used to decode the inputfile. Does not apply to input streams.
+        The special value 'bytes' enables backward compatibility workarounds
+        that ensures you receive byte arrays as results if possible and passes
+        latin1 encoded strings to converters. Override this value to receive
+        unicode arrays and pass strings as input to converters.
+        If set to None the system default is used.
+
+        .. versionadded:: 1.14.0
 
     Returns
     -------
@@ -861,16 +876,20 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
     # Type conversions for Py3 convenience
     if comments is not None:
         if isinstance(comments, (basestring, bytes)):
-            comments = [asbytes(comments)]
-        else:
-            comments = [asbytes(comment) for comment in comments]
+            comments = [comments]
+
+        comments = [_decode_line(x) for x in comments]
 
         # Compile regex for comments beforehand
         comments = (re.escape(comment) for comment in comments)
-        regex_comments = re.compile(b'|'.join(comments))
+        regex_comments = re.compile('|'.join(comments))
     user_converters = converters
-    if delimiter is not None:
-        delimiter = asbytes(delimiter)
+
+    if encoding == 'bytes':
+        encoding = None
+        byte_converters = True
+    else:
+        byte_converters = False
 
     if usecols is not None:
         # Allow usecols to be a single int or a sequence of ints
@@ -896,22 +915,24 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
         if is_pathlib_path(fname):
             fname = str(fname)
         if _is_string_like(fname):
+            fh = np.lib._datasource.open(fname, 'rt', encoding=encoding)
+            fencoding = getattr(fh, 'encoding', 'latin1')
+            fh = iter(fh)
             fown = True
-            if fname.endswith('.gz'):
-                import gzip
-                fh = iter(gzip.GzipFile(fname))
-            elif fname.endswith('.bz2'):
-                import bz2
-                fh = iter(bz2.BZ2File(fname))
-            elif sys.version_info[0] == 2:
-                fh = iter(open(fname, 'U'))
-            else:
-                fh = iter(open(fname))
         else:
             fh = iter(fname)
+            fencoding = getattr(fname, 'encoding', 'latin1')
     except TypeError:
         raise ValueError('fname must be a string, file handle, or generator')
-    X = []
+
+    # input may be a python2 io stream
+    if encoding is not None:
+        fencoding = encoding
+    # we must assume local encoding
+    # TOOD emit portability warning?
+    elif fencoding is None:
+        import locale
+        fencoding = locale.getpreferredencoding()
 
     # not to be confused with the flatten_dtype we import...
     def flatten_dtype_internal(dt):
@@ -960,21 +981,43 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
             return tuple(ret)
 
     def split_line(line):
-        """Chop off comments, strip, and split at delimiter.
-
-        Note that although the file is opened as text, this function
-        returns bytes.
+        """Chop off comments, strip, and split at delimiter. """
+        line = _decode_line(line, encoding=encoding)
 
-        """
-        line = asbytes(line)
         if comments is not None:
-            line = regex_comments.split(asbytes(line), maxsplit=1)[0]
-        line = line.strip(b'\r\n')
+            line = regex_comments.split(line, maxsplit=1)[0]
+        line = line.strip('\r\n')
         if line:
             return line.split(delimiter)
         else:
             return []
 
+    def read_data(chunk_size):
+        # Parse each line, including the first
+        X = []
+        for i, line in enumerate(itertools.chain([first_line], fh)):
+            vals = split_line(line)
+            if len(vals) == 0:
+                continue
+            if usecols:
+                vals = [vals[j] for j in usecols]
+            if len(vals) != N:
+                line_num = i + skiprows + 1
+                raise ValueError("Wrong number of columns at line %d"
+                                 % line_num)
+
+            # Convert each value according to its column and store
+            items = [conv(val) for (conv, val) in zip(converters, vals)]
+
+            # Then pack it according to the dtype's nesting
+            items = pack_items(items, packing)
+            X.append(items)
+            if len(X) > chunk_size:
+                yield X
+                X = []
+        if X:
+            yield X
+
     try:
         # Make sure we're dealing with a proper dtype
         dtype = np.dtype(dtype)
@@ -1017,30 +1060,42 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
                 except ValueError:
                     # Unused converter specified
                     continue
-            converters[i] = conv
-
-        # Parse each line, including the first
-        for i, line in enumerate(itertools.chain([first_line], fh)):
-            vals = split_line(line)
-            if len(vals) == 0:
-                continue
-            if usecols:
-                vals = [vals[j] for j in usecols]
-            if len(vals) != N:
-                line_num = i + skiprows + 1
-                raise ValueError("Wrong number of columns at line %d"
-                                 % line_num)
-
-            # Convert each value according to its column and store
-            items = [conv(val) for (conv, val) in zip(converters, vals)]
-            # Then pack it according to the dtype's nesting
-            items = pack_items(items, packing)
-            X.append(items)
+            if byte_converters:
+                # converters may use decode to workaround numpy's oldd behaviour,
+                # so encode the string again before passing to the user converter
+                def tobytes_first(x, conv):
+                    if type(x) is bytes:
+                        return conv(x)
+                    return conv(x.encode("latin1"))
+                import functools
+                converters[i] = functools.partial(tobytes_first, conv=conv)
+            else:
+                converters[i] = conv
+
+        converters = [conv if conv is not bytes else
+                      lambda x: x.encode(fencoding) for conv in converters]
+
+        # read data in chunks and fill it into an array via resize
+        # over-allocating and shrinking the array later may be faster but is
+        # probably not relevant compared to the cost of actually reading and
+        # converting the data
+        X = None
+        for x in read_data(_loadtxt_chunksize):
+            if X is None:
+                X = np.array(x, dtype)
+            else:
+                nshape = list(X.shape)
+                pos = nshape[0]
+                nshape[0] += len(x)
+                X.resize(nshape)
+                X[pos:, ...] = x
     finally:
         if fown:
             fh.close()
 
-    X = np.array(X, dtype)
+    if X is None:
+        X = np.array([], dtype)
+
     # Multicolumn data are returned with shape (1, N, M), i.e.
     # (1, 1, M) for a single row - remove the singleton dimension there
     if X.ndim == 3 and X.shape[:2] == (1, 1):
@@ -1072,7 +1127,7 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
 
 
 def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='',
-            footer='', comments='# '):
+            footer='', comments='# ', encoding=None):
     """
     Save an array to a text file.
 
@@ -1116,6 +1171,11 @@ def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='',
         ``numpy.loadtxt``.
 
         .. versionadded:: 1.7.0
+    encoding: string, optional
+        Encoding used to encode the outputfile. Does not apply to output
+        streams.
+
+        .. versionadded:: 1.14.0
 
 
     See Also
@@ -1190,21 +1250,51 @@ def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='',
         fmt = asstr(fmt)
     delimiter = asstr(delimiter)
 
+    class WriteWrap(object):
+        """ convert to unicode in py2 or to bytes on bytestream inputs """
+        def __init__(self, fh, encoding):
+            self.fh = fh
+            self.encoding = encoding
+            self.do_write = self.first_write
+
+        def close(self):
+            self.fh.close()
+
+        def write(self, v):
+            self.do_write(v)
+
+        def write_bytes(self, v):
+            if isinstance(v, bytes):
+                self.fh.write(v)
+            else:
+                self.fh.write(v.encode(self.encoding))
+
+        def write_normal(self, v):
+            self.fh.write(asunicode(v))
+
+        def first_write(self, v):
+            try:
+                self.write_normal(v)
+                self.write = self.write_normal
+            except TypeError:
+                # input is probably a bytestream
+                self.write_bytes(v)
+                self.write = self.write_bytes
+
     own_fh = False
     if is_pathlib_path(fname):
         fname = str(fname)
     if _is_string_like(fname):
+        # datasource doesn't support creating a new file ...
+        open(fname, 'wt').close()
+        fh = np.lib._datasource.open(fname, 'wt', encoding=encoding)
         own_fh = True
-        if fname.endswith('.gz'):
-            import gzip
-            fh = gzip.open(fname, 'wb')
-        else:
-            if sys.version_info[0] >= 3:
-                fh = open(fname, 'wb')
-            else:
-                fh = open(fname, 'w')
+        # need to convert str to unicode for text io output
+        if sys.version_info[0] == 2:
+            fh = WriteWrap(fh, encoding or 'latin1')
     elif hasattr(fname, 'write'):
-        fh = fname
+        # wrap to handle byte output streams
+        fh = WriteWrap(fname, encoding or 'latin1')
     else:
         raise ValueError('fname must be a string or file handle')
 
@@ -1254,31 +1344,33 @@ def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='',
 
         if len(header) > 0:
             header = header.replace('\n', '\n' + comments)
-            fh.write(asbytes(comments + header + newline))
+            fh.write(comments + header + newline)
         if iscomplex_X:
             for row in X:
                 row2 = []
                 for number in row:
                     row2.append(number.real)
                     row2.append(number.imag)
-                fh.write(asbytes(format % tuple(row2) + newline))
+                fh.write(format % tuple(row2) + newline)
         else:
             for row in X:
                 try:
-                    fh.write(asbytes(format % tuple(row) + newline))
+                    v = format % tuple(row) + newline
                 except TypeError:
                     raise TypeError("Mismatch between array dtype ('%s') and "
                                     "format specifier ('%s')"
                                     % (str(X.dtype), format))
+                fh.write(v)
+
         if len(footer) > 0:
             footer = footer.replace('\n', '\n' + comments)
-            fh.write(asbytes(comments + footer + newline))
+            fh.write(comments + footer + newline)
     finally:
         if own_fh:
             fh.close()
 
 
-def fromregex(file, regexp, dtype):
+def fromregex(file, regexp, dtype, encoding=None):
     """
     Construct an array from a text file, using regular expression parsing.
 
@@ -1295,6 +1387,10 @@ def fromregex(file, regexp, dtype):
         Groups in the regular expression correspond to fields in the dtype.
     dtype : dtype or list of dtypes
         Dtype for the structured array.
+    encoding: string, optional
+        Encoding used to decode the inputfile. Does not apply to input streams.
+
+        .. versionadded:: 1.14.0
 
     Returns
     -------
@@ -1335,16 +1431,22 @@ def fromregex(file, regexp, dtype):
     """
     own_fh = False
     if not hasattr(file, "read"):
-        file = open(file, 'rb')
+        file = np.lib._datasource.open(file, 'rt', encoding=encoding)
         own_fh = True
 
     try:
-        if not hasattr(regexp, 'match'):
-            regexp = re.compile(asbytes(regexp))
         if not isinstance(dtype, np.dtype):
             dtype = np.dtype(dtype)
 
-        seq = regexp.findall(file.read())
+        content = file.read()
+        if isinstance(content, bytes) and not isinstance(regexp, bytes):
+            regexp = asbytes(regexp)
+        elif not isinstance(content, bytes) and isinstance(regexp, bytes):
+            regexp = asstr(regexp)
+
+        if not hasattr(regexp, 'match'):
+            regexp = re.compile(regexp)
+        seq = regexp.findall(content)
         if seq and not isinstance(seq[0], tuple):
             # Only one group is in the regexp.
             # Create the new array as a single data-type and then
@@ -1372,7 +1474,7 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
                names=None, excludelist=None, deletechars=None,
                replace_space='_', autostrip=False, case_sensitive=True,
                defaultfmt="f%i", unpack=None, usemask=False, loose=True,
-               invalid_raise=True, max_rows=None):
+               invalid_raise=True, max_rows=None, encoding='bytes'):
     """
     Load data from a text file, with missing values handled as specified.
 
@@ -1460,6 +1562,15 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
         to read the entire file.
 
         .. versionadded:: 1.10.0
+    encoding: string, optional
+        Encoding used to decode the inputfile. Does not apply to input streams.
+        The special value 'bytes' enables backward compatibility workarounds
+        that ensures you receive byte arrays as results if possible and passes
+        latin1 encoded strings to converters. Override this value to receive
+        unicode arrays and pass strings as input to converters.
+        If set to None the system default is used.
+
+        .. versionadded:: 1.14.0
 
     Returns
     -------
@@ -1536,15 +1647,6 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
         if max_rows < 1:
             raise ValueError("'max_rows' must be at least 1.")
 
-    # Py3 data conversions to bytes, for convenience
-    if comments is not None:
-        comments = asbytes(comments)
-    if isinstance(delimiter, unicode):
-        delimiter = asbytes(delimiter)
-    if isinstance(missing_values, (unicode, list, tuple)):
-        missing_values = asbytes_nested(missing_values)
-
-    #
     if usemask:
         from numpy.ma import MaskedArray, make_mask_descr
     # Check the input dictionary of converters
@@ -1554,16 +1656,19 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
             "The input argument 'converter' should be a valid dictionary "
             "(got '%s' instead)" % type(user_converters))
 
+    if encoding == 'bytes':
+        encoding = None
+        byte_converters = True
+    else:
+        byte_converters = False
+
     # Initialize the filehandle, the LineSplitter and the NameValidator
     own_fhd = False
     try:
         if is_pathlib_path(fname):
             fname = str(fname)
         if isinstance(fname, basestring):
-            if sys.version_info[0] == 2:
-                fhd = iter(np.lib._datasource.open(fname, 'rbU'))
-            else:
-                fhd = iter(np.lib._datasource.open(fname, 'rb'))
+            fhd = iter(np.lib._datasource.open(fname, 'rt', encoding=encoding))
             own_fhd = True
         else:
             fhd = iter(fname)
@@ -1573,7 +1678,7 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
             "or generator. Got %s instead." % type(fname))
 
     split_line = LineSplitter(delimiter=delimiter, comments=comments,
-                              autostrip=autostrip)._handyman
+                              autostrip=autostrip, encoding=encoding)
     validate_names = NameValidator(excludelist=excludelist,
                                    deletechars=deletechars,
                                    case_sensitive=case_sensitive,
@@ -1587,15 +1692,15 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
     first_values = None
     try:
         while not first_values:
-            first_line = next(fhd)
+            first_line = _decode_line(next(fhd), encoding)
             if names is True:
                 if comments in first_line:
                     first_line = (
-                        b''.join(first_line.split(comments)[1:]))
+                        ''.join(first_line.split(comments)[1:]))
             first_values = split_line(first_line)
     except StopIteration:
         # return an empty array if the datafile is empty
-        first_line = b''
+        first_line = ''
         first_values = []
         warnings.warn('genfromtxt: Empty input file: "%s"' % fname, stacklevel=2)
 
@@ -1618,9 +1723,8 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
 
     # Check the names and overwrite the dtype.names if needed
     if names is True:
-        names = validate_names([_bytes_to_name(_.strip())
-                                for _ in first_values])
-        first_line = b''
+        names = validate_names([str(_.strip()) for _ in first_values])
+        first_line = ''
     elif _is_string_like(names):
         names = validate_names([_.strip() for _ in names.split(',')])
     elif names:
@@ -1657,9 +1761,11 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
     # Process the missing values ...............................
     # Rename missing_values for convenience
     user_missing_values = missing_values or ()
+    if isinstance(user_missing_values, bytes):
+        user_missing_values = user_missing_values.decode('latin1')
 
     # Define the list of missing_values (one column: one list)
-    missing_values = [list([b'']) for _ in range(nbcols)]
+    missing_values = [list(['']) for _ in range(nbcols)]
 
     # We have a dictionary: process it field by field
     if isinstance(user_missing_values, dict):
@@ -1698,8 +1804,8 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
             if value not in entry:
                 entry.append(value)
     # We have a string : apply it to all entries
-    elif isinstance(user_missing_values, bytes):
-        user_value = user_missing_values.split(b",")
+    elif isinstance(user_missing_values, basestring):
+        user_value = user_missing_values.split(",")
         for entry in missing_values:
             entry.extend(user_value)
     # We have something else: apply it to all entries
@@ -1787,11 +1893,24 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
             testing_value = first_values[j]
         else:
             testing_value = None
-        converters[i].update(conv, locked=True,
+        if conv is bytes:
+            user_conv = asbytes
+        elif byte_converters:
+            # converters may use decode to workaround numpy's oldd behaviour,
+            # so encode the string again before passing to the user converter
+            def tobytes_first(x, conv):
+                if type(x) is bytes:
+                    return conv(x)
+                return conv(x.encode("latin1"))
+            import functools
+            user_conv = functools.partial(tobytes_first, conv=conv)
+        else:
+            user_conv = conv
+        converters[i].update(user_conv, locked=True,
                              testing_value=testing_value,
                              default=filling_values[i],
                              missing_values=missing_values[i],)
-        uc_update.append((i, conv))
+        uc_update.append((i, user_conv))
     # Make sure we have the corrected keys in user_converters...
     user_converters.update(uc_update)
 
@@ -1908,16 +2027,43 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
         column_types = [conv.type for conv in converters]
         # Find the columns with strings...
         strcolidx = [i for (i, v) in enumerate(column_types)
-                     if v in (type('S'), np.string_)]
+                     if v == np.unicode_]
+
+        typestr = 'U'
+        if byte_converters and strcolidx:
+            # convert strings back to bytes for backward compatibility
+            warnings.warn(
+                "Reading strings without specifying the encoding argument is "
+                "deprecated. Set encoding, use None for the system default.",
+                np.VisibleDeprecationWarning, stacklevel=2)
+            try:
+                for j in range(len(data)):
+                    row = list(data[j])
+                    for i in strcolidx:
+                        row[i] = row[i].encode('latin1')
+                    data[j] = tuple(row)
+                typestr = 'S'
+            except UnicodeEncodeError:
+                # we must use unicode, revert encoding
+                for k in range(0, j + 1):
+                    row = list(data[k])
+                    for i in strcolidx:
+                        if isinstance(row[i], bytes):
+                            row[i] = row[i].decode('latin1')
+                    data[k] = tuple(row)
+
         # ... and take the largest number of chars.
         for i in strcolidx:
-            column_types[i] = "|S%i" % max(len(row[i]) for row in data)
+            column_types[i] = "|%s%i" % (typestr, max(len(row[i]) for row in data))
         #
         if names is None:
             # If the dtype is uniform, don't define names, else use ''
             base = set([c.type for c in converters if c._checked])
             if len(base) == 1:
-                (ddtype, mdtype) = (list(base)[0], bool)
+                if strcolidx:
+                    (ddtype, mdtype) = (typestr, bool)
+                else:
+                    (ddtype, mdtype) = (list(base)[0], bool)
             else:
                 ddtype = [(defaultfmt % i, dt)
                           for (i, dt) in enumerate(column_types)]
@@ -1966,8 +2112,8 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
                     # Keep the dtype of the current converter
                     if i in user_converters:
                         ishomogeneous &= (ttype == dtype.type)
-                        if ttype == np.string_:
-                            ttype = "|S%i" % max(len(row[i]) for row in data)
+                        if np.issubdtype(ttype, np.character):
+                            ttype = (ttype, max(len(row[i]) for row in data))
                         descr.append(('', ttype))
                     else:
                         descr.append(('', dtype))
@@ -1992,7 +2138,7 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
     if usemask and names:
         for (name, conv) in zip(names or (), converters):
             missing_values = [conv(_) for _ in conv.missing_values
-                              if _ != b'']
+                              if _ != '']
             for mval in missing_values:
                 outputmask[name] |= (output[name] == mval)
     # Construct the final array