author     Julian Taylor <jtaylor.debian@googlemail.com>   2017-04-03 14:20:36 +0200
committer  Charles Harris <charlesr.harris@gmail.com>      2017-11-21 10:16:00 -0700
commit     d8edc62e8c9e69280fb8a171c7678b2fea929696 (patch)
tree       aa6813116f4f72bf7270be2fdb1537abe8776f02 /numpy/lib/npyio.py
parent     b6044d88cab21d7ebe274bcd79bc430a57c520e6 (diff)
download   numpy-d8edc62e8c9e69280fb8a171c7678b2fea929696.tar.gz
ENH: Add encoding option to numpy text IO.
This modifies loadtxt and genfromtxt in several ways intended to add
unicode support for text files by adding an `encoding` keyword to
np.loadtxt, np.genfromtxt, np.savetxt, and np.fromregex. The affected
files were previously opened in binary mode; they are now opened as
text files with an explicit encoding. When read, their contents are
decoded to unicode strings for Python 3 compatibility, and when
written, they are encoded as specified. For backward compatibility,
the default encoding in both cases is latin1.
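
A rough usage sketch of the new keyword (not part of the commit itself; the file name and contents are hypothetical, Python 3 syntax is used, and NumPy >= 1.14, i.e. this patch applied, is assumed):

    import numpy as np

    # Hypothetical whitespace-delimited file with a non-ASCII string column.
    with open('values.txt', 'w', encoding='utf-8') as f:
        f.write(u'1.0 α\n2.0 β\n')

    # With an explicit encoding, string columns come back as unicode ('U')
    # arrays instead of byte ('S') arrays.
    a = np.genfromtxt('values.txt', dtype=None, encoding='utf-8')

    # savetxt writes its output through the requested encoding as well.
    np.savetxt('out.txt', [[1.0, 2.0]], encoding='utf-8')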
Diffstat (limited to 'numpy/lib/npyio.py')
-rw-r--r--  numpy/lib/npyio.py  346
1 file changed, 246 insertions, 100 deletions
diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py
index 6de5940d7..fe2aa436b 100644
--- a/numpy/lib/npyio.py
+++ b/numpy/lib/npyio.py
@@ -1,5 +1,6 @@
 from __future__ import division, absolute_import, print_function
 
+import io
 import sys
 import os
 import re
@@ -15,11 +16,12 @@
 from numpy.core.multiarray import packbits, unpackbits
 from ._iotools import (
     LineSplitter, NameValidator, StringConverter, ConverterError,
     ConverterLockError, ConversionWarning, _is_string_like,
-    has_nested_fields, flatten_dtype, easy_dtype, _bytes_to_name
+    has_nested_fields, flatten_dtype, easy_dtype, _decode_line
     )
 from numpy.compat import (
-    asbytes, asstr, asbytes_nested, bytes, basestring, unicode, is_pathlib_path
+    asbytes, asstr, asunicode, asbytes_nested, bytes, basestring, unicode,
+    is_pathlib_path
     )
 
 if sys.version_info[0] >= 3:
@@ -731,7 +733,7 @@ def _getconv(dtype):
 
     def floatconv(x):
         x.lower()
-        if b'0x' in x:
+        if '0x' in x:
             return float.fromhex(asstr(x))
         return float(x)
 
@@ -752,13 +754,17 @@
         return lambda x: complex(asstr(x))
     elif issubclass(typ, np.bytes_):
         return asbytes
+    elif issubclass(typ, np.unicode_):
+        return asunicode
     else:
         return asstr
 
+# number of lines loadtxt reads in one chunk, can be overridden for testing
+_loadtxt_chunksize = 50000
 
 def loadtxt(fname, dtype=float, comments='#', delimiter=None,
             converters=None, skiprows=0, usecols=None, unpack=False,
-            ndmin=0):
+            ndmin=0, encoding='bytes'):
     """
     Load data from a text file.
 
@@ -813,6 +819,15 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
         Legal values: 0 (default), 1 or 2.
 
         .. versionadded:: 1.6.0
+    encoding : str, optional
+        Encoding used to decode the input file. Does not apply to input
+        streams. The special value 'bytes' enables backward compatibility
+        workarounds that ensure you receive byte arrays as results if
+        possible and pass latin1 encoded strings to converters. Override
+        this value to receive unicode arrays and pass strings as input to
+        converters. If set to None the system default is used.
+
+        .. versionadded:: 1.14.0
 
     Returns
     -------
@@ -861,16 +876,20 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
     # Type conversions for Py3 convenience
     if comments is not None:
         if isinstance(comments, (basestring, bytes)):
-            comments = [asbytes(comments)]
-        else:
-            comments = [asbytes(comment) for comment in comments]
+            comments = [comments]
+
+        comments = [_decode_line(x) for x in comments]
 
         # Compile regex for comments beforehand
         comments = (re.escape(comment) for comment in comments)
-        regex_comments = re.compile(b'|'.join(comments))
+        regex_comments = re.compile('|'.join(comments))
 
     user_converters = converters
-    if delimiter is not None:
-        delimiter = asbytes(delimiter)
+
+    if encoding == 'bytes':
+        encoding = None
+        byte_converters = True
+    else:
+        byte_converters = False
 
     if usecols is not None:
         # Allow usecols to be a single int or a sequence of ints
@@ -896,22 +915,24 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
         if is_pathlib_path(fname):
             fname = str(fname)
         if _is_string_like(fname):
+            fh = np.lib._datasource.open(fname, 'rt', encoding=encoding)
+            fencoding = getattr(fh, 'encoding', 'latin1')
+            fh = iter(fh)
             fown = True
-            if fname.endswith('.gz'):
-                import gzip
-                fh = iter(gzip.GzipFile(fname))
-            elif fname.endswith('.bz2'):
-                import bz2
-                fh = iter(bz2.BZ2File(fname))
-            elif sys.version_info[0] == 2:
-                fh = iter(open(fname, 'U'))
-            else:
-                fh = iter(open(fname))
         else:
             fh = iter(fname)
+            fencoding = getattr(fname, 'encoding', 'latin1')
     except TypeError:
         raise ValueError('fname must be a string, file handle, or generator')
-    X = []
+
+    # input may be a python2 io stream
+    if encoding is not None:
+        fencoding = encoding
+    # we must assume local encoding
+    # TODO emit portability warning?
+    elif fencoding is None:
+        import locale
+        fencoding = locale.getpreferredencoding()
 
     # not to be confused with the flatten_dtype we import...
     def flatten_dtype_internal(dt):
@@ -960,21 +981,43 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
         return tuple(ret)
 
     def split_line(line):
-        """Chop off comments, strip, and split at delimiter.
-
-        Note that although the file is opened as text, this function
-        returns bytes.
+        """Chop off comments, strip, and split at delimiter. """
""" + line = _decode_line(line, encoding=encoding) - """ - line = asbytes(line) if comments is not None: - line = regex_comments.split(asbytes(line), maxsplit=1)[0] - line = line.strip(b'\r\n') + line = regex_comments.split(line, maxsplit=1)[0] + line = line.strip('\r\n') if line: return line.split(delimiter) else: return [] + def read_data(chunk_size): + # Parse each line, including the first + X = [] + for i, line in enumerate(itertools.chain([first_line], fh)): + vals = split_line(line) + if len(vals) == 0: + continue + if usecols: + vals = [vals[j] for j in usecols] + if len(vals) != N: + line_num = i + skiprows + 1 + raise ValueError("Wrong number of columns at line %d" + % line_num) + + # Convert each value according to its column and store + items = [conv(val) for (conv, val) in zip(converters, vals)] + + # Then pack it according to the dtype's nesting + items = pack_items(items, packing) + X.append(items) + if len(X) > chunk_size: + yield X + X = [] + if X: + yield X + try: # Make sure we're dealing with a proper dtype dtype = np.dtype(dtype) @@ -1017,30 +1060,42 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, except ValueError: # Unused converter specified continue - converters[i] = conv - - # Parse each line, including the first - for i, line in enumerate(itertools.chain([first_line], fh)): - vals = split_line(line) - if len(vals) == 0: - continue - if usecols: - vals = [vals[j] for j in usecols] - if len(vals) != N: - line_num = i + skiprows + 1 - raise ValueError("Wrong number of columns at line %d" - % line_num) - - # Convert each value according to its column and store - items = [conv(val) for (conv, val) in zip(converters, vals)] - # Then pack it according to the dtype's nesting - items = pack_items(items, packing) - X.append(items) + if byte_converters: + # converters may use decode to workaround numpy's oldd behaviour, + # so encode the string again before passing to the user converter + def tobytes_first(x, conv): + if type(x) is bytes: + return conv(x) + return conv(x.encode("latin1")) + import functools + converters[i] = functools.partial(tobytes_first, conv=conv) + else: + converters[i] = conv + + converters = [conv if conv is not bytes else + lambda x: x.encode(fencoding) for conv in converters] + + # read data in chunks and fill it into an array via resize + # over-allocating and shrinking the array later may be faster but is + # probably not relevant compared to the cost of actually reading and + # converting the data + X = None + for x in read_data(_loadtxt_chunksize): + if X is None: + X = np.array(x, dtype) + else: + nshape = list(X.shape) + pos = nshape[0] + nshape[0] += len(x) + X.resize(nshape) + X[pos:, ...] = x finally: if fown: fh.close() - X = np.array(X, dtype) + if X is None: + X = np.array([], dtype) + # Multicolumn data are returned with shape (1, N, M), i.e. # (1, 1, M) for a single row - remove the singleton dimension there if X.ndim == 3 and X.shape[:2] == (1, 1): @@ -1072,7 +1127,7 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='', - footer='', comments='# '): + footer='', comments='# ', encoding=None): """ Save an array to a text file. @@ -1116,6 +1171,11 @@ def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='', ``numpy.loadtxt``. .. versionadded:: 1.7.0 + encoding: string, optional + Encoding used to encode the outputfile. Does not apply to output + streams. + + .. 
 
     See Also
     --------
@@ -1190,21 +1250,51 @@ def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='',
     fmt = asstr(fmt)
     delimiter = asstr(delimiter)
 
+    class WriteWrap(object):
+        """ convert to unicode in py2 or to bytes on bytestream inputs """
+        def __init__(self, fh, encoding):
+            self.fh = fh
+            self.encoding = encoding
+            self.do_write = self.first_write
+
+        def close(self):
+            self.fh.close()
+
+        def write(self, v):
+            self.do_write(v)
+
+        def write_bytes(self, v):
+            if isinstance(v, bytes):
+                self.fh.write(v)
+            else:
+                self.fh.write(v.encode(self.encoding))
+
+        def write_normal(self, v):
+            self.fh.write(asunicode(v))
+
+        def first_write(self, v):
+            try:
+                self.write_normal(v)
+                self.write = self.write_normal
+            except TypeError:
+                # input is probably a bytestream
+                self.write_bytes(v)
+                self.write = self.write_bytes
+
     own_fh = False
     if is_pathlib_path(fname):
         fname = str(fname)
     if _is_string_like(fname):
+        # datasource doesn't support creating a new file ...
+        open(fname, 'wt').close()
+        fh = np.lib._datasource.open(fname, 'wt', encoding=encoding)
         own_fh = True
-        if fname.endswith('.gz'):
-            import gzip
-            fh = gzip.open(fname, 'wb')
-        else:
-            if sys.version_info[0] >= 3:
-                fh = open(fname, 'wb')
-            else:
-                fh = open(fname, 'w')
+        # need to convert str to unicode for text io output
+        if sys.version_info[0] == 2:
+            fh = WriteWrap(fh, encoding or 'latin1')
     elif hasattr(fname, 'write'):
-        fh = fname
+        # wrap to handle byte output streams
+        fh = WriteWrap(fname, encoding or 'latin1')
    else:
         raise ValueError('fname must be a string or file handle')
 
@@ -1254,31 +1344,33 @@ def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='',
         if len(header) > 0:
             header = header.replace('\n', '\n' + comments)
-            fh.write(asbytes(comments + header + newline))
+            fh.write(comments + header + newline)
         if iscomplex_X:
             for row in X:
                 row2 = []
                 for number in row:
                     row2.append(number.real)
                     row2.append(number.imag)
-                fh.write(asbytes(format % tuple(row2) + newline))
+                fh.write(format % tuple(row2) + newline)
         else:
             for row in X:
                 try:
-                    fh.write(asbytes(format % tuple(row) + newline))
+                    v = format % tuple(row) + newline
                 except TypeError:
                     raise TypeError("Mismatch between array dtype ('%s') and "
                                     "format specifier ('%s')"
                                     % (str(X.dtype), format))
+                fh.write(v)
+
         if len(footer) > 0:
             footer = footer.replace('\n', '\n' + comments)
-            fh.write(asbytes(comments + footer + newline))
     finally:
         if own_fh:
             fh.close()
 
 
-def fromregex(file, regexp, dtype):
+def fromregex(file, regexp, dtype, encoding=None):
     """
     Construct an array from a text file, using regular expression parsing.
 
@@ -1295,6 +1387,10 @@ def fromregex(file, regexp, dtype):
         Groups in the regular expression correspond to fields in the dtype.
     dtype : dtype or list of dtypes
         Dtype for the structured array.
+    encoding : str, optional
+        Encoding used to decode the input file. Does not apply to input streams.
+
+        .. versionadded:: 1.14.0
 
     Returns
     -------
@@ -1335,16 +1431,22 @@
     """
     own_fh = False
     if not hasattr(file, "read"):
-        file = open(file, 'rb')
+        file = np.lib._datasource.open(file, 'rt', encoding=encoding)
         own_fh = True
 
     try:
-        if not hasattr(regexp, 'match'):
-            regexp = re.compile(asbytes(regexp))
         if not isinstance(dtype, np.dtype):
             dtype = np.dtype(dtype)
 
-        seq = regexp.findall(file.read())
+        content = file.read()
+        if isinstance(content, bytes) and not isinstance(regexp, bytes):
+            regexp = asbytes(regexp)
+        elif not isinstance(content, bytes) and isinstance(regexp, bytes):
+            regexp = asstr(regexp)
+
+        if not hasattr(regexp, 'match'):
+            regexp = re.compile(regexp)
+        seq = regexp.findall(content)
         if seq and not isinstance(seq[0], tuple):
             # Only one group is in the regexp.
             # Create the new array as a single data-type and then
@@ -1372,7 +1474,7 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
                names=None, excludelist=None, deletechars=None,
                replace_space='_', autostrip=False, case_sensitive=True,
                defaultfmt="f%i", unpack=None, usemask=False, loose=True,
-               invalid_raise=True, max_rows=None):
+               invalid_raise=True, max_rows=None, encoding='bytes'):
     """
     Load data from a text file, with missing values handled as specified.
 
@@ -1460,6 +1562,15 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
         to read the entire file.
 
         .. versionadded:: 1.10.0
+    encoding : str, optional
+        Encoding used to decode the input file. Does not apply to input
+        streams. The special value 'bytes' enables backward compatibility
+        workarounds that ensure you receive byte arrays as results if
+        possible and pass latin1 encoded strings to converters. Override
+        this value to receive unicode arrays and pass strings as input to
+        converters. If set to None the system default is used.
+
+        .. versionadded:: 1.14.0
 
     Returns
     -------
@@ -1536,15 +1647,6 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
         if max_rows < 1:
             raise ValueError("'max_rows' must be at least 1.")
 
-    # Py3 data conversions to bytes, for convenience
-    if comments is not None:
-        comments = asbytes(comments)
-    if isinstance(delimiter, unicode):
-        delimiter = asbytes(delimiter)
-    if isinstance(missing_values, (unicode, list, tuple)):
-        missing_values = asbytes_nested(missing_values)
-
-    #
     if usemask:
         from numpy.ma import MaskedArray, make_mask_descr
     # Check the input dictionary of converters
@@ -1554,16 +1656,19 @@
             "The input argument 'converter' should be a valid dictionary "
             "(got '%s' instead)" % type(user_converters))
 
+    if encoding == 'bytes':
+        encoding = None
+        byte_converters = True
+    else:
+        byte_converters = False
+
     # Initialize the filehandle, the LineSplitter and the NameValidator
     own_fhd = False
     try:
         if is_pathlib_path(fname):
             fname = str(fname)
         if isinstance(fname, basestring):
-            if sys.version_info[0] == 2:
-                fhd = iter(np.lib._datasource.open(fname, 'rbU'))
-            else:
-                fhd = iter(np.lib._datasource.open(fname, 'rb'))
+            fhd = iter(np.lib._datasource.open(fname, 'rt', encoding=encoding))
             own_fhd = True
         else:
            fhd = iter(fname)
@@ -1573,7 +1678,7 @@
                         "or generator. Got %s instead." % type(fname))
 
     split_line = LineSplitter(delimiter=delimiter, comments=comments,
-                              autostrip=autostrip)._handyman
+                              autostrip=autostrip, encoding=encoding)
     validate_names = NameValidator(excludelist=excludelist,
                                    deletechars=deletechars,
                                    case_sensitive=case_sensitive,
@@ -1587,15 +1692,15 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
     first_values = None
     try:
         while not first_values:
-            first_line = next(fhd)
+            first_line = _decode_line(next(fhd), encoding)
             if names is True:
                 if comments in first_line:
                     first_line = (
-                        b''.join(first_line.split(comments)[1:]))
+                        ''.join(first_line.split(comments)[1:]))
             first_values = split_line(first_line)
     except StopIteration:
         # return an empty array if the datafile is empty
-        first_line = b''
+        first_line = ''
         first_values = []
         warnings.warn('genfromtxt: Empty input file: "%s"' % fname, stacklevel=2)
@@ -1618,9 +1723,8 @@
 
     # Check the names and overwrite the dtype.names if needed
     if names is True:
-        names = validate_names([_bytes_to_name(_.strip())
-                                for _ in first_values])
-        first_line = b''
+        names = validate_names([str(_.strip()) for _ in first_values])
+        first_line = ''
     elif _is_string_like(names):
         names = validate_names([_.strip() for _ in names.split(',')])
     elif names:
@@ -1657,9 +1761,11 @@
     # Process the missing values ...............................
     # Rename missing_values for convenience
     user_missing_values = missing_values or ()
+    if isinstance(user_missing_values, bytes):
+        user_missing_values = user_missing_values.decode('latin1')
 
     # Define the list of missing_values (one column: one list)
-    missing_values = [list([b'']) for _ in range(nbcols)]
+    missing_values = [list(['']) for _ in range(nbcols)]
 
     # We have a dictionary: process it field by field
     if isinstance(user_missing_values, dict):
@@ -1698,8 +1804,8 @@
                 if value not in entry:
                     entry.append(value)
     # We have a string : apply it to all entries
-    elif isinstance(user_missing_values, bytes):
-        user_value = user_missing_values.split(b",")
+    elif isinstance(user_missing_values, basestring):
+        user_value = user_missing_values.split(",")
         for entry in missing_values:
             entry.extend(user_value)
     # We have something else: apply it to all entries
@@ -1787,11 +1893,24 @@
                 testing_value = first_values[j]
             else:
                 testing_value = None
-            converters[i].update(conv, locked=True,
+            if conv is bytes:
+                user_conv = asbytes
+            elif byte_converters:
+                # converters may use decode to workaround numpy's old behaviour,
+                # so encode the string again before passing to the user converter
+                def tobytes_first(x, conv):
+                    if type(x) is bytes:
+                        return conv(x)
+                    return conv(x.encode("latin1"))
+                import functools
+                user_conv = functools.partial(tobytes_first, conv=conv)
+            else:
+                user_conv = conv
+            converters[i].update(user_conv, locked=True,
                                  testing_value=testing_value,
                                  default=filling_values[i],
                                  missing_values=missing_values[i],)
-            uc_update.append((i, conv))
+            uc_update.append((i, user_conv))
     # Make sure we have the corrected keys in user_converters...
     user_converters.update(uc_update)
 
@@ -1908,16 +2027,43 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
         column_types = [conv.type for conv in converters]
         # Find the columns with strings...
         strcolidx = [i for (i, v) in enumerate(column_types)
-                     if v in (type('S'), np.string_)]
+                     if v == np.unicode_]
+
+        typestr = 'U'
+        if byte_converters and strcolidx:
+            # convert strings back to bytes for backward compatibility
+            warnings.warn(
+                "Reading strings without specifying the encoding argument is "
+                "deprecated. Set encoding, use None for the system default.",
+                np.VisibleDeprecationWarning, stacklevel=2)
+            try:
+                for j in range(len(data)):
+                    row = list(data[j])
+                    for i in strcolidx:
+                        row[i] = row[i].encode('latin1')
+                    data[j] = tuple(row)
+                typestr = 'S'
+            except UnicodeEncodeError:
+                # we must use unicode, revert encoding
+                for k in range(0, j + 1):
+                    row = list(data[k])
+                    for i in strcolidx:
+                        if isinstance(row[i], bytes):
+                            row[i] = row[i].decode('latin1')
+                    data[k] = tuple(row)
+
         # ... and take the largest number of chars.
         for i in strcolidx:
-            column_types[i] = "|S%i" % max(len(row[i]) for row in data)
+            column_types[i] = "|%s%i" % (typestr, max(len(row[i]) for row in data))
         #
         if names is None:
             # If the dtype is uniform, don't define names, else use ''
             base = set([c.type for c in converters if c._checked])
             if len(base) == 1:
-                (ddtype, mdtype) = (list(base)[0], bool)
+                if strcolidx:
+                    (ddtype, mdtype) = (typestr, bool)
+                else:
+                    (ddtype, mdtype) = (list(base)[0], bool)
             else:
                 ddtype = [(defaultfmt % i, dt)
                           for (i, dt) in enumerate(column_types)]
@@ -1966,8 +2112,8 @@
                 # Keep the dtype of the current converter
                 if i in user_converters:
                     ishomogeneous &= (ttype == dtype.type)
-                    if ttype == np.string_:
-                        ttype = "|S%i" % max(len(row[i]) for row in data)
+                    if np.issubdtype(ttype, np.character):
+                        ttype = (ttype, max(len(row[i]) for row in data))
                     descr.append(('', ttype))
                 else:
                     descr.append(('', dtype))
@@ -1992,7 +2138,7 @@
     if usemask and names:
         for (name, conv) in zip(names or (), converters):
             missing_values = [conv(_) for _ in conv.missing_values
-                              if _ != b'']
+                              if _ != '']
             for mval in missing_values:
                 outputmask[name] |= (output[name] == mval)
     # Construct the final array
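
A minimal sketch of the converter behaviour implemented above (the data is hypothetical and NumPy >= 1.14 is assumed): under the default encoding='bytes', the tobytes_first wrapper re-encodes each field as latin1 before calling a user converter, so converters written for the old byte-oriented API keep working; with an explicit encoding, converters receive str and string columns are returned as unicode arrays.

    from io import StringIO
    import numpy as np

    data = u'1,ab\n2,cd'

    # Old-style converter: still receives latin1-encoded bytes under the
    # default encoding='bytes' (string columns may also trigger the
    # VisibleDeprecationWarning added in this commit).
    old = np.genfromtxt(StringIO(data), delimiter=',', dtype=None,
                        converters={1: lambda s: s.decode('latin1').upper()})

    # New-style converter: receives str once an encoding is given
    # explicitly, and string columns come back as 'U' arrays.
    new = np.genfromtxt(StringIO(data), delimiter=',', dtype=None,
                        converters={1: lambda s: s.upper()},
                        encoding='utf-8')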
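A similar sketch for the fromregex change, which decodes the file with the given encoding and converts the pattern to match the type (bytes or str) of the decoded content; the file name and contents here are hypothetical:

    import numpy as np

    # Hypothetical key/value log file written as UTF-8.
    with open('log.txt', 'w', encoding='utf-8') as f:
        f.write(u'temp 22.5\npressure 101.3\n')

    # A str pattern is matched against the decoded str content, and 'U'
    # (unicode) field dtypes give str results.
    arr = np.fromregex('log.txt', r'([a-z]+)\s+([0-9.]+)',
                       dtype=[('key', 'U16'), ('value', float)],
                       encoding='utf-8')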