author    Antony Lee <anntzer.lee@gmail.com>    2021-08-03 16:49:50 +0200
committer Antony Lee <anntzer.lee@gmail.com>    2021-08-06 13:52:05 +0200
commit    c42f57b58fc2920a32a5be9b60dafba8b41c6411 (patch)
tree      8e5f866702f31684aa133dba76afc46d79d7fc3b /numpy/lib/npyio.py
parent    887766071724d27431b453b8270f299213b2d189 (diff)
download  numpy-c42f57b58fc2920a32a5be9b60dafba8b41c6411.tar.gz
PERF: Special-case single-converter in loadtxt.
~5-13% speedup: `[*map(conv, vals)]` (a single converter, which is quite common) is much faster than `[conv(val) for conv, val in zip(converters, vals)]`. `_floatconv` and `fencode` were lifted out to module level so that every "instance" of them is the same object, allowing a check for whether different converters are in use. (Actually, it looks like two `floatconv`s returned by two separate calls to `_getconv` would have the same identity anyway, but we don't need to rely on that.)
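To see where the win comes from, here is a minimal timing sketch; the sample row, column count, and repeat count are illustrative, not part of the commit:

    import timeit

    conv = float
    converters = [conv] * 8   # one converter per column
    vals = ["1.5"] * 8        # a single parsed row

    # Per-row pairing of (converter, value), as in the old code:
    t_zip = timeit.timeit(
        lambda: [conv(val) for conv, val in zip(converters, vals)],
        number=100_000)
    # A single shared converter mapped over the row, as in the new fast path:
    t_map = timeit.timeit(lambda: [*map(conv, vals)], number=100_000)
    print(t_zip, t_map)  # t_map avoids the per-row zip and tuple overhead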
Diffstat (limited to 'numpy/lib/npyio.py')
-rw-r--r--    numpy/lib/npyio.py    92
1 file changed, 50 insertions, 42 deletions
diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py
index 7c73d9655..983e2615c 100644
--- a/numpy/lib/npyio.py
+++ b/numpy/lib/npyio.py
@@ -5,7 +5,7 @@ import itertools
 import warnings
 import weakref
 import contextlib
-from operator import itemgetter, index as opindex
+from operator import itemgetter, index as opindex, methodcaller
 from collections.abc import Mapping
 
 import numpy as np
@@ -728,41 +728,42 @@ def _savez(file, args, kwds, compress, allow_pickle=True, pickle_kwargs=None):
     zipf.close()
 
 
+def _floatconv(x):
+    try:
+        return float(x)  # The fastest path.
+    except ValueError:
+        if '0x' in x:  # Don't accidentally convert "a" ("0xa") to 10.
+            try:
+                return float.fromhex(x)
+            except ValueError:
+                pass
+        raise  # Raise the original exception, which makes more sense.
+
+
+_CONVERTERS = [
+    (np.bool_, lambda x: bool(int(x))),
+    (np.uint64, np.uint64),
+    (np.int64, np.int64),
+    (np.integer, lambda x: int(float(x))),
+    (np.longdouble, np.longdouble),
+    (np.floating, _floatconv),
+    (complex, lambda x: complex(asstr(x).replace('+-', '-'))),
+    (np.bytes_, asbytes),
+    (np.unicode_, asunicode),
+]
+
+
 def _getconv(dtype):
-    """ Find the correct dtype converter. Adapted from matplotlib """
+    """
+    Find the correct dtype converter. Adapted from matplotlib.
 
-    def floatconv(x):
-        try:
-            return float(x)  # The fastest path.
-        except ValueError:
-            if '0x' in x:  # Don't accidentally convert "a" ("0xa") to 10.
-                try:
-                    return float.fromhex(x)
-                except ValueError:
-                    pass
-            raise  # Raise the original exception, which makes more sense.
-
-    typ = dtype.type
-    if issubclass(typ, np.bool_):
-        return lambda x: bool(int(x))
-    if issubclass(typ, np.uint64):
-        return np.uint64
-    if issubclass(typ, np.int64):
-        return np.int64
-    if issubclass(typ, np.integer):
-        return lambda x: int(float(x))
-    elif issubclass(typ, np.longdouble):
-        return np.longdouble
-    elif issubclass(typ, np.floating):
-        return floatconv
-    elif issubclass(typ, complex):
-        return lambda x: complex(asstr(x).replace('+-', '-'))
-    elif issubclass(typ, np.bytes_):
-        return asbytes
-    elif issubclass(typ, np.unicode_):
-        return asunicode
-    else:
-        return asstr
+    Even when a lambda is returned, it is defined at the toplevel, to allow
+    testing for equality and enabling optimization for single-type data.
+    """
+    for base, conv in _CONVERTERS:
+        if issubclass(dtype.type, base):
+            return conv
+    return asstr
 
 
 # _loadtxt_flatten_dtype_internal and _loadtxt_pack_items are loadtxt helpers
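As a quick check of the property the new docstring calls out: equal dtypes now yield the very same converter object, and `_floatconv` only falls back to hex parsing for explicit "0x" literals. A sketch, assuming the definitions above are in scope:

    import numpy as np

    # Module-level converters are identical across calls on equal dtypes:
    assert _getconv(np.dtype(np.float64)) is _getconv(np.dtype(float))
    assert _getconv(np.dtype(float)) is _floatconv

    # Hex floats are still supported via float.fromhex:
    assert _floatconv("1.5") == 1.5
    assert _floatconv("0x1.8p1") == 3.0
    # But "a" is not silently read as hex 10; the original ValueError
    # from float("a") is re-raised.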
@@ -1011,12 +1012,9 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
                 line_num = i + skiprows + 1
                 raise ValueError("Wrong number of columns at line %d"
                                  % line_num)
-
-            # Convert each value according to its column and store
-            items = [conv(val) for (conv, val) in zip(converters, vals)]
-
-            # Then pack it according to the dtype's nesting
-            items = packer(items)
+            # Convert each value according to its column, then pack it
+            # according to the dtype's nesting
+            items = packer(convert_row(vals))
             X.append(items)
             if len(X) > chunk_size:
                 yield X
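For context, the enclosing read_data generator accumulates converted rows and yields them in chunks; it behaves roughly like this simplified sketch (not the actual loadtxt code):

    def read_data(line_iter, chunk_size, convert_row, packer):
        X = []
        for line in line_iter:
            vals = line.split()
            X.append(packer(convert_row(vals)))  # convert, then nest per dtype
            if len(X) > chunk_size:
                yield X                          # hand off a full chunk
                X = []
        if X:
            yield X                              # final partial chunk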
@@ -1154,8 +1152,18 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
             else:
                 converters[i] = conv
 
-        converters = [conv if conv is not bytes else
-                      lambda x: x.encode(fencoding) for conv in converters]
+        fencode = methodcaller("encode", fencoding)
+        converters = [conv if conv is not bytes else fencode
+                      for conv in converters]
+        if len(set(converters)) == 1:
+            # Optimize single-type data. Note that this is only reached if
+            # `_getconv` returns equal callables (i.e. not local lambdas) on
+            # equal dtypes.
+            def convert_row(vals, _conv=converters[0]):
+                return [*map(_conv, vals)]
+        else:
+            def convert_row(vals):
+                return [conv(val) for conv, val in zip(converters, vals)]
 
         # read data in chunks and fill it into an array via resize
         # over-allocating and shrinking the array later may be faster but is
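The reason `fencode` must be one shared `methodcaller` rather than a fresh lambda per column is that distinct lambdas never compare equal, which would defeat the `len(set(converters)) == 1` test. A self-contained sketch of the dispatch, with illustrative column data:

    from operator import methodcaller

    fencode = methodcaller("encode", "latin1")   # one shared callable
    converters = [conv if conv is not bytes else fencode
                  for conv in [bytes, bytes, bytes]]
    assert len(set(converters)) == 1             # all columns share fencode

    if len(set(converters)) == 1:
        def convert_row(vals, _conv=converters[0]):
            return [*map(_conv, vals)]           # fast single-converter path
    else:
        def convert_row(vals):
            return [conv(val) for conv, val in zip(converters, vals)]

    print(convert_row(["a", "b", "c"]))          # [b'a', b'b', b'c']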