PERF: In loadtxt, decide once and for all whether decoding is needed.

... and use a single decoder function instead of repeatedly checking the input type for every line (in `_decode_line`). This gives a ~5-8% speedup.
author: Antony Lee <anntzer.lee@gmail.com> 2021-08-03 16:03:26 +0200
committer: Antony Lee <anntzer.lee@gmail.com> 2021-08-06 19:17:38 +0200
commit: e6be8977bd741ee39fdc0b30fccae036c68db9f7 (patch)
tree: 189d348fc01e65f3c19d41ad261adaadcce5380e /numpy/lib/npyio.py
parent: 99eac42354f9c900bf8897abec6e7c1bd6c9ff6c (diff)
download: numpy-e6be8977bd741ee39fdc0b30fccae036c68db9f7.tar.gz
1 files changed, 12 insertions, 6 deletions
diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py
index 983e2615c..159378992 100644
--- a/numpy/lib/npyio.py
+++ b/numpy/lib/npyio.py
@@ -979,9 +979,8 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
     # Nested functions used by loadtxt.
     # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 
-    def split_line(line):
-        """Chop off comments, strip, and split at delimiter. """
-        line = _decode_line(line, encoding=encoding)
+    def split_line(line: str):
+        """Chop off comments, strip, and split at delimiter."""
         for comment in comments:  # Much faster than using a single regex.
             line = line.split(comment, 1)[0]
         line = line.strip('\r\n')
@@ -1002,8 +1001,7 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
         X = []
         line_iter = itertools.chain([first_line], fh)
         line_iter = itertools.islice(line_iter, max_rows)
-        for i, line in enumerate(line_iter):
-            vals = split_line(line)
+        for i, vals in enumerate(map(split_line, map(decode, line_iter))):
             if len(vals) == 0:
                 continue
             if usecols:
@@ -1108,7 +1106,8 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
         # Read until we find a line with some values, and use it to determine
         # the need for decoding and estimate the number of columns.
         for first_line in fh:
-            ncols = len(usecols or split_line(first_line))
+            ncols = len(usecols
+                        or split_line(_decode_line(first_line, encoding)))
             if ncols:
                 break
         else:  # End of lines reached
@@ -1117,6 +1116,13 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
             warnings.warn('loadtxt: Empty input file: "%s"' % fname,
                           stacklevel=2)
 
+        # Decide once and for all whether decoding is needed.
+        if isinstance(first_line, bytes):
+            decode = methodcaller(
+                "decode", encoding if encoding is not None else "latin1")
+        else:
+            def decode(line): return line
+
         # Now that we know ncols, create the default converters list, and
         # set packing, if necessary.
         if len(dtype_types) > 1:
author	Antony Lee <anntzer.lee@gmail.com>	2021-08-03 16:03:26 +0200
committer	Antony Lee <anntzer.lee@gmail.com>	2021-08-06 19:17:38 +0200
commit	e6be8977bd741ee39fdc0b30fccae036c68db9f7 (patch)
tree	189d348fc01e65f3c19d41ad261adaadcce5380e /numpy/lib/npyio.py
parent	99eac42354f9c900bf8897abec6e7c1bd6c9ff6c (diff)
download	numpy-e6be8977bd741ee39fdc0b30fccae036c68db9f7.tar.gz