path: root/coverage/data.py
author     Ned Batchelder <ned@nedbatchelder.com>  2022-11-07 16:11:26 -0500
committer  Ned Batchelder <ned@nedbatchelder.com>  2022-11-08 06:36:47 -0500
commit     09bc2d6ab0f951c58546dab234edeaa9de7d4c44 (patch)
tree       d53ba990b4a90d264a83ee11f89cef9800db3317 /coverage/data.py
parent     bc630b58b5d1c58cc8108e584a480f3a1cd5ab70 (diff)
download   python-coveragepy-git-09bc2d6ab0f951c58546dab234edeaa9de7d4c44.tar.gz
perf: hash data files during combining to avoid unneeded work. #1483
When generating many parallel data files, some of them are often exact copies of each other. By hashing the files, we can skip combining the duplicates, which speeds up the process. On a coverage.py metacov, 651 of 2189 files (29%) were duplicates, and the time to combine was reduced by 17%.
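The technique is simple enough to show in isolation. The sketch below is illustrative only and not part of the commit (the unique_files helper and its paths argument are made up here): each file's bytes are hashed, and only the first file seen with a given digest is kept for combining.

import hashlib

def unique_files(paths):
    """Yield only the first path seen for each distinct file content."""
    seen_digests = set()
    for path in paths:
        with open(path, "rb") as fobj:
            hasher = hashlib.new("sha3_256")
            hasher.update(fobj.read())
            digest = hasher.digest()
        if digest in seen_digests:
            continue  # exact byte-for-byte copy of an earlier file: skip it
        seen_digests.add(digest)
        yield path

The commit applies the same check inline in combine_parallel_data(): a duplicate file is never read as coverage data, it is only deleted (unless keep is set).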
Diffstat (limited to 'coverage/data.py')
-rw-r--r--  coverage/data.py  71
1 file changed, 45 insertions, 26 deletions
diff --git a/coverage/data.py b/coverage/data.py
index 4bdfe301..798d167f 100644
--- a/coverage/data.py
+++ b/coverage/data.py
@@ -11,6 +11,7 @@ imports working.
 """
 
 import glob
+import hashlib
 import os.path
 
 from coverage.exceptions import CoverageException, NoDataError
@@ -110,7 +111,9 @@ def combine_parallel_data(
     if strict and not files_to_combine:
         raise NoDataError("No data to combine")
 
-    files_combined = 0
+    file_hashes = set()
+    combined_any = False
+
     for f in files_to_combine:
         if f == data.data_filename():
             # Sometimes we are combining into a file which is one of the
@@ -118,34 +121,50 @@ def combine_parallel_data(
             if data._debug.should('dataio'):
                 data._debug.write(f"Skipping combining ourself: {f!r}")
             continue
-        if data._debug.should('dataio'):
-            data._debug.write(f"Combining data file {f!r}")
+
         try:
-            new_data = CoverageData(f, debug=data._debug)
-            new_data.read()
-        except CoverageException as exc:
-            if data._warn:
-                # The CoverageException has the file name in it, so just
-                # use the message as the warning.
-                data._warn(str(exc))
+            rel_file_name = os.path.relpath(f)
+        except ValueError:
+            # ValueError can be raised under Windows when os.getcwd() returns a
+            # folder from a different drive than the drive of f, in which case
+            # we print the original value of f instead of its relative path
+            rel_file_name = f
+
+        with open(f, "rb") as fobj:
+            hasher = hashlib.new("sha3_256")
+            hasher.update(fobj.read())
+            sha = hasher.digest()
+            combine_this_one = sha not in file_hashes
+
+        delete_this_one = not keep
+        if combine_this_one:
+            if data._debug.should('dataio'):
+                data._debug.write(f"Combining data file {f!r}")
+            file_hashes.add(sha)
+            try:
+                new_data = CoverageData(f, debug=data._debug)
+                new_data.read()
+            except CoverageException as exc:
+                if data._warn:
+                    # The CoverageException has the file name in it, so just
+                    # use the message as the warning.
+                    data._warn(str(exc))
+                delete_this_one = False
+            else:
+                data.update(new_data, aliases=aliases)
+                combined_any = True
+                if message:
+                    message(f"Combined data file {rel_file_name}")
         else:
-            data.update(new_data, aliases=aliases)
-            files_combined += 1
             if message:
-                try:
-                    file_name = os.path.relpath(f)
-                except ValueError:
-                    # ValueError can be raised under Windows when os.getcwd() returns a
-                    # folder from a different drive than the drive of f, in which case
-                    # we print the original value of f instead of its relative path
-                    file_name = f
-                message(f"Combined data file {file_name}")
-            if not keep:
-                if data._debug.should('dataio'):
-                    data._debug.write(f"Deleting combined data file {f!r}")
-                file_be_gone(f)
-
-    if strict and not files_combined:
+                message(f"Skipping duplicate data {rel_file_name}")
+
+        if delete_this_one:
+            if data._debug.should('dataio'):
+                data._debug.write(f"Deleting data file {f!r}")
+            file_be_gone(f)
+
+    if strict and not combined_any:
         raise NoDataError("No usable data files")
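For reference, one hypothetical way to see the new behavior from a small script; this snippet is not part of the commit. It calls the internal combine_parallel_data() shown above and assumes a data_paths parameter alongside the keep and message parameters visible in the diff, plus the usual .coverage.* naming for parallel data files.

import shutil

from coverage import Coverage
from coverage.data import CoverageData, combine_parallel_data

# Record a little coverage data, then make an exact copy of the data file.
cov = Coverage(data_file=".coverage.worker1")
cov.start()
_ = sum(range(100))
cov.stop()
cov.save()
shutil.copyfile(".coverage.worker1", ".coverage.worker2")  # byte-identical duplicate

# Combine everything matching .coverage.* in the current directory.
combined = CoverageData(".coverage")
combine_parallel_data(
    combined,
    data_paths=["."],  # assumed parameter name; directories are scanned for data files
    keep=True,         # leave the input files in place for this demo
    message=print,     # expect one "Combined data file ..." and one "Skipping duplicate data ..."
)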