Refactor collector->data; data now holds either lines or arcs, not both.

The collector now saves its results directly into the CoverageData object, so control.py is less involved. When measuring arcs (branch coverage), the data stores only arcs; line data is calculated from the arcs as needed. This saves space in the data file, and is faster.
author: Ned Batchelder <ned@nedbatchelder.com> 2015-07-18 14:09:54 -0400
committer: Ned Batchelder <ned@nedbatchelder.com> 2015-07-18 14:09:54 -0400
commit: aa9af88224fac4d25d5bf1d2f4757b8ffd2c22ee (patch)
tree: 3697e6162d46fdb6e17b2c6d694314dff19280ec /coverage
parent: ed2266434af1582cd94c1b89f7172bad62f88745 (diff)
download: python-coveragepy-git-aa9af88224fac4d25d5bf1d2f4757b8ffd2c22ee.tar.gz
3 files changed, 52 insertions, 73 deletions
diff --git a/coverage/collector.py b/coverage/collector.py
index 57c35605..eec8703e 100644
--- a/coverage/collector.py
+++ b/coverage/collector.py
@@ -3,6 +3,8 @@
 import os, sys
 
 from coverage import env
+from coverage.backward import iitems
+from coverage.files import abs_file
 from coverage.misc import CoverageException
 from coverage.pytracer import PyTracer
 
@@ -20,7 +22,7 @@ except ImportError:
         # exception here causes all sorts of other noise in unittest.
         sys.stderr.write(
             "*** COVERAGE_TEST_TRACER is 'c' but can't import CTracer!\n"
-            )
+        )
         sys.exit(1)
     CTracer = None
 
@@ -46,7 +48,8 @@ class Collector(object):
     # the top, and resumed when they become the top again.
     _collectors = []
 
-    def __init__(self,
+    def __init__(
+        self,
         should_trace, check_include, timid, branch, warn, concurrency,
     ):
         """Create a collector.
@@ -289,45 +292,20 @@ class Collector(object):
         else:
             self._start_tracer()
 
-    def get_line_data(self):
-        """Return the line data collected.
+    def save_data(self, covdata):
+        """Save the collected data to a `CoverageData`.
 
-        Data is { filename: { lineno: None, ...}, ...}
+        Also resets the collector.
 
         """
-        if self.branch:
-            # If we were measuring branches, then we have to re-build the dict
-            # to show line data.  We'll use the first lines of all the arcs,
-            # if they are actual lines. We don't need the second lines, because
-            # the second lines will also be first lines, sometimes to exits.
-            line_data = {}
-            for f, arcs in self.data.items():
-                line_data[f] = dict(
-                    (l1, None) for l1, _ in arcs.keys() if l1 > 0
-                )
-            return line_data
-        else:
-            return self.data
-
-    def get_arc_data(self):
-        """Return the arc data collected.
-
-        Data is { filename: { (l1, l2): None, ...}, ...}
+        def abs_file_dict(d):
+            """Return a dict like d, but with keys modified by `abs_file`."""
+            return dict((abs_file(k), v) for k, v in iitems(d))
 
-        Note that no data is collected or returned if the Collector wasn't
-        created with `branch` true.
-
-        """
         if self.branch:
-            return self.data
+            covdata.add_arcs(abs_file_dict(self.data))
         else:
-            return {}
-
-    def get_plugin_data(self):
-        """Return the mapping of source files to plugins.
+            covdata.add_lines(abs_file_dict(self.data))
+        covdata.add_plugins(abs_file_dict(self.plugin_data))
 
-        Returns:
-            dict: { filename: plugin_name, ... }
-
-        """
-        return self.plugin_data
+        self.reset()
diff --git a/coverage/control.py b/coverage/control.py
index 7c14e1b0..3f6f5aca 100644
--- a/coverage/control.py
+++ b/coverage/control.py
@@ -744,15 +744,7 @@ class Coverage(object):
         if not self._measured:
             return
 
-        def abs_file_dict(d):
-            """Return a dict like d, but with keys modified by `abs_file`."""
-            return dict((abs_file(k), v) for k,v in iitems(d))
-
-        # TODO: seems like this parallel structure is getting kinda old...
-        self.data.add_lines(abs_file_dict(self.collector.get_line_data()))
-        self.data.add_arcs(abs_file_dict(self.collector.get_arc_data()))
-        self.data.add_plugins(abs_file_dict(self.collector.get_plugin_data()))
-        self.collector.reset()
+        self.collector.save_data(self.data)
 
         # If there are still entries in the source_pkgs list, then we never
         # encountered those packages.
diff --git a/coverage/data.py b/coverage/data.py
index db205811..adacaecc 100644
--- a/coverage/data.py
+++ b/coverage/data.py
@@ -8,7 +8,7 @@ import socket
 from coverage.backward import iitems, pickle
 from coverage.debug import _TEST_NAME_FILE
 from coverage.files import PathAliases
-from coverage.misc import file_be_gone
+from coverage.misc import CoverageException, file_be_gone
 
 
 class CoverageData(object):
@@ -18,12 +18,12 @@ class CoverageData(object):
 
         * collector: a string identifying the collecting software
 
-        * lines: a dict mapping filenames to sorted lists of line numbers
+        * lines: a dict mapping filenames to lists of line numbers
           executed::
 
             { 'file1': [17,23,45], 'file2': [1,2,3], ... }
 
-        * arcs: a dict mapping filenames to sorted lists of line number pairs::
+        * arcs: a dict mapping filenames to lists of line number pairs::
 
             { 'file1': [(17,23), (17,25), (25,26)], ... }
 
@@ -31,6 +31,11 @@ class CoverageData(object):
 
             { 'file1': "django.coverage", ... }
 
+    Only one of `lines` or `arcs` will be present: with branch coverage, data
+    is stored as arcs. Without branch coverage, it is stored as lines.  The
+    line data is easily recovered from the arcs: it is all the first elements
+    of the pairs that are greater than zero.
+
     """
 
     def __init__(self, collector=None, debug=None):
@@ -82,7 +87,12 @@ class CoverageData(object):
 
     def lines(self, filename):
         """Get the list of lines executed for a file."""
-        return list((self._lines.get(filename) or {}).keys())
+        if self._arcs:
+            arcs = self._arcs.get(filename) or {}
+            return [s for s, __ in arcs if s > 0]
+        else:
+            lines = self._lines.get(filename) or {}
+            return list(lines)
 
     def arcs(self, filename):
         """Get the list of arcs executed for a file."""
@@ -107,30 +117,29 @@ class CoverageData(object):
         Should only be used on an empty CoverageData object.
 
         """
-        try:
-            data = pickle.load(file_obj)
-            if isinstance(data, dict):
-                # Unpack the 'lines' item.
-                self._lines = dict([
-                    (f, dict.fromkeys(linenos, None))
-                    for f, linenos in iitems(data.get('lines', {}))
-                ])
-                # Unpack the 'arcs' item.
-                self._arcs = dict([
-                    (f, dict.fromkeys(arcpairs, None))
-                    for f, arcpairs in iitems(data.get('arcs', {}))
-                ])
-                self._plugins = data.get('plugins', {})
-        except Exception:
-            # TODO: this used to handle file-doesnt-exist problems.  Do we still need it?
-            pass
+        data = pickle.load(file_obj)
+
+        # Unpack the 'lines' item.
+        self._lines = dict([
+            (f, dict.fromkeys(linenos, None))
+            for f, linenos in iitems(data.get('lines', {}))
+        ])
+        # Unpack the 'arcs' item.
+        self._arcs = dict([
+            (f, dict.fromkeys(arcpairs, None))
+            for f, arcpairs in iitems(data.get('arcs', {}))
+        ])
+        self._plugins = data.get('plugins', {})
 
     def read_file(self, filename):
         """Read the coverage data from `filename`."""
         if self._debug and self._debug.should('dataio'):
             self._debug.write("Reading data from %r" % (filename,))
-        with open(filename, "rb") as f:
-            self.read(f)
+        try:
+            with open(filename, "rb") as f:
+                self.read(f)
+        except Exception as exc:
+            raise CoverageException("Couldn't read data from '%s': %s" % (filename, exc))
 
     def write(self, file_obj):
         """Write the coverage data to `file_obj`."""
@@ -202,11 +211,11 @@ class CoverageData(object):
 
     def touch_file(self, filename):
         """Ensure that `filename` appears in the data, empty if needed."""
-        self._lines.setdefault(filename, {})
+        (self._arcs or self._lines).setdefault(filename, {})
 
     def measured_files(self):
         """A list of all files that had been measured."""
-        return list(self._lines.keys())
+        return list(self._arcs or self._lines)
 
     def add_to_hash(self, filename, hasher):
         """Contribute `filename`'s data to the Md5Hash `hasher`."""
@@ -231,8 +240,8 @@ class CoverageData(object):
             filename_fn = lambda f: f
         else:
             filename_fn = os.path.basename
-        for filename, lines in iitems(self._lines):
-            summ[filename_fn(filename)] = len(lines)
+        for filename in self.measured_files():
+            summ[filename_fn(filename)] = len(self.lines(filename))
         return summ
 
     def __nonzero__(self):
author	Ned Batchelder <ned@nedbatchelder.com>	2015-07-18 14:09:54 -0400
committer	Ned Batchelder <ned@nedbatchelder.com>	2015-07-18 14:09:54 -0400
commit	aa9af88224fac4d25d5bf1d2f4757b8ffd2c22ee (patch)
tree	3697e6162d46fdb6e17b2c6d694314dff19280ec /coverage
parent	ed2266434af1582cd94c1b89f7172bad62f88745 (diff)
download	python-coveragepy-git-aa9af88224fac4d25d5bf1d2f4757b8ffd2c22ee.tar.gz