author    Sebastian Thiel <byronimo@gmail.com>	2010-06-03 18:21:05 +0200
committer Sebastian Thiel <byronimo@gmail.com>	2010-06-03 18:21:05 +0200
commit    26e138cb47dccc859ff219f108ce9b7d96cbcbcd (patch)
tree      db40b36d8265f6e2d55a5e4b6f9a6e4a991d2819
parent    38d59fc8ccccae8882fa48671377bf40a27915a7 (diff)
download  gitpython-26e138cb47dccc859ff219f108ce9b7d96cbcbcd.tar.gz
odb: fixed streamed decompression reader (specific tests are still missing) and added performance tests which are extremely promising
-rw-r--r--  lib/git/odb/db.py                       4
-rw-r--r--  lib/git/odb/utils.py                   22
-rw-r--r--  test/git/performance/test_streams.py   91
3 files changed, 107 insertions, 10 deletions
diff --git a/lib/git/odb/db.py b/lib/git/odb/db.py
index 5c50a512..e656b2b5 100644
--- a/lib/git/odb/db.py
+++ b/lib/git/odb/db.py
@@ -91,8 +91,8 @@ class iObjectDBW(object):
		they were given.
	:param iter_info: iterable yielding tuples containing the type_string,
		size_in_bytes and the stream with the content data.
- :param dry_run: see ``to_obj``
- :param sha_as_hex: see ``to_obj``
+ :param dry_run: see ``to_object``
+ :param sha_as_hex: see ``to_object``
:param max_threads: if < 1, any number of threads may be started while processing
the request, otherwise the given number of threads will be started.
:raise IOError: if data could not be written"""
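For context, a minimal usage sketch of this writer API, using the single-object form to_object()/has_object() exercised by the new performance test below; the object directory path here is made up for illustration:

    from cStringIO import StringIO
    from git.odb.db import LooseObjectDB

    ldb = LooseObjectDB('/tmp/example-objects')               # hypothetical objects directory
    data = 'hello world'
    sha = ldb.to_object('blob', len(data), StringIO(data))    # (type_string, size_in_bytes, stream)
    assert ldb.has_object(sha)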
diff --git a/lib/git/odb/utils.py b/lib/git/odb/utils.py
index 8a054201..1e4a8e9d 100644
--- a/lib/git/odb/utils.py
+++ b/lib/git/odb/utils.py
@@ -39,7 +39,7 @@ write = os.write
close = os.close
# ZLIB configuration
-# used when compressing objects
+# used when compressing objects - levels range from 1 (fastest) to 9 (slowest, best compression)
Z_BEST_SPEED = 1
#} END Routines
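As a reminder, zlib compression levels run from 1 (fastest, weakest compression) to 9 (slowest, best compression). A small stand-alone illustration of passing such a level to zlib; this snippet is not taken from the module itself:

    import zlib

    Z_BEST_SPEED = 1                              # favour speed over compression ratio
    co = zlib.compressobj(Z_BEST_SPEED)           # one compressor per object stream
    chunk = co.compress(b'some object data') + co.flush()
    assert zlib.decompress(chunk) == b'some object data'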
@@ -70,7 +70,7 @@ class FDCompressedSha1Writer(object):
bytes_written = write(self.fd, cdata)
if bytes_written != len(cdata):
raise self.exc
- return bytes_written
+ return len(data)
def sha(self, as_hex = False):
""":return: sha so far
@@ -175,7 +175,7 @@ class DecompressMemMapReader(object):
self._br += size
return dat
else:
- dat = self._buf.getvalue() # ouch, duplicates data
+ dat = self._buf.read() # ouch, duplicates data
size -= self._buflen
self._br += self._buflen
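The switch from getvalue() to read() matters because getvalue() always returns the complete buffer, including bytes that were already handed out, while read() only returns what is left after the current position:

    from cStringIO import StringIO

    buf = StringIO('abcdef')
    buf.read(2)                            # 'ab' already consumed
    assert buf.getvalue() == 'abcdef'      # whole buffer, consumed bytes included
    assert buf.read() == 'cdef'            # only the unread remainder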
@@ -195,28 +195,34 @@ class DecompressMemMapReader(object):
# copied once, and another copy of a part of it when it creates the unconsumed
			# tail. We have to use it to hand in the appropriate amount of bytes during
# the next read.
- if self._zip.unconsumed_tail:
+ tail = self._zip.unconsumed_tail
+ if tail:
# move the window, make it as large as size demands. For code-clarity,
# we just take the chunk from our map again instead of reusing the unconsumed
			# tail. The latter would save some memory copying, but we could end up
			# with not getting enough data uncompressed, so we would have to sort that out as well.
# Now we just assume the worst case, hence the data is uncompressed and the window
# needs to be as large as the uncompressed bytes we want to read.
- self._cws = self._cwe - len(self._zip.unconsumed_tail)
+ self._cws = self._cwe - len(tail)
self._cwe = self._cws + size
+
+
indata = self._m[self._cws:self._cwe] # another copy ... :(
+ # get the actual window end to be sure we don't use it for computations
+ self._cwe = self._cws + len(indata)
else:
cws = self._cws
self._cws = self._cwe
self._cwe = cws + size
indata = self._m[self._cws:self._cwe] # ... copy it again :(
# END handle tail
-
+
dcompdat = self._zip.decompress(indata, size)
- self._br += len(dcompdat)
+ self._br += len(dcompdat)
if dat:
- return dat + dcompdat
+ dcompdat = dat + dcompdat
+
return dcompdat
#} END classes
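For reference, the unconsumed_tail mechanics the hunk above works around: when decompress() is capped to a maximum output size, the compressed input zlib did not consume is stored in unconsumed_tail and has to be handed back on the next call. A condensed, stand-alone sketch of that loop over an in-memory string; the real DecompressMemMapReader slides a window over a memory map and additionally accounts for the object header:

    import zlib

    def stream_decompress(compressed, chunk_size=4096):
        """Yield decompressed chunks, re-feeding zlib's unconsumed tail each round."""
        zobj = zlib.decompressobj()
        indata = compressed
        while indata:
            out = zobj.decompress(indata, chunk_size)   # cap output at chunk_size
            if out:
                yield out
            indata = zobj.unconsumed_tail               # input zlib has not consumed yet

    payload = 'x' * 100000
    assert ''.join(stream_decompress(zlib.compress(payload))) == payload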
diff --git a/test/git/performance/test_streams.py b/test/git/performance/test_streams.py
new file mode 100644
index 00000000..15924c08
--- /dev/null
+++ b/test/git/performance/test_streams.py
@@ -0,0 +1,91 @@
+"""Performance data streaming performance"""
+
+from test.testlib import *
+from git.odb.db import *
+
+from array import array
+from cStringIO import StringIO
+from time import time
+import os
+import sys
+import stat
+import random
+
+
+from lib import (
+ TestBigRepoReadOnly
+ )
+
+
+
+def make_memory_file(size_in_bytes, randomize=False):
+ """:return: tuple(size_of_stream, stream)
+ :param randomize: try to produce a very random stream"""
+ actual_size = size_in_bytes / 4
+ producer = xrange(actual_size)
+ if randomize:
+ producer = list(producer)
+ random.shuffle(producer)
+ # END randomize
+ a = array('i', producer)
+ return actual_size*4, StringIO(a.tostring())
+
+
+class TestObjDBPerformance(TestBigRepoReadOnly):
+
+ large_data_size_bytes = 1000*1000*10 # some MiB should do it
+ moderate_data_size_bytes = 1000*1000*1 # just 1 MiB
+
+ @with_bare_rw_repo
+ def test_large_data_streaming(self, rwrepo):
+ ldb = LooseObjectDB(os.path.join(rwrepo.git_dir, 'objects'))
+
+ for randomize in range(2):
+ desc = (randomize and 'random ') or ''
+ print >> sys.stderr, "Creating %s data ..." % desc
+ st = time()
+ size, stream = make_memory_file(self.large_data_size_bytes, randomize)
+ elapsed = time() - st
+ print >> sys.stderr, "Done (in %f s)" % elapsed
+
+ # writing - due to the compression it will seem faster than it is
+ st = time()
+ sha = ldb.to_object('blob', size, stream)
+ elapsed = time() - st
+ assert ldb.has_object(sha)
+ fsize_kib = os.path.getsize(ldb.readable_db_object_path(sha)) / 1000
+
+
+ size_kib = size / 1000
+ print >> sys.stderr, "Added %i KiB (filesize = %i KiB) of %s data to loose odb in %f s ( %f Write KiB / s)" % (size_kib, fsize_kib, desc, elapsed, size_kib / elapsed)
+
+ # reading all at once
+ st = time()
+ type, size, shastream = ldb.object(sha)
+ shadata = shastream.read()
+ elapsed = time() - st
+
+ stream.seek(0)
+ assert shadata == stream.getvalue()
+ print >> sys.stderr, "Read %i KiB of %s data at once from loose odb in %f s ( %f Read KiB / s)" % (size_kib, desc, elapsed, size_kib / elapsed)
+
+
+		# reading in chunks of 512 KiB
+ cs = 512*1000
+ chunks = list()
+ st = time()
+ type, size, shastream = ldb.object(sha)
+ while True:
+ data = shastream.read(cs)
+ chunks.append(data)
+ if len(data) < cs:
+ break
+ # END read in chunks
+ elapsed = time() - st
+
+ stream.seek(0)
+ assert ''.join(chunks) == stream.getvalue()
+
+ cs_kib = cs / 1000
+ print >> sys.stderr, "Read %i KiB of %s data in %i KiB chunks from loose odb in %f s ( %f Read KiB / s)" % (size_kib, desc, cs_kib, elapsed, size_kib / elapsed)
+ # END for each randomization factor