summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--CHANGES1
-rw-r--r--lib/git/cmd.py7
-rw-r--r--lib/git/objects/base.py4
-rw-r--r--lib/git/objects/commit.py42
-rw-r--r--lib/git/objects/utils.py25
-rw-r--r--lib/git/odb/utils.py32
-rw-r--r--lib/git/repo.py43
-rw-r--r--lib/git/utils.py15
-rw-r--r--test/git/performance/lib.py25
-rw-r--r--test/git/performance/test_commit.py36
-rw-r--r--test/git/performance/test_odb.py6
-rw-r--r--test/git/performance/test_streams.py4
-rw-r--r--test/git/performance/test_utils.py4
-rw-r--r--test/git/test_commit.py59
-rw-r--r--test/git/test_utils.py9
15 files changed, 230 insertions, 82 deletions
diff --git a/CHANGES b/CHANGES
index e24e723d..e9e1257e 100644
--- a/CHANGES
+++ b/CHANGES
@@ -5,6 +5,7 @@ CHANGES
0.2 Beta 2
===========
* Commit objects now carry the 'encoding' information of their message. It wasn't parsed previously, and defaults to UTF-8
+ * Commit.create_from_tree now uses a pure-python implementation, mimicking git-commit-tree
0.2
=====
diff --git a/lib/git/cmd.py b/lib/git/cmd.py
index aaa27adc..18d1c505 100644
--- a/lib/git/cmd.py
+++ b/lib/git/cmd.py
@@ -323,12 +323,7 @@ class Git(object):
stdout_value = proc.stdout.read().rstrip() # strip trailing "\n"
else:
max_chunk_size = 1024*64
- while True:
- chunk = proc.stdout.read(max_chunk_size)
- output_stream.write(chunk)
- if len(chunk) < max_chunk_size:
- break
- # END reading output stream
+ stream_copy(proc.stdout, output_stream, max_chunk_size)
stdout_value = output_stream
# END stdout handling
stderr_value = proc.stderr.read().rstrip() # strip trailing "\n"
diff --git a/lib/git/objects/base.py b/lib/git/objects/base.py
index 64a5678e..f7043199 100644
--- a/lib/git/objects/base.py
+++ b/lib/git/objects/base.py
@@ -125,8 +125,8 @@ class Object(LazyMixin):
Returns
File Object compatible stream to the uncompressed raw data of the object
"""
- sha, type, size, stream = self.repo.git.stream_object_data(self.sha)
- return stream
+ proc = self.repo.git.cat_file(self.type, self.sha, as_process=True)
+ return utils.ProcessStreamAdapter(proc, "stdout")
def stream_data(self, ostream):
"""
diff --git a/lib/git/objects/commit.py b/lib/git/objects/commit.py
index 98aca360..d56ce306 100644
--- a/lib/git/objects/commit.py
+++ b/lib/git/objects/commit.py
@@ -91,15 +91,6 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Seri
"""
super(Commit,self).__init__(repo, sha)
self._set_self_from_args_(locals())
-
- if parents is not None:
- cls = type(self)
- self.parents = tuple(cls(repo, p) for p in parents if not isinstance(p, cls))
- # END for each parent to convert
-
- if self.sha and tree is not None:
- self.tree = Tree(repo, tree, path='')
- # END id to tree conversion
@classmethod
def _get_intermediate_items(cls, commit):
@@ -350,7 +341,12 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Seri
committer, committer_time, committer_offset,
message, parent_commits, conf_encoding)
- # serialize !
+ stream = StringIO()
+ new_commit._serialize(stream)
+ streamlen = stream.tell()
+ stream.seek(0)
+
+ new_commit.sha = repo.odb.to_object(cls.type, streamlen, stream, sha_as_hex=True)
if head:
try:
@@ -377,8 +373,28 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Seri
#{ Serializable Implementation
def _serialize(self, stream):
- # for now, this is very inefficient and in fact shouldn't be used like this
- return super(Commit, self)._serialize(stream)
+        emit = stream.write
+        emit("tree %s\n" % self.tree)
+        for parent in self.parents:
+            emit("parent %s\n" % parent)
+
+        # author and committer lines share one layout: role, name, email, date, tz
+        author, committer = self.author, self.committer
+        actor_line = "%s %s <%s> %s %s\n"
+        for role, actor, date, offset in (
+                ("author", author, self.authored_date, self.author_tz_offset),
+                ("committer", committer, self.committed_date, self.committer_tz_offset)):
+            emit(actor_line % (role, actor.name, actor.email, date,
+                               utils.altz_to_utctz_str(offset)))
+        # END for each actor
+
+        # the encoding header is only written when it is not the default one
+        if self.encoding != self.default_encoding:
+            emit("encoding %s\n" % self.encoding)
+
+        emit("\n")  # an empty line separates the headers from the message
+        emit(self.message)
+        return self
def _deserialize(self, stream):
""":param from_rev_list: if true, the stream format is coming from the rev-list command
@@ -416,7 +432,7 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Seri
# a stream from our data simply gives us the plain message
# The end of our message stream is marked with a newline that we strip
- self.message = stream.read()[:-1]
+ self.message = stream.read()
return self
#} END serializable implementation
diff --git a/lib/git/objects/utils.py b/lib/git/objects/utils.py
index 6d378a72..c93f2091 100644
--- a/lib/git/objects/utils.py
+++ b/lib/git/objects/utils.py
@@ -16,7 +16,8 @@ import time
import os
__all__ = ('get_object_type_by_name', 'get_user_id', 'parse_date', 'parse_actor_and_date',
- 'ProcessStreamAdapter', 'Traversable')
+ 'ProcessStreamAdapter', 'Traversable', 'altz_to_utctz_str', 'utctz_to_altz',
+ 'verify_utctz')
def get_object_type_by_name(object_type_name):
"""
@@ -57,14 +58,24 @@ def get_user_id():
return "%s@%s" % (username, platform.node())
-def _utc_tz_to_altz(utctz):
+def utctz_to_altz(utctz):
"""we convert utctz to the timezone in seconds, it is the format time.altzone
returns. Git stores it as UTC timezon which has the opposite sign as well,
which explains the -1 * ( that was made explicit here )
:param utctz: git utc timezone string, i.e. +0200"""
return -1 * int(float(utctz)/100*3600)
+
+def altz_to_utctz_str(altz):
+    """Inverse of utctz_to_altz: render a time.altzone style offset for commits.
+    :param altz: offset in seconds west of UTC; partial minutes are dropped
+    :return: git timezone string of opposite sign, i.e. -7200 becomes '+0200'"""
+    # split the magnitude: dividing a signed value by 3600 loses the minutes
+    hours, minutes = divmod(abs(int(altz)) // 60, 60)
+    prefix = (altz > 0 and '-') or '+'
+    return "%s%02i%02i" % (prefix, hours, minutes)
+
-def _verify_utctz(offset):
+def verify_utctz(offset):
""":raise ValueError: if offset is incorrect
:return: offset"""
fmt_exc = ValueError("Invalid timezone offset format: %s" % offset)
@@ -97,11 +108,11 @@ def parse_date(string_date):
if string_date.count(' ') == 1 and string_date.rfind(':') == -1:
timestamp, offset = string_date.split()
timestamp = int(timestamp)
- return timestamp, _utc_tz_to_altz(_verify_utctz(offset))
+ return timestamp, utctz_to_altz(verify_utctz(offset))
else:
offset = "+0000" # local time by default
if string_date[-5] in '-+':
- offset = _verify_utctz(string_date[-5:])
+ offset = verify_utctz(string_date[-5:])
string_date = string_date[:-6] # skip space as well
# END split timezone info
@@ -139,7 +150,7 @@ def parse_date(string_date):
fstruct = time.struct_time((dtstruct.tm_year, dtstruct.tm_mon, dtstruct.tm_mday,
tstruct.tm_hour, tstruct.tm_min, tstruct.tm_sec,
dtstruct.tm_wday, dtstruct.tm_yday, tstruct.tm_isdst))
- return int(time.mktime(fstruct)), _utc_tz_to_altz(offset)
+ return int(time.mktime(fstruct)), utctz_to_altz(offset)
except ValueError:
continue
# END exception handling
@@ -167,7 +178,7 @@ def parse_actor_and_date(line):
"""
m = _re_actor_epoch.search(line)
actor, epoch, offset = m.groups()
- return (Actor._from_string(actor), int(epoch), _utc_tz_to_altz(offset))
+ return (Actor._from_string(actor), int(epoch), utctz_to_altz(offset))
diff --git a/lib/git/odb/utils.py b/lib/git/odb/utils.py
index 94d1cea8..fd340962 100644
--- a/lib/git/odb/utils.py
+++ b/lib/git/odb/utils.py
@@ -137,7 +137,7 @@ class DecompressMemMapReader(object):
# END handle size
# read header
- maxb = 8192
+ maxb = 512 # should really be enough, cgit uses 8192 I believe
self._s = maxb
hdr = self.read(maxb)
hdrend = hdr.find("\0")
@@ -172,20 +172,24 @@ class DecompressMemMapReader(object):
# Our performance now depends on StringIO. This way we don't need two large
# buffers in peak times, but only one large one in the end which is
# the return buffer
- if size > self.max_read_size:
- sio = StringIO()
- while size:
- read_size = min(self.max_read_size, size)
- data = self.read(read_size)
- sio.write(data)
- size -= len(data)
- if len(data) < read_size:
- break
- # END data loop
- sio.seek(0)
- return sio.getvalue()
- # END handle maxread
+ # NO: We don't do it - if the user thinks it's best, he is right. If he
+ # has trouble, he will start reading in chunks. According to our tests
+ # it's still faster if we read 10 Mb at once instead of chunking it.
+ # if size > self.max_read_size:
+ # sio = StringIO()
+ # while size:
+ # read_size = min(self.max_read_size, size)
+ # data = self.read(read_size)
+ # sio.write(data)
+ # size -= len(data)
+ # if len(data) < read_size:
+ # break
+ # # END data loop
+ # sio.seek(0)
+ # return sio.getvalue()
+ # # END handle maxread
+ #
# deplete the buffer, then just continue using the decompress object
# which has an own buffer. We just need this to transparently parse the
# header from the zlib stream
diff --git a/lib/git/repo.py b/lib/git/repo.py
index f4caa3fb..0bd2249c 100644
--- a/lib/git/repo.py
+++ b/lib/git/repo.py
@@ -4,12 +4,6 @@
# This module is part of GitPython and is released under
# the BSD License: http://www.opensource.org/licenses/bsd-license.php
-import os
-import sys
-import re
-import gzip
-import StringIO
-
from errors import InvalidGitRepositoryError, NoSuchPathError
from cmd import Git
from actor import Actor
@@ -19,6 +13,15 @@ from objects import *
from config import GitConfigParser
from remote import Remote
+from odb.db import LooseObjectDB
+
+import os
+import sys
+import re
+import gzip
+import StringIO
+
+
def touch(filename):
fp = open(filename, "a")
fp.close()
@@ -53,7 +56,7 @@ class Repo(object):
'git_dir' is the .git repository directoy, which is always set.
"""
DAEMON_EXPORT_FILE = 'git-daemon-export-ok'
- __slots__ = ( "working_dir", "_working_tree_dir", "git_dir", "_bare", "git" )
+ __slots__ = ( "working_dir", "_working_tree_dir", "git_dir", "_bare", "git", "odb" )
# precompiled regex
re_whitespace = re.compile(r'\s+')
@@ -65,27 +68,22 @@ class Repo(object):
# represents the configuration level of a configuration file
config_level = ("system", "global", "repository")
- def __init__(self, path=None):
- """
- Create a new Repo instance
-
- ``path``
- is the path to either the root git directory or the bare git repo
+ def __init__(self, path=None, odbt = LooseObjectDB):
+ """ Create a new Repo instance
- Examples::
+ :param path: is the path to either the root git directory or the bare git repo::
repo = Repo("/Users/mtrier/Development/git-python")
repo = Repo("/Users/mtrier/Development/git-python.git")
repo = Repo("~/Development/git-python.git")
repo = Repo("$REPOSITORIES/Development/git-python.git")
-
- Raises
- InvalidGitRepositoryError or NoSuchPathError
-
- Returns
- ``git.Repo``
- """
-
+
+ :param odbt: Object DataBase type - a type which is constructed by providing
+ the directory containing the database objects, i.e. .git/objects. It will
+ be used to access all object data
+ :raise InvalidGitRepositoryError:
+ :raise NoSuchPathError:
+ :return: git.Repo """
epath = os.path.abspath(os.path.expandvars(os.path.expanduser(path or os.getcwd())))
if not os.path.exists(epath):
@@ -130,6 +128,7 @@ class Repo(object):
self.working_dir = self._working_tree_dir or self.git_dir
self.git = Git(self.working_dir)
+ self.odb = odbt(os.path.join(self.git_dir, 'objects'))
def __eq__(self, rhs):
if isinstance(rhs, Repo):
diff --git a/lib/git/utils.py b/lib/git/utils.py
index 360c77c9..60a7de48 100644
--- a/lib/git/utils.py
+++ b/lib/git/utils.py
@@ -27,6 +27,21 @@ def make_sha(source=''):
sha1 = sha.sha(source)
return sha1
+def stream_copy(source, destination, chunk_size=512*1024):
+    """Copy all data from the source stream into the destination stream in chunks
+    of size chunk_size until source.read() returns no more data
+    :return: amount of bytes written"""
+    br = 0
+    while True:
+        chunk = source.read(chunk_size)
+        if not chunk:   # a short read is not the end of the stream, an empty one is
+            break
+        destination.write(chunk)
+        br += len(chunk)
+    # END reading source stream
+    return br
+
+
def join_path(a, *p):
"""Join path tokens together similar to os.path.join, but always use
'/' instead of possibly '\' on windows."""
diff --git a/test/git/performance/lib.py b/test/git/performance/lib.py
index 4b552b20..650bea82 100644
--- a/test/git/performance/lib.py
+++ b/test/git/performance/lib.py
@@ -1,6 +1,8 @@
"""Contains library functions"""
import os
from test.testlib import *
+import shutil
+import tempfile
from git import (
Repo
@@ -25,7 +27,7 @@ def resolve_or_fail(env_var):
#{ Base Classes
-class TestBigRepoReadOnly(TestBase):
+class TestBigRepoR(TestBase):
"""TestCase providing access to readonly 'big' repositories using the following
member variables:
@@ -40,7 +42,24 @@ class TestBigRepoReadOnly(TestBase):
@classmethod
def setUpAll(cls):
- super(TestBigRepoReadOnly, cls).setUpAll()
- cls.gitrepo = Repo(resolve_or_fail(k_env_git_repo))
+ super(TestBigRepoR, cls).setUpAll()
+ cls.gitrorepo = Repo(resolve_or_fail(k_env_git_repo))
+
+class TestBigRepoRW(TestBigRepoR):
+    """As above, but provides a big repository that we can write to.
+
+    Provides ``self.gitrwrepo``"""
+
+    @classmethod
+    def setUpAll(cls):
+        super(TestBigRepoRW, cls).setUpAll()
+        # mkdtemp creates the directory itself, mktemp only guesses an unused name
+        dirname = tempfile.mkdtemp()
+        cls.gitrwrepo = cls.gitrorepo.clone(dirname, shared=True, bare=True)
+
+    @classmethod
+    def tearDownAll(cls):
+        shutil.rmtree(cls.gitrwrepo.working_dir)  # falls back to git_dir without a working tree
+
#} END base classes
diff --git a/test/git/performance/test_commit.py b/test/git/performance/test_commit.py
index b4a9d868..2398c93d 100644
--- a/test/git/performance/test_commit.py
+++ b/test/git/performance/test_commit.py
@@ -6,10 +6,12 @@
from lib import *
from git import *
+from test.git.test_commit import assert_commit_serialization
+from cStringIO import StringIO
from time import time
import sys
-class TestPerformance(TestBigRepoReadOnly):
+class TestPerformance(TestBigRepoRW):
# ref with about 100 commits in its history
ref_100 = '0.1.6'
@@ -48,7 +50,7 @@ class TestPerformance(TestBigRepoReadOnly):
# bound to cat-file parsing performance
nc = 0
st = time()
- for c in self.gitrepo.commit(self.head_sha_2k).traverse(branch_first=False):
+ for c in self.gitrorepo.commit(self.head_sha_2k).traverse(branch_first=False):
nc += 1
self._query_commit_info(c)
# END for each traversed commit
@@ -59,10 +61,38 @@ class TestPerformance(TestBigRepoReadOnly):
# bound to stream parsing performance
nc = 0
st = time()
- for c in Commit.iter_items(self.gitrepo, self.head_sha_2k):
+ for c in Commit.iter_items(self.gitrorepo, self.head_sha_2k):
nc += 1
self._query_commit_info(c)
# END for each traversed commit
elapsed_time = time() - st
print >> sys.stderr, "Iterated %i Commits in %s [s] ( %f commits/s )" % (nc, elapsed_time, nc/elapsed_time)
+    def test_commit_serialization(self):
+        """Verify commit round trips, then time the creation of new loose commit objects"""
+        assert_commit_serialization(self.gitrwrepo, self.head_sha_2k, True)
+
+        rwrepo = self.gitrwrepo
+        make_object = rwrepo.odb.to_object
+        # direct serialization - deserialization can be tested afterwards
+        # serialization is probably limited on IO
+        hc = rwrepo.commit(self.head_sha_2k)
+
+        nc = 5000
+        st = time()
+        for i in xrange(nc):
+            cm = Commit(rwrepo, Commit.NULL_HEX_SHA, hc.tree,
+                        hc.author, hc.authored_date, hc.author_tz_offset,
+                        hc.committer, hc.committed_date, hc.committer_tz_offset,
+                        str(i), parents=hc.parents, encoding=hc.encoding)
+
+            stream = StringIO()
+            cm._serialize(stream)
+            slen = stream.tell()
+            stream.seek(0)
+
+            cm.sha = make_object(Commit.type, slen, stream)
+        # END commit creation
+        elapsed = time() - st
+
+        print >> sys.stderr, "Serialized %i commits to loose objects in %f s ( %f commits / s )" % (nc, elapsed, nc / elapsed)
diff --git a/test/git/performance/test_odb.py b/test/git/performance/test_odb.py
index 0ad2ce33..7b1ee838 100644
--- a/test/git/performance/test_odb.py
+++ b/test/git/performance/test_odb.py
@@ -5,18 +5,18 @@ import sys
import stat
from lib import (
- TestBigRepoReadOnly
+ TestBigRepoR
)
-class TestObjDBPerformance(TestBigRepoReadOnly):
+class TestObjDBPerformance(TestBigRepoR):
def test_random_access(self):
# GET COMMITS
# TODO: use the actual db for this
st = time()
- root_commit = self.gitrepo.commit(self.head_sha_2k)
+ root_commit = self.gitrorepo.commit(self.head_sha_2k)
commits = list(root_commit.traverse())
nc = len(commits)
elapsed = time() - st
diff --git a/test/git/performance/test_streams.py b/test/git/performance/test_streams.py
index 6c2834b3..d31bee14 100644
--- a/test/git/performance/test_streams.py
+++ b/test/git/performance/test_streams.py
@@ -14,7 +14,7 @@ import subprocess
from lib import (
- TestBigRepoReadOnly
+ TestBigRepoR
)
@@ -32,7 +32,7 @@ def make_memory_file(size_in_bytes, randomize=False):
return actual_size*4, StringIO(a.tostring())
-class TestObjDBPerformance(TestBigRepoReadOnly):
+class TestObjDBPerformance(TestBigRepoR):
large_data_size_bytes = 1000*1000*10 # some MiB should do it
moderate_data_size_bytes = 1000*1000*1 # just 1 MiB
diff --git a/test/git/performance/test_utils.py b/test/git/performance/test_utils.py
index 381d7c8b..47366d34 100644
--- a/test/git/performance/test_utils.py
+++ b/test/git/performance/test_utils.py
@@ -4,11 +4,11 @@ import sys
import stat
from lib import (
- TestBigRepoReadOnly
+ TestBigRepoR
)
-class TestUtilPerformance(TestBigRepoReadOnly):
+class TestUtilPerformance(TestBigRepoR):
def test_access(self):
# compare dict vs. slot access
diff --git a/test/git/test_commit.py b/test/git/test_commit.py
index ad7a0082..a5f184e6 100644
--- a/test/git/test_commit.py
+++ b/test/git/test_commit.py
@@ -7,6 +7,56 @@
from test.testlib import *
from git import *
+from cStringIO import StringIO
+import time
+import sys
+
+
+def assert_commit_serialization(rwrepo, commit_id, print_performance_info=False):
+    """Walk the history reachable from commit_id and verify for each commit that
+    serializing it again yields an object with the very same sha.
+    :param rwrepo: repository whose object database receives the written commits
+    :param commit_id: revision at which the traversal starts
+    :param print_performance_info: if True, throughput figures go to stderr"""
+    def store(commit):
+        """:return: sha of the object obtained by serializing commit into the odb"""
+        ostream = StringIO()
+        commit._serialize(ostream)
+        size = ostream.tell()
+        ostream.seek(0)
+        return rwrepo.odb.to_object(Commit.type, size, ostream)
+    # END store helper
+
+    num_serialized = 0
+    num_deserialized = 0
+    start = time.time()
+    for cm in rwrepo.commit(commit_id).traverse():
+        num_deserialized += 1
+
+        # a correctly deserialized commit serializes back to its own sha
+        csha = store(cm)
+        num_serialized += 1
+        assert csha == cm.sha
+
+        # the same must hold for a commit assembled from the parsed fields
+        nc = Commit(rwrepo, Commit.NULL_HEX_SHA, cm.tree.sha,
+                    cm.author, cm.authored_date, cm.author_tz_offset,
+                    cm.committer, cm.committed_date, cm.committer_tz_offset,
+                    cm.message, cm.parents, cm.encoding)
+        assert nc.parents == cm.parents
+
+        nc.sha = store(nc)
+        num_serialized += 1
+        # if it worked, we have exactly the same contents !
+        assert nc.sha == cm.sha
+    # END check commits
+    elapsed = time.time() - start
+
+    if print_performance_info:
+        print >> sys.stderr, "Serialized %i and deserialized %i commits in %f s ( (%f, %f) commits / s" % (num_serialized, num_deserialized, elapsed, num_serialized/elapsed, num_deserialized/elapsed)
+    # END handle performance info
+
+
class TestCommit(TestBase):
def test_bake(self):
@@ -19,7 +69,7 @@ class TestCommit(TestBase):
assert commit.author == commit.committer
assert isinstance(commit.authored_date, int) and isinstance(commit.committed_date, int)
assert isinstance(commit.author_tz_offset, int) and isinstance(commit.committer_tz_offset, int)
- assert commit.message == "Added missing information to docstrings of commit and stats module"
+ assert commit.message == "Added missing information to docstrings of commit and stats module\n"
def test_stats(self):
@@ -49,7 +99,7 @@ class TestCommit(TestBase):
assert commit.committed_date == 1210193388
assert commit.author_tz_offset == 14400, commit.author_tz_offset
assert commit.committer_tz_offset == 14400, commit.committer_tz_offset
- assert commit.message == "initial project"
+ assert commit.message == "initial project\n"
def test_traversal(self):
start = self.rorepo.commit("a4d06724202afccd2b5c54f81bcf2bf26dea7fff")
@@ -171,3 +221,8 @@ class TestCommit(TestBase):
name_rev = self.rorepo.head.commit.name_rev
assert isinstance(name_rev, basestring)
+ @with_bare_rw_repo
+ def test_serialization(self, rwrepo):
+ # serialize every commit in the history of the '0.1.6' ref and compare shas
+ assert_commit_serialization(rwrepo, '0.1.6')
+
diff --git a/test/git/test_utils.py b/test/git/test_utils.py
index 2c3c392b..83ef7e4b 100644
--- a/test/git/test_utils.py
+++ b/test/git/test_utils.py
@@ -116,8 +116,6 @@ class TestUtils(TestCase):
os.remove(my_file)
# END final cleanup
-
-
def test_user_id(self):
assert '@' in get_user_id()
@@ -127,7 +125,12 @@ class TestUtils(TestCase):
assert len(rval) == 2
assert isinstance(rval[0], int) and isinstance(rval[1], int)
assert rval[0] == veri_time
- assert rval[1] == offset
+ assert rval[1] == offset
+
+ # now that we are here, test our conversion functions as well
+ utctz = altz_to_utctz_str(offset)
+ assert isinstance(utctz, basestring)
+ assert utctz_to_altz(verify_utctz(utctz)) == offset
# END assert rval utility
rfc = ("Thu, 07 Apr 2005 22:13:11 +0000", 0)