Diffstat (limited to 'git/fun.py')
-rw-r--r-- | git/fun.py | 276
1 file changed, 147 insertions(+), 129 deletions(-)
@@ -8,7 +8,7 @@ it into c later, if required"""
 from exc import (
     BadObjectType
-    )
+)
 
 from util import zlib
 decompressobj = zlib.decompressobj
@@ -23,32 +23,32 @@
 OFS_DELTA = 6
 REF_DELTA = 7
 delta_types = (OFS_DELTA, REF_DELTA)
 
-type_id_to_type_map = {
-        0 : "",                 # EXT 1
-        1 : "commit",
-        2 : "tree",
-        3 : "blob",
-        4 : "tag",
-        5 : "",                 # EXT 2
-        OFS_DELTA : "OFS_DELTA",    # OFFSET DELTA
-        REF_DELTA : "REF_DELTA"     # REFERENCE DELTA
-    }
+type_id_to_type_map = {
+    0: "",                  # EXT 1
+    1: "commit",
+    2: "tree",
+    3: "blob",
+    4: "tag",
+    5: "",                  # EXT 2
+    OFS_DELTA: "OFS_DELTA",     # OFFSET DELTA
+    REF_DELTA: "REF_DELTA"      # REFERENCE DELTA
+}
 
 type_to_type_id_map = dict(
-        commit=1,
-        tree=2,
-        blob=3,
-        tag=4,
-        OFS_DELTA=OFS_DELTA,
-        REF_DELTA=REF_DELTA
-    )
+    commit=1,
+    tree=2,
+    blob=3,
+    tag=4,
+    OFS_DELTA=OFS_DELTA,
+    REF_DELTA=REF_DELTA
+)
 
 # used when dealing with larger streams
-chunk_size = 1000*mmap.PAGESIZE
+chunk_size = 1000 * mmap.PAGESIZE
 
-__all__ = ('is_loose_object', 'loose_object_header_info', 'msb_size', 'pack_object_header_info',
-        'write_object', 'loose_object_header', 'stream_copy', 'apply_delta_data',
-        'is_equal_canonical_sha', 'connect_deltas', 'DeltaChunkList', 'create_pack_object_header')
+__all__ = ('is_loose_object', 'loose_object_header_info', 'msb_size', 'pack_object_header_info',
+           'write_object', 'loose_object_header', 'stream_copy', 'apply_delta_data',
+           'is_equal_canonical_sha', 'connect_deltas', 'DeltaChunkList', 'create_pack_object_header')
 
 
 #{ Structures
@@ -59,11 +59,12 @@ def _set_delta_rbound(d, size):
     to our size
     :return: d"""
     d.ts = size
-    
+
     # NOTE: data is truncated automatically when applying the delta
     # MUST NOT DO THIS HERE
     return d
-    
+
+
 def _move_delta_lbound(d, bytes):
     """Move the delta by the given amount of bytes, reducing its size so that
     its right bound stays static
@@ -71,19 +72,21 @@ def _move_delta_lbound(d, bytes):
     :return: d"""
     if bytes == 0:
         return
-    
+
     d.to += bytes
     d.so += bytes
     d.ts -= bytes
     if d.data is not None:
         d.data = d.data[bytes:]
     # END handle data
-    
+
     return d
-    
+
+
 def delta_duplicate(src):
     return DeltaChunk(src.to, src.ts, src.so, src.data)
-    
+
+
 def delta_chunk_apply(dc, bbuf, write):
     """Apply own data to the target buffer
     :param bbuf: buffer providing source bytes for copy operations
@@ -104,16 +107,17 @@
 class DeltaChunk(object):
+
     """Represents a piece of a delta, it can either add new data, or copy existing
     one from a source buffer"""
     __slots__ = (
-        'to',       # start offset in the target buffer in bytes
+        'to',           # start offset in the target buffer in bytes
         'ts',           # size of this chunk in the target buffer in bytes
         'so',           # start offset in the source buffer in bytes or None
         'data',         # chunk of bytes to be added to the target buffer,
                         # DeltaChunkList to use as base, or None
-        )
-    
+    )
+
     def __init__(self, to, ts, so, data):
         self.to = to
         self.ts = ts
@@ -122,18 +126,19 @@ class DeltaChunk(object):
     def __repr__(self):
         return "DeltaChunk(%i, %i, %s, %s)" % (self.to, self.ts, self.so, self.data or "")
-    
+
     #{ Interface
-    
+
     def rbound(self):
         return self.to + self.ts
-    
+
     def has_data(self):
         """:return: True if the instance has data to add to the target stream"""
         return self.data is not None
-    
+
     #} END interface
 
+
 def _closest_index(dcl, absofs):
     """:return: index at which the given absofs should be inserted. The index points to the
     DeltaChunk with a target buffer absofs that equals or is greater than
@@ -152,8 +157,9 @@ def _closest_index(dcl, absofs):
             lo = mid + 1
         # END handle bound
     # END for each delta absofs
-    return len(dcl)-1
-    
+    return len(dcl) - 1
+
+
 def delta_list_apply(dcl, bbuf, write):
     """Apply the chain's changes and write the final result using the passed
     write function.
@@ -165,6 +171,7 @@ def delta_list_apply(dcl, bbuf, write):
         delta_chunk_apply(dc, bbuf, write)
     # END for each dc
 
+
 def delta_list_slice(dcl, absofs, size, ndcl):
     """:return: Subsection of this list at the given absolute offset, with the given
     size in bytes.
@@ -172,8 +179,8 @@ def delta_list_slice(dcl, absofs, size, ndcl):
     cdi = _closest_index(dcl, absofs)   # delta start index
     cd = dcl[cdi]
     slen = len(dcl)
-    lappend = ndcl.append
-    
+    lappend = ndcl.append
+
     if cd.to != absofs:
         tcd = DeltaChunk(cd.to, cd.ts, cd.so, cd.data)
         _move_delta_lbound(tcd, absofs - cd.to)
@@ -182,7 +189,7 @@ def delta_list_slice(dcl, absofs, size, ndcl):
         size -= tcd.ts
         cdi += 1
     # END lbound overlap handling
-    
+
     while cdi < slen and size:
         # are we larger than the current block
        cd = dcl[cdi]
@@ -198,38 +205,39 @@
             # END hadle size
             cdi += 1
         # END for each chunk
-        
-        
+
+
 class DeltaChunkList(list):
+
     """List with special functionality to deal with DeltaChunks.
     There are two types of lists we represent. The one was created bottom-up, working
     towards the latest delta, the other kind was created top-down, working from the
     latest delta down to the earliest ancestor. This attribute is queryable
     after all processing with is_reversed."""
-    
+
     __slots__ = tuple()
-    
+
     def rbound(self):
         """:return: rightmost extend in bytes, absolute"""
         if len(self) == 0:
             return 0
         return self[-1].rbound()
-    
+
     def lbound(self):
         """:return: leftmost byte at which this chunklist starts"""
         if len(self) == 0:
             return 0
         return self[0].to
-    
+
     def size(self):
         """:return: size of bytes as measured by our delta chunks"""
         return self.rbound() - self.lbound()
-    
+
     def apply(self, bbuf, write):
         """Only used by public clients, internally we only use the global routines
         for performance"""
         return delta_list_apply(self, bbuf, write)
-    
+
     def compress(self):
         """Alter the list to reduce the amount of nodes. Currently we concatenate add-chunks
@@ -239,41 +247,41 @@ class DeltaChunkList(list):
             return self
         i = 0
         slen_orig = slen
-        
+
         first_data_index = None
         while i < slen:
             dc = self[i]
            i += 1
            if dc.data is None:
-                if first_data_index is not None and i-2-first_data_index > 1:
-                    #if first_data_index is not None:
+                if first_data_index is not None and i - 2 - first_data_index > 1:
+                    # if first_data_index is not None:
                     nd = StringIO()                 # new data
                     so = self[first_data_index].to  # start offset in target buffer
-                    for x in xrange(first_data_index, i-1):
+                    for x in xrange(first_data_index, i - 1):
                         xdc = self[x]
                         nd.write(xdc.data[:xdc.ts])
                     # END collect data
-                    
-                    del(self[first_data_index:i-1])
+
+                    del(self[first_data_index:i - 1])
                     buf = nd.getvalue()
-                    self.insert(first_data_index, DeltaChunk(so, len(buf), 0, buf))
-                    
+                    self.insert(first_data_index, DeltaChunk(so, len(buf), 0, buf))
+
                     slen = len(self)
                     i = first_data_index + 1
-                    
+
                 # END concatenate data
                 first_data_index = None
                 continue
            # END skip non-data chunks
-            
+
            if first_data_index is None:
-                first_data_index = i-1
+                first_data_index = i - 1
         # END iterate list
-        
-        #if slen_orig != len(self):
+
+        # if slen_orig != len(self):
        #   print "INFO: Reduced delta list len to %f %% of former size" % ((float(len(self)) / slen_orig) * 100)
         return self
-    
+
     def check_integrity(self, target_size=-1):
         """Verify the list has non-overlapping chunks only, and the total size matches
         target_size
@@ -281,35 +289,36 @@ class DeltaChunkList(list):
         :raise AssertionError: if the size doen't match"""
         if target_size > -1:
             assert self[-1].rbound() == target_size
-            assert reduce(lambda x,y: x+y, (d.ts for d in self), 0) == target_size
+            assert reduce(lambda x, y: x + y, (d.ts for d in self), 0) == target_size
         # END target size verification
-        
+
         if len(self) < 2:
             return
-        
+
         # check data
         for dc in self:
             assert dc.ts > 0
             if dc.has_data():
                 assert len(dc.data) >= dc.ts
        # END for each dc
-        
-        left = islice(self, 0, len(self)-1)
+
+        left = islice(self, 0, len(self) - 1)
         right = iter(self)
         right.next()
-        # this is very pythonic - we might have just use index based access here, 
+        # this is very pythonic - we might have just use index based access here,
         # but this could actually be faster
-        for lft,rgt in izip(left, right):
+        for lft, rgt in izip(left, right):
             assert lft.rbound() == rgt.to
             assert lft.to + lft.ts == rgt.to
         # END for each pair
-        
+
 
 class TopdownDeltaChunkList(DeltaChunkList):
+
     """Represents a list which is generated by feeding its ancestor streams
     one by one"""
-    __slots__ = tuple()
-    
+    __slots__ = tuple()
+
     def connect_with_next_base(self, bdcl):
         """Connect this chain with the next level of our base delta chunklist.
         The goal in this game is to mark as many of our chunks rigid, hence they
@@ -326,13 +335,13 @@ class TopdownDeltaChunkList(DeltaChunkList):
         while dci < slen:
             dc = self[dci]
             dci += 1
-            
+
             # all add-chunks which are already topmost don't need additional processing
            if dc.data is not None:
                 nfc += 1
                 continue
             # END skip add chunks
-            
+
             # copy chunks
             # integrate the portion of the base list into ourselves. Lists
             # dont support efficient insertion ( just one at a time ), but for now
@@ -341,37 +350,37 @@ class TopdownDeltaChunkList(DeltaChunkList):
             # we live with it. Internally, its all just a 32/64bit pointer, and
             # the portions of moved memory should be smallish. Maybe we just rebuild
             # ourselves in order to reduce the amount of insertions ...
             del(ccl[:])
             delta_list_slice(bdcl, dc.so, dc.ts, ccl)
-            
+
             # move the target bounds into place to match with our chunk
             ofs = dc.to - dc.so
             for cdc in ccl:
                 cdc.to += ofs
             # END update target bounds
-            
+
             if len(ccl) == 1:
-                self[dci-1] = ccl[0]
+                self[dci - 1] = ccl[0]
             else:
                 # maybe try to compute the expenses here, and pick the right algorithm
                 # It would normally be faster than copying everything physically though
                 # TODO: Use a deque here, and decide by the index whether to extend
                 # or extend left !
                 post_dci = self[dci:]
-                del(self[dci-1:])       # include deletion of dc
+                del(self[dci - 1:])     # include deletion of dc
                 self.extend(ccl)
                 self.extend(post_dci)
-                
+
                 slen = len(self)
-                dci += len(ccl)-1       # deleted dc, added rest
-                
+                dci += len(ccl) - 1     # deleted dc, added rest
+
             # END handle chunk replacement
         # END for each chunk
-        
+
         if nfc == slen:
             return False
         # END handle completeness
         return True
-        
-        
+
+
 #} END structures
 
 #{ Routines
@@ -384,6 +393,7 @@ def is_loose_object(m):
     word = (b0 << 8) + b1
     return b0 == 0x78 and (word % 31) == 0
 
+
 def loose_object_header_info(m):
     """
     :return: tuple(type_string, uncompressed_size_in_bytes) the type string of the
@@ -393,7 +403,8 @@ def loose_object_header_info(m):
     hdr = decompressobj().decompress(m, decompress_size)
     type_name, size = hdr[:hdr.find("\0")].split(" ")
     return type_name, int(size)
-    
+
+
 def pack_object_header_info(data):
     """
     :return: tuple(type_id, uncompressed_size_in_bytes, byte_offset)
@@ -413,13 +424,14 @@ def pack_object_header_info(data):
     # END character loop
     return (type_id, size, i)
 
+
 def create_pack_object_header(obj_type, obj_size):
     """:return: string defining the pack header comprised of the object type
     and its incompressed size in bytes
     :parmam obj_type: pack type_id of the object
     :param obj_size: uncompressed size in bytes of the following object stream"""
     c = 0               # 1 byte
-    hdr = str()         # output string
+    hdr = str()         # output string
 
     c = (obj_type << 4) | (obj_size & 0xf)
     obj_size >>= 4
@@ -427,10 +439,11 @@ def create_pack_object_header(obj_type, obj_size):
         hdr += chr(c | 0x80)
         c = obj_size & 0x7f
         obj_size >>= 7
-    #END until size is consumed
+    # END until size is consumed
     hdr += chr(c)
     return hdr
-    
+
+
 def msb_size(data, offset=0):
     """
     :return: tuple(read_bytes, size) read the msb size from the given random
@@ -440,8 +453,8 @@ def msb_size(data, offset=0):
     l = len(data)
     hit_msb = False
     while i < l:
-        c = ord(data[i+offset])
-        size |= (c & 0x7f) << i*7
+        c = ord(data[i + offset])
+        size |= (c & 0x7f) << i * 7
         i += 1
         if not c & 0x80:
             hit_msb = True
@@ -450,19 +463,21 @@ def msb_size(data, offset=0):
     # END while in range
     if not hit_msb:
         raise AssertionError("Could not find terminating MSB byte in data stream")
-    return i+offset, size
-    
+    return i + offset, size
+
+
 def loose_object_header(type, size):
     """
     :return: string representing the loose object header, which is immediately followed
         by the content stream of size 'size'"""
     return "%s %i\0" % (type, size)
-    
+
+
 def write_object(type, size, read, write, chunk_size=chunk_size):
     """
     Write the object as identified by type, size and source_stream into the
     target_stream
-    
+
     :param type: type string of the object
     :param size: amount of bytes to write from source_stream
     :param read: read method of a stream providing the content data
@@ -471,26 +486,27 @@ def write_object(type, size, read, write, chunk_size=chunk_size):
         the routine exits, even if an error is thrown
     :return: The actual amount of bytes written to stream, which includes the header and a trailing newline"""
     tbw = 0                                             # total num bytes written
-    
+
     # WRITE HEADER: type SP size NULL
     tbw += write(loose_object_header(type, size))
     tbw += stream_copy(read, write, size, chunk_size)
-    
+
     return tbw
 
+
 def stream_copy(read, write, size, chunk_size):
     """
     Copy a stream up to size bytes using the provided read and write methods, in chunks of chunk_size
-    
+
     :note: its much like stream_copy utility, but operates just using methods"""
     dbw = 0                                             # num data bytes written
-    
+
     # WRITE ALL DATA UP TO SIZE
     while True:
-        cs = min(chunk_size, size-dbw)
+        cs = min(chunk_size, size - dbw)
         # NOTE: not all write methods return the amount of written bytes, like
-        # mmap.write. Its bad, but we just deal with it ... perhaps its not 
+        # mmap.write. Its bad, but we just deal with it ... perhaps its not
         # even less efficient
         # data_len = write(read(cs))
         # dbw += data_len
@@ -503,27 +519,28 @@
         # END check for stream end
     # END duplicate data
     return dbw
-    
+
+
 def connect_deltas(dstreams):
     """
     Read the condensed delta chunk information from dstream and merge its information
     into a list of existing delta chunks
-    
+
     :param dstreams: iterable of delta stream objects, the delta to be applied last
        comes first, then all its ancestors in order
     :return: DeltaChunkList, containing all operations to apply"""
     tdcl = None                         # topmost dcl
-    
+
     dcl = tdcl = TopdownDeltaChunkList()
     for dsi, ds in enumerate(dstreams):
         # print "Stream", dsi
         db = ds.read()
         delta_buf_size = ds.size
-        
+
         # read header
         i, base_size = msb_size(db)
         i, target_size = msb_size(db, i)
-        
+
         # interpret opcodes
         tbw = 0                         # amount of target bytes written
         while i < delta_buf_size:
@@ -552,46 +569,47 @@
                 if (c & 0x40):
                     cp_size |= (ord(db[i]) << 16)
                     i += 1
-                
-                if not cp_size: 
+
+                if not cp_size:
                     cp_size = 0x10000
-                
+
                 rbound = cp_off + cp_size
                 if (rbound < cp_size or
-                    rbound > base_size):
+                        rbound > base_size):
                     break
-                
+
                 dcl.append(DeltaChunk(tbw, cp_size, cp_off, None))
                 tbw += cp_size
             elif c:
                 # NOTE: in C, the data chunks should probably be concatenated here.
                 # In python, we do it as a post-process
-                dcl.append(DeltaChunk(tbw, c, 0, db[i:i+c]))
+                dcl.append(DeltaChunk(tbw, c, 0, db[i:i + c]))
                 i += c
                 tbw += c
             else:
                 raise ValueError("unexpected delta opcode 0")
             # END handle command byte
         # END while processing delta data
-        
+
         dcl.compress()
-        
+
         # merge the lists !
         if dsi > 0:
             if not tdcl.connect_with_next_base(dcl):
                 break
        # END handle merge
-        
+
         # prepare next base
         dcl = DeltaChunkList()
     # END for each delta stream
-    
+
     return tdcl
-    
+
+
 def apply_delta_data(src_buf, src_buf_size, delta_buf, delta_buf_size, write):
     """
     Apply data from a delta buffer using a source buffer to the target file
-    
+
     :param src_buf: random access data from which the delta was created
     :param src_buf_size: size of the source buffer in bytes
     :param delta_buf_size: size fo the delta buffer in bytes
@@ -626,27 +644,27 @@
             if (c & 0x40):
                 cp_size |= (ord(db[i]) << 16)
                 i += 1
-            
-            if not cp_size: 
+
+            if not cp_size:
                 cp_size = 0x10000
-            
+
             rbound = cp_off + cp_size
             if (rbound < cp_size or
-                rbound > src_buf_size):
+                    rbound > src_buf_size):
                 break
             write(buffer(src_buf, cp_off, cp_size))
         elif c:
-            write(db[i:i+c])
+            write(db[i:i + c])
             i += c
         else:
             raise ValueError("unexpected delta opcode 0")
         # END handle command byte
    # END while processing delta data
-    
+
     # yes, lets use the exact same error message that git uses :)
     assert i == delta_buf_size, "delta replay has gone wild"
-    
-    
+
+
 def is_equal_canonical_sha(canonical_length, match, sha1):
     """
     :return: True if the given lhs and rhs 20 byte binary shas
@@ -654,16 +672,16 @@ def is_equal_canonical_sha(canonical_length, match, sha1):
        hence the comparison will only use the last 4 bytes for uneven canonical representations
     :param match: less than 20 byte sha
     :param sha1: 20 byte sha"""
-    binary_length = canonical_length/2
+    binary_length = canonical_length / 2
    if match[:binary_length] != sha1[:binary_length]:
         return False
-    
+
     if canonical_length - binary_length and \
-        (ord(match[-1]) ^ ord(sha1[len(match)-1])) & 0xf0:
+            (ord(match[-1]) ^ ord(sha1[len(match) - 1])) & 0xf0:
         return False
     # END handle uneven canonnical length
     return True
-    
+
 #} END routines
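
Editorial note on the loose-object routines touched above: loose_object_header(), write_object() and is_loose_object() amount to a small piece of byte twiddling around zlib. The sketch below is a minimal illustration of the same idea, written for Python 3 rather than the Python 2 code in the diff; the helper names are hypothetical and not part of git/fun.py.

import zlib

def make_loose_object(obj_type, body):
    # Header is "<type> <size>\0" followed by the raw body, all zlib-deflated,
    # mirroring loose_object_header() / write_object() above.
    header = ("%s %i\0" % (obj_type, len(body))).encode("ascii")
    return zlib.compress(header + body)

def looks_like_loose_object(data):
    # Same check as is_loose_object(): a zlib stream starts with 0x78 and its
    # first two bytes, read as a big-endian word, are divisible by 31.
    word = (data[0] << 8) + data[1]
    return data[0] == 0x78 and word % 31 == 0

raw = make_loose_object("blob", b"hello world\n")
assert looks_like_loose_object(raw)
header, _, _ = zlib.decompress(raw).partition(b"\0")
assert header == b"blob 12"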
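create_pack_object_header() and pack_object_header_info() in the diff are two halves of one variable-length encoding: the first byte carries the type id in bits 4-6 and the low four bits of the size, and each following byte contributes seven more size bits while its high bit signals continuation. A rough Python 3 round-trip of that scheme, using hypothetical names, might look like this:

def encode_pack_header(type_id, size):
    # First byte: type in bits 4-6, low 4 size bits; continuation bit added below.
    c = (type_id << 4) | (size & 0x0F)
    size >>= 4
    out = bytearray()
    while size:
        out.append(c | 0x80)    # high bit set: more size bytes follow
        c = size & 0x7F
        size >>= 7
    out.append(c)
    return bytes(out)

def decode_pack_header(data):
    # Mirrors pack_object_header_info(): undo the encoding above.
    c = data[0]
    type_id = (c >> 4) & 7
    size = c & 0x0F
    i = 1
    shift = 4
    while c & 0x80:
        c = data[i]
        size += (c & 0x7F) << shift
        shift += 7
        i += 1
    return type_id, size, i

for probe in (0, 15, 16, 300, 123456789):
    hdr = encode_pack_header(3, probe)      # 3 == "blob" in type_id_to_type_map
    assert decode_pack_header(hdr) == (3, probe, len(hdr))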
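connect_deltas() and apply_delta_data() both walk the same opcode stream: a command byte with the high bit set is a copy-from-source instruction whose low bits say which offset and size bytes follow, while a command byte in the range 1-127 means "insert the next N literal bytes". The hand-assembled delta below exercises both paths; it is a simplified Python 3 sketch (no base/target size header) and not the module's own API.

def apply_tiny_delta(source, delta):
    # Interpret git-style delta opcodes against `source`; return the target bytes.
    out = bytearray()
    i = 0
    while i < len(delta):
        c = delta[i]
        i += 1
        if c & 0x80:                        # copy opcode: gather offset/size bytes
            cp_off = cp_size = 0
            for bit, shift in ((0x01, 0), (0x02, 8), (0x04, 16), (0x08, 24)):
                if c & bit:
                    cp_off |= delta[i] << shift
                    i += 1
            for bit, shift in ((0x10, 0), (0x20, 8), (0x40, 16)):
                if c & bit:
                    cp_size |= delta[i] << shift
                    i += 1
            if not cp_size:
                cp_size = 0x10000           # zero encodes the maximum copy size
            out += source[cp_off:cp_off + cp_size]
        elif c:                             # insert opcode: next c bytes are literal
            out += delta[i:i + c]
            i += c
        else:
            raise ValueError("unexpected delta opcode 0")
    return bytes(out)

source = b"the quick brown fox"
# copy 11 bytes from offset 4, insert b" red ", then copy 3 bytes from offset 16
delta = bytes([0x91, 4, 11]) + bytes([5]) + b" red " + bytes([0x91, 16, 3])
assert apply_tiny_delta(source, delta) == b"quick brown red fox"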