diff options
author | Lorry Tar Creator <lorry-tar-importer@lorry> | 2017-08-05 16:22:51 +0000 |
---|---|---|
committer | Lorry Tar Creator <lorry-tar-importer@lorry> | 2017-08-05 16:22:51 +0000 |
commit | cf46733632c7279a9fd0fe6ce26f9185a4ae82a9 (patch) | |
tree | da27775a2161723ef342e91af41a8b51fedef405 /tools/dev/benchmarks/RepoPerf/copy_repo.py | |
parent | bb0ef45f7c46b0ae221b26265ef98a768c33f820 (diff) | |
download | subversion-tarball-master.tar.gz |
subversion-1.9.7HEADsubversion-1.9.7master
Diffstat (limited to 'tools/dev/benchmarks/RepoPerf/copy_repo.py')
-rw-r--r-- | tools/dev/benchmarks/RepoPerf/copy_repo.py | 313 |
1 files changed, 313 insertions, 0 deletions
# diff --git a/tools/dev/benchmarks/RepoPerf/copy_repo.py b/tools/dev/benchmarks/RepoPerf/copy_repo.py
# new file mode 100644
# index 0000000..a95a82d
# --- /dev/null
# +++ b/tools/dev/benchmarks/RepoPerf/copy_repo.py
# @@ -0,0 +1,313 @@
#!/usr/bin/env python
#
#  copy_repo.py: create multiple, interleaved copies of a set of repositories.
#
#  Subversion is a tool for revision control.
#  See http://subversion.apache.org for more information.
#
# ====================================================================
#    Licensed to the Apache Software Foundation (ASF) under one
#    or more contributor license agreements.  See the NOTICE file
#    distributed with this work for additional information
#    regarding copyright ownership.  The ASF licenses this file
#    to you under the Apache License, Version 2.0 (the
#    "License"); you may not use this file except in compliance
#    with the License.  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#    Unless required by applicable law or agreed to in writing,
#    software distributed under the License is distributed on an
#    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
#    KIND, either express or implied.  See the License for the
#    specific language governing permissions and limitations
#    under the License.
######################################################################

# General modules
import os
import random
import shutil
import sys


class Separators:
  """ This class is a container for dummy / filler files.
      It will be used to create spaces between repository
      versions on disk, i.e. to simulate some aspect of
      real-world FS fragmentation.

      It gets initialized with some parent path as well as
      the desired average file size and will create a new
      such file with each call to write().  Automatic
      sharding keeps FS specific overhead at bay.  Call
      cleanup() to eventually delete all dummy files. """

  # Non-NULL contents to write into the dummy files.  This is a bytes
  # literal because the files are opened in binary mode.
  buffer = b"A" * 4096

  def __init__(self, path, average_size):
    """ Initialize and store all dummy files in a '__tmp'
        sub-folder of PATH.  The size of each dummy file
        is a random value and will be slightly above
        AVERAGE_SIZE kBytes on average.  A value of 0 will
        effectively disable dummy file creation. """

    self.path = os.path.join(path, '__tmp')
    self.size = average_size
    self.count = 0

    # Start from a clean slate if a previous run left files behind.
    if os.path.exists(self.path):
      shutil.rmtree(self.path)

    # makedirs (instead of mkdir) so that a not-yet-existing PATH
    # does not make the constructor fail.
    os.makedirs(self.path)

  def write(self):
    """ Add a new dummy file. """

    # Roll the dice for a file size.
    # Factor 1024 for kBytes, factor 2 for being an average.
    size = int(float(self.size) * random.random() * 2 * 1024.0)

    # Don't create empty files.  This also implements the
    # "average = 0 means no files" rule.
    if size <= 0:
      return

    self.count += 1

    # Create a new shard for every 1000 files.  Floor division keeps
    # the shard name an integer on every Python version.
    subfolder = os.path.join(self.path, str(self.count // 1000))
    if not os.path.exists(subfolder):
      os.mkdir(subfolder)

    # Create and write the file in 4k chunks.
    # Writing full chunks will result in average file sizes
    # being slightly above SELF.SIZE.  That's good enough
    # for our purposes.
    with open(os.path.join(subfolder, str(self.count)), "wb") as f:
      while size > 0:
        f.write(self.buffer)
        size -= len(self.buffer)

  def cleanup(self):
    """ Get rid of all the files (and folders) that we created. """

    shutil.rmtree(self.path)


class Repository:
  """ Encapsulates key information of a repository.  It is being
      used for copy sources only and contains information about
      its NAME, PATH, SHARD_SIZE, HEAD revision and MIN_UNPACKED_REV. """

  def _read_config(self, filename):
    """ Read and return all lines from FILENAME below the 'db' folder.
        This will be used to read 'format', 'current' etc. """

    # Text mode, so that the lines are strings on Python 2 and 3 alike.
    with open(os.path.join(self.path, 'db', filename), "r") as f:
      return f.readlines()

  def __init__(self, parent, name):
    """ Constructor collecting everything we need to know about
        the repository NAME within PARENT folder. """

    self.name = name
    self.path = os.path.join(parent, name)

    # The second line of 'format' is read as three fields, the third
    # being the shard size.
    self.shard_size = int(self._read_config('format')[1].split()[2])
    self.min_unpacked_rev = int(self._read_config('min-unpacked-rev')[0])

    # Only the first field of 'current' is used; any further fields on
    # that line are ignored.
    self.head = int(self._read_config('current')[0].split()[0])

  def needs_copy(self, revision):
    """ Return True if REVISION is a revision in this repository
        and is "directly copyable", i.e. is either non-packed or
        the first rev in a packed shard.  Everything else is either
        not a valid rev or already gets / got copied as part of
        some packed shard. """

    if revision > self.head:
      return False
    if revision < self.min_unpacked_rev:
      return revision % self.shard_size == 0

    return True

  @classmethod
  def is_repository(cls, path):
    """ Quick check that PATH is (probably) a repository.
        This is mainly to filter out aux files put next to
        (not inside) the repositories to copy. """

    format_path = os.path.join(path, 'db', 'format')
    return os.path.isfile(format_path)


class Multicopy:
  """ Helper class doing the actual copying.  It copies individual
      revisions and packed shards from the one source repository
      to multiple copies of it.  The copies have the same name
      as the source repo but with numbers 0 .. N-1 appended to it.

      The copy process is being initiated by the constructor
      (copies the repo skeleton w/o revision contents).  Revision
      contents is then copied by successive calls to the copy()
      method. """

  def __init__(self, source, target_parent, count):
    """ Initiate the copy process for the SOURCE repository to
        be copied COUNT times into the TARGET_PARENT directory. """

    self.source_repo = source
    self.dest_base = os.path.join(target_parent, source.name)

    self.src_revs = os.path.join(source.path, 'db', 'revs')
    self.src_revprops = os.path.join(source.path, 'db', 'revprops')

    self.dst_revs = []
    self.dst_revprops = []
    for i in range(count):
      self._init_copy(i)

  def _init_copy(self, number):
    """ Called from the constructor, this will copy SELF.SOURCE_REPO
        into the copy NUMBER below SELF.DEST_BASE but omit everything
        below db/revs and db/revprops.  Must be called with NUMBER
        counting up from 0 because the destination paths get appended
        to SELF.DST_REVS / SELF.DST_REVPROPS. """

    src = self.source_repo.path
    dst = self.dest_base + str(number)

    # Copy the repo skeleton w/o revs and revprops
    shutil.copytree(src, dst,
                    ignore=shutil.ignore_patterns('revs', 'revprops'))

    # Add (empty) revs and revprops folders
    self.dst_revs.append(os.path.join(dst, 'db', 'revs'))
    self.dst_revprops.append(os.path.join(dst, 'db', 'revprops'))

    os.mkdir(self.dst_revs[number])
    os.mkdir(self.dst_revprops[number])

  def _copy_packed_shard(self, shard, number):
    """ Copy packed shard number SHARD from SELF.SOURCE_REPO to
        the copy NUMBER below SELF.DEST_BASE. """

    # Shards are simple subtrees
    pack_name = str(shard) + '.pack'
    shutil.copytree(os.path.join(self.src_revs, pack_name),
                    os.path.join(self.dst_revs[number], pack_name))
    shutil.copytree(os.path.join(self.src_revprops, pack_name),
                    os.path.join(self.dst_revprops[number], pack_name))

    # Special case: revprops of rev 0 are never packed => extra copy
    if shard == 0:
      shutil.copytree(os.path.join(self.src_revprops, '0'),
                      os.path.join(self.dst_revprops[number], '0'))

  def _copy_single_revision(self, revision, number):
    """ Copy non-packed REVISION from SELF.SOURCE_REPO to the copy
        NUMBER below SELF.DEST_BASE. """

    # Floor division: the shard is a folder name and must be integral.
    shard = str(revision // self.source_repo.shard_size)

    # Auto-create the shard folders.  Checking for existence (rather
    # than for "first revision of the shard") also covers a first
    # unpacked revision that does not start a shard.
    for parent in (self.dst_revs[number], self.dst_revprops[number]):
      shard_folder = os.path.join(parent, shard)
      if not os.path.isdir(shard_folder):
        os.mkdir(shard_folder)

    # Copy the rev file and the revprop file
    rev_name = str(revision)
    shutil.copyfile(os.path.join(self.src_revs, shard, rev_name),
                    os.path.join(self.dst_revs[number], shard, rev_name))
    shutil.copyfile(os.path.join(self.src_revprops, shard, rev_name),
                    os.path.join(self.dst_revprops[number], shard, rev_name))

  def copy(self, revision, number):
    """ Copy (packed or non-packed) REVISION from SELF.SOURCE_REPO
        to the copy NUMBER below SELF.DEST_BASE.

        SELF.SOURCE_REPO.needs_copy(REVISION) must be True. """

    if revision < self.source_repo.min_unpacked_rev:
      self._copy_packed_shard(revision // self.source_repo.shard_size,
                              number)
    else:
      self._copy_single_revision(revision, number)


def copy_repos(src, dst, count, separator_size):
  """ Under DST, create COUNT copies of all repositories immediately
      below SRC.

      All copies will be "interleaved" such that we copy each individual
      revision / packed shard to all target repos first before
      continuing with the next revision / packed shard.  After each
      round (revision / packed shard) insert a temporary file of
      SEPARATOR_SIZE kBytes on average to add more spacing between
      revisions.  The temp files get automatically removed at the end.

      Please note that this function will clear DST before copying
      anything into it. """

  # Remove any remnants from the target folder and start with an empty
  # one.  DST may legitimately not exist yet.
  if os.path.exists(dst):
    shutil.rmtree(dst)
  os.makedirs(dst)

  # Repositories to copy and the respective copy utilities
  repositories = []
  copies = []

  # Find repositories, initiate copies and determine the range of
  # revisions to copy in total
  max_revision = 0
  for name in os.listdir(src):
    if Repository.is_repository(os.path.join(src, name)):
      repository = Repository(src, name)
      repositories.append(repository)
      copies.append(Multicopy(repository, dst, count))

      max_revision = max(max_revision, repository.head)

  # Temp file collection (spacers)
  separators = Separators(dst, separator_size)

  # Copy all repos in revision,number-major order
  for revision in range(max_revision + 1):
    for number in range(count):

      any_copy = False
      for repository, multicopy in zip(repositories, copies):
        if repository.needs_copy(revision):
          any_copy = True
          multicopy.copy(revision, number)

      # Don't add spacers when nothing got copied (REVISION is
      # packed in all repositories).
      if any_copy:
        separators.write()

  # Now that all data is in position, remove the spacers
  separators.cleanup()


def show_usage():
  """ Write a simple command line usage text to stdout. """

  # A single print() call with one string argument behaves the same
  # on Python 2 and Python 3.
  print("\n".join([
    "Copies and duplicates repositories in a way that mimics larger deployments.",
    "",
    "Usage:",
    "copy_repo.py SRC DST COUNT SEPARATOR_SIZE",
    "",
    "SRC Immediate parent folder of all the repositories to copy.",
    "DST Folder to copy into; current contents will be lost.",
    "COUNT Number of copies to create of each source repository.",
    "SEPARATOR_SIZE Additional spacing, in kBytes, between revisions.",
  ]))


def main(argv=None):
  """ Script entry point.  ARGV defaults to sys.argv and is expected
      to be [PROGRAM, SRC, DST, COUNT, SEPARATOR_SIZE]; any other
      number of arguments prints the usage text instead. """

  if argv is None:
    argv = sys.argv

  if len(argv) == 5:
    copy_repos(argv[1], argv[2], int(argv[3]), int(argv[4]))
  else:
    show_usage()


if __name__ == "__main__":
  main()