diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/checkout.c | 2 | ||||
-rw-r--r-- | src/diff.c | 8 | ||||
-rw-r--r-- | src/diff.h | 10 | ||||
-rw-r--r-- | src/diff_output.c | 203 | ||||
-rw-r--r-- | src/diff_tform.c | 324 | ||||
-rw-r--r-- | src/hashsig.c | 365 | ||||
-rw-r--r-- | src/hashsig.h | 72 |
7 files changed, 817 insertions, 167 deletions
diff --git a/src/checkout.c b/src/checkout.c index eda3e0b4b..040ead2f6 100644 --- a/src/checkout.c +++ b/src/checkout.c @@ -78,7 +78,7 @@ static int checkout_notify( git_oid_cpy(&wdfile.oid, &wditem->oid); wdfile.path = wditem->path; wdfile.size = wditem->file_size; - wdfile.flags = GIT_DIFF_FILE_VALID_OID; + wdfile.flags = GIT_DIFF_FLAG_VALID_OID; wdfile.mode = wditem->mode; workdir = &wdfile; diff --git a/src/diff.c b/src/diff.c index d9bc32a37..0861b13eb 100644 --- a/src/diff.c +++ b/src/diff.c @@ -92,11 +92,11 @@ static int diff_delta__from_one( git_oid_cpy(&delta->new_file.oid, &entry->oid); } - delta->old_file.flags |= GIT_DIFF_FILE_VALID_OID; + delta->old_file.flags |= GIT_DIFF_FLAG_VALID_OID; if (delta->status == GIT_DELTA_DELETED || !git_oid_iszero(&delta->new_file.oid)) - delta->new_file.flags |= GIT_DIFF_FILE_VALID_OID; + delta->new_file.flags |= GIT_DIFF_FLAG_VALID_OID; notify_res = diff_notify(diff, delta, matched_pathspec); @@ -142,7 +142,7 @@ static int diff_delta__from_two( git_oid_cpy(&delta->old_file.oid, &old_entry->oid); delta->old_file.size = old_entry->file_size; delta->old_file.mode = old_mode; - delta->old_file.flags |= GIT_DIFF_FILE_VALID_OID; + delta->old_file.flags |= GIT_DIFF_FLAG_VALID_OID; git_oid_cpy(&delta->new_file.oid, &new_entry->oid); delta->new_file.size = new_entry->file_size; @@ -156,7 +156,7 @@ static int diff_delta__from_two( } if (new_oid || !git_oid_iszero(&new_entry->oid)) - delta->new_file.flags |= GIT_DIFF_FILE_VALID_OID; + delta->new_file.flags |= GIT_DIFF_FLAG_VALID_OID; notify_res = diff_notify(diff, delta, matched_pathspec); diff --git a/src/diff.h b/src/diff.h index 16fbf71e6..8e3cbcd46 100644 --- a/src/diff.h +++ b/src/diff.h @@ -28,8 +28,14 @@ enum { GIT_DIFFCAPS_USE_DEV = (1 << 4), /* use st_dev? 
*/ }; -#define GIT_DELTA__TO_DELETE 10 -#define GIT_DELTA__TO_SPLIT 11 +enum { + GIT_DIFF_FLAG__FREE_PATH = (1 << 7), /* `path` is allocated memory */ + GIT_DIFF_FLAG__FREE_DATA = (1 << 8), /* internal file data is allocated */ + GIT_DIFF_FLAG__UNMAP_DATA = (1 << 9), /* internal file data is mmap'ed */ + GIT_DIFF_FLAG__NO_DATA = (1 << 10), /* file data should not be loaded */ + GIT_DIFF_FLAG__TO_DELETE = (1 << 11), /* delete entry during rename det. */ + GIT_DIFF_FLAG__TO_SPLIT = (1 << 12), /* split entry during rename det. */ +}; struct git_diff_list { git_refcount rc; diff --git a/src/diff_output.c b/src/diff_output.c index 88ccc9d45..13434beb9 100644 --- a/src/diff_output.c +++ b/src/diff_output.c @@ -52,8 +52,8 @@ static int parse_hunk_header(git_diff_range *range, const char *header) return 0; } -#define KNOWN_BINARY_FLAGS (GIT_DIFF_FILE_BINARY|GIT_DIFF_FILE_NOT_BINARY) -#define NOT_BINARY_FLAGS (GIT_DIFF_FILE_NOT_BINARY|GIT_DIFF_FILE_NO_DATA) +#define KNOWN_BINARY_FLAGS (GIT_DIFF_FLAG_BINARY|GIT_DIFF_FLAG_NOT_BINARY) +#define NOT_BINARY_FLAGS (GIT_DIFF_FLAG_NOT_BINARY|GIT_DIFF_FLAG__NO_DATA) static int update_file_is_binary_by_attr( git_repository *repo, git_diff_file *file) @@ -68,9 +68,9 @@ static int update_file_is_binary_by_attr( return -1; if (GIT_ATTR_FALSE(value)) - file->flags |= GIT_DIFF_FILE_BINARY; + file->flags |= GIT_DIFF_FLAG_BINARY; else if (GIT_ATTR_TRUE(value)) - file->flags |= GIT_DIFF_FILE_NOT_BINARY; + file->flags |= GIT_DIFF_FLAG_NOT_BINARY; /* otherwise leave file->flags alone */ return 0; @@ -78,15 +78,15 @@ static int update_file_is_binary_by_attr( static void update_delta_is_binary(git_diff_delta *delta) { - if ((delta->old_file.flags & GIT_DIFF_FILE_BINARY) != 0 || - (delta->new_file.flags & GIT_DIFF_FILE_BINARY) != 0) - delta->binary = 1; + if ((delta->old_file.flags & GIT_DIFF_FLAG_BINARY) != 0 || + (delta->new_file.flags & GIT_DIFF_FLAG_BINARY) != 0) + delta->flags |= GIT_DIFF_FLAG_BINARY; else if ((delta->old_file.flags & 
NOT_BINARY_FLAGS) != 0 && (delta->new_file.flags & NOT_BINARY_FLAGS) != 0) - delta->binary = 0; + delta->flags |= GIT_DIFF_FLAG_NOT_BINARY; - /* otherwise leave delta->binary value untouched */ + /* otherwise leave delta->flags binary value untouched */ } /* returns if we forced binary setting (and no further checks needed) */ @@ -95,24 +95,24 @@ static bool diff_delta_is_binary_forced( git_diff_delta *delta) { /* return true if binary-ness has already been settled */ - if (delta->binary != -1) + if ((delta->flags & KNOWN_BINARY_FLAGS) != 0) return true; /* make sure files are conceivably mmap-able */ if ((git_off_t)((size_t)delta->old_file.size) != delta->old_file.size || (git_off_t)((size_t)delta->new_file.size) != delta->new_file.size) { - delta->old_file.flags |= GIT_DIFF_FILE_BINARY; - delta->new_file.flags |= GIT_DIFF_FILE_BINARY; - delta->binary = 1; + delta->old_file.flags |= GIT_DIFF_FLAG_BINARY; + delta->new_file.flags |= GIT_DIFF_FLAG_BINARY; + delta->flags |= GIT_DIFF_FLAG_BINARY; return true; } /* check if user is forcing us to text diff these files */ if (ctxt->opts && (ctxt->opts->flags & GIT_DIFF_FORCE_TEXT) != 0) { - delta->old_file.flags |= GIT_DIFF_FILE_NOT_BINARY; - delta->new_file.flags |= GIT_DIFF_FILE_NOT_BINARY; - delta->binary = 0; + delta->old_file.flags |= GIT_DIFF_FLAG_NOT_BINARY; + delta->new_file.flags |= GIT_DIFF_FLAG_NOT_BINARY; + delta->flags |= GIT_DIFF_FLAG_NOT_BINARY; return true; } @@ -125,8 +125,6 @@ static int diff_delta_is_binary_by_attr( int error = 0, mirror_new; git_diff_delta *delta = patch->delta; - delta->binary = -1; - if (diff_delta_is_binary_forced(ctxt, delta)) return 0; @@ -152,23 +150,21 @@ static int diff_delta_is_binary_by_content( git_diff_file *file, const git_map *map) { + const git_buf search = { map->data, 0, min(map->len, 4000) }; + if (diff_delta_is_binary_forced(ctxt, delta)) return 0; - if ((file->flags & KNOWN_BINARY_FLAGS) == 0) { - const git_buf search = { map->data, 0, min(map->len, 4000) }; - - /* 
TODO: provide encoding / binary detection callbacks that can - * be UTF-8 aware, etc. For now, instead of trying to be smart, - * let's just use the simple NUL-byte detection that core git uses. - */ + /* TODO: provide encoding / binary detection callbacks that can + * be UTF-8 aware, etc. For now, instead of trying to be smart, + * let's just use the simple NUL-byte detection that core git uses. + */ - /* previously was: if (git_buf_text_is_binary(&search)) */ - if (git_buf_text_contains_nul(&search)) - file->flags |= GIT_DIFF_FILE_BINARY; - else - file->flags |= GIT_DIFF_FILE_NOT_BINARY; - } + /* previously was: if (git_buf_text_is_binary(&search)) */ + if (git_buf_text_contains_nul(&search)) + file->flags |= GIT_DIFF_FLAG_BINARY; + else + file->flags |= GIT_DIFF_FLAG_NOT_BINARY; update_delta_is_binary(delta); @@ -192,7 +188,7 @@ static int diff_delta_is_binary_by_size( } if (file->size > threshold) - file->flags |= GIT_DIFF_FILE_BINARY; + file->flags |= GIT_DIFF_FLAG_BINARY; update_delta_is_binary(delta); @@ -247,7 +243,7 @@ static int get_blob_content( map->data = git_buf_detach(&content); map->len = strlen(map->data); - file->flags |= GIT_DIFF_FILE_FREE_DATA; + file->flags |= GIT_DIFF_FLAG__FREE_DATA; return 0; } @@ -270,7 +266,7 @@ static int get_blob_content( /* if blob is too large to diff, mark as binary */ if ((error = diff_delta_is_binary_by_size(ctxt, delta, file)) < 0) return error; - if (delta->binary == 1) + if ((delta->flags & GIT_DIFF_FLAG_BINARY) != 0) return 0; if (odb_obj != NULL) { @@ -306,14 +302,14 @@ static int get_workdir_sm_content( return error; /* update OID if we didn't have it previously */ - if ((file->flags & GIT_DIFF_FILE_VALID_OID) == 0) { + if ((file->flags & GIT_DIFF_FLAG_VALID_OID) == 0) { const git_oid* sm_head; if ((sm_head = git_submodule_wd_id(sm)) != NULL || (sm_head = git_submodule_head_id(sm)) != NULL) { git_oid_cpy(&file->oid, sm_head); - file->flags |= GIT_DIFF_FILE_VALID_OID; + file->flags |= GIT_DIFF_FLAG_VALID_OID; } 
} @@ -329,7 +325,7 @@ static int get_workdir_sm_content( map->data = git_buf_detach(&content); map->len = strlen(map->data); - file->flags |= GIT_DIFF_FILE_FREE_DATA; + file->flags |= GIT_DIFF_FLAG__FREE_DATA; return 0; } @@ -356,8 +352,8 @@ static int get_workdir_content( if (S_ISLNK(file->mode)) { ssize_t alloc_len, read_len; - file->flags |= GIT_DIFF_FILE_FREE_DATA; - file->flags |= GIT_DIFF_FILE_BINARY; + file->flags |= GIT_DIFF_FLAG__FREE_DATA; + file->flags |= GIT_DIFF_FLAG_BINARY; /* link path on disk could be UTF-16, so prepare a buffer that is * big enough to handle some UTF-8 data expansion @@ -389,7 +385,7 @@ static int get_workdir_content( file->size = git_futils_filesize(fd); if ((error = diff_delta_is_binary_by_size(ctxt, delta, file)) < 0 || - delta->binary == 1) + (delta->flags & GIT_DIFF_FLAG_BINARY) != 0) goto close_and_cleanup; error = git_filters_load( @@ -402,7 +398,7 @@ static int get_workdir_content( goto close_and_cleanup; error = git_futils_mmap_ro(map, fd, 0, (size_t)file->size); - file->flags |= GIT_DIFF_FILE_UNMAP_DATA; + file->flags |= GIT_DIFF_FLAG__UNMAP_DATA; } else { git_buf raw = GIT_BUF_INIT, filtered = GIT_BUF_INIT; @@ -412,7 +408,7 @@ static int get_workdir_content( map->len = git_buf_len(&filtered); map->data = git_buf_detach(&filtered); - file->flags |= GIT_DIFF_FILE_FREE_DATA; + file->flags |= GIT_DIFF_FLAG__FREE_DATA; } git_buf_free(&raw); @@ -425,11 +421,11 @@ close_and_cleanup: } /* once data is loaded, update OID if we didn't have it previously */ - if (!error && (file->flags & GIT_DIFF_FILE_VALID_OID) == 0) { + if (!error && (file->flags & GIT_DIFF_FLAG_VALID_OID) == 0) { error = git_odb_hash( &file->oid, map->data, map->len, GIT_OBJ_BLOB); if (!error) - file->flags |= GIT_DIFF_FILE_VALID_OID; + file->flags |= GIT_DIFF_FLAG_VALID_OID; } if (!error) @@ -445,22 +441,22 @@ static void release_content(git_diff_file *file, git_map *map, git_blob *blob) if (blob != NULL) git_blob_free(blob); - if (file->flags & 
GIT_DIFF_FILE_FREE_DATA) { + if (file->flags & GIT_DIFF_FLAG__FREE_DATA) { git__free(map->data); map->data = ""; map->len = 0; - file->flags &= ~GIT_DIFF_FILE_FREE_DATA; + file->flags &= ~GIT_DIFF_FLAG__FREE_DATA; } - else if (file->flags & GIT_DIFF_FILE_UNMAP_DATA) { + else if (file->flags & GIT_DIFF_FLAG__UNMAP_DATA) { git_futils_mmap_free(map); map->data = ""; map->len = 0; - file->flags &= ~GIT_DIFF_FILE_UNMAP_DATA; + file->flags &= ~GIT_DIFF_FLAG__UNMAP_DATA; } } -static void diff_context_init( +static int diff_context_init( diff_context *ctxt, git_diff_list *diff, git_repository *repo, @@ -472,6 +468,12 @@ static void diff_context_init( { memset(ctxt, 0, sizeof(diff_context)); + if (!repo && diff) + repo = diff->repo; + + if (!opts && diff) + opts = &diff->opts; + ctxt->repo = repo; ctxt->diff = diff; ctxt->opts = opts; @@ -482,6 +484,8 @@ static void diff_context_init( ctxt->error = 0; setup_xdiff_options(ctxt->opts, &ctxt->xdiff_config, &ctxt->xdiff_params); + + return 0; } static int diff_delta_file_callback( @@ -555,7 +559,7 @@ static int diff_patch_load( patch->new_data.len = 0; patch->new_blob = NULL; - if (delta->binary == 1) + if ((delta->flags & GIT_DIFF_FLAG_BINARY) != 0) goto cleanup; if (!ctxt->hunk_cb && @@ -565,25 +569,25 @@ static int diff_patch_load( switch (delta->status) { case GIT_DELTA_ADDED: - delta->old_file.flags |= GIT_DIFF_FILE_NO_DATA; + delta->old_file.flags |= GIT_DIFF_FLAG__NO_DATA; break; case GIT_DELTA_DELETED: - delta->new_file.flags |= GIT_DIFF_FILE_NO_DATA; + delta->new_file.flags |= GIT_DIFF_FLAG__NO_DATA; break; case GIT_DELTA_MODIFIED: break; case GIT_DELTA_UNTRACKED: - delta->old_file.flags |= GIT_DIFF_FILE_NO_DATA; + delta->old_file.flags |= GIT_DIFF_FLAG__NO_DATA; if ((ctxt->opts->flags & GIT_DIFF_INCLUDE_UNTRACKED_CONTENT) == 0) - delta->new_file.flags |= GIT_DIFF_FILE_NO_DATA; + delta->new_file.flags |= GIT_DIFF_FLAG__NO_DATA; break; default: - delta->new_file.flags |= GIT_DIFF_FILE_NO_DATA; - delta->old_file.flags |= 
GIT_DIFF_FILE_NO_DATA; + delta->new_file.flags |= GIT_DIFF_FLAG__NO_DATA; + delta->old_file.flags |= GIT_DIFF_FLAG__NO_DATA; break; } -#define CHECK_UNMODIFIED (GIT_DIFF_FILE_NO_DATA | GIT_DIFF_FILE_VALID_OID) +#define CHECK_UNMODIFIED (GIT_DIFF_FLAG__NO_DATA | GIT_DIFF_FLAG_VALID_OID) check_if_unmodified = (delta->old_file.flags & CHECK_UNMODIFIED) == 0 && @@ -594,41 +598,41 @@ static int diff_patch_load( * memory footprint during diff. */ - if ((delta->old_file.flags & GIT_DIFF_FILE_NO_DATA) == 0 && + if ((delta->old_file.flags & GIT_DIFF_FLAG__NO_DATA) == 0 && patch->old_src == GIT_ITERATOR_TYPE_WORKDIR) { if ((error = get_workdir_content( ctxt, delta, &delta->old_file, &patch->old_data)) < 0) goto cleanup; - if (delta->binary == 1) + if ((delta->flags & GIT_DIFF_FLAG_BINARY) != 0) goto cleanup; } - if ((delta->new_file.flags & GIT_DIFF_FILE_NO_DATA) == 0 && + if ((delta->new_file.flags & GIT_DIFF_FLAG__NO_DATA) == 0 && patch->new_src == GIT_ITERATOR_TYPE_WORKDIR) { if ((error = get_workdir_content( ctxt, delta, &delta->new_file, &patch->new_data)) < 0) goto cleanup; - if (delta->binary == 1) + if ((delta->flags & GIT_DIFF_FLAG_BINARY) != 0) goto cleanup; } - if ((delta->old_file.flags & GIT_DIFF_FILE_NO_DATA) == 0 && + if ((delta->old_file.flags & GIT_DIFF_FLAG__NO_DATA) == 0 && patch->old_src != GIT_ITERATOR_TYPE_WORKDIR) { if ((error = get_blob_content( ctxt, delta, &delta->old_file, &patch->old_data, &patch->old_blob)) < 0) goto cleanup; - if (delta->binary == 1) + if ((delta->flags & GIT_DIFF_FLAG_BINARY) != 0) goto cleanup; } - if ((delta->new_file.flags & GIT_DIFF_FILE_NO_DATA) == 0 && + if ((delta->new_file.flags & GIT_DIFF_FLAG__NO_DATA) == 0 && patch->new_src != GIT_ITERATOR_TYPE_WORKDIR) { if ((error = get_blob_content( ctxt, delta, &delta->new_file, &patch->new_data, &patch->new_blob)) < 0) goto cleanup; - if (delta->binary == 1) + if ((delta->flags & GIT_DIFF_FLAG_BINARY) != 0) goto cleanup; } @@ -646,13 +650,13 @@ static int diff_patch_load( } 
cleanup: - if (delta->binary == -1) + if ((delta->flags & KNOWN_BINARY_FLAGS) == 0) update_delta_is_binary(delta); if (!error) { patch->flags |= GIT_DIFF_PATCH_LOADED; - if (delta->binary != 1 && + if ((delta->flags & GIT_DIFF_FLAG_BINARY) == 0 && delta->status != GIT_DELTA_UNMODIFIED && (patch->old_data.len || patch->new_data.len) && !git_oid_equal(&delta->old_file.oid, &delta->new_file.oid)) @@ -926,6 +930,15 @@ static int diff_patch_line_cb( return 0; } +static int diff_required(git_diff_list *diff, const char *action) +{ + if (!diff) { + giterr_set(GITERR_INVALID, "Must provide valid diff to %s", action); + return -1; + } + + return 0; +} int git_diff_foreach( git_diff_list *diff, @@ -939,9 +952,12 @@ int git_diff_foreach( size_t idx; git_diff_patch patch; - diff_context_init( - &ctxt, diff, diff->repo, &diff->opts, - file_cb, hunk_cb, data_cb, payload); + if (diff_required(diff, "git_diff_foreach") < 0) + return -1; + + if (diff_context_init( + &ctxt, diff, NULL, NULL, file_cb, hunk_cb, data_cb, payload) < 0) + return -1; diff_patch_init(&ctxt, &patch); @@ -1138,7 +1154,7 @@ static int print_patch_file( newpath = "/dev/null"; } - if (delta->binary != 1) { + if ((delta->flags & GIT_DIFF_FLAG_BINARY) == 0) { git_buf_printf(pi->buf, "--- %s%s\n", oldpfx, oldpath); git_buf_printf(pi->buf, "+++ %s%s\n", newpfx, newpath); } @@ -1153,7 +1169,7 @@ static int print_patch_file( return GIT_EUSER; } - if (delta->binary != 1) + if ((delta->flags & GIT_DIFF_FLAG_BINARY) == 0) return 0; git_buf_clear(pi->buf); @@ -1268,7 +1284,7 @@ static void set_data_from_blob( map->data = (char *)git_blob_rawcontent(blob); } else { file->size = 0; - file->flags |= GIT_DIFF_FILE_NO_DATA; + file->flags |= GIT_DIFF_FLAG__NO_DATA; map->len = 0; map->data = ""; @@ -1283,7 +1299,7 @@ static void set_data_from_buffer( map->len = buffer_len; if (!buffer) { - file->flags |= GIT_DIFF_FILE_NO_DATA; + file->flags |= GIT_DIFF_FLAG__NO_DATA; map->data = NULL; } else { map->data = (char *)buffer; @@ 
-1310,8 +1326,10 @@ static int diff_single_init( memset(data, 0, sizeof(*data)); - diff_context_init( - &data->ctxt, NULL, repo, opts, file_cb, hunk_cb, data_cb, payload); + if (diff_context_init( + &data->ctxt, NULL, repo, opts, + file_cb, hunk_cb, data_cb, payload) < 0) + return -1; diff_patch_init(&data->ctxt, &data->patch); @@ -1322,13 +1340,13 @@ static int diff_single_apply(diff_single_data *data) { int error; git_diff_delta *delta = &data->delta; - bool has_old = ((delta->old_file.flags & GIT_DIFF_FILE_NO_DATA) == 0); - bool has_new = ((delta->new_file.flags & GIT_DIFF_FILE_NO_DATA) == 0); + bool has_old = ((delta->old_file.flags & GIT_DIFF_FLAG__NO_DATA) == 0); + bool has_new = ((delta->new_file.flags & GIT_DIFF_FLAG__NO_DATA) == 0); /* finish setting up fake git_diff_delta record and loaded data */ data->patch.delta = delta; - delta->binary = -1; + delta->flags = delta->flags & ~KNOWN_BINARY_FLAGS; delta->status = has_new ? (has_old ? GIT_DELTA_MODIFIED : GIT_DELTA_ADDED) : @@ -1345,7 +1363,8 @@ static int diff_single_apply(diff_single_data *data) data->patch.flags |= GIT_DIFF_PATCH_LOADED; - if (delta->binary != 1 && delta->status != GIT_DELTA_UNMODIFIED) + if ((delta->flags & GIT_DIFF_FLAG_BINARY) == 0 && + delta->status != GIT_DELTA_UNMODIFIED) data->patch.flags |= GIT_DIFF_PATCH_DIFFABLE; /* do diffs */ @@ -1377,6 +1396,9 @@ int git_diff_blobs( new_blob ? git_object_owner((const git_object *)new_blob) : old_blob ? git_object_owner((const git_object *)old_blob) : NULL; + if (!repo) /* Hmm, given two NULL blobs, silently do no callbacks? */ + return 0; + if ((error = diff_single_init( &d, repo, options, file_cb, hunk_cb, data_cb, payload)) < 0) return error; @@ -1408,6 +1430,9 @@ int git_diff_blob_to_buffer( git_repository *repo = old_blob ? git_object_owner((const git_object *)old_blob) : NULL; + if (!repo && !buf) /* Hmm, given NULLs, silently do no callbacks? 
*/ + return 0; + if ((error = diff_single_init( &d, repo, options, file_cb, hunk_cb, data_cb, payload)) < 0) return error; @@ -1456,11 +1481,19 @@ int git_diff_get_patch( if (patch_ptr) *patch_ptr = NULL; + if (delta_ptr) + *delta_ptr = NULL; + + if (diff_required(diff, "git_diff_get_patch") < 0) + return -1; + + if (diff_context_init( + &ctxt, diff, NULL, NULL, + NULL, diff_patch_hunk_cb, diff_patch_line_cb, NULL) < 0) + return -1; delta = git_vector_get(&diff->deltas, idx); if (!delta) { - if (delta_ptr) - *delta_ptr = NULL; giterr_set(GITERR_INVALID, "Index out of range for delta in diff"); return GIT_ENOTFOUND; } @@ -1469,14 +1502,10 @@ int git_diff_get_patch( *delta_ptr = delta; if (!patch_ptr && - (delta->binary != -1 || + ((delta->flags & KNOWN_BINARY_FLAGS) != 0 || (diff->opts.flags & GIT_DIFF_SKIP_BINARY_CHECK) != 0)) return 0; - diff_context_init( - &ctxt, diff, diff->repo, &diff->opts, - NULL, diff_patch_hunk_cb, diff_patch_line_cb, NULL); - if (git_diff_delta__should_skip(ctxt.opts, delta)) return 0; diff --git a/src/diff_tform.c b/src/diff_tform.c index 2c2e1fb19..958d2bfec 100644 --- a/src/diff_tform.c +++ b/src/diff_tform.c @@ -7,6 +7,8 @@ #include "common.h" #include "diff.h" #include "git2/config.h" +#include "git2/blob.h" +#include "hashsig.h" static git_diff_delta *diff_delta__dup( const git_diff_delta *d, git_pool *pool) @@ -168,6 +170,36 @@ int git_diff_merge( return error; } +static int find_similar__hashsig_for_file( + void **out, const git_diff_file *f, const char *path, void *p) +{ + git_hashsig_option_t opt = (git_hashsig_option_t)p; + GIT_UNUSED(f); + return git_hashsig_create_fromfile((git_hashsig **)out, path, opt); +} + +static int find_similar__hashsig_for_buf( + void **out, const git_diff_file *f, const char *buf, size_t len, void *p) +{ + git_hashsig_option_t opt = (git_hashsig_option_t)p; + GIT_UNUSED(f); + return git_hashsig_create((git_hashsig **)out, buf, len, opt); +} + +static void find_similar__hashsig_free(void *sig, void 
*payload) +{ + GIT_UNUSED(payload); + git_hashsig_free(sig); +} + +static int find_similar__calc_similarity( + int *score, void *siga, void *sigb, void *payload) +{ + GIT_UNUSED(payload); + *score = git_hashsig_compare(siga, sigb); + return 0; +} + #define DEFAULT_THRESHOLD 50 #define DEFAULT_BREAK_REWRITE_THRESHOLD 60 #define DEFAULT_TARGET_LIMIT 200 @@ -178,7 +210,6 @@ static int normalize_find_opts( git_diff_find_options *given) { git_config *cfg = NULL; - const char *val; if (diff->repo != NULL && git_repository_config__weakptr(&cfg, diff->repo) < 0) @@ -187,8 +218,9 @@ static int normalize_find_opts( if (given != NULL) memcpy(opts, given, sizeof(*opts)); else { - git_diff_find_options init = GIT_DIFF_FIND_OPTIONS_INIT; - memmove(opts, &init, sizeof(init)); + const char *val = NULL; + + GIT_INIT_STRUCTURE(opts, GIT_DIFF_FIND_OPTIONS_VERSION); opts->flags = GIT_DIFF_FIND_RENAMES; @@ -236,6 +268,24 @@ static int normalize_find_opts( opts->target_limit = limit; } + /* assign the internal metric with whitespace flag as payload */ + if (!opts->metric) { + opts->metric = git__malloc(sizeof(git_diff_similarity_metric)); + GITERR_CHECK_ALLOC(opts->metric); + + opts->metric->file_signature = find_similar__hashsig_for_file; + opts->metric->buffer_signature = find_similar__hashsig_for_buf; + opts->metric->free_signature = find_similar__hashsig_free; + opts->metric->similarity = find_similar__calc_similarity; + + if (opts->flags & GIT_DIFF_FIND_IGNORE_WHITESPACE) + opts->metric->payload = (void *)GIT_HASHSIG_IGNORE_WHITESPACE; + else if (opts->flags & GIT_DIFF_FIND_DONT_IGNORE_WHITESPACE) + opts->metric->payload = (void *)GIT_HASHSIG_NORMAL; + else + opts->metric->payload = (void *)GIT_HASHSIG_SMART_WHITESPACE; + } + return 0; } @@ -250,10 +300,10 @@ static int apply_splits_and_deletes(git_diff_list *diff, size_t expected_size) /* build new delta list without TO_DELETE and splitting TO_SPLIT */ git_vector_foreach(&diff->deltas, i, delta) { - if (delta->status == 
GIT_DELTA__TO_DELETE) + if ((delta->flags & GIT_DIFF_FLAG__TO_DELETE) != 0) continue; - if (delta->status == GIT_DELTA__TO_SPLIT) { + if ((delta->flags & GIT_DIFF_FLAG__TO_SPLIT) != 0) { git_diff_delta *deleted = diff_delta__dup(delta, &diff->pool); if (!deleted) goto on_error; @@ -261,7 +311,7 @@ static int apply_splits_and_deletes(git_diff_list *diff, size_t expected_size) deleted->status = GIT_DELTA_DELETED; memset(&deleted->new_file, 0, sizeof(deleted->new_file)); deleted->new_file.path = deleted->old_file.path; - deleted->new_file.flags |= GIT_DIFF_FILE_VALID_OID; + deleted->new_file.flags |= GIT_DIFF_FLAG_VALID_OID; if (git_vector_insert(&onto, deleted) < 0) goto on_error; @@ -269,7 +319,7 @@ static int apply_splits_and_deletes(git_diff_list *diff, size_t expected_size) delta->status = GIT_DELTA_ADDED; memset(&delta->old_file, 0, sizeof(delta->old_file)); delta->old_file.path = delta->new_file.path; - delta->old_file.flags |= GIT_DIFF_FILE_VALID_OID; + delta->old_file.flags |= GIT_DIFF_FLAG_VALID_OID; } if (git_vector_insert(&onto, delta) < 0) @@ -278,7 +328,7 @@ static int apply_splits_and_deletes(git_diff_list *diff, size_t expected_size) /* cannot return an error past this point */ git_vector_foreach(&diff->deltas, i, delta) - if (delta->status == GIT_DELTA__TO_DELETE) + if ((delta->flags & GIT_DIFF_FLAG__TO_DELETE) != 0) git__free(delta); /* swap new delta list into place */ @@ -297,17 +347,86 @@ on_error: return -1; } -static unsigned int calc_similarity( - void *cache, git_diff_file *old_file, git_diff_file *new_file) +GIT_INLINE(git_diff_file *) similarity_get_file(git_diff_list *diff, size_t idx) +{ + git_diff_delta *delta = git_vector_get(&diff->deltas, idx / 2); + return (idx & 1) ? 
&delta->new_file : &delta->old_file; +} + +static int similarity_calc( + git_diff_list *diff, + git_diff_find_options *opts, + size_t file_idx, + void **cache) { - GIT_UNUSED(cache); + int error = 0; + git_diff_file *file = similarity_get_file(diff, file_idx); + git_iterator_type_t src = (file_idx & 1) ? diff->old_src : diff->new_src; + + if (src == GIT_ITERATOR_TYPE_WORKDIR) { /* compute hashsig from file */ + git_buf path = GIT_BUF_INIT; + + /* TODO: apply wd-to-odb filters to file data if necessary */ - if (git_oid_cmp(&old_file->oid, &new_file->oid) == 0) + if (!(error = git_buf_joinpath( + &path, git_repository_workdir(diff->repo), file->path))) + error = opts->metric->file_signature( + &cache[file_idx], file, path.ptr, opts->metric->payload); + + git_buf_free(&path); + } else { /* compute hashsig from blob buffer */ + git_blob *blob = NULL; + + /* TODO: add max size threshold a la diff? */ + + if ((error = git_blob_lookup(&blob, diff->repo, &file->oid)) < 0) + return error; + + error = opts->metric->buffer_signature( + &cache[file_idx], file, git_blob_rawcontent(blob), + git_blob_rawsize(blob), opts->metric->payload); + + git_blob_free(blob); + } + + return error; +} + +static int similarity_measure( + git_diff_list *diff, + git_diff_find_options *opts, + void **cache, + size_t a_idx, + size_t b_idx) +{ + int score = 0; + git_diff_file *a_file = similarity_get_file(diff, a_idx); + git_diff_file *b_file = similarity_get_file(diff, b_idx); + + if (GIT_MODE_TYPE(a_file->mode) != GIT_MODE_TYPE(b_file->mode)) + return 0; + + if (git_oid_cmp(&a_file->oid, &b_file->oid) == 0) return 100; - /* TODO: insert actual similarity algo here */ + /* update signature cache if needed */ + if (!cache[a_idx] && similarity_calc(diff, opts, a_idx, cache) < 0) + return -1; + if (!cache[b_idx] && similarity_calc(diff, opts, b_idx, cache) < 0) + return -1; - return 0; + /* compare signatures */ + if (opts->metric->similarity( + &score, cache[a_idx], cache[b_idx], 
opts->metric->payload) < 0) + return -1; + + /* clip score */ + if (score < 0) + score = 0; + else if (score > 100) + score = 100; + + return score; } #define FLAG_SET(opts,flag_name) ((opts.flags & flag_name) != 0) @@ -316,109 +435,135 @@ int git_diff_find_similar( git_diff_list *diff, git_diff_find_options *given_opts) { - unsigned int i, j, similarity; + size_t i, j, cache_size, *matches; + int error = 0, similarity; git_diff_delta *from, *to; git_diff_find_options opts; - unsigned int tried_targets, num_changes = 0; - git_vector matches = GIT_VECTOR_INIT; + size_t tried_targets, num_rewrites = 0; + void **cache; - if (normalize_find_opts(diff, &opts, given_opts) < 0) - return -1; + if ((error = normalize_find_opts(diff, &opts, given_opts)) < 0) + return error; - /* first do splits if requested */ + /* TODO: maybe abort if deltas.length > target_limit ??? */ + + cache_size = diff->deltas.length * 2; /* must store b/c length may change */ + cache = git__calloc(cache_size, sizeof(void *)); + GITERR_CHECK_ALLOC(cache); + + matches = git__calloc(diff->deltas.length, sizeof(size_t)); + GITERR_CHECK_ALLOC(matches); + + /* first break MODIFIED records that are too different (if requested) */ if (FLAG_SET(opts, GIT_DIFF_FIND_AND_BREAK_REWRITES)) { git_vector_foreach(&diff->deltas, i, from) { if (from->status != GIT_DELTA_MODIFIED) continue; - /* Right now, this doesn't work right because the similarity - * algorithm isn't actually implemented... 
- */ - similarity = 100; - /* calc_similarity(NULL, &from->old_file, from->new_file); */ + similarity = similarity_measure( + diff, &opts, cache, 2 * i, 2 * i + 1); - if (similarity < opts.break_rewrite_threshold) { - from->status = GIT_DELTA__TO_SPLIT; - num_changes++; + if (similarity < 0) { + error = similarity; + goto cleanup; } - } - /* apply splits as needed */ - if (num_changes > 0 && - apply_splits_and_deletes( - diff, diff->deltas.length + num_changes) < 0) - return -1; + if ((unsigned int)similarity < opts.break_rewrite_threshold) { + from->flags |= GIT_DIFF_FLAG__TO_SPLIT; + num_rewrites++; + } + } } /* next find the most similar delta for each rename / copy candidate */ - if (git_vector_init(&matches, diff->deltas.length, git_diff_delta__cmp) < 0) - return -1; - git_vector_foreach(&diff->deltas, i, from) { tried_targets = 0; + /* skip things that aren't blobs */ + if (GIT_MODE_TYPE(from->old_file.mode) != + GIT_MODE_TYPE(GIT_FILEMODE_BLOB)) + continue; + + /* don't check UNMODIFIED files as source unless given option */ + if (from->status == GIT_DELTA_UNMODIFIED && + !FLAG_SET(opts, GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED)) + continue; + + /* skip all but DELETED files unless copy detection is on */ + if (!FLAG_SET(opts, GIT_DIFF_FIND_COPIES) && + from->status != GIT_DELTA_DELETED && + (from->flags & GIT_DIFF_FLAG__TO_SPLIT) == 0) + continue; + git_vector_foreach(&diff->deltas, j, to) { if (i == j) continue; + /* skip things that aren't blobs */ + if (GIT_MODE_TYPE(to->new_file.mode) != + GIT_MODE_TYPE(GIT_FILEMODE_BLOB)) + continue; + switch (to->status) { case GIT_DELTA_ADDED: case GIT_DELTA_UNTRACKED: case GIT_DELTA_RENAMED: case GIT_DELTA_COPIED: break; + case GIT_DELTA_MODIFIED: + if ((to->flags & GIT_DIFF_FLAG__TO_SPLIT) == 0) + continue; + break; default: /* only the above status values should be checked */ continue; } - /* skip all but DELETED files unless copy detection is on */ - if (from->status != GIT_DELTA_DELETED && - !FLAG_SET(opts, 
GIT_DIFF_FIND_COPIES)) - continue; - - /* don't check UNMODIFIED files as source unless given option */ - if (from->status == GIT_DELTA_UNMODIFIED && - !FLAG_SET(opts, GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED)) - continue; - - /* cap on maximum files we'll examine */ + /* cap on maximum files we'll examine (per "from" file) */ if (++tried_targets > opts.target_limit) break; /* calculate similarity and see if this pair beats the * similarity score of the current best pair. */ - similarity = calc_similarity(NULL, &from->old_file, &to->new_file); + similarity = similarity_measure( + diff, &opts, cache, 2 * i, 2 * j + 1); + + if (similarity < 0) { + error = similarity; + goto cleanup; + } - if (to->similarity < similarity) { - to->similarity = similarity; - if (git_vector_set(NULL, &matches, j, from) < 0) - return -1; + if (to->similarity < (unsigned int)similarity) { + to->similarity = (unsigned int)similarity; + matches[j] = i + 1; } } } /* next rewrite the diffs with renames / copies */ - num_changes = 0; - git_vector_foreach(&diff->deltas, j, to) { - from = GIT_VECTOR_GET(&matches, j); - if (!from) { + if (!matches[j]) { assert(to->similarity == 0); continue; } - /* three possible outcomes here: + i = matches[j] - 1; + from = GIT_VECTOR_GET(&diff->deltas, i); + assert(from); + + /* four possible outcomes here: * 1. old DELETED and if over rename threshold, * new becomes RENAMED and old goes away - * 2. old was MODIFIED but FIND_RENAMES_FROM_REWRITES is on and + * 2. old SPLIT and if over rename threshold, + * new becomes RENAMED and old becomes ADDED (clear SPLIT) + * 3. old was MODIFIED but FIND_RENAMES_FROM_REWRITES is on and * old is more similar to new than it is to itself, in which * case, new becomes RENAMED and old becomed ADDED - * 3. otherwise if over copy threshold, new becomes COPIED + * 4. 
otherwise if over copy threshold, new becomes COPIED */ if (from->status == GIT_DELTA_DELETED) { @@ -430,8 +575,27 @@ int git_diff_find_similar( to->status = GIT_DELTA_RENAMED; memcpy(&to->old_file, &from->old_file, sizeof(to->old_file)); - from->status = GIT_DELTA__TO_DELETE; - num_changes++; + from->flags |= GIT_DIFF_FLAG__TO_DELETE; + num_rewrites++; + + continue; + } + + if (from->status == GIT_DELTA_MODIFIED && + (from->flags & GIT_DIFF_FLAG__TO_SPLIT) != 0) + { + if (to->similarity < opts.rename_threshold) { + to->similarity = 0; + continue; + } + + to->status = GIT_DELTA_RENAMED; + memcpy(&to->old_file, &from->old_file, sizeof(to->old_file)); + + from->status = GIT_DELTA_ADDED; + from->flags &= ~GIT_DIFF_FLAG__TO_SPLIT; + memset(&from->old_file, 0, sizeof(from->old_file)); + num_rewrites--; continue; } @@ -440,17 +604,22 @@ int git_diff_find_similar( FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES) && to->similarity > opts.rename_threshold) { - similarity = 100; - /* calc_similarity(NULL, &from->old_file, from->new_file); */ + similarity = similarity_measure( + diff, &opts, cache, 2 * i, 2 * i + 1); + + if (similarity < 0) { + error = similarity; + goto cleanup; + } - if (similarity < opts.rename_from_rewrite_threshold) { + if ((unsigned int)similarity < opts.rename_from_rewrite_threshold) { to->status = GIT_DELTA_RENAMED; memcpy(&to->old_file, &from->old_file, sizeof(to->old_file)); from->status = GIT_DELTA_ADDED; memset(&from->old_file, 0, sizeof(from->old_file)); from->old_file.path = to->old_file.path; - from->old_file.flags |= GIT_DIFF_FILE_VALID_OID; + from->old_file.flags |= GIT_DIFF_FLAG_VALID_OID; continue; } @@ -466,17 +635,26 @@ int git_diff_find_similar( memcpy(&to->old_file, &from->old_file, sizeof(to->old_file)); } - git_vector_free(&matches); + if (num_rewrites > 0) { + assert(num_rewrites < diff->deltas.length); - if (num_changes > 0) { - assert(num_changes < diff->deltas.length); + error = apply_splits_and_deletes( + diff, 
diff->deltas.length - num_rewrites); + } + +cleanup: + git__free(matches); - if (apply_splits_and_deletes( - diff, diff->deltas.length - num_changes) < 0) - return -1; + for (i = 0; i < cache_size; ++i) { + if (cache[i] != NULL) + opts.metric->free_signature(cache[i], opts.metric->payload); } + git__free(cache); - return 0; + if (!given_opts || !given_opts->metric) + git__free(opts.metric); + + return error; } #undef FLAG_SET diff --git a/src/hashsig.c b/src/hashsig.c new file mode 100644 index 000000000..60649fd11 --- /dev/null +++ b/src/hashsig.c @@ -0,0 +1,365 @@ +/* + * Copyright (C) the libgit2 contributors. All rights reserved. + * + * This file is part of libgit2, distributed under the GNU GPL v2 with + * a Linking Exception. For full terms see the included COPYING file. + */ +#include "hashsig.h" +#include "fileops.h" + +typedef uint32_t hashsig_t; +typedef uint64_t hashsig_state; +/* scale of the score returned by hashsig_heap_compare */ +#define HASHSIG_SCALE 100 +/* rolling hash parameters: window length in chars, initial state, per-char multiplier (used as a factor, not a bit shift) and mask applied after every step */ +#define HASHSIG_HASH_WINDOW 32 +#define HASHSIG_HASH_START 0 +#define HASHSIG_HASH_SHIFT 5 +#define HASHSIG_HASH_MASK 0x7FFFFFFF +/* capacity of each heap: 127 values */ +#define HASHSIG_HEAP_SIZE ((1 << 7) - 1) + +typedef int (*hashsig_cmp)(const void *a, const void *b); +/* fixed-capacity binary heap kept in an array: size counts the live values, asize is the capacity, cmp is the ordering function */ +typedef struct { + int size, asize; + hashsig_cmp cmp; + hashsig_t values[HASHSIG_HEAP_SIZE]; +} hashsig_heap; +/* rolling hash state carried between hashsig_add_hashes calls: current hash, multiplier for the outgoing char, the window chars, fill count, index of the next char to replace, and the newline flag used by GIT_HASHSIG_SMART_WHITESPACE */ +typedef struct { + hashsig_state state, shift_n; + char window[HASHSIG_HASH_WINDOW]; + int win_len, win_pos, saw_lf; +} hashsig_in_progress; +/* initial state; saw_lf starts at 1, so whitespace at the very start of the data is handled like whitespace after a newline */ +#define HASHSIG_IN_PROGRESS_INIT { HASHSIG_HASH_START, 1, {0}, 0, 0, 1 } +/* signature: one heap ordered by hashsig_cmp_min and one by hashsig_cmp_max (see hashsig_alloc), the whitespace option, and a count of the hashes considered */ +struct git_hashsig { + hashsig_heap mins; + hashsig_heap maxs; + git_hashsig_option_t opt; + int considered; +}; +/* index arithmetic for a binary heap stored in a zero-based array */ +#define HEAP_LCHILD_OF(I) (((I)*2)+1) +#define HEAP_RCHILD_OF(I) (((I)*2)+2) +#define HEAP_PARENT_OF(I) (((I)-1)>>1) +/* reset heap h to empty with capacity HASHSIG_HEAP_SIZE and ordering function cmp */ +static void hashsig_heap_init(hashsig_heap *h, hashsig_cmp cmp) +{ + h->size = 0; + h->asize = HASHSIG_HEAP_SIZE; + h->cmp = cmp; +} +/* ascending comparison of two hashsig_t values */ +static int hashsig_cmp_max(const void *a, const void *b) +{ + hashsig_t av = *(const hashsig_t *)a, bv = *(const hashsig_t 
*)b; + return (av < bv) ? -1 : (av > bv) ? 1 : 0; +} +/* reverse ordering: larger values compare as smaller */ +static int hashsig_cmp_min(const void *a, const void *b) +{ + hashsig_t av = *(const hashsig_t *)a, bv = *(const hashsig_t *)b; + return (av > bv) ? -1 : (av < bv) ? 1 : 0; +} +/* sift element el toward the root while cmp(parent, el) > 0 */ +static void hashsig_heap_up(hashsig_heap *h, int el) +{ + int parent_el = HEAP_PARENT_OF(el); + + while (el > 0 && h->cmp(&h->values[parent_el], &h->values[el]) > 0) { + hashsig_t t = h->values[el]; + h->values[el] = h->values[parent_el]; + h->values[parent_el] = t; + + el = parent_el; + parent_el = HEAP_PARENT_OF(el); + } +} +/* sift element el toward the leaves until it orders strictly before both of its children */ +static void hashsig_heap_down(hashsig_heap *h, int el) +{ + hashsig_t v, lv, rv; + + /* el < h->size / 2 holds while el still has at least a left child, i.e. el is not a leaf. NOTE(review): when h->size is even the last parent has no right child, so rv below is read from index h->size, one past the live values -- confirm this is intended */ + + while (el < h->size / 2) { + int lel = HEAP_LCHILD_OF(el), rel = HEAP_RCHILD_OF(el), swapel; + + v = h->values[el]; + lv = h->values[lel]; + rv = h->values[rel]; + + if (h->cmp(&v, &lv) < 0 && h->cmp(&v, &rv) < 0) + break; + + swapel = (h->cmp(&lv, &rv) < 0) ? lel : rel; + + h->values[el] = h->values[swapel]; + h->values[swapel] = v; + + el = swapel; + } +} + +static void hashsig_heap_sort(hashsig_heap *h) +{ + /* only need to do this at the end for signature comparison */ + qsort(h->values, h->size, sizeof(hashsig_t), h->cmp); +} +/* offer val to the heap; when the heap is full, val either replaces the top value or is dropped */ +static void hashsig_heap_insert(hashsig_heap *h, hashsig_t val) +{ + /* if heap is full, pop top if new element should replace it */ + if (h->size == h->asize && h->cmp(&val, &h->values[0]) > 0) { + h->size--; + h->values[0] = h->values[h->size]; + hashsig_heap_down(h, 0); + } + + /* if heap is not full, insert new element */ + if (h->size < h->asize) { + h->values[h->size++] = val; + hashsig_heap_up(h, h->size - 1); + } +} +/* decide whether ch takes part in hashing under the option flags in opt; *saw_lf is read and updated only for GIT_HASHSIG_SMART_WHITESPACE */ +GIT_INLINE(bool) hashsig_include_char( + char ch, git_hashsig_option_t opt, int *saw_lf) +{ + if ((opt & GIT_HASHSIG_IGNORE_WHITESPACE) && git__isspace(ch)) + return false; + + if (opt & GIT_HASHSIG_SMART_WHITESPACE) { + if (ch == '\r' || (*saw_lf && git__isspace(ch))) + return false; + + *saw_lf = (ch == '\n'); 
+ } + + return true; +} +/* collect the first HASHSIG_HASH_WINDOW included chars into prog->window while building the hash state; advances *data past the bytes consumed and, once the window is full, inserts the first hash into both heaps */ +static void hashsig_initial_window( + git_hashsig *sig, + const char **data, + size_t size, + hashsig_in_progress *prog) +{ + hashsig_state state, shift_n; + int win_len; + const char *scan, *end; + + /* init until we have processed at least HASHSIG_HASH_WINDOW data */ + + if (prog->win_len >= HASHSIG_HASH_WINDOW) + return; + + state = prog->state; + win_len = prog->win_len; + shift_n = prog->shift_n; + + scan = *data; + end = scan + size; + + while (scan < end && win_len < HASHSIG_HASH_WINDOW) { + char ch = *scan++; + + if (!hashsig_include_char(ch, sig->opt, &prog->saw_lf)) + continue; + + state = (state * HASHSIG_HASH_SHIFT + ch) & HASHSIG_HASH_MASK; + + if (!win_len) + shift_n = 1; + else + shift_n = (shift_n * HASHSIG_HASH_SHIFT) & HASHSIG_HASH_MASK; + + prog->window[win_len++] = ch; + } + + /* insert initial hash if we just finished */ + + if (win_len == HASHSIG_HASH_WINDOW) { + hashsig_heap_insert(&sig->mins, state); + hashsig_heap_insert(&sig->maxs, state); + sig->considered = 1; + } + + prog->state = state; + prog->win_len = win_len; + prog->shift_n = shift_n; + + *data = scan; +} +/* feed size bytes of data through the rolling hash, inserting the hash of every new window into both heaps; prog carries the state between calls */ +static int hashsig_add_hashes( + git_hashsig *sig, + const char *data, + size_t size, + hashsig_in_progress *prog) +{ + const char *scan = data, *end = data + size; + hashsig_state state, shift_n, rmv; + + if (prog->win_len < HASHSIG_HASH_WINDOW) + hashsig_initial_window(sig, &scan, size, prog); + + state = prog->state; + shift_n = prog->shift_n; + + /* advance window, adding new chars and removing old */ + + for (; scan < end; ++scan) { + char ch = *scan; + + if (!hashsig_include_char(ch, sig->opt, &prog->saw_lf)) + continue; +/* term of the window char that is about to be replaced */ + rmv = shift_n * prog->window[prog->win_pos]; + + state = (state - rmv) & HASHSIG_HASH_MASK; + state = (state * HASHSIG_HASH_SHIFT) & HASHSIG_HASH_MASK; + state = (state + ch) & HASHSIG_HASH_MASK; + + hashsig_heap_insert(&sig->mins, state); + hashsig_heap_insert(&sig->maxs, state); + sig->considered++; + + 
prog->window[prog->win_pos] = ch; + prog->win_pos = (prog->win_pos + 1) % HASHSIG_HASH_WINDOW; + } + + prog->state = state; + + return 0; +} +/* require a completely full mins heap, otherwise set an error and return GIT_EBUFS; on success sort both heaps so they can be compared */ +static int hashsig_finalize_hashes(git_hashsig *sig) +{ + if (sig->mins.size < HASHSIG_HEAP_SIZE) { + giterr_set(GITERR_INVALID, + "File too small for similarity signature calculation"); + return GIT_EBUFS; + } + + hashsig_heap_sort(&sig->mins); + hashsig_heap_sort(&sig->maxs); + + return 0; +} +/* allocate a signature, initialize both heaps and store the option; returns NULL when git__calloc returns NULL */ +static git_hashsig *hashsig_alloc(git_hashsig_option_t opts) +{ + git_hashsig *sig = git__calloc(1, sizeof(git_hashsig)); + if (!sig) + return NULL; + + hashsig_heap_init(&sig->mins, hashsig_cmp_min); + hashsig_heap_init(&sig->maxs, hashsig_cmp_max); + sig->opt = opts; + + return sig; +} +/* build a signature from an in-memory buffer; *out is written only on success, otherwise the signature is freed and the error code returned */ +int git_hashsig_create( + git_hashsig **out, + const char *buf, + size_t buflen, + git_hashsig_option_t opts) +{ + int error; + hashsig_in_progress prog = HASHSIG_IN_PROGRESS_INIT; + git_hashsig *sig = hashsig_alloc(opts); + GITERR_CHECK_ALLOC(sig); + + error = hashsig_add_hashes(sig, buf, buflen, &prog); + + if (!error) + error = hashsig_finalize_hashes(sig); + + if (!error) + *out = sig; + else + git_hashsig_free(sig); + + return error; +} +/* build a signature from the file at path, read in chunks of sizeof(buf) bytes with prog carrying the rolling hash state across reads; *out is written only on success */ +int git_hashsig_create_fromfile( + git_hashsig **out, + const char *path, + git_hashsig_option_t opts) +{ + char buf[4096]; + ssize_t buflen = 0; + int error = 0, fd; + hashsig_in_progress prog = HASHSIG_IN_PROGRESS_INIT; + git_hashsig *sig = hashsig_alloc(opts); + GITERR_CHECK_ALLOC(sig); + + if ((fd = git_futils_open_ro(path)) < 0) { + git__free(sig); + return fd; + } + + while (!error) { + if ((buflen = p_read(fd, buf, sizeof(buf))) <= 0) { + if ((error = buflen) < 0) + giterr_set(GITERR_OS, + "Read error on '%s' calculating similarity hashes", path); + break; + } + + error = hashsig_add_hashes(sig, buf, buflen, &prog); + } + + p_close(fd); + + if (!error) + error = hashsig_finalize_hashes(sig); + + if (!error) + *out = sig; + else + git_hashsig_free(sig); + + return error; +} + +void 
git_hashsig_free(git_hashsig *sig) +{ + git__free(sig); +} +/* walk two sorted heaps counting the values they share; returns HASHSIG_SCALE * (2 * matches) / (a->size + b->size) */ +static int hashsig_heap_compare(const hashsig_heap *a, const hashsig_heap *b) +{ + int matches = 0, i, j, cmp; + + assert(a->cmp == b->cmp); + + /* hash heaps are sorted - just look for overlap vs total */ + + for (i = 0, j = 0; i < a->size && j < b->size; ) { + cmp = a->cmp(&a->values[i], &b->values[j]); + + if (cmp < 0) + ++i; + else if (cmp > 0) + ++j; + else { + ++i; ++j; ++matches; + } + } + + return HASHSIG_SCALE * (matches * 2) / (a->size + b->size); +} +/* average of the scores for the mins heaps and for the maxs heaps */ +int git_hashsig_compare(const git_hashsig *a, const git_hashsig *b) +{ + return (hashsig_heap_compare(&a->mins, &b->mins) + + hashsig_heap_compare(&a->maxs, &b->maxs)) / 2; +} + diff --git a/src/hashsig.h b/src/hashsig.h new file mode 100644 index 000000000..8c920cbf1 --- /dev/null +++ b/src/hashsig.h @@ -0,0 +1,72 @@ +/* + * Copyright (C) the libgit2 contributors. All rights reserved. + * + * This file is part of libgit2, distributed under the GNU GPL v2 with + * a Linking Exception. For full terms see the included COPYING file. + */ +#ifndef INCLUDE_hashsig_h__ +#define INCLUDE_hashsig_h__ + +#include "common.h" + +/** + * Similarity signature built from rolling-window hashes of a buffer + */ +typedef struct git_hashsig git_hashsig; + +typedef enum { + GIT_HASHSIG_NORMAL = 0, /* use all data */ + GIT_HASHSIG_IGNORE_WHITESPACE = 1, /* ignore whitespace */ + GIT_HASHSIG_SMART_WHITESPACE = 2, /* ignore \r and all space after \n */ +} git_hashsig_option_t; + +/** + * Build a similarity signature for a buffer + * + * If a whitespace-ignoring option is given, the ignored characters are + * skipped while the buffer is hashed. The buffer is only read; it is + * not modified. + * + * This will return an error if the buffer doesn't contain enough data to + * compute a valid signature. 
+ * + * @param out Receives the new signature on success; release it with + * `git_hashsig_free` + * @param buf The contents of the file to hash + * @param buflen The length of the data at `buf` + * @param opts Whitespace handling option (a `git_hashsig_option_t` value) + */ +extern int git_hashsig_create( + git_hashsig **out, + const char *buf, + size_t buflen, + git_hashsig_option_t opts); + +/** + * Build a similarity signature from a file + * + * This walks through the file, only loading a maximum of 4K of file data at + * a time. Otherwise, it acts just like `git_hashsig_create`. + * + * This will return an error if the file doesn't contain enough data to + * compute a valid signature. + */ +extern int git_hashsig_create_fromfile( + git_hashsig **out, + const char *path, + git_hashsig_option_t opts); + +/** + * Release memory for a content similarity signature + */ +extern void git_hashsig_free(git_hashsig *sig); + +/** + * Measure similarity between two signatures + * + * @return <0 for error, [0 to 100] as similarity score + */ +extern int git_hashsig_compare( + const git_hashsig *a, + const git_hashsig *b); + +#endif |