From 17ce490c3ea0289e0e25d6bb6b7c1c8c292daeb0 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Tue, 22 Oct 2024 00:11:50 -0400 Subject: [PATCH 1/4] t5616: mark tests as bogus with --path-walk These two tests in t5616-partial-clone.sh are actually already broken and there are comments supporting that. Those comments were focused on the GIT_TEST_FULL_NAME_HASH variable, but they also apply to this one. We will want to avoid issues here. Signed-off-by: Derrick Stolee --- t/t5616-partial-clone.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/t/t5616-partial-clone.sh b/t/t5616-partial-clone.sh index 425aa8d8789ed1..7923dfbf3cb5a7 100755 --- a/t/t5616-partial-clone.sh +++ b/t/t5616-partial-clone.sh @@ -527,6 +527,7 @@ test_expect_success 'fetch lazy-fetches only to resolve deltas' ' # used as delta bases! GIT_TRACE_PACKET="$(pwd)/trace" \ GIT_TEST_FULL_NAME_HASH=0 \ + GIT_TEST_PACK_PATH_WALK=0 \ git -C client \ fetch "file://$(pwd)/server" main && @@ -557,6 +558,7 @@ test_expect_success 'fetch lazy-fetches only to resolve deltas, protocol v2' ' # used as delta bases! GIT_TRACE_PACKET="$(pwd)/trace" \ GIT_TEST_FULL_NAME_HASH=0 \ + GIT_TEST_PACK_PATH_WALK=0 \ git -C client \ fetch "file://$(pwd)/server" main && From 1af435e33b7ac4dbc516606a3c8960a9e43fc757 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Mon, 21 Oct 2024 21:40:50 -0400 Subject: [PATCH 2/4] path-walk: add new 'edge_aggressive' option In preparation for allowing both the --shallow and --path-walk options in the 'git pack-objects' builtin, create a new 'edge_aggressive' option in the path-walk API. This option will help walk the boundary more thoroughly and help avoid sending extra objects during fetches and pushes. The only use of the 'edge_hint_aggressive' option in the revision API is within mark_edges_uninteresting(), which is usually called before between prepare_revision_walk() and before visiting commits with get_revision(). In prepare_revision_walk(), the UNINTERESTING commits are walked until a boundary is found. We didn't use this in the past because we would mark objects UNINTERESTING after doing the initial commit walk to the boundary. While we should be marking these objects as UNINTERESTING, we shouldn't _emit_ them all via the path-walk algorithm or else our delta calculations will get really slow. Based on these observations, the way we were handling the UNINTERESTING flag in walk_objects_by_path() was overly complicated and buggy. A lot of it can be removed and simplified to work with this new approach. It also means that we will see the UNINTERESTING boundaries of paths when doing a default path-walk call, changing some existing test cases. Signed-off-by: Derrick Stolee --- Documentation/technical/api-path-walk.txt | 8 +++ path-walk.c | 60 +++++++++++++---------- path-walk.h | 7 +++ t/t6601-path-walk.sh | 16 +++--- 4 files changed, 58 insertions(+), 33 deletions(-) diff --git a/Documentation/technical/api-path-walk.txt b/Documentation/technical/api-path-walk.txt index 83bfe3d665e9fb..85d8e262401486 100644 --- a/Documentation/technical/api-path-walk.txt +++ b/Documentation/technical/api-path-walk.txt @@ -65,6 +65,14 @@ better off using the revision walk API instead. the revision walk so that the walk emits commits marked with the `UNINTERESTING` flag. +`edge_aggressive`:: + For performance reasons, usually only the boundary commits are + explored to find UNINTERESTING objects. However, in the case of + shallow clones it can be helpful to mark all trees and blobs + reachable from UNINTERESTING tip commits as UNINTERESTING. This + matches the behavior of `--objects-edge-aggressive` in the + revision API. + `pl`:: This pattern list pointer allows focusing the path-walk search to a set of patterns, only emitting paths that match the given diff --git a/path-walk.c b/path-walk.c index c0be95922c1615..6451e16562971f 100644 --- a/path-walk.c +++ b/path-walk.c @@ -18,6 +18,7 @@ #include "trace2.h" #include "tree.h" #include "tree-walk.h" +#include "list-objects.h" struct type_and_oid_list { @@ -233,6 +234,26 @@ static void clear_strmap(struct strmap *map) strmap_init(map); } +static struct repository *edge_repo; +static struct type_and_oid_list *edge_tree_list; + +static void show_edge(struct commit *commit) +{ + struct tree *t = repo_get_commit_tree(edge_repo, commit); + + if (!t) + return; + + if (commit->object.flags & UNINTERESTING) + t->object.flags |= UNINTERESTING; + + if (t->object.flags & SEEN) + return; + t->object.flags |= SEEN; + + oid_array_append(&edge_tree_list->oids, &t->object.oid); +} + /** * Given the configuration of 'info', walk the commits based on 'info->revs' and * call 'info->path_fn' on each discovered path. @@ -242,7 +263,7 @@ static void clear_strmap(struct strmap *map) int walk_objects_by_path(struct path_walk_info *info) { const char *root_path = ""; - int ret = 0, has_uninteresting = 0; + int ret = 0; size_t commits_nr = 0, paths_nr = 0; struct commit *c; struct type_and_oid_list *root_tree_list; @@ -254,7 +275,6 @@ int walk_objects_by_path(struct path_walk_info *info) .path_stack = STRING_LIST_INIT_DUP, .paths_to_lists = STRMAP_INIT }; - struct oidset root_tree_set = OIDSET_INIT; trace2_region_enter("path-walk", "commit-walk", info->revs->repo); @@ -280,6 +300,18 @@ int walk_objects_by_path(struct path_walk_info *info) if (prepare_revision_walk(info->revs)) die(_("failed to setup revision walk")); + /* + * Do an initial walk of tip commits in info->revs->commits and + * info->revs->cmdline.rev to match the standard edge-walk behavior. + * + * This is particularly important when 'edge_aggressive' is set. + */ + info->revs->edge_hint_aggressive = info->edge_aggressive; + + edge_repo = info->revs->repo; + edge_tree_list = root_tree_list; + mark_edges_uninteresting(info->revs, show_edge, info->prune_all_uninteresting); + info->revs->blob_objects = info->revs->tree_objects = 0; if (info->tags) { @@ -366,17 +398,10 @@ int walk_objects_by_path(struct path_walk_info *info) if (t->object.flags & SEEN) continue; t->object.flags |= SEEN; - - if (!oidset_insert(&root_tree_set, oid)) - oid_array_append(&root_tree_list->oids, oid); + oid_array_append(&root_tree_list->oids, oid); } else { warning("could not find tree %s", oid_to_hex(oid)); } - - if (t && (c->object.flags & UNINTERESTING)) { - t->object.flags |= UNINTERESTING; - has_uninteresting = 1; - } } trace2_data_intmax("path-walk", ctx.repo, "commits", commits_nr); @@ -389,21 +414,6 @@ int walk_objects_by_path(struct path_walk_info *info) oid_array_clear(&commit_list->oids); free(commit_list); - /* - * Before performing a DFS of our paths and emitting them as interesting, - * do a full walk of the trees to distribute the UNINTERESTING bit. Use - * the sparse algorithm if prune_all_uninteresting was set. - */ - if (has_uninteresting) { - trace2_region_enter("path-walk", "uninteresting-walk", info->revs->repo); - if (info->prune_all_uninteresting) - mark_trees_uninteresting_sparse(ctx.repo, &root_tree_set); - else - mark_trees_uninteresting_dense(ctx.repo, &root_tree_set); - trace2_region_leave("path-walk", "uninteresting-walk", info->revs->repo); - } - oidset_clear(&root_tree_set); - string_list_append(&ctx.path_stack, root_path); trace2_region_enter("path-walk", "path-walk", info->revs->repo); diff --git a/path-walk.h b/path-walk.h index 090cda3b5cf8f4..d19048d0d312e5 100644 --- a/path-walk.h +++ b/path-walk.h @@ -48,6 +48,13 @@ struct path_walk_info { */ int prune_all_uninteresting; + /** + * When 'edge_aggressive' is set, then the revision walk will use + * the '--object-edge-aggressive' option to mark even more objects + * as uninteresting. + */ + int edge_aggressive; + /** * Specify a sparse-checkout definition to match our paths to. Do not * walk outside of this sparse definition. If the patterns are in diff --git a/t/t6601-path-walk.sh b/t/t6601-path-walk.sh index 312bf3c19c176a..d67a077d37b99d 100755 --- a/t/t6601-path-walk.sh +++ b/t/t6601-path-walk.sh @@ -181,13 +181,13 @@ test_expect_success 'topic, not base' ' COMMIT::$(git rev-parse topic) commits:1 TREE::$(git rev-parse topic^{tree}) - TREE:left/:$(git rev-parse topic:left) + TREE:left/:$(git rev-parse base~1:left):UNINTERESTING TREE:right/:$(git rev-parse topic:right) trees:3 - BLOB:a:$(git rev-parse topic:a) - BLOB:left/b:$(git rev-parse topic:left/b) + BLOB:a:$(git rev-parse base~1:a):UNINTERESTING + BLOB:left/b:$(git rev-parse base~1:left/b):UNINTERESTING BLOB:right/c:$(git rev-parse topic:right/c) - BLOB:right/d:$(git rev-parse topic:right/d) + BLOB:right/d:$(git rev-parse base~1:right/d):UNINTERESTING blobs:4 tags:0 EOF @@ -205,10 +205,10 @@ test_expect_success 'topic, not base, only blobs' ' cat >expect <<-EOF && commits:0 trees:0 - BLOB:a:$(git rev-parse topic:a) - BLOB:left/b:$(git rev-parse topic:left/b) + BLOB:a:$(git rev-parse base~1:a):UNINTERESTING + BLOB:left/b:$(git rev-parse base~1:left/b):UNINTERESTING BLOB:right/c:$(git rev-parse topic:right/c) - BLOB:right/d:$(git rev-parse topic:right/d) + BLOB:right/d:$(git rev-parse base~1:right/d):UNINTERESTING blobs:4 tags:0 EOF @@ -246,7 +246,7 @@ test_expect_success 'topic, not base, only trees' ' cat >expect <<-EOF && commits:0 TREE::$(git rev-parse topic^{tree}) - TREE:left/:$(git rev-parse topic:left) + TREE:left/:$(git rev-parse base~1:left):UNINTERESTING TREE:right/:$(git rev-parse topic:right) trees:3 blobs:0 From 24c5b3a39653366cf91f52cfe23a91f8615e10af Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Mon, 21 Oct 2024 22:59:45 -0400 Subject: [PATCH 3/4] pack-objects: allow --shallow and --path-walk There does not appear to be anything particularly incompatible about the --shallow and --path-walk options of 'git pack-objects'. If shallow commits are to be handled differently, then it is by the revision walk that defines the commit set and which are interesting or uninteresting. However, before the previous change, a trivial removal of the warning would cause a failure in t5500-fetch-pack.sh when GIT_TEST_PACK_PATH_WALK is enabled. The shallow fetch would provide more objects than we desired, due to some incorrect behavior of the path-walk API, especially around walking uninteresting objects. To also cover the symmetrical case of pushing from a shallow clone, add a new test to t5538-push-shallow.sh that confirms the correct behavior of pushing only the new object. This works to validate both the --path-walk and --no-path-walk case when toggling the GIT_TEST_PACK_PATH_WALK environment variable. This test would have failed in the --path-walk case if we created it before the previous change. Signed-off-by: Derrick Stolee --- builtin/pack-objects.c | 7 ++----- t/t5538-push-shallow.sh | 13 +++++++++++++ 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index b49f5fe797a28d..aa694204d8b313 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -201,6 +201,7 @@ static int keep_unreachable, unpack_unreachable, include_tag; static timestamp_t unpack_unreachable_expiration; static int pack_loose_unreachable; static int cruft; +static int shallow = 0; static timestamp_t cruft_expiration; static int local; static int have_non_local_packs; @@ -4438,6 +4439,7 @@ static void get_object_list_path_walk(struct rev_info *revs) * base objects. */ info.prune_all_uninteresting = sparse; + info.edge_aggressive = shallow; if (walk_objects_by_path(&info)) die(_("failed to pack objects via path-walk")); @@ -4627,7 +4629,6 @@ int cmd_pack_objects(int argc, struct repository *repo UNUSED) { int use_internal_rev_list = 0; - int shallow = 0; int all_progress_implied = 0; struct strvec rp = STRVEC_INIT; int rev_list_unpacked = 0, rev_list_all = 0, rev_list_reflog = 0; @@ -4812,10 +4813,6 @@ int cmd_pack_objects(int argc, warning(_("cannot use delta islands with --path-walk")); path_walk = 0; } - if (path_walk && shallow) { - warning(_("cannot use --shallow with --path-walk")); - path_walk = 0; - } if (path_walk) { strvec_push(&rp, "--boundary"); /* diff --git a/t/t5538-push-shallow.sh b/t/t5538-push-shallow.sh index 6adc3a20a45b77..5a71ab92ae1ce1 100755 --- a/t/t5538-push-shallow.sh +++ b/t/t5538-push-shallow.sh @@ -124,4 +124,17 @@ EOF git cat-file blob $(echo 1|git hash-object --stdin) >/dev/null ) ' + +test_expect_success 'push new commit from shallow clone has correct object count' ' + git init origin && + test_commit -C origin a && + test_commit -C origin b && + + git clone --depth=1 "file://$(pwd)/origin" client && + git -C client checkout -b topic && + git -C client commit --allow-empty -m "empty" && + GIT_PROGRESS_DELAY=0 git -C client push --progress origin topic 2>err && + test_grep "Enumerating objects: 1, done." err +' + test_done From cec2f4aa8fbacddee0eddbf553ec7b43dd6cd483 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Tue, 22 Oct 2024 12:05:27 -0400 Subject: [PATCH 4/4] t5538: add test to confirm deltas in shallow pushes It can be notoriously difficult to detect if delta bases are being computed properly during 'git push'. Construct an example where it will make a kilobyte worth of difference when a delta base is not found. We can then use the progress indicators to distinguish between bytes and KiB depending on whether the delta base is found and used. Signed-off-by: Derrick Stolee --- t/t5538-push-shallow.sh | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/t/t5538-push-shallow.sh b/t/t5538-push-shallow.sh index 5a71ab92ae1ce1..7b3efd9b313f15 100755 --- a/t/t5538-push-shallow.sh +++ b/t/t5538-push-shallow.sh @@ -137,4 +137,25 @@ test_expect_success 'push new commit from shallow clone has correct object count test_grep "Enumerating objects: 1, done." err ' +test_expect_success 'push new commit from shallow clone has good deltas' ' + git init base && + test_seq 1 999 >base/a && + test_commit -C base initial && + git -C base add a && + git -C base commit -m "big a" && + + git clone --depth=1 "file://$(pwd)/base" deltas && + git -C deltas checkout -b deltas && + test_seq 1 1000 >deltas/a && + git -C deltas commit -a -m "bigger a" && + GIT_TRACE2_PERF="$(pwd)/trace.txt" \ + GIT_PROGRESS_DELAY=0 git -C deltas push --progress origin deltas 2>err && + + test_grep "Enumerating objects: 5, done" err && + + # If the delta base is found, then this message uses "bytes". + # If the delta base is not found, then this message uses "KiB". + test_grep "Writing objects: .* bytes" err +' + test_done