From 798ddd947ffe9d608d9aa5803dc7c409834e7159 Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Mon, 23 Jun 2025 18:32:10 -0400 Subject: [PATCH 1/9] pack-objects: use standard option incompatibility functions pack-objects has a handful of explicit checks for pairs of command-line options which are mutually incompatible. Many of these pre-date a699367bb8 (i18n: factorize more 'incompatible options' messages, 2022-01-31). Convert the explicit checks into die_for_incompatible_opt2() calls, which simplifies the implementation and standardizes pack-objects' output when given incompatible options (e.g., --stdin-packs with --filter gives different output than --keep-unreachable with --unpack-unreachable). There is one minor piece of test fallout in t5331 that expects the old format, which has been corrected. Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- builtin/pack-objects.c | 20 +++++++++++--------- t/t5331-pack-objects-stdin.sh | 2 +- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index 67941c8a60..e7274e0e00 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -5010,9 +5010,10 @@ int cmd_pack_objects(int argc, strvec_push(&rp, "--unpacked"); } - if (exclude_promisor_objects && exclude_promisor_objects_best_effort) - die(_("options '%s' and '%s' cannot be used together"), - "--exclude-promisor-objects", "--exclude-promisor-objects-best-effort"); + die_for_incompatible_opt2(exclude_promisor_objects, + "--exclude-promisor-objects", + exclude_promisor_objects_best_effort, + "--exclude-promisor-objects-best-effort"); if (exclude_promisor_objects) { use_internal_rev_list = 1; fetch_if_missing = 0; @@ -5050,13 +5051,14 @@ int cmd_pack_objects(int argc, if (!pack_to_stdout && thin) die(_("--thin cannot be used to build an indexable pack")); - if (keep_unreachable && unpack_unreachable) - die(_("options '%s' and '%s' cannot be used together"), "--keep-unreachable", "--unpack-unreachable"); + die_for_incompatible_opt2(keep_unreachable, "--keep-unreachable", + unpack_unreachable, "--unpack-unreachable"); if (!rev_list_all || !rev_list_reflog || !rev_list_index) unpack_unreachable_expiration = 0; - if (stdin_packs && filter_options.choice) - die(_("cannot use --filter with --stdin-packs")); + die_for_incompatible_opt2(stdin_packs, "--stdin-packs", + filter_options.choice, "--filter"); + if (stdin_packs && use_internal_rev_list) die(_("cannot use internal rev list with --stdin-packs")); @@ -5064,8 +5066,8 @@ int cmd_pack_objects(int argc, if (cruft) { if (use_internal_rev_list) die(_("cannot use internal rev list with --cruft")); - if (stdin_packs) - die(_("cannot use --stdin-packs with --cruft")); + die_for_incompatible_opt2(stdin_packs, "--stdin-packs", + cruft, "--cruft"); } /* diff --git a/t/t5331-pack-objects-stdin.sh b/t/t5331-pack-objects-stdin.sh index b48c0cbe8f..8fd07deb8d 100755 --- a/t/t5331-pack-objects-stdin.sh +++ b/t/t5331-pack-objects-stdin.sh @@ -64,7 +64,7 @@ test_expect_success '--stdin-packs is incompatible with --filter' ' cd stdin-packs && test_must_fail git pack-objects --stdin-packs --stdout \ --filter=blob:none err && - test_grep "cannot use --filter with --stdin-packs" err + test_grep "options .--stdin-packs. and .--filter. cannot be used together" err ) ' From 9809d4ae9f5b577e0afd18082b095414ce046c00 Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Mon, 23 Jun 2025 18:32:13 -0400 Subject: [PATCH 2/9] pack-objects: limit scope in 'add_object_entry_from_pack()' In add_object_entry_from_pack() we declare 'revs' (given to us through the miscellaneous context argument) earlier in the "if (p)" conditional than is necessary. Move it down as far as it can go to reduce its scope. Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- builtin/pack-objects.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index e7274e0e00..d04a36a6bf 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -3725,7 +3725,6 @@ static int add_object_entry_from_pack(const struct object_id *oid, return 0; if (p) { - struct rev_info *revs = _data; struct object_info oi = OBJECT_INFO_INIT; oi.typep = &type; @@ -3733,6 +3732,7 @@ static int add_object_entry_from_pack(const struct object_id *oid, die(_("could not get type of object %s in pack %s"), oid_to_hex(oid), p->pack_name); } else if (type == OBJ_COMMIT) { + struct rev_info *revs = _data; /* * commits in included packs are used as starting points for the * subsequent revision walk From 67e1a7827bf81f84ba8933d494e441139bd3f34d Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Mon, 23 Jun 2025 18:32:15 -0400 Subject: [PATCH 3/9] pack-objects: factor out handling '--stdin-packs' At the bottom of cmd_pack_objects() we check which mode the command is running in (e.g., generating a cruft pack, handling '--stdin-packs', using the internal rev-list, etc.) and handle the mode appropriately. The '--stdin-packs' case is handled inline (dating back to its introduction in 339bce27f4 (builtin/pack-objects.c: add '--stdin-packs' option, 2021-02-22)) since it is relatively short. Extract the body of "if (stdin_packs)" into its own function to prepare for the implementation to become lengthier in a following commit. Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- builtin/pack-objects.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index d04a36a6bf..7ce04b71dd 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -3909,6 +3909,17 @@ static void read_packs_list_from_stdin(void) string_list_clear(&exclude_packs, 0); } +static void add_unreachable_loose_objects(void); + +static void read_stdin_packs(int rev_list_unpacked) +{ + /* avoids adding objects in excluded packs */ + ignore_packed_keep_in_core = 1; + read_packs_list_from_stdin(); + if (rev_list_unpacked) + add_unreachable_loose_objects(); +} + static void add_cruft_object_entry(const struct object_id *oid, enum object_type type, struct packed_git *pack, off_t offset, const char *name, uint32_t mtime) @@ -4004,7 +4015,6 @@ static void mark_pack_kept_in_core(struct string_list *packs, unsigned keep) } } -static void add_unreachable_loose_objects(void); static void add_objects_in_unpacked_packs(void); static void enumerate_cruft_objects(void) @@ -5135,11 +5145,7 @@ int cmd_pack_objects(int argc, progress_state = start_progress(the_repository, _("Enumerating objects"), 0); if (stdin_packs) { - /* avoids adding objects in excluded packs */ - ignore_packed_keep_in_core = 1; - read_packs_list_from_stdin(); - if (rev_list_unpacked) - add_unreachable_loose_objects(); + read_stdin_packs(rev_list_unpacked); } else if (cruft) { read_cruft_objects(); } else if (!use_internal_rev_list) { From 97ec43247c01bc125fa9618e54f93a7dd0b52ab4 Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Mon, 23 Jun 2025 18:32:18 -0400 Subject: [PATCH 4/9] pack-objects: declare 'rev_info' for '--stdin-packs' earlier Once 'read_packs_list_from_stdin()' has called for_each_object_in_pack() on each of the input packs, we do a reachability traversal to discover names for any objects we picked up so we can generate name hash values and hopefully get higher quality deltas as a result. A future commit will change the purpose of this reachability traversal to find and pack objects which are reachable from commits in the input packs, but are packed in an unknown (not included nor excluded) pack. Extract the code which initializes and performs the reachability traversal to take place in the caller, not the callee, which prepares us to share this code for the '--unpacked' case (see the function add_unreachable_loose_objects() for more details). Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- builtin/pack-objects.c | 71 +++++++++++++++++++++--------------------- 1 file changed, 36 insertions(+), 35 deletions(-) diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index 7ce04b71dd..4258ac1792 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -3793,7 +3793,7 @@ static int pack_mtime_cmp(const void *_a, const void *_b) return 0; } -static void read_packs_list_from_stdin(void) +static void read_packs_list_from_stdin(struct rev_info *revs) { struct strbuf buf = STRBUF_INIT; struct string_list include_packs = STRING_LIST_INIT_DUP; @@ -3801,24 +3801,6 @@ static void read_packs_list_from_stdin(void) struct string_list_item *item = NULL; struct packed_git *p; - struct rev_info revs; - - repo_init_revisions(the_repository, &revs, NULL); - /* - * Use a revision walk to fill in the namehash of objects in the include - * packs. To save time, we'll avoid traversing through objects that are - * in excluded packs. - * - * That may cause us to avoid populating all of the namehash fields of - * all included objects, but our goal is best-effort, since this is only - * an optimization during delta selection. - */ - revs.no_kept_objects = 1; - revs.keep_pack_cache_flags |= IN_CORE_KEEP_PACKS; - revs.blob_objects = 1; - revs.tree_objects = 1; - revs.tag_objects = 1; - revs.ignore_missing_links = 1; while (strbuf_getline(&buf, stdin) != EOF) { if (!buf.len) @@ -3888,10 +3870,44 @@ static void read_packs_list_from_stdin(void) struct packed_git *p = item->util; for_each_object_in_pack(p, add_object_entry_from_pack, - &revs, + revs, FOR_EACH_OBJECT_PACK_ORDER); } + strbuf_release(&buf); + string_list_clear(&include_packs, 0); + string_list_clear(&exclude_packs, 0); +} + +static void add_unreachable_loose_objects(void); + +static void read_stdin_packs(int rev_list_unpacked) +{ + struct rev_info revs; + + repo_init_revisions(the_repository, &revs, NULL); + /* + * Use a revision walk to fill in the namehash of objects in the include + * packs. To save time, we'll avoid traversing through objects that are + * in excluded packs. + * + * That may cause us to avoid populating all of the namehash fields of + * all included objects, but our goal is best-effort, since this is only + * an optimization during delta selection. + */ + revs.no_kept_objects = 1; + revs.keep_pack_cache_flags |= IN_CORE_KEEP_PACKS; + revs.blob_objects = 1; + revs.tree_objects = 1; + revs.tag_objects = 1; + revs.ignore_missing_links = 1; + + /* avoids adding objects in excluded packs */ + ignore_packed_keep_in_core = 1; + read_packs_list_from_stdin(&revs); + if (rev_list_unpacked) + add_unreachable_loose_objects(); + if (prepare_revision_walk(&revs)) die(_("revision walk setup failed")); traverse_commit_list(&revs, @@ -3903,21 +3919,6 @@ static void read_packs_list_from_stdin(void) stdin_packs_found_nr); trace2_data_intmax("pack-objects", the_repository, "stdin_packs_hints", stdin_packs_hints_nr); - - strbuf_release(&buf); - string_list_clear(&include_packs, 0); - string_list_clear(&exclude_packs, 0); -} - -static void add_unreachable_loose_objects(void); - -static void read_stdin_packs(int rev_list_unpacked) -{ - /* avoids adding objects in excluded packs */ - ignore_packed_keep_in_core = 1; - read_packs_list_from_stdin(); - if (rev_list_unpacked) - add_unreachable_loose_objects(); } static void add_cruft_object_entry(const struct object_id *oid, enum object_type type, From d6220cce6beda5404effa7107b7544a3d8c6266a Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Mon, 23 Jun 2025 18:32:21 -0400 Subject: [PATCH 5/9] pack-objects: perform name-hash traversal for unpacked objects With '--unpacked', pack-objects adds loose objects (which don't appear in any of the excluded packs from '--stdin-packs') to the output pack without considering them as reachability tips for the name-hash traversal. This was an oversight in the original implementation of '--stdin-packs', since the code which enumerates and adds loose objects to the output pack (`add_unreachable_loose_objects()`) did not have access to the 'rev_info' struct found in `read_packs_list_from_stdin()`. Excluding unpacked objects from that traversal doesn't affect the correctness of the resulting pack, but it does make it harder to discover good deltas for loose objects. Now that the 'rev_info' struct is declared outside of `read_packs_list_from_stdin()`, we can pass it to `add_objects_in_unpacked_packs()` and add any loose objects as tips to the above-mentioned traversal, in theory producing slightly tighter packs as a result. Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- builtin/pack-objects.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index 4258ac1792..3437dbd7f1 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -3879,7 +3879,7 @@ static void read_packs_list_from_stdin(struct rev_info *revs) string_list_clear(&exclude_packs, 0); } -static void add_unreachable_loose_objects(void); +static void add_unreachable_loose_objects(struct rev_info *revs); static void read_stdin_packs(int rev_list_unpacked) { @@ -3906,7 +3906,7 @@ static void read_stdin_packs(int rev_list_unpacked) ignore_packed_keep_in_core = 1; read_packs_list_from_stdin(&revs); if (rev_list_unpacked) - add_unreachable_loose_objects(); + add_unreachable_loose_objects(&revs); if (prepare_revision_walk(&revs)) die(_("revision walk setup failed")); @@ -4025,7 +4025,7 @@ static void enumerate_cruft_objects(void) _("Enumerating cruft objects"), 0); add_objects_in_unpacked_packs(); - add_unreachable_loose_objects(); + add_unreachable_loose_objects(NULL); stop_progress(&progress_state); } @@ -4303,8 +4303,9 @@ static void add_objects_in_unpacked_packs(void) } static int add_loose_object(const struct object_id *oid, const char *path, - void *data UNUSED) + void *data) { + struct rev_info *revs = data; enum object_type type = oid_object_info(the_repository, oid, NULL); if (type < 0) { @@ -4325,6 +4326,10 @@ static int add_loose_object(const struct object_id *oid, const char *path, } else { add_object_entry(oid, type, "", 0); } + + if (revs && type == OBJ_COMMIT) + add_pending_oid(revs, NULL, oid, 0); + return 0; } @@ -4333,11 +4338,10 @@ static int add_loose_object(const struct object_id *oid, const char *path, * add_object_entry will weed out duplicates, so we just add every * loose object we find. */ -static void add_unreachable_loose_objects(void) +static void add_unreachable_loose_objects(struct rev_info *revs) { for_each_loose_file_in_objdir(repo_get_object_directory(the_repository), - add_loose_object, - NULL, NULL, NULL); + add_loose_object, NULL, NULL, revs); } static int has_sha1_pack_kept_or_nonlocal(const struct object_id *oid) @@ -4684,7 +4688,7 @@ static void get_object_list(struct rev_info *revs, int ac, const char **av) if (keep_unreachable) add_objects_in_unpacked_packs(); if (pack_loose_unreachable) - add_unreachable_loose_objects(); + add_unreachable_loose_objects(NULL); if (unpack_unreachable) loosen_unused_packed_objects(); From 8ed5d87bdd03469249373dead5d12ff4590bcccc Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Mon, 23 Jun 2025 18:32:24 -0400 Subject: [PATCH 6/9] pack-objects: fix typo in 'show_object_pack_hint()' Noticed-by: Elijah Newren Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- builtin/pack-objects.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index 3437dbd7f1..9580b4ea1a 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -3767,7 +3767,7 @@ static void show_object_pack_hint(struct object *object, const char *name, * would typically pick up during a reachability traversal. * * Make a best-effort attempt to fill in the ->hash and ->no_try_delta - * here using a now in order to perhaps improve the delta selection + * fields here in order to perhaps improve the delta selection * process. */ oe->hash = pack_name_hash_fn(name); From 63195f013b845b02063b21162fa60fcfb8b631ef Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Mon, 23 Jun 2025 18:32:27 -0400 Subject: [PATCH 7/9] pack-objects: swap 'show_{object,commit}_pack_hint' show_commit_pack_hint() has heretofore been a noop, so its position within its compilation unit only needs to appear before its first use. But the following commit will sometimes have `show_commit_pack_hint()` call `show_object_pack_hint()`, so reorder the former to appear after the latter to minimize the code movement in that patch. Suggested-by: Elijah Newren Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- builtin/pack-objects.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index 9580b4ea1a..f44447a3f9 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -3748,12 +3748,6 @@ static int add_object_entry_from_pack(const struct object_id *oid, return 0; } -static void show_commit_pack_hint(struct commit *commit UNUSED, - void *data UNUSED) -{ - /* nothing to do; commits don't have a namehash */ -} - static void show_object_pack_hint(struct object *object, const char *name, void *data UNUSED) { @@ -3776,6 +3770,12 @@ static void show_object_pack_hint(struct object *object, const char *name, stdin_packs_hints_nr++; } +static void show_commit_pack_hint(struct commit *commit UNUSED, + void *data UNUSED) +{ + /* nothing to do; commits don't have a namehash */ +} + static int pack_mtime_cmp(const void *_a, const void *_b) { struct packed_git *a = ((const struct string_list_item*)_a)->util; From cd846bacc7dce3e71137e320adb01f5923353800 Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Mon, 23 Jun 2025 18:32:30 -0400 Subject: [PATCH 8/9] pack-objects: introduce '--stdin-packs=follow' When invoked with '--stdin-packs', pack-objects will generate a pack which contains the objects found in the "included" packs, less any objects from "excluded" packs. Packs that exist in the repository but weren't specified as either included or excluded are in practice treated like the latter, at least in the sense that pack-objects won't include objects from those packs. This behavior forces us to include any cruft pack(s) in a repository's multi-pack index for the reasons described in ddee3703b3 (builtin/repack.c: add cruft packs to MIDX during geometric repack, 2022-05-20). The full details are in ddee3703b3, but the gist is if you have a once-unreachable object in a cruft pack which later becomes reachable via one or more commits in a pack generated with '--stdin-packs', you *have* to include that object in the MIDX via the copy in the cruft pack, otherwise we cannot generate reachability bitmaps for any commits which reach that object. Note that the traversal here is best-effort, similar to the existing traversal which provides name-hash hints. This means that the object traversal may hand us back a blob that does not actually exist. We *won't* see missing trees/commits with 'ignore_missing_links' because: - missing commit parents are discarded at the commit traversal stage by revision.c::process_parents() - missing tag objects are discarded by revision.c::handle_commit() - missing tree objects are discarded by the list-objects code in list-objects.c::process_tree() But we have to handle potentially-missing blobs specially by making a separate check to ensure they exist in the repository. Failing to do so would mean that we'd add an object to the packing list which doesn't actually exist, rendering us unable to write out the pack. This prepares us for new repacking behavior which will "resurrect" objects found in cruft or otherwise unspecified packs when generating new packs. In the context of geometric repacking, this may be used to maintain a sequence of geometrically-repacked packs, the union of which is closed under reachability, even in the case described earlier. Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- Documentation/git-pack-objects.adoc | 10 ++- builtin/pack-objects.c | 86 +++++++++++++++----- t/t5331-pack-objects-stdin.sh | 120 ++++++++++++++++++++++++++++ 3 files changed, 193 insertions(+), 23 deletions(-) diff --git a/Documentation/git-pack-objects.adoc b/Documentation/git-pack-objects.adoc index b1c5aa27da..eba014c406 100644 --- a/Documentation/git-pack-objects.adoc +++ b/Documentation/git-pack-objects.adoc @@ -87,13 +87,21 @@ base-name:: reference was included in the resulting packfile. This can be useful to send new tags to native Git clients. ---stdin-packs:: +--stdin-packs[=]:: Read the basenames of packfiles (e.g., `pack-1234abcd.pack`) from the standard input, instead of object names or revision arguments. The resulting pack contains all objects listed in the included packs (those not beginning with `^`), excluding any objects listed in the excluded packs (beginning with `^`). + +When `mode` is "follow", objects from packs not listed on stdin receive +special treatment. Objects within unlisted packs will be included if +those objects are (1) reachable from the included packs, and (2) not +found in any excluded packs. This mode is useful, for example, to +resurrect once-unreachable objects found in cruft packs to generate +packs which are closed under reachability up to the boundary set by the +excluded packs. ++ Incompatible with `--revs`, or options that imply `--revs` (such as `--all`), with the exception of `--unpacked`, which is compatible. diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index f44447a3f9..4ae52c6a29 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -284,6 +284,12 @@ static struct oidmap configured_exclusions; static struct oidset excluded_by_config; static int name_hash_version = -1; +enum stdin_packs_mode { + STDIN_PACKS_MODE_NONE, + STDIN_PACKS_MODE_STANDARD, + STDIN_PACKS_MODE_FOLLOW, +}; + /** * Check whether the name_hash_version chosen by user input is appropriate, * and also validate whether it is compatible with other features. @@ -3749,31 +3755,47 @@ static int add_object_entry_from_pack(const struct object_id *oid, } static void show_object_pack_hint(struct object *object, const char *name, - void *data UNUSED) + void *data) { - struct object_entry *oe = packlist_find(&to_pack, &object->oid); - if (!oe) - return; + enum stdin_packs_mode mode = *(enum stdin_packs_mode *)data; + if (mode == STDIN_PACKS_MODE_FOLLOW) { + if (object->type == OBJ_BLOB && + !has_object(the_repository, &object->oid, 0)) + return; + add_object_entry(&object->oid, object->type, name, 0); + } else { + struct object_entry *oe = packlist_find(&to_pack, &object->oid); + if (!oe) + return; - /* - * Our 'to_pack' list was constructed by iterating all objects packed in - * included packs, and so doesn't have a non-zero hash field that you - * would typically pick up during a reachability traversal. - * - * Make a best-effort attempt to fill in the ->hash and ->no_try_delta - * fields here in order to perhaps improve the delta selection - * process. - */ - oe->hash = pack_name_hash_fn(name); - oe->no_try_delta = name && no_try_delta(name); + /* + * Our 'to_pack' list was constructed by iterating all + * objects packed in included packs, and so doesn't have + * a non-zero hash field that you would typically pick + * up during a reachability traversal. + * + * Make a best-effort attempt to fill in the ->hash and + * ->no_try_delta fields here in order to perhaps + * improve the delta selection process. + */ + oe->hash = pack_name_hash_fn(name); + oe->no_try_delta = name && no_try_delta(name); - stdin_packs_hints_nr++; + stdin_packs_hints_nr++; + } } -static void show_commit_pack_hint(struct commit *commit UNUSED, - void *data UNUSED) +static void show_commit_pack_hint(struct commit *commit, void *data) { + enum stdin_packs_mode mode = *(enum stdin_packs_mode *)data; + + if (mode == STDIN_PACKS_MODE_FOLLOW) { + show_object_pack_hint((struct object *)commit, "", data); + return; + } + /* nothing to do; commits don't have a namehash */ + } static int pack_mtime_cmp(const void *_a, const void *_b) @@ -3881,7 +3903,7 @@ static void read_packs_list_from_stdin(struct rev_info *revs) static void add_unreachable_loose_objects(struct rev_info *revs); -static void read_stdin_packs(int rev_list_unpacked) +static void read_stdin_packs(enum stdin_packs_mode mode, int rev_list_unpacked) { struct rev_info revs; @@ -3913,7 +3935,7 @@ static void read_stdin_packs(int rev_list_unpacked) traverse_commit_list(&revs, show_commit_pack_hint, show_object_pack_hint, - NULL); + &mode); trace2_data_intmax("pack-objects", the_repository, "stdin_packs_found", stdin_packs_found_nr); @@ -4795,6 +4817,23 @@ static int is_not_in_promisor_pack(struct commit *commit, void *data) { return is_not_in_promisor_pack_obj((struct object *) commit, data); } +static int parse_stdin_packs_mode(const struct option *opt, const char *arg, + int unset) +{ + enum stdin_packs_mode *mode = opt->value; + + if (unset) + *mode = STDIN_PACKS_MODE_NONE; + else if (!arg || !*arg) + *mode = STDIN_PACKS_MODE_STANDARD; + else if (!strcmp(arg, "follow")) + *mode = STDIN_PACKS_MODE_FOLLOW; + else + die(_("invalid value for '%s': '%s'"), opt->long_name, arg); + + return 0; +} + int cmd_pack_objects(int argc, const char **argv, const char *prefix, @@ -4805,7 +4844,7 @@ int cmd_pack_objects(int argc, struct strvec rp = STRVEC_INIT; int rev_list_unpacked = 0, rev_list_all = 0, rev_list_reflog = 0; int rev_list_index = 0; - int stdin_packs = 0; + enum stdin_packs_mode stdin_packs = STDIN_PACKS_MODE_NONE; struct string_list keep_pack_list = STRING_LIST_INIT_NODUP; struct list_objects_filter_options filter_options = LIST_OBJECTS_FILTER_INIT; @@ -4860,6 +4899,9 @@ int cmd_pack_objects(int argc, OPT_SET_INT_F(0, "indexed-objects", &rev_list_index, N_("include objects referred to by the index"), 1, PARSE_OPT_NONEG), + OPT_CALLBACK_F(0, "stdin-packs", &stdin_packs, N_("mode"), + N_("read packs from stdin"), + PARSE_OPT_OPTARG, parse_stdin_packs_mode), OPT_BOOL(0, "stdin-packs", &stdin_packs, N_("read packs from stdin")), OPT_BOOL(0, "stdout", &pack_to_stdout, @@ -5150,7 +5192,7 @@ int cmd_pack_objects(int argc, progress_state = start_progress(the_repository, _("Enumerating objects"), 0); if (stdin_packs) { - read_stdin_packs(rev_list_unpacked); + read_stdin_packs(stdin_packs, rev_list_unpacked); } else if (cruft) { read_cruft_objects(); } else if (!use_internal_rev_list) { diff --git a/t/t5331-pack-objects-stdin.sh b/t/t5331-pack-objects-stdin.sh index 8fd07deb8d..4a8df5a389 100755 --- a/t/t5331-pack-objects-stdin.sh +++ b/t/t5331-pack-objects-stdin.sh @@ -236,4 +236,124 @@ test_expect_success 'pack-objects --stdin with packfiles from main and alternate test_cmp expected-objects actual-objects ' +objdir=.git/objects +packdir=$objdir/pack + +objects_in_packs () { + for p in "$@" + do + git show-index <"$packdir/pack-$p.idx" || return 1 + done >objects.raw && + + cut -d' ' -f2 objects.raw | sort && + rm -f objects.raw +} + +test_expect_success '--stdin-packs=follow walks into unknown packs' ' + test_when_finished "rm -fr repo" && + + git init repo && + ( + cd repo && + + for c in A B C D + do + test_commit "$c" || return 1 + done && + + A="$(echo A | git pack-objects --revs $packdir/pack)" && + B="$(echo A..B | git pack-objects --revs $packdir/pack)" && + C="$(echo B..C | git pack-objects --revs $packdir/pack)" && + D="$(echo C..D | git pack-objects --revs $packdir/pack)" && + test_commit E && + + git prune-packed && + + cat >in <<-EOF && + pack-$B.pack + ^pack-$C.pack + pack-$D.pack + EOF + + # With just --stdin-packs, pack "A" is unknown to us, so + # only objects from packs "B" and "D" are included in + # the output pack. + P=$(git pack-objects --stdin-packs $packdir/pack expect && + objects_in_packs $P >actual && + test_cmp expect actual && + + # But with --stdin-packs=follow, objects from both + # included packs reach objects from the unknown pack, so + # objects from pack "A" is included in the output pack + # in addition to the above. + P=$(git pack-objects --stdin-packs=follow $packdir/pack expect && + objects_in_packs $P >actual && + test_cmp expect actual && + + # And with --unpacked, we will pick up objects from unknown + # packs that are reachable from loose objects. Loose object E + # reaches objects in pack A, but there are three excluded packs + # in between. + # + # The resulting pack should include objects reachable from E + # that are not present in packs B, C, or D, along with those + # present in pack A. + cat >in <<-EOF && + ^pack-$B.pack + ^pack-$C.pack + ^pack-$D.pack + EOF + + P=$(git pack-objects --stdin-packs=follow --unpacked \ + $packdir/pack expect.raw && + sort expect.raw >expect && + objects_in_packs $P >actual && + test_cmp expect actual + ) +' + +stdin_packs__follow_with_only () { + rm -fr stdin_packs__follow_with_only && + git init stdin_packs__follow_with_only && + ( + cd stdin_packs__follow_with_only && + + test_commit A && + test_commit B && + + git rev-parse "$@" >B.objects && + + echo A | git pack-objects --revs $packdir/pack && + B="$(git pack-objects $packdir/pack objs && + for obj in $(cat objs) + do + rm -f $objdir/$(test_oid_to_path $obj) || return 1 + done && + + ( cd $packdir && ls pack-*.pack ) >in && + git pack-objects --stdin-packs=follow --stdout >/dev/null Date: Mon, 23 Jun 2025 18:32:32 -0400 Subject: [PATCH 9/9] repack: exclude cruft pack(s) from the MIDX where possible In ddee3703b3 (builtin/repack.c: add cruft packs to MIDX during geometric repack, 2022-05-20), repack began adding cruft pack(s) to the MIDX with '--write-midx' to ensure that the resulting MIDX was always closed under reachability in order to generate reachability bitmaps. While the previous patch added the '--stdin-packs=follow' option to pack-objects, it is not yet on by default. Given that, suppose you have a once-unreachable object packed in a cruft pack, which later becomes reachable from one or more objects in a geometrically repacked pack. That once-unreachable object *won't* appear in the new pack, since the cruft pack was not specified as included or excluded when the geometrically repacked pack was created with 'pack-objects --stdin-packs' (*not* '--stdin-packs=follow', which is not on). If that new pack is included in a MIDX without the cruft pack, then trying to generate bitmaps for that MIDX may fail. This happens when the bitmap selection process picks one or more commits which reach the once-unreachable objects. To mitigate this failure mode, commit ddee3703b3 ensures that the MIDX will be closed under reachability by including cruft pack(s). If cruft pack(s) were not included, we would fail to generate a MIDX bitmap. But ddee3703b3 alludes to the fact that this is sub-optimal by saying [...] it's desirable to avoid including cruft packs in the MIDX because it causes the MIDX to store a bunch of objects which are likely to get thrown away. , which is true, but hides an even larger problem. If repositories rarely prune their unreachable objects and/or have many of them, the MIDX must keep track of a large number of objects which bloats the MIDX and slows down object lookup. This is doubly unfortunate because the vast majority of objects in cruft pack(s) are unlikely to be read. But any object lookups that go through the MIDX must binary search over them anyway, slowing down object lookups using the MIDX. This patch causes geometrically-repacked packs to contain a copy of any once-unreachable object(s) with 'git pack-objects --stdin-packs=follow', allowing us to avoid including any cruft packs in the MIDX. This is because a sequence of geometrically-repacked packs that were all generated with '--stdin-packs=follow' are guaranteed to have their union be closed under reachability. Note that you cannot guarantee that a collection of packs is closed under reachability if not all of them were generated with "following" as above. One tell-tale sign that not all geometrically-repacked packs in the MIDX were generated with "following" is to see if there is a pack in the existing MIDX that is not going to be somehow represented (either verbatim or as part of a geometric rollup) in the new MIDX. If there is, then starting to generate packs with "following" during geometric repacking won't work, since it's open to the same race as described above. But if you're starting from scratch (e.g., building the first MIDX after an all-into-one '--cruft' repack), then you can guarantee that the union of subsequently generated packs from geometric repacking *is* closed under reachability. (One exception here is when "starting from scratch" results in a noop repack, e.g., because the non-cruft pack(s) in a repository already form a geometric progression. Since we can't tell whether or not those were generated with '--stdin-packs=follow', they may depend on once-unreachable objects, so we have to include the cruft pack in the MIDX in this case.) Detect when this is the case and avoid including cruft packs in the MIDX where possible. The existing behavior remains the default, and the new behavior is available with the config 'repack.midxMustIncludeCruft' set to 'false'. Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- Documentation/config/repack.adoc | 7 ++ builtin/repack.c | 187 +++++++++++++++++++++++++++---- t/t7704-repack-cruft.sh | 145 ++++++++++++++++++++++++ 3 files changed, 319 insertions(+), 20 deletions(-) diff --git a/Documentation/config/repack.adoc b/Documentation/config/repack.adoc index c79af6d7b8..e9e78dcb19 100644 --- a/Documentation/config/repack.adoc +++ b/Documentation/config/repack.adoc @@ -39,3 +39,10 @@ repack.cruftThreads:: a cruft pack and the respective parameters are not given over the command line. See similarly named `pack.*` configuration variables for defaults and meaning. + +repack.midxMustContainCruft:: + When set to true, linkgit:git-repack[1] will unconditionally include + cruft pack(s), if any, in the multi-pack index when invoked with + `--write-midx`. When false, cruft packs are only included in the MIDX + when necessary (e.g., because they might be required to form a + reachability closure with MIDX bitmaps). Defaults to true. diff --git a/builtin/repack.c b/builtin/repack.c index 5ddc6e7f95..8d1540a0fd 100644 --- a/builtin/repack.c +++ b/builtin/repack.c @@ -39,6 +39,7 @@ static int write_bitmaps = -1; static int use_delta_islands; static int run_update_server_info = 1; static char *packdir, *packtmp_name, *packtmp; +static int midx_must_contain_cruft = 1; static const char *const git_repack_usage[] = { N_("git repack [-a] [-A] [-d] [-f] [-F] [-l] [-n] [-q] [-b] [-m]\n" @@ -108,6 +109,10 @@ static int repack_config(const char *var, const char *value, free(cruft_po_args->threads); return git_config_string(&cruft_po_args->threads, var, value); } + if (!strcmp(var, "repack.midxmustcontaincruft")) { + midx_must_contain_cruft = git_config_bool(var, value); + return 0; + } return git_default_config(var, value, ctx, cb); } @@ -690,6 +695,77 @@ static void free_pack_geometry(struct pack_geometry *geometry) free(geometry->pack); } +static int midx_has_unknown_packs(char **midx_pack_names, + size_t midx_pack_names_nr, + struct string_list *include, + struct pack_geometry *geometry, + struct existing_packs *existing) +{ + size_t i; + + string_list_sort(include); + + for (i = 0; i < midx_pack_names_nr; i++) { + const char *pack_name = midx_pack_names[i]; + + /* + * Determine whether or not each MIDX'd pack from the existing + * MIDX (if any) is represented in the new MIDX. For each pack + * in the MIDX, it must either be: + * + * - In the "include" list of packs to be included in the new + * MIDX. Note this function is called before the include + * list is populated with any cruft pack(s). + * + * - Below the geometric split line (if using pack geometry), + * indicating that the pack won't be included in the new + * MIDX, but its contents were rolled up as part of the + * geometric repack. + * + * - In the existing non-kept packs list (if not using pack + * geometry), and marked as non-deleted. + */ + if (string_list_has_string(include, pack_name)) { + continue; + } else if (geometry) { + struct strbuf buf = STRBUF_INIT; + uint32_t j; + + for (j = 0; j < geometry->split; j++) { + strbuf_reset(&buf); + strbuf_addstr(&buf, pack_basename(geometry->pack[j])); + strbuf_strip_suffix(&buf, ".pack"); + strbuf_addstr(&buf, ".idx"); + + if (!strcmp(pack_name, buf.buf)) { + strbuf_release(&buf); + break; + } + } + + strbuf_release(&buf); + + if (j < geometry->split) + continue; + } else { + struct string_list_item *item; + + item = string_list_lookup(&existing->non_kept_packs, + pack_name); + if (item && !pack_is_marked_for_deletion(item)) + continue; + } + + /* + * If we got to this point, the MIDX includes some pack that we + * don't know about. + */ + return 1; + } + + return 0; +} + struct midx_snapshot_ref_data { struct tempfile *f; struct oidset seen; @@ -758,6 +834,8 @@ static void midx_snapshot_refs(struct tempfile *f) static void midx_included_packs(struct string_list *include, struct existing_packs *existing, + char **midx_pack_names, + size_t midx_pack_names_nr, struct string_list *names, struct pack_geometry *geometry) { @@ -811,26 +889,56 @@ static void midx_included_packs(struct string_list *include, } } - for_each_string_list_item(item, &existing->cruft_packs) { + if (midx_must_contain_cruft || + midx_has_unknown_packs(midx_pack_names, midx_pack_names_nr, + include, geometry, existing)) { /* - * When doing a --geometric repack, there is no need to check - * for deleted packs, since we're by definition not doing an - * ALL_INTO_ONE repack (hence no packs will be deleted). - * Otherwise we must check for and exclude any packs which are - * enqueued for deletion. + * If there are one or more unknown pack(s) present (see + * midx_has_unknown_packs() for what makes a pack + * "unknown") in the MIDX before the repack, keep them + * as they may be required to form a reachability + * closure if the MIDX is bitmapped. * - * So we could omit the conditional below in the --geometric - * case, but doing so is unnecessary since no packs are marked - * as pending deletion (since we only call - * `mark_packs_for_deletion()` when doing an all-into-one - * repack). + * For example, a cruft pack can be required to form a + * reachability closure if the MIDX is bitmapped and one + * or more of the bitmap's selected commits reaches a + * once-cruft object that was later made reachable. */ - if (pack_is_marked_for_deletion(item)) - continue; + for_each_string_list_item(item, &existing->cruft_packs) { + /* + * When doing a --geometric repack, there is no + * need to check for deleted packs, since we're + * by definition not doing an ALL_INTO_ONE + * repack (hence no packs will be deleted). + * Otherwise we must check for and exclude any + * packs which are enqueued for deletion. + * + * So we could omit the conditional below in the + * --geometric case, but doing so is unnecessary + * since no packs are marked as pending + * deletion (since we only call + * `mark_packs_for_deletion()` when doing an + * all-into-one repack). + */ + if (pack_is_marked_for_deletion(item)) + continue; - strbuf_reset(&buf); - strbuf_addf(&buf, "%s.idx", item->string); - string_list_insert(include, buf.buf); + strbuf_reset(&buf); + strbuf_addf(&buf, "%s.idx", item->string); + string_list_insert(include, buf.buf); + } + } else { + /* + * Modern versions of Git (with the appropriate + * configuration setting) will write new copies of + * once-cruft objects when doing a --geometric repack. + * + * If the MIDX has no cruft pack, new packs written + * during a --geometric repack will not rely on the + * cruft pack to form a reachability closure, so we can + * avoid including them in the MIDX in that case. + */ + ; } strbuf_release(&buf); @@ -1145,6 +1253,8 @@ int cmd_repack(int argc, struct tempfile *refs_snapshot = NULL; int i, ext, ret; int show_progress; + char **midx_pack_names = NULL; + size_t midx_pack_names_nr = 0; /* variables to be filled by option parsing */ int delete_redundant = 0; @@ -1361,7 +1471,10 @@ int cmd_repack(int argc, !(pack_everything & PACK_CRUFT)) strvec_push(&cmd.args, "--pack-loose-unreachable"); } else if (geometry.split_factor) { - strvec_push(&cmd.args, "--stdin-packs"); + if (midx_must_contain_cruft) + strvec_push(&cmd.args, "--stdin-packs"); + else + strvec_push(&cmd.args, "--stdin-packs=follow"); strvec_push(&cmd.args, "--unpacked"); } else { strvec_push(&cmd.args, "--unpacked"); @@ -1401,8 +1514,25 @@ int cmd_repack(int argc, if (ret) goto cleanup; - if (!names.nr && !po_args.quiet) - printf_ln(_("Nothing new to pack.")); + if (!names.nr) { + if (!po_args.quiet) + printf_ln(_("Nothing new to pack.")); + /* + * If we didn't write any new packs, the non-cruft packs + * may refer to once-unreachable objects in the cruft + * pack(s). + * + * If there isn't already a MIDX, the one we write + * must include the cruft pack(s), in case the + * non-cruft pack(s) refer to once-cruft objects. + * + * If there is already a MIDX, we can punt here, since + * midx_has_unknown_packs() will make the decision for + * us. + */ + if (!get_local_multi_pack_index(the_repository)) + midx_must_contain_cruft = 1; + } if (pack_everything & PACK_CRUFT) { const char *pack_prefix = find_pack_prefix(packdir, packtmp); @@ -1483,6 +1613,19 @@ int cmd_repack(int argc, string_list_sort(&names); + if (get_local_multi_pack_index(the_repository)) { + struct multi_pack_index *m = + get_local_multi_pack_index(the_repository); + + ALLOC_ARRAY(midx_pack_names, + m->num_packs + m->num_packs_in_base); + + for (; m; m = m->base_midx) + for (uint32_t i = 0; i < m->num_packs; i++) + midx_pack_names[midx_pack_names_nr++] = + xstrdup(m->pack_names[i]); + } + close_object_store(the_repository->objects); /* @@ -1524,7 +1667,8 @@ int cmd_repack(int argc, if (write_midx) { struct string_list include = STRING_LIST_INIT_DUP; - midx_included_packs(&include, &existing, &names, &geometry); + midx_included_packs(&include, &existing, midx_pack_names, + midx_pack_names_nr, &names, &geometry); ret = write_midx_included_packs(&include, &geometry, &names, refs_snapshot ? get_tempfile_path(refs_snapshot) : NULL, @@ -1575,6 +1719,9 @@ cleanup: string_list_clear(&names, 1); existing_packs_release(&existing); free_pack_geometry(&geometry); + for (size_t i = 0; i < midx_pack_names_nr; i++) + free(midx_pack_names[i]); + free(midx_pack_names); pack_objects_args_release(&po_args); pack_objects_args_release(&cruft_po_args); diff --git a/t/t7704-repack-cruft.sh b/t/t7704-repack-cruft.sh index 8aebfb45f5..aa2e2e6ad8 100755 --- a/t/t7704-repack-cruft.sh +++ b/t/t7704-repack-cruft.sh @@ -724,4 +724,149 @@ test_expect_success 'cruft repack respects --quiet' ' ) ' +setup_cruft_exclude_tests() { + git init "$1" && + ( + cd "$1" && + + git config repack.midxMustContainCruft false && + + test_commit one && + + test_commit --no-tag two && + two="$(git rev-parse HEAD)" && + test_commit --no-tag three && + three="$(git rev-parse HEAD)" && + git reset --hard one && + git reflog expire --all --expire=all && + + GIT_TEST_MULTI_PACK_INDEX=0 git repack --cruft -d && + + git merge $two && + test_commit four + ) +} + +test_expect_success 'repack --write-midx excludes cruft where possible' ' + setup_cruft_exclude_tests exclude-cruft-when-possible && + ( + cd exclude-cruft-when-possible && + + GIT_TEST_MULTI_PACK_INDEX=0 \ + git repack -d --geometric=2 --write-midx --write-bitmap-index && + + test-tool read-midx --show-objects $objdir >midx && + cruft="$(ls $packdir/*.mtimes)" && + test_grep ! "$(basename "$cruft" .mtimes).idx" midx && + + git rev-list --all --objects --no-object-names >reachable.raw && + sort reachable.raw >reachable.objects && + awk "/\.pack$/ { print \$1 }" midx.objects && + + test_cmp reachable.objects midx.objects + ) +' + +test_expect_success 'repack --write-midx includes cruft when instructed' ' + setup_cruft_exclude_tests exclude-cruft-when-instructed && + ( + cd exclude-cruft-when-instructed && + + GIT_TEST_MULTI_PACK_INDEX=0 \ + git -c repack.midxMustContainCruft=true repack \ + -d --geometric=2 --write-midx --write-bitmap-index && + + test-tool read-midx --show-objects $objdir >midx && + cruft="$(ls $packdir/*.mtimes)" && + test_grep "$(basename "$cruft" .mtimes).idx" midx && + + git cat-file --batch-check="%(objectname)" --batch-all-objects \ + >all.objects && + awk "/\.pack$/ { print \$1 }" midx.objects && + + test_cmp all.objects midx.objects + ) +' + +test_expect_success 'repack --write-midx includes cruft when necessary' ' + setup_cruft_exclude_tests exclude-cruft-when-necessary && + ( + cd exclude-cruft-when-necessary && + + test_path_is_file $(ls $packdir/pack-*.mtimes) && + ( cd $packdir && ls pack-*.idx ) | sort >packs.all && + git multi-pack-index write --stdin-packs --bitmap midx && + awk "/\.pack$/ { print \$1 }" midx.objects && + git cat-file --batch-all-objects --batch-check="%(objectname)" \ + >expect.objects && + test_cmp expect.objects midx.objects && + + grep "^pack-" midx >midx.packs && + test_line_count = "$(($(wc -l packs.all && + cruft="$(ls $packdir/pack-*.mtimes)" && + cruft="${cruft%.mtimes}.idx" && + + for idx in $(grep -v $cruft out && + wc -l sizes.raw && + + # Make sure that there are two non-cruft packs, and + # that one of them contains at least twice as many + # objects as the other, ensuring that they are already + # in a geometric progression. + sort -n sizes.raw >sizes && + test_line_count = 2 sizes && + s1=$(head -n 1 sizes) && + s2=$(tail -n 1 sizes) && + test "$s2" -gt "$((2 * $s1))" && + + git -c repack.midxMustContainCruft=false repack --geometric=2 \ + --write-midx --write-bitmap-index + ) +' + test_done