Merge branch 'ds/path-walk-2'

"git pack-objects" learns to find delta bases from blobs at the
same path, using the --path-walk API.

* ds/path-walk-2:
  pack-objects: allow --shallow and --path-walk
  path-walk: add new 'edge_aggressive' option
  pack-objects: thread the path-based compression
  pack-objects: refactor path-walk delta phase
  scalar: enable path-walk during push via config
  pack-objects: enable --path-walk via config
  repack: add --path-walk option
  t5538: add tests to confirm deltas in shallow pushes
  pack-objects: introduce GIT_TEST_PACK_PATH_WALK
  p5313: add performance tests for --path-walk
  pack-objects: update usage to match docs
  pack-objects: add --path-walk option
  pack-objects: extract should_attempt_deltas()
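
In short, the --path-walk API hands its callback every object that appears at a single path in one batch, which lets delta search prefer bases that are earlier versions of the same file. Below is a minimal, hypothetical sketch of such a callback, shaped like the path_fn used by add_objects_by_path() in the diff; count_objects_at_path and its accumulator are illustrative only and assume the usual git-compat-util.h environment.

/* Hypothetical example (not part of this series): a path_fn callback
 * with the same shape as add_objects_by_path() below.  Each call hands
 * over every object id found at one path, which is exactly the grouping
 * the path-based delta phase exploits.
 */
static int count_objects_at_path(const char *path,
				 struct oid_array *oids,
				 enum object_type type,
				 void *data)
{
	size_t *total = data;	/* caller-provided accumulator */

	fprintf(stderr, "%s: %"PRIuMAX" object(s) of type %d\n",
		path, (uintmax_t)oids->nr, type);
	*total += oids->nr;
	return 0;	/* returning 0 keeps the walk going */
}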
Junio C Hamano, 2025-06-17 10:44:38 -07:00
27 changed files with 620 additions and 66 deletions

builtin/pack-objects.c

@@ -41,6 +41,10 @@
#include "promisor-remote.h"
#include "pack-mtimes.h"
#include "parse-options.h"
#include "blob.h"
#include "tree.h"
#include "path-walk.h"
#include "trace2.h"
/*
* Objects we are going to pack are collected in the `to_pack` structure.
@@ -184,8 +188,14 @@ static inline void oe_set_delta_size(struct packing_data *pack,
#define SET_DELTA_SIBLING(obj, val) oe_set_delta_sibling(&to_pack, obj, val)
static const char *const pack_usage[] = {
N_("git pack-objects --stdout [<options>] [< <ref-list> | < <object-list>]"),
N_("git pack-objects [<options>] <base-name> [< <ref-list> | < <object-list>]"),
N_("git pack-objects [-q | --progress | --all-progress] [--all-progress-implied]\n"
" [--no-reuse-delta] [--delta-base-offset] [--non-empty]\n"
" [--local] [--incremental] [--window=<n>] [--depth=<n>]\n"
" [--revs [--unpacked | --all]] [--keep-pack=<pack-name>]\n"
" [--cruft] [--cruft-expiration=<time>]\n"
" [--stdout [--filter=<filter-spec>] | <base-name>]\n"
" [--shallow] [--keep-true-parents] [--[no-]sparse]\n"
" [--name-hash-version=<n>] [--path-walk] < <object-list>"),
NULL
};
@@ -200,6 +210,7 @@ static int keep_unreachable, unpack_unreachable, include_tag;
static timestamp_t unpack_unreachable_expiration;
static int pack_loose_unreachable;
static int cruft;
static int shallow = 0;
static timestamp_t cruft_expiration;
static int local;
static int have_non_local_packs;
@@ -218,6 +229,7 @@ static int delta_search_threads;
static int pack_to_stdout;
static int sparse;
static int thin;
static int path_walk = -1;
static int num_preferred_base;
static struct progress *progress_state;
@@ -3041,6 +3053,7 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
struct thread_params {
pthread_t thread;
struct object_entry **list;
struct packing_region *regions;
unsigned list_size;
unsigned remaining;
int window;
@@ -3283,6 +3296,242 @@ static int add_ref_tag(const char *tag UNUSED, const char *referent UNUSED, cons
return 0;
}
static int should_attempt_deltas(struct object_entry *entry)
{
if (DELTA(entry))
/* This happens if we decided to reuse existing
* delta from a pack. "reuse_delta &&" is implied.
*/
return 0;
if (!entry->type_valid ||
oe_size_less_than(&to_pack, entry, 50))
return 0;
if (entry->no_try_delta)
return 0;
if (!entry->preferred_base) {
if (oe_type(entry) < 0)
die(_("unable to get type of object %s"),
oid_to_hex(&entry->idx.oid));
} else if (oe_type(entry) < 0) {
/*
* This object is not found, but we
* don't have to include it anyway.
*/
return 0;
}
return 1;
}
static void find_deltas_for_region(struct object_entry *list,
struct packing_region *region,
unsigned int *processed)
{
struct object_entry **delta_list;
unsigned int delta_list_nr = 0;
ALLOC_ARRAY(delta_list, region->nr);
for (size_t i = 0; i < region->nr; i++) {
struct object_entry *entry = list + region->start + i;
if (should_attempt_deltas(entry))
delta_list[delta_list_nr++] = entry;
}
QSORT(delta_list, delta_list_nr, type_size_sort);
find_deltas(delta_list, &delta_list_nr, window, depth, processed);
free(delta_list);
}
static void find_deltas_by_region(struct object_entry *list,
struct packing_region *regions,
size_t start, size_t nr)
{
unsigned int processed = 0;
size_t progress_nr;
if (!nr)
return;
progress_nr = regions[nr - 1].start + regions[nr - 1].nr;
if (progress)
progress_state = start_progress(the_repository,
_("Compressing objects by path"),
progress_nr);
while (nr--)
find_deltas_for_region(list,
&regions[start++],
&processed);
display_progress(progress_state, progress_nr);
stop_progress(&progress_state);
}
static void *threaded_find_deltas_by_path(void *arg)
{
struct thread_params *me = arg;
progress_lock();
while (me->remaining) {
while (me->remaining) {
progress_unlock();
find_deltas_for_region(to_pack.objects,
me->regions,
me->processed);
progress_lock();
me->remaining--;
me->regions++;
}
me->working = 0;
pthread_cond_signal(&progress_cond);
progress_unlock();
/*
* We must not set ->data_ready before we wait on the
* condition because the main thread may have set it to 1
* before we get here. In order to be sure that new
* work is available if we see 1 in ->data_ready, it
* was initialized to 0 before this thread was spawned
* and we reset it to 0 right away.
*/
pthread_mutex_lock(&me->mutex);
while (!me->data_ready)
pthread_cond_wait(&me->cond, &me->mutex);
me->data_ready = 0;
pthread_mutex_unlock(&me->mutex);
progress_lock();
}
progress_unlock();
/* leave ->working 1 so that this doesn't get more work assigned */
return NULL;
}
static void ll_find_deltas_by_region(struct object_entry *list,
struct packing_region *regions,
uint32_t start, uint32_t nr)
{
struct thread_params *p;
int i, ret, active_threads = 0;
unsigned int processed = 0;
uint32_t progress_nr;
init_threaded_search();
if (!nr)
return;
progress_nr = regions[nr - 1].start + regions[nr - 1].nr;
if (delta_search_threads <= 1) {
find_deltas_by_region(list, regions, start, nr);
cleanup_threaded_search();
return;
}
if (progress > pack_to_stdout)
fprintf_ln(stderr,
Q_("Path-based delta compression using up to %d thread",
"Path-based delta compression using up to %d threads",
delta_search_threads),
delta_search_threads);
CALLOC_ARRAY(p, delta_search_threads);
if (progress)
progress_state = start_progress(the_repository,
_("Compressing objects by path"),
progress_nr);
/* Partition the work amongst work threads. */
for (i = 0; i < delta_search_threads; i++) {
unsigned sub_size = nr / (delta_search_threads - i);
p[i].window = window;
p[i].depth = depth;
p[i].processed = &processed;
p[i].working = 1;
p[i].data_ready = 0;
p[i].regions = regions;
p[i].list_size = sub_size;
p[i].remaining = sub_size;
regions += sub_size;
nr -= sub_size;
}
/* Start work threads. */
for (i = 0; i < delta_search_threads; i++) {
if (!p[i].list_size)
continue;
pthread_mutex_init(&p[i].mutex, NULL);
pthread_cond_init(&p[i].cond, NULL);
ret = pthread_create(&p[i].thread, NULL,
threaded_find_deltas_by_path, &p[i]);
if (ret)
die(_("unable to create thread: %s"), strerror(ret));
active_threads++;
}
/*
* Now let's wait for work completion. Each time a thread is done
* with its work, we steal half of the remaining work from the
* thread with the largest number of unprocessed objects and give
* it to that newly idle thread. This ensures good load balancing
* until the remaining object list segments are simply too short
* to be worth splitting anymore.
*/
while (active_threads) {
struct thread_params *target = NULL;
struct thread_params *victim = NULL;
unsigned sub_size = 0;
progress_lock();
for (;;) {
for (i = 0; !target && i < delta_search_threads; i++)
if (!p[i].working)
target = &p[i];
if (target)
break;
pthread_cond_wait(&progress_cond, &progress_mutex);
}
for (i = 0; i < delta_search_threads; i++)
if (p[i].remaining > 2*window &&
(!victim || victim->remaining < p[i].remaining))
victim = &p[i];
if (victim) {
sub_size = victim->remaining / 2;
target->regions = victim->regions + victim->remaining - sub_size;
victim->list_size -= sub_size;
victim->remaining -= sub_size;
}
target->list_size = sub_size;
target->remaining = sub_size;
target->working = 1;
progress_unlock();
pthread_mutex_lock(&target->mutex);
target->data_ready = 1;
pthread_cond_signal(&target->cond);
pthread_mutex_unlock(&target->mutex);
if (!sub_size) {
pthread_join(target->thread, NULL);
pthread_cond_destroy(&target->cond);
pthread_mutex_destroy(&target->mutex);
active_threads--;
}
}
cleanup_threaded_search();
free(p);
display_progress(progress_state, progress_nr);
stop_progress(&progress_state);
}
static void prepare_pack(int window, int depth)
{
struct object_entry **delta_list;
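
For reference, the initial partition in ll_find_deltas_by_region() gives thread i roughly nr / (delta_search_threads - i) regions, so the rounding remainder falls on the later threads: with 10 regions and 4 threads the split is 2, 2, 3, 3, and the work-stealing loop that follows corrects any remaining imbalance. A standalone sketch of just that arithmetic (hypothetical helper, assuming <stdio.h>):

/* Illustrative only: prints how the partition loop above divides work. */
static void show_partition(unsigned nr, int threads)
{
	for (int i = 0; i < threads; i++) {
		unsigned sub_size = nr / (threads - i);

		fprintf(stderr, "thread %d gets %u region(s)\n", i, sub_size);
		nr -= sub_size;
	}
}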
@@ -3307,39 +3556,21 @@ static void prepare_pack(int window, int depth)
if (!to_pack.nr_objects || !window || !depth)
return;
if (path_walk)
ll_find_deltas_by_region(to_pack.objects, to_pack.regions,
0, to_pack.nr_regions);
ALLOC_ARRAY(delta_list, to_pack.nr_objects);
nr_deltas = n = 0;
for (i = 0; i < to_pack.nr_objects; i++) {
struct object_entry *entry = to_pack.objects + i;
- if (DELTA(entry))
- /* This happens if we decided to reuse existing
- * delta from a pack. "reuse_delta &&" is implied.
- */
if (!should_attempt_deltas(entry))
continue;
- if (!entry->type_valid ||
- oe_size_less_than(&to_pack, entry, 50))
- continue;
- if (entry->no_try_delta)
- continue;
- if (!entry->preferred_base) {
if (!entry->preferred_base)
nr_deltas++;
- if (oe_type(entry) < 0)
- die(_("unable to get type of object %s"),
- oid_to_hex(&entry->idx.oid));
- } else {
- if (oe_type(entry) < 0) {
- /*
- * This object is not found, but we
- * don't have to include it anyway.
- */
- continue;
- }
- }
delta_list[n++] = entry;
}
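
With this refactoring, prepare_pack() effectively runs a two-pass delta search when --path-walk is in effect: same-path regions are compressed first, and entries that already received a delta there are filtered out of the global pass because should_attempt_deltas() rejects anything with DELTA(entry) set. A condensed, illustrative outline:

/*
 * Illustrative outline of prepare_pack() after this change:
 *
 *   if (path_walk)
 *           ll_find_deltas_by_region(...);      pass 1: same-path regions
 *
 *   for each entry in to_pack.objects:
 *           if (should_attempt_deltas(entry))   skips pass-1 deltas
 *                   collect into delta_list
 *
 *   run the usual global sliding-window search over delta_list
 */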
@@ -4272,6 +4503,93 @@ static void mark_bitmap_preferred_tips(void)
}
}
static inline int is_oid_uninteresting(struct repository *repo,
struct object_id *oid)
{
struct object *o = lookup_object(repo, oid);
return !o || (o->flags & UNINTERESTING);
}
static int add_objects_by_path(const char *path,
struct oid_array *oids,
enum object_type type,
void *data)
{
size_t oe_start = to_pack.nr_objects;
size_t oe_end;
unsigned int *processed = data;
/*
* First, add all objects to the packing data, including the ones
* marked UNINTERESTING (translated to 'exclude') as they can be
* used as delta bases.
*/
for (size_t i = 0; i < oids->nr; i++) {
int exclude;
struct object_info oi = OBJECT_INFO_INIT;
struct object_id *oid = &oids->oid[i];
/* Skip objects that do not exist locally. */
if ((exclude_promisor_objects || arg_missing_action != MA_ERROR) &&
oid_object_info_extended(the_repository, oid, &oi,
OBJECT_INFO_FOR_PREFETCH) < 0)
continue;
exclude = is_oid_uninteresting(the_repository, oid);
if (exclude && !thin)
continue;
add_object_entry(oid, type, path, exclude);
}
oe_end = to_pack.nr_objects;
/* We can skip delta calculations if it is a no-op. */
if (oe_end == oe_start || !window)
return 0;
ALLOC_GROW(to_pack.regions,
to_pack.nr_regions + 1,
to_pack.nr_regions_alloc);
to_pack.regions[to_pack.nr_regions].start = oe_start;
to_pack.regions[to_pack.nr_regions].nr = oe_end - oe_start;
to_pack.nr_regions++;
*processed += oids->nr;
display_progress(progress_state, *processed);
return 0;
}
static void get_object_list_path_walk(struct rev_info *revs)
{
struct path_walk_info info = PATH_WALK_INFO_INIT;
unsigned int processed = 0;
int result;
info.revs = revs;
info.path_fn = add_objects_by_path;
info.path_fn_data = &processed;
/*
* Allow the --[no-]sparse option to be interesting here, if only
* for testing purposes. Paths with no interesting objects will not
* contribute to the resulting pack, but only create noisy preferred
* base objects.
*/
info.prune_all_uninteresting = sparse;
info.edge_aggressive = shallow;
trace2_region_enter("pack-objects", "path-walk", revs->repo);
result = walk_objects_by_path(&info);
trace2_region_leave("pack-objects", "path-walk", revs->repo);
if (result)
die(_("failed to pack objects via path-walk"));
}
static void get_object_list(struct rev_info *revs, int ac, const char **av)
{
struct setup_revision_opt s_r_opt = {
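
Each call to add_objects_by_path() above records one packing_region describing the contiguous slice of to_pack.objects it just appended; find_deltas_for_region() later walks exactly that slice. A hedged sketch of the relationship (field names are taken from the diff, the helper itself is hypothetical):

/* Hypothetical helper: visit every object entry recorded for one region. */
static void for_each_region_entry(struct packing_data *pack,
				  struct packing_region *region,
				  void (*fn)(struct object_entry *))
{
	for (size_t i = 0; i < region->nr; i++)
		fn(pack->objects + region->start + i);
}

find_deltas_for_region() is essentially this loop with should_attempt_deltas() as the filter and find_deltas() as the consumer.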
@@ -4327,15 +4645,19 @@ static void get_object_list(struct rev_info *revs, int ac, const char **av)
if (write_bitmap_index)
mark_bitmap_preferred_tips();
- if (prepare_revision_walk(revs))
- die(_("revision walk setup failed"));
- mark_edges_uninteresting(revs, show_edge, sparse);
if (!fn_show_object)
fn_show_object = show_object;
- traverse_commit_list(revs,
- show_commit, fn_show_object,
- NULL);
if (path_walk) {
get_object_list_path_walk(revs);
} else {
if (prepare_revision_walk(revs))
die(_("revision walk setup failed"));
mark_edges_uninteresting(revs, show_edge, sparse);
traverse_commit_list(revs,
show_commit, fn_show_object,
NULL);
}
if (unpack_unreachable_expiration) {
revs->ignore_missing_links = 1;
@@ -4464,7 +4786,6 @@ int cmd_pack_objects(int argc,
struct repository *repo UNUSED)
{
int use_internal_rev_list = 0;
- int shallow = 0;
int all_progress_implied = 0;
struct strvec rp = STRVEC_INIT;
int rev_list_unpacked = 0, rev_list_all = 0, rev_list_reflog = 0;
@@ -4545,6 +4866,8 @@ int cmd_pack_objects(int argc,
N_("use the sparse reachability algorithm")),
OPT_BOOL(0, "thin", &thin,
N_("create thin packs")),
OPT_BOOL(0, "path-walk", &path_walk,
N_("use the path-walk API to walk objects when possible")),
OPT_BOOL(0, "shallow", &shallow,
N_("create packs suitable for shallow fetches")),
OPT_BOOL(0, "honor-pack-keep", &ignore_packed_keep_on_disk,
@@ -4614,6 +4937,17 @@ int cmd_pack_objects(int argc,
if (pack_to_stdout != !base_name || argc)
usage_with_options(pack_usage, pack_objects_options);
if (path_walk < 0) {
if (use_bitmap_index > 0 ||
!use_internal_rev_list)
path_walk = 0;
else if (the_repository->gitdir &&
the_repository->settings.pack_use_path_walk)
path_walk = 1;
else
path_walk = git_env_bool("GIT_TEST_PACK_PATH_WALK", 0);
}
if (depth < 0)
depth = 0;
if (depth >= (1 << OE_DEPTH_BITS)) {
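
The block above resolves the tri-state default (path_walk == -1 means the option was not given on the command line): bitmap use or an object list fed on stdin turns the feature off, the repository setting pack_use_path_walk turns it on, and otherwise the GIT_TEST_PACK_PATH_WALK test knob decides. A condensed restatement, for reference only:

/*
 * Illustration of the default chosen when neither --path-walk nor
 * --no-path-walk is on the command line:
 *
 *   bitmaps requested or no internal rev list  ->  path_walk = 0
 *   else settings.pack_use_path_walk is set    ->  path_walk = 1
 *   else                                       ->  GIT_TEST_PACK_PATH_WALK (default 0)
 */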
@@ -4630,7 +4964,28 @@ int cmd_pack_objects(int argc,
window = 0;
strvec_push(&rp, "pack-objects");
- if (thin) {
if (path_walk) {
const char *option = NULL;
if (filter_options.choice)
option = "--filter";
else if (use_delta_islands)
option = "--delta-islands";
if (option) {
warning(_("cannot use %s with %s"),
option, "--path-walk");
path_walk = 0;
}
}
if (path_walk) {
strvec_push(&rp, "--boundary");
/*
* We must disable the bitmaps because we are removing
* the --objects / --objects-edge[-aggressive] options.
*/
use_bitmap_index = 0;
} else if (thin) {
use_internal_rev_list = 1;
strvec_push(&rp, shallow
? "--objects-edge-aggressive"