One way to significantly reduce the cost of a Git clone and later
fetches is to use a blobless partial clone and combine that with a
sparse-checkout that reduces the paths that need to be populated in the
working directory. Not only does this reduce the cost of clones and
fetches, the sparse-checkout reduces the number of objects needed to
download from a promisor remote.

However, history investigations can be expensive, as computing blob
diffs will trigger promisor remote requests for one object at a time.
This can be avoided by downloading the blobs needed for the given
sparse-checkout using 'git backfill' and its new '--sparse' mode, at a
time when the user is willing to pay that extra cost.

Note that this is distinctly different from the '--filter=sparse:<oid>'
option, as this assumes that the partial clone has all reachable trees
and we are using client-side logic to avoid downloading blobs outside
of the sparse-checkout cone. This avoids the server-side cost of
walking trees while also achieving a similar goal. It also downloads in
batches based on similar path names, providing a resumable download if
things are interrupted.

This augments the path-walk API to have a possibly-NULL 'pl' member
that may point to a 'struct pattern_list'. This could be more general
than the sparse-checkout definition at HEAD, but 'git backfill
--sparse' is currently the only consumer.

Be sure to test this in both cone mode and non-cone mode. Cone mode has
the benefit that the path-walk can skip certain paths once they would
expand beyond the sparse-checkout. Non-cone mode can describe the
included files using both positive and negative patterns, which changes
the possible return values of path_matches_pattern_list(). Test both
kinds of matches for increased coverage (a sketch of this pruning
decision follows below).

To test this, we can create a blobless sparse clone, expand the
sparse-checkout slightly, and then run 'git backfill --sparse' to see
how much data is downloaded. The general steps are:

 1. git clone --filter=blob:none --sparse <url>
 2. git sparse-checkout set <dir1> ... <dirN>
 3. git backfill --sparse

For the Git repository with the 'builtin' directory in the
sparse-checkout, we get these results for various batch sizes:

| Batch Size      | Pack Count | Pack Size | Time  |
|-----------------|------------|-----------|-------|
| (Initial clone) | 3          | 110 MB    |       |
| 10K             | 12         | 192 MB    | 17.2s |
| 15K             | 9          | 192 MB    | 15.5s |
| 20K             | 8          | 192 MB    | 15.5s |
| 25K             | 7          | 192 MB    | 14.7s |

This case matters less because a full clone of the Git repository from
GitHub is currently at 277 MB. Using a copy of the Linux repository
with the 'kernel/' directory in the sparse-checkout, we get these
results:

| Batch Size      | Pack Count | Pack Size | Time |
|-----------------|------------|-----------|------|
| (Initial clone) | 2          | 1,876 MB  |      |
| 10K             | 11         | 2,187 MB  | 46s  |
| 25K             | 7          | 2,188 MB  | 43s  |
| 50K             | 5          | 2,194 MB  | 44s  |
| 100K            | 4          | 2,194 MB  | 48s  |

This case is more meaningful because a full clone of the Linux
repository is currently over 6 GB, so this is a valuable way to
download a fraction of the repository and no longer need network access
for all reachable objects within the sparse-checkout.

Choosing a batch size will depend on many factors, including the user's
network speed or reliability, the repository's file structure, and how
many versions there are of the files within the sparse-checkout scope.
There will not be a one-size-fits-all solution.

Signed-off-by: Derrick Stolee <stolee@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
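
To make the cone/non-cone distinction above concrete, here is a
minimal sketch of the pruning decision a path-walk consumer could make
once 'pl' is set. It is illustrative only, not the actual path-walk
change: 'can_prune_tree' and its parameters are hypothetical names,
while path_matches_pattern_list(), its return values, and the
'use_cone_patterns' flag are the existing dir.h API.

/*
 * Sketch (hypothetical helper): can the walk skip the tree at 'path'
 * entirely, given the sparse-checkout patterns in 'pl'?
 */
static int can_prune_tree(struct repository *repo,
			  struct pattern_list *pl,
			  const char *path)
{
	int dtype = DT_DIR;
	const char *base = strrchr(path, '/');
	enum pattern_match_result match;

	base = base ? base + 1 : path;
	match = path_matches_pattern_list(path, strlen(path), base,
					  &dtype, pl, repo->index);

	/*
	 * Cone mode returns only NOT_MATCHED, MATCHED, or
	 * MATCHED_RECURSIVE, and NOT_MATCHED for a directory means
	 * that no descendant can match, so the subtree is safe to
	 * skip.
	 */
	if (pl->use_cone_patterns)
		return match == NOT_MATCHED;

	/*
	 * Non-cone mode adds UNDECIDED, and a deeper positive or
	 * negative pattern can flip the decision, so nothing can be
	 * pruned based on this path alone.
	 */
	return 0;
}

In cone mode this is what lets the walk stop descending once a path
expands beyond the sparse-checkout; in non-cone mode the walk must
visit every tree and decide blob by blob.
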
#include "builtin.h"
|
|
#include "git-compat-util.h"
|
|
#include "config.h"
|
|
#include "parse-options.h"
|
|
#include "repository.h"
|
|
#include "commit.h"
|
|
#include "dir.h"
|
|
#include "hex.h"
|
|
#include "tree.h"
|
|
#include "tree-walk.h"
|
|
#include "object.h"
|
|
#include "object-store-ll.h"
|
|
#include "oid-array.h"
|
|
#include "oidset.h"
|
|
#include "promisor-remote.h"
|
|
#include "strmap.h"
|
|
#include "string-list.h"
|
|
#include "revision.h"
|
|
#include "trace2.h"
|
|
#include "progress.h"
|
|
#include "packfile.h"
|
|
#include "path-walk.h"
|
|
|
|
static const char * const builtin_backfill_usage[] = {
	N_("git backfill [--min-batch-size=<n>] [--[no-]sparse]"),
	NULL
};

struct backfill_context {
	struct repository *repo;
	struct oid_array current_batch;
	size_t min_batch_size;
	int sparse;
};

static void backfill_context_clear(struct backfill_context *ctx)
{
	oid_array_clear(&ctx->current_batch);
}

static void download_batch(struct backfill_context *ctx)
{
	promisor_remote_get_direct(ctx->repo,
				   ctx->current_batch.oid,
				   ctx->current_batch.nr);
	oid_array_clear(&ctx->current_batch);

	/*
	 * We likely have a new packfile. Add it to the packed list to
	 * avoid possible duplicate downloads of the same objects.
	 */
	reprepare_packed_git(ctx->repo);
}

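/*
 * Path-walk callback: called once per path with all object IDs that
 * appear at that path. Only blobs are considered, and only the ones
 * missing from the local object store are queued, so packs from an
 * interrupted earlier run are not downloaded again.
 */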
static int fill_missing_blobs(const char *path UNUSED,
			      struct oid_array *list,
			      enum object_type type,
			      void *data)
{
	struct backfill_context *ctx = data;

	if (type != OBJ_BLOB)
		return 0;

	for (size_t i = 0; i < list->nr; i++) {
		if (!has_object(ctx->repo, &list->oid[i],
				OBJECT_INFO_FOR_PREFETCH))
			oid_array_append(&ctx->current_batch, &list->oid[i]);
	}

	if (ctx->current_batch.nr >= ctx->min_batch_size)
		download_batch(ctx);

	return 0;
}

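/*
 * Walk all paths reachable from HEAD and batch-download missing blobs.
 * When --sparse is given, the sparse-checkout patterns are loaded into
 * the path-walk's 'pl' member so the walk only visits paths inside the
 * sparse-checkout definition.
 */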
static int do_backfill(struct backfill_context *ctx)
{
	struct rev_info revs;
	struct path_walk_info info = PATH_WALK_INFO_INIT;
	int ret;

	if (ctx->sparse) {
		CALLOC_ARRAY(info.pl, 1);
		if (get_sparse_checkout_patterns(info.pl)) {
			path_walk_info_clear(&info);
			return error(_("problem loading sparse-checkout"));
		}
	}

	repo_init_revisions(ctx->repo, &revs, "");
	handle_revision_arg("HEAD", &revs, 0, 0);

	info.blobs = 1;
	info.tags = info.commits = info.trees = 0;

	info.revs = &revs;
	info.path_fn = fill_missing_blobs;
	info.path_fn_data = ctx;

	ret = walk_objects_by_path(&info);

	/* Download the objects that did not fill a batch. */
	if (!ret)
		download_batch(ctx);

	path_walk_info_clear(&info);
	release_revisions(&revs);
	return ret;
}

int cmd_backfill(int argc, const char **argv, const char *prefix, struct repository *repo)
{
	int result;
	struct backfill_context ctx = {
		.repo = repo,
		.current_batch = OID_ARRAY_INIT,
		.min_batch_size = 50000,
		.sparse = 0,
	};
	struct option options[] = {
		OPT_INTEGER(0, "min-batch-size", &ctx.min_batch_size,
			    N_("Minimum number of objects to request at a time")),
		OPT_BOOL(0, "sparse", &ctx.sparse,
			 N_("Restrict the missing objects to the current sparse-checkout")),
		OPT_END(),
	};

	show_usage_if_asked(argc, argv, builtin_backfill_usage[0]);

	argc = parse_options(argc, argv, prefix, options, builtin_backfill_usage,
			     0);

	repo_config(repo, git_default_config, NULL);

	result = do_backfill(&ctx);
	backfill_context_clear(&ctx);
	return result;
}