Files
git/builtin/range-diff.c
Paulo Casaretto 00727249ec range-diff: add configurable memory limit for cost matrix
When comparing large commit ranges (e.g., 250,000+ commits), range-diff
attempts to allocate an n×n cost matrix that can exhaust available
memory. For example, with 256,784 commits (n = 513,568), the matrix
would require approximately 1TB of memory (513,568² × 4 bytes),
causing either immediate segmentation faults due to integer overflow or
system hangs.

Add a memory limit check in get_correspondences() before allocating the
cost matrix. This check uses the total size in bytes (n² × sizeof(int))
and compares it against a configurable maximum, preventing both
excessive memory usage and integer overflow issues.

The limit is configurable via a new --max-memory option that accepts
human-readable sizes (e.g., "1G", "500M"). The default is 4GB for 64-bit
systems and 2GB for 32-bit systems. This allows comparing ranges of
approximately 32,000 (16,000) commits - generous for real-world use cases
while preventing impractical operations.

When the limit is exceeded, range-diff now displays a clear error
message showing both the requested memory size and the maximum allowed,
formatted in human-readable units for better user experience.

Example usage:
  git range-diff --max-memory=1G branch1...branch2
  git range-diff --max-memory=500M base..topic1 base..topic2

This approach was chosen over alternatives:
- Pre-counting commits: Would require spawning additional git processes
  and reading all commits twice
- Limiting by commit count: Less precise than actual memory usage
- Streaming approach: Would require significant refactoring of the
  current algorithm

This issue was previously discussed in:
https://lore.kernel.org/git/RFC-cover-v2-0.5-00000000000-20211210T122901Z-avarab@gmail.com/

Acked-by: Johannes Schindelin <johannes.schindelin@gmx.de>
Signed-off-by: Paulo Casaretto <pcasaretto@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2025-08-29 09:46:07 -07:00

197 lines
5.8 KiB
C

#define USE_THE_REPOSITORY_VARIABLE
#include "builtin.h"
#include "gettext.h"
#include "object-name.h"
#include "parse-options.h"
#include "range-diff.h"
#include "config.h"
#include "parse.h"
/*
 * Usage synopses for "git range-diff", one entry per accepted way of
 * spelling the two commit ranges. The table is NULL-terminated and is
 * handed to parse_options() and the usage_msg_opt*() helpers in
 * cmd_range_diff().
 */
static const char * const builtin_range_diff_usage[] = {
N_("git range-diff [<options>] <old-base>..<old-tip> <new-base>..<new-tip>"),
N_("git range-diff [<options>] <old-tip>...<new-tip>"),
N_("git range-diff [<options>] <base> <old-tip> <new-tip>"),
NULL
};
/*
 * parse-options callback for --max-memory.
 *
 * opt->value points at the size_t that receives the limit. `arg` is
 * parsed with git_parse_unsigned() using SIZE_MAX as the upper bound, so
 * any accepted value fits in a size_t.
 *
 * Returns 0 on success (including the negated form, which leaves the
 * target untouched), or the result of error() when `arg` is rejected.
 *
 * NOTE(review): because the negated form is a no-op, a value set by an
 * earlier --max-memory on the same command line stays in effect; confirm
 * this is the intended semantics.
 */
static int parse_max_memory(const struct option *opt, const char *arg, int unset)
{
	uintmax_t parsed;
	size_t *limit;

	if (unset)
		return 0;

	if (git_parse_unsigned(arg, &parsed, SIZE_MAX)) {
		limit = opt->value;
		*limit = (size_t)parsed;
		return 0;
	}

	return error(_("invalid max-memory value: %s"), arg);
}
/*
 * Entry point for "git range-diff".
 *
 * After option parsing, the remaining arguments must name two commit
 * ranges in one of three spellings, tried in this order:
 *
 *   <base> <old-tip> <new-tip>
 *   <old-base>..<old-tip> <new-base>..<new-tip>
 *   <old-tip>...<new-tip>
 *
 * A literal "--" (kept by PARSE_OPT_KEEP_DASHDASH) fixes the spelling by
 * its position; without it the spelling is guessed from what the leading
 * arguments look like. Everything from the "--" on (or everything after
 * the range arguments when there is no "--") is appended to other_arg,
 * which range_diff_opts carries into show_range_diff().
 *
 * Returns whatever show_range_diff() returns. Invalid command lines are
 * reported through usage_msg_opt()/usage_msg_optf().
 */
int cmd_range_diff(int argc,
		   const char **argv,
		   const char *prefix,
		   struct repository *repo UNUSED)
{
	struct diff_options diffopt = { NULL };
	struct strvec other_arg = STRVEC_INIT;
	struct strvec diff_merges_arg = STRVEC_INIT;
	struct range_diff_options range_diff_opts = {
		.creation_factor = RANGE_DIFF_CREATION_FACTOR_DEFAULT,
		.max_memory = RANGE_DIFF_MAX_MEMORY_DEFAULT,
		.diffopt = &diffopt,
		.other_arg = &other_arg
	};
	/* -1 means neither --dual-color nor --no-dual-color was given */
	int simple_color = -1, left_only = 0, right_only = 0;
	struct option range_diff_options[] = {
		OPT_INTEGER(0, "creation-factor",
			    &range_diff_opts.creation_factor,
			    N_("percentage by which creation is weighted")),
		OPT_BOOL(0, "no-dual-color", &simple_color,
			 N_("use simple diff colors")),
		OPT_PASSTHRU_ARGV(0, "notes", &other_arg,
				  N_("notes"), N_("passed to 'git log'"),
				  PARSE_OPT_OPTARG),
		OPT_PASSTHRU_ARGV(0, "diff-merges", &diff_merges_arg,
				  N_("style"), N_("passed to 'git log'"), 0),
		/*
		 * The built-in default is platform dependent, so the help
		 * text spells out both values instead of claiming 4G
		 * everywhere.
		 */
		OPT_CALLBACK(0, "max-memory", &range_diff_opts.max_memory,
			     N_("size"),
			     N_("maximum memory for cost matrix (default 4G, 2G on 32-bit systems)"),
			     parse_max_memory),
		OPT_PASSTHRU_ARGV(0, "remerge-diff", &diff_merges_arg, NULL,
				  N_("passed to 'git log'"), PARSE_OPT_NOARG),
		OPT_BOOL(0, "left-only", &left_only,
			 N_("only emit output related to the first range")),
		OPT_BOOL(0, "right-only", &right_only,
			 N_("only emit output related to the second range")),
		OPT_END()
	};
	struct option *options;
	int i, dash_dash = -1, res = 0;
	struct strbuf range1 = STRBUF_INIT, range2 = STRBUF_INIT;
	struct object_id oid;
	const char *three_dots = NULL;

	repo_config(the_repository, git_diff_ui_config, NULL);
	repo_diff_setup(the_repository, &diffopt);

	/* Merge our own options with the generic diff options. */
	options = add_diff_options(range_diff_options, &diffopt);
	argc = parse_options(argc, argv, prefix, options,
			     builtin_range_diff_usage, PARSE_OPT_KEEP_DASHDASH);

	diff_setup_done(&diffopt);

	/* force color when --dual-color was used */
	if (!simple_color)
		diffopt.use_color = 1;

	/* If `--diff-merges` was specified, imply `--merges` */
	if (diff_merges_arg.nr) {
		range_diff_opts.include_merges = 1;
		strvec_pushv(&other_arg, diff_merges_arg.v);
	}

	/* Locate the first "--", if any; its index selects the spelling. */
	for (i = 0; i < argc; i++)
		if (!strcmp(argv[i], "--")) {
			dash_dash = i;
			break;
		}

	if (dash_dash == 3 ||
	    (dash_dash < 0 && argc > 2 &&
	     !repo_get_oid_committish(the_repository, argv[0], &oid) &&
	     !repo_get_oid_committish(the_repository, argv[1], &oid) &&
	     !repo_get_oid_committish(the_repository, argv[2], &oid))) {
		/* <base> <old-tip> <new-tip> */
		if (dash_dash < 0)
			; /* already validated arguments */
		else if (repo_get_oid_committish(the_repository, argv[0], &oid))
			usage_msg_optf(_("not a revision: '%s'"),
				       builtin_range_diff_usage, options,
				       argv[0]);
		else if (repo_get_oid_committish(the_repository, argv[1], &oid))
			usage_msg_optf(_("not a revision: '%s'"),
				       builtin_range_diff_usage, options,
				       argv[1]);
		else if (repo_get_oid_committish(the_repository, argv[2], &oid))
			usage_msg_optf(_("not a revision: '%s'"),
				       builtin_range_diff_usage, options,
				       argv[2]);

		strbuf_addf(&range1, "%s..%s", argv[0], argv[1]);
		strbuf_addf(&range2, "%s..%s", argv[0], argv[2]);

		strvec_pushv(&other_arg, argv +
			     (dash_dash < 0 ? 3 : dash_dash));
	} else if (dash_dash == 2 ||
		   (dash_dash < 0 && argc > 1 &&
		    is_range_diff_range(argv[0]) &&
		    is_range_diff_range(argv[1]))) {
		/* <old-base>..<old-tip> <new-base>..<new-tip> */
		if (dash_dash < 0)
			; /* already validated arguments */
		else if (!is_range_diff_range(argv[0]))
			usage_msg_optf(_("not a commit range: '%s'"),
				       builtin_range_diff_usage, options,
				       argv[0]);
		else if (!is_range_diff_range(argv[1]))
			usage_msg_optf(_("not a commit range: '%s'"),
				       builtin_range_diff_usage, options,
				       argv[1]);

		strbuf_addstr(&range1, argv[0]);
		strbuf_addstr(&range2, argv[1]);

		strvec_pushv(&other_arg, argv +
			     (dash_dash < 0 ? 2 : dash_dash));
	} else if (dash_dash == 1 ||
		   (dash_dash < 0 && argc > 0 &&
		    (three_dots = strstr(argv[0], "...")))) {
		/* <old-tip>...<new-tip>; an empty side stands for HEAD */
		const char *a, *b;
		int a_len;

		if (dash_dash < 0)
			; /* already validated arguments */
		else if (!(three_dots = strstr(argv[0], "...")))
			usage_msg_optf(_("not a symmetric range: '%s'"),
				       builtin_range_diff_usage, options,
				       argv[0]);

		if (three_dots == argv[0]) {
			a = "HEAD";
			a_len = strlen(a);
		} else {
			a = argv[0];
			a_len = (int)(three_dots - a);
		}

		if (three_dots[3])
			b = three_dots + 3;
		else
			b = "HEAD";

		/* A...B is compared as B..A against A..B */
		strbuf_addf(&range1, "%s..%.*s", b, a_len, a);
		strbuf_addf(&range2, "%.*s..%s", a_len, a, b);

		strvec_pushv(&other_arg, argv +
			     (dash_dash < 0 ? 1 : dash_dash));
	} else
		usage_msg_opt(_("need two commit ranges"),
			      builtin_range_diff_usage, options);
	FREE_AND_NULL(options);

	/* dual color stays on unless --no-dual-color set simple_color to 1 */
	range_diff_opts.dual_color = simple_color < 1;
	range_diff_opts.left_only = left_only;
	range_diff_opts.right_only = right_only;
	res = show_range_diff(range1.buf, range2.buf, &range_diff_opts);

	strvec_clear(&other_arg);
	strvec_clear(&diff_merges_arg);
	strbuf_release(&range1);
	strbuf_release(&range2);

	return res;
}