From fb3920fd003e14283fb480b1eb4e8495b95034c9 Mon Sep 17 00:00:00 2001
From: Jeff King <peff@peff.net>
Date: Fri, 4 Dec 2020 13:48:54 -0500
Subject: [PATCH 1/9] oid-array.h: drop sha1 mention from header guard

When this file was moved from sha1-array.h, we forgot to update the
preprocessor header guard to match the new name.

Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 oid-array.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/oid-array.h b/oid-array.h
index f28d322c90..2c8b64c393 100644
--- a/oid-array.h
+++ b/oid-array.h
@@ -1,5 +1,5 @@
-#ifndef SHA1_ARRAY_H
-#define SHA1_ARRAY_H
+#ifndef OID_ARRAY_H
+#define OID_ARRAY_H
 
 /**
  * The API provides storage and manipulation of sets of object identifiers.
@@ -106,4 +106,4 @@ void oid_array_filter(struct oid_array *array,
 		      for_each_oid_fn want,
 		      void *cbdata);
 
-#endif /* SHA1_ARRAY_H */
+#endif /* OID_ARRAY_H */

From d9ca6f8d9054e3441c0f291916cbedd2d0cad15f Mon Sep 17 00:00:00 2001
From: Jeff King <peff@peff.net>
Date: Fri, 4 Dec 2020 13:49:24 -0500
Subject: [PATCH 2/9] t0064: drop sha1 mention from filename

The data type is an oid_array these days, and we are using "test-tool
oid-array", so let's name the test script appropriately.

Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 t/{t0064-sha1-array.sh => t0064-oid-array.sh} | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
 rename t/{t0064-sha1-array.sh => t0064-oid-array.sh} (96%)

diff --git a/t/t0064-sha1-array.sh b/t/t0064-oid-array.sh
similarity index 96%
rename from t/t0064-sha1-array.sh
rename to t/t0064-oid-array.sh
index 45685af2fd..71034a8007 100755
--- a/t/t0064-sha1-array.sh
+++ b/t/t0064-oid-array.sh
@@ -1,6 +1,6 @@
 #!/bin/sh
 
-test_description='basic tests for the SHA1 array implementation'
+test_description='basic tests for the oid array implementation'
 . ./test-lib.sh
 
 echoid () {

From 3ea922fc8b19d8cdf967ca7b3229856e6326d099 Mon Sep 17 00:00:00 2001
From: Jeff King <peff@peff.net>
Date: Fri, 4 Dec 2020 13:50:23 -0500
Subject: [PATCH 3/9] t0064: make duplicate tests more robust

Our tests for handling duplicates in oid-array provide only a single
duplicate for each number, so our sorted array looks like:

  44 44 55 55 88 88 aa aa

A slightly more interesting test is to have multiple duplicates, which
makes sure that we not only skip the duplicate, but keep skipping until
we are out of the set of matching duplicates.

Unsurprisingly this works just fine, but it's worth beefing up this test
since we're about to change the duplicate-detection code.

Note that we do need to adjust the results on the lookup test, since it
is returning the index of the found item (and now we have more items
before our range, and the range itself is slightly larger, since we'll
accept a match of any element).

Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 t/t0064-oid-array.sh | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/t/t0064-oid-array.sh b/t/t0064-oid-array.sh
index 71034a8007..2e5438ccda 100755
--- a/t/t0064-oid-array.sh
+++ b/t/t0064-oid-array.sh
@@ -25,6 +25,7 @@ test_expect_success 'ordered enumeration' '
 test_expect_success 'ordered enumeration with duplicate suppression' '
 	echoid "" 44 55 88 aa >expect &&
 	{
+		echoid append 88 44 aa 55 &&
 		echoid append 88 44 aa 55 &&
 		echoid append 88 44 aa 55 &&
 		echo for_each_unique
@@ -52,17 +53,19 @@ test_expect_success 'lookup non-existing entry' '
 
 test_expect_success 'lookup with duplicates' '
 	{
+		echoid append 88 44 aa 55 &&
 		echoid append 88 44 aa 55 &&
 		echoid append 88 44 aa 55 &&
 		echoid lookup 55
 	} | test-tool oid-array >actual &&
 	n=$(cat actual) &&
-	test "$n" -ge 2 &&
-	test "$n" -le 3
+	test "$n" -ge 3 &&
+	test "$n" -le 5
 '
 
 test_expect_success 'lookup non-existing entry with duplicates' '
 	{
+		echoid append 88 44 aa 55 &&
 		echoid append 88 44 aa 55 &&
 		echoid append 88 44 aa 55 &&
 		echoid lookup 66

From 3fa6f2aa57e997ff2e665d83335a8f1078b94cb8 Mon Sep 17 00:00:00 2001
From: Jeff King <peff@peff.net>
Date: Fri, 4 Dec 2020 13:51:39 -0500
Subject: [PATCH 4/9] cache.h: move hash/oid functions to hash.h

We define git_hash_algo and object_id in hash.h, but most of the utility
functions are declared in the main cache.h. Let's move them to hash.h
along with their struct definitions. This cleans up cache.h a bit, but
also avoids circular dependencies when other headers need to know about
these functions (e.g., if oid-array.h were to have an inline that used
oideq(), it couldn't include cache.h because it is itself included by
cache.h).

No including C files should be affected, because hash.h is always
included in cache.h already.

We do have to mention repository.h at the top of hash.h, though, since
we depend on the_repository in some of our inline functions.

Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 cache.h | 94 --------------------------------------------------------
 hash.h  | 95 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 95 insertions(+), 94 deletions(-)

diff --git a/cache.h b/cache.h
index e986cf4ea9..ec98f5d32c 100644
--- a/cache.h
+++ b/cache.h
@@ -1123,100 +1123,6 @@ const char *repo_find_unique_abbrev(struct repository *r, const struct object_id
 int repo_find_unique_abbrev_r(struct repository *r, char *hex, const struct object_id *oid, int len);
 #define find_unique_abbrev_r(hex, oid, len) repo_find_unique_abbrev_r(the_repository, hex, oid, len)
 
-extern const struct object_id null_oid;
-
-static inline int hashcmp(const unsigned char *sha1, const unsigned char *sha2)
-{
-	/*
-	 * Teach the compiler that there are only two possibilities of hash size
-	 * here, so that it can optimize for this case as much as possible.
-	 */
-	if (the_hash_algo->rawsz == GIT_MAX_RAWSZ)
-		return memcmp(sha1, sha2, GIT_MAX_RAWSZ);
-	return memcmp(sha1, sha2, GIT_SHA1_RAWSZ);
-}
-
-static inline int oidcmp(const struct object_id *oid1, const struct object_id *oid2)
-{
-	return hashcmp(oid1->hash, oid2->hash);
-}
-
-static inline int hasheq(const unsigned char *sha1, const unsigned char *sha2)
-{
-	/*
-	 * We write this here instead of deferring to hashcmp so that the
-	 * compiler can properly inline it and avoid calling memcmp.
-	 */
-	if (the_hash_algo->rawsz == GIT_MAX_RAWSZ)
-		return !memcmp(sha1, sha2, GIT_MAX_RAWSZ);
-	return !memcmp(sha1, sha2, GIT_SHA1_RAWSZ);
-}
-
-static inline int oideq(const struct object_id *oid1, const struct object_id *oid2)
-{
-	return hasheq(oid1->hash, oid2->hash);
-}
-
-static inline int is_null_oid(const struct object_id *oid)
-{
-	return oideq(oid, &null_oid);
-}
-
-static inline void hashcpy(unsigned char *sha_dst, const unsigned char *sha_src)
-{
-	memcpy(sha_dst, sha_src, the_hash_algo->rawsz);
-}
-
-static inline void oidcpy(struct object_id *dst, const struct object_id *src)
-{
-	memcpy(dst->hash, src->hash, GIT_MAX_RAWSZ);
-}
-
-static inline struct object_id *oiddup(const struct object_id *src)
-{
-	struct object_id *dst = xmalloc(sizeof(struct object_id));
-	oidcpy(dst, src);
-	return dst;
-}
-
-static inline void hashclr(unsigned char *hash)
-{
-	memset(hash, 0, the_hash_algo->rawsz);
-}
-
-static inline void oidclr(struct object_id *oid)
-{
-	memset(oid->hash, 0, GIT_MAX_RAWSZ);
-}
-
-static inline void oidread(struct object_id *oid, const unsigned char *hash)
-{
-	memcpy(oid->hash, hash, the_hash_algo->rawsz);
-}
-
-static inline int is_empty_blob_sha1(const unsigned char *sha1)
-{
-	return hasheq(sha1, the_hash_algo->empty_blob->hash);
-}
-
-static inline int is_empty_blob_oid(const struct object_id *oid)
-{
-	return oideq(oid, the_hash_algo->empty_blob);
-}
-
-static inline int is_empty_tree_sha1(const unsigned char *sha1)
-{
-	return hasheq(sha1, the_hash_algo->empty_tree->hash);
-}
-
-static inline int is_empty_tree_oid(const struct object_id *oid)
-{
-	return oideq(oid, the_hash_algo->empty_tree);
-}
-
-const char *empty_tree_oid_hex(void);
-const char *empty_blob_oid_hex(void);
-
 /* set default permissions by passing mode arguments to open(2) */
 int git_mkstemps_mode(char *pattern, int suffix_len, int mode);
 int git_mkstemp_mode(char *pattern, int mode);
diff --git a/hash.h b/hash.h
index e0f3f16b06..3fb0c3d400 100644
--- a/hash.h
+++ b/hash.h
@@ -2,6 +2,7 @@
 #define HASH_H
 
 #include "git-compat-util.h"
+#include "repository.h"
 
 #if defined(SHA1_PPC)
 #include "ppc/sha1.h"
@@ -184,4 +185,98 @@ struct object_id {
 
 #define the_hash_algo the_repository->hash_algo
 
+extern const struct object_id null_oid;
+
+static inline int hashcmp(const unsigned char *sha1, const unsigned char *sha2)
+{
+	/*
+	 * Teach the compiler that there are only two possibilities of hash size
+	 * here, so that it can optimize for this case as much as possible.
+	 */
+	if (the_hash_algo->rawsz == GIT_MAX_RAWSZ)
+		return memcmp(sha1, sha2, GIT_MAX_RAWSZ);
+	return memcmp(sha1, sha2, GIT_SHA1_RAWSZ);
+}
+
+static inline int oidcmp(const struct object_id *oid1, const struct object_id *oid2)
+{
+	return hashcmp(oid1->hash, oid2->hash);
+}
+
+static inline int hasheq(const unsigned char *sha1, const unsigned char *sha2)
+{
+	/*
+	 * We write this here instead of deferring to hashcmp so that the
+	 * compiler can properly inline it and avoid calling memcmp.
+	 */
+	if (the_hash_algo->rawsz == GIT_MAX_RAWSZ)
+		return !memcmp(sha1, sha2, GIT_MAX_RAWSZ);
+	return !memcmp(sha1, sha2, GIT_SHA1_RAWSZ);
+}
+
+static inline int oideq(const struct object_id *oid1, const struct object_id *oid2)
+{
+	return hasheq(oid1->hash, oid2->hash);
+}
+
+static inline int is_null_oid(const struct object_id *oid)
+{
+	return oideq(oid, &null_oid);
+}
+
+static inline void hashcpy(unsigned char *sha_dst, const unsigned char *sha_src)
+{
+	memcpy(sha_dst, sha_src, the_hash_algo->rawsz);
+}
+
+static inline void oidcpy(struct object_id *dst, const struct object_id *src)
+{
+	memcpy(dst->hash, src->hash, GIT_MAX_RAWSZ);
+}
+
+static inline struct object_id *oiddup(const struct object_id *src)
+{
+	struct object_id *dst = xmalloc(sizeof(struct object_id));
+	oidcpy(dst, src);
+	return dst;
+}
+
+static inline void hashclr(unsigned char *hash)
+{
+	memset(hash, 0, the_hash_algo->rawsz);
+}
+
+static inline void oidclr(struct object_id *oid)
+{
+	memset(oid->hash, 0, GIT_MAX_RAWSZ);
+}
+
+static inline void oidread(struct object_id *oid, const unsigned char *hash)
+{
+	memcpy(oid->hash, hash, the_hash_algo->rawsz);
+}
+
+static inline int is_empty_blob_sha1(const unsigned char *sha1)
+{
+	return hasheq(sha1, the_hash_algo->empty_blob->hash);
+}
+
+static inline int is_empty_blob_oid(const struct object_id *oid)
+{
+	return oideq(oid, the_hash_algo->empty_blob);
+}
+
+static inline int is_empty_tree_sha1(const unsigned char *sha1)
+{
+	return hasheq(sha1, the_hash_algo->empty_tree->hash);
+}
+
+static inline int is_empty_tree_oid(const struct object_id *oid)
+{
+	return oideq(oid, the_hash_algo->empty_tree);
+}
+
+const char *empty_tree_oid_hex(void);
+const char *empty_blob_oid_hex(void);
+
 #endif

From d0482b445bb4c22a3f866c849835e366ec0b14a7 Mon Sep 17 00:00:00 2001
From: Jeff King <peff@peff.net>
Date: Fri, 4 Dec 2020 13:52:07 -0500
Subject: [PATCH 5/9] oid-array: make sort function public

We sort the oid-array as a side effect of calling the lookup or
unique-iteration functions. But callers may want to sort it themselves
(especially as we add new iteration options in future patches).

We'll also move the check of the "sorted" flag into the sort function,
so callers don't have to remember to check it.

Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 oid-array.c | 10 +++++-----
 oid-array.h |  5 +++++
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/oid-array.c b/oid-array.c
index 8657a5cedf..29f718d835 100644
--- a/oid-array.c
+++ b/oid-array.c
@@ -14,8 +14,10 @@ static int void_hashcmp(const void *a, const void *b)
 	return oidcmp(a, b);
 }
 
-static void oid_array_sort(struct oid_array *array)
+void oid_array_sort(struct oid_array *array)
 {
+	if (array->sorted)
+		return;
 	QSORT(array->oid, array->nr, void_hashcmp);
 	array->sorted = 1;
 }
@@ -28,8 +30,7 @@ static const unsigned char *sha1_access(size_t index, void *table)
 
 int oid_array_lookup(struct oid_array *array, const struct object_id *oid)
 {
-	if (!array->sorted)
-		oid_array_sort(array);
+	oid_array_sort(array);
 	return sha1_pos(oid->hash, array->oid, array->nr, sha1_access);
 }
 
@@ -64,8 +65,7 @@ int oid_array_for_each_unique(struct oid_array *array,
 {
 	size_t i;
 
-	if (!array->sorted)
-		oid_array_sort(array);
+	oid_array_sort(array);
 
 	for (i = 0; i < array->nr; i++) {
 		int ret;
diff --git a/oid-array.h b/oid-array.h
index 2c8b64c393..6a22c0ac94 100644
--- a/oid-array.h
+++ b/oid-array.h
@@ -106,4 +106,9 @@ void oid_array_filter(struct oid_array *array,
 		      for_each_oid_fn want,
 		      void *cbdata);
 
+/**
+ * Sort the array in order of ascending object id.
+ */
+void oid_array_sort(struct oid_array *array);
+
 #endif /* OID_ARRAY_H */

From 12c4b4ce754640ac565f8b9b6f072bc10fbed5af Mon Sep 17 00:00:00 2001
From: Jeff King <peff@peff.net>
Date: Mon, 7 Dec 2020 14:11:00 -0500
Subject: [PATCH 6/9] oid-array: provide a for-loop iterator

We provide oid_array_for_each_unique() for iterating over the
de-duplicated items in an array. But it's awkward to use for two
reasons:

  1. It uses a callback, which means marshaling arguments into a struct
     and passing it to the callback with a void parameter.

  2. The callback doesn't know the numeric index of the oid we're
     looking at. This is useful for things like progress meters.

Iterating with a for-loop is much more natural for some cases, but the
caller has to do the de-duping itself. However, we can provide a small
helper to make this easier (see the docstring in the header for an
example use).

The caller does have to remember to sort the array first. We could add
an assertion into the helper that array->sorted is set, but I didn't
want to complicate what is otherwise a pretty fast code path.

I also considered adding a full iterator type with init/next/end
functions (similar to what we have for hashmaps). But it ended up making
the callers much harder to read. This version keeps us close to a basic
for-loop.

Yet another option would be adding an option to sort the array and
compact out the duplicates. This would mean iterating over the array an
extra time, though that's probably not a big deal (we did just do an
O(n log n) sort). But we'd still have to write a for-loop to iterate, so
it doesn't really make anything easier for the caller.

No new test, since we'll convert the callback iterator (which is covered
by t0064, among other callers) to use the new code.

Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 oid-array.c |  7 ++-----
 oid-array.h | 23 +++++++++++++++++++++++
 2 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/oid-array.c b/oid-array.c
index 29f718d835..8e1bcedc0c 100644
--- a/oid-array.c
+++ b/oid-array.c
@@ -67,11 +67,8 @@ int oid_array_for_each_unique(struct oid_array *array,
 
 	oid_array_sort(array);
 
-	for (i = 0; i < array->nr; i++) {
-		int ret;
-		if (i > 0 && oideq(array->oid + i, array->oid + i - 1))
-			continue;
-		ret = fn(array->oid + i, data);
+	for (i = 0; i < array->nr; i = oid_array_next_unique(array, i)) {
+		int ret = fn(array->oid + i, data);
 		if (ret)
 			return ret;
 	}
diff --git a/oid-array.h b/oid-array.h
index 6a22c0ac94..72bca78b7d 100644
--- a/oid-array.h
+++ b/oid-array.h
@@ -1,6 +1,8 @@
 #ifndef OID_ARRAY_H
 #define OID_ARRAY_H
 
+#include "hash.h"
+
 /**
  * The API provides storage and manipulation of sets of object identifiers.
  * The emphasis is on storage and processing efficiency, making them suitable
@@ -111,4 +113,25 @@ void oid_array_filter(struct oid_array *array,
  */
 void oid_array_sort(struct oid_array *array);
 
+/**
+ * Find the next unique oid in the array after position "cur".
+ * The array must be sorted for this to work. You can iterate
+ * over unique elements like this:
+ *
+ *   size_t i;
+ *   oid_array_sort(array);
+ *   for (i = 0; i < array->nr; i = oid_array_next_unique(array, i))
+ *	printf("%s", oid_to_hex(array->oids[i]);
+ *
+ * Non-unique iteration can just increment with "i++" to visit each element.
+ */
+static inline size_t oid_array_next_unique(struct oid_array *array, size_t cur)
+{
+	do {
+		cur++;
+	} while (cur < array->nr &&
+		 oideq(array->oid + cur, array->oid + cur - 1));
+	return cur;
+}
+
 #endif /* OID_ARRAY_H */

From 1cbdbf3bef7e9167794cb2c12f9af1a450584ebe Mon Sep 17 00:00:00 2001
From: Jeff King <peff@peff.net>
Date: Mon, 7 Dec 2020 14:11:02 -0500
Subject: [PATCH 7/9] commit-graph: drop count_distinct_commits() function

When writing a commit graph, we collect a list of object ids in an
array, which we'll eventually copy into an array of "struct commit"
pointers. Before we do that, though, we count the number of distinct
commit entries. There's a subtle bug in this step, though.

We eliminate not only duplicate oids, but also in split mode, any oids
which are not commits or which are already in a graph file. However, the
loop starts at index 1, always counting index 0 as distinct. And indeed
it can't be a duplicate, since we check for those by comparing against
the previous entry, and there isn't one for index 0. But it could be a
commit that's already in a graph file, and we'd overcount the number of
commits by 1 in that case.

That turns out not to be a problem, though. The only things we do with
the count are:

  - check if our count will overflow our data structures. But the limit
    there is 2^31 commits, so while this is a useful check, the
    off-by-one is not likely to matter.

  - pre-allocate the array of commit pointers. But over-allocating by
    one isn't a problem; we'll just waste a few extra bytes.

The bug would be easy enough to fix, but we can observe that neither of
those steps is necessary.

After building the actual commit array, we'll likewise check its count
for overflow. So the extra check of the distinct commit count here is
redundant.

And likewise we use ALLOC_GROW() when building the commit array, so
there's no need to preallocate it (it's possible that doing so is
slightly more efficient, but if we care we can just optimistically
allocate one slot for each oid; I didn't bother here).

So count_distinct_commits() isn't doing anything useful. Let's just get
rid of that step.

Note that a side effect of the function was that we sorted the list of
oids, which we do rely on in copy_oids_to_commits(), since it must also
skip the duplicates. So we'll move the qsort there. I didn't copy the
"TODO" about adding more progress meters. It's actually quite hard to
make a repository large enough for this qsort would take an appreciable
amount of time, so this doesn't seem like a useful note.

Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 commit-graph.c | 43 ++-----------------------------------------
 1 file changed, 2 insertions(+), 41 deletions(-)

diff --git a/commit-graph.c b/commit-graph.c
index 6f62a07313..1ac3516cf5 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -1588,35 +1588,6 @@ static void fill_oids_from_all_packs(struct write_commit_graph_context *ctx)
 	stop_progress(&ctx->progress);
 }
 
-static uint32_t count_distinct_commits(struct write_commit_graph_context *ctx)
-{
-	uint32_t i, count_distinct = 1;
-
-	if (ctx->report_progress)
-		ctx->progress = start_delayed_progress(
-			_("Counting distinct commits in commit graph"),
-			ctx->oids.nr);
-	display_progress(ctx->progress, 0); /* TODO: Measure QSORT() progress */
-	QSORT(ctx->oids.list, ctx->oids.nr, oid_compare);
-
-	for (i = 1; i < ctx->oids.nr; i++) {
-		display_progress(ctx->progress, i + 1);
-		if (!oideq(&ctx->oids.list[i - 1], &ctx->oids.list[i])) {
-			if (ctx->split) {
-				struct commit *c = lookup_commit(ctx->r, &ctx->oids.list[i]);
-
-				if (!c || commit_graph_position(c) != COMMIT_NOT_FROM_GRAPH)
-					continue;
-			}
-
-			count_distinct++;
-		}
-	}
-	stop_progress(&ctx->progress);
-
-	return count_distinct;
-}
-
 static void copy_oids_to_commits(struct write_commit_graph_context *ctx)
 {
 	uint32_t i;
@@ -1628,6 +1599,7 @@ static void copy_oids_to_commits(struct write_commit_graph_context *ctx)
 		ctx->progress = start_delayed_progress(
 			_("Finding extra edges in commit graph"),
 			ctx->oids.nr);
+	QSORT(ctx->oids.list, ctx->oids.nr, oid_compare);
 	for (i = 0; i < ctx->oids.nr; i++) {
 		unsigned int num_parents;
 
@@ -2155,7 +2127,7 @@ int write_commit_graph(struct object_directory *odb,
 		       const struct commit_graph_opts *opts)
 {
 	struct write_commit_graph_context *ctx;
-	uint32_t i, count_distinct = 0;
+	uint32_t i;
 	int res = 0;
 	int replace = 0;
 	struct bloom_filter_settings bloom_settings = DEFAULT_BLOOM_FILTER_SETTINGS;
@@ -2268,17 +2240,6 @@ int write_commit_graph(struct object_directory *odb,
 
 	close_reachable(ctx);
 
-	count_distinct = count_distinct_commits(ctx);
-
-	if (count_distinct >= GRAPH_EDGE_LAST_MASK) {
-		error(_("the commit graph format cannot write %d commits"), count_distinct);
-		res = -1;
-		goto cleanup;
-	}
-
-	ctx->commits.alloc = count_distinct;
-	ALLOC_ARRAY(ctx->commits.list, ctx->commits.alloc);
-
 	copy_oids_to_commits(ctx);
 
 	if (ctx->commits.nr >= GRAPH_EDGE_LAST_MASK) {

From a5f1c448998fba5f5a1b00e1b9a779176ec665a6 Mon Sep 17 00:00:00 2001
From: Jeff King <peff@peff.net>
Date: Mon, 7 Dec 2020 14:11:05 -0500
Subject: [PATCH 8/9] commit-graph: replace packed_oid_list with oid_array

Our custom packed_oid_list data structure is really just an oid_array in
disguise. Let's switch to using the generic structure, which shortens
and simplifies the code slightly.

There's one slightly awkward part: in the old code we copied a hash
straight from the mmap'd on-disk data into the final object_id. And now
we'll copy to a temporary oid, which we'll then pass to
oid_array_append(). But this is an operation we have to do all over the
commit-graph code already, since it mostly uses object_id structs
internally. I also measured "git commit-graph --append", which triggers
this code path, and it showed no difference.

Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 commit-graph.c | 62 ++++++++++++--------------------------------------
 1 file changed, 15 insertions(+), 47 deletions(-)

diff --git a/commit-graph.c b/commit-graph.c
index 1ac3516cf5..4a718fd6e6 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -936,17 +936,11 @@ struct packed_commit_list {
 	int alloc;
 };
 
-struct packed_oid_list {
-	struct object_id *list;
-	int nr;
-	int alloc;
-};
-
 struct write_commit_graph_context {
 	struct repository *r;
 	struct object_directory *odb;
 	char *graph_name;
-	struct packed_oid_list oids;
+	struct oid_array oids;
 	struct packed_commit_list commits;
 	int num_extra_edges;
 	unsigned long approx_nr_objects;
@@ -1240,13 +1234,6 @@ static int write_graph_chunk_bloom_data(struct hashfile *f,
 	return 0;
 }
 
-static int oid_compare(const void *_a, const void *_b)
-{
-	const struct object_id *a = (const struct object_id *)_a;
-	const struct object_id *b = (const struct object_id *)_b;
-	return oidcmp(a, b);
-}
-
 static int add_packed_commits(const struct object_id *oid,
 			      struct packed_git *pack,
 			      uint32_t pos,
@@ -1267,10 +1254,7 @@ static int add_packed_commits(const struct object_id *oid,
 	if (type != OBJ_COMMIT)
 		return 0;
 
-	ALLOC_GROW(ctx->oids.list, ctx->oids.nr + 1, ctx->oids.alloc);
-	oidcpy(&(ctx->oids.list[ctx->oids.nr]), oid);
-	ctx->oids.nr++;
-
+	oid_array_append(&ctx->oids, oid);
 	set_commit_pos(ctx->r, oid);
 
 	return 0;
@@ -1281,9 +1265,7 @@ static void add_missing_parents(struct write_commit_graph_context *ctx, struct c
 	struct commit_list *parent;
 	for (parent = commit->parents; parent; parent = parent->next) {
 		if (!(parent->item->object.flags & REACHABLE)) {
-			ALLOC_GROW(ctx->oids.list, ctx->oids.nr + 1, ctx->oids.alloc);
-			oidcpy(&ctx->oids.list[ctx->oids.nr], &(parent->item->object.oid));
-			ctx->oids.nr++;
+			oid_array_append(&ctx->oids, &parent->item->object.oid);
 			parent->item->object.flags |= REACHABLE;
 		}
 	}
@@ -1302,7 +1284,7 @@ static void close_reachable(struct write_commit_graph_context *ctx)
 					ctx->oids.nr);
 	for (i = 0; i < ctx->oids.nr; i++) {
 		display_progress(ctx->progress, i + 1);
-		commit = lookup_commit(ctx->r, &ctx->oids.list[i]);
+		commit = lookup_commit(ctx->r, &ctx->oids.oid[i]);
 		if (commit)
 			commit->object.flags |= REACHABLE;
 	}
@@ -1319,7 +1301,7 @@ static void close_reachable(struct write_commit_graph_context *ctx)
 					0);
 	for (i = 0; i < ctx->oids.nr; i++) {
 		display_progress(ctx->progress, i + 1);
-		commit = lookup_commit(ctx->r, &ctx->oids.list[i]);
+		commit = lookup_commit(ctx->r, &ctx->oids.oid[i]);
 
 		if (!commit)
 			continue;
@@ -1339,7 +1321,7 @@ static void close_reachable(struct write_commit_graph_context *ctx)
 					ctx->oids.nr);
 	for (i = 0; i < ctx->oids.nr; i++) {
 		display_progress(ctx->progress, i + 1);
-		commit = lookup_commit(ctx->r, &ctx->oids.list[i]);
+		commit = lookup_commit(ctx->r, &ctx->oids.oid[i]);
 
 		if (commit)
 			commit->object.flags &= ~REACHABLE;
@@ -1567,9 +1549,7 @@ static int fill_oids_from_commits(struct write_commit_graph_context *ctx,
 
 	oidset_iter_init(commits, &iter);
 	while ((oid = oidset_iter_next(&iter))) {
-		ALLOC_GROW(ctx->oids.list, ctx->oids.nr + 1, ctx->oids.alloc);
-		oidcpy(&ctx->oids.list[ctx->oids.nr], oid);
-		ctx->oids.nr++;
+		oid_array_append(&ctx->oids, oid);
 	}
 
 	return 0;
@@ -1599,16 +1579,14 @@ static void copy_oids_to_commits(struct write_commit_graph_context *ctx)
 		ctx->progress = start_delayed_progress(
 			_("Finding extra edges in commit graph"),
 			ctx->oids.nr);
-	QSORT(ctx->oids.list, ctx->oids.nr, oid_compare);
-	for (i = 0; i < ctx->oids.nr; i++) {
+	oid_array_sort(&ctx->oids);
+	for (i = 0; i < ctx->oids.nr; i = oid_array_next_unique(&ctx->oids, i)) {
 		unsigned int num_parents;
 
 		display_progress(ctx->progress, i + 1);
-		if (i > 0 && oideq(&ctx->oids.list[i - 1], &ctx->oids.list[i]))
-			continue;
 
 		ALLOC_GROW(ctx->commits.list, ctx->commits.nr + 1, ctx->commits.alloc);
-		ctx->commits.list[ctx->commits.nr] = lookup_commit(ctx->r, &ctx->oids.list[i]);
+		ctx->commits.list[ctx->commits.nr] = lookup_commit(ctx->r, &ctx->oids.oid[i]);
 
 		if (ctx->split && flags != COMMIT_GRAPH_SPLIT_REPLACE &&
 		    commit_graph_position(ctx->commits.list[ctx->commits.nr]) != COMMIT_NOT_FROM_GRAPH)
@@ -2199,26 +2177,16 @@ int write_commit_graph(struct object_directory *odb,
 	}
 
 	ctx->approx_nr_objects = approximate_object_count();
-	ctx->oids.alloc = ctx->approx_nr_objects / 32;
 
-	if (ctx->split && opts && ctx->oids.alloc > opts->max_commits)
-		ctx->oids.alloc = opts->max_commits;
-
-	if (ctx->append) {
+	if (ctx->append)
 		prepare_commit_graph_one(ctx->r, ctx->odb);
-		if (ctx->r->objects->commit_graph)
-			ctx->oids.alloc += ctx->r->objects->commit_graph->num_commits;
-	}
-
-	if (ctx->oids.alloc < 1024)
-		ctx->oids.alloc = 1024;
-	ALLOC_ARRAY(ctx->oids.list, ctx->oids.alloc);
 
 	if (ctx->append && ctx->r->objects->commit_graph) {
 		struct commit_graph *g = ctx->r->objects->commit_graph;
 		for (i = 0; i < g->num_commits; i++) {
-			const unsigned char *hash = g->chunk_oid_lookup + g->hash_len * i;
-			hashcpy(ctx->oids.list[ctx->oids.nr++].hash, hash);
+			struct object_id oid;
+			hashcpy(oid.hash, g->chunk_oid_lookup + g->hash_len * i);
+			oid_array_append(&ctx->oids, &oid);
 		}
 	}
 
@@ -2274,7 +2242,7 @@ int write_commit_graph(struct object_directory *odb,
 cleanup:
 	free(ctx->graph_name);
 	free(ctx->commits.list);
-	free(ctx->oids.list);
+	oid_array_clear(&ctx->oids);
 
 	if (ctx->commit_graph_filenames_after) {
 		for (i = 0; i < ctx->num_commit_graphs_after; i++) {

From 3361390cbedc962adcddcfd98676695c125fa180 Mon Sep 17 00:00:00 2001
From: Jeff King <peff@peff.net>
Date: Mon, 7 Dec 2020 14:11:08 -0500
Subject: [PATCH 9/9] commit-graph: use size_t for array allocation and
 indexing

Our packed_commit_list is an array of pointers to commit structs. We use
"int" for the allocation, which is 32-bit even on 64-bit platforms. This
isn't likely to overflow in practice (we're writing commit graphs, so
you'd need to actually have billions of unique commits in the
repository). But it's good practice to use size_t for allocations.

Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 commit-graph.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/commit-graph.c b/commit-graph.c
index 4a718fd6e6..06f8dc1d89 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -932,8 +932,8 @@ struct tree *get_commit_tree_in_graph(struct repository *r, const struct commit
 
 struct packed_commit_list {
 	struct commit **list;
-	int nr;
-	int alloc;
+	size_t nr;
+	size_t alloc;
 };
 
 struct write_commit_graph_context {