From e770f36307202b1e87e57a3f355dcdac89d4f5aa Mon Sep 17 00:00:00 2001 From: Jeff King Date: Mon, 18 Nov 2024 04:54:40 -0500 Subject: [PATCH 1/6] object-file: prefer array-of-bytes initializer for hash literals MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We hard-code a few well-known hash values for empty trees and blobs in both sha1 and sha256 formats. We do so with string literals like this: #define EMPTY_TREE_SHA256_BIN_LITERAL \ "\x6e\xf1\x9b\x41\x22\x5c\x53\x69\xf1\xc1" \ "\x04\xd4\x5d\x8d\x85\xef\xa9\xb0\x57\xb5" \ "\x3b\x14\xb4\xb9\xb9\x39\xdd\x74\xde\xcc" \ "\x53\x21" and then use it to initialize the hash field of an object_id struct. That hash field is exactly 32 bytes long (the size we need for sha256). But the string literal above is actually 33 bytes long due to the NUL terminator. This is legal in C, and the NUL is ignored. Side note on legality: in general excess initializer elements are forbidden, and gcc will warn on both of these: char foo[3] = { 'h', 'u', 'g', 'e' }; char bar[3] = "VeryLongString"; I couldn't find specific language in the standard allowing initialization from a string literal where _just_ the NUL is ignored, but C99 section 6.7.8 (Initialization), paragraph 32 shows this exact case as "example 8". However, the upcoming gcc 15 will start warning for this case (when compiled with -Wextra via DEVELOPER=1): CC object-file.o object-file.c:52:9: warning: initializer-string for array of ‘unsigned char’ is too long [-Wunterminated-string-initialization] 52 | "\x6e\xf1\x9b\x41\x22\x5c\x53\x69\xf1\xc1" \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ object-file.c:79:17: note: in expansion of macro ‘EMPTY_TREE_SHA256_BIN_LITERAL’ which is understandable. Even though this is not a bug for us, since we do not care about the NUL terminator (and are just using the literal as a convenient format), it would be easy to accidentally create an array that was mistakenly unterminated. We can avoid this warning by switching the initializer to an actual array of unsigned values. That arguably demonstrates our intent more clearly anyway. Reported-by: Sam James Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- object-file.c | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/object-file.c b/object-file.c index b1a3463852..8101585616 100644 --- a/object-file.c +++ b/object-file.c @@ -45,23 +45,27 @@ #define MAX_HEADER_LEN 32 -#define EMPTY_TREE_SHA1_BIN_LITERAL \ - "\x4b\x82\x5d\xc6\x42\xcb\x6e\xb9\xa0\x60" \ - "\xe5\x4b\xf8\xd6\x92\x88\xfb\xee\x49\x04" -#define EMPTY_TREE_SHA256_BIN_LITERAL \ - "\x6e\xf1\x9b\x41\x22\x5c\x53\x69\xf1\xc1" \ - "\x04\xd4\x5d\x8d\x85\xef\xa9\xb0\x57\xb5" \ - "\x3b\x14\xb4\xb9\xb9\x39\xdd\x74\xde\xcc" \ - "\x53\x21" +#define EMPTY_TREE_SHA1_BIN_LITERAL { \ + 0x4b, 0x82, 0x5d, 0xc6, 0x42, 0xcb, 0x6e, 0xb9, 0xa0, 0x60, \ + 0xe5, 0x4b, 0xf8, 0xd6, 0x92, 0x88, 0xfb, 0xee, 0x49, 0x04 \ +} +#define EMPTY_TREE_SHA256_BIN_LITERAL { \ + 0x6e, 0xf1, 0x9b, 0x41, 0x22, 0x5c, 0x53, 0x69, 0xf1, 0xc1, \ + 0x04, 0xd4, 0x5d, 0x8d, 0x85, 0xef, 0xa9, 0xb0, 0x57, 0xb5, \ + 0x3b, 0x14, 0xb4, 0xb9, 0xb9, 0x39, 0xdd, 0x74, 0xde, 0xcc, \ + 0x53, 0x21 \ +} -#define EMPTY_BLOB_SHA1_BIN_LITERAL \ - "\xe6\x9d\xe2\x9b\xb2\xd1\xd6\x43\x4b\x8b" \ - "\x29\xae\x77\x5a\xd8\xc2\xe4\x8c\x53\x91" -#define EMPTY_BLOB_SHA256_BIN_LITERAL \ - "\x47\x3a\x0f\x4c\x3b\xe8\xa9\x36\x81\xa2" \ - "\x67\xe3\xb1\xe9\xa7\xdc\xda\x11\x85\x43" \ - "\x6f\xe1\x41\xf7\x74\x91\x20\xa3\x03\x72" \ - "\x18\x13" +#define EMPTY_BLOB_SHA1_BIN_LITERAL { \ + 0xe6, 0x9d, 0xe2, 0x9b, 0xb2, 0xd1, 0xd6, 0x43, 0x4b, 0x8b, \ + 0x29, 0xae, 0x77, 0x5a, 0xd8, 0xc2, 0xe4, 0x8c, 0x53, 0x91 \ +} +#define EMPTY_BLOB_SHA256_BIN_LITERAL { \ + 0x47, 0x3a, 0x0f, 0x4c, 0x3b, 0xe8, 0xa9, 0x36, 0x81, 0xa2, \ + 0x67, 0xe3, 0xb1, 0xe9, 0xa7, 0xdc, 0xda, 0x11, 0x85, 0x43, \ + 0x6f, 0xe1, 0x41, 0xf7, 0x74, 0x91, 0x20, 0xa3, 0x03, 0x72, \ + 0x18, 0x13 \ +} static const struct object_id empty_tree_oid = { .hash = EMPTY_TREE_SHA1_BIN_LITERAL, From 2911f9ed1eccf92c4a98c50c3a88abb2c03a8126 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Mon, 18 Nov 2024 04:55:07 -0500 Subject: [PATCH 2/6] object-file: drop confusing oid initializer of empty_tree struct We treat the empty tree specially, providing an in-memory "cached" copy, which allows you to diff against it even if the object doesn't exist in the repository. This is implemented as part of the larger cached_object subsystem, but we use a stand-alone empty_tree struct. We initialize the oid of that struct using EMPTY_TREE_SHA1_BIN_LITERAL. At first glance, that seems like a bug; how could this ever work for sha256 repositories? The answer is that we never look at the oid field! The oid field is used to look up entries added by pretend_object_file() to the cached_objects array. But for our stand-alone entry, we look for it independently using the_hash_algo->empty_tree, which will point to the correct algo struct for the repository. This happened in 62ba93eaa9 (sha1_file: convert cached object code to struct object_id, 2018-05-02), which even mentions that this field is never used. Let's reduce confusion for anybody reading this code by replacing the sha1 initializer with a comment. The resulting field will be all-zeroes, so any violation of our assumption that the oid field is not used will break equally for sha1 and sha256. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- object-file.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/object-file.c b/object-file.c index 8101585616..19fc4afa43 100644 --- a/object-file.c +++ b/object-file.c @@ -326,9 +326,7 @@ static struct cached_object { static int cached_object_nr, cached_object_alloc; static struct cached_object empty_tree = { - .oid = { - .hash = EMPTY_TREE_SHA1_BIN_LITERAL, - }, + /* no oid needed; we'll look it up manually based on the_hash_algo */ .type = OBJ_TREE, .buf = "", }; From b2a95dfd63e812dc4abe5750371f2f0596d2d063 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Mon, 18 Nov 2024 04:55:11 -0500 Subject: [PATCH 3/6] object-file: move empty_tree struct into find_cached_object() The fake empty_tree struct is a static global, but the only code that looks at it is find_cached_object(). The struct itself is a little odd, with an invalid "oid" field that is handled specially by that function. Since it's really just an implementation detail, let's move it to a static within the function. That future-proofs against other code trying to use it and seeing the weird oid value. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- object-file.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/object-file.c b/object-file.c index 19fc4afa43..4d4280543e 100644 --- a/object-file.c +++ b/object-file.c @@ -325,14 +325,13 @@ static struct cached_object { } *cached_objects; static int cached_object_nr, cached_object_alloc; -static struct cached_object empty_tree = { - /* no oid needed; we'll look it up manually based on the_hash_algo */ - .type = OBJ_TREE, - .buf = "", -}; - static struct cached_object *find_cached_object(const struct object_id *oid) { + static struct cached_object empty_tree = { + /* no oid needed; we'll look it up manually based on the_hash_algo */ + .type = OBJ_TREE, + .buf = "", + }; int i; struct cached_object *co = cached_objects; From 9202ffcf1064f883aacc4aba8016918e1d8d8243 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Mon, 18 Nov 2024 04:55:15 -0500 Subject: [PATCH 4/6] object-file: drop oid field from find_cached_object() return value The pretend_object_file() function adds to an array mapping oids to object contents, which are later retrieved with find_cached_object(). We naturally need to store the oid for each entry, since it's the lookup key. But find_cached_object() also returns a hard-coded empty_tree object. There we don't care about its oid field and instead compare against the_hash_algo->empty_tree. The oid field is left as all-zeroes. This all works, but it means that the cached_object struct we return from find_cached_object() may or may not have a valid oid field, depend whether it is the hard-coded tree or came from pretend_object_file(). Nobody looks at the field, so there's no bug. But let's future-proof it by returning only the object contents themselves, not the oid. We'll continue to call this "struct cached_object", and the array entry mapping the key to those contents will be a "cached_object_entry". This would also let us swap out the array for a better data structure (like a hashmap) if we chose, but there's not much point. The only code that adds an entry is git-blame, which adds at most a single entry per process. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- object-file.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/object-file.c b/object-file.c index 4d4280543e..67a6731066 100644 --- a/object-file.c +++ b/object-file.c @@ -317,27 +317,28 @@ int hash_algo_by_length(int len) * to write them into the object store (e.g. a browse-only * application). */ -static struct cached_object { +static struct cached_object_entry { struct object_id oid; - enum object_type type; - const void *buf; - unsigned long size; + struct cached_object { + enum object_type type; + const void *buf; + unsigned long size; + } value; } *cached_objects; static int cached_object_nr, cached_object_alloc; static struct cached_object *find_cached_object(const struct object_id *oid) { static struct cached_object empty_tree = { - /* no oid needed; we'll look it up manually based on the_hash_algo */ .type = OBJ_TREE, .buf = "", }; int i; - struct cached_object *co = cached_objects; + struct cached_object_entry *co = cached_objects; for (i = 0; i < cached_object_nr; i++, co++) { if (oideq(&co->oid, oid)) - return co; + return &co->value; } if (oideq(oid, the_hash_algo->empty_tree)) return &empty_tree; @@ -1850,7 +1851,7 @@ int oid_object_info(struct repository *r, int pretend_object_file(void *buf, unsigned long len, enum object_type type, struct object_id *oid) { - struct cached_object *co; + struct cached_object_entry *co; char *co_buf; hash_object_file(the_hash_algo, buf, len, type, oid); @@ -1859,11 +1860,11 @@ int pretend_object_file(void *buf, unsigned long len, enum object_type type, return 0; ALLOC_GROW(cached_objects, cached_object_nr + 1, cached_object_alloc); co = &cached_objects[cached_object_nr++]; - co->size = len; - co->type = type; + co->value.size = len; + co->value.type = type; co_buf = xmalloc(len); memcpy(co_buf, buf, len); - co->buf = co_buf; + co->value.buf = co_buf; oidcpy(&co->oid, oid); return 0; } From e37feea00b2b81c0295fddb4f5137d12ea1825c0 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Mon, 18 Nov 2024 04:55:19 -0500 Subject: [PATCH 5/6] object-file: treat cached_object values as const The cached-object API maps oids to in-memory entries. Once inserted, these entries should be immutable. Let's return them from the find_cached_object() call with a const tag to make this clear. Suggested-by: Patrick Steinhardt Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- object-file.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/object-file.c b/object-file.c index 67a6731066..ec62e5fb3b 100644 --- a/object-file.c +++ b/object-file.c @@ -327,14 +327,14 @@ static struct cached_object_entry { } *cached_objects; static int cached_object_nr, cached_object_alloc; -static struct cached_object *find_cached_object(const struct object_id *oid) +static const struct cached_object *find_cached_object(const struct object_id *oid) { - static struct cached_object empty_tree = { + static const struct cached_object empty_tree = { .type = OBJ_TREE, .buf = "", }; int i; - struct cached_object_entry *co = cached_objects; + const struct cached_object_entry *co = cached_objects; for (i = 0; i < cached_object_nr; i++, co++) { if (oideq(&co->oid, oid)) @@ -1629,7 +1629,7 @@ static int do_oid_object_info_extended(struct repository *r, struct object_info *oi, unsigned flags) { static struct object_info blank_oi = OBJECT_INFO_INIT; - struct cached_object *co; + const struct cached_object *co; struct pack_entry e; int rtype; const struct object_id *real = oid; From 2af8ead52be9b72f3db76c9016cc4444eea33544 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Mon, 18 Nov 2024 04:55:22 -0500 Subject: [PATCH 6/6] object-file: inline empty tree and blob literals We define macros with the bytes of the empty trees and blobs for sha1 and sha256. But since e1ccd7e2b1 (sha1_file: only expose empty object constants through git_hash_algo, 2018-05-02), those are used only for initializing the git_hash_algo entries. Any other code using the macros directly would be suspicious, since a hash_algo pointer is the level of indirection we use to make everything work with both sha1 and sha256. So let's future proof against code doing the wrong thing by dropping the macros entirely and just initializing the structs directly. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- object-file.c | 47 ++++++++++++++++++++--------------------------- 1 file changed, 20 insertions(+), 27 deletions(-) diff --git a/object-file.c b/object-file.c index ec62e5fb3b..891eaa2b4b 100644 --- a/object-file.c +++ b/object-file.c @@ -44,35 +44,18 @@ /* The maximum size for an object header. */ #define MAX_HEADER_LEN 32 - -#define EMPTY_TREE_SHA1_BIN_LITERAL { \ - 0x4b, 0x82, 0x5d, 0xc6, 0x42, 0xcb, 0x6e, 0xb9, 0xa0, 0x60, \ - 0xe5, 0x4b, 0xf8, 0xd6, 0x92, 0x88, 0xfb, 0xee, 0x49, 0x04 \ -} -#define EMPTY_TREE_SHA256_BIN_LITERAL { \ - 0x6e, 0xf1, 0x9b, 0x41, 0x22, 0x5c, 0x53, 0x69, 0xf1, 0xc1, \ - 0x04, 0xd4, 0x5d, 0x8d, 0x85, 0xef, 0xa9, 0xb0, 0x57, 0xb5, \ - 0x3b, 0x14, 0xb4, 0xb9, 0xb9, 0x39, 0xdd, 0x74, 0xde, 0xcc, \ - 0x53, 0x21 \ -} - -#define EMPTY_BLOB_SHA1_BIN_LITERAL { \ - 0xe6, 0x9d, 0xe2, 0x9b, 0xb2, 0xd1, 0xd6, 0x43, 0x4b, 0x8b, \ - 0x29, 0xae, 0x77, 0x5a, 0xd8, 0xc2, 0xe4, 0x8c, 0x53, 0x91 \ -} -#define EMPTY_BLOB_SHA256_BIN_LITERAL { \ - 0x47, 0x3a, 0x0f, 0x4c, 0x3b, 0xe8, 0xa9, 0x36, 0x81, 0xa2, \ - 0x67, 0xe3, 0xb1, 0xe9, 0xa7, 0xdc, 0xda, 0x11, 0x85, 0x43, \ - 0x6f, 0xe1, 0x41, 0xf7, 0x74, 0x91, 0x20, 0xa3, 0x03, 0x72, \ - 0x18, 0x13 \ -} - static const struct object_id empty_tree_oid = { - .hash = EMPTY_TREE_SHA1_BIN_LITERAL, + .hash = { + 0x4b, 0x82, 0x5d, 0xc6, 0x42, 0xcb, 0x6e, 0xb9, 0xa0, 0x60, + 0xe5, 0x4b, 0xf8, 0xd6, 0x92, 0x88, 0xfb, 0xee, 0x49, 0x04 + }, .algo = GIT_HASH_SHA1, }; static const struct object_id empty_blob_oid = { - .hash = EMPTY_BLOB_SHA1_BIN_LITERAL, + .hash = { + 0xe6, 0x9d, 0xe2, 0x9b, 0xb2, 0xd1, 0xd6, 0x43, 0x4b, 0x8b, + 0x29, 0xae, 0x77, 0x5a, 0xd8, 0xc2, 0xe4, 0x8c, 0x53, 0x91 + }, .algo = GIT_HASH_SHA1, }; static const struct object_id null_oid_sha1 = { @@ -80,11 +63,21 @@ static const struct object_id null_oid_sha1 = { .algo = GIT_HASH_SHA1, }; static const struct object_id empty_tree_oid_sha256 = { - .hash = EMPTY_TREE_SHA256_BIN_LITERAL, + .hash = { + 0x6e, 0xf1, 0x9b, 0x41, 0x22, 0x5c, 0x53, 0x69, 0xf1, 0xc1, + 0x04, 0xd4, 0x5d, 0x8d, 0x85, 0xef, 0xa9, 0xb0, 0x57, 0xb5, + 0x3b, 0x14, 0xb4, 0xb9, 0xb9, 0x39, 0xdd, 0x74, 0xde, 0xcc, + 0x53, 0x21 + }, .algo = GIT_HASH_SHA256, }; static const struct object_id empty_blob_oid_sha256 = { - .hash = EMPTY_BLOB_SHA256_BIN_LITERAL, + .hash = { + 0x47, 0x3a, 0x0f, 0x4c, 0x3b, 0xe8, 0xa9, 0x36, 0x81, 0xa2, + 0x67, 0xe3, 0xb1, 0xe9, 0xa7, 0xdc, 0xda, 0x11, 0x85, 0x43, + 0x6f, 0xe1, 0x41, 0xf7, 0x74, 0x91, 0x20, 0xa3, 0x03, 0x72, + 0x18, 0x13 + }, .algo = GIT_HASH_SHA256, }; static const struct object_id null_oid_sha256 = {