From 8216cf9419a3a8dfcd65b4caad72c4e6cb9c0513 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 25 Feb 2025 01:28:24 -0500 Subject: [PATCH 01/10] loose_object_info(): BUG() on inflating content with unknown type After unpack_loose_header() returns, it will have inflated not only the object header, but possibly some bytes of the object content. When we call unpack_loose_rest() to extract the actual content, it finds those extra bytes by skipping past the header's terminating NUL in the buffer. Like this: int bytes = strlen(buffer) + 1; n = stream->total_out - bytes; ... memcpy(buf, (char *) buffer + bytes, n); This won't work with the OBJECT_INFO_ALLOW_UNKNOWN_TYPE flag, as there we allow a header of arbitrary size. We put into a strbuf, but feed only the final 32-byte chunk we read to unpack_loose_rest(). In that case stream->total_out may unexpectedly large, and thus our "n" will be large, causing an out-of-bounds read (we do check it against our allocated buffer size, which prevents an out-of-bounds write). Probably this could be made to work by feeding the strbuf to unpack_loose_rest(), along with adjusting some types (e.g., "bytes" would need to be a size_t, since it is no longer operating on a 32-byte buffer). But I don't think it's possible to actually trigger this in practice. The only caller who passes ALLOW_UNKNOWN_TYPE is cat-file, which only allows it with the "-t" and "-s" options (neither of which access the content). There is one way you can _almost_ trigger it: the oid compat routines (i.e., accessing sha1 via sha256 names and vice versa) will convert objects on the fly (which requires access to the content) using the same flags that were passed in. So in theory this: t='some very large type field that causes an extra inflate call' sha1_oid=$(git hash-object -w -t "$t" file) sha256_oid=$(git rev-parse --output-object-format=sha256 $sha1_oid) git cat-file --allow-unknown-type -s $sha256_oid would try to access the content. But it doesn't work, because using compat objects requires an entry in the .git/objects/loose-object-idx file, and we don't generate such an entry for non-standard types (see the "compat" section of write_object_file_literally()). If we use "t=blob" instead, then it does access the compat object, but it doesn't trigger the problem (because "blob" is a standard short type name, and it fits in the initial 32-byte buffer). So given that this is almost a memory error bug, I think it's worth addressing. But because we can't actually trigger the situation, I'm hesitant to try a fix that we can't run. Instead let's document the restriction and protect ourselves from the out-of-bounds read by adding a BUG() check. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- object-file.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/object-file.c b/object-file.c index 619f039ebc..f36846b5ba 100644 --- a/object-file.c +++ b/object-file.c @@ -1491,6 +1491,8 @@ static int loose_object_info(struct repository *r, if (!oi->contentp) break; + if (hdrbuf.len) + BUG("unpacking content with unknown types not yet supported"); *oi->contentp = unpack_loose_rest(&stream, hdr, *oi->sizep, oid); if (*oi->contentp) goto cleanup; From 03e7c454e9bc15f4fa046bd3a5f6147bbd0480e6 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 25 Feb 2025 01:29:00 -0500 Subject: [PATCH 02/10] unpack_loose_header(): simplify next_out assignment When using OBJECT_INFO_ALLOW_UNKNOWN_TYPE to unpack a header that doesn't fit into our initial 32-byte buffer, we loop over calls git_inflate(), feeding it our buffer to the "next_out" pointer each time. As the code is written, we reset next_out after each inflate call (and after reading the output), ready for the next loop. This isn't wrong, but there are a few advantages to setting up "next_out" right before each inflate call, rather than after: 1. It drops a few duplicated lines of code. 2. It makes it obvious that we always feed a fresh buffer on each call (and thus can never see Z_BUF_ERROR due to due to a lack of output space). 3. After we exit the loop, we'll leave stream->next_out pointing to the end of the fetched data (this is how zlib callers find out how much data is in the buffer). This doesn't matter in practice, since nobody looks at it again. But it's probably the least-surprising thing to do, as it matches how next_out is left when the whole thing fits in the initial 32-byte buffer (and we don't enter the loop at all). Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- object-file.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/object-file.c b/object-file.c index f36846b5ba..e48da375bd 100644 --- a/object-file.c +++ b/object-file.c @@ -1296,18 +1296,17 @@ enum unpack_loose_header_result unpack_loose_header(git_zstream *stream, * reading the stream. */ strbuf_add(header, buffer, stream->next_out - (unsigned char *)buffer); - stream->next_out = buffer; - stream->avail_out = bufsiz; do { + stream->next_out = buffer; + stream->avail_out = bufsiz; + obj_read_unlock(); status = git_inflate(stream, 0); obj_read_lock(); strbuf_add(header, buffer, stream->next_out - (unsigned char *)buffer); if (memchr(buffer, '\0', stream->next_out - (unsigned char *)buffer)) return 0; - stream->next_out = buffer; - stream->avail_out = bufsiz; } while (status != Z_STREAM_END); return ULHR_TOO_LONG; } From e7ac344d7018d4537eda29d5a09c047a35f27364 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 25 Feb 2025 01:29:40 -0500 Subject: [PATCH 03/10] unpack_loose_header(): report headers without NUL as "bad" If a caller asks us to read the whole loose object header value into a strbuf (e.g., via "cat-file --allow-unknown-type"), we'll keep reading until we see a NUL byte marking the end of the header. If we hit Z_STREAM_END before seeing the NUL, we obviously have to stop. But we return ULHR_TOO_LONG, which doesn't make any sense. The "too long" return code is used in the normal, 32-byte limited mode to indicate that we stopped looking. There is no such thing as "too long" here, as we'd keep reading forever until we see the end of stream or the NUL. Instead, we should return ULHR_BAD. The loose object has no NUL marking the end of header, so it is malformed. The behavior difference is slight; in either case we'd consider the object unreadable and refuse to go further. The only difference is the specific error message we produce. There's no test case here, as we'd need to generate a valid zlib stream without a NUL. That's not something Git will do without writing new custom code. And in the next patch we'll fix another bug in this area which will make this easier to do (and we will test it then). Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- object-file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/object-file.c b/object-file.c index e48da375bd..b1c33dbb63 100644 --- a/object-file.c +++ b/object-file.c @@ -1308,7 +1308,7 @@ enum unpack_loose_header_result unpack_loose_header(git_zstream *stream, if (memchr(buffer, '\0', stream->next_out - (unsigned char *)buffer)) return 0; } while (status != Z_STREAM_END); - return ULHR_TOO_LONG; + return ULHR_BAD; } static void *unpack_loose_rest(git_zstream *stream, From b748ddb7a470b952b8a5596649f7433278d7f2c4 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 25 Feb 2025 01:29:58 -0500 Subject: [PATCH 04/10] unpack_loose_header(): fix infinite loop on broken zlib input When reading a loose object, we first try to expand the first 32 bytes to read the type+size header. This is enough for any of the normal Git types. But since 46f034483e (sha1_file: support reading from a loose object of unknown type, 2015-05-03), the caller can also ask us to parse any unknown names, which can be much longer. In this case we keep inflating until we find the NUL at the end of the header, or hit Z_STREAM_END. But what if zlib can't make forward progress? For example, if the loose object file is truncated, we'll have no more data to feed it. It will return Z_BUF_ERROR, and we'll just loop infinitely, calling git_inflate() over and over but never seeing new bytes nor an end-of-stream marker. We can fix this by only looping when we think we can make forward progress. This will always be Z_OK in this case. In other code we might also be able to continue on Z_BUF_ERROR, but: - We will never see Z_BUF_ERROR because the output buffer is full; we always feed a fresh 32-byte buffer on each call to git_inflate(). - We may see Z_BUF_ERROR if we run out of input. But since we've fed the whole mmap'd buffer to zlib, if it runs out of input there is nothing more we can do. So if we don't see Z_OK (and didn't see the end-of-header NUL, otherwise we'd have broken out of the loop), then we should stop looping and return an error. The test case shows an example where the input is truncated (which gives us the input Z_BUF_ERROR case above). Although we do operate on objects we might get from an untrusted remote, I don't think the security implications of this bug are too great. It can only trigger if both of these are true: - You're reading a loose object whose on-disk representation was written by an attacker. So fetching an object (or receiving a push) are mostly OK, because even with unpack-objects it is our local, trusted code that writes out the object file. The exception may be fetching from an untrusted local repo, or using dumb-http, which copies objects verbatim. But... - The only code path which triggers the inflate loop is cat-file's --allow-unknown-type option. This is unlikely to be called at all outside of debugging. But I also suspect that objects with non-standard types (or that are truncated) would not survive the usual fetch/receive checks in the first place. So I think it would be quite hard to trick somebody into running the infinite loop, and we can just fix the bug. Co-authored-by: Taylor Blau Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- object-file.c | 2 +- t/t1006-cat-file.sh | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/object-file.c b/object-file.c index b1c33dbb63..5086633e21 100644 --- a/object-file.c +++ b/object-file.c @@ -1307,7 +1307,7 @@ enum unpack_loose_header_result unpack_loose_header(git_zstream *stream, strbuf_add(header, buffer, stream->next_out - (unsigned char *)buffer); if (memchr(buffer, '\0', stream->next_out - (unsigned char *)buffer)) return 0; - } while (status != Z_STREAM_END); + } while (status == Z_OK); return ULHR_BAD; } diff --git a/t/t1006-cat-file.sh b/t/t1006-cat-file.sh index e0c6482797..78fd970c8a 100755 --- a/t/t1006-cat-file.sh +++ b/t/t1006-cat-file.sh @@ -817,6 +817,25 @@ test_expect_success 'cat-file -t and -s on corrupt loose object' ' ) ' +test_expect_success 'truncated object with --allow-unknown-type' - <<\EOT + objtype='a really long type name that exceeds the 32-byte limit' && + blob=$(git hash-object -w --literally -t "$objtype" /dev/null) && + objpath=.git/objects/$(test_oid_to_path "$blob") && + + # We want to truncate the object far enough in that we don't hit the + # end while inflating the first 32 bytes (since we want to have to dig + # for the trailing NUL of the header). But we don't want to go too far, + # since our header isn't very big. And of course we are counting + # deflated zlib bytes in the on-disk file, so it's a bit of a guess. + # Empirically 50 seems to work. + mv "$objpath" obj.bak && + test_when_finished 'mv obj.bak "$objpath"' && + test_copy_bytes 50 "$objpath" && + + test_must_fail git cat-file --allow-unknown-type -t $blob 2>err && + test_grep "unable to unpack $blob header" err +EOT + # Tests for git cat-file --follow-symlinks test_expect_success 'prep for symlink tests' ' echo_without_newline "$hello_content" >morx && From 0b1493c2d49222ce07b73016bb156fecb5999bb9 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 25 Feb 2025 01:30:26 -0500 Subject: [PATCH 05/10] git_inflate(): skip zlib_post_call() sanity check on Z_NEED_DICT This fixes a case where malformed object input can cause us to hit a BUG() call in the git-zlib.c code. The zlib format allows the use of preset dictionaries to reduce the size of deflated data. The checksum of the dictionary is computed by the deflate code and goes into the stream. On the inflating side, zlib sees the dictionary checksum and returns Z_NEED_DICT, asking the caller to provide the dictionary data via inflateSetDictionary(). This should never happen in Git, because we never provide a dictionary for deflating (and if we get a stream that mentions a dictionary, we have no idea how to provide it). So normally Z_NEED_DICT is a hard error for us. But something interesting happens if we _do_ happen to see it (e.g., because of a corrupt or malicious input). In git_inflate() as we loop over calls to zlib's inflate(), we translate between our large-integer git_zstream values and zlib's native z_stream types, copying in and out with zlib_pre_call() and zlib_post_call(). In zlib_post_call() we have a few sanity checks, including one that checks that the number of bytes consumed by zlib (as measured by it moving the "next_in" pointer) is equal to the movement of its "total_in" count. But these do not correspond when we see Z_NEED_DICT! Zlib consumes the bytes from the input buffer but it does not increment total_in. And so we hit the BUG("total_in mismatch") call. There are a few options here: - We could ditch that BUG() check. It is making too many assumptions about how zlib updates these values. But it does have value in most cases as a sanity check on the values we're copying. - We could skip the zlib_post_call() entirely when we see Z_NEED_DICT. We know that it's hard error for us, so we should just send the status up the stack and let the caller bail. The downside is that if we ever did want to support dictionaries, we couldn't (the git_zstream will be out of sync, since we never copied its values back from the z_stream). - We could continue to call zlib_post_call(), but skip just that BUG() check if the status is Z_NEED_DICT. This keeps git_inflate() as a thin wrapper around inflate(), and would let us later support dictionaries for some calls if we wanted to. This patch uses the third approach. It seems like the least-surprising thing to keep git_inflate() a close to inflate() as possible. And while it makes the diff a bit larger (since we have to pass the status down to to the zlib_post_call() function), it's a static local function, and every caller by definition will have just made a zlib call (and so will have a status integer). Co-authored-by: Taylor Blau Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- git-zlib.c | 27 ++++++++++++++++----------- t/t1006-cat-file.sh | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 11 deletions(-) diff --git a/git-zlib.c b/git-zlib.c index d43bbeb6da..c2d683528b 100644 --- a/git-zlib.c +++ b/git-zlib.c @@ -45,7 +45,7 @@ static void zlib_pre_call(git_zstream *s) s->z.avail_out = zlib_buf_cap(s->avail_out); } -static void zlib_post_call(git_zstream *s) +static void zlib_post_call(git_zstream *s, int status) { unsigned long bytes_consumed; unsigned long bytes_produced; @@ -54,7 +54,12 @@ static void zlib_post_call(git_zstream *s) bytes_produced = s->z.next_out - s->next_out; if (s->z.total_out != s->total_out + bytes_produced) BUG("total_out mismatch"); - if (s->z.total_in != s->total_in + bytes_consumed) + /* + * zlib does not update total_in when it returns Z_NEED_DICT, + * causing a mismatch here. Skip the sanity check in that case. + */ + if (status != Z_NEED_DICT && + s->z.total_in != s->total_in + bytes_consumed) BUG("total_in mismatch"); s->total_out = s->z.total_out; @@ -71,7 +76,7 @@ void git_inflate_init(git_zstream *strm) zlib_pre_call(strm); status = inflateInit(&strm->z); - zlib_post_call(strm); + zlib_post_call(strm, status); if (status == Z_OK) return; die("inflateInit: %s (%s)", zerr_to_string(status), @@ -89,7 +94,7 @@ void git_inflate_init_gzip_only(git_zstream *strm) zlib_pre_call(strm); status = inflateInit2(&strm->z, windowBits); - zlib_post_call(strm); + zlib_post_call(strm, status); if (status == Z_OK) return; die("inflateInit2: %s (%s)", zerr_to_string(status), @@ -102,7 +107,7 @@ void git_inflate_end(git_zstream *strm) zlib_pre_call(strm); status = inflateEnd(&strm->z); - zlib_post_call(strm); + zlib_post_call(strm, status); if (status == Z_OK) return; error("inflateEnd: %s (%s)", zerr_to_string(status), @@ -121,7 +126,7 @@ int git_inflate(git_zstream *strm, int flush) ? 0 : flush); if (status == Z_MEM_ERROR) die("inflate: out of memory"); - zlib_post_call(strm); + zlib_post_call(strm, status); /* * Let zlib work another round, while we can still @@ -163,7 +168,7 @@ void git_deflate_init(git_zstream *strm, int level) memset(strm, 0, sizeof(*strm)); zlib_pre_call(strm); status = deflateInit(&strm->z, level); - zlib_post_call(strm); + zlib_post_call(strm, status); if (status == Z_OK) return; die("deflateInit: %s (%s)", zerr_to_string(status), @@ -179,7 +184,7 @@ static void do_git_deflate_init(git_zstream *strm, int level, int windowBits) status = deflateInit2(&strm->z, level, Z_DEFLATED, windowBits, 8, Z_DEFAULT_STRATEGY); - zlib_post_call(strm); + zlib_post_call(strm, status); if (status == Z_OK) return; die("deflateInit2: %s (%s)", zerr_to_string(status), @@ -210,7 +215,7 @@ int git_deflate_abort(git_zstream *strm) zlib_pre_call(strm); status = deflateEnd(&strm->z); - zlib_post_call(strm); + zlib_post_call(strm, status); return status; } @@ -230,7 +235,7 @@ int git_deflate_end_gently(git_zstream *strm) zlib_pre_call(strm); status = deflateEnd(&strm->z); - zlib_post_call(strm); + zlib_post_call(strm, status); return status; } @@ -247,7 +252,7 @@ int git_deflate(git_zstream *strm, int flush) ? 0 : flush); if (status == Z_MEM_ERROR) die("deflate: out of memory"); - zlib_post_call(strm); + zlib_post_call(strm, status); /* * Let zlib work another round, while we can still diff --git a/t/t1006-cat-file.sh b/t/t1006-cat-file.sh index 78fd970c8a..04099f7b4a 100755 --- a/t/t1006-cat-file.sh +++ b/t/t1006-cat-file.sh @@ -836,6 +836,38 @@ test_expect_success 'truncated object with --allow-unknown-type' - <<\EOT test_grep "unable to unpack $blob header" err EOT +test_expect_success 'object reading handles zlib dictionary' - <<\EOT + echo 'content that will be recompressed' >file && + blob=$(git hash-object -w file) && + objpath=.git/objects/$(test_oid_to_path "$blob") && + + # Recompress a loose object using a precomputed zlib dictionary. + # This was originally done with: + # + # perl -MCompress::Raw::Zlib -e ' + # binmode STDIN; + # binmode STDOUT; + # my $data = do { local $/; }; + # my $in = new Compress::Raw::Zlib::Inflate; + # my $de = new Compress::Raw::Zlib::Deflate( + # -Dictionary => "anything" + # ); + # $in->inflate($data, $raw); + # $de->deflate($raw, $out); + # print $out; + # ' $objpath + # + # but we do not want to require the perl module for all test runs (nor + # carry a custom t/helper program that uses zlib features we don't + # otherwise care about). + mv "$objpath" obj.bak && + test_when_finished 'mv obj.bak "$objpath"' && + printf '\170\273\017\112\003\143' >$objpath && + + test_must_fail git cat-file blob $blob 2>err && + test_grep 'error: inflate: needs dictionary' err +EOT + # Tests for git cat-file --follow-symlinks test_expect_success 'prep for symlink tests' ' echo_without_newline "$hello_content" >morx && From 67a6b1aeb82fd4685e862a3a7807d4ed8ea5d899 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 25 Feb 2025 01:30:56 -0500 Subject: [PATCH 06/10] unpack_loose_header(): avoid numeric comparison of zlib status When unpacking a loose header, we try to inflate the first 32 bytes. We'd expect either Z_OK (we filled up the output buffer, but there are more bytes in the object) or Z_STREAM_END (this is a tiny object whose header and content fit in the buffer). We check for that with "if (status < Z_OK)", making the assumption that all of the errors we'd see have negative values (as Z_OK itself is "0", and Z_STREAM_END is "1"). But there's at least one case this misses: Z_NEED_DICT is "2". This isn't something we'd ever expect to see, but if we do see it, we should consider it an error (since we have no dictionary to load). Instead, the current code interprets Z_NEED_DICT as success and looks for the object header's terminating NUL in the bytes we've read. This will generaly be zero bytes if the dictionary is mentioned at the start of the stream. So we'll fail to find it and complain "the header is too long" (ULHR_LONG). But really, the problem is that the object is malformed, and we should return ULHR_BAD. This is a minor bug, as we consider both cases to be an error. But it does mean we print the wrong error message. The test case added in the previous patch triggers this code, so we can just confirm the error message we see here. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- object-file.c | 2 +- t/t1006-cat-file.sh | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/object-file.c b/object-file.c index 5086633e21..0bc62b53d3 100644 --- a/object-file.c +++ b/object-file.c @@ -1273,7 +1273,7 @@ enum unpack_loose_header_result unpack_loose_header(git_zstream *stream, obj_read_unlock(); status = git_inflate(stream, 0); obj_read_lock(); - if (status < Z_OK) + if (status != Z_OK && status != Z_STREAM_END) return ULHR_BAD; /* diff --git a/t/t1006-cat-file.sh b/t/t1006-cat-file.sh index 04099f7b4a..609dabd5cf 100755 --- a/t/t1006-cat-file.sh +++ b/t/t1006-cat-file.sh @@ -865,6 +865,8 @@ test_expect_success 'object reading handles zlib dictionary' - <<\EOT printf '\170\273\017\112\003\143' >$objpath && test_must_fail git cat-file blob $blob 2>err && + test_grep ! 'too long' err && + test_grep 'error: unable to unpack' err && test_grep 'error: inflate: needs dictionary' err EOT From 9929a6791703c96e5f613cc3b52f4f9e16baa49c Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 25 Feb 2025 01:31:15 -0500 Subject: [PATCH 07/10] unpack_loose_rest(): avoid numeric comparison of zlib status When unpacking the actual content of a loose object file, we insist both that the status code we got is Z_STREAM_END, and that we consumed all bytes. If we didn't, we'll return an error, but the specific error message we produce depends on which of the two error conditions we saw. So we'll check both a second time to decide which error to produce. But this second time, our status code check is loose: it checks for a negative status value. This can get confused by zlib codes which are not negative, such as Z_NEED_DICT. In this case we'd erroneously print nothing at all, when we should say "corrupt loose object". Instead, this second check should check explicitly against Z_STREAM_END. Note that Z_OK is "0", so the existing code also produced no message for Z_OK. But it's impossible to see that status, since we only break out of the inflate loop when we stop seeing Z_OK (so a stream which has more bytes than its object header claims would eventually yield Z_BUF_ERROR). There's no test here, as it would require a loose object whose zlib stream returns Z_NEED_DICT in the middle of the object content. I think that is probably possible, but even our Z_NEED_DICT test in t1006 does not trigger this, because we hit that error while reading the header. I found this bug while reviewing all callers of git_inflate() for bugs similar to the one we saw in unpack_loose_header(). This was the only other case that did a numeric comparison rather than explicitly checking for Z_STREAM_END. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- object-file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/object-file.c b/object-file.c index 0bc62b53d3..17d54c845d 100644 --- a/object-file.c +++ b/object-file.c @@ -1352,7 +1352,7 @@ static void *unpack_loose_rest(git_zstream *stream, return buf; } - if (status < 0) + if (status != Z_STREAM_END) error(_("corrupt loose object '%s'"), oid_to_hex(oid)); else if (stream->avail_in) error(_("garbage at end of loose object '%s'"), From 84b5c1a099e6df35f4b54d651b425a894513e62b Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 25 Feb 2025 01:33:12 -0500 Subject: [PATCH 08/10] unpack_loose_rest(): never clean up zstream The unpack_loose_rest() function has funny ownership semantics: we pass in a z_stream opened by the caller, but then only _sometimes_ close it. This oddity has developed over time. When the function was originally split out in 5180cacc20 (Split up unpack_sha1_file() some more, 2005-06-02), it always called inflateEnd() to clean up the stream (though nowadays it is a git_zstream and we call git_inflate_end()). But in 7efbff7531 (unpack_sha1_file(): detect corrupt loose object files., 2007-03-05) we added error code paths which don't close the stream. This makes some sense, as we'd still look at parts of the stream struct to decide which error to show (though I am not sure in practice if inflateEnd() even touches those fields). This subtlety makes it hard to know when the caller has to clean up the stream and when it does not. That led to the leak fixed by aa9ef614dc (object-file: fix memory leak when reading corrupted headers, 2024-08-14). Let's instead always leave the stream intact, forcing the caller to clean it up. You might think that would create more work for the callers, but it actually ends up simplifying them, since they can put the call to git_inflate_end() in the common cleanup code path. Two things to note, though: - The check_stream_oid() function is used as a replacement for unpack_loose_rest() in read_loose_object() to read blobs. It inherited the same funny semantics, and we should fix it here, too (to keep the cleanup in read_loose_object() consistent). - In read_loose_object() we need a second "out" label, as we can jump to the existing label before opening the stream at all (and since the struct is opaque, there is no way to if it was initialized or not, so we must not call git_inflate_end() in that case). Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- object-file.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/object-file.c b/object-file.c index 17d54c845d..f9713e4e8b 100644 --- a/object-file.c +++ b/object-file.c @@ -1348,7 +1348,6 @@ static void *unpack_loose_rest(git_zstream *stream, } } if (status == Z_STREAM_END && !stream->avail_in) { - git_inflate_end(stream); return buf; } @@ -1512,8 +1511,8 @@ static int loose_object_info(struct repository *r, die(_("loose object %s (stored in %s) is corrupt"), oid_to_hex(oid), path); - git_inflate_end(&stream); cleanup: + git_inflate_end(&stream); munmap(map, mapsize); if (oi->sizep == &size_scratch) oi->sizep = NULL; @@ -2735,7 +2734,6 @@ static int check_stream_oid(git_zstream *stream, the_hash_algo->update_fn(&c, buf, stream->next_out - buf); total_read += stream->next_out - buf; } - git_inflate_end(stream); if (status != Z_STREAM_END) { error(_("corrupt loose object '%s'"), oid_to_hex(expected_oid)); @@ -2782,34 +2780,34 @@ int read_loose_object(const char *path, if (unpack_loose_header(&stream, map, mapsize, hdr, sizeof(hdr), NULL) != ULHR_OK) { error(_("unable to unpack header of %s"), path); - goto out; + goto out_inflate; } if (parse_loose_header(hdr, oi) < 0) { error(_("unable to parse header of %s"), path); - git_inflate_end(&stream); - goto out; + goto out_inflate; } if (*oi->typep == OBJ_BLOB && *size > big_file_threshold) { if (check_stream_oid(&stream, hdr, *size, path, expected_oid) < 0) - goto out; + goto out_inflate; } else { *contents = unpack_loose_rest(&stream, hdr, *size, expected_oid); if (!*contents) { error(_("unable to unpack contents of %s"), path); - git_inflate_end(&stream); - goto out; + goto out_inflate; } hash_object_file_literally(the_repository->hash_algo, *contents, *size, oi->type_name->buf, real_oid); if (!oideq(expected_oid, real_oid)) - goto out; + goto out_inflate; } ret = 0; /* everything checks out */ +out_inflate: + git_inflate_end(&stream); out: if (map) munmap(map, mapsize); From 547f719d9b022e87eb8cf3cb7a7632822b996e29 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 25 Feb 2025 01:33:51 -0500 Subject: [PATCH 09/10] unpack_loose_rest(): simplify error handling Inflating a loose object is considered successful only if we got Z_STREAM_END and there were no more bytes. We check both of those conditions and return success, but then have to check them a second time to decide which error message to produce. I.e., we do something like this: if (!error_1 && !error_2) ...return success... if (error_1) ...handle error1... else if (error_2) ...handle error2... ...common error handling... This repetition was the source of a small bug fixed in an earlier commit (our Z_STREAM_END check was not the same in the two conditionals). Instead we can chain them all into a single if/else cascade, which avoids repeating ourselves: if (error_1) ...handle error1... else if (error_2) ...handle error2.... else ...return success... ...common error handling... Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- object-file.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/object-file.c b/object-file.c index f9713e4e8b..9f6e8504fb 100644 --- a/object-file.c +++ b/object-file.c @@ -1347,15 +1347,15 @@ static void *unpack_loose_rest(git_zstream *stream, obj_read_lock(); } } - if (status == Z_STREAM_END && !stream->avail_in) { - return buf; - } if (status != Z_STREAM_END) error(_("corrupt loose object '%s'"), oid_to_hex(oid)); else if (stream->avail_in) error(_("garbage at end of loose object '%s'"), oid_to_hex(oid)); + else + return buf; + free(buf); return NULL; } From 1cb2f293f5a594fd5dee8400213bd2f395fbd2bf Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 25 Feb 2025 01:34:21 -0500 Subject: [PATCH 10/10] unpack_loose_rest(): rewrite return handling for clarity We have a pattern like: if (error1) ...handle error 1... else if (error2) ...handle error 2... else ...return buf... ...free buf and return NULL... This is a little subtle because it is the return in the success block that lets us skip the common error handling. Rewrite this instead to free the buffer in each error path, marking it as NULL, and then all code paths can use the common return. This should make the logic a bit easier to follow. It does mean duplicating the buf cleanup for errors, but it's a single line. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- object-file.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/object-file.c b/object-file.c index 9f6e8504fb..e463b4bad3 100644 --- a/object-file.c +++ b/object-file.c @@ -1348,16 +1348,16 @@ static void *unpack_loose_rest(git_zstream *stream, } } - if (status != Z_STREAM_END) + if (status != Z_STREAM_END) { error(_("corrupt loose object '%s'"), oid_to_hex(oid)); - else if (stream->avail_in) + FREE_AND_NULL(buf); + } else if (stream->avail_in) { error(_("garbage at end of loose object '%s'"), oid_to_hex(oid)); - else - return buf; + FREE_AND_NULL(buf); + } - free(buf); - return NULL; + return buf; } /*