From 8216cf9419a3a8dfcd65b4caad72c4e6cb9c0513 Mon Sep 17 00:00:00 2001
From: Jeff King <peff@peff.net>
Date: Tue, 25 Feb 2025 01:28:24 -0500
Subject: [PATCH 01/10] loose_object_info(): BUG() on inflating content with
 unknown type

After unpack_loose_header() returns, it will have inflated not only the
object header, but possibly some bytes of the object content. When we
call unpack_loose_rest() to extract the actual content, it finds those
extra bytes by skipping past the header's terminating NUL in the buffer.
Like this:

  int bytes = strlen(buffer) + 1;
  n = stream->total_out - bytes;
  ...
  memcpy(buf, (char *) buffer + bytes, n);

This won't work with the OBJECT_INFO_ALLOW_UNKNOWN_TYPE flag, as there
we allow a header of arbitrary size. We put into a strbuf, but feed only
the final 32-byte chunk we read to unpack_loose_rest(). In that case
stream->total_out may unexpectedly large, and thus our "n" will be
large, causing an out-of-bounds read (we do check it against our
allocated buffer size, which prevents an out-of-bounds write).

Probably this could be made to work by feeding the strbuf to
unpack_loose_rest(), along with adjusting some types (e.g., "bytes"
would need to be a size_t, since it is no longer operating on a 32-byte
buffer).

But I don't think it's possible to actually trigger this in practice.
The only caller who passes ALLOW_UNKNOWN_TYPE is cat-file, which only
allows it with the "-t" and "-s" options (neither of which access the
content). There is one way you can _almost_ trigger it: the oid compat
routines (i.e., accessing sha1 via sha256 names and vice versa) will
convert objects on the fly (which requires access to the content) using
the same flags that were passed in. So in theory this:

  t='some very large type field that causes an extra inflate call'
  sha1_oid=$(git hash-object -w -t "$t" file)
  sha256_oid=$(git rev-parse --output-object-format=sha256 $sha1_oid)
  git cat-file --allow-unknown-type -s $sha256_oid

would try to access the content. But it doesn't work, because using
compat objects requires an entry in the .git/objects/loose-object-idx
file, and we don't generate such an entry for non-standard types (see
the "compat" section of write_object_file_literally()).

If we use "t=blob" instead, then it does access the compat object, but
it doesn't trigger the problem (because "blob" is a standard short type
name, and it fits in the initial 32-byte buffer).

So given that this is almost a memory error bug, I think it's worth
addressing. But because we can't actually trigger the situation, I'm
hesitant to try a fix that we can't run. Instead let's document the
restriction and protect ourselves from the out-of-bounds read by adding
a BUG() check.

Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 object-file.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/object-file.c b/object-file.c
index 619f039ebc..f36846b5ba 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1491,6 +1491,8 @@ static int loose_object_info(struct repository *r,
 
 		if (!oi->contentp)
 			break;
+		if (hdrbuf.len)
+			BUG("unpacking content with unknown types not yet supported");
 		*oi->contentp = unpack_loose_rest(&stream, hdr, *oi->sizep, oid);
 		if (*oi->contentp)
 			goto cleanup;

From 03e7c454e9bc15f4fa046bd3a5f6147bbd0480e6 Mon Sep 17 00:00:00 2001
From: Jeff King <peff@peff.net>
Date: Tue, 25 Feb 2025 01:29:00 -0500
Subject: [PATCH 02/10] unpack_loose_header(): simplify next_out assignment

When using OBJECT_INFO_ALLOW_UNKNOWN_TYPE to unpack a header that
doesn't fit into our initial 32-byte buffer, we loop over calls
git_inflate(), feeding it our buffer to the "next_out" pointer each
time. As the code is written, we reset next_out after each inflate call
(and after reading the output), ready for the next loop.

This isn't wrong, but there are a few advantages to setting up
"next_out" right before each inflate call, rather than after:

  1. It drops a few duplicated lines of code.

  2. It makes it obvious that we always feed a fresh buffer on each call
     (and thus can never see Z_BUF_ERROR due to due to a lack of output
     space).

  3. After we exit the loop, we'll leave stream->next_out pointing to
     the end of the fetched data (this is how zlib callers find out how
     much data is in the buffer). This doesn't matter in practice, since
     nobody looks at it again. But it's probably the least-surprising
     thing to do, as it matches how next_out is left when the whole
     thing fits in the initial 32-byte buffer (and we don't enter the
     loop at all).

Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 object-file.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/object-file.c b/object-file.c
index f36846b5ba..e48da375bd 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1296,18 +1296,17 @@ enum unpack_loose_header_result unpack_loose_header(git_zstream *stream,
 	 * reading the stream.
 	 */
 	strbuf_add(header, buffer, stream->next_out - (unsigned char *)buffer);
-	stream->next_out = buffer;
-	stream->avail_out = bufsiz;
 
 	do {
+		stream->next_out = buffer;
+		stream->avail_out = bufsiz;
+
 		obj_read_unlock();
 		status = git_inflate(stream, 0);
 		obj_read_lock();
 		strbuf_add(header, buffer, stream->next_out - (unsigned char *)buffer);
 		if (memchr(buffer, '\0', stream->next_out - (unsigned char *)buffer))
 			return 0;
-		stream->next_out = buffer;
-		stream->avail_out = bufsiz;
 	} while (status != Z_STREAM_END);
 	return ULHR_TOO_LONG;
 }

From e7ac344d7018d4537eda29d5a09c047a35f27364 Mon Sep 17 00:00:00 2001
From: Jeff King <peff@peff.net>
Date: Tue, 25 Feb 2025 01:29:40 -0500
Subject: [PATCH 03/10] unpack_loose_header(): report headers without NUL as
 "bad"

If a caller asks us to read the whole loose object header value into a
strbuf (e.g., via "cat-file --allow-unknown-type"), we'll keep reading
until we see a NUL byte marking the end of the header.

If we hit Z_STREAM_END before seeing the NUL, we obviously have to stop.
But we return ULHR_TOO_LONG, which doesn't make any sense. The "too
long" return code is used in the normal, 32-byte limited mode to
indicate that we stopped looking. There is no such thing as "too long"
here, as we'd keep reading forever until we see the end of stream or the
NUL.

Instead, we should return ULHR_BAD. The loose object has no NUL marking
the end of header, so it is malformed. The behavior difference is
slight; in either case we'd consider the object unreadable and refuse to
go further. The only difference is the specific error message we
produce.

There's no test case here, as we'd need to generate a valid zlib stream
without a NUL. That's not something Git will do without writing new
custom code. And in the next patch we'll fix another bug in this area
which will make this easier to do (and we will test it then).

Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 object-file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/object-file.c b/object-file.c
index e48da375bd..b1c33dbb63 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1308,7 +1308,7 @@ enum unpack_loose_header_result unpack_loose_header(git_zstream *stream,
 		if (memchr(buffer, '\0', stream->next_out - (unsigned char *)buffer))
 			return 0;
 	} while (status != Z_STREAM_END);
-	return ULHR_TOO_LONG;
+	return ULHR_BAD;
 }
 
 static void *unpack_loose_rest(git_zstream *stream,

From b748ddb7a470b952b8a5596649f7433278d7f2c4 Mon Sep 17 00:00:00 2001
From: Jeff King <peff@peff.net>
Date: Tue, 25 Feb 2025 01:29:58 -0500
Subject: [PATCH 04/10] unpack_loose_header(): fix infinite loop on broken zlib
 input

When reading a loose object, we first try to expand the first 32 bytes
to read the type+size header. This is enough for any of the normal Git
types. But since 46f034483e (sha1_file: support reading from a loose
object of unknown type, 2015-05-03), the caller can also ask us to parse
any unknown names, which can be much longer. In this case we keep
inflating until we find the NUL at the end of the header, or hit
Z_STREAM_END.

But what if zlib can't make forward progress? For example, if the loose
object file is truncated, we'll have no more data to feed it. It will
return Z_BUF_ERROR, and we'll just loop infinitely, calling
git_inflate() over and over but never seeing new bytes nor an
end-of-stream marker.

We can fix this by only looping when we think we can make forward
progress. This will always be Z_OK in this case. In other code we might
also be able to continue on Z_BUF_ERROR, but:

  - We will never see Z_BUF_ERROR because the output buffer is full; we
    always feed a fresh 32-byte buffer on each call to git_inflate().

  - We may see Z_BUF_ERROR if we run out of input. But since we've fed
    the whole mmap'd buffer to zlib, if it runs out of input there is
    nothing more we can do.

So if we don't see Z_OK (and didn't see the end-of-header NUL, otherwise
we'd have broken out of the loop), then we should stop looping and
return an error.

The test case shows an example where the input is truncated (which gives
us the input Z_BUF_ERROR case above).

Although we do operate on objects we might get from an untrusted remote,
I don't think the security implications of this bug are too great. It
can only trigger if both of these are true:

  - You're reading a loose object whose on-disk representation was
    written by an attacker. So fetching an object (or receiving a push)
    are mostly OK, because even with unpack-objects it is our local,
    trusted code that writes out the object file. The exception may be
    fetching from an untrusted local repo, or using dumb-http, which
    copies objects verbatim. But...

  - The only code path which triggers the inflate loop is cat-file's
    --allow-unknown-type option. This is unlikely to be called at all
    outside of debugging. But I also suspect that objects with
    non-standard types (or that are truncated) would not survive the
    usual fetch/receive checks in the first place.

So I think it would be quite hard to trick somebody into running the
infinite loop, and we can just fix the bug.

Co-authored-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 object-file.c       |  2 +-
 t/t1006-cat-file.sh | 19 +++++++++++++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/object-file.c b/object-file.c
index b1c33dbb63..5086633e21 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1307,7 +1307,7 @@ enum unpack_loose_header_result unpack_loose_header(git_zstream *stream,
 		strbuf_add(header, buffer, stream->next_out - (unsigned char *)buffer);
 		if (memchr(buffer, '\0', stream->next_out - (unsigned char *)buffer))
 			return 0;
-	} while (status != Z_STREAM_END);
+	} while (status == Z_OK);
 	return ULHR_BAD;
 }
 
diff --git a/t/t1006-cat-file.sh b/t/t1006-cat-file.sh
index e0c6482797..78fd970c8a 100755
--- a/t/t1006-cat-file.sh
+++ b/t/t1006-cat-file.sh
@@ -817,6 +817,25 @@ test_expect_success 'cat-file -t and -s on corrupt loose object' '
 	)
 '
 
+test_expect_success 'truncated object with --allow-unknown-type' - <<\EOT
+	objtype='a really long type name that exceeds the 32-byte limit' &&
+	blob=$(git hash-object -w --literally -t "$objtype" /dev/null) &&
+	objpath=.git/objects/$(test_oid_to_path "$blob") &&
+
+	# We want to truncate the object far enough in that we don't hit the
+	# end while inflating the first 32 bytes (since we want to have to dig
+	# for the trailing NUL of the header). But we don't want to go too far,
+	# since our header isn't very big. And of course we are counting
+	# deflated zlib bytes in the on-disk file, so it's a bit of a guess.
+	# Empirically 50 seems to work.
+	mv "$objpath" obj.bak &&
+	test_when_finished 'mv obj.bak "$objpath"' &&
+	test_copy_bytes 50 <obj.bak >"$objpath" &&
+
+	test_must_fail git cat-file --allow-unknown-type -t $blob 2>err &&
+	test_grep "unable to unpack $blob header" err
+EOT
+
 # Tests for git cat-file --follow-symlinks
 test_expect_success 'prep for symlink tests' '
 	echo_without_newline "$hello_content" >morx &&

From 0b1493c2d49222ce07b73016bb156fecb5999bb9 Mon Sep 17 00:00:00 2001
From: Jeff King <peff@peff.net>
Date: Tue, 25 Feb 2025 01:30:26 -0500
Subject: [PATCH 05/10] git_inflate(): skip zlib_post_call() sanity check on
 Z_NEED_DICT

This fixes a case where malformed object input can cause us to hit a
BUG() call in the git-zlib.c code.

The zlib format allows the use of preset dictionaries to reduce the size
of deflated data. The checksum of the dictionary is computed by the
deflate code and goes into the stream. On the inflating side, zlib sees
the dictionary checksum and returns Z_NEED_DICT, asking the caller to
provide the dictionary data via inflateSetDictionary().

This should never happen in Git, because we never provide a dictionary
for deflating (and if we get a stream that mentions a dictionary, we
have no idea how to provide it). So normally Z_NEED_DICT is a hard error
for us. But something interesting happens if we _do_ happen to see it
(e.g., because of a corrupt or malicious input).

In git_inflate() as we loop over calls to zlib's inflate(), we translate
between our large-integer git_zstream values and zlib's native z_stream
types, copying in and out with zlib_pre_call() and zlib_post_call(). In
zlib_post_call() we have a few sanity checks, including one that checks
that the number of bytes consumed by zlib (as measured by it moving the
"next_in" pointer) is equal to the movement of its "total_in" count.

But these do not correspond when we see Z_NEED_DICT! Zlib consumes the
bytes from the input buffer but it does not increment total_in. And so
we hit the BUG("total_in mismatch") call.

There are a few options here:

  - We could ditch that BUG() check. It is making too many assumptions
    about how zlib updates these values. But it does have value in most
    cases as a sanity check on the values we're copying.

  - We could skip the zlib_post_call() entirely when we see Z_NEED_DICT.
    We know that it's hard error for us, so we should just send the
    status up the stack and let the caller bail.

    The downside is that if we ever did want to support dictionaries,
    we couldn't (the git_zstream will be out of sync, since we never
    copied its values back from the z_stream).

  - We could continue to call zlib_post_call(), but skip just that BUG()
    check if the status is Z_NEED_DICT. This keeps git_inflate() as a
    thin wrapper around inflate(), and would let us later support
    dictionaries for some calls if we wanted to.

This patch uses the third approach. It seems like the least-surprising
thing to keep git_inflate() a close to inflate() as possible. And while
it makes the diff a bit larger (since we have to pass the status down to
to the zlib_post_call() function), it's a static local function, and
every caller by definition will have just made a zlib call (and so will
have a status integer).

Co-authored-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 git-zlib.c          | 27 ++++++++++++++++-----------
 t/t1006-cat-file.sh | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 48 insertions(+), 11 deletions(-)

diff --git a/git-zlib.c b/git-zlib.c
index d43bbeb6da..c2d683528b 100644
--- a/git-zlib.c
+++ b/git-zlib.c
@@ -45,7 +45,7 @@ static void zlib_pre_call(git_zstream *s)
 	s->z.avail_out = zlib_buf_cap(s->avail_out);
 }
 
-static void zlib_post_call(git_zstream *s)
+static void zlib_post_call(git_zstream *s, int status)
 {
 	unsigned long bytes_consumed;
 	unsigned long bytes_produced;
@@ -54,7 +54,12 @@ static void zlib_post_call(git_zstream *s)
 	bytes_produced = s->z.next_out - s->next_out;
 	if (s->z.total_out != s->total_out + bytes_produced)
 		BUG("total_out mismatch");
-	if (s->z.total_in != s->total_in + bytes_consumed)
+	/*
+	 * zlib does not update total_in when it returns Z_NEED_DICT,
+	 * causing a mismatch here. Skip the sanity check in that case.
+	 */
+	if (status != Z_NEED_DICT &&
+	    s->z.total_in != s->total_in + bytes_consumed)
 		BUG("total_in mismatch");
 
 	s->total_out = s->z.total_out;
@@ -71,7 +76,7 @@ void git_inflate_init(git_zstream *strm)
 
 	zlib_pre_call(strm);
 	status = inflateInit(&strm->z);
-	zlib_post_call(strm);
+	zlib_post_call(strm, status);
 	if (status == Z_OK)
 		return;
 	die("inflateInit: %s (%s)", zerr_to_string(status),
@@ -89,7 +94,7 @@ void git_inflate_init_gzip_only(git_zstream *strm)
 
 	zlib_pre_call(strm);
 	status = inflateInit2(&strm->z, windowBits);
-	zlib_post_call(strm);
+	zlib_post_call(strm, status);
 	if (status == Z_OK)
 		return;
 	die("inflateInit2: %s (%s)", zerr_to_string(status),
@@ -102,7 +107,7 @@ void git_inflate_end(git_zstream *strm)
 
 	zlib_pre_call(strm);
 	status = inflateEnd(&strm->z);
-	zlib_post_call(strm);
+	zlib_post_call(strm, status);
 	if (status == Z_OK)
 		return;
 	error("inflateEnd: %s (%s)", zerr_to_string(status),
@@ -121,7 +126,7 @@ int git_inflate(git_zstream *strm, int flush)
 				 ? 0 : flush);
 		if (status == Z_MEM_ERROR)
 			die("inflate: out of memory");
-		zlib_post_call(strm);
+		zlib_post_call(strm, status);
 
 		/*
 		 * Let zlib work another round, while we can still
@@ -163,7 +168,7 @@ void git_deflate_init(git_zstream *strm, int level)
 	memset(strm, 0, sizeof(*strm));
 	zlib_pre_call(strm);
 	status = deflateInit(&strm->z, level);
-	zlib_post_call(strm);
+	zlib_post_call(strm, status);
 	if (status == Z_OK)
 		return;
 	die("deflateInit: %s (%s)", zerr_to_string(status),
@@ -179,7 +184,7 @@ static void do_git_deflate_init(git_zstream *strm, int level, int windowBits)
 	status = deflateInit2(&strm->z, level,
 				  Z_DEFLATED, windowBits,
 				  8, Z_DEFAULT_STRATEGY);
-	zlib_post_call(strm);
+	zlib_post_call(strm, status);
 	if (status == Z_OK)
 		return;
 	die("deflateInit2: %s (%s)", zerr_to_string(status),
@@ -210,7 +215,7 @@ int git_deflate_abort(git_zstream *strm)
 
 	zlib_pre_call(strm);
 	status = deflateEnd(&strm->z);
-	zlib_post_call(strm);
+	zlib_post_call(strm, status);
 	return status;
 }
 
@@ -230,7 +235,7 @@ int git_deflate_end_gently(git_zstream *strm)
 
 	zlib_pre_call(strm);
 	status = deflateEnd(&strm->z);
-	zlib_post_call(strm);
+	zlib_post_call(strm, status);
 	return status;
 }
 
@@ -247,7 +252,7 @@ int git_deflate(git_zstream *strm, int flush)
 				 ? 0 : flush);
 		if (status == Z_MEM_ERROR)
 			die("deflate: out of memory");
-		zlib_post_call(strm);
+		zlib_post_call(strm, status);
 
 		/*
 		 * Let zlib work another round, while we can still
diff --git a/t/t1006-cat-file.sh b/t/t1006-cat-file.sh
index 78fd970c8a..04099f7b4a 100755
--- a/t/t1006-cat-file.sh
+++ b/t/t1006-cat-file.sh
@@ -836,6 +836,38 @@ test_expect_success 'truncated object with --allow-unknown-type' - <<\EOT
 	test_grep "unable to unpack $blob header" err
 EOT
 
+test_expect_success 'object reading handles zlib dictionary' - <<\EOT
+	echo 'content that will be recompressed' >file &&
+	blob=$(git hash-object -w file) &&
+	objpath=.git/objects/$(test_oid_to_path "$blob") &&
+
+	# Recompress a loose object using a precomputed zlib dictionary.
+	# This was originally done with:
+	#
+	#  perl -MCompress::Raw::Zlib -e '
+	#    binmode STDIN;
+	#    binmode STDOUT;
+	#    my $data = do { local $/; <STDIN> };
+	#    my $in = new Compress::Raw::Zlib::Inflate;
+	#    my $de = new Compress::Raw::Zlib::Deflate(
+	#      -Dictionary => "anything"
+	#    );
+	#    $in->inflate($data, $raw);
+	#    $de->deflate($raw, $out);
+	#    print $out;
+	#  ' <obj.bak >$objpath
+	#
+	# but we do not want to require the perl module for all test runs (nor
+	# carry a custom t/helper program that uses zlib features we don't
+	# otherwise care about).
+	mv "$objpath" obj.bak &&
+	test_when_finished 'mv obj.bak "$objpath"' &&
+	printf '\170\273\017\112\003\143' >$objpath &&
+
+	test_must_fail git cat-file blob $blob 2>err &&
+	test_grep 'error: inflate: needs dictionary' err
+EOT
+
 # Tests for git cat-file --follow-symlinks
 test_expect_success 'prep for symlink tests' '
 	echo_without_newline "$hello_content" >morx &&

From 67a6b1aeb82fd4685e862a3a7807d4ed8ea5d899 Mon Sep 17 00:00:00 2001
From: Jeff King <peff@peff.net>
Date: Tue, 25 Feb 2025 01:30:56 -0500
Subject: [PATCH 06/10] unpack_loose_header(): avoid numeric comparison of zlib
 status

When unpacking a loose header, we try to inflate the first 32 bytes.
We'd expect either Z_OK (we filled up the output buffer, but there are
more bytes in the object) or Z_STREAM_END (this is a tiny object whose
header and content fit in the buffer).

We check for that with "if (status < Z_OK)", making the assumption that
all of the errors we'd see have negative values (as Z_OK itself is "0",
and Z_STREAM_END is "1").

But there's at least one case this misses: Z_NEED_DICT is "2". This
isn't something we'd ever expect to see, but if we do see it, we should
consider it an error (since we have no dictionary to load).

Instead, the current code interprets Z_NEED_DICT as success and looks
for the object header's terminating NUL in the bytes we've read. This
will generaly be zero bytes if the dictionary is mentioned at the start
of the stream. So we'll fail to find it and complain "the header is too
long" (ULHR_LONG). But really, the problem is that the object is
malformed, and we should return ULHR_BAD.

This is a minor bug, as we consider both cases to be an error. But it
does mean we print the wrong error message. The test case added in the
previous patch triggers this code, so we can just confirm the error
message we see here.

Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 object-file.c       | 2 +-
 t/t1006-cat-file.sh | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/object-file.c b/object-file.c
index 5086633e21..0bc62b53d3 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1273,7 +1273,7 @@ enum unpack_loose_header_result unpack_loose_header(git_zstream *stream,
 	obj_read_unlock();
 	status = git_inflate(stream, 0);
 	obj_read_lock();
-	if (status < Z_OK)
+	if (status != Z_OK && status != Z_STREAM_END)
 		return ULHR_BAD;
 
 	/*
diff --git a/t/t1006-cat-file.sh b/t/t1006-cat-file.sh
index 04099f7b4a..609dabd5cf 100755
--- a/t/t1006-cat-file.sh
+++ b/t/t1006-cat-file.sh
@@ -865,6 +865,8 @@ test_expect_success 'object reading handles zlib dictionary' - <<\EOT
 	printf '\170\273\017\112\003\143' >$objpath &&
 
 	test_must_fail git cat-file blob $blob 2>err &&
+	test_grep ! 'too long' err &&
+	test_grep 'error: unable to unpack' err &&
 	test_grep 'error: inflate: needs dictionary' err
 EOT
 

From 9929a6791703c96e5f613cc3b52f4f9e16baa49c Mon Sep 17 00:00:00 2001
From: Jeff King <peff@peff.net>
Date: Tue, 25 Feb 2025 01:31:15 -0500
Subject: [PATCH 07/10] unpack_loose_rest(): avoid numeric comparison of zlib
 status

When unpacking the actual content of a loose object file, we insist both
that the status code we got is Z_STREAM_END, and that we consumed all
bytes.

If we didn't, we'll return an error, but the specific error message we
produce depends on which of the two error conditions we saw. So we'll
check both a second time to decide which error to produce. But this
second time, our status code check is loose: it checks for a negative
status value.

This can get confused by zlib codes which are not negative, such as
Z_NEED_DICT. In this case we'd erroneously print nothing at all, when we
should say "corrupt loose object".

Instead, this second check should check explicitly against Z_STREAM_END.

Note that Z_OK is "0", so the existing code also produced no message for
Z_OK. But it's impossible to see that status, since we only break out of
the inflate loop when we stop seeing Z_OK (so a stream which has more
bytes than its object header claims would eventually yield Z_BUF_ERROR).

There's no test here, as it would require a loose object whose zlib
stream returns Z_NEED_DICT in the middle of the object content. I think
that is probably possible, but even our Z_NEED_DICT test in t1006 does
not trigger this, because we hit that error while reading the header. I
found this bug while reviewing all callers of git_inflate() for bugs
similar to the one we saw in unpack_loose_header(). This was the only
other case that did a numeric comparison rather than explicitly checking
for Z_STREAM_END.

Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 object-file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/object-file.c b/object-file.c
index 0bc62b53d3..17d54c845d 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1352,7 +1352,7 @@ static void *unpack_loose_rest(git_zstream *stream,
 		return buf;
 	}
 
-	if (status < 0)
+	if (status != Z_STREAM_END)
 		error(_("corrupt loose object '%s'"), oid_to_hex(oid));
 	else if (stream->avail_in)
 		error(_("garbage at end of loose object '%s'"),

From 84b5c1a099e6df35f4b54d651b425a894513e62b Mon Sep 17 00:00:00 2001
From: Jeff King <peff@peff.net>
Date: Tue, 25 Feb 2025 01:33:12 -0500
Subject: [PATCH 08/10] unpack_loose_rest(): never clean up zstream

The unpack_loose_rest() function has funny ownership semantics: we pass
in a z_stream opened by the caller, but then only _sometimes_ close it.

This oddity has developed over time. When the function was originally
split out in 5180cacc20 (Split up unpack_sha1_file() some more,
2005-06-02), it always called inflateEnd() to clean up the stream
(though nowadays it is a git_zstream and we call git_inflate_end()).

But in 7efbff7531 (unpack_sha1_file(): detect corrupt loose object
files., 2007-03-05) we added error code paths which don't close the
stream. This makes some sense, as we'd still look at parts of the stream
struct to decide which error to show (though I am not sure in practice
if inflateEnd() even touches those fields).

This subtlety makes it hard to know when the caller has to clean up the
stream and when it does not. That led to the leak fixed by aa9ef614dc
(object-file: fix memory leak when reading corrupted headers,
2024-08-14).

Let's instead always leave the stream intact, forcing the caller to
clean it up. You might think that would create more work for the
callers, but it actually ends up simplifying them, since they can put
the call to git_inflate_end() in the common cleanup code path.

Two things to note, though:

  - The check_stream_oid() function is used as a replacement for
    unpack_loose_rest() in read_loose_object() to read blobs. It
    inherited the same funny semantics, and we should fix it here, too
    (to keep the cleanup in read_loose_object() consistent).

  - In read_loose_object() we need a second "out" label, as we can jump
    to the existing label before opening the stream at all (and since
    the struct is opaque, there is no way to if it was initialized or
    not, so we must not call git_inflate_end() in that case).

Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 object-file.c | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/object-file.c b/object-file.c
index 17d54c845d..f9713e4e8b 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1348,7 +1348,6 @@ static void *unpack_loose_rest(git_zstream *stream,
 		}
 	}
 	if (status == Z_STREAM_END && !stream->avail_in) {
-		git_inflate_end(stream);
 		return buf;
 	}
 
@@ -1512,8 +1511,8 @@ static int loose_object_info(struct repository *r,
 		die(_("loose object %s (stored in %s) is corrupt"),
 		    oid_to_hex(oid), path);
 
-	git_inflate_end(&stream);
 cleanup:
+	git_inflate_end(&stream);
 	munmap(map, mapsize);
 	if (oi->sizep == &size_scratch)
 		oi->sizep = NULL;
@@ -2735,7 +2734,6 @@ static int check_stream_oid(git_zstream *stream,
 		the_hash_algo->update_fn(&c, buf, stream->next_out - buf);
 		total_read += stream->next_out - buf;
 	}
-	git_inflate_end(stream);
 
 	if (status != Z_STREAM_END) {
 		error(_("corrupt loose object '%s'"), oid_to_hex(expected_oid));
@@ -2782,34 +2780,34 @@ int read_loose_object(const char *path,
 	if (unpack_loose_header(&stream, map, mapsize, hdr, sizeof(hdr),
 				NULL) != ULHR_OK) {
 		error(_("unable to unpack header of %s"), path);
-		goto out;
+		goto out_inflate;
 	}
 
 	if (parse_loose_header(hdr, oi) < 0) {
 		error(_("unable to parse header of %s"), path);
-		git_inflate_end(&stream);
-		goto out;
+		goto out_inflate;
 	}
 
 	if (*oi->typep == OBJ_BLOB && *size > big_file_threshold) {
 		if (check_stream_oid(&stream, hdr, *size, path, expected_oid) < 0)
-			goto out;
+			goto out_inflate;
 	} else {
 		*contents = unpack_loose_rest(&stream, hdr, *size, expected_oid);
 		if (!*contents) {
 			error(_("unable to unpack contents of %s"), path);
-			git_inflate_end(&stream);
-			goto out;
+			goto out_inflate;
 		}
 		hash_object_file_literally(the_repository->hash_algo,
 					   *contents, *size,
 					   oi->type_name->buf, real_oid);
 		if (!oideq(expected_oid, real_oid))
-			goto out;
+			goto out_inflate;
 	}
 
 	ret = 0; /* everything checks out */
 
+out_inflate:
+	git_inflate_end(&stream);
 out:
 	if (map)
 		munmap(map, mapsize);

From 547f719d9b022e87eb8cf3cb7a7632822b996e29 Mon Sep 17 00:00:00 2001
From: Jeff King <peff@peff.net>
Date: Tue, 25 Feb 2025 01:33:51 -0500
Subject: [PATCH 09/10] unpack_loose_rest(): simplify error handling

Inflating a loose object is considered successful only if we got
Z_STREAM_END and there were no more bytes. We check both of those
conditions and return success, but then have to check them a second time
to decide which error message to produce.

I.e., we do something like this:

  if (!error_1 && !error_2)
          ...return success...

  if (error_1)
          ...handle error1...
  else if (error_2)
          ...handle error2...
  ...common error handling...

This repetition was the source of a small bug fixed in an earlier commit
(our Z_STREAM_END check was not the same in the two conditionals).

Instead we can chain them all into a single if/else cascade, which
avoids repeating ourselves:

  if (error_1)
          ...handle error1...
  else if (error_2)
          ...handle error2....
  else
          ...return success...
  ...common error handling...

Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 object-file.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/object-file.c b/object-file.c
index f9713e4e8b..9f6e8504fb 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1347,15 +1347,15 @@ static void *unpack_loose_rest(git_zstream *stream,
 			obj_read_lock();
 		}
 	}
-	if (status == Z_STREAM_END && !stream->avail_in) {
-		return buf;
-	}
 
 	if (status != Z_STREAM_END)
 		error(_("corrupt loose object '%s'"), oid_to_hex(oid));
 	else if (stream->avail_in)
 		error(_("garbage at end of loose object '%s'"),
 		      oid_to_hex(oid));
+	else
+		return buf;
+
 	free(buf);
 	return NULL;
 }

From 1cb2f293f5a594fd5dee8400213bd2f395fbd2bf Mon Sep 17 00:00:00 2001
From: Jeff King <peff@peff.net>
Date: Tue, 25 Feb 2025 01:34:21 -0500
Subject: [PATCH 10/10] unpack_loose_rest(): rewrite return handling for
 clarity

We have a pattern like:

  if (error1)
     ...handle error 1...
  else if (error2)
     ...handle error 2...
  else
     ...return buf...
  ...free buf and return NULL...

This is a little subtle because it is the return in the success block
that lets us skip the common error handling. Rewrite this instead to
free the buffer in each error path, marking it as NULL, and then all
code paths can use the common return.

This should make the logic a bit easier to follow. It does mean
duplicating the buf cleanup for errors, but it's a single line.

Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 object-file.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/object-file.c b/object-file.c
index 9f6e8504fb..e463b4bad3 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1348,16 +1348,16 @@ static void *unpack_loose_rest(git_zstream *stream,
 		}
 	}
 
-	if (status != Z_STREAM_END)
+	if (status != Z_STREAM_END) {
 		error(_("corrupt loose object '%s'"), oid_to_hex(oid));
-	else if (stream->avail_in)
+		FREE_AND_NULL(buf);
+	} else if (stream->avail_in) {
 		error(_("garbage at end of loose object '%s'"),
 		      oid_to_hex(oid));
-	else
-		return buf;
+		FREE_AND_NULL(buf);
+	}
 
-	free(buf);
-	return NULL;
+	return buf;
 }
 
 /*