git/reftable/block.c

/*
 * Copyright 2020 Google LLC
 *
 * Use of this source code is governed by a BSD-style
 * license that can be found in the LICENSE file or at
 * https://developers.google.com/open-source/licenses/bsd
 */

#include "block.h"

#include "blocksource.h"
#include "constants.h"
#include "record.h"
#include "reftable-error.h"
#include "system.h"

/*
 * Returns the header size in bytes for the given format version.
 * Versions other than 1 and 2 abort the process.
 */
size_t header_size(int version)
{
	if (version == 1)
		return 24;
	if (version == 2)
		return 28;
	abort();
}

/*
 * Returns the footer size in bytes for the given format version.
 * Versions other than 1 and 2 abort the process.
 */
size_t footer_size(int version)
{
	if (version == 1)
		return 68;
	if (version == 2)
		return 72;
	abort();
}

/*
 * Account for a record of `n` encoded bytes that starts at `w->next`. When
 * `is_restart` is set and fewer than MAX_RESTARTS restart points exist, the
 * record's offset is additionally stored as a restart point.
 *
 * Returns 0 on success, -1 when the record together with the restart table
 * does not fit into the block, REFTABLE_OUT_OF_MEMORY_ERROR when the restart
 * array cannot grow, or the error reported by reftable_buf_add().
 */
static int block_writer_register_restart(struct block_writer *w, int n,
					 int is_restart, struct reftable_buf *key)
{
	uint32_t restart_count = w->restart_len;
	int err;

	/* A full restart table demotes the record to a regular entry. */
	if (restart_count >= MAX_RESTARTS)
		is_restart = 0;
	else if (is_restart)
		restart_count++;

	/*
	 * block_writer_finish() appends 3 bytes per restart point plus a
	 * 2-byte restart count, so reserve room for those as well.
	 */
	if (2 + 3 * restart_count + n > w->block_size - w->next)
		return -1;

	if (is_restart) {
		REFTABLE_ALLOC_GROW_OR_NULL(w->restarts, w->restart_len + 1,
					    w->restart_cap);
		if (!w->restarts)
			return REFTABLE_OUT_OF_MEMORY_ERROR;
		w->restarts[w->restart_len] = w->next;
		w->restart_len++;
	}

	w->next += n;

	/* block_writer_add() hands this key to the encoder of the next record. */
	reftable_buf_reset(&w->last_key);
	err = reftable_buf_add(&w->last_key, key->buf, key->len);
	if (err < 0)
		return err;

	w->entries++;
	return 0;
}

int block_writer_init(struct block_writer *bw, uint8_t typ, uint8_t *block,
		      uint32_t block_size, uint32_t header_off, uint32_t hash_size)
{
	bw->block = block;
	bw->hash_size = hash_size;
	bw->block_size = block_size;
	bw->header_off = header_off;
	bw->block[header_off] = typ;
	bw->next = header_off + 4;
	bw->restart_interval = 16;
	bw->entries = 0;
	bw->restart_len = 0;
	bw->last_key.len = 0;
	if (!bw->zstream) {
		REFTABLE_CALLOC_ARRAY(bw->zstream, 1);
		if (!bw->zstream)
			return REFTABLE_OUT_OF_MEMORY_ERROR;
		deflateInit(bw->zstream, 9);
	}

	return 0;
}

/* Returns the block type byte stored at the writer's header offset. */
uint8_t block_writer_type(struct block_writer *bw)
{
	const uint8_t type_byte = bw->block[bw->header_off];

	return type_byte;
}

/* Adds the reftable_record to the block. Returns -1 if it does not fit, 0 on
   success. Returns REFTABLE_API_ERROR if attempting to write a record with
   empty key. */
int block_writer_add(struct block_writer *w, struct reftable_record *rec)
{
	struct reftable_buf empty = REFTABLE_BUF_INIT;
	struct reftable_buf last =
		w->entries % w->restart_interval == 0 ? empty : w->last_key;
	struct string_view out = {
		.buf = w->block + w->next,
		.len = w->block_size - w->next,
	};
	struct string_view start = out;
	int is_restart = 0;
	int n = 0;
	int err;

	err = reftable_record_key(rec, &w->scratch);
	if (err < 0)
		goto done;

	if (!w->scratch.len) {
		err = REFTABLE_API_ERROR;
		goto done;
	}

	n = reftable_encode_key(&is_restart, out, last, w->scratch,
				reftable_record_val_type(rec));
	if (n < 0) {
		err = -1;
		goto done;
	}
	string_view_consume(&out, n);

	n = reftable_record_encode(rec, out, w->hash_size);
	if (n < 0) {
		err = -1;
		goto done;
	}
	string_view_consume(&out, n);

	err = block_writer_register_restart(w, start.len - out.len, is_restart,
					    &w->scratch);
done:
	return err;
}

/*
 * Finalizes the block: appends the restart offsets (3 bytes each) and the
 * 2-byte restart count, stores the resulting length in the block header and,
 * for log blocks, replaces everything after the header with its
 * zlib-compressed form.
 *
 * Returns the number of bytes used in the block buffer on success, or a
 * negative reftable error code on failure.
 */
int block_writer_finish(struct block_writer *w)
{
	for (uint32_t i = 0; i < w->restart_len; i++) {
		reftable_put_be24(w->block + w->next, w->restarts[i]);
		w->next += 3;
	}

	reftable_put_be16(w->block + w->next, w->restart_len);
	w->next += 2;
	reftable_put_be24(w->block + 1 + w->header_off, w->next);

	/*
	 * Log records are stored zlib-compressed. Note that the compression
	 * also spans over the restart points we have just written.
	 */
	if (block_writer_type(w) == BLOCK_TYPE_LOG) {
		int block_header_skip = 4 + w->header_off;
		uLongf src_len = w->next - block_header_skip, compressed_len;
		int ret;

		ret = deflateReset(w->zstream);
		if (ret != Z_OK)
			return REFTABLE_ZLIB_ERROR;

		/*
		 * Precompute the upper bound of how many bytes the compressed
		 * data may end up with. Combined with `Z_FINISH`, `deflate()`
		 * is guaranteed to return `Z_STREAM_END`.
		 */
		compressed_len = deflateBound(w->zstream, src_len);
		REFTABLE_ALLOC_GROW_OR_NULL(w->compressed, compressed_len,
					    w->compressed_cap);
		if (!w->compressed) {
			ret = REFTABLE_OUT_OF_MEMORY_ERROR;
			return ret;
		}

		w->zstream->next_out = w->compressed;
		w->zstream->avail_out = compressed_len;
		w->zstream->next_in = w->block + block_header_skip;
		w->zstream->avail_in = src_len;

		/*
		 * We want to perform all compression in a single step, which
		 * is why we can pass Z_FINISH here. As we have precomputed the
		 * deflated buffer's size via `deflateBound()` this function is
		 * guaranteed to succeed according to the zlib documentation.
		 */
		ret = deflate(w->zstream, Z_FINISH);
		if (ret != Z_STREAM_END)
			return REFTABLE_ZLIB_ERROR;

		/*
		 * The bound returned by `deflateBound()` can be larger than
		 * `src_len`, so data that compresses poorly may deflate to
		 * more bytes than the block buffer has room for. Refuse to
		 * copy it back rather than writing past `block_size`.
		 */
		if (w->zstream->total_out + block_header_skip > w->block_size)
			return REFTABLE_ZLIB_ERROR;

		/*
		 * Overwrite the uncompressed data we have already written and
		 * adjust the `next` pointer to point right after the
		 * compressed data.
		 */
		memcpy(w->block + block_header_skip, w->compressed,
		       w->zstream->total_out);
		w->next = w->zstream->total_out + block_header_skip;
	}

	return w->next;
}

/*
 * Releases whatever `dest` currently holds and then reads up to `sz` bytes
 * starting at `off` from `source` into it. The request is shortened so that it
 * does not extend past the end of the source.
 *
 * Returns 0 without reading anything when `off` lies at or beyond the end of
 * the source; otherwise returns the result of block_source_read_data().
 */
static int read_block(struct reftable_block_source *source,
		      struct reftable_block_data *dest, uint64_t off,
		      uint32_t sz)
{
	size_t source_size = block_source_size(source);

	block_source_release_data(dest);

	if (off >= source_size)
		return 0;

	/* Clamp the read to the bytes that actually exist. */
	if (off + sz > source_size)
		sz = source_size - off;

	return block_source_read_data(source, dest, off, sz);
}

/*
 * Reads the block located at `offset` in `source` into `block` and records its
 * type, restart table location and sizes.
 *
 * `header_size` is the offset of the block's type byte within the data that
 * was read (note that the parameter shadows the header_size() function).
 * `table_block_size` is used as the initial read size; when it is 0,
 * DEFAULT_BLOCK_SIZE is read instead. Log blocks are inflated into
 * `block->uncompressed_data`, and `block->block_data` is then pointed at the
 * inflated copy.
 *
 * Returns 0 on success or a negative reftable error code. On a negative
 * error the block is released via reftable_block_release().
 */
int reftable_block_init(struct reftable_block *block,
			struct reftable_block_source *source,
			uint32_t offset, uint32_t header_size,
			uint32_t table_block_size, uint32_t hash_size)
{
	uint32_t guess_block_size = table_block_size ?
		table_block_size : DEFAULT_BLOCK_SIZE;
	uint32_t full_block_size = table_block_size;
	uint16_t restart_count;
	uint32_t restart_off;
	uint32_t block_size;
	uint8_t block_type;
	int err;

	err = read_block(source, &block->block_data, offset, guess_block_size);
	if (err < 0)
		goto done;

	/*
	 * NOTE(review): read_block() returns 0 without reading anything when
	 * `offset` is at or past the end of the source, and the amount of
	 * data read is not checked against `header_size` here. Callers
	 * presumably guarantee a valid offset -- confirm.
	 */
	block_type = block->block_data.data[header_size];
	if (!reftable_is_block_type(block_type)) {
		err = REFTABLE_FORMAT_ERROR;
		goto done;
	}

	/* The 3 bytes following the type byte hold the block's length. */
	block_size = reftable_get_be24(block->block_data.data + header_size + 1);
	if (block_size > guess_block_size) {
		/* Our guess was too small, so re-read with the real size. */
		err = read_block(source, &block->block_data, offset, block_size);
		if (err < 0)
			goto done;
	}

	if (block_type == BLOCK_TYPE_LOG) {
		uint32_t block_header_skip = 4 + header_size;
		uLong dst_len = block_size - block_header_skip;
		uLong src_len = block->block_data.len - block_header_skip;

		/* Log blocks specify the *uncompressed* size in their header. */
		REFTABLE_ALLOC_GROW_OR_NULL(block->uncompressed_data, block_size,
					    block->uncompressed_cap);
		if (!block->uncompressed_data) {
			err = REFTABLE_OUT_OF_MEMORY_ERROR;
			goto done;
		}

		/* Copy over the block header verbatim. It's not compressed. */
		memcpy(block->uncompressed_data, block->block_data.data, block_header_skip);

		/* Set up the inflate stream once and reset it on later calls. */
		if (!block->zstream) {
			REFTABLE_CALLOC_ARRAY(block->zstream, 1);
			if (!block->zstream) {
				err = REFTABLE_OUT_OF_MEMORY_ERROR;
				goto done;
			}

			err = inflateInit(block->zstream);
		} else {
			err = inflateReset(block->zstream);
		}
		if (err != Z_OK) {
			err = REFTABLE_ZLIB_ERROR;
			goto done;
		}

		block->zstream->next_in = block->block_data.data + block_header_skip;
		block->zstream->avail_in = src_len;
		block->zstream->next_out = block->uncompressed_data + block_header_skip;
		block->zstream->avail_out = dst_len;

		/*
		 * We know both input as well as output size, and we know that
		 * the sizes should never be bigger than `uInt_MAX` because
		 * blocks can at most be 16MB large. We can thus use `Z_FINISH`
		 * here to instruct zlib to inflate the data in one go, which
		 * is more efficient than using `Z_NO_FLUSH`.
		 */
		err = inflate(block->zstream, Z_FINISH);
		if (err != Z_STREAM_END) {
			err = REFTABLE_ZLIB_ERROR;
			goto done;
		}
		err = 0;

		/* The inflated payload must match the size from the header. */
		if (block->zstream->total_out + block_header_skip != block_size) {
			err = REFTABLE_FORMAT_ERROR;
			goto done;
		}

		/* We're done with the input data. */
		block_source_release_data(&block->block_data);
		block->block_data.data = block->uncompressed_data;
		block->block_data.len = block_size;
		/*
		 * For log blocks the full block size is the header plus the
		 * number of compressed bytes that zlib actually consumed.
		 */
		full_block_size = src_len + block_header_skip - block->zstream->avail_in;
	} else if (full_block_size == 0) {
		full_block_size = block_size;
	} else if (block_size < full_block_size && block_size < block->block_data.len &&
		   block->block_data.data[block_size] != 0) {
		/* If the block is smaller than the full block size, it is
		   padded (data followed by '\0') or the next block is
		   unaligned. */
		full_block_size = block_size;
	}

	/*
	 * The last two bytes of the block hold the number of restart points;
	 * the 3-byte restart offsets are stored directly before them.
	 * NOTE(review): `block_size` and `restart_count` are taken from the
	 * block as-is, without checking that `restart_off` stays inside the
	 * data that was read -- confirm that this is validated elsewhere.
	 */
	restart_count = reftable_get_be16(block->block_data.data + block_size - 2);
	restart_off = block_size - 2 - 3 * restart_count;

	block->block_type = block_type;
	block->hash_size = hash_size;
	block->restart_off = restart_off;
	block->full_block_size = full_block_size;
	block->header_off = header_size;
	block->restart_count = restart_count;

	err = 0;

done:
	if (err < 0)
		reftable_block_release(block);
	return err;
}

/*
 * Frees the inflate stream, the buffer for uncompressed data and the block
 * data held by `block`, then zeroes the whole structure.
 */
void reftable_block_release(struct reftable_block *block)
{
	/* Tear down zlib's state before freeing the stream structure itself. */
	inflateEnd(block->zstream);
	reftable_free(block->zstream);

	reftable_free(block->uncompressed_data);
	block_source_release_data(&block->block_data);

	memset(block, 0, sizeof *block);
}

/* Returns the block type byte found at the block's header offset. */
uint8_t reftable_block_type(const struct reftable_block *b)
{
	const uint8_t type_byte = b->block_data.data[b->header_off];

	return type_byte;
}

int reftable_block_first_key(const struct reftable_block *block, struct reftable_buf *key)
{
	int off = block->header_off + 4, n;
	struct string_view in = {
		.buf = block->block_data.data + off,
		.len = block->restart_off - off,
	};
	uint8_t extra = 0;

	reftable_buf_reset(key);

	n = reftable_decode_key(key, &extra, in);
	if (n < 0)
		return n;
	if (!key->len)
		return REFTABLE_FORMAT_ERROR;

	return 0;
}

/*
 * Returns the record offset stored in the `idx`-th entry of the block's
 * restart table. Each entry is a 3-byte big-endian value.
 */
static uint32_t block_restart_offset(const struct reftable_block *b, size_t idx)
{
	size_t entry_off = b->restart_off + 3 * idx;

	return reftable_get_be24(b->block_data.data + entry_off);
}

/*
 * Positions `it` at the first record of block `b`. Iteration covers the
 * records between the 4-byte block header and the restart table.
 */
void block_iter_seek_start(struct block_iter *it, const struct reftable_block *b)
{
	reftable_buf_reset(&it->last_key);

	it->hash_size = b->hash_size;
	it->block = b->block_data.data;
	it->block_len = b->restart_off;
	it->next_off = b->header_off + 4;
}

/* Arguments handed to restart_needle_less() by block_iter_seek_key(). */
struct restart_needle_less_args {
	/* Set to 1 by restart_needle_less() when a restart record is malformed. */
	int error;
	/* The key that is being searched for. */
	struct reftable_buf needle;
	/* The block whose restart points are being searched. */
	const struct reftable_block *block;
};

static int restart_needle_less(size_t idx, void *_args)
{
	struct restart_needle_less_args *args = _args;
	uint32_t off = block_restart_offset(args->block, idx);
	struct string_view in = {
		.buf = args->block->block_data.data + off,
		.len = args->block->restart_off - off,
	};
	uint64_t prefix_len, suffix_len;
	uint8_t extra;
	int n;

	/*
	 * Records at restart points are stored without prefix compression, so
	 * there is no need to fully decode the record key here. This removes
	 * the need for allocating memory.
	 */
	n = reftable_decode_keylen(in, &prefix_len, &suffix_len, &extra);
	if (n < 0 || prefix_len) {
		args->error = 1;
		return -1;
	}

	string_view_consume(&in, n);
	if (suffix_len > in.len) {
		args->error = 1;
		return -1;
	}

	n = memcmp(args->needle.buf, in.buf,
		   args->needle.len < suffix_len ? args->needle.len : suffix_len);
	if (n)
		return n < 0;
	return args->needle.len < suffix_len;
}

int block_iter_next(struct block_iter *it, struct reftable_record *rec)
{
	struct string_view in = {
		.buf = (unsigned char *) it->block + it->next_off,
		.len = it->block_len - it->next_off,
	};
	struct string_view start = in;
	uint8_t extra = 0;
	int n = 0;

	if (it->next_off >= it->block_len)
		return 1;

	n = reftable_decode_key(&it->last_key, &extra, in);
	if (n < 0)
		return -1;
	if (!it->last_key.len)
		return REFTABLE_FORMAT_ERROR;

	string_view_consume(&in, n);
	n = reftable_record_decode(rec, it->last_key, extra, in, it->hash_size,
				   &it->scratch);
	if (n < 0)
		return -1;
	string_view_consume(&in, n);

	it->next_off += start.len - in.len;
	return 0;
}

/*
 * Detaches the iterator from its block: resets the last key and clears the
 * block pointer, block length, offset and hash size.
 */
void block_iter_reset(struct block_iter *it)
{
	reftable_buf_reset(&it->last_key);

	it->block = NULL;
	it->block_len = 0;
	it->next_off = 0;
	it->hash_size = 0;
}

/* Releases the key and scratch buffers held by the iterator. */
void block_iter_close(struct block_iter *it)
{
	struct reftable_buf *last_key = &it->last_key;
	struct reftable_buf *scratch = &it->scratch;

	reftable_buf_release(last_key);
	reftable_buf_release(scratch);
}

/*
 * Positions `it` inside `block` such that the next call to
 * `block_iter_next()` yields the first record whose key is greater than or
 * equal to `want`, or signals the end of the block if there is no such
 * record.
 *
 * Returns 0 on success, REFTABLE_FORMAT_ERROR if a restart point of the block
 * is malformed, or another negative error code if decoding a record fails.
 */
int block_iter_seek_key(struct block_iter *it, const struct reftable_block *block,
			struct reftable_buf *want)
{
	struct restart_needle_less_args args = {
		.needle = *want,
		.block = block,
	};
	struct reftable_record rec;
	int err = 0;
	size_t i;

	/*
	 * Perform a binary search over the block's restart points, which
	 * avoids doing a linear scan over the whole block. Like this, we
	 * identify the section of the block that should contain our key.
	 *
	 * Note that we explicitly search for the first restart point _greater_
	 * than the sought-after record, not _greater or equal_ to it. In case
	 * the sought-after record is located directly at the restart point we
	 * would otherwise start doing the linear search at the preceding
	 * restart point. While that works alright, we would end up scanning
	 * too many records.
	 */
	i = binsearch(block->restart_count, &restart_needle_less, &args);
	if (args.error) {
		/*
		 * `rec` has not been initialized yet, so we must not run
		 * through the common exit path that releases it.
		 */
		return REFTABLE_FORMAT_ERROR;
	}

	/*
	 * Now there are multiple cases:
	 *
	 *   - `i == 0`: The wanted record is smaller than the record found at
	 *     the first restart point. As the first restart point is the first
	 *     record in the block, our wanted record cannot be located in this
	 *     block at all. We still need to position the iterator so that the
	 *     next call to `block_iter_next()` will yield an end-of-iterator
	 *     signal.
	 *
	 *   - `i == restart_count`: The wanted record was not found at any of
	 *     the restart points. As there is no restart point at the end of
	 *     the section the record may thus be contained in the last block.
	 *
	 *   - `i > 0`: The wanted record must be contained in the section
	 *     before the found restart point. We thus do a linear search
	 *     starting from the preceding restart point.
	 */
	if (i > 0)
		it->next_off = block_restart_offset(block, i - 1);
	else
		it->next_off = block->header_off + 4;
	it->block = block->block_data.data;
	it->block_len = block->restart_off;
	it->hash_size = block->hash_size;

	err = reftable_record_init(&rec, reftable_block_type(block));
	if (err < 0)
		goto done;

	/*
	 * We're looking for the last entry less than the wanted key so that
	 * the next call to `block_iter_next()` would yield the wanted
	 * record. We thus don't want to position our iterator at the sought
	 * after record, but one before. To do so, we have to go one entry too
	 * far and then back up.
	 */
	while (1) {
		size_t prev_off = it->next_off;

		err = block_iter_next(it, &rec);
		if (err < 0)
			goto done;
		if (err > 0) {
			it->next_off = prev_off;
			err = 0;
			goto done;
		}

		err = reftable_record_key(&rec, &it->last_key);
		if (err < 0)
			goto done;

		/*
		 * Check whether the current key is greater or equal to the
		 * sought-after key. In case it is greater we know that the
		 * record does not exist in the block and can thus abort early.
		 * In case it is equal to the sought-after key we have found
		 * the desired record.
		 *
		 * Note that we store the next record's key record directly in
		 * `last_key` without restoring the key of the preceding record
		 * in case we need to go one record back. This is safe to do as
		 * `block_iter_next()` would return the ref whose key is equal
		 * to `last_key` now, and naturally all keys share a prefix
		 * with themselves.
		 */
		if (reftable_buf_cmp(&it->last_key, want) >= 0) {
			it->next_off = prev_off;
			goto done;
		}
	}

done:
	reftable_record_release(&rec);
	return err;
}

/*
 * Releases everything the writer owns: the deflate stream, the restart and
 * compression buffers as well as the key buffers. The block buffer itself
 * belongs to the caller and is left alone.
 */
void block_writer_release(struct block_writer *bw)
{
	/* End zlib's state first, then drop the stream structure. */
	deflateEnd(bw->zstream);
	REFTABLE_FREE_AND_NULL(bw->zstream);

	REFTABLE_FREE_AND_NULL(bw->restarts);
	REFTABLE_FREE_AND_NULL(bw->compressed);

	reftable_buf_release(&bw->scratch);
	reftable_buf_release(&bw->last_key);
}