From 40d52ff77b093fa48f58a168f4b0c4e65b862e56 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Thu, 1 Apr 2010 20:05:23 -0400 Subject: [PATCH 1/5] make commit_tree a library function Until now, this has been part of the commit-tree builtin. However, it is already used by other builtins (like commit, merge, and notes), and it would be useful to access it from library code. The check_valid helper has to come along, too, but is given a more library-ish name of "assert_sha1_type". Otherwise, the code is unchanged. There are still a few rough edges for a library function, like printing the utf8 warning to stderr, but we can address those if and when they come up as inappropriate. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- builtin.h | 3 -- builtin/commit-tree.c | 70 +------------------------------------------ cache.h | 2 ++ commit.c | 55 ++++++++++++++++++++++++++++++++++ commit.h | 4 +++ sha1_file.c | 10 +++++++ 6 files changed, 72 insertions(+), 72 deletions(-) diff --git a/builtin.h b/builtin.h index 464588b299..5c887ef61b 100644 --- a/builtin.h +++ b/builtin.h @@ -16,9 +16,6 @@ extern const char *help_unknown_cmd(const char *cmd); extern void prune_packed_objects(int); extern int fmt_merge_msg(int merge_summary, struct strbuf *in, struct strbuf *out); -extern int commit_tree(const char *msg, unsigned char *tree, - struct commit_list *parents, unsigned char *ret, - const char *author); extern int commit_notes(struct notes_tree *t, const char *msg); struct notes_rewrite_cfg { diff --git a/builtin/commit-tree.c b/builtin/commit-tree.c index 90dac349a3..87f0591c2f 100644 --- a/builtin/commit-tree.c +++ b/builtin/commit-tree.c @@ -9,19 +9,6 @@ #include "builtin.h" #include "utf8.h" -/* - * FIXME! Share the code with "write-tree.c" - */ -static void check_valid(unsigned char *sha1, enum object_type expect) -{ - enum object_type type = sha1_object_info(sha1, NULL); - if (type < 0) - die("%s is not a valid object", sha1_to_hex(sha1)); - if (type != expect) - die("%s is not a valid '%s' object", sha1_to_hex(sha1), - typename(expect)); -} - static const char commit_tree_usage[] = "git commit-tree [-p ]* < changelog"; static void new_parent(struct commit *parent, struct commit_list **parents_p) @@ -38,61 +25,6 @@ static void new_parent(struct commit *parent, struct commit_list **parents_p) commit_list_insert(parent, parents_p); } -static const char commit_utf8_warn[] = -"Warning: commit message does not conform to UTF-8.\n" -"You may want to amend it after fixing the message, or set the config\n" -"variable i18n.commitencoding to the encoding your project uses.\n"; - -int commit_tree(const char *msg, unsigned char *tree, - struct commit_list *parents, unsigned char *ret, - const char *author) -{ - int result; - int encoding_is_utf8; - struct strbuf buffer; - - check_valid(tree, OBJ_TREE); - - /* Not having i18n.commitencoding is the same as having utf-8 */ - encoding_is_utf8 = is_encoding_utf8(git_commit_encoding); - - strbuf_init(&buffer, 8192); /* should avoid reallocs for the headers */ - strbuf_addf(&buffer, "tree %s\n", sha1_to_hex(tree)); - - /* - * NOTE! This ordering means that the same exact tree merged with a - * different order of parents will be a _different_ changeset even - * if everything else stays the same. - */ - while (parents) { - struct commit_list *next = parents->next; - strbuf_addf(&buffer, "parent %s\n", - sha1_to_hex(parents->item->object.sha1)); - free(parents); - parents = next; - } - - /* Person/date information */ - if (!author) - author = git_author_info(IDENT_ERROR_ON_NO_NAME); - strbuf_addf(&buffer, "author %s\n", author); - strbuf_addf(&buffer, "committer %s\n", git_committer_info(IDENT_ERROR_ON_NO_NAME)); - if (!encoding_is_utf8) - strbuf_addf(&buffer, "encoding %s\n", git_commit_encoding); - strbuf_addch(&buffer, '\n'); - - /* And add the comment */ - strbuf_addstr(&buffer, msg); - - /* And check the encoding */ - if (encoding_is_utf8 && !is_utf8(buffer.buf)) - fprintf(stderr, commit_utf8_warn); - - result = write_sha1_file(buffer.buf, buffer.len, commit_type, ret); - strbuf_release(&buffer); - return result; -} - int cmd_commit_tree(int argc, const char **argv, const char *prefix) { int i; @@ -117,7 +49,7 @@ int cmd_commit_tree(int argc, const char **argv, const char *prefix) if (get_sha1(b, sha1)) die("Not a valid object name %s", b); - check_valid(sha1, OBJ_COMMIT); + assert_sha1_type(sha1, OBJ_COMMIT); new_parent(lookup_commit(sha1), &parents); } diff --git a/cache.h b/cache.h index 5eb0573bcc..bfc4d82f6c 100644 --- a/cache.h +++ b/cache.h @@ -718,6 +718,8 @@ extern int has_loose_object_nonlocal(const unsigned char *sha1); extern int has_pack_index(const unsigned char *sha1); +extern void assert_sha1_type(const unsigned char *sha1, enum object_type expect); + extern const signed char hexval_table[256]; static inline unsigned int hexval(unsigned char c) { diff --git a/commit.c b/commit.c index 731191e63b..e9b0750967 100644 --- a/commit.c +++ b/commit.c @@ -790,3 +790,58 @@ struct commit_list *reduce_heads(struct commit_list *heads) free(other); return result; } + +static const char commit_utf8_warn[] = +"Warning: commit message does not conform to UTF-8.\n" +"You may want to amend it after fixing the message, or set the config\n" +"variable i18n.commitencoding to the encoding your project uses.\n"; + +int commit_tree(const char *msg, unsigned char *tree, + struct commit_list *parents, unsigned char *ret, + const char *author) +{ + int result; + int encoding_is_utf8; + struct strbuf buffer; + + assert_sha1_type(tree, OBJ_TREE); + + /* Not having i18n.commitencoding is the same as having utf-8 */ + encoding_is_utf8 = is_encoding_utf8(git_commit_encoding); + + strbuf_init(&buffer, 8192); /* should avoid reallocs for the headers */ + strbuf_addf(&buffer, "tree %s\n", sha1_to_hex(tree)); + + /* + * NOTE! This ordering means that the same exact tree merged with a + * different order of parents will be a _different_ changeset even + * if everything else stays the same. + */ + while (parents) { + struct commit_list *next = parents->next; + strbuf_addf(&buffer, "parent %s\n", + sha1_to_hex(parents->item->object.sha1)); + free(parents); + parents = next; + } + + /* Person/date information */ + if (!author) + author = git_author_info(IDENT_ERROR_ON_NO_NAME); + strbuf_addf(&buffer, "author %s\n", author); + strbuf_addf(&buffer, "committer %s\n", git_committer_info(IDENT_ERROR_ON_NO_NAME)); + if (!encoding_is_utf8) + strbuf_addf(&buffer, "encoding %s\n", git_commit_encoding); + strbuf_addch(&buffer, '\n'); + + /* And add the comment */ + strbuf_addstr(&buffer, msg); + + /* And check the encoding */ + if (encoding_is_utf8 && !is_utf8(buffer.buf)) + fprintf(stderr, commit_utf8_warn); + + result = write_sha1_file(buffer.buf, buffer.len, commit_type, ret); + strbuf_release(&buffer); + return result; +} diff --git a/commit.h b/commit.h index 3cf5166581..2b7fd89dfd 100644 --- a/commit.h +++ b/commit.h @@ -158,4 +158,8 @@ static inline int single_parent(struct commit *commit) struct commit_list *reduce_heads(struct commit_list *heads); +extern int commit_tree(const char *msg, unsigned char *tree, + struct commit_list *parents, unsigned char *ret, + const char *author); + #endif /* COMMIT_H */ diff --git a/sha1_file.c b/sha1_file.c index ff65328006..28c056e074 100644 --- a/sha1_file.c +++ b/sha1_file.c @@ -2516,3 +2516,13 @@ int read_pack_header(int fd, struct pack_header *header) return PH_ERROR_PROTOCOL; return 0; } + +void assert_sha1_type(const unsigned char *sha1, enum object_type expect) +{ + enum object_type type = sha1_object_info(sha1, NULL); + if (type < 0) + die("%s is not a valid object", sha1_to_hex(sha1)); + if (type != expect) + die("%s is not a valid '%s' object", sha1_to_hex(sha1), + typename(expect)); +} From a941d5e3958ece565570a2dc3a5476ff5242f340 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Thu, 1 Apr 2010 20:07:40 -0400 Subject: [PATCH 2/5] introduce notes-cache interface Notes provide a fast lookup mechanism for data keyed by sha1. This is ideal for caching certain operations, like textconv filters. This patch builds some infrastructure to make it simpler to use notes trees as caches. In particular, caches: 1. don't have arbitrary commit messages. They store a cache validity string in the commit, and clear the tree when the cache validity string changes. 2. don't keep any commit history. The accumulated history of a a cache is just useless cruft. 3. use a looser form of locking for ref updates. If two processes try to write to the cache simultaneously, it is OK if one overwrites the other, losing some changes. It's just a cache, so we will just end up with an extra miss. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- Makefile | 2 ++ notes-cache.c | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++ notes-cache.h | 20 +++++++++++ 3 files changed, 116 insertions(+) create mode 100644 notes-cache.c create mode 100644 notes-cache.h diff --git a/Makefile b/Makefile index 8a0f5c4d5b..24e92abe91 100644 --- a/Makefile +++ b/Makefile @@ -486,6 +486,7 @@ LIB_H += log-tree.h LIB_H += mailmap.h LIB_H += merge-recursive.h LIB_H += notes.h +LIB_H += notes-cache.h LIB_H += object.h LIB_H += pack.h LIB_H += pack-refs.h @@ -575,6 +576,7 @@ LIB_OBJS += merge-file.o LIB_OBJS += merge-recursive.o LIB_OBJS += name-hash.o LIB_OBJS += notes.o +LIB_OBJS += notes-cache.o LIB_OBJS += object.o LIB_OBJS += pack-check.o LIB_OBJS += pack-refs.o diff --git a/notes-cache.c b/notes-cache.c new file mode 100644 index 0000000000..dee6d62e72 --- /dev/null +++ b/notes-cache.c @@ -0,0 +1,94 @@ +#include "cache.h" +#include "notes-cache.h" +#include "commit.h" +#include "refs.h" + +static int notes_cache_match_validity(const char *ref, const char *validity) +{ + unsigned char sha1[20]; + struct commit *commit; + struct pretty_print_context pretty_ctx; + struct strbuf msg = STRBUF_INIT; + int ret; + + if (read_ref(ref, sha1) < 0) + return 0; + + commit = lookup_commit_reference_gently(sha1, 1); + if (!commit) + return 0; + + memset(&pretty_ctx, 0, sizeof(pretty_ctx)); + format_commit_message(commit, "%s", &msg, &pretty_ctx); + strbuf_trim(&msg); + + ret = !strcmp(msg.buf, validity); + strbuf_release(&msg); + + return ret; +} + +void notes_cache_init(struct notes_cache *c, const char *name, + const char *validity) +{ + struct strbuf ref = STRBUF_INIT; + int flags = 0; + + memset(c, 0, sizeof(*c)); + c->validity = xstrdup(validity); + + strbuf_addf(&ref, "refs/notes/%s", name); + if (!notes_cache_match_validity(ref.buf, validity)) + flags = NOTES_INIT_EMPTY; + init_notes(&c->tree, ref.buf, combine_notes_overwrite, flags); + strbuf_release(&ref); +} + +int notes_cache_write(struct notes_cache *c) +{ + unsigned char tree_sha1[20]; + unsigned char commit_sha1[20]; + + if (!c || !c->tree.initialized || !c->tree.ref || !*c->tree.ref) + return -1; + if (!c->tree.dirty) + return 0; + + if (write_notes_tree(&c->tree, tree_sha1)) + return -1; + if (commit_tree(c->validity, tree_sha1, NULL, commit_sha1, NULL) < 0) + return -1; + if (update_ref("update notes cache", c->tree.ref, commit_sha1, NULL, + 0, QUIET_ON_ERR) < 0) + return -1; + + return 0; +} + +char *notes_cache_get(struct notes_cache *c, unsigned char key_sha1[20], + size_t *outsize) +{ + const unsigned char *value_sha1; + enum object_type type; + char *value; + unsigned long size; + + value_sha1 = get_note(&c->tree, key_sha1); + if (!value_sha1) + return NULL; + value = read_sha1_file(value_sha1, &type, &size); + + *outsize = size; + return value; +} + +int notes_cache_put(struct notes_cache *c, unsigned char key_sha1[20], + const char *data, size_t size) +{ + unsigned char value_sha1[20]; + + if (write_sha1_file(data, size, "blob", value_sha1) < 0) + return -1; + add_note(&c->tree, key_sha1, value_sha1, NULL); + return 0; +} diff --git a/notes-cache.h b/notes-cache.h new file mode 100644 index 0000000000..356f88fb3c --- /dev/null +++ b/notes-cache.h @@ -0,0 +1,20 @@ +#ifndef NOTES_CACHE_H +#define NOTES_CACHE_H + +#include "notes.h" + +struct notes_cache { + struct notes_tree tree; + char *validity; +}; + +void notes_cache_init(struct notes_cache *c, const char *name, + const char *validity); +int notes_cache_write(struct notes_cache *c); + +char *notes_cache_get(struct notes_cache *c, unsigned char sha1[20], size_t + *outsize); +int notes_cache_put(struct notes_cache *c, unsigned char sha1[20], + const char *data, size_t size); + +#endif /* NOTES_CACHE_H */ From 840383b2c2bd7179604f5c2595bf95e22a4e0c84 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Thu, 1 Apr 2010 20:09:26 -0400 Subject: [PATCH 3/5] textconv: refactor calls to run_textconv This patch adds a fill_textconv wrapper, which centralizes some minor logic like error checking and handling the case of no-textconv. In addition to dropping the number of lines, this will make it easier in future patches to handle multiple types of textconv. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- diff.c | 66 ++++++++++++++++++++++++++-------------------------------- 1 file changed, 30 insertions(+), 36 deletions(-) diff --git a/diff.c b/diff.c index db2cd5d35f..9665d6d41f 100644 --- a/diff.c +++ b/diff.c @@ -43,7 +43,8 @@ static char diff_colors[][COLOR_MAXLEN] = { }; static void diff_filespec_load_driver(struct diff_filespec *one); -static char *run_textconv(const char *, struct diff_filespec *, size_t *); +static size_t fill_textconv(const char *cmd, + struct diff_filespec *df, char **outbuf); static int parse_diff_color_slot(const char *var, int ofs) { @@ -477,7 +478,7 @@ static void emit_rewrite_diff(const char *name_a, const char *reset = diff_get_color(color_diff, DIFF_RESET); static struct strbuf a_name = STRBUF_INIT, b_name = STRBUF_INIT; const char *a_prefix, *b_prefix; - const char *data_one, *data_two; + char *data_one, *data_two; size_t size_one, size_two; struct emit_callback ecbdata; @@ -499,26 +500,8 @@ static void emit_rewrite_diff(const char *name_a, quote_two_c_style(&a_name, a_prefix, name_a, 0); quote_two_c_style(&b_name, b_prefix, name_b, 0); - diff_populate_filespec(one, 0); - diff_populate_filespec(two, 0); - if (textconv_one) { - data_one = run_textconv(textconv_one, one, &size_one); - if (!data_one) - die("unable to read files to diff"); - } - else { - data_one = one->data; - size_one = one->size; - } - if (textconv_two) { - data_two = run_textconv(textconv_two, two, &size_two); - if (!data_two) - die("unable to read files to diff"); - } - else { - data_two = two->data; - size_two = two->size; - } + size_one = fill_textconv(textconv_one, one, &data_one); + size_two = fill_textconv(textconv_two, two, &data_two); memset(&ecbdata, 0, sizeof(ecbdata)); ecbdata.color_diff = color_diff; @@ -1717,20 +1700,8 @@ static void builtin_diff(const char *name_a, strbuf_reset(&header); } - if (textconv_one) { - size_t size; - mf1.ptr = run_textconv(textconv_one, one, &size); - if (!mf1.ptr) - die("unable to read files to diff"); - mf1.size = size; - } - if (textconv_two) { - size_t size; - mf2.ptr = run_textconv(textconv_two, two, &size); - if (!mf2.ptr) - die("unable to read files to diff"); - mf2.size = size; - } + mf1.size = fill_textconv(textconv_one, one, &mf1.ptr); + mf2.size = fill_textconv(textconv_two, two, &mf2.ptr); pe = diff_funcname_pattern(one); if (!pe) @@ -3916,3 +3887,26 @@ static char *run_textconv(const char *pgm, struct diff_filespec *spec, return strbuf_detach(&buf, outsize); } + +static size_t fill_textconv(const char *cmd, + struct diff_filespec *df, + char **outbuf) +{ + size_t size; + + if (!cmd) { + if (!DIFF_FILE_VALID(df)) { + *outbuf = ""; + return 0; + } + if (diff_populate_filespec(df, 0)) + die("unable to read files to diff"); + *outbuf = df->data; + return df->size; + } + + *outbuf = run_textconv(cmd, df, &size); + if (!*outbuf) + die("unable to read files to diff"); + return size; +} From d9bae1a178f0f8b198ea611e874975214ad6f990 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Thu, 1 Apr 2010 20:12:15 -0400 Subject: [PATCH 4/5] diff: cache textconv output Running a textconv filter can take a long time. It's particularly bad for a large file which needs to be spooled to disk, but even for small files, the fork+exec overhead can add up for something like "git log -p". This patch uses the notes-cache mechanism to keep a fast cache of textconv output. Caches are stored in refs/notes/textconv/$x, where $x is the userdiff driver defined in gitattributes. Caching is enabled only if diff.$x.cachetextconv is true. In my test repo, on a commit with 45 jpg and avi files changed and a textconv to show their exif tags: [before] $ time git show >/dev/null real 0m13.724s user 0m12.057s sys 0m1.624s [after, first run] $ git config diff.mfo.cachetextconv true $ time git show >/dev/null real 0m14.252s user 0m12.197s sys 0m1.800s [after, subsequent runs] $ time git show >/dev/null real 0m0.352s user 0m0.148s sys 0m0.200s So for a slight (3.8%) cost on the first run, we achieve an almost 40x speed up on subsequent runs. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- Documentation/gitattributes.txt | 20 ++++++ diff.c | 52 ++++++++++++--- t/t4042-diff-textconv-caching.sh | 109 +++++++++++++++++++++++++++++++ userdiff.c | 9 +++ userdiff.h | 4 ++ 5 files changed, 185 insertions(+), 9 deletions(-) create mode 100755 t/t4042-diff-textconv-caching.sh diff --git a/Documentation/gitattributes.txt b/Documentation/gitattributes.txt index d892e642ed..a8500d1772 100644 --- a/Documentation/gitattributes.txt +++ b/Documentation/gitattributes.txt @@ -414,6 +414,26 @@ because it quickly conveys the changes you have made), you should generate it separately and send it as a comment _in addition to_ the usual binary diff that you might send. +Because text conversion can be slow, especially when doing a +large number of them with `git log -p`, git provides a mechanism +to cache the output and use it in future diffs. To enable +caching, set the "cachetextconv" variable in your diff driver's +config. For example: + +------------------------ +[diff "jpg"] + textconv = exif + cachetextconv = true +------------------------ + +This will cache the result of running "exif" on each blob +indefinitely. If you change the textconv config variable for a +diff driver, git will automatically invalidate the cache entries +and re-run the textconv filter. If you want to invalidate the +cache manually (e.g., because your version of "exif" was updated +and now produces better output), you can remove the cache +manually with `git update-ref -d refs/notes/textconv/jpg` (where +"jpg" is the name of the diff driver, as in the example above). Performing a three-way merge ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/diff.c b/diff.c index 9665d6d41f..72d8503d89 100644 --- a/diff.c +++ b/diff.c @@ -43,7 +43,7 @@ static char diff_colors[][COLOR_MAXLEN] = { }; static void diff_filespec_load_driver(struct diff_filespec *one); -static size_t fill_textconv(const char *cmd, +static size_t fill_textconv(struct userdiff_driver *driver, struct diff_filespec *df, char **outbuf); static int parse_diff_color_slot(const char *var, int ofs) @@ -466,8 +466,8 @@ static void emit_rewrite_diff(const char *name_a, const char *name_b, struct diff_filespec *one, struct diff_filespec *two, - const char *textconv_one, - const char *textconv_two, + struct userdiff_driver *textconv_one, + struct userdiff_driver *textconv_two, struct diff_options *o) { int lc_a, lc_b; @@ -1569,14 +1569,26 @@ void diff_set_mnemonic_prefix(struct diff_options *options, const char *a, const options->b_prefix = b; } -static const char *get_textconv(struct diff_filespec *one) +static struct userdiff_driver *get_textconv(struct diff_filespec *one) { if (!DIFF_FILE_VALID(one)) return NULL; if (!S_ISREG(one->mode)) return NULL; diff_filespec_load_driver(one); - return one->driver->textconv; + if (!one->driver->textconv) + return NULL; + + if (one->driver->textconv_want_cache && !one->driver->textconv_cache) { + struct notes_cache *c = xmalloc(sizeof(*c)); + struct strbuf name = STRBUF_INIT; + + strbuf_addf(&name, "textconv/%s", one->driver->name); + notes_cache_init(c, name.buf, one->driver->textconv); + one->driver->textconv_cache = c; + } + + return one->driver; } static void builtin_diff(const char *name_a, @@ -1593,7 +1605,8 @@ static void builtin_diff(const char *name_a, const char *set = diff_get_color_opt(o, DIFF_METAINFO); const char *reset = diff_get_color_opt(o, DIFF_RESET); const char *a_prefix, *b_prefix; - const char *textconv_one = NULL, *textconv_two = NULL; + struct userdiff_driver *textconv_one = NULL; + struct userdiff_driver *textconv_two = NULL; struct strbuf header = STRBUF_INIT; if (DIFF_OPT_TST(o, SUBMODULE_LOG) && @@ -3888,13 +3901,13 @@ static char *run_textconv(const char *pgm, struct diff_filespec *spec, return strbuf_detach(&buf, outsize); } -static size_t fill_textconv(const char *cmd, +static size_t fill_textconv(struct userdiff_driver *driver, struct diff_filespec *df, char **outbuf) { size_t size; - if (!cmd) { + if (!driver || !driver->textconv) { if (!DIFF_FILE_VALID(df)) { *outbuf = ""; return 0; @@ -3905,8 +3918,29 @@ static size_t fill_textconv(const char *cmd, return df->size; } - *outbuf = run_textconv(cmd, df, &size); + if (driver->textconv_cache) { + *outbuf = notes_cache_get(driver->textconv_cache, df->sha1, + &size); + if (*outbuf) + return size; + } + + *outbuf = run_textconv(driver->textconv, df, &size); if (!*outbuf) die("unable to read files to diff"); + + if (driver->textconv_cache) { + /* ignore errors, as we might be in a readonly repository */ + notes_cache_put(driver->textconv_cache, df->sha1, *outbuf, + size); + /* + * we could save up changes and flush them all at the end, + * but we would need an extra call after all diffing is done. + * Since generating a cache entry is the slow path anyway, + * this extra overhead probably isn't a big deal. + */ + notes_cache_write(driver->textconv_cache); + } + return size; } diff --git a/t/t4042-diff-textconv-caching.sh b/t/t4042-diff-textconv-caching.sh new file mode 100755 index 0000000000..91f8198f05 --- /dev/null +++ b/t/t4042-diff-textconv-caching.sh @@ -0,0 +1,109 @@ +#!/bin/sh + +test_description='test textconv caching' +. ./test-lib.sh + +cat >helper <<'EOF' +#!/bin/sh +sed 's/^/converted: /' "$@" >helper.out +cat helper.out +EOF +chmod +x helper + +test_expect_success 'setup' ' + echo foo content 1 >foo.bin && + echo bar content 1 >bar.bin && + git add . && + git commit -m one && + echo foo content 2 >foo.bin && + echo bar content 2 >bar.bin && + git commit -a -m two && + echo "*.bin diff=magic" >.gitattributes && + git config diff.magic.textconv ./helper && + git config diff.magic.cachetextconv true +' + +cat >expect <actual && + test_cmp expect actual +' + +test_expect_success 'cached textconv produces same output' ' + git diff HEAD^ HEAD >actual && + test_cmp expect actual +' + +test_expect_success 'cached textconv does not run helper' ' + rm -f helper.out && + git diff HEAD^ HEAD >actual && + test_cmp expect actual && + ! test -r helper.out +' + +cat >expect <other && + git config diff.magic.textconv "./helper other" && + git diff HEAD^ HEAD >actual && + test_cmp expect actual +' + +cat >expect <>.gitattributes && + git diff HEAD^ HEAD >actual && + test_cmp expect actual +' + +test_done diff --git a/userdiff.c b/userdiff.c index df992490d5..67003fbb23 100644 --- a/userdiff.c +++ b/userdiff.c @@ -1,3 +1,4 @@ +#include "cache.h" #include "userdiff.h" #include "cache.h" #include "attr.h" @@ -167,6 +168,12 @@ static int parse_tristate(int *b, const char *k, const char *v) return 1; } +static int parse_bool(int *b, const char *k, const char *v) +{ + *b = git_config_bool(k, v); + return 1; +} + int userdiff_config(const char *k, const char *v) { struct userdiff_driver *drv; @@ -181,6 +188,8 @@ int userdiff_config(const char *k, const char *v) return parse_string(&drv->external, k, v); if ((drv = parse_driver(k, v, "textconv"))) return parse_string(&drv->textconv, k, v); + if ((drv = parse_driver(k, v, "cachetextconv"))) + return parse_bool(&drv->textconv_want_cache, k, v); if ((drv = parse_driver(k, v, "wordregex"))) return parse_string(&drv->word_regex, k, v); diff --git a/userdiff.h b/userdiff.h index c3151594f5..942d594950 100644 --- a/userdiff.h +++ b/userdiff.h @@ -1,6 +1,8 @@ #ifndef USERDIFF_H #define USERDIFF_H +#include "notes-cache.h" + struct userdiff_funcname { const char *pattern; int cflags; @@ -13,6 +15,8 @@ struct userdiff_driver { struct userdiff_funcname funcname; const char *word_regex; const char *textconv; + struct notes_cache *textconv_cache; + int textconv_want_cache; }; int userdiff_config(const char *k, const char *v); From b3373982667dc983b8dacf33861d25b20bafb995 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Thu, 1 Apr 2010 20:14:24 -0400 Subject: [PATCH 5/5] diff: avoid useless filespec population builtin_diff calls fill_mmfile fairly early, which in turn calls diff_populate_filespec, which actually retrieves the file's blob contents into a buffer. Long ago, this was sensible as we would need to look at the blobs eventually. These days, however, we may not ever want those blobs if we end up using a textconv cache, and for large binary files (exactly the sort for which you might have a textconv cache), just retrieving the objects can be costly. This patch just pushes the fill_mmfile call a bit later, so we can avoid populating the filespec in some cases. There is one thing to note that looks like a bug but isn't. We push the fill_mmfile down into the first branch of a conditional. It seems like we would need it on the other branch, too, but we don't; fill_textconv does it for us (in fact, before this, we were just writing over the results of the fill_mmfile on that branch). Here's a timing sample on a commit with 45 changed jpgs and avis. The result is fully textconv cached, but we still wasted a lot of time just pulling the blobs from storage. The total size of the blobs (source and dest) is about 180M. [before] $ time git show >/dev/null real 0m0.352s user 0m0.148s sys 0m0.200s [after] $ time git show >/dev/null real 0m0.009s user 0m0.004s sys 0m0.004s And that's on a warm cache. On a cold cache, the "after" case is not much worse, but the "before" case has to do an extra 180M of I/O. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- diff.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/diff.c b/diff.c index 72d8503d89..4cb6d9a9e8 100644 --- a/diff.c +++ b/diff.c @@ -1680,12 +1680,11 @@ static void builtin_diff(const char *name_a, } } - if (fill_mmfile(&mf1, one) < 0 || fill_mmfile(&mf2, two) < 0) - die("unable to read files to diff"); - if (!DIFF_OPT_TST(o, TEXT) && - ( (diff_filespec_is_binary(one) && !textconv_one) || - (diff_filespec_is_binary(two) && !textconv_two) )) { + ( (!textconv_one && diff_filespec_is_binary(one)) || + (!textconv_two && diff_filespec_is_binary(two)) )) { + if (fill_mmfile(&mf1, one) < 0 || fill_mmfile(&mf2, two) < 0) + die("unable to read files to diff"); /* Quite common confusing case */ if (mf1.size == mf2.size && !memcmp(mf1.ptr, mf2.ptr, mf1.size))