From 9a305b67f8055503a743e67f628400f094c169ee Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Sun, 16 Apr 2006 21:07:32 -0700 Subject: [PATCH 1/2] Geert's similarity Define a function to compute similarity score 0.0<=score<=1.0 Signed-off-by: Junio C Hamano --- gsimm.c | 29 +++++++++++++++++++++++++++++ gsimm.h | 5 +++-- test-gsimm.c | 29 +++++++---------------------- 3 files changed, 39 insertions(+), 24 deletions(-) diff --git a/gsimm.c b/gsimm.c index 7024bf8f58..bd646eb3d2 100644 --- a/gsimm.c +++ b/gsimm.c @@ -1,3 +1,4 @@ +#include <string.h> #include "rabinpoly.h" #include "gsimm.h" @@ -32,6 +33,29 @@ static void freq_to_md(u_char *md, int *freq) bzero (freq, sizeof(freq[0]) * MD_BITS); } +static int dist (u_char *l, u_char *r) +{ int j, k; + int d = 0; + + for (j = 0; j < MD_LENGTH; j++) + { u_char ch = l[j] ^ r[j]; + + for (k = 0; k < 8; k++) d += ((ch & (1<<k)) > 0); + } + + return d; +} + +double gb_simm_score(u_char *l, u_char *r) +{ + int d = dist(l, r); + double sim = (double) (d) / (MD_LENGTH * 4 - 1); + if (1.0 < sim) + return 0; + else + return 1.0 - sim; +} + void gb_simm_process(u_char *data, unsigned len, u_char *md) { size_t j = 0; u_int32_t ofs; @@ -39,6 +63,11 @@ void gb_simm_process(u_char *data, unsigned len, u_char *md) u_int32_t count [MD_BITS * (GROUP_COUNTERS/GROUP_BITS)]; int freq[MD_BITS]; + if (len < GB_SIMM_MIN_FILE_SIZE || GB_SIMM_MAX_FILE_SIZE < len) { + memset(md, 0, MD_LENGTH); + return; + } + bzero (freq, sizeof(freq[0]) * MD_BITS); bzero (dup_cache, DUP_CACHE_SIZE * sizeof (u_int32_t)); bzero (count, (MD_BITS * (GROUP_COUNTERS/GROUP_BITS) * sizeof (u_int32_t))); diff --git a/gsimm.h b/gsimm.h index 4b023b91a9..17fab32d87 100644 --- a/gsimm.h +++ b/gsimm.h @@ -15,14 +15,15 @@ In order to get at least an average of 12 samples per bit in the final message digest, require at least 3 * MD_LENGTH complete windows in the file. 
*/ -#define MIN_FILE_SIZE (3 * MD_LENGTH + 2 * (RABIN_WINDOW_SIZE - 1)) +#define GB_SIMM_MIN_FILE_SIZE (3 * MD_LENGTH + 2 * (RABIN_WINDOW_SIZE - 1)) /* Limit matching algorithm to files less than 256 MB, so we can use 32 bit integers everywhere without fear of overflow. For larger files we should add logic to mmap the file by piece and accumulate the frequency counts. */ -#define MAX_FILE_SIZE (256*1024*1024 - 1) +#define GB_SIMM_MAX_FILE_SIZE (256*1024*1024 - 1) void gb_simm_process(u_char *data, unsigned len, u_char *md); +double gb_simm_score(u_char *l, u_char *r); #endif diff --git a/test-gsimm.c b/test-gsimm.c index bd28b7da28..b1e7939b65 100644 --- a/test-gsimm.c +++ b/test-gsimm.c @@ -58,19 +58,6 @@ void usage() exit (1); } -int dist (u_char *l, u_char *r) -{ int j, k; - int d = 0; - - for (j = 0; j < MD_LENGTH; j++) - { u_char ch = l[j] ^ r[j]; - - for (k = 0; k < 8; k++) d += ((ch & (1<<k)) > 0); - } - - return d; -} - char *md_to_str(u_char *md) { int j; @@ -102,8 +89,8 @@ void process_file (char *name) exit (2); } - if (fs.st_size >= MIN_FILE_SIZE - && fs.st_size <= MAX_FILE_SIZE) + if (fs.st_size >= GB_SIMM_MIN_FILE_SIZE + && fs.st_size <= GB_SIMM_MAX_FILE_SIZE) { fi->length = fs.st_size; fi->name = name; @@ -116,13 +103,11 @@ void process_file (char *name) gb_simm_process (data, fs.st_size, fi->md); if (flag_relative) - { int d = dist (fi->md, relative_md); - double sim = 1.0 - MIN (1.0, (double) (d) / (MD_LENGTH * 4 - 1)); - fprintf (stdout, "%s %llu %u %s %u %3.1f\n", - md_to_str (fi->md), (long long unsigned) 0, - (unsigned) fs.st_size, name, - d, 100.0 * sim); - } + fprintf (stdout, "%s %llu %u %s %u %3.1f\n", + md_to_str (fi->md), (long long unsigned) 0, + (unsigned) fs.st_size, name, + (unsigned) 0, + 100.0 * gb_simm_score(fi->md, relative_md)); else { fprintf (stdout, "%s %llu %u %s\n", From ca9de6cadfa55c9b00476112e2bb52fe20ab95e0 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Sun, 16 Apr 2006 22:03:01 -0700 Subject: [PATCH 2/2] Try using Geert 
similarity code in pack-objects. It appears the fingerprinting itself is too expensive to be worth doing for this purpose. A failed experiment. Signed-off-by: Junio C Hamano --- Makefile | 2 +- pack-objects.c | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 69ca05b2f9..aa499ed52f 100644 --- a/Makefile +++ b/Makefile @@ -204,7 +204,7 @@ DIFF_OBJS = \ diffcore-delta.o log-tree.o LIB_OBJS = \ - blob.o commit.o connect.o csum-file.o \ + blob.o commit.o connect.o csum-file.o gsimm.o rabinpoly.o \ date.o diff-delta.o entry.o exec_cmd.o ident.o index.o \ object.o pack-check.o patch-delta.o path.o pkt-line.o \ quote.o read-cache.o refs.o run-command.o \ diff --git a/pack-objects.c b/pack-objects.c index 09f4f2c944..18f8f82571 100644 --- a/pack-objects.c +++ b/pack-objects.c @@ -8,6 +8,8 @@ #include "pack.h" #include "csum-file.h" #include "tree-walk.h" +#include "rabinpoly.h" +#include "gsimm.h" #include #include @@ -993,6 +995,7 @@ static int type_size_sort(const struct object_entry *a, const struct object_entr struct unpacked { struct object_entry *entry; + unsigned char fingerprint[MD_LENGTH]; void *data; }; @@ -1041,6 +1044,9 @@ static int try_delta(struct unpacked *cur, struct unpacked *old, unsigned max_de if (old_entry->depth >= max_depth) return 0; + if (gb_simm_score(cur->fingerprint, old->fingerprint) < 0.4) + return 0; + /* * NOTE! 
* @@ -1077,6 +1083,7 @@ static void find_deltas(struct object_entry **list, int window, int depth) unsigned processed = 0; unsigned last_percent = 999; + rabin_reset (); memset(array, 0, array_size); i = nr_objects; idx = 0; @@ -1115,6 +1122,8 @@ static void find_deltas(struct object_entry **list, int window, int depth) if (size != entry->size) die("object %s inconsistent object length (%lu vs %lu)", sha1_to_hex(entry->sha1), size, entry->size); + gb_simm_process(n->data, size, n->fingerprint); + j = window; while (--j > 0) { unsigned int other_idx = idx + j; @@ -1124,6 +1133,7 @@ static void find_deltas(struct object_entry **list, int window, int depth) m = array + other_idx; if (!m->entry) break; + if (try_delta(n, m, depth) < 0) break; }