2 * diff-delta.c: generate a delta between two buffers
4 * Many parts of this file have been lifted from LibXDiff version 0.10.
5 * http://www.xmailserver.org/xdiff-lib.html
7 * LibXDiff was written by Davide Libenzi <davidel@xmailserver.org>
8 * Copyright (C) 2003 Davide Libenzi
10 * Many mods for GIT usage by Nicolas Pitre <nico@cam.org>, (C) 2005.
12 * This file is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License as published by the Free Software Foundation; either
15 * version 2.1 of the License, or (at your option) any later version.
17 * Use of this within git automatically means that the LGPL
18 * licensing gets turned into GPLv2 within this project.
26 /* maximum hash entry list for the same hash bucket */
29 #define RABIN_SHIFT 23
30 #define RABIN_WINDOW 16
32 static const unsigned int T[256] = {
33 0x00000000, 0xab59b4d1, 0x56b369a2, 0xfdeadd73, 0x063f6795, 0xad66d344,
34 0x508c0e37, 0xfbd5bae6, 0x0c7ecf2a, 0xa7277bfb, 0x5acda688, 0xf1941259,
35 0x0a41a8bf, 0xa1181c6e, 0x5cf2c11d, 0xf7ab75cc, 0x18fd9e54, 0xb3a42a85,
36 0x4e4ef7f6, 0xe5174327, 0x1ec2f9c1, 0xb59b4d10, 0x48719063, 0xe32824b2,
37 0x1483517e, 0xbfdae5af, 0x423038dc, 0xe9698c0d, 0x12bc36eb, 0xb9e5823a,
38 0x440f5f49, 0xef56eb98, 0x31fb3ca8, 0x9aa28879, 0x6748550a, 0xcc11e1db,
39 0x37c45b3d, 0x9c9defec, 0x6177329f, 0xca2e864e, 0x3d85f382, 0x96dc4753,
40 0x6b369a20, 0xc06f2ef1, 0x3bba9417, 0x90e320c6, 0x6d09fdb5, 0xc6504964,
41 0x2906a2fc, 0x825f162d, 0x7fb5cb5e, 0xd4ec7f8f, 0x2f39c569, 0x846071b8,
42 0x798aaccb, 0xd2d3181a, 0x25786dd6, 0x8e21d907, 0x73cb0474, 0xd892b0a5,
43 0x23470a43, 0x881ebe92, 0x75f463e1, 0xdeadd730, 0x63f67950, 0xc8afcd81,
44 0x354510f2, 0x9e1ca423, 0x65c91ec5, 0xce90aa14, 0x337a7767, 0x9823c3b6,
45 0x6f88b67a, 0xc4d102ab, 0x393bdfd8, 0x92626b09, 0x69b7d1ef, 0xc2ee653e,
46 0x3f04b84d, 0x945d0c9c, 0x7b0be704, 0xd05253d5, 0x2db88ea6, 0x86e13a77,
47 0x7d348091, 0xd66d3440, 0x2b87e933, 0x80de5de2, 0x7775282e, 0xdc2c9cff,
48 0x21c6418c, 0x8a9ff55d, 0x714a4fbb, 0xda13fb6a, 0x27f92619, 0x8ca092c8,
49 0x520d45f8, 0xf954f129, 0x04be2c5a, 0xafe7988b, 0x5432226d, 0xff6b96bc,
50 0x02814bcf, 0xa9d8ff1e, 0x5e738ad2, 0xf52a3e03, 0x08c0e370, 0xa39957a1,
51 0x584ced47, 0xf3155996, 0x0eff84e5, 0xa5a63034, 0x4af0dbac, 0xe1a96f7d,
52 0x1c43b20e, 0xb71a06df, 0x4ccfbc39, 0xe79608e8, 0x1a7cd59b, 0xb125614a,
53 0x468e1486, 0xedd7a057, 0x103d7d24, 0xbb64c9f5, 0x40b17313, 0xebe8c7c2,
54 0x16021ab1, 0xbd5bae60, 0x6cb54671, 0xc7ecf2a0, 0x3a062fd3, 0x915f9b02,
55 0x6a8a21e4, 0xc1d39535, 0x3c394846, 0x9760fc97, 0x60cb895b, 0xcb923d8a,
56 0x3678e0f9, 0x9d215428, 0x66f4eece, 0xcdad5a1f, 0x3047876c, 0x9b1e33bd,
57 0x7448d825, 0xdf116cf4, 0x22fbb187, 0x89a20556, 0x7277bfb0, 0xd92e0b61,
58 0x24c4d612, 0x8f9d62c3, 0x7836170f, 0xd36fa3de, 0x2e857ead, 0x85dcca7c,
59 0x7e09709a, 0xd550c44b, 0x28ba1938, 0x83e3ade9, 0x5d4e7ad9, 0xf617ce08,
60 0x0bfd137b, 0xa0a4a7aa, 0x5b711d4c, 0xf028a99d, 0x0dc274ee, 0xa69bc03f,
61 0x5130b5f3, 0xfa690122, 0x0783dc51, 0xacda6880, 0x570fd266, 0xfc5666b7,
62 0x01bcbbc4, 0xaae50f15, 0x45b3e48d, 0xeeea505c, 0x13008d2f, 0xb85939fe,
63 0x438c8318, 0xe8d537c9, 0x153feaba, 0xbe665e6b, 0x49cd2ba7, 0xe2949f76,
64 0x1f7e4205, 0xb427f6d4, 0x4ff24c32, 0xe4abf8e3, 0x19412590, 0xb2189141,
65 0x0f433f21, 0xa41a8bf0, 0x59f05683, 0xf2a9e252, 0x097c58b4, 0xa225ec65,
66 0x5fcf3116, 0xf49685c7, 0x033df00b, 0xa86444da, 0x558e99a9, 0xfed72d78,
67 0x0502979e, 0xae5b234f, 0x53b1fe3c, 0xf8e84aed, 0x17bea175, 0xbce715a4,
68 0x410dc8d7, 0xea547c06, 0x1181c6e0, 0xbad87231, 0x4732af42, 0xec6b1b93,
69 0x1bc06e5f, 0xb099da8e, 0x4d7307fd, 0xe62ab32c, 0x1dff09ca, 0xb6a6bd1b,
70 0x4b4c6068, 0xe015d4b9, 0x3eb80389, 0x95e1b758, 0x680b6a2b, 0xc352defa,
71 0x3887641c, 0x93ded0cd, 0x6e340dbe, 0xc56db96f, 0x32c6cca3, 0x999f7872,
72 0x6475a501, 0xcf2c11d0, 0x34f9ab36, 0x9fa01fe7, 0x624ac294, 0xc9137645,
73 0x26459ddd, 0x8d1c290c, 0x70f6f47f, 0xdbaf40ae, 0x207afa48, 0x8b234e99,
74 0x76c993ea, 0xdd90273b, 0x2a3b52f7, 0x8162e626, 0x7c883b55, 0xd7d18f84,
75 0x2c043562, 0x875d81b3, 0x7ab75cc0, 0xd1eee811
78 static const unsigned int U[256] = {
79 0x00000000, 0x7eb5200d, 0x5633f4cb, 0x2886d4c6, 0x073e5d47, 0x798b7d4a,
80 0x510da98c, 0x2fb88981, 0x0e7cba8e, 0x70c99a83, 0x584f4e45, 0x26fa6e48,
81 0x0942e7c9, 0x77f7c7c4, 0x5f711302, 0x21c4330f, 0x1cf9751c, 0x624c5511,
82 0x4aca81d7, 0x347fa1da, 0x1bc7285b, 0x65720856, 0x4df4dc90, 0x3341fc9d,
83 0x1285cf92, 0x6c30ef9f, 0x44b63b59, 0x3a031b54, 0x15bb92d5, 0x6b0eb2d8,
84 0x4388661e, 0x3d3d4613, 0x39f2ea38, 0x4747ca35, 0x6fc11ef3, 0x11743efe,
85 0x3eccb77f, 0x40799772, 0x68ff43b4, 0x164a63b9, 0x378e50b6, 0x493b70bb,
86 0x61bda47d, 0x1f088470, 0x30b00df1, 0x4e052dfc, 0x6683f93a, 0x1836d937,
87 0x250b9f24, 0x5bbebf29, 0x73386bef, 0x0d8d4be2, 0x2235c263, 0x5c80e26e,
88 0x740636a8, 0x0ab316a5, 0x2b7725aa, 0x55c205a7, 0x7d44d161, 0x03f1f16c,
89 0x2c4978ed, 0x52fc58e0, 0x7a7a8c26, 0x04cfac2b, 0x73e5d470, 0x0d50f47d,
90 0x25d620bb, 0x5b6300b6, 0x74db8937, 0x0a6ea93a, 0x22e87dfc, 0x5c5d5df1,
91 0x7d996efe, 0x032c4ef3, 0x2baa9a35, 0x551fba38, 0x7aa733b9, 0x041213b4,
92 0x2c94c772, 0x5221e77f, 0x6f1ca16c, 0x11a98161, 0x392f55a7, 0x479a75aa,
93 0x6822fc2b, 0x1697dc26, 0x3e1108e0, 0x40a428ed, 0x61601be2, 0x1fd53bef,
94 0x3753ef29, 0x49e6cf24, 0x665e46a5, 0x18eb66a8, 0x306db26e, 0x4ed89263,
95 0x4a173e48, 0x34a21e45, 0x1c24ca83, 0x6291ea8e, 0x4d29630f, 0x339c4302,
96 0x1b1a97c4, 0x65afb7c9, 0x446b84c6, 0x3adea4cb, 0x1258700d, 0x6ced5000,
97 0x4355d981, 0x3de0f98c, 0x15662d4a, 0x6bd30d47, 0x56ee4b54, 0x285b6b59,
98 0x00ddbf9f, 0x7e689f92, 0x51d01613, 0x2f65361e, 0x07e3e2d8, 0x7956c2d5,
99 0x5892f1da, 0x2627d1d7, 0x0ea10511, 0x7014251c, 0x5facac9d, 0x21198c90,
100 0x099f5856, 0x772a785b, 0x4c921c31, 0x32273c3c, 0x1aa1e8fa, 0x6414c8f7,
101 0x4bac4176, 0x3519617b, 0x1d9fb5bd, 0x632a95b0, 0x42eea6bf, 0x3c5b86b2,
102 0x14dd5274, 0x6a687279, 0x45d0fbf8, 0x3b65dbf5, 0x13e30f33, 0x6d562f3e,
103 0x506b692d, 0x2ede4920, 0x06589de6, 0x78edbdeb, 0x5755346a, 0x29e01467,
104 0x0166c0a1, 0x7fd3e0ac, 0x5e17d3a3, 0x20a2f3ae, 0x08242768, 0x76910765,
105 0x59298ee4, 0x279caee9, 0x0f1a7a2f, 0x71af5a22, 0x7560f609, 0x0bd5d604,
106 0x235302c2, 0x5de622cf, 0x725eab4e, 0x0ceb8b43, 0x246d5f85, 0x5ad87f88,
107 0x7b1c4c87, 0x05a96c8a, 0x2d2fb84c, 0x539a9841, 0x7c2211c0, 0x029731cd,
108 0x2a11e50b, 0x54a4c506, 0x69998315, 0x172ca318, 0x3faa77de, 0x411f57d3,
109 0x6ea7de52, 0x1012fe5f, 0x38942a99, 0x46210a94, 0x67e5399b, 0x19501996,
110 0x31d6cd50, 0x4f63ed5d, 0x60db64dc, 0x1e6e44d1, 0x36e89017, 0x485db01a,
111 0x3f77c841, 0x41c2e84c, 0x69443c8a, 0x17f11c87, 0x38499506, 0x46fcb50b,
112 0x6e7a61cd, 0x10cf41c0, 0x310b72cf, 0x4fbe52c2, 0x67388604, 0x198da609,
113 0x36352f88, 0x48800f85, 0x6006db43, 0x1eb3fb4e, 0x238ebd5d, 0x5d3b9d50,
114 0x75bd4996, 0x0b08699b, 0x24b0e01a, 0x5a05c017, 0x728314d1, 0x0c3634dc,
115 0x2df207d3, 0x534727de, 0x7bc1f318, 0x0574d315, 0x2acc5a94, 0x54797a99,
116 0x7cffae5f, 0x024a8e52, 0x06852279, 0x78300274, 0x50b6d6b2, 0x2e03f6bf,
117 0x01bb7f3e, 0x7f0e5f33, 0x57888bf5, 0x293dabf8, 0x08f998f7, 0x764cb8fa,
118 0x5eca6c3c, 0x207f4c31, 0x0fc7c5b0, 0x7172e5bd, 0x59f4317b, 0x27411176,
119 0x1a7c5765, 0x64c97768, 0x4c4fa3ae, 0x32fa83a3, 0x1d420a22, 0x63f72a2f,
120 0x4b71fee9, 0x35c4dee4, 0x1400edeb, 0x6ab5cde6, 0x42331920, 0x3c86392d,
121 0x133eb0ac, 0x6d8b90a1, 0x450d4467, 0x3bb8646a
125 const unsigned char *ptr;
127 struct index_entry *next;
132 unsigned long src_size;
133 unsigned int hash_mask;
134 struct index_entry *hash[0];
137 struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
139 unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
140 const unsigned char *data, *buffer = buf;
141 struct delta_index *index;
142 struct index_entry *entry, **hash;
144 unsigned long memsize;
146 if (!buf || !bufsize)
149 /* Determine index hash size. Note that indexing skips the
150 first byte to allow for optimizing the rabin polynomial
151 initialization in create_delta(). */
152 entries = (bufsize - 1) / RABIN_WINDOW;
154 for (i = 4; (1 << i) < hsize && i < 31; i++);
158 /* allocate lookup index */
159 memsize = sizeof(*index) +
160 sizeof(*hash) * hsize +
161 sizeof(*entry) * entries;
162 mem = malloc(memsize);
171 index->src_buf = buf;
172 index->src_size = bufsize;
173 index->hash_mask = hmask;
174 memset(hash, 0, hsize * sizeof(*hash));
176 /* allocate an array to count hash entries */
177 hash_count = calloc(hsize, sizeof(*hash_count));
183 /* then populate the index */
185 for (data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
187 data -= RABIN_WINDOW) {
188 unsigned int val = 0;
189 for (i = 1; i <= RABIN_WINDOW; i++)
190 val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
191 if (val == prev_val) {
192 /* keep the lowest of consecutive identical blocks */
193 entry[-1].ptr = data + RABIN_WINDOW;
197 entry->ptr = data + RABIN_WINDOW;
199 entry->next = hash[i];
206 * Determine a limit on the number of entries in the same hash
207 * bucket. This guard us against patological data sets causing
208 * really bad hash distribution with most entries in the same hash
209 * bucket that would bring us to O(m*n) computing costs (m and n
210 * corresponding to reference and target buffer sizes).
212 * Make sure none of the hash buckets has more entries than
213 * we're willing to test. Otherwise we cull the entry list
214 * uniformly to still preserve a good repartition across
215 * the reference buffer.
217 for (i = 0; i < hsize; i++) {
218 if (hash_count[i] < HASH_LIMIT)
222 struct index_entry *keep = entry;
223 int skip = hash_count[i] / HASH_LIMIT / 2;
226 } while(--skip && entry);
235 void free_delta_index(struct delta_index *index)
241 * The maximum size for any opcode sequence, including the initial header
242 * plus rabin window plus biggest copy.
244 #define MAX_OP_SIZE (5 + 5 + 1 + RABIN_WINDOW + 7)
247 create_delta(const struct delta_index *index,
248 const void *trg_buf, unsigned long trg_size,
249 unsigned long *delta_size, unsigned long max_size)
251 unsigned int i, outpos, outsize, val;
253 const unsigned char *ref_data, *ref_top, *data, *top;
256 if (!trg_buf || !trg_size)
261 if (max_size && outsize >= max_size)
262 outsize = max_size + MAX_OP_SIZE + 1;
263 out = malloc(outsize);
267 /* store reference buffer size */
270 out[outpos++] = i | 0x80;
275 /* store target buffer size */
278 out[outpos++] = i | 0x80;
283 ref_data = index->src_buf;
284 ref_top = ref_data + index->src_size;
286 top = trg_buf + trg_size;
290 for (i = 0; i < RABIN_WINDOW && data < top; i++, data++) {
291 out[outpos++] = *data;
292 val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
297 unsigned int moff = 0, msize = 0;
298 struct index_entry *entry;
299 val ^= U[data[-RABIN_WINDOW]];
300 val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
301 i = val & index->hash_mask;
302 for (entry = index->hash[i]; entry; entry = entry->next) {
303 const unsigned char *ref = entry->ptr;
304 const unsigned char *src = data;
305 unsigned int ref_size = ref_top - ref;
306 if (entry->val != val)
308 if (ref_size > top - src)
309 ref_size = top - src;
310 if (ref_size > 0x10000)
312 if (ref_size <= msize)
314 while (ref_size-- && *src++ == *ref)
316 if (msize < ref - entry->ptr) {
317 /* this is our best match so far */
318 msize = ref - entry->ptr;
319 moff = entry->ptr - ref_data;
326 out[outpos++] = *data++;
328 if (inscnt == 0x7f) {
329 out[outpos - inscnt - 1] = inscnt;
335 if (msize >= RABIN_WINDOW) {
336 const unsigned char *sk;
337 sk = data + msize - RABIN_WINDOW;
339 for (i = 0; i < RABIN_WINDOW; i++)
340 val = ((val << 8) | *sk++) ^ T[val >> RABIN_SHIFT];
342 const unsigned char *sk = data + 1;
343 for (i = 1; i < msize; i++) {
344 val ^= U[sk[-RABIN_WINDOW]];
345 val = ((val << 8) | *sk++) ^ T[val >> RABIN_SHIFT];
350 while (moff && ref_data[moff-1] == data[-1]) {
351 if (msize == 0x10000)
353 /* we can match one byte back */
360 outpos--; /* remove count slot */
361 inscnt--; /* make it -1 */
364 out[outpos - inscnt - 1] = inscnt;
372 if (moff & 0xff) { out[outpos++] = moff; i |= 0x01; }
374 if (moff & 0xff) { out[outpos++] = moff; i |= 0x02; }
376 if (moff & 0xff) { out[outpos++] = moff; i |= 0x04; }
378 if (moff & 0xff) { out[outpos++] = moff; i |= 0x08; }
380 if (msize & 0xff) { out[outpos++] = msize; i |= 0x10; }
382 if (msize & 0xff) { out[outpos++] = msize; i |= 0x20; }
387 if (outpos >= outsize - MAX_OP_SIZE) {
389 outsize = outsize * 3 / 2;
390 if (max_size && outsize >= max_size)
391 outsize = max_size + MAX_OP_SIZE + 1;
392 if (max_size && outpos > max_size)
394 out = realloc(out, outsize);
403 out[outpos - inscnt - 1] = inscnt;
405 if (max_size && outpos > max_size) {
410 *delta_size = outpos;