replace adler32 with Rabin's polynomial in diff-delta
[git.git] / diff-delta.c
1 /*
2  * diff-delta.c: generate a delta between two buffers
3  *
4  *  Many parts of this file have been lifted from LibXDiff version 0.10.
5  *  http://www.xmailserver.org/xdiff-lib.html
6  *
7  *  LibXDiff was written by Davide Libenzi <davidel@xmailserver.org>
8  *  Copyright (C) 2003  Davide Libenzi
9  *
10  *  Many mods for GIT usage by Nicolas Pitre <nico@cam.org>, (C) 2005.
11  *
12  *  This file is free software; you can redistribute it and/or
13  *  modify it under the terms of the GNU Lesser General Public
14  *  License as published by the Free Software Foundation; either
15  *  version 2.1 of the License, or (at your option) any later version.
16  *
17  *  Use of this within git automatically means that the LGPL
18  *  licensing gets turned into GPLv2 within this project.
19  */
20
21 #include <stdlib.h>
22 #include <string.h>
23 #include "delta.h"
24
25
26 /* maximum hash entry list for the same hash bucket */
27 #define HASH_LIMIT 64
28
29 #define RABIN_SHIFT 23
30 #define RABIN_WINDOW 16
31
32 static const unsigned int T[256] = {
33         0x00000000, 0xab59b4d1, 0x56b369a2, 0xfdeadd73, 0x063f6795, 0xad66d344,
34         0x508c0e37, 0xfbd5bae6, 0x0c7ecf2a, 0xa7277bfb, 0x5acda688, 0xf1941259,
35         0x0a41a8bf, 0xa1181c6e, 0x5cf2c11d, 0xf7ab75cc, 0x18fd9e54, 0xb3a42a85,
36         0x4e4ef7f6, 0xe5174327, 0x1ec2f9c1, 0xb59b4d10, 0x48719063, 0xe32824b2,
37         0x1483517e, 0xbfdae5af, 0x423038dc, 0xe9698c0d, 0x12bc36eb, 0xb9e5823a,
38         0x440f5f49, 0xef56eb98, 0x31fb3ca8, 0x9aa28879, 0x6748550a, 0xcc11e1db,
39         0x37c45b3d, 0x9c9defec, 0x6177329f, 0xca2e864e, 0x3d85f382, 0x96dc4753,
40         0x6b369a20, 0xc06f2ef1, 0x3bba9417, 0x90e320c6, 0x6d09fdb5, 0xc6504964,
41         0x2906a2fc, 0x825f162d, 0x7fb5cb5e, 0xd4ec7f8f, 0x2f39c569, 0x846071b8,
42         0x798aaccb, 0xd2d3181a, 0x25786dd6, 0x8e21d907, 0x73cb0474, 0xd892b0a5,
43         0x23470a43, 0x881ebe92, 0x75f463e1, 0xdeadd730, 0x63f67950, 0xc8afcd81,
44         0x354510f2, 0x9e1ca423, 0x65c91ec5, 0xce90aa14, 0x337a7767, 0x9823c3b6,
45         0x6f88b67a, 0xc4d102ab, 0x393bdfd8, 0x92626b09, 0x69b7d1ef, 0xc2ee653e,
46         0x3f04b84d, 0x945d0c9c, 0x7b0be704, 0xd05253d5, 0x2db88ea6, 0x86e13a77,
47         0x7d348091, 0xd66d3440, 0x2b87e933, 0x80de5de2, 0x7775282e, 0xdc2c9cff,
48         0x21c6418c, 0x8a9ff55d, 0x714a4fbb, 0xda13fb6a, 0x27f92619, 0x8ca092c8,
49         0x520d45f8, 0xf954f129, 0x04be2c5a, 0xafe7988b, 0x5432226d, 0xff6b96bc,
50         0x02814bcf, 0xa9d8ff1e, 0x5e738ad2, 0xf52a3e03, 0x08c0e370, 0xa39957a1,
51         0x584ced47, 0xf3155996, 0x0eff84e5, 0xa5a63034, 0x4af0dbac, 0xe1a96f7d,
52         0x1c43b20e, 0xb71a06df, 0x4ccfbc39, 0xe79608e8, 0x1a7cd59b, 0xb125614a,
53         0x468e1486, 0xedd7a057, 0x103d7d24, 0xbb64c9f5, 0x40b17313, 0xebe8c7c2,
54         0x16021ab1, 0xbd5bae60, 0x6cb54671, 0xc7ecf2a0, 0x3a062fd3, 0x915f9b02,
55         0x6a8a21e4, 0xc1d39535, 0x3c394846, 0x9760fc97, 0x60cb895b, 0xcb923d8a,
56         0x3678e0f9, 0x9d215428, 0x66f4eece, 0xcdad5a1f, 0x3047876c, 0x9b1e33bd,
57         0x7448d825, 0xdf116cf4, 0x22fbb187, 0x89a20556, 0x7277bfb0, 0xd92e0b61,
58         0x24c4d612, 0x8f9d62c3, 0x7836170f, 0xd36fa3de, 0x2e857ead, 0x85dcca7c,
59         0x7e09709a, 0xd550c44b, 0x28ba1938, 0x83e3ade9, 0x5d4e7ad9, 0xf617ce08,
60         0x0bfd137b, 0xa0a4a7aa, 0x5b711d4c, 0xf028a99d, 0x0dc274ee, 0xa69bc03f,
61         0x5130b5f3, 0xfa690122, 0x0783dc51, 0xacda6880, 0x570fd266, 0xfc5666b7,
62         0x01bcbbc4, 0xaae50f15, 0x45b3e48d, 0xeeea505c, 0x13008d2f, 0xb85939fe,
63         0x438c8318, 0xe8d537c9, 0x153feaba, 0xbe665e6b, 0x49cd2ba7, 0xe2949f76,
64         0x1f7e4205, 0xb427f6d4, 0x4ff24c32, 0xe4abf8e3, 0x19412590, 0xb2189141,
65         0x0f433f21, 0xa41a8bf0, 0x59f05683, 0xf2a9e252, 0x097c58b4, 0xa225ec65,
66         0x5fcf3116, 0xf49685c7, 0x033df00b, 0xa86444da, 0x558e99a9, 0xfed72d78,
67         0x0502979e, 0xae5b234f, 0x53b1fe3c, 0xf8e84aed, 0x17bea175, 0xbce715a4,
68         0x410dc8d7, 0xea547c06, 0x1181c6e0, 0xbad87231, 0x4732af42, 0xec6b1b93,
69         0x1bc06e5f, 0xb099da8e, 0x4d7307fd, 0xe62ab32c, 0x1dff09ca, 0xb6a6bd1b,
70         0x4b4c6068, 0xe015d4b9, 0x3eb80389, 0x95e1b758, 0x680b6a2b, 0xc352defa,
71         0x3887641c, 0x93ded0cd, 0x6e340dbe, 0xc56db96f, 0x32c6cca3, 0x999f7872,
72         0x6475a501, 0xcf2c11d0, 0x34f9ab36, 0x9fa01fe7, 0x624ac294, 0xc9137645,
73         0x26459ddd, 0x8d1c290c, 0x70f6f47f, 0xdbaf40ae, 0x207afa48, 0x8b234e99,
74         0x76c993ea, 0xdd90273b, 0x2a3b52f7, 0x8162e626, 0x7c883b55, 0xd7d18f84,
75         0x2c043562, 0x875d81b3, 0x7ab75cc0, 0xd1eee811
76 };
77
78 static const unsigned int U[256] = {
79         0x00000000, 0x7eb5200d, 0x5633f4cb, 0x2886d4c6, 0x073e5d47, 0x798b7d4a,
80         0x510da98c, 0x2fb88981, 0x0e7cba8e, 0x70c99a83, 0x584f4e45, 0x26fa6e48,
81         0x0942e7c9, 0x77f7c7c4, 0x5f711302, 0x21c4330f, 0x1cf9751c, 0x624c5511,
82         0x4aca81d7, 0x347fa1da, 0x1bc7285b, 0x65720856, 0x4df4dc90, 0x3341fc9d,
83         0x1285cf92, 0x6c30ef9f, 0x44b63b59, 0x3a031b54, 0x15bb92d5, 0x6b0eb2d8,
84         0x4388661e, 0x3d3d4613, 0x39f2ea38, 0x4747ca35, 0x6fc11ef3, 0x11743efe,
85         0x3eccb77f, 0x40799772, 0x68ff43b4, 0x164a63b9, 0x378e50b6, 0x493b70bb,
86         0x61bda47d, 0x1f088470, 0x30b00df1, 0x4e052dfc, 0x6683f93a, 0x1836d937,
87         0x250b9f24, 0x5bbebf29, 0x73386bef, 0x0d8d4be2, 0x2235c263, 0x5c80e26e,
88         0x740636a8, 0x0ab316a5, 0x2b7725aa, 0x55c205a7, 0x7d44d161, 0x03f1f16c,
89         0x2c4978ed, 0x52fc58e0, 0x7a7a8c26, 0x04cfac2b, 0x73e5d470, 0x0d50f47d,
90         0x25d620bb, 0x5b6300b6, 0x74db8937, 0x0a6ea93a, 0x22e87dfc, 0x5c5d5df1,
91         0x7d996efe, 0x032c4ef3, 0x2baa9a35, 0x551fba38, 0x7aa733b9, 0x041213b4,
92         0x2c94c772, 0x5221e77f, 0x6f1ca16c, 0x11a98161, 0x392f55a7, 0x479a75aa,
93         0x6822fc2b, 0x1697dc26, 0x3e1108e0, 0x40a428ed, 0x61601be2, 0x1fd53bef,
94         0x3753ef29, 0x49e6cf24, 0x665e46a5, 0x18eb66a8, 0x306db26e, 0x4ed89263,
95         0x4a173e48, 0x34a21e45, 0x1c24ca83, 0x6291ea8e, 0x4d29630f, 0x339c4302,
96         0x1b1a97c4, 0x65afb7c9, 0x446b84c6, 0x3adea4cb, 0x1258700d, 0x6ced5000,
97         0x4355d981, 0x3de0f98c, 0x15662d4a, 0x6bd30d47, 0x56ee4b54, 0x285b6b59,
98         0x00ddbf9f, 0x7e689f92, 0x51d01613, 0x2f65361e, 0x07e3e2d8, 0x7956c2d5,
99         0x5892f1da, 0x2627d1d7, 0x0ea10511, 0x7014251c, 0x5facac9d, 0x21198c90,
100         0x099f5856, 0x772a785b, 0x4c921c31, 0x32273c3c, 0x1aa1e8fa, 0x6414c8f7,
101         0x4bac4176, 0x3519617b, 0x1d9fb5bd, 0x632a95b0, 0x42eea6bf, 0x3c5b86b2,
102         0x14dd5274, 0x6a687279, 0x45d0fbf8, 0x3b65dbf5, 0x13e30f33, 0x6d562f3e,
103         0x506b692d, 0x2ede4920, 0x06589de6, 0x78edbdeb, 0x5755346a, 0x29e01467,
104         0x0166c0a1, 0x7fd3e0ac, 0x5e17d3a3, 0x20a2f3ae, 0x08242768, 0x76910765,
105         0x59298ee4, 0x279caee9, 0x0f1a7a2f, 0x71af5a22, 0x7560f609, 0x0bd5d604,
106         0x235302c2, 0x5de622cf, 0x725eab4e, 0x0ceb8b43, 0x246d5f85, 0x5ad87f88,
107         0x7b1c4c87, 0x05a96c8a, 0x2d2fb84c, 0x539a9841, 0x7c2211c0, 0x029731cd,
108         0x2a11e50b, 0x54a4c506, 0x69998315, 0x172ca318, 0x3faa77de, 0x411f57d3,
109         0x6ea7de52, 0x1012fe5f, 0x38942a99, 0x46210a94, 0x67e5399b, 0x19501996,
110         0x31d6cd50, 0x4f63ed5d, 0x60db64dc, 0x1e6e44d1, 0x36e89017, 0x485db01a,
111         0x3f77c841, 0x41c2e84c, 0x69443c8a, 0x17f11c87, 0x38499506, 0x46fcb50b,
112         0x6e7a61cd, 0x10cf41c0, 0x310b72cf, 0x4fbe52c2, 0x67388604, 0x198da609,
113         0x36352f88, 0x48800f85, 0x6006db43, 0x1eb3fb4e, 0x238ebd5d, 0x5d3b9d50,
114         0x75bd4996, 0x0b08699b, 0x24b0e01a, 0x5a05c017, 0x728314d1, 0x0c3634dc,
115         0x2df207d3, 0x534727de, 0x7bc1f318, 0x0574d315, 0x2acc5a94, 0x54797a99,
116         0x7cffae5f, 0x024a8e52, 0x06852279, 0x78300274, 0x50b6d6b2, 0x2e03f6bf,
117         0x01bb7f3e, 0x7f0e5f33, 0x57888bf5, 0x293dabf8, 0x08f998f7, 0x764cb8fa,
118         0x5eca6c3c, 0x207f4c31, 0x0fc7c5b0, 0x7172e5bd, 0x59f4317b, 0x27411176,
119         0x1a7c5765, 0x64c97768, 0x4c4fa3ae, 0x32fa83a3, 0x1d420a22, 0x63f72a2f,
120         0x4b71fee9, 0x35c4dee4, 0x1400edeb, 0x6ab5cde6, 0x42331920, 0x3c86392d,
121         0x133eb0ac, 0x6d8b90a1, 0x450d4467, 0x3bb8646a
122 };
123
124 struct index_entry {
125         const unsigned char *ptr;
126         unsigned int val;
127         struct index_entry *next;
128 };
129
130 struct delta_index {
131         const void *src_buf;
132         unsigned long src_size;
133         unsigned int hash_mask;
134         struct index_entry *hash[0];
135 };
136
137 struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
138 {
139         unsigned int i, hsize, hmask, entries, *hash_count;
140         const unsigned char *data, *buffer = buf;
141         struct delta_index *index;
142         struct index_entry *entry, **hash;
143         void *mem;
144
145         if (!buf || !bufsize)
146                 return NULL;
147
148         /* Determine index hash size.  Note that indexing skips the
149            first byte to allow for optimizing the rabin polynomial
150            initialization in create_delta(). */
151         entries = (bufsize - 1)  / RABIN_WINDOW;
152         hsize = entries / 4;
153         for (i = 4; (1 << i) < hsize && i < 31; i++);
154         hsize = 1 << i;
155         hmask = hsize - 1;
156
157         /* allocate lookup index */
158         mem = malloc(sizeof(*index) +
159                      sizeof(*hash) * hsize +
160                      sizeof(*entry) * entries);
161         if (!mem)
162                 return NULL;
163         index = mem;
164         mem = index + 1;
165         hash = mem;
166         mem = hash + hsize;
167         entry = mem;
168
169         index->src_buf = buf;
170         index->src_size = bufsize;
171         index->hash_mask = hmask;
172         memset(hash, 0, hsize * sizeof(*hash));
173
174         /* allocate an array to count hash entries */
175         hash_count = calloc(hsize, sizeof(*hash_count));
176         if (!hash_count) {
177                 free(index);
178                 return NULL;
179         }
180
181         /* then populate the index */
182         data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
183         while (data >= buffer) {
184                 unsigned int val = 0;
185                 for (i = 1; i <= RABIN_WINDOW; i++)
186                         val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
187                 i = val & hmask;
188                 entry->ptr = data + RABIN_WINDOW;
189                 entry->val = val;
190                 entry->next = hash[i];
191                 hash[i] = entry++;
192                 hash_count[i]++;
193                 data -= RABIN_WINDOW;
194         }
195
196         /*
197          * Determine a limit on the number of entries in the same hash
198          * bucket.  This guard us against patological data sets causing
199          * really bad hash distribution with most entries in the same hash
200          * bucket that would bring us to O(m*n) computing costs (m and n
201          * corresponding to reference and target buffer sizes).
202          *
203          * Make sure none of the hash buckets has more entries than
204          * we're willing to test.  Otherwise we cull the entry list
205          * uniformly to still preserve a good repartition across
206          * the reference buffer.
207          */
208         for (i = 0; i < hsize; i++) {
209                 if (hash_count[i] < HASH_LIMIT)
210                         continue;
211                 entry = hash[i];
212                 do {
213                         struct index_entry *keep = entry;
214                         int skip = hash_count[i] / HASH_LIMIT / 2;
215                         do {
216                                 entry = entry->next;
217                         } while(--skip && entry);
218                         keep->next = entry;
219                 } while(entry);
220         }
221         free(hash_count);
222
223         return index;
224 }
225
226 void free_delta_index(struct delta_index *index)
227 {
228         free(index);
229 }
230
231 /*
232  * The maximum size for any opcode sequence, including the initial header
233  * plus rabin window plus biggest copy.
234  */
235 #define MAX_OP_SIZE     (5 + 5 + 1 + RABIN_WINDOW + 7)
236
237 void *
238 create_delta(const struct delta_index *index,
239              const void *trg_buf, unsigned long trg_size,
240              unsigned long *delta_size, unsigned long max_size)
241 {
242         unsigned int i, outpos, outsize, hash_mask, val;
243         int inscnt;
244         const unsigned char *ref_data, *ref_top, *data, *top;
245         unsigned char *out;
246
247         if (!trg_buf || !trg_size)
248                 return NULL;
249
250         outpos = 0;
251         outsize = 8192;
252         if (max_size && outsize >= max_size)
253                 outsize = max_size + MAX_OP_SIZE + 1;
254         out = malloc(outsize);
255         if (!out)
256                 return NULL;
257
258         /* store reference buffer size */
259         i = index->src_size;
260         while (i >= 0x80) {
261                 out[outpos++] = i | 0x80;
262                 i >>= 7;
263         }
264         out[outpos++] = i;
265
266         /* store target buffer size */
267         i = trg_size;
268         while (i >= 0x80) {
269                 out[outpos++] = i | 0x80;
270                 i >>= 7;
271         }
272         out[outpos++] = i;
273
274         ref_data = index->src_buf;
275         ref_top = ref_data + index->src_size;
276         data = trg_buf;
277         top = trg_buf + trg_size;
278         hash_mask = index->hash_mask;
279
280         outpos++;
281         val = 0;
282         for (i = 0; i < RABIN_WINDOW && data < top; i++, data++) {
283                 out[outpos++] = *data;
284                 val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
285         }
286         inscnt = i;
287
288         while (data < top) {
289                 unsigned int moff = 0, msize = 0;
290                 struct index_entry *entry;
291                 val ^= U[data[-RABIN_WINDOW]];
292                 val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
293                 i = val & hash_mask;
294                 for (entry = index->hash[i]; entry; entry = entry->next) {
295                         const unsigned char *ref = entry->ptr;
296                         const unsigned char *src = data;
297                         unsigned int ref_size = ref_top - ref;
298                         if (entry->val != val)
299                                 continue;
300                         if (ref_size > top - src)
301                                 ref_size = top - src;
302                         if (ref_size > 0x10000)
303                                 ref_size = 0x10000;
304                         if (ref_size <= msize)
305                                 break;
306                         while (ref_size-- && *src++ == *ref)
307                                 ref++;
308                         if (msize < ref - entry->ptr) {
309                                 /* this is our best match so far */
310                                 msize = ref - entry->ptr;
311                                 moff = entry->ptr - ref_data;
312                         }
313                 }
314
315                 if (msize < 4) {
316                         if (!inscnt)
317                                 outpos++;
318                         out[outpos++] = *data++;
319                         inscnt++;
320                         if (inscnt == 0x7f) {
321                                 out[outpos - inscnt - 1] = inscnt;
322                                 inscnt = 0;
323                         }
324                 } else {
325                         unsigned char *op;
326
327                         if (msize >= RABIN_WINDOW) {
328                                 const unsigned char *sk;
329                                 sk = data + msize - RABIN_WINDOW;
330                                 val = 0;
331                                 for (i = 0; i < RABIN_WINDOW; i++)
332                                         val = ((val << 8) | *sk++) ^ T[val >> RABIN_SHIFT];
333                         } else {
334                                 const unsigned char *sk = data + 1;
335                                 for (i = 1; i < msize; i++) {
336                                         val ^= U[sk[-RABIN_WINDOW]];
337                                         val = ((val << 8) | *sk++) ^ T[val >> RABIN_SHIFT];
338                                 }
339                         }
340
341                         if (inscnt) {
342                                 while (moff && ref_data[moff-1] == data[-1]) {
343                                         if (msize == 0x10000)
344                                                 break;
345                                         /* we can match one byte back */
346                                         msize++;
347                                         moff--;
348                                         data--;
349                                         outpos--;
350                                         if (--inscnt)
351                                                 continue;
352                                         outpos--;  /* remove count slot */
353                                         inscnt--;  /* make it -1 */
354                                         break;
355                                 }
356                                 out[outpos - inscnt - 1] = inscnt;
357                                 inscnt = 0;
358                         }
359
360                         data += msize;
361                         op = out + outpos++;
362                         i = 0x80;
363
364                         if (moff & 0xff) { out[outpos++] = moff; i |= 0x01; }
365                         moff >>= 8;
366                         if (moff & 0xff) { out[outpos++] = moff; i |= 0x02; }
367                         moff >>= 8;
368                         if (moff & 0xff) { out[outpos++] = moff; i |= 0x04; }
369                         moff >>= 8;
370                         if (moff & 0xff) { out[outpos++] = moff; i |= 0x08; }
371
372                         if (msize & 0xff) { out[outpos++] = msize; i |= 0x10; }
373                         msize >>= 8;
374                         if (msize & 0xff) { out[outpos++] = msize; i |= 0x20; }
375
376                         *op = i;
377                 }
378
379                 if (outpos >= outsize - MAX_OP_SIZE) {
380                         void *tmp = out;
381                         outsize = outsize * 3 / 2;
382                         if (max_size && outsize >= max_size)
383                                 outsize = max_size + MAX_OP_SIZE + 1;
384                         if (max_size && outpos > max_size)
385                                 break;
386                         out = realloc(out, outsize);
387                         if (!out) {
388                                 free(tmp);
389                                 return NULL;
390                         }
391                 }
392         }
393
394         if (inscnt)
395                 out[outpos - inscnt - 1] = inscnt;
396
397         if (max_size && outpos > max_size) {
398                 free(out);
399                 return NULL;
400         }
401
402         *delta_size = outpos;
403         return out;
404 }