Cauterize dropped or duplicate bits from next.
[git.git] / diffcore-rename.c
1 /*
2  * Copyright (C) 2005 Junio C Hamano
3  */
4 #include "cache.h"
5 #include "diff.h"
6 #include "diffcore.h"
7
8 /* Table of rename/copy destinations */
9
10 static struct diff_rename_dst {
11         struct diff_filespec *two;
12         struct diff_filepair *pair;
13 } *rename_dst;
14 static int rename_dst_nr, rename_dst_alloc;
15
16 static struct diff_rename_dst *locate_rename_dst(struct diff_filespec *two,
17                                                  int insert_ok)
18 {
19         int first, last;
20
21         first = 0;
22         last = rename_dst_nr;
23         while (last > first) {
24                 int next = (last + first) >> 1;
25                 struct diff_rename_dst *dst = &(rename_dst[next]);
26                 int cmp = strcmp(two->path, dst->two->path);
27                 if (!cmp)
28                         return dst;
29                 if (cmp < 0) {
30                         last = next;
31                         continue;
32                 }
33                 first = next+1;
34         }
35         /* not found */
36         if (!insert_ok)
37                 return NULL;
38         /* insert to make it at "first" */
39         if (rename_dst_alloc <= rename_dst_nr) {
40                 rename_dst_alloc = alloc_nr(rename_dst_alloc);
41                 rename_dst = xrealloc(rename_dst,
42                                       rename_dst_alloc * sizeof(*rename_dst));
43         }
44         rename_dst_nr++;
45         if (first < rename_dst_nr)
46                 memmove(rename_dst + first + 1, rename_dst + first,
47                         (rename_dst_nr - first - 1) * sizeof(*rename_dst));
48         rename_dst[first].two = alloc_filespec(two->path);
49         fill_filespec(rename_dst[first].two, two->sha1, two->mode);
50         rename_dst[first].pair = NULL;
51         return &(rename_dst[first]);
52 }
53
54 /* Table of rename/copy src files */
55 static struct diff_rename_src {
56         struct diff_filespec *one;
57         unsigned src_path_left : 1;
58 } *rename_src;
59 static int rename_src_nr, rename_src_alloc;
60
61 static struct diff_rename_src *register_rename_src(struct diff_filespec *one,
62                                                    int src_path_left)
63 {
64         int first, last;
65
66         first = 0;
67         last = rename_src_nr;
68         while (last > first) {
69                 int next = (last + first) >> 1;
70                 struct diff_rename_src *src = &(rename_src[next]);
71                 int cmp = strcmp(one->path, src->one->path);
72                 if (!cmp)
73                         return src;
74                 if (cmp < 0) {
75                         last = next;
76                         continue;
77                 }
78                 first = next+1;
79         }
80
81         /* insert to make it at "first" */
82         if (rename_src_alloc <= rename_src_nr) {
83                 rename_src_alloc = alloc_nr(rename_src_alloc);
84                 rename_src = xrealloc(rename_src,
85                                       rename_src_alloc * sizeof(*rename_src));
86         }
87         rename_src_nr++;
88         if (first < rename_src_nr)
89                 memmove(rename_src + first + 1, rename_src + first,
90                         (rename_src_nr - first - 1) * sizeof(*rename_src));
91         rename_src[first].one = one;
92         rename_src[first].src_path_left = src_path_left;
93         return &(rename_src[first]);
94 }
95
96 static int is_exact_match(struct diff_filespec *src, struct diff_filespec *dst)
97 {
98         if (src->sha1_valid && dst->sha1_valid &&
99             !memcmp(src->sha1, dst->sha1, 20))
100                 return 1;
101         if (diff_populate_filespec(src, 1) || diff_populate_filespec(dst, 1))
102                 return 0;
103         if (src->size != dst->size)
104                 return 0;
105         if (diff_populate_filespec(src, 0) || diff_populate_filespec(dst, 0))
106                 return 0;
107         if (src->size == dst->size &&
108             !memcmp(src->data, dst->data, src->size))
109                 return 1;
110         return 0;
111 }
112
113 struct diff_score {
114         int src; /* index in rename_src */
115         int dst; /* index in rename_dst */
116         int score;
117 };
118
119 static int estimate_similarity(struct diff_filespec *src,
120                                struct diff_filespec *dst,
121                                int minimum_score)
122 {
123         /* src points at a file that existed in the original tree (or
124          * optionally a file in the destination tree) and dst points
125          * at a newly created file.  They may be quite similar, in which
126          * case we want to say src is renamed to dst or src is copied into
127          * dst, and then some edit has been applied to dst.
128          *
129          * Compare them and return how similar they are, representing
130          * the score as an integer between 0 and MAX_SCORE.
131          *
132          * When there is an exact match, it is considered a better
133          * match than anything else; the destination does not even
134          * call into this function in that case.
135          */
136         unsigned long delta_size, base_size, src_copied, literal_added;
137         unsigned long delta_limit;
138         int score;
139
140         /* We deal only with regular files.  Symlink renames are handled
141          * only when they are exact matches --- in other words, no edits
142          * after renaming.
143          */
144         if (!S_ISREG(src->mode) || !S_ISREG(dst->mode))
145                 return 0;
146
147         delta_size = ((src->size < dst->size) ?
148                       (dst->size - src->size) : (src->size - dst->size));
149         base_size = ((src->size < dst->size) ? src->size : dst->size);
150
151         /* We would not consider edits that change the file size so
152          * drastically.  delta_size must be smaller than
153          * (MAX_SCORE-minimum_score)/MAX_SCORE * min(src->size, dst->size).
154          *
155          * Note that base_size == 0 case is handled here already
156          * and the final score computation below would not have a
157          * divide-by-zero issue.
158          */
159         if (base_size * (MAX_SCORE-minimum_score) < delta_size * MAX_SCORE)
160                 return 0;
161
162         if (diff_populate_filespec(src, 0) || diff_populate_filespec(dst, 0))
163                 return 0; /* error but caught downstream */
164
165
166         delta_limit = base_size * (MAX_SCORE-minimum_score) / MAX_SCORE;
167         if (diffcore_count_changes(src->data, src->size,
168                                    dst->data, dst->size,
169                                    delta_limit,
170                                    &src_copied, &literal_added))
171                 return 0;
172
173         /* Extent of damage */
174         if (src->size + literal_added < src_copied)
175                 delta_size = 0;
176         else
177                 delta_size = (src->size - src_copied) + literal_added;
178
179         /*
180          * Now we will give some score to it.  100% edit gets 0 points
181          * and 0% edit gets MAX_SCORE points.
182          */
183         score = MAX_SCORE - (MAX_SCORE * delta_size / base_size); 
184         if (score < 0) return 0;
185         if (MAX_SCORE < score) return MAX_SCORE;
186         return score;
187 }
188
189 static void record_rename_pair(int dst_index, int src_index, int score)
190 {
191         struct diff_filespec *one, *two, *src, *dst;
192         struct diff_filepair *dp;
193
194         if (rename_dst[dst_index].pair)
195                 die("internal error: dst already matched.");
196
197         src = rename_src[src_index].one;
198         one = alloc_filespec(src->path);
199         fill_filespec(one, src->sha1, src->mode);
200
201         dst = rename_dst[dst_index].two;
202         two = alloc_filespec(dst->path);
203         fill_filespec(two, dst->sha1, dst->mode);
204
205         dp = diff_queue(NULL, one, two);
206         dp->score = score;
207         dp->source_stays = rename_src[src_index].src_path_left;
208         rename_dst[dst_index].pair = dp;
209 }
210
211 /*
212  * We sort the rename similarity matrix with the score, in descending
213  * order (the most similar first).
214  */
215 static int score_compare(const void *a_, const void *b_)
216 {
217         const struct diff_score *a = a_, *b = b_;
218         return b->score - a->score;
219 }
220
221 static int compute_stays(struct diff_queue_struct *q,
222                          struct diff_filespec *one)
223 {
224         int i;
225         for (i = 0; i < q->nr; i++) {
226                 struct diff_filepair *p = q->queue[i];
227                 if (strcmp(one->path, p->two->path))
228                         continue;
229                 if (DIFF_PAIR_RENAME(p)) {
230                         return 0; /* something else is renamed into this */
231                 }
232         }
233         return 1;
234 }
235
236 void diffcore_rename(struct diff_options *options)
237 {
238         int detect_rename = options->detect_rename;
239         int minimum_score = options->rename_score;
240         int rename_limit = options->rename_limit;
241         struct diff_queue_struct *q = &diff_queued_diff;
242         struct diff_queue_struct outq;
243         struct diff_score *mx;
244         int i, j, rename_count;
245         int num_create, num_src, dst_cnt;
246
247         if (!minimum_score)
248                 minimum_score = DEFAULT_RENAME_SCORE;
249         rename_count = 0;
250
251         for (i = 0; i < q->nr; i++) {
252                 struct diff_filepair *p = q->queue[i];
253                 if (!DIFF_FILE_VALID(p->one))
254                         if (!DIFF_FILE_VALID(p->two))
255                                 continue; /* unmerged */
256                         else
257                                 locate_rename_dst(p->two, 1);
258                 else if (!DIFF_FILE_VALID(p->two)) {
259                         /* If the source is a broken "delete", and
260                          * they did not really want to get broken,
261                          * that means the source actually stays.
262                          */
263                         int stays = (p->broken_pair && !p->score);
264                         register_rename_src(p->one, stays);
265                 }
266                 else if (detect_rename == DIFF_DETECT_COPY)
267                         register_rename_src(p->one, 1);
268         }
269         if (rename_dst_nr == 0 || rename_src_nr == 0 ||
270             (0 < rename_limit && rename_limit < rename_dst_nr))
271                 goto cleanup; /* nothing to do */
272
273         /* We really want to cull the candidates list early
274          * with cheap tests in order to avoid doing deltas.
275          */
276         for (i = 0; i < rename_dst_nr; i++) {
277                 struct diff_filespec *two = rename_dst[i].two;
278                 for (j = 0; j < rename_src_nr; j++) {
279                         struct diff_filespec *one = rename_src[j].one;
280                         if (!is_exact_match(one, two))
281                                 continue;
282                         record_rename_pair(i, j, MAX_SCORE);
283                         rename_count++;
284                         break; /* we are done with this entry */
285                 }
286         }
287
288         /* Have we run out the created file pool?  If so we can avoid
289          * doing the delta matrix altogether.
290          */
291         if (rename_count == rename_dst_nr)
292                 goto cleanup;
293
294         if (minimum_score == MAX_SCORE)
295                 goto cleanup;
296
297         num_create = (rename_dst_nr - rename_count);
298         num_src = rename_src_nr;
299         mx = xmalloc(sizeof(*mx) * num_create * num_src);
300         for (dst_cnt = i = 0; i < rename_dst_nr; i++) {
301                 int base = dst_cnt * num_src;
302                 struct diff_filespec *two = rename_dst[i].two;
303                 if (rename_dst[i].pair)
304                         continue; /* dealt with exact match already. */
305                 for (j = 0; j < rename_src_nr; j++) {
306                         struct diff_filespec *one = rename_src[j].one;
307                         struct diff_score *m = &mx[base+j];
308                         m->src = j;
309                         m->dst = i;
310                         m->score = estimate_similarity(one, two,
311                                                        minimum_score);
312                 }
313                 dst_cnt++;
314         }
315         /* cost matrix sorted by most to least similar pair */
316         qsort(mx, num_create * num_src, sizeof(*mx), score_compare);
317         for (i = 0; i < num_create * num_src; i++) {
318                 struct diff_rename_dst *dst = &rename_dst[mx[i].dst];
319                 if (dst->pair)
320                         continue; /* already done, either exact or fuzzy. */
321                 if (mx[i].score < minimum_score)
322                         break; /* there is no more usable pair. */
323                 record_rename_pair(mx[i].dst, mx[i].src, mx[i].score);
324                 rename_count++;
325         }
326         free(mx);
327
328  cleanup:
329         /* At this point, we have found some renames and copies and they
330          * are recorded in rename_dst.  The original list is still in *q.
331          */
332         outq.queue = NULL;
333         outq.nr = outq.alloc = 0;
334         for (i = 0; i < q->nr; i++) {
335                 struct diff_filepair *p = q->queue[i];
336                 struct diff_filepair *pair_to_free = NULL;
337
338                 if (!DIFF_FILE_VALID(p->one) && DIFF_FILE_VALID(p->two)) {
339                         /*
340                          * Creation
341                          *
342                          * We would output this create record if it has
343                          * not been turned into a rename/copy already.
344                          */
345                         struct diff_rename_dst *dst =
346                                 locate_rename_dst(p->two, 0);
347                         if (dst && dst->pair) {
348                                 diff_q(&outq, dst->pair);
349                                 pair_to_free = p;
350                         }
351                         else
352                                 /* no matching rename/copy source, so
353                                  * record this as a creation.
354                                  */
355                                 diff_q(&outq, p);
356                 }
357                 else if (DIFF_FILE_VALID(p->one) && !DIFF_FILE_VALID(p->two)) {
358                         /*
359                          * Deletion
360                          *
361                          * We would output this delete record if:
362                          *
363                          * (1) this is a broken delete and the counterpart
364                          *     broken create remains in the output; or
365                          * (2) this is not a broken delete, and rename_dst
366                          *     does not have a rename/copy to move p->one->path
367                          *     out of existence.
368                          *
369                          * Otherwise, the counterpart broken create
370                          * has been turned into a rename-edit; or
371                          * delete did not have a matching create to
372                          * begin with.
373                          */
374                         if (DIFF_PAIR_BROKEN(p)) {
375                                 /* broken delete */
376                                 struct diff_rename_dst *dst =
377                                         locate_rename_dst(p->one, 0);
378                                 if (dst && dst->pair)
379                                         /* counterpart is now rename/copy */
380                                         pair_to_free = p;
381                         }
382                         else {
383                                 for (j = 0; j < rename_dst_nr; j++) {
384                                         if (!rename_dst[j].pair)
385                                                 continue;
386                                         if (strcmp(rename_dst[j].pair->
387                                                    one->path,
388                                                    p->one->path))
389                                                 continue;
390                                         break;
391                                 }
392                                 if (j < rename_dst_nr)
393                                         /* this path remains */
394                                         pair_to_free = p;
395                         }
396
397                         if (pair_to_free)
398                                 ;
399                         else
400                                 diff_q(&outq, p);
401                 }
402                 else if (!diff_unmodified_pair(p))
403                         /* all the usual ones need to be kept */
404                         diff_q(&outq, p);
405                 else
406                         /* no need to keep unmodified pairs */
407                         pair_to_free = p;
408
409                 if (pair_to_free)
410                         diff_free_filepair(pair_to_free);
411         }
412         diff_debug_queue("done copying original", &outq);
413
414         free(q->queue);
415         *q = outq;
416         diff_debug_queue("done collapsing", q);
417
418         /* We need to see which rename source really stays here;
419          * earlier we only checked if the path is left in the result,
420          * but even if a path remains in the result, if that is coming
421          * from copying something else on top of it, then the original
422          * source is lost and does not stay.
423          */
424         for (i = 0; i < q->nr; i++) {
425                 struct diff_filepair *p = q->queue[i];
426                 if (DIFF_PAIR_RENAME(p) && p->source_stays) {
427                         /* If one appears as the target of a rename-copy,
428                          * then mark p->source_stays = 0; otherwise
429                          * leave it as is.
430                          */
431                         p->source_stays = compute_stays(q, p->one);
432                 }
433         }
434
435         for (i = 0; i < rename_dst_nr; i++) {
436                 diff_free_filespec_data(rename_dst[i].two);
437                 free(rename_dst[i].two);
438         }
439
440         free(rename_dst);
441         rename_dst = NULL;
442         rename_dst_nr = rename_dst_alloc = 0;
443         free(rename_src);
444         rename_src = NULL;
445         rename_src_nr = rename_src_alloc = 0;
446         return;
447 }