Merge branch 'jc/name' into next
[git.git] / diffcore-rename.c
1 /*
2  * Copyright (C) 2005 Junio C Hamano
3  */
4 #include "cache.h"
5 #include "diff.h"
6 #include "diffcore.h"
7
8 /* Table of rename/copy destinations */
9
10 static struct diff_rename_dst {
11         struct diff_filespec *two;
12         struct diff_filepair *pair;
13 } *rename_dst;
14 static int rename_dst_nr, rename_dst_alloc;
15
16 static struct diff_rename_dst *locate_rename_dst(struct diff_filespec *two,
17                                                  int insert_ok)
18 {
19         int first, last;
20
21         first = 0;
22         last = rename_dst_nr;
23         while (last > first) {
24                 int next = (last + first) >> 1;
25                 struct diff_rename_dst *dst = &(rename_dst[next]);
26                 int cmp = strcmp(two->path, dst->two->path);
27                 if (!cmp)
28                         return dst;
29                 if (cmp < 0) {
30                         last = next;
31                         continue;
32                 }
33                 first = next+1;
34         }
35         /* not found */
36         if (!insert_ok)
37                 return NULL;
38         /* insert to make it at "first" */
39         if (rename_dst_alloc <= rename_dst_nr) {
40                 rename_dst_alloc = alloc_nr(rename_dst_alloc);
41                 rename_dst = xrealloc(rename_dst,
42                                       rename_dst_alloc * sizeof(*rename_dst));
43         }
44         rename_dst_nr++;
45         if (first < rename_dst_nr)
46                 memmove(rename_dst + first + 1, rename_dst + first,
47                         (rename_dst_nr - first - 1) * sizeof(*rename_dst));
48         rename_dst[first].two = alloc_filespec(two->path);
49         fill_filespec(rename_dst[first].two, two->sha1, two->mode);
50         rename_dst[first].pair = NULL;
51         return &(rename_dst[first]);
52 }
53
54 /* Table of rename/copy src files */
55 static struct diff_rename_src {
56         struct diff_filespec *one;
57         unsigned src_path_left : 1;
58 } *rename_src;
59 static int rename_src_nr, rename_src_alloc;
60
61 static struct diff_rename_src *register_rename_src(struct diff_filespec *one,
62                                                    int src_path_left)
63 {
64         int first, last;
65
66         first = 0;
67         last = rename_src_nr;
68         while (last > first) {
69                 int next = (last + first) >> 1;
70                 struct diff_rename_src *src = &(rename_src[next]);
71                 int cmp = strcmp(one->path, src->one->path);
72                 if (!cmp)
73                         return src;
74                 if (cmp < 0) {
75                         last = next;
76                         continue;
77                 }
78                 first = next+1;
79         }
80
81         /* insert to make it at "first" */
82         if (rename_src_alloc <= rename_src_nr) {
83                 rename_src_alloc = alloc_nr(rename_src_alloc);
84                 rename_src = xrealloc(rename_src,
85                                       rename_src_alloc * sizeof(*rename_src));
86         }
87         rename_src_nr++;
88         if (first < rename_src_nr)
89                 memmove(rename_src + first + 1, rename_src + first,
90                         (rename_src_nr - first - 1) * sizeof(*rename_src));
91         rename_src[first].one = one;
92         rename_src[first].src_path_left = src_path_left;
93         return &(rename_src[first]);
94 }
95
96 static int is_exact_match(struct diff_filespec *src, struct diff_filespec *dst)
97 {
98         if (src->sha1_valid && dst->sha1_valid &&
99             !memcmp(src->sha1, dst->sha1, 20))
100                 return 1;
101         if (diff_populate_filespec(src, 1) || diff_populate_filespec(dst, 1))
102                 return 0;
103         if (src->size != dst->size)
104                 return 0;
105         if (diff_populate_filespec(src, 0) || diff_populate_filespec(dst, 0))
106                 return 0;
107         if (src->size == dst->size &&
108             !memcmp(src->data, dst->data, src->size))
109                 return 1;
110         return 0;
111 }
112
113 struct diff_score {
114         int src; /* index in rename_src */
115         int dst; /* index in rename_dst */
116         int score;
117 };
118
119 static int estimate_similarity(struct diff_filespec *src,
120                                struct diff_filespec *dst,
121                                int minimum_score)
122 {
123         /* src points at a file that existed in the original tree (or
124          * optionally a file in the destination tree) and dst points
125          * at a newly created file.  They may be quite similar, in which
126          * case we want to say src is renamed to dst or src is copied into
127          * dst, and then some edit has been applied to dst.
128          *
129          * Compare them and return how similar they are, representing
130          * the score as an integer between 0 and MAX_SCORE.
131          *
132          * When there is an exact match, it is considered a better
133          * match than anything else; the destination does not even
134          * call into this function in that case.
135          */
136         unsigned long max_size, delta_size, base_size, src_copied, literal_added;
137         unsigned long delta_limit;
138         int score;
139
140         /* We deal only with regular files.  Symlink renames are handled
141          * only when they are exact matches --- in other words, no edits
142          * after renaming.
143          */
144         if (!S_ISREG(src->mode) || !S_ISREG(dst->mode))
145                 return 0;
146
147         max_size = ((src->size > dst->size) ? src->size : dst->size);
148         base_size = ((src->size < dst->size) ? src->size : dst->size);
149         delta_size = max_size - base_size;
150
151         /* We would not consider edits that change the file size so
152          * drastically.  delta_size must be smaller than
153          * (MAX_SCORE-minimum_score)/MAX_SCORE * min(src->size, dst->size).
154          *
155          * Note that base_size == 0 case is handled here already
156          * and the final score computation below would not have a
157          * divide-by-zero issue.
158          */
159         if (base_size * (MAX_SCORE-minimum_score) < delta_size * MAX_SCORE)
160                 return 0;
161
162         if (diff_populate_filespec(src, 0) || diff_populate_filespec(dst, 0))
163                 return 0; /* error but caught downstream */
164
165
166         delta_limit = base_size * (MAX_SCORE-minimum_score) / MAX_SCORE;
167         if (diffcore_count_changes(src->data, src->size,
168                                    dst->data, dst->size,
169                                    &src->cnt_data, &dst->cnt_data,
170                                    delta_limit,
171                                    &src_copied, &literal_added))
172                 return 0;
173
174         /* How similar are they?
175          * what percentage of material in dst are from source?
176          */
177         if (!dst->size)
178                 score = 0; /* should not happen */
179         else
180                 score = src_copied * MAX_SCORE / max_size;
181         return score;
182 }
183
184 static void record_rename_pair(int dst_index, int src_index, int score)
185 {
186         struct diff_filespec *one, *two, *src, *dst;
187         struct diff_filepair *dp;
188
189         if (rename_dst[dst_index].pair)
190                 die("internal error: dst already matched.");
191
192         src = rename_src[src_index].one;
193         one = alloc_filespec(src->path);
194         fill_filespec(one, src->sha1, src->mode);
195
196         dst = rename_dst[dst_index].two;
197         two = alloc_filespec(dst->path);
198         fill_filespec(two, dst->sha1, dst->mode);
199
200         dp = diff_queue(NULL, one, two);
201         dp->score = score;
202         dp->source_stays = rename_src[src_index].src_path_left;
203         rename_dst[dst_index].pair = dp;
204 }
205
206 /*
207  * We sort the rename similarity matrix with the score, in descending
208  * order (the most similar first).
209  */
210 static int score_compare(const void *a_, const void *b_)
211 {
212         const struct diff_score *a = a_, *b = b_;
213         return b->score - a->score;
214 }
215
216 static int compute_stays(struct diff_queue_struct *q,
217                          struct diff_filespec *one)
218 {
219         int i;
220         for (i = 0; i < q->nr; i++) {
221                 struct diff_filepair *p = q->queue[i];
222                 if (strcmp(one->path, p->two->path))
223                         continue;
224                 if (DIFF_PAIR_RENAME(p)) {
225                         return 0; /* something else is renamed into this */
226                 }
227         }
228         return 1;
229 }
230
231 void diffcore_rename(struct diff_options *options)
232 {
233         int detect_rename = options->detect_rename;
234         int minimum_score = options->rename_score;
235         int rename_limit = options->rename_limit;
236         struct diff_queue_struct *q = &diff_queued_diff;
237         struct diff_queue_struct outq;
238         struct diff_score *mx;
239         int i, j, rename_count;
240         int num_create, num_src, dst_cnt;
241
242         if (!minimum_score)
243                 minimum_score = DEFAULT_RENAME_SCORE;
244         rename_count = 0;
245
246         for (i = 0; i < q->nr; i++) {
247                 struct diff_filepair *p = q->queue[i];
248                 if (!DIFF_FILE_VALID(p->one))
249                         if (!DIFF_FILE_VALID(p->two))
250                                 continue; /* unmerged */
251                         else
252                                 locate_rename_dst(p->two, 1);
253                 else if (!DIFF_FILE_VALID(p->two)) {
254                         /* If the source is a broken "delete", and
255                          * they did not really want to get broken,
256                          * that means the source actually stays.
257                          */
258                         int stays = (p->broken_pair && !p->score);
259                         register_rename_src(p->one, stays);
260                 }
261                 else if (detect_rename == DIFF_DETECT_COPY)
262                         register_rename_src(p->one, 1);
263         }
264         if (rename_dst_nr == 0 || rename_src_nr == 0 ||
265             (0 < rename_limit && rename_limit < rename_dst_nr))
266                 goto cleanup; /* nothing to do */
267
268         /* We really want to cull the candidates list early
269          * with cheap tests in order to avoid doing deltas.
270          */
271         for (i = 0; i < rename_dst_nr; i++) {
272                 struct diff_filespec *two = rename_dst[i].two;
273                 for (j = 0; j < rename_src_nr; j++) {
274                         struct diff_filespec *one = rename_src[j].one;
275                         if (!is_exact_match(one, two))
276                                 continue;
277                         record_rename_pair(i, j, MAX_SCORE);
278                         rename_count++;
279                         break; /* we are done with this entry */
280                 }
281         }
282
283         /* Have we run out the created file pool?  If so we can avoid
284          * doing the delta matrix altogether.
285          */
286         if (rename_count == rename_dst_nr)
287                 goto cleanup;
288
289         if (minimum_score == MAX_SCORE)
290                 goto cleanup;
291
292         num_create = (rename_dst_nr - rename_count);
293         num_src = rename_src_nr;
294         mx = xmalloc(sizeof(*mx) * num_create * num_src);
295         for (dst_cnt = i = 0; i < rename_dst_nr; i++) {
296                 int base = dst_cnt * num_src;
297                 struct diff_filespec *two = rename_dst[i].two;
298                 if (rename_dst[i].pair)
299                         continue; /* dealt with exact match already. */
300                 for (j = 0; j < rename_src_nr; j++) {
301                         struct diff_filespec *one = rename_src[j].one;
302                         struct diff_score *m = &mx[base+j];
303                         m->src = j;
304                         m->dst = i;
305                         m->score = estimate_similarity(one, two,
306                                                        minimum_score);
307                 }
308                 /* We do not need the text anymore */
309                 diff_free_filespec_data(two);
310                 dst_cnt++;
311         }
312         /* cost matrix sorted by most to least similar pair */
313         qsort(mx, num_create * num_src, sizeof(*mx), score_compare);
314         for (i = 0; i < num_create * num_src; i++) {
315                 struct diff_rename_dst *dst = &rename_dst[mx[i].dst];
316                 if (dst->pair)
317                         continue; /* already done, either exact or fuzzy. */
318                 if (mx[i].score < minimum_score)
319                         break; /* there is no more usable pair. */
320                 record_rename_pair(mx[i].dst, mx[i].src, mx[i].score);
321                 rename_count++;
322         }
323         free(mx);
324
325  cleanup:
326         /* At this point, we have found some renames and copies and they
327          * are recorded in rename_dst.  The original list is still in *q.
328          */
329         outq.queue = NULL;
330         outq.nr = outq.alloc = 0;
331         for (i = 0; i < q->nr; i++) {
332                 struct diff_filepair *p = q->queue[i];
333                 struct diff_filepair *pair_to_free = NULL;
334
335                 if (!DIFF_FILE_VALID(p->one) && DIFF_FILE_VALID(p->two)) {
336                         /*
337                          * Creation
338                          *
339                          * We would output this create record if it has
340                          * not been turned into a rename/copy already.
341                          */
342                         struct diff_rename_dst *dst =
343                                 locate_rename_dst(p->two, 0);
344                         if (dst && dst->pair) {
345                                 diff_q(&outq, dst->pair);
346                                 pair_to_free = p;
347                         }
348                         else
349                                 /* no matching rename/copy source, so
350                                  * record this as a creation.
351                                  */
352                                 diff_q(&outq, p);
353                 }
354                 else if (DIFF_FILE_VALID(p->one) && !DIFF_FILE_VALID(p->two)) {
355                         /*
356                          * Deletion
357                          *
358                          * We would output this delete record if:
359                          *
360                          * (1) this is a broken delete and the counterpart
361                          *     broken create remains in the output; or
362                          * (2) this is not a broken delete, and rename_dst
363                          *     does not have a rename/copy to move p->one->path
364                          *     out of existence.
365                          *
366                          * Otherwise, the counterpart broken create
367                          * has been turned into a rename-edit; or
368                          * delete did not have a matching create to
369                          * begin with.
370                          */
371                         if (DIFF_PAIR_BROKEN(p)) {
372                                 /* broken delete */
373                                 struct diff_rename_dst *dst =
374                                         locate_rename_dst(p->one, 0);
375                                 if (dst && dst->pair)
376                                         /* counterpart is now rename/copy */
377                                         pair_to_free = p;
378                         }
379                         else {
380                                 for (j = 0; j < rename_dst_nr; j++) {
381                                         if (!rename_dst[j].pair)
382                                                 continue;
383                                         if (strcmp(rename_dst[j].pair->
384                                                    one->path,
385                                                    p->one->path))
386                                                 continue;
387                                         break;
388                                 }
389                                 if (j < rename_dst_nr)
390                                         /* this path remains */
391                                         pair_to_free = p;
392                         }
393
394                         if (pair_to_free)
395                                 ;
396                         else
397                                 diff_q(&outq, p);
398                 }
399                 else if (!diff_unmodified_pair(p))
400                         /* all the usual ones need to be kept */
401                         diff_q(&outq, p);
402                 else
403                         /* no need to keep unmodified pairs */
404                         pair_to_free = p;
405
406                 if (pair_to_free)
407                         diff_free_filepair(pair_to_free);
408         }
409         diff_debug_queue("done copying original", &outq);
410
411         free(q->queue);
412         *q = outq;
413         diff_debug_queue("done collapsing", q);
414
415         /* We need to see which rename source really stays here;
416          * earlier we only checked if the path is left in the result,
417          * but even if a path remains in the result, if that is coming
418          * from copying something else on top of it, then the original
419          * source is lost and does not stay.
420          */
421         for (i = 0; i < q->nr; i++) {
422                 struct diff_filepair *p = q->queue[i];
423                 if (DIFF_PAIR_RENAME(p) && p->source_stays) {
424                         /* If one appears as the target of a rename-copy,
425                          * then mark p->source_stays = 0; otherwise
426                          * leave it as is.
427                          */
428                         p->source_stays = compute_stays(q, p->one);
429                 }
430         }
431
432         for (i = 0; i < rename_dst_nr; i++) {
433                 diff_free_filespec_data(rename_dst[i].two);
434                 free(rename_dst[i].two);
435         }
436
437         free(rename_dst);
438         rename_dst = NULL;
439         rename_dst_nr = rename_dst_alloc = 0;
440         free(rename_src);
441         rename_src = NULL;
442         rename_src_nr = rename_src_alloc = 0;
443         return;
444 }