- /*
- * Determine a limit on the number of entries in the same hash
- * bucket. This guards us against pathological data sets causing
- * really bad hash distribution, with most entries in the same hash
- * bucket, which would bring us to O(m*n) computing cost (m and n
- * corresponding to reference and target buffer sizes).
- *
- * The larger the target buffer, the more important it is to keep
- * the entry list for each hash bucket small. With such a limit
- * the cost is bounded at something closer to O(m+n).
- */
- hlimit = (1 << 26) / trg_bufsize;
- if (hlimit < 16)
- hlimit = 16;
-
- /*
- * Now make sure none of the hash buckets has more entries than
- * we're willing to test. Otherwise we cull the entry list at
- * uniform intervals to still preserve a good distribution across
- * the reference buffer.
- */
- for (i = 0; i < hsize; i++) {
- if (hash_count[i] < hlimit)
- continue;
- entry = hash[i];
- do {
- struct index *keep = entry;
- int skip = hash_count[i] / hlimit / 2;
- do {
- entry = entry->next;
- } while(--skip && entry);
- keep->next = entry;
- } while(entry);
- }
- free(hash_count);
-
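For reference, here is a standalone sketch of the technique the removed comments describe: derive a per-bucket limit from the target buffer size, then thin any over-long bucket chain at uniform intervals. The identifier names (struct index, hash, hash_count, hsize, trg_bufsize, hlimit) only mirror the removed code; this is an illustration under those assumptions, not the diff-delta.c implementation, and the thinning loop is simplified to keep roughly hlimit entries per bucket rather than reproducing the removed inner loop exactly. As a worked example, a 1 MiB target buffer gives hlimit = (1 << 26) / (1 << 20) = 64 entries per bucket.

/* Simplified bucket entry; the real diff-delta.c fields are omitted. */
struct index {
	struct index *next;
};

/*
 * Sketch: thin over-populated hash buckets so each chain stays near a
 * limit derived from the target buffer size.  Dropped entries are only
 * unlinked, assuming (as in the removed code) that they all live in one
 * block allocation and are freed together later.
 */
static void limit_bucket_lengths(struct index **hash,
				 unsigned int *hash_count,
				 unsigned long hsize,
				 unsigned long trg_bufsize)
{
	unsigned long hlimit, i;

	/*
	 * Bigger targets demand shorter chains: lookup work is roughly
	 * trg_bufsize times the average chain length, so cap the chain
	 * length at about (1 << 26) / trg_bufsize to bound that product.
	 */
	hlimit = (1 << 26) / trg_bufsize;
	if (hlimit < 16)
		hlimit = 16;

	for (i = 0; i < hsize; i++) {
		struct index *entry;
		unsigned long skip, j;

		if (hash_count[i] < hlimit)
			continue;

		/*
		 * Keep roughly one entry out of every "skip" so the
		 * survivors stay spread evenly over the reference buffer.
		 * skip >= 1 here since hash_count[i] >= hlimit; a skip of
		 * 1 simply leaves a marginally over-full bucket untouched.
		 */
		skip = hash_count[i] / hlimit;

		entry = hash[i];
		while (entry) {
			struct index *keep = entry;

			/* advance past the entries being dropped */
			for (j = 0; j < skip && entry; j++)
				entry = entry->next;
			keep->next = entry;
		}
	}
}

The design point behind the limit: each target byte probes one bucket, so the total matching cost is about trg_bufsize multiplied by the chain length walked per probe. Capping the chain at roughly (1 << 26) / trg_bufsize keeps that product near a constant (about 64M steps) on top of the linear indexing and scanning passes, which is what the removed comment means by bounding the cost at something closer to O(m+n).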