+	/*
+	 * Determine a limit on the number of entries in the same hash
+	 * bucket.  This guards us against pathological data sets causing
+	 * really bad hash distribution with most entries in the same hash
+	 * bucket, which would bring us to O(m*n) computing costs (m and n
+	 * corresponding to reference and target buffer sizes).
+	 *
+	 * The larger the target buffer, the more important it is to keep
+	 * the entry list for each hash bucket short.  With such a limit
+	 * the cost is bounded to something more like O(m+n).
+	 */
+	hlimit = (1 << 26) / trg_bufsize;
+	if (hlimit < 16)
+		hlimit = 16;
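+	/*
+	 * For instance (hypothetical sizes): a 1 MiB target buffer gives
+	 * hlimit = (1 << 26) / (1 << 20) = 64 entries per bucket, while
+	 * targets of 4 MiB and beyond bottom out at the floor of 16.
+	 */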
+
+	/*
+	 * Now make sure none of the hash buckets has more entries than
+	 * we're willing to test.  Otherwise we cull the entry list,
+	 * discarding entries with duplicate three-byte prefixes, so as to
+	 * still preserve a good distribution across the reference buffer.
+	 */
+	for (i = 0; i < hsize; i++) {
+		struct index **list, *bucket, *remaining;
+		int cnt;
+		if (hash_count[i] < hlimit)
+			continue;
+
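+		/*
+		 * Rebuild the list for this bucket: "bucket" accumulates the
+		 * entries we keep, "list" points at where the next kept entry
+		 * goes, and "remaining" walks the original (overlong) list.
+		 */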
+		bucket = NULL;
+		list = &bucket;
+		remaining = hash[i];
+		cnt = 0;
+		while (cnt < hlimit && remaining) {
+			struct index *this = remaining, *that;
+			remaining = remaining->next;
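+			/*
+			 * Keep this entry only if no already-kept entry
+			 * starts with the same three bytes.
+			 */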
+			for (that = bucket; that; that = that->next) {
+				if (!memcmp(that->ptr, this->ptr, 3))
+					break;
+			}
+			if (that)
+				continue; /* discard */
+			cnt++;
+			*list = this;
+			list = &(this->next);
+			this->next = NULL;
+		}
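+		/*
+		 * Install the culled list; whatever is still on "remaining"
+		 * once hlimit entries have been kept is simply dropped.
+		 */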
+		hash[i] = bucket;
+	}
+	free(hash_count);
+
+	return hash;