From 5d2b41d4c9161035baa9236452ac84c5186a6553 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Mon, 19 Jan 2026 16:20:40 -0500 Subject: [PATCH 1/3] saving. --- README.md | 56 ++++++++ .../org/fastfilter/xor/StringFilters.java | 131 ++++++++++++++++++ 2 files changed, 187 insertions(+) create mode 100644 fastfilter/src/test/java/org/fastfilter/xor/StringFilters.java diff --git a/README.md b/README.md index 50ca0fd..48809d0 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,62 @@ The following additional types are implemented, but less tested: ## Usage + +To use the XOR and Binary Fuse filters, first prepare an array of keys, then construct the filter: + +```java +import org.fastfilter.xor.Xor8; +import org.fastfilter.xor.Xor16; +import org.fastfilter.xor.XorBinaryFuse8; +import org.fastfilter.xor.XorBinaryFuse16; + +// Example keys +long[] keys = {1, 2, 3, 4, 5}; + +// Construct XOR filters +Xor8 xor8 = Xor8.construct(keys); +Xor16 xor16 = Xor16.construct(keys); +XorBinaryFuse8 xorBinaryFuse8 = XorBinaryFuse8.construct(keys); +XorBinaryFuse16 xorBinaryFuse16 = XorBinaryFuse16.construct(keys); + +// Check membership +boolean mightContain = xor8.mayContain(1L); // true +boolean mightContain2 = xor8.mayContain(6L); // false (with high probability) +``` + +All filters implement the `Filter` interface and support the `mayContain(long key)` method to check if a key might be in the set. Note that false positives are possible, but false negatives are not. 
+ + +### Serialization and Deserialization + +Filters can be serialized to and deserialized from a `ByteBuffer` for persistence or transmission: + +```java +import java.nio.ByteBuffer; + +// Assuming you have a constructed filter, e.g., Xor8 xor8 = Xor8.construct(keys); + +// Get the serialized size +int size = xor8.getSerializedSize(); + +// Allocate a ByteBuffer +ByteBuffer buffer = ByteBuffer.allocate(size); + +// Serialize the filter +xor8.serialize(buffer); + +// Prepare buffer for reading (flip) +buffer.flip(); + +// Deserialize the filter +Xor8 deserializedXor8 = Xor8.deserialize(buffer); + +// The deserialized filter behaves identically to the original +boolean result = deserializedXor8.mayContain(1L); // true +``` + +This allows saving filters to files, databases, or sending them over networks. + ### Maven When using Maven: The latest version, 1.0.4, is not yet available on Maven central, see [issue #48](https://github.com/FastFilter/fastfilter_java/issues/48). However, it is available at https://jitpack.io/: diff --git a/fastfilter/src/test/java/org/fastfilter/xor/StringFilters.java b/fastfilter/src/test/java/org/fastfilter/xor/StringFilters.java new file mode 100644 index 0000000..35f7324 --- /dev/null +++ b/fastfilter/src/test/java/org/fastfilter/xor/StringFilters.java @@ -0,0 +1,131 @@ +package org.fastfilter.xor; + +import static org.junit.Assert.assertTrue; + +import java.util.HashSet; +import java.util.Random; +import java.util.Set; + +import org.fastfilter.Filter; +import org.junit.Test; + +public class StringFilters { + + private static final int NUM_STRINGS = 100_000; + private static final int NUM_TEST_STRINGS = 1_000; + private static final Random random = new Random(42); + + private static final long[] keys = generateKeys(); + private static final long[] testKeys = generateTestKeys(); + + private static long[] generateKeys() { + String[] strings = new String[NUM_STRINGS]; + for (int i = 0; i < NUM_STRINGS; i++) { + strings[i] = 
generateRandomString(); + } + long[] k = new long[NUM_STRINGS]; + for (int i = 0; i < NUM_STRINGS; i++) { + k[i] = hashString(strings[i]); + } + checkUniqueness(k, "keys"); + return k; + } + + private static long[] generateTestKeys() { + String[] strings = new String[NUM_TEST_STRINGS]; + for (int i = 0; i < NUM_TEST_STRINGS; i++) { + strings[i] = generateRandomString(); + } + long[] k = new long[NUM_TEST_STRINGS]; + for (int i = 0; i < NUM_TEST_STRINGS; i++) { + k[i] = hashString(strings[i]); + } + checkUniqueness(k, "test keys"); + return k; + } + + private static void checkUniqueness(long[] array, String name) { + Set set = new HashSet<>(); + int collisions = 0; + for (long l : array) { + if (!set.add(l)) { + collisions++; + } + } + if (collisions > 0) { + System.out.println("Warning: " + collisions + " hash collisions in " + name); + } else { + System.out.println("No hash collisions in " + name); + } + } + + private static String generateRandomString() { + int length = 5 + random.nextInt(16); // 5 to 20 chars + StringBuilder sb = new StringBuilder(length); + for (int i = 0; i < length; i++) { + sb.append((char) ('a' + random.nextInt(26))); + } + return sb.toString(); + } + + private static long hashString(String s) { + long h = 0; + for (char c : s.toCharArray()) { + h = h * 31 + c; + } + return h; + } + + @Test + public void testXor8() { + testFilter(Xor8.class); + } + + @Test + public void testXor16() { + testFilter(Xor16.class); + } + + @Test + public void testXorBinaryFuse8() { + testFilter(XorBinaryFuse8.class); + } + + @Test + public void testXorBinaryFuse16() { + testFilter(XorBinaryFuse16.class); + } + + @Test + public void testXorBinaryFuse32() { + testFilter(XorBinaryFuse32.class); + } + + private void testFilter(Class filterClass) { + // Construct filter + Filter filter; + try { + filter = (Filter) filterClass.getMethod("construct", long[].class).invoke(null, (Object) keys); + } catch (Exception e) { + throw new RuntimeException(e); + } + + // Check 
all keys are in the filter + for (int i = 0; i < NUM_STRINGS; i++) { + assertTrue("Key " + i + " should be in filter", filter.mayContain(keys[i])); + } + + // Check false positives on test keys + int falsePositives = 0; + for (int i = 0; i < NUM_TEST_STRINGS; i++) { + if (filter.mayContain(testKeys[i])) { + falsePositives++; + } + } + + // Expect low false positive rate (less than 1% for most filters) + double fpp = (double) falsePositives / NUM_TEST_STRINGS; + System.out.println(filterClass.getSimpleName() + " false positive rate: " + fpp); + assertTrue("False positive rate should be low: " + fpp, fpp < 0.01); // Allow up to 1% + } +} From 55d5510107f02a90ee8fecbb610dca2b03ecfa19 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Mon, 19 Jan 2026 18:58:15 -0500 Subject: [PATCH 2/3] This makes XorBinaryFuse16 and XorBinaryFuse8 more robust --- README.md | 22 ++-- .../main/java/org/fastfilter/xor/Xor16.java | 3 + .../main/java/org/fastfilter/xor/Xor8.java | 4 + .../org/fastfilter/xor/XorBinaryFuse16.java | 105 ++++++++++-------- .../org/fastfilter/xor/XorBinaryFuse32.java | 3 +- .../org/fastfilter/xor/XorBinaryFuse8.java | 105 ++++++++++-------- 6 files changed, 138 insertions(+), 104 deletions(-) diff --git a/README.md b/README.md index 48809d0..3437b8a 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ The following additional types are implemented, but less tested: ## Reference -* Thomas Mueller Graf, Daniel Lemire, [Binary Fuse Filters: Fast and Smaller Than Xor Filters](http://arxiv.org/abs/2201.01174), Journal of Experimental Algorithmics 27, 2022. DOI: 10.1145/3510449 +* Thomas Mueller Graf, Daniel Lemire, [Binary Fuse Filters: Fast and Smaller Than Xor Filters](http://arxiv.org/abs/2201.01174), Journal of Experimental Algorithmics 27, 2022. DOI: 10.1145/3510449 * Thomas Mueller Graf, Daniel Lemire, [Xor Filters: Faster and Smaller Than Bloom and Cuckoo Filters](https://arxiv.org/abs/1912.08258), Journal of Experimental Algorithmics 25 (1), 2020. 
DOI: 10.1145/3376122 ## Usage @@ -31,17 +31,13 @@ The following additional types are implemented, but less tested: To use the XOR and Binary Fuse filters, first prepare an array of keys, then construct the filter: ```java -import org.fastfilter.xor.Xor8; -import org.fastfilter.xor.Xor16; import org.fastfilter.xor.XorBinaryFuse8; import org.fastfilter.xor.XorBinaryFuse16; // Example keys long[] keys = {1, 2, 3, 4, 5}; -// Construct XOR filters -Xor8 xor8 = Xor8.construct(keys); -Xor16 xor16 = Xor16.construct(keys); +// Construct binary fuse filters XorBinaryFuse8 xorBinaryFuse8 = XorBinaryFuse8.construct(keys); XorBinaryFuse16 xorBinaryFuse16 = XorBinaryFuse16.construct(keys); @@ -52,6 +48,11 @@ boolean mightContain2 = xor8.mayContain(6L); // false (with high probability) All filters implement the `Filter` interface and support the `mayContain(long key)` method to check if a key might be in the set. Note that false positives are possible, but false negatives are not. +### Generating the Hash Values + +The library is written to process `long` values that are meant to be hash values. Though you do not need to use +cryptographically strong hashing, you should make sure that your hash functions are reasonable: they should +not generate too many collisions (two objects mapping to the same `long` value). 
### Serialization and Deserialization @@ -60,25 +61,24 @@ Filters can be serialized to and deserialized from a `ByteBuffer` for persistenc ```java import java.nio.ByteBuffer; -// Assuming you have a constructed filter, e.g., Xor8 xor8 = Xor8.construct(keys); +// Assuming you have a constructed filter, e.g., XorBinaryFuse8 xorBinaryFuse8 = XorBinaryFuse8.construct(keys); // Get the serialized size -int size = xor8.getSerializedSize(); +int size = xorBinaryFuse8.getSerializedSize(); // Allocate a ByteBuffer ByteBuffer buffer = ByteBuffer.allocate(size); // Serialize the filter -xor8.serialize(buffer); +xorBinaryFuse8.serialize(buffer); // Prepare buffer for reading (flip) buffer.flip(); // Deserialize the filter -Xor8 deserializedXor8 = Xor8.deserialize(buffer); +XorBinaryFuse8 deserializedXorBinaryFuse8 = XorBinaryFuse8.deserialize(buffer); // The deserialized filter behaves identically to the original -boolean result = deserializedXor8.mayContain(1L); // true ``` This allows saving filters to files, databases, or sending them over networks. diff --git a/fastfilter/src/main/java/org/fastfilter/xor/Xor16.java b/fastfilter/src/main/java/org/fastfilter/xor/Xor16.java index 8cdc4d8..603a3f1 100644 --- a/fastfilter/src/main/java/org/fastfilter/xor/Xor16.java +++ b/fastfilter/src/main/java/org/fastfilter/xor/Xor16.java @@ -6,7 +6,10 @@ import org.fastfilter.utils.Hash; /** + * The Xor16 filter implementation is experimental. We recommend using XorBinaryFuse16 instead. Use at your own risk. + * * The xor filter, a new algorithm that can replace a Bloom filter. + * Thomas Mueller Graf, Daniel Lemire, [Xor Filters: Faster and Smaller Than Bloom and Cuckoo Filters](https://arxiv.org/abs/1912.08258), Journal of Experimental Algorithmics 25 (1), 2020. DOI: 10.1145/3376122 * * It needs 1.23 log(1/fpp) bits per key. It is related to the BDZ algorithm [1] * (a minimal perfect hash function algorithm). 
diff --git a/fastfilter/src/main/java/org/fastfilter/xor/Xor8.java b/fastfilter/src/main/java/org/fastfilter/xor/Xor8.java index bb3b5ff..a54b0d9 100644 --- a/fastfilter/src/main/java/org/fastfilter/xor/Xor8.java +++ b/fastfilter/src/main/java/org/fastfilter/xor/Xor8.java @@ -6,8 +6,12 @@ import org.fastfilter.Filter; import org.fastfilter.utils.Hash; + /** + * The Xor8 filter implementation is experimental. We recommend using XorBinaryFuse8 instead. Use at your own risk. + * * The xor filter, a new algorithm that can replace a Bloom filter. + * Thomas Mueller Graf, Daniel Lemire, [Xor Filters: Faster and Smaller Than Bloom and Cuckoo Filters](https://arxiv.org/abs/1912.08258), Journal of Experimental Algorithmics 25 (1), 2020. DOI: 10.1145/3376122 * * It needs 1.23 log(1/fpp) bits per key. It is related to the BDZ algorithm [1] * (a minimal perfect hash function algorithm). diff --git a/fastfilter/src/main/java/org/fastfilter/xor/XorBinaryFuse16.java b/fastfilter/src/main/java/org/fastfilter/xor/XorBinaryFuse16.java index e7332fd..b0ef76a 100644 --- a/fastfilter/src/main/java/org/fastfilter/xor/XorBinaryFuse16.java +++ b/fastfilter/src/main/java/org/fastfilter/xor/XorBinaryFuse16.java @@ -7,6 +7,7 @@ /** * The xor binary fuse filter, a new algorithm that can replace a Bloom filter. + * Thomas Mueller Graf, Daniel Lemire, [Binary Fuse Filters: Fast and Smaller Than Xor Filters](http://arxiv.org/abs/2201.01174), Journal of Experimental Algorithmics 27, 2022. DOI: 10.1145/3510449 */ public class XorBinaryFuse16 implements Filter { @@ -78,6 +79,15 @@ private static int mod3(int x) { return x; } + /** + * Constructs a new XorBinaryFuse16 filter from the given array of keys. + * The filter is designed to have a low false positive rate while being space-efficient. + * The keys array should contain unique values. The array may be mutated during construction + * (e.g., sorted and deduplicated) if the algorithm detects that there are likely too many duplicates. 
+ * + * @param keys the array of long keys to add to the filter + * @return a new XorBinaryFuse16 filter containing all the keys + */ public static XorBinaryFuse16 construct(long[] keys) { int size = keys.length; int segmentLength = calculateSegmentLength(ARITY, size); @@ -102,6 +112,7 @@ private void addAll(long[] keys) { long[] reverseOrder = new long[size + 1]; byte[] reverseH = new byte[size]; int reverseOrderPos = 0; + boolean duplicated = false; // the lowest 2 bits are the h index (0, 1, or 2) // so we only have 6 bits for counting; @@ -117,7 +128,6 @@ private void addAll(long[] keys) { blockBits++; } int block = 1 << blockBits; - mainloop: while (true) { reverseOrder[size] = 1; int[] startPos = new int[block]; @@ -126,7 +136,8 @@ private void addAll(long[] keys) { } // counting sort - for (long key : keys) { + for(int i = 0; i < size; i++) { + long key = keys[i]; long hash = Hash.hash64(key, seed); int segmentIndex = (int) (hash >>> (64 - blockBits)); // We only overwrite when the hash was zero. Zero hash values @@ -150,52 +161,48 @@ private void addAll(long[] keys) { } } startPos = null; - if (countMask < 0) { - // we have a possible counter overflow - continue mainloop; - } - - reverseOrderPos = 0; - int alonePos = 0; - for (int i = 0; i < arrayLength; i++) { - alone[alonePos] = i; - int inc = (t2count[i] >> 2) == 1 ? 1 : 0; - alonePos += inc; - } + if (countMask >= 0) { + reverseOrderPos = 0; + int alonePos = 0; + for (int i = 0; i < arrayLength; i++) { + alone[alonePos] = i; + int inc = (t2count[i] >> 2) == 1 ? 1 : 0; + alonePos += inc; + } - while (alonePos > 0) { - alonePos--; - int index = alone[alonePos]; - if ((t2count[index] >> 2) == 1) { - // It is still there! 
- long hash = t2hash[index]; - byte found = (byte) (t2count[index] & 3); - - reverseH[reverseOrderPos] = found; - reverseOrder[reverseOrderPos] = hash; - - h012[0] = getHashFromHash(hash, 0); - h012[1] = getHashFromHash(hash, 1); - h012[2] = getHashFromHash(hash, 2); - - int index3 = h012[mod3(found + 1)]; - alone[alonePos] = index3; - alonePos += ((t2count[index3] >> 2) == 2 ? 1 : 0); - t2count[index3] -= 4; - t2count[index3] ^= mod3(found + 1); - t2hash[index3] ^= hash; - - index3 = h012[mod3(found + 2)]; - alone[alonePos] = index3; - alonePos += ((t2count[index3] >> 2) == 2 ? 1 : 0); - t2count[index3] -= 4; - t2count[index3] ^= mod3(found + 2); - t2hash[index3] ^= hash; - - reverseOrderPos++; + while (alonePos > 0) { + alonePos--; + int index = alone[alonePos]; + if ((t2count[index] >> 2) == 1) { + // It is still there! + long hash = t2hash[index]; + byte found = (byte) (t2count[index] & 3); + + reverseH[reverseOrderPos] = found; + reverseOrder[reverseOrderPos] = hash; + + h012[0] = getHashFromHash(hash, 0); + h012[1] = getHashFromHash(hash, 1); + h012[2] = getHashFromHash(hash, 2); + + int index3 = h012[mod3(found + 1)]; + alone[alonePos] = index3; + alonePos += ((t2count[index3] >> 2) == 2 ? 1 : 0); + t2count[index3] -= 4; + t2count[index3] ^= mod3(found + 1); + t2hash[index3] ^= hash; + + index3 = h012[mod3(found + 2)]; + alone[alonePos] = index3; + alonePos += ((t2count[index3] >> 2) == 2 ? 1 : 0); + t2count[index3] -= 4; + t2count[index3] ^= mod3(found + 2); + t2hash[index3] ^= hash; + + reverseOrderPos++; + } } } - if (reverseOrderPos == size) { break; } @@ -203,7 +210,13 @@ private void addAll(long[] keys) { Arrays.fill(t2count, (byte) 0); Arrays.fill(t2hash, 0); Arrays.fill(reverseOrder, 0); - + // If we reach 10 passes, we assume that there are too many duplicates + // in the input key set. We then sort and remove duplicates in place. + // This should almost never happen. 
+ if (countMask < 0 && !duplicated) { + size = Deduplicator.sortAndRemoveDup(keys, size); + duplicated = true; + } if (hashIndex > 100) { // if construction doesn't succeed eventually, // then there is likely a problem with the hash function. diff --git a/fastfilter/src/main/java/org/fastfilter/xor/XorBinaryFuse32.java b/fastfilter/src/main/java/org/fastfilter/xor/XorBinaryFuse32.java index 29df2e5..fc1a01b 100644 --- a/fastfilter/src/main/java/org/fastfilter/xor/XorBinaryFuse32.java +++ b/fastfilter/src/main/java/org/fastfilter/xor/XorBinaryFuse32.java @@ -7,7 +7,8 @@ import org.fastfilter.utils.Hash; /** - * The xor binary fuse filter, a new algorithm that can replace a Bloom filter. + * The XorBinaryFuse32 filter is experimental. We recommend using XorBinaryFuse8 or XorBinaryFuse16 instead. + * Use at your own risk. */ public class XorBinaryFuse32 implements Filter { diff --git a/fastfilter/src/main/java/org/fastfilter/xor/XorBinaryFuse8.java b/fastfilter/src/main/java/org/fastfilter/xor/XorBinaryFuse8.java index 29b1618..e8f0337 100644 --- a/fastfilter/src/main/java/org/fastfilter/xor/XorBinaryFuse8.java +++ b/fastfilter/src/main/java/org/fastfilter/xor/XorBinaryFuse8.java @@ -8,6 +8,7 @@ /** * The xor binary fuse filter, a new algorithm that can replace a Bloom filter. + * Thomas Mueller Graf, Daniel Lemire, [Binary Fuse Filters: Fast and Smaller Than Xor Filters](http://arxiv.org/abs/2201.01174), Journal of Experimental Algorithmics 27, 2022. DOI: 10.1145/3510449 */ public class XorBinaryFuse8 implements Filter { @@ -79,6 +80,15 @@ private static int mod3(int x) { return x; } + /** + * Constructs a new XorBinaryFuse8 filter from the given array of keys. + * The filter is designed to have a low false positive rate while being space-efficient. + * The keys array should contain unique values. The array may be mutated during construction + * (e.g., sorted and deduplicated) if the algorithm detects that there are likely too many duplicates. 
+ * + * @param keys the array of long keys to add to the filter + * @return a new XorBinaryFuse8 filter containing all the keys + */ public static XorBinaryFuse8 construct(long[] keys) { int size = keys.length; int segmentLength = calculateSegmentLength(ARITY, size); @@ -103,6 +113,7 @@ private void addAll(long[] keys) { long[] reverseOrder = new long[size + 1]; byte[] reverseH = new byte[size]; int reverseOrderPos = 0; + boolean duplicated = false; // the lowest 2 bits are the h index (0, 1, or 2) // so we only have 6 bits for counting; @@ -118,7 +129,6 @@ private void addAll(long[] keys) { blockBits++; } int block = 1 << blockBits; - mainloop: while (true) { reverseOrder[size] = 1; int[] startPos = new int[block]; @@ -126,8 +136,8 @@ private void addAll(long[] keys) { startPos[i] = (int) ((long) i * size / block); } // counting sort - - for (long key : keys) { + for(int i = 0; i < size; i++) { + long key = keys[i]; long hash = Hash.hash64(key, seed); int segmentIndex = (int) (hash >>> (64 - blockBits)); // We only overwrite when the hash was zero. Zero hash values @@ -151,49 +161,46 @@ private void addAll(long[] keys) { } } startPos = null; - if (countMask < 0) { - // we have a possible counter overflow - continue mainloop; - } - - reverseOrderPos = 0; - int alonePos = 0; - for (int i = 0; i < arrayLength; i++) { - alone[alonePos] = i; - int inc = (t2count[i] >> 2) == 1 ? 1 : 0; - alonePos += inc; - } + if (countMask >= 0) { + reverseOrderPos = 0; + int alonePos = 0; + for (int i = 0; i < arrayLength; i++) { + alone[alonePos] = i; + int inc = (t2count[i] >> 2) == 1 ? 1 : 0; + alonePos += inc; + } - while (alonePos > 0) { - alonePos--; - int index = alone[alonePos]; - if ((t2count[index] >> 2) == 1) { - // It is still there! 
- long hash = t2hash[index]; - byte found = (byte) (t2count[index] & 3); - - reverseH[reverseOrderPos] = found; - reverseOrder[reverseOrderPos] = hash; - - h012[0] = getHashFromHash(hash, 0); - h012[1] = getHashFromHash(hash, 1); - h012[2] = getHashFromHash(hash, 2); - - int index3 = h012[mod3(found + 1)]; - alone[alonePos] = index3; - alonePos += ((t2count[index3] >> 2) == 2 ? 1 : 0); - t2count[index3] -= 4; - t2count[index3] ^= mod3(found + 1); - t2hash[index3] ^= hash; - - index3 = h012[mod3(found + 2)]; - alone[alonePos] = index3; - alonePos += ((t2count[index3] >> 2) == 2 ? 1 : 0); - t2count[index3] -= 4; - t2count[index3] ^= mod3(found + 2); - t2hash[index3] ^= hash; - - reverseOrderPos++; + while (alonePos > 0) { + alonePos--; + int index = alone[alonePos]; + if ((t2count[index] >> 2) == 1) { + // It is still there! + long hash = t2hash[index]; + byte found = (byte) (t2count[index] & 3); + + reverseH[reverseOrderPos] = found; + reverseOrder[reverseOrderPos] = hash; + + h012[0] = getHashFromHash(hash, 0); + h012[1] = getHashFromHash(hash, 1); + h012[2] = getHashFromHash(hash, 2); + + int index3 = h012[mod3(found + 1)]; + alone[alonePos] = index3; + alonePos += ((t2count[index3] >> 2) == 2 ? 1 : 0); + t2count[index3] -= 4; + t2count[index3] ^= mod3(found + 1); + t2hash[index3] ^= hash; + + index3 = h012[mod3(found + 2)]; + alone[alonePos] = index3; + alonePos += ((t2count[index3] >> 2) == 2 ? 1 : 0); + t2count[index3] -= 4; + t2count[index3] ^= mod3(found + 2); + t2hash[index3] ^= hash; + + reverseOrderPos++; + } } } @@ -204,7 +211,13 @@ private void addAll(long[] keys) { Arrays.fill(t2count, (byte) 0); Arrays.fill(t2hash, 0); Arrays.fill(reverseOrder, 0); - + // If we reach 10 passes, we assume that there are too many duplicates + // in the input key set. We then sort and remove duplicates in place. + // This should almost never happen. 
/**
 * Helper that sorts a prefix of a {@code long} array and compacts it so that
 * each distinct value appears once.
 */
public class Deduplicator {

    /**
     * Sorts the first {@code length} entries of {@code keys} in ascending order
     * and removes duplicates in place.
     *
     * <p>On return, {@code keys[0 .. result-1]} holds the distinct values in
     * ascending order. Entries from index {@code result} up to
     * {@code length - 1} are left with unspecified (stale) values, and entries
     * at {@code length} and beyond are untouched.
     *
     * @param keys   the array of keys to deduplicate; modified in place
     * @param length the number of leading entries of {@code keys} to process
     * @return the number of distinct values, which is 0 when {@code length} is 0
     * @throws IllegalArgumentException       if {@code length} is negative
     * @throws ArrayIndexOutOfBoundsException if {@code length > keys.length}
     */
    public static int sortAndRemoveDup(long[] keys, int length) {
        // Arrays.sort also validates the range, so bad arguments fail here.
        Arrays.sort(keys, 0, length);
        if (length == 0) {
            // An empty range has no unique elements; starting the count at 1
            // below would wrongly report one.
            return 0;
        }
        // keys[0] is always kept; 'unique' is both the count of kept values
        // and the next write position.
        int unique = 1;
        for (int i = 1; i < length; i++) {
            // After sorting, duplicates are adjacent. Writes only touch
            // indices <= i, and a write at i - 1 (only when unique == i) is a
            // self-assignment, so keys[i - 1] still holds its sorted value.
            if (keys[i] != keys[i - 1]) {
                keys[unique] = keys[i];
                unique++;
            }
        }
        return unique;
    }

}