From 5d2b41d4c9161035baa9236452ac84c5186a6553 Mon Sep 17 00:00:00 2001
From: Daniel Lemire <daniel@lemire.me>
Date: Mon, 19 Jan 2026 16:20:40 -0500
Subject: [PATCH 1/3] saving.

---
 README.md                                     |  56 ++++++++
 .../org/fastfilter/xor/StringFilters.java     | 131 ++++++++++++++++++
 2 files changed, 187 insertions(+)
 create mode 100644 fastfilter/src/test/java/org/fastfilter/xor/StringFilters.java

diff --git a/README.md b/README.md
index 50ca0fd..48809d0 100644
--- a/README.md
+++ b/README.md
@@ -27,6 +27,62 @@ The following additional types are implemented, but less tested:
 
 ## Usage
 
+
+To use the XOR and Binary Fuse filters, first prepare an array of keys, then construct the filter:
+
+```java
+import org.fastfilter.xor.Xor8;
+import org.fastfilter.xor.Xor16;
+import org.fastfilter.xor.XorBinaryFuse8;
+import org.fastfilter.xor.XorBinaryFuse16;
+
+// Example keys
+long[] keys = {1, 2, 3, 4, 5};
+
+// Construct XOR filters
+Xor8 xor8 = Xor8.construct(keys);
+Xor16 xor16 = Xor16.construct(keys);
+XorBinaryFuse8 xorBinaryFuse8 = XorBinaryFuse8.construct(keys);
+XorBinaryFuse16 xorBinaryFuse16 = XorBinaryFuse16.construct(keys);
+
+// Check membership
+boolean mightContain = xor8.mayContain(1L); // true
+boolean mightContain2 = xor8.mayContain(6L); // false (with high probability)
+```
+
+All filters implement the `Filter` interface and support the `mayContain(long key)` method to check if a key might be in the set. Note that false positives are possible, but false negatives are not.
+
+
+### Serialization and Deserialization
+
+Filters can be serialized to and deserialized from a `ByteBuffer` for persistence or transmission:
+
+```java
+import java.nio.ByteBuffer;
+
+// Assuming you have a constructed filter, e.g., Xor8 xor8 = Xor8.construct(keys);
+
+// Get the serialized size
+int size = xor8.getSerializedSize();
+
+// Allocate a ByteBuffer
+ByteBuffer buffer = ByteBuffer.allocate(size);
+
+// Serialize the filter
+xor8.serialize(buffer);
+
+// Prepare buffer for reading (flip)
+buffer.flip();
+
+// Deserialize the filter
+Xor8 deserializedXor8 = Xor8.deserialize(buffer);
+
+// The deserialized filter behaves identically to the original
+boolean result = deserializedXor8.mayContain(1L); // true
+```
+
+This allows saving filters to files, databases, or sending them over networks.
+
 ### Maven
 
 When using Maven: The latest version, 1.0.4, is not yet available on Maven central, see [issue #48](https://github.com/FastFilter/fastfilter_java/issues/48). However, it is available at https://jitpack.io/:
diff --git a/fastfilter/src/test/java/org/fastfilter/xor/StringFilters.java b/fastfilter/src/test/java/org/fastfilter/xor/StringFilters.java
new file mode 100644
index 0000000..35f7324
--- /dev/null
+++ b/fastfilter/src/test/java/org/fastfilter/xor/StringFilters.java
@@ -0,0 +1,131 @@
+package org.fastfilter.xor;
+
+import static org.junit.Assert.assertTrue;
+
+import java.util.HashSet;
+import java.util.Random;
+import java.util.Set;
+
+import org.fastfilter.Filter;
+import org.junit.Test;
+
+public class StringFilters {
+
+    private static final int NUM_STRINGS = 100_000;
+    private static final int NUM_TEST_STRINGS = 1_000;
+    private static final Random random = new Random(42);
+
+    private static final long[] keys = generateKeys();
+    private static final long[] testKeys = generateTestKeys();
+
+    private static long[] generateKeys() {
+        String[] strings = new String[NUM_STRINGS];
+        for (int i = 0; i < NUM_STRINGS; i++) {
+            strings[i] = generateRandomString();
+        }
+        long[] k = new long[NUM_STRINGS];
+        for (int i = 0; i < NUM_STRINGS; i++) {
+            k[i] = hashString(strings[i]);
+        }
+        checkUniqueness(k, "keys");
+        return k;
+    }
+
+    private static long[] generateTestKeys() {
+        String[] strings = new String[NUM_TEST_STRINGS];
+        for (int i = 0; i < NUM_TEST_STRINGS; i++) {
+            strings[i] = generateRandomString();
+        }
+        long[] k = new long[NUM_TEST_STRINGS];
+        for (int i = 0; i < NUM_TEST_STRINGS; i++) {
+            k[i] = hashString(strings[i]);
+        }
+        checkUniqueness(k, "test keys");
+        return k;
+    }
+
+    private static void checkUniqueness(long[] array, String name) {
+        Set<Long> set = new HashSet<>();
+        int collisions = 0;
+        for (long l : array) {
+            if (!set.add(l)) {
+                collisions++;
+            }
+        }
+        if (collisions > 0) {
+            System.out.println("Warning: " + collisions + " hash collisions in " + name);
+        } else {
+            System.out.println("No hash collisions in " + name);
+        }
+    }
+
+    private static String generateRandomString() {
+        int length = 5 + random.nextInt(16); // 5 to 20 chars
+        StringBuilder sb = new StringBuilder(length);
+        for (int i = 0; i < length; i++) {
+            sb.append((char) ('a' + random.nextInt(26)));
+        }
+        return sb.toString();
+    }
+
+    private static long hashString(String s) {
+        long h = 0;
+        for (char c : s.toCharArray()) {
+            h = h * 31 + c;
+        }
+        return h;
+    }
+
+    @Test
+    public void testXor8() {
+        testFilter(Xor8.class);
+    }
+
+    @Test
+    public void testXor16() {
+        testFilter(Xor16.class);
+    }
+
+    @Test
+    public void testXorBinaryFuse8() {
+        testFilter(XorBinaryFuse8.class);
+    }
+
+    @Test
+    public void testXorBinaryFuse16() {
+        testFilter(XorBinaryFuse16.class);
+    }
+
+    @Test
+    public void testXorBinaryFuse32() {
+        testFilter(XorBinaryFuse32.class);
+    }
+
+    private void testFilter(Class<?> filterClass) {
+        // Construct filter
+        Filter filter;
+        try {
+            filter = (Filter) filterClass.getMethod("construct", long[].class).invoke(null, (Object) keys);
+        } catch (Exception e) {
+            throw new RuntimeException(e);
+        }
+
+        // Check all keys are in the filter
+        for (int i = 0; i < NUM_STRINGS; i++) {
+            assertTrue("Key " + i + " should be in filter", filter.mayContain(keys[i]));
+        }
+
+        // Check false positives on test keys
+        int falsePositives = 0;
+        for (int i = 0; i < NUM_TEST_STRINGS; i++) {
+            if (filter.mayContain(testKeys[i])) {
+                falsePositives++;
+            }
+        }
+
+        // Expect low false positive rate (less than 1% for most filters)
+        double fpp = (double) falsePositives / NUM_TEST_STRINGS;
+        System.out.println(filterClass.getSimpleName() + " false positive rate: " + fpp);
+        assertTrue("False positive rate should be low: " + fpp, fpp < 0.01); // Allow up to 1%
+    }
+}

From 55d5510107f02a90ee8fecbb610dca2b03ecfa19 Mon Sep 17 00:00:00 2001
From: Daniel Lemire <daniel@lemire.me>
Date: Mon, 19 Jan 2026 18:58:15 -0500
Subject: [PATCH 2/3] This makes XorBinaryFuse16 and XorBinaryFuse8 more robust

---
 README.md                                     |  22 ++--
 .../main/java/org/fastfilter/xor/Xor16.java   |   3 +
 .../main/java/org/fastfilter/xor/Xor8.java    |   4 +
 .../org/fastfilter/xor/XorBinaryFuse16.java   | 105 ++++++++++--------
 .../org/fastfilter/xor/XorBinaryFuse32.java   |   3 +-
 .../org/fastfilter/xor/XorBinaryFuse8.java    | 105 ++++++++++--------
 6 files changed, 138 insertions(+), 104 deletions(-)

diff --git a/README.md b/README.md
index 48809d0..3437b8a 100644
--- a/README.md
+++ b/README.md
@@ -22,7 +22,7 @@ The following additional types are implemented, but less tested:
 
 ## Reference
 
-* Thomas Mueller Graf, Daniel Lemire, [Binary Fuse Filters: Fast and Smaller Than Xor Filters](http://arxiv.org/abs/2201.01174), 	Journal of Experimental Algorithmics 27, 2022. DOI: 10.1145/3510449   
+* Thomas Mueller Graf, Daniel Lemire, [Binary Fuse Filters: Fast and Smaller Than Xor Filters](http://arxiv.org/abs/2201.01174), 	Journal of Experimental Algorithmics 27, 2022. DOI: 10.1145/3510449
 * Thomas Mueller Graf,  Daniel Lemire, [Xor Filters: Faster and Smaller Than Bloom and Cuckoo Filters](https://arxiv.org/abs/1912.08258), Journal of Experimental Algorithmics 25 (1), 2020. DOI: 10.1145/3376122
 
 ## Usage
@@ -31,17 +31,13 @@ The following additional types are implemented, but less tested:
 To use the XOR and Binary Fuse filters, first prepare an array of keys, then construct the filter:
 
 ```java
-import org.fastfilter.xor.Xor8;
-import org.fastfilter.xor.Xor16;
 import org.fastfilter.xor.XorBinaryFuse8;
 import org.fastfilter.xor.XorBinaryFuse16;
 
 // Example keys
 long[] keys = {1, 2, 3, 4, 5};
 
-// Construct XOR filters
-Xor8 xor8 = Xor8.construct(keys);
-Xor16 xor16 = Xor16.construct(keys);
+// Construct binary fuse filters=
 XorBinaryFuse8 xorBinaryFuse8 = XorBinaryFuse8.construct(keys);
 XorBinaryFuse16 xorBinaryFuse16 = XorBinaryFuse16.construct(keys);
 
@@ -52,6 +48,11 @@ boolean mightContain2 = xor8.mayContain(6L); // false (with high probability)
 
 All filters implement the `Filter` interface and support the `mayContain(long key)` method to check if a key might be in the set. Note that false positives are possible, but false negatives are not.
 
+### Generating the Hash Values
+
+The library is written to process `long` values that are meant to be hash values. Though you do not need to use
+cryptographically strong hashing, you should make sure that your hash functions are reasonable: they should
+not generate too many collisions (two objects mapping to the same `long` value).
 
 ### Serialization and Deserialization
 
@@ -60,25 +61,24 @@ Filters can be serialized to and deserialized from a `ByteBuffer` for persistenc
 ```java
 import java.nio.ByteBuffer;
 
-// Assuming you have a constructed filter, e.g., Xor8 xor8 = Xor8.construct(keys);
+// Assuming you have a constructed filter
 
 // Get the serialized size
-int size = xor8.getSerializedSize();
+int size = XorBinaryFuse8.getSerializedSize();
 
 // Allocate a ByteBuffer
 ByteBuffer buffer = ByteBuffer.allocate(size);
 
 // Serialize the filter
-xor8.serialize(buffer);
+XorBinaryFuse8.serialize(buffer);
 
 // Prepare buffer for reading (flip)
 buffer.flip();
 
 // Deserialize the filter
-Xor8 deserializedXor8 = Xor8.deserialize(buffer);
+XorBinaryFuse8 deserializedXorBinaryFuse8 = Xor8.deserialize(buffer);
 
 // The deserialized filter behaves identically to the original
-boolean result = deserializedXor8.mayContain(1L); // true
 ```
 
 This allows saving filters to files, databases, or sending them over networks.
diff --git a/fastfilter/src/main/java/org/fastfilter/xor/Xor16.java b/fastfilter/src/main/java/org/fastfilter/xor/Xor16.java
index 8cdc4d8..603a3f1 100644
--- a/fastfilter/src/main/java/org/fastfilter/xor/Xor16.java
+++ b/fastfilter/src/main/java/org/fastfilter/xor/Xor16.java
@@ -6,7 +6,10 @@
 import org.fastfilter.utils.Hash;
 
 /**
+ * The Xor16 filter implementation is experimental. We recommend using XorBinaryFuse16 instead. Use at your own risks.
+ * 
  * The xor filter, a new algorithm that can replace a Bloom filter.
+ * Thomas Mueller Graf,  Daniel Lemire, [Xor Filters: Faster and Smaller Than Bloom and Cuckoo Filters](https://arxiv.org/abs/1912.08258), Journal of Experimental Algorithmics 25 (1), 2020. DOI: 10.1145/3376122
  *
  * It needs 1.23 log(1/fpp) bits per key. It is related to the BDZ algorithm [1]
  * (a minimal perfect hash function algorithm).
diff --git a/fastfilter/src/main/java/org/fastfilter/xor/Xor8.java b/fastfilter/src/main/java/org/fastfilter/xor/Xor8.java
index bb3b5ff..a54b0d9 100644
--- a/fastfilter/src/main/java/org/fastfilter/xor/Xor8.java
+++ b/fastfilter/src/main/java/org/fastfilter/xor/Xor8.java
@@ -6,8 +6,12 @@
 import org.fastfilter.Filter;
 import org.fastfilter.utils.Hash;
 
+
 /**
+ * The Xor8 filter implementation is experimental. We recommend using XorBinaryFuse8 instead. Use at your own risks.
+ *
  * The xor filter, a new algorithm that can replace a Bloom filter.
+ * Thomas Mueller Graf,  Daniel Lemire, [Xor Filters: Faster and Smaller Than Bloom and Cuckoo Filters](https://arxiv.org/abs/1912.08258), Journal of Experimental Algorithmics 25 (1), 2020. DOI: 10.1145/3376122
  *
  * It needs 1.23 log(1/fpp) bits per key. It is related to the BDZ algorithm [1]
  * (a minimal perfect hash function algorithm).
diff --git a/fastfilter/src/main/java/org/fastfilter/xor/XorBinaryFuse16.java b/fastfilter/src/main/java/org/fastfilter/xor/XorBinaryFuse16.java
index e7332fd..b0ef76a 100644
--- a/fastfilter/src/main/java/org/fastfilter/xor/XorBinaryFuse16.java
+++ b/fastfilter/src/main/java/org/fastfilter/xor/XorBinaryFuse16.java
@@ -7,6 +7,7 @@
 
 /**
  * The xor binary fuse filter, a new algorithm that can replace a Bloom filter.
+ * Thomas Mueller Graf, Daniel Lemire, [Binary Fuse Filters: Fast and Smaller Than Xor Filters](http://arxiv.org/abs/2201.01174), 	Journal of Experimental Algorithmics 27, 2022. DOI: 10.1145/3510449  
  */
 public class XorBinaryFuse16 implements Filter {
 
@@ -78,6 +79,15 @@ private static int mod3(int x) {
         return x;
     }
 
+    /**
+     * Constructs a new XorBinaryFuse16 filter from the given array of keys.
+     * The filter is designed to have a low false positive rate while being space-efficient.
+     * The keys array should contain unique values. The array may be mutated during construction
+     * (e.g., sorted and deduplicated) if the algorithm detects that there are likely too many duplicates.
+     *
+     * @param keys the array of long keys to add to the filter
+     * @return a new XorBinaryFuse16 filter containing all the keys
+     */
     public static XorBinaryFuse16 construct(long[] keys) {
         int size = keys.length;
         int segmentLength = calculateSegmentLength(ARITY, size);
@@ -102,6 +112,7 @@ private void addAll(long[] keys) {
         long[] reverseOrder = new long[size + 1];
         byte[] reverseH = new byte[size];
         int reverseOrderPos = 0;
+        boolean duplicated = false;
 
         // the lowest 2 bits are the h index (0, 1, or 2)
         // so we only have 6 bits for counting;
@@ -117,7 +128,6 @@ private void addAll(long[] keys) {
             blockBits++;
         }
         int block = 1 << blockBits;
-        mainloop:
         while (true) {
             reverseOrder[size] = 1;
             int[] startPos = new int[block];
@@ -126,7 +136,8 @@ private void addAll(long[] keys) {
             }
             // counting sort
 
-            for (long key : keys) {
+            for(int i = 0; i < size; i++) {
+                long key = keys[i];
                 long hash = Hash.hash64(key, seed);
                 int segmentIndex = (int) (hash >>> (64 - blockBits));
                 // We only overwrite when the hash was zero. Zero hash values
@@ -150,52 +161,48 @@ private void addAll(long[] keys) {
                 }
             }
             startPos = null;
-            if (countMask < 0) {
-                // we have a possible counter overflow
-                continue mainloop;
-            }
-
-            reverseOrderPos = 0;
-            int alonePos = 0;
-            for (int i = 0; i < arrayLength; i++) {
-                alone[alonePos] = i;
-                int inc = (t2count[i] >> 2) == 1 ? 1 : 0;
-                alonePos += inc;
-            }
+            if (countMask >= 0) {
+                reverseOrderPos = 0;
+                int alonePos = 0;
+                for (int i = 0; i < arrayLength; i++) {
+                    alone[alonePos] = i;
+                    int inc = (t2count[i] >> 2) == 1 ? 1 : 0;
+                    alonePos += inc;
+                }
 
-            while (alonePos > 0) {
-                alonePos--;
-                int index = alone[alonePos];
-                if ((t2count[index] >> 2) == 1) {
-                    // It is still there!
-                    long hash = t2hash[index];
-                    byte found = (byte) (t2count[index] & 3);
-
-                    reverseH[reverseOrderPos] = found;
-                    reverseOrder[reverseOrderPos] = hash;
-
-                    h012[0] = getHashFromHash(hash, 0);
-                    h012[1] = getHashFromHash(hash, 1);
-                    h012[2] = getHashFromHash(hash, 2);
-
-                    int index3 = h012[mod3(found + 1)];
-                    alone[alonePos] = index3;
-                    alonePos += ((t2count[index3] >> 2) == 2 ? 1 : 0);
-                    t2count[index3] -= 4;
-                    t2count[index3] ^= mod3(found + 1);
-                    t2hash[index3] ^= hash;
-
-                    index3 = h012[mod3(found + 2)];
-                    alone[alonePos] = index3;
-                    alonePos += ((t2count[index3] >> 2) == 2 ? 1 : 0);
-                    t2count[index3] -= 4;
-                    t2count[index3] ^= mod3(found + 2);
-                    t2hash[index3] ^= hash;
-
-                    reverseOrderPos++;
+                while (alonePos > 0) {
+                    alonePos--;
+                    int index = alone[alonePos];
+                    if ((t2count[index] >> 2) == 1) {
+                        // It is still there!
+                        long hash = t2hash[index];
+                        byte found = (byte) (t2count[index] & 3);
+
+                        reverseH[reverseOrderPos] = found;
+                        reverseOrder[reverseOrderPos] = hash;
+
+                        h012[0] = getHashFromHash(hash, 0);
+                        h012[1] = getHashFromHash(hash, 1);
+                        h012[2] = getHashFromHash(hash, 2);
+
+                        int index3 = h012[mod3(found + 1)];
+                        alone[alonePos] = index3;
+                        alonePos += ((t2count[index3] >> 2) == 2 ? 1 : 0);
+                        t2count[index3] -= 4;
+                        t2count[index3] ^= mod3(found + 1);
+                        t2hash[index3] ^= hash;
+
+                        index3 = h012[mod3(found + 2)];
+                        alone[alonePos] = index3;
+                        alonePos += ((t2count[index3] >> 2) == 2 ? 1 : 0);
+                        t2count[index3] -= 4;
+                        t2count[index3] ^= mod3(found + 2);
+                        t2hash[index3] ^= hash;
+
+                        reverseOrderPos++;
+                    }
                 }
             }
-
             if (reverseOrderPos == size) {
                 break;
             }
@@ -203,7 +210,13 @@ private void addAll(long[] keys) {
             Arrays.fill(t2count, (byte) 0);
             Arrays.fill(t2hash, 0);
             Arrays.fill(reverseOrder, 0);
-
+            // If we reach 10 passes, we assume that there are too many duplicates
+            // in the input key set. We then sort and remove duplicates in place.
+            // This should almost never happen.
+            if (countMask < 0 && !duplicated) {
+                size = Deduplicator.sortAndRemoveDup(keys, size);
+                duplicated = true;
+            }
             if (hashIndex > 100) {
                 // if construction doesn't succeed eventually,
                 // then there is likely a problem with the hash function.
diff --git a/fastfilter/src/main/java/org/fastfilter/xor/XorBinaryFuse32.java b/fastfilter/src/main/java/org/fastfilter/xor/XorBinaryFuse32.java
index 29df2e5..fc1a01b 100644
--- a/fastfilter/src/main/java/org/fastfilter/xor/XorBinaryFuse32.java
+++ b/fastfilter/src/main/java/org/fastfilter/xor/XorBinaryFuse32.java
@@ -7,7 +7,8 @@
 import org.fastfilter.utils.Hash;
 
 /**
- * The xor binary fuse filter, a new algorithm that can replace a Bloom filter.
+ * The XorBinaryFuse32 filter is experimental. We recommend using XorBinaryFuse8 or XorBinaryFuse16 instead.
+ * Use at your own risks.
  */
 public class XorBinaryFuse32 implements Filter {
 
diff --git a/fastfilter/src/main/java/org/fastfilter/xor/XorBinaryFuse8.java b/fastfilter/src/main/java/org/fastfilter/xor/XorBinaryFuse8.java
index 29b1618..e8f0337 100644
--- a/fastfilter/src/main/java/org/fastfilter/xor/XorBinaryFuse8.java
+++ b/fastfilter/src/main/java/org/fastfilter/xor/XorBinaryFuse8.java
@@ -8,6 +8,7 @@
 
 /**
  * The xor binary fuse filter, a new algorithm that can replace a Bloom filter.
+ * Thomas Mueller Graf, Daniel Lemire, [Binary Fuse Filters: Fast and Smaller Than Xor Filters](http://arxiv.org/abs/2201.01174), 	Journal of Experimental Algorithmics 27, 2022. DOI: 10.1145/3510449  
  */
 public class XorBinaryFuse8 implements Filter {
 
@@ -79,6 +80,15 @@ private static int mod3(int x) {
         return x;
     }
 
+    /**
+     * Constructs a new XorBinaryFuse8 filter from the given array of keys.
+     * The filter is designed to have a low false positive rate while being space-efficient.
+     * The keys array should contain unique values. The array may be mutated during construction
+     * (e.g., sorted and deduplicated) if the algorithm detects that there are likely too many duplicates.
+     *
+     * @param keys the array of long keys to add to the filter
+     * @return a new XorBinaryFuse8 filter containing all the keys
+     */
     public static XorBinaryFuse8 construct(long[] keys) {
         int size = keys.length;
         int segmentLength = calculateSegmentLength(ARITY, size);
@@ -103,6 +113,7 @@ private void addAll(long[] keys) {
         long[] reverseOrder = new long[size + 1];
         byte[] reverseH = new byte[size];
         int reverseOrderPos = 0;
+        boolean duplicated = false;
 
         // the lowest 2 bits are the h index (0, 1, or 2)
         // so we only have 6 bits for counting;
@@ -118,7 +129,6 @@ private void addAll(long[] keys) {
             blockBits++;
         }
         int block = 1 << blockBits;
-        mainloop:
         while (true) {
             reverseOrder[size] = 1;
             int[] startPos = new int[block];
@@ -126,8 +136,8 @@ private void addAll(long[] keys) {
                 startPos[i] = (int) ((long) i * size / block);
             }
             // counting sort
-
-            for (long key : keys) {
+            for(int i = 0; i < size; i++) {
+                long key = keys[i];
                 long hash = Hash.hash64(key, seed);
                 int segmentIndex = (int) (hash >>> (64 - blockBits));
                 // We only overwrite when the hash was zero. Zero hash values
@@ -151,49 +161,46 @@ private void addAll(long[] keys) {
                 }
             }
             startPos = null;
-            if (countMask < 0) {
-                // we have a possible counter overflow
-                continue mainloop;
-            }
-
-            reverseOrderPos = 0;
-            int alonePos = 0;
-            for (int i = 0; i < arrayLength; i++) {
-                alone[alonePos] = i;
-                int inc = (t2count[i] >> 2) == 1 ? 1 : 0;
-                alonePos += inc;
-            }
+            if (countMask >= 0) {
+                reverseOrderPos = 0;
+                int alonePos = 0;
+                for (int i = 0; i < arrayLength; i++) {
+                    alone[alonePos] = i;
+                    int inc = (t2count[i] >> 2) == 1 ? 1 : 0;
+                    alonePos += inc;
+                }
 
-            while (alonePos > 0) {
-                alonePos--;
-                int index = alone[alonePos];
-                if ((t2count[index] >> 2) == 1) {
-                    // It is still there!
-                    long hash = t2hash[index];
-                    byte found = (byte) (t2count[index] & 3);
-
-                    reverseH[reverseOrderPos] = found;
-                    reverseOrder[reverseOrderPos] = hash;
-
-                    h012[0] = getHashFromHash(hash, 0);
-                    h012[1] = getHashFromHash(hash, 1);
-                    h012[2] = getHashFromHash(hash, 2);
-
-                    int index3 = h012[mod3(found + 1)];
-                    alone[alonePos] = index3;
-                    alonePos += ((t2count[index3] >> 2) == 2 ? 1 : 0);
-                    t2count[index3] -= 4;
-                    t2count[index3] ^= mod3(found + 1);
-                    t2hash[index3] ^= hash;
-
-                    index3 = h012[mod3(found + 2)];
-                    alone[alonePos] = index3;
-                    alonePos += ((t2count[index3] >> 2) == 2 ? 1 : 0);
-                    t2count[index3] -= 4;
-                    t2count[index3] ^= mod3(found + 2);
-                    t2hash[index3] ^= hash;
-
-                    reverseOrderPos++;
+                while (alonePos > 0) {
+                    alonePos--;
+                    int index = alone[alonePos];
+                    if ((t2count[index] >> 2) == 1) {
+                        // It is still there!
+                        long hash = t2hash[index];
+                        byte found = (byte) (t2count[index] & 3);
+
+                        reverseH[reverseOrderPos] = found;
+                        reverseOrder[reverseOrderPos] = hash;
+
+                        h012[0] = getHashFromHash(hash, 0);
+                        h012[1] = getHashFromHash(hash, 1);
+                        h012[2] = getHashFromHash(hash, 2);
+
+                        int index3 = h012[mod3(found + 1)];
+                        alone[alonePos] = index3;
+                        alonePos += ((t2count[index3] >> 2) == 2 ? 1 : 0);
+                        t2count[index3] -= 4;
+                        t2count[index3] ^= mod3(found + 1);
+                        t2hash[index3] ^= hash;
+
+                        index3 = h012[mod3(found + 2)];
+                        alone[alonePos] = index3;
+                        alonePos += ((t2count[index3] >> 2) == 2 ? 1 : 0);
+                        t2count[index3] -= 4;
+                        t2count[index3] ^= mod3(found + 2);
+                        t2hash[index3] ^= hash;
+
+                        reverseOrderPos++;
+                    }
                 }
             }
 
@@ -204,7 +211,13 @@ private void addAll(long[] keys) {
             Arrays.fill(t2count, (byte) 0);
             Arrays.fill(t2hash, 0);
             Arrays.fill(reverseOrder, 0);
-
+            // If we reach 10 passes, we assume that there are too many duplicates
+            // in the input key set. We then sort and remove duplicates in place.
+            // This should almost never happen.
+            if (countMask < 0 && !duplicated) {
+                size = Deduplicator.sortAndRemoveDup(keys, size);
+                duplicated = true;
+            }
             if (hashIndex > 100) {
                 // if construction doesn't succeed eventually,
                 // then there is likely a problem with the hash function.

From ff1e4b29f5c7fb9cdf70375967abd8e75e68832a Mon Sep 17 00:00:00 2001
From: Daniel Lemire <daniel@lemire.me>
Date: Mon, 19 Jan 2026 19:01:31 -0500
Subject: [PATCH 3/3] adding missing file

---
 .../java/org/fastfilter/xor/Deduplicator.java | 27 +++++++++++++++++++
 1 file changed, 27 insertions(+)
 create mode 100644 fastfilter/src/main/java/org/fastfilter/xor/Deduplicator.java

diff --git a/fastfilter/src/main/java/org/fastfilter/xor/Deduplicator.java b/fastfilter/src/main/java/org/fastfilter/xor/Deduplicator.java
new file mode 100644
index 0000000..6bd1759
--- /dev/null
+++ b/fastfilter/src/main/java/org/fastfilter/xor/Deduplicator.java
@@ -0,0 +1,27 @@
+package org.fastfilter.xor;
+
+import java.util.Arrays;
+
+public class Deduplicator {
+
+    /**
+     * Sorts the keys array and removes duplicates in place.
+     * Returns the new length of the array (number of unique elements).
+     *
+     * @param keys the array of keys to deduplicate
+     * @param length the current length of the array
+     * @return the new length after removing duplicates
+     */
+    public static int sortAndRemoveDup(long[] keys, int length) {
+        Arrays.sort(keys, 0, length);
+        int j = 1;
+        for (int i = 1; i < length; i++) {
+            if (keys[i] != keys[i - 1]) {
+                keys[j] = keys[i];
+                j++;
+            }
+        }
+        return j;
+    }
+
+}