Block structured Bloom filter #690

Open · wants to merge 43 commits into base: main

Commits (43)
c08ee6d
bloomfilter: Add a simple construction benchmark
dcoutts Mar 24, 2025
02d8311
bloomfilter: removes Hashes, specialise to CheapHashes scheme
dcoutts Mar 19, 2025
c53370d
bloomfilter: use ByteArray type from primitive package
dcoutts Apr 22, 2025
dbff02a
bloomfilter: combine a couple modules into one
dcoutts Mar 19, 2025
a5fe945
bloomfilter: Remove pointless exported functions
dcoutts Mar 26, 2025
98add8e
bloomfilter: misc minor cleanups of the tests
dcoutts Mar 26, 2025
c1eea06
bloomfilter: change the example spell program into an executable
jorisdral Apr 24, 2025
458ba2e
bloomfilter: Add new size calculation code
dcoutts Mar 26, 2025
e4a9333
bloomfilter: add tests for new size calculation functions
dcoutts Mar 26, 2025
7438e05
bloomfilter: change Easy module to use new size calculations
dcoutts Mar 26, 2025
d30bc58
bloomfilter: remove primes helper program
dcoutts Mar 26, 2025
630995b
bloomfilter: remove old calc functions
dcoutts Mar 26, 2025
1cff8e7
bloomfilter: use new BloomSize type for filter construction functions
dcoutts Mar 29, 2025
b6a4675
bloomfilter: change length to size returning BloomSize
dcoutts Mar 29, 2025
14784e9
bloomfilter: add (de)serialise functions, for better abstraction
dcoutts Mar 29, 2025
3c83476
convert bloomFilterToLBS to use new Bloom.serialise
dcoutts Mar 29, 2025
305bff5
Switch FsPath to FsErrorPath in FileCorruptedError exception type
dcoutts Mar 31, 2025
1927d00
bloomfilter: fix showing counterexamples in prop_verifyFPR
dcoutts Mar 31, 2025
4aa4349
convert bloomFilterFromSBS to use new Bloom.deserialise
dcoutts Apr 4, 2025
cfcffa0
bloomfilter: Move most Data.BloomFilter modules under Data.BloomFilte…
dcoutts Apr 9, 2025
8074c4b
bloomfilter: improve naming in Calc functions
dcoutts Apr 9, 2025
3c5feae
bloomfilter: allow 0 bits in policyForBits
dcoutts Apr 9, 2025
02bf170
bloomfilter: remove last uses of internal modules
dcoutts Apr 10, 2025
6c25424
bloomfilter: use a mildly better version of unfoldr
dcoutts Apr 13, 2025
7435cc7
bloomfilter: establish a common API for hash-based insert and elem
dcoutts Apr 14, 2025
f6a7188
bloomfilter: Add new Data.BloomFilter.Blocked implementation
dcoutts Apr 10, 2025
df0fb16
bloomfilter: generalise tests to cover the Blocked implementation
dcoutts Apr 11, 2025
c83b359
bloomfilter: extend benchmark to blocked implementation
dcoutts Apr 22, 2025
f91a7da
bloomfilter: add bloomfilter-fpr-calc and gnuplot script
dcoutts Apr 15, 2025
52cdac3
bloomfilter: add operation (?) = flip elem
dcoutts Apr 14, 2025
d808621
bloomfilter: export a formatVersion number
dcoutts Apr 13, 2025
685e3d2
Use Bloom.filterVersion number in the lsm-tree serialisation code
dcoutts Apr 13, 2025
0181539
bloomfilter: switch range reduction from division to multiplication
dcoutts Apr 13, 2025
cca509b
Re-export (M)Bloom via D.LSMTree.I.BloomFilter to reduce coupling
dcoutts Apr 10, 2025
b977a02
Switch lsm-tree to use the Blocked bloom filter implementation
dcoutts Apr 15, 2025
4ce0308
bloomfilter: enable the same warnings as other packages
dcoutts Apr 22, 2025
b4231ae
Update bloomfilter/src/Data/BloomFilter/Classic/BitArray.hs
dcoutts Apr 29, 2025
dbd10c8
Update bloomfilter/src/Data/BloomFilter/Blocked/BitArray.hs
dcoutts Apr 29, 2025
1f445ac
Apply suggestions from code review
dcoutts Apr 29, 2025
7965a1f
Apply suggestions from code review
dcoutts Apr 29, 2025
c8c1f91
Apply suggestions from code review
dcoutts Apr 29, 2025
91e9bbc
Apply suggestions from code review
dcoutts Apr 29, 2025
9714096
Apply suggestions from code review
dcoutts Apr 29, 2025
4 changes: 4 additions & 0 deletions .hlint.yaml
@@ -36,6 +36,10 @@
- ignore: {name: "Redundant =="}
- ignore: {name: "Hoist not"}
- ignore: {name: "Use /="}
- ignore: {name: "Use unless"}
- ignore: {name: "Use notElem"}
- ignore: {name: "Use elem"}
- ignore: {name: "Use infix"}

# Specify additional command line arguments
#
66 changes: 31 additions & 35 deletions bench/macro/lsm-tree-bench-bloomfilter.hs
@@ -9,10 +9,8 @@ import Control.Monad
import Control.Monad.ST
import Control.Monad.ST.Unsafe
import Data.Bits ((.&.))
import Data.BloomFilter (Bloom)
import qualified Data.BloomFilter as Bloom
import qualified Data.BloomFilter.Hash as Bloom
import qualified Data.BloomFilter.Mutable as MBloom
import Data.BloomFilter.Blocked (Bloom, BloomSize)
import qualified Data.BloomFilter.Blocked as Bloom
import Data.Time
import Data.Vector (Vector)
import qualified Data.Vector as V
@@ -28,7 +26,6 @@ import Text.Printf (printf)
import Database.LSMTree.Extras.Orphans ()
import Database.LSMTree.Internal.Assertions (fromIntegralChecked)
import qualified Database.LSMTree.Internal.BloomFilterQuery1 as Bloom1
import Database.LSMTree.Internal.RunAcc (numHashFunctions)
import Database.LSMTree.Internal.Serialise (SerialisedKey,
serialiseKey)

@@ -60,7 +57,7 @@ benchmarkNumLookups = 25_000_000
benchmarkBatchSize :: Int
benchmarkBatchSize = 256

benchmarkNumBitsPerEntry :: Integer
benchmarkNumBitsPerEntry :: RequestedBitsPerEntry
benchmarkNumBitsPerEntry = 10

benchmarks :: IO ()
@@ -76,7 +73,7 @@ benchmarks = do
let filterSizes = lsmStyleBloomFilters benchmarkSizeBase
benchmarkNumBitsPerEntry
putStrLn "Bloom filter stats:"
putStrLn "(numEntries, sizeFactor, numBits, numHashFuncs)"
putStrLn "(numEntries, sizeFactor, BloomSize { sizeBits, sizeHashes })"
mapM_ print filterSizes
putStrLn $ "total number of entries:\t " ++ show (totalNumEntries filterSizes)
putStrLn $ "total filter size in bytes:\t " ++ show (totalNumBytes filterSizes)
@@ -94,19 +91,19 @@ benchmarks = do
putStrLn ""

hashcost <-
benchmark "makeCheapHashes"
benchmark "makeHashes"
"(This baseline is the cost of computing and hashing the keys)"
(benchInBatches benchmarkBatchSize rng0
(benchMakeCheapHashes vbs))
(benchMakeHashes vbs))
(fromIntegralChecked benchmarkNumLookups)
(0, 0)
289

_ <-
benchmark "elemCheapHashes"
benchmark "elemHashes"
"(this is the simple one-by-one lookup, less the cost of computing and hashing the keys)"
(benchInBatches benchmarkBatchSize rng0
(benchElemCheapHashes vbs))
(benchElemHashes vbs))
(fromIntegralChecked benchmarkNumLookups)
hashcost
0
@@ -180,10 +177,10 @@ benchmark name description action n (subtractTime, subtractAlloc) expectedAlloc
putStrLn ""
return (timeNet, allocNet)

-- | (numEntries, sizeFactor, numBits, numHashFuncs)
type BloomFilterSizeInfo = (Integer, Integer, Integer, Integer)
-- | (numEntries, sizeFactor, (BloomSize numBits numHashFuncs))
type BloomFilterSizeInfo = (Integer, Integer, BloomSize)
type SizeBase = Int
type RequestedBitsPerEntry = Integer
type RequestedBitsPerEntry = Double

-- | Calculate the sizes of a realistic LSM style set of Bloom filters, one
-- for each LSM run. This uses base 4, with 4 disk levels, using tiering
@@ -194,28 +191,29 @@ type RequestedBitsPerEntry = Integer
--
lsmStyleBloomFilters :: SizeBase -> RequestedBitsPerEntry -> [BloomFilterSizeInfo]
lsmStyleBloomFilters l1 requestedBitsPerEntry =
[ (numEntries, sizeFactor, nbits, nhashes)
[ (numEntries, sizeFactor, bsize)
| (numEntries, sizeFactor)
<- replicate 8 (2^(l1+0), 1) -- 8 runs at level 1 (tiering)
++ replicate 8 (2^(l1+2), 4) -- 8 runs at level 2 (tiering)
++ replicate 8 (2^(l1+4),16) -- 8 runs at level 3 (tiering)
++ [(2^(l1+8),256)] -- 1 run at level 4 (leveling)
, let nbits = numEntries * requestedBitsPerEntry
nhashes = numHashFunctions nbits numEntries
, let bsize = Bloom.sizeForBits requestedBitsPerEntry (fromIntegral numEntries)
]

totalNumEntries, totalNumBytes :: [BloomFilterSizeInfo] -> Integer
totalNumEntries filterSizes =
sum [ numEntries | (numEntries, _, _, _) <- filterSizes ]
sum [ numEntries | (numEntries, _, _) <- filterSizes ]

totalNumBytes filterSizes =
sum [ nbits | (_,_,nbits,_) <- filterSizes ] `div` 8
sum [ toInteger (Bloom.sizeBits bsize)
| (_,_,bsize) <- filterSizes ]
`div` 8

totalNumEntriesSanityCheck :: SizeBase -> [BloomFilterSizeInfo] -> Bool
totalNumEntriesSanityCheck l1 filterSizes =
totalNumEntries filterSizes
==
sum [ 2^l1 * sizeFactor | (_, sizeFactor, _, _) <- filterSizes ]
sum [ 2^l1 * sizeFactor | (_, sizeFactor, _) <- filterSizes ]


-- | Input environment for benchmarking 'Bloom.elemMany'.
@@ -240,9 +238,7 @@ elemManyEnv :: [BloomFilterSizeInfo]
elemManyEnv filterSizes rng0 =
stToIO $ do
-- create the filters
mbs <- sequence
[ MBloom.new (fromIntegralChecked numHashFuncs) (fromIntegralChecked numBits)
| (_, _, numBits, numHashFuncs) <- filterSizes ]
mbs <- sequence [ Bloom.new bsize | (_, _, bsize) <- filterSizes ]
-- add elements
foldM_
(\rng (i, mb) -> do
@@ -251,13 +247,13 @@ elemManyEnv filterSizes rng0 =
-- insert n elements into filter b
let k :: Word256
(!k, !rng') = uniform rng
MBloom.insert mb (serialiseKey k)
Bloom.insert mb (serialiseKey k)
return rng'
)
rng0
(zip [0 .. totalNumEntries filterSizes - 1]
(cycle [ mb'
| (mb, (_, sizeFactor, _, _)) <- zip mbs filterSizes
| (mb, (_, sizeFactor, _)) <- zip mbs filterSizes
, mb' <- replicate (fromIntegralChecked sizeFactor) mb ]))
V.fromList <$> mapM Bloom.unsafeFreeze mbs

@@ -280,21 +276,21 @@ benchInBatches !b !rng0 !action =

-- | This gives us a combined cost of calculating the series of keys and their
-- hashes (when used with 'benchInBatches').
benchMakeCheapHashes :: Vector (Bloom SerialisedKey) -> BatchBench
benchMakeCheapHashes !_bs !ks =
let khs :: VP.Vector (Bloom.CheapHashes SerialisedKey)
!khs = V.convert (V.map Bloom.makeHashes ks)
benchMakeHashes :: Vector (Bloom SerialisedKey) -> BatchBench
benchMakeHashes !_bs !ks =
let khs :: VP.Vector (Bloom.Hashes SerialisedKey)
!khs = V.convert (V.map Bloom.hashes ks)
in khs `seq` ()

-- | This gives us a combined cost of calculating the series of keys, their
-- hashes, and then using 'Bloom.elemCheapHashes' with each filter (when used
-- hashes, and then using 'Bloom.elemHashes' with each filter (when used
-- with 'benchInBatches').
benchElemCheapHashes :: Vector (Bloom SerialisedKey) -> BatchBench
benchElemCheapHashes !bs !ks =
let khs :: VP.Vector (Bloom.CheapHashes SerialisedKey)
!khs = V.convert (V.map Bloom.makeHashes ks)
benchElemHashes :: Vector (Bloom SerialisedKey) -> BatchBench
benchElemHashes !bs !ks =
let khs :: VP.Vector (Bloom.Hashes SerialisedKey)
!khs = V.convert (V.map Bloom.hashes ks)
in V.foldl'
(\_ b -> VP.foldl'
(\_ kh -> Bloom.elemHashes kh b `seq` ())
(\_ kh -> Bloom.elemHashes b kh `seq` ())
() khs)
() bs
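
For orientation (not part of the diff): the construct-then-query pattern the revised benchmark exercises looks roughly like the sketch below. It assumes only the Blocked API names already used above (sizeForBits, new, insert, unsafeFreeze, hashes, elemHashes) plus runST from the existing Control.Monad.ST import; the helper name exampleBuildAndQuery is made up for illustration.

-- Hypothetical sketch, not code from the PR: size a blocked filter by
-- bits per entry, fill it in ST, then query it via precomputed hashes.
exampleBuildAndQuery :: [SerialisedKey] -> SerialisedKey -> Bool
exampleBuildAndQuery keys k =
    let bsize = Bloom.sizeForBits 10 (length keys)  -- request ~10 bits per entry
        bloom = runST $ do
                  mb <- Bloom.new bsize
                  mapM_ (Bloom.insert mb) keys
                  Bloom.unsafeFreeze mb
        kh    = Bloom.hashes k        -- hash the key once
    in  Bloom.elemHashes bloom kh     -- then test membership using those hashes
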
13 changes: 7 additions & 6 deletions bench/macro/lsm-tree-bench-lookups.hs
@@ -9,9 +9,8 @@ import Control.Monad.Primitive
import Control.Monad.ST.Strict (ST, runST)
import Control.RefCount
import Data.Bits ((.&.))
import Data.BloomFilter (Bloom)
import qualified Data.BloomFilter as Bloom
import qualified Data.BloomFilter.Internal as Bloom
import Data.BloomFilter.Blocked (Bloom)
import qualified Data.BloomFilter.Blocked as Bloom
import Data.Time
import qualified Data.Vector as V
import Data.Vector.Algorithms.Merge as Merge
@@ -167,14 +166,16 @@ benchmarks !caching = withFS $ \hfs hbio -> do
traceMarkerIO "Computing statistics for generated runs"
let numEntries = V.map Run.size runs
numPages = V.map Run.sizeInPages runs
nhashes = V.map Bloom.hashesN blooms
nhashes = V.map (Bloom.sizeHashes . Bloom.size) blooms
bitsPerEntry = V.zipWith
(\b (NumEntries n) -> fromIntegral (Bloom.length b) / fromIntegral n :: Double)
(\b (NumEntries n) ->
fromIntegral (Bloom.sizeBits (Bloom.size b))
/ fromIntegral n :: Double)
blooms
numEntries
stats = V.zip4 numEntries numPages nhashes bitsPerEntry
putStrLn "Actual stats for generated runs:"
putStrLn "(numEntries, numPages, hashesN, bits per entry)"
putStrLn "(numEntries, numPages, numHashes, bits per entry)"
mapM_ print stats

_ <- putStr "Pausing. Drop caches now! When ready, press enter." >> getLine
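
The statistics above amount to reading the BloomSize back out of each constructed filter. As a standalone sketch (using only the accessors from the diff: Bloom.size, Bloom.sizeBits and Bloom.sizeHashes; the helper name filterStats is made up):

-- Hypothetical sketch: recover the effective parameters of a built filter.
filterStats :: Bloom a -> Int -> (Int, Double)
filterStats b numEntries =
    ( fromIntegral (Bloom.sizeHashes bsize)      -- number of hash functions
    , fromIntegral (Bloom.sizeBits bsize)
        / fromIntegral numEntries                -- actual bits per entry
    )
  where
    bsize = Bloom.size b
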
22 changes: 15 additions & 7 deletions bench/micro/Bench/Database/LSMTree/Internal/BloomFilter.hs
@@ -9,9 +9,9 @@ module Bench.Database.LSMTree.Internal.BloomFilter (
) where

import Criterion.Main
import qualified Data.Bifoldable as BiFold
import Data.BloomFilter (Bloom)
import qualified Data.BloomFilter as Bloom
import qualified Data.BloomFilter.Easy as Bloom.Easy
import Data.BloomFilter.Hash (Hashable)
import qualified Data.Foldable as Fold
import Data.Map.Strict (Map)
@@ -38,8 +38,11 @@ benchmarks = bgroup "Bench.Database.LSMTree.Internal.BloomFilter" [
]
, env (constructionEnv 2_500_000) $ \ m ->
bgroup "construction" [
bench "easyList 0.1" $ whnf (constructBloom Bloom.Easy.easyList 0.1) m
, bench "easyList 0.9" $ whnf (constructBloom Bloom.Easy.easyList 0.9) m
bench "FPR = 0.1" $
whnf (constructBloom 0.1) m

, bench "FPR = 0.9" $
whnf (constructBloom 0.9) m
]
]

@@ -57,7 +60,9 @@ elemEnv fpr nbloom nelemsPositive nelemsNegative = do
$ uniformWithoutReplacement @UTxOKey stdgen (nbloom + nelemsNegative)
ys2 = sampleUniformWithReplacement @UTxOKey stdgen' nelemsPositive xs
zs <- generate $ shuffle (ys1 ++ ys2)
pure (Bloom.Easy.easyList fpr (fmap serialiseKey xs), fmap serialiseKey zs)
pure ( Bloom.fromList (Bloom.policyForFPR fpr) (fmap serialiseKey xs)
, fmap serialiseKey zs
)

-- | Used for benchmarking 'Bloom.elem'.
elems :: Hashable a => Bloom a -> [a] -> ()
@@ -74,8 +79,11 @@ constructionEnv n = do

-- | Used for benchmarking the construction of bloom filters from write buffers.
constructBloom ::
(Double -> [SerialisedKey] -> Bloom SerialisedKey)
-> Double
Double
-> Map SerialisedKey SerialisedKey
-> Bloom SerialisedKey
constructBloom mkBloom fpr m = mkBloom fpr (Map.keys m)
constructBloom fpr m =
-- For faster construction, avoid going via lists and use Bloom.create,
-- traversing the map inserting the keys
Bloom.create (Bloom.sizeForFPR fpr (Map.size m)) $ \b ->
BiFold.bifoldMap (\k -> Bloom.insert b k) (\_v -> pure ()) m
Review comment on lines 80 to +89 (Collaborator): Did you check if it's faster/better going through bifoldMap?

Review comment on lines 80 to +89 (Collaborator): Today I learned that ST and IO are monoids... oops 😄
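
For reference, the list-based alternative that the bifoldMap question alludes to would be roughly the following (a sketch, not code from the PR; constructBloomViaList is a made-up name). It uses the same Bloom.create and Bloom.sizeForFPR as above, but materialises the keys with Map.keys instead of traversing the map directly.

-- Hypothetical sketch: same construction, but via an intermediate key list.
constructBloomViaList ::
     Double
  -> Map SerialisedKey SerialisedKey
  -> Bloom SerialisedKey
constructBloomViaList fpr m =
    Bloom.create (Bloom.sizeForFPR fpr (Map.size m)) $ \b ->
      mapM_ (Bloom.insert b) (Map.keys m)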

58 changes: 58 additions & 0 deletions bloomfilter/bench/bloomfilter-bench.hs
@@ -0,0 +1,58 @@
module Main where

import qualified Data.BloomFilter.Blocked as B.Blocked
import qualified Data.BloomFilter.Classic as B.Classic
import Data.BloomFilter.Hash (Hashable (..), hash64)

import Data.Word (Word64)
import System.Random

import Criterion.Main

main :: IO ()
main =
defaultMain [
bgroup "Data.BloomFilter.Classic" [
env newStdGen $ \g0 ->
bench "construct m=1e6 fpr=1%" $
whnf (constructBloom_classic 1_000_000 0.01) g0

, env newStdGen $ \g0 ->
bench "construct m=1e6 fpr=0.1%" $
whnf (constructBloom_classic 1_000_000 0.001) g0

, env newStdGen $ \g0 ->
bench "construct m=1e7 fpr=0.1%" $
whnf (constructBloom_classic 10_000_000 0.001) g0
]
, bgroup "Data.BloomFilter.Blocked" [
env newStdGen $ \g0 ->
bench "construct m=1e6 fpr=1%" $
whnf (constructBloom_blocked 1_000_000 0.01) g0

, env newStdGen $ \g0 ->
bench "construct m=1e6 fpr=0.1%" $
whnf (constructBloom_blocked 1_000_000 0.001) g0

, env newStdGen $ \g0 ->
bench "construct m=1e7 fpr=0.1%" $
whnf (constructBloom_blocked 10_000_000 0.001) g0
]
]

constructBloom_classic :: Int -> Double -> StdGen -> B.Classic.Bloom Word64
constructBloom_classic n fpr g0 =
B.Classic.unfold (B.Classic.sizeForFPR fpr n) (nextElement n) (g0, 0)

constructBloom_blocked :: Int -> Double -> StdGen -> B.Blocked.Bloom Word64
constructBloom_blocked n fpr g0 =
B.Blocked.unfold (B.Blocked.sizeForFPR fpr n) (nextElement n) (g0, 0)

{-# INLINE nextElement #-}
nextElement :: Int -> (StdGen, Int) -> Maybe (Word64, (StdGen, Int))
nextElement !n (!g, !i)
| i >= n = Nothing
| otherwise = Just (x, (g', i+1))
where
(!x, !g') = uniform g
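
The benchmark above measures construction only; querying the resulting filter is a one-liner. A sketch (it assumes the Blocked module exports an element-first elem, in line with the common insert/elem API commit and the notElem used in the spell example below; exampleQuery is a made-up name):

-- Hypothetical sketch: build one filter and test a single membership.
exampleQuery :: Word64 -> StdGen -> Bool
exampleQuery x g0 =
    let b = constructBloom_blocked 1000000 0.01 g0
    in  B.Blocked.elem x b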

41 changes: 0 additions & 41 deletions bloomfilter/examples/Words.hs

This file was deleted.

18 changes: 6 additions & 12 deletions bloomfilter/examples/spell.hs
@@ -1,22 +1,16 @@
{-# LANGUAGE BangPatterns #-}
module Main (main) where

import Control.Exception (IOException, catch)
import Control.Monad (forM_, when)
import Data.Char (isLetter, toLower)
import System.Environment (getArgs)

import Data.BloomFilter.Easy (easyList, notElem)
import Prelude hiding (notElem)
import qualified Data.BloomFilter as B

main :: IO ()
main = do
files <- getArgs
dictionary <- readFile "/usr/share/dict/words" `catchIO` \_ -> return "yes no"
let !bloom = easyList 0.01 (words dictionary)
forM_ files $ \file -> do
ws <- words <$> readFile file
forM_ ws $ \w -> when (w `notElem` bloom) $ putStrLn w

catchIO :: IO a -> (IOException -> IO a) -> IO a
catchIO = catch
dictionary <- readFile "/usr/share/dict/words"
let !bloom = B.fromList (B.policyForFPR 0.01) (words dictionary)
forM_ files $ \file ->
putStrLn . unlines . filter (`B.notElem` bloom) . words
=<< readFile file
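
Seen side by side, the API change in this example is the replacement of the removed Data.BloomFilter.Easy entry point with explicit policy-based construction (both lines taken from the diff above):

-- old (removed): the Easy module picks the size from a target FPR
let !bloom = easyList 0.01 (words dictionary)
-- new (added): the target FPR is first turned into an explicit policy
let !bloom = B.fromList (B.policyForFPR 0.01) (words dictionary)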