Block structured Bloom filter #690

Open · wants to merge 43 commits into base: main

Commits (43)
c08ee6d
bloomfilter: Add a simple construction benchmark
dcoutts Mar 24, 2025
02d8311
bloomfilter: removes Hashes, specialise to CheapHashes scheme
dcoutts Mar 19, 2025
c53370d
bloomfilter: use ByteArray type from primitive package
dcoutts Apr 22, 2025
dbff02a
bloomfilter: combine a couple modules into one
dcoutts Mar 19, 2025
a5fe945
bloomfilter: Remove pointless exported functions
dcoutts Mar 26, 2025
98add8e
bloomfilter: misc minor cleanups of the tests
dcoutts Mar 26, 2025
c1eea06
bloomfilter: change the example spell program into an executable
jorisdral Apr 24, 2025
458ba2e
bloomfilter: Add new size calculation code
dcoutts Mar 26, 2025
e4a9333
bloomfilter: add tests for new size calculation functions
dcoutts Mar 26, 2025
7438e05
bloomfilter: change Easy module to use new size calculations
dcoutts Mar 26, 2025
d30bc58
bloomfilter: remove primes helper program
dcoutts Mar 26, 2025
630995b
bloomfilter: remove old calc functions
dcoutts Mar 26, 2025
1cff8e7
bloomfilter: use new BloomSize type for filter construction functions
dcoutts Mar 29, 2025
b6a4675
bloomfilter: change length to size returning BloomSize
dcoutts Mar 29, 2025
14784e9
bloomfilter: add (de)serialise functions, for better abstraction
dcoutts Mar 29, 2025
3c83476
convert bloomFilterToLBS to use new Bloom.serialise
dcoutts Mar 29, 2025
305bff5
Switch FsPath to FsErrorPath in FileCorruptedError exception type
dcoutts Mar 31, 2025
1927d00
bloomfilter: fix showing counterexamples in prop_verifyFPR
dcoutts Mar 31, 2025
4aa4349
convert bloomFilterFromSBS to use new Bloom.deserialise
dcoutts Apr 4, 2025
cfcffa0
bloomfilter: Move most Data.BloomFilter modules under Data.BloomFilte…
dcoutts Apr 9, 2025
8074c4b
bloomfilter: improve naming in Calc functions
dcoutts Apr 9, 2025
3c5feae
bloomfilter: allow 0 bits in policyForBits
dcoutts Apr 9, 2025
02bf170
bloomfilter: remove last uses of internal modules
dcoutts Apr 10, 2025
6c25424
bloomfilter: use a mildly better version of unfoldr
dcoutts Apr 13, 2025
7435cc7
bloomfilter: establish a common API for hash-based insert and elem
dcoutts Apr 14, 2025
f6a7188
bloomfilter: Add new Data.BloomFilter.Blocked implementation
dcoutts Apr 10, 2025
df0fb16
bloomfilter: generalise tests to cover the Blocked implementation
dcoutts Apr 11, 2025
c83b359
bloomfilter: extend benchmark to blocked implementation
dcoutts Apr 22, 2025
f91a7da
bloomfilter: add bloomfilter-fpr-calc and gnuplot script
dcoutts Apr 15, 2025
52cdac3
bloomfilter: add operation (?) = flip elem
dcoutts Apr 14, 2025
d808621
bloomfilter: export a formatVersion number
dcoutts Apr 13, 2025
685e3d2
Use Bloom.filterVersion number in the lsm-tree serialisation code
dcoutts Apr 13, 2025
0181539
bloomfilter: switch range reduction from division to multiplication
dcoutts Apr 13, 2025
cca509b
Re-export (M)Bloom via D.LSMTree.I.BloomFilter to reduce coupling
dcoutts Apr 10, 2025
b977a02
Switch lsm-tree to use the Blocked bloom filter implementation
dcoutts Apr 15, 2025
4ce0308
bloomfilter: enable the same warnings as other packages
dcoutts Apr 22, 2025
b4231ae
Update bloomfilter/src/Data/BloomFilter/Classic/BitArray.hs
dcoutts Apr 29, 2025
dbd10c8
Update bloomfilter/src/Data/BloomFilter/Blocked/BitArray.hs
dcoutts Apr 29, 2025
1f445ac
Apply suggestions from code review
dcoutts Apr 29, 2025
7965a1f
Apply suggestions from code review
dcoutts Apr 29, 2025
c8c1f91
Apply suggestions from code review
dcoutts Apr 29, 2025
91e9bbc
Apply suggestions from code review
dcoutts Apr 29, 2025
9714096
Apply suggestions from code review
dcoutts Apr 29, 2025
4 changes: 4 additions & 0 deletions .hlint.yaml
@@ -36,6 +36,10 @@
- ignore: {name: "Redundant =="}
- ignore: {name: "Hoist not"}
- ignore: {name: "Use /="}
- ignore: {name: "Use unless"}
- ignore: {name: "Use notElem"}
- ignore: {name: "Use elem"}
- ignore: {name: "Use infix"}

# Specify additional command line arguments
#
66 changes: 31 additions & 35 deletions bench/macro/lsm-tree-bench-bloomfilter.hs
@@ -9,10 +9,8 @@ import Control.Monad
import Control.Monad.ST
import Control.Monad.ST.Unsafe
import Data.Bits ((.&.))
import Data.BloomFilter (Bloom)
import qualified Data.BloomFilter as Bloom
import qualified Data.BloomFilter.Hash as Bloom
import qualified Data.BloomFilter.Mutable as MBloom
import Data.BloomFilter.Blocked (Bloom, BloomSize)
import qualified Data.BloomFilter.Blocked as Bloom
import Data.Time
import Data.Vector (Vector)
import qualified Data.Vector as V
@@ -28,7 +26,6 @@ import Text.Printf (printf)
import Database.LSMTree.Extras.Orphans ()
import Database.LSMTree.Internal.Assertions (fromIntegralChecked)
import qualified Database.LSMTree.Internal.BloomFilterQuery1 as Bloom1
import Database.LSMTree.Internal.RunAcc (numHashFunctions)
import Database.LSMTree.Internal.Serialise (SerialisedKey,
serialiseKey)

@@ -60,7 +57,7 @@ benchmarkNumLookups = 25_000_000
benchmarkBatchSize :: Int
benchmarkBatchSize = 256

benchmarkNumBitsPerEntry :: Integer
benchmarkNumBitsPerEntry :: RequestedBitsPerEntry
benchmarkNumBitsPerEntry = 10

benchmarks :: IO ()
@@ -76,7 +73,7 @@ benchmarks = do
let filterSizes = lsmStyleBloomFilters benchmarkSizeBase
benchmarkNumBitsPerEntry
putStrLn "Bloom filter stats:"
putStrLn "(numEntries, sizeFactor, numBits, numHashFuncs)"
putStrLn "(numEntries, sizeFactor, BloomSize { sizeBits, sizeHashes })"
mapM_ print filterSizes
putStrLn $ "total number of entries:\t " ++ show (totalNumEntries filterSizes)
putStrLn $ "total filter size in bytes:\t " ++ show (totalNumBytes filterSizes)
@@ -94,19 +91,19 @@ benchmarks = do
putStrLn ""

hashcost <-
benchmark "makeCheapHashes"
benchmark "makeHashes"
"(This baseline is the cost of computing and hashing the keys)"
(benchInBatches benchmarkBatchSize rng0
(benchMakeCheapHashes vbs))
(benchMakeHashes vbs))
(fromIntegralChecked benchmarkNumLookups)
(0, 0)
289

_ <-
benchmark "elemCheapHashes"
benchmark "elemHashes"
"(this is the simple one-by-one lookup, less the cost of computing and hashing the keys)"
(benchInBatches benchmarkBatchSize rng0
(benchElemCheapHashes vbs))
(benchElemHashes vbs))
(fromIntegralChecked benchmarkNumLookups)
hashcost
0
@@ -180,10 +177,10 @@ benchmark name description action n (subtractTime, subtractAlloc) expectedAlloc
putStrLn ""
return (timeNet, allocNet)

-- | (numEntries, sizeFactor, numBits, numHashFuncs)
type BloomFilterSizeInfo = (Integer, Integer, Integer, Integer)
-- | (numEntries, sizeFactor, (BloomSize numBits numHashFuncs))
type BloomFilterSizeInfo = (Integer, Integer, BloomSize)
type SizeBase = Int
type RequestedBitsPerEntry = Integer
type RequestedBitsPerEntry = Double

-- | Calculate the sizes of a realistic LSM style set of Bloom filters, one
-- for each LSM run. This uses base 4, with 4 disk levels, using tiering
@@ -194,28 +191,29 @@ type RequestedBitsPerEntry = Integer
--
lsmStyleBloomFilters :: SizeBase -> RequestedBitsPerEntry -> [BloomFilterSizeInfo]
lsmStyleBloomFilters l1 requestedBitsPerEntry =
[ (numEntries, sizeFactor, nbits, nhashes)
[ (numEntries, sizeFactor, bsize)
| (numEntries, sizeFactor)
<- replicate 8 (2^(l1+0), 1) -- 8 runs at level 1 (tiering)
++ replicate 8 (2^(l1+2), 4) -- 8 runs at level 2 (tiering)
++ replicate 8 (2^(l1+4),16) -- 8 runs at level 3 (tiering)
++ [(2^(l1+8),256)] -- 1 run at level 4 (leveling)
, let nbits = numEntries * requestedBitsPerEntry
nhashes = numHashFunctions nbits numEntries
, let bsize = Bloom.sizeForBits requestedBitsPerEntry (fromIntegral numEntries)
]

totalNumEntries, totalNumBytes :: [BloomFilterSizeInfo] -> Integer
totalNumEntries filterSizes =
sum [ numEntries | (numEntries, _, _, _) <- filterSizes ]
sum [ numEntries | (numEntries, _, _) <- filterSizes ]

totalNumBytes filterSizes =
sum [ nbits | (_,_,nbits,_) <- filterSizes ] `div` 8
sum [ toInteger (Bloom.sizeBits bsize)
| (_,_,bsize) <- filterSizes ]
`div` 8

totalNumEntriesSanityCheck :: SizeBase -> [BloomFilterSizeInfo] -> Bool
totalNumEntriesSanityCheck l1 filterSizes =
totalNumEntries filterSizes
==
sum [ 2^l1 * sizeFactor | (_, sizeFactor, _, _) <- filterSizes ]
sum [ 2^l1 * sizeFactor | (_, sizeFactor, _) <- filterSizes ]


-- | Input environment for benchmarking 'Bloom.elemMany'.
@@ -240,9 +238,7 @@ elemManyEnv :: [BloomFilterSizeInfo]
elemManyEnv filterSizes rng0 =
stToIO $ do
-- create the filters
mbs <- sequence
[ MBloom.new (fromIntegralChecked numHashFuncs) (fromIntegralChecked numBits)
| (_, _, numBits, numHashFuncs) <- filterSizes ]
mbs <- sequence [ Bloom.new bsize | (_, _, bsize) <- filterSizes ]
-- add elements
foldM_
(\rng (i, mb) -> do
@@ -251,13 +247,13 @@ elemManyEnv filterSizes rng0 =
-- insert n elements into filter b
let k :: Word256
(!k, !rng') = uniform rng
MBloom.insert mb (serialiseKey k)
Bloom.insert mb (serialiseKey k)
return rng'
)
rng0
(zip [0 .. totalNumEntries filterSizes - 1]
(cycle [ mb'
| (mb, (_, sizeFactor, _, _)) <- zip mbs filterSizes
| (mb, (_, sizeFactor, _)) <- zip mbs filterSizes
, mb' <- replicate (fromIntegralChecked sizeFactor) mb ]))
V.fromList <$> mapM Bloom.unsafeFreeze mbs

@@ -280,21 +276,21 @@ benchInBatches !b !rng0 !action =

-- | This gives us a combined cost of calculating the series of keys and their
-- hashes (when used with 'benchInBatches').
benchMakeCheapHashes :: Vector (Bloom SerialisedKey) -> BatchBench
benchMakeCheapHashes !_bs !ks =
let khs :: VP.Vector (Bloom.CheapHashes SerialisedKey)
!khs = V.convert (V.map Bloom.makeHashes ks)
benchMakeHashes :: Vector (Bloom SerialisedKey) -> BatchBench
benchMakeHashes !_bs !ks =
let khs :: VP.Vector (Bloom.Hashes SerialisedKey)
!khs = V.convert (V.map Bloom.hashes ks)
in khs `seq` ()

-- | This gives us a combined cost of calculating the series of keys, their
-- hashes, and then using 'Bloom.elemCheapHashes' with each filter (when used
-- hashes, and then using 'Bloom.elemHashes' with each filter (when used
-- with 'benchInBatches').
benchElemCheapHashes :: Vector (Bloom SerialisedKey) -> BatchBench
benchElemCheapHashes !bs !ks =
let khs :: VP.Vector (Bloom.CheapHashes SerialisedKey)
!khs = V.convert (V.map Bloom.makeHashes ks)
benchElemHashes :: Vector (Bloom SerialisedKey) -> BatchBench
benchElemHashes !bs !ks =
let khs :: VP.Vector (Bloom.Hashes SerialisedKey)
!khs = V.convert (V.map Bloom.hashes ks)
in V.foldl'
(\_ b -> VP.foldl'
(\_ kh -> Bloom.elemHashes kh b `seq` ())
(\_ kh -> Bloom.elemHashes b kh `seq` ())
() khs)
() bs
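
For orientation (not part of the diff): the construct-then-query pattern the revised benchmark exercises looks roughly like the sketch below. It assumes only the Blocked API names already used above (sizeForBits, new, insert, unsafeFreeze, hashes, elemHashes) plus runST from the existing Control.Monad.ST import; the helper name exampleBuildAndQuery is made up for illustration.

-- Hypothetical sketch, not code from the PR: size a blocked filter by
-- bits per entry, fill it in ST, then query it via precomputed hashes.
exampleBuildAndQuery :: [SerialisedKey] -> SerialisedKey -> Bool
exampleBuildAndQuery keys k =
    let bsize = Bloom.sizeForBits 10 (length keys)  -- request ~10 bits per entry
        bloom = runST $ do
                  mb <- Bloom.new bsize
                  mapM_ (Bloom.insert mb) keys
                  Bloom.unsafeFreeze mb
        kh    = Bloom.hashes k        -- hash the key once
    in  Bloom.elemHashes bloom kh     -- then test membership using those hashes
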
13 changes: 7 additions & 6 deletions bench/macro/lsm-tree-bench-lookups.hs
@@ -9,9 +9,8 @@ import Control.Monad.Primitive
import Control.Monad.ST.Strict (ST, runST)
import Control.RefCount
import Data.Bits ((.&.))
import Data.BloomFilter (Bloom)
import qualified Data.BloomFilter as Bloom
import qualified Data.BloomFilter.Internal as Bloom
import Data.BloomFilter.Blocked (Bloom)
import qualified Data.BloomFilter.Blocked as Bloom
import Data.Time
import qualified Data.Vector as V
import Data.Vector.Algorithms.Merge as Merge
@@ -167,14 +166,16 @@ benchmarks !caching = withFS $ \hfs hbio -> do
traceMarkerIO "Computing statistics for generated runs"
let numEntries = V.map Run.size runs
numPages = V.map Run.sizeInPages runs
nhashes = V.map Bloom.hashesN blooms
nhashes = V.map (Bloom.sizeHashes . Bloom.size) blooms
bitsPerEntry = V.zipWith
(\b (NumEntries n) -> fromIntegral (Bloom.length b) / fromIntegral n :: Double)
(\b (NumEntries n) ->
fromIntegral (Bloom.sizeBits (Bloom.size b))
/ fromIntegral n :: Double)
blooms
numEntries
stats = V.zip4 numEntries numPages nhashes bitsPerEntry
putStrLn "Actual stats for generated runs:"
putStrLn "(numEntries, numPages, hashesN, bits per entry)"
putStrLn "(numEntries, numPages, numHashes, bits per entry)"
mapM_ print stats

_ <- putStr "Pausing. Drop caches now! When ready, press enter." >> getLine
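
The statistics above amount to reading the BloomSize back out of each constructed filter. As a standalone sketch (using only the accessors from the diff: Bloom.size, Bloom.sizeBits and Bloom.sizeHashes; the helper name filterStats is made up):

-- Hypothetical sketch: recover the effective parameters of a built filter.
filterStats :: Bloom a -> Int -> (Int, Double)
filterStats b numEntries =
    ( fromIntegral (Bloom.sizeHashes bsize)      -- number of hash functions
    , fromIntegral (Bloom.sizeBits bsize)
        / fromIntegral numEntries                -- actual bits per entry
    )
  where
    bsize = Bloom.size b
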
22 changes: 15 additions & 7 deletions bench/micro/Bench/Database/LSMTree/Internal/BloomFilter.hs
@@ -9,9 +9,9 @@ module Bench.Database.LSMTree.Internal.BloomFilter (
) where

import Criterion.Main
import qualified Data.Bifoldable as BiFold
import Data.BloomFilter (Bloom)
import qualified Data.BloomFilter as Bloom
import qualified Data.BloomFilter.Easy as Bloom.Easy
import Data.BloomFilter.Hash (Hashable)
import qualified Data.Foldable as Fold
import Data.Map.Strict (Map)
@@ -38,8 +38,11 @@ benchmarks = bgroup "Bench.Database.LSMTree.Internal.BloomFilter" [
]
, env (constructionEnv 2_500_000) $ \ m ->
bgroup "construction" [
bench "easyList 0.1" $ whnf (constructBloom Bloom.Easy.easyList 0.1) m
, bench "easyList 0.9" $ whnf (constructBloom Bloom.Easy.easyList 0.9) m
bench "FPR = 0.1" $
whnf (constructBloom 0.1) m

, bench "FPR = 0.9" $
whnf (constructBloom 0.9) m
]
]

@@ -57,7 +60,9 @@ elemEnv fpr nbloom nelemsPositive nelemsNegative = do
$ uniformWithoutReplacement @UTxOKey stdgen (nbloom + nelemsNegative)
ys2 = sampleUniformWithReplacement @UTxOKey stdgen' nelemsPositive xs
zs <- generate $ shuffle (ys1 ++ ys2)
pure (Bloom.Easy.easyList fpr (fmap serialiseKey xs), fmap serialiseKey zs)
pure ( Bloom.fromList (Bloom.policyForFPR fpr) (fmap serialiseKey xs)
, fmap serialiseKey zs
)

-- | Used for benchmarking 'Bloom.elem'.
elems :: Hashable a => Bloom a -> [a] -> ()
@@ -74,8 +79,11 @@ constructionEnv n = do

-- | Used for benchmarking the construction of bloom filters from write buffers.
constructBloom ::
(Double -> [SerialisedKey] -> Bloom SerialisedKey)
-> Double
Double
-> Map SerialisedKey SerialisedKey
-> Bloom SerialisedKey
constructBloom mkBloom fpr m = mkBloom fpr (Map.keys m)
constructBloom fpr m =
-- For faster construction, avoid going via lists and use Bloom.create,
-- traversing the map inserting the keys
Bloom.create (Bloom.sizeForFPR fpr (Map.size m)) $ \b ->
BiFold.bifoldMap (\k -> Bloom.insert b k) (\_v -> pure ()) m
Review comment on lines 80 to +89 (Collaborator): Did you check if it's faster/better going through bifoldMap?

Review comment on lines 80 to +89 (Collaborator): Today I learned that ST and IO are monoids... oops 😄
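
For reference, the list-based alternative that the bifoldMap question alludes to would be roughly the following (a sketch, not code from the PR; constructBloomViaList is a made-up name). It uses the same Bloom.create and Bloom.sizeForFPR as above, but materialises the keys with Map.keys instead of traversing the map directly.

-- Hypothetical sketch: same construction, but via an intermediate key list.
constructBloomViaList ::
     Double
  -> Map SerialisedKey SerialisedKey
  -> Bloom SerialisedKey
constructBloomViaList fpr m =
    Bloom.create (Bloom.sizeForFPR fpr (Map.size m)) $ \b ->
      mapM_ (Bloom.insert b) (Map.keys m)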

58 changes: 58 additions & 0 deletions bloomfilter/bench/bloomfilter-bench.hs
@@ -0,0 +1,58 @@
module Main where

import qualified Data.BloomFilter.Blocked as B.Blocked
import qualified Data.BloomFilter.Classic as B.Classic
import Data.BloomFilter.Hash (Hashable (..), hash64)

import Data.Word (Word64)
import System.Random

import Criterion.Main

main :: IO ()
main =
defaultMain [
bgroup "Data.BloomFilter.Classic" [
env newStdGen $ \g0 ->
bench "construct m=1e6 fpr=1%" $
whnf (constructBloom_classic 1_000_000 0.01) g0

, env newStdGen $ \g0 ->
bench "construct m=1e6 fpr=0.1%" $
whnf (constructBloom_classic 1_000_000 0.001) g0

, env newStdGen $ \g0 ->
bench "construct m=1e7 fpr=0.1%" $
whnf (constructBloom_classic 10_000_000 0.001) g0
]
, bgroup "Data.BloomFilter.Blocked" [
env newStdGen $ \g0 ->
bench "construct m=1e6 fpr=1%" $
whnf (constructBloom_blocked 1_000_000 0.01) g0

, env newStdGen $ \g0 ->
bench "construct m=1e6 fpr=0.1%" $
whnf (constructBloom_blocked 1_000_000 0.001) g0

, env newStdGen $ \g0 ->
bench "construct m=1e7 fpr=0.1%" $
whnf (constructBloom_blocked 10_000_000 0.001) g0
]
]

constructBloom_classic :: Int -> Double -> StdGen -> B.Classic.Bloom Word64
constructBloom_classic n fpr g0 =
B.Classic.unfold (B.Classic.sizeForFPR fpr n) (nextElement n) (g0, 0)

constructBloom_blocked :: Int -> Double -> StdGen -> B.Blocked.Bloom Word64
constructBloom_blocked n fpr g0 =
B.Blocked.unfold (B.Blocked.sizeForFPR fpr n) (nextElement n) (g0, 0)

{-# INLINE nextElement #-}
nextElement :: Int -> (StdGen, Int) -> Maybe (Word64, (StdGen, Int))
nextElement !n (!g, !i)
| i >= n = Nothing
| otherwise = Just (x, (g', i+1))
where
(!x, !g') = uniform g
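
The benchmark above measures construction only; querying the resulting filter is a one-liner. A sketch (it assumes the Blocked module exports an element-first elem, in line with the common insert/elem API commit and the notElem used in the spell example below; exampleQuery is a made-up name):

-- Hypothetical sketch: build one filter and test a single membership.
exampleQuery :: Word64 -> StdGen -> Bool
exampleQuery x g0 =
    let b = constructBloom_blocked 1000000 0.01 g0
    in  B.Blocked.elem x b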

41 changes: 0 additions & 41 deletions bloomfilter/examples/Words.hs

This file was deleted.

18 changes: 6 additions & 12 deletions bloomfilter/examples/spell.hs
@@ -1,22 +1,16 @@
{-# LANGUAGE BangPatterns #-}
module Main (main) where

import Control.Exception (IOException, catch)
import Control.Monad (forM_, when)
import Data.Char (isLetter, toLower)
import System.Environment (getArgs)

import Data.BloomFilter.Easy (easyList, notElem)
import Prelude hiding (notElem)
import qualified Data.BloomFilter as B

main :: IO ()
main = do
files <- getArgs
dictionary <- readFile "/usr/share/dict/words" `catchIO` \_ -> return "yes no"
let !bloom = easyList 0.01 (words dictionary)
forM_ files $ \file -> do
ws <- words <$> readFile file
forM_ ws $ \w -> when (w `notElem` bloom) $ putStrLn w

catchIO :: IO a -> (IOException -> IO a) -> IO a
catchIO = catch
dictionary <- readFile "/usr/share/dict/words"
let !bloom = B.fromList (B.policyForFPR 0.01) (words dictionary)
forM_ files $ \file ->
putStrLn . unlines . filter (`B.notElem` bloom) . words
=<< readFile file
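
Seen side by side, the API change in this example is the replacement of the removed Data.BloomFilter.Easy entry point with explicit policy-based construction (both lines taken from the diff above):

-- old (removed): the Easy module picks the size from a target FPR
let !bloom = easyList 0.01 (words dictionary)
-- new (added): the target FPR is first turned into an explicit policy
let !bloom = B.fromList (B.policyForFPR 0.01) (words dictionary)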