From cd3327367f9cf0ecc3e09f3bb4439e5f37e67801 Mon Sep 17 00:00:00 2001 From: shash256 <111925100+shash256@users.noreply.github.com> Date: Mon, 9 Dec 2024 13:22:08 +0400 Subject: [PATCH] Initial commit --- .gitattributes | 2 + .gitignore | 10 + LICENSE | 20 + README.md | 122 ++++++ benches/bench.nim | 123 ++++++ benches/bench_results.txt | 495 +++++++++++++++++++++++ benches/bench_results_nimStringHash2.txt | 495 +++++++++++++++++++++++ benches/config.nims | 1 + bloom.nimble | 9 + src/bloom.nim | 183 +++++++++ src/murmur3.c | 314 ++++++++++++++ src/murmur3.h | 21 + src/private/probabilities.nim | 103 +++++ tests/config.nims | 1 + tests/test.nim | 151 +++++++ 15 files changed, 2050 insertions(+) create mode 100644 .gitattributes create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 README.md create mode 100644 benches/bench.nim create mode 100644 benches/bench_results.txt create mode 100644 benches/bench_results_nimStringHash2.txt create mode 100644 benches/config.nims create mode 100644 bloom.nimble create mode 100644 src/bloom.nim create mode 100644 src/murmur3.c create mode 100644 src/murmur3.h create mode 100644 src/private/probabilities.nim create mode 100644 tests/config.nims create mode 100644 tests/test.nim diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..dfe0770 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +# Auto detect text files and perform LF normalization +* text=auto diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f595a4e --- /dev/null +++ b/.gitignore @@ -0,0 +1,10 @@ +nimcache +nimcache/* +tests/test +benches/bench +benches/bench_arch_end +bloom +*.html +*.css +.DS_Store +src/.DS_Store \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..10ea866 --- /dev/null +++ b/LICENSE @@ -0,0 +1,20 @@ +The MIT License (MIT) + +Copyright (c) 2013 Nick Greenfield + +Permission is hereby granted, free of charge, to any person obtaining a 
copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..68e6ec1 --- /dev/null +++ b/README.md @@ -0,0 +1,122 @@ +# nim-bloom +***NOTE: THIS IMPLEMENTATION IS NOT PEER-REVIEWED YET. PLEASE USE WITH CAUTION.*** + +A high-performance Bloom filter implementation in Nim offering standard and custom hash function options with different performance characteristics and false positive rates. 
+ +## Features + +- Fast string element insertion and lookup +- Configurable error rates +- Choice between standard Nim hash and custom MurmurHash3 (128-bit or 32-bit) +- Optimized for supporting different use cases of speed and accuracy +- Comprehensive test suite and benchmarks + +## Usage + +Basic usage (defaults to MurmurHash3_128): +```nim +import bloom2 + +# Initialize with default hash (MurmurHash3_128) +var bf = initializeBloomFilter(capacity = 10000, errorRate = 0.01) + +# Or explicitly specify hash type +var bf32 = initializeBloomFilter( + capacity = 10000, + errorRate = 0.01, + hashType = htMurmur32 # Use 32-bit implementation +) + +# Basic operations +bf.insert("test") +assert bf.lookup("test") +``` + +## Hash Function Selection + +1. Use MurmurHash3_128 (default) when: + - You need the best balance of performance and accuracy + - Memory isn't severely constrained + - Working with large datasets + - False positive rates are important + +2. Use MurmurHash3_32 when: + - Running on 32-bit systems + - Memory is constrained + - Working with smaller datasets + - String concatenation overhead for second hash, causing higher insertion and lookup times, is acceptable. + +3. 
Use NimHash when: + - Consistency with Nim's hashing is important + - Working with smaller datasets where performance is less critical + - Future availability of better hash functions or performant implementations + +Nim's Hash Implementation: + - Default (no flags): Uses FarmHash implementation + - With `-d:nimStringHash2`: Uses Nim's MurmurHash3_32 implementation + - Our implementation allows explicit choice regardless of compilation flags and our MurmurHash3_32 performs better because of directly using a native C Implementation + +## Performance Characteristics +### For 1M items - Random Strings +``` +Insertion Speed: +MurmurHash3_128: ~6.8M ops/sec +MurmurHash3_32: ~5.9M ops/sec +FarmHash: ~2.1M ops/sec + +False Positive Rates: +MurmurHash3_128: ~0.84% +MurmurHash3_32: ~0.83% +FarmHash: ~0.82% +``` + +These measurements show MurmurHash3_128's balanced performance profile, offering best speed and competitive false positive rates. + +Performance will vary based on: +- Choice of hash function +- Hardware specifications +- Data size and memory access patterns (inside vs outside cache) +- Compiler optimizations + +For detailed benchmarks across different data patterns and sizes, see [benches](benches/). + +## Implementation Details + +### Double Hashing Technique +This implementation uses the Kirsch-Mitzenmacher method to generate k hash values from two initial hashes. The implementation varies by hash type: + +1. MurmurHash3_128: +```nim +h(i) = abs((hash1 + i * hash2) mod m) +``` +- Uses both 64-bit hashes from 128-bit output +- Natural double-hash implementation + +2. 
MurmurHash3_32: +```nim +let baseHash = murmurHash32(item, 0'u32) +let secondHash = murmurHash32(item & " b", 0'u32) +``` +- Uses string concatenation by default for the second hash +- Bit Rotation for second hash provides sufficient randomness in some use cases while being much faster than string concatenation (but results in higher FP rate) +- Choose between bit rotation or string concatenation as per your use-case. + +3. Nim's Hash: +```nim + let + hashA = abs(hash(item)) mod maxValue + hashB = abs(hash(item & " b")) mod maxValue + h(i) = abs((hashA + n * hashB)) mod maxValue +``` +- Farm Hash or Nim's Murmur Hash based (if compilation flag is passed) +- Uses string concatenation by default. +- Lower FP rate than bit rotation but comes at the cost of higher insertion and lookup times. + +*Tip:* Bit rotation values can be configured as well. Use odd (preferably prime) rotation amounts for better mixing: 7, 11, 13, 17 for 32-bit; 21, 23, 27, 33 for 64-bit. Smaller rotations provide less mixing but are faster than larger rotations. 
+ +## Testing + +Run the test suite: +```bash +nimble test +``` \ No newline at end of file diff --git a/benches/bench.nim b/benches/bench.nim new file mode 100644 index 0000000..82d7ca9 --- /dev/null +++ b/benches/bench.nim @@ -0,0 +1,123 @@ +import times, random, strutils +include bloom + +type + DataPattern = enum + dpRandom, # Random strings + dpSequential, # Sequential numbers + dpFixed, # Fixed length strings + dpLong, # Long strings + dpSpecial # Strings with special characters + +type + BenchmarkResult = tuple[ + insertTime: float, + lookupTime: float, + falsePositives: int + ] + +proc generateBenchData(pattern: DataPattern, size: int, isLookupData: bool = false): seq[string] = + result = newSeq[string](size) + let offset = if isLookupData: size * 2 else: 0 # Ensure lookup data is well separated + + case pattern: + of dpRandom: + for i in 0..= 1.0.0" diff --git a/src/bloom.nim b/src/bloom.nim new file mode 100644 index 0000000..ba3bdc8 --- /dev/null +++ b/src/bloom.nim @@ -0,0 +1,183 @@ +from math import ceil, ln, pow, round +import hashes +import strutils +import private/probabilities + +# Import MurmurHash3 code with both 128-bit and 32-bit implementations +{.compile: "murmur3.c".} + +type + HashType* = enum + htMurmur128, # Default: MurmurHash3_x64_128 + htMurmur32, # MurmurHash3_x86_32 + htNimHash # Nim's Hash (currently Farm Hash) + + BloomFilterError* = object of CatchableError + + MurmurHashes = array[0..1, int] + + BloomFilter* = object + capacity*: int + errorRate*: float + kHashes*: int + mBits*: int + intArray: seq[int] + hashType*: HashType + +{.push overflowChecks: off.} # Turn off overflow checks for hashing operations + +proc rawMurmurHash128(key: cstring, len: int, seed: uint32, + outHashes: var MurmurHashes): void {. + importc: "MurmurHash3_x64_128".} + +proc rawMurmurHash32(key: cstring, len: int, seed: uint32, + outHashes: ptr uint32): void {. 
+ importc: "MurmurHash3_x86_32".} + +proc murmurHash128(key: string, seed = 0'u32): MurmurHashes = + var hashResult: MurmurHashes + rawMurmurHash128(key, key.len, seed, hashResult) + hashResult + +proc murmurHash32(key: string, seed = 0'u32): uint32 = + var result: uint32 + rawMurmurHash32(key, key.len, seed, addr result) + result + +proc hashN(item: string, n: int, maxValue: int): int = + ## Get the nth hash using Nim's built-in hash function using + ## the double hashing technique from Kirsch and Mitzenmacher, 2008: + ## http://www.eecs.harvard.edu/~kirsch/pubs/bbbf/rsa.pdf + let + hashA = abs(hash(item)) mod maxValue # Use abs to handle negative hashes + hashB = abs(hash(item & " b")) mod maxValue # string concatenation + abs((hashA + n * hashB)) mod maxValue + # # Use bit rotation for second hash instead of string concatenation if speed if preferred over FP-rate + # # Rotate left by 21 bits (lower the rotation, higher the speed but higher the FP-rate too) + # hashB = abs( + # ((h shl 21) or (h shr (sizeof(int) * 8 - 21))) + # ) mod maxValue + # abs((hashA + n.int64 * hashB)) mod maxValue + +{.pop.} + +proc getMOverNBitsForK(k: int, targetError: float, + probabilityTable = kErrors): int = + ## Returns the optimal number of m/n bits for a given k. + if k notin 0..12: + raise newException(BloomFilterError, + "K must be <= 12 if forceNBitsPerElem is not also specified.") + + for mOverN in 2..probabilityTable[k].high: + if probabilityTable[k][mOverN] < targetError: + return mOverN + + raise newException(BloomFilterError, + "Specified value of k and error rate not achievable using less than 4 bytes / element.") + +proc initializeBloomFilter*(capacity: int, errorRate: float, k = 0, + forceNBitsPerElem = 0, + hashType = htMurmur128): BloomFilter = + ## Initializes a Bloom filter with specified parameters. 
+ ## + ## Parameters: + ## - capacity: Expected number of elements to be inserted + ## - errorRate: Desired false positive rate (e.g., 0.01 for 1%) + ## - k: Optional number of hash functions. If 0, calculated optimally + ## See http://pages.cs.wisc.edu/~cao/papers/summary-cache/node8.html for + ## useful tables on k and m/n (n bits per element) combinations. + ## - forceNBitsPerElem: Optional override for bits per element + ## - hashType: Choose hash function: + ## * htMurmur128: MurmurHash3_x64_128 (default) - recommended + ## * htMurmur32: MurmurHash3_x86_32 + ## * htNimHash: Nim's Hash + var + kHashes: int + nBitsPerElem: int + + if k < 1: # Calculate optimal k and use that + let bitsPerElem = ceil(-1.0 * (ln(errorRate) / (pow(ln(2.float), 2)))) + kHashes = round(ln(2.float) * bitsPerElem).int + nBitsPerElem = round(bitsPerElem).int + else: # Use specified k if possible + if forceNBitsPerElem < 1: # Use lookup table + nBitsPerElem = getMOverNBitsForK(k = k, targetError = errorRate) + else: + nBitsPerElem = forceNBitsPerElem + kHashes = k + + let + mBits = capacity * nBitsPerElem + mInts = 1 + mBits div (sizeof(int) * 8) + + BloomFilter( + capacity: capacity, + errorRate: errorRate, + kHashes: kHashes, + mBits: mBits, + intArray: newSeq[int](mInts), + hashType: hashType + ) + +proc `$`*(bf: BloomFilter): string = + ## Prints the configuration of the Bloom filter. + let hashType = case bf.hashType + of htMurmur128: "MurmurHash3_x64_128" + of htMurmur32: "MurmurHash3_x86_32" + of htNimHash: "NimHashHash" + + "Bloom filter with $1 capacity, $2 error rate, $3 hash functions, and requiring $4 bits of memory. Using $5." 
% + [$bf.capacity, + formatFloat(bf.errorRate, format = ffScientific, precision = 1), + $bf.kHashes, + $(bf.mBits div bf.capacity), + hashType] + +{.push overflowChecks: off.} # Turn off overflow checks for hash computations + +proc computeHashes(bf: BloomFilter, item: string): seq[int] = + var hashes = newSeq[int](bf.kHashes) + + case bf.hashType + of htMurmur128: + let murmurHashes = murmurHash128(item, 0'u32) + for i in 0..> (32 - r)); +} + +static inline FORCE_INLINE uint64_t rotl64 ( uint64_t x, int8_t r ) +{ + return (x << r) | (x >> (64 - r)); +} + +#define ROTL32(x,y) rotl32(x,y) +#define ROTL64(x,y) rotl64(x,y) + +#define BIG_CONSTANT(x) (x##LLU) + +//----------------------------------------------------------------------------- +// Block read - if your platform needs to do endian-swapping or can only +// handle aligned reads, do the conversion here + +#define getblock(p, i) (p[i]) + +//----------------------------------------------------------------------------- +// Finalization mix - force all bits of a hash block to avalanche + +static inline FORCE_INLINE uint32_t fmix32 ( uint32_t h ) +{ + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + + return h; +} + +//---------- + +static inline FORCE_INLINE uint64_t fmix64 ( uint64_t k ) +{ + k ^= k >> 33; + k *= BIG_CONSTANT(0xff51afd7ed558ccd); + k ^= k >> 33; + k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53); + k ^= k >> 33; + + return k; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_32 ( const void * key, int len, + uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 4; + int i; + + uint32_t h1 = seed; + + uint32_t c1 = 0xcc9e2d51; + uint32_t c2 = 0x1b873593; + + //---------- + // body + + const uint32_t * blocks = (const uint32_t *)(data + nblocks*4); + + for(i = -nblocks; i; i++) + { + uint32_t k1 = getblock(blocks,i); + + k1 *= c1; + k1 = ROTL32(k1,15); + k1 *= c2; 
+ + h1 ^= k1; + h1 = ROTL32(h1,13); + h1 = h1*5+0xe6546b64; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*4); + + uint32_t k1 = 0; + + switch(len & 3) + { + case 3: k1 ^= tail[2] << 16; + case 2: k1 ^= tail[1] << 8; + case 1: k1 ^= tail[0]; + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; + + h1 = fmix32(h1); + + *(uint32_t*)out = h1; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_128 ( const void * key, const int len, + uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 16; + int i; + + uint32_t h1 = seed; + uint32_t h2 = seed; + uint32_t h3 = seed; + uint32_t h4 = seed; + + uint32_t c1 = 0x239b961b; + uint32_t c2 = 0xab0e9789; + uint32_t c3 = 0x38b34ae5; + uint32_t c4 = 0xa1e38b93; + + //---------- + // body + + const uint32_t * blocks = (const uint32_t *)(data + nblocks*16); + + for(i = -nblocks; i; i++) + { + uint32_t k1 = getblock(blocks,i*4+0); + uint32_t k2 = getblock(blocks,i*4+1); + uint32_t k3 = getblock(blocks,i*4+2); + uint32_t k4 = getblock(blocks,i*4+3); + + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + + h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b; + + k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; + + h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747; + + k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; + + h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35; + + k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; + + h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*16); + + uint32_t k1 = 0; + uint32_t k2 = 0; + uint32_t k3 = 0; + uint32_t k4 = 0; + + switch(len & 15) + { + case 15: k4 ^= tail[14] << 16; + case 14: k4 ^= tail[13] << 8; + case 13: k4 ^= tail[12] << 0; + k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; + + case 
12: k3 ^= tail[11] << 24; + case 11: k3 ^= tail[10] << 16; + case 10: k3 ^= tail[ 9] << 8; + case 9: k3 ^= tail[ 8] << 0; + k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; + + case 8: k2 ^= tail[ 7] << 24; + case 7: k2 ^= tail[ 6] << 16; + case 6: k2 ^= tail[ 5] << 8; + case 5: k2 ^= tail[ 4] << 0; + k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; + + case 4: k1 ^= tail[ 3] << 24; + case 3: k1 ^= tail[ 2] << 16; + case 2: k1 ^= tail[ 1] << 8; + case 1: k1 ^= tail[ 0] << 0; + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len; + + h1 += h2; h1 += h3; h1 += h4; + h2 += h1; h3 += h1; h4 += h1; + + h1 = fmix32(h1); + h2 = fmix32(h2); + h3 = fmix32(h3); + h4 = fmix32(h4); + + h1 += h2; h1 += h3; h1 += h4; + h2 += h1; h3 += h1; h4 += h1; + + ((uint32_t*)out)[0] = h1; + ((uint32_t*)out)[1] = h2; + ((uint32_t*)out)[2] = h3; + ((uint32_t*)out)[3] = h4; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x64_128 ( const void * key, const int len, + const uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 16; + int i; + + uint64_t h1 = seed; + uint64_t h2 = seed; + + uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5); + uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f); + + //---------- + // body + + const uint64_t * blocks = (const uint64_t *)(data); + + for(i = 0; i < nblocks; i++) + { + uint64_t k1 = getblock(blocks,i*2+0); + uint64_t k2 = getblock(blocks,i*2+1); + + k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; + + h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729; + + k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; + + h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*16); + + uint64_t k1 = 0; + uint64_t k2 = 0; + + switch(len & 15) + { + case 15: k2 ^= (uint64_t)(tail[14]) << 48; + 
case 14: k2 ^= (uint64_t)(tail[13]) << 40; + case 13: k2 ^= (uint64_t)(tail[12]) << 32; + case 12: k2 ^= (uint64_t)(tail[11]) << 24; + case 11: k2 ^= (uint64_t)(tail[10]) << 16; + case 10: k2 ^= (uint64_t)(tail[ 9]) << 8; + case 9: k2 ^= (uint64_t)(tail[ 8]) << 0; + k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; + + case 8: k1 ^= (uint64_t)(tail[ 7]) << 56; + case 7: k1 ^= (uint64_t)(tail[ 6]) << 48; + case 6: k1 ^= (uint64_t)(tail[ 5]) << 40; + case 5: k1 ^= (uint64_t)(tail[ 4]) << 32; + case 4: k1 ^= (uint64_t)(tail[ 3]) << 24; + case 3: k1 ^= (uint64_t)(tail[ 2]) << 16; + case 2: k1 ^= (uint64_t)(tail[ 1]) << 8; + case 1: k1 ^= (uint64_t)(tail[ 0]) << 0; + k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; h2 ^= len; + + h1 += h2; + h2 += h1; + + h1 = fmix64(h1); + h2 = fmix64(h2); + + h1 += h2; + h2 += h1; + + ((uint64_t*)out)[0] = h1; + ((uint64_t*)out)[1] = h2; +} + +//----------------------------------------------------------------------------- diff --git a/src/murmur3.h b/src/murmur3.h new file mode 100644 index 0000000..6928384 --- /dev/null +++ b/src/murmur3.h @@ -0,0 +1,21 @@ +//----------------------------------------------------------------------------- +// MurmurHash3 was written by Austin Appleby, and is placed in the +// public domain. The author hereby disclaims copyright to this source +// code. 
+ +#ifndef _MURMURHASH3_H_ +#define _MURMURHASH3_H_ + +#include + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_32 (const void *key, int len, uint32_t seed, void *out); + +void MurmurHash3_x86_128(const void *key, int len, uint32_t seed, void *out); + +void MurmurHash3_x64_128(const void *key, int len, uint32_t seed, void *out); + +//----------------------------------------------------------------------------- + +#endif // _MURMURHASH3_H_ \ No newline at end of file diff --git a/src/private/probabilities.nim b/src/private/probabilities.nim new file mode 100644 index 0000000..59175f2 --- /dev/null +++ b/src/private/probabilities.nim @@ -0,0 +1,103 @@ +# +# ### Probability table declaration, in private/ for readability ### +# Table for k hashes from 1..12 from http://pages.cs.wisc.edu/~cao/papers/summary-cache/node8.html +# Iterate along the sequence at position [k] until the error rate is < specified, otherwise +# raise an error. 
+# + +type + TErrorForK = seq[float] + TAllErrorRates* = array[0..12, TErrorForK] + +var kErrors*: TAllErrorRates + +kErrors[0] = @[1.0] +kErrors[1] = @[1.0, 1.0, + 0.3930000000, 0.2830000000, 0.2210000000, 0.1810000000, 0.1540000000, + 0.1330000000, 0.1180000000, 0.1050000000, 0.0952000000, 0.0869000000, + 0.0800000000, 0.0740000000, 0.0689000000, 0.0645000000, 0.0606000000, + 0.0571000000, 0.0540000000, 0.0513000000, 0.0488000000, 0.0465000000, + 0.0444000000, 0.0425000000, 0.0408000000, 0.0392000000, 0.0377000000, + 0.0364000000, 0.0351000000, 0.0339000000, 0.0328000000, 0.0317000000, + 0.0308000000 ] + +kErrors[2] = @[1.0, 1.0, + 0.4000000000, 0.2370000000, 0.1550000000, 0.1090000000, 0.0804000000, + 0.0618000000, 0.0489000000, 0.0397000000, 0.0329000000, 0.0276000000, + 0.0236000000, 0.0203000000, 0.0177000000, 0.0156000000, 0.0138000000, + 0.0123000000, 0.0111000000, 0.0099800000, 0.0090600000, 0.0082500000, + 0.0075500000, 0.0069400000, 0.0063900000, 0.0059100000, 0.0054800000, + 0.0051000000, 0.0047500000, 0.0044400000, 0.0041600000, 0.0039000000, + 0.0036700000 ] + +kErrors[3] = @[1.0, 1.0, 1.0, + 0.2530000000, 0.1470000000, 0.0920000000, 0.0609000000, 0.0423000000, + 0.0306000000, 0.0228000000, 0.0174000000, 0.0136000000, 0.0108000000, + 0.0087500000, 0.0071800000, 0.0059600000, 0.0050000000, 0.0042300000, + 0.0036200000, 0.0031200000, 0.0027000000, 0.0023600000, 0.0020700000, + 0.0018300000, 0.0016200000, 0.0014500000, 0.0012900000, 0.0011600000, + 0.0010500000, 0.0009490000, 0.0008620000, 0.0007850000, 0.0007170000 ] + +kErrors[4] = @[1.0, 1.0, 1.0, 1.0, + 0.1600000000, 0.0920000000, 0.0561000000, 0.0359000000, 0.0240000000, + 0.0166000000, 0.0118000000, 0.0086400000, 0.0064600000, 0.0049200000, + 0.0038100000, 0.0030000000, 0.0023900000, 0.0019300000, 0.0015800000, + 0.0013000000, 0.0010800000, 0.0009050000, 0.0007640000, 0.0006490000, + 0.0005550000, 0.0004780000, 0.0004130000, 0.0003590000, 0.0003140000, + 0.0002760000, 0.0002430000, 0.0002150000, 
0.0001910000 ] + +kErrors[5] = @[1.0, 1.0, 1.0, 1.0, 1.0, + 0.1010000000, 0.0578000000, 0.0347000000, 0.0217000000, 0.0141000000, + 0.0094300000, 0.0065000000, 0.0045900000, 0.0033200000, 0.0024400000, + 0.0018300000, 0.0013900000, 0.0010700000, 0.0008390000, 0.0006630000, + 0.0005300000, 0.0004270000, 0.0003470000, 0.0002850000, 0.0002350000, + 0.0001960000, 0.0001640000, 0.0001380000, 0.0001170000, 0.0000996000, + 0.0000853000, 0.0000733000, 0.0000633000 ] + +kErrors[6] = @[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0638000000, 0.0364000000, 0.0216000000, 0.0133000000, 0.0084400000, + 0.0055200000, 0.0037100000, 0.0025500000, 0.0017900000, 0.0012800000, + 0.0009350000, 0.0006920000, 0.0005190000, 0.0003940000, 0.0003030000, + 0.0002360000, 0.0001850000, 0.0001470000, 0.0001170000, 0.0000944000, + 0.0000766000, 0.0000626000, 0.0000515000, 0.0000426000, 0.0000355000, + 0.0000297000, 0.0000250000 ] + +kErrors[7] = @[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0229000000, 0.0135000000, 0.0081900000, 0.0051300000, 0.0032900000, + 0.0021700000, 0.0014600000, 0.0010000000, 0.0007020000, 0.0004990000, + 0.0003600000, 0.0002640000, 0.0001960000, 0.0001470000, 0.0001120000, + 0.0000856000, 0.0000663000, 0.0000518000, 0.0000408000, 0.0000324000, + 0.0000259000, 0.0000209000, 0.0000169000, 0.0000138000, 0.0000113000 ] + +kErrors[8] = @[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0145000000, 0.0084600000, 0.0050900000, 0.0031400000, 0.0019900000, + 0.0012900000, 0.0008520000, 0.0005740000, 0.0003940000, 0.0002750000, + 0.0001940000, 0.0001400000, 0.0001010000, 0.0000746000, 0.0000555000, + 0.0000417000, 0.0000316000, 0.0000242000, 0.0000187000, 0.0000146000, + 0.0000114000, 0.0000090100, 0.0000071600, 0.0000057300 ] + +kErrors[9] = @[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0053100000, 0.0031700000, 0.0019400000, 0.0012100000, 0.0007750000, + 0.0005050000, 0.0003350000, 0.0002260000, 0.0001550000, 0.0001080000, + 0.0000759000, 0.0000542000, 0.0000392000, 
0.0000286000, 0.0000211000, + 0.0000157000, 0.0000118000, 0.0000089600, 0.0000068500, 0.0000052800, + 0.0000041000, 0.0000032000] + +kErrors[10] = @[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0033400000, 0.0019800000, 0.0012000000, 0.0007440000, 0.0004700000, + 0.0003020000, 0.0001980000, 0.0001320000, 0.0000889000, 0.0000609000, + 0.0000423000, 0.0000297000, 0.0000211000, 0.0000152000, 0.0000110000, + 0.0000080700, 0.0000059700, 0.0000044500, 0.0000033500, 0.0000025400, + 0.0000019400] + +kErrors[11] = @[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0021000000, 0.0012400000, 0.0007470000, 0.0004590000, 0.0002870000, + 0.0001830000, 0.0001180000, 0.0000777000, 0.0000518000, 0.0000350000, + 0.0000240000, 0.0000166000, 0.0000116000, 0.0000082300, 0.0000058900, + 0.0000042500, 0.0000031000, 0.0000022800, 0.0000016900, 0.0000012600] + +kErrors[12] = @[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0007780000, 0.0004660000, 0.0002840000, 0.0001760000, 0.0001110000, + 0.0000712000, 0.0000463000, 0.0000305000, 0.0000204000, 0.0000138000, + 0.0000094200, 0.0000065200, 0.0000045600, 0.0000032200, 0.0000022900, + 0.0000016500, 0.0000012000, 0.0000008740] diff --git a/tests/config.nims b/tests/config.nims new file mode 100644 index 0000000..80091ff --- /dev/null +++ b/tests/config.nims @@ -0,0 +1 @@ +switch("path", "$projectDir/../src") diff --git a/tests/test.nim b/tests/test.nim new file mode 100644 index 0000000..88d70fe --- /dev/null +++ b/tests/test.nim @@ -0,0 +1,151 @@ +import unittest +import strutils +include bloom +from random import rand, randomize + +suite "murmur": + # Test murmurhash3 implementations + setup: + var hashOutputs: MurmurHashes + hashOutputs = [0, 0] + rawMurmurHash128("hello", 5, 0'u32, hashOutputs) + + test "murmur128 raw": + check int(hashOutputs[0]) == -3758069500696749310 + check int(hashOutputs[1]) == 6565844092913065241 + + test "murmur128 wrapped": + let hashOutputs2 
= murmurHash128("hello", 0'u32) + check hashOutputs2[0] == hashOutputs[0] + check hashOutputs2[1] == hashOutputs[1] + + test "murmur32": + let hash1 = murmurHash32("hello", 0'u32) + let hash2 = murmurHash32("hello", 0'u32) + check hash1 == hash2 # Same input should give same output + + let hash3 = murmurHash32("hello", 10'u32) + check hash1 != hash3 # Different seeds should give different outputs + +suite "hash quality": + test "hash type selection": + let bfMurmur128 = initializeBloomFilter(100, 0.01, hashType = htMurmur128) + let bfMurmur32 = initializeBloomFilter(100, 0.01, hashType = htMurmur32) + let bfNimHash = initializeBloomFilter(100, 0.01, hashType = htNimHash) + + check bfMurmur128.hashType == htMurmur128 + check bfMurmur32.hashType == htMurmur32 + check bfNimHash.hashType == htNimHash + + test "quality across hash types": + const testSize = 10_000 + let patterns = @[ + "shortstr", + repeat("a", 1000), # Very long string + "special@#$%^&*()", # Special characters + "unicode→★∑≈", # Unicode characters + repeat("pattern", 10) # Repeating pattern + ] + + for hashType in [htMurmur128, htMurmur32, htNimHash]: + var bf = initializeBloomFilter(testSize, 0.01, hashType = hashType) + var inserted = newSeq[string](testSize) + + # Test pattern handling + for pattern in patterns: + bf.insert(pattern) + check bf.lookup(pattern) + + # Test general insertion and lookup + for i in 0..