Skip to content

Commit 656a23b

Browse files
committed
Pre-release 1
1 parent 8b0c66a commit 656a23b

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

46 files changed

+3295
-193
lines changed

.editorconfig

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,3 +29,6 @@ indent_size = 2
2929
[*.{yaml,yml}]
3030
indent_size = 2
3131
32+
# Python files
33+
[*.py]
34+
max_line_length = 120

.gitattributes

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,3 +51,7 @@
5151
*.out filter=lfs diff=lfs merge=lfs -text
5252
*.a filter=lfs diff=lfs merge=lfs -text
5353
*.o filter=lfs diff=lfs merge=lfs -text
54+
55+
# Exclude files from language stats (GitHub Linguist)
56+
*.ipynb linguist-vendored
57+

.github/workflows/tests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ jobs:
1111
strategy:
1212
matrix:
1313
# Define the Rust versions to test against
14-
rust-version: [ "1.83.0", "1.84.0", "beta", "stable" ]
14+
rust-version: [ "1.80.0", "1.83.0", "1.84.0", "beta", "stable" ]
1515

1616
steps:
1717
- name: Checkout Repository

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,3 +79,5 @@ Cargo.lock
7979
# Miscellaneous files and directories to ignore
8080
# Add any additional file patterns and directory names that should be ignored down here
8181
.DS_Store
82+
benchmark_results.csv
83+
eval_*.csv

Cargo.toml

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,28 @@
11
[package]
22
name = "vq"
3-
version = "0.1.1"
3+
version = "0.1.2"
44
description = "A vector quantization library for Rust"
55
repository = "https://github.com/habedi/vq"
66
license = "MIT OR Apache-2.0"
77
readme = "README.md"
8-
keywords = ["vq", "vector-quantization", "clustering", "nearest-neighbor", "data-compression"]
8+
keywords = ["vector-quantization", "quantization", "nearest-neighbor", "data-compression", "embeddings"]
99
authors = ["Hassan Abedi <[email protected]>"]
1010
homepage = "https://github.com/habedi/vq"
1111
documentation = "https://docs.rs/vq"
12-
#categories = ["development-tools"]
12+
categories = ["algorithms", "compression", "data-structures"]
1313
edition = "2021"
1414

1515
[lib]
1616
name = "vq"
1717
path = "src/lib.rs"
1818

1919
[[bin]]
20-
name = "vq"
21-
path = "src/main.rs"
20+
name = "vq-examples"
21+
path = "src/bin/examples.rs"
22+
23+
[[bin]]
24+
name = "eval"
25+
path = "src/bin/eval.rs"
2226

2327
[dependencies]
2428
tracing = "0.1.41"
@@ -28,10 +32,14 @@ rand = "0.9.0"
2832
half = "2.4.1"
2933
nalgebra = "0.33.2"
3034
rayon = "1.10"
35+
anyhow = "1.0.95"
36+
rand_distr = "0.5.0"
37+
serde = { version = "1.0.217", features = ["derive"] }
38+
clap = { version = "4.5.29", features = ["derive"] }
3139

3240
[dev-dependencies]
3341
criterion = { version = "0.5", features = ["html_reports"] }
3442

3543
[[bench]]
36-
name = "my_benchmarks"
44+
name = "main"
3745
harness = false

Makefile

Lines changed: 36 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
# Variables
22
PKG = github.com/habedi/vq
3-
BINARY_NAME = $(or $(PROJ_BINARY), $(notdir $(PKG)))
3+
BINARY_NAME = $(or $(PROJ_BINARY), $(notdir $(PKG)-examples))
44
BINARY = target/release/$(BINARY_NAME)
55
PATH := /snap/bin:$(PATH)
66
CARGO_TERM_COLOR = always
7+
RUST_BACKTRACE = 1
8+
RUST_LOG = info
79
DEBUG_VQ = 0
810

911
# Default target
@@ -16,64 +18,77 @@ help: ## Show this help message
1618
.PHONY: format
1719
format: ## Format Rust files
1820
@echo "Formatting Rust files..."
19-
cargo fmt
21+
@cargo fmt
2022

2123
.PHONY: test
2224
test: format ## Run tests
2325
@echo "Running tests..."
24-
DEBUG_VQ=$(DEBUG_VQ) cargo test -- --nocapture
26+
@DEBUG_VQ=$(DEBUG_VQ) cargo test -- --nocapture
2527

2628
.PHONY: coverage
2729
coverage: format ## Generate test coverage report
2830
@echo "Generating test coverage report..."
29-
DEBUG_VQ=$(DEBUG_VQ) cargo tarpaulin --out Xml --out Html
31+
@DEBUG_VQ=$(DEBUG_VQ) cargo tarpaulin --out Xml --out Html
3032

3133
.PHONY: build
3234
build: format ## Build the binary for the current platform
3335
@echo "Building the project..."
34-
DEBUG_VQ=$(DEBUG_VQ) cargo build --release
36+
@DEBUG_VQ=$(DEBUG_VQ) cargo build --release
3537

3638
.PHONY: run
3739
run: build ## Build and run the binary
3840
@echo "Running the $(BINARY) binary..."
39-
DEBUG_VQ=$(DEBUG_VQ) ./$(BINARY)
41+
@DEBUG_VQ=$(DEBUG_VQ) ./$(BINARY)
4042

4143
.PHONY: clean
4244
clean: ## Remove generated and temporary files
4345
@echo "Cleaning up..."
44-
cargo clean
46+
@cargo clean
47+
@rm -f benchmark_results.csv
48+
@rm -f eval_*.csv
4549

4650
.PHONY: install-snap
4751
install-snap: ## Install a few dependencies using Snapcraft
4852
@echo "Installing the snap package..."
49-
sudo apt-get update
50-
sudo apt-get install -y snapd
51-
sudo snap refresh
52-
sudo snap install rustup --classic
53+
@sudo apt-get update
54+
@sudo apt-get install -y snapd
55+
@sudo snap refresh
56+
@sudo snap install rustup --classic
5357

5458
.PHONY: install-deps
5559
install-deps: install-snap ## Install development dependencies
5660
@echo "Installing dependencies..."
57-
rustup component add rustfmt clippy
58-
cargo install cargo-tarpaulin
59-
cargo install cargo-audit
61+
@rustup component add rustfmt clippy
62+
@cargo install cargo-tarpaulin
63+
@cargo install cargo-audit
6064

6165
.PHONY: lint
6266
lint: format ## Run linters on Rust files
6367
@echo "Linting Rust files..."
64-
DEBUG_VQ=$(DEBUG_VQ) cargo clippy -- -D warnings
68+
@DEBUG_VQ=$(DEBUG_VQ) cargo clippy -- -D warnings
6569

6670
.PHONY: publish
6771
publish: ## Publish the package to crates.io (requires CARGO_REGISTRY_TOKEN to be set)
6872
@echo "Publishing the package to Cargo registry..."
69-
cargo publish --token $(CARGO_REGISTRY_TOKEN)
73+
@cargo publish --token $(CARGO_REGISTRY_TOKEN)
7074

7175
.PHONY: bench
7276
bench: ## Run benchmarks
7377
@echo "Running benchmarks..."
74-
DEBUG_VQ=$(DEBUG_VQ) cargo bench
78+
@DEBUG_VQ=$(DEBUG_VQ) cargo bench
7579

76-
.PHONY: audit
77-
audit: ## Run security audit on Rust dependencies
78-
@echo "Running security audit..."
79-
cargo audit
80+
.PHONY: eval
81+
eval: ## Evaluate an implementation (the ALG should be the algorithm name, e.g., bq, sq, pq, opq, tsvq, rvq)
82+
@echo && if [ -z "$(ALG)" ]; then echo "Please provide the ALG argument"; exit 1; fi
83+
@echo "Evaluating implementation with argument: $(ALG)"
84+
@cargo run --release --bin eval -- --eval $(ALG)
85+
86+
.PHONY: eval-all
87+
eval-all: ## Evaluate all the implementations (bq, sq, pq, opq, tsvq, rvq)
88+
@echo "Evaluating all implementations..."
89+
@cargo run --release --bin eval -- --eval bq
90+
@cargo run --release --bin eval -- --eval sq
91+
@cargo run --release --bin eval -- --eval pq
92+
@cargo run --release --bin eval -- --eval opq
93+
@cargo run --release --bin eval -- --eval tsvq
94+
@cargo run --release --bin eval -- --eval rvq

README.md

Lines changed: 33 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -6,53 +6,56 @@
66
[<img alt="Crates.io" src="https://img.shields.io/crates/v/vq.svg?style=for-the-badge&color=fc8d62&logo=rust" height="20">](https://crates.io/crates/vq)
77
[<img alt="Docs.rs" src="https://img.shields.io/badge/docs.rs-vq-66c2a5?style=for-the-badge&labelColor=555555&logo=docs.rs" height="20">](https://docs.rs/vq)
88
[<img alt="Downloads" src="https://img.shields.io/crates/d/vq?style=for-the-badge&labelColor=555555&logo=rust" height="20">](https://crates.io/crates/vq)
9+
<br>
910
[<img alt="Docs" src="https://img.shields.io/badge/docs-latest-3776ab?style=for-the-badge&labelColor=555555&logo=readthedocs" height="20">](docs)
1011
[<img alt="License" src="https://img.shields.io/badge/license-MIT%2FApache--2.0-007ec6?style=for-the-badge&labelColor=555555&logo=open-source-initiative" height="20">](https://github.com/habedi/vq)
1112

12-
Vq (**v**[ector] **q**[uantiztion]) is a Rust library that implements several
13+
Vq (**v**[ector] **q**[uantizer]) is a Rust library that implements several
1314
popular [vector quantization](https://en.wikipedia.org/wiki/Vector_quantization) algorithms including binary, scalar,
1415
and product quantization algorithms.
15-
It provides a simple, efficient API for data compression that help reduce memory usage and computational overhead.
16+
It provides a simple, efficient API for data compression that helps reduce memory usage and computational overhead.
1617

1718
## Features
1819

1920
- Implemented Algorithms:
20-
- [**Binary Quantization (BQ)**](src/bq.rs)
21-
- [**Scalar Quantization (SQ)**](src/sq.rs)
22-
- [**Product Quantization (PQ)**](https://ieeexplore.ieee.org/document/5432202)
23-
- [**Optimized Product Quantization (OPQ)**](https://ieeexplore.ieee.org/document/6619223)
24-
- [**Tree-structured Vector Quantization (TSVQ)**](https://ieeexplore.ieee.org/document/515493)
25-
- [**Residual Vector Quantization (RVQ)**](https://pmc.ncbi.nlm.nih.gov/articles/PMC3231071/)
21+
- [Binary Quantization (BQ)](src/bq.rs)
22+
- [Scalar Quantization (SQ)](src/sq.rs)
23+
- [Product Quantization (PQ)](https://ieeexplore.ieee.org/document/5432202)
24+
- [Optimized Product Quantization (OPQ)](https://ieeexplore.ieee.org/document/6619223)
25+
- [Tree-structured Vector Quantization (TSVQ)](https://ieeexplore.ieee.org/document/515493)
26+
- [Residual Vector Quantization (RVQ)](https://pmc.ncbi.nlm.nih.gov/articles/PMC3231071/)
2627

2728
- Parallelized vector operations for large vectors using [Rayon](https://crates.io/crates/rayon).
28-
- Flexible quantization algorithm implementations that support custom distance functions (e.g., Euclidean, Cosine,
29-
Chebyshev, etc.).
30-
- Support for quantizing vectors of `f32` to `f16` (using [half](https://github.com/starkat99/half-rs/tree/main/src)) or `u8` data types.
31-
- Simple and intuitive API for all quantization algorithms.
29+
- Flexible quantization algorithm implementations that support using various distance metrics such as Euclidean, Cosine,
30+
and Manhattan distances.
31+
- Support for quantizing vectors of `f32` to `f16` (using [half](https://crates.io/crates/half)) or `u8` data types.
32+
- Simple, intuitive, and uniform API for all quantization algorithms.
3233

3334
## Installation
3435

3536
```bash
3637
cargo add vq
3738
```
3839

40+
*Vq requires Rust 1.80 or later.*
41+
3942
## Documentation
4043

4144
Find the latest documentation [here](docs) or on [docs.rs](https://docs.rs/vq).
4245

43-
Check out the [tests](tests/) directory for detailed examples of using Vq.
46+
Check out [examples.rs](src/bin/examples.rs) and the [tests](tests/) directory for detailed examples of using Vq.
4447

4548
### Quick Example
4649

47-
Here's a simple example using the scalar quantization:
50+
Here's a simple example using the SQ algorithm to quantize a vector:
4851

4952
```rust
5053
use vq::sq::ScalarQuantizer;
5154
use vq::vector::Vector;
5255

5356
fn main() {
5457
// Create a scalar quantizer for values in the range [0.0, 1.0] with 256 levels.
55-
let quantizer = ScalarQuantizer::new(0.0, 1.0, 256);
58+
let quantizer = ScalarQuantizer::fit(0.0, 1.0, 256);
5659

5760
// Create an input vector.
5861
let input = Vector::new(vec![0.1, 0.5, -0.8, -0.3, 0.9]);
@@ -64,9 +67,23 @@ fn main() {
6467
}
6568
```
6669

70+
## Performance
71+
72+
Check out the [notebooks](notebooks/) directory for information on how to evaluate the performance of the implemented
73+
algorithms.
74+
Additionally, see the contents of the [src/bin](src/bin/) directory for the scripts used for the evaluation.
75+
76+
> [!TIP]
77+
> On a ThinkPad T14 laptop with an Intel i7-1355U CPU and 32GB RAM, the performance of the PQ algorithm for
78+
> quantizing one million vectors of 128 dimensions (into 16 subspaces with 256 centroids per subspace) is as follows:
79+
> - Training Time: 232.5 seconds
80+
> - Quantization Time: 34.1 seconds
81+
> - Reconstruction Error: 0.02
82+
> - Recall@10: 0.19
83+
6784
## Contributing
6885

69-
Contributions are welcome! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for details on how to contribute.
86+
Contributions are welcome! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for details on contributing.
7087

7188
## License
7289

benches/bq_bench.rs

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
#[path = "utils.rs"]
2+
mod utils;
3+
4+
use criterion::{black_box, criterion_group, Criterion};
5+
use rayon::prelude::*;
6+
use utils::{BENCH_TIMEOUT, NUM_VECTORS};
7+
use vq::bq::BinaryQuantizer;
8+
use vq::vector::{Vector, PARALLEL_THRESHOLD};
9+
10+
/// Benchmark quantization on a single vector that is small enough to trigger sequential processing.
11+
pub fn bench_quantize_sequential(_c: &mut Criterion) {
12+
// Create a vector with length less than PARALLEL_THRESHOLD.
13+
let n = PARALLEL_THRESHOLD / 2;
14+
let data: Vec<f32> = (0..n).map(|i| (i as f32) / (n as f32)).collect();
15+
let vector = Vector::new(data);
16+
let quantizer = BinaryQuantizer::fit(0.5, 0, 1);
17+
18+
let mut cc = Criterion::default().measurement_time(BENCH_TIMEOUT);
19+
cc.bench_function("quantize_sequential", |b| {
20+
b.iter(|| {
21+
let result = quantizer.quantize(black_box(&vector));
22+
black_box(result)
23+
})
24+
});
25+
}
26+
27+
/// Benchmark quantization on a single vector that is large enough to trigger parallel processing.
28+
pub fn bench_quantize_parallel(_c: &mut Criterion) {
29+
// Create a vector with length greater than PARALLEL_THRESHOLD.
30+
let n = PARALLEL_THRESHOLD + 1000;
31+
let data: Vec<f32> = (0..n).map(|i| (i as f32) / (n as f32)).collect();
32+
let vector = Vector::new(data);
33+
let quantizer = BinaryQuantizer::fit(0.5, 0, 1);
34+
35+
let mut cc = Criterion::default().measurement_time(BENCH_TIMEOUT);
36+
cc.bench_function("quantize_parallel", |b| {
37+
b.iter(|| {
38+
let result = quantizer.quantize(black_box(&vector));
39+
black_box(result)
40+
})
41+
});
42+
}
43+
44+
/// Benchmark quantization of many small vectors (each processed sequentially) using a sequential outer loop.
45+
pub fn bench_quantize_multiple_vectors_sequential(_c: &mut Criterion) {
46+
// Each vector is small enough to use sequential quantization internally.
47+
let vector_size = PARALLEL_THRESHOLD / 2;
48+
let vectors: Vec<Vector<f32>> = (0..NUM_VECTORS)
49+
.map(|_| {
50+
let data: Vec<f32> = (0..vector_size)
51+
.map(|i| (i as f32) / (vector_size as f32))
52+
.collect();
53+
Vector::new(data)
54+
})
55+
.collect();
56+
57+
let quantizer = BinaryQuantizer::fit(0.5, 0, 1);
58+
59+
let mut cc = Criterion::default().measurement_time(BENCH_TIMEOUT);
60+
cc.bench_function("quantize_multiple_vectors_sequential", |b| {
61+
b.iter(|| {
62+
let results: Vec<Vector<u8>> = vectors
63+
.iter()
64+
.map(|v| quantizer.quantize(black_box(v)))
65+
.collect();
66+
black_box(results);
67+
})
68+
});
69+
}
70+
71+
/// Benchmark quantization of many large vectors (each using parallel quantization)
72+
/// and process them concurrently using a parallel outer loop.
73+
pub fn bench_quantize_multiple_vectors_parallel_outer(_c: &mut Criterion) {
74+
// Each vector is large enough to use parallel quantization internally.
75+
let vector_size = PARALLEL_THRESHOLD + 100;
76+
let vectors: Vec<Vector<f32>> = (0..NUM_VECTORS)
77+
.map(|_| {
78+
let data: Vec<f32> = (0..vector_size)
79+
.map(|i| (i as f32) / (vector_size as f32))
80+
.collect();
81+
Vector::new(data)
82+
})
83+
.collect();
84+
85+
let quantizer = BinaryQuantizer::fit(0.5, 0, 1);
86+
87+
let mut cc = Criterion::default().measurement_time(BENCH_TIMEOUT);
88+
cc.bench_function("quantize_multiple_vectors_parallel_outer", |b| {
89+
b.iter(|| {
90+
let results: Vec<Vector<u8>> = vectors
91+
.par_iter()
92+
.map(|v| quantizer.quantize(black_box(v)))
93+
.collect();
94+
black_box(results);
95+
})
96+
});
97+
}
98+
99+
criterion_group!(
100+
benches,
101+
bench_quantize_sequential,
102+
bench_quantize_parallel,
103+
bench_quantize_multiple_vectors_sequential,
104+
bench_quantize_multiple_vectors_parallel_outer
105+
);
106+
//criterion_main!(benches);

0 commit comments

Comments
 (0)