Skip to content

Commit 14cabd0

Browse files
authored
Merge pull request #348 from nmslib/develop
0.6.0 release
2 parents 1866a1d + bcbcb5d commit 14cabd0

21 files changed

+738
-184
lines changed

.github/workflows/build.yml

+22
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
name: HNSW CI
2+
3+
on: [push, pull_request]
4+
5+
jobs:
6+
test:
7+
runs-on: ${{matrix.os}}
8+
strategy:
9+
matrix:
10+
os: [ubuntu-latest, windows-latest]
11+
python-version: ['3.6', '3.7', '3.8', '3.9']
12+
steps:
13+
- uses: actions/checkout@v2
14+
- uses: actions/setup-python@v2
15+
with:
16+
python-version: ${{ matrix.python-version }}
17+
18+
- name: Build and install
19+
run: python -m pip install .
20+
21+
- name: Test
22+
run: python -m unittest discover --start-directory python_bindings/tests --pattern "*_test*.py"

.gitignore

+3
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,6 @@ python_bindings/tests/__pycache__/
66
*.pyd
77
hnswlib.cpython*.so
88
var/
9+
.idea/
10+
.vscode/
11+

.travis.yml

-63
This file was deleted.

ALGO_PARAMS.md

+2
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ The ```knn_query``` function returns two numpy arrays, containing labels and dis
99
elements for the queries. Note that in case the algorithm is not able to find ```k``` neighbors for all of the queries,
1010
(this can be due to problems with graph or ```k```>size of the dataset) an exception is thrown.
1111

12+
An example of tuning the parameters can be found in [TESTING_RECALL.md](TESTING_RECALL.md)
13+
1214
## Construction parameters:
1315
* ```M``` - the number of bi-directional links created for every new element during construction. Reasonable range for ```M```
1416
is 2-100. Higher ```M``` work better on datasets with high intrinsic dimensionality and/or high recall, while low ```M``` work

CMakeLists.txt

+19-20
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,27 @@
11
cmake_minimum_required (VERSION 2.6)
2-
project (hnsw_lib)
2+
project(hnsw_lib
3+
LANGUAGES CXX)
34

4-
include_directories("${PROJECT_BINARY_DIR}")
5+
add_library(hnswlib INTERFACE)
6+
target_include_directories(hnswlib INTERFACE .)
57

8+
if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
9+
set(CMAKE_CXX_STANDARD 11)
610

11+
if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
12+
SET( CMAKE_CXX_FLAGS "-Ofast -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -ftree-vectorize")
13+
elseif (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
14+
SET( CMAKE_CXX_FLAGS "-Ofast -lrt -DNDEBUG -std=c++11 -DHAVE_CXX0X -march=native -fpic -w -fopenmp -ftree-vectorize -ftree-vectorizer-verbose=0" )
15+
elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
16+
SET( CMAKE_CXX_FLAGS "-Ofast -lrt -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -w -fopenmp -ftree-vectorize" )
17+
endif()
718

8-
set(SOURCE_EXE main.cpp)
19+
add_executable(test_updates examples/updates_test.cpp)
20+
target_link_libraries(test_updates hnswlib)
921

10-
set(SOURCE_LIB sift_1b.cpp)
22+
add_executable(searchKnnCloserFirst_test examples/searchKnnCloserFirst_test.cpp)
23+
target_link_libraries(searchKnnCloserFirst_test hnswlib)
1124

12-
add_library(sift_test STATIC ${SOURCE_LIB})
13-
14-
15-
add_executable(main ${SOURCE_EXE})
16-
if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
17-
SET( CMAKE_CXX_FLAGS "-Ofast -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -ftree-vectorize")
18-
elseif (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
19-
SET( CMAKE_CXX_FLAGS "-Ofast -lrt -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -w -fopenmp -ftree-vectorize -ftree-vectorizer-verbose=0" )
20-
elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
21-
SET( CMAKE_CXX_FLAGS "-Ofast -lrt -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -w -fopenmp -ftree-vectorize" )
25+
add_executable(main main.cpp sift_1b.cpp)
26+
target_link_libraries(main hnswlib)
2227
endif()
23-
24-
add_executable(test_updates examples/updates_test.cpp)
25-
26-
add_executable(searchKnnCloserFirst_test examples/searchKnnCloserFirst_test.cpp)
27-
28-
target_link_libraries(main sift_test)

README.md

+22-12
Original file line numberDiff line numberDiff line change
@@ -3,21 +3,20 @@ Header-only C++ HNSW implementation with python bindings.
33

44
**NEWS:**
55

6-
* **Hnswlib is now 0.5.2**. Bugfixes - thanks [@marekhanus](https://github.com/marekhanus) for fixing the missing arguments, adding support for python 3.8, 3.9 in Travis, improving python wrapper and fixing typos/code style; [@apoorv-sharma](https://github.com/apoorv-sharma) for fixing the bug int the insertion/deletion logic; [@shengjun1985](https://github.com/shengjun1985) for simplifying the memory reallocation logic; [@TakaakiFuruse](https://github.com/TakaakiFuruse) for improved description of `add_items`; [@psobot ](https://github.com/psobot) for improving error handling; [@ShuAiii](https://github.com/ShuAiii) for reporting the bug in the python interface
6+
**version 0.6**
7+
* Thanks to ([@dyashuni](https://github.com/dyashuni)) hnswlib now uses GitHub Actions for CI, and there is a search speedup in some scenarios with deletions. `unmark_deleted(label)` is now also a part of the python interface (note that `mark_deleted(label)` now throws an exception for double deletions).
8+
* Thanks to ([@slice4e](https://github.com/slice4e)) we now support AVX512; thanks to ([@LTLA](https://github.com/LTLA)) the cmake interface for the lib is now updated.
9+
* Thanks to ([@alonre24](https://github.com/alonre24)) we now have python bindings for brute-force search (and examples for recall tuning: [TESTING_RECALL.md](TESTING_RECALL.md)).
10+
* Thanks to ([@dorosy-yeong](https://github.com/dorosy-yeong)) a bug in the handling of large quantities of deleted elements and large K is now fixed.
711

8-
* **Hnswlib is now 0.5.0**. Added support for pickling indices, support for PEP-517 and PEP-518 building, small speedups, bug and documentation fixes. Many thanks to [@dbespalov](https://github.com/dbespalov), [@dyashuni](https://github.com/dyashuni), [@groodt](https://github.com/groodt),[@uestc-lfs](https://github.com/uestc-lfs), [@vinnitu](https://github.com/vinnitu), [@fabiencastan](https://github.com/fabiencastan), [@JinHai-CN](https://github.com/JinHai-CN), [@js1010](https://github.com/js1010)!
9-
10-
* **Thanks to Apoorv Sharma [@apoorv-sharma](https://github.com/apoorv-sharma), hnswlib now supports true element updates (the interface remained the same, but when you the performance/memory should not degrade as you update the element embeddings).**
11-
12-
* **Thanks to Dmitry [@2ooom](https://github.com/2ooom), hnswlib got a boost in performance for vector dimensions that are not multiple of 4**
12+
1313

14-
* **Thanks to Louis Abraham ([@louisabraham](https://github.com/louisabraham)) hnswlib can now be installed via pip!**
1514

16-
Highlights:
17-
1) Lightweight, header-only, no dependencies other than C++ 11.
18-
2) Interfaces for C++, python and R (https://github.com/jlmelville/rcpphnsw).
15+
### Highlights:
16+
1) Lightweight, header-only, no dependencies other than C++ 11
17+
2) Interfaces for C++, Java, Python and R (https://github.com/jlmelville/rcpphnsw).
1918
3) Has full support for incremental index construction. Has support for element deletions
20-
(currently, without actual freeing of the memory).
19+
(by marking them in index). Index is picklable.
2120
4) Can work with custom user defined distances (C++).
2221
5) Significantly less memory footprint and faster build time compared to current nmslib's implementation.
2322

@@ -53,7 +52,9 @@ For other spaces use the nmslib library https://github.com/nmslib/nmslib.
5352
- If index already has the elements with the same labels, their features will be updated. Note that update procedure is slower than insertion of a new element, but more memory- and query-efficient.
5453
* Thread-safe with other `add_items` calls, but not with `knn_query`.
5554

56-
* `mark_deleted(label)` - marks the element as deleted, so it will be omitted from search results.
55+
* `mark_deleted(label)` - marks the element as deleted, so it will be omitted from search results. Throws an exception if it is already deleted.
56+
*
57+
* `unmark_deleted(label)` - unmarks the element as deleted, so it will not be omitted from search results.
5758

5859
* `resize_index(new_size)` - changes the maximum capacity of the index. Not thread safe with `add_items` and `knn_query`.
5960

@@ -225,6 +226,15 @@ pip install .
225226
or you can install via pip:
226227
`pip install hnswlib`
227228

229+
230+
### For developers
231+
232+
When making changes please run tests (and please add a test to `python_bindings/tests` in case there is new functionality):
233+
```bash
234+
python -m unittest discover --start-directory python_bindings/tests --pattern "*_test*.py"
235+
```
236+
237+
228238
### Other implementations
229239
* Non-metric space library (nmslib) - main library(python, C++), supports exotic distances: https://github.com/nmslib/nmslib
230240
* Faiss library by facebook, uses own HNSW implementation for coarse quantization (python, C++):

TESTING_RECALL.md

+91
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
# Testing recall
2+
3+
Selecting HNSW parameters for a specific use case highly impacts the search quality. One way to test the quality of the constructed index is to compare the HNSW search results to the actual results (i.e., the actual `k` nearest neighbors).
4+
For that purpose, the API enables creating a simple "brute-force" index in which vectors are stored as is, and searching for the `k` nearest neighbors to a query vector requires going over the entire index.
5+
Comparing between HNSW and brute-force results may help with finding the desired HNSW parameters for achieving a satisfying recall, based on the index size and data dimension.
6+
7+
### Brute force index API
8+
`hnswlib.BFIndex(space, dim)` creates a non-initialized index in space `space` with integer dimension `dim`.
9+
10+
`hnswlib.BFIndex` methods:
11+
12+
`init_index(max_elements)` initializes the index with no elements.
13+
14+
max_elements defines the maximum number of elements that can be stored in the structure.
15+
16+
`add_items(data, ids)` inserts the data (numpy array of vectors, shape:`N*dim`) into the structure.
17+
`ids` is an optional N-size numpy array of integer labels for all elements in data.
18+
19+
`delete_vector(label)` deletes the element associated with the given `label` so it will be omitted from search results.
20+
21+
`knn_query(data, k = 1)` makes a batch query for the `k` closest elements for each element of the
22+
`data` (shape:`N*dim`). Returns two numpy arrays, labels and distances (each of shape `N*k`).
23+
24+
`load_index(path_to_index, max_elements = 0)` loads the index from persistence to the uninitialized index.
25+
26+
`save_index(path_to_index)` saves the index to persistence.
27+
28+
### measuring recall example
29+
30+
```
31+
import hnswlib
32+
import numpy as np
33+
34+
dim = 32
35+
num_elements = 100000
36+
k = 10
37+
nun_queries = 10
38+
39+
# Generating sample data
40+
data = np.float32(np.random.random((num_elements, dim)))
41+
42+
# Declaring index
43+
hnsw_index = hnswlib.Index(space='l2', dim=dim) # possible options are l2, cosine or ip
44+
bf_index = hnswlib.BFIndex(space='l2', dim=dim)
45+
46+
# Initing both hnsw and brute force indices
47+
# max_elements - the maximum number of elements (capacity). Will throw an exception if exceeded
48+
# during insertion of an element.
49+
# The capacity can be increased by saving/loading the index, see below.
50+
#
51+
# hnsw construction params:
52+
# ef_construction - controls index search speed/build speed tradeoff
53+
#
54+
# M - is tightly connected with internal dimensionality of the data. Strongly affects the memory consumption (~M)
55+
# Higher M leads to higher accuracy/run_time at fixed ef/efConstruction
56+
57+
hnsw_index.init_index(max_elements=num_elements, ef_construction=200, M=16)
58+
bf_index.init_index(max_elements=num_elements)
59+
60+
# Controlling the recall for hnsw by setting ef:
61+
# higher ef leads to better accuracy, but slower search
62+
hnsw_index.set_ef(200)
63+
64+
# Set number of threads used during batch search/construction in hnsw
65+
# By default using all available cores
66+
hnsw_index.set_num_threads(1)
67+
68+
print("Adding batch of %d elements" % (len(data)))
69+
hnsw_index.add_items(data)
70+
bf_index.add_items(data)
71+
72+
print("Indices built")
73+
74+
# Generating query data
75+
query_data = np.float32(np.random.random((nun_queries, dim)))
76+
77+
# Query the elements and measure recall:
78+
labels_hnsw, distances_hnsw = hnsw_index.knn_query(query_data, k)
79+
labels_bf, distances_bf = bf_index.knn_query(query_data, k)
80+
81+
# Measure recall
82+
correct = 0
83+
for i in range(nun_queries):
84+
for label in labels_hnsw[i]:
85+
for correct_label in labels_bf[i]:
86+
if label == correct_label:
87+
correct += 1
88+
break
89+
90+
print("recall is :", float(correct)/(k*nun_queries))
91+
```

examples/git_tester.py

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
from pydriller import Repository
2+
import os
3+
import datetime
4+
os.system("cp examples/speedtest.py examples/speedtest2.py")
5+
for commit in Repository('.', from_tag="v0.5.2").traverse_commits():
6+
print(commit.hash)
7+
print(commit.msg)
8+
9+
os.system(f"git checkout {commit.hash}; rm -rf build; ")
10+
os.system("python -m pip install .")
11+
os.system(f'python examples/speedtest2.py -n "{commit.msg}" -d 4 -t 1')
12+
os.system(f'python examples/speedtest2.py -n "{commit.msg}" -d 64 -t 1')
13+
os.system(f'python examples/speedtest2.py -n "{commit.msg}" -d 128 -t 1')
14+
os.system(f'python examples/speedtest2.py -n "{commit.msg}" -d 4 -t 24')
15+
os.system(f'python examples/speedtest2.py -n "{commit.msg}" -d 128 -t 24')
16+

examples/speedtest.py

+62
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
import hnswlib
2+
import numpy as np
3+
import os.path
4+
import time
5+
import argparse
6+
7+
# Use nargs to specify how many arguments an option should take.
8+
ap = argparse.ArgumentParser()
9+
ap.add_argument('-d')
10+
ap.add_argument('-n')
11+
ap.add_argument('-t')
12+
args = ap.parse_args()
13+
dim = int(args.d)
14+
name = args.n
15+
threads=int(args.t)
16+
num_elements = 1000000 * 4//dim
17+
18+
# Generating sample data
19+
np.random.seed(1)
20+
data = np.float32(np.random.random((num_elements, dim)))
21+
22+
23+
index_path=f'speed_index{dim}.bin'
24+
# Declaring index
25+
p = hnswlib.Index(space='l2', dim=dim) # possible options are l2, cosine or ip
26+
27+
if not os.path.isfile(index_path) :
28+
29+
p.init_index(max_elements=num_elements, ef_construction=100, M=16)
30+
31+
# Controlling the recall by setting ef:
32+
# higher ef leads to better accuracy, but slower search
33+
p.set_ef(10)
34+
35+
# Set number of threads used during batch search/construction
36+
# By default using all available cores
37+
p.set_num_threads(12)
38+
39+
p.add_items(data)
40+
41+
# Serializing and deleting the index:
42+
43+
print("Saving index to '%s'" % index_path)
44+
p.save_index(index_path)
45+
p.set_num_threads(threads)
46+
times=[]
47+
time.sleep(10)
48+
p.set_ef(100)
49+
for _ in range(3):
50+
p.load_index(index_path)
51+
for _ in range(10):
52+
t0=time.time()
53+
labels, distances = p.knn_query(data, k=1)
54+
tt=time.time()-t0
55+
times.append(tt)
56+
print(f"{tt} seconds")
57+
str_out=f"mean time:{np.mean(times)}, median time:{np.median(times)}, std time {np.std(times)} {name}"
58+
print(str_out)
59+
with open (f"log_{dim}_t{threads}.txt","a") as f:
60+
f.write(str_out+"\n")
61+
f.flush()
62+

0 commit comments

Comments
 (0)