From 0c67adf859478d2c67d3047c6abfa431d768c623 Mon Sep 17 00:00:00 2001
From: Erik Bernhardsson
Date: Fri, 7 Apr 2023 13:08:38 -0400
Subject: [PATCH] Black

---
 ann_benchmarks/__init__.py | 1 +
 ann_benchmarks/algorithms/annoy.py | 3 +-
 ann_benchmarks/algorithms/balltree.py | 10 +-
 ann_benchmarks/algorithms/base.py | 4 +-
 ann_benchmarks/algorithms/bruteforce.py | 64 ++--
 ann_benchmarks/algorithms/ckdtree.py | 2 +-
 ann_benchmarks/algorithms/datasketch.py | 15 +-
 ann_benchmarks/algorithms/definitions.py | 49 ++-
 ann_benchmarks/algorithms/diskann.py | 92 +++---
 ann_benchmarks/algorithms/dolphinnpy.py | 3 +-
 ann_benchmarks/algorithms/dummy_algo.py | 4 +-
 ann_benchmarks/algorithms/elasticsearch.py | 32 +-
 ann_benchmarks/algorithms/elastiknn.py | 23 +-
 ann_benchmarks/algorithms/faiss.py | 34 +-
 ann_benchmarks/algorithms/faiss_gpu.py | 7 +-
 ann_benchmarks/algorithms/faiss_hnsw.py | 4 +-
 ann_benchmarks/algorithms/flann.py | 14 +-
 ann_benchmarks/algorithms/hnswlib.py | 10 +-
 ann_benchmarks/algorithms/kdtree.py | 10 +-
 ann_benchmarks/algorithms/kgraph.py | 7 +-
 ann_benchmarks/algorithms/lshf.py | 16 +-
 ann_benchmarks/algorithms/luceneknn.py | 28 +-
 ann_benchmarks/algorithms/milvus.py | 5 +-
 ann_benchmarks/algorithms/mrpt.py | 25 +-
 ann_benchmarks/algorithms/n2.py | 12 +-
 ann_benchmarks/algorithms/nearpy.py | 26 +-
 ann_benchmarks/algorithms/nmslib.py | 43 ++-
 ann_benchmarks/algorithms/onng_ngt.py | 118 ++++---
 ann_benchmarks/algorithms/opensearchknn.py | 67 ++--
 ann_benchmarks/algorithms/panng_ngt.py | 64 ++--
 ann_benchmarks/algorithms/pgvector.py | 7 +-
 ann_benchmarks/algorithms/puffinn.py | 35 ++-
 ann_benchmarks/algorithms/pynndescent.py | 12 +-
 ann_benchmarks/algorithms/qdrant.py | 87 +++---
 ann_benchmarks/algorithms/qg_ngt.py | 142 +++++----
 ann_benchmarks/algorithms/qsg_ngt.py | 245 +++++++++------
 ann_benchmarks/algorithms/rpforest.py | 2 +-
 ann_benchmarks/algorithms/scann.py | 50 +--
 ann_benchmarks/algorithms/sptag.py | 11 +-
 ann_benchmarks/algorithms/subprocess.py | 88 ++++--
 ann_benchmarks/algorithms/vald.py | 132 ++++----
 ann_benchmarks/algorithms/vearch.py | 33 +-
 ann_benchmarks/algorithms/vespa.py | 18 +-
 ann_benchmarks/constants.py | 2 +-
 ann_benchmarks/data.py | 12 +-
 ann_benchmarks/datasets.py | 348 ++++++++++-----------
 ann_benchmarks/distance.py | 46 +--
 ann_benchmarks/main.py | 176 +++++------
 ann_benchmarks/plotting/metrics.py | 137 ++++----
 ann_benchmarks/plotting/plot_variants.py | 1 -
 ann_benchmarks/plotting/utils.py | 123 ++++----
 ann_benchmarks/results.py | 36 +--
 ann_benchmarks/runner.py | 202 ++++++------
 create_dataset.py | 5 +-
 create_website.py | 202 +++++-------
 data_export.py | 17 +-
 install.py | 55 ++--
 plot.py | 155 ++++-----
 58 files changed, 1594 insertions(+), 1577 deletions(-)

diff --git a/ann_benchmarks/__init__.py b/ann_benchmarks/__init__.py
index 75db8ab95..c8e118fcb 100644
--- a/ann_benchmarks/__init__.py
+++ b/ann_benchmarks/__init__.py
@@ -1,2 +1,3 @@
 from __future__ import absolute_import
+
 # from ann_benchmarks.main import *
diff --git a/ann_benchmarks/algorithms/annoy.py b/ann_benchmarks/algorithms/annoy.py
index 280ef9003..b5f64c48a 100644
--- a/ann_benchmarks/algorithms/annoy.py
+++ b/ann_benchmarks/algorithms/annoy.py
@@ -22,5 +22,4 @@ def query(self, v, n):
         return self._annoy.get_nns_by_vector(v.tolist(), n, self._search_k)

     def __str__(self):
-        return 'Annoy(n_trees=%d, search_k=%d)' % (self._n_trees,
-                                                   self._search_k)
+        return "Annoy(n_trees=%d, search_k=%d)" % (self._n_trees, self._search_k)
diff --git a/ann_benchmarks/algorithms/balltree.py
b/ann_benchmarks/algorithms/balltree.py index 634dc691a..2f612b9ff 100644 --- a/ann_benchmarks/algorithms/balltree.py +++ b/ann_benchmarks/algorithms/balltree.py @@ -8,15 +8,15 @@ class BallTree(BaseANN): def __init__(self, metric, leaf_size=20): self._leaf_size = leaf_size self._metric = metric - self.name = 'BallTree(leaf_size=%d)' % self._leaf_size + self.name = "BallTree(leaf_size=%d)" % self._leaf_size def fit(self, X): - if self._metric == 'angular': - X = sklearn.preprocessing.normalize(X, axis=1, norm='l2') + if self._metric == "angular": + X = sklearn.preprocessing.normalize(X, axis=1, norm="l2") self._tree = sklearn.neighbors.BallTree(X, leaf_size=self._leaf_size) def query(self, v, n): - if self._metric == 'angular': - v = sklearn.preprocessing.normalize([v], axis=1, norm='l2')[0] + if self._metric == "angular": + v = sklearn.preprocessing.normalize([v], axis=1, norm="l2")[0] dist, ind = self._tree.query([v], k=n) return ind[0] diff --git a/ann_benchmarks/algorithms/base.py b/ann_benchmarks/algorithms/base.py index a6fae26a2..2fdcc6479 100644 --- a/ann_benchmarks/algorithms/base.py +++ b/ann_benchmarks/algorithms/base.py @@ -21,8 +21,8 @@ def query(self, q, n): def batch_query(self, X, n): """Provide all queries at once and let algorithm figure out - how to handle it. Default implementation uses a ThreadPool - to parallelize query processing.""" + how to handle it. Default implementation uses a ThreadPool + to parallelize query processing.""" pool = ThreadPool() self.res = pool.map(lambda q: self.query(q, n), X) diff --git a/ann_benchmarks/algorithms/bruteforce.py b/ann_benchmarks/algorithms/bruteforce.py index 6d11d327e..b93a32959 100644 --- a/ann_benchmarks/algorithms/bruteforce.py +++ b/ann_benchmarks/algorithms/bruteforce.py @@ -7,26 +7,21 @@ class BruteForce(BaseANN): def __init__(self, metric): - if metric not in ('angular', 'euclidean', 'hamming'): - raise NotImplementedError( - "BruteForce doesn't support metric %s" % metric) + if metric not in ("angular", "euclidean", "hamming"): + raise NotImplementedError("BruteForce doesn't support metric %s" % metric) self._metric = metric - self.name = 'BruteForce()' + self.name = "BruteForce()" def fit(self, X): - metric = {'angular': 'cosine', 'euclidean': 'l2', - 'hamming': 'hamming'}[self._metric] - self._nbrs = sklearn.neighbors.NearestNeighbors( - algorithm='brute', metric=metric) + metric = {"angular": "cosine", "euclidean": "l2", "hamming": "hamming"}[self._metric] + self._nbrs = sklearn.neighbors.NearestNeighbors(algorithm="brute", metric=metric) self._nbrs.fit(X) def query(self, v, n): - return list(self._nbrs.kneighbors( - [v], return_distance=False, n_neighbors=n)[0]) + return list(self._nbrs.kneighbors([v], return_distance=False, n_neighbors=n)[0]) def query_with_distances(self, v, n): - (distances, positions) = self._nbrs.kneighbors( - [v], return_distance=True, n_neighbors=n) + (distances, positions) = self._nbrs.kneighbors([v], return_distance=True, n_neighbors=n) return zip(list(positions[0]), list(distances[0])) @@ -34,38 +29,37 @@ class BruteForceBLAS(BaseANN): """kNN search that uses a linear scan = brute force.""" def __init__(self, metric, precision=numpy.float32): - if metric not in ('angular', 'euclidean', 'hamming', 'jaccard'): - raise NotImplementedError( - "BruteForceBLAS doesn't support metric %s" % metric) - elif metric == 'hamming' and precision != numpy.bool_: + if metric not in ("angular", "euclidean", "hamming", "jaccard"): + raise NotImplementedError("BruteForceBLAS doesn't support metric %s" % 
metric) + elif metric == "hamming" and precision != numpy.bool_: raise NotImplementedError( - "BruteForceBLAS doesn't support precision" - " %s with Hamming distances" % precision) + "BruteForceBLAS doesn't support precision" " %s with Hamming distances" % precision + ) self._metric = metric self._precision = precision - self.name = 'BruteForceBLAS()' + self.name = "BruteForceBLAS()" def fit(self, X): """Initialize the search index.""" - if self._metric == 'angular': + if self._metric == "angular": # precompute (squared) length of each vector - lens = (X ** 2).sum(-1) + lens = (X**2).sum(-1) # normalize index vectors to unit length X /= numpy.sqrt(lens)[..., numpy.newaxis] self.index = numpy.ascontiguousarray(X, dtype=self._precision) - elif self._metric == 'hamming': + elif self._metric == "hamming": # Regarding bitvectors as vectors in l_2 is faster for blas X = X.astype(numpy.float32) # precompute (squared) length of each vector - lens = (X ** 2).sum(-1) + lens = (X**2).sum(-1) self.index = numpy.ascontiguousarray(X, dtype=numpy.float32) self.lengths = numpy.ascontiguousarray(lens, dtype=numpy.float32) - elif self._metric == 'euclidean': + elif self._metric == "euclidean": # precompute (squared) length of each vector - lens = (X ** 2).sum(-1) + lens = (X**2).sum(-1) self.index = numpy.ascontiguousarray(X, dtype=self._precision) self.lengths = numpy.ascontiguousarray(lens, dtype=self._precision) - elif self._metric == 'jaccard': + elif self._metric == "jaccard": self.index = X else: # shouldn't get past the constructor! @@ -78,33 +72,33 @@ def query_with_distances(self, v, n): """Find indices of `n` most similar vectors from the index to query vector `v`.""" - if self._metric != 'jaccard': + if self._metric != "jaccard": # use same precision for query as for index v = numpy.ascontiguousarray(v, dtype=self.index.dtype) # HACK we ignore query length as that's a constant # not affecting the final ordering - if self._metric == 'angular': + if self._metric == "angular": # argmax_a cossim(a, b) = argmax_a dot(a, b) / |a||b| = argmin_a -dot(a, b) # noqa dists = -numpy.dot(self.index, v) - elif self._metric == 'euclidean': + elif self._metric == "euclidean": # argmin_a (a - b)^2 = argmin_a a^2 - 2ab + b^2 = argmin_a a^2 - 2ab # noqa dists = self.lengths - 2 * numpy.dot(self.index, v) - elif self._metric == 'hamming': + elif self._metric == "hamming": # Just compute hamming distance using euclidean distance dists = self.lengths - 2 * numpy.dot(self.index, v) - elif self._metric == 'jaccard': - dists = [pd[self._metric]['distance'](v, e) for e in self.index] + elif self._metric == "jaccard": + dists = [pd[self._metric]["distance"](v, e) for e in self.index] else: # shouldn't get past the constructor! 
assert False, "invalid metric" # partition-sort by distance, get `n` closest nearest_indices = numpy.argpartition(dists, n)[:n] - indices = [idx for idx in nearest_indices if pd[self._metric] - ["distance_valid"](dists[idx])] + indices = [idx for idx in nearest_indices if pd[self._metric]["distance_valid"](dists[idx])] def fix(index): ep = self.index[index] ev = v - return (index, pd[self._metric]['distance'](ep, ev)) + return (index, pd[self._metric]["distance"](ep, ev)) + return map(fix, indices) diff --git a/ann_benchmarks/algorithms/ckdtree.py b/ann_benchmarks/algorithms/ckdtree.py index 901373d5d..7b3d40b66 100644 --- a/ann_benchmarks/algorithms/ckdtree.py +++ b/ann_benchmarks/algorithms/ckdtree.py @@ -7,7 +7,7 @@ class CKDTree(BaseANN): def __init__(self, metric, leaf_size=20): self._leaf_size = leaf_size self._metric = metric - self.name = 'CKDTree(leaf_size=%d)' % self._leaf_size + self.name = "CKDTree(leaf_size=%d)" % self._leaf_size def fit(self, X): self._tree = cKDTree(X, leafsize=self._leaf_size) diff --git a/ann_benchmarks/algorithms/datasketch.py b/ann_benchmarks/algorithms/datasketch.py index 949a7f6df..391ee0df4 100644 --- a/ann_benchmarks/algorithms/datasketch.py +++ b/ann_benchmarks/algorithms/datasketch.py @@ -8,13 +8,12 @@ class DataSketch(BaseANN): def __init__(self, metric, n_perm, n_rep): - if metric not in ('jaccard'): - raise NotImplementedError( - "Datasketch doesn't support metric %s" % metric) + if metric not in ("jaccard"): + raise NotImplementedError("Datasketch doesn't support metric %s" % metric) self._n_perm = n_perm self._n_rep = n_rep self._metric = metric - self.name = 'Datasketch(n_perm=%d, n_rep=%d)' % (n_perm, n_rep) + self.name = "Datasketch(n_perm=%d, n_rep=%d)" % (n_perm, n_rep) def fit(self, X): self._index = MinHashLSHForest(num_perm=self._n_perm, l=self._n_rep) @@ -22,10 +21,10 @@ def fit(self, X): m = MinHash(num_perm=self._n_perm) if x.dtype == np.bool_: for e in np.flatnonzero(x): - m.update(str(e).encode('utf8')) + m.update(str(e).encode("utf8")) else: for e in x: - m.update(str(e).encode('utf8')) + m.update(str(e).encode("utf8")) self._index.add(str(i), m) self._index.index() @@ -33,8 +32,8 @@ def query(self, v, n): m = MinHash(num_perm=self._n_perm) if v.dtype == np.bool_: for e in np.flatnonzero(v): - m.update(str(e).encode('utf8')) + m.update(str(e).encode("utf8")) else: for e in v: - m.update(str(e).encode('utf8')) + m.update(str(e).encode("utf8")) return map(int, self._index.query(m, n)) diff --git a/ann_benchmarks/algorithms/definitions.py b/ann_benchmarks/algorithms/definitions.py index cabd1446c..655b2b69d 100644 --- a/ann_benchmarks/algorithms/definitions.py +++ b/ann_benchmarks/algorithms/definitions.py @@ -7,14 +7,12 @@ Definition = collections.namedtuple( - 'Definition', - ['algorithm', 'constructor', 'module', 'docker_tag', - 'arguments', 'query_argument_groups', 'disabled']) + "Definition", ["algorithm", "constructor", "module", "docker_tag", "arguments", "query_argument_groups", "disabled"] +) def instantiate_algorithm(definition): - print('Trying to instantiate %s.%s(%s)' % - (definition.module, definition.constructor, definition.arguments)) + print("Trying to instantiate %s.%s(%s)" % (definition.module, definition.constructor, definition.arguments)) module = importlib.import_module(definition.module) constructor = getattr(module, definition.constructor) return constructor(*definition.arguments) @@ -55,8 +53,7 @@ def _generate_combinations(args): def _substitute_variables(arg, vs): if isinstance(arg, dict): - return 
dict([(k, _substitute_variables(v, vs)) - for k, v in arg.items()]) + return dict([(k, _substitute_variables(v, vs)) for k, v in arg.items()]) elif isinstance(arg, list): return [_substitute_variables(a, vs) for a in arg] elif isinstance(arg, str) and arg in vs: @@ -73,13 +70,13 @@ def _get_definitions(definition_file): def list_algorithms(definition_file): definitions = _get_definitions(definition_file) - print('The following algorithms are supported...') + print("The following algorithms are supported...") for point in definitions: print('\t... for the point type "%s"...' % point) for metric in definitions[point]: print('\t\t... and the distance metric "%s":' % metric) for algorithm in definitions[point][metric]: - print('\t\t\t%s' % algorithm) + print("\t\t\t%s" % algorithm) def get_unique_algorithms(definition_file): @@ -92,8 +89,7 @@ def get_unique_algorithms(definition_file): return list(sorted(algos)) -def get_definitions(definition_file, dimension, point_type="float", - distance_metric="euclidean", count=10): +def get_definitions(definition_file, dimension, point_type="float", distance_metric="euclidean", count=10): definitions = _get_definitions(definition_file) algorithm_definitions = {} @@ -103,10 +99,9 @@ def get_definitions(definition_file, dimension, point_type="float", definitions = [] for (name, algo) in algorithm_definitions.items(): - for k in ['docker-tag', 'module', 'constructor']: + for k in ["docker-tag", "module", "constructor"]: if k not in algo: - raise Exception( - 'algorithm %s does not define a "%s" property' % (name, k)) + raise Exception('algorithm %s does not define a "%s" property' % (name, k)) base_args = [] if "base-args" in algo: @@ -150,20 +145,18 @@ def get_definitions(definition_file, dimension, point_type="float", else: aargs.append(arg_group) - vs = { - "@count": count, - "@metric": distance_metric, - "@dimension": dimension - } + vs = {"@count": count, "@metric": distance_metric, "@dimension": dimension} aargs = [_substitute_variables(arg, vs) for arg in aargs] - definitions.append(Definition( - algorithm=name, - docker_tag=algo['docker-tag'], - module=algo['module'], - constructor=algo['constructor'], - arguments=aargs, - query_argument_groups=query_args, - disabled=algo.get('disabled', False) - )) + definitions.append( + Definition( + algorithm=name, + docker_tag=algo["docker-tag"], + module=algo["module"], + constructor=algo["constructor"], + arguments=aargs, + query_argument_groups=query_args, + disabled=algo.get("disabled", False), + ) + ) return definitions diff --git a/ann_benchmarks/algorithms/diskann.py b/ann_benchmarks/algorithms/diskann.py index 7502141c9..327304be7 100644 --- a/ann_benchmarks/algorithms/diskann.py +++ b/ann_benchmarks/algorithms/diskann.py @@ -8,7 +8,7 @@ class Vamana(BaseANN): def __init__(self, metric, param): - self.metric = {'angular': 'cosine', 'euclidean': 'l2'}[metric] + self.metric = {"angular": "cosine", "euclidean": "l2"}[metric] self.l_build = int(param["l_build"]) self.max_outdegree = int(param["max_outdegree"]) self.alpha = float(param["alpha"]) @@ -24,59 +24,59 @@ def __init__(self, metric, param): self.params.set("num_threads", 1) def fit(self, X): - def bin_to_float(binary): - return struct.unpack('!f',struct.pack('!I', int(binary, 2)))[0] + return struct.unpack("!f", struct.pack("!I", int(binary, 2)))[0] print("Vamana: Starting Fit...") - index_dir = 'indices' + index_dir = "indices" if not os.path.exists(index_dir): os.makedirs(index_dir) - data_path = os.path.join(index_dir, 'base.bin') - self.name = 
'Vamana-{}-{}-{}'.format(self.l_build, - self.max_outdegree, self.alpha) + data_path = os.path.join(index_dir, "base.bin") + self.name = "Vamana-{}-{}-{}".format(self.l_build, self.max_outdegree, self.alpha) save_path = os.path.join(index_dir, self.name) - print('Vamana: Index Stored At: ' + save_path) - shape = [np.float32(bin_to_float('{:032b}'.format(X.shape[0]))), - np.float32(bin_to_float('{:032b}'.format(X.shape[1])))] + print("Vamana: Index Stored At: " + save_path) + shape = [ + np.float32(bin_to_float("{:032b}".format(X.shape[0]))), + np.float32(bin_to_float("{:032b}".format(X.shape[1]))), + ] X = X.flatten() X = np.insert(X, 0, shape) X.tofile(data_path) if not os.path.exists(save_path): - print('Vamana: Creating Index') + print("Vamana: Creating Index") s = time.time() - if self.metric == 'l2': + if self.metric == "l2": index = vp.SinglePrecisionIndex(vp.Metric.FAST_L2, data_path) - elif self.metric == 'cosine': + elif self.metric == "cosine": index = vp.SinglePrecisionIndex(vp.Metric.INNER_PRODUCT, data_path) else: - print('Vamana: Unknown Metric Error!') + print("Vamana: Unknown Metric Error!") index.build(self.params, []) t = time.time() - print('Vamana: Index Build Time (sec) = ' + str(t - s)) + print("Vamana: Index Build Time (sec) = " + str(t - s)) index.save(save_path) if os.path.exists(save_path): - print('Vamana: Loading Index: ' + str(save_path)) + print("Vamana: Loading Index: " + str(save_path)) s = time.time() - if self.metric == 'l2': + if self.metric == "l2": self.index = vp.SinglePrecisionIndex(vp.Metric.FAST_L2, data_path) - elif self.metric == 'cosine': + elif self.metric == "cosine": self.index = vp.SinglePrecisionIndex(vp.Metric.INNER_PRODUCT, data_path) else: - print('Vamana: Unknown Metric Error!') - self.index.load(file_name = save_path) + print("Vamana: Unknown Metric Error!") + self.index.load(file_name=save_path) print("Vamana: Index Loaded") self.index.optimize_graph() print("Vamana: Graph Optimization Completed") t = time.time() - print('Vamana: Index Load Time (sec) = ' + str(t - s)) + print("Vamana: Index Load Time (sec) = " + str(t - s)) else: print("Vamana: Unexpected Index Build Time Error") - print('Vamana: End of Fit') + print("Vamana: End of Fit") def set_query_arguments(self, l_search): print("Vamana: L_Search = " + str(l_search)) @@ -95,7 +95,7 @@ def get_batch_results(self): class VamanaPQ(BaseANN): def __init__(self, metric, param): - self.metric = {'angular': 'cosine', 'euclidean': 'l2'}[metric] + self.metric = {"angular": "cosine", "euclidean": "l2"}[metric] self.l_build = int(param["l_build"]) self.max_outdegree = int(param["max_outdegree"]) self.alpha = float(param["alpha"]) @@ -114,12 +114,11 @@ def __init__(self, metric, param): self.params.set("num_threads", 1) def fit(self, X): - def bin_to_float(binary): - return struct.unpack('!f',struct.pack('!I', int(binary, 2)))[0] + return struct.unpack("!f", struct.pack("!I", int(binary, 2)))[0] print("Vamana PQ: Starting Fit...") - index_dir = 'indices' + index_dir = "indices" if self.chunks > X.shape[1]: raise ValueError @@ -127,52 +126,53 @@ def bin_to_float(binary): if not os.path.exists(index_dir): os.makedirs(index_dir) - data_path = os.path.join(index_dir, 'base.bin') - pq_path = os.path.join(index_dir, 'pq_memory_index') - self.name = 'VamanaPQ-{}-{}-{}'.format(self.l_build, - self.max_outdegree, self.alpha) + data_path = os.path.join(index_dir, "base.bin") + pq_path = os.path.join(index_dir, "pq_memory_index") + self.name = "VamanaPQ-{}-{}-{}".format(self.l_build, 
self.max_outdegree, self.alpha) save_path = os.path.join(index_dir, self.name) - print('Vamana PQ: Index Stored At: ' + save_path) - shape = [np.float32(bin_to_float('{:032b}'.format(X.shape[0]))), - np.float32(bin_to_float('{:032b}'.format(X.shape[1])))] + print("Vamana PQ: Index Stored At: " + save_path) + shape = [ + np.float32(bin_to_float("{:032b}".format(X.shape[0]))), + np.float32(bin_to_float("{:032b}".format(X.shape[1]))), + ] X = X.flatten() X = np.insert(X, 0, shape) X.tofile(data_path) if not os.path.exists(save_path): - print('Vamana PQ: Creating Index') + print("Vamana PQ: Creating Index") s = time.time() - if self.metric == 'l2': + if self.metric == "l2": index = vp.SinglePrecisionIndex(vp.Metric.FAST_L2, data_path) - elif self.metric == 'cosine': + elif self.metric == "cosine": index = vp.SinglePrecisionIndex(vp.Metric.INNER_PRODUCT, data_path) else: - print('Vamana PQ: Unknown Metric Error!') + print("Vamana PQ: Unknown Metric Error!") index.pq_build(data_path, pq_path, self.params) t = time.time() - print('Vamana PQ: Index Build Time (sec) = ' + str(t - s)) + print("Vamana PQ: Index Build Time (sec) = " + str(t - s)) index.save(save_path) if os.path.exists(save_path): - print('Vamana PQ: Loading Index: ' + str(save_path)) + print("Vamana PQ: Loading Index: " + str(save_path)) s = time.time() - if self.metric == 'l2': + if self.metric == "l2": self.index = vp.SinglePrecisionIndex(vp.Metric.FAST_L2, data_path) - elif self.metric == 'cosine': + elif self.metric == "cosine": self.index = vp.SinglePrecisionIndex(vp.Metric.INNER_PRODUCT, data_path) else: - print('Vamana PQ: Unknown Metric Error!') - self.index.load(file_name = save_path) + print("Vamana PQ: Unknown Metric Error!") + self.index.load(file_name=save_path) print("Vamana PQ: Index Loaded") - self.index.pq_load(pq_prefix_path = pq_path) + self.index.pq_load(pq_prefix_path=pq_path) print("Vamana PQ: PQ Data Loaded") self.index.optimize_graph() print("Vamana PQ: Graph Optimization Completed") t = time.time() - print('Vamana PQ: Index Load Time (sec) = ' + str(t - s)) + print("Vamana PQ: Index Load Time (sec) = " + str(t - s)) else: print("Vamana PQ: Unexpected Index Build Time Error") - print('Vamana PQ: End of Fit') + print("Vamana PQ: End of Fit") def set_query_arguments(self, l_search): print("Vamana PQ: L_Search = " + str(l_search)) diff --git a/ann_benchmarks/algorithms/dolphinnpy.py b/ann_benchmarks/algorithms/dolphinnpy.py index 34e7192cc..93c41dec5 100644 --- a/ann_benchmarks/algorithms/dolphinnpy.py +++ b/ann_benchmarks/algorithms/dolphinnpy.py @@ -1,5 +1,6 @@ from __future__ import absolute_import import sys + sys.path.append("install/lib-dolphinnpy") # noqa import numpy from dolphinn import Dolphinn @@ -9,7 +10,7 @@ class DolphinnPy(BaseANN): def __init__(self, num_probes): - self.name = 'Dolphinn(num_probes={} )'.format(num_probes) + self.name = "Dolphinn(num_probes={} )".format(num_probes) self.num_probes = num_probes self.m = 1 self._index = None diff --git a/ann_benchmarks/algorithms/dummy_algo.py b/ann_benchmarks/algorithms/dummy_algo.py index 8bd39aeda..939da89b3 100644 --- a/ann_benchmarks/algorithms/dummy_algo.py +++ b/ann_benchmarks/algorithms/dummy_algo.py @@ -5,7 +5,7 @@ class DummyAlgoMt(BaseANN): def __init__(self, metric): - self.name = 'DummyAlgoMultiThread' + self.name = "DummyAlgoMultiThread" def fit(self, X): self.len = len(X) - 1 @@ -16,7 +16,7 @@ def query(self, v, n): class DummyAlgoSt(BaseANN): def __init__(self, metric): - self.name = 'DummyAlgoSingleThread' + self.name = 
"DummyAlgoSingleThread" def fit(self, X): self.len = len(X) - 1 diff --git a/ann_benchmarks/algorithms/elasticsearch.py b/ann_benchmarks/algorithms/elasticsearch.py index 95eccab09..e7d037554 100644 --- a/ann_benchmarks/algorithms/elasticsearch.py +++ b/ann_benchmarks/algorithms/elasticsearch.py @@ -20,6 +20,7 @@ # logging.basicConfig(level=logging.INFO) # logging.getLogger("elasticsearch").setLevel(logging.INFO) + def es_wait(): print("Waiting for elasticsearch health endpoint...") req = Request("http://localhost:9200/_cluster/health?wait_for_status=yellow&timeout=1s") @@ -50,9 +51,9 @@ def __init__(self, metric: str, dimension: int): self.es = Elasticsearch(["http://localhost:9200"]) self.batch_res = [] if self.metric == "euclidean": - self.script = "1 / (1 + l2norm(params.query_vec, \"vec\"))" + self.script = '1 / (1 + l2norm(params.query_vec, "vec"))' elif self.metric == "angular": - self.script = "1.0 + cosineSimilarity(params.query_vec, \"vec\")" + self.script = '1.0 + cosineSimilarity(params.query_vec, "vec")' else: raise NotImplementedError(f"Not implemented for metric {self.metric}") es_wait() @@ -60,17 +61,14 @@ def __init__(self, metric: str, dimension: int): def fit(self, X): body = dict(settings=dict(number_of_shards=1, number_of_replicas=0)) mapping = dict( - properties=dict( - id=dict(type="keyword", store=True), - vec=dict(type="dense_vector", dims=self.dimension) - ) + properties=dict(id=dict(type="keyword", store=True), vec=dict(type="dense_vector", dims=self.dimension)) ) self.es.indices.create(self.index, body=body) self.es.indices.put_mapping(mapping, self.index) def gen(): for i, vec in enumerate(X): - yield { "_op_type": "index", "_index": self.index, "vec": vec.tolist(), 'id': str(i + 1) } + yield {"_op_type": "index", "_index": self.index, "vec": vec.tolist(), "id": str(i + 1)} (_, errors) = bulk(self.es, gen(), chunk_size=500, max_retries=9) assert len(errors) == 0, errors @@ -82,21 +80,23 @@ def query(self, q, n): body = dict( query=dict( script_score=dict( - query=dict(match_all=dict()), - script=dict( - source=self.script, - params=dict(query_vec=q.tolist()) - ) + query=dict(match_all=dict()), script=dict(source=self.script, params=dict(query_vec=q.tolist())) ) ) ) - res = self.es.search(index=self.index, body=body, size=n, _source=False, docvalue_fields=['id'], - stored_fields="_none_", filter_path=["hits.hits.fields.id"]) - return [int(h['fields']['id'][0]) - 1 for h in res['hits']['hits']] + res = self.es.search( + index=self.index, + body=body, + size=n, + _source=False, + docvalue_fields=["id"], + stored_fields="_none_", + filter_path=["hits.hits.fields.id"], + ) + return [int(h["fields"]["id"][0]) - 1 for h in res["hits"]["hits"]] def batch_query(self, X, n): self.batch_res = [self.query(q, n) for q in X] def get_batch_results(self): return self.batch_res - diff --git a/ann_benchmarks/algorithms/elastiknn.py b/ann_benchmarks/algorithms/elastiknn.py index 8add0d8f2..d27a5ad5f 100644 --- a/ann_benchmarks/algorithms/elastiknn.py +++ b/ann_benchmarks/algorithms/elastiknn.py @@ -39,15 +39,15 @@ def es_wait(): def dealias_metric(metric: str) -> str: mlower = metric.lower() - if mlower == 'euclidean': - return 'l2' - elif mlower == 'angular': - return 'cosine' + if mlower == "euclidean": + return "l2" + elif mlower == "angular": + return "cosine" else: return mlower -class Exact(BaseANN): +class Exact(BaseANN): def __init__(self, metric: str, dimension: int): self.name = f"eknn-exact-metric={metric}_dimension={dimension}" self.metric = metric @@ -61,19 +61,19 @@ 
def _handle_sparse(self, X): return [Vec.SparseBool(x, self.dimension) for x in X] def fit(self, X): - if self.metric in {'jaccard', 'hamming'}: + if self.metric in {"jaccard", "hamming"}: return self.model.fit(self._handle_sparse(X), shards=1)[0] else: return self.model.fit(X, shards=1) def query(self, q, n): - if self.metric in {'jaccard', 'hamming'}: + if self.metric in {"jaccard", "hamming"}: return self.model.kneighbors(self._handle_sparse([q]), n)[0] else: return self.model.kneighbors(np.expand_dims(q, 0), n)[0] def batch_query(self, X, n): - if self.metric in {'jaccard', 'hamming'}: + if self.metric in {"jaccard", "hamming"}: self.batch_res = self.model.kneighbors(self._handle_sparse(X), n) else: self.batch_res = self.model.kneighbors(X, n) @@ -83,7 +83,6 @@ def get_batch_results(self): class L2Lsh(BaseANN): - def __init__(self, L: int, k: int, w: int): self.name_prefix = f"eknn-l2lsh-L={L}-k={k}-w={w}" self.name = None # set based on query args. @@ -112,7 +111,7 @@ def set_query_arguments(self, candidates: int, probes: int): self.sum_query_dur = 0 def query(self, q, n): - + t0 = perf_counter() res = self.model.kneighbors(np.expand_dims(q, 0), n, return_similarity=False)[0] dur = perf_counter() - t0 @@ -121,7 +120,9 @@ def query(self, q, n): self.sum_query_dur += dur self.num_queries += 1 if self.num_queries > 500 and self.num_queries / self.sum_query_dur < 50: - raise Exception("Throughput after 500 queries is less than 50 q/s. Giving up to avoid wasteful computation.") + raise Exception( + "Throughput after 500 queries is less than 50 q/s. Giving up to avoid wasteful computation." + ) elif res[-2:].sum() < 0: raise Exception(f"Model returned fewer than {n} neighbors. Giving up to avoid wasteful computation.") else: diff --git a/ann_benchmarks/algorithms/faiss.py b/ann_benchmarks/algorithms/faiss.py index 9d6244400..008386ecc 100644 --- a/ann_benchmarks/algorithms/faiss.py +++ b/ann_benchmarks/algorithms/faiss.py @@ -1,5 +1,6 @@ from __future__ import absolute_import import sys + sys.path.append("install/lib-faiss") # noqa import numpy import sklearn.preprocessing @@ -9,14 +10,13 @@ class Faiss(BaseANN): def query(self, v, n): - if self._metric == 'angular': + if self._metric == "angular": v /= numpy.linalg.norm(v) - D, I = self.index.search(numpy.expand_dims( - v, axis=0).astype(numpy.float32), n) + D, I = self.index.search(numpy.expand_dims(v, axis=0).astype(numpy.float32), n) return I[0] def batch_query(self, X, n): - if self._metric == 'angular': + if self._metric == "angular": X /= numpy.linalg.norm(X) self.res = self.index.search(X.astype(numpy.float32), n) @@ -37,7 +37,7 @@ def __init__(self, metric, n_bits): self._n_bits = n_bits self.index = None self._metric = metric - self.name = 'FaissLSH(n_bits={})'.format(self._n_bits) + self.name = "FaissLSH(n_bits={})".format(self._n_bits) def fit(self, X): if X.dtype != numpy.float32: @@ -54,15 +54,14 @@ def __init__(self, metric, n_list): self._metric = metric def fit(self, X): - if self._metric == 'angular': - X = sklearn.preprocessing.normalize(X, axis=1, norm='l2') + if self._metric == "angular": + X = sklearn.preprocessing.normalize(X, axis=1, norm="l2") if X.dtype != numpy.float32: X = X.astype(numpy.float32) self.quantizer = faiss.IndexFlatL2(X.shape[1]) - index = faiss.IndexIVFFlat( - self.quantizer, X.shape[1], self._n_list, faiss.METRIC_L2) + index = faiss.IndexIVFFlat(self.quantizer, X.shape[1], self._n_list, faiss.METRIC_L2) index.train(X) index.add(X) self.index = index @@ -73,12 +72,10 @@ def set_query_arguments(self, 
n_probe): self.index.nprobe = self._n_probe def get_additional(self): - return {"dist_comps": faiss.cvar.indexIVF_stats.ndis + # noqa - faiss.cvar.indexIVF_stats.nq * self._n_list} + return {"dist_comps": faiss.cvar.indexIVF_stats.ndis + faiss.cvar.indexIVF_stats.nq * self._n_list} # noqa def __str__(self): - return 'FaissIVF(n_list=%d, n_probe=%d)' % (self._n_list, - self._n_probe) + return "FaissIVF(n_list=%d, n_probe=%d)" % (self._n_list, self._n_probe) class FaissIVFPQfs(Faiss): @@ -89,11 +86,11 @@ def __init__(self, metric, n_list): def fit(self, X): if X.dtype != numpy.float32: X = X.astype(numpy.float32) - if self._metric == 'angular': + if self._metric == "angular": faiss.normalize_L2(X) d = X.shape[1] - faiss_metric = faiss.METRIC_INNER_PRODUCT if self._metric == 'angular' else faiss.METRIC_L2 + faiss_metric = faiss.METRIC_INNER_PRODUCT if self._metric == "angular" else faiss.METRIC_L2 factory_string = f"IVF{self._n_list},PQ{d//2}x4fs" index = faiss.index_factory(d, factory_string, faiss_metric) index.train(X) @@ -114,10 +111,7 @@ def set_query_arguments(self, n_probe, k_reorder): self.index = self.refine_index def get_additional(self): - return {"dist_comps": faiss.cvar.indexIVF_stats.ndis + # noqa - faiss.cvar.indexIVF_stats.nq * self._n_list} + return {"dist_comps": faiss.cvar.indexIVF_stats.ndis + faiss.cvar.indexIVF_stats.nq * self._n_list} # noqa def __str__(self): - return 'FaissIVFPQfs(n_list=%d, n_probe=%d, k_reorder=%d)' % (self._n_list, - self._n_probe, - self._k_reorder) + return "FaissIVFPQfs(n_list=%d, n_probe=%d, k_reorder=%d)" % (self._n_list, self._n_probe, self._k_reorder) diff --git a/ann_benchmarks/algorithms/faiss_gpu.py b/ann_benchmarks/algorithms/faiss_gpu.py index b30423abc..0754ae19d 100644 --- a/ann_benchmarks/algorithms/faiss_gpu.py +++ b/ann_benchmarks/algorithms/faiss_gpu.py @@ -1,5 +1,6 @@ from __future__ import absolute_import import sys + # Assumes local installation of FAISS sys.path.append("faiss") # noqa import numpy @@ -12,8 +13,7 @@ class FaissGPU(BaseANN): def __init__(self, n_bits, n_probes): - self.name = 'FaissGPU(n_bits={}, n_probes={})'.format( - n_bits, n_probes) + self.name = "FaissGPU(n_bits={}, n_probes={})".format(n_bits, n_probes) self._n_bits = n_bits self._n_probes = n_probes self._res = faiss.StandardGpuResources() @@ -21,8 +21,7 @@ def __init__(self, n_bits, n_probes): def fit(self, X): X = X.astype(numpy.float32) - self._index = faiss.GpuIndexIVFFlat(self._res, len(X[0]), self._n_bits, - faiss.METRIC_L2) + self._index = faiss.GpuIndexIVFFlat(self._res, len(X[0]), self._n_bits, faiss.METRIC_L2) # self._index = faiss.index_factory(len(X[0]), # "IVF%d,Flat" % self._n_bits) # co = faiss.GpuClonerOptions() diff --git a/ann_benchmarks/algorithms/faiss_hnsw.py b/ann_benchmarks/algorithms/faiss_hnsw.py index 38414dfc2..da9948b0d 100644 --- a/ann_benchmarks/algorithms/faiss_hnsw.py +++ b/ann_benchmarks/algorithms/faiss_hnsw.py @@ -14,7 +14,7 @@ def fit(self, X): self.index.hnsw.efConstruction = self.method_param["efConstruction"] self.index.verbose = True - if self._metric == 'angular': + if self._metric == "angular": X = X / np.linalg.norm(X, axis=1)[:, np.newaxis] if X.dtype != np.float32: X = X.astype(np.float32) @@ -30,7 +30,7 @@ def get_additional(self): return {"dist_comps": faiss.cvar.hnsw_stats.ndis} def __str__(self): - return 'faiss (%s, ef: %d)' % (self.method_param, self.index.hnsw.efSearch) + return "faiss (%s, ef: %d)" % (self.method_param, self.index.hnsw.efSearch) def freeIndex(self): del self.p diff --git 
a/ann_benchmarks/algorithms/flann.py b/ann_benchmarks/algorithms/flann.py index 69790d9cc..9c8a7a8ec 100644 --- a/ann_benchmarks/algorithms/flann.py +++ b/ann_benchmarks/algorithms/flann.py @@ -8,20 +8,18 @@ class FLANN(BaseANN): def __init__(self, metric, target_precision): self._target_precision = target_precision - self.name = 'FLANN(target_precision=%f)' % self._target_precision + self.name = "FLANN(target_precision=%f)" % self._target_precision self._metric = metric def fit(self, X): - self._flann = pyflann.FLANN( - target_precision=self._target_precision, - algorithm='autotuned', log_level='info') - if self._metric == 'angular': - X = sklearn.preprocessing.normalize(X, axis=1, norm='l2') + self._flann = pyflann.FLANN(target_precision=self._target_precision, algorithm="autotuned", log_level="info") + if self._metric == "angular": + X = sklearn.preprocessing.normalize(X, axis=1, norm="l2") self._flann.build_index(X) def query(self, v, n): - if self._metric == 'angular': - v = sklearn.preprocessing.normalize([v], axis=1, norm='l2')[0] + if self._metric == "angular": + v = sklearn.preprocessing.normalize([v], axis=1, norm="l2")[0] if v.dtype != numpy.float32: v = v.astype(numpy.float32) return self._flann.nn_index(v, n)[0][0] diff --git a/ann_benchmarks/algorithms/hnswlib.py b/ann_benchmarks/algorithms/hnswlib.py index 8526eaef4..9000517e0 100644 --- a/ann_benchmarks/algorithms/hnswlib.py +++ b/ann_benchmarks/algorithms/hnswlib.py @@ -6,18 +6,18 @@ class HnswLib(BaseANN): def __init__(self, metric, method_param): - self.metric = {'angular': 'cosine', 'euclidean': 'l2'}[metric] + self.metric = {"angular": "cosine", "euclidean": "l2"}[metric] self.method_param = method_param # print(self.method_param,save_index,query_param) # self.ef=query_param['ef'] - self.name = 'hnswlib (%s)' % (self.method_param) + self.name = "hnswlib (%s)" % (self.method_param) def fit(self, X): # Only l2 is supported currently self.p = hnswlib.Index(space=self.metric, dim=len(X[0])) - self.p.init_index(max_elements=len(X), - ef_construction=self.method_param["efConstruction"], - M=self.method_param["M"]) + self.p.init_index( + max_elements=len(X), ef_construction=self.method_param["efConstruction"], M=self.method_param["M"] + ) data_labels = np.arange(len(X)) self.p.add_items(np.asarray(X), data_labels) self.p.set_num_threads(1) diff --git a/ann_benchmarks/algorithms/kdtree.py b/ann_benchmarks/algorithms/kdtree.py index 6048ecd6a..11ca2926d 100644 --- a/ann_benchmarks/algorithms/kdtree.py +++ b/ann_benchmarks/algorithms/kdtree.py @@ -8,15 +8,15 @@ class KDTree(BaseANN): def __init__(self, metric, leaf_size=20): self._leaf_size = leaf_size self._metric = metric - self.name = 'KDTree(leaf_size=%d)' % self._leaf_size + self.name = "KDTree(leaf_size=%d)" % self._leaf_size def fit(self, X): - if self._metric == 'angular': - X = sklearn.preprocessing.normalize(X, axis=1, norm='l2') + if self._metric == "angular": + X = sklearn.preprocessing.normalize(X, axis=1, norm="l2") self._tree = sklearn.neighbors.KDTree(X, leaf_size=self._leaf_size) def query(self, v, n): - if self._metric == 'angular': - v = sklearn.preprocessing.normalize([v], axis=1, norm='l2')[0] + if self._metric == "angular": + v = sklearn.preprocessing.normalize([v], axis=1, norm="l2")[0] dist, ind = self._tree.query([v], k=n) return ind[0] diff --git a/ann_benchmarks/algorithms/kgraph.py b/ann_benchmarks/algorithms/kgraph.py index fa14e2e17..0f8148ffa 100644 --- a/ann_benchmarks/algorithms/kgraph.py +++ b/ann_benchmarks/algorithms/kgraph.py @@ -9,7 +9,7 @@ 
class KGraph(BaseANN): def __init__(self, metric, index_params, save_index): metric = str(metric) - self.name = 'KGraph(%s)' % (metric) + self.name = "KGraph(%s)" % (metric) self._metric = metric self._index_params = index_params self._save_index = save_index @@ -18,7 +18,7 @@ def fit(self, X): if X.dtype != numpy.float32: X = X.astype(numpy.float32) self._kgraph = pykgraph.KGraph(X, self._metric) - path = os.path.join(INDEX_DIR, 'kgraph-index-%s' % self._metric) + path = os.path.join(INDEX_DIR, "kgraph-index-%s" % self._metric) if os.path.exists(path): self._kgraph.load(path) else: @@ -34,6 +34,5 @@ def set_query_arguments(self, P): def query(self, v, n): if v.dtype != numpy.float32: v = v.astype(numpy.float32) - result = self._kgraph.search( - numpy.array([v]), K=n, threads=1, P=self._P) + result = self._kgraph.search(numpy.array([v]), K=n, threads=1, P=self._P) return result[0] diff --git a/ann_benchmarks/algorithms/lshf.py b/ann_benchmarks/algorithms/lshf.py index 59a59dfb1..a1a8db2cc 100644 --- a/ann_benchmarks/algorithms/lshf.py +++ b/ann_benchmarks/algorithms/lshf.py @@ -6,20 +6,18 @@ class LSHF(BaseANN): def __init__(self, metric, n_estimators=10, n_candidates=50): - self.name = 'LSHF(n_est=%d, n_cand=%d)' % (n_estimators, n_candidates) + self.name = "LSHF(n_est=%d, n_cand=%d)" % (n_estimators, n_candidates) self._metric = metric self._n_estimators = n_estimators self._n_candidates = n_candidates def fit(self, X): - self._lshf = sklearn.neighbors.LSHForest( - n_estimators=self._n_estimators, n_candidates=self._n_candidates) - if self._metric == 'angular': - X = sklearn.preprocessing.normalize(X, axis=1, norm='l2') + self._lshf = sklearn.neighbors.LSHForest(n_estimators=self._n_estimators, n_candidates=self._n_candidates) + if self._metric == "angular": + X = sklearn.preprocessing.normalize(X, axis=1, norm="l2") self._lshf.fit(X) def query(self, v, n): - if self._metric == 'angular': - v = sklearn.preprocessing.normalize([v], axis=1, norm='l2')[0] - return self._lshf.kneighbors([v], return_distance=False, - n_neighbors=n)[0] + if self._metric == "angular": + v = sklearn.preprocessing.normalize([v], axis=1, norm="l2")[0] + return self._lshf.kneighbors([v], return_distance=False, n_neighbors=n)[0] diff --git a/ann_benchmarks/algorithms/luceneknn.py b/ann_benchmarks/algorithms/luceneknn.py index 6941d636b..42bfa5b62 100644 --- a/ann_benchmarks/algorithms/luceneknn.py +++ b/ann_benchmarks/algorithms/luceneknn.py @@ -21,6 +21,7 @@ class Codec(PyLucene94Codec): """ Custom codec so that the appropriate Lucene94 codec can be returned with the configured M and efConstruction """ + def __init__(self, M, efConstruction): super(Codec, self).__init__() self.M = M @@ -37,15 +38,16 @@ class PyLuceneKNN(BaseANN): def __init__(self, metric: str, dimension: int, param): try: - lucene.initVM(vmargs=['-Djava.awt.headless=true -Xmx6g -Xms6g']) + lucene.initVM(vmargs=["-Djava.awt.headless=true -Xmx6g -Xms6g"]) except ValueError: - print('VM already initialized') + print("VM already initialized") self.metric = metric self.dimension = dimension self.param = param self.short_name = f"luceneknn-{param['M']}-{param['efConstruction']}" - self.simFunc = VectorSimilarityFunction.DOT_PRODUCT if self.metric == "angular" \ - else VectorSimilarityFunction.EUCLIDEAN + self.simFunc = ( + VectorSimilarityFunction.DOT_PRODUCT if self.metric == "angular" else VectorSimilarityFunction.EUCLIDEAN + ) if self.metric not in ("euclidean", "angular"): raise NotImplementedError(f"Not implemented for metric {self.metric}") @@ 
-56,10 +58,10 @@ def done(self): def fit(self, X): if self.dimension != X.shape[1]: raise Exception(f"Configured dimension {self.dimension} but data has shape {X.shape}") - if self.metric == 'angular': - X = sklearn.preprocessing.normalize(X, axis=1, norm='l2') + if self.metric == "angular": + X = sklearn.preprocessing.normalize(X, axis=1, norm="l2") iwc = IndexWriterConfig().setOpenMode(IndexWriterConfig.OpenMode.CREATE) - codec = Codec(self.param['M'], self.param['efConstruction']) + codec = Codec(self.param["M"], self.param["efConstruction"]) iwc.setCodec(codec) iwc.setRAMBufferSizeMB(1994.0) self.dir = FSDirectory.open(Paths.get(self.short_name + ".index")) @@ -70,7 +72,7 @@ def fit(self, X): X = X.tolist() for x in X: doc = Document() - doc.add(KnnVectorField("knn", JArray('float')(x), fieldType)) + doc.add(KnnVectorField("knn", JArray("float")(x), fieldType)) doc.add(StoredField("id", id)) iw.addDocument(doc) id += 1 @@ -95,9 +97,9 @@ def run_knn_query_inner(self, num_candidates, n, q): return [int(self.searcher.doc(d.doc).get("id")) for d in topdocs.scoreDocs] def prepare_query(self, q, n): - if self.metric == 'angular': + if self.metric == "angular": q = q / np.linalg.norm(q) - self.q = JArray('float')(q.tolist()) + self.q = JArray("float")(q.tolist()) self.n = n def get_prepared_query_results(self): @@ -107,9 +109,9 @@ def run_prepared_query(self): self.res = self.run_knn_query_inner(self.ef, self.n, self.q) def prepare_batch_query(self, X, n): - if self.metric == 'angular': - X = sklearn.preprocessing.normalize(X, axis=1, norm='l2') - self.queries = [JArray('float')(q) for q in X.tolist()] + if self.metric == "angular": + X = sklearn.preprocessing.normalize(X, axis=1, norm="l2") + self.queries = [JArray("float")(q) for q in X.tolist()] self.n = n def run_batch_query(self): diff --git a/ann_benchmarks/algorithms/milvus.py b/ann_benchmarks/algorithms/milvus.py index 55440ae85..e188e0f65 100644 --- a/ann_benchmarks/algorithms/milvus.py +++ b/ann_benchmarks/algorithms/milvus.py @@ -5,7 +5,7 @@ def metric_mapping(_metric: str): - _metric_type = {'angular': 'cosine', 'euclidean': 'l2'}.get(_metric, None) + _metric_type = {"angular": "cosine", "euclidean": "l2"}.get(_metric, None) if _metric_type is None: raise Exception(f"[Milvus] Not support metric type: {_metric}!!!") return _metric_type @@ -28,8 +28,7 @@ def __init__(self, metric, dim, index_param): self.client = None def fit(self, X): - self.client = pyknowhere.Index( - self._metric_type, self._dim, len(X), self._index_m, self._index_ef) + self.client = pyknowhere.Index(self._metric_type, self._dim, len(X), self._index_m, self._index_ef) self.client.add(X, numpy.arange(len(X))) def set_query_arguments(self, ef): diff --git a/ann_benchmarks/algorithms/mrpt.py b/ann_benchmarks/algorithms/mrpt.py index e63f70e65..63305320a 100644 --- a/ann_benchmarks/algorithms/mrpt.py +++ b/ann_benchmarks/algorithms/mrpt.py @@ -13,12 +13,11 @@ def __init__(self, metric, count): def fit(self, X): if X.dtype != numpy.float32: X = X.astype(numpy.float32) - if self._metric == 'angular': - X = sklearn.preprocessing.normalize(X, axis=1, norm='l2') + if self._metric == "angular": + X = sklearn.preprocessing.normalize(X, axis=1, norm="l2") self._index_autotuned = mrpt.MRPTIndex(X) - self._index_autotuned.build_autotune_sample( - target_recall=None, k=self._k, n_test=1000) + self._index_autotuned.build_autotune_sample(target_recall=None, k=self._k, n_test=1000) def set_query_arguments(self, target_recall): self._target_recall = target_recall @@ -28,14 +27,16 
@@ def set_query_arguments(self, target_recall): def query(self, v, n): if v.dtype != numpy.float32: v = v.astype(numpy.float32) - if self._metric == 'angular': - v = sklearn.preprocessing.normalize( - v.reshape(1, -1), axis=1, norm='l2').flatten() + if self._metric == "angular": + v = sklearn.preprocessing.normalize(v.reshape(1, -1), axis=1, norm="l2").flatten() return self._index.ann(v) def __str__(self): - str_template = ('MRPT(target recall=%.3f, trees=%d, depth=%d, vote ' - 'threshold=%d, estimated recall=%.3f)') - return str_template % (self._target_recall, self._par['n_trees'], - self._par['depth'], self._par['votes'], - self._par['estimated_recall']) + str_template = "MRPT(target recall=%.3f, trees=%d, depth=%d, vote " "threshold=%d, estimated recall=%.3f)" + return str_template % ( + self._target_recall, + self._par["n_trees"], + self._par["depth"], + self._par["votes"], + self._par["estimated_recall"], + ) diff --git a/ann_benchmarks/algorithms/n2.py b/ann_benchmarks/algorithms/n2.py index d4b5b5974..604f62097 100644 --- a/ann_benchmarks/algorithms/n2.py +++ b/ann_benchmarks/algorithms/n2.py @@ -6,9 +6,9 @@ class N2(BaseANN): def __init__(self, metric, method_param): self._metric = metric - self._m = method_param['M'] + self._m = method_param["M"] self._m0 = self._m * 2 - self._ef_construction = method_param['efConstruction'] + self._ef_construction = method_param["efConstruction"] self._n_threads = 1 self._ef_search = -1 @@ -16,7 +16,13 @@ def fit(self, X): self._n2 = n2.HnswIndex(X.shape[1], self._metric) for x in X: self._n2.add_data(x) - self._n2.build(m=self._m, max_m0=self._m0, ef_construction=self._ef_construction, n_threads=self._n_threads, graph_merging='merge_level0') + self._n2.build( + m=self._m, + max_m0=self._m0, + ef_construction=self._ef_construction, + n_threads=self._n_threads, + graph_merging="merge_level0", + ) def set_query_arguments(self, ef): self._ef_search = ef diff --git a/ann_benchmarks/algorithms/nearpy.py b/ann_benchmarks/algorithms/nearpy.py index e991141eb..d047da092 100644 --- a/ann_benchmarks/algorithms/nearpy.py +++ b/ann_benchmarks/algorithms/nearpy.py @@ -11,31 +11,23 @@ def __init__(self, metric, n_bits, hash_counts): self._hash_counts = hash_counts self._metric = metric self._filter = NearestFilter(10) - self.name = 'NearPy(n_bits=%d, hash_counts=%d)' % ( - self._n_bits, self._hash_counts) + self.name = "NearPy(n_bits=%d, hash_counts=%d)" % (self._n_bits, self._hash_counts) def fit(self, X): hashes = [] for k in range(self._hash_counts): - nearpy_rbp = nearpy.hashes.RandomBinaryProjections( - 'rbp_%d' % k, self._n_bits) + nearpy_rbp = nearpy.hashes.RandomBinaryProjections("rbp_%d" % k, self._n_bits) hashes.append(nearpy_rbp) - if self._metric == 'euclidean': + if self._metric == "euclidean": dist = nearpy.distances.EuclideanDistance() - self._nearpy_engine = nearpy.Engine( - X.shape[1], - lshashes=hashes, - distance=dist) + self._nearpy_engine = nearpy.Engine(X.shape[1], lshashes=hashes, distance=dist) else: # Default (angular) = Cosine distance - self._nearpy_engine = nearpy.Engine( - X.shape[1], - lshashes=hashes, - vector_filters=[self._filter]) + self._nearpy_engine = nearpy.Engine(X.shape[1], lshashes=hashes, vector_filters=[self._filter]) - if self._metric == 'angular': - X = sklearn.preprocessing.normalize(X, axis=1, norm='l2') + if self._metric == "angular": + X = sklearn.preprocessing.normalize(X, axis=1, norm="l2") for i, x in enumerate(X): self._nearpy_engine.store_vector(x, i) @@ -43,6 +35,6 @@ def query(self, v, n): # XXX: This 
feels like an unpleasant hack, but it's not clear how to do # better without making changes to NearPy self._filter.N = n - if self._metric == 'angular': - v = sklearn.preprocessing.normalize([v], axis=1, norm='l2')[0] + if self._metric == "angular": + v = sklearn.preprocessing.normalize([v], axis=1, norm="l2")[0] return [y for x, y, z in self._nearpy_engine.neighbours(v)] diff --git a/ann_benchmarks/algorithms/nmslib.py b/ann_benchmarks/algorithms/nmslib.py index b7cd437c1..c41fd991a 100644 --- a/ann_benchmarks/algorithms/nmslib.py +++ b/ann_benchmarks/algorithms/nmslib.py @@ -6,64 +6,64 @@ from ann_benchmarks.constants import INDEX_DIR from ann_benchmarks.algorithms.base import BaseANN + def sparse_matrix_to_str(matrix): result = [] matrix = matrix.tocsr() matrix.sort_indices() for row in range(matrix.shape[0]): arr = [k for k in matrix.indices[matrix.indptr[row] : matrix.indptr[row + 1]]] - result.append(' '.join([str(k) for k in arr])) + result.append(" ".join([str(k) for k in arr])) return result + def dense_vector_to_str(vector): if vector.dtype == np.bool_: indices = np.flatnonzero(vector) else: indices = vector - result = ' '.join([str(k) for k in indices]) + result = " ".join([str(k) for k in indices]) return result + class NmslibReuseIndex(BaseANN): @staticmethod def encode(d): return ["%s=%s" % (a, b) for (a, b) in d.items()] def __init__(self, metric, method_name, index_param, query_param): - self._nmslib_metric = { - 'angular': 'cosinesimil', 'euclidean': 'l2', 'jaccard': 'jaccard_sparse'}[metric] + self._nmslib_metric = {"angular": "cosinesimil", "euclidean": "l2", "jaccard": "jaccard_sparse"}[metric] self._method_name = method_name self._save_index = False self._index_param = NmslibReuseIndex.encode(index_param) if query_param is not False: self._query_param = NmslibReuseIndex.encode(query_param) - self.name = ('Nmslib(method_name={}, index_param={}, ' - 'query_param={})'.format(self._method_name, - self._index_param, - self._query_param)) + self.name = "Nmslib(method_name={}, index_param={}, " "query_param={})".format( + self._method_name, self._index_param, self._query_param + ) else: self._query_param = None - self.name = 'Nmslib(method_name=%s, index_param=%s)' % ( - self._method_name, self._index_param) + self.name = "Nmslib(method_name=%s, index_param=%s)" % (self._method_name, self._index_param) - self._index_name = os.path.join(INDEX_DIR, "nmslib_%s_%s_%s" % ( - self._method_name, metric, '_'.join(self._index_param))) + self._index_name = os.path.join( + INDEX_DIR, "nmslib_%s_%s_%s" % (self._method_name, metric, "_".join(self._index_param)) + ) d = os.path.dirname(self._index_name) if not os.path.exists(d): os.makedirs(d) def fit(self, X): - if self._method_name == 'vptree': + if self._method_name == "vptree": # To avoid this issue: terminate called after throwing an instance # of 'std::runtime_error' # what(): The data size is too small or the bucket size is too # big. 
Select the parameters so that is NOT # less than * 1000 # Aborted (core dumped) - self._index_param.append('bucketSize=%d' % - min(int(len(X) * 0.0005), 1000)) + self._index_param.append("bucketSize=%d" % min(int(len(X) * 0.0005), 1000)) - if self._nmslib_metric == 'jaccard_sparse': + if self._nmslib_metric == "jaccard_sparse": self._index = nmslib.init( space=self._nmslib_metric, method=self._method_name, @@ -82,12 +82,11 @@ def fit(self, X): string_data = sparse_matrix_to_str(sparse_matrix) self._index.addDataPointBatch(string_data) else: - self._index = nmslib.init( - space=self._nmslib_metric, method=self._method_name) + self._index = nmslib.init(space=self._nmslib_metric, method=self._method_name) self._index.addDataPointBatch(X) if os.path.exists(self._index_name): - print('Loading index from file') + print("Loading index from file") self._index.loadIndex(self._index_name) else: self._index.createIndex(self._index_param) @@ -97,11 +96,11 @@ def fit(self, X): self._index.setQueryTimeParams(self._query_param) def set_query_arguments(self, ef): - if self._method_name == 'hnsw' or self._method_name == 'sw-graph': + if self._method_name == "hnsw" or self._method_name == "sw-graph": self._index.setQueryTimeParams(["efSearch=%s" % (ef)]) def query(self, v, n): - if self._nmslib_metric == 'jaccard_sparse': + if self._nmslib_metric == "jaccard_sparse": v_string = dense_vector_to_str(v) ids, distances = self._index.knnQuery(v_string, n) else: @@ -109,7 +108,7 @@ def query(self, v, n): return ids def batch_query(self, X, n): - if self._nmslib_metric == 'jaccard_sparse': + if self._nmslib_metric == "jaccard_sparse": sparse_matrix = scipy.sparse.csr_matrix(X) string_data = sparse_matrix_to_str(sparse_matrix) self.res = self._index.knnQueryBatch(string_data, n) diff --git a/ann_benchmarks/algorithms/onng_ngt.py b/ann_benchmarks/algorithms/onng_ngt.py index 826e22e28..8255c0cca 100644 --- a/ann_benchmarks/algorithms/onng_ngt.py +++ b/ann_benchmarks/algorithms/onng_ngt.py @@ -8,85 +8,107 @@ class ONNG(BaseANN): def __init__(self, metric, object_type, epsilon, param): - metrics = {'euclidean': '2', 'angular': 'E'} - self._edge_size = int(param['edge']) - self._outdegree = int(param['outdegree']) - self._indegree = int(param['indegree']) + metrics = {"euclidean": "2", "angular": "E"} + self._edge_size = int(param["edge"]) + self._outdegree = int(param["outdegree"]) + self._indegree = int(param["indegree"]) self._metric = metrics[metric] self._object_type = object_type - self._edge_size_for_search = int(param['search_edge']) if 'search_edge' in param.keys() else 0 - self._tree_disabled = (param['tree'] is False) if 'tree' in param.keys() else False - self._refine_enabled = (param['refine'] is True) if 'refine' in param.keys() else False + self._edge_size_for_search = int(param["search_edge"]) if "search_edge" in param.keys() else 0 + self._tree_disabled = (param["tree"] is False) if "tree" in param.keys() else False + self._refine_enabled = (param["refine"] is True) if "refine" in param.keys() else False self._build_time_limit = 4 self._epsilon = epsilon - print('ONNG: edge_size=' + str(self._edge_size)) - print('ONNG: outdegree=' + str(self._outdegree)) - print('ONNG: indegree=' + str(self._indegree)) - print('ONNG: edge_size_for_search=' + str(self._edge_size_for_search)) - print('ONNG: epsilon=' + str(self._epsilon)) - print('ONNG: metric=' + metric) - print('ONNG: object_type=' + object_type) + print("ONNG: edge_size=" + str(self._edge_size)) + print("ONNG: outdegree=" + str(self._outdegree)) + 
print("ONNG: indegree=" + str(self._indegree)) + print("ONNG: edge_size_for_search=" + str(self._edge_size_for_search)) + print("ONNG: epsilon=" + str(self._epsilon)) + print("ONNG: metric=" + metric) + print("ONNG: object_type=" + object_type) def fit(self, X): - print('ONNG: start indexing...') + print("ONNG: start indexing...") dim = len(X[0]) - print('ONNG: # of data=' + str(len(X))) - print('ONNG: dimensionality=' + str(dim)) - index_dir = 'indexes' + print("ONNG: # of data=" + str(len(X))) + print("ONNG: dimensionality=" + str(dim)) + index_dir = "indexes" if not os.path.exists(index_dir): os.makedirs(index_dir) - index = os.path.join( - index_dir, - 'ONNG-{}-{}-{}'.format(self._edge_size, self._outdegree, - self._indegree)) - anngIndex = os.path.join(index_dir, 'ANNG-' + str(self._edge_size)) - print('ONNG: index=' + index) + index = os.path.join(index_dir, "ONNG-{}-{}-{}".format(self._edge_size, self._outdegree, self._indegree)) + anngIndex = os.path.join(index_dir, "ANNG-" + str(self._edge_size)) + print("ONNG: index=" + index) if (not os.path.exists(index)) and (not os.path.exists(anngIndex)): - print('ONNG: create ANNG') + print("ONNG: create ANNG") t = time.time() - args = ['ngt', 'create', '-it', '-p8', '-b500', '-ga', '-of', - '-D' + self._metric, '-d' + str(dim), - '-E' + str(self._edge_size), - '-S' + str(self._edge_size_for_search), - '-e' + str(self._epsilon), '-P0', '-B30', - '-T' + str(self._build_time_limit), anngIndex] + args = [ + "ngt", + "create", + "-it", + "-p8", + "-b500", + "-ga", + "-of", + "-D" + self._metric, + "-d" + str(dim), + "-E" + str(self._edge_size), + "-S" + str(self._edge_size_for_search), + "-e" + str(self._epsilon), + "-P0", + "-B30", + "-T" + str(self._build_time_limit), + anngIndex, + ] subprocess.call(args) idx = ngtpy.Index(path=anngIndex) idx.batch_insert(X, num_threads=24, debug=False) - print('ONNG: ANNG construction time(sec)=' + str(time.time() - t)) + print("ONNG: ANNG construction time(sec)=" + str(time.time() - t)) t = time.time() if self._refine_enabled: - idx.refine_anng(epsilon=self._epsilon, num_of_edges=self._edge_size, - num_of_explored_edges=self._edge_size_for_search) - print('ONNG: RNNG construction time(sec)=' + str(time.time() - t)) + idx.refine_anng( + epsilon=self._epsilon, + num_of_edges=self._edge_size, + num_of_explored_edges=self._edge_size_for_search, + ) + print("ONNG: RNNG construction time(sec)=" + str(time.time() - t)) idx.save() idx.close() if not os.path.exists(index): - print('ONNG: degree adjustment') + print("ONNG: degree adjustment") t = time.time() - args = ['ngt', 'reconstruct-graph', '-mS', - '-o ' + str(self._outdegree), - '-i ' + str(self._indegree), anngIndex, index] + args = [ + "ngt", + "reconstruct-graph", + "-mS", + "-o " + str(self._outdegree), + "-i " + str(self._indegree), + anngIndex, + index, + ] subprocess.call(args) - print('ONNG: degree adjustment time(sec)=' + str(time.time() - t)) + print("ONNG: degree adjustment time(sec)=" + str(time.time() - t)) if os.path.exists(index): - print('ONNG: index already exists! ' + str(index)) + print("ONNG: index already exists! 
" + str(index)) t = time.time() print(self._tree_disabled) self.index = ngtpy.Index(index, read_only=True, tree_disabled=self._tree_disabled) self.indexName = index - print('ONNG: open time(sec)=' + str(time.time() - t)) + print("ONNG: open time(sec)=" + str(time.time() - t)) else: - print('ONNG: something wrong.') - print('ONNG: end of fit') + print("ONNG: something wrong.") + print("ONNG: end of fit") def set_query_arguments(self, parameters): epsilon, edge_size = parameters print("ONNG: edge_size=" + str(edge_size)) print("ONNG: epsilon=" + str(epsilon)) - self.name = 'ONNG-NGT(%s, %s, %s, %s, %1.3f)' % ( - self._edge_size, self._outdegree, - self._indegree, edge_size, epsilon) + self.name = "ONNG-NGT(%s, %s, %s, %s, %1.3f)" % ( + self._edge_size, + self._outdegree, + self._indegree, + edge_size, + epsilon, + ) epsilon = epsilon - 1.0 self.index.set(epsilon=epsilon, edge_size=edge_size) @@ -94,4 +116,4 @@ def query(self, v, n): return self.index.search(v, n, with_distance=False) def freeIndex(self): - print('ONNG: free') + print("ONNG: free") diff --git a/ann_benchmarks/algorithms/opensearchknn.py b/ann_benchmarks/algorithms/opensearchknn.py index f1e8ebf6b..2fe636ec5 100644 --- a/ann_benchmarks/algorithms/opensearchknn.py +++ b/ann_benchmarks/algorithms/opensearchknn.py @@ -13,31 +13,27 @@ # Configure the logger. logging.getLogger("elasticsearch").setLevel(logging.WARN) + class OpenSearchKNN(BaseANN): def __init__(self, metric, dimension, method_param): self.metric = {"angular": "cosinesimil", "euclidean": "l2"}[metric] self.dimension = dimension self.method_param = method_param - self.param_string = "-".join(k+"-"+str(v) for k,v in self.method_param.items()).lower() + self.param_string = "-".join(k + "-" + str(v) for k, v in self.method_param.items()).lower() self.name = f"os-{self.param_string}" self.es = Elasticsearch(["http://localhost:9200"]) es_wait() def fit(self, X): body = { - "settings": { - "index": {"knn": True}, - "number_of_shards": 1, - "number_of_replicas": 0, - "refresh_interval": -1 - } + "settings": {"index": {"knn": True}, "number_of_shards": 1, "number_of_replicas": 0, "refresh_interval": -1} } mapping = { "properties": { "id": {"type": "keyword", "store": True}, "vec": { - "type": "knn_vector", + "type": "knn_vector", "dimension": self.dimension, "method": { "name": "hnsw", @@ -45,61 +41,60 @@ def fit(self, X): "engine": "nmslib", "parameters": { "ef_construction": self.method_param["efConstruction"], - "m": self.method_param["M"] - } - } - } + "m": self.method_param["M"], + }, + }, + }, } } - + self.es.indices.create(self.name, body=body) self.es.indices.put_mapping(mapping, self.name) print("Uploading data to the Index:", self.name) + def gen(): for i, vec in enumerate(tqdm(X)): - yield { "_op_type": "index", "_index": self.name, "vec": vec.tolist(), 'id': str(i + 1) } + yield {"_op_type": "index", "_index": self.name, "vec": vec.tolist(), "id": str(i + 1)} (_, errors) = bulk(self.es, gen(), chunk_size=500, max_retries=2, request_timeout=10) assert len(errors) == 0, errors - + print("Force Merge...") self.es.indices.forcemerge(self.name, max_num_segments=1, request_timeout=1000) - + print("Refreshing the Index...") self.es.indices.refresh(self.name, request_timeout=1000) - + print("Running Warmup API...") - res = urlopen(Request("http://localhost:9200/_plugins/_knn/warmup/"+self.name+"?pretty")) + res = urlopen(Request("http://localhost:9200/_plugins/_knn/warmup/" + self.name + "?pretty")) print(res.read().decode("utf-8")) def set_query_arguments(self, ef): - 
body = { - "settings": { - "index": {"knn.algo_param.ef_search": ef} - } - } + body = {"settings": {"index": {"knn.algo_param.ef_search": ef}}} self.es.indices.put_settings(body=body) def query(self, q, n): - body = { - "query": { - "knn": { - "vec": {"vector": q.tolist(), "k": n} - } - } - } + body = {"query": {"knn": {"vec": {"vector": q.tolist(), "k": n}}}} + + res = self.es.search( + index=self.name, + body=body, + size=n, + _source=False, + docvalue_fields=["id"], + stored_fields="_none_", + filter_path=["hits.hits.fields.id"], + request_timeout=10, + ) - res = self.es.search(index=self.name, body=body, size=n, _source=False, docvalue_fields=['id'], - stored_fields="_none_", filter_path=["hits.hits.fields.id"], request_timeout=10) - - return [int(h['fields']['id'][0]) - 1 for h in res['hits']['hits']] + return [int(h["fields"]["id"][0]) - 1 for h in res["hits"]["hits"]] def batch_query(self, X, n): self.batch_res = [self.query(q, n) for q in X] def get_batch_results(self): return self.batch_res - + def freeIndex(self): - self.es.indices.delete(index=self.name) \ No newline at end of file + self.es.indices.delete(index=self.name) diff --git a/ann_benchmarks/algorithms/panng_ngt.py b/ann_benchmarks/algorithms/panng_ngt.py index e3f7bdadb..027305cad 100644 --- a/ann_benchmarks/algorithms/panng_ngt.py +++ b/ann_benchmarks/algorithms/panng_ngt.py @@ -8,69 +8,67 @@ class PANNG(BaseANN): def __init__(self, metric, object_type, param): - metrics = {'euclidean': 'L2', 'angular': 'Cosine'} - self._edge_size = int(param['edge']) - self._pathadj_size = int(param['pathadj']) - self._edge_size_for_search = int(param['searchedge']) + metrics = {"euclidean": "L2", "angular": "Cosine"} + self._edge_size = int(param["edge"]) + self._pathadj_size = int(param["pathadj"]) + self._edge_size_for_search = int(param["searchedge"]) self._metric = metrics[metric] self._object_type = object_type - print('PANNG: edge_size=' + str(self._edge_size)) - print('PANNG: pathadj_size=' + str(self._pathadj_size)) - print('PANNG: edge_size_for_search=' + str(self._edge_size_for_search)) - print('PANNG: metric=' + metric) - print('PANNG: object_type=' + object_type) + print("PANNG: edge_size=" + str(self._edge_size)) + print("PANNG: pathadj_size=" + str(self._pathadj_size)) + print("PANNG: edge_size_for_search=" + str(self._edge_size_for_search)) + print("PANNG: metric=" + metric) + print("PANNG: object_type=" + object_type) def fit(self, X): - print('PANNG: start indexing...') + print("PANNG: start indexing...") dim = len(X[0]) - print('PANNG: # of data=' + str(len(X))) - print('PANNG: Dimensionality=' + str(dim)) - index_dir = 'indexes' + print("PANNG: # of data=" + str(len(X))) + print("PANNG: Dimensionality=" + str(dim)) + index_dir = "indexes" if not os.path.exists(index_dir): os.makedirs(index_dir) - index = os.path.join( - index_dir, - 'PANNG-' + str(self._edge_size) + '-' + str(self._pathadj_size)) + index = os.path.join(index_dir, "PANNG-" + str(self._edge_size) + "-" + str(self._pathadj_size)) print(index) if os.path.exists(index): - print('PANNG: index already exists! ' + str(index)) + print("PANNG: index already exists! 
" + str(index)) else: t0 = time.time() - ngtpy.create(path=index, dimension=dim, - edge_size_for_creation=self._edge_size, - distance_type=self._metric, - object_type=self._object_type) + ngtpy.create( + path=index, + dimension=dim, + edge_size_for_creation=self._edge_size, + distance_type=self._metric, + object_type=self._object_type, + ) idx = ngtpy.Index(path=index) idx.batch_insert(X, num_threads=24, debug=False) idx.save() idx.close() if self._pathadj_size > 0: - print('PANNG: path adjustment') - args = ['ngt', 'prune', '-s ' + str(self._pathadj_size), - index] + print("PANNG: path adjustment") + args = ["ngt", "prune", "-s " + str(self._pathadj_size), index] subprocess.call(args) indexingtime = time.time() - t0 - print('PANNG: indexing, adjustment and saving time(sec)={}' - .format(indexingtime)) + print("PANNG: indexing, adjustment and saving time(sec)={}".format(indexingtime)) t0 = time.time() self.index = ngtpy.Index(path=index, read_only=True) opentime = time.time() - t0 - print('PANNG: open time(sec)=' + str(opentime)) + print("PANNG: open time(sec)=" + str(opentime)) def set_query_arguments(self, epsilon): print("PANNG: epsilon=" + str(epsilon)) self._epsilon = epsilon - 1.0 - self.name = 'PANNG-NGT(%d, %d, %d, %1.3f)' % ( + self.name = "PANNG-NGT(%d, %d, %d, %1.3f)" % ( self._edge_size, self._pathadj_size, self._edge_size_for_search, - self._epsilon + 1.0) + self._epsilon + 1.0, + ) def query(self, v, n): - results = self.index.search( - v, n, self._epsilon, self._edge_size_for_search, - with_distance=False) + results = self.index.search(v, n, self._epsilon, self._edge_size_for_search, with_distance=False) return results def freeIndex(self): - print('PANNG: free') + print("PANNG: free") diff --git a/ann_benchmarks/algorithms/pgvector.py b/ann_benchmarks/algorithms/pgvector.py index 2fa1a9a51..5649fbac7 100644 --- a/ann_benchmarks/algorithms/pgvector.py +++ b/ann_benchmarks/algorithms/pgvector.py @@ -6,6 +6,7 @@ from ann_benchmarks.algorithms.base import BaseANN + class PGVector(BaseANN): def __init__(self, metric, lists): self._metric = metric @@ -24,9 +25,11 @@ def fit(self, X): copy.write_row((i, embedding)) print("creating index...") if self._metric == "angular": - cur.execute('CREATE INDEX ON items USING ivfflat (embedding vector_cosine_ops) WITH (lists = %d)' % self._lists) + cur.execute( + "CREATE INDEX ON items USING ivfflat (embedding vector_cosine_ops) WITH (lists = %d)" % self._lists + ) elif self._metric == "euclidean": - cur.execute('CREATE INDEX ON items USING ivfflat (embedding vector_l2_ops) WITH (lists = %d)' % self._lists) + cur.execute("CREATE INDEX ON items USING ivfflat (embedding vector_l2_ops) WITH (lists = %d)" % self._lists) else: raise RuntimeError(f"unknown metric {self._metric}") print("done!") diff --git a/ann_benchmarks/algorithms/puffinn.py b/ann_benchmarks/algorithms/puffinn.py index 5f372abc3..fd58bd7c9 100644 --- a/ann_benchmarks/algorithms/puffinn.py +++ b/ann_benchmarks/algorithms/puffinn.py @@ -6,11 +6,11 @@ from ann_benchmarks.algorithms.base import BaseANN import numpy + class Puffinn(BaseANN): - def __init__(self, metric, space=10**6, hash_function="fht_crosspolytope", hash_source='pool', hash_args=None): - if metric not in ['jaccard', 'angular']: - raise NotImplementedError( - "Puffinn doesn't support metric %s" % metric) + def __init__(self, metric, space=10**6, hash_function="fht_crosspolytope", hash_source="pool", hash_args=None): + if metric not in ["jaccard", "angular"]: + raise NotImplementedError("Puffinn doesn't support metric 
%s" % metric) self.metric = metric self.space = space self.hash_function = hash_function @@ -18,20 +18,26 @@ def __init__(self, metric, space=10**6, hash_function="fht_crosspolytope", hash_ self.hash_args = hash_args def fit(self, X): - if self.metric == 'angular': + if self.metric == "angular": dimensions = len(X[0]) else: dimensions = 0 for x in X: - dimensions = max(dimensions, max(x)+1) + dimensions = max(dimensions, max(x) + 1) if self.hash_args: - self.index = puffinn.Index(self.metric, dimensions, self.space,\ - hash_function=self.hash_function, hash_source=self.hash_source,\ - hash_args=self.hash_args) + self.index = puffinn.Index( + self.metric, + dimensions, + self.space, + hash_function=self.hash_function, + hash_source=self.hash_source, + hash_args=self.hash_args, + ) else: - self.index = puffinn.Index(self.metric, dimensions, self.space,\ - hash_function=self.hash_function, hash_source=self.hash_source) + self.index = puffinn.Index( + self.metric, dimensions, self.space, hash_function=self.hash_function, hash_source=self.hash_source + ) for i, x in enumerate(X): if self.metric == "jaccard" and x.dtype == np.bool_: x = np.flatnonzero(x) @@ -49,4 +55,9 @@ def query(self, v, n): return self.index.search(v, n, self.recall) def __str__(self): - return 'PUFFINN(space=%d, recall=%f, hf=%s, hashsource=%s)' % (self.space, self.recall, self.hash_function, self.hash_source) + return "PUFFINN(space=%d, recall=%f, hf=%s, hashsource=%s)" % ( + self.space, + self.recall, + self.hash_function, + self.hash_source, + ) diff --git a/ann_benchmarks/algorithms/pynndescent.py b/ann_benchmarks/algorithms/pynndescent.py index 92f470538..392e71c76 100644 --- a/ann_benchmarks/algorithms/pynndescent.py +++ b/ann_benchmarks/algorithms/pynndescent.py @@ -13,9 +13,7 @@ def __init__(self, metric, index_param_dict, n_search_trees=1): self._n_neighbors = 30 if "pruning_degree_multiplier" in index_param_dict: - self._pruning_degree_multiplier = float( - index_param_dict["pruning_degree_multiplier"] - ) + self._pruning_degree_multiplier = float(index_param_dict["pruning_degree_multiplier"]) else: self._pruning_degree_multiplier = 1.5 @@ -98,13 +96,13 @@ def query(self, v, n): self._query_matrix.data = np.ones(size, dtype=np.float32) ind, dist = self._index.query(self._query_matrix, k=n, epsilon=self._epsilon) else: - ind, dist = self._index.query( - v.reshape(1, -1).astype("float32"), k=n, epsilon=self._epsilon - ) + ind, dist = self._index.query(v.reshape(1, -1).astype("float32"), k=n, epsilon=self._epsilon) return ind[0] def __str__(self): - str_template = "PyNNDescent(n_neighbors=%d, pruning_mult=%.2f, diversify_prob=%.3f, epsilon=%.3f, leaf_size=%02d)" + str_template = ( + "PyNNDescent(n_neighbors=%d, pruning_mult=%.2f, diversify_prob=%.3f, epsilon=%.3f, leaf_size=%02d)" + ) return str_template % ( self._n_neighbors, self._pruning_degree_multiplier, diff --git a/ann_benchmarks/algorithms/qdrant.py b/ann_benchmarks/algorithms/qdrant.py index a88028b10..1bf34edab 100644 --- a/ann_benchmarks/algorithms/qdrant.py +++ b/ann_benchmarks/algorithms/qdrant.py @@ -4,34 +4,29 @@ import numpy as np from time import sleep + class Qdrant(BaseANN): - - _distances_mapping = { - 'dot': Distance.DOT, - 'angular': Distance.COSINE, - 'euclidean': Distance.EUCLID - } + + _distances_mapping = {"dot": Distance.DOT, "angular": Distance.COSINE, "euclidean": Distance.EUCLID} def __init__(self, metric, grpc): self._metric = metric - self._collection_name = 'ann_benchmarks_test' + self._collection_name = "ann_benchmarks_test" 
self._grpc = grpc - self._search_params = { - 'hnsw_ef': None - } + self._search_params = {"hnsw_ef": None} qdrant_client_params = { - 'host': 'localhost', - 'port': 6333, - 'grpc_port': 6334, - 'prefer_grpc': self._grpc, - 'https': False, - } + "host": "localhost", + "port": 6333, + "grpc_port": 6334, + "prefer_grpc": self._grpc, + "https": False, + } self._client = QdrantClient(**qdrant_client_params) - def fit(self, X): - if X.dtype != np.float32: X = X.astype(np.float32) + if X.dtype != np.float32: + X = X.astype(np.float32) self._client.recreate_collection( collection_name=self._collection_name, @@ -40,63 +35,63 @@ def fit(self, X): # hnsw_config=qdrant_models.HnswConfigDiff( # ef_construct=100, #100 is qdrant default # m=16 #16 is qdrant default - # ), - timeout=30 + # ), + timeout=30, ) self._client.upload_collection( - collection_name=self._collection_name, - vectors=X, - ids=list(range(X.shape[0])), - parallel=1 + collection_name=self._collection_name, vectors=X, ids=list(range(X.shape[0])), parallel=1 ) - #wait for vectors to be fully indexed + # wait for vectors to be fully indexed SECONDS_WAITING_FOR_INDEXING_API_CALL = 5 while True: - collection_info = self._client.http.collections_api.get_collection(self._collection_name).dict()['result'] + collection_info = self._client.http.collections_api.get_collection(self._collection_name).dict()["result"] - vectors_count = collection_info['vectors_count'] - indexed_vectors_count = collection_info['indexed_vectors_count'] - status = collection_info['status'] + vectors_count = collection_info["vectors_count"] + indexed_vectors_count = collection_info["indexed_vectors_count"] + status = collection_info["status"] + + print("Stored vectors: " + str(vectors_count)) + print("Indexed vectors: " + str(indexed_vectors_count)) + print("Collection status: " + str(status)) - print('Stored vectors: ' + str(vectors_count)) - print('Indexed vectors: ' + str(indexed_vectors_count)) - print('Collection status: ' + str(status)) - print(type(status), status) if status == CollectionStatus.GREEN: - print('Vectors indexing finished.') + print("Vectors indexing finished.") break else: - print('Waiting ' + str(SECONDS_WAITING_FOR_INDEXING_API_CALL) + ' seconds to query collection info again...') + print( + "Waiting " + + str(SECONDS_WAITING_FOR_INDEXING_API_CALL) + + " seconds to query collection info again..." 
+ ) sleep(SECONDS_WAITING_FOR_INDEXING_API_CALL) - def set_query_arguments(self, hnsw_ef): - self._search_params['hnsw_ef'] = hnsw_ef + self._search_params["hnsw_ef"] = hnsw_ef def query(self, q, n): - search_params = SearchParams(hnsw_ef=self._search_params['hnsw_ef']) + search_params = SearchParams(hnsw_ef=self._search_params["hnsw_ef"]) search_result = self._client.search( collection_name=self._collection_name, query_vector=q, search_params=search_params, - with_payload=False, #just in case - limit=n + with_payload=False, # just in case + limit=n, ) result_ids = [point.id for point in search_result] return result_ids def batch_query(self, X, n): - search_queries = [SearchRequest(vector=q.tolist(), limit=n, params=SearchParams(hnsw_ef=self._search_params['hnsw_ef'])) for q in X] + search_queries = [ + SearchRequest(vector=q.tolist(), limit=n, params=SearchParams(hnsw_ef=self._search_params["hnsw_ef"])) + for q in X + ] - batch_search_results = self._client.search_batch( - collection_name=self._collection_name, - requests=search_queries - ) + batch_search_results = self._client.search_batch(collection_name=self._collection_name, requests=search_queries) self.batch_results = [] for search_result in batch_search_results: @@ -106,4 +101,4 @@ def get_batch_results(self): return self.batch_results def __str__(self): - return "Qdrant(grpc=%s, hnsw_ef=%s)" % (self._grpc, self._search_params['hnsw_ef']) + return "Qdrant(grpc=%s, hnsw_ef=%s)" % (self._grpc, self._search_params["hnsw_ef"]) diff --git a/ann_benchmarks/algorithms/qg_ngt.py b/ann_benchmarks/algorithms/qg_ngt.py index b097d5793..ddd4b4707 100644 --- a/ann_benchmarks/algorithms/qg_ngt.py +++ b/ann_benchmarks/algorithms/qg_ngt.py @@ -5,98 +5,126 @@ import time from ann_benchmarks.algorithms.base import BaseANN + class QG(BaseANN): def __init__(self, metric, object_type, epsilon, param): - metrics = {'euclidean': '2', 'angular': 'E'} - self._edge_size = int(param['edge']) - self._outdegree = int(param['outdegree']) - self._indegree = int(param['indegree']) - self._max_edge_size = int(param['max_edge']) if 'max_edge' in param.keys() else 128 + metrics = {"euclidean": "2", "angular": "E"} + self._edge_size = int(param["edge"]) + self._outdegree = int(param["outdegree"]) + self._indegree = int(param["indegree"]) + self._max_edge_size = int(param["max_edge"]) if "max_edge" in param.keys() else 128 self._metric = metrics[metric] self._object_type = object_type - self._edge_size_for_search = int(param['search_edge']) if 'search_edge' in param.keys() else -2 - self._tree_disabled = (param['tree'] is False) if 'tree' in param.keys() else False + self._edge_size_for_search = int(param["search_edge"]) if "search_edge" in param.keys() else -2 + self._tree_disabled = (param["tree"] is False) if "tree" in param.keys() else False self._build_time_limit = 4 self._epsilon = epsilon - print('QG: edge_size=' + str(self._edge_size)) - print('QG: outdegree=' + str(self._outdegree)) - print('QG: indegree=' + str(self._indegree)) - print('QG: edge_size_for_search=' + str(self._edge_size_for_search)) - print('QG: epsilon=' + str(self._epsilon)) - print('QG: metric=' + metric) - print('QG: object_type=' + object_type) + print("QG: edge_size=" + str(self._edge_size)) + print("QG: outdegree=" + str(self._outdegree)) + print("QG: indegree=" + str(self._indegree)) + print("QG: edge_size_for_search=" + str(self._edge_size_for_search)) + print("QG: epsilon=" + str(self._epsilon)) + print("QG: metric=" + metric) + print("QG: object_type=" + object_type) def fit(self, 
X): - print('QG: start indexing...') + print("QG: start indexing...") dim = len(X[0]) - print('QG: # of data=' + str(len(X))) - print('QG: dimensionality=' + str(dim)) - index_dir = 'indexes' + print("QG: # of data=" + str(len(X))) + print("QG: dimensionality=" + str(dim)) + index_dir = "indexes" if not os.path.exists(index_dir): os.makedirs(index_dir) - index = os.path.join( - index_dir, - 'ONNG-{}-{}-{}'.format(self._edge_size, self._outdegree, - self._indegree)) - anngIndex = os.path.join(index_dir, 'ANNG-' + str(self._edge_size)) - print('QG: index=' + index) + index = os.path.join(index_dir, "ONNG-{}-{}-{}".format(self._edge_size, self._outdegree, self._indegree)) + anngIndex = os.path.join(index_dir, "ANNG-" + str(self._edge_size)) + print("QG: index=" + index) if (not os.path.exists(index)) and (not os.path.exists(anngIndex)): - print('QG: create ANNG') + print("QG: create ANNG") t = time.time() - args = ['ngt', 'create', '-it', '-p8', '-b500', '-ga', '-of', - '-D' + self._metric, '-d' + str(dim), - '-E' + str(self._edge_size), '-S40', - '-e' + str(self._epsilon), '-P0', '-B30', - '-T' + str(self._build_time_limit), anngIndex] + args = [ + "ngt", + "create", + "-it", + "-p8", + "-b500", + "-ga", + "-of", + "-D" + self._metric, + "-d" + str(dim), + "-E" + str(self._edge_size), + "-S40", + "-e" + str(self._epsilon), + "-P0", + "-B30", + "-T" + str(self._build_time_limit), + anngIndex, + ] subprocess.call(args) idx = ngtpy.Index(path=anngIndex) idx.batch_insert(X, num_threads=24, debug=False) idx.save() idx.close() - print('QG: ANNG construction time(sec)=' + str(time.time() - t)) + print("QG: ANNG construction time(sec)=" + str(time.time() - t)) if not os.path.exists(index): - print('QG: degree adjustment') + print("QG: degree adjustment") t = time.time() - args = ['ngt', 'reconstruct-graph', '-mS', - '-E ' + str(self._outdegree), - '-o ' + str(self._outdegree), - '-i ' + str(self._indegree), anngIndex, index] + args = [ + "ngt", + "reconstruct-graph", + "-mS", + "-E " + str(self._outdegree), + "-o " + str(self._outdegree), + "-i " + str(self._indegree), + anngIndex, + index, + ] subprocess.call(args) - print('QG: degree adjustment time(sec)=' + str(time.time() - t)) - if not os.path.exists(index + '/qg'): - print('QG:create and append...') + print("QG: degree adjustment time(sec)=" + str(time.time() - t)) + if not os.path.exists(index + "/qg"): + print("QG:create and append...") t = time.time() - args = ['qbg', 'create-qg', index] + args = ["qbg", "create-qg", index] subprocess.call(args) - print('QG: create qg time(sec)=' + str(time.time() - t)) - print('QB: build...') + print("QG: create qg time(sec)=" + str(time.time() - t)) + print("QB: build...") t = time.time() - args = ['qbg', 'build-qg', '-o20000', '-M6', '-ib', - '-I400', '-Gz', '-Pn', - '-E' + str(self._max_edge_size), - index] + args = [ + "qbg", + "build-qg", + "-o20000", + "-M6", + "-ib", + "-I400", + "-Gz", + "-Pn", + "-E" + str(self._max_edge_size), + index, + ] subprocess.call(args) - print('QG: build qg time(sec)=' + str(time.time() - t)) - if os.path.exists(index + '/qg/grp'): - print('QG: index already exists! ' + str(index)) + print("QG: build qg time(sec)=" + str(time.time() - t)) + if os.path.exists(index + "/qg/grp"): + print("QG: index already exists! 
" + str(index)) t = time.time() self.index = ngtpy.QuantizedIndex(index, self._max_edge_size) self.index.set_with_distance(False) self.indexName = index - print('QG: open time(sec)=' + str(time.time() - t)) + print("QG: open time(sec)=" + str(time.time() - t)) else: - print('QG: something wrong.') - print('QG: end of fit') + print("QG: something wrong.") + print("QG: end of fit") def set_query_arguments(self, parameters): result_expansion, epsilon = parameters print("QG: result_expansion=" + str(result_expansion)) print("QG: epsilon=" + str(epsilon)) - self.name = 'QG-NGT(%s, %s, %s, %s, %s, %1.3f)' % ( - self._edge_size, self._outdegree, - self._indegree, self._max_edge_size, + self.name = "QG-NGT(%s, %s, %s, %s, %s, %1.3f)" % ( + self._edge_size, + self._outdegree, + self._indegree, + self._max_edge_size, epsilon, - result_expansion) + result_expansion, + ) epsilon = epsilon - 1.0 self.index.set(epsilon=epsilon, result_expansion=result_expansion) @@ -104,4 +132,4 @@ def query(self, v, n): return self.index.search(v, n) def freeIndex(self): - print('QG: free') + print("QG: free") diff --git a/ann_benchmarks/algorithms/qsg_ngt.py b/ann_benchmarks/algorithms/qsg_ngt.py index 8ed27f1a9..700fb4a35 100644 --- a/ann_benchmarks/algorithms/qsg_ngt.py +++ b/ann_benchmarks/algorithms/qsg_ngt.py @@ -10,149 +10,214 @@ class QSG(BaseANN): def __init__(self, metric, object_type, epsilon, param): - metrics = {'euclidean': '2', 'angular': 'E'} - self._edge_size = int(param['edge']) - self._outdegree = int(param['outdegree']) - self._indegree = int(param['indegree']) - self._max_edge_size = int(param['max_edge']) if 'max_edge' in param.keys() else 128 + metrics = {"euclidean": "2", "angular": "E"} + self._edge_size = int(param["edge"]) + self._outdegree = int(param["outdegree"]) + self._indegree = int(param["indegree"]) + self._max_edge_size = int(param["max_edge"]) if "max_edge" in param.keys() else 128 self._metric = metrics[metric] self._object_type = object_type - self._edge_size_for_search = int(param['search_edge']) if 'search_edge' in param.keys() else -2 - self._tree_disabled = (param['tree'] is False) if 'tree' in param.keys() else False + self._edge_size_for_search = int(param["search_edge"]) if "search_edge" in param.keys() else -2 + self._tree_disabled = (param["tree"] is False) if "tree" in param.keys() else False self._build_time_limit = 4 self._epsilon = epsilon - self._paramE = param['paramE'] - self._paramS = param['paramS'] - self._range = int(param['range']) - self._threshold = int(param['threshold']) - self._rangeMax = int(param['rangeMax']) - self._searchA = int(param['searchA']) - self._ifES = int(param['ifES']) - print('QSG: edge_size=' + str(self._edge_size)) - print('QSG: outdegree=' + str(self._outdegree)) - print('QSG: indegree=' + str(self._indegree)) - print('QSG: edge_size_for_search=' + str(self._edge_size_for_search)) - print('QSG: epsilon=' + str(self._epsilon)) - print('QSG: metric=' + metric) - print('QSG: object_type=' + object_type) - print('QG: range=' +str(self._range)) - print('QG: threshold=' + str(self._threshold)) + self._paramE = param["paramE"] + self._paramS = param["paramS"] + self._range = int(param["range"]) + self._threshold = int(param["threshold"]) + self._rangeMax = int(param["rangeMax"]) + self._searchA = int(param["searchA"]) + self._ifES = int(param["ifES"]) + print("QSG: edge_size=" + str(self._edge_size)) + print("QSG: outdegree=" + str(self._outdegree)) + print("QSG: indegree=" + str(self._indegree)) + print("QSG: edge_size_for_search=" + 
str(self._edge_size_for_search)) + print("QSG: epsilon=" + str(self._epsilon)) + print("QSG: metric=" + metric) + print("QSG: object_type=" + object_type) + print("QG: range=" + str(self._range)) + print("QG: threshold=" + str(self._threshold)) def fit(self, X): - print('QSG: start indexing...') + print("QSG: start indexing...") dim = len(X[0]) - print('QSG: # of data=' + str(len(X))) - print('QSG: dimensionality=' + str(dim)) - index_dir = 'indexes' + print("QSG: # of data=" + str(len(X))) + print("QSG: dimensionality=" + str(dim)) + index_dir = "indexes" if not os.path.exists(index_dir): os.makedirs(index_dir) - index = os.path.join( - index_dir, - 'ONNG-{}-{}-{}'.format(self._edge_size, self._outdegree, - self._indegree)) - anngIndex = os.path.join(index_dir, 'ANNG-' + str(self._edge_size)) - print('QSG: index=' + index) + index = os.path.join(index_dir, "ONNG-{}-{}-{}".format(self._edge_size, self._outdegree, self._indegree)) + anngIndex = os.path.join(index_dir, "ANNG-" + str(self._edge_size)) + print("QSG: index=" + index) if (not os.path.exists(index)) and (not os.path.exists(anngIndex)): - print('QSG: create ANNG') + print("QSG: create ANNG") t = time.time() - args = ['ngt', 'create', '-it', '-p8', '-b500', '-ga', '-of', - '-D' + self._metric, '-d' + str(dim), - '-E' + str(self._edge_size), '-S40', - '-e' + str(self._epsilon), '-P0', '-B30', - '-T' + str(self._build_time_limit),'-R' + str(self._range), '-t' + str(self._threshold),'-M' + str(self._rangeMax),'-A' + str(self._searchA),'-H' + str(self._ifES), anngIndex] + args = [ + "ngt", + "create", + "-it", + "-p8", + "-b500", + "-ga", + "-of", + "-D" + self._metric, + "-d" + str(dim), + "-E" + str(self._edge_size), + "-S40", + "-e" + str(self._epsilon), + "-P0", + "-B30", + "-T" + str(self._build_time_limit), + "-R" + str(self._range), + "-t" + str(self._threshold), + "-M" + str(self._rangeMax), + "-A" + str(self._searchA), + "-H" + str(self._ifES), + anngIndex, + ] subprocess.call(args) idx = ngtpy.Index(path=anngIndex) idx.batch_insert(X, num_threads=24, debug=False) idx.save() idx.close() - print('QSG: ANNG construction time(sec)=' + str(time.time() - t)) + print("QSG: ANNG construction time(sec)=" + str(time.time() - t)) if self._ifES == 1: - if self._metric == 'E': - X_normalized = preprocessing.normalize(X, norm='l2') - fvecs_dir = 'fvecs' + if self._metric == "E": + X_normalized = preprocessing.normalize(X, norm="l2") + fvecs_dir = "fvecs" if not os.path.exists(fvecs_dir): os.makedirs(fvecs_dir) - fvecs = os.path.join(fvecs_dir, 'base.fvecs') - with open(fvecs, 'wb') as fp: + fvecs = os.path.join(fvecs_dir, "base.fvecs") + with open(fvecs, "wb") as fp: for y in X_normalized: - d = struct.pack('I', y.size) + d = struct.pack("I", y.size) fp.write(d) for x in y: - a = struct.pack('f', x) + a = struct.pack("f", x) fp.write(a) else: - fvecs_dir = 'fvecs' + fvecs_dir = "fvecs" if not os.path.exists(fvecs_dir): os.makedirs(fvecs_dir) - fvecs = os.path.join(fvecs_dir, 'base.fvecs') - with open(fvecs, 'wb') as fp: + fvecs = os.path.join(fvecs_dir, "base.fvecs") + with open(fvecs, "wb") as fp: for y in X: - d = struct.pack('I', y.size) + d = struct.pack("I", y.size) fp.write(d) for x in y: - a = struct.pack('f', x) + a = struct.pack("f", x) fp.write(a) parmEfanna = self._paramE parmSSG = self._paramS - graph_dir = 'graph' + graph_dir = "graph" if not os.path.exists(graph_dir): os.makedirs(graph_dir) - KNNG = os.path.join(graph_dir, 'KNNG-' + str(parmEfanna[0]) + '-' + str(parmEfanna[1]) + '-' + str( - parmEfanna[2]) + '-' + 
str(parmEfanna[3]) + '-' + str(parmEfanna[4]) + '.graph') - SG = os.path.join(anngIndex, 'grp') - cmds = '/home/app/hwtl_sdu-anns-qsgngtlib/qsgngt-knng ' + str(fvecs) + ' ' + str(KNNG) + ' ' + str( - parmEfanna[0]) + ' ' + str(parmEfanna[1]) + ' ' + str(parmEfanna[2]) + ' ' + str( - parmEfanna[3]) + ' ' + str( - parmEfanna[4]) + \ - '&& /home/app/hwtl_sdu-anns-qsgngtlib/qsgngt-SpaceGraph ' + str(fvecs) + ' ' + str(KNNG) + ' ' + str( - parmSSG[0]) + ' ' + str(parmSSG[1]) + ' ' + str(parmSSG[2]) + ' ' + str(SG) + KNNG = os.path.join( + graph_dir, + "KNNG-" + + str(parmEfanna[0]) + + "-" + + str(parmEfanna[1]) + + "-" + + str(parmEfanna[2]) + + "-" + + str(parmEfanna[3]) + + "-" + + str(parmEfanna[4]) + + ".graph", + ) + SG = os.path.join(anngIndex, "grp") + cmds = ( + "/home/app/hwtl_sdu-anns-qsgngtlib/qsgngt-knng " + + str(fvecs) + + " " + + str(KNNG) + + " " + + str(parmEfanna[0]) + + " " + + str(parmEfanna[1]) + + " " + + str(parmEfanna[2]) + + " " + + str(parmEfanna[3]) + + " " + + str(parmEfanna[4]) + + "&& /home/app/hwtl_sdu-anns-qsgngtlib/qsgngt-SpaceGraph " + + str(fvecs) + + " " + + str(KNNG) + + " " + + str(parmSSG[0]) + + " " + + str(parmSSG[1]) + + " " + + str(parmSSG[2]) + + " " + + str(SG) + ) os.system(cmds) - if not os.path.exists(index): - print('QSG: degree adjustment') + print("QSG: degree adjustment") t = time.time() - args = ['ngt', 'reconstruct-graph', '-mS', - '-E ' + str(self._outdegree), - '-o ' + str(self._outdegree), - '-i ' + str(self._indegree), anngIndex, index] + args = [ + "ngt", + "reconstruct-graph", + "-mS", + "-E " + str(self._outdegree), + "-o " + str(self._outdegree), + "-i " + str(self._indegree), + anngIndex, + index, + ] subprocess.call(args) - print('QSG: degree adjustment time(sec)=' + str(time.time() - t)) - if not os.path.exists(index + '/qg'): - print('QSG:create and append...') + print("QSG: degree adjustment time(sec)=" + str(time.time() - t)) + if not os.path.exists(index + "/qg"): + print("QSG:create and append...") t = time.time() - args = ['qbg', 'create-qg', index] + args = ["qbg", "create-qg", index] subprocess.call(args) - print('QSG: create qg time(sec)=' + str(time.time() - t)) - print('QB: build...') + print("QSG: create qg time(sec)=" + str(time.time() - t)) + print("QB: build...") t = time.time() - args = ['qbg', 'build-qg', '-o20000', '-M6', '-ib', - '-I400', '-Gz', '-Pn', - '-E' + str(self._max_edge_size), - index] + args = [ + "qbg", + "build-qg", + "-o20000", + "-M6", + "-ib", + "-I400", + "-Gz", + "-Pn", + "-E" + str(self._max_edge_size), + index, + ] subprocess.call(args) - print('QSG: build qg time(sec)=' + str(time.time() - t)) - if os.path.exists(index + '/qg/grp'): - print('QSG: index already exists! ' + str(index)) + print("QSG: build qg time(sec)=" + str(time.time() - t)) + if os.path.exists(index + "/qg/grp"): + print("QSG: index already exists! 
" + str(index)) t = time.time() self.index = ngtpy.QuantizedIndex(index, self._max_edge_size) self.index.set_with_distance(False) self.indexName = index - print('QSG: open time(sec)=' + str(time.time() - t)) + print("QSG: open time(sec)=" + str(time.time() - t)) else: - print('QSG: something wrong.') - print('QSG: end of fit') - print('QSG:Successfully Build Index') + print("QSG: something wrong.") + print("QSG: end of fit") + print("QSG:Successfully Build Index") def set_query_arguments(self, parameters): result_expansion, epsilon = parameters print("QSG: result_expansion=" + str(result_expansion)) print("QSG: epsilon=" + str(epsilon)) - self.name = 'QSG-NGT(%s, %s, %s, %s, %s, %1.3f)' % ( - self._edge_size, self._outdegree, - self._indegree, self._max_edge_size, + self.name = "QSG-NGT(%s, %s, %s, %s, %s, %1.3f)" % ( + self._edge_size, + self._outdegree, + self._indegree, + self._max_edge_size, epsilon, - result_expansion) + result_expansion, + ) epsilon = epsilon - 1.0 self.index.set(epsilon=epsilon, result_expansion=result_expansion) @@ -160,4 +225,4 @@ def query(self, v, n): return self.index.search(v, n) def freeIndex(self): - print('QSG: free') + print("QSG: free") diff --git a/ann_benchmarks/algorithms/rpforest.py b/ann_benchmarks/algorithms/rpforest.py index 063a614b8..199ac3cd8 100644 --- a/ann_benchmarks/algorithms/rpforest.py +++ b/ann_benchmarks/algorithms/rpforest.py @@ -6,7 +6,7 @@ class RPForest(BaseANN): def __init__(self, leaf_size, n_trees): - self.name = 'RPForest(leaf_size=%d, n_trees=%d)' % (leaf_size, n_trees) + self.name = "RPForest(leaf_size=%d, n_trees=%d)" % (leaf_size, n_trees) self._model = rpforest.RPForest(leaf_size=leaf_size, no_trees=n_trees) def fit(self, X): diff --git a/ann_benchmarks/algorithms/scann.py b/ann_benchmarks/algorithms/scann.py index d2ceb0db2..0050bb84e 100644 --- a/ann_benchmarks/algorithms/scann.py +++ b/ann_benchmarks/algorithms/scann.py @@ -3,31 +3,35 @@ import scann from ann_benchmarks.algorithms.base import BaseANN -class Scann(BaseANN): - def __init__(self, n_leaves, avq_threshold, dims_per_block, dist): - self.name = "scann n_leaves={} avq_threshold={:.02f} dims_per_block={}".format( - n_leaves, avq_threshold, dims_per_block) - self.n_leaves = n_leaves - self.avq_threshold = avq_threshold - self.dims_per_block = dims_per_block - self.dist = dist +class Scann(BaseANN): + def __init__(self, n_leaves, avq_threshold, dims_per_block, dist): + self.name = "scann n_leaves={} avq_threshold={:.02f} dims_per_block={}".format( + n_leaves, avq_threshold, dims_per_block + ) + self.n_leaves = n_leaves + self.avq_threshold = avq_threshold + self.dims_per_block = dims_per_block + self.dist = dist - def fit(self, X): - if self.dist == "dot_product": - spherical = True - X[np.linalg.norm(X, axis=1) == 0] = 1.0 / np.sqrt(X.shape[1]) - X /= np.linalg.norm(X, axis=1)[:, np.newaxis] - else: - spherical = False + def fit(self, X): + if self.dist == "dot_product": + spherical = True + X[np.linalg.norm(X, axis=1) == 0] = 1.0 / np.sqrt(X.shape[1]) + X /= np.linalg.norm(X, axis=1)[:, np.newaxis] + else: + spherical = False - self.searcher = scann.scann_ops_pybind.builder(X, 10, self.dist).tree( - self.n_leaves, 1, training_sample_size=len(X), spherical=spherical, quantize_centroids=True).score_ah( - self.dims_per_block, anisotropic_quantization_threshold=self.avq_threshold).reorder( - 1).build() + self.searcher = ( + scann.scann_ops_pybind.builder(X, 10, self.dist) + .tree(self.n_leaves, 1, training_sample_size=len(X), spherical=spherical, 
quantize_centroids=True) + .score_ah(self.dims_per_block, anisotropic_quantization_threshold=self.avq_threshold) + .reorder(1) + .build() + ) - def set_query_arguments(self, leaves_reorder): - self.leaves_to_search, self.reorder = leaves_reorder + def set_query_arguments(self, leaves_reorder): + self.leaves_to_search, self.reorder = leaves_reorder - def query(self, v, n): - return self.searcher.search(v, n, self.reorder, self.leaves_to_search)[0] + def query(self, v, n): + return self.searcher.search(v, n, self.reorder, self.leaves_to_search)[0] diff --git a/ann_benchmarks/algorithms/sptag.py b/ann_benchmarks/algorithms/sptag.py index be1f82fb0..399774d3d 100644 --- a/ann_benchmarks/algorithms/sptag.py +++ b/ann_benchmarks/algorithms/sptag.py @@ -6,12 +6,11 @@ class Sptag(BaseANN): def __init__(self, metric, algo): self._algo = str(algo) - self._metric = { - 'angular': 'Cosine', 'euclidean': 'L2'}[metric] + self._metric = {"angular": "Cosine", "euclidean": "L2"}[metric] def fit(self, X): - self._sptag = SPTAG.AnnIndex(self._algo, 'Float', X.shape[1]) - self._sptag.SetBuildParam("NumberOfThreads", '32', "Index") + self._sptag = SPTAG.AnnIndex(self._algo, "Float", X.shape[1]) + self._sptag.SetBuildParam("NumberOfThreads", "32", "Index") self._sptag.SetBuildParam("DistCalcMethod", self._metric, "Index") self._sptag.Build(X, X.shape[0], False) @@ -23,6 +22,4 @@ def query(self, v, k): return self._sptag.Search(v, k)[0] def __str__(self): - return 'Sptag(metric=%s, algo=%s, check=%d)' % (self._metric, - self._algo, self._maxCheck) - + return "Sptag(metric=%s, algo=%s, check=%d)" % (self._metric, self._algo, self._maxCheck) diff --git a/ann_benchmarks/algorithms/subprocess.py b/ann_benchmarks/algorithms/subprocess.py index 22d4728e7..61aa5ae82 100644 --- a/ann_benchmarks/algorithms/subprocess.py +++ b/ann_benchmarks/algorithms/subprocess.py @@ -4,8 +4,7 @@ from types import MethodType import psutil import subprocess -from ann_benchmarks.data import \ - bit_unparse_entry, int_unparse_entry, float_unparse_entry +from ann_benchmarks.data import bit_unparse_entry, int_unparse_entry, float_unparse_entry from ann_benchmarks.algorithms.base import BaseANN @@ -17,12 +16,11 @@ def __init__(self, code): class Subprocess(BaseANN): def _raw_line(self): - return shlex.split( - self._get_program_handle().stdout.readline().strip()) + return shlex.split(self._get_program_handle().stdout.readline().strip()) def _line(self): line = self._raw_line() -# print("<- %s" % (" ".join(line))) + # print("<- %s" % (" ".join(line))) while len(line) < 1 or line[0] != "epbprtv0": line = self._raw_line() return line[1:] @@ -50,23 +48,29 @@ def _get_program_handle(self): bufsize=1, # line buffering stdin=subprocess.PIPE, stdout=subprocess.PIPE, - universal_newlines=True) + universal_newlines=True, + ) for key, value in iter(self._params.items()): - self._write("%s %s" % - (Subprocess._quote(key), Subprocess._quote(value))) - assert self._line()[0] == "ok", """\ -assigning value '%s' to option '%s' failed""" % (value, key) + self._write("%s %s" % (Subprocess._quote(key), Subprocess._quote(value))) + assert ( + self._line()[0] == "ok" + ), """\ +assigning value '%s' to option '%s' failed""" % ( + value, + key, + ) self._configuration_hook() self._write("") - assert self._line()[0] == "ok", """\ + assert ( + self._line()[0] == "ok" + ), """\ transitioning to training mode failed""" return self._program def __init__(self, args, encoder, params): - self.name = "Subprocess(program = %s, %s)" % \ - (basename(args[0]), str(params)) + 
self.name = "Subprocess(program = %s, %s)" % (basename(args[0]), str(params)) self._program = None self._args = args self._encoder = encoder @@ -81,10 +85,15 @@ def fit(self, X): for entry in X: d = Subprocess._quote(self._encoder(entry)) self._write(d) - assert self._line()[0] == "ok", """\ -encoded training point '%s' was rejected""" % d + assert self._line()[0] == "ok", ( + """\ +encoded training point '%s' was rejected""" + % d + ) self._write("") - assert self._line()[0] == "ok", """\ + assert ( + self._line()[0] == "ok" + ), """\ transitioning to query mode failed""" def query(self, v, n): @@ -98,7 +107,9 @@ def _handle_query_response(self): count = int(status[1]) return self._collect_query_response_lines(count) else: - assert status[0] == "fail", """\ + assert ( + status[0] == "fail" + ), """\ query neither succeeded nor failed""" return [] @@ -125,7 +136,9 @@ def __init__(self, args, encoder, params): def _configuration_hook(self): self._write("frontend prepared-queries 1") - assert self._line()[0] == "ok", """\ + assert ( + self._line()[0] == "ok" + ), """\ enabling prepared queries mode failed""" def query(self, v, n): @@ -136,8 +149,11 @@ def query(self, v, n): def prepare_query(self, v, n): d = Subprocess._quote(self._encoder(v)) self._write("%s %d" % (d, n)) - assert self._line()[0] == "ok", """\ -preparing the query '%s' failed""" % d + assert self._line()[0] == "ok", ( + """\ +preparing the query '%s' failed""" + % d + ) def run_prepared_query(self): self._write("query") @@ -145,7 +161,9 @@ def run_prepared_query(self): if status[0] == "ok": self._result_count = int(status[1]) else: - assert status[0] == "fail", """\ + assert ( + status[0] == "fail" + ), """\ query neither succeeded nor failed""" self._result_count = 0 @@ -166,7 +184,9 @@ def __init__(self, args, encoder, params): def _configuration_hook(self): self._write("frontend batch-queries 1") - assert self._line()[0] == "ok", """\ + assert ( + self._line()[0] == "ok" + ), """\ enabling batch queries mode failed""" def query(self, v, n): @@ -178,13 +198,18 @@ def prepare_batch_query(self, X, n): d = " ".join(map(lambda p: Subprocess._quote(self._encoder(p)), X)) self._qp_count = len(X) self._write("%s %d" % (d, n)) - assert self._line()[0] == "ok", """\ -preparing the batch query '%s' failed""" % d + assert self._line()[0] == "ok", ( + """\ +preparing the batch query '%s' failed""" + % d + ) def run_batch_query(self): self._write("query") status = self._line() - assert status[0] == "ok", """\ + assert ( + status[0] == "ok" + ), """\ batch query failed completely""" def get_batch_results(self): @@ -232,15 +257,20 @@ def QueryParamWrapper(constructor, args, params): def _do(self, original=r._configuration_hook): original() self._write("frontend query-parameters 1") - assert self._line()[0] == "ok", """\ + assert ( + self._line()[0] == "ok" + ), """\ enabling query parameter support failed""" + r._configuration_hook = MethodType(_do, r) def _sqa(self, *args): - self._write("query-params %s set" % - (" ".join(map(Subprocess._quote, args)))) - assert self._line()[0] == "ok", """\ + self._write("query-params %s set" % (" ".join(map(Subprocess._quote, args)))) + assert ( + self._line()[0] == "ok" + ), """\ reconfiguring query parameters failed""" print(args) + r.set_query_arguments = MethodType(_sqa, r) return r diff --git a/ann_benchmarks/algorithms/vald.py b/ann_benchmarks/algorithms/vald.py index eb5660bac..2db79b305 100644 --- a/ann_benchmarks/algorithms/vald.py +++ b/ann_benchmarks/algorithms/vald.py @@ -15,106 +15,102 
@@ default_server_config = { - 'version': 'v0.0.0', - 'logging': { - 'logger': 'nop', - 'level': 'fatal', - 'format': 'raw' - }, - 'server_config': { - 'servers': [ + "version": "v0.0.0", + "logging": {"logger": "nop", "level": "fatal", "format": "raw"}, + "server_config": { + "servers": [ { - 'name': 'agent-grpc', - 'host': '127.0.0.1', - 'port': 8082, - 'mode': 'GRPC', - 'probe_wait_time': '3s', + "name": "agent-grpc", + "host": "127.0.0.1", + "port": 8082, + "mode": "GRPC", + "probe_wait_time": "3s", #'grpc': { # 'bidirectional_stream_concurrency': 1 - #}, + # }, "network": "unix", - "socket_path": "/var/run/vald.sock" + "socket_path": "/var/run/vald.sock", } ], - 'health_check_servers': [ + "health_check_servers": [ { - 'name': 'readiness', - 'host': '127.0.0.1', - 'port': 3001, - 'mode': '', - 'probe_wait_time': '3s', - 'http': { - 'shutdown_duration': '5s', - 'handler_timeout': '', - 'idle_timeout': '', - 'read_header_timeout': '', - 'read_timeout': '', - 'write_timeout': '' - } + "name": "readiness", + "host": "127.0.0.1", + "port": 3001, + "mode": "", + "probe_wait_time": "3s", + "http": { + "shutdown_duration": "5s", + "handler_timeout": "", + "idle_timeout": "", + "read_header_timeout": "", + "read_timeout": "", + "write_timeout": "", + }, } ], - 'startup_strategy': ['agent-grpc', 'readiness'], - 'shutdown_strategy': ['readiness', 'agent-grpc'], - 'full_shutdown_duration': '600s', - 'tls': { - 'enabled': False, - } + "startup_strategy": ["agent-grpc", "readiness"], + "shutdown_strategy": ["readiness", "agent-grpc"], + "full_shutdown_duration": "600s", + "tls": { + "enabled": False, + }, }, - 'ngt': { - 'enable_in_memory_mode': True, - 'default_pool_size': 10000, - 'default_epsilon': 0.01, - 'default_radius': -1.0, + "ngt": { + "enable_in_memory_mode": True, + "default_pool_size": 10000, + "default_epsilon": 0.01, + "default_radius": -1.0, #'vqueue': { # 'insert_buffer_size': 100, # 'insert_buffer_pool_size': 1000, # 'delete_buffer_size': 100, # 'delete_buffer_pool_size': 1000 - #} - } + # } + }, } grpc_opts = [ - ('grpc.keepalive_time_ms', 1000 * 10), - ('grpc.keepalive_timeout_ms', 1000 * 10), - ('grpc.max_connection_idle_ms', 1000 * 50) + ("grpc.keepalive_time_ms", 1000 * 10), + ("grpc.keepalive_timeout_ms", 1000 * 10), + ("grpc.max_connection_idle_ms", 1000 * 50), ] -metrics = {'euclidean': 'l2', 'angular': 'cosine'} +metrics = {"euclidean": "l2", "angular": "cosine"} class Vald(BaseANN): def __init__(self, metric, object_type, params): self._param = default_server_config self._ngt_config = { - 'distance_type': metrics[metric], - 'object_type': object_type, - 'search_edge_size': int(params['searchedge']), - 'creation_edge_size': int(params['edge']), - 'bulk_insert_chunk_size': int(params['bulk']) + "distance_type": metrics[metric], + "object_type": object_type, + "search_edge_size": int(params["searchedge"]), + "creation_edge_size": int(params["edge"]), + "bulk_insert_chunk_size": int(params["bulk"]), } - #self._address = 'localhost:8082' - self._address = 'unix:///var/run/vald.sock' + # self._address = 'localhost:8082' + self._address = "unix:///var/run/vald.sock" def fit(self, X): dim = len(X[0]) - self._ngt_config['dimension'] = dim - self._param['ngt'].update(self._ngt_config) - with open('config.yaml', 'w') as f: + self._ngt_config["dimension"] = dim + self._param["ngt"].update(self._ngt_config) + with open("config.yaml", "w") as f: yaml.dump(self._param, f) cfg = payload_pb2.Insert.Config(skip_strict_exist_check=True) vectors = [ - payload_pb2.Insert.Request( - 
vector=payload_pb2.Object.Vector(id=str(i), vector=x.tolist()), - config=cfg) for i, x in enumerate(X)] + payload_pb2.Insert.Request(vector=payload_pb2.Object.Vector(id=str(i), vector=x.tolist()), config=cfg) + for i, x in enumerate(X) + ] - p = subprocess.Popen(['/go/bin/ngt', '-f', 'config.yaml']) + p = subprocess.Popen(["/go/bin/ngt", "-f", "config.yaml"]) atexit.register(lambda: p.kill()) while True: try: - with urllib.request.urlopen('http://localhost:3001/readiness') as response: + with urllib.request.urlopen("http://localhost:3001/readiness") as response: if response.getcode() == 200: break except (urllib.error.HTTPError, urllib.error.URLError): @@ -126,9 +122,7 @@ def fit(self, X): pass astub = agent_pb2_grpc.AgentStub(channel) - astub.CreateIndex( - payload_pb2.Control.CreateIndexRequest( - pool_size=10000)) + astub.CreateIndex(payload_pb2.Control.CreateIndexRequest(pool_size=10000)) def set_query_arguments(self, epsilon): self._epsilon = epsilon - 1.0 @@ -141,9 +135,9 @@ def query(self, v, n): return [int(result.id) for result in response.results] def __str__(self): - return 'Vald(%d, %d, %d, %1.3f)' % ( - self._ngt_config['creation_edge_size'], - self._ngt_config['search_edge_size'], - self._ngt_config['bulk_insert_chunk_size'], - self._epsilon + 1.0 + return "Vald(%d, %d, %d, %1.3f)" % ( + self._ngt_config["creation_edge_size"], + self._ngt_config["search_edge_size"], + self._ngt_config["bulk_insert_chunk_size"], + self._epsilon + 1.0, ) diff --git a/ann_benchmarks/algorithms/vearch.py b/ann_benchmarks/algorithms/vearch.py index bf16dbe5f..6115bdd95 100644 --- a/ann_benchmarks/algorithms/vearch.py +++ b/ann_benchmarks/algorithms/vearch.py @@ -20,6 +20,7 @@ def get_batch_results(self): res.append(single_ids.tolist()) return res + class VearchIndex(Vearch): def __init__(self, metric, nlist, ns_threshold, n_dims_block): self.nlist = nlist @@ -29,9 +30,14 @@ def __init__(self, metric, nlist, ns_threshold, n_dims_block): self.metric = "InnerProduct" self.ns_threshold = ns_threshold self.n_dims_block = n_dims_block - + def __str__(self): - return "VearchIndex(nlist=%d, n_dims_block=%d, nprobe=%d, rerank=%d)" % (self.nlist, self.n_dims_block, self.nprobe, self.rerank) + return "VearchIndex(nlist=%d, n_dims_block=%d, nprobe=%d, rerank=%d)" % ( + self.nlist, + self.n_dims_block, + self.nprobe, + self.rerank, + ) def fit(self, X): if X.dtype != np.float32: @@ -39,38 +45,31 @@ def fit(self, X): if self.metric == "InnerProduct": X[np.linalg.norm(X, axis=1) == 0] = 1.0 / np.sqrt(X.shape[1]) - X /= np.linalg.norm(X, axis=1)[:, np.newaxis] + X /= np.linalg.norm(X, axis=1)[:, np.newaxis] d = X.shape[1] self.nsubvector = int(d / self.n_dims_block) self.engine = vearch.Engine("files", "logs") table = { - "name" : "test_table", - "engine" : { + "name": "test_table", + "engine": { "index_size": X.shape[0], - "retrieval_type": "VEARCH", + "retrieval_type": "VEARCH", "retrieval_param": { "metric_type": self.metric, "ncentroids": self.nlist, "nsubvector": self.nsubvector, "reordering": True, - "ns_threshold": self.ns_threshold - } + "ns_threshold": self.ns_threshold, + }, }, - "properties" : { - "feature": { - "type": "vector", - "index": True, - "dimension": d, - "store_type": "Mmap" - } - } + "properties": {"feature": {"type": "vector", "index": True, "dimension": d, "store_type": "Mmap"}}, } self.engine.create_table(table) self.engine.add2(X) indexed_num = 0 while indexed_num != X.shape[0]: - indexed_num = self.engine.get_status()['min_indexed_num'] + indexed_num = 
self.engine.get_status()["min_indexed_num"] time.sleep(0.5) def set_query_arguments(self, n_probe, k_rerank): diff --git a/ann_benchmarks/algorithms/vespa.py b/ann_benchmarks/algorithms/vespa.py index cf5ba3af7..23393f95a 100644 --- a/ann_benchmarks/algorithms/vespa.py +++ b/ann_benchmarks/algorithms/vespa.py @@ -8,10 +8,9 @@ # see https://docs.vespa.ai/en/approximate-nn-hnsw.html for more details. class VespaHnswBase(BaseANN): def __init__(self, enable_normalize, metric, dimension, param): - if metric not in ('angular', 'euclidean'): - raise NotImplementedError( - "VespaHnsw doesn't support metric %s" % metric) - self.metric = {'angular': DistanceMetric.Angular, 'euclidean': DistanceMetric.Euclidean}[metric] + if metric not in ("angular", "euclidean"): + raise NotImplementedError("VespaHnsw doesn't support metric %s" % metric) + self.metric = {"angular": DistanceMetric.Angular, "euclidean": DistanceMetric.Euclidean}[metric] normalize = False if self.metric == DistanceMetric.Angular and enable_normalize: normalize = True @@ -21,8 +20,12 @@ def __init__(self, enable_normalize, metric, dimension, param): self.max_links_per_node = param.get("M", 8) self.dimension = dimension self.neighbors_to_explore = 200 - self.name = 'VespaHnsw()' - self.index = HnswIndex(dimension, HnswIndexParams(self.max_links_per_node, self.neighbors_to_explore_at_insert, self.metric, False), normalize) + self.name = "VespaHnsw()" + self.index = HnswIndex( + dimension, + HnswIndexParams(self.max_links_per_node, self.neighbors_to_explore_at_insert, self.metric, False), + normalize, + ) def fit(self, X): for i, x in enumerate(X): @@ -38,9 +41,10 @@ def query(self, v, n): def query_with_distances(self, v, n): return self.index.find_top_k(n, v, self.neighbors_to_explore) + class VespaHnsw(VespaHnswBase): def __init__(self, metric, dimension, param): super().__init__(True, metric, dimension, param) def __str__(self): - return 'VespaHnsw ({}, ef: {})'.format(self.param, self.neighbors_to_explore) + return "VespaHnsw ({}, ef: {})".format(self.param, self.neighbors_to_explore) diff --git a/ann_benchmarks/constants.py b/ann_benchmarks/constants.py index 407200b6b..03b3c4a2b 100644 --- a/ann_benchmarks/constants.py +++ b/ann_benchmarks/constants.py @@ -1 +1 @@ -INDEX_DIR = 'indices' +INDEX_DIR = "indices" diff --git a/ann_benchmarks/data.py b/ann_benchmarks/data.py index 14b47ed47..de18ad6d9 100644 --- a/ann_benchmarks/data.py +++ b/ann_benchmarks/data.py @@ -19,9 +19,7 @@ def int_unparse_entry(entry): def bit_parse_entry(line): - return [bool(int(x)) for x in list(line.strip() - .replace(" ", "") - .replace("\t", ""))] + return [bool(int(x)) for x in list(line.strip().replace(" ", "").replace("\t", ""))] def bit_unparse_entry(entry): @@ -33,13 +31,9 @@ def bit_unparse_entry(entry): "type": numpy.float, "parse_entry": float_parse_entry, "unparse_entry": float_unparse_entry, - "finish_entries": numpy.vstack - }, - "bit": { - "type": numpy.bool_, - "parse_entry": bit_parse_entry, - "unparse_entry": bit_unparse_entry + "finish_entries": numpy.vstack, }, + "bit": {"type": numpy.bool_, "parse_entry": bit_parse_entry, "unparse_entry": bit_unparse_entry}, "int": { "type": numpy.object, "parse_entry": int_parse_entry, diff --git a/ann_benchmarks/datasets.py b/ann_benchmarks/datasets.py index d2aa566cc..15cd9f875 100644 --- a/ann_benchmarks/datasets.py +++ b/ann_benchmarks/datasets.py @@ -7,35 +7,34 @@ from urllib.request import urlretrieve - def download(src, dst): if not os.path.exists(dst): # TODO: should be atomic - 
print('downloading %s -> %s...' % (src, dst)) + print("downloading %s -> %s..." % (src, dst)) urlretrieve(src, dst) def get_dataset_fn(dataset): - if not os.path.exists('data'): - os.mkdir('data') - return os.path.join('data', '%s.hdf5' % dataset) + if not os.path.exists("data"): + os.mkdir("data") + return os.path.join("data", "%s.hdf5" % dataset) def get_dataset(which): hdf5_fn = get_dataset_fn(which) try: - url = 'http://ann-benchmarks.com/%s.hdf5' % which + url = "http://ann-benchmarks.com/%s.hdf5" % which download(url, hdf5_fn) except: print("Cannot download %s" % url) if which in DATASETS: print("Creating dataset locally") DATASETS[which](hdf5_fn) - hdf5_f = h5py.File(hdf5_fn, 'r') + hdf5_f = h5py.File(hdf5_fn, "r") # here for backward compatibility, to ensure old datasets can still be used with newer versions # cast to integer because the json parser (later on) cannot interpret numpy integers - dimension = int(hdf5_f.attrs['dimension']) if 'dimension' in hdf5_f.attrs else len(hdf5_f['train'][0]) + dimension = int(hdf5_f.attrs["dimension"]) if "dimension" in hdf5_f.attrs else len(hdf5_f["train"][0]) return hdf5_f, dimension @@ -45,45 +44,48 @@ def get_dataset(which): # just rely on the prepared datasets at http://ann-benchmarks.com -def write_output(train, test, fn, distance, point_type='float', count=100): +def write_output(train, test, fn, distance, point_type="float", count=100): from ann_benchmarks.algorithms.bruteforce import BruteForceBLAS - f = h5py.File(fn, 'w') - f.attrs['type'] = 'dense' - f.attrs['distance'] = distance - f.attrs['dimension'] = len(train[0]) - f.attrs['point_type'] = point_type - print('train size: %9d * %4d' % train.shape) - print('test size: %9d * %4d' % test.shape) - f.create_dataset('train', (len(train), len( - train[0])), dtype=train.dtype)[:] = train - f.create_dataset('test', (len(test), len( - test[0])), dtype=test.dtype)[:] = test - neighbors = f.create_dataset('neighbors', (len(test), count), dtype='i') - distances = f.create_dataset('distances', (len(test), count), dtype='f') + + f = h5py.File(fn, "w") + f.attrs["type"] = "dense" + f.attrs["distance"] = distance + f.attrs["dimension"] = len(train[0]) + f.attrs["point_type"] = point_type + print("train size: %9d * %4d" % train.shape) + print("test size: %9d * %4d" % test.shape) + f.create_dataset("train", (len(train), len(train[0])), dtype=train.dtype)[:] = train + f.create_dataset("test", (len(test), len(test[0])), dtype=test.dtype)[:] = test + neighbors = f.create_dataset("neighbors", (len(test), count), dtype="i") + distances = f.create_dataset("distances", (len(test), count), dtype="f") bf = BruteForceBLAS(distance, precision=train.dtype) bf.fit(train) for i, x in enumerate(test): if i % 1000 == 0: - print('%d/%d...' % (i, len(test))) + print("%d/%d..." % (i, len(test))) res = list(bf.query_with_distances(x, count)) res.sort(key=lambda t: t[-1]) neighbors[i] = [j for j, _ in res] distances[i] = [d for _, d in res] f.close() + """ param: train and test are arrays of arrays of indices. 
""" + + def write_sparse_output(train, test, fn, distance, dimension, count=100): from ann_benchmarks.algorithms.bruteforce import BruteForceBLAS - f = h5py.File(fn, 'w') - f.attrs['type'] = 'sparse' - f.attrs['distance'] = distance - f.attrs['dimension'] = dimension - f.attrs['point_type'] = 'bit' - print('train size: %9d * %4d' % (train.shape[0], dimension)) - print('test size: %9d * %4d' % (test.shape[0], dimension)) + + f = h5py.File(fn, "w") + f.attrs["type"] = "sparse" + f.attrs["distance"] = distance + f.attrs["dimension"] = dimension + f.attrs["point_type"] = "bit" + print("train size: %9d * %4d" % (train.shape[0], dimension)) + print("test size: %9d * %4d" % (test.shape[0], dimension)) # We ensure the sets are sorted train = numpy.array(list(map(sorted, train))) @@ -92,50 +94,50 @@ def write_sparse_output(train, test, fn, distance, dimension, count=100): flat_train = numpy.hstack(train.flatten()) flat_test = numpy.hstack(test.flatten()) - f.create_dataset('train', (len(flat_train),), dtype=flat_train.dtype)[:] = flat_train - f.create_dataset('test', (len(flat_test),), dtype=flat_test.dtype)[:] = flat_test - neighbors = f.create_dataset('neighbors', (len(test), count), dtype='i') - distances = f.create_dataset('distances', (len(test), count), dtype='f') + f.create_dataset("train", (len(flat_train),), dtype=flat_train.dtype)[:] = flat_train + f.create_dataset("test", (len(flat_test),), dtype=flat_test.dtype)[:] = flat_test + neighbors = f.create_dataset("neighbors", (len(test), count), dtype="i") + distances = f.create_dataset("distances", (len(test), count), dtype="f") - f.create_dataset('size_test', (len(test),), dtype='i')[:] = list(map(len, test)) - f.create_dataset('size_train', (len(train),), dtype='i')[:] = list(map(len, train)) + f.create_dataset("size_test", (len(test),), dtype="i")[:] = list(map(len, test)) + f.create_dataset("size_train", (len(train),), dtype="i")[:] = list(map(len, train)) bf = BruteForceBLAS(distance, precision=train.dtype) bf.fit(train) for i, x in enumerate(test): if i % 1000 == 0: - print('%d/%d...' % (i, len(test))) + print("%d/%d..." 
% (i, len(test))) res = list(bf.query_with_distances(x, count)) res.sort(key=lambda t: t[-1]) neighbors[i] = [j for j, _ in res] distances[i] = [d for _, d in res] f.close() + def train_test_split(X, test_size=10000, dimension=None): import sklearn.model_selection + if dimension is None: dimension = X.shape[1] - print('Splitting %d*%d into train/test' % (X.shape[0], dimension)) - return sklearn.model_selection.train_test_split( - X, test_size=test_size, random_state=1) + print("Splitting %d*%d into train/test" % (X.shape[0], dimension)) + return sklearn.model_selection.train_test_split(X, test_size=test_size, random_state=1) def glove(out_fn, d): import zipfile - url = 'http://nlp.stanford.edu/data/glove.twitter.27B.zip' - fn = os.path.join('data', 'glove.twitter.27B.zip') + url = "http://nlp.stanford.edu/data/glove.twitter.27B.zip" + fn = os.path.join("data", "glove.twitter.27B.zip") download(url, fn) with zipfile.ZipFile(fn) as z: - print('preparing %s' % out_fn) - z_fn = 'glove.twitter.27B.%dd.txt' % d + print("preparing %s" % out_fn) + z_fn = "glove.twitter.27B.%dd.txt" % d X = [] for line in z.open(z_fn): v = [float(x) for x in line.strip().split()[1:]] X.append(numpy.array(v)) X_train, X_test = train_test_split(X) - write_output(numpy.array(X_train), numpy.array( - X_test), out_fn, 'angular') + write_output(numpy.array(X_train), numpy.array(X_test), out_fn, "angular") def _load_texmex_vectors(f, n, k): @@ -144,16 +146,17 @@ def _load_texmex_vectors(f, n, k): v = numpy.zeros((n, k)) for i in range(n): f.read(4) # ignore vec length - v[i] = struct.unpack('f' * k, f.read(k * 4)) + v[i] = struct.unpack("f" * k, f.read(k * 4)) return v def _get_irisa_matrix(t, fn): import struct + m = t.getmember(fn) f = t.extractfile(m) - k, = struct.unpack('i', f.read(4)) + (k,) = struct.unpack("i", f.read(4)) n = m.size // (4 + 4 * k) f.seek(0) return _load_texmex_vectors(f, n, k) @@ -162,32 +165,32 @@ def _get_irisa_matrix(t, fn): def sift(out_fn): import tarfile - url = 'ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz' - fn = os.path.join('data', 'sift.tar.tz') + url = "ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz" + fn = os.path.join("data", "sift.tar.tz") download(url, fn) - with tarfile.open(fn, 'r:gz') as t: - train = _get_irisa_matrix(t, 'sift/sift_base.fvecs') - test = _get_irisa_matrix(t, 'sift/sift_query.fvecs') - write_output(train, test, out_fn, 'euclidean') + with tarfile.open(fn, "r:gz") as t: + train = _get_irisa_matrix(t, "sift/sift_base.fvecs") + test = _get_irisa_matrix(t, "sift/sift_query.fvecs") + write_output(train, test, out_fn, "euclidean") def gist(out_fn): import tarfile - url = 'ftp://ftp.irisa.fr/local/texmex/corpus/gist.tar.gz' - fn = os.path.join('data', 'gist.tar.tz') + url = "ftp://ftp.irisa.fr/local/texmex/corpus/gist.tar.gz" + fn = os.path.join("data", "gist.tar.tz") download(url, fn) - with tarfile.open(fn, 'r:gz') as t: - train = _get_irisa_matrix(t, 'gist/gist_base.fvecs') - test = _get_irisa_matrix(t, 'gist/gist_query.fvecs') - write_output(train, test, out_fn, 'euclidean') + with tarfile.open(fn, "r:gz") as t: + train = _get_irisa_matrix(t, "gist/gist_base.fvecs") + test = _get_irisa_matrix(t, "gist/gist_query.fvecs") + write_output(train, test, out_fn, "euclidean") def _load_mnist_vectors(fn): import gzip import struct - print('parsing vectors in %s...' % fn) + print("parsing vectors in %s..." 
% fn) f = gzip.open(fn) type_code_info = { 0x08: (1, "!B"), @@ -195,14 +198,13 @@ def _load_mnist_vectors(fn): 0x0B: (2, "!H"), 0x0C: (4, "!I"), 0x0D: (4, "!f"), - 0x0E: (8, "!d") + 0x0E: (8, "!d"), } magic, type_code, dim_count = struct.unpack("!hBB", f.read(4)) assert magic == 0 assert type_code in type_code_info - dimensions = [struct.unpack("!I", f.read(4))[0] - for i in range(dim_count)] + dimensions = [struct.unpack("!I", f.read(4))[0] for i in range(dim_count)] entry_count = dimensions[0] entry_size = numpy.product(dimensions[1:]) @@ -210,41 +212,46 @@ def _load_mnist_vectors(fn): b, format_string = type_code_info[type_code] vectors = [] for i in range(entry_count): - vectors.append([struct.unpack(format_string, f.read(b))[0] - for j in range(entry_size)]) + vectors.append([struct.unpack(format_string, f.read(b))[0] for j in range(entry_size)]) return numpy.array(vectors) def mnist(out_fn): - download( - 'http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz', 'mnist-train.gz') # noqa - download( - 'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz', 'mnist-test.gz') # noqa - train = _load_mnist_vectors('mnist-train.gz') - test = _load_mnist_vectors('mnist-test.gz') - write_output(train, test, out_fn, 'euclidean') + download("http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz", "mnist-train.gz") # noqa + download("http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz", "mnist-test.gz") # noqa + train = _load_mnist_vectors("mnist-train.gz") + test = _load_mnist_vectors("mnist-test.gz") + write_output(train, test, out_fn, "euclidean") def fashion_mnist(out_fn): - download('http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz', # noqa - 'fashion-mnist-train.gz') - download('http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz', # noqa - 'fashion-mnist-test.gz') - train = _load_mnist_vectors('fashion-mnist-train.gz') - test = _load_mnist_vectors('fashion-mnist-test.gz') - write_output(train, test, out_fn, 'euclidean') + download( + "http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz", # noqa + "fashion-mnist-train.gz", + ) + download( + "http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz", # noqa + "fashion-mnist-test.gz", + ) + train = _load_mnist_vectors("fashion-mnist-train.gz") + test = _load_mnist_vectors("fashion-mnist-test.gz") + write_output(train, test, out_fn, "euclidean") + # Creates a 'deep image descriptor' dataset using the 'deep10M.fvecs' sample # from http://sites.skoltech.ru/compvision/noimi/. The download logic is adapted # from the script https://github.com/arbabenko/GNOIMI/blob/master/downloadDeep1B.py. 
def deep_image(out_fn): - yadisk_key = 'https://yadi.sk/d/11eDCm7Dsn9GA' - response = urlopen('https://cloud-api.yandex.net/v1/disk/public/resources/download?public_key=' \ - + yadisk_key + '&path=/deep10M.fvecs') + yadisk_key = "https://yadi.sk/d/11eDCm7Dsn9GA" + response = urlopen( + "https://cloud-api.yandex.net/v1/disk/public/resources/download?public_key=" + + yadisk_key + + "&path=/deep10M.fvecs" + ) response_body = response.read().decode("utf-8") - dataset_url = response_body.split(',')[0][9:-1] - filename = os.path.join('data', 'deep-image.fvecs') + dataset_url = response_body.split(",")[0][9:-1] + filename = os.path.join("data", "deep-image.fvecs") download(dataset_url, filename) # In the fvecs file format, each vector is stored by first writing its @@ -254,14 +261,16 @@ def deep_image(out_fn): fv = fv.reshape(-1, dim + 1)[:, 1:] X_train, X_test = train_test_split(fv) - write_output(X_train, X_test, out_fn, 'angular') + write_output(X_train, X_test, out_fn, "angular") + def transform_bag_of_words(filename, n_dimensions, out_fn): import gzip from scipy.sparse import lil_matrix from sklearn.feature_extraction.text import TfidfTransformer from sklearn import random_projection - with gzip.open(filename, 'rb') as f: + + with gzip.open(filename, "rb") as f: file_content = f.readlines() entries = int(file_content[0]) words = int(file_content[1]) @@ -274,25 +283,23 @@ def transform_bag_of_words(filename, n_dimensions, out_fn): print("normalizing matrix entries with tfidf...") B = TfidfTransformer().fit_transform(A) print("reducing dimensionality...") - C = random_projection.GaussianRandomProjection( - n_components=n_dimensions).fit_transform(B) + C = random_projection.GaussianRandomProjection(n_components=n_dimensions).fit_transform(B) X_train, X_test = train_test_split(C) - write_output(numpy.array(X_train), numpy.array( - X_test), out_fn, 'angular') + write_output(numpy.array(X_train), numpy.array(X_test), out_fn, "angular") def nytimes(out_fn, n_dimensions): - fn = 'nytimes_%s.txt.gz' % n_dimensions - download('https://archive.ics.uci.edu/ml/machine-learning-databases/bag-of-words/docword.nytimes.txt.gz', fn) # noqa + fn = "nytimes_%s.txt.gz" % n_dimensions + download( + "https://archive.ics.uci.edu/ml/machine-learning-databases/bag-of-words/docword.nytimes.txt.gz", fn + ) # noqa transform_bag_of_words(fn, n_dimensions, out_fn) def random_float(out_fn, n_dims, n_samples, centers, distance): import sklearn.datasets - X, _ = sklearn.datasets.make_blobs( - n_samples=n_samples, n_features=n_dims, - centers=centers, random_state=1) + X, _ = sklearn.datasets.make_blobs(n_samples=n_samples, n_features=n_dims, centers=centers, random_state=1) X_train, X_test = train_test_split(X, test_size=0.1) write_output(X_train, X_test, out_fn, distance) @@ -300,63 +307,62 @@ def random_float(out_fn, n_dims, n_samples, centers, distance): def random_bitstring(out_fn, n_dims, n_samples, n_queries): import sklearn.datasets - Y, _ = sklearn.datasets.make_blobs( - n_samples=n_samples, n_features=n_dims, - centers=n_queries, random_state=1) + Y, _ = sklearn.datasets.make_blobs(n_samples=n_samples, n_features=n_dims, centers=n_queries, random_state=1) X = numpy.zeros((n_samples, n_dims), dtype=numpy.bool_) for i, vec in enumerate(Y): X[i] = numpy.array([v > 0 for v in vec], dtype=numpy.bool_) X_train, X_test = train_test_split(X, test_size=n_queries) - write_output(X_train, X_test, out_fn, 'hamming', 'bit') + write_output(X_train, X_test, out_fn, "hamming", "bit") def word2bits(out_fn, path, fn): import tarfile 
- local_fn = fn + '.tar.gz' - url = 'http://web.stanford.edu/~maxlam/word_vectors/compressed/%s/%s.tar.gz' % ( # noqa - path, fn) + + local_fn = fn + ".tar.gz" + url = "http://web.stanford.edu/~maxlam/word_vectors/compressed/%s/%s.tar.gz" % (path, fn) # noqa download(url, local_fn) - print('parsing vectors in %s...' % local_fn) - with tarfile.open(local_fn, 'r:gz') as t: + print("parsing vectors in %s..." % local_fn) + with tarfile.open(local_fn, "r:gz") as t: f = t.extractfile(fn) n_words, k = [int(z) for z in next(f).strip().split()] X = numpy.zeros((n_words, k), dtype=numpy.bool_) for i in range(n_words): - X[i] = numpy.array([float(z) > 0 for z in next( - f).strip().split()[1:]], dtype=numpy.bool_) + X[i] = numpy.array([float(z) > 0 for z in next(f).strip().split()[1:]], dtype=numpy.bool_) X_train, X_test = train_test_split(X, test_size=1000) - write_output(X_train, X_test, out_fn, 'hamming', 'bit') + write_output(X_train, X_test, out_fn, "hamming", "bit") def sift_hamming(out_fn, fn): import tarfile - local_fn = fn + '.tar.gz' - url = 'http://sss.projects.itu.dk/ann-benchmarks/datasets/%s.tar.gz' % fn + + local_fn = fn + ".tar.gz" + url = "http://sss.projects.itu.dk/ann-benchmarks/datasets/%s.tar.gz" % fn download(url, local_fn) - print('parsing vectors in %s...' % local_fn) - with tarfile.open(local_fn, 'r:gz') as t: + print("parsing vectors in %s..." % local_fn) + with tarfile.open(local_fn, "r:gz") as t: f = t.extractfile(fn) lines = f.readlines() X = numpy.zeros((len(lines), 256), dtype=numpy.bool_) for i, line in enumerate(lines): - X[i] = numpy.array( - [int(x) > 0 for x in line.decode().strip()], dtype=numpy.bool_) + X[i] = numpy.array([int(x) > 0 for x in line.decode().strip()], dtype=numpy.bool_) X_train, X_test = train_test_split(X, test_size=1000) - write_output(X_train, X_test, out_fn, 'hamming', 'bit') + write_output(X_train, X_test, out_fn, "hamming", "bit") + def kosarak(out_fn): import gzip - local_fn = 'kosarak.dat.gz' + + local_fn = "kosarak.dat.gz" # only consider sets with at least min_elements many elements min_elements = 20 - url = 'http://fimi.uantwerpen.be/data/%s' % local_fn + url = "http://fimi.uantwerpen.be/data/%s" % local_fn download(url, local_fn) X = [] dimension = 0 - with gzip.open('kosarak.dat.gz', 'r') as f: + with gzip.open("kosarak.dat.gz", "r") as f: content = f.readlines() # preprocess data to find sets with more than 20 elements # keep track of used ids for reenumeration @@ -366,7 +372,8 @@ def kosarak(out_fn): dimension = max(dimension, max(X[-1]) + 1) X_train, X_test = train_test_split(numpy.array(X), test_size=500, dimension=dimension) - write_sparse_output(X_train, X_test, out_fn, 'jaccard', dimension) + write_sparse_output(X_train, X_test, out_fn, "jaccard", dimension) + def random_jaccard(out_fn, n=10000, size=50, universe=80): random.seed(1) @@ -376,8 +383,7 @@ def random_jaccard(out_fn, n=10000, size=50, universe=80): X.append(random.sample(l, size)) X_train, X_test = train_test_split(numpy.array(X), test_size=100, dimension=universe) - write_sparse_output(X_train, X_test, out_fn, 'jaccard', universe) - + write_sparse_output(X_train, X_test, out_fn, "jaccard", universe) def lastfm(out_fn, n_dimensions, test_size=50000): @@ -405,15 +411,12 @@ def lastfm(out_fn, n_dimensions, test_size=50000): # train an als model on the lastfm data _, _, play_counts = get_lastfm() model = implicit.als.AlternatingLeastSquares(factors=n_dimensions) - model.fit(implicit.nearest_neighbours.bm25_weight( - play_counts, K1=100, B=0.8)) + 
model.fit(implicit.nearest_neighbours.bm25_weight(play_counts, K1=100, B=0.8)) # transform item factors so that each one has the same norm, # and transform the user factors such by appending a 0 column _, item_factors = augment_inner_product_matrix(model.item_factors) - user_factors = numpy.append(model.user_factors, - numpy.zeros((model.user_factors.shape[0], 1)), - axis=1) + user_factors = numpy.append(model.user_factors, numpy.zeros((model.user_factors.shape[0], 1)), axis=1) # only query the first 50k users (speeds things up signficantly # without changing results) @@ -421,12 +424,13 @@ def lastfm(out_fn, n_dimensions, test_size=50000): # after that transformation a cosine lookup will return the same results # as the inner product on the untransformed data - write_output(item_factors, user_factors, out_fn, 'angular') + write_output(item_factors, user_factors, out_fn, "angular") -def movielens(fn, ratings_file, out_fn, separator='::', ignore_header=False): + +def movielens(fn, ratings_file, out_fn, separator="::", ignore_header=False): import zipfile - url = 'http://files.grouplens.org/datasets/movielens/%s' % fn + url = "http://files.grouplens.org/datasets/movielens/%s" % fn download(url, fn) with zipfile.ZipFile(fn) as z: @@ -434,19 +438,19 @@ def movielens(fn, ratings_file, out_fn, separator='::', ignore_header=False): if ignore_header: file.readline() - print('preparing %s' % out_fn) + print("preparing %s" % out_fn) users = {} X = [] dimension = 0 for line in file: - el = line.decode('UTF-8').split(separator) + el = line.decode("UTF-8").split(separator) userId = el[0] itemId = int(el[1]) rating = float(el[2]) - if rating < 3: # We only keep ratings >= 3 + if rating < 3: # We only keep ratings >= 3 continue if userId not in users: @@ -454,58 +458,50 @@ def movielens(fn, ratings_file, out_fn, separator='::', ignore_header=False): X.append([]) X[users[userId]].append(itemId) - dimension = max(dimension, itemId+1) + dimension = max(dimension, itemId + 1) X_train, X_test = train_test_split(numpy.array(X), test_size=500, dimension=dimension) - write_sparse_output(X_train, X_test, out_fn, 'jaccard', dimension) + write_sparse_output(X_train, X_test, out_fn, "jaccard", dimension) + def movielens1m(out_fn): - movielens('ml-1m.zip', 'ml-1m/ratings.dat', out_fn) + movielens("ml-1m.zip", "ml-1m/ratings.dat", out_fn) + def movielens10m(out_fn): - movielens('ml-10m.zip', 'ml-10M100K/ratings.dat', out_fn) + movielens("ml-10m.zip", "ml-10M100K/ratings.dat", out_fn) + def movielens20m(out_fn): - movielens('ml-20m.zip', 'ml-20m/ratings.csv', out_fn, ',', True) + movielens("ml-20m.zip", "ml-20m/ratings.csv", out_fn, ",", True) + DATASETS = { - 'deep-image-96-angular': deep_image, - 'fashion-mnist-784-euclidean': fashion_mnist, - 'gist-960-euclidean': gist, - 'glove-25-angular': lambda out_fn: glove(out_fn, 25), - 'glove-50-angular': lambda out_fn: glove(out_fn, 50), - 'glove-100-angular': lambda out_fn: glove(out_fn, 100), - 'glove-200-angular': lambda out_fn: glove(out_fn, 200), - 'mnist-784-euclidean': mnist, - 'random-xs-20-euclidean': lambda out_fn: random_float(out_fn, 20, 10000, 100, - 'euclidean'), - 'random-s-100-euclidean': lambda out_fn: random_float(out_fn, 100, 100000, 1000, - 'euclidean'), - 'random-xs-20-angular': lambda out_fn: random_float(out_fn, 20, 10000, 100, - 'angular'), - 'random-s-100-angular': lambda out_fn: random_float(out_fn, 100, 100000, 1000, - 'angular'), - 'random-xs-16-hamming': lambda out_fn: random_bitstring(out_fn, 16, 10000, - 100), - 'random-s-128-hamming': lambda 
out_fn: random_bitstring(out_fn, 128, - 50000, 1000), - 'random-l-256-hamming': lambda out_fn: random_bitstring(out_fn, 256, - 100000, 1000), - 'random-s-jaccard': lambda out_fn: random_jaccard(out_fn, n=10000, - size=20, universe=40), - 'random-l-jaccard': lambda out_fn: random_jaccard(out_fn, n=100000, - size=70, universe=100), - 'sift-128-euclidean': sift, - 'nytimes-256-angular': lambda out_fn: nytimes(out_fn, 256), - 'nytimes-16-angular': lambda out_fn: nytimes(out_fn, 16), - 'word2bits-800-hamming': lambda out_fn: word2bits( - out_fn, '400K', - 'w2b_bitlevel1_size800_vocab400K'), - 'lastfm-64-dot': lambda out_fn: lastfm(out_fn, 64), - 'sift-256-hamming': lambda out_fn: sift_hamming( - out_fn, 'sift.hamming.256'), - 'kosarak-jaccard': lambda out_fn: kosarak(out_fn), - 'movielens1m-jaccard': movielens1m, - 'movielens10m-jaccard': movielens10m, - 'movielens20m-jaccard': movielens20m, + "deep-image-96-angular": deep_image, + "fashion-mnist-784-euclidean": fashion_mnist, + "gist-960-euclidean": gist, + "glove-25-angular": lambda out_fn: glove(out_fn, 25), + "glove-50-angular": lambda out_fn: glove(out_fn, 50), + "glove-100-angular": lambda out_fn: glove(out_fn, 100), + "glove-200-angular": lambda out_fn: glove(out_fn, 200), + "mnist-784-euclidean": mnist, + "random-xs-20-euclidean": lambda out_fn: random_float(out_fn, 20, 10000, 100, "euclidean"), + "random-s-100-euclidean": lambda out_fn: random_float(out_fn, 100, 100000, 1000, "euclidean"), + "random-xs-20-angular": lambda out_fn: random_float(out_fn, 20, 10000, 100, "angular"), + "random-s-100-angular": lambda out_fn: random_float(out_fn, 100, 100000, 1000, "angular"), + "random-xs-16-hamming": lambda out_fn: random_bitstring(out_fn, 16, 10000, 100), + "random-s-128-hamming": lambda out_fn: random_bitstring(out_fn, 128, 50000, 1000), + "random-l-256-hamming": lambda out_fn: random_bitstring(out_fn, 256, 100000, 1000), + "random-s-jaccard": lambda out_fn: random_jaccard(out_fn, n=10000, size=20, universe=40), + "random-l-jaccard": lambda out_fn: random_jaccard(out_fn, n=100000, size=70, universe=100), + "sift-128-euclidean": sift, + "nytimes-256-angular": lambda out_fn: nytimes(out_fn, 256), + "nytimes-16-angular": lambda out_fn: nytimes(out_fn, 16), + "word2bits-800-hamming": lambda out_fn: word2bits(out_fn, "400K", "w2b_bitlevel1_size800_vocab400K"), + "lastfm-64-dot": lambda out_fn: lastfm(out_fn, 64), + "sift-256-hamming": lambda out_fn: sift_hamming(out_fn, "sift.hamming.256"), + "kosarak-jaccard": lambda out_fn: kosarak(out_fn), + "movielens1m-jaccard": movielens1m, + "movielens10m-jaccard": movielens10m, + "movielens20m-jaccard": movielens20m, } diff --git a/ann_benchmarks/distance.py b/ann_benchmarks/distance.py index d649a2769..94c3df434 100644 --- a/ann_benchmarks/distance.py +++ b/ann_benchmarks/distance.py @@ -4,51 +4,59 @@ # Need own implementation of jaccard because scipy's # implementation is different + def jaccard(a, b): if len(a) == 0 or len(b) == 0: - return 0 + return 0 intersect = len(set(a) & set(b)) return intersect / (float)(len(a) + len(b) - intersect) + def norm(a): - return np.sum(a ** 2) ** 0.5 + return np.sum(a**2) ** 0.5 + def euclidean(a, b): return norm(a - b) + metrics = { - 'hamming': { - 'distance': lambda a, b: np.sum(a.astype(np.bool_) ^ b.astype(np.bool_)), - 'distance_valid': lambda a: True, + "hamming": { + "distance": lambda a, b: np.sum(a.astype(np.bool_) ^ b.astype(np.bool_)), + "distance_valid": lambda a: True, }, # return 1 - jaccard similarity, because smaller distances are better. 
- 'jaccard': { - 'distance': lambda a, b: 1 - jaccard(a, b), - 'distance_valid': lambda a: a < 1 - 1e-5, + "jaccard": { + "distance": lambda a, b: 1 - jaccard(a, b), + "distance_valid": lambda a: a < 1 - 1e-5, }, - 'euclidean': { - 'distance': lambda a, b: euclidean(a, b), - 'distance_valid': lambda a: True, + "euclidean": { + "distance": lambda a, b: euclidean(a, b), + "distance_valid": lambda a: True, + }, + "angular": { + "distance": lambda a, b: euclidean(a, b) / (norm(a) * norm(b)), + "distance_valid": lambda a: True, }, - 'angular': { - 'distance': lambda a, b: euclidean(a, b) / (norm(a) * norm(b)), - 'distance_valid': lambda a: True, - } } + def sparse_to_lists(data, lengths): X = [] index = 0 for l in lengths: - X.append(data[index:index+l]) + X.append(data[index : index + l]) index += l return X + def dataset_transform(dataset): - if dataset.attrs.get('type', 'dense') != 'sparse': - return np.array(dataset['train']), np.array(dataset['test']) + if dataset.attrs.get("type", "dense") != "sparse": + return np.array(dataset["train"]), np.array(dataset["test"]) # we store the dataset as a list of integers, accompanied by a list of lengths in hdf5 # so we transform it back to the format expected by the algorithms here (array of array of ints) - return sparse_to_lists(dataset['train'], dataset['size_train']), sparse_to_lists(dataset['test'], dataset['size_test']) + return sparse_to_lists(dataset["train"], dataset["size_train"]), sparse_to_lists( + dataset["test"], dataset["size_test"] + ) diff --git a/ann_benchmarks/main.py b/ann_benchmarks/main.py index 45889143f..fd478d432 100644 --- a/ann_benchmarks/main.py +++ b/ann_benchmarks/main.py @@ -13,10 +13,12 @@ from ann_benchmarks.datasets import get_dataset, DATASETS from ann_benchmarks.constants import INDEX_DIR -from ann_benchmarks.algorithms.definitions import (get_definitions, - list_algorithms, - algorithm_status, - InstantiationStatus) +from ann_benchmarks.algorithms.definitions import ( + get_definitions, + list_algorithms, + algorithm_status, + InstantiationStatus, +) from ann_benchmarks.results import get_result_filename from ann_benchmarks.runner import run, run_docker @@ -44,83 +46,56 @@ def run_worker(cpu, args, queue): if args.batch: cpu_limit = "0-%d" % (multiprocessing.cpu_count() - 1) - run_docker(definition, args.dataset, args.count, - args.runs, args.timeout, args.batch, cpu_limit, mem_limit) + run_docker(definition, args.dataset, args.count, args.runs, args.timeout, args.batch, cpu_limit, mem_limit) def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument( - '--dataset', - metavar='NAME', - help='the dataset to load training points from', - default='glove-100-angular', - choices=DATASETS.keys()) + "--dataset", + metavar="NAME", + help="the dataset to load training points from", + default="glove-100-angular", + choices=DATASETS.keys(), + ) parser.add_argument( - "-k", "--count", - default=10, - type=positive_int, - help="the number of near neighbours to search for") - parser.add_argument( - '--definitions', - metavar='FILE', - help='load algorithm definitions from FILE', - default='algos.yaml') - parser.add_argument( - '--algorithm', - metavar='NAME', - help='run only the named algorithm', - default=None) + "-k", "--count", default=10, type=positive_int, help="the number of near neighbours to search for" + ) parser.add_argument( - '--docker-tag', - metavar='NAME', 
- help='run only algorithms in a particular docker image', - default=None) + "--definitions", metavar="FILE", help="load algorithm definitions from FILE", default="algos.yaml" + ) + parser.add_argument("--algorithm", metavar="NAME", help="run only the named algorithm", default=None) parser.add_argument( - '--list-algorithms', - help='print the names of all known algorithms and exit', - action='store_true') + "--docker-tag", metavar="NAME", help="run only algorithms in a particular docker image", default=None + ) parser.add_argument( - '--force', - help='re-run algorithms even if their results already exist', - action='store_true') + "--list-algorithms", help="print the names of all known algorithms and exit", action="store_true" + ) + parser.add_argument("--force", help="re-run algorithms even if their results already exist", action="store_true") parser.add_argument( - '--runs', - metavar='COUNT', + "--runs", + metavar="COUNT", type=positive_int, - help='run each algorithm instance %(metavar)s times and use only' - ' the best result', - default=5) - parser.add_argument( - '--timeout', - type=int, - help='Timeout (in seconds) for each individual algorithm run, or -1' - 'if no timeout should be set', - default=2 * 3600) + help="run each algorithm instance %(metavar)s times and use only" " the best result", + default=5, + ) parser.add_argument( - '--local', - action='store_true', - help='If set, then will run everything locally (inside the same ' - 'process) rather than using Docker') - parser.add_argument( - '--batch', - action='store_true', - help='If set, algorithms get all queries at once') - parser.add_argument( - '--max-n-algorithms', + "--timeout", type=int, - help='Max number of algorithms to run (just used for testing)', - default=-1) + help="Timeout (in seconds) for each individual algorithm run, or -1" "if no timeout should be set", + default=2 * 3600, + ) parser.add_argument( - '--run-disabled', - help='run algorithms that are disabled in algos.yml', - action='store_true') + "--local", + action="store_true", + help="If set, then will run everything locally (inside the same " "process) rather than using Docker", + ) + parser.add_argument("--batch", action="store_true", help="If set, algorithms get all queries at once") parser.add_argument( - '--parallelism', - type=positive_int, - help='Number of Docker containers in parallel', - default=1) + "--max-n-algorithms", type=int, help="Max number of algorithms to run (just used for testing)", default=-1 + ) + parser.add_argument("--run-disabled", help="run algorithms that are disabled in algos.yml", action="store_true") + parser.add_argument("--parallelism", type=positive_int, help="Number of Docker containers in parallel", default=1) args = parser.parse_args() if args.timeout == -1: @@ -139,10 +114,9 @@ def main(): shutil.rmtree(INDEX_DIR) dataset, dimension = get_dataset(args.dataset) - point_type = dataset.attrs.get('point_type', 'float') - distance = dataset.attrs['distance'] - definitions = get_definitions( - args.definitions, dimension, point_type, distance, args.count) + point_type = dataset.attrs.get("point_type", "float") + distance = dataset.attrs["distance"] + definitions = get_definitions(args.definitions, dimension, point_type, distance, args.count) # Filter out, from the loaded definitions, all those query argument groups # that correspond to experiments that have already been run. 
(This might @@ -155,22 +129,19 @@ def main(): query_argument_groups = [[]] not_yet_run = [] for query_arguments in query_argument_groups: - fn = get_result_filename(args.dataset, - args.count, definition, - query_arguments, args.batch) + fn = get_result_filename(args.dataset, args.count, definition, query_arguments, args.batch) if args.force or not os.path.exists(fn): not_yet_run.append(query_arguments) if not_yet_run: if definition.query_argument_groups: - definition = definition._replace( - query_argument_groups=not_yet_run) + definition = definition._replace(query_argument_groups=not_yet_run) filtered_definitions.append(definition) definitions = filtered_definitions random.shuffle(definitions) if args.algorithm: - logger.info(f'running only {args.algorithm}') + logger.info(f"running only {args.algorithm}") definitions = [d for d in definitions if d.algorithm == args.algorithm] if not args.local: @@ -179,67 +150,70 @@ def main(): docker_tags = set() for image in docker_client.images.list(): for tag in image.tags: - tag = tag.split(':')[0] + tag = tag.split(":")[0] docker_tags.add(tag) if args.docker_tag: - logger.info(f'running only {args.docker_tag}') - definitions = [ - d for d in definitions if d.docker_tag == args.docker_tag] + logger.info(f"running only {args.docker_tag}") + definitions = [d for d in definitions if d.docker_tag == args.docker_tag] if set(d.docker_tag for d in definitions).difference(docker_tags): - logger.info(f'not all docker images available, only: {set(docker_tags)}') - logger.info(f'missing docker images: ' - f'{str(set(d.docker_tag for d in definitions).difference(docker_tags))}') - definitions = [ - d for d in definitions if d.docker_tag in docker_tags] + logger.info(f"not all docker images available, only: {set(docker_tags)}") + logger.info( + f"missing docker images: " f"{str(set(d.docker_tag for d in definitions).difference(docker_tags))}" + ) + definitions = [d for d in definitions if d.docker_tag in docker_tags] else: + def _test(df): status = algorithm_status(df) # If the module was loaded but doesn't actually have a constructor # of the right name, then the definition is broken if status == InstantiationStatus.NO_CONSTRUCTOR: - raise Exception("%s.%s(%s): error: the module '%s' does not" - " expose the named constructor" % ( - df.module, df.constructor, - df.arguments, df.module)) + raise Exception( + "%s.%s(%s): error: the module '%s' does not" + " expose the named constructor" % (df.module, df.constructor, df.arguments, df.module) + ) if status == InstantiationStatus.NO_MODULE: # If the module couldn't be loaded (presumably because # of a missing dependency), print a warning and remove # this definition from the list of things to be run - logging.warning("%s.%s(%s): the module '%s' could not be " - "loaded; skipping" % (df.module, df.constructor, - df.arguments, df.module)) + logging.warning( + "%s.%s(%s): the module '%s' could not be " + "loaded; skipping" % (df.module, df.constructor, df.arguments, df.module) + ) return False else: return True + definitions = [d for d in definitions if _test(d)] if not args.run_disabled: if len([d for d in definitions if d.disabled]): - logger.info(f'Not running disabled algorithms {[d for d in definitions if d.disabled]}') + logger.info(f"Not running disabled algorithms {[d for d in definitions if d.disabled]}") definitions = [d for d in definitions if not d.disabled] if args.max_n_algorithms >= 0: - definitions = definitions[:args.max_n_algorithms] + definitions = definitions[: args.max_n_algorithms] if 
len(definitions) == 0: - raise Exception('Nothing to run') + raise Exception("Nothing to run") else: - logger.info(f'Order: {definitions}') + logger.info(f"Order: {definitions}") if args.parallelism > multiprocessing.cpu_count() - 1: - raise Exception('Parallelism larger than %d! (CPU count minus one)' % (multiprocessing.cpu_count() - 1)) + raise Exception("Parallelism larger than %d! (CPU count minus one)" % (multiprocessing.cpu_count() - 1)) # Multiprocessing magic to farm this out to all CPUs queue = multiprocessing.Queue() for definition in definitions: queue.put(definition) if args.batch and args.parallelism > 1: - raise Exception(f"Batch mode uses all available CPU resources, --parallelism should be set to 1. (Was: {args.parallelism})") - workers = [multiprocessing.Process(target=run_worker, args=(i+1, args, queue)) - for i in range(args.parallelism)] + raise Exception( + f"Batch mode uses all available CPU resources, --parallelism should be set to 1. (Was: {args.parallelism})" + ) + workers = [multiprocessing.Process(target=run_worker, args=(i + 1, args, queue)) for i in range(args.parallelism)] [worker.start() for worker in workers] [worker.join() for worker in workers] diff --git a/ann_benchmarks/plotting/metrics.py b/ann_benchmarks/plotting/metrics.py index e14fe5056..6d038e8dd 100644 --- a/ann_benchmarks/plotting/metrics.py +++ b/ann_benchmarks/plotting/metrics.py @@ -10,8 +10,7 @@ def epsilon_threshold(data, count, epsilon): return data[count - 1] * (1 + epsilon) -def get_recall_values(dataset_distances, run_distances, count, threshold, - epsilon=1e-3): +def get_recall_values(dataset_distances, run_distances, count, threshold, epsilon=1e-3): recalls = np.zeros(len(run_distances)) for i in range(len(run_distances)): t = threshold(dataset_distances[i], count, epsilon) @@ -20,76 +19,73 @@ def get_recall_values(dataset_distances, run_distances, count, threshold, if d <= t: actual += 1 recalls[i] = actual - return (np.mean(recalls) / float(count), - np.std(recalls) / float(count), - recalls) + return (np.mean(recalls) / float(count), np.std(recalls) / float(count), recalls) def knn(dataset_distances, run_distances, count, metrics, epsilon=1e-3): - if 'knn' not in metrics: - print('Computing knn metrics') - knn_metrics = metrics.create_group('knn') - mean, std, recalls = get_recall_values(dataset_distances, - run_distances, count, - knn_threshold, epsilon) - knn_metrics.attrs['mean'] = mean - knn_metrics.attrs['std'] = std - knn_metrics['recalls'] = recalls + if "knn" not in metrics: + print("Computing knn metrics") + knn_metrics = metrics.create_group("knn") + mean, std, recalls = get_recall_values(dataset_distances, run_distances, count, knn_threshold, epsilon) + knn_metrics.attrs["mean"] = mean + knn_metrics.attrs["std"] = std + knn_metrics["recalls"] = recalls else: print("Found cached result") - return metrics['knn'] + return metrics["knn"] def epsilon(dataset_distances, run_distances, count, metrics, epsilon=0.01): - s = 'eps' + str(epsilon) + s = "eps" + str(epsilon) if s not in metrics: - print('Computing epsilon metrics') + print("Computing epsilon metrics") epsilon_metrics = metrics.create_group(s) - mean, std, recalls = get_recall_values(dataset_distances, - run_distances, count, - epsilon_threshold, epsilon) - epsilon_metrics.attrs['mean'] = mean - epsilon_metrics.attrs['std'] = std - epsilon_metrics['recalls'] = recalls + mean, std, recalls = get_recall_values(dataset_distances, run_distances, count, epsilon_threshold, epsilon) + epsilon_metrics.attrs["mean"] = mean + 
epsilon_metrics.attrs["std"] = std + epsilon_metrics["recalls"] = recalls else: print("Found cached result") return metrics[s] def rel(dataset_distances, run_distances, metrics): - if 'rel' not in metrics.attrs: - print('Computing rel metrics') + if "rel" not in metrics.attrs: + print("Computing rel metrics") total_closest_distance = 0.0 total_candidate_distance = 0.0 - for true_distances, found_distances in zip(dataset_distances, - run_distances): + for true_distances, found_distances in zip(dataset_distances, run_distances): total_closest_distance += np.sum(true_distances) total_candidate_distance += np.sum(found_distances) if total_closest_distance < 0.01: - metrics.attrs['rel'] = float("inf") + metrics.attrs["rel"] = float("inf") else: - metrics.attrs['rel'] = total_candidate_distance / \ - total_closest_distance + metrics.attrs["rel"] = total_candidate_distance / total_closest_distance else: print("Found cached result") - return metrics.attrs['rel'] + return metrics.attrs["rel"] def queries_per_second(queries, attrs): return 1.0 / attrs["best_search_time"] + def percentile_50(times): return np.percentile(times, 50.0) * 1000.0 + def percentile_95(times): return np.percentile(times, 95.0) * 1000.0 + def percentile_99(times): return np.percentile(times, 99.0) * 1000.0 + def percentile_999(times): return np.percentile(times, 99.9) * 1000.0 + def index_size(queries, attrs): # TODO(erikbern): should replace this with peak memory usage or something return attrs.get("index_size", 0) @@ -104,79 +100,106 @@ def candidates(queries, attrs): def dist_computations(queries, attrs): - return attrs.get("dist_comps", 0) / (attrs['run_count'] * len(queries)) + return attrs.get("dist_comps", 0) / (attrs["run_count"] * len(queries)) all_metrics = { "k-nn": { "description": "Recall", - "function": lambda true_distances, run_distances, metrics, times, run_attrs: knn(true_distances, run_distances, run_attrs["count"], metrics).attrs['mean'], # noqa + "function": lambda true_distances, run_distances, metrics, times, run_attrs: knn( + true_distances, run_distances, run_attrs["count"], metrics + ).attrs[ + "mean" + ], # noqa "worst": float("-inf"), - "lim": [0.0, 1.03] + "lim": [0.0, 1.03], }, "epsilon": { "description": "Epsilon 0.01 Recall", - "function": lambda true_distances, run_distances, metrics, times, run_attrs: epsilon(true_distances, run_distances, run_attrs["count"], metrics).attrs['mean'], # noqa - "worst": float("-inf") + "function": lambda true_distances, run_distances, metrics, times, run_attrs: epsilon( + true_distances, run_distances, run_attrs["count"], metrics + ).attrs[ + "mean" + ], # noqa + "worst": float("-inf"), }, "largeepsilon": { "description": "Epsilon 0.1 Recall", - "function": lambda true_distances, run_distances, metrics, times, run_attrs: epsilon(true_distances, run_distances, run_attrs["count"], metrics, 0.1).attrs['mean'], # noqa - "worst": float("-inf") + "function": lambda true_distances, run_distances, metrics, times, run_attrs: epsilon( + true_distances, run_distances, run_attrs["count"], metrics, 0.1 + ).attrs[ + "mean" + ], # noqa + "worst": float("-inf"), }, "rel": { "description": "Relative Error", - "function": lambda true_distances, run_distances, metrics, times, run_attrs: rel(true_distances, run_distances, metrics), # noqa - "worst": float("inf") + "function": lambda true_distances, run_distances, metrics, times, run_attrs: rel( + true_distances, run_distances, metrics + ), # noqa + "worst": float("inf"), }, "qps": { "description": "Queries per second (1/s)", - 
"function": lambda true_distances, run_distances, metrics, times, run_attrs: queries_per_second(true_distances, run_attrs), # noqa - "worst": float("-inf") + "function": lambda true_distances, run_distances, metrics, times, run_attrs: queries_per_second( + true_distances, run_attrs + ), # noqa + "worst": float("-inf"), }, "p50": { "description": "Percentile 50 (millis)", "function": lambda true_distances, run_distances, metrics, times, run_attrs: percentile_50(times), # noqa - "worst": float("inf") + "worst": float("inf"), }, "p95": { "description": "Percentile 95 (millis)", "function": lambda true_distances, run_distances, metrics, times, run_attrs: percentile_95(times), # noqa - "worst": float("inf") + "worst": float("inf"), }, "p99": { "description": "Percentile 99 (millis)", "function": lambda true_distances, run_distances, metrics, times, run_attrs: percentile_99(times), # noqa - "worst": float("inf") + "worst": float("inf"), }, "p999": { "description": "Percentile 99.9 (millis)", "function": lambda true_distances, run_distances, metrics, times, run_attrs: percentile_999(times), # noqa - "worst": float("inf") + "worst": float("inf"), }, "distcomps": { "description": "Distance computations", - "function": lambda true_distances, run_distances, metrics, times, run_attrs: dist_computations(true_distances, run_attrs), # noqa - "worst": float("inf") + "function": lambda true_distances, run_distances, metrics, times, run_attrs: dist_computations( + true_distances, run_attrs + ), # noqa + "worst": float("inf"), }, "build": { "description": "Build time (s)", - "function": lambda true_distances, run_distances, metrics, times, run_attrs: build_time(true_distances, run_attrs), # noqa - "worst": float("inf") + "function": lambda true_distances, run_distances, metrics, times, run_attrs: build_time( + true_distances, run_attrs + ), # noqa + "worst": float("inf"), }, "candidates": { "description": "Candidates generated", - "function": lambda true_distances, run_distances, metrics, times, run_attrs: candidates(true_distances, run_attrs), # noqa - "worst": float("inf") + "function": lambda true_distances, run_distances, metrics, times, run_attrs: candidates( + true_distances, run_attrs + ), # noqa + "worst": float("inf"), }, "indexsize": { "description": "Index size (kB)", - "function": lambda true_distances, run_distances, metrics, times, run_attrs: index_size(true_distances, run_attrs), # noqa - "worst": float("inf") + "function": lambda true_distances, run_distances, metrics, times, run_attrs: index_size( + true_distances, run_attrs + ), # noqa + "worst": float("inf"), }, "queriessize": { "description": "Index size (kB)/Queries per second (s)", - "function": lambda true_distances, run_distances, metrics, times, run_attrs: index_size(true_distances, run_attrs) / queries_per_second(true_distances, run_attrs), # noqa - "worst": float("inf") - } + "function": lambda true_distances, run_distances, metrics, times, run_attrs: index_size( + true_distances, run_attrs + ) + / queries_per_second(true_distances, run_attrs), # noqa + "worst": float("inf"), + }, } diff --git a/ann_benchmarks/plotting/plot_variants.py b/ann_benchmarks/plotting/plot_variants.py index e8777ee47..dcba83ad9 100644 --- a/ann_benchmarks/plotting/plot_variants.py +++ b/ann_benchmarks/plotting/plot_variants.py @@ -1,4 +1,3 @@ - all_plot_variants = { "recall/time": ("k-nn", "qps"), "recall/buildtime": ("k-nn", "build"), diff --git a/ann_benchmarks/plotting/utils.py b/ann_benchmarks/plotting/utils.py index a49d795db..035ae290e 100644 --- 
a/ann_benchmarks/plotting/utils.py +++ b/ann_benchmarks/plotting/utils.py @@ -6,9 +6,9 @@ def get_or_create_metrics(run): - if 'metrics' not in run: - run.create_group('metrics') - return run['metrics'] + if "metrics" not in run: + run.create_group("metrics") + return run["metrics"] def create_pointset(data, xn, yn): @@ -21,8 +21,7 @@ def create_pointset(data, xn, yn): # Generate Pareto frontier xs, ys, ls = [], [], [] last_x = xm["worst"] - comparator = ((lambda xv, lx: xv > lx) - if last_x < 0 else (lambda xv, lx: xv < lx)) + comparator = (lambda xv, lx: xv > lx) if last_x < 0 else (lambda xv, lx: xv < lx) for algo, algo_name, xv, yv in data: if not xv or not yv: continue @@ -37,32 +36,29 @@ def create_pointset(data, xn, yn): return xs, ys, ls, axs, ays, als -def compute_metrics(true_nn_distances, res, metric_1, metric_2, - recompute=False): +def compute_metrics(true_nn_distances, res, metric_1, metric_2, recompute=False): all_results = {} for i, (properties, run) in enumerate(res): - algo = properties['algo'] - algo_name = properties['name'] + algo = properties["algo"] + algo_name = properties["name"] # cache distances to avoid access to hdf5 file - run_distances = np.array(run['distances']) + run_distances = np.array(run["distances"]) # cache times to avoid access to hdf5 file - times = np.array(run['times']) - if recompute and 'metrics' in run: - del run['metrics'] + times = np.array(run["times"]) + if recompute and "metrics" in run: + del run["metrics"] metrics_cache = get_or_create_metrics(run) - metric_1_value = metrics[metric_1]['function']( - true_nn_distances, - run_distances, metrics_cache, times, properties) - metric_2_value = metrics[metric_2]['function']( - true_nn_distances, - run_distances, metrics_cache, times, properties) + metric_1_value = metrics[metric_1]["function"]( + true_nn_distances, run_distances, metrics_cache, times, properties + ) + metric_2_value = metrics[metric_2]["function"]( + true_nn_distances, run_distances, metrics_cache, times, properties + ) - print('%3d: %80s %12.3f %12.3f' % - (i, algo_name, metric_1_value, metric_2_value)) + print("%3d: %80s %12.3f %12.3f" % (i, algo_name, metric_1_value, metric_2_value)) - all_results.setdefault(algo, []).append( - (algo, algo_name, metric_1_value, metric_2_value)) + all_results.setdefault(algo, []).append((algo, algo_name, metric_1_value, metric_2_value)) return all_results @@ -70,47 +66,43 @@ def compute_metrics(true_nn_distances, res, metric_1, metric_2, def compute_all_metrics(true_nn_distances, run, properties, recompute=False): algo = properties["algo"] algo_name = properties["name"] - print('--') + print("--") print(algo_name) results = {} # cache distances to avoid access to hdf5 file run_distances = np.array(run["distances"]) # cache times to avoid access to hdf5 file - times = np.array(run['times']) - if recompute and 'metrics' in run: - del run['metrics'] + times = np.array(run["times"]) + if recompute and "metrics" in run: + del run["metrics"] metrics_cache = get_or_create_metrics(run) for name, metric in metrics.items(): - v = metric["function"]( - true_nn_distances, run_distances, metrics_cache, times, properties) + v = metric["function"](true_nn_distances, run_distances, metrics_cache, times, properties) results[name] = v if v: - print('%s: %g' % (name, v)) + print("%s: %g" % (name, v)) return (algo, algo_name, results) + def compute_metrics_all_runs(dataset, res, recompute=False): - true_nn_distances=list(dataset['distances']) + true_nn_distances = list(dataset["distances"]) for i, (properties, 
run) in enumerate(res): - algo = properties['algo'] - algo_name = properties['name'] + algo = properties["algo"] + algo_name = properties["name"] # cache distances to avoid access to hdf5 file # print('Load distances and times') - run_distances = np.array(run['distances']) - times = np.array(run['times']) + run_distances = np.array(run["distances"]) + times = np.array(run["times"]) # print('... done') - if recompute and 'metrics' in run: - print('Recomputing metrics, clearing cache') - del run['metrics'] + if recompute and "metrics" in run: + print("Recomputing metrics, clearing cache") + del run["metrics"] metrics_cache = get_or_create_metrics(run) - - dataset = properties['dataset'] - - run_result = { - 'algorithm': algo, - 'parameters': algo_name, - 'count': properties['count'] - } + + dataset = properties["dataset"] + + run_result = {"algorithm": algo, "parameters": algo_name, "count": properties["count"]} for name, metric in metrics.items(): v = metric["function"](true_nn_distances, run_distances, metrics_cache, times, properties) run_result[name] = v @@ -119,29 +111,23 @@ def compute_metrics_all_runs(dataset, res, recompute=False): def generate_n_colors(n): vs = np.linspace(0.3, 0.9, 7) - colors = [(.9, .4, .4, 1.)] + colors = [(0.9, 0.4, 0.4, 1.0)] def euclidean(a, b): - return sum((x - y)**2 for x, y in zip(a, b)) + return sum((x - y) ** 2 for x, y in zip(a, b)) + while len(colors) < n: - new_color = max(itertools.product(vs, vs, vs), - key=lambda a: min(euclidean(a, b) for b in colors)) - colors.append(new_color + (1.,)) + new_color = max(itertools.product(vs, vs, vs), key=lambda a: min(euclidean(a, b) for b in colors)) + colors.append(new_color + (1.0,)) return colors def create_linestyles(unique_algorithms): - colors = dict( - zip(unique_algorithms, generate_n_colors(len(unique_algorithms)))) - linestyles = dict((algo, ['--', '-.', '-', ':'][i % 4]) - for i, algo in enumerate(unique_algorithms)) - markerstyles = dict((algo, ['+', '<', 'o', '*', 'x'][i % 5]) - for i, algo in enumerate(unique_algorithms)) - faded = dict((algo, (r, g, b, 0.3)) - for algo, (r, g, b, a) in colors.items()) - return dict((algo, (colors[algo], faded[algo], - linestyles[algo], markerstyles[algo])) - for algo in unique_algorithms) + colors = dict(zip(unique_algorithms, generate_n_colors(len(unique_algorithms)))) + linestyles = dict((algo, ["--", "-.", "-", ":"][i % 4]) for i, algo in enumerate(unique_algorithms)) + markerstyles = dict((algo, ["+", "<", "o", "*", "x"][i % 5]) for i, algo in enumerate(unique_algorithms)) + faded = dict((algo, (r, g, b, 0.3)) for algo, (r, g, b, a) in colors.items()) + return dict((algo, (colors[algo], faded[algo], linestyles[algo], markerstyles[algo])) for algo in unique_algorithms) def get_up_down(metric): @@ -157,9 +143,10 @@ def get_left_right(metric): def get_plot_label(xm, ym): - template = ("%(xlabel)s-%(ylabel)s tradeoff - %(updown)s and" - " to the %(leftright)s is better") - return template % {"xlabel": xm["description"], - "ylabel": ym["description"], - "updown": get_up_down(ym), - "leftright": get_left_right(xm)} + template = "%(xlabel)s-%(ylabel)s tradeoff - %(updown)s and" " to the %(leftright)s is better" + return template % { + "xlabel": xm["description"], + "ylabel": ym["description"], + "updown": get_up_down(ym), + "leftright": get_left_right(xm), + } diff --git a/ann_benchmarks/results.py b/ann_benchmarks/results.py index 5ad801e51..e11540275 100644 --- a/ann_benchmarks/results.py +++ b/ann_benchmarks/results.py @@ -7,55 +7,51 @@ import traceback -def 
get_result_filename(dataset=None, count=None, definition=None, - query_arguments=None, batch_mode=False): - d = ['results'] +def get_result_filename(dataset=None, count=None, definition=None, query_arguments=None, batch_mode=False): + d = ["results"] if dataset: d.append(dataset) if count: d.append(str(count)) if definition: - d.append(definition.algorithm + ('-batch' if batch_mode else '')) + d.append(definition.algorithm + ("-batch" if batch_mode else "")) data = definition.arguments + query_arguments - d.append(re.sub(r'\W+', '_', json.dumps(data, sort_keys=True)) - .strip('_') + ".hdf5") + d.append(re.sub(r"\W+", "_", json.dumps(data, sort_keys=True)).strip("_") + ".hdf5") return os.path.join(*d) -def store_results(dataset, count, definition, query_arguments, attrs, results, - batch): - fn = get_result_filename( - dataset, count, definition, query_arguments, batch) +def store_results(dataset, count, definition, query_arguments, attrs, results, batch): + fn = get_result_filename(dataset, count, definition, query_arguments, batch) head, tail = os.path.split(fn) if not os.path.isdir(head): os.makedirs(head) - f = h5py.File(fn, 'w') + f = h5py.File(fn, "w") for k, v in attrs.items(): f.attrs[k] = v - times = f.create_dataset('times', (len(results),), 'f') - neighbors = f.create_dataset('neighbors', (len(results), count), 'i') - distances = f.create_dataset('distances', (len(results), count), 'f') + times = f.create_dataset("times", (len(results),), "f") + neighbors = f.create_dataset("neighbors", (len(results), count), "i") + distances = f.create_dataset("distances", (len(results), count), "f") for i, (time, ds) in enumerate(results): times[i] = time neighbors[i] = [n for n, d in ds] + [-1] * (count - len(ds)) - distances[i] = [d for n, d in ds] + [float('inf')] * (count - len(ds)) + distances[i] = [d for n, d in ds] + [float("inf")] * (count - len(ds)) f.close() def load_all_results(dataset=None, count=None, batch_mode=False): for root, _, files in os.walk(get_result_filename(dataset, count)): for fn in files: - if os.path.splitext(fn)[-1] != '.hdf5': + if os.path.splitext(fn)[-1] != ".hdf5": continue try: - f = h5py.File(os.path.join(root, fn), 'r+') + f = h5py.File(os.path.join(root, fn), "r+") properties = dict(f.attrs) - if batch_mode != properties['batch_mode']: + if batch_mode != properties["batch_mode"]: continue yield properties, f f.close() except: - print('Was unable to read', fn) + print("Was unable to read", fn) traceback.print_exc() @@ -63,5 +59,5 @@ def get_unique_algorithms(): algorithms = set() for batch_mode in [False, True]: for properties, _ in load_all_results(batch_mode=batch_mode): - algorithms.add(properties['algo']) + algorithms.add(properties["algo"]) return algorithms diff --git a/ann_benchmarks/runner.py b/ann_benchmarks/runner.py index 8aaddc5cd..a0ab5f457 100644 --- a/ann_benchmarks/runner.py +++ b/ann_benchmarks/runner.py @@ -11,22 +11,20 @@ import numpy import psutil -from ann_benchmarks.algorithms.definitions import (Definition, - instantiate_algorithm) +from ann_benchmarks.algorithms.definitions import Definition, instantiate_algorithm from ann_benchmarks.datasets import get_dataset, DATASETS from ann_benchmarks.distance import metrics, dataset_transform from ann_benchmarks.results import store_results -def run_individual_query(algo, X_train, X_test, distance, count, run_count, - batch): - prepared_queries = \ - (batch and hasattr(algo, "prepare_batch_query")) or \ - ((not batch) and hasattr(algo, "prepare_query")) +def run_individual_query(algo, X_train, 
X_test, distance, count, run_count, batch): + prepared_queries = (batch and hasattr(algo, "prepare_batch_query")) or ( + (not batch) and hasattr(algo, "prepare_query") + ) - best_search_time = float('inf') + best_search_time = float("inf") for i in range(run_count): - print('Run %d/%d...' % (i + 1, run_count)) + print("Run %d/%d..." % (i + 1, run_count)) # a bit dumb but can't be a scalar since of Python's scoping rules n_items_processed = [0] @@ -35,20 +33,23 @@ def single_query(v): algo.prepare_query(v, count) start = time.time() algo.run_prepared_query() - total = (time.time() - start) + total = time.time() - start candidates = algo.get_prepared_query_results() else: start = time.time() candidates = algo.query(v, count) - total = (time.time() - start) - candidates = [(int(idx), float(metrics[distance]['distance'](v, X_train[idx]))) # noqa - for idx in candidates] + total = time.time() - start + candidates = [ + (int(idx), float(metrics[distance]["distance"](v, X_train[idx]))) for idx in candidates # noqa + ] n_items_processed[0] += 1 if n_items_processed[0] % 1000 == 0: - print('Processed %d/%d queries...' % (n_items_processed[0], len(X_test))) + print("Processed %d/%d queries..." % (n_items_processed[0], len(X_test))) if len(candidates) > count: - print('warning: algorithm %s returned %d results, but count' - ' is only %d)' % (algo, len(candidates), count)) + print( + "warning: algorithm %s returned %d results, but count" + " is only %d)" % (algo, len(candidates), count) + ) return (total, candidates) def batch_query(X): @@ -56,15 +57,16 @@ def batch_query(X): algo.prepare_batch_query(X, count) start = time.time() algo.run_batch_query() - total = (time.time() - start) + total = time.time() - start else: start = time.time() algo.batch_query(X, count) - total = (time.time() - start) + total = time.time() - start results = algo.get_batch_results() - candidates = [[(int(idx), float(metrics[distance]['distance'](v, X_train[idx]))) # noqa - for idx in single_results] - for v, single_results in zip(X, results)] + candidates = [ + [(int(idx), float(metrics[distance]["distance"](v, X_train[idx]))) for idx in single_results] # noqa + for v, single_results in zip(X, results) + ] return [(total / float(len(X)), v) for v in candidates] if batch: @@ -87,7 +89,7 @@ def batch_query(X): "name": str(algo), "run_count": run_count, "distance": distance, - "count": int(count) + "count": int(count), } additional = algo.get_additional() for k in additional: @@ -97,18 +99,23 @@ def batch_query(X): def run(definition, dataset, count, run_count, batch): algo = instantiate_algorithm(definition) - assert not definition.query_argument_groups \ - or hasattr(algo, "set_query_arguments"), """\ + assert not definition.query_argument_groups or hasattr( + algo, "set_query_arguments" + ), """\ error: query argument groups have been specified for %s.%s(%s), but the \ algorithm instantiated from it does not implement the set_query_arguments \ -function""" % (definition.module, definition.constructor, definition.arguments) +function""" % ( + definition.module, + definition.constructor, + definition.arguments, + ) D, dimension = get_dataset(dataset) - X_train = numpy.array(D['train']) - X_test = numpy.array(D['test']) - distance = D.attrs['distance'] - print('got a train set of size (%d * %d)' % (X_train.shape[0], dimension)) - print('got %d queries' % len(X_test)) + X_train = numpy.array(D["train"]) + X_test = numpy.array(D["test"]) + distance = D.attrs["distance"] + print("got a train set of size (%d * %d)" % 
(X_train.shape[0], dimension)) + print("got %d queries" % len(X_test)) X_train, X_test = dataset_transform(D) @@ -121,8 +128,8 @@ def run(definition, dataset, count, run_count, batch): algo.fit(X_train) build_time = time.time() - t0 index_size = algo.get_memory_usage() - memory_usage_before - print('Built index in', build_time) - print('Index size: ', index_size) + print("Built index in", build_time) + print("Index size: ", index_size) query_argument_groups = definition.query_argument_groups # Make sure that algorithms with no query argument groups still get run @@ -131,68 +138,49 @@ def run(definition, dataset, count, run_count, batch): query_argument_groups = [[]] for pos, query_arguments in enumerate(query_argument_groups, 1): - print("Running query argument group %d of %d..." % - (pos, len(query_argument_groups))) + print("Running query argument group %d of %d..." % (pos, len(query_argument_groups))) if query_arguments: algo.set_query_arguments(*query_arguments) - descriptor, results = run_individual_query( - algo, X_train, X_test, distance, count, run_count, batch) + descriptor, results = run_individual_query(algo, X_train, X_test, distance, count, run_count, batch) descriptor["build_time"] = build_time descriptor["index_size"] = index_size descriptor["algo"] = definition.algorithm descriptor["dataset"] = dataset - store_results(dataset, count, definition, - query_arguments, descriptor, results, batch) + store_results(dataset, count, definition, query_arguments, descriptor, results, batch) finally: algo.done() def run_from_cmdline(): - parser = argparse.ArgumentParser(''' + parser = argparse.ArgumentParser( + """ NOTICE: You probably want to run.py rather than this script. -''') - parser.add_argument( - '--dataset', - choices=DATASETS.keys(), - help='Dataset to benchmark on.', - required=True) - parser.add_argument( - '--algorithm', - help='Name of algorithm for saving the results.', - required=True) - parser.add_argument( - '--module', - help='Python module containing algorithm. E.g. "ann_benchmarks.algorithms.annoy"', - required=True) +""" + ) + parser.add_argument("--dataset", choices=DATASETS.keys(), help="Dataset to benchmark on.", required=True) + parser.add_argument("--algorithm", help="Name of algorithm for saving the results.", required=True) parser.add_argument( - '--constructor', - help='Constructer to load from modulel. E.g. "Annoy"', - required=True) + "--module", help='Python module containing algorithm. E.g. "ann_benchmarks.algorithms.annoy"', required=True + ) + parser.add_argument("--constructor", help='Constructer to load from modulel. E.g. "Annoy"', required=True) parser.add_argument( - '--count', - help='K: Number of nearest neighbours for the algorithm to return.', - required=True, - type=int) + "--count", help="K: Number of nearest neighbours for the algorithm to return.", required=True, type=int + ) parser.add_argument( - '--runs', - help='Number of times to run the algorihm. Will use the fastest run-time over the bunch.', + "--runs", + help="Number of times to run the algorihm. Will use the fastest run-time over the bunch.", required=True, - type=int) + type=int, + ) parser.add_argument( - '--batch', + "--batch", help='If flag included, algorithms will be run in batch mode, rather than "individual query" mode.', - action='store_true') - parser.add_argument( - 'build', - help='JSON of arguments to pass to the constructor. E.g. ["angular", 100]' - ) - parser.add_argument( - 'queries', - help='JSON of arguments to pass to the queries. E.g. 
[100]', - nargs='*', - default=[]) + action="store_true", + ) + parser.add_argument("build", help='JSON of arguments to pass to the constructor. E.g. ["angular", 100]') + parser.add_argument("queries", help="JSON of arguments to pass to the queries. E.g. [100]", nargs="*", default=[]) args = parser.parse_args() algo_args = json.loads(args.build) print(algo_args) @@ -205,21 +193,28 @@ def run_from_cmdline(): constructor=args.constructor, arguments=algo_args, query_argument_groups=query_args, - disabled=False + disabled=False, ) run(definition, args.dataset, args.count, args.runs, args.batch) -def run_docker(definition, dataset, count, runs, timeout, batch, cpu_limit, - mem_limit=None): - cmd = ['--dataset', dataset, - '--algorithm', definition.algorithm, - '--module', definition.module, - '--constructor', definition.constructor, - '--runs', str(runs), - '--count', str(count)] +def run_docker(definition, dataset, count, runs, timeout, batch, cpu_limit, mem_limit=None): + cmd = [ + "--dataset", + dataset, + "--algorithm", + definition.algorithm, + "--module", + definition.module, + "--constructor", + definition.constructor, + "--runs", + str(runs), + "--count", + str(count), + ] if batch: - cmd += ['--batch'] + cmd += ["--batch"] cmd.append(json.dumps(definition.arguments)) cmd += [json.dumps(qag) for qag in definition.query_argument_groups] @@ -231,24 +226,24 @@ def run_docker(definition, dataset, count, runs, timeout, batch, cpu_limit, definition.docker_tag, cmd, volumes={ - os.path.abspath('ann_benchmarks'): - {'bind': '/home/app/ann_benchmarks', 'mode': 'ro'}, - os.path.abspath('data'): - {'bind': '/home/app/data', 'mode': 'ro'}, - os.path.abspath('results'): - {'bind': '/home/app/results', 'mode': 'rw'}, + os.path.abspath("ann_benchmarks"): {"bind": "/home/app/ann_benchmarks", "mode": "ro"}, + os.path.abspath("data"): {"bind": "/home/app/data", "mode": "ro"}, + os.path.abspath("results"): {"bind": "/home/app/results", "mode": "rw"}, }, cpuset_cpus=cpu_limit, mem_limit=mem_limit, - detach=True) + detach=True, + ) logger = logging.getLogger(f"annb.{container.short_id}") - logger.info('Created container %s: CPU limit %s, mem limit %s, timeout %d, command %s' % \ - (container.short_id, cpu_limit, mem_limit, timeout, cmd)) + logger.info( + "Created container %s: CPU limit %s, mem limit %s, timeout %d, command %s" + % (container.short_id, cpu_limit, mem_limit, timeout, cmd) + ) def stream_logs(): for line in container.logs(stream=True): - logger.info(colors.color(line.decode().rstrip(), fg='blue')) + logger.info(colors.color(line.decode().rstrip(), fg="blue")) t = threading.Thread(target=stream_logs, daemon=True) t.start() @@ -257,21 +252,22 @@ def stream_logs(): return_value = container.wait(timeout=timeout) _handle_container_return_value(return_value, container, logger) except: - logger.error('Container.wait for container %s failed with exception' % container.short_id) + logger.error("Container.wait for container %s failed with exception" % container.short_id) traceback.print_exc() finally: container.remove(force=True) + def _handle_container_return_value(return_value, container, logger): - base_msg = 'Child process for container %s' % (container.short_id) - if type(return_value) is dict: # The return value from container.wait changes from int to dict in docker 3.0.0 - error_msg = return_value['Error'] - exit_code = return_value['StatusCode'] - msg = base_msg + 'returned exit code %d with message %s' %(exit_code, error_msg) - else: + base_msg = "Child process for container %s" % 
(container.short_id) + if type(return_value) is dict: # The return value from container.wait changes from int to dict in docker 3.0.0 + error_msg = return_value["Error"] + exit_code = return_value["StatusCode"] + msg = base_msg + "returned exit code %d with message %s" % (exit_code, error_msg) + else: exit_code = return_value - msg = base_msg + 'returned exit code %d' % (exit_code) + msg = base_msg + "returned exit code %d" % (exit_code) if exit_code not in [0, None]: - logger.error(colors.color(container.logs().decode(), fg='red')) + logger.error(colors.color(container.logs().decode(), fg="red")) logger.error(msg) diff --git a/create_dataset.py b/create_dataset.py index b9463a8e0..0726b470f 100644 --- a/create_dataset.py +++ b/create_dataset.py @@ -3,10 +3,7 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument( - '--dataset', - choices=DATASETS.keys(), - required=True) + parser.add_argument("--dataset", choices=DATASETS.keys(), required=True) args = parser.parse_args() fn = get_dataset_fn(args.dataset) DATASETS[args.dataset](fn) diff --git a/create_website.py b/create_website.py index 88094775d..7cc6481e1 100644 --- a/create_website.py +++ b/create_website.py @@ -1,23 +1,16 @@ import matplotlib as mpl -mpl.use('Agg') # noqa + +mpl.use("Agg") # noqa import argparse import os -import json -import pickle -import yaml -import numpy import hashlib from jinja2 import Environment, FileSystemLoader from ann_benchmarks import results from ann_benchmarks.datasets import get_dataset -from ann_benchmarks.plotting.plot_variants import (all_plot_variants - as plot_variants) +from ann_benchmarks.plotting.plot_variants import all_plot_variants as plot_variants from ann_benchmarks.plotting.metrics import all_metrics as metrics -from ann_benchmarks.plotting.utils import (get_plot_label, compute_metrics, - compute_all_metrics, - create_pointset, - create_linestyles) +from ann_benchmarks.plotting.utils import get_plot_label, compute_all_metrics, create_pointset, create_linestyles import plot colors = [ @@ -29,7 +22,7 @@ "rgba(227,26,28,1)", "rgba(253,191,111,1)", "rgba(255,127,0,1)", - "rgba(202,178,214,1)" + "rgba(202,178,214,1)", ] point_styles = { @@ -43,17 +36,19 @@ def convert_color(color): r, g, b, a = color - return "rgba(%(r)d, %(g)d, %(b)d, %(a)d)" % { - "r": r * 255, "g": g * 255, "b": b * 255, "a": a} + return "rgba(%(r)d, %(g)d, %(b)d, %(a)d)" % {"r": r * 255, "g": g * 255, "b": b * 255, "a": a} def convert_linestyle(ls): new_ls = {} for algo in ls.keys(): algostyle = ls[algo] - new_ls[algo] = (convert_color(algostyle[0]), - convert_color(algostyle[1]), - algostyle[2], point_styles[algostyle[3]]) + new_ls[algo] = ( + convert_color(algostyle[0]), + convert_color(algostyle[1]), + algostyle[2], + point_styles[algostyle[3]], + ) return new_ls @@ -74,8 +69,7 @@ def get_distance_from_desc(desc): def get_dataset_label(desc): - return "{} (k = {})".format(get_dataset_from_desc(desc), - get_count_from_desc(desc)) + return "{} (k = {})".format(get_dataset_from_desc(desc), get_count_from_desc(desc)) def directory_path(s): @@ -94,132 +88,115 @@ def prepare_data(data, xn, yn): parser = argparse.ArgumentParser() parser.add_argument( - '--plottype', - help='Generate only the plots specified', - nargs='*', + "--plottype", + help="Generate only the plots specified", + nargs="*", choices=plot_variants.keys(), - default=plot_variants.keys()) -parser.add_argument( - '--outputdir', - help='Select output directory', - default='.', - type=directory_path, - action='store') 
-parser.add_argument( - '--latex', - help='generates latex code for each plot', - action='store_true') -parser.add_argument( - '--scatter', - help='create scatterplot for data', - action='store_true') -parser.add_argument( - '--recompute', - help='Clears the cache and recomputes the metrics', - action='store_true') + default=plot_variants.keys(), +) +parser.add_argument("--outputdir", help="Select output directory", default=".", type=directory_path, action="store") +parser.add_argument("--latex", help="generates latex code for each plot", action="store_true") +parser.add_argument("--scatter", help="create scatterplot for data", action="store_true") +parser.add_argument("--recompute", help="Clears the cache and recomputes the metrics", action="store_true") args = parser.parse_args() def get_lines(all_data, xn, yn, render_all_points): - """ For each algorithm run on a dataset, obtain its performance + """For each algorithm run on a dataset, obtain its performance curve coords.""" plot_data = [] for algo in sorted(all_data.keys(), key=lambda x: x.lower()): - xs, ys, ls, axs, ays, als = \ - create_pointset(prepare_data(all_data[algo], xn, yn), xn, yn) + xs, ys, ls, axs, ays, als = create_pointset(prepare_data(all_data[algo], xn, yn), xn, yn) if render_all_points: xs, ys, ls = axs, ays, als - plot_data.append({"name": algo, "coords": zip(xs, ys), "labels": ls, - "scatter": render_all_points}) + plot_data.append({"name": algo, "coords": zip(xs, ys), "labels": ls, "scatter": render_all_points}) return plot_data -def create_plot(all_data, xn, yn, linestyle, j2_env, additional_label="", - plottype="line"): +def create_plot(all_data, xn, yn, linestyle, j2_env, additional_label="", plottype="line"): xm, ym = (metrics[xn], metrics[yn]) render_all_points = plottype == "bubble" plot_data = get_lines(all_data, xn, yn, render_all_points) - latex_code = j2_env.get_template("latex.template").\ - render(plot_data=plot_data, caption=get_plot_label(xm, ym), - xlabel=xm["description"], ylabel=ym["description"]) + latex_code = j2_env.get_template("latex.template").render( + plot_data=plot_data, caption=get_plot_label(xm, ym), xlabel=xm["description"], ylabel=ym["description"] + ) plot_data = get_lines(all_data, xn, yn, render_all_points) - button_label = hashlib.sha224((get_plot_label(xm, ym) + additional_label) - .encode("utf-8")).hexdigest() - return j2_env.get_template("chartjs.template").\ - render(args=args, latex_code=latex_code, button_label=button_label, - data_points=plot_data, - xlabel=xm["description"], ylabel=ym["description"], - plottype=plottype, plot_label=get_plot_label(xm, ym), - label=additional_label, linestyle=linestyle, - render_all_points=render_all_points) + button_label = hashlib.sha224((get_plot_label(xm, ym) + additional_label).encode("utf-8")).hexdigest() + return j2_env.get_template("chartjs.template").render( + args=args, + latex_code=latex_code, + button_label=button_label, + data_points=plot_data, + xlabel=xm["description"], + ylabel=ym["description"], + plottype=plottype, + plot_label=get_plot_label(xm, ym), + label=additional_label, + linestyle=linestyle, + render_all_points=render_all_points, + ) def build_detail_site(data, label_func, j2_env, linestyles, batch=False): for (name, runs) in data.items(): print("Building '%s'" % name) - all_runs = runs.keys() + runs.keys() label = label_func(name) data = {"normal": [], "scatter": []} for plottype in args.plottype: xn, yn = plot_variants[plottype] - data["normal"].append(create_plot( - runs, xn, yn, convert_linestyle(linestyles), 
j2_env)) + data["normal"].append(create_plot(runs, xn, yn, convert_linestyle(linestyles), j2_env)) if args.scatter: data["scatter"].append( - create_plot(runs, xn, yn, convert_linestyle(linestyles), - j2_env, "Scatterplot ", "bubble")) + create_plot(runs, xn, yn, convert_linestyle(linestyles), j2_env, "Scatterplot ", "bubble") + ) # create png plot for summary page data_for_plot = {} for k in runs.keys(): - data_for_plot[k] = prepare_data(runs[k], 'k-nn', 'qps') + data_for_plot[k] = prepare_data(runs[k], "k-nn", "qps") plot.create_plot( - data_for_plot, False, - 'linear', 'log', 'k-nn', 'qps', - args.outputdir + name + '.png', - linestyles, batch) - output_path = \ - args.outputdir + name + '.html' + data_for_plot, False, "linear", "log", "k-nn", "qps", args.outputdir + name + ".png", linestyles, batch + ) + output_path = args.outputdir + name + ".html" with open(output_path, "w") as text_file: - text_file.write(j2_env.get_template("detail_page.html"). - render(title=label, plot_data=data, - args=args, batch=batch)) + text_file.write( + j2_env.get_template("detail_page.html").render(title=label, plot_data=data, args=args, batch=batch) + ) def build_index_site(datasets, algorithms, j2_env, file_name): - dataset_data = {'batch': [], 'non-batch': []} - for mode in ['batch', 'non-batch']: - distance_measures = sorted( - set([get_distance_from_desc(e) for e in datasets[mode].keys()])) - sorted_datasets = sorted( - set([get_dataset_from_desc(e) for e in datasets[mode].keys()])) + dataset_data = {"batch": [], "non-batch": []} + for mode in ["batch", "non-batch"]: + distance_measures = sorted(set([get_distance_from_desc(e) for e in datasets[mode].keys()])) + sorted_datasets = sorted(set([get_dataset_from_desc(e) for e in datasets[mode].keys()])) for dm in distance_measures: d = {"name": dm.capitalize(), "entries": []} for ds in sorted_datasets: - matching_datasets = [e for e in datasets[mode].keys() - if get_dataset_from_desc(e) == ds and # noqa - get_distance_from_desc(e) == dm] - sorted_matches = sorted( - matching_datasets, - key=lambda e: int(get_count_from_desc(e))) + matching_datasets = [ + e + for e in datasets[mode].keys() + if get_dataset_from_desc(e) == ds and get_distance_from_desc(e) == dm # noqa + ] + sorted_matches = sorted(matching_datasets, key=lambda e: int(get_count_from_desc(e))) for idd in sorted_matches: - d["entries"].append( - {"name": idd, "desc": get_dataset_label(idd)}) + d["entries"].append({"name": idd, "desc": get_dataset_label(idd)}) dataset_data[mode].append(d) with open(args.outputdir + "index.html", "w") as text_file: - text_file.write(j2_env.get_template("summary.html"). 
- render(title="ANN-Benchmarks", - dataset_with_distances=dataset_data, - algorithms=algorithms)) + text_file.write( + j2_env.get_template("summary.html").render( + title="ANN-Benchmarks", dataset_with_distances=dataset_data, algorithms=algorithms + ) + ) def load_all_results(): """Read all result files and compute all metrics""" - all_runs_by_dataset = {'batch': {}, 'non-batch': {}} - all_runs_by_algorithm = {'batch': {}, 'non-batch': {}} + all_runs_by_dataset = {"batch": {}, "non-batch": {}} + all_runs_by_algorithm = {"batch": {}, "non-batch": {}} cached_true_dist = [] old_sdn = None for mode in ["non-batch", "batch"]: @@ -230,15 +207,12 @@ def load_all_results(): cached_true_dist = list(dataset["distances"]) old_sdn = sdn algo_ds = get_dataset_label(sdn) - desc_suffix = ("-batch" if mode == "batch" else "") + desc_suffix = "-batch" if mode == "batch" else "" algo = properties["algo"] + desc_suffix sdn += desc_suffix - ms = compute_all_metrics( - cached_true_dist, f, properties, args.recompute) - all_runs_by_algorithm[mode].setdefault( - algo, {}).setdefault(algo_ds, []).append(ms) - all_runs_by_dataset[mode].setdefault( - sdn, {}).setdefault(algo, []).append(ms) + ms = compute_all_metrics(cached_true_dist, f, properties, args.recompute) + all_runs_by_algorithm[mode].setdefault(algo, {}).setdefault(algo_ds, []).append(ms) + all_runs_by_dataset[mode].setdefault(sdn, {}).setdefault(algo, []).append(ms) return (all_runs_by_dataset, all_runs_by_algorithm) @@ -246,27 +220,17 @@ def load_all_results(): j2_env = Environment(loader=FileSystemLoader("./templates/"), trim_blocks=True) j2_env.globals.update(zip=zip, len=len) runs_by_ds, runs_by_algo = load_all_results() -dataset_names = [get_dataset_label(x) for x in list( - runs_by_ds['batch'].keys()) + list(runs_by_ds['non-batch'].keys())] -algorithm_names = list(runs_by_algo['batch'].keys( -)) + list(runs_by_algo['non-batch'].keys()) +dataset_names = [get_dataset_label(x) for x in list(runs_by_ds["batch"].keys()) + list(runs_by_ds["non-batch"].keys())] +algorithm_names = list(runs_by_algo["batch"].keys()) + list(runs_by_algo["non-batch"].keys()) -linestyles = {**create_linestyles(dataset_names), - **create_linestyles(algorithm_names)} +linestyles = {**create_linestyles(dataset_names), **create_linestyles(algorithm_names)} -build_detail_site( - runs_by_ds['non-batch'], - lambda label: get_dataset_label(label), j2_env, linestyles, False) +build_detail_site(runs_by_ds["non-batch"], lambda label: get_dataset_label(label), j2_env, linestyles, False) -build_detail_site( - runs_by_ds['batch'], - lambda label: get_dataset_label(label), j2_env, linestyles, True) +build_detail_site(runs_by_ds["batch"], lambda label: get_dataset_label(label), j2_env, linestyles, True) -build_detail_site( - runs_by_algo['non-batch'], - lambda x: x, j2_env, linestyles, False) +build_detail_site(runs_by_algo["non-batch"], lambda x: x, j2_env, linestyles, False) -build_detail_site( - runs_by_algo['batch'], lambda x: x, j2_env, linestyles, True) +build_detail_site(runs_by_algo["batch"], lambda x: x, j2_env, linestyles, True) build_index_site(runs_by_ds, runs_by_algo, j2_env, "index.html") diff --git a/data_export.py b/data_export.py index d253219b6..343f3acc3 100644 --- a/data_export.py +++ b/data_export.py @@ -2,19 +2,13 @@ import csv from ann_benchmarks.datasets import DATASETS, get_dataset -from ann_benchmarks.plotting.utils import compute_metrics_all_runs +from ann_benchmarks.plotting.utils import compute_metrics_all_runs from ann_benchmarks.results import load_all_results 
if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument( - '--output', - help='Path to the output file', - required=True) - parser.add_argument( - '--recompute', - action='store_true', - help='Recompute metrics') + parser.add_argument("--output", help="Path to the output file", required=True) + parser.add_argument("--recompute", action="store_true", help="Recompute metrics") args = parser.parse_args() datasets = DATASETS.keys() @@ -26,13 +20,12 @@ dataset, _ = get_dataset(dataset_name) results = compute_metrics_all_runs(dataset, results, args.recompute) for res in results: - res['dataset'] = dataset_name + res["dataset"] = dataset_name dfs.append(res) if len(dfs) > 0: - with open(args.output, 'w', newline='') as csvfile: + with open(args.output, "w", newline="") as csvfile: names = list(dfs[0].keys()) writer = csv.DictWriter(csvfile, fieldnames=names) writer.writeheader() for res in dfs: writer.writerow(res) - diff --git a/install.py b/install.py index 4e41c0892..178cb81a9 100644 --- a/install.py +++ b/install.py @@ -7,19 +7,20 @@ def build(library, args): - print('Building %s...' % library) + print("Building %s..." % library) if args is not None and len(args) != 0: q = " ".join(["--build-arg " + x.replace(" ", "\\ ") for x in args]) else: q = "" - + try: subprocess.check_call( - 'docker build %s --rm -t ann-benchmarks-%s -f' - ' install/Dockerfile.%s .' % (q, library, library), shell=True) - return {library: 'success'} + "docker build %s --rm -t ann-benchmarks-%s -f" " install/Dockerfile.%s ." % (q, library, library), + shell=True, + ) + return {library: "success"} except subprocess.CalledProcessError: - return {library: 'fail'} + return {library: "fail"} def build_multiprocess(args): @@ -27,37 +28,27 @@ def build_multiprocess(args): if __name__ == "__main__": - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument( - "--proc", - default=1, - type=positive_int, - help="the number of process to build docker images") - parser.add_argument( - '--algorithm', - metavar='NAME', - help='build only the named algorithm image', - default=None) - parser.add_argument( - '--build-arg', - help='pass given args to all docker builds', - nargs="+") + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("--proc", default=1, type=positive_int, help="the number of process to build docker images") + parser.add_argument("--algorithm", metavar="NAME", help="build only the named algorithm image", default=None) + parser.add_argument("--build-arg", help="pass given args to all docker builds", nargs="+") args = parser.parse_args() - print('Building base image...') + print("Building base image...") subprocess.check_call( - 'docker build \ - --rm -t ann-benchmarks -f install/Dockerfile .', shell=True) + "docker build \ + --rm -t ann-benchmarks -f install/Dockerfile .", + shell=True, + ) if args.algorithm: tags = [args.algorithm] - elif os.getenv('LIBRARY'): - tags = [os.getenv('LIBRARY')] + elif os.getenv("LIBRARY"): + tags = [os.getenv("LIBRARY")] else: - tags = [fn.split('.')[-1] for fn in os.listdir('install') if fn.startswith('Dockerfile.')] + tags = [fn.split(".")[-1] for fn in os.listdir("install") if fn.startswith("Dockerfile.")] - print('Building algorithm images... with (%d) processes' % args.proc) + print("Building algorithm images... 
with (%d) processes" % args.proc) if args.proc == 1: install_status = [build(tag, args.build_arg) for tag in tags] @@ -67,10 +58,10 @@ def build_multiprocess(args): pool.close() pool.join() - print('\n\nInstall Status:\n' + '\n'.join(str(algo) for algo in install_status)) + print("\n\nInstall Status:\n" + "\n".join(str(algo) for algo in install_status)) # Exit 1 if any of the installations fail. for x in install_status: - for (k,v) in x.items(): - if v == 'fail': + for (k, v) in x.items(): + if v == "fail": sys.exit(1) diff --git a/plot.py b/plot.py index f9784dbb6..660060d4e 100644 --- a/plot.py +++ b/plot.py @@ -1,21 +1,17 @@ -import os import matplotlib as mpl -mpl.use('Agg') # noqa + +mpl.use("Agg") # noqa import matplotlib.pyplot as plt import numpy as np import argparse from ann_benchmarks.datasets import get_dataset -from ann_benchmarks.algorithms.definitions import get_definitions from ann_benchmarks.plotting.metrics import all_metrics as metrics -from ann_benchmarks.plotting.utils import (get_plot_label, compute_metrics, - create_linestyles, create_pointset) -from ann_benchmarks.results import (store_results, load_all_results, - get_unique_algorithms) +from ann_benchmarks.plotting.utils import get_plot_label, compute_metrics, create_linestyles, create_pointset +from ann_benchmarks.results import load_all_results, get_unique_algorithms -def create_plot(all_data, raw, x_scale, y_scale, xn, yn, fn_out, linestyles, - batch): +def create_plot(all_data, raw, x_scale, y_scale, xn, yn, fn_out, linestyles, batch): xm, ym = (metrics[xn], metrics[yn]) # Now generate each plot handles = [] @@ -26,134 +22,119 @@ def create_plot(all_data, raw, x_scale, y_scale, xn, yn, fn_out, linestyles, def mean_y(algo): xs, ys, ls, axs, ays, als = create_pointset(all_data[algo], xn, yn) return -np.log(np.array(ys)).mean() + # Find range for logit x-scale min_x, max_x = 1, 0 for algo in sorted(all_data.keys(), key=mean_y): xs, ys, ls, axs, ays, als = create_pointset(all_data[algo], xn, yn) - min_x = min([min_x]+[x for x in xs if x > 0]) - max_x = max([max_x]+[x for x in xs if x < 1]) + min_x = min([min_x] + [x for x in xs if x > 0]) + max_x = max([max_x] + [x for x in xs if x < 1]) color, faded, linestyle, marker = linestyles[algo] - handle, = plt.plot(xs, ys, '-', label=algo, color=color, - ms=7, mew=3, lw=3, linestyle=linestyle, - marker=marker) + (handle,) = plt.plot( + xs, ys, "-", label=algo, color=color, ms=7, mew=3, lw=3, linestyle=linestyle, marker=marker + ) handles.append(handle) if raw: - handle2, = plt.plot(axs, ays, '-', label=algo, color=faded, - ms=5, mew=2, lw=2, linestyle=linestyle, - marker=marker) + (handle2,) = plt.plot( + axs, ays, "-", label=algo, color=faded, ms=5, mew=2, lw=2, linestyle=linestyle, marker=marker + ) labels.append(algo) ax = plt.gca() - ax.set_ylabel(ym['description']) - ax.set_xlabel(xm['description']) + ax.set_ylabel(ym["description"]) + ax.set_xlabel(xm["description"]) # Custom scales of the type --x-scale a3 - if x_scale[0] == 'a': + if x_scale[0] == "a": alpha = float(x_scale[1:]) - fun = lambda x: 1-(1-x)**(1/alpha) - inv_fun = lambda x: 1-(1-x)**alpha - ax.set_xscale('function', functions=(fun, inv_fun)) + + def fun(x): + return 1 - (1 - x) ** (1 / alpha) + + def inv_fun(x): + return 1 - (1 - x) ** alpha + + ax.set_xscale("function", functions=(fun, inv_fun)) if alpha <= 3: - ticks = [inv_fun(x) for x in np.arange(0,1.2,.2)] + ticks = [inv_fun(x) for x in np.arange(0, 1.2, 0.2)] plt.xticks(ticks) if alpha > 3: from matplotlib import ticker + 
ax.xaxis.set_major_formatter(ticker.LogitFormatter()) - #plt.xticks(ticker.LogitLocator().tick_values(min_x, max_x)) - plt.xticks([0, 1/2, 1-1e-1, 1-1e-2, 1-1e-3, 1-1e-4, 1]) + # plt.xticks(ticker.LogitLocator().tick_values(min_x, max_x)) + plt.xticks([0, 1 / 2, 1 - 1e-1, 1 - 1e-2, 1 - 1e-3, 1 - 1e-4, 1]) # Other x-scales else: ax.set_xscale(x_scale) ax.set_yscale(y_scale) ax.set_title(get_plot_label(xm, ym)) - box = plt.gca().get_position() + plt.gca().get_position() # plt.gca().set_position([box.x0, box.y0, box.width * 0.8, box.height]) - ax.legend(handles, labels, loc='center left', - bbox_to_anchor=(1, 0.5), prop={'size': 9}) - plt.grid(b=True, which='major', color='0.65', linestyle='-') + ax.legend(handles, labels, loc="center left", bbox_to_anchor=(1, 0.5), prop={"size": 9}) + plt.grid(b=True, which="major", color="0.65", linestyle="-") plt.setp(ax.get_xminorticklabels(), visible=True) # Logit scale has to be a subset of (0,1) - if 'lim' in xm and x_scale != 'logit': - x0, x1 = xm['lim'] - plt.xlim(max(x0,0), min(x1,1)) - elif x_scale == 'logit': + if "lim" in xm and x_scale != "logit": + x0, x1 = xm["lim"] + plt.xlim(max(x0, 0), min(x1, 1)) + elif x_scale == "logit": plt.xlim(min_x, max_x) - if 'lim' in ym: - plt.ylim(ym['lim']) + if "lim" in ym: + plt.ylim(ym["lim"]) # Workaround for bug https://github.com/matplotlib/matplotlib/issues/6789 - ax.spines['bottom']._adjust_location() + ax.spines["bottom"]._adjust_location() - plt.savefig(fn_out, bbox_inches='tight') + plt.savefig(fn_out, bbox_inches="tight") plt.close() if __name__ == "__main__": parser = argparse.ArgumentParser() + parser.add_argument("--dataset", metavar="DATASET", default="glove-100-angular") + parser.add_argument("--count", default=10) parser.add_argument( - '--dataset', - metavar="DATASET", - default='glove-100-angular') + "--definitions", metavar="FILE", help="load algorithm definitions from FILE", default="algos.yaml" + ) + parser.add_argument("--limit", default=-1) + parser.add_argument("-o", "--output") parser.add_argument( - '--count', - default=10) + "-x", "--x-axis", help="Which metric to use on the X-axis", choices=metrics.keys(), default="k-nn" + ) parser.add_argument( - '--definitions', - metavar='FILE', - help='load algorithm definitions from FILE', - default='algos.yaml') + "-y", "--y-axis", help="Which metric to use on the Y-axis", choices=metrics.keys(), default="qps" + ) parser.add_argument( - '--limit', - default=-1) + "-X", "--x-scale", help="Scale to use when drawing the X-axis. Typically linear, logit or a2", default="linear" + ) parser.add_argument( - '-o', '--output') - parser.add_argument( - '-x', '--x-axis', - help='Which metric to use on the X-axis', - choices=metrics.keys(), - default="k-nn") - parser.add_argument( - '-y', '--y-axis', - help='Which metric to use on the Y-axis', - choices=metrics.keys(), - default="qps") - parser.add_argument( - '-X', '--x-scale', - help='Scale to use when drawing the X-axis. 
Typically linear, logit or a2', - default='linear') - parser.add_argument( - '-Y', '--y-scale', - help='Scale to use when drawing the Y-axis', + "-Y", + "--y-scale", + help="Scale to use when drawing the Y-axis", choices=["linear", "log", "symlog", "logit"], - default='linear') - parser.add_argument( - '--raw', - help='Show raw results (not just Pareto frontier) in faded colours', - action='store_true') - parser.add_argument( - '--batch', - help='Plot runs in batch mode', - action='store_true') + default="linear", + ) parser.add_argument( - '--recompute', - help='Clears the cache and recomputes the metrics', - action='store_true') + "--raw", help="Show raw results (not just Pareto frontier) in faded colours", action="store_true" + ) + parser.add_argument("--batch", help="Plot runs in batch mode", action="store_true") + parser.add_argument("--recompute", help="Clears the cache and recomputes the metrics", action="store_true") args = parser.parse_args() if not args.output: - args.output = 'results/%s.png' % (args.dataset + ('-batch' if args.batch else '')) - print('writing output to %s' % args.output) + args.output = "results/%s.png" % (args.dataset + ("-batch" if args.batch else "")) + print("writing output to %s" % args.output) dataset, _ = get_dataset(args.dataset) count = int(args.count) unique_algorithms = get_unique_algorithms() results = load_all_results(args.dataset, count, args.batch) linestyles = create_linestyles(sorted(unique_algorithms)) - runs = compute_metrics(np.array(dataset["distances"]), - results, args.x_axis, args.y_axis, args.recompute) + runs = compute_metrics(np.array(dataset["distances"]), results, args.x_axis, args.y_axis, args.recompute) if not runs: - raise Exception('Nothing to plot') + raise Exception("Nothing to plot") - create_plot(runs, args.raw, args.x_scale, - args.y_scale, args.x_axis, args.y_axis, args.output, - linestyles, args.batch) + create_plot( + runs, args.raw, args.x_scale, args.y_scale, args.x_axis, args.y_axis, args.output, linestyles, args.batch + )
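
A hypothetical usage sketch, not part of the patch itself: it assumes the ann_benchmarks package from this repository is importable and that a populated results/ directory already exists (for example after benchmark runs have completed); the dataset name "glove-100-angular" and count 10 are illustrative placeholders. It shows how the result files written by store_results() in ann_benchmarks/results.py can be read back with load_all_results(), which yields the stored attributes together with the open HDF5 file holding the "times", "neighbors" and "distances" datasets.

import numpy as np

from ann_benchmarks.results import get_result_filename, load_all_results

# With no arguments, get_result_filename() returns just the top-level
# "results" directory; with arguments it composes
# results/<dataset>/<count>/<algorithm>[-batch]/<sanitised args>.hdf5.
print(get_result_filename())

# Each yielded pair is (attrs dict, open HDF5 file); the file holds the
# per-query "times", "neighbors" and "distances" datasets written by
# store_results(), and the attrs carry e.g. "algo", "build_time" and
# "best_search_time" as set in runner.py.
for properties, f in load_all_results(dataset="glove-100-angular", count=10):
    times = np.array(f["times"])
    print(
        "%s: build %.1fs, best search %.4fs, mean query %.6fs"
        % (properties["algo"], properties["build_time"], properties["best_search_time"], times.mean())
    )

One caveat visible in results.py above: load_all_results() opens each file in "r+" mode, so the results directory must be writable while iterating.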