Skip to content

Commit 3c35312

Browse files
authored
use qdrant vectors for hybrid search (#2718)
1 parent 95e6f89 commit 3c35312

File tree

13 files changed

+252
-779
lines changed

13 files changed

+252
-779
lines changed

learning_resources_search/api.py

Lines changed: 30 additions & 48 deletions
Original file line number | Diff line number | Diff line change
@@ -13,14 +13,13 @@
1313
from learning_resources.models import LearningResource
1414
from learning_resources_search.connection import (
1515
get_default_alias_name,
16-
get_vector_model_id,
1716
)
1817
from learning_resources_search.constants import (
19-
COMBINED_INDEX,
2018
CONTENT_FILE_TYPE,
2119
COURSE_QUERY_FIELDS,
2220
COURSE_TYPE,
2321
DEPARTMENT_QUERY_FIELDS,
22+
HYBRID_COMBINED_INDEX,
2423
HYBRID_SEARCH_MODE,
2524
LEARNING_RESOURCE,
2625
LEARNING_RESOURCE_QUERY_FIELDS,
@@ -55,6 +54,23 @@
5554
"-created_on",
5655
]
5756

57+
HYBRID_SEARCH_KNN_K_VALUE = 5
58+
HYBRID_SEARCH_PAGINATION_DEPTH = 10
59+
HYBRID_SEARCH_POST_PROCESSOR = {
60+
"description": "Post processor for hybrid search",
61+
"phase_results_processors": [
62+
{
63+
"normalization-processor": {
64+
"normalization": {"technique": "min_max"},
65+
"combination": {
66+
"technique": "arithmetic_mean",
67+
"parameters": {"weights": [0.8, 0.2]},
68+
},
69+
}
70+
}
71+
],
72+
}
73+
5874

5975
def gen_content_file_id(content_file_id):
6076
"""
@@ -86,7 +102,7 @@ def relevant_indexes(resource_types, aggregations, endpoint, use_hybrid_search):
86102
if endpoint == CONTENT_FILE_TYPE:
87103
return [get_default_alias_name(COURSE_TYPE)]
88104
elif use_hybrid_search:
89-
return [get_default_alias_name(COMBINED_INDEX)]
105+
return [get_default_alias_name(HYBRID_COMBINED_INDEX)]
90106

91107
if aggregations and "resource_type" in aggregations:
92108
return map(get_default_alias_name, LEARNING_RESOURCE_TYPES)
@@ -652,41 +668,22 @@ def add_text_query_to_search(
652668
text_query = {"bool": {"must": [text_query], "filter": query_type_query}}
653669

654670
if use_hybrid_search:
655-
vector_model_id = get_vector_model_id()
656-
if not vector_model_id:
657-
log.error("Vector model not found. Cannot perform hybrid search.")
658-
error_message = "Vector model not found."
659-
raise ValueError(error_message)
660-
661-
vector_query_description = {
662-
"neural": {
663-
"description_embedding": {
664-
"query_text": text,
665-
"model_id": vector_model_id,
666-
"min_score": 0.015,
667-
},
668-
}
669-
}
670-
671-
vector_query_title = {
672-
"neural": {
673-
"title_embedding": {
674-
"query_text": text,
675-
"model_id": vector_model_id,
676-
"min_score": 0.015,
677-
},
671+
encoder = dense_encoder()
672+
query_vector = encoder.embed_query(text)
673+
vector_query = {
674+
"knn": {
675+
"vector_embedding": {
676+
"vector": query_vector,
677+
"k": HYBRID_SEARCH_KNN_K_VALUE,
678+
}
678679
}
679680
}
680681

681682
search = search.extra(
682683
query={
683684
"hybrid": {
684-
"pagination_depth": 10,
685-
"queries": [
686-
text_query,
687-
vector_query_description,
688-
vector_query_title,
689-
],
685+
"pagination_depth": HYBRID_SEARCH_PAGINATION_DEPTH,
686+
"queries": [text_query, vector_query],
690687
}
691688
}
692689
)
@@ -803,22 +800,7 @@ def execute_learn_search(search_params):
803800
search = construct_search(search_params)
804801

805802
if search_params.get("search_mode") == HYBRID_SEARCH_MODE:
806-
search = search.extra(
807-
search_pipeline={
808-
"description": "Post processor for hybrid search",
809-
"phase_results_processors": [
810-
{
811-
"normalization-processor": {
812-
"normalization": {"technique": "min_max"},
813-
"combination": {
814-
"technique": "arithmetic_mean",
815-
"parameters": {"weights": [0.6, 0.2, 0.2]},
816-
},
817-
}
818-
}
819-
],
820-
}
821-
)
803+
search = search.extra(search_pipeline=HYBRID_SEARCH_POST_PROCESSOR)
822804

823805
results = search.execute().to_dict()
824806
if results.get("_shards", {}).get("failures"):

learning_resources_search/api_test.py

Lines changed: 9 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -1946,8 +1946,7 @@ def test_execute_learn_search_for_learning_resource_query(opensearch):
19461946
"content",
19471947
"summary",
19481948
"flashcards",
1949-
"description_embedding",
1950-
"title_embedding",
1949+
"vector_embedding",
19511950
]
19521951
},
19531952
}
@@ -2395,8 +2394,7 @@ def test_execute_learn_search_with_script_score(
23952394
"content",
23962395
"summary",
23972396
"flashcards",
2398-
"description_embedding",
2399-
"title_embedding",
2397+
"vector_embedding",
24002398
]
24012399
},
24022400
}
@@ -2417,10 +2415,8 @@ def test_execute_learn_search_with_hybrid_search(mocker, settings, opensearch):
24172415

24182416
settings.DEFAULT_SEARCH_MODE = "best_fields"
24192417

2420-
mocker.patch(
2421-
"learning_resources_search.api.get_vector_model_id",
2422-
return_value="vector_model_id",
2423-
)
2418+
mock_encoder = mocker.patch("learning_resources_search.api.dense_encoder")()
2419+
mock_encoder.embed_query.return_value = [0.1, 0.2, 0.3]
24242420

24252421
search_params = {
24262422
"aggregations": ["offered_by"],
@@ -2727,24 +2723,7 @@ def test_execute_learn_search_with_hybrid_search(mocker, settings, opensearch):
27272723
"filter": {"exists": {"field": "resource_type"}},
27282724
}
27292725
},
2730-
{
2731-
"neural": {
2732-
"description_embedding": {
2733-
"query_text": "math",
2734-
"model_id": "vector_model_id",
2735-
"min_score": 0.015,
2736-
}
2737-
}
2738-
},
2739-
{
2740-
"neural": {
2741-
"title_embedding": {
2742-
"query_text": "math",
2743-
"model_id": "vector_model_id",
2744-
"min_score": 0.015,
2745-
}
2746-
}
2747-
},
2726+
{"knn": {"vector_embedding": {"vector": [0.1, 0.2, 0.3], "k": 5}}},
27482727
],
27492728
}
27502729
},
@@ -2805,7 +2784,7 @@ def test_execute_learn_search_with_hybrid_search(mocker, settings, opensearch):
28052784
"normalization": {"technique": "min_max"},
28062785
"combination": {
28072786
"technique": "arithmetic_mean",
2808-
"parameters": {"weights": [0.6, 0.2, 0.2]},
2787+
"parameters": {"weights": [0.8, 0.2]},
28092788
},
28102789
}
28112790
}
@@ -2824,8 +2803,7 @@ def test_execute_learn_search_with_hybrid_search(mocker, settings, opensearch):
28242803
"content",
28252804
"summary",
28262805
"flashcards",
2827-
"description_embedding",
2828-
"title_embedding",
2806+
"vector_embedding",
28292807
]
28302808
},
28312809
}
@@ -3217,8 +3195,7 @@ def test_execute_learn_search_with_min_score(mocker, settings, opensearch):
32173195
"content",
32183196
"summary",
32193197
"flashcards",
3220-
"description_embedding",
3221-
"title_embedding",
3198+
"vector_embedding",
32223199
]
32233200
},
32243201
}
@@ -3396,8 +3373,7 @@ def test_execute_learn_search_for_content_file_query(opensearch):
33963373
"content",
33973374
"summary",
33983375
"flashcards",
3399-
"description_embedding",
3400-
"title_embedding",
3376+
"vector_embedding",
34013377
]
34023378
},
34033379
}

learning_resources_search/connection.py

Lines changed: 0 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -135,33 +135,3 @@ def refresh_index(index):
135135
"""
136136
conn = get_conn()
137137
conn.indices.refresh(index)
138-
139-
140-
def get_vector_model_id():
141-
"""
142-
Get the model ID for the currently loaded vector model
143-
"""
144-
conn = get_conn()
145-
model_name = settings.OPENSEARCH_VECTOR_MODEL_NAME
146-
body = {"query": {"term": {"name.keyword": model_name}}}
147-
models = conn.transport.perform_request(
148-
"GET", "/_plugins/_ml/models/_search", body=body
149-
)
150-
151-
if len(models.get("hits", {}).get("hits", [])) > 0:
152-
return models["hits"]["hits"][0]["_source"]["model_id"]
153-
154-
return None
155-
156-
157-
def get_vector_model_info():
158-
"""
159-
Get information about the currently loaded vector model
160-
"""
161-
162-
conn = get_conn()
163-
model_id = get_vector_model_id()
164-
if not model_id:
165-
return None
166-
167-
return conn.transport.perform_request("GET", f"/_plugins/_ml/models/{model_id}")

learning_resources_search/constants.py

Lines changed: 4 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -22,8 +22,7 @@
2222
CURRENT_INDEX = "current_index"
2323
REINDEXING_INDEX = "reindexing_index"
2424
BOTH_INDEXES = "all_indexes"
25-
COMBINED_INDEX = "combined_hybrid"
26-
25+
HYBRID_COMBINED_INDEX = "combined_hybrid"
2726
LEARNING_RESOURCE = "learning_resource"
2827
HYBRID_SEARCH_MODE = "hybrid"
2928

@@ -49,7 +48,7 @@ class IndexestoUpdate(Enum):
4948
)
5049

5150

52-
BASE_INDEXES = (PERCOLATE_INDEX_TYPE, COMBINED_INDEX)
51+
BASE_INDEXES = (PERCOLATE_INDEX_TYPE, HYBRID_COMBINED_INDEX)
5352

5453
ALL_INDEX_TYPES = BASE_INDEXES + LEARNING_RESOURCE_TYPES
5554

@@ -323,26 +322,7 @@ class FilterConfig:
323322
"max_weekly_hours": {"type": "integer"},
324323
}
325324

326-
EMBEDDING_FIELDS = {
327-
"title_embedding": {
328-
"type": "knn_vector",
329-
"method": {
330-
"engine": "lucene",
331-
"space_type": "l2",
332-
"name": "hnsw",
333-
"parameters": {},
334-
},
335-
},
336-
"description_embedding": {
337-
"type": "knn_vector",
338-
"method": {
339-
"engine": "lucene",
340-
"space_type": "l2",
341-
"name": "hnsw",
342-
"parameters": {},
343-
},
344-
},
345-
}
325+
EMBEDDING_FIELDS = {"vector_embedding": {"type": "knn_vector"}}
346326

347327

348328
CONTENT_FILE_MAP = {
@@ -471,8 +451,7 @@ class FilterConfig:
471451
"content",
472452
"summary",
473453
"flashcards",
474-
"description_embedding",
475-
"title_embedding",
454+
"vector_embedding",
476455
]
477456

478457
LEARNING_RESOURCE_SEARCH_SORTBY_OPTIONS = {

0 commit comments

Comments
 (0)