From df47b1b7dcf0dd00fabca293cdb545081dd0cefb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Luis=20Santiago?= Date: Sun, 29 Dec 2024 11:55:58 +0100 Subject: [PATCH 01/18] First prototype of hybrid search Include: - Text search index - Hybrid Search configuration - Vector or hybrid search --- .../Postgres/Internals/PostgresDbClient.cs | 64 +++++++++++++++++-- .../Postgres/Postgres/PostgresConfig.cs | 6 ++ .../Postgres/Postgres/PostgresMemory.cs | 4 ++ 3 files changed, 67 insertions(+), 7 deletions(-) diff --git a/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs b/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs index 5ec4ddbe7..835a67fd0 100644 --- a/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs +++ b/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs @@ -59,6 +59,14 @@ public PostgresDbClient(PostgresConfig config, ILoggerFactory? loggerFactory = n this._columnsListNoEmbeddings = $"{this._colId},{this._colTags},{this._colContent},{this._colPayload}"; this._columnsListWithEmbeddings = $"{this._colId},{this._colTags},{this._colContent},{this._colPayload},{this._colEmbedding}"; + this._columnsListHybrid = $"{this._colId},{this._colTags},{this._colContent},{this._colPayload},{this._colEmbedding}"; + this._columnsListHybridCoalesce = $@" + COALESCE(semantic_search.{this._colId}, keyword_search.{this._colId}) AS {this._colId}, + COALESCE(semantic_search.{this._colTags}, keyword_search.{this._colTags}) AS {this._colTags}, + COALESCE(semantic_search.{this._colContent}, keyword_search.{this._colContent}) AS {this._colContent}, + COALESCE(semantic_search.{this._colPayload}, keyword_search.{this._colPayload}) AS {this._colPayload}, + COALESCE(semantic_search.{this._colEmbedding}, keyword_search.{this._colEmbedding}) AS {this._colEmbedding}, + "; this._createTableSql = string.Empty; if (config.CreateTableSql?.Count > 0) @@ -388,6 +396,7 @@ DO UPDATE SET /// Get a list of records /// /// Table containing the records to fetch + /// 
Prompt query. Only used in the case of hybrid search /// Source vector to compare for similarity /// Minimum similarity threshold /// SQL filter to apply @@ -395,9 +404,11 @@ DO UPDATE SET /// Max number of records to retrieve /// Records to skip from the top /// Whether to include embedding vectors + /// Whether to use hybrid search or vector search /// Async task cancellation token public async IAsyncEnumerable<(PostgresMemoryRecord record, double similarity)> GetSimilarAsync( string tableName, + string query, Vector target, double minSimilarity, string? filterSql = null, @@ -405,6 +416,7 @@ DO UPDATE SET int limit = 1, int offset = 0, bool withEmbeddings = false, + bool useHybridSearch = false, [EnumeratorCancellation] CancellationToken cancellationToken = default) { tableName = this.WithSchemaAndTableNamePrefix(tableName); @@ -413,6 +425,8 @@ DO UPDATE SET // Column names string columns = withEmbeddings ? this._columnsListWithEmbeddings : this._columnsListNoEmbeddings; + string columsnHibrid = this._columnsListHybrid; + string columnsListHybridCoalesce = this._columnsListHybridCoalesce; // Filtering logic, including filter by similarity filterSql = filterSql?.Trim().Replace(PostgresSchema.PlaceholdersTags, this._colTags, StringComparison.Ordinal); @@ -440,16 +454,50 @@ DO UPDATE SET #pragma warning disable CA2100 // SQL reviewed string colDistance = "__distance"; - // When using 1 - (embedding <=> target) the index is not being used, therefore we calculate - // the similarity (1 - distance) later. Furthermore, colDistance can't be used in the WHERE clause. - cmd.CommandText = @$" - SELECT {columns}, {this._colEmbedding} <=> @embedding AS {colDistance} - FROM {tableName} - WHERE {filterSql} - ORDER BY {colDistance} ASC + if (useHybridSearch) + { + // When using 1 - (embedding <=> target) the index is not being used, therefore we calculate + // the similarity (1 - distance) later. Furthermore, colDistance can't be used in the WHERE clause. 
+ cmd.CommandText = @$" + WITH semantic_search AS ( + SELECT {columsnHibrid}, RANK () OVER (ORDER BY {this._colEmbedding} <=> @embedding) AS rank + FROM {tableName} + ORDER BY {this._colEmbedding} <=> @embedding + LIMIT @limit + ), + keyword_search AS ( + SELECT {columsnHibrid}, RANK () OVER (ORDER BY ts_rank_cd(to_tsvector('english', {this._colContent}), query) DESC) + FROM {tableName}, plainto_tsquery('english', @query) query + WHERE to_tsvector('english', {this._colContent}) @@ query + ORDER BY ts_rank_cd(to_tsvector('english', {this._colContent}), query) DESC + LIMIT @limit + ) + SELECT + {columnsListHybridCoalesce} + COALESCE(1.0 / (50 + semantic_search.rank), 0.0) + + COALESCE(1.0 / (50 + keyword_search.rank), 0.0) AS {colDistance} + FROM semantic_search + FULL OUTER JOIN keyword_search ON semantic_search.{this._colId} = keyword_search.{this._colId} + ORDER BY {colDistance} DESC LIMIT @limit OFFSET @offset "; + cmd.Parameters.AddWithValue("@query", query); + cmd.Parameters.AddWithValue("@minSimilarity", minSimilarity); + } + else + { + // When using 1 - (embedding <=> target) the index is not being used, therefore we calculate + // the similarity (1 - distance) later. Furthermore, colDistance can't be used in the WHERE clause. 
+ cmd.CommandText = @$" + SELECT {columns}, {this._colEmbedding} <=> @embedding AS {colDistance} + FROM {tableName} + WHERE {filterSql} + ORDER BY {colDistance} ASC + LIMIT @limit + OFFSET @offset + "; + } cmd.Parameters.AddWithValue("@embedding", target); cmd.Parameters.AddWithValue("@maxDistance", maxDistance); @@ -680,6 +728,8 @@ public async ValueTask DisposeAsync() private readonly string _colPayload; private readonly string _columnsListNoEmbeddings; private readonly string _columnsListWithEmbeddings; + private readonly string _columnsListHybrid; + private readonly string _columnsListHybridCoalesce; private readonly bool _dbNamePresent; /// diff --git a/extensions/Postgres/Postgres/PostgresConfig.cs b/extensions/Postgres/Postgres/PostgresConfig.cs index f34be9637..6aa2d76a2 100644 --- a/extensions/Postgres/Postgres/PostgresConfig.cs +++ b/extensions/Postgres/Postgres/PostgresConfig.cs @@ -107,6 +107,12 @@ public class PostgresConfig /// public List CreateTableSql { get; set; } = []; + /// + /// Important: when using hybrid search, relevance scores + /// are very different from when using just vector search. + /// + public bool UseHybridSearch { get; set; } = false; + /// /// Create a new instance of the configuration /// diff --git a/extensions/Postgres/Postgres/PostgresMemory.cs b/extensions/Postgres/Postgres/PostgresMemory.cs index 27a92745c..3ace05503 100644 --- a/extensions/Postgres/Postgres/PostgresMemory.cs +++ b/extensions/Postgres/Postgres/PostgresMemory.cs @@ -27,6 +27,7 @@ public sealed class PostgresMemory : IMemoryDb, IDisposable, IAsyncDisposable private readonly PostgresDbClient _db; private readonly ITextEmbeddingGenerator _embeddingGenerator; private readonly ILogger _log; + private readonly bool _useHybridSearch; /// /// Create a new instance of Postgres KM connector @@ -40,6 +41,7 @@ public PostgresMemory( ILoggerFactory? loggerFactory = null) { this._log = (loggerFactory ?? 
DefaultLogger.Factory).CreateLogger(); + this._useHybridSearch = config.UseHybridSearch; this._embeddingGenerator = embeddingGenerator; if (this._embeddingGenerator == null) @@ -159,12 +161,14 @@ await this._db.UpsertAsync( var records = this._db.GetSimilarAsync( index, + query: text, target: new Vector(textEmbedding.Data), minSimilarity: minRelevance, filterSql: sql, sqlUserValues: unsafeSqlUserValues, limit: limit, withEmbeddings: withEmbeddings, + useHybridSearch: this._useHybridSearch, cancellationToken: cancellationToken).ConfigureAwait(false); await foreach ((PostgresMemoryRecord record, double similarity) result in records) From c5c309999bc8e2e3ddc66a2c7fbebe4f414ad3df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Luis=20Santiago?= Date: Sun, 29 Dec 2024 12:04:11 +0100 Subject: [PATCH 02/18] Solve some typos --- extensions/Postgres/Postgres/Internals/PostgresDbClient.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs b/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs index 835a67fd0..bbeb90386 100644 --- a/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs +++ b/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs @@ -425,7 +425,7 @@ DO UPDATE SET // Column names string columns = withEmbeddings ? this._columnsListWithEmbeddings : this._columnsListNoEmbeddings; - string columsnHibrid = this._columnsListHybrid; + string columnsHibrid = this._columnsListHybrid; string columnsListHybridCoalesce = this._columnsListHybridCoalesce; // Filtering logic, including filter by similarity @@ -460,7 +460,7 @@ DO UPDATE SET // the similarity (1 - distance) later. Furthermore, colDistance can't be used in the WHERE clause. 
cmd.CommandText = @$" WITH semantic_search AS ( - SELECT {columsnHibrid}, RANK () OVER (ORDER BY {this._colEmbedding} <=> @embedding) AS rank + SELECT {columnsHibrid}, RANK () OVER (ORDER BY {this._colEmbedding} <=> @embedding) AS rank FROM {tableName} ORDER BY {this._colEmbedding} <=> @embedding LIMIT @limit From f0a0dddb3ed80ade2a464bb530ec32969fffa4bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Luis=20Santiago?= Date: Sun, 29 Dec 2024 12:58:58 +0100 Subject: [PATCH 03/18] More typo errors --- extensions/Postgres/Postgres/Internals/PostgresDbClient.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs b/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs index bbeb90386..aa6242cfe 100644 --- a/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs +++ b/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs @@ -466,7 +466,7 @@ WITH semantic_search AS ( LIMIT @limit ), keyword_search AS ( - SELECT {columsnHibrid}, RANK () OVER (ORDER BY ts_rank_cd(to_tsvector('english', {this._colContent}), query) DESC) + SELECT {columnsHibrid}, RANK () OVER (ORDER BY ts_rank_cd(to_tsvector('english', {this._colContent}), query) DESC) FROM {tableName}, plainto_tsquery('english', @query) query WHERE to_tsvector('english', {this._colContent}) @@ query ORDER BY ts_rank_cd(to_tsvector('english', {this._colContent}), query) DESC From a51cb41933da22c5ef386c26d8d9193204fc18b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Luis=20Santiago?= Date: Sun, 29 Dec 2024 22:09:32 +0100 Subject: [PATCH 04/18] Solve some issues - Index name should be different per table. 
- Add missing filter --- extensions/Postgres/Postgres/Internals/PostgresDbClient.cs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs b/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs index aa6242cfe..3c4527a16 100644 --- a/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs +++ b/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs @@ -183,7 +183,8 @@ public async Task CreateTableAsync( {this._colContent} TEXT DEFAULT '' NOT NULL, {this._colPayload} JSONB DEFAULT '{{}}'::JSONB NOT NULL ); - CREATE INDEX IF NOT EXISTS idx_tags ON {tableName} USING GIN({this._colTags}); + CREATE INDEX IF NOT EXISTS {tableName}_idx_tags ON {tableName} USING GIN({this._colTags}); + CREATE INDEX IF NOT EXISTS {tableName}_idx_content ON {tableName} USING GIN(to_tsvector('english',{this._colContent})); COMMIT; "; #pragma warning restore CA2100 @@ -462,13 +463,14 @@ DO UPDATE SET WITH semantic_search AS ( SELECT {columnsHibrid}, RANK () OVER (ORDER BY {this._colEmbedding} <=> @embedding) AS rank FROM {tableName} + WHERE {filterSql} ORDER BY {this._colEmbedding} <=> @embedding LIMIT @limit ), keyword_search AS ( SELECT {columnsHibrid}, RANK () OVER (ORDER BY ts_rank_cd(to_tsvector('english', {this._colContent}), query) DESC) FROM {tableName}, plainto_tsquery('english', @query) query - WHERE to_tsvector('english', {this._colContent}) @@ query + WHERE to_tsvector('english', {this._colContent}) @@ query and {filterSql} ORDER BY ts_rank_cd(to_tsvector('english', {this._colContent}), query) DESC LIMIT @limit ) From 100b1b84dc9f0eb49ccca3610b958390d73374ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Luis=20Santiago?= Date: Mon, 30 Dec 2024 10:30:19 +0100 Subject: [PATCH 05/18] Change filter when text search --- .../Postgres/Postgres/Internals/PostgresDbClient.cs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git 
a/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs b/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs index 3c4527a16..98d4cf68c 100644 --- a/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs +++ b/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs @@ -436,6 +436,8 @@ DO UPDATE SET filterSql = "TRUE"; } + string filterSqlHybridText = filterSql; + var maxDistance = 1 - minSimilarity; filterSql += $" AND {this._colEmbedding} <=> @embedding < @maxDistance"; @@ -470,14 +472,14 @@ LIMIT @limit keyword_search AS ( SELECT {columnsHibrid}, RANK () OVER (ORDER BY ts_rank_cd(to_tsvector('english', {this._colContent}), query) DESC) FROM {tableName}, plainto_tsquery('english', @query) query - WHERE to_tsvector('english', {this._colContent}) @@ query and {filterSql} + WHERE {filterSqlHybridText} AND to_tsvector('english', {this._colContent}) @@ query ORDER BY ts_rank_cd(to_tsvector('english', {this._colContent}), query) DESC LIMIT @limit ) SELECT {columnsListHybridCoalesce} - COALESCE(1.0 / (50 + semantic_search.rank), 0.0) + - COALESCE(1.0 / (50 + keyword_search.rank), 0.0) AS {colDistance} + COALESCE(1.0 / (60 + semantic_search.rank), 0.0) + + COALESCE(1.0 / (60 + keyword_search.rank), 0.0) AS {colDistance} FROM semantic_search FULL OUTER JOIN keyword_search ON semantic_search.{this._colId} = keyword_search.{this._colId} ORDER BY {colDistance} DESC From a81ece708b1f6b9f3d97997e7338eb80b658d814 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Luis=20Santiago?= Date: Mon, 30 Dec 2024 11:22:36 +0100 Subject: [PATCH 06/18] Fix Formatting --- extensions/Postgres/Postgres/Internals/PostgresDbClient.cs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs b/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs index 98d4cf68c..bc82622d0 100644 --- a/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs +++ 
b/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs @@ -435,9 +435,7 @@ DO UPDATE SET { filterSql = "TRUE"; } - - string filterSqlHybridText = filterSql; - + string filterSqlHybridText = filterSql; var maxDistance = 1 - minSimilarity; filterSql += $" AND {this._colEmbedding} <=> @embedding < @maxDistance"; From a32eb20ef348691bb1941dffe6363c46b3a541d0 Mon Sep 17 00:00:00 2001 From: Devis Lucato Date: Wed, 8 Jan 2025 19:03:54 +0100 Subject: [PATCH 07/18] Code style and docs --- extensions/Postgres/Postgres/Internals/PostgresDbClient.cs | 1 + extensions/Postgres/Postgres/PostgresMemory.cs | 1 + service/Service/appsettings.json | 5 ++++- 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs b/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs index bc82622d0..2483fc1e4 100644 --- a/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs +++ b/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs @@ -435,6 +435,7 @@ DO UPDATE SET { filterSql = "TRUE"; } + string filterSqlHybridText = filterSql; var maxDistance = 1 - minSimilarity; filterSql += $" AND {this._colEmbedding} <=> @embedding < @maxDistance"; diff --git a/extensions/Postgres/Postgres/PostgresMemory.cs b/extensions/Postgres/Postgres/PostgresMemory.cs index 3ace05503..44c7f9ae9 100644 --- a/extensions/Postgres/Postgres/PostgresMemory.cs +++ b/extensions/Postgres/Postgres/PostgresMemory.cs @@ -27,6 +27,7 @@ public sealed class PostgresMemory : IMemoryDb, IDisposable, IAsyncDisposable private readonly PostgresDbClient _db; private readonly ITextEmbeddingGenerator _embeddingGenerator; private readonly ILogger _log; + private readonly bool _useHybridSearch; /// diff --git a/service/Service/appsettings.json b/service/Service/appsettings.json index aa1970bef..f97a040f2 100644 --- a/service/Service/appsettings.json +++ b/service/Service/appsettings.json @@ -624,7 +624,10 @@ "ConnectionString": 
"Host=localhost;Port=5432;Username=public;Password=;Database=public", // Mandatory prefix to add to the name of table managed by KM, // e.g. to exclude other tables in the same schema. - "TableNamePrefix": "km-" + "TableNamePrefix": "km-", + // Hybrid search is not enabled by default. Note that when using hybrid search + // relevance scores are different, usually lower, than when using just vector search + "UseHybridSearch": false, }, "Qdrant": { // Qdrant endpoint From d9ff2350bff2db91df003ec339285faca13b17cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Luis=20Santiago?= Date: Wed, 22 Jan 2025 00:18:09 +0100 Subject: [PATCH 08/18] Solve a problem with the index naming --- .../Postgres.TestApplication/Program.cs | 40 ++++++++----------- .../Postgres/Internals/PostgresDbClient.cs | 10 +++-- 2 files changed, 24 insertions(+), 26 deletions(-) diff --git a/extensions/Postgres/Postgres.TestApplication/Program.cs b/extensions/Postgres/Postgres.TestApplication/Program.cs index 915dd409b..37fb7b5ec 100644 --- a/extensions/Postgres/Postgres.TestApplication/Program.cs +++ b/extensions/Postgres/Postgres.TestApplication/Program.cs @@ -1,6 +1,7 @@ // Copyright (c) Microsoft. All rights reserved. 
using Microsoft.KernelMemory; +using Microsoft.KernelMemory.AI.Ollama; using Microsoft.KernelMemory.DocumentStorage.DevTools; using Microsoft.KernelMemory.FileSystem.DevTools; @@ -26,16 +27,13 @@ private static async Task Test1() var postgresConfig = cfg.GetSection("KernelMemory:Services:Postgres").Get(); ArgumentNullExceptionEx.ThrowIfNull(postgresConfig, nameof(postgresConfig), "Postgres config not found"); - var azureOpenAIEmbeddingConfig = cfg.GetSection("KernelMemory:Services:AzureOpenAIEmbedding").Get(); - ArgumentNullExceptionEx.ThrowIfNull(azureOpenAIEmbeddingConfig, nameof(azureOpenAIEmbeddingConfig), "AzureOpenAIEmbedding config not found"); - - var azureOpenAITextConfig = cfg.GetSection("KernelMemory:Services:AzureOpenAIText").Get(); - ArgumentNullExceptionEx.ThrowIfNull(azureOpenAITextConfig, nameof(azureOpenAITextConfig), "AzureOpenAIText config not found"); + var ollamaConfig = cfg.GetSection("KernelMemory:Services:Ollama").Get(); + ArgumentNullExceptionEx.ThrowIfNull(ollamaConfig, nameof(ollamaConfig), "Ollama config not found"); // Concatenate our 'WithPostgresMemoryDb()' after 'WithOpenAIDefaults()' from the core nuget var mem1 = new KernelMemoryBuilder() - .WithAzureOpenAITextGeneration(azureOpenAITextConfig) - .WithAzureOpenAITextEmbeddingGeneration(azureOpenAIEmbeddingConfig) + .WithOllamaTextEmbeddingGeneration(ollamaConfig) + .WithOllamaTextGeneration(ollamaConfig) .WithPostgresMemoryDb(postgresConfig) .WithSimpleFileStorage(SimpleFileStorageConfig.Persistent) .Build(); @@ -44,16 +42,16 @@ private static async Task Test1() var mem2 = new KernelMemoryBuilder() .WithPostgresMemoryDb(postgresConfig) .WithSimpleFileStorage(SimpleFileStorageConfig.Persistent) - .WithAzureOpenAITextGeneration(azureOpenAITextConfig) - .WithAzureOpenAITextEmbeddingGeneration(azureOpenAIEmbeddingConfig) + .WithOllamaTextEmbeddingGeneration(ollamaConfig) + .WithOllamaTextGeneration(ollamaConfig) .Build(); // Concatenate our 'WithPostgresMemoryDb()' before and after KM 
builder extension methods from the core nuget var mem3 = new KernelMemoryBuilder() .WithSimpleFileStorage(SimpleFileStorageConfig.Persistent) - .WithAzureOpenAITextGeneration(azureOpenAITextConfig) + .WithOllamaTextEmbeddingGeneration(ollamaConfig) + .WithOllamaTextGeneration(ollamaConfig) .WithPostgresMemoryDb(postgresConfig) - .WithAzureOpenAITextEmbeddingGeneration(azureOpenAIEmbeddingConfig) .Build(); await mem1.DeleteIndexAsync("index1"); @@ -92,8 +90,7 @@ private static async Task Test1() private static async Task Test2() { var postgresConfig = new PostgresConfig(); - var azureOpenAIEmbeddingConfig = new AzureOpenAIConfig(); - var azureOpenAITextConfig = new AzureOpenAIConfig(); + var ollamaConfig = new OllamaConfig(); new ConfigurationBuilder() .AddJsonFile("appsettings.json") @@ -101,13 +98,12 @@ private static async Task Test2() .AddJsonFile("appsettings.Development.json", optional: true) .Build() .BindSection("KernelMemory:Services:Postgres", postgresConfig) - .BindSection("KernelMemory:Services:AzureOpenAIEmbedding", azureOpenAIEmbeddingConfig) - .BindSection("KernelMemory:Services:AzureOpenAIText", azureOpenAITextConfig); + .BindSection("KernelMemory:Services:Ollama", ollamaConfig); var memory = new KernelMemoryBuilder() .WithPostgresMemoryDb(postgresConfig) - .WithAzureOpenAITextGeneration(azureOpenAITextConfig) - .WithAzureOpenAITextEmbeddingGeneration(azureOpenAIEmbeddingConfig) + .WithOllamaTextGeneration(ollamaConfig) + .WithOllamaTextEmbeddingGeneration(ollamaConfig) .WithSimpleFileStorage(new SimpleFileStorageConfig { StorageType = FileSystemTypes.Disk, @@ -140,8 +136,7 @@ private static async Task Test2() private static async Task Test3() { var postgresConfig = new PostgresConfig(); - var azureOpenAIEmbeddingConfig = new AzureOpenAIConfig(); - var azureOpenAITextConfig = new AzureOpenAIConfig(); + var ollamaConfig = new OllamaConfig(); // Note: using appsettings.custom-sql.json new ConfigurationBuilder() @@ -151,13 +146,12 @@ private static 
async Task Test3() .AddJsonFile("appsettings.custom-sql.json") .Build() .BindSection("KernelMemory:Services:Postgres", postgresConfig) - .BindSection("KernelMemory:Services:AzureOpenAIEmbedding", azureOpenAIEmbeddingConfig) - .BindSection("KernelMemory:Services:AzureOpenAIText", azureOpenAITextConfig); + .BindSection("KernelMemory:Services:Ollama", ollamaConfig); var memory = new KernelMemoryBuilder() .WithPostgresMemoryDb(postgresConfig) - .WithAzureOpenAITextGeneration(azureOpenAITextConfig) - .WithAzureOpenAITextEmbeddingGeneration(azureOpenAIEmbeddingConfig) + .WithOllamaTextGeneration(ollamaConfig) + .WithOllamaTextEmbeddingGeneration(ollamaConfig) .WithSimpleFileStorage(new SimpleFileStorageConfig { StorageType = FileSystemTypes.Disk, diff --git a/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs b/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs index 1484d6d49..a98393e61 100644 --- a/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs +++ b/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs @@ -146,6 +146,8 @@ public async Task CreateTableAsync( CancellationToken cancellationToken = default) { var origInputTableName = tableName; + var indexTags = this.WithTableNamePrefix(tableName) + "_idx_tags"; + var indexContent = this.WithTableNamePrefix(tableName) + "_idx_content"; tableName = this.WithSchemaAndTableNamePrefix(tableName); this._log.LogTrace("Creating table: {0}", tableName); @@ -183,8 +185,8 @@ public async Task CreateTableAsync( {this._colContent} TEXT DEFAULT '' NOT NULL, {this._colPayload} JSONB DEFAULT '{{}}'::JSONB NOT NULL ); - CREATE INDEX IF NOT EXISTS {tableName}_idx_tags ON {tableName} USING GIN({this._colTags}); - CREATE INDEX IF NOT EXISTS {tableName}_idx_content ON {tableName} USING GIN(to_tsvector('english',{this._colContent})); + CREATE INDEX IF NOT EXISTS ""{indexTags}"" ON {tableName} USING GIN({this._colTags}); + CREATE INDEX IF NOT EXISTS ""{indexContent}"" ON {tableName} USING 
GIN(to_tsvector('english',{this._colContent})); COMMIT; "; #pragma warning restore CA2100 @@ -430,13 +432,15 @@ DO UPDATE SET string columnsListHybridCoalesce = this._columnsListHybridCoalesce; // Filtering logic, including filter by similarity + // filterSql = filterSql?.Trim().Replace(PostgresSchema.PlaceholdersTags, this._colTags, StringComparison.Ordinal); if (string.IsNullOrWhiteSpace(filterSql)) { filterSql = "TRUE"; } - string filterSqlHybridText = filterSql; + string filterSqlHybridText = filterSql; + var maxDistance = 1 - minSimilarity; filterSql += $" AND {this._colEmbedding} <=> @embedding < @maxDistance"; From 8abde3c0f86afaf1c9ab9e92cb474bfb9252e353 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Luis=20Santiago?= Date: Wed, 29 Jan 2025 23:42:23 +0100 Subject: [PATCH 09/18] Parametrization Add parametrization to text search language dictionary and parametrization of the Reciprocal Ranked Fusion "k-nearest neighbor" to score results of Hybrid Search --- .../Postgres/Internals/PostgresDbClient.cs | 20 +++++++++++-------- .../Postgres/Postgres/PostgresConfig.cs | 16 +++++++++++++-- 2 files changed, 26 insertions(+), 10 deletions(-) diff --git a/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs b/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs index a98393e61..8621f5265 100644 --- a/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs +++ b/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs @@ -42,6 +42,8 @@ public PostgresDbClient(PostgresConfig config, ILoggerFactory? 
loggerFactory = n this._dbNamePresent = config.ConnectionString.Contains("Database=", StringComparison.OrdinalIgnoreCase); this._schema = config.Schema; this._tableNamePrefix = config.TableNamePrefix; + this._textSearchLanguage = config.TextSearchLanguage; + this._rrf_K = config.RRF_K; this._colId = config.Columns[PostgresConfig.ColumnId]; this._colEmbedding = config.Columns[PostgresConfig.ColumnEmbedding]; @@ -168,7 +170,7 @@ public async Task CreateTableAsync( { cmd.CommandText = this._createTableSql .Replace(PostgresConfig.SqlPlaceholdersTableName, tableName, StringComparison.Ordinal) - .Replace(PostgresConfig.SqlPlaceholdersVectorSize, $"{vectorSize}", StringComparison.Ordinal) + .Replace(PostgresConfig.SqlPlaceholdersVectorSize, $"{vectorSize}", StringComparison.Ordinal) .Replace(PostgresConfig.SqlPlaceholdersLockId, $"{lockId}", StringComparison.Ordinal); this._log.LogTrace("Creating table with custom SQL: {0}", cmd.CommandText); @@ -186,11 +188,11 @@ public async Task CreateTableAsync( {this._colPayload} JSONB DEFAULT '{{}}'::JSONB NOT NULL ); CREATE INDEX IF NOT EXISTS ""{indexTags}"" ON {tableName} USING GIN({this._colTags}); - CREATE INDEX IF NOT EXISTS ""{indexContent}"" ON {tableName} USING GIN(to_tsvector('english',{this._colContent})); + CREATE INDEX IF NOT EXISTS ""{indexContent}"" ON {tableName} USING GIN(to_tsvector('{this._textSearchLanguage}',{this._colContent})); COMMIT; "; #pragma warning restore CA2100 - + this._log.LogTrace("Creating table with default SQL: {0}", cmd.CommandText); } @@ -428,7 +430,7 @@ DO UPDATE SET // Column names string columns = withEmbeddings ? this._columnsListWithEmbeddings : this._columnsListNoEmbeddings; - string columnsHibrid = this._columnsListHybrid; + string columnsHybrid = this._columnsListHybrid; string columnsListHybridCoalesce = this._columnsListHybridCoalesce; // Filtering logic, including filter by similarity @@ -466,14 +468,14 @@ DO UPDATE SET // the similarity (1 - distance) later. 
Furthermore, colDistance can't be used in the WHERE clause. cmd.CommandText = @$" WITH semantic_search AS ( - SELECT {columnsHibrid}, RANK () OVER (ORDER BY {this._colEmbedding} <=> @embedding) AS rank + SELECT {columnsHybrid}, RANK () OVER (ORDER BY {this._colEmbedding} <=> @embedding) AS rank FROM {tableName} WHERE {filterSql} ORDER BY {this._colEmbedding} <=> @embedding LIMIT @limit ), keyword_search AS ( - SELECT {columnsHibrid}, RANK () OVER (ORDER BY ts_rank_cd(to_tsvector('english', {this._colContent}), query) DESC) + SELECT {columnsHybrid}, RANK () OVER (ORDER BY ts_rank_cd(to_tsvector('english', {this._colContent}), query) DESC) FROM {tableName}, plainto_tsquery('english', @query) query WHERE {filterSqlHybridText} AND to_tsvector('english', {this._colContent}) @@ query ORDER BY ts_rank_cd(to_tsvector('english', {this._colContent}), query) DESC @@ -481,8 +483,8 @@ LIMIT @limit ) SELECT {columnsListHybridCoalesce} - COALESCE(1.0 / (60 + semantic_search.rank), 0.0) + - COALESCE(1.0 / (60 + keyword_search.rank), 0.0) AS {colDistance} + COALESCE(1.0 / ({this._rrf_K} + semantic_search.rank), 0.0) + + COALESCE(1.0 / ({this._rrf_K} + keyword_search.rank), 0.0) AS {colDistance} FROM semantic_search FULL OUTER JOIN keyword_search ON semantic_search.{this._colId} = keyword_search.{this._colId} ORDER BY {colDistance} DESC @@ -750,6 +752,8 @@ public async ValueTask DisposeAsync() private readonly string _columnsListHybrid; private readonly string _columnsListHybridCoalesce; private readonly bool _dbNamePresent; + private readonly string _textSearchLanguage; + private readonly int _rrf_K; /// /// Try to connect to PG, handling exceptions in case the DB doesn't exist diff --git a/extensions/Postgres/Postgres/PostgresConfig.cs b/extensions/Postgres/Postgres/PostgresConfig.cs index 6aa2d76a2..187999e78 100644 --- a/extensions/Postgres/Postgres/PostgresConfig.cs +++ b/extensions/Postgres/Postgres/PostgresConfig.cs @@ -113,10 +113,22 @@ public class PostgresConfig /// public 
bool UseHybridSearch { get; set; } = false; + /// - /// Create a new instance of the configuration + /// Defines the dictionary language the make the textual part of the hybrid Search in postgresql + /// see: /// - public PostgresConfig() + public string TextSearchLanguage { get; set; } = "english"; + + /// + /// Reciprocal Ranked Fusion "k-nearest neighbor" to score results of Hybrid Search + /// + public int RRF_K { get; set; } = 50; + + /// + /// Create a new instance of the configuration + /// + public PostgresConfig() { this.Columns = new Dictionary { From 1a70fb95501447c048206ce4ae01929b330eb3e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Luis=20Santiago?= Date: Wed, 29 Jan 2025 23:54:32 +0100 Subject: [PATCH 10/18] Eliminate unnecesary copy --- .../Postgres/Postgres/Internals/PostgresDbClient.cs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs b/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs index 8621f5265..31703158f 100644 --- a/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs +++ b/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs @@ -430,8 +430,7 @@ DO UPDATE SET // Column names string columns = withEmbeddings ? this._columnsListWithEmbeddings : this._columnsListNoEmbeddings; - string columnsHybrid = this._columnsListHybrid; - string columnsListHybridCoalesce = this._columnsListHybridCoalesce; + // Filtering logic, including filter by similarity // @@ -468,21 +467,21 @@ DO UPDATE SET // the similarity (1 - distance) later. Furthermore, colDistance can't be used in the WHERE clause. 
cmd.CommandText = @$" WITH semantic_search AS ( - SELECT {columnsHybrid}, RANK () OVER (ORDER BY {this._colEmbedding} <=> @embedding) AS rank + SELECT {this._columnsListHybrid}, RANK () OVER (ORDER BY {this._colEmbedding} <=> @embedding) AS rank FROM {tableName} WHERE {filterSql} ORDER BY {this._colEmbedding} <=> @embedding LIMIT @limit ), keyword_search AS ( - SELECT {columnsHybrid}, RANK () OVER (ORDER BY ts_rank_cd(to_tsvector('english', {this._colContent}), query) DESC) + SELECT {this._columnsListHybrid}, RANK () OVER (ORDER BY ts_rank_cd(to_tsvector('english', {this._colContent}), query) DESC) FROM {tableName}, plainto_tsquery('english', @query) query WHERE {filterSqlHybridText} AND to_tsvector('english', {this._colContent}) @@ query ORDER BY ts_rank_cd(to_tsvector('english', {this._colContent}), query) DESC LIMIT @limit ) SELECT - {columnsListHybridCoalesce} + {this._columnsListHybridCoalesce} COALESCE(1.0 / ({this._rrf_K} + semantic_search.rank), 0.0) + COALESCE(1.0 / ({this._rrf_K} + keyword_search.rank), 0.0) AS {colDistance} FROM semantic_search From 0d1410e34b9e68bd379d1b67bec6a1a4c0fca700 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Luis=20Santiago?= Date: Thu, 30 Jan 2025 00:06:41 +0100 Subject: [PATCH 11/18] remove the trailing comma --- extensions/Postgres/Postgres/Internals/PostgresDbClient.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs b/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs index 31703158f..6025a4aaa 100644 --- a/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs +++ b/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs @@ -67,7 +67,7 @@ public PostgresDbClient(PostgresConfig config, ILoggerFactory? 
loggerFactory = n COALESCE(semantic_search.{this._colTags}, keyword_search.{this._colTags}) AS {this._colTags}, COALESCE(semantic_search.{this._colContent}, keyword_search.{this._colContent}) AS {this._colContent}, COALESCE(semantic_search.{this._colPayload}, keyword_search.{this._colPayload}) AS {this._colPayload}, - COALESCE(semantic_search.{this._colEmbedding}, keyword_search.{this._colEmbedding}) AS {this._colEmbedding}, + COALESCE(semantic_search.{this._colEmbedding}, keyword_search.{this._colEmbedding}) AS {this._colEmbedding} "; this._createTableSql = string.Empty; @@ -481,7 +481,7 @@ ORDER BY ts_rank_cd(to_tsvector('english', {this._colContent}), query) DESC LIMIT @limit ) SELECT - {this._columnsListHybridCoalesce} + {this._columnsListHybridCoalesce}, COALESCE(1.0 / ({this._rrf_K} + semantic_search.rank), 0.0) + COALESCE(1.0 / ({this._rrf_K} + keyword_search.rank), 0.0) AS {colDistance} FROM semantic_search From 1d2064eebdf1caa9f8119fe6c412934f2e4233b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Luis=20Santiago?= Date: Thu, 30 Jan 2025 21:37:21 +0100 Subject: [PATCH 12/18] Change readme to document Hybrid Search --- .../Postgres/Postgres/PostgresConfig.cs | 2 +- extensions/Postgres/README.md | 29 ++++++++++++++++--- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/extensions/Postgres/Postgres/PostgresConfig.cs b/extensions/Postgres/Postgres/PostgresConfig.cs index 187999e78..2a7595d8c 100644 --- a/extensions/Postgres/Postgres/PostgresConfig.cs +++ b/extensions/Postgres/Postgres/PostgresConfig.cs @@ -121,7 +121,7 @@ public class PostgresConfig public string TextSearchLanguage { get; set; } = "english"; /// - /// Reciprocal Ranked Fusion "k-nearest neighbor" to score results of Hybrid Search + /// Reciprocal Ranked Fusion to score results of Hybrid Search /// public int RRF_K { get; set; } = 50; diff --git a/extensions/Postgres/README.md b/extensions/Postgres/README.md index 44786f6f6..6fa2b4e36 100644 --- 
a/extensions/Postgres/README.md +++ b/extensions/Postgres/README.md @@ -31,7 +31,10 @@ To use Postgres with Kernel Memory: "KernelMemory": { "Services": { "Postgres": { - "ConnectionString": "Host=localhost;Port=5432;Username=myuser;Password=mypassword;Database=mydatabase" + "ConnectionString": "Host=localhost;Port=5432;Username=myuser;Password=mypassword;Database=mydatabase", + "UseHybridSearch": true, + "TextSearchLanguage": "english", + "RRF_K" : 60, } } } @@ -42,16 +45,16 @@ To use Postgres with Kernel Memory: // using Microsoft.KernelMemory; // using Microsoft.KernelMemory.Postgres; // using Microsoft.Extensions.Configuration; - + var postgresConfig = new PostgresConfig(); - + new ConfigurationBuilder() .AddJsonFile("appsettings.json") .AddJsonFile("appsettings.development.json", optional: true) .AddJsonFile("appsettings.Development.json", optional: true) .Build() .BindSection("KernelMemory:Services:Postgres", postgresConfig); - + var memory = new KernelMemoryBuilder() .WithPostgresMemoryDb(postgresConfig) .WithSimpleFileStorage(SimpleFileStorageConfig.Persistent) @@ -103,6 +106,24 @@ types supported by Kernel Memory. Overall we recommend not mixing external tables in the same DB used for Kernel Memory. +## Hybrid Search + +The Postgres memory connector supports Hybrid Search. + +Hybrid Search configuration parameters: + +- **UseHybridSearch**: This parameter enables (true) or disables (false) hybrid search. +- **TextSearchLanguage**: This parameter sets the language used during text search. +- **RRF_K**: This parameter allows to configured [RRF](https://en.wikipedia.org/wiki/Mean_reciprocal_rank) for the hybrid search. A smaller value of `RRF_k` gives more weight to higher ranked items, whereas a larger value of `RRF_k` gives more weight to lower ranked items. For hybrid search, this impacts the final score when combining the two scores from the search. It defaults to 50. The range of value should be 1-100. 
+ +You can find more information on [pgvector github project](https://github.com/pgvector/pgvector) and [in this article](https://jkatz05.com/post/postgres/hybrid-search-postgres-pgvector/) you can read + +an introduction about hybrid search in Postgres, + + + + + ## Column names and table schema The connector uses a default schema with predefined columns and indexes. From ec2b2fd34935a770cce6f58e8f8707471a281f59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Luis=20Santiago?= Date: Mon, 3 Feb 2025 22:37:47 +0100 Subject: [PATCH 13/18] Solve Fix Formating Errors --- .../Postgres/Internals/PostgresDbClient.cs | 10 +++++----- extensions/Postgres/Postgres/PostgresConfig.cs | 18 +++++++++--------- extensions/Postgres/README.md | 2 +- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs b/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs index 6025a4aaa..be6926cc1 100644 --- a/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs +++ b/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs @@ -43,7 +43,7 @@ public PostgresDbClient(PostgresConfig config, ILoggerFactory? 
loggerFactory = n this._schema = config.Schema; this._tableNamePrefix = config.TableNamePrefix; this._textSearchLanguage = config.TextSearchLanguage; - this._rrf_K = config.RRF_K; + this._rrf_K = config.RRFK; this._colId = config.Columns[PostgresConfig.ColumnId]; this._colEmbedding = config.Columns[PostgresConfig.ColumnEmbedding]; @@ -170,7 +170,7 @@ public async Task CreateTableAsync( { cmd.CommandText = this._createTableSql .Replace(PostgresConfig.SqlPlaceholdersTableName, tableName, StringComparison.Ordinal) - .Replace(PostgresConfig.SqlPlaceholdersVectorSize, $"{vectorSize}", StringComparison.Ordinal) + .Replace(PostgresConfig.SqlPlaceholdersVectorSize, $"{vectorSize}", StringComparison.Ordinal) .Replace(PostgresConfig.SqlPlaceholdersLockId, $"{lockId}", StringComparison.Ordinal); this._log.LogTrace("Creating table with custom SQL: {0}", cmd.CommandText); @@ -192,7 +192,7 @@ public async Task CreateTableAsync( COMMIT; "; #pragma warning restore CA2100 - + this._log.LogTrace("Creating table with default SQL: {0}", cmd.CommandText); } @@ -440,8 +440,8 @@ DO UPDATE SET filterSql = "TRUE"; } - string filterSqlHybridText = filterSql; - + string filterSqlHybridText = filterSql; + var maxDistance = 1 - minSimilarity; filterSql += $" AND {this._colEmbedding} <=> @embedding < @maxDistance"; diff --git a/extensions/Postgres/Postgres/PostgresConfig.cs b/extensions/Postgres/Postgres/PostgresConfig.cs index 2a7595d8c..eef0cb899 100644 --- a/extensions/Postgres/Postgres/PostgresConfig.cs +++ b/extensions/Postgres/Postgres/PostgresConfig.cs @@ -120,15 +120,15 @@ public class PostgresConfig /// public string TextSearchLanguage { get; set; } = "english"; - /// - /// Reciprocal Ranked Fusion to score results of Hybrid Search - /// - public int RRF_K { get; set; } = 50; - - /// - /// Create a new instance of the configuration - /// - public PostgresConfig() + /// + /// Reciprocal Ranked Fusion to score results of Hybrid Search + /// + public int RRFK { get; set; } = 50; + + /// + 
/// Create a new instance of the configuration + /// + public PostgresConfig() { this.Columns = new Dictionary { diff --git a/extensions/Postgres/README.md b/extensions/Postgres/README.md index 6fa2b4e36..5defdf3e7 100644 --- a/extensions/Postgres/README.md +++ b/extensions/Postgres/README.md @@ -114,7 +114,7 @@ Hybrid Search configuration parameters: - **UseHybridSearch**: This parameter enables (true) or disables (false) hybrid search. - **TextSearchLanguage**: This parameter sets the language used during text search. -- **RRF_K**: This parameter allows to configured [RRF](https://en.wikipedia.org/wiki/Mean_reciprocal_rank) for the hybrid search. A smaller value of `RRF_k` gives more weight to higher ranked items, whereas a larger value of `RRF_k` gives more weight to lower ranked items. For hybrid search, this impacts the final score when combining the two scores from the search. It defaults to 50. The range of value should be 1-100. +- **RRFK**: This parameter allows to configured [RRF](https://en.wikipedia.org/wiki/Mean_reciprocal_rank) for the hybrid search. A smaller value of `RRF_k` gives more weight to higher ranked items, whereas a larger value of `RRF_k` gives more weight to lower ranked items. For hybrid search, this impacts the final score when combining the two scores from the search. It defaults to 50. The range of value should be 1-100. 
You can find more information on [pgvector github project](https://github.com/pgvector/pgvector) and [in this article](https://jkatz05.com/post/postgres/hybrid-search-postgres-pgvector/) you can read From bb6c729899325bd8278c82003055ed24ab902b3b Mon Sep 17 00:00:00 2001 From: Devis Lucato Date: Mon, 10 Feb 2025 22:51:11 -0800 Subject: [PATCH 14/18] Code style and appsettings.json --- extensions/Postgres/Postgres/PostgresConfig.cs | 3 +-- service/Service/appsettings.json | 4 ++++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/extensions/Postgres/Postgres/PostgresConfig.cs b/extensions/Postgres/Postgres/PostgresConfig.cs index eef0cb899..26a9591db 100644 --- a/extensions/Postgres/Postgres/PostgresConfig.cs +++ b/extensions/Postgres/Postgres/PostgresConfig.cs @@ -113,10 +113,9 @@ public class PostgresConfig /// public bool UseHybridSearch { get; set; } = false; - /// /// Defines the dictionary language the make the textual part of the hybrid Search in postgresql - /// see: + /// see: /// public string TextSearchLanguage { get; set; } = "english"; diff --git a/service/Service/appsettings.json b/service/Service/appsettings.json index a8c99228a..b92d7b6eb 100644 --- a/service/Service/appsettings.json +++ b/service/Service/appsettings.json @@ -625,6 +625,10 @@ // Hybrid search is not enabled by default. 
Note that when using hybrid search // relevance scores are different, usually lower, than when using just vector search "UseHybridSearch": false, + // Defines the dictionary language of hybrid search + "TextSearchLanguage": "english", + // Reciprocal Ranked Fusion to score results of hybrid search + "RRFK": 50, }, "Qdrant": { // Qdrant endpoint From 4687cb080d8aba6b49520d80a7a22b5b16e28c19 Mon Sep 17 00:00:00 2001 From: Devis Lucato Date: Mon, 10 Feb 2025 23:02:15 -0800 Subject: [PATCH 15/18] Fix docs --- .../Postgres/Postgres/PostgresConfig.cs | 2 +- extensions/Postgres/README.md | 19 ++++++++++--------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/extensions/Postgres/Postgres/PostgresConfig.cs b/extensions/Postgres/Postgres/PostgresConfig.cs index 26a9591db..194416aca 100644 --- a/extensions/Postgres/Postgres/PostgresConfig.cs +++ b/extensions/Postgres/Postgres/PostgresConfig.cs @@ -114,7 +114,7 @@ public class PostgresConfig public bool UseHybridSearch { get; set; } = false; /// - /// Defines the dictionary language the make the textual part of the hybrid Search in postgresql + /// Defines the dictionary language used for the textual part of hybrid search. /// see: /// public string TextSearchLanguage { get; set; } = "english"; diff --git a/extensions/Postgres/README.md b/extensions/Postgres/README.md index 5defdf3e7..01fe359c6 100644 --- a/extensions/Postgres/README.md +++ b/extensions/Postgres/README.md @@ -114,15 +114,16 @@ Hybrid Search configuration parameters: - **UseHybridSearch**: This parameter enables (true) or disables (false) hybrid search. - **TextSearchLanguage**: This parameter sets the language used during text search. -- **RRFK**: This parameter allows to configured [RRF](https://en.wikipedia.org/wiki/Mean_reciprocal_rank) for the hybrid search. A smaller value of `RRF_k` gives more weight to higher ranked items, whereas a larger value of `RRF_k` gives more weight to lower ranked items. 
For hybrid search, this impacts the final score when combining the two scores from the search. It defaults to 50. The range of value should be 1-100. - -You can find more information on [pgvector github project](https://github.com/pgvector/pgvector) and [in this article](https://jkatz05.com/post/postgres/hybrid-search-postgres-pgvector/) you can read - -an introduction about hybrid search in Postgres, - - - - +- **RRFK**: This parameter configures [RRF](https://en.wikipedia.org/wiki/Mean_reciprocal_rank) for the hybrid search. + A smaller value of `RRFK` gives more weight to higher ranked items, whereas a + larger value of `RRFK` gives more weight to lower ranked items. For hybrid search, + this impacts the final score when combining the two scores from the search. + It defaults to 50. The value should be in the range 1-100. + +For more details, check out the +[pgvector GitHub project](https://github.com/pgvector/pgvector) and +[this article](https://jkatz05.com/post/postgres/hybrid-search-postgres-pgvector) +on hybrid search in Postgres. ## Column names and table schema From 12a8c23fb0d05eef1f44e04944a23aa6fb252e4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Luis=20Santiago?= Date: Tue, 11 Feb 2025 08:08:09 +0100 Subject: [PATCH 16/18] Missing links to documentation reference --- extensions/Postgres/Postgres/PostgresConfig.cs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/extensions/Postgres/Postgres/PostgresConfig.cs b/extensions/Postgres/Postgres/PostgresConfig.cs index 194416aca..2d4822161 100644 --- a/extensions/Postgres/Postgres/PostgresConfig.cs +++ b/extensions/Postgres/Postgres/PostgresConfig.cs @@ -115,7 +115,8 @@ public class PostgresConfig /// /// Defines the dictionary language used for the textual part of hybrid search. 
- /// see: + /// see: https://www.postgresql.org/docs/current/textsearch-dictionaries.html + /// This query lists the text search configurations that can be used here: SELECT cfgname FROM pg_catalog.pg_ts_config; /// public string TextSearchLanguage { get; set; } = "english"; From 889db7ec2950a4920fae72e4a4df9dc8d87cf38a Mon Sep 17 00:00:00 2001 From: SignalRT Date: Sun, 16 Mar 2025 17:29:29 +0100 Subject: [PATCH 17/18] Update documentation to explain the case where the table exist previously --- extensions/Postgres/README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/extensions/Postgres/README.md b/extensions/Postgres/README.md index 01fe359c6..0a93a394d 100644 --- a/extensions/Postgres/README.md +++ b/extensions/Postgres/README.md @@ -125,6 +125,16 @@ For more details, check out the [this article](https://jkatz05.com/post/postgres/hybrid-search-postgres-pgvector) on hybrid search in Postgres. +The connector creates the text search index automatically on table creation. + +If you enable hybrid search on tables that were created by an older +version of the connector, or you want to change `TextSearchLanguage`, you need +to create the text search index manually, using the table name and column names that +you have configured. + +**SQL to add Text Search Index:** +`CREATE INDEX IF NOT EXISTS {indexName} ON {tableName} USING GIN(to_tsvector('{TextSearchLanguage}', {contentColumn}));` + ## Column names and table schema The connector uses a default schema with predefined columns and indexes. 
From 142936bcb55a75cb3b732db76fe88f6c74063e81 Mon Sep 17 00:00:00 2001 From: SignalRT Date: Sun, 16 Mar 2025 17:58:53 +0100 Subject: [PATCH 18/18] Solve missed language parameter --- .../Postgres/Internals/PostgresDbClient.cs | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs b/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs index be6926cc1..8660c1a5c 100644 --- a/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs +++ b/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs @@ -43,7 +43,7 @@ public PostgresDbClient(PostgresConfig config, ILoggerFactory? loggerFactory = n this._schema = config.Schema; this._tableNamePrefix = config.TableNamePrefix; this._textSearchLanguage = config.TextSearchLanguage; - this._rrf_K = config.RRFK; + this._rrfK = config.RRFK; this._colId = config.Columns[PostgresConfig.ColumnId]; this._colEmbedding = config.Columns[PostgresConfig.ColumnEmbedding]; @@ -431,7 +431,6 @@ DO UPDATE SET // Column names string columns = withEmbeddings ? 
this._columnsListWithEmbeddings : this._columnsListNoEmbeddings; - // Filtering logic, including filter by similarity // filterSql = filterSql?.Trim().Replace(PostgresSchema.PlaceholdersTags, this._colTags, StringComparison.Ordinal); @@ -474,16 +473,16 @@ WITH semantic_search AS ( LIMIT @limit ), keyword_search AS ( - SELECT {this._columnsListHybrid}, RANK () OVER (ORDER BY ts_rank_cd(to_tsvector('english', {this._colContent}), query) DESC) - FROM {tableName}, plainto_tsquery('english', @query) query - WHERE {filterSqlHybridText} AND to_tsvector('english', {this._colContent}) @@ query - ORDER BY ts_rank_cd(to_tsvector('english', {this._colContent}), query) DESC + SELECT {this._columnsListHybrid}, RANK () OVER (ORDER BY ts_rank_cd(to_tsvector('{this._textSearchLanguage}', {this._colContent}), query) DESC) + FROM {tableName}, plainto_tsquery('{this._textSearchLanguage}', @query) query + WHERE {filterSqlHybridText} AND to_tsvector('{this._textSearchLanguage}', {this._colContent}) @@ query + ORDER BY ts_rank_cd(to_tsvector('{this._textSearchLanguage}', {this._colContent}), query) DESC LIMIT @limit ) SELECT {this._columnsListHybridCoalesce}, - COALESCE(1.0 / ({this._rrf_K} + semantic_search.rank), 0.0) + - COALESCE(1.0 / ({this._rrf_K} + keyword_search.rank), 0.0) AS {colDistance} + COALESCE(1.0 / ({this._rrfK} + semantic_search.rank), 0.0) + + COALESCE(1.0 / ({this._rrfK} + keyword_search.rank), 0.0) AS {colDistance} FROM semantic_search FULL OUTER JOIN keyword_search ON semantic_search.{this._colId} = keyword_search.{this._colId} ORDER BY {colDistance} DESC @@ -752,7 +751,7 @@ public async ValueTask DisposeAsync() private readonly string _columnsListHybridCoalesce; private readonly bool _dbNamePresent; private readonly string _textSearchLanguage; - private readonly int _rrf_K; + private readonly int _rrfK; /// /// Try to connect to PG, handling exceptions in case the DB doesn't exist