diff --git a/.github/_typos.toml b/.github/_typos.toml index 000d278e9..e75a4e286 100644 --- a/.github/_typos.toml +++ b/.github/_typos.toml @@ -17,7 +17,8 @@ extend-exclude = [ "appsettings.Development.json", "appsettings.*.json.*", "AzureAISearchFilteringTest.cs", - "KernelMemory.sln.DotSettings" + "KernelMemory.sln.DotSettings", + "doc1.txt", ] [default.extend-words] diff --git a/KernelMemory.sln b/KernelMemory.sln index 796dcf5bc..05098b77b 100644 --- a/KernelMemory.sln +++ b/KernelMemory.sln @@ -720,7 +720,6 @@ Global {FD1EB2C1-581E-4EB8-AF4A-BC4773453226}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {FD1EB2C1-581E-4EB8-AF4A-BC4773453226}.Debug|Any CPU.Build.0 = Debug|Any CPU {FD1EB2C1-581E-4EB8-AF4A-BC4773453226}.Release|Any CPU.ActiveCfg = Release|Any CPU - {FD1EB2C1-581E-4EB8-AF4A-BC4773453226}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/docs/how-to/custom-partitioning.md b/docs/how-to/custom-partitioning.md index 343de9923..3fb98fe20 100644 --- a/docs/how-to/custom-partitioning.md +++ b/docs/how-to/custom-partitioning.md @@ -59,7 +59,6 @@ For example, with small models supporting up to 256 tokens, something like this ... "TextPartitioning": { "MaxTokensPerParagraph": 256, - "MaxTokensPerLine": 256, "OverlappingTokens": 50 }, ... @@ -74,7 +73,6 @@ var memory = new KernelMemoryBuilder() new TextPartitioningOptions { MaxTokensPerParagraph = 256, - MaxTokensPerLine = 256, OverlappingTokens = 50 }) .Build(); diff --git a/examples/102-dotnet-custom-partitioning-options/Program.cs b/examples/102-dotnet-custom-partitioning-options/Program.cs index b278a65c5..a1c94d67b 100644 --- a/examples/102-dotnet-custom-partitioning-options/Program.cs +++ b/examples/102-dotnet-custom-partitioning-options/Program.cs @@ -7,11 +7,9 @@ .WithOpenAIDefaults(Environment.GetEnvironmentVariable("OPENAI_API_KEY")!) .WithCustomTextPartitioningOptions(new TextPartitioningOptions { - // Max 99 tokens per sentence - MaxTokensPerLine = 99, - // When sentences are merged into paragraphs (aka partitions), stop at 299 tokens + // When splitting text into chunks (aka partitions), stop at 299 tokens MaxTokensPerParagraph = 299, - // Each paragraph contains the last 47 tokens from the previous one + // Each chunk contains the last 47 tokens from the previous one OverlappingTokens = 47, }) .Build(); diff --git a/examples/108-dotnet-custom-content-decoders/Program.cs b/examples/108-dotnet-custom-content-decoders/Program.cs index 5898bb4a0..dc98303cb 100644 --- a/examples/108-dotnet-custom-content-decoders/Program.cs +++ b/examples/108-dotnet-custom-content-decoders/Program.cs @@ -91,7 +91,7 @@ public Task DecodeAsync(Stream data, CancellationToken cancellation foreach (Page? page in pdfDocument.GetPages().Where(x => x != null)) { string pageContent = (ContentOrderTextExtractor.GetText(page, options) ?? 
string.Empty).ReplaceLineEndings(" ");
-            result.Sections.Add(new Chunk(page.Number, pageContent, Chunk.Meta(sentencesAreComplete: false)));
+            result.Sections.Add(new Chunk(pageContent, page.Number, Chunk.Meta(sentencesAreComplete: false)));
         }

         return Task.FromResult(result);
diff --git a/examples/207-dotnet-expanding-chunks-on-retrieval/Program.cs b/examples/207-dotnet-expanding-chunks-on-retrieval/Program.cs
index d701cd09a..5004e6170 100644
--- a/examples/207-dotnet-expanding-chunks-on-retrieval/Program.cs
+++ b/examples/207-dotnet-expanding-chunks-on-retrieval/Program.cs
@@ -40,7 +40,7 @@ public static class Program
     public static async Task Main()
     {
         // Partition input text in chunks of 100 tokens
-        const int PartitionSize = 100;
+        const int ChunkSize = 100;

         // Search settings
         const string Query = "astrobiology";
@@ -59,8 +59,7 @@ public static async Task Main()
         // Customize memory records size (in tokens)
         var textPartitioningOptions = new TextPartitioningOptions
         {
-            MaxTokensPerParagraph = PartitionSize,
-            MaxTokensPerLine = PartitionSize,
+            MaxTokensPerParagraph = ChunkSize,
             OverlappingTokens = 0,
         };
diff --git a/examples/210-KM-without-builder/appsettings.json b/examples/210-KM-without-builder/appsettings.json
index 4618489b7..1cd8d878b 100644
--- a/examples/210-KM-without-builder/appsettings.json
+++ b/examples/210-KM-without-builder/appsettings.json
@@ -154,14 +154,11 @@
     "ImageOcrType": "None",
     // Partitioning / Chunking settings
     // How does the partitioning work?
-    // * Given a document, text is extracted, and text is split in sentences, called "lines of text".
-    // * Sentences are merged into paragraphs, called "partitions".
-    // * For each partition, one (potentially more) memory is generated.
+    // * Given a document, text is extracted, and the text is split into tokens.
+    // * Tokens are merged into chunks, called "partitions", sometimes called "paragraphs".
+    // * For each chunk, one memory record (potentially more) is generated.
     "TextPartitioning": {
-      // Maximum length of lines of text (aka sentences), in tokens. Tokens depend on the LLM in use.
-      // Sentences are grouped into paragraphs, see the next setting.
-      "MaxTokensPerLine": 300,
-      // Maximum length of paragraphs (aka partitions), in tokens. Tokens depend on the LLM in use.
+      // Maximum length of chunks, in tokens. Tokens depend on the LLM in use.
       "MaxTokensPerParagraph": 1000,
       // How many tokens from a paragraph to keep in the following paragraph.
       "OverlappingTokens": 100
diff --git a/extensions/Chunkers/Chunkers.UnitTests/Chunkers.UnitTests.csproj b/extensions/Chunkers/Chunkers.UnitTests/Chunkers.UnitTests.csproj
index a1f115445..f72f7ccb4 100644
--- a/extensions/Chunkers/Chunkers.UnitTests/Chunkers.UnitTests.csproj
+++ b/extensions/Chunkers/Chunkers.UnitTests/Chunkers.UnitTests.csproj
@@ -35,6 +35,10 @@
       <CopyToOutputDirectory>Always</CopyToOutputDirectory>
     </None>
+
+    <None Update="doc2.md">
+      <CopyToOutputDirectory>Always</CopyToOutputDirectory>
+    </None>
diff --git a/extensions/Chunkers/Chunkers.UnitTests/MarkDownChunkerManualTest.cs b/extensions/Chunkers/Chunkers.UnitTests/MarkDownChunkerManualTest.cs
new file mode 100644
index 000000000..c184dcbe0
--- /dev/null
+++ b/extensions/Chunkers/Chunkers.UnitTests/MarkDownChunkerManualTest.cs
@@ -0,0 +1,50 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+using System.Diagnostics;
+using Microsoft.KernelMemory.AI;
+using Microsoft.KernelMemory.Chunkers;
+using Microsoft.KM.TestHelpers;
+using Xunit;
+using Xunit.Abstractions;
+
+namespace Microsoft.Chunkers.UnitTests;
+
+public class MarkDownChunkerManualTest(ITestOutputHelper output) : BaseUnitTestCase(output)
+{
+    [Fact]
+    [Trait("Category", "UnitTest")]
+    [Trait("Category", "Chunking")]
+    [Trait("Category", "Manual")]
+    public void ItSplitsMarkdownInASensibleWay()
+    {
+        // Arrange
+        string text = File.ReadAllText("doc2.md");
+        text = $"{text}{text}";
+
+        // Act
+        var w = new Stopwatch();
+        w.Start();
+        var chunks = new MarkDownChunker(new CL100KTokenizer()).Split(text, new MarkDownChunkerOptions { MaxTokensPerChunk = 600, Overlap = 60 });
+        w.Stop();
+
+        Console.WriteLine($"Text length: {text.Length:N0} chars");
+        Console.WriteLine($"Chunks: {chunks.Count}");
+        Console.WriteLine($"Time: {w.ElapsedMilliseconds:N0} ms");
+
+        // Assert
+        Assert.NotEmpty(chunks);
+        DebugChunks(chunks, new CL100KTokenizer());
+    }
+
+    private static void DebugChunks(IEnumerable<string> chunks, ITextTokenizer tokenizer)
+    {
+        var list = chunks.ToList();
+
+        for (int index = 0; index < list.Count; index++)
+        {
+            Console.WriteLine($"************************* {index}: [{tokenizer.CountTokens(list[index])} tokens] *****************************************");
+            Console.WriteLine(list[index]);
+            Console.WriteLine("***********************************************************************************");
+        }
+    }
+}
diff --git a/extensions/Chunkers/Chunkers.UnitTests/MarkDownChunkerTests.cs b/extensions/Chunkers/Chunkers.UnitTests/MarkDownChunkerTests.cs
index 253d7a537..b354f2920 100644
--- a/extensions/Chunkers/Chunkers.UnitTests/MarkDownChunkerTests.cs
+++ b/extensions/Chunkers/Chunkers.UnitTests/MarkDownChunkerTests.cs
@@ -3,7 +3,7 @@
 using Microsoft.Chunkers.UnitTests.Helpers;
 using Microsoft.KernelMemory.AI;
 using Microsoft.KernelMemory.Chunkers;
-using Microsoft.KernelMemory.Chunkers.internals;
+using Microsoft.KernelMemory.DataFormats;
 using Microsoft.KM.TestHelpers;
 using Xunit;
 using Xunit.Abstractions;
@@ -375,7 +375,7 @@ private static void DebugChunks(IEnumerable<string> chunks, ITextTokenizer token
         Console.WriteLine("----------------------------------");
     }

-    private static void DebugFragments(List<Fragment> fragments)
+    private static void DebugFragments(List<Chunk> fragments)
     {
         if (fragments.Count == 0)
         {
@@ -384,8 +384,7 @@ private static void DebugFragments(List<Fragment> fragments)

         for (int index = 0; index < fragments.Count; index++)
         {
-            Fragment token = fragments[index];
-            Console.WriteLine($"- {index}: Value: \"{token.Content}\"");
+            Console.WriteLine($"- {index}: Value: \"{fragments[index].Content}\"");
         }
     }
diff --git a/extensions/Chunkers/Chunkers.UnitTests/PlainTextChunkerPerfTest.cs b/extensions/Chunkers/Chunkers.UnitTests/PlainTextChunkerPerfTest.cs
index bbbc21460..8a2dbbd75 100644
--- a/extensions/Chunkers/Chunkers.UnitTests/PlainTextChunkerPerfTest.cs
+++ b/extensions/Chunkers/Chunkers.UnitTests/PlainTextChunkerPerfTest.cs
@@ -33,6 +33,7 @@ public void CanSplitVeryLargeDocumentsWithoutStackOverflowing()

         // Assert
         Assert.NotEmpty(chunks);
+        // DebugChunks(chunks, new CL100KTokenizer());
     }

     private static void DebugChunks(IEnumerable<string> chunks, ITextTokenizer tokenizer)
diff --git a/extensions/Chunkers/Chunkers.UnitTests/PlainTextChunkerTest.cs b/extensions/Chunkers/Chunkers.UnitTests/PlainTextChunkerTest.cs
index 36702fca4..b9e515e69 100644
--- a/extensions/Chunkers/Chunkers.UnitTests/PlainTextChunkerTest.cs
+++ b/extensions/Chunkers/Chunkers.UnitTests/PlainTextChunkerTest.cs
@@ -4,6 +4,7 @@
 using Microsoft.KernelMemory.AI;
 using Microsoft.KernelMemory.Chunkers;
 using Microsoft.KernelMemory.Chunkers.internals;
+using Microsoft.KernelMemory.DataFormats;
 using Microsoft.KM.TestHelpers;
 using Xunit;
 using Xunit.Abstractions;
@@ -51,7 +52,7 @@ public void ItTokenizesText()
         string text = "Hello, world!";

         // Act
-        List<Fragment> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
+        List<Chunk> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
         DebugFragments(fragments);

         // Assert
@@ -72,7 +73,7 @@ public void ItHandlesConsecutiveSentenceSeparators()
         string text = "Hello. . . world!!!!!!!!!!!!!";

         // Act
-        List<Fragment> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
+        List<Chunk> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
         DebugFragments(fragments);

         // Assert
@@ -97,7 +98,7 @@ public void ItHandlesTailWithoutTermination1()
         string text = "Hello";

         // Act
-        List<Fragment> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
+        List<Chunk> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
         DebugFragments(fragments);

         // Assert
@@ -114,7 +115,7 @@ public void ItHandlesTailWithoutTermination2()
         string text = "Hello!World";

         // Act
-        List<Fragment> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
+        List<Chunk> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
         DebugFragments(fragments);

         // Assert
@@ -908,7 +909,7 @@ private static void DebugChunks(IEnumerable<string> chunks, ITextTokenizer token
         Console.WriteLine("----------------------------------");
     }

-    private static void DebugFragments(List<Fragment> fragments)
+    private static void DebugFragments(List<Chunk> fragments)
     {
         if (fragments.Count == 0)
         {
@@ -917,8 +918,7 @@ private static void DebugFragments(List<Fragment> fragments)

         for (int index = 0; index < fragments.Count; index++)
         {
-            Fragment token = fragments[index];
-            Console.WriteLine($"- {index}: Value: \"{token.Content}\"");
+            Console.WriteLine($"- {index}: Value: \"{fragments[index].Content}\"");
         }
     }
diff --git a/extensions/Chunkers/Chunkers.UnitTests/doc2.md b/extensions/Chunkers/Chunkers.UnitTests/doc2.md
new file mode 100644
index 000000000..e678fb951
--- /dev/null
+++ b/extensions/Chunkers/Chunkers.UnitTests/doc2.md
@@ -0,0 +1,571 @@
+Kernel Memory
+=============
+
+[![License: MIT](https://img.shields.io/github/license/microsoft/kernel-memory)](https://github.com/microsoft/kernel-memory/blob/main/LICENSE)
+[![Discord](https://img.shields.io/discord/1063152441819942922?label=Discord&color=d82679&logo=discord&logoColor=white)](https://aka.ms/KMdiscord)
+[![Docker Image](https://img.shields.io/docker/pulls/kernelmemory/service?label=Docker&color=%230db7ed&logo=docker&logoColor=white)](https://hub.docker.com/r/kernelmemory/service)
+[![NuGet Version](https://img.shields.io/nuget/v/Microsoft.KernelMemory?label=nuget&color=%23512BD4&logo=.net&logoColor=white)](https://www.nuget.org/packages/Microsoft.KernelMemory)
+[![GitHub Release](https://img.shields.io/github/v/release/microsoft/kernel-memory?color=%23dddddd&label=tag&logo=github&logoColor=white)](https://github.com/microsoft/kernel-memory/releases)
+
+This repository presents best practices and a reference implementation for Memory in specific AI
+and LLM application scenarios. Please note that **the code provided serves as a demonstration**
+and is **not an officially supported** Microsoft offering.
+
+**Kernel Memory** (KM) is a **multi-modal [AI Service](service/Service/README.md)** specialized
+in the efficient indexing of datasets through custom continuous data hybrid pipelines, with support
+for **[Retrieval Augmented Generation](https://en.wikipedia.org/wiki/Prompt_engineering#Retrieval-augmented_generation)**
+(RAG), synthetic memory, prompt engineering, and custom semantic memory processing.
+
+KM is available as a **Web Service**, as a **[Docker container](https://hub.docker.com/r/kernelmemory/service)**,
+a **[Plugin](https://learn.microsoft.com/copilot/plugins/overview)** for ChatGPT/Copilot/Semantic
+Kernel, and as a .NET library for embedded applications.
+
+Utilizing advanced embeddings and LLMs, the system enables Natural Language querying for obtaining
+answers from the indexed data, complete with citations and links to the original sources.
+
+Kernel Memory is designed for seamless integration as a Plugin with [Semantic Kernel](https://github.com/microsoft/semantic-kernel),
+Microsoft Copilot and ChatGPT.
+
+![image](docs/img/kernel-memory-lambda-architecture.png)
+
+
+
+
+Kernel Memory Service on Azure
+==============================
+
+Kernel Memory can be deployed in various configurations, including as a **Service** in Azure.
+To learn more about deploying Kernel Memory in Azure, please refer to the
+[Azure deployment guide](https://microsoft.github.io/kernel-memory/azure).
+For detailed instructions on deploying to Azure, you can check the [infrastructure documentation](/infra/README.md).
+
+If you are already familiar with these resources, you can quickly deploy by clicking the following
+button.
+
+[![Deploy to Azure](docs/azure-button.png)](https://aka.ms/KernelMemoryDeploy2Azure)
+
+🔗 See also: [Kernel Memory via Docker](#kernel-memory-docker-image) and [Serverless Kernel Memory with Azure services example](examples/007-dotnet-serverless-azure).
+
+## Running Kernel Memory with Aspire
+
+Kernel Memory can also be run and imported in other projects via .NET Aspire. For example:
+
+```csharp
+var builder = DistributedApplication.CreateBuilder();
+
+builder.AddContainer("kernel-memory", "kernelmemory/service")
+    .WithEnvironment("KernelMemory__TextGeneratorType", "OpenAI")
+    .WithEnvironment("KernelMemory__DataIngestion__EmbeddingGeneratorTypes__0", "OpenAI")
+    .WithEnvironment("KernelMemory__Retrieval__EmbeddingGeneratorType", "OpenAI")
+    .WithEnvironment("KernelMemory__Services__OpenAI__APIKey", "...your OpenAI key...");
+
+builder.Build().Run();
+```
+
+[![Run with .NET Aspire](docs/aspire-button.png)](examples/303-dotnet-aspire/Program.cs)
+
+
+
+
+Data Ingestion using Kernel Memory OpenAPI Web Service
+======================================================
+
+The example shows the default document ingestion pipeline:
+
+1. Extract text: automatically recognize the file format and extract the information
+2. Partition the text into small chunks, ready for search and RAG prompts
+3. Extract embeddings using any LLM embedding generator
+4. Save embeddings into a vector index such as
+   [Azure AI Search](https://learn.microsoft.com/azure/search/vector-search-overview),
+   [Qdrant](https://qdrant.tech/) or other DBs.
+
+The example shows how to **safeguard private information** by specifying who owns each document, and
+how to **organize data** for search and faceted navigation, using **Tags**.
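+The tags applied at ingestion are what retrieval filters match against later. As a minimal
+preview (the retrieval examples below cover this in full), a query scoped to a single user
+could look like this; the question text is illustrative:
+
+```csharp
+var memory = new MemoryWebClient("http://127.0.0.1:9001");
+
+// Only memories tagged with user:devis@contoso.com are searched
+var answer = await memory.AskAsync("what's the project timeline?",
+    filter: MemoryFilters.ByTag("user", "devis@contoso.com"));
+```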
+
+## C#
+
+> ```csharp
+> #r "nuget: Microsoft.KernelMemory.WebClient"
+>
+> var memory = new MemoryWebClient("http://127.0.0.1:9001"); // <== URL of KM web service
+>
+> // Import a file
+> await memory.ImportDocumentAsync("meeting-transcript.docx");
+>
+> // Import a file specifying Document ID and Tags
+> await memory.ImportDocumentAsync("business-plan.docx",
+>     new Document("doc01")
+>         .AddTag("user", "devis@contoso.com")
+>         .AddTag("collection", "business")
+>         .AddTag("collection", "plans")
+>         .AddTag("fiscalYear", "2025"));
+> ```
+
+## Python
+
+> ```python
+> import requests
+>
+> # Files to import
+> files = {
+>     "file1": ("business-plan.docx", open("business-plan.docx", "rb")),
+> }
+>
+> # Tags to apply, used by queries to filter memory
+> data = { "documentId": "doc01",
+>          "tags": [ "user:devis@contoso.com",
+>                    "collection:business",
+>                    "collection:plans",
+>                    "fiscalYear:2025" ]
+> }
+>
+> response = requests.post("http://127.0.0.1:9001/upload", files=files, data=data)
+> ```
+
+
+
+
+Direct Data Ingestion using embedded Serverless .NET component
+==============================================================
+
+> ```csharp
+> var memory = new KernelMemoryBuilder()
+>     .WithOpenAIDefaults(Environment.GetEnvironmentVariable("OPENAI_API_KEY"))
+>     .Build();
+>
+> // Import a file
+> await memory.ImportDocumentAsync("meeting-transcript.docx");
+>
+> // Import a file specifying Document ID and Tags
+> await memory.ImportDocumentAsync("business-plan.docx",
+>     new Document("doc01")
+>         .AddTag("collection", "business")
+>         .AddTag("collection", "plans")
+>         .AddTag("fiscalYear", "2025"));
+> ```
+
+
+
+
+Memory retrieval and RAG
+========================
+
+Asking questions, running RAG prompts, and filtering by user and other criteria is simple, with
+answers including citations and all the information needed to verify their accuracy, pointing to
+which documents ground the response.
+
+## C#
+
+> ### Asking questions:
+> Questions can be asked targeting the entire memory set, or a subset using filters,
+> e.g. to implement security filters.
+> ```csharp
+> var answer1 = await memory.AskAsync("How many people attended the meeting?");
+>
+> var answer2 = await memory.AskAsync("what's the project timeline?",
+>     filter: MemoryFilters.ByTag("user", "devis@contoso.com"));
+> ```
+
+> ### Token usage:
+> When generating answers with LLMs, the result includes a token usage report.
+> ```csharp
+> foreach (var report in tokenUsage)
+> {
+>     Console.WriteLine($"{report.ServiceType}: {report.ModelName} ({report.ModelType})");
+>     Console.WriteLine($"- Input : {report.ServiceTokensIn}");
+>     Console.WriteLine($"- Output: {report.ServiceTokensOut}");
+> }
+> ```
+> #### Output:
+> > Azure OpenAI: gpt-4o (TextGeneration)
+> > - Input : 24356 tokens
+> > - Output: 103 tokens
+
+![km-stream-token-usage](https://github.com/user-attachments/assets/71abf161-106c-47cc-af06-66f810314687)
+
+> ### Data lineage, citations, referencing sources:
+>
+> ```csharp
+> await memory.ImportFileAsync("NASA-news.pdf");
+>
+> var answer = await memory.AskAsync("Any news from NASA about Orion?");
+>
+> Console.WriteLine(answer.Result + "\n");
+>
+> foreach (var x in answer.RelevantSources)
+> {
+>     Console.WriteLine($" * {x.SourceName} -- {x.Partitions.First().LastUpdate:D}");
+> }
+> ```
+>
+> > Yes, there is news from NASA about the Orion spacecraft. NASA has invited the
+> > media to see a new test version [......] For more information about the Artemis program,
+> > you can visit the NASA website.
+> >
+> > - **NASA-news.pdf -- Tuesday, August 1, 2023**
+
+## Python
+
+> ### Asking questions:
+>
+> ```python
+> import requests
+> import json
+>
+> data = {
+>     "question": "what's the project timeline?",
+>     "filters": [ {"user": ["devis@contoso.com"]} ]
+> }
+>
+> response = requests.post(
+>     "http://127.0.0.1:9001/ask",
+>     headers={"Content-Type": "application/json"},
+>     data=json.dumps(data),
+> ).json()
+>
+> print(response["text"])
+> ```
+
+## OpenAPI
+
+> ```
+> curl http://127.0.0.1:9001/ask -d'{"query":"Any news from NASA about Orion?"}' -H 'Content-Type: application/json'
+> ```
+>
+> ```json
+> {
+>   "Query": "Any news from NASA about Orion?",
+>   "Text": "Yes, there is news from NASA about the Orion spacecraft. NASA has invited the media to see a new test version [......] For more information about the Artemis program, you can visit the NASA website.",
+>   "RelevantSources": [
+>     {
+>       "Link": "...",
+>       "SourceContentType": "application/pdf",
+>       "SourceName": "file5-NASA-news.pdf",
+>       "Partitions": [
+>         {
+>           "Text": "Skip to main content\nJul 28, 2023\nMEDIA ADVISORY M23-095\nNASA Invites Media to See Recovery Craft for\nArtemis Moon Mission\n(/sites/default/files/thumbnails/image/ksc-20230725-ph-fmx01_0003orig.jpg)\nAboard the [......] to Mars (/topics/moon-to-\nmars/),Orion Spacecraft (/exploration/systems/orion/index.html)\nNASA Invites Media to See Recovery Craft for Artemis Moon Miss... https://www.nasa.gov/press-release/nasa-invites-media-to-see-recov...\n2 of 3 7/28/23, 4:51 PM",
+>           "Relevance": 0.8430657,
+>           "SizeInTokens": 863,
+>           "LastUpdate": "2023-08-01T08:15:02-07:00"
+>         }
+>       ]
+>     }
+>   ]
+> }
+> ```
+
+The OpenAPI schema ("swagger") is available at http://127.0.0.1:9001/swagger/index.html when
+running the service locally with OpenAPI enabled.
+[Here's a copy](https://editor.swagger.io/?url=https://raw.githubusercontent.com/microsoft/kernel-memory/refs/heads/main/swagger.json).
+
+🔗 See also:
+
+- [Full example with ingestion, search and RAG queries](https://github.com/microsoft/kernel-memory/tree/main/examples/001-dotnet-WebClient).
+- [Full example using serverless .NET component](https://github.com/microsoft/kernel-memory/blob/main/examples/002-dotnet-Serverless).
+
+Kernel Memory Docker image
+==========================
+
+If you want to give the service a quick test, use the following command
+to **start the Kernel Memory Service** using OpenAI:
+
+```shell
+docker run -e OPENAI_API_KEY="..." -it --rm -p 9001:9001 kernelmemory/service
+```
+
+If you prefer using custom settings and services such as Azure OpenAI, Azure
+Document Intelligence, etc., you should create an `appsettings.Development.json`
+file overriding the default values set in `appsettings.json` (a minimal example
+is sketched after this section), or use the included configuration wizard:
+
+    cd service/Service
+    dotnet run setup
+
+Then run this command to start the [Docker image](https://hub.docker.com/r/kernelmemory/service)
+with the configuration just created:
+
+on Windows:
+
+    docker run --volume .\appsettings.Development.json:/app/appsettings.Production.json -it --rm -p 9001:9001 kernelmemory/service
+
+on Linux / macOS:
+
+    docker run --volume ./appsettings.Development.json:/app/appsettings.Production.json -it --rm -p 9001:9001 kernelmemory/service
+
+🔗 See also:
+
+* [How to configure KM service](https://github.com/microsoft/kernel-memory/blob/main/service/Service/README.md#%EF%B8%8F-configuration)
+* [Deploy Kernel Memory to Azure](#kernel-memory-service-on-azure).
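+For example, a minimal `appsettings.Development.json` overriding only the OpenAI API key could
+look like the following sketch. The key path mirrors the `KernelMemory__Services__OpenAI__APIKey`
+environment variable used in the Aspire snippet above; run `dotnet run setup` to generate a
+complete file with the exact schema:
+
+```json
+{
+  "KernelMemory": {
+    "Services": {
+      "OpenAI": {
+        "APIKey": "...your OpenAI key..."
+      }
+    }
+  }
+}
+```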
+
+Memory as a Service: Data Ingestion Pipelines + RAG Web Service
+===============================================================
+
+Depending on your scenarios, you might want to run all the code **remotely through an asynchronous
+and scalable service, or locally inside your process.**
+
+![image](docs/img/kernel-memory-as-a-service.png)
+
+If you're importing small files, use only .NET, and can block the application process while
+importing documents, then local in-process execution can be fine, using the **MemoryServerless**
+class described below.
+
+However, if you are in one of these scenarios:
+
+- My app is written in **TypeScript, Java, Rust, or some other language**
+- I'd just like a web service to import data and send questions to answer
+- I'm importing **big documents that can require minutes to process**, and I don't want to block the
+  user interface
+- I need memory import to **run independently, supporting failures and retry logic**
+- I want to define **custom pipelines mixing multiple languages** like Python, TypeScript, etc.
+
+then you're likely looking for a **Memory Service**, and you can deploy Kernel Memory as a backend
+service, using the default ingestion logic, or your custom workflow including steps coded in
+Python/TypeScript/Java/etc., leveraging the asynchronous non-blocking memory encoding process,
+uploading documents and asking questions using the **MemoryWebClient**.
+
+![image](docs/img/kernel-memory-client.png)
+
+[Here](service/Service/README.md) you can find a complete set of instructions about
+[how to run the Kernel Memory service](service/Service/README.md).
+
+## Embedded Memory Component (aka "serverless")
+
+Kernel Memory works and scales best when running as an asynchronous **Web Service**, allowing you
+to ingest thousands of documents and other information without blocking your app.
+
+However, Kernel Memory can also run in serverless mode, embedding a `MemoryServerless` class
+instance in .NET backend/console/desktop apps in synchronous mode.
+Each request is processed immediately, although calling clients are responsible for handling
+transient errors.
+
+![image](docs/img/kernel-memory-embedded-serverless.png)
+
+
+
+
+Extensions
+==========
+
+Kernel Memory relies on external services to run stateful pipelines, store data, handle embeddings,
+and generate text responses. The project includes extensions that allow customization of file
+storage, queues, vector stores, and LLMs to fit specific requirements.
+
+- **AI**: Azure OpenAI, OpenAI, ONNX, Ollama, Anthropic, Azure AI Document Intelligence, Azure AI
+  Content Safety
+- **Vector Store**: Azure AI Search, Postgres, SQL Server, Elasticsearch, Qdrant, Redis, MongoDB
+  Atlas, In memory store
+- **File Storage**: Azure Blob storage, AWS S3, MongoDB Atlas, Local disk, In memory storage
+- **Ingestion pipelines**: Azure Queues, RabbitMQ, In memory queues
+
+Custom memory ingestion pipelines
+=================================
+
+Document ingestion operates as a stateful pipeline, executing steps in a defined sequence.
+By default, Kernel Memory employs a pipeline to **extract** text, **chunk** content, **vectorize**,
+and **store** data.
+
+If you need a custom data pipeline, you can modify the sequence, add new steps, or replace existing
+ones by providing custom "handlers" for each desired stage. This allows complete flexibility in
+defining how data is processed. For example:
+
+```csharp
+// Memory setup, e.g. how to calculate and where to store embeddings
+var memoryBuilder = new KernelMemoryBuilder()
+    .WithoutDefaultHandlers()
+    .WithOpenAIDefaults(Environment.GetEnvironmentVariable("OPENAI_API_KEY"));
+
+var memory = memoryBuilder.Build();
+
+// Plug in custom .NET handlers
+memory.Orchestrator.AddHandler<MyHandler1>("step1");
+memory.Orchestrator.AddHandler<MyHandler2>("step2");
+memory.Orchestrator.AddHandler<MyHandler3>("step3");
+
+// Use the custom handlers with the memory object
+await memory.ImportDocumentAsync(
+    new Document("mytest001")
+        .AddFile("file1.docx")
+        .AddFile("file2.pdf"),
+    steps: new[] { "step1", "step2", "step3" });
+```
+
+![image](docs/img/kernel-memory-pipelines.png)
+
+
+
+
+Kernel Memory (KM) and Semantic Kernel (SK)
+===========================================
+
+**Semantic Kernel is an SDK for C#, Python, and Java** used to develop solutions with AI. SK
+includes libraries that wrap direct calls to databases, supporting vector search.
+
+Semantic Kernel is maintained in three languages, while the list of supported storage engines
+(known as "connectors") varies across languages.
+
+**Kernel Memory (KM) is a SERVICE** built on Semantic Kernel, with additional features developed for
+RAG, Security, and Cloud deployment. As a service, **KM can be used from any language, tool, or
+platform, e.g. browser extensions and ChatGPT assistants.**
+
+Kernel Memory provides several features outside the scope of Semantic Kernel that would usually be
+developed manually, such as storing files, extracting text from documents, providing a framework to
+secure users' data, content moderation, etc.
+
+Kernel Memory is also leveraged to explore new AI patterns, which are sometimes backported to
+Semantic Kernel and Microsoft libraries, for instance flexible vector store schemas, advanced
+filtering, and authentication.
+
+Here's a comparison table:
+
+| Feature          | Kernel Memory                                                          | Semantic Kernel    |
+|------------------|------------------------------------------------------------------------|--------------------|
+| Runtime          | Memory as a Service, Web service                                       | SDK packages       |
+| Data formats     | Web pages, PDF, Images, Word, PowerPoint, Excel, Markdown, Text, JSON  | Text only          |
+| Language support | Any language                                                           | .NET, Python, Java |
+| RAG              | Yes                                                                    | -                  |
+| Cloud deployment | Yes                                                                    | -                  |
+
+Examples and Tools
+==================
+
+## Examples
+
+1. [Collection of Jupyter notebooks with various scenarios](examples/000-notebooks)
+2. [Using Kernel Memory web service to upload documents and answer questions](examples/001-dotnet-WebClient)
+3. [Importing files and asking questions without running the service (serverless mode)](examples/002-dotnet-Serverless)
+4. [Kernel Memory RAG with Azure services](examples/007-dotnet-serverless-azure)
+5. [Kernel Memory with .NET Aspire](examples/303-dotnet-aspire)
+6. [Using KM Plugin for Semantic Kernel](examples/003-dotnet-SemanticKernel-plugin)
+7.
Customizations + * [Processing files with custom logic (custom handlers) in serverless mode](examples/004-dotnet-serverless-custom-pipeline) + * [Processing files with custom logic (custom handlers) in asynchronous mode](examples/005-dotnet-AsyncMemoryCustomPipeline) + * [Customizing RAG and summarization prompts](examples/101-dotnet-custom-Prompts) + * [Custom partitioning/text chunking options](examples/102-dotnet-custom-partitioning-options) + * [Using a custom embedding/vector generator](examples/103-dotnet-custom-EmbeddingGenerator) + * [Using custom content decoders](examples/108-dotnet-custom-content-decoders) + * [Using a custom web scraper to fetch web pages](examples/109-dotnet-custom-webscraper) + * [Writing and using a custom ingestion handler](examples/201-dotnet-serverless-custom-handler) + * [Using Context Parameters to customize RAG prompt during a request](examples/209-dotnet-using-context-overrides) +8. Local models and external connectors + * [Using custom LLMs](examples/104-dotnet-custom-LLM) + * [Using local LLMs with Ollama](examples/212-dotnet-ollama) + * [Using local LLMs with llama.cpp via LlamaSharp](examples/105-dotnet-serverless-llamasharp) + * [Using local models with LM Studio](examples/208-dotnet-lmstudio) + * [Using Semantic Kernel LLM connectors](examples/107-dotnet-SemanticKernel-TextCompletion) + * [Generating answers with Anthropic LLMs](examples/110-dotnet-anthropic) +9. [Upload files and ask questions from command line using curl](examples/006-curl-calling-webservice) +10. [Summarizing documents, using synthetic memories](examples/106-dotnet-retrieve-synthetics) +11. [Hybrid Search with Azure AI Search](examples/111-dotnet-azure-ai-hybrid-search) +12. [Running a single asynchronous pipeline handler as a standalone service](examples/202-dotnet-custom-handler-as-a-service) +13. [Integrating Memory with ASP.NET applications and controllers](examples/204-dotnet-ASP.NET-MVC-integration) +14. [Sample code showing how to extract text from files](examples/205-dotnet-extract-text-from-docs) +15. [.NET configuration and logging](examples/206-dotnet-configuration-and-logging) +16. [Expanding chunks retrieving adjacent partitions](examples/207-dotnet-expanding-chunks-on-retrieval) +17. [Creating a Memory instance without KernelMemoryBuilder](examples/210-KM-without-builder) +18. [Intent Detection](examples/211-dotnet-WebClient-Intent-Detection) +19. [Fetching data from Discord](examples/301-discord-test-application) +20. [Test project using KM package from nuget.org](examples/203-dotnet-using-KM-nuget) + +## Tools + +1. [.NET appsettings.json generator](tools/InteractiveSetup) +2. [Curl script to upload files](tools/km-cli/upload-file.sh) +3. [Curl script to ask questions](tools/km-cli/ask.sh) +4. [Curl script to search documents](tools/km-cli/search.sh) +5. [Script to start Qdrant for development tasks](tools/run-qdrant.sh) +6. [Script to start Elasticsearch for development tasks](tools/run-elasticsearch.sh) +7. [Script to start MS SQL Server for development tasks](tools/run-mssql.sh) +8. [Script to start Redis for development tasks](tools/run-redis.sh) +9. [Script to start RabbitMQ for development tasks](tools/run-rabbitmq.sh) +10. [Script to start MongoDB Atlas for development tasks](tools/run-mongodb-atlas.sh) + +## .NET packages + +- **Microsoft.KernelMemory.WebClient:** .NET web client to call a running instance of Kernel Memory + web service. 
+
+  [![Nuget package](https://img.shields.io/nuget/vpre/Microsoft.KernelMemory.WebClient)](https://www.nuget.org/packages/Microsoft.KernelMemory.WebClient/)
+  [![Example code](https://img.shields.io/badge/example-code-blue)](examples/001-dotnet-WebClient)
+
+- **Microsoft.KernelMemory:** Kernel Memory library including all extensions and clients; it can be
+  used to build custom pipelines and handlers. It also contains the serverless client to use memory
+  in a synchronous way, without the web service.
+
+  [![Nuget package](https://img.shields.io/nuget/vpre/Microsoft.KernelMemory.Core)](https://www.nuget.org/packages/Microsoft.KernelMemory.Core/)
+  [![Serverless example](https://img.shields.io/badge/example-code-blue)](examples/002-dotnet-Serverless)
+  [![Custom pipeline example](https://img.shields.io/badge/example-code-blue)](examples/004-dotnet-serverless-custom-pipeline)
+  [![Custom pipeline example](https://img.shields.io/badge/example-code-blue)](examples/005-dotnet-async-memory-custom-pipeline)
+
+- **Microsoft.KernelMemory.Service.AspNetCore:** an extension to load Kernel Memory into your
+  ASP.NET apps (see the hosting sketch after this section).
+
+  [![Nuget package](https://img.shields.io/nuget/vpre/Microsoft.KernelMemory.Service.AspNetCore)](https://www.nuget.org/packages/Microsoft.KernelMemory.Service.AspNetCore/)
+  [![Example code](https://img.shields.io/badge/example-code-blue)](examples/204-dotnet-ASP.NET-MVC-integration)
+
+- **Microsoft.KernelMemory.SemanticKernelPlugin:** a Memory plugin for Semantic Kernel, replacing
+  the original Semantic Memory available in SK.
+
+  [![Nuget package](https://img.shields.io/nuget/vpre/Microsoft.KernelMemory.SemanticKernelPlugin)](https://www.nuget.org/packages/Microsoft.KernelMemory.SemanticKernelPlugin/)
+  [![Example code](https://img.shields.io/badge/example-code-blue)](examples/003-dotnet-SemanticKernel-plugin)
+
+- [**Microsoft.KernelMemory.\*** packages](https://www.nuget.org/packages?q=microsoft.kernelmemory):
+  Kernel Memory Core and all KM extensions split into distinct packages.
+
+### Packages for Python, Java and other languages
+
+Kernel Memory service offers a **Web API** out of the box, including the
+[**OpenAPI swagger**](https://editor.swagger.io/?url=https://github.com/microsoft/kernel-memory/blob/main/swagger.json)
+documentation that you can leverage to test the API and create custom web clients. For instance,
+after starting the service locally, see http://127.0.0.1:9001/swagger/index.html.
+
+A .NET Web Client and a Semantic Kernel plugin are available; see the NuGet packages above.
+
+For Python, TypeScript, Java and other languages we recommend leveraging the Web Service. We also welcome
+PR contributions to support more languages.
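+As a sketch of the ASP.NET extension mentioned above, hosting the KM endpoints inside your own
+app could look like the following (method names follow the ASP.NET integration example linked
+above and should be treated as indicative rather than a complete setup):
+
+```csharp
+var builder = WebApplication.CreateBuilder(args);
+
+// Register Kernel Memory services (serverless memory; assumes an OpenAI key in configuration)
+builder.Services.AddKernelMemory(km => km.WithOpenAIDefaults(builder.Configuration["OpenAIKey"]));
+
+var app = builder.Build();
+
+// Expose the Kernel Memory web API (upload, ask, search, ...) under the /km prefix
+app.AddKernelMemoryEndpoints("/km");
+
+app.Run();
+```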
+ + + + +Contributors +============ + + + +[aaronpowell](https://github.com/aaronpowell) |[afederici75](https://github.com/afederici75) |[akordowski](https://github.com/akordowski) |[alexibraimov](https://github.com/alexibraimov) |[alexmg](https://github.com/alexmg) |[alkampfergit](https://github.com/alkampfergit) | +:---: |:---: |:---: |:---: |:---: |:---: | +[aaronpowell](https://github.com/aaronpowell) |[afederici75](https://github.com/afederici75) |[akordowski](https://github.com/akordowski) |[alexibraimov](https://github.com/alexibraimov) |[alexmg](https://github.com/alexmg) |[alkampfergit](https://github.com/alkampfergit) | + +[amomra](https://github.com/amomra) |[anthonypuppo](https://github.com/anthonypuppo) |[aportillo83](https://github.com/aportillo83) |[carlodek](https://github.com/carlodek) |[chaelli](https://github.com/chaelli) |[cherchyk](https://github.com/cherchyk) | +:---: |:---: |:---: |:---: |:---: |:---: | +[amomra](https://github.com/amomra) |[anthonypuppo](https://github.com/anthonypuppo) |[aportillo83](https://github.com/aportillo83) |[carlodek](https://github.com/carlodek) |[chaelli](https://github.com/chaelli) |[cherchyk](https://github.com/cherchyk) | + +[coryisakson](https://github.com/coryisakson) |[crickman](https://github.com/crickman) |[dependabot[bot]](https://github.com/apps/dependabot) |[dluc](https://github.com/dluc) |[DM-98](https://github.com/DM-98) |[EelcoKoster](https://github.com/EelcoKoster) | +:---: |:---: |:---: |:---: |:---: |:---: | +[coryisakson](https://github.com/coryisakson) |[crickman](https://github.com/crickman) |[dependabot[bot]](https://github.com/apps/dependabot) |[dluc](https://github.com/dluc) |[DM-98](https://github.com/DM-98) |[EelcoKoster](https://github.com/EelcoKoster) | + +[Foorcee](https://github.com/Foorcee) |[GraemeJones104](https://github.com/GraemeJones104) |[imranshams](https://github.com/imranshams) |[jurepurgar](https://github.com/jurepurgar) |[JustinRidings](https://github.com/JustinRidings) |[kbeaugrand](https://github.com/kbeaugrand) | +:---: |:---: |:---: |:---: |:---: |:---: | +[Foorcee](https://github.com/Foorcee) |[GraemeJones104](https://github.com/GraemeJones104) |[imranshams](https://github.com/imranshams) |[jurepurgar](https://github.com/jurepurgar) |[JustinRidings](https://github.com/JustinRidings) |[kbeaugrand](https://github.com/kbeaugrand) | + +[koteus](https://github.com/koteus) |[KSemenenko](https://github.com/KSemenenko) |[lecramr](https://github.com/lecramr) |[luismanez](https://github.com/luismanez) |[marcominerva](https://github.com/marcominerva) |[neel015](https://github.com/neel015) | +:---: |:---: |:---: |:---: |:---: |:---: | +[koteus](https://github.com/koteus) |[KSemenenko](https://github.com/KSemenenko) |[lecramr](https://github.com/lecramr) |[luismanez](https://github.com/luismanez) |[marcominerva](https://github.com/marcominerva) |[neel015](https://github.com/neel015) | + +[pascalberger](https://github.com/pascalberger) |[pawarsum12](https://github.com/pawarsum12) |[pradeepr-roboticist](https://github.com/pradeepr-roboticist) |[qihangnet](https://github.com/qihangnet) |[roldengarm](https://github.com/roldengarm) |[setuc](https://github.com/setuc) | +:---: |:---: |:---: |:---: |:---: |:---: | +[pascalberger](https://github.com/pascalberger) |[pawarsum12](https://github.com/pawarsum12) |[pradeepr-roboticist](https://github.com/pradeepr-roboticist) |[qihangnet](https://github.com/qihangnet) |[roldengarm](https://github.com/roldengarm) |[setuc](https://github.com/setuc) | + 
+[slapointe](https://github.com/slapointe) |[slorello89](https://github.com/slorello89) |[snakex64](https://github.com/snakex64) |[spenavajr](https://github.com/spenavajr) |[TaoChenOSU](https://github.com/TaoChenOSU) |[tarekgh](https://github.com/tarekgh) |
+:---: |:---: |:---: |:---: |:---: |:---: |
+[slapointe](https://github.com/slapointe) |[slorello89](https://github.com/slorello89) |[snakex64](https://github.com/snakex64) |[spenavajr](https://github.com/spenavajr) |[TaoChenOSU](https://github.com/TaoChenOSU) |[tarekgh](https://github.com/tarekgh) |
+
+[teresaqhoang](https://github.com/teresaqhoang) |[tomasz-skarzynski](https://github.com/tomasz-skarzynski) |[v-msamovendyuk](https://github.com/v-msamovendyuk) |[Valkozaur](https://github.com/Valkozaur) |[vicperdana](https://github.com/vicperdana) |[walexee](https://github.com/walexee) |
+:---: |:---: |:---: |:---: |:---: |:---: |
+[teresaqhoang](https://github.com/teresaqhoang) |[tomasz-skarzynski](https://github.com/tomasz-skarzynski) |[v-msamovendyuk](https://github.com/v-msamovendyuk) |[Valkozaur](https://github.com/Valkozaur) |[vicperdana](https://github.com/vicperdana) |[walexee](https://github.com/walexee) |
+
+[westdavidr](https://github.com/westdavidr) |[xbotter](https://github.com/xbotter) |
+:---: |:---: |
+[westdavidr](https://github.com/westdavidr) |[xbotter](https://github.com/xbotter) |
\ No newline at end of file
diff --git a/extensions/Chunkers/Chunkers/MarkDownChunker.cs b/extensions/Chunkers/Chunkers/MarkDownChunker.cs
index fa32357a5..b974809c8 100644
--- a/extensions/Chunkers/Chunkers/MarkDownChunker.cs
+++ b/extensions/Chunkers/Chunkers/MarkDownChunker.cs
@@ -11,14 +11,15 @@
 using System.Text;
 using Microsoft.KernelMemory.AI;
 using Microsoft.KernelMemory.Chunkers.internals;
+using Microsoft.KernelMemory.DataFormats;

 namespace Microsoft.KernelMemory.Chunkers;

 /// <summary>
-/// Plain text chunker for splitting text into blocks of a maximum number of tokens.
-/// Designed for Plain Text and RAG scenarios, where some special chars are irrelevant
+/// Markdown text chunker for splitting Markdown content into blocks of a maximum number of tokens.
+/// Designed for Markdown and RAG scenarios, where some special chars are irrelevant
 /// and can be removed, ie. the split can be lossy.
-/// This chunker should not be used for MarkDown, where symbols have a special meaning,
+/// This chunker should not be used for plain text, where symbols have a different meaning,
 /// or different priorities for splitting.
 /// Although not designed to chunk source code or math formulas, it tries to do its best.
 /// Acronyms with dots (e.g. N.A.S.A.) are not considered and are potentially split like sentences.
@@ -238,7 +239,7 @@ internal List<string> RecursiveSplit(
         // Important: 'SplitToFragments' splits content in words and delimiters, using logic specific to plain text.
         // These are different from LLM tokens, which are based on the tokenizer used to train the model.
         // Recursive logic exit clause: when separator type is NotASeparator, count each char as a fragment
-        List<Fragment> fragments = separatorType switch
+        List<Chunk> fragments = separatorType switch
         {
             SeparatorTypes.ExplicitSeparator => this.SplitToFragments(text, s_explicitSeparators),
             SeparatorTypes.PotentialSeparator => this.SplitToFragments(text, s_potentialSeparators),
@@ -253,7 +254,7 @@
     }

     internal List<string> GenerateChunks(
-        List<Fragment> fragments,
+        List<Chunk> fragments,
         int maxChunk1Size,
         int maxChunkNSize,
         SeparatorTypes separatorType,
@@ -396,18 +397,18 @@ internal List<string> GenerateChunks(
     /// <summary>
     /// Split text into fragments using a list of separators.
     /// </summary>
-    internal List<Fragment> SplitToFragments(string text, SeparatorTrie? separators)
+    internal List<Chunk> SplitToFragments(string text, SeparatorTrie? separators)
     {
         // Split all chars
         if (separators == null)
         {
-            return text.Select(x => new Fragment(x, true)).ToList();
+            return text.Select(x => new Chunk(x, -1) { IsSeparator = true }).ToList();
         }

         // If the text is empty or there are no separators
         if (string.IsNullOrEmpty(text) || separators.Length == 0) { return []; }

-        var fragments = new List<Fragment>();
+        var fragments = new List<Chunk>();
         var fragmentBuilder = new StringBuilder();
         int index = 0;
         while (index < text.Length)
@@ -418,11 +419,11 @@
             {
                 if (fragmentBuilder.Length > 0)
                 {
-                    fragments.Add(new Fragment(fragmentBuilder, false));
+                    fragments.Add(new Chunk(fragmentBuilder, -1) { IsSeparator = false });
                     fragmentBuilder.Clear();
                 }

-                fragments.Add(new Fragment(foundSeparator, true));
+                fragments.Add(new Chunk(foundSeparator, -1) { IsSeparator = true });
                 index += foundSeparator.Length;
             }
             else
@@ -434,7 +435,7 @@

         if (fragmentBuilder.Length > 0)
         {
-            fragments.Add(new Fragment(fragmentBuilder, false));
+            fragments.Add(new Chunk(fragmentBuilder, -1) { IsSeparator = false });
         }

 #if DEBUGFRAGMENTS
diff --git a/extensions/Chunkers/Chunkers/PlainTextChunker.cs b/extensions/Chunkers/Chunkers/PlainTextChunker.cs
index 9bdc8955a..4e0a6ea90 100644
--- a/extensions/Chunkers/Chunkers/PlainTextChunker.cs
+++ b/extensions/Chunkers/Chunkers/PlainTextChunker.cs
@@ -11,6 +11,7 @@
 using System.Text;
 using Microsoft.KernelMemory.AI;
 using Microsoft.KernelMemory.Chunkers.internals;
+using Microsoft.KernelMemory.DataFormats;

 namespace Microsoft.KernelMemory.Chunkers;

@@ -213,7 +214,7 @@ internal List<string> RecursiveSplit(
         // Important: 'SplitToFragments' splits content in words and delimiters, using logic specific to plain text.
         // These are different from LLM tokens, which are based on the tokenizer used to train the model.
         // Recursive logic exit clause: when separator type is NotASeparator, count each char as a fragment
-        List<Fragment> fragments = separatorType switch
+        List<Chunk> fragments = separatorType switch
         {
             SeparatorTypes.ExplicitSeparator => this.SplitToFragments(text, s_explicitSeparators),
             SeparatorTypes.PotentialSeparator => this.SplitToFragments(text, s_potentialSeparators),
@@ -228,7 +229,7 @@
     }

     internal List<string> GenerateChunks(
-        List<Fragment> fragments,
+        List<Chunk> fragments,
         int maxChunk1Size,
         int maxChunkNSize,
         SeparatorTypes separatorType,
@@ -371,18 +372,18 @@ internal List<string> GenerateChunks(
     /// <summary>
     /// Split text into fragments using a list of separators.
     /// </summary>
-    internal List<Fragment> SplitToFragments(string text, SeparatorTrie?
separators) { // Split all chars if (separators == null) { - return text.Select(x => new Fragment(x, true)).ToList(); + return text.Select(x => new Chunk(x, -1) { IsSeparator = true }).ToList(); } // If the text is empty or there are no separators if (string.IsNullOrEmpty(text) || separators.Length == 0) { return []; } - var fragments = new List(); + var fragments = new List(); var fragmentBuilder = new StringBuilder(); int index = 0; while (index < text.Length) @@ -393,11 +394,11 @@ internal List SplitToFragments(string text, SeparatorTrie? separators) { if (fragmentBuilder.Length > 0) { - fragments.Add(new Fragment(fragmentBuilder, false)); + fragments.Add(new Chunk(fragmentBuilder, -1) { IsSeparator = false }); fragmentBuilder.Clear(); } - fragments.Add(new Fragment(foundSeparator, true)); + fragments.Add(new Chunk(foundSeparator, -1) { IsSeparator = true }); index += foundSeparator.Length; } else @@ -409,7 +410,7 @@ internal List SplitToFragments(string text, SeparatorTrie? separators) if (fragmentBuilder.Length > 0) { - fragments.Add(new Fragment(fragmentBuilder, false)); + fragments.Add(new Chunk(fragmentBuilder, -1) { IsSeparator = false }); } #if DEBUGFRAGMENTS diff --git a/extensions/Chunkers/Chunkers/internals/Fragment.cs b/extensions/Chunkers/Chunkers/internals/Fragment.cs deleted file mode 100644 index 697233d97..000000000 --- a/extensions/Chunkers/Chunkers/internals/Fragment.cs +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright (c) Microsoft. All rights reserved. - -using System.Text; - -namespace Microsoft.KernelMemory.Chunkers.internals; - -internal class Fragment -{ - public readonly string Content; - public readonly bool IsSeparator; - - public Fragment(char content, bool isSeparator) - { - this.Content = content.ToString(); - this.IsSeparator = isSeparator; - } - - public Fragment(string content, bool isSeparator) - { - this.Content = content; - this.IsSeparator = isSeparator; - } - - public Fragment(StringBuilder content, bool isSeparator) - { - this.Content = content.ToString(); - this.IsSeparator = isSeparator; - } -} diff --git a/extensions/Chunkers/Chunkers/internals/PlainTextChunkerV1.cs b/extensions/Chunkers/Chunkers/internals/PlainTextChunkerV1.cs deleted file mode 100644 index 166e8b98e..000000000 --- a/extensions/Chunkers/Chunkers/internals/PlainTextChunkerV1.cs +++ /dev/null @@ -1,541 +0,0 @@ -// Copyright (c) Microsoft. All rights reserved. - -// #define DEBUGCHUNKS__ -// #define DEBUGFRAGMENTS__ -// -// using System; -// using System.Collections.Generic; -// using System.Diagnostics.CodeAnalysis; -// using System.Linq; -// using System.Text; -// using Microsoft.KernelMemory.AI; -// -// namespace Microsoft.KernelMemory.Chunkers.internals; -// -// /// -// /// Plain text chunker for splitting text into blocks of a maximum number of tokens. -// /// Designed for Plain Text and RAG scenarios, where some special chars are irrelevant -// /// and can be removed, ie. the split can be lossy. -// /// This chunker should not be used for MarkDown, where symbols have a special meaning, -// /// or different priorities for splitting. -// /// Although not designed to chunk source code or math formulas, it tries to do its best. -// /// Acronyms with dots (e.g. N.A.S.A.) are not considered and are potentially split like sentences. -// /// Anomalous-long sentences are split during the chunking loop, potentially introducing noise at the start of following chunks. 
-// /// TODO: improve performance -// /// -// [Experimental("KMEXP00")] -// public class PlainTextChunkerV1 -// { -// public class Options -// { -// /// -// /// Maximum number of tokens per chunk -// /// -// public int MaxTokensPerChunk { get; set; } = 1024; -// -// /// -// /// Number of tokens to copy and repeat from a chunk into the next. -// /// -// public int Overlap { get; set; } = 0; -// -// /// -// /// Optional header to add before each chunk. -// /// -// public string? ChunkHeader { get; set; } = null; -// } -// -// internal enum SeparatorTypes -// { -// NotASeparator = 0, -// ExplicitSeparator = 1, -// PotentialSeparator = 2, -// WeakSeparator = 3, -// } -// -// internal class Fragment -// { -// internal string Content = string.Empty; -// internal SeparatorTypes SeparatorType = SeparatorTypes.NotASeparator; -// } -// -// private class ChunkBuilder -// { -// public readonly StringBuilder FullContent = new(); -// public readonly StringBuilder NextSentence = new(); -// } -// -// // Do not allow chunks smaller than this size, to avoid unnecessary computation. -// // Realistically, a chunk should be at least 1000 tokens long. -// private const int MinChunkSize = 5; -// -// private readonly ITextTokenizer _tokenizer; -// -// // Prioritized list of characters to split sentence from sentence. -// private static readonly List s_explicitSplitSequences = -// [ -// // Symbol + space -// ". ", ".\t", ".\n", // note: covers also the case of multiple '.' like "....\n" -// "? ", "?\t", "?\n", // note: covers also the case of multiple '?' and '!?' like "?????\n" and "?!?\n" -// "! ", "!\t", "!\n", // note: covers also the case of multiple '!' and '?!' like "!!!\n" and "!?!\n" -// "⁉ ", "⁉\t", "⁉\n", -// "⁈ ", "⁈\t", "⁈\n", -// "⁇ ", "⁇\t", "⁇\n", -// "… ", "…\t", "…\n", -// // Multi-char separators without space, ordered by length -// "!!!!", "????", "!!!", "???", "?!?", "!?!", "!?", "?!", "!!", "??", "....", "...", "..", -// // 1 char separators without space -// ".", "?", "!", "⁉", "⁈", "⁇", "…", -// ]; -// -// // Prioritized list of characters to split inside a sentence. -// private static readonly List s_potentialSplitSequences = -// [ -// "; ", ";\t", ";\n", ";", -// "} ", "}\t", "}\n", "}", // note: curly brace without spaces is up here because it's a common code ending char, more important than ')' or ']' -// ") ", ")\t", ")\n", -// "] ", "]\t", "]\n", -// ")", "]", -// ]; -// -// // Prioritized list of characters to split inside a sentence when other splits are not found. -// private static readonly List s_weakSplitSequences = -// [ -// ":", // note: \n \t make no difference with this char -// ",", // note: \n \t make no difference with this char -// " ", // note: \n \t make no difference with this char -// "-", // note: \n \t make no difference with this char -// ]; -// -// public PlainTextChunkerV1(ITextTokenizer? tokenizer = null) -// { -// this._tokenizer = tokenizer ?? 
new CL100KTokenizer();
-//
-//         // Check that split options are shorter than 5 chars
-//         if (s_explicitSplitSequences.Any(x => x is { Length: > 4 }))
-//         {
-//             throw new SystemException(nameof(PlainTextChunkerV1) + " contains invalid split sequences, max four chars sequences are supported.");
-//         }
-//
-//         if (s_potentialSplitSequences.Any(x => x is { Length: > 4 }))
-//         {
-//             throw new SystemException(nameof(PlainTextChunkerV1) + " contains invalid split sequences, max four chars sequences are supported.");
-//         }
-//
-//         if (s_weakSplitSequences.Any(x => x is { Length: > 4 }))
-//         {
-//             throw new SystemException(nameof(PlainTextChunkerV1) + " contains invalid split sequences, max four chars sequences are supported.");
-//         }
-//     }
-//
-//     /// <summary>
-//     /// Split plain text into blocks.
-//     /// Note:
-//     /// - \r\n characters are replaced with \n
-//     /// - \r characters are replaced with \n
-//     /// - \t character is not replaced
-//     /// - Chunks cannot be smaller than [MinChunkSize] tokens (header excluded)
-//     /// </summary>
-//     /// <param name="text">Text to split</param>
-//     /// <param name="options">How to handle input and how to generate chunks</param>
-//     /// <returns>List of chunks.</returns>
-//     public List<string> Split(string text, Options options)
-//     {
-//         ArgumentNullException.ThrowIfNull(text);
-//         ArgumentNullException.ThrowIfNull(options);
-//
-//         // Clean up text. Note: LLMs don't use \r char
-//         text = text
-//             .Replace("\r\n", "\n", StringComparison.OrdinalIgnoreCase)
-//             .Replace("\r", "\n", StringComparison.OrdinalIgnoreCase)
-//             .Trim();
-//
-//         // Calculate chunk size leaving room for the optional chunk header
-//         int maxChunkSize = Math.Max(MinChunkSize, options.MaxTokensPerChunk - this.TokenCount(options.ChunkHeader));
-//
-//         // Chunk using recursive logic, starting with explicit separators and moving to weaker ones if needed
-//         var chunks = this.RecursiveSplit(text, maxChunkSize, SeparatorTypes.ExplicitSeparator);
-//
-//         // Add header to each chunk
-//         if (!string.IsNullOrEmpty(options.ChunkHeader))
-//         {
-//             chunks = chunks.Select(x => $"{options.ChunkHeader}{x}").ToList();
-//         }
-//
-//         // TODO: add overlapping tokens
-//
-// #if DEBUGCHUNKS
-//         this.DebugChunks(chunks);
-// #endif
-//
-//         return chunks;
-//     }
-//
-//     /// <summary>
-//     /// Greedy algorithm aggregating fragments into chunks separated by a specific separator type.
-//     /// If any of the generated chunks is too long, those are split recursively using weaker separators.
-//     /// </summary>
-//     /// <param name="text">Text to split</param>
-//     /// <param name="maxChunkSize">Max size of each chunk</param>
-//     /// <param name="separatorType">Type of separator to detect</param>
-//     /// <returns>List of strings</returns>
-//     internal List<string> RecursiveSplit(string text, int maxChunkSize, SeparatorTypes separatorType)
-//     {
-//         // Edge case: empty text
-//         if (string.IsNullOrEmpty(text)) { return []; }
-//
-//         // Edge case: text is already short enough
-//         if (this.TokenCount(text) <= maxChunkSize) { return [text]; }
-//
-//         // Important: 'SplitToFragments' splits content in words and delimiters, using logic specific to plain text.
-//         // These are different from LLM tokens, which are based on the tokenizer used to train the model.
-//         // Recursive logic exit clause: when separator type is NotASeparator, count each char as a fragment
-//         // TODO: reuse fragments from previous calls, this call is very expensive
-//         List<Fragment> fragments = separatorType != SeparatorTypes.NotASeparator
-//             ? SplitToFragments(text)
-//             : text.Select(x => new Fragment { Content = x.ToString(), SeparatorType = SeparatorTypes.NotASeparator }).ToList();
-//
-//         var chunks = this.GenerateChunks(fragments, maxChunkSize, separatorType);
-//
-//         // TODO: overlap
-//
-//         return chunks;
-//     }
-//
-//     internal static SeparatorTypes NextSeparatorType(SeparatorTypes separatorType)
-//     {
-//         switch (separatorType)
-//         {
-//             case SeparatorTypes.ExplicitSeparator: return SeparatorTypes.PotentialSeparator;
-//             case SeparatorTypes.PotentialSeparator: return SeparatorTypes.WeakSeparator;
-//             case SeparatorTypes.WeakSeparator: return SeparatorTypes.NotASeparator;
-//             default: throw new ArgumentOutOfRangeException(nameof(SeparatorTypes.NotASeparator) + " doesn't have a next separator type.");
-//         }
-//     }
-//
-//     internal List<string> GenerateChunks(List<Fragment> fragments, int maxChunkSize, SeparatorTypes separatorType)
-//     {
-//         if (fragments.Count == 0) { return []; }
-//
-//         var chunks = new List<string>();
-//         var chunk = new ChunkBuilder();
-//
-//         foreach (var fragment in fragments)
-//         {
-//             // Note: fragments != LLM tokens. One fragment can contain multiple tokens.
-//             chunk.NextSentence.Append(fragment.Content);
-//
-//             // PERFORMANCE: wait for a complete sentence, avoiding expensive string computations
-//             if (fragment.SeparatorType != separatorType) { continue; }
-//
-//             var nextSentence = chunk.NextSentence.ToString();
-//             var nextSentenceSize = this.TokenCount(nextSentence);
-//
-//             // Detect current state
-//             // 1:
-//             //    - the current chunk is still empty
-//             //    - the next sentence is complete and is NOT too long
-//             // 2:
-//             //    - the current chunk is still empty
-//             //    - the next sentence is complete and is TOO LONG
-//             // 3:
-//             //    - the current chunk is NOT empty
-//             //    - the next sentence is complete and is NOT too long
-//             // 4:
-//             //    - the current chunk is NOT empty
-//             //    - the next sentence is complete and is TOO LONG
-//             int state;
-//             if (chunk.FullContent.Length == 0)
-//             {
-//                 state = (nextSentenceSize <= maxChunkSize) ? 1 : 2;
-//             }
-//             else
-//             {
-//                 state = (nextSentenceSize <= maxChunkSize) ? 3 : 4;
-//             }
-//
-//             switch (state)
-//             {
-//                 default:
-//                     throw new ArgumentOutOfRangeException(nameof(state));
-//
-//                 // - the current chunk is still empty
-//                 // - the next sentence is complete and is NOT too long
-//                 case 1:
-//                     chunk.FullContent.Append(nextSentence);
-//                     chunk.NextSentence.Clear();
-//                     continue;
-//
-//                 // - the current chunk is still empty
-//                 // - the next sentence is complete and is TOO LONG
-//                 case 2:
-//                 {
-//                     var moreChunks = this.RecursiveSplit(nextSentence, maxChunkSize, NextSeparatorType(separatorType));
-//                     chunks.AddRange(moreChunks.Take(moreChunks.Count - 1));
-//                     chunk.NextSentence.Clear().Append(moreChunks.Last());
-//                     continue;
-//                 }
-//
-//                 // - the current chunk is NOT empty
-//                 // - the next sentence is complete and is NOT too long
-//                 case 3:
-//                 {
-//                     var chunkPlusSentence = $"{chunk.FullContent}{chunk.NextSentence}";
-//                     if (this.TokenCount(chunkPlusSentence) <= maxChunkSize)
-//                     {
-//                         // Move next sentence to current chunk
-//                         chunk.FullContent.Append(chunk.NextSentence);
-//                     }
-//                     else
-//                     {
-//                         // Complete the current chunk and start a new one
-//                         chunks.Add(chunk.FullContent.ToString());
-//                         chunk.FullContent.Clear().Append(chunk.NextSentence);
-//                     }
-//
-//                     chunk.NextSentence.Clear();
-//                     continue;
-//                 }
-//
-//                 // - the current chunk is NOT empty
-//                 // - the next sentence is complete and is TOO LONG
-//                 case 4:
-//                 {
-//                     chunks.Add(chunk.FullContent.ToString());
-//                     chunk.FullContent.Clear();
-//
-//                     var moreChunks = this.RecursiveSplit(nextSentence, maxChunkSize, NextSeparatorType(separatorType));
-//                     chunks.AddRange(moreChunks.Take(moreChunks.Count - 1));
-//                     chunk.NextSentence.Clear().Append(moreChunks.Last());
-//                     continue;
-//                 }
-//             }
-//         }
-//
-//         // If there's something left in the buffers
-//         var fullSentenceLeft = chunk.FullContent.ToString();
-//         var nextSentenceLeft = chunk.NextSentence.ToString();
-//
-//         if (fullSentenceLeft.Length > 0 || nextSentenceLeft.Length > 0)
-//         {
-//             if (this.TokenCount($"{fullSentenceLeft}{nextSentenceLeft}") <= maxChunkSize)
-//             {
-//                 chunks.Add($"{fullSentenceLeft}{nextSentenceLeft}");
-//             }
-//             else
-//             {
-//                 if (fullSentenceLeft.Length > 0) { chunks.Add(fullSentenceLeft); }
-//
-//                 if (nextSentenceLeft.Length > 0)
-//                 {
-//                     if (this.TokenCount(nextSentenceLeft) < maxChunkSize)
-//                     {
-//                         chunks.Add($"{nextSentenceLeft}");
-//                     }
-//                     else
-//                     {
-//                         var moreChunks = this.RecursiveSplit(nextSentenceLeft, maxChunkSize, NextSeparatorType(separatorType));
-//                         chunks.AddRange(moreChunks);
-//                     }
-//                 }
-//             }
-//         }
-//
-//         return chunks;
-//     }
-//
-//     /// <summary>
-//     /// Split text using different separator types.
-//     /// - A fragment ends as soon as a separator is found.
-//     /// - A fragment can start with a separator.
-//     /// - A fragment does not contain two consecutive separators.
-//     /// TODO: considering that only one separator type is used and then the list of fragments is discarded, simplify the method
-//     /// </summary>
-//     internal static List<Fragment> SplitToFragments(string text)
-//     {
-//         var fragments = new List<Fragment>();
-//         var buffer = new StringBuilder();
-//
-//         void AddFragment(SeparatorTypes type, string separator, ref int cursor, int jump)
-//         {
-//             fragments.Add(new Fragment
-//             {
-//                 SeparatorType = type,
-//                 Content = $"{buffer}{separator}",
-//             });
-//             buffer.Clear();
-//             cursor += jump;
-//         }
-//
-//         void AddExplicitSeparator(string separator, ref int cursor, int jump)
-//         {
-//             AddFragment(SeparatorTypes.ExplicitSeparator, separator, ref cursor, jump);
-//         }
-//
-//         void AddPotentialSeparator(string separator, ref int cursor, int jump)
-//         {
-//             AddFragment(SeparatorTypes.PotentialSeparator, separator, ref cursor, jump);
-//         }
-//
-//         void AddWeakSeparator(string separator, ref int cursor, int jump)
-//         {
-//             AddFragment(SeparatorTypes.WeakSeparator, separator, ref cursor, jump);
-//         }
-//
-//         for (int i = 0; i < text.Length; i++)
-//         {
-//             // Note: split options are 4 chars max
-//             char char1 = text[i];
-//             char? char2 = i + 1 < text.Length ? text[i + 1] : null;
-//             char? char3 = i + 2 < text.Length ? text[i + 2] : null;
-//             char? char4 = i + 3 < text.Length ? text[i + 3] : null;
-//
-//             // Check if there's a 4-chars separator
-//             string fourCharWord = $"{char1}{char2}{char3}{char4}";
-//             if (char4.HasValue)
-//             {
-//                 if (s_explicitSplitSequences.Contains(fourCharWord))
-//                 {
-//                     AddExplicitSeparator(fourCharWord, ref i, 3);
-//                     continue;
-//                 }
-//
-//                 if (s_potentialSplitSequences.Contains(fourCharWord))
-//                 {
-//                     AddPotentialSeparator(fourCharWord, ref i, 3);
-//                     continue;
-//                 }
-//
-//                 if (s_weakSplitSequences.Contains(fourCharWord))
-//                 {
-//                     AddWeakSeparator(fourCharWord, ref i, 3);
-//                     continue;
-//                 }
-//             }
-//
-//             // Check if there's a 3-chars separator
-//             string threeCharWord = $"{char1}{char2}{char3}";
-//             if (char3.HasValue)
-//             {
-//                 if (s_explicitSplitSequences.Contains(threeCharWord))
-//                 {
-//                     AddExplicitSeparator(threeCharWord, ref i, 2);
-//                     continue;
-//                 }
-//
-//                 if (s_potentialSplitSequences.Contains(threeCharWord))
-//                 {
-//                     AddPotentialSeparator(threeCharWord, ref i, 2);
-//                     continue;
-//                 }
-//
-//                 if (s_weakSplitSequences.Contains(threeCharWord))
-//                 {
-//                     AddWeakSeparator(threeCharWord, ref i, 2);
-//                     continue;
-//                 }
-//             }
-//
-//             // Check if there's a 2-chars separator
-//             string twoCharWord = $"{char1}{char2}";
-//             if (char2.HasValue)
-//             {
-//                 if (s_explicitSplitSequences.Contains(twoCharWord))
-//                 {
-//                     AddExplicitSeparator(twoCharWord, ref i, 1);
-//                     continue;
-//                 }
-//
-//                 if (s_potentialSplitSequences.Contains(twoCharWord))
-//                 {
-//                     AddPotentialSeparator(twoCharWord, ref i, 1);
-//                     continue;
-//                 }
-//
-//                 if (s_weakSplitSequences.Contains(twoCharWord))
-//                 {
-//                     AddWeakSeparator(twoCharWord, ref i, 1);
-//                     continue;
-//                 }
-//             }
-//
-//             // Check if there's a 1-char separator
-//             string oneCharWord = $"{char1}";
-//             if (s_explicitSplitSequences.Contains(oneCharWord))
-//             {
-//                 AddExplicitSeparator(oneCharWord, ref i, 0);
-//                 continue;
-//             }
-//
-//             if (s_potentialSplitSequences.Contains(oneCharWord))
-//             {
-//                 AddPotentialSeparator(oneCharWord, ref i, 0);
-//                 continue;
-//             }
-//
-//             if (s_weakSplitSequences.Contains(oneCharWord))
-//             {
-//                 AddWeakSeparator(oneCharWord, ref i, 0);
-//                 continue;
-//             }
-//
-//             buffer.Append(char1);
-//         }
-//
-//         // Content after the last separator
-//         if (buffer.Length > 0)
-//         {
-//             var _ = 0;
-//             AddFragment(SeparatorTypes.NotASeparator, separator: "", cursor: ref _, jump: 0);
-//         }
-//
-// #if DEBUGFRAGMENTS
-//         this.DebugFragments(fragments);
-// #endif
-//
-//         return fragments;
-//     }
-//
-//     private int TokenCount(string? input)
-//     {
-//         if (input == null) { return 0; }
-//
-//         return this._tokenizer.CountTokens(input);
-//     }
-//
-//     #region internals
-//
-// #if DEBUGCHUNKS
-//     private void DebugChunks(List<string> result)
-//     {
-//         Console.WriteLine("----------------------------------");
-//         for (int index = 0; index < result.Count; index++)
-//         {
-//             Console.WriteLine($"- {index}: \"{result[index]}\" [{this.TokenCount(result[index])} tokens]");
-//         }
-//
-//         Console.WriteLine("----------------------------------");
-//     }
-// #endif
-//
-// #if DEBUGFRAGMENTS
-//     private void DebugFragments(List<Fragment> fragments)
-//     {
-//         if (fragments.Count == 0)
-//         {
-//             Console.WriteLine("No fragments in the list");
-//         }
-//
-//         for (int index = 0; index < fragments.Count; index++)
-//         {
-//             Fragment fragment = fragments[index];
-//             Console.WriteLine($"- {index}: \"{fragment.Content}\"");
-//         }
-//     }
-// #endif
-//
-//     #endregion
-// }
-
-#pragma warning disable CA0000 // reason
-
-#pragma warning restore CA0000
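For reference, the V1 logic deleted above boils down to a recursive fallback over separator tiers: try explicit separators first, and if a piece is still too long, retry with weaker separators, ending with a per-character hard cut. The sketch below is a simplified illustration of that idea only, not the shipped PlainTextChunker: the tier lists are abbreviated, the token counter is a stand-in (~4 chars per token) rather than a real tokenizer, and overlap/headers are omitted.

```csharp
using System;
using System.Collections.Generic;

static class RecursiveSplitSketch
{
    // Separator tiers, strongest to weakest; the real chunker's lists are longer
    // and distinguish explicit, potential and weak separators.
    private static readonly string[][] s_tiers =
    [
        ["\n\n", "\n"],     // explicit separators
        [". ", "! ", "? "], // potential separators (sentence ends)
        [", ", "; ", " "],  // weak separators
    ];

    // Stand-in token counter (~1 token per 4 chars); a real implementation
    // would call a tokenizer such as CL100KTokenizer.
    private static int CountTokens(string text) => (text.Length + 3) / 4;

    public static List<string> Split(string text, int maxTokens, int tier = 0)
    {
        if (CountTokens(text) <= maxTokens) { return [text]; }

        // Out of separators: hard-cut by characters (the V1 "NotASeparator" case)
        if (tier >= s_tiers.Length)
        {
            var pieces = new List<string>();
            int step = Math.Max(1, maxTokens * 4);
            for (int i = 0; i < text.Length; i += step)
            {
                pieces.Add(text.Substring(i, Math.Min(step, text.Length - i)));
            }

            return pieces;
        }

        // Cut the text into fragments ending at this tier's separators,
        // then greedily merge fragments up to maxTokens per chunk.
        var chunks = new List<string>();
        var current = "";
        foreach (string fragment in ToFragments(text, s_tiers[tier]))
        {
            if (CountTokens(current + fragment) <= maxTokens)
            {
                current += fragment;
                continue;
            }

            if (current.Length > 0) { chunks.Add(current); }

            // Fragment alone is too big: recurse with the next (weaker) tier
            if (CountTokens(fragment) > maxTokens)
            {
                chunks.AddRange(Split(fragment, maxTokens, tier + 1));
                current = "";
            }
            else
            {
                current = fragment;
            }
        }

        if (current.Length > 0) { chunks.Add(current); }

        return chunks;
    }

    // A fragment ends as soon as a separator is found (separator included)
    private static IEnumerable<string> ToFragments(string text, string[] separators)
    {
        int start = 0;
        for (int i = 0; i < text.Length; i++)
        {
            foreach (string sep in separators)
            {
                if (i + sep.Length <= text.Length && text.AsSpan(i, sep.Length).SequenceEqual(sep))
                {
                    yield return text.Substring(start, i - start + sep.Length);
                    start = i + sep.Length;
                    i += sep.Length - 1;
                    break;
                }
            }
        }

        if (start < text.Length) { yield return text.Substring(start); }
    }
}
```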
diff --git a/service/Abstractions/Configuration/TextPartitioningOptions.cs b/service/Abstractions/Configuration/TextPartitioningOptions.cs
index 32b015da1..24a8230ba 100644
--- a/service/Abstractions/Configuration/TextPartitioningOptions.cs
+++ b/service/Abstractions/Configuration/TextPartitioningOptions.cs
@@ -13,13 +13,6 @@ public class TextPartitioningOptions
     /// </summary>
     public int MaxTokensPerParagraph { get; set; } = 1000;
 
-    /// <summary>
-    /// The maximum number of tokens per line, aka per sentence.
-    /// When partitioning a block of text, the text will be split into sentences, that are then grouped into paragraphs.
-    /// Note that this applies to any text format, including tables, code, chats, log files, etc.
-    /// </summary>
-    public int MaxTokensPerLine { get; set; } = 300;
-
     /// <summary>
     /// The number of overlapping tokens between paragraphs.
     /// </summary>
@@ -35,21 +28,11 @@ public void Validate()
             throw new ConfigurationException($"Text partitioning: {nameof(this.MaxTokensPerParagraph)} cannot be less than 1");
         }
 
-        if (this.MaxTokensPerLine < 1)
-        {
-            throw new ConfigurationException($"Text partitioning: {nameof(this.MaxTokensPerLine)} cannot be less than 1");
-        }
-
         if (this.OverlappingTokens < 0)
         {
             throw new ConfigurationException($"Text partitioning: {nameof(this.OverlappingTokens)} cannot be less than 0");
         }
 
-        if (this.MaxTokensPerLine > this.MaxTokensPerParagraph)
-        {
-            throw new ConfigurationException($"Text partitioning: {nameof(this.MaxTokensPerLine)} cannot be more than {nameof(this.MaxTokensPerParagraph)}");
-        }
-
         if (this.OverlappingTokens >= this.MaxTokensPerParagraph)
         {
             throw new ConfigurationException($"Text partitioning: {nameof(this.OverlappingTokens)} must be less than {nameof(this.MaxTokensPerParagraph)}");
diff --git a/service/Abstractions/Constants.cs b/service/Abstractions/Constants.cs
index 2cb087917..8d485c1ef 100644
--- a/service/Abstractions/Constants.cs
+++ b/service/Abstractions/Constants.cs
@@ -36,7 +36,7 @@ public static class CustomContext
         public static class Partitioning
         {
             // Used to override MaxTokensPerParagraph config
-            public const string MaxTokensPerParagraph = "custom_partitioning_max_tokens_per_paragraph_int";
+            public const string MaxTokensPerChunk = "custom_partitioning_max_tokens_per_paragraph_int";
 
             // Used to override OverlappingTokens config
             public const string OverlappingTokens = "custom_partitioning_overlapping_tokens_int";
diff --git a/service/Abstractions/Context/IContext.cs b/service/Abstractions/Context/IContext.cs
index a0e6ac560..8deea5766 100644
--- a/service/Abstractions/Context/IContext.cs
+++ b/service/Abstractions/Context/IContext.cs
@@ -200,9 +200,9 @@ public static int GetCustomSummaryOverlappingTokensOrDefault(this IContext? cont
         return defaultValue;
     }
 
-    public static int GetCustomPartitioningMaxTokensPerParagraphOrDefault(this IContext? context, int defaultValue)
+    public static int GetCustomPartitioningMaxTokensPerChunkOrDefault(this IContext? context, int defaultValue)
     {
-        if (context.TryGetArg<int>(Constants.CustomContext.Partitioning.MaxTokensPerParagraph, out var customValue))
+        if (context.TryGetArg<int>(Constants.CustomContext.Partitioning.MaxTokensPerChunk, out var customValue))
        {
             return customValue;
         }
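With the renamed constant and extension method, the chunk size can still be overridden per ingestion request; note the underlying key string is unchanged, so existing stored values keep working. A hedged sketch, assuming the RequestContext implementation of IContext and an existing IKernelMemory instance named `memory` (the file name and values are illustrative):

```csharp
// Override chunking for a single ingestion request via context arguments
var context = new RequestContext();
context.SetArg(Constants.CustomContext.Partitioning.MaxTokensPerChunk, 512);
context.SetArg(Constants.CustomContext.Partitioning.OverlappingTokens, 64);

await memory.ImportDocumentAsync("manual.pdf", context: context);

// Handlers read the value back through the renamed extension method:
// int maxTokens = context.GetCustomPartitioningMaxTokensPerChunkOrDefault(1000);
```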
diff --git a/service/Abstractions/DataFormats/Chunk.cs b/service/Abstractions/DataFormats/Chunk.cs
index c136bbec7..135add8bb 100644
--- a/service/Abstractions/DataFormats/Chunk.cs
+++ b/service/Abstractions/DataFormats/Chunk.cs
@@ -1,6 +1,7 @@
 // Copyright (c) Microsoft. All rights reserved.
 
 using System.Collections.Generic;
+using System.Text;
 using System.Text.Json;
 using System.Text.Json.Serialization;
 
@@ -24,7 +25,7 @@ public class Chunk
     /// </summary>
     [JsonPropertyOrder(1)]
     [JsonPropertyName("content")]
-    public string Content { get; }
+    public string Content { get; set; }
 
     /// <summary>
     /// Optional metadata attached to the section.
     /// </summary>
@@ -37,6 +38,9 @@ public class Chunk
     [JsonPropertyName("metadata")]
     public Dictionary<string, object> Metadata { get; set; }
 
+    [JsonIgnore]
+    public bool IsSeparator { get; set; }
+
     /// <summary>
     /// Whether the first/last sentence may continue from the previous/into
     /// the next section (e.g. like PDF docs).
@@ -74,10 +78,10 @@ public int PageNumber
     /// </summary>
     /// <param name="number">Position within the parent content container</param>
     /// <param name="text">Text content</param>
-    public Chunk(int number, string? text)
+    public Chunk(string? text, int number)
     {
-        this.Number = number;
         this.Content = text ?? string.Empty;
+        this.Number = number;
         this.Metadata = new();
     }
 
@@ -86,11 +90,35 @@ public Chunk(int number, string? text)
     /// </summary>
     /// <param name="number">Position within the parent content container</param>
     /// <param name="text">Text content</param>
-    /// <param name="metadata">Chunk metadata</param>
-    public Chunk(int number, string? text, Dictionary<string, object> metadata)
+    public Chunk(char text, int number)
+    {
+        this.Content = text.ToString();
+        this.Number = number;
+        this.Metadata = new();
+    }
+
+    /// <summary>
+    /// Create new instance
+    /// </summary>
+    /// <param name="number">Position within the parent content container</param>
+    /// <param name="text">Text content</param>
+    public Chunk(StringBuilder text, int number)
     {
+        this.Content = text.ToString();
         this.Number = number;
+        this.Metadata = new();
+    }
+
+    /// <summary>
+    /// Create new instance
+    /// </summary>
+    /// <param name="number">Position within the parent content container</param>
+    /// <param name="text">Text content</param>
+    /// <param name="metadata">Chunk metadata</param>
+    public Chunk(string? text, int number, Dictionary<string, object> metadata)
+    {
         this.Content = text ?? string.Empty;
+        this.Number = number;
         this.Metadata = metadata;
     }
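The constructors now take the text first and the position second, and the new char/StringBuilder overloads let the chunkers build sections without intermediate strings. A quick usage sketch of the overloads introduced above (values are arbitrary):

```csharp
using System.Text;

// New argument order: text first, then position
var fromString = new Chunk("worksheet content", 1);
var fromChar = new Chunk('\n', 2);
var fromBuilder = new Chunk(new StringBuilder("accumulated text"), 3);
var withMeta = new Chunk("page text", 4, Chunk.Meta(sentencesAreComplete: false));

// Content is now mutable, so post-processing no longer needs a new instance:
withMeta.Content = withMeta.Content.Trim();
```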
diff --git a/service/Abstractions/DataFormats/TextChunking.cs b/service/Abstractions/DataFormats/TextChunking.cs
deleted file mode 100644
index f5bf06cfd..000000000
--- a/service/Abstractions/DataFormats/TextChunking.cs
+++ /dev/null
@@ -1,17 +0,0 @@
-// Copyright (c) Microsoft. All rights reserved.
-
-// namespace Microsoft.KernelMemory.DataFormats;
-//
-// public static class TextChunking
-// {
-//     /// <summary>
-//     /// Delegate for counting tokens in a string.
-//     /// </summary>
-//     /// <param name="input">The input string to count tokens in.</param>
-//     /// <returns>The number of tokens in the input string.</returns>
-//     public delegate int TokenCounter(string input);
-// }
-
-#pragma warning disable CA0000 // reason
-
-#pragma warning restore CA0000
diff --git a/service/Core/DataFormats/Image/ImageDecoder.cs b/service/Core/DataFormats/Image/ImageDecoder.cs
index aadc95ae8..8b38e591e 100644
--- a/service/Core/DataFormats/Image/ImageDecoder.cs
+++ b/service/Core/DataFormats/Image/ImageDecoder.cs
@@ -40,7 +40,7 @@ public async Task DecodeAsync(string filename, CancellationToken ca
         var result = new FileContent(MimeTypes.PlainText);
 
         var content = await this.ImageToTextAsync(filename, cancellationToken).ConfigureAwait(false);
-        result.Sections.Add(new(1, content.Trim(), Chunk.Meta(sentencesAreComplete: true)));
+        result.Sections.Add(new(content.Trim(), 1, Chunk.Meta(sentencesAreComplete: true)));
 
         return result;
     }
@@ -52,7 +52,7 @@ public async Task DecodeAsync(BinaryData data, CancellationToken ca
         var result = new FileContent(MimeTypes.PlainText);
 
         var content = await this.ImageToTextAsync(data, cancellationToken).ConfigureAwait(false);
-        result.Sections.Add(new(1, content.Trim(), Chunk.Meta(sentencesAreComplete: true)));
+        result.Sections.Add(new(content.Trim(), 1, Chunk.Meta(sentencesAreComplete: true)));
 
         return result;
     }
@@ -64,7 +64,7 @@ public async Task DecodeAsync(Stream data, CancellationToken cancel
         var result = new FileContent(MimeTypes.PlainText);
 
         var content = await this.ImageToTextAsync(data, cancellationToken).ConfigureAwait(false);
-        result.Sections.Add(new(1, content.Trim(), Chunk.Meta(sentencesAreComplete: true)));
+        result.Sections.Add(new(content.Trim(), 1, Chunk.Meta(sentencesAreComplete: true)));
 
         return result;
     }
diff --git a/service/Core/DataFormats/Office/MsExcelDecoder.cs b/service/Core/DataFormats/Office/MsExcelDecoder.cs
index 5aea87a40..f7f04e82b 100644
--- a/service/Core/DataFormats/Office/MsExcelDecoder.cs
+++ b/service/Core/DataFormats/Office/MsExcelDecoder.cs
@@ -152,7 +152,7 @@ public Task DecodeAsync(Stream data, CancellationToken cancellation
             string worksheetContent = sb.ToString().Trim();
             sb.Clear();
 
-            result.Sections.Add(new Chunk(worksheetNumber, worksheetContent, Chunk.Meta(sentencesAreComplete: true)));
+            result.Sections.Add(new Chunk(worksheetContent, worksheetNumber, Chunk.Meta(sentencesAreComplete: true)));
         }
 
         return Task.FromResult(result);
diff --git a/service/Core/DataFormats/Office/MsPowerPointDecoder.cs b/service/Core/DataFormats/Office/MsPowerPointDecoder.cs
index 30c73c4bc..4118fda02 100644
--- a/service/Core/DataFormats/Office/MsPowerPointDecoder.cs
+++ b/service/Core/DataFormats/Office/MsPowerPointDecoder.cs
@@ -114,7 +114,7 @@ public Task DecodeAsync(Stream data, CancellationToken cancellation
                 string slideContent = sb.ToString().Trim();
                 sb.Clear();
 
-                result.Sections.Add(new Chunk(slideNumber, slideContent, Chunk.Meta(sentencesAreComplete: true)));
+                result.Sections.Add(new Chunk(slideContent, slideNumber, Chunk.Meta(sentencesAreComplete: true)));
             }
         }
diff --git a/service/Core/DataFormats/Office/MsWordDecoder.cs b/service/Core/DataFormats/Office/MsWordDecoder.cs
index eda5779e0..5f08de347 100644
--- a/service/Core/DataFormats/Office/MsWordDecoder.cs
+++ b/service/Core/DataFormats/Office/MsWordDecoder.cs
@@ -81,7 +81,7 @@ public Task DecodeAsync(Stream data, CancellationToken cancellation
             {
                 string pageContent = sb.ToString().Trim();
                 sb.Clear();
-                result.Sections.Add(new Chunk(pageNumber, pageContent, Chunk.Meta(sentencesAreComplete: true)));
+                result.Sections.Add(new Chunk(pageContent, pageNumber, Chunk.Meta(sentencesAreComplete: true)));
                 pageNumber++;
             }
 
@@ -90,7 +90,7 @@ public Task DecodeAsync(Stream data, CancellationToken cancellation
         }
 
         var lastPageContent = sb.ToString().Trim();
-        result.Sections.Add(new Chunk(pageNumber, lastPageContent, Chunk.Meta(sentencesAreComplete: true)));
+        result.Sections.Add(new Chunk(lastPageContent, pageNumber, Chunk.Meta(sentencesAreComplete: true)));
 
         return Task.FromResult(result);
     }
diff --git a/service/Core/DataFormats/Pdf/PdfDecoder.cs b/service/Core/DataFormats/Pdf/PdfDecoder.cs
index fa1317721..4a4185899 100644
--- a/service/Core/DataFormats/Pdf/PdfDecoder.cs
+++ b/service/Core/DataFormats/Pdf/PdfDecoder.cs
@@ -58,7 +58,7 @@ public Task DecodeAsync(Stream data, CancellationToken cancellation
         {
             // Note: no trimming, use original spacing
             string pageContent = ContentOrderTextExtractor.GetText(page) ?? string.Empty;
-            result.Sections.Add(new Chunk(page.Number, pageContent, Chunk.Meta(sentencesAreComplete: false)));
+            result.Sections.Add(new Chunk(pageContent, page.Number, Chunk.Meta(sentencesAreComplete: false)));
         }
 
         return Task.FromResult(result);
diff --git a/service/Core/DataFormats/Text/MarkDownDecoder.cs b/service/Core/DataFormats/Text/MarkDownDecoder.cs
index f2cc99ac4..6a45915fb 100644
--- a/service/Core/DataFormats/Text/MarkDownDecoder.cs
+++ b/service/Core/DataFormats/Text/MarkDownDecoder.cs
@@ -40,7 +40,7 @@ public Task DecodeAsync(BinaryData data, CancellationToken cancella
         this._log.LogDebug("Extracting text from markdown file");
 
         var result = new FileContent(MimeTypes.MarkDown);
-        result.Sections.Add(new(1, data.ToString().Trim(), Chunk.Meta(sentencesAreComplete: true)));
+        result.Sections.Add(new(data.ToString().Trim(), 1, Chunk.Meta(sentencesAreComplete: true)));
 
         return Task.FromResult(result)!;
     }
@@ -54,7 +54,7 @@ public async Task DecodeAsync(Stream data, CancellationToken cancel
         using var reader = new StreamReader(data);
         var content = await reader.ReadToEndAsync(cancellationToken).ConfigureAwait(false);
-        result.Sections.Add(new(1, content.Trim(), Chunk.Meta(sentencesAreComplete: true)));
+        result.Sections.Add(new(content.Trim(), 1, Chunk.Meta(sentencesAreComplete: true)));
 
         return result;
     }
 }
diff --git a/service/Core/DataFormats/Text/TextDecoder.cs b/service/Core/DataFormats/Text/TextDecoder.cs
index 67c8e27e7..cf95e89ae 100644
--- a/service/Core/DataFormats/Text/TextDecoder.cs
+++ b/service/Core/DataFormats/Text/TextDecoder.cs
@@ -43,7 +43,7 @@ public Task DecodeAsync(BinaryData data, CancellationToken cancella
         this._log.LogDebug("Extracting text from file");
 
         var result = new FileContent(MimeTypes.PlainText);
-        result.Sections.Add(new(1, data.ToString().Trim(), Chunk.Meta(sentencesAreComplete: true)));
+        result.Sections.Add(new(data.ToString().Trim(), 1, Chunk.Meta(sentencesAreComplete: true)));
 
         return Task.FromResult(result)!;
     }
@@ -57,7 +57,7 @@ public async Task DecodeAsync(Stream data, CancellationToken cancel
         using var reader = new StreamReader(data);
         var content = await reader.ReadToEndAsync(cancellationToken).ConfigureAwait(false);
-        result.Sections.Add(new(1, content.Trim(), Chunk.Meta(sentencesAreComplete: true)));
+        result.Sections.Add(new(content.Trim(), 1, Chunk.Meta(sentencesAreComplete: true)));
 
         return result;
     }
 }
diff --git a/service/Core/DataFormats/WebPages/HtmlDecoder.cs b/service/Core/DataFormats/WebPages/HtmlDecoder.cs
index 02ebfe68e..45bd67fe7 100644
--- a/service/Core/DataFormats/WebPages/HtmlDecoder.cs
+++ b/service/Core/DataFormats/WebPages/HtmlDecoder.cs
@@ -51,7 +51,7 @@ public Task DecodeAsync(Stream data, CancellationToken cancellation
         var doc = new HtmlDocument();
         doc.Load(data);
 
-        result.Sections.Add(new Chunk(1, doc.DocumentNode.InnerText.Trim(), Chunk.Meta(sentencesAreComplete: true)));
+        result.Sections.Add(new Chunk(doc.DocumentNode.InnerText.Trim(), 1, Chunk.Meta(sentencesAreComplete: true)));
 
         return Task.FromResult(result);
     }
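All the decoder changes above only swap the constructor argument order; consuming FileContent is unchanged. A small sketch of the reading side, assuming a `decoder` instance implementing the content decoder interface and that a SentencesAreComplete property surfaces the Chunk.Meta flag (both are assumptions, not shown in this diff):

```csharp
// Number still identifies the page/worksheet/slide; it is now the 2nd ctor argument
FileContent file = await decoder.DecodeAsync(stream, cancellationToken);
foreach (Chunk section in file.Sections)
{
    Console.WriteLine($"#{section.Number} complete={section.SentencesAreComplete}: {section.Content}");
}
```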
diff --git a/service/Core/Handlers/SummarizationHandler.cs b/service/Core/Handlers/SummarizationHandler.cs
index 1b7757fea..779388884 100644
--- a/service/Core/Handlers/SummarizationHandler.cs
+++ b/service/Core/Handlers/SummarizationHandler.cs
@@ -165,25 +165,9 @@ public SummarizationHandler(
         // By default, use 25% of the previous paragraph when summarizing a paragraph
         int maxTokensPerParagraph = textGenerator.MaxTokenTotal / 4;
 
-        // When splitting text in sentences take 100..500 tokens
-        // If possible allow 50% of the paragraph size, aka 12.5% of the model capacity.
-        int maxTokensPerLine = Math.Min(Math.Max(100, maxTokensPerParagraph / 2), 500);
-        // By default, use 6.2% of the model capacity for overlapping tokens
-        int overlappingTokens = maxTokensPerLine / 2;
-        // Allow to override the number of overlapping tokens using context arguments
-        var customOverlappingTokens = context.GetCustomSummaryOverlappingTokensOrDefault(-1);
-        if (customOverlappingTokens >= 0)
-        {
-            if (customOverlappingTokens > maxTokensPerLine / 2)
-            {
-                throw new ArgumentOutOfRangeException(
-                    $"Custom number of overlapping tokens is too large, the max value allowed is {maxTokensPerLine / 2}");
-            }
-
-            overlappingTokens = customOverlappingTokens;
-        }
+        var overlappingTokens = context.GetCustomSummaryOverlappingTokensOrDefault(textGenerator.MaxTokenTotal / 16);
 
         this._log.LogTrace("Overlap setting: {0} tokens", overlappingTokens);
 
@@ -214,7 +198,7 @@ public SummarizationHandler(
         }
         else
         {
-            paragraphs = this._plainTextChunker.Split(content, new PlainTextChunkerOptions() { MaxTokensPerChunk = maxTokensPerParagraph, Overlap = overlappingTokens });
+            paragraphs = this._plainTextChunker.Split(content, new PlainTextChunkerOptions { MaxTokensPerChunk = maxTokensPerParagraph, Overlap = overlappingTokens });
         }
 
         this._log.LogTrace("Paragraphs to summarize: {0}", paragraphs.Count);
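Worked numbers for the new summarization defaults, using an 8192-token model as an example; note the old 250-token ceiling on custom overlap values is gone, so callers are now trusted with whatever overlap they pass:

```csharp
// Defaults for a generator with an 8192-token capacity:
int maxTokenTotal = 8192;                      // textGenerator.MaxTokenTotal
int maxTokensPerParagraph = maxTokenTotal / 4; // 2048 tokens, 25% of capacity
int defaultOverlap = maxTokenTotal / 16;       // 512 tokens, 6.25% of capacity

// Old logic for the same model: maxTokensPerLine = Min(Max(100, 2048 / 2), 500) = 500,
// default overlap = 500 / 2 = 250, and custom values above 250 threw an exception.
```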
diff --git a/service/Core/Handlers/TextPartitioningHandler.cs b/service/Core/Handlers/TextPartitioningHandler.cs
index 49939277a..b06195434 100644
--- a/service/Core/Handlers/TextPartitioningHandler.cs
+++ b/service/Core/Handlers/TextPartitioningHandler.cs
@@ -22,6 +22,7 @@ public sealed class TextPartitioningHandler : IPipelineStepHandler
     private readonly ILogger _log;
     private readonly int _maxTokensPerPartition = int.MaxValue;
     private readonly PlainTextChunker _plainTextChunker;
+    private readonly MarkDownChunker _markDownChunker;
 
     /// <inheritdoc />
     public string StepName { get; }
@@ -43,6 +44,7 @@ public TextPartitioningHandler(
         this.StepName = stepName;
         this._orchestrator = orchestrator;
         this._plainTextChunker = new PlainTextChunker(new CL100KTokenizer());
+        this._markDownChunker = new MarkDownChunker(new CL100KTokenizer());
 
         this._options = options ?? new TextPartitioningOptions();
         this._options.Validate();
@@ -80,10 +82,10 @@ public TextPartitioningHandler(
         var context = pipeline.GetContext();
 
         // Allow to override the paragraph size using context arguments
-        var maxTokensPerParagraph = context.GetCustomPartitioningMaxTokensPerParagraphOrDefault(this._options.MaxTokensPerParagraph);
-        if (maxTokensPerParagraph > this._maxTokensPerPartition)
+        var maxTokensPerChunk = context.GetCustomPartitioningMaxTokensPerChunkOrDefault(this._options.MaxTokensPerParagraph);
+        if (maxTokensPerChunk > this._maxTokensPerPartition)
         {
-            throw ParagraphsTooBigForEmbeddingsException(maxTokensPerParagraph, this._maxTokensPerPartition, this._log);
+            throw ParagraphsTooBigForEmbeddingsException(maxTokensPerChunk, this._maxTokensPerPartition, this._log);
         }
 
         // Allow to override the number of overlapping tokens using context arguments
@@ -114,7 +116,6 @@ public TextPartitioningHandler(
 
         // Use a different partitioning strategy depending on the file type
         List<string> partitions;
-        // List<string> sentences;
         BinaryData partitionContent = await this._orchestrator.ReadFileAsync(pipeline, file.Name, cancellationToken).ConfigureAwait(false);
         string partitionsMimeType = MimeTypes.PlainText;
 
@@ -127,9 +128,8 @@ public TextPartitioningHandler(
             {
                 this._log.LogDebug("Partitioning text file {0}", file.Name);
                 string content = partitionContent.ToString();
-                partitions = this._plainTextChunker.Split(
-                    content,
-                    new PlainTextChunkerOptions { MaxTokensPerChunk = maxTokensPerParagraph, Overlap = overlappingTokens, ChunkHeader = chunkHeader });
+                partitions = this._plainTextChunker.Split(content,
+                    new PlainTextChunkerOptions { MaxTokensPerChunk = maxTokensPerChunk, Overlap = overlappingTokens, ChunkHeader = chunkHeader });
                 break;
             }
 
@@ -138,11 +138,8 @@ public TextPartitioningHandler(
                 this._log.LogDebug("Partitioning MarkDown file {0}", file.Name);
                 string content = partitionContent.ToString();
                 partitionsMimeType = MimeTypes.MarkDown;
-                // sentences = new List<string>();
-                partitions = new List<string>();
-                // sentences = TextChunker.SplitMarkDownLines(content, maxTokensPerLine: this._options.MaxTokensPerLine, tokenCounter: this._tokenCounter);
-                // partitions = TextChunker.SplitMarkdownParagraphs(
-                //     sentences, maxTokensPerParagraph: maxTokensPerParagraph, overlapTokens: overlappingTokens, tokenCounter: this._tokenCounter, chunkHeader: chunkHeader);
+                partitions = this._markDownChunker.Split(content,
+                    new MarkDownChunkerOptions { MaxTokensPerChunk = maxTokensPerChunk, Overlap = overlappingTokens, ChunkHeader = chunkHeader });
                 break;
             }
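Markdown files now go through the new MarkDownChunker, with the same option shape as plain text, instead of returning empty partitions. A sketch of direct usage mirroring the handler call above; `markdownText` and the option values are placeholders:

```csharp
// Direct use of the new markdown chunker, as wired up in TextPartitioningHandler
var chunker = new MarkDownChunker(new CL100KTokenizer());
List<string> chunks = chunker.Split(markdownText,
    new MarkDownChunkerOptions
    {
        MaxTokensPerChunk = 1000,            // replaces MaxTokensPerParagraph + MaxTokensPerLine
        Overlap = 100,                       // tokens repeated from the previous chunk
        ChunkHeader = "## Doc: manual.md\n", // optional prefix added to every chunk
    });
```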
- "MaxTokensPerLine": 300, - // Maximum length of paragraphs (aka partitions), in tokens. Tokens depend on the LLM in use. + // Maximum length of chunks in tokens. Tokens depend on the LLM in use. "MaxTokensPerParagraph": 1000, - // How many tokens from a paragraph to keep in the following paragraph. + // How many tokens from a chunk to keep in the following chunk. "OverlappingTokens": 100 }, // Note: keep the list empty in this file, to avoid unexpected merges diff --git a/tools/InteractiveSetup/Doctor/Check.cs b/tools/InteractiveSetup/Doctor/Check.cs index f56da090c..201043a9a 100644 --- a/tools/InteractiveSetup/Doctor/Check.cs +++ b/tools/InteractiveSetup/Doctor/Check.cs @@ -30,7 +30,7 @@ public static void Run() stats.AddSeparator(); // Partitioning - stats.Add("Text partitioning", $"Line:{config.DataIngestion.TextPartitioning.MaxTokensPerLine}; Paragraph:{config.DataIngestion.TextPartitioning.MaxTokensPerParagraph}; Overlapping:{config.DataIngestion.TextPartitioning.OverlappingTokens}"); + stats.Add("Text partitioning", $"Chunk:{config.DataIngestion.TextPartitioning.MaxTokensPerParagraph}; Overlapping:{config.DataIngestion.TextPartitioning.OverlappingTokens}"); // Image OCR stats.Add("Image OCR", string.IsNullOrWhiteSpace(config.DataIngestion.ImageOcrType) ? "Disabled" : config.DataIngestion.ImageOcrType);