New chunking classes #996

Merged · 2 commits · Feb 6, 2025
3 changes: 2 additions & 1 deletion .github/_typos.toml
@@ -17,7 +17,8 @@ extend-exclude = [
"appsettings.Development.json",
"appsettings.*.json.*",
"AzureAISearchFilteringTest.cs",
"KernelMemory.sln.DotSettings"
"KernelMemory.sln.DotSettings",
"doc1.txt",
]

[default.extend-words]
13 changes: 13 additions & 0 deletions KernelMemory.sln
@@ -404,6 +404,10 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "images", "images", "{B7CC5E
infra\images\Pip.png = infra\images\Pip.png
EndProjectSection
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Chunkers", "extensions\Chunkers\Chunkers\Chunkers.csproj", "{BFF9BE1A-B0E4-4ABE-B384-01B200D4FEFB}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Chunkers.UnitTests", "extensions\Chunkers\Chunkers.UnitTests\Chunkers.UnitTests.csproj", "{FD1EB2C1-581E-4EB8-AF4A-BC4773453226}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
@@ -709,6 +713,13 @@ Global
{41A5A076-B35D-4191-B98C-65AD5782A108}.Debug|Any CPU.Build.0 = Debug|Any CPU
{41A5A076-B35D-4191-B98C-65AD5782A108}.Release|Any CPU.ActiveCfg = Release|Any CPU
{41A5A076-B35D-4191-B98C-65AD5782A108}.Release|Any CPU.Build.0 = Release|Any CPU
{BFF9BE1A-B0E4-4ABE-B384-01B200D4FEFB}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{BFF9BE1A-B0E4-4ABE-B384-01B200D4FEFB}.Debug|Any CPU.Build.0 = Debug|Any CPU
{BFF9BE1A-B0E4-4ABE-B384-01B200D4FEFB}.Release|Any CPU.ActiveCfg = Release|Any CPU
{BFF9BE1A-B0E4-4ABE-B384-01B200D4FEFB}.Release|Any CPU.Build.0 = Release|Any CPU
{FD1EB2C1-581E-4EB8-AF4A-BC4773453226}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{FD1EB2C1-581E-4EB8-AF4A-BC4773453226}.Debug|Any CPU.Build.0 = Debug|Any CPU
{FD1EB2C1-581E-4EB8-AF4A-BC4773453226}.Release|Any CPU.ActiveCfg = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
@@ -826,6 +837,8 @@ Global
{B8858AB4-5CB9-4CD8-A6A0-12847F792FF2} = {C2D3A947-B6F9-4306-BD42-21D8D1F42750}
{237B22CA-B757-43DF-9A0B-18DE7F4DA123} = {B488168B-AD86-4CC5-9D89-324B6EB743D9}
{B7CC5E82-AD91-488F-8C05-1ECD767D4A10} = {B488168B-AD86-4CC5-9D89-324B6EB743D9}
{BFF9BE1A-B0E4-4ABE-B384-01B200D4FEFB} = {155DA079-E267-49AF-973A-D1D44681970F}
{FD1EB2C1-581E-4EB8-AF4A-BC4773453226} = {3C17F42B-CFC8-4900-8CFB-88936311E919}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {CC136C62-115C-41D1-B414-F9473EFF6EA8}
2 changes: 2 additions & 0 deletions KernelMemory.sln.DotSettings
@@ -186,6 +186,7 @@
<s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002ECSharp_002ECodeStyle_002ESettingsUpgrade_002EMigrateThisQualifierSettings/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002ECSharp_002ECodeStyle_002ESettingsUpgrade_002EPredefinedNamingRulesToUserRulesUpgrade/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EUnitTestFramework_002ESettings_002EMigrations_002ERemoveBuildPolicyAlwaysMigration/@EntryIndexedValue">True</s:Boolean>
<s:String x:Key="/Default/Environment/UnitTesting/XunitProvider/TestDiscoveryFromArtifactsMethod/@EntryValue">TestRunner</s:String>
<s:Boolean x:Key="/Default/Housekeeping/Layout/SolBuilderDuoView/ShowBuildProgressInToolWindow/@EntryValue">False</s:Boolean>
<s:String x:Key="/Default/Housekeeping/UnitTestingMru/UnitTestSessionDefault/LogSeverity/@EntryValue">TRACE</s:String>
<s:Int64 x:Key="/Default/Housekeeping/UnitTestingMru/UnitTestSessionDefault/OutputLineNumberLimit/@EntryValue">8201</s:Int64>
@@ -246,6 +247,7 @@ public void It$SOMENAME$()
<s:Boolean x:Key="/Default/UserDictionary/Words/=AZUREBLOBS/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=AZUREIDENTITY/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=AZUREQUEUE/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=chunkers/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=CONNECTIONSTRING/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=daa/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=appsettings/@EntryIndexedValue">True</s:Boolean>
17 changes: 7 additions & 10 deletions docs/how-to/custom-partitioning.md
@@ -19,19 +19,18 @@ which uses settings defined in

The handler performs the following steps:

1. **Split text into lines**: If a line is too long, it stops and starts a new line.
2. **Form paragraphs**: Concatenate consecutive lines together up to a maximum paragraph size.
3. **Overlap**: When starting a new paragraph, retain a certain number of lines from the previous paragraph.
1. **Split text into chunks**
2. **Form paragraphs**: Concatenate consecutive chunks up to a maximum chunk size.
3. **Overlap**: When starting a new chunk, retain a certain number of tokens from the previous chunk (illustrated below).
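To make these steps concrete, here is a minimal sketch of the merge-and-overlap logic (illustrative only, not the handler's actual implementation; the method name and parameters are assumptions):

using System;
using System.Collections.Generic;
using System.Linq;

// Sketch: merge pre-tokenized text into chunks of at most maxTokensPerChunk tokens,
// carrying the last `overlap` tokens of each chunk into the next one.
static List<string> MergeWithOverlap(IReadOnlyList<string> tokens, int maxTokensPerChunk, int overlap)
{
    var chunks = new List<string>();
    int step = maxTokensPerChunk - overlap; // overlap must be smaller than the chunk size
    for (int start = 0; start < tokens.Count; start += step)
    {
        int length = Math.Min(maxTokensPerChunk, tokens.Count - start);
        chunks.Add(string.Concat(tokens.Skip(start).Take(length)));
        if (start + length >= tokens.Count) { break; }
    }

    return chunks;
}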

## Default Settings

The default values used by `TextPartitioningHandler` are:

| Setting | Value | Min | Max |
|------------------|-----------------|-----|------------------------|
| Paragraph length | 1000 tokens max | 1 | depends on the LLM |
| Line length | 300 tokens max | 1 | [paragraph length] |
| Overlap | 100 tokens | 0 | [paragraph length - 1] |
| Setting | Value | Min | Max |
|--------------|-----------------|-----|--------------------|
| Chunk length | 1000 tokens max | 1 | depends on the LLM |
| Overlap | 100 tokens | 0 | [chunk length - 1] |

Lengths are expressed in tokens, which depend on the large language model (LLM) in use and its
tokenization logic. KernelMemoryBuilder allows specifying a custom tokenizer for each LLM during setup.
@@ -59,7 +58,6 @@ For example, with small models supporting up to 256 tokens, something like this
...
"TextPartitioning": {
"MaxTokensPerParagraph": 256,
"MaxTokensPerLine": 256,
"OverlappingTokens": 50
},
...
@@ -74,7 +72,6 @@
new TextPartitioningOptions
{
MaxTokensPerParagraph = 256,
MaxTokensPerLine = 256,
OverlappingTokens = 50
})
.Build<MemoryServerless>();
6 changes: 2 additions & 4 deletions examples/102-dotnet-custom-partitioning-options/Program.cs
@@ -7,11 +7,9 @@
.WithOpenAIDefaults(Environment.GetEnvironmentVariable("OPENAI_API_KEY")!)
.WithCustomTextPartitioningOptions(new TextPartitioningOptions
{
// Max 99 tokens per sentence
MaxTokensPerLine = 99,
// When sentences are merged into paragraphs (aka partitions), stop at 299 tokens
// When splitting text into chunks (aka partitions), stop at 299 tokens
MaxTokensPerParagraph = 299,
// Each paragraph contains the last 47 tokens from the previous one
// Each chunk contains the last 47 tokens from the previous one
OverlappingTokens = 47,
})
.Build<MemoryServerless>();
2 changes: 1 addition & 1 deletion examples/108-dotnet-custom-content-decoders/Program.cs
@@ -91,7 +91,7 @@ public Task<FileContent> DecodeAsync(Stream data, CancellationToken cancellation
foreach (Page? page in pdfDocument.GetPages().Where(x => x != null))
{
string pageContent = (ContentOrderTextExtractor.GetText(page, options) ?? string.Empty).ReplaceLineEndings(" ");
result.Sections.Add(new FileSection(page.Number, pageContent, false));
result.Sections.Add(new Chunk(pageContent, page.Number, Chunk.Meta(sentencesAreComplete: false)));
}

return Task.FromResult(result);
10 changes: 5 additions & 5 deletions examples/205-dotnet-extract-text-from-docs/Program.cs
@@ -16,7 +16,7 @@
var msWordDecoder = new MsWordDecoder();
content = await msWordDecoder.DecodeAsync("mswordfile.docx");

foreach (FileSection section in content.Sections)
foreach (Chunk section in content.Sections)
{
Console.WriteLine($"Page: {section.Number}/{content.Sections.Count}");
Console.WriteLine(section.Content);
@@ -36,7 +36,7 @@
var msPowerPointDecoder = new MsPowerPointDecoder();
content = await msPowerPointDecoder.DecodeAsync("mspowerpointfile.pptx");

foreach (FileSection section in content.Sections)
foreach (Chunk section in content.Sections)
{
Console.WriteLine($"Slide: {section.Number}/{content.Sections.Count}");
Console.WriteLine(section.Content);
@@ -56,7 +56,7 @@
var msExcelDecoder = new MsExcelDecoder();
content = await msExcelDecoder.DecodeAsync("msexcelfile.xlsx");

foreach (FileSection section in content.Sections)
foreach (Chunk section in content.Sections)
{
Console.WriteLine($"Worksheet: {section.Number}/{content.Sections.Count}");
Console.WriteLine(section.Content);
@@ -76,7 +76,7 @@
var pdfDecoder = new PdfDecoder();
content = await pdfDecoder.DecodeAsync("file1.pdf");

foreach (FileSection section in content.Sections)
foreach (Chunk section in content.Sections)
{
Console.WriteLine($"Page: {section.Number}/{content.Sections.Count}");
Console.WriteLine(section.Content);
@@ -95,7 +95,7 @@

content = await pdfDecoder.DecodeAsync("file2.pdf");

foreach (FileSection section in content.Sections)
foreach (Chunk section in content.Sections)
{
Console.WriteLine($"Page: {section.Number}/{content.Sections.Count}");
Console.WriteLine(section.Content);
@@ -40,7 +40,7 @@ public static class Program
public static async Task Main()
{
// Partition input text in chunks of 100 tokens
const int PartitionSize = 100;
const int ChunkSize = 100;

// Search settings
const string Query = "astrobiology";
@@ -59,8 +59,7 @@ public static async Task Main()
// Customize memory records size (in tokens)
var textPartitioningOptions = new TextPartitioningOptions
{
MaxTokensPerParagraph = PartitionSize,
MaxTokensPerLine = PartitionSize,
MaxTokensPerParagraph = ChunkSize,
OverlappingTokens = 0,
};

11 changes: 4 additions & 7 deletions examples/210-KM-without-builder/appsettings.json
@@ -154,14 +154,11 @@
"ImageOcrType": "None",
// Partitioning / Chunking settings
// How does the partitioning work?
// * Given a document, text is extracted, and text is split in sentences, called "lines of text".
// * Sentences are merged into paragraphs, called "partitions".
// * For each partition, one (potentially more) memory is generated.
// * Given a document, text is extracted and split into tokens.
// * Tokens are merged into chunks, called "partitions", sometimes called "paragraphs".
// * For each chunk, one memory record (potentially more) is generated.
"TextPartitioning": {
// Maximum length of lines of text (aka sentences), in tokens. Tokens depend on the LLM in use.
// Sentences are grouped into paragraphs, see the next setting.
"MaxTokensPerLine": 300,
// Maximum length of paragraphs (aka partitions), in tokens. Tokens depend on the LLM in use.
// Maximum length of chunks in tokens. Tokens depend on the LLM in use.
"MaxTokensPerParagraph": 1000,
// How many tokens from a paragraph to keep in the following paragraph.
"OverlappingTokens": 100
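The same values can be set in code through TextPartitioningOptions, as shown in example 102 updated by this PR (a sketch mirroring the JSON above):

var partitioningOptions = new TextPartitioningOptions
{
    // Maximum chunk (aka partition/paragraph) length, in tokens
    MaxTokensPerParagraph = 1000,
    // Tokens carried over from each chunk into the next one
    OverlappingTokens = 100
};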
44 changes: 44 additions & 0 deletions extensions/Chunkers/Chunkers.UnitTests/Chunkers.UnitTests.csproj
@@ -0,0 +1,44 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<AssemblyName>Microsoft.Chunkers.UnitTests</AssemblyName>
<RootNamespace>Microsoft.Chunkers.UnitTests</RootNamespace>
<TargetFramework>net8.0</TargetFramework>
<RollForward>LatestMajor</RollForward>
<IsTestProject>true</IsTestProject>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<IsPackable>false</IsPackable>
<NoWarn>xUnit2013;CA1303;KMEXP00;</NoWarn>
</PropertyGroup>

<ItemGroup>
<PackageReference Include="Microsoft.Extensions.DependencyInjection" />
<PackageReference Include="Microsoft.NET.Test.Sdk" />
<PackageReference Include="Xunit.DependencyInjection" />
<PackageReference Include="Xunit.DependencyInjection.Logging" />
<PackageReference Include="xunit" />
<PackageReference Include="xunit.abstractions" />
<PackageReference Include="xunit.runner.visualstudio">
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
</PackageReference>
</ItemGroup>

<ItemGroup>
<ProjectReference Include="..\..\..\service\tests\TestHelpers\TestHelpers.csproj" />
<ProjectReference Include="..\Chunkers\Chunkers.csproj" />
</ItemGroup>

<ItemGroup>
<None Remove="doc1.txt" />
<Content Include="doc1.txt">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</Content>
<None Remove="doc2.md" />
<Content Include="doc2.md">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</Content>
</ItemGroup>

</Project>
32 changes: 32 additions & 0 deletions extensions/Chunkers/Chunkers.UnitTests/Helpers/FourCharsTestTokenizer.cs
@@ -0,0 +1,32 @@
// Copyright (c) Microsoft. All rights reserved.

using Microsoft.KernelMemory.AI;

namespace Microsoft.Chunkers.UnitTests.Helpers;

internal sealed class FourCharsTestTokenizer : ITextTokenizer
{
public int CountTokens(string text)
{
return (int)Math.Ceiling(text.Length / 4d);
}

public IReadOnlyList<string> GetTokens(string text)
{
var tokens = new List<string>((text.Length + 3) / 4);

Span<char> buffer = stackalloc char[4];
for (int i = 0; i < text.Length; i += 4)
{
int tokenLength = Math.Min(4, text.Length - i);
for (int j = 0; j < tokenLength; j++)
{
buffer[j] = text[i + j];
}

tokens.Add(new string(buffer.Slice(0, tokenLength)));
}

return tokens;
}
}
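A quick illustration of what this fixed-width tokenizer produces (usage sketch, not part of the test suite):

var tokenizer = new FourCharsTestTokenizer();
// "hello world" has 11 chars -> ceil(11 / 4) = 3 tokens of at most 4 chars each
Console.WriteLine(tokenizer.CountTokens("hello world"));                 // 3
Console.WriteLine(string.Join("|", tokenizer.GetTokens("hello world"))); // hell|o wo|rld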
20 changes: 20 additions & 0 deletions extensions/Chunkers/Chunkers.UnitTests/Helpers/OneCharTestTokenizer.cs
@@ -0,0 +1,20 @@
// Copyright (c) Microsoft. All rights reserved.

using Microsoft.KernelMemory.AI;

namespace Microsoft.Chunkers.UnitTests.Helpers;

internal sealed class OneCharTestTokenizer : ITextTokenizer
{
public int CountTokens(string text)
{
return text.Length;
}

public IReadOnlyList<string> GetTokens(string text)
{
var tokens = new List<string>(text.Length);
tokens.AddRange(text.Select(t => t.ToString()));
return tokens;
}
}
36 changes: 36 additions & 0 deletions extensions/Chunkers/Chunkers.UnitTests/Helpers/TwoCharsTestTokenizer.cs
@@ -0,0 +1,36 @@
// Copyright (c) Microsoft. All rights reserved.

using Microsoft.KernelMemory.AI;

namespace Microsoft.Chunkers.UnitTests.Helpers;

internal sealed class TwoCharsTestTokenizer : ITextTokenizer
{
public int CountTokens(string text)
{
return (int)Math.Ceiling(text.Length / 2d);
}

public IReadOnlyList<string> GetTokens(string text)
{
int length = text.Length;
var tokens = new List<string>(length / 2 + length % 2);

Span<char> buffer = stackalloc char[2];
for (int i = 0; i < length; i += 2)
{
buffer[0] = text[i];
if (i + 1 < length)
{
buffer[1] = text[i + 1];
tokens.Add(new string(buffer));
}
else
{
tokens.Add(text[i].ToString());
}
}

return tokens;
}
}
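Together, these helpers give deterministic token counts that make the chunker assertions easy to reason about; for the same input they behave as follows (illustrative only):

const string text = "hello world"; // 11 chars
Console.WriteLine(new OneCharTestTokenizer().CountTokens(text));   // 11
Console.WriteLine(new TwoCharsTestTokenizer().CountTokens(text));  // 6 = ceil(11 / 2)
Console.WriteLine(new FourCharsTestTokenizer().CountTokens(text)); // 3 = ceil(11 / 4)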
50 changes: 50 additions & 0 deletions extensions/Chunkers/Chunkers.UnitTests/MarkDownChunkerManualTest.cs
@@ -0,0 +1,50 @@
// Copyright (c) Microsoft. All rights reserved.

using System.Diagnostics;
using Microsoft.KernelMemory.AI;
using Microsoft.KernelMemory.Chunkers;
using Microsoft.KM.TestHelpers;
using Xunit;
using Xunit.Abstractions;

namespace Microsoft.Chunkers.UnitTests;

public class MarkDownChunkerManualTest(ITestOutputHelper output) : BaseUnitTestCase(output)
{
[Fact]
[Trait("Category", "UnitTest")]
[Trait("Category", "Chunking")]
[Trait("Category", "Manual")]
public void ItSplitsMarkdownInASensibleWay()
{
// Arrange
string text = File.ReadAllText("doc2.md");
text = $"{text}{text}";

// Act
var w = new Stopwatch();
w.Start();
var chunks = new MarkDownChunker(new CL100KTokenizer()).Split(text, new MarkDownChunkerOptions { MaxTokensPerChunk = 600, Overlap = 60 });
w.Stop();

Console.WriteLine($"Text length: {text.Length:N0} chars");
Console.WriteLine($"Chunks: {chunks.Count}");
Console.WriteLine($"Time: {w.ElapsedMilliseconds:N0} ms");

// Assert
Assert.NotEmpty(chunks);
DebugChunks(chunks, new CL100KTokenizer());
}

private static void DebugChunks(IEnumerable<string> chunks, ITextTokenizer tokenizer)
{
var list = chunks.ToList();

for (int index = 0; index < list.Count; index++)
{
Console.WriteLine($"************************* {index}: [{tokenizer.CountTokens(list[index])} tokens] *****************************************");
Console.WriteLine(list[index]);
Console.WriteLine("***********************************************************************************");
}
}
}
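Outside the test harness, the new chunker is used the same way; a minimal sketch based on the API exercised above (markdownText and the variable names are illustrative):

var tokenizer = new CL100KTokenizer();
var chunker = new MarkDownChunker(tokenizer);
var chunks = chunker.Split(markdownText, new MarkDownChunkerOptions { MaxTokensPerChunk = 600, Overlap = 60 });
foreach (var chunk in chunks)
{
    Console.WriteLine($"{tokenizer.CountTokens(chunk)} tokens: {chunk}");
}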