Refactoring: merge Fragment into Chunk

dluc committed Feb 6, 2025
1 parent 94b5d0d commit 87adf99
Showing 34 changed files with 734 additions and 713 deletions.
3 changes: 2 additions & 1 deletion .github/_typos.toml
@@ -17,7 +17,8 @@ extend-exclude = [
"appsettings.Development.json",
"appsettings.*.json.*",
"AzureAISearchFilteringTest.cs",
"KernelMemory.sln.DotSettings"
"KernelMemory.sln.DotSettings",
"doc1.txt",
]

[default.extend-words]
1 change: 0 additions & 1 deletion KernelMemory.sln
@@ -720,7 +720,6 @@ Global
{FD1EB2C1-581E-4EB8-AF4A-BC4773453226}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{FD1EB2C1-581E-4EB8-AF4A-BC4773453226}.Debug|Any CPU.Build.0 = Debug|Any CPU
{FD1EB2C1-581E-4EB8-AF4A-BC4773453226}.Release|Any CPU.ActiveCfg = Release|Any CPU
{FD1EB2C1-581E-4EB8-AF4A-BC4773453226}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
2 changes: 0 additions & 2 deletions docs/how-to/custom-partitioning.md
@@ -59,7 +59,6 @@ For example, with small models supporting up to 256 tokens, something like this
...
"TextPartitioning": {
"MaxTokensPerParagraph": 256,
"MaxTokensPerLine": 256,
"OverlappingTokens": 50
},
...
@@ -74,7 +73,6 @@ var memory = new KernelMemoryBuilder()
new TextPartitioningOptions
{
MaxTokensPerParagraph = 256,
MaxTokensPerLine = 256,
OverlappingTokens = 50
})
.Build<MemoryServerless>();
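As a side note on the documentation change above: the same "TextPartitioning" section can also be bound from appsettings.json instead of being constructed inline. A minimal sketch, assuming the Microsoft.Extensions.Configuration JSON and Binder packages and a section path of "KernelMemory:DataIngestion:TextPartitioning" (the exact path depends on your configuration layout):

using System;
using Microsoft.Extensions.Configuration;
using Microsoft.KernelMemory;
using Microsoft.KernelMemory.Configuration;

// Bind the "TextPartitioning" section to TextPartitioningOptions.
// The section path below is an assumption; adjust it to match your appsettings.json layout.
var config = new ConfigurationBuilder()
    .AddJsonFile("appsettings.json")
    .Build();

TextPartitioningOptions partitioning =
    config.GetSection("KernelMemory:DataIngestion:TextPartitioning").Get<TextPartitioningOptions>()
    ?? new TextPartitioningOptions();

var memory = new KernelMemoryBuilder()
    .WithOpenAIDefaults(Environment.GetEnvironmentVariable("OPENAI_API_KEY")!)
    .WithCustomTextPartitioningOptions(partitioning)
    .Build<MemoryServerless>();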
6 changes: 2 additions & 4 deletions examples/102-dotnet-custom-partitioning-options/Program.cs
@@ -7,11 +7,9 @@
.WithOpenAIDefaults(Environment.GetEnvironmentVariable("OPENAI_API_KEY")!)
.WithCustomTextPartitioningOptions(new TextPartitioningOptions
{
// Max 99 tokens per sentence
MaxTokensPerLine = 99,
// When sentences are merged into paragraphs (aka partitions), stop at 299 tokens
// When splitting text into chunks (aka partitions), stop at 299 tokens
MaxTokensPerParagraph = 299,
// Each paragraph contains the last 47 tokens from the previous one
// Each chunk contains the last 47 tokens from the previous one
OverlappingTokens = 47,
})
.Build<MemoryServerless>();
2 changes: 1 addition & 1 deletion examples/108-dotnet-custom-content-decoders/Program.cs
@@ -91,7 +91,7 @@ public Task<FileContent> DecodeAsync(Stream data, CancellationToken cancellation
foreach (Page? page in pdfDocument.GetPages().Where(x => x != null))
{
string pageContent = (ContentOrderTextExtractor.GetText(page, options) ?? string.Empty).ReplaceLineEndings(" ");
result.Sections.Add(new Chunk(page.Number, pageContent, Chunk.Meta(sentencesAreComplete: false)));
result.Sections.Add(new Chunk(pageContent, page.Number, Chunk.Meta(sentencesAreComplete: false)));
}

return Task.FromResult(result);
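Note on the change above: the Chunk constructor call now takes the text first and the page number second. The following is a hypothetical illustration of the merged type's shape, inferred only from this call site and the tests further down; the actual Microsoft.KernelMemory Chunk class may have different members, overloads, and metadata handling:

using System.Collections.Generic;

// Hypothetical sketch only -- not the library's actual definition.
public class Chunk
{
    public string Content { get; }          // the chunk text (accessed as .Content in the tests below)
    public int Number { get; }              // page / partition number, now the second argument
    public IDictionary<string, object?> Metadata { get; }

    public Chunk(string content, int number, IDictionary<string, object?>? metadata = null)
    {
        this.Content = content;
        this.Number = number;
        this.Metadata = metadata ?? new Dictionary<string, object?>();
    }
}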
5 changes: 2 additions & 3 deletions examples/207-dotnet-expanding-chunks-on-retrieval/Program.cs
@@ -40,7 +40,7 @@ public static class Program
public static async Task Main()
{
// Partition input text in chunks of 100 tokens
const int PartitionSize = 100;
const int Chunksize = 100;

// Search settings
const string Query = "astrobiology";
@@ -59,8 +59,7 @@ public static async Task Main()
// Customize memory records size (in tokens)
var textPartitioningOptions = new TextPartitioningOptions
{
MaxTokensPerParagraph = PartitionSize,
MaxTokensPerLine = PartitionSize,
MaxTokensPerParagraph = Chunksize,
OverlappingTokens = 0,
};

11 changes: 4 additions & 7 deletions examples/210-KM-without-builder/appsettings.json
@@ -154,14 +154,11 @@
"ImageOcrType": "None",
// Partitioning / Chunking settings
// How does the partitioning work?
// * Given a document, text is extracted, and text is split in sentences, called "lines of text".
// * Sentences are merged into paragraphs, called "partitions".
// * For each partition, one (potentially more) memory is generated.
// * Given a document, text is extracted, and text is split in tokens.
// * Tokens are merged into chunks, called "partitions", sometimes called "paragraphs"
// * For each chunk, one (potentially more) memory is generated.
"TextPartitioning": {
// Maximum length of lines of text (aka sentences), in tokens. Tokens depend on the LLM in use.
// Sentences are grouped into paragraphs, see the next setting.
"MaxTokensPerLine": 300,
// Maximum length of paragraphs (aka partitions), in tokens. Tokens depend on the LLM in use.
// Maximum length of chunks in tokens. Tokens depend on the LLM in use.
"MaxTokensPerParagraph": 1000,
// How many tokens from a paragraph to keep in the following paragraph.
"OverlappingTokens": 100
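To make the comments above concrete, here is a minimal, self-contained sketch of token-based chunking with overlap. It is not the Kernel Memory implementation: whitespace splitting stands in for a real LLM tokenizer, and the class and method names are illustrative only.

using System;
using System.Collections.Generic;

internal static class ChunkingSketch
{
    // Split text into tokens, merge tokens into chunks of at most maxTokensPerChunk,
    // and carry overlappingTokens from the end of each chunk into the next one.
    public static List<string> Split(string text, int maxTokensPerChunk, int overlappingTokens)
    {
        string[] tokens = text.Split(' ', StringSplitOptions.RemoveEmptyEntries);
        var chunks = new List<string>();

        // Advance by (max - overlap) tokens so each new chunk repeats the tail of the previous one.
        int step = Math.Max(1, maxTokensPerChunk - overlappingTokens);
        for (int start = 0; start < tokens.Length; start += step)
        {
            int count = Math.Min(maxTokensPerChunk, tokens.Length - start);
            chunks.Add(string.Join(' ', tokens, start, count));
            if (start + count >= tokens.Length) { break; }
        }

        return chunks;
    }
}

// Example matching the settings above: ChunkingSketch.Split(documentText, 1000, 100),
// after which one memory record is generated per chunk.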
@@ -35,6 +35,10 @@
<Content Include="doc1.txt">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</Content>
<None Remove="doc2.md" />
<Content Include="doc2.md">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</Content>
</ItemGroup>

</Project>
@@ -0,0 +1,50 @@
// Copyright (c) Microsoft. All rights reserved.

using System.Diagnostics;
using Microsoft.KernelMemory.AI;
using Microsoft.KernelMemory.Chunkers;
using Microsoft.KM.TestHelpers;
using Xunit;
using Xunit.Abstractions;

namespace Microsoft.Chunkers.UnitTests;

public class MarkDownChunkerManualTest(ITestOutputHelper output) : BaseUnitTestCase(output)
{
[Fact]
[Trait("Category", "UnitTest")]
[Trait("Category", "Chunking")]
[Trait("Category", "Manual")]
public void ItSplitsMarkdownInASensibleWay()
{
// Arrange
string text = File.ReadAllText("doc2.md");
text = $"{text}{text}";

// Act
var w = new Stopwatch();
w.Start();
var chunks = new MarkDownChunker(new CL100KTokenizer()).Split(text, new MarkDownChunkerOptions { MaxTokensPerChunk = 600, Overlap = 60 });
w.Stop();

Console.WriteLine($"Text length: {text.Length:N0} chars");
Console.WriteLine($"Chunks: {chunks.Count}");
Console.WriteLine($"Time: {w.ElapsedMilliseconds:N0} ms");

// Assert
Assert.NotEmpty(chunks);
DebugChunks(chunks, new CL100KTokenizer());
}

private static void DebugChunks(IEnumerable<string> chunks, ITextTokenizer tokenizer)
{
var list = chunks.ToList();

for (int index = 0; index < list.Count; index++)
{
Console.WriteLine($"************************* {index}: [{tokenizer.CountTokens(list[index])} tokens] *****************************************");
Console.WriteLine(list[index]);
Console.WriteLine("***********************************************************************************");
}
}
}
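Outside the xUnit harness, the same call can be driven from a small console program. The sketch below is based only on the test above: the return type of Split is assumed to be an IEnumerable<string>, consistent with the DebugChunks helper's parameter, and "doc2.md" plus the 600/60 token values are simply the test's own illustrative inputs.

using System;
using System.IO;
using System.Linq;
using Microsoft.KernelMemory.AI;
using Microsoft.KernelMemory.Chunkers;

string markdown = File.ReadAllText("doc2.md");
var tokenizer = new CL100KTokenizer();

// Split the markdown into chunks of at most 600 tokens with a 60-token overlap.
var chunks = new MarkDownChunker(tokenizer)
    .Split(markdown, new MarkDownChunkerOptions { MaxTokensPerChunk = 600, Overlap = 60 })
    .ToList();

for (int i = 0; i < chunks.Count; i++)
{
    Console.WriteLine($"--- chunk {i}: {tokenizer.CountTokens(chunks[i])} tokens ---");
    Console.WriteLine(chunks[i]);
}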
@@ -3,7 +3,7 @@
using Microsoft.Chunkers.UnitTests.Helpers;
using Microsoft.KernelMemory.AI;
using Microsoft.KernelMemory.Chunkers;
using Microsoft.KernelMemory.Chunkers.internals;
using Microsoft.KernelMemory.DataFormats;
using Microsoft.KM.TestHelpers;
using Xunit;
using Xunit.Abstractions;
@@ -375,7 +375,7 @@ private static void DebugChunks(IEnumerable<string> chunks, ITextTokenizer token
Console.WriteLine("----------------------------------");
}

private static void DebugFragments(List<Fragment> fragments)
private static void DebugFragments(List<Chunk> fragments)
{
if (fragments.Count == 0)
{
@@ -384,8 +384,7 @@ private static void DebugFragments(List<Fragment> fragments)

for (int index = 0; index < fragments.Count; index++)
{
Fragment token = fragments[index];
Console.WriteLine($"- {index}: Value: \"{token.Content}\"");
Console.WriteLine($"- {index}: Value: \"{fragments[index].Content}\"");
}
}

@@ -33,6 +33,7 @@ public void CanSplitVeryLargeDocumentsWithoutStackOverflowing()

// Assert
Assert.NotEmpty(chunks);
// DebugChunks(chunks, new CL100KTokenizer());
}

private static void DebugChunks(IEnumerable<string> chunks, ITextTokenizer tokenizer)
14 changes: 7 additions & 7 deletions extensions/Chunkers/Chunkers.UnitTests/PlainTextChunkerTest.cs
@@ -4,6 +4,7 @@
using Microsoft.KernelMemory.AI;
using Microsoft.KernelMemory.Chunkers;
using Microsoft.KernelMemory.Chunkers.internals;
using Microsoft.KernelMemory.DataFormats;
using Microsoft.KM.TestHelpers;
using Xunit;
using Xunit.Abstractions;
@@ -51,7 +52,7 @@ public void ItTokenizesText()
string text = "Hello, world!";

// Act
List<Fragment> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
List<Chunk> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
DebugFragments(fragments);

// Assert
@@ -72,7 +73,7 @@ public void ItHandlesConsecutiveSentenceSeparators()
string text = "Hello. . . world!!!!!!!!!!!!!";

// Act
List<Fragment> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
List<Chunk> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
DebugFragments(fragments);

// Assert
@@ -97,7 +98,7 @@ public void ItHandlesTailWithoutTermination1()
string text = "Hello";

// Act
List<Fragment> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
List<Chunk> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
DebugFragments(fragments);

// Assert
@@ -114,7 +115,7 @@ public void ItHandlesTailWithoutTermination2()
string text = "Hello!World";

// Act
List<Fragment> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
List<Chunk> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
DebugFragments(fragments);

// Assert
@@ -908,7 +909,7 @@ private static void DebugChunks(IEnumerable<string> chunks, ITextTokenizer token
Console.WriteLine("----------------------------------");
}

private static void DebugFragments(List<Fragment> fragments)
private static void DebugFragments(List<Chunk> fragments)
{
if (fragments.Count == 0)
{
@@ -917,8 +918,7 @@ private static void DebugFragments(List<Fragment> fragments)

for (int index = 0; index < fragments.Count; index++)
{
Fragment token = fragments[index];
Console.WriteLine($"- {index}: Value: \"{token.Content}\"");
Console.WriteLine($"- {index}: Value: \"{fragments[index].Content}\"");
}
}
