Skip to content

Commit 87adf99

Browse files
committed
Refactoring: merge Fragment into Chunk
1 parent 94b5d0d commit 87adf99

File tree

34 files changed

+734
-713
lines changed

34 files changed

+734
-713
lines changed

.github/_typos.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@ extend-exclude = [
1717
"appsettings.Development.json",
1818
"appsettings.*.json.*",
1919
"AzureAISearchFilteringTest.cs",
20-
"KernelMemory.sln.DotSettings"
20+
"KernelMemory.sln.DotSettings",
21+
"doc1.txt",
2122
]
2223

2324
[default.extend-words]

KernelMemory.sln

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -720,7 +720,6 @@ Global
720720
{FD1EB2C1-581E-4EB8-AF4A-BC4773453226}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
721721
{FD1EB2C1-581E-4EB8-AF4A-BC4773453226}.Debug|Any CPU.Build.0 = Debug|Any CPU
722722
{FD1EB2C1-581E-4EB8-AF4A-BC4773453226}.Release|Any CPU.ActiveCfg = Release|Any CPU
723-
{FD1EB2C1-581E-4EB8-AF4A-BC4773453226}.Release|Any CPU.Build.0 = Release|Any CPU
724723
EndGlobalSection
725724
GlobalSection(SolutionProperties) = preSolution
726725
HideSolutionNode = FALSE

docs/how-to/custom-partitioning.md

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,6 @@ For example, with small models supporting up to 256 tokens, something like this
5959
...
6060
"TextPartitioning": {
6161
"MaxTokensPerParagraph": 256,
62-
"MaxTokensPerLine": 256,
6362
"OverlappingTokens": 50
6463
},
6564
...
@@ -74,7 +73,6 @@ var memory = new KernelMemoryBuilder()
7473
new TextPartitioningOptions
7574
{
7675
MaxTokensPerParagraph = 256,
77-
MaxTokensPerLine = 256,
7876
OverlappingTokens = 50
7977
})
8078
.Build<MemoryServerless>();

examples/102-dotnet-custom-partitioning-options/Program.cs

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,9 @@
77
.WithOpenAIDefaults(Environment.GetEnvironmentVariable("OPENAI_API_KEY")!)
88
.WithCustomTextPartitioningOptions(new TextPartitioningOptions
99
{
10-
// Max 99 tokens per sentence
11-
MaxTokensPerLine = 99,
12-
// When sentences are merged into paragraphs (aka partitions), stop at 299 tokens
10+
// When splitting text into chunks (aka partitions), stop at 299 tokens
1311
MaxTokensPerParagraph = 299,
14-
// Each paragraph contains the last 47 tokens from the previous one
12+
// Each chunk contains the last 47 tokens from the previous one
1513
OverlappingTokens = 47,
1614
})
1715
.Build<MemoryServerless>();

examples/108-dotnet-custom-content-decoders/Program.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ public Task<FileContent> DecodeAsync(Stream data, CancellationToken cancellation
9191
foreach (Page? page in pdfDocument.GetPages().Where(x => x != null))
9292
{
9393
string pageContent = (ContentOrderTextExtractor.GetText(page, options) ?? string.Empty).ReplaceLineEndings(" ");
94-
result.Sections.Add(new Chunk(page.Number, pageContent, Chunk.Meta(sentencesAreComplete: false)));
94+
result.Sections.Add(new Chunk(pageContent, page.Number, Chunk.Meta(sentencesAreComplete: false)));
9595
}
9696

9797
return Task.FromResult(result);

examples/207-dotnet-expanding-chunks-on-retrieval/Program.cs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ public static class Program
4040
public static async Task Main()
4141
{
4242
// Partition input text in chunks of 100 tokens
43-
const int PartitionSize = 100;
43+
const int Chunksize = 100;
4444

4545
// Search settings
4646
const string Query = "astrobiology";
@@ -59,8 +59,7 @@ public static async Task Main()
5959
// Customize memory records size (in tokens)
6060
var textPartitioningOptions = new TextPartitioningOptions
6161
{
62-
MaxTokensPerParagraph = PartitionSize,
63-
MaxTokensPerLine = PartitionSize,
62+
MaxTokensPerParagraph = Chunksize,
6463
OverlappingTokens = 0,
6564
};
6665

examples/210-KM-without-builder/appsettings.json

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -154,14 +154,11 @@
154154
"ImageOcrType": "None",
155155
// Partitioning / Chunking settings
156156
// How does the partitioning work?
157-
// * Given a document, text is extracted, and text is split in sentences, called "lines of text".
158-
// * Sentences are merged into paragraphs, called "partitions".
159-
// * For each partition, one (potentially more) memory is generated.
157+
// * Given a document, text is extracted, and text is split in tokens.
158+
// * Tokens are merged into chunks, called "partitions", sometimes called "paragraphs"
159+
// * For each chunk, one (potentially more) memory is generated.
160160
"TextPartitioning": {
161-
// Maximum length of lines of text (aka sentences), in tokens. Tokens depend on the LLM in use.
162-
// Sentences are grouped into paragraphs, see the next setting.
163-
"MaxTokensPerLine": 300,
164-
// Maximum length of paragraphs (aka partitions), in tokens. Tokens depend on the LLM in use.
161+
// Maximum length of chunks in tokens. Tokens depend on the LLM in use.
165162
"MaxTokensPerParagraph": 1000,
166163
// How many tokens from a paragraph to keep in the following paragraph.
167164
"OverlappingTokens": 100

extensions/Chunkers/Chunkers.UnitTests/Chunkers.UnitTests.csproj

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,10 @@
3535
<Content Include="doc1.txt">
3636
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
3737
</Content>
38+
<None Remove="doc2.md" />
39+
<Content Include="doc2.md">
40+
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
41+
</Content>
3842
</ItemGroup>
3943

4044
</Project>
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
// Copyright (c) Microsoft. All rights reserved.
2+
3+
using System.Diagnostics;
4+
using Microsoft.KernelMemory.AI;
5+
using Microsoft.KernelMemory.Chunkers;
6+
using Microsoft.KM.TestHelpers;
7+
using Xunit;
8+
using Xunit.Abstractions;
9+
10+
namespace Microsoft.Chunkers.UnitTests;
11+
12+
public class MarkDownChunkerManualTest(ITestOutputHelper output) : BaseUnitTestCase(output)
13+
{
14+
[Fact]
15+
[Trait("Category", "UnitTest")]
16+
[Trait("Category", "Chunking")]
17+
[Trait("Category", "Manual")]
18+
public void ItSplitsMarkdownInASensibleWay()
19+
{
20+
// Arrange
21+
string text = File.ReadAllText("doc2.md");
22+
text = $"{text}{text}";
23+
24+
// Act
25+
var w = new Stopwatch();
26+
w.Start();
27+
var chunks = new MarkDownChunker(new CL100KTokenizer()).Split(text, new MarkDownChunkerOptions { MaxTokensPerChunk = 600, Overlap = 60 });
28+
w.Stop();
29+
30+
Console.WriteLine($"Text length: {text.Length:N0} chars");
31+
Console.WriteLine($"Chunks: {chunks.Count}");
32+
Console.WriteLine($"Time: {w.ElapsedMilliseconds:N0} ms");
33+
34+
// Assert
35+
Assert.NotEmpty(chunks);
36+
DebugChunks(chunks, new CL100KTokenizer());
37+
}
38+
39+
private static void DebugChunks(IEnumerable<string> chunks, ITextTokenizer tokenizer)
40+
{
41+
var list = chunks.ToList();
42+
43+
for (int index = 0; index < list.Count; index++)
44+
{
45+
Console.WriteLine($"************************* {index}: [{tokenizer.CountTokens(list[index])} tokens] *****************************************");
46+
Console.WriteLine(list[index]);
47+
Console.WriteLine("***********************************************************************************");
48+
}
49+
}
50+
}

extensions/Chunkers/Chunkers.UnitTests/MarkDownChunkerTests.cs

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
using Microsoft.Chunkers.UnitTests.Helpers;
44
using Microsoft.KernelMemory.AI;
55
using Microsoft.KernelMemory.Chunkers;
6-
using Microsoft.KernelMemory.Chunkers.internals;
6+
using Microsoft.KernelMemory.DataFormats;
77
using Microsoft.KM.TestHelpers;
88
using Xunit;
99
using Xunit.Abstractions;
@@ -375,7 +375,7 @@ private static void DebugChunks(IEnumerable<string> chunks, ITextTokenizer token
375375
Console.WriteLine("----------------------------------");
376376
}
377377

378-
private static void DebugFragments(List<Fragment> fragments)
378+
private static void DebugFragments(List<Chunk> fragments)
379379
{
380380
if (fragments.Count == 0)
381381
{
@@ -384,8 +384,7 @@ private static void DebugFragments(List<Fragment> fragments)
384384

385385
for (int index = 0; index < fragments.Count; index++)
386386
{
387-
Fragment token = fragments[index];
388-
Console.WriteLine($"- {index}: Value: \"{token.Content}\"");
387+
Console.WriteLine($"- {index}: Value: \"{fragments[index].Content}\"");
389388
}
390389
}
391390

0 commit comments

Comments
 (0)