Skip to content

Commit

Permalink
Refactoring: merge Fragment into Chunk
Browse files Browse the repository at this point in the history
  • Loading branch information
dluc committed Feb 6, 2025
1 parent 94b5d0d commit 291fd6b
Show file tree
Hide file tree
Showing 23 changed files with 706 additions and 638 deletions.
3 changes: 2 additions & 1 deletion .github/_typos.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ extend-exclude = [
"appsettings.Development.json",
"appsettings.*.json.*",
"AzureAISearchFilteringTest.cs",
"KernelMemory.sln.DotSettings"
"KernelMemory.sln.DotSettings",
"doc1.txt",
]

[default.extend-words]
Expand Down
1 change: 0 additions & 1 deletion KernelMemory.sln
Original file line number Diff line number Diff line change
Expand Up @@ -720,7 +720,6 @@ Global
{FD1EB2C1-581E-4EB8-AF4A-BC4773453226}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{FD1EB2C1-581E-4EB8-AF4A-BC4773453226}.Debug|Any CPU.Build.0 = Debug|Any CPU
{FD1EB2C1-581E-4EB8-AF4A-BC4773453226}.Release|Any CPU.ActiveCfg = Release|Any CPU
{FD1EB2C1-581E-4EB8-AF4A-BC4773453226}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
Expand Down
2 changes: 1 addition & 1 deletion examples/108-dotnet-custom-content-decoders/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ public Task<FileContent> DecodeAsync(Stream data, CancellationToken cancellation
foreach (Page? page in pdfDocument.GetPages().Where(x => x != null))
{
string pageContent = (ContentOrderTextExtractor.GetText(page, options) ?? string.Empty).ReplaceLineEndings(" ");
result.Sections.Add(new Chunk(page.Number, pageContent, Chunk.Meta(sentencesAreComplete: false)));
result.Sections.Add(new Chunk(pageContent, page.Number, Chunk.Meta(sentencesAreComplete: false)));
}

return Task.FromResult(result);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@
<Content Include="doc1.txt">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</Content>
<None Remove="doc2.md" />
<Content Include="doc2.md">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</Content>
</ItemGroup>

</Project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
// Copyright (c) Microsoft. All rights reserved.

using System.Diagnostics;
using Microsoft.KernelMemory.AI;
using Microsoft.KernelMemory.Chunkers;
using Microsoft.KM.TestHelpers;
using Xunit;
using Xunit.Abstractions;

namespace Microsoft.Chunkers.UnitTests;

public class MarkDownChunkerManualTest(ITestOutputHelper output) : BaseUnitTestCase(output)
{
    /// <summary>
    /// Manual sanity check: split a sample markdown document and print the
    /// resulting chunks so a human can judge whether the boundaries are sensible.
    /// </summary>
    [Fact]
    [Trait("Category", "UnitTest")]
    [Trait("Category", "Chunking")]
    [Trait("Category", "Manual")]
    public void ItSplitsMarkdownInASensibleWay()
    {
        // Arrange: load the sample file and double it so the split spans multiple chunks
        string text = File.ReadAllText("doc2.md");
        text = $"{text}{text}";

        // Act: time the split to keep an eye on chunker performance
        var stopwatch = Stopwatch.StartNew();
        var chunks = new MarkDownChunker(new CL100KTokenizer()).Split(text, new MarkDownChunkerOptions { MaxTokensPerChunk = 600, Overlap = 60 });
        stopwatch.Stop();

        Console.WriteLine($"Text length: {text.Length:N0} chars");
        Console.WriteLine($"Chunks: {chunks.Count}");
        Console.WriteLine($"Time: {stopwatch.ElapsedMilliseconds:N0} ms");

        // Assert: at minimum the split must produce something; quality is judged by eye
        Assert.NotEmpty(chunks);
        DebugChunks(chunks, new CL100KTokenizer());
    }

    // Prints each chunk with its index and token count for manual inspection.
    private static void DebugChunks(IEnumerable<string> chunks, ITextTokenizer tokenizer)
    {
        int index = 0;
        foreach (string chunk in chunks)
        {
            Console.WriteLine($"************************* {index}: [{tokenizer.CountTokens(chunk)} tokens] *****************************************");
            Console.WriteLine(chunk);
            Console.WriteLine("***********************************************************************************");
            index++;
        }
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
using Microsoft.Chunkers.UnitTests.Helpers;
using Microsoft.KernelMemory.AI;
using Microsoft.KernelMemory.Chunkers;
using Microsoft.KernelMemory.Chunkers.internals;
using Microsoft.KernelMemory.DataFormats;
using Microsoft.KM.TestHelpers;
using Xunit;
using Xunit.Abstractions;
Expand Down Expand Up @@ -375,7 +375,7 @@ private static void DebugChunks(IEnumerable<string> chunks, ITextTokenizer token
Console.WriteLine("----------------------------------");
}

private static void DebugFragments(List<Fragment> fragments)
private static void DebugFragments(List<Chunk> fragments)
{
if (fragments.Count == 0)
{
Expand All @@ -384,8 +384,7 @@ private static void DebugFragments(List<Fragment> fragments)

for (int index = 0; index < fragments.Count; index++)
{
Fragment token = fragments[index];
Console.WriteLine($"- {index}: Value: \"{token.Content}\"");
Console.WriteLine($"- {index}: Value: \"{fragments[index].Content}\"");
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ public void CanSplitVeryLargeDocumentsWithoutStackOverflowing()

// Assert
Assert.NotEmpty(chunks);
// DebugChunks(chunks, new CL100KTokenizer());
}

private static void DebugChunks(IEnumerable<string> chunks, ITextTokenizer tokenizer)
Expand Down
14 changes: 7 additions & 7 deletions extensions/Chunkers/Chunkers.UnitTests/PlainTextChunkerTest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
using Microsoft.KernelMemory.AI;
using Microsoft.KernelMemory.Chunkers;
using Microsoft.KernelMemory.Chunkers.internals;
using Microsoft.KernelMemory.DataFormats;
using Microsoft.KM.TestHelpers;
using Xunit;
using Xunit.Abstractions;
Expand Down Expand Up @@ -51,7 +52,7 @@ public void ItTokenizesText()
string text = "Hello, world!";

// Act
List<Fragment> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
List<Chunk> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
DebugFragments(fragments);

// Assert
Expand All @@ -72,7 +73,7 @@ public void ItHandlesConsecutiveSentenceSeparators()
string text = "Hello. . . world!!!!!!!!!!!!!";

// Act
List<Fragment> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
List<Chunk> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
DebugFragments(fragments);

// Assert
Expand All @@ -97,7 +98,7 @@ public void ItHandlesTailWithoutTermination1()
string text = "Hello";

// Act
List<Fragment> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
List<Chunk> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
DebugFragments(fragments);

// Assert
Expand All @@ -114,7 +115,7 @@ public void ItHandlesTailWithoutTermination2()
string text = "Hello!World";

// Act
List<Fragment> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
List<Chunk> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
DebugFragments(fragments);

// Assert
Expand Down Expand Up @@ -908,7 +909,7 @@ private static void DebugChunks(IEnumerable<string> chunks, ITextTokenizer token
Console.WriteLine("----------------------------------");
}

private static void DebugFragments(List<Fragment> fragments)
private static void DebugFragments(List<Chunk> fragments)
{
if (fragments.Count == 0)
{
Expand All @@ -917,8 +918,7 @@ private static void DebugFragments(List<Fragment> fragments)

for (int index = 0; index < fragments.Count; index++)
{
Fragment token = fragments[index];
Console.WriteLine($"- {index}: Value: \"{token.Content}\"");
Console.WriteLine($"- {index}: Value: \"{fragments[index].Content}\"");
}
}

Expand Down
Loading

0 comments on commit 291fd6b

Please sign in to comment.