Refactoring: merge Fragment into Chunk

dluc committed Feb 6, 2025
1 parent 94b5d0d commit 87adf99
Showing 34 changed files with 734 additions and 713 deletions.
3 changes: 2 additions & 1 deletion .github/_typos.toml
@@ -17,7 +17,8 @@ extend-exclude = [
"appsettings.Development.json",
"appsettings.*.json.*",
"AzureAISearchFilteringTest.cs",
"KernelMemory.sln.DotSettings"
"KernelMemory.sln.DotSettings",
"doc1.txt",
]

[default.extend-words]
1 change: 0 additions & 1 deletion KernelMemory.sln
@@ -720,7 +720,6 @@ Global
{FD1EB2C1-581E-4EB8-AF4A-BC4773453226}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{FD1EB2C1-581E-4EB8-AF4A-BC4773453226}.Debug|Any CPU.Build.0 = Debug|Any CPU
{FD1EB2C1-581E-4EB8-AF4A-BC4773453226}.Release|Any CPU.ActiveCfg = Release|Any CPU
{FD1EB2C1-581E-4EB8-AF4A-BC4773453226}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
2 changes: 0 additions & 2 deletions docs/how-to/custom-partitioning.md
@@ -59,7 +59,6 @@ For example, with small models supporting up to 256 tokens, something like this
...
"TextPartitioning": {
"MaxTokensPerParagraph": 256,
"MaxTokensPerLine": 256,
"OverlappingTokens": 50
},
...
@@ -74,7 +73,6 @@ var memory = new KernelMemoryBuilder()
new TextPartitioningOptions
{
MaxTokensPerParagraph = 256,
MaxTokensPerLine = 256,
OverlappingTokens = 50
})
.Build<MemoryServerless>();
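As a side note on the documentation change above: the same "TextPartitioning" section can also be bound from appsettings.json instead of being constructed inline. A minimal sketch, assuming the Microsoft.Extensions.Configuration JSON and Binder packages and a section path of "KernelMemory:DataIngestion:TextPartitioning" (the exact path depends on your configuration layout):

using System;
using Microsoft.Extensions.Configuration;
using Microsoft.KernelMemory;
using Microsoft.KernelMemory.Configuration;

// Bind the "TextPartitioning" section to TextPartitioningOptions.
// The section path below is an assumption; adjust it to match your appsettings.json layout.
var config = new ConfigurationBuilder()
    .AddJsonFile("appsettings.json")
    .Build();

TextPartitioningOptions partitioning =
    config.GetSection("KernelMemory:DataIngestion:TextPartitioning").Get<TextPartitioningOptions>()
    ?? new TextPartitioningOptions();

var memory = new KernelMemoryBuilder()
    .WithOpenAIDefaults(Environment.GetEnvironmentVariable("OPENAI_API_KEY")!)
    .WithCustomTextPartitioningOptions(partitioning)
    .Build<MemoryServerless>();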
6 changes: 2 additions & 4 deletions examples/102-dotnet-custom-partitioning-options/Program.cs
@@ -7,11 +7,9 @@
.WithOpenAIDefaults(Environment.GetEnvironmentVariable("OPENAI_API_KEY")!)
.WithCustomTextPartitioningOptions(new TextPartitioningOptions
{
// Max 99 tokens per sentence
MaxTokensPerLine = 99,
// When sentences are merged into paragraphs (aka partitions), stop at 299 tokens
// When splitting text into chunks (aka partitions), stop at 299 tokens
MaxTokensPerParagraph = 299,
// Each paragraph contains the last 47 tokens from the previous one
// Each chunk contains the last 47 tokens from the previous one
OverlappingTokens = 47,
})
.Build<MemoryServerless>();
2 changes: 1 addition & 1 deletion examples/108-dotnet-custom-content-decoders/Program.cs
@@ -91,7 +91,7 @@ public Task<FileContent> DecodeAsync(Stream data, CancellationToken cancellation
foreach (Page? page in pdfDocument.GetPages().Where(x => x != null))
{
string pageContent = (ContentOrderTextExtractor.GetText(page, options) ?? string.Empty).ReplaceLineEndings(" ");
result.Sections.Add(new Chunk(page.Number, pageContent, Chunk.Meta(sentencesAreComplete: false)));
result.Sections.Add(new Chunk(pageContent, page.Number, Chunk.Meta(sentencesAreComplete: false)));
}

return Task.FromResult(result);
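Note on the change above: the Chunk constructor call now takes the text first and the page number second. The following is a hypothetical illustration of the merged type's shape, inferred only from this call site and the tests further down; the actual Microsoft.KernelMemory Chunk class may have different members, overloads, and metadata handling:

using System.Collections.Generic;

// Hypothetical sketch only -- not the library's actual definition.
public class Chunk
{
    public string Content { get; }          // the chunk text (accessed as .Content in the tests below)
    public int Number { get; }              // page / partition number, now the second argument
    public IDictionary<string, object?> Metadata { get; }

    public Chunk(string content, int number, IDictionary<string, object?>? metadata = null)
    {
        this.Content = content;
        this.Number = number;
        this.Metadata = metadata ?? new Dictionary<string, object?>();
    }
}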
5 changes: 2 additions & 3 deletions examples/207-dotnet-expanding-chunks-on-retrieval/Program.cs
@@ -40,7 +40,7 @@ public static class Program
public static async Task Main()
{
// Partition input text in chunks of 100 tokens
const int PartitionSize = 100;
const int Chunksize = 100;

// Search settings
const string Query = "astrobiology";
@@ -59,8 +59,7 @@ public static async Task Main()
// Customize memory records size (in tokens)
var textPartitioningOptions = new TextPartitioningOptions
{
MaxTokensPerParagraph = PartitionSize,
MaxTokensPerLine = PartitionSize,
MaxTokensPerParagraph = Chunksize,
OverlappingTokens = 0,
};

11 changes: 4 additions & 7 deletions examples/210-KM-without-builder/appsettings.json
@@ -154,14 +154,11 @@
"ImageOcrType": "None",
// Partitioning / Chunking settings
// How does the partitioning work?
// * Given a document, text is extracted, and text is split in sentences, called "lines of text".
// * Sentences are merged into paragraphs, called "partitions".
// * For each partition, one (potentially more) memory is generated.
// * Given a document, text is extracted, and text is split in tokens.
// * Tokens are merged into chunks, called "partitions", sometimes called "paragraphs"
// * For each chunk, one (potentially more) memory is generated.
"TextPartitioning": {
// Maximum length of lines of text (aka sentences), in tokens. Tokens depend on the LLM in use.
// Sentences are grouped into paragraphs, see the next setting.
"MaxTokensPerLine": 300,
// Maximum length of paragraphs (aka partitions), in tokens. Tokens depend on the LLM in use.
// Maximum length of chunks in tokens. Tokens depend on the LLM in use.
"MaxTokensPerParagraph": 1000,
// How many tokens from a paragraph to keep in the following paragraph.
"OverlappingTokens": 100
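To make the comments above concrete, here is a minimal, self-contained sketch of token-based chunking with overlap. It is not the Kernel Memory implementation: whitespace splitting stands in for a real LLM tokenizer, and the class and method names are illustrative only.

using System;
using System.Collections.Generic;

internal static class ChunkingSketch
{
    // Split text into tokens, merge tokens into chunks of at most maxTokensPerChunk,
    // and carry overlappingTokens from the end of each chunk into the next one.
    public static List<string> Split(string text, int maxTokensPerChunk, int overlappingTokens)
    {
        string[] tokens = text.Split(' ', StringSplitOptions.RemoveEmptyEntries);
        var chunks = new List<string>();

        // Advance by (max - overlap) tokens so each new chunk repeats the tail of the previous one.
        int step = Math.Max(1, maxTokensPerChunk - overlappingTokens);
        for (int start = 0; start < tokens.Length; start += step)
        {
            int count = Math.Min(maxTokensPerChunk, tokens.Length - start);
            chunks.Add(string.Join(' ', tokens, start, count));
            if (start + count >= tokens.Length) { break; }
        }

        return chunks;
    }
}

// Example matching the settings above: ChunkingSketch.Split(documentText, 1000, 100),
// after which one memory record is generated per chunk.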
@@ -35,6 +35,10 @@
<Content Include="doc1.txt">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</Content>
<None Remove="doc2.md" />
<Content Include="doc2.md">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</Content>
</ItemGroup>

</Project>
@@ -0,0 +1,50 @@
// Copyright (c) Microsoft. All rights reserved.

using System.Diagnostics;
using Microsoft.KernelMemory.AI;
using Microsoft.KernelMemory.Chunkers;
using Microsoft.KM.TestHelpers;
using Xunit;
using Xunit.Abstractions;

namespace Microsoft.Chunkers.UnitTests;

public class MarkDownChunkerManualTest(ITestOutputHelper output) : BaseUnitTestCase(output)
{
[Fact]
[Trait("Category", "UnitTest")]
[Trait("Category", "Chunking")]
[Trait("Category", "Manual")]
public void ItSplitsMarkdownInASensibleWay()
{
// Arrange
string text = File.ReadAllText("doc2.md");
text = $"{text}{text}";

// Act
var w = new Stopwatch();
w.Start();
var chunks = new MarkDownChunker(new CL100KTokenizer()).Split(text, new MarkDownChunkerOptions { MaxTokensPerChunk = 600, Overlap = 60 });
w.Stop();

Console.WriteLine($"Text length: {text.Length:N0} chars");
Console.WriteLine($"Chunks: {chunks.Count}");
Console.WriteLine($"Time: {w.ElapsedMilliseconds:N0} ms");

// Assert
Assert.NotEmpty(chunks);
DebugChunks(chunks, new CL100KTokenizer());
}

private static void DebugChunks(IEnumerable<string> chunks, ITextTokenizer tokenizer)
{
var list = chunks.ToList();

for (int index = 0; index < list.Count; index++)
{
Console.WriteLine($"************************* {index}: [{tokenizer.CountTokens(list[index])} tokens] *****************************************");
Console.WriteLine(list[index]);
Console.WriteLine("***********************************************************************************");
}
}
}
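Outside the xUnit harness, the same call can be driven from a small console program. The sketch below is based only on the test above: the return type of Split is assumed to be an IEnumerable<string>, consistent with the DebugChunks helper's parameter, and "doc2.md" plus the 600/60 token values are simply the test's own illustrative inputs.

using System;
using System.IO;
using System.Linq;
using Microsoft.KernelMemory.AI;
using Microsoft.KernelMemory.Chunkers;

string markdown = File.ReadAllText("doc2.md");
var tokenizer = new CL100KTokenizer();

// Split the markdown into chunks of at most 600 tokens with a 60-token overlap.
var chunks = new MarkDownChunker(tokenizer)
    .Split(markdown, new MarkDownChunkerOptions { MaxTokensPerChunk = 600, Overlap = 60 })
    .ToList();

for (int i = 0; i < chunks.Count; i++)
{
    Console.WriteLine($"--- chunk {i}: {tokenizer.CountTokens(chunks[i])} tokens ---");
    Console.WriteLine(chunks[i]);
}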
@@ -3,7 +3,7 @@
using Microsoft.Chunkers.UnitTests.Helpers;
using Microsoft.KernelMemory.AI;
using Microsoft.KernelMemory.Chunkers;
using Microsoft.KernelMemory.Chunkers.internals;
using Microsoft.KernelMemory.DataFormats;
using Microsoft.KM.TestHelpers;
using Xunit;
using Xunit.Abstractions;
@@ -375,7 +375,7 @@ private static void DebugChunks(IEnumerable<string> chunks, ITextTokenizer token
Console.WriteLine("----------------------------------");
}

private static void DebugFragments(List<Fragment> fragments)
private static void DebugFragments(List<Chunk> fragments)
{
if (fragments.Count == 0)
{
@@ -384,8 +384,7 @@ private static void DebugFragments(List<Fragment> fragments)

for (int index = 0; index < fragments.Count; index++)
{
Fragment token = fragments[index];
Console.WriteLine($"- {index}: Value: \"{token.Content}\"");
Console.WriteLine($"- {index}: Value: \"{fragments[index].Content}\"");
}
}

@@ -33,6 +33,7 @@ public void CanSplitVeryLargeDocumentsWithoutStackOverflowing()

// Assert
Assert.NotEmpty(chunks);
// DebugChunks(chunks, new CL100KTokenizer());
}

private static void DebugChunks(IEnumerable<string> chunks, ITextTokenizer tokenizer)
14 changes: 7 additions & 7 deletions extensions/Chunkers/Chunkers.UnitTests/PlainTextChunkerTest.cs
@@ -4,6 +4,7 @@
using Microsoft.KernelMemory.AI;
using Microsoft.KernelMemory.Chunkers;
using Microsoft.KernelMemory.Chunkers.internals;
using Microsoft.KernelMemory.DataFormats;
using Microsoft.KM.TestHelpers;
using Xunit;
using Xunit.Abstractions;
@@ -51,7 +52,7 @@ public void ItTokenizesText()
string text = "Hello, world!";

// Act
List<Fragment> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
List<Chunk> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
DebugFragments(fragments);

// Assert
@@ -72,7 +73,7 @@ public void ItHandlesConsecutiveSentenceSeparators()
string text = "Hello. . . world!!!!!!!!!!!!!";

// Act
List<Fragment> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
List<Chunk> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
DebugFragments(fragments);

// Assert
@@ -97,7 +98,7 @@ public void ItHandlesTailWithoutTermination1()
string text = "Hello";

// Act
List<Fragment> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
List<Chunk> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
DebugFragments(fragments);

// Assert
@@ -114,7 +115,7 @@ public void ItHandlesTailWithoutTermination2()
string text = "Hello!World";

// Act
List<Fragment> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
List<Chunk> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
DebugFragments(fragments);

// Assert
@@ -908,7 +909,7 @@ private static void DebugChunks(IEnumerable<string> chunks, ITextTokenizer token
Console.WriteLine("----------------------------------");
}

private static void DebugFragments(List<Fragment> fragments)
private static void DebugFragments(List<Chunk> fragments)
{
if (fragments.Count == 0)
{
@@ -917,8 +918,7 @@ private static void DebugFragments(List<Fragment> fragments)

for (int index = 0; index < fragments.Count; index++)
{
Fragment token = fragments[index];
Console.WriteLine($"- {index}: Value: \"{token.Content}\"");
Console.WriteLine($"- {index}: Value: \"{fragments[index].Content}\"");
}
}
