Skip to content

Commit

Permalink
Refactoring: merge Fragment into Chunk
Browse files Browse the repository at this point in the history
  • Loading branch information
dluc committed Feb 6, 2025
1 parent 94b5d0d commit 291fd6b
Show file tree
Hide file tree
Showing 23 changed files with 706 additions and 638 deletions.
3 changes: 2 additions & 1 deletion .github/_typos.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ extend-exclude = [
"appsettings.Development.json",
"appsettings.*.json.*",
"AzureAISearchFilteringTest.cs",
"KernelMemory.sln.DotSettings"
"KernelMemory.sln.DotSettings",
"doc1.txt",
]

[default.extend-words]
Expand Down
1 change: 0 additions & 1 deletion KernelMemory.sln
Original file line number Diff line number Diff line change
Expand Up @@ -720,7 +720,6 @@ Global
{FD1EB2C1-581E-4EB8-AF4A-BC4773453226}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{FD1EB2C1-581E-4EB8-AF4A-BC4773453226}.Debug|Any CPU.Build.0 = Debug|Any CPU
{FD1EB2C1-581E-4EB8-AF4A-BC4773453226}.Release|Any CPU.ActiveCfg = Release|Any CPU
{FD1EB2C1-581E-4EB8-AF4A-BC4773453226}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
Expand Down
2 changes: 1 addition & 1 deletion examples/108-dotnet-custom-content-decoders/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ public Task<FileContent> DecodeAsync(Stream data, CancellationToken cancellation
foreach (Page? page in pdfDocument.GetPages().Where(x => x != null))
{
string pageContent = (ContentOrderTextExtractor.GetText(page, options) ?? string.Empty).ReplaceLineEndings(" ");
result.Sections.Add(new Chunk(page.Number, pageContent, Chunk.Meta(sentencesAreComplete: false)));
result.Sections.Add(new Chunk(pageContent, page.Number, Chunk.Meta(sentencesAreComplete: false)));
}

return Task.FromResult(result);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@
<Content Include="doc1.txt">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</Content>
<None Remove="doc2.md" />
<Content Include="doc2.md">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</Content>
</ItemGroup>

</Project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
// Copyright (c) Microsoft. All rights reserved.

using System.Diagnostics;
using Microsoft.KernelMemory.AI;
using Microsoft.KernelMemory.Chunkers;
using Microsoft.KM.TestHelpers;
using Xunit;
using Xunit.Abstractions;

namespace Microsoft.Chunkers.UnitTests;

public class MarkDownChunkerManualTest(ITestOutputHelper output) : BaseUnitTestCase(output)
{
    /// <summary>
    /// Manual sanity check: split a sample markdown document and print the
    /// resulting chunks so a human can judge whether the boundaries are sensible.
    /// </summary>
    [Fact]
    [Trait("Category", "UnitTest")]
    [Trait("Category", "Chunking")]
    [Trait("Category", "Manual")]
    public void ItSplitsMarkdownInASensibleWay()
    {
        // Arrange: load the sample file and double it so the split spans multiple chunks
        string text = File.ReadAllText("doc2.md");
        text = $"{text}{text}";

        // Act: time the split to keep an eye on chunker performance
        var stopwatch = Stopwatch.StartNew();
        var chunks = new MarkDownChunker(new CL100KTokenizer()).Split(text, new MarkDownChunkerOptions { MaxTokensPerChunk = 600, Overlap = 60 });
        stopwatch.Stop();

        Console.WriteLine($"Text length: {text.Length:N0} chars");
        Console.WriteLine($"Chunks: {chunks.Count}");
        Console.WriteLine($"Time: {stopwatch.ElapsedMilliseconds:N0} ms");

        // Assert: at minimum the split must produce something; quality is judged by eye
        Assert.NotEmpty(chunks);
        DebugChunks(chunks, new CL100KTokenizer());
    }

    // Prints each chunk with its index and token count for manual inspection.
    private static void DebugChunks(IEnumerable<string> chunks, ITextTokenizer tokenizer)
    {
        int index = 0;
        foreach (string chunk in chunks)
        {
            Console.WriteLine($"************************* {index}: [{tokenizer.CountTokens(chunk)} tokens] *****************************************");
            Console.WriteLine(chunk);
            Console.WriteLine("***********************************************************************************");
            index++;
        }
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
using Microsoft.Chunkers.UnitTests.Helpers;
using Microsoft.KernelMemory.AI;
using Microsoft.KernelMemory.Chunkers;
using Microsoft.KernelMemory.Chunkers.internals;
using Microsoft.KernelMemory.DataFormats;
using Microsoft.KM.TestHelpers;
using Xunit;
using Xunit.Abstractions;
Expand Down Expand Up @@ -375,7 +375,7 @@ private static void DebugChunks(IEnumerable<string> chunks, ITextTokenizer token
Console.WriteLine("----------------------------------");
}

private static void DebugFragments(List<Fragment> fragments)
private static void DebugFragments(List<Chunk> fragments)
{
if (fragments.Count == 0)
{
Expand All @@ -384,8 +384,7 @@ private static void DebugFragments(List<Fragment> fragments)

for (int index = 0; index < fragments.Count; index++)
{
Fragment token = fragments[index];
Console.WriteLine($"- {index}: Value: \"{token.Content}\"");
Console.WriteLine($"- {index}: Value: \"{fragments[index].Content}\"");
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ public void CanSplitVeryLargeDocumentsWithoutStackOverflowing()

// Assert
Assert.NotEmpty(chunks);
// DebugChunks(chunks, new CL100KTokenizer());
}

private static void DebugChunks(IEnumerable<string> chunks, ITextTokenizer tokenizer)
Expand Down
14 changes: 7 additions & 7 deletions extensions/Chunkers/Chunkers.UnitTests/PlainTextChunkerTest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
using Microsoft.KernelMemory.AI;
using Microsoft.KernelMemory.Chunkers;
using Microsoft.KernelMemory.Chunkers.internals;
using Microsoft.KernelMemory.DataFormats;
using Microsoft.KM.TestHelpers;
using Xunit;
using Xunit.Abstractions;
Expand Down Expand Up @@ -51,7 +52,7 @@ public void ItTokenizesText()
string text = "Hello, world!";

// Act
List<Fragment> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
List<Chunk> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
DebugFragments(fragments);

// Assert
Expand All @@ -72,7 +73,7 @@ public void ItHandlesConsecutiveSentenceSeparators()
string text = "Hello. . . world!!!!!!!!!!!!!";

// Act
List<Fragment> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
List<Chunk> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
DebugFragments(fragments);

// Assert
Expand All @@ -97,7 +98,7 @@ public void ItHandlesTailWithoutTermination1()
string text = "Hello";

// Act
List<Fragment> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
List<Chunk> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
DebugFragments(fragments);

// Assert
Expand All @@ -114,7 +115,7 @@ public void ItHandlesTailWithoutTermination2()
string text = "Hello!World";

// Act
List<Fragment> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
List<Chunk> fragments = new PlainTextChunker().SplitToFragments(text, s_separators);
DebugFragments(fragments);

// Assert
Expand Down Expand Up @@ -908,7 +909,7 @@ private static void DebugChunks(IEnumerable<string> chunks, ITextTokenizer token
Console.WriteLine("----------------------------------");
}

private static void DebugFragments(List<Fragment> fragments)
private static void DebugFragments(List<Chunk> fragments)
{
if (fragments.Count == 0)
{
Expand All @@ -917,8 +918,7 @@ private static void DebugFragments(List<Fragment> fragments)

for (int index = 0; index < fragments.Count; index++)
{
Fragment token = fragments[index];
Console.WriteLine($"- {index}: Value: \"{token.Content}\"");
Console.WriteLine($"- {index}: Value: \"{fragments[index].Content}\"");
}
}

Expand Down
Loading

0 comments on commit 291fd6b

Please sign in to comment.