Skip to content

Commit

Permalink
Added two document connectors to SemanticKernel.Connectors.Document: …
Browse files Browse the repository at this point in the history
…`DocDocumentConnector` for `.doc` files and `HtmlDocumentConnector` for `.html` files.
  • Loading branch information
vmunoz96 committed Nov 18, 2024
1 parent e01c1eb commit b52e02a
Show file tree
Hide file tree
Showing 5 changed files with 111 additions and 0 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@ Previous classification is not required if changes are simple or all belong to t
- Added the `AtLeastOneRequiredAttribute` Data Annotation to validate that at least one of the specified properties has a value.
- Enhanced `JsonUtils` with new methods: `FastCheckIsJson` and `IsAnAdaptiveCard`.
- Added new `AtLeastOneRequiredSchemaFilter` to ensure OpenAPI schemas enforce that at least one of the specified properties is required, by modifying the schema to use the `anyOf` rule in Swagger documentation generation.
- Added two new connectors to `Encamina.Enmarcha.SemanticKernel.Connectors.Document`:
  - Document connector for reading `.doc` files: `DocDocumentConnector`.
  - Document connector for reading `.html` files: `HtmlDocumentConnector`.

## [8.1.8]

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
using System.Text;

using CommunityToolkit.Diagnostics;

using NPOI.HWPF;
using NPOI.HWPF.Extractor;

namespace Encamina.Enmarcha.SemanticKernel.Connectors.Document.Connectors;

/// <summary>
/// Document connector that reads the text content of documents stored in the <c>.doc</c> format.
/// </summary>
public class DocDocumentConnector : IEnmarchaDocumentConnector
{
    /// <inheritdoc/>
    public IReadOnlyList<string> CompatibleFileFormats => [".DOC"];

    /// <inheritdoc/>
    public virtual string ReadText(Stream stream)
    {
        Guard.IsNotNull(stream);

        // The code pages provider has to be registered before the document is parsed,
        // so that the encodings needed by .doc files are available.
        Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);

        // Wrap the parsed document in an extractor and take its full text.
        var wordExtractor = new WordExtractor(new HWPFDocument(stream));
        var extractedText = wordExtractor.Text;

        // Surrounding whitespace is not part of the returned content.
        return extractedText.Trim();
    }

    /// <inheritdoc/>
    public virtual void Initialize(Stream stream)
    {
        // Deliberately left empty: intentionally not implemented to comply with the Liskov Substitution Principle.
    }

    /// <inheritdoc/>
    public virtual void AppendText(Stream stream, string text)
    {
        // Deliberately left empty: intentionally not implemented to comply with the Liskov Substitution Principle.
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
using CommunityToolkit.Diagnostics;

using HtmlAgilityPack;

namespace Encamina.Enmarcha.SemanticKernel.Connectors.Document.Connectors;

/// <summary>
/// Extracts text from a document in the <c>.html</c> format.
/// </summary>
public class HtmlDocumentConnector : IEnmarchaDocumentConnector
{
    /// <inheritdoc/>
    public IReadOnlyList<string> CompatibleFileFormats => [".HTML"];

    /// <inheritdoc/>
    public virtual string ReadText(Stream stream)
    {
        Guard.IsNotNull(stream);

        var htmlDoc = new HtmlDocument();
        htmlDoc.Load(stream);

        // The inner text of the root node is the document's text without the markup.
        // It may still contain HTML entities (e.g. "&amp;", "&nbsp;"), which are decoded below.
        var rawText = htmlDoc.DocumentNode.InnerText;

        // Decode the HTML entities into their characters. This does not strip tags; it only
        // translates entities.
        var decodedText = HtmlEntity.DeEntitize(rawText);

        // Trim only AFTER decoding: whitespace written as an entity at either end of the text
        // (such as "&nbsp;") becomes a real whitespace character once decoded, and would be
        // left in the result if the trim ran first.
        return decodedText.Trim();
    }

    /// <inheritdoc/>
    public virtual void Initialize(Stream stream)
    {
        // Intentionally not implemented to comply with the Liskov Substitution Principle...
    }

    /// <inheritdoc/>
    public virtual void AppendText(Stream stream, string text)
    {
        // Intentionally not implemented to comply with the Liskov Substitution Principle...
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,12 @@

<ItemGroup>
<PackageReference Include="ExcelNumberFormat" Version="1.1.0" />
<PackageReference Include="HtmlAgilityPack" Version="1.11.71" />
<PackageReference Include="Microsoft.Extensions.Options.ConfigurationExtensions" Version="8.0.0" />
<PackageReference Include="Microsoft.Extensions.Options.DataAnnotations" Version="8.0.0" />
<PackageReference Include="Microsoft.SemanticKernel.Plugins.Document" Version="1.17.2-alpha" />
<PackageReference Include="PdfPig" Version="0.1.8" />
<PackageReference Include="ScratchPad.NPOI.HWPF" Version="2.5.7" />
<PackageReference Include="SixLabors.ImageSharp" Version="3.1.5" />
<PackageReference Include="System.Memory.Data" Version="8.0.0" />
</ItemGroup>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,26 @@ public static IServiceCollection AddWordDocumentConnector(this IServiceCollectio
return services.AddSingleton<IEnmarchaDocumentConnector, WordDocumentConnector>();
}

/// <summary>
/// Registers <see cref="DocDocumentConnector"/> as a singleton <see cref="IEnmarchaDocumentConnector"/> in the given <see cref="IServiceCollection"/>.
/// </summary>
/// <param name="services">The <see cref="IServiceCollection"/> that receives the registration.</param>
/// <returns>The <see cref="IServiceCollection"/>, so that further registrations can be chained.</returns>
public static IServiceCollection AddDocDocumentConnector(this IServiceCollection services)
    => services.AddSingleton<IEnmarchaDocumentConnector, DocDocumentConnector>();

/// <summary>
/// Registers <see cref="HtmlDocumentConnector"/> as a singleton <see cref="IEnmarchaDocumentConnector"/> in the given <see cref="IServiceCollection"/>.
/// </summary>
/// <param name="services">The <see cref="IServiceCollection"/> that receives the registration.</param>
/// <returns>The <see cref="IServiceCollection"/>, so that further registrations can be chained.</returns>
public static IServiceCollection AddHtmlDocumentConnector(this IServiceCollection services)
    => services.AddSingleton<IEnmarchaDocumentConnector, HtmlDocumentConnector>();

/// <summary>
/// Adds the <see cref="CleanPdfDocumentConnector"/> implementation of <see cref="IEnmarchaDocumentConnector"/> to the specified <see cref="IServiceCollection"/> as a singleton service.
/// </summary>
Expand Down

0 comments on commit b52e02a

Please sign in to comment.