Skip to content

Commit

Permalink
Merge pull request #144 from Encamina/@vmunoz/add_html_and_doc_connec…
Browse files Browse the repository at this point in the history
…tors

Added two document connectors to SemanticKernel.Connectors.Document
  • Loading branch information
VictorMuMo authored Nov 19, 2024
2 parents e01c1eb + ed829dd commit 2cfec68
Show file tree
Hide file tree
Showing 6 changed files with 118 additions and 1 deletion.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@ Previous classification is not required if changes are simple or all belong to t
- Added the `AtLeastOneRequiredAttribute` Data Annotation to validate that at least one of the specified properties has a value.
- Enhanced `JsonUtils` with new methods: `FastCheckIsJson` and `IsAnAdaptiveCard`.
- Added new `AtLeastOneRequiredSchemaFilter` to ensure OpenAPI schemas enforce that at least one of the specified properties is required, by modifying the schema to use the `anyOf` rule in Swagger documentation generation.
- Added two new connectors to `Encamina.Enmarcha.SemanticKernel.Connectors.Document`:
- Document connector for reading `.doc` files: `DocDocumentConnector`.
- Document connector for reading `.html` files: `HtmlDocumentConnector`.

## [8.1.8]

Expand Down
2 changes: 1 addition & 1 deletion Directory.Build.props
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

<PropertyGroup>
<VersionPrefix>8.1.9</VersionPrefix>
<VersionSuffix>preview-02</VersionSuffix>
<VersionSuffix>preview-03</VersionSuffix>
</PropertyGroup>

<!--
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
using System.Text;

using CommunityToolkit.Diagnostics;

using NPOI.HWPF;
using NPOI.HWPF.Extractor;

namespace Encamina.Enmarcha.SemanticKernel.Connectors.Document.Connectors;

/// <summary>
/// Document connector that reads the text content of legacy Word documents (<c>.doc</c> files).
/// </summary>
public class DocDocumentConnector : IEnmarchaDocumentConnector
{
    /// <summary>
    /// Initializes a new instance of the <see cref="DocDocumentConnector"/> class.
    /// </summary>
    public DocDocumentConnector()
    {
        // Makes the code page encodings available, which are needed when reading .doc files.
        Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
    }

    /// <inheritdoc/>
    public IReadOnlyList<string> CompatibleFileFormats => [".DOC"];

    /// <inheritdoc/>
    public virtual string ReadText(Stream stream)
    {
        Guard.IsNotNull(stream);

        // Parse the binary Word document, then let the extractor produce its text.
        var wordDocument = new HWPFDocument(stream);
        var extractedText = new WordExtractor(wordDocument).Text;

        return extractedText.Trim();
    }

    /// <inheritdoc/>
    public virtual void Initialize(Stream stream)
    {
        // Deliberately empty: left unimplemented to comply with the Liskov Substitution Principle.
    }

    /// <inheritdoc/>
    public virtual void AppendText(Stream stream, string text)
    {
        // Deliberately empty: left unimplemented to comply with the Liskov Substitution Principle.
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
using CommunityToolkit.Diagnostics;

using HtmlAgilityPack;

namespace Encamina.Enmarcha.SemanticKernel.Connectors.Document.Connectors;

/// <summary>
/// Extracts text from a document in the <c>.html</c> format.
/// </summary>
public class HtmlDocumentConnector : IEnmarchaDocumentConnector
{
    /// <inheritdoc/>
    public IReadOnlyList<string> CompatibleFileFormats => [".HTML"];

    /// <inheritdoc/>
    /// <remarks>
    /// The returned value is the inner text of the parsed HTML document with HTML entities
    /// (for example <c>&amp;amp;</c> or <c>&amp;nbsp;</c>) decoded and with leading and trailing
    /// white space removed.
    /// </remarks>
    public virtual string ReadText(Stream stream)
    {
        Guard.IsNotNull(stream);

        var htmlDoc = new HtmlDocument();
        htmlDoc.Load(stream);

        // The inner text carries no markup, but entities are still encoded at this point,
        // so decode them into the characters they stand for.
        var decodedText = HtmlEntity.DeEntitize(htmlDoc.DocumentNode.InnerText);

        // Trim only after decoding; otherwise white space that comes from leading or trailing
        // entities (such as "&nbsp;") would survive in the result.
        return decodedText.Trim();
    }

    /// <inheritdoc/>
    public virtual void Initialize(Stream stream)
    {
        // Intentionally not implemented to comply with the Liskov Substitution Principle...
    }

    /// <inheritdoc/>
    public virtual void AppendText(Stream stream, string text)
    {
        // Intentionally not implemented to comply with the Liskov Substitution Principle...
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,12 @@

<ItemGroup>
<PackageReference Include="ExcelNumberFormat" Version="1.1.0" />
<PackageReference Include="HtmlAgilityPack" Version="1.11.71" />
<PackageReference Include="Microsoft.Extensions.Options.ConfigurationExtensions" Version="8.0.0" />
<PackageReference Include="Microsoft.Extensions.Options.DataAnnotations" Version="8.0.0" />
<PackageReference Include="Microsoft.SemanticKernel.Plugins.Document" Version="1.17.2-alpha" />
<PackageReference Include="PdfPig" Version="0.1.8" />
<PackageReference Include="ScratchPad.NPOI.HWPF" Version="2.5.7" />
<PackageReference Include="SixLabors.ImageSharp" Version="3.1.5" />
<PackageReference Include="System.Memory.Data" Version="8.0.0" />
</ItemGroup>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,26 @@ public static IServiceCollection AddWordDocumentConnector(this IServiceCollectio
return services.AddSingleton<IEnmarchaDocumentConnector, WordDocumentConnector>();
}

/// <summary>
/// Registers <see cref="DocDocumentConnector"/> as a singleton <see cref="IEnmarchaDocumentConnector"/> in the given <see cref="IServiceCollection"/>.
/// </summary>
/// <param name="services">The <see cref="IServiceCollection"/> that receives the registration.</param>
/// <returns>The same <see cref="IServiceCollection"/>, so that further calls can be chained.</returns>
public static IServiceCollection AddDocDocumentConnector(this IServiceCollection services)
    => services.AddSingleton<IEnmarchaDocumentConnector, DocDocumentConnector>();

/// <summary>
/// Registers <see cref="HtmlDocumentConnector"/> as a singleton <see cref="IEnmarchaDocumentConnector"/> in the given <see cref="IServiceCollection"/>.
/// </summary>
/// <param name="services">The <see cref="IServiceCollection"/> that receives the registration.</param>
/// <returns>The same <see cref="IServiceCollection"/>, so that further calls can be chained.</returns>
public static IServiceCollection AddHtmlDocumentConnector(this IServiceCollection services)
    => services.AddSingleton<IEnmarchaDocumentConnector, HtmlDocumentConnector>();

/// <summary>
/// Adds the <see cref="CleanPdfDocumentConnector"/> implementation of <see cref="IEnmarchaDocumentConnector"/> to the specified <see cref="IServiceCollection"/> as a singleton service.
/// </summary>
Expand Down

0 comments on commit 2cfec68

Please sign in to comment.