From b52e02a32d485c8f3eec4a662a30a5fa4bcb7ec7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADctor=20Mu=C3=B1oz=20Molina?= Date: Mon, 18 Nov 2024 16:19:22 +0100 Subject: [PATCH] =?UTF-8?q?Added=20two=20document=20connectors=20to=20Sema?= =?UTF-8?q?nticKernel.Connectors.Document:=20`DocDocumentConnector`=20for?= =?UTF-8?q?=20=C2=B4.doc=C2=B4=20files=20and=20`HtmlDocumentConnector`=20f?= =?UTF-8?q?or=20`.html`=20files.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 3 ++ .../Connectors/DocDocumentConnector.cs | 44 +++++++++++++++++++ .../Connectors/HtmlDocumentConnector.cs | 42 ++++++++++++++++++ ....SemanticKernel.Connectors.Document.csproj | 2 + .../IServiceCollectionExtensions.cs | 20 +++++++++ 5 files changed, 111 insertions(+) create mode 100644 src/Encamina.Enmarcha.SemanticKernel.Connectors.Document/Connectors/DocDocumentConnector.cs create mode 100644 src/Encamina.Enmarcha.SemanticKernel.Connectors.Document/Connectors/HtmlDocumentConnector.cs diff --git a/CHANGELOG.md b/CHANGELOG.md index e5fd672..83b893c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -30,6 +30,9 @@ Previous classification is not required if changes are simple or all belong to t - Added the `AtLeastOneRequiredAttribute` Data Annotation to validate that at least one of the specified properties has a value. - Enchanced `JsonUtils` with new methods: `FastCheckIsJson` and `IsAnAdaptiveCard`. - Added new `AtLeastOneRequiredSchemaFilter` to ensure OpenAPI schemas enforce that at least one of the specified properties is required, by modifying the schema to use the `anyOf` rule in Swagger documentation generation. +- Added two new connectors to `Encamina.Enmarcha.SemanticKernel.Connectors.Document`: + - Document connector for reading `.doc` files: `DocDocumentConnector`. + - Document connector for reading `.html` files: `HtmlDocumentConnector`. 
## [8.1.8] diff --git a/src/Encamina.Enmarcha.SemanticKernel.Connectors.Document/Connectors/DocDocumentConnector.cs b/src/Encamina.Enmarcha.SemanticKernel.Connectors.Document/Connectors/DocDocumentConnector.cs new file mode 100644 index 0000000..616a8dd --- /dev/null +++ b/src/Encamina.Enmarcha.SemanticKernel.Connectors.Document/Connectors/DocDocumentConnector.cs @@ -0,0 +1,44 @@ +using System.Text; + +using CommunityToolkit.Diagnostics; + +using NPOI.HWPF; +using NPOI.HWPF.Extractor; + +namespace Encamina.Enmarcha.SemanticKernel.Connectors.Document.Connectors; + +/// <summary> +/// Extracts text from a document in the .doc format. +/// </summary> +public class DocDocumentConnector : IEnmarchaDocumentConnector +{ + /// <inheritdoc/> + public IReadOnlyList CompatibleFileFormats => [".DOC"]; + + /// <inheritdoc/> + public virtual string ReadText(Stream stream) + { + Guard.IsNotNull(stream); + + // Register the code pages encoding provider: .doc content may rely on Windows code-page encodings, which .NET only exposes once this provider is registered + Encoding.RegisterProvider(CodePagesEncodingProvider.Instance); + + var document = new HWPFDocument(stream); + + var extractor = new WordExtractor(document); + + return extractor.Text.Trim(); + } + + /// <inheritdoc/> + public virtual void Initialize(Stream stream) + { + // Intentionally a no-op: this connector only reads text; doing nothing instead of throwing keeps it substitutable for other connectors (Liskov Substitution Principle). + } + + /// <inheritdoc/> + public virtual void AppendText(Stream stream, string text) + { + // Intentionally a no-op: this connector only reads text; doing nothing instead of throwing keeps it substitutable for other connectors (Liskov Substitution Principle). 
+ } +} diff --git a/src/Encamina.Enmarcha.SemanticKernel.Connectors.Document/Connectors/HtmlDocumentConnector.cs b/src/Encamina.Enmarcha.SemanticKernel.Connectors.Document/Connectors/HtmlDocumentConnector.cs new file mode 100644 index 0000000..7352d27 --- /dev/null +++ b/src/Encamina.Enmarcha.SemanticKernel.Connectors.Document/Connectors/HtmlDocumentConnector.cs @@ -0,0 +1,42 @@ +using CommunityToolkit.Diagnostics; + +using HtmlAgilityPack; + +namespace Encamina.Enmarcha.SemanticKernel.Connectors.Document.Connectors; + +/// <summary> +/// Extracts text from a document in the .html format. +/// </summary> +public class HtmlDocumentConnector : IEnmarchaDocumentConnector +{ + /// <inheritdoc/> + public IReadOnlyList CompatibleFileFormats => [".HTML"]; + + /// <inheritdoc/> + public virtual string ReadText(Stream stream) + { + Guard.IsNotNull(stream); + + var htmlDoc = new HtmlDocument(); + htmlDoc.Load(stream); + + var text = htmlDoc.DocumentNode.InnerText.Trim(); + + // Decode HTML entities into plain characters (the tags themselves were already dropped by InnerText). NOTE(review): Trim() runs before decoding, so whitespace written as entities at either end is presumably left in place - confirm whether that is intended. + var cleanedText = HtmlEntity.DeEntitize(text); + + return cleanedText; + } + + /// <inheritdoc/> + public virtual void Initialize(Stream stream) + { + // Intentionally a no-op: this connector only reads text; doing nothing instead of throwing keeps it substitutable for other connectors (Liskov Substitution Principle). + } + + /// <inheritdoc/> + public virtual void AppendText(Stream stream, string text) + { + // Intentionally a no-op: this connector only reads text; doing nothing instead of throwing keeps it substitutable for other connectors (Liskov Substitution Principle). 
+ } +} diff --git a/src/Encamina.Enmarcha.SemanticKernel.Connectors.Document/Encamina.Enmarcha.SemanticKernel.Connectors.Document.csproj b/src/Encamina.Enmarcha.SemanticKernel.Connectors.Document/Encamina.Enmarcha.SemanticKernel.Connectors.Document.csproj index b1bd7cf..b11cdca 100644 --- a/src/Encamina.Enmarcha.SemanticKernel.Connectors.Document/Encamina.Enmarcha.SemanticKernel.Connectors.Document.csproj +++ b/src/Encamina.Enmarcha.SemanticKernel.Connectors.Document/Encamina.Enmarcha.SemanticKernel.Connectors.Document.csproj @@ -20,10 +20,12 @@ + + diff --git a/src/Encamina.Enmarcha.SemanticKernel.Connectors.Document/Extensions/IServiceCollectionExtensions.cs b/src/Encamina.Enmarcha.SemanticKernel.Connectors.Document/Extensions/IServiceCollectionExtensions.cs index e562463..d1462d3 100644 --- a/src/Encamina.Enmarcha.SemanticKernel.Connectors.Document/Extensions/IServiceCollectionExtensions.cs +++ b/src/Encamina.Enmarcha.SemanticKernel.Connectors.Document/Extensions/IServiceCollectionExtensions.cs @@ -88,6 +88,26 @@ public static IServiceCollection AddWordDocumentConnector(this IServiceCollectio return services.AddSingleton(); } + /// <summary> + /// Adds the <see cref="DocDocumentConnector"/> document connector implementation to the specified <see cref="IServiceCollection"/> as a singleton service. + /// </summary> + /// <param name="services">The <see cref="IServiceCollection"/> to add services to.</param> + /// <returns>The <see cref="IServiceCollection"/> so that additional calls can be chained.</returns> + public static IServiceCollection AddDocDocumentConnector(this IServiceCollection services) + { + return services.AddSingleton(); + } + + /// <summary> + /// Adds the <see cref="HtmlDocumentConnector"/> document connector implementation to the specified <see cref="IServiceCollection"/> as a singleton service. + /// </summary> + /// <param name="services">The <see cref="IServiceCollection"/> to add services to.</param> + /// <returns>The <see cref="IServiceCollection"/> so that additional calls can be chained.</returns> + public static IServiceCollection AddHtmlDocumentConnector(this IServiceCollection services) + { + return services.AddSingleton(); + } + /// /// Adds the implementation of to the specified as a singleton service. ///