diff --git a/text-processing-libraries-modules/pdf-2/pom.xml b/text-processing-libraries-modules/pdf-2/pom.xml index 27e1d0b5b681..fd200e1c79f7 100644 --- a/text-processing-libraries-modules/pdf-2/pom.xml +++ b/text-processing-libraries-modules/pdf-2/pom.xml @@ -40,6 +40,11 @@ poi-ooxml ${poi-ooxml.version} + + org.apache.tika + tika-core + ${tika.version} + org.apache.logging.log4j log4j-api @@ -70,8 +75,9 @@ 5.5.13.3 7.2.3 3.0.1 - 3.0.0 + 3.0.4 5.2.5 + 3.1.0 2.20.0 2.20.0 diff --git a/text-processing-libraries-modules/pdf-2/src/test/java/com/baeldung/detect/PdfDetectUnitTest.java b/text-processing-libraries-modules/pdf-2/src/test/java/com/baeldung/detect/PdfDetectUnitTest.java new file mode 100644 index 000000000000..8d50de6d4594 --- /dev/null +++ b/text-processing-libraries-modules/pdf-2/src/test/java/com/baeldung/detect/PdfDetectUnitTest.java @@ -0,0 +1,64 @@ +package com.baeldung.detect; + +import org.apache.pdfbox.Loader; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.tika.Tika; +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.*; +import java.util.Objects; + +import com.itextpdf.commons.exceptions.ITextException; +import com.itextpdf.kernel.pdf.PdfDocument; +import com.itextpdf.kernel.pdf.PdfReader; + + +public class PdfDetectUnitTest { + + private static final File PDF_FILE = new File("src/test/resources/input.pdf"); + + @Test + void whenDetectPdfByPdfBox_thenCorrect() { + boolean isPdf; + try (PDDocument document = Loader.loadPDF(PDF_FILE)) { + isPdf = true; + } catch (IOException ioe) { + isPdf = false; + } + assertTrue(isPdf); + } + + @Test + void whenDetectPdfByItext_thenCorrect() { + boolean isPdf; + try (PdfDocument pdfDoc = new PdfDocument(new PdfReader(PDF_FILE))) { + isPdf = true; + } catch (ITextException | IOException e) { + isPdf = false; + } + assertTrue(isPdf); + } + + @Test + void whenDetectPdfByFileSignature_thenCorrect() throws IOException { + boolean isPdf = false; + try (InputStream fis = new BufferedInputStream(new FileInputStream(PDF_FILE))) { + byte[] bytes = new byte[5]; + if (fis.read(bytes) == 5) { + String header = new String(bytes); + isPdf = Objects.equals(header, "%PDF-"); + } + } + assertTrue(isPdf); + } + + @Test + void whenDetectPdfByTika_thenCorrect() throws IOException { + Tika tika = new Tika(); + boolean isPdf = Objects.equals(tika.detect(PDF_FILE), "application/pdf"); + assertTrue(isPdf); + } + +} \ No newline at end of file