Skip to content

Commit 4d3b9d3

Browse files
committed
BAEL-9267: Determine If a File is a PDF File in Java
1 parent 69e0343 commit 4d3b9d3

File tree

2 files changed

+74
-1
lines changed

2 files changed

+74
-1
lines changed

text-processing-libraries-modules/pdf-2/pom.xml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,11 @@
4040
<artifactId>poi-ooxml</artifactId>
4141
<version>${poi-ooxml.version}</version>
4242
</dependency>
43+
<dependency>
44+
<groupId>org.apache.tika</groupId>
45+
<artifactId>tika-core</artifactId>
46+
<version>${tika.version}</version>
47+
</dependency>
4348
<dependency>
4449
<groupId>org.apache.logging.log4j</groupId>
4550
<artifactId>log4j-api</artifactId>
@@ -70,8 +75,9 @@
7075
<itextpdf.version>5.5.13.3</itextpdf.version>
7176
<itextpdf.core.version>7.2.3</itextpdf.core.version>
7277
<itextpdf.cleanup.version>3.0.1</itextpdf.cleanup.version>
73-
<pdfbox.version>3.0.0</pdfbox.version>
78+
<pdfbox.version>3.0.4</pdfbox.version>
7479
<poi-ooxml.version>5.2.5</poi-ooxml.version>
80+
<tika.version>3.1.0</tika.version>
7581
<log4j-api.version>2.20.0</log4j-api.version>
7682
<log4j-core.version>2.20.0</log4j-core.version>
7783
</properties>
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
package com.baeldung.detect;
2+
3+
import org.apache.pdfbox.Loader;
4+
import org.apache.pdfbox.pdmodel.PDDocument;
5+
import org.apache.tika.Tika;
6+
import org.junit.jupiter.api.Test;
7+
8+
import static org.assertj.core.api.Assertions.assertThat;
9+
import static org.junit.jupiter.api.Assertions.assertTrue;
10+
11+
import java.io.*;
12+
import java.util.Objects;
13+
14+
import com.itextpdf.commons.exceptions.ITextException;
15+
import com.itextpdf.kernel.pdf.PdfDocument;
16+
import com.itextpdf.kernel.pdf.PdfReader;
17+
18+
19+
public class PdfDetectTest {
20+
21+
private static final File PDF_FILE = new File("src/test/resources/input.pdf");
22+
private static final File NON_PDF_FILE = new File("src/test/resources/test.jpg");
23+
24+
25+
@Test
26+
void whenDetectPdfByPdfBox_thenCorrect() throws IOException {
27+
boolean isPdf;
28+
try (PDDocument document = Loader.loadPDF(PDF_FILE)) {
29+
isPdf = true;
30+
} catch (IOException ioe) {
31+
isPdf = false;
32+
}
33+
assertTrue(isPdf);
34+
}
35+
36+
@Test
37+
void whenDetectPdfByItext_thenCorrect() throws IOException {
38+
boolean isPdf;
39+
try (PdfDocument pdfDoc = new PdfDocument(new PdfReader(PDF_FILE))) {
40+
isPdf = true;
41+
} catch (ITextException ite) {
42+
isPdf = false;
43+
}
44+
assertTrue(isPdf);
45+
}
46+
47+
@Test
48+
void whenDetectPdfByFileSignature_thenCorrect() throws IOException {
49+
boolean isPdf = false;
50+
try (InputStream fis = new BufferedInputStream(new FileInputStream(PDF_FILE))) {
51+
byte[] bytes = new byte[5];
52+
if (fis.read(bytes) >= 5) {
53+
String header = new String(bytes);
54+
isPdf = Objects.equals(header, "%PDF-");
55+
}
56+
}
57+
assertTrue(isPdf);
58+
}
59+
60+
@Test
61+
void whenDetectPdfByTika_thenCorrect() throws IOException {
62+
Tika tika = new Tika();
63+
boolean isPdf = Objects.equals(tika.detect(PDF_FILE), "application/pdf");
64+
assertTrue(isPdf);
65+
}
66+
67+
}

0 commit comments

Comments
 (0)