From dc7f4f41eedb4e5831c5dd4d723442bda077a60b Mon Sep 17 00:00:00 2001 From: Cody Holmes Date: Tue, 24 Nov 2020 09:00:26 -0600 Subject: [PATCH 1/4] Rebuild the trailer when missing pages item --- .../apache/pdfbox/pdfparser/PDFParser.java | 53 +++++++++++++++++-- 1 file changed, 49 insertions(+), 4 deletions(-) diff --git a/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java b/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java index 836963f016f..c836a708799 100644 --- a/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java +++ b/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java @@ -25,8 +25,6 @@ import org.apache.pdfbox.cos.COSDictionary; import org.apache.pdfbox.cos.COSDocument; import org.apache.pdfbox.cos.COSName; -import org.apache.pdfbox.cos.COSNull; -import org.apache.pdfbox.cos.COSObject; import org.apache.pdfbox.io.IOUtils; import org.apache.pdfbox.io.RandomAccessRead; import org.apache.pdfbox.io.ScratchFile; @@ -171,7 +169,6 @@ public PDDocument getPDDocument() throws IOException protected void initialParse() throws IOException { COSDictionary trailer = retrieveTrailer(); - COSBase base = parseTrailerValuesDynamically(trailer); if (!(base instanceof COSDictionary)) { @@ -193,7 +190,20 @@ protected void initialParse() throws IOException } // check pages dictionaries checkPages(root); - if (!(root.getDictionaryObject(COSName.PAGES) instanceof COSDictionary)) + boolean foundPages = false; + if (root.getDictionaryObject(COSName.PAGES) instanceof COSDictionary) + { + foundPages = true; + } + if (!foundPages && isLenient()) + { + root = rebuildTrailerRoot(); + if (root.getDictionaryObject(COSName.PAGES) instanceof COSDictionary) + { + foundPages = true; + } + } + if (!foundPages) { throw new IOException("Page tree root must be a dictionary"); } @@ -201,6 +211,41 @@ protected void initialParse() throws IOException initialParseDone = true; } + /** + * Rebuild the trailer/root dictionary if Pages can't be found. + * + * @return the rebuild trailer/root dictionary + * + * @throws IOException if something went wrong + */ + private COSDictionary rebuildTrailerRoot() throws IOException + { + // Brute force the trailer when pages couldn't be found on the original one + COSDictionary trailer = rebuildTrailer(); + COSBase base = parseTrailerValuesDynamically(trailer); + if (!(base instanceof COSDictionary)) + { + throw new IOException("Expected root dictionary, but got this: " + base); + } + COSDictionary root = (COSDictionary) base; + // in some pdfs the type value "Catalog" is missing in the root object + if (isLenient() && !root.containsKey(COSName.TYPE)) + { + root.setItem(COSName.TYPE, COSName.CATALOG); + } + // parse all objects, starting at the root dictionary + parseDictObjects(root, (COSName[]) null); + // parse all objects of the info dictionary + COSBase infoBase = trailer.getDictionaryObject(COSName.INFO); + if (infoBase instanceof COSDictionary) + { + parseDictObjects((COSDictionary) infoBase, (COSName[]) null); + } + // check pages dictionaries + checkPages(root); + return root; + } + /** * This will parse the stream and populate the COSDocument object. This will close * the keystore stream when it is done parsing. From 8d3070e4f38156bc2411e151d57703b1c87270f0 Mon Sep 17 00:00:00 2001 From: Cody Holmes Date: Tue, 24 Nov 2020 09:41:16 -0600 Subject: [PATCH 2/4] Add validation of pages and root --- .../apache/pdfbox/pdfparser/COSParser.java | 26 ++++++++- .../apache/pdfbox/pdfparser/PDFParser.java | 54 ++----------------- 2 files changed, 29 insertions(+), 51 deletions(-) diff --git a/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java b/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java index 26f7d8af53d..311fe2e6d63 100644 --- a/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java +++ b/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java @@ -272,7 +272,7 @@ protected COSDictionary retrieveTrailer() throws IOException } } // check if the trailer contains a Root object - if (trailer != null && trailer.getItem(COSName.ROOT) == null) + if (trailer != null && !isValidTrailer(trailer)) { rebuildTrailer = isLenient(); } @@ -292,6 +292,30 @@ protected COSDictionary retrieveTrailer() throws IOException return trailer; } + /** + * Check that the trailer contains a Root object and that the Root + * contains a Pages object. + * + * @param trailer the trailer to validate + * @return whether or not the trailer is valid. + * @throws IOException if an error occurs + */ + private boolean isValidTrailer(COSDictionary trailer) throws IOException + { + COSObject root = trailer.getCOSObject(COSName.ROOT); + if (root == null) + { + return false; + } + COSBase base = parseObjectDynamically(root, false); + if (!(base instanceof COSDictionary)) + { + return false; + } + COSDictionary rootDict = (COSDictionary) base; + return rootDict.getDictionaryObject(COSName.PAGES) != null; + } + /** * Parses cross reference tables. * diff --git a/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java b/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java index c836a708799..20286bee73d 100644 --- a/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java +++ b/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java @@ -25,6 +25,7 @@ import org.apache.pdfbox.cos.COSDictionary; import org.apache.pdfbox.cos.COSDocument; import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.cos.COSObject; import org.apache.pdfbox.io.IOUtils; import org.apache.pdfbox.io.RandomAccessRead; import org.apache.pdfbox.io.ScratchFile; @@ -162,13 +163,14 @@ public PDDocument getPDDocument() throws IOException * The initial parse will first parse only the trailer, the xrefstart and all xref tables to have a pointer (offset) * to all the pdf's objects. It can handle linearized pdfs, which will have an xref at the end pointing to an xref * at the beginning of the file. Last the root object is parsed. - * + * * @throws InvalidPasswordException If the password is incorrect. * @throws IOException If something went wrong. */ protected void initialParse() throws IOException { COSDictionary trailer = retrieveTrailer(); + COSBase base = parseTrailerValuesDynamically(trailer); if (!(base instanceof COSDictionary)) { @@ -190,20 +192,7 @@ protected void initialParse() throws IOException } // check pages dictionaries checkPages(root); - boolean foundPages = false; - if (root.getDictionaryObject(COSName.PAGES) instanceof COSDictionary) - { - foundPages = true; - } - if (!foundPages && isLenient()) - { - root = rebuildTrailerRoot(); - if (root.getDictionaryObject(COSName.PAGES) instanceof COSDictionary) - { - foundPages = true; - } - } - if (!foundPages) + if (!(root.getDictionaryObject(COSName.PAGES) instanceof COSDictionary)) { throw new IOException("Page tree root must be a dictionary"); } @@ -211,41 +200,6 @@ protected void initialParse() throws IOException initialParseDone = true; } - /** - * Rebuild the trailer/root dictionary if Pages can't be found. - * - * @return the rebuild trailer/root dictionary - * - * @throws IOException if something went wrong - */ - private COSDictionary rebuildTrailerRoot() throws IOException - { - // Brute force the trailer when pages couldn't be found on the original one - COSDictionary trailer = rebuildTrailer(); - COSBase base = parseTrailerValuesDynamically(trailer); - if (!(base instanceof COSDictionary)) - { - throw new IOException("Expected root dictionary, but got this: " + base); - } - COSDictionary root = (COSDictionary) base; - // in some pdfs the type value "Catalog" is missing in the root object - if (isLenient() && !root.containsKey(COSName.TYPE)) - { - root.setItem(COSName.TYPE, COSName.CATALOG); - } - // parse all objects, starting at the root dictionary - parseDictObjects(root, (COSName[]) null); - // parse all objects of the info dictionary - COSBase infoBase = trailer.getDictionaryObject(COSName.INFO); - if (infoBase instanceof COSDictionary) - { - parseDictObjects((COSDictionary) infoBase, (COSName[]) null); - } - // check pages dictionaries - checkPages(root); - return root; - } - /** * This will parse the stream and populate the COSDocument object. This will close * the keystore stream when it is done parsing. From 3f6bd8c074c4530013714cf528bf70bd89aae330 Mon Sep 17 00:00:00 2001 From: Cody Holmes Date: Tue, 24 Nov 2020 10:06:27 -0600 Subject: [PATCH 3/4] Add tests and fix cosobject lookup --- pdfbox/pom.xml | 13 +++++++++++++ .../java/org/apache/pdfbox/pdfparser/COSParser.java | 2 +- .../org/apache/pdfbox/pdfparser/TestPDFParser.java | 13 +++++++++++++ 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/pdfbox/pom.xml b/pdfbox/pom.xml index 112d70221b8..81a978dda02 100644 --- a/pdfbox/pom.xml +++ b/pdfbox/pom.xml @@ -697,6 +697,19 @@ 5ae7f232c47c13ed31997eb2c368e7deb1013c1321d70bf79369f8d709b33406191d94c21a5d27b4c4bb48241bafd9328a0a6d2d093d4e540d5044e9503bd099 + + PDFBOX-5026 + generate-test-resources + + wget + + + https://issues.apache.org/jira/secure/attachment/13015945/issue9418.pdf + ${project.build.directory}/pdfs + PDFBOX-5026.pdf + 1e47caa4246752bc392e596803cb95556bde578b687352a7434d9b77f92892539f810f046e76e4b2300d859ab9a8755a4212378ae099a825367f154919524d7c + + diff --git a/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java b/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java index 311fe2e6d63..2e3732cb699 100644 --- a/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java +++ b/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java @@ -313,7 +313,7 @@ private boolean isValidTrailer(COSDictionary trailer) throws IOException return false; } COSDictionary rootDict = (COSDictionary) base; - return rootDict.getDictionaryObject(COSName.PAGES) != null; + return rootDict.getCOSObject(COSName.PAGES) != null; } /** diff --git a/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/TestPDFParser.java b/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/TestPDFParser.java index 9617a912ec6..2e5546e9418 100644 --- a/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/TestPDFParser.java +++ b/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/TestPDFParser.java @@ -363,6 +363,19 @@ public void testPDFBox4490() throws IOException doc.close(); } + /** + * Test that PDFBOX-5026 has pages tree. + * + * @throws IOException + */ + @Test + public void testPDFBox5026() throws IOException + { + PDDocument doc = PDDocument.load(new File(TARGETPDFDIR, "PDFBOX-5026.pdf")); + assertNotNull(doc.getPages()); + doc.close(); + } + private void executeParserTest(RandomAccessRead source, MemoryUsageSetting memUsageSetting) throws IOException { ScratchFile scratchFile = new ScratchFile(memUsageSetting); From 9aaf6b18ca6fd3075b3439b10e4ed180f74a7557 Mon Sep 17 00:00:00 2001 From: cwholmes Date: Tue, 24 Nov 2020 10:20:56 -0600 Subject: [PATCH 4/4] Update TestPDFParser.java --- .../test/java/org/apache/pdfbox/pdfparser/TestPDFParser.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/TestPDFParser.java b/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/TestPDFParser.java index 2e5546e9418..0f13c5a078c 100644 --- a/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/TestPDFParser.java +++ b/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/TestPDFParser.java @@ -372,7 +372,7 @@ public void testPDFBox4490() throws IOException public void testPDFBox5026() throws IOException { PDDocument doc = PDDocument.load(new File(TARGETPDFDIR, "PDFBOX-5026.pdf")); - assertNotNull(doc.getPages()); + assertEquals(1, doc.getNumberOfPages()); doc.close(); }