google · tballison · Aug 25, 2025 · Aug 26, 2025 · Aug 26, 2025 · Aug 26, 2025
diff --git a/projects/pdfbox/.gitignore b/projects/pdfbox/.gitignore
@@ -1,4 +1,3 @@
 project-parent/pdfbox
 project-parent/fuzz-targets/target
-project-parent/fuzz-targets/src/test/resources
 project-parent/fuzz-targets/pom.xml.versionsBackup
diff --git a/projects/pdfbox/Dockerfile b/projects/pdfbox/Dockerfile
@@ -1,4 +1,4 @@
-# Copyright 2023 Google LLC
+# Copyright 2025 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,25 +14,48 @@
 #
 ################################################################################
 
-FROM gcr.io/oss-fuzz-base/base-builder-jvm
+# We need a modern version of mupdf-tools.
+# With the version of mutool that the oss-fuzz base image ships as of 28 Aug 2025,
+# only some font types get "font-" prepended to their file name. This breaks
+# globbing, and, separately, we don't want to use such an old version.
+
+# So, we can either build mutool or simply use an OS that comes with a more recent version.
+# I'm choosing the latter.
+FROM ubuntu:questing-20250806 AS base
+
+# Tools for the corpus download and extract-fonts.sh; apt lists are dropped in the same layer.
+RUN set -eux \
+    && apt-get update \
+    && apt-get install --yes --no-install-recommends ca-certificates curl mupdf-tools unzip zip \
+    && rm -rf /var/lib/apt/lists/*
+
+#pull an arbitrary zip of 1k pdfs; --fail turns an HTTP error into a build failure
+WORKDIR /work
+RUN curl --fail --location --output PDFExtractTextFuzzer_seed_corpus.zip \
+    https://digitalcorpora.s3.amazonaws.com/corpora/files/CC-MAIN-2021-31-PDF-UNTRUNCATED/zipfiles/5000-5999/5136.zip
+COPY extract-fonts.sh /work/extract-fonts.sh
+RUN /bin/bash extract-fonts.sh && rm extract-fonts.sh
 
-RUN curl -L https://archive.apache.org/dist/maven/maven-3/3.6.3/binaries/apache-maven-3.6.3-bin.zip -o maven.zip && \
-    unzip maven.zip -d $SRC/maven && \
-    rm -rf maven.zip
 
-ENV MVN $SRC/maven/apache-maven-3.6.3/bin/mvn
+FROM gcr.io/oss-fuzz-base/base-builder-jvm
+
+COPY --from=base /work/*_seed_corpus.zip $SRC/
 
 RUN git clone --depth 1 https://github.com/google/fuzzing && \
     cp fuzzing/dictionaries/pdf.dict $SRC/PDFStreamParserFuzzer.dict && \
     cp fuzzing/dictionaries/pdf.dict $SRC/PDFWriteReadFuzzer.dict && \
     rm -rf fuzzing
 
+# Install Maven 3.9.11 under $SRC/maven; -f makes an HTTP error fail at the download step.
+RUN curl -fL https://archive.apache.org/dist/maven/maven-3/3.9.11/binaries/apache-maven-3.9.11-bin.zip -o maven.zip && \
+    unzip maven.zip -d $SRC/maven && rm -rf maven.zip
+
 # if not set python infra helper cannot be used for local testing
+ENV MVN=$SRC/maven/apache-maven-3.9.11/bin/mvn
 
 COPY project-parent $SRC/project-parent/
-
 RUN rm -rf $SRC/project-parent/pdfbox
 RUN git clone --depth 1 https://github.com/apache/pdfbox/ $SRC/project-parent/pdfbox
 
 COPY build.sh $SRC/
-WORKDIR $SRC/
+WORKDIR $SRC/project-parent/pdfbox
diff --git a/projects/pdfbox/build.sh b/projects/pdfbox/build.sh
@@ -28,7 +28,7 @@ function set_project_version_in_fuzz_targets_dependency {
   (cd fuzz-targets && $MVN versions:use-dep-version -Dincludes=$PROJECT_GROUP_ID:$PROJECT_ARTIFACT_ID -DdepVersion=$PROJECT_VERSION -DforceVersion=true)
 }
 
-cd project-parent
+cd "$SRC/project-parent"
 
 # LOCAL_DEV env variable need to be set in local development env
 if [[ -v LOCAL_DEV ]]; then
@@ -44,8 +44,9 @@ if [[ -v LOCAL_DEV ]]; then
   mvn -pl fuzz-targets install
 
 else
-  # Move seed corpus and dictionary.
+  # Move dictionaries and seed corpora.
   mv $SRC/*.dict $OUT
+  mv "$SRC"/*.zip "$OUT"
 
   set_project_version_in_fuzz_targets_dependency
 

diff --git a/projects/pdfbox/extract-fonts.sh b/projects/pdfbox/extract-fonts.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+################################################################################
+
+# Builds font seed corpora from PDFExtractTextFuzzer_seed_corpus.zip, which must
+# be in the current directory:
+#   1. unzip the PDFs into pdfs/
+#   2. run "mutool extract" on each PDF; it extracts the fonts and images
+#   3. keep the files named font-*, grouped by file extension under fonts/;
+#      get rid of the images
+#   4. zip each known font type into <Fuzzer>_seed_corpus.zip in the current
+#      directory
+# If there's a more efficient way to extract just the fonts, we should implement that
+
+mkdir tmp fonts
+unzip PDFExtractTextFuzzer_seed_corpus.zip -d pdfs
+
+for file in pdfs/*.pdf; do
+    # An unmatched glob stays literal; skip it instead of processing a bogus name.
+    [ -e "$file" ] || continue
+    pdf="$(basename "$file")"
+    echo "$pdf"
+    cp "$file" tmp
+    # Subshell: the cd cannot leak into the rest of the script, so the cleanup
+    # below always runs from the starting directory.
+    (
+        cd tmp || exit 1
+        mutool extract "$pdf"
+        for fnt in font-*; do
+            [ -e "$fnt" ] || continue
+            ext="${fnt##*.}"
+            mkdir -p "../fonts/$ext"
+            cp "$fnt" "../fonts/$ext/${pdf}-${fnt}"
+        done
+    )
+    rm -rf tmp/*
+done
+
+# zip_fonts EXT FUZZER
+# Zips fonts/EXT/*.EXT into FUZZER_seed_corpus.zip in the starting directory.
+# Does nothing when no font with that extension was extracted.
+zip_fonts() {
+    local ext="$1" fuzzer="$2"
+    if [ -d "fonts/$ext" ]; then
+        (cd "fonts/$ext" && zip "../../${fuzzer}_seed_corpus.zip" *."$ext")
+    fi
+}
+
+zip_fonts cff CFFParserFuzzer
+zip_fonts otf OTFParserFuzzer
+zip_fonts ttf TTFParserFuzzer
+zip_fonts cid CMapParserFuzzer
+zip_fonts pfa PFAParserFuzzer
+
+rm -rf fonts pdfs tmp
diff --git a/projects/pdfbox/project-parent/fuzz-targets/pom.xml b/projects/pdfbox/project-parent/fuzz-targets/pom.xml
@@ -42,6 +42,12 @@
             <version>Fuzzing-SNAPSHOT</version>
         </dependency>
 
+        <dependency>
+            <groupId>org.apache.logging.log4j</groupId>
+            <artifactId>log4j-core</artifactId>
+            <version>2.24.3</version>
+        </dependency>
+
     </dependencies>
 
     <build>

diff --git a/projects/pdfbox/project-parent/fuzz-targets/src/test/java/com/example/CFFParserFuzzer.java b/projects/pdfbox/project-parent/fuzz-targets/src/test/java/com/example/CFFParserFuzzer.java
@@ -0,0 +1,37 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+package com.example;
+
+import java.io.IOException;
+
+import com.code_intelligence.jazzer.api.FuzzedDataProvider;
+
+import org.apache.fontbox.cff.CFFParser;
+import org.apache.pdfbox.io.RandomAccessRead;
+import org.apache.pdfbox.io.RandomAccessReadBuffer;
+
+/** Jazzer fuzz target that parses the fuzz input with {@link CFFParser}. */
+class CFFParserFuzzer {
+    public static void fuzzerTestOneInput(FuzzedDataProvider data) {
+        final byte[] input = data.consumeRemainingAsBytes();
+        try (RandomAccessRead source = new RandomAccessReadBuffer(input)) {
+            new CFFParser().parse(source);
+        } catch (IOException ignored) {
+            // Deliberately ignored: an IOException is not reported as a finding.
+        }
+    }
+}
diff --git a/projects/pdfbox/project-parent/fuzz-targets/src/test/java/com/example/CMapParserFuzzer.java b/projects/pdfbox/project-parent/fuzz-targets/src/test/java/com/example/CMapParserFuzzer.java
@@ -0,0 +1,42 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+package com.example;
+
+import java.io.IOException;
+
+import com.code_intelligence.jazzer.api.FuzzedDataProvider;
+
+import org.apache.fontbox.cmap.CMapParser;
+import org.apache.pdfbox.io.RandomAccessRead;
+import org.apache.pdfbox.io.RandomAccessReadBuffer;
+/**
+ * Jazzer fuzz target that parses the fuzz input with {@link CMapParser}.
+ *
+ * <p>The .cid files extracted by mutool aren't pure character maps. On a random selection,
+ * it looks like the CMapParser can parse ~30% of them without an exception. We should figure
+ * out why the other .cid files aren't parsing, but they are a close enough fit for seeds for now.
+ */
+public class CMapParserFuzzer {
+    public static void fuzzerTestOneInput(FuzzedDataProvider data) {
+        final byte[] input = data.consumeRemainingAsBytes();
+        try (RandomAccessRead source = new RandomAccessReadBuffer(input)) {
+            new CMapParser().parse(source);
+        } catch (IOException ignored) {
+            // Deliberately ignored: an IOException is not reported as a finding.
+        }
+    }
+}
diff --git a/projects/pdfbox/project-parent/fuzz-targets/src/test/java/com/example/OTFParserFuzzer.java b/projects/pdfbox/project-parent/fuzz-targets/src/test/java/com/example/OTFParserFuzzer.java
@@ -0,0 +1,37 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+package com.example;
+
+import java.io.IOException;
+
+import com.code_intelligence.jazzer.api.FuzzedDataProvider;
+
+import org.apache.fontbox.ttf.OTFParser;
+import org.apache.pdfbox.io.RandomAccessRead;
+import org.apache.pdfbox.io.RandomAccessReadBuffer;
+
+/** Jazzer fuzz target that parses the fuzz input with {@link OTFParser}. */
+class OTFParserFuzzer {
+    public static void fuzzerTestOneInput(FuzzedDataProvider data) {
+        final byte[] input = data.consumeRemainingAsBytes();
+        try (RandomAccessRead source = new RandomAccessReadBuffer(input)) {
+            new OTFParser().parse(source);
+        } catch (IOException ignored) {
+            // Deliberately ignored: an IOException is not reported as a finding.
+        }
+    }
+}
diff --git a/...ts/pdfbox/project-parent/fuzz-targets/src/test/java/com/example/PDFExtractTextFuzzer.java b/...ts/pdfbox/project-parent/fuzz-targets/src/test/java/com/example/PDFExtractTextFuzzer.java
@@ -0,0 +1,40 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+package com.example;
+
+import java.io.IOException;
+
+import com.code_intelligence.jazzer.api.FuzzedDataProvider;
+import org.apache.pdfbox.Loader;
+import org.apache.pdfbox.io.RandomAccessRead;
+import org.apache.pdfbox.io.RandomAccessReadBuffer;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.text.PDFTextStripper;
+
+/** Jazzer fuzz target that loads the fuzz input as a PDF and runs text extraction on it. */
+class PDFExtractTextFuzzer {
+    public static void fuzzerTestOneInput(FuzzedDataProvider data) {
+        byte[] bytes = data.consumeRemainingAsBytes();
+        // try-with-resources closes the document as well as the buffer, on every path.
+        try (RandomAccessRead buffer = new RandomAccessReadBuffer(bytes);
+             PDDocument pdDocument = Loader.loadPDF(buffer)) {
+            new PDFTextStripper().getText(pdDocument);
+        } catch (IOException | IllegalArgumentException e) {
+            // Deliberately ignored: these exceptions are not reported as findings.
+        }
+    }
+}