Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion projects/pdfbox/.gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
project-parent/pdfbox
project-parent/fuzz-targets/target
project-parent/fuzz-targets/src/test/resources
project-parent/fuzz-targets/pom.xml.versionsBackup
39 changes: 31 additions & 8 deletions projects/pdfbox/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2023 Google LLC
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -14,25 +14,48 @@
#
################################################################################

FROM gcr.io/oss-fuzz-base/base-builder-jvm
# We need a modern version of mupdf-tools (mutool).
# In the mutool that ships with the OSS-Fuzz base image as of 28 Aug 2025,
# only some font types get "font-" prepended to their extracted file name.
# That breaks the "font-*" matching done by extract-fonts.sh and, separately,
# we do not want to rely on such an old version.
#
# We can either build mupdf ourselves or use an OS release that packages a
# more recent version. This stage does the latter; only the generated
# *_seed_corpus.zip files under /work are meant to be consumed from it.
FROM ubuntu:questing-20250806 AS base

# Install the extraction tooling and drop the apt caches in the same layer.
RUN set -eux \
    && apt-get update \
    && apt-get install --yes --no-install-recommends \
        ca-certificates \
        curl \
        mupdf-tools \
        unzip \
        zip \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# WORKDIR creates /work and makes it the working directory for the steps below.
WORKDIR /work

# Pull an arbitrary zip of 1k PDFs.
# --fail makes the build stop on an HTTP error instead of silently saving the
# error page under the seed-corpus file name.
RUN curl --fail --location --show-error \
        https://digitalcorpora.s3.amazonaws.com/corpora/files/CC-MAIN-2021-31-PDF-UNTRUNCATED/zipfiles/5000-5999/5136.zip \
        --output /work/PDFExtractTextFuzzer_seed_corpus.zip

# Derive the per-font-type seed corpora from the PDFs, then remove the script.
COPY extract-fonts.sh /work/extract-fonts.sh
RUN /bin/bash extract-fonts.sh && rm extract-fonts.sh

RUN curl -L https://archive.apache.org/dist/maven/maven-3/3.6.3/binaries/apache-maven-3.6.3-bin.zip -o maven.zip && \
unzip maven.zip -d $SRC/maven && \
rm -rf maven.zip

ENV MVN $SRC/maven/apache-maven-3.6.3/bin/mvn
# Final stage, built on the OSS-Fuzz base-builder-jvm image.
FROM gcr.io/oss-fuzz-base/base-builder-jvm

# Copy every *_seed_corpus.zip found under /work in the `base` stage into $SRC.
COPY --from=base /work/*_seed_corpus.zip $SRC

# Shallow-clone google/fuzzing, copy its pdf.dict into $SRC under two
# fuzzer-specific file names, and delete the clone again in the same layer.
RUN git clone --depth 1 https://github.com/google/fuzzing && \
cp fuzzing/dictionaries/pdf.dict $SRC/PDFStreamParserFuzzer.dict && \
cp fuzzing/dictionaries/pdf.dict $SRC/PDFWriteReadFuzzer.dict && \
rm -rf fuzzing

# Download Maven 3.9.11, unpack it under $SRC/maven and delete the archive
# in the same layer.
RUN curl -L https://archive.apache.org/dist/maven/maven-3/3.9.11/binaries/apache-maven-3.9.11-bin.zip -o maven.zip && \
unzip maven.zip -d $SRC/maven && \
rm -rf maven.zip

# MVN must be set; otherwise the Python infra helper cannot be used for
# local testing.
ENV MVN=$SRC/maven/apache-maven-3.9.11/bin/mvn

# Copy the local project-parent tree into $SRC.
COPY project-parent $SRC/project-parent/

# Remove any pdfbox directory that came along with project-parent, then
# replace it with a fresh shallow clone of apache/pdfbox.
RUN rm -rf $SRC/project-parent/pdfbox
RUN git clone --depth 1 https://github.com/apache/pdfbox/ $SRC/project-parent/pdfbox

COPY build.sh $SRC/
# NOTE(review): this WORKDIR is immediately superseded by the next line;
# confirm whether both are intended to be present.
WORKDIR $SRC/
WORKDIR $SRC/project-parent/pdfbox
5 changes: 3 additions & 2 deletions projects/pdfbox/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ function set_project_version_in_fuzz_targets_dependency {
(cd fuzz-targets && $MVN versions:use-dep-version -Dincludes=$PROJECT_GROUP_ID:$PROJECT_ARTIFACT_ID -DdepVersion=$PROJECT_VERSION -DforceVersion=true)
}

cd project-parent
cd $SRC/project-parent

# LOCAL_DEV env variable need to be set in local development env
if [[ -v LOCAL_DEV ]]; then
Expand All @@ -44,8 +44,9 @@ if [[ -v LOCAL_DEV ]]; then
mvn -pl fuzz-targets install

else
# Move seed corpus and dictionary.
# Move dictionaries and seed corpora.
mv $SRC/*.dict $OUT
mv $SRC/*.zip $OUT

set_project_version_in_fuzz_targets_dependency

Expand Down
78 changes: 78 additions & 0 deletions projects/pdfbox/extract-fonts.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
#!/bin/bash
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
################################################################################

# Turn the zip of seed PDFs into one seed-corpus zip per font type.
#
# Expects PDFExtractTextFuzzer_seed_corpus.zip in the current directory.
# For every font type found, a <Fuzzer>_seed_corpus.zip is written next to it.
#
# mutool extracts both fonts and images from each PDF; only the files named
# font-* are kept, everything else is thrown away with the scratch directory.
# If there's a more efficient way to extract just the fonts, we should
# implement that.

mkdir -p tmp fonts
unzip PDFExtractTextFuzzer_seed_corpus.zip -d pdfs

for file in pdfs/*.pdf; do
  # An unmatched glob stays literal; skip it instead of handing it to mutool.
  [ -e "$file" ] || continue

  pdf_name="$(basename "$file")"
  echo "$pdf_name"
  cp "$file" tmp

  # Work in a subshell so a failed cd can never leave the script in the wrong
  # directory for the cleanup below.
  (
    cd tmp || exit 1

    # A failing mutool run does not stop the script; whatever it managed to
    # extract before failing is still collected.
    mutool extract "$pdf_name"

    # NUL-delimited so file names containing whitespace survive intact.
    find . -name "font-*" -printf '%P\0' 2>/dev/null |
      while IFS= read -r -d '' fnt; do
        ext="${fnt##*.}"
        mkdir -p "../fonts/$ext"
        cp "$fnt" "../fonts/$ext/$pdf_name-$fnt"
      done
  )

  # Start the next PDF with an empty scratch directory.
  rm -rf tmp
  mkdir tmp
done

# zip_fonts EXT FUZZER
# Pack fonts/EXT/*.EXT into FUZZER_seed_corpus.zip in the top-level directory.
# Does nothing when no directory for that extension was created.
zip_fonts() {
  local ext="$1"
  local archive="${2}_seed_corpus.zip"

  [ -d "fonts/$ext" ] || return 0
  (
    cd "fonts/$ext" || exit 1
    zip "$archive" *."$ext"
    mv "$archive" ../..
  )
}

zip_fonts cff CFFParserFuzzer
zip_fonts otf OTFParserFuzzer
zip_fonts ttf TTFParserFuzzer
zip_fonts cid CMapParserFuzzer
zip_fonts pfa PFAParserFuzzer

rm -rf fonts pdfs tmp
6 changes: 6 additions & 0 deletions projects/pdfbox/project-parent/fuzz-targets/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,12 @@
<version>Fuzzing-SNAPSHOT</version>
</dependency>

<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-core</artifactId>
<version>2.24.3</version>
</dependency>

</dependencies>

<build>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
// Copyright 2025 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
////////////////////////////////////////////////////////////////////////////////

package com.example;

import java.io.IOException;

import com.code_intelligence.jazzer.api.FuzzedDataProvider;

import org.apache.fontbox.cff.CFFParser;
import org.apache.pdfbox.io.RandomAccessRead;
import org.apache.pdfbox.io.RandomAccessReadBuffer;

/**
 * Fuzz target that hands the raw fuzzer input to {@link CFFParser}.
 */
class CFFParserFuzzer {

    /**
     * Fuzzer entry point.
     *
     * @param data fuzzer input; all remaining bytes are consumed and parsed
     */
    public static void fuzzerTestOneInput(FuzzedDataProvider data) {
        final byte[] input = data.consumeRemainingAsBytes();
        final CFFParser cffParser = new CFFParser();
        try (RandomAccessRead source = new RandomAccessReadBuffer(input)) {
            cffParser.parse(source);
        } catch (IOException ignored) {
            // IOException is swallowed; any other throwable propagates out of
            // the fuzz target.
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
// Copyright 2025 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
////////////////////////////////////////////////////////////////////////////////

package com.example;

import java.io.IOException;

import com.code_intelligence.jazzer.api.FuzzedDataProvider;

import org.apache.fontbox.cmap.CMapParser;
import org.apache.pdfbox.io.RandomAccessRead;
import org.apache.pdfbox.io.RandomAccessReadBuffer;
/**
 * Fuzz target that hands the raw fuzzer input to {@link CMapParser}.
 *
 * <p>The .cid files extracted by mutool aren't pure character maps. On a
 * random selection, it looks like the CMapParser can parse ~30% of them
 * without an exception. We should figure out why the other .cid files
 * aren't parsing, but they are a close enough fit as seeds for now.
 */
public class CMapParserFuzzer {

    /**
     * Fuzzer entry point.
     *
     * @param data fuzzer input; all remaining bytes are consumed and parsed
     */
    public static void fuzzerTestOneInput(FuzzedDataProvider data) {
        final byte[] input = data.consumeRemainingAsBytes();
        try (RandomAccessRead source = new RandomAccessReadBuffer(input)) {
            final CMapParser cmapParser = new CMapParser();
            cmapParser.parse(source);
        } catch (IOException ignored) {
            // IOException is swallowed; any other throwable propagates out of
            // the fuzz target.
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
// Copyright 2025 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
////////////////////////////////////////////////////////////////////////////////

package com.example;

import java.io.IOException;

import com.code_intelligence.jazzer.api.FuzzedDataProvider;

import org.apache.fontbox.ttf.OTFParser;
import org.apache.pdfbox.io.RandomAccessRead;
import org.apache.pdfbox.io.RandomAccessReadBuffer;

/**
 * Fuzz target that hands the raw fuzzer input to {@link OTFParser}.
 */
class OTFParserFuzzer {

    /**
     * Fuzzer entry point.
     *
     * @param data fuzzer input; all remaining bytes are consumed and parsed
     */
    public static void fuzzerTestOneInput(FuzzedDataProvider data) {
        final byte[] fontBytes = data.consumeRemainingAsBytes();
        final OTFParser otfParser = new OTFParser();
        try (RandomAccessRead fontSource = new RandomAccessReadBuffer(fontBytes)) {
            otfParser.parse(fontSource);
        } catch (IOException ignored) {
            // IOException is swallowed; any other throwable propagates out of
            // the fuzz target.
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
// Copyright 2025 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
////////////////////////////////////////////////////////////////////////////////

package com.example;

import java.io.IOException;

import com.code_intelligence.jazzer.api.FuzzedDataProvider;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.io.RandomAccessRead;
import org.apache.pdfbox.io.RandomAccessReadBuffer;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;

/**
 * Fuzz target that loads the fuzzer input as a PDF and extracts its text.
 */
class PDFExtractTextFuzzer {

    /**
     * Fuzzer entry point: loads the input with {@link Loader#loadPDF} and runs
     * {@link PDFTextStripper#getText} over the resulting document.
     *
     * @param data fuzzer input; all remaining bytes are consumed as the PDF
     */
    public static void fuzzerTestOneInput(FuzzedDataProvider data) {
        byte[] bytes = data.consumeRemainingAsBytes();

        // The document is a managed resource as well, so it is closed on every
        // iteration; without this each input leaks an open PDDocument.
        try (RandomAccessRead buffer = new RandomAccessReadBuffer(bytes);
             PDDocument pdDocument = Loader.loadPDF(buffer)) {
            // The extracted text itself is not needed; only the parsing and
            // text-stripping code paths are being exercised.
            new PDFTextStripper().getText(pdDocument);
        } catch (IOException | IllegalArgumentException e) {
            // These two exception types are swallowed; any other throwable
            // propagates out of the fuzz target.
        }
    }
}
Loading