Practical-Formal-Methods · jiradeto · Apr 6, 2022 · Apr 14, 2022 · Apr 14, 2022 · Apr 14, 2022
diff --git a/common/experiment_utils.py b/common/experiment_utils.py
@@ -72,6 +72,12 @@ def get_oss_fuzz_corpora_filestore_path():
     return posixpath.join(get_experiment_filestore_path(), 'oss_fuzz_corpora')
 
 
+def get_custom_seed_corpora_filestore_path():
+    """Returns the filestore path holding the user-provided seed corpora."""
+    experiment_path = get_experiment_filestore_path()
+    return posixpath.join(experiment_path, 'custom_seed_corpora')
+
+
 def get_dispatcher_instance_name(experiment: str) -> str:
     """Returns a dispatcher instance name for an experiment."""
     return 'd-%s' % experiment

diff --git a/experiment/resources/runner-startup-script-template.sh b/experiment/resources/runner-startup-script-template.sh
@@ -46,6 +46,7 @@ docker run \
 -e NO_SEEDS={{no_seeds}} \
 -e NO_DICTIONARIES={{no_dictionaries}} \
 -e OSS_FUZZ_CORPUS={{oss_fuzz_corpus}} \
+-e CUSTOM_SEED_CORPUS={{custom_seed_corpus_dir}} \
 -e DOCKER_REGISTRY={{docker_registry}} {% if not local_experiment %}-e CLOUD_PROJECT={{cloud_project}} -e CLOUD_COMPUTE_ZONE={{cloud_compute_zone}} {% endif %}\
 -e EXPERIMENT_FILESTORE={{experiment_filestore}} {% if local_experiment %}-v {{experiment_filestore}}:{{experiment_filestore}} {% endif %}\
 -e REPORT_FILESTORE={{report_filestore}} {% if local_experiment %}-v {{report_filestore}}:{{report_filestore}} {% endif %}\

diff --git a/experiment/run_experiment.py b/experiment/run_experiment.py
@@ -22,6 +22,7 @@
 import sys
 import tarfile
 import tempfile
+import zipfile
 from typing import Dict, List
 
 import jinja2
@@ -63,6 +64,9 @@
     'gs://{project}-backup.clusterfuzz-external.appspot.com/corpus/'
     'libFuzzer/{fuzz_target}/public.zip')
 
+# Maximum size (in bytes) allowed per seed corpus file, for AFL.
+CORPUS_ELEMENT_BYTES_LIMIT = 1 * 1024 * 1024
+
 
 def read_and_validate_experiment_config(config_filename: str) -> Dict:
     """Reads |config_filename|, validates it, finds as many errors as possible,
@@ -148,6 +152,48 @@ def get_directories(parent_dir):
     ]
 
 
+# pylint: disable=too-many-locals
+def validate_and_pack_custom_seed_corpus(custom_seed_corpus_dir, benchmarks):
+    """Validates the user-provided seed corpus and zips it per benchmark.
+
+    |custom_seed_corpus_dir| must hold a non-empty directory per benchmark in
+    |benchmarks|. Non-empty files of at most CORPUS_ELEMENT_BYTES_LIMIT bytes
+    are packed into <custom_seed_corpus_dir>/<benchmark>.zip. Raises
+    ValidationError if a directory is invalid or no usable file is found."""
+    if not os.path.isdir(custom_seed_corpus_dir):
+        raise ValidationError('Corpus location "%s" is invalid.' %
+                              custom_seed_corpus_dir)
+
+    for benchmark in benchmarks:
+        benchmark_corpus_dir = os.path.join(custom_seed_corpus_dir, benchmark)
+        if not os.path.exists(benchmark_corpus_dir):
+            raise ValidationError('Custom seed corpus directory for '
+                                  'benchmark "%s" does not exist.' % benchmark)
+        if not os.path.isdir(benchmark_corpus_dir):
+            raise ValidationError('Seed corpus of benchmark "%s" must be '
+                                  'a directory.' % benchmark)
+        if not os.listdir(benchmark_corpus_dir):
+            raise ValidationError('Seed corpus of benchmark "%s" is empty.' %
+                                  benchmark)
+
+        valid_corpus_files = []
+        for root, _, files in os.walk(benchmark_corpus_dir):
+            for filename in files:
+                path = os.path.join(root, filename)
+                if 0 < os.path.getsize(path) <= CORPUS_ELEMENT_BYTES_LIMIT:
+                    valid_corpus_files.append(path)
+        if not valid_corpus_files:
+            raise ValidationError('No valid corpus files for "%s"' % benchmark)
+
+        benchmark_corpus_archive_path = os.path.join(custom_seed_corpus_dir,
+                                                     f'{benchmark}.zip')
+        with zipfile.ZipFile(benchmark_corpus_archive_path, 'w') as archive:
+            # Name entries relative to the corpus root so they stay unique.
+            for path in sorted(valid_corpus_files):
+                archive.write(path,
+                              os.path.relpath(path, custom_seed_corpus_dir))
+
+
 def validate_benchmarks(benchmarks: List[str]):
     """Parses and validates list of benchmarks."""
     benchmark_types = set()
@@ -219,7 +265,8 @@ def start_experiment(  # pylint: disable=too-many-arguments
         allow_uncommitted_changes=False,
         concurrent_builds=None,
         measurers_cpus=None,
-        runners_cpus=None):
+        runners_cpus=None,        
+        custom_seed_corpus_dir=None):
     """Start a fuzzer benchmarking experiment."""
     if not allow_uncommitted_changes:
         check_no_uncommitted_changes()
@@ -248,6 +295,12 @@ def start_experiment(  # pylint: disable=too-many-arguments
     # 12GB is just the amount that KLEE needs, use this default to make KLEE
     # experiments easier to run.
     config['runner_memory'] = config.get('runner_memory', '12GB')
+
+    config['custom_seed_corpus_dir'] = custom_seed_corpus_dir
+    if config['custom_seed_corpus_dir']:
+        validate_and_pack_custom_seed_corpus(config['custom_seed_corpus_dir'],
+                                             benchmarks)
+
     return start_experiment_from_full_config(config)
 
 
@@ -330,6 +383,16 @@ def filter_file(tar_info):
         for benchmark in config['benchmarks']:
             add_oss_fuzz_corpus(benchmark, oss_fuzz_corpora_dir)
 
+    if config['custom_seed_corpus_dir']:
+        for benchmark in config['benchmarks']:
+            benchmark_corpus_archive_path = os.path.join(
+                config['custom_seed_corpus_dir'], f'{benchmark}.zip')
+            filestore_utils.cp(
+                benchmark_corpus_archive_path,
+                experiment_utils.get_custom_seed_corpora_filestore_path() + '/',
+                recursive=True,
+                parallel=True)
+
 
 class BaseDispatcher:
     """Class representing the dispatcher."""
@@ -522,6 +585,10 @@ def main():
                         '--runners-cpus',
                         help='Cpus available to the runners.',
                         required=False)
+    parser.add_argument('-cs',
+                        '--custom-seed-corpus-dir',
+                        help='Path to the custom seed corpus',
+                        required=False)
 
     all_fuzzers = fuzzer_utils.get_fuzzer_names()
     parser.add_argument('-f',
@@ -585,6 +652,14 @@ def main():
         parser.error('The sum of runners and measurers cpus is greater than the'
                      ' available cpu cores (%d)' % os.cpu_count())
 
+    if args.custom_seed_corpus_dir:
+        if args.no_seeds:
+            parser.error('Cannot enable options "custom_seed_corpus_dir" and '
+                         '"no_seeds" at the same time')
+        if args.oss_fuzz_corpus:
+            parser.error('Cannot enable options "custom_seed_corpus_dir" and '
+                         '"oss_fuzz_corpus" at the same time')
+
     start_experiment(args.experiment_name,
                      args.experiment_config,
                      args.benchmarks,
@@ -596,7 +671,8 @@ def main():
                      allow_uncommitted_changes=args.allow_uncommitted_changes,
                      concurrent_builds=concurrent_builds,
                      measurers_cpus=measurers_cpus,
-                     runners_cpus=runners_cpus)
+                     runners_cpus=runners_cpus,                     
+                     custom_seed_corpus_dir=args.custom_seed_corpus_dir)
     return 0
 
 

diff --git a/experiment/runner.py b/experiment/runner.py
@@ -115,6 +115,33 @@ def get_clusterfuzz_seed_corpus_path(fuzz_target_path):
     return seed_corpus_path if os.path.exists(seed_corpus_path) else None
 
 
+def _unpack_custom_seed_corpus(corpus_directory):
+    """Replaces the contents of |corpus_directory| with the files from the
+    user-provided seed corpus archive of the current benchmark."""
+    shutil.rmtree(corpus_directory)
+    os.mkdir(corpus_directory)
+    benchmark = environment.get('BENCHMARK')
+    corpus_archive_filename = posixpath.join(
+        experiment_utils.get_custom_seed_corpora_filestore_path(),
+        f'{benchmark}.zip')
+    idx = 0
+    # NOTE(review): assumes the filestore path is locally readable - confirm.
+    with zipfile.ZipFile(corpus_archive_filename) as zip_file:
+        for seed_corpus_file in zip_file.infolist():
+            # Ignore directories and files over the size limit.
+            if (seed_corpus_file.filename.endswith('/') or
+                    seed_corpus_file.file_size > CORPUS_ELEMENT_BYTES_LIMIT):
+                continue
+            # extract() takes a directory; copy the bytes to get a flat file.
+            output_file_path = os.path.join(corpus_directory, '%016d' % idx)
+            with zip_file.open(seed_corpus_file) as src, \
+                    open(output_file_path, 'wb') as dst:
+                shutil.copyfileobj(src, dst)
+            idx += 1
+
+    logs.info('Unarchived %d files from custom seed corpus.', idx)
+
+
 def _unpack_clusterfuzz_seed_corpus(fuzz_target_path, corpus_directory):
     """If a clusterfuzz seed corpus archive is available, unpack it into the
     corpus directory if it exists. Copied from unpack_seed_corpus in
@@ -172,7 +199,10 @@ def run_fuzzer(max_total_time, log_filename):
         logs.error('Fuzz target binary not found.')
         return
 
-    _unpack_clusterfuzz_seed_corpus(target_binary, input_corpus)
+    if environment.get('CUSTOM_SEED_CORPUS'):
+        _unpack_custom_seed_corpus(input_corpus)
+    else:
+        _unpack_clusterfuzz_seed_corpus(target_binary, input_corpus)
     _clean_seed_corpus(input_corpus)
 
     if max_total_time is None:

diff --git a/experiment/scheduler.py b/experiment/scheduler.py
@@ -717,6 +717,7 @@ def render_startup_script_template(instance_name: str, fuzzer: str,
         'oss_fuzz_corpus': experiment_config['oss_fuzz_corpus'],
         'num_cpu_cores': experiment_config['runner_num_cpu_cores'],
         'cpuset': CPUSET,
+        'custom_seed_corpus_dir': experiment_config['custom_seed_corpus_dir'],
     }
 
     if not local_experiment:

diff --git a/experiment/test_data/experiment-config.yaml b/experiment/test_data/experiment-config.yaml
@@ -31,6 +31,7 @@ git_hash: "git-hash"
 no_seeds: false
 no_dictionaries: false
 oss_fuzz_corpus: false
+custom_seed_corpus_dir: null
 description: "Test experiment"
 concurrent_builds: null
 runners_cpus: null

diff --git a/experiment/test_run_experiment.py b/experiment/test_run_experiment.py
@@ -202,6 +202,7 @@ def test_copy_resources_to_bucket(tmp_path):
         'experiment': 'experiment',
         'benchmarks': ['libxslt_xpath'],
         'oss_fuzz_corpus': True,
+        'custom_seed_corpus_dir': None,
     }
     try:
         with mock.patch('common.filestore_utils.cp') as mocked_filestore_cp:

diff --git a/experiment/test_scheduler.py b/experiment/test_scheduler.py
@@ -118,6 +118,7 @@ def test_create_trial_instance(benchmark, expected_image, expected_target,
 -e NO_SEEDS=False \\
 -e NO_DICTIONARIES=False \\
 -e OSS_FUZZ_CORPUS=False \\
+-e CUSTOM_SEED_CORPUS=None \\
 -e DOCKER_REGISTRY=gcr.io/fuzzbench -e CLOUD_PROJECT=fuzzbench -e CLOUD_COMPUTE_ZONE=us-central1-a \\
 -e EXPERIMENT_FILESTORE=gs://experiment-data \\
 -e REPORT_FILESTORE=gs://web-reports \\