Practical-Formal-Methods · jiradeto · May 3, 2022 · May 10, 2022 · May 10, 2022
diff --git a/common/experiment_utils.py b/common/experiment_utils.py
@@ -78,6 +78,11 @@ def get_custom_seed_corpora_filestore_path():
                           'custom_seed_corpora')
 
 
+def get_random_corpora_filestore_path():
+    """Returns the 'random_corpora' path under the experiment filestore path."""
+    return posixpath.join(get_experiment_filestore_path(), 'random_corpora')
+
+
 def get_dispatcher_instance_name(experiment: str) -> str:
     """Returns a dispatcher instance name for an experiment."""
     return 'd-%s' % experiment

diff --git a/common/random_corpus_fuzzing_utils.py b/common/random_corpus_fuzzing_utils.py
@@ -0,0 +1,166 @@
+import random
+import os
+import zipfile
+import tempfile
+import tarfile
+import multiprocessing
+import itertools
+from typing import List
+
+from common import experiment_utils
+from common import filesystem
+from experiment.measurer import coverage_utils
+from experiment.measurer import run_coverage
+from database import utils as db_utils
+from database import models
+from common import logs
+from common import benchmark_utils
+from experiment.build import build_utils
+from common import experiment_path as exp_path
+
+MAX_RANDOM_CORPUS_FILES = 5  # Corpus files requested per random sample.
+
+
+def get_covered_branches_per_function(coverage_info):
+    """Returns the set of covered branch keys found in |coverage_info|.
+
+    A key is "<function name> <b0>:<b1>-<b2>:<b3> <side>", built from the first
+    four fields of a branch entry; side is T when field 4 is truthy and F when
+    field 5 is truthy (an entry can contribute both keys)."""
+    covered = set()
+    for func in coverage_info["data"][0]["functions"]:
+        name = func["name"]
+        for entry in func["branches"]:
+            prefix = "{} {}:{}-{}:{}".format(name, *entry[:4])
+            for flag, side in ((entry[4], "T"), (entry[5], "F")):
+                if flag:
+                    covered.add("{} {}".format(prefix, side))
+    return covered
+
+
+def get_covered_branches(coverage_binary, corpus_dir):
+    """Returns the covered branch keys obtained by running |coverage_binary|
+    over |corpus_dir|; all intermediate files live in a temporary directory."""
+    with tempfile.TemporaryDirectory() as work_dir:
+        raw_profdata = os.path.join(work_dir, 'data.profdata')
+        merged_profdata = os.path.join(work_dir, 'merged.profdata')
+        summary_json = os.path.join(work_dir, 'merged.json')
+        crash_dir = os.path.join(work_dir, 'crashes')
+        filesystem.create_directory(crash_dir)
+
+        run_coverage.do_coverage_run(coverage_binary, corpus_dir, raw_profdata,
+                                     crash_dir)
+        coverage_utils.merge_profdata_files([raw_profdata], merged_profdata)
+        coverage_utils.generate_json_summary(coverage_binary,
+                                             merged_profdata,
+                                             summary_json,
+                                             summary_only=False)
+        return get_covered_branches_per_function(
+            coverage_utils.get_coverage_infomation(summary_json))
+
+
+def initialize_random_corpus_fuzzing(benchmarks: List[str],
+                                     num_trials: int,
+                                     target_fuzzing: bool = False):
+    """Prepares the random corpus of every benchmark in a process pool and
+    bulk-saves the target coverage rows returned by the workers."""
+    worker_args = [(name, num_trials, target_fuzzing) for name in benchmarks]
+    with multiprocessing.Pool() as pool:
+        per_benchmark = pool.starmap(prepare_benchmark_random_corpus,
+                                     worker_args)
+        target_coverage = list(itertools.chain.from_iterable(per_benchmark))
+        logs.info('Done Preparing target fuzzing (total %d target)',
+                  len(target_coverage))
+        db_utils.bulk_save(target_coverage)
+
+
+def get_coverage_binary(benchmark, tmp_dir):
+    """Copies the benchmark's coverage build archive into |tmp_dir|, extracts
+    it there and returns the path of the fuzz target inside |tmp_dir|."""
+    coverage_binaries_dir = build_utils.get_coverage_binaries_dir()
+    archive_name = 'coverage-build-%s.tar.gz' % benchmark
+    archive_filestore_path = exp_path.filestore(coverage_binaries_dir /
+                                                archive_name)
+    filesystem.copy(archive_filestore_path, tmp_dir)
+    archive_path = os.path.join(tmp_dir, archive_name)
+    # Context manager closes the archive even if extraction raises.
+    with tarfile.open(archive_path, 'r:gz') as tar:
+        tar.extractall(tmp_dir)
+    os.remove(archive_path)
+    return os.path.join(tmp_dir, benchmark_utils.get_fuzz_target(benchmark))
+
+
+def prepare_benchmark_random_corpus(benchmark: str,
+                                    num_trials: int,
+                                    target_fuzzing: bool = False):
+    """Creates a random seed corpus for every trial group of |benchmark|.
+
+    Each trial group gets at most MAX_RANDOM_CORPUS_FILES files sampled from
+    the benchmark's custom seed corpus archive. With |target_fuzzing|, a second
+    sample is drawn and the branches only it covers become TargetCoverage
+    rows, which are returned; RuntimeError is raised if there are none.
+    """
+    coverage_binary = None
+    target_coverage = []
+    # path used to store and feed seed corpus for benchmark runner
+    # each trial group will have the same seed input(s)
+    benchmark_random_corpora = os.path.join(
+        experiment_utils.get_random_corpora_filestore_path(), benchmark)
+    filesystem.create_directory(benchmark_random_corpora)
+
+    # get inputs from the custom seed corpus directory
+    corpus_archive_filename = os.path.join(
+        experiment_utils.get_custom_seed_corpora_filestore_path(),
+        f'{benchmark}.zip')
+
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        if target_fuzzing:
+            coverage_binary = get_coverage_binary(benchmark, tmp_dir)
+
+        with zipfile.ZipFile(corpus_archive_filename) as zip_file:
+            # only consider file not directory
+            corpus_files = [
+                f for f in zip_file.infolist() if not f.filename.endswith('/')
+            ]
+            # random.sample raises ValueError when the sample is larger than
+            # the population, so cap the size for archives with few files.
+            sample_size = min(MAX_RANDOM_CORPUS_FILES, len(corpus_files))
+            for trial_group_num in range(num_trials):
+                logs.info('Preparing random corpus: %s, trial_group: %d',
+                          benchmark, trial_group_num)
+                trial_group_subdir = 'trial-group-%d' % trial_group_num
+                custom_corpus_trial_dir = os.path.join(benchmark_random_corpora,
+                                                       trial_group_subdir)
+                src_dir = os.path.join(tmp_dir, "source")
+                filesystem.recreate_directory(src_dir)
+                for file in random.sample(corpus_files, sample_size):
+                    zip_file.extract(file, src_dir)
+
+                if target_fuzzing:
+                    dest_dir = os.path.join(tmp_dir, "dest")
+                    filesystem.recreate_directory(dest_dir)
+                    for file in random.sample(corpus_files, sample_size):
+                        zip_file.extract(file, dest_dir)
+
+                    # extract covered branches of source and destination inputs
+                    # then subtract to get targeting branches
+                    src_branches = get_covered_branches(coverage_binary,
+                                                        src_dir)
+                    dest_branches = get_covered_branches(
+                        coverage_binary, dest_dir)
+                    target_branches = dest_branches - src_branches
+                    if not target_branches:
+                        raise RuntimeError(
+                            'Unable to find target branches for %s.' %
+                            benchmark)
+                    for branch in target_branches:
+                        target_cov = models.TargetCoverage()
+                        target_cov.trial_group_num = int(trial_group_num)
+                        target_cov.benchmark = benchmark
+                        target_cov.target_location = branch
+                        target_coverage.append(target_cov)
+
+                # copy only the src directory
+                filesystem.copytree(src_dir, custom_corpus_trial_dir)
+
+    return target_coverage
diff --git a/database/models.py b/database/models.py
@@ -50,6 +50,7 @@ class Trial(Base):
     benchmark = Column(String, nullable=False)
     time_started = Column(DateTime(), nullable=True)
     time_ended = Column(DateTime(), nullable=True)
+    trial_group_num = Column(Integer, nullable=True)
 
     # Columns used for preemptible experiments.
     preemptible = Column(Boolean, default=False, nullable=False)
@@ -71,6 +72,8 @@ class Snapshot(Base):
     trial_id = Column(Integer, ForeignKey('trial.id'), primary_key=True)
     trial = sqlalchemy.orm.relationship('Trial', back_populates='snapshots')
     edges_covered = Column(Integer, nullable=False)
+    targets_covered = Column(Integer, nullable=False)
+    trial_group_num = Column(Integer, nullable=False)
     fuzzer_stats = Column(JSON, nullable=True)
     crashes = sqlalchemy.orm.relationship(
         'Crash',
@@ -94,3 +97,13 @@ class Crash(Base):
 
     __table_args__ = (ForeignKeyConstraint(
         [time, trial_id], ['snapshot.time', 'snapshot.trial_id']),)
+
+
+class TargetCoverage(Base):
+    """A target branch of one benchmark trial group (target fuzzing mode)."""
+    __tablename__ = 'target_coverage'
+
+    id = Column(Integer, primary_key=True)
+    benchmark = Column(String, nullable=False)
+    trial_group_num = Column(Integer, nullable=False)
+    target_location = Column(String, nullable=False)  # Branch key string.
diff --git a/experiment/dispatcher.py b/experiment/dispatcher.py
@@ -24,6 +24,7 @@
 import time
 from typing import List
 
+from common import random_corpus_fuzzing_utils
 from common import experiment_path as exp_path
 from common import experiment_utils
 from common import logs
@@ -131,7 +132,8 @@ def build_images_for_trials(fuzzers: List[str],
             models.Trial(fuzzer=fuzzer,
                          experiment=experiment_name,
                          benchmark=benchmark,
-                         preemptible=preemptible) for _ in range(num_trials)
+                         preemptible=preemptible,
+                         trial_group_num=trial) for trial in range(num_trials)
         ]
         trials.extend(fuzzer_benchmark_trials)
     return trials
@@ -159,6 +161,11 @@ def dispatcher_main():
                                      experiment.config['concurrent_builds'])
     _initialize_trials_in_db(trials)
 
+    if experiment.config['random_corpus'] or experiment.config['target_fuzzing']:
+        random_corpus_fuzzing_utils.initialize_random_corpus_fuzzing(
+            experiment.benchmarks, experiment.num_trials,
+            experiment.config['target_fuzzing'])
+
     create_work_subdirs(['experiment-folders', 'measurement-folders'])
 
     # Start measurer and scheduler in seperate threads/processes.

diff --git a/experiment/measurer/coverage_utils.py b/experiment/measurer/coverage_utils.py
@@ -233,10 +233,12 @@ def get_coverage_infomation(coverage_summary_file):
 class TrialCoverage:  # pylint: disable=too-many-instance-attributes
     """Base class for storing and getting coverage data for a trial."""
 
-    def __init__(self, fuzzer: str, benchmark: str, trial_num: int):
+    def __init__(self, fuzzer: str, benchmark: str, trial_num: int,
+                 trial_group_num: int):
         self.fuzzer = fuzzer
         self.benchmark = benchmark
         self.trial_num = trial_num
+        self.trial_group_num = trial_group_num
         self.benchmark_fuzzer_trial_dir = exp_utils.get_trial_dir(
             fuzzer, benchmark, trial_num)
         self.work_dir = exp_utils.get_work_dir()