Skip to content

Fix(experiment): Update coverage calculation logic for #727 #960

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
Open
119 changes: 78 additions & 41 deletions experiment/evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ class Result:
crashes: bool = False
coverage: float = 0.0
line_coverage_diff: float = 0.0
newly_covered_lines: int = 0
total_lines: int = 0
baseline_total_lines: int = 0
coverage_report_path: str = ''
reproducer_path: str = ''
# Grammatically correct but has false positive or no cov increase at all.
Expand Down Expand Up @@ -260,6 +263,20 @@ def __init__(self, runner: builder_runner.BuilderRunner, benchmark: Benchmark,
self.builder_runner = runner
self.benchmark = benchmark
self.work_dirs = work_dirs
self.baseline_total_lines = 0
try:
# Load baseline coverage summary to get total lines.
baseline_summary = load_existing_coverage_summary(self.benchmark.project)
if baseline_summary:
target_basename = os.path.basename(self.benchmark.target_path)
self.baseline_total_lines = \
compute_total_lines_without_fuzz_targets(
baseline_summary, target_basename)
logger.info('Baseline total lines for %s: %d', self.benchmark.project,
self.baseline_total_lines)
except Exception as e:
logger.error('Failed to load baseline summary/calculate total lines: %s', e)
# Keep baseline_total_lines as 0

def build_log_path(self, generated_target_name: str, iteration: int):
return os.path.join(self.work_dirs.run_logs,
Expand Down Expand Up @@ -429,48 +446,54 @@ def check_target(self, ai_binary, target_path: str) -> Result:
run_result = None

# 2. Calculate coverage percentage and coverage diff
coverage_summary = None
total_lines = 0
coverage_percent = 0.0
coverage_diff = 0.0
if run_result:
# Gets line coverage (diff) details.
coverage_summary = self._load_existing_coverage_summary()

if self.benchmark.language in ['python', 'jvm'] and run_result.coverage:
# The Jacoco.xml coverage report used to generate summary.json on
# OSS-Fuzz for JVM projects does not trace the source file location.
# Thus the conversion may miss some classes because they are not
# present during coverage report generation. This fix gets the total
# line calculation from the jacoco.xml report of the current run
# directly and compares it with the total_lines retrieved from
# summary.json. Then the larger total_lines is used which is assumed
# to be more accurate. This is the same case for python project which
# the total line is determined from the all_cov.json file.
total_lines = run_result.coverage.total_lines
elif coverage_summary:
total_lines = compute_total_lines_without_fuzz_targets(
coverage_summary, generated_target_name)
else:
total_lines = 0

if run_result.total_pcs:
coverage_percent = run_result.cov_pcs / run_result.total_pcs
else:
dual_logger.log(
f'Warning: total_pcs == 0 in {generated_oss_fuzz_project}.')
coverage_percent = 0.0
newly_covered_lines = 0
union_total_lines = 0
current_coverage_copy = None

existing_textcov = self.load_existing_textcov()
if run_result.coverage:
run_result.coverage.subtract_covered_lines(existing_textcov)
if run_result and run_result.coverage:
current_coverage_copy = run_result.coverage.copy()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIUC, this will only create a shallow copy, so the modification below will affect the original run_result.coverage.
Try using copy.deepcopy from Python's built-in copy module instead.
We can add a function in class Textcov for this.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The line current_coverage_copy = run_result.coverage.copy() in evaluator.py should therefore already be performing a deep copy, preventing unintended modifications to the original run_result.coverage object. I think it will do a deep copy; please correct me if I am wrong.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@demoncoder-crypto Could you confirm this via a simple script?
The coverage is a Textcov, which is a dataclass and does not have a built-in copy method:

(Pdb) from experiment.textcov import Textcov
(Pdb) cov = Textcov()
(Pdb) cov.copy()
*** AttributeError: 'Textcov' object has no attribute 'copy'


if total_lines and run_result.coverage:
coverage_diff = run_result.coverage.covered_lines / total_lines
else:
dual_logger.log(
f'Warning: total_lines == 0 in {generated_oss_fuzz_project}.')
coverage_diff = 0.0
if run_result:
existing_textcov = self.load_existing_textcov()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

could you please add a TODO for this?
TODO(dongge): Move load_existing_textcov to OSS-Fuzz module so that we only need to run it once.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.


if current_coverage_copy:
current_coverage_copy.merge(existing_textcov)
union_total_lines = current_coverage_copy.total_lines
else:
union_total_lines = existing_textcov.total_lines

if run_result.coverage:
run_result.coverage.subtract_covered_lines(existing_textcov)
newly_covered_lines = run_result.coverage.covered_lines
else:
newly_covered_lines = 0
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I reckon the conditions of these two blocks (if current_coverage_copy: and if run_result.coverage:) are essentially the same?
We can merge them for simplicity.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes we can merge, good point, thanks


if union_total_lines > 0:
coverage_diff = newly_covered_lines / union_total_lines
else:
if newly_covered_lines > 0:
dual_logger.log(
f'Warning: union_total_lines is 0 but newly_covered_lines is {newly_covered_lines}. Cannot calculate coverage diff accurately.'
)
coverage_diff = 0.0
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As this part is getting more complex, could you please separate it into an individual function?
Thanks!


total_lines_for_percent = 0
if self.benchmark.language in ['python', 'jvm'] and run_result.coverage:
if current_coverage_copy:
total_lines_for_percent = current_coverage_copy.total_lines
elif self._load_existing_coverage_summary():
coverage_summary = self._load_existing_coverage_summary()
total_lines_for_percent = compute_total_lines_without_fuzz_targets(
coverage_summary, generated_target_name)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need to modify the logic here?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As per my understanding, this logic ensures that the overall coverage_percent accurately reflects the coverage achieved by the current fuzz target using its own relevant lines as the denominator, distinct from the coverage_diff, which uses the union of lines. It correctly uses the pre-subtraction coverage data stored in current_coverage_copy for this calculation. That is my understanding of this; please correct me if I am wrong.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As per my understanding this logic ensures that the overall coverage_percent accurately reflects the coverage achieved by the current fuzz target using its own relevant lines as the denominator, distinct from the coverage_diff which uses the union of lines.

This makes sense, but:

  1. Is there any code using total_lines_for_percent after you assigned it?
  2. Why did we prefer to remove coverage_summary = self._load_existing_coverage_summary() from the top and call the function twice?
    Original code
      if run_result:
        # Gets line coverage (diff) details.
        coverage_summary = self._load_existing_coverage_summary()
...
        elif coverage_summary:
          total_lines = compute_total_lines_without_fuzz_targets(
              coverage_summary, generated_target_name)
  3. Why did we remove the final else?
    Original code:
        elif coverage_summary:
          total_lines = compute_total_lines_without_fuzz_targets(
              coverage_summary, generated_target_name)
        else:
          total_lines = 0
  4. Could you please add back the comment for JVM and Python? Thanks.
    Original code:
          # The Jacoco.xml coverage report used to generate summary.json on
          # OSS-Fuzz for JVM projects does not trace the source file location.
          # Thus the conversion may miss some classes because they are not
          # present during coverage report generation. This fix gets the total
          # line calculation from the jacoco.xml report of the current run
          # directly and compares it with the total_lines retrieved from
          # summary.json. Then the larger total_lines is used which is assumed
          # to be more accurate. This is the same case for python project which
          # the total line is determined from the all_cov.json file.


if run_result.total_pcs:
coverage_percent = run_result.cov_pcs / run_result.total_pcs
else:
dual_logger.log(
f'Warning: Could not determine total lines for percentage calculation in {generated_oss_fuzz_project}.')
coverage_percent = 0.0

if self.benchmark.language == 'jvm':
# For JVM, the generation is consider success if either is true
Expand Down Expand Up @@ -517,6 +540,9 @@ def check_target(self, ai_binary, target_path: str) -> Result:
False,
0.0,
0.0,
0,
0,
0,
'',
'',
False,
Expand All @@ -538,6 +564,9 @@ def check_target(self, ai_binary, target_path: str) -> Result:
False,
0.0,
0.0,
0,
0,
0,
'',
'',
False,
Expand Down Expand Up @@ -567,26 +596,34 @@ def check_target(self, ai_binary, target_path: str) -> Result:
run_result.crashes,
0.0,
0.0,
0,
0,
0,
'',
'',
not run_result.succeeded,
run_result.semantic_check.type,
run_result.triage,
textcov.Textcov(),
compile_error=build_result.log_path,
compile_log=build_result.log_path))

dual_logger.log(
f'Result for {generated_oss_fuzz_project}: '
f'crashes={run_result.crashes}, coverage={coverage_percent} '
f'crashes={run_result.crashes}, coverage={coverage_percent:.4f} '
f'({run_result.cov_pcs}/{run_result.total_pcs}), '
f'coverage diff={coverage_diff} '
f'({run_result.coverage.covered_lines}/{total_lines})')
f'newly covered lines={newly_covered_lines}, union total lines={union_total_lines}, baseline total lines={self.baseline_total_lines}, '
f'coverage diff={coverage_diff:.4f}'
)
return dual_logger.return_result(
Result(False,
True,
run_result.crashes,
coverage_percent,
coverage_diff,
newly_covered_lines,
union_total_lines,
self.baseline_total_lines,
run_result.coverage_report_path,
run_result.reproducer_path,
not run_result.succeeded,
Expand Down
107 changes: 79 additions & 28 deletions run_one_experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ class AggregatedResult:
found_bug: int = 0
max_coverage: float = 0.0
max_line_coverage_diff: float = 0.0
max_newly_covered_lines: int = 0
total_lines: int = 0
baseline_total_lines: int = 0
max_coverage_sample: str = ''
max_coverage_diff_sample: str = ''
max_coverage_diff_report: str = ''
Expand All @@ -81,7 +84,10 @@ def __str__(self):
f'crash rate: {self.crash_rate}, '
f'found bug: {self.found_bug}, '
f'max coverage: {self.max_coverage}, '
f'max line coverage diff: {self.max_line_coverage_diff}\n'
f'max line coverage diff: {self.max_line_coverage_diff}, '
f'max newly covered lines: {self.max_newly_covered_lines}, '
f'total lines: {self.total_lines}, '
f'baseline total lines: {self.baseline_total_lines}\n'
f'max coverage sample: {self.max_coverage_sample}\n'
f'max coverage diff sample: {self.max_coverage_diff_sample}\n'
f'max coverage diff report: {self.max_coverage_diff_report or "None"}')
Expand Down Expand Up @@ -148,38 +154,83 @@ def fix_code(work_dirs: WorkDirs, generated_targets: List[str]) -> List[str]:
def aggregate_results(target_stats: list[tuple[int, exp_evaluator.Result]],
generated_targets: list[str]) -> AggregatedResult:
"""Aggregates experiment status and results of a targets."""
build_success_count = sum([int(stat.compiles) for _, stat in target_stats])
if not target_stats:
return AggregatedResult()

build_success_count = sum(int(stat.compiles) for _, stat in target_stats)
build_success_rate = build_success_count / len(target_stats)
crash_rate = sum([int(stat.crashes) for _, stat in target_stats
]) / len(target_stats)
found_bug = sum([
int(stat.crashes and not stat.is_semantic_error)
for _, stat in target_stats
])
max_coverage = max([stat.coverage for _, stat in target_stats])
max_line_coverage_diff = max(
[stat.line_coverage_diff for _, stat in target_stats])
crash_rate = sum(int(stat.crashes) for _, stat in target_stats) / len(target_stats)
found_bug = sum(int(stat.crashes and not stat.is_semantic_error)
for _, stat in target_stats)

max_coverage_sample = ''
max_coverage_diff_sample = ''
max_coverage_diff_report = ''
max_coverage = 0.0
max_line_coverage_diff = -1.0 # Initialize to handle cases where diff might be 0
max_newly_covered_lines = 0

all_textcov = textcov.Textcov()
for i, stat in target_stats:
if stat.coverage == max_coverage:
max_coverage_sample = generated_targets[i]
best_cov_idx = -1
best_diff_idx = -1
best_diff_stat = None

if stat.line_coverage_diff == max_line_coverage_diff:
max_coverage_diff_sample = generated_targets[i]
max_coverage_diff_report = stat.coverage_report_path

if isinstance(stat.textcov_diff, textcov.Textcov):
all_textcov.merge(stat.textcov_diff)
all_textcov = textcov.Textcov()

return AggregatedResult(build_success_count, build_success_rate, crash_rate,
found_bug, max_coverage, max_line_coverage_diff,
max_coverage_sample, max_coverage_diff_sample,
max_coverage_diff_report, all_textcov)
for i, stat in target_stats:
# Aggregate textcov diffs
if isinstance(stat.textcov_diff, textcov.Textcov):
all_textcov.merge(stat.textcov_diff)

# Find best coverage
if stat.coverage > max_coverage:
max_coverage = stat.coverage
best_cov_idx = i

# Find best coverage diff
if stat.line_coverage_diff > max_line_coverage_diff:
max_line_coverage_diff = stat.line_coverage_diff
best_diff_idx = i
best_diff_stat = stat # Store the stat object with the best diff

# Keep track of overall max newly covered lines (optional, could also be from best_diff_stat)
if stat.newly_covered_lines > max_newly_covered_lines:
max_newly_covered_lines = stat.newly_covered_lines

# Extract info from the best performing stats
max_coverage_sample = generated_targets[best_cov_idx] if best_cov_idx != -1 else ''
max_coverage_diff_sample = generated_targets[best_diff_idx] if best_diff_idx != -1 else ''

# Get metrics from the stat that produced the best diff
union_total_lines = 0
baseline_total_lines = 0
max_coverage_diff_report = ''
if best_diff_stat:
# Use union_total_lines calculated in evaluator for the 'total_lines' field
union_total_lines = best_diff_stat.total_lines
baseline_total_lines = best_diff_stat.baseline_total_lines
max_coverage_diff_report = best_diff_stat.coverage_report_path
# We could also take max_newly_covered_lines from here if desired:
# max_newly_covered_lines = best_diff_stat.newly_covered_lines

# Handle case where max_line_coverage_diff might remain -1 (e.g., if all diffs were 0 or less)
if max_line_coverage_diff < 0:
max_line_coverage_diff = 0.0

# Pass all arguments as keywords
return AggregatedResult(
build_success_count=build_success_count,
build_success_rate=build_success_rate,
crash_rate=crash_rate,
found_bug=found_bug,
max_coverage=max_coverage,
max_line_coverage_diff=max_line_coverage_diff,
# Storing the overall max newly covered lines encountered
max_newly_covered_lines=max_newly_covered_lines,
# Storing the union_total_lines from the run with the max diff
total_lines=union_total_lines,
# Storing the baseline_total_lines from the run with the max diff
baseline_total_lines=baseline_total_lines,
max_coverage_sample=max_coverage_sample,
max_coverage_diff_sample=max_coverage_diff_sample,
max_coverage_diff_report=max_coverage_diff_report,
full_textcov_diff=all_textcov)


def check_targets(
Expand Down