google · demoncoder-crypto · Apr 3, 2025 · Apr 4, 2025 · Apr 12, 2025 · Apr 15, 2025
diff --git a/experiment/evaluator.py b/experiment/evaluator.py
@@ -49,6 +49,9 @@ class Result:
   crashes: bool = False
   coverage: float = 0.0
   line_coverage_diff: float = 0.0
+  newly_covered_lines: int = 0
+  total_lines: int = 0
+  baseline_total_lines: int = 0
   coverage_report_path: str = ''
   reproducer_path: str = ''
   # Grammatically correct but has false positive or no cov increase at all.
@@ -260,6 +263,20 @@ def __init__(self, runner: builder_runner.BuilderRunner, benchmark: Benchmark,
     self.builder_runner = runner
     self.benchmark = benchmark
     self.work_dirs = work_dirs
+    self.baseline_total_lines = 0
+    try:
+      # Load baseline coverage summary to get total lines.
+      baseline_summary = load_existing_coverage_summary(self.benchmark.project)
+      if baseline_summary:
+          target_basename = os.path.basename(self.benchmark.target_path)
+          self.baseline_total_lines = \
+              compute_total_lines_without_fuzz_targets(
+                  baseline_summary, target_basename)
+      logger.info('Baseline total lines for %s: %d', self.benchmark.project,
+                  self.baseline_total_lines)
+    except Exception as e:
+        logger.error('Failed to load baseline summary/calculate total lines: %s', e)
+        # Keep baseline_total_lines as 0
 
   def build_log_path(self, generated_target_name: str, iteration: int):
     return os.path.join(self.work_dirs.run_logs,
@@ -429,48 +446,54 @@ def check_target(self, ai_binary, target_path: str) -> Result:
         run_result = None
 
       # 2. Calculate coverage percentage and coverage diff
-      coverage_summary = None
-      total_lines = 0
       coverage_percent = 0.0
       coverage_diff = 0.0
-      if run_result:
-        # Gets line coverage (diff) details.
-        coverage_summary = self._load_existing_coverage_summary()
-
-        if self.benchmark.language in ['python', 'jvm'] and run_result.coverage:
-          # The Jacoco.xml coverage report used to generate summary.json on
-          # OSS-Fuzz for JVM projects does not trace the source file location.
-          # Thus the conversion may miss some classes because they are not
-          # present during coverage report generation. This fix gets the total
-          # line calculation from the jacoco.xml report of the current run
-          # directly and compares it with the total_lines retrieved from
-          # summary.json. Then the larger total_lines is used which is assumed
-          # to be more accurate. This is the same case for python project which
-          # the total line is determined from the all_cov.json file.
-          total_lines = run_result.coverage.total_lines
-        elif coverage_summary:
-          total_lines = compute_total_lines_without_fuzz_targets(
-              coverage_summary, generated_target_name)
-        else:
-          total_lines = 0
-
-        if run_result.total_pcs:
-          coverage_percent = run_result.cov_pcs / run_result.total_pcs
-        else:
-          dual_logger.log(
-              f'Warning: total_pcs == 0 in {generated_oss_fuzz_project}.')
-          coverage_percent = 0.0
+      newly_covered_lines = 0
+      union_total_lines = 0
+      current_coverage_copy = None
 
-        existing_textcov = self.load_existing_textcov()
-        if run_result.coverage:
-          run_result.coverage.subtract_covered_lines(existing_textcov)
+      if run_result and run_result.coverage:
+          current_coverage_copy = run_result.coverage.copy()
 
-        if total_lines and run_result.coverage:
-          coverage_diff = run_result.coverage.covered_lines / total_lines
-        else:
-          dual_logger.log(
-              f'Warning: total_lines == 0 in {generated_oss_fuzz_project}.')
-          coverage_diff = 0.0
+      if run_result:
+          existing_textcov = self.load_existing_textcov()
+
+          if current_coverage_copy:
+              current_coverage_copy.merge(existing_textcov)
+              union_total_lines = current_coverage_copy.total_lines
+          else:
+              union_total_lines = existing_textcov.total_lines
+
+          if run_result.coverage:
+              run_result.coverage.subtract_covered_lines(existing_textcov)
+              newly_covered_lines = run_result.coverage.covered_lines
+          else:
+              newly_covered_lines = 0
+
+          if union_total_lines > 0:
+              coverage_diff = newly_covered_lines / union_total_lines
+          else:
+              if newly_covered_lines > 0:
+                   dual_logger.log(
+                       f'Warning: union_total_lines is 0 but newly_covered_lines is {newly_covered_lines}. Cannot calculate coverage diff accurately.'
+                   )
+              coverage_diff = 0.0
+
+          total_lines_for_percent = 0
+          if self.benchmark.language in ['python', 'jvm'] and run_result.coverage:
+             if current_coverage_copy:
+                 total_lines_for_percent = current_coverage_copy.total_lines
+          elif self._load_existing_coverage_summary():
+              coverage_summary = self._load_existing_coverage_summary()
+              total_lines_for_percent = compute_total_lines_without_fuzz_targets(
+                  coverage_summary, generated_target_name)
+
+          if run_result.total_pcs:
+              coverage_percent = run_result.cov_pcs / run_result.total_pcs
+          else:
+              dual_logger.log(
+                  f'Warning: Could not determine total lines for percentage calculation in {generated_oss_fuzz_project}.')
+              coverage_percent = 0.0
 
       if self.benchmark.language == 'jvm':
         # For JVM, the generation is consider success if either is true
@@ -517,6 +540,9 @@ def check_target(self, ai_binary, target_path: str) -> Result:
                  False,
                  0.0,
                  0.0,
+                 0,
+                 0,
+                 0,
                  '',
                  '',
                  False,
@@ -538,6 +564,9 @@ def check_target(self, ai_binary, target_path: str) -> Result:
                  False,
                  0.0,
                  0.0,
+                 0,
+                 0,
+                 0,
                  '',
                  '',
                  False,
@@ -567,26 +596,34 @@ def check_target(self, ai_binary, target_path: str) -> Result:
                  run_result.crashes,
                  0.0,
                  0.0,
+                 0,
+                 0,
+                 0,
                  '',
                  '',
                  not run_result.succeeded,
                  run_result.semantic_check.type,
                  run_result.triage,
+                 textcov.Textcov(),
                  compile_error=build_result.log_path,
                  compile_log=build_result.log_path))
 
     dual_logger.log(
         f'Result for {generated_oss_fuzz_project}: '
-        f'crashes={run_result.crashes}, coverage={coverage_percent} '
+        f'crashes={run_result.crashes}, coverage={coverage_percent:.4f} '
         f'({run_result.cov_pcs}/{run_result.total_pcs}), '
-        f'coverage diff={coverage_diff} '
-        f'({run_result.coverage.covered_lines}/{total_lines})')
+        f'newly covered lines={newly_covered_lines}, union total lines={union_total_lines}, baseline total lines={self.baseline_total_lines}, '
+        f'coverage diff={coverage_diff:.4f}'
+    )
     return dual_logger.return_result(
         Result(False,
                True,
                run_result.crashes,
                coverage_percent,
                coverage_diff,
+               newly_covered_lines,
+               union_total_lines,
+               self.baseline_total_lines,
                run_result.coverage_report_path,
                run_result.reproducer_path,
                not run_result.succeeded,

diff --git a/run_one_experiment.py b/run_one_experiment.py
@@ -69,6 +69,9 @@ class AggregatedResult:
   found_bug: int = 0
   max_coverage: float = 0.0
   max_line_coverage_diff: float = 0.0
+  max_newly_covered_lines: int = 0
+  total_lines: int = 0
+  baseline_total_lines: int = 0
   max_coverage_sample: str = ''
   max_coverage_diff_sample: str = ''
   max_coverage_diff_report: str = ''
@@ -81,7 +84,10 @@ def __str__(self):
         f'crash rate: {self.crash_rate}, '
         f'found bug: {self.found_bug}, '
         f'max coverage: {self.max_coverage}, '
-        f'max line coverage diff: {self.max_line_coverage_diff}\n'
+        f'max line coverage diff: {self.max_line_coverage_diff}, '
+        f'max newly covered lines: {self.max_newly_covered_lines}, '
+        f'total lines: {self.total_lines}, '
+        f'baseline total lines: {self.baseline_total_lines}\n'
         f'max coverage sample: {self.max_coverage_sample}\n'
         f'max coverage diff sample: {self.max_coverage_diff_sample}\n'
         f'max coverage diff report: {self.max_coverage_diff_report or "None"}')
@@ -148,38 +154,59 @@ def fix_code(work_dirs: WorkDirs, generated_targets: List[str]) -> List[str]:
 def aggregate_results(target_stats: list[tuple[int, exp_evaluator.Result]],
                       generated_targets: list[str]) -> AggregatedResult:
-  """Aggregates experiment status and results of a targets."""
-  build_success_count = sum([int(stat.compiles) for _, stat in target_stats])
+  """Aggregates experiment status and results of a targets.
+
+  Args:
+    target_stats: (index, result) pairs; the index selects the matching entry
+      of |generated_targets|.
+    generated_targets: All generated targets, indexed by the first element of
+      each pair in |target_stats|.
+
+  Returns:
+    The aggregated build/crash statistics, the best coverage and line coverage
+    diff with the samples that produced them, and the merged textcov diff. A
+    default AggregatedResult is returned when |target_stats| is empty.
+  """
+  if not target_stats:
+    return AggregatedResult()
+
+  build_success_count = sum(int(stat.compiles) for _, stat in target_stats)
   build_success_rate = build_success_count / len(target_stats)
-  crash_rate = sum([int(stat.crashes) for _, stat in target_stats
-                   ]) / len(target_stats)
-  found_bug = sum([
-      int(stat.crashes and not stat.is_semantic_error)
-      for _, stat in target_stats
-  ])
-  max_coverage = max([stat.coverage for _, stat in target_stats])
-  max_line_coverage_diff = max(
-      [stat.line_coverage_diff for _, stat in target_stats])
+  crash_rate = (sum(int(stat.crashes) for _, stat in target_stats) /
+                len(target_stats))
+  found_bug = sum(
+      int(stat.crashes and not stat.is_semantic_error)
+      for _, stat in target_stats)
 
-  max_coverage_sample = ''
-  max_coverage_diff_sample = ''
-  max_coverage_diff_report = ''
+  # max() keeps the first of equally good candidates, so ties go to the
+  # earliest target; a sample is reported even when every metric is 0.
+  best_cov_idx, best_cov_stat = max(target_stats,
+                                    key=lambda item: item[1].coverage)
+  best_diff_idx, best_diff_stat = max(
+      target_stats, key=lambda item: item[1].line_coverage_diff)
+  max_newly_covered_lines = max(
+      stat.newly_covered_lines for _, stat in target_stats)
 
   all_textcov = textcov.Textcov()
-  for i, stat in target_stats:
-    if stat.coverage == max_coverage:
-      max_coverage_sample = generated_targets[i]
-
-    if stat.line_coverage_diff == max_line_coverage_diff:
-      max_coverage_diff_sample = generated_targets[i]
-      max_coverage_diff_report = stat.coverage_report_path
-
+  for _, stat in target_stats:
     if isinstance(stat.textcov_diff, textcov.Textcov):
       all_textcov.merge(stat.textcov_diff)
 
-  return AggregatedResult(build_success_count, build_success_rate, crash_rate,
-                          found_bug, max_coverage, max_line_coverage_diff,
-                          max_coverage_sample, max_coverage_diff_sample,
-                          max_coverage_diff_report, all_textcov)
+  return AggregatedResult(
+      build_success_count=build_success_count,
+      build_success_rate=build_success_rate,
+      crash_rate=crash_rate,
+      found_bug=found_bug,
+      # Clamp at 0 so that a negative metric is never reported as the best.
+      max_coverage=max(0.0, best_cov_stat.coverage),
+      max_line_coverage_diff=max(0.0, best_diff_stat.line_coverage_diff),
+      max_newly_covered_lines=max(0, max_newly_covered_lines),
+      # Line totals are taken from the run with the best coverage diff.
+      total_lines=best_diff_stat.total_lines,
+      baseline_total_lines=best_diff_stat.baseline_total_lines,
+      max_coverage_sample=generated_targets[best_cov_idx],
+      max_coverage_diff_sample=generated_targets[best_diff_idx],
+      max_coverage_diff_report=best_diff_stat.coverage_report_path,
+      full_textcov_diff=all_textcov)
 
 
 def check_targets(