Misc: Add script creating DCA source suites from MRVA

tausbn · tausbn · commit 483cd929ffac · 2025-04-07T13:16:11.000Z
The script takes the URL of an MRVA-exported Gist and uses it to produce
a source suite compatible with DCA.

At present, you have to manually specify the language on the
command line, using the `--language` parameter.

Also supports `--min` and `--max` parameters if you want to limit the
sources to ones with a bounded number of alerts.
diff --git a/misc/scripts/mrva-to-dca-source-suite.py b/misc/scripts/mrva-to-dca-source-suite.py
@@ -0,0 +1,134 @@
+import argparse
+import os
+import re
+import shutil
+import subprocess
+import sys
+import tempfile
+from collections import defaultdict
+
+# Long-form usage notes. Passed to argparse as the epilog of the --help output; the parser
+# uses RawTextHelpFormatter, so the line breaks below are shown exactly as written.
+help_text = """
+To use this script, pass the URL of a GitHub Gist as an argument. The Gist should contain the
+exported MarkDown output of a MRVA run.
+
+The script clones the Gist to a temporary directory, and constructs a DCA source suite that covers the same repos/SHAs that had results in the Gist.
+
+Additionally, you can limit the list of repos to just the ones for which number of results are within a given range, by passing the --min and --max arguments.
+"""
+
+def clone_gist(gist_url, repo_dir):
+    """
+    Clones the Gist at gist_url into repo_dir using the `gh` command-line tool.
+
+    If the clone fails, or `gh` cannot be started at all (e.g. it is not installed), an error
+    is printed to stderr, repo_dir is removed, and the script exits with status 1.
+    """
+    try:
+        subprocess.run(
+            ["gh", "gist", "clone", gist_url, repo_dir],
+            check=True,
+            stderr=subprocess.DEVNULL
+        )
+    except (subprocess.CalledProcessError, OSError):
+        # OSError covers the case where the `gh` executable is missing or not runnable.
+        print(f"Failed to clone the gist from {gist_url}", file=sys.stderr)
+        # Remove the directory in-process rather than shelling out to `rm -rf`, which is
+        # not available on every platform.
+        shutil.rmtree(repo_dir, ignore_errors=True)
+        sys.exit(1)
+
+def get_mrva_test_name(repo_dir):
+    """
+    Returns a kebab-case name for the MRVA test, based on the first header of the _summary.md file.
+
+    Returns "unknown-name" if the file has no "### Results for" header, or if that header
+    does not contain a name enclosed in double quotes.
+    """
+    # Format of first header: ### Results for "name goes here"
+    # In this case, the return value is "name-goes-here"
+    with open(os.path.join(repo_dir, "_summary.md"), "r", encoding="utf-8") as summary_file:
+        # Find the first line that matches "Results for"
+        for line in summary_file:
+            if not line.startswith("### Results for"):
+                continue
+            # A properly quoted name splits into at least three parts:
+            # text before the opening quote, the name, and text after the closing quote.
+            parts = line.split('"')
+            if len(parts) >= 3:
+                return parts[1].replace(" ", "-")
+            # Header present but without a complete quoted name: use the fallback
+            # instead of raising IndexError or returning a name with a trailing newline.
+            break
+    return "unknown-name"
+
+def get_repo_alert_counts(repo_dir):
+    """
+    Reads the table that follows the "### Summary" header in _summary.md and returns a dict
+    mapping each repo NWO to its alert count (as an int).
+    """
+    # A table row looks like:
+    # | Nuitka/Nuitka | [45 result(s)](#file-result-01-Nuitka-Nuitka-md) |
+    # The count may contain thousands separators, which are stripped before conversion.
+    row_pattern = re.compile(r"\| ([^|]+) \| \[([0-9,]+) result")
+    summary_path = os.path.join(repo_dir, "_summary.md")
+
+    counts = {}
+    with open(summary_path, "r") as summary_file:
+        in_summary = False
+        for line in summary_file:
+            if not in_summary:
+                # Everything up to and including the Summary header is ignored.
+                in_summary = line.startswith("### Summary")
+                continue
+            row = row_pattern.match(line)
+            if row is None:
+                continue
+            counts[row.group(1)] = int(row.group(2).replace(",", ""))
+    return counts
+
+def get_repo_nwo_shas(repo_dir):
+    """
+    Parses each non _summary.md file in the repo_dir to produce a dict mapping repo NWOs to their corresponding SHAs.
+
+    Only the first matching blob URL in each .md file is used. Files are visited in sorted
+    order, so if two files mention the same NWO the result is deterministic (the later file wins).
+    """
+    # We want to look for a match in the file of the form
+    # github.com/Nuitka/Nuitka/blob/b289ee4f9d55172ed5165dab262d49bfa9cb2586/
+    # and extract the NWO (as a single unit) and SHA.
+    # The dot is escaped so that only a literal "github.com" host matches.
+    nwo_sha_re = re.compile(r"github\.com/([^/]+/[^/]+)/blob/([0-9a-f]{40})/")
+
+    repo_nwo_shas = {}
+    for filename in sorted(os.listdir(repo_dir)):
+        if not filename.endswith(".md") or filename == "_summary.md":
+            continue
+        with open(os.path.join(repo_dir, filename), "r", encoding="utf-8") as file:
+            for line in file:
+                m = nwo_sha_re.search(line)
+                if m:
+                    nwo, sha = m.groups()
+                    repo_nwo_shas[nwo] = sha
+                    # One SHA per result file is enough; skip the rest of the file.
+                    break
+    return repo_nwo_shas
+
+def main():
+    """
+    Command-line entry point: clones the Gist, reads the MRVA results, and writes the DCA source suite.
+
+    The suite is written to the current directory as <test-name>[-min-N][-max-M].yml.
+    The temporary clone is removed afterwards (also when an error occurs) unless --keep-dir is given.
+    Exits with status 1 if no Gist URL is given or if no repositories are found in the summary.
+    """
+    parser = argparse.ArgumentParser(description="Create a DCA source suite from the Gist of an exported MRVA run", epilog=help_text, formatter_class=argparse.RawTextHelpFormatter)
+    parser.add_argument("gist_url", nargs='?', help="URL of the GitHub Gist")
+    parser.add_argument("--keep-dir", action="store_true", help="Keep the temporary directory")
+    parser.add_argument("--min", type=int, help="Minimum number of alerts in repo")
+    parser.add_argument("--max", type=int, help="Maximum number of alerts in repo")
+    parser.add_argument("--language", type=str, required=True, help="Language of the MRVA run")
+
+    args = parser.parse_args()
+
+    if not args.gist_url:
+        parser.print_help()
+        sys.exit(1)
+
+    repo_dir = tempfile.mkdtemp(dir=".")
+    try:
+        clone_gist(args.gist_url, repo_dir)
+
+        repo_alerts = get_repo_alert_counts(repo_dir)
+        if not repo_alerts:
+            print("No repositories with results were found in the Gist's _summary.md", file=sys.stderr)
+            sys.exit(1)
+        repo_nwo_shas = get_repo_nwo_shas(repo_dir)
+
+        # Compare against None rather than relying on truthiness, so that an explicit
+        # bound of 0 (e.g. --max 0) is honoured instead of being treated as "not given".
+        has_min = args.min is not None
+        has_max = args.max is not None
+
+        filtered_alerts = {
+            nwo: count
+            for nwo, count in repo_alerts.items()
+            if (not has_min or count >= args.min) and (not has_max or count <= args.max)
+        }
+
+        source_suite_name = get_mrva_test_name(repo_dir)
+        if has_min:
+            source_suite_name += f"-min-{args.min}"
+        if has_max:
+            source_suite_name += f"-max-{args.max}"
+        source_suite_name += ".yml"
+
+        with open(source_suite_name, "w") as source_suite_file:
+            source_suite_file.write("# This file was generated by misc/scripts/mrva-to-dca-source-suite.py\n")
+            source_suite_file.write(f"# Input Gist: {args.gist_url}\n\n")
+            for nwo, count in filtered_alerts.items():
+                sha = repo_nwo_shas.get(nwo)
+                if sha is None:
+                    # A summary row without a matching blob URL cannot be pinned to a commit.
+                    print(f"Skipping {nwo}: no SHA found in the Gist", file=sys.stderr)
+                    continue
+                source_suite_file.write(f"- language: {args.language}\n")
+                source_suite_file.write(f"  sha: {sha}\n")
+                source_suite_file.write(f"  slug: {nwo} # Alert count: {count}\n")
+
+        print(f"Source suite written to {source_suite_name}")
+    finally:
+        # Runs on success, on errors, and on sys.exit(), so the clone never leaks by accident.
+        if not args.keep_dir:
+            shutil.rmtree(repo_dir, ignore_errors=True)
+        elif os.path.isdir(repo_dir):
+            # The directory may already be gone if the clone itself failed.
+            print(f"Temporary directory retained at: {repo_dir}")
+
+if __name__ == "__main__":
+    main()