Skip to content

Commit 483cd92

Browse files
committed
Misc: Add script creating DCA source suites from MRVA
The script takes the URL of an MRVA-exported Gist and uses it to produce a source suite compatible with DCA. At present, you have to manually specify the language on the command line, using the `--language` parameter. It also supports `--min` and `--max` parameters if you want to limit the sources to ones with a bounded number of alerts.
1 parent d8ca8dd commit 483cd92

File tree

1 file changed

+134
-0
lines changed

1 file changed

+134
-0
lines changed
Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
import argparse
import os
import re
import shutil
import subprocess
import sys
import tempfile
from collections import defaultdict
7+
8+
# Free-form usage notes; passed to argparse as the epilog so they appear at the end of `--help`.
help_text = """
To use this script, pass the URL of a GitHub Gist as an argument. The Gist should contain the
exported MarkDown output of a MRVA run.

The script clones the Gist to a temporary directory, and constructs a DCA source suite that covers the same repos/SHAs that had results in the Gist.

Additionally, you can limit the list of repos to just the ones for which number of results are within a given range, by passing the --min and --max arguments.
"""
16+
17+
def clone_gist(gist_url, repo_dir):
    """
    Clones the Gist at `gist_url` into `repo_dir` by running `gh gist clone`.

    If the command exits with a non-zero status, or cannot be started at all
    (for example because `gh` is not installed), a failure message is printed,
    `repo_dir` is removed, and the process exits with status 1.
    """
    try:
        subprocess.run(
            ["gh", "gist", "clone", gist_url, repo_dir],
            check=True,
            # Capture stderr rather than discarding it, so the reason for a
            # failure can be reported below.
            stderr=subprocess.PIPE,
            text=True,
        )
    except (subprocess.CalledProcessError, OSError) as err:
        print(f"Failed to clone the gist from {gist_url}")
        # CalledProcessError carries the captured stderr; an OSError (e.g. `gh`
        # missing from PATH) only has its own message.
        reason = (getattr(err, "stderr", None) or str(err)).strip()
        if reason:
            print(reason, file=sys.stderr)
        # Portable replacement for `rm -rf`; the directory may already be gone.
        shutil.rmtree(repo_dir, ignore_errors=True)
        sys.exit(1)
28+
29+
def get_mrva_test_name(repo_dir):
    """
    Returns a name for the MRVA test, based on the first "Results for" header
    of the _summary.md file in `repo_dir`, with spaces replaced by hyphens.

    Returns "unknown-name" if no such header with a double-quoted name exists.
    """
    # Format of first header: ### Results for "name goes here"
    # In this case, the return value is "name-goes-here"
    summary_path = os.path.join(repo_dir, "_summary.md")
    with open(summary_path, "r", encoding="utf-8") as summary_file:
        for line in summary_file:
            if not line.startswith("### Results for"):
                continue
            # A properly quoted name splits into at least three pieces:
            # before the opening quote, the name, after the closing quote.
            # Headers without one are skipped instead of raising IndexError.
            pieces = line.split('"')
            if len(pieces) >= 3:
                return pieces[1].replace(" ", "-")
    return "unknown-name"
42+
43+
def get_repo_alert_counts(repo_dir):
    """
    Parses the Summary table in the _summary.md file to produce a dict mapping repo NWOs to alert counts.

    Only lines after the "### Summary" header are considered; if the header is
    missing, the returned dict is empty.
    """
    # Example line: | Nuitka/Nuitka | [45 result(s)](#file-result-01-Nuitka-Nuitka-md) |
    # The \s* parts absorb any cell padding so that it never ends up in the NWO,
    # which is later used as a lookup key and as the slug in the source suite.
    row_re = re.compile(r"\|\s*([^|]+?)\s*\|\s*\[([0-9,]+) result")

    counts = {}
    summary_path = os.path.join(repo_dir, "_summary.md")
    with open(summary_path, "r", encoding="utf-8") as summary_file:
        # Skip ahead to the Summary
        for line in summary_file:
            if line.startswith("### Summary"):
                break

        # The file iterator continues where the loop above stopped.
        for line in summary_file:
            m = row_re.match(line)
            if m:
                nwo, count = m.groups()
                # Counts may contain thousands separators, e.g. "1,234".
                counts[nwo] = int(count.replace(",", ""))
    return counts
63+
64+
def get_repo_nwo_shas(repo_dir):
    """
    Parses each non _summary.md file in the repo_dir to produce a dict mapping repo NWOs to their corresponding SHAs.

    Only files ending in ".md" are read, and only the first matching link in
    each file is used.
    """
    # We want to look for a match in the file of the form
    # github.com/Nuitka/Nuitka/blob/b289ee4f9d55172ed5165dab262d49bfa9cb2586/
    # and extract the NWO (as a single unit) and SHA.
    # The dot is escaped so that only the literal host name matches.
    nwo_sha_re = re.compile(r"github\.com/([^/]+/[^/]+)/blob/([0-9a-f]{40})/")

    repo_nwo_shas = {}
    # Sorted so that the outcome does not depend on directory listing order
    # when several files mention the same NWO (the last one wins).
    for filename in sorted(os.listdir(repo_dir)):
        if not filename.endswith(".md") or filename == "_summary.md":
            continue
        # Undecodable bytes are replaced rather than aborting the run; the
        # pattern above only needs ASCII.
        path = os.path.join(repo_dir, filename)
        with open(path, "r", encoding="utf-8", errors="replace") as file:
            for line in file:
                m = nwo_sha_re.search(line)
                if m:
                    nwo, sha = m.groups()
                    repo_nwo_shas[nwo] = sha
                    break
    return repo_nwo_shas
84+
85+
if __name__ == "__main__":
86+
parser = argparse.ArgumentParser(description="Calculate MRVA totals from a GitHub Gist", epilog=help_text, formatter_class=argparse.RawTextHelpFormatter)
87+
parser.add_argument("gist_url", nargs='?', help="URL of the GitHub Gist")
88+
parser.add_argument("--keep-dir", action="store_true", help="Keep the temporary directory")
89+
parser.add_argument("--min", type=int, help="Minimum number of alerts in repo")
90+
parser.add_argument("--max", type=int, help="Maximum number of alerts in repo")
91+
parser.add_argument("--language", type=str, required=True, help="Language of the MRVA run")
92+
93+
args = parser.parse_args()
94+
95+
if not args.gist_url:
96+
parser.print_help()
97+
exit(1)
98+
99+
repo_dir = tempfile.mkdtemp(dir=".")
100+
clone_gist(args.gist_url, repo_dir)
101+
102+
repo_alerts = get_repo_alert_counts(repo_dir)
103+
repo_nwo_shas = get_repo_nwo_shas(repo_dir)
104+
105+
min_count = args.min if args.min else min(repo_alerts.values())
106+
max_count = args.max if args.max else max(repo_alerts.values())
107+
108+
filtered_alerts = {
109+
nwo: count for nwo, count in repo_alerts.items() if min_count <= count <= max_count
110+
}
111+
112+
test_name = get_mrva_test_name(repo_dir)
113+
114+
source_suite_name = f"{test_name}"
115+
if args.min:
116+
source_suite_name += f"-min-{args.min}"
117+
if args.max:
118+
source_suite_name += f"-max-{args.max}"
119+
source_suite_name += ".yml"
120+
121+
with open(source_suite_name, "w") as source_suite_file:
122+
source_suite_file.write("# This file was generated by misc/scripts/mrva-to-dca-source-suite.py\n")
123+
source_suite_file.write(f"# Input Gist: {args.gist_url}\n\n")
124+
for nwo, count in filtered_alerts.items():
125+
source_suite_file.write(f"- language: {args.language}\n")
126+
source_suite_file.write(f" sha: {repo_nwo_shas[nwo]}\n")
127+
source_suite_file.write(f" slug: {nwo} # Alert count: {count}\n")
128+
129+
print(f"Source suite written to {source_suite_name}")
130+
131+
if args.keep_dir:
132+
print(f"Temporary directory retained at: {repo_dir}")
133+
else:
134+
subprocess.run(["rm", "-rf", repo_dir])

0 commit comments

Comments
 (0)