codeql/misc/scripts/mrva-to-dca-source-suite.py at 483cd929ffac12cccd5fe51b387bdd780f383b7e · github/codeql · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import argparse
import os
import re
import shutil
import subprocess
import sys
import tempfile
from collections import defaultdict

# Long-form usage text. It is handed to argparse as the parser's `epilog` in the
# script's entry point, so it is shown as part of the --help output.
help_text = """
To use this script, pass the URL of a GitHub Gist as an argument. The Gist should contain the
exported MarkDown output of a MRVA run.

The script clones the Gist to a temporary directory, and constructs a DCA source suite that covers the same repos/SHAs that had results in the Gist.

Additionally, you can limit the list of repos to just the ones for which number of results are within a given range, by passing the --min and --max arguments.
"""

def clone_gist(gist_url, repo_dir):
    """
    Clones the Gist at gist_url into repo_dir by running `gh gist clone`.

    If the clone fails -- either because the command exits with a non-zero status, or
    because the `gh` executable cannot be launched at all -- a message is printed,
    repo_dir is removed, and the script terminates with exit status 1.
    """
    try:
        subprocess.run(
            ["gh", "gist", "clone", gist_url, repo_dir],
            check=True,
            stderr=subprocess.DEVNULL
        )
    except (subprocess.CalledProcessError, OSError):
        # OSError (e.g. FileNotFoundError) is what subprocess raises when `gh` is not
        # installed or not executable; treat it like any other clone failure instead
        # of dying with a traceback and leaving repo_dir behind.
        print(f"Failed to clone the gist from {gist_url}")
        # shutil.rmtree instead of spawning `rm -rf`, which only exists on POSIX systems.
        shutil.rmtree(repo_dir, ignore_errors=True)
        sys.exit(1)

def get_mrva_test_name(repo_dir):
    """
    Returns a name for the MRVA test, based on the first usable "Results for" header of
    the _summary.md file in repo_dir.

    The name is the double-quoted text of the header with every space replaced by a
    hyphen (letter case is left as-is). If no header with a quoted name is found,
    "unknown-name" is returned.
    """
    # Format of first header: ### Results for "name goes here"
    # In this case, the return value is "name-goes-here"
    quoted_name_re = re.compile(r'"([^"]*)"')
    with open(os.path.join(repo_dir, "_summary.md"), "r") as summary_file:
        for line in summary_file:
            if not line.startswith("### Results for"):
                continue
            # A header without a quoted name used to raise IndexError; skip it so the
            # caller gets the documented fallback instead.
            m = quoted_name_re.search(line)
            if m:
                return m.group(1).replace(" ", "-")
    return "unknown-name"

def get_repo_alert_counts(repo_dir):
    """
    Reads the table that follows the "### Summary" header of _summary.md and returns a
    dict whose keys are repo NWOs and whose values are the alert counts as ints.
    """
    # A table row looks like:
    #   | Nuitka/Nuitka | [45 result(s)](#file-result-01-Nuitka-Nuitka-md) |
    # The count may contain thousands separators (","), which are stripped before conversion.
    row_pattern = re.compile(r"\| ([^|]+) \| \[([0-9,]+) result")
    summary_path = os.path.join(repo_dir, "_summary.md")

    counts = {}
    in_summary = False
    with open(summary_path, "r") as handle:
        for row in handle:
            if not in_summary:
                # Everything up to and including the Summary header is ignored.
                in_summary = row.startswith("### Summary")
                continue
            found = row_pattern.match(row)
            if found is None:
                continue
            counts[found.group(1)] = int(found.group(2).replace(",", ""))
    return counts

def get_repo_nwo_shas(repo_dir):
    """
    Parses each non _summary.md Markdown file in repo_dir to produce a dict mapping repo NWOs to their corresponding SHAs.

    Only the first matching blob link of each file is used. Files are visited in sorted
    name order, so if several files mention the same NWO the result is deterministic
    (the last file in that order wins).
    """
    # We want to look for a match in the file of the form
    # github.com/Nuitka/Nuitka/blob/b289ee4f9d55172ed5165dab262d49bfa9cb2586/
    # and extract the NWO (as a single unit) and SHA.
    # The dot is escaped so that only a literal "github.com" host matches.
    nwo_sha_re = re.compile(r"github\.com/([^/]+/[^/]+)/blob/([0-9a-f]{40})/")

    repo_nwo_shas = {}
    # os.listdir returns entries in arbitrary order; sort for reproducible output.
    for filename in sorted(os.listdir(repo_dir)):
        if not filename.endswith(".md") or filename == "_summary.md":
            continue
        with open(os.path.join(repo_dir, filename), "r") as file:
            for line in file:
                m = nwo_sha_re.search(line)
                if m:
                    nwo, sha = m.groups()
                    repo_nwo_shas[nwo] = sha
                    break
    return repo_nwo_shas

def filter_repo_alerts(repo_alerts, min_count=None, max_count=None):
    """
    Returns the entries of repo_alerts (NWO -> alert count) whose count lies within
    the inclusive range [min_count, max_count].

    A bound of None means "no limit on that side". A bound of 0 is a real bound.
    """
    return {
        nwo: count
        for nwo, count in repo_alerts.items()
        if (min_count is None or count >= min_count)
        and (max_count is None or count <= max_count)
    }


def get_source_suite_name(test_name, min_count=None, max_count=None):
    """
    Returns the file name of the source suite: test_name, followed by "-min-N" and/or
    "-max-N" for each bound that was given (i.e. is not None), followed by ".yml".
    """
    name = f"{test_name}"
    if min_count is not None:
        name += f"-min-{min_count}"
    if max_count is not None:
        name += f"-max-{max_count}"
    return name + ".yml"


def main():
    """
    Script entry point: clones the Gist given on the command line into a temporary
    directory, and writes a DCA source suite (YAML) listing the repos/SHAs that had
    results, optionally restricted to repos whose alert count is within --min/--max.

    The temporary directory is removed on every exit path unless --keep-dir is passed.
    """
    parser = argparse.ArgumentParser(description="Calculate MRVA totals from a GitHub Gist", epilog=help_text, formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("gist_url", nargs='?', help="URL of the GitHub Gist")
    parser.add_argument("--keep-dir", action="store_true", help="Keep the temporary directory")
    parser.add_argument("--min", type=int, help="Minimum number of alerts in repo")
    parser.add_argument("--max", type=int, help="Maximum number of alerts in repo")
    parser.add_argument("--language", type=str, required=True, help="Language of the MRVA run")

    args = parser.parse_args()

    if not args.gist_url:
        parser.print_help()
        sys.exit(1)

    repo_dir = tempfile.mkdtemp(dir=".")
    try:
        clone_gist(args.gist_url, repo_dir)

        repo_alerts = get_repo_alert_counts(repo_dir)
        repo_nwo_shas = get_repo_nwo_shas(repo_dir)

        if not repo_alerts:
            # Previously this case crashed with ValueError from min()/max() on an empty sequence.
            print("No repos with results were found in the Gist's _summary.md", file=sys.stderr)
            sys.exit(1)

        # The bounds are compared against None, not tested for truthiness, so that
        # an explicit `--min 0` / `--max 0` is honoured.
        filtered_alerts = filter_repo_alerts(repo_alerts, args.min, args.max)

        test_name = get_mrva_test_name(repo_dir)
        source_suite_name = get_source_suite_name(test_name, args.min, args.max)

        with open(source_suite_name, "w") as source_suite_file:
            source_suite_file.write("# This file was generated by misc/scripts/mrva-to-dca-source-suite.py\n")
            source_suite_file.write(f"# Input Gist: {args.gist_url}\n\n")
            for nwo, count in filtered_alerts.items():
                sha = repo_nwo_shas.get(nwo)
                if sha is None:
                    # No blob link was found for this repo; an entry without a SHA
                    # would be useless, so skip it rather than abort half-way.
                    print(f"Warning: no SHA found for {nwo}; skipping it", file=sys.stderr)
                    continue
                source_suite_file.write(f"- language: {args.language}\n")
                source_suite_file.write(f"  sha: {sha}\n")
                source_suite_file.write(f"  slug: {nwo} # Alert count: {count}\n")

        print(f"Source suite written to {source_suite_name}")
    finally:
        # Runs on success, on errors and on sys.exit(), so the clone is never leaked.
        if not args.keep_dir:
            shutil.rmtree(repo_dir, ignore_errors=True)
        elif os.path.isdir(repo_dir):
            # The directory may already be gone if cloning failed.
            print(f"Temporary directory retained at: {repo_dir}")


if __name__ == "__main__":
    main()