Skip to content

Commit 1846938

Browse files
committed
Add "misra_help" query docs generator
Introduces scripts/generate_rules/misra_help/, a two-stage pipeline for (mostly) idempotent generation of per-query .md help files. It takes MISRA rules as input and creates (or updates, as needed) documentation for codeql-coding-standards queries for C and C++. Immediate support focuses on MISRA C 2012/2023 and MISRA C++ 2023. Stage 1: deterministic docling-based extraction and rendering, with a JSON sidecar for downstream consumption. Stage 2: a headless Python driver for the Copilot SDK that rewrites each help file from the JSON sidecar against a fixed Markdown schema and American English spelling. Documentation is added in "scripts/generate_rules/misra_help/README.md".
1 parent cee8713 commit 1846938

File tree

7 files changed

+2134
-0
lines changed

7 files changed

+2134
-0
lines changed

scripts/generate_rules/misra_help/README.md

Lines changed: 348 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
"""MISRA help-file populator.
2+
3+
See `populate_help.py` for the entry point.
4+
"""
Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
"""Emit a per-standard JSON sidecar containing every extracted MISRA
2+
rule plus, for each `.ql` query that targets the rule, the query's
3+
`@name` title, target `.md` path, and the existing `.md` content (if
4+
any). This file is the input to the agent extension's LLM-driven
5+
"rewrite help docs" pass: docling extracts the structured rule data
6+
deterministically, then the LLM uses both the structured data AND the
7+
.ql title to produce a polished, idiomatic help file.
8+
9+
Output layout:
10+
11+
<help-repo>/.misra-rule-cache/<standard>.json
12+
13+
Schema (top-level):
14+
15+
{
16+
"standard": "MISRA-C-2012",
17+
"lang": "c",
18+
"lang_src": "c/misra/src/rules",
19+
"generated_at": "2026-04-20T10:11:12Z",
20+
"rules": {
21+
"RULE-9-2": {
22+
"rule_id": "RULE-9-2",
23+
"raw_id": "Rule 9.2",
24+
"standard": "MISRA-C-2012",
25+
"title": "...",
26+
"category": "Required",
27+
"analysis": "Decidable, Single Translation Unit",
28+
"applies_to": "C90, C99, C11",
29+
"amplification": "...",
30+
"rationale": "...",
31+
"exceptions": ["...", "..."],
32+
"example_layout": [
33+
{"kind": "code", "text": "..."},
34+
{"kind": "text", "text": "..."}
35+
],
36+
"see_also": [...]
37+
},
38+
...
39+
},
40+
"queries": {
41+
"RULE-9-2": [
42+
{
43+
"ql_path": "c/misra/src/rules/RULE-9-2/Init...braces.ql",
44+
"ql_name_title": "The initializer for an aggregate ...",
45+
"md_path": "c/misra/src/rules/RULE-9-2/Init...braces.md",
46+
"existing_md": "..." // null if the .md does not exist
47+
},
48+
...
49+
],
50+
...
51+
}
52+
}
53+
54+
The `existing_md` content is included so the LLM pass can preserve
55+
human-authored details (alert message wording, special examples) that
56+
docling did not capture.
57+
"""
58+
from __future__ import annotations
59+
import argparse
60+
import datetime as _dt
61+
import json
62+
import sys
63+
from dataclasses import asdict
64+
from pathlib import Path
65+
66+
sys.path.insert(0, str(Path(__file__).parent))
67+
from extract_rules import extract_rules, Rule # noqa: E402
68+
from populate_help import ( # noqa: E402
69+
STANDARD_INFO,
70+
SUPPORTED_STANDARDS,
71+
DEFAULT_HELP_REPO,
72+
DEFAULT_QUERY_REPO,
73+
collect_queries,
74+
resolve_pdf,
75+
_read_ql_name,
76+
)
77+
78+
79+
def _rule_to_jsonable(rule: Rule) -> dict:
80+
"""Serialize a Rule to JSON, including the example layout."""
81+
d = asdict(rule)
82+
layout = getattr(rule, "_example_layout", None)
83+
if layout:
84+
d["example_layout"] = [{"kind": k, "text": s} for (k, s) in layout]
85+
else:
86+
d["example_layout"] = []
87+
return d
88+
89+
90+
def _query_entries(rule_id: str, ql_paths: list[Path],
                   query_repo: Path, help_repo: Path,
                   lang_src: Path) -> list[dict]:
    """Build the sidecar entries for the `.ql` queries of one rule.

    Args:
        rule_id: Identifier of the rule the queries belong to. Not used
            by the body; kept so existing callers keep working.
        ql_paths: Paths of the `.ql` files; processed in sorted order so
            the output ordering is stable.
        query_repo: Root of the query repository; every `ql` path must
            be located under ``query_repo / lang_src``.
        help_repo: Root of the help repository where the `.md` files
            live (mirroring the query directory layout).
        lang_src: Rules directory relative to either repository root.

    Returns:
        One dict per query with the keys ``ql_path``, ``ql_name_title``,
        ``md_path`` and ``existing_md``. ``existing_md`` is ``None``
        when the `.md` file does not exist.

    Raises:
        ValueError: from ``Path.relative_to`` when a query path is not
            under ``query_repo / lang_src``.
    """
    out: list[dict] = []
    for ql in sorted(ql_paths):
        rel_dir = ql.parent.relative_to(query_repo / lang_src)
        md = help_repo / lang_src / rel_dir / (ql.stem + ".md")
        try:
            existing = md.read_text(encoding="utf-8")
        except FileNotFoundError:
            existing = None
        out.append({
            # Emit forward-slash paths so the JSON is identical no matter
            # which platform generated it (str(Path) uses "\\" on Windows).
            "ql_path": ql.relative_to(query_repo).as_posix(),
            # Fall back to "" when no title could be read from the query.
            "ql_name_title": _read_ql_name(ql) or "",
            "md_path": md.relative_to(help_repo).as_posix(),
            "existing_md": existing,
        })
    return out
108+
109+
110+
def main() -> int:
    """Command-line entry point: write the JSON sidecar for one standard.

    Parses the command line, extracts the rules from the resolved PDF,
    collects the queries for the chosen standard, and writes the
    combined payload as UTF-8 JSON. The destination is ``--output`` when
    given, otherwise ``<help-repo>/.misra-rule-cache/<standard>.json``.
    A one-line summary is printed afterwards.

    Returns:
        ``0`` once the file has been written.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--standard", required=True,
                        choices=SUPPORTED_STANDARDS)
    parser.add_argument("--query-repo", type=Path,
                        default=DEFAULT_QUERY_REPO)
    parser.add_argument("--help-repo", type=Path, default=DEFAULT_HELP_REPO)
    parser.add_argument("--pdf", type=Path, default=None)
    parser.add_argument(
        "--cache-dir",
        type=Path,
        default=Path("/tmp/misra-pdf-probe/repo-cache"),
        help="docling JSON cache dir",
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=None,
        help="output path (default: "
             "<help-repo>/.misra-rule-cache/<standard>.json)",
    )
    opts = parser.parse_args()
    standard = opts.standard

    # Rule extraction: the cache directory must exist before it is used.
    pdf_path = resolve_pdf(standard, opts.pdf, opts.help_repo)
    opts.cache_dir.mkdir(parents=True, exist_ok=True)
    extracted = extract_rules(pdf_path, standard, opts.cache_dir)

    lang, lang_src = STANDARD_INFO[standard]
    ql_by_rule = collect_queries(opts.query_repo, standard)

    rules_json: dict[str, dict] = {
        rule.rule_id: _rule_to_jsonable(rule) for rule in extracted
    }
    queries_json: dict[str, list[dict]] = {
        rule_id: _query_entries(rule_id, ql_paths, opts.query_repo,
                                opts.help_repo, lang_src)
        for rule_id, ql_paths in ql_by_rule.items()
    }

    # Timezone-aware "now", rendered as a UTC timestamp with a "Z" suffix.
    timestamp = _dt.datetime.now(_dt.timezone.utc).strftime(
        "%Y-%m-%dT%H:%M:%SZ")
    payload = {
        "standard": standard,
        "lang": lang,
        "lang_src": str(lang_src),
        "generated_at": timestamp,
        "rules": rules_json,
        "queries": queries_json,
    }

    destination = opts.output
    if not destination:
        destination = (opts.help_repo / ".misra-rule-cache"
                       / f"{standard}.json")
    destination.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(payload, indent=2, ensure_ascii=False)
    destination.write_text(serialized, encoding="utf-8")

    query_total = sum(len(entries) for entries in queries_json.values())
    print(f"wrote {destination} ({len(rules_json)} rules, "
          f"{query_total} queries)")
    return 0
159+
160+
161+
if __name__ == "__main__":
    # Run only when executed as a script; main()'s return value becomes
    # the process exit status.
    sys.exit(main())

0 commit comments

Comments
 (0)