diff --git a/.github/actions/pr-review/prompts/base-pr-review.md b/.github/actions/pr-review/prompts/base-pr-review.md index 5496bdf..d737730 100644 --- a/.github/actions/pr-review/prompts/base-pr-review.md +++ b/.github/actions/pr-review/prompts/base-pr-review.md @@ -17,6 +17,8 @@ Read `.github/pr-context.json` — it contains pre-fetched PR data with these fi - `last_reviewed_sha`: the SHA from the previous review, used only for deduplication - `summary_comment_id`: the existing bot summary comment to update, if one exists - `incremental_diff_path`: path to a GitHub API compare diff when incremental review is available +- `incremental_diff_metadata`: metadata about filtered incremental diff coverage, + including dropped vendored/generated/lockfile paths and truncation state - `existing_findings`: list of finding lines from previous review summaries - `comments`: trusted PR comments with `id`, `user`, `user_type`, `author_association`, and `body`. @@ -42,6 +44,11 @@ Use the `review_mode` field from `.github/pr-context.json`. PR diff for security and confident correctness issues. - `"full"`: review the full PR diff for all categories. +If `incremental_diff_metadata.partial` is true, explicitly account for the +listed dropped paths or truncation before giving a no-blocking-issues verdict. +Do not assume omitted dependency lockfiles, generated source, or vendored source +are safe solely because they were filtered out of the incremental artifact. + Do not use local git history for incremental review. The local checkout is the current PR head tree, not the previous reviewed tree. @@ -66,13 +73,20 @@ as changed source instead. If review mode is `"incremental"`, read the file named by `incremental_diff_path` for suggestions. Still scan the full PR diff (`gh pr diff --repo `) for security and confident correctness issues. +If the incremental metadata reports dropped paths or truncation, mention that +partial coverage in the review summary and use the full diff to check whether +the omitted paths affect dependency locks, generated source, vendored source, +or release behavior. If review mode is `"full"`, review the full PR diff for all categories. Use the local checkout with Read, Glob, Grep, and Task for source-file inspection. Use `gh pr view` and `gh api` for extra GitHub metadata when needed. -Exclude vendored code, generated files, and lockfiles from review. +Exclude bulk content-level review of vendored code, generated files, and +lockfiles after checking whether those paths affect dependencies, generated or +vendored source reachability, or release behavior. Do not exclude `go.mod` or +`go.sum` from dependency review. ### Step 6 — Validate findings diff --git a/.github/actions/pr-review/scripts/fetch-pr-context.py b/.github/actions/pr-review/scripts/fetch-pr-context.py index b4db73f..bbf1269 100644 --- a/.github/actions/pr-review/scripts/fetch-pr-context.py +++ b/.github/actions/pr-review/scripts/fetch-pr-context.py @@ -12,6 +12,7 @@ import json import os import re +import shlex import subprocess import sys import time @@ -29,6 +30,23 @@ LEGACY_REVIEW_SUMMARY_HEADING = "### PR Review:" DEFAULT_API_ATTEMPTS = 3 +# Incremental-diff hardening. GitHub compare diffs on large vendor-refresh PRs +# can inline non-UTF-8 bytes (git misclassifies a NUL-free encrypted vendored +# file as text) and can be pathologically large (100s of MB), so the raw diff is +# read as bytes, stripped of vendored/generated noise, capped, and decoded +# losslessly before it is handed to the reviewer. +DIFF_MAX_BYTES = 20 * 1024 * 1024 +EXCLUDE_PREFIXES = ("vendor/",) +EXCLUDE_SUFFIXES = ( + ".pb.go", + "_gen.go", + "package-lock.json", + "yarn.lock", +) +DIFF_GIT_SPLIT_PATTERN = re.compile(rb"(?m)^(?=diff --git )") +DIFF_GIT_PATH_PATTERN = re.compile(r"^diff --git (a/.*) b/(.*)$") +DIFF_DROPPED_PATH_LIMIT = 200 + def review_comment_heading(comment: dict, summary_heading: str) -> Optional[str]: body = comment["body"].lstrip() @@ -106,6 +124,199 @@ def gh_api(args: list[str], *, attempts: int = DEFAULT_API_ATTEMPTS) -> subproce raise last_error +def gh_api_bytes(args: list[str], *, attempts: int = DEFAULT_API_ATTEMPTS) -> bytes: + """Run gh api like gh_api but return raw stdout bytes without UTF-8 decoding. + + The compare diff for a large vendor-refresh PR can contain a non-UTF-8 byte, + so a strict text decode of the whole response (what text=True does) raises + UnicodeDecodeError and kills the review job. Reading raw bytes here lets the + caller decode leniently. The retry policy mirrors gh_api exactly; stderr and + stdout are decoded leniently only to build human-readable error messages. + """ + last_error = None + for attempt in range(1, attempts + 1): + try: + return subprocess.run( + ["gh", "api", *args], + capture_output=True, + check=True, + ).stdout + except subprocess.CalledProcessError as e: + if isinstance(e.stderr, bytes): + e.stderr = e.stderr.decode("utf-8", "backslashreplace") + if isinstance(e.stdout, bytes): + e.stdout = e.stdout.decode("utf-8", "backslashreplace") + last_error = e + retry_limit = retry_limit_for_error(e, attempts) + e.retry_limit = retry_limit + if attempt >= retry_limit: + break + print( + "::warning::GitHub API request failed; " + f"retrying ({attempt}/{retry_limit}): gh api {' '.join(args)}: " + f"{command_error_summary(e)}", + file=sys.stderr, + ) + time.sleep(attempt) + raise last_error + + +def _split_git_header_paths(header: bytes) -> list[str]: + text = header.decode("utf-8", "backslashreplace") + match = DIFF_GIT_PATH_PATTERN.match(text) + if match: + return [match.group(1), f"b/{match.group(2)}"] + try: + parts = shlex.split(text) + except ValueError: + return [] + if len(parts) < 4 or parts[:2] != ["diff", "--git"]: + return [] + return parts[2:4] + + +def _strip_diff_prefix(path: str) -> str: + if path == "/dev/null": + return "" + if path.startswith(("a/", "b/")): + return path[2:] + return path + + +def _decode_diff_path(raw: bytes) -> str: + return _strip_diff_prefix(raw.decode("utf-8", "backslashreplace")) + + +def _diff_paths(header: bytes) -> list[str]: + """Extract old and new paths from a 'diff --git ...' header.""" + return [ + path + for path in (_strip_diff_prefix(path) for path in _split_git_header_paths(header)) + if path + ] + + +def _diff_path(header: bytes) -> str: + """Extract the a-side file path from a 'diff --git ...' header.""" + paths = _diff_paths(header) + if not paths: + return "" + return paths[0] + + +def _section_paths(section: bytes) -> list[str]: + paths = [] + for index, line in enumerate(section.splitlines()): + path = "" + if index == 0: + paths.extend(_diff_paths(line)) + continue + for prefix in (b"--- ", b"+++ "): + if line.startswith(prefix): + path = _decode_diff_path(line[len(prefix) :]) + break + for prefix in (b"rename from ", b"rename to ", b"copy from ", b"copy to "): + if line.startswith(prefix): + path = _decode_diff_path(line[len(prefix) :]) + break + if path: + paths.append(path) + return list(dict.fromkeys(paths)) + + +def _is_excluded(path: str) -> bool: + """Report whether a diff path is vendored or a known generated/lockfile.""" + if not path: + return False + if any(path.startswith(prefix) for prefix in EXCLUDE_PREFIXES): + return True + return any(path.endswith(suffix) for suffix in EXCLUDE_SUFFIXES) + + +def filter_and_decode_diff(raw: bytes) -> tuple[str, dict]: + """Strip vendored/generated noise from a raw diff, cap it, and decode it. + + Splits ``raw`` into per-file ``diff --git`` sections, drops vendored and + generated/lockfile sections, retains sections up to ``DIFF_MAX_BYTES``, and + decodes the kept bytes with errors="backslashreplace" so a stray non-UTF-8 + byte is preserved losslessly instead of raising. Returns the decoded text and + metadata: ``dropped_sections`` (count), ``dropped_paths`` (bounded list), + ``truncated`` (bool), ``kept_bytes``. + """ + sections = DIFF_GIT_SPLIT_PATTERN.split(raw) + kept_chunks: list[bytes] = [] + dropped_sections = 0 + dropped_paths: list[str] = [] + dropped_paths_omitted = 0 + kept_bytes = 0 + truncated = False + for section in sections: + if not section: + continue + if not section.startswith(b"diff --git "): + # Leading preamble before the first file header (rare); keep verbatim. + if section.strip(): + kept_chunks.append(section) + kept_bytes += len(section) + continue + paths = _section_paths(section) + if paths and all(_is_excluded(path) for path in paths): + dropped_sections += 1 + for path in paths: + if len(dropped_paths) < DIFF_DROPPED_PATH_LIMIT: + dropped_paths.append(path) + else: + dropped_paths_omitted += 1 + continue + if kept_bytes + len(section) > DIFF_MAX_BYTES: + truncated = True + break + kept_chunks.append(section) + kept_bytes += len(section) + + text = b"".join(kept_chunks).decode("utf-8", "backslashreplace") + if truncated: + text += f"\n[diff truncated to {DIFF_MAX_BYTES} bytes for review context]\n" + return text, { + "dropped_sections": dropped_sections, + "dropped_paths": dropped_paths, + "dropped_paths_omitted": dropped_paths_omitted, + "truncated": truncated, + "kept_bytes": kept_bytes, + "partial": bool(dropped_sections or truncated), + } + + +def empty_incremental_diff_metadata() -> dict: + return { + "dropped_sections": 0, + "dropped_paths": [], + "dropped_paths_omitted": 0, + "truncated": False, + "kept_bytes": 0, + "partial": False, + } + + +def incremental_diff_notice(meta: dict) -> str: + lines = [ + "[incremental diff partial coverage]", + ( + f"Dropped {meta['dropped_sections']} vendored/generated/lockfile " + f"section(s); kept {meta['kept_bytes']} bytes." + ), + ] + if meta["dropped_paths"]: + lines.append("Dropped paths:") + lines.extend(f"- {path}" for path in meta["dropped_paths"]) + if meta["dropped_paths_omitted"]: + lines.append(f"- ... {meta['dropped_paths_omitted']} more path(s)") + if meta["truncated"]: + lines.append(f"Diff truncated to {DIFF_MAX_BYTES} bytes.") + lines.append("Scan the full PR diff before issuing a no-blocking-issues verdict.") + return "\n".join(lines) + "\n\n" + + def gh_api_paginate(endpoint: str) -> list[dict]: """Fetch all pages from a gh api endpoint.""" result = gh_api( @@ -140,9 +351,10 @@ def parse_paginated_json(output: str) -> list[dict]: return entries -def fetch_compare_diff(head_repo: str, base_sha: str, head_sha: str) -> Optional[str]: +def fetch_compare_diff(head_repo: str, base_sha: str, head_sha: str) -> tuple[Optional[str], dict]: """Fetch a compare diff from the PR head repo for incremental review.""" endpoint = f"repos/{head_repo}/compare/{base_sha}...{head_sha}" + meta = empty_incremental_diff_metadata() try: metadata = gh_api([endpoint]) compare = json.loads(metadata.stdout) @@ -152,17 +364,35 @@ def fetch_compare_diff(head_repo: str, base_sha: str, head_sha: str) -> Optional f"Compare status is {status!r}, using full review mode", file=sys.stderr, ) - return None - result = gh_api(["-H", "Accept: application/vnd.github.diff", endpoint]) + return None, meta + raw = gh_api_bytes(["-H", "Accept: application/vnd.github.diff", endpoint]) except subprocess.CalledProcessError as e: print( f"Could not fetch incremental diff from {head_repo}: {e.stderr}", file=sys.stderr, ) - return None - if not result.stdout.strip(): - return None - return result.stdout + return None, meta + if not raw.strip(): + return None, meta + diff_text, meta = filter_and_decode_diff(raw) + if meta["truncated"] and meta["kept_bytes"] == 0: + print( + "Incremental diff truncated before retaining reviewable content, " + "using full review mode", + file=sys.stderr, + ) + return None, meta + if not diff_text.strip(): + return None, meta + if meta["partial"]: + diff_text = incremental_diff_notice(meta) + diff_text + print( + f"Incremental diff: dropped {meta['dropped_sections']} vendored/generated " + f"section(s), kept {meta['kept_bytes']} bytes" + f"{', truncated' if meta['truncated'] else ''}", + file=sys.stderr, + ) + return diff_text, meta def current_checkout_sha() -> Optional[str]: @@ -309,6 +539,7 @@ def main(): # provide a compact incremental artifact. review_mode = "full" incremental_diff_path = None + incremental_diff_metadata = empty_incremental_diff_metadata() if not last_reviewed_sha: print("No previous review state found, using full review mode") elif last_review_base_sha != current_base_sha: @@ -318,7 +549,11 @@ def main(): print("PR head repository is unavailable, using full review mode") last_reviewed_sha = None else: - incremental_diff = fetch_compare_diff(head_repo, last_reviewed_sha, current_sha) + incremental_diff, incremental_diff_metadata = fetch_compare_diff( + head_repo, + last_reviewed_sha, + current_sha, + ) if incremental_diff: incremental_diff_path = os.path.join(".github", "incremental.diff") os.makedirs(os.path.dirname(incremental_diff_path), exist_ok=True) @@ -355,6 +590,7 @@ def main(): "last_review_base_sha": last_review_base_sha, "summary_comment_id": summary_comment_id, "incremental_diff_path": incremental_diff_path, + "incremental_diff_metadata": incremental_diff_metadata, "existing_findings": existing_findings, "comments": trusted_context_comments, } diff --git a/.github/actions/pr-review/scripts/test_fetch_pr_context.py b/.github/actions/pr-review/scripts/test_fetch_pr_context.py new file mode 100644 index 0000000..a237d81 --- /dev/null +++ b/.github/actions/pr-review/scripts/test_fetch_pr_context.py @@ -0,0 +1,295 @@ +#!/usr/bin/env python3 +"""Unit tests for the incremental-diff hardening in fetch-pr-context.py. + +The module file name contains hyphens, so it is loaded by path via importlib +rather than imported normally. Run with: + + python3 -m unittest discover -s .github/actions/pr-review/scripts -p 'test_*.py' + +or directly: + + python3 .github/actions/pr-review/scripts/test_fetch_pr_context.py +""" + +import importlib.util +import json +import os +import tempfile +from types import SimpleNamespace +import unittest +from unittest import mock + +_SCRIPT = os.path.join(os.path.dirname(__file__), "fetch-pr-context.py") +_spec = importlib.util.spec_from_file_location("fetch_pr_context", _SCRIPT) +fpc = importlib.util.module_from_spec(_spec) +_spec.loader.exec_module(fpc) + + +# A vendored section that inlines a raw non-UTF-8 byte (0xca), exactly the shape +# that crashes a strict UTF-8 decode of the whole compare diff. +VENDOR_SECTION = ( + b"diff --git a/vendor/example.com/pkg/secret.go b/vendor/example.com/pkg/secret.go\n" + b"index 1111111..2222222 100644\n" + b"--- a/vendor/example.com/pkg/secret.go\n" + b"+++ b/vendor/example.com/pkg/secret.go\n" + b"@@ -1 +1 @@\n" + b"-old\n" + b"+\xca\xca\xca not-actually-utf8 payload\n" +) + +GO_SECTION = ( + b"diff --git a/internal/foo.go b/internal/foo.go\n" + b"index 3333333..4444444 100644\n" + b"--- a/internal/foo.go\n" + b"+++ b/internal/foo.go\n" + b"@@ -1 +1 @@\n" + b"-func old() {}\n" + b"+func New() {}\n" +) + + +def rename_section(old_path, new_path): + return ( + f"diff --git a/{old_path} b/{new_path}\n" + "similarity index 88%\n" + f"rename from {old_path}\n" + f"rename to {new_path}\n" + "@@ -1 +1 @@\n" + "-package old\n" + "+package new\n" + ).encode("utf-8") + + +class FilterAndDecodeDiffTest(unittest.TestCase): + def test_drops_vendor_keeps_go_without_raising(self): + # Must not raise UnicodeDecodeError despite the 0xca bytes. + text, meta = fpc.filter_and_decode_diff(VENDOR_SECTION + GO_SECTION) + self.assertNotIn("vendor/example.com/pkg/secret.go", text) + self.assertIn("internal/foo.go", text) + self.assertIn("func New() {}", text) + self.assertEqual(meta["dropped_sections"], 1) + self.assertEqual(meta["dropped_paths"], ["vendor/example.com/pkg/secret.go"]) + self.assertFalse(meta["truncated"]) + self.assertGreater(meta["kept_bytes"], 0) + + def test_non_utf8_bytes_in_kept_section_are_lossless_not_dropped(self): + # A retained (.go) section carrying a stray non-UTF-8 byte must decode + # via backslashreplace rather than raise or be silently lost. + go_with_bad_byte = ( + b"diff --git a/internal/bar.go b/internal/bar.go\n" + b"@@ -1 +1 @@\n" + b"+// \xca marker\n" + ) + text, meta = fpc.filter_and_decode_diff(go_with_bad_byte) + self.assertIn("internal/bar.go", text) + self.assertIn("\\xca", text) + self.assertEqual(meta["dropped_sections"], 0) + self.assertEqual(meta["dropped_paths"], []) + + def test_truncation_marker_present_when_over_cap(self): + original_cap = fpc.DIFF_MAX_BYTES + fpc.DIFF_MAX_BYTES = len(GO_SECTION) + 10 # room for one section only + try: + second = GO_SECTION.replace(b"foo.go", b"baz.go") + text, meta = fpc.filter_and_decode_diff(GO_SECTION + second) + self.assertTrue(meta["truncated"]) + self.assertIn("diff truncated to", text) + self.assertIn("internal/foo.go", text) + self.assertNotIn("internal/baz.go", text) + self.assertEqual(meta["dropped_paths"], []) + finally: + fpc.DIFF_MAX_BYTES = original_cap + + def test_marker_only_truncation_reports_zero_kept_bytes(self): + original_cap = fpc.DIFF_MAX_BYTES + fpc.DIFF_MAX_BYTES = 1 + try: + text, meta = fpc.filter_and_decode_diff(GO_SECTION) + self.assertTrue(meta["truncated"]) + self.assertEqual(meta["kept_bytes"], 0) + self.assertEqual(meta["dropped_sections"], 0) + self.assertEqual(meta["dropped_paths"], []) + self.assertEqual( + text.strip(), + "[diff truncated to 1 bytes for review context]", + ) + finally: + fpc.DIFF_MAX_BYTES = original_cap + + def test_all_excluded_returns_empty_for_full_fallback(self): + # When nothing reviewable remains, the text is empty so the caller can + # fall back to full review mode. + text, meta = fpc.filter_and_decode_diff(VENDOR_SECTION) + self.assertEqual(text.strip(), "") + self.assertEqual(meta["dropped_sections"], 1) + self.assertEqual(meta["dropped_paths"], ["vendor/example.com/pkg/secret.go"]) + + def test_rename_from_vendor_to_internal_is_kept(self): + diff = rename_section( + "vendor/example.com/pkg/secret.go", + "internal/secret.go", + ) + + text, meta = fpc.filter_and_decode_diff(diff) + + self.assertIn("rename from vendor/example.com/pkg/secret.go", text) + self.assertIn("rename to internal/secret.go", text) + self.assertEqual(meta["dropped_sections"], 0) + self.assertEqual(meta["dropped_paths"], []) + + def test_rename_from_internal_to_vendor_is_kept(self): + diff = rename_section( + "internal/secret.go", + "vendor/example.com/pkg/secret.go", + ) + + text, meta = fpc.filter_and_decode_diff(diff) + + self.assertIn("rename from internal/secret.go", text) + self.assertIn("rename to vendor/example.com/pkg/secret.go", text) + self.assertEqual(meta["dropped_sections"], 0) + self.assertEqual(meta["dropped_paths"], []) + + def test_excluded_suffixes(self): + for path in ( + "api/foo.pb.go", + "internal/mock_gen.go", + "package-lock.json", + "yarn.lock", + ): + self.assertTrue(fpc._is_excluded(path), path) + for path in ("go.sum", "go.mod", "internal/foo.go", "cmd/main.go", "README.md"): + self.assertFalse(fpc._is_excluded(path), path) + + def test_diff_path_extraction(self): + header = b"diff --git a/internal/foo.go b/internal/foo.go" + self.assertEqual(fpc._diff_path(header), "internal/foo.go") + + def test_diff_path_extraction_with_spaces(self): + header = b"diff --git a/internal/space name.go b/internal/space name.go" + self.assertEqual(fpc._diff_path(header), "internal/space name.go") + + +class FetchCompareDiffTest(unittest.TestCase): + def test_mixed_excluded_and_kept_diff_returns_metadata(self): + with ( + mock.patch.object( + fpc, + "gh_api", + return_value=SimpleNamespace(stdout='{"status":"ahead"}'), + ), + mock.patch.object( + fpc, + "gh_api_bytes", + return_value=VENDOR_SECTION + GO_SECTION, + ), + ): + text, meta = fpc.fetch_compare_diff("owner/repo", "base", "head") + + self.assertIsNotNone(text) + self.assertIn("[incremental diff partial coverage]", text) + self.assertIn("internal/foo.go", text) + self.assertIn("vendor/example.com/pkg/secret.go", text) + self.assertNotIn("not-actually-utf8 payload", text) + self.assertEqual(meta["dropped_sections"], 1) + self.assertEqual(meta["dropped_paths"], ["vendor/example.com/pkg/secret.go"]) + self.assertTrue(meta["partial"]) + + def test_marker_only_truncation_falls_back_to_full_mode(self): + original_cap = fpc.DIFF_MAX_BYTES + fpc.DIFF_MAX_BYTES = 1 + try: + with ( + mock.patch.object( + fpc, + "gh_api", + return_value=SimpleNamespace(stdout='{"status":"ahead"}'), + ), + mock.patch.object(fpc, "gh_api_bytes", return_value=GO_SECTION), + ): + text, meta = fpc.fetch_compare_diff("owner/repo", "base", "head") + finally: + fpc.DIFF_MAX_BYTES = original_cap + + self.assertIsNone(text) + self.assertTrue(meta["truncated"]) + self.assertEqual(meta["kept_bytes"], 0) + + +class MainContextTest(unittest.TestCase): + def test_incremental_diff_metadata_written_to_context(self): + metadata = { + "dropped_sections": 1, + "dropped_paths": ["vendor/example.com/pkg/secret.go"], + "dropped_paths_omitted": 0, + "truncated": False, + "kept_bytes": len(GO_SECTION), + "partial": True, + } + workflow_ref = "ConductorOne/github-workflows/.github/workflows/pr-review.yaml@refs/heads/main" + state = json.dumps( + { + "last_reviewed_sha": "old-sha", + "base_sha": "base-sha", + "workflow_ref": workflow_ref, + } + ) + raw_comments = [ + { + "id": 123, + "author_association": "MEMBER", + "user": {"login": "github-actions[bot]", "type": "Bot"}, + "body": f"{fpc.DEFAULT_REVIEW_SUMMARY_HEADING} Previous\n", + } + ] + pr = { + "head": { + "sha": "head-sha", + "repo": {"full_name": "ConductorOne/example"}, + }, + "base": {"sha": "base-sha"}, + } + + old_cwd = os.getcwd() + with tempfile.TemporaryDirectory() as tmpdir: + os.chdir(tmpdir) + try: + with ( + mock.patch.dict( + os.environ, + { + "GITHUB_REPOSITORY": "ConductorOne/example", + "PR_NUMBER": "42", + "PR_HEAD_SHA": "head-sha", + "GITHUB_WORKFLOW_REF": workflow_ref, + "GITHUB_RUN_ID": "99", + "GITHUB_SERVER_URL": "https://github.com", + }, + clear=False, + ), + mock.patch.object(fpc, "gh_api_paginate", return_value=raw_comments), + mock.patch.object( + fpc, + "gh_api", + return_value=SimpleNamespace(stdout=json.dumps(pr)), + ), + mock.patch.object(fpc, "current_checkout_sha", return_value="head-sha"), + mock.patch.object( + fpc, + "fetch_compare_diff", + return_value=("diff text", metadata), + ), + ): + fpc.main() + + with open(".github/pr-context.json") as f: + context = json.load(f) + self.assertEqual(context["review_mode"], "incremental") + self.assertEqual(context["incremental_diff_path"], ".github/incremental.diff") + self.assertEqual(context["incremental_diff_metadata"], metadata) + finally: + os.chdir(old_cwd) + + +if __name__ == "__main__": + unittest.main()