Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion .github/actions/pr-review/prompts/base-pr-review.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ Read `.github/pr-context.json` — it contains pre-fetched PR data with these fi
- `last_reviewed_sha`: the SHA from the previous review, used only for deduplication
- `summary_comment_id`: the existing bot summary comment to update, if one exists
- `incremental_diff_path`: path to a GitHub API compare diff when incremental review is available
- `incremental_diff_metadata`: metadata about filtered incremental diff coverage,
including dropped vendored/generated/lockfile paths and truncation state
- `existing_findings`: list of finding lines from previous review summaries
- `comments`: trusted PR comments with `id`, `user`, `user_type`,
`author_association`, and `body`.
Expand All @@ -42,6 +44,11 @@ Use the `review_mode` field from `.github/pr-context.json`.
PR diff for security and confident correctness issues.
- `"full"`: review the full PR diff for all categories.

If `incremental_diff_metadata.partial` is true, explicitly account for the
listed dropped paths or truncation before giving a no-blocking-issues verdict.
Do not assume omitted dependency lockfiles, generated source, or vendored source
are safe solely because they were filtered out of the incremental artifact.

Do not use local git history for incremental review. The local checkout is the current
PR head tree, not the previous reviewed tree.

Expand All @@ -66,13 +73,20 @@ as changed source instead.
If review mode is `"incremental"`, read the file named by `incremental_diff_path` for
suggestions. Still scan the full PR diff (`gh pr diff <pr_number> --repo <repository>`) for
security and confident correctness issues.
If the incremental metadata reports dropped paths or truncation, mention that
partial coverage in the review summary and use the full diff to check whether
the omitted paths affect dependency locks, generated source, vendored source,
or release behavior.

If review mode is `"full"`, review the full PR diff for all categories.

Use the local checkout with Read, Glob, Grep, and Task for source-file inspection. Use
`gh pr view` and `gh api` for extra GitHub metadata when needed.

Exclude vendored code, generated files, and lockfiles from review.
Exclude bulk content-level review of vendored code, generated files, and
lockfiles after checking whether those paths affect dependencies, generated or
vendored source reachability, or release behavior. Do not exclude `go.mod` or
`go.sum` from dependency review.

### Step 6 — Validate findings

Expand Down
252 changes: 244 additions & 8 deletions .github/actions/pr-review/scripts/fetch-pr-context.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import json
import os
import re
import shlex
import subprocess
import sys
import time
Expand All @@ -29,6 +30,23 @@
LEGACY_REVIEW_SUMMARY_HEADING = "### PR Review:"
DEFAULT_API_ATTEMPTS = 3

# Incremental-diff hardening. GitHub compare diffs on large vendor-refresh PRs
# can inline non-UTF-8 bytes (git misclassifies a NUL-free encrypted vendored
# file as text) and can be pathologically large (100s of MB), so the raw diff is
# read as bytes, stripped of vendored/generated noise, capped, and decoded
# losslessly before it is handed to the reviewer.
DIFF_MAX_BYTES = 20 * 1024 * 1024
EXCLUDE_PREFIXES = ("vendor/",)
EXCLUDE_SUFFIXES = (
".pb.go",
"_gen.go",
"package-lock.json",
"yarn.lock",
)
DIFF_GIT_SPLIT_PATTERN = re.compile(rb"(?m)^(?=diff --git )")
DIFF_GIT_PATH_PATTERN = re.compile(r"^diff --git (a/.*) b/(.*)$")
DIFF_DROPPED_PATH_LIMIT = 200


def review_comment_heading(comment: dict, summary_heading: str) -> Optional[str]:
body = comment["body"].lstrip()
Expand Down Expand Up @@ -106,6 +124,199 @@ def gh_api(args: list[str], *, attempts: int = DEFAULT_API_ATTEMPTS) -> subproce
raise last_error


def gh_api_bytes(args: list[str], *, attempts: int = DEFAULT_API_ATTEMPTS) -> bytes:
"""Run gh api like gh_api but return raw stdout bytes without UTF-8 decoding.

The compare diff for a large vendor-refresh PR can contain a non-UTF-8 byte,
so a strict text decode of the whole response (what text=True does) raises
UnicodeDecodeError and kills the review job. Reading raw bytes here lets the
caller decode leniently. The retry policy mirrors gh_api exactly; stderr and
stdout are decoded leniently only to build human-readable error messages.
"""
last_error = None
for attempt in range(1, attempts + 1):
try:
return subprocess.run(
["gh", "api", *args],
capture_output=True,
check=True,
).stdout
except subprocess.CalledProcessError as e:
if isinstance(e.stderr, bytes):
e.stderr = e.stderr.decode("utf-8", "backslashreplace")
if isinstance(e.stdout, bytes):
e.stdout = e.stdout.decode("utf-8", "backslashreplace")
last_error = e
retry_limit = retry_limit_for_error(e, attempts)
e.retry_limit = retry_limit
if attempt >= retry_limit:
break
print(
"::warning::GitHub API request failed; "
f"retrying ({attempt}/{retry_limit}): gh api {' '.join(args)}: "
f"{command_error_summary(e)}",
file=sys.stderr,
)
time.sleep(attempt)
raise last_error


def _split_git_header_paths(header: bytes) -> list[str]:
text = header.decode("utf-8", "backslashreplace")
match = DIFF_GIT_PATH_PATTERN.match(text)
if match:
return [match.group(1), f"b/{match.group(2)}"]
try:
parts = shlex.split(text)
except ValueError:
return []
if len(parts) < 4 or parts[:2] != ["diff", "--git"]:
return []
return parts[2:4]


def _strip_diff_prefix(path: str) -> str:
if path == "/dev/null":
return ""
if path.startswith(("a/", "b/")):
return path[2:]
return path


def _decode_diff_path(raw: bytes) -> str:
return _strip_diff_prefix(raw.decode("utf-8", "backslashreplace"))


def _diff_paths(header: bytes) -> list[str]:
"""Extract old and new paths from a 'diff --git ...' header."""
return [
path
for path in (_strip_diff_prefix(path) for path in _split_git_header_paths(header))
if path
]


def _diff_path(header: bytes) -> str:
"""Extract the a-side file path from a 'diff --git ...' header."""
paths = _diff_paths(header)
if not paths:
return ""
return paths[0]


def _section_paths(section: bytes) -> list[str]:
paths = []
for index, line in enumerate(section.splitlines()):
path = ""
if index == 0:
paths.extend(_diff_paths(line))
continue
for prefix in (b"--- ", b"+++ "):
if line.startswith(prefix):
path = _decode_diff_path(line[len(prefix) :])
break
for prefix in (b"rename from ", b"rename to ", b"copy from ", b"copy to "):
if line.startswith(prefix):
path = _decode_diff_path(line[len(prefix) :])
break
if path:
paths.append(path)
return list(dict.fromkeys(paths))


def _is_excluded(path: str) -> bool:
"""Report whether a diff path is vendored or a known generated/lockfile."""
if not path:
return False
if any(path.startswith(prefix) for prefix in EXCLUDE_PREFIXES):
return True
return any(path.endswith(suffix) for suffix in EXCLUDE_SUFFIXES)


def filter_and_decode_diff(raw: bytes) -> tuple[str, dict]:
"""Strip vendored/generated noise from a raw diff, cap it, and decode it.

Splits ``raw`` into per-file ``diff --git`` sections, drops vendored and
generated/lockfile sections, retains sections up to ``DIFF_MAX_BYTES``, and
decodes the kept bytes with errors="backslashreplace" so a stray non-UTF-8
byte is preserved losslessly instead of raising. Returns the decoded text and
metadata: ``dropped_sections`` (count), ``dropped_paths`` (bounded list),
``truncated`` (bool), ``kept_bytes``.
"""
sections = DIFF_GIT_SPLIT_PATTERN.split(raw)
kept_chunks: list[bytes] = []
dropped_sections = 0
dropped_paths: list[str] = []
dropped_paths_omitted = 0
kept_bytes = 0
truncated = False
for section in sections:
if not section:
continue
if not section.startswith(b"diff --git "):
# Leading preamble before the first file header (rare); keep verbatim.
if section.strip():
kept_chunks.append(section)
kept_bytes += len(section)
continue
paths = _section_paths(section)
if paths and all(_is_excluded(path) for path in paths):
dropped_sections += 1
for path in paths:
if len(dropped_paths) < DIFF_DROPPED_PATH_LIMIT:
dropped_paths.append(path)
else:
dropped_paths_omitted += 1
continue
if kept_bytes + len(section) > DIFF_MAX_BYTES:
truncated = True
break
kept_chunks.append(section)
kept_bytes += len(section)

text = b"".join(kept_chunks).decode("utf-8", "backslashreplace")
if truncated:
text += f"\n[diff truncated to {DIFF_MAX_BYTES} bytes for review context]\n"
return text, {
"dropped_sections": dropped_sections,
"dropped_paths": dropped_paths,
"dropped_paths_omitted": dropped_paths_omitted,
"truncated": truncated,
"kept_bytes": kept_bytes,
"partial": bool(dropped_sections or truncated),
}


def empty_incremental_diff_metadata() -> dict:
return {
"dropped_sections": 0,
"dropped_paths": [],
"dropped_paths_omitted": 0,
"truncated": False,
"kept_bytes": 0,
"partial": False,
}


def incremental_diff_notice(meta: dict) -> str:
lines = [
"[incremental diff partial coverage]",
(
f"Dropped {meta['dropped_sections']} vendored/generated/lockfile "
f"section(s); kept {meta['kept_bytes']} bytes."
),
]
if meta["dropped_paths"]:
lines.append("Dropped paths:")
lines.extend(f"- {path}" for path in meta["dropped_paths"])
if meta["dropped_paths_omitted"]:
lines.append(f"- ... {meta['dropped_paths_omitted']} more path(s)")
if meta["truncated"]:
lines.append(f"Diff truncated to {DIFF_MAX_BYTES} bytes.")
lines.append("Scan the full PR diff before issuing a no-blocking-issues verdict.")
return "\n".join(lines) + "\n\n"


def gh_api_paginate(endpoint: str) -> list[dict]:
"""Fetch all pages from a gh api endpoint."""
result = gh_api(
Expand Down Expand Up @@ -140,9 +351,10 @@ def parse_paginated_json(output: str) -> list[dict]:
return entries


def fetch_compare_diff(head_repo: str, base_sha: str, head_sha: str) -> Optional[str]:
def fetch_compare_diff(head_repo: str, base_sha: str, head_sha: str) -> tuple[Optional[str], dict]:
"""Fetch a compare diff from the PR head repo for incremental review."""
endpoint = f"repos/{head_repo}/compare/{base_sha}...{head_sha}"
meta = empty_incremental_diff_metadata()
try:
metadata = gh_api([endpoint])
compare = json.loads(metadata.stdout)
Expand All @@ -152,17 +364,35 @@ def fetch_compare_diff(head_repo: str, base_sha: str, head_sha: str) -> Optional
f"Compare status is {status!r}, using full review mode",
file=sys.stderr,
)
return None
result = gh_api(["-H", "Accept: application/vnd.github.diff", endpoint])
return None, meta
raw = gh_api_bytes(["-H", "Accept: application/vnd.github.diff", endpoint])
except subprocess.CalledProcessError as e:
print(
f"Could not fetch incremental diff from {head_repo}: {e.stderr}",
file=sys.stderr,
)
return None
if not result.stdout.strip():
return None
return result.stdout
return None, meta
if not raw.strip():
return None, meta
diff_text, meta = filter_and_decode_diff(raw)
if meta["truncated"] and meta["kept_bytes"] == 0:
print(
"Incremental diff truncated before retaining reviewable content, "
"using full review mode",
file=sys.stderr,
)
return None, meta
if not diff_text.strip():
return None, meta
if meta["partial"]:
diff_text = incremental_diff_notice(meta) + diff_text
print(
f"Incremental diff: dropped {meta['dropped_sections']} vendored/generated "
f"section(s), kept {meta['kept_bytes']} bytes"
f"{', truncated' if meta['truncated'] else ''}",
file=sys.stderr,
)
return diff_text, meta


def current_checkout_sha() -> Optional[str]:
Expand Down Expand Up @@ -309,6 +539,7 @@ def main():
# provide a compact incremental artifact.
review_mode = "full"
incremental_diff_path = None
incremental_diff_metadata = empty_incremental_diff_metadata()
if not last_reviewed_sha:
print("No previous review state found, using full review mode")
elif last_review_base_sha != current_base_sha:
Expand All @@ -318,7 +549,11 @@ def main():
print("PR head repository is unavailable, using full review mode")
last_reviewed_sha = None
else:
incremental_diff = fetch_compare_diff(head_repo, last_reviewed_sha, current_sha)
incremental_diff, incremental_diff_metadata = fetch_compare_diff(
head_repo,
last_reviewed_sha,
current_sha,
)
if incremental_diff:
incremental_diff_path = os.path.join(".github", "incremental.diff")
os.makedirs(os.path.dirname(incremental_diff_path), exist_ok=True)
Expand Down Expand Up @@ -355,6 +590,7 @@ def main():
"last_review_base_sha": last_review_base_sha,
"summary_comment_id": summary_comment_id,
"incremental_diff_path": incremental_diff_path,
"incremental_diff_metadata": incremental_diff_metadata,
"existing_findings": existing_findings,
"comments": trusted_context_comments,
}
Expand Down
Loading