diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index 2d468c368ad..03ba90f7706 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -192,9 +192,9 @@ This pattern ensures proper encoding, timestamps, and file attributes are handle ## CI / Build Investigation -**dotnet/android's primary CI runs on Azure DevOps (internal), not GitHub Actions.** When a user asks about CI status, CI failures, why a PR is blocked, or build errors: +**dotnet/android PR validation runs on the public Azure DevOps `dotnet-android` pipeline on `dnceng-public`, not GitHub Actions.** When a user asks about CI status, CI failures, why a PR is blocked, or build errors: -1. **ALWAYS invoke the `ci-status` skill first** — do NOT rely on `gh pr checks` alone. GitHub checks may all show ✅ while the internal Azure DevOps build is failing. +1. **ALWAYS invoke the `ci-status` skill first.** The pipeline surfaces as ~39 `dotnet-android (...)` GitHub checks, but the skill adds build progress, ETA, per-stage failures, and failed-test names that `gh pr checks` alone doesn't give you. 2. The skill auto-detects the current PR from the git branch when no PR number is given. 3. For deep .binlog analysis, use the `azdo-build-investigator` skill. 4. Only after the skill confirms no Azure DevOps failures should you report CI as passing. diff --git a/.github/skills/android-reviewer/SKILL.md b/.github/skills/android-reviewer/SKILL.md index 3327beb4d34..7f130fa5a68 100644 --- a/.github/skills/android-reviewer/SKILL.md +++ b/.github/skills/android-reviewer/SKILL.md @@ -57,7 +57,6 @@ Review the CI results. **Never post ✅ LGTM if any required CI check is failing - Investigate the failure using the **azdo-build-investigator** skill (for Azure DevOps pipeline failures) or GitHub Actions job logs. - If the failure is caused by the PR's code changes, flag it as ❌ error. 
- If the failure is a known infrastructure issue or pre-existing flake unrelated to the PR, note it in the summary but still use ⚠️ Needs Changes — the PR isn't mergeable until CI is green. -- If **all public CI checks pass** but only the internal `Xamarin.Android-PR` check is failing, still use ⚠️ Needs Changes with a note that the internal pipeline may need a re-run. Do not give ✅ LGTM. - If the PR description acknowledges the failure and documents a dependency (e.g., "blocked on X"), note it in the summary. ### 5. Load review rules diff --git a/.github/skills/ci-status/SKILL.md b/.github/skills/ci-status/SKILL.md index 49d08331a6d..e500a6aabf8 100644 --- a/.github/skills/ci-status/SKILL.md +++ b/.github/skills/ci-status/SKILL.md @@ -4,304 +4,199 @@ description: > Check CI build status and investigate failures for dotnet/android PRs. ALWAYS use this skill when the user asks "check CI", "CI status", "why is CI failing", "is CI green", "why is my PR blocked", or anything about build status on a PR. Auto-detects the current PR from the git branch when no - PR number is given. Covers both GitHub checks and internal Azure DevOps builds. + PR number is given. Covers GitHub checks and the public Azure DevOps pipeline (dnceng-public). DO NOT USE FOR: GitHub Actions workflow authoring, non-dotnet/android repos. --- # CI Status -Check CI status and investigate build failures for dotnet/android PRs. +Triage CI for a `dotnet/android` PR in two phases: **Phase 1** (always) gathers status and renders the report; **Phase 2** (only when asked) drills in via the references. Run the commands verbatim — the `jq`/`az` queries are exact and fragile. -**Key fact:** dotnet/android's primary CI runs on Azure DevOps (internal). GitHub checks alone are insufficient — they may all show ✅ while the internal build is failing. +Every PR runs **one** public Azure DevOps build: pipeline **`dotnet-android`** on `dev.azure.com/dnceng-public` (project `public`, definition `333`), full test matrix. 
It surfaces on GitHub as ~39 `dotnet-android (...)` checks plus `license/cla`, all backed by that single build. -## Prerequisites +## Pipeline facts (apply throughout) -| Tool | Check | Setup | -|------|-------|-------| -| `gh` | `gh --version` | https://cli.github.com/ | -| `az` + devops ext | `az version` | `az extension add --name azure-devops` then `az login` | +Everything else is standard `gh`/`az` plus the **azure-devops** CLI extension (`az extension add --name azure-devops`); only these are non-obvious: -If `az` is not authenticated, stop and tell the user to run `az login`. +- **Judge pass/fail by the build `result` + GitHub check states — never by the test API.** Device-test lanes run with `continueOnError`, so flaky failures (notably `System.NetTests.SslTest.*`, or failures only in flavor lanes like `-TrimModePartial`/`-NoAab`) show as failed tests on otherwise-green builds. +- **Expect a fork PR to await `/azp run` approval** (re-approved per push); direct PRs auto-start on push. Forks change only triggering, not which pipeline runs. +- **Query test results with `az rest`** — `az devops invoke --area test --resource runs` 404s on dnceng-public, so use `az rest` for the `runs` and `ResultsByBuild` endpoints. Other `--area test` resources (e.g. `--resource results`, see references/azdo-queries.md) work fine. The `build` area works unauthenticated; `az rest` and log/artifact downloads need `az login` (else 401). -## Workflow +## Phase 1 — Status (always) -### Phase 1: Quick Status (always do this first) +Run the steps in order; each `jq` reuses a file an earlier fetch saved: -#### Step 1 — Resolve the PR and detect fork status - -**No PR specified** — detect from current branch: - -```bash -gh pr view --json number,title,url,headRefName,isCrossRepository --jq '{number,title,url,headRefName,isCrossRepository}' -``` - -**PR number given** — use it directly: +1. **Resolve the PR** and its build id — stop if none or not yet built. +2. 
**Fetch the build result** and save the timeline. +3. **Derive** job status (3a), per-job timing (3b), and the failing-job test breakdown (3c); when the build is red, also run the deep failure analysis (3d). +4. **Decide the verdict**, then **write the report**. ```bash -gh pr view $PR --repo dotnet/android --json number,title,url,headRefName,isCrossRepository --jq '{number,title,url,headRefName,isCrossRepository}' +ORG=https://dev.azure.com/dnceng-public; PROJECT=public ``` -If no PR exists for the current branch, tell the user and stop. - -**`isCrossRepository`** tells you whether the PR is from a fork: -- `true` → **fork PR** (external contributor) -- `false` → **direct PR** (team member, branch in dotnet/android) - -This matters for CI behavior: -- **Fork PRs:** `Xamarin.Android-PR` does NOT run. `dotnet-android` runs the full pipeline including tests. -- **Direct PRs:** `Xamarin.Android-PR` runs the full test suite. `dotnet-android` skips test stages (build-only) since tests run on DevDiv instead. - -Highlight the fork status in the output so the user understands which checks to expect. - -#### Step 2 — Get GitHub check status +**Step 1 — Resolve the PR.** Drop `--repo`/`$PR` to auto-detect from the current branch: ```bash -gh pr checks $PR --repo dotnet/android --json "name,state,link,bucket" 2>&1 \ - | jq '[.[] | {name, state, bucket, link}]' +gh pr view $PR --repo dotnet/android --json number,title,isCrossRepository +gh pr checks $PR --repo dotnet/android --json name,state,link +BUILD_ID=$(gh pr checks $PR --repo dotnet/android --json name,link \ + --jq '[.[]|select(.name|startswith("dotnet-android")).link][0]' | grep -oE 'buildId=[0-9]+' | cut -d= -f2 | head -1) ``` -```powershell -gh pr checks $PR --repo dotnet/android --json "name,state,link,bucket" | ConvertFrom-Json -``` - -Note which checks passed/failed/pending. The `link` field contains the AZDO build URL for internal checks. 
- -#### Step 3 — Get Azure DevOps build status (repeat for EACH build) +If `BUILD_ID` is empty (checks "Expected", no build URL), the pipeline hasn't started — report "awaiting `/azp run` approval" (fork) or "not triggered yet" (direct), then stop. -There are typically **two separate AZDO builds** for a dotnet/android PR. They run **independently** — neither waits for the other: -- **`dotnet-android`** on `dev.azure.com/dnceng-public` — Defined in `azure-pipelines-public.yaml` with an explicit `pr:` trigger. - - **Fork PRs:** runs the full pipeline including build + tests (since `Xamarin.Android-PR` won't run for forks). - - **Direct PRs:** runs **build-only** — test stages are auto-skipped because those run on DevDiv instead. This means the `dotnet-android` build will be significantly shorter for direct PRs. -- **`Xamarin.Android-PR`** on `devdiv.visualstudio.com` — full test suite, MAUI integration, compliance. Defined in `azure-pipelines.yaml` but its PR trigger is configured in the AZDO UI, not in YAML. - - **Fork PRs:** does NOT run at all (no access to internal resources). - - **Direct PRs:** runs the full test matrix. May take a few minutes to start after a push. - -Use the **pipeline definition name** (from the `definitionName` field) as the label in output — do NOT label them "Public" or "Internal". - -When a check shows **"Expected — Waiting for status to be reported"** on GitHub (typically `Xamarin.Android-PR`): -- **For direct PRs:** the pipeline hasn't been triggered yet — this is normal, it's not waiting for the other build, just for AZDO to pick it up. Report it as: "⏳ Not triggered yet — typically starts within a few minutes of a push." -- **For fork PRs:** `Xamarin.Android-PR` will NOT run. Report: "⏳ Will not run — fork PRs don't trigger the internal pipeline." - -Extract AZDO build URLs from the check `link` fields. 
Parse `{orgUrl}`, `{project}`, and `{buildId}` from patterns: -- `https://dev.azure.com/{org}/{project}/_build/results?buildId={id}` -- `https://{org}.visualstudio.com/{project}/_build/results?buildId={id}` - -**Run Steps 3, 3a, and 3b for each AZDO build independently.** The builds have different pipelines, different job counts, and different typical durations — each gets its own progress and ETA. - -For each build, first get the overall status including start time and definition ID: +**Step 2 — Fetch the build result and save the timeline** (both valid mid-build; `/tmp/tl.json` is reused by Steps 3–4): ```bash -az devops invoke --area build --resource builds \ +az devops invoke --area build --resource builds --org $ORG \ --route-parameters project=$PROJECT buildId=$BUILD_ID \ - --org $ORG_URL \ - --query "{status:status, result:result, startTime:startTime, finishTime:finishTime, definitionId:definition.id, definitionName:definition.name}" \ - --output json 2>&1 -``` + --query "{status:status, result:result, startTime:startTime, finishTime:finishTime}" -o json -**Compute elapsed time:** Subtract `startTime` from the current time (or from `finishTime` if the build is complete). Present as e.g. "Ran for 42 min" or "Running for 42 min". 
- -Then fetch the build timeline for **all jobs** (to get progress counts) and **any failures so far** — even when the build is still in progress: - -```bash -az devops invoke --area build --resource timeline \ - --route-parameters project=$PROJECT buildId=$BUILD_ID \ - --org $ORG_URL \ - --query "records[?type=='Job'] | [].{name:name, state:state, result:result}" \ - --output json 2>&1 +az devops invoke --area build --resource timeline --org $ORG \ + --route-parameters project=$PROJECT buildId=$BUILD_ID --query "records[]" -o json > /tmp/tl.json ``` -**Compute job progress counters** from the timeline response: -- Count jobs where `state == 'completed'` → **finished** -- Count jobs where `state == 'inProgress'` → **running** -- Count jobs where `state == 'pending'` → **waiting** -- Total = finished + running + waiting - -Then fetch failures: +**Step 3a — List job status, then failing records.** `state` is `completed`/`inProgress`/`pending` (pending is often 0 — stages start in parallel). Trust failing `issues[]` for the root cause; check names (e.g. `dotnet-android (Linux Tests Linux > Tests > MSBuild 2)`) already name the lane: ```bash -az devops invoke --area build --resource timeline \ - --route-parameters project=$PROJECT buildId=$BUILD_ID \ - --org $ORG_URL \ - --query "records[?result=='failed'] | [].{name:name, type:type, result:result, issues:issues, errorCount:errorCount, log:log}" \ - --output json 2>&1 +jq -r '.[]|select(.type=="Job")|[(.result // .state), .name]|@tsv' /tmp/tl.json | sort +jq -r '.[]|select(.result=="failed" or .result=="canceled")|[.type,.name,((.issues//[])|map(.message)|join(" | "))]|@tsv' /tmp/tl.json ``` -Check `issues` arrays first — they often contain the root cause directly. +**Step 3b — Time every job and spell out its status.** Emit one row per job: `Status` · `Wait` (build start → job start: upstream builds + agent queue) · `Run` (execution) · `Finished` (… ago, or `running`). 
Always spell `Status` out — never a bare icon (this vocabulary is reused in the report): -#### Step 3a — Estimate completion time per build (when build is in progress) - -Use the `definitionId` from the build to query recent successful builds of the **same pipeline definition** and compute the median duration. **Do this separately for each build** — the pipelines have very different durations. - -**Important:** The `dotnet-android` pipeline duration varies significantly based on whether the PR is from a fork: -- **Direct PRs:** `dotnet-android` runs build-only (tests skipped) — typically much shorter (~1h 45min) -- **Fork PRs:** `dotnet-android` runs the full pipeline with tests — typically much longer - -To get accurate ETAs, filter historical builds to match the current PR type. You can approximate this by looking at the **job count** of the current build vs historical builds — build-only runs have ~3 jobs while full runs have many more. Alternatively, compare the historical durations and pick the ones that are similar in magnitude to what you'd expect for the current build type. 
+- `✅ Passed` · `❌ Failed` · `⏹️ Canceled` +- `⏱️ Timed out (N-min cap)` — a `canceled` job whose `issues[]` says *"ran longer than the maximum time"* (read N from the message) +- `🟡 Running` · `⏳ Queued` ```bash -az devops invoke --area build --resource builds \ - --route-parameters project=$PROJECT \ - --org $ORG_URL \ - --query-parameters "definitions=$DEF_ID&statusFilter=completed&resultFilter=succeeded&\$top=10" \ - --query "value[].{startTime:startTime, finishTime:finishTime}" \ - --output json 2>&1 +jq -r ' + def secs: sub("\\.[0-9]+";"")|fromdateiso8601; + def hms: if .==null then "—" else (./1|floor) as $s|($s/3600|floor) as $h|(($s%3600)/60|floor) as $m|($s%60) as $x| + if $h>0 then "\($h)h\(if $m<10 then "0" else "" end)\($m)m" elif $m>0 then "\($m)m\(if $x<10 then "0" else "" end)\($x)s" else "\($x)s" end end; + def reason: + ((.issues//[])|map(.message)|join(" ")) as $msg + | if .result=="succeeded" then "✅ Passed" + elif .result=="canceled" or .result=="failed" then + (if ($msg|test("maximum time of")) then ($msg|capture("maximum time of (?<m>[0-9]+) minutes")|"⏱️ Timed out (\(.m)-min cap)") + elif .result=="canceled" then "⏹️ Canceled" else "❌ Failed" end) + elif .state=="inProgress" then "🟡 Running" + elif .state=="pending" then "⏳ Queued" + else "· \(.result // .state)" end; + (now) as $now | ([.[]|select(.startTime!=null)|(.startTime|secs)]|min) as $t0 + | .[]|select(.type=="Job") + | [ reason, .name, + (if .startTime then ((.startTime|secs)-$t0|hms) else "—" end), + (if .startTime then (((.finishTime|if .==null then $now else secs end))-(.startTime|secs)|hms) else "—" end), + (if .finishTime then (($now-(.finishTime|secs))|hms)+" ago" elif .state=="inProgress" then "running" else "—" end) ] + | @tsv' /tmp/tl.json | sort -t$'\t' -k2 | column -t -s$'\t' ``` -**Compute ETA:** -1. For each recent build, calculate `duration = finishTime - startTime` -2. 
Filter to builds with similar duration profile (short ~1-2h for build-only, long ~3h+ for full runs) matching the current PR type -3. Compute the **median** duration of the filtered set (more robust than average against outliers) -4. `ETA = startTime + medianDuration` -5. Present as: "ETA: ~14:30 UTC (typical for direct PRs: ~1h 45min)" - -If `startTime` is null (build hasn't started yet), skip the ETA and say "Build queued, not started yet". -If the build already completed, skip the ETA and show the actual duration instead. - -#### Step 3b — Check for failed tests (always do this, especially when the build is still running) - -**This step is critical when the build is in progress.** Test results are published as jobs complete, so failures may already be visible before the build finishes. Surfacing these early lets the user start fixing them immediately. +The `reason` function detects timeout from each job's own `issues[]`. Refine a bare `❌ Failed` with the Step 3c count: **0 failed tests ⇒ a canceled `Run tests` task or the `fail if any issues occurred` gate, not a real failure** — say so. 
-Query test runs for this build: +**Step 3c — Fetch failed tests + per-flavor counts** (two `az rest` calls; `--area test --resource runs` 404s here, so we use `az rest` directly): **(a)** failed test names + their `runId`; **(b)** every run's per-flavor counts + its phase (`unanalyzedTests`=failed, `notApplicableTests`=skipped): ```bash -az devops invoke --area test --resource runs \ - --route-parameters project=$PROJECT \ - --org $ORG_URL \ - --query-parameters "buildUri=vstfs:///Build/Build/$BUILD_ID" \ - --query "value[?runStatistics[?outcome=='Failed']] | [].{id:id, name:name, totalTests:totalTests, state:state, stats:runStatistics}" \ - --output json 2>&1 +RES=499b84ac-1321-427f-aa17-267ca6975798 # Azure DevOps app id +az rest --method get --resource $RES \ + --url "$ORG/$PROJECT/_apis/test/ResultsByBuild?buildId=$BUILD_ID&outcomes=Failed&api-version=7.1-preview" \ + --query "value[].{test:automatedTestName, runId:runId}" -o json > /tmp/failed.json + +az rest --method get --resource $RES \ + --url "$ORG/$PROJECT/_apis/test/runs?buildUri=vstfs:///Build/Build/$BUILD_ID&api-version=7.1&includeRunDetails=true" \ + --query "value[].{id:id, name:name, total:totalTests, passed:passedTests, failed:unanalyzedTests, skipped:notApplicableTests, phase:pipelineReference.phaseReference.phaseName}" -o json > /tmp/runs.json ``` -For each test run that has failures, fetch the failed test results: +Then build the breakdown — for each failed/canceled job, list its flavors (test runs) with `passed/total · fail · skip`, failed test names nested beneath: ```bash -az devops invoke --area test --resource results \ - --route-parameters project=$PROJECT runId=$RUN_ID \ - --org $ORG_URL \ - --query-parameters "outcomes=Failed&\$top=20" \ - --query "value[].{testName:testCaseTitle, outcome:outcome, errorMessage:errorMessage, durationMs:durationInMs}" \ - --output json 2>&1 +jq -r --slurpfile failed /tmp/failed.json --slurpfile tl /tmp/tl.json ' + [$tl[0][]|select(.type=="Phase")] as $ph 
+ | ($ph|map(select(.result=="failed" or .result=="canceled"))|map(.refName)) as $bad + | $failed[0] as $ft + | group_by(.phase)[] | select(.[0].phase as $p|$bad|index($p)) + | .[0].phase as $p | ($ph[]|select(.refName==$p)|.name) as $job + | "### \($job) — \(map(.total)|add) tests: \(map(.passed)|add) passed, \(map(.failed)|add) failed, \(map(.skipped)|add) skipped", + (sort_by(-.failed,.name)[] + | (if .failed>0 then "❌" else "✅" end) as $m + | " \($m) \(.name) (\(.passed)/\(.total) pass, \(.failed) fail, \(.skipped) skip)", + (.id as $rid|$ft[]|select(.runId==$rid)|" ↳ \(.test)")) +' /tmp/runs.json ``` -If the `errorMessage` is truncated or absent, you can fetch a single test result's full details: +`ResultsByBuild` returns every failed test across runs (only `Failed`/`Aborted` are queryable). Matrix lanes that share one phase (e.g. `MSBuild+Emulator`) aggregate in the breakdown — use the Step 3b timing table to pinpoint the numbered job that died. For per-test error/stack, the ETA query, and the run→job mapping, see [references/azdo-queries.md](references/azdo-queries.md). 
-```bash -az devops invoke --area test --resource results \ - --route-parameters project=$PROJECT runId=$RUN_ID testId=$TEST_ID \ - --org $ORG_URL \ - --query "{testName:testCaseTitle, errorMessage:errorMessage, stackTrace:stackTrace}" \ - --output json 2>&1 -``` - -#### Step 4 — Present summary - -Use this format — **one section per AZDO build**, each with its own progress and ETA: +**Step 3d — Deep failure analysis (run whenever the build is red).** From the repo root, run the bundled C# file-based app — it turns raw failures into the **per-test cross-config matrix**, **crash detection**, and **branch cross-reference** the report needs (makes its own `az`/`gh` calls, needs `az login` and the .NET SDK, ~15–45 s — scales with the affected test family + retries): +```bash +dotnet run .github/skills/ci-status/scripts/ci_failures.cs -- --build-id $BUILD_ID --pr $PR ``` -# CI Status for PR #NNNN — "PR Title" -🔀 **Direct PR** (branch in dotnet/android) — or 🍴 **Fork PR** (external contributor) -## GitHub Checks -| Check | Status | -|-------|--------| -| check-name | ✅ / ❌ / 🟡 | +(First run restores/builds the app, so allow a few extra seconds. Omit `--pr $PR` to skip the branch cross-reference.) -## dotnet-android [#BuildId](link) -**Result:** ✅ Succeeded / ❌ Failed / 🟡 In Progress -ℹ️ Build-only (tests run on Xamarin.Android-PR for direct PRs) — or ℹ️ Full pipeline with tests (fork PR) -⏱️ Running for **12 min** · ETA: ~15:15 UTC (typical for direct PRs: ~1h 45min) -📊 Jobs: **0/3 completed** · 1 running · 2 waiting +It prints three report-ready sections: +- **Cross-config matrix** — per failed test: the flavors/OSes where it **failed** vs **passed**, with same-build retries shown as `Failed→Passed (retry)` (a retry that passes ⇒ flaky), plus the assembly and the assert/stack. Failing in one flavor/OS only localizes the cause; failing across many is systemic. 
+- **Crashed / incomplete lanes** — lanes that went red with *no* usable failed-test list (`Zero tests ran`, an incomplete run, or a timeout/hang). The culprit (a test that **started but never finished**, or a native crash) lives only in the device **logcat**; the script prints the download+grep command (also in [references/azdo-queries.md](references/azdo-queries.md)). +- **Branch cross-reference** — PR-changed files whose name matches a failing test's class/namespace/assembly: a lead for an obvious cause. Confirm against the diff before asserting causation. -| Job | Status | -|-----|--------| -| macOS > Build | 🟡 In Progress | -| Linux > Build | ⏳ Waiting | -| Windows > Build & Smoke Test | ⏳ Waiting | -## Xamarin.Android-PR [#BuildId](link) -**Result:** ✅ Succeeded / ❌ Failed / 🟡 In Progress -— or for fork PRs: ⏳ **Will not run** — fork PRs don't trigger this pipeline -⏱️ Running for **42 min** · ETA: ~15:45 UTC (typical: ~2h 30min) -📊 Jobs: **18/56 completed** · 6 running · 32 waiting +### Step 4 — Verdict (decide before writing). Judge by build `result` + checks, NOT the failed-test count: -### Failures (if any) -❌ Stage > Job > Task - Error: +- **`result: failed`, or any ❌ check → red.** Lead with the gating failures (their jobs + tests). If the build is still running with a job already failed, surface it so the user can start fixing now. +- **`result: succeeded` and all checks green → green** — even if `ResultsByBuild` lists failures, those are flaky `continueOnError` lanes. Note them in one line; don't block. -### Failed Tests (if any — even while build is still running) -| Test Run | Failed | Total | -|----------|--------|-------| -| run-name | N | M | +### Report format -**Failed test names:** -- `Namespace.TestClass.TestMethod` — brief error message -- ... +Emit this structure (omit sections that don't apply). Spell out every `Status` per the Step 3b vocabulary, refining `❌ Failed` with the Step 3c count: -## What next? -1. 
View full logs / stack traces for a test failure -2. Download and analyze .binlog artifacts -3. Retry failed stages ``` +# CI Status — PR #NNNN "<title>" +🔀 Direct PR (or 🍴 Fork PR — may await `/azp run` approval) -**Progress section guidelines:** -- Always show fork status (🔀 Direct PR / 🍴 Fork PR) at the top — it determines which builds run and their expected durations -- For `dotnet-android`, note whether it's build-only (direct PR) or full pipeline (fork PR) -- For `Xamarin.Android-PR` on fork PRs, don't try to query it — just report "Will not run" -- Always show elapsed time when `startTime` is available -- Show ETA when the build is in progress and historical data is available. If the build has been running longer than the median, say "overdue by ~X min" -- Show job counters as "N/Total completed · M running · P waiting" -- If the build hasn't started yet, show "⏳ Not triggered yet — typically starts within a few minutes of a push" -- If a check is in "Expected" state with no build URL on a direct PR, the AZDO pipeline hasn't picked it up yet — this is normal and not gated on other builds - -**If the build is still running but tests have already failed**, highlight these prominently so the user can start fixing them immediately. Use a note like: - -> ⚠️ Build still in progress, but **N tests have already failed** — you can start investigating these now. +## dotnet-android [#<buildId>](<link>) +**Result:** ✅ Succeeded / ❌ Failed / 🟡 In Progress +⏱️ <elapsed> · ETA ~HH:MM UTC (rough — recent runs ≈50 min–3 h) ← only while in progress +📊 Jobs: <done>/<total> done · <running> running · <waiting> waiting -**If no failures found anywhere**, report CI as green and stop. 
+| Stage > Job | Status | Wait | Run | Finished | +|-------------|--------|------|-----|----------| +| Mac > macOS > Build | ✅ Passed | 12m | 23m | 8h28m ago | +| Package Tests > macOS > Tests > APKs 2 | ❌ Failed — 1 test (flaky GC) | 1h42m | 1h13m | 6h12m ago | +| Package Tests > macOS > Tests > APKs 1 | ❌ Failed — 0 tests (canceled run / gate) | 1h41m | 26m31s | 7h02m ago | +| MSBuild Emulator Tests > … > MSBuild+Emulator 6 | ⏱️ Timed out (180-min cap) | 1h44m | 3h00m | 4h21m ago | +(List every job, or — for a large matrix — the failed/canceled/timed-out lanes plus the slowest few.) -### Phase 2: Deep Investigation (only if user requests) +### Failures ← if any +❌ <Stage> > <Job> — <first error from issues[]> -Only proceed here if the user asks to investigate a specific failure, view logs, or analyze binlogs. +### Failed tests — cross-config (Step 3d) ← one block per failed test +**`SslWithinTasksShouldWork`** (`System.NetTests.SslTest` · `microsoft.android.run.dll`) +- ❌ failed: `NoAab` (Failed→Passed on retry), `TrimModePartial` (Failed→Passed on retry) +- ✅ passed: `Release`, `CoreCLR`, `Debug`, +4 more +- `System.Net.WebException : 503 Service Unavailable` ⇒ flaky network, non-gating + at System.NetTests.SslTest.SslWithinTasksShouldWork() -#### Fetch logs +### Crashed / incomplete lanes (Step 3d) ← if any +⚠️ **Mono.Android.NET_Tests-Debug** — `run` task succeededWithIssues, no results published ("Zero tests ran" / native crash). Name the culprit from logcat (Step 3d command). -Get the `log.id` from failed timeline records, then: +### Branch cross-reference (Step 3d) ← if --pr and a name overlaps +🔍 `SomeType.SomeTest` ⟵ `src/.../SomeType.cs` changed in this PR — likely cause; confirm in the diff. 
-```bash -az devops invoke --area build --resource logs \ - --route-parameters project=$PROJECT buildId=$BUILD_ID logId=$LOG_ID \ - --org $ORG_URL --project $PROJECT \ - --out-file "/tmp/azdo-log-$LOG_ID.log" 2>&1 -tail -40 "/tmp/azdo-log-$LOG_ID.log" -``` +## Verdict: ✅ green / ❌ red — <one-line reason> -```powershell -$logFile = Join-Path $env:TEMP "azdo-log-$LOG_ID.log" -az devops invoke --area build --resource logs ` - --route-parameters project=$PROJECT buildId=$BUILD_ID logId=$LOG_ID ` - --org $ORG_URL --project $PROJECT ` - --out-file $logFile -Get-Content $logFile -Tail 40 +## What next? +1. Logs / stack trace for a failure +2. `.binlog` (+ `logcat-*.txt` for device-test crashes) +3. Re-run a flaky/failed stage with `/azp run` ``` -#### Analyze .binlog artifacts - -See [references/binlog-analysis.md](references/binlog-analysis.md) for binlog download and analysis commands. - -#### Categorize failures - -See [references/error-patterns.md](references/error-patterns.md) for dotnet/android-specific error patterns and categorization. - -## Error Handling +Notes: every `dotnet-android (...)` check is one job, so the Stage > Job table *is* the check list (the only non-`dotnet-android` check is `license/cla`). Step 3d's cross-config matrix is the fastest way to tell a real failure (fails across flavors/OSes, never passes on retry) from a flake (single flavor, or `Failed→Passed` on retry). For a crashed lane with no failed-test list, name the culprit from the device `logcat-<flavor>.txt` (Step 3d's command; recipe in [references/azdo-queries.md](references/azdo-queries.md)) — not the test message. -- **Build in progress:** Still query for failed timeline records AND test runs. Report any early failures alongside the in-progress status. Only offer `gh pr checks --watch` if there are no failures yet. -- **Check in "Expected" state (no build URL):** The AZDO pipeline hasn't been triggered yet. 
This is normal — the two pipelines (`dotnet-android` and `Xamarin.Android-PR`) run independently, not sequentially. Report: "⏳ Not triggered yet — typically starts within a few minutes of a push." Do NOT say it's waiting for the other build. -- **Auth expired:** Tell user to run `az login` and retry. -- **Build not found:** Verify the PR number/build ID is correct. -- **No test runs yet:** The build may not have reached the test phase. Report what's available and note that tests haven't started. +## Phase 2 — Deep dive (only when asked) -## Tips +Read the matching reference, then act on it: -- Focus on the **first** error chronologically — later errors often cascade -- `.binlog` has richer detail than text logs when logs show only "Build FAILED" -- `issues` in timeline records often contain the root cause without needing to download logs +- Logs, per-test error/stack, ETA, per-flavor breakdown fields + run→job mapping, **crash-culprit from logcat** → [references/azdo-queries.md](references/azdo-queries.md) +- `.binlog` download + analysis → [references/binlog-analysis.md](references/binlog-analysis.md) +- Categorize a failure (real / flaky / infra) → [references/error-patterns.md](references/error-patterns.md) diff --git a/.github/skills/ci-status/references/azdo-queries.md b/.github/skills/ci-status/references/azdo-queries.md new file mode 100644 index 00000000000..9642bb30736 --- /dev/null +++ b/.github/skills/ci-status/references/azdo-queries.md @@ -0,0 +1,88 @@ +# AZDO queries (dnceng-public) + +Deeper `az` commands for the `dotnet-android` build, beyond the core ones in SKILL.md. 
Shared setup: + +```bash +ORG=https://dev.azure.com/dnceng-public; PROJECT=public +RES=499b84ac-1321-427f-aa17-267ca6975798 # Azure DevOps app id, for `az rest --resource` +``` + +`build`-area `az devops invoke` works unauthenticated; in the `test` area only `--resource runs` is broken (404 on dnceng-public, so `runs` and `ResultsByBuild` go through `az rest`) — other resources like `--resource results` work fine. `az rest` and artifact/log downloads need `az login`. + +## ETA for an in-progress build + +Duration is dominated by hosted-agent queue time (same ~38 jobs every run, yet ~50 min to ~3 h+). Pull recent green runs of def `333`, take the **median** duration, `ETA = startTime + median`; present it as a rough window. + +```bash +az devops invoke --area build --resource builds --org $ORG \ + --route-parameters project=$PROJECT \ + --query-parameters "definitions=333&statusFilter=completed&resultFilter=succeeded&\$top=10" \ + --query "value[].{start:startTime, finish:finishTime}" -o json +``` + +## Failed-test error message / stack trace + +`ResultsByBuild` (SKILL.md) gives the names + `runId`. For messages, list the run's failed results — the single-result-by-`testId` route returns null here. Repeat per distinct `runId`: + +```bash +az devops invoke --area test --resource results --org $ORG \ + --route-parameters project=$PROJECT runId=$RUN_ID \ + --query-parameters "outcomes=Failed&\$top=20" \ + --query "value[].{test:testCaseTitle, error:errorMessage, stack:stackTrace}" -o json +``` + +## Per-flavor test breakdown — fields & run → job mapping + +The breakdown in SKILL.md fetches `/tmp/runs.json` from `/_apis/test/runs?...&includeRunDetails=true`. Field meanings per run (one run = one test *flavor*, e.g. 
`Mono.Android.NET_Tests-NativeAOT`): + +| Field | Source | Meaning | +|-------|--------|---------| +| `total` | `totalTests` | all tests in the run | +| `passed` | `passedTests` | passed | +| `failed` | `unanalyzedTests` | failed/aborted | +| `skipped` | `notApplicableTests` | skipped / inconclusive | +| `phase` | `pipelineReference.phaseReference.phaseName` | the pipeline phase the run belongs to | + +`run.phase` equals a timeline **Phase** record's `refName`; that record's `name` is the human lane — e.g. `mac_apk_tests_net_2` → `macOS > Tests > APKs 2`. That join (`runs` × timeline phases) is what the breakdown `jq` does. **Matrix lanes that share one phase** (e.g. all `MSBuild+Emulator N` jobs are phase `mac_dotnetdevice_tests`) aggregate into a single breakdown block — use the per-job timing table to see which numbered job actually failed/timed out. + +Quick per-run counts without the join: + +```bash +az rest --method get --resource $RES \ + --url "$ORG/$PROJECT/_apis/test/runs?buildUri=vstfs:///Build/Build/$BUILD_ID&api-version=7.1&includeRunDetails=true" \ + --query "value[].{name:name, total:totalTests, passed:passedTests, failed:unanalyzedTests, skipped:notApplicableTests}" -o json +``` + +To enrich the breakdown with the **actual error message** under each failed test, replace `/tmp/failed.json` with per-run results that include `errorMessage` (the "Failed-test error message" query above) — key them by `runId` the same way the breakdown's `$ft` lookup does. 
+ +## Fetch a failed task's log + +Take `log.id` from a `records[?result=='failed']` timeline entry, then download it with `az rest` (needs `az login`, like every `az rest` call in this file): + +```bash +az rest --method get --resource $RES \ + --url "$ORG/$PROJECT/_apis/build/builds/$BUILD_ID/logs/$LOG_ID?api-version=7.1" --output-file "/tmp/azdo-$LOG_ID.log" +``` + +The per-flavor `run <flavor>` task log holds the MTP summary (`Test run summary: Zero tests ran` ⇒ the app crashed at startup); the per-test lifecycle and native crash are **not** here — they are in logcat (below). + +## Crash culprit from logcat + +`scripts/ci_failures.cs` flags crashed/incomplete/timed-out lanes, but the culprit test is only in the device **logcat**, published inside that lane's `Test Results - ...` build artifact (100 MB–2 GB — prefer the smaller `Debug` lane). Download it, then scan `logcat-<flavor>.txt`: + +```bash +# list artifacts + sizes to pick the failing lane: +az rest --method get --resource $RES \ + --url "$ORG/$PROJECT/_apis/build/builds/$BUILD_ID/artifacts?api-version=7.1" \ + --query "value[].{name:name, mb:(resource.properties.artifactsize)}" -o json + +az pipelines runs artifact download --run-id $BUILD_ID --org $ORG --project $PROJECT \ + --artifact-name "Test Results - APKs .NET Debug - macOS 1" --path /tmp/cilogs + +# The crasher is the LAST test that logged a start with no matching pass/fail, +# usually right before a native signal: +grep -nE 'Running |\[PASS\]|\[FAIL\]|SIGSEGV|SIGABRT|tombstone|FATAL|art::|JNI DETECTED|Process .* died' \ + /tmp/cilogs/**/logcat-*.txt | tail -60 +``` + +For a `Zero tests ran` lane the crash is at app startup (look for the first `SIGSEGV`/`tombstone`/`JNI DETECTED ERROR`, not a specific test); for a timeout the suspect is the last `Running <test>` with no result.
diff --git a/.github/skills/ci-status/references/binlog-analysis.md b/.github/skills/ci-status/references/binlog-analysis.md index 320b7fb9853..7066e8d522b 100644 --- a/.github/skills/ci-status/references/binlog-analysis.md +++ b/.github/skills/ci-status/references/binlog-analysis.md @@ -20,7 +20,11 @@ az pipelines runs artifact list --run-id $BUILD_ID --org $ORG_URL --project $PRO az pipelines runs artifact list --run-id $BUILD_ID --org $ORG_URL --project $PROJECT --output json ``` -Look for artifact names containing `binlog`, `msbuild`, or `build-log`. +Look for artifact names that contain build logs. On the `dotnet-android` (dnceng-public) pipeline the relevant ones are: +- `Build Results - macOS` / `Build Results - Windows` / `Build Results - Linux` — contain the `.binlog` files (published mainly when a build stage fails or when `XA.PublishAllLogs` is set). +- `Test Results - ...` — per-test-stage logs and artifacts. For the on-device `Package Tests` (APKs) stage these also include each device test's `build-<testName>.binlog`, `run-<testName>.binlog`, the `.trx`, and `logcat-<testName>.txt` (essential for native/JNI crash diagnosis). + +If a green build has no `Build Results - *` artifact, the binlogs weren't published; re-run with `XA.PublishAllLogs` or rely on the timeline/test queries instead. ### Download diff --git a/.github/skills/ci-status/references/error-patterns.md b/.github/skills/ci-status/references/error-patterns.md index f705a080548..d3e0efbf764 100644 --- a/.github/skills/ci-status/references/error-patterns.md +++ b/.github/skills/ci-status/references/error-patterns.md @@ -36,7 +36,7 @@ These are CI environment issues, not code problems. 
| Network | `Unable to load the service index`, `Connection refused` | | NuGet feed | `NU1301` (feed connectivity) | | Agent issues | `The agent did not connect`, `##[error] The job was canceled` | -| Timeout (job-level) | Job canceled after 55+ minutes | +| Timeout (job-level) | `result: canceled` + `issues[]` says *"ran longer than the maximum time of N minutes"* | ## Decision Tree diff --git a/.github/skills/ci-status/scripts/ci_failures.cs b/.github/skills/ci-status/scripts/ci_failures.cs new file mode 100644 index 00000000000..4719e9b80ce --- /dev/null +++ b/.github/skills/ci-status/scripts/ci_failures.cs @@ -0,0 +1,368 @@ +#!/usr/bin/env dotnet +// Enriched failure analysis for one dnceng-public `dotnet-android` build: +// 1. cross-config matrix per failed test (failed/passed/retried configs) + stack/asserts +// 2. crashed / incomplete lanes (started-but-not-finished culprit lives in logcat) +// 3. branch cross-reference (PR changes that name a failing test's class/namespace/assembly) +// +// Needs `az login`. Usage: dotnet run ci_failures.cs -- --build-id N [--pr N] [--repo dotnet/android] + +using System.Collections.Concurrent; +using System.Diagnostics; +using System.Text.Json; +using System.Text.Json.Nodes; +using System.Text.RegularExpressions; + +const string ORG = "https://dev.azure.com/dnceng-public"; +const string PROJECT = "public"; +const string RES = "499b84ac-1321-427f-aa17-267ca6975798"; + +// ---------------- argument parsing ---------------- +string? buildId = null, pr = null, repo = "dotnet/android"; +for (int i = 0; i < args.Length; i++) { + switch (args [i]) { + case "--build-id": buildId = ++i < args.Length ? args [i] : null; break; + case "--pr": pr = ++i < args.Length ? args [i] : null; break; + case "--repo": repo = ++i < args.Length ? 
args [i] : repo; break;
	}
}
if (string.IsNullOrEmpty (buildId)) {
	Console.Error.WriteLine ("usage: dotnet run ci_failures.cs -- --build-id N [--pr N] [--repo dotnet/android]");
	return 1;
}

// ---------------- main ----------------
// Three REST documents feed every section: the build's failed results, its test runs, and its timeline.
var failedDoc = AzJson ($"{ORG}/{PROJECT}/_apis/test/ResultsByBuild?buildId={buildId}&outcomes=Failed&api-version=7.1-preview");
var runsDoc = AzJson ($"{ORG}/{PROJECT}/_apis/test/runs?buildUri=vstfs:///Build/Build/{buildId}&api-version=7.1&includeRunDetails=true");
var timelineDoc = AzJson ($"{ORG}/{PROJECT}/_apis/build/builds/{buildId}/timeline?api-version=7.1");

// AzJson returns null when `az` fails (e.g. expired login). Remember that, so an empty
// result set caused by a failed query is not presented as "nothing failed".
bool incomplete = failedDoc is null || runsDoc is null || timelineDoc is null;

var failed = GetArray (failedDoc, "value");
var runs = GetArray (runsDoc, "value");
var timeline = timelineDoc ?? new JsonObject ();

var runById = new Dictionary<int, JsonNode> ();
foreach (var r in runs)
	if (r is not null)
		runById [ToInt (r ["id"])] = r;

P ($"# Failure analysis - build {buildId}");
P ();
if (incomplete) {
	P ("> **Warning:** at least one Azure DevOps query failed (details on stderr), so this report may be missing failures. Run `az login`, check the build id, and retry before treating the build as green.");
	P ();
}
if (failed.Count > 0)
	SectionMatrix (buildId, failed, runs, runById);
else {
	P ("_No failed tests in the test API (build may still be red via crash/timeout below)._");
	P ();
}
SectionCrashes (buildId, runs, timeline);
if (!string.IsNullOrEmpty (pr))
	SectionXref (failed, repo, pr);

return 0;

// ---------------- section 1: cross-config matrix ----------------
// For each distinct failed test, prints the outcome of that same test in every sibling run
// (runs whose name shares the same BaseOf() family), then the first available error message
// and the first few stack lines.
void SectionMatrix (string bid, JsonArray failed, JsonArray runs, Dictionary<int, JsonNode> runById)
{
	// test name -> ids of the runs it failed in; test name -> reported test storage (shown as "assembly")
	var failRuns = new Dictionary<string, HashSet<int>> ();
	var storage = new Dictionary<string, string?> ();
	foreach (var f in failed) {
		if (f is null)
			continue;
		var name = Str (f ["automatedTestName"]);
		if (name.Length == 0)
			continue;
		if (!failRuns.TryGetValue (name, out var set))
			failRuns [name] = set = new HashSet<int> ();
		set.Add (ToInt (f ["runId"]));
		storage [name] = StrN (f ["automatedTestStorage"]);
	}

	// Family of the first failing run that is present in `runById`; "" when none is.
	string FirstBase (HashSet<int> rids)
	{
		foreach (var r in rids)
			if (runById.TryGetValue (r, out var run))
				return BaseOf (Str (run ["name"]));
		return "";
	}

	// test name -> family; family -> every run of this build that belongs to it
	var fam = failRuns.ToDictionary (kv => kv.Key, kv => FirstBase (kv.Value));
	var cand = new Dictionary<string, List<JsonNode>> ();
	foreach (var fk in fam.Values.Distinct ()) {
		var list = new List<JsonNode> ();
		foreach (var r in runs)
			if (r is not null && BaseOf (Str (r ["name"])) == fk)
				list.Add (r);
		cand [fk] = list;
	}

	// Fetch the per-test results of every candidate run once, up front.
	var ids = new HashSet<int> ();
	foreach (var fk in fam.Values)
		foreach (var r in cand [fk])
			ids.Add (ToInt (r ["id"]));
	var cache = FetchAll (ids);

	P ($"## Failed-test cross-config matrix — {failRuns.Count} distinct test(s)");
	P ();
	foreach (var n in failRuns.Keys.OrderBy (x => x, StringComparer.Ordinal)) {
		var fk = fam [n];

		// run name -> (completedDate, outcome) of this test; several entries mean the run name repeated (retry)
		var cfg = new Dictionary<string, List<(string completed, string outcome)>> ();
		foreach (var r in cand [fk]) {
			var rid = ToInt (r ["id"]);
			if (cache.TryGetValue (rid, out var m) && m.TryGetValue (n, out var row)) {
				var rname = Str (r ["name"]);
				if (!cfg.TryGetValue (rname, out var lst))
					cfg [rname] = lst = new List<(string, string)> ();
				lst.Add ((StrN (r ["completedDate"]) ?? "", row.outcome ?? ""));
			}
		}

		int li = n.LastIndexOf ('.');
		string shortN = li >= 0 ? n [(li + 1)..] : n;
		string ns = li >= 0 ? n [..li] : n;
		P ($"### `{shortN}` ({ns})");
		storage.TryGetValue (n, out var asm);
		P ($"- assembly `{asm}` · family `{fk}`");

		var fl = new List<string> ();   // configs with at least one "Failed"
		var pa = new List<string> ();   // configs where every attempt is "Passed"
		var ot = new List<string> ();   // anything else
		foreach (var name in cfg.Keys.OrderBy (x => x, StringComparer.Ordinal)) {
			var outs = cfg [name]
				.OrderBy (t => t.completed, StringComparer.Ordinal)
				.ThenBy (t => t.outcome, StringComparer.Ordinal)
				.Select (t => t.outcome).ToList ();
			// Show only the part of the run name that distinguishes it from its family.
			string label = name.Length >= fk.Length ? name [fk.Length..] : name;
			label = label.TrimStart (' ', '-');
			if (label.Length == 0)
				label = name;
			string disp = outs.Distinct ().Count () > 1 ? string.Join ("->", outs) + " (retry)" : outs [0];
			string entry = $"`{label}`" + (disp == "Passed" ? "" : $" ({disp})");
			if (outs.Contains ("Failed"))
				fl.Add (entry);
			else if (outs.Distinct ().Count () == 1 && outs [0] == "Passed")
				pa.Add (entry);
			else
				ot.Add (entry);
		}
		P ($"- FAILED in: {(fl.Count > 0 ? string.Join (", ", fl) : "-")}");
		P ($"- passed in: {(pa.Count > 0 ? string.Join (", ", pa) : "-")}");
		if (ot.Count > 0)
			P ($"- other: {string.Join (", ", ot)}");

		// Print the error of the first failing run that has one, then stop.
		foreach (var rid in failRuns [n]) {
			if (cache.TryGetValue (rid, out var m) && m.TryGetValue (n, out var row) && !string.IsNullOrEmpty (row.err)) {
				P ($"- assert/error: {Trunc (Lines (row.err) [0], 300)}");
				if (!string.IsNullOrEmpty (row.stack)) {
					P (" ```");
					foreach (var ln in Lines (row.stack).Take (6))
						P (" " + Trunc (ln, 200));
					P (" ```");
				}
				break;
			}
		}
		P ();
	}
}

// ---------------- section 2: crashed / incomplete lanes ----------------
// Lists lanes that went red without a usable failed-test list: runs with incomplete tests,
// `run <flavor>` tasks that did not cleanly succeed and published no complete run, and jobs
// canceled for exceeding their time limit. Ends with the commands for fetching the logcat.
void SectionCrashes (string bid, JsonArray runs, JsonNode timeline)
{
	var recs = (timeline ["records"] as JsonArray) ?? new JsonArray ();
	var published = new Dictionary<string, JsonNode> ();
	foreach (var r in runs)
		if (r is not null)
			published [Str (r ["name"])] = r;
	var crashed = new List<(string name, string why)> ();
	// incomplete test runs (runner died mid-run)
	foreach (var r in runs) {
		if (r is null)
			continue;
		int inc = ToInt (r ["incompleteTests"]);
		if (inc > 0)
			crashed.Add ((Str (r ["name"]), $"{inc} test(s) did not complete - runner died mid-run"));
	}
	// "run <flavor>" tasks that did not cleanly succeed AND published no (complete) results = crash/zero-tests
	foreach (var rec in recs) {
		if (rec is null)
			continue;
		var type = Str (rec ["type"]);
		var name = Str (rec ["name"]);
		var result = Str (rec ["result"]);
		// Ordinal: task names are identifiers, not linguistic text.
		if (type == "Task" && name.StartsWith ("run ", StringComparison.Ordinal) &&
		    (result == "failed" || result == "succeededWithIssues" || result == "canceled")) {
			var flavor = name [4..].Trim ();
			if (!published.TryGetValue (flavor, out var run) || ToInt (run ["incompleteTests"]) > 0)
				crashed.Add ((flavor, $"`run` task {result} but no complete test run published - app likely crashed ('Zero tests ran' / native crash)"));
		}
	}
	// job-level timeouts (hang)
	foreach (var rec in recs) {
		if (rec is null)
			continue;
		if (Str (rec ["type"]) == "Job" && Str (rec ["result"]) == "canceled") {
			var issues = (rec ["issues"] as JsonArray) ?? new JsonArray ();
			var msg = string.Join (" ", issues.Select (i => Str (i? ["message"])));
			var m = Regex.Match (msg, @"maximum time of (\d+) minutes");
			if (m.Success)
				crashed.Add ((Str (rec ["name"]), $"timed out at {m.Groups [1].Value}-min cap - likely a hung test; last started test in logcat is the suspect"));
		}
	}
	if (crashed.Count == 0)
		return;
	P ("## Crashed / incomplete lanes (!)");
	P ();
	P ("These went red with **no usable failed-test list** - the culprit (a test that **started but never finished**, or a native crash) is only in the device **logcat**, not the test API:");
	P ();
	// Identical (lane, reason) pairs are printed once.
	var seen = new HashSet<(string, string)> ();
	foreach (var (name, why) in crashed) {
		if (!seen.Add ((name, why)))
			continue;
		P ($"- **{name}** - {why}");
	}
	P ();
	P ("To name the culprit, list this build's artifacts and download the matching `Test Results - ...` lane (large: 100MB-2GB - prefer a `Debug` lane), then scan its logcat (see references/azdo-queries.md):");
	P ();
	P ("```bash");
	P ($"az pipelines runs artifact list --run-id {bid} --org {ORG} --project {PROJECT} \\");
	P (@" --query '[?starts_with(name, `Test Results`)].name' -o tsv");
	P ($"az pipelines runs artifact download --run-id {bid} --org {ORG} --project {PROJECT} \\");
	P (" --artifact-name \"<paste matching Test Results - ... name>\" --path /tmp/cilogs");
	// Verbatim string: a SINGLE trailing backslash is the bash line continuation.
	// (`\\` here would print two backslashes and split the command.)
	P (@"grep -nE 'Running |\[PASS\]|\[FAIL\]|SIGSEGV|SIGABRT|tombstone|FATAL|art::|JNI DETECTED|Process .*died' \");
	P (" /tmp/cilogs/**/logcat-*.txt | tail -60 # last test that STARTED with no PASS/FAIL = crasher");
	P ("```");
	P ();
}

// ---------------- section 3: branch cross-reference ----------------
// Lists files changed by the PR (`gh pr diff --name-only`) whose file-name stem matches a
// failing test's class, method or namespace segment, or whose path contains the class name.
void SectionXref (JsonArray failed, string repo, string pr)
{
	var names = new SortedSet<string> (StringComparer.Ordinal);
	foreach (var f in failed)
		if (f is not null) {
			var n = Str (f ["automatedTestName"]);
			if (n.Length > 0)
				names.Add (n);
		}
	if (names.Count == 0)
		return;
	var (code, stdout, stderr) = Run ("gh", "pr", "diff", pr, "--repo", repo, "--name-only");
	if (code != 0) {
		Console.Error.Write ($"gh diff failed: {Trunc (stderr, 200)}\n");
		return;
	}
	var files = stdout.Replace ("\r\n", "\n").Split ('\n').Where (l => l.Trim ().Length > 0).ToList ();
	// file-name stem (leaf without its last extension) -> path; a later file with the same stem wins
	var stems = new Dictionary<string, string> ();
	foreach (var f in files) {
		var leaf = f.Contains ('/') ? f [(f.LastIndexOf ('/') + 1)..] : f;
		var stem = leaf.Contains ('.') ? leaf [..leaf.LastIndexOf ('.')] : leaf;
		stems [stem] = f;
	}
	P ("## Branch cross-reference");
	P ();
	P ($"PR #{pr} changes {files.Count} file(s). Name overlaps with failing tests (judge if causal):");
	P ();
	bool anyHit = false;
	foreach (var n in names) {
		// "<namespace parts>.<Class>.<Method>": the last segment is the method, the one before it the class.
		var parts = n.Split ('.');
		string cls = parts.Length >= 2 ? parts [^2] : "";
		string method = parts [^1];
		var nsParts = parts.Take (Math.Max (0, parts.Length - 2)).ToHashSet ();
		var hits = new SortedSet<string> (StringComparer.Ordinal);
		foreach (var (stem, path) in stems) {
			if (stem.Length > 0 && (stem == cls || stem == method || nsParts.Contains (stem) || (cls.Length > 0 && path.Contains (cls))))
				hits.Add (path);
		}
		if (hits.Count > 0) {
			anyHit = true;
			P ($"- `{cls}.{method}` <- {string.Join (", ", hits.Take (5).Select (h => "`" + h + "`"))}");
		}
	}
	if (!anyHit)
		P ("- No direct file-name overlap. Check whether changed runtime/build code affects the failing assembly.");
	P ();
}

// ---------------- helpers ----------------
// GETs `url` through `az rest` and parses the JSON body.
// Returns null (after logging to stderr when `az` itself failed) if the call fails or the output is not JSON.
JsonNode? AzJson (string url)
{
	var (code, stdout, stderr) = Run ("az", "rest", "--method", "get", "--resource", RES, "--url", url, "-o", "json");
	if (code != 0) {
		Console.Error.Write ($"az error {url}\n{Trunc (stderr, 300)}\n");
		return null;
	}
	try {
		return JsonNode.Parse (stdout);
	} catch (JsonException) {
		return null;
	}
}

// Fetches the results of one test run and indexes them by automatedTestName
// (the last row wins when a name repeats). A failed query yields an empty map.
(int rid, Dictionary<string, (string? outcome, string? err, string? stack)> map) RunResults (int rid)
{
	var data = AzJson ($"{ORG}/{PROJECT}/_apis/test/Runs/{rid}/results?api-version=7.1&$top=5000");
	var outd = new Dictionary<string, (string?, string?, string?)> ();
	var arr = (data? ["value"] as JsonArray) ?? new JsonArray ();
	foreach (var row in arr) {
		if (row is null)
			continue;
		var n = Str (row ["automatedTestName"]);
		if (n.Length > 0)
			outd [n] = (StrN (row ["outcome"]), StrN (row ["errorMessage"]), StrN (row ["stackTrace"]));
	}
	return (rid, outd);
}

// Runs RunResults for every id, at most 6 at a time, and returns run id -> per-test results.
Dictionary<int, Dictionary<string, (string? outcome, string? err, string? stack)>> FetchAll (IEnumerable<int> rids)
{
	var result = new ConcurrentDictionary<int, Dictionary<string, (string?, string?, string?)>> ();
	var list = rids.ToList ();
	if (list.Count == 0)
		return new Dictionary<int, Dictionary<string, (string?, string?, string?)>> ();
	Parallel.ForEach (list, new ParallelOptions { MaxDegreeOfParallelism = 6 }, rid => {
		var (r, o) = RunResults (rid);
		result [r] = o;
	});
	return new Dictionary<int, Dictionary<string, (string?, string?, string?)>> (result);
}

// Strip flavor/OS/index suffix so sibling configs share one base.
// 'Mono.Android.NET_Tests-NativeAOT' -> 'Mono.Android.NET_Tests';
// 'Xamarin.Android.Build.Tests - macOS-7' -> 'Xamarin.Android.Build.Tests'.
static string BaseOf (string name)
{
	var b = Regex.Replace (name, @" - (macOS|Windows|Linux)(-\d+)?$", "");
	b = Regex.Replace (b, @"-[A-Za-z0-9]+$", "");
	return b;
}

// Runs `file` with `cliArgs` (passed as an argument list, never through a shell) and captures
// its exit code, stdout and stderr. A process that cannot be started yields code -1.
static (int code, string stdout, string stderr) Run (string file, params string [] cliArgs)
{
	var psi = new ProcessStartInfo (file) {
		RedirectStandardOutput = true,
		RedirectStandardError = true,
		UseShellExecute = false,
	};
	foreach (var a in cliArgs)
		psi.ArgumentList.Add (a);
	try {
		using var proc = Process.Start (psi);
		if (proc is null)
			return (-1, "", $"failed to start {file}");
		// Drain stderr concurrently with stdout: reading one redirected stream to the end
		// before the other can deadlock once the child fills the unread pipe's buffer.
		var stderrTask = proc.StandardError.ReadToEndAsync ();
		string stdout = proc.StandardOutput.ReadToEnd ();
		string stderr = stderrTask.GetAwaiter ().GetResult ();
		proc.WaitForExit ();
		return (proc.ExitCode, stdout, stderr);
	} catch (System.ComponentModel.Win32Exception e) {
		// Executable missing or not runnable: report it like any other start failure.
		return (-1, "", $"failed to start {file}: {e.Message}");
	}
}

// root[key] as an array, or an empty array when root is null or the member is missing / not an array.
static JsonArray GetArray (JsonNode? root, string key)
	=> root? [key] as JsonArray ?? new JsonArray ();

// String value of `node`, or "" when it is null or not a JSON string.
static string Str (JsonNode? node)
	=> node is null || node.GetValueKind () != JsonValueKind.String ? "" : node.GetValue<string> ();

// String value of `node`, or null when it is null or not a JSON string.
static string? StrN (JsonNode? node)
	=> node is null || node.GetValueKind () != JsonValueKind.String ? null : node.GetValue<string> ();

// Int32 value of `node`, or 0 when it is null, not a JSON number, or does not fit in an int.
static int ToInt (JsonNode? node)
	=> node is JsonValue v && v.GetValueKind () == JsonValueKind.Number && v.TryGetValue<int> (out var i) ? i : 0;

// First `n` characters of `s` (all of `s` when it is shorter).
static string Trunc (string s, int n)
	=> s.Length <= n ? s : s [..n];

// Trims `s` and splits it on newlines (CRLF normalized to LF); always returns at least one element.
static string [] Lines (string s)
{
	var t = s.Replace ("\r\n", "\n").Trim ();
	return t.Split ('\n');
}

// Writes one line of the Markdown report to stdout.
static void P (string s = "") => Console.WriteLine (s);