Skip to content

Commit 0b1dc43

Browse files
authored
ci: add GH-managed prek cache image bootstrap workflow (#566)
## Summary Adding Megatron dependencies via uv makes CI expensive, so this change builds a cached image containing the Megatron-related dependencies to do the heavy lifting. The image can be rebuilt by a dedicated workflow whenever the Megatron dependencies are modified. This PR adds that workflow to main so that #560 can test the workflow and CI.
1 parent 86d347b commit 0b1dc43

4 files changed

Lines changed: 552 additions & 0 deletions

File tree

Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,180 @@
# Manually triggered workflow that builds the prewarm ("prek") CI cache image
# from docker/ci-prek-cache.Dockerfile, pushes it to GHCR under an immutable
# `prek-megatron-<fingerprint>` tag plus the moving `prek-megatron-current`
# tag, and then prunes older managed image versions from the package.
name: Build Prek Cache Image

on:
  workflow_dispatch:
    inputs:
      image_repo:
        description: GHCR image repository
        required: true
        default: ghcr.io/openpipe/art-ci
        type: string
      base_image:
        description: Base image used for prewarm build
        required: true
        default: pytorch/pytorch:2.9.0-cuda12.8-cudnn9-devel
        type: string
      python_mm:
        description: Python major.minor for CI metadata
        required: true
        # Quoted so the version stays a string (3.10 would otherwise be 3.1).
        default: "3.11"
        type: string
      build_jobs:
        description: Parallel build jobs for native builds (auto or positive integer)
        required: true
        default: auto
        type: string
      fingerprint:
        description: Megatron fingerprint for immutable tag
        required: true
        type: string

permissions:
  contents: read
  # Required to push the image and to delete old package versions.
  packages: write

jobs:
  build-and-push:
    runs-on: art-large-runner

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      # Validate the repository input before the expensive build so that a
      # malformed value fails fast instead of after the image was pushed.
      - name: Resolve GHCR package coordinates
        id: package
        shell: bash
        env:
          # Inputs are passed through the environment rather than interpolated
          # into the script, so they are never parsed as shell code.
          IMAGE_REPO: ${{ inputs.image_repo }}
        run: |
          if [[ ! "${IMAGE_REPO}" =~ ^ghcr\.io/([^/]+)/([^:@]+)$ ]]; then
            echo "::error::image_repo must be in format ghcr.io/<owner>/<package>"
            exit 1
          fi
          echo "owner=${BASH_REMATCH[1]}" >> "${GITHUB_OUTPUT}"
          echo "package=${BASH_REMATCH[2]}" >> "${GITHUB_OUTPUT}"

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Log in to GHCR
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # Picks the number of parallel native build jobs: either the explicit
      # positive integer requested, or a value derived from runner memory.
      - name: Resolve build parallelism
        id: parallelism
        shell: bash
        env:
          REQUESTED_JOBS: ${{ inputs.build_jobs }}
        run: |
          requested="${REQUESTED_JOBS}"
          mem_kib="$(awk '/MemTotal/ {print $2}' /proc/meminfo)"
          mem_gib="$((mem_kib / 1024 / 1024))"

          if [[ "${requested}" == "auto" ]]; then
            # Keep headroom for linker and container overhead.
            if (( mem_gib >= 28 )); then
              jobs=4
            elif (( mem_gib >= 20 )); then
              jobs=3
            elif (( mem_gib >= 14 )); then
              jobs=2
            else
              jobs=1
            fi
          else
            if ! [[ "${requested}" =~ ^[1-9][0-9]*$ ]]; then
              echo "::error::build_jobs must be 'auto' or a positive integer."
              exit 1
            fi
            jobs="${requested}"
          fi

          echo "jobs=${jobs}" >> "${GITHUB_OUTPUT}"
          echo "mem_gib=${mem_gib}" >> "${GITHUB_OUTPUT}"
          echo "Using ${jobs} parallel build jobs on ${mem_gib} GiB runner memory."

      - name: Build and push image
        uses: docker/build-push-action@v6
        with:
          context: .
          file: docker/ci-prek-cache.Dockerfile
          push: true
          tags: |
            ${{ inputs.image_repo }}:prek-megatron-${{ inputs.fingerprint }}
            ${{ inputs.image_repo }}:prek-megatron-current
          build-args: |
            BASE_IMAGE=${{ inputs.base_image }}
            CI_PYTHON_MM=${{ inputs.python_mm }}
            MEGATRON_FINGERPRINT=${{ inputs.fingerprint }}
            BUILD_JOBS=${{ steps.parallelism.outputs.jobs }}
          cache-from: type=gha,scope=prek-cache-image
          cache-to: type=gha,mode=max,scope=prek-cache-image

      # Deletes managed package versions beyond KEEP_COUNT, newest first.
      # Uses the org-scoped packages API, so the owner must be an organization.
      - name: Prune old managed GHCR image versions
        id: prune
        uses: actions/github-script@v7
        env:
          IMAGE_OWNER: ${{ steps.package.outputs.owner }}
          PACKAGE_NAME: ${{ steps.package.outputs.package }}
          KEEP_COUNT: "4"
        with:
          script: |
            const owner = process.env.IMAGE_OWNER;
            const packageName = process.env.PACKAGE_NAME;
            const keepCount = Number(process.env.KEEP_COUNT || "4");
            // A NaN keep count would make slice() delete every version.
            if (!Number.isInteger(keepCount) || keepCount < 1) {
              throw new Error(
                `KEEP_COUNT must be a positive integer, got "${process.env.KEEP_COUNT}".`,
              );
            }
            const currentTag = "prek-megatron-current";
            const immutablePrefix = "prek-megatron-";
            const tagsOf = (version) => version.metadata?.container?.tags || [];

            const versions = await github.paginate(
              github.rest.packages.getAllPackageVersionsForPackageOwnedByOrg,
              {
                org: owner,
                package_type: "container",
                package_name: packageName,
                per_page: 100,
              },
            );

            // The current tag shares the immutable prefix, so one prefix test
            // selects every version this workflow manages.
            const managed = versions
              .filter((version) =>
                tagsOf(version).some((tag) => tag.startsWith(immutablePrefix)),
              )
              .sort(
                (a, b) =>
                  new Date(b.created_at).getTime() - new Date(a.created_at).getTime(),
              );

            // Never delete the version that carries the moving current tag,
            // even when it is not among the newest by creation time.
            const pinned = managed.filter((version) =>
              tagsOf(version).includes(currentTag),
            );
            const unpinned = managed.filter(
              (version) => !tagsOf(version).includes(currentTag),
            );
            const unpinnedBudget = Math.max(keepCount - pinned.length, 0);
            const toDelete = unpinned.slice(unpinnedBudget);
            const retainedCount = managed.length - toDelete.length;

            for (const version of toDelete) {
              await github.rest.packages.deletePackageVersionForPackageOwnedByOrg({
                org: owner,
                package_type: "container",
                package_name: packageName,
                package_version_id: version.id,
              });
            }

            core.setOutput("managed_total", String(managed.length));
            core.setOutput("deleted_count", String(toDelete.length));
            core.setOutput("retained_count", String(retainedCount));
            core.info(
              `Managed versions: ${managed.length}; deleted: ${toDelete.length}; retained: ${retainedCount}.`,
            );

      # Writes a Markdown report to the job summary. Values arrive via env and
      # the backticks live in single-quoted printf formats, so nothing in the
      # report is ever evaluated as a command substitution.
      - name: Summarize tags
        shell: bash
        env:
          IMAGE_REPO: ${{ inputs.image_repo }}
          FINGERPRINT: ${{ inputs.fingerprint }}
          BASE_IMAGE: ${{ inputs.base_image }}
          MEM_GIB: ${{ steps.parallelism.outputs.mem_gib }}
          BUILD_JOBS: ${{ steps.parallelism.outputs.jobs }}
          MANAGED_TOTAL: ${{ steps.prune.outputs.managed_total }}
          DELETED_COUNT: ${{ steps.prune.outputs.deleted_count }}
          RETAINED_COUNT: ${{ steps.prune.outputs.retained_count }}
        run: |
          {
            echo "## Prek Cache Image Pushed"
            echo ""
            printf -- '- Immutable: `%s:prek-megatron-%s`\n' "${IMAGE_REPO}" "${FINGERPRINT}"
            printf -- '- Current: `%s:prek-megatron-current`\n' "${IMAGE_REPO}"
            printf -- '- Base image: `%s`\n' "${BASE_IMAGE}"
            printf -- '- Runner memory: `%s GiB`\n' "${MEM_GIB}"
            printf -- '- Build jobs: `%s`\n' "${BUILD_JOBS}"
            printf -- '- Managed versions seen: `%s`\n' "${MANAGED_TOTAL}"
            printf -- '- Versions deleted: `%s`\n' "${DELETED_COUNT}"
            printf -- '- Versions retained: `%s`\n' "${RETAINED_COUNT}"
          } >> "${GITHUB_STEP_SUMMARY}"

docker/ci-prek-cache.Dockerfile

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
# CI cache image: installs uv on top of the base image, pre-warms the uv cache
# with the project's locked dependencies, and records build metadata under
# /etc/art-ci for later inspection.
ARG BASE_IMAGE=pytorch/pytorch:2.9.0-cuda12.8-cudnn9-devel
FROM ${BASE_IMAGE}

# An ARG declared before FROM is out of scope inside the build stage.
# Redeclare it without a value so the metadata file written below records the
# actual base image instead of an empty string.
ARG BASE_IMAGE
ARG CI_PYTHON_MM=3.11
ARG MEGATRON_FINGERPRINT=unset
ARG BUILD_JOBS=2

# uv cache location and link mode, plus parallelism limits for native builds.
ENV UV_CACHE_DIR=/root/.cache/uv
ENV UV_LINK_MODE=copy
ENV UV_CONCURRENT_BUILDS=${BUILD_JOBS}
ENV CMAKE_BUILD_PARALLEL_LEVEL=${BUILD_JOBS}
ENV MAX_JOBS=${BUILD_JOBS}
ENV NINJAFLAGS=-j${BUILD_JOBS}
ENV TORCH_CUDA_ARCH_LIST=8.0

RUN apt-get update && \
    apt-get install -y --no-install-recommends ca-certificates curl git && \
    rm -rf /var/lib/apt/lists/*

RUN curl -LsSf https://astral.sh/uv/install.sh | sh
# Current uv installers place the binary in ~/.local/bin; older releases used
# ~/.cargo/bin. Keep both on PATH so `uv` resolves with either layout.
ENV PATH="/root/.local/bin:/root/.cargo/bin:${PATH}"

WORKDIR /opt/art-prek-bootstrap
COPY pyproject.toml uv.lock ./

# Pre-warm uv cache with the full CI dependency surface while avoiding editable install.
# The virtualenv itself is discarded; only the populated uv cache is kept.
RUN uv sync --frozen --all-extras --group dev --no-install-project && \
    rm -rf /opt/art-prek-bootstrap/.venv

# Record the build inputs this image was produced from.
RUN mkdir -p /etc/art-ci && \
    printf '%s' "${MEGATRON_FINGERPRINT}" > /etc/art-ci/megatron_fingerprint && \
    printf '%s' "${BASE_IMAGE}" > /etc/art-ci/base_image && \
    printf '%s' "${CI_PYTHON_MM}" > /etc/art-ci/python_mm

WORKDIR /workspace

0 commit comments

Comments
 (0)