diff --git a/experiments/astraea-sandbox/README.md b/experiments/astraea-sandbox/README.md new file mode 100644 index 000000000..0d1c75678 --- /dev/null +++ b/experiments/astraea-sandbox/README.md @@ -0,0 +1,55 @@ +# Astraea-in-Hyperlight Experiment + +## Goal +Run Astraea's core comparison logic inside a Hyperlight micro-VM sandbox to: +1. Learn the Hyperlight guest development model +2. Discover API gaps / pain points to contribute back +3. Evaluate feasibility of sandboxed data comparison for ODV + +## Architecture + +``` +Host (normal Rust binary) Guest (no_std + alloc, x86_64-hyperlight-none) +┌─────────────────────────┐ ┌──────────────────────────────────┐ +│ 1. Parse CSV files │ │ │ +│ 2. Serialize rows to │──map_region──> │ 3. Deserialize rows │ +│ shared memory │ │ 4. Compare with tolerance │ +│ │<──return────── │ 5. Return diff categories │ +│ 6. Read results │ │ │ +└─────────────────────────┘ └──────────────────────────────────┘ +``` + +## Guest Functions + +- `compare_rows(left_ptr, left_len, right_ptr, right_len, config_ptr, config_len) -> result_ptr` + - Input: serialized row pairs + comparison config (simple binary format, not JSON) + - Output: serialized diff result (match/mismatch per column) + +## What Works in Guest (alloc available) +- Vec, String, HashMap (via alloc) +- Custom allocator (hyperlight provides one) +- Float parsing, tolerance comparison +- Sorting, deduplication + +## What Doesn't Work (no std) +- File I/O (csv crate) -- host parses, guest compares +- Full regex -- use regex-automata with alloc, or simple string matching +- Threads -- single-threaded only +- Networking -- obviously + +## Data Passing Strategy +- Host serializes rows as length-prefixed byte arrays into a mapped region +- Guest reads from known GPA offset +- Guest writes results to scratch region +- Simple binary protocol (no serde_json in guest) + +## Files to Create +1. `src/guests/astraea_guest/Cargo.toml` -- guest crate +2. 
`src/guests/astraea_guest/src/main.rs` -- guest entry point +3. `examples/astraea_sandbox.rs` -- host-side example + +## Open Questions +- Is map_region the right way to pass large data? Or should we use the PEB scratch area? +- What's the max practical data size for a single comparison batch? +- Can we reuse the sandbox across multiple comparisons (multi-use sandbox)? +- What's the overhead vs calling Astraea directly? (benchmark needed) diff --git a/experiments/astraea-sandbox/guest/Cargo.toml b/experiments/astraea-sandbox/guest/Cargo.toml new file mode 100644 index 000000000..7081d9a97 --- /dev/null +++ b/experiments/astraea-sandbox/guest/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "astraea-guest" +version = "0.1.0" +edition = "2021" + +[dependencies] # paths are relative to this manifest: guest -> astraea-sandbox -> experiments -> repo root +hyperlight-guest = { path = "../../../src/hyperlight_guest" } +hyperlight-guest-bin = { path = "../../../src/hyperlight_guest_bin" } +hyperlight-common = { path = "../../../src/hyperlight_common", default-features = false } diff --git a/experiments/astraea-sandbox/guest/src/main.rs b/experiments/astraea-sandbox/guest/src/main.rs new file mode 100644 index 000000000..9f1961c5a --- /dev/null +++ b/experiments/astraea-sandbox/guest/src/main.rs @@ -0,0 +1,39 @@ +#![no_std] +#![no_main] + +extern crate alloc; + +use alloc::format; +use alloc::string::String; +use hyperlight_guest_bin::guest_function; + +extern crate hyperlight_guest; + +/// Compare two values: numerically within a tolerance when both parse as floats, otherwise as exact strings. +/// Returns "match", "mismatch|left|right|diff" (numeric mismatch) or "mismatch|left|right" (string mismatch). 
+#[guest_function("CompareValues")] +fn compare_values(left: String, right: String, tolerance: String) -> String { + let tol: f64 = tolerance.parse().unwrap_or(1e-6); // an unparseable tolerance falls back to 1e-6 + + // Try numeric comparison first; anything that is not a float falls through to the string branch + match (left.parse::<f64>(), right.parse::<f64>()) { + (Ok(l), Ok(r)) => { + let diff = (l - r).abs(); + let max_abs = l.abs().max(r.abs()); + // Exact equality first (equal infinities give a NaN diff), then absolute OR relative tolerance + if l == r || diff <= tol || (max_abs > 0.0 && diff / max_abs <= tol) { + String::from("match") + } else { + format!("mismatch|{}|{}|{}", left, right, diff) + } + } + _ => { + // String comparison + if left == right { + String::from("match") + } else { + format!("mismatch|{}|{}", left, right) + } + } + } +} diff --git a/scripts/KVM_TESTING.md b/scripts/KVM_TESTING.md new file mode 100644 index 000000000..20be34f69 --- /dev/null +++ b/scripts/KVM_TESTING.md @@ -0,0 +1,118 @@ +# KVM Testing on AWS + +Hyperlight's CI runs on Azure with Hyper-V. These scripts let you test on **real KVM hardware** via AWS EC2 — useful for validating Linux/KVM-specific behavior that can't be caught in Hyper-V CI. + +## Quick Start + +```bash +# Online mode (instance has internet — simplest) +./scripts/kvm-test.sh + +# Offline mode (air-gapped instance, pre-vendored deps) +# Step 1: Prepare the bucket (once, or when deps change) +./scripts/prepare-offline.sh s3://my-bucket + +# Step 2: Run tests +VENDOR_BUCKET=s3://my-bucket ./scripts/kvm-test.sh --offline +``` + +One command → launches a KVM-capable instance → builds → tests → terminates. ~25 minutes, ~$0.15. 
+ +## Scripts + +| Script | Purpose | +|--------|---------| +| `kvm-test.sh` | End-to-end: launch instance, install, build, test, terminate | +| `prepare-offline.sh` | Populate S3 bucket with toolchain + vendor for offline mode | +| `vendor-all.sh` | Create a complete vendor directory (handles multi-lockfile problem) | + +### `kvm-test.sh` + +``` +Options: + --offline Use pre-vendored S3 bucket (no internet needed on instance) + --ami AMI_ID Skip install, use a pre-baked AMI + --bake Create an AMI after install for faster future runs + --keep Don't terminate instance (for debugging) + --filter PATTERN Run only matching tests (e.g. "map_region") + --timeout MIN Cost guard (default: 45 min) + --instance-type Override instance type (default: c8i.2xlarge) + --region Override region (default: us-east-1) +``` + +**Prerequisites:** +- AWS CLI v2 with valid credentials +- `session-manager-plugin` (`brew install --cask session-manager-plugin`) +- IAM permissions: EC2, SSM, IAM (and S3 if `--offline`) + +### `prepare-offline.sh` + +Populates an S3 bucket with everything `kvm-test.sh --offline` needs: + +```bash +./scripts/prepare-offline.sh s3://my-bucket +./scripts/prepare-offline.sh s3://my-bucket --rust-version 1.89.0 +``` + +Run once, then iterate with `kvm-test.sh --offline`. Re-run when: +- Rust version changes +- Dependencies change (new crates in Cargo.lock) +- You modify guest crate lockfiles + +### `vendor-all.sh` + +Creates a complete vendor directory for fully offline builds. Handles the tricky part: Hyperlight has **multiple independent lockfiles** (workspace root + 3 guest crates), and `cargo-hyperlight` builds the stdlib sysroot which needs crates pinned by the Rust toolchain's own lockfile. + +```bash +./scripts/vendor-all.sh # vendor to ./vendor-all/ +./scripts/vendor-all.sh /tmp/output # custom output path +``` + +## Why Vendoring is Hard + +`cargo-hyperlight` uses `-Zbuild-std` to compile guest binaries for `x86_64-hyperlight-none`. 
This triggers a sysroot build that: + +1. Uses the **guest crate's** `Cargo.lock` (not the workspace root's) +2. Needs stdlib crates at versions pinned by the **Rust toolchain's** lockfile +3. These versions often differ from what the workspace uses (e.g., `cfg-if 1.0.1` for stdlib vs `1.0.4` for the repo) + +A naive `cargo vendor` only covers the root workspace. `vendor-all.sh` handles all three cases by downloading the exact versions needed from crates.io and placing them in a single vendor directory with Cargo's multi-version naming convention (e.g., `cfg-if` for 1.0.4 and `cfg-if-0` for 1.0.1). + +## Instance Types + +KVM nested virtualization requires: +- **Intel**: `c8i`, `c7i`, `m7i`, `r7i` families +- **AMD**: `c7a`, `m7a` families +- Must explicitly enable via `CpuOptions.NestedVirtualization` + +## Cost + +| Instance | vCPUs | RAM | $/hr | Typical run (25 min) | +|----------|-------|-----|------|---------------------| +| c8i.2xlarge | 8 | 16 GB | $0.34 | ~$0.15 | +| c8i.4xlarge | 16 | 32 GB | $0.68 | ~$0.28 | + +The cost guard auto-terminates after `--timeout` minutes (default 45) to cap spend at ~$0.26 worst case. + +## Composability + +The scripts compose to replicate the full workflow: + +``` +prepare-offline.sh ─── populates S3 bucket (run once) + │ + ▼ +kvm-test.sh --offline ─── launches instance, installs from bucket, builds, tests + │ + ├── uses vendor-all.sh logic (embedded in all-vendor.tar.gz) + ├── builds guest binaries via just build-rust-guests + ├── runs cargo test --package hyperlight-host + └── terminates instance +``` + +For iterating on code changes, update just the repo tarball: +```bash +COPYFILE_DISABLE=1 tar czf /tmp/hyperlight-repo.tar.gz --exclude='.git' --exclude='target' -C /path/to/repo . 
+aws s3 cp /tmp/hyperlight-repo.tar.gz s3://my-bucket/hyperlight-repo.tar.gz +VENDOR_BUCKET=s3://my-bucket ./scripts/kvm-test.sh --offline +``` diff --git a/scripts/kvm-test-v3.sh b/scripts/kvm-test-v3.sh new file mode 100755 index 000000000..5a132ae55 --- /dev/null +++ b/scripts/kvm-test-v3.sh @@ -0,0 +1,465 @@ +#!/usr/bin/env bash +set -euo pipefail +export AWS_PAGER="" + +# kvm-test.sh — Test Hyperlight on real KVM hardware via AWS EC2 +# +# Modes: +# ./scripts/kvm-test.sh Online (instance has internet) +# ./scripts/kvm-test.sh --offline Offline (S3 vendor bucket, auto-preps if needed) +# ./scripts/kvm-test.sh --offline --prepare Force re-prep the S3 bucket +# ./scripts/kvm-test.sh --vendor-only Just vendor locally (no EC2, no S3) +# +# Options: +# --ami AMI_ID Reuse a baked AMI (skip toolchain install) +# --bake Bake AMI after successful run +# --keep Don't terminate instance (for debugging) +# --filter PATTERN Run only matching tests +# --timeout MIN Cost guard in minutes (default: 45) +# --instance-type T EC2 instance type (default: c8i.2xlarge) +# --region R AWS region (default: us-east-1) +# +# Environment: +# VENDOR_BUCKET S3 bucket for offline mode (default: s3://hyperlight-vendor-$ACCOUNT_ID) + +REGION="${AWS_REGION:-us-east-1}" +INSTANCE_TYPE="${INSTANCE_TYPE:-c8i.2xlarge}" +VENDOR_BUCKET="${VENDOR_BUCKET:-}" +MAX_RUNTIME_MIN="${MAX_RUNTIME_MIN:-45}" +MODE="online" +PREPARE=false +VENDOR_ONLY=false +BAKE_AMI=false +KEEP_INSTANCE=false +CUSTOM_AMI="" +TEST_FILTER="" +RUN_CMD="" + +while [[ $# -gt 0 ]]; do + case "$1" in + --offline) MODE="offline"; shift ;; + --prepare) PREPARE=true; shift ;; + --vendor-only) VENDOR_ONLY=true; shift ;; + --bake) BAKE_AMI=true; shift ;; + --keep) KEEP_INSTANCE=true; shift ;; + --ami) CUSTOM_AMI="$2"; shift 2 ;; + --region) REGION="$2"; shift 2 ;; + --instance-type) INSTANCE_TYPE="$2"; shift 2 ;; + --timeout) MAX_RUNTIME_MIN="$2"; shift 2 ;; + --filter) TEST_FILTER="$2"; shift 2 ;; + --run) RUN_CMD="$2"; shift 2 ;; + 
--help|-h) sed -n '5,/^$/p' "$0" | sed -E 's/^# ?//'; exit 0 ;; # usage header starts on line 5; -E keeps the strip portable to BSD sed + *) echo "Unknown: $1 (use --help)"; exit 1 ;; + esac +done + +export AWS_DEFAULT_REGION="$REGION" +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" + +# Helpers +START_TIME=$(date +%s) +elapsed() { printf "%dm%02ds" $(( ($(date +%s) - START_TIME) / 60 )) $(( ($(date +%s) - START_TIME) % 60 )); } +log() { printf "\033[0;36m[%s]\033[0m %s\n" "$(elapsed)" "$*"; } +err() { printf "\033[0;31m[%s] ERROR:\033[0m %s\n" "$(elapsed)" "$*" >&2; } +header() { printf "\n\033[1;37m━━━ %s ━━━\033[0m\n" "$*"; } + +# ══════════════════════════════════════════════════════════════════ +# VENDOR-ALL: Create complete vendor directory +# ══════════════════════════════════════════════════════════════════ +do_vendor() { + local output_dir="${1:-$REPO_ROOT/vendor-all}" + header "Vendor All" + log "Output: $output_dir" + + log "Vendoring workspace root..." + cd "$REPO_ROOT" + cargo vendor "$output_dir" 2>&1 | tail -3 + + log "Adding guest + stdlib crates..." 
+ python3 - "$REPO_ROOT" "$output_dir" << 'PYEOF' +import os, re, sys, json, urllib.request, tarfile, tempfile, shutil + +repo_root, vendor_dir = sys.argv[1], sys.argv[2] +rustc_sysroot = os.popen("rustc --print sysroot").read().strip() +lockfiles = [] +# Guest lockfiles +for guest in ["simpleguest", "dummyguest", "witguest"]: + lf = os.path.join(repo_root, "src/tests/rust_guests", guest, "Cargo.lock") + if os.path.exists(lf): lockfiles.append(lf) +# Stdlib lockfile +stdlib_lf = os.path.join(rustc_sysroot, "lib/rustlib/src/rust/library/Cargo.lock") +if os.path.exists(stdlib_lf): lockfiles.append(stdlib_lf) + +def version_exists(name, version): + for entry in os.listdir(vendor_dir): + toml = os.path.join(vendor_dir, entry, "Cargo.toml") + if not os.path.exists(toml): continue + n = v = None + with open(toml) as f: + for line in f: + if line.strip().startswith("name = ") and not n: n = line.strip().split('"')[1] + elif line.strip().startswith("version = ") and not v: v = line.strip().split('"')[1] + if n and v: break + if n == name and v == version: return True + return False + +added = 0 +for lockfile in lockfiles: + with open(lockfile) as f: content = f.read() + blocks = content.split("[[package]]") + for block in blocks[1:]: + nm = re.search(r'name = "([^"]+)"', block) + vm = re.search(r'version = "([^"]+)"', block) + cm = re.search(r'checksum = "([^"]+)"', block) + if not (nm and vm) or vm.group(1) == '0.0.0': continue + name, ver = nm.group(1), vm.group(1) + cksum = cm.group(1) if cm else None + if name.startswith('rustc-std-workspace'): continue + if version_exists(name, ver): continue + target_name = name + if os.path.exists(os.path.join(vendor_dir, name)): + for i in range(10): + c = f"{name}-{i}" + if not os.path.exists(os.path.join(vendor_dir, c)): + target_name = c; break + try: + url = f"https://crates.io/api/v1/crates/{name}/{ver}/download" + req = urllib.request.Request(url, headers={'User-Agent': 'hyperlight-vendor/1.0'}) + with 
urllib.request.urlopen(req) as resp: data = resp.read() + tmp = tempfile.NamedTemporaryFile(suffix='.tar.gz', delete=False) + tmp.write(data); tmp.close() + extract_dir = tempfile.mkdtemp() + with tarfile.open(tmp.name, 'r:gz') as tf: tf.extractall(extract_dir) + src = os.path.join(extract_dir, f"{name}-{ver}") + dst = os.path.join(vendor_dir, target_name) + if os.path.exists(src): + shutil.copytree(src, dst) + with open(os.path.join(dst, ".cargo-checksum.json"), 'w') as f: + json.dump({"files": {}, "package": cksum}, f) + added += 1 + os.unlink(tmp.name); shutil.rmtree(extract_dir) + except Exception as e: + print(f" WARN: {name} {ver}: {e}", file=sys.stderr) +print(f" Added {added} crates from {len(lockfiles)} lockfiles") +PYEOF + log "Done: $(ls "$output_dir" | wc -l | tr -d ' ') total crates" +} + +# Handle --vendor-only +if [[ "$VENDOR_ONLY" == "true" ]]; then + do_vendor "${1:-$REPO_ROOT/vendor-all}" + exit 0 +fi + +# ══════════════════════════════════════════════════════════════════ +# PREPARE: Fill S3 bucket with toolchain + vendor + repo +# ══════════════════════════════════════════════════════════════════ +do_prepare() { + local bucket="$1" + local rust_version="${2:-1.89.0}" + local work="/tmp/hyperlight-prep-$$" + mkdir -p "$work" + header "Prepare Offline Bucket" + log "Bucket: $bucket Rust: $rust_version" + + local base="https://static.rust-lang.org/dist" + local triple="x86_64-unknown-linux-gnu" + log "Downloading Rust toolchain..." + for c in "rust-$rust_version-$triple" "rust-std-$rust_version-x86_64-unknown-none" "rust-src-$rust_version"; do + [[ -f "$work/$c.tar.xz" ]] || curl -sSL "$base/$c.tar.xz" -o "$work/$c.tar.xz" + done + mv "$work/rust-$rust_version-$triple.tar.xz" "$work/rust-stable.tar.xz" + mv "$work/rust-std-$rust_version-x86_64-unknown-none.tar.xz" "$work/rust-std-none.tar.xz" + mv "$work/rust-src-$rust_version.tar.xz" "$work/rust-src.tar.xz" + + log "Downloading just + wasm-tools..." 
+ local jv=$(curl -fsSL https://api.github.com/repos/casey/just/releases/latest | grep -o '"tag_name": *"[^"]*"' | cut -d'"' -f4); [[ -n "$jv" ]] || { err "Could not determine latest just release tag"; return 1; } # API JSON is pretty-printed, hence the optional space; 'local' masks pipeline failures, so check explicitly + curl -fsSL "https://github.com/casey/just/releases/download/$jv/just-$jv-x86_64-unknown-linux-musl.tar.gz" -o "$work/just.tar.gz" + local wv=$(curl -fsSL https://api.github.com/repos/bytecodealliance/wasm-tools/releases/latest | grep -o '"tag_name": *"[^"]*"' | cut -d'"' -f4); [[ -n "$wv" ]] || { err "Could not determine latest wasm-tools release tag"; return 1; } + curl -fsSL "https://github.com/bytecodealliance/wasm-tools/releases/download/$wv/wasm-tools-${wv#v}-x86_64-linux.tar.gz" -o "$work/wasm-tools.tar.gz" + + log "Vendoring cargo-hyperlight..." + local ch_dir="$work/ch-pack" + cp -r "$REPO_ROOT/src/cargo-hyperlight" "$ch_dir" 2>/dev/null || cp -r "$REPO_ROOT" "$ch_dir" + cd "$ch_dir" && cargo vendor vendor > /dev/null 2>&1 + mkdir -p .cargo && printf '[source.crates-io]\nreplace-with = "vendored-sources"\n\n[source.vendored-sources]\ndirectory = "vendor"' > .cargo/config.toml + COPYFILE_DISABLE=1 tar czf "$work/cargo-hyperlight-vendored.tar.gz" -C "$ch_dir" . + cd "$REPO_ROOT" + + log "Running vendor-all..." + do_vendor "$work/vendor-out" + COPYFILE_DISABLE=1 tar czf "$work/all-vendor.tar.gz" -C "$work/vendor-out" . + + log "Packing repo..." + COPYFILE_DISABLE=1 tar czf "$work/hyperlight-repo.tar.gz" -C "$REPO_ROOT" --exclude='.git' --exclude='target' --exclude='vendor-all' . + + log "Uploading to $bucket..." 
+ for f in rust-stable.tar.xz rust-std-none.tar.xz rust-src.tar.xz all-vendor.tar.gz just.tar.gz wasm-tools.tar.gz cargo-hyperlight-vendored.tar.gz hyperlight-repo.tar.gz; do + aws s3 cp "$work/$f" "$bucket/$f" --quiet || { err "Upload failed: $f"; rm -rf "$work"; return 1; }; log " ✓ $f" # 'cp && log' would hide a failed upload from set -e + done + rm -rf "$work" + log "Bucket ready" +} + +# Check if bucket needs prep +bucket_ready() { + local bucket="$1" + for f in rust-stable.tar.xz all-vendor.tar.gz hyperlight-repo.tar.gz; do + aws s3api head-object --bucket "${bucket#s3://}" --key "$f" --region "$REGION" >/dev/null 2>&1 || return 1 + done + return 0 +} + +# ══════════════════════════════════════════════════════════════════ +# EC2 ORCHESTRATION +# ══════════════════════════════════════════════════════════════════ +INSTANCE_ID="" +KEY_NAME="hyperlight-kvm-test-$$" +KEY_FILE="/tmp/${KEY_NAME}.pem" +WATCHDOG_PID="" +IAM_ROLE="hyperlight-kvm-test-role" +IAM_PROFILE="hyperlight-kvm-test-profile" + +cleanup() { + echo "" + [[ -n "$WATCHDOG_PID" ]] && kill "$WATCHDOG_PID" 2>/dev/null || true + if [[ -n "$INSTANCE_ID" ]]; then + if [[ "$KEEP_INSTANCE" == "true" ]]; then + log "Instance kept: aws ssm start-session --target $INSTANCE_ID" + return + fi + log "Terminating $INSTANCE_ID..." 
+ aws ec2 terminate-instances --instance-ids "$INSTANCE_ID" >/dev/null 2>&1 || true + fi + aws ec2 delete-key-pair --key-name "$KEY_NAME" >/dev/null 2>&1 || true + rm -f "$KEY_FILE" + log "Done ($(elapsed))" +} +trap cleanup EXIT + +# SSM remote execution +remote_run() { + local cmd="$1" timeout="${2:-600}" + local encoded=$(printf '%s' "$cmd" | base64) + local cmd_id=$(aws ssm send-command --instance-ids "$INSTANCE_ID" \ + --document-name "AWS-RunShellScript" \ + --parameters "{\"commands\":[\"echo $encoded | base64 -d | bash\"]}" \ + --timeout-seconds "$timeout" --query Command.CommandId --output text 2>&1) + [[ "$cmd_id" == *"InvalidInstanceId"* ]] && sleep 30 && \ + cmd_id=$(aws ssm send-command --instance-ids "$INSTANCE_ID" \ + --document-name "AWS-RunShellScript" \ + --parameters "{\"commands\":[\"echo $encoded | base64 -d | bash\"]}" \ + --timeout-seconds "$timeout" --query Command.CommandId --output text) + local secs=0 + for _ in $(seq 1 $(( timeout / 5 + 12 ))); do + local status=$(aws ssm get-command-invocation --command-id "$cmd_id" \ + --instance-id "$INSTANCE_ID" --query Status --output text 2>/dev/null || echo "Pending") + case "$status" in + Success|Failed|TimedOut|Cancelled) + [[ $secs -gt 0 ]] && echo "" >&2 + aws ssm get-command-invocation --command-id "$cmd_id" --instance-id "$INSTANCE_ID" \ + --query 'StandardOutputContent' --output text 2>/dev/null + local se=$(aws ssm get-command-invocation --command-id "$cmd_id" --instance-id "$INSTANCE_ID" \ + --query 'StandardErrorContent' --output text 2>/dev/null) + [[ -n "$se" && "$se" != "None" ]] && echo "$se" | grep -v 'LIBARCHIVE.xattr\|Ignoring unknown extended header' >&2 || true + [[ "$status" == "Success" ]] && return 0 || return 1 ;; + esac + secs=$(( secs + 5 )) + (( secs % 30 == 0 )) && printf " %ds" "$secs" >&2 || printf "." 
>&2 + sleep 5 + done + err "Timed out after ${timeout}s"; return 1 +} + +# ── Credentials ─────────────────────────────────────────────────── +header "Credentials" +ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text 2>/dev/null) || { + err "No AWS credentials. Configure via: aws configure, AWS_PROFILE, or SSO."; exit 1 +} +log "Account: $ACCOUNT_ID Region: $REGION" + +# Default bucket if not set +[[ -z "$VENDOR_BUCKET" ]] && VENDOR_BUCKET="s3://hyperlight-vendor-$ACCOUNT_ID" + +# ── Offline: prep if needed ─────────────────────────────────────── +if [[ "$MODE" == "offline" ]]; then + if [[ "$PREPARE" == "true" ]] || ! bucket_ready "$VENDOR_BUCKET"; then + [[ "$PREPARE" != "true" ]] && log "Bucket not ready — auto-preparing..." + do_prepare "$VENDOR_BUCKET" + else + log "Bucket ready: $VENDOR_BUCKET" + fi +fi + +# ── IAM ─────────────────────────────────────────────────────────── +header "Infrastructure" +if ! aws iam get-role --role-name "$IAM_ROLE" &>/dev/null; then + log "Creating IAM role..." 
+ aws iam create-role --role-name "$IAM_ROLE" \ + --assume-role-policy-document '{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Principal":{"Service":"ec2.amazonaws.com"},"Action":"sts:AssumeRole"}]}' >/dev/null + aws iam attach-role-policy --role-name "$IAM_ROLE" --policy-arn arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore + sleep 8 +fi +aws iam create-instance-profile --instance-profile-name "$IAM_PROFILE" 2>/dev/null || true +aws iam add-role-to-instance-profile --instance-profile-name "$IAM_PROFILE" --role-name "$IAM_ROLE" 2>/dev/null || true +if [[ "$MODE" == "offline" ]]; then + local_bucket="${VENDOR_BUCKET#s3://}" + aws iam put-role-policy --role-name "$IAM_ROLE" --policy-name vendor-read \ + --policy-document "{\"Version\":\"2012-10-17\",\"Statement\":[{\"Effect\":\"Allow\",\"Action\":[\"s3:GetObject\",\"s3:ListBucket\"],\"Resource\":[\"arn:aws:s3:::$local_bucket\",\"arn:aws:s3:::$local_bucket/*\"]}]}" +fi + +# ── Launch ──────────────────────────────────────────────────────── +header "Launch" +aws ec2 create-key-pair --key-name "$KEY_NAME" --query KeyMaterial --output text > "$KEY_FILE" 2>/dev/null +chmod 600 "$KEY_FILE" +SG_NAME="hyperlight-kvm-test-sg" +SG_ID=$(aws ec2 describe-security-groups --filters "Name=group-name,Values=$SG_NAME" \ + --query 'SecurityGroups[0].GroupId' --output text 2>/dev/null || echo "None") +[[ "$SG_ID" == "None" || -z "$SG_ID" ]] && \ + SG_ID=$(aws ec2 create-security-group --group-name "$SG_NAME" --description "Hyperlight KVM test" --query GroupId --output text) + +AMI_ID="${CUSTOM_AMI:-$(aws ssm get-parameter --name /aws/service/ami-amazon-linux-latest/al2023-ami-kernel-default-x86_64 --query Parameter.Value --output text)}" +INSTANCE_ID=$(python3 -c " +import boto3 +ec2 = boto3.client('ec2', region_name='$REGION') +r = ec2.run_instances( + ImageId='$AMI_ID', InstanceType='$INSTANCE_TYPE', + KeyName='$KEY_NAME', + IamInstanceProfile={'Name': '$IAM_PROFILE'}, + SecurityGroupIds=['$SG_ID'], + 
CpuOptions={'NestedVirtualization': 'enabled'}, + BlockDeviceMappings=[{'DeviceName': '/dev/xvda', 'Ebs': {'VolumeSize': 30, 'VolumeType': 'gp3'}}], + TagSpecifications=[{'ResourceType': 'instance', 'Tags': [{'Key': 'Name', 'Value': 'hyperlight-kvm-test'}]}], + MinCount=1, MaxCount=1) +print(r['Instances'][0]['InstanceId']) +") +log "Instance: $INSTANCE_ID" +aws ec2 wait instance-running --instance-ids "$INSTANCE_ID" +log "Running" + +(sleep $(( MAX_RUNTIME_MIN * 60 )); err "TIMEOUT — terminating"; aws ec2 terminate-instances --instance-ids "$INSTANCE_ID" >/dev/null 2>&1; kill $$ 2>/dev/null) & +WATCHDOG_PID=$! + +# ── Connect ─────────────────────────────────────────────────────── +header "Connect" +for i in $(seq 1 24); do + [[ "$(aws ssm describe-instance-information --filters "Key=InstanceIds,Values=$INSTANCE_ID" \ + --query 'InstanceInformationList[0].PingStatus' --output text 2>/dev/null)" == "Online" ]] && break + printf "." >&2; sleep 10 +done +echo "" >&2 +[[ "$(aws ssm describe-instance-information --filters "Key=InstanceIds,Values=$INSTANCE_ID" \ + --query 'InstanceInformationList[0].PingStatus' --output text 2>/dev/null)" != "Online" ]] && \ + { err "SSM not available. Check VPC internet/endpoints."; exit 1; } +log "Connected via SSM" +remote_run "ls /dev/kvm >/dev/null 2>&1 && echo 'KVM: ok' || (echo 'KVM: MISSING'; exit 1)" 30 + +# ── Install ─────────────────────────────────────────────────────── +if [[ -n "$CUSTOM_AMI" ]]; then + header "Toolchain (baked AMI)" +else + header "Install" + remote_run "sudo dnf install -q -y clang gcc make git openssl-devel lld 2>&1 | tail -3" 300 + + if [[ "$MODE" == "offline" ]]; then + log "Installing from S3 bucket..." 
+ remote_run "set -e +aws s3 cp $VENDOR_BUCKET/rust-stable.tar.xz /tmp/rust-stable.tar.xz --region $REGION --no-progress +aws s3 cp $VENDOR_BUCKET/rust-std-none.tar.xz /tmp/rust-std-none.tar.xz --region $REGION --no-progress +aws s3 cp $VENDOR_BUCKET/rust-src.tar.xz /tmp/rust-src.tar.xz --region $REGION --no-progress +tar xJf /tmp/rust-stable.tar.xz -C /tmp +tar xJf /tmp/rust-std-none.tar.xz -C /tmp +tar xJf /tmp/rust-src.tar.xz -C /tmp +cd /tmp/rust-*-x86_64-unknown-linux-gnu && echo y | sudo ./install.sh --prefix=/usr/local > /dev/null 2>&1 +cd /tmp/rust-std-*-x86_64-unknown-none && echo y | sudo ./install.sh --prefix=/usr/local > /dev/null 2>&1 +cd /tmp/rust-src-* && echo y | sudo ./install.sh --prefix=/usr/local > /dev/null 2>&1 +aws s3 cp $VENDOR_BUCKET/all-vendor.tar.gz /tmp/all-vendor.tar.gz --region $REGION --no-progress +sudo mkdir -p /opt/vendor && sudo tar xzf /tmp/all-vendor.tar.gz -C /opt/vendor +for d in /root/.cargo ~/.cargo; do + sudo mkdir -p \$d + printf '[source.crates-io]\nreplace-with = \"vendored-sources\"\n\n[source.vendored-sources]\ndirectory = \"/opt/vendor\"' | sudo tee \$d/config.toml > /dev/null +done +aws s3 cp $VENDOR_BUCKET/just.tar.gz /tmp/just.tar.gz --region $REGION --no-progress +cd /tmp && tar xzf just.tar.gz just && sudo mv just /usr/local/bin/ +aws s3 cp $VENDOR_BUCKET/wasm-tools.tar.gz /tmp/wasm-tools.tar.gz --region $REGION --no-progress +cd /tmp && tar xzf wasm-tools.tar.gz --strip-components=1 '*/wasm-tools' && sudo mv wasm-tools /usr/local/bin/ +aws s3 cp $VENDOR_BUCKET/cargo-hyperlight-vendored.tar.gz /tmp/cargo-hyperlight-vendored.tar.gz --region $REGION --no-progress +mkdir -p /tmp/cargo-hyperlight && cd /tmp/cargo-hyperlight && tar xzf /tmp/cargo-hyperlight-vendored.tar.gz +cargo install --path . --root /usr/local --locked 2>&1 | tail -3 +echo install-done" 600 + else + log "Installing from internet..." 
+ RUST_VERSION="stable" + [[ -f "$REPO_ROOT/rust-toolchain.toml" ]] && \ + RUST_VERSION=$(grep -o 'channel *= *"[^"]*"' "$REPO_ROOT/rust-toolchain.toml" | cut -d'"' -f2 || echo "stable") + remote_run "set -e +curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain '$RUST_VERSION' 2>&1 | tail -3 +source ~/.cargo/env +rustup target add x86_64-unknown-none +rustup component add rust-src +cargo install just wasm-tools cargo-hyperlight 2>&1 | tail -5 +echo install-done" 600 + fi + log "Verify:" + remote_run "export PATH=\$HOME/.cargo/bin:/usr/local/bin:\$PATH && rustc --version && cargo --version && just --version" +fi + +# ── Build ───────────────────────────────────────────────────────── +header "Build" +if [[ "$MODE" == "offline" ]]; then + remote_run "set -e +aws s3 cp $VENDOR_BUCKET/hyperlight-repo.tar.gz /tmp/hyperlight-repo.tar.gz --region $REGION --no-progress +mkdir -p ~/hyperlight && cd ~/hyperlight && tar xzf /tmp/hyperlight-repo.tar.gz 2>/dev/null +echo extracted" 300 +else + REPO_URL=$(git -C "$REPO_ROOT" remote get-url origin 2>/dev/null || echo "https://github.com/hyperlight-dev/hyperlight.git") + BRANCH=$(git -C "$REPO_ROOT" rev-parse --abbrev-ref HEAD 2>/dev/null || echo "main") + log "Cloning $BRANCH..." + remote_run "GIT_TERMINAL_PROMPT=0 git clone --depth 1 --branch '$BRANCH' '$REPO_URL' ~/hyperlight 2>&1 | tail -3" 300 +fi + +log "Building guests..." 
+if [[ -n "$RUN_CMD" ]]; then + log "(skipped -- custom --run command will handle build)" +else + remote_run "export PATH=\$HOME/.cargo/bin:/usr/local/bin:\$PATH && cd ~/hyperlight && \ +just build-rust-guests debug > /tmp/build.log 2>&1 && \ +mkdir -p src/tests/rust_guests/bin/debug && just move-rust-guests debug >> /tmp/build.log 2>&1 && \ +mkdir -p src/tests/c_guests/bin/debug && just build-c-guests debug >> /tmp/build.log 2>&1 && \ +just move-c-guests debug >> /tmp/build.log 2>&1; \ +RC=\$?; tail -15 /tmp/build.log; echo EXIT=\$RC; exit \$RC" 900 +fi + +# ── Test / Run ───────────────────────────────────────────────────── +header "Run" +TEST_EXIT=0 +if [[ -n "$RUN_CMD" ]]; then + log "Custom: $RUN_CMD" + remote_run "export PATH=\$HOME/.cargo/bin:/usr/local/bin:\$PATH && cd ~/hyperlight && $RUN_CMD" 1800 || TEST_EXIT=$? +elif [[ -n "$TEST_FILTER" ]]; then + log "Filtered tests: $TEST_FILTER" + remote_run "set -o pipefail; export PATH=\$HOME/.cargo/bin:/usr/local/bin:\$PATH && cd ~/hyperlight && \ +cargo test --package hyperlight-host --lib '$TEST_FILTER' 2>&1 | tail -30" 600 || TEST_EXIT=$? # pipefail: report cargo's exit status, not tail's +else + log "Full test suite..." + remote_run "export PATH=\$HOME/.cargo/bin:/usr/local/bin:\$PATH && cd ~/hyperlight && \ +cargo test --package hyperlight-host > /tmp/test.log 2>&1; \ +RC=\$?; tail -40 /tmp/test.log; echo EXIT=\$RC; exit \$RC" 1800 || TEST_EXIT=$? 
+fi + +# ── AMI bake ────────────────────────────────────────────────────── +if [[ "$BAKE_AMI" == "true" && -z "$CUSTOM_AMI" ]]; then + header "Bake AMI" + NEW_AMI=$(aws ec2 create-image --instance-id "$INSTANCE_ID" \ + --name "hyperlight-kvm-test-$(date +%Y%m%d)" \ + --description "Hyperlight KVM test - toolchain baked" \ + --no-reboot --query ImageId --output text) + log "AMI: $NEW_AMI (use --ami $NEW_AMI next time)" +fi + +# ── Done ────────────────────────────────────────────────────────── +header "Complete" +COST=$(echo "scale=3; $(( $(date +%s) - START_TIME )) / 3600 * 0.34" | bc 2>/dev/null || echo "?") +log "Finished in $(elapsed) (~\$$COST)" +exit "$TEST_EXIT" diff --git a/scripts/kvm-test.sh b/scripts/kvm-test.sh new file mode 100755 index 000000000..8d4d4d96f --- /dev/null +++ b/scripts/kvm-test.sh @@ -0,0 +1,432 @@ +#!/usr/bin/env bash +set -euo pipefail +export AWS_PAGER="" + +# kvm-test.sh — Run Hyperlight tests on real KVM hardware via AWS EC2 +# +# Hyperlight's CI uses Azure with Hyper-V, but production Linux deployments +# use KVM. This script launches a KVM-capable EC2 instance (c8i/c7i with +# nested virtualization), builds the project, runs the test suite, and +# tears everything down. One command, ~25 minutes, ~$0.15 on the default c8i.2xlarge. 
+# +# Usage: +# ./scripts/kvm-test.sh # default: online build +# ./scripts/kvm-test.sh --offline # air-gapped build via S3 vendor bucket +# ./scripts/kvm-test.sh --ami ami-0abc123 # reuse a baked AMI (skip install) +# ./scripts/kvm-test.sh --bake # bake AMI after successful install +# ./scripts/kvm-test.sh --keep # don't terminate (for debugging) +# +# Prerequisites: +# - AWS CLI v2 configured with credentials (any method: SSO, env vars, profiles) +# - session-manager-plugin (brew install --cask session-manager-plugin) +# - Sufficient IAM permissions: EC2, SSM, IAM, S3 (if --offline) +# +# Environment variables: +# AWS_REGION Region to launch in (default: us-east-1) +# INSTANCE_TYPE EC2 instance type (default: c8i.2xlarge) +# VENDOR_BUCKET S3 bucket for offline vendor (required with --offline) +# MAX_RUNTIME_MIN Cost guard timeout in minutes (default: 45) + +# ── Configuration ───────────────────────────────────────────────── +REGION="${AWS_REGION:-us-east-1}" +INSTANCE_TYPE="${INSTANCE_TYPE:-c8i.2xlarge}" +VENDOR_BUCKET="${VENDOR_BUCKET:-}" +MAX_RUNTIME_MIN="${MAX_RUNTIME_MIN:-45}" +OFFLINE=false +BAKE_AMI=false +KEEP_INSTANCE=false +CUSTOM_AMI="" +TEST_FILTER="${TEST_FILTER:-}" # e.g. "map_region" to run subset + +while [[ $# -gt 0 ]]; do + case "$1" in + --offline) OFFLINE=true; shift ;; + --bake) BAKE_AMI=true; shift ;; + --keep) KEEP_INSTANCE=true; shift ;; + --ami) CUSTOM_AMI="$2"; shift 2 ;; + --region) REGION="$2"; shift 2 ;; + --instance-type) INSTANCE_TYPE="$2"; shift 2 ;; + --timeout) MAX_RUNTIME_MIN="$2"; shift 2 ;; + --filter) TEST_FILTER="$2"; shift 2 ;; + --help|-h) + sed -n '5,/^$/p' "$0" | sed -E 's/^# ?//' # usage header starts on line 5 (line 4 is blank); -E keeps the strip portable to BSD sed + exit 0 ;; + *) echo "Unknown option: $1 (use --help)"; exit 1 ;; + esac +done + +export AWS_DEFAULT_REGION="$REGION" + +# Validate +if [[ "$OFFLINE" == "true" && -z "$VENDOR_BUCKET" ]]; then + echo "ERROR: --offline requires VENDOR_BUCKET env var (e.g. 
s3://my-hyperlight-vendor)" + echo " See scripts/vendor-all.sh to create the vendor bucket." 
+ exit 1 +fi + +# ── Helpers ─────────────────────────────────────────────────────── +START_TIME=$(date +%s) +elapsed() { printf "%dm%02ds" $(( ($(date +%s) - START_TIME) / 60 )) $(( ($(date +%s) - START_TIME) % 60 )); } +log() { printf "\033[0;36m[%s]\033[0m %s\n" "$(elapsed)" "$*"; } +err() { printf "\033[0;31m[%s] ERROR:\033[0m %s\n" "$(elapsed)" "$*" >&2; } +header() { printf "\n\033[1;37m━━━ %s ━━━\033[0m\n" "$*"; } + +# State +INSTANCE_ID="" +KEY_NAME="hyperlight-kvm-test-$$" +KEY_FILE="/tmp/${KEY_NAME}.pem" +SG_ID="" +WATCHDOG_PID="" +CONN_MODE="" +PUBLIC_IP="" +IAM_ROLE_NAME="hyperlight-kvm-test-role" +IAM_PROFILE_NAME="hyperlight-kvm-test-profile" + +cleanup() { + echo "" + header "Cleanup" + [[ -n "$WATCHDOG_PID" ]] && kill "$WATCHDOG_PID" 2>/dev/null || true + if [[ -n "$INSTANCE_ID" ]]; then + if [[ "$KEEP_INSTANCE" == "true" ]]; then + log "Instance $INSTANCE_ID kept alive (--keep)." + log " Connect: aws ssm start-session --target $INSTANCE_ID" + return + fi + log "Terminating $INSTANCE_ID..." 
+        aws ec2 terminate-instances --instance-ids "$INSTANCE_ID" >/dev/null 2>&1 || true
+    fi
+    aws ec2 delete-key-pair --key-name "$KEY_NAME" >/dev/null 2>&1 || true
+    rm -f "$KEY_FILE"
+    log "Done ($(elapsed) total)"
+}
+trap cleanup EXIT
+
+# ── Remote execution ──────────────────────────────────────────────
+# remote_run CMD [TIMEOUT_SECS]
+#
+# Runs CMD on $INSTANCE_ID through SSM Run Command (AWS-RunShellScript) and
+# polls every 5 seconds until the invocation reaches a terminal status.
+# The remote stdout is printed on stdout; the remote stderr is printed on
+# stderr with tar xattr warnings filtered out; progress dots go to stderr.
+#
+#   CMD           shell text executed by bash on the instance
+#   TIMEOUT_SECS  SSM execution timeout (default: 600)
+#
+# Returns 0 only when the invocation status is "Success"; returns 1 when it is
+# Failed/TimedOut/Cancelled, when the command could not be submitted, or when
+# polling gives up.
+remote_run() {
+    local cmd="$1"
+    local timeout="${2:-600}"
+
+    # Base64-encode the command to avoid shell quoting issues in SSM.
+    # Newlines are stripped because some base64 implementations wrap their
+    # output, and a raw newline inside the JSON string below is invalid.
+    local encoded
+    encoded=$(printf '%s' "$cmd" | base64 | tr -d '\n')
+    local params="{\"commands\":[\"echo $encoded | base64 -d | bash\"]}"
+
+    # `|| true` keeps `set -e` from aborting the script on a failed submit, so
+    # the InvalidInstanceId retry below can actually run.
+    local cmd_id
+    cmd_id=$(aws ssm send-command \
+        --instance-ids "$INSTANCE_ID" \
+        --document-name "AWS-RunShellScript" \
+        --parameters "$params" \
+        --timeout-seconds "$timeout" \
+        --query Command.CommandId --output text 2>&1) || true
+
+    if [[ "$cmd_id" == *"InvalidInstanceId"* ]]; then
+        # Wait once and retry the submission.
+        sleep 30
+        cmd_id=$(aws ssm send-command \
+            --instance-ids "$INSTANCE_ID" \
+            --document-name "AWS-RunShellScript" \
+            --parameters "$params" \
+            --timeout-seconds "$timeout" \
+            --query Command.CommandId --output text) || cmd_id=""
+    fi
+
+    # A command id is a 36-character UUID; anything else is CLI error text and
+    # must not be polled as if it were an id.
+    if [[ ! "$cmd_id" =~ ^[0-9a-fA-F-]{36}$ ]]; then
+        err "ssm send-command failed: ${cmd_id:-no command id returned}"
+        return 1
+    fi
+
+    # Poll for the SSM timeout plus one minute of slack (12 polls x 5s).
+    local max_polls=$(( timeout / 5 + 12 ))
+    local secs=0
+    for _ in $(seq 1 "$max_polls"); do
+        local status
+        status=$(aws ssm get-command-invocation \
+            --command-id "$cmd_id" --instance-id "$INSTANCE_ID" \
+            --query Status --output text 2>/dev/null || echo "Pending")
+        case "$status" in
+            Success|Failed|TimedOut|Cancelled)
+                [[ $secs -gt 0 ]] && echo "" >&2
+                # Output stdout
+                aws ssm get-command-invocation \
+                    --command-id "$cmd_id" --instance-id "$INSTANCE_ID" \
+                    --query 'StandardOutputContent' --output text 2>/dev/null
+                # Filter stderr (suppress tar xattr warnings)
+                local stderr
+                stderr=$(aws ssm get-command-invocation \
+                    --command-id "$cmd_id" --instance-id "$INSTANCE_ID" \
+                    --query 'StandardErrorContent' --output text 2>/dev/null)
+                if [[ -n "$stderr" && "$stderr" != "None" ]]; then
+                    echo "$stderr" | grep -v 'LIBARCHIVE.xattr\|Ignoring unknown extended header' >&2 || true
+                fi
+                [[ "$status" == "Success" ]] && return 0 || return 1
+                ;;
+        esac
+        secs=$(( secs + 5 ))
+        # Print the elapsed seconds every 30s, a dot otherwise.
+        if (( secs % 30 == 0 )); then
+            printf " %ds" "$secs" >&2
+        else
+            printf "." >&2
+        fi
+        sleep 5
+    done
+    echo "" >&2
+    err "Command timed out after ${timeout}s"
+    return 1
+}
+
+# ── Step 1: Verify credentials ────────────────────────────────────
+header "Credentials"
+ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text 2>/dev/null) || {
+    err "No valid AWS credentials. Configure via: aws configure, AWS_PROFILE, or SSO."
+    exit 1
+}
+log "Account: $ACCOUNT_ID Region: $REGION"
+
+# ── Step 2: IAM role for SSM ─────────────────────────────────────
+header "Infrastructure"
+if ! aws iam get-role --role-name "$IAM_ROLE_NAME" &>/dev/null; then
+    log "Creating IAM role for SSM..."
+    aws iam create-role --role-name "$IAM_ROLE_NAME" \
+        --assume-role-policy-document '{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Principal":{"Service":"ec2.amazonaws.com"},"Action":"sts:AssumeRole"}]}' >/dev/null
+    aws iam attach-role-policy --role-name "$IAM_ROLE_NAME" \
+        --policy-arn arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore
+    sleep 8 # IAM propagation
+fi
+aws iam create-instance-profile --instance-profile-name "$IAM_PROFILE_NAME" 2>/dev/null || true
+aws iam add-role-to-instance-profile --instance-profile-name "$IAM_PROFILE_NAME" \
+    --role-name "$IAM_ROLE_NAME" 2>/dev/null || true
+
+# If offline mode, grant S3 read to the vendor bucket
+if [[ "$OFFLINE" == "true" ]]; then
+    BUCKET_NAME="${VENDOR_BUCKET#s3://}"
+    aws iam put-role-policy --role-name "$IAM_ROLE_NAME" --policy-name vendor-bucket-read \
+        --policy-document "{\"Version\":\"2012-10-17\",\"Statement\":[{\"Effect\":\"Allow\",\"Action\":[\"s3:GetObject\",\"s3:ListBucket\"],\"Resource\":[\"arn:aws:s3:::$BUCKET_NAME\",\"arn:aws:s3:::$BUCKET_NAME/*\"]}]}"
+fi
+
+# Key pair (unique per run to avoid conflicts)
+aws ec2 create-key-pair --key-name "$KEY_NAME" --query KeyMaterial --output text > 
"$KEY_FILE" 2>/dev/null +chmod 600 "$KEY_FILE" + +# Security group (reuse if exists) +SG_NAME_FIXED="hyperlight-kvm-test-sg" +SG_ID=$(aws ec2 describe-security-groups --filters "Name=group-name,Values=$SG_NAME_FIXED" \ + --query 'SecurityGroups[0].GroupId' --output text 2>/dev/null || echo "None") +if [[ "$SG_ID" == "None" || -z "$SG_ID" ]]; then + SG_ID=$(aws ec2 create-security-group --group-name "$SG_NAME_FIXED" \ + --description "Hyperlight KVM test runner" --query GroupId --output text) +fi + +# ── Step 3: Resolve AMI ─────────────────────────────────────────── +if [[ -n "$CUSTOM_AMI" ]]; then + AMI_ID="$CUSTOM_AMI" + log "Using provided AMI: $AMI_ID" +else + AMI_ID=$(aws ssm get-parameter \ + --name /aws/service/ami-amazon-linux-latest/al2023-ami-kernel-default-x86_64 \ + --query Parameter.Value --output text) + log "AL2023 AMI: $AMI_ID" +fi + +# ── Step 4: Launch ──────────────────────────────────────────────── +header "Launch" + +INSTANCE_ID=$(aws ec2 run-instances \ + --image-id "$AMI_ID" \ + --instance-type "$INSTANCE_TYPE" \ + --key-name "$KEY_NAME" \ + --iam-instance-profile "Name=$IAM_PROFILE_NAME" \ + --security-group-ids "$SG_ID" \ + --cpu-options "NestedVirtualization=enabled" \ + --block-device-mappings '[{"DeviceName":"/dev/xvda","Ebs":{"VolumeSize":30,"VolumeType":"gp3"}}]' \ + --tag-specifications '[{"ResourceType":"instance","Tags":[{"Key":"Name","Value":"hyperlight-kvm-test"}]}]' \ + --query 'Instances[0].InstanceId' --output text) +log "Instance: $INSTANCE_ID ($INSTANCE_TYPE, nested virt enabled)" + +aws ec2 wait instance-running --instance-ids "$INSTANCE_ID" +PUBLIC_IP=$(aws ec2 describe-instances --instance-ids "$INSTANCE_ID" \ + --query 'Reservations[0].Instances[0].PublicIpAddress' --output text) +log "Running: $PUBLIC_IP" + +# Cost guard +( + sleep $(( MAX_RUNTIME_MIN * 60 )) + echo "" + err "MAX RUNTIME (${MAX_RUNTIME_MIN}m) exceeded — terminating" + aws ec2 terminate-instances --instance-ids "$INSTANCE_ID" --region "$REGION" >/dev/null 
2>&1 + kill $$ 2>/dev/null +) & +WATCHDOG_PID=$! + +# ── Step 5: Wait for SSM ───────────────────────────────────────── +header "Connect" +log "Waiting for SSM agent..." +for i in $(seq 1 24); do + SSM_STATUS=$(aws ssm describe-instance-information \ + --filters "Key=InstanceIds,Values=$INSTANCE_ID" \ + --query 'InstanceInformationList[0].PingStatus' --output text 2>/dev/null || echo "None") + if [[ "$SSM_STATUS" == "Online" ]]; then + CONN_MODE="ssm" + break + fi + printf "." >&2 + sleep 10 +done +echo "" >&2 + +if [[ -z "$CONN_MODE" ]]; then + err "SSM agent did not register within 4 minutes." + echo " Possible causes:" + echo " - Instance has no internet access (needs NAT gateway or VPC endpoints)" + echo " - IAM role not propagated (try again in 30s)" + echo " - SSM agent not installed (unlikely on AL2023)" + echo "" + echo " For VPC without internet, create these VPC endpoints:" + echo " com.amazonaws.$REGION.ssm (Interface)" + echo " com.amazonaws.$REGION.ssmmessages (Interface)" + echo " com.amazonaws.$REGION.ec2messages (Interface)" + echo " com.amazonaws.$REGION.s3 (Gateway)" + exit 1 +fi +log "Connected via SSM" + +# Verify KVM +KVM_CHECK=$(remote_run "ls /dev/kvm 2>&1 && echo OK || echo MISSING" 30) +if [[ "$KVM_CHECK" != *"OK"* ]]; then + err "KVM not available. Instance type $INSTANCE_TYPE may not support nested virtualization." + err "Use c8i, c7i, or m7i instance families." + exit 1 +fi +log "KVM: ✓" + +# ── Step 6: Install toolchain ───────────────────────────────────── +if [[ -n "$CUSTOM_AMI" ]]; then + header "Toolchain (baked AMI — skipping)" +else + header "Install Toolchain" + + log "System packages..." + remote_run "sudo dnf install -q -y clang gcc make git openssl-devel lld 2>&1 | tail -3" 300 + + if [[ "$OFFLINE" == "true" ]]; then + # ── Offline mode: pre-vendored toolchain from S3 ────────── + log "[1/4] Rust toolchain (offline)..." 
+ remote_run "set -e; \ +aws s3 cp $VENDOR_BUCKET/rust-stable.tar.xz /tmp/rust-stable.tar.xz --region $REGION --no-progress && \ +aws s3 cp $VENDOR_BUCKET/rust-std-none.tar.xz /tmp/rust-std-none.tar.xz --region $REGION --no-progress && \ +tar xJf /tmp/rust-stable.tar.xz -C /tmp && \ +tar xJf /tmp/rust-std-none.tar.xz -C /tmp && \ +cd /tmp/rust-*-x86_64-unknown-linux-gnu && echo y | sudo ./install.sh --prefix=/usr/local > /dev/null 2>&1 && \ +cd /tmp/rust-std-*-x86_64-unknown-none && echo y | sudo ./install.sh --prefix=/usr/local > /dev/null 2>&1 && \ +echo done" 600 + + log "[2/4] rust-src + vendor..." + remote_run "set -e; \ +aws s3 cp $VENDOR_BUCKET/rust-src.tar.xz /tmp/rust-src.tar.xz --region $REGION --no-progress && \ +tar xJf /tmp/rust-src.tar.xz -C /tmp && \ +cd /tmp/rust-src-* && echo y | sudo ./install.sh --prefix=/usr/local > /dev/null 2>&1 && \ +aws s3 cp $VENDOR_BUCKET/all-vendor.tar.gz /tmp/all-vendor.tar.gz --region $REGION --no-progress && \ +sudo mkdir -p /opt/vendor && sudo tar xzf /tmp/all-vendor.tar.gz -C /opt/vendor && \ +for d in /root/.cargo ~/.cargo; do \ + sudo mkdir -p \$d && \ + printf '[source.crates-io]\nreplace-with = \"vendored-sources\"\n\n[source.vendored-sources]\ndirectory = \"/opt/vendor\"' | sudo tee \$d/config.toml > /dev/null; \ +done && \ +echo done" 600 + + log "[3/4] Tools (just, wasm-tools, cargo-hyperlight)..." 
+ remote_run "set -e; \ +aws s3 cp $VENDOR_BUCKET/just.tar.gz /tmp/just.tar.gz --region $REGION --no-progress && \ +cd /tmp && tar xzf just.tar.gz just && sudo mv just /usr/local/bin/ && \ +aws s3 cp $VENDOR_BUCKET/wasm-tools.tar.gz /tmp/wasm-tools.tar.gz --region $REGION --no-progress && \ +cd /tmp && tar xzf wasm-tools.tar.gz --strip-components=1 '*/wasm-tools' && sudo mv wasm-tools /usr/local/bin/ && \ +aws s3 cp $VENDOR_BUCKET/cargo-hyperlight-vendored.tar.gz /tmp/cargo-hyperlight-vendored.tar.gz --region $REGION --no-progress && \ +mkdir -p /tmp/cargo-hyperlight && cd /tmp/cargo-hyperlight && tar xzf /tmp/cargo-hyperlight-vendored.tar.gz && \ +cargo install --path . --root /usr/local --locked 2>&1 | tail -3 && \ +echo done" 600 + + else + # ── Online mode: install from internet ──────────────────── + # Detect Rust version from rust-toolchain.toml if present + SCRIPT_DIR_ABS="$(cd "$(dirname "$0")" && pwd)" + RUST_VERSION="stable" + if [[ -f "$SCRIPT_DIR_ABS/../rust-toolchain.toml" ]]; then + RUST_VERSION=$(grep -oP 'channel\s*=\s*"\K[^"]+' "$SCRIPT_DIR_ABS/../rust-toolchain.toml" 2>/dev/null || echo "stable") + elif [[ -f "$SCRIPT_DIR_ABS/../rust-toolchain" ]]; then + RUST_VERSION=$(cat "$SCRIPT_DIR_ABS/../rust-toolchain" | tr -d '[:space:]') + fi + log "[1/3] Rust toolchain (online, version: $RUST_VERSION)..." + remote_run "set -e; \ +curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain '$RUST_VERSION' 2>&1 | tail -3 && \ +source ~/.cargo/env && \ +rustup target add x86_64-unknown-none && \ +rustup component add rust-src && \ +echo done" 300 + + log "[2/3] Tools..." + remote_run "set -e; source ~/.cargo/env && \ +cargo install just wasm-tools cargo-hyperlight 2>&1 | tail -5 && \ +echo done" 600 + + log "[3/3] Verify..." 
+ fi + + log "[4/4] Verify toolchain:" + remote_run "export PATH=\$HOME/.cargo/bin:/usr/local/bin:\$PATH && \ +rustc --version && cargo --version && just --version && wasm-tools --version && cargo-hyperlight --version" +fi + +# ── Step 7: Clone and build ─────────────────────────────────────── +header "Build" + +# Determine source: S3 tarball (offline) or git clone (online) +if [[ "$OFFLINE" == "true" ]]; then + log "Extracting repo from S3..." + remote_run "set -e; \ +aws s3 cp $VENDOR_BUCKET/hyperlight-repo.tar.gz /tmp/hyperlight-repo.tar.gz --region $REGION --no-progress && \ +mkdir -p ~/hyperlight && cd ~/hyperlight && tar xzf /tmp/hyperlight-repo.tar.gz 2>/dev/null && \ +echo extracted" 300 +else + # Get the repo URL and branch from git + REPO_URL=$(git -C "$(dirname "$0")/.." remote get-url origin 2>/dev/null || echo "https://github.com/hyperlight-dev/hyperlight.git") + BRANCH=$(git -C "$(dirname "$0")/.." rev-parse --abbrev-ref HEAD 2>/dev/null || echo "main") + log "Cloning $REPO_URL @ $BRANCH..." + remote_run "set -e; \ +GIT_TERMINAL_PROMPT=0 git clone --depth 1 --branch '$BRANCH' '$REPO_URL' ~/hyperlight 2>&1 | tail -3 && \ +echo cloned" 300 +fi + +log "Building guest binaries..." 
+# Build the Rust guests, then the C guests, logging to /tmp/build.log on the
+# instance. The remote command prints the last 15 log lines and exits with
+# the status of the build chain.
+remote_run "export PATH=\$HOME/.cargo/bin:/usr/local/bin:\$PATH && \
+cd ~/hyperlight && just build-rust-guests debug > /tmp/build.log 2>&1 && \
+mkdir -p src/tests/rust_guests/bin/debug && \
+just move-rust-guests debug >> /tmp/build.log 2>&1 && \
+mkdir -p src/tests/c_guests/bin/debug && \
+just build-c-guests debug >> /tmp/build.log 2>&1 && \
+just move-c-guests debug >> /tmp/build.log 2>&1; \
+RC=\$?; tail -15 /tmp/build.log; echo EXIT=\$RC; exit \$RC" 900
+
+# ── Step 8: Run tests ─────────────────────────────────────────────
+header "Tests"
+
+# Both modes log to /tmp/test.log and exit with cargo's own status.
+# The filtered run must not pipe cargo straight into `tail`: the remote exit
+# status would then be tail's, and failing tests would be reported as success.
+TEST_ARGS="--package hyperlight-host"
+TEST_TAIL=40
+if [[ -n "$TEST_FILTER" ]]; then
+    log "Running filtered tests: $TEST_FILTER"
+    # %q shell-quotes the filter so quotes or spaces in it survive the remote bash.
+    printf -v TEST_FILTER_Q '%q' "$TEST_FILTER"
+    TEST_ARGS="$TEST_ARGS --lib $TEST_FILTER_Q"
+    TEST_TAIL=30
+else
+    log "Running full hyperlight-host test suite..."
+fi
+TEST_CMD="export PATH=\$HOME/.cargo/bin:/usr/local/bin:\$PATH && cd ~/hyperlight && \
+cargo test $TEST_ARGS > /tmp/test.log 2>&1; \
+RC=\$?; tail -n $TEST_TAIL /tmp/test.log; echo EXIT=\$RC; exit \$RC"
+
+# Record the result instead of aborting, so the later steps still run and the
+# script can exit with the test status at the end.
+TEST_EXIT=0
+remote_run "$TEST_CMD" 900 || TEST_EXIT=$? 
+ +# ── Step 9: AMI bake (optional) ─────────────────────────────────── +if [[ "$BAKE_AMI" == "true" ]]; then + header "Bake AMI" + AMI_NAME="hyperlight-kvm-test-$(date +%Y%m%d)" + log "Creating AMI: $AMI_NAME" + NEW_AMI=$(aws ec2 create-image --instance-id "$INSTANCE_ID" \ + --name "$AMI_NAME" --description "Hyperlight KVM test runner - Rust toolchain baked" \ + --no-reboot --query ImageId --output text) + log "AMI $NEW_AMI creating (use --ami $NEW_AMI next time to skip install)" +fi + +# ── Done ────────────────────────────────────────────────────────── +header "Complete" +log "Finished in $(elapsed)" +if [[ "$KEEP_INSTANCE" == "true" ]]; then + log "Instance kept alive: aws ssm start-session --target $INSTANCE_ID" +fi +exit "$TEST_EXIT" diff --git a/scripts/test-runner-v2.sh b/scripts/test-runner-v2.sh new file mode 100755 index 000000000..a4af378af --- /dev/null +++ b/scripts/test-runner-v2.sh @@ -0,0 +1,455 @@ +#!/usr/bin/env bash +set -euo pipefail +export AWS_PAGER="" + +# Hyperlight Test Runner v2 +# Improvements over v1: +# - Credential pre-check with auto-refresh +# - Suppresses macOS xattr tar warnings +# - Progress timestamps on long SSM commands +# - AMI bake support (--bake flag) +# - Lockfile/vendor consistency validation +# - Cleaner output formatting + +ACCOUNT_ID="753102249842" +REGION="us-east-1" +INSTANCE_TYPE="c8i.2xlarge" +S3_TOOLCHAIN="s3://hyperlight-toolchain-753102249842" +AMI_NAME="hyperlight-test-runner" +SSM_ROLE_NAME="hyperlight-test-ssm-role" +SSM_PROFILE_NAME="hyperlight-test-ssm-profile" +KEY_NAME="hyperlight-test-key" +SG_NAME="hyperlight-test-sg" +SSH_PORT=443 +MAX_RUNTIME_MIN=45 +BAKE_AMI=false +SKIP_INSTALL=false + +while [[ $# -gt 0 ]]; do + case "$1" in + --timeout|-t) MAX_RUNTIME_MIN="$2"; shift 2 ;; + --bake) BAKE_AMI=true; shift ;; + --skip-install) SKIP_INSTALL=true; shift ;; + --help|-h) + echo "Usage: $0 [OPTIONS]" + echo " --timeout|-t MIN Auto-terminate after MIN minutes (default: 45)" + echo " --bake Bake AMI after 
successful run"
+            echo " --skip-install Skip toolchain install (use with baked AMI)"
+            exit 0 ;;
+        *) echo "Unknown option: $1. Use --help for usage."; exit 1 ;;
+    esac
+done
+
+export AWS_DEFAULT_REGION="$REGION"
+unset AWS_PROFILE 2>/dev/null || true
+
+# State
+CONN_MODE=""
+INSTANCE_ID=""
+SG_ID=""
+VPCE_SG_ID=""
+KEY_FILE="/tmp/${KEY_NAME}.pem"
+CREATED_VPCE_IDS=()
+WATCHDOG_PID=""
+PUBLIC_IP=""
+START_TIME=$(date +%s)
+
+# ── Helpers ───────────────────────────────────────────────────────
+elapsed() { echo "$(( $(date +%s) - START_TIME ))s"; }
+log() { printf "[%s] %s\n" "$(elapsed)" "$*"; }
+header() { printf "\n━━━ %s ━━━\n" "$*"; }
+
+# EXIT trap: stop the watchdog, terminate the instance if one was launched,
+# and delete the key pair and its local file. The security group is kept.
+cleanup() {
+    echo ""
+    header "CLEANUP"
+    [[ -n "$WATCHDOG_PID" ]] && kill "$WATCHDOG_PID" 2>/dev/null || true
+    if [[ -n "$INSTANCE_ID" ]]; then
+        log "Terminating instance $INSTANCE_ID..."
+        aws ec2 terminate-instances --instance-ids "$INSTANCE_ID" 2>/dev/null || true
+    fi
+    [[ -n "$SG_ID" ]] && log "Security group $SG_ID retained for reuse"
+    aws ec2 delete-key-pair --key-name "$KEY_NAME" 2>/dev/null || true
+    rm -f "$KEY_FILE"
+    log "Done. Total time: $(elapsed)"
+}
+trap cleanup EXIT
+
+# ── Remote execution ──────────────────────────────────────────────
+# ssm_run CMD [TIMEOUT_SECS] [LABEL]
+#
+# Runs CMD on $INSTANCE_ID through SSM Run Command (AWS-RunShellScript) and
+# polls every 5 seconds until the invocation reaches a terminal status.
+# The remote stdout is printed on stdout; the remote stderr is printed on
+# stderr with tar xattr warnings filtered out; progress goes to stderr.
+#
+#   CMD           shell text executed by bash on the instance
+#   TIMEOUT_SECS  SSM execution timeout (default: 600)
+#   LABEL         accepted for compatibility with existing callers; unused
+#
+# Returns 0 only when the invocation status is "Success", 1 otherwise.
+ssm_run() {
+    local cmd="$1"
+    local timeout="${2:-600}"
+
+    # Ship the command base64-encoded and pipe it into bash on the instance.
+    # Splicing it into "bash -c '$cmd'" breaks as soon as CMD contains a single
+    # quote, which several callers pass. Newlines are stripped because some
+    # base64 implementations wrap their output, and a raw newline inside the
+    # JSON string below is invalid.
+    local encoded
+    encoded=$(printf '%s' "$cmd" | base64 | tr -d '\n')
+    local params="{\"commands\":[\"echo $encoded | base64 -d | bash\"]}"
+
+    # `|| true` keeps `set -e` from aborting the script on a failed submit, so
+    # the InvalidInstanceId handling below can actually run.
+    local cmd_id
+    cmd_id=$(aws ssm send-command \
+        --instance-ids "$INSTANCE_ID" \
+        --document-name "AWS-RunShellScript" \
+        --parameters "$params" \
+        --timeout-seconds "$timeout" \
+        --query Command.CommandId --output text 2>&1) || true
+    if [[ "$cmd_id" == *"InvalidInstanceId"* ]]; then
+        # Retry only if the instance is still running.
+        local state
+        state=$(aws ec2 describe-instances --instance-ids "$INSTANCE_ID" \
+            --query 'Reservations[0].Instances[0].State.Name' --output text 2>/dev/null || echo "unknown")
+        if [[ "$state" != "running" ]]; then
+            echo "ERROR: Instance no longer running (state: $state)" >&2
+            return 1
+        fi
+        log "SSM agent unresponsive — retrying in 30s..." >&2
+        sleep 30
+        cmd_id=$(aws ssm send-command \
+            --instance-ids "$INSTANCE_ID" \
+            --document-name "AWS-RunShellScript" \
+            --parameters "$params" \
+            --timeout-seconds "$timeout" \
+            --query Command.CommandId --output text) || cmd_id=""
+    fi
+    # A command id is a 36-character UUID; anything else is CLI error text and
+    # must not be polled as if it were an id.
+    if [[ ! "$cmd_id" =~ ^[0-9a-fA-F-]{36}$ ]]; then
+        echo "ERROR: ssm send-command failed: ${cmd_id:-no command id returned}" >&2
+        return 1
+    fi
+    # Poll for the SSM timeout plus one minute of slack (12 polls x 5s).
+    local max_polls=$(( timeout / 5 + 12 ))
+    local poll_count=0
+    for _ in $(seq 1 "$max_polls"); do
+        local status
+        status=$(aws ssm get-command-invocation \
+            --command-id "$cmd_id" --instance-id "$INSTANCE_ID" \
+            --query Status --output text 2>/dev/null || echo "Pending")
+        case "$status" in
+            Success|Failed|TimedOut|Cancelled)
+                echo "" >&2
+                aws ssm get-command-invocation \
+                    --command-id "$cmd_id" --instance-id "$INSTANCE_ID" \
+                    --query 'StandardOutputContent' --output text 2>/dev/null
+                local stderr
+                stderr=$(aws ssm get-command-invocation \
+                    --command-id "$cmd_id" --instance-id "$INSTANCE_ID" \
+                    --query 'StandardErrorContent' --output text 2>/dev/null)
+                # Filter out macOS xattr warnings from tar
+                if [[ -n "$stderr" && "$stderr" != "None" ]]; then
+                    echo "$stderr" | grep -v 'LIBARCHIVE.xattr\|Ignoring unknown extended header' >&2 || true
+                fi
+                [[ "$status" == "Success" ]] && return 0 || return 1
+                ;;
+        esac
+        poll_count=$(( poll_count + 1 ))
+        # Show elapsed time every 30s instead of just dots
+        if (( poll_count % 6 == 0 )); then
+            printf " %ds" $(( poll_count * 5 )) >&2
+        else
+            printf "." >&2
+        fi
+        sleep 5
+    done
+    echo "" >&2
+    echo "[SSM command timed out after ${timeout}s]" >&2
+    return 1
+}
+
+# ssh_run CMD: run CMD on the instance over SSH as ec2-user on $SSH_PORT.
+ssh_run() {
+    local cmd="$1"
+    ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 \
+        -i "$KEY_FILE" -p "$SSH_PORT" "ec2-user@$PUBLIC_IP" "$cmd"
+}
+
+# remote_run CMD [TIMEOUT_SECS] [LABEL]: dispatch to ssm_run or ssh_run based
+# on $CONN_MODE. Only CMD is forwarded in SSH mode.
+remote_run() {
+    if [[ "$CONN_MODE" == "ssm" ]]; then
+        ssm_run "$@"
+    else
+        ssh_run "$1"
+    fi
+}
+
+# ── Step 1: Credential pre-check ─────────────────────────────────
+header "Credentials"
+CALLER=$(aws sts get-caller-identity --query Account --output text 2>/dev/null || echo "EXPIRED")
+if [[ "$CALLER" != "$ACCOUNT_ID" ]]; then
+    log "Credentials expired or wrong account ($CALLER) — refreshing..."
+    if command -v ada-creds &>/dev/null; then
+        ada-creds --account "$ACCOUNT_ID" --role Admin --provider isengard
+    else
+        ada credentials update --account "$ACCOUNT_ID" --role Admin --provider isengard --once
+    fi
+    CALLER=$(aws sts get-caller-identity --query Account --output text)
+    [[ "$CALLER" != "$ACCOUNT_ID" ]] && echo "ERROR: Still wrong account: $CALLER" && exit 1
+fi
+log "Authenticated to $ACCOUNT_ID"
+
+# Verify credentials won't expire mid-run (warn if <50 min remaining)
+EXPIRY=$(aws sts get-caller-identity --query Arn --output text 2>/dev/null | grep -o 'Isengard' || true)
+log "Role: Admin via Isengard (ensure session > ${MAX_RUNTIME_MIN}m)"
+
+# ── Step 2: IAM role for SSM ─────────────────────────────────────
+header "IAM + SSH + VPC"
+if ! aws iam get-role --role-name "$SSM_ROLE_NAME" &>/dev/null; then
+    log "Creating IAM role $SSM_ROLE_NAME..." 
+ aws iam create-role --role-name "$SSM_ROLE_NAME" \ + --assume-role-policy-document '{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Principal":{"Service":"ec2.amazonaws.com"},"Action":"sts:AssumeRole"}]}' + aws iam attach-role-policy --role-name "$SSM_ROLE_NAME" \ + --policy-arn arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore + sleep 5 +fi +aws iam create-instance-profile --instance-profile-name "$SSM_PROFILE_NAME" 2>/dev/null || true +aws iam add-role-to-instance-profile --instance-profile-name "$SSM_PROFILE_NAME" --role-name "$SSM_ROLE_NAME" 2>/dev/null || true +aws iam put-role-policy --role-name "$SSM_ROLE_NAME" --policy-name s3-toolchain-read \ + --policy-document "{\"Version\":\"2012-10-17\",\"Statement\":[{\"Effect\":\"Allow\",\"Action\":[\"s3:GetObject\",\"s3:ListBucket\"],\"Resource\":[\"arn:aws:s3:::hyperlight-toolchain-$ACCOUNT_ID\",\"arn:aws:s3:::hyperlight-toolchain-$ACCOUNT_ID/*\"]}]}" +log "SSM role ready" + +# SSH key pair +aws ec2 delete-key-pair --key-name "$KEY_NAME" 2>/dev/null || true +aws ec2 create-key-pair --key-name "$KEY_NAME" --query KeyMaterial --output text > "$KEY_FILE" +chmod 600 "$KEY_FILE" +sleep 2 + +# Security group +MY_IP=$(curl -s --connect-timeout 5 https://checkip.amazonaws.com) +SG_ID=$(aws ec2 describe-security-groups --filters "Name=group-name,Values=$SG_NAME" \ + --query 'SecurityGroups[0].GroupId' --output text 2>/dev/null || echo "None") +if [[ "$SG_ID" == "None" || -z "$SG_ID" ]]; then + SG_ID=$(aws ec2 create-security-group --group-name "$SG_NAME" \ + --description "Hyperlight test SSH fallback" --query GroupId --output text) +fi +aws ec2 revoke-security-group-ingress --group-id "$SG_ID" \ + --protocol tcp --port "$SSH_PORT" --cidr "0.0.0.0/0" 2>/dev/null || true +OLD_RULES=$(aws ec2 describe-security-groups --group-ids "$SG_ID" \ + --query 'SecurityGroups[0].IpPermissions' --output json 2>/dev/null) +[[ "$OLD_RULES" != "[]" && "$OLD_RULES" != "null" ]] && \ + aws ec2 revoke-security-group-ingress 
--group-id "$SG_ID" --ip-permissions "$OLD_RULES" 2>/dev/null || true +aws ec2 authorize-security-group-ingress --group-id "$SG_ID" \ + --protocol tcp --port "$SSH_PORT" --cidr "${MY_IP}/32" +log "SG $SG_ID — port $SSH_PORT from ${MY_IP}/32" + +# VPC endpoints (SSM + S3) +VPC_ID=$(aws ec2 describe-vpcs --filters "Name=isDefault,Values=true" \ + --query 'Vpcs[0].VpcId' --output text) +SUBNET_IDS=$(aws ec2 describe-subnets --filters "Name=vpc-id,Values=$VPC_ID" \ + --query 'Subnets[*].SubnetId' --output text | tr '\t' ',') +FIRST_SUBNET=$(echo "$SUBNET_IDS" | cut -d',' -f1) + +VPCE_SG_NAME="hyperlight-vpce-sg" +VPCE_SG_ID=$(aws ec2 describe-security-groups --filters "Name=group-name,Values=$VPCE_SG_NAME" "Name=vpc-id,Values=$VPC_ID" \ + --query 'SecurityGroups[0].GroupId' --output text 2>/dev/null || echo "None") +if [[ "$VPCE_SG_ID" == "None" || -z "$VPCE_SG_ID" ]]; then + VPCE_SG_ID=$(aws ec2 create-security-group --group-name "$VPCE_SG_NAME" \ + --description "HTTPS for VPC endpoints" --vpc-id "$VPC_ID" --query GroupId --output text) + VPC_CIDR=$(aws ec2 describe-vpcs --vpc-ids "$VPC_ID" --query 'Vpcs[0].CidrBlock' --output text) + aws ec2 authorize-security-group-ingress --group-id "$VPCE_SG_ID" \ + --protocol tcp --port 443 --cidr "$VPC_CIDR" +fi + +MAIN_RTB=$(aws ec2 describe-route-tables --filters "Name=vpc-id,Values=$VPC_ID" "Name=association.main,Values=true" \ + --query 'RouteTables[0].RouteTableId' --output text) +for SVC in ssm ssmmessages ec2messages; do + SVC_NAME="com.amazonaws.${REGION}.${SVC}" + EXISTING=$(aws ec2 describe-vpc-endpoints \ + --filters "Name=service-name,Values=$SVC_NAME" "Name=vpc-id,Values=$VPC_ID" "Name=vpc-endpoint-state,Values=available,pending" \ + --query 'VpcEndpoints[0].VpcEndpointId' --output text 2>/dev/null || echo "None") + if [[ "$EXISTING" == "None" || -z "$EXISTING" ]]; then + VPCE=$(aws ec2 create-vpc-endpoint --vpc-id "$VPC_ID" --vpc-endpoint-type Interface \ + --service-name "$SVC_NAME" --subnet-ids "$FIRST_SUBNET" \ 
+            --security-group-ids "$VPCE_SG_ID" --private-dns-enabled \
+            --query 'VpcEndpoint.VpcEndpointId' --output text)
+        CREATED_VPCE_IDS+=("$VPCE")
+    fi
+done
+S3_SVC="com.amazonaws.${REGION}.s3"
+S3_EXISTING=$(aws ec2 describe-vpc-endpoints \
+    --filters "Name=service-name,Values=$S3_SVC" "Name=vpc-id,Values=$VPC_ID" "Name=vpc-endpoint-state,Values=available,pending" \
+    --query 'VpcEndpoints[0].VpcEndpointId' --output text 2>/dev/null || echo "None")
+if [[ "$S3_EXISTING" == "None" || -z "$S3_EXISTING" ]]; then
+    aws ec2 create-vpc-endpoint --vpc-id "$VPC_ID" --vpc-endpoint-type Gateway \
+        --service-name "$S3_SVC" --route-table-ids "$MAIN_RTB" >/dev/null
+fi
+
+# Wait up to 30 x 5s for each endpoint created in this run to become available.
+if [[ ${#CREATED_VPCE_IDS[@]} -gt 0 ]]; then
+    log "Waiting for ${#CREATED_VPCE_IDS[@]} new VPC endpoints..."
+    for VPCE in "${CREATED_VPCE_IDS[@]}"; do
+        for i in $(seq 1 30); do
+            STATE=$(aws ec2 describe-vpc-endpoints --vpc-endpoint-ids "$VPCE" \
+                --query 'VpcEndpoints[0].State' --output text 2>/dev/null)
+            [[ "$STATE" == "available" ]] && break
+            sleep 5
+        done
+    done
+fi
+log "VPC endpoints ready"
+
+# ── Step 3: Launch instance ───────────────────────────────────────
+header "Launch Instance"
+# Prefer a previously baked AMI named $AMI_NAME; otherwise use the latest AL2023.
+BAKED_AMI=$(aws ec2 describe-images --owners self --filters "Name=name,Values=$AMI_NAME" \
+    --query 'Images[0].ImageId' --output text 2>/dev/null || echo "None")
+if [[ "$BAKED_AMI" != "None" && -n "$BAKED_AMI" ]]; then
+    AMI_ID="$BAKED_AMI"
+    SKIP_INSTALL=true
+    log "Using baked AMI: $AMI_ID (skipping toolchain install)"
+else
+    AMI_ID=$(aws ssm get-parameter \
+        --name /aws/service/ami-amazon-linux-latest/al2023-ami-kernel-default-x86_64 \
+        --query Parameter.Value --output text)
+    log "Base AL2023: $AMI_ID"
+fi
+
+# Boot script that makes sshd listen on $SSH_PORT (used by the SSH fallback).
+# Kept as plain text: boto3 base64-encodes UserData for run_instances itself,
+# so pre-encoding it here would deliver double-encoded text to the instance
+# and the script would never run.
+USER_DATA="#!/bin/bash
+sed -i 's/^#Port 22/Port $SSH_PORT/' /etc/ssh/sshd_config
+echo 'Port $SSH_PORT' >> /etc/ssh/sshd_config
+systemctl restart sshd
+"
+
+# The script reaches Python through the environment rather than being spliced
+# into the source, so its quotes and newlines cannot break the Python literal.
+INSTANCE_ID=$(USER_DATA="$USER_DATA" python3 -c "
+import os
+import boto3
+ec2 = boto3.client('ec2', region_name='$REGION')
+r = ec2.run_instances(
+    ImageId='$AMI_ID', InstanceType='$INSTANCE_TYPE',
+    KeyName='$KEY_NAME',
+    IamInstanceProfile={'Name': '$SSM_PROFILE_NAME'},
+    SecurityGroupIds=['$SG_ID'],
+    CpuOptions={'NestedVirtualization': 'enabled'},
+    UserData=os.environ['USER_DATA'],
+    BlockDeviceMappings=[{'DeviceName': '/dev/xvda', 'Ebs': {'VolumeSize': 30, 'VolumeType': 'gp3'}}],
+    TagSpecifications=[{'ResourceType': 'instance', 'Tags': [{'Key': 'Name', 'Value': 'hyperlight-test'}]}],
+    MinCount=1, MaxCount=1)
+print(r['Instances'][0]['InstanceId'])
+")
+log "Launched: $INSTANCE_ID ($INSTANCE_TYPE)"
+
+aws ec2 wait instance-running --instance-ids "$INSTANCE_ID"
+PUBLIC_IP=$(aws ec2 describe-instances --instance-ids "$INSTANCE_ID" \
+    --query 'Reservations[0].Instances[0].PublicIpAddress' --output text)
+log "Running at $PUBLIC_IP"
+
+# Cost guard: a background subshell that terminates the instance and signals
+# this script once MAX_RUNTIME_MIN minutes have passed.
+log "⏱ Auto-terminate in ${MAX_RUNTIME_MIN}m"
+(
+    sleep $(( MAX_RUNTIME_MIN * 60 ))
+    echo ""
+    echo "⚠️ MAX RUNTIME (${MAX_RUNTIME_MIN}m) EXCEEDED — terminating"
+    aws ec2 terminate-instances --instance-ids "$INSTANCE_ID" --region "$REGION" 2>/dev/null
+    kill $$ 2>/dev/null
+) &
+WATCHDOG_PID=$!
+
+# ── Step 4: Connect ──────────────────────────────────────────────
+header "Connect"
+# Try SSM first (only when the local session-manager-plugin is installed),
+# checking every 10s, up to 18 times, for the agent to report Online.
+if command -v session-manager-plugin &>/dev/null; then
+    for i in $(seq 1 18); do
+        SSM_STATUS=$(aws ssm describe-instance-information \
+            --filters "Key=InstanceIds,Values=$INSTANCE_ID" \
+            --query 'InstanceInformationList[0].PingStatus' --output text 2>/dev/null || echo "None")
+        if [[ "$SSM_STATUS" == "Online" ]]; then
+            CONN_MODE="ssm"
+            break
+        fi
+        printf "." 
>&2 + sleep 10 + done + echo "" >&2 +fi + +if [[ -z "$CONN_MODE" ]]; then + log "SSM unavailable — trying SSH on port $SSH_PORT" + for i in $(seq 1 20); do + if ssh -o StrictHostKeyChecking=no -o ConnectTimeout=5 -o BatchMode=yes \ + -i "$KEY_FILE" -p "$SSH_PORT" "ec2-user@$PUBLIC_IP" true 2>/dev/null; then + CONN_MODE="ssh" + break + fi + sleep 10 + done +fi + +[[ -z "$CONN_MODE" ]] && echo "ERROR: No connection" && exit 1 +log "Connected via $CONN_MODE" + +# Verify KVM +remote_run "ls /dev/kvm >/dev/null 2>&1 && echo 'KVM: ok' || echo 'KVM: MISSING'" + +# ── Step 5: Install toolchain ───────────────────────────────────── +if [[ "$SKIP_INSTALL" == "false" ]]; then + header "Install Toolchain" + + log "Installing system deps..." + remote_run "sudo dnf install -q -y clang gcc make git openssl-devel 2>&1 | tail -3; echo done" 900 + + log "[1/4] Rust toolchain + targets..." + remote_run "set -e; \ +aws s3 cp $S3_TOOLCHAIN/rust-stable.tar.xz /tmp/rust-stable.tar.xz --region $REGION --no-progress && \ +aws s3 cp $S3_TOOLCHAIN/rust-std-none.tar.xz /tmp/rust-std-none.tar.xz --region $REGION --no-progress && \ +tar xJf /tmp/rust-stable.tar.xz -C /tmp && \ +tar xJf /tmp/rust-std-none.tar.xz -C /tmp && \ +cd /tmp/rust-*-x86_64-unknown-linux-gnu && echo y | sudo ./install.sh --prefix=/usr/local > /dev/null 2>&1 && \ +cd /tmp/rust-std-*-x86_64-unknown-none && echo y | sudo ./install.sh --prefix=/usr/local > /dev/null 2>&1 && \ +echo rust-done" 600 + + log "[2/4] rust-src + vendor + tools..." 
+ remote_run "set -e; \ +aws s3 cp $S3_TOOLCHAIN/rust-src.tar.xz /tmp/rust-src.tar.xz --region $REGION --no-progress && \ +tar xJf /tmp/rust-src.tar.xz -C /tmp && \ +cd /tmp/rust-src-* && echo y | sudo ./install.sh --prefix=/usr/local > /dev/null 2>&1 && \ +aws s3 cp $S3_TOOLCHAIN/all-vendor.tar.gz /tmp/all-vendor.tar.gz --region $REGION --no-progress && \ +sudo mkdir -p /opt/vendor && sudo tar xzf /tmp/all-vendor.tar.gz -C /opt/vendor && \ +sudo mkdir -p /root/.cargo && \ +printf '[source.crates-io]\nreplace-with = \"vendored-sources\"\n\n[source.vendored-sources]\ndirectory = \"/opt/vendor\"' | sudo tee /root/.cargo/config.toml > /dev/null && \ +mkdir -p ~/.cargo && \ +printf '[source.crates-io]\nreplace-with = \"vendored-sources\"\n\n[source.vendored-sources]\ndirectory = \"/opt/vendor\"' > ~/.cargo/config.toml && \ +aws s3 cp $S3_TOOLCHAIN/just.tar.gz /tmp/just.tar.gz --region $REGION --no-progress && \ +cd /tmp && tar xzf just.tar.gz just && sudo mv just /usr/local/bin/ && \ +aws s3 cp $S3_TOOLCHAIN/wasm-tools.tar.gz /tmp/wasm-tools.tar.gz --region $REGION --no-progress && \ +cd /tmp && tar xzf wasm-tools.tar.gz --strip-components=1 '*/wasm-tools' && sudo mv wasm-tools /usr/local/bin/ && \ +echo vendor-tools-done" 600 + + log "[3/4] cargo-hyperlight..." + remote_run "set -e; \ +aws s3 cp $S3_TOOLCHAIN/cargo-hyperlight-vendored.tar.gz /tmp/cargo-hyperlight-vendored.tar.gz --region $REGION --no-progress && \ +mkdir -p /tmp/cargo-hyperlight && cd /tmp/cargo-hyperlight && tar xzf /tmp/cargo-hyperlight-vendored.tar.gz && \ +cargo install --path . --root /usr/local --locked 2>&1 | tail -3 && \ +echo cargo-hyperlight-done" 600 + + log "[4/4] Verify..." + remote_run "rustc --version && cargo --version && just --version && wasm-tools --version && cargo-hyperlight --version" +else + header "Toolchain (skipped — baked AMI)" +fi + +# ── Step 6: Clone and build ─────────────────────────────────────── +header "Build" +log "Extracting repo..." 
+remote_run "set -e; \ +aws s3 cp $S3_TOOLCHAIN/hyperlight-repo.tar.gz /tmp/hyperlight-repo.tar.gz --region $REGION --no-progress && \ +mkdir -p ~/hyperlight && cd ~/hyperlight && tar xzf /tmp/hyperlight-repo.tar.gz 2>/dev/null && \ +echo repo-extracted" 300 + +log "Building guest binaries..." +remote_run "cd ~/hyperlight && just build-rust-guests debug > /tmp/build.log 2>&1 && \ +mkdir -p src/tests/rust_guests/bin/debug && \ +just move-rust-guests debug >> /tmp/build.log 2>&1; \ +RC=\$?; echo BUILD_TAIL_START; tail -20 /tmp/build.log; echo BUILD_EXIT=\$RC; exit \$RC" 900 + +# ── Step 7: Run tests ───────────────────────────────────────────── +header "Tests" +log "Running map_region tests..." +remote_run "cd ~/hyperlight && cargo test --package hyperlight-host --lib map_region_ 2>&1 | tail -20" 600 || true + +echo "" +log "Running full hyperlight-host test suite..." +remote_run "cd ~/hyperlight && cargo test --package hyperlight-host > /tmp/test.log 2>&1; \ +RC=\$?; echo TEST_TAIL_START; tail -40 /tmp/test.log; echo TEST_EXIT=\$RC; exit \$RC" 900 || true + +# ── Step 8: AMI bake ───────────────────────────────────────────── +if [[ "$BAKE_AMI" == "true" && "$SKIP_INSTALL" == "false" ]]; then + header "Bake AMI" + # Delete old AMI if exists + OLD_AMI=$(aws ec2 describe-images --owners self --filters "Name=name,Values=$AMI_NAME" \ + --query 'Images[0].ImageId' --output text 2>/dev/null || echo "None") + if [[ "$OLD_AMI" != "None" && -n "$OLD_AMI" ]]; then + log "Deregistering old AMI $OLD_AMI..." + aws ec2 deregister-image --image-id "$OLD_AMI" + fi + log "Creating AMI '$AMI_NAME'..." 
+ NEW_AMI=$(aws ec2 create-image --instance-id "$INSTANCE_ID" \ + --name "$AMI_NAME" --description "Hyperlight test runner - Rust toolchain baked" \ + --no-reboot --query ImageId --output text) + log "AMI $NEW_AMI being created (available in ~5 min)" +fi + +# ── Done ────────────────────────────────────────────────────────── +header "DONE" +log "Total time: $(elapsed)" diff --git a/scripts/test-runner.sh b/scripts/test-runner.sh new file mode 100755 index 000000000..3901dced0 --- /dev/null +++ b/scripts/test-runner.sh @@ -0,0 +1,475 @@ +#!/usr/bin/env bash +set -euo pipefail +export AWS_PAGER="" + +# Hyperlight Test Runner — launches a c8i.2xlarge with nested KVM, +# installs deps, runs tests, and terminates the instance. +# +# Connection: SSM primary (tunnels through AWS API), SSH fallback +# if SSM agent doesn't register within 180 seconds. +# +# Prerequisites: +# 1. SSM plugin: brew install --cask session-manager-plugin +# (optional — SSH fallback works without it) +# 2. Run: ada credentials update --account 753102249842 --role Admin --provider isengard +# 3.
# Usage: test-runner.sh [--branch|-b BRANCH] [--timeout|-t MINUTES]

# ── Configuration ─────────────────────────────────────────────────
ACCOUNT_ID="753102249842"
REGION="us-east-1"
INSTANCE_TYPE="c8i.2xlarge"
# NOTE(review): FORK_URL and BRANCH are parsed but not referenced by any later
# step visible in this script (the repo is fetched as a tarball from S3).
FORK_URL="https://github.com/Richard-Durkee/hyperlight.git"
BRANCH="fix/validate-map-region-overlap"
S3_TOOLCHAIN="s3://hyperlight-toolchain-753102249842"
AMI_NAME="hyperlight-test-runner"
SSM_ROLE_NAME="hyperlight-test-ssm-role"
SSM_PROFILE_NAME="hyperlight-test-ssm-profile"
KEY_NAME="hyperlight-test-key"
SG_NAME="hyperlight-test-sg"
SSH_PORT=443          # ISPs often block port 22; 443 is rarely blocked
MAX_RUNTIME_MIN=45    # Auto-terminate instance after this many minutes (~$3 cap)

# ── Argument parsing ──────────────────────────────────────────────
# ${2:?…} reports a readable error when an option is given without a value;
# a bare "$2" would die with "unbound variable" under `set -u`.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --branch|-b)  BRANCH="${2:?--branch requires a value}"; shift 2 ;;
    --timeout|-t) MAX_RUNTIME_MIN="${2:?--timeout requires a value}"; shift 2 ;;
    *) echo "Usage: $0 [--branch|-b BRANCH] [--timeout|-t MINUTES]"; exit 1 ;;
  esac
done

# MAX_RUNTIME_MIN is later used inside $(( … )); reject anything that is not a
# positive integer before it reaches arithmetic evaluation.
if [[ ! "$MAX_RUNTIME_MIN" =~ ^[1-9][0-9]*$ ]]; then
  echo "ERROR: --timeout must be a positive integer (minutes), got '$MAX_RUNTIME_MIN'" >&2
  exit 1
fi

export AWS_DEFAULT_REGION="$REGION"
unset AWS_PROFILE 2>/dev/null || true

# ── Global state (filled in by later steps, read by cleanup) ──────
CONN_MODE=""          # "ssm" or "ssh"; set after the instance boots
INSTANCE_ID=""
SG_ID=""
VPCE_SG_ID=""
KEY_FILE="/tmp/${KEY_NAME}.pem"
CREATED_VPCE_IDS=()
WATCHDOG_PID=""

# cleanup — EXIT trap handler.
# Stops the cost-guard watchdog, terminates the instance (if one was launched),
# deletes the throw-away key pair and its local file. The security group and
# the VPC endpoints are intentionally kept for reuse by later runs.
cleanup() {
  echo ""
  echo "=== CLEANUP ==="
  if [[ -n "$WATCHDOG_PID" ]]; then
    kill "$WATCHDOG_PID" 2>/dev/null || true
  fi
  if [[ -n "$INSTANCE_ID" ]]; then
    echo "Terminating instance $INSTANCE_ID..."
    aws ec2 terminate-instances --instance-ids "$INSTANCE_ID" 2>/dev/null || true
    aws ec2 wait instance-terminated --instance-ids "$INSTANCE_ID" 2>/dev/null || sleep 30
  fi
  if [[ -n "$SG_ID" ]]; then
    echo "Security group $SG_ID retained for reuse"
  fi
  aws ec2 delete-key-pair --key-name "$KEY_NAME" 2>/dev/null || true
  rm -f "$KEY_FILE"
  # Note: VPC endpoints and their SG are NOT cleaned up — they persist for reuse
  echo "Cleanup complete."
}
trap cleanup EXIT

# ── Remote execution abstraction ──────────────────────────────────

# _ssm_params CMD
# Prints the JSON --parameters document for AWS-RunShellScript.
# CMD is shipped base64-encoded and decoded on the instance, so quotes,
# backslashes and newlines inside CMD cannot break the local quoting or the
# JSON. (Splicing CMD into  bash -c '$cmd'  breaks on any single quote.)
_ssm_params() {
  local encoded
  # tr strips the line wrapping that GNU base64 adds every 76 characters.
  encoded=$(printf '%s' "$1" | base64 | tr -d '\n')
  printf '{"commands":["bash -c \\"$(echo %s | base64 -d)\\""]}' "$encoded"
}

# _ssm_send PARAMS_JSON TIMEOUT_SECONDS
# Submits the command to $INSTANCE_ID and prints the resulting CommandId.
_ssm_send() {
  aws ssm send-command \
    --instance-ids "$INSTANCE_ID" \
    --document-name "AWS-RunShellScript" \
    --parameters "$1" \
    --timeout-seconds "$2" \
    --query Command.CommandId --output text
}

# ssm_run CMD [TIMEOUT_SECONDS=600]
# Runs CMD on $INSTANCE_ID through SSM Run Command, polls every 5 s until the
# invocation reaches a terminal state, then prints its stdout to stdout and its
# stderr to stderr.
# Returns 0 only when the invocation status is "Success"; returns 1 on
# failure, on a send error, or when polling gives up.
ssm_run() {
  local cmd="$1"
  local timeout="${2:-600}"
  local params cmd_id state
  params=$(_ssm_params "$cmd")

  # `|| true`: with `set -e` a failing command substitution would abort the
  # whole script right here, making the retry branch below unreachable.
  cmd_id=$(_ssm_send "$params" "$timeout" 2>&1) || true
  if [[ "$cmd_id" == *"InvalidInstanceId"* ]]; then
    state=$(aws ec2 describe-instances --instance-ids "$INSTANCE_ID" \
      --query 'Reservations[0].Instances[0].State.Name' --output text 2>/dev/null || echo "unknown")
    echo "ERROR: SSM lost contact with instance (state: $state)" >&2
    if [[ "$state" != "running" ]]; then
      echo "Instance is no longer running — aborting" >&2
      return 1
    fi
    echo "Instance is running but SSM agent unresponsive — waiting 30s and retrying..." >&2
    sleep 30
    cmd_id=$(_ssm_send "$params" "$timeout" 2>&1) || true
  fi

  # A CommandId is a 36-character UUID; anything else is an error message and
  # must not be polled until the timeout expires.
  if [[ ! "$cmd_id" =~ ^[0-9a-fA-F-]{36}$ ]]; then
    echo "ERROR: ssm send-command failed: $cmd_id" >&2
    return 1
  fi

  # Poll slightly longer than the command's own timeout (+12 polls = 60 s) so
  # SSM has time to report TimedOut itself.
  local max_polls=$(( timeout / 5 + 12 ))
  local polls status stderr_text
  for (( polls = 1; polls <= max_polls; polls++ )); do
    status=$(aws ssm get-command-invocation \
      --command-id "$cmd_id" --instance-id "$INSTANCE_ID" \
      --query Status --output text 2>/dev/null || echo "Pending")
    case "$status" in
      Success|Failed|TimedOut|Cancelled)
        aws ssm get-command-invocation \
          --command-id "$cmd_id" --instance-id "$INSTANCE_ID" \
          --query 'StandardOutputContent' --output text 2>/dev/null || true
        stderr_text=$(aws ssm get-command-invocation \
          --command-id "$cmd_id" --instance-id "$INSTANCE_ID" \
          --query 'StandardErrorContent' --output text 2>/dev/null || true)
        if [[ -n "$stderr_text" && "$stderr_text" != "None" ]]; then
          echo "$stderr_text" >&2
        fi
        if [[ "$status" == "Success" ]]; then
          return 0
        fi
        return 1
        ;;
    esac
    # Progress indicator on stderr: a dot per poll, elapsed seconds every 30 s.
    if (( polls % 6 == 0 )); then
      printf " %ds\n" $(( polls * 5 )) >&2
    else
      printf "." >&2
    fi
    sleep 5
  done
  echo "[SSM command timed out]" >&2
  return 1
}

# ssh_run CMD
# Runs CMD on the instance over SSH (fallback transport) and returns the
# remote exit status. BatchMode matches the connection probe and prevents a
# hang on an interactive prompt; ServerAliveInterval keeps long builds from
# being dropped by idle-connection timeouts.
ssh_run() {
  local cmd="$1"
  ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 \
    -o BatchMode=yes -o ServerAliveInterval=30 \
    -i "$KEY_FILE" -p "$SSH_PORT" "ec2-user@$PUBLIC_IP" "$cmd"
}

# remote_run CMD [TIMEOUT_SECONDS]
# Dispatches CMD to whichever transport Step 7 selected. The timeout only
# applies to SSM; the SSH path ignores it.
remote_run() {
  if [[ "$CONN_MODE" == "ssm" ]]; then
    ssm_run "$@"
  else
    ssh_run "$1"
  fi
}

# ── Step 1: Credentials ──────────────────────────────────────────
echo "=== Step 1: Get credentials ==="
ada credentials update --account "$ACCOUNT_ID" --role Admin --provider isengard --once

echo ""
echo "=== Step 2: Verify credentials ==="
CALLER=$(aws sts get-caller-identity --query Account --output text)
if [[ "$CALLER" != "$ACCOUNT_ID" ]]; then
  echo "ERROR: Expected $ACCOUNT_ID, got $CALLER" >&2
  exit 1
fi
echo "Authenticated to account $ACCOUNT_ID"
# ── Step 4: SSH fallback resources (key pair + SG) ────────────────
# Creates a fresh EC2 key pair (private half saved to $KEY_FILE) and makes sure
# the security group $SG_NAME allows inbound TCP $SSH_PORT from this machine's
# current public IP only.
echo ""
echo "=== Step 4: Create SSH fallback resources ==="
aws ec2 delete-key-pair --key-name "$KEY_NAME" 2>/dev/null || true
# Create the key file with a restrictive umask so the private key is never
# readable by other users, not even briefly before a chmod. Removing any stale
# file first also avoids writing through a leftover symlink in /tmp.
rm -f "$KEY_FILE"
(
  umask 077
  aws ec2 create-key-pair --key-name "$KEY_NAME" --query KeyMaterial --output text > "$KEY_FILE"
)
chmod 600 "$KEY_FILE"
echo "Key pair created: $KEY_FILE"
sleep 2  # EC2 key pair eventual consistency

# -f makes curl fail on an HTTP error instead of returning the error page,
# and the regex guards against any other non-address response ending up in
# the ingress rule.
MY_IP=$(curl -sf --connect-timeout 5 https://checkip.amazonaws.com | tr -d '[:space:]')
if [[ ! "$MY_IP" =~ ^[0-9]{1,3}(\.[0-9]{1,3}){3}$ ]]; then
  echo "ERROR: could not determine public IPv4 address (got '$MY_IP')" >&2
  exit 1
fi

# Reuse existing SG or create new one
SG_ID=$(aws ec2 describe-security-groups --filters "Name=group-name,Values=$SG_NAME" \
  --query 'SecurityGroups[0].GroupId' --output text 2>/dev/null || echo "None")
if [[ "$SG_ID" == "None" || -z "$SG_ID" ]]; then
  SG_ID=$(aws ec2 create-security-group --group-name "$SG_NAME" \
    --description "Hyperlight test SSH fallback" --query GroupId --output text)
fi

# Revoke every existing ingress rule (this covers a stale 0.0.0.0/0 rule as
# well as IPs from previous runs), then allow only the current IP.
OLD_RULES=$(aws ec2 describe-security-groups --group-ids "$SG_ID" \
  --query 'SecurityGroups[0].IpPermissions' --output json 2>/dev/null)
if [[ -n "$OLD_RULES" && "$OLD_RULES" != "[]" && "$OLD_RULES" != "null" ]]; then
  aws ec2 revoke-security-group-ingress --group-id "$SG_ID" \
    --ip-permissions "$OLD_RULES" 2>/dev/null || true
fi
aws ec2 authorize-security-group-ingress --group-id "$SG_ID" \
  --protocol tcp --port "$SSH_PORT" --cidr "${MY_IP}/32"
echo "Security group $SG_ID — port $SSH_PORT from ${MY_IP}/32"
ec2 describe-route-tables --filters "Name=vpc-id,Values=$VPC_ID" "Name=association.main,Values=true" \ + --query 'RouteTables[0].RouteTableId' --output text) +for SVC in ssm ssmmessages ec2messages; do + SVC_NAME="com.amazonaws.${REGION}.${SVC}" + EXISTING=$(aws ec2 describe-vpc-endpoints \ + --filters "Name=service-name,Values=$SVC_NAME" "Name=vpc-id,Values=$VPC_ID" "Name=vpc-endpoint-state,Values=available,pending" \ + --query 'VpcEndpoints[0].VpcEndpointId' --output text 2>/dev/null || echo "None") + if [[ "$EXISTING" == "None" || -z "$EXISTING" ]]; then + VPCE=$(aws ec2 create-vpc-endpoint --vpc-id "$VPC_ID" --vpc-endpoint-type Interface \ + --service-name "$SVC_NAME" --subnet-ids "$FIRST_SUBNET" \ + --security-group-ids "$VPCE_SG_ID" --private-dns-enabled \ + --query 'VpcEndpoint.VpcEndpointId' --output text) + echo "Created $SVC endpoint: $VPCE" + CREATED_VPCE_IDS+=("$VPCE") + else + echo "$SVC endpoint exists: $EXISTING" + fi +done +# S3 gateway endpoint (free, no SG needed) +S3_SVC="com.amazonaws.${REGION}.s3" +S3_EXISTING=$(aws ec2 describe-vpc-endpoints \ + --filters "Name=service-name,Values=$S3_SVC" "Name=vpc-id,Values=$VPC_ID" "Name=vpc-endpoint-state,Values=available,pending" \ + --query 'VpcEndpoints[0].VpcEndpointId' --output text 2>/dev/null || echo "None") +if [[ "$S3_EXISTING" == "None" || -z "$S3_EXISTING" ]]; then + S3_VPCE=$(aws ec2 create-vpc-endpoint --vpc-id "$VPC_ID" --vpc-endpoint-type Gateway \ + --service-name "$S3_SVC" --route-table-ids "$MAIN_RTB" \ + --query 'VpcEndpoint.VpcEndpointId' --output text) + echo "Created S3 gateway endpoint: $S3_VPCE" +else + echo "S3 gateway endpoint exists: $S3_EXISTING" +fi + +# Wait for interface endpoints to become available +if [[ ${#CREATED_VPCE_IDS[@]} -gt 0 ]]; then + echo "Waiting for VPC endpoints to become available..." 
# ── Step 5: Resolve AMI ──────────────────────────────────────────
# Prefer a previously baked AMI named $AMI_NAME; otherwise fall back to the
# latest Amazon Linux 2023 image published through SSM public parameters.
echo ""
echo "=== Step 5: Resolve AMI ==="
# Only an AMI in state "available" can be launched; one that is still being
# baked (pending) or that failed must not be selected.
BAKED_AMI=$(aws ec2 describe-images --owners self \
  --filters "Name=name,Values=$AMI_NAME" "Name=state,Values=available" \
  --query 'Images[0].ImageId' --output text 2>/dev/null || echo "None")
if [[ "$BAKED_AMI" != "None" && -n "$BAKED_AMI" ]]; then
  AMI_ID="$BAKED_AMI"
  echo "Using pre-baked AMI: $AMI_ID"
else
  AMI_ID=$(aws ssm get-parameter \
    --name /aws/service/ami-amazon-linux-latest/al2023-ami-kernel-default-x86_64 \
    --query Parameter.Value --output text)
  echo "No baked AMI — using base AL2023: $AMI_ID"
fi

# ── Step 6: Launch instance ───────────────────────────────────────
echo ""
echo "=== Step 6: Launch $INSTANCE_TYPE with nested virtualization ==="

# UserData: reconfigure sshd on $SSH_PORT for SSH fallback
# (SSM agent is pre-installed on AL2023 — it just needs VPC endpoints to phone home)
#
# This is the PLAIN script text. boto3 base64-encodes UserData itself, so
# encoding it here as well would hand the instance a base64 blob instead of a
# "#!" script and the sshd reconfiguration would not run.
USER_DATA=$(cat <<EOF
#!/bin/bash
# Reconfigure sshd to listen on port $SSH_PORT (SSH fallback)
sed -i 's/^#Port 22/Port $SSH_PORT/' /etc/ssh/sshd_config
grep -q '^Port $SSH_PORT\$' /etc/ssh/sshd_config || echo 'Port $SSH_PORT' >> /etc/ssh/sshd_config
systemctl restart sshd
EOF
)

# Values are handed to Python through the environment rather than spliced into
# the source text, so quotes or newlines in them cannot break the program.
INSTANCE_ID=$(
  REGION="$REGION" AMI_ID="$AMI_ID" INSTANCE_TYPE="$INSTANCE_TYPE" \
  KEY_NAME="$KEY_NAME" SSM_PROFILE_NAME="$SSM_PROFILE_NAME" SG_ID="$SG_ID" \
  USER_DATA="$USER_DATA" python3 - <<'PY'
import os

import boto3

env = os.environ
ec2 = boto3.client('ec2', region_name=env['REGION'])
r = ec2.run_instances(
    ImageId=env['AMI_ID'], InstanceType=env['INSTANCE_TYPE'],
    KeyName=env['KEY_NAME'],
    IamInstanceProfile={'Name': env['SSM_PROFILE_NAME']},
    SecurityGroupIds=[env['SG_ID']],
    CpuOptions={'NestedVirtualization': 'enabled'},
    UserData=env['USER_DATA'],
    BlockDeviceMappings=[{'DeviceName': '/dev/xvda', 'Ebs': {'VolumeSize': 30, 'VolumeType': 'gp3'}}],
    TagSpecifications=[{'ResourceType': 'instance', 'Tags': [{'Key': 'Name', 'Value': 'hyperlight-test'}]}],
    MinCount=1, MaxCount=1)
print(r['Instances'][0]['InstanceId'])
PY
)
echo "Instance launched: $INSTANCE_ID"

echo "Waiting for instance to be running..."
aws ec2 wait instance-running --instance-ids "$INSTANCE_ID"
PUBLIC_IP=$(aws ec2 describe-instances --instance-ids "$INSTANCE_ID" \
  --query 'Reservations[0].Instances[0].PublicIpAddress' --output text)
if [[ -z "$PUBLIC_IP" || "$PUBLIC_IP" == "None" ]]; then
  # Not fatal: SSM does not need a public address, only the SSH fallback does.
  echo "WARNING: instance has no public IP — SSH fallback will not be reachable" >&2
else
  echo "Instance running at $PUBLIC_IP"
fi

# ── Cost guard: auto-terminate after MAX_RUNTIME_MIN ──────────────
# Background watchdog: after the time budget it terminates the instance and
# signals the main script ($$ still expands to the parent's PID inside the
# subshell). The EXIT-trap handler kills the watchdog on a normal finish.
echo "⏱ Cost guard: instance will auto-terminate in ${MAX_RUNTIME_MIN}m"
(
  sleep $(( MAX_RUNTIME_MIN * 60 ))
  echo ""
  echo "⚠️ MAX RUNTIME (${MAX_RUNTIME_MIN}m) EXCEEDED — terminating instance $INSTANCE_ID"
  aws ec2 terminate-instances --instance-ids "$INSTANCE_ID" --region "$REGION" 2>/dev/null
  kill $$ 2>/dev/null
) &
WATCHDOG_PID=$!
+ for i in $(seq 1 20); do + if ssh -o StrictHostKeyChecking=no -o ConnectTimeout=5 -o BatchMode=yes \ + -i "$KEY_FILE" -p "$SSH_PORT" "ec2-user@$PUBLIC_IP" true 2>/dev/null; then + CONN_MODE="ssh" + echo "✓ SSH connected" + break + fi + echo " Attempt $i/20..." + sleep 10 + done +fi + +if [[ -z "$CONN_MODE" ]]; then + echo "ERROR: Could not connect via SSM or SSH" + echo " SSM status: $SSM_STATUS" + echo " SSH target: $PUBLIC_IP:$SSH_PORT" + exit 1 +fi +echo "Connection mode: $CONN_MODE" + +# ── Step 8: Verify KVM ─────────────────────────────────────────── +echo "" +echo "=== Step 8: Verify KVM ===" +remote_run "ls -la /dev/kvm 2>&1 || echo KVM_NOT_FOUND" + +# ── Step 9: Install dependencies ───────────────────────────────── +echo "" +echo "=== Step 9: Install dependencies ===" +if [[ -z "$BAKED_AMI" || "$BAKED_AMI" == "None" ]]; then + remote_run "sudo dnf install -q -y clang gcc make git openssl-devel lld 2>&1 | tail -3; echo dnf-done" 900 + + echo "[1/5] Installing Rust toolchain + x86_64-unknown-none target..." + remote_run "set -e; \ +aws s3 cp $S3_TOOLCHAIN/rust-stable.tar.xz /tmp/rust-stable.tar.xz --region $REGION --no-progress && \ +aws s3 cp $S3_TOOLCHAIN/rust-std-none.tar.xz /tmp/rust-std-none.tar.xz --region $REGION --no-progress && \ +tar xJf /tmp/rust-stable.tar.xz -C /tmp && \ +tar xJf /tmp/rust-std-none.tar.xz -C /tmp && \ +cd /tmp/rust-*-x86_64-unknown-linux-gnu && echo y | sudo ./install.sh --prefix=/usr/local > /dev/null 2>&1 && \ +echo rust-toolchain-done && \ +cd /tmp/rust-std-*-x86_64-unknown-none && echo y | sudo ./install.sh --prefix=/usr/local > /dev/null 2>&1 && \ +echo rust-std-none-done" 600 + + echo "[2/5] Installing rust-src + vendor + just + wasm-tools..." 
+ remote_run "set -e; \ +aws s3 cp $S3_TOOLCHAIN/rust-src.tar.xz /tmp/rust-src.tar.xz --region $REGION --no-progress && \ +tar xJf /tmp/rust-src.tar.xz -C /tmp && \ +cd /tmp/rust-src-* && echo y | sudo ./install.sh --prefix=/usr/local > /dev/null 2>&1 && \ +echo rust-src-done && \ +aws s3 cp $S3_TOOLCHAIN/all-vendor.tar.gz /tmp/all-vendor.tar.gz --region $REGION --no-progress && \ +sudo mkdir -p /opt/vendor && sudo tar xzf /tmp/all-vendor.tar.gz -C /opt/vendor && \ +sudo mkdir -p /root/.cargo && \ +echo W3NvdXJjZS5jcmF0ZXMtaW9dCnJlcGxhY2Utd2l0aCA9ICJ2ZW5kb3JlZC1zb3VyY2VzIgoKW3NvdXJjZS52ZW5kb3JlZC1zb3VyY2VzXQpkaXJlY3RvcnkgPSAiL29wdC92ZW5kb3Ii | base64 -d | sudo tee /root/.cargo/config.toml > /dev/null && \ +mkdir -p ~/.cargo && \ +echo W3NvdXJjZS5jcmF0ZXMtaW9dCnJlcGxhY2Utd2l0aCA9ICJ2ZW5kb3JlZC1zb3VyY2VzIgoKW3NvdXJjZS52ZW5kb3JlZC1zb3VyY2VzXQpkaXJlY3RvcnkgPSAiL29wdC92ZW5kb3Ii | base64 -d > ~/.cargo/config.toml && \ +echo all-vendor-done && \ +aws s3 cp $S3_TOOLCHAIN/just.tar.gz /tmp/just.tar.gz --region $REGION --no-progress && \ +cd /tmp && tar xzf just.tar.gz just && sudo mv just /usr/local/bin/ && \ +echo just-done && \ +aws s3 cp $S3_TOOLCHAIN/wasm-tools.tar.gz /tmp/wasm-tools.tar.gz --region $REGION --no-progress && \ +cd /tmp && tar xzf wasm-tools.tar.gz --strip-components=1 '*/wasm-tools' && sudo mv wasm-tools /usr/local/bin/ && \ +echo wasm-tools-done" 600 + + echo "[3/5] Building cargo-hyperlight (this takes ~60s)..." + remote_run "set -e; \ +aws s3 cp $S3_TOOLCHAIN/cargo-hyperlight-vendored.tar.gz /tmp/cargo-hyperlight-vendored.tar.gz --region $REGION --no-progress && \ +mkdir -p /tmp/cargo-hyperlight && cd /tmp/cargo-hyperlight && tar xzf /tmp/cargo-hyperlight-vendored.tar.gz && \ +cargo install --path . --root /usr/local --locked 2>&1 | tail -5 && \ +echo cargo-hyperlight-done" 600 + + echo "[4/5] Verifying tools..." 
# ── Step 10: Clone and build ─────────────────────────────────────
# The repository is not cloned from git: a tarball is pulled from the
# toolchain bucket and unpacked into ~/hyperlight on the instance.
echo ""
echo "=== Step 10: Clone and build ==="
remote_run "set -e; \
aws s3 cp $S3_TOOLCHAIN/hyperlight-repo.tar.gz /tmp/hyperlight-repo.tar.gz --region $REGION --no-progress && \
mkdir -p ~/hyperlight && cd ~/hyperlight && tar xzf /tmp/hyperlight-repo.tar.gz 2>/dev/null && \
echo repo-extracted" 300

# Build output goes to /tmp/build.log on the instance; only the last 20 lines
# are echoed back, and the remote exit status is the status of the build chain.
echo "Building guest binaries (this takes a few minutes)..."
remote_run "cd ~/hyperlight && just build-rust-guests debug > /tmp/build.log 2>&1 && \
mkdir -p src/tests/rust_guests/bin/debug && \
just move-rust-guests debug >> /tmp/build.log 2>&1 && \
mkdir -p src/tests/c_guests/bin/debug && \
just build-c-guests debug >> /tmp/build.log 2>&1 && \
just move-c-guests debug >> /tmp/build.log 2>&1; \
RC=\$?; echo BUILD_TAIL_START; tail -20 /tmp/build.log; echo BUILD_EXIT=\$RC; exit \$RC" 900

# ── Step 11: Run tests ───────────────────────────────────────────
# A failing test run must not stop the script (the second suite and the AMI
# prompt should still happen), but it must be remembered so the script can
# exit non-zero at the end instead of always reporting success.
TESTS_FAILED=0

echo ""
echo "=== Step 11: Run tests ==="
echo "--- hyperlight-host lib tests (map_region) ---"
# pipefail: without it the pipeline's status is that of `tail`, which hides a
# failing `cargo test`.
remote_run "set -o pipefail; cd ~/hyperlight && cargo test --package hyperlight-host --lib map_region_ 2>&1 | tail -30" 600 \
  || TESTS_FAILED=1

echo ""
echo "--- Full hyperlight-host test suite ---"
remote_run "cd ~/hyperlight && cargo test --package hyperlight-host > /tmp/test.log 2>&1; \
RC=\$?; echo TEST_TAIL_START; tail -40 /tmp/test.log; echo TEST_EXIT=\$RC; exit \$RC" 1800 \
  || TESTS_FAILED=1

# ── Done ──────────────────────────────────────────────────────────
echo ""
echo "=== DONE ==="

if [[ -z "$BAKED_AMI" || "$BAKED_AMI" == "None" ]]; then
  echo ""
  # `|| BAKE=""`: read fails on EOF (no terminal attached); treat that as "no"
  # rather than letting `set -e` abort the script.
  read -r -p "Bake AMI from this instance for faster future runs? [y/N] " BAKE || BAKE=""
  if [[ "$BAKE" =~ ^[Yy] ]]; then
    echo "Creating AMI '$AMI_NAME'..."
    NEW_AMI=$(aws ec2 create-image --instance-id "$INSTANCE_ID" \
      --name "$AMI_NAME" --description "Hyperlight test runner - Rust, clang, just" \
      --no-reboot --query ImageId --output text)
    echo "AMI $NEW_AMI being created."
  fi
fi

echo "Instance will be terminated by cleanup trap."

# Report the test outcome through the exit status; the EXIT trap still runs.
if (( TESTS_FAILED )); then
  echo "RESULT: one or more test runs FAILED" >&2
  exit 1
fi
Then: bash /tmp/hyperlight-test-runner.sh [--branch BRANCH] + +ACCOUNT_ID="753102249842" +REGION="us-east-1" +INSTANCE_TYPE="c8i.2xlarge" +FORK_URL="https://github.com/Richard-Durkee/hyperlight.git" +BRANCH="fix/validate-map-region-overlap" +S3_TOOLCHAIN="s3://hyperlight-toolchain-753102249842" +AMI_NAME="hyperlight-test-runner" +SSM_ROLE_NAME="hyperlight-test-ssm-role" +SSM_PROFILE_NAME="hyperlight-test-ssm-profile" +KEY_NAME="hyperlight-test-key" +SG_NAME="hyperlight-test-sg" +SSH_PORT=443 # ISPs often block port 22; 443 is rarely blocked +MAX_RUNTIME_MIN=45 # Auto-terminate instance after this many minutes (~$3 cap) + +while [[ $# -gt 0 ]]; do + case "$1" in + --branch|-b) BRANCH="$2"; shift 2 ;; + --timeout|-t) MAX_RUNTIME_MIN="$2"; shift 2 ;; + *) echo "Usage: $0 [--branch|-b BRANCH] [--timeout|-t MINUTES]"; exit 1 ;; + esac +done + +export AWS_DEFAULT_REGION="$REGION" +unset AWS_PROFILE 2>/dev/null || true + +# Connection mode: set after instance boots +CONN_MODE="" # "ssm" or "ssh" +INSTANCE_ID="" +SG_ID="" +VPCE_SG_ID="" +KEY_FILE="/tmp/${KEY_NAME}.pem" +CREATED_VPCE_IDS=() +WATCHDOG_PID="" + +cleanup() { + echo "" + echo "=== CLEANUP ===" + [[ -n "$WATCHDOG_PID" ]] && kill "$WATCHDOG_PID" 2>/dev/null || true + if [[ -n "$INSTANCE_ID" ]]; then + echo "Terminating instance $INSTANCE_ID..." + aws ec2 terminate-instances --instance-ids "$INSTANCE_ID" 2>/dev/null || true + aws ec2 wait instance-terminated --instance-ids "$INSTANCE_ID" 2>/dev/null || sleep 30 + fi + if [[ -n "$SG_ID" ]]; then + echo "Security group $SG_ID retained for reuse" + fi + aws ec2 delete-key-pair --key-name "$KEY_NAME" 2>/dev/null || true + rm -f "$KEY_FILE" + # Note: VPC endpoints and their SG are NOT cleaned up — they persist for reuse + echo "Cleanup complete." 
+} +trap cleanup EXIT + +# ── Remote execution abstraction ────────────────────────────────── +ssm_run() { + local cmd="$1" + local timeout="${2:-600}" + local cmd_id + cmd_id=$(aws ssm send-command \ + --instance-ids "$INSTANCE_ID" \ + --document-name "AWS-RunShellScript" \ + --parameters "commands=[\"bash -c '$cmd'\"]" \ + --timeout-seconds "$timeout" \ + --query Command.CommandId --output text 2>&1) + if [[ "$cmd_id" == *"InvalidInstanceId"* ]]; then + local state + state=$(aws ec2 describe-instances --instance-ids "$INSTANCE_ID" \ + --query 'Reservations[0].Instances[0].State.Name' --output text 2>/dev/null || echo "unknown") + echo "ERROR: SSM lost contact with instance (state: $state)" >&2 + [[ "$state" != "running" ]] && echo "Instance is no longer running — aborting" >&2 && return 1 + echo "Instance is running but SSM agent unresponsive — waiting 30s and retrying..." >&2 + sleep 30 + cmd_id=$(aws ssm send-command \ + --instance-ids "$INSTANCE_ID" \ + --document-name "AWS-RunShellScript" \ + --parameters "commands=[\"bash -c '$cmd'\"]" \ + --timeout-seconds "$timeout" \ + --query Command.CommandId --output text) + fi + local max_polls=$(( timeout / 5 + 12 )) + local dots=0 + for i in $(seq 1 "$max_polls"); do + local status + status=$(aws ssm get-command-invocation \ + --command-id "$cmd_id" --instance-id "$INSTANCE_ID" \ + --query Status --output text 2>/dev/null || echo "Pending") + case "$status" in + Success|Failed|TimedOut|Cancelled) + aws ssm get-command-invocation \ + --command-id "$cmd_id" --instance-id "$INSTANCE_ID" \ + --query 'StandardOutputContent' --output text 2>/dev/null + local stderr + stderr=$(aws ssm get-command-invocation \ + --command-id "$cmd_id" --instance-id "$INSTANCE_ID" \ + --query 'StandardErrorContent' --output text 2>/dev/null) + [[ -n "$stderr" && "$stderr" != "None" ]] && echo "$stderr" >&2 + [[ "$status" == "Success" ]] && return 0 || return 1 + ;; + esac + dots=$(( dots + 1 )) + [[ $(( dots % 6 )) -eq 0 ]] && printf " 
%ds\n" $(( dots * 5 )) >&2 || printf "." >&2 + sleep 5 + done + echo "[SSM command timed out]" >&2; return 1 +} + +ssh_run() { + local cmd="$1" + ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 \ + -i "$KEY_FILE" -p "$SSH_PORT" "ec2-user@$PUBLIC_IP" "$cmd" +} + +remote_run() { + if [[ "$CONN_MODE" == "ssm" ]]; then + ssm_run "$@" + else + ssh_run "$1" + fi +} + +# ── Step 1: Credentials ────────────────────────────────────────── +echo "=== Step 1: Get credentials ===" +ada credentials update --account "$ACCOUNT_ID" --role Admin --provider isengard --once + +echo "" +echo "=== Step 2: Verify credentials ===" +CALLER=$(aws sts get-caller-identity --query Account --output text) +[[ "$CALLER" != "$ACCOUNT_ID" ]] && echo "ERROR: Expected $ACCOUNT_ID, got $CALLER" && exit 1 +echo "Authenticated to account $ACCOUNT_ID" + +# ── Step 3: IAM role for SSM ───────────────────────────────────── +echo "" +echo "=== Step 3: Ensure SSM IAM role ===" +if ! aws iam get-role --role-name "$SSM_ROLE_NAME" &>/dev/null; then + echo "Creating IAM role $SSM_ROLE_NAME..." 
+ aws iam create-role --role-name "$SSM_ROLE_NAME" \ + --assume-role-policy-document '{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Principal":{"Service":"ec2.amazonaws.com"},"Action":"sts:AssumeRole"}]}' + aws iam attach-role-policy --role-name "$SSM_ROLE_NAME" \ + --policy-arn arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore + sleep 5 +fi +aws iam create-instance-profile --instance-profile-name "$SSM_PROFILE_NAME" 2>/dev/null || true +aws iam add-role-to-instance-profile --instance-profile-name "$SSM_PROFILE_NAME" --role-name "$SSM_ROLE_NAME" 2>/dev/null || true +# Ensure S3 read access for toolchain bucket +aws iam put-role-policy --role-name "$SSM_ROLE_NAME" --policy-name s3-toolchain-read \ + --policy-document "{\"Version\":\"2012-10-17\",\"Statement\":[{\"Effect\":\"Allow\",\"Action\":[\"s3:GetObject\",\"s3:ListBucket\"],\"Resource\":[\"arn:aws:s3:::hyperlight-toolchain-$ACCOUNT_ID\",\"arn:aws:s3:::hyperlight-toolchain-$ACCOUNT_ID/*\"]}]}" +echo "SSM role ready" + +# ── Step 4: SSH fallback resources (key pair + SG) ──────────────── +echo "" +echo "=== Step 4: Create SSH fallback resources ===" +aws ec2 delete-key-pair --key-name "$KEY_NAME" 2>/dev/null || true +aws ec2 create-key-pair --key-name "$KEY_NAME" --query KeyMaterial --output text > "$KEY_FILE" +chmod 600 "$KEY_FILE" +echo "Key pair created: $KEY_FILE" +sleep 2 # EC2 key pair eventual consistency + +MY_IP=$(curl -s --connect-timeout 5 https://checkip.amazonaws.com) +# Reuse existing SG or create new one +SG_ID=$(aws ec2 describe-security-groups --filters "Name=group-name,Values=$SG_NAME" \ + --query 'SecurityGroups[0].GroupId' --output text 2>/dev/null || echo "None") +if [[ "$SG_ID" == "None" || -z "$SG_ID" ]]; then + SG_ID=$(aws ec2 create-security-group --group-name "$SG_NAME" \ + --description "Hyperlight test SSH fallback" --query GroupId --output text) +fi +# Revoke old ingress rules, then add current IP +aws ec2 revoke-security-group-ingress --group-id "$SG_ID" \ + --protocol 
tcp --port "$SSH_PORT" --cidr "0.0.0.0/0" 2>/dev/null || true +OLD_RULES=$(aws ec2 describe-security-groups --group-ids "$SG_ID" \ + --query 'SecurityGroups[0].IpPermissions' --output json 2>/dev/null) +[[ "$OLD_RULES" != "[]" && "$OLD_RULES" != "null" ]] && \ + aws ec2 revoke-security-group-ingress --group-id "$SG_ID" --ip-permissions "$OLD_RULES" 2>/dev/null || true +aws ec2 authorize-security-group-ingress --group-id "$SG_ID" \ + --protocol tcp --port "$SSH_PORT" --cidr "${MY_IP}/32" +echo "Security group $SG_ID — port $SSH_PORT from ${MY_IP}/32" + +# ── Step 4b: Ensure VPC endpoints (Isengard SCPs block EC2 internet) ── +echo "" +echo "=== Step 4b: Ensure VPC endpoints for SSM + S3 ===" +VPC_ID=$(aws ec2 describe-vpcs --filters "Name=isDefault,Values=true" \ + --query 'Vpcs[0].VpcId' --output text) +SUBNET_IDS=$(aws ec2 describe-subnets --filters "Name=vpc-id,Values=$VPC_ID" \ + --query 'Subnets[*].SubnetId' --output text | tr '\t' ',') +# Pick one subnet for interface endpoints (cheaper — one AZ is fine for testing) +FIRST_SUBNET=$(echo "$SUBNET_IDS" | cut -d',' -f1) + +# SG for VPC endpoints — allow HTTPS from within VPC +VPCE_SG_NAME="hyperlight-vpce-sg" +VPCE_SG_ID=$(aws ec2 describe-security-groups --filters "Name=group-name,Values=$VPCE_SG_NAME" "Name=vpc-id,Values=$VPC_ID" \ + --query 'SecurityGroups[0].GroupId' --output text 2>/dev/null || echo "None") +if [[ "$VPCE_SG_ID" == "None" || -z "$VPCE_SG_ID" ]]; then + VPCE_SG_ID=$(aws ec2 create-security-group --group-name "$VPCE_SG_NAME" \ + --description "HTTPS for VPC endpoints" --vpc-id "$VPC_ID" --query GroupId --output text) + VPC_CIDR=$(aws ec2 describe-vpcs --vpc-ids "$VPC_ID" --query 'Vpcs[0].CidrBlock' --output text) + aws ec2 authorize-security-group-ingress --group-id "$VPCE_SG_ID" \ + --protocol tcp --port 443 --cidr "$VPC_CIDR" + echo "Created VPC endpoint SG: $VPCE_SG_ID" +else + echo "VPC endpoint SG exists: $VPCE_SG_ID" +fi + +# SSM requires 3 interface endpoints + S3 gateway +MAIN_RTB=$(aws 
ec2 describe-route-tables --filters "Name=vpc-id,Values=$VPC_ID" "Name=association.main,Values=true" \ + --query 'RouteTables[0].RouteTableId' --output text) +for SVC in ssm ssmmessages ec2messages; do + SVC_NAME="com.amazonaws.${REGION}.${SVC}" + EXISTING=$(aws ec2 describe-vpc-endpoints \ + --filters "Name=service-name,Values=$SVC_NAME" "Name=vpc-id,Values=$VPC_ID" "Name=vpc-endpoint-state,Values=available,pending" \ + --query 'VpcEndpoints[0].VpcEndpointId' --output text 2>/dev/null || echo "None") + if [[ "$EXISTING" == "None" || -z "$EXISTING" ]]; then + VPCE=$(aws ec2 create-vpc-endpoint --vpc-id "$VPC_ID" --vpc-endpoint-type Interface \ + --service-name "$SVC_NAME" --subnet-ids "$FIRST_SUBNET" \ + --security-group-ids "$VPCE_SG_ID" --private-dns-enabled \ + --query 'VpcEndpoint.VpcEndpointId' --output text) + echo "Created $SVC endpoint: $VPCE" + CREATED_VPCE_IDS+=("$VPCE") + else + echo "$SVC endpoint exists: $EXISTING" + fi +done +# S3 gateway endpoint (free, no SG needed) +S3_SVC="com.amazonaws.${REGION}.s3" +S3_EXISTING=$(aws ec2 describe-vpc-endpoints \ + --filters "Name=service-name,Values=$S3_SVC" "Name=vpc-id,Values=$VPC_ID" "Name=vpc-endpoint-state,Values=available,pending" \ + --query 'VpcEndpoints[0].VpcEndpointId' --output text 2>/dev/null || echo "None") +if [[ "$S3_EXISTING" == "None" || -z "$S3_EXISTING" ]]; then + S3_VPCE=$(aws ec2 create-vpc-endpoint --vpc-id "$VPC_ID" --vpc-endpoint-type Gateway \ + --service-name "$S3_SVC" --route-table-ids "$MAIN_RTB" \ + --query 'VpcEndpoint.VpcEndpointId' --output text) + echo "Created S3 gateway endpoint: $S3_VPCE" +else + echo "S3 gateway endpoint exists: $S3_EXISTING" +fi + +# Wait for interface endpoints to become available +if [[ ${#CREATED_VPCE_IDS[@]} -gt 0 ]]; then + echo "Waiting for VPC endpoints to become available..." 
+ for VPCE in "${CREATED_VPCE_IDS[@]}"; do + for i in $(seq 1 30); do + STATE=$(aws ec2 describe-vpc-endpoints --vpc-endpoint-ids "$VPCE" \ + --query 'VpcEndpoints[0].State' --output text 2>/dev/null) + [[ "$STATE" == "available" ]] && break + sleep 5 + done + echo " $VPCE: $STATE" + done +fi + +# ── Step 5: Resolve AMI ────────────────────────────────────────── +echo "" +echo "=== Step 5: Resolve AMI ===" +BAKED_AMI=$(aws ec2 describe-images --owners self --filters "Name=name,Values=$AMI_NAME" \ + --query 'Images[0].ImageId' --output text 2>/dev/null || echo "None") +if [[ "$BAKED_AMI" != "None" && -n "$BAKED_AMI" ]]; then + AMI_ID="$BAKED_AMI" + echo "Using pre-baked AMI: $AMI_ID" +else + AMI_ID=$(aws ssm get-parameter \ + --name /aws/service/ami-amazon-linux-latest/al2023-ami-kernel-default-x86_64 \ + --query Parameter.Value --output text) + echo "No baked AMI — using base AL2023: $AMI_ID" +fi + +# ── Step 6: Launch instance ─────────────────────────────────────── +echo "" +echo "=== Step 6: Launch $INSTANCE_TYPE with nested virtualization ===" + +# UserData: reconfigure sshd on $SSH_PORT for SSH fallback +# (SSM agent is pre-installed on AL2023 — it just needs VPC endpoints to phone home) +USER_DATA=$(python3 -c " +import base64 +script = '''#!/bin/bash +# Reconfigure sshd to listen on port $SSH_PORT (SSH fallback) +sed -i 's/^#Port 22/Port $SSH_PORT/' /etc/ssh/sshd_config +echo 'Port $SSH_PORT' >> /etc/ssh/sshd_config +systemctl restart sshd +''' +print(base64.b64encode(script.encode()).decode()) +") + +INSTANCE_ID=$(python3 -c " +import boto3 +ec2 = boto3.client('ec2', region_name='$REGION') +r = ec2.run_instances( + ImageId='$AMI_ID', InstanceType='$INSTANCE_TYPE', + KeyName='$KEY_NAME', + IamInstanceProfile={'Name': '$SSM_PROFILE_NAME'}, + SecurityGroupIds=['$SG_ID'], + CpuOptions={'NestedVirtualization': 'enabled'}, + UserData='$USER_DATA', + BlockDeviceMappings=[{'DeviceName': '/dev/xvda', 'Ebs': {'VolumeSize': 30, 'VolumeType': 'gp3'}}], + 
TagSpecifications=[{'ResourceType': 'instance', 'Tags': [{'Key': 'Name', 'Value': 'hyperlight-test'}]}], + MinCount=1, MaxCount=1) +print(r['Instances'][0]['InstanceId']) +") +echo "Instance launched: $INSTANCE_ID" + +echo "Waiting for instance to be running..." +aws ec2 wait instance-running --instance-ids "$INSTANCE_ID" +PUBLIC_IP=$(aws ec2 describe-instances --instance-ids "$INSTANCE_ID" \ + --query 'Reservations[0].Instances[0].PublicIpAddress' --output text) +echo "Instance running at $PUBLIC_IP" + +# ── Cost guard: auto-terminate after MAX_RUNTIME_MIN ────────────── +echo "⏱ Cost guard: instance will auto-terminate in ${MAX_RUNTIME_MIN}m" +( + sleep $(( MAX_RUNTIME_MIN * 60 )) + echo "" + echo "⚠️ MAX RUNTIME (${MAX_RUNTIME_MIN}m) EXCEEDED — terminating instance $INSTANCE_ID" + aws ec2 terminate-instances --instance-ids "$INSTANCE_ID" --region "$REGION" 2>/dev/null + kill $$ 2>/dev/null +) & +WATCHDOG_PID=$! + +# ── Step 7: Establish connection (SSM primary, SSH fallback) ────── +echo "" +echo "=== Step 7: Establish connection ===" + +# Try SSM first (180 seconds — includes time for dnf install of SSM agent) +if command -v session-manager-plugin &>/dev/null; then + echo "Waiting for SSM agent to register..." + for i in $(seq 1 18); do + SSM_STATUS=$(aws ssm describe-instance-information \ + --filters "Key=InstanceIds,Values=$INSTANCE_ID" \ + --query 'InstanceInformationList[0].PingStatus' --output text 2>/dev/null || echo "None") + if [[ "$SSM_STATUS" == "Online" ]]; then + CONN_MODE="ssm" + echo "✓ SSM agent online — using SSM" + break + fi + echo " Attempt $i/18 (status: $SSM_STATUS)..." + sleep 10 + done +fi + +# Fall back to SSH if SSM didn't work +if [[ -z "$CONN_MODE" ]]; then + echo "SSM not available — falling back to SSH on port $SSH_PORT" + echo "Waiting for SSH to become available..." 
+ for i in $(seq 1 20); do + if ssh -o StrictHostKeyChecking=no -o ConnectTimeout=5 -o BatchMode=yes \ + -i "$KEY_FILE" -p "$SSH_PORT" "ec2-user@$PUBLIC_IP" true 2>/dev/null; then + CONN_MODE="ssh" + echo "✓ SSH connected" + break + fi + echo " Attempt $i/20..." + sleep 10 + done +fi + +if [[ -z "$CONN_MODE" ]]; then + echo "ERROR: Could not connect via SSM or SSH" + echo " SSM status: $SSM_STATUS" + echo " SSH target: $PUBLIC_IP:$SSH_PORT" + exit 1 +fi +echo "Connection mode: $CONN_MODE" + +# ── Step 8: Verify KVM ─────────────────────────────────────────── +echo "" +echo "=== Step 8: Verify KVM ===" +remote_run "ls -la /dev/kvm 2>&1 || echo KVM_NOT_FOUND" + +# ── Step 9: Install dependencies ───────────────────────────────── +echo "" +echo "=== Step 9: Install dependencies ===" +if [[ -z "$BAKED_AMI" || "$BAKED_AMI" == "None" ]]; then + remote_run "sudo dnf install -q -y clang gcc make git openssl-devel 2>&1 | tail -3; echo dnf-done" 900 + + echo "[1/5] Installing Rust toolchain + x86_64-unknown-none target..." + remote_run "set -e; \ +aws s3 cp $S3_TOOLCHAIN/rust-stable.tar.xz /tmp/rust-stable.tar.xz --region $REGION --no-progress && \ +aws s3 cp $S3_TOOLCHAIN/rust-std-none.tar.xz /tmp/rust-std-none.tar.xz --region $REGION --no-progress && \ +tar xJf /tmp/rust-stable.tar.xz -C /tmp && \ +tar xJf /tmp/rust-std-none.tar.xz -C /tmp && \ +cd /tmp/rust-*-x86_64-unknown-linux-gnu && echo y | sudo ./install.sh --prefix=/usr/local > /dev/null 2>&1 && \ +echo rust-toolchain-done && \ +cd /tmp/rust-std-*-x86_64-unknown-none && echo y | sudo ./install.sh --prefix=/usr/local > /dev/null 2>&1 && \ +echo rust-std-none-done" 600 + + echo "[2/5] Installing rust-src + vendor + just + wasm-tools..." 
+ remote_run "set -e; \ +aws s3 cp $S3_TOOLCHAIN/rust-src.tar.xz /tmp/rust-src.tar.xz --region $REGION --no-progress && \ +tar xJf /tmp/rust-src.tar.xz -C /tmp && \ +cd /tmp/rust-src-* && echo y | sudo ./install.sh --prefix=/usr/local > /dev/null 2>&1 && \ +echo rust-src-done && \ +aws s3 cp $S3_TOOLCHAIN/all-vendor.tar.gz /tmp/all-vendor.tar.gz --region $REGION --no-progress && \ +sudo mkdir -p /opt/vendor && sudo tar xzf /tmp/all-vendor.tar.gz -C /opt/vendor && \ +sudo mkdir -p /root/.cargo && \ +echo W3NvdXJjZS5jcmF0ZXMtaW9dCnJlcGxhY2Utd2l0aCA9ICJ2ZW5kb3JlZC1zb3VyY2VzIgoKW3NvdXJjZS52ZW5kb3JlZC1zb3VyY2VzXQpkaXJlY3RvcnkgPSAiL29wdC92ZW5kb3Ii | base64 -d | sudo tee /root/.cargo/config.toml > /dev/null && \ +mkdir -p ~/.cargo && \ +echo W3NvdXJjZS5jcmF0ZXMtaW9dCnJlcGxhY2Utd2l0aCA9ICJ2ZW5kb3JlZC1zb3VyY2VzIgoKW3NvdXJjZS52ZW5kb3JlZC1zb3VyY2VzXQpkaXJlY3RvcnkgPSAiL29wdC92ZW5kb3Ii | base64 -d > ~/.cargo/config.toml && \ +echo all-vendor-done && \ +aws s3 cp $S3_TOOLCHAIN/just.tar.gz /tmp/just.tar.gz --region $REGION --no-progress && \ +cd /tmp && tar xzf just.tar.gz just && sudo mv just /usr/local/bin/ && \ +echo just-done && \ +aws s3 cp $S3_TOOLCHAIN/wasm-tools.tar.gz /tmp/wasm-tools.tar.gz --region $REGION --no-progress && \ +cd /tmp && tar xzf wasm-tools.tar.gz --strip-components=1 '*/wasm-tools' && sudo mv wasm-tools /usr/local/bin/ && \ +echo wasm-tools-done" 600 + + echo "[3/5] Building cargo-hyperlight (this takes ~60s)..." + remote_run "set -e; \ +aws s3 cp $S3_TOOLCHAIN/cargo-hyperlight-vendored.tar.gz /tmp/cargo-hyperlight-vendored.tar.gz --region $REGION --no-progress && \ +mkdir -p /tmp/cargo-hyperlight && cd /tmp/cargo-hyperlight && tar xzf /tmp/cargo-hyperlight-vendored.tar.gz && \ +cargo install --path . --root /usr/local --locked 2>&1 | tail -5 && \ +echo cargo-hyperlight-done" 600 + + echo "[4/5] Verifying tools..." 
+ remote_run "rustc --version && cargo --version && just --version && wasm-tools --version && cargo-hyperlight --version" +else + echo "Baked AMI — skipping install" +fi + +# ── Step 10: Clone and build ───────────────────────────────────── +echo "" +echo "=== Step 10: Clone and build ===" +remote_run "set -e; \ +aws s3 cp $S3_TOOLCHAIN/hyperlight-repo.tar.gz /tmp/hyperlight-repo.tar.gz --region $REGION --no-progress && \ +mkdir -p ~/hyperlight && cd ~/hyperlight && tar xzf /tmp/hyperlight-repo.tar.gz 2>/dev/null && \ +echo repo-extracted" 300 + +echo "Building guest binaries (this takes a few minutes)..." +remote_run "cd ~/hyperlight && just build-rust-guests debug > /tmp/build.log 2>&1 && \ +mkdir -p src/tests/rust_guests/bin/debug && \ +just move-rust-guests debug >> /tmp/build.log 2>&1; \ +RC=\$?; echo BUILD_TAIL_START; tail -20 /tmp/build.log; echo BUILD_EXIT=\$RC; exit \$RC" 900 + +# ── Step 11: Run tests ─────────────────────────────────────────── +echo "" +echo "=== Step 11: Run tests ===" +echo "--- hyperlight-host lib tests (map_region) ---" +remote_run "cd ~/hyperlight && cargo test --package hyperlight-host --lib map_region_ 2>&1 | tail -30" 600 || true + +echo "" +echo "--- Full hyperlight-host test suite ---" +remote_run "cd ~/hyperlight && cargo test --package hyperlight-host > /tmp/test.log 2>&1; \ +RC=\$?; echo TEST_TAIL_START; tail -40 /tmp/test.log; echo TEST_EXIT=\$RC; exit \$RC" 900 || true + +# ── Done ────────────────────────────────────────────────────────── +echo "" +echo "=== DONE ===" + +if [[ -z "$BAKED_AMI" || "$BAKED_AMI" == "None" ]]; then + echo "" + read -p "Bake AMI from this instance for faster future runs? [y/N] " BAKE + if [[ "$BAKE" =~ ^[Yy] ]]; then + echo "Creating AMI '$AMI_NAME'..." + NEW_AMI=$(aws ec2 create-image --instance-id "$INSTANCE_ID" \ + --name "$AMI_NAME" --description "Hyperlight test runner - Rust, clang, just" \ + --no-reboot --query ImageId --output text) + echo "AMI $NEW_AMI being created." 
+    fi
+fi
+
+echo "Instance will be terminated by cleanup trap."
diff --git a/scripts/vendor-all.sh b/scripts/vendor-all.sh
new file mode 100755
index 000000000..00e7ef7bf
--- /dev/null
+++ b/scripts/vendor-all.sh
@@ -0,0 +1,194 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# vendor-all.sh — Create a complete vendor directory for offline Hyperlight builds
+#
+# Hyperlight has multiple independent Cargo.lock files (workspace root + guest
+# crates), and the sysroot build (via cargo-hyperlight) needs stdlib crates at
+# versions pinned by the Rust toolchain's own lockfile. A naive `cargo vendor`
+# only covers the workspace root.
+#
+# This script produces a single vendor directory that satisfies:
+#   1. The workspace root Cargo.lock
+#   2. All guest crate Cargo.lock files (simpleguest, dummyguest, witguest)
+#   3. The Rust stdlib sysroot Cargo.lock (for -Zbuild-std via cargo-hyperlight)
+#
+# Usage:
+#   ./scripts/vendor-all.sh                       # vendor to ./vendor-all/
+#   ./scripts/vendor-all.sh /path/to/output       # vendor to custom path
+#   ./scripts/vendor-all.sh --upload s3://bucket  # vendor + upload as tarball
+#
+# The output directory can be used with:
+#   [source.crates-io]
+#   replace-with = "vendored-sources"
+#   [source.vendored-sources]
+#   directory = "/path/to/vendor-all"
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+OUTPUT_DIR="${1:-$REPO_ROOT/vendor-all}"
+UPLOAD_BUCKET=""
+
+if [[ "${1:-}" == "--upload" ]]; then
+    # Fail with a usage hint rather than a bare "unbound variable" (set -u).
+    UPLOAD_BUCKET="${2:?usage: vendor-all.sh --upload s3://bucket}"
+    OUTPUT_DIR=$(mktemp -d)
+    shift 2
+fi
+
+echo "Hyperlight vendor-all"
+echo " Repo: $REPO_ROOT"
+echo " Output: $OUTPUT_DIR"
+echo ""
+
+# ── Step 1: Vendor workspace root ─────────────────────────────────
+echo "[1/3] Vendoring workspace root..."
+cd "$REPO_ROOT"
+cargo vendor "$OUTPUT_DIR" 2>&1 | tail -3
+ROOT_COUNT=$(ls "$OUTPUT_DIR" | wc -l | tr -d ' ')
+echo " $ROOT_COUNT crates from workspace"
+
+# ── Step 2: Add guest crate dependencies ──────────────────────────
+echo "[2/3] Adding guest crate dependencies..."
+
+# add_missing_from_lockfile LOCKFILE LABEL
+#
+# Adds to $OUTPUT_DIR every registry package pinned in LOCKFILE that is not
+# already vendored there. Each download is checked against the lockfile's
+# sha256 before it is unpacked; failures are reported as warnings on stderr
+# and do not abort the run. LABEL is accepted for call-site readability and
+# is currently ignored.
+add_missing_from_lockfile() {
+    local lockfile="$1"
+
+    python3 - "$lockfile" "$OUTPUT_DIR" << 'PYEOF'
+import hashlib, io, json, os, re, shutil, sys, tarfile, tempfile, urllib.request
+
+lockfile, vendor_dir = sys.argv[1], sys.argv[2]
+
+def field(text, key):
+    """Return the value of a `key = "..."` line in a TOML fragment, or None."""
+    m = re.search(rf'^\s*{key}\s*=\s*"([^"]+)"', text, re.M)
+    return m.group(1) if m else None
+
+with open(lockfile) as f:
+    content = f.read()
+
+# Parse packages into (name, version, source, checksum) tuples
+packages = []
+for block in content.split("[[package]]")[1:]:
+    name, ver = field(block, "name"), field(block, "version")
+    if name and ver and ver != '0.0.0':
+        packages.append((name, ver, field(block, "source"), field(block, "checksum")))
+
+def vendored_versions(root):
+    """Index the vendor directory once as a set of (name, version) pairs.
+
+    Only the [package] table of each Cargo.toml is consulted. Manifests also
+    contain `version = "..."` lines in dependency tables, and matching those
+    would report a crate as vendored when it is not.
+    """
+    found = set()
+    for entry in os.listdir(root):
+        toml = os.path.join(root, entry, "Cargo.toml")
+        if not os.path.isfile(toml):
+            continue
+        with open(toml, encoding="utf-8", errors="replace") as f:
+            manifest = f.read()
+        table = re.search(r'^\[(?:package|project)\][^\n]*\n(.*?)(?=^\[|\Z)', manifest, re.M | re.S)
+        if table:
+            name, ver = field(table.group(1), "name"), field(table.group(1), "version")
+            if name and ver:
+                found.add((name, ver))
+    return found
+
+present = vendored_versions(vendor_dir)
+added = 0
+for name, ver, source, cksum in packages:
+    if (name, ver) in present:
+        continue
+    # Skip internal rustc crates (not on crates.io)
+    if name.startswith('rustc-std-workspace'):
+        continue
+    # Path and git packages have no registry source; fetching a same-named
+    # crate from crates.io for them would vendor unrelated code.
+    if source is None or not source.startswith(("registry+", "sparse+")):
+        continue
+
+    # Bare name for the first version, name-version for any further one.
+    target_name = name
+    if os.path.exists(os.path.join(vendor_dir, name)):
+        target_name = f"{name}-{ver}"
+    dst = os.path.join(vendor_dir, target_name)
+
+    # Download from crates.io
+    url = f"https://crates.io/api/v1/crates/{name}/{ver}/download"
+    try:
+        if os.path.exists(dst):
+            raise RuntimeError(f"{dst} already exists")
+        req = urllib.request.Request(url, headers={'User-Agent': 'hyperlight-vendor/1.0'})
+        with urllib.request.urlopen(req, timeout=60) as resp:
+            data = resp.read()
+
+        # The lockfile checksum is the sha256 of the downloaded .crate archive.
+        if cksum is not None and hashlib.sha256(data).hexdigest() != cksum:
+            raise RuntimeError("sha256 does not match the Cargo.lock checksum")
+
+        # TemporaryDirectory is removed even when extraction or copy fails.
+        with tempfile.TemporaryDirectory() as extract_dir:
+            with tarfile.open(fileobj=io.BytesIO(data), mode='r:gz') as tf:
+                if hasattr(tarfile, 'data_filter'):
+                    tf.extractall(extract_dir, filter='data')
+                else:
+                    tf.extractall(extract_dir)
+            src = os.path.join(extract_dir, f"{name}-{ver}")
+            if not os.path.isdir(src):
+                raise RuntimeError(f"archive has no {name}-{ver}/ directory")
+            shutil.copytree(src, dst)
+
+        with open(os.path.join(dst, ".cargo-checksum.json"), 'w') as f:
+            json.dump({"files": {}, "package": cksum}, f)
+        present.add((name, ver))
+        added += 1
+    except Exception as e:
+        print(f" WARN: failed to download {name} {ver}: {e}", file=sys.stderr)
+
+print(f" {added} crates added from {sys.argv[1].split('/')[-2]}/{sys.argv[1].split('/')[-1]}")
+PYEOF
+}
+
+# Guest crates
+for guest_lock in "$REPO_ROOT"/src/tests/rust_guests/*/Cargo.lock; do
+    if [[ -f "$guest_lock" ]]; then
+        add_missing_from_lockfile "$guest_lock" "$(basename "$(dirname "$guest_lock")")"
+    fi
+done
+
+# ── Step 3: Add stdlib sysroot dependencies ───────────────────────
+echo "[3/3] Adding stdlib sysroot dependencies..."
+ +# Find the active toolchain's stdlib lockfile +RUSTC_SYSROOT=$(rustc --print sysroot) +STDLIB_LOCK="$RUSTC_SYSROOT/lib/rustlib/src/rust/library/Cargo.lock" + +if [[ -f "$STDLIB_LOCK" ]]; then + add_missing_from_lockfile "$STDLIB_LOCK" "stdlib" +else + echo " WARN: stdlib Cargo.lock not found at $STDLIB_LOCK" + echo " Install rust-src: rustup component add rust-src" +fi + +# ── Summary ─────────────────────────────────────────────────────── +FINAL_COUNT=$(ls "$OUTPUT_DIR" | wc -l | tr -d ' ') +echo "" +echo "Done: $FINAL_COUNT total crates in $OUTPUT_DIR" + +# ── Upload (optional) ───────────────────────────────────────────── +if [[ -n "$UPLOAD_BUCKET" ]]; then + echo "" + echo "Packing and uploading to $UPLOAD_BUCKET..." + COPYFILE_DISABLE=1 tar czf /tmp/all-vendor.tar.gz -C "$OUTPUT_DIR" . + aws s3 cp /tmp/all-vendor.tar.gz "$UPLOAD_BUCKET/all-vendor.tar.gz" + echo "Uploaded: $UPLOAD_BUCKET/all-vendor.tar.gz ($(du -h /tmp/all-vendor.tar.gz | cut -f1))" + rm -rf "$OUTPUT_DIR" /tmp/all-vendor.tar.gz +fi diff --git a/src/hyperlight_host/src/hypervisor/hyperlight_vm/mod.rs b/src/hyperlight_host/src/hypervisor/hyperlight_vm/mod.rs index 830b856c0..53d944de2 100644 --- a/src/hyperlight_host/src/hypervisor/hyperlight_vm/mod.rs +++ b/src/hyperlight_host/src/hypervisor/hyperlight_vm/mod.rs @@ -243,6 +243,15 @@ pub enum MapRegionError { MapMemory(#[from] MapMemoryError), #[error("Region is not page-aligned (page size: {0:#x})")] NotPageAligned(usize), + #[error( + "Region [{new_start:#x}..{new_end:#x}) overlaps existing region [{existing_start:#x}..{existing_end:#x})" + )] + Overlapping { + new_start: usize, + new_end: usize, + existing_start: usize, + existing_end: usize, + }, } /// Errors that can occur when unmapping a memory region @@ -420,6 +429,53 @@ impl HyperlightVm { return Err(MapRegionError::NotPageAligned(self.page_size)); } + let new_start = region.guest_region.start; + let new_end = region.guest_region.end; + + // Check against existing dynamically mapped 
regions + for (_, existing) in &self.mmap_regions { + if new_start < existing.guest_region.end && new_end > existing.guest_region.start { + return Err(MapRegionError::Overlapping { + new_start, + new_end, + existing_start: existing.guest_region.start, + existing_end: existing.guest_region.end, + }); + } + } + + // Check against the snapshot region + if let Some(ref snapshot) = self.snapshot_memory { + let snap_start = crate::mem::layout::SandboxMemoryLayout::BASE_ADDRESS; + #[cfg(not(unshared_snapshot_mem))] + let snap_end = snap_start + snapshot.guest_mapped_size(); + #[cfg(unshared_snapshot_mem)] + let snap_end = snap_start + snapshot.mem_size(); + if new_start < snap_end && new_end > snap_start { + return Err(MapRegionError::Overlapping { + new_start, + new_end, + existing_start: snap_start, + existing_end: snap_end, + }); + } + } + + // Check against the scratch region + if let Some(ref scratch) = self.scratch_memory { + let scratch_start = + hyperlight_common::layout::scratch_base_gpa(scratch.mem_size()) as usize; + let scratch_end = scratch_start + scratch.mem_size(); + if new_start < scratch_end && new_end > scratch_start { + return Err(MapRegionError::Overlapping { + new_start, + new_end, + existing_start: scratch_start, + existing_end: scratch_end, + }); + } + } + // Try to reuse a freed slot first, otherwise use next_slot let slot = if let Some(freed_slot) = self.freed_slots.pop() { freed_slot diff --git a/src/hyperlight_host/src/sandbox/initialized_multi_use.rs b/src/hyperlight_host/src/sandbox/initialized_multi_use.rs index 241622cab..75bab0fab 100644 --- a/src/hyperlight_host/src/sandbox/initialized_multi_use.rs +++ b/src/hyperlight_host/src/sandbox/initialized_multi_use.rs @@ -568,9 +568,10 @@ impl MultiUseSandbox { // writes can be rolled back when necessary. 
log_then_return!("TODO: Writable mappings not yet supported"); } - // Reset snapshot since we are mutating the sandbox state - self.snapshot = None; + + // Map first so overlaps are rejected before resetting the snapshot unsafe { self.vm.map_region(rgn) }.map_err(HyperlightVmError::MapRegion)?; + self.snapshot = None; self.mem_mgr.mapped_rgns += 1; Ok(()) } @@ -642,25 +643,12 @@ impl MultiUseSandbox { // Phase 2: VM-side work (map into guest address space) let region = prepared.to_memory_region()?; - // Check for overlaps with existing file mappings in the VM. - for existing_region in self.vm.get_mapped_regions() { - let ex_start = existing_region.guest_region.start as u64; - let ex_end = existing_region.guest_region.end as u64; - if guest_base < ex_end && mapping_end > ex_start { - return Err(crate::HyperlightError::Error(format!( - "map_file_cow: mapping [{:#x}..{:#x}) overlaps existing mapping [{:#x}..{:#x})", - guest_base, mapping_end, ex_start, ex_end, - ))); - } - } - - // Reset snapshot since we are mutating the sandbox state - self.snapshot = None; - unsafe { self.vm.map_region(&region) } .map_err(HyperlightVmError::MapRegion) .map_err(crate::HyperlightError::HyperlightVmError)?; + self.snapshot = None; + let size = prepared.size as u64; // Mark consumed immediately after map_region succeeds.
@@ -2588,4 +2576,125 @@ mod tests {
         }
         let _ = std::fs::remove_file(&path);
     }
+
+    /// Mapping a second region over exactly the same guest range must fail.
+    #[test]
+    fn map_region_rejects_overlapping_regions() {
+        let mut sbox: MultiUseSandbox = {
+            let path = simple_guest_as_string().unwrap();
+            let u_sbox = UninitializedSandbox::new(GuestBinary::FilePath(path), None).unwrap();
+            u_sbox.evolve().unwrap()
+        };
+
+        let mem1 = allocate_guest_memory();
+        let mem2 = allocate_guest_memory();
+        let guest_base: usize = 0x200000000;
+        let region1 = region_for_memory(&mem1, guest_base, MemoryRegionFlags::READ);
+
+        // First mapping should succeed
+        unsafe { sbox.map_region(&region1).unwrap() };
+
+        // Exact same range should fail
+        let region2 = region_for_memory(&mem2, guest_base, MemoryRegionFlags::READ);
+        let err = unsafe { sbox.map_region(&region2) }.unwrap_err();
+        assert!(
+            format!("{err:?}").contains("Overlapping"),
+            "Expected Overlapping error, got: {err:?}"
+        );
+    }
+
+    /// A region that shares only one page with an existing mapping must fail.
+    #[test]
+    fn map_region_rejects_partial_overlap() {
+        let mut sbox: MultiUseSandbox = {
+            let path = simple_guest_as_string().unwrap();
+            let u_sbox = UninitializedSandbox::new(GuestBinary::FilePath(path), None).unwrap();
+            u_sbox.evolve().unwrap()
+        };
+
+        // Use multi-page regions so partial overlap is geometrically possible
+        let mem1 = page_aligned_memory(&[0xAA; 8192]); // 2 pages
+        let mem2 = page_aligned_memory(&[0xBB; 8192]); // 2 pages
+        let guest_base: usize = 0x200000000;
+        let region1 = region_for_memory(&mem1, guest_base, MemoryRegionFlags::READ);
+
+        unsafe { sbox.map_region(&region1).unwrap() };
+
+        // region2 starts one page before region1, overlapping by one page
+        let overlap_base = guest_base - 0x1000;
+        let region2 = region_for_memory(&mem2, overlap_base, MemoryRegionFlags::READ);
+        let err = unsafe { sbox.map_region(&region2) }.unwrap_err();
+        assert!(
+            format!("{err:?}").contains("verlap"),
+            "Expected overlap error for partial overlap, got: {err:?}"
+        );
+    }
+
+    /// A region that starts exactly where the previous one ends must map.
+    #[test]
+    fn map_region_allows_adjacent_non_overlapping() {
+        let mut sbox: MultiUseSandbox = {
+            let path = simple_guest_as_string().unwrap();
+            let u_sbox = UninitializedSandbox::new(GuestBinary::FilePath(path), None).unwrap();
+            u_sbox.evolve().unwrap()
+        };
+
+        let mem1 = allocate_guest_memory();
+        let mem2 = allocate_guest_memory();
+        let guest_base: usize = 0x200000000;
+        let region1 = region_for_memory(&mem1, guest_base, MemoryRegionFlags::READ);
+        let region_size = mem1.mem_size();
+
+        unsafe { sbox.map_region(&region1).unwrap() };
+
+        // Adjacent region (starts right after the first one ends) should succeed
+        let adjacent_base = guest_base + region_size;
+        let region2 = region_for_memory(&mem2, adjacent_base, MemoryRegionFlags::READ);
+        unsafe { sbox.map_region(&region2).unwrap() };
+    }
+
+    /// Mapping at the sandbox base address must be rejected as an overlap.
+    #[test]
+    fn map_region_rejects_overlap_with_snapshot() {
+        let mut sbox: MultiUseSandbox = {
+            let path = simple_guest_as_string().unwrap();
+            let u_sbox = UninitializedSandbox::new(GuestBinary::FilePath(path), None).unwrap();
+            u_sbox.evolve().unwrap()
+        };
+
+        // Try to map at BASE_ADDRESS (0x1000) which overlaps the snapshot region
+        let mem = allocate_guest_memory();
+        let region = region_for_memory(
+            &mem,
+            crate::mem::layout::SandboxMemoryLayout::BASE_ADDRESS,
+            MemoryRegionFlags::READ,
+        );
+        let err = unsafe { sbox.map_region(&region) }.unwrap_err();
+        assert!(
+            format!("{err:?}").contains("Overlapping"),
+            "Expected Overlapping error for snapshot overlap, got: {err:?}"
+        );
+    }
+
+    /// Mapping at the default scratch base address must be rejected as an overlap.
+    #[test]
+    fn map_region_rejects_overlap_with_scratch() {
+        let mut sbox: MultiUseSandbox = {
+            let path = simple_guest_as_string().unwrap();
+            let u_sbox = UninitializedSandbox::new(GuestBinary::FilePath(path), None).unwrap();
+            u_sbox.evolve().unwrap()
+        };
+
+        // The scratch region occupies the top of the GPA space
+        let scratch_addr = hyperlight_common::layout::scratch_base_gpa(
+            crate::sandbox::SandboxConfiguration::DEFAULT_SCRATCH_SIZE,
+        ) as usize;
+        let mem = allocate_guest_memory();
+        let region = region_for_memory(&mem, scratch_addr, MemoryRegionFlags::READ);
+        let err = unsafe { sbox.map_region(&region) }.unwrap_err();
+        assert!(
+            format!("{err:?}").contains("verlap"),
+            "Expected overlap error for scratch region, got: {err:?}"
+        );
+    }
 }