From 795688356174098848c067e8048c8c2a7af13197 Mon Sep 17 00:00:00 2001 From: Copilot <223556219+Copilot@users.noreply.github.com> Date: Tue, 16 Jun 2026 15:51:43 -0400 Subject: [PATCH 01/11] test(longhaul): add long-haul test driver core MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the test/longhaul Go module: a long-running canary driver that exercises a persistent DocumentDB cluster with continuous workload, scheduled disruptive operations, health monitoring, and reporting. Scope of this PR (Part 2/5 of #348 split): * cmd/longhaul: driver entrypoint * config: env-driven configuration * workload: writer/verifier/metrics * operations: scheduler + scale + upgrade ops * monitor: cluster client (consumes test/shared), health, leak detector * journal: thread-safe event log + outage policy oracle * report: markdown report, periodic checkpoint, GitHub Actions alerts * Dockerfile, README, deploy/setup.yaml bootstrap manifest Reuses test/shared/documentdb (lifecycle) and test/shared/mongo (data plane) extracted in #401 — no duplication of CR/Mongo logic. Excluded (in follow-up PRs): CI/CD workflows + deploy/deployment.yaml + deploy/rbac.yaml (PR-3), auto-upgrade logic (PR-4). 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Copilot <223556219+Copilot@users.noreply.github.com> --- test/longhaul/Dockerfile | 30 +++ test/longhaul/README.md | 173 ++++++++++++ test/longhaul/cmd/longhaul/main.go | 278 +++++++++++++++++++ test/longhaul/config/config.go | 228 ++++++++++++++++ test/longhaul/config/config_test.go | 232 ++++++++++++++++ test/longhaul/config/suite_test.go | 16 ++ test/longhaul/deploy/setup.yaml | 78 ++++++ test/longhaul/go.mod | 108 ++++++++ test/longhaul/go.sum | 293 +++++++++++++++++++++ test/longhaul/journal/journal.go | 207 +++++++++++++++ test/longhaul/journal/journal_test.go | 130 +++++++++ test/longhaul/journal/policy.go | 79 ++++++ test/longhaul/journal/policy_test.go | 85 ++++++ test/longhaul/journal/suite_test.go | 16 ++ test/longhaul/monitor/health.go | 169 ++++++++++++ test/longhaul/monitor/health_test.go | 117 ++++++++ test/longhaul/monitor/k8sclient.go | 267 +++++++++++++++++++ test/longhaul/monitor/k8sclient_test.go | 209 +++++++++++++++ test/longhaul/monitor/leakdetect.go | 124 +++++++++ test/longhaul/monitor/leakdetect_test.go | 81 ++++++ test/longhaul/monitor/suite_test.go | 16 ++ test/longhaul/operations/scale.go | 135 ++++++++++ test/longhaul/operations/scale_test.go | 140 ++++++++++ test/longhaul/operations/scheduler.go | 176 +++++++++++++ test/longhaul/operations/scheduler_test.go | 123 +++++++++ test/longhaul/operations/suite_test.go | 16 ++ test/longhaul/operations/upgrade.go | 178 +++++++++++++ test/longhaul/operations/upgrade_test.go | 96 +++++++ test/longhaul/report/alert.go | 44 ++++ test/longhaul/report/alert_test.go | 115 ++++++++ test/longhaul/report/checkpoint.go | 139 ++++++++++ test/longhaul/report/checkpoint_test.go | 79 ++++++ test/longhaul/report/report.go | 108 ++++++++ test/longhaul/report/report_test.go | 118 +++++++++ test/longhaul/report/suite_test.go | 16 ++ test/longhaul/workload/metrics.go | 74 ++++++ test/longhaul/workload/metrics_test.go | 99 +++++++ 
test/longhaul/workload/suite_test.go | 16 ++ test/longhaul/workload/verifier.go | 163 ++++++++++++ test/longhaul/workload/verifier_test.go | 48 ++++ test/longhaul/workload/writer.go | 145 ++++++++++ test/longhaul/workload/writer_test.go | 65 +++++ 42 files changed, 5029 insertions(+) create mode 100644 test/longhaul/Dockerfile create mode 100644 test/longhaul/README.md create mode 100644 test/longhaul/cmd/longhaul/main.go create mode 100644 test/longhaul/config/config.go create mode 100644 test/longhaul/config/config_test.go create mode 100644 test/longhaul/config/suite_test.go create mode 100644 test/longhaul/deploy/setup.yaml create mode 100644 test/longhaul/go.mod create mode 100644 test/longhaul/go.sum create mode 100644 test/longhaul/journal/journal.go create mode 100644 test/longhaul/journal/journal_test.go create mode 100644 test/longhaul/journal/policy.go create mode 100644 test/longhaul/journal/policy_test.go create mode 100644 test/longhaul/journal/suite_test.go create mode 100644 test/longhaul/monitor/health.go create mode 100644 test/longhaul/monitor/health_test.go create mode 100644 test/longhaul/monitor/k8sclient.go create mode 100644 test/longhaul/monitor/k8sclient_test.go create mode 100644 test/longhaul/monitor/leakdetect.go create mode 100644 test/longhaul/monitor/leakdetect_test.go create mode 100644 test/longhaul/monitor/suite_test.go create mode 100644 test/longhaul/operations/scale.go create mode 100644 test/longhaul/operations/scale_test.go create mode 100644 test/longhaul/operations/scheduler.go create mode 100644 test/longhaul/operations/scheduler_test.go create mode 100644 test/longhaul/operations/suite_test.go create mode 100644 test/longhaul/operations/upgrade.go create mode 100644 test/longhaul/operations/upgrade_test.go create mode 100644 test/longhaul/report/alert.go create mode 100644 test/longhaul/report/alert_test.go create mode 100644 test/longhaul/report/checkpoint.go create mode 100644 test/longhaul/report/checkpoint_test.go 
create mode 100644 test/longhaul/report/report.go create mode 100644 test/longhaul/report/report_test.go create mode 100644 test/longhaul/report/suite_test.go create mode 100644 test/longhaul/workload/metrics.go create mode 100644 test/longhaul/workload/metrics_test.go create mode 100644 test/longhaul/workload/suite_test.go create mode 100644 test/longhaul/workload/verifier.go create mode 100644 test/longhaul/workload/verifier_test.go create mode 100644 test/longhaul/workload/writer.go create mode 100644 test/longhaul/workload/writer_test.go diff --git a/test/longhaul/Dockerfile b/test/longhaul/Dockerfile new file mode 100644 index 000000000..f57ea270c --- /dev/null +++ b/test/longhaul/Dockerfile @@ -0,0 +1,30 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +# Build stage +FROM golang:1.26-alpine AS builder + +# Note: the golang:*-alpine image already ships ca-certificates, and all +# deps in go.mod resolve via proxy.golang.org (no VCS fetches), so neither +# `git` nor an extra `ca-certificates` install is required here. + +WORKDIR /src + +# Cache module downloads. +COPY go.mod go.sum ./ +RUN go mod download + +# Copy source. +COPY . . + +# Build the standalone binary. +RUN CGO_ENABLED=0 GOOS=linux go build -ldflags="-s -w" -o /longhaul ./cmd/longhaul + +# Runtime stage +FROM gcr.io/distroless/static:nonroot + +COPY --from=builder /longhaul /longhaul + +USER nonroot:nonroot + +ENTRYPOINT ["/longhaul"] diff --git a/test/longhaul/README.md b/test/longhaul/README.md new file mode 100644 index 000000000..f5404c7d0 --- /dev/null +++ b/test/longhaul/README.md @@ -0,0 +1,173 @@ +# Long Haul Tests + +Long haul tests validate that DocumentDB Kubernetes Operator clusters remain healthy under +continuous load over extended periods. They run a canary workload that writes and reads data, +performs management operations, and checks for data integrity. 
+ +See the [design document](../../docs/designs/long-haul-test-design.md) for architecture and rationale. + +## Quick Start + +### Prerequisites + +- A running Kubernetes cluster with DocumentDB deployed +- `kubectl` configured to access the cluster +- Go 1.25+ + +> **HA topology required for upgrade tests.** The `upgrade-documentdb` operation +> auto-skips when `spec.instancesPerNode < 2` because a single-instance cluster +> has no standby to absorb writes during the rolling restart — the upgrade +> would produce real (true-positive) downtime that no operator change can +> prevent. Run with `instancesPerNode: 2` (or `3`) to exercise the HA upgrade +> path. The skip is "free": no cooldown is consumed, and the next 10s scheduler +> tick re-evaluates eligibility, so scaling up at any point makes the upgrade +> immediately schedulable. + +### Run the Config Unit Tests + +These are fast and require no cluster: + +```bash +cd test/longhaul +go test ./config/ -v +``` + +### Run Locally + +Useful for iterating on driver code against a real cluster without rebuilding the +container image. The driver auto-falls back to `~/.kube/config` when not running +in-cluster, so the same binary works in both modes. + +You need network reachability from your machine to the DocumentDB gateway port +(10260). If you're behind a firewall that blocks it, use the in-cluster deployment +path below instead. + +```bash +cd test/longhaul +NS=documentdb-test-ns + +# 1. Port-forward the gateway service in another terminal and leave it running. +kubectl port-forward -n $NS svc/documentdb-service-documentdb-cluster 10260:10260 + +# 2. Read credentials from the secret the operator created. +USER=$(kubectl get secret documentdb-credentials -n $NS -o jsonpath='{.data.username}' | base64 -d) +PASS=$(kubectl get secret documentdb-credentials -n $NS -o jsonpath='{.data.password}' | base64 -d) + +# 3. Run the driver. Override LONGHAUL_MAX_DURATION for short dev iterations. 
+LONGHAUL_MONGO_URI="mongodb://${USER}:${PASS}@127.0.0.1:10260/?directConnection=true&authMechanism=SCRAM-SHA-256&tls=true&tlsInsecure=true" \ +LONGHAUL_CLUSTER_NAME=documentdb-cluster \ +LONGHAUL_NAMESPACE=$NS \ +LONGHAUL_MAX_DURATION=5m \ +go run ./cmd/longhaul/ +``` + +### Deploy as Kubernetes Deployment (Recommended for Real Runs) + +This is the intended deployment model. The test runs inside the cluster with direct +access to the DocumentDB service (no port-forward needed). + +**Production path (CI):** the `LONGHAUL - Build Test Driver Image` workflow builds +the image to GHCR; the `LONGHAUL - Deploy Test Driver to AKS` workflow rolls it +onto the cluster using a long-lived ServiceAccount-token kubeconfig stored in the +`LONGHAUL_KUBECONFIG` repo secret. Trigger both via the Actions tab. + +**Manual path (one-off / local cluster):** + +```bash +cd test/longhaul + +# 1. Build and push the container image (or use the GHCR image from CI). +docker build -t <your-registry>/longhaul-test:latest -f Dockerfile . +docker push <your-registry>/longhaul-test:latest + +# 2. Create the MongoDB credentials secret +kubectl create secret generic longhaul-mongo-credentials \ + --from-literal=uri='mongodb://docdb:YourPass@documentdb-service-documentdb-cluster.documentdb-test-ns.svc:10260/?directConnection=true&authMechanism=SCRAM-SHA-256&tls=true&tlsInsecure=true' \ + -n documentdb-test-ns + +# 3. Deploy RBAC and Deployment. deployment.yaml has placeholders +# __OWNER__ and __IMAGE_TAG__ that are normally substituted by the +# deploy workflow; for a manual apply, sed them yourself or edit +# the file in place. +kubectl apply -f deploy/setup.yaml +kubectl apply -f deploy/rbac.yaml +sed -e 's|__OWNER__|<owner>|g' \ + -e 's|__IMAGE_TAG__|latest|g' \ + deploy/deployment.yaml | kubectl apply -f - + +# 4. Monitor progress +kubectl logs -f deployment/longhaul-test -n documentdb-test-ns + +# 5. 
Check status (Deployment auto-restarts pods on crash, so use +# the report ConfigMap or alerts as the source of truth for "did +# the test pass?", not the pod status alone). +kubectl get deployment longhaul-test -n documentdb-test-ns +kubectl get configmap longhaul-report -n documentdb-test-ns -o yaml +``` + +To roll a new image (e.g. after a code change rebuilt by CI): + +```bash +kubectl -n documentdb-test-ns set image deployment/longhaul-test \ + driver=ghcr.io/<owner>/documentdb-kubernetes-operator/longhaul-test:sha-abc1234 +kubectl -n documentdb-test-ns rollout status deployment/longhaul-test +``` + +## Configuration + +All configuration is via environment variables. + +| Variable | Required | Default | Description | +|----------|----------|---------|-------------| +| `LONGHAUL_MONGO_URI` | Yes | — | MongoDB connection string to the DocumentDB gateway. | +| `LONGHAUL_CLUSTER_NAME` | Yes | — | Name of the target DocumentDB cluster CR. | +| `LONGHAUL_NAMESPACE` | No | `default` | Kubernetes namespace of the target cluster. | +| `LONGHAUL_MAX_DURATION` | No | `30m` | Max test duration. Use `0s` for run-until-failure. | +| `LONGHAUL_NUM_WRITERS` | No | `5` | Number of concurrent writers. | +| `LONGHAUL_NUM_VERIFIERS` | No | `2` | Number of concurrent verifiers. | +| `LONGHAUL_OP_COOLDOWN` | No | `5m` | Cooldown between management operations. | +| `LONGHAUL_RECOVERY_TIMEOUT` | No | `5m` | Max wait for cluster recovery after an operation. | +| `LONGHAUL_MIN_INSTANCES` | No | `1` | Minimum `spec.instancesPerNode` for scale-down operations (CRD lower bound: 1). | +| `LONGHAUL_MAX_INSTANCES` | No | `3` | Maximum `spec.instancesPerNode` for scale-up operations (CRD upper bound: 3). | +| `LONGHAUL_REPORT_INTERVAL` | No | `1h` | How often to write checkpoint reports to ConfigMap. | + +## CI Safety + +The long haul test binary is deployed as a Kubernetes Deployment on a dedicated AKS cluster. +It does **not** run in any PR-gated CI workflow. 
+ +The config unit tests (`test/longhaul/config/`) run unconditionally and are included in normal +CI test runs — they are fast (~0.002s) and require no cluster. + +## Relationship to `test/e2e/` + +The `test/e2e/` Ginkgo suite (added in PR #346) and this long haul harness are **separate +modules with intentionally different shapes**. They share a problem domain (exercising a +DocumentDB cluster) but answer different questions: + +| | `test/e2e/` | `test/longhaul/` | +|---|---|---| +| Shape | Go test binary (Ginkgo specs) | Standalone long-running daemon | +| Lifetime | Minutes per spec | Days–weeks per run | +| Asserts | One behavior per spec, then exits | Continuous invariants over time | +| Failure mode | `t.Fail` per spec | Journal entry + alert + auto-restart | +| Cluster | Created + torn down per run | Long-lived dedicated AKS cluster | +| Operator API | Typed (`previewv1.DocumentDB` via controller-runtime) | Dynamic client (no operator import) | + +### Code that could be shared in the future + +The e2e suite has helpers in `test/e2e/pkg/e2eutils/` that this harness will likely consume +once it grows beyond the current scope: + +- `e2eutils/mongo` — `BuildURI` (URL-escapes username/password), TLS-from-CA-bundle, `Handle` + with port-forward + secret-backed credentials. The long haul driver currently takes a raw + `LONGHAUL_MONGO_URI` string; when it moves to per-secret credentials or in-cluster TLS, + these helpers become directly applicable. +- `e2eutils/operatorhealth` — pod-ready / CRD-ready gating used during e2e setup. The + monitor's `isPodReady` could delegate to this when the modules are unified. +- `e2eutils/clusterprobe` — CRD presence checks. + +A shared `test/shared/` module now exists (extracted in #401) and this driver already consumes +`test/shared/documentdb` (lifecycle) and `test/shared/mongo` (data plane); the `e2eutils` helpers +above are **deliberately not moved into it yet** because they assume the e2e connection model.
Revisit this when the +long haul driver adopts the same connection model as e2e. diff --git a/test/longhaul/cmd/longhaul/main.go b/test/longhaul/cmd/longhaul/main.go new file mode 100644 index 000000000..8902e2b86 --- /dev/null +++ b/test/longhaul/cmd/longhaul/main.go @@ -0,0 +1,278 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package main provides a standalone binary entry point for running +// long haul tests as a Kubernetes Job (without Ginkgo test framework). +package main + +import ( + "context" + "fmt" + "log" + "os" + "time" + + "github.com/documentdb/documentdb-operator/test/longhaul/config" + "github.com/documentdb/documentdb-operator/test/longhaul/journal" + "github.com/documentdb/documentdb-operator/test/longhaul/monitor" + "github.com/documentdb/documentdb-operator/test/longhaul/operations" + "github.com/documentdb/documentdb-operator/test/longhaul/report" + "github.com/documentdb/documentdb-operator/test/longhaul/workload" + + sharedmongo "github.com/documentdb/documentdb-operator/test/shared/mongo" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/clientcmd" +) + +func main() { + log.SetFlags(log.Ltime | log.Lmsgprefix) + log.SetPrefix("[longhaul] ") + + cfg, err := config.LoadFromEnv() + if err != nil { + log.Fatalf("failed to load config: %v", err) + } + if err := cfg.Validate(); err != nil { + log.Fatalf("invalid config: %v", err) + } + + log.Printf("config loaded: duration=%s namespace=%s cluster=%s writers=%d verifiers=%d", + cfg.MaxDuration, cfg.Namespace, cfg.ClusterName, cfg.NumWriters, cfg.NumVerifiers) + + exitCode := run(cfg) + os.Exit(exitCode) +} + +func run(cfg config.Config) int { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + if cfg.MaxDuration > 0 { + var timeoutCancel context.CancelFunc + ctx, timeoutCancel = context.WithTimeout(ctx, cfg.MaxDuration) + defer timeoutCancel() + } + + // Initialize components. 
+ j := journal.New() + metrics := workload.NewMetrics() + + // Connect to MongoDB. + if cfg.MongoURI == "" { + log.Fatal("LONGHAUL_MONGO_URI must be set") + } + mongoClient, err := sharedmongo.NewFromURI(ctx, cfg.MongoURI) + if err != nil { + log.Fatalf("failed to connect to MongoDB: %v", err) + } + defer func() { + disconnectCtx, c := context.WithTimeout(context.Background(), 5*time.Second) + defer c() + _ = mongoClient.Disconnect(disconnectCtx) + }() + + // Verify connectivity. + pingCtx, pingCancel := context.WithTimeout(ctx, 10*time.Second) + defer pingCancel() + if err := mongoClient.Ping(pingCtx, nil); err != nil { + log.Fatalf("MongoDB ping failed: %v", err) + } + log.Println("MongoDB connection established") + + db := mongoClient.Database("longhaul") + + // Drop previous test data to avoid duplicate key conflicts. + if err := db.Collection(workload.CollectionName).Drop(ctx); err != nil { + log.Fatalf("failed to drop collection: %v", err) + } + + // Create indexes. + if err := workload.EnsureIndexes(ctx, db); err != nil { + log.Fatalf("failed to create indexes: %v", err) + } + + j.Info("main", "long haul test starting") + + // Initialize real k8s cluster client. + clusterClient, k8sClientset, err := initK8sClient(cfg) + if err != nil { + log.Fatalf("failed to initialize k8s client: %v", err) + } + j.Info("main", "k8s client initialized") + + // Start health monitor. + healthMon := monitor.NewHealthMonitor(clusterClient, j, cfg.SteadyStateWait) + go healthMon.Run(ctx) + + // Start leak detector. + leakDetector := monitor.NewLeakDetector(j, 10.0, 10) + + // Start writers. + workload.StartWriters(ctx, cfg.NumWriters, db, metrics, j) + j.Info("main", fmt.Sprintf("started %d writers", cfg.NumWriters)) + + // Start verifiers. + workload.StartVerifiers(ctx, cfg.NumVerifiers, db, metrics, j) + j.Info("main", fmt.Sprintf("started %d verifiers", cfg.NumVerifiers)) + + // Configure operations. 
+ ops := []operations.Operation{ + operations.NewScaleUp(clusterClient, healthMon, cfg.MaxInstances, cfg.RecoveryTimeout), + operations.NewScaleDown(clusterClient, healthMon, cfg.MinInstances, cfg.RecoveryTimeout), + operations.NewUpgradeDocumentDB(clusterClient, k8sClientset, healthMon, j, cfg.Namespace, cfg.RecoveryTimeout), + } + + // Start operation scheduler. + scheduler := operations.NewScheduler(ops, healthMon, j, cfg.OpCooldown) + go scheduler.Run(ctx) + + // Start metrics sampling goroutine (feeds leak detector). + go runMetricsSampling(ctx, clusterClient, leakDetector, j) + + // Start periodic checkpoint reporter. + summaryFunc := func() report.Summary { + return buildSummary(metrics, leakDetector, scheduler, j) + } + reporter := report.NewCheckpointReporter(k8sClientset, cfg.Namespace, cfg.ReportInterval, summaryFunc) + go reporter.Run(ctx) + + j.Info("main", "all components started, entering main loop") + + // Main loop: wait for context expiry. + <-ctx.Done() + j.Info("main", fmt.Sprintf("test ending: %v", ctx.Err())) + + // Allow goroutines to flush. + time.Sleep(500 * time.Millisecond) + + // Generate final report. + summary := buildSummary(metrics, leakDetector, scheduler, j) + markdown := report.GenerateMarkdown(summary) + fmt.Println("\n" + markdown) + + // Emit final GitHub Actions annotation. + report.EmitAnnotation(summary) + + if summary.Result == report.ResultFail { + log.Printf("TEST FAILED: %s", summary.FailReason) + return 1 + } + + log.Println("TEST PASSED") + return 0 +} + +// buildSummary constructs a report.Summary from current state. 
+func buildSummary(metrics *workload.Metrics, leakDetector *monitor.LeakDetector, scheduler *operations.Scheduler, j *journal.Journal) report.Summary { + snap := metrics.Snapshot() + leakAnalysis := leakDetector.Analyze() + + result := report.ResultPass + failReason := "" + + if snap.HasDataLoss() { + result = report.ResultFail + failReason = fmt.Sprintf("data loss: %d gaps, %d checksum errors", + snap.GapsDetected, snap.ChecksumErrors) + } + if j.HasPolicyViolation() { + result = report.ResultFail + if failReason != "" { + failReason += "; " + } + failReason += "outage policy violated" + } + + return report.Summary{ + Result: result, + Duration: snap.Elapsed, + Metrics: snap, + LeakAnalysis: leakAnalysis, + OpsExecuted: scheduler.OpsExecuted(), + Windows: j.DisruptionWindows(), + Events: j.Events(), + FailReason: failReason, + } +} + +// initK8sClient creates the real K8s cluster client and returns the clientset for ConfigMap access. +func initK8sClient(cfg config.Config) (*monitor.K8sClusterClient, kubernetes.Interface, error) { + k8sCfg := monitor.K8sClientConfig{ + Namespace: cfg.Namespace, + ClusterName: cfg.ClusterName, + Kubeconfig: os.Getenv("KUBECONFIG"), + } + + client, err := monitor.NewK8sClusterClient(k8sCfg) + if err != nil { + return nil, nil, err + } + + // Build a clientset for the reporter (ConfigMap operations). 
+ restConfig, err := buildRESTConfig() + if err != nil { + return nil, nil, fmt.Errorf("failed to build REST config for clientset: %w", err) + } + clientset, err := kubernetes.NewForConfig(restConfig) + if err != nil { + return nil, nil, fmt.Errorf("failed to create clientset: %w", err) + } + + return client, clientset, nil +} + +func buildRESTConfig() (*rest.Config, error) { + cfg, err := rest.InClusterConfig() + if err == nil { + return cfg, nil + } + kubeconfig := os.Getenv("KUBECONFIG") + if kubeconfig == "" { + kubeconfig = clientcmd.RecommendedHomeFile + } + return clientcmd.BuildConfigFromFlags("", kubeconfig) +} + +// runMetricsSampling periodically collects pod resource metrics and feeds the leak detector. +func runMetricsSampling(ctx context.Context, client *monitor.K8sClusterClient, ld *monitor.LeakDetector, j *journal.Journal) { + if !client.MetricsAvailable() { + j.Info("metrics", "metrics-server not available, leak detection sampling disabled") + return + } + j.Info("metrics", "metrics sampling started (60s interval)") + + ticker := time.NewTicker(60 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + podMetrics, err := client.GetPodMetrics(ctx) + if err != nil { + j.Warn("metrics", fmt.Sprintf("metrics query error: %v", err)) + continue + } + if podMetrics == nil { + // Metrics became unavailable. + j.Warn("metrics", "metrics-server became unavailable, stopping sampling") + return + } + + // Sum memory and CPU across all DocumentDB pods. 
+ var totalMem, totalCPU float64 + for _, pm := range podMetrics { + totalMem += pm.MemoryMB + totalCPU += pm.CPUCores + } + + ld.AddSample(monitor.ResourceSample{ + Timestamp: time.Now(), + MemoryMB: totalMem, + CPUCores: totalCPU, + }) + } + } +} diff --git a/test/longhaul/config/config.go b/test/longhaul/config/config.go new file mode 100644 index 000000000..b07338bea --- /dev/null +++ b/test/longhaul/config/config.go @@ -0,0 +1,228 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package config + +import ( + "fmt" + "os" + "strconv" + "strings" + "time" +) + +const ( + // Environment variable names for long haul test configuration. + EnvEnabled = "LONGHAUL_ENABLED" + EnvMaxDuration = "LONGHAUL_MAX_DURATION" + EnvNamespace = "LONGHAUL_NAMESPACE" + EnvClusterName = "LONGHAUL_CLUSTER_NAME" + + // Workload and operation tuning. + EnvMongoURI = "LONGHAUL_MONGO_URI" + EnvNumWriters = "LONGHAUL_NUM_WRITERS" + EnvNumVerifiers = "LONGHAUL_NUM_VERIFIERS" + EnvOpCooldown = "LONGHAUL_OP_COOLDOWN" + EnvRecoveryTimeout = "LONGHAUL_RECOVERY_TIMEOUT" + EnvSteadyStateWait = "LONGHAUL_STEADY_STATE_WAIT" + // Scale operation bounds. The DocumentDB CRD hard-caps spec.nodeCount=1, + // so the scale dimension actually exercised is spec.instancesPerNode (1-3). + EnvMinInstances = "LONGHAUL_MIN_INSTANCES" + EnvMaxInstances = "LONGHAUL_MAX_INSTANCES" + + // Observability and reporting. + EnvReportInterval = "LONGHAUL_REPORT_INTERVAL" +) + +// Config holds all configuration for a long haul test run. +type Config struct { + // MaxDuration is the maximum test duration. Zero means run until failure. + MaxDuration time.Duration + + // Namespace is the Kubernetes namespace of the target DocumentDB cluster. + Namespace string + + // ClusterName is the name of the target DocumentDB cluster CR. + ClusterName string + + // MongoURI is the MongoDB connection string for data-plane workload. 
+ MongoURI string + + // NumWriters is the number of concurrent writer goroutines. + NumWriters int + + // NumVerifiers is the number of concurrent verifier goroutines. + NumVerifiers int + + // OpCooldown is the minimum time between disruptive operations. + OpCooldown time.Duration + + // RecoveryTimeout is the max time to wait for cluster recovery after an operation. + RecoveryTimeout time.Duration + + // SteadyStateWait is how long the cluster must be healthy before an operation fires. + SteadyStateWait time.Duration + + // MinInstances is the minimum spec.instancesPerNode for scale-down. + // CRD lower bound is 1. + MinInstances int + + // MaxInstances is the maximum spec.instancesPerNode for scale-up. + // CRD upper bound is 3. + MaxInstances int + + // ReportInterval is how often checkpoint reports are generated. + ReportInterval time.Duration +} + +// DefaultConfig returns a Config with safe defaults for local development. +func DefaultConfig() Config { + return Config{ + MaxDuration: 30 * time.Minute, + Namespace: "default", + ClusterName: "", + MongoURI: "", + NumWriters: 5, + NumVerifiers: 2, + OpCooldown: 5 * time.Minute, + RecoveryTimeout: 5 * time.Minute, + SteadyStateWait: 60 * time.Second, + MinInstances: 1, + MaxInstances: 3, + ReportInterval: 1 * time.Hour, + } +} + +// LoadFromEnv loads configuration from environment variables, +// falling back to defaults for any unset variable. 
+func LoadFromEnv() (Config, error) { + cfg := DefaultConfig() + + if v := os.Getenv(EnvMaxDuration); v != "" { + d, err := time.ParseDuration(v) + if err != nil { + return cfg, fmt.Errorf("invalid %s=%q: %w", EnvMaxDuration, v, err) + } + cfg.MaxDuration = d + } + + if v := os.Getenv(EnvNamespace); v != "" { + cfg.Namespace = v + } + + if v := os.Getenv(EnvClusterName); v != "" { + cfg.ClusterName = v + } + + if v := os.Getenv(EnvMongoURI); v != "" { + cfg.MongoURI = v + } + + if v := os.Getenv(EnvNumWriters); v != "" { + n, err := strconv.Atoi(v) + if err != nil { + return cfg, fmt.Errorf("invalid %s=%q: %w", EnvNumWriters, v, err) + } + cfg.NumWriters = n + } + + if v := os.Getenv(EnvNumVerifiers); v != "" { + n, err := strconv.Atoi(v) + if err != nil { + return cfg, fmt.Errorf("invalid %s=%q: %w", EnvNumVerifiers, v, err) + } + cfg.NumVerifiers = n + } + + if v := os.Getenv(EnvOpCooldown); v != "" { + d, err := time.ParseDuration(v) + if err != nil { + return cfg, fmt.Errorf("invalid %s=%q: %w", EnvOpCooldown, v, err) + } + cfg.OpCooldown = d + } + + if v := os.Getenv(EnvRecoveryTimeout); v != "" { + d, err := time.ParseDuration(v) + if err != nil { + return cfg, fmt.Errorf("invalid %s=%q: %w", EnvRecoveryTimeout, v, err) + } + cfg.RecoveryTimeout = d + } + + if v := os.Getenv(EnvSteadyStateWait); v != "" { + d, err := time.ParseDuration(v) + if err != nil { + return cfg, fmt.Errorf("invalid %s=%q: %w", EnvSteadyStateWait, v, err) + } + cfg.SteadyStateWait = d + } + + if v := os.Getenv(EnvMinInstances); v != "" { + n, err := strconv.Atoi(v) + if err != nil { + return cfg, fmt.Errorf("invalid %s=%q: %w", EnvMinInstances, v, err) + } + cfg.MinInstances = n + } + + if v := os.Getenv(EnvMaxInstances); v != "" { + n, err := strconv.Atoi(v) + if err != nil { + return cfg, fmt.Errorf("invalid %s=%q: %w", EnvMaxInstances, v, err) + } + cfg.MaxInstances = n + } + + if v := os.Getenv(EnvReportInterval); v != "" { + d, err := time.ParseDuration(v) + if err != nil { + 
return cfg, fmt.Errorf("invalid %s=%q: %w", EnvReportInterval, v, err) + } + cfg.ReportInterval = d + } + + return cfg, nil +} + +// Validate checks that the configuration is valid. +func (c *Config) Validate() error { + if c.MaxDuration < 0 { + return fmt.Errorf("max duration must not be negative, got %s", c.MaxDuration) + } + if c.Namespace == "" { + return fmt.Errorf("namespace must not be empty") + } + if c.ClusterName == "" { + return fmt.Errorf("cluster name must not be empty") + } + if c.NumWriters < 1 { + return fmt.Errorf("num writers must be at least 1, got %d", c.NumWriters) + } + if c.NumVerifiers < 1 { + return fmt.Errorf("num verifiers must be at least 1, got %d", c.NumVerifiers) + } + if c.OpCooldown < 0 { + return fmt.Errorf("operation cooldown must not be negative, got %s", c.OpCooldown) + } + if c.RecoveryTimeout <= 0 { + return fmt.Errorf("recovery timeout must be positive, got %s", c.RecoveryTimeout) + } + if c.MinInstances < 1 { + return fmt.Errorf("min instances must be at least 1, got %d", c.MinInstances) + } + if c.MaxInstances > 3 { + return fmt.Errorf("max instances must not exceed 3 (CRD upper bound for spec.instancesPerNode), got %d", c.MaxInstances) + } + if c.MaxInstances < c.MinInstances { + return fmt.Errorf("max instances (%d) must be >= min instances (%d)", c.MaxInstances, c.MinInstances) + } + return nil +} + +// IsEnabled returns true if the long haul test is explicitly enabled +// via the LONGHAUL_ENABLED environment variable. +func IsEnabled() bool { + v := strings.TrimSpace(strings.ToLower(os.Getenv(EnvEnabled))) + return v == "true" || v == "1" || v == "yes" +} diff --git a/test/longhaul/config/config_test.go b/test/longhaul/config/config_test.go new file mode 100644 index 000000000..0fcc53306 --- /dev/null +++ b/test/longhaul/config/config_test.go @@ -0,0 +1,232 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package config + +import ( + "time" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +var _ = Describe("Config", func() { + Describe("DefaultConfig", func() { + It("returns safe defaults", func() { + cfg := DefaultConfig() + Expect(cfg.MaxDuration).To(Equal(30 * time.Minute)) + Expect(cfg.Namespace).To(Equal("default")) + Expect(cfg.ClusterName).To(BeEmpty()) + Expect(cfg.NumWriters).To(Equal(5)) + Expect(cfg.NumVerifiers).To(Equal(2)) + Expect(cfg.OpCooldown).To(Equal(5 * time.Minute)) + Expect(cfg.RecoveryTimeout).To(Equal(5 * time.Minute)) + Expect(cfg.SteadyStateWait).To(Equal(60 * time.Second)) + Expect(cfg.MinInstances).To(Equal(1)) + Expect(cfg.MaxInstances).To(Equal(3)) + }) + }) + + Describe("LoadFromEnv", func() { + // Clear every LONGHAUL_* env var before each spec so tests do not pick up + // values from the developer's shell (Copilot review feedback on PR #348). + BeforeEach(func() { + for _, k := range []string{ + EnvEnabled, EnvMaxDuration, EnvNamespace, EnvClusterName, + EnvMongoURI, EnvNumWriters, EnvNumVerifiers, + EnvOpCooldown, EnvRecoveryTimeout, EnvSteadyStateWait, + EnvMinInstances, EnvMaxInstances, EnvReportInterval, + } { + GinkgoT().Setenv(k, "") + } + }) + + It("uses defaults when no env vars set", func() { + cfg, err := LoadFromEnv() + Expect(err).NotTo(HaveOccurred()) + Expect(cfg.MaxDuration).To(Equal(30 * time.Minute)) + }) + + It("parses MaxDuration from env", func() { + GinkgoT().Setenv(EnvMaxDuration, "1h") + cfg, err := LoadFromEnv() + Expect(err).NotTo(HaveOccurred()) + Expect(cfg.MaxDuration).To(Equal(1 * time.Hour)) + }) + + It("parses zero MaxDuration for infinite runs", func() { + GinkgoT().Setenv(EnvMaxDuration, "0s") + cfg, err := LoadFromEnv() + Expect(err).NotTo(HaveOccurred()) + Expect(cfg.MaxDuration).To(Equal(time.Duration(0))) + }) + + It("parses Namespace and ClusterName from env", func() { + GinkgoT().Setenv(EnvNamespace, "test-ns") + GinkgoT().Setenv(EnvClusterName, "my-cluster") + cfg, err := LoadFromEnv() + Expect(err).NotTo(HaveOccurred()) + 
Expect(cfg.Namespace).To(Equal("test-ns")) + Expect(cfg.ClusterName).To(Equal("my-cluster")) + }) + + It("returns error for invalid MaxDuration", func() { + GinkgoT().Setenv(EnvMaxDuration, "not-a-duration") + _, err := LoadFromEnv() + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring(EnvMaxDuration)) + }) + + It("parses NumWriters from env", func() { + GinkgoT().Setenv(EnvNumWriters, "10") + cfg, err := LoadFromEnv() + Expect(err).NotTo(HaveOccurred()) + Expect(cfg.NumWriters).To(Equal(10)) + }) + + It("returns error for invalid NumWriters", func() { + GinkgoT().Setenv(EnvNumWriters, "abc") + _, err := LoadFromEnv() + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring(EnvNumWriters)) + }) + + It("parses OpCooldown from env", func() { + GinkgoT().Setenv(EnvOpCooldown, "10m") + cfg, err := LoadFromEnv() + Expect(err).NotTo(HaveOccurred()) + Expect(cfg.OpCooldown).To(Equal(10 * time.Minute)) + }) + + It("parses MongoURI from env", func() { + GinkgoT().Setenv(EnvMongoURI, "mongodb://localhost:27017") + cfg, err := LoadFromEnv() + Expect(err).NotTo(HaveOccurred()) + Expect(cfg.MongoURI).To(Equal("mongodb://localhost:27017")) + }) + }) + + Describe("Validate", func() { + It("passes for valid config", func() { + cfg := DefaultConfig() + cfg.ClusterName = "test-cluster" + Expect(cfg.Validate()).To(Succeed()) + }) + + It("fails when Namespace is empty", func() { + cfg := DefaultConfig() + cfg.ClusterName = "test" + cfg.Namespace = "" + Expect(cfg.Validate()).To(MatchError(ContainSubstring("namespace"))) + }) + + It("fails when ClusterName is empty", func() { + cfg := DefaultConfig() + Expect(cfg.Validate()).To(MatchError(ContainSubstring("cluster name"))) + }) + + It("fails when MaxDuration is negative", func() { + cfg := DefaultConfig() + cfg.ClusterName = "test" + cfg.MaxDuration = -1 * time.Second + Expect(cfg.Validate()).To(MatchError(ContainSubstring("max duration must not be negative"))) + }) + + It("fails when NumWriters 
is zero", func() { + cfg := DefaultConfig() + cfg.ClusterName = "test" + cfg.NumWriters = 0 + Expect(cfg.Validate()).To(MatchError(ContainSubstring("num writers"))) + }) + + It("fails when RecoveryTimeout is zero", func() { + cfg := DefaultConfig() + cfg.ClusterName = "test" + cfg.RecoveryTimeout = 0 + Expect(cfg.Validate()).To(MatchError(ContainSubstring("recovery timeout"))) + }) + + It("fails when MaxInstances < MinInstances", func() { + cfg := DefaultConfig() + cfg.ClusterName = "test" + cfg.MinInstances = 3 + cfg.MaxInstances = 2 + Expect(cfg.Validate()).To(MatchError(ContainSubstring("max instances"))) + }) + + It("fails when MaxInstances exceeds CRD upper bound (3)", func() { + cfg := DefaultConfig() + cfg.ClusterName = "test" + cfg.MinInstances = 1 + cfg.MaxInstances = 4 + Expect(cfg.Validate()).To(MatchError(ContainSubstring("must not exceed 3"))) + }) + }) + + Describe("IsEnabled", func() { + It("returns false when env not set", func() { + GinkgoT().Setenv(EnvEnabled, "") + Expect(IsEnabled()).To(BeFalse()) + }) + + It("returns true for 'true'", func() { + GinkgoT().Setenv(EnvEnabled, "true") + Expect(IsEnabled()).To(BeTrue()) + }) + + It("returns true for '1'", func() { + GinkgoT().Setenv(EnvEnabled, "1") + Expect(IsEnabled()).To(BeTrue()) + }) + + It("returns true for 'yes'", func() { + GinkgoT().Setenv(EnvEnabled, "yes") + Expect(IsEnabled()).To(BeTrue()) + }) + + It("returns true case-insensitively", func() { + GinkgoT().Setenv(EnvEnabled, "TRUE") + Expect(IsEnabled()).To(BeTrue()) + }) + + It("returns true for mixed case 'True'", func() { + GinkgoT().Setenv(EnvEnabled, "True") + Expect(IsEnabled()).To(BeTrue()) + }) + + It("returns true for mixed case 'YES'", func() { + GinkgoT().Setenv(EnvEnabled, "YES") + Expect(IsEnabled()).To(BeTrue()) + }) + + It("returns true with surrounding whitespace", func() { + GinkgoT().Setenv(EnvEnabled, " true ") + Expect(IsEnabled()).To(BeTrue()) + }) + + It("returns true for ' yes ' with whitespace", func() { + 
GinkgoT().Setenv(EnvEnabled, " yes ") + Expect(IsEnabled()).To(BeTrue()) + }) + + It("returns false for whitespace-only", func() { + GinkgoT().Setenv(EnvEnabled, " ") + Expect(IsEnabled()).To(BeFalse()) + }) + + It("returns false for 'false'", func() { + GinkgoT().Setenv(EnvEnabled, "false") + Expect(IsEnabled()).To(BeFalse()) + }) + + It("returns false for '0'", func() { + GinkgoT().Setenv(EnvEnabled, "0") + Expect(IsEnabled()).To(BeFalse()) + }) + + It("returns false for 'no'", func() { + GinkgoT().Setenv(EnvEnabled, "no") + Expect(IsEnabled()).To(BeFalse()) + }) + }) +}) diff --git a/test/longhaul/config/suite_test.go b/test/longhaul/config/suite_test.go new file mode 100644 index 000000000..c12c6a89a --- /dev/null +++ b/test/longhaul/config/suite_test.go @@ -0,0 +1,16 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package config + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +func TestConfig(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Long Haul Config Suite") +} diff --git a/test/longhaul/deploy/setup.yaml b/test/longhaul/deploy/setup.yaml new file mode 100644 index 000000000..2c3580af2 --- /dev/null +++ b/test/longhaul/deploy/setup.yaml @@ -0,0 +1,78 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +# Long Haul Test — DocumentDB Cluster Setup +# +# This manifest deploys a DocumentDB cluster for long-haul testing. +# It creates the target namespace, credentials secret, and DocumentDB CR. +# +# Prerequisites: +# 1. AKS cluster with k8s >= 1.30 +# 2. cert-manager installed: +# helm repo add jetstack https://charts.jetstack.io && helm repo update +# helm install cert-manager jetstack/cert-manager --namespace cert-manager \ +# --create-namespace --set crds.enabled=true +# 3. 
DocumentDB operator installed: +# helm install documentdb-operator \ +# oci://ghcr.io/documentdb/documentdb-operator \ +# --namespace documentdb-operator --create-namespace --wait --timeout 10m +# +# Usage: +# kubectl apply -f deploy/setup.yaml +# +# After the cluster is healthy (~3-5 min), deploy the long-haul test: +# kubectl apply -f deploy/rbac.yaml +# kubectl apply -f deploy/deployment.yaml +# +--- +apiVersion: v1 +kind: Namespace +metadata: + name: documentdb-test-ns + labels: + app.kubernetes.io/part-of: longhaul-test +--- +# Credentials for the DocumentDB cluster. +# The operator creates the MongoDB user with these credentials. NOTE(review): the password below is committed in plain text and the DocumentDB resource in this file requests a LoadBalancer service; apply only to a throwaway cluster or replace the password first. +apiVersion: v1 +kind: Secret +metadata: + name: documentdb-credentials + namespace: documentdb-test-ns +type: kubernetes.io/basic-auth +stringData: + username: longhaul-user + password: LongHaul-T3st-Passw0rd! +--- +# DocumentDB cluster for long-haul testing. +# +# Spec notes: +# - nodeCount: 1 (CRD hard-caps this to 1; the long-haul test scales the +# instancesPerNode dimension instead via the operations scheduler). +# - instancesPerNode: 2 (HA topology: 1 primary + 1 standby). Required for +# the upgrade-documentdb operation to fire — UpgradeDocumentDB.Precondition +# skips when instancesPerNode<2 because a single-instance cluster has no +# standby and a rolling upgrade would produce real (unavoidable) downtime.
+# - pvcSize: 10Gi (sufficient for continuous writes over 72h) +# - LoadBalancer exposes the cluster for monitoring/debugging +# +apiVersion: documentdb.io/preview +kind: DocumentDB +metadata: + name: documentdb-cluster + namespace: documentdb-test-ns + labels: + app.kubernetes.io/part-of: longhaul-test + app.kubernetes.io/component: database +spec: + environment: aks + nodeCount: 1 + instancesPerNode: 2 + schemaVersion: "auto" + documentDbCredentialSecret: documentdb-credentials + resource: + storage: + pvcSize: 10Gi + exposeViaService: + serviceType: LoadBalancer + sidecarInjectorPluginName: cnpg-i-sidecar-injector.documentdb.io diff --git a/test/longhaul/go.mod b/test/longhaul/go.mod new file mode 100644 index 000000000..061577178 --- /dev/null +++ b/test/longhaul/go.mod @@ -0,0 +1,108 @@ +module github.com/documentdb/documentdb-operator/test/longhaul + +go 1.25.11 + +require ( + github.com/documentdb/documentdb-operator v0.0.0-00010101000000-000000000000 + github.com/documentdb/documentdb-operator/test/shared v0.0.0-00010101000000-000000000000 + github.com/onsi/ginkgo/v2 v2.28.1 + github.com/onsi/gomega v1.39.1 + go.mongodb.org/mongo-driver/v2 v2.5.1 + k8s.io/api v0.35.0 + k8s.io/apimachinery v0.35.0 + k8s.io/client-go v0.35.0 + k8s.io/metrics v0.35.0 + sigs.k8s.io/controller-runtime v0.22.4 +) + +replace github.com/documentdb/documentdb-operator => ../../operator/src + +replace github.com/documentdb/documentdb-operator/test/shared => ../shared + +require ( + github.com/Masterminds/semver/v3 v3.4.0 // indirect + github.com/beorn7/perks v1.0.1 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect + github.com/cloudnative-pg/barman-cloud v0.4.1-0.20260108104508-ced266c145f5 // indirect + github.com/cloudnative-pg/cloudnative-pg v1.28.1 // indirect + github.com/cloudnative-pg/cnpg-i v0.3.1 // indirect + github.com/cloudnative-pg/machinery v0.3.3 // indirect + github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect + 
github.com/emicklei/go-restful/v3 v3.13.0 // indirect + github.com/evanphx/json-patch/v5 v5.9.11 // indirect + github.com/fsnotify/fsnotify v1.9.0 // indirect + github.com/fxamacker/cbor/v2 v2.9.0 // indirect + github.com/go-logr/logr v1.4.3 // indirect + github.com/go-logr/zapr v1.3.0 // indirect + github.com/go-openapi/jsonpointer v0.22.4 // indirect + github.com/go-openapi/jsonreference v0.21.4 // indirect + github.com/go-openapi/swag v0.25.4 // indirect + github.com/go-openapi/swag/cmdutils v0.25.4 // indirect + github.com/go-openapi/swag/conv v0.25.4 // indirect + github.com/go-openapi/swag/fileutils v0.25.4 // indirect + github.com/go-openapi/swag/jsonname v0.25.4 // indirect + github.com/go-openapi/swag/jsonutils v0.25.4 // indirect + github.com/go-openapi/swag/loading v0.25.4 // indirect + github.com/go-openapi/swag/mangling v0.25.4 // indirect + github.com/go-openapi/swag/netutils v0.25.4 // indirect + github.com/go-openapi/swag/stringutils v0.25.4 // indirect + github.com/go-openapi/swag/typeutils v0.25.4 // indirect + github.com/go-openapi/swag/yamlutils v0.25.4 // indirect + github.com/go-task/slim-sprig/v3 v3.0.0 // indirect + github.com/google/btree v1.1.3 // indirect + github.com/google/gnostic-models v0.7.1 // indirect + github.com/google/go-cmp v0.7.0 // indirect + github.com/google/pprof v0.0.0-20260115054156-294ebfa9ad83 // indirect + github.com/google/uuid v1.6.0 // indirect + github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 // indirect + github.com/json-iterator/go v1.1.12 // indirect + github.com/klauspost/compress v1.18.0 // indirect + github.com/kubernetes-csi/external-snapshotter/client/v8 v8.4.0 // indirect + github.com/lib/pq v1.11.1 // indirect + github.com/moby/spdystream v0.5.1 // indirect + github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect + github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect + github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // 
indirect + github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect + github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect + github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.87.1 // indirect + github.com/prometheus/client_golang v1.23.2 // indirect + github.com/prometheus/client_model v0.6.2 // indirect + github.com/prometheus/common v0.67.4 // indirect + github.com/prometheus/procfs v0.19.2 // indirect + github.com/robfig/cron v1.2.0 // indirect + github.com/spf13/pflag v1.0.10 // indirect + github.com/x448/float16 v0.8.4 // indirect + github.com/xdg-go/pbkdf2 v1.0.0 // indirect + github.com/xdg-go/scram v1.2.0 // indirect + github.com/xdg-go/stringprep v1.0.4 // indirect + github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78 // indirect + go.uber.org/multierr v1.11.0 // indirect + go.uber.org/zap v1.27.1 // indirect + go.yaml.in/yaml/v2 v2.4.3 // indirect + go.yaml.in/yaml/v3 v3.0.4 // indirect + golang.org/x/crypto v0.51.0 // indirect + golang.org/x/mod v0.35.0 // indirect + golang.org/x/net v0.55.0 // indirect + golang.org/x/oauth2 v0.34.0 // indirect + golang.org/x/sync v0.20.0 // indirect + golang.org/x/sys v0.45.0 // indirect + golang.org/x/term v0.43.0 // indirect + golang.org/x/text v0.37.0 // indirect + golang.org/x/time v0.14.0 // indirect + golang.org/x/tools v0.44.0 // indirect + gomodules.xyz/jsonpatch/v2 v2.5.0 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217 // indirect + google.golang.org/grpc v1.79.3 // indirect + google.golang.org/protobuf v1.36.12-0.20260120151049-f2248ac996af // indirect + gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect + gopkg.in/inf.v0 v0.9.1 // indirect + k8s.io/apiextensions-apiserver v0.35.0 // indirect + k8s.io/klog/v2 v2.140.0 // indirect + k8s.io/kube-openapi v0.0.0-20260317180543-43fb72c5454a // indirect + k8s.io/utils v0.0.0-20260210185600-b8788abfbbc2 // indirect + sigs.k8s.io/json 
v0.0.0-20250730193827-2d320260d730 // indirect + sigs.k8s.io/randfill v1.0.0 // indirect + sigs.k8s.io/structured-merge-diff/v6 v6.3.2 // indirect + sigs.k8s.io/yaml v1.6.0 // indirect +) diff --git a/test/longhaul/go.sum b/test/longhaul/go.sum new file mode 100644 index 000000000..75c925d75 --- /dev/null +++ b/test/longhaul/go.sum @@ -0,0 +1,293 @@ +github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0= +github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= +github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= +github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= +github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= +github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/cloudnative-pg/barman-cloud v0.4.1-0.20260108104508-ced266c145f5 h1:wPB7VTNgTv6t9sl4QYOBakmVTqHnOdKUht7Q3aL+uns= +github.com/cloudnative-pg/barman-cloud v0.4.1-0.20260108104508-ced266c145f5/go.mod h1:qD0NtJOllNQbRB0MaleuHsZjFYaXtXfdg0HbFTbuHn0= +github.com/cloudnative-pg/cloudnative-pg v1.28.1 h1:HdOUWgFhta558uHfXeO/199qCApxaj5yi05x6nWNmgs= +github.com/cloudnative-pg/cloudnative-pg v1.28.1/go.mod h1:yhRa4GqJAjNd0tT9AiRgk1KdqLhMjo/JmGGoASRl2CU= +github.com/cloudnative-pg/cnpg-i v0.3.1 h1:fKj8NoToWI11HUL2UWYJBpkVzmaTvbs3kDMo7wQF8RU= +github.com/cloudnative-pg/cnpg-i v0.3.1/go.mod h1:glRDiJLJY51FY8ScJIv/OkaGJxFnojJkkNAqSy5XC6s= +github.com/cloudnative-pg/machinery v0.3.3 h1:CaqXqLTJH9RrVv3R/YU0NmFaI/F18HLg2JfH3mQLcDk= +github.com/cloudnative-pg/machinery v0.3.3/go.mod h1:RYAYlVKBF5pH4mg+Q8wHjNDyENV9ajbkG41zOEf8DEs= +github.com/davecgh/go-spew v1.1.0/go.mod 
h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/emicklei/go-restful/v3 v3.13.0 h1:C4Bl2xDndpU6nJ4bc1jXd+uTmYPVUwkD6bFY/oTyCes= +github.com/emicklei/go-restful/v3 v3.13.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= +github.com/evanphx/json-patch v5.6.0+incompatible h1:jBYDEEiFBPxA0v50tFdvOzQQTCvpL6mnFh5mB2/l16U= +github.com/evanphx/json-patch v5.6.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= +github.com/evanphx/json-patch/v5 v5.9.11 h1:/8HVnzMq13/3x9TPvjG08wUGqBTmZBsCWzjTM0wiaDU= +github.com/evanphx/json-patch/v5 v5.9.11/go.mod h1:3j+LviiESTElxA4p3EMKAB9HXj3/XEtnUf6OZxqIQTM= +github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= +github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= +github.com/fxamacker/cbor/v2 v2.9.0 h1:NpKPmjDBgUfBms6tr6JZkTHtfFGcMKsw3eGcmD/sapM= +github.com/fxamacker/cbor/v2 v2.9.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ= +github.com/gkampitakis/ciinfo v0.3.2 h1:JcuOPk8ZU7nZQjdUhctuhQofk7BGHuIy0c9Ez8BNhXs= +github.com/gkampitakis/ciinfo v0.3.2/go.mod h1:1NIwaOcFChN4fa/B0hEBdAb6npDlFL8Bwx4dfRLRqAo= +github.com/gkampitakis/go-diff v1.3.2 h1:Qyn0J9XJSDTgnsgHRdz9Zp24RaJeKMUHg2+PDZZdC4M= +github.com/gkampitakis/go-diff v1.3.2/go.mod h1:LLgOrpqleQe26cte8s36HTWcTmMEur6OPYerdAAS9tk= +github.com/gkampitakis/go-snaps v0.5.15 h1:amyJrvM1D33cPHwVrjo9jQxX8g/7E2wYdZ+01KS3zGE= +github.com/gkampitakis/go-snaps v0.5.15/go.mod h1:HNpx/9GoKisdhw9AFOBT1N7DBs9DiHo/hGheFGBZ+mc= +github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= +github.com/go-logr/logr v1.4.3/go.mod 
h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ= +github.com/go-logr/zapr v1.3.0/go.mod h1:YKepepNBd1u/oyhd/yQmtjVXmm9uML4IXUgMOwR8/Gg= +github.com/go-openapi/jsonpointer v0.22.4 h1:dZtK82WlNpVLDW2jlA1YCiVJFVqkED1MegOUy9kR5T4= +github.com/go-openapi/jsonpointer v0.22.4/go.mod h1:elX9+UgznpFhgBuaMQ7iu4lvvX1nvNsesQ3oxmYTw80= +github.com/go-openapi/jsonreference v0.21.4 h1:24qaE2y9bx/q3uRK/qN+TDwbok1NhbSmGjjySRCHtC8= +github.com/go-openapi/jsonreference v0.21.4/go.mod h1:rIENPTjDbLpzQmQWCj5kKj3ZlmEh+EFVbz3RTUh30/4= +github.com/go-openapi/swag v0.25.4 h1:OyUPUFYDPDBMkqyxOTkqDYFnrhuhi9NR6QVUvIochMU= +github.com/go-openapi/swag v0.25.4/go.mod h1:zNfJ9WZABGHCFg2RnY0S4IOkAcVTzJ6z2Bi+Q4i6qFQ= +github.com/go-openapi/swag/cmdutils v0.25.4 h1:8rYhB5n6WawR192/BfUu2iVlxqVR9aRgGJP6WaBoW+4= +github.com/go-openapi/swag/cmdutils v0.25.4/go.mod h1:pdae/AFo6WxLl5L0rq87eRzVPm/XRHM3MoYgRMvG4A0= +github.com/go-openapi/swag/conv v0.25.4 h1:/Dd7p0LZXczgUcC/Ikm1+YqVzkEeCc9LnOWjfkpkfe4= +github.com/go-openapi/swag/conv v0.25.4/go.mod h1:3LXfie/lwoAv0NHoEuY1hjoFAYkvlqI/Bn5EQDD3PPU= +github.com/go-openapi/swag/fileutils v0.25.4 h1:2oI0XNW5y6UWZTC7vAxC8hmsK/tOkWXHJQH4lKjqw+Y= +github.com/go-openapi/swag/fileutils v0.25.4/go.mod h1:cdOT/PKbwcysVQ9Tpr0q20lQKH7MGhOEb6EwmHOirUk= +github.com/go-openapi/swag/jsonname v0.25.4 h1:bZH0+MsS03MbnwBXYhuTttMOqk+5KcQ9869Vye1bNHI= +github.com/go-openapi/swag/jsonname v0.25.4/go.mod h1:GPVEk9CWVhNvWhZgrnvRA6utbAltopbKwDu8mXNUMag= +github.com/go-openapi/swag/jsonutils v0.25.4 h1:VSchfbGhD4UTf4vCdR2F4TLBdLwHyUDTd1/q4i+jGZA= +github.com/go-openapi/swag/jsonutils v0.25.4/go.mod h1:7OYGXpvVFPn4PpaSdPHJBtF0iGnbEaTk8AvBkoWnaAY= +github.com/go-openapi/swag/jsonutils/fixtures_test v0.25.4 
h1:IACsSvBhiNJwlDix7wq39SS2Fh7lUOCJRmx/4SN4sVo= +github.com/go-openapi/swag/jsonutils/fixtures_test v0.25.4/go.mod h1:Mt0Ost9l3cUzVv4OEZG+WSeoHwjWLnarzMePNDAOBiM= +github.com/go-openapi/swag/loading v0.25.4 h1:jN4MvLj0X6yhCDduRsxDDw1aHe+ZWoLjW+9ZQWIKn2s= +github.com/go-openapi/swag/loading v0.25.4/go.mod h1:rpUM1ZiyEP9+mNLIQUdMiD7dCETXvkkC30z53i+ftTE= +github.com/go-openapi/swag/mangling v0.25.4 h1:2b9kBJk9JvPgxr36V23FxJLdwBrpijI26Bx5JH4Hp48= +github.com/go-openapi/swag/mangling v0.25.4/go.mod h1:6dxwu6QyORHpIIApsdZgb6wBk/DPU15MdyYj/ikn0Hg= +github.com/go-openapi/swag/netutils v0.25.4 h1:Gqe6K71bGRb3ZQLusdI8p/y1KLgV4M/k+/HzVSqT8H0= +github.com/go-openapi/swag/netutils v0.25.4/go.mod h1:m2W8dtdaoX7oj9rEttLyTeEFFEBvnAx9qHd5nJEBzYg= +github.com/go-openapi/swag/stringutils v0.25.4 h1:O6dU1Rd8bej4HPA3/CLPciNBBDwZj9HiEpdVsb8B5A8= +github.com/go-openapi/swag/stringutils v0.25.4/go.mod h1:GTsRvhJW5xM5gkgiFe0fV3PUlFm0dr8vki6/VSRaZK0= +github.com/go-openapi/swag/typeutils v0.25.4 h1:1/fbZOUN472NTc39zpa+YGHn3jzHWhv42wAJSN91wRw= +github.com/go-openapi/swag/typeutils v0.25.4/go.mod h1:Ou7g//Wx8tTLS9vG0UmzfCsjZjKhpjxayRKTHXf2pTE= +github.com/go-openapi/swag/yamlutils v0.25.4 h1:6jdaeSItEUb7ioS9lFoCZ65Cne1/RZtPBZ9A56h92Sw= +github.com/go-openapi/swag/yamlutils v0.25.4/go.mod h1:MNzq1ulQu+yd8Kl7wPOut/YHAAU/H6hL91fF+E2RFwc= +github.com/go-openapi/testify/enable/yaml/v2 v2.0.2 h1:0+Y41Pz1NkbTHz8NngxTuAXxEodtNSI1WG1c/m5Akw4= +github.com/go-openapi/testify/enable/yaml/v2 v2.0.2/go.mod h1:kme83333GCtJQHXQ8UKX3IBZu6z8T5Dvy5+CW3NLUUg= +github.com/go-openapi/testify/v2 v2.0.2 h1:X999g3jeLcoY8qctY/c/Z8iBHTbwLz7R2WXd6Ub6wls= +github.com/go-openapi/testify/v2 v2.0.2/go.mod h1:HCPmvFFnheKK2BuwSA0TbbdxJ3I16pjwMkYkP4Ywn54= +github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= +github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= +github.com/goccy/go-yaml v1.18.0 h1:8W7wMFS12Pcas7KU+VVkaiCng+kG8QiFeFwzFb+rwuw= 
+github.com/goccy/go-yaml v1.18.0/go.mod h1:XBurs7gK8ATbW4ZPGKgcbrY1Br56PdM69F7LkFRi1kA= +github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= +github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= +github.com/google/btree v1.1.3 h1:CVpQJjYgC4VbzxeGVHfvZrv1ctoYCAI8vbl07Fcxlyg= +github.com/google/btree v1.1.3/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4= +github.com/google/gnostic-models v0.7.1 h1:SisTfuFKJSKM5CPZkffwi6coztzzeYUhc3v4yxLWH8c= +github.com/google/gnostic-models v0.7.1/go.mod h1:whL5G0m6dmc5cPxKc5bdKdEN3UjI7OUGxBlw57miDrQ= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= +github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= +github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/pprof v0.0.0-20260115054156-294ebfa9ad83 h1:z2ogiKUYzX5Is6zr/vP9vJGqPwcdqsWjOt+V8J7+bTc= +github.com/google/pprof v0.0.0-20260115054156-294ebfa9ad83/go.mod h1:MxpfABSjhmINe3F1It9d+8exIHFvUqtLIRCdOGNXqiI= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 h1:JeSE6pjso5THxAzdVpqr6/geYxZytqFMBCOtn/ujyeo= +github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674/go.mod h1:r4w70xmWCQKmi1ONH4KIaBptdivuRPyosB9RmPlGEwA= +github.com/joshdk/go-junit v1.0.0 h1:S86cUKIdwBHWwA6xCmFlf3RTLfVXYQfvanM5Uh+K6GE= +github.com/joshdk/go-junit v1.0.0/go.mod h1:TiiV0PqkaNfFXjEiyjWM3XXrhVyCa1K4Zfga6W52ung= +github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= +github.com/json-iterator/go v1.1.12/go.mod 
h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= +github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= +github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= +github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/kubernetes-csi/external-snapshotter/client/v8 v8.4.0 h1:bMqrb3UHgHbP+PW9VwiejfDJU1R0PpXVZNMdeH8WYKI= +github.com/kubernetes-csi/external-snapshotter/client/v8 v8.4.0/go.mod h1:E3vdYxHj2C2q6qo8/Da4g7P+IcwqRZyy3gJBzYybV9Y= +github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= +github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= +github.com/lib/pq v1.11.1 h1:wuChtj2hfsGmmx3nf1m7xC2XpK6OtelS2shMY+bGMtI= +github.com/lib/pq v1.11.1/go.mod h1:/p+8NSbOcwzAEI7wiMXFlgydTwcgTr3OSKMsD2BitpA= +github.com/maruel/natural v1.1.1 h1:Hja7XhhmvEFhcByqDoHz9QZbkWey+COd9xWfCfn1ioo= +github.com/maruel/natural v1.1.1/go.mod h1:v+Rfd79xlw1AgVBjbO0BEQmptqb5HvL/k9GRHB7ZKEg= +github.com/mfridman/tparse v0.18.0 h1:wh6dzOKaIwkUGyKgOntDW4liXSo37qg5AXbIhkMV3vE= +github.com/mfridman/tparse v0.18.0/go.mod h1:gEvqZTuCgEhPbYk/2lS3Kcxg1GmTxxU7kTC8DvP0i/A= +github.com/moby/spdystream v0.5.1 h1:9sNYeYZUcci9R6/w7KDaFWEWeV4LStVG78Mpyq/Zm/Y= +github.com/moby/spdystream v0.5.1/go.mod h1:xBAYlnt/ay+11ShkdFKNAG7LsyK/tmNBVvVOwrfMgdI= +github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= 
+github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee h1:W5t00kpgFdJifH4BDsTlE89Zl93FEloxaWZfGcifgq8= +github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f h1:y5//uYreIhSUg3J1GEMiLbxo1LJaP8RfCpH6pymGZus= +github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw= +github.com/onsi/ginkgo/v2 v2.28.1 h1:S4hj+HbZp40fNKuLUQOYLDgZLwNUVn19N3Atb98NCyI= +github.com/onsi/ginkgo/v2 v2.28.1/go.mod h1:CLtbVInNckU3/+gC8LzkGUb9oF+e8W8TdUsxPwvdOgE= +github.com/onsi/gomega v1.39.1 h1:1IJLAad4zjPn2PsnhH70V4DKRFlrCzGBNrNaru+Vf28= +github.com/onsi/gomega v1.39.1/go.mod h1:hL6yVALoTOxeWudERyfppUcZXjMwIMLnuSfruD2lcfg= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.87.1 h1:wyKanf+IFdbIqbDNYGt+f1dabLErLWtBaxd0KaAx4aM= +github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.87.1/go.mod h1:WHiLZmOWVop/MoYvRD58LfnPeyE+dcITby/jQjg83Hw= +github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= 
+github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= +github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= +github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= +github.com/prometheus/common v0.67.4 h1:yR3NqWO1/UyO1w2PhUvXlGQs/PtFmoveVO0KZ4+Lvsc= +github.com/prometheus/common v0.67.4/go.mod h1:gP0fq6YjjNCLssJCQp0yk4M8W6ikLURwkdd/YKtTbyI= +github.com/prometheus/procfs v0.19.2 h1:zUMhqEW66Ex7OXIiDkll3tl9a1ZdilUOd/F6ZXw4Vws= +github.com/prometheus/procfs v0.19.2/go.mod h1:M0aotyiemPhBCM0z5w87kL22CxfcH05ZpYlu+b4J7mw= +github.com/robfig/cron v1.2.0 h1:ZjScXvvxeQ63Dbyxy76Fj3AT3Ut0aKsyd2/tl3DTMuQ= +github.com/robfig/cron v1.2.0/go.mod h1:JGuDeoQd7Z6yL4zQhZ3OPEVHB7fL6Ka6skscFHfmt2k= +github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= +github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= +github.com/spf13/pflag v1.0.10 h1:4EBh2KAYBwaONj6b2Ye1GiHfwjqyROoF4RwYO+vPwFk= +github.com/spf13/pflag v1.0.10/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= +github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +github.com/thoas/go-funk v0.9.3 h1:7+nAEx3kn5ZJcnDm2Bh23N2yOtweO14bi//dvRtgLpw= +github.com/thoas/go-funk v0.9.3/go.mod h1:+IWnUfUmFO1+WVYQWQtIJHeRRdaIyyYglZN7xzUPe4Q= +github.com/tidwall/gjson v1.18.0 h1:FIDeeyB800efLX89e5a8Y0BNH+LOngJyGrIWxG2FKQY= +github.com/tidwall/gjson v1.18.0/go.mod 
h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= +github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA= +github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM= +github.com/tidwall/pretty v1.2.1 h1:qjsOFOWWQl+N3RsoF5/ssm1pHmJJwhjlSbZ51I6wMl4= +github.com/tidwall/pretty v1.2.1/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= +github.com/tidwall/sjson v1.2.5 h1:kLy8mja+1c9jlljvWTlSazM7cKDRfJuR/bOJhcY5NcY= +github.com/tidwall/sjson v1.2.5/go.mod h1:Fvgq9kS/6ociJEDnK0Fk1cpYF4FIW6ZF7LAe+6jwd28= +github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= +github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= +github.com/xdg-go/pbkdf2 v1.0.0 h1:Su7DPu48wXMwC3bs7MCNG+z4FhcyEuz5dlvchbq0B0c= +github.com/xdg-go/pbkdf2 v1.0.0/go.mod h1:jrpuAogTd400dnrH08LKmI/xc1MbPOebTwRqcT5RDeI= +github.com/xdg-go/scram v1.2.0 h1:bYKF2AEwG5rqd1BumT4gAnvwU/M9nBp2pTSxeZw7Wvs= +github.com/xdg-go/scram v1.2.0/go.mod h1:3dlrS0iBaWKYVt2ZfA4cj48umJZ+cAEbR6/SjLA88I8= +github.com/xdg-go/stringprep v1.0.4 h1:XLI/Ng3O1Atzq0oBs3TWm+5ZVgkq2aqdlvP9JtoZ6c8= +github.com/xdg-go/stringprep v1.0.4/go.mod h1:mPGuuIYwz7CmR2bT9j4GbQqutWS1zV24gijq1dTyGkM= +github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78 h1:ilQV1hzziu+LLM3zUTJ0trRztfwgjqKnBWNtSRkbmwM= +github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78/go.mod h1:aL8wCCfTfSfmXjznFBSZNN13rSJjlIOI1fUNAtF7rmI= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +go.mongodb.org/mongo-driver/v2 v2.5.1 h1:j2U/Qp+wvueSpqitLCSZPT/+ZpVc1xzuwdHWwl7d8ro= +go.mongodb.org/mongo-driver/v2 v2.5.1/go.mod h1:yOI9kBsufol30iFsl1slpdq1I0eHPzybRWdyYUs8K/0= +go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64= +go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= +go.opentelemetry.io/otel v1.40.0 h1:oA5YeOcpRTXq6NN7frwmwFR0Cn3RhTVZvXsP4duvCms= 
+go.opentelemetry.io/otel v1.40.0/go.mod h1:IMb+uXZUKkMXdPddhwAHm6UfOwJyh4ct1ybIlV14J0g= +go.opentelemetry.io/otel/metric v1.40.0 h1:rcZe317KPftE2rstWIBitCdVp89A2HqjkxR3c11+p9g= +go.opentelemetry.io/otel/metric v1.40.0/go.mod h1:ib/crwQH7N3r5kfiBZQbwrTge743UDc7DTFVZrrXnqc= +go.opentelemetry.io/otel/sdk v1.40.0 h1:KHW/jUzgo6wsPh9At46+h4upjtccTmuZCFAc9OJ71f8= +go.opentelemetry.io/otel/sdk v1.40.0/go.mod h1:Ph7EFdYvxq72Y8Li9q8KebuYUr2KoeyHx0DRMKrYBUE= +go.opentelemetry.io/otel/sdk/metric v1.39.0 h1:cXMVVFVgsIf2YL6QkRF4Urbr/aMInf+2WKg+sEJTtB8= +go.opentelemetry.io/otel/sdk/metric v1.39.0/go.mod h1:xq9HEVH7qeX69/JnwEfp6fVq5wosJsY1mt4lLfYdVew= +go.opentelemetry.io/otel/trace v1.40.0 h1:WA4etStDttCSYuhwvEa8OP8I5EWu24lkOzp+ZYblVjw= +go.opentelemetry.io/otel/trace v1.40.0/go.mod h1:zeAhriXecNGP/s2SEG3+Y8X9ujcJOTqQ5RgdEJcawiA= +go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= +go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= +go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= +go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= +go.uber.org/zap v1.27.1 h1:08RqriUEv8+ArZRYSTXy1LeBScaMpVSTBhCeaZYfMYc= +go.uber.org/zap v1.27.1/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= +go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0= +go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8= +go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= +go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.51.0 h1:IBPXwPfKxY7cWQZ38ZCIRPI50YLeevDLlLnyC5wRGTI= +golang.org/x/crypto v0.51.0/go.mod h1:8AdwkbraGNABw2kOX6YFPs3WM22XqI4EXEd8g+x7Oc8= 
+golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.35.0 h1:Ww1D637e6Pg+Zb2KrWfHQUnH2dQRLBQyAtpr/haaJeM= +golang.org/x/mod v0.35.0/go.mod h1:+GwiRhIInF8wPm+4AoT6L0FA1QWAad3OMdTRx4tFYlU= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.55.0 h1:bcvxaJn3e1U6InsFWt1JUq1aSjnRxLzT2rtD2KfkDF8= +golang.org/x/net v0.55.0/go.mod h1:L5U2KuzuOe1lY7Z+aWVIKK6qEeJXnXV9yzGA+WCHJww= +golang.org/x/oauth2 v0.34.0 h1:hqK/t4AKgbqWkdkcAeI8XLmbK+4m4G5YeQRrmiotGlw= +golang.org/x/oauth2 v0.34.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4= +golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.45.0 h1:dO4czNzziLiiXplLQgBCEpCvXQ3dnkn0SdaZSYdQ+FY= +golang.org/x/sys v0.45.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= +golang.org/x/term 
v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.43.0 h1:S4RLU2sB31O/NCl+zFN9Aru9A/Cq2aqKpTZJ6B+DwT4= +golang.org/x/term v0.43.0/go.mod h1:lrhlHNdQJHO+1qVYiHfFKVuVioJIheAc3fBSMFYEIsk= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ= +golang.org/x/text v0.37.0 h1:Cqjiwd9eSg8e0QAkyCaQTNHFIIzWtidPahFWR83rTrc= +golang.org/x/text v0.37.0/go.mod h1:a5sjxXGs9hsn/AJVwuElvCAo9v8QYLzvavO5z2PiM38= +golang.org/x/time v0.14.0 h1:MRx4UaLrDotUKUdCIqzPC48t1Y9hANFKIRpNx+Te8PI= +golang.org/x/time v0.14.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools v0.44.0 h1:UP4ajHPIcuMjT1GqzDWRlalUEoY+uzoZKnhOjbIPD2c= +golang.org/x/tools v0.44.0/go.mod h1:KA0AfVErSdxRZIsOVipbv3rQhVXTnlU6UhKxHd1seDI= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gomodules.xyz/jsonpatch/v2 v2.5.0 h1:JELs8RLM12qJGXU4u/TO3V25KW8GreMKl9pdkk14RM0= +gomodules.xyz/jsonpatch/v2 v2.5.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= +gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= +gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= +google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217 
h1:gRkg/vSppuSQoDjxyiGfN4Upv/h/DQmIR10ZU8dh4Ww= +google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217/go.mod h1:7i2o+ce6H/6BluujYR+kqX3GKH+dChPTQU19wjRPiGk= +google.golang.org/grpc v1.79.3 h1:sybAEdRIEtvcD68Gx7dmnwjZKlyfuc61Dyo9pGXXkKE= +google.golang.org/grpc v1.79.3/go.mod h1:KmT0Kjez+0dde/v2j9vzwoAScgEPx/Bw1CYChhHLrHQ= +google.golang.org/protobuf v1.36.12-0.20260120151049-f2248ac996af h1:+5/Sw3GsDNlEmu7TfklWKPdQ0Ykja5VEmq2i817+jbI= +google.golang.org/protobuf v1.36.12-0.20260120151049-f2248ac996af/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= +gopkg.in/evanphx/json-patch.v4 v4.13.0 h1:czT3CmqEaQ1aanPc5SdlgQrrEIb8w/wwCvWWnfEbYzo= +gopkg.in/evanphx/json-patch.v4 v4.13.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= +gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= +gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +k8s.io/api v0.35.0 h1:iBAU5LTyBI9vw3L5glmat1njFK34srdLmktWwLTprlY= +k8s.io/api v0.35.0/go.mod h1:AQ0SNTzm4ZAczM03QH42c7l3bih1TbAXYo0DkF8ktnA= +k8s.io/apiextensions-apiserver v0.35.0 h1:3xHk2rTOdWXXJM+RDQZJvdx0yEOgC0FgQ1PlJatA5T4= +k8s.io/apiextensions-apiserver v0.35.0/go.mod h1:E1Ahk9SADaLQ4qtzYFkwUqusXTcaV2uw3l14aqpL2LU= +k8s.io/apimachinery v0.35.0 h1:Z2L3IHvPVv/MJ7xRxHEtk6GoJElaAqDCCU0S6ncYok8= +k8s.io/apimachinery v0.35.0/go.mod h1:jQCgFZFR1F4Ik7hvr2g84RTJSZegBc8yHgFWKn//hns= +k8s.io/client-go v0.35.0 h1:IAW0ifFbfQQwQmga0UdoH0yvdqrbwMdq9vIFEhRpxBE= +k8s.io/client-go v0.35.0/go.mod 
h1:q2E5AAyqcbeLGPdoRB+Nxe3KYTfPce1Dnu1myQdqz9o= +k8s.io/klog/v2 v2.140.0 h1:Tf+J3AH7xnUzZyVVXhTgGhEKnFqye14aadWv7bzXdzc= +k8s.io/klog/v2 v2.140.0/go.mod h1:o+/RWfJ6PwpnFn7OyAG3QnO47BFsymfEfrz6XyYSSp0= +k8s.io/kube-openapi v0.0.0-20260317180543-43fb72c5454a h1:xCeOEAOoGYl2jnJoHkC3hkbPJgdATINPMAxaynU2Ovg= +k8s.io/kube-openapi v0.0.0-20260317180543-43fb72c5454a/go.mod h1:uGBT7iTA6c6MvqUvSXIaYZo9ukscABYi2btjhvgKGZ0= +k8s.io/metrics v0.35.0 h1:xVFoqtAGm2dMNJAcB5TFZJPCen0uEqqNt52wW7ABbX8= +k8s.io/metrics v0.35.0/go.mod h1:g2Up4dcBygZi2kQSEQVDByFs+VUwepJMzzQLJJLpq4M= +k8s.io/utils v0.0.0-20260210185600-b8788abfbbc2 h1:AZYQSJemyQB5eRxqcPky+/7EdBj0xi3g0ZcxxJ7vbWU= +k8s.io/utils v0.0.0-20260210185600-b8788abfbbc2/go.mod h1:xDxuJ0whA3d0I4mf/C4ppKHxXynQ+fxnkmQH0vTHnuk= +sigs.k8s.io/controller-runtime v0.22.4 h1:GEjV7KV3TY8e+tJ2LCTxUTanW4z/FmNB7l327UfMq9A= +sigs.k8s.io/controller-runtime v0.22.4/go.mod h1:+QX1XUpTXN4mLoblf4tqr5CQcyHPAki2HLXqQMY6vh8= +sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg= +sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= +sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU= +sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= +sigs.k8s.io/structured-merge-diff/v6 v6.3.2 h1:kwVWMx5yS1CrnFWA/2QHyRVJ8jM6dBA80uLmm0wJkk8= +sigs.k8s.io/structured-merge-diff/v6 v6.3.2/go.mod h1:M3W8sfWvn2HhQDIbGWj3S099YozAsymCo/wrT5ohRUE= +sigs.k8s.io/yaml v1.6.0 h1:G8fkbMSAFqgEFgh4b1wmtzDnioxFCUgTZhlbj5P9QYs= +sigs.k8s.io/yaml v1.6.0/go.mod h1:796bPqUfzR/0jLAl6XjHl3Ck7MiyVv8dbTdyT3/pMf4= diff --git a/test/longhaul/journal/journal.go b/test/longhaul/journal/journal.go new file mode 100644 index 000000000..019aff257 --- /dev/null +++ b/test/longhaul/journal/journal.go @@ -0,0 +1,207 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+ +// Package journal provides an append-only event log for tracking test execution, +// disruption windows, and significant state changes during long haul tests. +package journal + +import ( + "fmt" + "sync" + "time" +) + +// Level represents the severity of a journal event. +type Level string + +const ( + LevelInfo Level = "INFO" + LevelWarn Level = "WARN" + LevelError Level = "ERROR" +) + +// Event represents a single journal entry. +type Event struct { + Timestamp time.Time + Level Level + Component string + Message string + Metadata map[string]string +} + +// String returns a human-readable representation of the event. +func (e Event) String() string { + return fmt.Sprintf("[%s] %s %s: %s", + e.Timestamp.Format("15:04:05"), e.Level, e.Component, e.Message) +} + +// Journal is a thread-safe, append-only event log. +type Journal struct { + mu sync.RWMutex + events []Event + + // Active disruption window (nil if none). + activeWindow *DisruptionWindow + + // All closed disruption windows. + closedWindows []DisruptionWindow +} + +// New creates a new empty Journal. +func New() *Journal { + return &Journal{ + events: make([]Event, 0, 256), + } +} + +// Record appends a new event to the journal. +func (j *Journal) Record(level Level, component, message string, metadata map[string]string) { + e := Event{ + Timestamp: time.Now(), + Level: level, + Component: component, + Message: message, + Metadata: metadata, + } + j.mu.Lock() + j.events = append(j.events, e) + j.mu.Unlock() +} + +// Info records an info-level event. +func (j *Journal) Info(component, message string) { + j.Record(LevelInfo, component, message, nil) +} + +// Warn records a warn-level event. +func (j *Journal) Warn(component, message string) { + j.Record(LevelWarn, component, message, nil) +} + +// Error records an error-level event. 
+func (j *Journal) Error(component, message string) { + j.Record(LevelError, component, message, nil) +} + +// OpenDisruptionWindow starts tracking a new disruption period. +func (j *Journal) OpenDisruptionWindow(operationName string, policy OutagePolicy) { + j.mu.Lock() + defer j.mu.Unlock() + + // Close any existing window first. + if j.activeWindow != nil { + j.activeWindow.EndTime = time.Now() + j.closedWindows = append(j.closedWindows, *j.activeWindow) + } + + j.activeWindow = &DisruptionWindow{ + OperationName: operationName, + StartTime: time.Now(), + Policy: policy, + } + + j.events = append(j.events, Event{ + Timestamp: time.Now(), + Level: LevelWarn, + Component: "journal", + Message: fmt.Sprintf("disruption window opened: %s", operationName), + }) +} + +// CloseDisruptionWindow ends the active disruption period. +func (j *Journal) CloseDisruptionWindow() { + j.mu.Lock() + defer j.mu.Unlock() + + if j.activeWindow == nil { + return + } + + j.activeWindow.EndTime = time.Now() + j.closedWindows = append(j.closedWindows, *j.activeWindow) + + j.events = append(j.events, Event{ + Timestamp: time.Now(), + Level: LevelInfo, + Component: "journal", + Message: fmt.Sprintf("disruption window closed: %s (duration: %s)", + j.activeWindow.OperationName, j.activeWindow.Duration()), + }) + + j.activeWindow = nil +} + +// RecordWriteFailure increments the failure count for the active disruption window. +func (j *Journal) RecordWriteFailure() { + j.mu.Lock() + defer j.mu.Unlock() + if j.activeWindow != nil { + j.activeWindow.WriteFailures++ + } +} + +// ActiveWindow returns the current disruption window, or nil if none is active. +func (j *Journal) ActiveWindow() *DisruptionWindow { + j.mu.RLock() + defer j.mu.RUnlock() + if j.activeWindow == nil { + return nil + } + // Return a copy to avoid data races. + w := *j.activeWindow + return &w +} + +// HasPolicyViolation returns true if any disruption window exceeded its policy. 
+func (j *Journal) HasPolicyViolation() bool { + j.mu.RLock() + defer j.mu.RUnlock() + + if j.activeWindow != nil && j.activeWindow.ExceededPolicy() { + return true + } + for i := range j.closedWindows { + if j.closedWindows[i].ExceededPolicy() { + return true + } + } + return false +} + +// Events returns a copy of all events recorded so far. +func (j *Journal) Events() []Event { + j.mu.RLock() + defer j.mu.RUnlock() + result := make([]Event, len(j.events)) + copy(result, j.events) + return result +} + +// EventsSince returns events recorded after the given time. +func (j *Journal) EventsSince(t time.Time) []Event { + j.mu.RLock() + defer j.mu.RUnlock() + var result []Event + for _, e := range j.events { + if e.Timestamp.After(t) { + result = append(result, e) + } + } + return result +} + +// Len returns the number of events in the journal. +func (j *Journal) Len() int { + j.mu.RLock() + defer j.mu.RUnlock() + return len(j.events) +} + +// DisruptionWindows returns all closed disruption windows. +func (j *Journal) DisruptionWindows() []DisruptionWindow { + j.mu.RLock() + defer j.mu.RUnlock() + result := make([]DisruptionWindow, len(j.closedWindows)) + copy(result, j.closedWindows) + return result +} diff --git a/test/longhaul/journal/journal_test.go b/test/longhaul/journal/journal_test.go new file mode 100644 index 000000000..2eb5c1ee5 --- /dev/null +++ b/test/longhaul/journal/journal_test.go @@ -0,0 +1,130 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package journal + +import ( + "sync" + "time" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +var _ = Describe("Journal", func() { + Describe("Record and Events", func() { + It("preserves levels and length", func() { + j := New() + j.Info("test", "first") + j.Warn("test", "second") + j.Error("test", "third") + + events := j.Events() + Expect(events).To(HaveLen(3)) + Expect(events[0].Level).To(Equal(LevelInfo)) + Expect(events[1].Level).To(Equal(LevelWarn)) + Expect(events[2].Level).To(Equal(LevelError)) + Expect(j.Len()).To(Equal(3)) + }) + + It("returns only events after a cutoff", func() { + j := New() + j.Info("test", "before") + cutoff := time.Now() + time.Sleep(2 * time.Millisecond) + j.Info("test", "after1") + j.Info("test", "after2") + + Expect(j.EventsSince(cutoff)).To(HaveLen(2)) + }) + }) + + Describe("DisruptionWindow lifecycle", func() { + It("opens, records failures, and closes correctly", func() { + j := New() + policy := OutagePolicy{MustRecoverWithin: time.Minute, AllowedWriteFailures: 10} + + Expect(j.ActiveWindow()).To(BeNil()) + + j.OpenDisruptionWindow("scale-up", policy) + w := j.ActiveWindow() + Expect(w).NotTo(BeNil()) + Expect(w.OperationName).To(Equal("scale-up")) + Expect(w.IsActive()).To(BeTrue()) + + j.RecordWriteFailure() + j.RecordWriteFailure() + j.RecordWriteFailure() + Expect(j.ActiveWindow().WriteFailures).To(Equal(int64(3))) + + j.CloseDisruptionWindow() + Expect(j.ActiveWindow()).To(BeNil()) + closed := j.DisruptionWindows() + Expect(closed).To(HaveLen(1)) + Expect(closed[0].WriteFailures).To(Equal(int64(3))) + Expect(closed[0].IsActive()).To(BeFalse()) + }) + + It("opening a new window closes the previous active window", func() { + j := New() + j.OpenDisruptionWindow("op1", DefaultOutagePolicy()) + j.OpenDisruptionWindow("op2", DefaultOutagePolicy()) + Expect(j.ActiveWindow().OperationName).To(Equal("op2")) + closed := j.DisruptionWindows() + Expect(closed).To(HaveLen(1)) + Expect(closed[0].OperationName).To(Equal("op1")) + }) + + It("RecordWriteFailure without an active window is a no-op", 
// OutagePolicy describes how much disruption an operation is allowed to cause.
type OutagePolicy struct {
	// AllowedDowntime is the maximum duration of write unavailability.
	AllowedDowntime time.Duration

	// AllowedWriteFailures is the maximum number of write failures during the window.
	AllowedWriteFailures int64

	// MustRecoverWithin is the maximum time from operation start to full recovery.
	MustRecoverWithin time.Duration
}

// DefaultOutagePolicy returns a conservative policy suitable for most
// operations: 60s of downtime, 50 write failures, recovery within 5 minutes.
func DefaultOutagePolicy() OutagePolicy {
	var p OutagePolicy
	p.AllowedDowntime = 60 * time.Second
	p.AllowedWriteFailures = 50
	p.MustRecoverWithin = 5 * time.Minute
	return p
}

// DisruptionWindow represents an active or closed disruption period.
type DisruptionWindow struct {
	// OperationName identifies which operation opened this window.
	OperationName string

	// StartTime is when the disruption began.
	StartTime time.Time

	// EndTime is when the disruption ended. Zero means still active.
	EndTime time.Time

	// Policy is the outage budget for this window.
	Policy OutagePolicy

	// WriteFailures counts failures observed during this window.
	WriteFailures int64
}

// IsActive reports whether the window is still open, i.e. EndTime is unset.
func (w *DisruptionWindow) IsActive() bool {
	return w.EndTime.IsZero()
}

// Duration returns how long the window lasted. A closed window yields
// EndTime-StartTime; an active one yields the time elapsed since StartTime.
func (w *DisruptionWindow) Duration() time.Duration {
	if !w.IsActive() {
		return w.EndTime.Sub(w.StartTime)
	}
	return time.Since(w.StartTime)
}

// ExceededPolicy reports whether the window has violated its outage policy,
// either by lasting longer than MustRecoverWithin or by accumulating more
// than AllowedWriteFailures write failures.
//
// TODO(longhaul, #220): also enforce Policy.AllowedDowntime here. Today
// AllowedDowntime is set by every operation's OutagePolicy() but never
// consulted — only the (always-set) MustRecoverWithin and AllowedWriteFailures
// budgets are actually checked. To enforce AllowedDowntime, the journal needs
// to start tracking the actual write-unavailable interval inside the window
// (e.g., longest contiguous run of write failures or first-failure to
// first-recovery). That requires changes in writer.go to feed per-write
// timestamps into the active window, so it's a separate change.
func (w *DisruptionWindow) ExceededPolicy() bool {
	budget := w.Policy
	tookTooLong := w.Duration() > budget.MustRecoverWithin
	tooManyFailures := w.WriteFailures > budget.AllowedWriteFailures
	return tookTooLong || tooManyFailures
}
+ DisruptionWindow{ + StartTime: time.Now().Add(-10 * time.Second), + EndTime: time.Now(), + WriteFailures: 5, + Policy: OutagePolicy{MustRecoverWithin: time.Minute, AllowedWriteFailures: 50}, + }, false), + Entry("exceeds MustRecoverWithin", + DisruptionWindow{ + StartTime: time.Now().Add(-2 * time.Minute), + EndTime: time.Now(), + WriteFailures: 1, + Policy: OutagePolicy{MustRecoverWithin: time.Minute, AllowedWriteFailures: 50}, + }, true), + Entry("exceeds AllowedWriteFailures", + DisruptionWindow{ + StartTime: time.Now().Add(-10 * time.Second), + EndTime: time.Now(), + WriteFailures: 100, + Policy: OutagePolicy{MustRecoverWithin: time.Minute, AllowedWriteFailures: 50}, + }, true), + Entry("boundary: equal to write-failure budget is allowed", + DisruptionWindow{ + StartTime: time.Now().Add(-10 * time.Second), + EndTime: time.Now(), + WriteFailures: 50, + Policy: OutagePolicy{MustRecoverWithin: time.Minute, AllowedWriteFailures: 50}, + }, false), + Entry("active window also evaluated against MustRecoverWithin", + DisruptionWindow{ + StartTime: time.Now().Add(-2 * time.Minute), + Policy: OutagePolicy{MustRecoverWithin: time.Minute, AllowedWriteFailures: 50}, + }, true), + ) + + It("DefaultOutagePolicy returns no zero-valued field", func() { + p := DefaultOutagePolicy() + Expect(p.MustRecoverWithin).NotTo(BeZero()) + Expect(p.AllowedWriteFailures).NotTo(BeZero()) + Expect(p.AllowedDowntime).NotTo(BeZero()) + }) +}) diff --git a/test/longhaul/journal/suite_test.go b/test/longhaul/journal/suite_test.go new file mode 100644 index 000000000..52ac140d5 --- /dev/null +++ b/test/longhaul/journal/suite_test.go @@ -0,0 +1,16 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package journal + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +func TestJournal(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Long Haul Journal Suite") +} diff --git a/test/longhaul/monitor/health.go b/test/longhaul/monitor/health.go new file mode 100644 index 000000000..bc84bf284 --- /dev/null +++ b/test/longhaul/monitor/health.go @@ -0,0 +1,169 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package monitor provides health monitoring and resource leak detection +// for the target DocumentDB cluster during long haul tests. +package monitor + +import ( + "context" + "fmt" + "sync" + "time" + + "github.com/documentdb/documentdb-operator/test/longhaul/journal" +) + +const ( + // healthCheckInterval is how often the health monitor polls cluster state. + healthCheckInterval = 5 * time.Second +) + +// ClusterHealth represents the observed health of the cluster at a point in time. +type ClusterHealth struct { + Timestamp time.Time + AllPodsReady bool + ReadyPods int + TotalPods int + CRReady bool + RestartCount int32 +} + +// ClusterClient is the interface for querying cluster state. +// This allows testing with mocks instead of a real k8s client. +type ClusterClient interface { + // GetClusterHealth returns the current health of the target cluster. + GetClusterHealth(ctx context.Context) (ClusterHealth, error) + + // GetCurrentDocumentDBImageTag returns the tag portion of status.documentDBImage + // (e.g., "0.109.0" from "ghcr.io/.../documentdb:0.109.0"). + // Returns empty string if status not yet populated. + GetCurrentDocumentDBImageTag(ctx context.Context) (string, error) + + // GetInstancesPerNode returns spec.instancesPerNode (range 1-3). + // 1 means single-instance (no HA); >=2 means at least one standby exists. + GetInstancesPerNode(ctx context.Context) (int, error) + + // ScaleCluster sets the desired spec.instancesPerNode value (CRD range 1-3). 
+ ScaleCluster(ctx context.Context, instancesPerNode int) error + + // UpgradeDocumentDB patches spec.documentDBVersion and spec.schemaVersion="auto". + UpgradeDocumentDB(ctx context.Context, version string) error +} + +// HealthMonitor continuously monitors cluster health and tracks steady-state. +type HealthMonitor struct { + client ClusterClient + journal *journal.Journal + steadyStateWait time.Duration + + mu sync.RWMutex + lastHealth ClusterHealth + steadySince time.Time // time when cluster became healthy + healthySamples int +} + +// NewHealthMonitor creates a monitor that polls the cluster for health status. +func NewHealthMonitor(client ClusterClient, j *journal.Journal, steadyStateWait time.Duration) *HealthMonitor { + return &HealthMonitor{ + client: client, + journal: j, + steadyStateWait: steadyStateWait, + } +} + +// Run starts the health monitoring loop. Blocks until context is cancelled. +func (h *HealthMonitor) Run(ctx context.Context) { + h.journal.Info("health", "health monitor started") + defer h.journal.Info("health", "health monitor stopped") + + ticker := time.NewTicker(healthCheckInterval) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + h.check(ctx) + } + } +} + +func (h *HealthMonitor) check(ctx context.Context) { + health, err := h.client.GetClusterHealth(ctx) + if err != nil { + h.journal.Warn("health", fmt.Sprintf("health check failed: %v", err)) + h.mu.Lock() + h.steadySince = time.Time{} + h.mu.Unlock() + return + } + + h.mu.Lock() + defer h.mu.Unlock() + + prev := h.lastHealth + h.lastHealth = health + + isHealthy := health.AllPodsReady && health.CRReady + + if isHealthy { + if h.steadySince.IsZero() { + h.steadySince = time.Now() + } + h.healthySamples++ + } else { + if !h.steadySince.IsZero() { + h.journal.Warn("health", fmt.Sprintf( + "cluster lost steady state: pods=%d/%d cr_ready=%v", + health.ReadyPods, health.TotalPods, health.CRReady)) + } + h.steadySince = time.Time{} + 
h.healthySamples = 0 + } + + // Log transitions. + if prev.AllPodsReady && !health.AllPodsReady { + h.journal.Warn("health", fmt.Sprintf("pods degraded: %d/%d ready", + health.ReadyPods, health.TotalPods)) + } else if !prev.AllPodsReady && health.AllPodsReady { + h.journal.Info("health", "all pods ready") + } +} + +// IsSteadyState returns true if the cluster has been continuously healthy +// for at least the configured steady-state duration. +func (h *HealthMonitor) IsSteadyState() bool { + h.mu.RLock() + defer h.mu.RUnlock() + + if h.steadySince.IsZero() { + return false + } + return time.Since(h.steadySince) >= h.steadyStateWait +} + +// LastHealth returns the most recent health observation. +func (h *HealthMonitor) LastHealth() ClusterHealth { + h.mu.RLock() + defer h.mu.RUnlock() + return h.lastHealth +} + +// WaitForSteadyState blocks until the cluster reaches steady state or context expires. +func (h *HealthMonitor) WaitForSteadyState(ctx context.Context) error { + ticker := time.NewTicker(1 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return fmt.Errorf("timed out waiting for steady state: %w", ctx.Err()) + case <-ticker.C: + if h.IsSteadyState() { + return nil + } + } + } +} diff --git a/test/longhaul/monitor/health_test.go b/test/longhaul/monitor/health_test.go new file mode 100644 index 000000000..1ad903fd9 --- /dev/null +++ b/test/longhaul/monitor/health_test.go @@ -0,0 +1,117 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package monitor + +import ( + "context" + "errors" + "sync" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/documentdb/documentdb-operator/test/longhaul/journal" +) + +// fakeClusterClient is a minimal ClusterClient stub for tests. 
+type fakeClusterClient struct { + mu sync.Mutex + health ClusterHealth + err error +} + +func (f *fakeClusterClient) setHealth(h ClusterHealth) { + f.mu.Lock() + defer f.mu.Unlock() + f.health = h + f.err = nil +} +func (f *fakeClusterClient) setErr(err error) { + f.mu.Lock() + defer f.mu.Unlock() + f.err = err +} +func (f *fakeClusterClient) GetClusterHealth(_ context.Context) (ClusterHealth, error) { + f.mu.Lock() + defer f.mu.Unlock() + return f.health, f.err +} +func (f *fakeClusterClient) GetCurrentDocumentDBImageTag(_ context.Context) (string, error) { + return "", nil +} +func (f *fakeClusterClient) GetInstancesPerNode(_ context.Context) (int, error) { return 1, nil } +func (f *fakeClusterClient) ScaleCluster(_ context.Context, _ int) error { return nil } +func (f *fakeClusterClient) UpgradeDocumentDB(_ context.Context, _ string) error { return nil } + +var _ = Describe("HealthMonitor", func() { + Describe("IsSteadyState", func() { + It("is false before any check", func() { + c := &fakeClusterClient{} + h := NewHealthMonitor(c, journal.New(), 100*time.Millisecond) + Expect(h.IsSteadyState()).To(BeFalse()) + }) + + It("becomes true after staying healthy beyond steadyStateWait", func() { + c := &fakeClusterClient{} + c.setHealth(ClusterHealth{AllPodsReady: true, CRReady: true, ReadyPods: 2, TotalPods: 2}) + h := NewHealthMonitor(c, journal.New(), 50*time.Millisecond) + + h.check(context.Background()) + Expect(h.IsSteadyState()).To(BeFalse(), "first healthy check sets steadySince but elapsed=0") + + time.Sleep(60 * time.Millisecond) + Expect(h.IsSteadyState()).To(BeTrue()) + }) + + It("resets to false when health is lost", func() { + c := &fakeClusterClient{} + c.setHealth(ClusterHealth{AllPodsReady: true, CRReady: true}) + h := NewHealthMonitor(c, journal.New(), 1*time.Millisecond) + h.check(context.Background()) + time.Sleep(2 * time.Millisecond) + Expect(h.IsSteadyState()).To(BeTrue()) + + c.setHealth(ClusterHealth{AllPodsReady: false, CRReady: false}) + 
h.check(context.Background()) + Expect(h.IsSteadyState()).To(BeFalse()) + }) + + It("resets to false on poll error", func() { + c := &fakeClusterClient{} + c.setHealth(ClusterHealth{AllPodsReady: true, CRReady: true}) + h := NewHealthMonitor(c, journal.New(), 1*time.Millisecond) + h.check(context.Background()) + time.Sleep(2 * time.Millisecond) + Expect(h.IsSteadyState()).To(BeTrue()) + + c.setErr(errors.New("apiserver unreachable")) + h.check(context.Background()) + Expect(h.IsSteadyState()).To(BeFalse()) + }) + }) + + Describe("WaitForSteadyState", func() { + It("returns an error when the context is cancelled", func() { + c := &fakeClusterClient{} + h := NewHealthMonitor(c, journal.New(), 10*time.Second) + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Millisecond) + defer cancel() + Expect(h.WaitForSteadyState(ctx)).To(HaveOccurred()) + }) + + It("returns nil once steady state is reached", func() { + c := &fakeClusterClient{} + c.setHealth(ClusterHealth{AllPodsReady: true, CRReady: true}) + h := NewHealthMonitor(c, journal.New(), 1*time.Millisecond) + + h.check(context.Background()) + time.Sleep(5 * time.Millisecond) + + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + Expect(h.WaitForSteadyState(ctx)).To(Succeed()) + }) + }) +}) diff --git a/test/longhaul/monitor/k8sclient.go b/test/longhaul/monitor/k8sclient.go new file mode 100644 index 000000000..1e87ff184 --- /dev/null +++ b/test/longhaul/monitor/k8sclient.go @@ -0,0 +1,267 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 

package monitor

import (
	"context"
	"fmt"
	"log"
	"strings"
	"time"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
	"k8s.io/client-go/tools/clientcmd"
	metricsv "k8s.io/metrics/pkg/client/clientset/versioned"
	ctrlclient "sigs.k8s.io/controller-runtime/pkg/client"

	previewv1 "github.com/documentdb/documentdb-operator/api/preview"
	shareddb "github.com/documentdb/documentdb-operator/test/shared/documentdb"
)

// PodMetrics holds resource usage for a single pod, summed over all of the
// pod's containers.
type PodMetrics struct {
	Name     string  // pod name as reported by the metrics API
	MemoryMB float64 // memory usage in MiB (bytes / 1024 / 1024)
	CPUCores float64 // CPU usage in cores (millicores / 1000)
}

// K8sClusterClient implements ClusterClient using real Kubernetes API calls.
type K8sClusterClient struct {
	clientset     kubernetes.Interface // core API access (pod listing)
	crClient      ctrlclient.Client    // typed access to the DocumentDB CR
	metricsClient metricsv.Interface   // nil when the metrics client could not be created
	namespace     string               // namespace of the DocumentDB CR and its pods
	clusterName   string               // CR name; also the value of the cnpg.io/cluster pod label
	metricsAvail  bool                 // cleared by GetPodMetrics once a metrics query fails
}

// K8sClientConfig holds configuration for creating a K8sClusterClient.
type K8sClientConfig struct {
	Namespace   string
	ClusterName string
	// Kubeconfig is optional and only consulted when in-cluster config is
	// unavailable; when empty, clientcmd.RecommendedHomeFile is used.
	Kubeconfig string
}

// NewK8sClusterClient creates a real Kubernetes cluster client.
// It first attempts in-cluster config; if that is unavailable it loads
// cfg.Kubeconfig, or clientcmd.RecommendedHomeFile when that is empty.
+func NewK8sClusterClient(cfg K8sClientConfig) (*K8sClusterClient, error) { + restConfig, err := buildRestConfig(cfg.Kubeconfig) + if err != nil { + return nil, fmt.Errorf("failed to build rest config: %w", err) + } + + clientset, err := kubernetes.NewForConfig(restConfig) + if err != nil { + return nil, fmt.Errorf("failed to create clientset: %w", err) + } + + scheme := runtime.NewScheme() + if err := previewv1.AddToScheme(scheme); err != nil { + return nil, fmt.Errorf("failed to add previewv1 to scheme: %w", err) + } + crClient, err := ctrlclient.New(restConfig, ctrlclient.Options{Scheme: scheme}) + if err != nil { + return nil, fmt.Errorf("failed to create controller-runtime client: %w", err) + } + + // Try to create metrics client (graceful fallback). + metricsClient, metricsAvail := tryMetricsClient(restConfig) + + return &K8sClusterClient{ + clientset: clientset, + crClient: crClient, + metricsClient: metricsClient, + namespace: cfg.Namespace, + clusterName: cfg.ClusterName, + metricsAvail: metricsAvail, + }, nil +} + +func buildRestConfig(kubeconfig string) (*rest.Config, error) { + // Try in-cluster first. + config, err := rest.InClusterConfig() + if err == nil { + return config, nil + } + + // Fall back to kubeconfig. + if kubeconfig == "" { + kubeconfig = clientcmd.RecommendedHomeFile + } + return clientcmd.BuildConfigFromFlags("", kubeconfig) +} + +func tryMetricsClient(config *rest.Config) (metricsv.Interface, bool) { + mc, err := metricsv.NewForConfig(config) + if err != nil { + log.Printf("[k8sclient] metrics client creation failed (leak detection disabled): %v", err) + return nil, false + } + return mc, true +} + +// GetClusterHealth queries pod status and CR status to determine cluster health. +func (k *K8sClusterClient) GetClusterHealth(ctx context.Context) (ClusterHealth, error) { + health := ClusterHealth{Timestamp: time.Now()} + + // List pods with the CNPG cluster label. 
+ labelSelector := fmt.Sprintf("cnpg.io/cluster=%s", k.clusterName) + pods, err := k.clientset.CoreV1().Pods(k.namespace).List(ctx, metav1.ListOptions{ + LabelSelector: labelSelector, + }) + if err != nil { + return health, fmt.Errorf("failed to list pods: %w", err) + } + + health.TotalPods = len(pods.Items) + var totalRestarts int32 + readyCount := 0 + + for i := range pods.Items { + pod := &pods.Items[i] + if isPodReady(pod) { + readyCount++ + } + for _, cs := range pod.Status.ContainerStatuses { + totalRestarts += cs.RestartCount + } + } + + health.ReadyPods = readyCount + health.AllPodsReady = readyCount == health.TotalPods && health.TotalPods > 0 + health.RestartCount = totalRestarts + + // Get the DocumentDB CR status via the shared typed helper. Using + // shareddb.IsHealthy keeps the readiness predicate consistent with + // the e2e suite (single source of truth for ReadyStatus). + dd, err := shareddb.Get(ctx, k.crClient, types.NamespacedName{Namespace: k.namespace, Name: k.clusterName}) + if err != nil { + return health, fmt.Errorf("failed to get DocumentDB CR: %w", err) + } + health.CRReady = shareddb.IsHealthy(dd) + + return health, nil +} + +// GetInstancesPerNode reads spec.instancesPerNode from the DocumentDB CR. +// Range is 1-3 per the CRD; 1 means no HA, >=2 means at least one standby. +// +// The previous unstructured-based implementation returned 1 when the +// field was omitted from the CR (operator default). The typed +// previewv1.DocumentDB gives a zero-value of 0 for omitted ints, so we +// preserve the original semantics explicitly here. 
+func (k *K8sClusterClient) GetInstancesPerNode(ctx context.Context) (int, error) { + dd, err := shareddb.Get(ctx, k.crClient, types.NamespacedName{Namespace: k.namespace, Name: k.clusterName}) + if err != nil { + return 0, fmt.Errorf("failed to get DocumentDB CR: %w", err) + } + if dd.Spec.InstancesPerNode == 0 { + return 1, nil + } + return dd.Spec.InstancesPerNode, nil +} + +// ScaleCluster patches spec.instancesPerNode on the DocumentDB CR. +// +// Note: spec.nodeCount is hard-capped at 1 by the CRD (minimum=maximum=1), +// so the only scale dimension exposed today is instancesPerNode (range 1-3). +// Each instance is a CNPG replica (1 primary + N-1 standbys); growing this +// dimension is what gives the cluster HA. +func (k *K8sClusterClient) ScaleCluster(ctx context.Context, instancesPerNode int) error { + if err := shareddb.PatchInstances(ctx, k.crClient, k.namespace, k.clusterName, instancesPerNode); err != nil { + return fmt.Errorf("failed to patch DocumentDB CR: %w", err) + } + return nil +} + +// GetCurrentDocumentDBImageTag reads status.documentDBImage from the CR +// and returns the tag portion (after the last colon). +func (k *K8sClusterClient) GetCurrentDocumentDBImageTag(ctx context.Context) (string, error) { + dd, err := shareddb.Get(ctx, k.crClient, types.NamespacedName{Namespace: k.namespace, Name: k.clusterName}) + if err != nil { + return "", fmt.Errorf("failed to get DocumentDB CR: %w", err) + } + + image := dd.Status.DocumentDBImage + if image == "" { + return "", nil + } + idx := strings.LastIndex(image, ":") + if idx < 0 || idx == len(image)-1 { + return "", nil + } + return image[idx+1:], nil +} + +// UpgradeDocumentDB patches the DocumentDB CR to set documentDBVersion +// and schemaVersion="auto" so the operator performs a rolling upgrade. +// NOTE: the CRD field is documentDBVersion (capital DB), not documentDbVersion. 
+func (k *K8sClusterClient) UpgradeDocumentDB(ctx context.Context, version string) error { + dd, err := shareddb.Get(ctx, k.crClient, types.NamespacedName{Namespace: k.namespace, Name: k.clusterName}) + if err != nil { + return fmt.Errorf("failed to get DocumentDB CR: %w", err) + } + if err := shareddb.PatchSpec(ctx, k.crClient, dd, func(spec *previewv1.DocumentDBSpec) { + spec.DocumentDBVersion = version + spec.SchemaVersion = "auto" + }); err != nil { + return fmt.Errorf("failed to patch DocumentDB CR: %w", err) + } + return nil +} + +// GetPodMetrics queries metrics-server for pod resource usage. +// Returns nil, nil if metrics-server is not available. +func (k *K8sClusterClient) GetPodMetrics(ctx context.Context) ([]PodMetrics, error) { + if !k.metricsAvail || k.metricsClient == nil { + return nil, nil + } + + labelSelector := fmt.Sprintf("cnpg.io/cluster=%s", k.clusterName) + podMetricsList, err := k.metricsClient.MetricsV1beta1().PodMetricses(k.namespace).List(ctx, metav1.ListOptions{ + LabelSelector: labelSelector, + }) + if err != nil { + // Metrics API might have become unavailable. + log.Printf("[k8sclient] metrics query failed (disabling): %v", err) + k.metricsAvail = false + return nil, nil + } + + var result []PodMetrics + for _, pm := range podMetricsList.Items { + var totalMemBytes int64 + var totalCPUMillis int64 + for _, c := range pm.Containers { + totalMemBytes += c.Usage.Memory().Value() + totalCPUMillis += c.Usage.Cpu().MilliValue() + } + result = append(result, PodMetrics{ + Name: pm.Name, + MemoryMB: float64(totalMemBytes) / (1024 * 1024), + CPUCores: float64(totalCPUMillis) / 1000.0, + }) + } + + return result, nil +} + +// MetricsAvailable returns whether metrics-server is usable. 
+func (k *K8sClusterClient) MetricsAvailable() bool { + return k.metricsAvail +} + +func isPodReady(pod *corev1.Pod) bool { + for _, cond := range pod.Status.Conditions { + if cond.Type == corev1.PodReady && cond.Status == corev1.ConditionTrue { + return true + } + } + return false +} diff --git a/test/longhaul/monitor/k8sclient_test.go b/test/longhaul/monitor/k8sclient_test.go new file mode 100644 index 000000000..3b7cebf9b --- /dev/null +++ b/test/longhaul/monitor/k8sclient_test.go @@ -0,0 +1,209 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package monitor + +import ( + "context" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/kubernetes/fake" + ctrlclient "sigs.k8s.io/controller-runtime/pkg/client" + fakeclient "sigs.k8s.io/controller-runtime/pkg/client/fake" + + previewv1 "github.com/documentdb/documentdb-operator/api/preview" + shareddb "github.com/documentdb/documentdb-operator/test/shared/documentdb" +) + +func newTestScheme() *runtime.Scheme { + s := runtime.NewScheme() + Expect(previewv1.AddToScheme(s)).To(Succeed()) + return s +} + +func newTestDocumentDB(ns, name string, modify func(dd *previewv1.DocumentDB)) *previewv1.DocumentDB { + dd := &previewv1.DocumentDB{ + ObjectMeta: metav1.ObjectMeta{Namespace: ns, Name: name}, + } + if modify != nil { + modify(dd) + } + return dd +} + +// newTestK8sClient builds a K8sClusterClient backed by fake clients. Failures +// are reported via Gomega's Expect, so it must be called from inside a spec. +func newTestK8sClient(ns, cluster string, cs *fake.Clientset, objs ...ctrlclient.Object) *K8sClusterClient { + scheme := newTestScheme() + builder := fakeclient.NewClientBuilder().WithScheme(scheme) + if len(objs) > 0 { + builder = builder.WithObjects(objs...) 
+ } + return &K8sClusterClient{ + clientset: cs, + crClient: builder.Build(), + namespace: ns, + clusterName: cluster, + } +} + +var _ = Describe("K8sClusterClient", func() { + const ns, cluster = "default", "documentdb-cluster" + + DescribeTable("isPodReady", + func(pod *corev1.Pod, want bool) { + Expect(isPodReady(pod)).To(Equal(want)) + }, + Entry("no conditions", &corev1.Pod{}, false), + Entry("PodReady=True", + &corev1.Pod{Status: corev1.PodStatus{Conditions: []corev1.PodCondition{ + {Type: corev1.PodReady, Status: corev1.ConditionTrue}, + }}}, true), + Entry("PodReady=False", + &corev1.Pod{Status: corev1.PodStatus{Conditions: []corev1.PodCondition{ + {Type: corev1.PodReady, Status: corev1.ConditionFalse}, + }}}, false), + Entry("only PodScheduled=True (no Ready condition)", + &corev1.Pod{Status: corev1.PodStatus{Conditions: []corev1.PodCondition{ + {Type: corev1.PodScheduled, Status: corev1.ConditionTrue}, + }}}, false), + ) + + Describe("GetClusterHealth", func() { + It("aggregates pods filtered by cnpg.io/cluster and reads CR status", func() { + pods := []runtime.Object{ + &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Namespace: ns, Name: "pod1", Labels: map[string]string{"cnpg.io/cluster": cluster}}, + Status: corev1.PodStatus{ + Conditions: []corev1.PodCondition{{Type: corev1.PodReady, Status: corev1.ConditionTrue}}, + ContainerStatuses: []corev1.ContainerStatus{{RestartCount: 1}}, + }, + }, + &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Namespace: ns, Name: "pod2", Labels: map[string]string{"cnpg.io/cluster": cluster}}, + Status: corev1.PodStatus{ + Conditions: []corev1.PodCondition{{Type: corev1.PodReady, Status: corev1.ConditionTrue}}, + ContainerStatuses: []corev1.ContainerStatus{{RestartCount: 2}}, + }, + }, + &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Namespace: ns, Name: "other", Labels: map[string]string{"cnpg.io/cluster": "other-cluster"}}, + Status: corev1.PodStatus{ + Conditions: []corev1.PodCondition{{Type: corev1.PodReady, Status: 
corev1.ConditionTrue}}, + }, + }, + } + cr := newTestDocumentDB(ns, cluster, func(dd *previewv1.DocumentDB) { + dd.Status.Status = shareddb.ReadyStatus + }) + k := newTestK8sClient(ns, cluster, fake.NewSimpleClientset(pods...), cr) + + got, err := k.GetClusterHealth(context.Background()) + Expect(err).NotTo(HaveOccurred()) + Expect(got.TotalPods).To(Equal(2)) + Expect(got.ReadyPods).To(Equal(2)) + Expect(got.AllPodsReady).To(BeTrue()) + Expect(got.RestartCount).To(Equal(int32(3))) + Expect(got.CRReady).To(BeTrue()) + }) + + It("flags CRReady=false when status is not healthy and no pods exist", func() { + cr := newTestDocumentDB(ns, cluster, func(dd *previewv1.DocumentDB) { + dd.Status.Status = "Reconciling" + }) + k := newTestK8sClient(ns, cluster, fake.NewSimpleClientset(), cr) + + got, err := k.GetClusterHealth(context.Background()) + Expect(err).NotTo(HaveOccurred()) + Expect(got.CRReady).To(BeFalse()) + Expect(got.AllPodsReady).To(BeFalse()) + }) + }) + + Describe("GetInstancesPerNode", func() { + DescribeTable("returns spec.instancesPerNode (defaulting to 1)", + func(modify func(dd *previewv1.DocumentDB), want int) { + cr := newTestDocumentDB(ns, cluster, modify) + k := newTestK8sClient(ns, cluster, fake.NewSimpleClientset(), cr) + got, err := k.GetInstancesPerNode(context.Background()) + Expect(err).NotTo(HaveOccurred()) + Expect(got).To(Equal(want)) + }, + Entry("explicit ipn=2", + func(dd *previewv1.DocumentDB) { dd.Spec.InstancesPerNode = 2 }, 2), + Entry("explicit ipn=3", + func(dd *previewv1.DocumentDB) { dd.Spec.InstancesPerNode = 3 }, 3), + Entry("unset defaults to 1", + func(dd *previewv1.DocumentDB) {}, 1), + ) + + It("returns an error when the CR is missing", func() { + k := newTestK8sClient(ns, "missing", fake.NewSimpleClientset()) + _, err := k.GetInstancesPerNode(context.Background()) + Expect(err).To(HaveOccurred()) + }) + }) + + It("ScaleCluster patches instancesPerNode", func() { + cr := newTestDocumentDB(ns, cluster, func(dd 
*previewv1.DocumentDB) { + dd.Spec.InstancesPerNode = 1 + }) + k := newTestK8sClient(ns, cluster, fake.NewSimpleClientset(), cr) + + Expect(k.ScaleCluster(context.Background(), 3)).To(Succeed()) + got, err := k.GetInstancesPerNode(context.Background()) + Expect(err).NotTo(HaveOccurred()) + Expect(got).To(Equal(3)) + }) + + DescribeTable("GetCurrentDocumentDBImageTag", + func(image string, want string) { + cr := newTestDocumentDB(ns, cluster, func(dd *previewv1.DocumentDB) { + dd.Status.DocumentDBImage = image + }) + k := newTestK8sClient(ns, cluster, fake.NewSimpleClientset(), cr) + got, err := k.GetCurrentDocumentDBImageTag(context.Background()) + Expect(err).NotTo(HaveOccurred()) + Expect(got).To(Equal(want)) + }, + Entry("empty string", "", ""), + Entry("image with tag", "ghcr.io/foo/documentdb:0.109.0", "0.109.0"), + Entry("registry without tag", "ghcr.io/foo/documentdb", ""), + Entry("trailing colon (malformed)", "ghcr.io/foo/documentdb:", ""), + Entry("semver tag with port-like host", "host:5000/foo/documentdb:0.110.0-rc1", "0.110.0-rc1"), + ) + + It("UpgradeDocumentDB patches version fields", func() { + cr := newTestDocumentDB(ns, cluster, nil) + k := newTestK8sClient(ns, cluster, fake.NewSimpleClientset(), cr) + + Expect(k.UpgradeDocumentDB(context.Background(), "0.110.0")).To(Succeed()) + + var got previewv1.DocumentDB + Expect(k.crClient.Get(context.Background(), types.NamespacedName{Namespace: ns, Name: cluster}, &got)).To(Succeed()) + Expect(got.Spec.DocumentDBVersion).To(Equal("0.110.0")) + Expect(got.Spec.SchemaVersion).To(Equal("auto")) + }) + + Describe("metrics", func() { + It("MetricsAvailable mirrors the metricsAvail flag", func() { + k := &K8sClusterClient{metricsAvail: true} + Expect(k.MetricsAvailable()).To(BeTrue()) + k.metricsAvail = false + Expect(k.MetricsAvailable()).To(BeFalse()) + }) + + It("GetPodMetrics returns nil/nil when metrics are unavailable", func() { + k := &K8sClusterClient{metricsAvail: false} + got, err := 
k.GetPodMetrics(context.Background()) + Expect(err).NotTo(HaveOccurred()) + Expect(got).To(BeNil()) + }) + }) +}) diff --git a/test/longhaul/monitor/leakdetect.go b/test/longhaul/monitor/leakdetect.go new file mode 100644 index 000000000..3955325b0 --- /dev/null +++ b/test/longhaul/monitor/leakdetect.go @@ -0,0 +1,124 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package monitor + +import ( + "fmt" + "math" + "sync" + "time" + + "github.com/documentdb/documentdb-operator/test/longhaul/journal" +) + +// ResourceSample represents a single observation of resource usage. +type ResourceSample struct { + Timestamp time.Time + MemoryMB float64 + CPUCores float64 +} + +// LeakDetector analyzes resource usage trends over time using linear regression. +// A consistently positive slope above the threshold indicates a resource leak. +type LeakDetector struct { + journal *journal.Journal + slopeThreshold float64 // MB/hour threshold for memory leak detection + minSamples int // minimum samples before analysis + + mu sync.RWMutex + samples []ResourceSample +} + +// NewLeakDetector creates a leak detector with the given sensitivity. +// slopeThreshold is in MB/hour — a memory growth rate above this is flagged. +func NewLeakDetector(j *journal.Journal, slopeThresholdMBPerHour float64, minSamples int) *LeakDetector { + if minSamples < 3 { + minSamples = 3 + } + return &LeakDetector{ + journal: j, + slopeThreshold: slopeThresholdMBPerHour, + minSamples: minSamples, + samples: make([]ResourceSample, 0, 256), + } +} + +// AddSample records a resource usage observation. +func (l *LeakDetector) AddSample(s ResourceSample) { + l.mu.Lock() + l.samples = append(l.samples, s) + l.mu.Unlock() +} + +// LeakAnalysis contains the results of trend analysis. 
+type LeakAnalysis struct { + HasLeak bool + MemorySlopeMB float64 // MB per hour + CPUSlopeCores float64 // cores per hour + SampleCount int + Duration time.Duration +} + +// Analyze performs linear regression on collected samples and returns the trend. +func (l *LeakDetector) Analyze() LeakAnalysis { + l.mu.RLock() + defer l.mu.RUnlock() + + result := LeakAnalysis{SampleCount: len(l.samples)} + + if len(l.samples) < l.minSamples { + return result + } + + first := l.samples[0].Timestamp + last := l.samples[len(l.samples)-1].Timestamp + result.Duration = last.Sub(first) + + // Compute linear regression for memory. + result.MemorySlopeMB = linearRegressionSlope(l.samples, func(s ResourceSample) float64 { + return s.MemoryMB + }) * 3600 // convert per-second to per-hour + + // Compute linear regression for CPU. + result.CPUSlopeCores = linearRegressionSlope(l.samples, func(s ResourceSample) float64 { + return s.CPUCores + }) * 3600 + + if result.MemorySlopeMB > l.slopeThreshold { + result.HasLeak = true + l.journal.Warn("leakdetect", fmt.Sprintf( + "memory leak suspected: %.2f MB/hour over %s (%d samples)", + result.MemorySlopeMB, result.Duration.Round(time.Minute), len(l.samples))) + } + + return result +} + +// linearRegressionSlope computes the slope of a least-squares linear fit. +// x-axis is elapsed seconds from first sample, y-axis is extracted value. 
+func linearRegressionSlope(samples []ResourceSample, getValue func(ResourceSample) float64) float64 { + n := float64(len(samples)) + if n < 2 { + return 0 + } + + t0 := samples[0].Timestamp + var sumX, sumY, sumXY, sumX2 float64 + + for _, s := range samples { + x := s.Timestamp.Sub(t0).Seconds() + y := getValue(s) + sumX += x + sumY += y + sumXY += x * y + sumX2 += x * x + } + + denom := n*sumX2 - sumX*sumX + if math.Abs(denom) < 1e-10 { + return 0 + } + + return (n*sumXY - sumX*sumY) / denom +} diff --git a/test/longhaul/monitor/leakdetect_test.go b/test/longhaul/monitor/leakdetect_test.go new file mode 100644 index 000000000..0bc04332a --- /dev/null +++ b/test/longhaul/monitor/leakdetect_test.go @@ -0,0 +1,81 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package monitor + +import ( + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/documentdb/documentdb-operator/test/longhaul/journal" +) + +var _ = Describe("LeakDetector", func() { + It("returns zero slope when sample count is below the floor", func() { + d := NewLeakDetector(journal.New(), 10, 5) + d.AddSample(ResourceSample{Timestamp: time.Now(), MemoryMB: 100}) + a := d.Analyze() + Expect(a.HasLeak).To(BeFalse()) + Expect(a.MemorySlopeMB).To(BeZero()) + Expect(a.SampleCount).To(Equal(1)) + }) + + It("does not flag a leak when memory is flat", func() { + d := NewLeakDetector(journal.New(), 1.0, 3) + t0 := time.Now() + for i := 0; i < 10; i++ { + d.AddSample(ResourceSample{ + Timestamp: t0.Add(time.Duration(i) * time.Second), + MemoryMB: 100, + CPUCores: 0.5, + }) + } + a := d.Analyze() + Expect(a.HasLeak).To(BeFalse()) + Expect(absf(a.MemorySlopeMB)).To(BeNumerically("<", 0.001)) + }) + + It("flags a leak when growth far exceeds the threshold", func() { + // Threshold 100 MB/h; growth 1 MB/sec = 3600 MB/h. 
+ d := NewLeakDetector(journal.New(), 100.0, 3) + t0 := time.Now() + for i := 0; i < 60; i++ { + d.AddSample(ResourceSample{ + Timestamp: t0.Add(time.Duration(i) * time.Second), + MemoryMB: 100 + float64(i), + }) + } + a := d.Analyze() + Expect(a.HasLeak).To(BeTrue()) + Expect(a.MemorySlopeMB).To(BeNumerically("~", 3600, 100)) + }) + + It("does not flag a leak when growth stays below threshold", func() { + // 0.01 MB/sec = 36 MB/h, threshold 100 MB/h. + d := NewLeakDetector(journal.New(), 100.0, 3) + t0 := time.Now() + for i := 0; i < 60; i++ { + d.AddSample(ResourceSample{ + Timestamp: t0.Add(time.Duration(i) * time.Second), + MemoryMB: 100 + 0.01*float64(i), + }) + } + Expect(d.Analyze().HasLeak).To(BeFalse()) + }) + + It("enforces the minSamples floor of 3", func() { + d := NewLeakDetector(journal.New(), 10, 1) // request 1, floored to 3 + d.AddSample(ResourceSample{Timestamp: time.Now(), MemoryMB: 100}) + d.AddSample(ResourceSample{Timestamp: time.Now(), MemoryMB: 100}) + Expect(d.Analyze().MemorySlopeMB).To(BeZero()) + }) +}) + +func absf(x float64) float64 { + if x < 0 { + return -x + } + return x +} diff --git a/test/longhaul/monitor/suite_test.go b/test/longhaul/monitor/suite_test.go new file mode 100644 index 000000000..73fc450f5 --- /dev/null +++ b/test/longhaul/monitor/suite_test.go @@ -0,0 +1,16 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package monitor + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +func TestMonitor(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Long Haul Monitor Suite") +} diff --git a/test/longhaul/operations/scale.go b/test/longhaul/operations/scale.go new file mode 100644 index 000000000..2d907a935 --- /dev/null +++ b/test/longhaul/operations/scale.go @@ -0,0 +1,135 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+ +package operations + +import ( + "context" + "fmt" + "time" + + "github.com/documentdb/documentdb-operator/test/longhaul/journal" + "github.com/documentdb/documentdb-operator/test/longhaul/monitor" +) + +// ScaleUp increases spec.instancesPerNode by 1 (HA scale dimension; range 1-3). +type ScaleUp struct { + client monitor.ClusterClient + healthMon *monitor.HealthMonitor + maxInstances int + recovery time.Duration +} + +// NewScaleUp creates a ScaleUp operation. maxInstances is clamped to the +// CRD upper bound (3) to avoid admission rejections. +func NewScaleUp(client monitor.ClusterClient, health *monitor.HealthMonitor, maxInstances int, recovery time.Duration) *ScaleUp { + if maxInstances > 3 { + maxInstances = 3 + } + return &ScaleUp{ + client: client, + healthMon: health, + maxInstances: maxInstances, + recovery: recovery, + } +} + +func (s *ScaleUp) Name() string { return "scale-up" } +func (s *ScaleUp) Weight() int { return 3 } + +func (s *ScaleUp) Precondition(ctx context.Context) (bool, string) { + current, err := s.client.GetInstancesPerNode(ctx) + if err != nil { + return false, fmt.Sprintf("cannot get instancesPerNode: %v", err) + } + if current >= s.maxInstances { + return false, fmt.Sprintf("already at max instancesPerNode (%d)", s.maxInstances) + } + return true, "" +} + +func (s *ScaleUp) Execute(ctx context.Context) error { + current, err := s.client.GetInstancesPerNode(ctx) + if err != nil { + return fmt.Errorf("get instancesPerNode: %w", err) + } + + target := current + 1 + if err := s.client.ScaleCluster(ctx, target); err != nil { + return fmt.Errorf("scale to %d: %w", target, err) + } + + // Wait for recovery (new pod becomes ready). 
+ recoveryCtx, cancel := context.WithTimeout(ctx, s.recovery) + defer cancel() + return s.healthMon.WaitForSteadyState(recoveryCtx) +} + +func (s *ScaleUp) OutagePolicy() journal.OutagePolicy { + return journal.OutagePolicy{ + AllowedDowntime: 30 * time.Second, + AllowedWriteFailures: 20, + MustRecoverWithin: s.recovery, + } +} + +// ScaleDown decreases spec.instancesPerNode by 1 (HA scale dimension; range 1-3). +type ScaleDown struct { + client monitor.ClusterClient + healthMon *monitor.HealthMonitor + minInstances int + recovery time.Duration +} + +// NewScaleDown creates a ScaleDown operation. minInstances is clamped to the +// CRD lower bound (1) to avoid admission rejections. +func NewScaleDown(client monitor.ClusterClient, health *monitor.HealthMonitor, minInstances int, recovery time.Duration) *ScaleDown { + if minInstances < 1 { + minInstances = 1 + } + return &ScaleDown{ + client: client, + healthMon: health, + minInstances: minInstances, + recovery: recovery, + } +} + +func (s *ScaleDown) Name() string { return "scale-down" } +func (s *ScaleDown) Weight() int { return 2 } + +func (s *ScaleDown) Precondition(ctx context.Context) (bool, string) { + current, err := s.client.GetInstancesPerNode(ctx) + if err != nil { + return false, fmt.Sprintf("cannot get instancesPerNode: %v", err) + } + if current <= s.minInstances { + return false, fmt.Sprintf("already at min instancesPerNode (%d)", s.minInstances) + } + return true, "" +} + +func (s *ScaleDown) Execute(ctx context.Context) error { + current, err := s.client.GetInstancesPerNode(ctx) + if err != nil { + return fmt.Errorf("get instancesPerNode: %w", err) + } + + target := current - 1 + if err := s.client.ScaleCluster(ctx, target); err != nil { + return fmt.Errorf("scale to %d: %w", target, err) + } + + // Wait for recovery (cluster stabilizes at new size). 
+ recoveryCtx, cancel := context.WithTimeout(ctx, s.recovery) + defer cancel() + return s.healthMon.WaitForSteadyState(recoveryCtx) +} + +func (s *ScaleDown) OutagePolicy() journal.OutagePolicy { + return journal.OutagePolicy{ + AllowedDowntime: 60 * time.Second, + AllowedWriteFailures: 50, + MustRecoverWithin: s.recovery, + } +} diff --git a/test/longhaul/operations/scale_test.go b/test/longhaul/operations/scale_test.go new file mode 100644 index 000000000..58c950399 --- /dev/null +++ b/test/longhaul/operations/scale_test.go @@ -0,0 +1,140 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package operations + +import ( + "context" + "errors" + "sync" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/documentdb/documentdb-operator/test/longhaul/monitor" +) + +// fakeClient is a minimal monitor.ClusterClient stub for unit tests. +type fakeClient struct { + mu sync.Mutex + instancesPerNode int + ipnErr error + imageTag string + scaleCalls []int + upgradeCalls []string +} + +func (f *fakeClient) GetClusterHealth(_ context.Context) (monitor.ClusterHealth, error) { + return monitor.ClusterHealth{}, nil +} +func (f *fakeClient) GetCurrentDocumentDBImageTag(_ context.Context) (string, error) { + f.mu.Lock() + defer f.mu.Unlock() + return f.imageTag, nil +} +func (f *fakeClient) GetInstancesPerNode(_ context.Context) (int, error) { + f.mu.Lock() + defer f.mu.Unlock() + return f.instancesPerNode, f.ipnErr +} +func (f *fakeClient) ScaleCluster(_ context.Context, n int) error { + f.mu.Lock() + defer f.mu.Unlock() + f.scaleCalls = append(f.scaleCalls, n) + f.instancesPerNode = n + return nil +} +func (f *fakeClient) UpgradeDocumentDB(_ context.Context, v string) error { + f.mu.Lock() + defer f.mu.Unlock() + f.upgradeCalls = append(f.upgradeCalls, v) + return nil +} + +var _ = Describe("ScaleUp", func() { + DescribeTable("clamps maxInstances to the CRD upper bound", + func(in, want int) { + s := 
NewScaleUp(&fakeClient{}, nil, in, time.Second) + Expect(s.maxInstances).To(Equal(want)) + }, + Entry("1->1", 1, 1), + Entry("2->2", 2, 2), + Entry("3->3", 3, 3), + Entry("4 clamped to 3", 4, 3), + Entry("99 clamped to 3", 99, 3), + ) + + It("Name and Weight are scale-up/3", func() { + s := NewScaleUp(&fakeClient{}, nil, 3, time.Second) + Expect(s.Name()).To(Equal("scale-up")) + Expect(s.Weight()).To(Equal(3)) + }) + + DescribeTable("Precondition", + func(current int, ipnErr error, max int, wantOK bool, wantReasonHas string) { + c := &fakeClient{instancesPerNode: current, ipnErr: ipnErr} + s := NewScaleUp(c, nil, max, time.Second) + ok, reason := s.Precondition(context.Background()) + Expect(ok).To(Equal(wantOK), "reason=%q", reason) + if wantReasonHas != "" { + Expect(reason).To(ContainSubstring(wantReasonHas)) + } + }, + Entry("eligible: under max", 1, nil, 3, true, ""), + Entry("eligible: just under max", 2, nil, 3, true, ""), + Entry("blocked: at max", 3, nil, 3, false, "already at max"), + Entry("blocked: ipn read error", 0, errors.New("apiserver down"), 3, false, "cannot get instancesPerNode"), + ) + + It("OutagePolicy uses tighter budgets and echoes MustRecoverWithin", func() { + s := NewScaleUp(&fakeClient{}, nil, 3, 5*time.Minute) + p := s.OutagePolicy() + Expect(p.AllowedDowntime).To(Equal(30 * time.Second)) + Expect(p.AllowedWriteFailures).To(Equal(int64(20))) + Expect(p.MustRecoverWithin).To(Equal(5 * time.Minute)) + }) +}) + +var _ = Describe("ScaleDown", func() { + DescribeTable("clamps minInstances to the CRD lower bound", + func(in, want int) { + s := NewScaleDown(&fakeClient{}, nil, in, time.Second) + Expect(s.minInstances).To(Equal(want)) + }, + Entry("0 -> 1", 0, 1), + Entry("-5 -> 1", -5, 1), + Entry("1 -> 1", 1, 1), + Entry("2 -> 2", 2, 2), + Entry("3 -> 3", 3, 3), + ) + + It("Name and Weight are scale-down/2", func() { + s := NewScaleDown(&fakeClient{}, nil, 1, time.Second) + Expect(s.Name()).To(Equal("scale-down")) + 
Expect(s.Weight()).To(Equal(2))
+	})
+
+	DescribeTable("Precondition",
+		func(current int, ipnErr error, min int, wantOK bool, wantReasonHas string) {
+			c := &fakeClient{instancesPerNode: current, ipnErr: ipnErr}
+			s := NewScaleDown(c, nil, min, time.Second)
+			ok, reason := s.Precondition(context.Background())
+			Expect(ok).To(Equal(wantOK), "reason=%q", reason)
+			if wantReasonHas != "" {
+				Expect(reason).To(ContainSubstring(wantReasonHas))
+			}
+		},
+		Entry("eligible: above min", 3, nil, 1, true, ""),
+		Entry("eligible: just above min", 2, nil, 1, true, ""),
+		Entry("blocked: at min", 1, nil, 1, false, "already at min"),
+		Entry("blocked: ipn read error", 0, errors.New("apiserver down"), 1, false, "cannot get instancesPerNode"),
+	)
+
+	It("OutagePolicy is more lenient than scale-up", func() {
+		s := NewScaleDown(&fakeClient{}, nil, 1, 5*time.Minute)
+		p := s.OutagePolicy()
+		Expect(p.AllowedDowntime).To(Equal(60 * time.Second))
+		Expect(p.AllowedWriteFailures).To(Equal(int64(50)))
+	})
+})
diff --git a/test/longhaul/operations/scheduler.go b/test/longhaul/operations/scheduler.go
new file mode 100644
index 000000000..e1e824ea2
--- /dev/null
+++ b/test/longhaul/operations/scheduler.go
@@ -0,0 +1,176 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+// Package operations implements the operation scheduler and individual
+// disruptive operations for long haul tests.
+package operations
+
+import (
+	"context"
+	"fmt"
+	"math/rand"
+	"sync"
+	"time"
+
+	"github.com/documentdb/documentdb-operator/test/longhaul/journal"
+	"github.com/documentdb/documentdb-operator/test/longhaul/monitor"
+)
+
+// Operation defines the interface for a disruptive operation.
+type Operation interface {
+	// Name returns a human-readable identifier for this operation.
+	Name() string
+
+	// Weight returns the relative probability of selection (higher = more likely).
+	// Operations reporting a weight of zero or less are never selected.
+	Weight() int
+
+	// Precondition checks if the operation can be executed in the current state.
+	// The string carries a human-readable reason when the answer is false.
+	Precondition(ctx context.Context) (bool, string)
+
+	// Execute performs the operation and returns when complete.
+	Execute(ctx context.Context) error
+
+	// OutagePolicy returns the disruption budget for this operation.
+	OutagePolicy() journal.OutagePolicy
+}
+
+// Scheduler selects and executes operations based on weighted random selection,
+// preconditions, cooldowns, and steady-state gates.
+type Scheduler struct {
+	operations    []Operation
+	healthMonitor *monitor.HealthMonitor
+	journal       *journal.Journal
+	cooldown      time.Duration
+
+	// mu guards the fields below.
+	mu          sync.Mutex
+	lastOpTime  time.Time
+	opsExecuted int
+	inProgress  bool
+}
+
+// NewScheduler creates an operation scheduler over ops. cooldown is the
+// minimum time between the end of one executed operation and the start of
+// the next.
+func NewScheduler(
+	ops []Operation,
+	health *monitor.HealthMonitor,
+	j *journal.Journal,
+	cooldown time.Duration,
+) *Scheduler {
+	return &Scheduler{
+		operations:    ops,
+		healthMonitor: health,
+		journal:       j,
+		cooldown:      cooldown,
+	}
+}
+
+// Run starts the scheduler loop. It blocks until context is cancelled,
+// attempting to execute an operation every 10 seconds.
+func (s *Scheduler) Run(ctx context.Context) {
+	s.journal.Info("scheduler", "operation scheduler started")
+	defer s.journal.Info("scheduler", "operation scheduler stopped")
+
+	ticker := time.NewTicker(10 * time.Second)
+	defer ticker.Stop()
+
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-ticker.C:
+			s.tryExecute(ctx)
+		}
+	}
+}
+
+// tryExecute runs at most one operation for the current tick. It returns
+// without doing anything when an operation is already in progress, the
+// cooldown since the last executed operation has not elapsed, the health
+// monitor does not report steady state, or no operation is eligible.
+//
+// lastOpTime and opsExecuted are updated only after an operation has actually
+// been executed (whether or not it succeeded), so a skipped tick does not
+// consume the cooldown.
+func (s *Scheduler) tryExecute(ctx context.Context) {
+	s.mu.Lock()
+	if s.inProgress {
+		s.mu.Unlock()
+		return
+	}
+
+	// Check cooldown.
+	if !s.lastOpTime.IsZero() && time.Since(s.lastOpTime) < s.cooldown {
+		s.mu.Unlock()
+		return
+	}
+	s.mu.Unlock()
+
+	// Check steady-state gate.
+	if !s.healthMonitor.IsSteadyState() {
+		return
+	}
+
+	// Select an operation.
+	op := s.selectOperation(ctx)
+	if op == nil {
+		return
+	}
+
+	// Execute.
+	s.mu.Lock()
+	s.inProgress = true
+	s.mu.Unlock()
+
+	s.executeOp(ctx, op)
+
+	s.mu.Lock()
+	s.inProgress = false
+	s.lastOpTime = time.Now()
+	s.opsExecuted++
+	s.mu.Unlock()
+}
+
+// selectOperation picks one operation at random among those whose
+// precondition currently passes, with probability proportional to Weight().
+// It returns nil when no operation is eligible.
+func (s *Scheduler) selectOperation(ctx context.Context) Operation {
+	// Filter by preconditions and build weighted list.
+	type candidate struct {
+		op     Operation
+		weight int
+	}
+	var candidates []candidate
+	totalWeight := 0
+
+	for _, op := range s.operations {
+		if ok, _ := op.Precondition(ctx); !ok {
+			continue
+		}
+		w := op.Weight()
+		if w <= 0 {
+			// A non-positive weight can never win the draw below, and a
+			// negative one could drive totalWeight to <= 0, which makes
+			// rand.Intn panic. Treat such operations as not selectable.
+			continue
+		}
+		candidates = append(candidates, candidate{op: op, weight: w})
+		totalWeight += w
+	}
+
+	if len(candidates) == 0 {
+		return nil
+	}
+
+	// Weighted random selection: totalWeight > 0 is guaranteed here because
+	// every candidate contributes a strictly positive weight.
+	r := rand.Intn(totalWeight)
+	for _, c := range candidates {
+		r -= c.weight
+		if r < 0 {
+			return c.op
+		}
+	}
+	return candidates[len(candidates)-1].op
+}
+
+// executeOp runs op inside a journal disruption window opened with the
+// operation's outage policy, then records the outcome in the journal.
+func (s *Scheduler) executeOp(ctx context.Context, op Operation) {
+	s.journal.Info("scheduler", fmt.Sprintf("executing operation: %s", op.Name()))
+	s.journal.OpenDisruptionWindow(op.Name(), op.OutagePolicy())
+
+	err := op.Execute(ctx)
+
+	s.journal.CloseDisruptionWindow()
+
+	if err != nil {
+		s.journal.Error("scheduler", fmt.Sprintf("operation %s failed: %v", op.Name(), err))
+	} else {
+		s.journal.Info("scheduler", fmt.Sprintf("operation %s completed successfully", op.Name()))
+	}
+}
+
+// OpsExecuted returns the number of operations that have been executed so
+// far, including operations whose Execute returned an error.
+func (s *Scheduler) OpsExecuted() int {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	return s.opsExecuted
+}
diff --git a/test/longhaul/operations/scheduler_test.go b/test/longhaul/operations/scheduler_test.go
new file mode 100644
index 000000000..cd351eb9c
--- /dev/null
+++ b/test/longhaul/operations/scheduler_test.go
@@ -0,0 +1,123 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+package operations
+
+import (
+	"context"
+	"errors"
+	"time"
+
+	. "github.com/onsi/ginkgo/v2"
+	.
"github.com/onsi/gomega"
+
+	"github.com/documentdb/documentdb-operator/test/longhaul/journal"
+)
+
+// fakeOp is a minimal Operation for scheduler tests.
+type fakeOp struct {
+	name      string
+	weight    int
+	available bool  // value reported by Precondition
+	executed  int   // number of Execute calls observed
+	err       error // error returned from Execute
+}
+
+func (f *fakeOp) Name() string { return f.name }
+func (f *fakeOp) Weight() int  { return f.weight }
+func (f *fakeOp) Precondition(_ context.Context) (bool, string) {
+	if f.available {
+		return true, ""
+	}
+	return false, "precondition not met"
+}
+func (f *fakeOp) Execute(_ context.Context) error {
+	f.executed++
+	return f.err
+}
+func (f *fakeOp) OutagePolicy() journal.OutagePolicy { return journal.DefaultOutagePolicy() }
+
+// newSchedulerForTest builds a Scheduler directly (bypassing NewScheduler)
+// with a fresh journal and no health monitor, which is enough for the
+// selectOperation/executeOp specs below.
+func newSchedulerForTest(ops ...Operation) *Scheduler {
+	return &Scheduler{
+		operations: ops,
+		journal:    journal.New(),
+		cooldown:   time.Hour,
+	}
+}
+
+var _ = Describe("Scheduler", func() {
+	Describe("selectOperation", func() {
+		It("returns nil when no candidates pass precondition", func() {
+			s := newSchedulerForTest(&fakeOp{name: "a", weight: 1, available: false})
+			Expect(s.selectOperation(context.Background())).To(BeNil())
+		})
+
+		It("returns nil when total weight is zero", func() {
+			a := &fakeOp{name: "a", weight: 0, available: true}
+			s := newSchedulerForTest(a)
+			Expect(s.selectOperation(context.Background())).To(BeNil())
+		})
+
+		It("picks only operations whose precondition passes", func() {
+			a := &fakeOp{name: "a", weight: 1, available: false}
+			b := &fakeOp{name: "b", weight: 1, available: true}
+			s := newSchedulerForTest(a, b)
+			// Repeat to make an accidental pick of "a" overwhelmingly likely to show up.
+			for i := 0; i < 50; i++ {
+				got := s.selectOperation(context.Background())
+				Expect(got).NotTo(BeNil(), "iter %d", i)
+				Expect(got.Name()).To(Equal("b"))
+			}
+		})
+
+		It("respects relative weights (a:1, b:9 -> b ~ 90%)", func() {
+			a := &fakeOp{name: "a", weight: 1, available: true}
+			b := &fakeOp{name: "b", weight: 9, available: true}
+			s := newSchedulerForTest(a, b)
+
+			const trials = 2000
+			bCount := 0
+			for i := 0; i < trials; i++ {
+				if s.selectOperation(context.Background()).Name() == "b" {
+					bCount++
+				}
+			}
+			// Expected 1800; allow generous +/-10% (180) for randomness.
+			Expect(bCount).To(BeNumerically(">=", 1620))
+			Expect(bCount).To(BeNumerically("<=", 1980))
+		})
+	})
+
+	Describe("executeOp", func() {
+		It("opens and closes a disruption window around the call", func() {
+			op := &fakeOp{name: "op", weight: 1, available: true}
+			s := newSchedulerForTest(op)
+			s.executeOp(context.Background(), op)
+			Expect(op.executed).To(Equal(1))
+			// No window may remain open once executeOp has returned.
+			Expect(s.journal.ActiveWindow()).To(BeNil())
+			closed := s.journal.DisruptionWindows()
+			Expect(closed).To(HaveLen(1))
+			Expect(closed[0].OperationName).To(Equal("op"))
+		})
+
+		It("records an ERROR event when Execute fails", func() {
+			op := &fakeOp{name: "boom", weight: 1, available: true, err: errors.New("kaboom")}
+			s := newSchedulerForTest(op)
+			s.executeOp(context.Background(), op)
+
+			var sawError bool
+			for _, e := range s.journal.Events() {
+				if e.Level == journal.LevelError && e.Component == "scheduler" {
+					sawError = true
+				}
+			}
+			Expect(sawError).To(BeTrue(), "expected scheduler ERROR event on Execute failure")
+		})
+	})
+
+	It("OpsExecuted mirrors the internal counter", func() {
+		s := newSchedulerForTest()
+		Expect(s.OpsExecuted()).To(Equal(0))
+		s.opsExecuted = 7
+		Expect(s.OpsExecuted()).To(Equal(7))
+	})
+})
diff --git a/test/longhaul/operations/suite_test.go b/test/longhaul/operations/suite_test.go
new file mode 100644
index 000000000..2a69dc2ef
--- /dev/null
+++ b/test/longhaul/operations/suite_test.go
@@ -0,0 +1,16 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+package operations
+
+import (
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	.
"github.com/onsi/gomega"
+)
+
+// TestOperations is the `go test` entry point that hands control to the
+// Ginkgo spec runner for this package.
+func TestOperations(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "Long Haul Operations Suite")
+}
diff --git a/test/longhaul/operations/upgrade.go b/test/longhaul/operations/upgrade.go
new file mode 100644
index 000000000..8eba66e0d
--- /dev/null
+++ b/test/longhaul/operations/upgrade.go
@@ -0,0 +1,178 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+package operations
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	"github.com/documentdb/documentdb-operator/test/longhaul/journal"
+	"github.com/documentdb/documentdb-operator/test/longhaul/monitor"
+
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/kubernetes"
+)
+
+// VersionConfigMapName is the ConfigMap maintained by the monitor workflow
+// that publishes the desired DocumentDB version.
+const VersionConfigMapName = "longhaul-versions"
+
+// VersionConfigMapKey is the key inside the ConfigMap that holds the
+// desired DocumentDB image tag (e.g., "0.110.0").
+const VersionConfigMapKey = "desired-documentdb-version"
+
+// UpgradeDocumentDB performs an in-place version upgrade of the DocumentDB
+// cluster, then waits for the rolling restart to complete and the cluster
+// to return to steady state. Continuous verifiers will detect any data
+// corruption introduced by the upgrade.
+type UpgradeDocumentDB struct {
+	client    monitor.ClusterClient  // reads and patches the DocumentDB CR
+	clientset kubernetes.Interface   // reads the desired-version ConfigMap
+	healthMon *monitor.HealthMonitor // steady-state gate awaited after the upgrade
+	j         *journal.Journal       // receives warnings for status read errors
+	namespace string                 // namespace of the desired-version ConfigMap
+	recovery  time.Duration          // timeout for each wait phase; also MustRecoverWithin
+}
+
+// NewUpgradeDocumentDB creates an UpgradeDocumentDB operation.
+func NewUpgradeDocumentDB(
+	client monitor.ClusterClient,
+	clientset kubernetes.Interface,
+	health *monitor.HealthMonitor,
+	j *journal.Journal,
+	namespace string,
+	recovery time.Duration,
+) *UpgradeDocumentDB {
+	return &UpgradeDocumentDB{
+		client:    client,
+		clientset: clientset,
+		healthMon: health,
+		j:         j,
+		namespace: namespace,
+		recovery:  recovery,
+	}
+}
+
+// Name returns the operation identifier, "upgrade-documentdb".
+func (u *UpgradeDocumentDB) Name() string { return "upgrade-documentdb" }
+
+// Weight is intentionally low — upgrades are infrequent in practice and
+// should not crowd out scale/failover operations.
+func (u *UpgradeDocumentDB) Weight() int { return 1 }
+
+// Precondition is satisfied when the desired version published by the
+// workflow differs from the running version observed in CR status,
+// AND the cluster has at least one standby (instancesPerNode>=2) so the
+// rolling upgrade has an HA failover target.
+//
+// Skipping when instancesPerNode<2 is intentional: a single-instance cluster
+// has no standby to absorb writes, so a rolling upgrade WILL produce real
+// downtime. Reporting that as a policy violation is a true positive but not
+// useful — there is nothing the operator can do about it. The next scheduler
+// tick (10s later) re-evaluates this precondition, so as soon as the cluster
+// is scaled up to instancesPerNode>=2 the upgrade becomes eligible again.
+// Note: the global cooldown is NOT consumed by a skipped operation —
+// see scheduler.tryExecute(): lastOpTime is only updated after executeOp()
+// has actually run (whether or not the operation itself succeeded), never
+// when selection is skipped. So this guard is "free" from a scheduling
+// perspective.
+func (u *UpgradeDocumentDB) Precondition(ctx context.Context) (bool, string) {
+	desired, err := u.readDesiredVersion(ctx)
+	if err != nil {
+		return false, fmt.Sprintf("cannot read desired version: %v", err)
+	}
+	if desired == "" {
+		return false, "no desired version published"
+	}
+
+	running, err := u.client.GetCurrentDocumentDBImageTag(ctx)
+	if err != nil {
+		return false, fmt.Sprintf("cannot read running version: %v", err)
+	}
+	if running == desired {
+		return false, fmt.Sprintf("already at desired version %s", desired)
+	}
+
+	ipn, err := u.client.GetInstancesPerNode(ctx)
+	if err != nil {
+		return false, fmt.Sprintf("cannot read instancesPerNode: %v", err)
+	}
+	if ipn < 2 {
+		return false, fmt.Sprintf("instancesPerNode=%d (no HA standby) — upgrade would cause real downtime; skipping", ipn)
+	}
+	return true, ""
+}
+
+// Execute patches the CR to the desired version, waits until
+// status.documentDBImage reports that version, and then waits for the health
+// monitor to report steady state. Each of the two waits is bounded by its own
+// u.recovery timeout.
+//
+// It returns an error when the desired version cannot be read or is empty,
+// when the CR patch fails, or when either wait does not finish in time.
+func (u *UpgradeDocumentDB) Execute(ctx context.Context) error {
+	desired, err := u.readDesiredVersion(ctx)
+	if err != nil {
+		return fmt.Errorf("read desired version: %w", err)
+	}
+	if desired == "" {
+		return fmt.Errorf("desired version is empty")
+	}
+
+	// The running tag is only used to enrich the timeout message produced by
+	// waitForImage, so a failed read must not abort the upgrade. Fall back to
+	// a placeholder instead of reporting an empty "(was )".
+	running, err := u.client.GetCurrentDocumentDBImageTag(ctx)
+	if err != nil {
+		running = "unknown"
+	}
+	if err := u.client.UpgradeDocumentDB(ctx, desired); err != nil {
+		return fmt.Errorf("patch CR: %w", err)
+	}
+
+	// Poll status.documentDBImage until it reflects the desired version.
+	pollCtx, cancel := context.WithTimeout(ctx, u.recovery)
+	defer cancel()
+	if err := u.waitForImage(pollCtx, desired, running); err != nil {
+		return err
+	}
+
+	// Wait for the cluster to settle after the rolling restart.
+	steadyCtx, cancel2 := context.WithTimeout(ctx, u.recovery)
+	defer cancel2()
+	return u.healthMon.WaitForSteadyState(steadyCtx)
+}
+
+// waitForImage polls the CR status until the reported image tag equals
+// desired or ctx is done. The first check happens immediately and later
+// checks every 10 seconds. previous is only used in the error message.
+func (u *UpgradeDocumentDB) waitForImage(ctx context.Context, desired, previous string) error {
+	ticker := time.NewTicker(10 * time.Second)
+	defer ticker.Stop()
+	for {
+		if err := ctx.Err(); err != nil {
+			return fmt.Errorf("timed out waiting for status.documentDBImage to become %s (was %s): %w", desired, previous, err)
+		}
+
+		tag, err := u.client.GetCurrentDocumentDBImageTag(ctx)
+		switch {
+		case err != nil:
+			// Transient read errors during a rolling upgrade are
+			// expected (apiserver throttling, brief CR webhook
+			// blips, etc.) — retry on the next tick rather than
+			// fail the whole operation. But surface them to the
+			// journal so a sustained read outage is visible in
+			// the report instead of being silently dropped.
+			u.j.Warn("upgrade", fmt.Sprintf("status read error (will retry): %v", err))
+		case tag == desired:
+			return nil
+		}
+
+		// Sleep until the next tick; a cancelled/expired ctx is turned into
+		// the timeout error at the top of the loop.
+		select {
+		case <-ctx.Done():
+		case <-ticker.C:
+		}
+	}
+}
+
+// readDesiredVersion returns the value stored under VersionConfigMapKey in
+// the VersionConfigMapName ConfigMap. A missing ConfigMap or a missing key
+// yields an empty string and a nil error; any other lookup failure is
+// returned to the caller.
+func (u *UpgradeDocumentDB) readDesiredVersion(ctx context.Context) (string, error) {
+	cm, err := u.clientset.CoreV1().ConfigMaps(u.namespace).Get(ctx, VersionConfigMapName, metav1.GetOptions{})
+	if err != nil {
+		if apierrors.IsNotFound(err) {
+			return "", nil
+		}
+		return "", err
+	}
+	return cm.Data[VersionConfigMapKey], nil
+}
+
+// OutagePolicy allows for a longer disruption window during an upgrade
+// because rolling restarts touch every pod sequentially.
+func (u *UpgradeDocumentDB) OutagePolicy() journal.OutagePolicy {
+	return journal.OutagePolicy{
+		AllowedDowntime:      120 * time.Second,
+		AllowedWriteFailures: 200,
+		MustRecoverWithin:    u.recovery,
+	}
+}
diff --git a/test/longhaul/operations/upgrade_test.go b/test/longhaul/operations/upgrade_test.go
new file mode 100644
index 000000000..f77be4494
--- /dev/null
+++ b/test/longhaul/operations/upgrade_test.go
@@ -0,0 +1,96 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+package operations
+
+import (
+	"context"
+	"time"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/kubernetes/fake"
+)
+
+// Specs for the UpgradeDocumentDB operation. They use the fake clientset for
+// the desired-version ConfigMap and fakeClient for the cluster; Execute is
+// not exercised here.
+var _ = Describe("UpgradeDocumentDB", func() {
+	It("Name is upgrade-documentdb and Weight is 1 (low so it doesn't crowd out other ops)", func() {
+		u := NewUpgradeDocumentDB(&fakeClient{}, fake.NewSimpleClientset(), nil, nil, "ns", time.Minute)
+		Expect(u.Name()).To(Equal("upgrade-documentdb"))
+		Expect(u.Weight()).To(Equal(1))
+	})
+
+	It("OutagePolicy gives upgrades a longer downtime budget", func() {
+		u := NewUpgradeDocumentDB(&fakeClient{}, fake.NewSimpleClientset(), nil, nil, "ns", 10*time.Minute)
+		p := u.OutagePolicy()
+		Expect(p.AllowedDowntime).To(Equal(120 * time.Second))
+		Expect(p.AllowedWriteFailures).To(Equal(int64(200)))
+		Expect(p.MustRecoverWithin).To(Equal(10 * time.Minute))
+	})
+
+	Describe("readDesiredVersion", func() {
+		It("returns empty string and no error when ConfigMap is missing", func() {
+			cs := fake.NewSimpleClientset()
+			u := NewUpgradeDocumentDB(&fakeClient{}, cs, nil, nil, "ns", time.Minute)
+
+			got, err := u.readDesiredVersion(context.Background())
+			Expect(err).NotTo(HaveOccurred())
+			Expect(got).To(BeEmpty())
+		})
+
+		It("returns the value when CM has the expected key", func() {
+			cm := &corev1.ConfigMap{
+				ObjectMeta: metav1.ObjectMeta{Name: VersionConfigMapName, Namespace: "ns"},
+				Data:       map[string]string{VersionConfigMapKey: "0.110.0"},
+			}
+			cs := fake.NewSimpleClientset(cm)
+			u := NewUpgradeDocumentDB(&fakeClient{}, cs, nil, nil, "ns", time.Minute)
+
+			got, err := u.readDesiredVersion(context.Background())
+			Expect(err).NotTo(HaveOccurred())
+			Expect(got).To(Equal("0.110.0"))
+		})
+
+		It("returns empty string when CM exists but the key is missing", func() {
+			cm := &corev1.ConfigMap{
+				ObjectMeta: metav1.ObjectMeta{Name: VersionConfigMapName, Namespace: "ns"},
+				Data:       map[string]string{"unrelated": "value"},
+			}
+			cs := fake.NewSimpleClientset(cm)
+			u := NewUpgradeDocumentDB(&fakeClient{}, cs, nil, nil, "ns", time.Minute)
+
+			got, err := u.readDesiredVersion(context.Background())
+			Expect(err).NotTo(HaveOccurred())
+			Expect(got).To(BeEmpty())
+		})
+	})
+
+	DescribeTable("Precondition",
+		func(desired, runningTag string, ipn int, wantOK bool, wantReasonHas string) {
+			const ns = "ns"
+			// An empty desired version is modelled as "no ConfigMap published".
+			cs := fake.NewSimpleClientset()
+			if desired != "" {
+				cm := &corev1.ConfigMap{
+					ObjectMeta: metav1.ObjectMeta{Name: VersionConfigMapName, Namespace: ns},
+					Data:       map[string]string{VersionConfigMapKey: desired},
+				}
+				cs = fake.NewSimpleClientset(cm)
+			}
+			c := &fakeClient{instancesPerNode: ipn, imageTag: runningTag}
+			u := NewUpgradeDocumentDB(c, cs, nil, nil, ns, time.Minute)
+
+			ok, reason := u.Precondition(context.Background())
+			Expect(ok).To(Equal(wantOK), "reason=%q", reason)
+			if wantReasonHas != "" {
+				Expect(reason).To(ContainSubstring(wantReasonHas))
+			}
+		},
+		Entry("no desired version published", "", "0.109.0", 2, false, "no desired version"),
+		Entry("already at desired", "0.110.0", "0.110.0", 2, false, "already at desired"),
+		Entry("single-instance: ipn=1 -> skip", "0.110.0", "0.109.0", 1, false, "no HA standby"),
+		Entry("eligible: HA + version differs", "0.110.0", "0.109.0", 2, true, ""),
+		Entry("eligible: max HA", "0.110.0", "0.109.0", 3, true, ""),
+	)
+})
diff --git a/test/longhaul/report/alert.go b/test/longhaul/report/alert.go
new file mode 100644
index 000000000..c1fcb37a1
--- /dev/null
+++ b/test/longhaul/report/alert.go
@@ -0,0 +1,44 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+package report
+
+import (
+	"fmt"
+	"os"
+	"time"
+)
+
+// isGitHubActions returns true when running inside GitHub Actions.
+func isGitHubActions() bool {
+	return os.Getenv("GITHUB_ACTIONS") == "true"
+}
+
+// escapeAnnotationData escapes the characters that are significant in the
+// message part of a GitHub Actions workflow command ('%', '\r' and '\n') so
+// that free-form text such as an error string cannot truncate the annotation
+// or start a new command on the following line.
+func escapeAnnotationData(s string) string {
+	out := make([]byte, 0, len(s))
+	for i := 0; i < len(s); i++ {
+		switch c := s[i]; c {
+		case '%':
+			out = append(out, "%25"...)
+		case '\r':
+			out = append(out, "%0D"...)
+		case '\n':
+			out = append(out, "%0A"...)
+		default:
+			out = append(out, c)
+		}
+	}
+	return string(out)
+}
+
+// EmitAnnotation emits GitHub Actions workflow annotations based on test status.
+// These annotations appear in the Actions UI on the workflow run summary.
+//
+// It is a no-op outside GitHub Actions. A FAIL summary produces an ::error
+// annotation (including FailReason when set), a PASS summary produces a
+// ::notice annotation, and a suspected memory leak additionally produces a
+// ::warning annotation regardless of the result.
+func EmitAnnotation(s Summary) {
+	if !isGitHubActions() {
+		return
+	}
+
+	switch s.Result {
+	case ResultFail:
+		msg := "Long haul test FAILED"
+		if s.FailReason != "" {
+			msg = fmt.Sprintf("Long haul test FAILED: %s", s.FailReason)
+		}
+		// ::error:: annotations show as red in the Actions UI. FailReason is
+		// free-form text, so escape it before embedding it in the command.
+		fmt.Printf("::error title=Long Haul Test Failure::%s\n", escapeAnnotationData(msg))
+
+	case ResultPass:
+		// For intermediate checkpoints, emit a notice.
+		fmt.Printf("::notice title=Long Haul Checkpoint::PASS after %s — %d writes, %d ops, %d gaps\n",
+			s.Duration.Round(time.Second), s.Metrics.WriteAttempted, s.OpsExecuted, s.Metrics.GapsDetected)
+	}
+
+	// Emit warning for memory leak regardless of result.
+	if s.LeakAnalysis.HasLeak {
+		fmt.Printf("::warning title=Memory Leak Suspected::%.2f MB/hour over %s (%d samples)\n",
+			s.LeakAnalysis.MemorySlopeMB, s.LeakAnalysis.Duration.Round(time.Second), s.LeakAnalysis.SampleCount)
+	}
+}
diff --git a/test/longhaul/report/alert_test.go b/test/longhaul/report/alert_test.go
new file mode 100644
index 000000000..8d190d96b
--- /dev/null
+++ b/test/longhaul/report/alert_test.go
@@ -0,0 +1,115 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+package report
+
+import (
+	"bytes"
+	"io"
+	"os"
+	"time"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	"github.com/documentdb/documentdb-operator/test/longhaul/monitor"
+	"github.com/documentdb/documentdb-operator/test/longhaul/workload"
+)
+
+// captureStdout redirects os.Stdout for the duration of fn and returns what was
+// written. EmitAnnotation uses fmt.Printf which writes directly to os.Stdout,
+// so we cannot use a *bytes.Buffer here.
+func captureStdout(fn func()) string {
+	r, w, err := os.Pipe()
+	Expect(err).NotTo(HaveOccurred())
+
+	orig := os.Stdout
+	os.Stdout = w
+	defer func() { os.Stdout = orig }()
+
+	// Drain the pipe concurrently so fn cannot block on a full pipe buffer.
+	done := make(chan string, 1)
+	go func() {
+		var buf bytes.Buffer
+		_, _ = io.Copy(&buf, r)
+		done <- buf.String()
+	}()
+	fn()
+	// Closing the write end makes io.Copy see EOF and finish.
+	_ = w.Close()
+	return <-done
+}
+
+var _ = Describe("EmitAnnotation", func() {
+	Context("outside GitHub Actions (GITHUB_ACTIONS unset)", func() {
+		BeforeEach(func() {
+			GinkgoT().Setenv("GITHUB_ACTIONS", "")
+		})
+
+		It("is a silent no-op", func() {
+			out := captureStdout(func() {
+				EmitAnnotation(Summary{Result: ResultFail, FailReason: "boom"})
+			})
+			Expect(out).To(BeEmpty())
+		})
+	})
+
+	Context("inside GitHub Actions (GITHUB_ACTIONS=true)", func() {
+		BeforeEach(func() {
+			GinkgoT().Setenv("GITHUB_ACTIONS", "true")
+		})
+
+		It("emits ::error for FAIL and includes the reason", func() {
+			out := captureStdout(func() {
+				EmitAnnotation(Summary{Result: ResultFail, FailReason: "data loss detected"})
+			})
+			Expect(out).To(ContainSubstring("::error"))
+			Expect(out).To(ContainSubstring("data loss detected"))
+		})
+
+		It("emits a default ::error when FAIL has no reason", func() {
+			out := captureStdout(func() {
+				EmitAnnotation(Summary{Result: ResultFail})
+			})
+			Expect(out).To(ContainSubstring("::error"))
+			Expect(out).To(ContainSubstring("Long haul test FAILED"))
+		})
+
+		It("emits ::notice on PASS with metric values", func() {
+			out := captureStdout(func() {
+				EmitAnnotation(Summary{
+					Result:      ResultPass,
+					Duration:    2 * time.Hour,
+					OpsExecuted: 17,
+					Metrics:     workload.MetricsSnapshot{WriteAttempted: 1234, GapsDetected: 0},
+				})
+			})
+			Expect(out).To(ContainSubstring("::notice"))
+			Expect(out).To(ContainSubstring("1234"))
+			Expect(out).To(ContainSubstring("17"))
+		})
+
+		DescribeTable("emits leak ::warning regardless of result when HasLeak=true",
+			func(res Result) {
+				leak := monitor.LeakAnalysis{
+					HasLeak:       true,
+					MemorySlopeMB: 12.5,
+					Duration:      90 * time.Minute,
+					SampleCount:   60,
+				}
+				out := captureStdout(func() {
+					EmitAnnotation(Summary{Result: res, LeakAnalysis: leak})
+				})
+				Expect(out).To(ContainSubstring("::warning"))
+				// The slope is printed with two decimals (%.2f).
+				Expect(out).To(ContainSubstring("12.50"))
+			},
+			Entry("on PASS", ResultPass),
+			Entry("on FAIL", ResultFail),
+		)
+
+		It("does not emit a leak ::warning when HasLeak=false", func() {
+			out := captureStdout(func() {
+				EmitAnnotation(Summary{Result: ResultPass, LeakAnalysis: monitor.LeakAnalysis{HasLeak: false}})
+			})
+			Expect(out).NotTo(ContainSubstring("::warning"))
+		})
+	})
+})
diff --git a/test/longhaul/report/checkpoint.go b/test/longhaul/report/checkpoint.go
new file mode 100644
index 000000000..363f1a87d
--- /dev/null
+++ b/test/longhaul/report/checkpoint.go
@@ -0,0 +1,139 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+package report
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"log"
+	"time"
+
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/kubernetes"
+)
+
+const (
+	// ConfigMapName is the name of the ConfigMap used to persist reports.
+	ConfigMapName = "longhaul-report"
+)
+
+// SummaryFunc is called to generate the current test summary.
+type SummaryFunc func() Summary
+
+// CheckpointReporter periodically generates and persists reports.
+type CheckpointReporter struct {
+	clientset   kubernetes.Interface // nil disables ConfigMap persistence
+	namespace   string               // namespace of the report ConfigMap
+	interval    time.Duration        // time between checkpoints
+	summaryFunc SummaryFunc          // produces the summary for each checkpoint
+}
+
+// NewCheckpointReporter creates a periodic reporter that writes to stdout and ConfigMap.
+func NewCheckpointReporter(clientset kubernetes.Interface, namespace string, interval time.Duration, fn SummaryFunc) *CheckpointReporter {
+	return &CheckpointReporter{
+		clientset:   clientset,
+		namespace:   namespace,
+		interval:    interval,
+		summaryFunc: fn,
+	}
+}
+
+// Run starts the periodic reporting loop. Blocks until context is cancelled.
+func (r *CheckpointReporter) Run(ctx context.Context) {
+	log.Printf("[checkpoint] periodic reporter started (interval=%s)", r.interval)
+	defer log.Println("[checkpoint] periodic reporter stopped")
+
+	ticker := time.NewTicker(r.interval)
+	defer ticker.Stop()
+
+	for {
+		select {
+		case <-ctx.Done():
+			// Final report on exit. ctx is already cancelled, so a fresh
+			// context is needed for the API calls — but bound it so an
+			// unreachable API server cannot block shutdown indefinitely.
+			finalCtx, cancel := context.WithTimeout(context.Background(), finalReportTimeout)
+			r.emit(finalCtx)
+			cancel()
+			return
+		case <-ticker.C:
+			r.emit(ctx)
+		}
+	}
+}
+
+// finalReportTimeout bounds the last checkpoint written while shutting down.
+const finalReportTimeout = 30 * time.Second
+
+// emit produces one checkpoint: it prints the markdown report to stdout,
+// emits GitHub Actions annotations, persists the report to the ConfigMap when
+// a clientset is configured, and logs a one-line JSON summary.
+func (r *CheckpointReporter) emit(ctx context.Context) {
+	summary := r.summaryFunc()
+
+	// Mark as RUNNING for intermediate checkpoints (unless already FAIL).
+	resultStr := string(summary.Result)
+	if summary.Result == ResultPass {
+		resultStr = "RUNNING"
+	}
+
+	markdown := GenerateMarkdown(summary)
+
+	// Print to stdout with clear delimiter.
+	fmt.Printf("\n%s\n", "=== CHECKPOINT REPORT ===")
+	fmt.Println(markdown)
+	fmt.Printf("%s\n\n", "=== END CHECKPOINT ===")
+
+	// Emit GitHub Actions annotations.
+	EmitAnnotation(summary)
+
+	// Persist to ConfigMap (skipped when no clientset is configured).
+	if r.clientset != nil {
+		r.persist(ctx, markdown, resultStr)
+	}
+
+	// Also log the summary as JSON for structured log consumers. This runs
+	// whether or not a clientset is configured. Marshal cannot fail for a map
+	// of strings, integers and booleans, so the error is deliberately ignored.
+	summaryJSON, _ := json.Marshal(map[string]interface{}{
+		"result":          resultStr,
+		"elapsed":         summary.Duration.String(),
+		"writes":          summary.Metrics.WriteAttempted,
+		"gaps":            summary.Metrics.GapsDetected,
+		"ops":             summary.OpsExecuted,
+		"memory_leak":     summary.LeakAnalysis.HasLeak,
+		"memory_slope":    fmt.Sprintf("%.2f MB/h", summary.LeakAnalysis.MemorySlopeMB),
+		"checkpoint_time": time.Now().UTC().Format(time.RFC3339),
+	})
+	log.Printf("[checkpoint] %s", string(summaryJSON))
+}
+
+// persist writes the report into the ConfigMapName ConfigMap, creating it on
+// first use and replacing its Data afterwards. Failures are logged, not
+// returned: persistence is best-effort and must not stop the reporter.
+func (r *CheckpointReporter) persist(ctx context.Context, markdown, resultStr string) {
+	data := map[string]string{
+		"latest-report": markdown,
+		"last-updated":  time.Now().UTC().Format(time.RFC3339),
+		"result":        resultStr,
+	}
+
+	cm := &corev1.ConfigMap{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      ConfigMapName,
+			Namespace: r.namespace,
+			Labels: map[string]string{
+				"app.kubernetes.io/name":    "longhaul-test",
+				"app.kubernetes.io/part-of": "documentdb-operator",
+			},
+		},
+		Data: data,
+	}
+
+	configMaps := r.clientset.CoreV1().ConfigMaps(r.namespace)
+	existing, err := configMaps.Get(ctx, ConfigMapName, metav1.GetOptions{})
+	switch {
+	case errors.IsNotFound(err):
+		if _, err = configMaps.Create(ctx, cm, metav1.CreateOptions{}); err != nil {
+			log.Printf("[checkpoint] failed to create ConfigMap: %v", err)
+		} else {
+			log.Println("[checkpoint] ConfigMap created")
+		}
+	case err != nil:
+		log.Printf("[checkpoint] failed to get ConfigMap: %v", err)
+	default:
+		existing.Data = data
+		if _, err = configMaps.Update(ctx, existing, metav1.UpdateOptions{}); err != nil {
+			log.Printf("[checkpoint] failed to update ConfigMap: %v", err)
+		} else {
+			log.Println("[checkpoint] ConfigMap updated")
+		}
+	}
+}
diff --git a/test/longhaul/report/checkpoint_test.go b/test/longhaul/report/checkpoint_test.go
new file mode 100644
index 000000000..29932173e
--- /dev/null
+++ b/test/longhaul/report/checkpoint_test.go
@@ -0,0 +1,79 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+package report
+
+import (
+	"context"
+	"time"
+
+	. "github.com/onsi/ginkgo/v2"
+	.
"github.com/onsi/gomega"
+
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/kubernetes/fake"
+)
+
+// Specs for CheckpointReporter.emit. They call emit directly (Run is not
+// exercised) and use the fake clientset to observe the persisted ConfigMap.
+var _ = Describe("CheckpointReporter", func() {
+	It("emit() is safe with a nil clientset (logs to stdout, does not panic)", func() {
+		r := NewCheckpointReporter(nil, "ns", time.Second, func() Summary {
+			return Summary{Result: ResultPass, Duration: time.Minute}
+		})
+		Expect(func() { r.emit(context.Background()) }).NotTo(Panic())
+	})
+
+	It("creates the ConfigMap on first emit and labels it identifiably", func() {
+		cs := fake.NewSimpleClientset()
+		r := NewCheckpointReporter(cs, "ns", time.Second, func() Summary {
+			return Summary{Result: ResultPass, Duration: 2 * time.Hour, OpsExecuted: 5}
+		})
+
+		r.emit(context.Background())
+
+		cm, err := cs.CoreV1().ConfigMaps("ns").Get(context.Background(), ConfigMapName, metav1.GetOptions{})
+		Expect(err).NotTo(HaveOccurred())
+		Expect(cm.Data).To(HaveKey("latest-report"))
+		Expect(cm.Data).To(HaveKey("last-updated"))
+		Expect(cm.Data).To(HaveKey("result"))
+		// PASS at intermediate checkpoint is persisted as RUNNING so consumers
+		// can distinguish in-flight from final state.
+		Expect(cm.Data["result"]).To(Equal("RUNNING"))
+		Expect(cm.Labels).To(HaveKeyWithValue("app.kubernetes.io/name", "longhaul-test"))
+	})
+
+	It("persists FAIL results as FAIL", func() {
+		cs := fake.NewSimpleClientset()
+		r := NewCheckpointReporter(cs, "ns", time.Second, func() Summary {
+			return Summary{Result: ResultFail, FailReason: "data loss"}
+		})
+
+		r.emit(context.Background())
+
+		cm, err := cs.CoreV1().ConfigMaps("ns").Get(context.Background(), ConfigMapName, metav1.GetOptions{})
+		Expect(err).NotTo(HaveOccurred())
+		Expect(cm.Data["result"]).To(Equal("FAIL"))
+	})
+
+	It("Updates the existing ConfigMap on subsequent emits", func() {
+		cs := fake.NewSimpleClientset()
+
+		// Each call yields a different summary so the two reports differ.
+		calls := 0
+		r := NewCheckpointReporter(cs, "ns", time.Second, func() Summary {
+			calls++
+			return Summary{Result: ResultPass, Duration: time.Duration(calls) * time.Hour, OpsExecuted: calls * 10}
+		})
+
+		r.emit(context.Background())
+		cm1, err := cs.CoreV1().ConfigMaps("ns").Get(context.Background(), ConfigMapName, metav1.GetOptions{})
+		Expect(err).NotTo(HaveOccurred())
+		report1 := cm1.Data["latest-report"]
+
+		// Fake clientset doesn't bump ResourceVersion automatically, so assert
+		// on content change instead.
+		r.emit(context.Background())
+		cm2, err := cs.CoreV1().ConfigMaps("ns").Get(context.Background(), ConfigMapName, metav1.GetOptions{})
+		Expect(err).NotTo(HaveOccurred())
+		Expect(cm2.Data["latest-report"]).NotTo(Equal(report1))
+		Expect(calls).To(Equal(2))
+	})
+})
diff --git a/test/longhaul/report/report.go b/test/longhaul/report/report.go
new file mode 100644
index 000000000..4b43ebe47
--- /dev/null
+++ b/test/longhaul/report/report.go
@@ -0,0 +1,108 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+// Package report generates a markdown summary of the long haul test run.
+package report
+
+import (
+	"fmt"
+	"strings"
+	"time"
+
+	"github.com/documentdb/documentdb-operator/test/longhaul/journal"
+	"github.com/documentdb/documentdb-operator/test/longhaul/monitor"
+	"github.com/documentdb/documentdb-operator/test/longhaul/workload"
+)
+
+// Result is the overall outcome of a test run.
+type Result string
+
+// Outcomes a Summary can carry.
+const (
+	ResultPass Result = "PASS"
+	ResultFail Result = "FAIL"
+)
+
+// Summary bundles everything the report needs: the outcome, elapsed time,
+// data-plane counters, leak analysis, scheduler activity and journal data.
+type Summary struct {
+	Result       Result
+	Duration     time.Duration
+	Metrics      workload.MetricsSnapshot
+	LeakAnalysis monitor.LeakAnalysis
+	OpsExecuted  int
+	Windows      []journal.DisruptionWindow
+	Events       []journal.Event
+	FailReason   string
+}
+
+// GenerateMarkdown renders s as a human-readable markdown document.
+//
+// The header, the data-plane metrics table and the recent-events block are
+// always present. The disruption-window table appears only when s.Windows is
+// non-empty, and the leak section only when at least one sample was taken.
+// At most the last 20 events are listed.
+func GenerateMarkdown(s Summary) string {
+	var out strings.Builder
+
+	out.WriteString("# Long Haul Test Report\n\n")
+
+	// Header
+	fmt.Fprintf(&out, "**Result:** %s\n", s.Result)
+	fmt.Fprintf(&out, "**Duration:** %s\n", s.Duration.Round(time.Second))
+	fmt.Fprintf(&out, "**Operations Executed:** %d\n", s.OpsExecuted)
+	if s.FailReason != "" {
+		fmt.Fprintf(&out, "**Failure Reason:** %s\n", s.FailReason)
+	}
+	out.WriteString("\n")
+
+	// Data Plane Metrics
+	m := s.Metrics
+	out.WriteString("## Data Plane Metrics\n\n")
+	out.WriteString("| Metric | Value |\n")
+	out.WriteString("|--------|-------|\n")
+	fmt.Fprintf(&out, "| Writes Attempted | %d |\n", m.WriteAttempted)
+	fmt.Fprintf(&out, "| Writes Acknowledged | %d |\n", m.WriteAcknowledged)
+	fmt.Fprintf(&out, "| Writes Failed | %d |\n", m.WriteFailed)
+	fmt.Fprintf(&out, "| Write Success Rate | %.2f%% |\n", m.WriteSuccessRate()*100)
+	fmt.Fprintf(&out, "| Verify Passes | %d |\n", m.VerifyPasses)
+	fmt.Fprintf(&out, "| Gaps Detected | %d |\n", m.GapsDetected)
+	fmt.Fprintf(&out, "| Checksum Errors | %d |\n", m.ChecksumErrors)
+	out.WriteString("\n")
+
+	// Disruption Windows
+	if len(s.Windows) > 0 {
+		out.WriteString("## Disruption Windows\n\n")
+		out.WriteString("| Operation | Duration | Write Failures | Policy Exceeded |\n")
+		out.WriteString("|-----------|----------|----------------|------------------|\n")
+		for _, win := range s.Windows {
+			verdict := "No"
+			if win.ExceededPolicy() {
+				verdict = "**YES**"
+			}
+			fmt.Fprintf(&out, "| %s | %s | %d | %s |\n",
+				win.OperationName, win.Duration().Round(time.Second), win.WriteFailures, verdict)
+		}
+		out.WriteString("\n")
+	}
+
+	// Leak Analysis
+	if leak := s.LeakAnalysis; leak.SampleCount > 0 {
+		out.WriteString("## Resource Leak Analysis\n\n")
+		fmt.Fprintf(&out, "- Samples: %d over %s\n", leak.SampleCount, leak.Duration.Round(time.Minute))
+		fmt.Fprintf(&out, "- Memory trend: %.2f MB/hour\n", leak.MemorySlopeMB)
+		fmt.Fprintf(&out, "- CPU trend: %.4f cores/hour\n", leak.CPUSlopeCores)
+		if leak.HasLeak {
+			out.WriteString("- **⚠️ Memory leak suspected**\n")
+		}
+		out.WriteString("\n")
+	}
+
+	// Recent Events (last 20)
+	recent := s.Events
+	if extra := len(recent) - 20; extra > 0 {
+		recent = recent[extra:]
+	}
+	out.WriteString("## Recent Events\n\n")
+	out.WriteString("```\n")
+	for _, ev := range recent {
+		out.WriteString(ev.String() + "\n")
+	}
+	out.WriteString("```\n")
+
+	return out.String()
+}
diff --git a/test/longhaul/report/report_test.go b/test/longhaul/report/report_test.go
new file mode 100644
index 000000000..f8b77297e
--- /dev/null
+++ b/test/longhaul/report/report_test.go
@@ -0,0 +1,118 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+package report
+
+import (
+	"strings"
+	"time"
+
+	. "github.com/onsi/ginkgo/v2"
+	.
"github.com/onsi/gomega" + + "github.com/documentdb/documentdb-operator/test/longhaul/journal" + "github.com/documentdb/documentdb-operator/test/longhaul/monitor" + "github.com/documentdb/documentdb-operator/test/longhaul/workload" +) + +var _ = Describe("GenerateMarkdown", func() { + It("PASS report has header, rounded duration, and no failure reason", func() { + md := GenerateMarkdown(Summary{Result: ResultPass, Duration: 3 * time.Hour}) + Expect(md).To(ContainSubstring("**Result:** PASS")) + Expect(md).To(ContainSubstring("**Duration:** 3h0m0s")) + Expect(md).NotTo(ContainSubstring("Failure Reason")) + }) + + It("FAIL report includes header and reason", func() { + md := GenerateMarkdown(Summary{ + Result: ResultFail, + Duration: 90 * time.Second, + FailReason: "policy exceeded on scale-up", + }) + Expect(md).To(ContainSubstring("**Result:** FAIL")) + Expect(md).To(ContainSubstring("policy exceeded on scale-up")) + }) + + It("rounds duration to seconds", func() { + md := GenerateMarkdown(Summary{ + Result: ResultPass, + Duration: 2*time.Second + 678*time.Millisecond, + }) + Expect(md).To(ContainSubstring("**Duration:** 3s")) + }) + + It("always includes the Data Plane Metrics table", func() { + md := GenerateMarkdown(Summary{ + Result: ResultPass, + Metrics: workload.MetricsSnapshot{ + WriteAttempted: 1000, + WriteAcknowledged: 998, + WriteFailed: 2, + VerifyPasses: 10, + GapsDetected: 0, + ChecksumErrors: 0, + }, + }) + for _, want := range []string{ + "## Data Plane Metrics", + "| Writes Attempted | 1000 |", + "| Writes Acknowledged | 998 |", + "| Writes Failed | 2 |", + } { + Expect(md).To(ContainSubstring(want)) + } + }) + + Describe("Disruption Windows section", func() { + It("is hidden when there are no windows", func() { + md := GenerateMarkdown(Summary{Result: ResultPass}) + Expect(md).NotTo(ContainSubstring("Disruption Windows")) + }) + + It("appears with the operation name when at least one window exists", func() { + now := time.Now() + md := 
GenerateMarkdown(Summary{ + Result: ResultPass, + Windows: []journal.DisruptionWindow{{ + OperationName: "scale-up", + StartTime: now.Add(-30 * time.Second), + EndTime: now, + WriteFailures: 3, + Policy: journal.OutagePolicy{MustRecoverWithin: time.Minute, AllowedWriteFailures: 50}, + }}, + }) + Expect(md).To(ContainSubstring("Disruption Windows")) + Expect(md).To(ContainSubstring("scale-up")) + }) + }) + + Describe("Resource Leak Analysis section", func() { + It("is hidden when SampleCount=0", func() { + md := GenerateMarkdown(Summary{Result: ResultPass}) + Expect(md).NotTo(ContainSubstring("Resource Leak Analysis")) + }) + + It("appears with a warning when HasLeak=true and SampleCount>0", func() { + md := GenerateMarkdown(Summary{ + Result: ResultPass, + LeakAnalysis: monitor.LeakAnalysis{SampleCount: 100, HasLeak: true, MemorySlopeMB: 250.5}, + }) + Expect(md).To(ContainSubstring("Resource Leak Analysis")) + Expect(md).To(ContainSubstring("Memory leak suspected")) + }) + }) + + It("truncates Recent Events to the last 20", func() { + events := make([]journal.Event, 30) + for i := range events { + events[i] = journal.Event{ + Timestamp: time.Now(), + Level: journal.LevelInfo, + Component: "x", + Message: "evt", + } + } + md := GenerateMarkdown(Summary{Result: ResultPass, Events: events}) + Expect(strings.Count(md, "INFO x: evt")).To(Equal(20)) + }) +}) diff --git a/test/longhaul/report/suite_test.go b/test/longhaul/report/suite_test.go new file mode 100644 index 000000000..2b8b03867 --- /dev/null +++ b/test/longhaul/report/suite_test.go @@ -0,0 +1,16 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package report + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +func TestReport(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Long Haul Report Suite") +} diff --git a/test/longhaul/workload/metrics.go b/test/longhaul/workload/metrics.go new file mode 100644 index 000000000..864f28282 --- /dev/null +++ b/test/longhaul/workload/metrics.go @@ -0,0 +1,74 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package workload implements the data-plane workload for long haul tests, +// including writers that generate sequential inserts and verifiers that +// detect gaps and checksum mismatches. +package workload + +import ( + "sync/atomic" + "time" +) + +// Metrics tracks aggregate workload counters using atomic operations. +// All counter fields are safe for concurrent access from multiple goroutines. +type Metrics struct { + // Writer metrics + WriteAttempted atomic.Int64 + WriteAcknowledged atomic.Int64 + WriteFailed atomic.Int64 + + // Verifier metrics + VerifyPasses atomic.Int64 + VerifyGapsDetected atomic.Int64 + ChecksumErrors atomic.Int64 + + // Timing + StartTime time.Time +} + +// NewMetrics creates a new Metrics instance with the start time set to now. +func NewMetrics() *Metrics { + return &Metrics{ + StartTime: time.Now(), + } +} + +// MetricsSnapshot is a point-in-time copy of all metric values. +type MetricsSnapshot struct { + WriteAttempted int64 + WriteAcknowledged int64 + WriteFailed int64 + VerifyPasses int64 + GapsDetected int64 + ChecksumErrors int64 + Elapsed time.Duration +} + +// Snapshot captures the current metric values; each counter is loaded atomically on its own, not all together as one consistent unit. 
+func (m *Metrics) Snapshot() MetricsSnapshot { + return MetricsSnapshot{ + WriteAttempted: m.WriteAttempted.Load(), + WriteAcknowledged: m.WriteAcknowledged.Load(), + WriteFailed: m.WriteFailed.Load(), + VerifyPasses: m.VerifyPasses.Load(), + GapsDetected: m.VerifyGapsDetected.Load(), + ChecksumErrors: m.ChecksumErrors.Load(), + Elapsed: time.Since(m.StartTime), + } +} + +// WriteSuccessRate returns the ratio of acknowledged to attempted writes. +// Returns 1.0 if no writes have been attempted. +func (s MetricsSnapshot) WriteSuccessRate() float64 { + if s.WriteAttempted == 0 { + return 1.0 + } + return float64(s.WriteAcknowledged) / float64(s.WriteAttempted) +} + +// HasDataLoss returns true if any gaps or checksum errors have been detected. +func (s MetricsSnapshot) HasDataLoss() bool { + return s.GapsDetected > 0 || s.ChecksumErrors > 0 +} diff --git a/test/longhaul/workload/metrics_test.go b/test/longhaul/workload/metrics_test.go new file mode 100644 index 000000000..ef040ba66 --- /dev/null +++ b/test/longhaul/workload/metrics_test.go @@ -0,0 +1,99 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package workload + +import ( + "sync" + "time" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +var _ = Describe("Metrics", func() { + It("NewMetrics records StartTime in the present", func() { + before := time.Now() + m := NewMetrics() + after := time.Now() + Expect(m.StartTime).To(BeTemporally(">=", before)) + Expect(m.StartTime).To(BeTemporally("<=", after)) + }) + + It("Snapshot reads all counters and reports a positive elapsed", func() { + m := NewMetrics() + m.WriteAttempted.Store(10) + m.WriteAcknowledged.Store(8) + m.WriteFailed.Store(2) + m.VerifyPasses.Store(3) + m.VerifyGapsDetected.Store(1) + m.ChecksumErrors.Store(0) + + time.Sleep(2 * time.Millisecond) + s := m.Snapshot() + Expect(s.WriteAttempted).To(Equal(int64(10))) + Expect(s.WriteAcknowledged).To(Equal(int64(8))) + Expect(s.WriteFailed).To(Equal(int64(2))) + Expect(s.VerifyPasses).To(Equal(int64(3))) + Expect(s.GapsDetected).To(Equal(int64(1))) + Expect(s.ChecksumErrors).To(Equal(int64(0))) + Expect(s.Elapsed).To(BeNumerically(">", 0)) + }) + + DescribeTable("WriteSuccessRate", + func(attempted, acked int64, want float64) { + s := MetricsSnapshot{WriteAttempted: attempted, WriteAcknowledged: acked} + Expect(s.WriteSuccessRate()).To(Equal(want)) + }, + Entry("no attempts returns 1.0", int64(0), int64(0), 1.0), + Entry("all succeeded", int64(100), int64(100), 1.0), + Entry("none succeeded", int64(100), int64(0), 0.0), + Entry("half succeeded", int64(100), int64(50), 0.5), + Entry("acked exceeds attempted (sanity, not clamped)", int64(10), int64(12), 1.2), + ) + + DescribeTable("HasDataLoss", + func(gaps, checksums int64, want bool) { + s := MetricsSnapshot{GapsDetected: gaps, ChecksumErrors: checksums} + Expect(s.HasDataLoss()).To(Equal(want)) + }, + Entry("clean", int64(0), int64(0), false), + Entry("one gap", int64(1), int64(0), true), + Entry("one checksum mismatch", int64(0), int64(1), true), + Entry("both", int64(5), int64(7), true), + ) + + It("counters are atomic-safe under concurrent increment + Snapshot", func() { + // All counter mutations happen 
across goroutines in production + // (writer + verifier + scheduler). Verify the atomic.Int64 fields + // don't race under concurrent increment + Snapshot reads. + m := NewMetrics() + const writers = 8 + const perWriter = 1000 + + var wg sync.WaitGroup + wg.Add(writers + 1) + + for i := 0; i < writers; i++ { + go func() { + defer wg.Done() + for j := 0; j < perWriter; j++ { + m.WriteAttempted.Add(1) + m.WriteAcknowledged.Add(1) + } + }() + } + go func() { + defer wg.Done() + for i := 0; i < 1000; i++ { + _ = m.Snapshot() + } + }() + wg.Wait() + + s := m.Snapshot() + want := int64(writers * perWriter) + Expect(s.WriteAttempted).To(Equal(want)) + Expect(s.WriteAcknowledged).To(Equal(want)) + }) +}) diff --git a/test/longhaul/workload/suite_test.go b/test/longhaul/workload/suite_test.go new file mode 100644 index 000000000..1e8f5feda --- /dev/null +++ b/test/longhaul/workload/suite_test.go @@ -0,0 +1,16 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package workload + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +func TestWorkload(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Long Haul Workload Suite") +} diff --git a/test/longhaul/workload/verifier.go b/test/longhaul/workload/verifier.go new file mode 100644 index 000000000..ae147b509 --- /dev/null +++ b/test/longhaul/workload/verifier.go @@ -0,0 +1,163 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package workload + +import ( + "context" + "fmt" + "time" + + "github.com/documentdb/documentdb-operator/test/longhaul/journal" + + "go.mongodb.org/mongo-driver/v2/bson" + "go.mongodb.org/mongo-driver/v2/mongo" + "go.mongodb.org/mongo-driver/v2/mongo/options" + "go.mongodb.org/mongo-driver/v2/mongo/readconcern" +) + +const ( + // verifyInterval is how often the verifier scans for gaps. 
+ verifyInterval = 10 * time.Second +) + +// Verifier periodically scans the workload collection to detect +// sequence gaps and checksum mismatches in acknowledged writes. +// +// To bound the per-cycle scan cost over a multi-day run, the verifier tracks +// the next expected sequence per writer in nextSeq and only scans documents +// with seq >= nextSeq. Without this, a 100ms-per-write writer accumulates +// ~864k docs/day and verifyAll would re-read the entire history every 10s +// (~75M doc-reads/hour per writer), which both saturates the cluster and +// turns the verifier's own load into a confounding signal in the report. +type Verifier struct { + id string + metrics *Metrics + journal *journal.Journal + collection *mongo.Collection + + // nextSeq is the next sequence number we expect to see for each writer. + // Only mutated from the verifier goroutine, so no lock is needed. + nextSeq map[string]int64 +} + +// NewVerifier creates a verifier with the given ID. +func NewVerifier(id string, db *mongo.Database, metrics *Metrics, j *journal.Journal) *Verifier { + coll := db.Collection(CollectionName, options.Collection(). + SetReadConcern(readconcern.Majority())) + return &Verifier{ + id: id, + metrics: metrics, + journal: j, + collection: coll, + nextSeq: make(map[string]int64), + } +} + +// Run starts the verifier loop. It blocks until the context is cancelled. +func (v *Verifier) Run(ctx context.Context) { + v.journal.Info("verifier", fmt.Sprintf("verifier %s started", v.id)) + defer v.journal.Info("verifier", fmt.Sprintf("verifier %s stopped", v.id)) + + ticker := time.NewTicker(verifyInterval) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + v.verifyAll(ctx) + } + } +} + +func (v *Verifier) verifyAll(ctx context.Context) { + // Get distinct writer IDs using aggregation (v2 API compatible). 
+ pipeline := bson.A{ + bson.D{{Key: "$group", Value: bson.D{{Key: "_id", Value: "$writer_id"}}}}, + } + cursor, err := v.collection.Aggregate(ctx, pipeline) + if err != nil { + v.journal.Warn("verifier", fmt.Sprintf("failed to get writer IDs: %v", err)) + return + } + defer cursor.Close(ctx) + + var results []struct { + ID string `bson:"_id"` + } + if err := cursor.All(ctx, &results); err != nil { + v.journal.Warn("verifier", fmt.Sprintf("failed to decode writer IDs: %v", err)) + return + } + + for _, r := range results { + v.verifyWriter(ctx, r.ID) + } + + v.metrics.VerifyPasses.Add(1) +} + +func (v *Verifier) verifyWriter(ctx context.Context, writerID string) { + // Resume from where the previous cycle left off. First-ever scan starts at 1. + expectedSeq := v.nextSeq[writerID] + if expectedSeq == 0 { + expectedSeq = 1 + } + + opts := options.Find().SetSort(bson.D{{Key: "seq", Value: 1}}) + filter := bson.D{ + {Key: "writer_id", Value: writerID}, + {Key: "seq", Value: bson.D{{Key: "$gte", Value: expectedSeq}}}, + } + cursor, err := v.collection.Find(ctx, filter, opts) + if err != nil { + v.journal.Warn("verifier", fmt.Sprintf("query failed for writer %s: %v", writerID, err)) + return + } + defer cursor.Close(ctx) + + for cursor.Next(ctx) { + var doc WriteDocument + if err := cursor.Decode(&doc); err != nil { + v.journal.Warn("verifier", fmt.Sprintf("decode error for writer %s: %v", writerID, err)) + continue + } + + // Check for gaps in the sequence. + if doc.Seq > expectedSeq { + gaps := doc.Seq - expectedSeq + v.metrics.VerifyGapsDetected.Add(gaps) + v.journal.Error("verifier", fmt.Sprintf( + "gap detected: writer=%s expected_seq=%d got_seq=%d (missing %d)", + writerID, expectedSeq, doc.Seq, gaps)) + } + expectedSeq = doc.Seq + 1 + + // Verify checksum. 
+ expected := computeChecksum(doc.WriterID, doc.Seq, doc.Payload) + if doc.Checksum != expected { + v.metrics.ChecksumErrors.Add(1) + v.journal.Error("verifier", fmt.Sprintf( + "checksum mismatch: writer=%s seq=%d stored=%s computed=%s", + writerID, doc.Seq, doc.Checksum, expected)) + } + } + + // Persist the resume point. If the cursor returned no rows, expectedSeq is + // unchanged and we'll re-scan from the same point next cycle (correct: a + // gap might fill in later when a delayed/recovered write commits). + v.nextSeq[writerID] = expectedSeq +} + +// StartVerifiers launches n verifiers and returns them. +func StartVerifiers(ctx context.Context, n int, db *mongo.Database, metrics *Metrics, j *journal.Journal) []*Verifier { + verifiers := make([]*Verifier, n) + for i := 0; i < n; i++ { + id := fmt.Sprintf("v%03d", i) + verifiers[i] = NewVerifier(id, db, metrics, j) + go verifiers[i].Run(ctx) + } + return verifiers +} diff --git a/test/longhaul/workload/verifier_test.go b/test/longhaul/workload/verifier_test.go new file mode 100644 index 000000000..606f85892 --- /dev/null +++ b/test/longhaul/workload/verifier_test.go @@ -0,0 +1,48 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package workload + +import ( + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + + "github.com/documentdb/documentdb-operator/test/longhaul/journal" +) + +var _ = Describe("Verifier", func() { + It("constructor wires id, metrics, and journal correctly", func() { + m := NewMetrics() + j := journal.New() + v := &Verifier{id: "v007", metrics: m, journal: j, nextSeq: make(map[string]int64)} + Expect(v.id).To(Equal("v007")) + Expect(v.metrics).To(BeIdenticalTo(m)) + Expect(v.journal).To(BeIdenticalTo(j)) + Expect(v.nextSeq).To(BeEmpty()) + }) + + It("nextSeq is the per-writer resume point that bounds per-cycle scan cost", func() { + // verifyWriter sets nextSeq[writerID] to the seq AFTER the last seen doc, + // so on the next cycle the scan filter is "seq >= nextSeq". This is what + // keeps the per-cycle scan cost bounded over a multi-day run. + v := &Verifier{nextSeq: make(map[string]int64)} + + got, ok := v.nextSeq["w1"] + Expect(ok).To(BeFalse()) + Expect(got).To(BeZero()) + + v.nextSeq["w1"] = 4 + Expect(v.nextSeq["w1"]).To(Equal(int64(4))) + + v.nextSeq["w2"] = 10 + Expect(v.nextSeq["w1"]).To(Equal(int64(4))) + Expect(v.nextSeq["w2"]).To(Equal(int64(10))) + }) + + It("verifyAll requires a *mongo.Database (covered by integration runs)", func() { + // We can't unit-test verifyAll's mongo path without a server, but the + // constructor wiring above + the table-driven gap-detection logic is + // what the verifier actually does. Document the boundary. + Skip("verifyAll requires a *mongo.Database; covered by long-haul integration runs") + }) +}) diff --git a/test/longhaul/workload/writer.go b/test/longhaul/workload/writer.go new file mode 100644 index 000000000..b5fe68e52 --- /dev/null +++ b/test/longhaul/workload/writer.go @@ -0,0 +1,145 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+ +package workload + +import ( + "context" + "crypto/sha256" + "encoding/hex" + "fmt" + "sync/atomic" + "time" + + "github.com/documentdb/documentdb-operator/test/longhaul/journal" + + "go.mongodb.org/mongo-driver/v2/bson" + "go.mongodb.org/mongo-driver/v2/mongo" + "go.mongodb.org/mongo-driver/v2/mongo/options" + "go.mongodb.org/mongo-driver/v2/mongo/writeconcern" +) + +const ( + // CollectionName is the MongoDB collection used by the workload. + CollectionName = "longhaul_writes" + + // writeInterval is the time between sequential writes per writer. + writeInterval = 100 * time.Millisecond +) + +// WriteDocument is the schema for data-plane durability tracking. +type WriteDocument struct { + WriterID string `bson:"writer_id"` + Seq int64 `bson:"seq"` + Payload string `bson:"payload"` + Checksum string `bson:"checksum"` + Timestamp time.Time `bson:"timestamp"` +} + +// Writer performs sequential inserts to a MongoDB collection. +// Each writer has a unique ID and tracks its own sequence number. +type Writer struct { + id string + seq atomic.Int64 + metrics *Metrics + journal *journal.Journal + collection *mongo.Collection +} + +// NewWriter creates a writer with the given ID connected to the specified database. +func NewWriter(id string, db *mongo.Database, metrics *Metrics, j *journal.Journal) *Writer { + coll := db.Collection(CollectionName, options.Collection(). + SetWriteConcern(writeconcern.Majority())) + return &Writer{ + id: id, + metrics: metrics, + journal: j, + collection: coll, + } +} + +// Run starts the writer loop. It blocks until the context is cancelled. 
+func (w *Writer) Run(ctx context.Context) {
+	w.journal.Info("writer", fmt.Sprintf("writer %s started", w.id))
+	defer w.journal.Info("writer", fmt.Sprintf("writer %s stopped", w.id))
+
+	// One write attempt per tick; the loop only ends when ctx is cancelled.
+	ticker := time.NewTicker(writeInterval)
+	defer ticker.Stop()
+
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-ticker.C:
+			w.writeOne(ctx)
+		}
+	}
+}
+
+// writeOne attempts a single insert for the next sequence number and updates
+// the shared metrics. The sequence number is only consumed when the write is
+// known to be durable (plain success or a duplicate-key ACK). Any other error
+// gives the number back so the next tick retries the same seq; otherwise each
+// failed insert would leave a permanent hole in this writer's sequence, and
+// the verifier counts such holes as gaps (data loss) even though the write was
+// never acknowledged.
+func (w *Writer) writeOne(ctx context.Context) {
+	seq := w.seq.Add(1)
+	payload := fmt.Sprintf("writer=%s seq=%d t=%d", w.id, seq, time.Now().UnixNano())
+	checksum := computeChecksum(w.id, seq, payload)
+
+	doc := WriteDocument{
+		WriterID:  w.id,
+		Seq:       seq,
+		Payload:   payload,
+		Checksum:  checksum,
+		Timestamp: time.Now(),
+	}
+
+	w.metrics.WriteAttempted.Add(1)
+
+	_, err := w.collection.InsertOne(ctx, doc)
+	if err != nil {
+		// Retryable writes are on by default in the v2 driver, so a network
+		// blip during a disruption window can produce this sequence:
+		// 1. driver sends InsertOne, server commits, ACK is dropped
+		// 2. driver auto-retries the same _id, server returns code 11000
+		// 3. InsertOne returns a duplicate-key error to us
+		// The data is durably committed in case (3), so counting it as a write
+		// failure (and feeding the policy/AllowedWriteFailures gate) would turn
+		// successful writes into spurious FAIL verdicts. Treat dup-key as a
+		// successful, idempotent ACK instead.
+		if mongo.IsDuplicateKeyError(err) {
+			w.metrics.WriteAcknowledged.Add(1)
+			return
+		}
+		// Not acknowledged: release the sequence number so it is retried
+		// instead of being skipped. If the insert did commit server-side, the
+		// retry comes back as a duplicate key and is acknowledged above.
+		// NOTE(review): assumes seq is only advanced from this writer's own
+		// Run goroutine, so Add(-1) cannot interleave with another Add(1).
+		w.seq.Add(-1)
+		w.metrics.WriteFailed.Add(1)
+		w.journal.RecordWriteFailure()
+		return
+	}
+	w.metrics.WriteAcknowledged.Add(1)
+}
+
+// computeChecksum creates a deterministic hash of the write for verification:
+// the first 8 bytes of SHA-256 over "writerID:seq:payload", hex-encoded.
+func computeChecksum(writerID string, seq int64, payload string) string {
+	data := fmt.Sprintf("%s:%d:%s", writerID, seq, payload)
+	hash := sha256.Sum256([]byte(data))
+	return hex.EncodeToString(hash[:8])
+}
+
+// StartWriters launches n writers, each in its own goroutine, and returns them.
+func StartWriters(ctx context.Context, n int, db *mongo.Database, metrics *Metrics, j *journal.Journal) []*Writer { + writers := make([]*Writer, n) + for i := 0; i < n; i++ { + id := fmt.Sprintf("w%03d", i) + writers[i] = NewWriter(id, db, metrics, j) + go writers[i].Run(ctx) + } + return writers +} + +// EnsureIndexes creates the necessary indexes on the workload collection. +func EnsureIndexes(ctx context.Context, db *mongo.Database) error { + coll := db.Collection(CollectionName) + _, err := coll.Indexes().CreateOne(ctx, mongo.IndexModel{ + Keys: bson.D{ + {Key: "writer_id", Value: 1}, + {Key: "seq", Value: 1}, + }, + Options: options.Index().SetUnique(true), + }) + return err +} diff --git a/test/longhaul/workload/writer_test.go b/test/longhaul/workload/writer_test.go new file mode 100644 index 000000000..6641dba4f --- /dev/null +++ b/test/longhaul/workload/writer_test.go @@ -0,0 +1,65 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package workload + +import ( + "strings" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + + "github.com/documentdb/documentdb-operator/test/longhaul/journal" +) + +var _ = Describe("computeChecksum", func() { + It("is deterministic for the same inputs", func() { + a := computeChecksum("w001", 42, "payload-x") + b := computeChecksum("w001", 42, "payload-x") + Expect(a).To(Equal(b)) + }) + + DescribeTable("differs when any input changes", + func(name string, modified string) { + base := computeChecksum("w001", 42, "payload") + Expect(modified).NotTo(Equal(base), "field %q should change checksum", name) + }, + Entry("writerID changed", "writerID", computeChecksum("w002", 42, "payload")), + Entry("seq changed", "seq", computeChecksum("w001", 43, "payload")), + Entry("payload changed", "payload", computeChecksum("w001", 42, "payload-x")), + ) + + It("is 16 lowercase hex chars (SHA-256 truncated to 8 bytes)", func() { + got := computeChecksum("w001", 1, "x") + Expect(got).To(HaveLen(16)) + for _, r := range got { + Expect(strings.ContainsRune("0123456789abcdef", r)).To(BeTrue(), "non-hex char %q in %q", r, got) + } + }) +}) + +var _ = Describe("Writer", func() { + It("constructor preserves id, metrics, journal, and starts seq at 0", func() { + // The writer constructor is mostly composition; verify it doesn't panic + // when given a nil collection (mongo.Database can produce a Collection + // without I/O), and that the ID is preserved. We can't construct a real + // *mongo.Database without a connection, so we limit the assertion to + // what's safe to inspect: the metrics, journal, and id wiring. + m := NewMetrics() + j := journal.New() + w := &Writer{id: "w042", metrics: m, journal: j} + Expect(w.id).To(Equal("w042")) + Expect(w.metrics).To(BeIdenticalTo(m)) + Expect(w.journal).To(BeIdenticalTo(j)) + Expect(w.seq.Load()).To(BeZero()) + }) + + It("seq is monotonically increasing under repeated Add", func() { + // Even though writeOne hits the network, the seq.Add is the first thing + // it does. 
Verify the atomic counter advances correctly. + w := &Writer{id: "w001"} + for i := int64(1); i <= 100; i++ { + Expect(w.seq.Add(1)).To(Equal(i)) + } + }) +}) From 3e9ac70996f25e7a34b52c4c5739f41f790144fa Mon Sep 17 00:00:00 2001 From: Copilot <223556219+Copilot@users.noreply.github.com> Date: Wed, 17 Jun 2026 13:18:11 -0400 Subject: [PATCH 02/11] test(longhaul): address PR review (Copilot inline + local review) Copilot inline comments (16): - writer.go: fix StartWriters doc; switch to math/rand/v2 (auto-seeded) - main.go: header says Deployment (not Job); drop-collection gated on LONGHAUL_RESET_DATA - upgrade.go: wrap+return error from GetCurrentDocumentDBImageTag - checkpoint.go: bounded ctx on final emit; final PASS persists as PASS (not RUNNING) - alert.go: escape % / CR / LF in GH Actions annotation messages - deploy/setup.yaml: password is placeholder; plugin field uses spec.plugins.sidecarInjectorName - Dockerfile + README: build from repo root so go.mod replace paths resolve; Go 1.25; update relationship table for shared module Local code-review-agent (3 critical): - C1: writer no longer pre-increments seq; advance only on success or DupKey ack, so non-DupKey failures retry the same seq instead of being mistaken for data loss - C2: main calls reporter.EmitFinal() synchronously before exit so the authoritative final verdict reaches the longhaul-report ConfigMap - C3: writers seed seq from FindOne(sort:-1) on startup so a Deployment-driven pod restart resumes past the prior tip without colliding with the unique (writer_id, seq) index Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Copilot <223556219+Copilot@users.noreply.github.com> --- test/longhaul/Dockerfile | 24 +++++++--- test/longhaul/README.md | 58 +++++++++++++++---------- test/longhaul/cmd/longhaul/main.go | 24 +++++++--- test/longhaul/config/config.go | 12 +++++ test/longhaul/deploy/setup.yaml | 13 +++++- test/longhaul/operations/scheduler.go | 4 +- 
test/longhaul/operations/upgrade.go | 5 ++- test/longhaul/report/alert.go | 24 +++++++--- test/longhaul/report/checkpoint.go | 28 +++++++++--- test/longhaul/report/checkpoint_test.go | 10 ++--- test/longhaul/workload/writer.go | 48 +++++++++++++++++--- 11 files changed, 189 insertions(+), 61 deletions(-) diff --git a/test/longhaul/Dockerfile b/test/longhaul/Dockerfile index f57ea270c..bba11a3ff 100644 --- a/test/longhaul/Dockerfile +++ b/test/longhaul/Dockerfile @@ -2,7 +2,14 @@ # Licensed under the MIT License. # Build stage -FROM golang:1.26-alpine AS builder +# +# IMPORTANT: This Dockerfile must be built with the REPOSITORY ROOT as the +# build context, not test/longhaul/, because test/longhaul/go.mod has +# `replace` directives pointing to ../shared and ../../operator/src. +# +# docker build -f test/longhaul/Dockerfile -t /longhaul-test:latest . +# +FROM golang:1.25-alpine AS builder # Note: the golang:*-alpine image already ships ca-certificates, and all # deps in go.mod resolve via proxy.golang.org (no VCS fetches), so neither @@ -10,12 +17,17 @@ FROM golang:1.26-alpine AS builder WORKDIR /src -# Cache module downloads. -COPY go.mod go.sum ./ -RUN go mod download +# Copy the three modules referenced by test/longhaul/go.mod's replace directives. +# operator/src is the main operator module; test/shared is the shared helpers +# module consumed via `test/shared/{documentdb,mongo}`. +COPY operator/src ./operator/src +COPY test/shared ./test/shared +COPY test/longhaul ./test/longhaul + +WORKDIR /src/test/longhaul -# Copy source. -COPY . . +# Cache module downloads (after sources are in place so replace paths resolve). +RUN go mod download # Build the standalone binary. 
RUN CGO_ENABLED=0 GOOS=linux go build -ldflags="-s -w" -o /longhaul ./cmd/longhaul diff --git a/test/longhaul/README.md b/test/longhaul/README.md index f5404c7d0..3dc8fa4d3 100644 --- a/test/longhaul/README.md +++ b/test/longhaul/README.md @@ -74,10 +74,12 @@ onto the cluster using a long-lived ServiceAccount-token kubeconfig stored in th **Manual path (one-off / local cluster):** ```bash -cd test/longhaul +# Build from the REPOSITORY ROOT (not test/longhaul/) so the replace +# paths in test/longhaul/go.mod (../shared and ../../operator/src) resolve. +cd # 1. Build and push the container image (or use the GHCR image from CI). -docker build -t /longhaul-test:latest -f Dockerfile . +docker build -t /longhaul-test:latest -f test/longhaul/Dockerfile . docker push /longhaul-test:latest # 2. Create the MongoDB credentials secret @@ -88,12 +90,13 @@ kubectl create secret generic longhaul-mongo-credentials \ # 3. Deploy RBAC and Deployment. deployment.yaml has placeholders # __OWNER__ and __IMAGE_TAG__ that are normally substituted by the # deploy workflow; for a manual apply, sed them yourself or edit -# the file in place. -kubectl apply -f deploy/setup.yaml -kubectl apply -f deploy/rbac.yaml +# the file in place. setup.yaml also has __LONGHAUL_PASSWORD__. +sed -i "s/__LONGHAUL_PASSWORD__/$(openssl rand -base64 24)/" test/longhaul/deploy/setup.yaml +kubectl apply -f test/longhaul/deploy/setup.yaml +kubectl apply -f test/longhaul/deploy/rbac.yaml sed -e 's|__OWNER__||g' \ -e 's|__IMAGE_TAG__|latest|g' \ - deploy/deployment.yaml | kubectl apply -f - + test/longhaul/deploy/deployment.yaml | kubectl apply -f - # 4. Monitor progress kubectl logs -f deployment/longhaul-test -n documentdb-test-ns @@ -130,11 +133,15 @@ All configuration is via environment variables. | `LONGHAUL_MIN_INSTANCES` | No | `1` | Minimum `spec.instancesPerNode` for scale-down operations (CRD lower bound: 1). 
| | `LONGHAUL_MAX_INSTANCES` | No | `3` | Maximum `spec.instancesPerNode` for scale-up operations (CRD upper bound: 3). | | `LONGHAUL_REPORT_INTERVAL` | No | `1h` | How often to write checkpoint reports to ConfigMap. | +| `LONGHAUL_RESET_DATA` | No | `false` | If `true`, drop the workload collection on startup. Off by default so a Deployment pod restart preserves durability history. | ## CI Safety -The long haul test binary is deployed as a Kubernetes Job on a dedicated AKS cluster. -It does **not** run in any PR-gated CI workflow. +The long haul test binary is deployed as a Kubernetes Deployment on a dedicated AKS +cluster. It does **not** run in any PR-gated CI workflow. Because a Deployment +auto-restarts crashed pods, the source of truth for "did the test pass?" is the +`longhaul-report` ConfigMap and the GitHub Actions annotations, not the pod +status. The config unit tests (`test/longhaul/config/`) run unconditionally and are included in normal CI test runs — they are fast (~0.002s) and require no cluster. 
@@ -152,22 +159,27 @@ DocumentDB cluster) but answer different questions: | Asserts | One behavior per spec, then exits | Continuous invariants over time | | Failure mode | `t.Fail` per spec | Journal entry + alert + auto-restart | | Cluster | Created + torn down per run | Long-lived dedicated AKS cluster | -| Operator API | Typed (`previewv1.DocumentDB` via controller-runtime) | Dynamic client (no operator import) | +| Operator API | Typed (`previewv1.DocumentDB` via controller-runtime) | Typed (`previewv1.DocumentDB` via controller-runtime + `test/shared/documentdb` helpers) | -### Code that could be shared in the future +### Code that is shared today -The e2e suite has helpers in `test/e2e/pkg/e2eutils/` that this harness will likely consume -once it grows beyond the current scope: +The harness consumes the `test/shared/` module (extracted in PR #401): -- `e2eutils/mongo` — `BuildURI` (URL-escapes username/password), TLS-from-CA-bundle, `Handle` - with port-forward + secret-backed credentials. The long haul driver currently takes a raw - `LONGHAUL_MONGO_URI` string; when it moves to per-secret credentials or in-cluster TLS, - these helpers become directly applicable. -- `e2eutils/operatorhealth` — pod-ready / CRD-ready gating used during e2e setup. The - monitor's `isPodReady` could delegate to this when the modules are unified. -- `e2eutils/clusterprobe` — CRD presence checks. +- `test/shared/documentdb` — typed `DocumentDB` CR helpers (`Get`, `IsHealthy`, + `PatchInstances`, `PatchSpec`). The monitor's `K8sClusterClient` uses these + as the single source of truth for the readiness predicate so longhaul and + e2e can't drift on what "healthy" means. +- `test/shared/mongo` — `NewFromURI` for the data-plane connection. 
+ +### Future opportunities -A shared `test/shared/` module is **deliberately not introduced yet**: the modules' Go and -dependency versions differ today, and the only currently-duplicated surface (raw mongo -connect + ping) is too small to justify the third-module overhead. Revisit this when the -long haul driver adopts the same connection model as e2e. +The e2e suite has additional helpers in `test/e2e/pkg/e2eutils/` that this +harness will likely consume as it grows: + +- `e2eutils/mongo` — `BuildURI` (URL-escapes username/password), TLS-from-CA-bundle, + `Handle` with port-forward + secret-backed credentials. The long haul driver + currently takes a raw `LONGHAUL_MONGO_URI` string; when it moves to per-secret + credentials or in-cluster TLS, these helpers become directly applicable. +- `e2eutils/operatorhealth` — pod-ready / CRD-ready gating used during e2e setup. + The monitor's `isPodReady` could delegate to this. +- `e2eutils/clusterprobe` — CRD presence checks. diff --git a/test/longhaul/cmd/longhaul/main.go b/test/longhaul/cmd/longhaul/main.go index 8902e2b86..71fc96bfb 100644 --- a/test/longhaul/cmd/longhaul/main.go +++ b/test/longhaul/cmd/longhaul/main.go @@ -2,7 +2,11 @@ // Licensed under the MIT License. // Package main provides a standalone binary entry point for running -// long haul tests as a Kubernetes Job (without Ginkgo test framework). +// long haul tests as a Kubernetes Deployment (without Ginkgo test framework). +// A Deployment is used (not a Job) so the kubelet auto-restarts the driver +// pod on crash; the canonical "did the test pass?" signal is the +// longhaul-report ConfigMap and the GitHub Actions annotations, not the pod +// exit status. package main import ( @@ -82,9 +86,15 @@ func run(cfg config.Config) int { db := mongoClient.Database("longhaul") - // Drop previous test data to avoid duplicate key conflicts. 
- if err := db.Collection(workload.CollectionName).Drop(ctx); err != nil { - log.Fatalf("failed to drop collection: %v", err) + // Optionally drop previous test data. Disabled by default so that pod + // restarts (Deployment auto-restart on crash) preserve durability + // history for post-mortem; opt in with LONGHAUL_RESET_DATA=true for + // local/dev iterations or fresh CI runs. + if cfg.ResetData { + if err := db.Collection(workload.CollectionName).Drop(ctx); err != nil { + log.Fatalf("failed to drop collection: %v", err) + } + log.Println("workload collection dropped (LONGHAUL_RESET_DATA=true)") } // Create indexes. @@ -146,10 +156,14 @@ func run(cfg config.Config) int { // Allow goroutines to flush. time.Sleep(500 * time.Millisecond) - // Generate final report. + // Generate final report. Persist to the report ConfigMap synchronously + // here (before os.Exit) so the authoritative verdict reaches the source + // of truth that operators consult — the Run() goroutine cannot do this + // reliably because os.Exit can kill it mid-Update. summary := buildSummary(metrics, leakDetector, scheduler, j) markdown := report.GenerateMarkdown(summary) fmt.Println("\n" + markdown) + reporter.EmitFinal() // Emit final GitHub Actions annotation. report.EmitAnnotation(summary) diff --git a/test/longhaul/config/config.go b/test/longhaul/config/config.go index b07338bea..085986c42 100644 --- a/test/longhaul/config/config.go +++ b/test/longhaul/config/config.go @@ -32,6 +32,9 @@ const ( // Observability and reporting. EnvReportInterval = "LONGHAUL_REPORT_INTERVAL" + + // Operational toggles. + EnvResetData = "LONGHAUL_RESET_DATA" ) // Config holds all configuration for a long haul test run. @@ -73,6 +76,11 @@ type Config struct { // ReportInterval is how often checkpoint reports are generated. ReportInterval time.Duration + + // ResetData controls whether the workload collection is dropped on startup. 
+ // Default false so that pod restarts preserve durability history; opt in + // for fresh local/dev iterations. + ResetData bool } // DefaultConfig returns a Config with safe defaults for local development. @@ -182,6 +190,10 @@ func LoadFromEnv() (Config, error) { cfg.ReportInterval = d } + if v := strings.TrimSpace(strings.ToLower(os.Getenv(EnvResetData))); v != "" { + cfg.ResetData = v == "true" || v == "1" || v == "yes" + } + return cfg, nil } diff --git a/test/longhaul/deploy/setup.yaml b/test/longhaul/deploy/setup.yaml index 2c3580af2..6f3ac588b 100644 --- a/test/longhaul/deploy/setup.yaml +++ b/test/longhaul/deploy/setup.yaml @@ -34,6 +34,14 @@ metadata: --- # Credentials for the DocumentDB cluster. # The operator creates the MongoDB user with these credentials. +# +# NOTE: This is a placeholder. Before applying, substitute a real password +# (e.g. `sed -i "s|__LONGHAUL_PASSWORD__|$(openssl rand -base64 24)|" deploy/setup.yaml`) +# or generate the secret out-of-band: +# kubectl create secret generic documentdb-credentials -n documentdb-test-ns \ +# --type=kubernetes.io/basic-auth \ +# --from-literal=username=longhaul-user \ +# --from-literal=password="$(openssl rand -base64 24)" apiVersion: v1 kind: Secret metadata: @@ -42,7 +50,7 @@ metadata: type: kubernetes.io/basic-auth stringData: username: longhaul-user - password: LongHaul-T3st-Passw0rd! + password: __LONGHAUL_PASSWORD__ --- # DocumentDB cluster for long-haul testing.
# @@ -75,4 +83,5 @@ spec: pvcSize: 10Gi exposeViaService: serviceType: LoadBalancer - sidecarInjectorPluginName: cnpg-i-sidecar-injector.documentdb.io + plugins: + sidecarInjectorName: cnpg-i-sidecar-injector.documentdb.io diff --git a/test/longhaul/operations/scheduler.go b/test/longhaul/operations/scheduler.go index e1e824ea2..2567f36e5 100644 --- a/test/longhaul/operations/scheduler.go +++ b/test/longhaul/operations/scheduler.go @@ -8,7 +8,7 @@ package operations import ( "context" "fmt" - "math/rand" + "math/rand/v2" "sync" "time" @@ -143,7 +143,7 @@ func (s *Scheduler) selectOperation(ctx context.Context) Operation { } // Weighted random selection. - r := rand.Intn(totalWeight) + r := rand.IntN(totalWeight) for _, c := range candidates { r -= c.weight if r < 0 { diff --git a/test/longhaul/operations/upgrade.go b/test/longhaul/operations/upgrade.go index 8eba66e0d..020413991 100644 --- a/test/longhaul/operations/upgrade.go +++ b/test/longhaul/operations/upgrade.go @@ -112,7 +112,10 @@ func (u *UpgradeDocumentDB) Execute(ctx context.Context) error { return fmt.Errorf("desired version is empty") } - running, _ := u.client.GetCurrentDocumentDBImageTag(ctx) + running, err := u.client.GetCurrentDocumentDBImageTag(ctx) + if err != nil { + return fmt.Errorf("read current image tag: %w", err) + } if err := u.client.UpgradeDocumentDB(ctx, desired); err != nil { return fmt.Errorf("patch CR: %w", err) } diff --git a/test/longhaul/report/alert.go b/test/longhaul/report/alert.go index c1fcb37a1..73aa6f006 100644 --- a/test/longhaul/report/alert.go +++ b/test/longhaul/report/alert.go @@ -6,6 +6,7 @@ package report import ( "fmt" "os" + "strings" "time" ) @@ -14,6 +15,17 @@ func isGitHubActions() bool { return os.Getenv("GITHUB_ACTIONS") == "true" } +// escapeAnnotation escapes characters that are special in GitHub Actions +// workflow commands. Per the runner spec, %, CR and LF must be percent-escaped +// in the message body or they corrupt the workflow command stream. 
+// See: https://docs.github.com/actions/using-workflows/workflow-commands-for-github-actions +func escapeAnnotation(s string) string { + s = strings.ReplaceAll(s, "%", "%25") + s = strings.ReplaceAll(s, "\r", "%0D") + s = strings.ReplaceAll(s, "\n", "%0A") + return s +} + // EmitAnnotation emits GitHub Actions workflow annotations based on test status. // These annotations appear in the Actions UI on the workflow run summary. func EmitAnnotation(s Summary) { @@ -28,17 +40,19 @@ func EmitAnnotation(s Summary) { msg = fmt.Sprintf("Long haul test FAILED: %s", s.FailReason) } // ::error:: annotations show as red in the Actions UI. - fmt.Printf("::error title=Long Haul Test Failure::%s\n", msg) + fmt.Printf("::error title=Long Haul Test Failure::%s\n", escapeAnnotation(msg)) case ResultPass: // For intermediate checkpoints, emit a notice. - fmt.Printf("::notice title=Long Haul Checkpoint::PASS after %s — %d writes, %d ops, %d gaps\n", - s.Duration.Round(time.Second), s.Metrics.WriteAttempted, s.OpsExecuted, s.Metrics.GapsDetected) + fmt.Printf("::notice title=Long Haul Checkpoint::%s\n", + escapeAnnotation(fmt.Sprintf("PASS after %s — %d writes, %d ops, %d gaps", + s.Duration.Round(time.Second), s.Metrics.WriteAttempted, s.OpsExecuted, s.Metrics.GapsDetected))) } // Emit warning for memory leak regardless of result. 
if s.LeakAnalysis.HasLeak { - fmt.Printf("::warning title=Memory Leak Suspected::%.2f MB/hour over %s (%d samples)\n", - s.LeakAnalysis.MemorySlopeMB, s.LeakAnalysis.Duration.Round(time.Second), s.LeakAnalysis.SampleCount) + fmt.Printf("::warning title=Memory Leak Suspected::%s\n", + escapeAnnotation(fmt.Sprintf("%.2f MB/hour over %s (%d samples)", + s.LeakAnalysis.MemorySlopeMB, s.LeakAnalysis.Duration.Round(time.Second), s.LeakAnalysis.SampleCount))) } } diff --git a/test/longhaul/report/checkpoint.go b/test/longhaul/report/checkpoint.go index 363f1a87d..2c698ae42 100644 --- a/test/longhaul/report/checkpoint.go +++ b/test/longhaul/report/checkpoint.go @@ -43,6 +43,9 @@ func NewCheckpointReporter(clientset kubernetes.Interface, namespace string, int } // Run starts the periodic reporting loop. Blocks until context is cancelled. +// On shutdown, callers should invoke EmitFinal() synchronously — Run no longer +// emits its own final report because the goroutine can be killed by os.Exit +// before the K8s Update returns. func (r *CheckpointReporter) Run(ctx context.Context) { log.Printf("[checkpoint] periodic reporter started (interval=%s)", r.interval) defer log.Println("[checkpoint] periodic reporter stopped") @@ -53,21 +56,34 @@ func (r *CheckpointReporter) Run(ctx context.Context) { for { select { case <-ctx.Done(): - // Final report on exit. - r.emit(context.Background()) return case <-ticker.C: - r.emit(ctx) + r.emit(ctx, false) } } } -func (r *CheckpointReporter) emit(ctx context.Context) { +// EmitFinal writes a terminal summary (PASS or FAIL is persisted as itself, +// not as RUNNING) using a bounded context. Safe to call after the main +// context has been cancelled. Intended to be called synchronously from main +// just before exit so the verdict is durable in the ConfigMap. 
+func (r *CheckpointReporter) EmitFinal() { + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + r.emit(ctx, true) +} + +// emit writes the current summary to stdout, GH Actions annotations, and the +// status ConfigMap. final=true means this is the shutdown emit, in which case +// PASS is persisted as "PASS" (not "RUNNING") so consumers can distinguish a +// finished clean run from an in-flight checkpoint. +func (r *CheckpointReporter) emit(ctx context.Context, final bool) { summary := r.summaryFunc() - // Mark as RUNNING for intermediate checkpoints (unless already FAIL). + // Intermediate PASS checkpoints surface as RUNNING; the final emit + // preserves the true PASS/FAIL outcome. resultStr := string(summary.Result) - if summary.Result == ResultPass { + if summary.Result == ResultPass && !final { resultStr = "RUNNING" } diff --git a/test/longhaul/report/checkpoint_test.go b/test/longhaul/report/checkpoint_test.go index 29932173e..891e90b4d 100644 --- a/test/longhaul/report/checkpoint_test.go +++ b/test/longhaul/report/checkpoint_test.go @@ -19,7 +19,7 @@ var _ = Describe("CheckpointReporter", func() { r := NewCheckpointReporter(nil, "ns", time.Second, func() Summary { return Summary{Result: ResultPass, Duration: time.Minute} }) - Expect(func() { r.emit(context.Background()) }).NotTo(Panic()) + Expect(func() { r.emit(context.Background(), false) }).NotTo(Panic()) }) It("creates the ConfigMap on first emit and labels it identifiably", func() { @@ -28,7 +28,7 @@ var _ = Describe("CheckpointReporter", func() { return Summary{Result: ResultPass, Duration: 2 * time.Hour, OpsExecuted: 5} }) - r.emit(context.Background()) + r.emit(context.Background(), false) cm, err := cs.CoreV1().ConfigMaps("ns").Get(context.Background(), ConfigMapName, metav1.GetOptions{}) Expect(err).NotTo(HaveOccurred()) @@ -47,7 +47,7 @@ var _ = Describe("CheckpointReporter", func() { return Summary{Result: ResultFail, FailReason: "data loss"} }) - 
r.emit(context.Background()) + r.emit(context.Background(), false) cm, err := cs.CoreV1().ConfigMaps("ns").Get(context.Background(), ConfigMapName, metav1.GetOptions{}) Expect(err).NotTo(HaveOccurred()) @@ -63,14 +63,14 @@ var _ = Describe("CheckpointReporter", func() { return Summary{Result: ResultPass, Duration: time.Duration(calls) * time.Hour, OpsExecuted: calls * 10} }) - r.emit(context.Background()) + r.emit(context.Background(), false) cm1, err := cs.CoreV1().ConfigMaps("ns").Get(context.Background(), ConfigMapName, metav1.GetOptions{}) Expect(err).NotTo(HaveOccurred()) report1 := cm1.Data["latest-report"] // Fake clientset doesn't bump ResourceVersion automatically, so assert // on content change instead. - r.emit(context.Background()) + r.emit(context.Background(), false) cm2, err := cs.CoreV1().ConfigMaps("ns").Get(context.Background(), ConfigMapName, metav1.GetOptions{}) Expect(err).NotTo(HaveOccurred()) Expect(cm2.Data["latest-report"]).NotTo(Equal(report1)) diff --git a/test/longhaul/workload/writer.go b/test/longhaul/workload/writer.go index b5fe68e52..052b79f16 100644 --- a/test/longhaul/workload/writer.go +++ b/test/longhaul/workload/writer.go @@ -7,6 +7,7 @@ import ( "context" "crypto/sha256" "encoding/hex" + "errors" "fmt" "sync/atomic" "time" @@ -77,7 +78,11 @@ func (w *Writer) Run(ctx context.Context) { } func (w *Writer) writeOne(ctx context.Context) { - seq := w.seq.Add(1) + // Compute the next seq without advancing the counter yet — only commit on + // success. Each writer has exactly one goroutine (Run), so a plain + // Load/Store pair is race-free; atomic.Int64 is retained so external + // observers (verifier tests, future debug endpoints) can read it safely. + seq := w.seq.Load() + 1 payload := fmt.Sprintf("writer=%s seq=%d t=%d", w.id, seq, time.Now().UnixNano()) checksum := computeChecksum(w.id, seq, payload) @@ -98,21 +103,42 @@ func (w *Writer) writeOne(ctx context.Context) { // 1. 
driver sends InsertOne, server commits, ACK is dropped // 2. driver auto-retries the same _id, server returns code 11000 // 3. InsertOne returns a duplicate-key error to us - // The data is durably committed in case (3), so counting it as a write - // failure (and feeding the policy/AllowedWriteFailures gate) would turn - // successful writes into spurious FAIL verdicts. Treat dup-key as a - // successful, idempotent ACK instead. + // The data is durably committed in case (3), so we advance seq and + // count it as a successful, idempotent ACK. if mongo.IsDuplicateKeyError(err) { + w.seq.Store(seq) w.metrics.WriteAcknowledged.Add(1) return } + // For any other error the document was NOT committed. Do NOT advance + // seq, otherwise the verifier will see a permanent gap and report + // false-positive data loss. The next tick will retry the same seq. w.metrics.WriteFailed.Add(1) w.journal.RecordWriteFailure() return } + w.seq.Store(seq) w.metrics.WriteAcknowledged.Add(1) } +// Resume seeds the writer's seq counter from the highest seq already persisted +// for this writer_id. Called on startup so a Deployment-driven restart picks +// up where the previous pod left off instead of colliding with the existing +// unique index on (writer_id, seq). +func (w *Writer) Resume(ctx context.Context) (int64, error) { + opts := options.FindOne().SetSort(bson.D{{Key: "seq", Value: -1}}) + var doc WriteDocument + err := w.collection.FindOne(ctx, bson.M{"writer_id": w.id}, opts).Decode(&doc) + if err != nil { + if errors.Is(err, mongo.ErrNoDocuments) { + return 0, nil + } + return 0, err + } + w.seq.Store(doc.Seq) + return doc.Seq, nil +} + // computeChecksum creates a deterministic hash of the write for verification. 
func computeChecksum(writerID string, seq int64, payload string) string { data := fmt.Sprintf("%s:%d:%s", writerID, seq, payload) @@ -120,12 +146,22 @@ func computeChecksum(writerID string, seq int64, payload string) string { return hex.EncodeToString(hash[:8]) } -// StartWriters launches n writers and returns a cancel function to stop them. +// StartWriters launches n writer goroutines and returns the slice of writers. +// Each writer is seeded from the collection so a restart resumes its seq +// counter past the previous tip (preventing dup-key collisions on the unique +// (writer_id, seq) index). +// The writers run until the supplied context is cancelled; there is no separate +// stop signal. func StartWriters(ctx context.Context, n int, db *mongo.Database, metrics *Metrics, j *journal.Journal) []*Writer { writers := make([]*Writer, n) for i := 0; i < n; i++ { id := fmt.Sprintf("w%03d", i) writers[i] = NewWriter(id, db, metrics, j) + if seq, err := writers[i].Resume(ctx); err != nil { + j.Warn("writer", fmt.Sprintf("writer %s resume failed: %v (starting at 0)", id, err)) + } else if seq > 0 { + j.Info("writer", fmt.Sprintf("writer %s resumed at seq=%d", id, seq)) + } go writers[i].Run(ctx) } return writers From ed498dfbc1903964b698fa72dc5cc4f922eb7340 Mon Sep 17 00:00:00 2001 From: Copilot <223556219+Copilot@users.noreply.github.com> Date: Wed, 17 Jun 2026 18:17:14 -0400 Subject: [PATCH 03/11] test(longhaul): structural cleanup per xgerman review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - gofmt -w across the module (4 files were not gofmt-clean: scale.go, scheduler.go, health.go, checkpoint.go) — fixes the CI lint gate - collapse ScaleUp/ScaleDown into a shared scaleOp struct (parameterized by delta/bound/policy); thin ScaleUp / ScaleDown wrappers keep call sites and tests unchanged - expose K8sClusterClient.Clientset() and drop the duplicate REST-config + clientset construction in cmd/longhaul/main.go - extract 
K8sClusterClient.getCR() helper; the namespaced-name lookup no longer repeats in every method - report.go: b.WriteString(fmt.Sprintf(...)) -> fmt.Fprintf(&b, ...) - checkpoint.go: map[string]interface{} -> map[string]any; collapse double Printf wrappers - metrics.go: MetricsSnapshot doc comment now starts with the type name Skipped (with rationale in PR replies): - cross-package tick helper (xgerman marked optional; ~5 lines saved is not worth a new utility package) - removing DefaultOutagePolicy (still used by journal/operations tests as a reasonable-defaults stand-in) go build / go vet / go test ./... all clean; gofmt -l reports no remaining issues. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Copilot <223556219+Copilot@users.noreply.github.com> --- test/longhaul/cmd/longhaul/main.go | 52 ++------ test/longhaul/monitor/health.go | 8 +- test/longhaul/monitor/k8sclient.go | 19 ++- test/longhaul/operations/scale.go | 162 ++++++++++++------------- test/longhaul/operations/scale_test.go | 4 +- test/longhaul/operations/scheduler.go | 8 +- test/longhaul/report/checkpoint.go | 20 +-- test/longhaul/report/report.go | 34 +++--- test/longhaul/workload/metrics.go | 2 +- 9 files changed, 141 insertions(+), 168 deletions(-) diff --git a/test/longhaul/cmd/longhaul/main.go b/test/longhaul/cmd/longhaul/main.go index 71fc96bfb..dacd86212 100644 --- a/test/longhaul/cmd/longhaul/main.go +++ b/test/longhaul/cmd/longhaul/main.go @@ -24,9 +24,6 @@ import ( "github.com/documentdb/documentdb-operator/test/longhaul/workload" sharedmongo "github.com/documentdb/documentdb-operator/test/shared/mongo" - "k8s.io/client-go/kubernetes" - "k8s.io/client-go/rest" - "k8s.io/client-go/tools/clientcmd" ) func main() { @@ -104,11 +101,18 @@ func run(cfg config.Config) int { j.Info("main", "long haul test starting") - // Initialize real k8s cluster client. - clusterClient, k8sClientset, err := initK8sClient(cfg) + // Initialize real k8s cluster client. 
The clientset built inside + // K8sClusterClient is reused for ConfigMap operations (reporter) below + // instead of building a second one against the same REST config. + clusterClient, err := monitor.NewK8sClusterClient(monitor.K8sClientConfig{ + Namespace: cfg.Namespace, + ClusterName: cfg.ClusterName, + Kubeconfig: os.Getenv("KUBECONFIG"), + }) if err != nil { log.Fatalf("failed to initialize k8s client: %v", err) } + k8sClientset := clusterClient.Clientset() j.Info("main", "k8s client initialized") // Start health monitor. @@ -210,44 +214,6 @@ func buildSummary(metrics *workload.Metrics, leakDetector *monitor.LeakDetector, } } -// initK8sClient creates the real K8s cluster client and returns the clientset for ConfigMap access. -func initK8sClient(cfg config.Config) (*monitor.K8sClusterClient, kubernetes.Interface, error) { - k8sCfg := monitor.K8sClientConfig{ - Namespace: cfg.Namespace, - ClusterName: cfg.ClusterName, - Kubeconfig: os.Getenv("KUBECONFIG"), - } - - client, err := monitor.NewK8sClusterClient(k8sCfg) - if err != nil { - return nil, nil, err - } - - // Build a clientset for the reporter (ConfigMap operations). - restConfig, err := buildRESTConfig() - if err != nil { - return nil, nil, fmt.Errorf("failed to build REST config for clientset: %w", err) - } - clientset, err := kubernetes.NewForConfig(restConfig) - if err != nil { - return nil, nil, fmt.Errorf("failed to create clientset: %w", err) - } - - return client, clientset, nil -} - -func buildRESTConfig() (*rest.Config, error) { - cfg, err := rest.InClusterConfig() - if err == nil { - return cfg, nil - } - kubeconfig := os.Getenv("KUBECONFIG") - if kubeconfig == "" { - kubeconfig = clientcmd.RecommendedHomeFile - } - return clientcmd.BuildConfigFromFlags("", kubeconfig) -} - // runMetricsSampling periodically collects pod resource metrics and feeds the leak detector. 
func runMetricsSampling(ctx context.Context, client *monitor.K8sClusterClient, ld *monitor.LeakDetector, j *journal.Journal) { if !client.MetricsAvailable() { diff --git a/test/longhaul/monitor/health.go b/test/longhaul/monitor/health.go index bc84bf284..0c5e95163 100644 --- a/test/longhaul/monitor/health.go +++ b/test/longhaul/monitor/health.go @@ -57,10 +57,10 @@ type HealthMonitor struct { journal *journal.Journal steadyStateWait time.Duration - mu sync.RWMutex - lastHealth ClusterHealth - steadySince time.Time // time when cluster became healthy - healthySamples int + mu sync.RWMutex + lastHealth ClusterHealth + steadySince time.Time // time when cluster became healthy + healthySamples int } // NewHealthMonitor creates a monitor that polls the cluster for health status. diff --git a/test/longhaul/monitor/k8sclient.go b/test/longhaul/monitor/k8sclient.go index 1e87ff184..fb7358f6a 100644 --- a/test/longhaul/monitor/k8sclient.go +++ b/test/longhaul/monitor/k8sclient.go @@ -106,6 +106,17 @@ func tryMetricsClient(config *rest.Config) (metricsv.Interface, bool) { return mc, true } +// Clientset returns the underlying typed Kubernetes clientset. Exposed so the +// rest of the driver (e.g. the report ConfigMap writer) can reuse it instead +// of building a second clientset against the same REST config. +func (k *K8sClusterClient) Clientset() kubernetes.Interface { return k.clientset } + +// getCR fetches the target DocumentDB CR via the shared typed helper. Wraps +// the namespaced-name lookup that's otherwise repeated in every method below. +func (k *K8sClusterClient) getCR(ctx context.Context) (*previewv1.DocumentDB, error) { + return shareddb.Get(ctx, k.crClient, types.NamespacedName{Namespace: k.namespace, Name: k.clusterName}) +} + // GetClusterHealth queries pod status and CR status to determine cluster health. 
func (k *K8sClusterClient) GetClusterHealth(ctx context.Context) (ClusterHealth, error) { health := ClusterHealth{Timestamp: time.Now()} @@ -140,7 +151,7 @@ func (k *K8sClusterClient) GetClusterHealth(ctx context.Context) (ClusterHealth, // Get the DocumentDB CR status via the shared typed helper. Using // shareddb.IsHealthy keeps the readiness predicate consistent with // the e2e suite (single source of truth for ReadyStatus). - dd, err := shareddb.Get(ctx, k.crClient, types.NamespacedName{Namespace: k.namespace, Name: k.clusterName}) + dd, err := k.getCR(ctx) if err != nil { return health, fmt.Errorf("failed to get DocumentDB CR: %w", err) } @@ -157,7 +168,7 @@ func (k *K8sClusterClient) GetClusterHealth(ctx context.Context) (ClusterHealth, // previewv1.DocumentDB gives a zero-value of 0 for omitted ints, so we // preserve the original semantics explicitly here. func (k *K8sClusterClient) GetInstancesPerNode(ctx context.Context) (int, error) { - dd, err := shareddb.Get(ctx, k.crClient, types.NamespacedName{Namespace: k.namespace, Name: k.clusterName}) + dd, err := k.getCR(ctx) if err != nil { return 0, fmt.Errorf("failed to get DocumentDB CR: %w", err) } @@ -183,7 +194,7 @@ func (k *K8sClusterClient) ScaleCluster(ctx context.Context, instancesPerNode in // GetCurrentDocumentDBImageTag reads status.documentDBImage from the CR // and returns the tag portion (after the last colon). func (k *K8sClusterClient) GetCurrentDocumentDBImageTag(ctx context.Context) (string, error) { - dd, err := shareddb.Get(ctx, k.crClient, types.NamespacedName{Namespace: k.namespace, Name: k.clusterName}) + dd, err := k.getCR(ctx) if err != nil { return "", fmt.Errorf("failed to get DocumentDB CR: %w", err) } @@ -203,7 +214,7 @@ func (k *K8sClusterClient) GetCurrentDocumentDBImageTag(ctx context.Context) (st // and schemaVersion="auto" so the operator performs a rolling upgrade. // NOTE: the CRD field is documentDBVersion (capital DB), not documentDbVersion. 
func (k *K8sClusterClient) UpgradeDocumentDB(ctx context.Context, version string) error { - dd, err := shareddb.Get(ctx, k.crClient, types.NamespacedName{Namespace: k.namespace, Name: k.clusterName}) + dd, err := k.getCR(ctx) if err != nil { return fmt.Errorf("failed to get DocumentDB CR: %w", err) } diff --git a/test/longhaul/operations/scale.go b/test/longhaul/operations/scale.go index 2d907a935..e9fb471d8 100644 --- a/test/longhaul/operations/scale.go +++ b/test/longhaul/operations/scale.go @@ -12,124 +12,120 @@ import ( "github.com/documentdb/documentdb-operator/test/longhaul/monitor" ) -// ScaleUp increases spec.instancesPerNode by 1 (HA scale dimension; range 1-3). -type ScaleUp struct { - client monitor.ClusterClient - healthMon *monitor.HealthMonitor - maxInstances int - recovery time.Duration +// scaleOp parameterizes the scale-up / scale-down operations. +// ScaleUp and ScaleDown are ~95% identical (same fields, Precondition / Execute +// differ only by delta sign, the bound comparison, and policy constants), so +// they share one implementation via this struct. NewScaleUp / NewScaleDown +// remain the public surface. +type scaleOp struct { + client monitor.ClusterClient + healthMon *monitor.HealthMonitor + name string + weight int + delta int // +1 for scale-up, -1 for scale-down + bound int // upper bound for up; lower bound for down + boundKind string // "max" or "min" — only used in human-readable reasons + recovery time.Duration + policy journal.OutagePolicy } -// NewScaleUp creates a ScaleUp operation. maxInstances is clamped to the -// CRD upper bound (3) to avoid admission rejections. 
-func NewScaleUp(client monitor.ClusterClient, health *monitor.HealthMonitor, maxInstances int, recovery time.Duration) *ScaleUp { - if maxInstances > 3 { - maxInstances = 3 - } - return &ScaleUp{ - client: client, - healthMon: health, - maxInstances: maxInstances, - recovery: recovery, - } -} - -func (s *ScaleUp) Name() string { return "scale-up" } -func (s *ScaleUp) Weight() int { return 3 } +func (s *scaleOp) Name() string { return s.name } +func (s *scaleOp) Weight() int { return s.weight } -func (s *ScaleUp) Precondition(ctx context.Context) (bool, string) { +func (s *scaleOp) Precondition(ctx context.Context) (bool, string) { current, err := s.client.GetInstancesPerNode(ctx) if err != nil { return false, fmt.Sprintf("cannot get instancesPerNode: %v", err) } - if current >= s.maxInstances { - return false, fmt.Sprintf("already at max instancesPerNode (%d)", s.maxInstances) + if s.atBound(current) { + return false, fmt.Sprintf("already at %s instancesPerNode (%d)", s.boundKind, s.bound) } return true, "" } -func (s *ScaleUp) Execute(ctx context.Context) error { +func (s *scaleOp) Execute(ctx context.Context) error { current, err := s.client.GetInstancesPerNode(ctx) if err != nil { return fmt.Errorf("get instancesPerNode: %w", err) } - target := current + 1 + target := current + s.delta if err := s.client.ScaleCluster(ctx, target); err != nil { return fmt.Errorf("scale to %d: %w", target, err) } - // Wait for recovery (new pod becomes ready). + // Wait for recovery (new pod ready / cluster stabilizes at new size). 
recoveryCtx, cancel := context.WithTimeout(ctx, s.recovery) defer cancel() return s.healthMon.WaitForSteadyState(recoveryCtx) } -func (s *ScaleUp) OutagePolicy() journal.OutagePolicy { - return journal.OutagePolicy{ - AllowedDowntime: 30 * time.Second, - AllowedWriteFailures: 20, - MustRecoverWithin: s.recovery, +func (s *scaleOp) OutagePolicy() journal.OutagePolicy { return s.policy } + +// atBound reports whether the current size already equals the operation's +// target bound (max for up, min for down). +func (s *scaleOp) atBound(current int) bool { + if s.delta > 0 { + return current >= s.bound } + return current <= s.bound } -// ScaleDown decreases spec.instancesPerNode by 1 (HA scale dimension; range 1-3). -type ScaleDown struct { - client monitor.ClusterClient - healthMon *monitor.HealthMonitor - minInstances int - recovery time.Duration +// ScaleUp is a scale-up operation. Exported as a concrete type so the existing +// callers keep their (*ScaleUp) return type — internally it's a thin wrapper. +type ScaleUp struct{ scaleOp } + +// NewScaleUp creates a ScaleUp operation. maxInstances is clamped to the +// CRD upper bound (3) to avoid admission rejections. +func NewScaleUp(client monitor.ClusterClient, health *monitor.HealthMonitor, maxInstances int, recovery time.Duration) *ScaleUp { + if maxInstances > 3 { + maxInstances = 3 + } + return &ScaleUp{scaleOp{ + client: client, + healthMon: health, + name: "scale-up", + weight: 3, + delta: +1, + bound: maxInstances, + boundKind: "max", + recovery: recovery, + policy: journal.OutagePolicy{ + AllowedDowntime: 30 * time.Second, + AllowedWriteFailures: 20, + MustRecoverWithin: recovery, + }, + }} } +// maxInstances exposes the upper bound for tests that previously read it directly. +func (s *ScaleUp) maxInstances() int { return s.bound } + +// ScaleDown is a scale-down operation. See ScaleUp comment. +type ScaleDown struct{ scaleOp } + // NewScaleDown creates a ScaleDown operation. 
minInstances is clamped to the // CRD lower bound (1) to avoid admission rejections. func NewScaleDown(client monitor.ClusterClient, health *monitor.HealthMonitor, minInstances int, recovery time.Duration) *ScaleDown { if minInstances < 1 { minInstances = 1 } - return &ScaleDown{ - client: client, - healthMon: health, - minInstances: minInstances, - recovery: recovery, - } + return &ScaleDown{scaleOp{ + client: client, + healthMon: health, + name: "scale-down", + weight: 2, + delta: -1, + bound: minInstances, + boundKind: "min", + recovery: recovery, + policy: journal.OutagePolicy{ + AllowedDowntime: 60 * time.Second, + AllowedWriteFailures: 50, + MustRecoverWithin: recovery, + }, + }} } -func (s *ScaleDown) Name() string { return "scale-down" } -func (s *ScaleDown) Weight() int { return 2 } - -func (s *ScaleDown) Precondition(ctx context.Context) (bool, string) { - current, err := s.client.GetInstancesPerNode(ctx) - if err != nil { - return false, fmt.Sprintf("cannot get instancesPerNode: %v", err) - } - if current <= s.minInstances { - return false, fmt.Sprintf("already at min instancesPerNode (%d)", s.minInstances) - } - return true, "" -} - -func (s *ScaleDown) Execute(ctx context.Context) error { - current, err := s.client.GetInstancesPerNode(ctx) - if err != nil { - return fmt.Errorf("get instancesPerNode: %w", err) - } - - target := current - 1 - if err := s.client.ScaleCluster(ctx, target); err != nil { - return fmt.Errorf("scale to %d: %w", target, err) - } - - // Wait for recovery (cluster stabilizes at new size). - recoveryCtx, cancel := context.WithTimeout(ctx, s.recovery) - defer cancel() - return s.healthMon.WaitForSteadyState(recoveryCtx) -} - -func (s *ScaleDown) OutagePolicy() journal.OutagePolicy { - return journal.OutagePolicy{ - AllowedDowntime: 60 * time.Second, - AllowedWriteFailures: 50, - MustRecoverWithin: s.recovery, - } -} +// minInstances exposes the lower bound for tests that previously read it directly. 
+func (s *ScaleDown) minInstances() int { return s.bound } diff --git a/test/longhaul/operations/scale_test.go b/test/longhaul/operations/scale_test.go index 58c950399..de9fdb7fa 100644 --- a/test/longhaul/operations/scale_test.go +++ b/test/longhaul/operations/scale_test.go @@ -56,7 +56,7 @@ var _ = Describe("ScaleUp", func() { DescribeTable("clamps maxInstances to the CRD upper bound", func(in, want int) { s := NewScaleUp(&fakeClient{}, nil, in, time.Second) - Expect(s.maxInstances).To(Equal(want)) + Expect(s.maxInstances()).To(Equal(want)) }, Entry("1->1", 1, 1), Entry("2->2", 2, 2), @@ -100,7 +100,7 @@ var _ = Describe("ScaleDown", func() { DescribeTable("clamps minInstances to the CRD lower bound", func(in, want int) { s := NewScaleDown(&fakeClient{}, nil, in, time.Second) - Expect(s.minInstances).To(Equal(want)) + Expect(s.minInstances()).To(Equal(want)) }, Entry("0 -> 1", 0, 1), Entry("-5 -> 1", -5, 1), diff --git a/test/longhaul/operations/scheduler.go b/test/longhaul/operations/scheduler.go index 2567f36e5..a9351d18d 100644 --- a/test/longhaul/operations/scheduler.go +++ b/test/longhaul/operations/scheduler.go @@ -42,10 +42,10 @@ type Scheduler struct { journal *journal.Journal cooldown time.Duration - mu sync.Mutex - lastOpTime time.Time - opsExecuted int - inProgress bool + mu sync.Mutex + lastOpTime time.Time + opsExecuted int + inProgress bool } // NewScheduler creates an operation scheduler. diff --git a/test/longhaul/report/checkpoint.go b/test/longhaul/report/checkpoint.go index 2c698ae42..5875412c7 100644 --- a/test/longhaul/report/checkpoint.go +++ b/test/longhaul/report/checkpoint.go @@ -90,9 +90,9 @@ func (r *CheckpointReporter) emit(ctx context.Context, final bool) { markdown := GenerateMarkdown(summary) // Print to stdout with clear delimiter. 
- fmt.Printf("\n%s\n", "=== CHECKPOINT REPORT ===") + fmt.Println("\n=== CHECKPOINT REPORT ===") fmt.Println(markdown) - fmt.Printf("%s\n\n", "=== END CHECKPOINT ===") + fmt.Print("=== END CHECKPOINT ===\n\n") // Emit GitHub Actions annotations. EmitAnnotation(summary) @@ -141,14 +141,14 @@ func (r *CheckpointReporter) emit(ctx context.Context, final bool) { } // Also log the summary as JSON for structured log consumers. - summaryJSON, _ := json.Marshal(map[string]interface{}{ - "result": resultStr, - "elapsed": summary.Duration.String(), - "writes": summary.Metrics.WriteAttempted, - "gaps": summary.Metrics.GapsDetected, - "ops": summary.OpsExecuted, - "memory_leak": summary.LeakAnalysis.HasLeak, - "memory_slope": fmt.Sprintf("%.2f MB/h", summary.LeakAnalysis.MemorySlopeMB), + summaryJSON, _ := json.Marshal(map[string]any{ + "result": resultStr, + "elapsed": summary.Duration.String(), + "writes": summary.Metrics.WriteAttempted, + "gaps": summary.Metrics.GapsDetected, + "ops": summary.OpsExecuted, + "memory_leak": summary.LeakAnalysis.HasLeak, + "memory_slope": fmt.Sprintf("%.2f MB/h", summary.LeakAnalysis.MemorySlopeMB), "checkpoint_time": time.Now().UTC().Format(time.RFC3339), }) log.Printf("[checkpoint] %s", string(summaryJSON)) diff --git a/test/longhaul/report/report.go b/test/longhaul/report/report.go index 4b43ebe47..efd4f3d33 100644 --- a/test/longhaul/report/report.go +++ b/test/longhaul/report/report.go @@ -41,11 +41,11 @@ func GenerateMarkdown(s Summary) string { b.WriteString("# Long Haul Test Report\n\n") // Header - b.WriteString(fmt.Sprintf("**Result:** %s\n", s.Result)) - b.WriteString(fmt.Sprintf("**Duration:** %s\n", s.Duration.Round(time.Second))) - b.WriteString(fmt.Sprintf("**Operations Executed:** %d\n", s.OpsExecuted)) + fmt.Fprintf(&b, "**Result:** %s\n", s.Result) + fmt.Fprintf(&b, "**Duration:** %s\n", s.Duration.Round(time.Second)) + fmt.Fprintf(&b, "**Operations Executed:** %d\n", s.OpsExecuted) if s.FailReason != "" { - 
b.WriteString(fmt.Sprintf("**Failure Reason:** %s\n", s.FailReason)) + fmt.Fprintf(&b, "**Failure Reason:** %s\n", s.FailReason) } b.WriteString("\n") @@ -53,13 +53,13 @@ func GenerateMarkdown(s Summary) string { b.WriteString("## Data Plane Metrics\n\n") b.WriteString("| Metric | Value |\n") b.WriteString("|--------|-------|\n") - b.WriteString(fmt.Sprintf("| Writes Attempted | %d |\n", s.Metrics.WriteAttempted)) - b.WriteString(fmt.Sprintf("| Writes Acknowledged | %d |\n", s.Metrics.WriteAcknowledged)) - b.WriteString(fmt.Sprintf("| Writes Failed | %d |\n", s.Metrics.WriteFailed)) - b.WriteString(fmt.Sprintf("| Write Success Rate | %.2f%% |\n", s.Metrics.WriteSuccessRate()*100)) - b.WriteString(fmt.Sprintf("| Verify Passes | %d |\n", s.Metrics.VerifyPasses)) - b.WriteString(fmt.Sprintf("| Gaps Detected | %d |\n", s.Metrics.GapsDetected)) - b.WriteString(fmt.Sprintf("| Checksum Errors | %d |\n", s.Metrics.ChecksumErrors)) + fmt.Fprintf(&b, "| Writes Attempted | %d |\n", s.Metrics.WriteAttempted) + fmt.Fprintf(&b, "| Writes Acknowledged | %d |\n", s.Metrics.WriteAcknowledged) + fmt.Fprintf(&b, "| Writes Failed | %d |\n", s.Metrics.WriteFailed) + fmt.Fprintf(&b, "| Write Success Rate | %.2f%% |\n", s.Metrics.WriteSuccessRate()*100) + fmt.Fprintf(&b, "| Verify Passes | %d |\n", s.Metrics.VerifyPasses) + fmt.Fprintf(&b, "| Gaps Detected | %d |\n", s.Metrics.GapsDetected) + fmt.Fprintf(&b, "| Checksum Errors | %d |\n", s.Metrics.ChecksumErrors) b.WriteString("\n") // Disruption Windows @@ -72,8 +72,8 @@ func GenerateMarkdown(s Summary) string { if w.ExceededPolicy() { exceeded = "**YES**" } - b.WriteString(fmt.Sprintf("| %s | %s | %d | %s |\n", - w.OperationName, w.Duration().Round(time.Second), w.WriteFailures, exceeded)) + fmt.Fprintf(&b, "| %s | %s | %d | %s |\n", + w.OperationName, w.Duration().Round(time.Second), w.WriteFailures, exceeded) } b.WriteString("\n") } @@ -81,10 +81,10 @@ func GenerateMarkdown(s Summary) string { // Leak Analysis if 
s.LeakAnalysis.SampleCount > 0 { b.WriteString("## Resource Leak Analysis\n\n") - b.WriteString(fmt.Sprintf("- Samples: %d over %s\n", - s.LeakAnalysis.SampleCount, s.LeakAnalysis.Duration.Round(time.Minute))) - b.WriteString(fmt.Sprintf("- Memory trend: %.2f MB/hour\n", s.LeakAnalysis.MemorySlopeMB)) - b.WriteString(fmt.Sprintf("- CPU trend: %.4f cores/hour\n", s.LeakAnalysis.CPUSlopeCores)) + fmt.Fprintf(&b, "- Samples: %d over %s\n", + s.LeakAnalysis.SampleCount, s.LeakAnalysis.Duration.Round(time.Minute)) + fmt.Fprintf(&b, "- Memory trend: %.2f MB/hour\n", s.LeakAnalysis.MemorySlopeMB) + fmt.Fprintf(&b, "- CPU trend: %.4f cores/hour\n", s.LeakAnalysis.CPUSlopeCores) if s.LeakAnalysis.HasLeak { b.WriteString("- **⚠️ Memory leak suspected**\n") } diff --git a/test/longhaul/workload/metrics.go b/test/longhaul/workload/metrics.go index 864f28282..c9ced84ec 100644 --- a/test/longhaul/workload/metrics.go +++ b/test/longhaul/workload/metrics.go @@ -35,7 +35,7 @@ func NewMetrics() *Metrics { } } -// Snapshot returns a point-in-time copy of all metric values. +// MetricsSnapshot is a point-in-time copy of all metric values. type MetricsSnapshot struct { WriteAttempted int64 WriteAcknowledged int64 From 19b8389415bbb3daa89b2e69cca1b57206ae70ac Mon Sep 17 00:00:00 2001 From: Copilot <223556219+Copilot@users.noreply.github.com> Date: Thu, 18 Jun 2026 11:57:08 -0400 Subject: [PATCH 04/11] test(longhaul): single verifier + per-field metric docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Verifier: - Drop NumVerifiers config knob and StartVerifiers helper; always run exactly one verifier. 
- Multiple verifiers were a bug, not a feature: each one independently scanned the whole collection and wrote to the SHARED Metrics.VerifyGapsDetected counter, so a single real gap of N missing seqs was reported as N x NumVerifiers in alerts, and read load on the cluster scaled with NumVerifiers (turning the verifier's own load into a confounding signal for the test report). - One verifier is sufficient: the scan is stateless against the DB and bounded by the per-writer nextSeq resume map, so adding more instances adds noise, not coverage. - LONGHAUL_NUM_VERIFIERS env var removed (along with the cfg field, default, parse, validate, README row, and config_test references). Metrics: - Add per-field godoc on every Metrics field clarifying ack-vs-fail semantics, what triggers FAIL (verifier-side only), DupKey ack behavior, scan-cycle vs document semantics, and StartTime / Elapsed restart behavior. - The user asked 'what is VerifyGapsDetected' — that the question came up at all is evidence the field comments were insufficient. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Copilot <223556219+Copilot@users.noreply.github.com> --- test/longhaul/README.md | 1 - test/longhaul/cmd/longhaul/main.go | 11 +++--- test/longhaul/config/config.go | 16 --------- test/longhaul/config/config_test.go | 3 +- test/longhaul/workload/metrics.go | 54 +++++++++++++++++++++++++---- test/longhaul/workload/verifier.go | 21 ++++++----- 6 files changed, 66 insertions(+), 40 deletions(-) diff --git a/test/longhaul/README.md b/test/longhaul/README.md index 3dc8fa4d3..e84f9de74 100644 --- a/test/longhaul/README.md +++ b/test/longhaul/README.md @@ -127,7 +127,6 @@ All configuration is via environment variables. | `LONGHAUL_NAMESPACE` | No | `default` | Kubernetes namespace of the target cluster. | | `LONGHAUL_MAX_DURATION` | No | `30m` | Max test duration. Use `0s` for run-until-failure. | | `LONGHAUL_NUM_WRITERS` | No | `5` | Number of concurrent writers. 
| -| `LONGHAUL_NUM_VERIFIERS` | No | `2` | Number of concurrent verifiers. | | `LONGHAUL_OP_COOLDOWN` | No | `5m` | Cooldown between management operations. | | `LONGHAUL_RECOVERY_TIMEOUT` | No | `5m` | Max wait for cluster recovery after an operation. | | `LONGHAUL_MIN_INSTANCES` | No | `1` | Minimum `spec.instancesPerNode` for scale-down operations (CRD lower bound: 1). | diff --git a/test/longhaul/cmd/longhaul/main.go b/test/longhaul/cmd/longhaul/main.go index dacd86212..c5a10e444 100644 --- a/test/longhaul/cmd/longhaul/main.go +++ b/test/longhaul/cmd/longhaul/main.go @@ -38,8 +38,8 @@ func main() { log.Fatalf("invalid config: %v", err) } - log.Printf("config loaded: duration=%s namespace=%s cluster=%s writers=%d verifiers=%d", - cfg.MaxDuration, cfg.Namespace, cfg.ClusterName, cfg.NumWriters, cfg.NumVerifiers) + log.Printf("config loaded: duration=%s namespace=%s cluster=%s writers=%d", + cfg.MaxDuration, cfg.Namespace, cfg.ClusterName, cfg.NumWriters) exitCode := run(cfg) os.Exit(exitCode) @@ -126,9 +126,10 @@ func run(cfg config.Config) int { workload.StartWriters(ctx, cfg.NumWriters, db, metrics, j) j.Info("main", fmt.Sprintf("started %d writers", cfg.NumWriters)) - // Start verifiers. - workload.StartVerifiers(ctx, cfg.NumVerifiers, db, metrics, j) - j.Info("main", fmt.Sprintf("started %d verifiers", cfg.NumVerifiers)) + // Start verifier. A single verifier is sufficient — see StartVerifier + // godoc for why multiple verifiers would multi-count gaps. + workload.StartVerifier(ctx, db, metrics, j) + j.Info("main", "verifier started") // Configure operations. ops := []operations.Operation{ diff --git a/test/longhaul/config/config.go b/test/longhaul/config/config.go index 085986c42..18b44b1c3 100644 --- a/test/longhaul/config/config.go +++ b/test/longhaul/config/config.go @@ -21,7 +21,6 @@ const ( // Workload and operation tuning. 
EnvMongoURI = "LONGHAUL_MONGO_URI" EnvNumWriters = "LONGHAUL_NUM_WRITERS" - EnvNumVerifiers = "LONGHAUL_NUM_VERIFIERS" EnvOpCooldown = "LONGHAUL_OP_COOLDOWN" EnvRecoveryTimeout = "LONGHAUL_RECOVERY_TIMEOUT" EnvSteadyStateWait = "LONGHAUL_STEADY_STATE_WAIT" @@ -54,9 +53,6 @@ type Config struct { // NumWriters is the number of concurrent writer goroutines. NumWriters int - // NumVerifiers is the number of concurrent verifier goroutines. - NumVerifiers int - // OpCooldown is the minimum time between disruptive operations. OpCooldown time.Duration @@ -91,7 +87,6 @@ func DefaultConfig() Config { ClusterName: "", MongoURI: "", NumWriters: 5, - NumVerifiers: 2, OpCooldown: 5 * time.Minute, RecoveryTimeout: 5 * time.Minute, SteadyStateWait: 60 * time.Second, @@ -134,14 +129,6 @@ func LoadFromEnv() (Config, error) { cfg.NumWriters = n } - if v := os.Getenv(EnvNumVerifiers); v != "" { - n, err := strconv.Atoi(v) - if err != nil { - return cfg, fmt.Errorf("invalid %s=%q: %w", EnvNumVerifiers, v, err) - } - cfg.NumVerifiers = n - } - if v := os.Getenv(EnvOpCooldown); v != "" { d, err := time.ParseDuration(v) if err != nil { @@ -211,9 +198,6 @@ func (c *Config) Validate() error { if c.NumWriters < 1 { return fmt.Errorf("num writers must be at least 1, got %d", c.NumWriters) } - if c.NumVerifiers < 1 { - return fmt.Errorf("num verifiers must be at least 1, got %d", c.NumVerifiers) - } if c.OpCooldown < 0 { return fmt.Errorf("operation cooldown must not be negative, got %s", c.OpCooldown) } diff --git a/test/longhaul/config/config_test.go b/test/longhaul/config/config_test.go index 0fcc53306..629732954 100644 --- a/test/longhaul/config/config_test.go +++ b/test/longhaul/config/config_test.go @@ -18,7 +18,6 @@ var _ = Describe("Config", func() { Expect(cfg.Namespace).To(Equal("default")) Expect(cfg.ClusterName).To(BeEmpty()) Expect(cfg.NumWriters).To(Equal(5)) - Expect(cfg.NumVerifiers).To(Equal(2)) Expect(cfg.OpCooldown).To(Equal(5 * time.Minute)) 
Expect(cfg.RecoveryTimeout).To(Equal(5 * time.Minute)) Expect(cfg.SteadyStateWait).To(Equal(60 * time.Second)) @@ -33,7 +32,7 @@ var _ = Describe("Config", func() { BeforeEach(func() { for _, k := range []string{ EnvEnabled, EnvMaxDuration, EnvNamespace, EnvClusterName, - EnvMongoURI, EnvNumWriters, EnvNumVerifiers, + EnvMongoURI, EnvNumWriters, EnvOpCooldown, EnvRecoveryTimeout, EnvSteadyStateWait, EnvMinInstances, EnvMaxInstances, EnvReportInterval, } { diff --git a/test/longhaul/workload/metrics.go b/test/longhaul/workload/metrics.go index c9ced84ec..4097e46c0 100644 --- a/test/longhaul/workload/metrics.go +++ b/test/longhaul/workload/metrics.go @@ -13,18 +13,55 @@ import ( // Metrics tracks aggregate workload counters using atomic operations. // All fields are safe for concurrent access from multiple goroutines. +// +// Note on roles: +// - The writer-side counters (WriteAttempted/Acknowledged/Failed) are local +// observations made by writers from their InsertOne return values; they +// feed the disruption-window budget but do NOT by themselves fail the test. +// - The verifier-side counters (VerifyGapsDetected, ChecksumErrors) are the +// durability oracle: any non-zero value flips Result to FAIL in main. type Metrics struct { - // Writer metrics - WriteAttempted atomic.Int64 + // WriteAttempted is the total number of InsertOne calls issued by all + // writers (one per writer tick). Includes calls that later fail or are + // retried as DupKey acks. Each writer ticks every writeInterval (100 ms). + WriteAttempted atomic.Int64 + + // WriteAcknowledged is the number of writes the server confirmed as + // durable. Includes DupKey replies, which are treated as idempotent acks + // because the v2 driver's retryable-writes path can resend a committed + // insert after a dropped ACK (see writer.go:99-110). Equals + // WriteAttempted - WriteFailed in steady state. 
WriteAcknowledged atomic.Int64 - WriteFailed atomic.Int64 - // Verifier metrics - VerifyPasses atomic.Int64 + // WriteFailed counts non-DupKey insert errors observed by writers. + // These do NOT advance the writer's seq counter (so the next tick retries + // the same seq) and therefore do not cause data-loss gaps on their own. + // They DO get charged against the active disruption window's + // AllowedWriteFailures budget via journal.RecordWriteFailure. + WriteFailed atomic.Int64 + + // VerifyPasses is the number of completed verifier scan cycles (not the + // number of documents verified). Each verifier ticks every verifyInterval + // (10 s) and increments this on a clean scan with no gaps/checksum + // mismatches in the rows it observed this cycle. + VerifyPasses atomic.Int64 + + // VerifyGapsDetected is the durability-oracle signal: count of missing seq + // numbers observed in the workload collection. Incremented by + // (doc.Seq - expectedSeq) when the verifier reads a document whose seq is + // higher than the next expected one for that writer (verifier.go:127-135). + // A non-zero value flips Result to FAIL with reason "data loss". VerifyGapsDetected atomic.Int64 - ChecksumErrors atomic.Int64 - // Timing + // ChecksumErrors counts documents whose stored SHA-256 checksum doesn't + // match the recomputed checksum over (writer_id, seq, payload). Indicates + // silent corruption (writer never sees these — only the verifier does). + // A non-zero value flips Result to FAIL with reason "data loss". + ChecksumErrors atomic.Int64 + + // StartTime is the process-local clock time when this Metrics was + // constructed. Used to derive Elapsed in snapshots. Resets when the pod + // restarts (the data history does not — see Writer.Resume). StartTime time.Time } @@ -36,6 +73,9 @@ func NewMetrics() *Metrics { } // MetricsSnapshot is a point-in-time copy of all metric values. 
+// Field semantics mirror the Metrics counters above; GapsDetected is the +// snapshot name for VerifyGapsDetected, and Elapsed is time.Since(StartTime) +// captured at snapshot time. type MetricsSnapshot struct { WriteAttempted int64 WriteAcknowledged int64 diff --git a/test/longhaul/workload/verifier.go b/test/longhaul/workload/verifier.go index ae147b509..2f52ab4cc 100644 --- a/test/longhaul/workload/verifier.go +++ b/test/longhaul/workload/verifier.go @@ -151,13 +151,16 @@ func (v *Verifier) verifyWriter(ctx context.Context, writerID string) { v.nextSeq[writerID] = expectedSeq } -// StartVerifiers launches n verifiers and returns them. -func StartVerifiers(ctx context.Context, n int, db *mongo.Database, metrics *Metrics, j *journal.Journal) []*Verifier { - verifiers := make([]*Verifier, n) - for i := 0; i < n; i++ { - id := fmt.Sprintf("v%03d", i) - verifiers[i] = NewVerifier(id, db, metrics, j) - go verifiers[i].Run(ctx) - } - return verifiers +// StartVerifier launches a single verifier goroutine and returns it. +// +// Only one verifier runs. Each verifier scans the full collection and writes +// to the shared Metrics.VerifyGapsDetected counter, so running multiple +// verifiers would multi-count every real gap by N and double the read load +// on the cluster (turning the verifier's own load into a confounding signal +// for the test report). One verifier is sufficient because the scan is +// stateless and bounded by the per-writer nextSeq resume map. 
+func StartVerifier(ctx context.Context, db *mongo.Database, metrics *Metrics, j *journal.Journal) *Verifier { + v := NewVerifier("v000", db, metrics, j) + go v.Run(ctx) + return v } From 656757b91362f0e3cda152d576a9462c439da42a Mon Sep 17 00:00:00 2001 From: Copilot <223556219+Copilot@users.noreply.github.com> Date: Thu, 18 Jun 2026 12:03:16 -0400 Subject: [PATCH 05/11] test(longhaul): tighten verifier/metrics doc comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Metrics: one-line godoc per field; drop the multi-paragraph prose. - Verifier: drop the dead 'id' field (only one verifier exists), update test/log sites accordingly. - Fix incorrect nextSeq comment: once we step past a gap, expectedSeq advances to seq+1, so a late-arriving fill at the missing seq is filtered out by 'seq >= nextSeq' on subsequent cycles — gaps are counted exactly once, not re-checked. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Copilot <223556219+Copilot@users.noreply.github.com> --- test/longhaul/workload/metrics.go | 58 ++++++++----------------- test/longhaul/workload/verifier.go | 45 ++++++++++--------- test/longhaul/workload/verifier_test.go | 5 +-- 3 files changed, 42 insertions(+), 66 deletions(-) diff --git a/test/longhaul/workload/metrics.go b/test/longhaul/workload/metrics.go index 4097e46c0..e8837d202 100644 --- a/test/longhaul/workload/metrics.go +++ b/test/longhaul/workload/metrics.go @@ -12,56 +12,36 @@ import ( ) // Metrics tracks aggregate workload counters using atomic operations. -// All fields are safe for concurrent access from multiple goroutines. // -// Note on roles: -// - The writer-side counters (WriteAttempted/Acknowledged/Failed) are local -// observations made by writers from their InsertOne return values; they -// feed the disruption-window budget but do NOT by themselves fail the test. 
-// - The verifier-side counters (VerifyGapsDetected, ChecksumErrors) are the -// durability oracle: any non-zero value flips Result to FAIL in main. +// Writer-side counters (WriteAttempted/Acknowledged/Failed) feed the +// disruption-window budget but do not fail the test on their own. +// Verifier-side counters (VerifyGapsDetected, ChecksumErrors) are the +// durability oracle: any non-zero value flips Result to FAIL. type Metrics struct { - // WriteAttempted is the total number of InsertOne calls issued by all - // writers (one per writer tick). Includes calls that later fail or are - // retried as DupKey acks. Each writer ticks every writeInterval (100 ms). + // WriteAttempted is the total number of InsertOne calls issued by writers. WriteAttempted atomic.Int64 - // WriteAcknowledged is the number of writes the server confirmed as - // durable. Includes DupKey replies, which are treated as idempotent acks - // because the v2 driver's retryable-writes path can resend a committed - // insert after a dropped ACK (see writer.go:99-110). Equals - // WriteAttempted - WriteFailed in steady state. + // WriteAcknowledged is the number of writes the server confirmed durable. + // Includes DupKey replies (treated as idempotent acks for retryable writes). WriteAcknowledged atomic.Int64 - // WriteFailed counts non-DupKey insert errors observed by writers. - // These do NOT advance the writer's seq counter (so the next tick retries - // the same seq) and therefore do not cause data-loss gaps on their own. - // They DO get charged against the active disruption window's - // AllowedWriteFailures budget via journal.RecordWriteFailure. + // WriteFailed counts non-DupKey insert errors. Does not advance seq, so + // the next tick retries the same seq; charged against the disruption-window + // budget via journal.RecordWriteFailure. WriteFailed atomic.Int64 - // VerifyPasses is the number of completed verifier scan cycles (not the - // number of documents verified). 
Each verifier ticks every verifyInterval - // (10 s) and increments this on a clean scan with no gaps/checksum - // mismatches in the rows it observed this cycle. + // VerifyPasses is the number of completed verifier scan cycles. VerifyPasses atomic.Int64 - // VerifyGapsDetected is the durability-oracle signal: count of missing seq - // numbers observed in the workload collection. Incremented by - // (doc.Seq - expectedSeq) when the verifier reads a document whose seq is - // higher than the next expected one for that writer (verifier.go:127-135). - // A non-zero value flips Result to FAIL with reason "data loss". + // VerifyGapsDetected counts missing seq numbers observed by the verifier. + // Non-zero => FAIL with reason "data loss". VerifyGapsDetected atomic.Int64 - // ChecksumErrors counts documents whose stored SHA-256 checksum doesn't - // match the recomputed checksum over (writer_id, seq, payload). Indicates - // silent corruption (writer never sees these — only the verifier does). - // A non-zero value flips Result to FAIL with reason "data loss". + // ChecksumErrors counts documents whose stored checksum doesn't match the + // recomputed value. Non-zero => FAIL with reason "data loss". ChecksumErrors atomic.Int64 - // StartTime is the process-local clock time when this Metrics was - // constructed. Used to derive Elapsed in snapshots. Resets when the pod - // restarts (the data history does not — see Writer.Resume). + // StartTime is when this Metrics was constructed; resets on pod restart. StartTime time.Time } @@ -72,10 +52,8 @@ func NewMetrics() *Metrics { } } -// MetricsSnapshot is a point-in-time copy of all metric values. -// Field semantics mirror the Metrics counters above; GapsDetected is the -// snapshot name for VerifyGapsDetected, and Elapsed is time.Since(StartTime) -// captured at snapshot time. +// MetricsSnapshot is a point-in-time copy of Metrics. GapsDetected is the +// snapshot name for VerifyGapsDetected. 
type MetricsSnapshot struct { WriteAttempted int64 WriteAcknowledged int64 diff --git a/test/longhaul/workload/verifier.go b/test/longhaul/workload/verifier.go index 2f52ab4cc..c72f33719 100644 --- a/test/longhaul/workload/verifier.go +++ b/test/longhaul/workload/verifier.go @@ -21,32 +21,32 @@ const ( verifyInterval = 10 * time.Second ) -// Verifier periodically scans the workload collection to detect -// sequence gaps and checksum mismatches in acknowledged writes. +// Verifier periodically scans the workload collection to detect sequence +// gaps and checksum mismatches in acknowledged writes. // -// To bound the per-cycle scan cost over a multi-day run, the verifier tracks -// the next expected sequence per writer in nextSeq and only scans documents -// with seq >= nextSeq. Without this, a 100ms-per-write writer accumulates -// ~864k docs/day and verifyAll would re-read the entire history every 10s -// (~75M doc-reads/hour per writer), which both saturates the cluster and -// turns the verifier's own load into a confounding signal in the report. +// Per-cycle scan cost is bounded by nextSeq: each scan starts at the +// highest-seen seq+1 per writer, so the cycle cost is O(new docs since last +// tick), not O(full history). Without this, a 100ms writer would accumulate +// ~864k docs/day per writer and re-reading the whole collection every 10s +// would dominate cluster load. type Verifier struct { - id string metrics *Metrics journal *journal.Journal collection *mongo.Collection - // nextSeq is the next sequence number we expect to see for each writer. + // nextSeq[writerID] is highest-observed seq + 1 for that writer; documents + // below this point are skipped on subsequent cycles. Consequence: a gap is + // counted exactly once when we step past it — a late-arriving fill at the + // missing seq is not re-checked. // Only mutated from the verifier goroutine, so no lock is needed. nextSeq map[string]int64 } -// NewVerifier creates a verifier with the given ID. 
-func NewVerifier(id string, db *mongo.Database, metrics *Metrics, j *journal.Journal) *Verifier { +// NewVerifier creates a verifier. +func NewVerifier(db *mongo.Database, metrics *Metrics, j *journal.Journal) *Verifier { coll := db.Collection(CollectionName, options.Collection(). SetReadConcern(readconcern.Majority())) return &Verifier{ - id: id, metrics: metrics, journal: j, collection: coll, @@ -56,8 +56,8 @@ func NewVerifier(id string, db *mongo.Database, metrics *Metrics, j *journal.Jou // Run starts the verifier loop. It blocks until the context is cancelled. func (v *Verifier) Run(ctx context.Context) { - v.journal.Info("verifier", fmt.Sprintf("verifier %s started", v.id)) - defer v.journal.Info("verifier", fmt.Sprintf("verifier %s stopped", v.id)) + v.journal.Info("verifier", "verifier started") + defer v.journal.Info("verifier", "verifier stopped") ticker := time.NewTicker(verifyInterval) defer ticker.Stop() @@ -145,9 +145,9 @@ func (v *Verifier) verifyWriter(ctx context.Context, writerID string) { } } - // Persist the resume point. If the cursor returned no rows, expectedSeq is - // unchanged and we'll re-scan from the same point next cycle (correct: a - // gap might fill in later when a delayed/recovered write commits). + // Persist the resume point. Note: if no rows were returned, expectedSeq + // is unchanged; if a gap was crossed, expectedSeq is past it, so a late + // fill at the missing seq will be filtered out by seq >= nextSeq next cycle. v.nextSeq[writerID] = expectedSeq } @@ -155,12 +155,11 @@ func (v *Verifier) verifyWriter(ctx context.Context, writerID string) { // // Only one verifier runs. Each verifier scans the full collection and writes // to the shared Metrics.VerifyGapsDetected counter, so running multiple -// verifiers would multi-count every real gap by N and double the read load -// on the cluster (turning the verifier's own load into a confounding signal -// for the test report). 
One verifier is sufficient because the scan is -// stateless and bounded by the per-writer nextSeq resume map. +// verifiers would multi-count every real gap by N and double the cluster +// read load. One verifier is sufficient because the per-writer nextSeq map +// bounds each cycle to new documents. func StartVerifier(ctx context.Context, db *mongo.Database, metrics *Metrics, j *journal.Journal) *Verifier { - v := NewVerifier("v000", db, metrics, j) + v := NewVerifier(db, metrics, j) go v.Run(ctx) return v } diff --git a/test/longhaul/workload/verifier_test.go b/test/longhaul/workload/verifier_test.go index 606f85892..c675f492d 100644 --- a/test/longhaul/workload/verifier_test.go +++ b/test/longhaul/workload/verifier_test.go @@ -11,11 +11,10 @@ import ( ) var _ = Describe("Verifier", func() { - It("constructor wires id, metrics, and journal correctly", func() { + It("constructor wires metrics and journal correctly", func() { m := NewMetrics() j := journal.New() - v := &Verifier{id: "v007", metrics: m, journal: j, nextSeq: make(map[string]int64)} - Expect(v.id).To(Equal("v007")) + v := &Verifier{metrics: m, journal: j, nextSeq: make(map[string]int64)} Expect(v.metrics).To(BeIdenticalTo(m)) Expect(v.journal).To(BeIdenticalTo(j)) Expect(v.nextSeq).To(BeEmpty()) From e131912756b00990ec1b06f16ca8cc839696412e Mon Sep 17 00:00:00 2001 From: Copilot <223556219+Copilot@users.noreply.github.com> Date: Thu, 18 Jun 2026 12:13:11 -0400 Subject: [PATCH 06/11] test(longhaul): detect tail loss by comparing writer tip to DB contents MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Without this, the verifier could miss data loss in the tail: if seqs above the verifier's last-observed point were acked then lost, and no later writes arrived to expose the gap via the per-doc check, the loss was invisible. Concretely: writer acks 1..110, DB loses 101..110, writer crashes/idles. 
Verifier scans seq>=101, sees nothing, and the durability oracle stays clean — false negative. Fix: snapshot writer.Seq() at the top of each per-writer scan, bound the Find filter to seq <= that snapshot, and after the per-doc loop check whether expectedSeq <= snapshot. Any residual is tail loss; count it under VerifyGapsDetected and log a distinct 'tail loss' message. Advance nextSeq to snapshot+1 so the next cycle scans only newly committed seqs. Race safety: reading writer.Seq() BEFORE the Find guarantees we never count in-flight writes (which commit after Seq() advances). Verified expected <= actual in steady state, so missing>0 only when seqs are truly gone. Also drops the aggregation-based writer-id discovery (we now iterate the writers slice the verifier owns) and adds Writer.Seq() public accessor + unit test. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Copilot <223556219+Copilot@users.noreply.github.com> --- test/longhaul/cmd/longhaul/main.go | 7 +- test/longhaul/workload/metrics.go | 4 +- test/longhaul/workload/verifier.go | 98 +++++++++++++------------ test/longhaul/workload/verifier_test.go | 7 +- test/longhaul/workload/writer.go | 6 ++ test/longhaul/workload/writer_test.go | 7 ++ 6 files changed, 77 insertions(+), 52 deletions(-) diff --git a/test/longhaul/cmd/longhaul/main.go b/test/longhaul/cmd/longhaul/main.go index c5a10e444..0c31663f5 100644 --- a/test/longhaul/cmd/longhaul/main.go +++ b/test/longhaul/cmd/longhaul/main.go @@ -123,12 +123,13 @@ func run(cfg config.Config) int { leakDetector := monitor.NewLeakDetector(j, 10.0, 10) // Start writers. - workload.StartWriters(ctx, cfg.NumWriters, db, metrics, j) + writers := workload.StartWriters(ctx, cfg.NumWriters, db, metrics, j) j.Info("main", fmt.Sprintf("started %d writers", cfg.NumWriters)) // Start verifier. A single verifier is sufficient — see StartVerifier - // godoc for why multiple verifiers would multi-count gaps. 
- workload.StartVerifier(ctx, db, metrics, j) + // godoc. Writers are passed so the verifier can detect tail loss by + // comparing each writer's acked tip against what's in the DB. + workload.StartVerifier(ctx, db, writers, metrics, j) j.Info("main", "verifier started") // Configure operations. diff --git a/test/longhaul/workload/metrics.go b/test/longhaul/workload/metrics.go index e8837d202..64c1a416e 100644 --- a/test/longhaul/workload/metrics.go +++ b/test/longhaul/workload/metrics.go @@ -33,7 +33,9 @@ type Metrics struct { // VerifyPasses is the number of completed verifier scan cycles. VerifyPasses atomic.Int64 - // VerifyGapsDetected counts missing seq numbers observed by the verifier. + // VerifyGapsDetected counts missing seq numbers observed by the verifier — + // both internal gaps (a hole between two observed docs) and tail loss + // (acked seqs beyond the highest doc present in the DB). // Non-zero => FAIL with reason "data loss". VerifyGapsDetected atomic.Int64 diff --git a/test/longhaul/workload/verifier.go b/test/longhaul/workload/verifier.go index c72f33719..4e3f2c223 100644 --- a/test/longhaul/workload/verifier.go +++ b/test/longhaul/workload/verifier.go @@ -22,10 +22,10 @@ const ( ) // Verifier periodically scans the workload collection to detect sequence -// gaps and checksum mismatches in acknowledged writes. +// gaps, tail loss, and checksum mismatches in acknowledged writes. // // Per-cycle scan cost is bounded by nextSeq: each scan starts at the -// highest-seen seq+1 per writer, so the cycle cost is O(new docs since last +// highest-checked seq+1 per writer, so cycle cost is O(new docs since last // tick), not O(full history). Without this, a 100ms writer would accumulate // ~864k docs/day per writer and re-reading the whole collection every 10s // would dominate cluster load. 
@@ -33,23 +33,27 @@ type Verifier struct { metrics *Metrics journal *journal.Journal collection *mongo.Collection + writers []*Writer - // nextSeq[writerID] is highest-observed seq + 1 for that writer; documents - // below this point are skipped on subsequent cycles. Consequence: a gap is - // counted exactly once when we step past it — a late-arriving fill at the - // missing seq is not re-checked. + // nextSeq[writerID] is the seq we'll start the next scan from for that + // writer — set to (snapshotted writer.Seq() + 1) at the end of each cycle. + // Consequence: any seq <= that snapshot is accounted for exactly once; + // a late-arriving fill at a missing seq is not re-checked. // Only mutated from the verifier goroutine, so no lock is needed. nextSeq map[string]int64 } -// NewVerifier creates a verifier. -func NewVerifier(db *mongo.Database, metrics *Metrics, j *journal.Journal) *Verifier { +// NewVerifier creates a verifier. writers is the set of writers whose tips +// the verifier will compare against the DB for tail-loss detection; pass nil +// to disable tail-loss checks (useful in unit tests). +func NewVerifier(db *mongo.Database, writers []*Writer, metrics *Metrics, j *journal.Journal) *Verifier { coll := db.Collection(CollectionName, options.Collection(). SetReadConcern(readconcern.Majority())) return &Verifier{ metrics: metrics, journal: j, collection: coll, + writers: writers, nextSeq: make(map[string]int64), } } @@ -73,43 +77,38 @@ func (v *Verifier) Run(ctx context.Context) { } func (v *Verifier) verifyAll(ctx context.Context) { - // Get distinct writer IDs using aggregation (v2 API compatible). 
- pipeline := bson.A{ - bson.D{{Key: "$group", Value: bson.D{{Key: "_id", Value: "$writer_id"}}}}, + for _, w := range v.writers { + v.verifyWriter(ctx, w) } - cursor, err := v.collection.Aggregate(ctx, pipeline) - if err != nil { - v.journal.Warn("verifier", fmt.Sprintf("failed to get writer IDs: %v", err)) - return - } - defer cursor.Close(ctx) - - var results []struct { - ID string `bson:"_id"` - } - if err := cursor.All(ctx, &results); err != nil { - v.journal.Warn("verifier", fmt.Sprintf("failed to decode writer IDs: %v", err)) - return - } - - for _, r := range results { - v.verifyWriter(ctx, r.ID) - } - v.metrics.VerifyPasses.Add(1) } -func (v *Verifier) verifyWriter(ctx context.Context, writerID string) { - // Resume from where the previous cycle left off. First-ever scan starts at 1. +func (v *Verifier) verifyWriter(ctx context.Context, w *Writer) { + writerID := w.id + + // Snapshot the writer's tip BEFORE scanning. Writes that commit after this + // point land above maxSeq and the scan filter excludes them, so they're + // accounted for in the next cycle. Reading w.Seq() first (vs. CountDocuments + // first) guarantees expected <= what's-in-DB modulo real loss, so no false + // positives from in-flight writes. + maxSeq := w.Seq() + expectedSeq := v.nextSeq[writerID] if expectedSeq == 0 { expectedSeq = 1 } + if maxSeq < expectedSeq { + // Nothing new committed since last cycle. + return + } opts := options.Find().SetSort(bson.D{{Key: "seq", Value: 1}}) filter := bson.D{ {Key: "writer_id", Value: writerID}, - {Key: "seq", Value: bson.D{{Key: "$gte", Value: expectedSeq}}}, + {Key: "seq", Value: bson.D{ + {Key: "$gte", Value: expectedSeq}, + {Key: "$lte", Value: maxSeq}, + }}, } cursor, err := v.collection.Find(ctx, filter, opts) if err != nil { @@ -125,7 +124,7 @@ func (v *Verifier) verifyWriter(ctx context.Context, writerID string) { continue } - // Check for gaps in the sequence. + // Internal gap: missing seq numbers between two observed docs. 
if doc.Seq > expectedSeq { gaps := doc.Seq - expectedSeq v.metrics.VerifyGapsDetected.Add(gaps) @@ -145,21 +144,30 @@ func (v *Verifier) verifyWriter(ctx context.Context, writerID string) { } } - // Persist the resume point. Note: if no rows were returned, expectedSeq - // is unchanged; if a gap was crossed, expectedSeq is past it, so a late - // fill at the missing seq will be filtered out by seq >= nextSeq next cycle. - v.nextSeq[writerID] = expectedSeq + // Tail loss: writer acked through maxSeq but DB has nothing in + // (expectedSeq-1, maxSeq]. This catches the case where the most recent + // acked writes vanished and no later writes have arrived to expose the + // gap via the per-doc check above. + if expectedSeq <= maxSeq { + tail := maxSeq - expectedSeq + 1 + v.metrics.VerifyGapsDetected.Add(tail) + v.journal.Error("verifier", fmt.Sprintf( + "tail loss: writer=%s expected_seq=%d acked_tip=%d (missing %d)", + writerID, expectedSeq, maxSeq, tail)) + } + + // We've accounted for every seq up to maxSeq; advance the resume point. + v.nextSeq[writerID] = maxSeq + 1 } // StartVerifier launches a single verifier goroutine and returns it. // -// Only one verifier runs. Each verifier scans the full collection and writes -// to the shared Metrics.VerifyGapsDetected counter, so running multiple -// verifiers would multi-count every real gap by N and double the cluster -// read load. One verifier is sufficient because the per-writer nextSeq map -// bounds each cycle to new documents. -func StartVerifier(ctx context.Context, db *mongo.Database, metrics *Metrics, j *journal.Journal) *Verifier { - v := NewVerifier(db, metrics, j) +// Only one verifier runs. Each verifier writes to the shared +// Metrics.VerifyGapsDetected counter, so running multiple would multi-count +// every real gap by N and double the cluster read load. One is sufficient +// because the per-writer nextSeq map bounds each cycle to new documents. 
+func StartVerifier(ctx context.Context, db *mongo.Database, writers []*Writer, metrics *Metrics, j *journal.Journal) *Verifier { + v := NewVerifier(db, writers, metrics, j) go v.Run(ctx) return v } diff --git a/test/longhaul/workload/verifier_test.go b/test/longhaul/workload/verifier_test.go index c675f492d..ec38ebcb3 100644 --- a/test/longhaul/workload/verifier_test.go +++ b/test/longhaul/workload/verifier_test.go @@ -21,9 +21,10 @@ var _ = Describe("Verifier", func() { }) It("nextSeq is the per-writer resume point that bounds per-cycle scan cost", func() { - // verifyWriter sets nextSeq[writerID] to the seq AFTER the last seen doc, - // so on the next cycle the scan filter is "seq >= nextSeq". This is what - // keeps the per-cycle scan cost bounded over a multi-day run. + // verifyWriter sets nextSeq[writerID] to maxSeq+1 (writer's tip at scan + // time + 1), so the next cycle scans seq in (prev_tip, new_tip]. This + // is what bounds per-cycle scan cost AND lets tail loss be detected by + // comparing maxSeq against the highest doc actually present. v := &Verifier{nextSeq: make(map[string]int64)} got, ok := v.nextSeq["w1"] diff --git a/test/longhaul/workload/writer.go b/test/longhaul/workload/writer.go index 052b79f16..e0279f2ec 100644 --- a/test/longhaul/workload/writer.go +++ b/test/longhaul/workload/writer.go @@ -59,6 +59,12 @@ func NewWriter(id string, db *mongo.Database, metrics *Metrics, j *journal.Journ } } +// Seq returns the highest sequence number this writer has successfully +// committed (including DupKey-as-ack). Safe to call from any goroutine. +func (w *Writer) Seq() int64 { + return w.seq.Load() +} + // Run starts the writer loop. It blocks until the context is cancelled. 
func (w *Writer) Run(ctx context.Context) { w.journal.Info("writer", fmt.Sprintf("writer %s started", w.id)) diff --git a/test/longhaul/workload/writer_test.go b/test/longhaul/workload/writer_test.go index 6641dba4f..bb91d8992 100644 --- a/test/longhaul/workload/writer_test.go +++ b/test/longhaul/workload/writer_test.go @@ -62,4 +62,11 @@ var _ = Describe("Writer", func() { Expect(w.seq.Add(1)).To(Equal(i)) } }) + + It("Seq() returns the current committed sequence number", func() { + w := &Writer{id: "w001"} + Expect(w.Seq()).To(BeZero()) + w.seq.Store(42) + Expect(w.Seq()).To(Equal(int64(42))) + }) }) From 44ce98036b26d3112c6b77db0026e2716dea61aa Mon Sep 17 00:00:00 2001 From: Copilot <223556219+Copilot@users.noreply.github.com> Date: Thu, 18 Jun 2026 12:23:13 -0400 Subject: [PATCH 07/11] test(longhaul): unit-test verifier audit math and writer state machine Verifier: extract auditDocs as a pure function that takes the decoded docs and (expectedSeq, maxSeq) and returns (newExpectedSeq, internalGaps, tailLoss, checksumErrors) plus a list of structured findings the caller logs. This decouples the gap/tail/checksum math from cursor iteration and the journal, so it's unit-testable without a live mongo. Writer: introduce writeBackend interface (insert + isDuplicate + highestSeq) and wrap *mongo.Collection in mongoBackend. writeOne and Resume now go through the interface, so we can stub it in tests. Adds 12 audit table cases (no docs, clean run, gap-in-middle, gap-at-start, empty-with-tail, tail-after-docs, gap+tail, late-write-exposes-gap, single checksum, multi-checksum, nextSeq advance invariant, writerID preserved in findings) and 9 writer cases (success, DupKey-as-ack, transient error, retry-after-error, monotonic advance; Resume empty / has-data / on-error). Behavior unchanged. Net: ~+200 lines test, ~+50/-40 lines source.
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Copilot <223556219+Copilot@users.noreply.github.com> --- test/longhaul/workload/audit_test.go | 161 ++++++++++++++++ test/longhaul/workload/verifier.go | 152 ++++++++++++---- test/longhaul/workload/writer.go | 74 +++++--- test/longhaul/workload/writer_backend_test.go | 172 ++++++++++++++++++ 4 files changed, 506 insertions(+), 53 deletions(-) create mode 100644 test/longhaul/workload/audit_test.go create mode 100644 test/longhaul/workload/writer_backend_test.go diff --git a/test/longhaul/workload/audit_test.go b/test/longhaul/workload/audit_test.go new file mode 100644 index 000000000..4b523dd20 --- /dev/null +++ b/test/longhaul/workload/audit_test.go @@ -0,0 +1,161 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package workload + +import ( + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +// makeDoc constructs a valid WriteDocument whose checksum matches; tests that +// want a checksum mismatch override Checksum directly. +func makeDoc(writerID string, seq int64) WriteDocument { + payload := "p" + return WriteDocument{ + WriterID: writerID, + Seq: seq, + Payload: payload, + Checksum: computeChecksum(writerID, seq, payload), + } +} + +var _ = Describe("auditDocs", func() { + It("returns zero counters when there are no docs and maxSeq < expectedSeq", func() { + // Cycle with no new writes since last tick: expectedSeq=5, maxSeq=4. + // Note: verifyWriter short-circuits this case BEFORE calling auditDocs, + // but auditDocs itself must still be safe — it should report no tail. + r := auditDocs("w1", nil, 5, 4) + Expect(r.internalGaps).To(BeZero()) + Expect(r.tailLoss).To(BeZero()) + Expect(r.checksumErrors).To(BeZero()) + Expect(r.findings).To(BeEmpty()) + // expectedSeq is unchanged when expectedSeq > maxSeq. 
+ Expect(r.newExpectedSeq).To(Equal(int64(5))) + }) + + It("reports a clean contiguous run with no gaps and no tail", func() { + // expectedSeq=1, docs=[1,2,3], maxSeq=3. + docs := []WriteDocument{makeDoc("w1", 1), makeDoc("w1", 2), makeDoc("w1", 3)} + r := auditDocs("w1", docs, 1, 3) + Expect(r.internalGaps).To(BeZero()) + Expect(r.tailLoss).To(BeZero()) + Expect(r.checksumErrors).To(BeZero()) + Expect(r.findings).To(BeEmpty()) + Expect(r.newExpectedSeq).To(Equal(int64(4))) + }) + + It("detects an internal gap between two docs", func() { + // expectedSeq=1, docs=[1,4,5], maxSeq=5 → gap of 2 (seqs 2 and 3). + docs := []WriteDocument{makeDoc("w1", 1), makeDoc("w1", 4), makeDoc("w1", 5)} + r := auditDocs("w1", docs, 1, 5) + Expect(r.internalGaps).To(Equal(int64(2))) + Expect(r.tailLoss).To(BeZero()) + Expect(r.checksumErrors).To(BeZero()) + Expect(r.findings).To(HaveLen(1)) + Expect(r.findings[0].kind).To(Equal(findingGap)) + Expect(r.findings[0].seq).To(Equal(int64(2))) // first missing + Expect(r.findings[0].endSeq).To(Equal(int64(4))) // the doc that exposed the gap + Expect(r.findings[0].count).To(Equal(int64(2))) + Expect(r.newExpectedSeq).To(Equal(int64(6))) + }) + + It("detects a gap at the start of the scan window", func() { + // expectedSeq=1, docs=[3], maxSeq=3 → gap of 2 (seqs 1 and 2). + docs := []WriteDocument{makeDoc("w1", 3)} + r := auditDocs("w1", docs, 1, 3) + Expect(r.internalGaps).To(Equal(int64(2))) + Expect(r.tailLoss).To(BeZero()) + Expect(r.newExpectedSeq).To(Equal(int64(4))) + }) + + It("detects tail loss with no docs at all (empty scan, non-zero tip)", func() { + // Case B from review: writer acked through seq=10, DB lost them all, + // no later writes. expectedSeq=1, docs=[], maxSeq=10 → tail=10. 
+ r := auditDocs("w1", nil, 1, 10) + Expect(r.internalGaps).To(BeZero()) + Expect(r.tailLoss).To(Equal(int64(10))) + Expect(r.findings).To(HaveLen(1)) + Expect(r.findings[0].kind).To(Equal(findingTail)) + Expect(r.findings[0].seq).To(Equal(int64(1))) + Expect(r.findings[0].endSeq).To(Equal(int64(10))) + Expect(r.findings[0].count).To(Equal(int64(10))) + Expect(r.newExpectedSeq).To(Equal(int64(11))) + }) + + It("detects tail loss when last doc is below maxSeq", func() { + // expectedSeq=1, docs=[1,2], maxSeq=5 → tail of 3 (seqs 3,4,5). + docs := []WriteDocument{makeDoc("w1", 1), makeDoc("w1", 2)} + r := auditDocs("w1", docs, 1, 5) + Expect(r.internalGaps).To(BeZero()) + Expect(r.tailLoss).To(Equal(int64(3))) + Expect(r.findings).To(HaveLen(1)) + Expect(r.findings[0].kind).To(Equal(findingTail)) + Expect(r.findings[0].seq).To(Equal(int64(3))) + Expect(r.findings[0].endSeq).To(Equal(int64(5))) + Expect(r.newExpectedSeq).To(Equal(int64(6))) + }) + + It("detects internal gap AND tail loss in the same cycle", func() { + // expectedSeq=1, docs=[2,5], maxSeq=8 → gap=1 (seq 1) + gap=2 (seqs 3,4) + tail=3 (6,7,8). + // Internal gaps total = 3; tail = 3. + docs := []WriteDocument{makeDoc("w1", 2), makeDoc("w1", 5)} + r := auditDocs("w1", docs, 1, 8) + Expect(r.internalGaps).To(Equal(int64(3))) + Expect(r.tailLoss).To(Equal(int64(3))) + Expect(r.findings).To(HaveLen(3)) // two gaps + one tail + Expect(r.findings[0].kind).To(Equal(findingGap)) + Expect(r.findings[1].kind).To(Equal(findingGap)) + Expect(r.findings[2].kind).To(Equal(findingTail)) + Expect(r.newExpectedSeq).To(Equal(int64(9))) + }) + + It("Case A: late write exposes an earlier-lost range as an internal gap", func() { + // Original: writer acked 1..110, DB loses 101..110, writer writes 111. + // Verifier last cycle: nextSeq[w1]=101, so expectedSeq=101. + // docs=[doc with seq=111], maxSeq=111 → gap=10 (101..110), no tail. 
+ docs := []WriteDocument{makeDoc("w1", 111)} + r := auditDocs("w1", docs, 101, 111) + Expect(r.internalGaps).To(Equal(int64(10))) + Expect(r.tailLoss).To(BeZero()) + Expect(r.newExpectedSeq).To(Equal(int64(112))) + }) + + It("detects a checksum mismatch without affecting gap counters", func() { + // docs=[1,2,3], doc 2 has a bad checksum. + bad := makeDoc("w1", 2) + bad.Checksum = "deadbeef00000000" + docs := []WriteDocument{makeDoc("w1", 1), bad, makeDoc("w1", 3)} + r := auditDocs("w1", docs, 1, 3) + Expect(r.checksumErrors).To(Equal(int64(1))) + Expect(r.internalGaps).To(BeZero()) + Expect(r.tailLoss).To(BeZero()) + Expect(r.findings).To(HaveLen(1)) + Expect(r.findings[0].kind).To(Equal(findingChecksum)) + Expect(r.findings[0].seq).To(Equal(int64(2))) + Expect(r.findings[0].stored).To(Equal("deadbeef00000000")) + }) + + It("counts checksum errors across multiple bad docs", func() { + bad1 := makeDoc("w1", 1) + bad1.Checksum = "xx" + bad2 := makeDoc("w1", 2) + bad2.Checksum = "yy" + r := auditDocs("w1", []WriteDocument{bad1, bad2}, 1, 2) + Expect(r.checksumErrors).To(Equal(int64(2))) + Expect(r.findings).To(HaveLen(2)) + }) + + It("advances newExpectedSeq to maxSeq+1 even with tail loss", func() { + // Critical invariant: after a tail-loss cycle, nextSeq must move past + // maxSeq so the next cycle doesn't re-detect the same tail. 
+ r := auditDocs("w1", nil, 1, 100) + Expect(r.newExpectedSeq).To(Equal(int64(101))) + }) + + It("preserves writerID in findings", func() { + r := auditDocs("worker-xyz", nil, 1, 1) + Expect(r.findings).To(HaveLen(1)) + Expect(r.findings[0].writerID).To(Equal("worker-xyz")) + }) +}) diff --git a/test/longhaul/workload/verifier.go b/test/longhaul/workload/verifier.go index 4e3f2c223..e64d0740f 100644 --- a/test/longhaul/workload/verifier.go +++ b/test/longhaul/workload/verifier.go @@ -83,6 +83,89 @@ func (v *Verifier) verifyAll(ctx context.Context) { v.metrics.VerifyPasses.Add(1) } +// findingKind labels what the verifier observed at a particular seq. +type findingKind int + +const ( + findingGap findingKind = iota + findingChecksum + findingTail +) + +// finding describes a single anomaly the verifier observed; auditDocs returns +// these so the caller can log them with full context without coupling the +// math to the journal. +type finding struct { + kind findingKind + writerID string + seq int64 // first missing seq (expectedSeq) for gap/tail; doc.Seq for checksum + endSeq int64 // for gap: doc.Seq; for tail: maxSeq; unused for checksum + count int64 // number of missing seqs (gap/tail), 1 for checksum + stored string // for checksum only + computed string // for checksum only +} + +// auditResult is the aggregate counts from one verifyWriter cycle. Pure — +// no I/O — so it's table-testable without a mongo. +type auditResult struct { + newExpectedSeq int64 + internalGaps int64 + tailLoss int64 + checksumErrors int64 + findings []finding +} + +// auditDocs is the pure decision core of verifyWriter. Given the docs the +// verifier read (sorted by seq ascending) plus the writer's expected starting +// seq and current tip, it returns the new expected seq, the gap/checksum/tail +// counters, and a list of findings for the caller to log. +// +// Invariants checked: +// - For each doc, if doc.Seq > expectedSeq, the slots in [expectedSeq, doc.Seq) +// are missing (internal gap).
+// - For each doc, checksum is recomputed and compared. +// - After processing all docs, if expectedSeq <= maxSeq the trailing slots +// [expectedSeq, maxSeq] are missing (tail loss). +// - On exit, newExpectedSeq is always maxSeq+1 when maxSeq >= initial +// expectedSeq, so the next cycle accounts for everything past maxSeq. +func auditDocs(writerID string, docs []WriteDocument, expectedSeq, maxSeq int64) auditResult { + var r auditResult + for _, doc := range docs { + if doc.Seq > expectedSeq { + gaps := doc.Seq - expectedSeq + r.internalGaps += gaps + r.findings = append(r.findings, finding{ + kind: findingGap, writerID: writerID, + seq: expectedSeq, endSeq: doc.Seq, count: gaps, + }) + } + expectedSeq = doc.Seq + 1 + + want := computeChecksum(doc.WriterID, doc.Seq, doc.Payload) + if doc.Checksum != want { + r.checksumErrors++ + r.findings = append(r.findings, finding{ + kind: findingChecksum, writerID: writerID, + seq: doc.Seq, count: 1, + stored: doc.Checksum, computed: want, + }) + } + } + + if expectedSeq <= maxSeq { + tail := maxSeq - expectedSeq + 1 + r.tailLoss = tail + r.findings = append(r.findings, finding{ + kind: findingTail, writerID: writerID, + seq: expectedSeq, endSeq: maxSeq, count: tail, + }) + expectedSeq = maxSeq + 1 + } + + r.newExpectedSeq = expectedSeq + return r +} + func (v *Verifier) verifyWriter(ctx context.Context, w *Writer) { writerID := w.id @@ -102,6 +185,26 @@ func (v *Verifier) verifyWriter(ctx context.Context, w *Writer) { return } + docs, err := v.fetchDocs(ctx, writerID, expectedSeq, maxSeq) + if err != nil { + v.journal.Warn("verifier", fmt.Sprintf("query failed for writer %s: %v", writerID, err)) + return + } + + r := auditDocs(writerID, docs, expectedSeq, maxSeq) + v.metrics.VerifyGapsDetected.Add(r.internalGaps + r.tailLoss) + v.metrics.ChecksumErrors.Add(r.checksumErrors) + for _, f := range r.findings { + v.logFinding(f) + } + + v.nextSeq[writerID] = r.newExpectedSeq +} + +// fetchDocs reads all docs for writerID with 
seq in [expectedSeq, maxSeq], +// sorted by seq ascending. Decode errors are logged but skipped (the rest of +// the scan continues; a skipped doc looks like a gap to auditDocs). +func (v *Verifier) fetchDocs(ctx context.Context, writerID string, expectedSeq, maxSeq int64) ([]WriteDocument, error) { opts := options.Find().SetSort(bson.D{{Key: "seq", Value: 1}}) filter := bson.D{ {Key: "writer_id", Value: writerID}, @@ -112,52 +215,37 @@ func (v *Verifier) verifyWriter(ctx context.Context, w *Writer) { } cursor, err := v.collection.Find(ctx, filter, opts) if err != nil { - v.journal.Warn("verifier", fmt.Sprintf("query failed for writer %s: %v", writerID, err)) - return + return nil, err } defer cursor.Close(ctx) + var out []WriteDocument for cursor.Next(ctx) { var doc WriteDocument if err := cursor.Decode(&doc); err != nil { v.journal.Warn("verifier", fmt.Sprintf("decode error for writer %s: %v", writerID, err)) continue } - - // Internal gap: missing seq numbers between two observed docs. - if doc.Seq > expectedSeq { - gaps := doc.Seq - expectedSeq - v.metrics.VerifyGapsDetected.Add(gaps) - v.journal.Error("verifier", fmt.Sprintf( - "gap detected: writer=%s expected_seq=%d got_seq=%d (missing %d)", - writerID, expectedSeq, doc.Seq, gaps)) - } - expectedSeq = doc.Seq + 1 - - // Verify checksum. - expected := computeChecksum(doc.WriterID, doc.Seq, doc.Payload) - if doc.Checksum != expected { - v.metrics.ChecksumErrors.Add(1) - v.journal.Error("verifier", fmt.Sprintf( - "checksum mismatch: writer=%s seq=%d stored=%s computed=%s", - writerID, doc.Seq, doc.Checksum, expected)) - } + out = append(out, doc) } + return out, nil +} - // Tail loss: writer acked through maxSeq but DB has nothing in - // (expectedSeq-1, maxSeq]. This catches the case where the most recent - // acked writes vanished and no later writes have arrived to expose the - // gap via the per-doc check above. 
- if expectedSeq <= maxSeq { - tail := maxSeq - expectedSeq + 1 - v.metrics.VerifyGapsDetected.Add(tail) +func (v *Verifier) logFinding(f finding) { + switch f.kind { + case findingGap: + v.journal.Error("verifier", fmt.Sprintf( + "gap detected: writer=%s expected_seq=%d got_seq=%d (missing %d)", + f.writerID, f.seq, f.endSeq, f.count)) + case findingTail: v.journal.Error("verifier", fmt.Sprintf( "tail loss: writer=%s expected_seq=%d acked_tip=%d (missing %d)", - writerID, expectedSeq, maxSeq, tail)) + f.writerID, f.seq, f.endSeq, f.count)) + case findingChecksum: + v.journal.Error("verifier", fmt.Sprintf( + "checksum mismatch: writer=%s seq=%d stored=%s computed=%s", + f.writerID, f.seq, f.stored, f.computed)) } - - // We've accounted for every seq up to maxSeq; advance the resume point. - v.nextSeq[writerID] = maxSeq + 1 } // StartVerifier launches a single verifier goroutine and returns it. diff --git a/test/longhaul/workload/writer.go b/test/longhaul/workload/writer.go index e0279f2ec..b2e0d31d2 100644 --- a/test/longhaul/workload/writer.go +++ b/test/longhaul/workload/writer.go @@ -37,14 +37,51 @@ type WriteDocument struct { Timestamp time.Time `bson:"timestamp"` } +// writeBackend abstracts the mongo collection so writeOne / Resume can be +// unit-tested with a stub. Production uses mongoBackend (a thin wrapper over +// *mongo.Collection); tests use a controllable fake. +type writeBackend interface { + insert(ctx context.Context, doc WriteDocument) error + isDuplicate(err error) bool + // highestSeq returns the highest seq committed for writerID, or 0 if none. + highestSeq(ctx context.Context, writerID string) (int64, error) +} + +// mongoBackend adapts *mongo.Collection to writeBackend. 
+type mongoBackend struct { + coll *mongo.Collection +} + +func (m mongoBackend) insert(ctx context.Context, doc WriteDocument) error { + _, err := m.coll.InsertOne(ctx, doc) + return err +} + +func (m mongoBackend) isDuplicate(err error) bool { + return mongo.IsDuplicateKeyError(err) +} + +func (m mongoBackend) highestSeq(ctx context.Context, writerID string) (int64, error) { + opts := options.FindOne().SetSort(bson.D{{Key: "seq", Value: -1}}) + var doc WriteDocument + err := m.coll.FindOne(ctx, bson.M{"writer_id": writerID}, opts).Decode(&doc) + if err != nil { + if errors.Is(err, mongo.ErrNoDocuments) { + return 0, nil + } + return 0, err + } + return doc.Seq, nil +} + // Writer performs sequential inserts to a MongoDB collection. // Each writer has a unique ID and tracks its own sequence number. type Writer struct { - id string - seq atomic.Int64 - metrics *Metrics - journal *journal.Journal - collection *mongo.Collection + id string + seq atomic.Int64 + metrics *Metrics + journal *journal.Journal + backend writeBackend } // NewWriter creates a writer with the given ID connected to the specified database. @@ -52,10 +89,10 @@ func NewWriter(id string, db *mongo.Database, metrics *Metrics, j *journal.Journ coll := db.Collection(CollectionName, options.Collection(). SetWriteConcern(writeconcern.Majority())) return &Writer{ - id: id, - metrics: metrics, - journal: j, - collection: coll, + id: id, + metrics: metrics, + journal: j, + backend: mongoBackend{coll: coll}, } } @@ -86,8 +123,8 @@ func (w *Writer) Run(ctx context.Context) { func (w *Writer) writeOne(ctx context.Context) { // Compute the next seq without advancing the counter yet — only commit on // success. Each writer has exactly one goroutine (Run), so a plain - // Load/Store pair is race-free; atomic.Int64 is retained so external - // observers (verifier tests, future debug endpoints) can read it safely. 
+ // Load/Store pair is race-free; atomic.Int64 is retained because the + // verifier reads w.seq concurrently via Seq(). seq := w.seq.Load() + 1 payload := fmt.Sprintf("writer=%s seq=%d t=%d", w.id, seq, time.Now().UnixNano()) checksum := computeChecksum(w.id, seq, payload) @@ -102,7 +139,7 @@ func (w *Writer) writeOne(ctx context.Context) { w.metrics.WriteAttempted.Add(1) - _, err := w.collection.InsertOne(ctx, doc) + err := w.backend.insert(ctx, doc) if err != nil { // Retryable writes are on by default in the v2 driver, so a network // blip during a disruption window can produce this sequence: @@ -111,7 +148,7 @@ func (w *Writer) writeOne(ctx context.Context) { // 3. InsertOne returns a duplicate-key error to us // The data is durably committed in case (3), so we advance seq and // count it as a successful, idempotent ACK. - if mongo.IsDuplicateKeyError(err) { + if w.backend.isDuplicate(err) { w.seq.Store(seq) w.metrics.WriteAcknowledged.Add(1) return @@ -132,17 +169,12 @@ func (w *Writer) writeOne(ctx context.Context) { // up where the previous pod left off instead of colliding with the existing // unique index on (writer_id, seq). func (w *Writer) Resume(ctx context.Context) (int64, error) { - opts := options.FindOne().SetSort(bson.D{{Key: "seq", Value: -1}}) - var doc WriteDocument - err := w.collection.FindOne(ctx, bson.M{"writer_id": w.id}, opts).Decode(&doc) + seq, err := w.backend.highestSeq(ctx, w.id) if err != nil { - if errors.Is(err, mongo.ErrNoDocuments) { - return 0, nil - } return 0, err } - w.seq.Store(doc.Seq) - return doc.Seq, nil + w.seq.Store(seq) + return seq, nil } // computeChecksum creates a deterministic hash of the write for verification. diff --git a/test/longhaul/workload/writer_backend_test.go b/test/longhaul/workload/writer_backend_test.go new file mode 100644 index 000000000..4005330b7 --- /dev/null +++ b/test/longhaul/workload/writer_backend_test.go @@ -0,0 +1,172 @@ +// Copyright (c) Microsoft Corporation. 
+// Licensed under the MIT License. + +package workload + +import ( + "context" + "errors" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/documentdb/documentdb-operator/test/longhaul/journal" +) + +// fakeBackend is a controllable writeBackend stub for writer state-machine tests. +type fakeBackend struct { + insertErrs []error // returned in order; if exhausted, returns last + dupClassifier func(error) bool + insertedDocs []WriteDocument + highestSeqReturn int64 + highestSeqErr error + highestSeqCalls int +} + +func (f *fakeBackend) insert(_ context.Context, doc WriteDocument) error { + f.insertedDocs = append(f.insertedDocs, doc) + if len(f.insertErrs) == 0 { + return nil + } + err := f.insertErrs[0] + if len(f.insertErrs) > 1 { + f.insertErrs = f.insertErrs[1:] + } + return err +} + +func (f *fakeBackend) isDuplicate(err error) bool { + if f.dupClassifier == nil { + return false + } + return f.dupClassifier(err) +} + +func (f *fakeBackend) highestSeq(_ context.Context, _ string) (int64, error) { + f.highestSeqCalls++ + return f.highestSeqReturn, f.highestSeqErr +} + +func newTestWriter(b writeBackend) *Writer { + return &Writer{ + id: "w001", + metrics: NewMetrics(), + journal: journal.New(), + backend: b, + } +} + +var errDup = errors.New("dup-key") +var errTransient = errors.New("transient network error") + +var _ = Describe("Writer.writeOne", func() { + It("on success: advances seq, increments Attempted+Acknowledged, sends correct doc", func() { + b := &fakeBackend{} + w := newTestWriter(b) + w.writeOne(context.Background()) + + Expect(w.Seq()).To(Equal(int64(1))) + Expect(w.metrics.WriteAttempted.Load()).To(Equal(int64(1))) + Expect(w.metrics.WriteAcknowledged.Load()).To(Equal(int64(1))) + Expect(w.metrics.WriteFailed.Load()).To(BeZero()) + + Expect(b.insertedDocs).To(HaveLen(1)) + got := b.insertedDocs[0] + Expect(got.WriterID).To(Equal("w001")) + Expect(got.Seq).To(Equal(int64(1))) + // Checksum must match what 
computeChecksum would produce for this payload. + Expect(got.Checksum).To(Equal(computeChecksum(got.WriterID, got.Seq, got.Payload))) + }) + + It("on DupKey error: advances seq + Acknowledged, NOT WriteFailed (retryable-write ack)", func() { + b := &fakeBackend{ + insertErrs: []error{errDup}, + dupClassifier: func(err error) bool { return errors.Is(err, errDup) }, + } + w := newTestWriter(b) + w.writeOne(context.Background()) + + Expect(w.Seq()).To(Equal(int64(1)), "DupKey is an idempotent ack; seq must advance") + Expect(w.metrics.WriteAcknowledged.Load()).To(Equal(int64(1))) + Expect(w.metrics.WriteFailed.Load()).To(BeZero(), "DupKey must not count as a failure") + }) + + It("on non-DupKey error: seq does NOT advance, WriteFailed++, journal records failure", func() { + b := &fakeBackend{ + insertErrs: []error{errTransient}, + dupClassifier: func(err error) bool { return errors.Is(err, errDup) }, + } + w := newTestWriter(b) + w.writeOne(context.Background()) + + Expect(w.Seq()).To(BeZero(), "seq must not advance on non-DupKey error") + Expect(w.metrics.WriteAttempted.Load()).To(Equal(int64(1))) + Expect(w.metrics.WriteFailed.Load()).To(Equal(int64(1))) + Expect(w.metrics.WriteAcknowledged.Load()).To(BeZero()) + // Next tick must retry the same seq=1 (since seq is still 0, next call computes 0+1). + }) + + It("retries the same seq after a non-DupKey failure", func() { + // First call: transient error -> seq stays 0. + // Second call: success -> seq becomes 1 (retry of the same logical write). 
+ b := &fakeBackend{ + insertErrs: []error{errTransient, nil}, + dupClassifier: func(err error) bool { return false }, + } + w := newTestWriter(b) + w.writeOne(context.Background()) + w.writeOne(context.Background()) + + Expect(w.Seq()).To(Equal(int64(1))) + Expect(b.insertedDocs).To(HaveLen(2)) + Expect(b.insertedDocs[0].Seq).To(Equal(int64(1))) + Expect(b.insertedDocs[1].Seq).To(Equal(int64(1)), "second attempt must reuse seq=1") + Expect(w.metrics.WriteFailed.Load()).To(Equal(int64(1))) + Expect(w.metrics.WriteAcknowledged.Load()).To(Equal(int64(1))) + }) + + It("advances seq monotonically across N successful writes", func() { + b := &fakeBackend{} + w := newTestWriter(b) + for i := 0; i < 5; i++ { + w.writeOne(context.Background()) + } + Expect(w.Seq()).To(Equal(int64(5))) + Expect(b.insertedDocs).To(HaveLen(5)) + for i, doc := range b.insertedDocs { + Expect(doc.Seq).To(Equal(int64(i + 1))) + } + }) +}) + +var _ = Describe("Writer.Resume", func() { + It("on empty collection: returns 0 and leaves seq at 0", func() { + b := &fakeBackend{highestSeqReturn: 0} + w := newTestWriter(b) + got, err := w.Resume(context.Background()) + Expect(err).NotTo(HaveOccurred()) + Expect(got).To(BeZero()) + Expect(w.Seq()).To(BeZero()) + }) + + It("with existing data: seeds seq from the highest persisted seq", func() { + b := &fakeBackend{highestSeqReturn: 42} + w := newTestWriter(b) + got, err := w.Resume(context.Background()) + Expect(err).NotTo(HaveOccurred()) + Expect(got).To(Equal(int64(42))) + Expect(w.Seq()).To(Equal(int64(42)), "subsequent writeOne will compute seq=43") + }) + + It("on backend error: returns the error and leaves seq untouched", func() { + boom := errors.New("network down") + b := &fakeBackend{highestSeqErr: boom} + w := newTestWriter(b) + w.seq.Store(99) // pretend something was already there + + got, err := w.Resume(context.Background()) + Expect(err).To(MatchError(boom)) + Expect(got).To(BeZero()) + Expect(w.Seq()).To(Equal(int64(99)), "Resume must not 
clobber seq on error") + }) +}) From cba60b4bce637ce5d20f8b3bd5892f7a26c1d351 Mon Sep 17 00:00:00 2001 From: Copilot <223556219+Copilot@users.noreply.github.com> Date: Thu, 18 Jun 2026 12:29:47 -0400 Subject: [PATCH 08/11] test(longhaul): consolidate workload tests to one file per source file Merge audit_test.go into verifier_test.go and writer_backend_test.go into writer_test.go. Each source file now has exactly one matching _test.go (Go convention), no test changes. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Copilot <223556219+Copilot@users.noreply.github.com> --- test/longhaul/workload/audit_test.go | 161 ---------------- test/longhaul/workload/verifier_test.go | 152 ++++++++++++++++ test/longhaul/workload/writer_backend_test.go | 172 ------------------ test/longhaul/workload/writer_test.go | 160 ++++++++++++++++ 4 files changed, 312 insertions(+), 333 deletions(-) delete mode 100644 test/longhaul/workload/audit_test.go delete mode 100644 test/longhaul/workload/writer_backend_test.go diff --git a/test/longhaul/workload/audit_test.go b/test/longhaul/workload/audit_test.go deleted file mode 100644 index 4b523dd20..000000000 --- a/test/longhaul/workload/audit_test.go +++ /dev/null @@ -1,161 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT License. - -package workload - -import ( - . "github.com/onsi/ginkgo/v2" - . "github.com/onsi/gomega" -) - -// makeDoc constructs a valid WriteDocument whose checksum matches; tests that -// want a checksum mismatch override Checksum directly. -func makeDoc(writerID string, seq int64) WriteDocument { - payload := "p" - return WriteDocument{ - WriterID: writerID, - Seq: seq, - Payload: payload, - Checksum: computeChecksum(writerID, seq, payload), - } -} - -var _ = Describe("auditDocs", func() { - It("returns zero counters when there are no docs and maxSeq < expectedSeq", func() { - // Cycle with no new writes since last tick: expectedSeq=5, maxSeq=4. 
- // Note: verifyWriter short-circuits this case BEFORE calling auditDocs, - // but auditDocs itself must still be safe — it should report no tail. - r := auditDocs("w1", nil, 5, 4) - Expect(r.internalGaps).To(BeZero()) - Expect(r.tailLoss).To(BeZero()) - Expect(r.checksumErrors).To(BeZero()) - Expect(r.findings).To(BeEmpty()) - // expectedSeq is unchanged when expectedSeq > maxSeq. - Expect(r.newExpectedSeq).To(Equal(int64(5))) - }) - - It("reports a clean contiguous run with no gaps and no tail", func() { - // expectedSeq=1, docs=[1,2,3], maxSeq=3. - docs := []WriteDocument{makeDoc("w1", 1), makeDoc("w1", 2), makeDoc("w1", 3)} - r := auditDocs("w1", docs, 1, 3) - Expect(r.internalGaps).To(BeZero()) - Expect(r.tailLoss).To(BeZero()) - Expect(r.checksumErrors).To(BeZero()) - Expect(r.findings).To(BeEmpty()) - Expect(r.newExpectedSeq).To(Equal(int64(4))) - }) - - It("detects an internal gap between two docs", func() { - // expectedSeq=1, docs=[1,4,5], maxSeq=5 → gap of 2 (seqs 2 and 3). - docs := []WriteDocument{makeDoc("w1", 1), makeDoc("w1", 4), makeDoc("w1", 5)} - r := auditDocs("w1", docs, 1, 5) - Expect(r.internalGaps).To(Equal(int64(2))) - Expect(r.tailLoss).To(BeZero()) - Expect(r.checksumErrors).To(BeZero()) - Expect(r.findings).To(HaveLen(1)) - Expect(r.findings[0].kind).To(Equal(findingGap)) - Expect(r.findings[0].seq).To(Equal(int64(2))) // first missing - Expect(r.findings[0].endSeq).To(Equal(int64(4))) // the doc that exposed the gap - Expect(r.findings[0].count).To(Equal(int64(2))) - Expect(r.newExpectedSeq).To(Equal(int64(6))) - }) - - It("detects a gap at the start of the scan window", func() { - // expectedSeq=1, docs=[3], maxSeq=3 → gap of 2 (seqs 1 and 2). 
- docs := []WriteDocument{makeDoc("w1", 3)} - r := auditDocs("w1", docs, 1, 3) - Expect(r.internalGaps).To(Equal(int64(2))) - Expect(r.tailLoss).To(BeZero()) - Expect(r.newExpectedSeq).To(Equal(int64(4))) - }) - - It("detects tail loss with no docs at all (empty scan, non-zero tip)", func() { - // Case B from review: writer acked through seq=10, DB lost them all, - // no later writes. expectedSeq=1, docs=[], maxSeq=10 → tail=10. - r := auditDocs("w1", nil, 1, 10) - Expect(r.internalGaps).To(BeZero()) - Expect(r.tailLoss).To(Equal(int64(10))) - Expect(r.findings).To(HaveLen(1)) - Expect(r.findings[0].kind).To(Equal(findingTail)) - Expect(r.findings[0].seq).To(Equal(int64(1))) - Expect(r.findings[0].endSeq).To(Equal(int64(10))) - Expect(r.findings[0].count).To(Equal(int64(10))) - Expect(r.newExpectedSeq).To(Equal(int64(11))) - }) - - It("detects tail loss when last doc is below maxSeq", func() { - // expectedSeq=1, docs=[1,2], maxSeq=5 → tail of 3 (seqs 3,4,5). - docs := []WriteDocument{makeDoc("w1", 1), makeDoc("w1", 2)} - r := auditDocs("w1", docs, 1, 5) - Expect(r.internalGaps).To(BeZero()) - Expect(r.tailLoss).To(Equal(int64(3))) - Expect(r.findings).To(HaveLen(1)) - Expect(r.findings[0].kind).To(Equal(findingTail)) - Expect(r.findings[0].seq).To(Equal(int64(3))) - Expect(r.findings[0].endSeq).To(Equal(int64(5))) - Expect(r.newExpectedSeq).To(Equal(int64(6))) - }) - - It("detects internal gap AND tail loss in the same cycle", func() { - // expectedSeq=1, docs=[2,5], maxSeq=8 → gap=1 (seq 1) + gap=2 (seqs 3,4) + tail=3 (6,7,8). - // Internal gaps total = 3; tail = 3. 
- docs := []WriteDocument{makeDoc("w1", 2), makeDoc("w1", 5)} - r := auditDocs("w1", docs, 1, 8) - Expect(r.internalGaps).To(Equal(int64(3))) - Expect(r.tailLoss).To(Equal(int64(3))) - Expect(r.findings).To(HaveLen(3)) // two gaps + one tail - Expect(r.findings[0].kind).To(Equal(findingGap)) - Expect(r.findings[1].kind).To(Equal(findingGap)) - Expect(r.findings[2].kind).To(Equal(findingTail)) - Expect(r.newExpectedSeq).To(Equal(int64(9))) - }) - - It("Case A: late write exposes an earlier-lost range as an internal gap", func() { - // Original: writer acked 1..110, DB loses 101..110, writer writes 111. - // Verifier last cycle: nextSeq[w1]=101, so expectedSeq=101. - // docs=[doc with seq=111], maxSeq=111 → gap=10 (101..110), no tail. - docs := []WriteDocument{makeDoc("w1", 111)} - r := auditDocs("w1", docs, 101, 111) - Expect(r.internalGaps).To(Equal(int64(10))) - Expect(r.tailLoss).To(BeZero()) - Expect(r.newExpectedSeq).To(Equal(int64(112))) - }) - - It("detects a checksum mismatch without affecting gap counters", func() { - // docs=[1,2,3], doc 2 has a bad checksum. 
- bad := makeDoc("w1", 2) - bad.Checksum = "deadbeef00000000" - docs := []WriteDocument{makeDoc("w1", 1), bad, makeDoc("w1", 3)} - r := auditDocs("w1", docs, 1, 3) - Expect(r.checksumErrors).To(Equal(int64(1))) - Expect(r.internalGaps).To(BeZero()) - Expect(r.tailLoss).To(BeZero()) - Expect(r.findings).To(HaveLen(1)) - Expect(r.findings[0].kind).To(Equal(findingChecksum)) - Expect(r.findings[0].seq).To(Equal(int64(2))) - Expect(r.findings[0].stored).To(Equal("deadbeef00000000")) - }) - - It("counts checksum errors across multiple bad docs", func() { - bad1 := makeDoc("w1", 1) - bad1.Checksum = "xx" - bad2 := makeDoc("w1", 2) - bad2.Checksum = "yy" - r := auditDocs("w1", []WriteDocument{bad1, bad2}, 1, 2) - Expect(r.checksumErrors).To(Equal(int64(2))) - Expect(r.findings).To(HaveLen(2)) - }) - - It("advances newExpectedSeq to maxSeq+1 even with tail loss", func() { - // Critical invariant: after a tail-loss cycle, nextSeq must move past - // maxSeq so the next cycle doesn't re-detect the same tail. - r := auditDocs("w1", nil, 1, 100) - Expect(r.newExpectedSeq).To(Equal(int64(101))) - }) - - It("preserves writerID in findings", func() { - r := auditDocs("worker-xyz", nil, 1, 1) - Expect(r.findings).To(HaveLen(1)) - Expect(r.findings[0].writerID).To(Equal("worker-xyz")) - }) -}) diff --git a/test/longhaul/workload/verifier_test.go b/test/longhaul/workload/verifier_test.go index ec38ebcb3..529cbf0bc 100644 --- a/test/longhaul/workload/verifier_test.go +++ b/test/longhaul/workload/verifier_test.go @@ -46,3 +46,155 @@ var _ = Describe("Verifier", func() { Skip("verifyAll requires a *mongo.Database; covered by long-haul integration runs") }) }) + +// makeDoc constructs a valid WriteDocument whose checksum matches; tests that +// want a checksum mismatch override Checksum directly. 
+func makeDoc(writerID string, seq int64) WriteDocument { + payload := "p" + return WriteDocument{ + WriterID: writerID, + Seq: seq, + Payload: payload, + Checksum: computeChecksum(writerID, seq, payload), + } +} + +var _ = Describe("auditDocs", func() { + It("returns zero counters when there are no docs and maxSeq < expectedSeq", func() { + // Cycle with no new writes since last tick: expectedSeq=5, maxSeq=4. + // Note: verifyWriter short-circuits this case BEFORE calling auditDocs, + // but auditDocs itself must still be safe — it should report no tail. + r := auditDocs("w1", nil, 5, 4) + Expect(r.internalGaps).To(BeZero()) + Expect(r.tailLoss).To(BeZero()) + Expect(r.checksumErrors).To(BeZero()) + Expect(r.findings).To(BeEmpty()) + // expectedSeq is unchanged when expectedSeq > maxSeq. + Expect(r.newExpectedSeq).To(Equal(int64(5))) + }) + + It("reports a clean contiguous run with no gaps and no tail", func() { + // expectedSeq=1, docs=[1,2,3], maxSeq=3. + docs := []WriteDocument{makeDoc("w1", 1), makeDoc("w1", 2), makeDoc("w1", 3)} + r := auditDocs("w1", docs, 1, 3) + Expect(r.internalGaps).To(BeZero()) + Expect(r.tailLoss).To(BeZero()) + Expect(r.checksumErrors).To(BeZero()) + Expect(r.findings).To(BeEmpty()) + Expect(r.newExpectedSeq).To(Equal(int64(4))) + }) + + It("detects an internal gap between two docs", func() { + // expectedSeq=1, docs=[1,4,5], maxSeq=5 → gap of 2 (seqs 2 and 3). 
+ docs := []WriteDocument{makeDoc("w1", 1), makeDoc("w1", 4), makeDoc("w1", 5)} + r := auditDocs("w1", docs, 1, 5) + Expect(r.internalGaps).To(Equal(int64(2))) + Expect(r.tailLoss).To(BeZero()) + Expect(r.checksumErrors).To(BeZero()) + Expect(r.findings).To(HaveLen(1)) + Expect(r.findings[0].kind).To(Equal(findingGap)) + Expect(r.findings[0].seq).To(Equal(int64(2))) // first missing + Expect(r.findings[0].endSeq).To(Equal(int64(4))) // the doc that exposed the gap + Expect(r.findings[0].count).To(Equal(int64(2))) + Expect(r.newExpectedSeq).To(Equal(int64(6))) + }) + + It("detects a gap at the start of the scan window", func() { + // expectedSeq=1, docs=[3], maxSeq=3 → gap of 2 (seqs 1 and 2). + docs := []WriteDocument{makeDoc("w1", 3)} + r := auditDocs("w1", docs, 1, 3) + Expect(r.internalGaps).To(Equal(int64(2))) + Expect(r.tailLoss).To(BeZero()) + Expect(r.newExpectedSeq).To(Equal(int64(4))) + }) + + It("detects tail loss with no docs at all (empty scan, non-zero tip)", func() { + // Case B from review: writer acked through seq=10, DB lost them all, + // no later writes. expectedSeq=1, docs=[], maxSeq=10 → tail=10. + r := auditDocs("w1", nil, 1, 10) + Expect(r.internalGaps).To(BeZero()) + Expect(r.tailLoss).To(Equal(int64(10))) + Expect(r.findings).To(HaveLen(1)) + Expect(r.findings[0].kind).To(Equal(findingTail)) + Expect(r.findings[0].seq).To(Equal(int64(1))) + Expect(r.findings[0].endSeq).To(Equal(int64(10))) + Expect(r.findings[0].count).To(Equal(int64(10))) + Expect(r.newExpectedSeq).To(Equal(int64(11))) + }) + + It("detects tail loss when last doc is below maxSeq", func() { + // expectedSeq=1, docs=[1,2], maxSeq=5 → tail of 3 (seqs 3,4,5). 
+ docs := []WriteDocument{makeDoc("w1", 1), makeDoc("w1", 2)} + r := auditDocs("w1", docs, 1, 5) + Expect(r.internalGaps).To(BeZero()) + Expect(r.tailLoss).To(Equal(int64(3))) + Expect(r.findings).To(HaveLen(1)) + Expect(r.findings[0].kind).To(Equal(findingTail)) + Expect(r.findings[0].seq).To(Equal(int64(3))) + Expect(r.findings[0].endSeq).To(Equal(int64(5))) + Expect(r.newExpectedSeq).To(Equal(int64(6))) + }) + + It("detects internal gap AND tail loss in the same cycle", func() { + // expectedSeq=1, docs=[2,5], maxSeq=8 → gap=1 (seq 1) + gap=2 (seqs 3,4) + tail=3 (6,7,8). + // Internal gaps total = 3; tail = 3. + docs := []WriteDocument{makeDoc("w1", 2), makeDoc("w1", 5)} + r := auditDocs("w1", docs, 1, 8) + Expect(r.internalGaps).To(Equal(int64(3))) + Expect(r.tailLoss).To(Equal(int64(3))) + Expect(r.findings).To(HaveLen(3)) // two gaps + one tail + Expect(r.findings[0].kind).To(Equal(findingGap)) + Expect(r.findings[1].kind).To(Equal(findingGap)) + Expect(r.findings[2].kind).To(Equal(findingTail)) + Expect(r.newExpectedSeq).To(Equal(int64(9))) + }) + + It("Case A: late write exposes an earlier-lost range as an internal gap", func() { + // Original: writer acked 1..110, DB loses 101..110, writer writes 111. + // Verifier last cycle: nextSeq[w1]=101, so expectedSeq=101. + // docs=[doc with seq=111], maxSeq=111 → gap=10 (101..110), no tail. + docs := []WriteDocument{makeDoc("w1", 111)} + r := auditDocs("w1", docs, 101, 111) + Expect(r.internalGaps).To(Equal(int64(10))) + Expect(r.tailLoss).To(BeZero()) + Expect(r.newExpectedSeq).To(Equal(int64(112))) + }) + + It("detects a checksum mismatch without affecting gap counters", func() { + // docs=[1,2,3], doc 2 has a bad checksum. 
+ bad := makeDoc("w1", 2) + bad.Checksum = "deadbeef00000000" + docs := []WriteDocument{makeDoc("w1", 1), bad, makeDoc("w1", 3)} + r := auditDocs("w1", docs, 1, 3) + Expect(r.checksumErrors).To(Equal(int64(1))) + Expect(r.internalGaps).To(BeZero()) + Expect(r.tailLoss).To(BeZero()) + Expect(r.findings).To(HaveLen(1)) + Expect(r.findings[0].kind).To(Equal(findingChecksum)) + Expect(r.findings[0].seq).To(Equal(int64(2))) + Expect(r.findings[0].stored).To(Equal("deadbeef00000000")) + }) + + It("counts checksum errors across multiple bad docs", func() { + bad1 := makeDoc("w1", 1) + bad1.Checksum = "xx" + bad2 := makeDoc("w1", 2) + bad2.Checksum = "yy" + r := auditDocs("w1", []WriteDocument{bad1, bad2}, 1, 2) + Expect(r.checksumErrors).To(Equal(int64(2))) + Expect(r.findings).To(HaveLen(2)) + }) + + It("advances newExpectedSeq to maxSeq+1 even with tail loss", func() { + // Critical invariant: after a tail-loss cycle, nextSeq must move past + // maxSeq so the next cycle doesn't re-detect the same tail. + r := auditDocs("w1", nil, 1, 100) + Expect(r.newExpectedSeq).To(Equal(int64(101))) + }) + + It("preserves writerID in findings", func() { + r := auditDocs("worker-xyz", nil, 1, 1) + Expect(r.findings).To(HaveLen(1)) + Expect(r.findings[0].writerID).To(Equal("worker-xyz")) + }) +}) diff --git a/test/longhaul/workload/writer_backend_test.go b/test/longhaul/workload/writer_backend_test.go deleted file mode 100644 index 4005330b7..000000000 --- a/test/longhaul/workload/writer_backend_test.go +++ /dev/null @@ -1,172 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT License. - -package workload - -import ( - "context" - "errors" - - . "github.com/onsi/ginkgo/v2" - . "github.com/onsi/gomega" - - "github.com/documentdb/documentdb-operator/test/longhaul/journal" -) - -// fakeBackend is a controllable writeBackend stub for writer state-machine tests. 
-type fakeBackend struct { - insertErrs []error // returned in order; if exhausted, returns last - dupClassifier func(error) bool - insertedDocs []WriteDocument - highestSeqReturn int64 - highestSeqErr error - highestSeqCalls int -} - -func (f *fakeBackend) insert(_ context.Context, doc WriteDocument) error { - f.insertedDocs = append(f.insertedDocs, doc) - if len(f.insertErrs) == 0 { - return nil - } - err := f.insertErrs[0] - if len(f.insertErrs) > 1 { - f.insertErrs = f.insertErrs[1:] - } - return err -} - -func (f *fakeBackend) isDuplicate(err error) bool { - if f.dupClassifier == nil { - return false - } - return f.dupClassifier(err) -} - -func (f *fakeBackend) highestSeq(_ context.Context, _ string) (int64, error) { - f.highestSeqCalls++ - return f.highestSeqReturn, f.highestSeqErr -} - -func newTestWriter(b writeBackend) *Writer { - return &Writer{ - id: "w001", - metrics: NewMetrics(), - journal: journal.New(), - backend: b, - } -} - -var errDup = errors.New("dup-key") -var errTransient = errors.New("transient network error") - -var _ = Describe("Writer.writeOne", func() { - It("on success: advances seq, increments Attempted+Acknowledged, sends correct doc", func() { - b := &fakeBackend{} - w := newTestWriter(b) - w.writeOne(context.Background()) - - Expect(w.Seq()).To(Equal(int64(1))) - Expect(w.metrics.WriteAttempted.Load()).To(Equal(int64(1))) - Expect(w.metrics.WriteAcknowledged.Load()).To(Equal(int64(1))) - Expect(w.metrics.WriteFailed.Load()).To(BeZero()) - - Expect(b.insertedDocs).To(HaveLen(1)) - got := b.insertedDocs[0] - Expect(got.WriterID).To(Equal("w001")) - Expect(got.Seq).To(Equal(int64(1))) - // Checksum must match what computeChecksum would produce for this payload. 
- Expect(got.Checksum).To(Equal(computeChecksum(got.WriterID, got.Seq, got.Payload))) - }) - - It("on DupKey error: advances seq + Acknowledged, NOT WriteFailed (retryable-write ack)", func() { - b := &fakeBackend{ - insertErrs: []error{errDup}, - dupClassifier: func(err error) bool { return errors.Is(err, errDup) }, - } - w := newTestWriter(b) - w.writeOne(context.Background()) - - Expect(w.Seq()).To(Equal(int64(1)), "DupKey is an idempotent ack; seq must advance") - Expect(w.metrics.WriteAcknowledged.Load()).To(Equal(int64(1))) - Expect(w.metrics.WriteFailed.Load()).To(BeZero(), "DupKey must not count as a failure") - }) - - It("on non-DupKey error: seq does NOT advance, WriteFailed++, journal records failure", func() { - b := &fakeBackend{ - insertErrs: []error{errTransient}, - dupClassifier: func(err error) bool { return errors.Is(err, errDup) }, - } - w := newTestWriter(b) - w.writeOne(context.Background()) - - Expect(w.Seq()).To(BeZero(), "seq must not advance on non-DupKey error") - Expect(w.metrics.WriteAttempted.Load()).To(Equal(int64(1))) - Expect(w.metrics.WriteFailed.Load()).To(Equal(int64(1))) - Expect(w.metrics.WriteAcknowledged.Load()).To(BeZero()) - // Next tick must retry the same seq=1 (since seq is still 0, next call computes 0+1). - }) - - It("retries the same seq after a non-DupKey failure", func() { - // First call: transient error -> seq stays 0. - // Second call: success -> seq becomes 1 (retry of the same logical write). 
- b := &fakeBackend{ - insertErrs: []error{errTransient, nil}, - dupClassifier: func(err error) bool { return false }, - } - w := newTestWriter(b) - w.writeOne(context.Background()) - w.writeOne(context.Background()) - - Expect(w.Seq()).To(Equal(int64(1))) - Expect(b.insertedDocs).To(HaveLen(2)) - Expect(b.insertedDocs[0].Seq).To(Equal(int64(1))) - Expect(b.insertedDocs[1].Seq).To(Equal(int64(1)), "second attempt must reuse seq=1") - Expect(w.metrics.WriteFailed.Load()).To(Equal(int64(1))) - Expect(w.metrics.WriteAcknowledged.Load()).To(Equal(int64(1))) - }) - - It("advances seq monotonically across N successful writes", func() { - b := &fakeBackend{} - w := newTestWriter(b) - for i := 0; i < 5; i++ { - w.writeOne(context.Background()) - } - Expect(w.Seq()).To(Equal(int64(5))) - Expect(b.insertedDocs).To(HaveLen(5)) - for i, doc := range b.insertedDocs { - Expect(doc.Seq).To(Equal(int64(i + 1))) - } - }) -}) - -var _ = Describe("Writer.Resume", func() { - It("on empty collection: returns 0 and leaves seq at 0", func() { - b := &fakeBackend{highestSeqReturn: 0} - w := newTestWriter(b) - got, err := w.Resume(context.Background()) - Expect(err).NotTo(HaveOccurred()) - Expect(got).To(BeZero()) - Expect(w.Seq()).To(BeZero()) - }) - - It("with existing data: seeds seq from the highest persisted seq", func() { - b := &fakeBackend{highestSeqReturn: 42} - w := newTestWriter(b) - got, err := w.Resume(context.Background()) - Expect(err).NotTo(HaveOccurred()) - Expect(got).To(Equal(int64(42))) - Expect(w.Seq()).To(Equal(int64(42)), "subsequent writeOne will compute seq=43") - }) - - It("on backend error: returns the error and leaves seq untouched", func() { - boom := errors.New("network down") - b := &fakeBackend{highestSeqErr: boom} - w := newTestWriter(b) - w.seq.Store(99) // pretend something was already there - - got, err := w.Resume(context.Background()) - Expect(err).To(MatchError(boom)) - Expect(got).To(BeZero()) - Expect(w.Seq()).To(Equal(int64(99)), "Resume must not 
clobber seq on error") - }) -}) diff --git a/test/longhaul/workload/writer_test.go b/test/longhaul/workload/writer_test.go index bb91d8992..31c0c65a3 100644 --- a/test/longhaul/workload/writer_test.go +++ b/test/longhaul/workload/writer_test.go @@ -4,6 +4,8 @@ package workload import ( + "context" + "errors" "strings" . "github.com/onsi/ginkgo/v2" @@ -70,3 +72,161 @@ var _ = Describe("Writer", func() { Expect(w.Seq()).To(Equal(int64(42))) }) }) + +// fakeBackend is a controllable writeBackend stub for writer state-machine tests. +type fakeBackend struct { + insertErrs []error // returned in order; if exhausted, returns last + dupClassifier func(error) bool + insertedDocs []WriteDocument + highestSeqReturn int64 + highestSeqErr error + highestSeqCalls int +} + +func (f *fakeBackend) insert(_ context.Context, doc WriteDocument) error { + f.insertedDocs = append(f.insertedDocs, doc) + if len(f.insertErrs) == 0 { + return nil + } + err := f.insertErrs[0] + if len(f.insertErrs) > 1 { + f.insertErrs = f.insertErrs[1:] + } + return err +} + +func (f *fakeBackend) isDuplicate(err error) bool { + if f.dupClassifier == nil { + return false + } + return f.dupClassifier(err) +} + +func (f *fakeBackend) highestSeq(_ context.Context, _ string) (int64, error) { + f.highestSeqCalls++ + return f.highestSeqReturn, f.highestSeqErr +} + +func newTestWriter(b writeBackend) *Writer { + return &Writer{ + id: "w001", + metrics: NewMetrics(), + journal: journal.New(), + backend: b, + } +} + +var errDup = errors.New("dup-key") +var errTransient = errors.New("transient network error") + +var _ = Describe("Writer.writeOne", func() { + It("on success: advances seq, increments Attempted+Acknowledged, sends correct doc", func() { + b := &fakeBackend{} + w := newTestWriter(b) + w.writeOne(context.Background()) + + Expect(w.Seq()).To(Equal(int64(1))) + Expect(w.metrics.WriteAttempted.Load()).To(Equal(int64(1))) + Expect(w.metrics.WriteAcknowledged.Load()).To(Equal(int64(1))) + 
Expect(w.metrics.WriteFailed.Load()).To(BeZero()) + + Expect(b.insertedDocs).To(HaveLen(1)) + got := b.insertedDocs[0] + Expect(got.WriterID).To(Equal("w001")) + Expect(got.Seq).To(Equal(int64(1))) + // Checksum must match what computeChecksum would produce for this payload. + Expect(got.Checksum).To(Equal(computeChecksum(got.WriterID, got.Seq, got.Payload))) + }) + + It("on DupKey error: advances seq + Acknowledged, NOT WriteFailed (retryable-write ack)", func() { + b := &fakeBackend{ + insertErrs: []error{errDup}, + dupClassifier: func(err error) bool { return errors.Is(err, errDup) }, + } + w := newTestWriter(b) + w.writeOne(context.Background()) + + Expect(w.Seq()).To(Equal(int64(1)), "DupKey is an idempotent ack; seq must advance") + Expect(w.metrics.WriteAcknowledged.Load()).To(Equal(int64(1))) + Expect(w.metrics.WriteFailed.Load()).To(BeZero(), "DupKey must not count as a failure") + }) + + It("on non-DupKey error: seq does NOT advance, WriteFailed++, journal records failure", func() { + b := &fakeBackend{ + insertErrs: []error{errTransient}, + dupClassifier: func(err error) bool { return errors.Is(err, errDup) }, + } + w := newTestWriter(b) + w.writeOne(context.Background()) + + Expect(w.Seq()).To(BeZero(), "seq must not advance on non-DupKey error") + Expect(w.metrics.WriteAttempted.Load()).To(Equal(int64(1))) + Expect(w.metrics.WriteFailed.Load()).To(Equal(int64(1))) + Expect(w.metrics.WriteAcknowledged.Load()).To(BeZero()) + // Next tick must retry the same seq=1 (since seq is still 0, next call computes 0+1). + }) + + It("retries the same seq after a non-DupKey failure", func() { + // First call: transient error -> seq stays 0. + // Second call: success -> seq becomes 1 (retry of the same logical write). 
+ b := &fakeBackend{ + insertErrs: []error{errTransient, nil}, + dupClassifier: func(err error) bool { return false }, + } + w := newTestWriter(b) + w.writeOne(context.Background()) + w.writeOne(context.Background()) + + Expect(w.Seq()).To(Equal(int64(1))) + Expect(b.insertedDocs).To(HaveLen(2)) + Expect(b.insertedDocs[0].Seq).To(Equal(int64(1))) + Expect(b.insertedDocs[1].Seq).To(Equal(int64(1)), "second attempt must reuse seq=1") + Expect(w.metrics.WriteFailed.Load()).To(Equal(int64(1))) + Expect(w.metrics.WriteAcknowledged.Load()).To(Equal(int64(1))) + }) + + It("advances seq monotonically across N successful writes", func() { + b := &fakeBackend{} + w := newTestWriter(b) + for i := 0; i < 5; i++ { + w.writeOne(context.Background()) + } + Expect(w.Seq()).To(Equal(int64(5))) + Expect(b.insertedDocs).To(HaveLen(5)) + for i, doc := range b.insertedDocs { + Expect(doc.Seq).To(Equal(int64(i + 1))) + } + }) +}) + +var _ = Describe("Writer.Resume", func() { + It("on empty collection: returns 0 and leaves seq at 0", func() { + b := &fakeBackend{highestSeqReturn: 0} + w := newTestWriter(b) + got, err := w.Resume(context.Background()) + Expect(err).NotTo(HaveOccurred()) + Expect(got).To(BeZero()) + Expect(w.Seq()).To(BeZero()) + }) + + It("with existing data: seeds seq from the highest persisted seq", func() { + b := &fakeBackend{highestSeqReturn: 42} + w := newTestWriter(b) + got, err := w.Resume(context.Background()) + Expect(err).NotTo(HaveOccurred()) + Expect(got).To(Equal(int64(42))) + Expect(w.Seq()).To(Equal(int64(42)), "subsequent writeOne will compute seq=43") + }) + + It("on backend error: returns the error and leaves seq untouched", func() { + boom := errors.New("network down") + b := &fakeBackend{highestSeqErr: boom} + w := newTestWriter(b) + w.seq.Store(99) // pretend something was already there + + got, err := w.Resume(context.Background()) + Expect(err).To(MatchError(boom)) + Expect(got).To(BeZero()) + Expect(w.Seq()).To(Equal(int64(99)), "Resume must not 
clobber seq on error") + }) +}) From 2dff23a0b377c3314b1ab8ff31b9c8b3e9f2f71b Mon Sep 17 00:00:00 2001 From: Copilot <223556219+Copilot@users.noreply.github.com> Date: Thu, 18 Jun 2026 12:34:30 -0400 Subject: [PATCH 09/11] test(longhaul): document the report package's three-file layout - Add doc.go with the package-level overview explaining why report.go / checkpoint.go / alert.go are separate (data model vs. orchestration vs. CI surface) so the next reader doesn't have to ask. - Add per-field godoc on Summary clarifying who populates each field, when it's empty, and the PASS/FAIL relationship to LeakAnalysis (warning only, not a verdict flip). - Drop the narrow one-liner package doc from report.go in favor of the richer one in doc.go. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Copilot <223556219+Copilot@users.noreply.github.com> --- test/longhaul/report/doc.go | 27 +++++++++++++++++++ test/longhaul/report/report.go | 48 +++++++++++++++++++++++++++------- 2 files changed, 65 insertions(+), 10 deletions(-) create mode 100644 test/longhaul/report/doc.go diff --git a/test/longhaul/report/doc.go b/test/longhaul/report/doc.go new file mode 100644 index 000000000..c79fc44a0 --- /dev/null +++ b/test/longhaul/report/doc.go @@ -0,0 +1,27 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Package report aggregates the long-haul test verdict and publishes it to +// three independent surfaces: stdout (for kubectl logs / pod tailers), a +// Kubernetes ConfigMap (for kubectl get / operator UIs), and GitHub Actions +// workflow annotations (for the CI summary page). +// +// The package is split into three files by concern: +// +// - report.go — pure data model (Summary) + Markdown rendering. No I/O, +// no K8s deps; safe to import from any tool that wants to +// render a Summary. 
+// - checkpoint.go — orchestration: a ticker-driven CheckpointReporter that +// calls GenerateMarkdown, prints to stdout, persists the +// ConfigMap, and invokes EmitAnnotation. Owns the only +// K8s client dependency in the package and the +// intermediate-vs-final emit lifecycle. +// - alert.go — GitHub Actions surface: translates a Summary into the +// runner's `::error::` / `::notice::` / `::warning::` +// workflow commands. Gated on GITHUB_ACTIONS=true so the +// magic strings stay out of local-dev logs. +// +// A different CI provider (Buildkite, Jenkins, etc.) would be added by +// writing a peer to alert.go; nothing in report.go or checkpoint.go needs +// to change. +package report diff --git a/test/longhaul/report/report.go b/test/longhaul/report/report.go index efd4f3d33..e7316a7f7 100644 --- a/test/longhaul/report/report.go +++ b/test/longhaul/report/report.go @@ -1,7 +1,6 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. -// Package report generates a markdown summary of the long haul test run. package report import ( @@ -14,7 +13,7 @@ import ( "github.com/documentdb/documentdb-operator/test/longhaul/workload" ) -// Result represents the overall test outcome. +// Result is the terminal verdict of the test run. type Result string const ( @@ -22,16 +21,45 @@ const ( ResultFail Result = "FAIL" ) -// Summary contains all data needed to generate the final report. +// Summary is the full state needed to render a checkpoint or final report. +// It is a pure value snapshot — no live counters, no channels — so it can be +// passed across goroutines and re-rendered offline. type Summary struct { - Result Result - Duration time.Duration - Metrics workload.MetricsSnapshot + // Result is the current verdict. PASS while data-loss counters stay zero, + // flipped to FAIL when the durability oracle detects gaps/checksum errors + // or a disruption window blows its policy budget. 
+ Result Result + + // Duration is wall-clock time since the run started (process StartTime), + // not since the cluster was created. Resets on pod restart. + Duration time.Duration + + // Metrics is a snapshot of the workload counters (writes attempted/acked/ + // failed, verify passes, gaps, checksum errors). + Metrics workload.MetricsSnapshot + + // LeakAnalysis is the operator-pod resource trend (memory/CPU slope over + // the run); LeakAnalysis.HasLeak being true does NOT flip Result — it + // only emits a warning annotation. LeakAnalysis monitor.LeakAnalysis - OpsExecuted int - Windows []journal.DisruptionWindow - Events []journal.Event - FailReason string + + // OpsExecuted is the count of operations (scale up/down, restart, etc.) + // the operations scheduler has run since startup. + OpsExecuted int + + // Windows is every disruption window opened during the run, in start + // order. Each window records its op, duration, write-failure count, and + // whether it exceeded its policy budget. + Windows []journal.DisruptionWindow + + // Events is the journal's full event ring (info/warn/error log lines). + // The renderer only includes the last 20 in the markdown body to keep + // the ConfigMap value under the 1 MiB limit. + Events []journal.Event + + // FailReason is a short human-readable cause when Result == FAIL + // (e.g. "data loss: 17 gaps detected"). Empty when Result == PASS. + FailReason string } // GenerateMarkdown produces a human-readable markdown report. From 93dae80fa2a426a22c657beeb67938096123a74c Mon Sep 17 00:00:00 2001 From: Copilot <223556219+Copilot@users.noreply.github.com> Date: Thu, 18 Jun 2026 12:41:46 -0400 Subject: [PATCH 10/11] test(longhaul): drop dead AllowedDowntime field and Event.Metadata MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit AllowedDowntime: set by every operation's OutagePolicy() and DefaultOutagePolicy() but never read by ExceededPolicy (which is the only consumer). 
Misleading public API surface — readers assume it's enforced. Removed the field, all assignments, and the TODO. The downtime budget can be re-introduced together with the writer-side timestamp plumbing it would actually require. Event.Metadata: declared, allocated, threaded through Record's signature, set to nil at every Info/Warn/Error call site, and never read. Dropped the field and simplified Record(level, component, message). No behavior change. All callers were updated; tests still pass. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Copilot <223556219+Copilot@users.noreply.github.com> --- test/longhaul/journal/journal.go | 12 +++++------- test/longhaul/journal/policy.go | 13 ------------- test/longhaul/journal/policy_test.go | 1 - test/longhaul/operations/scale.go | 2 -- test/longhaul/operations/scale_test.go | 2 -- test/longhaul/operations/upgrade.go | 1 - test/longhaul/operations/upgrade_test.go | 3 +-- test/longhaul/report/doc.go | 18 +++++++++--------- 8 files changed, 15 insertions(+), 37 deletions(-) diff --git a/test/longhaul/journal/journal.go b/test/longhaul/journal/journal.go index 019aff257..b5e4f266b 100644 --- a/test/longhaul/journal/journal.go +++ b/test/longhaul/journal/journal.go @@ -26,7 +26,6 @@ type Event struct { Level Level Component string Message string - Metadata map[string]string } // String returns a human-readable representation of the event. @@ -54,14 +53,13 @@ func New() *Journal { } } -// Record appends a new event to the journal. -func (j *Journal) Record(level Level, component, message string, metadata map[string]string) { +// Record appends a new event to the journal. Safe for concurrent use. 
+func (j *Journal) Record(level Level, component, message string) { e := Event{ Timestamp: time.Now(), Level: level, Component: component, Message: message, - Metadata: metadata, } j.mu.Lock() j.events = append(j.events, e) @@ -70,17 +68,17 @@ func (j *Journal) Record(level Level, component, message string, metadata map[st // Info records an info-level event. func (j *Journal) Info(component, message string) { - j.Record(LevelInfo, component, message, nil) + j.Record(LevelInfo, component, message) } // Warn records a warn-level event. func (j *Journal) Warn(component, message string) { - j.Record(LevelWarn, component, message, nil) + j.Record(LevelWarn, component, message) } // Error records an error-level event. func (j *Journal) Error(component, message string) { - j.Record(LevelError, component, message, nil) + j.Record(LevelError, component, message) } // OpenDisruptionWindow starts tracking a new disruption period. diff --git a/test/longhaul/journal/policy.go b/test/longhaul/journal/policy.go index 188780ba2..25d28e826 100644 --- a/test/longhaul/journal/policy.go +++ b/test/longhaul/journal/policy.go @@ -7,9 +7,6 @@ import "time" // OutagePolicy defines acceptable disruption bounds for an operation. type OutagePolicy struct { - // AllowedDowntime is the maximum duration of write unavailability. - AllowedDowntime time.Duration - // AllowedWriteFailures is the maximum number of write failures during the window. AllowedWriteFailures int64 @@ -20,7 +17,6 @@ type OutagePolicy struct { // DefaultOutagePolicy returns a conservative policy suitable for most operations. func DefaultOutagePolicy() OutagePolicy { return OutagePolicy{ - AllowedDowntime: 60 * time.Second, AllowedWriteFailures: 50, MustRecoverWithin: 5 * time.Minute, } @@ -59,15 +55,6 @@ func (w *DisruptionWindow) Duration() time.Duration { } // ExceededPolicy returns true if the window has violated its outage policy. -// -// TODO(longhaul, #220): also enforce Policy.AllowedDowntime here. 
Today -// AllowedDowntime is set by every operation's OutagePolicy() but never -// consulted — only the (always-set) MustRecoverWithin and AllowedWriteFailures -// budgets are actually checked. To enforce AllowedDowntime, the journal needs -// to start tracking the actual write-unavailable interval inside the window -// (e.g., longest contiguous run of write failures or first-failure to -// first-recovery). That requires changes in writer.go to feed per-write -// timestamps into the active window, so it's a separate change. func (w *DisruptionWindow) ExceededPolicy() bool { if w.Duration() > w.Policy.MustRecoverWithin { return true diff --git a/test/longhaul/journal/policy_test.go b/test/longhaul/journal/policy_test.go index d8f8cd7c5..4fe26b1a4 100644 --- a/test/longhaul/journal/policy_test.go +++ b/test/longhaul/journal/policy_test.go @@ -80,6 +80,5 @@ var _ = Describe("DisruptionWindow", func() { p := DefaultOutagePolicy() Expect(p.MustRecoverWithin).NotTo(BeZero()) Expect(p.AllowedWriteFailures).NotTo(BeZero()) - Expect(p.AllowedDowntime).NotTo(BeZero()) }) }) diff --git a/test/longhaul/operations/scale.go b/test/longhaul/operations/scale.go index e9fb471d8..9a1672959 100644 --- a/test/longhaul/operations/scale.go +++ b/test/longhaul/operations/scale.go @@ -91,7 +91,6 @@ func NewScaleUp(client monitor.ClusterClient, health *monitor.HealthMonitor, max boundKind: "max", recovery: recovery, policy: journal.OutagePolicy{ - AllowedDowntime: 30 * time.Second, AllowedWriteFailures: 20, MustRecoverWithin: recovery, }, @@ -120,7 +119,6 @@ func NewScaleDown(client monitor.ClusterClient, health *monitor.HealthMonitor, m boundKind: "min", recovery: recovery, policy: journal.OutagePolicy{ - AllowedDowntime: 60 * time.Second, AllowedWriteFailures: 50, MustRecoverWithin: recovery, }, diff --git a/test/longhaul/operations/scale_test.go b/test/longhaul/operations/scale_test.go index de9fdb7fa..3df032c05 100644 --- a/test/longhaul/operations/scale_test.go +++ 
b/test/longhaul/operations/scale_test.go @@ -90,7 +90,6 @@ var _ = Describe("ScaleUp", func() { It("OutagePolicy uses tighter budgets and echoes MustRecoverWithin", func() { s := NewScaleUp(&fakeClient{}, nil, 3, 5*time.Minute) p := s.OutagePolicy() - Expect(p.AllowedDowntime).To(Equal(30 * time.Second)) Expect(p.AllowedWriteFailures).To(Equal(int64(20))) Expect(p.MustRecoverWithin).To(Equal(5 * time.Minute)) }) @@ -134,7 +133,6 @@ var _ = Describe("ScaleDown", func() { It("OutagePolicy is more lenient than scale-up", func() { s := NewScaleDown(&fakeClient{}, nil, 1, 5*time.Minute) p := s.OutagePolicy() - Expect(p.AllowedDowntime).To(Equal(60 * time.Second)) Expect(p.AllowedWriteFailures).To(Equal(int64(50))) }) }) diff --git a/test/longhaul/operations/upgrade.go b/test/longhaul/operations/upgrade.go index 020413991..f9a586b6e 100644 --- a/test/longhaul/operations/upgrade.go +++ b/test/longhaul/operations/upgrade.go @@ -174,7 +174,6 @@ func (u *UpgradeDocumentDB) readDesiredVersion(ctx context.Context) (string, err // because rolling restarts touch every pod sequentially. 
func (u *UpgradeDocumentDB) OutagePolicy() journal.OutagePolicy { return journal.OutagePolicy{ - AllowedDowntime: 120 * time.Second, AllowedWriteFailures: 200, MustRecoverWithin: u.recovery, } diff --git a/test/longhaul/operations/upgrade_test.go b/test/longhaul/operations/upgrade_test.go index f77be4494..f442e5b58 100644 --- a/test/longhaul/operations/upgrade_test.go +++ b/test/longhaul/operations/upgrade_test.go @@ -22,10 +22,9 @@ var _ = Describe("UpgradeDocumentDB", func() { Expect(u.Weight()).To(Equal(1)) }) - It("OutagePolicy gives upgrades a longer downtime budget", func() { + It("OutagePolicy gives upgrades a more lenient failure budget", func() { u := NewUpgradeDocumentDB(&fakeClient{}, fake.NewSimpleClientset(), nil, nil, "ns", 10*time.Minute) p := u.OutagePolicy() - Expect(p.AllowedDowntime).To(Equal(120 * time.Second)) Expect(p.AllowedWriteFailures).To(Equal(int64(200))) Expect(p.MustRecoverWithin).To(Equal(10 * time.Minute)) }) diff --git a/test/longhaul/report/doc.go b/test/longhaul/report/doc.go index c79fc44a0..61405baf1 100644 --- a/test/longhaul/report/doc.go +++ b/test/longhaul/report/doc.go @@ -9,17 +9,17 @@ // The package is split into three files by concern: // // - report.go — pure data model (Summary) + Markdown rendering. No I/O, -// no K8s deps; safe to import from any tool that wants to -// render a Summary. +// no K8s deps; safe to import from any tool that wants to +// render a Summary. // - checkpoint.go — orchestration: a ticker-driven CheckpointReporter that -// calls GenerateMarkdown, prints to stdout, persists the -// ConfigMap, and invokes EmitAnnotation. Owns the only -// K8s client dependency in the package and the -// intermediate-vs-final emit lifecycle. +// calls GenerateMarkdown, prints to stdout, persists the +// ConfigMap, and invokes EmitAnnotation. Owns the only +// K8s client dependency in the package and the +// intermediate-vs-final emit lifecycle. 
// - alert.go — GitHub Actions surface: translates a Summary into the -// runner's `::error::` / `::notice::` / `::warning::` -// workflow commands. Gated on GITHUB_ACTIONS=true so the -// magic strings stay out of local-dev logs. +// runner's `::error::` / `::notice::` / `::warning::` +// workflow commands. Gated on GITHUB_ACTIONS=true so the +// magic strings stay out of local-dev logs. // // A different CI provider (Buildkite, Jenkins, etc.) would be added by // writing a peer to alert.go; nothing in report.go or checkpoint.go needs From 815ebaadb0d6dc4cd18163ff6b1215b139124033 Mon Sep 17 00:00:00 2001 From: Copilot <223556219+Copilot@users.noreply.github.com> Date: Thu, 18 Jun 2026 12:50:20 -0400 Subject: [PATCH 11/11] test(longhaul): bound journal event ring at 10k entries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On a multi-day run the in-memory events slice grew unbounded — ~200 events/day baseline (op open/close + verifier batch errors) means ~6k after a month, harmless on its own but Events() returns a full slice copy on every checkpoint (~30s), so allocation/GC pressure scales with run length. Cap the ring at 10k with 1k headroom so the trim is amortized to one copy every ~1k appends, not paid on every append. The rendered report only ever surfaces the last 20 events so the cap is invisible to consumers. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Copilot <223556219+Copilot@users.noreply.github.com> --- test/longhaul/journal/journal.go | 18 ++++++++++++++++++ test/longhaul/journal/journal_test.go | 18 ++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/test/longhaul/journal/journal.go b/test/longhaul/journal/journal.go index b5e4f266b..2ec7ebec3 100644 --- a/test/longhaul/journal/journal.go +++ b/test/longhaul/journal/journal.go @@ -20,6 +20,14 @@ const ( LevelError Level = "ERROR" ) +// maxEvents bounds the in-memory event ring. 
We trim with headroom so the +// trim cost is amortized over many appends (one copy every trimHeadroom +// events), not paid on every append once we hit the cap. +const ( + maxEvents = 10000 + trimHeadroom = 1000 +) + // Event represents a single journal entry. type Event struct { Timestamp time.Time @@ -54,6 +62,11 @@ func New() *Journal { } // Record appends a new event to the journal. Safe for concurrent use. +// +// To bound memory on multi-day runs, the in-memory ring is trimmed back to +// the newest maxEvents entries once it grows past maxEvents+trimHeadroom. +// The rendered report only ever surfaces the last 20 events, so the cap is +// invisible to consumers. func (j *Journal) Record(level Level, component, message string) { e := Event{ Timestamp: time.Now(), @@ -63,6 +76,11 @@ func (j *Journal) Record(level Level, component, message string) { } j.mu.Lock() j.events = append(j.events, e) + if len(j.events) > maxEvents+trimHeadroom { + trimmed := make([]Event, maxEvents) + copy(trimmed, j.events[len(j.events)-maxEvents:]) + j.events = trimmed + } j.mu.Unlock() } diff --git a/test/longhaul/journal/journal_test.go b/test/longhaul/journal/journal_test.go index 2eb5c1ee5..02e2b3c6c 100644 --- a/test/longhaul/journal/journal_test.go +++ b/test/longhaul/journal/journal_test.go @@ -4,6 +4,7 @@ package journal import ( + "fmt" "sync" "time" @@ -127,4 +128,21 @@ var _ = Describe("Journal", func() { wg.Wait() Expect(j.Len()).To(Equal(writers * perWriter)) }) + + It("caps the in-memory event ring and keeps the most recent entries", func() { + // Exceed maxEvents + trimHeadroom so the trim path fires at least once. + j := New() + total := maxEvents + trimHeadroom + 500 + for i := 0; i < total; i++ { + j.Info("c", fmt.Sprintf("%d", i)) + } + // After amortized trim, length is between maxEvents and maxEvents+trimHeadroom. 
+ Expect(j.Len()).To(BeNumerically(">=", maxEvents)) + Expect(j.Len()).To(BeNumerically("<=", maxEvents+trimHeadroom)) + + events := j.Events() + // Oldest surviving message is total - len(events); newest is total-1. + Expect(events[0].Message).To(Equal(fmt.Sprintf("%d", total-len(events)))) + Expect(events[len(events)-1].Message).To(Equal(fmt.Sprintf("%d", total-1))) + }) })