Skip to content

Commit 7a0b524

Browse files
adp2201 and jack-berg authored
Add CRaC lifecycle integration test scaffold (#8179)
Co-authored-by: Amol Patil <9298683+adp2201@users.noreply.github.com>
Co-authored-by: Jack Berg <34418638+jack-berg@users.noreply.github.com>
1 parent bf27c62 commit 7a0b524

4 files changed

Lines changed: 223 additions & 0 deletions

File tree

dependencyManagement/build.gradle.kts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ val DEPENDENCIES = listOf(
7878
"com.uber.nullaway:nullaway:0.13.1",
7979
"edu.berkeley.cs.jqf:jqf-fuzz:1.7", // jqf-fuzz version 1.8+ requires Java 11+
8080
"eu.rekawek.toxiproxy:toxiproxy-java:2.1.11",
81+
"io.github.crac:org-crac:0.1.3",
8182
"io.github.netmikey.logunit:logunit-jul:2.0.0",
8283
"io.jaegertracing:jaeger-client:1.8.1",
8384
"io.opentelemetry.contrib:opentelemetry-aws-xray-propagator:1.54.0-alpha",

integration-tests/build.gradle.kts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ dependencies {
1111
testImplementation(project(":extensions:trace-propagators"))
1212

1313
testImplementation("com.linecorp.armeria:armeria-junit5")
14+
testImplementation("io.github.crac:org-crac")
1415
testImplementation("org.junit.jupiter:junit-jupiter-params")
1516
testImplementation("org.testcontainers:testcontainers-junit-jupiter")
1617
}
Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
/*
2+
* Copyright The OpenTelemetry Authors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package io.opentelemetry;
7+
8+
import static org.assertj.core.api.Assertions.assertThat;
9+
10+
import io.opentelemetry.api.trace.Span;
11+
import io.opentelemetry.api.trace.Tracer;
12+
import io.opentelemetry.sdk.OpenTelemetrySdk;
13+
import io.opentelemetry.sdk.common.CompletableResultCode;
14+
import io.opentelemetry.sdk.trace.SdkTracerProvider;
15+
import io.opentelemetry.sdk.trace.data.SpanData;
16+
import io.opentelemetry.sdk.trace.export.SimpleSpanProcessor;
17+
import io.opentelemetry.sdk.trace.export.SpanExporter;
18+
import java.util.ArrayList;
19+
import java.util.Collection;
20+
import java.util.List;
21+
import java.util.concurrent.TimeUnit;
22+
import org.crac.Context;
23+
import org.crac.Resource;
24+
import org.junit.jupiter.api.Disabled;
25+
import org.junit.jupiter.api.Test;
26+
27+
/**
28+
* Integration-style lifecycle tests for CRaC (Coordinated Restore at Checkpoint) support.
29+
*
30+
* <p>These tests use {@link MockCracContext} to simulate the CRaC checkpoint/restore lifecycle
31+
* without a CRaC-enabled JDK. Resources register with the mock context; the test then drives {@code
32+
* beforeCheckpoint} and {@code afterRestore} callbacks directly.
33+
*
34+
* <p>See: <a href="https://github.com/open-telemetry/opentelemetry-java/issues/6756">#6756</a>
35+
*/
36+
class CracLifecycleIntegrationTest {
37+
38+
/**
39+
* Demonstrates the failure mode when the SDK is naively shut down at checkpoint with no
40+
* corresponding restore logic. This is what happens today without proper CRaC support: the SDK is
41+
* a one-shot object, so spans emitted after a restore are silently dropped.
42+
*/
43+
@Test
44+
void spansDroppedAfterRestore_naiveCracIntegration() throws Exception {
45+
MockCracContext cracContext = new MockCracContext();
46+
InMemorySpanExporter exporter = new InMemorySpanExporter();
47+
OpenTelemetrySdk sdk = buildSdk(exporter);
48+
Tracer tracer = sdk.getTracer("crac-lifecycle-test");
49+
50+
// Naive CRaC resource: shuts the SDK down at checkpoint, does nothing on restore.
51+
cracContext.register(
52+
new Resource() {
53+
@Override
54+
public void beforeCheckpoint(Context<? extends Resource> context) {
55+
sdk.getSdkTracerProvider().shutdown().join(10, TimeUnit.SECONDS);
56+
}
57+
58+
@Override
59+
public void afterRestore(Context<? extends Resource> context) {
60+
// No restore logic — this is the gap that #6756 addresses.
61+
}
62+
});
63+
64+
emitSpan(tracer, "before-checkpoint");
65+
sdk.getSdkTracerProvider().forceFlush().join(10, TimeUnit.SECONDS);
66+
assertThat(exporter.exportedCount()).isEqualTo(1);
67+
68+
cracContext.simulateCheckpoint();
69+
cracContext.simulateRestore();
70+
71+
// Post-restore span is silently dropped: the SDK is shut down and has no way to reinitialize.
72+
emitSpan(tracer, "after-restore");
73+
sdk.getSdkTracerProvider().forceFlush().join(10, TimeUnit.SECONDS);
74+
assertThat(exporter.exportedCount()).isEqualTo(1);
75+
}
76+
77+
/**
78+
* Describes the desired behavior once the SDK properly implements {@link Resource}: spans emitted
79+
* after a CRaC restore should be exported normally.
80+
*
81+
* <p>This test is disabled until <a
82+
* href="https://github.com/open-telemetry/opentelemetry-java/issues/6756">#6756</a> is addressed.
83+
* When that work lands, the SDK (or an adapter it exposes) should register with the CRaC context
84+
* so that {@code beforeCheckpoint} flushes and quiesces, and {@code afterRestore} reinitializes
85+
* exporters and processors. Replace the TODO below with the real SDK API.
86+
*/
87+
@Test
88+
@Disabled("Expected to fail until #6756 adds checkpoint/restore-safe SDK lifecycle")
89+
void spansExportedAfterRestore_properCracIntegration() throws Exception {
90+
MockCracContext cracContext = new MockCracContext();
91+
InMemorySpanExporter exporter = new InMemorySpanExporter();
92+
OpenTelemetrySdk sdk = buildSdk(exporter);
93+
Tracer tracer = sdk.getTracer("crac-lifecycle-test");
94+
95+
// TODO(#6756): replace this placeholder with the real SDK CRaC API, e.g.:
96+
// cracContext.register(sdk.asCracResource());
97+
cracContext.register(
98+
new Resource() {
99+
@Override
100+
public void beforeCheckpoint(Context<? extends Resource> context) throws Exception {
101+
sdk.getSdkTracerProvider().shutdown().join(10, TimeUnit.SECONDS);
102+
}
103+
104+
@Override
105+
public void afterRestore(Context<? extends Resource> context) throws Exception {
106+
// Reinitialize: reopen connections, restart background threads.
107+
// No SDK API exists for this yet — this is the body of #6756.
108+
}
109+
});
110+
111+
emitSpan(tracer, "before-checkpoint");
112+
sdk.getSdkTracerProvider().forceFlush().join(10, TimeUnit.SECONDS);
113+
assertThat(exporter.exportedCount()).isEqualTo(1);
114+
115+
cracContext.simulateCheckpoint();
116+
cracContext.simulateRestore();
117+
118+
emitSpan(tracer, "after-restore");
119+
sdk.getSdkTracerProvider().forceFlush().join(10, TimeUnit.SECONDS);
120+
assertThat(exporter.exportedCount()).isEqualTo(2);
121+
}
122+
123+
private static OpenTelemetrySdk buildSdk(SpanExporter exporter) {
124+
return OpenTelemetrySdk.builder()
125+
.setTracerProvider(
126+
SdkTracerProvider.builder()
127+
.addSpanProcessor(SimpleSpanProcessor.create(exporter))
128+
.build())
129+
.build();
130+
}
131+
132+
private static void emitSpan(Tracer tracer, String name) {
133+
Span span = tracer.spanBuilder(name).startSpan();
134+
span.end();
135+
}
136+
137+
private static final class InMemorySpanExporter implements SpanExporter {
138+
private final List<SpanData> spans = new ArrayList<>();
139+
private boolean shutdown;
140+
141+
@Override
142+
public CompletableResultCode export(Collection<SpanData> spans) {
143+
if (shutdown) {
144+
return CompletableResultCode.ofFailure();
145+
}
146+
this.spans.addAll(spans);
147+
return CompletableResultCode.ofSuccess();
148+
}
149+
150+
@Override
151+
public CompletableResultCode flush() {
152+
return CompletableResultCode.ofSuccess();
153+
}
154+
155+
@Override
156+
public CompletableResultCode shutdown() {
157+
shutdown = true;
158+
return CompletableResultCode.ofSuccess();
159+
}
160+
161+
int exportedCount() {
162+
return spans.size();
163+
}
164+
}
165+
}
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
/*
2+
* Copyright The OpenTelemetry Authors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package io.opentelemetry;
7+
8+
import java.util.ArrayList;
9+
import java.util.List;
10+
import org.crac.Context;
11+
import org.crac.Resource;
12+
13+
/**
14+
* A test-only {@link Context} that allows simulating CRaC checkpoint and restore lifecycle events
15+
* without requiring a CRaC-enabled JDK. Register resources with {@link #register(Resource)}, then
16+
* call {@link #simulateCheckpoint()} and {@link #simulateRestore()} to drive the lifecycle.
17+
*
18+
* <p>Notification order follows the CRaC specification: checkpoint callbacks fire in reverse
19+
* registration order; restore callbacks fire in forward registration order.
20+
*/
21+
final class MockCracContext extends Context<Resource> {
22+
23+
private final List<Resource> resources = new ArrayList<>();
24+
25+
@Override
26+
public void register(Resource resource) {
27+
resources.add(resource);
28+
}
29+
30+
/**
31+
* Simulates a CRaC checkpoint by invoking {@link Resource#beforeCheckpoint} on all registered
32+
* resources in reverse registration order, as the CRaC spec requires.
33+
*/
34+
void simulateCheckpoint() throws Exception {
35+
for (int i = resources.size() - 1; i >= 0; i--) {
36+
resources.get(i).beforeCheckpoint(this);
37+
}
38+
}
39+
40+
/**
41+
* Simulates a CRaC restore by invoking {@link Resource#afterRestore} on all registered resources
42+
* in forward registration order, as the CRaC spec requires.
43+
*/
44+
void simulateRestore() throws Exception {
45+
for (Resource resource : resources) {
46+
resource.afterRestore(this);
47+
}
48+
}
49+
50+
// Not used: this context is not itself registered with a parent context.
51+
@Override
52+
public void beforeCheckpoint(Context<? extends Resource> context) {}
53+
54+
@Override
55+
public void afterRestore(Context<? extends Resource> context) {}
56+
}

0 commit comments

Comments
 (0)