Skip to content

Commit f2529a2

Browse files
authored
Merge pull request #2413 from simon-agent-go-expert/fix/mcp-stdio-retry-on-unavailable
fix: retry stdio MCP toolset when binary is unavailable at startup
2 parents b15c72b + c25ed72 commit f2529a2

4 files changed

Lines changed: 324 additions & 15 deletions

File tree

pkg/teamloader/registry.go

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"context"
66
"errors"
77
"fmt"
8+
"log/slog"
89
"os"
910
"path/filepath"
1011
"time"
@@ -268,10 +269,15 @@ func createMCPTool(ctx context.Context, toolset latest.Toolset, _ string, runCon
268269

269270
// STDIO MCP Server from shell command
270271
case toolset.Command != "":
271-
// Auto-install missing command binary if needed
272+
// Auto-install missing command binary if needed.
273+
// If EnsureCommand fails (binary not on PATH, no aqua package, etc.),
274+
// treat as transient: create the toolset with the original command
275+
// and let mcp.Toolset.Start() retry on each conversation turn.
272276
resolvedCommand, err := toolinstall.EnsureCommand(ctx, toolset.Command, toolset.Version)
273277
if err != nil {
274-
return nil, fmt.Errorf("resolving command %q: %w", toolset.Command, err)
278+
slog.Warn("MCP command not yet available, will retry on next turn",
279+
"command", toolset.Command, "error", err)
280+
resolvedCommand = toolset.Command
275281
}
276282

277283
env, err := environment.ExpandAll(ctx, environment.ToValues(toolset.Env), envProvider)

pkg/teamloader/registry_test.go

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,13 @@ package teamloader
33
import (
44
"testing"
55

6+
"github.com/stretchr/testify/assert"
67
"github.com/stretchr/testify/require"
78

89
"github.com/docker/docker-agent/pkg/config"
910
"github.com/docker/docker-agent/pkg/config/latest"
1011
"github.com/docker/docker-agent/pkg/environment"
12+
"github.com/docker/docker-agent/pkg/tools"
1113
)
1214

1315
func TestCreateShellTool(t *testing.T) {
@@ -26,3 +28,45 @@ func TestCreateShellTool(t *testing.T) {
2628
require.NoError(t, err)
2729
require.NotNil(t, tool)
2830
}
31+
32+
func TestCreateMCPTool_CommandNotFound_CreatesToolsetAnyway(t *testing.T) {
33+
t.Setenv("DOCKER_AGENT_TOOLS_DIR", t.TempDir())
34+
35+
toolset := latest.Toolset{
36+
Type: "mcp",
37+
Command: "./bin/nonexistent-mcp-server",
38+
}
39+
40+
registry := NewDefaultToolsetRegistry()
41+
42+
runConfig := &config.RuntimeConfig{
43+
Config: config.Config{WorkingDir: t.TempDir()},
44+
EnvProviderForTests: environment.NewOsEnvProvider(),
45+
}
46+
47+
tool, err := registry.CreateTool(t.Context(), toolset, ".", runConfig, "test-agent")
48+
require.NoError(t, err)
49+
require.NotNil(t, tool)
50+
assert.Equal(t, "mcp(stdio cmd=./bin/nonexistent-mcp-server)", tools.DescribeToolSet(tool))
51+
}
52+
53+
func TestCreateMCPTool_BareCommandNotFound_CreatesToolsetAnyway(t *testing.T) {
54+
t.Setenv("DOCKER_AGENT_TOOLS_DIR", t.TempDir())
55+
56+
toolset := latest.Toolset{
57+
Type: "mcp",
58+
Command: "some-nonexistent-mcp-binary",
59+
}
60+
61+
registry := NewDefaultToolsetRegistry()
62+
63+
runConfig := &config.RuntimeConfig{
64+
Config: config.Config{WorkingDir: t.TempDir()},
65+
EnvProviderForTests: environment.NewOsEnvProvider(),
66+
}
67+
68+
tool, err := registry.CreateTool(t.Context(), toolset, ".", runConfig, "test-agent")
69+
require.NoError(t, err)
70+
require.NotNil(t, tool)
71+
assert.Equal(t, "mcp(stdio cmd=some-nonexistent-mcp-binary)", tools.DescribeToolSet(tool))
72+
}

pkg/tools/mcp/mcp.go

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ import (
1212
"log/slog"
1313
"net"
1414
"net/url"
15+
"os"
16+
"os/exec"
1517
"strings"
1618
"sync"
1719
"time"
@@ -118,9 +120,9 @@ func NewRemoteToolset(name, urlString, transport string, headers map[string]stri
118120
}
119121

120122
// errServerUnavailable is returned by doStart when the MCP server could not be
121-
// reached but the error is non-fatal (e.g. EOF). The toolset is considered
122-
// "started" so the agent can proceed, but watchConnection must not be spawned
123-
// because there is no live connection to monitor.
123+
// reached but the error is non-fatal (e.g. EOF, binary not found).
124+
// Start() propagates this so started remains false, and the agent runtime
125+
// retries via ensureToolSetsAreStarted on the next conversation turn.
124126
var errServerUnavailable = errors.New("MCP server unavailable")
125127

126128
// Describe returns a short, user-visible description of this toolset instance.
@@ -155,16 +157,11 @@ func (ts *Toolset) Start(ctx context.Context) error {
155157
return nil
156158
}
157159

158-
ts.restarted = make(chan struct{})
160+
if ts.restarted == nil {
161+
ts.restarted = make(chan struct{})
162+
}
159163

160164
if err := ts.doStart(ctx); err != nil {
161-
if errors.Is(err, errServerUnavailable) {
162-
// The server is unreachable but the error is non-fatal.
163-
// Mark as started so the agent can proceed; tools will simply
164-
// be empty. Don't spawn a watcher — there's nothing to watch.
165-
ts.started = true
166-
return nil
167-
}
168165
return err
169166
}
170167

@@ -240,10 +237,11 @@ func (ts *Toolset) doStart(ctx context.Context) error {
240237
//
241238
// Only retry when initialization fails due to sending the initialized notification.
242239
if !isInitNotificationSendError(err) {
243-
if errors.Is(err, io.EOF) {
240+
if isServerUnavailableError(err) {
244241
slog.Debug(
245-
"MCP client unavailable (EOF), skipping MCP toolset",
242+
"MCP client unavailable, will retry on next conversation turn",
246243
"server", ts.logID,
244+
"error", err,
247245
)
248246
return errServerUnavailable
249247
}
@@ -548,6 +546,15 @@ func isInitNotificationSendError(err error) bool {
548546
return false
549547
}
550548

549+
// isServerUnavailableError returns true if err indicates the MCP server process
550+
// could not be reached — binary missing/not-found, or process exited immediately
551+
// before completing the MCP handshake (io.EOF). These are retryable conditions.
552+
func isServerUnavailableError(err error) bool {
553+
return errors.Is(err, io.EOF) ||
554+
errors.Is(err, exec.ErrNotFound) ||
555+
errors.Is(err, os.ErrNotExist)
556+
}
557+
551558
func processMCPContent(toolResult *mcp.CallToolResult) *tools.ToolCallResult {
552559
var text strings.Builder
553560
var images, audios []tools.MediaContent

0 commit comments

Comments
 (0)