fix: retry MCP toolset Start() when server is unavailable

simonferquel-clanker · simonferquel-clanker · commit 42e6d448d576 · 2026-04-14T22:09:37.000Z
When doStart() fails with errServerUnavailable (EOF, binary not found,
etc.), Start() now propagates the error for all toolset types — both
stdio and remote. The toolset's started flag remains false, so
ensureToolSetsAreStarted() retries on the next conversation turn.

Previously, remote toolsets swallowed errServerUnavailable and marked
themselves as started with an empty tool list. Now retry-on-next-turn
applies uniformly.

Assisted-By: docker-agent
diff --git a/pkg/tools/mcp/mcp.go b/pkg/tools/mcp/mcp.go
@@ -12,6 +12,8 @@ import (
 	"log/slog"
 	"net"
 	"net/url"
+	"os"
+	"os/exec"
 	"strings"
 	"sync"
 	"time"
@@ -118,9 +120,9 @@ func NewRemoteToolset(name, urlString, transport string, headers map[string]stri
 }
 
 // errServerUnavailable is returned by doStart when the MCP server could not be
-// reached but the error is non-fatal (e.g. EOF). The toolset is considered
-// "started" so the agent can proceed, but watchConnection must not be spawned
-// because there is no live connection to monitor.
+// reached but the error is non-fatal (e.g. EOF, binary not found).
+// Start() propagates this so started remains false, and the agent runtime
+// retries via ensureToolSetsAreStarted on the next conversation turn.
 var errServerUnavailable = errors.New("MCP server unavailable")
 
 // Describe returns a short, user-visible description of this toolset instance.
@@ -155,16 +157,11 @@ func (ts *Toolset) Start(ctx context.Context) error {
 		return nil
 	}
 
-	ts.restarted = make(chan struct{})
+	if ts.restarted == nil {
+		ts.restarted = make(chan struct{})
+	}
 
 	if err := ts.doStart(ctx); err != nil {
-		if errors.Is(err, errServerUnavailable) {
-			// The server is unreachable but the error is non-fatal.
-			// Mark as started so the agent can proceed; tools will simply
-			// be empty. Don't spawn a watcher — there's nothing to watch.
-			ts.started = true
-			return nil
-		}
 		return err
 	}
 
@@ -240,10 +237,11 @@ func (ts *Toolset) doStart(ctx context.Context) error {
 		//
 		// Only retry when initialization fails due to sending the initialized notification.
 		if !isInitNotificationSendError(err) {
-			if errors.Is(err, io.EOF) {
+			if isServerUnavailableError(err) {
 				slog.Debug(
-					"MCP client unavailable (EOF), skipping MCP toolset",
+					"MCP client unavailable, will retry on next conversation turn",
 					"server", ts.logID,
+					"error", err,
 				)
 				return errServerUnavailable
 			}
@@ -548,6 +546,15 @@ func isInitNotificationSendError(err error) bool {
 	return false
 }
 
+// isServerUnavailableError reports whether err indicates that the MCP server
+// could not be reached: the binary is missing (exec.ErrNotFound, os.ErrNotExist)
+// or the process exited before completing the MCP handshake (io.EOF). Retryable.
+func isServerUnavailableError(err error) bool {
+	return errors.Is(err, io.EOF) ||
+		errors.Is(err, exec.ErrNotFound) ||
+		errors.Is(err, os.ErrNotExist)
+}
+
 func processMCPContent(toolResult *mcp.CallToolResult) *tools.ToolCallResult {
 	var text strings.Builder
 	var images, audios []tools.MediaContent
diff --git a/pkg/tools/mcp/reconnect_test.go b/pkg/tools/mcp/reconnect_test.go
@@ -3,8 +3,13 @@ package mcp
 import (
 	"context"
 	"fmt"
+	"io"
+	"iter"
 	"net"
 	"net/http"
+	"os"
+	"os/exec"
+	"sync"
 	"sync/atomic"
 	"testing"
 	"time"
@@ -238,3 +243,250 @@ func TestRemoteReconnectRefreshesTools(t *testing.T) {
 	assert.Contains(t, toolNames, "ns_shared")
 	assert.NotContains(t, toolNames, "ns_alpha", "stale tool from old server should not be present")
 }
+
+// failingInitClient is a mock mcpClient whose Initialize method returns a
+// configurable error for the first N calls, then succeeds.
+type failingInitClient struct {
+	mu          sync.Mutex
+	initErr     error // error to return from Initialize
+	failsLeft   int   // how many more times Initialize should fail
+	initCalls   int   // total Initialize calls
+	waitCh      chan struct{}
+	toolsToList []*gomcp.Tool
+}
+
+func (m *failingInitClient) Initialize(_ context.Context, _ *gomcp.InitializeRequest) (*gomcp.InitializeResult, error) {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+	m.initCalls++
+	if m.failsLeft > 0 {
+		m.failsLeft--
+		return nil, m.initErr
+	}
+	if m.waitCh != nil {
+		m.waitCh = make(chan struct{})
+	}
+	return &gomcp.InitializeResult{}, nil
+}
+
+func (m *failingInitClient) ListTools(_ context.Context, _ *gomcp.ListToolsParams) iter.Seq2[*gomcp.Tool, error] {
+	m.mu.Lock()
+	t := m.toolsToList
+	m.mu.Unlock()
+	return func(yield func(*gomcp.Tool, error) bool) {
+		for _, tool := range t {
+			if !yield(tool, nil) {
+				return
+			}
+		}
+	}
+}
+
+func (m *failingInitClient) CallTool(context.Context, *gomcp.CallToolParams) (*gomcp.CallToolResult, error) {
+	return &gomcp.CallToolResult{Content: []gomcp.Content{&gomcp.TextContent{Text: "ok"}}}, nil
+}
+
+func (m *failingInitClient) ListPrompts(context.Context, *gomcp.ListPromptsParams) iter.Seq2[*gomcp.Prompt, error] {
+	return func(func(*gomcp.Prompt, error) bool) {}
+}
+
+func (m *failingInitClient) GetPrompt(context.Context, *gomcp.GetPromptParams) (*gomcp.GetPromptResult, error) {
+	return &gomcp.GetPromptResult{}, nil
+}
+
+func (m *failingInitClient) SetElicitationHandler(tools.ElicitationHandler) {}
+func (m *failingInitClient) SetOAuthSuccessHandler(func())                  {}
+func (m *failingInitClient) SetManagedOAuth(bool)                           {}
+func (m *failingInitClient) SetToolListChangedHandler(func())               {}
+func (m *failingInitClient) SetPromptListChangedHandler(func())             {}
+
+func (m *failingInitClient) Wait() error {
+	m.mu.Lock()
+	ch := m.waitCh
+	m.mu.Unlock()
+	if ch == nil {
+		select {}
+	}
+	<-ch
+	return nil
+}
+
+func (m *failingInitClient) Close(context.Context) error {
+	m.mu.Lock()
+	if m.waitCh != nil {
+		select {
+		case <-m.waitCh:
+		default:
+			close(m.waitCh)
+		}
+	}
+	m.mu.Unlock()
+	return nil
+}
+
+// TestStdioStartReturnsErrorWhenServerUnavailable verifies that a stdio toolset
+// propagates errServerUnavailable when Initialize returns io.EOF, and that
+// started remains false so the runtime can retry.
+func TestStdioStartReturnsErrorWhenServerUnavailable(t *testing.T) {
+	t.Parallel()
+
+	mock := &failingInitClient{
+		initErr:   io.EOF,
+		failsLeft: 1,
+	}
+
+	ts := &Toolset{
+		name:      "test-stdio",
+		mcpClient: mock,
+		logID:     "test-cmd",
+	}
+
+	err := ts.Start(t.Context())
+	require.Error(t, err)
+	require.ErrorIs(t, err, errServerUnavailable)
+
+	ts.mu.Lock()
+	started := ts.started
+	ts.mu.Unlock()
+	assert.False(t, started, "stdio toolset must not be marked as started when server is unavailable")
+}
+
+// TestStdioStartReturnsErrorWhenBinaryNotFound verifies that exec.ErrNotFound
+// from Initialize is treated the same as io.EOF for stdio toolsets.
+func TestStdioStartReturnsErrorWhenBinaryNotFound(t *testing.T) {
+	t.Parallel()
+
+	mock := &failingInitClient{
+		initErr:   fmt.Errorf("start command: %w", exec.ErrNotFound),
+		failsLeft: 1,
+	}
+
+	ts := &Toolset{
+		name:      "test-stdio",
+		mcpClient: mock,
+		logID:     "missing-binary",
+	}
+
+	err := ts.Start(t.Context())
+	require.Error(t, err)
+	require.ErrorIs(t, err, errServerUnavailable)
+
+	ts.mu.Lock()
+	started := ts.started
+	ts.mu.Unlock()
+	assert.False(t, started, "stdio toolset must not be marked as started when binary is not found")
+}
+
+// TestStdioLazyRetrySucceedsWhenBinaryAppears verifies the end-to-end retry
+// scenario: turn 1 fails with EOF (binary not yet available), turn 2 succeeds
+// once the binary "appears" (mock stops failing).
+func TestStdioLazyRetrySucceedsWhenBinaryAppears(t *testing.T) {
+	t.Parallel()
+
+	pingTool := &gomcp.Tool{Name: "ping"}
+	mock := &failingInitClient{
+		initErr:     io.EOF,
+		failsLeft:   1,
+		toolsToList: []*gomcp.Tool{pingTool},
+		waitCh:      make(chan struct{}),
+	}
+
+	ts := &Toolset{
+		name:      "test-stdio",
+		mcpClient: mock,
+		logID:     "lazy-binary",
+	}
+
+	// Turn 1: Start fails — binary not available yet.
+	err := ts.Start(t.Context())
+	require.Error(t, err)
+	require.ErrorIs(t, err, errServerUnavailable)
+
+	// Turn 2: Binary has "appeared" (mock will succeed).
+	err = ts.Start(t.Context())
+	require.NoError(t, err)
+
+	ts.mu.Lock()
+	started := ts.started
+	ts.mu.Unlock()
+	assert.True(t, started, "stdio toolset must be started after successful retry")
+
+	toolList, err := ts.Tools(t.Context())
+	require.NoError(t, err)
+	require.Len(t, toolList, 1)
+	assert.Equal(t, "test-stdio_ping", toolList[0].Name)
+
+	_ = ts.Stop(t.Context())
+}
+
+// TestRemoteStartRetriesWhenUnavailable verifies that a remote toolset also
+// returns an error and stays un-started when the server is unavailable (EOF),
+// confirming retry-on-next-turn applies to all toolset types.
+func TestRemoteStartRetriesWhenUnavailable(t *testing.T) {
+	t.Parallel()
+
+	mock := &failingInitClient{
+		initErr:   io.EOF,
+		failsLeft: 1,
+	}
+
+	ts := &Toolset{
+		name:      "test-remote",
+		mcpClient: mock,
+		logID:     "remote-server",
+	}
+
+	err := ts.Start(t.Context())
+	require.Error(t, err)
+	require.ErrorIs(t, err, errServerUnavailable)
+
+	ts.mu.Lock()
+	started := ts.started
+	ts.mu.Unlock()
+	assert.False(t, started, "remote toolset must not be marked as started when server is unavailable")
+}
+
+// TestStartableToolSetRetryAcrossTurns is a full integration test using
+// tools.NewStartable to wrap an MCP Toolset. It verifies that when a stdio
+// toolset fails for N turns, the StartableToolSet keeps retrying and succeeds
+// on turn N+1.
+func TestStartableToolSetRetryAcrossTurns(t *testing.T) {
+	t.Parallel()
+
+	const failTurns = 3
+
+	pingTool := &gomcp.Tool{Name: "ping"}
+	mock := &failingInitClient{
+		initErr:     fmt.Errorf("command not found: %w", os.ErrNotExist),
+		failsLeft:   failTurns,
+		toolsToList: []*gomcp.Tool{pingTool},
+		waitCh:      make(chan struct{}),
+	}
+
+	mcpToolset := &Toolset{
+		name:      "retry-test",
+		mcpClient: mock,
+		logID:     "retry-binary",
+	}
+
+	startable := tools.NewStartable(mcpToolset)
+
+	// Turns 1..N: Start fails, IsStarted stays false.
+	for turn := 1; turn <= failTurns; turn++ {
+		err := startable.Start(t.Context())
+		require.Error(t, err, "turn %d should fail", turn)
+		assert.False(t, startable.IsStarted(), "turn %d: should not be started", turn)
+	}
+
+	// Turn N+1: binary is now available, Start succeeds.
+	err := startable.Start(t.Context())
+	require.NoError(t, err)
+	assert.True(t, startable.IsStarted())
+
+	toolList, err := mcpToolset.Tools(t.Context())
+	require.NoError(t, err)
+	require.Len(t, toolList, 1)
+	assert.Equal(t, "retry-test_ping", toolList[0].Name)
+
+	_ = startable.Stop(t.Context())
+}