Skip to content

Commit cdfed81

Browse files
committed
fix: retry MCP callTool on connection/session errors, not just ErrSessionMissing
When an MCP server restarts, callTool may receive transport errors (EOF, connection reset, broken pipe) instead of ErrSessionMissing depending on timing. Add an isConnectionError helper that selectively retries on known connection/session error types rather than retrying on all errors indiscriminately. The helper checks for: - mcp.ErrSessionMissing (server lost our session) - io.EOF (connection dropped) - net.Error (timeout, connection reset, etc.) - String-based fallback for transport errors the MCP SDK wraps with %v (losing the original error chain) This fixes flaky TestRemoteReconnectAfterServerRestart without broadening retry to application-level errors. Assisted-By: docker-agent
1 parent ad4f99b commit cdfed81

1 file changed

Lines changed: 31 additions & 5 deletions

File tree

pkg/tools/mcp/mcp.go

Lines changed: 31 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"io"
1111
"iter"
1212
"log/slog"
13+
"net"
1314
"net/url"
1415
"strings"
1516
"sync"
@@ -454,12 +455,13 @@ func (ts *Toolset) callTool(ctx context.Context, toolCall tools.ToolCall) (*tool
454455

455456
resp, err := ts.mcpClient.CallTool(ctx, request)
456457

457-
// If the server lost our session (e.g. it restarted), force a
458-
// reconnection and retry the call once.
459-
if errors.Is(err, mcp.ErrSessionMissing) {
460-
slog.Warn("MCP session missing, forcing reconnect and retrying", "tool", toolCall.Function.Name, "server", ts.logID)
458+
// If the call failed with a connection or session error (e.g. the
459+
// server restarted), trigger or wait for a reconnection and retry
460+
// the call once.
461+
if err != nil && isConnectionError(err) && ctx.Err() == nil {
462+
slog.Warn("MCP call failed, forcing reconnect and retrying", "tool", toolCall.Function.Name, "server", ts.logID, "error", err)
461463
if waitErr := ts.forceReconnectAndWait(ctx); waitErr != nil {
462-
return nil, fmt.Errorf("failed to reconnect after session loss: %w", waitErr)
464+
return nil, fmt.Errorf("failed to reconnect after call failure: %w", waitErr)
463465
}
464466
resp, err = ts.mcpClient.CallTool(ctx, request)
465467
}
@@ -690,3 +692,27 @@ func (ts *Toolset) GetPrompt(ctx context.Context, name string, arguments map[str
690692
slog.Debug("Retrieved MCP prompt", "prompt", name, "messages_count", len(result.Messages))
691693
return result, nil
692694
}
695+
696+
// isConnectionError reports whether err is a connection or session error
697+
// that warrants a reconnect-and-retry (as opposed to an application-level
698+
// error that would fail again even after reconnecting).
699+
func isConnectionError(err error) bool {
700+
if errors.Is(err, mcp.ErrSessionMissing) || errors.Is(err, io.EOF) {
701+
return true
702+
}
703+
var netErr net.Error
704+
if errors.As(err, &netErr) {
705+
return true
706+
}
707+
// The MCP SDK wraps transport failures (e.g. connection reset, EOF from
708+
// client.Do) with its internal ErrRejected sentinel using %v, which
709+
// drops the original error from the chain. Detect these by checking
710+
// the error message for common transport-failure substrings.
711+
if msg := err.Error(); strings.Contains(msg, "connection reset") ||
712+
strings.Contains(msg, "connection refused") ||
713+
strings.Contains(msg, "broken pipe") ||
714+
strings.Contains(msg, "EOF") {
715+
return true
716+
}
717+
return false
718+
}

0 commit comments

Comments
 (0)