Skip to content

Commit 6ce280c

Browse files
authored
Merge pull request #2462 from dgageot/fix-shell-bg
shell: fix hang when a tool command backgrounds a child process
2 parents 42ca212 + 35bb084 commit 6ce280c

File tree

2 files changed

+88
-0
lines changed

2 files changed

+88
-0
lines changed

pkg/tools/builtin/shell.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,11 +150,27 @@ func (h *shellHandler) RunShell(ctx context.Context, params RunShellArgs) (*tool
150150
return h.runNativeCommand(timeoutCtx, ctx, params.Cmd, cwd, timeout), nil
151151
}
152152

153+
// waitDelayAfterShellExit caps how long cmd.Wait() blocks on stdout/stderr
154+
// copy goroutines after the direct shell child has exited.
155+
//
156+
// When cmd.Stdout/Stderr are not *os.File, Go's exec package creates OS pipes
157+
// and spawns copy goroutines; cmd.Wait() only returns after *both* the child
158+
// exits and those goroutines see EOF on the pipes. If the command backgrounds
159+
// a grandchild (e.g. `docker run ... &`, `sleep 10 &`) that inherits the pipe
160+
// fds, the pipes stay open and Wait() blocks until the configured timeout.
161+
//
162+
// cmd.WaitDelay tells Go to force-close the pipes and return this long after
163+
// the direct child has exited, letting the grandchild keep running while the
164+
// tool call returns promptly. A short delay is plenty because any output the
165+
// shell itself produced is already flushed by the time it exits.
166+
const waitDelayAfterShellExit = 500 * time.Millisecond
167+
153168
func (h *shellHandler) runNativeCommand(timeoutCtx, ctx context.Context, command, cwd string, timeout time.Duration) *tools.ToolCallResult {
154169
cmd := exec.Command(h.shell, append(h.shellArgsPrefix, command)...)
155170
cmd.Env = h.env
156171
cmd.Dir = cwd
157172
cmd.SysProcAttr = platformSpecificSysProcAttr()
173+
cmd.WaitDelay = waitDelayAfterShellExit
158174

159175
var outBuf bytes.Buffer
160176
cmd.Stdout = &outBuf

pkg/tools/builtin/shell_test.go

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,10 @@ package builtin
22

33
import (
44
"os"
5+
"os/exec"
6+
"runtime"
57
"testing"
8+
"time"
69

710
"github.com/stretchr/testify/assert"
811
"github.com/stretchr/testify/require"
@@ -177,3 +180,72 @@ func TestShellTool_RelativeCwdResolvesAgainstWorkingDir(t *testing.T) {
177180
assert.Contains(t, result.Output, subdir,
178181
"relative cwd must resolve against the configured workingDir, not the process cwd")
179182
}
183+
184+
// Regression test for a shell-tool hang caused by backgrounded grandchildren.
185+
//
186+
// A command like `sleep 10 &` makes the shell exit immediately, but the
187+
// backgrounded sleep inherits stdout/stderr. Without cmd.WaitDelay, Go's
188+
// exec.Cmd.Wait() blocks reading the pipe until the configured timeout,
189+
// which makes the tool call hang (observed in eval runs where the agent
190+
// launched a server with `docker run ... &`).
191+
//
192+
// With the WaitDelay safeguard the tool must return within a small fraction
193+
// of the configured timeout.
194+
func TestShellTool_BackgroundedChildDoesNotBlockReturn(t *testing.T) {
195+
if runtime.GOOS == "windows" {
196+
t.Skip("POSIX shell backgrounding semantics; skipped on Windows")
197+
}
198+
199+
tool := NewShellTool(nil, &config.RuntimeConfig{Config: config.Config{WorkingDir: t.TempDir()}})
200+
201+
start := time.Now()
202+
result, err := tool.handler.RunShell(t.Context(), RunShellArgs{
203+
// sleep inherits stdout/stderr from the shell and holds the pipe
204+
// open for 30s. The tool must return as soon as the shell exits.
205+
Cmd: "sleep 30 &",
206+
Timeout: 20,
207+
})
208+
elapsed := time.Since(start)
209+
210+
require.NoError(t, err)
211+
require.NotNil(t, result)
212+
assert.Less(t, elapsed, 5*time.Second,
213+
"shell tool must return promptly when the command backgrounds a child "+
214+
"that inherits stdout/stderr; elapsed=%s", elapsed)
215+
}
216+
217+
// Even when the backgrounded child detaches into its own session (so the
218+
// shell tool's process-group kill cannot reach it on timeout), cmd.WaitDelay
219+
// must still allow the tool call to return.
220+
func TestShellTool_DetachedBackgroundedChildDoesNotBlockReturn(t *testing.T) {
221+
if runtime.GOOS == "windows" {
222+
t.Skip("POSIX shell backgrounding semantics; skipped on Windows")
223+
}
224+
if _, err := exec.LookPath("setsid"); err != nil {
225+
t.Skip("setsid not available")
226+
}
227+
228+
tool := NewShellTool(nil, &config.RuntimeConfig{Config: config.Config{WorkingDir: t.TempDir()}})
229+
230+
done := make(chan struct{})
231+
var result *tools.ToolCallResult
232+
var err error
233+
go func() {
234+
defer close(done)
235+
result, err = tool.handler.RunShell(t.Context(), RunShellArgs{
236+
// setsid places sleep in its own session/process group, so the
237+
// process-group kill fallback in the timeout path cannot reach
238+
// it. Only cmd.WaitDelay can unblock Wait() here.
239+
Cmd: "setsid sleep 30 &",
240+
Timeout: 20,
241+
})
242+
}()
243+
244+
select {
245+
case <-done:
246+
require.NoError(t, err)
247+
require.NotNil(t, result)
248+
case <-time.After(10 * time.Second):
249+
t.Fatal("shell tool hung when command backgrounded a detached child")
250+
}
251+
}

0 commit comments

Comments
 (0)