Skip to content

Commit a5efa9f

Browse files
committed
feat: auto-pull deferred backends on demand
Signed-off-by: Dorin Geman <dorin.geman@docker.com>
1 parent 07f29df commit a5efa9f

1 file changed

Lines changed: 31 additions & 11 deletions

File tree

pkg/inference/scheduling/installer.go

Lines changed: 31 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,10 @@ type installer struct {
5454
// mu protects statuses map mutations in installBackend. Readers
5555
// (wait, isInstalled) take an RLock; installBackend takes a full Lock.
5656
mu sync.RWMutex
57+
// installMu serializes on-demand install operations so that only one
58+
// goroutine performs the actual download at a time. Held independently
59+
// of mu so that long-running installs don't block map readers.
60+
installMu sync.Mutex
5761
}
5862

5963
// newInstaller creates a new backend installer. Backends listed in
@@ -118,8 +122,8 @@ func (i *installer) run(ctx context.Context) {
118122
close(status.installed)
119123
}
120124
}
121-
// If not on disk, leave channels open so wait() returns
122-
// errBackendNotInstalled.
125+
// If not on disk, leave channels open so wait() can trigger
126+
// on-demand installation when the backend is first needed.
123127
continue
124128
}
125129

@@ -153,8 +157,9 @@ func (i *installer) run(ctx context.Context) {
153157
}
154158

155159
// wait waits for installation of the specified backend to complete or fail.
156-
// For deferred backends that have never been installed, it returns
157-
// errBackendNotInstalled immediately instead of blocking.
160+
// For deferred backends that have not yet been installed, it triggers
161+
// on-demand installation (auto-pull), blocking until complete or the caller's
162+
// context is cancelled.
158163
func (i *installer) wait(ctx context.Context, backend string) error {
159164
// Grab the backend status under a read lock, since installBackend may replace entries in the map.
160165
i.mu.RLock()
@@ -164,17 +169,16 @@ func (i *installer) wait(ctx context.Context, backend string) error {
164169
return ErrBackendNotFound
165170
}
166171

167-
// For deferred backends, check whether installation has completed without
168-
// blocking. This doesn't depend on the installer being started, since
169-
// deferred backends are installed on-demand, not by the run loop.
172+
// For deferred backends, check whether installation has already completed.
173+
// If not, trigger on-demand installation (auto-pull).
170174
if i.deferredBackends[backend] {
171175
select {
172176
case <-status.installed:
173177
return nil
174178
case <-status.failed:
175179
return status.err
176180
default:
177-
return errBackendNotInstalled
181+
return i.installBackend(ctx, backend)
178182
}
179183
}
180184

@@ -198,16 +202,24 @@ func (i *installer) wait(ctx context.Context, backend string) error {
198202

199203
// installBackend triggers on-demand installation of a deferred backend.
200204
// It is idempotent: if the backend is already installed, it returns nil.
205+
// installMu serializes actual downloads so only one goroutine installs at a
206+
// time, while mu is held only briefly for map reads/writes so that other
207+
// goroutines calling wait() or isInstalled() are not blocked during the
208+
// (potentially long) Install() call.
201209
func (i *installer) installBackend(ctx context.Context, name string) error {
202-
i.mu.Lock()
203-
defer i.mu.Unlock()
210+
// Serialize install operations so only one download runs at a time.
211+
i.installMu.Lock()
212+
defer i.installMu.Unlock()
204213

205214
backend, ok := i.backends[name]
206215
if !ok {
207216
return ErrBackendNotFound
208217
}
209218

219+
// Check current status under read lock.
220+
i.mu.RLock()
210221
status := i.statuses[name]
222+
i.mu.RUnlock()
211223

212224
// Already installed — nothing to do.
213225
select {
@@ -223,12 +235,20 @@ func (i *installer) installBackend(ctx context.Context, name string) error {
223235
installed: make(chan struct{}),
224236
failed: make(chan struct{}),
225237
}
238+
i.mu.Lock()
226239
i.statuses[name] = status
240+
i.mu.Unlock()
227241
default:
228242
}
229243

230-
// Perform installation.
244+
// Perform installation without holding mu.
231245
if err := backend.Install(ctx, i.httpClient); err != nil {
246+
// If the caller's context was cancelled (e.g. Ctrl-C), don't
247+
// permanently mark the backend as failed — leave channels open
248+
// so the next request can retry.
249+
if ctx.Err() != nil {
250+
return err
251+
}
232252
status.err = err
233253
close(status.failed)
234254
return err

0 commit comments

Comments
 (0)