Skip to content

Commit 930ce8e

Browse files
authored
Merge pull request #652 from doringeman/install-vllm-metal
feat: make vllm-metal backend installation opt-in
2 parents 1294bde + 8b6c1ae commit 930ce8e

File tree

14 files changed

+343
-30
lines changed

14 files changed

+343
-30
lines changed

cmd/cli/commands/install-runner.go

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ import (
1717
"github.com/docker/model-runner/pkg/inference/backends/diffusers"
1818
"github.com/docker/model-runner/pkg/inference/backends/llamacpp"
1919
"github.com/docker/model-runner/pkg/inference/backends/vllm"
20+
"github.com/docker/model-runner/pkg/inference/backends/vllmmetal"
2021
"github.com/spf13/cobra"
2122
)
2223

@@ -28,7 +29,7 @@ const (
2829
// installation will try to reach the model runner while waiting for it to
2930
// be ready.
3031
installWaitRetryInterval = 500 * time.Millisecond
31-
backendUsage = "Specify backend (" + llamacpp.Name + "|" + vllm.Name + "|" + diffusers.Name + "). Default: " + llamacpp.Name
32+
backendUsage = "Specify backend (" + llamacpp.Name + "|" + vllm.Name + "|" + diffusers.Name + "|" + vllmmetal.Name + "). Default: " + llamacpp.Name
3233
)
3334

3435
// waitForStandaloneRunnerAfterInstall waits for a standalone model runner
@@ -237,6 +238,17 @@ type runnerOptions struct {
237238

238239
// runInstallOrStart is shared logic for install-runner and start-runner commands
239240
func runInstallOrStart(cmd *cobra.Command, opts runnerOptions, debug bool) error {
241+
// vllm-metal is installed on-demand via the running model runner,
242+
// not as a standalone container. This applies to all engine kinds.
243+
if opts.backend == vllmmetal.Name {
244+
cmd.Println("Installing vllm-metal backend...")
245+
if err := desktopClient.InstallBackend(vllmmetal.Name); err != nil {
246+
return fmt.Errorf("failed to install vllm-metal backend: %w", err)
247+
}
248+
cmd.Println("vllm-metal backend installed successfully")
249+
return nil
250+
}
251+
240252
var vllmOnWSL bool
241253
// Ensure that we're running in a supported model runner context.
242254
engineKind := modelRunner.EngineKind()
@@ -324,7 +336,7 @@ func runInstallOrStart(cmd *cobra.Command, opts runnerOptions, debug bool) error
324336
}
325337

326338
// Validate backend selection
327-
validBackends := []string{llamacpp.Name, vllm.Name, diffusers.Name}
339+
validBackends := []string{llamacpp.Name, vllm.Name, diffusers.Name, vllmmetal.Name}
328340
if opts.backend != "" {
329341
isValid := false
330342
for _, valid := range validBackends {

cmd/cli/desktop/desktop.go

Lines changed: 73 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -479,12 +479,58 @@ func (c *Client) ChatWithMessagesContext(ctx context.Context, model string, conv
479479
TotalTokens int `json:"total_tokens"`
480480
}
481481

482-
// Detect streaming vs non-streaming response via Content-Type header
482+
// Use a buffered reader so we can consume server-sent progress
483+
// lines (e.g. "Installing vllm-metal backend...") that arrive
484+
// before the actual SSE or JSON inference response.
485+
br := bufio.NewReader(resp.Body)
486+
487+
// Consume any plain-text progress lines that precede the real
488+
// response. We peek ahead: if the next non-empty content starts
489+
// with '{' (JSON) or "data:" / ":" (SSE), the progress section
490+
// is over and we fall through to normal processing.
491+
for {
492+
peek, err := br.Peek(1)
493+
if err != nil {
494+
break
495+
}
496+
// JSON object or SSE stream — stop consuming progress lines.
497+
if peek[0] == '{' || peek[0] == ':' {
498+
break
499+
}
500+
line, err := br.ReadString('\n')
501+
if err != nil && line == "" {
502+
break
503+
}
504+
line = strings.TrimRight(line, "\r\n")
505+
if line == "" {
506+
continue
507+
}
508+
// SSE data line — stop, let the normal SSE parser handle it.
509+
if strings.HasPrefix(line, "data:") {
510+
// Put the line back by chaining a reader with the rest.
511+
br = bufio.NewReader(io.MultiReader(
512+
strings.NewReader(line+"\n"),
513+
br,
514+
))
515+
break
516+
}
517+
// Progress message — print to stderr.
518+
fmt.Fprintln(os.Stderr, line)
519+
}
520+
521+
// Detect streaming vs non-streaming response. Because server-sent
522+
// progress lines may have been flushed before the Content-Type was
523+
// set, we also peek at the body content to detect SSE.
483524
isStreaming := strings.HasPrefix(resp.Header.Get("Content-Type"), "text/event-stream")
525+
if !isStreaming {
526+
if peek, err := br.Peek(5); err == nil {
527+
isStreaming = strings.HasPrefix(string(peek), "data:")
528+
}
529+
}
484530

485531
if !isStreaming {
486532
// Non-streaming JSON response
487-
body, err := io.ReadAll(resp.Body)
533+
body, err := io.ReadAll(br)
488534
if err != nil {
489535
return assistantResponse.String(), fmt.Errorf("error reading response body: %w", err)
490536
}
@@ -506,7 +552,7 @@ func (c *Client) ChatWithMessagesContext(ctx context.Context, model string, conv
506552
}
507553
} else {
508554
// SSE streaming response - process line by line
509-
scanner := bufio.NewScanner(resp.Body)
555+
scanner := bufio.NewScanner(br)
510556

511557
for scanner.Scan() {
512558
// Check if context was cancelled
@@ -778,6 +824,30 @@ func (c *Client) ShowConfigs(modelFilter string) ([]scheduling.ModelConfigEntry,
778824
return configs, nil
779825
}
780826

827+
// InstallBackend triggers on-demand installation of a deferred backend.
//
// It POSTs a JSON body of the form {"backend": "<backend>"} to
// inference.InferencePrefix + "/install-backend" using c.doRequest.
//
// It returns nil only when the response status is 200 OK. A request failure
// is passed through c.handleQueryError; any other status yields an error
// that contains the response status and whatever response body could be read.
828+
func (c *Client) InstallBackend(backend string) error {
829+
installPath := inference.InferencePrefix + "/install-backend"
830+
jsonData, err := json.Marshal(struct {
831+
Backend string `json:"backend"`
832+
}{Backend: backend})
833+
if err != nil {
834+
return fmt.Errorf("error marshaling request: %w", err)
835+
}
836+
837+
resp, err := c.doRequest(http.MethodPost, installPath, bytes.NewReader(jsonData))
838+
if err != nil {
839+
return c.handleQueryError(err, installPath)
840+
}
841+
defer resp.Body.Close()
842+
843+
if resp.StatusCode != http.StatusOK {
// The read error is discarded: the body is only used to enrich the
// error message below, so it may be empty or partial if the read failed.
844+
body, _ := io.ReadAll(resp.Body)
845+
return fmt.Errorf("install backend failed with status %s: %s", resp.Status, string(body))
846+
}
847+
848+
return nil
849+
}
850+
781851
func (c *Client) ConfigureBackend(request scheduling.ConfigureRequest) error {
782852
configureBackendPath := inference.InferencePrefix + "/_configure"
783853
jsonData, err := json.Marshal(request)

cmd/cli/docs/reference/docker_model_install-runner.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@ plink: docker_model.yaml
88
options:
99
- option: backend
1010
value_type: string
11-
description: 'Specify backend (llama.cpp|vllm|diffusers). Default: llama.cpp'
11+
description: |
12+
Specify backend (llama.cpp|vllm|diffusers|vllm-metal). Default: llama.cpp
1213
deprecated: false
1314
hidden: false
1415
experimental: false

cmd/cli/docs/reference/docker_model_reinstall-runner.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@ plink: docker_model.yaml
88
options:
99
- option: backend
1010
value_type: string
11-
description: 'Specify backend (llama.cpp|vllm|diffusers). Default: llama.cpp'
11+
description: |
12+
Specify backend (llama.cpp|vllm|diffusers|vllm-metal). Default: llama.cpp
1213
deprecated: false
1314
hidden: false
1415
experimental: false

cmd/cli/docs/reference/docker_model_start-runner.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@ plink: docker_model.yaml
1010
options:
1111
- option: backend
1212
value_type: string
13-
description: 'Specify backend (llama.cpp|vllm|diffusers). Default: llama.cpp'
13+
description: |
14+
Specify backend (llama.cpp|vllm|diffusers|vllm-metal). Default: llama.cpp
1415
deprecated: false
1516
hidden: false
1617
experimental: false

cmd/cli/docs/reference/model_install-runner.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ Install Docker Model Runner (Docker Engine only)
77

88
| Name | Type | Default | Description |
99
|:-----------------|:---------|:------------|:-------------------------------------------------------------------------------------------------------|
10-
| `--backend` | `string` | | Specify backend (llama.cpp\|vllm\|diffusers). Default: llama.cpp |
10+
| `--backend` | `string` | | Specify backend (llama.cpp\|vllm\|diffusers\|vllm-metal). Default: llama.cpp |
1111
| `--debug` | `bool` | | Enable debug logging |
1212
| `--do-not-track` | `bool` | | Do not track models usage in Docker Model Runner |
1313
| `--gpu` | `string` | `auto` | Specify GPU support (none\|auto\|cuda\|rocm\|musa\|cann) |

cmd/cli/docs/reference/model_reinstall-runner.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ Reinstall Docker Model Runner (Docker Engine only)
77

88
| Name | Type | Default | Description |
99
|:-----------------|:---------|:------------|:-------------------------------------------------------------------------------------------------------|
10-
| `--backend` | `string` | | Specify backend (llama.cpp\|vllm\|diffusers). Default: llama.cpp |
10+
| `--backend` | `string` | | Specify backend (llama.cpp\|vllm\|diffusers\|vllm-metal). Default: llama.cpp |
1111
| `--debug` | `bool` | | Enable debug logging |
1212
| `--do-not-track` | `bool` | | Do not track models usage in Docker Model Runner |
1313
| `--gpu` | `string` | `auto` | Specify GPU support (none\|auto\|cuda\|rocm\|musa\|cann) |

cmd/cli/docs/reference/model_start-runner.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ Start Docker Model Runner (Docker Engine only)
77

88
| Name | Type | Default | Description |
99
|:-----------------|:---------|:------------|:-------------------------------------------------------------------------------------------------------|
10-
| `--backend` | `string` | | Specify backend (llama.cpp\|vllm\|diffusers). Default: llama.cpp |
10+
| `--backend` | `string` | | Specify backend (llama.cpp\|vllm\|diffusers\|vllm-metal). Default: llama.cpp |
1111
| `--debug` | `bool` | | Enable debug logging |
1212
| `--do-not-track` | `bool` | | Do not track models usage in Docker Model Runner |
1313
| `--gpu` | `string` | `auto` | Specify GPU support (none\|auto\|cuda\|rocm\|musa\|cann) |

main.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,12 @@ func main() {
208208
backends[vllmmetal.Name] = vllmMetalBackend
209209
}
210210

211+
// Backends whose installation is deferred until explicitly requested.
212+
var deferredBackends []string
213+
if vllmMetalBackend != nil {
214+
deferredBackends = append(deferredBackends, vllmmetal.Name)
215+
}
216+
211217
scheduler := scheduling.NewScheduler(
212218
log,
213219
backends,
@@ -220,6 +226,7 @@ func main() {
220226
"",
221227
false,
222228
),
229+
deferredBackends,
223230
)
224231

225232
// Create the HTTP handler for the scheduler

pkg/inference/scheduling/api.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@ const (
1313
// enough to encompass any real-world request but also small enough to avoid
1414
// DoS attacks.
1515
maximumOpenAIInferenceRequestSize = 10 * 1024 * 1024
16+
17+
// modelCLIUserAgentPrefix is the user-agent prefix set by the model CLI.
18+
modelCLIUserAgentPrefix = "docker-model-cli/"
1619
)
1720

1821
// trimRequestPathToOpenAIRoot trims a request path to start at the first

0 commit comments

Comments
 (0)