Skip to content

Commit 8b6c1ae

Browse files
committed
feat: stream backend installation progress to CLI users
Signed-off-by: Dorin Geman <dorin.geman@docker.com>
1 parent a5efa9f commit 8b6c1ae

File tree

3 files changed

+77
-4
lines changed

3 files changed

+77
-4
lines changed

cmd/cli/desktop/desktop.go

Lines changed: 49 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -479,12 +479,58 @@ func (c *Client) ChatWithMessagesContext(ctx context.Context, model string, conv
479479
TotalTokens int `json:"total_tokens"`
480480
}
481481

482-
// Detect streaming vs non-streaming response via Content-Type header
482+
// Use a buffered reader so we can consume server-sent progress
483+
// lines (e.g. "Installing vllm-metal backend...") that arrive
484+
// before the actual SSE or JSON inference response.
485+
br := bufio.NewReader(resp.Body)
486+
487+
// Consume any plain-text progress lines that precede the real
488+
// response. We peek ahead: if the next non-empty content starts
489+
// with '{' (JSON) or "data:" / ":" (SSE), the progress section
490+
// is over and we fall through to normal processing.
491+
for {
492+
peek, err := br.Peek(1)
493+
if err != nil {
494+
break
495+
}
496+
// JSON object or SSE stream — stop consuming progress lines.
497+
if peek[0] == '{' || peek[0] == ':' {
498+
break
499+
}
500+
line, err := br.ReadString('\n')
501+
if err != nil && line == "" {
502+
break
503+
}
504+
line = strings.TrimRight(line, "\r\n")
505+
if line == "" {
506+
continue
507+
}
508+
// SSE data line — stop, let the normal SSE parser handle it.
509+
if strings.HasPrefix(line, "data:") {
510+
// Put the line back by chaining a reader with the rest.
511+
br = bufio.NewReader(io.MultiReader(
512+
strings.NewReader(line+"\n"),
513+
br,
514+
))
515+
break
516+
}
517+
// Progress message — print to stderr.
518+
fmt.Fprintln(os.Stderr, line)
519+
}
520+
521+
// Detect streaming vs non-streaming response. Because server-sent
522+
// progress lines may have been flushed before the Content-Type was
523+
// set, we also peek at the body content to detect SSE.
483524
isStreaming := strings.HasPrefix(resp.Header.Get("Content-Type"), "text/event-stream")
525+
if !isStreaming {
526+
if peek, err := br.Peek(5); err == nil {
527+
isStreaming = strings.HasPrefix(string(peek), "data:")
528+
}
529+
}
484530

485531
if !isStreaming {
486532
// Non-streaming JSON response
487-
body, err := io.ReadAll(resp.Body)
533+
body, err := io.ReadAll(br)
488534
if err != nil {
489535
return assistantResponse.String(), fmt.Errorf("error reading response body: %w", err)
490536
}
@@ -506,7 +552,7 @@ func (c *Client) ChatWithMessagesContext(ctx context.Context, model string, conv
506552
}
507553
} else {
508554
// SSE streaming response - process line by line
509-
scanner := bufio.NewScanner(resp.Body)
555+
scanner := bufio.NewScanner(br)
510556

511557
for scanner.Scan() {
512558
// Check if context was cancelled

pkg/inference/scheduling/api.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@ const (
1313
// enough to encompass any real-world request but also small enough to avoid
1414
// DoS attacks.
1515
maximumOpenAIInferenceRequestSize = 10 * 1024 * 1024
16+
17+
// modelCLIUserAgentPrefix is the user-agent prefix set by the model CLI.
18+
modelCLIUserAgentPrefix = "docker-model-cli/"
1619
)
1720

1821
// trimRequestPathToOpenAIRoot trims a request path to start at the first

pkg/inference/scheduling/http_handler.go

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -198,11 +198,28 @@ func (h *HTTPHandler) handleOpenAIInference(w http.ResponseWriter, r *http.Reque
198198
backend = h.scheduler.selectBackendForModel(model, backend, request.Model)
199199
}
200200

201+
// If a deferred backend needs on-demand installation and the request
202+
// comes from the model CLI, stream progress messages so the user sees
203+
// what is happening while the download runs.
204+
autoInstall := h.scheduler.installer.deferredBackends[backend.Name()] &&
205+
!h.scheduler.installer.isInstalled(backend.Name()) &&
206+
strings.Contains(r.UserAgent(), modelCLIUserAgentPrefix)
207+
if autoInstall {
208+
fmt.Fprintf(w, "Installing %s backend...\n", backend.Name())
209+
if f, ok := w.(http.Flusher); ok {
210+
f.Flush()
211+
}
212+
}
213+
201214
// Wait for the corresponding backend installation to complete or fail. We
202215
// don't allow any requests to be scheduled for a backend until it has
203216
// completed installation.
204217
if err := h.scheduler.installer.wait(r.Context(), backend.Name()); err != nil {
205-
if errors.Is(err, ErrBackendNotFound) {
218+
if autoInstall {
219+
// Headers are already sent (200 OK) from the progress
220+
// line, so we can only write the error as plain text.
221+
fmt.Fprintf(w, "backend installation failed: %v\n", err)
222+
} else if errors.Is(err, ErrBackendNotFound) {
206223
http.Error(w, err.Error(), http.StatusNotFound)
207224
} else if errors.Is(err, errInstallerNotStarted) {
208225
http.Error(w, err.Error(), http.StatusServiceUnavailable)
@@ -222,6 +239,13 @@ func (h *HTTPHandler) handleOpenAIInference(w http.ResponseWriter, r *http.Reque
222239
return
223240
}
224241

242+
if autoInstall {
243+
fmt.Fprintf(w, "%s backend installed successfully\n", backend.Name())
244+
if f, ok := w.(http.Flusher); ok {
245+
f.Flush()
246+
}
247+
}
248+
225249
modelID := h.scheduler.modelManager.ResolveID(request.Model)
226250

227251
// Request a runner to execute the request and defer its release.

0 commit comments

Comments (0)