@@ -479,12 +479,58 @@ func (c *Client) ChatWithMessagesContext(ctx context.Context, model string, conv
479479 TotalTokens int `json:"total_tokens"`
480480 }
481481
482- // Detect streaming vs non-streaming response via Content-Type header
482+ // Use a buffered reader so we can consume server-sent progress
483+ // lines (e.g. "Installing vllm-metal backend...") that arrive
484+ // before the actual SSE or JSON inference response.
485+ br := bufio .NewReader (resp .Body )
486+
487+ // Consume any plain-text progress lines that precede the real
488+ // response. We peek ahead: if the next non-empty content starts
489+ // with '{' (JSON) or "data:" / ":" (SSE), the progress section
490+ // is over and we fall through to normal processing.
491+ for {
492+ peek , err := br .Peek (1 )
493+ if err != nil {
494+ break
495+ }
496+ // JSON object or SSE stream — stop consuming progress lines.
497+ if peek [0 ] == '{' || peek [0 ] == ':' {
498+ break
499+ }
500+ line , err := br .ReadString ('\n' )
501+ if err != nil && line == "" {
502+ break
503+ }
504+ line = strings .TrimRight (line , "\r \n " )
505+ if line == "" {
506+ continue
507+ }
508+ // SSE data line — stop, let the normal SSE parser handle it.
509+ if strings .HasPrefix (line , "data:" ) {
510+ // Put the line back by chaining a reader with the rest.
511+ br = bufio .NewReader (io .MultiReader (
512+ strings .NewReader (line + "\n " ),
513+ br ,
514+ ))
515+ break
516+ }
517+ // Progress message — print to stderr.
518+ fmt .Fprintln (os .Stderr , line )
519+ }
520+
521+ // Detect streaming vs non-streaming response. Because server-sent
522+ // progress lines may have been flushed before the Content-Type was
523+ // set, we also peek at the body content to detect SSE.
483524 isStreaming := strings .HasPrefix (resp .Header .Get ("Content-Type" ), "text/event-stream" )
525+ if ! isStreaming {
526+ if peek , err := br .Peek (5 ); err == nil {
527+ isStreaming = strings .HasPrefix (string (peek ), "data:" )
528+ }
529+ }
484530
485531 if ! isStreaming {
486532 // Non-streaming JSON response
487- body , err := io .ReadAll (resp . Body )
533+ body , err := io .ReadAll (br )
488534 if err != nil {
489535 return assistantResponse .String (), fmt .Errorf ("error reading response body: %w" , err )
490536 }
@@ -506,7 +552,7 @@ func (c *Client) ChatWithMessagesContext(ctx context.Context, model string, conv
506552 }
507553 } else {
508554 // SSE streaming response - process line by line
509- scanner := bufio .NewScanner (resp . Body )
555+ scanner := bufio .NewScanner (br )
510556
511557 for scanner .Scan () {
512558 // Check if context was cancelled
@@ -778,6 +824,30 @@ func (c *Client) ShowConfigs(modelFilter string) ([]scheduling.ModelConfigEntry,
778824 return configs , nil
779825}
780826
827+ // InstallBackend triggers on-demand installation of a deferred backend
828+ func (c * Client ) InstallBackend (backend string ) error {
829+ installPath := inference .InferencePrefix + "/install-backend"
830+ jsonData , err := json .Marshal (struct {
831+ Backend string `json:"backend"`
832+ }{Backend : backend })
833+ if err != nil {
834+ return fmt .Errorf ("error marshaling request: %w" , err )
835+ }
836+
837+ resp , err := c .doRequest (http .MethodPost , installPath , bytes .NewReader (jsonData ))
838+ if err != nil {
839+ return c .handleQueryError (err , installPath )
840+ }
841+ defer resp .Body .Close ()
842+
843+ if resp .StatusCode != http .StatusOK {
844+ body , _ := io .ReadAll (resp .Body )
845+ return fmt .Errorf ("install backend failed with status %s: %s" , resp .Status , string (body ))
846+ }
847+
848+ return nil
849+ }
850+
781851func (c * Client ) ConfigureBackend (request scheduling.ConfigureRequest ) error {
782852 configureBackendPath := inference .InferencePrefix + "/_configure"
783853 jsonData , err := json .Marshal (request )
0 commit comments