-
Notifications
You must be signed in to change notification settings - Fork 116
Expand file tree
/
Copy pathapi.go
More file actions
127 lines (112 loc) · 4.75 KB
/
api.go
File metadata and controls
127 lines (112 loc) · 4.75 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
package scheduling
import (
"strings"
"time"
"github.com/docker/model-runner/pkg/inference"
)
const (
	// maximumOpenAIInferenceRequestSize is the maximum OpenAI API embedding or
	// completion request size that Scheduler will allow. This should be large
	// enough to encompass any real-world request but also small enough to avoid
	// DoS attacks. Currently 10 MiB.
	maximumOpenAIInferenceRequestSize = 10 * 1024 * 1024
	// modelCLIUserAgentPrefix is the user-agent prefix set by the model CLI.
	modelCLIUserAgentPrefix = "docker-model-cli/"
)
// trimRequestPathToOpenAIRoot trims a request path so that it starts at the
// first occurrence of one of the known API route markers: "/v1/", "/rerank",
// or "/score" (checked in that order). If none of the markers appears in the
// path, the path is returned unmodified.
func trimRequestPathToOpenAIRoot(path string) string {
	for _, marker := range []string{"/v1/", "/rerank", "/score"} {
		if at := strings.Index(path, marker); at != -1 {
			return path[at:]
		}
	}
	return path
}
// backendModeForRequest determines the backend operation mode to handle an
// OpenAI or Anthropic inference request based on the request path suffix. Its
// second return value is true if and only if a valid mode could be determined.
//
// Path-to-mode mapping:
//   - /v1/chat/completions, /v1/completions: completion
//   - /v1/messages, /v1/messages/count_tokens: completion (Anthropic Messages
//     API, treated as completion mode)
//   - /v1/embeddings: embedding
//   - /rerank, /score: reranking
//   - /v1/images/generations: image generation (OpenAI Images API)
func backendModeForRequest(path string) (inference.BackendMode, bool) {
	switch {
	case strings.HasSuffix(path, "/v1/chat/completions"),
		strings.HasSuffix(path, "/v1/completions"),
		strings.HasSuffix(path, "/v1/messages"),
		strings.HasSuffix(path, "/v1/messages/count_tokens"):
		return inference.BackendModeCompletion, true
	case strings.HasSuffix(path, "/v1/embeddings"):
		return inference.BackendModeEmbedding, true
	case strings.HasSuffix(path, "/rerank"), strings.HasSuffix(path, "/score"):
		return inference.BackendModeReranking, true
	case strings.HasSuffix(path, "/v1/images/generations"):
		return inference.BackendModeImageGeneration, true
	}
	return inference.BackendMode(0), false
}
// OpenAIInferenceRequest is used to extract the model specification from either
// a chat completion or embedding request in the OpenAI API. Only the model
// field is decoded; the remainder of the request body is passed through.
type OpenAIInferenceRequest struct {
	// Model is the requested model name.
	Model string `json:"model"`
}
// OpenAIErrorResponse is used to format an OpenAI API compatible error response
// (see https://platform.openai.com/docs/api-reference/responses-streaming/error)
type OpenAIErrorResponse struct {
	// Type is the event type; always "error".
	Type string `json:"type"` // always "error"
	// Code is the machine-readable error code, if any.
	Code *string `json:"code"`
	// Message is the human-readable error description.
	Message string `json:"message"`
	// Param identifies the request parameter the error relates to, if any.
	Param *string `json:"param"`
	// SequenceNumber is the position of this event in the response stream.
	SequenceNumber int `json:"sequence_number"`
}
// BackendStatus represents information about a running backend
type BackendStatus struct {
	// BackendName is the name of the backend
	BackendName string `json:"backend_name"`
	// ModelName is the name of the model loaded in the backend
	ModelName string `json:"model_name"`
	// Mode is the mode the backend is operating in
	Mode string `json:"mode"`
	// LastUsed represents when this (backend, model, mode) tuple was last used
	LastUsed time.Time `json:"last_used,omitempty"`
	// InUse indicates whether this backend is currently handling a request
	InUse bool `json:"in_use,omitempty"`
	// Loading indicates whether this backend is currently being initialized
	Loading bool `json:"loading,omitempty"`
	// KeepAlive is the keep-alive configuration for this backend, if any.
	KeepAlive *inference.KeepAlive `json:"keep_alive,omitempty"`
}
// DiskUsage represents the disk usage of the models and default backend.
type DiskUsage struct {
	// ModelsDiskUsage is the total size, in bytes, used by stored models.
	ModelsDiskUsage int64 `json:"models_disk_usage"`
	// DefaultBackendDiskUsage is the size, in bytes, used by the default backend.
	DefaultBackendDiskUsage int64 `json:"default_backend_disk_usage"`
}
// UnloadRequest is used to specify which models to unload.
type UnloadRequest struct {
	// All requests that every loaded model be unloaded.
	All bool `json:"all"`
	// Backend restricts unloading to the named backend.
	Backend string `json:"backend"`
	// Models lists the specific models to unload.
	Models []string `json:"models"`
}
// UnloadResponse is used to return the number of unloaded runners (backend, model).
type UnloadResponse struct {
	// UnloadedRunners is the count of (backend, model) runners that were unloaded.
	UnloadedRunners int `json:"unloaded_runners"`
}
// ConfigureRequest specifies per-model runtime configuration options.
type ConfigureRequest struct {
	// Model is the model to configure.
	Model string `json:"model"`
	// Mode optionally overrides the backend operation mode for the model.
	Mode *inference.BackendMode `json:"mode,omitempty"`
	// RawRuntimeFlags carries raw flags passed through to the runtime.
	// NOTE(review): the tag uses hyphens ("raw-runtime-flags"), unlike the
	// snake_case tags elsewhere in this file — presumably intentional wire
	// format; confirm before normalizing, as changing it would break clients.
	RawRuntimeFlags string `json:"raw-runtime-flags,omitempty"`
	// BackendConfiguration is embedded so its fields flatten into this request.
	inference.BackendConfiguration
}
// ModelConfigEntry represents a model configuration entry with its associated metadata.
type ModelConfigEntry struct {
	// Backend is the backend the configuration applies to.
	Backend string
	// Model is the model name.
	Model string
	// ModelID is the model's identifier.
	ModelID string
	// Mode is the backend operation mode for the model.
	Mode inference.BackendMode
	// Config holds the backend configuration for the model.
	Config inference.BackendConfiguration
}
// ModelBackendSelection describes the backend selected by the scheduler for a model.
type ModelBackendSelection struct {
	// Backend is the name of the selected backend.
	Backend string `json:"backend"`
	// Installed indicates whether the selected backend is installed.
	Installed bool `json:"installed"`
}