Skip to content

Commit 5b00c42

Browse files
committed
fix: move reasoning token budget floor to OpenAI provider
Keep titleMaxTokens at 20 for non-reasoning models. The OpenAI client now enforces a minimum output-token budget (256) only when NoThinking is set on a reasoning model, where hidden reasoning tokens would otherwise starve the visible text.
1 parent 4b23da4 commit 5b00c42

File tree

2 files changed

+28
-6
lines changed

2 files changed

+28
-6
lines changed

pkg/model/provider/openai/client.go

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -283,6 +283,15 @@ func (c *Client) CreateChatCompletionStream(
283283
if isOpenAIReasoningModel(c.ModelConfig.Model) {
284284
if c.ModelOptions.NoThinking() {
285285
params.ReasoningEffort = shared.ReasoningEffort("low")
286+
// Hidden reasoning tokens count against the output budget even
287+
// with low effort. Enforce a floor so visible text isn't starved.
288+
if c.ModelConfig.MaxTokens != nil && *c.ModelConfig.MaxTokens < noThinkingMinOutputTokens {
289+
if !isResponsesModel(c.ModelConfig.Model) {
290+
params.MaxTokens = openai.Int(noThinkingMinOutputTokens)
291+
} else {
292+
params.MaxCompletionTokens = openai.Int(noThinkingMinOutputTokens)
293+
}
294+
}
286295
slog.Debug("OpenAI request using low reasoning (NoThinking)")
287296
} else if c.ModelConfig.ThinkingBudget != nil {
288297
effortStr, err := openAIReasoningEffort(c.ModelConfig.ThinkingBudget)
@@ -408,6 +417,11 @@ func (c *Client) CreateResponseStream(
408417
params.Reasoning = shared.ReasoningParam{
409418
Effort: shared.ReasoningEffort("low"),
410419
}
420+
// Hidden reasoning tokens count against max_output_tokens even
421+
// with low effort. Enforce a floor so visible text isn't starved.
422+
if c.ModelConfig.MaxTokens != nil && *c.ModelConfig.MaxTokens < noThinkingMinOutputTokens {
423+
params.MaxOutputTokens = param.NewOpt(noThinkingMinOutputTokens)
424+
}
411425
slog.Debug("OpenAI responses request using low reasoning (NoThinking)")
412426
} else {
413427
params.Reasoning = shared.ReasoningParam{
@@ -1050,6 +1064,14 @@ func isOpenAIReasoningModel(model string) bool {
10501064
strings.HasPrefix(m, "gpt-5")
10511065
}
10521066

1067+
// noThinkingMinOutputTokens is the minimum max-output-token budget for
1068+
// reasoning models when NoThinking is set. Even with low reasoning effort
1069+
// the model still produces hidden reasoning tokens that count against
1070+
// max_output_tokens / max_completion_tokens. A small budget (e.g. 20)
1071+
// gets entirely consumed by reasoning, leaving nothing for visible text.
1072+
// 256 tokens is enough for low-effort reasoning plus a short visible response.
1073+
const noThinkingMinOutputTokens int64 = 256
1074+
10531075
// openAIReasoningEffort validates a ThinkingBudget effort string for the
10541076
// OpenAI API. Returns the effort string or an error.
10551077
func openAIReasoningEffort(b *latest.ThinkingBudget) (string, error) {

pkg/sessiontitle/generator.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22,14 +22,14 @@ const (
2222
userPromptFormat = "Based on the following recent user messages from a conversation with an AI assistant, generate a short, descriptive title (maximum 50 characters) that captures the main topic or purpose of the conversation. Return ONLY the title text on a single line, nothing else. Do not include any newlines, explanations, or formatting.\n\nRecent user messages:\n%s\n\n"
2323

2424
// titleMaxTokens is the max output token budget for title generation.
25-
// This must be large enough for reasoning models (o-series, gpt-5) where
26-
// max_output_tokens includes hidden reasoning tokens. With minimal
27-
// reasoning effort a short title needs ~200-250 tokens total.
28-
titleMaxTokens = 256
25+
// This is sized for visible output only (~50 chars ≈ 12-15 tokens).
26+
// Providers that need extra headroom for hidden reasoning tokens
27+
// (e.g. OpenAI reasoning models) handle the adjustment internally.
28+
titleMaxTokens = 20
2929

3030
// titleGenerationTimeout is the maximum time to wait for title generation.
31-
// Title generation should be quick since we use minimal thinking and a
32-
// small token budget. If the API is slow or hanging, we should timeout.
31+
// Title generation should be quick since we disable thinking and use low max_tokens.
32+
// If the API is slow or hanging (e.g., due to server-side thinking), we should timeout.
3333
titleGenerationTimeout = 30 * time.Second
3434
)
3535

0 commit comments

Comments (0)