Skip to content

Commit 5b00c42

Browse files
committed
fix: move reasoning token budget floor to OpenAI provider
Keep titleMaxTokens at 20 for non-reasoning models. The OpenAI client now enforces a minimum output-token budget (256) only when NoThinking is set on a reasoning model, where hidden reasoning tokens would otherwise starve the visible text.
1 parent 4b23da4 commit 5b00c42

File tree

2 files changed

+28
-6
lines changed

2 files changed

+28
-6
lines changed

pkg/model/provider/openai/client.go

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -283,6 +283,15 @@ func (c *Client) CreateChatCompletionStream(
283283
if isOpenAIReasoningModel(c.ModelConfig.Model) {
284284
if c.ModelOptions.NoThinking() {
285285
params.ReasoningEffort = shared.ReasoningEffort("low")
286+
// Hidden reasoning tokens count against the output budget even
287+
// with low effort. Enforce a floor so visible text isn't starved.
288+
if c.ModelConfig.MaxTokens != nil && *c.ModelConfig.MaxTokens < noThinkingMinOutputTokens {
289+
if !isResponsesModel(c.ModelConfig.Model) {
290+
params.MaxTokens = openai.Int(noThinkingMinOutputTokens)
291+
} else {
292+
params.MaxCompletionTokens = openai.Int(noThinkingMinOutputTokens)
293+
}
294+
}
286295
slog.Debug("OpenAI request using low reasoning (NoThinking)")
287296
} else if c.ModelConfig.ThinkingBudget != nil {
288297
effortStr, err := openAIReasoningEffort(c.ModelConfig.ThinkingBudget)
@@ -408,6 +417,11 @@ func (c *Client) CreateResponseStream(
408417
params.Reasoning = shared.ReasoningParam{
409418
Effort: shared.ReasoningEffort("low"),
410419
}
420+
// Hidden reasoning tokens count against max_output_tokens even
421+
// with low effort. Enforce a floor so visible text isn't starved.
422+
if c.ModelConfig.MaxTokens != nil && *c.ModelConfig.MaxTokens < noThinkingMinOutputTokens {
423+
params.MaxOutputTokens = param.NewOpt(noThinkingMinOutputTokens)
424+
}
411425
slog.Debug("OpenAI responses request using low reasoning (NoThinking)")
412426
} else {
413427
params.Reasoning = shared.ReasoningParam{
@@ -1050,6 +1064,14 @@ func isOpenAIReasoningModel(model string) bool {
10501064
strings.HasPrefix(m, "gpt-5")
10511065
}
10521066

1067+
// noThinkingMinOutputTokens is the minimum max-output-token budget for
1068+
// reasoning models when NoThinking is set. Even with low reasoning effort
1069+
// the model still produces hidden reasoning tokens that count against
1070+
// max_output_tokens / max_completion_tokens. A small budget (e.g. 20)
1071+
// gets entirely consumed by reasoning, leaving nothing for visible text.
1072+
// 256 tokens is enough for low-effort reasoning plus a short visible response.
1073+
const noThinkingMinOutputTokens int64 = 256
1074+
10531075
// openAIReasoningEffort validates a ThinkingBudget effort string for the
10541076
// OpenAI API. Returns the effort string or an error.
10551077
func openAIReasoningEffort(b *latest.ThinkingBudget) (string, error) {

pkg/sessiontitle/generator.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22,14 +22,14 @@ const (
2222
userPromptFormat = "Based on the following recent user messages from a conversation with an AI assistant, generate a short, descriptive title (maximum 50 characters) that captures the main topic or purpose of the conversation. Return ONLY the title text on a single line, nothing else. Do not include any newlines, explanations, or formatting.\n\nRecent user messages:\n%s\n\n"
2323

2424
// titleMaxTokens is the max output token budget for title generation.
25-
// This must be large enough for reasoning models (o-series, gpt-5) where
26-
// max_output_tokens includes hidden reasoning tokens. With minimal
27-
// reasoning effort a short title needs ~200-250 tokens total.
28-
titleMaxTokens = 256
25+
// This is sized for visible output only (~50 chars ≈ 12-15 tokens).
26+
// Providers that need extra headroom for hidden reasoning tokens
27+
// (e.g. OpenAI reasoning models) handle the adjustment internally.
28+
titleMaxTokens = 20
2929

3030
// titleGenerationTimeout is the maximum time to wait for title generation.
31-
// Title generation should be quick since we use minimal thinking and a
32-
// small token budget. If the API is slow or hanging, we should timeout.
31+
// Title generation should be quick since we disable thinking and use low max_tokens.
32+
// If the API is slow or hanging (e.g., due to server-side thinking), we should timeout.
3333
titleGenerationTimeout = 30 * time.Second
3434
)
3535

0 commit comments

Comments (0)