Skip to content

Commit 03a22ec

Browse files
authored
Merge pull request #2412 from trungutt/fix/2318-gpt5-title-generation
fix: title generation fails with OpenAI reasoning models
2 parents 8ca294e + 5b00c42 commit 03a22ec

File tree

4 files changed

+82
-22
lines changed

4 files changed

+82
-22
lines changed

e2e/debug_title_test.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,10 @@ func TestDebug_Title(t *testing.T) {
3535

3636
title := runCLI(t, "debug", "title", "testdata/basic.yaml", "--model="+tt.model, "What can you do?")
3737

38+
// The non-empty check is the key invariant: reasoning models
39+
// (o-series, gpt-5) must produce visible title text despite
40+
// hidden reasoning tokens consuming part of the output budget.
41+
assert.NotEmpty(t, title, "title must not be empty")
3842
assert.Equal(t, tt.want, title)
3943
})
4044
}

pkg/fake/proxy.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,8 @@ func DefaultMatcher(onError func(err error)) recorder.MatcherFunc {
218218
// Normalize Gemini thinkingConfig (varies based on provider defaults for thinking budget).
219219
// This handles both camelCase (API) variants of the thinkingConfig field.
220220
thinkingConfigRegex := regexp.MustCompile(`"thinkingConfig":\{[^}]*\},?`)
221+
// Normalize OpenAI reasoning config (varies based on NoThinking flag and thinking budget).
222+
reasoningRegex := regexp.MustCompile(`"reasoning":\{[^}]*\},?`)
221223

222224
return func(r *http.Request, i cassette.Request) bool {
223225
if r.Body == nil || r.Body == http.NoBody {
@@ -246,9 +248,11 @@ func DefaultMatcher(onError func(err error)) recorder.MatcherFunc {
246248
normalizedReq := callIDRegex.ReplaceAllString(string(reqBody), "call_ID")
247249
normalizedReq = maxTokensRegex.ReplaceAllString(normalizedReq, "")
248250
normalizedReq = thinkingConfigRegex.ReplaceAllString(normalizedReq, "")
251+
normalizedReq = reasoningRegex.ReplaceAllString(normalizedReq, "")
249252
normalizedCassette := callIDRegex.ReplaceAllString(i.Body, "call_ID")
250253
normalizedCassette = maxTokensRegex.ReplaceAllString(normalizedCassette, "")
251254
normalizedCassette = thinkingConfigRegex.ReplaceAllString(normalizedCassette, "")
255+
normalizedCassette = reasoningRegex.ReplaceAllString(normalizedCassette, "")
252256

253257
return normalizedReq == normalizedCassette
254258
}

pkg/model/provider/openai/client.go

Lines changed: 67 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -274,15 +274,34 @@ func (c *Client) CreateChatCompletionStream(
274274
}
275275
}
276276

277-
// Apply thinking budget: set reasoning_effort for reasoning models (o-series, gpt-5)
278-
if c.ModelConfig.ThinkingBudget != nil && isOpenAIReasoningModel(c.ModelConfig.Model) {
279-
effortStr, err := openAIReasoningEffort(c.ModelConfig.ThinkingBudget)
280-
if err != nil {
281-
slog.Error("OpenAI request using thinking_budget failed", "error", err)
282-
return nil, err
277+
// Apply thinking budget: set reasoning_effort for reasoning models (o-series, gpt-5).
278+
// Reasoning models always reason; omitting the param uses the default effort.
279+
// When NoThinking is set we still need to send low effort so hidden
280+
// reasoning tokens don't exhaust the max_completion_tokens budget.
281+
// We use "low" instead of "minimal" because older models (o3-mini, o1)
282+
// only accept low/medium/high.
283+
if isOpenAIReasoningModel(c.ModelConfig.Model) {
284+
if c.ModelOptions.NoThinking() {
285+
params.ReasoningEffort = shared.ReasoningEffort("low")
286+
// Hidden reasoning tokens count against the output budget even
287+
// with low effort. Enforce a floor so visible text isn't starved.
288+
if c.ModelConfig.MaxTokens != nil && *c.ModelConfig.MaxTokens < noThinkingMinOutputTokens {
289+
if !isResponsesModel(c.ModelConfig.Model) {
290+
params.MaxTokens = openai.Int(noThinkingMinOutputTokens)
291+
} else {
292+
params.MaxCompletionTokens = openai.Int(noThinkingMinOutputTokens)
293+
}
294+
}
295+
slog.Debug("OpenAI request using low reasoning (NoThinking)")
296+
} else if c.ModelConfig.ThinkingBudget != nil {
297+
effortStr, err := openAIReasoningEffort(c.ModelConfig.ThinkingBudget)
298+
if err != nil {
299+
slog.Error("OpenAI request using thinking_budget failed", "error", err)
300+
return nil, err
301+
}
302+
params.ReasoningEffort = shared.ReasoningEffort(effortStr)
303+
slog.Debug("OpenAI request using thinking_budget", "reasoning_effort", effortStr)
283304
}
284-
params.ReasoningEffort = shared.ReasoningEffort(effortStr)
285-
slog.Debug("OpenAI request using thinking_budget", "reasoning_effort", effortStr)
286305
}
287306

288307
// Apply structured output configuration
@@ -384,20 +403,39 @@ func (c *Client) CreateResponseStream(
384403
}
385404

386405
// Configure reasoning for models that support it (o-series, gpt-5).
387-
// Skip reasoning entirely when NoThinking is set (e.g. title generation)
388-
// to avoid wasting output tokens on internal reasoning.
389-
if isOpenAIReasoningModel(c.ModelConfig.Model) && !c.ModelOptions.NoThinking() {
390-
params.Reasoning = shared.ReasoningParam{
391-
Summary: shared.ReasoningSummaryDetailed,
392-
}
393-
if c.ModelConfig.ThinkingBudget != nil {
394-
effortStr, err := openAIReasoningEffort(c.ModelConfig.ThinkingBudget)
395-
if err != nil {
396-
slog.Error("OpenAI responses request using thinking_budget failed", "error", err)
397-
return nil, err
406+
// Reasoning models always reason internally; omitting the reasoning param
407+
// does NOT disable reasoning — it just uses the model's default effort.
408+
// Those hidden reasoning tokens still count against max_output_tokens,
409+
// so with a small budget (e.g. title generation) the model can exhaust
410+
// all tokens on reasoning and return empty visible text.
411+
if isOpenAIReasoningModel(c.ModelConfig.Model) {
412+
if c.ModelOptions.NoThinking() {
413+
// Use low effort so the model spends as few output tokens as
414+
// possible on reasoning, leaving room for visible text.
415+
// We use "low" instead of "minimal" because older models
416+
// (o3-mini, o1) only accept low/medium/high.
417+
params.Reasoning = shared.ReasoningParam{
418+
Effort: shared.ReasoningEffort("low"),
419+
}
420+
// Hidden reasoning tokens count against max_output_tokens even
421+
// with low effort. Enforce a floor so visible text isn't starved.
422+
if c.ModelConfig.MaxTokens != nil && *c.ModelConfig.MaxTokens < noThinkingMinOutputTokens {
423+
params.MaxOutputTokens = param.NewOpt(noThinkingMinOutputTokens)
424+
}
425+
slog.Debug("OpenAI responses request using low reasoning (NoThinking)")
426+
} else {
427+
params.Reasoning = shared.ReasoningParam{
428+
Summary: shared.ReasoningSummaryDetailed,
429+
}
430+
if c.ModelConfig.ThinkingBudget != nil {
431+
effortStr, err := openAIReasoningEffort(c.ModelConfig.ThinkingBudget)
432+
if err != nil {
433+
slog.Error("OpenAI responses request using thinking_budget failed", "error", err)
434+
return nil, err
435+
}
436+
params.Reasoning.Effort = shared.ReasoningEffort(effortStr)
437+
slog.Debug("OpenAI responses request using thinking_budget", "reasoning_effort", effortStr)
398438
}
399-
params.Reasoning.Effort = shared.ReasoningEffort(effortStr)
400-
slog.Debug("OpenAI responses request using thinking_budget", "reasoning_effort", effortStr)
401439
}
402440
}
403441

@@ -1036,6 +1074,14 @@ func isOpenAIReasoningModel(model string) bool {
10361074
strings.HasPrefix(m, "gpt-5")
10371075
}
10381076

1077+
// noThinkingMinOutputTokens is the minimum max-output-token budget for
1078+
// reasoning models when NoThinking is set. Even with low reasoning effort
1079+
// the model still produces hidden reasoning tokens that count against
1080+
// max_output_tokens / max_completion_tokens. A small budget (e.g. 20)
1081+
// gets entirely consumed by reasoning, leaving nothing for visible text.
1082+
// 256 tokens is enough for low-effort reasoning plus a short visible response.
1083+
const noThinkingMinOutputTokens int64 = 256
1084+
10391085
// openAIReasoningEffort validates a ThinkingBudget effort string for the
10401086
// OpenAI API. Returns the effort string or an error.
10411087
func openAIReasoningEffort(b *latest.ThinkingBudget) (string, error) {

pkg/sessiontitle/generator.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,12 @@ const (
2121
systemPrompt = "You are a helpful AI assistant that generates concise, descriptive titles for conversations. You will be given up to 2 recent user messages and asked to create a single-line title that captures the main topic. Never use newlines or line breaks in your response."
2222
userPromptFormat = "Based on the following recent user messages from a conversation with an AI assistant, generate a short, descriptive title (maximum 50 characters) that captures the main topic or purpose of the conversation. Return ONLY the title text on a single line, nothing else. Do not include any newlines, explanations, or formatting.\n\nRecent user messages:\n%s\n\n"
2323

24+
// titleMaxTokens is the max output token budget for title generation.
25+
// This is sized for visible output only (~50 chars ≈ 12-15 tokens).
26+
// Providers that need extra headroom for hidden reasoning tokens
27+
// (e.g. OpenAI reasoning models) handle the adjustment internally.
28+
titleMaxTokens = 20
29+
2430
// titleGenerationTimeout is the maximum time to wait for title generation.
2531
// Title generation should be quick since we disable thinking and use low max_tokens.
2632
// If the API is slow or hanging (e.g., due to server-side thinking), we should timeout.
@@ -103,7 +109,7 @@ func (g *Generator) Generate(ctx context.Context, sessionID string, userMessages
103109
ctx,
104110
baseModel,
105111
options.WithStructuredOutput(nil),
106-
options.WithMaxTokens(20),
112+
options.WithMaxTokens(titleMaxTokens),
107113
options.WithNoThinking(),
108114
options.WithGeneratingTitle(),
109115
)

0 commit comments

Comments (0)