sourcebot-dev · jsourcebot · Jun 20, 2026 · Jun 12, 2026 · Jun 12, 2026 · Jun 12, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Added
+- Added per-step token cost tracking and estimated tool call token usage to Ask Sourcebot chat history. [#1353](https://github.com/sourcebot-dev/sourcebot/pull/1353)
+
 ## [5.0.4] - 2026-06-18
 
 ### Changed

diff --git a/packages/web/src/ee/features/chat/agent.test.ts b/packages/web/src/ee/features/chat/agent.test.ts
@@ -137,7 +137,8 @@ const createAssistantMessage = (parts: SBChatMessagePart[]): SBChatMessage => ({
 });
 
 const createFakeStreamResult = () => ({
-    response: Promise.resolve(new Response()),
+    response: Promise.resolve({ messages: [] }),
+    steps: Promise.resolve([]),
     totalUsage: Promise.resolve({
         inputTokens: 1,
         outputTokens: 1,

diff --git a/packages/web/src/ee/features/chat/agent.ts b/packages/web/src/ee/features/chat/agent.ts
@@ -1,4 +1,5 @@
-import { SBChatMessage, SBChatMessageMetadata } from "@/features/chat/types";
+import { SBChatMessage, SBChatMessageMetadata, StepTokenUsageEntry, ToolTokenUsageEntry } from "@/features/chat/types";
+import { estimateModelToolOutputTokens } from "@/ee/features/chat/tokenEstimation";
 import { getFileSource } from '@/features/git';
 import { isServiceError } from "@/lib/utils";
 import { LanguageModelV3 as AISDKLanguageModelV3 } from "@ai-sdk/provider";
@@ -190,19 +191,76 @@ export const createMessageStream = async ({
             });
 
             const totalUsage = await researchStream.totalUsage;
+            const steps = await researchStream.steps;
+            const response = await researchStream.response;
+
+            // Tool output estimates are derived from `response.messages` rather
+            // than per-step `toolResults` because the response messages cover
+            // tool calls that never run inside a step — approval-gated tools
+            // execute before the step loop, and thrown tool errors are recorded
+            // as `tool-error` parts that `toolResults` excludes. The messages'
+            // `tool-result` parts also carry the output in model-visible form
+            // (`toModelOutput` already applied), which is exactly the payload
+            // whose token footprint we want to estimate.
+            const toolUsageByToolCallId = new Map<string, ToolTokenUsageEntry>(
+                response.messages.flatMap((message) =>
+                    message.role !== 'tool' ? [] : message.content.flatMap((part) =>
+                        part.type !== 'tool-result' ? [] : [[part.toolCallId, {
+                            toolCallId: part.toolCallId,
+                            toolName: part.toolName,
+                            estimatedOutputTokens: estimateModelToolOutputTokens(part.output),
+                        }] as const]
+                    )
+                )
+            );
+
+            // One entry per step, in step order. The UI joins its step groups
+            // to these entries by array position, so the order and count must
+            // mirror the stream's steps exactly. Tool calls nest under the
+            // step they ran in; `content` is matched rather than `toolResults`
+            // so that thrown tool errors (`tool-error` parts, which
+            // `toolResults` excludes) are still attributed to their step.
+            const stepTokenUsage: StepTokenUsageEntry[] = steps.map(({ usage, content }) => ({
+                inputTokens: usage.inputTokens,
+                outputTokens: usage.outputTokens,
+                cacheReadTokens: usage.inputTokenDetails?.cacheReadTokens,
+                tools: content.flatMap((part) => {
+                    if (part.type !== 'tool-result' && part.type !== 'tool-error') {
+                        return [];
+                    }
+                    const entry = toolUsageByToolCallId.get(part.toolCallId);
+                    if (!entry) {
+                        return [];
+                    }
+                    toolUsageByToolCallId.delete(part.toolCallId);
+                    return [entry];
+                }),
+            }));
+
+            // Any estimates left unclaimed belong to tool calls that executed
+            // before the step loop (approval continuations). Their output is
+            // input to this phase's first step, so nest them under it. If the
+            // phase produced no steps, there is nothing to attach them to.
+            if (toolUsageByToolCallId.size > 0 && stepTokenUsage.length > 0) {
+                stepTokenUsage[0].tools.unshift(...toolUsageByToolCallId.values());
+            }
 
             writer.write({
                 type: 'message-metadata',
                 messageMetadata: {
+                    // Spread first so the derived fields below can't be overwritten by caller metadata.
+                    ...metadata,
                     totalTokens: (priorMetadata?.totalTokens ?? 0) + (totalUsage.totalTokens ?? 0),
                     totalInputTokens: (priorMetadata?.totalInputTokens ?? 0) + (totalUsage.inputTokens ?? 0),
                     totalOutputTokens: (priorMetadata?.totalOutputTokens ?? 0) + (totalUsage.outputTokens ?? 0),
                     totalCacheReadTokens: (priorMetadata?.totalCacheReadTokens ?? 0) + (totalUsage.inputTokenDetails?.cacheReadTokens ?? 0),
                     totalCacheWriteTokens: (priorMetadata?.totalCacheWriteTokens ?? 0) + (totalUsage.inputTokenDetails?.cacheWriteTokens ?? 0),
                     totalResponseTimeMs: (priorMetadata?.totalResponseTimeMs ?? 0) + (new Date().getTime() - startTime.getTime()),
+                    // Concatenated (not summed) across approval-continuation
+                    // phases so earlier phases' steps are preserved in order.
+                    stepTokenUsage: [...(priorMetadata?.stepTokenUsage ?? []), ...stepTokenUsage],
                     modelName,
                     traceId,
-                    ...metadata,
                 }
             });
 
@@ -430,6 +488,13 @@ const createAgentStream = async ({
                 logger.warn(`Tool call repair failed for "${toolCall.toolName}": ${error.message}`);
                 return null;
             },
+            // Token usage collection deliberately does NOT happen here: the SDK
+            // awaits this callback before starting the next step, so it must
+            // stay cheap, and `toolResults` misses tool calls that never run
+            // inside a step (approval-gated tools execute before the step loop)
+            // as well as thrown tool errors (recorded as `tool-error` parts).
+            // Both are instead derived post-stream in `createMessageStream`
+            // from `steps` and `response.messages`.
             onStepFinish: ({ toolResults }) => {
                 toolResults.forEach(({ output, dynamic }) => {
                     if (dynamic || isServiceError(output)) {

diff --git a/packages/web/src/ee/features/chat/components/chatThread/chatThreadListItem.tsx b/packages/web/src/ee/features/chat/components/chatThread/chatThreadListItem.tsx
@@ -91,33 +91,57 @@ const ChatThreadListItemComponent = forwardRef<HTMLDivElement, ChatThreadListIte
     // should be visible to the user. By "steps", we mean parts that originated
     // from the same LLM invocation. By "visibile", we mean parts that have some
     // visual representation in the UI (e.g., text, reasoning, tool calls, etc.).
-    const uiVisibleThinkingSteps = useMemo(() => {
-        const steps = groupMessageIntoSteps(assistantMessage?.parts ?? []);
-
-        // Filter out the answerPart and empty steps
-        return steps
-            .map(
-                (step) => step
-                    // First, filter out any parts that are not text
-                    .filter((part) => {
-                        if (part.type === 'text') {
-                            return !part.text.includes(ANSWER_TAG);
-                        }
-
-                        return true;
-                    })
-                    .filter((part) => {
-                        // Only include text, reasoning, and tool parts
-                        return (
-                            part.type === 'text' ||
-                            part.type === 'reasoning' ||
-                            part.type.startsWith('tool-') ||
-                            part.type === 'dynamic-tool'
-                        )
-                    })
-            )
+    //
+    // Each step is tagged with its stepIndex — the invocation's position in
+    // the turn, which indexes into `metadata.stepTokenUsage`. Indices are
+    // assigned by counting 'step-start' markers (one per invocation) BEFORE
+    // any filtering, so dropping empty or answer-only steps below cannot
+    // shift the indices of the steps that remain.
+    const { uiVisibleThinkingSteps, answerStepIndex } = useMemo(() => {
+        const groupedParts = groupMessageIntoSteps(assistantMessage?.parts ?? []);
+
+        // Parts written before the first step-start (e.g. data parts) don't
+        // belong to any step; they get stepIndex -1 and are expected to be
+        // dropped by the visibility filters below.
+        let stepIndex = -1;
+        let answerStepIndex: number | undefined = undefined;
+
+        const steps = groupedParts
+            .map((stepParts) => {
+                if (stepParts[0]?.type === 'step-start') {
+                    stepIndex++;
+                }
+
+                if (stepParts.some((part) => part.type === 'text' && part.text.includes(ANSWER_TAG))) {
+                    answerStepIndex = stepIndex;
+                }
+
+                return {
+                    stepIndex,
+                    parts: stepParts
+                        // First, filter out the answer text
+                        .filter((part) => {
+                            if (part.type === 'text') {
+                                return !part.text.includes(ANSWER_TAG);
+                            }
+
+                            return true;
+                        })
+                        .filter((part) => {
+                            // Only include text, reasoning, and tool parts
+                            return (
+                                part.type === 'text' ||
+                                part.type === 'reasoning' ||
+                                part.type.startsWith('tool-') ||
+                                part.type === 'dynamic-tool'
+                            )
+                        }),
+                };
+            })
             // Then, filter out any steps that are empty
-            .filter(step => step.length > 0);
+            .filter((step) => step.parts.length > 0);
+
+        return { uiVisibleThinkingSteps: steps, answerStepIndex };
     }, [assistantMessage?.parts]);
 
     // "thinking" is when the agent is generating output that is not the answer.
@@ -379,6 +403,7 @@ const ChatThreadListItemComponent = forwardRef<HTMLDivElement, ChatThreadListIte
                             isNetworkActive={isNetworkActive}
                             isAwaitingToolApproval={isAwaitingToolApproval}
                             thinkingSteps={uiVisibleThinkingSteps}
+                            answerStepIndex={answerStepIndex}
                             metadata={assistantMessage?.metadata}
                         />
 

diff --git a/packages/web/src/ee/features/chat/components/chatThread/detailsCard.test.tsx b/packages/web/src/ee/features/chat/components/chatThread/detailsCard.test.tsx
@@ -111,7 +111,7 @@ describe('DetailsCard', () => {
                     isTurnInProgress={true}
                     isNetworkActive={false}
                     isAwaitingToolApproval={false}
-                    thinkingSteps={[[failedActivationPart]]}
+                    thinkingSteps={[{ stepIndex: 0, parts: [failedActivationPart] }]}
                 />
             </TooltipProvider>
         );