including missing tool log inputs

2026-02-05 09:08:42 -08:00
parent 7519c73f2a
commit 305b880b1d
3 changed files with 54 additions and 27 deletions
@@ -74,3 +74,4 @@ exports/*

 docs/github-issues/*
 core/tests/*dumps/*
+screenshots/*
@@ -305,6 +305,7 @@ class EventLoopNode(NodeProtocol):
                real_tool_results,
                outputs_set,
                turn_tokens,
+                logged_tool_calls,
            ) = await self._run_single_turn(ctx, conversation, tools, iteration, accumulator)
            logger.info(
                "[%s] iter=%d: LLM done — text=%d chars, real_tools=%d, "
@@ -370,7 +371,7 @@ class EventLoopNode(NodeProtocol):
                        step_index=iteration,
                        verdict="CONTINUE",
                        verdict_feedback="Stall detected before judge evaluation",
-                        tool_calls=real_tool_results,
+                        tool_calls=logged_tool_calls,
                        llm_text=assistant_text,
                        input_tokens=turn_tokens.get("input", 0),
                        output_tokens=turn_tokens.get("output", 0),
@@ -430,7 +431,7 @@ class EventLoopNode(NodeProtocol):
                            step_index=iteration,
                            verdict="CONTINUE",
                            verdict_feedback="Shutdown signaled (client-facing)",
-                            tool_calls=real_tool_results,
+                            tool_calls=logged_tool_calls,
                            llm_text=assistant_text,
                            input_tokens=turn_tokens.get("input", 0),
                            output_tokens=turn_tokens.get("output", 0),
@@ -474,7 +475,7 @@ class EventLoopNode(NodeProtocol):
                            step_index=iteration,
                            verdict="CONTINUE",
                            verdict_feedback="No input received (shutdown during wait)",
-                            tool_calls=real_tool_results,
+                            tool_calls=logged_tool_calls,
                            llm_text=assistant_text,
                            input_tokens=turn_tokens.get("input", 0),
                            output_tokens=turn_tokens.get("output", 0),
@@ -524,7 +525,7 @@ class EventLoopNode(NodeProtocol):
                        step_index=iteration,
                        verdict="CONTINUE",
                        verdict_feedback="Unjudged (judge_every_n_turns skip)",
-                        tool_calls=real_tool_results,
+                        tool_calls=logged_tool_calls,
                        llm_text=assistant_text,
                        input_tokens=turn_tokens.get("input", 0),
                        output_tokens=turn_tokens.get("output", 0),
@@ -579,7 +580,7 @@ class EventLoopNode(NodeProtocol):
                                verdict_feedback=(
                                    f"Judge accepted but missing output keys: {missing}"
                                ),
-                                tool_calls=real_tool_results,
+                                tool_calls=logged_tool_calls,
                                llm_text=assistant_text,
                                input_tokens=turn_tokens.get("input", 0),
                                output_tokens=turn_tokens.get("output", 0),
@@ -603,7 +604,7 @@ class EventLoopNode(NodeProtocol):
                            step_index=iteration,
                            verdict="ACCEPT",
                            verdict_feedback=verdict.feedback,
-                            tool_calls=real_tool_results,
+                            tool_calls=logged_tool_calls,
                            llm_text=assistant_text,
                            input_tokens=turn_tokens.get("input", 0),
                            output_tokens=turn_tokens.get("output", 0),
@@ -645,7 +646,7 @@ class EventLoopNode(NodeProtocol):
                            step_index=iteration,
                            verdict="ESCALATE",
                            verdict_feedback=verdict.feedback,
-                            tool_calls=real_tool_results,
+                            tool_calls=logged_tool_calls,
                            llm_text=assistant_text,
                            input_tokens=turn_tokens.get("input", 0),
                            output_tokens=turn_tokens.get("output", 0),
@@ -686,7 +687,7 @@ class EventLoopNode(NodeProtocol):
                            step_index=iteration,
                            verdict="RETRY",
                            verdict_feedback=verdict.feedback,
-                            tool_calls=real_tool_results,
+                            tool_calls=logged_tool_calls,
                            llm_text=assistant_text,
                            input_tokens=turn_tokens.get("input", 0),
                            output_tokens=turn_tokens.get("output", 0),
@@ -774,16 +775,21 @@ class EventLoopNode(NodeProtocol):
        tools: list[Tool],
        iteration: int,
        accumulator: OutputAccumulator,
-    ) -> tuple[str, list[dict], list[str], dict[str, int]]:
+    ) -> tuple[str, list[dict], list[str], dict[str, int], list[dict]]:
        """Run a single LLM turn with streaming and tool execution.

-        Returns (assistant_text, real_tool_results, outputs_set, token_counts).
+        Returns (assistant_text, real_tool_results, outputs_set, token_counts, logged_tool_calls).

        ``real_tool_results`` contains only results from actual tools (web_search,
        etc.), NOT from the synthetic ``set_output`` tool.  ``outputs_set`` lists
        the output keys written via ``set_output`` during this turn.  This
        separation lets the caller treat set_output as a framework concern
        rather than a tool-execution concern.
+
+        ``logged_tool_calls`` accumulates ALL tool calls across inner iterations
+        (real tools, set_output, and discarded calls) for L3 logging.  Unlike
+        ``real_tool_results`` which resets each inner iteration, this list grows
+        across the entire turn.
        """
        stream_id = ctx.node_id
        node_id = ctx.node_id
@@ -792,6 +798,9 @@ class EventLoopNode(NodeProtocol):
        final_text = ""
        # Track output keys set via set_output across all inner iterations
        outputs_set_this_turn: list[str] = []
+        # Accumulate ALL tool calls across inner iterations for L3 logging.
+        # Unlike real_tool_results (reset each inner iteration), this persists.
+        logged_tool_calls: list[dict] = []

        # Inner tool loop: stream may produce tool calls requiring re-invocation
        while True:
@@ -862,7 +871,7 @@ class EventLoopNode(NodeProtocol):

            # If no tool calls, turn is complete
            if not tool_calls:
-                return final_text, [], outputs_set_this_turn, token_counts
+                return final_text, [], outputs_set_this_turn, token_counts, logged_tool_calls

            # Execute tool calls — separate real tools from set_output
            real_tool_results: list[dict] = []
@@ -909,18 +918,28 @@ class EventLoopNode(NodeProtocol):
                                pass
                        await accumulator.set(tc.tool_input["key"], value)
                        outputs_set_this_turn.append(tc.tool_input["key"])
-                else:
-                    # --- Real tool execution ---
-                    result = await self._execute_tool(tc)
-                    result = self._truncate_tool_result(result, tc.tool_name)
-                    real_tool_results.append(
+                    logged_tool_calls.append(
                        {
                            "tool_use_id": tc.tool_use_id,
-                            "tool_name": tc.tool_name,
+                            "tool_name": "set_output",
+                            "tool_input": tc.tool_input,
                            "content": result.content,
                            "is_error": result.is_error,
                        }
                    )
+                else:
+                    # --- Real tool execution ---
+                    result = await self._execute_tool(tc)
+                    result = self._truncate_tool_result(result, tc.tool_name)
+                    tool_entry = {
+                        "tool_use_id": tc.tool_use_id,
+                        "tool_name": tc.tool_name,
+                        "tool_input": tc.tool_input,
+                        "content": result.content,
+                        "is_error": result.is_error,
+                    }
+                    real_tool_results.append(tool_entry)
+                    logged_tool_calls.append(tool_entry)

                # Record tool result in conversation (both real and set_output
                # go into the conversation for LLM context continuity)
@@ -967,15 +986,15 @@ class EventLoopNode(NodeProtocol):
                    )
                    # Discarded calls go into real_tool_results so the
                    # caller sees they were attempted (for judge context).
-                    real_tool_results.append(
-                        {
-                            "tool_use_id": tc.tool_use_id,
-                            "tool_name": tc.tool_name,
-                            "tool_input": tc.tool_input,
-                            "content": discard_msg,
-                            "is_error": True,
-                        }
-                    )
+                    discard_entry = {
+                        "tool_use_id": tc.tool_use_id,
+                        "tool_name": tc.tool_name,
+                        "tool_input": tc.tool_input,
+                        "content": discard_msg,
+                        "is_error": True,
+                    }
+                    real_tool_results.append(discard_entry)
+                    logged_tool_calls.append(discard_entry)
                # Prune old tool results NOW to prevent context bloat on the
                # next turn.  The char-based token estimator underestimates
                # actual API tokens, so the standard compaction check in the
@@ -993,7 +1012,13 @@ class EventLoopNode(NodeProtocol):
                    )
                # Limit hit — return from this turn so the judge can
                # evaluate instead of looping back for another stream.
-                return final_text, real_tool_results, outputs_set_this_turn, token_counts
+                return (
+                    final_text,
+                    real_tool_results,
+                    outputs_set_this_turn,
+                    token_counts,
+                    logged_tool_calls,
+                )

            # --- Mid-turn pruning: prevent context blowup within a single turn ---
            if conversation.usage_ratio() >= 0.6:
@@ -468,6 +468,7 @@ def create_agent_runtime(
        tools: Available tools
        tool_executor: Tool executor function
        config: Runtime configuration
+        runtime_log_store: Optional RuntimeLogStore for per-execution logging

    Returns:
        Configured AgentRuntime (not yet started)