chore: update gitignore

This commit is contained in:
Timothy
2026-02-02 10:30:03 -08:00
parent 3240616808
commit 22e816bf86
5 changed files with 2432 additions and 1 deletions
+746
View File
@@ -0,0 +1,746 @@
"""WP-8: Tests for EventLoopNode, OutputAccumulator, LoopConfig, JudgeProtocol.
Uses real FileConversationStore (no mocks for storage) and a MockStreamingLLM
that yields pre-programmed StreamEvents to control the loop deterministically.
"""
from __future__ import annotations
import asyncio
from collections.abc import AsyncIterator
from typing import Any
from unittest.mock import AsyncMock, MagicMock
import pytest
from framework.graph.conversation import NodeConversation
from framework.graph.event_loop_node import (
EventLoopNode,
JudgeProtocol,
JudgeVerdict,
LoopConfig,
OutputAccumulator,
)
from framework.graph.node import NodeContext, NodeProtocol, NodeSpec, SharedMemory
from framework.llm.provider import LLMProvider, LLMResponse, Tool, ToolResult, ToolUse
from framework.llm.stream_events import (
FinishEvent,
StreamErrorEvent,
TextDeltaEvent,
ToolCallEvent,
)
from framework.runtime.core import Runtime
from framework.runtime.event_bus import EventBus, EventType
from framework.storage.conversation_store import FileConversationStore
# ---------------------------------------------------------------------------
# Mock LLM that yields pre-programmed stream events
# ---------------------------------------------------------------------------
class MockStreamingLLM(LLMProvider):
"""Mock LLM that yields pre-programmed StreamEvent sequences.
Each call to stream() consumes the next scenario from the list.
Cycles back to the beginning if more calls are made than scenarios.
"""
def __init__(self, scenarios: list[list] | None = None):
self.scenarios = scenarios or []
self._call_index = 0
self.stream_calls: list[dict] = []
async def stream(
self,
messages: list[dict[str, Any]],
system: str = "",
tools: list[Tool] | None = None,
max_tokens: int = 4096,
) -> AsyncIterator:
self.stream_calls.append({"messages": messages, "system": system, "tools": tools})
if not self.scenarios:
return
events = self.scenarios[self._call_index % len(self.scenarios)]
self._call_index += 1
for event in events:
yield event
def complete(self, messages, system="", **kwargs) -> LLMResponse:
return LLMResponse(content="Summary of conversation.", model="mock", stop_reason="stop")
def complete_with_tools(self, messages, system, tools, tool_executor, **kwargs) -> LLMResponse:
return LLMResponse(content="", model="mock", stop_reason="stop")
# ---------------------------------------------------------------------------
# Helper: build a simple text-only scenario
# ---------------------------------------------------------------------------
def text_scenario(text: str, input_tokens: int = 10, output_tokens: int = 5) -> list:
"""Build a stream scenario that produces text and finishes."""
return [
TextDeltaEvent(content=text, snapshot=text),
FinishEvent(
stop_reason="stop", input_tokens=input_tokens, output_tokens=output_tokens, model="mock"
),
]
def tool_call_scenario(
tool_name: str,
tool_input: dict,
tool_use_id: str = "call_1",
text: str = "",
) -> list:
"""Build a stream scenario that produces a tool call."""
events = []
if text:
events.append(TextDeltaEvent(content=text, snapshot=text))
events.append(
ToolCallEvent(tool_use_id=tool_use_id, tool_name=tool_name, tool_input=tool_input)
)
events.append(
FinishEvent(stop_reason="tool_calls", input_tokens=10, output_tokens=5, model="mock")
)
return events
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
@pytest.fixture
def runtime():
rt = MagicMock(spec=Runtime)
rt.start_run = MagicMock(return_value="run_1")
rt.decide = MagicMock(return_value="dec_1")
rt.record_outcome = MagicMock()
rt.end_run = MagicMock()
rt.report_problem = MagicMock()
rt.set_node = MagicMock()
return rt
@pytest.fixture
def node_spec():
return NodeSpec(
id="test_loop",
name="Test Loop",
description="A test event loop node",
node_type="event_loop",
output_keys=["result"],
system_prompt="You are a test assistant.",
)
@pytest.fixture
def memory():
return SharedMemory()
def build_ctx(runtime, node_spec, memory, llm, tools=None, input_data=None, goal_context=""):
"""Build a NodeContext for testing."""
return NodeContext(
runtime=runtime,
node_id=node_spec.id,
node_spec=node_spec,
memory=memory,
input_data=input_data or {},
llm=llm,
available_tools=tools or [],
goal_context=goal_context,
)
# ===========================================================================
# NodeProtocol conformance
# ===========================================================================
class TestNodeProtocolConformance:
def test_subclasses_node_protocol(self):
"""EventLoopNode must be a subclass of NodeProtocol."""
assert issubclass(EventLoopNode, NodeProtocol)
def test_has_execute_method(self):
node = EventLoopNode()
assert hasattr(node, "execute")
assert asyncio.iscoroutinefunction(node.execute)
def test_has_validate_input(self):
node = EventLoopNode()
assert hasattr(node, "validate_input")
# ===========================================================================
# Basic loop execution
# ===========================================================================
class TestBasicLoop:
@pytest.mark.asyncio
async def test_basic_text_only_implicit_accept(self, runtime, node_spec, memory):
"""No tools, no judge. LLM produces text, implicit accept on stop."""
# Override to no output_keys so implicit judge accepts immediately
node_spec.output_keys = []
llm = MockStreamingLLM(scenarios=[text_scenario("Hello world")])
ctx = build_ctx(runtime, node_spec, memory, llm)
node = EventLoopNode(config=LoopConfig(max_iterations=5))
result = await node.execute(ctx)
assert result.success is True
assert result.tokens_used > 0
@pytest.mark.asyncio
async def test_no_llm_returns_failure(self, runtime, node_spec, memory):
"""ctx.llm=None should return failure immediately."""
ctx = build_ctx(runtime, node_spec, memory, llm=None)
node = EventLoopNode()
result = await node.execute(ctx)
assert result.success is False
assert "LLM" in result.error
@pytest.mark.asyncio
async def test_max_iterations_failure(self, runtime, node_spec, memory):
"""When max_iterations is reached without acceptance, should fail."""
# LLM always produces text but never calls set_output, so implicit
# judge retries asking for missing keys
llm = MockStreamingLLM(scenarios=[text_scenario("thinking...")])
ctx = build_ctx(runtime, node_spec, memory, llm)
node = EventLoopNode(config=LoopConfig(max_iterations=2))
result = await node.execute(ctx)
assert result.success is False
assert "Max iterations" in result.error
# ===========================================================================
# Judge integration
# ===========================================================================
class TestJudgeIntegration:
@pytest.mark.asyncio
async def test_judge_accept(self, runtime, node_spec, memory):
"""Mock judge ACCEPT -> success."""
node_spec.output_keys = []
llm = MockStreamingLLM(scenarios=[text_scenario("Done!")])
judge = AsyncMock(spec=JudgeProtocol)
judge.evaluate = AsyncMock(return_value=JudgeVerdict(action="ACCEPT"))
ctx = build_ctx(runtime, node_spec, memory, llm)
node = EventLoopNode(judge=judge, config=LoopConfig(max_iterations=5))
result = await node.execute(ctx)
assert result.success is True
judge.evaluate.assert_called_once()
@pytest.mark.asyncio
async def test_judge_escalate(self, runtime, node_spec, memory):
"""Mock judge ESCALATE -> failure."""
node_spec.output_keys = []
llm = MockStreamingLLM(scenarios=[text_scenario("Attempt")])
judge = AsyncMock(spec=JudgeProtocol)
judge.evaluate = AsyncMock(
return_value=JudgeVerdict(action="ESCALATE", feedback="Tone violation")
)
ctx = build_ctx(runtime, node_spec, memory, llm)
node = EventLoopNode(judge=judge, config=LoopConfig(max_iterations=5))
result = await node.execute(ctx)
assert result.success is False
assert "escalated" in result.error.lower()
assert "Tone violation" in result.error
@pytest.mark.asyncio
async def test_judge_retry_then_accept(self, runtime, node_spec, memory):
"""RETRY twice, then ACCEPT. Should run 3 iterations."""
node_spec.output_keys = []
llm = MockStreamingLLM(
scenarios=[
text_scenario("attempt 1"),
text_scenario("attempt 2"),
text_scenario("attempt 3"),
]
)
call_count = 0
async def evaluate_fn(context):
nonlocal call_count
call_count += 1
if call_count < 3:
return JudgeVerdict(action="RETRY", feedback="Try harder")
return JudgeVerdict(action="ACCEPT")
judge = AsyncMock(spec=JudgeProtocol)
judge.evaluate = AsyncMock(side_effect=evaluate_fn)
ctx = build_ctx(runtime, node_spec, memory, llm)
node = EventLoopNode(judge=judge, config=LoopConfig(max_iterations=10))
result = await node.execute(ctx)
assert result.success is True
assert call_count == 3
# ===========================================================================
# set_output tool
# ===========================================================================
class TestSetOutput:
@pytest.mark.asyncio
async def test_set_output_accumulates(self, runtime, node_spec, memory):
"""LLM calls set_output -> values appear in NodeResult.output."""
llm = MockStreamingLLM(
scenarios=[
# Turn 1: call set_output
tool_call_scenario("set_output", {"key": "result", "value": "42"}),
# Turn 2: text response (triggers implicit judge)
text_scenario("Done, result is 42"),
]
)
ctx = build_ctx(runtime, node_spec, memory, llm)
node = EventLoopNode(config=LoopConfig(max_iterations=5))
result = await node.execute(ctx)
assert result.success is True
assert result.output["result"] == "42"
@pytest.mark.asyncio
async def test_set_output_rejects_invalid_key(self, runtime, node_spec, memory):
"""set_output with key not in output_keys -> is_error=True."""
llm = MockStreamingLLM(
scenarios=[
# Turn 1: call set_output with bad key
tool_call_scenario("set_output", {"key": "bad_key", "value": "x"}),
# Turn 2: call set_output with good key
tool_call_scenario("set_output", {"key": "result", "value": "ok"}),
# Turn 3: text done
text_scenario("Done"),
]
)
ctx = build_ctx(runtime, node_spec, memory, llm)
node = EventLoopNode(config=LoopConfig(max_iterations=5))
result = await node.execute(ctx)
assert result.success is True
assert result.output["result"] == "ok"
assert "bad_key" not in result.output
@pytest.mark.asyncio
async def test_missing_keys_triggers_retry(self, runtime, node_spec, memory):
"""Judge accepts but output keys are missing -> retry with hint."""
judge = AsyncMock(spec=JudgeProtocol)
judge.evaluate = AsyncMock(return_value=JudgeVerdict(action="ACCEPT"))
llm = MockStreamingLLM(
scenarios=[
# Turn 1: text without set_output -> judge accepts but keys missing -> retry
text_scenario("I'll get to it"),
# Turn 2: set_output
tool_call_scenario("set_output", {"key": "result", "value": "done"}),
# Turn 3: text -> judge accepts, keys present -> success
text_scenario("All done"),
]
)
ctx = build_ctx(runtime, node_spec, memory, llm)
node = EventLoopNode(judge=judge, config=LoopConfig(max_iterations=5))
result = await node.execute(ctx)
assert result.success is True
assert result.output["result"] == "done"
# ===========================================================================
# Stall detection
# ===========================================================================
class TestStallDetection:
@pytest.mark.asyncio
async def test_stall_detection(self, runtime, node_spec, memory):
"""3 identical responses should trigger stall detection."""
node_spec.output_keys = [] # so implicit judge would accept
# But we need the judge to RETRY so we actually get 3 identical responses
judge = AsyncMock(spec=JudgeProtocol)
judge.evaluate = AsyncMock(return_value=JudgeVerdict(action="RETRY"))
llm = MockStreamingLLM(scenarios=[text_scenario("same answer")])
ctx = build_ctx(runtime, node_spec, memory, llm)
node = EventLoopNode(
judge=judge,
config=LoopConfig(max_iterations=10, stall_detection_threshold=3),
)
result = await node.execute(ctx)
assert result.success is False
assert "stalled" in result.error.lower()
# ===========================================================================
# EventBus lifecycle events
# ===========================================================================
class TestEventBusLifecycle:
@pytest.mark.asyncio
async def test_lifecycle_events_published(self, runtime, node_spec, memory):
"""NODE_LOOP_STARTED, NODE_LOOP_ITERATION, NODE_LOOP_COMPLETED should be published."""
node_spec.output_keys = []
llm = MockStreamingLLM(scenarios=[text_scenario("ok")])
bus = EventBus()
received_events = []
bus.subscribe(
event_types=[
EventType.NODE_LOOP_STARTED,
EventType.NODE_LOOP_ITERATION,
EventType.NODE_LOOP_COMPLETED,
],
handler=lambda e: received_events.append(e.type),
)
ctx = build_ctx(runtime, node_spec, memory, llm)
node = EventLoopNode(event_bus=bus, config=LoopConfig(max_iterations=5))
result = await node.execute(ctx)
assert result.success is True
assert EventType.NODE_LOOP_STARTED in received_events
assert EventType.NODE_LOOP_ITERATION in received_events
assert EventType.NODE_LOOP_COMPLETED in received_events
@pytest.mark.asyncio
async def test_client_facing_uses_client_output_delta(self, runtime, memory):
"""client_facing=True should emit CLIENT_OUTPUT_DELTA instead of LLM_TEXT_DELTA."""
spec = NodeSpec(
id="ui_node",
name="UI Node",
description="Streams to user",
node_type="event_loop",
output_keys=[],
client_facing=True,
)
llm = MockStreamingLLM(scenarios=[text_scenario("visible to user")])
bus = EventBus()
received_types = []
bus.subscribe(
event_types=[EventType.CLIENT_OUTPUT_DELTA, EventType.LLM_TEXT_DELTA],
handler=lambda e: received_types.append(e.type),
)
ctx = build_ctx(runtime, spec, memory, llm)
node = EventLoopNode(event_bus=bus, config=LoopConfig(max_iterations=5))
await node.execute(ctx)
assert EventType.CLIENT_OUTPUT_DELTA in received_types
assert EventType.LLM_TEXT_DELTA not in received_types
# ===========================================================================
# Tool execution
# ===========================================================================
class TestToolExecution:
@pytest.mark.asyncio
async def test_tool_execution_feedback(self, runtime, node_spec, memory):
"""Tool call -> result fed back to conversation via stream loop."""
node_spec.output_keys = []
def my_tool_executor(tool_use: ToolUse) -> ToolResult:
return ToolResult(
tool_use_id=tool_use.id,
content=f"Result for {tool_use.name}",
is_error=False,
)
llm = MockStreamingLLM(
scenarios=[
# Turn 1: call a tool
tool_call_scenario("search", {"query": "test"}, tool_use_id="call_search"),
# Turn 2: text response after seeing tool result
text_scenario("Found the answer"),
]
)
ctx = build_ctx(
runtime,
node_spec,
memory,
llm,
tools=[Tool(name="search", description="Search", parameters={})],
)
node = EventLoopNode(
tool_executor=my_tool_executor,
config=LoopConfig(max_iterations=5),
)
result = await node.execute(ctx)
assert result.success is True
# stream() should have been called twice (tool call turn + final text turn)
assert llm._call_index >= 2
# ===========================================================================
# Write-through persistence with real FileConversationStore
# ===========================================================================
class TestWriteThroughPersistence:
@pytest.mark.asyncio
async def test_messages_written_to_store(self, tmp_path, runtime, node_spec, memory):
"""Messages should be persisted immediately via write-through."""
store = FileConversationStore(tmp_path / "conv")
node_spec.output_keys = []
llm = MockStreamingLLM(scenarios=[text_scenario("Hello")])
ctx = build_ctx(runtime, node_spec, memory, llm)
node = EventLoopNode(
conversation_store=store,
config=LoopConfig(max_iterations=5),
)
result = await node.execute(ctx)
assert result.success is True
# Verify parts were written to disk
parts = await store.read_parts()
assert len(parts) >= 2 # at least initial user msg + assistant msg
@pytest.mark.asyncio
async def test_output_accumulator_write_through(self, tmp_path, runtime, node_spec, memory):
"""set_output values should be persisted in cursor immediately."""
store = FileConversationStore(tmp_path / "conv")
llm = MockStreamingLLM(
scenarios=[
tool_call_scenario("set_output", {"key": "result", "value": "persisted_value"}),
text_scenario("Done"),
]
)
ctx = build_ctx(runtime, node_spec, memory, llm)
node = EventLoopNode(
conversation_store=store,
config=LoopConfig(max_iterations=5),
)
result = await node.execute(ctx)
assert result.success is True
assert result.output["result"] == "persisted_value"
# Verify output was written to cursor on disk
cursor = await store.read_cursor()
assert cursor is not None
assert cursor["outputs"]["result"] == "persisted_value"
# ===========================================================================
# Crash recovery (restore from real FileConversationStore)
# ===========================================================================
class TestCrashRecovery:
@pytest.mark.asyncio
async def test_restore_from_checkpoint(self, tmp_path, runtime, node_spec, memory):
"""Populate a store with state, then verify EventLoopNode restores from it."""
store = FileConversationStore(tmp_path / "conv")
# Simulate a previous run that wrote conversation + cursor
conv = NodeConversation(
system_prompt="You are a test assistant.",
output_keys=["result"],
store=store,
)
await conv.add_user_message("Initial input")
await conv.add_assistant_message("Working on it...")
# Write cursor with iteration and outputs
await store.write_cursor(
{
"iteration": 1,
"next_seq": conv.next_seq,
"outputs": {"result": "partial_value"},
}
)
# Now create a new EventLoopNode and execute -- it should restore
node_spec.output_keys = [] # no required keys so implicit accept works
llm = MockStreamingLLM(scenarios=[text_scenario("Continuing...")])
ctx = build_ctx(runtime, node_spec, memory, llm)
node = EventLoopNode(
conversation_store=store,
config=LoopConfig(max_iterations=5),
)
result = await node.execute(ctx)
assert result.success is True
# Should have the restored output
assert result.output.get("result") == "partial_value"
# ===========================================================================
# External event injection
# ===========================================================================
class TestEventInjection:
@pytest.mark.asyncio
async def test_inject_event(self, runtime, node_spec, memory):
"""inject_event() content should appear as user message in next iteration."""
node_spec.output_keys = []
judge_calls = []
async def evaluate_fn(context):
judge_calls.append(context)
if len(judge_calls) >= 2:
return JudgeVerdict(action="ACCEPT")
return JudgeVerdict(action="RETRY")
judge = AsyncMock(spec=JudgeProtocol)
judge.evaluate = AsyncMock(side_effect=evaluate_fn)
llm = MockStreamingLLM(
scenarios=[
text_scenario("iteration 1"),
text_scenario("iteration 2"),
]
)
ctx = build_ctx(runtime, node_spec, memory, llm)
node = EventLoopNode(
judge=judge,
config=LoopConfig(max_iterations=5),
)
# Pre-inject an event before execute runs
await node.inject_event("Priority: CEO wants meeting rescheduled")
result = await node.execute(ctx)
assert result.success is True
# Verify the injected content made it into the LLM messages
all_messages = []
for call in llm.stream_calls:
all_messages.extend(call["messages"])
injected_found = any("[External event]" in str(m.get("content", "")) for m in all_messages)
assert injected_found
# ===========================================================================
# Pause/resume
# ===========================================================================
class TestPauseResume:
@pytest.mark.asyncio
async def test_pause_returns_early(self, runtime, node_spec, memory):
"""pause_requested in input_data should trigger early return."""
node_spec.output_keys = []
llm = MockStreamingLLM(scenarios=[text_scenario("should not run")])
ctx = build_ctx(
runtime,
node_spec,
memory,
llm,
input_data={"pause_requested": True},
)
node = EventLoopNode(config=LoopConfig(max_iterations=10))
result = await node.execute(ctx)
# Should return success (paused, not failed)
assert result.success is True
# LLM should not have been called (paused before first turn)
assert llm._call_index == 0
# ===========================================================================
# Stream errors
# ===========================================================================
class TestStreamErrors:
@pytest.mark.asyncio
async def test_non_recoverable_stream_error_raises(self, runtime, node_spec, memory):
"""Non-recoverable StreamErrorEvent should raise RuntimeError."""
node_spec.output_keys = []
llm = MockStreamingLLM(
scenarios=[
[StreamErrorEvent(error="Connection lost", recoverable=False)],
]
)
ctx = build_ctx(runtime, node_spec, memory, llm)
node = EventLoopNode(config=LoopConfig(max_iterations=5))
with pytest.raises(RuntimeError, match="Stream error"):
await node.execute(ctx)
# ===========================================================================
# OutputAccumulator unit tests
# ===========================================================================
class TestOutputAccumulator:
@pytest.mark.asyncio
async def test_set_and_get(self):
acc = OutputAccumulator()
await acc.set("key1", "value1")
assert acc.get("key1") == "value1"
assert acc.get("nonexistent") is None
@pytest.mark.asyncio
async def test_to_dict(self):
acc = OutputAccumulator()
await acc.set("a", 1)
await acc.set("b", 2)
assert acc.to_dict() == {"a": 1, "b": 2}
@pytest.mark.asyncio
async def test_has_all_keys(self):
acc = OutputAccumulator()
assert acc.has_all_keys([]) is True
assert acc.has_all_keys(["x"]) is False
await acc.set("x", "val")
assert acc.has_all_keys(["x"]) is True
@pytest.mark.asyncio
async def test_write_through_to_real_store(self, tmp_path):
"""OutputAccumulator should write through to FileConversationStore cursor."""
store = FileConversationStore(tmp_path / "acc_test")
acc = OutputAccumulator(store=store)
await acc.set("result", "hello")
cursor = await store.read_cursor()
assert cursor["outputs"]["result"] == "hello"
@pytest.mark.asyncio
async def test_restore_from_real_store(self, tmp_path):
"""OutputAccumulator.restore() should rebuild from FileConversationStore."""
store = FileConversationStore(tmp_path / "acc_restore")
await store.write_cursor({"outputs": {"key1": "val1", "key2": "val2"}})
acc = await OutputAccumulator.restore(store)
assert acc.get("key1") == "val1"
assert acc.get("key2") == "val2"
assert acc.has_all_keys(["key1", "key2"]) is True