e5a93b059f
* fix(tests): resolve test failures across framework and tools

Framework tests (52 -> 1 failure):
- Add missing `model` attribute to mock LLM classes (MockStreamingLLM, CrashingLLM, ErrorThenSuccessLLM, etc.) to match new agent_loop.py requirement at line 624
- Update skill count assertions from 6 to 7 (new writing-hive-skills)
- Fix phase compaction test to match new message format (no brackets)
- Update model catalog test for current gemini model names
- Fix queen memory test: set phase="building" to match prompt_building, adjust reflection trigger count to match cooldown behavior

Tools tests (52 -> 0 failures):
- Update csv_tool tests: remove agent_id parameter, use absolute paths, patch _ALLOWED_ROOTS instead of AGENT_SANDBOXES_DIR
- Fix browser_evaluate test to allow toast wrapper around script

Remaining: 1 pre-existing failure in test_worker_report where the mock LLM gets stuck when scenarios are exhausted (separate bug).

* fix(tests): resolve remaining test failures

- Add text stop scenario to test_worker_report so the worker terminates cleanly after tool_calls finish instead of replaying the last scenario forever
- Remove duplicated hive home isolation fixture from test_colony_fork_live; reuse conftest autouse fixture and only add config copy on top

* fix(tests): prevent mock LLM infinite loops on exhausted scenarios
  fix(core): accept both pruned tool result sentinel formats

MockStreamingLLM and _ByTaskMockLLM replay the last scenario forever when call_index exceeds the scenario list, causing worker timeouts in CI. Fix by emitting a text stop when scenarios are exhausted (scenarios mode) or already consumed (by_task mode).

Also fix the pruned tool result sentinel mismatch: conversation.py produces "Pruned tool result ..." but compaction.py and conversation.py only checked for "[Pruned tool result". Both formats are now accepted.

Also remove the duplicated hive home isolation fixture from test_colony_fork_live; reuse the conftest autouse fixture instead.
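For context, the two core fixes described above can be sketched roughly as follows. This is a minimal illustration only, not the framework's actual code: the method name, the `_scenarios` list, and the `_is_pruned_tool_result` helper are assumptions; only `call_index` and the sentinel strings come from the commit message.

    # Sketch: once the scenario list is exhausted, emit a plain text stop
    # instead of replaying the last scenario forever (which stalled workers in CI).
    def _next_scenario(self) -> dict:
        if self.call_index >= len(self._scenarios):
            return {"type": "text", "content": "Done.", "stop": True}
        scenario = self._scenarios[self.call_index]
        self.call_index += 1
        return scenario

    # Sketch: accept both pruned-tool-result sentinel spellings, with and
    # without the leading bracket (hypothetical helper).
    _PRUNED_SENTINELS = ("Pruned tool result", "[Pruned tool result")

    def _is_pruned_tool_result(content: str) -> bool:
        return content.lstrip().startswith(_PRUNED_SENTINELS)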
"""Tests for ContextHandoff and HandoffContext."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from typing import Any
|
|
|
|
import pytest
|
|
|
|
from framework.agent_loop.conversation import NodeConversation
|
|
from framework.llm.mock import MockLLMProvider
|
|
from framework.llm.provider import LLMProvider, LLMResponse
|
|
from framework.orchestrator.context_handoff import ContextHandoff, HandoffContext
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class SpyLLMProvider(MockLLMProvider):
    """MockLLMProvider that records whether complete() was called."""

    def __init__(self) -> None:
        super().__init__()
        self.complete_called = False
        self.complete_call_args: dict[str, Any] | None = None

    def complete(self, messages: list[dict[str, Any]], **kwargs: Any) -> LLMResponse:
        self.complete_called = True
        self.complete_call_args = {"messages": messages, **kwargs}
        return super().complete(messages, **kwargs)


class FailingLLMProvider(LLMProvider):
    """LLM provider that always raises."""

    model: str = "mock"

    def complete(self, messages: list[dict[str, Any]], **kwargs: Any) -> LLMResponse:
        raise RuntimeError("LLM unavailable")


async def _build_conversation(*pairs: tuple[str, str]) -> NodeConversation:
    """Build a NodeConversation from (user, assistant) message pairs."""
    conv = NodeConversation()
    for user_msg, assistant_msg in pairs:
        await conv.add_user_message(user_msg)
        await conv.add_assistant_message(assistant_msg)
    return conv


# ---------------------------------------------------------------------------
# TestHandoffContext
# ---------------------------------------------------------------------------


class TestHandoffContext:
    def test_instantiation(self) -> None:
        hc = HandoffContext(
            source_node_id="node_A",
            summary="Summary text",
            key_outputs={"result": "42"},
            turn_count=3,
            total_tokens_used=1200,
        )
        assert hc.source_node_id == "node_A"
        assert hc.summary == "Summary text"
        assert hc.key_outputs == {"result": "42"}
        assert hc.turn_count == 3
        assert hc.total_tokens_used == 1200

    def test_field_access(self) -> None:
        hc = HandoffContext(
            source_node_id="n1",
            summary="s",
            key_outputs={},
            turn_count=0,
            total_tokens_used=0,
        )
        assert hc.key_outputs == {}


# ---------------------------------------------------------------------------
# TestExtractiveSummary
# ---------------------------------------------------------------------------


class TestExtractiveSummary:
    @pytest.mark.asyncio
    async def test_extractive_summary_includes_first_last(self) -> None:
        conv = await _build_conversation(
            ("hello", "First response here."),
            ("continue", "Middle response."),
            ("finish", "Final conclusion."),
        )
        ch = ContextHandoff()
        hc = ch.summarize_conversation(conv, node_id="test_node")

        assert "First response here." in hc.summary
        assert "Final conclusion." in hc.summary

    @pytest.mark.asyncio
    async def test_extractive_summary_metadata(self) -> None:
        conv = await _build_conversation(
            ("hi", "hello"),
            ("bye", "goodbye"),
        )
        ch = ContextHandoff()
        hc = ch.summarize_conversation(conv, node_id="node_42")

        assert hc.source_node_id == "node_42"
        assert hc.turn_count == 2
        assert hc.total_tokens_used > 0

    @pytest.mark.asyncio
    async def test_extractive_with_output_keys_colon(self) -> None:
        conv = await _build_conversation(
            ("what is the answer?", "answer: 42"),
        )
        ch = ContextHandoff()
        hc = ch.summarize_conversation(conv, node_id="n", output_keys=["answer"])

        assert hc.key_outputs["answer"] == "42"

    @pytest.mark.asyncio
    async def test_extractive_with_output_keys_equals(self) -> None:
        conv = await _build_conversation(
            ("compute", "result = success"),
        )
        ch = ContextHandoff()
        hc = ch.summarize_conversation(conv, node_id="n", output_keys=["result"])

        assert hc.key_outputs["result"] == "success"

    @pytest.mark.asyncio
    async def test_extractive_json_output_keys(self) -> None:
        conv = await _build_conversation(
            ("give me json", '{"score": 95, "grade": "A"}'),
        )
        ch = ContextHandoff()
        hc = ch.summarize_conversation(conv, node_id="n", output_keys=["score", "grade"])

        assert hc.key_outputs["score"] == "95"
        assert hc.key_outputs["grade"] == "A"

    @pytest.mark.asyncio
    async def test_extractive_empty_conversation(self) -> None:
        conv = NodeConversation()
        ch = ContextHandoff()
        hc = ch.summarize_conversation(conv, node_id="empty")

        assert hc.summary == "Empty conversation."
        assert hc.turn_count == 0
        assert hc.key_outputs == {}

    @pytest.mark.asyncio
    async def test_extractive_no_assistant_messages(self) -> None:
        conv = NodeConversation()
        await conv.add_user_message("hello?")
        await conv.add_user_message("anyone there?")

        ch = ContextHandoff()
        hc = ch.summarize_conversation(conv, node_id="silent")

        assert hc.summary == "No assistant responses."

    @pytest.mark.asyncio
    async def test_extractive_most_recent_wins(self) -> None:
        conv = await _build_conversation(
            ("first", "status: old_value"),
            ("second", "status: new_value"),
        )
        ch = ContextHandoff()
        hc = ch.summarize_conversation(conv, node_id="n", output_keys=["status"])

        assert hc.key_outputs["status"] == "new_value"

    @pytest.mark.asyncio
    async def test_extractive_truncation(self) -> None:
        long_text = "x" * 1000
        conv = await _build_conversation(
            ("go", long_text),
        )
        ch = ContextHandoff()
        hc = ch.summarize_conversation(conv, node_id="n")

        # Summary should be truncated to ~500 chars
        assert len(hc.summary) <= 500


# ---------------------------------------------------------------------------
# TestLLMSummary
# ---------------------------------------------------------------------------


class TestLLMSummary:
    @pytest.mark.asyncio
    async def test_llm_summary_calls_provider(self) -> None:
        llm = SpyLLMProvider()
        conv = await _build_conversation(
            ("hi", "hello back"),
            ("what now?", "we are done"),
        )
        ch = ContextHandoff(llm=llm)
        hc = ch.summarize_conversation(conv, node_id="llm_node")

        assert llm.complete_called, "LLM complete() was never invoked"
        assert hc.summary == "This is a mock response for testing purposes."

    @pytest.mark.asyncio
    async def test_llm_summary_includes_output_key_hint(self) -> None:
        llm = SpyLLMProvider()
        conv = await _build_conversation(
            ("compute", '{"score": 95}'),
        )
        ch = ContextHandoff(llm=llm)
        ch.summarize_conversation(conv, node_id="n", output_keys=["score", "grade"])

        assert llm.complete_call_args is not None
        system = llm.complete_call_args.get("system", "")
        assert "score" in system
        assert "grade" in system

    @pytest.mark.asyncio
    async def test_llm_fallback_on_error(self) -> None:
        llm = FailingLLMProvider()
        conv = await _build_conversation(
            ("start", "First assistant message."),
            ("end", "Last assistant message."),
        )
        ch = ContextHandoff(llm=llm)
        hc = ch.summarize_conversation(conv, node_id="fallback_node")

        # Should fall back to extractive (first + last assistant messages)
        assert "First assistant message." in hc.summary
        assert "Last assistant message." in hc.summary


# ---------------------------------------------------------------------------
# TestFormatAsInput
# ---------------------------------------------------------------------------


class TestFormatAsInput:
    def test_format_structure(self) -> None:
        hc = HandoffContext(
            source_node_id="analyzer",
            summary="Analysis complete.",
            key_outputs={"score": "95"},
            turn_count=5,
            total_tokens_used=2000,
        )
        output = ContextHandoff.format_as_input(hc)

        assert "--- CONTEXT FROM: analyzer" in output
        assert "KEY OUTPUTS:" in output
        assert "SUMMARY:" in output
        assert "--- END CONTEXT ---" in output

    def test_format_no_key_outputs(self) -> None:
        hc = HandoffContext(
            source_node_id="simple",
            summary="Done.",
            key_outputs={},
            turn_count=1,
            total_tokens_used=100,
        )
        output = ContextHandoff.format_as_input(hc)

        assert "KEY OUTPUTS:" not in output
        assert "SUMMARY:" in output

    def test_format_content_values(self) -> None:
        hc = HandoffContext(
            source_node_id="node_X",
            summary="Found 3 bugs.",
            key_outputs={"bugs": "3", "severity": "high"},
            turn_count=7,
            total_tokens_used=5000,
        )
        output = ContextHandoff.format_as_input(hc)

        assert "node_X" in output
        assert "7 turns" in output
        assert "~5000 tokens" in output
        assert "- bugs: 3" in output
        assert "- severity: high" in output
        assert "Found 3 bugs." in output

    def test_format_empty_summary(self) -> None:
        hc = HandoffContext(
            source_node_id="n",
            summary="",
            key_outputs={},
            turn_count=0,
            total_tokens_used=0,
        )
        output = ContextHandoff.format_as_input(hc)

        assert "No summary available." in output

    @pytest.mark.asyncio
    async def test_format_as_input_usable_as_message(self) -> None:
        """Formatted output can be fed into a NodeConversation as a user message."""
        hc = HandoffContext(
            source_node_id="prev_node",
            summary="Completed analysis.",
            key_outputs={"result": "42"},
            turn_count=3,
            total_tokens_used=900,
        )
        text = ContextHandoff.format_as_input(hc)

        conv = NodeConversation()
        msg = await conv.add_user_message(text)

        assert msg.role == "user"
        assert "CONTEXT FROM: prev_node" in msg.content
        assert conv.turn_count == 1