hive/core/tests/test_context_handoff.py
Hundao e5a93b059f fix(tests): resolve test failures across framework and tools (#7059)
* fix(tests): resolve test failures across framework and tools

Framework tests (52 -> 1 failure):
- Add missing `model` attribute to mock LLM classes (MockStreamingLLM,
  CrashingLLM, ErrorThenSuccessLLM, etc.) to match the new agent_loop.py
  requirement at line 624 (see the sketch after this list)
- Update skill count assertions from 6 to 7 (new writing-hive-skills)
- Fix phase compaction test to match new message format (no brackets)
- Update model catalog test for current gemini model names
- Fix queen memory test: set phase="building" to match prompt_building,
  adjust reflection trigger count to match cooldown behavior
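
A minimal sketch of the `model` attribute fix. The mock class name comes
from the message above; the base class and the placeholder value are
assumptions, not confirmed framework layout:

    class MockStreamingLLM(LLMProvider):
        # agent_loop.py now reads `llm.model` when building requests, so
        # every mock must define it; "mock" is an arbitrary placeholder.
        model: str = "mock"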

Tools tests (52 -> 0 failures):
- Update csv_tool tests: remove the agent_id parameter, use absolute paths,
  and patch _ALLOWED_ROOTS instead of AGENT_SANDBOXES_DIR (sketch below)
- Fix browser_evaluate test to allow toast wrapper around script
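
A hedged sketch of the csv_tool patching change; the module path
"tools.csv_tool", the test name, and the tmp_path-based root are
assumptions, only the _ALLOWED_ROOTS name comes from this message:

    def test_csv_read_allows_absolute_path(tmp_path, monkeypatch):
        # Point the allow-list at the test's own directory instead of
        # patching AGENT_SANDBOXES_DIR, so absolute paths resolve cleanly.
        monkeypatch.setattr("tools.csv_tool._ALLOWED_ROOTS", [tmp_path])
        csv_file = tmp_path / "data.csv"
        csv_file.write_text("a,b\n1,2\n")
        # ... call the tool with str(csv_file) and assert on the result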

Remaining: 1 pre-existing failure in test_worker_report where mock LLM
gets stuck when scenarios are exhausted (separate bug).

* fix(tests): resolve remaining test failures

- Add text stop scenario to test_worker_report so worker terminates
  cleanly after tool_calls finish instead of replaying the last
  scenario forever
- Remove duplicated hive home isolation fixture from test_colony_fork_live;
  reuse conftest autouse fixture and only add config copy on top

* fix(tests): prevent mock LLM infinite loops on exhausted scenarios

fix(core): accept both pruned tool result sentinel formats

MockStreamingLLM and _ByTaskMockLLM replay the last scenario forever
when call_index exceeds the scenario list, causing worker timeouts in
CI. Fix by emitting a text stop when scenarios are exhausted (scenarios
mode) or already consumed (by_task mode).
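
A minimal sketch of the guard, assuming a `scenarios` list and a
`call_index` counter as described above (the method name and stop payload
shape are illustrative, not the real streaming event type):

    def _next_scenario(self):
        # Past the end of the scripted scenarios: emit a plain text stop
        # instead of replaying the last scenario forever.
        if self.call_index >= len(self.scenarios):
            return {"type": "text", "content": "Done.", "stop_reason": "stop"}
        scenario = self.scenarios[self.call_index]
        self.call_index += 1
        return scenario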

Also fix a pruned tool result sentinel mismatch: conversation.py produces
"Pruned tool result ...", but the checks in compaction.py and
conversation.py only matched "[Pruned tool result". Both formats are now
accepted, as sketched below.

Also remove duplicated hive home isolation fixture from
test_colony_fork_live; reuse conftest autouse fixture instead.

"""Tests for ContextHandoff and HandoffContext."""
from __future__ import annotations
from typing import Any
import pytest
from framework.agent_loop.conversation import NodeConversation
from framework.llm.mock import MockLLMProvider
from framework.llm.provider import LLMProvider, LLMResponse
from framework.orchestrator.context_handoff import ContextHandoff, HandoffContext
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
class SpyLLMProvider(MockLLMProvider):
    """MockLLMProvider that records whether complete() was called."""

    def __init__(self) -> None:
        super().__init__()
        self.complete_called = False
        self.complete_call_args: dict[str, Any] | None = None

    def complete(self, messages: list[dict[str, Any]], **kwargs: Any) -> LLMResponse:
        self.complete_called = True
        self.complete_call_args = {"messages": messages, **kwargs}
        return super().complete(messages, **kwargs)


class FailingLLMProvider(LLMProvider):
    """LLM provider that always raises."""

    model: str = "mock"

    def complete(self, messages: list[dict[str, Any]], **kwargs: Any) -> LLMResponse:
        raise RuntimeError("LLM unavailable")


async def _build_conversation(*pairs: tuple[str, str]) -> NodeConversation:
    """Build a NodeConversation from (user, assistant) message pairs."""
    conv = NodeConversation()
    for user_msg, assistant_msg in pairs:
        await conv.add_user_message(user_msg)
        await conv.add_assistant_message(assistant_msg)
    return conv
# ---------------------------------------------------------------------------
# TestHandoffContext
# ---------------------------------------------------------------------------
class TestHandoffContext:
    def test_instantiation(self) -> None:
        hc = HandoffContext(
            source_node_id="node_A",
            summary="Summary text",
            key_outputs={"result": "42"},
            turn_count=3,
            total_tokens_used=1200,
        )
        assert hc.source_node_id == "node_A"
        assert hc.summary == "Summary text"
        assert hc.key_outputs == {"result": "42"}
        assert hc.turn_count == 3
        assert hc.total_tokens_used == 1200

    def test_field_access(self) -> None:
        hc = HandoffContext(
            source_node_id="n1",
            summary="s",
            key_outputs={},
            turn_count=0,
            total_tokens_used=0,
        )
        assert hc.key_outputs == {}
# ---------------------------------------------------------------------------
# TestExtractiveSummary
# ---------------------------------------------------------------------------
class TestExtractiveSummary:
    @pytest.mark.asyncio
    async def test_extractive_summary_includes_first_last(self) -> None:
        conv = await _build_conversation(
            ("hello", "First response here."),
            ("continue", "Middle response."),
            ("finish", "Final conclusion."),
        )
        ch = ContextHandoff()
        hc = ch.summarize_conversation(conv, node_id="test_node")
        assert "First response here." in hc.summary
        assert "Final conclusion." in hc.summary

    @pytest.mark.asyncio
    async def test_extractive_summary_metadata(self) -> None:
        conv = await _build_conversation(
            ("hi", "hello"),
            ("bye", "goodbye"),
        )
        ch = ContextHandoff()
        hc = ch.summarize_conversation(conv, node_id="node_42")
        assert hc.source_node_id == "node_42"
        assert hc.turn_count == 2
        assert hc.total_tokens_used > 0

    @pytest.mark.asyncio
    async def test_extractive_with_output_keys_colon(self) -> None:
        conv = await _build_conversation(
            ("what is the answer?", "answer: 42"),
        )
        ch = ContextHandoff()
        hc = ch.summarize_conversation(conv, node_id="n", output_keys=["answer"])
        assert hc.key_outputs["answer"] == "42"

    @pytest.mark.asyncio
    async def test_extractive_with_output_keys_equals(self) -> None:
        conv = await _build_conversation(
            ("compute", "result = success"),
        )
        ch = ContextHandoff()
        hc = ch.summarize_conversation(conv, node_id="n", output_keys=["result"])
        assert hc.key_outputs["result"] == "success"

    @pytest.mark.asyncio
    async def test_extractive_json_output_keys(self) -> None:
        conv = await _build_conversation(
            ("give me json", '{"score": 95, "grade": "A"}'),
        )
        ch = ContextHandoff()
        hc = ch.summarize_conversation(conv, node_id="n", output_keys=["score", "grade"])
        assert hc.key_outputs["score"] == "95"
        assert hc.key_outputs["grade"] == "A"

    @pytest.mark.asyncio
    async def test_extractive_empty_conversation(self) -> None:
        conv = NodeConversation()
        ch = ContextHandoff()
        hc = ch.summarize_conversation(conv, node_id="empty")
        assert hc.summary == "Empty conversation."
        assert hc.turn_count == 0
        assert hc.key_outputs == {}

    @pytest.mark.asyncio
    async def test_extractive_no_assistant_messages(self) -> None:
        conv = NodeConversation()
        await conv.add_user_message("hello?")
        await conv.add_user_message("anyone there?")
        ch = ContextHandoff()
        hc = ch.summarize_conversation(conv, node_id="silent")
        assert hc.summary == "No assistant responses."

    @pytest.mark.asyncio
    async def test_extractive_most_recent_wins(self) -> None:
        conv = await _build_conversation(
            ("first", "status: old_value"),
            ("second", "status: new_value"),
        )
        ch = ContextHandoff()
        hc = ch.summarize_conversation(conv, node_id="n", output_keys=["status"])
        assert hc.key_outputs["status"] == "new_value"

    @pytest.mark.asyncio
    async def test_extractive_truncation(self) -> None:
        long_text = "x" * 1000
        conv = await _build_conversation(
            ("go", long_text),
        )
        ch = ContextHandoff()
        hc = ch.summarize_conversation(conv, node_id="n")
        # Summary should be truncated to ~500 chars
        assert len(hc.summary) <= 500
# ---------------------------------------------------------------------------
# TestLLMSummary
# ---------------------------------------------------------------------------
class TestLLMSummary:
    @pytest.mark.asyncio
    async def test_llm_summary_calls_provider(self) -> None:
        llm = SpyLLMProvider()
        conv = await _build_conversation(
            ("hi", "hello back"),
            ("what now?", "we are done"),
        )
        ch = ContextHandoff(llm=llm)
        hc = ch.summarize_conversation(conv, node_id="llm_node")
        assert llm.complete_called, "LLM complete() was never invoked"
        assert hc.summary == "This is a mock response for testing purposes."

    @pytest.mark.asyncio
    async def test_llm_summary_includes_output_key_hint(self) -> None:
        llm = SpyLLMProvider()
        conv = await _build_conversation(
            ("compute", '{"score": 95}'),
        )
        ch = ContextHandoff(llm=llm)
        ch.summarize_conversation(conv, node_id="n", output_keys=["score", "grade"])
        assert llm.complete_call_args is not None
        system = llm.complete_call_args.get("system", "")
        assert "score" in system
        assert "grade" in system

    @pytest.mark.asyncio
    async def test_llm_fallback_on_error(self) -> None:
        llm = FailingLLMProvider()
        conv = await _build_conversation(
            ("start", "First assistant message."),
            ("end", "Last assistant message."),
        )
        ch = ContextHandoff(llm=llm)
        hc = ch.summarize_conversation(conv, node_id="fallback_node")
        # Should fall back to extractive (first + last assistant messages)
        assert "First assistant message." in hc.summary
        assert "Last assistant message." in hc.summary
# ---------------------------------------------------------------------------
# TestFormatAsInput
# ---------------------------------------------------------------------------
class TestFormatAsInput:
    def test_format_structure(self) -> None:
        hc = HandoffContext(
            source_node_id="analyzer",
            summary="Analysis complete.",
            key_outputs={"score": "95"},
            turn_count=5,
            total_tokens_used=2000,
        )
        output = ContextHandoff.format_as_input(hc)
        assert "--- CONTEXT FROM: analyzer" in output
        assert "KEY OUTPUTS:" in output
        assert "SUMMARY:" in output
        assert "--- END CONTEXT ---" in output

    def test_format_no_key_outputs(self) -> None:
        hc = HandoffContext(
            source_node_id="simple",
            summary="Done.",
            key_outputs={},
            turn_count=1,
            total_tokens_used=100,
        )
        output = ContextHandoff.format_as_input(hc)
        assert "KEY OUTPUTS:" not in output
        assert "SUMMARY:" in output

    def test_format_content_values(self) -> None:
        hc = HandoffContext(
            source_node_id="node_X",
            summary="Found 3 bugs.",
            key_outputs={"bugs": "3", "severity": "high"},
            turn_count=7,
            total_tokens_used=5000,
        )
        output = ContextHandoff.format_as_input(hc)
        assert "node_X" in output
        assert "7 turns" in output
        assert "~5000 tokens" in output
        assert "- bugs: 3" in output
        assert "- severity: high" in output
        assert "Found 3 bugs." in output

    def test_format_empty_summary(self) -> None:
        hc = HandoffContext(
            source_node_id="n",
            summary="",
            key_outputs={},
            turn_count=0,
            total_tokens_used=0,
        )
        output = ContextHandoff.format_as_input(hc)
        assert "No summary available." in output

    @pytest.mark.asyncio
    async def test_format_as_input_usable_as_message(self) -> None:
        """Formatted output can be fed into a NodeConversation as a user message."""
        hc = HandoffContext(
            source_node_id="prev_node",
            summary="Completed analysis.",
            key_outputs={"result": "42"},
            turn_count=3,
            total_tokens_used=900,
        )
        text = ContextHandoff.format_as_input(hc)
        conv = NodeConversation()
        msg = await conv.add_user_message(text)
        assert msg.role == "user"
        assert "CONTEXT FROM: prev_node" in msg.content
        assert conv.turn_count == 1