"""Shared fixtures for dummy agent end-to-end tests.
|
|
|
|
These tests use real LLM providers — they are NOT part of regular CI.
|
|
Run via: cd core && uv run python tests/dummy_agents/run_all.py
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import json
|
|
import os
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
from framework.graph.executor import GraphExecutor, ParallelExecutionConfig
|
|
from framework.graph.goal import Goal
|
|
from framework.llm.litellm import LiteLLMProvider
|
|
from framework.runtime.core import Runtime
|
|
|
|
# ── module-level state set by run_all.py ─────────────────────────────
|
|
|
|
_selected_model: str | None = None
|
|
_selected_api_key: str | None = None
|
|
_selected_extra_headers: dict[str, str] | None = None
|
|
_selected_api_base: str | None = None
|
|
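# Hard cap on a single executor.execute() call; override via the
# DUMMY_AGENT_EXEC_TIMEOUT_SECS environment variable (defaults to 90 seconds).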
_EXECUTION_TIMEOUT_SECS = float(os.environ.get("DUMMY_AGENT_EXEC_TIMEOUT_SECS", "90"))


def set_llm_selection(
    model: str,
    api_key: str,
    extra_headers: dict[str, str] | None = None,
    api_base: str | None = None,
) -> None:
    """Called by run_all.py after the user selects a provider."""
    global _selected_model, _selected_api_key, _selected_extra_headers, _selected_api_base
    _selected_model = model
    _selected_api_key = api_key
    _selected_extra_headers = extra_headers
    _selected_api_base = api_base


# ── collection hook: skip entire directory when not configured ───────


def _try_auto_configure_from_hive_config() -> bool:
    """Try to load an LLM provider from ~/.hive/configuration.json.

    Returns True if successfully configured, False otherwise.
    """
    try:
        from framework.config import (
            get_api_base,
            get_api_key,
            get_llm_extra_kwargs,
            get_preferred_model,
        )

        model = get_preferred_model()
        api_key = get_api_key()
        if not model or not api_key:
            return False

        extra_kwargs = get_llm_extra_kwargs()
        set_llm_selection(
            model=model,
            api_key=api_key,
            api_base=get_api_base(),
            extra_headers=extra_kwargs.get("extra_headers"),
        )
        return True
    except Exception:
        return False


def pytest_collection_modifyitems(config, items):
    """Skip all dummy_agents tests when no LLM is configured.

    Resolution order:
    1. Already configured via run_all.py (set_llm_selection called)
    2. Auto-configure from ~/.hive/configuration.json
    3. Skip tests
    """
    if _selected_model is not None:
        return  # LLM configured via run_all.py, run normally

    # Try auto-configure from hive config
    if _try_auto_configure_from_hive_config():
        return  # Config found, run tests

    skip = pytest.mark.skip(
        reason="Dummy agent tests require a real LLM. "
        "Configure ~/.hive/configuration.json or "
        "run via: cd core && uv run python tests/dummy_agents/run_all.py"
    )
    for item in items:
        if "dummy_agents" in str(item.fspath):
            item.add_marker(skip)


# ── fixtures ─────────────────────────────────────────────────────────


@pytest.fixture(scope="session")
def llm_provider():
    """Real LLM provider using the user-selected model."""
    if _selected_model is None or _selected_api_key is None:
        pytest.skip("No LLM selected — run via run_all.py")
    kwargs = {"model": _selected_model, "api_key": _selected_api_key}
    if _selected_extra_headers:
        kwargs["extra_headers"] = _selected_extra_headers
    if _selected_api_base:
        kwargs["api_base"] = _selected_api_base
    return LiteLLMProvider(**kwargs)


@pytest.fixture(scope="session")
def tool_registry():
    """Load the hive-tools MCP server and return a ToolRegistry with real tools.

    Session-scoped so the MCP server is started once and reused across tests.
    """
    from framework.runner.tool_registry import ToolRegistry

    registry = ToolRegistry()
    # Resolve the tools directory relative to the repo root
    repo_root = Path(__file__).resolve().parents[3]  # core/tests/dummy_agents -> repo root
    tools_dir = repo_root / "tools"

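    # Configure the hive-tools MCP server to run over stdio via uv,
    # with the tools directory as its working directory.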
    mcp_config = {
        "name": "hive-tools",
        "transport": "stdio",
        "command": "uv",
        "args": ["run", "python", "mcp_server.py", "--stdio"],
        "cwd": str(tools_dir),
        "description": "Hive tools MCP server",
    }
    registry.register_mcp_server(mcp_config)
    yield registry
    registry.cleanup()


@pytest.fixture
def runtime(tmp_path):
    """Real Runtime backed by a temp directory."""
    return Runtime(storage_path=tmp_path / "runtime")


@pytest.fixture
def goal():
    return Goal(id="dummy", name="Dummy Agent Test", description="Level 2 end-to-end testing")


def make_executor(
    runtime: Runtime,
    llm: LiteLLMProvider,
    *,
    enable_parallel: bool = True,
    parallel_config: ParallelExecutionConfig | None = None,
    loop_config: dict | None = None,
    tool_registry=None,
    storage_path: Path | None = None,
    event_bus=None,
    stream_id: str = "",
) -> GraphExecutor:
    """Factory that creates a GraphExecutor with a real LLM."""
    tools = []
    tool_executor = None
    if tool_registry is not None:
        tools = list(tool_registry.get_tools().values())
        tool_executor = tool_registry.get_executor()

    executor = GraphExecutor(
        runtime=runtime,
        llm=llm,
        tools=tools,
        tool_executor=tool_executor,
        enable_parallel_execution=enable_parallel,
        parallel_config=parallel_config,
        loop_config=loop_config or {"max_iterations": 10},
        storage_path=storage_path,
        event_bus=event_bus,
        stream_id=stream_id,
    )

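    # Wrap execute() in a hard timeout so a hung provider or tool call fails
    # the test quickly instead of stalling the whole suite.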
    original_execute = executor.execute

    async def execute_with_timeout(*args, **kwargs):
        try:
            return await asyncio.wait_for(
                original_execute(*args, **kwargs),
                timeout=_EXECUTION_TIMEOUT_SECS,
            )
        except TimeoutError as e:
            raise TimeoutError(
                "Dummy agent execution timed out after "
                f"{_EXECUTION_TIMEOUT_SECS:.0f}s. "
                "This usually means the current worker execution path "
                "(GraphExecutor -> WorkerAgent -> EventLoopNode) is stuck "
                "waiting on the provider or tool-calling behavior."
            ) from e

    executor.execute = execute_with_timeout  # type: ignore[method-assign]
    return executor


# ── Artifact capture: raw output written to disk for every test ──────

ARTIFACTS_DIR = Path("/tmp/hive_test_artifacts")


class TestArtifact:
    """Collects raw output + expected behavior for a single test.

    Captures TWO kinds of data:
    1. Checks: individual assertion results (expected vs actual)
    2. Framework raw output: the real conversation, state, tool calls
       written by the executor to storage_path — copied verbatim,
       not curated.

    Usage in tests:
        def test_foo(artifact, ...):
            result = await executor.execute(...)
            artifact.record(result, expected="...", storage_path=tmp_path/"session")
    """

    def __init__(self, test_id: str):
        self.test_id = test_id
        self._safe_name = test_id.replace("::", "__").replace("/", "_")
        self._dir = ARTIFACTS_DIR / self._safe_name
        self._data: dict = {"test_id": test_id, "raw_output": None, "expected": "", "checks": []}

    def record(self, result, *, expected: str = "", storage_path=None):
        """Record an ExecutionResult and copy real framework files."""
        self._data["expected"] = expected
        if result is None:
            self._data["raw_output"] = None
            return
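        # Snapshot ExecutionResult fields via getattr so a missing attribute
        # never breaks artifact capture.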
self._data["raw_output"] = {
|
|
"success": getattr(result, "success", None),
|
|
"output": _safe_serialize(getattr(result, "output", {})),
|
|
"error": getattr(result, "error", None),
|
|
"path": getattr(result, "path", []),
|
|
"steps_executed": getattr(result, "steps_executed", 0),
|
|
"total_tokens": getattr(result, "total_tokens", 0),
|
|
"total_latency_ms": getattr(result, "total_latency_ms", 0),
|
|
"execution_quality": getattr(result, "execution_quality", ""),
|
|
"total_retries": getattr(result, "total_retries", 0),
|
|
"node_visit_counts": getattr(result, "node_visit_counts", {}),
|
|
"nodes_with_failures": getattr(result, "nodes_with_failures", []),
|
|
"session_state_buffer": _safe_serialize(
|
|
(getattr(result, "session_state", {}) or {}).get("data_buffer", {})
|
|
),
|
|
}
|
|
# Copy real framework output files (conversations, state, runs)
|
|
if storage_path is not None:
|
|
self._copy_framework_files(Path(storage_path))
|
|
|
|
def _copy_framework_files(self, storage_path: Path):
|
|
"""Copy real framework output to persistent artifact directory."""
|
|
import shutil
|
|
|
|
raw_dir = self._dir / "raw"
|
|
raw_dir.mkdir(parents=True, exist_ok=True)
|
|
if storage_path.exists():
|
|
for src in storage_path.rglob("*"):
|
|
if src.is_file() and src.suffix in (".json", ".jsonl", ".txt"):
|
|
rel = src.relative_to(storage_path)
|
|
dst = raw_dir / rel
|
|
dst.parent.mkdir(parents=True, exist_ok=True)
|
|
shutil.copy2(src, dst)
|
|
|
|
def record_value(self, key: str, value, *, expected: str = ""):
|
|
"""Record an arbitrary key-value (for non-ExecutionResult tests)."""
|
|
self._data.setdefault("values", {})[key] = _safe_serialize(value)
|
|
if expected:
|
|
self._data["expected"] = expected
|
|
|
|
def check(self, description: str, passed: bool, actual: str = "", expected_val: str = ""):
|
|
"""Record an individual assertion check."""
|
|
self._data["checks"].append(
|
|
{
|
|
"description": description,
|
|
"passed": passed,
|
|
"actual": actual,
|
|
"expected": expected_val,
|
|
}
|
|
)
|
|
|
|
def save(self):
|
|
"""Write artifact to disk."""
|
|
self._dir.mkdir(parents=True, exist_ok=True)
|
|
path = self._dir / "artifact.json"
|
|
with open(path, "w") as f:
|
|
json.dump(self._data, f, indent=2, default=str)
|
|
|
|
|
|
def _safe_serialize(obj):
|
|
"""Convert to JSON-safe types."""
|
|
if obj is None:
|
|
return None
|
|
if isinstance(obj, (str, int, float, bool)):
|
|
return obj
|
|
if isinstance(obj, dict):
|
|
return {str(k): _safe_serialize(v) for k, v in obj.items()}
|
|
if isinstance(obj, (list, tuple)):
|
|
return [_safe_serialize(v) for v in obj]
|
|
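    # Anything else falls back to its truncated string form (500 chars max).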
    return str(obj)[:500]


@pytest.fixture
def artifact(request, tmp_path):
    """Fixture that captures raw test output to disk.

    Every test gets an artifact recorder. Call artifact.record(result)
    and artifact.check("description", passed, actual, expected) to
    capture data. Saved automatically on teardown.

    On teardown, copies ALL framework output files (conversations, state,
    tool logs) from tmp_path to the persistent artifact directory. This
    captures the REAL raw output — not curated summaries.
    """
    test_id = request.node.nodeid
    art = TestArtifact(test_id)
    yield art
    # Copy all framework files from the test's tmp_path
    art._copy_framework_files(tmp_path)
    art.save()


# Autouse hook: for tests that DON'T use the artifact fixture,
# create a minimal artifact from pass/fail status.
@pytest.hookimpl(tryfirst=True, hookwrapper=True)
def pytest_runtest_makereport(item, call):
    outcome = yield
    rep = outcome.get_result()
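    # Only stash the call-phase report; setup/teardown-phase reports are ignored.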
    if rep.when == "call":
        item._test_report = rep


def pytest_runtest_teardown(item, nextitem):
    """Auto-save a minimal artifact for tests that didn't use the fixture."""
    report = getattr(item, "_test_report", None)
    if report is None:
        return
    # Check if the test already used the artifact fixture
    if "artifact" in item.fixturenames:
        return  # Already handled by fixture teardown
    safe_name = item.nodeid.replace("::", "__").replace("/", "_")
    out_dir = ARTIFACTS_DIR / safe_name
    out_dir.mkdir(parents=True, exist_ok=True)
    data = {
        "test_id": item.nodeid,
        "raw_output": None,
        "expected": "",
        "checks": [],
        "auto_captured": True,
        "status": "PASS" if report.passed else ("FAIL" if report.failed else "SKIP"),
    }
    if report.failed and report.longreprtext:
        data["failure_text"] = report.longreprtext[:5000]
    with open(out_dir / "artifact.json", "w") as f:
        json.dump(data, f, indent=2, default=str)