Files
hive/core/framework/testing/success_gen.py
T
2026-01-23 17:21:59 -08:00

224 lines
8.2 KiB
Python

"""
Success criteria test generator.
Generates tests for Goal success_criteria using LLM.
Tests are returned with PENDING approval status.
"""
import uuid
from typing import TYPE_CHECKING
from framework.graph.goal import Goal, SuccessCriterion
from framework.llm.provider import Tool, ToolResult, ToolUse
from framework.testing.prompts import SUCCESS_CRITERIA_TEST_PROMPT
from framework.testing.test_case import ApprovalStatus, Test, TestType
if TYPE_CHECKING:
from framework.llm.provider import LLMProvider
# Tool definition the LLM calls once per generated test. Collecting results
# through a tool call (rather than parsing free-form text) lets Claude handle
# JSON escaping of the test source automatically.
SUBMIT_TEST_TOOL = Tool(
    name="submit_test",
    description="Submit a generated success criteria test. Call once per test.",
    parameters=dict(
        # One entry per argument the model may pass to submit_test.
        properties=dict(
            criteria_id=dict(
                type="string",
                description="ID of the success criterion being tested",
            ),
            test_name=dict(
                type="string",
                description="pytest function name, e.g., test_find_videos_happy_path",
            ),
            test_code=dict(
                type="string",
                description="Complete Python test function code",
            ),
            description=dict(
                type="string",
                description="What the test validates",
            ),
            input=dict(
                type="object",
                description="Test input data",
            ),
            expected_output=dict(
                type="object",
                description="Expected output",
            ),
            confidence=dict(
                type="number",
                description="Confidence score 0-1",
            ),
        ),
        # "input" and "expected_output" are optional; everything else is mandatory.
        required=["criteria_id", "test_name", "test_code", "description", "confidence"],
    ),
)
class SuccessCriteriaTestGenerator:
"""
Generate success criteria tests from Goal success_criteria.
Generated tests require user approval before being added to the test suite.
Unlike constraint tests, success criteria tests are generated during the
Eval stage (after the agent exists) and may reference agent nodes/tools.
"""
def __init__(self, llm: "LLMProvider"):
"""
Initialize generator with LLM provider.
Args:
llm: LLM provider for test generation (e.g., AnthropicProvider)
"""
self.llm = llm
def generate(
self,
goal: Goal,
node_names: list[str] | None = None,
tool_names: list[str] | None = None,
agent_module: str = "my_agent",
) -> list[Test]:
"""
Generate tests for all success criteria in a goal.
Args:
goal: Goal with success_criteria to test
node_names: Names of agent nodes (for context)
tool_names: Names of tools available to agent (for context)
agent_module: The agent module name (e.g., "web_research_agent")
Used to generate import: from exports.{agent_module} import default_agent
Returns:
List of Test objects with approval_status=PENDING.
These MUST be approved before being added to the test suite.
"""
if not goal.success_criteria:
return []
# Format prompt
prompt = SUCCESS_CRITERIA_TEST_PROMPT.format(
goal_name=goal.name,
goal_description=goal.description,
success_criteria_formatted=self._format_criteria(goal.success_criteria),
node_names=", ".join(node_names or ["(not specified)"]),
tool_names=", ".join(tool_names or ["(not specified)"]),
agent_module=agent_module,
)
# Collect tests via tool calls - Claude handles JSON escaping automatically
collected_tests: list[dict] = []
def tool_executor(tool_use: ToolUse) -> ToolResult:
if tool_use.name == "submit_test":
collected_tests.append(tool_use.input)
return ToolResult(tool_use_id=tool_use.id, content="Test recorded successfully")
return ToolResult(tool_use_id=tool_use.id, content="Unknown tool", is_error=True)
self.llm.complete_with_tools(
messages=[{"role": "user", "content": prompt}],
system=(
"You are a test generation expert. "
"For each success criterion, call the submit_test tool with the test details."
),
tools=[SUBMIT_TEST_TOOL],
tool_executor=tool_executor,
max_iterations=12,
)
tests = self._create_tests_from_collected(collected_tests, goal.id)
# Filter out skeleton tests (empty code with default confidence)
tests = [t for t in tests if t.test_code.strip() and t.llm_confidence != 0.5]
# Enforce max 12 tests total
return tests[:12]
def generate_for_criterion(
self,
goal: Goal,
criterion: SuccessCriterion,
node_names: list[str] | None = None,
tool_names: list[str] | None = None,
agent_module: str = "my_agent",
) -> list[Test]:
"""
Generate tests for a single success criterion.
Args:
goal: Goal containing the criterion
criterion: Specific criterion to test
node_names: Names of agent nodes
tool_names: Names of tools available
agent_module: The agent module name (e.g., "web_research_agent")
Returns:
List of Test objects for the criterion
"""
prompt = SUCCESS_CRITERIA_TEST_PROMPT.format(
goal_name=goal.name,
goal_description=goal.description,
success_criteria_formatted=self._format_criterion(criterion),
node_names=", ".join(node_names or ["(not specified)"]),
tool_names=", ".join(tool_names or ["(not specified)"]),
agent_module=agent_module,
)
# Collect tests via tool calls
collected_tests: list[dict] = []
def tool_executor(tool_use: ToolUse) -> ToolResult:
if tool_use.name == "submit_test":
collected_tests.append(tool_use.input)
return ToolResult(tool_use_id=tool_use.id, content="Test recorded successfully")
return ToolResult(tool_use_id=tool_use.id, content="Unknown tool", is_error=True)
self.llm.complete_with_tools(
messages=[{"role": "user", "content": prompt}],
system="You are a test generation expert. Call the submit_test tool with the test details.",
tools=[SUBMIT_TEST_TOOL],
tool_executor=tool_executor,
max_iterations=5,
)
return self._create_tests_from_collected(collected_tests, goal.id)
def _format_criteria(self, criteria: list[SuccessCriterion]) -> str:
"""Format success criteria for prompt."""
lines = []
for c in criteria:
lines.append(self._format_criterion(c))
lines.append("")
return "\n".join(lines)
def _format_criterion(self, criterion: SuccessCriterion) -> str:
"""Format a single criterion for prompt."""
return f"""### Success Criterion: {criterion.id}
- Description: {criterion.description}
- Metric: {criterion.metric}
- Target: {criterion.target}
- Weight: {criterion.weight}
- Currently met: {criterion.met}"""
def _create_tests_from_collected(self, collected: list[dict], goal_id: str) -> list[Test]:
"""Create Test objects from tool call data."""
tests = []
for td in collected:
test = Test(
id=f"test_{uuid.uuid4().hex[:8]}",
goal_id=goal_id,
parent_criteria_id=td.get("criteria_id", "unknown"),
test_type=TestType.SUCCESS_CRITERIA,
test_name=td.get("test_name", "unnamed_test"),
test_code=td.get("test_code", ""),
description=td.get("description", ""),
input=td.get("input", {}),
expected_output=td.get("expected_output", {}),
generated_by="llm",
llm_confidence=float(td.get("confidence", 0.5)),
approval_status=ApprovalStatus.PENDING,
)
tests.append(test)
return tests