Files
hive/core/framework/testing/debug_tool.py
T
Hundao 589c5b06fe fix: resolve all ruff lint and format errors across codebase (#7058)
- Auto-fixed 70 lint errors (import sorting, aliased errors, datetime.UTC)
- Fixed 85 remaining errors manually:
  - E501: wrapped long lines in queen_profiles, catalog, routes_credentials
  - F821: added missing TYPE_CHECKING imports for AgentHost, ToolRegistry,
    HookContext, HookResult; added runtime imports where needed
  - F811: removed duplicate method definitions in queen_lifecycle_tools
  - F841/B007: removed unused variables in discovery.py
  - W291: removed trailing whitespace in queen nodes
  - E402: moved import to top of queen_memory_v2.py
  - Fixed AgentRuntime -> AgentHost in example template type annotations
- Reformatted 343 files with ruff format
2026-04-16 19:30:01 +08:00

289 lines
9.4 KiB
Python

"""
Debug tool for analyzing failed tests.
Provides detailed information for debugging:
- Test input and expected output
- Actual output and error details
- Error categorization
- Runtime logs and execution path
- Fix suggestions
"""
from typing import Any
from pydantic import BaseModel, Field
from framework.testing.categorizer import ErrorCategorizer
from framework.testing.test_case import Test
from framework.testing.test_result import ErrorCategory, TestResult
from framework.testing.test_storage import TestStorage
class DebugInfo(BaseModel):
    """
    Everything collected about a single test for debugging purposes.

    Groups the test definition, the observed outcome, error details,
    runtime data, and fix guidance into one serializable model. Only
    ``test_id`` and ``test_name`` are required; every other field has a
    default.
    """

    # Identity of the test being described
    test_id: str
    test_name: str

    # What the test was given and what it was expected to produce
    input: dict[str, Any] = Field(default_factory=dict)
    expected: dict[str, Any] = Field(default_factory=dict)

    # What the run actually produced
    actual: Any = None
    passed: bool = False

    # Failure description
    error_message: str | None = None
    error_category: str | None = None
    stack_trace: str | None = None

    # Data captured while the test ran
    logs: list[dict[str, Any]] = Field(default_factory=list)
    runtime_data: dict[str, Any] = Field(default_factory=dict)

    # Guidance for the next iteration
    suggested_fix: str | None = None
    iteration_guidance: dict[str, Any] = Field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return the model's fields as a plain dict (via ``model_dump``)."""
        as_dict = self.model_dump()
        return as_dict
class DebugTool:
    """
    Debug tool for analyzing failed tests.

    Integrates with:
    - TestStorage for test and result data
    - Runtime storage (optional) for decision logs
    - ErrorCategorizer for classification
    """

    def __init__(
        self,
        test_storage: TestStorage,
        runtime_storage: Any | None = None,
    ):
        """
        Initialize debug tool.

        Args:
            test_storage: Storage for test and result data
            runtime_storage: Optional storage backend for Runtime data.
                Must expose a synchronous ``load_run_sync(run_id)`` method
                (e.g. ``ConcurrentStorage``).
        """
        self.test_storage = test_storage
        self.runtime_storage = runtime_storage
        self.categorizer = ErrorCategorizer()

    def analyze(
        self,
        goal_id: str,
        test_id: str,
        run_id: str | None = None,
    ) -> DebugInfo:
        """
        Get detailed debug info for a failed test.

        Args:
            goal_id: Goal ID containing the test
            test_id: ID of the test to analyze
            run_id: Optional Runtime run ID for detailed logs

        Returns:
            DebugInfo with comprehensive debug data. When the test cannot be
            loaded, a placeholder DebugInfo carrying only an error message is
            returned instead.
        """
        # Load test
        test = self.test_storage.load_test(goal_id, test_id)
        if not test:
            return DebugInfo(
                test_id=test_id,
                test_name="unknown",
                error_message=f"Test {test_id} not found in goal {goal_id}",
            )

        # Load latest result
        result = self.test_storage.get_latest_result(test_id)

        # Start from the test definition; result fields are layered on below
        debug_info = DebugInfo(
            test_id=test_id,
            test_name=test.test_name,
            input=test.input,
            expected=test.expected_output,
        )

        category = None
        if result:
            debug_info.actual = result.actual_output
            debug_info.passed = result.passed
            debug_info.error_message = result.error_message
            debug_info.stack_trace = result.stack_trace
            debug_info.logs = result.runtime_logs
            category = self._resolve_category(result)

        return self._finalize(debug_info, category, run_id)

    def analyze_result(
        self,
        test: Test,
        result: TestResult,
        run_id: str | None = None,
    ) -> DebugInfo:
        """
        Analyze a test result directly (without loading from storage).

        Args:
            test: The Test that was run
            result: The TestResult to analyze
            run_id: Optional Runtime run ID

        Returns:
            DebugInfo with debug data
        """
        debug_info = DebugInfo(
            test_id=test.id,
            test_name=test.test_name,
            input=test.input,
            expected=test.expected_output,
            actual=result.actual_output,
            passed=result.passed,
            error_message=result.error_message,
            stack_trace=result.stack_trace,
            logs=result.runtime_logs,
        )
        return self._finalize(debug_info, self._resolve_category(result), run_id)

    def get_failure_summary(
        self,
        goal_id: str,
    ) -> dict[str, Any]:
        """
        Get summary of all failures for a goal.

        Args:
            goal_id: Goal whose tests are summarized

        Returns:
            Dict with ``goal_id``, ``total_failures``, ``by_category`` (test
            IDs grouped by category) and ``iteration_suggestions``. The four
            standard category keys are always present; any other category
            reported by a result gets its own key.
        """
        failures_by_category: dict[str, list[str]] = {
            "logic_error": [],
            "implementation_error": [],
            "edge_case": [],
            "uncategorized": [],
        }

        for test in self.test_storage.get_tests_by_goal(goal_id):
            if test.last_result != "failed":
                continue
            result = self.test_storage.get_latest_result(test.id)
            if result and result.error_category:
                bucket = result.error_category.value
            else:
                bucket = "uncategorized"
            # setdefault: a category outside the pre-seeded keys must not
            # abort the whole summary with a KeyError.
            failures_by_category.setdefault(bucket, []).append(test.id)

        return {
            "goal_id": goal_id,
            "total_failures": sum(len(ids) for ids in failures_by_category.values()),
            "by_category": failures_by_category,
            "iteration_suggestions": self._get_iteration_suggestions(failures_by_category),
        }

    def _resolve_category(self, result: TestResult) -> ErrorCategory | None:
        """Return the result's own category, or categorize a failed result that has none."""
        if result.error_category:
            return result.error_category
        if not result.passed:
            # Categorize if not already done
            return self.categorizer.categorize(result) or None
        return None

    def _finalize(
        self,
        debug_info: DebugInfo,
        category: ErrorCategory | None,
        run_id: str | None,
    ) -> DebugInfo:
        """Attach category, runtime data and fix guidance to ``debug_info`` and return it."""
        if category:
            debug_info.error_category = category.value

        # Get runtime data if available
        if run_id and self.runtime_storage:
            debug_info.runtime_data = self._get_runtime_data(run_id)

        # Generate fix suggestions
        if debug_info.error_category:
            debug_info.suggested_fix = self.categorizer.get_fix_suggestion(category)
            debug_info.iteration_guidance = self.categorizer.get_iteration_guidance(category)

        return debug_info

    @staticmethod
    def _dump_entry(entry: Any) -> Any:
        """Serialize one decision/problem entry: ``model_dump()`` when available, else ``str``."""
        return entry.model_dump() if hasattr(entry, "model_dump") else str(entry)

    def _get_runtime_data(self, run_id: str) -> dict[str, Any]:
        """
        Extract runtime data from Runtime storage.

        Args:
            run_id: Runtime run ID to load

        Returns:
            ``{}`` when no runtime storage is configured, an ``{"error": ...}``
            dict when the run is missing or loading fails, otherwise a dict
            with ``execution_path``, ``decisions``, ``problems`` and ``status``.
        """
        if not self.runtime_storage:
            return {}

        try:
            # Use the synchronous loader — _get_runtime_data is not async
            # and ConcurrentStorage.load_run() is a coroutine.
            run = self.runtime_storage.load_run_sync(run_id)
            if not run:
                return {"error": f"Run {run_id} not found"}

            # Read optional attributes defensively so that one missing or
            # None-valued field does not discard the rest of the run data.
            metrics = getattr(run, "metrics", None)
            nodes_executed = getattr(metrics, "nodes_executed", None)
            status = getattr(run, "status", None)

            return {
                "execution_path": nodes_executed if nodes_executed is not None else [],
                "decisions": [self._dump_entry(d) for d in getattr(run, "decisions", None) or []],
                "problems": [self._dump_entry(p) for p in getattr(run, "problems", None) or []],
                # Accept both enum-like statuses (with .value) and plain values
                "status": getattr(status, "value", status) if status is not None else "unknown",
            }
        except Exception as e:
            # Deliberately best-effort: debugging output must not fail because
            # the runtime backend did.
            return {"error": f"Failed to load runtime data: {e}"}

    def _get_iteration_suggestions(
        self,
        failures_by_category: dict[str, list[str]],
    ) -> list[str]:
        """
        Generate iteration suggestions based on failure categories.

        Args:
            failures_by_category: Test IDs grouped by category name. Missing
                or empty categories simply produce no suggestion.

        Returns:
            One human-readable suggestion per non-empty standard category.
        """

        def count(category: str) -> int:
            return len(failures_by_category.get(category) or ())

        suggestions: list[str] = []

        logic_errors = count("logic_error")
        if logic_errors:
            suggestions.append(
                f"Found {logic_errors} logic errors. "
                "Review and update Goal success_criteria/constraints, then restart "
                "the full Goal → Agent → Eval flow."
            )

        implementation_errors = count("implementation_error")
        if implementation_errors:
            suggestions.append(
                f"Found {implementation_errors} implementation errors. "
                "Fix agent node/edge code and re-run Eval."
            )

        edge_cases = count("edge_case")
        if edge_cases:
            suggestions.append(
                f"Found {edge_cases} edge cases. "
                "These are new scenarios - add tests for them."
            )

        uncategorized = count("uncategorized")
        if uncategorized:
            suggestions.append(
                f"Found {uncategorized} uncategorized failures. Manual review required."
            )

        return suggestions