hive/core/framework/testing/categorizer.py

"""
Error categorization for test failures.
Categorizes errors to guide iteration strategy:
- LOGIC_ERROR: Goal definition is wrong → update success_criteria/constraints
- IMPLEMENTATION_ERROR: Code bug → fix nodes/edges in Agent stage
- EDGE_CASE: New scenario discovered → add new test only
"""

import re
from typing import Any

from framework.testing.test_result import ErrorCategory, TestResult


class ErrorCategorizer:
"""
Categorize test failures for guiding iteration.
Uses pattern matching heuristics to classify errors.
Each category has different implications for how to fix.
"""
    # Patterns indicating goal/criteria definition is wrong
    LOGIC_ERROR_PATTERNS = [
        r"goal not achieved",
        r"constraint violated:?\s*core",
        r"fundamental assumption",
        r"success criteria mismatch",
        r"criteria not met",
        r"expected behavior incorrect",
        r"specification error",
        r"requirement mismatch",
    ]
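    # e.g. "Constraint violated: core invariant broken" or "Success criteria
    # mismatch" would match patterns in this list (matching is
    # case-insensitive; see __init__).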

    # Patterns indicating code/implementation bug
    IMPLEMENTATION_ERROR_PATTERNS = [
        r"TypeError",
        r"AttributeError",
        r"KeyError",
        r"IndexError",
        r"ValueError",
        r"NameError",
        r"ImportError",
        r"ModuleNotFoundError",
        r"RuntimeError",
        r"NullPointerException",
        r"NoneType.*has no attribute",
        r"tool call failed",
        r"node execution error",
        r"agent execution failed",
        r"assertion.*failed",
        r"AssertionError",
        r"expected.*but got",
        r"unexpected.*type",
        r"missing required",
        r"invalid.*argument",
    ]
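    # Note: NullPointerException is not a Python exception; presumably it is
    # listed to catch errors surfaced in logs from non-Python tools.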

    # Patterns indicating edge case / new scenario
    EDGE_CASE_PATTERNS = [
        r"boundary condition",
        r"timeout",
        r"connection.*timeout",
        r"request.*timeout",
        r"unexpected format",
        r"unexpected response",
        r"rare input",
        r"empty.*result",
        r"null.*value",
        r"empty.*response",
        r"no.*results",
        r"rate.*limit",
        r"quota.*exceeded",
        r"retry.*exhausted",
        r"unicode.*error",
        r"encoding.*error",
        r"special.*character",
    ]
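    # e.g. "Request timeout after 30s" or "Rate limit exceeded" would match
    # patterns in this list.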

    def __init__(self):
        """Initialize categorizer with compiled patterns."""
        self._logic_patterns = [re.compile(p, re.IGNORECASE) for p in self.LOGIC_ERROR_PATTERNS]
        self._impl_patterns = [
            re.compile(p, re.IGNORECASE) for p in self.IMPLEMENTATION_ERROR_PATTERNS
        ]
        self._edge_patterns = [re.compile(p, re.IGNORECASE) for p in self.EDGE_CASE_PATTERNS]

    def categorize(self, result: TestResult) -> ErrorCategory | None:
        """
        Categorize a test failure.

        Args:
            result: TestResult to categorize

        Returns:
            ErrorCategory if test failed, None if passed
        """
        if result.passed:
            return None

        # Combine error sources for analysis
        error_text = self._get_error_text(result)

        # Check patterns in priority order.
        # Logic errors take precedence (wrong goal definition).
        for pattern in self._logic_patterns:
            if pattern.search(error_text):
                return ErrorCategory.LOGIC_ERROR

        # Then implementation errors (code bugs)
        for pattern in self._impl_patterns:
            if pattern.search(error_text):
                return ErrorCategory.IMPLEMENTATION_ERROR

        # Then edge cases (new scenarios)
        for pattern in self._edge_patterns:
            if pattern.search(error_text):
                return ErrorCategory.EDGE_CASE

        # Default to implementation error (most common)
        return ErrorCategory.IMPLEMENTATION_ERROR
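
    # Example (illustrative): error text containing both "TypeError" and
    # "timeout" is categorized as IMPLEMENTATION_ERROR, because
    # implementation patterns are checked before edge-case patterns.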

    def categorize_with_confidence(self, result: TestResult) -> tuple[ErrorCategory | None, float]:
        """
        Categorize with a confidence score.

        Args:
            result: TestResult to categorize

        Returns:
            Tuple of (category, confidence 0-1)
        """
        if result.passed:
            return None, 1.0

        error_text = self._get_error_text(result)

        # Count pattern matches for each category
        logic_matches = sum(1 for p in self._logic_patterns if p.search(error_text))
        impl_matches = sum(1 for p in self._impl_patterns if p.search(error_text))
        edge_matches = sum(1 for p in self._edge_patterns if p.search(error_text))
        total_matches = logic_matches + impl_matches + edge_matches

        if total_matches == 0:
            # No pattern matches, default to implementation with low confidence
            return ErrorCategory.IMPLEMENTATION_ERROR, 0.3

        # Calculate confidence based on match dominance
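        # Worked example: logic=1, impl=2, edge=0 → impl dominates with a
        # 2/3 share, giving confidence = 0.5 + (2/3) * 0.4 ≈ 0.77; the
        # min(0.9, ...) cap keeps the heuristic from claiming certainty.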
        if logic_matches >= impl_matches and logic_matches >= edge_matches:
            share = logic_matches / total_matches
            return ErrorCategory.LOGIC_ERROR, min(0.9, 0.5 + share * 0.4)
        if impl_matches >= logic_matches and impl_matches >= edge_matches:
            share = impl_matches / total_matches
            return ErrorCategory.IMPLEMENTATION_ERROR, min(0.9, 0.5 + share * 0.4)
        share = edge_matches / total_matches
        return ErrorCategory.EDGE_CASE, min(0.9, 0.5 + share * 0.4)

    def _get_error_text(self, result: TestResult) -> str:
        """Extract all error text from a result for analysis."""
        parts = []
        if result.error_message:
            parts.append(result.error_message)
        if result.stack_trace:
            parts.append(result.stack_trace)
        # Include log messages
        for log in result.runtime_logs:
            if log.get("level") in ("ERROR", "CRITICAL", "WARNING"):
                parts.append(str(log.get("msg", "")))
        return " ".join(parts)

    def get_fix_suggestion(self, category: ErrorCategory) -> str:
        """
        Get a fix suggestion based on error category.

        Args:
            category: ErrorCategory from categorization

        Returns:
            Human-readable fix suggestion
        """
        suggestions = {
            ErrorCategory.LOGIC_ERROR: (
                "Review and update success_criteria or constraints in the Goal definition. "
                "The goal specification may not accurately describe the desired behavior."
            ),
            ErrorCategory.IMPLEMENTATION_ERROR: (
                "Fix the code in agent nodes/edges. "
                "There's a bug in the implementation that needs to be corrected."
            ),
            ErrorCategory.EDGE_CASE: (
                "Add a new test for this edge case scenario. "
                "This is a valid scenario that wasn't covered by existing tests."
            ),
        }
        return suggestions.get(category, "Review the test and agent implementation.")

    def get_iteration_guidance(self, category: ErrorCategory) -> dict[str, Any]:
        """
        Get detailed iteration guidance based on error category.

        Returns a dict with:
        - stage: which stage to return to (Goal, Agent, Eval)
        - action: what action to take
        - restart_required: whether a full 3-step flow restart is needed
        """
        guidance = {
            ErrorCategory.LOGIC_ERROR: {
                "stage": "Goal",
                "action": "Update success_criteria or constraints",
                "restart_required": True,
                "description": (
                    "The goal definition is incorrect. Update the success criteria "
                    "or constraints, then restart the full Goal → Agent → Eval flow."
                ),
            },
            ErrorCategory.IMPLEMENTATION_ERROR: {
                "stage": "Agent",
                "action": "Fix nodes/edges implementation",
                "restart_required": False,
                "description": (
                    "There's a code bug. Fix the agent implementation, "
                    "then re-run Eval (skip the Goal stage)."
                ),
            },
            ErrorCategory.EDGE_CASE: {
                "stage": "Eval",
                "action": "Add new test only",
                "restart_required": False,
                "description": (
                    "This is a new scenario. Add a test for it and continue in the Eval stage."
                ),
            },
        }
        return guidance.get(
            category,
            {
                "stage": "Unknown",
                "action": "Review manually",
                "restart_required": False,
                "description": "Unable to determine category. Manual review required.",
            },
        )
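

# --- Usage sketch (illustrative only) ---
# A minimal smoke test, assuming TestResult can be constructed with the
# fields read by _get_error_text (passed, error_message, stack_trace,
# runtime_logs); the real constructor may require different arguments.
if __name__ == "__main__":
    result = TestResult(
        passed=False,
        error_message="TypeError: 'NoneType' object has no attribute 'name'",
        stack_trace="",
        runtime_logs=[{"level": "WARNING", "msg": "retry exhausted after 3 attempts"}],
    )

    categorizer = ErrorCategorizer()
    category = categorizer.categorize(result)
    category2, confidence = categorizer.categorize_with_confidence(result)

    # "TypeError" and "NoneType ... has no attribute" hit implementation
    # patterns, which are checked before the "retry exhausted" edge-case hit,
    # so both calls report IMPLEMENTATION_ERROR (confidence ≈ 0.77 here).
    print(category, category2, round(confidence, 2))
    print(categorizer.get_fix_suggestion(category))
    print(categorizer.get_iteration_guidance(category)["stage"])  # "Agent"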