""" Error categorization for test failures. Categorizes errors to guide iteration strategy: - LOGIC_ERROR: Goal definition is wrong → update success_criteria/constraints - IMPLEMENTATION_ERROR: Code bug → fix nodes/edges in Agent stage - EDGE_CASE: New scenario discovered → add new test only """ import re from typing import Any from framework.testing.test_result import ErrorCategory, TestResult class ErrorCategorizer: """ Categorize test failures for guiding iteration. Uses pattern matching heuristics to classify errors. Each category has different implications for how to fix. """ # Patterns indicating goal/criteria definition is wrong LOGIC_ERROR_PATTERNS = [ r"goal not achieved", r"constraint violated:?\s*core", r"fundamental assumption", r"success criteria mismatch", r"criteria not met", r"expected behavior incorrect", r"specification error", r"requirement mismatch", ] # Patterns indicating code/implementation bug IMPLEMENTATION_ERROR_PATTERNS = [ r"TypeError", r"AttributeError", r"KeyError", r"IndexError", r"ValueError", r"NameError", r"ImportError", r"ModuleNotFoundError", r"RuntimeError", r"NullPointerException", r"NoneType.*has no attribute", r"tool call failed", r"node execution error", r"agent execution failed", r"assertion.*failed", r"AssertionError", r"expected.*but got", r"unexpected.*type", r"missing required", r"invalid.*argument", ] # Patterns indicating edge case / new scenario EDGE_CASE_PATTERNS = [ r"boundary condition", r"timeout", r"connection.*timeout", r"request.*timeout", r"unexpected format", r"unexpected response", r"rare input", r"empty.*result", r"null.*value", r"empty.*response", r"no.*results", r"rate.*limit", r"quota.*exceeded", r"retry.*exhausted", r"unicode.*error", r"encoding.*error", r"special.*character", ] def __init__(self): """Initialize categorizer with compiled patterns.""" self._logic_patterns = [re.compile(p, re.IGNORECASE) for p in self.LOGIC_ERROR_PATTERNS] self._impl_patterns = [re.compile(p, re.IGNORECASE) for p in self.IMPLEMENTATION_ERROR_PATTERNS] self._edge_patterns = [re.compile(p, re.IGNORECASE) for p in self.EDGE_CASE_PATTERNS] def categorize(self, result: TestResult) -> ErrorCategory | None: """ Categorize a test failure. Args: result: TestResult to categorize Returns: ErrorCategory if test failed, None if passed """ if result.passed: return None # Combine error sources for analysis error_text = self._get_error_text(result) # Check patterns in priority order # Logic errors take precedence (wrong goal definition) for pattern in self._logic_patterns: if pattern.search(error_text): return ErrorCategory.LOGIC_ERROR # Then implementation errors (code bugs) for pattern in self._impl_patterns: if pattern.search(error_text): return ErrorCategory.IMPLEMENTATION_ERROR # Then edge cases (new scenarios) for pattern in self._edge_patterns: if pattern.search(error_text): return ErrorCategory.EDGE_CASE # Default to implementation error (most common) return ErrorCategory.IMPLEMENTATION_ERROR def categorize_with_confidence(self, result: TestResult) -> tuple[ErrorCategory | None, float]: """ Categorize with a confidence score. Args: result: TestResult to categorize Returns: Tuple of (category, confidence 0-1) """ if result.passed: return None, 1.0 error_text = self._get_error_text(result) # Count pattern matches for each category logic_matches = sum(1 for p in self._logic_patterns if p.search(error_text)) impl_matches = sum(1 for p in self._impl_patterns if p.search(error_text)) edge_matches = sum(1 for p in self._edge_patterns if p.search(error_text)) total_matches = logic_matches + impl_matches + edge_matches if total_matches == 0: # No pattern matches, default to implementation with low confidence return ErrorCategory.IMPLEMENTATION_ERROR, 0.3 # Calculate confidence based on match dominance if logic_matches >= impl_matches and logic_matches >= edge_matches: confidence = logic_matches / total_matches if total_matches > 0 else 0.5 return ErrorCategory.LOGIC_ERROR, min(0.9, 0.5 + confidence * 0.4) if impl_matches >= logic_matches and impl_matches >= edge_matches: confidence = impl_matches / total_matches if total_matches > 0 else 0.5 return ErrorCategory.IMPLEMENTATION_ERROR, min(0.9, 0.5 + confidence * 0.4) confidence = edge_matches / total_matches if total_matches > 0 else 0.5 return ErrorCategory.EDGE_CASE, min(0.9, 0.5 + confidence * 0.4) def _get_error_text(self, result: TestResult) -> str: """Extract all error text from a result for analysis.""" parts = [] if result.error_message: parts.append(result.error_message) if result.stack_trace: parts.append(result.stack_trace) # Include log messages for log in result.runtime_logs: if log.get("level") in ("ERROR", "CRITICAL", "WARNING"): parts.append(str(log.get("msg", ""))) return " ".join(parts) def get_fix_suggestion(self, category: ErrorCategory) -> str: """ Get a fix suggestion based on error category. Args: category: ErrorCategory from categorization Returns: Human-readable fix suggestion """ suggestions = { ErrorCategory.LOGIC_ERROR: ( "Review and update success_criteria or constraints in the Goal definition. " "The goal specification may not accurately describe the desired behavior." ), ErrorCategory.IMPLEMENTATION_ERROR: ( "Fix the code in agent nodes/edges. There's a bug in the implementation that needs to be corrected." ), ErrorCategory.EDGE_CASE: ( "Add a new test for this edge case scenario. " "This is a valid scenario that wasn't covered by existing tests." ), } return suggestions.get(category, "Review the test and agent implementation.") def get_iteration_guidance(self, category: ErrorCategory) -> dict[str, Any]: """ Get detailed iteration guidance based on error category. Returns a dict with: - stage: Which stage to return to (Goal, Agent, Eval) - action: What action to take - restart_required: Whether full 3-step flow restart is needed """ guidance = { ErrorCategory.LOGIC_ERROR: { "stage": "Goal", "action": "Update success_criteria or constraints", "restart_required": True, "description": ( "The goal definition is incorrect. Update the success criteria " "or constraints, then restart the full Goal → Agent → Eval flow." ), }, ErrorCategory.IMPLEMENTATION_ERROR: { "stage": "Agent", "action": "Fix nodes/edges implementation", "restart_required": False, "description": ( "There's a code bug. Fix the agent implementation, then re-run Eval (skip Goal stage)." ), }, ErrorCategory.EDGE_CASE: { "stage": "Eval", "action": "Add new test only", "restart_required": False, "description": ("This is a new scenario. Add a test for it and continue in the Eval stage."), }, } return guidance.get( category, { "stage": "Unknown", "action": "Review manually", "restart_required": False, "description": "Unable to determine category. Manual review required.", }, )