Compare commits
3 Commits
| Author | SHA1 | Date |
|---|---|---|
|  | 38ba872bf1 |  |
|  | 0dbd1f7896 |  |
|  | 364fb57d3c |  |
@@ -78,6 +78,7 @@ async def example_3_config_file():

     # Copy example config (in practice, you'd place this in your agent folder)
     import shutil
+
    shutil.copy(
        "examples/mcp_servers.json",
        test_agent_path / "mcp_servers.json"
@@ -110,18 +111,12 @@ async def example_4_custom_agent_with_mcp_tools():
    builder.set_goal(
        goal_id="web-researcher",
        name="Web Research Agent",
-        description="Search the web and summarize findings"
+        description="Search the web and summarize findings",
    )

    # Add success criteria
-    builder.add_success_criterion(
-        "search-results",
-        "Successfully retrieve at least 3 web search results"
-    )
-    builder.add_success_criterion(
-        "summary",
-        "Provide a clear, concise summary of the findings"
-    )
+    builder.add_success_criterion("search-results", "Successfully retrieve at least 3 web search results")
+    builder.add_success_criterion("summary", "Provide a clear, concise summary of the findings")

    # Add nodes that will use MCP tools
    builder.add_node(
@@ -192,6 +187,7 @@ async def main():
        except Exception as e:
            print(f"\nError running example: {e}")
            import traceback
+
            traceback.print_exc()
+11 -11
@@ -22,24 +22,24 @@ The framework includes a Goal-Based Testing system (Goal → Agent → Eval):
 See `framework.testing` for details.
 """

-from framework.schemas.decision import Decision, Option, Outcome, DecisionEvaluation
-from framework.schemas.run import Run, RunSummary, Problem
-from framework.runtime.core import Runtime
 from framework.builder.query import BuilderQuery
-from framework.llm import LLMProvider, AnthropicProvider
-from framework.runner import AgentRunner, AgentOrchestrator
+from framework.llm import AnthropicProvider, LLMProvider
+from framework.runner import AgentOrchestrator, AgentRunner
+from framework.runtime.core import Runtime
+from framework.schemas.decision import Decision, DecisionEvaluation, Option, Outcome
+from framework.schemas.run import Problem, Run, RunSummary

 # Testing framework
 from framework.testing import (
+    ApprovalStatus,
+    ConstraintTestGenerator,
+    DebugTool,
+    ErrorCategory,
+    SuccessCriteriaTestGenerator,
     Test,
     TestResult,
+    TestSuiteResult,
     TestStorage,
-    ApprovalStatus,
-    ErrorCategory,
-    ConstraintTestGenerator,
-    SuccessCriteriaTestGenerator,
-    DebugTool,
-    TestSuiteResult,
 )

 __all__ = [
@@ -2,12 +2,12 @@

 from framework.builder.query import BuilderQuery
 from framework.builder.workflow import (
-    GraphBuilder,
-    BuildSession,
     BuildPhase,
-    ValidationResult,
+    BuildSession,
+    GraphBuilder,
     TestCase,
     TestResult,
+    ValidationResult,
 )

 __all__ = [
@@ -8,12 +8,12 @@ This is designed around the questions I need to answer:
 4. What should we change? (suggestions)
 """

-from typing import Any
 from collections import defaultdict
 from pathlib import Path
+from typing import Any

 from framework.schemas.decision import Decision
-from framework.schemas.run import Run, RunSummary, RunStatus
+from framework.schemas.run import Run, RunStatus, RunSummary
 from framework.storage.backend import FileStorage
@@ -196,10 +196,7 @@ class BuilderQuery:
                break

        # Extract problems
-        problems = [
-            f"[{p.severity}] {p.description}"
-            for p in run.problems
-        ]
+        problems = [f"[{p.severity}] {p.description}" for p in run.problems]

        # Generate suggestions based on the failure
        suggestions = self._generate_suggestions(run, failed_decisions)
@@ -253,11 +250,7 @@ class BuilderQuery:
                error = decision.outcome.error or "Unknown error"
                failure_counts[error] += 1

-        common_failures = sorted(
-            failure_counts.items(),
-            key=lambda x: x[1],
-            reverse=True
-        )[:5]
+        common_failures = sorted(failure_counts.items(), key=lambda x: x[1], reverse=True)[:5]

        # Find problematic nodes
        node_stats: dict[str, dict[str, int]] = defaultdict(lambda: {"total": 0, "failed": 0})
@@ -328,34 +321,42 @@ class BuilderQuery:

        # Suggestion: Fix problematic nodes
        for node_id, failure_rate in patterns.problematic_nodes:
-            suggestions.append({
-                "type": "node_improvement",
-                "target": node_id,
-                "reason": f"Node has {failure_rate:.1%} failure rate",
-                "recommendation": f"Review and improve node '{node_id}' - high failure rate suggests prompt or tool issues",
-                "priority": "high" if failure_rate > 0.3 else "medium",
-            })
+            suggestions.append(
+                {
+                    "type": "node_improvement",
+                    "target": node_id,
+                    "reason": f"Node has {failure_rate:.1%} failure rate",
+                    "recommendation": (
+                        f"Review and improve node '{node_id}' - high failure rate suggests prompt or tool issues"
+                    ),
+                    "priority": "high" if failure_rate > 0.3 else "medium",
+                }
+            )

        # Suggestion: Address common failures
        for failure, count in patterns.common_failures:
            if count >= 2:
-                suggestions.append({
-                    "type": "error_handling",
-                    "target": failure,
-                    "reason": f"Error occurred {count} times",
-                    "recommendation": f"Add handling for: {failure}",
-                    "priority": "high" if count >= 5 else "medium",
-                })
+                suggestions.append(
+                    {
+                        "type": "error_handling",
+                        "target": failure,
+                        "reason": f"Error occurred {count} times",
+                        "recommendation": f"Add handling for: {failure}",
+                        "priority": "high" if count >= 5 else "medium",
+                    }
+                )

        # Suggestion: Overall success rate
        if patterns.success_rate < 0.8:
-            suggestions.append({
-                "type": "architecture",
-                "target": goal_id,
-                "reason": f"Goal success rate is only {patterns.success_rate:.1%}",
-                "recommendation": "Consider restructuring the agent graph or improving goal definition",
-                "priority": "high",
-            })
+            suggestions.append(
+                {
+                    "type": "architecture",
+                    "target": goal_id,
+                    "reason": f"Goal success rate is only {patterns.success_rate:.1%}",
+                    "recommendation": ("Consider restructuring the agent graph or improving goal definition"),
+                    "priority": "high",
+                }
+            )

        return suggestions
@@ -408,9 +409,8 @@ class BuilderQuery:
        alternatives = [o for o in decision.options if o.id != decision.chosen_option_id]
        if alternatives:
            alt_desc = alternatives[0].description
-            suggestions.append(
-                f"Consider alternative: '{alt_desc}' instead of '{chosen.description if chosen else 'unknown'}'"
-            )
+            chosen_desc = chosen.description if chosen else "unknown"
+            suggestions.append(f"Consider alternative: '{alt_desc}' instead of '{chosen_desc}'")

        # Check for missing context
        if not decision.input_context:
@@ -420,9 +420,8 @@ class BuilderQuery:

        # Check for constraint issues
        if decision.active_constraints:
-            suggestions.append(
-                f"Review constraints: {', '.join(decision.active_constraints)} - may be too restrictive"
-            )
+            constraints = ", ".join(decision.active_constraints)
+            suggestions.append(f"Review constraints: {constraints} - may be too restrictive")

        # Check for reported problems with suggestions
        for problem in run.problems:
@@ -471,12 +470,10 @@ class BuilderQuery:

        # Decision count difference
        if len(run1.decisions) != len(run2.decisions):
-            differences.append(
-                f"Decision count: {len(run1.decisions)} vs {len(run2.decisions)}"
-            )
+            differences.append(f"Decision count: {len(run1.decisions)} vs {len(run2.decisions)}")

        # Find first divergence point
-        for i, (d1, d2) in enumerate(zip(run1.decisions, run2.decisions)):
+        for i, (d1, d2) in enumerate(zip(run1.decisions, run2.decisions, strict=False)):
            if d1.chosen_option_id != d2.chosen_option_id:
                differences.append(
                    f"Diverged at decision {i}: chose '{d1.chosen_option_id}' vs '{d2.chosen_option_id}'"
@@ -13,32 +13,35 @@ Each step requires validation and human approval before proceeding.
 You cannot skip steps or bypass validation.
 """

+from collections.abc import Callable
+from datetime import datetime
 from enum import Enum
 from pathlib import Path
-from datetime import datetime
-from typing import Any, Callable
+from typing import Any

 from pydantic import BaseModel, Field

+from framework.graph.edge import EdgeCondition, EdgeSpec, GraphSpec
 from framework.graph.goal import Goal
 from framework.graph.node import NodeSpec
-from framework.graph.edge import EdgeSpec, EdgeCondition, GraphSpec


 class BuildPhase(str, Enum):
     """Current phase of the build process."""
-    INIT = "init"               # Just started
-    GOAL_DRAFT = "goal_draft"   # Drafting goal
+
+    INIT = "init"  # Just started
+    GOAL_DRAFT = "goal_draft"  # Drafting goal
     GOAL_APPROVED = "goal_approved"  # Goal approved
-    ADDING_NODES = "adding_nodes"    # Adding nodes
-    ADDING_EDGES = "adding_edges"    # Adding edges
-    TESTING = "testing"              # Running tests
-    APPROVED = "approved"            # Fully approved
-    EXPORTED = "exported"            # Exported to file
+    ADDING_NODES = "adding_nodes"  # Adding nodes
+    ADDING_EDGES = "adding_edges"  # Adding edges
+    TESTING = "testing"  # Running tests
+    APPROVED = "approved"  # Fully approved
+    EXPORTED = "exported"  # Exported to file


 class ValidationResult(BaseModel):
     """Result of a validation check."""
+
     valid: bool
     errors: list[str] = Field(default_factory=list)
     warnings: list[str] = Field(default_factory=list)
@@ -47,6 +50,7 @@ class ValidationResult(BaseModel):

 class TestCase(BaseModel):
     """A test case for validating agent behavior."""
+
     id: str
     description: str
     input: dict[str, Any]
@@ -56,6 +60,7 @@ class TestCase(BaseModel):

 class TestResult(BaseModel):
     """Result of running a test case."""
+
     test_id: str
     passed: bool
     actual_output: Any = None
@@ -69,6 +74,7 @@ class BuildSession(BaseModel):

     Saved after each approved step so you can resume later.
     """
+
     id: str
     name: str
     phase: BuildPhase = BuildPhase.INIT
@@ -457,11 +463,14 @@ class GraphBuilder:

        # Run the test
        import asyncio
-        result = asyncio.run(executor.execute(
-            graph=graph,
-            goal=self.session.goal,
-            input_data=test.input,
-        ))
+
+        result = asyncio.run(
+            executor.execute(
+                graph=graph,
+                goal=self.session.goal,
+                input_data=test.input,
+            )
+        )

        # Check result
        passed = result.success
@@ -515,12 +524,14 @@ class GraphBuilder:
        if not self._pending_validation.valid:
            return False

-        self.session.approvals.append({
-            "phase": self.session.phase.value,
-            "comment": comment,
-            "timestamp": datetime.now().isoformat(),
-            "validation": self._pending_validation.model_dump(),
-        })
+        self.session.approvals.append(
+            {
+                "phase": self.session.phase.value,
+                "comment": comment,
+                "timestamp": datetime.now().isoformat(),
+                "validation": self._pending_validation.model_dump(),
+            }
+        )

        # Advance phase if appropriate
        if self.session.phase == BuildPhase.GOAL_DRAFT:
@@ -554,11 +565,13 @@ class GraphBuilder:
            return False

        self.session.phase = BuildPhase.APPROVED
-        self.session.approvals.append({
-            "phase": "final",
-            "comment": comment,
-            "timestamp": datetime.now().isoformat(),
-        })
+        self.session.approvals.append(
+            {
+                "phase": "final",
+                "comment": comment,
+                "timestamp": datetime.now().isoformat(),
+            }
+        )

        self._save_session()
        return True
@@ -630,69 +643,75 @@ class GraphBuilder:
        """Generate Python code for the graph."""
        lines = [
            '"""',
-            f'Generated agent: {self.session.name}',
-            f'Generated at: {datetime.now().isoformat()}',
+            f"Generated agent: {self.session.name}",
+            f"Generated at: {datetime.now().isoformat()}",
            '"""',
-            '',
-            'from framework.graph import (',
-            '    Goal, SuccessCriterion, Constraint,',
-            '    NodeSpec, EdgeSpec, EdgeCondition,',
-            ')',
-            'from framework.graph.edge import GraphSpec',
-            'from framework.graph.goal import GoalStatus',
-            '',
-            '',
-            '# Goal',
+            "",
+            "from framework.graph import (",
+            "    Goal, SuccessCriterion, Constraint,",
+            "    NodeSpec, EdgeSpec, EdgeCondition,",
+            ")",
+            "from framework.graph.edge import GraphSpec",
+            "from framework.graph.goal import GoalStatus",
+            "",
+            "",
+            "# Goal",
        ]

        if self.session.goal:
            goal_json = self.session.goal.model_dump_json(indent=4)
-            lines.append('GOAL = Goal.model_validate_json(\'\'\'')
+            lines.append("GOAL = Goal.model_validate_json('''")
            lines.append(goal_json)
            lines.append("''')")
        else:
-            lines.append('GOAL = None')
+            lines.append("GOAL = None")

-        lines.extend([
-            '',
-            '',
-            '# Nodes',
-            'NODES = [',
-        ])
+        lines.extend(
+            [
+                "",
+                "",
+                "# Nodes",
+                "NODES = [",
+            ]
+        )

        for node in self.session.nodes:
            node_json = node.model_dump_json(indent=4)
-            lines.append('    NodeSpec.model_validate_json(\'\'\'')
+            lines.append("    NodeSpec.model_validate_json('''")
            lines.append(node_json)
            lines.append("    '''),")

-        lines.extend([
-            ']',
-            '',
-            '',
-            '# Edges',
-            'EDGES = [',
-        ])
+        lines.extend(
+            [
+                "]",
+                "",
+                "",
+                "# Edges",
+                "EDGES = [",
+            ]
+        )

        for edge in self.session.edges:
            edge_json = edge.model_dump_json(indent=4)
-            lines.append('    EdgeSpec.model_validate_json(\'\'\'')
+            lines.append("    EdgeSpec.model_validate_json('''")
            lines.append(edge_json)
            lines.append("    '''),")

-        lines.extend([
-            ']',
-            '',
-            '',
-            '# Graph',
-        ])
+        lines.extend(
+            [
+                "]",
+                "",
+                "",
+                "# Graph",
+            ]
+        )

        graph_json = graph.model_dump_json(indent=4)
-        lines.append('GRAPH = GraphSpec.model_validate_json(\'\'\'')
+        lines.append("GRAPH = GraphSpec.model_validate_json('''")
        lines.append(graph_json)
        lines.append("''')")

-        return '\n'.join(lines)
+        return "\n".join(lines)

    # =========================================================================
    # SESSION MANAGEMENT
@@ -743,7 +762,7 @@ class GraphBuilder:
            "tests": len(self.session.test_cases),
            "tests_passed": sum(1 for t in self.session.test_results if t.passed),
            "approvals": len(self.session.approvals),
-            "pending_validation": self._pending_validation.model_dump() if self._pending_validation else None,
+            "pending_validation": (self._pending_validation.model_dump() if self._pending_validation else None),
        }

    def show(self) -> str:
@@ -755,11 +774,13 @@ class GraphBuilder:
        ]

        if self.session.goal:
-            lines.extend([
-                f"Goal: {self.session.goal.name}",
-                f" {self.session.goal.description}",
-                "",
-            ])
+            lines.extend(
+                [
+                    f"Goal: {self.session.goal.name}",
+                    f" {self.session.goal.description}",
+                    "",
+                ]
+            )

        if self.session.nodes:
            lines.append("Nodes:")
@@ -23,9 +23,7 @@ import sys


 def main():
-    parser = argparse.ArgumentParser(
-        description="Goal Agent - Build and run goal-driven agents"
-    )
+    parser = argparse.ArgumentParser(description="Goal Agent - Build and run goal-driven agents")
     parser.add_argument(
         "--model",
         default="claude-haiku-4-5-20251001",
@@ -36,10 +34,12 @@ def main():

    # Register runner commands (run, info, validate, list, dispatch, shell)
    from framework.runner.cli import register_commands
+
    register_commands(subparsers)

    # Register testing commands (test-generate, test-approve, test-run, test-debug, etc.)
    from framework.testing.cli import register_testing_commands
+
    register_testing_commands(subparsers)

    args = parser.parse_args()
@@ -1,32 +1,32 @@
 """Graph structures: Goals, Nodes, Edges, and Flexible Execution."""

-from framework.graph.goal import Goal, SuccessCriterion, Constraint, GoalStatus
-from framework.graph.node import NodeSpec, NodeContext, NodeResult, NodeProtocol
-from framework.graph.edge import EdgeSpec, EdgeCondition
+from framework.graph.code_sandbox import CodeSandbox, safe_eval, safe_exec
+from framework.graph.edge import EdgeCondition, EdgeSpec
 from framework.graph.executor import GraphExecutor
+from framework.graph.flexible_executor import ExecutorConfig, FlexibleGraphExecutor
+from framework.graph.goal import Constraint, Goal, GoalStatus, SuccessCriterion
+from framework.graph.judge import HybridJudge, create_default_judge
+from framework.graph.node import NodeContext, NodeProtocol, NodeResult, NodeSpec

 # Flexible execution (Worker-Judge pattern)
 from framework.graph.plan import (
-    Plan,
-    PlanStep,
     ActionSpec,
     ActionType,
-    StepStatus,
-    Judgment,
-    JudgmentAction,
-    EvaluationRule,
-    PlanExecutionResult,
-    ExecutionStatus,
-    load_export,
     # HITL (Human-in-the-loop)
     ApprovalDecision,
     ApprovalRequest,
     ApprovalResult,
+    EvaluationRule,
+    ExecutionStatus,
+    Judgment,
+    JudgmentAction,
+    Plan,
+    PlanExecutionResult,
+    PlanStep,
+    StepStatus,
+    load_export,
 )
-from framework.graph.judge import HybridJudge, create_default_judge
-from framework.graph.worker_node import WorkerNode, StepExecutionResult
-from framework.graph.flexible_executor import FlexibleGraphExecutor, ExecutorConfig
-from framework.graph.code_sandbox import CodeSandbox, safe_exec, safe_eval
+from framework.graph.worker_node import StepExecutionResult, WorkerNode

 __all__ = [
     # Goal
@@ -13,11 +13,11 @@ Security measures:
 """

 import ast
-import sys
 import signal
-from typing import Any
-from dataclasses import dataclass, field
+import sys
 from contextlib import contextmanager
+from dataclasses import dataclass, field
+from typing import Any

 # Safe builtins whitelist
 SAFE_BUILTINS = {
@@ -25,7 +25,6 @@ SAFE_BUILTINS = {
     "True": True,
     "False": False,
     "None": None,
-
     # Type constructors
     "bool": bool,
     "int": int,
@@ -36,7 +35,6 @@ SAFE_BUILTINS = {
     "set": set,
     "tuple": tuple,
     "frozenset": frozenset,
-
     # Basic functions
     "abs": abs,
     "all": all,
@@ -97,22 +95,26 @@ BLOCKED_AST_NODES = {

 class CodeSandboxError(Exception):
     """Error during sandboxed code execution."""
+
     pass


 class TimeoutError(CodeSandboxError):
     """Code execution timed out."""
+
     pass


 class SecurityError(CodeSandboxError):
     """Code contains potentially dangerous operations."""
+
     pass


 @dataclass
 class SandboxResult:
     """Result of sandboxed code execution."""
+
     success: bool
     result: Any = None
     error: str | None = None
@@ -134,6 +136,7 @@ class RestrictedImporter:

        if name not in self._cache:
            import importlib
+
            self._cache[name] = importlib.import_module(name)

        return self._cache[name]
@@ -161,24 +164,19 @@ class CodeValidator:
        for node in ast.walk(tree):
            # Check for blocked node types
            if type(node) in self.blocked_nodes:
-                issues.append(
-                    f"Blocked operation: {type(node).__name__} at line {getattr(node, 'lineno', '?')}"
-                )
+                lineno = getattr(node, "lineno", "?")
+                issues.append(f"Blocked operation: {type(node).__name__} at line {lineno}")

            # Check for dangerous attribute access
            if isinstance(node, ast.Attribute):
                if node.attr.startswith("_"):
-                    issues.append(
-                        f"Access to private attribute '{node.attr}' at line {node.lineno}"
-                    )
+                    issues.append(f"Access to private attribute '{node.attr}' at line {node.lineno}")

            # Check for exec/eval calls
            if isinstance(node, ast.Call):
                if isinstance(node.func, ast.Name):
                    if node.func.id in ("exec", "eval", "compile", "__import__"):
-                        issues.append(
-                            f"Blocked function call: {node.func.id} at line {node.lineno}"
-                        )
+                        issues.append(f"Blocked function call: {node.func.id} at line {node.lineno}")

        return issues
@@ -212,11 +210,12 @@ class CodeSandbox:
    @contextmanager
    def _timeout_context(self, seconds: int):
        """Context manager for timeout enforcement."""
+
        def handler(signum, frame):
            raise TimeoutError(f"Code execution timed out after {seconds} seconds")

        # Only works on Unix-like systems
-        if hasattr(signal, 'SIGALRM'):
+        if hasattr(signal, "SIGALRM"):
            old_handler = signal.signal(signal.SIGALRM, handler)
            signal.alarm(seconds)
            try:
@@ -275,6 +274,7 @@ class CodeSandbox:

        # Capture stdout
        import io
+
        old_stdout = sys.stdout
        sys.stdout = captured_stdout = io.StringIO()

@@ -296,11 +296,7 @@ class CodeSandbox:

        # Also extract any new variables (not in inputs or builtins)
        for key, value in namespace.items():
-            if (
-                key not in inputs
-                and key not in self.safe_builtins
-                and not key.startswith("_")
-            ):
+            if key not in inputs and key not in self.safe_builtins and not key.startswith("_"):
                extracted[key] = value

        return SandboxResult(
@@ -21,19 +21,20 @@ allowing the LLM to evaluate whether proceeding along an edge makes sense
 given the current goal, context, and execution state.
 """

-from typing import Any
 from enum import Enum
+from typing import Any

 from pydantic import BaseModel, Field


 class EdgeCondition(str, Enum):
     """When an edge should be traversed."""
-    ALWAYS = "always"            # Always after source completes
-    ON_SUCCESS = "on_success"    # Only if source succeeds
-    ON_FAILURE = "on_failure"    # Only if source fails
-    CONDITIONAL = "conditional"  # Based on expression
-    LLM_DECIDE = "llm_decide"    # Let LLM decide based on goal and context
+
+    ALWAYS = "always"  # Always after source completes
+    ON_SUCCESS = "on_success"  # Only if source succeeds
+    ON_FAILURE = "on_failure"  # Only if source fails
+    CONDITIONAL = "conditional"  # Based on expression
+    LLM_DECIDE = "llm_decide"  # Let LLM decide based on goal and context


 class EdgeSpec(BaseModel):
@@ -68,6 +69,7 @@ class EdgeSpec(BaseModel):
            description="Only filter if results need refinement to meet goal",
        )
    """
+
    id: str
    source: str = Field(description="Source node ID")
    target: str = Field(description="Target node ID")
@@ -76,20 +78,17 @@ class EdgeSpec(BaseModel):
    condition: EdgeCondition = EdgeCondition.ALWAYS
    condition_expr: str | None = Field(
        default=None,
-        description="Expression for CONDITIONAL edges, e.g., 'output.confidence > 0.8'"
+        description="Expression for CONDITIONAL edges, e.g., 'output.confidence > 0.8'",
    )

    # Data flow
    input_mapping: dict[str, str] = Field(
        default_factory=dict,
-        description="Map source outputs to target inputs: {target_key: source_key}"
+        description="Map source outputs to target inputs: {target_key: source_key}",
    )

    # Priority for multiple outgoing edges
-    priority: int = Field(
-        default=0,
-        description="Higher priority edges are evaluated first"
-    )
+    priority: int = Field(default=0, description="Higher priority edges are evaluated first")

    # Metadata
    description: str = ""
@@ -164,7 +163,7 @@ class EdgeSpec(BaseModel):
            "output": output,
            "memory": memory,
            "result": output.get("result"),
-            "true": True,   # Allow lowercase true/false in conditions
+            "true": True,  # Allow lowercase true/false in conditions
            "false": False,
            **memory,  # Unpack memory keys directly into context
        }
@@ -175,6 +174,7 @@ class EdgeSpec(BaseModel):
        except Exception as e:
            # Log the error for debugging
            import logging
+
            logger = logging.getLogger(__name__)
            logger.warning(f"  ⚠ Condition evaluation failed: {self.condition_expr}")
            logger.warning(f"  Error: {e}")
@@ -235,7 +235,8 @@ Respond with ONLY a JSON object:

            # Parse response
            import re
-            json_match = re.search(r'\{[^{}]*\}', response.content, re.DOTALL)
+
+            json_match = re.search(r"\{[^{}]*\}", response.content, re.DOTALL)
            if json_match:
                data = json.loads(json_match.group())
                proceed = data.get("proceed", False)
@@ -243,6 +244,7 @@ Respond with ONLY a JSON object:

                # Log the decision (using basic print for now)
                import logging
+
                logger = logging.getLogger(__name__)
                logger.info(f"  🤔 LLM routing decision: {'PROCEED' if proceed else 'SKIP'}")
                logger.info(f"  Reason: {reasoning}")
@@ -252,6 +254,7 @@ Respond with ONLY a JSON object:
        except Exception as e:
            # Fallback: proceed on success
            import logging
+
            logger = logging.getLogger(__name__)
            logger.warning(f"  ⚠ LLM routing failed, defaulting to on_success: {e}")
            return source_success
@@ -304,6 +307,7 @@ class GraphSpec(BaseModel):
            edges=[...],
        )
    """
+
    id: str
    goal_id: str
    version: str = "1.0.0"
@@ -312,42 +316,26 @@ class GraphSpec(BaseModel):
    entry_node: str = Field(description="ID of the first node to execute")
    entry_points: dict[str, str] = Field(
        default_factory=dict,
-        description="Named entry points for resuming execution. Format: {name: node_id}"
-    )
-    terminal_nodes: list[str] = Field(
-        default_factory=list,
-        description="IDs of nodes that end execution"
-    )
-    pause_nodes: list[str] = Field(
-        default_factory=list,
-        description="IDs of nodes that pause execution for HITL input"
+        description="Named entry points for resuming execution. Format: {name: node_id}",
    )
+    terminal_nodes: list[str] = Field(default_factory=list, description="IDs of nodes that end execution")
+    pause_nodes: list[str] = Field(default_factory=list, description="IDs of nodes that pause execution for HITL input")

    # Components
    nodes: list[Any] = Field(  # NodeSpec, but avoiding circular import
-        default_factory=list,
-        description="All node specifications"
-    )
-    edges: list[EdgeSpec] = Field(
-        default_factory=list,
-        description="All edge specifications"
+        default_factory=list, description="All node specifications"
    )
+    edges: list[EdgeSpec] = Field(default_factory=list, description="All edge specifications")

    # Shared memory keys
-    memory_keys: list[str] = Field(
-        default_factory=list,
-        description="Keys available in shared memory"
-    )
+    memory_keys: list[str] = Field(default_factory=list, description="Keys available in shared memory")

    # Default LLM settings
    default_model: str = "claude-haiku-4-5-20251001"
    max_tokens: int = 1024

    # Execution limits
-    max_steps: int = Field(
-        default=100,
-        description="Maximum node executions before timeout"
-    )
+    max_steps: int = Field(default=100, description="Maximum node executions before timeout")
    max_retries_per_node: int = 3

    # Metadata
@@ -10,28 +10,30 @@ The executor:
 """

 import logging
-from typing import Any, Callable
+from collections.abc import Callable
 from dataclasses import dataclass, field
+from typing import Any

-from framework.runtime.core import Runtime
-from framework.graph.edge import GraphSpec
 from framework.graph.goal import Goal
 from framework.graph.node import (
-    NodeSpec,
-    NodeContext,
-    NodeResult,
-    NodeProtocol,
-    SharedMemory,
-    LLMNode,
-    RouterNode,
     FunctionNode,
+    LLMNode,
+    NodeContext,
+    NodeProtocol,
+    NodeResult,
+    NodeSpec,
+    RouterNode,
+    SharedMemory,
 )
+from framework.graph.edge import GraphSpec
 from framework.llm.provider import LLMProvider, Tool
+from framework.runtime.core import Runtime


 @dataclass
 class ExecutionResult:
     """Result of executing a graph."""
+
     success: bool
     output: dict[str, Any] = field(default_factory=dict)
     error: str | None = None
@@ -125,7 +127,8 @@ class GraphExecutor:
            # Restore memory from previous session
            for key, value in session_state["memory"].items():
                memory.write(key, value)
-            self.logger.info(f"📥 Restored session state with {len(session_state['memory'])} memory keys")
+            num_keys = len(session_state["memory"])
+            self.logger.info(f"📥 Restored session state with {num_keys} memory keys")

        # Write new input data to memory (each key individually)
        if input_data:
@@ -212,7 +215,9 @@ class GraphExecutor:
            result = await node_impl.execute(ctx)

            if result.success:
-                self.logger.info(f"  ✓ Success (tokens: {result.tokens_used}, latency: {result.latency_ms}ms)")
+                tokens = result.tokens_used
+                latency = result.latency_ms
+                self.logger.info(f"  ✓ Success (tokens: {tokens}, latency: {latency}ms)")

                # Generate and log human-readable summary
                summary = result.to_summary(node_spec)
@@ -393,10 +398,7 @@ class GraphExecutor:

        if node_spec.node_type == "function":
            # Function nodes need explicit registration
-            raise RuntimeError(
-                f"Function node '{node_spec.id}' not registered. "
-                "Register with node_registry."
-            )
+            raise RuntimeError(f"Function node '{node_spec.id}' not registered. Register with node_registry.")

        # Default to LLM node
        return LLMNode(tool_executor=self.tool_executor)
@@ -15,28 +15,29 @@ using a Worker-Judge loop:
 This keeps planning external while execution/evaluation is internal.
 """

-from typing import Any, Callable
+from collections.abc import Callable
 from dataclasses import dataclass
 from datetime import datetime
+from typing import Any

-from framework.runtime.core import Runtime
+from framework.graph.code_sandbox import CodeSandbox
 from framework.graph.goal import Goal
+from framework.graph.judge import HybridJudge, create_default_judge
 from framework.graph.plan import (
-    Plan,
-    PlanStep,
-    PlanExecutionResult,
-    ExecutionStatus,
-    StepStatus,
-    Judgment,
-    JudgmentAction,
+    ApprovalDecision,
     ApprovalRequest,
     ApprovalResult,
-    ApprovalDecision,
+    ExecutionStatus,
+    Judgment,
+    JudgmentAction,
+    Plan,
+    PlanExecutionResult,
+    PlanStep,
+    StepStatus,
 )
-from framework.graph.judge import HybridJudge, create_default_judge
-from framework.graph.worker_node import WorkerNode, StepExecutionResult
-from framework.graph.code_sandbox import CodeSandbox
+from framework.graph.worker_node import StepExecutionResult, WorkerNode
 from framework.llm.provider import LLMProvider, Tool
+from framework.runtime.core import Runtime

 # Type alias for approval callback
 ApprovalCallback = Callable[[ApprovalRequest], ApprovalResult]
@@ -45,6 +46,7 @@ ApprovalCallback = Callable[[ApprovalRequest], ApprovalResult]
 @dataclass
 class ExecutorConfig:
     """Configuration for FlexibleGraphExecutor."""
+
     max_retries_per_step: int = 3
     max_total_steps: int = 100
     timeout_seconds: int = 300
@@ -165,7 +167,7 @@ class FlexibleGraphExecutor:
                status=ExecutionStatus.NEEDS_REPLAN,
                plan=plan,
                context=context,
-                feedback="No executable steps available but plan not complete. Check dependencies.",
+                feedback=("No executable steps available but plan not complete. Check dependencies."),
                steps_executed=steps_executed,
                total_tokens=total_tokens,
                total_latency=total_latency,
@@ -174,7 +176,8 @@ class FlexibleGraphExecutor:
            # Execute next step (for now, sequential; could be parallel)
            step = ready_steps[0]
            # Debug: show ready steps
-            # print(f"  [DEBUG] Ready steps: {[s.id for s in ready_steps]}, executing: {step.id}")
+            # ready_ids = [s.id for s in ready_steps]
+            # print(f"  [DEBUG] Ready steps: {ready_ids}, executing: {step.id}")

            # APPROVAL CHECK - before execution
            if step.requires_approval:
@@ -360,7 +363,7 @@ class FlexibleGraphExecutor:
                    status=ExecutionStatus.NEEDS_REPLAN,
                    plan=plan,
                    context=context,
-                    feedback=f"Step '{step.id}' failed after {step.attempts} attempts: {judgment.feedback}",
+                    feedback=(f"Step '{step.id}' failed after {step.attempts} attempts: {judgment.feedback}"),
                    steps_executed=steps_executed,
                    total_tokens=total_tokens,
                    total_latency=total_latency,
@@ -450,12 +453,16 @@ class FlexibleGraphExecutor:
            preview_parts.append(f"Tool: {step.action.tool_name}")
            if step.action.tool_args:
                import json
+
                args_preview = json.dumps(step.action.tool_args, indent=2, default=str)
                if len(args_preview) > 500:
                    args_preview = args_preview[:500] + "..."
                preview_parts.append(f"Args: {args_preview}")
        elif step.action.prompt:
-            prompt_preview = step.action.prompt[:300] + "..." if len(step.action.prompt) > 300 else step.action.prompt
+            if len(step.action.prompt) > 300:
+                prompt_preview = step.action.prompt[:300] + "..."
+            else:
+                prompt_preview = step.action.prompt
            preview_parts.append(f"Prompt: {prompt_preview}")

        # Include step inputs resolved from context (what will be sent/used)
@@ -12,20 +12,21 @@ Goals are:
 """

 from datetime import datetime
-from typing import Any
 from enum import Enum
+from typing import Any

 from pydantic import BaseModel, Field


 class GoalStatus(str, Enum):
     """Lifecycle status of a goal."""
-    DRAFT = "draft"          # Being defined
-    READY = "ready"          # Ready for agent creation
-    ACTIVE = "active"        # Has an agent graph, can execute
-    COMPLETED = "completed"  # Achieved
-    FAILED = "failed"        # Could not be achieved
-    SUSPENDED = "suspended"  # Paused for revision
+
+    DRAFT = "draft"  # Being defined
+    READY = "ready"  # Ready for agent creation
+    ACTIVE = "active"  # Has an agent graph, can execute
+    COMPLETED = "completed"  # Achieved
+    FAILED = "failed"  # Could not be achieved
+    SUSPENDED = "suspended"  # Paused for revision


 class SuccessCriterion(BaseModel):
@@ -37,22 +38,12 @@ class SuccessCriterion(BaseModel):
    - Measurable: Can be evaluated programmatically or by LLM
    - Achievable: Within the agent's capabilities
    """
+
    id: str
-    description: str = Field(
-        description="Human-readable description of what success looks like"
-    )
-    metric: str = Field(
-        description="How to measure: 'output_contains', 'output_equals', 'llm_judge', 'custom'"
-    )
-    target: Any = Field(
-        description="The target value or condition"
-    )
-    weight: float = Field(
-        default=1.0,
-        ge=0.0,
-        le=1.0,
-        description="Relative importance (0-1)"
-    )
+    description: str = Field(description="Human-readable description of what success looks like")
+    metric: str = Field(description="How to measure: 'output_contains', 'output_equals', 'llm_judge', 'custom'")
+    target: Any = Field(description="The target value or condition")
+    weight: float = Field(default=1.0, ge=0.0, le=1.0, description="Relative importance (0-1)")
    met: bool = False

    model_config = {"extra": "allow"}
@@ -66,19 +57,12 @@ class Constraint(BaseModel):
    - Hard: Violation means failure
    - Soft: Violation is discouraged but allowed
    """
+
    id: str
    description: str
-    constraint_type: str = Field(
-        description="Type: 'hard' (must not violate) or 'soft' (prefer not to violate)"
-    )
-    category: str = Field(
-        default="general",
-        description="Category: 'time', 'cost', 'safety', 'scope', 'quality'"
-    )
-    check: str = Field(
-        default="",
-        description="How to check: expression, function name, or 'llm_judge'"
-    )
+    constraint_type: str = Field(description="Type: 'hard' (must not violate) or 'soft' (prefer not to violate)")
+    category: str = Field(default="general", description="Category: 'time', 'cost', 'safety', 'scope', 'quality'")
+    check: str = Field(default="", description="How to check: expression, function name, or 'llm_judge'")

    model_config = {"extra": "allow"}

@@ -119,6 +103,7 @@ class Goal(BaseModel):
        ]
    )
    """
+
    id: str
    name: str
    description: str
@@ -133,24 +118,18 @@ class Goal(BaseModel):
    # Context for the agent
    context: dict[str, Any] = Field(
        default_factory=dict,
-        description="Additional context: domain knowledge, user preferences, etc."
+        description="Additional context: domain knowledge, user preferences, etc.",
    )

    # Capabilities required
    required_capabilities: list[str] = Field(
        default_factory=list,
-        description="What the agent needs: 'llm', 'web_search', 'code_execution', etc."
+        description="What the agent needs: 'llm', 'web_search', 'code_execution', etc.",
    )

    # Input/output schema
-    input_schema: dict[str, Any] = Field(
-        default_factory=dict,
-        description="Expected input format"
-    )
-    output_schema: dict[str, Any] = Field(
-        default_factory=dict,
-        description="Expected output format"
-    )
+    input_schema: dict[str, Any] = Field(default_factory=dict, description="Expected input format")
+    output_schema: dict[str, Any] = Field(default_factory=dict, description="Expected output format")

    # Versioning for evolution
    version: str = "1.0.0"
@@ -12,6 +12,7 @@ from typing import Any

 class HITLInputType(str, Enum):
     """Type of input expected from human."""
+
     FREE_TEXT = "free_text"  # Open-ended text response
     STRUCTURED = "structured"  # Specific fields to fill
     SELECTION = "selection"  # Choose from options
@@ -22,6 +23,7 @@ class HITLInputType(str, Enum):
 @dataclass
 class HITLQuestion:
     """A single question to ask the human."""
+
     id: str
     question: str
     input_type: HITLInputType = HITLInputType.FREE_TEXT
@@ -44,6 +46,7 @@ class HITLRequest:

     This is what the agent produces when it needs human input.
     """
+
     # Context
     objective: str  # What we're trying to accomplish
     current_state: str  # Where we are in the process
@@ -92,6 +95,7 @@ class HITLResponse:

     This is what gets passed back when resuming from a pause.
     """
+
     # Original request reference
     request_id: str

@@ -170,13 +174,11 @@ class HITLProtocol:

        # Use Haiku to extract answers
        try:
-            import anthropic
            import json

-            questions_str = "\n".join([
-                f"{i+1}. {q.question} (id: {q.id})"
-                for i, q in enumerate(request.questions)
-            ])
+            import anthropic
+
+            questions_str = "\n".join([f"{i + 1}. {q.question} (id: {q.id})" for i, q in enumerate(request.questions)])

            prompt = f"""Parse the user's response and extract answers for each question.

@@ -195,13 +197,14 @@ Example format:
            message = client.messages.create(
                model="claude-3-5-haiku-20241022",
                max_tokens=500,
-                messages=[{"role": "user", "content": prompt}]
+                messages=[{"role": "user", "content": prompt}],
            )

            # Parse Haiku's response
            import re
+
            response_text = message.content[0].text.strip()
-            json_match = re.search(r'\{[^{}]*\}', response_text, re.DOTALL)
+            json_match = re.search(r"\{[^{}]*\}", response_text, re.DOTALL)

            if json_match:
                parsed = json.loads(json_match.group())
@@ -8,23 +8,24 @@ The HybridJudge evaluates step execution results using:
 Escalation path: rules → LLM → human
 """

-from typing import Any
 from dataclasses import dataclass, field
+from typing import Any

+from framework.graph.code_sandbox import safe_eval
+from framework.graph.goal import Goal
 from framework.graph.plan import (
-    PlanStep,
-    EvaluationRule,
     Judgment,
     JudgmentAction,
+    EvaluationRule,
+    PlanStep,
 )
-from framework.graph.goal import Goal
-from framework.graph.code_sandbox import safe_eval
 from framework.llm.provider import LLMProvider


 @dataclass
 class RuleEvaluationResult:
     """Result of rule-based evaluation."""
+
     is_definitive: bool  # True if a rule matched definitively
     judgment: Judgment | None = None
     context: dict[str, Any] = field(default_factory=dict)
@@ -136,9 +137,9 @@ class HybridJudge:

        # Build evaluation context
        eval_context = {
-            "step": step.model_dump() if hasattr(step, 'model_dump') else step,
+            "step": step.model_dump() if hasattr(step, "model_dump") else step,
            "result": result,
-            "goal": goal.model_dump() if hasattr(goal, 'model_dump') else goal,
+            "goal": goal.model_dump() if hasattr(goal, "model_dump") else goal,
            "context": context,
            "success": isinstance(result, dict) and result.get("success", False),
            "error": isinstance(result, dict) and result.get("error"),
@@ -216,7 +217,9 @@ class HybridJudge:
            # Low confidence - escalate
            return Judgment(
                action=JudgmentAction.ESCALATE,
-                reasoning=f"LLM confidence ({judgment.confidence:.2f}) below threshold ({self.llm_confidence_threshold})",
+                reasoning=(
+                    f"LLM confidence ({judgment.confidence:.2f}) below threshold ({self.llm_confidence_threshold})"
+                ),
                feedback=judgment.feedback,
                confidence=judgment.confidence,
                llm_used=True,
@@ -338,52 +341,64 @@ def create_default_judge(llm: LLMProvider | None = None) -> HybridJudge:
    judge = HybridJudge(llm=llm)

    # Rule: Accept on explicit success flag
-    judge.add_rule(EvaluationRule(
-        id="explicit_success",
-        description="Step explicitly marked as successful",
-        condition="isinstance(result, dict) and result.get('success') == True",
-        action=JudgmentAction.ACCEPT,
-        priority=100,
-    ))
+    judge.add_rule(
+        EvaluationRule(
+            id="explicit_success",
+            description="Step explicitly marked as successful",
+            condition="isinstance(result, dict) and result.get('success') == True",
+            action=JudgmentAction.ACCEPT,
+            priority=100,
+        )
+    )

    # Rule: Retry on transient errors
-    judge.add_rule(EvaluationRule(
-        id="transient_error_retry",
-        description="Transient error that can be retried",
-        condition="isinstance(result, dict) and result.get('error_type') in ['timeout', 'rate_limit', 'connection_error']",
-        action=JudgmentAction.RETRY,
-        feedback_template="Transient error: {result[error]}. Please retry.",
-        priority=90,
-    ))
+    judge.add_rule(
+        EvaluationRule(
+            id="transient_error_retry",
+            description="Transient error that can be retried",
+            condition=(
+                "isinstance(result, dict) and result.get('error_type') in ['timeout', 'rate_limit', 'connection_error']"
+            ),
+            action=JudgmentAction.RETRY,
+            feedback_template="Transient error: {result[error]}. Please retry.",
+            priority=90,
+        )
+    )

    # Rule: Replan on missing data
-    judge.add_rule(EvaluationRule(
-        id="missing_data_replan",
-        description="Required data not available",
-        condition="isinstance(result, dict) and result.get('error_type') == 'missing_data'",
-        action=JudgmentAction.REPLAN,
-        feedback_template="Missing required data: {result[error]}. Plan needs adjustment.",
-        priority=80,
-    ))
+    judge.add_rule(
+        EvaluationRule(
+            id="missing_data_replan",
+            description="Required data not available",
+            condition="isinstance(result, dict) and result.get('error_type') == 'missing_data'",
+            action=JudgmentAction.REPLAN,
+            feedback_template="Missing required data: {result[error]}. Plan needs adjustment.",
+            priority=80,
+        )
+    )

    # Rule: Escalate on security issues
-    judge.add_rule(EvaluationRule(
-        id="security_escalate",
-        description="Security issue detected",
-        condition="isinstance(result, dict) and result.get('error_type') == 'security'",
-        action=JudgmentAction.ESCALATE,
-        feedback_template="Security issue detected: {result[error]}",
-        priority=200,
-    ))
+    judge.add_rule(
+        EvaluationRule(
+            id="security_escalate",
+            description="Security issue detected",
+            condition="isinstance(result, dict) and result.get('error_type') == 'security'",
+            action=JudgmentAction.ESCALATE,
+            feedback_template="Security issue detected: {result[error]}",
+            priority=200,
+        )
+    )

    # Rule: Fail on max retries exceeded
-    judge.add_rule(EvaluationRule(
-        id="max_retries_fail",
-        description="Maximum retries exceeded",
-        condition="step.get('attempts', 0) >= step.get('max_retries', 3)",
-        action=JudgmentAction.REPLAN,
-        feedback_template="Step '{step[id]}' failed after {step[attempts]} attempts",
-        priority=150,
-    ))
+    judge.add_rule(
+        EvaluationRule(
+            id="max_retries_fail",
+            description="Maximum retries exceeded",
+            condition="step.get('attempts', 0) >= step.get('max_retries', 3)",
+            action=JudgmentAction.REPLAN,
+            feedback_template="Step '{step[id]}' failed after {step[attempts]} attempts",
+            priority=150,
+        )
+    )

    return judge
+101 -96
@@ -17,13 +17,14 @@ Protocol:
|
||||
|
||||
import logging
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any, Callable
|
||||
from collections.abc import Callable
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from framework.runtime.core import Runtime
|
||||
from framework.llm.provider import LLMProvider, Tool
|
||||
from framework.runtime.core import Runtime
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -48,6 +49,7 @@ class NodeSpec(BaseModel):
|
||||
system_prompt="You are a calculator..."
|
||||
)
|
||||
"""
|
||||
|
||||
id: str
|
||||
name: str
|
||||
description: str
|
||||
@@ -55,51 +57,27 @@ class NodeSpec(BaseModel):
|
||||
# Node behavior type
|
||||
node_type: str = Field(
|
||||
default="llm_tool_use",
|
||||
description="Type: 'llm_tool_use', 'llm_generate', 'function', 'router', 'human_input'"
|
||||
description="Type: 'llm_tool_use', 'llm_generate', 'function', 'router', 'human_input'",
|
||||
)
|
||||
|
||||
# Data flow
|
||||
input_keys: list[str] = Field(
|
||||
default_factory=list,
|
||||
description="Keys this node reads from shared memory or input"
|
||||
)
|
||||
output_keys: list[str] = Field(
|
||||
default_factory=list,
|
||||
description="Keys this node writes to shared memory or output"
|
||||
)
|
||||
input_keys: list[str] = Field(default_factory=list, description="Keys this node reads from shared memory or input")
|
||||
output_keys: list[str] = Field(default_factory=list, description="Keys this node writes to shared memory or output")
|
||||
|
||||
# For LLM nodes
|
||||
system_prompt: str | None = Field(
|
||||
default=None,
|
||||
description="System prompt for LLM nodes"
|
||||
)
|
||||
tools: list[str] = Field(
|
||||
default_factory=list,
|
||||
description="Tool names this node can use"
|
||||
)
|
||||
model: str | None = Field(
|
||||
default=None,
|
||||
description="Specific model to use (defaults to graph default)"
|
||||
)
|
||||
system_prompt: str | None = Field(default=None, description="System prompt for LLM nodes")
|
||||
tools: list[str] = Field(default_factory=list, description="Tool names this node can use")
|
||||
model: str | None = Field(default=None, description="Specific model to use (defaults to graph default)")
|
||||
|
||||
# For function nodes
|
||||
function: str | None = Field(
|
||||
default=None,
|
||||
description="Function name or path for function nodes"
|
||||
)
|
||||
function: str | None = Field(default=None, description="Function name or path for function nodes")
|
||||
|
||||
# For router nodes
|
||||
routes: dict[str, str] = Field(
|
||||
default_factory=dict,
|
||||
description="Condition -> target_node_id mapping for routers"
|
||||
)
|
||||
routes: dict[str, str] = Field(default_factory=dict, description="Condition -> target_node_id mapping for routers")
|
||||
|
||||
# Retry behavior
|
||||
max_retries: int = Field(default=3)
|
||||
retry_on: list[str] = Field(
|
||||
default_factory=list,
|
||||
description="Error types to retry on"
|
||||
)
|
||||
retry_on: list[str] = Field(default_factory=list, description="Error types to retry on")
|
||||
|
||||
model_config = {"extra": "allow"}
|
||||
|
||||
@@ -112,6 +90,7 @@ class SharedMemory:
|
||||
Nodes read and write to shared memory using typed keys.
|
||||
The memory is scoped to a single run.
|
||||
"""
|
||||
|
||||
_data: dict[str, Any] = field(default_factory=dict)
|
||||
_allowed_read: set[str] = field(default_factory=set)
|
||||
_allowed_write: set[str] = field(default_factory=set)
|
||||
@@ -159,6 +138,7 @@ class NodeContext:
|
||||
- Access to tools (for actions)
|
||||
- The goal context (for guidance)
|
||||
"""
|
||||
|
||||
# Core runtime
|
||||
runtime: Runtime
|
||||
|
||||
@@ -194,6 +174,7 @@ class NodeResult:
|
||||
- State changes made
|
||||
- Route decision (for routers)
|
||||
"""
|
||||
|
||||
success: bool
|
||||
output: dict[str, Any] = field(default_factory=dict)
|
||||
error: str | None = None
|
||||
@@ -221,6 +202,7 @@ class NodeResult:
|
||||
|
||||
# Use Haiku to generate intelligent summary
|
||||
import os
|
||||
|
||||
api_key = os.environ.get("ANTHROPIC_API_KEY")
|
||||
|
||||
if not api_key:
|
||||
@@ -235,25 +217,28 @@ class NodeResult:
|
||||
|
||||
# Use Haiku to generate intelligent summary
|
||||
try:
|
||||
import anthropic
|
||||
import json
|
||||
|
||||
import anthropic
|
||||
|
||||
node_context = ""
|
||||
if node_spec:
|
||||
node_context = f"\nNode: {node_spec.name}\nPurpose: {node_spec.description}"
|
||||
|
||||
prompt = f"""Generate a 1-2 sentence human-readable summary of what this node produced.{node_context}
|
||||
|
||||
Node output:
|
||||
{json.dumps(self.output, indent=2, default=str)[:2000]}
|
||||
|
||||
Provide a concise, clear summary that a human can quickly understand. Focus on the key information produced."""
|
||||
output_json = json.dumps(self.output, indent=2, default=str)[:2000]
|
||||
prompt = (
|
||||
"Generate a 1-2 sentence human-readable summary of what this "
|
||||
f"node produced.{node_context}\n\n"
|
||||
f"Node output:\n{output_json}\n\n"
|
||||
"Provide a concise, clear summary that a human can quickly "
|
||||
"understand. Focus on the key information produced."
|
||||
)
|
||||
|
||||
client = anthropic.Anthropic(api_key=api_key)
|
||||
message = client.messages.create(
|
||||
model="claude-3-5-haiku-20241022",
|
||||
max_tokens=200,
|
||||
messages=[{"role": "user", "content": prompt}]
|
||||
messages=[{"role": "user", "content": prompt}],
|
||||
)
|
||||
|
||||
summary = message.content[0].text.strip()
|
||||
@@ -381,17 +366,21 @@ class LLMNode(NodeProtocol):
|
||||
|
||||
# Log the LLM call details
|
||||
logger.info(" 🤖 LLM Call:")
|
||||
logger.info(f" System: {system[:150]}..." if len(system) > 150 else f" System: {system}")
|
||||
logger.info(f" User message: {messages[-1]['content'][:150]}..." if len(messages[-1]['content']) > 150 else f" User message: {messages[-1]['content']}")
|
||||
sys_preview = system[:150] + "..." if len(system) > 150 else system
|
||||
logger.info(f" System: {sys_preview}")
|
||||
user_msg = messages[-1]["content"]
|
||||
msg_preview = user_msg[:150] + "..." if len(user_msg) > 150 else user_msg
|
||||
logger.info(f" User message: {msg_preview}")
|
||||
if ctx.available_tools:
|
||||
logger.info(f" Tools available: {[t.name for t in ctx.available_tools]}")
|
||||
|
||||
# Call LLM
|
||||
if ctx.available_tools and self.tool_executor:
|
||||
from framework.llm.provider import ToolUse, ToolResult
|
||||
from framework.llm.provider import ToolResult, ToolUse
|
||||
|
||||
def executor(tool_use: ToolUse) -> ToolResult:
|
||||
logger.info(f" 🔧 Tool call: {tool_use.name}({', '.join(f'{k}={v}' for k, v in tool_use.input.items())})")
|
||||
args_str = ", ".join(f"{k}={v}" for k, v in tool_use.input.items())
|
||||
logger.info(f" 🔧 Tool call: {tool_use.name}({args_str})")
|
||||
result = self.tool_executor(tool_use)
|
||||
# Truncate long results
|
||||
result_str = str(result.content)[:150]
|
||||
@@ -413,9 +402,10 @@ class LLMNode(NodeProtocol):
|
||||
)
|
||||
|
||||
# Log the response
|
||||
response_preview = response.content[:200] if len(response.content) > 200 else response.content
|
||||
if len(response.content) > 200:
|
||||
response_preview += "..."
|
||||
response_preview = response.content[:200] + "..."
|
||||
else:
|
||||
response_preview = response.content
|
||||
logger.info(f" ← Response: {response_preview}")
|
||||
|
||||
latency_ms = int((time.time() - start) * 1000)
|
||||
@@ -432,7 +422,9 @@ class LLMNode(NodeProtocol):
        output = self._parse_output(response.content, ctx.node_spec)

        # For llm_generate and llm_tool_use nodes, try to parse JSON and extract fields
        if ctx.node_spec.node_type in ("llm_generate", "llm_tool_use") and len(ctx.node_spec.output_keys) > 1:
        is_llm_node = ctx.node_spec.node_type in ("llm_generate", "llm_tool_use")
        has_multiple_outputs = len(ctx.node_spec.output_keys) > 1
        if is_llm_node and has_multiple_outputs:
            try:
                import json

@@ -446,7 +438,8 @@ class LLMNode(NodeProtocol):
                        ctx.memory.write(key, parsed[key])
                        output[key] = parsed[key]
                    elif key in ctx.input_data:
                        # Key not in parsed JSON but exists in input - pass through input value
                        # Key not in parsed JSON but exists in input
                        # Pass through input value
                        ctx.memory.write(key, ctx.input_data[key])
                        output[key] = ctx.input_data[key]
                    else:
@@ -508,7 +501,7 @@ class LLMNode(NodeProtocol):
        content = raw_response.strip()
        # Remove markdown code blocks if present
        if content.startswith("```"):
            match = re.search(r'```(?:json)?\s*\n?(.*?)\n?```', content, re.DOTALL)
            match = re.search(r"```(?:json)?\s*\n?(.*?)\n?```", content, re.DOTALL)
            if match:
                content = match.group(1).strip()

@@ -520,15 +513,16 @@ class LLMNode(NodeProtocol):

        # JSON parse failed - use Haiku to extract clean JSON
        import os

        api_key = os.environ.get("ANTHROPIC_API_KEY")
        if not api_key:
            # No API key, try one more simple extraction
            try:
                # Find first { and last }
                start = raw_response.find('{')
                end = raw_response.rfind('}')
                start = raw_response.find("{")
                end = raw_response.rfind("}")
                if start != -1 and end != -1:
                    json_str = raw_response[start:end+1]
                    json_str = raw_response[start : end + 1]
                    return json.loads(json_str)
            except (ValueError, json.JSONDecodeError):
                pass
@@ -536,31 +530,31 @@

        # Use Haiku to clean the response
        from framework.llm.anthropic import AnthropicProvider

        haiku = AnthropicProvider(model="claude-3-5-haiku-20241022")

        prompt = f"""Extract the JSON object from this LLM response. Extract ONLY the values that the LLM actually generated.

Expected output keys: {output_keys}

LLM Response:
{raw_response}

IMPORTANT:
- Only extract keys that the LLM explicitly output in its response
- Do NOT include keys that were just mentioned or passed through from input
- If the LLM output multiple pieces of text/JSON, extract the LAST JSON object only
- Output ONLY valid JSON with no extra text, no markdown, no explanations"""
        prompt = (
            "Extract the JSON object from this LLM response. "
            "Extract ONLY the values that the LLM actually generated.\n\n"
            f"Expected output keys: {output_keys}\n\n"
            f"LLM Response:\n{raw_response}\n\n"
            "IMPORTANT:\n"
            "- Only extract keys that the LLM explicitly output in its response\n"
            "- Do NOT include keys that were just mentioned or passed through from input\n"
            "- If the LLM output multiple pieces of text/JSON, extract the LAST JSON object only\n"
            "- Output ONLY valid JSON with no extra text, no markdown, no explanations"
        )

        try:
            result = haiku.complete(
                messages=[{"role": "user", "content": prompt}],
                system="You extract clean JSON from messy responses. Output only valid JSON, nothing else.",
                system=("You extract clean JSON from messy responses. Output only valid JSON, nothing else."),
            )

            cleaned = result.content.strip()
            # Remove markdown if Haiku added it
            if cleaned.startswith("```"):
                match = re.search(r'```(?:json)?\s*\n?(.*?)\n?```', cleaned, re.DOTALL)
                match = re.search(r"```(?:json)?\s*\n?(.*?)\n?```", cleaned, re.DOTALL)
                if match:
                    cleaned = match.group(1).strip()
@@ -598,6 +592,7 @@ IMPORTANT:

        # Use Haiku to intelligently extract relevant data
        import os

        api_key = os.environ.get("ANTHROPIC_API_KEY")
        if not api_key:
            # Fallback to simple formatting if no API key
@@ -610,24 +605,27 @@ IMPORTANT:

        # Build prompt for Haiku to extract clean values
        import json
        prompt = f"""Extract the following information from the memory context:

Required fields: {', '.join(ctx.node_spec.input_keys)}

Memory context (may contain nested data, JSON strings, or extra information):
{json.dumps(memory_data, indent=2, default=str)[:3000]}

Extract ONLY the clean values for the required fields. Ignore nested structures, JSON wrappers, and irrelevant data.

Output as JSON with the exact field names requested."""
        fields = ", ".join(ctx.node_spec.input_keys)
        memory_json = json.dumps(memory_data, indent=2, default=str)[:3000]
        prompt = (
            "Extract the following information from the memory context:\n\n"
            f"Required fields: {fields}\n\n"
            "Memory context (may contain nested data, JSON strings, "
            f"or extra information):\n{memory_json}\n\n"
            "Extract ONLY the clean values for the required fields. "
            "Ignore nested structures, JSON wrappers, and irrelevant data.\n\n"
            "Output as JSON with the exact field names requested."
        )

        try:
            import anthropic

            client = anthropic.Anthropic(api_key=api_key)
            message = client.messages.create(
                model="claude-3-5-haiku-20241022",
                max_tokens=1000,
                messages=[{"role": "user", "content": prompt}]
                messages=[{"role": "user", "content": prompt}],
            )

            # Parse Haiku's response
@@ -635,7 +633,8 @@ Output as JSON with the exact field names requested."""

            # Try to extract JSON
            import re
            json_match = re.search(r'\{[^{}]*\}', response_text, re.DOTALL)

            json_match = re.search(r"\{[^{}]*\}", response_text, re.DOTALL)
            if json_match:
                extracted = json.loads(json_match.group())
                # Format as key: value pairs
@@ -708,11 +707,13 @@ class RouterNode(NodeProtocol):
        # Build options from routes
        options = []
        for condition, target in ctx.node_spec.routes.items():
            options.append({
                "id": condition,
                "description": f"Route to {target} when condition '{condition}' is met",
                "target": target,
            })
            options.append(
                {
                    "id": condition,
                    "description": f"Route to {target} when condition '{condition}' is met",
                    "target": target,
                }
            )

        # Check if we should use LLM-based routing
        if ctx.node_spec.system_prompt and ctx.llm:
@@ -765,10 +766,9 @@ class RouterNode(NodeProtocol):
        import json

        # Build routing options description
        options_desc = "\n".join([
            f"- {opt['id']}: {opt['description']} → goes to '{opt['target']}'"
            for opt in options
        ])
        options_desc = "\n".join(
            [f"- {opt['id']}: {opt['description']} → goes to '{opt['target']}'" for opt in options]
        )

        # Build context
        context_data = {
@@ -795,15 +795,17 @@ Respond with ONLY a JSON object:
        logger.info(" 🤔 Router using LLM to choose path...")

        try:
            default_system = "You are a routing agent. Respond with JSON only."
            response = ctx.llm.complete(
                messages=[{"role": "user", "content": prompt}],
                system=ctx.node_spec.system_prompt or "You are a routing agent. Respond with JSON only.",
                system=ctx.node_spec.system_prompt or default_system,
                max_tokens=150,
            )

            # Parse response
            import re
            json_match = re.search(r'\{[^{}]*\}', response.content, re.DOTALL)

            json_match = re.search(r"\{[^{}]*\}", response.content, re.DOTALL)
            if json_match:
                data = json.loads(json_match.group())
                chosen = data.get("chosen", "default")
@@ -813,7 +815,8 @@ Respond with ONLY a JSON object:
                logger.info(f" Reason: {reasoning}")

                # Find the target for this choice
                target = ctx.node_spec.routes.get(chosen, ctx.node_spec.routes.get("default", "end"))
                default_target = ctx.node_spec.routes.get("default", "end")
                target = ctx.node_spec.routes.get(chosen, default_target)
                return (chosen, target)

        except Exception as e:
@@ -864,10 +867,12 @@ class FunctionNode(NodeProtocol):

        decision_id = ctx.runtime.decide(
            intent=f"Execute function {ctx.node_spec.function or 'unknown'}",
            options=[{
                "id": "execute",
                "description": f"Run function with inputs: {list(ctx.input_data.keys())}",
            }],
            options=[
                {
                    "id": "execute",
                    "description": f"Run function with inputs: {list(ctx.input_data.keys())}",
                }
            ],
            chosen="execute",
            reasoning="Deterministic function execution",
        )
@@ -10,24 +10,26 @@ The Plan is the contract between the external planner and the executor:
- If replanning needed, returns feedback to external planner
"""

from typing import Any
from enum import Enum
from datetime import datetime
from enum import Enum
from typing import Any

from pydantic import BaseModel, Field


class ActionType(str, Enum):
    """Types of actions a PlanStep can perform."""
    LLM_CALL = "llm_call" # Call LLM for generation
    TOOL_USE = "tool_use" # Use a registered tool
    SUB_GRAPH = "sub_graph" # Execute a sub-graph
    FUNCTION = "function" # Call a Python function

    LLM_CALL = "llm_call"  # Call LLM for generation
    TOOL_USE = "tool_use"  # Use a registered tool
    SUB_GRAPH = "sub_graph"  # Execute a sub-graph
    FUNCTION = "function"  # Call a Python function
    CODE_EXECUTION = "code_execution"  # Execute dynamic code (sandboxed)


class StepStatus(str, Enum):
    """Status of a plan step."""

    PENDING = "pending"
    AWAITING_APPROVAL = "awaiting_approval"  # Waiting for human approval
    IN_PROGRESS = "in_progress"
@@ -39,14 +41,16 @@ class StepStatus(str, Enum):

class ApprovalDecision(str, Enum):
    """Human decision on a step requiring approval."""
    APPROVE = "approve" # Execute as planned
    REJECT = "reject" # Skip this step
    MODIFY = "modify" # Execute with modifications
    ABORT = "abort" # Stop entire execution

    APPROVE = "approve"  # Execute as planned
    REJECT = "reject"  # Skip this step
    MODIFY = "modify"  # Execute with modifications
    ABORT = "abort"  # Stop entire execution


class ApprovalRequest(BaseModel):
    """Request for human approval before executing a step."""

    step_id: str
    step_description: str
    action_type: str
@@ -62,6 +66,7 @@ class ApprovalRequest(BaseModel):

class ApprovalResult(BaseModel):
    """Result of human approval decision."""

    decision: ApprovalDecision
    reason: str | None = None
    modifications: dict[str, Any] = Field(default_factory=dict)
@@ -71,10 +76,11 @@ class ApprovalResult(BaseModel):

class JudgmentAction(str, Enum):
    """Actions the judge can take after evaluating a step."""
    ACCEPT = "accept" # Step completed successfully, continue
    RETRY = "retry" # Retry the step with feedback
    REPLAN = "replan" # Return to external planner for new plan
    ESCALATE = "escalate" # Request human intervention

    ACCEPT = "accept"  # Step completed successfully, continue
    RETRY = "retry"  # Retry the step with feedback
    REPLAN = "replan"  # Return to external planner for new plan
    ESCALATE = "escalate"  # Request human intervention


class ActionSpec(BaseModel):
@@ -83,6 +89,7 @@ class ActionSpec(BaseModel):

    This is the "what to do" part of a PlanStep.
    """

    action_type: ActionType

    # For LLM_CALL
@@ -114,6 +121,7 @@ class PlanStep(BaseModel):

    Created by external planner, executed by Worker, evaluated by Judge.
    """

    id: str
    description: str
    action: ActionSpec
@@ -121,28 +129,16 @@ class PlanStep(BaseModel):
    # Data flow
    inputs: dict[str, Any] = Field(
        default_factory=dict,
        description="Input data for this step (can reference previous step outputs)"
    )
    expected_outputs: list[str] = Field(
        default_factory=list,
        description="Keys this step should produce"
        description="Input data for this step (can reference previous step outputs)",
    )
    expected_outputs: list[str] = Field(default_factory=list, description="Keys this step should produce")

    # Dependencies
    dependencies: list[str] = Field(
        default_factory=list,
        description="IDs of steps that must complete before this one"
    )
    dependencies: list[str] = Field(default_factory=list, description="IDs of steps that must complete before this one")

    # Human-in-the-loop (HITL)
    requires_approval: bool = Field(
        default=False,
        description="If True, requires human approval before execution"
    )
    approval_message: str | None = Field(
        default=None,
        description="Message to show human when requesting approval"
    )
    requires_approval: bool = Field(default=False, description="If True, requires human approval before execution")
    approval_message: str | None = Field(default=None, description="Message to show human when requesting approval")

    # Execution state
    status: StepStatus = StepStatus.PENDING
@@ -170,6 +166,7 @@ class Judgment(BaseModel):

    The Judge evaluates step results and decides what to do next.
    """

    action: JudgmentAction
    reasoning: str
    feedback: str | None = None # For retry/replan - what went wrong
@@ -193,6 +190,7 @@ class EvaluationRule(BaseModel):

    Rules are checked before falling back to LLM evaluation.
    """

    id: str
    description: str

@@ -216,6 +214,7 @@ class Plan(BaseModel):
    Created by external planner (Claude Code, etc).
    Executed by FlexibleGraphExecutor.
    """

    id: str
    goal_id: str
    description: str
@@ -361,12 +360,13 @@ class Plan(BaseModel):

class ExecutionStatus(str, Enum):
    """Status of plan execution."""

    COMPLETED = "completed"
    AWAITING_APPROVAL = "awaiting_approval"  # Paused for human approval
    NEEDS_REPLAN = "needs_replan"
    NEEDS_ESCALATION = "needs_escalation"
    REJECTED = "rejected"  # Human rejected a step
    ABORTED = "aborted" # Human aborted execution
    ABORTED = "aborted"  # Human aborted execution
    FAILED = "failed"


@@ -376,6 +376,7 @@ class PlanExecutionResult(BaseModel):

    Returned to external planner with status and feedback.
    """

    status: ExecutionStatus

    # Results from completed steps
@@ -421,6 +422,7 @@ def load_export(data: str | dict) -> tuple["Plan", Any]:
        result = await executor.execute_plan(plan, goal, context)
    """
    import json as json_module

    from framework.graph.goal import Goal

    if isinstance(data, str):
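Aside (not part of the changeset): a minimal sketch of a step built against the schema above. The field names and defaults come from the model definitions in this diff; the values, step ids, and the module path framework.graph.plan are illustrative assumptions.

from framework.graph.plan import ActionSpec, ActionType, PlanStep, StepStatus

# Hypothetical step: every field name below appears in the models above.
step = PlanStep(
    id="step-1",
    description="Summarize research findings",
    action=ActionSpec(action_type=ActionType.LLM_CALL),
    inputs={"findings": "step-0.results"},  # can reference previous step outputs
    expected_outputs=["summary"],
    dependencies=["step-0"],
    requires_approval=True,
    approval_message="Review the summary step before it runs",
)
assert step.status == StepStatus.PENDING  # default execution state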
@@ -10,20 +10,21 @@ appropriate executor based on action type:
- Code execution (sandboxed)
"""

from typing import Any, Callable
from dataclasses import dataclass, field
import time
import json
import re
import time
from collections.abc import Callable
from dataclasses import dataclass, field
from typing import Any

from framework.graph.code_sandbox import CodeSandbox
from framework.graph.plan import (
    PlanStep,
    ActionSpec,
    ActionType,
    PlanStep,
)
from framework.graph.code_sandbox import CodeSandbox
from framework.runtime.core import Runtime
from framework.llm.provider import LLMProvider, Tool
from framework.runtime.core import Runtime


def parse_llm_json_response(text: str) -> tuple[Any | None, str]:
@@ -50,7 +51,7 @@ def parse_llm_json_response(text: str) -> tuple[Any | None, str]:

    # Try to extract JSON from markdown code blocks
    # Pattern: ```json ... ``` or ``` ... ```
    code_block_pattern = r'```(?:json)?\s*([\s\S]*?)\s*```'
    code_block_pattern = r"```(?:json)?\s*([\s\S]*?)\s*```"
    matches = re.findall(code_block_pattern, cleaned)

    if matches:
@@ -70,7 +71,7 @@ def parse_llm_json_response(text: str) -> tuple[Any | None, str]:
        pass

    # Try to find JSON-like content (starts with { or [)
    json_start_pattern = r'(\{[\s\S]*\}|\[[\s\S]*\])'
    json_start_pattern = r"(\{[\s\S]*\}|\[[\s\S]*\])"
    json_matches = re.findall(json_start_pattern, cleaned)

    for match in json_matches:
@@ -87,6 +88,7 @@ def parse_llm_json_response(text: str) -> tuple[Any | None, str]:
@dataclass
class StepExecutionResult:
    """Result of executing a plan step."""

    success: bool
    outputs: dict[str, Any] = field(default_factory=dict)
    error: str | None = None
@@ -160,11 +162,13 @@ class WorkerNode:
        # Record decision
        decision_id = self.runtime.decide(
            intent=f"Execute plan step: {step.description}",
            options=[{
                "id": step.action.action_type.value,
                "description": f"Execute {step.action.action_type.value} action",
                "action_type": step.action.action_type.value,
            }],
            options=[
                {
                    "id": step.action.action_type.value,
                    "description": f"Execute {step.action.action_type.value} action",
                    "action_type": step.action.action_type.value,
                }
            ],
            chosen=step.action.action_type.value,
            reasoning=f"Step requires {step.action.action_type.value}",
            context={"step_id": step.id, "inputs": step.inputs},
@@ -414,6 +418,7 @@ class WorkerNode:
        try:
            # Execute tool via formal executor
            from framework.llm.provider import ToolUse

            tool_use = ToolUse(
                id=f"step_{tool_name}",
                name=tool_name,
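Aside (not part of the changeset): a sketch of how the parse_llm_json_response helper above might be called. The signature tuple[Any | None, str] comes from the diff; the module path and the meaning of the second tuple element (assumed here to be an error/diagnostic string) are assumptions.

from framework.graph.worker import parse_llm_json_response  # module path assumed

raw = 'Here you go:\n```json\n{"summary": "3 results found"}\n```'
parsed, error = parse_llm_json_response(raw)
if parsed is not None:
    print(parsed["summary"])  # fenced block stripped, JSON decoded
else:
    print(f"no JSON recovered: {error}")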
@@ -1,7 +1,7 @@
"""LLM provider abstraction."""

from framework.llm.provider import LLMProvider, LLMResponse
from framework.llm.anthropic import AnthropicProvider
from framework.llm.litellm import LiteLLMProvider
from framework.llm.provider import LLMProvider, LLMResponse

__all__ = ["LLMProvider", "LLMResponse", "AnthropicProvider", "LiteLLMProvider"]
@@ -3,8 +3,8 @@
import os
from typing import Any

from framework.llm.provider import LLMProvider, LLMResponse, Tool
from framework.llm.litellm import LiteLLMProvider
from framework.llm.provider import LLMProvider, LLMResponse, Tool


def _get_api_key_from_credential_manager() -> str | None:
@@ -50,12 +50,10 @@ class AnthropicProvider(LLMProvider):
        # Delegate to LiteLLMProvider internally.
        self.api_key = api_key or _get_api_key_from_credential_manager()
        if not self.api_key:
            raise ValueError(
                "Anthropic API key required. Set ANTHROPIC_API_KEY env var or pass api_key."
            )
            raise ValueError("Anthropic API key required. Set ANTHROPIC_API_KEY env var or pass api_key.")

        self.model = model


        self._provider = LiteLLMProvider(
            model=model,
            api_key=self.api_key,
@@ -183,21 +183,23 @@ class LiteLLMProvider(LLMProvider):

            # Process tool calls.
            # Add assistant message with tool calls.
            current_messages.append({
                "role": "assistant",
                "content": message.content,
                "tool_calls": [
                    {
                        "id": tc.id,
                        "type": "function",
                        "function": {
                            "name": tc.function.name,
                            "arguments": tc.function.arguments,
                        },
                    }
                    for tc in message.tool_calls
                ],
            })
            current_messages.append(
                {
                    "role": "assistant",
                    "content": message.content,
                    "tool_calls": [
                        {
                            "id": tc.id,
                            "type": "function",
                            "function": {
                                "name": tc.function.name,
                                "arguments": tc.function.arguments,
                            },
                        }
                        for tc in message.tool_calls
                    ],
                }
            )

            # Execute tools and add results.
            for tool_call in message.tool_calls:
@@ -216,11 +218,13 @@ class LiteLLMProvider(LLMProvider):
                result = tool_executor(tool_use)

                # Add tool result message
                current_messages.append({
                    "role": "tool",
                    "tool_call_id": result.tool_use_id,
                    "content": result.content,
                })
                current_messages.append(
                    {
                        "role": "tool",
                        "tool_call_id": result.tool_use_id,
                        "content": result.content,
                    }
                )

        # Max iterations reached
        return LLMResponse(
@@ -8,6 +8,7 @@ from typing import Any
@dataclass
class LLMResponse:
    """Response from an LLM call."""

    content: str
    model: str
    input_tokens: int = 0
@@ -19,6 +20,7 @@ class LLMResponse:
@dataclass
class Tool:
    """A tool the LLM can use."""

    name: str
    description: str
    parameters: dict[str, Any] = field(default_factory=dict)
@@ -27,6 +29,7 @@ class Tool:
@dataclass
class ToolUse:
    """A tool call requested by the LLM."""

    id: str
    name: str
    input: dict[str, Any]
@@ -35,6 +38,7 @@ class ToolUse:
@dataclass
class ToolResult:
    """Result of executing a tool."""

    tool_use_id: str
    content: str
    is_error: bool = False

File diff suppressed because it is too large
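Aside (not part of the changeset): the dataclasses above suggest a tool-executor callback of the following shape; this sketch mirrors the loop in LiteLLMProvider that appends "tool" messages keyed by tool_use_id. The web_search tool itself is hypothetical.

from framework.llm.provider import Tool, ToolResult, ToolUse

search_tool = Tool(
    name="web_search",  # hypothetical tool
    description="Search the web for a query",
    parameters={"type": "object", "properties": {"query": {"type": "string"}}},
)

def tool_executor(tool_use: ToolUse) -> ToolResult:
    # A real executor would dispatch on tool_use.name and run the tool.
    if tool_use.name == "web_search":
        return ToolResult(tool_use_id=tool_use.id, content="3 results found")
    return ToolResult(tool_use_id=tool_use.id, content=f"unknown tool: {tool_use.name}", is_error=True)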
@@ -1,15 +1,15 @@
"""Agent Runner - load and run exported agents."""

from framework.runner.runner import AgentRunner, AgentInfo, ValidationResult
from framework.runner.tool_registry import ToolRegistry, tool
from framework.runner.orchestrator import AgentOrchestrator
from framework.runner.protocol import (
    AgentMessage,
    MessageType,
    CapabilityLevel,
    CapabilityResponse,
    MessageType,
    OrchestratorResult,
)
from framework.runner.runner import AgentInfo, AgentRunner, ValidationResult
from framework.runner.tool_registry import ToolRegistry, tool

__all__ = [
    # Single agent
+102
-57
@@ -22,12 +22,14 @@ def register_commands(subparsers: argparse._SubParsersAction) -> None:
        help="Path to agent folder (containing agent.json)",
    )
    run_parser.add_argument(
        "--input", "-i",
        "--input",
        "-i",
        type=str,
        help="Input context as JSON string",
    )
    run_parser.add_argument(
        "--input-file", "-f",
        "--input-file",
        "-f",
        type=str,
        help="Input context from JSON file",
    )
@@ -37,17 +39,20 @@ def register_commands(subparsers: argparse._SubParsersAction) -> None:
        help="Run in mock mode (no real LLM calls)",
    )
    run_parser.add_argument(
        "--output", "-o",
        "--output",
        "-o",
        type=str,
        help="Write results to file instead of stdout",
    )
    run_parser.add_argument(
        "--quiet", "-q",
        "--quiet",
        "-q",
        action="store_true",
        help="Only output the final result JSON",
    )
    run_parser.add_argument(
        "--verbose", "-v",
        "--verbose",
        "-v",
        action="store_true",
        help="Show detailed execution logs (steps, LLM calls, etc.)",
    )
@@ -113,7 +118,8 @@ def register_commands(subparsers: argparse._SubParsersAction) -> None:
        help="Directory containing agent folders (default: exports)",
    )
    dispatch_parser.add_argument(
        "--input", "-i",
        "--input",
        "-i",
        type=str,
        required=True,
        help="Input context as JSON string",
@@ -124,13 +130,15 @@ def register_commands(subparsers: argparse._SubParsersAction) -> None:
        help="Description of what you want to accomplish",
    )
    dispatch_parser.add_argument(
        "--agents", "-a",
        "--agents",
        "-a",
        type=str,
        nargs="+",
        help="Specific agent names to use (default: all in directory)",
    )
    dispatch_parser.add_argument(
        "--quiet", "-q",
        "--quiet",
        "-q",
        action="store_true",
        help="Only output the final result JSON",
    )
@@ -170,15 +178,16 @@ def register_commands(subparsers: argparse._SubParsersAction) -> None:
def cmd_run(args: argparse.Namespace) -> int:
    """Run an exported agent."""
    import logging

    from framework.runner import AgentRunner

    # Set logging level (quiet by default for cleaner output)
    if args.quiet:
        logging.basicConfig(level=logging.ERROR, format='%(message)s')
    elif getattr(args, 'verbose', False):
        logging.basicConfig(level=logging.INFO, format='%(message)s')
        logging.basicConfig(level=logging.ERROR, format="%(message)s")
    elif getattr(args, "verbose", False):
        logging.basicConfig(level=logging.INFO, format="%(message)s")
    else:
        logging.basicConfig(level=logging.WARNING, format='%(message)s')
        logging.basicConfig(level=logging.WARNING, format="%(message)s")

    # Load input context
    context = {}
@@ -211,6 +220,7 @@ def cmd_run(args: argparse.Namespace) -> int:
    entry_input_keys = runner.graph.nodes[0].input_keys if runner.graph.nodes else []
    if "user_id" in entry_input_keys and context.get("user_id") is None:
        import os

        context["user_id"] = os.environ.get("USER", "default_user")

    if not args.quiet:
@@ -279,7 +289,13 @@ def cmd_run(args: argparse.Namespace) -> int:
    # If no meaningful key found, show all non-internal keys
    if not shown:
        for key, value in result.output.items():
            if not key.startswith("_") and key not in ["user_id", "request", "memory_loaded", "user_profile", "recent_context"]:
            if not key.startswith("_") and key not in [
                "user_id",
                "request",
                "memory_loaded",
                "user_profile",
                "recent_context",
            ]:
                if isinstance(value, (dict, list)):
                    print(f"\n{key}:")
                    value_str = json.dumps(value, indent=2, default=str)
@@ -311,19 +327,24 @@ def cmd_info(args: argparse.Namespace) -> int:
    info = runner.info()

    if args.json:
        print(json.dumps({
            "name": info.name,
            "description": info.description,
            "goal_name": info.goal_name,
            "goal_description": info.goal_description,
            "node_count": info.node_count,
            "nodes": info.nodes,
            "edges": info.edges,
            "success_criteria": info.success_criteria,
            "constraints": info.constraints,
            "required_tools": info.required_tools,
            "has_tools_module": info.has_tools_module,
        }, indent=2))
        print(
            json.dumps(
                {
                    "name": info.name,
                    "description": info.description,
                    "goal_name": info.goal_name,
                    "goal_description": info.goal_description,
                    "node_count": info.node_count,
                    "nodes": info.nodes,
                    "edges": info.edges,
                    "success_criteria": info.success_criteria,
                    "constraints": info.constraints,
                    "required_tools": info.required_tools,
                    "has_tools_module": info.has_tools_module,
                },
                indent=2,
            )
        )
    else:
        print(f"Agent: {info.name}")
        print(f"Description: {info.description}")
@@ -333,8 +354,8 @@ def cmd_info(args: argparse.Namespace) -> int:
        print()
        print(f"Nodes ({info.node_count}):")
        for node in info.nodes:
            inputs = f" [in: {', '.join(node['input_keys'])}]" if node.get('input_keys') else ""
            outputs = f" [out: {', '.join(node['output_keys'])}]" if node.get('output_keys') else ""
            inputs = f" [in: {', '.join(node['input_keys'])}]" if node.get("input_keys") else ""
            outputs = f" [out: {', '.join(node['output_keys'])}]" if node.get("output_keys") else ""
            print(f" - {node['id']}: {node['name']}{inputs}{outputs}")
        print()
        print(f"Success Criteria ({len(info.success_criteria)}):")
@@ -405,19 +426,25 @@ def cmd_list(args: argparse.Namespace) -> int:
        try:
            runner = AgentRunner.load(path)
            info = runner.info()
            agents.append({
                "path": str(path),
                "name": info.name,
                "description": info.description[:60] + "..." if len(info.description) > 60 else info.description,
                "nodes": info.node_count,
                "tools": len(info.required_tools),
            })
            agents.append(
                {
                    "path": str(path),
                    "name": info.name,
                    "description": info.description[:60] + "..."
                    if len(info.description) > 60
                    else info.description,
                    "nodes": info.node_count,
                    "tools": len(info.required_tools),
                }
            )
            runner.cleanup()
        except Exception as e:
            agents.append({
                "path": str(path),
                "error": str(e),
            })
            agents.append(
                {
                    "path": str(path),
                    "error": str(e),
                }
            )

    if not agents:
        print(f"No agents found in {directory}")
@@ -540,7 +567,7 @@ def cmd_dispatch(args: argparse.Namespace) -> int:

def _interactive_approval(request):
    """Interactive approval callback for HITL mode."""
    from framework.graph import ApprovalResult, ApprovalDecision
    from framework.graph import ApprovalDecision, ApprovalResult

    print()
    print("=" * 60)
@@ -561,6 +588,7 @@ def _interactive_approval(request):
        print(f"\n[{key}]:")
        if isinstance(value, (dict, list)):
            import json

            value_str = json.dumps(value, indent=2, default=str)
            # Show more content for approval - up to 2000 chars
            if len(value_str) > 2000:
@@ -605,11 +633,14 @@ def _interactive_approval(request):
            print("Invalid choice. Please enter a, r, s, or x.")


def _format_natural_language_to_json(user_input: str, input_keys: list[str], agent_description: str, session_context: dict = None) -> dict:
def _format_natural_language_to_json(
    user_input: str, input_keys: list[str], agent_description: str, session_context: dict = None
) -> dict:
    """Use Haiku to convert natural language input to JSON based on agent's input schema."""
    import anthropic
    import os

    import anthropic

    client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY"))

    # Build prompt for Haiku
@@ -619,17 +650,26 @@ def _format_natural_language_to_json(user_input: str, input_keys: list[str], age
        main_field = input_keys[0] if input_keys else "objective"
        existing_value = session_context.get(main_field, "")

        session_info = f"\n\nExisting {main_field}: \"{existing_value}\"\n\nThe user is providing ADDITIONAL information. Append this new information to the existing {main_field} to create an enriched, more detailed version."
        session_info = (
            f'\n\nExisting {main_field}: "{existing_value}"\n\n'
            "The user is providing ADDITIONAL information. Append this new information "
            f"to the existing {main_field} to create an enriched, more detailed version."
        )

    prompt = f"""You are formatting user input for an agent that requires specific input fields.

Agent: {agent_description}

Required input fields: {', '.join(input_keys)}{session_info}
Required input fields: {", ".join(input_keys)}{session_info}

User input: {user_input}

{"If this is a follow-up message, APPEND the new information to the existing field value to create a more complete, detailed version. Do not create new fields." if session_context else ""}
{
    "If this is a follow-up message, APPEND the new information to the existing field "
    "value to create a more complete, detailed version. Do not create new fields."
    if session_context
    else ""
}

Output ONLY valid JSON, no explanation:"""
@@ -637,7 +677,7 @@ Output ONLY valid JSON, no explanation:"""
    message = client.messages.create(
        model="claude-3-5-haiku-20241022", # Fast and cheap
        max_tokens=500,
        messages=[{"role": "user", "content": prompt}]
        messages=[{"role": "user", "content": prompt}],
    )

    json_str = message.content[0].text.strip()
@@ -661,12 +701,13 @@ Output ONLY valid JSON, no explanation:"""
def cmd_shell(args: argparse.Namespace) -> int:
    """Start an interactive agent session."""
    import logging

    from framework.runner import AgentRunner

    # Configure logging to show runtime visibility
    logging.basicConfig(
        level=logging.INFO,
        format='%(message)s', # Simple format for clean output
        format="%(message)s", # Simple format for clean output
    )

    agents_dir = Path(args.agents_dir)
@@ -690,7 +731,7 @@ def cmd_shell(args: argparse.Namespace) -> int:
        return 1

    # Set up approval callback by default (unless --no-approve is set)
    if not getattr(args, 'no_approve', False):
    if not getattr(args, "no_approve", False):
        runner.set_approval_callback(_interactive_approval)
        print("\n🔔 Human-in-the-loop mode enabled")
        print(" Steps marked for approval will pause for your review")
@@ -748,8 +789,8 @@ def cmd_shell(args: argparse.Namespace) -> int:
        if user_input == "/nodes":
            print("\nAgent nodes:")
            for node in info.nodes:
                inputs = f" [in: {', '.join(node['input_keys'])}]" if node.get('input_keys') else ""
                outputs = f" [out: {', '.join(node['output_keys'])}]" if node.get('output_keys') else ""
                inputs = f" [in: {', '.join(node['input_keys'])}]" if node.get("input_keys") else ""
                outputs = f" [out: {', '.join(node['output_keys'])}]" if node.get("output_keys") else ""
                print(f" {node['id']}: {node['name']}{inputs}{outputs}")
                print(f" {node['description']}")
            print()
@@ -784,7 +825,7 @@ def cmd_shell(args: argparse.Namespace) -> int:
                    user_input,
                    entry_input_keys,
                    info.description,
                    session_context=session_memory
                    session_context=session_memory,
                )
                print(f"✓ Formatted to: {json.dumps(context)}")
            except Exception as e:
@@ -807,6 +848,7 @@ def cmd_shell(args: argparse.Namespace) -> int:
        # Auto-inject user_id if missing (for personal assistant agents)
        if "user_id" in entry_input_keys and run_context.get("user_id") is None:
            import os

            run_context["user_id"] = os.environ.get("USER", "default_user")

        # Add conversation history to context if agent expects it
@@ -872,12 +914,14 @@ def cmd_shell(args: argparse.Namespace) -> int:
                session_memory[key] = value

        # Track conversation history
        conversation_history.append({
            "input": context,
            "output": result.output if result.output else {},
            "status": "success" if result.success else "failed",
            "paused_at": result.paused_at
        })
        conversation_history.append(
            {
                "input": context,
                "output": result.output if result.output else {},
                "status": "success" if result.success else "failed",
                "paused_at": result.paused_at,
            }
        )

        print()
@@ -904,6 +948,7 @@ def _select_agent(agents_dir: Path) -> str | None:
    for i, agent_path in enumerate(agents, 1):
        try:
            from framework.runner import AgentRunner

            runner = AgentRunner.load(agent_path)
            info = runner.info()
            desc = info.description[:50] + "..." if len(info.description) > 50 else info.description
@@ -145,6 +145,7 @@ class MCPClient:

        try:
            import threading

            from mcp import StdioServerParameters

            # Create server parameters
@@ -177,7 +178,10 @@ class MCPClient:

            # Create persistent stdio client context
            self._stdio_context = stdio_client(server_params)
            self._read_stream, self._write_stream = await self._stdio_context.__aenter__()
            (
                self._read_stream,
                self._write_stream,
            ) = await self._stdio_context.__aenter__()

            # Create persistent session
            self._session = ClientSession(self._read_stream, self._write_stream)
@@ -212,7 +216,7 @@ class MCPClient:

            logger.info(f"Connected to MCP server '{self.config.name}' via STDIO (persistent)")
        except Exception as e:
            raise RuntimeError(f"Failed to connect to MCP server: {e}")
            raise RuntimeError(f"Failed to connect to MCP server: {e}") from e

    def _connect_http(self) -> None:
        """Connect to MCP server via HTTP transport."""
@@ -268,11 +272,13 @@ class MCPClient:
        # Convert tools to dict format
        tools_list = []
        for tool in response.tools:
            tools_list.append({
                "name": tool.name,
                "description": tool.description,
                "inputSchema": tool.inputSchema,
            })
            tools_list.append(
                {
                    "name": tool.name,
                    "description": tool.description,
                    "inputSchema": tool.inputSchema,
                }
            )

        return tools_list
@@ -300,7 +306,7 @@ class MCPClient:

            return data.get("result", {}).get("tools", [])
        except Exception as e:
            raise RuntimeError(f"Failed to list tools via HTTP: {e}")
            raise RuntimeError(f"Failed to list tools via HTTP: {e}") from e

    def list_tools(self) -> list[MCPTool]:
        """
@@ -350,9 +356,9 @@ class MCPClient:
            if len(result.content) > 0:
                content_item = result.content[0]
                # Check if it's a text content item
                if hasattr(content_item, 'text'):
                if hasattr(content_item, "text"):
                    return content_item.text
                elif hasattr(content_item, 'data'):
                elif hasattr(content_item, "data"):
                    return content_item.data
            return result.content

@@ -384,7 +390,7 @@ class MCPClient:

            return data.get("result", {}).get("content", [])
        except Exception as e:
            raise RuntimeError(f"Failed to call tool via HTTP: {e}")
            raise RuntimeError(f"Failed to call tool via HTTP: {e}") from e

    def disconnect(self) -> None:
        """Disconnect from the MCP server."""
@@ -72,6 +72,7 @@ class AgentOrchestrator:
        # Auto-create LLM - LiteLLM auto-detects provider and API key from model name
        if self._llm is None:
            from framework.llm.litellm import LiteLLMProvider

            self._llm = LiteLLMProvider(model=self._model)

    def register(
@@ -205,7 +206,7 @@ class AgentOrchestrator:

        responses = await asyncio.gather(*tasks, return_exceptions=True)

        for agent_name, response in zip(routing.selected_agents, responses):
        for agent_name, response in zip(routing.selected_agents, responses, strict=False):
            if isinstance(response, Exception):
                results[agent_name] = {"error": str(response)}
            else:
@@ -326,7 +327,7 @@ class AgentOrchestrator:

        results = await asyncio.gather(*tasks, return_exceptions=True)

        for name, result in zip(agent_names, results):
        for name, result in zip(agent_names, results, strict=False):
            if isinstance(result, Exception):
                responses[name] = AgentMessage(
                    type=MessageType.RESPONSE,
@@ -355,7 +356,7 @@ class AgentOrchestrator:
        results = await asyncio.gather(*tasks, return_exceptions=True)

        capabilities = {}
        for name, result in zip(agent_names, results):
        for name, result in zip(agent_names, results, strict=False):
            if isinstance(result, Exception):
                capabilities[name] = CapabilityResponse(
                    agent_name=name,
@@ -399,11 +400,7 @@ class AgentOrchestrator:
            return await self._llm_route(request, intent, capable)

        # If no capable agents, check uncertain ones
        uncertain = [
            (name, cap)
            for name, cap in capabilities.items()
            if cap.level == CapabilityLevel.UNCERTAIN
        ]
        uncertain = [(name, cap) for name, cap in capabilities.items() if cap.level == CapabilityLevel.UNCERTAIN]
        if uncertain:
            uncertain.sort(key=lambda x: -x[1].confidence)
            return RoutingDecision(
@@ -429,8 +426,7 @@ class AgentOrchestrator:
        """Use LLM to decide routing when multiple agents are capable."""

        agents_info = "\n".join(
            f"- {name}: {cap.reasoning} (confidence: {cap.confidence:.2f})"
            for name, cap in capable
            f"- {name}: {cap.reasoning} (confidence: {cap.confidence:.2f})" for name, cap in capable
        )

        prompt = f"""Multiple agents can handle this request. Decide the best routing.
@@ -463,7 +459,8 @@ Respond with JSON only:
        )

        import re
        json_match = re.search(r'\{[^{}]*\}', response.content, re.DOTALL)

        json_match = re.search(r"\{[^{}]*\}", response.content, re.DOTALL)
        if json_match:
            data = json.loads(json_match.group())
            selected = data.get("selected", [])
@@ -1,10 +1,10 @@
"""Message protocol for multi-agent communication."""

import uuid
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from typing import Any
import uuid


class MessageType(Enum):
@@ -2,20 +2,21 @@

import json
import os
from collections.abc import Callable
from dataclasses import dataclass, field
from pathlib import Path
from typing import TYPE_CHECKING, Callable
from typing import TYPE_CHECKING

from framework.graph import Goal
from framework.graph.edge import GraphSpec, EdgeSpec, EdgeCondition
from framework.graph.edge import EdgeCondition, EdgeSpec, GraphSpec
from framework.graph.executor import ExecutionResult, GraphExecutor
from framework.graph.node import NodeSpec
from framework.graph.executor import GraphExecutor, ExecutionResult
from framework.llm.provider import LLMProvider, Tool
from framework.runner.tool_registry import ToolRegistry
from framework.runtime.core import Runtime

if TYPE_CHECKING:
    from framework.runner.protocol import CapabilityResponse, AgentMessage
    from framework.runner.protocol import AgentMessage, CapabilityResponse


@dataclass
@@ -109,27 +110,31 @@ def load_agent_export(data: str | dict) -> tuple[GraphSpec, Goal]:
    )

    # Build Goal
    from framework.graph.goal import SuccessCriterion, Constraint
    from framework.graph.goal import Constraint, SuccessCriterion

    success_criteria = []
    for sc_data in goal_data.get("success_criteria", []):
        success_criteria.append(SuccessCriterion(
            id=sc_data["id"],
            description=sc_data["description"],
            metric=sc_data.get("metric", ""),
            target=sc_data.get("target", ""),
            weight=sc_data.get("weight", 1.0),
        ))
        success_criteria.append(
            SuccessCriterion(
                id=sc_data["id"],
                description=sc_data["description"],
                metric=sc_data.get("metric", ""),
                target=sc_data.get("target", ""),
                weight=sc_data.get("weight", 1.0),
            )
        )

    constraints = []
    for c_data in goal_data.get("constraints", []):
        constraints.append(Constraint(
            id=c_data["id"],
            description=c_data["description"],
            constraint_type=c_data.get("constraint_type", "hard"),
            category=c_data.get("category", "safety"),
            check=c_data.get("check", ""),
        ))
        constraints.append(
            Constraint(
                id=c_data["id"],
                description=c_data["description"],
                constraint_type=c_data.get("constraint_type", "hard"),
                category=c_data.get("category", "safety"),
                check=c_data.get("check", ""),
            )
        )

    goal = Goal(
        id=goal_data.get("id", ""),
@@ -466,12 +471,16 @@ class AgentRunner:
            entry_node=self.graph.entry_node,
            terminal_nodes=self.graph.terminal_nodes,
            success_criteria=[
                {"id": sc.id, "description": sc.description, "metric": sc.metric, "target": sc.target}
                {
                    "id": sc.id,
                    "description": sc.description,
                    "metric": sc.metric,
                    "target": sc.target,
                }
                for sc in self.goal.success_criteria
            ],
            constraints=[
                {"id": c.id, "description": c.description, "type": c.constraint_type}
                for c in self.goal.constraints
                {"id": c.id, "description": c.description, "type": c.constraint_type} for c in self.goal.constraints
            ],
            required_tools=sorted(required_tools),
            has_tools_module=(self.agent_path / "tools.py").exists(),
@@ -514,7 +523,7 @@ class AgentRunner:

        # Check tool credentials (Tier 2)
        missing_creds = cred_manager.get_missing_for_tools(info.required_tools)
        for cred_name, spec in missing_creds:
        for _cred_name, spec in missing_creds:
            missing_credentials.append(spec.env_var)
            affected_tools = [t for t in info.required_tools if t in spec.tools]
            tools_str = ", ".join(affected_tools)
@@ -524,9 +533,9 @@ class AgentRunner:
            warnings.append(warning_msg)

        # Check node type credentials (e.g., ANTHROPIC_API_KEY for LLM nodes)
        node_types = list(set(node.node_type for node in self.graph.nodes))
        node_types = list({node.node_type for node in self.graph.nodes})
        missing_node_creds = cred_manager.get_missing_for_node_types(node_types)
        for cred_name, spec in missing_node_creds:
        for _cred_name, spec in missing_node_creds:
            if spec.env_var not in missing_credentials: # Avoid duplicates
                missing_credentials.append(spec.env_var)
                affected_types = [t for t in node_types if t in spec.node_types]
@@ -537,10 +546,7 @@ class AgentRunner:
            warnings.append(warning_msg)
        except ImportError:
            # aden_tools not installed - fall back to direct check
            has_llm_nodes = any(
                node.node_type in ("llm_generate", "llm_tool_use")
                for node in self.graph.nodes
            )
            has_llm_nodes = any(node.node_type in ("llm_generate", "llm_tool_use") for node in self.graph.nodes)
            if has_llm_nodes and not os.environ.get("ANTHROPIC_API_KEY"):
                warnings.append("Agent has LLM nodes but ANTHROPIC_API_KEY not set")
@@ -565,7 +571,7 @@ class AgentRunner:
        Returns:
            CapabilityResponse with level, confidence, and reasoning
        """
        from framework.runner.protocol import CapabilityResponse, CapabilityLevel
        from framework.runner.protocol import CapabilityLevel, CapabilityResponse

        # Use provided LLM or set up our own
        eval_llm = llm
@@ -622,7 +628,8 @@ Respond with JSON only:

        # Parse response
        import re
        json_match = re.search(r'\{[^{}]*\}', response.content, re.DOTALL)

        json_match = re.search(r"\{[^{}]*\}", response.content, re.DOTALL)
        if json_match:
            data = json.loads(json_match.group())
            level_map = {
@@ -646,7 +653,7 @@ Respond with JSON only:

    def _keyword_capability_check(self, request: dict) -> "CapabilityResponse":
        """Simple keyword-based capability check (fallback when no LLM)."""
        from framework.runner.protocol import CapabilityResponse, CapabilityLevel
        from framework.runner.protocol import CapabilityLevel, CapabilityResponse

        info = self.info()
        request_str = json.dumps(request).lower()
@@ -4,11 +4,12 @@ import importlib.util
import inspect
import json
import logging
from collections.abc import Callable
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Callable
from typing import Any

from framework.llm.provider import Tool, ToolUse, ToolResult
from framework.llm.provider import Tool, ToolResult, ToolUse

logger = logging.getLogger(__name__)

@@ -142,7 +143,7 @@ class ToolRegistry:

        # Check for TOOLS dict
        if hasattr(module, "TOOLS"):
            tools_dict = getattr(module, "TOOLS")
            tools_dict = module.TOOLS
            executor_func = getattr(module, "tool_executor", None)

            for name, tool in tools_dict.items():
@@ -6,13 +6,13 @@ that Builder can analyze. The agent calls simple methods, and the runtime
handles all the structured logging.
"""

from datetime import datetime
from typing import Any
from pathlib import Path
import logging
import uuid
from datetime import datetime
from pathlib import Path
from typing import Any

from framework.schemas.decision import Decision, Option, Outcome, DecisionType
from framework.schemas.decision import Decision, DecisionType, Option, Outcome
from framework.schemas.run import Run, RunStatus
from framework.storage.backend import FileStorage
@@ -174,15 +174,17 @@ class Runtime:
        # Build Option objects
        option_objects = []
        for opt in options:
            option_objects.append(Option(
                id=opt["id"],
                description=opt.get("description", ""),
                action_type=opt.get("action_type", "unknown"),
                action_params=opt.get("action_params", {}),
                pros=opt.get("pros", []),
                cons=opt.get("cons", []),
                confidence=opt.get("confidence", 0.5),
            ))
            option_objects.append(
                Option(
                    id=opt["id"],
                    description=opt.get("description", ""),
                    action_type=opt.get("action_type", "unknown"),
                    action_params=opt.get("action_params", {}),
                    pros=opt.get("pros", []),
                    cons=opt.get("cons", []),
                    confidence=opt.get("confidence", 0.5),
                )
            )

        # Create decision
        decision_id = f"dec_{len(self._current_run.decisions)}"
@@ -370,11 +372,13 @@ class Runtime:
        """
        return self.decide(
            intent=intent,
            options=[{
                "id": "action",
                "description": action,
                "action_type": "execute",
            }],
            options=[
                {
                    "id": "action",
                    "description": action,
                    "action_type": "execute",
                }
            ],
            chosen="action",
            reasoning=reasoning,
            node_id=node_id,
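Aside (not part of the changeset): recording a decision through the Runtime.decide API shown above. The keyword arguments and the option-dict keys (id, description, action_type, plus optional action_params, pros, cons, confidence) come from the diff; the intent, options, and node id below are hypothetical.

decision_id = runtime.decide(  # `runtime` is assumed to be an existing Runtime instance
    intent="Choose how to fetch contact data",
    options=[
        {"id": "api", "description": "Call the contacts API", "action_type": "tool_call"},
        {"id": "cache", "description": "Reuse cached contacts", "action_type": "generate"},
    ],
    chosen="api",
    reasoning="Cache is older than the freshness requirement",
    node_id="fetch-contacts",
)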
@@ -1,7 +1,7 @@
"""Schema definitions for runtime data."""

from framework.schemas.decision import Decision, Option, Outcome, DecisionEvaluation
from framework.schemas.run import Run, RunSummary, Problem
from framework.schemas.decision import Decision, DecisionEvaluation, Option, Outcome
from framework.schemas.run import Problem, Run, RunSummary

__all__ = [
    "Decision",
@@ -10,22 +10,23 @@ This is MORE important than actions because:
"""

from datetime import datetime
from typing import Any
from enum import Enum
from typing import Any

from pydantic import BaseModel, Field, computed_field


class DecisionType(str, Enum):
    """Types of decisions an agent can make."""
    TOOL_SELECTION = "tool_selection" # Which tool to use

    TOOL_SELECTION = "tool_selection"  # Which tool to use
    PARAMETER_CHOICE = "parameter_choice"  # What parameters to pass
    PATH_CHOICE = "path_choice" # Which branch to take
    OUTPUT_FORMAT = "output_format" # How to format output
    RETRY_STRATEGY = "retry_strategy" # How to handle failure
    DELEGATION = "delegation" # Whether to delegate to another node
    TERMINATION = "termination" # Whether to stop or continue
    CUSTOM = "custom" # User-defined decision type
    PATH_CHOICE = "path_choice"  # Which branch to take
    OUTPUT_FORMAT = "output_format"  # How to format output
    RETRY_STRATEGY = "retry_strategy"  # How to handle failure
    DELEGATION = "delegation"  # Whether to delegate to another node
    TERMINATION = "termination"  # Whether to stop or continue
    CUSTOM = "custom"  # User-defined decision type


class Option(BaseModel):
@@ -35,9 +36,10 @@ class Option(BaseModel):
    Capturing options is crucial - it shows what the agent considered
    and enables us to evaluate whether the right choice was made.
    """

    id: str
    description: str # Human-readable: "Call search API"
    action_type: str # "tool_call", "generate", "delegate"
    description: str  # Human-readable: "Call search API"
    action_type: str  # "tool_call", "generate", "delegate"
    action_params: dict[str, Any] = Field(default_factory=dict)

    # Why might this be good or bad?
@@ -57,9 +59,10 @@ class Outcome(BaseModel):
    This is filled in AFTER the action completes, allowing us to
    correlate decisions with their results.
    """

    success: bool
    result: Any = None # The actual output
    error: str | None = None # Error message if failed
    result: Any = None  # The actual output
    error: str | None = None  # Error message if failed

    # Side effects
    state_changes: dict[str, Any] = Field(default_factory=dict)
@@ -67,7 +70,7 @@ class Outcome(BaseModel):
    latency_ms: int = 0

    # Natural language summary (crucial for Builder)
    summary: str = "" # "Found 3 contacts matching query"
    summary: str = ""  # "Found 3 contacts matching query"

    timestamp: datetime = Field(default_factory=datetime.now)

@@ -81,6 +84,7 @@ class DecisionEvaluation(BaseModel):
    This is computed AFTER the run completes, allowing us to
    judge decisions in light of their eventual outcomes.
    """

    # Did it move toward the goal?
    goal_aligned: bool = True
    alignment_score: float = Field(default=1.0, ge=0.0, le=1.0)
@@ -109,6 +113,7 @@ class Decision(BaseModel):
    Every significant choice the agent makes is captured here.
    This is the core data structure for understanding and improving agents.
    """

    id: str
    timestamp: datetime = Field(default_factory=datetime.now)
    node_id: str
@@ -6,8 +6,8 @@ summaries and metrics that Builder needs to understand what happened.
"""

from datetime import datetime
from typing import Any
from enum import Enum
from typing import Any

from pydantic import BaseModel, Field, computed_field

@@ -16,10 +16,11 @@ from framework.schemas.decision import Decision, Outcome

class RunStatus(str, Enum):
    """Status of a run."""

    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    STUCK = "stuck" # Making no progress
    STUCK = "stuck"  # Making no progress
    CANCELLED = "cancelled"


@@ -29,6 +30,7 @@ class Problem(BaseModel):

    Problems are surfaced explicitly so Builder can focus on what needs fixing.
    """

    id: str
    severity: str = Field(description="critical, warning, or minor")
    description: str
@@ -42,6 +44,7 @@ class Problem(BaseModel):

class RunMetrics(BaseModel):
    """Quantitative metrics about a run."""

    total_decisions: int = 0
    successful_decisions: int = 0
    failed_decisions: int = 0
@@ -68,6 +71,7 @@ class Run(BaseModel):

    Contains all decisions, problems, and metrics from a single run.
    """

    id: str
    goal_id: str
    started_at: datetime = Field(default_factory=datetime.now)
@@ -191,6 +195,7 @@ class RunSummary(BaseModel):

    This is what I (Builder) want to see first when analyzing runs.
    """

    run_id: str
    goal_id: str
    status: RunStatus
@@ -8,7 +8,7 @@ Uses Pydantic's built-in serialization.
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from framework.schemas.run import Run, RunSummary, RunStatus
|
||||
from framework.schemas.run import Run, RunStatus, RunSummary
|
||||
|
||||
|
||||
class FileStorage:
|
||||
|
||||
@@ -63,27 +63,7 @@ python -m framework test-debug <goal_id> <test_id>
|
||||
"""
|
||||
|
||||
# Schemas
|
||||
from framework.testing.test_case import (
|
||||
ApprovalStatus,
|
||||
TestType,
|
||||
Test,
|
||||
)
|
||||
from framework.testing.test_result import (
|
||||
ErrorCategory,
|
||||
TestResult,
|
||||
TestSuiteResult,
|
||||
)
|
||||
|
||||
# Storage
|
||||
from framework.testing.test_storage import TestStorage
|
||||
|
||||
# Generation
|
||||
from framework.testing.constraint_gen import ConstraintTestGenerator
|
||||
from framework.testing.success_gen import SuccessCriteriaTestGenerator
|
||||
from framework.testing.prompts import (
|
||||
CONSTRAINT_TEST_PROMPT,
|
||||
SUCCESS_CRITERIA_TEST_PROMPT,
|
||||
)
|
||||
from framework.testing.approval_cli import batch_approval, interactive_approval
|
||||
|
||||
# Approval
|
||||
from framework.testing.approval_types import (
|
||||
@@ -93,20 +73,40 @@ from framework.testing.approval_types import (
|
||||
BatchApprovalRequest,
|
||||
BatchApprovalResult,
|
||||
)
|
||||
from framework.testing.approval_cli import interactive_approval, batch_approval
|
||||
|
||||
# Error categorization
|
||||
from framework.testing.categorizer import ErrorCategorizer
|
||||
|
||||
# LLM Judge for semantic evaluation
|
||||
from framework.testing.llm_judge import LLMJudge
|
||||
|
||||
# Debug
|
||||
from framework.testing.debug_tool import DebugTool, DebugInfo
|
||||
|
||||
# CLI
|
||||
from framework.testing.cli import register_testing_commands
|
||||
|
||||
# Generation
|
||||
from framework.testing.constraint_gen import ConstraintTestGenerator
|
||||
|
||||
# Debug
|
||||
from framework.testing.debug_tool import DebugInfo, DebugTool
|
||||
|
||||
# LLM Judge for semantic evaluation
|
||||
from framework.testing.llm_judge import LLMJudge
|
||||
from framework.testing.prompts import (
|
||||
CONSTRAINT_TEST_PROMPT,
|
||||
SUCCESS_CRITERIA_TEST_PROMPT,
|
||||
)
|
||||
from framework.testing.success_gen import SuccessCriteriaTestGenerator
|
||||
from framework.testing.test_case import (
|
||||
ApprovalStatus,
|
||||
Test,
|
||||
TestType,
|
||||
)
|
||||
from framework.testing.test_result import (
|
||||
ErrorCategory,
|
||||
TestResult,
|
||||
TestSuiteResult,
|
||||
)
|
||||
|
||||
# Storage
|
||||
from framework.testing.test_storage import TestStorage
|
||||
|
||||
__all__ = [
|
||||
# Schemas
|
||||
"ApprovalStatus",
|
||||
|
||||
@@ -6,19 +6,19 @@ This CLI provides the interactive approval workflow.
|
||||
"""
|
||||
|
||||
import json
|
||||
import tempfile
|
||||
import subprocess
|
||||
import os
|
||||
from typing import Callable
|
||||
import subprocess
|
||||
import tempfile
|
||||
from collections.abc import Callable
|
||||
|
||||
from framework.testing.test_case import Test
|
||||
from framework.testing.test_storage import TestStorage
|
||||
from framework.testing.approval_types import (
|
||||
ApprovalAction,
|
||||
ApprovalRequest,
|
||||
ApprovalResult,
|
||||
BatchApprovalResult,
|
||||
)
|
||||
from framework.testing.test_case import Test
|
||||
from framework.testing.test_storage import TestStorage
|
||||
|
||||
|
||||
def interactive_approval(
|
||||
@@ -96,18 +96,14 @@ def batch_approval(
|
||||
# Validate request
|
||||
valid, error = req.validate_action()
|
||||
if not valid:
|
||||
results.append(ApprovalResult.error_result(
|
||||
req.test_id, req.action, error or "Invalid request"
|
||||
))
|
||||
results.append(ApprovalResult.error_result(req.test_id, req.action, error or "Invalid request"))
|
||||
counts["errors"] += 1
|
||||
continue
|
||||
|
||||
# Load test
|
||||
test = storage.load_test(goal_id, req.test_id)
|
||||
if not test:
|
||||
results.append(ApprovalResult.error_result(
|
||||
req.test_id, req.action, f"Test {req.test_id} not found"
|
||||
))
|
||||
results.append(ApprovalResult.error_result(req.test_id, req.action, f"Test {req.test_id} not found"))
|
||||
counts["errors"] += 1
|
||||
continue
|
||||
|
||||
@@ -129,14 +125,12 @@ def batch_approval(
|
||||
if req.action != ApprovalAction.SKIP:
|
||||
storage.update_test(test)
|
||||
|
||||
results.append(ApprovalResult.success_result(
|
||||
req.test_id, req.action, f"Test {req.action.value}d successfully"
|
||||
))
|
||||
results.append(
|
||||
ApprovalResult.success_result(req.test_id, req.action, f"Test {req.action.value}d successfully")
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
results.append(ApprovalResult.error_result(
|
||||
req.test_id, req.action, str(e)
|
||||
))
|
||||
results.append(ApprovalResult.error_result(req.test_id, req.action, str(e)))
|
||||
counts["errors"] += 1
|
||||
|
||||
return BatchApprovalResult(
|
||||
@@ -260,11 +254,7 @@ def _edit_test_code(code: str) -> str:
|
||||
break
|
||||
|
||||
# Create temp file with code
|
||||
with tempfile.NamedTemporaryFile(
|
||||
mode="w",
|
||||
suffix=".py",
|
||||
delete=False
|
||||
) as f:
|
||||
with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
|
||||
f.write(code)
|
||||
temp_path = f.name
|
||||
|
||||
@@ -292,4 +282,5 @@ def _edit_test_code(code: str) -> str:
|
||||
def _command_exists(cmd: str) -> bool:
|
||||
"""Check if a command exists in PATH."""
|
||||
from shutil import which
|
||||
|
||||
return which(cmd) is not None
|
||||
|
||||
@@ -5,8 +5,8 @@ These types are used for both interactive CLI approval and
|
||||
programmatic/MCP-based approval.
|
||||
"""
|
||||
|
||||
from enum import Enum
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
from typing import Any
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
@@ -14,10 +14,11 @@ from pydantic import BaseModel, Field
|
||||
|
||||
class ApprovalAction(str, Enum):
|
||||
"""Actions a user can take on a generated test."""
|
||||
APPROVE = "approve" # Accept as-is
|
||||
MODIFY = "modify" # Accept with modifications
|
||||
REJECT = "reject" # Decline
|
||||
SKIP = "skip" # Leave pending (decide later)
|
||||
|
||||
APPROVE = "approve" # Accept as-is
|
||||
MODIFY = "modify" # Accept with modifications
|
||||
REJECT = "reject" # Decline
|
||||
SKIP = "skip" # Leave pending (decide later)
|
||||
|
||||
|
||||
class ApprovalRequest(BaseModel):
|
||||
@@ -26,16 +27,11 @@ class ApprovalRequest(BaseModel):
|
||||
|
||||
Used by both CLI and MCP interfaces.
|
||||
"""
|
||||
|
||||
test_id: str
|
||||
action: ApprovalAction
|
||||
modified_code: str | None = Field(
|
||||
default=None,
|
||||
description="New code if action is MODIFY"
|
||||
)
|
||||
reason: str | None = Field(
|
||||
default=None,
|
||||
description="Rejection reason if action is REJECT"
|
||||
)
|
||||
modified_code: str | None = Field(default=None, description="New code if action is MODIFY")
|
||||
reason: str | None = Field(default=None, description="Rejection reason if action is REJECT")
|
||||
approved_by: str = "user"
|
||||
|
||||
def validate_action(self) -> tuple[bool, str | None]:
|
||||
@@ -56,6 +52,7 @@ class ApprovalResult(BaseModel):
|
||||
"""
|
||||
Result of processing an approval request.
|
||||
"""
|
||||
|
||||
test_id: str
|
||||
action: ApprovalAction
|
||||
success: bool
|
||||
@@ -64,9 +61,7 @@ class ApprovalResult(BaseModel):
|
||||
timestamp: datetime = Field(default_factory=datetime.now)
|
||||
|
||||
@classmethod
|
||||
def success_result(
|
||||
cls, test_id: str, action: ApprovalAction, message: str | None = None
|
||||
) -> "ApprovalResult":
|
||||
def success_result(cls, test_id: str, action: ApprovalAction, message: str | None = None) -> "ApprovalResult":
|
||||
"""Create a successful result."""
|
||||
return cls(
|
||||
test_id=test_id,
|
||||
@@ -76,9 +71,7 @@ class ApprovalResult(BaseModel):
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def error_result(
|
||||
cls, test_id: str, action: ApprovalAction, error: str
|
||||
) -> "ApprovalResult":
|
||||
def error_result(cls, test_id: str, action: ApprovalAction, error: str) -> "ApprovalResult":
|
||||
"""Create an error result."""
|
||||
return cls(
|
||||
test_id=test_id,
|
||||
@@ -94,6 +87,7 @@ class BatchApprovalRequest(BaseModel):
|
||||
|
||||
Useful for MCP interface where user reviews all tests and submits decisions.
|
||||
"""
|
||||
|
||||
goal_id: str
|
||||
approvals: list[ApprovalRequest]
|
||||
|
||||
@@ -109,6 +103,7 @@ class BatchApprovalResult(BaseModel):
|
||||
"""
|
||||
Result of processing a batch approval request.
|
||||
"""
|
||||
|
||||
goal_id: str
|
||||
total: int
|
||||
approved: int
|
||||
|
||||
@@ -80,15 +80,9 @@ class ErrorCategorizer:
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize categorizer with compiled patterns."""
|
||||
self._logic_patterns = [
|
||||
re.compile(p, re.IGNORECASE) for p in self.LOGIC_ERROR_PATTERNS
|
||||
]
|
||||
self._impl_patterns = [
|
||||
re.compile(p, re.IGNORECASE) for p in self.IMPLEMENTATION_ERROR_PATTERNS
|
||||
]
|
||||
self._edge_patterns = [
|
||||
re.compile(p, re.IGNORECASE) for p in self.EDGE_CASE_PATTERNS
|
||||
]
|
||||
self._logic_patterns = [re.compile(p, re.IGNORECASE) for p in self.LOGIC_ERROR_PATTERNS]
|
||||
self._impl_patterns = [re.compile(p, re.IGNORECASE) for p in self.IMPLEMENTATION_ERROR_PATTERNS]
|
||||
self._edge_patterns = [re.compile(p, re.IGNORECASE) for p in self.EDGE_CASE_PATTERNS]
|
||||
|
||||
def categorize(self, result: TestResult) -> ErrorCategory | None:
|
||||
"""
|
||||
@@ -125,9 +119,7 @@ class ErrorCategorizer:
|
||||
# Default to implementation error (most common)
|
||||
return ErrorCategory.IMPLEMENTATION_ERROR
|
||||
|
||||
def categorize_with_confidence(
|
||||
self, result: TestResult
|
||||
) -> tuple[ErrorCategory | None, float]:
|
||||
def categorize_with_confidence(self, result: TestResult) -> tuple[ErrorCategory | None, float]:
|
||||
"""
|
||||
Categorize with a confidence score.
|
||||
|
||||
@@ -143,15 +135,9 @@ class ErrorCategorizer:
|
||||
error_text = self._get_error_text(result)
|
||||
|
||||
# Count pattern matches for each category
|
||||
logic_matches = sum(
|
||||
1 for p in self._logic_patterns if p.search(error_text)
|
||||
)
|
||||
impl_matches = sum(
|
||||
1 for p in self._impl_patterns if p.search(error_text)
|
||||
)
|
||||
edge_matches = sum(
|
||||
1 for p in self._edge_patterns if p.search(error_text)
|
||||
)
|
||||
logic_matches = sum(1 for p in self._logic_patterns if p.search(error_text))
|
||||
impl_matches = sum(1 for p in self._impl_patterns if p.search(error_text))
|
||||
edge_matches = sum(1 for p in self._edge_patterns if p.search(error_text))
|
||||
|
||||
total_matches = logic_matches + impl_matches + edge_matches
|
||||
|
||||
@@ -204,8 +190,7 @@ class ErrorCategorizer:
|
||||
"The goal specification may not accurately describe the desired behavior."
|
||||
),
|
||||
ErrorCategory.IMPLEMENTATION_ERROR: (
|
||||
"Fix the code in agent nodes/edges. "
|
||||
"There's a bug in the implementation that needs to be corrected."
|
||||
"Fix the code in agent nodes/edges. There's a bug in the implementation that needs to be corrected."
|
||||
),
|
||||
ErrorCategory.EDGE_CASE: (
|
||||
"Add a new test for this edge case scenario. "
|
||||
@@ -238,23 +223,22 @@ class ErrorCategorizer:
|
||||
"action": "Fix nodes/edges implementation",
|
||||
"restart_required": False,
|
||||
"description": (
|
||||
"There's a code bug. Fix the agent implementation, "
|
||||
"then re-run Eval (skip Goal stage)."
|
||||
"There's a code bug. Fix the agent implementation, then re-run Eval (skip Goal stage)."
|
||||
),
|
||||
},
|
||||
ErrorCategory.EDGE_CASE: {
|
||||
"stage": "Eval",
|
||||
"action": "Add new test only",
|
||||
"restart_required": False,
|
||||
"description": (
|
||||
"This is a new scenario. Add a test for it and continue "
|
||||
"in the Eval stage."
|
||||
),
|
||||
"description": ("This is a new scenario. Add a test for it and continue in the Eval stage."),
|
||||
},
|
||||
}
|
||||
return guidance.get(category, {
|
||||
"stage": "Unknown",
|
||||
"action": "Review manually",
|
||||
"restart_required": False,
|
||||
"description": "Unable to determine category. Manual review required.",
|
||||
})
|
||||
return guidance.get(
|
||||
category,
|
||||
{
|
||||
"stage": "Unknown",
|
||||
"action": "Review manually",
|
||||
"restart_required": False,
|
||||
"description": "Unable to determine category. Manual review required.",
|
||||
},
|
||||
)
|
||||
|
||||
@@ -14,11 +14,10 @@ import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
from framework.graph.goal import Goal
|
||||
from framework.testing.test_storage import TestStorage
|
||||
from framework.testing.approval_cli import interactive_approval
|
||||
from framework.testing.constraint_gen import ConstraintTestGenerator
|
||||
from framework.testing.success_gen import SuccessCriteriaTestGenerator
|
||||
from framework.testing.approval_cli import interactive_approval
|
||||
|
||||
from framework.testing.test_storage import TestStorage
|
||||
|
||||
DEFAULT_STORAGE_PATH = Path("exports")
|
||||
|
||||
@@ -173,6 +172,7 @@ def cmd_test_generate(args: argparse.Namespace) -> int:
|
||||
# Get LLM provider
|
||||
try:
|
||||
from framework.llm import AnthropicProvider
|
||||
|
||||
llm = AnthropicProvider()
|
||||
except Exception as e:
|
||||
print(f"Error: Failed to initialize LLM provider: {e}")
|
||||
@@ -376,6 +376,7 @@ def cmd_test_list(args: argparse.Namespace) -> int:
|
||||
# Filter by status
|
||||
if args.status != "all":
|
||||
from framework.testing.test_case import ApprovalStatus
|
||||
|
||||
try:
|
||||
filter_status = ApprovalStatus(args.status)
|
||||
tests = [t for t in tests if t.approval_status == filter_status]
|
||||
|
||||
@@ -8,10 +8,10 @@ Tests are returned with PENDING approval status.
|
||||
import uuid
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from framework.graph.goal import Goal, Constraint
|
||||
from framework.testing.test_case import Test, TestType, ApprovalStatus
|
||||
from framework.graph.goal import Constraint, Goal
|
||||
from framework.llm.provider import Tool, ToolResult, ToolUse
|
||||
from framework.testing.prompts import CONSTRAINT_TEST_PROMPT
|
||||
from framework.llm.provider import Tool, ToolUse, ToolResult
|
||||
from framework.testing.test_case import ApprovalStatus, Test, TestType
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from framework.llm.provider import LLMProvider
|
||||
@@ -103,16 +103,15 @@ class ConstraintTestGenerator:
|
||||
def tool_executor(tool_use: ToolUse) -> ToolResult:
|
||||
if tool_use.name == "submit_test":
|
||||
collected_tests.append(tool_use.input)
|
||||
return ToolResult(
|
||||
tool_use_id=tool_use.id, content="Test recorded successfully"
|
||||
)
|
||||
return ToolResult(
|
||||
tool_use_id=tool_use.id, content="Unknown tool", is_error=True
|
||||
)
|
||||
return ToolResult(tool_use_id=tool_use.id, content="Test recorded successfully")
|
||||
return ToolResult(tool_use_id=tool_use.id, content="Unknown tool", is_error=True)
|
||||
|
||||
self.llm.complete_with_tools(
|
||||
messages=[{"role": "user", "content": prompt}],
|
||||
system="You are a test generation expert. For each constraint, call the submit_test tool with the test details.",
|
||||
system=(
|
||||
"You are a test generation expert. "
|
||||
"For each constraint, call the submit_test tool with the test details."
|
||||
),
|
||||
tools=[SUBMIT_TEST_TOOL],
|
||||
tool_executor=tool_executor,
|
||||
max_iterations=5,
|
||||
@@ -124,9 +123,7 @@ class ConstraintTestGenerator:
|
||||
# Enforce max 5 tests total
|
||||
return tests[:5]
|
||||
|
||||
def generate_for_constraint(
|
||||
self, goal: Goal, constraint: Constraint, agent_module: str = "my_agent"
|
||||
) -> list[Test]:
|
||||
def generate_for_constraint(self, goal: Goal, constraint: Constraint, agent_module: str = "my_agent") -> list[Test]:
|
||||
"""
|
||||
Generate tests for a single constraint.
|
||||
|
||||
@@ -152,12 +149,8 @@ class ConstraintTestGenerator:
|
||||
def tool_executor(tool_use: ToolUse) -> ToolResult:
|
||||
if tool_use.name == "submit_test":
|
||||
collected_tests.append(tool_use.input)
|
||||
return ToolResult(
|
||||
tool_use_id=tool_use.id, content="Test recorded successfully"
|
||||
)
|
||||
return ToolResult(
|
||||
tool_use_id=tool_use.id, content="Unknown tool", is_error=True
|
||||
)
|
||||
return ToolResult(tool_use_id=tool_use.id, content="Test recorded successfully")
|
||||
return ToolResult(tool_use_id=tool_use.id, content="Unknown tool", is_error=True)
|
||||
|
||||
self.llm.complete_with_tools(
|
||||
messages=[{"role": "user", "content": prompt}],
|
||||
@@ -186,9 +179,7 @@ class ConstraintTestGenerator:
|
||||
- Description: {constraint.description}
|
||||
- Check: {constraint.check}"""
|
||||
|
||||
def _create_tests_from_collected(
|
||||
self, collected: list[dict], goal_id: str
|
||||
) -> list[Test]:
|
||||
def _create_tests_from_collected(self, collected: list[dict], goal_id: str) -> list[Test]:
|
||||
"""Create Test objects from tool call data."""
|
||||
tests = []
|
||||
for td in collected:
|
||||
|
||||
@@ -13,16 +13,17 @@ from typing import Any
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from framework.testing.test_case import Test
|
||||
from framework.testing.test_result import TestResult, ErrorCategory
|
||||
from framework.testing.test_storage import TestStorage
|
||||
from framework.testing.categorizer import ErrorCategorizer
|
||||
from framework.testing.test_case import Test
|
||||
from framework.testing.test_result import ErrorCategory, TestResult
|
||||
from framework.testing.test_storage import TestStorage
|
||||
|
||||
|
||||
class DebugInfo(BaseModel):
|
||||
"""
|
||||
Comprehensive debug information for a failed test.
|
||||
"""
|
||||
|
||||
test_id: str
|
||||
test_name: str
|
||||
|
||||
@@ -239,12 +240,10 @@ class DebugTool:
|
||||
return {
|
||||
"execution_path": run.metrics.nodes_executed if hasattr(run, "metrics") else [],
|
||||
"decisions": [
|
||||
d.model_dump() if hasattr(d, "model_dump") else str(d)
|
||||
for d in getattr(run, "decisions", [])
|
||||
d.model_dump() if hasattr(d, "model_dump") else str(d) for d in getattr(run, "decisions", [])
|
||||
],
|
||||
"problems": [
|
||||
p.model_dump() if hasattr(p, "model_dump") else str(p)
|
||||
for p in getattr(run, "problems", [])
|
||||
p.model_dump() if hasattr(p, "model_dump") else str(p) for p in getattr(run, "problems", [])
|
||||
],
|
||||
"status": run.status.value if hasattr(run, "status") else "unknown",
|
||||
}
|
||||
@@ -279,8 +278,7 @@ class DebugTool:
|
||||
|
||||
if failures_by_category["uncategorized"]:
|
||||
suggestions.append(
|
||||
f"Found {len(failures_by_category['uncategorized'])} uncategorized failures. "
|
||||
"Manual review required."
|
||||
f"Found {len(failures_by_category['uncategorized'])} uncategorized failures. Manual review required."
|
||||
)
|
||||
|
||||
return suggestions
|
||||
|
||||
@@ -41,8 +41,8 @@ class LLMJudge:
|
||||
import anthropic
|
||||
|
||||
self._client = anthropic.Anthropic()
|
||||
except ImportError:
|
||||
raise RuntimeError("anthropic package required for LLM judge")
|
||||
except ImportError as e:
|
||||
raise RuntimeError("anthropic package required for LLM judge") from e
|
||||
return self._client
|
||||
|
||||
def evaluate(
|
||||
|
||||
@@ -9,9 +9,9 @@ import uuid
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from framework.graph.goal import Goal, SuccessCriterion
|
||||
from framework.testing.test_case import Test, TestType, ApprovalStatus
|
||||
from framework.llm.provider import Tool, ToolResult, ToolUse
|
||||
from framework.testing.prompts import SUCCESS_CRITERIA_TEST_PROMPT
|
||||
from framework.llm.provider import Tool, ToolUse, ToolResult
|
||||
from framework.testing.test_case import ApprovalStatus, Test, TestType
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from framework.llm.provider import LLMProvider
|
||||
@@ -115,16 +115,15 @@ class SuccessCriteriaTestGenerator:
|
||||
def tool_executor(tool_use: ToolUse) -> ToolResult:
|
||||
if tool_use.name == "submit_test":
|
||||
collected_tests.append(tool_use.input)
|
||||
return ToolResult(
|
||||
tool_use_id=tool_use.id, content="Test recorded successfully"
|
||||
)
|
||||
return ToolResult(
|
||||
tool_use_id=tool_use.id, content="Unknown tool", is_error=True
|
||||
)
|
||||
return ToolResult(tool_use_id=tool_use.id, content="Test recorded successfully")
|
||||
return ToolResult(tool_use_id=tool_use.id, content="Unknown tool", is_error=True)
|
||||
|
||||
self.llm.complete_with_tools(
|
||||
messages=[{"role": "user", "content": prompt}],
|
||||
system="You are a test generation expert. For each success criterion, call the submit_test tool with the test details.",
|
||||
system=(
|
||||
"You are a test generation expert. "
|
||||
"For each success criterion, call the submit_test tool with the test details."
|
||||
),
|
||||
tools=[SUBMIT_TEST_TOOL],
|
||||
tool_executor=tool_executor,
|
||||
max_iterations=12,
|
||||
@@ -172,12 +171,8 @@ class SuccessCriteriaTestGenerator:
|
||||
def tool_executor(tool_use: ToolUse) -> ToolResult:
|
||||
if tool_use.name == "submit_test":
|
||||
collected_tests.append(tool_use.input)
|
||||
return ToolResult(
|
||||
tool_use_id=tool_use.id, content="Test recorded successfully"
|
||||
)
|
||||
return ToolResult(
|
||||
tool_use_id=tool_use.id, content="Unknown tool", is_error=True
|
||||
)
|
||||
return ToolResult(tool_use_id=tool_use.id, content="Test recorded successfully")
|
||||
return ToolResult(tool_use_id=tool_use.id, content="Unknown tool", is_error=True)
|
||||
|
||||
self.llm.complete_with_tools(
|
||||
messages=[{"role": "user", "content": prompt}],
|
||||
@@ -206,9 +201,7 @@ class SuccessCriteriaTestGenerator:
|
||||
- Weight: {criterion.weight}
|
||||
- Currently met: {criterion.met}"""
|
||||
|
||||
def _create_tests_from_collected(
|
||||
self, collected: list[dict], goal_id: str
|
||||
) -> list[Test]:
|
||||
def _create_tests_from_collected(self, collected: list[dict], goal_id: str) -> list[Test]:
|
||||
"""Create Test objects from tool call data."""
|
||||
tests = []
|
||||
for td in collected:
|
||||
|
||||
@@ -14,18 +14,20 @@ from pydantic import BaseModel, Field
|
||||
|
||||
class ApprovalStatus(str, Enum):
|
||||
"""Status of user approval for a generated test."""
|
||||
PENDING = "pending" # Awaiting user review
|
||||
APPROVED = "approved" # User accepted as-is
|
||||
MODIFIED = "modified" # User edited before accepting
|
||||
REJECTED = "rejected" # User declined (with reason)
|
||||
|
||||
PENDING = "pending" # Awaiting user review
|
||||
APPROVED = "approved" # User accepted as-is
|
||||
MODIFIED = "modified" # User edited before accepting
|
||||
REJECTED = "rejected" # User declined (with reason)
|
||||
|
||||
|
||||
class TestType(str, Enum):
|
||||
"""Type of test based on what it validates."""
|
||||
|
||||
__test__ = False # Not a pytest test class
|
||||
CONSTRAINT = "constraint" # Validates constraint boundaries
|
||||
SUCCESS_CRITERIA = "outcome" # Validates success criteria achievement
|
||||
EDGE_CASE = "edge_case" # Validates edge case handling
|
||||
CONSTRAINT = "constraint" # Validates constraint boundaries
|
||||
SUCCESS_CRITERIA = "outcome" # Validates success criteria achievement
|
||||
EDGE_CASE = "edge_case" # Validates edge case handling
|
||||
|
||||
|
||||
class Test(BaseModel):
|
||||
@@ -38,64 +40,34 @@ class Test(BaseModel):
|
||||
|
||||
All tests require approval before being added to the test suite.
|
||||
"""
|
||||
|
||||
__test__ = False # Not a pytest test class
|
||||
id: str
|
||||
goal_id: str
|
||||
parent_criteria_id: str = Field(
|
||||
description="Links to success_criteria.id or constraint.id"
|
||||
)
|
||||
parent_criteria_id: str = Field(description="Links to success_criteria.id or constraint.id")
|
||||
test_type: TestType
|
||||
|
||||
# Test definition
|
||||
test_name: str = Field(
|
||||
description="Descriptive function name, e.g., test_constraint_api_limits_respected"
|
||||
)
|
||||
test_code: str = Field(
|
||||
description="Python test function code (pytest compatible)"
|
||||
)
|
||||
description: str = Field(
|
||||
description="Human-readable description of what the test validates"
|
||||
)
|
||||
input: dict[str, Any] = Field(
|
||||
default_factory=dict,
|
||||
description="Test input data"
|
||||
)
|
||||
expected_output: dict[str, Any] = Field(
|
||||
default_factory=dict,
|
||||
description="Expected output or assertions"
|
||||
)
|
||||
test_name: str = Field(description="Descriptive function name, e.g., test_constraint_api_limits_respected")
|
||||
test_code: str = Field(description="Python test function code (pytest compatible)")
|
||||
description: str = Field(description="Human-readable description of what the test validates")
|
||||
input: dict[str, Any] = Field(default_factory=dict, description="Test input data")
|
||||
expected_output: dict[str, Any] = Field(default_factory=dict, description="Expected output or assertions")
|
||||
|
||||
# LLM generation metadata
|
||||
generated_by: str = Field(
|
||||
default="llm",
|
||||
description="Who created the test: 'llm' or 'human'"
|
||||
)
|
||||
llm_confidence: float = Field(
|
||||
default=0.0,
|
||||
ge=0.0,
|
||||
le=1.0,
|
||||
description="LLM's confidence in the test quality (0-1)"
|
||||
)
|
||||
generated_by: str = Field(default="llm", description="Who created the test: 'llm' or 'human'")
|
||||
llm_confidence: float = Field(default=0.0, ge=0.0, le=1.0, description="LLM's confidence in the test quality (0-1)")
|
||||
|
||||
# Approval tracking (CRITICAL - tests are never used without approval)
|
||||
approval_status: ApprovalStatus = ApprovalStatus.PENDING
|
||||
approved_by: str | None = None
|
||||
approved_at: datetime | None = None
|
||||
rejection_reason: str | None = Field(
|
||||
default=None,
|
||||
description="Reason for rejection if status is REJECTED"
|
||||
)
|
||||
original_code: str | None = Field(
|
||||
default=None,
|
||||
description="Original LLM-generated code if user modified it"
|
||||
)
|
||||
rejection_reason: str | None = Field(default=None, description="Reason for rejection if status is REJECTED")
|
||||
original_code: str | None = Field(default=None, description="Original LLM-generated code if user modified it")
|
||||
|
||||
# Execution tracking
|
||||
last_run: datetime | None = None
|
||||
last_result: str | None = Field(
|
||||
default=None,
|
||||
description="Result of last run: 'passed', 'failed', 'error'"
|
||||
)
|
||||
last_result: str | None = Field(default=None, description="Result of last run: 'passed', 'failed', 'error'")
|
||||
run_count: int = 0
|
||||
pass_count: int = 0
|
||||
fail_count: int = 0
|
||||
|
||||
@@ -21,6 +21,7 @@ class ErrorCategory(str, Enum):
|
||||
- IMPLEMENTATION_ERROR: Code bug → fix nodes/edges in Agent stage
|
||||
- EDGE_CASE: New scenario discovered → add new test only
|
||||
"""
|
||||
|
||||
LOGIC_ERROR = "logic_error"
|
||||
IMPLEMENTATION_ERROR = "implementation_error"
|
||||
EDGE_CASE = "edge_case"
|
||||
@@ -36,13 +37,11 @@ class TestResult(BaseModel):
|
||||
- Error details for debugging
|
||||
- Runtime logs and execution path
|
||||
"""
|
||||
|
||||
__test__ = False # Not a pytest test class
|
||||
test_id: str
|
||||
passed: bool
|
||||
duration_ms: int = Field(
|
||||
ge=0,
|
||||
description="Test execution time in milliseconds"
|
||||
)
|
||||
duration_ms: int = Field(ge=0, description="Test execution time in milliseconds")
|
||||
|
||||
# Output comparison
|
||||
actual_output: Any = None
|
||||
@@ -54,24 +53,12 @@ class TestResult(BaseModel):
|
||||
stack_trace: str | None = None
|
||||
|
||||
# Runtime data for debugging
|
||||
runtime_logs: list[dict[str, Any]] = Field(
|
||||
default_factory=list,
|
||||
description="Log entries from test execution"
|
||||
)
|
||||
node_outputs: dict[str, Any] = Field(
|
||||
default_factory=dict,
|
||||
description="Output from each node executed during test"
|
||||
)
|
||||
execution_path: list[str] = Field(
|
||||
default_factory=list,
|
||||
description="Sequence of nodes executed"
|
||||
)
|
||||
runtime_logs: list[dict[str, Any]] = Field(default_factory=list, description="Log entries from test execution")
|
||||
node_outputs: dict[str, Any] = Field(default_factory=dict, description="Output from each node executed during test")
|
||||
execution_path: list[str] = Field(default_factory=list, description="Sequence of nodes executed")
|
||||
|
||||
# Associated run ID (links to Runtime data)
|
||||
run_id: str | None = Field(
|
||||
default=None,
|
||||
description="Runtime run ID for detailed analysis"
|
||||
)
|
||||
run_id: str | None = Field(default=None, description="Runtime run ID for detailed analysis")
|
||||
|
||||
timestamp: datetime = Field(default_factory=datetime.now)
|
||||
|
||||
@@ -94,6 +81,7 @@ class TestSuiteResult(BaseModel):
|
||||
|
||||
Provides summary statistics and individual results.
|
||||
"""
|
||||
|
||||
__test__ = False # Not a pytest test class
|
||||
goal_id: str
|
||||
total: int
|
||||
@@ -104,10 +92,7 @@ class TestSuiteResult(BaseModel):
|
||||
|
||||
results: list[TestResult] = Field(default_factory=list)
|
||||
|
||||
duration_ms: int = Field(
|
||||
default=0,
|
||||
description="Total execution time in milliseconds"
|
||||
)
|
||||
duration_ms: int = Field(default=0, description="Total execution time in milliseconds")
|
||||
|
||||
timestamp: datetime = Field(default_factory=datetime.now)
|
||||
|
||||
@@ -145,11 +130,6 @@ class TestSuiteResult(BaseModel):
|
||||
"""Get all failed test results for debugging."""
|
||||
return [r for r in self.results if not r.passed]
|
||||
|
||||
def get_results_by_category(
|
||||
self, category: ErrorCategory
|
||||
) -> list[TestResult]:
|
||||
def get_results_by_category(self, category: ErrorCategory) -> list[TestResult]:
|
||||
"""Get failed results by error category."""
|
||||
return [
|
||||
r for r in self.results
|
||||
if not r.passed and r.error_category == category
|
||||
]
|
||||
return [r for r in self.results if not r.passed and r.error_category == category]
|
||||
|
||||
@@ -6,10 +6,10 @@ storing tests as JSON files with indexes for efficient querying.
|
||||
"""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
from framework.testing.test_case import Test, ApprovalStatus, TestType
|
||||
from framework.testing.test_case import ApprovalStatus, Test, TestType
|
||||
from framework.testing.test_result import TestResult
|
||||
|
||||
|
||||
@@ -34,6 +34,7 @@ class TestStorage:
|
||||
suites/
|
||||
{goal_id}_suite.json # Test suite metadata
|
||||
"""
|
||||
|
||||
__test__ = False # Not a pytest test class
|
||||
|
||||
def __init__(self, base_path: str | Path):
|
||||
@@ -197,10 +198,7 @@ class TestStorage:
|
||||
return []
|
||||
|
||||
# Get all result files except latest.json
|
||||
result_files = sorted(
|
||||
[f for f in results_dir.glob("*.json") if f.name != "latest.json"],
|
||||
reverse=True
|
||||
)[:limit]
|
||||
result_files = sorted([f for f in results_dir.glob("*.json") if f.name != "latest.json"], reverse=True)[:limit]
|
||||
|
||||
results = []
|
||||
for f in result_files:
|
||||
|
||||
+1
-1
@@ -32,7 +32,7 @@ packages = ["framework"]
|
||||
[tool.ruff]
|
||||
target-version = "py311"
|
||||
|
||||
line-length = 100
|
||||
line-length = 120
|
||||
|
||||
lint.select = [
|
||||
"B", # bugbear errors
|
||||
|
||||
+12
-14
@@ -14,11 +14,12 @@ from pathlib import Path
|
||||
|
||||
class Colors:
|
||||
"""ANSI color codes for terminal output."""
|
||||
GREEN = '\033[0;32m'
|
||||
YELLOW = '\033[1;33m'
|
||||
RED = '\033[0;31m'
|
||||
BLUE = '\033[0;34m'
|
||||
NC = '\033[0m' # No Color
|
||||
|
||||
GREEN = "\033[0;32m"
|
||||
YELLOW = "\033[1;33m"
|
||||
RED = "\033[0;31m"
|
||||
BLUE = "\033[0;34m"
|
||||
NC = "\033[0m" # No Color
|
||||
|
||||
|
||||
def print_step(message: str, color: str = Colors.YELLOW):
|
||||
@@ -58,10 +59,7 @@ def main():
|
||||
|
||||
# Step 1: Install framework package
|
||||
print_step("Step 1: Installing framework package...")
|
||||
if not run_command(
|
||||
[sys.executable, "-m", "pip", "install", "-e", "."],
|
||||
"Failed to install framework package"
|
||||
):
|
||||
if not run_command([sys.executable, "-m", "pip", "install", "-e", "."], "Failed to install framework package"):
|
||||
sys.exit(1)
|
||||
print_success("Framework package installed")
|
||||
print()
|
||||
@@ -70,7 +68,7 @@ def main():
|
||||
print_step("Step 2: Installing MCP dependencies...")
|
||||
if not run_command(
|
||||
[sys.executable, "-m", "pip", "install", "mcp", "fastmcp"],
|
||||
"Failed to install MCP dependencies"
|
||||
"Failed to install MCP dependencies",
|
||||
):
|
||||
sys.exit(1)
|
||||
print_success("MCP dependencies installed")
|
||||
@@ -95,12 +93,12 @@ def main():
|
||||
"agent-builder": {
|
||||
"command": "python",
|
||||
"args": ["-m", "framework.mcp.agent_builder_server"],
|
||||
"cwd": str(script_dir)
|
||||
"cwd": str(script_dir),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
with open(mcp_config_path, 'w') as f:
|
||||
with open(mcp_config_path, "w") as f:
|
||||
json.dump(config, f, indent=2)
|
||||
|
||||
print_success("Created .mcp.json")
|
||||
@@ -114,7 +112,7 @@ def main():
|
||||
[sys.executable, "-c", "from framework.mcp import agent_builder_server"],
|
||||
check=True,
|
||||
capture_output=True,
|
||||
text=True
|
||||
text=True,
|
||||
)
|
||||
print_success("MCP server module verified")
|
||||
except subprocess.CalledProcessError as e:
|
||||
@@ -143,7 +141,7 @@ def main():
|
||||
"agent-builder": {
|
||||
"command": "python",
|
||||
"args": ["-m", "framework.mcp.agent_builder_server"],
|
||||
"cwd": str(script_dir)
|
||||
"cwd": str(script_dir),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from framework import Runtime, BuilderQuery
|
||||
from framework import BuilderQuery, Runtime
|
||||
from framework.schemas.run import RunStatus
|
||||
|
||||
|
||||
|
||||
@@ -10,27 +10,28 @@ Tests cover:
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
|
||||
import pytest
|
||||
|
||||
from framework.graph.plan import (
|
||||
Plan,
|
||||
PlanStep,
|
||||
ActionSpec,
|
||||
ActionType,
|
||||
StepStatus,
|
||||
Judgment,
|
||||
JudgmentAction,
|
||||
EvaluationRule,
|
||||
PlanExecutionResult,
|
||||
ExecutionStatus,
|
||||
)
|
||||
from framework.graph.code_sandbox import (
|
||||
CodeSandbox,
|
||||
safe_exec,
|
||||
safe_eval,
|
||||
safe_exec,
|
||||
)
|
||||
from framework.graph.judge import HybridJudge, create_default_judge
|
||||
from framework.graph.goal import Goal, SuccessCriterion
|
||||
from framework.graph.judge import HybridJudge, create_default_judge
|
||||
from framework.graph.plan import (
|
||||
ActionSpec,
|
||||
ActionType,
|
||||
EvaluationRule,
|
||||
ExecutionStatus,
|
||||
Judgment,
|
||||
JudgmentAction,
|
||||
Plan,
|
||||
PlanExecutionResult,
|
||||
PlanStep,
|
||||
StepStatus,
|
||||
)
|
||||
|
||||
|
||||
class TestPlanDataStructures:
|
||||
@@ -216,12 +217,14 @@ class TestHybridJudge:
|
||||
def test_rule_based_accept(self):
|
||||
"""Test rule-based accept judgment."""
|
||||
judge = HybridJudge()
|
||||
judge.add_rule(EvaluationRule(
|
||||
id="success_check",
|
||||
description="Accept on success flag",
|
||||
condition="result.get('success') == True",
|
||||
action=JudgmentAction.ACCEPT,
|
||||
))
|
||||
judge.add_rule(
|
||||
EvaluationRule(
|
||||
id="success_check",
|
||||
description="Accept on success flag",
|
||||
condition="result.get('success') == True",
|
||||
action=JudgmentAction.ACCEPT,
|
||||
)
|
||||
)
|
||||
|
||||
step = PlanStep(
|
||||
id="test_step",
|
||||
@@ -238,9 +241,7 @@ class TestHybridJudge:
|
||||
)
|
||||
|
||||
# Use sync version for testing
|
||||
judgment = asyncio.run(
|
||||
judge.evaluate(step, {"success": True}, goal)
|
||||
)
|
||||
judgment = asyncio.run(judge.evaluate(step, {"success": True}, goal))
|
||||
|
||||
assert judgment.action == JudgmentAction.ACCEPT
|
||||
assert judgment.rule_matched == "success_check"
|
||||
@@ -248,13 +249,15 @@ class TestHybridJudge:
|
||||
def test_rule_based_retry(self):
|
||||
"""Test rule-based retry judgment."""
|
||||
judge = HybridJudge()
|
||||
judge.add_rule(EvaluationRule(
|
||||
id="timeout_retry",
|
||||
description="Retry on timeout",
|
||||
condition="result.get('error_type') == 'timeout'",
|
||||
action=JudgmentAction.RETRY,
|
||||
feedback_template="Timeout occurred, please retry",
|
||||
))
|
||||
judge.add_rule(
|
||||
EvaluationRule(
|
||||
id="timeout_retry",
|
||||
description="Retry on timeout",
|
||||
condition="result.get('error_type') == 'timeout'",
|
||||
action=JudgmentAction.RETRY,
|
||||
feedback_template="Timeout occurred, please retry",
|
||||
)
|
||||
)
|
||||
|
||||
step = PlanStep(
|
||||
id="test_step",
|
||||
@@ -270,9 +273,7 @@ class TestHybridJudge:
|
||||
],
|
||||
)
|
||||
|
||||
judgment = asyncio.run(
|
||||
judge.evaluate(step, {"error_type": "timeout"}, goal)
|
||||
)
|
||||
judgment = asyncio.run(judge.evaluate(step, {"error_type": "timeout"}, goal))
|
||||
|
||||
assert judgment.action == JudgmentAction.RETRY
|
||||
|
||||
@@ -281,22 +282,26 @@ class TestHybridJudge:
|
||||
judge = HybridJudge()
|
||||
|
||||
# Lower priority - would match
|
||||
judge.add_rule(EvaluationRule(
|
||||
id="low_priority",
|
||||
description="Low priority accept",
|
||||
condition="True",
|
||||
action=JudgmentAction.ACCEPT,
|
||||
priority=1,
|
||||
))
|
||||
judge.add_rule(
|
||||
EvaluationRule(
|
||||
id="low_priority",
|
||||
description="Low priority accept",
|
||||
condition="True",
|
||||
action=JudgmentAction.ACCEPT,
|
||||
priority=1,
|
||||
)
|
||||
)
|
||||
|
||||
# Higher priority - should match first
|
||||
judge.add_rule(EvaluationRule(
|
||||
id="high_priority",
|
||||
description="High priority escalate",
|
||||
condition="True",
|
||||
action=JudgmentAction.ESCALATE,
|
||||
priority=100,
|
||||
))
|
||||
judge.add_rule(
|
||||
EvaluationRule(
|
||||
id="high_priority",
|
||||
description="High priority escalate",
|
||||
condition="True",
|
||||
action=JudgmentAction.ESCALATE,
|
||||
priority=100,
|
||||
)
|
||||
)
|
||||
|
||||
step = PlanStep(
|
||||
id="test_step",
|
||||
@@ -312,9 +317,7 @@ class TestHybridJudge:
|
||||
],
|
||||
)
|
||||
|
||||
judgment = asyncio.run(
|
||||
judge.evaluate(step, {}, goal)
|
||||
)
|
||||
judgment = asyncio.run(judge.evaluate(step, {}, goal))
|
||||
|
||||
assert judgment.rule_matched == "high_priority"
|
||||
assert judgment.action == JudgmentAction.ESCALATE
|
||||
@@ -397,8 +400,8 @@ class TestFlexibleExecutorIntegration:
|
||||
|
||||
def test_executor_creation(self, tmp_path):
|
||||
"""Test creating a FlexibleGraphExecutor."""
|
||||
from framework.runtime.core import Runtime
|
||||
from framework.graph.flexible_executor import FlexibleGraphExecutor
|
||||
from framework.runtime.core import Runtime
|
||||
|
||||
runtime = Runtime(storage_path=tmp_path / "runtime")
|
||||
executor = FlexibleGraphExecutor(runtime=runtime)
|
||||
@@ -409,17 +412,19 @@ class TestFlexibleExecutorIntegration:
|
||||
|
||||
def test_executor_with_custom_judge(self, tmp_path):
|
||||
"""Test executor with custom judge."""
|
||||
from framework.runtime.core import Runtime
|
||||
from framework.graph.flexible_executor import FlexibleGraphExecutor
|
||||
from framework.runtime.core import Runtime
|
||||
|
||||
runtime = Runtime(storage_path=tmp_path / "runtime")
|
||||
custom_judge = HybridJudge()
|
||||
custom_judge.add_rule(EvaluationRule(
|
||||
id="custom_rule",
|
||||
description="Custom rule",
|
||||
condition="True",
|
||||
action=JudgmentAction.ACCEPT,
|
||||
))
|
||||
custom_judge.add_rule(
|
||||
EvaluationRule(
|
||||
id="custom_rule",
|
||||
description="Custom rule",
|
||||
condition="True",
|
||||
action=JudgmentAction.ACCEPT,
|
||||
)
|
||||
)
|
||||
|
||||
executor = FlexibleGraphExecutor(runtime=runtime, judge=custom_judge)
|
||||
|
||||
|
||||
@@ -10,11 +10,11 @@ For live tests (requires API keys):
|
||||
"""
|
||||
|
||||
import os
|
||||
from unittest.mock import patch, MagicMock
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
from framework.llm.litellm import LiteLLMProvider
|
||||
from framework.llm.anthropic import AnthropicProvider
|
||||
from framework.llm.provider import LLMProvider, Tool, ToolUse, ToolResult
|
||||
from framework.llm.litellm import LiteLLMProvider
|
||||
from framework.llm.provider import LLMProvider, Tool, ToolResult, ToolUse
|
||||
|
||||
|
||||
class TestLiteLLMProviderInit:
|
||||
@@ -41,11 +41,7 @@ class TestLiteLLMProviderInit:
|
||||
|
||||
def test_init_with_api_base(self):
|
||||
"""Test initialization with custom API base."""
|
||||
provider = LiteLLMProvider(
|
||||
model="gpt-4o-mini",
|
||||
api_key="my-key",
|
||||
api_base="https://my-proxy.com/v1"
|
||||
)
|
||||
provider = LiteLLMProvider(model="gpt-4o-mini", api_key="my-key", api_base="https://my-proxy.com/v1")
|
||||
assert provider.api_base == "https://my-proxy.com/v1"
|
||||
|
||||
def test_init_ollama_no_key_needed(self):
|
||||
@@ -73,9 +69,7 @@ class TestLiteLLMProviderComplete:
|
||||
mock_completion.return_value = mock_response
|
||||
|
||||
provider = LiteLLMProvider(model="gpt-4o-mini", api_key="test-key")
|
||||
result = provider.complete(
|
||||
messages=[{"role": "user", "content": "Hello"}]
|
||||
)
|
||||
result = provider.complete(messages=[{"role": "user", "content": "Hello"}])
|
||||
|
||||
assert result.content == "Hello! I'm an AI assistant."
|
||||
assert result.model == "gpt-4o-mini"
|
||||
@@ -102,10 +96,7 @@ class TestLiteLLMProviderComplete:
|
||||
mock_completion.return_value = mock_response
|
||||
|
||||
provider = LiteLLMProvider(model="gpt-4o-mini", api_key="test-key")
|
||||
provider.complete(
|
||||
messages=[{"role": "user", "content": "Hello"}],
|
||||
system="You are a helpful assistant."
|
||||
)
|
||||
provider.complete(messages=[{"role": "user", "content": "Hello"}], system="You are a helpful assistant.")
|
||||
|
||||
call_kwargs = mock_completion.call_args[1]
|
||||
messages = call_kwargs["messages"]
|
||||
@@ -131,18 +122,13 @@ class TestLiteLLMProviderComplete:
|
||||
name="get_weather",
|
||||
description="Get the weather for a location",
|
||||
parameters={
|
||||
"properties": {
|
||||
"location": {"type": "string", "description": "City name"}
|
||||
},
|
||||
"required": ["location"]
|
||||
}
|
||||
"properties": {"location": {"type": "string", "description": "City name"}},
|
||||
"required": ["location"],
|
||||
},
|
||||
)
|
||||
]
|
||||
|
||||
provider.complete(
|
||||
messages=[{"role": "user", "content": "What's the weather?"}],
|
||||
tools=tools
|
||||
)
|
||||
provider.complete(messages=[{"role": "user", "content": "What's the weather?"}], tools=tools)
|
||||
|
||||
call_kwargs = mock_completion.call_args[1]
|
||||
assert "tools" in call_kwargs
|
||||
@@ -187,22 +173,21 @@ class TestLiteLLMProviderToolUse:
|
||||
Tool(
|
||||
name="get_weather",
|
||||
description="Get the weather",
|
||||
parameters={"properties": {"location": {"type": "string"}}, "required": ["location"]}
|
||||
parameters={
|
||||
"properties": {"location": {"type": "string"}},
|
||||
"required": ["location"],
|
||||
},
|
||||
)
|
||||
]
|
||||
|
||||
def tool_executor(tool_use: ToolUse) -> ToolResult:
|
||||
return ToolResult(
|
||||
tool_use_id=tool_use.id,
|
||||
content="Sunny, 22C",
|
||||
is_error=False
|
||||
)
|
||||
return ToolResult(tool_use_id=tool_use.id, content="Sunny, 22C", is_error=False)
|
||||
|
||||
result = provider.complete_with_tools(
|
||||
messages=[{"role": "user", "content": "What's the weather in London?"}],
|
||||
system="You are a weather assistant.",
|
||||
tools=tools,
|
||||
tool_executor=tool_executor
|
||||
tool_executor=tool_executor,
|
||||
)
|
||||
|
||||
assert result.content == "The weather in London is sunny."
|
||||
@@ -222,11 +207,9 @@ class TestToolConversion:
|
||||
name="search",
|
||||
description="Search the web",
|
||||
parameters={
|
||||
"properties": {
|
||||
"query": {"type": "string", "description": "Search query"}
|
||||
},
|
||||
"required": ["query"]
|
||||
}
|
||||
"properties": {"query": {"type": "string", "description": "Search query"}},
|
||||
"required": ["query"],
|
||||
},
|
||||
)
|
||||
|
||||
result = provider._tool_to_openai_format(tool)
|
||||
@@ -280,7 +263,7 @@ class TestAnthropicProviderBackwardCompatibility:
|
||||
result = provider.complete(
|
||||
messages=[{"role": "user", "content": "Hello"}],
|
||||
system="You are helpful.",
|
||||
max_tokens=100
|
||||
max_tokens=100,
|
||||
)
|
||||
|
||||
assert result.content == "Hello from Claude!"
|
||||
@@ -313,7 +296,7 @@ class TestAnthropicProviderBackwardCompatibility:
|
||||
Tool(
|
||||
name="get_time",
|
||||
description="Get current time",
|
||||
parameters={"properties": {}, "required": []}
|
||||
parameters={"properties": {}, "required": []},
|
||||
)
|
||||
]
|
||||
|
||||
@@ -324,7 +307,7 @@ class TestAnthropicProviderBackwardCompatibility:
|
||||
messages=[{"role": "user", "content": "What time is it?"}],
|
||||
system="You are a time assistant.",
|
||||
tools=tools,
|
||||
tool_executor=tool_executor
|
||||
tool_executor=tool_executor,
|
||||
)
|
||||
|
||||
assert result.content == "The time is 3:00 PM."
|
||||
|
||||
@@ -7,12 +7,9 @@ import pytest
|
||||
|
||||
def _mcp_available() -> bool:
|
||||
"""Check if MCP dependencies are installed."""
|
||||
try:
|
||||
import mcp
|
||||
from mcp.server import FastMCP
|
||||
return True
|
||||
except ImportError:
|
||||
return False
|
||||
import importlib.util
|
||||
|
||||
return importlib.util.find_spec("mcp") is not None
|
||||
|
||||
|
||||
MCP_AVAILABLE = _mcp_available()
|
||||
@@ -28,6 +25,7 @@ class TestMCPDependencies:
|
||||
pytest.skip(MCP_SKIP_REASON)
|
||||
|
||||
import mcp
|
||||
|
||||
assert mcp is not None
|
||||
|
||||
def test_fastmcp_available(self):
|
||||
@@ -36,6 +34,7 @@ class TestMCPDependencies:
|
||||
pytest.skip(MCP_SKIP_REASON)
|
||||
|
||||
from mcp.server import FastMCP
|
||||
|
||||
assert FastMCP is not None
|
||||
|
||||
|
||||
@@ -48,6 +47,7 @@ class TestAgentBuilderServerModule:
|
||||
pytest.skip(MCP_SKIP_REASON)
|
||||
|
||||
import framework.mcp.agent_builder_server as module
|
||||
|
||||
assert module is not None
|
||||
|
||||
def test_mcp_object_exported(self):
|
||||
@@ -55,9 +55,10 @@ class TestAgentBuilderServerModule:
|
||||
if not MCP_AVAILABLE:
|
||||
pytest.skip(MCP_SKIP_REASON)
|
||||
|
||||
from framework.mcp.agent_builder_server import mcp
|
||||
from mcp.server import FastMCP
|
||||
|
||||
from framework.mcp.agent_builder_server import mcp
|
||||
|
||||
assert mcp is not None
|
||||
assert isinstance(mcp, FastMCP)
|
||||
|
||||
@@ -67,6 +68,7 @@ class TestAgentBuilderServerModule:
|
||||
pytest.skip(MCP_SKIP_REASON)
|
||||
|
||||
from framework.mcp.agent_builder_server import mcp
|
||||
|
||||
assert mcp.name == "agent-builder"
|
||||
|
||||
|
||||
@@ -79,6 +81,7 @@ class TestMCPPackageExports:
|
||||
pytest.skip(MCP_SKIP_REASON)
|
||||
|
||||
import framework.mcp
|
||||
|
||||
assert framework.mcp is not None
|
||||
|
||||
def test_agent_builder_server_exported(self):
|
||||
@@ -86,8 +89,9 @@ class TestMCPPackageExports:
|
||||
if not MCP_AVAILABLE:
|
||||
pytest.skip(MCP_SKIP_REASON)
|
||||
|
||||
from framework.mcp import agent_builder_server
|
||||
from mcp.server import FastMCP
|
||||
|
||||
from framework.mcp import agent_builder_server
|
||||
|
||||
assert agent_builder_server is not None
|
||||
assert isinstance(agent_builder_server, FastMCP)
|
||||
|
||||
@@ -7,8 +7,8 @@ Run with:
|
||||
|
||||
from unittest.mock import Mock, patch
|
||||
|
||||
from framework.llm.provider import LLMProvider
|
||||
from framework.llm.litellm import LiteLLMProvider
|
||||
from framework.llm.provider import LLMProvider
|
||||
from framework.runner.orchestrator import AgentOrchestrator
|
||||
|
||||
|
||||
@@ -17,7 +17,7 @@ class TestOrchestratorLLMInitialization:
|
||||
|
||||
def test_auto_creates_litellm_provider_when_no_llm_passed(self):
|
||||
"""Test that LiteLLMProvider is auto-created when no llm is passed."""
|
||||
with patch.object(LiteLLMProvider, '__init__', return_value=None) as mock_init:
|
||||
with patch.object(LiteLLMProvider, "__init__", return_value=None) as mock_init:
|
||||
orchestrator = AgentOrchestrator()
|
||||
|
||||
mock_init.assert_called_once_with(model="claude-haiku-4-5-20251001")
|
||||
@@ -25,14 +25,14 @@ class TestOrchestratorLLMInitialization:
|
||||
|
||||
def test_uses_custom_model_parameter(self):
|
||||
"""Test that custom model parameter is passed to LiteLLMProvider."""
|
||||
with patch.object(LiteLLMProvider, '__init__', return_value=None) as mock_init:
|
||||
with patch.object(LiteLLMProvider, "__init__", return_value=None) as mock_init:
|
||||
AgentOrchestrator(model="gpt-4o")
|
||||
|
||||
mock_init.assert_called_once_with(model="gpt-4o")
|
||||
|
||||
def test_supports_openai_model_names(self):
|
||||
"""Test that OpenAI model names are supported."""
|
||||
with patch.object(LiteLLMProvider, '__init__', return_value=None) as mock_init:
|
||||
with patch.object(LiteLLMProvider, "__init__", return_value=None) as mock_init:
|
||||
orchestrator = AgentOrchestrator(model="gpt-4o-mini")
|
||||
|
||||
mock_init.assert_called_once_with(model="gpt-4o-mini")
|
||||
@@ -40,7 +40,7 @@ class TestOrchestratorLLMInitialization:
|
||||
|
||||
def test_supports_anthropic_model_names(self):
|
||||
"""Test that Anthropic model names are supported."""
|
||||
with patch.object(LiteLLMProvider, '__init__', return_value=None) as mock_init:
|
||||
with patch.object(LiteLLMProvider, "__init__", return_value=None) as mock_init:
|
||||
orchestrator = AgentOrchestrator(model="claude-3-haiku-20240307")
|
||||
|
||||
mock_init.assert_called_once_with(model="claude-3-haiku-20240307")
|
||||
@@ -50,7 +50,7 @@ class TestOrchestratorLLMInitialization:
|
||||
"""Test that auto-creation is skipped when llm is explicitly passed."""
|
||||
mock_llm = Mock(spec=LLMProvider)
|
||||
|
||||
with patch.object(LiteLLMProvider, '__init__', return_value=None) as mock_init:
|
||||
with patch.object(LiteLLMProvider, "__init__", return_value=None) as mock_init:
|
||||
orchestrator = AgentOrchestrator(llm=mock_llm)
|
||||
|
||||
mock_init.assert_not_called()
|
||||
@@ -58,7 +58,7 @@ class TestOrchestratorLLMInitialization:
|
||||
|
||||
def test_model_attribute_stored_correctly(self):
|
||||
"""Test that _model attribute is stored correctly."""
|
||||
with patch.object(LiteLLMProvider, '__init__', return_value=None):
|
||||
with patch.object(LiteLLMProvider, "__init__", return_value=None):
|
||||
orchestrator = AgentOrchestrator(model="gemini/gemini-1.5-flash")
|
||||
|
||||
assert orchestrator._model == "gemini/gemini-1.5-flash"
|
||||
@@ -78,5 +78,5 @@ class TestOrchestratorLLMProviderType:
|
||||
orchestrator = AgentOrchestrator()
|
||||
|
||||
assert isinstance(orchestrator._llm, LLMProvider)
|
||||
assert hasattr(orchestrator._llm, 'complete')
|
||||
assert hasattr(orchestrator._llm, 'complete_with_tools')
|
||||
assert hasattr(orchestrator._llm, "complete")
|
||||
assert hasattr(orchestrator._llm, "complete_with_tools")
|
||||
|
||||
+25
-21
@@ -1,16 +1,18 @@
|
||||
"""Tests for plan.py - Plan enums and Pydantic models."""
|
||||
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
from framework.graph.plan import (
|
||||
ActionType,
|
||||
StepStatus,
|
||||
ApprovalDecision,
|
||||
JudgmentAction,
|
||||
ExecutionStatus,
|
||||
ActionSpec,
|
||||
PlanStep,
|
||||
ActionType,
|
||||
ApprovalDecision,
|
||||
ExecutionStatus,
|
||||
JudgmentAction,
|
||||
Plan,
|
||||
PlanStep,
|
||||
StepStatus,
|
||||
)
|
||||
|
||||
|
||||
@@ -210,21 +212,23 @@ class TestPlanFromJson:
|
||||
|
||||
def test_plan_from_json_string(self):
|
||||
"""Parse Plan from JSON string."""
|
||||
json_str = json.dumps({
|
||||
"id": "plan_1",
|
||||
"goal_id": "goal_1",
|
||||
"description": "Test plan",
|
||||
"steps": [
|
||||
{
|
||||
"id": "step_1",
|
||||
"description": "First step",
|
||||
"action": {
|
||||
"action_type": "function",
|
||||
"function_name": "do_something",
|
||||
},
|
||||
}
|
||||
],
|
||||
})
|
||||
json_str = json.dumps(
|
||||
{
|
||||
"id": "plan_1",
|
||||
"goal_id": "goal_1",
|
||||
"description": "Test plan",
|
||||
"steps": [
|
||||
{
|
||||
"id": "step_1",
|
||||
"description": "First step",
|
||||
"action": {
|
||||
"action_type": "function",
|
||||
"function_name": "do_something",
|
||||
},
|
||||
}
|
||||
],
|
||||
}
|
||||
)
|
||||
|
||||
plan = Plan.from_json(json_str)
|
||||
|
||||
|
+36
-28
@@ -1,12 +1,16 @@

"""
Test the run module.
"""

from datetime import datetime

from framework.schemas.run import RunMetrics, Run, RunStatus, RunSummary
from framework.schemas.decision import Decision, Outcome, Option

from framework.schemas.decision import Decision, Option, Outcome
from framework.schemas.run import Run, RunMetrics, RunStatus, RunSummary


class TestRuntimeMetrics:
    """Test the RunMetrics class."""

    def test_success_rate(self):
        metrics = RunMetrics(
            total_decisions=10,
@@ -14,7 +18,7 @@ class TestRuntimeMetrics:
            failed_decisions=2,
        )
        assert metrics.success_rate == 0.8

    def test_success_rate_zero_decisions(self):
        metrics = RunMetrics(
            total_decisions=0,
@@ -23,8 +27,10 @@ class TestRuntimeMetrics:
        )
        assert metrics.success_rate == 0.0


class TestRun:
    """Test the Run class."""

    def test_duration_ms(self):
        run = Run(
            id="test_run",
@@ -87,7 +93,7 @@ class TestRun:
        assert run.metrics.failed_decisions == 0
        assert run.metrics.total_tokens == 10
        assert run.metrics.total_latency_ms == 100

    def test_add_problem(self):
        run = Run(
            id="test_run",
@@ -95,16 +101,16 @@ class TestRun:
            started_at=datetime.now(),
            completed_at=datetime.now(),
        )
        problem_id = run.add_problem(
            "Test problem",
            "Test problem description",
            "test_decision",
            "Test root cause",
        problem_id = run.add_problem(
            "Test problem",
            "Test problem description",
            "test_decision",
            "Test root cause",
            "Test suggested fix",
        )

        )

        assert problem_id == f"prob_{len(run.problems) - 1}"

        problem = run.problems[0]
        assert problem.id == f"prob_{len(run.problems) - 1}"
        assert problem.severity == "Test problem"
@@ -112,7 +118,7 @@ class TestRun:
        assert problem.decision_id == "test_decision"
        assert problem.root_cause == "Test root cause"
        assert problem.suggested_fix == "Test suggested fix"

    def test_complete(self):
        run = Run(
            id="test_run",
@@ -124,8 +130,10 @@ class TestRun:
        assert run.status == RunStatus.COMPLETED
        assert run.narrative == "Test narrative"


class TestRunSummary:
    """Test the RunSummary class."""

    def test_from_run_basic(self):
        run = Run(
            id="test_run",
@@ -134,9 +142,9 @@ class TestRunSummary:
            completed_at=datetime.now(),
        )
        run.complete(RunStatus.COMPLETED, "Test narrative")

        summary = RunSummary.from_run(run)

        assert summary.run_id == "test_run"
        assert summary.goal_id == "test_goal"
        assert summary.status == RunStatus.COMPLETED
@@ -144,7 +152,7 @@ class TestRunSummary:
        assert summary.success_rate == 0.0
        assert summary.problem_count == 0
        assert summary.narrative == "Test narrative"

    def test_from_run_with_decisions(self):
        run = Run(
            id="test_run",
@@ -152,7 +160,7 @@ class TestRunSummary:
            started_at=datetime.now(),
            completed_at=datetime.now(),
        )

        successful_decision = Decision(
            id="decision_1",
            timestamp=datetime.now(),
@@ -173,7 +181,7 @@ class TestRunSummary:
            latency_ms=100,
            summary="Successfully greeted user",
        )

        failed_decision = Decision(
            id="decision_2",
            timestamp=datetime.now(),
@@ -194,21 +202,21 @@ class TestRunSummary:
            tokens_used=5,
            latency_ms=50,
        )

        run.add_decision(successful_decision)
        run.record_outcome("decision_1", successful_outcome)
        run.add_decision(failed_decision)
        run.record_outcome("decision_2", failed_outcome)
        run.complete(RunStatus.COMPLETED, "Test narrative")

        summary = RunSummary.from_run(run)

        assert summary.decision_count == 2
        assert summary.success_rate == 0.5
        assert len(summary.key_decisions) == 1
        assert len(summary.successes) == 1
        assert summary.successes[0] == "Successfully greeted user"

    def test_from_run_with_problems(self):
        run = Run(
            id="test_run",
@@ -216,7 +224,7 @@ class TestRunSummary:
            started_at=datetime.now(),
            completed_at=datetime.now(),
        )

        run.add_problem(
            severity="critical",
            description="API timeout",
@@ -224,7 +232,7 @@ class TestRunSummary:
            root_cause="Network issue",
            suggested_fix="Add retry logic",
        )

        run.add_problem(
            severity="warning",
            description="High latency",
@@ -232,13 +240,13 @@ class TestRunSummary:
            root_cause="Large payload",
            suggested_fix="Optimize data size",
        )

        run.complete(RunStatus.COMPLETED, "Test narrative")

        summary = RunSummary.from_run(run)

        assert summary.problem_count == 2
        assert len(summary.critical_problems) == 1
        assert len(summary.warnings) == 1
        assert summary.critical_problems[0] == "API timeout"
        assert summary.warnings[0] == "High latency"
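The RunMetrics tests above fix the success-rate contract: 8 successes out of 10 decisions yields 0.8, and a run with zero decisions reports 0.0 instead of raising ZeroDivisionError. A minimal sketch of that behavior, assuming the truncated first test sets successful_decisions=8:

from dataclasses import dataclass


@dataclass
class RunMetrics:
    total_decisions: int = 0
    successful_decisions: int = 0
    failed_decisions: int = 0

    @property
    def success_rate(self) -> float:
        # Guard the empty-run case explicitly, as the second test requires.
        if self.total_decisions == 0:
            return 0.0
        return self.successful_decisions / self.total_decisions


assert RunMetrics(total_decisions=10, successful_decisions=8, failed_decisions=2).success_rate == 0.8
assert RunMetrics().success_rate == 0.0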
@@ -1,8 +1,9 @@
"""Tests for the Runtime class - the agent's interface to record decisions."""

import pytest
from pathlib import Path

import pytest

from framework import Runtime
from framework.schemas.decision import DecisionType

@@ -9,25 +9,25 @@ Tests cover:

import pytest

from framework.testing.test_case import (
    Test,
    TestType,
    ApprovalStatus,
)
from framework.testing.test_result import (
    TestResult,
    TestSuiteResult,
    ErrorCategory,
)
from framework.testing.test_storage import TestStorage
from framework.testing.categorizer import ErrorCategorizer
from framework.testing.debug_tool import DebugTool

from framework.testing.test_case import (
    ApprovalStatus,
    Test,
    TestType,
)
from framework.testing.test_result import (
    ErrorCategory,
    TestResult,
    TestSuiteResult,
)
from framework.testing.test_storage import TestStorage

# ============================================================================
# Test Schema Tests
# ============================================================================


class TestTestCaseSchema:
    """Tests for Test schema."""

@@ -189,8 +189,12 @@ class TestTestSuiteResult:
        results = [
            TestResult(test_id="t1", passed=True, duration_ms=100),
            TestResult(test_id="t2", passed=True, duration_ms=50),
            TestResult(test_id="t3", passed=False, duration_ms=75,
                       error_category=ErrorCategory.IMPLEMENTATION_ERROR),
            TestResult(
                test_id="t3",
                passed=False,
                duration_ms=75,
                error_category=ErrorCategory.IMPLEMENTATION_ERROR,
            ),
        ]

        suite = TestSuiteResult(
@@ -203,18 +207,30 @@ class TestTestSuiteResult:
        )

        assert not suite.all_passed
        assert suite.pass_rate == pytest.approx(2/3)
        assert suite.pass_rate == pytest.approx(2 / 3)
        assert len(suite.get_failed_results()) == 1

    def test_get_results_by_category(self):
        """Test filtering results by error category."""
        results = [
            TestResult(test_id="t1", passed=False, duration_ms=100,
                       error_category=ErrorCategory.LOGIC_ERROR),
            TestResult(test_id="t2", passed=False, duration_ms=50,
                       error_category=ErrorCategory.IMPLEMENTATION_ERROR),
            TestResult(test_id="t3", passed=False, duration_ms=75,
                       error_category=ErrorCategory.IMPLEMENTATION_ERROR),
            TestResult(
                test_id="t1",
                passed=False,
                duration_ms=100,
                error_category=ErrorCategory.LOGIC_ERROR,
            ),
            TestResult(
                test_id="t2",
                passed=False,
                duration_ms=50,
                error_category=ErrorCategory.IMPLEMENTATION_ERROR,
            ),
            TestResult(
                test_id="t3",
                passed=False,
                duration_ms=75,
                error_category=ErrorCategory.IMPLEMENTATION_ERROR,
            ),
        ]

        suite = TestSuiteResult(
@@ -233,6 +249,7 @@ class TestTestSuiteResult:
# Storage Tests
# ============================================================================


class TestTestStorage:
    """Tests for TestStorage."""

@@ -389,6 +406,7 @@ class TestTestStorage:
# Error Categorizer Tests
# ============================================================================


class TestErrorCategorizer:
    """Tests for ErrorCategorizer."""

@@ -463,6 +481,7 @@ class TestErrorCategorizer:
# Debug Tool Tests
# ============================================================================


class TestDebugTool:
    """Tests for DebugTool."""
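Besides the argument reformatting, these suite tests pin down the aggregate accessors: pass_rate is passed-over-total, get_failed_results returns the failures, and get_results_by_category filters on ErrorCategory. A sketch of those three under assumed field names (the real TestSuiteResult takes more constructor arguments than the hunks show):

from dataclasses import dataclass, field
from enum import Enum


class ErrorCategory(Enum):
    LOGIC_ERROR = "logic_error"
    IMPLEMENTATION_ERROR = "implementation_error"


@dataclass
class TestResult:
    test_id: str
    passed: bool
    duration_ms: int
    error_category: ErrorCategory | None = None


@dataclass
class TestSuiteResult:
    results: list = field(default_factory=list)

    @property
    def pass_rate(self) -> float:
        # Fraction of results that passed; an empty suite rates 0.0.
        if not self.results:
            return 0.0
        return sum(r.passed for r in self.results) / len(self.results)

    def get_failed_results(self) -> list:
        return [r for r in self.results if not r.passed]

    def get_results_by_category(self, category: ErrorCategory) -> list:
        return [r for r in self.results if r.error_category is category]


suite = TestSuiteResult(
    results=[
        TestResult("t1", True, 100),
        TestResult("t2", True, 50),
        TestResult("t3", False, 75, ErrorCategory.IMPLEMENTATION_ERROR),
    ]
)
assert abs(suite.pass_rate - 2 / 3) < 1e-9
assert len(suite.get_failed_results()) == 1
assert len(suite.get_results_by_category(ErrorCategory.IMPLEMENTATION_ERROR)) == 1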
+15
-20
@@ -12,11 +12,11 @@ from pathlib import Path


class Colors:
    GREEN = '\033[0;32m'
    YELLOW = '\033[1;33m'
    RED = '\033[0;31m'
    BLUE = '\033[0;34m'
    NC = '\033[0m'
    GREEN = "\033[0;32m"
    YELLOW = "\033[1;33m"
    RED = "\033[0;31m"
    BLUE = "\033[0;34m"
    NC = "\033[0m"


def check(description: str) -> bool:
@@ -55,7 +55,7 @@ def main():
        [sys.executable, "-c", "import framework; print(framework.__file__)"],
        capture_output=True,
        text=True,
        check=True
        check=True,
    )
    framework_path = result.stdout.strip()
    success(f"installed at {framework_path}")
@@ -69,11 +69,7 @@ def main():
    missing_deps = []
    for dep in ["mcp", "fastmcp"]:
        try:
            subprocess.run(
                [sys.executable, "-c", f"import {dep}"],
                capture_output=True,
                check=True
            )
            subprocess.run([sys.executable, "-c", f"import {dep}"], capture_output=True, check=True)
        except subprocess.CalledProcessError:
            missing_deps.append(dep)

@@ -91,7 +87,7 @@ def main():
        [sys.executable, "-c", "from framework.mcp import agent_builder_server"],
        capture_output=True,
        text=True,
        check=True
        check=True,
    )
    success("loads successfully")
except subprocess.CalledProcessError as e:
@@ -137,11 +133,7 @@ def main():
    failed_modules = []
    for module in modules_to_check:
        try:
            subprocess.run(
                [sys.executable, "-c", f"import {module}"],
                capture_output=True,
                check=True
            )
            subprocess.run([sys.executable, "-c", f"import {module}"], capture_output=True, check=True)
        except subprocess.CalledProcessError:
            failed_modules.append(module)

@@ -156,12 +148,15 @@ def main():
    try:
        # Try to import and instantiate the MCP server
        result = subprocess.run(
            [sys.executable, "-c",
             "from framework.mcp.agent_builder_server import mcp; print('OK')"],
            [
                sys.executable,
                "-c",
                "from framework.mcp.agent_builder_server import mcp; print('OK')",
            ],
            capture_output=True,
            text=True,
            check=True,
            timeout=5
            timeout=5,
        )
        if "OK" in result.stdout:
            success("server can start")
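The subprocess.run calls this script collapses to one line all share a pattern worth naming: probe each import in a fresh interpreter so a broken module cannot crash the verifier itself. A sketch of that pattern as a reusable helper; the function name and the final list comprehension are illustrative, not part of the script:

import subprocess
import sys


def importable(module: str, timeout: float = 5.0) -> bool:
    # Run the import in a child interpreter; a non-zero exit or a hang
    # is reported as a failure instead of propagating into this process.
    try:
        subprocess.run(
            [sys.executable, "-c", f"import {module}"],
            capture_output=True,
            check=True,
            timeout=timeout,
        )
        return True
    except (subprocess.CalledProcessError, subprocess.TimeoutExpired):
        return False


missing_deps = [dep for dep in ["mcp", "fastmcp"] if not importable(dep)]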
@@ -31,7 +31,7 @@ class TestGetSecurePath:
        """Session directory is created if it doesn't exist."""
        from aden_tools.tools.file_system_toolkits.security import get_secure_path

        result = get_secure_path("file.txt", **ids)
        get_secure_path("file.txt", **ids)

        session_dir = self.workspaces_dir / "test-workspace" / "test-agent" / "test-session"
        assert session_dir.exists()