fix: worker tab groups

fix: parallel execution
feat: new queen phases
2026-04-17 12:34:38 -07:00 · 2026-04-17 11:20:06 -07:00 · 2026-04-17 06:19:15 -07:00 · 2026-04-17 04:47:51 -07:00 · 2026-04-17 04:12:35 -07:00 · 2026-04-17 04:06:59 -07:00
1286 changed files with 313565 additions and 48104 deletions
@@ -0,0 +1,81 @@
+{
+  "permissions": {
+    "allow": [
+      "Bash(grep -n \"_is_context_too_large_error\" core/framework/agent_loop/agent_loop.py core/framework/agent_loop/internals/*.py)",
+      "Read(//^class/ {cls=$3} /def test_/**)",
+      "Read(//^    @pytest.mark.asyncio/{getline n; print NR\": \"n} /^    def test_/**)",
+      "Bash(python3)",
+      "Bash(grep -nE 'Tool\\\\\\(\\\\s*$|name=\"[a-z_]+\",' core/framework/tools/queen_lifecycle_tools.py)",
+      "Bash(awk -F'\"' '{print $2}')",
+      "Bash(grep -n \"create_colony\\\\|colony-spawn\\\\|colony_spawn\" /home/timothy/aden/hive/core/framework/agents/queen/nodes/__init__.py /home/timothy/aden/hive/core/framework/tools/*.py)",
+      "Bash(git stash:*)",
+      "Bash(python3 -c \"import sys,json; d=json.loads\\(sys.stdin.read\\(\\)\\); print\\('keys:', list\\(d.keys\\(\\)\\)[:10]\\)\")",
+      "Bash(python3 -c ':*)",
+      "Bash(uv run:*)",
+      "Read(//tmp/**)",
+      "Bash(grep -n \"useColony\\\\|const { queens, queenProfiles\" /home/timothy/aden/hive/core/frontend/src/pages/queen-dm.tsx)",
+      "Bash(awk 'NR==385,/\\\\}, \\\\[/' /home/timothy/aden/hive/core/frontend/src/pages/queen-dm.tsx)",
+      "Bash(xargs -I{} sh -c 'if ! grep -q \"^import base64\\\\|^from base64\" \"{}\"; then echo \"MISSING: {}\"; fi')",
+      "Bash(find /home/timothy/aden/hive/core/framework -name \"*.py\" -type f -exec grep -l \"FileConversationStore\\\\|class.*ConversationStore\" {} \\\\;)",
+      "Bash(find /home/timothy/aden/hive/core/framework -name \"*.py\" -exec grep -l \"run_parallel_workers\\\\|create_colony\" {} \\\\;)",
+      "Bash(awk '/^    async def execute\\\\\\(self, ctx: AgentContext\\\\\\)/,/^    async def [a-z_]+/ {print NR\": \"$0}' /home/timothy/aden/hive/core/framework/agent_loop/agent_loop.py)",
+      "Bash(grep -r \"max_concurrent_workers\\\\|max_depth\\\\|recursion\\\\|spawn.*bomb\" /home/timothy/aden/hive/core/framework/host/*.py)",
+      "Bash(wc -l /home/timothy/aden/hive/tools/src/gcu/browser/*.py /home/timothy/aden/hive/tools/src/gcu/browser/tools/*.py)",
+      "Bash(file /tmp/gcu_verify/*.png)",
+      "Bash(ps -eo pid,cmd)",
+      "Bash(ps -o pid,lstart,cmd -p 746640)",
+      "Bash(kill 746636)",
+      "Bash(ps -eo pid,lstart,cmd)",
+      "Bash(grep -E \"^d|\\\\.py$\")",
+      "Bash(grep -E \"\\\\.\\(ts|tsx\\)$\")",
+      "Bash(xargs cat:*)",
+      "Bash(find /home/timothy/aden/hive -path \"*/.venv\" -prune -o -name \"*.py\" -type f -exec grep -l \"frontend\\\\|UI\\\\|terminal\\\\|interactive\\\\|TUI\" {} \\\\;)",
+      "Bash(wc -l /home/timothy/.hive/backup/*/SKILL.md)",
+      "Bash(awk -F'::' '{print $1}')",
+      "Bash(wait)",
+      "Bash(pkill -f \"pytest.*test_event_loop_node\")",
+      "Bash(pkill -f \"pytest.*TestToolConcurrency\")",
+      "Bash(grep -n \"def.*discover\\\\|/api/agents\\\\|agents_discover\" /home/timothy/aden/hive/core/framework/server/*.py)",
+      "Bash(bun run:*)",
+      "Bash(npx eslint:*)",
+      "Bash(npm run:*)",
+      "Bash(npm test:*)",
+      "Bash(grep -n \"PIL\\\\|Image\\\\|to_thread\\\\|run_in_executor\" /home/timothy/aden/hive/tools/src/gcu/browser/*.py /home/timothy/aden/hive/tools/src/gcu/browser/tools/*.py)",
+      "WebFetch(domain:docs.litellm.ai)",
+      "Bash(cat /home/timothy/aden/hive/.venv/lib/python3.11/site-packages/litellm-*.dist-info/METADATA)",
+      "Bash(find \"/home/timothy/.hive/agents/queens/queen_brand_design/sessions/session_20260415_100751_d49f4c28/\" -type f -name \"*.json*\" -exec grep -l \"协日\" {} \\\\;)",
+      "Bash(grep -v ':0$')",
+      "Bash(curl -s -m 2 http://127.0.0.1:4002/sse -o /dev/null -w 'status=%{http_code} time=%{time_total}s\\\\n')",
+      "mcp__gcu-tools__browser_status",
+      "mcp__gcu-tools__browser_start",
+      "mcp__gcu-tools__browser_navigate",
+      "mcp__gcu-tools__browser_evaluate",
+      "mcp__gcu-tools__browser_screenshot",
+      "mcp__gcu-tools__browser_open",
+      "mcp__gcu-tools__browser_click_coordinate",
+      "mcp__gcu-tools__browser_get_rect",
+      "mcp__gcu-tools__browser_type_focused",
+      "mcp__gcu-tools__browser_wait",
+      "Bash(python3 -c ' *)",
+      "Bash(python3 scripts/debug_queen_prompt.py independent)"
+    ],
+    "additionalDirectories": [
+      "/home/timothy/.hive/skills/writing-hive-skills",
+      "/tmp",
+      "/home/timothy/.hive/skills"
+    ]
+  },
+  "hooks": {
+    "PostToolUse": [
+      {
+        "matcher": "Edit|Write|NotebookEdit",
+        "hooks": [
+          {
+            "type": "command",
+            "command": "ruff check --fix \"$CLAUDE_FILE_PATH\" 2>/dev/null; ruff format \"$CLAUDE_FILE_PATH\" 2>/dev/null; true"
+          }
+        ]
+      }
+    ]
+  }
+}
@@ -1,19 +0,0 @@
-{
-  "permissions": {
-    "allow": [
-      "Bash(npm install:*)",
-      "Bash(npm test:*)",
-      "Skill(building-agents-construction)",
-      "Skill(building-agents-construction:*)",
-      "Bash(PYTHONPATH=core:exports pytest:*)",
-      "mcp__agent-builder__create_session",
-      "mcp__agent-builder__get_session_status",
-      "mcp__agent-builder__set_goal",
-      "mcp__agent-builder__list_mcp_servers",
-      "mcp__agent-builder__test_node",
-      "mcp__agent-builder__add_node",
-      "mcp__agent-builder__add_edge",
-      "mcp__agent-builder__validate_graph"
-    ]
-  }
-}
@@ -0,0 +1,16 @@
+{
+  "permissions": {
+    "allow": [
+      "Bash(git status:*)",
+      "Bash(gh run view:*)",
+      "Bash(uv run:*)",
+      "Bash(env:*)",
+      "Bash(python -m py_compile:*)",
+      "Bash(python -m pytest:*)",
+      "Bash(source:*)",
+      "Bash(find:*)",
+      "Bash(PYTHONPATH=core:exports:tools/src uv run pytest:*)"
+    ]
+  },
+  "enabledMcpjsonServers": ["tools"]
+}
@@ -1,458 +0,0 @@
---
-name: agent-workflow
-description: Complete workflow for building, implementing, and testing goal-driven agents. Orchestrates building-agents-* and testing-agent skills. Use when starting a new agent project, unsure which skill to use, or need end-to-end guidance.
-license: Apache-2.0
-metadata:
-  author: hive
-  version: "2.0"
-  type: workflow-orchestrator
-  orchestrates:
-    - building-agents-core
-    - building-agents-construction
-    - building-agents-patterns
-    - testing-agent
---
-
-# Agent Development Workflow
-
-Complete Standard Operating Procedure (SOP) for building production-ready goal-driven agents.
-
-## Overview
-
-This workflow orchestrates specialized skills to take you from initial concept to production-ready agent:
-
-1. **Understand Concepts** (5-10 min) → `/building-agents-core` (optional)
-2. **Build Structure** (15-30 min) → `/building-agents-construction`
-3. **Optimize Design** (10-15 min) → `/building-agents-patterns` (optional)
-4. **Test & Validate** (20-40 min) → `/testing-agent`
-
-## When to Use This Workflow
-
-Use this meta-skill when:
- Starting a new agent from scratch
- Unclear which skill to use first
- Need end-to-end guidance for agent development
- Want consistent, repeatable agent builds
-
-**Skip this workflow** if:
- You only need to test an existing agent → use `/testing-agent` directly
- You know exactly which phase you're in → use specific skill directly
-
-## Quick Decision Tree
-
-```
-"Need to understand agent concepts" → building-agents-core
-"Build a new agent" → building-agents-construction
-"Optimize my agent design" → building-agents-patterns
-"Test my agent" → testing-agent
-"Not sure what I need" → Read phases below, then decide
-"Agent has structure but needs implementation" → See agent directory STATUS.md
-```
-
-## Phase 0: Understand Concepts (Optional)
-
-**Duration**: 5-10 minutes
-**Skill**: `/building-agents-core`
-**Input**: Questions about agent architecture
-
-### When to Use
-
- First time building an agent
- Need to understand node types, edges, goals
- Want to validate tool availability
- Learning about pause/resume architecture
-
-### What This Phase Provides
-
- Architecture overview (Python packages, not JSON)
- Core concepts (Goal, Node, Edge, Pause/Resume)
- Tool discovery and validation procedures
- Workflow overview
-
-**Skip this phase** if you already understand agent fundamentals.
-
-## Phase 1: Build Agent Structure
-
-**Duration**: 15-30 minutes
-**Skill**: `/building-agents-construction`
-**Input**: User requirements ("Build an agent that...")
-
-### What This Phase Does
-
-Creates the complete agent architecture:
- Package structure (`exports/agent_name/`)
- Goal with success criteria and constraints
- Workflow graph (nodes and edges)
- Node specifications
- CLI interface
- Documentation
-
-### Process
-
-1. **Create package** - Directory structure with skeleton files
-2. **Define goal** - Success criteria and constraints written to agent.py
-3. **Design nodes** - Each node approved and written incrementally
-4. **Connect edges** - Workflow graph with conditional routing
-5. **Finalize** - Agent class, exports, and documentation
-
-### Outputs
-
- ✅ `exports/agent_name/` package created
- ✅ Goal defined in agent.py
- ✅ 5-10 nodes specified in nodes/__init__.py
- ✅ 8-15 edges connecting workflow
- ✅ Validated structure (passes `python -m agent_name validate`)
- ✅ README.md with usage instructions
- ✅ CLI commands (info, validate, run, shell)
-
-### Success Criteria
-
-You're ready for Phase 2 when:
- Agent structure validates without errors
- All nodes and edges are defined
- CLI commands work (info, validate)
- You see: "Agent complete: exports/agent_name/"
-
-### Common Outputs
-
-The building-agents-construction skill produces:
-```
-exports/agent_name/
-├── __init__.py          (package exports)
-├── __main__.py          (CLI interface)
-├── agent.py             (goal, graph, agent class)
-├── nodes/__init__.py    (node specifications)
-├── config.py            (configuration)
-├── implementations.py   (may be created for Python functions)
-└── README.md            (documentation)
-```
-
-### Next Steps
-
-**If structure complete and validated:**
-→ Check `exports/agent_name/STATUS.md` or `IMPLEMENTATION_GUIDE.md`
-→ These files explain implementation options
-→ You may need to add Python functions or MCP tools (not covered by current skills)
-
-**If want to optimize design:**
-→ Proceed to Phase 1.5 (building-agents-patterns)
-
-**If ready to test:**
-→ Proceed to Phase 2
-
-## Phase 1.5: Optimize Design (Optional)
-
-**Duration**: 10-15 minutes
-**Skill**: `/building-agents-patterns`
-**Input**: Completed agent structure
-
-### When to Use
-
- Want to add pause/resume functionality
- Need error handling patterns
- Want to optimize performance
- Need examples of complex routing
- Want best practices guidance
-
-### What This Phase Provides
-
- Practical examples and patterns
- Pause/resume architecture
- Error handling strategies
- Anti-patterns to avoid
- Performance optimization techniques
-
-**Skip this phase** if your agent design is straightforward.
-
-## Phase 2: Test & Validate
-
-**Duration**: 20-40 minutes
-**Skill**: `/testing-agent`
-**Input**: Working agent from Phase 1
-
-### What This Phase Does
-
-Creates comprehensive test suite:
- Constraint tests (verify hard requirements)
- Success criteria tests (measure goal achievement)
- Edge case tests (handle failures gracefully)
- Integration tests (end-to-end workflows)
-
-### Process
-
-1. **Analyze agent** - Read goal, constraints, success criteria
-2. **Generate tests** - Create pytest files in `exports/agent_name/tests/`
-3. **User approval** - Review and approve each test
-4. **Run evaluation** - Execute tests and collect results
-5. **Debug failures** - Identify and fix issues
-6. **Iterate** - Repeat until all tests pass
-
-### Outputs
-
- ✅ Test files in `exports/agent_name/tests/`
- ✅ Test report with pass/fail metrics
- ✅ Coverage of all success criteria
- ✅ Coverage of all constraints
- ✅ Edge case handling verified
-
-### Success Criteria
-
-You're done when:
- All tests pass
- All success criteria validated
- All constraints verified
- Agent handles edge cases
- Test coverage is comprehensive
-
-### Next Steps
-
-**Agent ready for:**
- Production deployment
- Integration into larger systems
- Documentation and handoff
- Continuous monitoring
-
-## Phase Transitions
-
-### From Phase 1 to Phase 2
-
-**Trigger signals:**
- "Agent complete: exports/..."
- Structure validation passes
- README indicates implementation complete
-
-**Before proceeding:**
- Verify agent can be imported: `from exports.agent_name import default_agent`
- Check if implementation is needed (see STATUS.md or IMPLEMENTATION_GUIDE.md)
- Confirm agent executes without import errors
-
-### Skipping Phases
-
-**When to skip Phase 1:**
- Agent structure already exists
- Only need to add tests
- Modifying existing agent
-
-**When to skip Phase 2:**
- Prototyping or exploring
- Agent not production-bound
- Manual testing sufficient
-
-## Common Patterns
-
-### Pattern 1: Complete New Build (Simple)
-
-```
-User: "Build an agent that monitors files"
-→ Use /building-agents-construction
-→ Agent structure created
-→ Use /testing-agent
-→ Tests created and passing
-→ Done: Production-ready agent
-```
-
-### Pattern 1b: Complete New Build (With Learning)
-
-```
-User: "Build an agent (first time)"
-→ Use /building-agents-core (understand concepts)
-→ Use /building-agents-construction (build structure)
-→ Use /building-agents-patterns (optimize design)
-→ Use /testing-agent (validate)
-→ Done: Production-ready agent
-```
-
-### Pattern 2: Test Existing Agent
-
-```
-User: "Test my agent at exports/my_agent"
-→ Skip Phase 1
-→ Use /testing-agent directly
-→ Tests created
-→ Done: Validated agent
-```
-
-### Pattern 3: Iterative Development
-
-```
-User: "Build an agent"
-→ Use /building-agents-construction (Phase 1)
-→ Implementation needed (see STATUS.md)
-→ [User implements functions]
-→ Use /testing-agent (Phase 2)
-→ Tests reveal bugs
-→ [Fix bugs manually]
-→ Re-run tests
-→ Done: Working agent
-```
-
-### Pattern 4: Complex Agent with Patterns
-
-```
-User: "Build an agent with multi-turn conversations"
-→ Use /building-agents-core (learn pause/resume)
-→ Use /building-agents-construction (build structure)
-→ Use /building-agents-patterns (implement pause/resume pattern)
-→ Use /testing-agent (validate conversation flows)
-→ Done: Complex conversational agent
-```
-
-## Skill Dependencies
-
-```
-agent-workflow (meta-skill)
-    │
-    ├── building-agents-core (foundational)
-    │   ├── Architecture concepts
-    │   ├── Node/Edge/Goal definitions
-    │   ├── Tool discovery procedures
-    │   └── Workflow overview
-    │
-    ├── building-agents-construction (procedural)
-    │   ├── Creates package structure
-    │   ├── Defines goal
-    │   ├── Adds nodes incrementally
-    │   ├── Connects edges
-    │   ├── Finalizes agent class
-    │   └── Requires: building-agents-core
-    │
-    ├── building-agents-patterns (reference)
-    │   ├── Best practices
-    │   ├── Pause/resume patterns
-    │   ├── Error handling
-    │   ├── Anti-patterns
-    │   └── Performance optimization
-    │
-    └── testing-agent
-        ├── Reads agent goal
-        ├── Generates tests
-        ├── Runs evaluation
-        └── Reports results
-```
-
-## Troubleshooting
-
-### "Agent structure won't validate"
-
- Check node IDs match between nodes/__init__.py and agent.py
- Verify all edges reference valid node IDs
- Ensure entry_node exists in nodes list
- Run: `PYTHONPATH=core:exports python -m agent_name validate`
-
-### "Agent has structure but won't run"
-
- Check for STATUS.md or IMPLEMENTATION_GUIDE.md in agent directory
- Implementation may be needed (Python functions or MCP tools)
- This is expected - building-agents-construction creates structure, not implementation
- See implementation guide for completion options
-
-### "Tests are failing"
-
- Review test output for specific failures
- Check agent goal and success criteria
- Verify constraints are met
- Use `/testing-agent` to debug and iterate
- Fix agent code and re-run tests
-
-### "Not sure which phase I'm in"
-
-Run these checks:
-
-```bash
-# Check if agent structure exists
-ls exports/my_agent/agent.py
-
-# Check if it validates
-PYTHONPATH=core:exports python -m my_agent validate
-
-# Check if tests exist
-ls exports/my_agent/tests/
-
-# If structure exists and validates → Phase 2 (testing)
-# If structure doesn't exist → Phase 1 (building)
-# If tests exist but failing → Debug phase
-```
-
-## Best Practices
-
-### For Phase 1 (Building)
-
-1. **Start with clear requirements** - Know what the agent should do
-2. **Define success criteria early** - Measurable goals drive design
-3. **Keep nodes focused** - One responsibility per node
-4. **Use descriptive names** - Node IDs should explain purpose
-5. **Validate incrementally** - Check structure after each major addition
-
-### For Phase 2 (Testing)
-
-1. **Test constraints first** - Hard requirements must pass
-2. **Mock external dependencies** - Use mock mode for LLMs/APIs
-3. **Cover edge cases** - Test failures, not just success paths
-4. **Iterate quickly** - Fix one test at a time
-5. **Document test patterns** - Future tests follow same structure
-
-### General Workflow
-
-1. **Use version control** - Git commit after each phase
-2. **Document decisions** - Update README with changes
-3. **Keep iterations small** - Build → Test → Fix → Repeat
-4. **Preserve working states** - Tag successful iterations
-5. **Learn from failures** - Failed tests reveal design issues
-
-## Exit Criteria
-
-You're done with the workflow when:
-
-✅ Agent structure validates
-✅ All tests pass
-✅ Success criteria met
-✅ Constraints verified
-✅ Documentation complete
-✅ Agent ready for deployment
-
-## Additional Resources
-
- **building-agents-core**: See `.claude/skills/building-agents-core/SKILL.md`
- **building-agents-construction**: See `.claude/skills/building-agents-construction/SKILL.md`
- **building-agents-patterns**: See `.claude/skills/building-agents-patterns/SKILL.md`
- **testing-agent**: See `.claude/skills/testing-agent/SKILL.md`
- **Agent framework docs**: See `core/README.md`
- **Example agents**: See `exports/` directory
-
-## Summary
-
-This workflow provides a proven path from concept to production-ready agent:
-
-1. **Learn** with `/building-agents-core` → Understand fundamentals (optional)
-2. **Build** with `/building-agents-construction` → Get validated structure
-3. **Optimize** with `/building-agents-patterns` → Apply best practices (optional)
-4. **Test** with `/testing-agent` → Get verified functionality
-
-The workflow is **flexible** - skip phases as needed, iterate freely, and adapt to your specific requirements. The goal is **production-ready agents** built with **consistent, repeatable processes**.
-
-## Skill Selection Guide
-
-**Choose building-agents-core when:**
- First time building agents
- Need to understand architecture
- Validating tool availability
- Learning about node types and edges
-
-**Choose building-agents-construction when:**
- Actually building an agent
- Have clear requirements
- Ready to write code
- Want step-by-step guidance
-
-**Choose building-agents-patterns when:**
- Agent structure complete
- Need advanced patterns
- Implementing pause/resume
- Optimizing performance
- Want best practices
-
-**Choose testing-agent when:**
- Agent structure complete
- Ready to validate functionality
- Need comprehensive test coverage
- Debugging agent behavior
@@ -1,199 +0,0 @@
-# Example: File Monitor Agent
-
-This example shows the complete agent-workflow in action for building a file monitoring agent.
-
-## Initial Request
-
-```
-User: "Build an agent that monitors ~/Downloads and copies new files to ~/Documents"
-```
-
-## Phase 1: Building (20 minutes)
-
-### Step 1: Create Structure
-
-Agent invokes `/building-agents` skill and:
-
-1. Creates `exports/file_monitor_agent/` package
-2. Writes skeleton files (__init__.py, __main__.py, agent.py, etc.)
-
-**Output**: Package structure visible immediately
-
-### Step 2: Define Goal
-
-```python
-goal = Goal(
-    id="file-monitor-copy",
-    name="Automated File Monitor & Copy",
-    success_criteria=[
-        # 100% detection rate
-        # 100% copy success
-        # 100% conflict resolution
-        # >99% uptime
-    ],
-    constraints=[
-        # Preserve originals
-        # Handle errors gracefully
-        # Track state
-        # Respect permissions
-    ]
-)
-```
-
-**Output**: Goal written to agent.py
-
-### Step 3: Design Nodes
-
-7 nodes approved and written incrementally:
-
-1. `initialize-state` - Set up tracking
-2. `list-downloads` - Scan directory
-3. `identify-new-files` - Find new files
-4. `check-for-new-files` - Router
-5. `copy-files` - Copy with conflict resolution
-6. `update-state` - Mark as processed
-7. `wait-interval` - Sleep between cycles
-
-**Output**: All nodes in nodes/__init__.py
-
-### Step 4: Connect Edges
-
-8 edges connecting the workflow loop:
-
-```
-initialize → list → identify → check
-                                ↓  ↓
-                              copy  wait
-                                ↓    ↑
-                              update ↓
-                                ↓    ↓
-                              wait → list (loop)
-```
-
-**Output**: Edges written to agent.py
-
-### Step 5: Finalize
-
-```bash
-$ PYTHONPATH=core:exports python -m file_monitor_agent validate
-✓ Agent is valid
-
-$ PYTHONPATH=core:exports python -m file_monitor_agent info
-Agent: File Monitor & Copy Agent
-Nodes: 7
-Edges: 8
-```
-
-**Phase 1 Complete**: Structure validated ✅
-
-### Status After Phase 1
-
-```
-exports/file_monitor_agent/
-├── __init__.py          ✅ (exports)
-├── __main__.py          ✅ (CLI)
-├── agent.py             ✅ (goal, graph, agent class)
-├── nodes/__init__.py    ✅ (7 nodes)
-├── config.py            ✅ (configuration)
-├── implementations.py   ✅ (Python functions)
-├── README.md            ✅ (documentation)
-├── IMPLEMENTATION_GUIDE.md ✅ (next steps)
-└── STATUS.md            ✅ (current state)
-```
-
-**Note**: Implementation gap exists - data flow needs connection (covered in STATUS.md)
-
-## Phase 2: Testing (25 minutes)
-
-### Step 1: Analyze Agent
-
-Agent invokes `/testing-agent` skill and:
-
-1. Reads goal from `exports/file_monitor_agent/agent.py`
-2. Identifies 4 success criteria to test
-3. Identifies 4 constraints to verify
-4. Plans test coverage
-
-### Step 2: Generate Tests
-
-Creates test files:
-
-```
-exports/file_monitor_agent/tests/
-├── conftest.py              (fixtures)
-├── test_constraints.py      (4 constraint tests)
-├── test_success_criteria.py (4 success tests)
-└── test_edge_cases.py       (error handling)
-```
-
-Tests approved incrementally by user.
-
-### Step 3: Run Tests
-
-```bash
-$ PYTHONPATH=core:exports pytest exports/file_monitor_agent/tests/
-
-test_constraints.py::test_preserves_originals     PASSED
-test_constraints.py::test_handles_errors          PASSED
-test_constraints.py::test_tracks_state            PASSED
-test_constraints.py::test_respects_permissions    PASSED
-
-test_success_criteria.py::test_detects_all_files  PASSED
-test_success_criteria.py::test_copies_all_files   PASSED
-test_success_criteria.py::test_resolves_conflicts PASSED
-test_success_criteria.py::test_continuous_run     PASSED
-
-test_edge_cases.py::test_empty_directory          PASSED
-test_edge_cases.py::test_permission_denied        PASSED
-test_edge_cases.py::test_disk_full                PASSED
-test_edge_cases.py::test_large_files              PASSED
-
-========================== 12 passed in 3.42s ==========================
-```
-
-**Phase 2 Complete**: All tests pass ✅
-
-## Final Output
-
-**Production-Ready Agent:**
-
-```bash
-# Run the agent
-./RUN_AGENT.sh
-
-# Or manually
-PYTHONPATH=core:exports:aden-tools/src python -m file_monitor_agent run
-```
-
-**Capabilities:**
- Monitors ~/Downloads continuously
- Copies new files to ~/Documents
- Resolves conflicts with timestamps
- Handles errors gracefully
- Tracks processed files
- Runs as background service
-
-**Total Time**: ~45 minutes from concept to production
-
-## Key Learnings
-
-1. **Incremental building** - Files written immediately, visible throughout
-2. **Validation early** - Structure validated before moving to implementation
-3. **Test-driven** - Tests reveal real behavior
-4. **Documentation included** - README, STATUS, and guides auto-generated
-5. **Repeatable process** - Same workflow for any agent type
-
-## Variations
-
-**For simpler agents:**
- Fewer nodes (3-5 instead of 7)
- Simpler workflow (linear instead of looping)
- Faster build time (10-15 minutes)
-
-**For complex agents:**
- More nodes (10-15+)
- Multiple subgraphs
- Pause/resume points for human-in-the-loop
- Longer build time (45-60 minutes)
-
-The workflow scales to your needs!
@@ -0,0 +1,241 @@
+---
+name: browser-edge-cases
+description: SOP for debugging browser automation failures on complex websites. Use when browser tools fail on specific sites like LinkedIn, Twitter/X, SPAs, or sites with Shadow DOM.
+license: MIT
+---
+
+# Browser Tool Edge Cases
+
+Standard Operating Procedure for debugging and fixing browser automation failures on complex websites.
+
+## When to Use This Skill
+
+- `browser_scroll` succeeds but page doesn't move
+- `browser_click` succeeds but no action triggered
+- `browser_type` text disappears or doesn't work
+- `browser_snapshot` hangs or returns stale content
+- `browser_navigate` loads wrong content
+
+## SOP: Debugging Browser Tool Failures
+
+### Phase 1: Reproduce & Isolate
+
+```
+1. Create minimal test case demonstrating failure
+2. Test against simple site (example.com) to verify tool works
+3. Test against problematic site to confirm issue
+```
+
+**Quick isolation test:**
+```python
+# Test 1: Does the tool work at all?
+await browser_navigate(tab_id, "https://example.com")
+result = await browser_scroll(tab_id, "down", 100)
+# Should work on simple sites
+
+# Test 2: Does it fail on the problematic site?
+await browser_navigate(tab_id, "https://linkedin.com/feed")
+result = await browser_scroll(tab_id, "down", 100)
+# If this fails but example.com works → site-specific edge case
+```
+
+### Phase 2: Analyze Root Cause
+
+**Step 2a: Check console for errors**
+```python
+console = await browser_console(tab_id)
+# Look for: CSP violations, React errors, JavaScript exceptions
+```
+
+**Step 2b: Inspect DOM structure**
+```python
+html = await browser_html(tab_id)
+snapshot = await browser_snapshot(tab_id)
+# Look for:
+# - Nested scrollable divs (overflow: scroll/auto)
+# - Shadow DOM roots
+# - iframes
+# - Custom widgets
+```
+
+**Step 2c: Identify the pattern**
+
+| Symptom | Likely Cause | Check |
+|---------|--------------|-------|
+| Scroll doesn't move | Nested scroll container | Look for `overflow: scroll` divs |
+| Click no effect | Element covered | Check `getBoundingClientRect` vs viewport |
+| Type clears | Autocomplete/React | Check for event listeners on input; try `browser_type_focused` |
+| Snapshot hangs | Huge DOM | Check node count in snapshot |
+| Snapshot stale | SPA hydration | Wait after navigation |
+
+### Phase 3: Implement Multi-Layer Fix
+
+**Pattern: Always have fallbacks**
+
+```python
+async def robust_operation(tab_id):
+    # Method 1: Primary approach
+    try:
+        result = await primary_method(tab_id)
+        if verify_success(result):
+            return result
+    except Exception:
+        pass
+
+    # Method 2: CDP fallback
+    try:
+        result = await cdp_fallback(tab_id)
+        if verify_success(result):
+            return result
+    except Exception:
+        pass
+
+    # Method 3: JavaScript fallback
+    return await javascript_fallback(tab_id)
+```
+
+**Pattern: Always add timeouts**
+
+```python
+# Bad - can hang forever
+result = await browser_snapshot(tab_id)
+
+# Good - fails fast with useful error
+try:
+    result = await browser_snapshot(tab_id, timeout_s=10.0)
+except asyncio.TimeoutError:
+    # Handle timeout gracefully
+    result = await fallback_snapshot(tab_id)
+```
+
+### Phase 4: Verify Fix
+
+```
+1. Run against problematic site → should work
+2. Run against simple site → should still work (regression check)
+3. Document in registry.md
+```
+
+## Pattern Library
+
+### P1: Nested Scrollable Containers
+
+**Sites:** LinkedIn, Twitter/X, any SPA with scrollable feeds
+
+**Detection:**
+```javascript
+// Find largest scrollable container
+const candidates = [];
+document.querySelectorAll('*').forEach(el => {
+    const style = getComputedStyle(el);
+    if (style.overflow.includes('scroll') || style.overflow.includes('auto')) {
+        const rect = el.getBoundingClientRect();
+        if (rect.width > 100 && rect.height > 100) {
+            candidates.push({el, area: rect.width * rect.height});
+        }
+    }
+});
+candidates.sort((a, b) => b.area - a.area);
+return candidates[0]?.el;
+```
+
+**Fix:** Dispatch scroll events at container's center, not viewport center.
+
+### P2: Element Covered by Overlay
+
+**Sites:** Modals, tooltips, SPAs with loading overlays
+
+**Detection:**
+```javascript
+const rect = element.getBoundingClientRect();
+const centerX = rect.left + rect.width / 2;
+const centerY = rect.top + rect.height / 2;
+const topElement = document.elementFromPoint(centerX, centerY);
+return topElement === element || element.contains(topElement);
+```
+
+**Fix:** Wait for overlay to disappear, or use JavaScript click.
+
+### P3: React Synthetic Events
+
+**Sites:** React SPAs, modern web apps
+
+**Detection:** If CDP click doesn't trigger handler but manual click works.
+
+**Fix:** Use JavaScript click as primary:
+```javascript
+element.click();
+```
+
+### P4: Huge DOM / Accessibility Tree
+
+**Sites:** LinkedIn, Facebook, Twitter (feeds with 1000s of nodes)
+
+**Detection:**
+```javascript
+document.querySelectorAll('*').length > 5000
+```
+
+**Fix:**
+1. Add timeout to snapshot operation
+2. Truncate tree at 2000 nodes
+3. Fall back to DOM-based snapshot if accessibility tree too large
+
+### P5: SPA Hydration Delay
+
+**Sites:** React, Vue, Angular SPAs after navigation
+
+**Detection:**
+```javascript
+// Check if React app has hydrated
+document.querySelector('[data-reactroot]') ||
+document.querySelector('[data-reactid]')
+```
+
+**Fix:** Wait for specific selector after navigation:
+```python
+await browser_navigate(tab_id, url, wait_until="load")
+await browser_wait(tab_id, selector='[data-testid="content"]', timeout_ms=5000)
+```
+
+### P6: Shadow DOM
+
+**Sites:** Components using Shadow DOM, Lit elements
+
+**Detection:**
+```javascript
+document.querySelectorAll('*').some(el => el.shadowRoot)
+```
+
+**Fix:** Pierce shadow root:
+```javascript
+function queryShadow(selector) {
+    const parts = selector.split('>>>');
+    let node = document;
+    for (const part of parts) {
+        if (node.shadowRoot) {
+            node = node.shadowRoot.querySelector(part.trim());
+        } else {
+            node = node.querySelector(part.trim());
+        }
+    }
+    return node;
+}
+```
+
+## Quick Reference
+
+| Issue | Primary Fix | Fallback |
+|-------|-------------|----------|
+| Scroll not working | Find scrollable container | Mouse wheel at container center |
+| Click no effect | JavaScript click() | CDP mouse events |
+| Type clears | Add delay_ms | Use `browser_type_focused` (Input.insertText) |
+| Snapshot hangs | Add timeout_s | DOM snapshot fallback |
+| Stale content | Wait for selector | Increase wait_until timeout |
+| Shadow DOM | Pierce selector | JavaScript traversal |
+
+## References
+
+- [registry.md](registry.md) - Full list of known edge cases
+- [scripts/test_case.py](scripts/test_case.py) - Template for testing new cases
+- [BROWSER_USE_PATTERNS.md](../../tools/BROWSER_USE_PATTERNS.md) - Implementation patterns from browser-use
@@ -0,0 +1,261 @@
+# Browser Edge Case Registry
+
+Curated list of known browser automation edge cases with symptoms, causes, and fixes.
+
+---
+
+## Scroll Issues
+
+### #1: LinkedIn Nested Scroll Container
+
+| Attribute | Value |
+|-----------|-------|
+| **Site** | LinkedIn (linkedin.com/feed) |
+| **Symptom** | `browser_scroll()` returns `{ok: true}` but page doesn't move |
+| **Root Cause** | Content is in a nested scrollable div (`overflow: scroll`), not the main window |
+| **Detection** | `document.querySelectorAll('*')` with `overflow: scroll/auto` has large candidates |
+| **Fix** | JavaScript finds largest scrollable container, uses `container.scrollBy()` |
+| **Code** | `bridge.py:808-891` - smart scroll with container detection |
+| **Verified** | 2026-04-03 ✓ |
+
+### #2: Twitter/X Lazy Loading
+
+| Attribute | Value |
+|-----------|-------|
+| **Site** | Twitter/X (x.com) |
+| **Symptom** | Infinite scroll doesn't load new content |
+| **Root Cause** | Lazy loading requires content to be visible before loading more |
+| **Detection** | Scroll position at bottom but no new `[data-testid="tweet"]` elements |
+| **Fix** | Add `wait_for_selector` between scroll calls with 1s delay |
+| **Code** | Test file: `tests/test_x_page_load_repro.py` |
+| **Verified** | - |
+
+### #3: Modal/Dialog Scroll Container
+
+| Attribute | Value |
+|-----------|-------|
+| **Site** | Any site with modal dialogs |
+| **Symptom** | Scroll scrolls background page, not modal content |
+| **Root Cause** | Modal has its own scroll container with `overflow: scroll` |
+| **Detection** | Visible element with `position: fixed` and scrollable content |
+| **Fix** | Find visible modal container (highest z-index scrollable), scroll that |
+| **Code** | - |
+| **Verified** | - |
+
+---
+
+## Click Issues
+
+### #4: Element Covered by Overlay
+
+| Attribute | Value |
+|-----------|-------|
+| **Site** | SPAs, sites with loading overlays |
+| **Symptom** | Click succeeds but no action triggered |
+| **Root Cause** | Element is covered by transparent overlay, tooltip, or iframe |
+| **Detection** | `document.elementFromPoint(x, y) !== target` |
+| **Fix** | Wait for overlay to disappear, or use JavaScript `element.click()` |
+| **Code** | `bridge.py:394-591` - JavaScript click as primary |
+| **Verified** | - |
+
+### #5: React Synthetic Events
+
+| Attribute | Value |
+|-----------|-------|
+| **Site** | React applications |
+| **Symptom** | CDP click doesn't trigger React handler |
+| **Root Cause** | React uses synthetic events that don't respond to CDP events |
+| **Detection** | Site uses React (check for `__reactFiber$` or `data-reactroot`) |
+| **Fix** | Use JavaScript `element.click()` as primary method |
+| **Code** | `bridge.py:394-591` - JavaScript-first click |
+| **Verified** | - |
+
+### #6: Shadow DOM Elements
+
+| Attribute | Value |
+|-----------|-------|
+| **Site** | Components using Shadow DOM, Lit elements |
+| **Symptom** | `querySelector` can't find element |
+| **Root Cause** | Element is inside a shadow root, not main DOM tree |
+| **Detection** | `element.shadowRoot !== null` on parent elements |
+| **Fix** | Use piercing selector (`host >>> target`) or traverse shadow roots |
+| **Code** | See SKILL.md P6 pattern |
+| **Verified** | 2026-04-03 ✓ |
+
+---
+
+## Input Issues
+
+### #7: ContentEditable / Rich Text Editors
+
+| Attribute | Value |
+|-----------|-------|
+| **Site** | Rich text editors (Notion, Slack web, etc.) |
+| **Symptom** | `browser_type()` doesn't insert text |
+| **Root Cause** | Element is `contenteditable`, not an `<input>` or `<textarea>` |
+| **Detection** | `element.contentEditable === 'true'` |
+| **Fix** | Focus via JavaScript, use `execCommand('insertText')` or `Input.dispatchKeyEvent` |
+| **Code** | `bridge.py:616-694` - contentEditable handling |
+| **Verified** | 2026-04-03 ✓ |
+
+### #8: Autocomplete Field Clearing
+
+| Attribute | Value |
+|-----------|-------|
+| **Site** | Search fields with autocomplete, address forms |
+| **Symptom** | Typed text gets cleared immediately |
+| **Root Cause** | Field expects realistic keystroke timing for autocomplete |
+| **Detection** | Field has autocomplete listeners or dropdown appears |
+| **Fix** | Add `delay_ms=50` between keystrokes |
+| **Code** | `bridge.py:type()` - delay_ms parameter |
+| **Verified** | 2026-04-03 ✓ |
+
+### #9: Custom Date Pickers
+
+| Attribute | Value |
+|-----------|-------|
+| **Site** | Forms with custom date widgets |
+| **Symptom** | Can't type date into date field |
+| **Root Cause** | Custom widget intercepts and blocks keyboard input |
+| **Detection** | Typing doesn't change field value |
+| **Fix** | Click calendar widget icon, select date from dropdown |
+| **Code** | - |
+| **Verified** | - |
+
+---
+
+## Snapshot Issues
+
+### #10: LinkedIn Huge DOM Tree
+
+| Attribute | Value |
+|-----------|-------|
+| **Site** | LinkedIn, Facebook, Twitter feeds |
+| **Symptom** | `browser_snapshot()` hangs forever |
+| **Root Cause** | 10k+ DOM nodes, accessibility tree has 50k+ nodes |
+| **Detection** | `document.querySelectorAll('*').length > 5000` |
+| **Fix** | Add `timeout_s` param with `asyncio.timeout()`, proper error handling |
+| **Code** | `bridge.py:1041-1028` - snapshot with timeout protection |
+| **Verified** | 2026-04-03 ✓ (0.08s on LinkedIn) |
+
+### #11: SPA Hydration Delay
+
+| Attribute | Value |
+|-----------|-------|
+| **Site** | React/Vue/Angular SPAs |
+| **Symptom** | Snapshot shows old content after navigation |
+| **Root Cause** | Client-side hydration hasn't completed when snapshot runs |
+| **Detection** | `document.readyState === 'complete'` but content missing |
+| **Fix** | Wait for specific selector after navigation |
+| **Code** | Test file: `tests/test_x_page_load_repro.py` |
+| **Verified** | - |
+
+### #12: iframe Content Missing
+
+| Attribute | Value |
+|-----------|-------|
+| **Site** | Sites with embedded content |
+| **Symptom** | Snapshot missing iframe content |
+| **Root Cause** | Accessibility tree doesn't include iframe content |
+| **Detection** | `document.querySelectorAll('iframe')` has results |
+| **Fix** | Use `DOM.getFrameOwner` + separate snapshot for each iframe |
+| **Code** | - |
+| **Verified** | - |
+
+---
+
+## Navigation Issues
+
+### #13: SPA Navigation Events
+
+| Attribute | Value |
+|-----------|-------|
+| **Site** | React Router, Vue Router SPAs |
+| **Symptom** | `wait_until="load"` fires before content ready |
+| **Root Cause** | SPA uses client-side routing, no full page load |
+| **Detection** | URL changes but `load` event already fired |
+| **Fix** | Use `wait_until="networkidle"` or `wait_for_selector` |
+| **Code** | `bridge.py:navigate()` - wait_until options |
+| **Verified** | - |
+
+### #14: Cross-Origin Redirects
+
+| Attribute | Value |
+|-----------|-------|
+| **Site** | OAuth flows, SSO logins |
+| **Symptom** | Navigation fails during redirect |
+| **Root Cause** | Cross-origin security prevents CDP tracking |
+| **Detection** | URL changes to different domain |
+| **Fix** | Use `wait_for_url` with pattern matching instead of exact URL |
+| **Code** | - |
+| **Verified** | - |
+
+---
+
+## Screenshot Issues
+
+### #15: Selector Screenshot Not Implemented
+
+| Attribute | Value |
+|-----------|-------|
+| **Site** | Any site |
+| **Symptom** | `browser_screenshot(selector="h1")` takes full viewport instead of element |
+| **Root Cause** | `selector` param existed in signature but was silently ignored in both `bridge.py` and `inspection.py` |
+| **Detection** | Screenshot with selector same byte size as screenshot without selector |
+| **Fix** | Use CDP `Runtime.evaluate` to call `getBoundingClientRect()` on the element, pass result as `clip` to `Page.captureScreenshot` |
+| **Code** | `bridge.py:1315-1344` - selector clip logic; `inspection.py:94-96` - pass selector to bridge |
+| **Verified** | 2026-04-03 ✓ (JS rect query returns correct viewport coords; requires server restart) |
+
+### #16: Stale Browser Context (Group ID Mismatch)
+
+| Attribute | Value |
+|-----------|-------|
+| **Site** | Any |
+| **Symptom** | `browser_open()` returns `"No group with id: XXXXXXX"` even though `browser_status` shows `running: true` |
+| **Root Cause** | In-memory `_contexts` dict has a stale `groupId` from a Chrome tab group that was closed outside the tool (e.g. user closed the tab group) |
+| **Detection** | `browser_status` returns `running: true` but `browser_open` fails with "No group with id" |
+| **Fix** | Call `browser_stop()` to clear stale context from `_contexts`, then `browser_start()` again |
+| **Code** | `tools/lifecycle.py:144-160` - `already_running` check uses cached dict without validating against Chrome |
+| **Verified** | 2026-04-03 ✓ |
+
+---
+
+## How to Add New Edge Cases
+
+1. **Reproduce** the issue with minimal test case
+2. **Document** using the template below
+3. **Implement** fix with multi-layer fallback
+4. **Verify** against both problematic and simple sites
+5. **Submit** by appending to this file
+
+### Template
+
+```markdown
+### #N: [Short Title]
+
+| Attribute | Value |
+|-----------|-------|
+| **Site** | [URL or site type] |
+| **Symptom** | [What the user observes] |
+| **Root Cause** | [Technical explanation] |
+| **Detection** | [JavaScript to detect this case] |
+| **Fix** | [Solution approach] |
+| **Code** | [File:line reference if implemented] |
+| **Verified** | [Date or "pending"] |
+```
+
+---
+
+## Statistics
+
+| Category | Count |
+|----------|-------|
+| Scroll Issues | 3 |
+| Click Issues | 3 |
+| Input Issues | 3 |
+| Snapshot Issues | 3 |
+| Navigation Issues | 2 |
+| Screenshot Issues | 2 |
+| **Total** | **16** |
+
+Last updated: 2026-04-03
@@ -0,0 +1,110 @@
+#!/usr/bin/env python
+"""
+Test #2: Twitter/X Lazy Loading Scroll
+
+Symptom: Infinite scroll doesn't load new content
+Root Cause: Lazy loading requires content to be visible before loading more
+Fix: Add wait_for_selector between scroll calls
+"""
+
+import asyncio
+import sys
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / "tools" / "src"))
+
+from gcu.browser.bridge import BeelineBridge
+
+BRIDGE_PORT = 9229
+CONTEXT_NAME = "twitter-scroll-test"
+
+
+async def test_twitter_lazy_scroll():
+    """Test that repeated scrolls with waits load new content."""
+    print("=" * 70)
+    print("TEST #2: Twitter/X Lazy Loading Scroll")
+    print("=" * 70)
+
+    bridge = BeelineBridge()
+
+    try:
+        await bridge.start()
+
+        for i in range(10):
+            await asyncio.sleep(1)
+            if bridge.is_connected:
+                print("✓ Extension connected!")
+                break
+            print(f"Waiting for extension... ({i + 1}/10)")
+        else:
+            print("✗ Extension not connected")
+            return
+
+        context = await bridge.create_context(CONTEXT_NAME)
+        tab_id = context.get("tabId")
+        group_id = context.get("groupId")
+        print(f"✓ Created tab: {tab_id}")
+
+        # Navigate to Twitter/X
+        print("\n--- Navigating to X.com ---")
+        await bridge.navigate(tab_id, "https://x.com", wait_until="networkidle", timeout_ms=30000)
+        print("✓ Page loaded")
+
+        # Wait for tweets to appear
+        print("\n--- Waiting for tweets ---")
+        await bridge.wait_for_selector(tab_id, '[data-testid="tweet"]', timeout_ms=10000)
+
+        # Count initial tweets
+        initial_count = await bridge.evaluate(
+            tab_id,
+            "(function() { return document.querySelectorAll('[data-testid=\"tweet\"]').length; })()",
+        )
+        print(f"Initial tweet count: {initial_count.get('result', 0)}")
+
+        # Take screenshot of initial state
+        screenshot = await bridge.screenshot(tab_id)
+        print(f"Screenshot: {len(screenshot.get('data', ''))} bytes")
+
+        # Scroll multiple times with waits
+        print("\n--- Scrolling with waits ---")
+        for i in range(3):
+            result = await bridge.scroll(tab_id, "down", 500)
+            print(f"  Scroll {i + 1}: {result.get('method', 'unknown')} method")
+
+            # Wait for new content to load
+            await asyncio.sleep(2)
+
+            # Count tweets after scroll
+            count_result = await bridge.evaluate(
+                tab_id,
+                "(function() { return document.querySelectorAll('[data-testid=\"tweet\"]').length; })()",
+            )
+            count = count_result.get("result", 0)
+            print(f"  Tweet count after scroll: {count}")
+
+        # Final count
+        final_count = await bridge.evaluate(
+            tab_id,
+            "(function() { return document.querySelectorAll('[data-testid=\"tweet\"]').length; })()",
+        )
+        final = final_count.get("result", 0)
+        initial = initial_count.get("result", 0)
+
+        print("\n--- Results ---")
+        print(f"Initial tweets: {initial}")
+        print(f"Final tweets: {final}")
+
+        if final > initial:
+            print(f"✓ PASS: Loaded {final - initial} new tweets")
+        else:
+            print("✗ FAIL: No new tweets loaded (may need login)")
+
+        await bridge.destroy_context(group_id)
+        print("\n✓ Context destroyed")
+
+    finally:
+        await bridge.stop()
+
+
+if __name__ == "__main__":
+    asyncio.run(test_twitter_lazy_scroll())
@@ -0,0 +1,96 @@
+#!/usr/bin/env python
+"""
+Test #3: Modal/Dialog Scroll Container
+
+Symptom: Scroll scrolls background page, not modal content
+Root Cause: Modal has its own scroll container with overflow: scroll
+Fix: Find visible modal container (highest z-index scrollable), scroll that
+"""
+
+import asyncio
+import sys
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / "tools" / "src"))
+
+from gcu.browser.bridge import BeelineBridge
+
+BRIDGE_PORT = 9229
+CONTEXT_NAME = "modal-scroll-test"
+
+# Test site with modal - using a demo site
+MODAL_DEMO_URL = "https://www.w3schools.com/howto/howto_css_modals.asp"
+
+
+async def test_modal_scroll():
+    """Test that scroll targets modal content, not background."""
+    print("=" * 70)
+    print("TEST #3: Modal/Dialog Scroll Container")
+    print("=" * 70)
+
+    bridge = BeelineBridge()
+
+    try:
+        await bridge.start()
+
+        for i in range(10):
+            await asyncio.sleep(1)
+            if bridge.is_connected:
+                print("✓ Extension connected!")
+                break
+        else:
+            print("✗ Extension not connected")
+            return
+
+        context = await bridge.create_context(CONTEXT_NAME)
+        tab_id = context.get("tabId")
+        group_id = context.get("groupId")
+        print(f"✓ Created tab: {tab_id}")
+
+        # Navigate to modal demo
+        print("\n--- Navigating to modal demo ---")
+        await bridge.navigate(tab_id, MODAL_DEMO_URL, wait_until="load")
+        print("✓ Page loaded")
+
+        # Take screenshot before
+        screenshot_before = await bridge.screenshot(tab_id)
+        print(f"Screenshot before: {len(screenshot_before.get('data', ''))} bytes")
+
+        # Click button to open modal
+        print("\n--- Opening modal ---")
+        # Find and click the "Open Modal" button
+        result = await bridge.click(tab_id, ".ws-btn", timeout_ms=5000)
+        print(f"Click result: {result}")
+
+        await asyncio.sleep(1)
+
+        # Take screenshot with modal open
+        screenshot_modal = await bridge.screenshot(tab_id)
+        print(f"Screenshot modal open: {len(screenshot_modal.get('data', ''))} bytes")
+
+        # Try to scroll within modal
+        print("\n--- Scrolling modal content ---")
+        result = await bridge.scroll(tab_id, "down", 100)
+        print(f"Scroll result: {result}")
+
+        await asyncio.sleep(0.5)
+
+        # Take screenshot after scroll
+        screenshot_after = await bridge.screenshot(tab_id)
+        print(f"Screenshot after scroll: {len(screenshot_after.get('data', ''))} bytes")
+
+        # Check if modal content scrolled (not background)
+        # This is a visual check - we can verify by comparing screenshots
+        print("\n--- Results ---")
+        print(f"Modal scroll test completed. Method used: {result.get('method', 'unknown')}")
+        print("Visual verification needed: Check if modal content scrolled vs background")
+
+        await bridge.destroy_context(group_id)
+        print("\n✓ Context destroyed")
+
+    finally:
+        await bridge.stop()
+
+
+if __name__ == "__main__":
+    asyncio.run(test_modal_scroll())
@@ -0,0 +1,123 @@
+#!/usr/bin/env python
+"""
+Test #4: Element Covered by Overlay
+
+Symptom: Click succeeds but no action triggered
+Root Cause: Element is covered by transparent overlay, tooltip, or iframe
+Detection: document.elementFromPoint(x, y) !== target
+Fix: Wait for overlay to disappear, or use JavaScript element.click()
+"""
+
+import asyncio
+import sys
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / "tools" / "src"))
+
+from gcu.browser.bridge import BeelineBridge
+
+CONTEXT_NAME = "overlay-click-test"
+
+
+async def test_overlay_click():
+    """Test clicking elements that are covered by overlays."""
+    print("=" * 70)
+    print("TEST #4: Element Covered by Overlay")
+    print("=" * 70)
+
+    bridge = BeelineBridge()
+
+    try:
+        await bridge.start()
+
+        for i in range(10):
+            await asyncio.sleep(1)
+            if bridge.is_connected:
+                print("✓ Extension connected!")
+                break
+        else:
+            print("✗ Extension not connected")
+            return
+
+        context = await bridge.create_context(CONTEXT_NAME)
+        tab_id = context.get("tabId")
+        group_id = context.get("groupId")
+        print(f"✓ Created tab: {tab_id}")
+
+        # Create a test page with overlay
+        print("\n--- Creating test page with overlay ---")
+        test_html = """
+        <!DOCTYPE html>
+        <html>
+        <head><title>Overlay Test</title></head>
+        <body>
+            <button id="target-btn" onclick="alert('Clicked!')">Click Me</button>
+            <div id="overlay" style="position:fixed;top:0;left:0;
+            width:100%;height:100%;
+            background:rgba(0,0,0,0.3);z-index:1000;"></div>
+            <script>
+                window.clickCount = 0;
+                document.getElementById('target-btn').addEventListener('click', () => {
+                    window.clickCount++;
+                });
+            </script>
+        </body>
+        </html>
+        """
+
+        # Navigate to data URL
+        import base64
+
+        data_url = f"data:text/html;base64,{base64.b64encode(test_html.encode()).decode()}"
+        await bridge.navigate(tab_id, data_url, wait_until="load")
+
+        # Screenshot before
+        screenshot = await bridge.screenshot(tab_id)
+        print(f"Screenshot: {len(screenshot.get('data', ''))} bytes")
+
+        # Try to click the covered button
+        print("\n--- Attempting to click covered button ---")
+
+        # First, check if element is covered
+        coverage_check = await bridge.evaluate(
+            tab_id,
+            """
+            (function() {
+                const btn = document.getElementById('target-btn');
+                const rect = btn.getBoundingClientRect();
+                const centerX = rect.left + rect.width / 2;
+                const centerY = rect.top + rect.height / 2;
+                const topElement = document.elementFromPoint(centerX, centerY);
+                return {
+                    isCovered: topElement !== btn && !btn.contains(topElement),
+                    topElement: topElement?.tagName,
+                    targetElement: btn.tagName
+                };
+            })();
+        """,
+        )
+        print(f"Coverage check: {coverage_check.get('result', {})}")
+
+        # Try CDP click (may fail due to overlay)
+        click_result = await bridge.click(tab_id, "#target-btn", timeout_ms=5000)
+        print(f"Click result: {click_result}")
+
+        # Check if click registered
+        count_result = await bridge.evaluate(tab_id, "(function() { return window.clickCount; })()")
+        count = count_result.get("result", 0)
+        print(f"Click count after CDP click: {count}")
+
+        if count > 0:
+            print("✓ PASS: JavaScript click penetrated overlay")
+        else:
+            print("✗ FAIL: Click did not reach button (overlay blocked it)")
+
+        await bridge.destroy_context(group_id)
+        print("\n✓ Context destroyed")
+
+    finally:
+        await bridge.stop()
+
+
+if __name__ == "__main__":
+    asyncio.run(test_overlay_click())
@@ -0,0 +1,150 @@
+#!/usr/bin/env python
+"""
+Test #6: Shadow DOM Elements
+
+Symptom: querySelector can't find element
+Root Cause: Element is inside a shadow root, not main DOM tree
+Detection: element.shadowRoot !== null on parent elements
+Fix: Use piercing selector (host >>> target) or traverse shadow roots
+"""
+
+import asyncio
+import sys
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / "tools" / "src"))
+
+from gcu.browser.bridge import BeelineBridge
+
+CONTEXT_NAME = "shadow-dom-test"
+
+
+async def test_shadow_dom():
+    """Test clicking elements inside Shadow DOM."""
+    print("=" * 70)
+    print("TEST #6: Shadow DOM Elements")
+    print("=" * 70)
+
+    bridge = BeelineBridge()
+
+    try:
+        await bridge.start()
+
+        for i in range(10):
+            await asyncio.sleep(1)
+            if bridge.is_connected:
+                print("✓ Extension connected!")
+                break
+        else:
+            print("✗ Extension not connected")
+            return
+
+        context = await bridge.create_context(CONTEXT_NAME)
+        tab_id = context.get("tabId")
+        group_id = context.get("groupId")
+        print(f"✓ Created tab: {tab_id}")
+
+        # Create test page with Shadow DOM
+        print("\n--- Creating test page with Shadow DOM ---")
+        test_html = """
+        <!DOCTYPE html>
+        <html>
+        <head><title>Shadow DOM Test</title></head>
+        <body>
+            <div id="shadow-host"></div>
+            <script>
+                const host = document.getElementById('shadow-host');
+                const shadow = host.attachShadow({ mode: 'open' });
+                shadow.innerHTML = `
+                    <style>
+                        button { padding: 10px 20px; font-size: 16px; }
+                    </style>
+                    <button id="shadow-btn">Shadow Button</button>
+                `;
+                shadow.getElementById('shadow-btn').addEventListener('click', () => {
+                    window.shadowClickCount = (window.shadowClickCount || 0) + 1;
+                    console.log('Shadow button clicked:', window.shadowClickCount);
+                });
+            </script>
+        </body>
+        </html>
+        """
+
+        # Write to file and use file:// URL (data: URLs don't work well with extension)
+        test_file = Path("/tmp/shadow_dom_test.html")
+        test_file.write_text(test_html.strip())
+        file_url = f"file://{test_file}"
+        await bridge.navigate(tab_id, file_url, wait_until="load")
+        print("✓ Page loaded")
+
+        # Screenshot
+        screenshot = await bridge.screenshot(tab_id)
+        print(f"Screenshot: {len(screenshot.get('data', ''))} bytes")
+
+        # Detect Shadow DOM
+        print("\n--- Detecting Shadow DOM ---")
+        detection = await bridge.evaluate(
+            tab_id,
+            """
+            (function() {
+                const hosts = [];
+                document.querySelectorAll('*').forEach(el => {
+                    if (el.shadowRoot) {
+                        hosts.push({
+                            tag: el.tagName,
+                            id: el.id,
+                            hasButton: el.shadowRoot.querySelector('button') !== null
+                        });
+                    }
+                });
+                return { count: hosts.length, hosts };
+            })();
+        """,
+        )
+        print(f"Shadow DOM detection: {detection.get('result', {})}")
+
+        # Try to click shadow button using regular selector (should fail)
+        print("\n--- Attempting click with regular selector ---")
+        try:
+            result = await bridge.click(tab_id, "#shadow-btn", timeout_ms=3000)
+            print(f"Result: {result}")
+        except Exception as e:
+            print(f"Expected failure: {e}")
+
+        # Try to click using JavaScript that pierces shadow DOM
+        print("\n--- Clicking via JavaScript shadow piercing ---")
+        click_result = await bridge.evaluate(
+            tab_id,
+            """
+            (function() {
+                const host = document.getElementById('shadow-host');
+                const btn = host.shadowRoot.getElementById('shadow-btn');
+                if (btn) {
+                    btn.click();
+                    return { success: true, clicked: 'shadow-btn' };
+                }
+                return { success: false, error: 'Button not found' };
+            })();
+        """,
+        )
+        print(f"JS click result: {click_result.get('result', {})}")
+
+        # Verify click was registered
+        count_result = await bridge.evaluate(tab_id, "(function() { return window.shadowClickCount || 0; })()")
+        count = count_result.get("result") or 0
+        print(f"Shadow click count: {count}")
+
+        if count and count > 0:
+            print("✓ PASS: Shadow DOM element clicked successfully")
+        else:
+            print("✗ FAIL: Could not click Shadow DOM element")
+
+        await bridge.destroy_context(group_id)
+        print("\n✓ Context destroyed")
+
+    finally:
+        await bridge.stop()
+
+
+if __name__ == "__main__":
+    asyncio.run(test_shadow_dom())
@@ -0,0 +1,180 @@
+#!/usr/bin/env python
+"""
+Test #7: ContentEditable / Rich Text Editors
+
+Symptom: browser_type() doesn't insert text
+Root Cause: Element is contenteditable, not an <input> or <textarea>
+Detection: element.contentEditable === 'true'
+Fix: Focus via JavaScript, use execCommand('insertText') or Input.dispatchKeyEvent
+"""
+
+import asyncio
+import sys
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / "tools" / "src"))
+
+from gcu.browser.bridge import BeelineBridge
+
+CONTEXT_NAME = "contenteditable-test"
+
+
+async def test_contenteditable():
+    """Test typing into contenteditable elements."""
+    print("=" * 70)
+    print("TEST #7: ContentEditable / Rich Text Editors")
+    print("=" * 70)
+
+    bridge = BeelineBridge()
+
+    try:
+        await bridge.start()
+
+        for i in range(10):
+            await asyncio.sleep(1)
+            if bridge.is_connected:
+                print("✓ Extension connected!")
+                break
+        else:
+            print("✗ Extension not connected")
+            return
+
+        context = await bridge.create_context(CONTEXT_NAME)
+        tab_id = context.get("tabId")
+        group_id = context.get("groupId")
+        print(f"✓ Created tab: {tab_id}")
+
+        # Create test page with contenteditable
+        test_html = """
+        <!DOCTYPE html>
+        <html>
+        <head><title>ContentEditable Test</title></head>
+        <body>
+            <h2>ContentEditable Test</h2>
+
+            <h3>1. Simple contenteditable div</h3>
+            <div id="editor1" contenteditable="true"
+            style="border:1px solid #ccc;padding:10px;
+            min-height:50px;">Start text</div>
+
+            <h3>2. Rich text editor (like Notion)</h3>
+            <div id="editor2" contenteditable="true"
+            style="border:1px solid #ccc;padding:10px;
+            min-height:50px;">
+                <p>Type here...</p>
+            </div>
+
+            <h3>3. Regular input (for comparison)</h3>
+            <input id="input1" type="text" placeholder="Regular input" />
+
+            <script>
+                // Track content changes
+                window.editor1Content = '';
+                window.editor2Content = '';
+
+                document.getElementById('editor1').addEventListener('input', (e) => {
+                    window.editor1Content = e.target.innerText;
+                });
+                document.getElementById('editor2').addEventListener('input', (e) => {
+                    window.editor2Content = e.target.innerText;
+                });
+            </script>
+        </body>
+        </html>
+        """
+
+        # Write to file and use file:// URL (data: URLs don't work well with extension)
+        test_file = Path("/tmp/contenteditable_test.html")
+        test_file.write_text(test_html.strip())
+        file_url = f"file://{test_file}"
+        await bridge.navigate(tab_id, file_url, wait_until="load")
+        print("✓ Page loaded")
+
+        # Screenshot with timeout protection
+        try:
+            screenshot = await asyncio.wait_for(bridge.screenshot(tab_id), timeout=10.0)
+            print(f"Screenshot: {len(screenshot.get('data', ''))} bytes")
+        except asyncio.TimeoutError:
+            print("Screenshot timed out (skipping)")
+
+        # Detect contenteditable
+        print("\n--- Detecting contenteditable elements ---")
+        detection = await bridge.evaluate(
+            tab_id,
+            """
+            (function() {
+                const editables = document.querySelectorAll('[contenteditable="true"]');
+                return {
+                    count: editables.length,
+                    ids: Array.from(editables).map(el => el.id)
+                };
+            })();
+        """,
+        )
+        print(f"Contenteditable detection: {detection.get('result', {})}")
+
+        # Test 1: Type into regular input (baseline)
+        print("\n--- Test 1: Regular input ---")
+        await bridge.click(tab_id, "#input1")
+        await bridge.type_text(tab_id, "#input1", "Hello input")
+        input_result = await bridge.evaluate(
+            tab_id, "(function() { return document.getElementById('input1').value; })()"
+        )
+        print(f"Input value: {input_result.get('result', '')}")
+
+        # Test 2: Type into contenteditable div
+        print("\n--- Test 2: Contenteditable div ---")
+        await bridge.click(tab_id, "#editor1")
+        await bridge.type_text(tab_id, "#editor1", "Hello contenteditable", clear_first=True)
+        editor_result = await bridge.evaluate(
+            tab_id,
+            "(function() { return document.getElementById('editor1').innerText; })()",
+        )
+        print(f"Editor1 innerText: {editor_result.get('result', '')}")
+
+        # Test 3: Use JavaScript insertText for rich editor
+        print("\n--- Test 3: JavaScript insertText for rich editor ---")
+        insert_result = await bridge.evaluate(
+            tab_id,
+            """
+            (function() {
+                const editor = document.getElementById('editor2');
+                editor.focus();
+                document.execCommand('selectAll', false, null);
+                document.execCommand('insertText', false, 'Hello from execCommand');
+                return editor.innerText;
+            })();
+        """,
+        )
+        print(f"Editor2 after execCommand: {insert_result.get('result', '')}")
+
+        # Screenshot after with timeout protection
+        try:
+            screenshot_after = await asyncio.wait_for(bridge.screenshot(tab_id), timeout=10.0)
+            print(f"Screenshot after: {len(screenshot_after.get('data', ''))} bytes")
+        except asyncio.TimeoutError:
+            print("Screenshot after timed out (skipping)")
+
+        # Results
+        print("\n--- Results ---")
+        input_val = input_result.get("result", "")
+        editor1_val = editor_result.get("result", "")
+        editor2_val = insert_result.get("result", "")
+
+        input_pass = "Hello input" in input_val
+        editor1_pass = "Hello contenteditable" in editor1_val
+        editor2_pass = "execCommand" in editor2_val
+
+        print(f"Input: {'✓ PASS' if input_pass else '✗ FAIL'} - {input_val}")
+        print(f"Editor1: {'✓ PASS' if editor1_pass else '✗ FAIL'} - {editor1_val}")
+        print(f"Editor2: {'✓ PASS' if editor2_pass else '✗ FAIL'} - {editor2_val}")
+
+        await bridge.destroy_context(group_id)
+        print("\n✓ Context destroyed")
+
+    finally:
+        await bridge.stop()
+
+
+if __name__ == "__main__":
+    asyncio.run(test_contenteditable())
@@ -0,0 +1,250 @@
+#!/usr/bin/env python
+"""
+Test #8: Autocomplete Field Clearing
+
+Symptom: Typed text gets cleared immediately
+Root Cause: Field expects realistic keystroke timing for autocomplete
+Detection: Field has autocomplete listeners or dropdown appears
+Fix: Add delay_ms between keystrokes
+"""
+
+import asyncio
+import sys
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / "tools" / "src"))
+
+from gcu.browser.bridge import BeelineBridge
+
+CONTEXT_NAME = "autocomplete-test"
+
+
+async def test_autocomplete():
+    """Test typing into fields with autocomplete behavior."""
+    print("=" * 70)
+    print("TEST #8: Autocomplete Field Clearing")
+    print("=" * 70)
+
+    bridge = BeelineBridge()
+
+    try:
+        await bridge.start()
+
+        for i in range(10):
+            await asyncio.sleep(1)
+            if bridge.is_connected:
+                print("✓ Extension connected!")
+                break
+        else:
+            print("✗ Extension not connected")
+            return
+
+        context = await bridge.create_context(CONTEXT_NAME)
+        tab_id = context.get("tabId")
+        group_id = context.get("groupId")
+        print(f"✓ Created tab: {tab_id}")
+
+        # Create test page with autocomplete behavior
+        test_html = """
+        <!DOCTYPE html>
+        <html>
+        <head><title>Autocomplete Test</title>
+        <style>
+            .autocomplete-items {
+                position: absolute;
+                border: 1px solid #d4d4d4;
+                border-top: none;
+                z-index: 99;
+                top: 100%;
+                left: 0;
+                right: 0;
+                max-height: 200px;
+                overflow-y: auto;
+                background: white;
+            }
+            .autocomplete-items div {
+                padding: 10px;
+                cursor: pointer;
+            }
+            .autocomplete-items div:hover {
+                background-color: #e9e9e9;
+            }
+            .autocomplete-active {
+                background-color: DodgerBlue !important;
+                color: white;
+            }
+            .autocomplete { position: relative; display: inline-block; }
+            input { width: 300px; padding: 10px; font-size: 16px; }
+        </style></head>
+        <body>
+            <h2>Autocomplete Test</h2>
+
+            <div class="autocomplete">
+                <input id="search" type="text" placeholder="Search countries..." autocomplete="off">
+            </div>
+
+            <div id="log" style="margin-top:20px;font-family:monospace;"></div>
+
+            <script>
+                const countries = [
+                    "Afghanistan","Albania","Algeria",
+                    "Andorra","Angola","Argentina",
+                    "Armenia","Australia","Austria",
+                    "Azerbaijan","Bahamas","Bahrain",
+                    "Bangladesh","Belarus","Belgium",
+                    "Belize","Benin","Bhutan",
+                    "Bolivia","Brazil","Canada",
+                    "China","Colombia","Denmark",
+                    "Egypt","France","Germany",
+                    "India","Indonesia","Italy",
+                    "Japan","Mexico","Netherlands",
+                    "Nigeria","Norway","Pakistan",
+                    "Peru","Philippines","Poland",
+                    "Portugal","Russia","Spain",
+                    "Sweden","Switzerland","Thailand",
+                    "Turkey","Ukraine",
+                    "United Kingdom","United States",
+                    "Vietnam"
+                ];
+
+                const input = document.getElementById('search');
+                const log = document.getElementById('log');
+                let currentFocus = -1;
+                let typingTimeout = null;
+
+                // Track events for testing
+                window.inputEvents = [];
+                window.inputValue = '';
+
+                function logEvent(type, value) {
+                    window.inputEvents.push({ type, value, time: Date.now() });
+                    const entry = document.createElement('div');
+                    entry.textContent = type + ': ' + value;
+                    log.insertBefore(entry, log.firstChild);
+                }
+
+                // Simulate autocomplete that clears fast typing
+                input.addEventListener('input', function(e) {
+                    const val = this.value;
+
+                    // Clear previous dropdown
+                    closeAllLists();
+
+                    if (!val) return;
+
+                    // If typing too fast (autocomplete-style), clear and restart
+                    clearTimeout(typingTimeout);
+                    typingTimeout = setTimeout(() => {
+                        logEvent('input', val);
+                        window.inputValue = val;
+
+                        // Create dropdown
+                        const div = document.createElement('div');
+                        div.setAttribute('id', this.id + 'autocomplete-list');
+                        div.setAttribute('class', 'autocomplete-items');
+                        this.parentNode.appendChild(div);
+
+                        countries.filter(
+                            c => c.substr(0, val.length).toUpperCase()
+                                === val.toUpperCase()
+                        ).slice(0, 5).forEach(country => {
+                                const item = document.createElement('div');
+                                item.innerHTML = '<strong>'
+                                    + country.substr(0, val.length)
+                                    + '</strong>'
+                                    + country.substr(val.length);
+                                item.addEventListener('click', function() {
+                                    input.value = country;
+                                    closeAllLists();
+                                    logEvent('select', country);
+                                    window.inputValue = country;
+                                });
+                                div.appendChild(item);
+                            });
+                    }, 100); // 100ms debounce
+                });
+
+                function closeAllLists() {
+                    document.querySelectorAll('.autocomplete-items').forEach(el => el.remove());
+                }
+
+                document.addEventListener('click', function() {
+                    closeAllLists();
+                });
+            </script>
+        </body>
+        </html>
+        """
+
+        # Write to file and use file:// URL (data: URLs don't work well with extension)
+        test_file = Path("/tmp/autocomplete_test.html")
+        test_file.write_text(test_html.strip())
+        file_url = f"file://{test_file}"
+        await bridge.navigate(tab_id, file_url, wait_until="load")
+        print("✓ Page loaded")
+
+        # Screenshot
+        screenshot = await bridge.screenshot(tab_id)
+        print(f"Screenshot: {len(screenshot.get('data', ''))} bytes")
+
+        # Test 1: Fast typing (no delay) - may fail
+        print("\n--- Test 1: Fast typing (delay_ms=0) ---")
+        await bridge.click(tab_id, "#search")
+        await bridge.type_text(tab_id, "#search", "Ger", clear_first=True, delay_ms=0)
+        await asyncio.sleep(0.5)
+
+        fast_result = await bridge.evaluate(
+            tab_id, "(function() { return document.getElementById('search').value; })()"
+        )
+        fast_value = fast_result.get("result", "")
+        print(f"Value after fast typing: '{fast_value}'")
+
+        # Check events
+        events_result = await bridge.evaluate(tab_id, "(function() { return window.inputEvents; })()")
+        print(f"Events logged: {events_result.get('result', [])}")
+
+        # Test 2: Slow typing (with delay) - should work
+        print("\n--- Test 2: Slow typing (delay_ms=100) ---")
+        await bridge.click(tab_id, "#search")
+        await bridge.type_text(tab_id, "#search", "United", clear_first=True, delay_ms=100)
+        await asyncio.sleep(0.5)
+
+        slow_result = await bridge.evaluate(
+            tab_id, "(function() { return document.getElementById('search').value; })()"
+        )
+        slow_value = slow_result.get("result", "")
+        print(f"Value after slow typing: '{slow_value}'")
+
+        # Check if dropdown appeared
+        dropdown_result = await bridge.evaluate(
+            tab_id,
+            "(function() { return document.querySelectorAll('.autocomplete-items div').length; })()",
+        )
+        dropdown_count = dropdown_result.get("result", 0)
+        print(f"Dropdown items: {dropdown_count}")
+
+        # Screenshot with dropdown
+        screenshot_dropdown = await bridge.screenshot(tab_id)
+        print(f"Screenshot with dropdown: {len(screenshot_dropdown.get('data', ''))} bytes")
+
+        # Results
+        print("\n--- Results ---")
+        if "United" in slow_value:
+            print("✓ PASS: Slow typing with delay_ms worked")
+        else:
+            print("✗ FAIL: Slow typing still didn't work")
+
+        if dropdown_count > 0:
+            print("✓ PASS: Autocomplete dropdown appeared")
+        else:
+            print("⚠ WARNING: No autocomplete dropdown")
+
+        await bridge.destroy_context(group_id)
+        print("\n✓ Context destroyed")
+
+    finally:
+        await bridge.stop()
+
+
+if __name__ == "__main__":
+    asyncio.run(test_autocomplete())
@@ -0,0 +1,156 @@
+#!/usr/bin/env python
+"""
+Test #10: LinkedIn Huge DOM Tree
+
+Symptom: browser_snapshot() hangs forever
+Root Cause: 10k+ DOM nodes, accessibility tree has 50k+ nodes
+Detection: document.querySelectorAll('*').length > 5000
+Fix: Add timeout (10s default), truncate tree at 2000 nodes
+"""
+
+import asyncio
+import sys
+import time
+import base64
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / "tools" / "src"))
+
+from gcu.browser.bridge import BeelineBridge
+
+CONTEXT_NAME = "huge-dom-test"
+
+
+async def test_huge_dom():
+    """Test snapshot performance on huge DOM trees."""
+    print("=" * 70)
+    print("TEST #10: Huge DOM Tree (LinkedIn-style)")
+    print("=" * 70)
+
+    bridge = BeelineBridge()
+
+    try:
+        await bridge.start()
+
+        for i in range(10):
+            await asyncio.sleep(1)
+            if bridge.is_connected:
+                print("✓ Extension connected!")
+                break
+        else:
+            print("✗ Extension not connected")
+            return
+
+        context = await bridge.create_context(CONTEXT_NAME)
+        tab_id = context.get("tabId")
+        group_id = context.get("groupId")
+        print(f"✓ Created tab: {tab_id}")
+
+        # Test 1: Small DOM (baseline)
+        print("\n--- Test 1: Small DOM (baseline) ---")
+        small_html = """
+        <!DOCTYPE html>
+        <html><body>
+            <h1>Small Page</h1>
+            <p>A few elements</p>
+            <button>Click me</button>
+        </body></html>
+        """
+        data_url = f"data:text/html;base64,{base64.b64encode(small_html.encode()).decode()}"
+        await bridge.navigate(tab_id, data_url, wait_until="load")
+
+        start = time.perf_counter()
+        snapshot = await bridge.snapshot(tab_id, timeout_s=5.0)
+        elapsed = time.perf_counter() - start
+        tree_len = len(snapshot.get("tree", ""))
+        print(f"Small DOM snapshot: {elapsed:.3f}s, {tree_len} chars")
+
+        # Test 2: Generate huge DOM
+        print("\n--- Test 2: Huge DOM (5000+ elements) ---")
+        huge_html = """
+        <!DOCTYPE html>
+        <html><body>
+        <h1>Huge DOM Test</h1>
+        <div id="container"></div>
+        <script>
+            const container = document.getElementById('container');
+            for (let i = 0; i < 5000; i++) {
+                const div = document.createElement('div');
+                div.className = 'item-' + i;
+                div.innerHTML = '<span>Item ' + i + '</span><button>Action</button>';
+                container.appendChild(div);
+            }
+        </script>
+        </body></html>
+        """
+        data_url = f"data:text/html;base64,{base64.b64encode(huge_html.encode()).decode()}"
+        await bridge.navigate(tab_id, data_url, wait_until="load")
+
+        # Count elements
+        count_result = await bridge.evaluate(tab_id, "(function() { return document.querySelectorAll('*').length; })()")
+        elem_count = count_result.get("result", 0)
+        print(f"DOM elements: {elem_count}")
+
+        # Skip screenshot on huge DOM - it can timeout
+        # Instead verify page loaded by checking DOM
+        print("✓ Page verified (skipping screenshot on huge DOM)")
+
+        # Test snapshot with timeout
+        print("\n--- Testing snapshot with 10s timeout ---")
+        start = time.perf_counter()
+        try:
+            snapshot = await bridge.snapshot(tab_id, timeout_s=10.0)
+            elapsed = time.perf_counter() - start
+            tree_len = len(snapshot.get("tree", ""))
+            truncated = "(truncated)" in snapshot.get("tree", "")
+            print(f"✓ Huge DOM snapshot: {elapsed:.3f}s, {tree_len} chars, truncated={truncated}")
+
+            if elapsed < 5.0:
+                print("✓ PASS: Snapshot completed quickly")
+            else:
+                print(f"⚠ WARNING: Snapshot took {elapsed:.1f}s")
+
+            if truncated:
+                print("✓ PASS: Tree was truncated to prevent hang")
+            else:
+                print("⚠ WARNING: Tree not truncated (may need adjustment)")
+
+        except asyncio.TimeoutError:
+            print("✗ FAIL: Snapshot timed out (this shouldn't happen)")
+
+        # Test 3: Real LinkedIn
+        print("\n--- Test 3: Real LinkedIn Feed ---")
+        await bridge.navigate(tab_id, "https://www.linkedin.com/feed", wait_until="load", timeout_ms=30000)
+        await asyncio.sleep(2)
+
+        count_result = await bridge.evaluate(tab_id, "(function() { return document.querySelectorAll('*').length; })()")
+        elem_count = count_result.get("result", 0)
+        print(f"LinkedIn DOM elements: {elem_count}")
+
+        start = time.perf_counter()
+        try:
+            snapshot = await bridge.snapshot(tab_id, timeout_s=15.0)
+            elapsed = time.perf_counter() - start
+            tree_len = len(snapshot.get("tree", ""))
+            truncated = "(truncated)" in snapshot.get("tree", "")
+            print(f"LinkedIn snapshot: {elapsed:.3f}s, {tree_len} chars, truncated={truncated}")
+
+            if elapsed < 5.0:
+                print("✓ PASS: LinkedIn snapshot fast enough")
+            elif elapsed < 15.0:
+                print("⚠ WARNING: LinkedIn snapshot slow but within timeout")
+            else:
+                print("✗ FAIL: LinkedIn snapshot too slow")
+
+        except asyncio.TimeoutError:
+            print("✗ FAIL: LinkedIn snapshot timed out")
+
+        await bridge.destroy_context(group_id)
+        print("\n✓ Context destroyed")
+
+    finally:
+        await bridge.stop()
+
+
+if __name__ == "__main__":
+    asyncio.run(test_huge_dom())
@@ -0,0 +1,190 @@
+#!/usr/bin/env python
+"""
+Test #13: SPA Navigation Events
+
+Symptom: wait_until="load" fires before content ready
+Root Cause: SPA uses client-side routing, no full page load
+Detection: URL changes but load event already fired
+Fix: Use wait_until="networkidle" or wait_for_selector
+"""
+
+import asyncio
+import sys
+import time
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / "tools" / "src"))
+
+from gcu.browser.bridge import BeelineBridge
+
+CONTEXT_NAME = "spa-nav-test"
+
+
+async def test_spa_navigation():
+    """Test navigation timing on SPA pages."""
+    print("=" * 70)
+    print("TEST #13: SPA Navigation Events")
+    print("=" * 70)
+
+    bridge = BeelineBridge()
+
+    try:
+        await bridge.start()
+
+        for i in range(10):
+            await asyncio.sleep(1)
+            if bridge.is_connected:
+                print("✓ Extension connected!")
+                break
+        else:
+            print("✗ Extension not connected")
+            return
+
+        context = await bridge.create_context(CONTEXT_NAME)
+        tab_id = context.get("tabId")
+        group_id = context.get("groupId")
+        print(f"✓ Created tab: {tab_id}")
+
+        # Create a test SPA
+        spa_html = """
+        <!DOCTYPE html>
+        <html>
+        <head>
+            <title>SPA Test</title>
+            <style>
+                nav a { margin-right: 10px; }
+                .page { padding: 20px; border: 1px solid #ccc; margin-top: 10px; }
+            </style>
+        </head>
+        <body>
+            <nav>
+                <a href="#home" onclick="navigate('home')">Home</a>
+                <a href="#about" onclick="navigate('about')">About</a>
+                <a href="#contact" onclick="navigate('contact')">Contact</a>
+            </nav>
+            <div id="app" class="page">
+                <h1>Loading...</h1>
+            </div>
+            <script>
+                // Simulate SPA routing
+                let currentPage = '';
+
+                async function navigate(page) {
+                    event.preventDefault();
+                    currentPage = page;
+
+                    // Show loading state
+                    document.getElementById('app').innerHTML = '<h1>Loading...</h1>';
+
+                    // Simulate async content loading (like real SPAs)
+                    await new Promise(r => setTimeout(r, 500));
+
+                    // Render content
+                    const content = {
+                        home: '<h1>Home Page</h1><p>Welcome!</p>'
+                            + '<button id="home-btn">Home Action</button>',
+                        about: '<h1>About Page</h1><p>Simulated SPA.</p>'
+                            + '<button id="about-btn">About Action</button>',
+                        contact: '<h1>Contact Page</h1>'
+                            + '<p>Contact us at test@example.com</p>'
+                            + '<button id="contact-btn">Contact Action</button>'
+                    };
+
+                    document.getElementById('app').innerHTML = content[page] || '<h1>404</h1>';
+                    window.location.hash = page;
+                }
+
+                // Initial load with delay (simulates SPA hydration)
+                setTimeout(() => {
+                    navigate('home');
+                }, 1000);
+
+                // Track for testing
+                window.pageLoads = [];
+                window.addEventListener('hashchange', () => {
+                    window.pageLoads.push(window.location.hash);
+                });
+            </script>
+        </body>
+        </html>
+        """
+
+        # Write to file and use file:// URL (data: URLs don't work well with extension)
+        test_file = Path("/tmp/spa_test.html")
+        test_file.write_text(spa_html.strip())
+        file_url = f"file://{test_file}"
+
+        # Test 1: wait_until="load" - may fire before content ready
+        print("\n--- Test 1: wait_until='load' ---")
+        start = time.perf_counter()
+        await bridge.navigate(tab_id, file_url, wait_until="load")
+        elapsed = time.perf_counter() - start
+        print(f"Navigation completed in {elapsed:.3f}s")
+
+        # Check content immediately
+        content = await bridge.evaluate(
+            tab_id,
+            "(function() { return document.getElementById('app').innerText; })()",
+        )
+        print(f"Content immediately after load: '{content.get('result', '')}'")
+
+        # Screenshot
+        screenshot = await bridge.screenshot(tab_id)
+        print(f"Screenshot: {len(screenshot.get('data', ''))} bytes")
+
+        # Wait for content
+        print("\n--- Waiting for content to hydrate ---")
+        await bridge.wait_for_selector(tab_id, "#home-btn", timeout_ms=5000)
+        print("✓ Content loaded")
+
+        # Check content after wait
+        content_after = await bridge.evaluate(
+            tab_id,
+            "(function() { return document.getElementById('app').innerText; })()",
+        )
+        print(f"Content after wait: '{content_after.get('result', '')}'")
+
+        # Test 2: SPA navigation (no full page load)
+        print("\n--- Test 2: SPA client-side navigation ---")
+
+        # Click "About" link
+        await bridge.click(tab_id, 'a[href="#about"]')
+        await asyncio.sleep(1)
+
+        # Check if content changed
+        about_content = await bridge.evaluate(
+            tab_id,
+            "(function() { return document.getElementById('app').innerText; })()",
+        )
+        print(f"Content after SPA nav: '{about_content.get('result', '')}'")
+
+        if "About Page" in about_content.get("result", ""):
+            print("✓ PASS: SPA navigation worked")
+        else:
+            print("✗ FAIL: SPA navigation didn't update content")
+
+        # Test 3: wait_until="networkidle"
+        print("\n--- Test 3: wait_until='networkidle' ---")
+        await bridge.navigate(tab_id, file_url, wait_until="networkidle", timeout_ms=10000)
+
+        # Check content immediately
+        content_networkidle = await bridge.evaluate(
+            tab_id,
+            "(function() { return document.getElementById('app').innerText; })()",
+        )
+        print(f"Content after networkidle: '{content_networkidle.get('result', '')}'")
+
+        if "Home Page" in content_networkidle.get("result", ""):
+            print("✓ PASS: networkidle waited for content")
+        else:
+            print("⚠ WARNING: networkidle didn't wait long enough")
+
+        await bridge.destroy_context(group_id)
+        print("\n✓ Context destroyed")
+
+    finally:
+        await bridge.stop()
+
+
+if __name__ == "__main__":
+    asyncio.run(test_spa_navigation())
@@ -0,0 +1,262 @@
+#!/usr/bin/env python
+"""
+Test #15: Screenshot Functionality
+
+Tests browser_screenshot across multiple scenarios:
+- Basic viewport screenshot
+- Full-page screenshot
+- Selector-based screenshot
+- Screenshot on complex DOM
+- Timeout handling
+
+Category: screenshot
+"""
+
+import asyncio
+import base64
+import sys
+import time
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / "tools" / "src"))
+
+from gcu.browser.bridge import BeelineBridge
+
+CONTEXT_NAME = "screenshot-test"
+
+SIMPLE_HTML = """<!DOCTYPE html>
+<html>
+<head><style>
+  body { margin: 0; background: #fff; font-family: sans-serif; }
+  h1 { color: #333; padding: 20px; }
+  .box { width: 200px; height: 100px; background: #4a90e2; margin: 20px; }
+  .long-content { height: 2000px; background: linear-gradient(blue, red); }
+</style></head>
+<body>
+  <h1 id="title">Screenshot Test Page</h1>
+  <div class="box" id="target-box">Target Box</div>
+  <div class="long-content"></div>
+</body>
+</html>"""
+
+
+def check_png(data: str) -> bool:
+    """Verify that base64 data decodes to a valid PNG."""
+    try:
+        raw = base64.b64decode(data)
+        return raw[:8] == b"\x89PNG\r\n\x1a\n"
+    except Exception:
+        return False
+
+
+async def test_basic_screenshot(bridge: BeelineBridge, tab_id: int, data_url: str):
+    print("\n--- Test 1: Basic Viewport Screenshot ---")
+    await bridge.navigate(tab_id, data_url, wait_until="load")
+    await asyncio.sleep(0.5)
+
+    start = time.perf_counter()
+    result = await bridge.screenshot(tab_id)
+    elapsed = time.perf_counter() - start
+
+    ok = result.get("ok")
+    data = result.get("data", "")
+    mime = result.get("mimeType", "")
+
+    print(f"  ok={ok}, mimeType={mime}, elapsed={elapsed:.3f}s")
+    print(f"  data length: {len(data)} chars")
+
+    if ok and data:
+        valid_png = check_png(data)
+        print(f"  valid PNG: {valid_png}")
+        if valid_png:
+            raw = base64.b64decode(data)
+            print(f"  PNG size: {len(raw)} bytes")
+            print("  ✓ PASS: Basic screenshot works")
+            return True
+        else:
+            print("  ✗ FAIL: Data is not a valid PNG")
+    else:
+        print(f"  ✗ FAIL: {result.get('error', 'no data')}")
+    return False
+
+
+async def test_full_page_screenshot(bridge: BeelineBridge, tab_id: int, data_url: str):
+    print("\n--- Test 2: Full Page Screenshot ---")
+    await bridge.navigate(tab_id, data_url, wait_until="load")
+    await asyncio.sleep(0.5)
+
+    viewport_result = await bridge.screenshot(tab_id, full_page=False)
+    full_result = await bridge.screenshot(tab_id, full_page=True)
+
+    v_data = viewport_result.get("data", "")
+    f_data = full_result.get("data", "")
+
+    if not v_data or not f_data:
+        print(f"  ✗ FAIL: viewport ok={viewport_result.get('ok')}, full ok={full_result.get('ok')}")
+        return False
+
+    v_size = len(base64.b64decode(v_data))
+    f_size = len(base64.b64decode(f_data))
+    print(f"  Viewport PNG: {v_size} bytes")
+    print(f"  Full page PNG: {f_size} bytes")
+
+    if f_size > v_size:
+        print("  ✓ PASS: Full page larger than viewport")
+        return True
+    else:
+        print("  ✗ FAIL: Full page not larger than viewport (may not capture long pages)")
+        return False
+
+
+async def test_selector_screenshot(bridge: BeelineBridge, tab_id: int, data_url: str):
+    print("\n--- Test 3: Selector Screenshot ---")
+    await bridge.navigate(tab_id, data_url, wait_until="load")
+    await asyncio.sleep(0.5)
+
+    # selector param exists in signature but may not be implemented
+    result = await bridge.screenshot(tab_id, selector="#target-box")
+
+    ok = result.get("ok")
+    data = result.get("data", "")
+
+    if ok and data:
+        # If implemented, the box screenshot should be smaller than a full viewport screenshot
+        full_result = await bridge.screenshot(tab_id)
+        full_data = full_result.get("data", "")
+
+        if full_data:
+            sel_size = len(base64.b64decode(data))
+            full_size = len(base64.b64decode(full_data))
+            print(f"  Selector PNG: {sel_size} bytes")
+            print(f"  Full page PNG: {full_size} bytes")
+            if sel_size < full_size:
+                print("  ✓ PASS: Selector screenshot smaller than full page")
+                return True
+            else:
+                print("  ⚠ WARNING: Selector screenshot not smaller (may be full page)")
+                return False
+    else:
+        print(f"  ⚠ NOT IMPLEMENTED: selector param ignored (returns full page) - error={result.get('error')}")
+        print("  NOTE: selector parameter exists in signature but is not used in implementation")
+        return False
+
+
+async def test_screenshot_url_metadata(bridge: BeelineBridge, tab_id: int):
+    print("\n--- Test 4: Screenshot URL Metadata ---")
+    await bridge.navigate(tab_id, "https://example.com", wait_until="load")
+    await asyncio.sleep(1)
+
+    result = await bridge.screenshot(tab_id)
+    url = result.get("url", "")
+    tab = result.get("tabId")
+
+    print(f"  url={url!r}, tabId={tab}")
+
+    if "example.com" in url:
+        print("  ✓ PASS: URL metadata captured correctly")
+        return True
+    else:
+        print(f"  ✗ FAIL: Expected example.com in URL, got {url!r}")
+        return False
+
+
+async def test_screenshot_timeout(bridge: BeelineBridge, tab_id: int, data_url: str):
+    print("\n--- Test 5: Timeout Handling ---")
+    await bridge.navigate(tab_id, data_url, wait_until="load")
+
+    # Very short timeout - likely still completes since simple page
+    start = time.perf_counter()
+    result = await bridge.screenshot(tab_id, timeout_s=0.001)
+    elapsed = time.perf_counter() - start
+
+    if not result.get("ok"):
+        err = result.get("error", "")
+        if "timed out" in err or "cancelled" in err:
+            print(f"  ✓ PASS: Timeout handled gracefully: {err!r}")
+            return True
+        else:
+            print(f"  ⚠ Fast enough to beat timeout: {err!r} in {elapsed:.3f}s")
+            return True  # Not a failure, just fast
+    else:
+        print(f"  ⚠ Screenshot completed before timeout ({elapsed:.3f}s) - too fast to test timeout")
+        return True  # Still ok, just very fast
+
+
+async def test_screenshot_complex_site(bridge: BeelineBridge, tab_id: int):
+    print("\n--- Test 6: Complex Site (example.com) ---")
+    await bridge.navigate(tab_id, "https://example.com", wait_until="load")
+    await asyncio.sleep(1)
+
+    start = time.perf_counter()
+    result = await bridge.screenshot(tab_id)
+    elapsed = time.perf_counter() - start
+
+    ok = result.get("ok")
+    data = result.get("data", "")
+
+    print(f"  ok={ok}, elapsed={elapsed:.3f}s, data_len={len(data)}")
+    if ok and check_png(data):
+        print("  ✓ PASS: Screenshot on real site works")
+        return True
+    else:
+        print(f"  ✗ FAIL: {result.get('error', 'bad data')}")
+        return False
+
+
+async def main():
+    print("=" * 70)
+    print("TEST #15: Screenshot Functionality")
+    print("=" * 70)
+
+    bridge = BeelineBridge()
+
+    try:
+        await bridge.start()
+
+        for i in range(10):
+            await asyncio.sleep(1)
+            if bridge.is_connected:
+                print("✓ Extension connected!")
+                break
+            print(f"Waiting for extension... ({i + 1}/10)")
+        else:
+            print("✗ Extension not connected. Ensure Chrome with Beeline extension is running.")
+            return
+
+        context = await bridge.create_context(CONTEXT_NAME)
+        tab_id = context.get("tabId")
+        group_id = context.get("groupId")
+        print(f"✓ Created tab: {tab_id}")
+
+        data_url = f"data:text/html;base64,{base64.b64encode(SIMPLE_HTML.encode()).decode()}"
+
+        results = {
+            "basic": await test_basic_screenshot(bridge, tab_id, data_url),
+            "full_page": await test_full_page_screenshot(bridge, tab_id, data_url),
+            "selector": await test_selector_screenshot(bridge, tab_id, data_url),
+            "metadata": await test_screenshot_url_metadata(bridge, tab_id),
+            "timeout": await test_screenshot_timeout(bridge, tab_id, data_url),
+            "complex_site": await test_screenshot_complex_site(bridge, tab_id),
+        }
+
+        print("\n" + "=" * 70)
+        print("SUMMARY")
+        print("=" * 70)
+        for name, passed in results.items():
+            status = "✓ PASS" if passed else "✗ FAIL"
+            print(f"  {status}: {name}")
+
+        passed_count = sum(1 for v in results.values() if v)
+        total = len(results)
+        print(f"\n  {passed_count}/{total} tests passed")
+
+        await bridge.destroy_context(group_id)
+        print("\n✓ Context destroyed")
+
+    finally:
+        await bridge.stop()
+        print("✓ Bridge stopped")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
@@ -0,0 +1,327 @@
+#!/usr/bin/env python
+"""
+Browser Edge Case Test Template
+
+This script provides a template for testing and debugging browser tool failures
+on specific websites. Use this to reproduce, isolate, and verify fixes.
+
+Usage:
+    1. Copy this file: cp test_case.py test_#[number]_[site].py
+    2. Fill in the CONFIG section with your test details
+    3. Run: uv run python test_#[number]_[site].py
+
+Example:
+    uv run python test_01_linkedin_scroll.py
+"""
+
+import asyncio
+import sys
+import time
+from pathlib import Path
+
+# Add tools to path
+sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / "tools" / "src"))
+
+from gcu.browser.bridge import BeelineBridge
+
+# ═══════════════════════════════════════════════════════════════════════════════
+# CONFIG: Fill in these values for your test case
+# ═══════════════════════════════════════════════════════════════════════════════
+
+TEST_CASE = {
+    "number": 1,
+    "name": "LinkedIn Nested Scroll Container",
+    "site": "https://www.linkedin.com/feed",
+    "simple_site": "https://example.com",
+    "category": "scroll",  # scroll, click, input, snapshot, navigation
+    "symptom": "scroll() returns success but page doesn't move",
+}
+
+BRIDGE_PORT = 9229
+CONTEXT_NAME = "edge-case-test"
+
+
+# ═══════════════════════════════════════════════════════════════════════════════
+# TEST FUNCTIONS
+# ═══════════════════════════════════════════════════════════════════════════════
+
+
+async def test_simple_site(bridge: BeelineBridge, tab_id: int) -> dict:
+    """Test that the tool works on a simple site (baseline)."""
+    print("\n--- Baseline Test (Simple Site) ---")
+
+    await bridge.navigate(tab_id, TEST_CASE["simple_site"], wait_until="load")
+    await asyncio.sleep(1)
+
+    # Adjust this based on category
+    if TEST_CASE["category"] == "scroll":
+        result = await bridge.scroll(tab_id, "down", 100)
+        print(f"  Scroll result: {result}")
+        return result
+    elif TEST_CASE["category"] == "click":
+        # Add click test
+        pass
+    elif TEST_CASE["category"] == "snapshot":
+        result = await bridge.snapshot(tab_id, timeout_s=5.0)
+        print(f"  Snapshot length: {len(result.get('tree', ''))}")
+        return result
+
+    return {"ok": True}
+
+
+async def test_problematic_site(bridge: BeelineBridge, tab_id: int) -> dict:
+    """Test the tool on the problematic site."""
+    print("\n--- Problem Site Test ---")
+
+    await bridge.navigate(tab_id, TEST_CASE["site"], wait_until="load", timeout_ms=30000)
+    await asyncio.sleep(2)
+
+    # Adjust this based on category
+    if TEST_CASE["category"] == "scroll":
+        # Get scroll positions before
+        before = await bridge.evaluate(
+            tab_id,
+            """
+            (function() {
+                const results = { window: { y: window.scrollY } };
+                document.querySelectorAll('*').forEach((el, i) => {
+                    const style = getComputedStyle(el);
+                    if ((style.overflowY === 'scroll' || style.overflowY === 'auto') &&
+                        el.scrollHeight > el.clientHeight) {
+                        results['el_' + i] = {
+                            tag: el.tagName,
+                            scrollTop: el.scrollTop,
+                            class: el.className.substring(0, 30)
+                        };
+                    }
+                });
+                return results;
+            })();
+        """,
+        )
+        print(f"  Before scroll: {before.get('result', {})}")
+
+        # Try to scroll
+        result = await bridge.scroll(tab_id, "down", 500)
+        print(f"  Scroll result: {result}")
+
+        await asyncio.sleep(1)
+
+        # Get scroll positions after
+        after = await bridge.evaluate(
+            tab_id,
+            """
+            (function() {
+                const results = { window: { y: window.scrollY } };
+                document.querySelectorAll('*').forEach((el, i) => {
+                    const style = getComputedStyle(el);
+                    if ((style.overflowY === 'scroll' || style.overflowY === 'auto') &&
+                        el.scrollHeight > el.clientHeight) {
+                        results['el_' + i] = {
+                            tag: el.tagName,
+                            scrollTop: el.scrollTop,
+                            class: el.className.substring(0, 30)
+                        };
+                    }
+                });
+                return results;
+            })();
+        """,
+        )
+        print(f"  After scroll: {after.get('result', {})}")
+
+        # Check if anything changed
+        before_data = before.get("result", {}) or {}
+        after_data = after.get("result", {}) or {}
+
+        changed = False
+        for key in after_data:
+            if key in before_data:
+                b_val = before_data[key].get("scrollTop", 0) if isinstance(before_data[key], dict) else 0
+                a_val = after_data[key].get("scrollTop", 0) if isinstance(after_data[key], dict) else 0
+                if a_val != b_val:
+                    print(f"  ✓ CHANGE DETECTED: {key} scrolled from {b_val} to {a_val}")
+                    changed = True
+
+        if not changed:
+            print("  ✗ NO CHANGE: Scroll did not affect any container")
+
+        return {"ok": changed, "scroll_result": result}
+
+    elif TEST_CASE["category"] == "snapshot":
+        start = time.perf_counter()
+        try:
+            result = await bridge.snapshot(tab_id, timeout_s=15.0)
+            elapsed = time.perf_counter() - start
+            tree_len = len(result.get("tree", ""))
+            print(f"  Snapshot completed in {elapsed:.2f}s, {tree_len} chars")
+            return {"ok": True, "elapsed": elapsed, "tree_length": tree_len}
+        except asyncio.TimeoutError:
+            print("  ✗ SNAPSHOT TIMED OUT")
+            return {"ok": False, "error": "timeout"}
+
+    return {"ok": True}
+
+
+async def detect_root_cause(bridge: BeelineBridge, tab_id: int) -> dict:
+    """Run detection scripts to identify the root cause."""
+    print("\n--- Root Cause Detection ---")
+
+    detections = {}
+
+    # Detection 1: Nested scrollable containers
+    scroll_check = await bridge.evaluate(
+        tab_id,
+        """
+        (function() {
+            const candidates = [];
+            document.querySelectorAll('*').forEach(el => {
+                const style = getComputedStyle(el);
+                if (style.overflow.includes('scroll') || style.overflow.includes('auto')) {
+                    const rect = el.getBoundingClientRect();
+                    if (rect.width > 100 && rect.height > 100) {
+                        candidates.push({
+                            tag: el.tagName,
+                            area: rect.width * rect.height,
+                            class: el.className.substring(0, 30)
+                        });
+                    }
+                }
+            });
+            candidates.sort((a, b) => b.area - a.area);
+            return {
+                count: candidates.length,
+                largest: candidates[0]
+            };
+        })();
+    """,
+    )
+    detections["nested_scroll"] = scroll_check.get("result", {})
+    print(f"  Nested scroll containers: {detections['nested_scroll']}")
+
+    # Detection 2: Shadow DOM
+    shadow_check = await bridge.evaluate(
+        tab_id,
+        """
+        (function() {
+            const withShadow = [];
+            document.querySelectorAll('*').forEach(el => {
+                if (el.shadowRoot) {
+                    withShadow.push(el.tagName);
+                }
+            });
+            return { count: withShadow.length, elements: withShadow.slice(0, 5) };
+        })();
+    """,
+    )
+    detections["shadow_dom"] = shadow_check.get("result", {})
+    print(f"  Shadow DOM: {detections['shadow_dom']}")
+
+    # Detection 3: iframes
+    iframe_check = await bridge.evaluate(
+        tab_id,
+        """
+        (function() {
+            const iframes = document.querySelectorAll('iframe');
+            return { count: iframes.length };
+        })();
+    """,
+    )
+    detections["iframes"] = iframe_check.get("result", {})
+    print(f"  iframes: {detections['iframes']}")
+
+    # Detection 4: DOM size
+    dom_check = await bridge.evaluate(
+        tab_id,
+        """
+        (function() {
+            return {
+                elements: document.querySelectorAll('*').length,
+                body_children: document.body.children.length
+            };
+        })();
+    """,
+    )
+    detections["dom_size"] = dom_check.get("result", {})
+    print(f"  DOM size: {detections['dom_size']}")
+
+    # Detection 5: Framework detection
+    framework_check = await bridge.evaluate(
+        tab_id,
+        """
+        (function() {
+            return {
+                react: !!document.querySelector('[data-reactroot], [data-reactid]'),
+                vue: !!document.querySelector('[data-v-]'),
+                angular: !!document.querySelector('[ng-app], [ng-version]')
+            };
+        })();
+    """,
+    )
+    detections["frameworks"] = framework_check.get("result", {})
+    print(f"  Frameworks: {detections['frameworks']}")
+
+    return detections
+
+
+# ═══════════════════════════════════════════════════════════════════════════════
+# MAIN
+# ═══════════════════════════════════════════════════════════════════════════════
+
+
+async def main():
+    print("=" * 70)
+    print(f"EDGE CASE TEST #{TEST_CASE['number']}: {TEST_CASE['name']}")
+    print("=" * 70)
+    print(f"Site: {TEST_CASE['site']}")
+    print(f"Category: {TEST_CASE['category']}")
+    print(f"Symptom: {TEST_CASE['symptom']}")
+
+    bridge = BeelineBridge()
+
+    try:
+        print("\n--- Starting Bridge ---")
+        await bridge.start()
+
+        # Wait for extension connection
+        for i in range(10):
+            await asyncio.sleep(1)
+            if bridge.is_connected:
+                print("✓ Extension connected!")
+                break
+            print(f"Waiting for extension... ({i + 1}/10)")
+        else:
+            print("✗ Extension not connected. Ensure Chrome with Beeline extension is running.")
+            return
+
+        # Create browser context
+        context = await bridge.create_context(CONTEXT_NAME)
+        tab_id = context.get("tabId")
+        group_id = context.get("groupId")
+        print(f"✓ Created tab: {tab_id}")
+
+        # Run tests
+        baseline_result = await test_simple_site(bridge, tab_id)
+        problem_result = await test_problematic_site(bridge, tab_id)
+        detections = await detect_root_cause(bridge, tab_id)
+
+        # Summary
+        print("\n" + "=" * 70)
+        print("SUMMARY")
+        print("=" * 70)
+        print(f"Baseline test: {'✓ PASS' if baseline_result.get('ok') else '✗ FAIL'}")
+        print(f"Problem test: {'✓ PASS' if problem_result.get('ok') else '✗ FAIL'}")
+        print(f"Root cause indicators: {list(k for k, v in detections.items() if v)}")
+
+        # Cleanup
+        print("\n--- Cleanup ---")
+        await bridge.destroy_context(group_id)
+        print("✓ Context destroyed")
+
+    finally:
+        await bridge.stop()
+        print("✓ Bridge stopped")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
@@ -1,953 +0,0 @@
---
-name: building-agents-construction
-description: Step-by-step guide for building goal-driven agents. Creates package structure, defines goals, adds nodes, connects edges, and finalizes agent class. Use when actively building an agent.
-license: Apache-2.0
-metadata:
-  author: hive
-  version: "1.0"
-  type: procedural
-  part_of: building-agents
-  requires: building-agents-core
---
-
-# Building Agents - Construction Process
-
-Step-by-step guide for building goal-driven agent packages.
-
-**Prerequisites:** Read `building-agents-core` for fundamental concepts.
-
-## CRITICAL: entry_points Format Reference
-
-**⚠️ Common Mistake Prevention:**
-
-The `entry_points` parameter in GraphSpec has a specific format that is easy to get wrong. This section exists because this mistake has caused production bugs.
-
-### Correct Format
-
-```python
-entry_points = {"start": "first-node-id"}
-```
-
-**Examples from working agents:**
-
-```python
-# From exports/outbound_sales_agent/agent.py
-entry_node = "lead-qualification"
-entry_points = {"start": "lead-qualification"}
-
-# From exports/support_ticket_agent/agent.py (FIXED)
-entry_node = "parse-ticket"
-entry_points = {"start": "parse-ticket"}
-```
-
-### WRONG Formats (DO NOT USE)
-
-```python
-# ❌ WRONG: Using node ID as key with input keys as value
-entry_points = {
-    "parse-ticket": ["ticket_content", "customer_id", "ticket_id"]
-}
-# Error: ValidationError: Input should be a valid string, got list
-
-# ❌ WRONG: Using set instead of dict
-entry_points = {"parse-ticket"}
-# Error: ValidationError: Input should be a valid dictionary, got set
-
-# ❌ WRONG: Missing "start" key
-entry_points = {"entry": "parse-ticket"}
-# Error: Graph execution fails, cannot find entry point
-```
-
-### Validation Check
-
-After writing graph configuration, ALWAYS validate:
-
-```python
-# Check 1: Must be a dict
-assert isinstance(entry_points, dict), f"entry_points must be dict, got {type(entry_points)}"
-
-# Check 2: Must have "start" key
-assert "start" in entry_points, f"entry_points must have 'start' key, got keys: {entry_points.keys()}"
-
-# Check 3: "start" value must match entry_node
-assert entry_points["start"] == entry_node, f"entry_points['start']={entry_points['start']} must match entry_node={entry_node}"
-
-# Check 4: Value must be a string (node ID)
-assert isinstance(entry_points["start"], str), f"entry_points['start'] must be string, got {type(entry_points['start'])}"
-```
-
-**Why this matters:** GraphSpec uses Pydantic validation. The wrong format causes ValidationError at runtime, which blocks all agent execution and tests. This bug is not caught until you try to run the agent.
-
-## Building Session Management with MCP
-
-**MANDATORY**: Use the agent-builder MCP server's BuildSession system for automatic bookkeeping and persistence.
-
-### Available MCP Session Tools
-
-```python
-# Create new session (call FIRST before building)
-mcp__agent-builder__create_session(name="Support Ticket Agent")
-# Returns: session_id, automatically sets as active session
-
-# Get current session status (use for progress tracking)
-status = mcp__agent-builder__get_session_status()
-# Returns: {
-#   "session_id": "build_20250122_...",
-#   "name": "Support Ticket Agent",
-#   "has_goal": true,
-#   "node_count": 5,
-#   "edge_count": 7,
-#   "nodes": ["parse-ticket", "categorize", ...],
-#   "edges": [("parse-ticket", "categorize"), ...]
-# }
-
-# List all saved sessions
-mcp__agent-builder__list_sessions()
-
-# Load previous session
-mcp__agent-builder__load_session_by_id(session_id="build_...")
-
-# Delete session
-mcp__agent-builder__delete_session(session_id="build_...")
-```
-
-### How MCP Session Works
-
-The BuildSession class (in `core/framework/mcp/agent_builder_server.py`) automatically:
- **Persists to disk** after every operation (`_save_session()` called automatically)
- **Tracks all components**: goal, nodes, edges, mcp_servers
- **Maintains timestamps**: created_at, last_modified
- **Stores to**: `~/.claude-code-agent-builder/sessions/`
-
-When you call MCP tools like:
- `mcp__agent-builder__set_goal(...)` - Automatically added to session.goal and saved
- `mcp__agent-builder__add_node(...)` - Automatically added to session.nodes and saved
- `mcp__agent-builder__add_edge(...)` - Automatically added to session.edges and saved
-
-**No manual bookkeeping needed** - the MCP server handles it all!
-
-### Show Progress to User
-
-```python
-# Get session status to show progress
-status = json.loads(mcp__agent-builder__get_session_status())
-
-print(f"\n📊 Building Progress:")
-print(f"   Session: {status['name']}")
-print(f"   Goal defined: {status['has_goal']}")
-print(f"   Nodes: {status['node_count']}")
-print(f"   Edges: {status['edge_count']}")
-print(f"   Nodes added: {', '.join(status['nodes'])}")
-```
-
-**Benefits:**
- Automatic persistence - survive crashes/restarts
- Clear audit trail - all operations logged
- Session resume - continue from where you left off
- Progress tracking built-in
- No manual state management needed
-
-## Step-by-Step Guide
-
-### Step 1: Create Building Session & Package Structure
-
-When user requests an agent, **immediately create MCP session and package**:
-
-```python
-# 0. FIRST: Create MCP building session
-agent_name = "technical_research_agent"  # snake_case
-session_result = mcp__agent-builder__create_session(name=agent_name.replace('_', ' ').title())
-session_id = json.loads(session_result)["session_id"]
-print(f"✅ Created building session: {session_id}")
-
-# 1. Create directory
-package_path = f"exports/{agent_name}"
-
-Bash(f"mkdir -p {package_path}/nodes")
-
-# 2. Write skeleton files
-Write(
-    file_path=f"{package_path}/__init__.py",
-    content='''"""
-Agent package - will be populated as build progresses.
-"""
-'''
-)
-
-Write(
-    file_path=f"{package_path}/nodes/__init__.py",
-    content='''"""Node definitions."""
-from framework.graph import NodeSpec
-
-# Nodes will be added here as they are approved
-
-__all__ = []
-'''
-)
-
-Write(
-    file_path=f"{package_path}/agent.py",
-    content='''"""Agent graph construction."""
-from framework.graph import EdgeSpec, EdgeCondition, Goal, SuccessCriterion, Constraint
-from framework.graph.edge import GraphSpec
-from framework.graph.executor import GraphExecutor
-from framework.runtime import Runtime
-from framework.llm.anthropic import AnthropicProvider
-from framework.runner.tool_registry import ToolRegistry
-from aden_tools.credentials import CredentialManager
-
-# Goal will be added when defined
-# Nodes will be imported from .nodes
-# Edges will be added when approved
-# Agent class will be created when graph is complete
-'''
-)
-
-Write(
-    file_path=f"{package_path}/config.py",
-    content='''"""Runtime configuration."""
-from dataclasses import dataclass
-
-@dataclass
-class RuntimeConfig:
-    model: str = "claude-sonnet-4-5-20250929"
-    temperature: float = 0.7
-    max_tokens: int = 4096
-
-default_config = RuntimeConfig()
-
-# Metadata will be added when goal is set
-'''
-)
-
-Write(
-    file_path=f"{package_path}/__main__.py",
-    content=CLI_TEMPLATE  # Full CLI template (see below)
-)
-```
-
-**Show user:**
-
-```
-✅ Package created: exports/technical_research_agent/
-📁 Files created:
-   - __init__.py (skeleton)
-   - __main__.py (CLI ready)
-   - agent.py (skeleton)
-   - nodes/__init__.py (empty)
-   - config.py (skeleton)
-
-You can open these files now and watch them grow as we build!
-```
-
-### Step 2: Define Goal
-
-Propose goal, get approval, **write immediately**:
-
-```python
-# After user approves goal...
-
-goal_code = f'''
-goal = Goal(
-    id="{goal_id}",
-    name="{name}",
-    description="{description}",
-    success_criteria=[
-        SuccessCriterion(
-            id="{sc.id}",
-            description="{sc.description}",
-            metric="{sc.metric}",
-            target="{sc.target}",
-            weight={sc.weight},
-        ),
-        # ... more criteria
-    ],
-    constraints=[
-        Constraint(
-            id="{c.id}",
-            description="{c.description}",
-            constraint_type="{c.constraint_type}",
-            category="{c.category}",
-        ),
-        # ... more constraints
-    ],
-)
-'''
-
-# Append to agent.py
-Read(f"{package_path}/agent.py")  # Must read first
-Edit(
-    file_path=f"{package_path}/agent.py",
-    old_string="# Goal will be added when defined",
-    new_string=f"# Goal definition\n{goal_code}"
-)
-
-# Write metadata to config.py
-metadata_code = f'''
-@dataclass
-class AgentMetadata:
-    name: str = "{name}"
-    version: str = "1.0.0"
-    description: str = "{description}"
-
-metadata = AgentMetadata()
-'''
-
-Read(f"{package_path}/config.py")
-Edit(
-    file_path=f"{package_path}/config.py",
-    old_string="# Metadata will be added when goal is set",
-    new_string=f"# Agent metadata\n{metadata_code}"
-)
-```
-
-**Show user:**
-
-```
-✅ Goal written to agent.py
-✅ Metadata written to config.py
-
-Open exports/technical_research_agent/agent.py to see the goal!
-```
-
-**Note:** Goal is automatically tracked in MCP session. Use `mcp__agent-builder__get_session_status()` to check progress.
-
-### Step 3: Add Nodes (Incremental)
-
-**⚠️ CRITICAL VALIDATION REQUIREMENTS:**
-
-Before adding any node with tools:
-1. Call `mcp__agent-builder__list_mcp_tools()` to discover available tools
-2. Verify each tool exists in the response
-3. If a tool doesn't exist, inform the user and ask how to proceed
-
-After writing each node:
-4. **MANDATORY**: Validate with `mcp__agent-builder__test_node()` before proceeding
-5. **MANDATORY**: Check MCP session status to track progress
-6. Only proceed to next node after validation passes
-
-For each node, **write immediately after approval**:
-
-```python
-# After user approves node...
-
-node_code = f'''
-{node_id.replace('-', '_')}_node = NodeSpec(
-    id="{node_id}",
-    name="{name}",
-    description="{description}",
-    node_type="{node_type}",
-    input_keys={input_keys},
-    output_keys={output_keys},
-    system_prompt="""\\
-{system_prompt}
-""",
-    tools={tools},
-    max_retries={max_retries},
-)
-
-'''
-
-# Append to nodes/__init__.py
-Read(f"{package_path}/nodes/__init__.py")
-Edit(
-    file_path=f"{package_path}/nodes/__init__.py",
-    old_string="__all__ = []",
-    new_string=f"{node_code}\n__all__ = []"
-)
-
-# Update __all__ exports
-all_node_names = [n.replace('-', '_') + '_node' for n in approved_nodes]
-all_exports = f"__all__ = {all_node_names}"
-
-Edit(
-    file_path=f"{package_path}/nodes/__init__.py",
-    old_string="__all__ = []",
-    new_string=all_exports
-)
-```
-
-**Show user after each node:**
-
-```
-✅ Added analyze_request_node to nodes/__init__.py
-📊 Progress: 1/6 nodes added
-
-Open exports/technical_research_agent/nodes/__init__.py to see it!
-```
-
-**Repeat for each node.** User watches the file grow.
-
-#### MANDATORY: Validate Each Node with MCP Tools
-
-After writing EVERY node, you MUST validate before proceeding:
-
-```python
-# Node is already written to file. Now VALIDATE IT (REQUIRED):
-validation_result = json.loads(mcp__agent-builder__test_node(
-    node_id="analyze-request",
-    test_input='{"query": "test query"}',
-    mock_llm_response='{"analysis": "mock output"}'
-))
-
-# Check validation result
-if validation_result["valid"]:
-    # Show user validation passed
-    print(f"✅ Node validation passed: analyze-request")
-
-    # Show session progress
-    status = json.loads(mcp__agent-builder__get_session_status())
-    print(f"📊 Session progress: {status['node_count']} nodes added")
-else:
-    # STOP - Do not proceed until fixed
-    print(f"❌ Node validation FAILED:")
-    for error in validation_result["errors"]:
-        print(f"   - {error}")
-    print("⚠️ Must fix node before proceeding to next component")
-    # Ask user how to proceed
-```
-
-**CRITICAL:** Do NOT proceed to the next node until validation passes. Bugs caught here prevent wasted work later.
-
-### Step 4: Connect Edges
-
-After all nodes approved, add edges:
-
-```python
-# Generate edges code
-edges_code = "edges = [\n"
-for edge in approved_edges:
-    edges_code += f'''    EdgeSpec(
-        id="{edge.id}",
-        source="{edge.source}",
-        target="{edge.target}",
-        condition=EdgeCondition.{edge.condition.upper()},
-'''
-    if edge.condition_expr:
-        edges_code += f'        condition_expr="{edge.condition_expr}",\n'
-    edges_code += f'        priority={edge.priority},\n'
-    edges_code += '    ),\n'
-edges_code += "]\n"
-
-# Write to agent.py
-Read(f"{package_path}/agent.py")
-Edit(
-    file_path=f"{package_path}/agent.py",
-    old_string="# Edges will be added when approved",
-    new_string=f"# Edge definitions\n{edges_code}"
-)
-
-# Write entry points and terminal nodes
-# ⚠️ CRITICAL: entry_points format must be {"start": "node_id"}
-# Common mistake: {"node_id": ["input_keys"]} is WRONG
-# Correct format: {"start": "first-node-id"}
-# Reference: See exports/outbound_sales_agent/agent.py for example
-
-graph_config = f'''
-# Graph configuration
-entry_node = "{entry_node_id}"
-entry_points = {{"start": "{entry_node_id}"}}  # CRITICAL: Must be {{"start": "node-id"}}
-pause_nodes = {pause_nodes}
-terminal_nodes = {terminal_nodes}
-
-# Collect all nodes
-nodes = [
-    {', '.join(node_names)},
-]
-'''
-
-Edit(
-    file_path=f"{package_path}/agent.py",
-    old_string="# Agent class will be created when graph is complete",
-    new_string=graph_config
-)
-```
-
-**Show user:**
-
-```
-✅ Edges written to agent.py
-✅ Graph configuration added
-
-5 edges connecting 6 nodes
-```
-
-#### MANDATORY: Validate Graph Structure
-
-After writing edges, you MUST validate before proceeding to finalization:
-
-```python
-# Edges already written to agent.py. Now VALIDATE STRUCTURE (REQUIRED):
-graph_validation = json.loads(mcp__agent-builder__validate_graph())
-
-# Check for structural issues
-if graph_validation["valid"]:
-    print("✅ Graph structure validated successfully")
-
-    # Show session summary
-    status = json.loads(mcp__agent-builder__get_session_status())
-    print(f"   - Nodes: {status['node_count']}")
-    print(f"   - Edges: {status['edge_count']}")
-    print(f"   - Entry point: {entry_node_id}")
-else:
-    print("❌ Graph validation FAILED:")
-    for error in graph_validation["errors"]:
-        print(f"   ERROR: {error}")
-    print("\n⚠️ Must fix graph structure before finalizing agent")
-    # Ask user how to proceed
-
-# Additional validation: Check entry_points format
-if not isinstance(entry_points, dict):
-    print("❌ CRITICAL ERROR: entry_points must be a dict")
-    print(f"   Current value: {entry_points} (type: {type(entry_points)})")
-    print("   Correct format: {'start': 'node-id'}")
-    # STOP - This is the mistake that caused the support_ticket_agent bug
-
-if entry_points.get("start") != entry_node_id:
-    print("❌ CRITICAL ERROR: entry_points['start'] must match entry_node")
-    print(f"   entry_points: {entry_points}")
-    print(f"   entry_node: {entry_node_id}")
-    print("   They must be consistent!")
-```
-
-**CRITICAL:** Do NOT proceed to Step 5 (finalization) until graph validation passes. This checkpoint prevents structural bugs from reaching production.
-
-### Step 5: Finalize Agent Class
-
-**Pre-flight checks before finalization:**
-
-```python
-# MANDATORY: Verify all validations passed before finalizing
-print("\n🔍 Pre-finalization Checklist:")
-
-# Get current session status
-status = json.loads(mcp__agent-builder__get_session_status())
-
-checks_passed = True
-
-# Check 1: Goal defined
-if not status["has_goal"]:
-    print("❌ No goal defined")
-    checks_passed = False
-else:
-    print(f"✅ Goal defined: {status['goal_name']}")
-
-# Check 2: Nodes added
-if status["node_count"] == 0:
-    print("❌ No nodes added")
-    checks_passed = False
-else:
-    print(f"✅ {status['node_count']} nodes added: {', '.join(status['nodes'])}")
-
-# Check 3: Edges added
-if status["edge_count"] == 0:
-    print("❌ No edges added")
-    checks_passed = False
-else:
-    print(f"✅ {status['edge_count']} edges added")
-
-# Check 4: Entry points format correct
-if not isinstance(entry_points, dict) or "start" not in entry_points:
-    print("❌ CRITICAL: entry_points format incorrect")
-    print(f"   Current: {entry_points}")
-    print("   Required: {'start': 'node-id'}")
-    checks_passed = False
-else:
-    print(f"✅ Entry points valid: {entry_points}")
-
-if not checks_passed:
-    print("\n⚠️ CANNOT PROCEED to finalization until all checks pass")
-    print("   Fix the issues above first")
-    # Ask user how to proceed or stop here
-    return
-
-print("\n✅ All pre-flight checks passed - proceeding to finalization\n")
-```
-
-Write the agent class:
-
-````python
-agent_class_code = f'''
-
-class {agent_class_name}:
-    """
-    {agent_description}
-    """
-
-    def __init__(self, config=None):
-        self.config = config or default_config
-        self.goal = goal
-        self.nodes = nodes
-        self.edges = edges
-        self.entry_node = entry_node
-        self.entry_points = entry_points
-        self.pause_nodes = pause_nodes
-        self.terminal_nodes = terminal_nodes
-        self.executor = None
-
-    def _create_executor(self, mock_mode=False):
-        """Create executor instance."""
-        import tempfile
-        from pathlib import Path
-
-        storage_path = Path(tempfile.gettempdir()) / "{agent_name}"
-        storage_path.mkdir(parents=True, exist_ok=True)
-
-        runtime = Runtime(storage_path=storage_path)
-        tool_registry = ToolRegistry()
-
-        llm = None
-        if not mock_mode:
-            creds = CredentialManager()
-            if creds.is_available("anthropic"):
-                api_key = creds.get("anthropic")
-                llm = AnthropicProvider(api_key=api_key, model=self.config.model)
-
-        graph = GraphSpec(
-            id="{agent_name}-graph",
-            goal_id=self.goal.id,
-            version="1.0.0",
-            entry_node=self.entry_node,
-            entry_points=self.entry_points,
-            terminal_nodes=self.terminal_nodes,
-            pause_nodes=self.pause_nodes,
-            nodes=self.nodes,
-            edges=self.edges,
-            default_model=self.config.model,
-            max_tokens=self.config.max_tokens,
-        )
-
-        self.executor = GraphExecutor(
-            runtime=runtime,
-            llm=llm,
-            tools=list(tool_registry.get_tools().values()),
-            tool_executor=tool_registry.get_executor(),
-        )
-
-        self.graph = graph
-        return self.executor
-
-    async def run(self, context: dict, mock_mode=False, session_state=None):
-        """Run the agent."""
-        executor = self._create_executor(mock_mode=mock_mode)
-        result = await executor.execute(
-            graph=self.graph,
-            goal=self.goal,
-            input_data=context,
-            session_state=session_state,
-        )
-        return result
-
-    def info(self):
-        """Get agent information."""
-        return {{
-            "name": metadata.name,
-            "version": metadata.version,
-            "description": metadata.description,
-            "goal": {{
-                "name": self.goal.name,
-                "description": self.goal.description,
-            }},
-            "nodes": [n.id for n in self.nodes],
-            "edges": [e.id for e in self.edges],
-            "entry_node": self.entry_node,
-            "pause_nodes": self.pause_nodes,
-            "terminal_nodes": self.terminal_nodes,
-        }}
-
-    def validate(self):
-        """Validate agent structure."""
-        errors = []
-        warnings = []
-
-        node_ids = {{node.id for node in self.nodes}}
-        for edge in self.edges:
-            if edge.source not in node_ids:
-                errors.append(f"Edge {{edge.id}}: source '{{edge.source}}' not found")
-            if edge.target not in node_ids:
-                errors.append(f"Edge {{edge.id}}: target '{{edge.target}}' not found")
-
-        if self.entry_node not in node_ids:
-            errors.append(f"Entry node '{{self.entry_node}}' not found")
-
-        return {{
-            "valid": len(errors) == 0,
-            "errors": errors,
-            "warnings": warnings,
-        }}
-
-
-# Create default instance
-default_agent = {agent_class_name}()
-'''
-
-# Append agent class
-Read(f"{package_path}/agent.py")
-Edit(
-    file_path=f"{package_path}/agent.py",
-    old_string="nodes = [",
-    new_string=f"nodes = [\n{agent_class_code}"
-)
-
-# Finalize __init__.py exports
-init_content = f'''"""
-{agent_description}
-"""
-
-from .agent import {agent_class_name}, default_agent, goal, nodes, edges
-from .config import RuntimeConfig, AgentMetadata, default_config, metadata
-
-__version__ = "1.0.0"
-
-__all__ = [
-    "{agent_class_name}",
-    "default_agent",
-    "goal",
-    "nodes",
-    "edges",
-    "RuntimeConfig",
-    "AgentMetadata",
-    "default_config",
-    "metadata",
-]
-'''
-
-Read(f"{package_path}/__init__.py")
-Edit(
-    file_path=f"{package_path}/__init__.py",
-    old_string='"""',
-    new_string=init_content,
-    replace_all=True
-)
-
-# Write README
-readme_content = f'''# {agent_name.replace('_', ' ').title()}
-
-{agent_description}
-
-## Usage
-
-```bash
-# Show agent info
-python -m {agent_name} info
-
-# Validate structure
-python -m {agent_name} validate
-
-# Run agent
-python -m {agent_name} run --input '{{"key": "value"}}'
-
-# Interactive shell
-python -m {agent_name} shell
-````
-
-## As Python Module
-
-```python
-from {agent_name} import default_agent
-
-result = await default_agent.run({{"key": "value"}})
-```
-
-## Structure
-
- `agent.py` - Goal, edges, graph construction
- `nodes/__init__.py` - Node definitions
- `config.py` - Runtime configuration
- `__main__.py` - CLI interface
-  '''
-
-Write(
-file_path=f"{package_path}/README.md",
-content=readme_content
-)
-
-```
-
-**Show user:**
-
-```
-
-✅ Agent class written to agent.py
-✅ Package exports finalized in __init__.py
-✅ README.md generated
-
-🎉 Agent complete: exports/technical_research_agent/
-
-Commands:
-python -m technical_research_agent info
-python -m technical_research_agent validate
-python -m technical_research_agent run --input '{"topic": "..."}'
-```
-
-**Final session summary:**
-
-```python
-# Show final MCP session status
-status = json.loads(mcp__agent-builder__get_session_status())
-
-print("\n📊 Build Session Summary:")
-print(f"   Session ID: {status['session_id']}")
-print(f"   Agent: {status['name']}")
-print(f"   Goal: {status['goal_name']}")
-print(f"   Nodes: {status['node_count']}")
-print(f"   Edges: {status['edge_count']}")
-print(f"   MCP Servers: {status['mcp_servers_count']}")
-print("\n✅ Agent construction complete with full validation")
-print(f"\nSession saved to: ~/.claude-code-agent-builder/sessions/{status['session_id']}.json")
-````
-
-## CLI Template
-
-```python
-CLI_TEMPLATE = '''"""
-CLI entry point for agent.
-"""
-
-import asyncio
-import json
-import sys
-import click
-
-from .agent import default_agent
-
-@click.group()
-@click.version_option(version="1.0.0")
-def cli():
-    """Agent CLI."""
-    pass
-
-@cli.command()
-@click.option("--input", "-i", "input_json", type=str, required=True)
-@click.option("--mock", is_flag=True, help="Run in mock mode")
-@click.option("--quiet", "-q", is_flag=True, help="Only output result JSON")
-def run(input_json, mock, quiet):
-    """Execute the agent."""
-    try:
-        context = json.loads(input_json)
-    except json.JSONDecodeError as e:
-        click.echo(f"Error parsing input JSON: {e}", err=True)
-        sys.exit(1)
-
-    if not quiet:
-        click.echo(f"Running agent with input: {json.dumps(context)}")
-
-    result = asyncio.run(default_agent.run(context, mock_mode=mock))
-
-    output_data = {
-        "success": result.success,
-        "steps_executed": result.steps_executed,
-        "output": result.output,
-    }
-    if result.error:
-        output_data["error"] = result.error
-    if result.paused_at:
-        output_data["paused_at"] = result.paused_at
-
-    click.echo(json.dumps(output_data, indent=2, default=str))
-    sys.exit(0 if result.success else 1)
-
-@cli.command()
-@click.option("--json", "output_json", is_flag=True)
-def info(output_json):
-    """Show agent information."""
-    info_data = default_agent.info()
-    if output_json:
-        click.echo(json.dumps(info_data, indent=2))
-    else:
-        click.echo(f"Agent: {info_data['name']}")
-        click.echo(f"Description: {info_data['description']}")
-        click.echo(f"Nodes: {len(info_data['nodes'])}")
-        click.echo(f"Edges: {len(info_data['edges'])}")
-
-@cli.command()
-def validate():
-    """Validate agent structure."""
-    validation = default_agent.validate()
-    if validation["valid"]:
-        click.echo("✓ Agent is valid")
-    else:
-        click.echo("✗ Agent has errors:")
-        for error in validation["errors"]:
-            click.echo(f"  ERROR: {error}")
-    sys.exit(0 if validation["valid"] else 1)
-
-@cli.command()
-def shell():
-    """Interactive agent session."""
-    click.echo("Interactive mode - enter JSON input:")
-    # ... implementation
-
-if __name__ == "__main__":
-    cli()
-'''
-````
-
-## Testing During Build
-
-After nodes are added:
-
-```python
-# Test individual node
-python -c "
-from exports.my_agent.nodes import analyze_request_node
-print(analyze_request_node.id)
-print(analyze_request_node.input_keys)
-"
-
-# Validate current state
-PYTHONPATH=core:exports python -m my_agent validate
-
-# Show info
-PYTHONPATH=core:exports python -m my_agent info
-```
-
-## Approval Pattern
-
-Use AskUserQuestion for all approvals:
-
-```python
-response = AskUserQuestion(
-    questions=[{
-        "question": "Do you approve this [component]?",
-        "header": "Approve",
-        "options": [
-            {
-                "label": "✓ Approve (Recommended)",
-                "description": "Component looks good, proceed"
-            },
-            {
-                "label": "✗ Reject & Modify",
-                "description": "Need to make changes"
-            },
-            {
-                "label": "⏸ Pause & Review",
-                "description": "Need more time to review"
-            }
-        ],
-        "multiSelect": false
-    }]
-)
-```
-
-## Next Steps
-
-After completing construction:
-
-**If agent structure complete:**
-
- Validate: `python -m agent_name validate`
- Test basic execution: `python -m agent_name info`
- Proceed to testing-agent skill for comprehensive tests
-
-**If implementation needed:**
-
- Check for STATUS.md or IMPLEMENTATION_GUIDE.md in agent directory
- May need Python functions or MCP tool integration
-
-## Related Skills
-
- **building-agents-core** - Fundamental concepts
- **building-agents-patterns** - Best practices and examples
- **testing-agent** - Test and validate completed agents
- **agent-workflow** - Complete workflow orchestrator
@@ -1,303 +0,0 @@
---
-name: building-agents-core
-description: Core concepts for goal-driven agents - architecture, node types, tool discovery, and workflow overview. Use when starting agent development or need to understand agent fundamentals.
-license: Apache-2.0
-metadata:
-  author: hive
-  version: "1.0"
-  type: foundational
-  part_of: building-agents
---
-
-# Building Agents - Core Concepts
-
-Foundational knowledge for building goal-driven agents as Python packages.
-
-## Architecture: Python Services (Not JSON Configs)
-
-Agents are built as Python packages:
-
-```
-exports/my_agent/
-├── __init__.py          # Package exports
-├── __main__.py          # CLI (run, info, validate, shell)
-├── agent.py             # Graph construction (goal, edges, agent class)
-├── nodes/__init__.py    # Node definitions (NodeSpec)
-├── config.py            # Runtime config
-└── README.md            # Documentation
-```
-
-**Key Principle: Agent is visible and editable during build**
-
- ✅ Files created immediately as components are approved
- ✅ User can watch files grow in their editor
- ✅ No session state - just direct file writes
- ✅ No "export" step - agent is ready when build completes
-
-## Core Concepts
-
-### Goal
-
-Success criteria and constraints (written to agent.py)
-
-```python
-goal = Goal(
-    id="research-goal",
-    name="Technical Research Agent",
-    description="Research technical topics thoroughly",
-    success_criteria=[
-        SuccessCriterion(
-            id="completeness",
-            description="Cover all aspects of topic",
-            metric="coverage_score",
-            target=">=0.9",
-            weight=0.4,
-        ),
-        # ... more criteria
-    ],
-    constraints=[
-        Constraint(
-            id="accuracy",
-            description="All information must be verified",
-            constraint_type="hard",
-            category="quality",
-        ),
-        # ... more constraints
-    ],
-)
-```
-
-### Node
-
-Unit of work (written to nodes/__init__.py)
-
-**Node Types:**
-
- `llm_generate` - Text generation, parsing
- `llm_tool_use` - Actions requiring tools
- `router` - Conditional branching
- `function` - Deterministic operations
-
-```python
-search_node = NodeSpec(
-    id="search-web",
-    name="Search Web",
-    description="Search for information online",
-    node_type="llm_tool_use",
-    input_keys=["query"],
-    output_keys=["search_results"],
-    system_prompt="Search the web for: {query}",
-    tools=["web_search"],
-    max_retries=3,
-)
-```
-
-### Edge
-
-Connection between nodes (written to agent.py)
-
-**Edge Conditions:**
-
- `on_success` - Proceed if node succeeds
- `on_failure` - Handle errors
- `always` - Always proceed
- `conditional` - Based on expression
-
-```python
-EdgeSpec(
-    id="search-to-analyze",
-    source="search-web",
-    target="analyze-results",
-    condition=EdgeCondition.ON_SUCCESS,
-    priority=1,
-)
-```
-
-### Pause/Resume
-
-Multi-turn conversations
-
- **Pause nodes** - Stop execution, wait for user input
- **Resume entry points** - Continue from pause with user's response
-
-```python
-# Example pause/resume configuration
-pause_nodes = ["request-clarification"]
-entry_points = {
-    "start": "analyze-request",
-    "request-clarification_resume": "process-clarification"
-}
-```
-
-## Tool Discovery & Validation
-
-**CRITICAL:** Before adding a node with tools, you MUST verify the tools exist.
-
-Tools are provided by MCP servers. Never assume a tool exists - always discover dynamically.
-
-### Step 1: Register MCP Server (if not already done)
-
-```python
-mcp__agent-builder__add_mcp_server(
-    name="aden-tools",
-    transport="stdio",
-    command="python",
-    args='["mcp_server.py", "--stdio"]',
-    cwd="../aden-tools"
-)
-```
-
-### Step 2: Discover Available Tools
-
-```python
-# List all tools from all registered servers
-mcp__agent-builder__list_mcp_tools()
-
-# Or list tools from a specific server
-mcp__agent-builder__list_mcp_tools(server_name="aden-tools")
-```
-
-This returns available tools with their descriptions and parameters:
-
-```json
-{
-  "success": true,
-  "tools_by_server": {
-    "aden-tools": [
-      {
-        "name": "web_search",
-        "description": "Search the web...",
-        "parameters": ["query"]
-      },
-      {
-        "name": "web_scrape",
-        "description": "Scrape a URL...",
-        "parameters": ["url"]
-      }
-    ]
-  },
-  "total_tools": 14
-}
-```
-
-### Step 3: Validate Before Adding Nodes
-
-Before writing a node with `tools=[...]`:
-
-1. Call `list_mcp_tools()` to get available tools
-2. Check each tool in your node exists in the response
-3. If a tool doesn't exist:
-   - **DO NOT proceed** with the node
-   - Inform the user: "The tool 'X' is not available. Available tools are: ..."
-   - Ask if they want to use an alternative or proceed without the tool
-
-### Tool Validation Anti-Patterns
-
-❌ **Never assume a tool exists** - always call `list_mcp_tools()` first
-❌ **Never write a node with unverified tools** - validate before writing
-❌ **Never silently drop tools** - if a tool doesn't exist, inform the user
-❌ **Never guess tool names** - use exact names from discovery response
-
-### Example Validation Flow
-
-```python
-# 1. User requests: "Add a node that searches the web"
-# 2. Discover available tools
-tools_response = mcp__agent-builder__list_mcp_tools()
-
-# 3. Check if web_search exists
-available = [t["name"] for tools in tools_response["tools_by_server"].values() for t in tools]
-if "web_search" not in available:
-    # Inform user and ask how to proceed
-    print("❌ 'web_search' not available. Available tools:", available)
-else:
-    # Proceed with node creation
-    # ...
-```
-
-## Workflow Overview: Incremental File Construction
-
-```
-1. CREATE PACKAGE → mkdir + write skeletons
-2. DEFINE GOAL → Write to agent.py + config.py
-3. FOR EACH NODE:
-   - Propose design
-   - User approves
-   - Write to nodes/__init__.py IMMEDIATELY ← FILE WRITTEN
-   - (Optional) Validate with test_node ← MCP VALIDATION
-   - User can open file and see it
-4. CONNECT EDGES → Update agent.py ← FILE WRITTEN
-   - (Optional) Validate with validate_graph ← MCP VALIDATION
-5. FINALIZE → Write agent class to agent.py ← FILE WRITTEN
-6. DONE - Agent ready at exports/my_agent/
-```
-
-**Files written immediately. MCP tools optional for validation/testing bookkeeping.**
-
-### The Key Difference
-
-**OLD (Bad):**
-
-```
-MCP add_node → Session State → MCP add_node → Session State → ...
-                                                                ↓
-                                                     MCP export_graph
-                                                                ↓
-                                                       Files appear
-```
-
-**NEW (Good):**
-
-```
-Write node to file → (Optional: MCP test_node) → Write node to file → ...
-       ↓                                               ↓
-  File visible                                    File visible
-  immediately                                     immediately
-```
-
-**Bottom line:** Use Write/Edit for construction, MCP for validation if needed.
-
-## When to Use This Skill
-
-Use building-agents-core when:
- Starting a new agent project and need to understand fundamentals
- Need to understand agent architecture before building
- Want to validate tool availability before proceeding
- Learning about node types, edges, and graph execution
-
-**Next Steps:**
- Ready to build? → Use `building-agents-construction` skill
- Need patterns and examples? → Use `building-agents-patterns` skill
-
-## MCP Tools for Validation
-
-After writing files, optionally use MCP tools for validation:
-
-**test_node** - Validate node configuration with mock inputs
-```python
-mcp__agent-builder__test_node(
-    node_id="search-web",
-    test_input='{"query": "test query"}',
-    mock_llm_response='{"results": "mock output"}'
-)
-```
-
-**validate_graph** - Check graph structure
-```python
-mcp__agent-builder__validate_graph()
-# Returns: unreachable nodes, missing connections, etc.
-```
-
-**create_session** - Track session state for bookkeeping
-```python
-mcp__agent-builder__create_session(session_name="my-build")
-```
-
-**Key Point:** Files are written FIRST. MCP tools are for validation only.
-
-## Related Skills
-
- **building-agents-construction** - Step-by-step building process
- **building-agents-patterns** - Best practices and examples
- **agent-workflow** - Complete workflow orchestrator
- **testing-agent** - Test and validate completed agents
@@ -1,497 +0,0 @@
---
-name: building-agents-patterns
-description: Best practices, patterns, and examples for building goal-driven agents. Includes pause/resume architecture, hybrid workflows, anti-patterns, and handoff to testing. Use when optimizing agent design.
-license: Apache-2.0
-metadata:
-  author: hive
-  version: "1.0"
-  type: reference
-  part_of: building-agents
---
-
-# Building Agents - Patterns & Best Practices
-
-Design patterns, examples, and best practices for building robust goal-driven agents.
-
-**Prerequisites:** Complete agent structure using `building-agents-construction`.
-
-## Practical Example: Hybrid Workflow
-
-How to build a node using both direct file writes and optional MCP validation:
-
-```python
-# 1. WRITE TO FILE FIRST (Primary - makes it visible)
-node_code = '''
-search_node = NodeSpec(
-    id="search-web",
-    node_type="llm_tool_use",
-    input_keys=["query"],
-    output_keys=["search_results"],
-    system_prompt="Search the web for: {query}",
-    tools=["web_search"],
-)
-'''
-
-Edit(
-    file_path="exports/research_agent/nodes/__init__.py",
-    old_string="# Nodes will be added here",
-    new_string=node_code
-)
-
-print("✅ Added search_node to nodes/__init__.py")
-print("📁 Open exports/research_agent/nodes/__init__.py to see it!")
-
-# 2. OPTIONALLY VALIDATE WITH MCP (Secondary - bookkeeping)
-validation = mcp__agent-builder__test_node(
-    node_id="search-web",
-    test_input='{"query": "python tutorials"}',
-    mock_llm_response='{"search_results": [...mock results...]}'
-)
-
-print(f"✓ Validation: {validation['success']}")
-```
-
-**User experience:**
-
- Immediately sees node in their editor (from step 1)
- Gets validation feedback (from step 2)
- Can edit the file directly if needed
-
-This combines visibility (files) with validation (MCP tools).
-
-## Pause/Resume Architecture
-
-For agents needing multi-turn conversations with user interaction:
-
-### Basic Pause/Resume Flow
-
-```python
-# Define pause nodes - execution stops at these nodes
-pause_nodes = ["request-clarification", "await-approval"]
-
-# Define entry points - where to resume from each pause
-entry_points = {
-    "start": "analyze-request",  # Initial entry
-    "request-clarification_resume": "process-clarification",  # Resume from clarification
-    "await-approval_resume": "execute-action",  # Resume from approval
-}
-```
-
-### Example: Multi-Turn Research Agent
-
-```python
-# Nodes
-nodes = [
-    NodeSpec(id="analyze-request", ...),
-    NodeSpec(id="request-clarification", ...),  # PAUSE NODE
-    NodeSpec(id="process-clarification", ...),
-    NodeSpec(id="generate-results", ...),
-    NodeSpec(id="await-approval", ...),  # PAUSE NODE
-    NodeSpec(id="execute-action", ...),
-]
-
-# Edges with resume flows
-edges = [
-    EdgeSpec(
-        id="analyze-to-clarify",
-        source="analyze-request",
-        target="request-clarification",
-        condition=EdgeCondition.CONDITIONAL,
-        condition_expr="needs_clarification == true",
-    ),
-    # When resumed, goes to process-clarification
-    EdgeSpec(
-        id="clarify-to-process",
-        source="request-clarification",
-        target="process-clarification",
-        condition=EdgeCondition.ALWAYS,
-    ),
-    EdgeSpec(
-        id="results-to-approval",
-        source="generate-results",
-        target="await-approval",
-        condition=EdgeCondition.ALWAYS,
-    ),
-    # When resumed, goes to execute-action
-    EdgeSpec(
-        id="approval-to-execute",
-        source="await-approval",
-        target="execute-action",
-        condition=EdgeCondition.ALWAYS,
-    ),
-]
-
-# Configuration
-pause_nodes = ["request-clarification", "await-approval"]
-entry_points = {
-    "start": "analyze-request",
-    "request-clarification_resume": "process-clarification",
-    "await-approval_resume": "execute-action",
-}
-```
-
-### Running Pause/Resume Agents
-
-```python
-# Initial run - will pause at first pause node
-result1 = await agent.run(
-    context={"query": "research topic"},
-    session_state=None
-)
-
-# Check if paused
-if result1.paused_at:
-    print(f"Paused at: {result1.paused_at}")
-
-    # Resume with user input
-    result2 = await agent.run(
-        context={"user_response": "clarification details"},
-        session_state=result1.session_state  # Pass previous state
-    )
-```
-
-## Anti-Patterns
-
-### What NOT to Do
-
-❌ **Don't rely on `export_graph`** - Write files immediately, not at end
-```python
-# BAD: Building in session state, exporting at end
-mcp__agent-builder__add_node(...)
-mcp__agent-builder__add_node(...)
-mcp__agent-builder__export_graph()  # Files appear only now
-
-# GOOD: Writing files immediately
-Write(file_path="...", content=node_code)  # File visible now
-Write(file_path="...", content=node_code)  # File visible now
-```
-
-❌ **Don't hide code in session** - Write to files as components approved
-```python
-# BAD: Accumulating changes invisibly
-session.add_component(component1)
-session.add_component(component2)
-# User can't see anything yet
-
-# GOOD: Incremental visibility
-Edit(file_path="...", ...)  # User sees change 1
-Edit(file_path="...", ...)  # User sees change 2
-```
-
-❌ **Don't wait to write files** - Agent visible from first step
-```python
-# BAD: Building everything before writing
-design_all_nodes()
-design_all_edges()
-write_everything_at_once()
-
-# GOOD: Write as you go
-write_package_structure()  # Visible
-write_goal()  # Visible
-write_node_1()  # Visible
-write_node_2()  # Visible
-```
-
-❌ **Don't batch everything** - Write incrementally
-```python
-# BAD: Batching all nodes
-nodes = [design_node_1(), design_node_2(), ...]
-write_all_nodes(nodes)
-
-# GOOD: One at a time with user feedback
-write_node_1()  # User approves
-write_node_2()  # User approves
-write_node_3()  # User approves
-```
-
-### MCP Tools - Correct Usage
-
-**MCP tools OK for:**
-✅ `test_node` - Validate node configuration with mock inputs
-✅ `validate_graph` - Check graph structure
-✅ `create_session` - Track session state for bookkeeping
-✅ Other validation tools
-
-**Just don't:** Use MCP as the primary construction method or rely on export_graph
-
-## Best Practices
-
-### 1. Show Progress After Each Write
-
-```python
-# After writing a node
-print("✅ Added analyze_request_node to nodes/__init__.py")
-print("📊 Progress: 1/6 nodes added")
-print("📁 Open exports/my_agent/nodes/__init__.py to see it!")
-```
-
-### 2. Let User Open Files During Build
-
-```python
-# Encourage file inspection
-print("✅ Goal written to agent.py")
-print("")
-print("💡 Tip: Open exports/my_agent/agent.py in your editor to see the goal!")
-```
-
-### 3. Write Incrementally - One Component at a Time
-
-```python
-# Good flow
-write_package_structure()
-show_user("Package created")
-
-write_goal()
-show_user("Goal written")
-
-for node in nodes:
-    get_approval(node)
-    write_node(node)
-    show_user(f"Node {node.id} written")
-```
-
-### 4. Test As You Build
-
-```python
-# After adding several nodes
-print("💡 You can test current state with:")
-print("  PYTHONPATH=core:exports python -m my_agent validate")
-print("  PYTHONPATH=core:exports python -m my_agent info")
-```
-
-### 5. Keep User Informed
-
-```python
-# Clear status updates
-print("🔨 Creating package structure...")
-print("✅ Package created: exports/my_agent/")
-print("")
-print("📝 Next: Define agent goal")
-```
-
-## Continuous Monitoring Agents
-
-For agents that run continuously without terminal nodes:
-
-```python
-# No terminal nodes - loops forever
-terminal_nodes = []
-
-# Workflow loops back to start
-edges = [
-    EdgeSpec(id="monitor-to-check", source="monitor", target="check-condition"),
-    EdgeSpec(id="check-to-wait", source="check-condition", target="wait"),
-    EdgeSpec(id="wait-to-monitor", source="wait", target="monitor"),  # Loop
-]
-
-# Entry node only
-entry_node = "monitor"
-entry_points = {"start": "monitor"}
-pause_nodes = []
-```
-
-**Example: File Monitor**
-
-```python
-nodes = [
-    NodeSpec(id="list-files", ...),
-    NodeSpec(id="check-new-files", node_type="router", ...),
-    NodeSpec(id="process-files", ...),
-    NodeSpec(id="wait-interval", node_type="function", ...),
-]
-
-edges = [
-    EdgeSpec(id="list-to-check", source="list-files", target="check-new-files"),
-    EdgeSpec(
-        id="check-to-process",
-        source="check-new-files",
-        target="process-files",
-        condition=EdgeCondition.CONDITIONAL,
-        condition_expr="new_files_count > 0",
-    ),
-    EdgeSpec(
-        id="check-to-wait",
-        source="check-new-files",
-        target="wait-interval",
-        condition=EdgeCondition.CONDITIONAL,
-        condition_expr="new_files_count == 0",
-    ),
-    EdgeSpec(id="process-to-wait", source="process-files", target="wait-interval"),
-    EdgeSpec(id="wait-to-list", source="wait-interval", target="list-files"),  # Loop back
-]
-
-terminal_nodes = []  # No terminal - runs forever
-```
-
-## Complex Routing Patterns
-
-### Multi-Condition Router
-
-```python
-router_node = NodeSpec(
-    id="decision-router",
-    node_type="router",
-    input_keys=["analysis_result"],
-    output_keys=["decision"],
-    system_prompt="""
-    Based on the analysis result, decide the next action:
-    - If confidence > 0.9: route to "execute"
-    - If 0.5 <= confidence <= 0.9: route to "review"
-    - If confidence < 0.5: route to "clarify"
-
-    Return: {"decision": "execute|review|clarify"}
-    """,
-)
-
-# Edges for each route
-edges = [
-    EdgeSpec(
-        id="router-to-execute",
-        source="decision-router",
-        target="execute-action",
-        condition=EdgeCondition.CONDITIONAL,
-        condition_expr="decision == 'execute'",
-        priority=1,
-    ),
-    EdgeSpec(
-        id="router-to-review",
-        source="decision-router",
-        target="human-review",
-        condition=EdgeCondition.CONDITIONAL,
-        condition_expr="decision == 'review'",
-        priority=2,
-    ),
-    EdgeSpec(
-        id="router-to-clarify",
-        source="decision-router",
-        target="request-clarification",
-        condition=EdgeCondition.CONDITIONAL,
-        condition_expr="decision == 'clarify'",
-        priority=3,
-    ),
-]
-```
-
-## Error Handling Patterns
-
-### Graceful Failure with Fallback
-
-```python
-# Primary node with error handling
-nodes = [
-    NodeSpec(id="api-call", max_retries=3, ...),
-    NodeSpec(id="fallback-cache", ...),
-    NodeSpec(id="report-error", ...),
-]
-
-edges = [
-    # Success path
-    EdgeSpec(
-        id="api-success",
-        source="api-call",
-        target="process-results",
-        condition=EdgeCondition.ON_SUCCESS,
-    ),
-    # Fallback on failure
-    EdgeSpec(
-        id="api-to-fallback",
-        source="api-call",
-        target="fallback-cache",
-        condition=EdgeCondition.ON_FAILURE,
-        priority=1,
-    ),
-    # Report if fallback also fails
-    EdgeSpec(
-        id="fallback-to-error",
-        source="fallback-cache",
-        target="report-error",
-        condition=EdgeCondition.ON_FAILURE,
-        priority=1,
-    ),
-]
-```
-
-## Performance Optimization
-
-### Parallel Node Execution
-
-```python
-# Use multiple edges from same source for parallel execution
-edges = [
-    EdgeSpec(
-        id="start-to-search1",
-        source="start",
-        target="search-source-1",
-        condition=EdgeCondition.ALWAYS,
-    ),
-    EdgeSpec(
-        id="start-to-search2",
-        source="start",
-        target="search-source-2",
-        condition=EdgeCondition.ALWAYS,
-    ),
-    EdgeSpec(
-        id="start-to-search3",
-        source="start",
-        target="search-source-3",
-        condition=EdgeCondition.ALWAYS,
-    ),
-    # Converge results
-    EdgeSpec(
-        id="search1-to-merge",
-        source="search-source-1",
-        target="merge-results",
-    ),
-    EdgeSpec(
-        id="search2-to-merge",
-        source="search-source-2",
-        target="merge-results",
-    ),
-    EdgeSpec(
-        id="search3-to-merge",
-        source="search-source-3",
-        target="merge-results",
-    ),
-]
-```
-
-## Handoff to Testing
-
-When agent is complete, transition to testing phase:
-
-```python
-print("""
-✅ Agent complete: exports/my_agent/
-
-Next steps:
-1. Switch to testing-agent skill
-2. Generate and approve tests
-3. Run evaluation
-4. Debug any failures
-
-Command: "Test the agent at exports/my_agent/"
-""")
-```
-
-### Pre-Testing Checklist
-
-Before handing off to testing-agent:
-
- [ ] Agent structure validates: `python -m agent_name validate`
- [ ] All nodes defined in nodes/__init__.py
- [ ] All edges connect valid nodes
- [ ] Entry node specified
- [ ] Agent can be imported: `from exports.agent_name import default_agent`
- [ ] README.md with usage instructions
- [ ] CLI commands work (info, validate)
-
-## Related Skills
-
- **building-agents-core** - Fundamental concepts
- **building-agents-construction** - Step-by-step building
- **testing-agent** - Test and validate agents
- **agent-workflow** - Complete workflow orchestrator
-
---
-
-**Remember: Agent is actively constructed, visible the whole time. No hidden state. No surprise exports. Just transparent, incremental file building.**
@@ -0,0 +1,225 @@
+# Integration Test Reporting Skill
+
+Run the Level 2 dummy agent integration test suite and produce a detailed HTML report with per-test input → outcome analysis.
+
+## Trigger
+
+User wants to run integration tests and see results:
+- `/test-reporting`
+- `/test-reporting test_component_queen_live.py`
+- `/test-reporting --all`
+
+## SOP: Running Tests
+
+### Step 1: Select Scope
+
+If the user provides a specific test file or pattern, use it. Otherwise run the full suite.
+
+```bash
+# Full suite
+cd core && echo "1" | uv run python tests/dummy_agents/run_all.py --interactive 2>&1
+
+# Specific file (requires manual provider setup)
+cd core && uv run python -c "
+import sys
+sys.path.insert(0, '.')
+from tests.dummy_agents.run_all import detect_available
+from tests.dummy_agents.conftest import set_llm_selection
+
+avail = detect_available()
+claude = [p for p in avail if 'Claude Code' in p['name']]
+if not claude:
+    avail_names = [p['name'] for p in avail]
+    raise RuntimeError(f'No Claude Code subscription. Available: {avail_names}')
+provider = claude[0]
+set_llm_selection(
+    model=provider['model'],
+    api_key=provider['api_key'],
+    extra_headers=provider.get('extra_headers'),
+    api_base=provider.get('api_base'),
+)
+
+import pytest
+sys.exit(pytest.main([
+    'tests/dummy_agents/TEST_FILE_HERE',
+    '-v', '--override-ini=asyncio_mode=auto', '--no-header', '--tb=long',
+    '--log-cli-level=WARNING', '--junitxml=/tmp/hive_test_results.xml',
+]))
+"
+```
+
+### Step 2: Collect Results
+
+After the test run completes, collect:
+1. **JUnit XML** from `--junitxml` output (if available)
+2. **stdout/stderr** from the run
+3. **Summary table** from `run_all.py` output (the Unicode table)
+
+### Step 3: Generate HTML Report
+
+Write the report to `/tmp/hive_integration_test_report.html`.
+
+The report MUST include these sections:
+
+#### Header
+- Run timestamp (ISO 8601)
+- Provider used (model name, source)
+- Total tests / passed / failed / skipped
+- Total wall-clock time
+- Overall verdict: PASS (all green) or FAIL (with count)
+
+#### Per-Test Table
+
+For EVERY test (not just failures), include a row with:
+
+| Column | Description |
+|--------|-------------|
+| Component | Test file grouping (e.g., `component_queen_live`) |
+| Test Name | Function name (e.g., `test_queen_starts_in_planning_without_worker`) |
+| Status | PASS / FAIL / SKIP / ERROR with color badge |
+| Duration | Wall-clock seconds |
+| What | One-line description of what the test verifies |
+| How | How it works (setup → action → assertion) |
+| Why | Why this test matters (what bug/behavior it catches) |
+| Input | The input data or configuration (graph spec, initial prompt, phase, etc.) |
+| Expected Outcome | What the test asserts |
+| Actual Outcome | What actually happened (PASS: matches expected / FAIL: actual vs expected) |
+| Failure Detail | For failures only: full traceback + diagnosis |
+
+#### What / How / Why Descriptions
+
+These MUST be derived from the test function's docstring and code. Read each test file to extract:
+- **What**: From the docstring first line
+- **How**: From the test body (what fixtures, what graph, what assertions)
+- **Why**: From the docstring body or "Why this matters" section in the test module
+
+Use these mappings for the component test files:
+
+```
+test_component_llm.py          → "LLM Provider" — streaming, tool calling, tokens
+test_component_tools.py        → "Tool Registry + MCP" — connection, execution
+test_component_event_loop.py   → "EventLoopNode" — iteration, output, stall
+test_component_edges.py        → "Edge Evaluation" — conditional, priority
+test_component_conversation.py → "Conversation Persistence" — storage, cursor
+test_component_escalation.py   → "Escalation Flow" — worker→queen signaling
+test_component_continuous.py   → "Continuous Mode" — conversation threading
+test_component_queen.py        → "Queen Phase (Unit)" — phase state, tools, events
+test_component_queen_live.py   → "Queen Phase (Live)" — real queen, real LLM
+test_component_queen_state_machine.py → "Queen State Machine" — edge cases, races
+test_component_worker_comms.py → "Worker Communication" — events, data flow
+test_component_strict_outcomes.py → "Strict Outcomes" — exact path, output, quality
+```
+
+#### HTML Template
+
+Use this structure:
+
+```html
+<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="utf-8">
+<title>Hive Integration Test Report — {timestamp}</title>
+<style>
+  :root { --pass: #22c55e; --fail: #ef4444; --skip: #f59e0b; --bg: #0f172a; --surface: #1e293b; --text: #e2e8f0; --muted: #94a3b8; --border: #334155; }
+  * { box-sizing: border-box; margin: 0; padding: 0; }
+  body { font-family: 'SF Mono', 'Fira Code', monospace; background: var(--bg); color: var(--text); padding: 2rem; line-height: 1.6; }
+  h1, h2, h3 { font-weight: 600; }
+  h1 { font-size: 1.5rem; margin-bottom: 1rem; }
+  h2 { font-size: 1.2rem; margin: 2rem 0 1rem; border-bottom: 1px solid var(--border); padding-bottom: 0.5rem; }
+  .summary { display: grid; grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); gap: 1rem; margin-bottom: 2rem; }
+  .card { background: var(--surface); padding: 1rem; border-radius: 8px; border: 1px solid var(--border); }
+  .card .label { color: var(--muted); font-size: 0.75rem; text-transform: uppercase; }
+  .card .value { font-size: 1.5rem; font-weight: 700; margin-top: 0.25rem; }
+  .card .value.pass { color: var(--pass); }
+  .card .value.fail { color: var(--fail); }
+  table { width: 100%; border-collapse: collapse; font-size: 0.8rem; }
+  th { background: var(--surface); position: sticky; top: 0; text-align: left; padding: 0.5rem; border-bottom: 2px solid var(--border); color: var(--muted); text-transform: uppercase; font-size: 0.7rem; }
+  td { padding: 0.5rem; border-bottom: 1px solid var(--border); vertical-align: top; }
+  tr:hover { background: rgba(255,255,255,0.03); }
+  .badge { display: inline-block; padding: 2px 8px; border-radius: 4px; font-size: 0.7rem; font-weight: 700; }
+  .badge.pass { background: rgba(34,197,94,0.2); color: var(--pass); }
+  .badge.fail { background: rgba(239,68,68,0.2); color: var(--fail); }
+  .badge.skip { background: rgba(245,158,11,0.2); color: var(--skip); }
+  .detail { background: #1a1a2e; padding: 0.75rem; border-radius: 4px; margin-top: 0.5rem; font-size: 0.75rem; white-space: pre-wrap; overflow-x: auto; max-height: 200px; overflow-y: auto; }
+  .component-header { background: var(--surface); padding: 0.75rem 0.5rem; font-weight: 600; font-size: 0.85rem; }
+  .meta { color: var(--muted); font-size: 0.75rem; }
+</style>
+</head>
+<body>
+<h1>Hive Integration Test Report</h1>
+<p class="meta">Generated: {timestamp} | Provider: {provider} | Duration: {duration}s</p>
+
+<div class="summary">
+  <div class="card"><div class="label">Total</div><div class="value">{total}</div></div>
+  <div class="card"><div class="label">Passed</div><div class="value pass">{passed}</div></div>
+  <div class="card"><div class="label">Failed</div><div class="value fail">{failed}</div></div>
+  <div class="card"><div class="label">Verdict</div><div class="value {verdict_class}">{verdict}</div></div>
+</div>
+
+<h2>Test Results</h2>
+<table>
+<thead>
+<tr>
+  <th>Component</th>
+  <th>Test</th>
+  <th>Status</th>
+  <th>Time</th>
+  <th>What</th>
+  <th>Input → Expected → Actual</th>
+</tr>
+</thead>
+<tbody>
+<!-- For each test: -->
+<tr>
+  <td>{component}</td>
+  <td>{test_name}</td>
+  <td><span class="badge {status_class}">{status}</span></td>
+  <td>{duration}s</td>
+  <td>{what_description}</td>
+  <td>
+    <strong>Input:</strong> {input_description}<br>
+    <strong>Expected:</strong> {expected_outcome}<br>
+    <strong>Actual:</strong> {actual_outcome}
+    <!-- If failed: -->
+    <div class="detail">{failure_traceback}</div>
+  </td>
+</tr>
+</tbody>
+</table>
+
+<h2>Failure Analysis</h2>
+<!-- Only if there are failures -->
+<p>For each failure, provide:</p>
+<ul>
+  <li><strong>Root cause:</strong> Why it failed</li>
+  <li><strong>Impact:</strong> What this means for the system</li>
+  <li><strong>Suggested fix:</strong> How to address it</li>
+</ul>
+
+</body>
+</html>
+```
+
+### Step 4: Output
+
+1. Write the HTML file to `/tmp/hive_integration_test_report.html`
+2. Print the file path so the user can open it
+3. Print a concise summary to the terminal:
+   ```
+   Test Report: /tmp/hive_integration_test_report.html
+   Result: 74/76 PASSED (2 failures)
+   Failures:
+     - parallel_merge::test_parallel_disjoint_output_keys
+     - worker::test_worker_timestamped_note_artifact
+   ```
+
+## Key Rules
+
+1. ALWAYS use `--junitxml` when running pytest to get structured results
+2. ALWAYS read the test source files to populate What/How/Why columns — do not guess
+3. For Input/Expected/Actual, extract from the test's graph spec, assertions, and result
+4. Color-code everything: green for pass, red for fail, amber for skip
+5. Include the full traceback for failures in a scrollable `<div class="detail">`
+6. Group tests by component (file name) with a visual separator
+7. The report must be self-contained HTML (no external CSS/JS dependencies)
@@ -1,348 +0,0 @@
-# Example: Testing a YouTube Research Agent
-
-This example walks through testing a YouTube research agent that finds relevant videos based on a topic.
-
-## Prerequisites
-
- Agent built with building-agents skill at `exports/youtube-research/`
- Goal defined with success criteria and constraints
-
-## Step 1: Load the Goal
-
-First, load the goal that was defined during the Goal stage:
-
-```json
-{
-    "id": "youtube-research",
-    "name": "YouTube Research Agent",
-    "description": "Find relevant YouTube videos on a given topic",
-    "success_criteria": [
-        {
-            "id": "find_videos",
-            "description": "Find 3-5 relevant videos",
-            "metric": "video_count",
-            "target": "3-5",
-            "weight": 1.0
-        },
-        {
-            "id": "relevance",
-            "description": "Videos must be relevant to the topic",
-            "metric": "relevance_score",
-            "target": ">0.8",
-            "weight": 0.8
-        }
-    ],
-    "constraints": [
-        {
-            "id": "api_limits",
-            "description": "Must not exceed YouTube API rate limits",
-            "constraint_type": "hard",
-            "category": "technical"
-        },
-        {
-            "id": "content_safety",
-            "description": "Must filter out inappropriate content",
-            "constraint_type": "hard",
-            "category": "safety"
-        }
-    ]
-}
-```
-
-## Step 2: Generate Constraint Tests
-
-During the Goal stage (or early Eval), generate tests for constraints:
-
-```python
-result = generate_constraint_tests(
-    goal_id="youtube-research",
-    goal_json='<goal JSON above>'
-)
-```
-
-**Generated tests (awaiting approval):**
-
-```
-┌─────────────────────────────────────────────────────────────────┐
-│ Generated Constraint Tests (2 tests)                             │
-├─────────────────────────────────────────────────────────────────┤
-│ [1/2] test_constraint_api_limits_respected                       │
-│       Constraint: api_limits                                     │
-│       Confidence: 88%                                            │
-│                                                                  │
-│       def test_constraint_api_limits_respected(agent):           │
-│           """Verify API rate limits are not exceeded."""         │
-│           import time                                            │
-│           for i in range(10):                                    │
-│               result = agent.run({"topic": f"test_{i}"})         │
-│               time.sleep(0.1)                                    │
-│           # Should complete without rate limit errors            │
-│           assert "rate limit" not in str(result).lower()         │
-│                                                                  │
-│       [a]pprove  [r]eject  [e]dit  [s]kip                       │
-├─────────────────────────────────────────────────────────────────┤
-│ [2/2] test_constraint_content_safety_filter                      │
-│       Constraint: content_safety                                 │
-│       Confidence: 91%                                            │
-│                                                                  │
-│       def test_constraint_content_safety_filter(agent):          │
-│           """Verify inappropriate content is filtered."""        │
-│           result = agent.run({"topic": "general topic"})         │
-│           for video in result.videos:                            │
-│               assert video.safe_for_work is True                 │
-│               assert video.age_restricted is False               │
-│                                                                  │
-│       [a]pprove  [r]eject  [e]dit  [s]kip                       │
-└─────────────────────────────────────────────────────────────────┘
-```
-
-## Step 3: Approve Constraint Tests
-
-Review and approve each test:
-
-```python
-result = approve_tests(
-    goal_id="youtube-research",
-    approvals='[
-        {"test_id": "test_constraint_api_001", "action": "approve"},
-        {"test_id": "test_constraint_content_001", "action": "approve"}
-    ]'
-)
-```
-
-## Step 4: Generate Success Criteria Tests
-
-After the agent is built, generate success criteria tests:
-
-```python
-result = generate_success_tests(
-    goal_id="youtube-research",
-    goal_json='<goal JSON>',
-    node_names="search_node,filter_node,rank_node,format_node",
-    tool_names="youtube_search,video_details,channel_info"
-)
-```
-
-**Generated tests (awaiting approval):**
-
-```
-┌─────────────────────────────────────────────────────────────────┐
-│ Generated Success Criteria Tests (4 tests)                       │
-├─────────────────────────────────────────────────────────────────┤
-│ [1/4] test_find_videos_happy_path                               │
-│       Criteria: find_videos                                      │
-│       Confidence: 95%                                            │
-│                                                                  │
-│       def test_find_videos_happy_path(agent):                    │
-│           """Test finding videos for a common topic."""          │
-│           result = agent.run({"topic": "machine learning"})      │
-│           assert result.success                                  │
-│           assert 3 <= len(result.videos) <= 5                    │
-│           assert all(v.title for v in result.videos)             │
-│           assert all(v.video_id for v in result.videos)          │
-│                                                                  │
-│       [a]pprove  [r]eject  [e]dit  [s]kip                       │
-├─────────────────────────────────────────────────────────────────┤
-│ [2/4] test_find_videos_minimum_boundary                          │
-│       Criteria: find_videos                                      │
-│       Confidence: 87%                                            │
-│                                                                  │
-│       def test_find_videos_minimum_boundary(agent):              │
-│           """Test at minimum threshold (3 videos)."""            │
-│           result = agent.run({"topic": "niche topic xyz"})       │
-│           assert len(result.videos) >= 3                         │
-│                                                                  │
-│       [a]pprove  [r]eject  [e]dit  [s]kip                       │
-├─────────────────────────────────────────────────────────────────┤
-│ [3/4] test_relevance_score_threshold                             │
-│       Criteria: relevance                                        │
-│       Confidence: 92%                                            │
-│                                                                  │
-│       def test_relevance_score_threshold(agent):                 │
-│           """Test relevance scoring meets threshold."""          │
-│           result = agent.run({"topic": "python programming"})    │
-│           for video in result.videos:                            │
-│               assert video.relevance_score > 0.8                 │
-│                                                                  │
-│       [a]pprove  [r]eject  [e]dit  [s]kip                       │
-├─────────────────────────────────────────────────────────────────┤
-│ [4/4] test_find_videos_no_results_graceful                       │
-│       Criteria: find_videos                                      │
-│       Confidence: 84%                                            │
-│                                                                  │
-│       def test_find_videos_no_results_graceful(agent):           │
-│           """Test graceful handling of no results."""            │
-│           result = agent.run({"topic": "xyznonexistent123"})     │
-│           # Should not crash, return empty or message            │
-│           assert result.videos == [] or result.message           │
-│                                                                  │
-│       [a]pprove  [r]eject  [e]dit  [s]kip                       │
-└─────────────────────────────────────────────────────────────────┘
-```
-
-## Step 5: Approve Success Criteria Tests
-
-```python
-result = approve_tests(
-    goal_id="youtube-research",
-    approvals='[
-        {"test_id": "test_success_001", "action": "approve"},
-        {"test_id": "test_success_002", "action": "approve"},
-        {"test_id": "test_success_003", "action": "approve"},
-        {"test_id": "test_success_004", "action": "approve"}
-    ]'
-)
-```
-
-## Step 6: Run All Tests
-
-Execute all approved tests:
-
-```python
-result = run_tests(
-    goal_id="youtube-research",
-    agent_path="exports/youtube-research",
-    test_types='["all"]',
-    parallel=4
-)
-```
-
-**Results:**
-
-```json
-{
-    "goal_id": "youtube-research",
-    "overall_passed": false,
-    "summary": {
-        "total": 6,
-        "passed": 5,
-        "failed": 1,
-        "pass_rate": "83.3%"
-    },
-    "duration_ms": 4521,
-    "results": [
-        {"test_id": "test_constraint_api_001", "passed": true, "duration_ms": 1234},
-        {"test_id": "test_constraint_content_001", "passed": true, "duration_ms": 456},
-        {"test_id": "test_success_001", "passed": true, "duration_ms": 789},
-        {"test_id": "test_success_002", "passed": true, "duration_ms": 654},
-        {"test_id": "test_success_003", "passed": true, "duration_ms": 543},
-        {"test_id": "test_success_004", "passed": false, "duration_ms": 845,
-         "error_category": "IMPLEMENTATION_ERROR",
-         "error_message": "TypeError: 'NoneType' object has no attribute 'videos'"}
-    ]
-}
-```
-
-## Step 7: Debug the Failed Test
-
-```python
-result = debug_test(
-    goal_id="youtube-research",
-    test_id="test_success_004"
-)
-```
-
-**Debug Output:**
-
-```json
-{
-    "test_id": "test_success_004",
-    "test_name": "test_find_videos_no_results_graceful",
-    "input": {"topic": "xyznonexistent123"},
-    "expected": "Empty list or message",
-    "actual": {"error": "TypeError: 'NoneType' object has no attribute 'videos'"},
-    "passed": false,
-    "error_message": "TypeError: 'NoneType' object has no attribute 'videos'",
-    "error_category": "IMPLEMENTATION_ERROR",
-    "stack_trace": "Traceback (most recent call last):\n  File \"filter_node.py\", line 42\n    for video in result.videos:\nTypeError: 'NoneType' object has no attribute 'videos'",
-    "logs": [
-        {"timestamp": "2026-01-20T10:00:01", "node": "search_node", "level": "INFO", "msg": "Searching for: xyznonexistent123"},
-        {"timestamp": "2026-01-20T10:00:02", "node": "search_node", "level": "WARNING", "msg": "No results found"},
-        {"timestamp": "2026-01-20T10:00:02", "node": "filter_node", "level": "ERROR", "msg": "NoneType error"}
-    ],
-    "runtime_data": {
-        "execution_path": ["start", "search_node", "filter_node"],
-        "node_outputs": {
-            "search_node": null
-        }
-    },
-    "suggested_fix": "Add null check in filter_node before accessing .videos attribute",
-    "iteration_guidance": {
-        "stage": "Agent",
-        "action": "Fix the code in nodes/edges",
-        "restart_required": false,
-        "description": "The goal is correct, but filter_node doesn't handle null results from search_node."
-    }
-}
-```
-
-## Step 8: Iterate Based on Category
-
-Since this is an **IMPLEMENTATION_ERROR**, we:
-
-1. **Don't restart** the Goal → Agent → Eval flow
-2. **Fix the agent** using building-agents skill:
-   - Modify `filter_node` to handle null results
-3. **Re-run Eval** (tests only)
-
-### Fix in building-agents:
-
-```python
-# Update the filter_node to handle null
-add_node(
-    node_id="filter_node",
-    name="Filter Node",
-    description="Filter and rank videos",
-    node_type="function",
-    input_keys=["search_results"],
-    output_keys=["filtered_videos"],
-    system_prompt="""
-    Filter videos by relevance.
-    IMPORTANT: Handle case where search_results is None or empty.
-    Return empty list if no results.
-    """
-)
-```
-
-### Re-export and re-test:
-
-```python
-# Re-export the fixed agent
-export_graph(path="exports/youtube-research")
-
-# Re-run tests
-result = run_tests(
-    goal_id="youtube-research",
-    agent_path="exports/youtube-research",
-    test_types='["all"]'
-)
-```
-
-**Updated Results:**
-
-```json
-{
-    "goal_id": "youtube-research",
-    "overall_passed": true,
-    "summary": {
-        "total": 6,
-        "passed": 6,
-        "failed": 0,
-        "pass_rate": "100.0%"
-    }
-}
-```
-
-## Summary
-
-1. **Generated** constraint tests during Goal stage
-2. **Generated** success criteria tests during Eval stage
-3. **Approved** all tests with user review
-4. **Ran** tests in parallel
-5. **Debugged** the one failure
-6. **Categorized** as IMPLEMENTATION_ERROR
-7. **Fixed** the agent (not the goal)
-8. **Re-ran** Eval only (didn't restart full flow)
-9. **Passed** all tests
-
-The agent is now validated and ready for production use.
@@ -0,0 +1,145 @@
+# Triage Issue Skill
+
+Analyze a GitHub issue, verify claims against the codebase, and close invalid issues with a technical response.
+
+## Trigger
+
+User provides a GitHub issue URL or number, e.g.:
+- `/triage-issue 1970`
+- `/triage-issue https://github.com/adenhq/hive/issues/1970`
+
+## Workflow
+
+### Step 1: Fetch Issue Details
+
+```bash
+gh issue view <number> --repo adenhq/hive --json title,body,state,labels,author
+```
+
+Extract:
+- Title
+- Body (the claim/bug report)
+- Current state
+- Labels
+- Author
+
+If issue is already closed, inform user and stop.
+
+### Step 2: Analyze the Claim
+
+Read the issue body and identify:
+1. **The core claim** - What is the user asserting?
+2. **Technical specifics** - File paths, function names, code snippets mentioned
+3. **Expected behavior** - What do they think should happen?
+4. **Severity claimed** - Security issue? Bug? Feature request?
+
+### Step 3: Investigate the Codebase
+
+For each technical claim:
+1. Find the referenced code using Grep/Glob/Read
+2. Understand the actual implementation
+3. Check if the claim accurately describes the behavior
+4. Look for related tests, documentation, or design decisions
+
+### Step 4: Evaluate Validity
+
+Categorize the issue as one of:
+
+| Category | Action |
+|----------|--------|
+| **Valid Bug** | Do NOT close. Inform user this is a real issue. |
+| **Valid Feature Request** | Do NOT close. Suggest labeling appropriately. |
+| **Misunderstanding** | Prepare technical explanation for why behavior is correct. |
+| **Fundamentally Flawed** | Prepare critique explaining the technical impossibility or design rationale. |
+| **Duplicate** | Find the original issue and prepare duplicate notice. |
+| **Incomplete** | Prepare request for more information. |
+
+### Step 5: Draft Response
+
+For issues to be closed, draft a response that:
+
+1. **Acknowledges the concern** - Don't be dismissive
+2. **Explains the actual behavior** - With code references
+3. **Provides technical rationale** - Why it works this way
+4. **References industry standards** - If applicable
+5. **Offers alternatives** - If there's a better approach for the user
+
+Use this template:
+
+```markdown
+## Analysis
+
+[Brief summary of what was investigated]
+
+## Technical Details
+
+[Explanation with code references]
+
+## Why This Is Working As Designed
+
+[Rationale]
+
+## Recommendation
+
+[What the user should do instead, if applicable]
+
+---
+*This issue was reviewed and closed by the maintainers.*
+```
+
+### Step 6: User Review
+
+Present the draft to the user with:
+
+```
+## Issue #<number>: <title>
+
+**Claim:** <summary of claim>
+
+**Finding:** <valid/invalid/misunderstanding/etc>
+
+**Draft Response:**
+<the markdown response>
+
+---
+Do you want me to post this comment and close the issue?
+```
+
+Use AskUserQuestion with options:
+- "Post and close" - Post comment, close issue
+- "Edit response" - Let user modify the response
+- "Skip" - Don't take action
+
+### Step 7: Execute Action
+
+If user approves:
+
+```bash
+# Post comment
+gh issue comment <number> --repo adenhq/hive --body "<response>"
+
+# Close issue
+gh issue close <number> --repo adenhq/hive --reason "not planned"
+```
+
+Report success with link to the issue.
+
+## Important Guidelines
+
+1. **Never close valid issues** - If there's any merit to the claim, don't close it
+2. **Be respectful** - The reporter took time to file the issue
+3. **Be technical** - Provide code references and evidence
+4. **Be educational** - Help them understand, don't just dismiss
+5. **Check twice** - Make sure you understand the code before declaring something invalid
+6. **Consider edge cases** - Maybe their environment reveals a real issue
+
+## Example Critiques
+
+### Security Misunderstanding
+> "The claim that secrets are exposed in plaintext misunderstands the encryption architecture. While `SecretStr` is used for logging protection, actual encryption is provided by Fernet (AES-128-CBC) at the storage layer. The code path is: serialize → encrypt → write. Only encrypted bytes touch disk."
+
+### Impossible Request
+> "The requested feature would require [X] which violates [fundamental constraint]. This is not a limitation of our implementation but a fundamental property of [technology/protocol]."
+
+### Already Handled
+> "This scenario is already handled by [code reference]. The reporter may be using an older version or misconfigured environment."
@@ -0,0 +1,18 @@
+This project uses ruff for Python linting and formatting.
+
+Rules:
+- Line length: 100 characters
+- Python target: 3.11+
+- Use double quotes for strings
+- Sort imports with isort (ruff I rules): stdlib, third-party, first-party (framework), local
+- Combine as-imports
+- Use type hints on all function signatures
+- Use `from __future__ import annotations` for modern type syntax
+- Raise exceptions with `from` in except blocks (B904)
+- No unused imports (F401), no unused variables (F841)
+- Prefer list/dict/set comprehensions over map/filter (C4)
+
+Run `make lint` to auto-fix, `make check` to verify without modifying files.
+Run `make format` to apply ruff formatting.
+
+The ruff config lives in core/pyproject.toml under [tool.ruff].
@@ -11,6 +11,9 @@ indent_size = 2
 insert_final_newline = true
 trim_trailing_whitespace = true

+[*.py]
+indent_size = 4
+
 [*.md]
 trim_trailing_whitespace = false

@@ -0,0 +1,124 @@
+# Normalize line endings for all text files
+* text=auto
+
+# Source code
+*.py text diff=python
+*.js text
+*.ts text
+*.jsx text
+*.tsx text
+*.json text
+*.yaml text
+*.yml text
+*.toml text
+*.ini text
+*.cfg text
+
+# Shell scripts (must use LF)
+*.sh text eol=lf
+quickstart.sh text eol=lf
+
+# PowerShell scripts (Windows-friendly)
+*.ps1 text eol=lf
+*.psm1 text eol=lf
+
+# Windows batch files (must use CRLF)
+*.bat text eol=crlf
+*.cmd text eol=crlf
+
+# Documentation
+*.md text
+*.txt text
+*.rst text
+*.tex text
+
+# Configuration files
+.gitignore text
+.gitattributes text
+.editorconfig text
+Dockerfile text
+docker-compose.yml text
+requirements*.txt text
+pyproject.toml text
+setup.py text
+setup.cfg text
+MANIFEST.in text
+LICENSE text
+README* text
+CHANGELOG* text
+CONTRIBUTING* text
+CODE_OF_CONDUCT* text
+
+# Web files
+*.html text
+*.css text
+*.scss text
+*.sass text
+
+# Data files
+*.xml text
+*.csv text
+*.sql text
+
+# Graphics (binary)
+*.png binary
+*.jpg binary
+*.jpeg binary
+*.gif binary
+*.ico binary
+*.svg binary
+*.eps binary
+*.bmp binary
+*.tif binary
+*.tiff binary
+
+# Archives (binary)
+*.zip binary
+*.tar binary
+*.gz binary
+*.bz2 binary
+*.7z binary
+*.rar binary
+
+# Python compiled (binary)
+*.pyc binary
+*.pyo binary
+*.pyd binary
+*.whl binary
+*.egg binary
+
+# System libraries (binary)
+*.so binary
+*.dll binary
+*.dylib binary
+*.lib binary
+*.a binary
+
+# Documents (binary)
+*.pdf binary
+*.doc binary
+*.docx binary
+*.ppt binary
+*.pptx binary
+*.xls binary
+*.xlsx binary
+
+# Fonts (binary)
+*.ttf binary
+*.otf binary
+*.woff binary
+*.woff2 binary
+*.eot binary
+
+# Audio/Video (binary)
+*.mp3 binary
+*.mp4 binary
+*.wav binary
+*.avi binary
+*.mov binary
+*.flv binary
+
+# Database files (binary)
+*.db binary
+*.sqlite binary
+*.sqlite3 binary
@@ -8,7 +8,6 @@
 /hive/ @adenhq/maintainers

 # Infrastructure
-/docker-compose*.yml @adenhq/maintainers
 /.github/ @adenhq/maintainers

 # Documentation
@@ -1,9 +1,10 @@
 ---
 name: Bug Report
 about: Report a bug to help us improve
-title: '[Bug]: '
-labels: bug
+title: "[Bug]: "
+labels: bug, enhancement
 assignees: ''
+
 ---

 ## Describe the Bug
@@ -1,9 +1,10 @@
 ---
 name: Feature Request
 about: Suggest a new feature or enhancement
-title: '[Feature]: '
+title: "[Feature]: "
 labels: enhancement
 assignees: ''
+
 ---

 ## Problem Statement
@@ -0,0 +1,89 @@
+name: Integration Bounty
+description: A bounty task for the integration contribution program
+title: "[Bounty]: "
+labels: []
+body:
+  - type: markdown
+    attributes:
+      value: |
+        ## Integration Bounty
+
+        This issue is part of the [Integration Bounty Program](../../docs/bounty-program/README.md).
+        **Claim this bounty** by commenting below — a maintainer will assign you within 24 hours.
+
+  - type: dropdown
+    id: bounty-type
+    attributes:
+      label: Bounty Type
+      options:
+        - "Test a Tool (20 pts)"
+        - "Write Docs (20 pts)"
+        - "Code Contribution (30 pts)"
+        - "New Integration (75 pts)"
+    validations:
+      required: true
+
+  - type: dropdown
+    id: difficulty
+    attributes:
+      label: Difficulty
+      options:
+        - Easy
+        - Medium
+        - Hard
+    validations:
+      required: true
+
+  - type: input
+    id: tool-name
+    attributes:
+      label: Tool Name
+      description: The integration this bounty targets (e.g., `airtable`, `salesforce`)
+      placeholder: e.g., airtable
+    validations:
+      required: true
+
+  - type: textarea
+    id: description
+    attributes:
+      label: Description
+      description: What needs to be done to complete this bounty.
+      placeholder: |
+        Describe the specific task, including:
+        - What the contributor needs to do
+        - Links to relevant files in the repo
+        - Any setup requirements (API keys, accounts, etc.)
+    validations:
+      required: true
+
+  - type: textarea
+    id: acceptance-criteria
+    attributes:
+      label: Acceptance Criteria
+      description: What "done" looks like. The PR or report must meet all criteria.
+      placeholder: |
+        - [ ] Criterion 1
+        - [ ] Criterion 2
+        - [ ] CI passes
+    validations:
+      required: true
+
+  - type: textarea
+    id: relevant-files
+    attributes:
+      label: Relevant Files
+      description: Links to tool directory, credential spec, health check file, etc.
+      placeholder: |
+        - Tool: `tools/src/aden_tools/tools/{tool_name}/`
+        - Credential spec: `tools/src/aden_tools/credentials/{category}.py`
+        - Health checks: `tools/src/aden_tools/credentials/health_check.py`
+
+  - type: textarea
+    id: resources
+    attributes:
+      label: Resources
+      description: Links to API docs, examples, or guides that will help the contributor.
+      placeholder: |
+        - [Building Tools Guide](../../tools/BUILDING_TOOLS.md)
+        - [Tool README Template](../../docs/bounty-program/templates/tool-readme-template.md)
+        - API docs: https://...
@@ -0,0 +1,71 @@
+---
+name: Integration Request
+about: Suggest a new integration
+title: "[Integration]:"
+labels: ''
+assignees: ''
+
+---
+
+## Service                                                                                      
+                                                                                                 
+ Name and brief description of the service and what it enables agents to do.                     
+                                                                                                 
+ **Description:** [e.g., "API key for Slack Bot" — short one-liner for the credential spec]      
+                                                                                                 
+ ## Credential Identity                                                                          
+                                                                                                 
+ - **credential_id:** [e.g., `slack`]                                                            
+ - **env_var:** [e.g., `SLACK_BOT_TOKEN`]                                                        
+ - **credential_key:** [e.g., `access_token`, `api_key`, `bot_token`]                            
+                                                                                                 
+ ## Tools                                                                                        
+                                                                                                 
+ Tool function names that require this credential:                                               
+                                                                                                 
+ - [e.g., `slack_send_message`]                                                                  
+ - [e.g., `slack_list_channels`]                                                                 
+                                                                                                 
+ ## Auth Methods                                                                                 
+                                                                                                 
+ - **Direct API key supported:** Yes / No                                                        
+ - **Aden OAuth supported:** Yes / No                                                            
+                                                                                                 
+ If Aden OAuth is supported, describe the OAuth scopes/permissions required.                     
+                                                                                                 
+ ## How to Get the Credential                                                                    
+                                                                                                 
+ Link where users obtain the key/token:                                                          
+                                                                                                 
+ [e.g., https://api.slack.com/apps]                                                              
+                                                                                                 
+ Step-by-step instructions:                                                                      
+                                                                                                 
+ 1. Go to ...                                                                                    
+ 2. Create a ...                                                                                 
+ 3. Select scopes/permissions: ...                                                               
+ 4. Copy the key/token                                                                           
+                                                                                                 
+ ## Health Check                                                                                 
+                                                                                                 
+ A lightweight API call to validate the credential (no writes, no charges).                      
+                                                                                                 
+ - **Endpoint:** [e.g., `https://slack.com/api/auth.test`]                                       
+ - **Method:** [e.g., `GET` or `POST`]                                                           
+ - **Auth header:** [e.g., `Authorization: Bearer {token}` or `X-Api-Key: {key}`]                
+ - **Parameters (if any):** [e.g., `?limit=1`]                                                   
+ - **200 means:** [e.g., key is valid]                                                           
+ - **401 means:** [e.g., invalid or expired]                                                     
+ - **429 means:** [e.g., rate limited but key is valid]                                          
+                                                                                                 
+ ## Credential Group                                                                             
+                                                                                                 
+ Does this require multiple credentials configured together? (e.g., Google Custom Search needs   
+ both an API key and a CSE ID)                                                                   
+                                                                                                 
+ - [ ] No, single credential                                                                     
+ - [ ] Yes — list the other credential IDs in the group:                                         
+                                                                                                 
+ ## Additional Context                                                                           
+                                                                                                 
+ Links to API docs, rate limits, free tier availability, or anything else relevant.
@@ -0,0 +1,78 @@
+name: Standard Bounty
+description: A bounty task for general framework contributions (not integration-specific)
+title: "[Bounty]: "
+labels: []
+body:
+  - type: markdown
+    attributes:
+      value: |
+        ## Standard Bounty
+
+        This issue is part of the [Bounty Program](../../docs/bounty-program/README.md).
+        **Claim this bounty** by commenting below — a maintainer will assign you within 24 hours.
+
+  - type: dropdown
+    id: bounty-size
+    attributes:
+      label: Bounty Size
+      options:
+        - "Small (10 pts)"
+        - "Medium (30 pts)"
+        - "Large (75 pts)"
+        - "Extreme (150 pts)"
+    validations:
+      required: true
+
+  - type: dropdown
+    id: difficulty
+    attributes:
+      label: Difficulty
+      options:
+        - Easy
+        - Medium
+        - Hard
+    validations:
+      required: true
+
+  - type: textarea
+    id: description
+    attributes:
+      label: Description
+      description: What needs to be done to complete this bounty.
+      placeholder: |
+        Describe the specific task, including:
+        - What the contributor needs to do
+        - Links to relevant files in the repo
+        - Any context or motivation for the change
+    validations:
+      required: true
+
+  - type: textarea
+    id: acceptance-criteria
+    attributes:
+      label: Acceptance Criteria
+      description: What "done" looks like. The PR must meet all criteria.
+      placeholder: |
+        - [ ] Criterion 1
+        - [ ] Criterion 2
+        - [ ] CI passes
+    validations:
+      required: true
+
+  - type: textarea
+    id: relevant-files
+    attributes:
+      label: Relevant Files
+      description: Links to files or directories related to this bounty.
+      placeholder: |
+        - `path/to/file.py`
+        - `path/to/directory/`
+
+  - type: textarea
+    id: resources
+    attributes:
+      label: Resources
+      description: Links to docs, issues, or external references that will help.
+      placeholder: |
+        - Related issue: #XXXX
+        - Docs: https://...
@@ -0,0 +1,34 @@
+name: Auto-close duplicate issues
+description: Auto-closes issues that are duplicates of existing issues
+on:
+  schedule:
+    - cron: "0 */6 * * *"
+  workflow_dispatch:
+
+jobs:
+  auto-close-duplicates:
+    runs-on: ubuntu-latest
+    timeout-minutes: 10
+    permissions:
+      contents: read
+      issues: write
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Setup Bun
+        uses: oven-sh/setup-bun@v2
+        with:
+          bun-version: latest
+
+      - name: Run auto-close-duplicates tests
+        run: bun test scripts/auto-close-duplicates
+
+      - name: Auto-close duplicate issues
+        run: bun run scripts/auto-close-duplicates.ts
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GITHUB_REPOSITORY_OWNER: ${{ github.repository_owner }}
+          GITHUB_REPOSITORY_NAME: ${{ github.event.repository.name }}
+          STATSIG_API_KEY: ${{ secrets.STATSIG_API_KEY }}
@@ -0,0 +1,47 @@
+name: Bounty completed
+description: Awards points and notifies Discord when a bounty PR is merged
+
+on:
+  pull_request_target:
+    types: [closed]
+
+  workflow_dispatch:
+    inputs:
+      pr_number:
+        description: "PR number to process (for missed bounties)"
+        required: true
+        type: number
+
+jobs:
+  bounty-notify:
+    if: >
+      github.event_name == 'workflow_dispatch' ||
+      (github.event.pull_request.merged == true &&
+       contains(join(github.event.pull_request.labels.*.name, ','), 'bounty:'))
+    runs-on: ubuntu-latest
+    timeout-minutes: 5
+    permissions:
+      contents: read
+      pull-requests: read
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Setup Bun
+        uses: oven-sh/setup-bun@v2
+        with:
+          bun-version: latest
+
+      - name: Award XP and notify Discord
+        run: bun run scripts/bounty-tracker.ts notify
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GITHUB_REPOSITORY_OWNER: ${{ github.repository_owner }}
+          GITHUB_REPOSITORY_NAME: ${{ github.event.repository.name }}
+          DISCORD_WEBHOOK_URL: ${{ secrets.DISCORD_BOUNTY_WEBHOOK_URL }}
+          BOT_API_URL: ${{ secrets.BOT_API_URL }}
+          BOT_API_KEY: ${{ secrets.BOT_API_KEY }}
+          LURKR_API_KEY: ${{ secrets.LURKR_API_KEY }}
+          LURKR_GUILD_ID: ${{ secrets.LURKR_GUILD_ID }}
+          PR_NUMBER: ${{ inputs.pr_number || github.event.pull_request.number }}
@@ -5,7 +5,7 @@ on:
    branches: [main]
  pull_request:
    branches: [main]
-
+    
 concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
@@ -21,22 +21,31 @@ jobs:
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
-          cache: 'pip'
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v4
+        with:
+          enable-cache: true

      - name: Install dependencies
-        run: |
-          cd core
-          pip install -e .
-          pip install -r requirements-dev.txt
+        run: uv sync --project core --group dev

-      - name: Run ruff
+      - name: Ruff lint
        run: |
-          cd core
-          ruff check .
+          uv run --project core ruff check core/
+          uv run --project core ruff check tools/
+
+      - name: Ruff format
+        run: |
+          uv run --project core ruff format --check core/
+          uv run --project core ruff format --check tools/

  test:
    name: Test Python Framework
-    runs-on: ubuntu-latest
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest, windows-latest]
    steps:
      - uses: actions/checkout@v4

@@ -44,23 +53,47 @@ jobs:
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
-          cache: 'pip'

-      - name: Install dependencies
-        run: |
-          cd core
-          pip install -e .
-          pip install -r requirements-dev.txt
+      - name: Install uv
+        uses: astral-sh/setup-uv@v4
+        with:
+          enable-cache: true

-      - name: Run tests
+      - name: Install dependencies and run tests
+        working-directory: core
        run: |
-          cd core
-          pytest tests/ -v
+          uv sync
+          uv run pytest tests/ -v --ignore=tests/dummy_agents
+
+  test-tools:
+    name: Test Tools (${{ matrix.os }})
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest, windows-latest]
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v4
+        with:
+          enable-cache: true
+
+      - name: Install dependencies and run tests
+        working-directory: tools
+        run: |
+          uv sync --extra dev
+          uv run pytest tests/ -v

  validate:
    name: Validate Agent Exports
    runs-on: ubuntu-latest
-    needs: [lint, test]
+    needs: [lint, test, test-tools]
    steps:
      - uses: actions/checkout@v4

@@ -68,20 +101,45 @@ jobs:
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
-          cache: 'pip'

+      - name: Install uv
+        uses: astral-sh/setup-uv@v4
+        with:
+          enable-cache: true
+            
      - name: Install dependencies
+        working-directory: core
        run: |
-          cd core
-          pip install -e .
-          pip install -r requirements-dev.txt
+          uv sync

      - name: Validate exported agents
        run: |
          # Check that agent exports have valid structure
-          for agent_dir in exports/*/; do
+          if [ ! -d "exports" ]; then
+            echo "No exports/ directory found, skipping validation"
+            exit 0
+          fi
+
+          shopt -s nullglob
+          agent_dirs=(exports/*/)
+          shopt -u nullglob
+
+          if [ ${#agent_dirs[@]} -eq 0 ]; then
+            echo "No agent directories in exports/, skipping validation"
+            exit 0
+          fi
+
+          validated=0
+          for agent_dir in "${agent_dirs[@]}"; do
            if [ -f "$agent_dir/agent.json" ]; then
              echo "Validating $agent_dir"
-              python -c "import json; json.load(open('$agent_dir/agent.json'))"
+              uv run python -c "import json; json.load(open('$agent_dir/agent.json'))"
+              validated=$((validated + 1))
            fi
          done
+
+          if [ "$validated" -eq 0 ]; then
+            echo "No agent.json files found in exports/, skipping validation"
+          else
+            echo "Validated $validated agent(s)"
+          fi
@@ -0,0 +1,103 @@
+name: Issue Triage
+
+on:
+  issues:
+    types: [opened]
+
+jobs:
+  triage:
+    runs-on: ubuntu-latest
+    timeout-minutes: 10
+    permissions:
+      contents: read
+      issues: write
+      id-token: write
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 1
+
+      - name: Triage and check for duplicates
+        uses: anthropics/claude-code-action@v1
+        with:
+          anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }}
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+          allowed_non_write_users: "*"
+          prompt: |
+            Analyze this new issue and perform triage tasks.
+
+            Issue: #${{ github.event.issue.number }}
+            Repository: ${{ github.repository }}
+
+            ## Your Tasks:
+
+            ### 1. Get issue details
+            Use mcp__github__get_issue to get the full details of issue #${{ github.event.issue.number }}
+
+            ### 2. Check for duplicates
+            Search for similar existing issues using mcp__github__search_issues with relevant keywords from the issue title and body.
+
+            Criteria for duplicates:
+            - Same bug or error being reported
+            - Same feature request (even if worded differently)
+            - Same question being asked
+            - Issues describing the same root problem
+
+            If you find a duplicate:
+            - Add a comment using EXACTLY this format (required for auto-close to work):
+              "Found a possible duplicate of #<issue_number>: <brief explanation of why it's a duplicate>"
+            - Do NOT apply the "duplicate" label yet (the auto-close script will add it after 12 hours if no objections)
+            - Suggest the user react with a thumbs-down if they disagree
+
+            ### 3. Check for Low-Quality / AI Spam
+            Analyze the issue quality. We are receiving many low-effort, AI-generated spam issues.
+            Flag the issue as INVALID if it matches these criteria:
+            - **Vague/Generic**: Title is "Fix bug" or "Error" without specific context.
+            - **Hallucinated**: Refers to files or features that do not exist in this repo.
+            - **Template Filler**: Body contains "Insert description here" or unrelated gibberish.
+            - **Low Effort**: No reproduction steps, no logs, only 1-2 sentences.
+
+            If identified as spam/low-quality:
+            - Add the "invalid" label.
+            - Add a comment:
+              "This issue has been automatically flagged as low-quality or potentially AI-generated spam. It lacks specific details (logs, reproduction steps, file references) required for us to help. Please open a new issue following the template exactly if this is a legitimate request."
+            - Do NOT proceed to other steps.
+
+            ### 4. Check for invalid issues (General)
+            If the issue is not spam but still lacks information:
+            - Add the "invalid" label
+            - Comment asking for clarification
+
+            ### 5. Categorize with labels (if NOT a duplicate or spam)
+            Apply appropriate labels based on the issue content. Use ONLY these labels:
+            - bug: Something isn't working
+            - enhancement: New feature or request
+            - question: Further information is requested
+            - documentation: Improvements or additions to documentation
+            - good first issue: Good for newcomers (if issue is well-defined and small scope)
+            - help wanted: Extra attention is needed (if issue needs community input)
+            - backlog: Tracked for the future, but not currently planned or prioritized
+
+            ### 6. Estimate size (if NOT a duplicate, spam, or invalid)
+            Apply exactly ONE size label to help contributors match their capacity to the task:
+            - "size: small": Docs, typos, single-file fixes, config changes
+            - "size: medium": Bug fixes with tests, adding a single tool, changes within one package
+            - "size: large": Cross-package changes (core + tools), new modules, complex logic, architectural refactors
+
+            You may apply multiple labels if appropriate (e.g., "bug", "size: small", and "good first issue").
+
+            ## Tools Available:
+            - mcp__github__get_issue: Get issue details
+            - mcp__github__search_issues: Search for similar issues
+            - mcp__github__list_issues: List recent issues if needed
+            - mcp__github__add_issue_comment: Add a comment
+            - mcp__github__update_issue: Add labels
+            - mcp__github__get_issue_comments: Get existing comments
+
+            Be thorough but efficient. Focus on accurate categorization and finding true duplicates.
+
+          claude_args: |
+            --model claude-haiku-4-5-20251001
+            --allowedTools "mcp__github__get_issue,mcp__github__search_issues,mcp__github__list_issues,mcp__github__add_issue_comment,mcp__github__update_issue,mcp__github__get_issue_comments"
@@ -0,0 +1,204 @@
+name: PR Check Command
+
+on:
+  issue_comment:
+    types: [created]
+
+jobs:
+  check-pr:
+    # Only run on PR comments that start with /check
+    if: github.event.issue.pull_request && startsWith(github.event.comment.body, '/check')
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: write
+      issues: write
+      checks: write
+      statuses: write
+
+    steps:
+      - name: Check PR requirements
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const prNumber = context.payload.issue.number;
+            console.log(`Triggered by /check comment on PR #${prNumber}`);
+
+            // Fetch PR data
+            const { data: pr } = await github.rest.pulls.get({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              pull_number: prNumber,
+            });
+
+            const prBody = pr.body || '';
+            const prTitle = pr.title || '';
+            const prAuthor = pr.user.login;
+            const headSha = pr.head.sha;
+
+            // Create a check run in progress
+            const { data: checkRun } = await github.rest.checks.create({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              name: 'check-requirements',
+              head_sha: headSha,
+              status: 'in_progress',
+              started_at: new Date().toISOString(),
+            });
+
+            // Extract issue numbers
+            const issuePattern = /(?:close[sd]?|fix(?:e[sd])?|resolve[sd]?)?\s*#(\d+)/gi;
+            const allText = `${prTitle} ${prBody}`;
+            const matches = [...allText.matchAll(issuePattern)];
+            const issueNumbers = [...new Set(matches.map(m => parseInt(m[1], 10)))];
+
+            console.log(`PR #${prNumber}:`);
+            console.log(`  Author: ${prAuthor}`);
+            console.log(`  Found issue references: ${issueNumbers.length > 0 ? issueNumbers.join(', ') : 'none'}`);
+
+            if (issueNumbers.length === 0) {
+              const message = `## PR Closed - Requirements Not Met
+
+            This PR has been automatically closed because it doesn't meet the requirements.
+
+            **Missing:** No linked issue found.
+
+            **To fix:**
+            1. Create or find an existing issue for this work
+            2. Assign yourself to the issue
+            3. Re-open this PR and add \`Fixes #123\` in the description
+
+            **Why is this required?** See #472 for details.`;
+
+              await github.rest.issues.createComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                issue_number: prNumber,
+                body: message,
+              });
+
+              await github.rest.pulls.update({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                pull_number: prNumber,
+                state: 'closed',
+              });
+
+              // Update check run to failure
+              await github.rest.checks.update({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                check_run_id: checkRun.id,
+                status: 'completed',
+                conclusion: 'failure',
+                completed_at: new Date().toISOString(),
+                output: {
+                  title: 'Missing linked issue',
+                  summary: 'PR must reference an issue (e.g., `Fixes #123`)',
+                },
+              });
+
+              core.setFailed('PR must reference an issue');
+              return;
+            }
+
+            // Check if PR author is assigned to any linked issue
+            let issueWithAuthorAssigned = null;
+            let issuesWithoutAuthor = [];
+
+            for (const issueNum of issueNumbers) {
+              try {
+                const { data: issue } = await github.rest.issues.get({
+                  owner: context.repo.owner,
+                  repo: context.repo.repo,
+                  issue_number: issueNum,
+                });
+
+                const assigneeLogins = (issue.assignees || []).map(a => a.login);
+                if (assigneeLogins.includes(prAuthor)) {
+                  issueWithAuthorAssigned = issueNum;
+                  console.log(`  Issue #${issueNum} has PR author ${prAuthor} as assignee`);
+                  break;
+                } else {
+                  issuesWithoutAuthor.push({
+                    number: issueNum,
+                    assignees: assigneeLogins
+                  });
+                  console.log(`  Issue #${issueNum} assignees: ${assigneeLogins.length > 0 ? assigneeLogins.join(', ') : 'none'}`);
+                }
+              } catch (error) {
+                console.log(`  Issue #${issueNum} not found`);
+              }
+            }
+
+            if (!issueWithAuthorAssigned) {
+              const issueList = issuesWithoutAuthor.map(i =>
+                `#${i.number} (assignees: ${i.assignees.length > 0 ? i.assignees.join(', ') : 'none'})`
+              ).join(', ');
+
+              const message = `## PR Closed - Requirements Not Met
+
+            This PR has been automatically closed because it doesn't meet the requirements.
+
+            **PR Author:** @${prAuthor}
+            **Found issues:** ${issueList}
+            **Problem:** The PR author must be assigned to the linked issue.
+
+            **To fix:**
+            1. Assign yourself (@${prAuthor}) to one of the linked issues
+            2. Re-open this PR
+
+            **Why is this required?** See #472 for details.`;
+
+              await github.rest.issues.createComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                issue_number: prNumber,
+                body: message,
+              });
+
+              await github.rest.pulls.update({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                pull_number: prNumber,
+                state: 'closed',
+              });
+
+              // Update check run to failure
+              await github.rest.checks.update({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                check_run_id: checkRun.id,
+                status: 'completed',
+                conclusion: 'failure',
+                completed_at: new Date().toISOString(),
+                output: {
+                  title: 'PR author not assigned to issue',
+                  summary: `PR author @${prAuthor} must be assigned to one of the linked issues: ${issueList}`,
+                },
+              });
+
+              core.setFailed('PR author must be assigned to the linked issue');
+            } else {
+              await github.rest.issues.createComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                issue_number: prNumber,
+                body: `✅ PR requirements met! Issue #${issueWithAuthorAssigned} has @${prAuthor} as assignee.`,
+              });
+
+              // Update check run to success
+              await github.rest.checks.update({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                check_run_id: checkRun.id,
+                status: 'completed',
+                conclusion: 'success',
+                completed_at: new Date().toISOString(),
+                output: {
+                  title: 'Requirements met',
+                  summary: `Issue #${issueWithAuthorAssigned} has @${prAuthor} as assignee.`,
+                },
+              });
+
+              console.log(`PR requirements met!`);
+            }
@@ -0,0 +1,138 @@
+name: PR Requirements Backfill
+
+on:
+  workflow_dispatch:
+
+jobs:
+  check-all-open-prs:
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: write
+      issues: write
+
+    steps:
+      - name: Check all open PRs
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const { data: pullRequests } = await github.rest.pulls.list({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              state: 'open',
+              per_page: 100,
+            });
+
+            console.log(`Found ${pullRequests.length} open PRs`);
+
+            for (const pr of pullRequests) {
+              const prNumber = pr.number;
+              const prBody = pr.body || '';
+              const prTitle = pr.title || '';
+              const prAuthor = pr.user.login;
+
+              console.log(`\nChecking PR #${prNumber}: ${prTitle}`);
+
+              // Extract issue numbers from body and title
+              const issuePattern = /(?:close[sd]?|fix(?:e[sd])?|resolve[sd]?)?\s*#(\d+)/gi;
+              const allText = `${prTitle} ${prBody}`;
+              const matches = [...allText.matchAll(issuePattern)];
+              const issueNumbers = [...new Set(matches.map(m => parseInt(m[1], 10)))];
+
+              console.log(`  Found issue references: ${issueNumbers.length > 0 ? issueNumbers.join(', ') : 'none'}`);
+
+              if (issueNumbers.length === 0) {
+                console.log(`  ❌ No linked issue - closing PR`);
+
+                const message = `## PR Closed - Requirements Not Met
+
+            This PR has been automatically closed because it doesn't meet the requirements.
+
+            **Missing:** No linked issue found.
+
+            **To fix:**
+            1. Create or find an existing issue for this work
+            2. Assign yourself to the issue
+            3. Re-open this PR and add \`Fixes #123\` in the description`;
+
+                await github.rest.issues.createComment({
+                  owner: context.repo.owner,
+                  repo: context.repo.repo,
+                  issue_number: prNumber,
+                  body: message,
+                });
+
+                await github.rest.pulls.update({
+                  owner: context.repo.owner,
+                  repo: context.repo.repo,
+                  pull_number: prNumber,
+                  state: 'closed',
+                });
+
+                continue;
+              }
+
+              // Check if any linked issue has the PR author as assignee
+              let issueWithAuthorAssigned = null;
+              let issuesWithoutAuthor = [];
+
+              for (const issueNum of issueNumbers) {
+                try {
+                  const { data: issue } = await github.rest.issues.get({
+                    owner: context.repo.owner,
+                    repo: context.repo.repo,
+                    issue_number: issueNum,
+                  });
+
+                  const assigneeLogins = (issue.assignees || []).map(a => a.login);
+                  if (assigneeLogins.includes(prAuthor)) {
+                    issueWithAuthorAssigned = issueNum;
+                    break;
+                  } else {
+                    issuesWithoutAuthor.push({
+                      number: issueNum,
+                      assignees: assigneeLogins
+                    });
+                  }
+                } catch (error) {
+                  console.log(`  Issue #${issueNum} not found or inaccessible`);
+                }
+              }
+
+              if (!issueWithAuthorAssigned) {
+                const issueList = issuesWithoutAuthor.map(i =>
+                  `#${i.number} (assignees: ${i.assignees.length > 0 ? i.assignees.join(', ') : 'none'})`
+                ).join(', ');
+
+                console.log(`  ❌ PR author not assigned to any linked issue - closing PR`);
+
+                const message = `## PR Closed - Requirements Not Met
+
+            This PR has been automatically closed because it doesn't meet the requirements.
+
+            **PR Author:** @${prAuthor}
+            **Found issues:** ${issueList}
+            **Problem:** The PR author must be assigned to the linked issue.
+
+            **To fix:**
+            1. Assign yourself (@${prAuthor}) to one of the linked issues
+            2. Re-open this PR`;
+
+                await github.rest.issues.createComment({
+                  owner: context.repo.owner,
+                  repo: context.repo.repo,
+                  issue_number: prNumber,
+                  body: message,
+                });
+
+                await github.rest.pulls.update({
+                  owner: context.repo.owner,
+                  repo: context.repo.repo,
+                  pull_number: prNumber,
+                  state: 'closed',
+                });
+              } else {
+                console.log(`  ✅ PR requirements met! Issue #${issueWithAuthorAssigned} has ${prAuthor} as assignee.`);
+              }
+            }
+
+            console.log('\nBackfill complete!');
@@ -0,0 +1,54 @@
+# Closes PRs that still have the `pr-requirements-warning` label
+# after contributors were warned in pr-requirements.yml.
+name: PR Requirements Enforcement
+on:
+  schedule:
+    - cron: "0 0 * * *"   # runs every day once at midnight 
+jobs:
+  enforce:
+    name: Close PRs still failing contribution requirements
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: write
+      issues: write
+    steps:
+      - name: Close PRs still failing requirements
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const { owner, repo } = context.repo;
+            const prs = await github.paginate(github.rest.pulls.list, {
+              owner,
+              repo,
+              state: "open",
+              per_page: 100
+            });
+            for (const pr of prs) {
+              // Skip draft PRs — author may still be actively working toward compliance
+              if (pr.draft) continue;
+              const labels = pr.labels.map(l => l.name);
+              if (!labels.includes("pr-requirements-warning")) continue;
+              const gracePeriod = 24 * 60 * 60 * 1000;
+              const lastUpdated = new Date(pr.created_at);
+              const now = new Date();
+              if (now - lastUpdated < gracePeriod) {
+                console.log(`Skipping PR #${pr.number} — still within grace period`);
+                continue;
+              }
+              const prNumber = pr.number;
+              const prAuthor = pr.user.login;
+              await github.rest.issues.createComment({
+                owner,
+                repo,
+                issue_number: prNumber,
+                body: `Closing PR because the contribution requirements were not resolved within the 24-hour grace period.
+                If this was closed in error, feel free to reopen the PR after fixing the requirements.`
+              });
+              await github.rest.pulls.update({
+                owner,
+                repo,
+                pull_number: prNumber,
+                state: "closed"
+              });
+              console.log(`Closed PR #${prNumber} by ${prAuthor} (PR requirements were not met)`);
+            }
@@ -0,0 +1,203 @@
+name: PR Requirements Check
+
+on:
+  pull_request_target:
+    types: [opened, reopened, edited, synchronize]
+
+jobs:
+  check-requirements:
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: write
+      issues: write
+
+    steps:
+      - name: Check PR has linked issue with assignee
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const pr = context.payload.pull_request;
+            const prNumber = pr.number;
+            const prBody = pr.body || '';
+            const prTitle = pr.title || '';
+            const prLabels = (pr.labels || []).map(l => l.name);
+
+            // Allow micro-fix and documentation PRs without a linked issue
+            const isMicroFix = prLabels.includes('micro-fix') || /micro-fix/i.test(prTitle);
+            const isDocumentation = prLabels.includes('documentation') || /\bdocs?\b/i.test(prTitle);
+            if (isMicroFix || isDocumentation) {
+              const reason = isMicroFix ? 'micro-fix' : 'documentation';
+              console.log(`PR #${prNumber} is a ${reason}, skipping issue requirement.`);
+              return;
+            }
+
+            // Extract issue numbers from body and title
+            // Matches: fixes #123, closes #123, resolves #123, or plain #123
+            const issuePattern = /(?:close[sd]?|fix(?:e[sd])?|resolve[sd]?)?\s*#(\d+)/gi;
+
+            const allText = `${prTitle} ${prBody}`;
+            const matches = [...allText.matchAll(issuePattern)];
+            const issueNumbers = [...new Set(matches.map(m => parseInt(m[1], 10)))];
+
+            console.log(`PR #${prNumber}:`);
+            console.log(`  Found issue references: ${issueNumbers.length > 0 ? issueNumbers.join(', ') : 'none'}`);
+
+            if (issueNumbers.length === 0) {
+              const message = `## PR Requirements Warning
+
+            This PR does not meet the contribution requirements.
+            If the issue is not fixed within ~24 hours, it may be automatically closed.
+
+            **Missing:** No linked issue found.
+
+            **To fix:**
+            1. Create or find an existing issue for this work
+            2. Assign yourself to the issue
+            3. Re-open this PR and add \`Fixes #123\` in the description
+
+            **Exception:** To bypass this requirement, you can:
+            - Add the \`micro-fix\` label or include \`micro-fix\` in your PR title for trivial fixes
+            - Add the \`documentation\` label or include \`doc\`/\`docs\` in your PR title for documentation changes
+
+            **Micro-fix requirements** (must meet ALL):
+            | Qualifies | Disqualifies |
+            |-----------|--------------|
+            | < 20 lines changed | Any functional bug fix |
+            | Typos & Documentation & Linting | Refactoring for "clean code" |
+            | No logic/API/DB changes | New features (even tiny ones) |
+
+            **Why is this required?** See #472 for details.`;
+
+              const comments = await github.paginate(github.rest.issues.listComments, {
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                issue_number: prNumber,
+                per_page: 100,
+              });
+
+              const botComment = comments.find(
+                (c) => c.user.type === 'Bot' && c.body.includes('PR Requirements Warning')
+              );
+
+              if (!botComment) {
+                await github.rest.issues.createComment({
+                  owner: context.repo.owner,
+                  repo: context.repo.repo,
+                  issue_number: prNumber,
+                  body: message,
+                });
+              }
+
+              await github.rest.issues.addLabels({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                issue_number: prNumber,
+                labels: ['pr-requirements-warning'],
+              });
+
+              core.setFailed('PR must reference an issue');
+              return;
+            }
+
+            // Check if any linked issue has the PR author as assignee
+            const prAuthor = pr.user.login;
+            let issueWithAuthorAssigned = null;
+            let issuesWithoutAuthor = [];
+
+            for (const issueNum of issueNumbers) {
+              try {
+                const { data: issue } = await github.rest.issues.get({
+                  owner: context.repo.owner,
+                  repo: context.repo.repo,
+                  issue_number: issueNum,
+                });
+
+                const assigneeLogins = (issue.assignees || []).map(a => a.login);
+                if (assigneeLogins.includes(prAuthor)) {
+                  issueWithAuthorAssigned = issueNum;
+                  console.log(`  Issue #${issueNum} has PR author ${prAuthor} as assignee`);
+                  break;
+                } else {
+                  issuesWithoutAuthor.push({
+                    number: issueNum,
+                    assignees: assigneeLogins
+                  });
+                  console.log(`  Issue #${issueNum} assignees: ${assigneeLogins.length > 0 ? assigneeLogins.join(', ') : 'none'} (PR author: ${prAuthor})`);
+                }
+              } catch (error) {
+                console.log(`  Issue #${issueNum} not found or inaccessible`);
+              }
+            }
+
+            if (!issueWithAuthorAssigned) {
+              const issueList = issuesWithoutAuthor.map(i =>
+                `#${i.number} (assignees: ${i.assignees.length > 0 ? i.assignees.join(', ') : 'none'})`
+              ).join(', ');
+
+              const message = `## PR Requirements Warning
+
+            This PR does not meet the contribution requirements.
+            If the issue is not fixed within ~24 hours, it may be automatically closed.
+
+            **PR Author:** @${prAuthor}
+            **Found issues:** ${issueList}
+            **Problem:** The PR author must be assigned to the linked issue.
+
+            **To fix:**
+            1. Assign yourself (@${prAuthor}) to one of the linked issues
+            2. Re-open this PR
+
+            **Exception:** To bypass this requirement, you can:
+            - Add the \`micro-fix\` label or include \`micro-fix\` in your PR title for trivial fixes
+            - Add the \`documentation\` label or include \`doc\`/\`docs\` in your PR title for documentation changes
+
+            **Micro-fix requirements** (must meet ALL):
+            | Qualifies | Disqualifies |
+            |-----------|--------------|
+            | < 20 lines changed | Any functional bug fix |
+            | Typos & Documentation & Linting | Refactoring for "clean code" |
+            | No logic/API/DB changes | New features (even tiny ones) |
+
+            **Why is this required?** See #472 for details.`;
+
+              const comments = await github.paginate(github.rest.issues.listComments, {
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                issue_number: prNumber,
+                per_page: 100,
+              });
+
+              const botComment = comments.find(
+                (c) => c.user.type === 'Bot' && c.body.includes('PR Requirements Warning')
+              );
+
+              if (!botComment) {
+                await github.rest.issues.createComment({
+                  owner: context.repo.owner,
+                  repo: context.repo.repo,
+                  issue_number: prNumber,
+                  body: message,
+                });
+              }
+
+              await github.rest.issues.addLabels({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                issue_number: prNumber,
+                labels: ['pr-requirements-warning'],
+              });
+
+              core.setFailed('PR author must be assigned to the linked issue');
+            } else {
+              console.log(`PR requirements met! Issue #${issueWithAuthorAssigned} has ${prAuthor} as assignee.`);
+              try {
+                await github.rest.issues.removeLabel({
+                  owner: context.repo.owner,
+                  repo: context.repo.repo,
+                  issue_number: prNumber,
+                  name: "pr-requirements-warning"
+                });
+              }catch (error){
+                //ignore if label doesn't exist
+              }
+            }
@@ -21,18 +21,19 @@ jobs:
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
-          cache: 'pip'
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v4

      - name: Install dependencies
        run: |
          cd core
-          pip install -e .
-          pip install -r requirements-dev.txt
+          uv sync

      - name: Run tests
        run: |
          cd core
-          pytest tests/ -v
+          uv run pytest tests/ -v

      - name: Generate changelog
        id: changelog
@@ -0,0 +1,42 @@
+name: Weekly bounty leaderboard
+description: Posts the integration bounty leaderboard to Discord every Monday
+
+on:
+  schedule:
+    # Every Monday at 9:00 UTC
+    - cron: "0 9 * * 1"
+  workflow_dispatch:
+    inputs:
+      since_date:
+        description: "Only count PRs merged after this date (YYYY-MM-DD). Leave empty for all-time."
+        required: false
+
+jobs:
+  leaderboard:
+    runs-on: ubuntu-latest
+    timeout-minutes: 5
+    permissions:
+      contents: read
+      pull-requests: read
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Setup Bun
+        uses: oven-sh/setup-bun@v2
+        with:
+          bun-version: latest
+
+      - name: Post leaderboard to Discord
+        run: bun run scripts/bounty-tracker.ts leaderboard
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GITHUB_REPOSITORY_OWNER: ${{ github.repository_owner }}
+          GITHUB_REPOSITORY_NAME: ${{ github.event.repository.name }}
+          DISCORD_WEBHOOK_URL: ${{ secrets.DISCORD_BOUNTY_WEBHOOK_URL }}
+          BOT_API_URL: ${{ secrets.BOT_API_URL }}
+          BOT_API_KEY: ${{ secrets.BOT_API_KEY }}
+          LURKR_API_KEY: ${{ secrets.LURKR_API_KEY }}
+          LURKR_GUILD_ID: ${{ secrets.LURKR_GUILD_ID }}
+          SINCE_DATE: ${{ github.event.inputs.since_date || '' }}
@@ -13,6 +13,10 @@ out/
 .env
 .env.local
 .env.*.local
+.venv
+/venv
+tools/src/uv.lock
+

 # User configuration (copied from .example)
 config.yaml
@@ -46,6 +50,7 @@ coverage/

 # TypeScript
 *.tsbuildinfo
+vite.config.d.ts

 # Python
 __pycache__/
@@ -65,6 +70,15 @@ tmp/
 temp/

 exports/*
+exports.old*
+artifacts/*

-core/.agent-builder-sessions/*
-.agent-builder-sessions/
+.claude/settings.local.json
+
+docs/github-issues/*
+core/tests/*dumps/*
+
+screenshots/*
+
+.gemini/*
+.coverage
@@ -0,0 +1,9 @@
+{"type": "connection", "event": "connect", "ts": "2026-04-04T01:10:38.245667+00:00", "profile": "default"}
+{"type": "connection", "event": "hello", "details": {"version": "1.0"}, "ts": "2026-04-04T01:10:38.247207+00:00", "profile": "default"}
+{"type": "connection", "event": "disconnect", "ts": "2026-04-04T01:11:57.148273+00:00", "profile": "default"}
+{"type": "connection", "event": "connect", "ts": "2026-04-04T01:12:09.162378+00:00", "profile": "default"}
+{"type": "connection", "event": "hello", "details": {"version": "1.0"}, "ts": "2026-04-04T01:12:09.163899+00:00", "profile": "default"}
+{"type": "connection", "event": "disconnect", "ts": "2026-04-04T01:15:12.826042+00:00", "profile": "default"}
+{"type": "connection", "event": "connect", "ts": "2026-04-04T01:15:30.842533+00:00", "profile": "default"}
+{"type": "connection", "event": "hello", "details": {"version": "1.0"}, "ts": "2026-04-04T01:15:30.845025+00:00", "profile": "default"}
+{"type": "tool_call", "tool": "browser_stop", "params": {"profile": "gcu-browser-worker:3"}, "result": {"ok": true, "status": "not_running", "profile": "gcu-browser-worker:3"}, "ok": true, "duration_ms": 0.01, "ts": "2026-04-04T01:29:04.294954+00:00", "profile": "default"}
@@ -1,20 +1,10 @@
 {
  "mcpServers": {
-    "agent-builder": {
-      "command": "python",
-      "args": ["-m", "framework.mcp.agent_builder_server"],
-      "cwd": "core",
-      "env": {
-        "PYTHONPATH": "../tools/src"
-      }
-    },
-    "tools": {
-      "command": "python",
-      "args": ["mcp_server.py", "--stdio"],
-      "cwd": "tools",
-      "env": {
-        "PYTHONPATH": "src"
-      }
+    "gcu-tools": {
+      "type": "stdio",
+      "command": "uv",
+      "args": ["run", "python", "-m", "gcu.server", "--stdio"],
+      "cwd": "/home/timothy/aden/hive/tools"
    }
  }
 }
@@ -0,0 +1,18 @@
+repos:
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.15.0
+    hooks:
+      - id: ruff
+        name: ruff lint (core)
+        args: [--fix]
+        files: ^core/
+      - id: ruff
+        name: ruff lint (tools)
+        args: [--fix]
+        files: ^tools/
+      - id: ruff-format
+        name: ruff format (core)
+        files: ^core/
+      - id: ruff-format
+        name: ruff format (tools)
+        files: ^tools/
@@ -0,0 +1 @@
+3.11
@@ -0,0 +1,30 @@
+# Repository Guidelines
+
+Shared agent instructions for this workspace.
+
+## Coding Agent Notes
+
+- 
+- When working on a GitHub Issue or PR, print the full URL at the end of the task.
+- When answering questions, respond with high-confidence answers only: verify in code; do not guess.
+- Do not update dependencies casually. Version bumps, patched dependencies, overrides, or vendored dependency changes require explicit approval.
+- Add brief comments for tricky logic. Keep files reasonably small when practical; split or refactor large files instead of growing them indefinitely.
+- If shared guardrails are available locally, review them; otherwise follow this repo's guidance.
+- Use `uv` for Python execution and package management. Do not use `python` or `python3` directly unless the user explicitly asks for it.
+- Prefer `uv run` for scripts and tests, and `uv pip` for package operations.
+
+
+## Multi-Agent Safety
+
+- Do not create, apply, or drop `git stash` entries unless explicitly requested.
+- Do not create, remove, or modify `git worktree` checkouts unless explicitly requested.
+- Do not switch branches or check out a different branch unless explicitly requested.
+- When the user says `push`, you may `git pull --rebase` to integrate latest changes, but never discard other in-progress work.
+- When the user says `commit`, commit only your changes. When the user says `commit all`, commit everything in grouped chunks.
+- When you see unrecognized files or unrelated changes, keep going and focus on your scoped changes.
+
+## Change Hygiene
+
+- If staged and unstaged diffs are formatting-only, resolve them without asking.
+- If a commit or push was already requested, include formatting-only follow-up changes in that same commit when practical.
+- Only stop to ask for confirmation when changes are semantic and may alter behavior.
@@ -1,40 +1,330 @@
-# Changelog
+# Release Notes

-All notable changes to this project will be documented in this file.
+## v0.7.1

-The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
-and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+**Release Date:** March 13, 2026
+**Tag:** v0.7.1

-## [Unreleased]
+### Chrome-Native Browser Control

-### Added
- Initial project structure
- React frontend (honeycomb) with Vite and TypeScript
- Node.js backend (hive) with Express and TypeScript
- Docker Compose configuration for local development
- Configuration system via `config.yaml`
- GitHub Actions CI/CD workflows
- Comprehensive documentation
+v0.7.1 replaces Playwright with direct Chrome DevTools Protocol (CDP) integration. The GCU now launches the user's system Chrome via `open -n` on macOS, connects over CDP, and manages browser lifecycle end-to-end -- no extra browser binary required.

-### Changed
- N/A
+---

-### Deprecated
- N/A
+### Highlights

-### Removed
- N/A
+#### System Chrome via CDP

-### Fixed
- N/A
+The entire GCU browser stack has been rewritten:

-### Security
- N/A
+- **Chrome finder & launcher** -- New `chrome_finder.py` discovers installed Chrome and `chrome_launcher.py` manages process lifecycle with `--remote-debugging-port`
+- **Coexist with user's browser** -- `open -n` on macOS launches a separate Chrome instance so the user's tabs stay untouched
+- **Dynamic viewport sizing** -- Viewport auto-sizes to the available display area, suppressing Chrome warning bars
+- **Orphan cleanup** -- Chrome processes are killed on GCU server shutdown to prevent leaks
+- **`--no-startup-window`** -- Chrome launches headlessly by default until a page is needed

-## [0.1.0] - 2025-01-13
+#### Per-Subagent Browser Isolation

-### Added
- Initial release
+Each GCU subagent gets its own Chrome user-data directory, preventing cookie/session cross-contamination:

-[Unreleased]: https://github.com/adenhq/hive/compare/v0.1.0...HEAD
-[0.1.0]: https://github.com/adenhq/hive/releases/tag/v0.1.0
+- Unique browser profiles injected per subagent
+- Profiles cleaned up after top-level GCU node execution
+- Tab origin and age metadata tracked per subagent
+
+#### Dummy Agent Testing Framework
+
+A comprehensive test suite for validating agent graph patterns without LLM calls:
+
+- 8 test modules covering echo, pipeline, branch, parallel merge, retry, feedback loop, worker, and GCU subagent patterns
+- Shared fixtures and a `run_all.py` runner for CI integration
+- Subagent lifecycle tests
+
+---
+
+### What's New
+
+#### GCU Browser
+
+- **Switch from Playwright to system Chrome via CDP** -- Direct CDP connection replaces Playwright dependency. (@bryanadenhq)
+- **Chrome finder and launcher modules** -- `chrome_finder.py` and `chrome_launcher.py` for cross-platform Chrome discovery and process management. (@bryanadenhq)
+- **Dynamic viewport sizing** -- Auto-size viewport and suppress Chrome warning bar. (@bryanadenhq)
+- **Per-subagent browser profile isolation** -- Unique user-data directories per subagent with cleanup. (@bryanadenhq)
+- **Tab origin/age metadata** -- Track which subagent opened each tab and when. (@bryanadenhq)
+- **`browser_close_all` tool** -- Bulk tab cleanup for agents managing many pages. (@bryanadenhq)
+- **Auto-track popup pages** -- Popups are automatically captured and tracked. (@bryanadenhq)
+- **Auto-snapshot from browser interactions** -- Browser interaction tools return screenshots automatically. (@bryanadenhq)
+- **Kill orphaned Chrome processes** -- GCU server shutdown cleans up lingering Chrome instances. (@bryanadenhq)
+- **`--no-startup-window` Chrome flag** -- Prevent empty window on launch. (@bryanadenhq)
+- **Launch Chrome via `open -n` on macOS** -- Coexist with the user's running browser. (@bryanadenhq)
+
+#### Framework & Runtime
+
+- **Session resume fix for new agents** -- Correctly resume sessions when a new agent is loaded. (@bryanadenhq)
+- **Queen upsert fix** -- Prevent duplicate queen entries on session restore. (@bryanadenhq)
+- **Anchor worker monitoring to queen's session ID on cold-restore** -- Worker monitors reconnect to the correct queen after restart. (@bryanadenhq)
+- **Update meta.json when loading workers** -- Worker metadata stays in sync with runtime state. (@RichardTang-Aden)
+- **Generate worker MCP file correctly** -- Fix MCP config generation for spawned workers. (@RichardTang-Aden)
+- **Share event bus so tool events are visible to parent** -- Tool execution events propagate up to parent graphs. (@bryanadenhq)
+- **Subagent activity tracking in queen status** -- Queen instructions include live subagent status. (@bryanadenhq)
+- **GCU system prompt updates** -- Auto-snapshots, batching, popup tracking, and close_all guidance. (@bryanadenhq)
+
+#### Frontend
+
+- **Loading spinner in draft panel** -- Shows spinner during planning phase instead of blank panel. (@bryanadenhq)
+- **Fix credential modal errors** -- Modal no longer eats errors; banner stays visible. (@bryanadenhq)
+- **Fix credentials_required loop** -- Stop clearing the flag on modal close to prevent infinite re-prompting. (@bryanadenhq)
+- **Fix "Add tab" dropdown overflow** -- Dropdown no longer hidden when many agents are open. (@prasoonmhwr)
+
+#### Testing
+
+- **Dummy agent test framework** -- 8 test modules (echo, pipeline, branch, parallel merge, retry, feedback loop, worker, GCU subagent) with shared fixtures and CI runner. (@bryanadenhq)
+- **Subagent lifecycle tests** -- Validate subagent spawn and completion flows. (@bryanadenhq)
+
+#### Documentation & Infrastructure
+
+- **MCP integration PRD** -- Product requirements for MCP server registry. (@TimothyZhang7)
+- **Skills registry PRD** -- Product requirements for skill registry system. (@bryanadenhq)
+- **Bounty program updates** -- Standard bounty issue template and updated contributor guide. (@bryanadenhq)
+- **Windows quickstart** -- Add default context limit for PowerShell setup. (@bryanadenhq)
+- **Remove deprecated files** -- Clean up `setup_mcp.py`, `verify_mcp.py`, `antigravity-setup.md`, and `setup-antigravity-mcp.sh`. (@bryanadenhq)
+
+---
+
+### Bug Fixes
+
+- Fix credential modal eating errors and banner staying open
+- Stop clearing `credentials_required` on modal close to prevent infinite loop
+- Share event bus so tool events are visible to parent graph
+- Use lazy %-formatting in subagent completion log to avoid f-string in logger
+- Anchor worker monitoring to queen's session ID on cold-restore
+- Update meta.json when loading workers
+- Generate worker MCP file correctly
+- Fix "Add tab" dropdown partially hidden when creating multiple agents
+
+---
+
+### Community Contributors
+
+- **Prasoon Mahawar** (@prasoonmhwr) -- Fix UI overflow on agent tab dropdown
+- **Richard Tang** (@RichardTang-Aden) -- Worker MCP generation and meta.json fixes
+
+---
+
+### Upgrading
+
+```bash
+git pull origin main
+uv sync
+```
+
+The Playwright dependency is no longer required for GCU browser operations. Chrome must be installed on the host system.
+
+---
+
+## v0.7.0
+
+**Release Date:** March 5, 2026
+**Tag:** v0.7.0
+
+Session management refactor release.
+
+---
+
+## v0.5.1
+
+**Release Date:** February 18, 2026
+**Tag:** v0.5.1
+
+### The Hive Gets a Brain
+
+v0.5.1 is our most ambitious release yet. Hive agents can now **build other agents** -- the new Hive Coder meta-agent writes, tests, and fixes agent packages from natural language. The runtime grows multi-graph support so one session can orchestrate multiple agents simultaneously. The TUI gets a complete overhaul with an in-app agent picker, live streaming, and seamless escalation to the Coder. And we're now provider-agnostic: Claude Code subscriptions, OpenAI-compatible endpoints, and any LiteLLM-supported model work out of the box.
+
+---
+
+### Highlights
+
+#### Hive Coder -- The Agent That Builds Agents
+
+A native meta-agent that lives inside the framework at `core/framework/agents/hive_coder/`. Give it a natural-language specification and it produces a complete agent package -- goal definition, node prompts, edge routing, MCP tool wiring, tests, and all boilerplate files.
+
+```bash
+# Launch the Coder directly
+hive code
+
+# Or escalate from any running agent (TUI)
+Ctrl+E  # or /coder in chat
+```
+
+The Coder ships with:
+
+- **Reference documentation** -- anti-patterns, construction guide, and design patterns baked into its system prompt
+- **Guardian watchdog** -- an event-driven monitor that catches agent failures and triggers automatic remediation
+- **Coder Tools MCP server** -- file I/O, fuzzy-match editing, git snapshots, and sandboxed shell execution (`tools/coder_tools_server.py`)
+- **Test generation** -- structural tests for forever-alive agents that don't hang on `runner.run()`
+
+#### Multi-Graph Agent Runtime
+
+`AgentRuntime` now supports loading, managing, and switching between multiple agent graphs within a single session. Six new lifecycle tools give agents (and the TUI) full control:
+
+```python
+# Load a second agent into the runtime
+await runtime.add_graph("exports/deep_research_agent")
+
+# Tools available to agents:
+# load_agent, unload_agent, start_agent, restart_agent, list_agents, get_user_presence
+```
+
+The Hive Coder uses multi-graph internally -- when you escalate from a worker agent, the Coder loads as a separate graph while the worker stays alive in the background.
+
+#### TUI Revamp
+
+The Terminal UI gets a ground-up rebuild with five major additions:
+
+- **Agent Picker** (Ctrl+A) -- tabbed modal screen for browsing Your Agents, Framework agents, and Examples with metadata badges (node count, tool count, session count, tags)
+- **Runtime-optional startup** -- TUI launches without a pre-loaded agent, showing the picker on first open
+- **Live streaming pane** -- dedicated RichLog widget shows LLM tokens as they arrive, replacing the old one-token-per-line display
+- **PDF attachments** -- `/attach` and `/detach` commands with native OS file dialog (macOS, Linux, Windows)
+- **Multi-graph commands** -- `/graphs`, `/graph <id>`, `/load <path>`, `/unload <id>` for managing agent graphs in-session
+
+#### Provider-Agnostic LLM Support
+
+Hive is no longer Anthropic-only. v0.5.1 adds first-class support for:
+
+- **Claude Code subscriptions** -- `use_claude_code_subscription: true` in `~/.hive/configuration.json` reads OAuth tokens from `~/.claude/.credentials.json` with automatic refresh
+- **OpenAI-compatible endpoints** -- `api_base` config routes traffic through any compatible API (Azure OpenAI, vLLM, Ollama, etc.)
+- **Any LiteLLM model** -- `RuntimeConfig` now passes `api_key`, `api_base`, and `extra_kwargs` through to LiteLLM
+
+The quickstart script auto-detects Claude Code subscriptions and ZAI Code installations.
+
+---
+
+### What's New
+
+#### Architecture & Runtime
+
+- **Hive Coder meta-agent** -- Natural-language agent builder with reference docs, guardian watchdog, and `hive code` CLI command. (@TimothyZhang7)
+- **Multi-graph agent sessions** -- `add_graph`/`remove_graph` on AgentRuntime with 6 lifecycle tools (`load_agent`, `unload_agent`, `start_agent`, `restart_agent`, `list_agents`, `get_user_presence`). (@TimothyZhang7)
+- **Claude Code subscription support** -- OAuth token refresh via `use_claude_code_subscription` config, auto-detection in quickstart, LiteLLM header patching. (@TimothyZhang7)
+- **OpenAI-compatible endpoint support** -- `api_base` and `extra_kwargs` in `RuntimeConfig` for any OpenAI-compatible API. (@TimothyZhang7)
+- **Remove deprecated node types** -- Delete `FlexibleGraphExecutor`, `WorkerNode`, `HybridJudge`, `CodeSandbox`, `Plan`, `FunctionNode`, `LLMNode`, `RouterNode`. Deprecated types (`llm_tool_use`, `llm_generate`, `function`, `router`, `human_input`) now raise `RuntimeError` with migration guidance. (@TimothyZhang7)
+- **Interactive credential setup** -- Guided `CredentialSetupSession` with health checks and encrypted storage, accessible via `hive setup-credentials` or automatic prompting on credential errors. (@RichardTang-Aden)
+- **Pre-start confirmation prompt** -- Interactive prompt before agent execution allowing credential updates or abort. (@RichardTang-Aden)
+- **Event bus multi-graph support** -- `graph_id` on events, `filter_graph` on subscriptions, `ESCALATION_REQUESTED` event type, `exclude_own_graph` filter. (@TimothyZhang7)
+
+#### TUI Improvements
+
+- **In-app agent picker** (Ctrl+A) -- Tabbed modal for browsing agents with metadata badges (nodes, tools, sessions, tags). (@TimothyZhang7)
+- **Runtime-optional TUI startup** -- Launches without a pre-loaded agent, shows agent picker on startup. (@TimothyZhang7)
+- **Hive Coder escalation** (Ctrl+E) -- Escalate to Hive Coder and return; also available via `/coder` and `/back` chat commands. (@TimothyZhang7)
+- **PDF attachment support** -- `/attach` and `/detach` commands with native OS file dialog. (@TimothyZhang7)
+- **Streaming output pane** -- Dedicated RichLog widget for live LLM token streaming. (@TimothyZhang7)
+- **Multi-graph TUI commands** -- `/graphs`, `/graph <id>`, `/load <path>`, `/unload <id>`. (@TimothyZhang7)
+- **Agent Guardian watchdog** -- Event-driven monitor that catches secondary agent failures and triggers automatic remediation, with `--no-guardian` CLI flag. (@TimothyZhang7)
+
+#### New Tool Integrations
+
+| Tool                   | Description                                                                                                                                                            | Contributor        |
+| ---------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------ |
+| **Discord**            | 4 MCP tools (`discord_list_guilds`, `discord_list_channels`, `discord_send_message`, `discord_get_messages`) with rate-limit retry and channel filtering               | @mishrapravin114   |
+| **Exa Search API**     | 4 AI-powered search tools (`exa_search`, `exa_find_similar`, `exa_get_contents`, `exa_answer`) with neural/keyword search, domain filters, and citation-backed answers | @JeetKaria06       |
+| **Razorpay**           | 6 payment processing tools for payments, invoices, payment links, and refunds with HTTP Basic Auth                                                                     | @shivamshahi07     |
+| **Google Docs**        | Document creation, reading, and editing with OAuth credential support                                                                                                  | @haliaeetusvocifer |
+| **Gmail enhancements** | Expanded mail operations for inbox management                                                                                                                          | @bryanadenhq       |
+
+#### Infrastructure
+
+- **Default node type → `event_loop`** -- `NodeSpec.node_type` defaults to `"event_loop"` instead of `"llm_tool_use"`. (@TimothyZhang7)
+- **Default `max_node_visits` → 0 (unlimited)** -- Nodes default to unlimited visits, reducing friction for feedback loops and forever-alive agents. (@TimothyZhang7)
+- **Remove `function` field from NodeSpec** -- Follows deprecation of `FunctionNode`. (@TimothyZhang7)
+- **LiteLLM OAuth patch** -- Correct header construction for OAuth tokens (remove `x-api-key` when Bearer token is present). (@TimothyZhang7)
+- **Orchestrator config centralization** -- Reads `api_key`, `api_base`, `extra_kwargs` from centralized `~/.hive/configuration.json`. (@TimothyZhang7)
+- **System prompt datetime injection** -- All system prompts now include current date/time for time-aware agent behavior. (@TimothyZhang7)
+- **Utils module exports** -- Proper `__init__.py` exports for the utils module. (@Siddharth2624)
+- **Increased default max_tokens** -- Opus 4.6 defaults to 32768, Sonnet 4.5 to 16384 (up from 8192). (@TimothyZhang7)
+
+---
+
+### Bug Fixes
+
+- Flush WIP accumulator outputs on cancel/failure so edge conditions see correct values on resume
+- Stall detection state preserved across resume (no more resets on checkpoint restore)
+- Skip client-facing blocking for event-triggered executions (timer/webhook)
+- Executor retry override scoped to actual EventLoopNode instances only
+- Add `_awaiting_input` flag to EventLoopNode to prevent input injection race conditions
+- Fix TUI streaming display (tokens no longer appear one-per-line)
+- Fix `_return_from_escalation` crash when ChatRepl widgets not yet mounted
+- Fix tools registration problems for Google Docs credentials (@RichardTang-Aden)
+- Fix email agent version conflicts (@RichardTang-Aden)
+- Fix coder tool timeouts (120s for tests, 300s cap for commands)
+
+### Documentation
+
+- Clarify installation and prevent root pip install misuse (@paarths-collab)
+
+---
+
+### Agent Updates
+
+- **Email Inbox Management** -- Consolidate `gmail_inbox_guardian` and `inbox_management` into a single unified agent with updated prompts and config. (@RichardTang-Aden, @bryanadenhq)
+- **Job Hunter** -- Updated node prompts, config, and agent metadata; added PDF resume selection. (@bryanadenhq)
+- **Deep Research Agent** -- Revised node implementations with updated prompts and output handling.
+- **Tech News Reporter** -- Revised node prompts for improved output quality.
+- **Vulnerability Assessment** -- Expanded prompts with more detailed assessment instructions. (@bryanadenhq)
+
+---
+
+### Breaking Changes
+
+- **Deprecated node types raise `RuntimeError`** -- `llm_tool_use`, `llm_generate`, `function`, `router`, `human_input` now fail instead of warning. Migrate to `event_loop`.
+- **`NodeSpec.node_type` defaults to `"event_loop"`** (was `"llm_tool_use"`)
+- **`NodeSpec.max_node_visits` defaults to `0` / unlimited** (was `1`)
+- **`NodeSpec.function` field removed** -- `FunctionNode` is deleted; use event_loop nodes with tools instead.
+
+---
+
+### Community Contributors
+
+A huge thank you to everyone who contributed to this release:
+
+- **Richard Tang** (@RichardTang-Aden) -- Interactive credential setup, pre-start confirmation, email agent consolidation, tool registration fixes, lint and formatting
+- **Pravin Mishra** (@mishrapravin114) -- Discord integration with 4 MCP tools
+- **Jeet Karia** (@JeetKaria06) -- Exa Search API integration with 4 AI-powered search tools
+- **Shivam Shahi** (@shivamshahi07) -- Razorpay payment processing integration
+- **Siddharth Varshney** (@Siddharth2624) -- Utils module exports
+- **@haliaeetusvocifer** -- Google Docs integration with OAuth support
+- **Bryan** (@bryanadenhq) -- PDF selection, inbox agent fixes, Job Hunter and Vulnerability Assessment updates
+- **@paarths-collab** -- Documentation improvements
+
+---
+
+### Upgrading
+
+```bash
+git pull origin main
+uv sync
+```
+
+#### Migration Guide
+
+If your agents use deprecated node types, update them:
+
+```python
+# Before (v0.5.0) -- these now raise RuntimeError
+NodeSpec(node_type="llm_tool_use", ...)
+NodeSpec(node_type="function", function=my_func, ...)
+
+# After (v0.5.1) -- use event_loop for everything
+NodeSpec(node_type="event_loop", ...)  # or just omit node_type (it's the default now)
+```
+
+If your agents set `max_node_visits=1` explicitly, they'll still work. The only change is the _default_ -- new agents without an explicit value now get unlimited visits.
+
+To try the new Hive Coder:
+
+```bash
+# Launch Coder directly
+hive code
+
+# Or from TUI -- press Ctrl+E to escalate
+hive tui
+```
@@ -0,0 +1 @@
+AGENTS.md
@@ -1,771 +0,0 @@
-# Developer Guide
-
-This guide covers everything you need to know to develop with the Aden Agent Framework.
-
-## Table of Contents
-
-1. [Repository Overview](#repository-overview)
-2. [Initial Setup](#initial-setup)
-3. [Project Structure](#project-structure)
-4. [Building Agents](#building-agents)
-5. [Testing Agents](#testing-agents)
-6. [Code Style & Conventions](#code-style--conventions)
-7. [Git Workflow](#git-workflow)
-8. [Common Tasks](#common-tasks)
-9. [Troubleshooting](#troubleshooting)
-
---
-
-## Repository Overview
-
-Aden Agent Framework is a Python-based system for building goal-driven, self-improving AI agents.
-
-| Package       | Directory  | Description                                  | Tech Stack        |
-| ------------- | ---------- | -------------------------------------------- | ----------------- |
-| **framework** | `/core`    | Core runtime, graph executor, protocols      | Python 3.11+      |
-| **tools**     | `/tools`   | 19 MCP tools for agent capabilities          | Python 3.11+      |
-| **exports**   | `/exports` | Agent packages and examples                  | Python 3.11+      |
-| **skills**    | `.claude`  | Claude Code skills for building/testing      | Markdown          |
-
-### Key Principles
-
- **Goal-Driven Development**: Define objectives, framework generates agent graphs
- **Self-Improving**: Agents adapt and evolve based on failures
- **SDK-Wrapped Nodes**: Built-in memory, monitoring, and tool access
- **Human-in-the-Loop**: Intervention points for human oversight
- **Production-Ready**: Evaluation, testing, and deployment infrastructure
-
---
-
-## Initial Setup
-
-### Prerequisites
-
-Ensure you have installed:
-
- **Python 3.11+** - [Download](https://www.python.org/downloads/) (3.12 or 3.13 recommended)
- **pip** - Package installer for Python (comes with Python)
- **git** - Version control
- **Claude Code** - [Install](https://docs.anthropic.com/claude/docs/claude-code) (optional, for using building skills)
-
-Verify installation:
-
-```bash
-python --version    # Should be 3.11+
-pip --version       # Should be latest
-git --version       # Any recent version
-```
-
-### Step-by-Step Setup
-
-```bash
-# 1. Clone the repository
-git clone https://github.com/adenhq/hive.git
-cd hive
-
-# 2. Run automated Python setup
-./scripts/setup-python.sh
-```
-
-The setup script performs these actions:
-
-1. Checks Python version (3.10+ required, 3.11+ recommended)
-2. Installs `framework` package from `/core` (editable mode)
-3. Installs `aden_tools` package from `/tools` (editable mode)
-4. Fixes package compatibility (upgrades openai for litellm)
-5. Verifies all installations
-
-### API Keys (Optional)
-
-For running agents with real LLMs:
-
-```bash
-# Add to your shell profile (~/.bashrc, ~/.zshrc, etc.)
-export ANTHROPIC_API_KEY="your-key-here"
-export OPENAI_API_KEY="your-key-here"        # Optional
-export BRAVE_SEARCH_API_KEY="your-key-here"  # Optional, for web search tool
-```
-
-Get API keys:
- **Anthropic**: [console.anthropic.com](https://console.anthropic.com/)
- **OpenAI**: [platform.openai.com](https://platform.openai.com/)
- **Brave Search**: [brave.com/search/api](https://brave.com/search/api/)
-
-### Install Claude Code Skills
-
-```bash
-# Install building-agents and testing-agent skills
-./quickstart.sh
-```
-
-This installs:
- `/building-agents` - Build new goal-driven agents
- `/testing-agent` - Test agents with evaluation framework
-
-### Verify Setup
-
-```bash
-# Verify package imports
-python -c "import framework; print('✓ framework OK')"
-python -c "import aden_tools; print('✓ aden_tools OK')"
-python -c "import litellm; print('✓ litellm OK')"
-
-# Run an example agent
-PYTHONPATH=core:exports python -m support_ticket_agent validate
-```
-
---
-
-## Project Structure
-
-```
-hive/                                    # Repository root
-│
-├── .github/                             # GitHub configuration
-│   ├── workflows/
-│   │   ├── ci.yml                       # Runs on every PR
-│   │   └── release.yml                  # Runs on tags
-│   ├── ISSUE_TEMPLATE/                  # Bug report & feature request templates
-│   ├── PULL_REQUEST_TEMPLATE.md         # PR description template
-│   └── CODEOWNERS                       # Auto-assign reviewers
-│
-├── .claude/                             # Claude Code Skills
-│   └── skills/
-│       ├── building-agents/             # Skills for building agents
-│       │   ├── SKILL.md                 # Main skill definition
-│       │   ├── building-agents-core/
-│       │   ├── building-agents-patterns/
-│       │   └── building-agents-construction/
-│       ├── testing-agent/               # Skills for testing agents
-│       │   └── SKILL.md
-│       └── agent-workflow/              # Complete workflow orchestration
-│
-├── core/                                # CORE FRAMEWORK PACKAGE
-│   ├── framework/                       # Main package code
-│   │   ├── runner/                      # AgentRunner - loads and runs agents
-│   │   ├── executor/                    # GraphExecutor - executes node graphs
-│   │   ├── protocols/                   # Standard protocols (hooks, tracing, etc.)
-│   │   ├── llm/                         # LLM provider integrations (Anthropic, OpenAI, etc.)
-│   │   ├── memory/                      # Memory systems (STM, LTM/RLM)
-│   │   ├── tools/                       # Tool registry and management
-│   │   └── __init__.py
-│   ├── pyproject.toml                   # Package metadata and dependencies
-│   ├── requirements.txt                 # Python dependencies
-│   ├── README.md                        # Framework documentation
-│   ├── MCP_INTEGRATION_GUIDE.md         # MCP server integration guide
-│   └── docs/                            # Protocol documentation
-│
-├── tools/                               # TOOLS PACKAGE (19 MCP tools)
-│   ├── src/
-│   │   └── aden_tools/
-│   │       ├── tools/                   # Individual tool implementations
-│   │       │   ├── web_search_tool/
-│   │       │   ├── web_scrape_tool/
-│   │       │   ├── file_system_toolkits/
-│   │       │   └── ...                  # 19 tools total
-│   │       ├── mcp_server.py            # HTTP MCP server
-│   │       └── __init__.py
-│   ├── pyproject.toml                   # Package metadata
-│   ├── requirements.txt                 # Python dependencies
-│   └── README.md                        # Tools documentation
-│
-├── exports/                             # AGENT PACKAGES
-│   ├── support_ticket_agent/            # Example: Support ticket handler
-│   ├── market_research_agent/           # Example: Market research
-│   ├── outbound_sales_agent/            # Example: Sales outreach
-│   ├── personal_assistant_agent/        # Example: Personal assistant
-│   └── ...                              # More agent examples
-│
-├── docs/                                # Documentation
-│   ├── getting-started.md               # Quick start guide
-│   ├── configuration.md                 # Configuration reference
-│   ├── architecture.md                  # System architecture
-│   └── articles/                        # Technical articles
-│
-├── scripts/                             # Build & utility scripts
-│   ├── setup-python.sh                  # Python environment setup
-│   └── setup.sh                         # Legacy setup script
-│
-├── quickstart.sh                        # Install Claude Code skills
-├── ENVIRONMENT_SETUP.md                 # Complete Python setup guide
-├── README.md                            # Project overview
-├── DEVELOPER.md                         # This file
-├── CONTRIBUTING.md                      # Contribution guidelines
-├── CHANGELOG.md                         # Version history
-├── ROADMAP.md                           # Product roadmap
-├── LICENSE                              # Apache 2.0 License
-├── CODE_OF_CONDUCT.md                   # Community guidelines
-└── SECURITY.md                          # Security policy
-```
-
---
-
-## Building Agents
-
-### Using Claude Code Skills
-
-The fastest way to build agents is using the Claude Code skills:
-
-```bash
-# Install skills (one-time)
-./quickstart.sh
-
-# Build a new agent
-claude> /building-agents
-
-# Test the agent
-claude> /testing-agent
-```
-
-### Agent Development Workflow
-
-1. **Define Your Goal**
-   ```
-   claude> /building-agents
-   Enter goal: "Build an agent that processes customer support tickets"
-   ```
-
-2. **Design the Workflow**
-   - The skill guides you through defining nodes
-   - Each node is a unit of work (LLM call, function, router)
-   - Edges define how execution flows
-
-3. **Generate the Agent**
-   - The skill generates a complete Python package in `exports/`
-   - Includes: `agent.json`, `tools.py`, `README.md`
-
-4. **Validate the Agent**
-   ```bash
-   PYTHONPATH=core:exports python -m your_agent_name validate
-   ```
-
-5. **Test the Agent**
-   ```
-   claude> /testing-agent
-   ```
-
-### Manual Agent Development
-
-If you prefer to build agents manually:
-
-```python
-# exports/my_agent/agent.json
-{
-  "goal": {
-    "goal_id": "support_ticket",
-    "name": "Support Ticket Handler",
-    "description": "Process customer support tickets",
-    "success_criteria": "Ticket is categorized, prioritized, and routed correctly"
-  },
-  "nodes": [
-    {
-      "node_id": "analyze",
-      "name": "Analyze Ticket",
-      "node_type": "llm",
-      "system_prompt": "Analyze this support ticket...",
-      "input_keys": ["ticket_content"],
-      "output_keys": ["category", "priority"]
-    }
-  ],
-  "edges": [
-    {
-      "edge_id": "start_to_analyze",
-      "source": "START",
-      "target": "analyze",
-      "condition": "on_success"
-    }
-  ]
-}
-```
-
-### Running Agents
-
-```bash
-# Validate agent structure
-PYTHONPATH=core:exports python -m agent_name validate
-
-# Show agent information
-PYTHONPATH=core:exports python -m agent_name info
-
-# Run agent with input
-PYTHONPATH=core:exports python -m agent_name run --input '{
-  "ticket_content": "My login is broken",
-  "customer_id": "CUST-123"
-}'
-
-# Run in mock mode (no LLM calls)
-PYTHONPATH=core:exports python -m agent_name run --mock --input '{...}'
-```
-
---
-
-## Testing Agents
-
-### Using the Testing Agent Skill
-
-```bash
-# Run tests for an agent
-claude> /testing-agent
-```
-
-This generates and runs:
- **Constraint tests** - Verify agent respects constraints
- **Success tests** - Verify agent achieves success criteria
- **Integration tests** - End-to-end workflows
-
-### Manual Testing
-
-```bash
-# Run all tests for an agent
-PYTHONPATH=core:exports python -m agent_name test
-
-# Run specific test type
-PYTHONPATH=core:exports python -m agent_name test --type constraint
-PYTHONPATH=core:exports python -m agent_name test --type success
-
-# Run with parallel execution
-PYTHONPATH=core:exports python -m agent_name test --parallel 4
-
-# Fail fast (stop on first failure)
-PYTHONPATH=core:exports python -m agent_name test --fail-fast
-```
-
-### Writing Custom Tests
-
-```python
-# exports/my_agent/tests/test_custom.py
-import pytest
-from framework.runner import AgentRunner
-
-def test_ticket_categorization():
-    """Test that tickets are categorized correctly"""
-    runner = AgentRunner.from_file("exports/my_agent/agent.json")
-
-    result = runner.run({
-        "ticket_content": "I can't log in to my account"
-    })
-
-    assert result["category"] == "authentication"
-    assert result["priority"] in ["high", "medium", "low"]
-```
-
---
-
-## Code Style & Conventions
-
-### Python Code Style
-
- **PEP 8** - Follow Python style guide
- **Type hints** - Use for function signatures and class attributes
- **Docstrings** - Document classes and public functions
- **Black** - Code formatter (run with `black .`)
-
-```python
-# Good
-from typing import Optional, Dict, Any
-
-def process_ticket(
-    ticket_content: str,
-    customer_id: str,
-    priority: Optional[str] = None
-) -> Dict[str, Any]:
-    """
-    Process a customer support ticket.
-
-    Args:
-        ticket_content: The content of the ticket
-        customer_id: The customer's ID
-        priority: Optional priority override
-
-    Returns:
-        Dictionary with processing results
-    """
-    # Implementation
-    return {"status": "processed", "id": ticket_id}
-
-# Avoid
-def process_ticket(ticket_content, customer_id, priority=None):
-    # No types, no docstring
-    return {"status": "processed", "id": ticket_id}
-```
-
-### Agent Package Structure
-
-```
-my_agent/
-├── __init__.py              # Package initialization
-├── __main__.py              # CLI entry point
-├── agent.json               # Agent definition (nodes, edges, goal)
-├── tools.py                 # Custom tools (optional)
-├── mcp_servers.json         # MCP server config (optional)
-├── README.md                # Agent documentation
-└── tests/                   # Test files
-    ├── __init__.py
-    ├── test_constraint.py   # Constraint tests
-    └── test_success.py      # Success criteria tests
-```
-
-### File Naming
-
-| Type                | Convention               | Example                     |
-| ------------------- | ------------------------ | --------------------------- |
-| Modules             | snake_case               | `ticket_handler.py`         |
-| Classes             | PascalCase               | `TicketHandler`             |
-| Functions/Variables | snake_case               | `process_ticket()`          |
-| Constants           | UPPER_SNAKE_CASE         | `MAX_RETRIES = 3`           |
-| Test files          | `test_` prefix           | `test_ticket_handler.py`    |
-| Agent packages      | snake_case               | `support_ticket_agent/`     |
-
-### Import Order
-
-1. Standard library
-2. Third-party packages
-3. Framework imports
-4. Local imports
-
-```python
-# Standard library
-import json
-from typing import Dict, Any
-
-# Third-party
-import litellm
-from pydantic import BaseModel
-
-# Framework
-from framework.runner import AgentRunner
-from framework.context import NodeContext
-
-# Local
-from .tools import custom_tool
-```
-
---
-
-## Git Workflow
-
-### Branch Naming
-
-```
-feature/add-user-authentication
-bugfix/fix-login-redirect
-hotfix/security-patch
-chore/update-dependencies
-docs/improve-readme
-```
-
-### Commit Messages
-
-Follow [Conventional Commits](https://www.conventionalcommits.org/):
-
-```
-<type>(<scope>): <description>
-
-[optional body]
-
-[optional footer]
-```
-
-**Types:**
-
- `feat` - New feature
- `fix` - Bug fix
- `docs` - Documentation only
- `style` - Formatting, missing semicolons, etc.
- `refactor` - Code change that neither fixes a bug nor adds a feature
- `test` - Adding or updating tests
- `chore` - Maintenance tasks
-
-**Examples:**
-
-```
-feat(auth): add JWT authentication
-
-fix(api): handle null response from external service
-
-docs(readme): update installation instructions
-
-chore(deps): update React to 18.2.0
-```
-
-### Pull Request Process
-
-1. Create a feature branch from `main`
-2. Make your changes with clear commits
-3. Run tests locally: `npm run test`
-4. Run linting: `npm run lint`
-5. Push and create a PR
-6. Fill out the PR template
-7. Request review from CODEOWNERS
-8. Address feedback
-9. Squash and merge when approved
-
---
-
-## Debugging
-
-### Frontend Debugging
-
-**React Developer Tools:**
-
-1. Install the [React DevTools browser extension](https://react.dev/learn/react-developer-tools)
-2. Open browser DevTools → React tab
-3. Inspect component tree, props, state, and hooks
-
-**VS Code Debugging:**
-
-1. Add Chrome debug configuration to `.vscode/launch.json`:
-
-```json
-{
-  "type": "chrome",
-  "request": "launch",
-  "name": "Debug Frontend",
-  "url": "http://localhost:3000",
-  "webRoot": "${workspaceFolder}/honeycomb/src"
-}
-```
-
-2. Start the dev server: `npm run dev -w honeycomb`
-3. Press F5 in VS Code
-
-### Backend Debugging
-
-**VS Code Debugging:**
-
-1. Add Node debug configuration:
-
-```json
-{
-  "type": "node",
-  "request": "launch",
-  "name": "Debug Backend",
-  "runtimeExecutable": "npm",
-  "runtimeArgs": ["run", "dev"],
-  "cwd": "${workspaceFolder}/hive",
-  "console": "integratedTerminal"
-}
-```
-
-2. Set breakpoints in your code
-3. Press F5 to start debugging
-
-**Logging:**
-
-```typescript
-import { logger } from "../utils/logger";
-
-// Add debug logs
-logger.debug("Processing request", {
-  userId: req.user.id,
-  body: req.body,
-});
-```
-
---
-
-## Common Tasks
-
-### Adding Python Dependencies
-
-```bash
-# Add to core framework
-cd core
-pip install <package>
-# Then add to requirements.txt or pyproject.toml
-
-# Add to tools package
-cd tools
-pip install <package>
-# Then add to requirements.txt or pyproject.toml
-
-# Reinstall in editable mode
-pip install -e .
-```
-
-### Creating a New Agent
-
-```bash
-# Option 1: Use Claude Code skill (recommended)
-claude> /building-agents
-
-# Option 2: Copy from example
-cp -r exports/support_ticket_agent exports/my_new_agent
-cd exports/my_new_agent
-# Edit agent.json, tools.py, README.md
-
-# Option 3: Use the agent builder MCP tools (advanced)
-# See core/MCP_BUILDER_TOOLS_GUIDE.md
-```
-
-### Adding Custom Tools to an Agent
-
-```python
-# exports/my_agent/tools.py
-from typing import Dict, Any
-
-def my_custom_tool(param1: str, param2: int) -> Dict[str, Any]:
-    """
-    Description of what this tool does.
-
-    Args:
-        param1: Description of param1
-        param2: Description of param2
-
-    Returns:
-        Dictionary with tool results
-    """
-    # Implementation
-    return {"result": "success", "data": ...}
-
-# Register tool in agent.json
-{
-  "nodes": [
-    {
-      "node_id": "use_tool",
-      "node_type": "function",
-      "tools": ["my_custom_tool"],
-      ...
-    }
-  ]
-}
-```
-
-### Adding MCP Server Integration
-
-```bash
-# 1. Create mcp_servers.json in your agent package
-# exports/my_agent/mcp_servers.json
-{
-  "tools": {
-    "transport": "stdio",
-    "command": "python",
-    "args": ["-m", "aden_tools.mcp_server"],
-    "cwd": "tools/",
-    "description": "File system and web tools"
-  }
-}
-
-# 2. Reference tools in agent.json
-{
-  "nodes": [
-    {
-      "node_id": "search",
-      "tools": ["web_search", "web_scrape"],
-      ...
-    }
-  ]
-}
-```
-
-### Setting Environment Variables
-
-```bash
-# Add to your shell profile (~/.bashrc, ~/.zshrc, etc.)
-export ANTHROPIC_API_KEY="your-key-here"
-export OPENAI_API_KEY="your-key-here"
-export BRAVE_SEARCH_API_KEY="your-key-here"
-
-# Or create .env file (not committed to git)
-echo 'ANTHROPIC_API_KEY=your-key-here' >> .env
-```
-
-### Debugging Agent Execution
-
-```python
-# Add debug logging to your agent
-import logging
-logging.basicConfig(level=logging.DEBUG)
-
-# Run with verbose output
-PYTHONPATH=core:exports python -m agent_name run --input '{...}' --verbose
-
-# Use mock mode to test without LLM calls
-PYTHONPATH=core:exports python -m agent_name run --mock --input '{...}'
-```
-
---
-
-## Troubleshooting
-
-### Port Already in Use
-
-```bash
-# Find process using port
-lsof -i :3000
-lsof -i :4000
-
-# Kill process
-kill -9 <PID>
-
-# Or change ports in config.yaml and regenerate
-```
-
-### Node Modules Issues
-
-```bash
-# Clean everything and reinstall
-npm run clean
-rm -rf node_modules package-lock.json
-npm install
-```
-
-### Docker Issues
-
-```bash
-# Reset Docker state
-docker compose down -v
-docker system prune -f
-docker compose build --no-cache
-docker compose up
-```
-
-### TypeScript Errors After Pull
-
-```bash
-# Rebuild TypeScript
-npm run build
-
-# Or restart TS server in VS Code
-# Cmd/Ctrl + Shift + P → "TypeScript: Restart TS Server"
-```
-
-### Environment Variables Not Loading
-
-```bash
-# Regenerate from config.yaml
-npm run generate:env
-
-# Verify files exist
-cat .env
-cat honeycomb/.env
-cat hive/.env
-
-# Restart dev servers after changing env
-```
-
-### Tests Failing
-
-```bash
-# Run with verbose output
-npm run test -w honeycomb -- --reporter=verbose
-
-# Run single test file
-npm run test -w honeycomb -- src/components/Button.test.tsx
-
-# Clear test cache
-npm run test -w honeycomb -- --clearCache
-```
-
---
-
-## Getting Help
-
- **Documentation**: Check the `/docs` folder
- **Issues**: Search [existing issues](https://github.com/adenhq/hive/issues)
- **Discord**: Join our [community](https://discord.com/invite/MXE49hrKDk)
- **Code Review**: Tag a maintainer on your PR
-
---
-
-_Happy coding!_ 🐝
@@ -1,347 +0,0 @@
-# Agent Development Environment Setup
-
-Complete setup guide for building and running goal-driven agents with the Aden Agent Framework.
-
-## Quick Setup
-
-```bash
-# Run the automated setup script
-./scripts/setup-python.sh
-```
-
-This will:
-
- Check Python version (requires 3.10+, recommends 3.11+)
- Install the core framework package (`framework`)
- Install the tools package (`aden_tools`)
- Fix package compatibility issues (openai + litellm)
- Verify all installations
-
-## Manual Setup (Alternative)
-
-If you prefer to set up manually or the script fails:
-
-### 1. Install Core Framework
-
-```bash
-cd core
-pip install -e .
-```
-
-### 2. Install Tools Package
-
-```bash
-cd tools
-pip install -e .
-```
-
-### 3. Upgrade OpenAI Package
-
-```bash
-# litellm requires openai >= 1.0.0
-pip install --upgrade "openai>=1.0.0"
-```
-
-### 4. Verify Installation
-
-```bash
-python -c "import framework; print('✓ framework OK')"
-python -c "import aden_tools; print('✓ aden_tools OK')"
-python -c "import litellm; print('✓ litellm OK')"
-```
-
-## Requirements
-
-### Python Version
-
- **Minimum:** Python 3.10
- **Recommended:** Python 3.11 or 3.12
- **Tested on:** Python 3.11, 3.12, 3.13
-
-### System Requirements
-
- pip (latest version)
- 2GB+ RAM
- Internet connection (for LLM API calls)
-
-### API Keys (Optional)
-
-For running agents with real LLMs:
-
-```bash
-export ANTHROPIC_API_KEY="your-key-here"
-```
-
-## Running Agents
-
-All agent commands must be run from the project root with `PYTHONPATH` set:
-
-```bash
-# From /home/timothy/oss/hive/ directory
-PYTHONPATH=core:exports python -m agent_name COMMAND
-```
-
-### Example: Support Ticket Agent
-
-```bash
-# Validate agent structure
-PYTHONPATH=core:exports python -m support_ticket_agent validate
-
-# Show agent information
-PYTHONPATH=core:exports python -m support_ticket_agent info
-
-# Run agent with input
-PYTHONPATH=core:exports python -m support_ticket_agent run --input '{
-  "ticket_content": "My login is broken. Error 401.",
-  "customer_id": "CUST-123",
-  "ticket_id": "TKT-456"
-}'
-
-# Run in mock mode (no LLM calls)
-PYTHONPATH=core:exports python -m support_ticket_agent run --mock --input '{...}'
-```
-
-### Example: Other Agents
-
-```bash
-# Market Research Agent
-PYTHONPATH=core:exports python -m market_research_agent info
-
-# Outbound Sales Agent
-PYTHONPATH=core:exports python -m outbound_sales_agent validate
-
-# Personal Assistant Agent
-PYTHONPATH=core:exports python -m personal_assistant_agent run --input '{...}'
-```
-
-## Building New Agents
-
-Use Claude Code CLI with the agent building skills:
-
-### 1. Install Skills (One-time)
-
-```bash
-./quickstart.sh
-```
-
-This installs:
-
- `/building-agents` - Build new agents
- `/testing-agent` - Test agents
-
-### 2. Build an Agent
-
-```
-claude> /building-agents
-```
-
-Follow the prompts to:
-
-1. Define your agent's goal
-2. Design the workflow nodes
-3. Connect edges
-4. Generate the agent package
-
-### 3. Test Your Agent
-
-```
-claude> /testing-agent
-```
-
-Creates comprehensive test suites for your agent.
-
-## Troubleshooting
-
-### "ModuleNotFoundError: No module named 'framework'"
-
-**Solution:** Install the core package:
-
-```bash
-cd core && pip install -e .
-```
-
-### "ModuleNotFoundError: No module named 'aden_tools'"
-
-**Solution:** Install the tools package:
-
-```bash
-cd tools && pip install -e .
-```
-
-Or run the setup script:
-
-```bash
-./scripts/setup-python.sh
-```
-
-### "ModuleNotFoundError: No module named 'openai.\_models'"
-
-**Cause:** Outdated `openai` package (0.27.x) incompatible with `litellm`
-
-**Solution:** Upgrade openai:
-
-```bash
-pip install --upgrade "openai>=1.0.0"
-```
-
-### "No module named 'support_ticket_agent'"
-
-**Cause:** Not running from project root or missing PYTHONPATH
-
-**Solution:** Ensure you're in `/home/timothy/oss/hive/` and use:
-
-```bash
-PYTHONPATH=core:exports python -m support_ticket_agent validate
-```
-
-### Agent imports fail with "broken installation"
-
-**Symptom:** `pip list` shows packages pointing to non-existent directories
-
-**Solution:** Reinstall packages properly:
-
-```bash
-# Remove broken installations
-pip uninstall -y framework tools aden-tools
-
-# Reinstall correctly
-cd /home/timothy/oss/hive
-./scripts/setup-python.sh
-```
-
-## Package Structure
-
-The Hive framework consists of three Python packages:
-
-```
-hive/
-├── core/                    # Core framework (runtime, graph executor, LLM providers)
-│   ├── framework/
-│   ├── pyproject.toml
-│   └── requirements.txt
-│
-├── tools/                   # Tools and MCP servers
-│   ├── src/
-│   │   └── aden_tools/     # Actual package location
-│   ├── pyproject.toml
-│   └── README.md
-│
-└── exports/                 # Agent packages (your agents go here)
-    ├── support_ticket_agent/
-    ├── market_research_agent/
-    ├── outbound_sales_agent/
-    └── personal_assistant_agent/
-```
-
-### Why PYTHONPATH is Required
-
-The packages are installed in **editable mode** (`pip install -e`), which means:
-
- `framework` and `aden_tools` are globally importable (no PYTHONPATH needed)
- `exports` is NOT installed as a package (PYTHONPATH required)
-
-This design allows agents in `exports/` to be:
-
- Developed independently
- Version controlled separately
- Deployed as standalone packages
-
-## Development Workflow
-
-### 1. Setup (Once)
-
-```bash
-./scripts/setup-python.sh
-```
-
-### 2. Build Agent (Claude Code)
-
-```
-claude> /building-agents
-Enter goal: "Build an agent that processes customer support tickets"
-```
-
-### 3. Validate Agent
-
-```bash
-PYTHONPATH=core:exports python -m support_ticket_agent validate
-```
-
-### 4. Test Agent
-
-```
-claude> /testing-agent
-```
-
-### 5. Run Agent
-
-```bash
-PYTHONPATH=core:exports python -m support_ticket_agent run --input '{...}'
-```
-
-## IDE Setup
-
-### VSCode
-
-Add to `.vscode/settings.json`:
-
-```json
-{
-  "python.analysis.extraPaths": [
-    "${workspaceFolder}/core",
-    "${workspaceFolder}/exports"
-  ],
-  "python.autoComplete.extraPaths": [
-    "${workspaceFolder}/core",
-    "${workspaceFolder}/exports"
-  ]
-}
-```
-
-### PyCharm
-
-1. Open Project Settings → Project Structure
-2. Mark `core` as Sources Root
-3. Mark `exports` as Sources Root
-
-## Environment Variables
-
-### Required for LLM Operations
-
-```bash
-export ANTHROPIC_API_KEY="sk-ant-..."
-```
-
-### Optional Configuration
-
-```bash
-# Credentials storage location (default: ~/.aden/credentials)
-export ADEN_CREDENTIALS_PATH="/custom/path"
-
-# Agent storage location (default: /tmp)
-export AGENT_STORAGE_PATH="/custom/storage"
-```
-
-## Additional Resources
-
- **Framework Documentation:** [core/README.md](core/README.md)
- **Tools Documentation:** [tools/README.md](tools/README.md)
- **Example Agents:** [exports/](exports/)
- **Agent Building Guide:** [.claude/skills/building-agents-construction/SKILL.md](.claude/skills/building-agents-construction/SKILL.md)
- **Testing Guide:** [.claude/skills/testing-agent/SKILL.md](.claude/skills/testing-agent/SKILL.md)
-
-## Contributing
-
-When contributing agent packages:
-
-1. Place agents in `exports/agent_name/`
-2. Follow the standard agent structure (see existing agents)
-3. Include README.md with usage instructions
-4. Add tests if using `/testing-agent`
-5. Document required environment variables
-
-## Support
-
- **Issues:** https://github.com/adenhq/hive/issues
- **Discord:** https://discord.com/invite/MXE49hrKDk
- **Documentation:** https://docs.adenhq.com/
@@ -0,0 +1,56 @@
+.PHONY: lint format check test test-tools test-live test-all install-hooks help frontend-install frontend-dev frontend-build
+
+# ── Ensure uv is findable in Git Bash on Windows ──────────────────────────────
+# uv installs to ~/.local/bin on Windows/Linux/macOS. Git Bash may not include
+# this in PATH by default, so we prepend it here.
+export PATH := $(HOME)/.local/bin:$(PATH)
+
+# ── Targets ───────────────────────────────────────────────────────────────────
+
+help: ## Show this help
+	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | \
+		awk 'BEGIN {FS = ":.*?## "}; {printf "  \033[36m%-15s\033[0m %s\n", $$1, $$2}'
+
+lint: ## Run ruff linter and formatter (with auto-fix)
+	cd core && uv run ruff check --fix .
+	cd tools && uv run ruff check --fix .
+	cd core && uv run ruff format .
+	cd tools && uv run ruff format .
+
+format: ## Run ruff formatter
+	cd core && uv run ruff format .
+	cd tools && uv run ruff format .
+
+check: ## Run all checks without modifying files (CI-safe)
+	cd core && uv run ruff check .
+	cd tools && uv run ruff check .
+	cd core && uv run ruff format --check .
+	cd tools && uv run ruff format --check .
+
+test: ## Run all tests (core + tools, excludes live)
+	cd core && uv run python -m pytest tests/ -v --ignore=tests/dummy_agents
+	cd tools && uv run python -m pytest -v
+
+test-tools: ## Run tool tests only (mocked, no credentials needed)
+	cd tools && uv run python -m pytest -v
+
+test-live: ## Run live integration tests (requires real API credentials)
+	cd tools && uv run python -m pytest -m live -s -o "addopts=" --log-cli-level=INFO
+
+test-all: ## Run everything including live tests
+	cd core && uv run python -m pytest tests/ -v --ignore=tests/dummy_agents
+	cd tools && uv run python -m pytest -v
+	cd tools && uv run python -m pytest -m live -s -o "addopts=" --log-cli-level=INFO
+
+install-hooks: ## Install pre-commit hooks
+	uv pip install pre-commit
+	pre-commit install
+
+frontend-install: ## Install frontend npm packages
+	cd core/frontend && npm install
+
+frontend-dev: ## Start frontend dev server
+	cd core/frontend && npm run dev
+
+frontend-build: ## Build frontend for production
+	cd core/frontend && npm run build
@@ -1,339 +0,0 @@
-<p align="center">
-  <img width="100%" alt="Hive Banner" src="https://storage.googleapis.com/aden-prod-assets/website/aden-title-card.png" />
-</p>
-
-<p align="center">
-  <a href="README.md">English</a> |
-  <a href="README.zh-CN.md">简体中文</a> |
-  <a href="README.es.md">Español</a> |
-  <a href="README.pt.md">Português</a> |
-  <a href="README.ja.md">日本語</a> |
-  <a href="README.ru.md">Русский</a>
-</p>
-
-[![Apache 2.0 License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/adenhq/hive/blob/main/LICENSE)
-[![Y Combinator](https://img.shields.io/badge/Y%20Combinator-Aden-orange)](https://www.ycombinator.com/companies/aden)
-[![Docker Pulls](https://img.shields.io/docker/pulls/adenhq/hive?logo=Docker&labelColor=%23528bff)](https://hub.docker.com/u/adenhq)
-[![Discord](https://img.shields.io/discord/1172610340073242735?logo=discord&labelColor=%235462eb&logoColor=%23f5f5f5&color=%235462eb)](https://discord.com/invite/MXE49hrKDk)
-[![Twitter Follow](https://img.shields.io/twitter/follow/teamaden?logo=X&color=%23f5f5f5)](https://x.com/aden_hq)
-[![LinkedIn](https://custom-icon-badges.demolab.com/badge/LinkedIn-0A66C2?logo=linkedin-white&logoColor=fff)](https://www.linkedin.com/company/teamaden/)
-
-<p align="center">
-  <img src="https://img.shields.io/badge/AI_Agents-Self--Improving-brightgreen?style=flat-square" alt="AI Agents" />
-  <img src="https://img.shields.io/badge/Multi--Agent-Systems-blue?style=flat-square" alt="Multi-Agent" />
-  <img src="https://img.shields.io/badge/Goal--Driven-Development-purple?style=flat-square" alt="Goal-Driven" />
-  <img src="https://img.shields.io/badge/Human--in--the--Loop-orange?style=flat-square" alt="HITL" />
-  <img src="https://img.shields.io/badge/Production--Ready-red?style=flat-square" alt="Production" />
-</p>
-<p align="center">
-  <img src="https://img.shields.io/badge/OpenAI-supported-412991?style=flat-square&logo=openai" alt="OpenAI" />
-  <img src="https://img.shields.io/badge/Anthropic-supported-d4a574?style=flat-square" alt="Anthropic" />
-  <img src="https://img.shields.io/badge/Google_Gemini-supported-4285F4?style=flat-square&logo=google" alt="Gemini" />
-  <img src="https://img.shields.io/badge/MCP-19_Tools-00ADD8?style=flat-square" alt="MCP" />
-</p>
-
-## Descripción General
-
-Construye agentes de IA confiables y auto-mejorables sin codificar flujos de trabajo. Define tu objetivo a través de una conversación con un agente de codificación, y el framework genera un grafo de nodos con código de conexión creado dinámicamente. Cuando algo falla, el framework captura los datos del error, evoluciona el agente a través del agente de codificación y lo vuelve a desplegar. Los nodos de intervención humana integrados, la gestión de credenciales y el monitoreo en tiempo real te dan control sin sacrificar la adaptabilidad.
-
-Visita [adenhq.com](https://adenhq.com) para documentación completa, ejemplos y guías.
-
-## ¿Qué es Aden?
-
-<p align="center">
-  <img width="100%" alt="Aden Architecture" src="docs/assets/aden-architecture-diagram.jpg" />
-</p>
-
-Aden es una plataforma para construir, desplegar, operar y adaptar agentes de IA:
-
- **Construir** - Un Agente de Codificación genera Agentes de Trabajo especializados (Ventas, Marketing, Operaciones) a partir de objetivos en lenguaje natural
- **Desplegar** - Despliegue headless con integración CI/CD y gestión completa del ciclo de vida de API
- **Operar** - Monitoreo en tiempo real, observabilidad y guardarraíles de ejecución mantienen los agentes confiables
- **Adaptar** - Evaluación continua, supervisión y adaptación aseguran que los agentes mejoren con el tiempo
- **Infraestructura** - Memoria compartida, integraciones LLM, herramientas y habilidades impulsan cada agente
-
-## Enlaces Rápidos
-
- **[Documentación](https://docs.adenhq.com/)** - Guías completas y referencia de API
- **[Guía de Auto-Hospedaje](https://docs.adenhq.com/getting-started/quickstart)** - Despliega Hive en tu infraestructura
- **[Registro de Cambios](https://github.com/adenhq/hive/releases)** - Últimas actualizaciones y versiones
-<!-- - **[Hoja de Ruta](https://adenhq.com/roadmap)** - Funciones y planes próximos -->
- **[Reportar Problemas](https://github.com/adenhq/hive/issues)** - Reportes de bugs y solicitudes de funciones
-
-## Inicio Rápido
-
-### Prerrequisitos
-
- [Python 3.11+](https://www.python.org/downloads/) - Para desarrollo de agentes
- [Docker](https://docs.docker.com/get-docker/) (v20.10+) - Opcional, para herramientas en contenedores
-
-### Instalación
-
-```bash
-# Clonar el repositorio
-git clone https://github.com/adenhq/hive.git
-cd hive
-
-# Ejecutar configuración del entorno Python
-./scripts/setup-python.sh
-```
-
-Esto instala:
- **framework** - Runtime del agente principal y ejecutor de grafos
- **aden_tools** - 19 herramientas MCP para capacidades de agentes
- Todas las dependencias requeridas
-
-### Construye Tu Primer Agente
-
-```bash
-# Instalar habilidades de Claude Code (una vez)
-./quickstart.sh
-
-# Construir un agente usando Claude Code
-claude> /building-agents
-
-# Probar tu agente
-claude> /testing-agent
-
-# Ejecutar tu agente
-PYTHONPATH=core:exports python -m your_agent_name run --input '{...}'
-```
-
-**[📖 Guía de Configuración Completa](ENVIRONMENT_SETUP.md)** - Instrucciones detalladas para desarrollo de agentes
-
-## Características
-
- **Desarrollo Orientado a Objetivos** - Define objetivos en lenguaje natural; el agente de codificación genera el grafo de agentes y el código de conexión para lograrlos
- **Agentes Auto-Adaptables** - El framework captura fallos, actualiza objetivos y actualiza el grafo de agentes
- **Conexiones de Nodos Dinámicas** - Sin aristas predefinidas; el código de conexión es generado por cualquier LLM capaz basado en tus objetivos
- **Nodos Envueltos en SDK** - Cada nodo obtiene memoria compartida, memoria RLM local, monitoreo, herramientas y acceso LLM de serie
- **Humano en el Bucle** - Nodos de intervención que pausan la ejecución para entrada humana con tiempos de espera y escalación configurables
- **Observabilidad en Tiempo Real** - Streaming WebSocket para monitoreo en vivo de ejecución de agentes, decisiones y comunicación entre nodos
- **Control de Costos y Presupuesto** - Establece límites de gasto, limitadores y políticas de degradación automática de modelos
- **Listo para Producción** - Auto-hospedable, construido para escala y confiabilidad
-
-## Por Qué Aden
-
-Los frameworks de agentes tradicionales requieren que diseñes manualmente flujos de trabajo, definas interacciones de agentes y manejes fallos de forma reactiva. Aden invierte este paradigma—**describes resultados, y el sistema se construye solo**.
-
-```mermaid
-flowchart LR
-    subgraph BUILD["🏗️ BUILD"]
-        GOAL["Define Goal<br/>+ Success Criteria"] --> NODES["Add Nodes<br/>LLM/Router/Function"]
-        NODES --> EDGES["Connect Edges<br/>on_success/failure/conditional"]
-        EDGES --> TEST["Test & Validate"] --> APPROVE["Approve & Export"]
-    end
-
-    subgraph EXPORT["📦 EXPORT"]
-        direction TB
-        JSON["agent.json<br/>(GraphSpec)"]
-        TOOLS["tools.py<br/>(Functions)"]
-        MCP["mcp_servers.json<br/>(Integrations)"]
-    end
-
-    subgraph RUN["🚀 RUNTIME"]
-        LOAD["AgentRunner<br/>Load + Parse"] --> SETUP["Setup Runtime<br/>+ ToolRegistry"]
-        SETUP --> EXEC["GraphExecutor<br/>Execute Nodes"]
-
-        subgraph DECISION["Decision Recording"]
-            DEC1["runtime.decide()<br/>intent → options → choice"]
-            DEC2["runtime.record_outcome()<br/>success, result, metrics"]
-        end
-    end
-
-    subgraph INFRA["⚙️ INFRASTRUCTURE"]
-        CTX["NodeContext<br/>memory • llm • tools"]
-        STORE[("FileStorage<br/>Runs & Decisions")]
-    end
-
-    APPROVE --> EXPORT
-    EXPORT --> LOAD
-    EXEC --> DECISION
-    EXEC --> CTX
-    DECISION --> STORE
-    STORE -.->|"Analyze & Improve"| NODES
-
-    style BUILD fill:#ffbe42,stroke:#cc5d00,stroke-width:3px,color:#333
-    style EXPORT fill:#fff59d,stroke:#ed8c00,stroke-width:2px,color:#333
-    style RUN fill:#ffb100,stroke:#cc5d00,stroke-width:3px,color:#333
-    style DECISION fill:#ffcc80,stroke:#ed8c00,stroke-width:2px,color:#333
-    style INFRA fill:#e8763d,stroke:#cc5d00,stroke-width:3px,color:#fff
-    style STORE fill:#ed8c00,stroke:#cc5d00,stroke-width:2px,color:#fff
-```
-
-### La Ventaja de Aden
-
-| Frameworks Tradicionales | Aden |
-|--------------------------|------|
-| Codificar flujos de trabajo de agentes | Describir objetivos en lenguaje natural |
-| Definición manual de grafos | Grafos de agentes auto-generados |
-| Manejo reactivo de errores | Auto-evolución proactiva |
-| Configuraciones de herramientas estáticas | Nodos dinámicos envueltos en SDK |
-| Configuración de monitoreo separada | Observabilidad en tiempo real integrada |
-| Gestión de presupuesto DIY | Controles de costos y degradación integrados |
-
-### Cómo Funciona
-
-1. **Define Tu Objetivo** → Describe lo que quieres lograr en lenguaje simple
-2. **El Agente de Codificación Genera** → Crea el grafo de agentes, código de conexión y casos de prueba
-3. **Los Trabajadores Ejecutan** → Los nodos envueltos en SDK se ejecutan con observabilidad completa y acceso a herramientas
-4. **El Plano de Control Monitorea** → Métricas en tiempo real, aplicación de presupuesto, gestión de políticas
-5. **Auto-Mejora** → En caso de fallo, el sistema evoluciona el grafo y lo vuelve a desplegar automáticamente
-
-## Cómo se Compara Aden
-
-Aden adopta un enfoque fundamentalmente diferente al desarrollo de agentes. Mientras que la mayoría de los frameworks requieren que codifiques flujos de trabajo o definas manualmente grafos de agentes, Aden usa un **agente de codificación para generar todo tu sistema de agentes** a partir de objetivos en lenguaje natural. Cuando los agentes fallan, el framework no solo registra errores—**evoluciona automáticamente el grafo de agentes** y lo vuelve a desplegar.
-
-> **Nota:** Para la tabla de comparación detallada de frameworks y preguntas frecuentes, consulta el [README.md](README.md) en inglés.
-
-### Cuándo Elegir Aden
-
-Elige Aden cuando necesites:
-
- Agentes que **se auto-mejoren a partir de fallos** sin intervención manual
- **Desarrollo orientado a objetivos** donde describes resultados, no flujos de trabajo
- **Confiabilidad en producción** con recuperación y redespliegue automáticos
- **Iteración rápida** en arquitecturas de agentes sin reescribir código
- **Observabilidad completa** con monitoreo en tiempo real y supervisión humana
-
-Elige otros frameworks cuando necesites:
-
- **Flujos de trabajo predecibles y con tipos seguros** (PydanticAI, Mastra)
- **RAG y procesamiento de documentos** (LlamaIndex, Haystack)
- **Investigación sobre emergencia de agentes** (CAMEL)
- **Voz/multimodal en tiempo real** (TEN Framework)
- **Encadenamiento simple de componentes** (LangChain, Swarm)
-
-## Estructura del Proyecto
-
-```
-hive/
-├── core/                   # Framework principal - Runtime de agentes, ejecutor de grafos, protocolos
-├── tools/                  # Paquete de Herramientas MCP - 19 herramientas para capacidades de agentes
-├── exports/                # Paquetes de Agentes - Agentes pre-construidos y ejemplos
-├── docs/                   # Documentación y guías
-├── scripts/                # Scripts de construcción y utilidades
-├── .claude/                # Habilidades de Claude Code para construir agentes
-├── ENVIRONMENT_SETUP.md    # Guía de configuración de Python para desarrollo de agentes
-├── DEVELOPER.md            # Guía del desarrollador
-├── CONTRIBUTING.md         # Directrices de contribución
-└── ROADMAP.md              # Hoja de ruta del producto
-```
-
-## Desarrollo
-
-### Desarrollo de Agentes en Python
-
-Para construir y ejecutar agentes orientados a objetivos con el framework:
-
-```bash
-# Configuración única
-./scripts/setup-python.sh
-
-# Esto instala:
-# - paquete framework (runtime principal)
-# - paquete aden_tools (19 herramientas MCP)
-# - Todas las dependencias
-
-# Construir nuevos agentes usando habilidades de Claude Code
-claude> /building-agents
-
-# Probar agentes
-claude> /testing-agent
-
-# Ejecutar agentes
-PYTHONPATH=core:exports python -m agent_name run --input '{...}'
-```
-
-Consulta [ENVIRONMENT_SETUP.md](ENVIRONMENT_SETUP.md) para instrucciones de configuración completas.
-
-## Documentación
-
- **[Guía del Desarrollador](DEVELOPER.md)** - Guía completa para desarrolladores
- [Primeros Pasos](docs/getting-started.md) - Instrucciones de configuración rápida
- [Guía de Configuración](docs/configuration.md) - Todas las opciones de configuración
- [Visión General de Arquitectura](docs/architecture.md) - Diseño y estructura del sistema
-
-## Hoja de Ruta
-
-El Framework de Agentes Aden tiene como objetivo ayudar a los desarrolladores a construir agentes auto-adaptativos orientados a resultados. Encuentra nuestra hoja de ruta aquí
-
-[ROADMAP.md](ROADMAP.md)
-
-```mermaid
-timeline
-    title Aden Agent Framework Roadmap
-    section Foundation
-        Architecture : Node-Based Architecture : Python SDK : LLM Integration (OpenAI, Anthropic, Google) : Communication Protocol
-        Coding Agent : Goal Creation Session : Worker Agent Creation : MCP Tools Integration
-        Worker Agent : Human-in-the-Loop : Callback Handlers : Intervention Points : Streaming Interface
-        Tools : File Use : Memory (STM/LTM) : Web Search : Web Scraper : Audit Trail
-        Core : Eval System : Pydantic Validation : Docker Deployment : Documentation : Sample Agents
-    section Expansion
-        Intelligence : Guardrails : Streaming Mode : Semantic Search
-        Platform : JavaScript SDK : Custom Tool Integrator : Credential Store
-        Deployment : Self-Hosted : Cloud Services : CI/CD Pipeline
-        Templates : Sales Agent : Marketing Agent : Analytics Agent : Training Agent : Smart Form Agent
-```
-
-## Comunidad y Soporte
-
-Usamos [Discord](https://discord.com/invite/MXE49hrKDk) para soporte, solicitudes de funciones y discusiones de la comunidad.
-
- Discord - [Únete a nuestra comunidad](https://discord.com/invite/MXE49hrKDk)
- Twitter/X - [@adenhq](https://x.com/aden_hq)
- LinkedIn - [Página de la Empresa](https://www.linkedin.com/company/teamaden/)
-
-## Contribuir
-
-¡Damos la bienvenida a las contribuciones! Por favor consulta [CONTRIBUTING.md](CONTRIBUTING.md) para las directrices.
-
-1. Haz fork del repositorio
-2. Crea tu rama de funcionalidad (`git checkout -b feature/amazing-feature`)
-3. Haz commit de tus cambios (`git commit -m 'Add amazing feature'`)
-4. Haz push a la rama (`git push origin feature/amazing-feature`)
-5. Abre un Pull Request
-
-## Únete a Nuestro Equipo
-
-**¡Estamos contratando!** Únete a nosotros en roles de ingeniería, investigación y comercialización.
-
-[Ver Posiciones Abiertas](https://jobs.adenhq.com/a8cec478-cdbc-473c-bbd4-f4b7027ec193/applicant)
-
-## Seguridad
-
-Para preocupaciones de seguridad, por favor consulta [SECURITY.md](SECURITY.md).
-
-## Licencia
-
-Este proyecto está licenciado bajo la Licencia Apache 2.0 - consulta el archivo [LICENSE](LICENSE) para más detalles.
-
-## Preguntas Frecuentes (FAQ)
-
-> **Nota:** Para las preguntas frecuentes completas, consulta el [README.md](README.md) en inglés.
-
-**P: ¿Aden depende de LangChain u otros frameworks de agentes?**
-
-No. Aden está construido desde cero sin dependencias de LangChain, CrewAI u otros frameworks de agentes. El framework está diseñado para ser ligero y flexible, generando grafos de agentes dinámicamente en lugar de depender de componentes predefinidos.
-
-**P: ¿Qué proveedores de LLM soporta Aden?**
-
-Aden soporta más de 100 proveedores de LLM a través de la integración de LiteLLM, incluyendo OpenAI (GPT-4, GPT-4o), Anthropic (modelos Claude), Google Gemini, Mistral, Groq y muchos más. Simplemente configura la variable de entorno de la clave API apropiada y especifica el nombre del modelo.
-
-**P: ¿Aden es de código abierto?**
-
-Sí, Aden es completamente de código abierto bajo la Licencia Apache 2.0. Fomentamos activamente las contribuciones y colaboración de la comunidad.
-
-**P: ¿Qué hace que Aden sea diferente de otros frameworks de agentes?**
-
-Aden genera todo tu sistema de agentes a partir de objetivos en lenguaje natural usando un agente de codificación—no codificas flujos de trabajo ni defines grafos manualmente. Cuando los agentes fallan, el framework captura automáticamente los datos del fallo, evoluciona el grafo de agentes y lo vuelve a desplegar. Este ciclo de auto-mejora es único de Aden.
-
-**P: ¿Aden soporta flujos de trabajo con humano en el bucle?**
-
-Sí, Aden soporta completamente flujos de trabajo con humano en el bucle a través de nodos de intervención que pausan la ejecución para entrada humana. Estos incluyen tiempos de espera configurables y políticas de escalación, permitiendo colaboración fluida entre expertos humanos y agentes de IA.
-
---
-
-<p align="center">
-  Hecho con 🔥 Pasión en San Francisco
-</p>
@@ -1,339 +0,0 @@
-<p align="center">
-  <img width="100%" alt="Hive Banner" src="https://storage.googleapis.com/aden-prod-assets/website/aden-title-card.png" />
-</p>
-
-<p align="center">
-  <a href="README.md">English</a> |
-  <a href="README.zh-CN.md">简体中文</a> |
-  <a href="README.es.md">Español</a> |
-  <a href="README.pt.md">Português</a> |
-  <a href="README.ja.md">日本語</a> |
-  <a href="README.ru.md">Русский</a>
-</p>
-
-[![Apache 2.0 License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/adenhq/hive/blob/main/LICENSE)
-[![Y Combinator](https://img.shields.io/badge/Y%20Combinator-Aden-orange)](https://www.ycombinator.com/companies/aden)
-[![Docker Pulls](https://img.shields.io/docker/pulls/adenhq/hive?logo=Docker&labelColor=%23528bff)](https://hub.docker.com/u/adenhq)
-[![Discord](https://img.shields.io/discord/1172610340073242735?logo=discord&labelColor=%235462eb&logoColor=%23f5f5f5&color=%235462eb)](https://discord.com/invite/MXE49hrKDk)
-[![Twitter Follow](https://img.shields.io/twitter/follow/teamaden?logo=X&color=%23f5f5f5)](https://x.com/aden_hq)
-[![LinkedIn](https://custom-icon-badges.demolab.com/badge/LinkedIn-0A66C2?logo=linkedin-white&logoColor=fff)](https://www.linkedin.com/company/teamaden/)
-
-<p align="center">
-  <img src="https://img.shields.io/badge/AI_Agents-Self--Improving-brightgreen?style=flat-square" alt="AI Agents" />
-  <img src="https://img.shields.io/badge/Multi--Agent-Systems-blue?style=flat-square" alt="Multi-Agent" />
-  <img src="https://img.shields.io/badge/Goal--Driven-Development-purple?style=flat-square" alt="Goal-Driven" />
-  <img src="https://img.shields.io/badge/Human--in--the--Loop-orange?style=flat-square" alt="HITL" />
-  <img src="https://img.shields.io/badge/Production--Ready-red?style=flat-square" alt="Production" />
-</p>
-<p align="center">
-  <img src="https://img.shields.io/badge/OpenAI-supported-412991?style=flat-square&logo=openai" alt="OpenAI" />
-  <img src="https://img.shields.io/badge/Anthropic-supported-d4a574?style=flat-square" alt="Anthropic" />
-  <img src="https://img.shields.io/badge/Google_Gemini-supported-4285F4?style=flat-square&logo=google" alt="Gemini" />
-  <img src="https://img.shields.io/badge/MCP-19_Tools-00ADD8?style=flat-square" alt="MCP" />
-</p>
-
-## 概要
-
-ワークフローをハードコーディングせずに、信頼性の高い自己改善型AIエージェントを構築できます。コーディングエージェントとの会話を通じて目標を定義すると、フレームワークが動的に作成された接続コードを持つノードグラフを生成します。問題が発生すると、フレームワークは障害データをキャプチャし、コーディングエージェントを通じてエージェントを進化させ、再デプロイします。組み込みのヒューマンインザループノード、認証情報管理、リアルタイムモニタリングにより、適応性を損なうことなく制御を維持できます。
-
-完全なドキュメント、例、ガイドについては [adenhq.com](https://adenhq.com) をご覧ください。
-
-## Adenとは
-
-<p align="center">
-  <img width="100%" alt="Aden Architecture" src="docs/assets/aden-architecture-diagram.jpg" />
-</p>
-
-Adenは、AIエージェントの構築、デプロイ、運用、適応のためのプラットフォームです：
-
- **構築** - コーディングエージェントが自然言語の目標から専門的なワーカーエージェント（セールス、マーケティング、オペレーション）を生成
- **デプロイ** - CI/CD統合と完全なAPIライフサイクル管理を備えたヘッドレスデプロイメント
- **運用** - リアルタイムモニタリング、可観測性、ランタイムガードレールがエージェントの信頼性を維持
- **適応** - 継続的な評価、監督、適応により、エージェントは時間とともに改善
- **インフラ** - 共有メモリ、LLM統合、ツール、スキルがすべてのエージェントを支援
-
-## クイックリンク
-
- **[ドキュメント](https://docs.adenhq.com/)** - 完全なガイドとAPIリファレンス
- **[セルフホスティングガイド](https://docs.adenhq.com/getting-started/quickstart)** - インフラストラクチャへのHiveデプロイ
- **[変更履歴](https://github.com/adenhq/hive/releases)** - 最新の更新とリリース
-<!-- - **[ロードマップ](https://adenhq.com/roadmap)** - 今後の機能と計画 -->
- **[問題を報告](https://github.com/adenhq/hive/issues)** - バグレポートと機能リクエスト
-
-## クイックスタート
-
-### 前提条件
-
- [Python 3.11+](https://www.python.org/downloads/) - エージェント開発用
- [Docker](https://docs.docker.com/get-docker/) (v20.10+) - オプション、コンテナ化されたツール用
-
-### インストール
-
-```bash
-# リポジトリをクローン
-git clone https://github.com/adenhq/hive.git
-cd hive
-
-# Python環境セットアップを実行
-./scripts/setup-python.sh
-```
-
-これにより以下がインストールされます：
- **framework** - コアエージェントランタイムとグラフエグゼキュータ
- **aden_tools** - エージェント機能のための19個のMCPツール
- すべての必要な依存関係
-
-### 最初のエージェントを構築
-
-```bash
-# Claude Codeスキルをインストール（1回のみ）
-./quickstart.sh
-
-# Claude Codeを使用してエージェントを構築
-claude> /building-agents
-
-# エージェントをテスト
-claude> /testing-agent
-
-# エージェントを実行
-PYTHONPATH=core:exports python -m your_agent_name run --input '{...}'
-```
-
-**[📖 完全セットアップガイド](ENVIRONMENT_SETUP.md)** - エージェント開発の詳細な手順
-
-## 機能
-
- **目標駆動開発** - 自然言語で目標を定義；コーディングエージェントがそれを達成するためのエージェントグラフと接続コードを生成
- **自己適応エージェント** - フレームワークが障害をキャプチャし、目標を更新し、エージェントグラフを更新
- **動的ノード接続** - 事前定義されたエッジなし；接続コードは目標に基づいて任意の対応LLMによって生成
- **SDKラップノード** - すべてのノードが共有メモリ、ローカルRLMメモリ、モニタリング、ツール、LLMアクセスを標準装備
- **ヒューマンインザループ** - 設定可能なタイムアウトとエスカレーションを備えた、人間の入力のために実行を一時停止する介入ノード
- **リアルタイム可観測性** - エージェント実行、決定、ノード間通信のライブモニタリングのためのWebSocketストリーミング
- **コストと予算管理** - 支出制限、スロットル、自動モデル劣化ポリシーを設定
- **本番環境対応** - セルフホスト可能、スケールと信頼性のために構築
-
-## なぜAdenか
-
-従来のエージェントフレームワークでは、ワークフローを手動で設計し、エージェントの相互作用を定義し、障害を事後的に処理する必要があります。Adenはこのパラダイムを逆転させます—**結果を記述すれば、システムが自ら構築します**。
-
-```mermaid
-flowchart LR
-    subgraph BUILD["🏗️ BUILD"]
-        GOAL["Define Goal<br/>+ Success Criteria"] --> NODES["Add Nodes<br/>LLM/Router/Function"]
-        NODES --> EDGES["Connect Edges<br/>on_success/failure/conditional"]
-        EDGES --> TEST["Test & Validate"] --> APPROVE["Approve & Export"]
-    end
-
-    subgraph EXPORT["📦 EXPORT"]
-        direction TB
-        JSON["agent.json<br/>(GraphSpec)"]
-        TOOLS["tools.py<br/>(Functions)"]
-        MCP["mcp_servers.json<br/>(Integrations)"]
-    end
-
-    subgraph RUN["🚀 RUNTIME"]
-        LOAD["AgentRunner<br/>Load + Parse"] --> SETUP["Setup Runtime<br/>+ ToolRegistry"]
-        SETUP --> EXEC["GraphExecutor<br/>Execute Nodes"]
-
-        subgraph DECISION["Decision Recording"]
-            DEC1["runtime.decide()<br/>intent → options → choice"]
-            DEC2["runtime.record_outcome()<br/>success, result, metrics"]
-        end
-    end
-
-    subgraph INFRA["⚙️ INFRASTRUCTURE"]
-        CTX["NodeContext<br/>memory • llm • tools"]
-        STORE[("FileStorage<br/>Runs & Decisions")]
-    end
-
-    APPROVE --> EXPORT
-    EXPORT --> LOAD
-    EXEC --> DECISION
-    EXEC --> CTX
-    DECISION --> STORE
-    STORE -.->|"Analyze & Improve"| NODES
-
-    style BUILD fill:#ffbe42,stroke:#cc5d00,stroke-width:3px,color:#333
-    style EXPORT fill:#fff59d,stroke:#ed8c00,stroke-width:2px,color:#333
-    style RUN fill:#ffb100,stroke:#cc5d00,stroke-width:3px,color:#333
-    style DECISION fill:#ffcc80,stroke:#ed8c00,stroke-width:2px,color:#333
-    style INFRA fill:#e8763d,stroke:#cc5d00,stroke-width:3px,color:#fff
-    style STORE fill:#ed8c00,stroke:#cc5d00,stroke-width:2px,color:#fff
-```
-
-### Adenの優位性
-
-| 従来のフレームワーク | Aden |
-|----------------------|------|
-| エージェントワークフローをハードコード | 自然言語で目標を記述 |
-| 手動でグラフを定義 | 自動生成されるエージェントグラフ |
-| 事後的なエラー処理 | プロアクティブな自己進化 |
-| 静的なツール設定 | 動的なSDKラップノード |
-| 別途モニタリング設定 | 組み込みのリアルタイム可観測性 |
-| DIY予算管理 | 統合されたコスト制御と劣化 |
-
-### 仕組み
-
-1. **目標を定義** → 達成したいことを平易な言葉で記述
-2. **コーディングエージェントが生成** → エージェントグラフ、接続コード、テストケースを作成
-3. **ワーカーが実行** → SDKラップノードが完全な可観測性とツールアクセスで実行
-4. **コントロールプレーンが監視** → リアルタイムメトリクス、予算執行、ポリシー管理
-5. **自己改善** → 障害時、システムがグラフを進化させ自動的に再デプロイ
-
-## Adenの比較
-
-Adenはエージェント開発に根本的に異なるアプローチを採用しています。ほとんどのフレームワークがワークフローをハードコードするか、エージェントグラフを手動で定義することを要求するのに対し、Adenは**コーディングエージェントを使用して自然言語の目標からエージェントシステム全体を生成**します。エージェントが失敗した場合、フレームワークは単にエラーをログに記録するだけでなく—**自動的にエージェントグラフを進化させ**、再デプロイします。
-
-> **注意：** 詳細なフレームワーク比較表とよくある質問については、英語の[README.md](README.md)を参照してください。
-
-### Adenを選ぶべきとき
-
-Adenを選択する場合：
-
- 手動介入なしに**失敗から自己改善する**エージェントが必要
- ワークフローではなく結果を記述する**目標駆動開発**が必要
- 自動回復と再デプロイを備えた**本番環境の信頼性**が必要
- コードを書き直すことなくエージェントアーキテクチャを**迅速に反復**する必要がある
- リアルタイムモニタリングと人間の監督を備えた**完全な可観測性**が必要
-
-他のフレームワークを選択する場合：
-
- **型安全で予測可能なワークフロー**（PydanticAI、Mastra）
- **RAGとドキュメント処理**（LlamaIndex、Haystack）
- **エージェント創発の研究**（CAMEL）
- **リアルタイム音声/マルチモーダル**（TEN Framework）
- **シンプルなコンポーネント連鎖**（LangChain、Swarm）
-
-## プロジェクト構造
-
-```
-hive/
-├── core/                   # コアフレームワーク - エージェントランタイム、グラフエグゼキュータ、プロトコル
-├── tools/                  # MCPツールパッケージ - エージェント機能のための19個のツール
-├── exports/                # エージェントパッケージ - 事前構築されたエージェントと例
-├── docs/                   # ドキュメントとガイド
-├── scripts/                # ビルドとユーティリティスクリプト
-├── .claude/                # エージェント構築用のClaude Codeスキル
-├── ENVIRONMENT_SETUP.md    # エージェント開発用のPythonセットアップガイド
-├── DEVELOPER.md            # 開発者ガイド
-├── CONTRIBUTING.md         # 貢献ガイドライン
-└── ROADMAP.md              # プロダクトロードマップ
-```
-
-## 開発
-
-### Pythonエージェント開発
-
-フレームワークで目標駆動エージェントを構築および実行するには：
-
-```bash
-# 1回限りのセットアップ
-./scripts/setup-python.sh
-
-# これにより以下がインストールされます：
-# - frameworkパッケージ（コアランタイム）
-# - aden_toolsパッケージ（19個のMCPツール）
-# - すべての依存関係
-
-# Claude Codeスキルを使用して新しいエージェントを構築
-claude> /building-agents
-
-# エージェントをテスト
-claude> /testing-agent
-
-# エージェントを実行
-PYTHONPATH=core:exports python -m agent_name run --input '{...}'
-```
-
-完全なセットアップ手順については、[ENVIRONMENT_SETUP.md](ENVIRONMENT_SETUP.md)を参照してください。
-
-## ドキュメント
-
- **[開発者ガイド](DEVELOPER.md)** - 開発者向け総合ガイド
- [はじめに](docs/getting-started.md) - クイックセットアップ手順
- [設定ガイド](docs/configuration.md) - すべての設定オプション
- [アーキテクチャ概要](docs/architecture.md) - システム設計と構造
-
-## ロードマップ
-
-Adenエージェントフレームワークは、開発者が結果志向で自己適応するエージェントを構築できるよう支援することを目指しています。ロードマップはこちらをご覧ください
-
-[ROADMAP.md](ROADMAP.md)
-
-```mermaid
-timeline
-    title Aden Agent Framework Roadmap
-    section Foundation
-        Architecture : Node-Based Architecture : Python SDK : LLM Integration (OpenAI, Anthropic, Google) : Communication Protocol
-        Coding Agent : Goal Creation Session : Worker Agent Creation : MCP Tools Integration
-        Worker Agent : Human-in-the-Loop : Callback Handlers : Intervention Points : Streaming Interface
-        Tools : File Use : Memory (STM/LTM) : Web Search : Web Scraper : Audit Trail
-        Core : Eval System : Pydantic Validation : Docker Deployment : Documentation : Sample Agents
-    section Expansion
-        Intelligence : Guardrails : Streaming Mode : Semantic Search
-        Platform : JavaScript SDK : Custom Tool Integrator : Credential Store
-        Deployment : Self-Hosted : Cloud Services : CI/CD Pipeline
-        Templates : Sales Agent : Marketing Agent : Analytics Agent : Training Agent : Smart Form Agent
-```
-
-## コミュニティとサポート
-
-サポート、機能リクエスト、コミュニティディスカッションには[Discord](https://discord.com/invite/MXE49hrKDk)を使用しています。
-
- Discord - [コミュニティに参加](https://discord.com/invite/MXE49hrKDk)
- Twitter/X - [@adenhq](https://x.com/aden_hq)
- LinkedIn - [会社ページ](https://www.linkedin.com/company/teamaden/)
-
-## 貢献
-
-貢献を歓迎します！ガイドラインについては[CONTRIBUTING.md](CONTRIBUTING.md)をご覧ください。
-
-1. リポジトリをフォーク
-2. 機能ブランチを作成 (`git checkout -b feature/amazing-feature`)
-3. 変更をコミット (`git commit -m 'Add amazing feature'`)
-4. ブランチにプッシュ (`git push origin feature/amazing-feature`)
-5. プルリクエストを開く
-
-## チームに参加
-
-**採用中です！** エンジニアリング、リサーチ、マーケティングの役職で私たちに参加してください。
-
-[オープンポジションを見る](https://jobs.adenhq.com/a8cec478-cdbc-473c-bbd4-f4b7027ec193/applicant)
-
-## セキュリティ
-
-セキュリティに関する懸念については、[SECURITY.md](SECURITY.md)をご覧ください。
-
-## ライセンス
-
-このプロジェクトはApache License 2.0の下でライセンスされています - 詳細は[LICENSE](LICENSE)ファイルをご覧ください。
-
-## よくある質問 (FAQ)
-
-> **注意：** よくある質問の完全版については、英語の[README.md](README.md)を参照してください。
-
-**Q: AdenはLangChainや他のエージェントフレームワークに依存していますか？**
-
-いいえ。AdenはLangChain、CrewAI、その他のエージェントフレームワークに依存せずにゼロから構築されています。フレームワークは軽量で柔軟に設計されており、事前定義されたコンポーネントに依存するのではなく、エージェントグラフを動的に生成します。
-
-**Q: AdenはどのLLMプロバイダーをサポートしていますか？**
-
-AdenはLiteLLM統合を通じて100以上のLLMプロバイダーをサポートしており、OpenAI（GPT-4、GPT-4o）、Anthropic（Claudeモデル）、Google Gemini、Mistral、Groqなどが含まれます。適切なAPIキー環境変数を設定し、モデル名を指定するだけです。
-
-**Q: Adenはオープンソースですか？**
-
-はい、AdenはApache License 2.0の下で完全にオープンソースです。コミュニティの貢献とコラボレーションを積極的に奨励しています。
-
-**Q: Adenは他のエージェントフレームワークと何が違いますか？**
-
-Adenはコーディングエージェントを使用して自然言語の目標からエージェントシステム全体を生成します—ワークフローをハードコードしたり、グラフを手動で定義したりする必要はありません。エージェントが失敗すると、フレームワークは自動的に障害データをキャプチャし、エージェントグラフを進化させ、再デプロイします。この自己改善ループはAden独自のものです。
-
-**Q: Adenはヒューマンインザループワークフローをサポートしていますか？**
-
-はい、Adenは人間の入力のために実行を一時停止する介入ノードを通じて、ヒューマンインザループワークフローを完全にサポートしています。設定可能なタイムアウトとエスカレーションポリシーが含まれており、人間の専門家とAIエージェントのシームレスなコラボレーションを可能にします。
-
---
-
-<p align="center">
-  サンフランシスコで 🔥 情熱を込めて作成
-</p>
@@ -1,291 +1,223 @@
 <p align="center">
-  <img width="100%" alt="Hive Banner" src="https://storage.googleapis.com/aden-prod-assets/website/aden-title-card.png" />
+  <img width="100%" alt="Hive Banner" src="https://asset.acho.io/github/img/banner.gif" />
 </p>

 <p align="center">
  <a href="README.md">English</a> |
-  <a href="README.zh-CN.md">简体中文</a> |
-  <a href="README.es.md">Español</a> |
-  <a href="README.pt.md">Português</a> |
-  <a href="README.ja.md">日本語</a> |
-  <a href="README.ru.md">Русский</a>
+  <a href="docs/i18n/zh-CN.md">简体中文</a> |
+  <a href="docs/i18n/es.md">Español</a> |
+  <a href="docs/i18n/hi.md">हिन्दी</a> |
+  <a href="docs/i18n/pt.md">Português</a> |
+  <a href="docs/i18n/ja.md">日本語</a> |
+  <a href="docs/i18n/ru.md">Русский</a> |
+  <a href="docs/i18n/ko.md">한국어</a>
 </p>

-[![Apache 2.0 License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/adenhq/hive/blob/main/LICENSE)
-[![Y Combinator](https://img.shields.io/badge/Y%20Combinator-Aden-orange)](https://www.ycombinator.com/companies/aden)
-[![Docker Pulls](https://img.shields.io/docker/pulls/adenhq/hive?logo=Docker&labelColor=%23528bff)](https://hub.docker.com/u/adenhq)
-[![Discord](https://img.shields.io/discord/1172610340073242735?logo=discord&labelColor=%235462eb&logoColor=%23f5f5f5&color=%235462eb)](https://discord.com/invite/MXE49hrKDk)
-[![Twitter Follow](https://img.shields.io/twitter/follow/teamaden?logo=X&color=%23f5f5f5)](https://x.com/aden_hq)
-[![LinkedIn](https://custom-icon-badges.demolab.com/badge/LinkedIn-0A66C2?logo=linkedin-white&logoColor=fff)](https://www.linkedin.com/company/teamaden/)
+<p align="center">
+  <a href="https://github.com/aden-hive/hive/blob/main/LICENSE"><img src="https://img.shields.io/badge/License-Apache%202.0-blue.svg" alt="Apache 2.0 License" /></a>
+  <a href="https://www.ycombinator.com/companies/aden"><img src="https://img.shields.io/badge/Y%20Combinator-Aden-orange" alt="Y Combinator" /></a>
+  <a href="https://discord.com/invite/MXE49hrKDk"><img src="https://img.shields.io/discord/1172610340073242735?logo=discord&labelColor=%235462eb&logoColor=%23f5f5f5&color=%235462eb" alt="Discord" /></a>
+  <a href="https://x.com/aden_hq"><img src="https://img.shields.io/twitter/follow/teamaden?logo=X&color=%23f5f5f5" alt="Twitter Follow" /></a>
+  <a href="https://www.linkedin.com/company/teamaden/"><img src="https://custom-icon-badges.demolab.com/badge/LinkedIn-0A66C2?logo=linkedin-white&logoColor=fff" alt="LinkedIn" /></a>
+  <img src="https://img.shields.io/badge/MCP-102_Tools-00ADD8?style=flat-square" alt="MCP" />
+</p>

 <p align="center">
+  <img src="https://img.shields.io/badge/Agent_Harness-Runtime_Layer-ff6600?style=flat-square" alt="Agent Harness" />
  <img src="https://img.shields.io/badge/AI_Agents-Self--Improving-brightgreen?style=flat-square" alt="AI Agents" />
  <img src="https://img.shields.io/badge/Multi--Agent-Systems-blue?style=flat-square" alt="Multi-Agent" />
-  <img src="https://img.shields.io/badge/Goal--Driven-Development-purple?style=flat-square" alt="Goal-Driven" />
+  <img src="https://img.shields.io/badge/Headless-Development-purple?style=flat-square" alt="Headless" />
  <img src="https://img.shields.io/badge/Human--in--the--Loop-orange?style=flat-square" alt="HITL" />
-  <img src="https://img.shields.io/badge/Production--Ready-red?style=flat-square" alt="Production" />
+  <img src="https://img.shields.io/badge/Browser-Use-red?style=flat-square" alt="Browser Use" />
 </p>
 <p align="center">
  <img src="https://img.shields.io/badge/OpenAI-supported-412991?style=flat-square&logo=openai" alt="OpenAI" />
  <img src="https://img.shields.io/badge/Anthropic-supported-d4a574?style=flat-square" alt="Anthropic" />
  <img src="https://img.shields.io/badge/Google_Gemini-supported-4285F4?style=flat-square&logo=google" alt="Gemini" />
-  <img src="https://img.shields.io/badge/MCP-19_Tools-00ADD8?style=flat-square" alt="MCP" />
 </p>

+<p align="center"><em>The agent harness for production workloads — state management, failure recovery, observability, and human oversight so your agents actually run.</em></p>
+
 ## Overview

-Build reliable, self-improving AI agents without hardcoding workflows. Define your goal through conversation with a coding agent, and the framework generates a node graph with dynamically created connection code. When things break, the framework captures failure data, evolves the agent through the coding agent, and redeploys. Built-in human-in-the-loop nodes, credential management, and real-time monitoring give you control without sacrificing adaptability.
+OpenHive is a zero-setup, model-agnostic execution harness that dynamically generates multi-agent topologies to tackle complex, long-running business workflows without requiring any orchestration boilerplate. By simply defining your objective, the runtime compiles a strict, graph-based execution DAG that safely coordinates specialized agents to execute concurrent tasks in parallel. Backed by persistent, role-based memory that intelligently evolves with your project's context, OpenHive ensures deterministic fault tolerance, deep state observability, and seamless asynchronous execution across whichever underlying LLMs you choose to plug in.
+
+## Features
+
+- ✅ Multi-Agent Coordination for parallel task execution 
+- ✅ Graph-based execution for recurring and complex processes 
+- ✅ Role-based memory that evolves with your projects 
+- ✅ Zero Setup - No technical configuration required
+- ✅ General Compute Use and Browser Use with Native Extension 
+- ✅ Custom Model Support

 Visit [adenhq.com](https://adenhq.com) for complete documentation, examples, and guides.

-## What is Aden
+Visit [HoneyComb](http://honeycomb.open-hive.com/) to see what jobs are being automated by AI. It’s a stock market for jobs, driven by our community’s AI agent progress. You can long and short jobs (with no real money but compute token)based on how much you think a job is going to be replaced by AI.

-<p align="center">
-  <img width="100%" alt="Aden Architecture" src="docs/assets/aden-architecture-diagram.jpg" />
-</p>
+https://github.com/user-attachments/assets/bf10edc3-06ba-48b6-98ba-d069b15fb69d

-Aden is a platform for building, deploying, operating, and adapting AI agents:

- **Build** - A Coding Agent generates specialized Worker Agents (Sales, Marketing, Ops) from natural language goals
- **Deploy** - Headless deployment with CI/CD integration and full API lifecycle management
- **Operate** - Real-time monitoring, observability, and runtime guardrails keep agents reliable
- **Adapt** - Continuous evaluation, supervision, and adaptation ensure agents improve over time
- **Infra** - Shared memory, LLM integrations, tools, and skills power every agent
+## Who Is Hive For?
+
+Hive is the multi-agent harness layer for teams moving AI agents from prototype to production. Single agents like Openclaw and Cowork can finish personal jobs pretty well but lack the rigor to fulfil business processes. 
+
+Hive is a good fit if you:
+
+- Want AI agents that **execute real business processes**, not demos
+- Need a **runtime that handles state, recovery, and parallel execution** at scale
+- Need **self-healing and adaptive agents** that improve over time
+- Require **human-in-the-loop control**, observability, and cost limits
+- Plan to run agents in **production** where uptime, cost, and auditability matter
+
+Hive may not be the best fit if you’re only experimenting with simple agent chains or one-off scripts.
+
+## When Should You Use Hive?
+
+Use Hive when the bottleneck is no longer the model but the harness around it:
+
+- Long-running agents that need **state persistence and crash recovery**
+- Production workloads requiring **cost enforcement, observability, and audit trails**
+- Agents that **self-heal** through failure capture and graph evolution
+- Multi-agent coordination with **session isolation and shared buffers**
+- A framework that **scales with model improvements** rather than fighting them

 ## Quick Links

 - **[Documentation](https://docs.adenhq.com/)** - Complete guides and API reference
 - **[Self-Hosting Guide](https://docs.adenhq.com/getting-started/quickstart)** - Deploy Hive on your infrastructure
- **[Changelog](https://github.com/adenhq/hive/releases)** - Latest updates and releases
-<!-- - **[Roadmap](https://adenhq.com/roadmap)** - Upcoming features and plans -->
- **[Report Issues](https://github.com/adenhq/hive/issues)** - Bug reports and feature requests
+- **[Changelog](https://github.com/aden-hive/hive/releases)** - Latest updates and releases
+- **[Roadmap](docs/roadmap.md)** - Upcoming features and plans
+- **[Report Issues](https://github.com/aden-hive/hive/issues)** - Bug reports and feature requests
+- **[Contributing](CONTRIBUTING.md)** - How to contribute and submit PRs

 ## Quick Start

 ### Prerequisites

- [Python 3.11+](https://www.python.org/downloads/) for agent development
- [Docker](https://docs.docker.com/get-docker/) (v20.10+) - Optional, for containerized tools
+- Python 3.11+ for agent development
+- An LLM provider that powers the agents
+- **ripgrep (optional, recommended on Windows):** The `search_files` tool uses ripgrep for faster file search. If not installed, a Python fallback is used. On Windows: `winget install BurntSushi.ripgrep` or `scoop install ripgrep`
+
+> **Windows Users:** Native Windows is supported via `quickstart.ps1` and `hive.ps1`. Run these in PowerShell 5.1+. WSL is also an option but not required.

 ### Installation

+> **Note**
+> Hive uses a `uv` workspace layout and is not installed with `pip install`.
+> Running `pip install -e .` from the repository root will create a placeholder package and Hive will not function correctly.
+> Please use the quickstart script below to set up the environment.
+
 ```bash
 # Clone the repository
-git clone https://github.com/adenhq/hive.git
+git clone https://github.com/aden-hive/hive.git
 cd hive

-# Run Python environment setup
-./scripts/setup-python.sh
+# Run quickstart setup (macOS/Linux)
+./quickstart.sh
+
+# Windows (PowerShell)
+.\quickstart.ps1
 ```

-This installs:
- **framework** - Core agent runtime and graph executor
- **aden_tools** - 19 MCP tools for agent capabilities
- All required dependencies
+This sets up:
+
+- **framework** - Core agent runtime and graph executor (in `core/.venv`)
+- **aden_tools** - MCP tools for agent capabilities (in `tools/.venv`)
+- **credential store** - Encrypted API key storage (`~/.hive/credentials`)
+- **LLM provider** - Interactive default model configuration, including Hive LLM and OpenRouter
+- All required Python dependencies with `uv`
+
+- Finally, it will open the Hive interface in your browser
+
+> **Tip:** To reopen the dashboard later, run `hive open` from the project directory.

 ### Build Your First Agent

-```bash
-# Install Claude Code skills (one-time)
-./quickstart.sh
+Type the agent you want to build in the home input box. The queen is going to ask you questions and work out a solution with you.

-# Build an agent using Claude Code
-claude> /building-agents
+<img width="2500" height="1214" alt="Image" src="https://github.com/user-attachments/assets/1ce19141-a78b-46f5-8d64-dbf987e048f4" />

-# Test your agent
-claude> /testing-agent
+### Use Template Agents

-# Run your agent
-PYTHONPATH=core:exports python -m your_agent_name run --input '{...}'
-```
+Click "Try a sample agent" and check the templates. You can run a template directly or choose to build your version on top of the existing template.

-**[📖 Complete Setup Guide](ENVIRONMENT_SETUP.md)** - Detailed instructions for agent development
+### Run Agents

-## Features
+Now you can run an agent by selecting the agent (either an existing agent or example agent). You can click the Run button on the top left, or talk to the queen agent and it can run the agent for you.

- **Goal-Driven Development** - Define objectives in natural language; the coding agent generates the agent graph and connection code to achieve them
- **Self-Adapting Agents** - Framework captures failures, updates objectives and updates the agent graph
- **Dynamic Node Connections** - No predefined edges; connection code is generated by any capable LLM based on your goals
- **SDK-Wrapped Nodes** - Every node gets shared memory, local RLM memory, monitoring, tools, and LLM access out of the box
- **Human-in-the-Loop** - Intervention nodes that pause execution for human input with configurable timeouts and escalation
- **Real-time Observability** - WebSocket streaming for live monitoring of agent execution, decisions, and node-to-node communication
- **Cost & Budget Control** - Set spending limits, throttles, and automatic model degradation policies
- **Production-Ready** - Self-hostable, built for scale and reliability
+<img width="2549" height="1174" alt="Screenshot 2026-03-12 at 9 27 36 PM" src="https://github.com/user-attachments/assets/7c7d30fa-9ceb-4c23-95af-b1caa405547d" />

-## Why Aden
+## Integration

-Traditional agent frameworks require you to manually design workflows, define agent interactions, and handle failures reactively. Aden flips this paradigm—**you describe outcomes, and the system builds itself**.
+<a href="https://github.com/aden-hive/hive/tree/main/tools/src/aden_tools/tools"><img width="100%" alt="Integration" src="https://github.com/user-attachments/assets/a1573f93-cf02-4bb8-b3d5-b305b05b1e51" /></a>
+Hive is built to be model-agnostic and system-agnostic.
+
+- **LLM flexibility** - Hive Framework supports Anthropic, OpenAI, OpenRouter, Hive LLM, and other hosted or local models through LiteLLM-compatible providers.
+- **Business system connectivity** - Hive Framework is designed to connect to all kinds of business systems as tools, such as CRM, support, messaging, data, file, and internal APIs via MCP.
+
+## Why Hive
+
+As models improve, the upper bound of what agents can do rises — but their reliability and production value are determined by the harness. Hive focuses on generating agents that run real business processes rather than generic agents. Instead of requiring you to manually design workflows, define agent interactions, and handle failures reactively, Hive flips the paradigm: **you describe outcomes, and the system builds itself**—delivering an outcome-driven, adaptive experience with an easy-to-use set of tools and integrations.

 ```mermaid
 flowchart LR
-    subgraph BUILD["🏗️ BUILD"]
-        GOAL["Define Goal<br/>+ Success Criteria"] --> NODES["Add Nodes<br/>LLM/Router/Function"]
-        NODES --> EDGES["Connect Edges<br/>on_success/failure/conditional"]
-        EDGES --> TEST["Test & Validate"] --> APPROVE["Approve & Export"]
-    end
+    GOAL["Define Goal"] --> GEN["Auto-Generate Graph"]
+    GEN --> EXEC["Execute Agents"]
+    EXEC --> MON["Monitor & Observe"]
+    MON --> CHECK{{"Pass?"}}
+    CHECK -- "Yes" --> DONE["Deliver Result"]
+    CHECK -- "No" --> EVOLVE["Evolve Graph"]
+    EVOLVE --> EXEC

-    subgraph EXPORT["📦 EXPORT"]
-        direction TB
-        JSON["agent.json<br/>(GraphSpec)"]
-        TOOLS["tools.py<br/>(Functions)"]
-        MCP["mcp_servers.json<br/>(Integrations)"]
-    end
+    GOAL -.- V1["Natural Language"]
+    GEN -.- V2["Instant Architecture"]
+    EXEC -.- V3["Easy Integrations"]
+    MON -.- V4["Full visibility"]
+    EVOLVE -.- V5["Adaptability"]
+    DONE -.- V6["Reliable outcomes"]

-    subgraph RUN["🚀 RUNTIME"]
-        LOAD["AgentRunner<br/>Load + Parse"] --> SETUP["Setup Runtime<br/>+ ToolRegistry"]
-        SETUP --> EXEC["GraphExecutor<br/>Execute Nodes"]
-
-        subgraph DECISION["Decision Recording"]
-            DEC1["runtime.decide()<br/>intent → options → choice"]
-            DEC2["runtime.record_outcome()<br/>success, result, metrics"]
-        end
-    end
-
-    subgraph INFRA["⚙️ INFRASTRUCTURE"]
-        CTX["NodeContext<br/>memory • llm • tools"]
-        STORE[("FileStorage<br/>Runs & Decisions")]
-    end
-
-    APPROVE --> EXPORT
-    EXPORT --> LOAD
-    EXEC --> DECISION
-    EXEC --> CTX
-    DECISION --> STORE
-    STORE -.->|"Analyze & Improve"| NODES
-
-    style BUILD fill:#ffbe42,stroke:#cc5d00,stroke-width:3px,color:#333
-    style EXPORT fill:#fff59d,stroke:#ed8c00,stroke-width:2px,color:#333
-    style RUN fill:#ffb100,stroke:#cc5d00,stroke-width:3px,color:#333
-    style DECISION fill:#ffcc80,stroke:#ed8c00,stroke-width:2px,color:#333
-    style INFRA fill:#e8763d,stroke:#cc5d00,stroke-width:3px,color:#fff
-    style STORE fill:#ed8c00,stroke:#cc5d00,stroke-width:2px,color:#fff
+    style GOAL fill:#ffbe42,stroke:#cc5d00,stroke-width:2px,color:#333
+    style GEN fill:#ffb100,stroke:#cc5d00,stroke-width:2px,color:#333
+    style EXEC fill:#ff9800,stroke:#cc5d00,stroke-width:2px,color:#fff
+    style MON fill:#ff9800,stroke:#cc5d00,stroke-width:2px,color:#fff
+    style CHECK fill:#fff59d,stroke:#ed8c00,stroke-width:2px,color:#333
+    style DONE fill:#4caf50,stroke:#2e7d32,stroke-width:2px,color:#fff
+    style EVOLVE fill:#e8763d,stroke:#cc5d00,stroke-width:2px,color:#fff
+    style V1 fill:#fff,stroke:#ed8c00,stroke-width:1px,color:#cc5d00
+    style V2 fill:#fff,stroke:#ed8c00,stroke-width:1px,color:#cc5d00
+    style V3 fill:#fff,stroke:#ed8c00,stroke-width:1px,color:#cc5d00
+    style V4 fill:#fff,stroke:#ed8c00,stroke-width:1px,color:#cc5d00
+    style V5 fill:#fff,stroke:#ed8c00,stroke-width:1px,color:#cc5d00
+    style V6 fill:#fff,stroke:#ed8c00,stroke-width:1px,color:#cc5d00
 ```

-### The Aden Advantage
-
-| Traditional Frameworks     | Aden                                   |
-| -------------------------- | -------------------------------------- |
-| Hardcode agent workflows   | Describe goals in natural language     |
-| Manual graph definition    | Auto-generated agent graphs            |
-| Reactive error handling    | Proactive self-evolution               |
-| Static tool configurations | Dynamic SDK-wrapped nodes              |
-| Separate monitoring setup  | Built-in real-time observability       |
-| DIY budget management      | Integrated cost controls & degradation |
-
 ### How It Works

-1. **Define Your Goal** → Describe what you want to achieve in plain English
-2. **Coding Agent Generates** → Creates the agent graph, connection code, and test cases
-3. **Workers Execute** → SDK-wrapped nodes run with full observability and tool access
+1. **[Define Your Goal](docs/key_concepts/goals_outcome.md)** → Describe what you want to achieve in plain English
+2. **Coding Agent Generates** → Creates the [agent graph](docs/key_concepts/graph.md), connection code, and test cases
+3. **[Workers Execute](docs/key_concepts/worker_agent.md)** → SDK-wrapped nodes run with full observability and tool access
 4. **Control Plane Monitors** → Real-time metrics, budget enforcement, policy management
-5. **Self-Improve** → On failure, the system evolves the graph and redeploys automatically
-
-## How Aden Compares
-
-Aden takes a fundamentally different approach to agent development. While most frameworks require you to hardcode workflows or manually define agent graphs, Aden uses a **coding agent to generate your entire agent system** from natural language goals. When agents fail, the framework doesn't just log errors—it **automatically evolves the agent graph** and redeploys.
-
-### Comparison Table
-
-| Framework                           | Category                  | Approach                                                        | Aden Difference                                           |
-| ----------------------------------- | ------------------------- | --------------------------------------------------------------- | --------------------------------------------------------- |
-| **LangChain, LlamaIndex, Haystack** | Component Libraries       | Predefined components for RAG/LLM apps; manual connection logic | Generates entire graph and connection code upfront        |
-| **CrewAI, AutoGen, Swarm**          | Multi-Agent Orchestration | Role-based agents with predefined collaboration patterns        | Dynamically creates agents/connections; adapts on failure |
-| **PydanticAI, Mastra, Agno**        | Type-Safe Frameworks      | Structured outputs and validation for known workflows           | Evolving workflows; structure emerges through iteration   |
-| **Agent Zero, Letta**               | Personal AI Assistants    | Memory and learning; OS-as-tool or stateful memory focus        | Production multi-agent systems with self-healing          |
-| **CAMEL**                           | Research Framework        | Emergent behavior in large-scale simulations (up to 1M agents)  | Production-oriented with reliable execution and recovery  |
-| **TEN Framework, Genkit**           | Infrastructure Frameworks | Real-time multimodal (TEN) or full-stack AI (Genkit)            | Higher abstraction—generates and evolves agent logic      |
-| **GPT Engineer, Motia**             | Code Generation           | Code from specs (GPT Engineer) or "Step" primitive (Motia)      | Self-adapting graphs with automatic failure recovery      |
-| **Trading Agents**                  | Domain-Specific           | Hardcoded trading firm roles on LangGraph                       | Domain-agnostic; generates structures for any use case    |
-
-### When to Choose Aden
-
-Choose Aden when you need:
-
- Agents that **self-improve from failures** without manual intervention
- **Goal-driven development** where you describe outcomes, not workflows
- **Production reliability** with automatic recovery and redeployment
- **Rapid iteration** on agent architectures without rewriting code
- **Full observability** with real-time monitoring and human oversight
-
-Choose other frameworks when you need:
-
- **Type-safe, predictable workflows** (PydanticAI, Mastra)
- **RAG and document processing** (LlamaIndex, Haystack)
- **Research on agent emergence** (CAMEL)
- **Real-time voice/multimodal** (TEN Framework)
- **Simple component chaining** (LangChain, Swarm)
-
-## Project Structure
-
-```
-hive/
-├── core/                   # Core framework - Agent runtime, graph executor, protocols
-├── tools/                  # MCP Tools Package - 19 tools for agent capabilities
-├── exports/                # Agent packages - Pre-built agents and examples
-├── docs/                   # Documentation and guides
-├── scripts/                # Build and utility scripts
-├── .claude/                # Claude Code skills for building agents
-├── ENVIRONMENT_SETUP.md    # Python setup guide for agent development
-├── DEVELOPER.md            # Developer guide
-├── CONTRIBUTING.md         # Contribution guidelines
-└── ROADMAP.md              # Product roadmap
-```
-
-## Development
-
-### Python Agent Development
-
-For building and running goal-driven agents with the framework:
-
-```bash
-# One-time setup
-./scripts/setup-python.sh
-
-# This installs:
-# - framework package (core runtime)
-# - aden_tools package (19 MCP tools)
-# - All dependencies
-
-# Build new agents using Claude Code skills
-claude> /building-agents
-
-# Test agents
-claude> /testing-agent
-
-# Run agents
-PYTHONPATH=core:exports python -m agent_name run --input '{...}'
-```
-
-See [ENVIRONMENT_SETUP.md](ENVIRONMENT_SETUP.md) for complete setup instructions.
+5. **[Adaptiveness](docs/key_concepts/evolution.md)** → On failure, the system evolves the graph and redeploys automatically

 ## Documentation

- **[Developer Guide](DEVELOPER.md)** - Comprehensive guide for developers
+- **[Developer Guide](docs/developer-guide.md)** - Comprehensive guide for developers
 - [Getting Started](docs/getting-started.md) - Quick setup instructions
 - [Configuration Guide](docs/configuration.md) - All configuration options
- [Architecture Overview](docs/architecture.md) - System design and structure
+- [Architecture Overview](docs/architecture/README.md) - System design and structure

-## Roadmap
+## Contributing
+We welcome contributions from the community! We’re especially looking for help building tools, integrations, and example agents for the framework ([check #2805](https://github.com/aden-hive/hive/issues/2805)). If you’re interested in extending its functionality, this is the perfect place to start. Please see [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.

-Aden Agent Framework aims to help developers build outcome oriented, self-adaptive agents. Please find our roadmap here
+**Important:** Please get assigned to an issue before submitting a PR. Comment on an issue to claim it, and a maintainer will assign you. Issues with reproducible steps and proposals are prioritized. This helps prevent duplicate work.

-[ROADMAP.md](ROADMAP.md)
-
-```mermaid
-timeline
-    title Aden Agent Framework Roadmap
-    section Foundation
-        Architecture : Node-Based Architecture : Python SDK : LLM Integration (OpenAI, Anthropic, Google) : Communication Protocol
-        Coding Agent : Goal Creation Session : Worker Agent Creation : MCP Tools Integration
-        Worker Agent : Human-in-the-Loop : Callback Handlers : Intervention Points : Streaming Interface
-        Tools : File Use : Memory (STM/LTM) : Web Search : Web Scraper : Audit Trail
-        Core : Eval System : Pydantic Validation : Docker Deployment : Documentation : Sample Agents
-    section Expansion
-        Intelligence : Guardrails : Streaming Mode : Semantic Search
-        Platform : JavaScript SDK : Custom Tool Integrator : Credential Store
-        Deployment : Self-Hosted : Cloud Services : CI/CD Pipeline
-        Templates : Sales Agent : Marketing Agent : Analytics Agent : Training Agent : Smart Form Agent
-```
+1. Find or create an issue and get assigned
+2. Fork the repository
+3. Create your feature branch (`git checkout -b feature/amazing-feature`)
+4. Commit your changes (`git commit -m 'Add amazing feature'`)
+5. Push to the branch (`git push origin feature/amazing-feature`)
+6. Open a Pull Request

 ## Community & Support

@@ -295,16 +227,6 @@ We use [Discord](https://discord.com/invite/MXE49hrKDk) for support, feature req
 - Twitter/X - [@adenhq](https://x.com/aden_hq)
 - LinkedIn - [Company Page](https://www.linkedin.com/company/teamaden/)

-## Contributing
-
-We welcome contributions! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
-
-1. Fork the repository
-2. Create your feature branch (`git checkout -b feature/amazing-feature`)
-3. Commit your changes (`git commit -m 'Add amazing feature'`)
-4. Push to the branch (`git push origin feature/amazing-feature`)
-5. Open a Pull Request
-
 ## Join Our Team

 **We're hiring!** Join us in engineering, research, and go-to-market roles.
@@ -321,69 +243,55 @@ This project is licensed under the Apache License 2.0 - see the [LICENSE](LICENS

 ## Frequently Asked Questions (FAQ)

-**Q: Does Aden depend on LangChain or other agent frameworks?**
+**Q: What LLM providers does Hive support?**

-No. Aden is built from the ground up with no dependencies on LangChain, CrewAI, or other agent frameworks. The framework is designed to be lean and flexible, generating agent graphs dynamically rather than relying on predefined components.
+Hive supports 100+ LLM providers through LiteLLM integration, including OpenAI (GPT-4, GPT-4o), Anthropic (Claude models), Google Gemini, DeepSeek, Mistral, Groq, OpenRouter, and Hive LLM. Simply set the appropriate API key environment variable and specify the model name. See [docs/configuration.md](docs/configuration.md) for provider-specific configuration examples.

-**Q: What LLM providers does Aden support?**
+**Q: Can I use Hive with local AI models like Ollama?**

-Aden supports 100+ LLM providers through LiteLLM integration, including OpenAI (GPT-4, GPT-4o), Anthropic (Claude models), Google Gemini, Mistral, Groq, and many more. Simply set the appropriate API key environment variable and specify the model name.
+Yes! Hive supports local models through LiteLLM. Simply use the model name format `ollama/model-name` (e.g., `ollama/llama3`, `ollama/mistral`) and ensure Ollama is running locally.

-**Q: Can I use Aden with local AI models like Ollama?**
+**Q: What makes Hive different from other agent frameworks?**

-Yes! Aden supports local models through LiteLLM. Simply use the model name format `ollama/model-name` (e.g., `ollama/llama3`, `ollama/mistral`) and ensure Ollama is running locally.
+Hive is an agent harness, not just an orchestration framework. It provides the production runtime layer — session isolation, checkpoint-based crash recovery, cost enforcement, real-time observability, and human-in-the-loop controls — that makes agents reliable enough to run real workloads. On top of that, Hive generates your entire agent system from natural language goals and automatically [evolves the graph](docs/key_concepts/evolution.md) when agents fail. The combination of a robust harness with self-improving generation is what sets Hive apart.

-**Q: What makes Aden different from other agent frameworks?**
+**Q: Is Hive open-source?**

-Aden generates your entire agent system from natural language goals using a coding agent—you don't hardcode workflows or manually define graphs. When agents fail, the framework automatically captures failure data, evolves the agent graph, and redeploys. This self-improving loop is unique to Aden.
+Yes, Hive is fully open-source under the Apache License 2.0. We actively encourage community contributions and collaboration.

-**Q: Is Aden open-source?**
+**Q: Does Hive support human-in-the-loop workflows?**

-Yes, Aden is fully open-source under the Apache License 2.0. We actively encourage community contributions and collaboration.
+Yes, Hive fully supports [human-in-the-loop](docs/key_concepts/graph.md#human-in-the-loop) workflows through intervention nodes that pause execution for human input. These include configurable timeouts and escalation policies, allowing seamless collaboration between human experts and AI agents.

-**Q: Does Aden collect data from users?**
+**Q: What programming languages does Hive support?**

-Aden collects telemetry data for monitoring and observability purposes, including token usage, latency metrics, and cost tracking. Content capture (prompts and responses) is configurable and stored with team-scoped data isolation. All data stays within your infrastructure when self-hosted.
+The Hive framework is built in Python. A JavaScript/TypeScript SDK is on the roadmap.

-**Q: What deployment options does Aden support?**
-
-Aden supports Docker Compose deployment out of the box, with both production and development configurations. Self-hosted deployments work on any infrastructure supporting Docker. Cloud deployment options and Kubernetes-ready configurations are on the roadmap.
-
-**Q: Can Aden handle complex, production-scale use cases?**
-
-Yes. Aden is explicitly designed for production environments with features like automatic failure recovery, real-time observability, cost controls, and horizontal scaling support. The framework handles both simple automations and complex multi-agent workflows.
-
-**Q: Does Aden support human-in-the-loop workflows?**
-
-Yes, Aden fully supports human-in-the-loop workflows through intervention nodes that pause execution for human input. These include configurable timeouts and escalation policies, allowing seamless collaboration between human experts and AI agents.
-
-**Q: What monitoring and debugging tools does Aden provide?**
-
-Aden includes comprehensive observability features: real-time WebSocket streaming for live agent execution monitoring, TimescaleDB-powered analytics for cost and performance metrics, health check endpoints for Kubernetes integration, and 19 MCP tools for budget management, agent status, and policy control.
-
-**Q: What programming languages does Aden support?**
-
-Aden provides SDKs for both Python and JavaScript/TypeScript. The Python SDK includes integration templates for LangGraph, LangFlow, and LiveKit. The backend is Node.js/TypeScript, and the frontend is React/TypeScript.
-
-**Q: Can Aden agents interact with external tools and APIs?**
+**Q: Can Hive agents interact with external tools and APIs?**

 Yes. Aden's SDK-wrapped nodes provide built-in tool access, and the framework supports flexible tool ecosystems. Agents can integrate with external APIs, databases, and services through the node architecture.

-**Q: How does cost control work in Aden?**
+**Q: How does cost control work in Hive?**

-Aden provides granular budget controls including spending limits, throttles, and automatic model degradation policies. You can set budgets at the team, agent, or workflow level, with real-time cost tracking and alerts.
+Hive provides granular budget controls including spending limits, throttles, and automatic model degradation policies. You can set budgets at the team, agent, or workflow level, with real-time cost tracking and alerts.

 **Q: Where can I find examples and documentation?**

-Visit [docs.adenhq.com](https://docs.adenhq.com/) for complete guides, API reference, and getting started tutorials. The repository also includes documentation in the `docs/` folder and a comprehensive [DEVELOPER.md](DEVELOPER.md) guide.
+Visit [docs.adenhq.com](https://docs.adenhq.com/) for complete guides, API reference, and getting started tutorials. The repository also includes documentation in the `docs/` folder and a comprehensive [developer guide](docs/developer-guide.md).

 **Q: How can I contribute to Aden?**

 Contributions are welcome! Fork the repository, create your feature branch, implement your changes, and submit a pull request. See [CONTRIBUTING.md](CONTRIBUTING.md) for detailed guidelines.

-**Q: Does Aden offer enterprise support?**
+## Star History

-For enterprise inquiries, contact the Aden team through [adenhq.com](https://adenhq.com) or join our [Discord community](https://discord.com/invite/MXE49hrKDk) for support and discussions.
+<a href="https://star-history.com/#aden-hive/hive&Date">
+ <picture>
+   <source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=aden-hive/hive&type=Date&theme=dark" />
+   <source media="(prefers-color-scheme: light)" srcset="https://api.star-history.com/svg?repos=aden-hive/hive&type=Date" />
+   <img alt="Star History Chart" src="https://api.star-history.com/svg?repos=aden-hive/hive&type=Date" />
+ </picture>
+</a>

 ---

@@ -1,339 +0,0 @@
-<p align="center">
-  <img width="100%" alt="Hive Banner" src="https://storage.googleapis.com/aden-prod-assets/website/aden-title-card.png" />
-</p>
-
-<p align="center">
-  <a href="README.md">English</a> |
-  <a href="README.zh-CN.md">简体中文</a> |
-  <a href="README.es.md">Español</a> |
-  <a href="README.pt.md">Português</a> |
-  <a href="README.ja.md">日本語</a> |
-  <a href="README.ru.md">Русский</a>
-</p>
-
-[![Apache 2.0 License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/adenhq/hive/blob/main/LICENSE)
-[![Y Combinator](https://img.shields.io/badge/Y%20Combinator-Aden-orange)](https://www.ycombinator.com/companies/aden)
-[![Docker Pulls](https://img.shields.io/docker/pulls/adenhq/hive?logo=Docker&labelColor=%23528bff)](https://hub.docker.com/u/adenhq)
-[![Discord](https://img.shields.io/discord/1172610340073242735?logo=discord&labelColor=%235462eb&logoColor=%23f5f5f5&color=%235462eb)](https://discord.com/invite/MXE49hrKDk)
-[![Twitter Follow](https://img.shields.io/twitter/follow/teamaden?logo=X&color=%23f5f5f5)](https://x.com/aden_hq)
-[![LinkedIn](https://custom-icon-badges.demolab.com/badge/LinkedIn-0A66C2?logo=linkedin-white&logoColor=fff)](https://www.linkedin.com/company/teamaden/)
-
-<p align="center">
-  <img src="https://img.shields.io/badge/AI_Agents-Self--Improving-brightgreen?style=flat-square" alt="AI Agents" />
-  <img src="https://img.shields.io/badge/Multi--Agent-Systems-blue?style=flat-square" alt="Multi-Agent" />
-  <img src="https://img.shields.io/badge/Goal--Driven-Development-purple?style=flat-square" alt="Goal-Driven" />
-  <img src="https://img.shields.io/badge/Human--in--the--Loop-orange?style=flat-square" alt="HITL" />
-  <img src="https://img.shields.io/badge/Production--Ready-red?style=flat-square" alt="Production" />
-</p>
-<p align="center">
-  <img src="https://img.shields.io/badge/OpenAI-supported-412991?style=flat-square&logo=openai" alt="OpenAI" />
-  <img src="https://img.shields.io/badge/Anthropic-supported-d4a574?style=flat-square" alt="Anthropic" />
-  <img src="https://img.shields.io/badge/Google_Gemini-supported-4285F4?style=flat-square&logo=google" alt="Gemini" />
-  <img src="https://img.shields.io/badge/MCP-19_Tools-00ADD8?style=flat-square" alt="MCP" />
-</p>
-
-## Visão Geral
-
-Construa agentes de IA confiáveis e auto-aperfeiçoáveis sem codificar fluxos de trabalho. Defina seu objetivo através de uma conversa com um agente de codificação, e o framework gera um grafo de nós com código de conexão criado dinamicamente. Quando algo quebra, o framework captura dados de falha, evolui o agente através do agente de codificação e reimplanta. Nós de intervenção humana integrados, gerenciamento de credenciais e monitoramento em tempo real dão a você controle sem sacrificar a adaptabilidade.
-
-Visite [adenhq.com](https://adenhq.com) para documentação completa, exemplos e guias.
-
-## O que é Aden
-
-<p align="center">
-  <img width="100%" alt="Aden Architecture" src="docs/assets/aden-architecture-diagram.jpg" />
-</p>
-
-Aden é uma plataforma para construir, implantar, operar e adaptar agentes de IA:
-
- **Construir** - Um Agente de Codificação gera Agentes de Trabalho especializados (Vendas, Marketing, Operações) a partir de objetivos em linguagem natural
- **Implantar** - Implantação headless com integração CI/CD e gerenciamento completo do ciclo de vida de API
- **Operar** - Monitoramento em tempo real, observabilidade e guardrails de runtime mantêm os agentes confiáveis
- **Adaptar** - Avaliação contínua, supervisão e adaptação garantem que os agentes melhorem ao longo do tempo
- **Infraestrutura** - Memória compartilhada, integrações LLM, ferramentas e habilidades alimentam cada agente
-
-## Links Rápidos
-
- **[Documentação](https://docs.adenhq.com/)** - Guias completos e referência de API
- **[Guia de Auto-Hospedagem](https://docs.adenhq.com/getting-started/quickstart)** - Implante o Hive em sua infraestrutura
- **[Changelog](https://github.com/adenhq/hive/releases)** - Últimas atualizações e versões
-<!-- - **[Roadmap](https://adenhq.com/roadmap)** - Funcionalidades e planos futuros -->
- **[Reportar Problemas](https://github.com/adenhq/hive/issues)** - Relatórios de bugs e solicitações de funcionalidades
-
-## Início Rápido
-
-### Pré-requisitos
-
- [Python 3.11+](https://www.python.org/downloads/) - Para desenvolvimento de agentes
- [Docker](https://docs.docker.com/get-docker/) (v20.10+) - Opcional, para ferramentas containerizadas
-
-### Instalação
-
-```bash
-# Clonar o repositório
-git clone https://github.com/adenhq/hive.git
-cd hive
-
-# Executar configuração do ambiente Python
-./scripts/setup-python.sh
-```
-
-Isto instala:
- **framework** - Runtime do agente principal e executor de grafos
- **aden_tools** - 19 ferramentas MCP para capacidades de agentes
- Todas as dependências necessárias
-
-### Construa Seu Primeiro Agente
-
-```bash
-# Instalar habilidades do Claude Code (uma vez)
-./quickstart.sh
-
-# Construir um agente usando Claude Code
-claude> /building-agents
-
-# Testar seu agente
-claude> /testing-agent
-
-# Executar seu agente
-PYTHONPATH=core:exports python -m your_agent_name run --input '{...}'
-```
-
-**[📖 Guia Completo de Configuração](ENVIRONMENT_SETUP.md)** - Instruções detalhadas para desenvolvimento de agentes
-
-## Funcionalidades
-
- **Desenvolvimento Orientado a Objetivos** - Defina objetivos em linguagem natural; o agente de codificação gera o grafo de agentes e código de conexão para alcançá-los
- **Agentes Auto-Adaptáveis** - Framework captura falhas, atualiza objetivos e atualiza o grafo de agentes
- **Conexões de Nós Dinâmicas** - Sem arestas predefinidas; código de conexão é gerado por qualquer LLM capaz baseado em seus objetivos
- **Nós Envolvidos em SDK** - Cada nó recebe memória compartilhada, memória RLM local, monitoramento, ferramentas e acesso LLM prontos para uso
- **Humano no Loop** - Nós de intervenção que pausam a execução para entrada humana com timeouts e escalonamento configuráveis
- **Observabilidade em Tempo Real** - Streaming WebSocket para monitoramento ao vivo de execução de agentes, decisões e comunicação entre nós
- **Controle de Custo e Orçamento** - Defina limites de gastos, throttles e políticas de degradação automática de modelo
- **Pronto para Produção** - Auto-hospedável, construído para escala e confiabilidade
-
-## Por que Aden
-
-Frameworks de agentes tradicionais exigem que você projete manualmente fluxos de trabalho, defina interações de agentes e lide com falhas reativamente. Aden inverte esse paradigma—**você descreve resultados, e o sistema se constrói sozinho**.
-
-```mermaid
-flowchart LR
-    subgraph BUILD["🏗️ BUILD"]
-        GOAL["Define Goal<br/>+ Success Criteria"] --> NODES["Add Nodes<br/>LLM/Router/Function"]
-        NODES --> EDGES["Connect Edges<br/>on_success/failure/conditional"]
-        EDGES --> TEST["Test & Validate"] --> APPROVE["Approve & Export"]
-    end
-
-    subgraph EXPORT["📦 EXPORT"]
-        direction TB
-        JSON["agent.json<br/>(GraphSpec)"]
-        TOOLS["tools.py<br/>(Functions)"]
-        MCP["mcp_servers.json<br/>(Integrations)"]
-    end
-
-    subgraph RUN["🚀 RUNTIME"]
-        LOAD["AgentRunner<br/>Load + Parse"] --> SETUP["Setup Runtime<br/>+ ToolRegistry"]
-        SETUP --> EXEC["GraphExecutor<br/>Execute Nodes"]
-
-        subgraph DECISION["Decision Recording"]
-            DEC1["runtime.decide()<br/>intent → options → choice"]
-            DEC2["runtime.record_outcome()<br/>success, result, metrics"]
-        end
-    end
-
-    subgraph INFRA["⚙️ INFRASTRUCTURE"]
-        CTX["NodeContext<br/>memory • llm • tools"]
-        STORE[("FileStorage<br/>Runs & Decisions")]
-    end
-
-    APPROVE --> EXPORT
-    EXPORT --> LOAD
-    EXEC --> DECISION
-    EXEC --> CTX
-    DECISION --> STORE
-    STORE -.->|"Analyze & Improve"| NODES
-
-    style BUILD fill:#ffbe42,stroke:#cc5d00,stroke-width:3px,color:#333
-    style EXPORT fill:#fff59d,stroke:#ed8c00,stroke-width:2px,color:#333
-    style RUN fill:#ffb100,stroke:#cc5d00,stroke-width:3px,color:#333
-    style DECISION fill:#ffcc80,stroke:#ed8c00,stroke-width:2px,color:#333
-    style INFRA fill:#e8763d,stroke:#cc5d00,stroke-width:3px,color:#fff
-    style STORE fill:#ed8c00,stroke:#cc5d00,stroke-width:2px,color:#fff
-```
-
-### A Vantagem Aden
-
-| Frameworks Tradicionais | Aden |
-|-------------------------|------|
-| Codificar fluxos de trabalho de agentes | Descrever objetivos em linguagem natural |
-| Definição manual de grafos | Grafos de agentes auto-gerados |
-| Tratamento reativo de erros | Auto-evolução proativa |
-| Configurações de ferramentas estáticas | Nós dinâmicos envolvidos em SDK |
-| Configuração de monitoramento separada | Observabilidade em tempo real integrada |
-| Gerenciamento de orçamento DIY | Controles de custo e degradação integrados |
-
-### Como Funciona
-
-1. **Defina Seu Objetivo** → Descreva o que você quer alcançar em linguagem simples
-2. **Agente de Codificação Gera** → Cria o grafo de agentes, código de conexão e casos de teste
-3. **Workers Executam** → Nós envolvidos em SDK executam com observabilidade completa e acesso a ferramentas
-4. **Plano de Controle Monitora** → Métricas em tempo real, aplicação de orçamento, gerenciamento de políticas
-5. **Auto-Aperfeiçoamento** → Em caso de falha, o sistema evolui o grafo e reimplanta automaticamente
-
-## Como Aden se Compara
-
-Aden adota uma abordagem fundamentalmente diferente para o desenvolvimento de agentes. Enquanto a maioria dos frameworks exige que você codifique fluxos de trabalho ou defina manualmente grafos de agentes, Aden usa um **agente de codificação para gerar todo o seu sistema de agentes** a partir de objetivos em linguagem natural. Quando os agentes falham, o framework não apenas registra erros—**ele evolui automaticamente o grafo de agentes** e reimplanta.
-
-> **Nota:** Para a tabela de comparação detalhada de frameworks e perguntas frequentes, consulte o [README.md](README.md) em inglês.
-
-### Quando Escolher Aden
-
-Escolha Aden quando você precisar de:
-
- Agentes que **se auto-aperfeiçoam a partir de falhas** sem intervenção manual
- **Desenvolvimento orientado a objetivos** onde você descreve resultados, não fluxos de trabalho
- **Confiabilidade em produção** com recuperação e reimplantação automáticas
- **Iteração rápida** em arquiteturas de agentes sem reescrever código
- **Observabilidade completa** com monitoramento em tempo real e supervisão humana
-
-Escolha outros frameworks quando você precisar de:
-
- **Fluxos de trabalho previsíveis e type-safe** (PydanticAI, Mastra)
- **RAG e processamento de documentos** (LlamaIndex, Haystack)
- **Pesquisa sobre emergência de agentes** (CAMEL)
- **Voz/multimodal em tempo real** (TEN Framework)
- **Encadeamento simples de componentes** (LangChain, Swarm)
-
-## Estrutura do Projeto
-
-```
-hive/
-├── core/                   # Framework principal - Runtime de agentes, executor de grafos, protocolos
-├── tools/                  # Pacote de Ferramentas MCP - 19 ferramentas para capacidades de agentes
-├── exports/                # Pacotes de Agentes - Agentes pré-construídos e exemplos
-├── docs/                   # Documentação e guias
-├── scripts/                # Scripts de build e utilitários
-├── .claude/                # Habilidades Claude Code para construir agentes
-├── ENVIRONMENT_SETUP.md    # Guia de configuração Python para desenvolvimento de agentes
-├── DEVELOPER.md            # Guia do desenvolvedor
-├── CONTRIBUTING.md         # Diretrizes de contribuição
-└── ROADMAP.md              # Roadmap do produto
-```
-
-## Desenvolvimento
-
-### Desenvolvimento de Agentes Python
-
-Para construir e executar agentes orientados a objetivos com o framework:
-
-```bash
-# Configuração única
-./scripts/setup-python.sh
-
-# Isto instala:
-# - pacote framework (runtime principal)
-# - pacote aden_tools (19 ferramentas MCP)
-# - Todas as dependências
-
-# Construir novos agentes usando habilidades Claude Code
-claude> /building-agents
-
-# Testar agentes
-claude> /testing-agent
-
-# Executar agentes
-PYTHONPATH=core:exports python -m agent_name run --input '{...}'
-```
-
-Consulte [ENVIRONMENT_SETUP.md](ENVIRONMENT_SETUP.md) para instruções completas de configuração.
-
-## Documentação
-
- **[Guia do Desenvolvedor](DEVELOPER.md)** - Guia abrangente para desenvolvedores
- [Começando](docs/getting-started.md) - Instruções de configuração rápida
- [Guia de Configuração](docs/configuration.md) - Todas as opções de configuração
- [Visão Geral da Arquitetura](docs/architecture.md) - Design e estrutura do sistema
-
-## Roadmap
-
-O Aden Agent Framework visa ajudar desenvolvedores a construir agentes auto-adaptativos orientados a resultados. Encontre nosso roadmap aqui
-
-[ROADMAP.md](ROADMAP.md)
-
-```mermaid
-timeline
-    title Aden Agent Framework Roadmap
-    section Foundation
-        Architecture : Node-Based Architecture : Python SDK : LLM Integration (OpenAI, Anthropic, Google) : Communication Protocol
-        Coding Agent : Goal Creation Session : Worker Agent Creation : MCP Tools Integration
-        Worker Agent : Human-in-the-Loop : Callback Handlers : Intervention Points : Streaming Interface
-        Tools : File Use : Memory (STM/LTM) : Web Search : Web Scraper : Audit Trail
-        Core : Eval System : Pydantic Validation : Docker Deployment : Documentation : Sample Agents
-    section Expansion
-        Intelligence : Guardrails : Streaming Mode : Semantic Search
-        Platform : JavaScript SDK : Custom Tool Integrator : Credential Store
-        Deployment : Self-Hosted : Cloud Services : CI/CD Pipeline
-        Templates : Sales Agent : Marketing Agent : Analytics Agent : Training Agent : Smart Form Agent
-```
-
-## Comunidade e Suporte
-
-Usamos [Discord](https://discord.com/invite/MXE49hrKDk) para suporte, solicitações de funcionalidades e discussões da comunidade.
-
- Discord - [Junte-se à nossa comunidade](https://discord.com/invite/MXE49hrKDk)
- Twitter/X - [@adenhq](https://x.com/aden_hq)
- LinkedIn - [Página da Empresa](https://www.linkedin.com/company/teamaden/)
-
-## Contribuindo
-
-Aceitamos contribuições! Por favor, consulte [CONTRIBUTING.md](CONTRIBUTING.md) para diretrizes.
-
-1. Faça fork do repositório
-2. Crie sua branch de funcionalidade (`git checkout -b feature/amazing-feature`)
-3. Faça commit das suas alterações (`git commit -m 'Add amazing feature'`)
-4. Faça push para a branch (`git push origin feature/amazing-feature`)
-5. Abra um Pull Request
-
-## Junte-se ao Nosso Time
-
-**Estamos contratando!** Junte-se a nós em funções de engenharia, pesquisa e go-to-market.
-
-[Ver Posições Abertas](https://jobs.adenhq.com/a8cec478-cdbc-473c-bbd4-f4b7027ec193/applicant)
-
-## Segurança
-
-Para questões de segurança, por favor consulte [SECURITY.md](SECURITY.md).
-
-## Licença
-
-Este projeto está licenciado sob a Licença Apache 2.0 - veja o arquivo [LICENSE](LICENSE) para detalhes.
-
-## Perguntas Frequentes (FAQ)
-
-> **Nota:** Para as perguntas frequentes completas, consulte o [README.md](README.md) em inglês.
-
-**P: O Aden depende do LangChain ou outros frameworks de agentes?**
-
-Não. O Aden é construído do zero sem dependências do LangChain, CrewAI ou outros frameworks de agentes. O framework é projetado para ser leve e flexível, gerando grafos de agentes dinamicamente em vez de depender de componentes predefinidos.
-
-**P: Quais provedores de LLM o Aden suporta?**
-
-O Aden suporta mais de 100 provedores de LLM através da integração LiteLLM, incluindo OpenAI (GPT-4, GPT-4o), Anthropic (modelos Claude), Google Gemini, Mistral, Groq e muitos mais. Simplesmente configure a variável de ambiente da chave API apropriada e especifique o nome do modelo.
-
-**P: O Aden é open-source?**
-
-Sim, o Aden é totalmente open-source sob a Licença Apache 2.0. Incentivamos ativamente contribuições e colaboração da comunidade.
-
-**P: O que torna o Aden diferente de outros frameworks de agentes?**
-
-O Aden gera todo o seu sistema de agentes a partir de objetivos em linguagem natural usando um agente de codificação—você não codifica fluxos de trabalho nem define grafos manualmente. Quando os agentes falham, o framework captura automaticamente os dados de falha, evolui o grafo de agentes e reimplanta. Este loop de auto-aperfeiçoamento é único do Aden.
-
-**P: O Aden suporta fluxos de trabalho com humano no loop?**
-
-Sim, o Aden suporta totalmente fluxos de trabalho com humano no loop através de nós de intervenção que pausam a execução para entrada humana. Estes incluem timeouts configuráveis e políticas de escalonamento, permitindo colaboração perfeita entre especialistas humanos e agentes de IA.
-
---
-
-<p align="center">
-  Feito com 🔥 Paixão em San Francisco
-</p>
@@ -1,339 +0,0 @@
-<p align="center">
-  <img width="100%" alt="Hive Banner" src="https://storage.googleapis.com/aden-prod-assets/website/aden-title-card.png" />
-</p>
-
-<p align="center">
-  <a href="README.md">English</a> |
-  <a href="README.zh-CN.md">简体中文</a> |
-  <a href="README.es.md">Español</a> |
-  <a href="README.pt.md">Português</a> |
-  <a href="README.ja.md">日本語</a> |
-  <a href="README.ru.md">Русский</a>
-</p>
-
-[![Apache 2.0 License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/adenhq/hive/blob/main/LICENSE)
-[![Y Combinator](https://img.shields.io/badge/Y%20Combinator-Aden-orange)](https://www.ycombinator.com/companies/aden)
-[![Docker Pulls](https://img.shields.io/docker/pulls/adenhq/hive?logo=Docker&labelColor=%23528bff)](https://hub.docker.com/u/adenhq)
-[![Discord](https://img.shields.io/discord/1172610340073242735?logo=discord&labelColor=%235462eb&logoColor=%23f5f5f5&color=%235462eb)](https://discord.com/invite/MXE49hrKDk)
-[![Twitter Follow](https://img.shields.io/twitter/follow/teamaden?logo=X&color=%23f5f5f5)](https://x.com/aden_hq)
-[![LinkedIn](https://custom-icon-badges.demolab.com/badge/LinkedIn-0A66C2?logo=linkedin-white&logoColor=fff)](https://www.linkedin.com/company/teamaden/)
-
-<p align="center">
-  <img src="https://img.shields.io/badge/AI_Agents-Self--Improving-brightgreen?style=flat-square" alt="AI Agents" />
-  <img src="https://img.shields.io/badge/Multi--Agent-Systems-blue?style=flat-square" alt="Multi-Agent" />
-  <img src="https://img.shields.io/badge/Goal--Driven-Development-purple?style=flat-square" alt="Goal-Driven" />
-  <img src="https://img.shields.io/badge/Human--in--the--Loop-orange?style=flat-square" alt="HITL" />
-  <img src="https://img.shields.io/badge/Production--Ready-red?style=flat-square" alt="Production" />
-</p>
-<p align="center">
-  <img src="https://img.shields.io/badge/OpenAI-supported-412991?style=flat-square&logo=openai" alt="OpenAI" />
-  <img src="https://img.shields.io/badge/Anthropic-supported-d4a574?style=flat-square" alt="Anthropic" />
-  <img src="https://img.shields.io/badge/Google_Gemini-supported-4285F4?style=flat-square&logo=google" alt="Gemini" />
-  <img src="https://img.shields.io/badge/MCP-19_Tools-00ADD8?style=flat-square" alt="MCP" />
-</p>
-
-## Обзор
-
-Создавайте надёжных, самосовершенствующихся ИИ-агентов без жёсткого кодирования рабочих процессов. Определите свою цель через разговор с кодирующим агентом, и фреймворк сгенерирует граф узлов с динамически созданным кодом соединений. Когда что-то ломается, фреймворк захватывает данные об ошибке, эволюционирует агента через кодирующего агента и переразвёртывает. Встроенные узлы человеческого вмешательства, управление учётными данными и мониторинг в реальном времени дают вам контроль без ущерба для адаптивности.
-
-Посетите [adenhq.com](https://adenhq.com) для полной документации, примеров и руководств.
-
-## Что такое Aden
-
-<p align="center">
-  <img width="100%" alt="Aden Architecture" src="docs/assets/aden-architecture-diagram.jpg" />
-</p>
-
-Aden — это платформа для создания, развёртывания, эксплуатации и адаптации ИИ-агентов:
-
- **Создание** - Кодирующий агент генерирует специализированных рабочих агентов (продажи, маркетинг, операции) из целей на естественном языке
- **Развёртывание** - Headless-развёртывание с интеграцией CI/CD и полным управлением жизненным циклом API
- **Эксплуатация** - Мониторинг в реальном времени, наблюдаемость и защитные барьеры времени выполнения обеспечивают надёжность агентов
- **Адаптация** - Непрерывная оценка, контроль и адаптация гарантируют улучшение агентов со временем
- **Инфраструктура** - Общая память, интеграции LLM, инструменты и навыки питают каждого агента
-
-## Быстрые ссылки
-
- **[Документация](https://docs.adenhq.com/)** - Полные руководства и справочник API
- **[Руководство по самостоятельному хостингу](https://docs.adenhq.com/getting-started/quickstart)** - Разверните Hive в своей инфраструктуре
- **[История изменений](https://github.com/adenhq/hive/releases)** - Последние обновления и релизы
-<!-- - **[Дорожная карта](https://adenhq.com/roadmap)** - Предстоящие функции и планы -->
- **[Сообщить о проблеме](https://github.com/adenhq/hive/issues)** - Отчёты об ошибках и запросы функций
-
-## Быстрый старт
-
-### Предварительные требования
-
- [Python 3.11+](https://www.python.org/downloads/) - Для разработки агентов
- [Docker](https://docs.docker.com/get-docker/) (v20.10+) - Опционально, для контейнеризованных инструментов
-
-### Установка
-
-```bash
-# Клонировать репозиторий
-git clone https://github.com/adenhq/hive.git
-cd hive
-
-# Запустить настройку окружения Python
-./scripts/setup-python.sh
-```
-
-Это установит:
- **framework** - Основная среда выполнения агентов и исполнитель графов
- **aden_tools** - 19 инструментов MCP для возможностей агентов
- Все необходимые зависимости
-
-### Создайте своего первого агента
-
-```bash
-# Установить навыки Claude Code (один раз)
-./quickstart.sh
-
-# Создать агента с помощью Claude Code
-claude> /building-agents
-
-# Протестировать агента
-claude> /testing-agent
-
-# Запустить агента
-PYTHONPATH=core:exports python -m your_agent_name run --input '{...}'
-```
-
-**[📖 Полное руководство по настройке](ENVIRONMENT_SETUP.md)** - Подробные инструкции для разработки агентов
-
-## Функции
-
- **Целеориентированная разработка** - Определяйте цели на естественном языке; кодирующий агент генерирует граф агентов и код соединений для их достижения
- **Самоадаптирующиеся агенты** - Фреймворк захватывает сбои, обновляет цели и обновляет граф агентов
- **Динамические соединения узлов** - Без предопределённых рёбер; код соединений генерируется любым способным LLM на основе ваших целей
- **Узлы, обёрнутые SDK** - Каждый узел получает общую память, локальную RLM-память, мониторинг, инструменты и доступ к LLM из коробки
- **Человек в контуре** - Узлы вмешательства, которые приостанавливают выполнение для человеческого ввода с настраиваемыми таймаутами и эскалацией
- **Наблюдаемость в реальном времени** - WebSocket-стриминг для живого мониторинга выполнения агентов, решений и межузловой коммуникации
- **Контроль затрат и бюджета** - Устанавливайте лимиты расходов, ограничения и политики автоматической деградации модели
- **Готовность к продакшену** - Возможность самостоятельного хостинга, создан для масштабирования и надёжности
-
-## Почему Aden
-
-Традиционные фреймворки агентов требуют ручного проектирования рабочих процессов, определения взаимодействий агентов и реактивной обработки сбоев. Aden переворачивает эту парадигму — **вы описываете результаты, и система строит себя сама**.
-
-```mermaid
-flowchart LR
-    subgraph BUILD["🏗️ BUILD"]
-        GOAL["Define Goal<br/>+ Success Criteria"] --> NODES["Add Nodes<br/>LLM/Router/Function"]
-        NODES --> EDGES["Connect Edges<br/>on_success/failure/conditional"]
-        EDGES --> TEST["Test & Validate"] --> APPROVE["Approve & Export"]
-    end
-
-    subgraph EXPORT["📦 EXPORT"]
-        direction TB
-        JSON["agent.json<br/>(GraphSpec)"]
-        TOOLS["tools.py<br/>(Functions)"]
-        MCP["mcp_servers.json<br/>(Integrations)"]
-    end
-
-    subgraph RUN["🚀 RUNTIME"]
-        LOAD["AgentRunner<br/>Load + Parse"] --> SETUP["Setup Runtime<br/>+ ToolRegistry"]
-        SETUP --> EXEC["GraphExecutor<br/>Execute Nodes"]
-
-        subgraph DECISION["Decision Recording"]
-            DEC1["runtime.decide()<br/>intent → options → choice"]
-            DEC2["runtime.record_outcome()<br/>success, result, metrics"]
-        end
-    end
-
-    subgraph INFRA["⚙️ INFRASTRUCTURE"]
-        CTX["NodeContext<br/>memory • llm • tools"]
-        STORE[("FileStorage<br/>Runs & Decisions")]
-    end
-
-    APPROVE --> EXPORT
-    EXPORT --> LOAD
-    EXEC --> DECISION
-    EXEC --> CTX
-    DECISION --> STORE
-    STORE -.->|"Analyze & Improve"| NODES
-
-    style BUILD fill:#ffbe42,stroke:#cc5d00,stroke-width:3px,color:#333
-    style EXPORT fill:#fff59d,stroke:#ed8c00,stroke-width:2px,color:#333
-    style RUN fill:#ffb100,stroke:#cc5d00,stroke-width:3px,color:#333
-    style DECISION fill:#ffcc80,stroke:#ed8c00,stroke-width:2px,color:#333
-    style INFRA fill:#e8763d,stroke:#cc5d00,stroke-width:3px,color:#fff
-    style STORE fill:#ed8c00,stroke:#cc5d00,stroke-width:2px,color:#fff
-```
-
-### Преимущество Aden
-
-| Традиционные фреймворки | Aden |
-|-------------------------|------|
-| Жёсткое кодирование рабочих процессов | Описание целей на естественном языке |
-| Ручное определение графов | Автоматически генерируемые графы агентов |
-| Реактивная обработка ошибок | Проактивная самоэволюция |
-| Статические конфигурации инструментов | Динамические узлы, обёрнутые SDK |
-| Отдельная настройка мониторинга | Встроенная наблюдаемость в реальном времени |
-| DIY управление бюджетом | Интегрированный контроль затрат и деградация |
-
-### Как это работает
-
-1. **Определите цель** → Опишите, чего хотите достичь, простым языком
-2. **Кодирующий агент генерирует** → Создаёт граф агентов, код соединений и тестовые случаи
-3. **Рабочие выполняют** → Узлы, обёрнутые SDK, работают с полной наблюдаемостью и доступом к инструментам
-4. **Плоскость управления мониторит** → Метрики в реальном времени, применение бюджета, управление политиками
-5. **Самосовершенствование** → При сбое система эволюционирует граф и автоматически переразвёртывает
-
-## Сравнение Aden
-
-Aden использует принципиально иной подход к разработке агентов. В то время как большинство фреймворков требуют жёсткого кодирования рабочих процессов или ручного определения графов агентов, Aden использует **кодирующего агента для генерации всей системы агентов** из целей на естественном языке. Когда агенты терпят неудачу, фреймворк не просто регистрирует ошибки — он **автоматически эволюционирует граф агентов** и переразвёртывает.
-
-> **Примечание:** Для подробной таблицы сравнения фреймворков и часто задаваемых вопросов обратитесь к английской версии [README.md](README.md).
-
-### Когда выбирать Aden
-
-Выбирайте Aden, когда вам нужны:
-
- Агенты, которые **самосовершенствуются на основе сбоев** без ручного вмешательства
- **Целеориентированная разработка**, где вы описываете результаты, а не рабочие процессы
- **Надёжность продакшена** с автоматическим восстановлением и переразвёртыванием
- **Быстрая итерация** архитектур агентов без переписывания кода
- **Полная наблюдаемость** с мониторингом в реальном времени и человеческим надзором
-
-Выбирайте другие фреймворки, когда вам нужны:
-
- **Предсказуемые, типобезопасные рабочие процессы** (PydanticAI, Mastra)
- **RAG и обработка документов** (LlamaIndex, Haystack)
- **Исследование эмерджентности агентов** (CAMEL)
- **Голос/мультимодальность в реальном времени** (TEN Framework)
- **Простое связывание компонентов** (LangChain, Swarm)
-
-## Структура проекта
-
-```
-hive/
-├── core/                   # Основной фреймворк - Среда выполнения агентов, исполнитель графов, протоколы
-├── tools/                  # Пакет инструментов MCP - 19 инструментов для возможностей агентов
-├── exports/                # Пакеты агентов - Предварительно созданные агенты и примеры
-├── docs/                   # Документация и руководства
-├── scripts/                # Скрипты сборки и утилиты
-├── .claude/                # Навыки Claude Code для создания агентов
-├── ENVIRONMENT_SETUP.md    # Руководство по настройке Python для разработки агентов
-├── DEVELOPER.md            # Руководство разработчика
-├── CONTRIBUTING.md         # Руководство по участию
-└── ROADMAP.md              # Дорожная карта продукта
-```
-
-## Разработка
-
-### Разработка агентов на Python
-
-Для создания и запуска целеориентированных агентов с помощью фреймворка:
-
-```bash
-# Одноразовая настройка
-./scripts/setup-python.sh
-
-# Это установит:
-# - пакет framework (основная среда выполнения)
-# - пакет aden_tools (19 инструментов MCP)
-# - Все зависимости
-
-# Создать новых агентов с помощью навыков Claude Code
-claude> /building-agents
-
-# Протестировать агентов
-claude> /testing-agent
-
-# Запустить агентов
-PYTHONPATH=core:exports python -m agent_name run --input '{...}'
-```
-
-Обратитесь к [ENVIRONMENT_SETUP.md](ENVIRONMENT_SETUP.md) для полных инструкций по настройке.
-
-## Документация
-
- **[Руководство разработчика](DEVELOPER.md)** - Полное руководство для разработчиков
- [Начало работы](docs/getting-started.md) - Инструкции по быстрой настройке
- [Руководство по конфигурации](docs/configuration.md) - Все опции конфигурации
- [Обзор архитектуры](docs/architecture.md) - Дизайн и структура системы
-
-## Дорожная карта
-
-Aden Agent Framework призван помочь разработчикам создавать самоадаптирующихся агентов, ориентированных на результат. Найдите нашу дорожную карту здесь
-
-[ROADMAP.md](ROADMAP.md)
-
-```mermaid
-timeline
-    title Aden Agent Framework Roadmap
-    section Foundation
-        Architecture : Node-Based Architecture : Python SDK : LLM Integration (OpenAI, Anthropic, Google) : Communication Protocol
-        Coding Agent : Goal Creation Session : Worker Agent Creation : MCP Tools Integration
-        Worker Agent : Human-in-the-Loop : Callback Handlers : Intervention Points : Streaming Interface
-        Tools : File Use : Memory (STM/LTM) : Web Search : Web Scraper : Audit Trail
-        Core : Eval System : Pydantic Validation : Docker Deployment : Documentation : Sample Agents
-    section Expansion
-        Intelligence : Guardrails : Streaming Mode : Semantic Search
-        Platform : JavaScript SDK : Custom Tool Integrator : Credential Store
-        Deployment : Self-Hosted : Cloud Services : CI/CD Pipeline
-        Templates : Sales Agent : Marketing Agent : Analytics Agent : Training Agent : Smart Form Agent
-```
-
-## Сообщество и поддержка
-
-Мы используем [Discord](https://discord.com/invite/MXE49hrKDk) для поддержки, запросов функций и обсуждений сообщества.
-
- Discord - [Присоединиться к сообществу](https://discord.com/invite/MXE49hrKDk)
- Twitter/X - [@adenhq](https://x.com/aden_hq)
- LinkedIn - [Страница компании](https://www.linkedin.com/company/teamaden/)
-
-## Участие в разработке
-
-Мы приветствуем вклад! Пожалуйста, ознакомьтесь с [CONTRIBUTING.md](CONTRIBUTING.md) для руководств.
-
-1. Сделайте форк репозитория
-2. Создайте ветку функции (`git checkout -b feature/amazing-feature`)
-3. Зафиксируйте изменения (`git commit -m 'Add amazing feature'`)
-4. Отправьте в ветку (`git push origin feature/amazing-feature`)
-5. Откройте Pull Request
-
-## Присоединяйтесь к команде
-
-**Мы нанимаем!** Присоединяйтесь к нам на позициях в инженерии, исследованиях и выходе на рынок.
-
-[Посмотреть открытые позиции](https://jobs.adenhq.com/a8cec478-cdbc-473c-bbd4-f4b7027ec193/applicant)
-
-## Безопасность
-
-По вопросам безопасности, пожалуйста, обратитесь к [SECURITY.md](SECURITY.md).
-
-## Лицензия
-
-Этот проект лицензирован под лицензией Apache 2.0 - см. файл [LICENSE](LICENSE) для деталей.
-
-## Часто задаваемые вопросы (FAQ)
-
-> **Примечание:** Для полных часто задаваемых вопросов обратитесь к английской версии [README.md](README.md).
-
-**В: Зависит ли Aden от LangChain или других фреймворков агентов?**
-
-Нет. Aden построен с нуля без зависимостей от LangChain, CrewAI или других фреймворков агентов. Фреймворк разработан лёгким и гибким, динамически генерируя графы агентов вместо того, чтобы полагаться на предопределённые компоненты.
-
-**В: Каких провайдеров LLM поддерживает Aden?**
-
-Aden поддерживает более 100 провайдеров LLM через интеграцию LiteLLM, включая OpenAI (GPT-4, GPT-4o), Anthropic (модели Claude), Google Gemini, Mistral, Groq и многих других. Просто настройте соответствующую переменную окружения API-ключа и укажите имя модели.
-
-**В: Aden с открытым исходным кодом?**
-
-Да, Aden полностью с открытым исходным кодом под лицензией Apache 2.0. Мы активно поощряем вклад и сотрудничество сообщества.
-
-**В: Что делает Aden отличным от других фреймворков агентов?**
-
-Aden генерирует всю систему агентов из целей на естественном языке, используя кодирующего агента — вы не кодируете рабочие процессы и не определяете графы вручную. Когда агенты терпят неудачу, фреймворк автоматически захватывает данные о сбое, эволюционирует граф агентов и переразвёртывает. Этот цикл самосовершенствования уникален для Aden.
-
-**В: Поддерживает ли Aden рабочие процессы с человеком в контуре?**
-
-Да, Aden полностью поддерживает рабочие процессы с человеком в контуре через узлы вмешательства, которые приостанавливают выполнение для человеческого ввода. Они включают настраиваемые таймауты и политики эскалации, обеспечивая бесшовное сотрудничество между экспертами-людьми и ИИ-агентами.
-
---
-
-<p align="center">
-  Сделано с 🔥 Страстью в Сан-Франциско
-</p>
@@ -1,339 +0,0 @@
-<p align="center">
-  <img width="100%" alt="Hive Banner" src="https://storage.googleapis.com/aden-prod-assets/website/aden-title-card.png" />
-</p>
-
-<p align="center">
-  <a href="README.md">English</a> |
-  <a href="README.zh-CN.md">简体中文</a> |
-  <a href="README.es.md">Español</a> |
-  <a href="README.pt.md">Português</a> |
-  <a href="README.ja.md">日本語</a> |
-  <a href="README.ru.md">Русский</a>
-</p>
-
-[![Apache 2.0 License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/adenhq/hive/blob/main/LICENSE)
-[![Y Combinator](https://img.shields.io/badge/Y%20Combinator-Aden-orange)](https://www.ycombinator.com/companies/aden)
-[![Docker Pulls](https://img.shields.io/docker/pulls/adenhq/hive?logo=Docker&labelColor=%23528bff)](https://hub.docker.com/u/adenhq)
-[![Discord](https://img.shields.io/discord/1172610340073242735?logo=discord&labelColor=%235462eb&logoColor=%23f5f5f5&color=%235462eb)](https://discord.com/invite/MXE49hrKDk)
-[![Twitter Follow](https://img.shields.io/twitter/follow/teamaden?logo=X&color=%23f5f5f5)](https://x.com/aden_hq)
-[![LinkedIn](https://custom-icon-badges.demolab.com/badge/LinkedIn-0A66C2?logo=linkedin-white&logoColor=fff)](https://www.linkedin.com/company/teamaden/)
-
-<p align="center">
-  <img src="https://img.shields.io/badge/AI_Agents-Self--Improving-brightgreen?style=flat-square" alt="AI Agents" />
-  <img src="https://img.shields.io/badge/Multi--Agent-Systems-blue?style=flat-square" alt="Multi-Agent" />
-  <img src="https://img.shields.io/badge/Goal--Driven-Development-purple?style=flat-square" alt="Goal-Driven" />
-  <img src="https://img.shields.io/badge/Human--in--the--Loop-orange?style=flat-square" alt="HITL" />
-  <img src="https://img.shields.io/badge/Production--Ready-red?style=flat-square" alt="Production" />
-</p>
-<p align="center">
-  <img src="https://img.shields.io/badge/OpenAI-supported-412991?style=flat-square&logo=openai" alt="OpenAI" />
-  <img src="https://img.shields.io/badge/Anthropic-supported-d4a574?style=flat-square" alt="Anthropic" />
-  <img src="https://img.shields.io/badge/Google_Gemini-supported-4285F4?style=flat-square&logo=google" alt="Gemini" />
-  <img src="https://img.shields.io/badge/MCP-19_Tools-00ADD8?style=flat-square" alt="MCP" />
-</p>
-
-## 概述
-
-构建可靠的、自我改进的 AI 智能体，无需硬编码工作流。通过与编码智能体对话来定义目标，框架会生成带有动态创建连接代码的节点图。当出现问题时，框架会捕获故障数据，通过编码智能体进化智能体，并重新部署。内置的人机协作节点、凭证管理和实时监控让您在保持适应性的同时拥有完全控制权。
-
-访问 [adenhq.com](https://adenhq.com) 获取完整文档、示例和指南。
-
-## 什么是 Aden
-
-<p align="center">
-  <img width="100%" alt="Aden Architecture" src="docs/assets/aden-architecture-diagram.jpg" />
-</p>
-
-Aden 是一个用于构建、部署、运营和适应 AI 智能体的平台：
-
- **构建** - 编码智能体根据自然语言目标生成专业的工作智能体（销售、营销、运营）
- **部署** - 无头部署，支持 CI/CD 集成和完整的 API 生命周期管理
- **运营** - 实时监控、可观测性和运行时护栏确保智能体可靠运行
- **适应** - 持续评估、监督和适应确保智能体随时间改进
- **基础设施** - 共享内存、LLM 集成、工具和技能为每个智能体提供支持
-
-## 快速链接
-
- **[文档](https://docs.adenhq.com/)** - 完整指南和 API 参考
- **[自托管指南](https://docs.adenhq.com/getting-started/quickstart)** - 在您的基础设施上部署 Hive
- **[更新日志](https://github.com/adenhq/hive/releases)** - 最新更新和版本
-<!-- - **[路线图](https://adenhq.com/roadmap)** - 即将推出的功能和计划 -->
- **[报告问题](https://github.com/adenhq/hive/issues)** - Bug 报告和功能请求
-
-## 快速开始
-
-### 前置要求
-
- [Python 3.11+](https://www.python.org/downloads/) - 用于智能体开发
- [Docker](https://docs.docker.com/get-docker/) (v20.10+) - 可选，用于容器化工具
-
-### 安装
-
-```bash
-# 克隆仓库
-git clone https://github.com/adenhq/hive.git
-cd hive
-
-# 运行 Python 环境设置
-./scripts/setup-python.sh
-```
-
-这将安装：
- **framework** - 核心智能体运行时和图执行器
- **aden_tools** - 19 个 MCP 工具提供智能体能力
- 所有必需的依赖项
-
-### 构建您的第一个智能体
-
-```bash
-# 安装 Claude Code 技能（一次性）
-./quickstart.sh
-
-# 使用 Claude Code 构建智能体
-claude> /building-agents
-
-# 测试您的智能体
-claude> /testing-agent
-
-# 运行您的智能体
-PYTHONPATH=core:exports python -m your_agent_name run --input '{...}'
-```
-
-**[📖 完整设置指南](ENVIRONMENT_SETUP.md)** - 智能体开发的详细说明
-
-## 功能特性
-
- **目标驱动开发** - 用自然语言定义目标；编码智能体生成智能体图和连接代码来实现它们
- **自适应智能体** - 框架捕获故障，更新目标并更新智能体图
- **动态节点连接** - 没有预定义边；连接代码由任何有能力的 LLM 根据您的目标生成
- **SDK 封装节点** - 每个节点开箱即用地获得共享内存、本地 RLM 内存、监控、工具和 LLM 访问
- **人机协作** - 干预节点暂停执行以等待人工输入，支持可配置的超时和升级
- **实时可观测性** - WebSocket 流式传输用于实时监控智能体执行、决策和节点间通信
- **成本与预算控制** - 设置支出限制、节流和自动模型降级策略
- **生产就绪** - 可自托管，为规模和可靠性而构建
-
-## 为什么选择 Aden
-
-传统智能体框架要求您手动设计工作流、定义智能体交互并被动处理故障。Aden 颠覆了这一范式——**您描述结果，系统自动构建自己**。
-
-```mermaid
-flowchart LR
-    subgraph BUILD["🏗️ BUILD"]
-        GOAL["Define Goal<br/>+ Success Criteria"] --> NODES["Add Nodes<br/>LLM/Router/Function"]
-        NODES --> EDGES["Connect Edges<br/>on_success/failure/conditional"]
-        EDGES --> TEST["Test & Validate"] --> APPROVE["Approve & Export"]
-    end
-
-    subgraph EXPORT["📦 EXPORT"]
-        direction TB
-        JSON["agent.json<br/>(GraphSpec)"]
-        TOOLS["tools.py<br/>(Functions)"]
-        MCP["mcp_servers.json<br/>(Integrations)"]
-    end
-
-    subgraph RUN["🚀 RUNTIME"]
-        LOAD["AgentRunner<br/>Load + Parse"] --> SETUP["Setup Runtime<br/>+ ToolRegistry"]
-        SETUP --> EXEC["GraphExecutor<br/>Execute Nodes"]
-
-        subgraph DECISION["Decision Recording"]
-            DEC1["runtime.decide()<br/>intent → options → choice"]
-            DEC2["runtime.record_outcome()<br/>success, result, metrics"]
-        end
-    end
-
-    subgraph INFRA["⚙️ INFRASTRUCTURE"]
-        CTX["NodeContext<br/>memory • llm • tools"]
-        STORE[("FileStorage<br/>Runs & Decisions")]
-    end
-
-    APPROVE --> EXPORT
-    EXPORT --> LOAD
-    EXEC --> DECISION
-    EXEC --> CTX
-    DECISION --> STORE
-    STORE -.->|"Analyze & Improve"| NODES
-
-    style BUILD fill:#ffbe42,stroke:#cc5d00,stroke-width:3px,color:#333
-    style EXPORT fill:#fff59d,stroke:#ed8c00,stroke-width:2px,color:#333
-    style RUN fill:#ffb100,stroke:#cc5d00,stroke-width:3px,color:#333
-    style DECISION fill:#ffcc80,stroke:#ed8c00,stroke-width:2px,color:#333
-    style INFRA fill:#e8763d,stroke:#cc5d00,stroke-width:3px,color:#fff
-    style STORE fill:#ed8c00,stroke:#cc5d00,stroke-width:2px,color:#fff
-```
-
-### Aden 的优势
-
-| 传统框架 | Aden |
-|----------|------|
-| 硬编码智能体工作流 | 用自然语言描述目标 |
-| 手动图定义 | 自动生成智能体图 |
-| 被动错误处理 | 主动自我进化 |
-| 静态工具配置 | 动态 SDK 封装节点 |
-| 单独设置监控 | 内置实时可观测性 |
-| DIY 预算管理 | 集成成本控制和降级 |
-
-### 工作原理
-
-1. **定义目标** → 用简单英语描述您想要实现的目标
-2. **编码智能体生成** → 创建智能体图、连接代码和测试用例
-3. **工作节点执行** → SDK 封装节点以完全可观测性和工具访问运行
-4. **控制平面监控** → 实时指标、预算执行、策略管理
-5. **自我改进** → 失败时，系统进化图并自动重新部署
-
-## Aden 与其他框架的比较
-
-Aden 在智能体开发方面采取了根本不同的方法。虽然大多数框架要求您硬编码工作流或手动定义智能体图，但 Aden 使用**编码智能体从自然语言目标生成整个智能体系统**。当智能体失败时，框架不仅记录错误——它会**自动进化智能体图**并重新部署。
-
-> **注意：** 详细的框架比较表和常见问题解答，请参阅英文版 [README.md](README.md)。
-
-### 何时选择 Aden
-
-选择 Aden 当您需要：
-
- 智能体从失败中**自我改进**而无需人工干预
- **目标驱动的开发**，您描述结果而非工作流
- 具有自动恢复和重新部署的**生产可靠性**
- 无需重写代码即可**快速迭代**智能体架构
- 具有实时监控和人工监督的**完整可观测性**
-
-选择其他框架当您需要：
-
- **类型安全、可预测的工作流**（PydanticAI、Mastra）
- **RAG 和文档处理**（LlamaIndex、Haystack）
- **智能体涌现的研究**（CAMEL）
- **实时语音/多模态**（TEN Framework）
- **简单的组件链接**（LangChain、Swarm）
-
-## 项目结构
-
-```
-hive/
-├── core/                   # 核心框架 - 智能体运行时、图执行器、协议
-├── tools/                  # MCP 工具包 - 19 个工具提供智能体能力
-├── exports/                # 智能体包 - 预构建的智能体和示例
-├── docs/                   # 文档和指南
-├── scripts/                # 构建和实用脚本
-├── .claude/                # Claude Code 技能用于构建智能体
-├── ENVIRONMENT_SETUP.md    # 智能体开发的 Python 设置指南
-├── DEVELOPER.md            # 开发者指南
-├── CONTRIBUTING.md         # 贡献指南
-└── ROADMAP.md              # 产品路线图
-```
-
-## 开发
-
-### Python 智能体开发
-
-使用框架构建和运行目标驱动的智能体：
-
-```bash
-# 一次性设置
-./scripts/setup-python.sh
-
-# 这将安装：
-# - framework 包（核心运行时）
-# - aden_tools 包（19 个 MCP 工具）
-# - 所有依赖项
-
-# 使用 Claude Code 技能构建新智能体
-claude> /building-agents
-
-# 测试智能体
-claude> /testing-agent
-
-# 运行智能体
-PYTHONPATH=core:exports python -m agent_name run --input '{...}'
-```
-
-完整设置说明请参阅 [ENVIRONMENT_SETUP.md](ENVIRONMENT_SETUP.md)。
-
-## 文档
-
- **[开发者指南](DEVELOPER.md)** - 开发者综合指南
- [入门指南](docs/getting-started.md) - 快速设置说明
- [配置指南](docs/configuration.md) - 所有配置选项
- [架构概述](docs/architecture.md) - 系统设计和结构
-
-## 路线图
-
-Aden 智能体框架旨在帮助开发者构建面向结果的、自适应的智能体。请在此查看我们的路线图
-
-[ROADMAP.md](ROADMAP.md)
-
-```mermaid
-timeline
-    title Aden Agent Framework Roadmap
-    section Foundation
-        Architecture : Node-Based Architecture : Python SDK : LLM Integration (OpenAI, Anthropic, Google) : Communication Protocol
-        Coding Agent : Goal Creation Session : Worker Agent Creation : MCP Tools Integration
-        Worker Agent : Human-in-the-Loop : Callback Handlers : Intervention Points : Streaming Interface
-        Tools : File Use : Memory (STM/LTM) : Web Search : Web Scraper : Audit Trail
-        Core : Eval System : Pydantic Validation : Docker Deployment : Documentation : Sample Agents
-    section Expansion
-        Intelligence : Guardrails : Streaming Mode : Semantic Search
-        Platform : JavaScript SDK : Custom Tool Integrator : Credential Store
-        Deployment : Self-Hosted : Cloud Services : CI/CD Pipeline
-        Templates : Sales Agent : Marketing Agent : Analytics Agent : Training Agent : Smart Form Agent
-```
-
-## 社区与支持
-
-我们使用 [Discord](https://discord.com/invite/MXE49hrKDk) 进行支持、功能请求和社区讨论。
-
- Discord - [加入我们的社区](https://discord.com/invite/MXE49hrKDk)
- Twitter/X - [@adenhq](https://x.com/aden_hq)
- LinkedIn - [公司主页](https://www.linkedin.com/company/teamaden/)
-
-## 贡献
-
-我们欢迎贡献！请参阅 [CONTRIBUTING.md](CONTRIBUTING.md) 了解指南。
-
-1. Fork 仓库
-2. 创建功能分支 (`git checkout -b feature/amazing-feature`)
-3. 提交更改 (`git commit -m 'Add amazing feature'`)
-4. 推送到分支 (`git push origin feature/amazing-feature`)
-5. 创建 Pull Request
-
-## 加入我们的团队
-
-**我们正在招聘！** 加入我们的工程、研究和市场推广团队。
-
-[查看开放职位](https://jobs.adenhq.com/a8cec478-cdbc-473c-bbd4-f4b7027ec193/applicant)
-
-## 安全
-
-有关安全问题，请参阅 [SECURITY.md](SECURITY.md)。
-
-## 许可证
-
-本项目采用 Apache License 2.0 许可证 - 详情请参阅 [LICENSE](LICENSE) 文件。
-
-## 常见问题 (FAQ)
-
-> **注意：** 完整的常见问题解答，请参阅英文版 [README.md](README.md)。
-
-**问：Aden 是否依赖 LangChain 或其他智能体框架？**
-
-不。Aden 从头开始构建，不依赖 LangChain、CrewAI 或其他智能体框架。该框架设计精简灵活，动态生成智能体图而非依赖预定义组件。
-
-**问：Aden 支持哪些 LLM 提供商？**
-
-Aden 通过 LiteLLM 集成支持 100 多个 LLM 提供商，包括 OpenAI（GPT-4、GPT-4o）、Anthropic（Claude 模型）、Google Gemini、Mistral、Groq 等。只需设置适当的 API 密钥环境变量并指定模型名称即可。
-
-**问：Aden 是开源的吗？**
-
-是的，Aden 在 Apache License 2.0 下完全开源。我们积极鼓励社区贡献和协作。
-
-**问：Aden 与其他智能体框架有何不同？**
-
-Aden 使用编码智能体从自然语言目标生成整个智能体系统——您无需硬编码工作流或手动定义图。当智能体失败时，框架会自动捕获故障数据、进化智能体图并重新部署。这种自我改进循环是 Aden 独有的。
-
-**问：Aden 支持人机协作工作流吗？**
-
-是的，Aden 通过干预节点完全支持人机协作工作流，这些节点会暂停执行以等待人工输入。包括可配置的超时和升级策略，实现人类专家与 AI 智能体的无缝协作。
-
---
-
-<p align="center">
-  用 🔥 热情打造于旧金山
-</p>
@@ -1,150 +0,0 @@
-Product Roadmap
-
-Aden Agent Framework aims to help developers build outcome oriented, self-adaptive agents. Please find our roadmap here
-
-```mermaid
-timeline
-    title Aden Agent Framework Roadmap
-    section Foundation
-        Architecture : Node-Based Architecture : Python SDK : LLM Integration (OpenAI, Anthropic, Google) : Communication Protocol
-        Coding Agent : Goal Creation Session : Worker Agent Creation : MCP Tools Integration
-        Worker Agent : Human-in-the-Loop : Callback Handlers : Intervention Points : Streaming Interface
-        Tools : File Use : Memory (STM/LTM) : Web Search : Web Scraper : Audit Trail
-        Core : Eval System : Pydantic Validation : Docker Deployment : Documentation : Sample Agents
-    section Expansion
-        Intelligence : Guardrails : Streaming Mode : Semantic Search
-        Platform : JavaScript SDK : Custom Tool Integrator : Credential Store
-        Deployment : Self-Hosted : Cloud Services : CI/CD Pipeline
-        Templates : Sales Agent : Marketing Agent : Analytics Agent : Training Agent : Smart Form Agent
-```
-
---
-
-## Phase 1: Foundation
-
-### Backbone Architecture
- [ ] **Node-Based Architecture (Agent as a node)**
-    - [x] Object schema definition
-    - [x] Node wrapper SDK
-    - [ ] Shared memory access
-    - [ ] Default monitoring hooks
-    - [ ] Tool access layer
-    - [x] LLM integration layer (Natively supports all mainstream LLMs through LiteLLM)
-        - [x] Anthropic
-        - [x] OpenAI
-        - [x] Google
- [ ] **Communication protocol between nodes**
- [ ] **[Coding Agent] Goal Creation Session** (separate from coding session)
-    - [ ] Instruction back and forth
-    - [x] Goal Object schema definition
-    - [ ] Being able to generate the test cases
-    - [ ] Test case validation for worker agent (Outcome driven)
- [ ] **[Coding Agent] Worker Agent Creation**
-    - [x] Coding Agent tools
-    - [ ] Use Template Agent as a start
-    - [x] Use our MCP tools
- [ ] **[Worker Agent] Human-in-the-Loop**
-    - [x] Worker Agents request with questions and options
-    - [x] Callback Handler System to receive events throughout execution
-    - [ ] Tool-Based Intervention Points (tool to pause execution and request human input)
-    - [x] Multiple entrypoint for different event source (e.g. Human input, webhook)
-    - [ ] Streaming Interface for Real-time Monitoring
-    - [ ] Request State Management
-
-### Essential Tools
- [x] **File Use Tool Kit**
- [ ] **Memory Tools**
-    - [x] STM Layer Tool (state-based short-term memory)
-    - [x] LTM Layer Tool (RLM - long-term memory)
- [ ] **Infrastructure Tools**
-    - [x] Runtime Log Tool (logs for coding agent)
-    - [ ] Audit Trail Tool (decision timeline generation)
-    - [ ] Web Search
-    - [ ] Web Scraper
-    - [ ] Recipe for "Add your own tools"
-
-### Memory & File System
- [x] DB for long-term persistent memory (Filesystem as durable scratchpad pattern)
- [x] Session Local memory isolation
-
-### Eval System (Basic)
- [x] Test Driven - Run test case for all agent iteration
- [ ] Failure recording mechanism
- [ ] SDK for defining failure conditions
- [ ] Basic observability hooks
- [ ] User-driven log analysis (OSS approach)
-
-### Data Validation
- [ ] Natively Support data validation of LLMs output with Pydantic
-
-### Developer Experience
- [ ] **Debugging mode**
- [ ] **Documentation**
-    - [ ] Quick start guide
-    - [ ] Goal creation guide
-    - [ ] Agent creation guide
-    - [ ] GitHub Page setup
-    - [ ] README with examples
-    - [ ] Contributing guidelines
- [ ] **Distribution**
-    - [ ] PyPI package
-    - [ ] Docker image on Docker Hub
-
-### Sample Agents
- [ ] Knowledge Agent
- [ ] Blog Writer Agent
- [ ] SDR Agent
-
---
-
-## Phase 2: Expansion
-
-### Basic Guardrails
- [ ] Support Basic Monitoring from Agent node SDK
- [ ] SDK guardrail implementation (in node)
- [ ] Guardrail type support (Determined Condition as Guardrails)
-
-### Agent Capability
- [ ] Streaming mode support
-
-### Cross-Platform
- [ ] JavaScript / TypeScript Version SDK
-
-### File System Enhancement
- [ ] Semantic Search integration
- [ ] Interactive File System in product (frontend integration)
-
-### More Worker Tools
- [ ] Custom Tool Integrator
- [ ] Integration as a tool (Credential Store & Support)
- [ ] **Core Agent Tools**
-    - [ ] Node Discovery Tool (find other agents in the graph)
-    - [ ] HITL Tool (pause execution for human approval)
-    - [ ] Wake-up Tool (resume agent tasks)
-
-### Deployment (Self-Hosted)
- [ ] Docker container standardization
- [ ] Headless backend execution
- [ ] Exposed API for frontend attachment
- [ ] Local monitoring & observability
- [ ] Basic lifecycle APIs (Start, Stop, Pause, Resume)
-
-### Deployment (Cloud)
- [ ] Cloud Service Options
- [ ] Support deployment to 3rd-party platforms
- [ ] Self-deploy + orchestrator connection
- [ ] **CI/CD Pipeline**
-    - [ ] Automated test execution
-    - [ ] Agent version control
-    - [ ] All tests must pass for deployment
-
-### Developer Experience Enhancement
- [ ] Tool usage documentation
- [ ] Discord Support Channel
-
-### More Agent Templates
- [ ] GTM Sales Agent (workflow)
- [ ] GTM Marketing Agent (workflow)
- [ ] Analytics Agent
- [ ] Training Agent
- [ ] Smart Entry / Form Agent (self-evolution emphasis)
@@ -39,8 +39,8 @@ We consider security research conducted in accordance with this policy to be:
 ## Security Best Practices for Users

 1. **Keep Updated**: Always run the latest version
-2. **Secure Configuration**: Review `config.yaml` settings, especially in production
-3. **Environment Variables**: Never commit `.env` files or `config.yaml` with secrets
+2. **Secure Configuration**: Review your `~/.hive/configuration.json`, `.mcp.json`, and environment variable settings, especially in production
+3. **Environment Variables**: Never commit `.env` files or any configuration files that contain secrets
 4. **Network Security**: Use HTTPS in production, configure firewalls appropriately
 5. **Database Security**: Use strong passwords, limit network access

@@ -0,0 +1,11 @@
+import json
+
+with open('/home/timothy/aden/hive/x_rapid_ledger.json', 'r') as f:
+    data = json.load(f)
+
+data['replies'].append({
+    'original_preview': 'Alright, I give in. Here’s my picture with the boss, courtesy of @johnkrausphotos. Oh, and hook ‘em!'
+})
+
+with open('/home/timothy/aden/hive/x_rapid_ledger.json', 'w') as f:
+    json.dump(data, f, indent=2)
@@ -0,0 +1,11 @@
+import json, sys
+
+with open('/home/timothy/aden/hive/x_rapid_ledger.json', 'r') as f:
+    ledger = json.load(f)
+
+text = sys.argv[1]
+for r in ledger['replies']:
+    if r.get('original_preview') == text:
+        print("YES")
+        sys.exit(0)
+print("NO")
@@ -1,14 +1,9 @@
 {
  "mcpServers": {
-    "agent-builder": {
-      "command": "python",
-      "args": ["-m", "framework.mcp.agent_builder_server"],
-      "cwd": "/home/timothy/oss/hive/core"
-    },
    "tools": {
      "command": "python",
      "args": ["-m", "aden_tools.mcp_server", "--stdio"],
-      "cwd": "/home/timothy/oss/hive/tools"
+      "cwd": "tools"
    }
  }
 }
@@ -82,7 +82,7 @@ Register an MCP server as a tool source for your agent.
    "example_tool"
  ],
  "total_mcp_servers": 1,
-  "note": "MCP server 'tools' registered with 6 tools. These tools can now be used in llm_tool_use nodes."
+  "note": "MCP server 'tools' registered with 6 tools. These tools can now be used in event_loop nodes."
 }
 ```

@@ -149,7 +149,7 @@ List tools available from registered MCP servers.
    ]
  },
  "total_tools": 6,
-  "note": "Use these tool names in the 'tools' parameter when adding llm_tool_use nodes"
+  "note": "Use these tool names in the 'tools' parameter when adding event_loop nodes"
 }
 ```

@@ -246,7 +246,7 @@ Here's a complete workflow for building an agent with MCP tools:
    "node_id": "web-searcher",
    "name": "Web Search",
    "description": "Search the web for information",
-    "node_type": "llm_tool_use",
+    "node_type": "event_loop",
    "input_keys": "[\"query\"]",
    "output_keys": "[\"search_results\"]",
    "system_prompt": "Search for {query} using the web_search tool",
@@ -6,7 +6,7 @@ This guide explains how to integrate Model Context Protocol (MCP) servers with t

 The framework provides built-in support for MCP servers, allowing you to:

- **Register MCP servers** via STDIO or HTTP transport
+- **Register MCP servers** via STDIO, HTTP, Unix socket, or SSE transport
 - **Auto-discover tools** from registered servers
 - **Use MCP tools** seamlessly in your agents
 - **Manage multiple MCP servers** simultaneously
@@ -104,6 +104,48 @@ runner.register_mcp_server(
 - `url`: Base URL of the MCP server
 - `headers`: HTTP headers to include (optional)

+### Unix Socket Transport
+
+Best for same-host inter-process communication with lower overhead than TCP:
+
+```python
+runner.register_mcp_server(
+    name="local-ipc-tools",
+    transport="unix",
+    url="http://localhost",
+    socket_path="/tmp/mcp_server.sock",
+    headers={
+        "Authorization": "Bearer token"
+    }
+)
+```
+
+**Configuration:**
+
+- `url`: Base URL for HTTP requests over the socket (required, e.g., `"http://localhost"`)
+- `socket_path`: Absolute path to the Unix socket file (required, e.g., `"/tmp/mcp_server.sock"`)
+- `headers`: HTTP headers to include (optional)
+
+### SSE Transport
+
+Best for real-time, event-driven connections using the MCP SDK's SSE client:
+
+```python
+runner.register_mcp_server(
+    name="streaming-tools",
+    transport="sse",
+    url="http://localhost:8000/sse",
+    headers={
+        "Authorization": "Bearer token"
+    }
+)
+```
+
+**Configuration:**
+
+- `url`: SSE endpoint URL (required, e.g., `"http://localhost:8000/sse"`)
+- `headers`: HTTP headers for the SSE connection (optional)
+
 ## Using MCP Tools in Agents

 Once registered, MCP tools are available just like any other tool:
@@ -119,7 +161,7 @@ builder = WorkflowBuilder()
 builder.add_node(
    node_id="researcher",
    name="Web Researcher",
-    node_type="llm_tool_use",
+    node_type="event_loop",
    system_prompt="Research the topic using web_search",
    tools=["web_search"],  # Tool from tools MCP server
    input_keys=["topic"],
@@ -137,7 +179,7 @@ Tools from MCP servers can be referenced in your agent.json just like built-in t
    {
      "id": "searcher",
      "name": "Web Searcher",
-      "node_type": "llm_tool_use",
+      "node_type": "event_loop",
      "system_prompt": "Search for information about {topic}",
      "tools": ["web_search", "web_scrape"],
      "input_keys": ["topic"],
@@ -258,7 +300,32 @@ runner.register_mcp_server(
 )
 ```

-### 3. Handle Cleanup
+### 3. Use Unix Socket for Same-Host IPC
+
+When both the agent and MCP server run on the same machine, Unix sockets avoid TCP overhead:
+
+```python
+runner.register_mcp_server(
+    name="fast-local-tools",
+    transport="unix",
+    url="http://localhost",
+    socket_path="/tmp/mcp_server.sock"
+)
+```
+
+### 4. Use SSE for Streaming and Real-Time Tools
+
+SSE transport maintains a persistent connection, ideal for event-driven servers:
+
+```python
+runner.register_mcp_server(
+    name="realtime-tools",
+    transport="sse",
+    url="http://realtime-server:8000/sse"
+)
+```
+
+### 5. Handle Cleanup

 Always clean up MCP connections when done:

@@ -280,7 +347,7 @@ async with AgentRunner.load("exports/my-agent") as runner:
    # Automatic cleanup
 ```

-### 4. Tool Name Conflicts
+### 6. Tool Name Conflicts

 If multiple MCP servers provide tools with the same name, the last registered server wins. To avoid conflicts:

@@ -315,6 +382,24 @@ If HTTP transport fails:
 2. Check firewall settings
 3. Verify the URL and port are correct

+### Unix Socket Not Connecting
+
+If Unix socket transport fails:
+
+1. Verify the socket file exists: `ls -la /tmp/mcp_server.sock`
+2. Check file permissions on the socket
+3. Ensure no other process has locked the socket
+4. Verify the `url` field is set (e.g., `"http://localhost"`)
+
+### SSE Connection Issues
+
+If SSE transport fails:
+
+1. Verify the server supports SSE at the given URL
+2. Check that the `mcp` Python package is installed (`pip install mcp`)
+3. Ensure the SSE endpoint is accessible: `curl http://localhost:8000/sse`
+4. Check for firewall or proxy issues blocking long-lived connections
+
 ## Example: Full Agent with MCP Tools

 Here's a complete example of an agent that uses MCP tools:
@@ -1,17 +1,16 @@
-# MCP Server Guide - Agent Builder
+# MCP Server Guide - Agent Building Tools

-This guide covers the MCP (Model Context Protocol) server for building goal-driven agents.
+> **Note:** The standalone `agent-builder` MCP server (`framework.mcp.agent_builder_server`) has been replaced. Agent building is now done via the `coder-tools` server's `initialize_and_build_agent` tool, with underlying logic in `tools/coder_tools_server.py`.
+
+This guide covers the MCP tools available for building goal-driven agents.

 ## Setup

 ### Quick Setup

 ```bash
-# Using the setup script (recommended)
-python setup_mcp.py
-
-# Or using bash
-./setup_mcp.sh
+# Run the quickstart script (recommended)
+./quickstart.sh
 ```

 ### Manual Configuration
@@ -21,10 +20,10 @@ Add to your MCP client configuration (e.g., Claude Desktop):
 ```json
 {
  "mcpServers": {
-    "agent-builder": {
-      "command": "python",
-      "args": ["-m", "framework.mcp.agent_builder_server"],
-      "cwd": "/path/to/goal-agent"
+    "coder-tools": {
+      "command": "uv",
+      "args": ["run", "coder_tools_server.py", "--stdio"],
+      "cwd": "/path/to/hive/tools"
    }
  }
 }
@@ -103,31 +102,20 @@ Add a processing node to the agent graph.
 - `node_id` (string, required): Unique node identifier
 - `name` (string, required): Human-readable name
 - `description` (string, required): What this node does
- `node_type` (string, required): One of: `llm_generate`, `llm_tool_use`, `router`, `function`
+- `node_type` (string, required): Must be `event_loop` (the only valid type)
 - `input_keys` (string, required): JSON array of input variable names
 - `output_keys` (string, required): JSON array of output variable names
- `system_prompt` (string, optional): System prompt for LLM nodes
- `tools` (string, optional): JSON array of tool names for tool_use nodes
- `routes` (string, optional): JSON object of route mappings for router nodes
+- `system_prompt` (string, optional): System prompt for the LLM
+- `tools` (string, optional): JSON array of tool names
+- `client_facing` (boolean, optional): Set to true for human-in-the-loop interaction

-**Node Types:**
+**Node Type:**

-1. **llm_generate**: Uses LLM to generate output from inputs
-   - Requires: `system_prompt`
-   - Tools: Not used
-
-2. **llm_tool_use**: Uses LLM with tools to accomplish tasks
-   - Requires: `system_prompt`, `tools`
-   - Tools: Array of tool names (e.g., `["web_search", "web_fetch"]`)
-
-3. **router**: LLM-powered routing to different paths
-   - Requires: `system_prompt`, `routes`
-   - Routes: Object mapping route names to target node IDs
-   - Example: `{"pass": "success_node", "fail": "retry_node"}`
-
-4. **function**: Executes a pre-defined function
-   - System prompt describes the function behavior
-   - No LLM calls, pure computation
+**event_loop**: LLM-powered node with self-correction loop
+- Requires: `system_prompt`
+- Optional: `tools` (array of tool names, e.g., `["web_search", "web_fetch"]`)
+- Optional: `client_facing` (set to true for HITL / user interaction)
+- Supports: iterative refinement, judge-based evaluation, tool use, streaming

 **Example:**
 ```json
@@ -135,7 +123,7 @@ Add a processing node to the agent graph.
  "node_id": "search_sources",
  "name": "Search Sources",
  "description": "Searches for relevant sources on the topic",
-  "node_type": "llm_tool_use",
+  "node_type": "event_loop",
  "input_keys": "[\"topic\", \"search_queries\"]",
  "output_keys": "[\"sources\", \"source_count\"]",
  "system_prompt": "Search for sources using the provided queries...",
@@ -198,7 +186,7 @@ Export the validated graph as an agent specification.

 **What it does:**
 1. Validates the graph
-2. Auto-generates missing edges from router routes
+2. Validates edge connectivity
 3. Writes files to disk:
   - `exports/{agent-name}/agent.json` - Full agent specification
   - `exports/{agent-name}/README.md` - Auto-generated documentation
@@ -252,47 +240,6 @@ Test the complete agent graph with sample inputs.

 ---

-### Evaluation Rules
-
-#### `add_evaluation_rule`
-Add a rule for the HybridJudge to evaluate node outputs.
-
-**Parameters:**
- `rule_id` (string, required): Unique rule identifier
- `description` (string, required): What this rule checks
- `condition` (string, required): Python expression to evaluate
- `action` (string, required): Action to take: `accept`, `retry`, `escalate`
- `priority` (integer, optional): Rule priority (default: 0)
- `feedback_template` (string, optional): Feedback message template
-
-**Condition Examples:**
- `'result.get("success") == True'` - Check for success flag
- `'result.get("error_type") == "timeout"'` - Check error type
- `'len(result.get("data", [])) > 0'` - Check for non-empty data
-
-**Example:**
-```json
-{
-  "rule_id": "timeout_retry",
-  "description": "Retry on timeout errors",
-  "condition": "result.get('error_type') == 'timeout'",
-  "action": "retry",
-  "priority": 10,
-  "feedback_template": "Timeout occurred, retrying..."
-}
-```
-
-#### `list_evaluation_rules`
-List all configured evaluation rules.
-
-#### `remove_evaluation_rule`
-Remove an evaluation rule.
-
-**Parameters:**
- `rule_id` (string, required): Rule to remove
-
---
-
 ## Example Workflow

 Here's a complete workflow for building a research agent:
@@ -320,7 +267,7 @@ add_node(
    node_id="planner",
    name="Research Planner",
    description="Creates research strategy",
-    node_type="llm_generate",
+    node_type="event_loop",
    input_keys='["topic"]',
    output_keys='["strategy", "queries"]',
    system_prompt="Analyze topic and create research plan..."
@@ -330,7 +277,7 @@ add_node(
    node_id="searcher",
    name="Search Sources",
    description="Find relevant sources",
-    node_type="llm_tool_use",
+    node_type="event_loop",
    input_keys='["queries"]',
    output_keys='["sources"]',
    system_prompt="Search for sources...",
@@ -359,10 +306,9 @@ The exported agent will be saved to `exports/research-agent/`.

 1. **Start with the goal**: Define clear success criteria before building nodes
 2. **Test nodes individually**: Use `test_node` to verify each node works
-3. **Use router nodes for branching**: Don't create edges manually for routers - define routes and they'll be auto-generated
-4. **Add evaluation rules**: Help the judge evaluate outputs deterministically
-5. **Validate early, validate often**: Run `validate_graph` after adding nodes/edges
-6. **Check exports**: Review the generated README.md to verify your agent structure
+3. **Use conditional edges for branching**: Define condition_expr on edges for decision points
+4. **Validate early, validate often**: Run `validate_graph` after adding nodes/edges
+5. **Check exports**: Review the generated README.md to verify your agent structure

 ---

@@ -14,69 +14,14 @@ Framework provides a runtime framework that captures **decisions**, not just act
 ## Installation

 ```bash
-pip install -e .
+uv pip install -e .
 ```

-## MCP Server Setup
+## Agent Building

-The framework includes an MCP (Model Context Protocol) server for building agents. To set up the MCP server:
+Agent scaffolding is handled by the `coder-tools` MCP server (in `tools/coder_tools_server.py`), which provides the `initialize_and_build_agent` tool and related utilities. The package generation logic lives directly in `tools/coder_tools_server.py`.

-### Automated Setup
-
-**Using bash (Linux/macOS):**
-```bash
-./setup_mcp.sh
-```
-
-**Using Python (cross-platform):**
-```bash
-python setup_mcp.py
-```
-
-The setup script will:
-1. Install the framework package
-2. Install MCP dependencies (mcp, fastmcp)
-3. Create/verify `.mcp.json` configuration
-4. Test the MCP server module
-
-### Manual Setup
-
-If you prefer manual setup:
-
-```bash
-# Install framework
-pip install -e .
-
-# Install MCP dependencies
-pip install mcp fastmcp
-
-# Test the server
-python -m framework.mcp.agent_builder_server
-```
-
-### Using with MCP Clients
-
-To use the agent builder with Claude Desktop or other MCP clients, add this to your MCP client configuration:
-
-```json
-{
-  "mcpServers": {
-    "agent-builder": {
-      "command": "python",
-      "args": ["-m", "framework.mcp.agent_builder_server"],
-      "cwd": "/path/to/goal-agent"
-    }
-  }
-}
-```
-
-The MCP server provides tools for:
- Creating agent building sessions
- Defining goals with success criteria
- Adding nodes (llm_generate, llm_tool_use, router, function)
- Connecting nodes with edges
- Validating and exporting agent graphs
- Testing nodes and full agent graphs
+See the [Getting Started Guide](../docs/getting-started.md) for building agents.

 ## Quick Start

@@ -85,14 +30,14 @@ The MCP server provides tools for:
 Run an LLM-powered calculator:

 ```bash
-# Single calculation
-python -m framework calculate "2 + 3 * 4"
+# Run an exported agent
+uv run python -m framework run exports/calculator --input '{"expression": "2 + 3 * 4"}'

-# Interactive mode
-python -m framework interactive
+# Interactive shell session
+uv run python -m framework shell exports/calculator

-# Analyze runs with Builder
-python -m framework analyze calculator
+# Show agent info
+uv run python -m framework info exports/calculator
 ```

 ### Using the Runtime
@@ -132,24 +77,20 @@ runtime.end_run(success=True, narrative="Successfully processed all data")

 The framework includes a goal-based testing framework for validating agent behavior.

+Tests are generated using MCP tools (`generate_constraint_tests`, `generate_success_tests`) which return guidelines. Claude writes tests directly using the Write tool based on these guidelines.
+
 ```bash
-# Generate tests from a goal definition
-python -m framework test-generate goal.json
-
-# Interactively approve generated tests
-python -m framework test-approve <goal_id>
-
 # Run tests against an agent
-python -m framework test-run <agent_path> --parallel 4
+uv run python -m framework test-run <agent_path> --goal <goal_id> --parallel 4

 # Debug failed tests
-python -m framework test-debug <goal_id> <test_id>
+uv run python -m framework test-debug <agent_path> <test_name>

-# List tests by status
-python -m framework test-list <goal_id>
+# List tests for an agent
+uv run python -m framework test-list <agent_path>
 ```

-For detailed testing workflows, see the [testing-agent skill](.claude/skills/testing-agent/SKILL.md).
+For detailed testing workflows, see [developer-guide.md](../docs/developer-guide.md).

 ### Analyzing Agent Behavior with Builder

@@ -0,0 +1,569 @@
+#!/usr/bin/env python3
+"""Antigravity authentication CLI.
+
+Implements OAuth2 flow for Google's Antigravity Code Assist gateway.
+Credentials are stored in ~/.hive/antigravity-accounts.json.
+
+Usage:
+    python -m antigravity_auth auth account add
+    python -m antigravity_auth auth account list
+    python -m antigravity_auth auth account remove <email>
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import logging
+import os
+import secrets
+import socket
+import sys
+import time
+import urllib.parse
+import urllib.request
+import webbrowser
+from http.server import BaseHTTPRequestHandler, HTTPServer
+from pathlib import Path
+from typing import Any
+
+logging.basicConfig(level=logging.INFO, format="%(message)s")
+logger = logging.getLogger(__name__)
+
+# OAuth endpoints
+_OAUTH_AUTH_URL = "https://accounts.google.com/o/oauth2/v2/auth"
+_OAUTH_TOKEN_URL = "https://oauth2.googleapis.com/token"
+
+# Scopes for Antigravity/Cloud Code Assist
+_OAUTH_SCOPES = [
+    "https://www.googleapis.com/auth/cloud-platform",
+    "https://www.googleapis.com/auth/userinfo.email",
+    "https://www.googleapis.com/auth/userinfo.profile",
+]
+
+# Credentials file path in ~/.hive/
+_ACCOUNTS_FILE = Path.home() / ".hive" / "antigravity-accounts.json"
+
+# Default project ID
+_DEFAULT_PROJECT_ID = "rising-fact-p41fc"
+_DEFAULT_REDIRECT_PORT = 51121
+
+# OAuth credentials fetched from the opencode-antigravity-auth project.
+# This project reverse-engineered and published the public OAuth credentials
+# for Google's Antigravity/Cloud Code Assist API.
+# Source: https://github.com/NoeFabris/opencode-antigravity-auth
+_CREDENTIALS_URL = "https://raw.githubusercontent.com/NoeFabris/opencode-antigravity-auth/dev/src/constants.ts"
+
+# Cached credentials fetched from public source
+_cached_client_id: str | None = None
+_cached_client_secret: str | None = None
+
+
+def _fetch_credentials_from_public_source() -> tuple[str | None, str | None]:
+    """Fetch OAuth client ID and secret from the public npm package source on GitHub."""
+    global _cached_client_id, _cached_client_secret
+    if _cached_client_id and _cached_client_secret:
+        return _cached_client_id, _cached_client_secret
+
+    try:
+        req = urllib.request.Request(_CREDENTIALS_URL, headers={"User-Agent": "Hive-Antigravity-Auth/1.0"})
+        with urllib.request.urlopen(req, timeout=10) as resp:
+            content = resp.read().decode("utf-8")
+            import re
+
+            id_match = re.search(r'ANTIGRAVITY_CLIENT_ID\s*=\s*"([^"]+)"', content)
+            secret_match = re.search(r'ANTIGRAVITY_CLIENT_SECRET\s*=\s*"([^"]+)"', content)
+            if id_match:
+                _cached_client_id = id_match.group(1)
+            if secret_match:
+                _cached_client_secret = secret_match.group(1)
+            return _cached_client_id, _cached_client_secret
+    except Exception as e:
+        logger.debug(f"Failed to fetch credentials from public source: {e}")
+    return None, None
+
+
+def get_client_id() -> str:
+    """Get OAuth client ID from env, config, or public source."""
+    env_id = os.environ.get("ANTIGRAVITY_CLIENT_ID")
+    if env_id:
+        return env_id
+
+    # Try hive config
+    hive_cfg = Path.home() / ".hive" / "configuration.json"
+    if hive_cfg.exists():
+        try:
+            with open(hive_cfg) as f:
+                cfg = json.load(f)
+                cfg_id = cfg.get("llm", {}).get("antigravity_client_id")
+                if cfg_id:
+                    return cfg_id
+        except Exception:
+            pass
+
+    # Fetch from public source
+    client_id, _ = _fetch_credentials_from_public_source()
+    if client_id:
+        return client_id
+
+    raise RuntimeError("Could not obtain Antigravity OAuth client ID")
+
+
+def get_client_secret() -> str | None:
+    """Get OAuth client secret from env, config, or public source."""
+    secret = os.environ.get("ANTIGRAVITY_CLIENT_SECRET")
+    if secret:
+        return secret
+
+    # Try to read from hive config
+    hive_cfg = Path.home() / ".hive" / "configuration.json"
+    if hive_cfg.exists():
+        try:
+            with open(hive_cfg) as f:
+                cfg = json.load(f)
+                secret = cfg.get("llm", {}).get("antigravity_client_secret")
+                if secret:
+                    return secret
+        except Exception:
+            pass
+
+    # Fetch from public source (npm package on GitHub)
+    _, secret = _fetch_credentials_from_public_source()
+    return secret
+
+
+def find_free_port() -> int:
+    """Find an available local port."""
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        s.bind(("", 0))
+        s.listen(1)
+        return s.getsockname()[1]
+
+
+class OAuthCallbackHandler(BaseHTTPRequestHandler):
+    """Handle OAuth callback from browser."""
+
+    auth_code: str | None = None
+    state: str | None = None
+    error: str | None = None
+
+    def log_message(self, format: str, *args: Any) -> None:
+        pass  # Suppress default logging
+
+    def do_GET(self) -> None:
+        parsed = urllib.parse.urlparse(self.path)
+
+        if parsed.path == "/oauth-callback":
+            query = urllib.parse.parse_qs(parsed.query)
+
+            if "error" in query:
+                self.error = query["error"][0]
+                self._send_response("Authentication failed. You can close this window.")
+                return
+
+            if "code" in query and "state" in query:
+                OAuthCallbackHandler.auth_code = query["code"][0]
+                OAuthCallbackHandler.state = query["state"][0]
+                self._send_response("Authentication successful! You can close this window and return to the terminal.")
+                return
+
+        self._send_response("Waiting for authentication...")
+
+    def _send_response(self, message: str) -> None:
+        self.send_response(200)
+        self.send_header("Content-Type", "text/html")
+        self.end_headers()
+        html = f"""<!DOCTYPE html>
+<html>
+<head><title>Antigravity Auth</title></head>
+<body style="font-family: system-ui; display: flex; align-items: center;
+      justify-content: center; height: 100vh; margin: 0; background: #1a1a2e;
+      color: #eee;">
+    <div style="text-align: center;">
+        <h2>{message}</h2>
+    </div>
+</body>
+</html>"""
+        self.wfile.write(html.encode())
+
+
+def wait_for_callback(port: int, timeout: int = 300) -> tuple[str | None, str | None, str | None]:
+    """Start local server and wait for OAuth callback."""
+    server = HTTPServer(("localhost", port), OAuthCallbackHandler)
+    server.timeout = 1
+
+    start = time.time()
+    while time.time() - start < timeout:
+        if OAuthCallbackHandler.auth_code:
+            return (
+                OAuthCallbackHandler.auth_code,
+                OAuthCallbackHandler.state,
+                OAuthCallbackHandler.error,
+            )
+        server.handle_request()
+
+    return None, None, "timeout"
+
+
+def exchange_code_for_tokens(
+    code: str, redirect_uri: str, client_id: str, client_secret: str | None
+) -> dict[str, Any] | None:
+    """Exchange authorization code for tokens."""
+    data = {
+        "code": code,
+        "client_id": client_id,
+        "redirect_uri": redirect_uri,
+        "grant_type": "authorization_code",
+    }
+    if client_secret:
+        data["client_secret"] = client_secret
+
+    body = urllib.parse.urlencode(data).encode()
+
+    req = urllib.request.Request(
+        _OAUTH_TOKEN_URL,
+        data=body,
+        headers={"Content-Type": "application/x-www-form-urlencoded"},
+        method="POST",
+    )
+
+    try:
+        with urllib.request.urlopen(req, timeout=30) as resp:
+            return json.loads(resp.read())
+    except Exception as e:
+        logger.error(f"Token exchange failed: {e}")
+        return None
+
+
+def get_user_email(access_token: str) -> str | None:
+    """Get user email from Google API."""
+    req = urllib.request.Request(
+        "https://www.googleapis.com/oauth2/v2/userinfo",
+        headers={"Authorization": f"Bearer {access_token}"},
+    )
+    try:
+        with urllib.request.urlopen(req, timeout=10) as resp:
+            data = json.loads(resp.read())
+            return data.get("email")
+    except Exception:
+        return None
+
+
+def load_accounts() -> dict[str, Any]:
+    """Load existing accounts from file."""
+    if not _ACCOUNTS_FILE.exists():
+        return {"schemaVersion": 4, "accounts": []}
+    try:
+        with open(_ACCOUNTS_FILE) as f:
+            return json.load(f)
+    except Exception:
+        return {"schemaVersion": 4, "accounts": []}
+
+
+def save_accounts(data: dict[str, Any]) -> None:
+    """Save accounts to file."""
+    _ACCOUNTS_FILE.parent.mkdir(parents=True, exist_ok=True)
+    with open(_ACCOUNTS_FILE, "w") as f:
+        json.dump(data, f, indent=2)
+    logger.info(f"Saved credentials to {_ACCOUNTS_FILE}")
+
+
+def validate_credentials(access_token: str, project_id: str = _DEFAULT_PROJECT_ID) -> bool:
+    """Test if credentials work by making a simple API call to Antigravity.
+
+    Returns True if credentials are valid, False otherwise.
+    """
+    endpoint = "https://daily-cloudcode-pa.sandbox.googleapis.com"
+    body = {
+        "project": project_id,
+        "model": "gemini-3-flash",
+        "request": {
+            "contents": [{"role": "user", "parts": [{"text": "hi"}]}],
+            "generationConfig": {"maxOutputTokens": 10},
+        },
+        "requestType": "agent",
+        "userAgent": "antigravity",
+        "requestId": "validation-test",
+    }
+    headers = {
+        "Authorization": f"Bearer {access_token}",
+        "Content-Type": "application/json",
+        "User-Agent": (
+            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Antigravity/1.18.3"
+        ),
+        "X-Goog-Api-Client": "google-cloud-sdk vscode_cloudshelleditor/0.1",
+    }
+
+    try:
+        req = urllib.request.Request(
+            f"{endpoint}/v1internal:generateContent",
+            data=json.dumps(body).encode("utf-8"),
+            headers=headers,
+            method="POST",
+        )
+        with urllib.request.urlopen(req, timeout=30) as resp:
+            json.loads(resp.read())
+            return True
+    except Exception:
+        return False
+
+
+def refresh_access_token(refresh_token: str, client_id: str, client_secret: str | None) -> dict | None:
+    """Refresh the access token using the refresh token."""
+    data = {
+        "grant_type": "refresh_token",
+        "refresh_token": refresh_token,
+        "client_id": client_id,
+    }
+    if client_secret:
+        data["client_secret"] = client_secret
+
+    body = urllib.parse.urlencode(data).encode()
+    req = urllib.request.Request(
+        _OAUTH_TOKEN_URL,
+        data=body,
+        headers={"Content-Type": "application/x-www-form-urlencoded"},
+        method="POST",
+    )
+    try:
+        with urllib.request.urlopen(req, timeout=30) as resp:
+            return json.loads(resp.read())
+    except Exception as e:
+        logger.debug(f"Token refresh failed: {e}")
+        return None
+
+
+def cmd_account_add(args: argparse.Namespace) -> int:
+    """Add a new Antigravity account via OAuth2.
+
+    First checks if valid credentials already exist. If so, validates them
+    and skips OAuth if they work. Otherwise, proceeds with OAuth flow.
+    """
+    client_id = get_client_id()
+    client_secret = get_client_secret()
+
+    # Check if credentials already exist
+    accounts_data = load_accounts()
+    accounts = accounts_data.get("accounts", [])
+
+    if accounts:
+        account = next((a for a in accounts if a.get("enabled", True) is not False), accounts[0])
+        access_token = account.get("access")
+        refresh_token_str = account.get("refresh", "")
+        refresh_token = refresh_token_str.split("|")[0] if refresh_token_str else None
+        project_id = refresh_token_str.split("|")[1] if "|" in refresh_token_str else _DEFAULT_PROJECT_ID
+        email = account.get("email", "unknown")
+        expires_ms = account.get("expires", 0)
+        expires_at = expires_ms / 1000.0 if expires_ms else 0.0
+
+        # Check if token is expired or near expiry
+        if access_token and expires_at and time.time() < expires_at - 60:
+            # Token still valid, test it
+            logger.info(f"Found existing credentials for: {email}")
+            logger.info("Validating existing credentials...")
+            if validate_credentials(access_token, project_id):
+                logger.info("✓ Credentials valid! Skipping OAuth.")
+                return 0
+            else:
+                logger.info("Credentials failed validation, refreshing...")
+        elif refresh_token:
+            logger.info(f"Found expired credentials for: {email}")
+            logger.info("Attempting token refresh...")
+
+            tokens = refresh_access_token(refresh_token, client_id, client_secret)
+            if tokens:
+                new_access = tokens.get("access_token")
+                expires_in = tokens.get("expires_in", 3600)
+                if new_access:
+                    # Update the account
+                    account["access"] = new_access
+                    account["expires"] = int((time.time() + expires_in) * 1000)
+                    accounts_data["last_refresh"] = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
+                    save_accounts(accounts_data)
+
+                    # Validate the refreshed token
+                    logger.info("Validating refreshed credentials...")
+                    if validate_credentials(new_access, project_id):
+                        logger.info("✓ Credentials refreshed and validated!")
+                        return 0
+                    else:
+                        logger.info("Refreshed token failed validation, proceeding with OAuth...")
+            else:
+                logger.info("Token refresh failed, proceeding with OAuth...")
+
+    # No valid credentials, proceed with OAuth
+    if not client_secret:
+        logger.warning(
+            "No client secret configured. Token refresh may fail.\n"
+            "Set ANTIGRAVITY_CLIENT_SECRET env var or add "
+            "'antigravity_client_secret' to ~/.hive/configuration.json"
+        )
+
+    # Use fixed port and path matching Google's expected OAuth redirect URI
+    port = _DEFAULT_REDIRECT_PORT
+    redirect_uri = f"http://localhost:{port}/oauth-callback"
+
+    # Generate state for CSRF protection
+    state = secrets.token_urlsafe(16)
+
+    # Build authorization URL
+    params = {
+        "client_id": client_id,
+        "redirect_uri": redirect_uri,
+        "response_type": "code",
+        "scope": " ".join(_OAUTH_SCOPES),
+        "state": state,
+        "access_type": "offline",
+        "prompt": "consent",
+    }
+    auth_url = f"{_OAUTH_AUTH_URL}?{urllib.parse.urlencode(params)}"
+
+    logger.info("Opening browser for authentication...")
+    logger.info(f"If the browser doesn't open, visit: {auth_url}\n")
+
+    # Open browser
+    webbrowser.open(auth_url)
+
+    # Wait for callback
+    logger.info(f"Listening for callback on port {port}...")
+    code, received_state, error = wait_for_callback(port)
+
+    if error:
+        logger.error(f"Authentication failed: {error}")
+        return 1
+
+    if not code:
+        logger.error("No authorization code received")
+        return 1
+
+    if received_state != state:
+        logger.error("State mismatch - possible CSRF attack")
+        return 1
+
+    # Exchange code for tokens
+    logger.info("Exchanging authorization code for tokens...")
+    tokens = exchange_code_for_tokens(code, redirect_uri, client_id, client_secret)
+
+    if not tokens:
+        return 1
+
+    access_token = tokens.get("access_token")
+    refresh_token = tokens.get("refresh_token")
+    expires_in = tokens.get("expires_in", 3600)
+
+    if not access_token:
+        logger.error("No access token in response")
+        return 1
+
+    # Get user email
+    email = get_user_email(access_token)
+    if email:
+        logger.info(f"Authenticated as: {email}")
+
+    # Load existing accounts and add/update
+    accounts_data = load_accounts()
+    accounts = accounts_data.get("accounts", [])
+
+    # Build new account entry (V4 schema)
+    expires_ms = int((time.time() + expires_in) * 1000)
+    refresh_entry = f"{refresh_token}|{_DEFAULT_PROJECT_ID}"
+
+    new_account = {
+        "access": access_token,
+        "refresh": refresh_entry,
+        "expires": expires_ms,
+        "email": email,
+        "enabled": True,
+    }
+
+    # Update existing account or add new one
+    existing_idx = next((i for i, a in enumerate(accounts) if a.get("email") == email), None)
+    if existing_idx is not None:
+        accounts[existing_idx] = new_account
+        logger.info(f"Updated existing account: {email}")
+    else:
+        accounts.append(new_account)
+        logger.info(f"Added new account: {email}")
+
+    accounts_data["accounts"] = accounts
+    accounts_data["schemaVersion"] = 4
+    accounts_data["last_refresh"] = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
+
+    save_accounts(accounts_data)
+    logger.info("\n✓ Authentication complete!")
+    return 0
+
+
+def cmd_account_list(args: argparse.Namespace) -> int:
+    """List all stored accounts."""
+    data = load_accounts()
+    accounts = data.get("accounts", [])
+
+    if not accounts:
+        logger.info("No accounts configured.")
+        logger.info("Run 'antigravity auth account add' to add one.")
+        return 0
+
+    logger.info("Configured accounts:\n")
+    for i, account in enumerate(accounts, 1):
+        email = account.get("email", "unknown")
+        enabled = "enabled" if account.get("enabled", True) else "disabled"
+        logger.info(f"  {i}. {email} ({enabled})")
+
+    return 0
+
+
+def cmd_account_remove(args: argparse.Namespace) -> int:
+    """Remove an account by email."""
+    email = args.email
+    data = load_accounts()
+    accounts = data.get("accounts", [])
+
+    original_len = len(accounts)
+    accounts = [a for a in accounts if a.get("email") != email]
+
+    if len(accounts) == original_len:
+        logger.error(f"No account found with email: {email}")
+        return 1
+
+    data["accounts"] = accounts
+    save_accounts(data)
+    logger.info(f"Removed account: {email}")
+    return 0
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(
+        description="Antigravity authentication CLI",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    subparsers = parser.add_subparsers(dest="command", help="Commands")
+
+    # auth account add
+    auth_parser = subparsers.add_parser("auth", help="Authentication commands")
+    auth_subparsers = auth_parser.add_subparsers(dest="auth_command")
+
+    account_parser = auth_subparsers.add_parser("account", help="Account management")
+    account_subparsers = account_parser.add_subparsers(dest="account_command")
+
+    add_parser = account_subparsers.add_parser("add", help="Add a new account via OAuth2")
+    add_parser.set_defaults(func=cmd_account_add)
+
+    list_parser = account_subparsers.add_parser("list", help="List configured accounts")
+    list_parser.set_defaults(func=cmd_account_list)
+
+    remove_parser = account_subparsers.add_parser("remove", help="Remove an account")
+    remove_parser.add_argument("email", help="Email of account to remove")
+    remove_parser.set_defaults(func=cmd_account_remove)
+
+    args = parser.parse_args()
+
+    if hasattr(args, "func"):
+        return args.func(args)
+
+    parser.print_help()
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
@@ -0,0 +1,441 @@
+"""OpenAI Codex OAuth PKCE login flow.
+
+Runs the full browser-based OAuth flow so users can authenticate with their
+ChatGPT Plus/Pro subscription without needing the Codex CLI installed.
+
+Usage (from quickstart.sh):
+    uv run python codex_oauth.py
+
+Exit codes:
+    0 - success (credentials saved to ~/.codex/auth.json)
+    1 - failure (user cancelled, timeout, or token exchange error)
+"""
+
+import base64
+import hashlib
+import http.server
+import json
+import os
+import platform
+import queue
+import secrets
+import subprocess
+import sys
+import threading
+import time
+import urllib.error
+import urllib.parse
+import urllib.request
+from datetime import UTC, datetime
+from pathlib import Path
+from typing import TextIO
+
+# OAuth constants (from the Codex CLI binary)
+CLIENT_ID = "app_EMoamEEZ73f0CkXaXp7hrann"
+AUTHORIZE_URL = "https://auth.openai.com/oauth/authorize"
+TOKEN_URL = "https://auth.openai.com/oauth/token"
+REDIRECT_URI = "http://localhost:1455/auth/callback"
+SCOPE = "openid profile email offline_access"
+CALLBACK_PORT = 1455
+
+# Where to save credentials (same location the Codex CLI uses)
+CODEX_AUTH_FILE = Path.home() / ".codex" / "auth.json"
+
+# JWT claim path for account_id
+JWT_CLAIM_PATH = "https://api.openai.com/auth"
+
+
+def _base64url(data: bytes) -> str:
+    return base64.urlsafe_b64encode(data).rstrip(b"=").decode("ascii")
+
+
+def generate_pkce() -> tuple[str, str]:
+    """Generate PKCE code_verifier and code_challenge (S256)."""
+    verifier_bytes = secrets.token_bytes(32)
+    verifier = _base64url(verifier_bytes)
+    challenge = _base64url(hashlib.sha256(verifier.encode("ascii")).digest())
+    return verifier, challenge
+
+
+def build_authorize_url(state: str, challenge: str) -> str:
+    """Build the OpenAI OAuth authorize URL with PKCE."""
+    params = urllib.parse.urlencode(
+        {
+            "response_type": "code",
+            "client_id": CLIENT_ID,
+            "redirect_uri": REDIRECT_URI,
+            "scope": SCOPE,
+            "code_challenge": challenge,
+            "code_challenge_method": "S256",
+            "state": state,
+            "id_token_add_organizations": "true",
+            "codex_cli_simplified_flow": "true",
+            "originator": "hive",
+        }
+    )
+    return f"{AUTHORIZE_URL}?{params}"
+
+
+def exchange_code_for_tokens(code: str, verifier: str) -> dict | None:
+    """Exchange the authorization code for tokens."""
+    data = urllib.parse.urlencode(
+        {
+            "grant_type": "authorization_code",
+            "client_id": CLIENT_ID,
+            "code": code,
+            "code_verifier": verifier,
+            "redirect_uri": REDIRECT_URI,
+        }
+    ).encode("utf-8")
+
+    req = urllib.request.Request(
+        TOKEN_URL,
+        data=data,
+        headers={"Content-Type": "application/x-www-form-urlencoded"},
+        method="POST",
+    )
+
+    try:
+        with urllib.request.urlopen(req, timeout=15) as resp:
+            token_data = json.loads(resp.read())
+    except (urllib.error.URLError, json.JSONDecodeError, TimeoutError, OSError) as exc:
+        print(f"\033[0;31mToken exchange failed: {exc}\033[0m", file=sys.stderr)
+        return None
+
+    if not token_data.get("access_token") or not token_data.get("refresh_token"):
+        print("\033[0;31mToken response missing required fields\033[0m", file=sys.stderr)
+        return None
+
+    return token_data
+
+
+def decode_jwt_payload(token: str) -> dict | None:
+    """Decode the payload of a JWT (no signature verification)."""
+    try:
+        parts = token.split(".")
+        if len(parts) != 3:
+            return None
+        payload = parts[1]
+        # Add padding
+        padding = 4 - len(payload) % 4
+        if padding != 4:
+            payload += "=" * padding
+        decoded = base64.urlsafe_b64decode(payload)
+        return json.loads(decoded)
+    except Exception:
+        return None
+
+
+def get_account_id(access_token: str) -> str | None:
+    """Extract the ChatGPT account_id from the access token JWT."""
+    payload = decode_jwt_payload(access_token)
+    if not payload:
+        return None
+    auth = payload.get(JWT_CLAIM_PATH)
+    if isinstance(auth, dict):
+        account_id = auth.get("chatgpt_account_id")
+        if isinstance(account_id, str) and account_id:
+            return account_id
+    return None
+
+
+def save_credentials(token_data: dict, account_id: str) -> None:
+    """Save credentials to ~/.codex/auth.json in the same format the Codex CLI uses."""
+    auth_data = {
+        "tokens": {
+            "access_token": token_data["access_token"],
+            "refresh_token": token_data["refresh_token"],
+            "account_id": account_id,
+        },
+        "auth_mode": "chatgpt",
+        "last_refresh": datetime.now(UTC).isoformat(),
+    }
+    if "id_token" in token_data:
+        auth_data["tokens"]["id_token"] = token_data["id_token"]
+
+    CODEX_AUTH_FILE.parent.mkdir(parents=True, exist_ok=True, mode=0o700)
+    fd = os.open(CODEX_AUTH_FILE, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
+    with os.fdopen(fd, "w") as f:
+        json.dump(auth_data, f, indent=2)
+
+
+def open_browser(url: str) -> bool:
+    """Open the URL in the user's default browser."""
+    system = platform.system()
+    try:
+        devnull = subprocess.DEVNULL
+        if system == "Darwin":
+            subprocess.Popen(["open", url], stdout=devnull, stderr=devnull)
+        elif system == "Windows":
+            os.startfile(url)  # type: ignore[attr-defined]
+        else:
+            subprocess.Popen(["xdg-open", url], stdout=devnull, stderr=devnull)
+        return True
+    except (AttributeError, OSError):
+        return False
+
+
+class OAuthCallbackHandler(http.server.BaseHTTPRequestHandler):
+    """HTTP handler that captures the OAuth callback."""
+
+    auth_code: str | None = None
+    received_state: str | None = None
+
+    def do_GET(self) -> None:
+        parsed = urllib.parse.urlparse(self.path)
+        if parsed.path != "/auth/callback":
+            self.send_response(404)
+            self.end_headers()
+            self.wfile.write(b"Not found")
+            return
+
+        params = urllib.parse.parse_qs(parsed.query)
+        code = params.get("code", [None])[0]
+        state = params.get("state", [None])[0]
+
+        if not code:
+            self.send_response(400)
+            self.end_headers()
+            self.wfile.write(b"Missing authorization code")
+            return
+
+        OAuthCallbackHandler.auth_code = code
+        OAuthCallbackHandler.received_state = state
+
+        self.send_response(200)
+        self.send_header("Content-Type", "text/html; charset=utf-8")
+        self.end_headers()
+        self.wfile.write(
+            b"<!doctype html><html><head><meta charset='utf-8'/></head>"
+            b"<body><h2>Authentication successful</h2>"
+            b"<p>Return to your terminal to continue.</p></body></html>"
+        )
+
+    def log_message(self, format: str, *args: object) -> None:
+        # Suppress request logging
+        pass
+
+
+def wait_for_callback(state: str, timeout_secs: int = 120) -> str | None:
+    """Start a local HTTP server and wait for the OAuth callback.
+
+    Returns the authorization code on success, None on timeout.
+    """
+    OAuthCallbackHandler.auth_code = None
+    OAuthCallbackHandler.received_state = None
+
+    server = http.server.HTTPServer(("127.0.0.1", CALLBACK_PORT), OAuthCallbackHandler)
+    server.timeout = 1
+
+    deadline = time.time() + timeout_secs
+    server_thread = threading.Thread(target=_serve_until_done, args=(server, deadline, state))
+    server_thread.daemon = True
+    server_thread.start()
+    server_thread.join(timeout=timeout_secs + 2)
+
+    server.server_close()
+
+    if OAuthCallbackHandler.auth_code and OAuthCallbackHandler.received_state == state:
+        return OAuthCallbackHandler.auth_code
+    return None
+
+
+def _serve_until_done(server: http.server.HTTPServer, deadline: float, state: str) -> None:
+    while time.time() < deadline:
+        server.handle_request()
+        if OAuthCallbackHandler.auth_code and OAuthCallbackHandler.received_state == state:
+            return
+
+
+def parse_manual_input(value: str, expected_state: str) -> str | None:
+    """Parse user-pasted redirect URL or auth code."""
+    value = value.strip()
+    if not value:
+        return None
+    try:
+        parsed = urllib.parse.urlparse(value)
+        params = urllib.parse.parse_qs(parsed.query)
+        code = params.get("code", [None])[0]
+        state = params.get("state", [None])[0]
+        if state and state != expected_state:
+            return None
+        return code
+    except Exception:
+        pass
+    # Maybe it's just the raw code
+    if len(value) > 10 and " " not in value:
+        return value
+    return None
+
+
+def _read_manual_input_lines(
+    manual_inputs: queue.Queue[str],
+    stop_event: threading.Event,
+    stdin: TextIO | None = None,
+) -> None:
+    stream = sys.stdin if stdin is None else stdin
+
+    while not stop_event.is_set():
+        try:
+            manual = stream.readline()
+        except (EOFError, OSError):
+            return
+
+        if not manual:
+            return
+
+        if manual.strip():
+            manual_inputs.put(manual)
+
+
+def wait_for_code_from_callback_or_stdin(
+    expected_state: str,
+    callback_result: list[str | None],
+    callback_done: threading.Event,
+    timeout_secs: float = 120,
+    poll_interval: float = 0.1,
+    stdin: TextIO | None = None,
+) -> str | None:
+    manual_inputs: queue.Queue[str] = queue.Queue()
+    stop_event = threading.Event()
+
+    # Read stdin on a daemon thread so manual paste works on platforms where
+    # select() cannot poll console handles, including Windows terminals.
+    threading.Thread(
+        target=_read_manual_input_lines,
+        args=(manual_inputs, stop_event, stdin),
+        daemon=True,
+    ).start()
+
+    deadline = time.time() + timeout_secs
+    try:
+        while time.time() < deadline:
+            if callback_result[0]:
+                return callback_result[0]
+
+            while True:
+                try:
+                    manual = manual_inputs.get_nowait()
+                except queue.Empty:
+                    break
+
+                code = parse_manual_input(manual, expected_state)
+                if code:
+                    return code
+
+            if callback_done.is_set():
+                return callback_result[0]
+
+            time.sleep(poll_interval)
+
+        return callback_result[0]
+    finally:
+        stop_event.set()
+
+
+def main() -> int:
+    # Generate PKCE and state
+    verifier, challenge = generate_pkce()
+    state = secrets.token_hex(16)
+
+    # Build URL
+    auth_url = build_authorize_url(state, challenge)
+
+    print()
+    print("\033[1mOpenAI Codex OAuth Login\033[0m")
+    print()
+
+    # Try to start the local callback server first
+    try:
+        server_available = True
+        # Quick test that port is free
+        import socket
+
+        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+        sock.settimeout(1)
+        result = sock.connect_ex(("127.0.0.1", CALLBACK_PORT))
+        sock.close()
+        if result == 0:
+            print(f"\033[1;33mPort {CALLBACK_PORT} is in use. Using manual paste mode.\033[0m")
+            server_available = False
+    except Exception:
+        server_available = True
+
+    # Open browser
+    browser_opened = open_browser(auth_url)
+    if browser_opened:
+        print("  Browser opened for OpenAI sign-in...")
+    else:
+        print("  Could not open browser automatically.")
+
+    print()
+    print("  If the browser didn't open, visit this URL:")
+    print(f"  \033[0;36m{auth_url}\033[0m")
+    print()
+
+    code = None
+
+    if server_available:
+        print("  Waiting for authentication (up to 2 minutes)...")
+        print("  \033[2mOr paste the redirect URL below if the callback didn't work:\033[0m")
+        print()
+
+        # Start callback server in background
+        callback_result: list[str | None] = [None]
+        callback_done = threading.Event()
+
+        def run_server() -> None:
+            try:
+                callback_result[0] = wait_for_callback(state, timeout_secs=120)
+            finally:
+                callback_done.set()
+
+        server_thread = threading.Thread(target=run_server)
+        server_thread.daemon = True
+        server_thread.start()
+
+        try:
+            code = wait_for_code_from_callback_or_stdin(
+                state,
+                callback_result,
+                callback_done,
+                timeout_secs=120,
+            )
+        except KeyboardInterrupt:
+            print("\n\033[0;31mCancelled.\033[0m")
+            return 1
+    else:
+        # Manual paste mode
+        try:
+            manual = input("  Paste the redirect URL: ").strip()
+            code = parse_manual_input(manual, state)
+        except (KeyboardInterrupt, EOFError):
+            print("\n\033[0;31mCancelled.\033[0m")
+            return 1
+
+    if not code:
+        print("\n\033[0;31mAuthentication timed out or failed.\033[0m")
+        return 1
+
+    # Exchange code for tokens
+    print()
+    print("  Exchanging authorization code for tokens...")
+    token_data = exchange_code_for_tokens(code, verifier)
+    if not token_data:
+        return 1
+
+    # Extract account_id from JWT
+    account_id = get_account_id(token_data["access_token"])
+    if not account_id:
+        print("\033[0;31mFailed to extract account ID from token.\033[0m", file=sys.stderr)
+        return 1
+
+    # Save credentials
+    save_credentials(token_data, account_id)
+    print("  \033[0;32mAuthentication successful!\033[0m")
+    print(f"  Credentials saved to {CODEX_AUTH_FILE}")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
@@ -1,199 +0,0 @@
-#!/usr/bin/env python3
-"""
-Example: Integrating MCP Servers with the Core Framework
-
-This example demonstrates how to:
-1. Register MCP servers programmatically
-2. Use MCP tools in agents
-3. Load MCP servers from configuration files
-"""
-
-import asyncio
-from pathlib import Path
-
-from framework.runner.runner import AgentRunner
-
-
-async def example_1_programmatic_registration():
-    """Example 1: Register MCP server programmatically"""
-    print("\n=== Example 1: Programmatic MCP Server Registration ===\n")
-
-    # Load an existing agent
-    runner = AgentRunner.load("exports/task-planner")
-
-    # Register tools MCP server via STDIO
-    num_tools = runner.register_mcp_server(
-        name="tools",
-        transport="stdio",
-        command="python",
-        args=["-m", "aden_tools.mcp_server", "--stdio"],
-        cwd="../tools",
-    )
-
-    print(f"Registered {num_tools} tools from tools MCP server")
-
-    # List all available tools
-    tools = runner._tool_registry.get_tools()
-    print(f"\nAvailable tools: {list(tools.keys())}")
-
-    # Run the agent with MCP tools available
-    result = await runner.run({
-        "objective": "Search for 'Claude AI' and summarize the top 3 results"
-    })
-
-    print(f"\nAgent result: {result}")
-
-    # Cleanup
-    runner.cleanup()
-
-
-async def example_2_http_transport():
-    """Example 2: Connect to MCP server via HTTP"""
-    print("\n=== Example 2: HTTP MCP Server Connection ===\n")
-
-    # First, start the tools MCP server in HTTP mode:
-    # cd tools && python mcp_server.py --port 4001
-
-    runner = AgentRunner.load("exports/task-planner")
-
-    # Register tools via HTTP
-    num_tools = runner.register_mcp_server(
-        name="tools-http",
-        transport="http",
-        url="http://localhost:4001",
-    )
-
-    print(f"Registered {num_tools} tools from HTTP MCP server")
-
-    # Cleanup
-    runner.cleanup()
-
-
-async def example_3_config_file():
-    """Example 3: Load MCP servers from configuration file"""
-    print("\n=== Example 3: Load from Configuration File ===\n")
-
-    # Create a test agent folder with mcp_servers.json
-    test_agent_path = Path("exports/task-planner")
-
-    # Copy example config (in practice, you'd place this in your agent folder)
-    import shutil
-    shutil.copy(
-        "examples/mcp_servers.json",
-        test_agent_path / "mcp_servers.json"
-    )
-
-    # Load agent - MCP servers will be auto-discovered
-    runner = AgentRunner.load(test_agent_path)
-
-    # Tools are automatically available
-    tools = runner._tool_registry.get_tools()
-    print(f"Available tools: {list(tools.keys())}")
-
-    # Cleanup
-    runner.cleanup()
-
-    # Clean up the test config
-    (test_agent_path / "mcp_servers.json").unlink()
-
-
-async def example_4_custom_agent_with_mcp_tools():
-    """Example 4: Build custom agent that uses MCP tools"""
-    print("\n=== Example 4: Custom Agent with MCP Tools ===\n")
-
-    from framework.builder.workflow import WorkflowBuilder
-
-    # Create a workflow builder
-    builder = WorkflowBuilder()
-
-    # Define goal
-    builder.set_goal(
-        goal_id="web-researcher",
-        name="Web Research Agent",
-        description="Search the web and summarize findings"
-    )
-
-    # Add success criteria
-    builder.add_success_criterion(
-        "search-results",
-        "Successfully retrieve at least 3 web search results"
-    )
-    builder.add_success_criterion(
-        "summary",
-        "Provide a clear, concise summary of the findings"
-    )
-
-    # Add nodes that will use MCP tools
-    builder.add_node(
-        node_id="web-searcher",
-        name="Web Search",
-        description="Search the web for information",
-        node_type="llm_tool_use",
-        system_prompt="Search for {query} and return the top results. Use the web_search tool.",
-        tools=["web_search"],  # This tool comes from tools MCP server
-        input_keys=["query"],
-        output_keys=["search_results"],
-    )
-
-    builder.add_node(
-        node_id="summarizer",
-        name="Summarize Results",
-        description="Summarize the search results",
-        node_type="llm_generate",
-        system_prompt="Summarize the following search results in 2-3 sentences: {search_results}",
-        input_keys=["search_results"],
-        output_keys=["summary"],
-    )
-
-    # Connect nodes
-    builder.add_edge("web-searcher", "summarizer")
-
-    # Set entry point
-    builder.set_entry("web-searcher")
-    builder.set_terminal("summarizer")
-
-    # Export the agent
-    export_path = Path("exports/web-research-agent")
-    export_path.mkdir(parents=True, exist_ok=True)
-    builder.export(export_path)
-
-    # Load and register MCP server
-    runner = AgentRunner.load(export_path)
-    runner.register_mcp_server(
-        name="tools",
-        transport="stdio",
-        command="python",
-        args=["-m", "aden_tools.mcp_server", "--stdio"],
-        cwd="../tools",
-    )
-
-    # Run the agent
-    result = await runner.run({"query": "latest AI breakthroughs 2026"})
-
-    print(f"\nAgent completed with result:\n{result}")
-
-    # Cleanup
-    runner.cleanup()
-
-
-async def main():
-    """Run all examples"""
-    print("=" * 60)
-    print("MCP Integration Examples")
-    print("=" * 60)
-
-    try:
-        # Run examples
-        await example_1_programmatic_registration()
-        # await example_2_http_transport()  # Requires HTTP server running
-        # await example_3_config_file()
-        # await example_4_custom_agent_with_mcp_tools()
-
-    except Exception as e:
-        print(f"\nError running example: {e}")
-        import traceback
-        traceback.print_exc()
-
-
-if __name__ == "__main__":
-    asyncio.run(main())
@@ -4,8 +4,8 @@
      "name": "tools",
      "description": "Aden tools including web search, file operations, and PDF reading",
      "transport": "stdio",
-      "command": "python",
-      "args": ["mcp_server.py", "--stdio"],
+      "command": "uv",
+      "args": ["run", "python", "mcp_server.py", "--stdio"],
      "cwd": "../tools",
      "env": {
        "BRAVE_SEARCH_API_KEY": "${BRAVE_SEARCH_API_KEY}"
@@ -1,78 +1,20 @@
-"""
-Aden Hive Framework: A goal-driven agent runtime optimized for Builder observability.
+"""Hive Agent Framework.

-The runtime is designed around DECISIONS, not just actions. Every significant
-choice the agent makes is captured with:
- What it was trying to do (intent)
- What options it considered
- What it chose and why
- What happened as a result
- Whether that was good or bad (evaluated post-hoc)
-
-This gives the Builder LLM the information it needs to improve agent behavior.
-
-## Testing Framework
-
-The framework includes a Goal-Based Testing system (Goal → Agent → Eval):
- Generate tests from Goal success_criteria and constraints
- Mandatory user approval before tests are stored
- Parallel test execution with error categorization
- Debug tools with fix suggestions
-
-See `framework.testing` for details.
+Core classes:
+    ColonyRuntime -- orchestrates parallel worker clones in a colony
+    AgentLoop      -- the LLM + tool execution loop (one per worker)
+    AgentLoader    -- loads agent config from disk, builds pipeline
+    DecisionTracker -- records decisions for post-hoc analysis
 """

-from framework.schemas.decision import Decision, Option, Outcome, DecisionEvaluation
-from framework.schemas.run import Run, RunSummary, Problem
-from framework.runtime.core import Runtime
-from framework.builder.query import BuilderQuery
-from framework.llm import LLMProvider, AnthropicProvider
-from framework.runner import AgentRunner, AgentOrchestrator
-
-# Testing framework
-from framework.testing import (
-    Test,
-    TestResult,
-    TestSuiteResult,
-    TestStorage,
-    ApprovalStatus,
-    ErrorCategory,
-    ConstraintTestGenerator,
-    SuccessCriteriaTestGenerator,
-    ParallelTestRunner,
-    ParallelConfig,
-    DebugTool,
-)
+from framework.agent_loop import AgentLoop
+from framework.host import ColonyRuntime
+from framework.loader import AgentLoader
+from framework.tracker import DecisionTracker

 __all__ = [
-    # Schemas
-    "Decision",
-    "Option",
-    "Outcome",
-    "DecisionEvaluation",
-    "Run",
-    "RunSummary",
-    "Problem",
-    # Runtime
-    "Runtime",
-    # Builder
-    "BuilderQuery",
-    # LLM
-    "LLMProvider",
-    "AnthropicProvider",
-    # Runner
-    "AgentRunner",
-    "AgentOrchestrator",
-    # Testing
-    "Test",
-    "TestResult",
-    "TestSuiteResult",
-    "TestStorage",
-    "ApprovalStatus",
-    "ErrorCategory",
-    "ConstraintTestGenerator",
-    "SuccessCriteriaTestGenerator",
-    "ParallelTestRunner",
-    "ParallelConfig",
-    "DebugTool",
+    "ColonyRuntime",
+    "AgentLoader",
+    "AgentLoop",
+    "DecisionTracker",
 ]
@@ -1,4 +1,4 @@
-"""Allow running as python -m framework"""
+"""Allow running as ``python -m framework``, which powers the ``hive`` console entry point."""

 from framework.cli import main

@@ -0,0 +1,34 @@
+"""Agent loop -- the core agent execution primitive."""
+
+from framework.agent_loop.conversation import (  # noqa: F401
+    ConversationStore,
+    Message,
+    NodeConversation,
+)
+from framework.agent_loop.types import (  # noqa: F401
+    AgentContext,
+    AgentProtocol,
+    AgentResult,
+    AgentSpec,
+)
+
+
+def __getattr__(name: str):
+    if name in ("AgentLoop", "JudgeProtocol", "JudgeVerdict", "LoopConfig", "OutputAccumulator"):
+        from framework.agent_loop.agent_loop import (
+            AgentLoop,
+            JudgeProtocol,
+            JudgeVerdict,
+            LoopConfig,
+            OutputAccumulator,
+        )
+
+        _exports = {
+            "AgentLoop": AgentLoop,
+            "JudgeProtocol": JudgeProtocol,
+            "JudgeVerdict": JudgeVerdict,
+            "LoopConfig": LoopConfig,
+            "OutputAccumulator": OutputAccumulator,
+        }
+        return _exports[name]
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
@@ -0,0 +1,7 @@
+"""Agent loop internals -- compaction, judge, tools, subagent execution.
+
+Re-exports from legacy locations for the new import path.
+"""
+
+from framework.agent_loop.internals.compaction import *  # noqa: F401, F403
+from framework.agent_loop.internals.synthetic_tools import *  # noqa: F401, F403
@@ -0,0 +1,848 @@
+"""Conversation compaction pipeline.
+
+Implements the multi-level compaction strategy:
+0. Microcompaction (count-based tool result clearing — cheapest)
+1. Prune old tool results (token-budget based)
+2. Structure-preserving compaction (spillover)
+3. LLM summary compaction (with recursive splitting)
+4. Emergency deterministic summary (no LLM)
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+import re
+import time
+from datetime import UTC, datetime
+from pathlib import Path
+from typing import Any
+
+from framework.agent_loop.conversation import Message, NodeConversation
+from framework.agent_loop.internals.event_publishing import publish_context_usage
+from framework.agent_loop.internals.types import LoopConfig, OutputAccumulator
+from framework.host.event_bus import EventBus
+from framework.orchestrator.node import NodeContext
+
+logger = logging.getLogger(__name__)
+
+# Limits for LLM compaction
+LLM_COMPACT_CHAR_LIMIT: int = 240_000
+LLM_COMPACT_MAX_DEPTH: int = 10
+
+# Microcompaction: tools whose results can be safely cleared
+COMPACTABLE_TOOLS: frozenset[str] = frozenset(
+    {
+        "read_file",
+        "run_command",
+        "web_search",
+        "web_fetch",
+        "grep_search",
+        "glob_search",
+        "write_file",
+        "edit_file",
+        "browser_screenshot",
+        "list_directory",
+    }
+)
+
+# Keep at most this many compactable tool results; clear older ones
+MICROCOMPACT_KEEP_RECENT: int = 8
+
+# Circuit-breaker: stop auto-compacting after this many consecutive failures
+MAX_CONSECUTIVE_FAILURES: int = 3
+
+# Track consecutive compaction failures per conversation (module-level)
+_failure_counts: dict[int, int] = {}
+
+# Track last compaction time per conversation for recompaction detection
+_last_compact_times: dict[int, float] = {}
+
+
+def microcompact(
+    conversation: NodeConversation,
+    *,
+    keep_recent: int = MICROCOMPACT_KEEP_RECENT,
+) -> int:
+    """Clear old compactable tool results by count, keeping only the most recent.
+
+    This is the cheapest possible compaction — no LLM call, no structural
+    changes, just replaces old tool result content with a short placeholder.
+    Inspired by Claude Code's cached-microcompact strategy.
+
+    Returns the number of tool results cleared.
+    """
+    # Collect indices of compactable tool results (newest first)
+    compactable_indices: list[int] = []
+    messages = conversation.messages
+    for i in range(len(messages) - 1, -1, -1):
+        msg = messages[i]
+        if msg.role != "tool" or msg.is_error or msg.is_skill_content:
+            continue
+        if msg.content.startswith(("Pruned tool result", "[Pruned tool result", "[Old tool result")):
+            continue
+        if len(msg.content) < 100:
+            continue
+
+        # Check if the tool that produced this result is compactable
+        tool_name = _find_tool_name_for_result(messages, msg)
+        if tool_name and tool_name in COMPACTABLE_TOOLS:
+            compactable_indices.append(i)
+
+    # Keep the most recent N, clear the rest
+    to_clear = compactable_indices[keep_recent:]
+    if not to_clear:
+        return 0
+
+    cleared = 0
+    for i in to_clear:
+        msg = messages[i]
+        spillover = _extract_spillover_filename_inline(msg.content)
+        orig_len = len(msg.content)
+        if spillover:
+            placeholder = (
+                f"Old tool result ({orig_len:,} chars) cleared from context. "
+                f"Full data saved at: {spillover}\n"
+                f"Read the complete data with read_file(path='{spillover}')."
+            )
+        else:
+            placeholder = f"Old tool result ({orig_len:,} chars) cleared from context."
+
+        # Mutate in-place (microcompact is synchronous, no store writes)
+        conversation._messages[i] = Message(
+            seq=msg.seq,
+            role=msg.role,
+            content=placeholder,
+            tool_use_id=msg.tool_use_id,
+            tool_calls=msg.tool_calls,
+            is_error=msg.is_error,
+            phase_id=msg.phase_id,
+            is_transition_marker=msg.is_transition_marker,
+        )
+        cleared += 1
+
+    if cleared > 0:
+        # Invalidate cached token count
+        conversation._last_api_input_tokens = None
+
+    return cleared
+
+
+def _find_tool_name_for_result(messages: list[Message], tool_msg: Message) -> str | None:
+    """Find the tool name from the assistant message that triggered this tool result."""
+    if not tool_msg.tool_use_id:
+        return None
+    for msg in messages:
+        if msg.tool_calls:
+            for tc in msg.tool_calls:
+                if tc.get("id") == tool_msg.tool_use_id:
+                    return tc.get("function", {}).get("name")
+    return None
+
+
+def _extract_spillover_filename_inline(content: str) -> str | None:
+    """Quick inline check for spillover filename in tool result content.
+
+    Matches both the new prose format ("saved at: /path") and the
+    legacy bracketed trailer ("saved to '/path'").
+    """
+    match = re.search(r"saved at:\s*(\S+)", content, re.IGNORECASE)
+    if match:
+        return match.group(1)
+    match = re.search(r"saved to '([^']+)'", content, re.IGNORECASE)
+    return match.group(1) if match else None
+
+
+async def compact(
+    ctx: NodeContext,
+    conversation: NodeConversation,
+    accumulator: OutputAccumulator | None,
+    *,
+    config: LoopConfig,
+    event_bus: EventBus | None,
+    char_limit: int = LLM_COMPACT_CHAR_LIMIT,
+    max_depth: int = LLM_COMPACT_MAX_DEPTH,
+) -> None:
+    """Run the full compaction pipeline if conversation needs compaction.
+
+    Pipeline stages (in order, short-circuits when budget is restored):
+    0. Microcompaction (count-based tool result clearing — cheapest)
+    1. Prune old tool results (token-budget based)
+    2. Structure-preserving compaction (free, no LLM)
+    3. LLM summary compaction (recursive split if too large)
+    4. Emergency deterministic summary (fallback)
+    """
+    conv_id = id(conversation)
+
+    # Circuit breaker: stop LLM-based compaction after repeated failures,
+    # but still fall through to the emergency deterministic summary so
+    # the conversation doesn't silently grow past the context window.
+    # Without this, a persistent LLM outage during compaction would
+    # leave the agent stuck sending oversized prompts until the API 400s.
+    _llm_compaction_skipped = _failure_counts.get(conv_id, 0) >= MAX_CONSECUTIVE_FAILURES
+    if _llm_compaction_skipped:
+        logger.warning(
+            "Circuit breaker: LLM compaction disabled after %d failures — skipping straight to emergency summary",
+            _failure_counts[conv_id],
+        )
+
+    # Recompaction detection
+    now = time.monotonic()
+    last_time = _last_compact_times.get(conv_id)
+    if last_time is not None and (now - last_time) < 30:
+        logger.warning(
+            "Recompaction chain detected: only %.1fs since last compaction",
+            now - last_time,
+        )
+
+    ratio_before = conversation.usage_ratio()
+    phase_grad = getattr(ctx, "continuous_mode", False)
+    pre_inventory: list[dict[str, Any]] | None = None
+
+    if ratio_before >= 1.0:
+        pre_inventory = build_message_inventory(conversation)
+
+    # --- Step 0: Microcompaction (count-based, cheapest) ---
+    mc_cleared = microcompact(conversation)
+    if mc_cleared > 0:
+        logger.info(
+            "Microcompact cleared %d old tool results: %.0f%% -> %.0f%%",
+            mc_cleared,
+            ratio_before * 100,
+            conversation.usage_ratio() * 100,
+        )
+    if not conversation.needs_compaction():
+        _record_success(conv_id, now)
+        await log_compaction(
+            ctx,
+            conversation,
+            ratio_before,
+            event_bus,
+            pre_inventory=pre_inventory,
+        )
+        return
+
+    # --- Step 1: Prune old tool results (free, fast) ---
+    protect = max(2000, config.max_context_tokens // 12)
+    pruned = await conversation.prune_old_tool_results(
+        protect_tokens=protect,
+        min_prune_tokens=max(1000, protect // 3),
+    )
+    if pruned > 0:
+        logger.info(
+            "Pruned %d old tool results: %.0f%% -> %.0f%%",
+            pruned,
+            ratio_before * 100,
+            conversation.usage_ratio() * 100,
+        )
+    if not conversation.needs_compaction():
+        _record_success(conv_id, now)
+        await log_compaction(
+            ctx,
+            conversation,
+            ratio_before,
+            event_bus,
+            pre_inventory=pre_inventory,
+        )
+        return
+
+    # --- Step 2: Standard structure-preserving compaction (free, no LLM) ---
+    spill_dir = config.spillover_dir
+    if spill_dir:
+        await conversation.compact_preserving_structure(
+            spillover_dir=spill_dir,
+            keep_recent=4,
+            phase_graduated=phase_grad,
+        )
+    if not conversation.needs_compaction():
+        _record_success(conv_id, now)
+        await log_compaction(
+            ctx,
+            conversation,
+            ratio_before,
+            event_bus,
+            pre_inventory=pre_inventory,
+        )
+        return
+
+    # --- Step 3: LLM summary compaction ---
+    if ctx.llm is not None and not _llm_compaction_skipped:
+        logger.info(
+            "LLM summary compaction triggered (%.0f%% usage)",
+            conversation.usage_ratio() * 100,
+        )
+        try:
+            summary = await llm_compact(
+                ctx,
+                list(conversation.messages),
+                accumulator,
+                char_limit=char_limit,
+                max_depth=max_depth,
+                max_context_tokens=config.max_context_tokens,
+            )
+            await conversation.compact(
+                summary,
+                keep_recent=2,
+                phase_graduated=phase_grad,
+            )
+        except Exception as e:
+            logger.warning("LLM compaction failed: %s", e)
+            _failure_counts[conv_id] = _failure_counts.get(conv_id, 0) + 1
+
+    if not conversation.needs_compaction():
+        _record_success(conv_id, now)
+        await log_compaction(
+            ctx,
+            conversation,
+            ratio_before,
+            event_bus,
+            pre_inventory=pre_inventory,
+        )
+        return
+
+    # --- Step 4: Emergency deterministic summary (LLM failed/unavailable) ---
+    logger.warning(
+        "Emergency compaction (%.0f%% usage)",
+        conversation.usage_ratio() * 100,
+    )
+    summary = build_emergency_summary(ctx, accumulator, conversation, config)
+    await conversation.compact(
+        summary,
+        keep_recent=1,
+        phase_graduated=phase_grad,
+    )
+    _record_success(conv_id, now)
+    await log_compaction(
+        ctx,
+        conversation,
+        ratio_before,
+        event_bus,
+        pre_inventory=pre_inventory,
+    )
+
+
+def _record_success(conv_id: int, timestamp: float) -> None:
+    """Reset failure counter and record compaction time on success."""
+    _failure_counts.pop(conv_id, None)
+    _last_compact_times[conv_id] = timestamp
+
+
+# --- LLM compaction with binary-search splitting ----------------------
+
+
+def strip_images_from_messages(messages: list[Message]) -> list[Message]:
+    """Strip image_content from messages before LLM summarisation.
+
+    Images/documents are replaced with ``[image]`` markers so the summary
+    notes they existed without wasting tokens sending binary data to the
+    compaction LLM.  Returns a new list (original messages are not mutated).
+    """
+    stripped: list[Message] = []
+    for msg in messages:
+        if msg.image_content:
+            n_images = len(msg.image_content)
+            marker = " ".join("[image]" for _ in range(n_images))
+            content = f"{msg.content}\n{marker}" if msg.content else marker
+            stripped.append(
+                Message(
+                    seq=msg.seq,
+                    role=msg.role,
+                    content=content,
+                    tool_use_id=msg.tool_use_id,
+                    tool_calls=msg.tool_calls,
+                    is_error=msg.is_error,
+                    phase_id=msg.phase_id,
+                    is_transition_marker=msg.is_transition_marker,
+                    image_content=None,  # stripped
+                )
+            )
+        else:
+            stripped.append(msg)
+    return stripped
+
+
+async def llm_compact(
+    ctx: NodeContext,
+    messages: list,
+    accumulator: OutputAccumulator | None = None,
+    _depth: int = 0,
+    *,
+    char_limit: int = LLM_COMPACT_CHAR_LIMIT,
+    max_depth: int = LLM_COMPACT_MAX_DEPTH,
+    max_context_tokens: int = 128_000,
+) -> str:
+    """Summarise *messages* with LLM, splitting recursively if too large.
+
+    If the formatted text exceeds ``LLM_COMPACT_CHAR_LIMIT`` or the LLM
+    rejects the call with a context-length error, the messages are split
+    in half and each half is summarised independently.  Tool history is
+    appended once at the top-level call (``_depth == 0``).
+    """
+    from framework.agent_loop.conversation import extract_tool_call_history
+    from framework.agent_loop.internals.tool_result_handler import is_context_too_large_error
+
+    if _depth > max_depth:
+        raise RuntimeError(f"LLM compaction recursion limit ({max_depth})")
+
+    # Strip images before summarisation to avoid wasting tokens
+    if _depth == 0:
+        messages = strip_images_from_messages(messages)
+
+    formatted = format_messages_for_summary(messages)
+
+    # Proactive split: avoid wasting an API call on oversized input
+    if len(formatted) > char_limit and len(messages) > 1:
+        summary = await _llm_compact_split(
+            ctx,
+            messages,
+            accumulator,
+            _depth,
+            char_limit=char_limit,
+            max_depth=max_depth,
+            max_context_tokens=max_context_tokens,
+        )
+    else:
+        prompt = build_llm_compaction_prompt(
+            ctx,
+            accumulator,
+            formatted,
+            max_context_tokens=max_context_tokens,
+        )
+        summary_budget = max(1024, max_context_tokens // 2)
+        try:
+            response = await ctx.llm.acomplete(
+                messages=[{"role": "user", "content": prompt}],
+                system=(
+                    "You are a conversation compactor for an AI agent. "
+                    "Write a detailed summary that allows the agent to "
+                    "continue its work. Preserve user-stated rules, "
+                    "constraints, and account/identity preferences verbatim."
+                ),
+                max_tokens=summary_budget,
+            )
+            summary = response.content
+        except Exception as e:
+            if is_context_too_large_error(e) and len(messages) > 1:
+                logger.info(
+                    "LLM context too large (depth=%d, msgs=%d) — splitting",
+                    _depth,
+                    len(messages),
+                )
+                summary = await _llm_compact_split(
+                    ctx,
+                    messages,
+                    accumulator,
+                    _depth,
+                    char_limit=char_limit,
+                    max_depth=max_depth,
+                    max_context_tokens=max_context_tokens,
+                )
+            else:
+                raise
+
+    # Append tool history at top level only
+    if _depth == 0:
+        tool_history = extract_tool_call_history(messages)
+        if tool_history and "TOOLS ALREADY CALLED" not in summary:
+            summary += "\n\n" + tool_history
+
+    return summary
+
+
+async def _llm_compact_split(
+    ctx: NodeContext,
+    messages: list,
+    accumulator: OutputAccumulator | None,
+    _depth: int,
+    *,
+    char_limit: int = LLM_COMPACT_CHAR_LIMIT,
+    max_depth: int = LLM_COMPACT_MAX_DEPTH,
+    max_context_tokens: int = 128_000,
+) -> str:
+    """Split messages in half and summarise each half independently."""
+    mid = max(1, len(messages) // 2)
+    s1 = await llm_compact(
+        ctx,
+        messages[:mid],
+        None,
+        _depth + 1,
+        char_limit=char_limit,
+        max_depth=max_depth,
+        max_context_tokens=max_context_tokens,
+    )
+    s2 = await llm_compact(
+        ctx,
+        messages[mid:],
+        accumulator,
+        _depth + 1,
+        char_limit=char_limit,
+        max_depth=max_depth,
+        max_context_tokens=max_context_tokens,
+    )
+    return s1 + "\n\n" + s2
+
+
+# --- Compaction helpers ------------------------------------------------
+
+
+def format_messages_for_summary(messages: list) -> str:
+    """Format messages as text for LLM summarisation."""
+    lines: list[str] = []
+    for m in messages:
+        if m.role == "tool":
+            content = m.content[:500]
+            if len(m.content) > 500:
+                content += "..."
+            lines.append(f"[tool result]: {content}")
+        elif m.role == "assistant" and m.tool_calls:
+            names = [tc.get("function", {}).get("name", "?") for tc in m.tool_calls]
+            text = m.content[:200] if m.content else ""
+            lines.append(f"[assistant (calls: {', '.join(names)})]: {text}")
+        else:
+            lines.append(f"[{m.role}]: {m.content}")
+    return "\n\n".join(lines)
+
+
+def build_llm_compaction_prompt(
+    ctx: NodeContext,
+    accumulator: OutputAccumulator | None,
+    formatted_messages: str,
+    *,
+    max_context_tokens: int = 128_000,
+) -> str:
+    """Build prompt for LLM compaction targeting 50% of token budget.
+
+    Uses a structured section format inspired by Claude Code's compact
+    service.  Each section focuses on a different aspect of the conversation
+    so the summariser produces consistently useful, well-organised output.
+    """
+    spec = ctx.agent_spec
+    ctx_lines = [f"NODE: {spec.name} (id={spec.id})"]
+    if spec.description:
+        ctx_lines.append(f"PURPOSE: {spec.description}")
+    if spec.success_criteria:
+        ctx_lines.append(f"SUCCESS CRITERIA: {spec.success_criteria}")
+
+    if accumulator:
+        acc = accumulator.to_dict()
+        done = {k: v for k, v in acc.items() if v is not None}
+        todo = [k for k, v in acc.items() if v is None]
+        if done:
+            ctx_lines.append("OUTPUTS ALREADY SET:\n" + "\n".join(f"  {k}: {str(v)[:150]}" for k, v in done.items()))
+        if todo:
+            ctx_lines.append(f"OUTPUTS STILL NEEDED: {', '.join(todo)}")
+    elif spec.output_keys:
+        ctx_lines.append(f"OUTPUTS STILL NEEDED: {', '.join(spec.output_keys)}")
+
+    target_tokens = max_context_tokens // 2
+    target_chars = target_tokens * 4
+    node_ctx = "\n".join(ctx_lines)
+
+    return (
+        "You are compacting an AI agent's conversation history. "
+        "The agent is still working and needs to continue.\n\n"
+        f"AGENT CONTEXT:\n{node_ctx}\n\n"
+        f"CONVERSATION MESSAGES:\n{formatted_messages}\n\n"
+        "INSTRUCTIONS:\n"
+        f"Write a summary of approximately {target_chars} characters "
+        f"(~{target_tokens} tokens).\n\n"
+        "Organise the summary into these sections (omit empty ones):\n\n"
+        "1. **Primary Request and Intent** — What the user originally asked "
+        "for and the high-level goal the agent is working toward.\n"
+        "2. **Key Technical Concepts** — Important domain-specific terms, "
+        "patterns, or architectural decisions established in the conversation.\n"
+        "3. **Files and Code Sections** — Specific files read/written/edited "
+        "with brief descriptions of changes. Include short code snippets only "
+        "when they capture critical logic.\n"
+        "4. **Errors and Fixes** — Problems encountered and how they were "
+        "resolved. Include root causes so the agent doesn't repeat them.\n"
+        "5. **Problem Solving Efforts** — Approaches tried, dead ends hit, "
+        "and reasoning behind the current strategy.\n"
+        "6. **User Messages** — Preserve ALL user-stated rules, constraints, "
+        "identity preferences, and account details verbatim.\n"
+        "7. **Pending Tasks** — Work remaining, outputs still needed, and "
+        "any blockers.\n"
+        "8. **Current Work** — The most recent action taken and the immediate "
+        "next step the agent should perform. This section is the most important "
+        "for seamless resumption.\n\n"
+        "Additional rules:\n"
+        "- Be detailed enough that the agent can resume without re-doing work.\n"
+        "- Preserve key decisions made and results obtained.\n"
+        "- When in doubt, keep information rather than discard it.\n"
+    )
+
+
+def build_message_inventory(conversation: NodeConversation) -> list[dict[str, Any]]:
+    """Build a per-message size inventory for debug logging."""
+    inventory: list[dict[str, Any]] = []
+    for message in conversation.messages:
+        content_chars = len(message.content)
+        tool_call_args_chars = 0
+        tool_name = None
+        if message.tool_calls:
+            for tool_call in message.tool_calls:
+                args = tool_call.get("function", {}).get("arguments", "")
+                tool_call_args_chars += len(args) if isinstance(args, str) else len(json.dumps(args))
+            names = [tool_call.get("function", {}).get("name", "?") for tool_call in message.tool_calls]
+            tool_name = ", ".join(names)
+        elif message.role == "tool" and message.tool_use_id:
+            for previous in conversation.messages:
+                if previous.tool_calls:
+                    for tool_call in previous.tool_calls:
+                        if tool_call.get("id") == message.tool_use_id:
+                            tool_name = tool_call.get("function", {}).get("name", "?")
+                            break
+                if tool_name:
+                    break
+        entry: dict[str, Any] = {
+            "seq": message.seq,
+            "role": message.role,
+            "content_chars": content_chars,
+        }
+        if tool_call_args_chars:
+            entry["tool_call_args_chars"] = tool_call_args_chars
+        if tool_name:
+            entry["tool"] = tool_name
+        if message.is_error:
+            entry["is_error"] = True
+        if message.phase_id:
+            entry["phase"] = message.phase_id
+        if content_chars > 2000:
+            entry["preview"] = message.content[:200] + "…"
+        inventory.append(entry)
+    return inventory
+
+
+def write_compaction_debug_log(
+    ctx: NodeContext,
+    before_pct: int,
+    after_pct: int,
+    level: str,
+    inventory: list[dict[str, Any]] | None,
+) -> None:
+    """Write detailed compaction analysis to ~/.hive/compaction_log/."""
+    log_dir = Path.home() / ".hive" / "compaction_log"
+    log_dir.mkdir(parents=True, exist_ok=True)
+
+    ts = datetime.now(UTC).strftime("%Y%m%dT%H%M%S_%f")
+    node_label = ctx.agent_id.replace("/", "_")
+    log_path = log_dir / f"{ts}_{node_label}.md"
+
+    lines: list[str] = [
+        f"# Compaction Debug — {ctx.agent_id}",
+        f"**Time:** {datetime.now(UTC).isoformat()}",
+        f"**Node:** {ctx.agent_spec.name} (`{ctx.agent_id}`)",
+    ]
+    if ctx.stream_id:
+        lines.append(f"**Stream:** {ctx.stream_id}")
+    lines.append(f"**Level:** {level}")
+    lines.append(f"**Usage:** {before_pct}% → {after_pct}%")
+    lines.append("")
+
+    if inventory:
+        total_chars = sum(entry.get("content_chars", 0) + entry.get("tool_call_args_chars", 0) for entry in inventory)
+        lines.append(f"## Pre-Compaction Message Inventory ({len(inventory)} messages, {total_chars:,} total chars)")
+        lines.append("")
+        ranked = sorted(
+            inventory,
+            key=lambda entry: entry.get("content_chars", 0) + entry.get("tool_call_args_chars", 0),
+            reverse=True,
+        )
+        lines.append("| # | seq | role | tool | chars | % of total | flags |")
+        lines.append("|---|-----|------|------|------:|------------|-------|")
+        for i, entry in enumerate(ranked, 1):
+            chars = entry.get("content_chars", 0) + entry.get("tool_call_args_chars", 0)
+            pct = (chars / total_chars * 100) if total_chars else 0
+            tool = entry.get("tool", "")
+            flags: list[str] = []
+            if entry.get("is_error"):
+                flags.append("error")
+            if entry.get("phase"):
+                flags.append(f"phase={entry['phase']}")
+            lines.append(
+                f"| {i} | {entry['seq']} | {entry['role']} | {tool} | {chars:,} | {pct:.1f}% | {', '.join(flags)} |"
+            )
+
+        large = [entry for entry in ranked if entry.get("preview")]
+        if large:
+            lines.append("")
+            lines.append("### Large message previews")
+            for entry in large:
+                lines.append(f"\n**seq={entry['seq']}** ({entry['role']}, {entry.get('tool', '')}):")
+                lines.append(f"```\n{entry['preview']}\n```")
+    lines.append("")
+
+    try:
+        log_path.write_text("\n".join(lines), encoding="utf-8")
+        logger.debug("Compaction debug log written to %s", log_path)
+    except OSError:
+        logger.debug("Failed to write compaction debug log to %s", log_path)
+
+
+async def log_compaction(
+    ctx: NodeContext,
+    conversation: NodeConversation,
+    ratio_before: float,
+    event_bus: EventBus | None,
+    *,
+    pre_inventory: list[dict[str, Any]] | None = None,
+) -> None:
+    """Log compaction result to runtime logger and event bus."""
+    ratio_after = conversation.usage_ratio()
+    before_pct = round(ratio_before * 100)
+    after_pct = round(ratio_after * 100)
+
+    # Determine label from what happened
+    if after_pct >= before_pct - 1:
+        level = "prune_only"
+    elif ratio_after <= 0.6:
+        level = "llm"
+    else:
+        level = "structural"
+
+    logger.info(
+        "Compaction complete (%s): %d%% -> %d%%",
+        level,
+        before_pct,
+        after_pct,
+    )
+
+    if ctx.runtime_logger:
+        ctx.runtime_logger.log_step(
+            node_id=ctx.agent_id,
+            node_type="event_loop",
+            step_index=-1,
+            llm_text=f"Context compacted ({level}): {before_pct}% \u2192 {after_pct}%",
+            verdict="COMPACTION",
+            verdict_feedback=f"level={level} before={before_pct}% after={after_pct}%",
+        )
+
+    if event_bus:
+        from framework.host.event_bus import AgentEvent, EventType
+
+        event_data: dict[str, Any] = {
+            "level": level,
+            "usage_before": before_pct,
+            "usage_after": after_pct,
+        }
+        if pre_inventory is not None:
+            event_data["message_inventory"] = pre_inventory
+        await event_bus.publish(
+            AgentEvent(
+                type=EventType.CONTEXT_COMPACTED,
+                stream_id=ctx.stream_id or ctx.agent_id,
+                node_id=ctx.agent_id,
+                data=event_data,
+            )
+        )
+
+    await publish_context_usage(event_bus, ctx, conversation, "post_compaction")
+
+    if os.environ.get("HIVE_COMPACTION_DEBUG"):
+        write_compaction_debug_log(ctx, before_pct, after_pct, level, pre_inventory)
+
+
+def build_emergency_summary(
+    ctx: NodeContext,
+    accumulator: OutputAccumulator | None = None,
+    conversation: NodeConversation | None = None,
+    config: LoopConfig | None = None,
+) -> str:
+    """Build a structured emergency compaction summary.
+
+    Unlike normal/aggressive compaction which uses an LLM summary,
+    emergency compaction cannot afford an LLM call (context is already
+    way over budget).  Instead, build a deterministic summary from the
+    node's known state so the LLM can continue working after
+    compaction without losing track of its task and inputs.
+    """
+    parts = ["EMERGENCY COMPACTION — previous conversation was too large and has been replaced with this summary.\n"]
+
+    # 1. Node identity
+    spec = ctx.agent_spec
+    parts.append(f"NODE: {spec.name} (id={spec.id})")
+    if spec.description:
+        parts.append(f"PURPOSE: {spec.description}")
+
+    # 2. Inputs the node received
+    input_lines = []
+    for key in spec.input_keys:
+        value = ctx.input_data.get(key)
+        if value is not None:
+            # Truncate long values but keep them recognisable
+            v_str = str(value)
+            if len(v_str) > 200:
+                v_str = v_str[:200] + "…"
+            input_lines.append(f"  {key}: {v_str}")
+    if input_lines:
+        parts.append("INPUTS:\n" + "\n".join(input_lines))
+
+    # 3. Output accumulator state (what's been set so far)
+    if accumulator:
+        acc_state = accumulator.to_dict()
+        set_keys = {k: v for k, v in acc_state.items() if v is not None}
+        missing = [k for k, v in acc_state.items() if v is None]
+        if set_keys:
+            lines = [f"  {k}: {str(v)[:150]}" for k, v in set_keys.items()]
+            parts.append("OUTPUTS ALREADY SET:\n" + "\n".join(lines))
+        if missing:
+            parts.append(f"OUTPUTS STILL NEEDED: {', '.join(missing)}")
+    elif spec.output_keys:
+        parts.append(f"OUTPUTS STILL NEEDED: {', '.join(spec.output_keys)}")
+
+    # 4. Available tools reminder
+    if spec.tools:
+        parts.append(f"AVAILABLE TOOLS: {', '.join(spec.tools)}")
+
+    # 5. Spillover files — list actual files so the LLM can load
+    # them immediately instead of having to call list_data_files first.
+    spillover_dir = config.spillover_dir if config else None
+    if spillover_dir:
+        try:
+            from pathlib import Path
+
+            data_dir = Path(spillover_dir)
+            if data_dir.is_dir():
+                all_files = sorted(f.name for f in data_dir.iterdir() if f.is_file())
+                # Separate conversation history files from regular data files
+                conv_files = [f for f in all_files if re.match(r"conversation_\d+\.md$", f)]
+                data_files = [f for f in all_files if f not in conv_files]
+
+                if conv_files:
+                    conv_list = "\n".join(f"  - {f}  (full path: {data_dir / f})" for f in conv_files)
+                    parts.append(
+                        "CONVERSATION HISTORY (freeform messages saved during compaction — "
+                        "use read_file('<filename>') to review earlier dialogue):\n" + conv_list
+                    )
+                if data_files:
+                    file_list = "\n".join(f"  - {f}  (full path: {data_dir / f})" for f in data_files[:30])
+                    parts.append("DATA FILES (use read_file('<filename>') to read):\n" + file_list)
+                if not all_files:
+                    parts.append(
+                        "NOTE: Large tool results may have been saved to files. "
+                        "Use list_directory to check the data directory."
+                    )
+        except Exception:
+            parts.append("NOTE: Large tool results were saved to files. Use read_file(path='<path>') to read them.")
+
+    # 6. Tool call history (prevent re-calling tools)
+    if conversation is not None:
+        tool_history = _extract_tool_call_history(conversation)
+        if tool_history:
+            parts.append(tool_history)
+
+    parts.append("\nContinue working towards setting the remaining outputs. Use your tools and the inputs above.")
+    return "\n\n".join(parts)
+
+
+def _extract_tool_call_history(conversation: NodeConversation) -> str:
+    """Extract tool call history from conversation messages.
+
+    This is the instance-level variant that operates on a NodeConversation
+    directly (vs. the module-level extract_tool_call_history in conversation.py
+    which works on raw message lists).
+    """
+    from framework.agent_loop.conversation import extract_tool_call_history
+
+    return extract_tool_call_history(list(conversation.messages))
@@ -0,0 +1,265 @@
+"""Cursor persistence, queue draining, and pause detection.
+
+Handles the checkpoint/resume cycle: restoring state from a previous
+conversation store, writing cursor data, and managing injection/trigger
+queues between iterations.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+from collections.abc import Awaitable, Callable
+from dataclasses import dataclass
+from typing import Any
+
+from framework.agent_loop.conversation import ConversationStore, NodeConversation
+from framework.agent_loop.internals.types import LoopConfig, OutputAccumulator, TriggerEvent
+from framework.llm.capabilities import supports_image_tool_results
+from framework.orchestrator.node import NodeContext
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class RestoredState:
+    """State recovered from a previous checkpoint."""
+
+    conversation: NodeConversation
+    accumulator: OutputAccumulator
+    start_iteration: int
+    recent_responses: list[str]
+    recent_tool_fingerprints: list[list[tuple[str, str]]]
+    pending_input: dict[str, Any] | None
+
+
+async def restore(
+    conversation_store: ConversationStore | None,
+    ctx: NodeContext,
+    config: LoopConfig,
+) -> RestoredState | None:
+    """Attempt to restore from a previous checkpoint.
+
+    Returns a ``RestoredState`` with conversation, accumulator, iteration
+    counter, and stall/doom-loop detection state — everything needed to
+    resume exactly where execution stopped.
+    """
+    if conversation_store is None:
+        return None
+
+    # In isolated mode, filter parts by phase_id so the node only sees
+    # its own messages in the shared flat conversation store.  In
+    # continuous mode (or when _restore is called for timer-resume)
+    # load all parts — the full conversation threads across nodes.
+    _is_continuous = getattr(ctx, "continuous_mode", False)
+    # The queen has agent_id="queen" but messages are stored with phase_id=None.
+    # Only apply phase filtering for non-queen workers in a multi-agent setup.
+    phase_filter = None if (_is_continuous or ctx.agent_id == "queen") else ctx.agent_id
+    conversation = await NodeConversation.restore(
+        conversation_store,
+        phase_id=phase_filter,
+        run_id=ctx.effective_run_id,
+    )
+    if conversation is None:
+        logger.info(
+            "[restore] No conversation found for agent_id=%s phase_filter=%s run_id=%s",
+            ctx.agent_id,
+            phase_filter,
+            ctx.effective_run_id,
+        )
+        return None
+
+    logger.info(
+        "[restore] Restored %d messages for agent_id=%s phase_filter=%s run_id=%s",
+        conversation.message_count,
+        ctx.agent_id,
+        phase_filter,
+        ctx.effective_run_id,
+    )
+
+    # If run_id filtering removed all messages, this is an intentional
+    # restart (new run), not a crash recovery.  Return None so the caller
+    # falls through to the fresh-conversation path.
+    if conversation.message_count == 0:
+        return None
+
+    accumulator = await OutputAccumulator.restore(conversation_store, run_id=ctx.effective_run_id)
+    accumulator.spillover_dir = config.spillover_dir
+    accumulator.max_value_chars = config.max_output_value_chars
+
+    cursor = await conversation_store.read_cursor() or {}
+    start_iteration = cursor.get("iteration", 0) + 1
+
+    # Restore stall/doom-loop detection state
+    recent_responses: list[str] = cursor.get("recent_responses", [])
+    raw_fps = cursor.get("recent_tool_fingerprints", [])
+    recent_tool_fingerprints: list[list[tuple[str, str]]] = [
+        [tuple(pair) for pair in fps]  # type: ignore[misc]
+        for fps in raw_fps
+    ]
+    pending_input = cursor.get("pending_input")
+    if not isinstance(pending_input, dict):
+        pending_input = None
+
+    logger.info(
+        f"Restored event loop: iteration={start_iteration}, "
+        f"messages={conversation.message_count}, "
+        f"outputs={list(accumulator.values.keys())}, "
+        f"stall_window={len(recent_responses)}, "
+        f"doom_window={len(recent_tool_fingerprints)}"
+    )
+    return RestoredState(
+        conversation=conversation,
+        accumulator=accumulator,
+        start_iteration=start_iteration,
+        recent_responses=recent_responses,
+        recent_tool_fingerprints=recent_tool_fingerprints,
+        pending_input=pending_input,
+    )
+
+
+async def write_cursor(
+    conversation_store: ConversationStore | None,
+    ctx: NodeContext,
+    conversation: NodeConversation,
+    accumulator: OutputAccumulator,
+    iteration: int,
+    *,
+    recent_responses: list[str] | None = None,
+    recent_tool_fingerprints: list[list[tuple[str, str]]] | None = None,
+    pending_input: dict[str, Any] | None = None,
+) -> None:
+    """Write checkpoint cursor for crash recovery.
+
+    Persists iteration counter, accumulator outputs, and stall/doom-loop
+    detection state so that resume picks up exactly where execution stopped.
+    """
+    if conversation_store:
+        cursor = await conversation_store.read_cursor() or {}
+        cursor.update(
+            {
+                "iteration": iteration,
+                "node_id": ctx.agent_id,
+                "outputs": accumulator.to_dict(),
+            }
+        )
+        # Persist stall/doom-loop detection state for reliable resume
+        if recent_responses is not None:
+            cursor["recent_responses"] = recent_responses
+        if recent_tool_fingerprints is not None:
+            # Convert list[list[tuple]] → list[list[list]] for JSON
+            cursor["recent_tool_fingerprints"] = [[list(pair) for pair in fps] for fps in recent_tool_fingerprints]
+        # Persist blocked-input state so restored runs re-block instead of
+        # manufacturing a synthetic continuation turn.
+        cursor["pending_input"] = pending_input
+        await conversation_store.write_cursor(cursor)
+
+
+async def drain_injection_queue(
+    queue: asyncio.Queue,
+    conversation: NodeConversation,
+    *,
+    ctx: NodeContext,
+    describe_images_as_text_fn: (Callable[[list[dict[str, Any]]], Awaitable[str | None]] | None) = None,
+) -> int:
+    """Drain all pending injected events as user messages. Returns count."""
+    count = 0
+    logger.debug(
+        "[drain_injection_queue] Starting to drain queue, initial queue size: %s",
+        queue.qsize() if hasattr(queue, "qsize") else "unknown",
+    )
+    while not queue.empty():
+        try:
+            content, is_client_input, image_content = queue.get_nowait()
+            logger.info(
+                "[drain] injected message (client_input=%s, images=%d): %s",
+                is_client_input,
+                len(image_content) if image_content else 0,
+                content[:200] if content else "(empty)",
+            )
+            if image_content and ctx.llm and not supports_image_tool_results(ctx.llm.model):
+                logger.info(
+                    "Model '%s' does not support images; attempting vision fallback",
+                    ctx.llm.model,
+                )
+                if describe_images_as_text_fn is not None:
+                    description = await describe_images_as_text_fn(image_content)
+                    if description:
+                        content = f"{content}\n\n{description}" if content else description
+                        logger.info("[drain] image described as text via vision fallback")
+                    else:
+                        logger.info("[drain] no vision fallback available; images dropped")
+                image_content = None
+            # Real user input is stored as-is; external events get a prefix
+            if is_client_input:
+                await conversation.add_user_message(
+                    content,
+                    is_client_input=True,
+                    image_content=image_content,
+                )
+            else:
+                await conversation.add_user_message(f"[External event]: {content}")
+            count += 1
+        except asyncio.QueueEmpty:
+            break
+    return count
+
+
+async def drain_trigger_queue(
+    queue: asyncio.Queue,
+    conversation: NodeConversation,
+) -> int:
+    """Drain all pending trigger events as a single batched user message.
+
+    Multiple triggers are merged so the LLM sees them atomically and can
+    reason about all pending triggers before acting.
+    """
+    triggers: list[TriggerEvent] = []
+    while not queue.empty():
+        try:
+            triggers.append(queue.get_nowait())
+        except asyncio.QueueEmpty:
+            break
+
+    if not triggers:
+        return 0
+
+    parts: list[str] = []
+    for t in triggers:
+        task = t.payload.get("task", "")
+        task_line = f"\nTask: {task}" if task else ""
+        payload_str = json.dumps(t.payload, default=str)
+        parts.append(f"[TRIGGER: {t.trigger_type}/{t.source_id}]{task_line}\n{payload_str}")
+
+    combined = "\n\n".join(parts)
+    logger.info("[drain] %d trigger(s): %s", len(triggers), combined[:200])
+    await conversation.add_user_message(combined)
+    return len(triggers)
+
+
+async def check_pause(
+    ctx: NodeContext,
+    conversation: NodeConversation,
+    iteration: int,
+) -> bool:
+    """
+    Check if pause has been requested. Returns True if paused.
+
+    Note: This check happens BEFORE starting iteration N, after completing N-1.
+    If paused, the node exits having completed {iteration} iterations (0 to iteration-1).
+    """
+    # Check executor-level pause event (for /pause command, Ctrl+Z)
+    if ctx.pause_event and ctx.pause_event.is_set():
+        completed = iteration  # 0-indexed: iteration=3 means 3 iterations completed (0,1,2)
+        logger.info(f"⏸ Pausing after {completed} iteration(s) completed (executor-level)")
+        return True
+
+    # Check context-level pause flags (legacy/alternative methods)
+    pause_requested = ctx.input_data.get("pause_requested", False)
+    if pause_requested:
+        completed = iteration
+        logger.info(f"⏸ Pausing after {completed} iteration(s) completed (context-level)")
+        return True
+
+    return False
@@ -0,0 +1,358 @@
+"""EventBus publishing helpers for the event loop.
+
+Thin wrappers around EventBus.emit_*() calls that check for bus existence
+before publishing.  Extracted to reduce noise in the main orchestrator.
+"""
+
+from __future__ import annotations
+
+import logging
+import time
+
+from framework.agent_loop.conversation import NodeConversation
+from framework.agent_loop.internals.types import HookContext
+from framework.host.event_bus import EventBus
+from framework.orchestrator.node import NodeContext
+
+logger = logging.getLogger(__name__)
+
+
+async def publish_loop_started(
+    event_bus: EventBus | None,
+    stream_id: str,
+    node_id: str,
+    max_iterations: int,
+    execution_id: str = "",
+) -> None:
+    if event_bus:
+        await event_bus.emit_node_loop_started(
+            stream_id=stream_id,
+            node_id=node_id,
+            max_iterations=max_iterations,
+            execution_id=execution_id,
+        )
+
+
+async def generate_action_plan(
+    event_bus: EventBus | None,
+    ctx: NodeContext,
+    stream_id: str,
+    node_id: str,
+    execution_id: str,
+) -> None:
+    """Generate a brief action plan via LLM and emit it as an SSE event.
+
+    Runs as a fire-and-forget task so it never blocks the main loop.
+    """
+    try:
+        system_prompt = ctx.agent_spec.system_prompt or ""
+        # Trim to keep the prompt small
+        prompt_summary = system_prompt[:500]
+        if len(system_prompt) > 500:
+            prompt_summary += "..."
+
+        tool_names = [t.name for t in ctx.available_tools]
+        output_keys = ctx.agent_spec.output_keys or []
+
+        prompt = (
+            f'You are about to work on a task as node "{node_id}".\n\n'
+            f"System prompt:\n{prompt_summary}\n\n"
+            f"Tools available: {tool_names}\n"
+            f"Required outputs: {output_keys}\n\n"
+            f"Write a brief action plan (2-5 bullet points) describing "
+            f"what you will do to complete this task. Be specific and concise.\n"
+            f"Return ONLY the plan text, no preamble."
+        )
+
+        response = await ctx.llm.acomplete(
+            messages=[{"role": "user", "content": prompt}],
+            max_tokens=1024,
+        )
+
+        plan = response.content.strip()
+        if plan and event_bus:
+            await event_bus.emit_node_action_plan(
+                stream_id=stream_id,
+                node_id=node_id,
+                plan=plan,
+                execution_id=execution_id,
+            )
+    except Exception as e:
+        logger.warning("Action plan generation failed for node '%s': %s", node_id, e)
+
+
+async def publish_iteration(
+    event_bus: EventBus | None,
+    stream_id: str,
+    node_id: str,
+    iteration: int,
+    execution_id: str = "",
+    extra_data: dict | None = None,
+) -> None:
+    if event_bus:
+        await event_bus.emit_node_loop_iteration(
+            stream_id=stream_id,
+            node_id=node_id,
+            iteration=iteration,
+            execution_id=execution_id,
+            extra_data=extra_data,
+        )
+
+
+async def publish_llm_turn_complete(
+    event_bus: EventBus | None,
+    stream_id: str,
+    node_id: str,
+    stop_reason: str,
+    model: str,
+    input_tokens: int,
+    output_tokens: int,
+    cached_tokens: int = 0,
+    execution_id: str = "",
+    iteration: int | None = None,
+) -> None:
+    if event_bus:
+        await event_bus.emit_llm_turn_complete(
+            stream_id=stream_id,
+            node_id=node_id,
+            stop_reason=stop_reason,
+            model=model,
+            input_tokens=input_tokens,
+            output_tokens=output_tokens,
+            cached_tokens=cached_tokens,
+            execution_id=execution_id,
+            iteration=iteration,
+        )
+
+
+def log_skip_judge(
+    ctx: NodeContext,
+    node_id: str,
+    iteration: int,
+    feedback: str,
+    tool_calls: list[dict],
+    llm_text: str,
+    turn_tokens: dict[str, int],
+    iter_start: float,
+) -> None:
+    """Log a CONTINUE step that skips judge evaluation (e.g., waiting for input)."""
+    if ctx.runtime_logger:
+        ctx.runtime_logger.log_step(
+            node_id=node_id,
+            node_type="event_loop",
+            step_index=iteration,
+            verdict="CONTINUE",
+            verdict_feedback=feedback,
+            tool_calls=tool_calls,
+            llm_text=llm_text,
+            input_tokens=turn_tokens.get("input", 0),
+            output_tokens=turn_tokens.get("output", 0),
+            latency_ms=int((time.time() - iter_start) * 1000),
+        )
+
+
+async def publish_loop_completed(
+    event_bus: EventBus | None,
+    stream_id: str,
+    node_id: str,
+    iterations: int,
+    execution_id: str = "",
+) -> None:
+    if event_bus:
+        await event_bus.emit_node_loop_completed(
+            stream_id=stream_id,
+            node_id=node_id,
+            iterations=iterations,
+            execution_id=execution_id,
+        )
+
+
+async def publish_context_usage(
+    event_bus: EventBus | None,
+    ctx: NodeContext,
+    conversation: NodeConversation,
+    trigger: str,
+) -> None:
+    """Emit a CONTEXT_USAGE_UPDATED event with current context window state."""
+    if not event_bus:
+        return
+
+    from framework.host.event_bus import AgentEvent, EventType
+
+    estimated = conversation.estimate_tokens()
+    max_tokens = conversation._max_context_tokens
+    ratio = estimated / max_tokens if max_tokens > 0 else 0.0
+    await event_bus.publish(
+        AgentEvent(
+            type=EventType.CONTEXT_USAGE_UPDATED,
+            stream_id=ctx.stream_id or ctx.agent_id,
+            node_id=ctx.agent_id,
+            data={
+                "usage_ratio": round(ratio, 4),
+                "usage_pct": round(ratio * 100),
+                "message_count": conversation.message_count,
+                "estimated_tokens": estimated,
+                "max_context_tokens": max_tokens,
+                "trigger": trigger,
+            },
+        )
+    )
+
+
+async def publish_stalled(
+    event_bus: EventBus | None,
+    stream_id: str,
+    node_id: str,
+    execution_id: str = "",
+) -> None:
+    if event_bus:
+        await event_bus.emit_node_stalled(
+            stream_id=stream_id,
+            node_id=node_id,
+            reason="Consecutive similar responses detected",
+            execution_id=execution_id,
+        )
+
+
+async def publish_text_delta(
+    event_bus: EventBus | None,
+    stream_id: str,
+    node_id: str,
+    content: str,
+    snapshot: str,
+    ctx: NodeContext,
+    execution_id: str = "",
+    iteration: int | None = None,
+    inner_turn: int = 0,
+) -> None:
+    if event_bus:
+        if ctx.emits_client_io:
+            await event_bus.emit_client_output_delta(
+                stream_id=stream_id,
+                node_id=node_id,
+                content=content,
+                snapshot=snapshot,
+                execution_id=execution_id,
+                iteration=iteration,
+                inner_turn=inner_turn,
+            )
+        else:
+            await event_bus.emit_llm_text_delta(
+                stream_id=stream_id,
+                node_id=node_id,
+                content=content,
+                snapshot=snapshot,
+                execution_id=execution_id,
+                inner_turn=inner_turn,
+            )
+
+
+async def publish_tool_started(
+    event_bus: EventBus | None,
+    stream_id: str,
+    node_id: str,
+    tool_use_id: str,
+    tool_name: str,
+    tool_input: dict,
+    execution_id: str = "",
+) -> None:
+    if event_bus:
+        await event_bus.emit_tool_call_started(
+            stream_id=stream_id,
+            node_id=node_id,
+            tool_use_id=tool_use_id,
+            tool_name=tool_name,
+            tool_input=tool_input,
+            execution_id=execution_id,
+        )
+
+
+async def publish_tool_completed(
+    event_bus: EventBus | None,
+    stream_id: str,
+    node_id: str,
+    tool_use_id: str,
+    tool_name: str,
+    result: str,
+    is_error: bool,
+    execution_id: str = "",
+) -> None:
+    if event_bus:
+        await event_bus.emit_tool_call_completed(
+            stream_id=stream_id,
+            node_id=node_id,
+            tool_use_id=tool_use_id,
+            tool_name=tool_name,
+            result=result,
+            is_error=is_error,
+            execution_id=execution_id,
+        )
+
+
+async def publish_judge_verdict(
+    event_bus: EventBus | None,
+    stream_id: str,
+    node_id: str,
+    action: str,
+    feedback: str = "",
+    judge_type: str = "implicit",
+    iteration: int = 0,
+    execution_id: str = "",
+) -> None:
+    if event_bus:
+        await event_bus.emit_judge_verdict(
+            stream_id=stream_id,
+            node_id=node_id,
+            action=action,
+            feedback=feedback,
+            judge_type=judge_type,
+            iteration=iteration,
+            execution_id=execution_id,
+        )
+
+
+async def publish_output_key_set(
+    event_bus: EventBus | None,
+    stream_id: str,
+    node_id: str,
+    key: str,
+    execution_id: str = "",
+) -> None:
+    if event_bus:
+        pass
+
+
+async def run_hooks(
+    hooks_config: dict[str, list],
+    event: str,
+    conversation: NodeConversation,
+    trigger: str | None = None,
+) -> None:
+    """Run all registered hooks for *event*, applying their results.
+
+    Each hook receives a HookContext and may return a HookResult that:
+    - replaces the system prompt (result.system_prompt)
+    - injects an extra user message (result.inject)
+    Hooks run in registration order; each sees the prompt as left by the
+    previous hook.
+    """
+    hook_list = hooks_config.get(event, [])
+    if not hook_list:
+        return
+    for hook in hook_list:
+        ctx = HookContext(
+            event=event,
+            trigger=trigger,
+            system_prompt=conversation.system_prompt,
+        )
+        try:
+            result = await hook(ctx)
+        except Exception:
+            logger.warning("Hook '%s' raised an exception", event, exc_info=True)
+            continue
+        if result is None:
+            continue
+        if result.system_prompt:
+            conversation.update_system_prompt(result.system_prompt)
+        if result.inject:
+            await conversation.add_user_message(result.inject)
@@ -0,0 +1,152 @@
+"""Judge evaluation pipeline for the event loop."""
+
+from __future__ import annotations
+
+import logging
+from collections.abc import Callable
+
+from framework.agent_loop.conversation import NodeConversation
+from framework.agent_loop.internals.types import JudgeProtocol, JudgeVerdict, OutputAccumulator
+from framework.orchestrator.node import NodeContext
+
+logger = logging.getLogger(__name__)
+
+
+class SubagentJudge:
+    """Judge for subagent execution."""
+
+    def __init__(self, task: str, max_iterations: int = 10):
+        self._task = task
+        self._max_iterations = max_iterations
+
+    async def evaluate(self, context: dict[str, object]) -> JudgeVerdict:
+        missing = context.get("missing_keys", [])
+        if not isinstance(missing, list) or not missing:
+            return JudgeVerdict(action="ACCEPT", feedback="")
+
+        iteration = context.get("iteration", 0)
+        if not isinstance(iteration, int):
+            iteration = 0
+        remaining = self._max_iterations - iteration - 1
+
+        if remaining <= 3:
+            urgency = (
+                f"URGENT: Only {remaining} iterations left. Stop all other work and call set_output NOW for: {missing}"
+            )
+        elif remaining <= self._max_iterations // 2:
+            urgency = f"WARNING: {remaining} iterations remaining. You must call set_output for: {missing}"
+        else:
+            urgency = f"Missing output keys: {missing}. Use set_output to provide them."
+
+        return JudgeVerdict(action="RETRY", feedback=f"Your task: {self._task}\n{urgency}")
+
+
+async def judge_turn(
+    *,
+    mark_complete_flag: bool,
+    judge: JudgeProtocol | None,
+    ctx: NodeContext,
+    conversation: NodeConversation,
+    accumulator: OutputAccumulator,
+    assistant_text: str,
+    tool_results: list[dict[str, object]],
+    iteration: int,
+    get_missing_output_keys_fn: Callable[
+        [OutputAccumulator, list[str] | None, list[str] | None],
+        list[str],
+    ],
+    max_context_tokens: int,
+) -> JudgeVerdict:
+    """Evaluate the current state using judge or implicit logic.
+
+    Evaluation levels (in order):
+      0. Short-circuits: mark_complete, skip_judge, tool-continue.
+      1. Custom judge (JudgeProtocol) — full authority when set.
+      2. Implicit judge — output-key check + optional conversation-aware
+         quality gate (when ``success_criteria`` is defined).
+
+    Returns a JudgeVerdict.  ``feedback=None`` means no real evaluation
+    happened (skip_judge, tool-continue); the caller must not inject a
+    feedback message.  Any non-None feedback (including ``""``) means a
+    real evaluation occurred and will be logged into the conversation.
+    """
+    # --- Level 0: short-circuits (no evaluation) -----------------------
+
+    if mark_complete_flag:
+        return JudgeVerdict(action="ACCEPT")
+
+    if ctx.agent_spec.skip_judge:
+        return JudgeVerdict(action="RETRY")  # feedback=None → not logged
+
+    # --- Level 1: custom judge -----------------------------------------
+
+    if judge is not None:
+        context = {
+            "assistant_text": assistant_text,
+            "tool_calls": tool_results,
+            "output_accumulator": accumulator.to_dict(),
+            "accumulator": accumulator,
+            "iteration": iteration,
+            "conversation_summary": conversation.export_summary(),
+            "output_keys": ctx.agent_spec.output_keys,
+            "missing_keys": get_missing_output_keys_fn(
+                accumulator, ctx.agent_spec.output_keys, ctx.agent_spec.nullable_output_keys
+            ),
+        }
+        verdict = await judge.evaluate(context)
+        # Ensure evaluated RETRY always carries feedback for logging.
+        if verdict.action == "RETRY" and not verdict.feedback:
+            return JudgeVerdict(action="RETRY", feedback="Custom judge returned RETRY.")
+        return verdict
+
+    # --- Level 2: implicit judge ---------------------------------------
+
+    # Real tool calls were made — let the agent keep working.
+    if tool_results:
+        return JudgeVerdict(action="RETRY")  # feedback=None → not logged
+
+    missing = get_missing_output_keys_fn(accumulator, ctx.agent_spec.output_keys, ctx.agent_spec.nullable_output_keys)
+
+    if missing:
+        return JudgeVerdict(
+            action="RETRY",
+            feedback=(
+                f"Task incomplete. Required outputs not yet produced: {missing}. "
+                f"Follow your system prompt instructions to complete the work."
+            ),
+        )
+
+    # All output keys present — run safety checks before accepting.
+
+    output_keys = ctx.agent_spec.output_keys or []
+    nullable_keys = set(ctx.agent_spec.nullable_output_keys or [])
+
+    # All-nullable with nothing set → node produced nothing useful.
+    all_nullable = output_keys and nullable_keys >= set(output_keys)
+    none_set = not any(accumulator.get(k) is not None for k in output_keys)
+    if all_nullable and none_set:
+        return JudgeVerdict(
+            action="RETRY",
+            feedback=(f"No output keys have been set yet. Use set_output to set at least one of: {output_keys}"),
+        )
+
+    # Level 2b: conversation-aware quality check (if success_criteria set)
+    if ctx.agent_spec.success_criteria and ctx.llm:
+        from framework.orchestrator.conversation_judge import evaluate_phase_completion
+
+        verdict = await evaluate_phase_completion(
+            llm=ctx.llm,
+            conversation=conversation,
+            phase_name=ctx.agent_spec.name,
+            phase_description=ctx.agent_spec.description,
+            success_criteria=ctx.agent_spec.success_criteria,
+            accumulator_state=accumulator.to_dict(),
+            max_context_tokens=max_context_tokens,
+        )
+        if verdict.action != "ACCEPT":
+            return JudgeVerdict(
+                action=verdict.action,
+                feedback=verdict.feedback or "Phase criteria not met.",
+            )
+
+    return JudgeVerdict(action="ACCEPT", feedback="")
@@ -0,0 +1,106 @@
+"""Stall and doom-loop detection for the event loop.
+
+Pure functions with no class dependencies — safe to call from any context.
+"""
+
+from __future__ import annotations
+
+import json
+
+
+def ngram_similarity(s1: str, s2: str, n: int = 2) -> float:
+    """Jaccard similarity of n-gram sets.
+
+    Returns 0.0-1.0, where 1.0 is exact match.
+    Fast: O(len(s) + len(s2)) using set operations.
+    """
+
+    def _ngrams(s: str) -> set[str]:
+        return {s[i : i + n] for i in range(len(s) - n + 1) if s.strip()}
+
+    if not s1 or not s2:
+        return 0.0
+
+    ngrams1, ngrams2 = _ngrams(s1.lower()), _ngrams(s2.lower())
+    if not ngrams1 or not ngrams2:
+        return 0.0
+
+    intersection = len(ngrams1 & ngrams2)
+    union = len(ngrams1 | ngrams2)
+    return intersection / union if union else 0.0
+
+
+def is_stalled(
+    recent_responses: list[str],
+    threshold: int,
+    similarity_threshold: float,
+) -> bool:
+    """Detect stall using n-gram similarity.
+
+    Detects when ALL N consecutive responses are mutually similar
+    (>= threshold).  A single dissimilar response resets the signal.
+    This catches phrases like "I'm still stuck" vs "I'm stuck"
+    without false-positives on "attempt 1" vs "attempt 2".
+    """
+    if len(recent_responses) < threshold:
+        return False
+    if not recent_responses[0]:
+        return False
+
+    # Every consecutive pair must be similar
+    for i in range(1, len(recent_responses)):
+        if ngram_similarity(recent_responses[i], recent_responses[i - 1]) < similarity_threshold:
+            return False
+    return True
+
+
+def fingerprint_tool_calls(
+    tool_results: list[dict],
+) -> list[tuple[str, str]]:
+    """Create deterministic fingerprints for a turn's tool calls.
+
+    Each fingerprint is (tool_name, canonical_args_json).  Order-sensitive
+    so [search("a"), fetch("b")] != [fetch("b"), search("a")].
+    """
+    fingerprints = []
+    for tr in tool_results:
+        name = tr.get("tool_name", "")
+        args = tr.get("tool_input", {})
+        try:
+            canonical = json.dumps(args, sort_keys=True, default=str)
+        except (TypeError, ValueError):
+            canonical = str(args)
+        fingerprints.append((name, canonical))
+    return fingerprints
+
+
+def is_tool_doom_loop(
+    recent_tool_fingerprints: list[list[tuple[str, str]]],
+    threshold: int,
+    enabled: bool = True,
+) -> tuple[bool, str]:
+    """Detect doom loop via exact fingerprint match.
+
+    Detects when N consecutive turns invoke the same tools with
+    identical (canonicalized) arguments.  Different arguments mean
+    different work, so only exact matches count.
+
+    Returns (is_doom_loop, description).
+    """
+    if not enabled:
+        return False, ""
+    if len(recent_tool_fingerprints) < threshold:
+        return False, ""
+    first = recent_tool_fingerprints[0]
+    if not first:
+        return False, ""
+
+    # All turns in the window must match the first exactly
+    if all(fp == first for fp in recent_tool_fingerprints[1:]):
+        tool_names = [name for name, _ in first]
+        desc = (
+            f"Doom loop detected: {len(recent_tool_fingerprints)} "
+            f"identical consecutive tool calls ({', '.join(tool_names)})"
+        )
+        return True, desc
+    return False, ""
@@ -0,0 +1,427 @@
+"""Synthetic tool builders for the event loop.
+
+Factory functions that create ``Tool`` definitions for framework-level
+synthetic tools (set_output, ask_user, escalate, delegate, report_to_parent).
+Also includes the ``handle_set_output`` validation logic.
+
+All functions are pure — they receive explicit parameters and return
+``Tool`` or ``ToolResult`` objects with no side effects.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from framework.llm.provider import Tool, ToolResult
+
+
+def sanitize_ask_user_inputs(
+    raw_question: Any,
+    raw_options: Any,
+) -> tuple[str, list[str] | None]:
+    """Self-heal a malformed ``ask_user`` tool call.
+
+    Some model families (notably when the system prompt teaches them
+    XML-ish scratchpad tags like ``<relationship>...</relationship>``)
+    carry that style into tool arguments and produce calls like::
+
+        ask_user({
+            "question": "What now?</question>\\n_OPTIONS: [\\"A\\", \\"B\\"]"
+        })
+
+    Symptoms:
+    - The chat UI renders ``</question>`` and ``_OPTIONS: [...]`` as
+      literal text in the question bubble.
+    - No buttons appear because the real ``options`` parameter is
+      empty.
+
+    This function:
+    - Strips leading/trailing whitespace.
+    - Removes a trailing ``</question>`` (with optional preceding
+      whitespace) from the question text.
+    - Detects an inline ``_OPTIONS:``, ``OPTIONS:``, or ``options:``
+      line followed by a JSON array, parses it, and returns the
+      recovered list as the second element.
+    - Removes the parsed line from the returned question text.
+
+    Returns ``(cleaned_question, recovered_options_or_None)``. The
+    caller should treat the recovered list as a fallback only when
+    the model did not also supply a real ``options`` array.
+    """
+    import json as _json
+    import re as _re
+
+    if raw_question is None:
+        return "", None
+    q = str(raw_question)
+
+    # Strip a stray </question> tag (case-insensitive, with optional
+    # preceding whitespace) anywhere in the string. This is the most
+    # common failure mode and never represents valid content.
+    q = _re.sub(r"\s*</\s*question\s*>\s*", "\n", q, flags=_re.IGNORECASE)
+
+    # Look for an inline options line. Match _OPTIONS, OPTIONS, options
+    # (with or without leading underscore), followed by ':' or '=', then
+    # a JSON array on the same line OR on the next line.
+    inline_options_re = _re.compile(
+        r"(?im)^\s*_?options\s*[:=]\s*(\[.*?\])\s*$",
+        _re.DOTALL,
+    )
+
+    recovered: list[str] | None = None
+    match = inline_options_re.search(q)
+    if match is not None:
+        try:
+            parsed = _json.loads(match.group(1))
+            if isinstance(parsed, list):
+                cleaned = [str(o).strip() for o in parsed if str(o).strip()]
+                if 1 <= len(cleaned) <= 8:
+                    recovered = cleaned
+        except (ValueError, TypeError):
+            pass
+        if recovered is not None:
+            # Remove the parsed line so it doesn't leak into the
+            # rendered question text.
+            q = inline_options_re.sub("", q, count=1)
+
+    # Strip any final whitespace / leftover blank lines from the
+    # question after removals.
+    q = _re.sub(r"\n{3,}", "\n\n", q).strip()
+
+    return q, recovered
+
+
+def build_ask_user_tool() -> Tool:
+    """Build the synthetic ask_user tool for explicit user-input requests.
+
+    The queen calls ask_user() when it needs to pause and wait
+    for user input.  Text-only turns WITHOUT ask_user flow through without
+    blocking, allowing progress updates and summaries to stream freely.
+    """
+    return Tool(
+        name="ask_user",
+        description=(
+            "You MUST call this tool whenever you need the user's response. "
+            "Always call it after greeting the user, asking a question, or "
+            "requesting approval. Do NOT call it for status updates or "
+            "summaries that don't require a response.\n\n"
+            "STRUCTURE RULES (CRITICAL):\n"
+            "- The 'question' field is PLAIN TEXT shown to the user. Do NOT "
+            "include XML tags, pseudo-tags like </question>, or option lists "
+            "in the question string. The UI does not parse them — they "
+            "render as raw text and look broken.\n"
+            "- The 'options' parameter is the ONLY way to render buttons. "
+            "If you want buttons, put them in the 'options' array, not in "
+            "the question string. Do NOT write 'OPTIONS: [...]', "
+            "'_options: [...]', or any inline list inside 'question'.\n"
+            "- The question text must read as a single clean prompt with "
+            "no markup. Example: 'What would you like to do?' — not "
+            "'What would you like to do?</question>'.\n\n"
+            "USAGE:\n"
+            "Always include 2-3 predefined options. The UI automatically "
+            "appends an 'Other' free-text input after your options, so NEVER "
+            "include catch-all options like 'Custom idea', 'Something else', "
+            "'Other', or 'None of the above' — the UI handles that. "
+            "When the question primarily needs a typed answer but you must "
+            "include options, make one option signal that typing is expected "
+            "(e.g. 'I\\'ll type my response'). This helps users discover the "
+            "free-text input. "
+            "The ONLY exception: omit options when the question demands a "
+            "free-form answer the user must type out (e.g. 'Describe your "
+            "agent idea', 'Paste the error message').\n\n"
+            "CORRECT EXAMPLE:\n"
+            '{"question": "What would you like to do?", "options": '
+            '["Build a new agent", "Modify existing agent", "Run tests"]}\n\n'
+            "FREE-FORM EXAMPLE:\n"
+            '{"question": "Describe the agent you want to build."}\n\n'
+            "WRONG (do NOT do this — buttons will not render):\n"
+            '{"question": "What now?</question>\\n_OPTIONS: [\\"A\\", \\"B\\"]"}'
+        ),
+        parameters={
+            "type": "object",
+            "properties": {
+                "question": {
+                    "type": "string",
+                    "description": "The question or prompt shown to the user.",
+                },
+                "options": {
+                    "type": "array",
+                    "items": {"type": "string"},
+                    "description": (
+                        "2-3 specific predefined choices. Include in most cases. "
+                        'Example: ["Option A", "Option B", "Option C"]. '
+                        "The UI always appends an 'Other' free-text input, so "
+                        "do NOT include catch-alls like 'Custom idea' or 'Other'. "
+                        "Omit ONLY when the user must type a free-form answer."
+                    ),
+                    "minItems": 2,
+                    "maxItems": 3,
+                },
+            },
+            "required": ["question"],
+        },
+    )
+
+
+def build_ask_user_multiple_tool() -> Tool:
+    """Build the synthetic ask_user_multiple tool for batched questions.
+
+    Queen-only tool that presents multiple questions at once so the user
+    can answer them all in a single interaction rather than one at a time.
+    """
+    return Tool(
+        name="ask_user_multiple",
+        description=(
+            "Ask the user multiple questions at once. Use this instead of "
+            "ask_user when you have 2 or more questions to ask in the same "
+            "turn — it lets the user answer everything in one go rather than "
+            "going back and forth. Each question can have its own predefined "
+            "options (2-3 choices) or be free-form. The UI renders all "
+            "questions together with a single Submit button. "
+            "ALWAYS prefer this over ask_user when you have multiple things "
+            "to clarify. "
+            "IMPORTANT: Do NOT repeat the questions in your text response — "
+            "the widget renders them. Keep your text to a brief intro only. "
+            '{"questions": ['
+            '  {"id": "scope", "prompt": "What scope?", "options": ["Full", "Partial"]},'
+            '  {"id": "format", "prompt": "Output format?", "options": ["PDF", "CSV", "JSON"]},'
+            '  {"id": "details", "prompt": "Any special requirements?"}'
+            "]}"
+        ),
+        parameters={
+            "type": "object",
+            "properties": {
+                "questions": {
+                    "type": "array",
+                    "items": {
+                        "type": "object",
+                        "properties": {
+                            "id": {
+                                "type": "string",
+                                "description": ("Short identifier for this question (used in the response)."),
+                            },
+                            "prompt": {
+                                "type": "string",
+                                "description": "The question text shown to the user.",
+                            },
+                            "options": {
+                                "type": "array",
+                                "items": {"type": "string"},
+                                "description": (
+                                    "2-3 predefined choices. The UI appends an "
+                                    "'Other' free-text input automatically. "
+                                    "Omit only when the user must type a free-form answer."
+                                ),
+                                "minItems": 2,
+                                "maxItems": 3,
+                            },
+                        },
+                        "required": ["id", "prompt"],
+                    },
+                    "minItems": 2,
+                    "maxItems": 8,
+                    "description": "List of questions to present to the user.",
+                },
+            },
+            "required": ["questions"],
+        },
+    )
+
+
+def build_set_output_tool(output_keys: list[str] | None) -> Tool | None:
+    """Build the synthetic set_output tool for explicit output declaration."""
+    if not output_keys:
+        return None
+    return Tool(
+        name="set_output",
+        description=(
+            "Set an output value for this node. Call once per output key. "
+            "Use this for brief notes, counts, status, and file references — "
+            "NOT for large data payloads. When a tool result was saved to a "
+            "data file, pass the filename as the value "
+            "(e.g. 'google_sheets_get_values_1.txt') so the next phase can "
+            "load the full data. Values exceeding ~2000 characters are "
+            "auto-saved to data files. "
+            f"Valid keys: {output_keys}"
+        ),
+        parameters={
+            "type": "object",
+            "properties": {
+                "key": {
+                    "type": "string",
+                    "description": f"Output key. Must be one of: {output_keys}",
+                    "enum": output_keys,
+                },
+                "value": {
+                    "type": "string",
+                    "description": ("The output value — a brief note, count, status, or data filename reference."),
+                },
+            },
+            "required": ["key", "value"],
+        },
+    )
+
+
+def build_escalate_tool() -> Tool:
+    """Build the synthetic escalate tool for worker -> queen handoff."""
+    return Tool(
+        name="escalate",
+        description=(
+            "Escalate to the queen when requesting user input, "
+            "blocked by errors, missing "
+            "credentials, or ambiguous constraints that require supervisor "
+            "guidance. Include a concise reason and optional context. "
+            "The node will pause until the queen injects guidance."
+        ),
+        parameters={
+            "type": "object",
+            "properties": {
+                "reason": {
+                    "type": "string",
+                    "description": ("Short reason for escalation (e.g. 'Tool repeatedly failing')."),
+                },
+                "context": {
+                    "type": "string",
+                    "description": "Optional diagnostic details for the queen.",
+                },
+            },
+            "required": ["reason"],
+        },
+    )
+
+
+def build_report_to_parent_tool() -> Tool:
+    """Build the synthetic ``report_to_parent`` tool.
+
+    Parallel workers (those spawned by the overseer via
+    ``run_parallel_workers``) call this to send a structured report back
+    to the overseer queen when they have finished their task. Calling
+    ``report_to_parent`` terminates the worker's loop cleanly -- do not
+    call other tools after it.
+
+    The overseer receives these as ``SUBAGENT_REPORT`` events and
+    aggregates them into a single summary for the user.
+    """
+    return Tool(
+        name="report_to_parent",
+        description=(
+            "Send a structured report back to the parent overseer and "
+            "terminate. Call this when you have finished your task "
+            "(success, partial, or failed) or cannot make further "
+            "progress. Your loop ends after this call -- do not call any "
+            "other tool afterwards. The overseer reads the summary + "
+            "data fields and aggregates them into a user-facing response."
+        ),
+        parameters={
+            "type": "object",
+            "properties": {
+                "status": {
+                    "type": "string",
+                    "enum": ["success", "partial", "failed"],
+                    "description": (
+                        "Overall outcome. 'success' = task complete. "
+                        "'partial' = some progress but incomplete. "
+                        "'failed' = could not make progress."
+                    ),
+                },
+                "summary": {
+                    "type": "string",
+                    "description": (
+                        "One-paragraph narrative for the overseer. What "
+                        "you did, what you found, and any notable issues."
+                    ),
+                },
+                "data": {
+                    "type": "object",
+                    "description": (
+                        "Optional structured payload (rows fetched, IDs "
+                        "processed, files written, etc.) that the "
+                        "overseer can merge into its final summary."
+                    ),
+                },
+            },
+            "required": ["status", "summary"],
+        },
+    )
+
+
+def handle_report_to_parent(tool_input: dict[str, Any]) -> ToolResult:
+    """Normalise + validate a ``report_to_parent`` tool call.
+
+    Returns a ``ToolResult`` with the acknowledgement text the LLM sees;
+    the side effects (record on Worker, emit SUBAGENT_REPORT, terminate
+    loop) are performed by ``AgentLoop`` after this helper returns.
+    """
+    status = str(tool_input.get("status", "success")).strip().lower()
+    if status not in ("success", "partial", "failed"):
+        status = "success"
+    summary = str(tool_input.get("summary", "")).strip()
+    if not summary:
+        summary = f"(worker returned {status} with no summary)"
+    data = tool_input.get("data") or {}
+    if not isinstance(data, dict):
+        data = {"value": data}
+    # Store the normalised payload back on the input dict so the caller
+    # can pick it up without re-parsing.
+    tool_input["_normalised"] = {
+        "status": status,
+        "summary": summary,
+        "data": data,
+    }
+    return ToolResult(
+        tool_use_id=tool_input.get("tool_use_id", ""),
+        content=(f"Report delivered to overseer (status={status}). This worker will terminate now."),
+    )
+
+
+def handle_set_output(
+    tool_input: dict[str, Any],
+    output_keys: list[str] | None,
+) -> ToolResult:
+    """Handle set_output tool call. Returns ToolResult (sync)."""
+    import logging
+    import re
+
+    logger = logging.getLogger(__name__)
+
+    key = tool_input.get("key", "")
+    value = tool_input.get("value", "")
+    valid_keys = output_keys or []
+
+    # Recover from truncated JSON (max_tokens hit mid-argument).
+    # The _raw key is set by litellm when json.loads fails.
+    if not key and "_raw" in tool_input:
+        raw = tool_input["_raw"]
+        key_match = re.search(r'"key"\s*:\s*"(\w+)"', raw)
+        if key_match:
+            key = key_match.group(1)
+        val_match = re.search(r'"value"\s*:\s*"', raw)
+        if val_match:
+            start = val_match.end()
+            value = raw[start:].rstrip()
+            for suffix in ('"}\n', '"}', '"'):
+                if value.endswith(suffix):
+                    value = value[: -len(suffix)]
+                    break
+        if key:
+            logger.warning(
+                "Recovered set_output args from truncated JSON: key=%s, value_len=%d",
+                key,
+                len(value),
+            )
+            # Re-inject so the caller sees proper key/value
+            tool_input["key"] = key
+            tool_input["value"] = value
+
+    if key not in valid_keys:
+        return ToolResult(
+            tool_use_id="",
+            content=f"Invalid output key '{key}'. Valid keys: {valid_keys}",
+            is_error=True,
+        )
+
+    return ToolResult(
+        tool_use_id="",
+        content=f"Output '{key}' set successfully.",
+        is_error=False,
+    )
@@ -0,0 +1,548 @@
+"""Tool result handling: truncation, spillover, JSON preview, and execution.
+
+Manages tool result size limits, file spillover for large results, and
+smart JSON previews.  Also includes transient error classification and
+the context-window-exceeded error detector.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import contextvars
+import json
+import logging
+import re
+from pathlib import Path
+from typing import Any
+
+from framework.llm.provider import ToolResult, ToolUse
+from framework.llm.stream_events import ToolCallEvent
+
+logger = logging.getLogger(__name__)
+
+# Pattern for detecting context-window-exceeded errors across LLM providers.
+_CONTEXT_TOO_LARGE_RE = re.compile(
+    r"context.{0,20}(length|window|limit|size)|"
+    r"too.{0,10}(long|large|many.{0,10}tokens)|"
+    r"(exceed|exceeds|exceeded).{0,30}(limit|window|context|tokens)|"
+    r"maximum.{0,20}token|prompt.{0,20}too.{0,10}long",
+    re.IGNORECASE,
+)
+
+
+def is_context_too_large_error(exc: BaseException) -> bool:
+    """Detect whether an exception indicates the LLM input was too large."""
+    cls = type(exc).__name__
+    if "ContextWindow" in cls:
+        return True
+    return bool(_CONTEXT_TOO_LARGE_RE.search(str(exc)))
+
+
+def is_transient_error(exc: BaseException) -> bool:
+    """Classify whether an exception is transient (retryable) vs permanent.
+
+    Transient: network errors, rate limits, server errors, timeouts.
+    Permanent: auth errors, bad requests, context window exceeded.
+    """
+    try:
+        from litellm.exceptions import (
+            APIConnectionError,
+            BadGatewayError,
+            InternalServerError,
+            RateLimitError,
+            ServiceUnavailableError,
+        )
+
+        transient_types: tuple[type[BaseException], ...] = (
+            RateLimitError,
+            APIConnectionError,
+            InternalServerError,
+            BadGatewayError,
+            ServiceUnavailableError,
+            TimeoutError,
+            ConnectionError,
+            OSError,
+        )
+    except ImportError:
+        transient_types = (TimeoutError, ConnectionError, OSError)
+
+    if isinstance(exc, transient_types):
+        return True
+
+    # RuntimeError from StreamErrorEvent with "Stream error:" prefix
+    if isinstance(exc, RuntimeError):
+        error_str = str(exc).lower()
+        transient_keywords = [
+            "rate limit",
+            "429",
+            "timeout",
+            "connection",
+            "internal server",
+            "502",
+            "503",
+            "504",
+            "service unavailable",
+            "bad gateway",
+            "overloaded",
+            "failed to parse tool call",
+        ]
+        return any(kw in error_str for kw in transient_keywords)
+
+    return False
+
+
+def extract_json_metadata(parsed: Any, *, _depth: int = 0, _max_depth: int = 3) -> str:
+    """Return a concise structural summary of parsed JSON.
+
+    Reports key names, value types, and — crucially — array lengths so
+    the LLM knows how much data exists beyond the preview.
+
+    Returns an empty string for simple scalars.
+    """
+    if _depth >= _max_depth:
+        if isinstance(parsed, dict):
+            return f"dict with {len(parsed)} keys"
+        if isinstance(parsed, list):
+            return f"list of {len(parsed)} items"
+        return type(parsed).__name__
+
+    if isinstance(parsed, dict):
+        if not parsed:
+            return "empty dict"
+        lines: list[str] = []
+        indent = "  " * (_depth + 1)
+        for key, value in list(parsed.items())[:20]:
+            if isinstance(value, list):
+                line = f'{indent}"{key}": list of {len(value)} items'
+                if value:
+                    first = value[0]
+                    if isinstance(first, dict):
+                        sample_keys = list(first.keys())[:10]
+                        line += f" (each item: dict with keys {sample_keys})"
+                    elif isinstance(first, list):
+                        line += f" (each item: list of {len(first)} elements)"
+                lines.append(line)
+            elif isinstance(value, dict):
+                child = extract_json_metadata(value, _depth=_depth + 1, _max_depth=_max_depth)
+                lines.append(f'{indent}"{key}": {child}')
+            else:
+                lines.append(f'{indent}"{key}": {type(value).__name__}')
+        if len(parsed) > 20:
+            lines.append(f"{indent}... and {len(parsed) - 20} more keys")
+        return "\n".join(lines)
+
+    if isinstance(parsed, list):
+        if not parsed:
+            return "empty list"
+        desc = f"list of {len(parsed)} items"
+        first = parsed[0]
+        if isinstance(first, dict):
+            sample_keys = list(first.keys())[:10]
+            desc += f" (each item: dict with keys {sample_keys})"
+        elif isinstance(first, list):
+            desc += f" (each item: list of {len(first)} elements)"
+        return desc
+
+    return ""
+
+
+def build_json_preview(parsed: Any, *, max_chars: int = 5000) -> str | None:
+    """Build a smart preview of parsed JSON, truncating large arrays.
+
+    Shows first 3 + last 1 items of large arrays with explicit count
+    markers so the LLM cannot mistake the preview for the full dataset.
+
+    Returns ``None`` if no truncation was needed (no large arrays).
+    """
+    _LARGE_ARRAY_THRESHOLD = 10
+
+    def _truncate_arrays(obj: Any) -> tuple[Any, bool]:
+        """Return (truncated_copy, was_truncated)."""
+        if isinstance(obj, list) and len(obj) > _LARGE_ARRAY_THRESHOLD:
+            n = len(obj)
+            head = obj[:3]
+            tail = obj[-1:]
+            marker = f"... ({n - 4} more items omitted, {n} total) ..."
+            return head + [marker] + tail, True
+        if isinstance(obj, dict):
+            changed = False
+            out: dict[str, Any] = {}
+            for k, v in obj.items():
+                new_v, did = _truncate_arrays(v)
+                out[k] = new_v
+                changed = changed or did
+            return (out, True) if changed else (obj, False)
+        return obj, False
+
+    preview_obj, was_truncated = _truncate_arrays(parsed)
+    if not was_truncated:
+        return None  # No large arrays — caller should use raw slicing
+
+    try:
+        result = json.dumps(preview_obj, indent=2, ensure_ascii=False)
+    except (TypeError, ValueError):
+        return None
+
+    if len(result) > max_chars:
+        # Even 3+1 items too big — try just 1 item
+        def _minimal_arrays(obj: Any) -> Any:
+            if isinstance(obj, list) and len(obj) > _LARGE_ARRAY_THRESHOLD:
+                n = len(obj)
+                return obj[:1] + [f"... ({n - 1} more items omitted, {n} total) ..."]
+            if isinstance(obj, dict):
+                return {k: _minimal_arrays(v) for k, v in obj.items()}
+            return obj
+
+        preview_obj = _minimal_arrays(parsed)
+        try:
+            result = json.dumps(preview_obj, indent=2, ensure_ascii=False)
+        except (TypeError, ValueError):
+            return None
+        if len(result) > max_chars:
+            result = result[:max_chars] + "…"
+
+    return result
+
+
+def truncate_tool_result(
+    result: ToolResult,
+    tool_name: str,
+    *,
+    max_tool_result_chars: int,
+    spillover_dir: str | None,
+    next_spill_filename_fn: Any,  # Callable[[str], str]
+) -> ToolResult:
+    """Persist tool result to file and optionally truncate for context.
+
+    When *spillover_dir* is configured, EVERY non-error tool result is
+    written to disk for debugging. The LLM-visible content is then
+    shaped to avoid a **poison pattern** that we traced on 2026-04-15
+    through a gemini-3.1-pro-preview-customtools queen session: the prior format
+    appended ``\\n\\n[Saved to '/abs/path/file.txt']`` after every
+    small result, and frontier pattern-matching models (gemini 3.x in
+    particular) learned to autocomplete the `[Saved to '...']` trailer
+    in their own assistant turns, eventually degenerating into echoing
+    the whole tool result instead of deciding what to do next. See
+    ``session_20260415_100751_d49f4c28/conversations/parts/0000000056.json``
+    for the terminal case where the model's "text" output was the full
+    tool_result JSON.
+
+    Rules after the fix:
+    - **Small results (≤ limit):** pass content through unchanged. No
+      trailer. No annotation. The full content is already in the
+      message; the disk copy is for debugging only.
+    - **Large results (> limit):** preview + file reference, but
+      formatted as plain prose instead of a bracketed ``[...]``
+      pattern. Structured JSON metadata ("_saved_to") is embedded
+      inside the JSON body when the preview is JSON-shaped so the
+      model can locate the full file without seeing a mimicry-prone
+      bracket token outside the body.
+    - **Errors:** pass through unchanged.
+    - **read_file results:** truncate with pagination hint (no re-spill).
+    """
+    limit = max_tool_result_chars
+
+    # Errors always pass through unchanged
+    if result.is_error:
+        return result
+
+    # read_file reads FROM spilled files — never re-spill (circular).
+    # Just truncate with a pagination hint if the result is too large.
+    if tool_name == "read_file":
+        if limit <= 0 or len(result.content) <= limit:
+            return result  # Small result — pass through as-is
+        # Large result — truncate with smart preview
+        PREVIEW_CAP = min(5000, max(limit - 500, limit // 2))
+
+        metadata_str = ""
+        smart_preview: str | None = None
+        try:
+            parsed_ld = json.loads(result.content)
+            metadata_str = extract_json_metadata(parsed_ld)
+            smart_preview = build_json_preview(parsed_ld, max_chars=PREVIEW_CAP)
+        except (json.JSONDecodeError, TypeError, ValueError):
+            pass
+
+        if smart_preview is not None:
+            preview_block = smart_preview
+        else:
+            preview_block = result.content[:PREVIEW_CAP] + "…"
+
+        # Prose header (no brackets).
+        header = (
+            f"Tool `{tool_name}` returned {len(result.content):,} characters "
+            f"(too large for context). Use offset_bytes / limit_bytes "
+            f"parameters to paginate smaller chunks."
+        )
+        if metadata_str:
+            header += f"\n\nData structure:\n{metadata_str}"
+        header += (
+            "\n\nWARNING: the preview below is a SAMPLE only — do NOT draw counts, totals, or conclusions from it."
+        )
+
+        truncated = f"{header}\n\nPreview (truncated):\n{preview_block}"
+        logger.info(
+            "%s result truncated: %d → %d chars (use offset/limit to paginate)",
+            tool_name,
+            len(result.content),
+            len(truncated),
+        )
+        return ToolResult(
+            tool_use_id=result.tool_use_id,
+            content=truncated,
+            is_error=False,
+            image_content=result.image_content,
+            is_skill_content=result.is_skill_content,
+        )
+
+    spill_dir = spillover_dir
+    if spill_dir:
+        spill_path = Path(spill_dir)
+        spill_path.mkdir(parents=True, exist_ok=True)
+        filename = next_spill_filename_fn(tool_name)
+
+        # Pretty-print JSON content so read_file's line-based
+        # pagination works correctly.
+        write_content = result.content
+        parsed_json: Any = None  # track for metadata extraction
+        try:
+            parsed_json = json.loads(result.content)
+            write_content = json.dumps(parsed_json, indent=2, ensure_ascii=False)
+        except (json.JSONDecodeError, TypeError, ValueError):
+            pass  # Not JSON — write as-is
+
+        file_path = spill_path / filename
+        file_path.write_text(write_content, encoding="utf-8")
+        # Use absolute path so parent agents can find files from subagents
+        abs_path = str(file_path.resolve())
+
+        if limit > 0 and len(result.content) > limit:
+            # Large result: build a small, metadata-rich preview so the
+            # LLM cannot mistake it for the complete dataset. The
+            # preview is introduced as plain prose (no bracketed
+            # ``[Result from …]`` token) so it doesn't prime the model
+            # to autocomplete the same pattern in its next turn.
+            PREVIEW_CAP = 5000
+
+            # Extract structural metadata (array lengths, key names)
+            metadata_str = ""
+            smart_preview: str | None = None
+            if parsed_json is not None:
+                metadata_str = extract_json_metadata(parsed_json)
+                smart_preview = build_json_preview(parsed_json, max_chars=PREVIEW_CAP)
+
+            if smart_preview is not None:
+                preview_block = smart_preview
+            else:
+                preview_block = result.content[:PREVIEW_CAP] + "…"
+
+            # Prose header (no brackets). Absolute path still surfaced
+            # so the agent can read the full file, but it's framed as
+            # a sentence, not a bracketed trailer.
+            header = (
+                f"Tool `{tool_name}` returned {len(result.content):,} characters "
+                f"(too large for context). Full result saved at: {abs_path}\n"
+                f"Read the complete data with read_file(path='{abs_path}').\n"
+            )
+            if metadata_str:
+                header += f"\nData structure:\n{metadata_str}\n"
+            header += (
+                "\nWARNING: the preview below is a SAMPLE only — do NOT draw counts, totals, or conclusions from it."
+            )
+
+            content = f"{header}\n\nPreview (truncated):\n{preview_block}"
+            logger.info(
+                "Tool result spilled to file: %s (%d chars → %s)",
+                tool_name,
+                len(result.content),
+                abs_path,
+            )
+        else:
+            # Small result: pass content through UNCHANGED.
+            #
+            # The prior design appended `\n\n[Saved to '/abs/path']`
+            # after every small result so the agent could re-read the
+            # file later. But (a) the full content is already in the
+            # message, so there's nothing to re-read; (b) the
+            # `[Saved to '…']` trailer is a repeating token pattern
+            # that frontier pattern-matching models autocomplete into
+            # their own assistant turns, eventually echoing whole tool
+            # results as "text" instead of making decisions. Dropping
+            # the trailer entirely kills the poison pattern. Spilled
+            # files on disk still exist for debugging — they just
+            # aren't advertised in the LLM-visible message.
+            content = result.content
+            logger.info(
+                "Tool result saved to file: %s (%d chars → %s, no trailer)",
+                tool_name,
+                len(result.content),
+                filename,
+            )
+
+        return ToolResult(
+            tool_use_id=result.tool_use_id,
+            content=content,
+            is_error=False,
+            image_content=result.image_content,
+            is_skill_content=result.is_skill_content,
+        )
+
+    # No spillover_dir — truncate in-place if needed
+    if limit > 0 and len(result.content) > limit:
+        PREVIEW_CAP = min(5000, max(limit - 500, limit // 2))
+
+        metadata_str = ""
+        smart_preview: str | None = None
+        try:
+            parsed_inline = json.loads(result.content)
+            metadata_str = extract_json_metadata(parsed_inline)
+            smart_preview = build_json_preview(parsed_inline, max_chars=PREVIEW_CAP)
+        except (json.JSONDecodeError, TypeError, ValueError):
+            pass
+
+        if smart_preview is not None:
+            preview_block = smart_preview
+        else:
+            preview_block = result.content[:PREVIEW_CAP] + "…"
+
+        # Prose header (no brackets) — see docstring for the poison
+        # pattern that the bracket format triggered.
+        header = (
+            f"Tool `{tool_name}` returned {len(result.content):,} characters "
+            f"(truncated to fit context budget — no spillover dir configured)."
+        )
+        if metadata_str:
+            header += f"\n\nData structure:\n{metadata_str}"
+        header += (
+            "\n\nWARNING: the preview below is a SAMPLE only — do NOT draw counts, totals, or conclusions from it."
+        )
+
+        truncated = f"{header}\n\n{preview_block}"
+        logger.info(
+            "Tool result truncated in-place: %s (%d → %d chars)",
+            tool_name,
+            len(result.content),
+            len(truncated),
+        )
+        return ToolResult(
+            tool_use_id=result.tool_use_id,
+            content=truncated,
+            is_error=False,
+            image_content=result.image_content,
+            is_skill_content=result.is_skill_content,
+        )
+
+    return result
+
+
+async def execute_tool(
+    tool_executor: Any,  # Callable[[ToolUse], ToolResult | Awaitable[ToolResult]] | None
+    tc: ToolCallEvent,
+    timeout: float,
+    skill_dirs: list[str] | None = None,
+) -> ToolResult:
+    """Execute a tool call, handling both sync and async executors.
+
+    Applies ``tool_call_timeout_seconds`` to prevent hung MCP servers
+    from blocking the event loop indefinitely.  The initial executor
+    call is offloaded to a thread pool so that sync executors don't
+    freeze the event loop.
+    """
+    if tool_executor is None:
+        return ToolResult(
+            tool_use_id=tc.tool_use_id,
+            content=f"No tool executor configured for '{tc.tool_name}'",
+            is_error=True,
+        )
+
+    skill_dirs = skill_dirs or []
+    skill_read_tools = {"view_file", "read_file"}
+    if tc.tool_name in skill_read_tools and skill_dirs:
+        raw_path = tc.tool_input.get("path", "")
+        if raw_path:
+            resolved = Path(raw_path).resolve(strict=False)
+            resolved_roots = [Path(skill_dir).resolve(strict=False) for skill_dir in skill_dirs]
+            if any(resolved.is_relative_to(root) for root in resolved_roots):
+                try:
+                    content = resolved.read_text(encoding="utf-8")
+                except Exception as exc:
+                    return ToolResult(
+                        tool_use_id=tc.tool_use_id,
+                        content=f"Could not read skill resource '{raw_path}': {exc}",
+                        is_error=True,
+                    )
+                return ToolResult(
+                    tool_use_id=tc.tool_use_id,
+                    content=content,
+                    is_skill_content=resolved.name == "SKILL.md",
+                )
+
+    tool_use = ToolUse(id=tc.tool_use_id, name=tc.tool_name, input=tc.tool_input)
+
+    async def _run() -> ToolResult:
+        # Offload the executor call to a thread.  Sync MCP executors
+        # block on future.result() — running in a thread keeps the
+        # event loop free so asyncio.wait_for can fire the timeout.
+        # Copy the current context so contextvars (e.g. data_dir from
+        # execution context) propagate into the worker thread.
+        loop = asyncio.get_running_loop()
+        ctx = contextvars.copy_context()
+        result = await loop.run_in_executor(None, ctx.run, tool_executor, tool_use)
+        # Async executors return a coroutine — await it on the loop
+        if asyncio.iscoroutine(result) or asyncio.isfuture(result):
+            result = await result
+        return result
+
+    try:
+        if timeout > 0:
+            result = await asyncio.wait_for(_run(), timeout=timeout)
+        else:
+            result = await _run()
+    except TimeoutError:
+        logger.warning("Tool '%s' timed out after %.0fs", tc.tool_name, timeout)
+        # asyncio.wait_for cancels the awaiting coroutine, but the sync
+        # executor running inside run_in_executor keeps going — and so
+        # does any MCP subprocess it is blocked on. Reach through to the
+        # owning MCPClient and force-disconnect it so the subprocess is
+        # torn down. Next call_tool triggers a reconnect. Without this
+        # the executor thread and MCP child leak on every timeout.
+        kill_for_tool = getattr(tool_executor, "kill_for_tool", None)
+        if callable(kill_for_tool):
+            try:
+                await asyncio.to_thread(kill_for_tool, tc.tool_name)
+            except Exception as exc:  # defensive — never let cleanup crash the loop
+                logger.warning(
+                    "kill_for_tool('%s') raised during timeout handling: %s",
+                    tc.tool_name,
+                    exc,
+                )
+        return ToolResult(
+            tool_use_id=tc.tool_use_id,
+            content=(
+                f"Tool '{tc.tool_name}' timed out after {timeout:.0f}s. "
+                "The operation took too long and was cancelled. "
+                "Try a simpler request or a different approach."
+            ),
+            is_error=True,
+        )
+    return result
+
+
+def restore_spill_counter(spillover_dir: str | None) -> int:
+    """Scan spillover_dir for existing spill files and return the max counter.
+
+    Returns the highest spill number found (or 0 if none).
+    """
+    if not spillover_dir:
+        return 0
+    spill_path = Path(spillover_dir)
+    if not spill_path.is_dir():
+        return 0
+    max_n = 0
+    for f in spill_path.iterdir():
+        if not f.is_file():
+            continue
+        m = re.search(r"_(\d+)\.txt$", f.name)
+        if m:
+            max_n = max(max_n, int(m.group(1)))
+    return max_n
@@ -0,0 +1,309 @@
+"""Shared types and state containers for the event loop package."""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+import time
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Literal, Protocol, runtime_checkable
+
+from framework.agent_loop.conversation import (
+    ConversationStore,
+)
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class TriggerEvent:
+    """A framework-level trigger signal (timer tick or webhook hit)."""
+
+    trigger_type: str
+    source_id: str
+    payload: dict[str, Any] = field(default_factory=dict)
+    timestamp: float = field(default_factory=time.time)
+
+
+@dataclass
+class JudgeVerdict:
+    """Result of judge evaluation for the event loop."""
+
+    action: Literal["ACCEPT", "RETRY", "ESCALATE"]
+    # None  = no evaluation happened (skip_judge, tool-continue); not logged.
+    # ""    = evaluated but no feedback; logged with default text.
+    # "..." = evaluated with feedback; logged as-is.
+    feedback: str | None = None
+
+
+@runtime_checkable
+class JudgeProtocol(Protocol):
+    """Protocol for event-loop judges."""
+
+    async def evaluate(self, context: dict[str, Any]) -> JudgeVerdict: ...
+
+
+@dataclass
+class LoopConfig:
+    """Configuration for the event loop."""
+
+    max_iterations: int = 50
+    # 0 (or any non-positive value) disables the per-turn hard limit,
+    # letting a single assistant turn fan out arbitrarily many tool
+    # calls. Models like Gemini 3.1 Pro routinely emit 40-80 tool
+    # calls in one turn during browser exploration; capping them
+    # strands work half-finished and makes the next turn repeat the
+    # discarded calls, which is worse than just running them.
+    max_tool_calls_per_turn: int = 0
+    judge_every_n_turns: int = 1
+    stall_detection_threshold: int = 3
+    stall_similarity_threshold: float = 0.85
+    max_context_tokens: int = 32_000
+    # Headroom reserved for the NEXT turn's input + output so that
+    # proactive compaction always finishes before the hard context limit
+    # is hit mid-stream. Scaled to match Claude Code's 13k-buffer-on-
+    # 200k-window ratio (~6.5%) applied to hive's default 32k window,
+    # with extra margin because hive's token estimator is char-based
+    # and less tight than Anthropic's own counting. Override via
+    # LoopConfig for larger windows.
+    compaction_buffer_tokens: int = 8_000
+    # Warning is emitted one buffer earlier so the user/telemetry gets
+    # a "we're close" signal without triggering a compaction pass.
+    compaction_warning_buffer_tokens: int = 12_000
+    store_prefix: str = ""
+
+    # Overflow margin for max_tool_calls_per_turn. When the limit is
+    # enabled (>0), tool calls are only discarded when the count
+    # exceeds max_tool_calls_per_turn * (1 + margin). Ignored when
+    # max_tool_calls_per_turn is 0.
+    tool_call_overflow_margin: float = 0.5
+
+    # Tool result context management.
+    max_tool_result_chars: int = 30_000
+    spillover_dir: str | None = None
+
+    # Image retention in conversation history.
+    # Screenshots from ``browser_screenshot`` are inlined as base64
+    # data URLs inside message ``image_content``. Each full-page
+    # screenshot costs ~250k tokens when the provider counts the
+    # base64 as text (gemini, most non-Anthropic providers). Four
+    # screenshots in one conversation push gemini's 1M context over
+    # the limit and the model starts emitting garbage.
+    #
+    # The framework strips image_content from older messages after
+    # every tool-result batch, keeping only the most recent N
+    # screenshots. The text metadata on evicted messages (url, size,
+    # scale hints) is preserved so the agent can still reason about
+    # "I took a screenshot at step N that showed the compose modal".
+    # Raise this only if you genuinely need longer visual history AND
+    # you know your provider is using native image tokenization.
+    max_retained_screenshots: int = 2
+
+    # set_output value spilling.
+    max_output_value_chars: int = 2_000
+
+    # Stream retry.
+    max_stream_retries: int = 5
+    stream_retry_backoff_base: float = 2.0
+    stream_retry_max_delay: float = 60.0
+    # Persistent retry for capacity-class errors (429, 529, overloaded).
+    # Unlike the bounded retry above, these keep trying until the wall-clock
+    # budget below is exhausted — modelled after claude-code's withRetry.
+    # The loop still publishes a retry event each attempt so the UI can
+    # see progress. Set to 0 to disable and fall back to bounded retry.
+    capacity_retry_max_seconds: float = 600.0
+    capacity_retry_max_delay: float = 60.0
+
+    # Tool doom loop detection.
+    tool_doom_loop_threshold: int = 3
+
+    # Client-facing auto-block grace period.
+    cf_grace_turns: int = 1
+    # Worker auto-escalation: text-only turns before escalating to queen.
+    worker_escalation_grace_turns: int = 1
+    tool_doom_loop_enabled: bool = True
+    # Silent worker: consecutive tool-only turns (no user-facing text)
+    # before injecting a nudge to communicate progress.
+    silent_tool_streak_threshold: int = 5
+
+    # Per-tool-call timeout.
+    tool_call_timeout_seconds: float = 60.0
+
+    # LLM stream inactivity watchdog. Split into two budgets so legitimate
+    # slow TTFT on large contexts doesn't get mistaken for a dead connection.
+    # - ttft: stream open -> first event. Large-context local models can
+    #   legitimately take minutes before the first token arrives.
+    # - inter_event: last event -> now, ONLY after the first event. A stream
+    #   that started producing and then went silent is a real stall.
+    # Whichever fires first cancels the stream. Set to 0 to disable that
+    # individual budget; set both to 0 to fully disable the watchdog.
+    llm_stream_ttft_timeout_seconds: float = 600.0
+    llm_stream_inter_event_idle_seconds: float = 120.0
+    # Deprecated alias — kept so existing configs keep working. If set to a
+    # non-default value it overrides inter_event_idle (historical behavior).
+    llm_stream_inactivity_timeout_seconds: float = 120.0
+
+    # Continue-nudge recovery. When the idle watchdog fires on a live but
+    # stuck stream, cancel the stream and append a short continuation
+    # hint to the conversation instead of raising a ConnectionError and
+    # re-running the whole turn. Preserves any partial text/tool-calls the
+    # stream emitted before the stall.
+    continue_nudge_enabled: bool = True
+    # Cap so a truly dead endpoint eventually falls back to the error path
+    # instead of nudging forever.
+    continue_nudge_max_per_turn: int = 3
+
+    # Tool-call replay detector. When the model emits a tool call whose
+    # (name + canonical-args) matches a prior successful call in the last
+    # K assistant turns, emit telemetry and prepend a short steer onto the
+    # tool result — but still execute. Weaker models legitimately repeat
+    # read-only calls (screenshot, evaluate), so silent skipping would
+    # cause surprising behavior.
+    replay_detector_enabled: bool = True
+    replay_detector_within_last_turns: int = 3
+
+    # Subagent delegation timeout (wall-clock max).
+    subagent_timeout_seconds: float = 3600.0
+
+    # Subagent inactivity timeout - only timeout if no activity for this duration.
+    # This resets whenever the subagent makes progress (tool calls, LLM responses).
+    # Set to 0 to use only the wall-clock timeout.
+    subagent_inactivity_timeout_seconds: float = 300.0
+
+    # Lifecycle hooks.
+    hooks: dict[str, list] | None = None
+
+    def __post_init__(self) -> None:
+        if self.hooks is None:
+            object.__setattr__(self, "hooks", {})
+
+
+@dataclass
+class HookContext:
+    """Context passed to every lifecycle hook."""
+
+    event: str
+    trigger: str | None
+    system_prompt: str
+
+
+@dataclass
+class HookResult:
+    """What a hook may return to modify node state."""
+
+    system_prompt: str | None = None
+    inject: str | None = None
+
+
+@dataclass
+class OutputAccumulator:
+    """Accumulates output key-value pairs with optional write-through persistence."""
+
+    values: dict[str, Any] = field(default_factory=dict)
+    store: ConversationStore | None = None
+    spillover_dir: str | None = None
+    max_value_chars: int = 0
+    run_id: str | None = None
+
+    async def set(self, key: str, value: Any) -> None:
+        """Set a key-value pair, auto-spilling large values to files."""
+        value = await self._auto_spill(key, value)
+        self.values[key] = value
+        if self.store:
+            cursor = await self.store.read_cursor() or {}
+            outputs = cursor.get("outputs", {})
+            outputs[key] = value
+            cursor["outputs"] = outputs
+            await self.store.write_cursor(cursor)
+
+    async def _auto_spill(self, key: str, value: Any) -> Any:
+        """Save large values to a file and return a reference string.
+
+        Runs the JSON serialization and file write on a worker thread
+        so they don't block the asyncio event loop. For a 100k-char
+        dict this used to freeze every concurrent tool call for ~50ms
+        of ``json.dumps(indent=2)`` + a sync disk write; for bigger
+        payloads or slow storage (NFS, networked FS) the freeze was
+        proportionally worse.
+        """
+        if self.max_value_chars <= 0 or not self.spillover_dir:
+            return value
+
+        # Cheap size probe first — if the value is already a short
+        # string we can skip both the JSON round-trip and the thread
+        # hop entirely.
+        if isinstance(value, str) and len(value) <= self.max_value_chars:
+            return value
+
+        def _spill_sync() -> Any:
+            # JSON serialization for size check (only for non-strings).
+            if isinstance(value, str):
+                val_str = value
+            else:
+                val_str = json.dumps(value, ensure_ascii=False)
+            if len(val_str) <= self.max_value_chars:
+                return value
+
+            spill_path = Path(self.spillover_dir)
+            spill_path.mkdir(parents=True, exist_ok=True)
+            ext = ".json" if isinstance(value, (dict, list)) else ".txt"
+            filename = f"output_{key}{ext}"
+            write_content = (
+                json.dumps(value, indent=2, ensure_ascii=False) if isinstance(value, (dict, list)) else str(value)
+            )
+            file_path = spill_path / filename
+            file_path.write_text(write_content, encoding="utf-8")
+            file_size = file_path.stat().st_size
+            logger.info(
+                "set_output value auto-spilled: key=%s, %d chars -> %s (%d bytes)",
+                key,
+                len(val_str),
+                filename,
+                file_size,
+            )
+            # Use absolute path so parent agents can find files from subagents.
+            #
+            # Prose format (no brackets) — same fix as tool_result_handler:
+            # frontier pattern-matching models autocomplete bracketed
+            # `[Saved to '...']` trailers into their own assistant turns,
+            # eventually degenerating into echoing the file path as text.
+            # Keep the path accessible but frame it as plain prose.
+            abs_path = str(file_path.resolve())
+            return (
+                f"Output saved at: {abs_path} ({file_size:,} bytes). "
+                f"Read the full data with read_file(path='{abs_path}')."
+            )
+
+        return await asyncio.to_thread(_spill_sync)
+
+    def get(self, key: str) -> Any | None:
+        return self.values.get(key)
+
+    def to_dict(self) -> dict[str, Any]:
+        return dict(self.values)
+
+    def has_all_keys(self, required: list[str]) -> bool:
+        return all(key in self.values and self.values[key] is not None for key in required)
+
+    @classmethod
+    async def restore(
+        cls,
+        store: ConversationStore,
+        run_id: str | None = None,
+    ) -> OutputAccumulator:
+        cursor = await store.read_cursor()
+        values = cursor.get("outputs", {}) if cursor else {}
+        return cls(values=values, store=store, run_id=run_id)
+
+
+__all__ = [
+    "HookContext",
+    "HookResult",
+    "JudgeProtocol",
+    "JudgeVerdict",
+    "LoopConfig",
+    "OutputAccumulator",
+    "TriggerEvent",
+]
@@ -0,0 +1,98 @@
+"""Prompt composition for agent loops.
+
+Builds canonical system prompts from AgentContext fields.
+Extracted from the former orchestrator/prompting module.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from datetime import datetime
+from typing import Any
+
+
+@dataclass(frozen=True)
+class PromptSpec:
+    identity_prompt: str = ""
+    focus_prompt: str = ""
+    narrative: str = ""
+    accounts_prompt: str = ""
+    skills_catalog_prompt: str = ""
+    protocols_prompt: str = ""
+    memory_prompt: str = ""
+    agent_type: str = "event_loop"
+    output_keys: tuple[str, ...] = ()
+
+
+def stamp_prompt_datetime(prompt: str) -> str:
+    local = datetime.now().astimezone()
+    stamp = f"Current date and time: {local.strftime('%Y-%m-%d %H:%M %Z (UTC%z)')}"
+    return f"{prompt}\n\n{stamp}" if prompt else stamp
+
+
+def build_prompt_spec(
+    ctx: Any,
+    *,
+    focus_prompt: str | None = None,
+    narrative: str | None = None,
+    memory_prompt: str | None = None,
+) -> PromptSpec:
+    from framework.skills.tool_gating import augment_catalog_for_tools
+
+    resolved_memory = memory_prompt
+    if resolved_memory is None:
+        resolved_memory = getattr(ctx, "memory_prompt", "") or ""
+        dynamic = getattr(ctx, "dynamic_memory_provider", None)
+        if dynamic is not None:
+            try:
+                resolved_memory = dynamic() or ""
+            except Exception:
+                resolved_memory = getattr(ctx, "memory_prompt", "") or ""
+
+    # Tool-gated pre-activation: inject full body of default skills whose
+    # trigger tools are present in this agent's tool list (e.g. browser_*
+    # pulls in hive.browser-automation). Keeps non-browser agents lean.
+    tool_names = [getattr(t, "name", "") for t in (getattr(ctx, "available_tools", None) or [])]
+    skills_catalog_prompt = augment_catalog_for_tools(ctx.skills_catalog_prompt or "", tool_names)
+
+    return PromptSpec(
+        identity_prompt=ctx.identity_prompt or "",
+        focus_prompt=focus_prompt if focus_prompt is not None else (ctx.agent_spec.system_prompt or ""),
+        narrative=narrative if narrative is not None else (ctx.narrative or ""),
+        accounts_prompt=ctx.accounts_prompt or "",
+        skills_catalog_prompt=skills_catalog_prompt,
+        protocols_prompt=ctx.protocols_prompt or "",
+        memory_prompt=resolved_memory,
+        agent_type=ctx.agent_spec.agent_type,
+        output_keys=tuple(ctx.agent_spec.output_keys or ()),
+    )
+
+
+def build_system_prompt(spec: PromptSpec) -> str:
+    parts: list[str] = []
+    if spec.identity_prompt:
+        parts.append(spec.identity_prompt)
+    if spec.accounts_prompt:
+        parts.append(f"\n{spec.accounts_prompt}")
+    if spec.skills_catalog_prompt:
+        parts.append(f"\n{spec.skills_catalog_prompt}")
+    if spec.protocols_prompt:
+        parts.append(f"\n{spec.protocols_prompt}")
+    if spec.memory_prompt:
+        parts.append(f"\n{spec.memory_prompt}")
+    if spec.focus_prompt:
+        parts.append(f"\n{spec.focus_prompt}")
+    if spec.narrative:
+        parts.append(f"\n{spec.narrative}")
+    return "\n".join(parts)
+
+
+def build_system_prompt_for_context(
+    ctx: Any,
+    *,
+    focus_prompt: str | None = None,
+    narrative: str | None = None,
+    memory_prompt: str | None = None,
+) -> str:
+    spec = build_prompt_spec(ctx, focus_prompt=focus_prompt, narrative=narrative, memory_prompt=memory_prompt)
+    return build_system_prompt(spec)
@@ -0,0 +1,264 @@
+"""Core types for the agent loop — the execution primitive of the colony.
+
+AgentSpec:    Declarative definition of what an agent does.
+AgentContext: Everything an agent loop needs to execute.
+AgentResult:  What comes out of an agent loop execution.
+AgentProtocol: Interface that all agent implementations must satisfy.
+"""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from typing import Any
+
+from pydantic import BaseModel, Field
+
+from framework.llm.provider import LLMProvider, Tool
+from framework.tracker.decision_tracker import DecisionTracker
+
+
+class AgentSpec(BaseModel):
+    """Declarative definition of an agent's capabilities and configuration.
+
+    This is the blueprint from which AgentLoop instances are created.
+    Workers in a colony are exact copies of the queen's AgentSpec.
+    """
+
+    id: str
+    name: str
+    description: str
+
+    agent_type: str = Field(
+        default="event_loop",
+        description="Type: 'event_loop' (recommended), 'gcu' (browser automation).",
+    )
+
+    input_keys: list[str] = Field(
+        default_factory=list,
+        description="Keys this agent reads from input data",
+    )
+    output_keys: list[str] = Field(
+        default_factory=list,
+        description="Keys this agent produces as output",
+    )
+    nullable_output_keys: list[str] = Field(
+        default_factory=list,
+        description="Output keys that can be None without triggering validation errors",
+    )
+
+    input_schema: dict[str, dict] = Field(
+        default_factory=dict,
+        description="Optional schema for input validation.",
+    )
+    output_schema: dict[str, dict] = Field(
+        default_factory=dict,
+        description="Optional schema for output validation.",
+    )
+
+    system_prompt: str | None = Field(default=None, description="System prompt for the LLM")
+    tools: list[str] = Field(default_factory=list, description="Tool names this agent can use")
+    tool_access_policy: str = Field(
+        default="explicit",
+        description=(
+            "'all' = all tools from registry, "
+            "'explicit' = only tools listed in `tools` (default), "
+            "'none' = no tools at all."
+        ),
+    )
+    model: str | None = Field(default=None, description="Specific model override")
+
+    function: str | None = Field(default=None, description="Function name or path")
+    routes: dict[str, str] = Field(default_factory=dict, description="Condition -> target mapping")
+
+    max_retries: int = Field(default=3)
+    retry_on: list[str] = Field(default_factory=list, description="Error types to retry on")
+
+    max_visits: int = Field(
+        default=0,
+        description=("Max times this agent executes in one colony run. 0 = unlimited. Set >1 for one-shot agents."),
+    )
+
+    output_model: type[BaseModel] | None = Field(
+        default=None,
+        description="Optional Pydantic model for validating LLM output.",
+    )
+    max_validation_retries: int = Field(
+        default=2,
+        description="Maximum retries when Pydantic validation fails",
+    )
+
+    client_facing: bool = Field(
+        default=False,
+        description="Deprecated — the queen is intrinsically interactive.",
+    )
+
+    success_criteria: str | None = Field(
+        default=None,
+        description="Natural-language criteria for phase completion.",
+    )
+
+    skip_judge: bool = Field(
+        default=False,
+        description="When True, the implicit judge is bypassed entirely.",
+    )
+
+    model_config = {"extra": "allow", "arbitrary_types_allowed": True}
+
+    def is_queen(self) -> bool:
+        return self.id == "queen"
+
+    def supports_direct_user_io(self) -> bool:
+        return self.is_queen()
+
+
+def deprecated_client_facing_warning(spec: AgentSpec) -> str | None:
+    if spec.client_facing and not spec.is_queen():
+        return (
+            f"Agent '{spec.id}' sets deprecated client_facing=True. "
+            "Non-queen direct human I/O is no longer supported; route worker "
+            "questions and approvals through queen escalation instead."
+        )
+    return None
+
+
+def warn_if_deprecated_client_facing(spec: AgentSpec) -> None:
+    import logging
+
+    warning = deprecated_client_facing_warning(spec)
+    if warning:
+        logging.getLogger(__name__).warning(warning)
+
+
+@dataclass
+class AgentContext:
+    """Everything an agent loop needs to execute.
+
+    Passed to every agent implementation and provides:
+    - Runtime (for decision logging)
+    - LLM access
+    - Tools
+    - Goal context
+    - Execution metadata
+    """
+
+    runtime: DecisionTracker
+
+    agent_id: str
+    agent_spec: AgentSpec
+
+    input_data: dict[str, Any] = field(default_factory=dict)
+
+    llm: LLMProvider | None = None
+    available_tools: list[Tool] = field(default_factory=list)
+
+    goal_context: str = ""
+    goal: Any = None
+
+    max_tokens: int = 4096
+
+    attempt: int = 1
+    max_attempts: int = 3
+
+    runtime_logger: Any = None
+    pause_event: Any = None
+
+    accounts_prompt: str = ""
+
+    identity_prompt: str = ""
+    narrative: str = ""
+    memory_prompt: str = ""
+
+    event_triggered: bool = False
+
+    execution_id: str = ""
+    run_id: str = ""
+
+    @property
+    def effective_run_id(self) -> str | None:
+        return self.run_id or None
+
+    stream_id: str = ""
+
+    dynamic_tools_provider: Any = None
+    dynamic_prompt_provider: Any = None
+    dynamic_memory_provider: Any = None
+
+    skills_catalog_prompt: str = ""
+    protocols_prompt: str = ""
+    skill_dirs: list[str] = field(default_factory=list)
+    default_skill_batch_nudge: str | None = None
+    default_skill_warn_ratio: float | None = None
+
+    iteration_metadata_provider: Any = None
+
+    @property
+    def is_queen_stream(self) -> bool:
+        return self.stream_id == "queen" or self.agent_spec.is_queen()
+
+    @property
+    def emits_client_io(self) -> bool:
+        return self.is_queen_stream
+
+    @property
+    def supports_direct_user_io(self) -> bool:
+        return self.is_queen_stream and not self.event_triggered
+
+
+@dataclass
+class AgentResult:
+    """Output of an agent loop execution."""
+
+    success: bool
+    output: dict[str, Any] = field(default_factory=dict)
+    error: str | None = None
+
+    next_agent: str | None = None
+    route_reason: str | None = None
+
+    tokens_used: int = 0
+    latency_ms: int = 0
+
+    validation_errors: list[str] = field(default_factory=list)
+
+    conversation: Any = None
+
+    # Machine-readable reason the loop stopped (see LoopExitReason in
+    # agent_loop/internals/types.py). "?" means the loop didn't set one,
+    # which should itself be treated as a diagnostic.
+    exit_reason: str = "?"
+    # Counters for reliability events surfaced during this execution.
+    # Populated from the loop's TaskRegistry-style counters at return
+    # time so callers can spot recurring failure modes without tailing
+    # logs. Keys are stable strings; missing keys mean "zero".
+    reliability_stats: dict[str, int] = field(default_factory=dict)
+
+    def to_summary(self, spec: Any = None) -> str:
+        if not self.success:
+            return f"Failed: {self.error}"
+
+        if not self.output:
+            return "Completed (no output)"
+
+        parts = [f"Completed with {len(self.output)} outputs:"]
+        for key, value in list(self.output.items())[:5]:
+            value_str = str(value)[:100]
+            if len(str(value)) > 100:
+                value_str += "..."
+            parts.append(f"  - {key}: {value_str}")
+        return "\n".join(parts)
+
+
+class AgentProtocol(ABC):
+    """Interface all agent implementations must satisfy."""
+
+    @abstractmethod
+    async def execute(self, ctx: AgentContext) -> AgentResult:
+        pass
+
+    def validate_input(self, ctx: AgentContext) -> list[str]:
+        errors = []
+        for key in ctx.agent_spec.input_keys:
+            if key not in ctx.input_data:
+                errors.append(f"Missing required input: {key}")
+        return errors
@@ -0,0 +1,17 @@
+"""Framework-provided agents."""
+
+from pathlib import Path
+
+FRAMEWORK_AGENTS_DIR = Path(__file__).parent
+
+
+def list_framework_agents() -> list[Path]:
+    """List all framework agent directories."""
+    return sorted(
+        [
+            p
+            for p in FRAMEWORK_AGENTS_DIR.iterdir()
+            if p.is_dir() and ((p / "agent.json").exists() or (p / "agent.py").exists())
+        ],
+        key=lambda p: p.name,
+    )
@@ -0,0 +1,55 @@
+"""
+Credential Tester — verify credentials (Aden OAuth + local API keys) via live API calls.
+
+Interactive agent that lists all testable accounts, lets the user pick one,
+loads the provider's tools, and runs a chat session to test the credential.
+"""
+
+from .agent import (
+    CredentialTesterAgent,
+    _list_aden_accounts,
+    _list_env_fallback_accounts,
+    _list_local_accounts,
+    configure_for_account,
+    conversation_mode,
+    edges,
+    entry_node,
+    entry_points,
+    get_tools_for_provider,
+    goal,
+    identity_prompt,
+    list_connected_accounts,
+    loop_config,
+    nodes,
+    pause_nodes,
+    requires_account_selection,
+    skip_credential_validation,
+    terminal_nodes,
+)
+from .config import default_config
+
+__version__ = "1.0.0"
+
+__all__ = [
+    "CredentialTesterAgent",
+    "configure_for_account",
+    "conversation_mode",
+    "default_config",
+    "edges",
+    "entry_node",
+    "entry_points",
+    "get_tools_for_provider",
+    "goal",
+    "identity_prompt",
+    "list_connected_accounts",
+    "loop_config",
+    "nodes",
+    "pause_nodes",
+    "requires_account_selection",
+    "skip_credential_validation",
+    "terminal_nodes",
+    # Internal list helpers (exposed for testing)
+    "_list_aden_accounts",
+    "_list_local_accounts",
+    "_list_env_fallback_accounts",
+]
@@ -0,0 +1,111 @@
+"""CLI entry point for Credential Tester agent."""
+
+import asyncio
+
+import click
+
+from .agent import CredentialTesterAgent
+
+
+def setup_logging(verbose=False, debug=False):
+    from framework.observability import configure_logging
+
+    if debug:
+        configure_logging(level="DEBUG")
+    elif verbose:
+        configure_logging(level="INFO")
+    else:
+        configure_logging(level="WARNING")
+
+
+def pick_account(agent: CredentialTesterAgent) -> dict | None:
+    """Interactive account picker. Returns selected account dict or None."""
+    accounts = agent.list_accounts()
+    if not accounts:
+        click.echo("No connected accounts found.")
+        click.echo("Set ADEN_API_KEY and connect accounts at https://app.adenhq.com")
+        return None
+
+    click.echo("\nConnected accounts:\n")
+    for i, acct in enumerate(accounts, 1):
+        provider = acct.get("provider", "?")
+        alias = acct.get("alias", "?")
+        identity = acct.get("identity", {})
+        detail_parts = [f"{k}: {v}" for k, v in identity.items() if v]
+        detail = f"  ({', '.join(detail_parts)})" if detail_parts else ""
+        click.echo(f"  {i}. {provider}/{alias}{detail}")
+
+    click.echo()
+    while True:
+        choice = click.prompt("Pick an account to test", type=int, default=1)
+        if 1 <= choice <= len(accounts):
+            return accounts[choice - 1]
+        click.echo(f"Invalid choice. Enter 1-{len(accounts)}.")
+
+
+@click.group()
+@click.version_option(version="1.0.0")
+def cli():
+    """Credential Tester — verify synced credentials via live API calls."""
+    pass
+
+
+@cli.command()
+@click.option("--verbose", "-v", is_flag=True)
+@click.option("--debug", is_flag=True)
+def shell(verbose, debug):
+    """Interactive CLI session to test a credential."""
+    setup_logging(verbose=verbose, debug=debug)
+    asyncio.run(_interactive_shell(verbose))
+
+
+async def _interactive_shell(verbose=False):
+    agent = CredentialTesterAgent()
+    account = pick_account(agent)
+    if account is None:
+        return
+
+    agent.select_account(account)
+    provider = account.get("provider", "?")
+    alias = account.get("alias", "?")
+
+    click.echo(f"\nTesting {provider}/{alias}")
+    click.echo("Type your requests or 'quit' to exit.\n")
+
+    await agent.start()
+
+    try:
+        result = await agent._agent_runtime.trigger_and_wait(
+            entry_point_id="start",
+            input_data={},
+        )
+        if result:
+            click.echo(f"\nSession ended: {'success' if result.success else result.error}")
+    except KeyboardInterrupt:
+        click.echo("\nGoodbye!")
+    finally:
+        await agent.stop()
+
+
+@cli.command(name="list")
+def list_accounts():
+    """List all connected accounts."""
+    agent = CredentialTesterAgent()
+    accounts = agent.list_accounts()
+
+    if not accounts:
+        click.echo("No connected accounts found.")
+        return
+
+    click.echo("\nConnected accounts:\n")
+    for acct in accounts:
+        provider = acct.get("provider", "?")
+        alias = acct.get("alias", "?")
+        identity = acct.get("identity", {})
+        detail_parts = [f"{k}: {v}" for k, v in identity.items() if v]
+        detail = f"  ({', '.join(detail_parts)})" if detail_parts else ""
+        click.echo(f"  {provider}/{alias}{detail}")
+
+
+if __name__ == "__main__":
+    cli()
@@ -0,0 +1,645 @@
+"""Credential Tester agent — verify credentials via live API calls.
+
+Supports both Aden OAuth2-synced accounts AND locally-stored API key accounts.
+Aden accounts use account="alias" routing; local accounts inject the key into
+the session environment so tools read it without an account= parameter.
+
+When loaded via AgentRunner.load() (TUI picker, ``hive run``), the module-level
+``nodes`` / ``edges`` variables provide a static graph.  The TUI detects
+``requires_account_selection`` and shows an account picker *before* starting
+the agent.  ``configure_for_account()`` then scopes the node's tools to the
+selected provider.
+
+When used directly (``CredentialTesterAgent``), the graph is built dynamically
+after the user picks an account programmatically.
+"""
+
+from __future__ import annotations
+
+import logging
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+from framework.config import get_max_context_tokens
+from framework.host.agent_host import AgentHost
+from framework.host.execution_manager import EntryPointSpec
+from framework.llm import LiteLLMProvider
+from framework.loader.mcp_registry import MCPRegistry
+from framework.loader.tool_registry import ToolRegistry
+from framework.orchestrator import Goal, NodeSpec, SuccessCriterion
+from framework.orchestrator.checkpoint_config import CheckpointConfig
+from framework.orchestrator.edge import GraphSpec
+from framework.orchestrator.orchestrator import ExecutionResult
+
+from .config import default_config
+from .nodes import build_tester_node
+
+logger = logging.getLogger(__name__)
+
+if TYPE_CHECKING:
+    from framework.loader import AgentLoader
+
+logger = logging.getLogger(__name__)
+
+# ---------------------------------------------------------------------------
+# Goal
+# ---------------------------------------------------------------------------
+
+goal = Goal(
+    id="credential-tester",
+    name="Credential Tester",
+    description="Verify that a credential can make real API calls.",
+    success_criteria=[
+        SuccessCriterion(
+            id="api-call-success",
+            description="At least one API call succeeds using the credential",
+            metric="api_call_success",
+            target="true",
+            weight=1.0,
+        ),
+    ],
+    constraints=[],
+)
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def get_tools_for_provider(provider_name: str) -> list[str]:
+    """Collect tool names for a credential by credential_id OR credential_group.
+
+    Matches on both ``credential_id`` (e.g. "google" → Gmail tools) and
+    ``credential_group`` (e.g. "google_custom_search" → all google search tools).
+    """
+    from aden_tools.credentials import CREDENTIAL_SPECS
+
+    tools: list[str] = []
+    for spec in CREDENTIAL_SPECS.values():
+        if spec.credential_id == provider_name or spec.credential_group == provider_name:
+            tools.extend(spec.tools)
+    return sorted(set(tools))
+
+
+def _list_aden_accounts() -> list[dict]:
+    """List active accounts from the Aden platform (requires ADEN_API_KEY)."""
+    import os
+
+    api_key = os.environ.get("ADEN_API_KEY")
+    if not api_key:
+        return []
+
+    try:
+        from framework.credentials.aden.client import AdenClientConfig, AdenCredentialClient
+
+        client = AdenCredentialClient(
+            AdenClientConfig(
+                base_url=os.environ.get("ADEN_API_URL", "https://api.adenhq.com"),
+            )
+        )
+        try:
+            integrations = client.list_integrations()
+        finally:
+            client.close()
+
+        return [
+            {
+                "provider": c.provider,
+                "alias": c.alias,
+                "identity": {"email": c.email} if c.email else {},
+                "integration_id": c.integration_id,
+                "source": "aden",
+            }
+            for c in integrations
+            if c.status == "active"
+        ]
+    except (ImportError, OSError) as exc:
+        logger.debug("Could not list Aden accounts: %s", exc)
+        return []
+    except Exception:
+        logger.warning("Unexpected error listing Aden accounts", exc_info=True)
+        return []
+
+
+def _list_local_accounts() -> list[dict]:
+    """List named local API key accounts from LocalCredentialRegistry."""
+    try:
+        from framework.credentials.local.registry import LocalCredentialRegistry
+
+        return [info.to_account_dict() for info in LocalCredentialRegistry.default().list_accounts()]
+    except ImportError as exc:
+        logger.debug("Local credential registry unavailable: %s", exc)
+        return []
+    except Exception:
+        logger.warning("Unexpected error listing local accounts", exc_info=True)
+        return []
+
+
+def _list_env_fallback_accounts() -> list[dict]:
+    """Surface configured-but-unregistered credentials as testable entries.
+
+    Detects credentials available via env vars OR stored in the encrypted
+    store in the old flat format (e.g. ``brave_search`` with no alias).
+    These are users who haven't yet run ``save_account()`` but have a working key.
+    Shows with alias="default" and status="unknown".
+    """
+    import os
+
+    from aden_tools.credentials import CREDENTIAL_SPECS
+
+    # Collect IDs in encrypted store (includes old flat entries like "brave_search")
+    try:
+        from framework.credentials.storage import EncryptedFileStorage
+
+        encrypted_ids: set[str] = set(EncryptedFileStorage().list_all())
+    except (ImportError, OSError) as exc:
+        logger.debug("Could not read encrypted store: %s", exc)
+        encrypted_ids = set()
+    except Exception:
+        logger.warning("Unexpected error reading encrypted store", exc_info=True)
+        encrypted_ids = set()
+
+    def _is_configured(cred_name: str, spec) -> bool:
+        # 1. Env var present
+        if os.environ.get(spec.env_var):
+            return True
+        # 2. Old flat encrypted entry (no slash — new entries have {x}/{y})
+        if cred_name in encrypted_ids:
+            return True
+        return False
+
+    seen_groups: set[str] = set()
+    accounts: list[dict] = []
+
+    for cred_name, spec in CREDENTIAL_SPECS.items():
+        if not spec.direct_api_key_supported or not spec.tools:
+            continue
+
+        if spec.credential_group:
+            if spec.credential_group in seen_groups:
+                continue
+            group_available = all(
+                _is_configured(n, s) for n, s in CREDENTIAL_SPECS.items() if s.credential_group == spec.credential_group
+            )
+            if not group_available:
+                continue
+            seen_groups.add(spec.credential_group)
+            provider = spec.credential_group
+        else:
+            if not _is_configured(cred_name, spec):
+                continue
+            provider = cred_name
+
+        accounts.append(
+            {
+                "provider": provider,
+                "alias": "default",
+                "identity": {},
+                "integration_id": None,
+                "source": "local",
+                "status": "unknown",
+            }
+        )
+
+    return accounts
+
+
+def list_connected_accounts() -> list[dict]:
+    """List all testable accounts: Aden-synced + named local + env-var fallbacks."""
+    aden = _list_aden_accounts()
+    local = _list_local_accounts()
+
+    # Show env-var fallbacks only for credentials not already in the named registry
+    local_providers = {a["provider"] for a in local}
+    env_fallbacks = [a for a in _list_env_fallback_accounts() if a["provider"] not in local_providers]
+
+    return aden + local + env_fallbacks
+
+
+# ---------------------------------------------------------------------------
+# Module-level hooks (read by AgentRunner.load / TUI)
+# ---------------------------------------------------------------------------
+
+skip_credential_validation = True
+"""Don't validate credentials at load time — we don't know which provider yet."""
+
+requires_account_selection = True
+"""Signal TUI to show account picker before starting the agent."""
+
+
+def configure_for_account(runner: AgentLoader, account: dict) -> None:
+    """Scope the tester node's tools to the selected provider.
+
+    Handles both Aden accounts (account= routing) and local accounts
+    (session-level env var injection, no account= parameter in prompt).
+    """
+    provider = account["provider"]
+    source = account.get("source", "aden")
+    alias = account.get("alias", "unknown")
+    identity = account.get("identity", {})
+    tools = get_tools_for_provider(provider)
+
+    if source == "aden":
+        tools.append("get_account_info")
+        email = identity.get("email", "")
+        detail = f" (email: {email})" if email else ""
+        _configure_aden_node(runner, provider, alias, detail, tools)
+    else:
+        status = account.get("status", "unknown")
+        _activate_local_account(provider, alias)
+        _configure_local_node(runner, provider, alias, identity, tools, status)
+
+
+def _activate_local_account(credential_id: str, alias: str) -> None:
+    """Inject a named local account's key into the session environment.
+
+    Handles three cases:
+    1. Named account in LocalCredentialRegistry (new format: {credential_id}/{alias})
+    2. Old flat credential in EncryptedFileStorage (id == credential_id, no alias)
+    3. Env var already set — skip injection (nothing to do)
+    """
+    import os
+
+    from aden_tools.credentials import CREDENTIAL_SPECS
+
+    # Collect specs for this credential (handles grouped credentials too)
+    group_specs = [
+        (cred_name, spec)
+        for cred_name, spec in CREDENTIAL_SPECS.items()
+        if spec.credential_group == credential_id or spec.credential_id == credential_id or cred_name == credential_id
+    ]
+    # Deduplicate — credential_id and credential_group may both match the same spec
+    seen_env_vars: set[str] = set()
+
+    try:
+        from framework.credentials.local.registry import LocalCredentialRegistry
+        from framework.credentials.storage import EncryptedFileStorage
+
+        registry = LocalCredentialRegistry.default()
+        flat_storage = EncryptedFileStorage()
+
+        for _cred_name, spec in group_specs:
+            if spec.env_var in seen_env_vars:
+                continue
+            # If env var is already set, nothing to do for this one
+            if os.environ.get(spec.env_var):
+                seen_env_vars.add(spec.env_var)
+                continue
+
+            seen_env_vars.add(spec.env_var)
+
+            # Determine key name based on spec
+            key_name = "api_key"
+            if spec.credential_group and "cse" in spec.env_var.lower():
+                key_name = "cse_id"
+
+            key: str | None = None
+
+            # 1. Try named account in registry (new format)
+            if alias != "default":
+                key = registry.get_key(credential_id, alias, key_name)
+            else:
+                # For "default" alias, check registry first, then fall back to flat store
+                key = registry.get_key(credential_id, "default", key_name)
+
+            # 2. Fall back to old flat encrypted entry (id == credential_id, no alias)
+            if key is None:
+                flat_cred = flat_storage.load(credential_id)
+                if flat_cred is not None:
+                    key = flat_cred.get_key(key_name) or flat_cred.get_default_key()
+
+            if key:
+                os.environ[spec.env_var] = key
+    except (ImportError, KeyError, OSError) as exc:
+        logger.debug("Could not inject credentials: %s", exc)
+    except Exception:
+        logger.warning("Unexpected error injecting credentials", exc_info=True)
+
+
+def _configure_aden_node(
+    runner: AgentLoader,
+    provider: str,
+    alias: str,
+    detail: str,
+    tools: list[str],
+) -> None:
+    for node in runner.graph.nodes:
+        if node.id == "tester":
+            node.tools = sorted(set(tools))
+            node.system_prompt = f"""\
+You are a credential tester for the account: {provider}/{alias}{detail}
+
+# Instructions
+
+1. Suggest a simple read-only API call to verify the credential works \
+(e.g. list messages, list channels, list contacts).
+2. Execute the call when the user agrees.
+3. Report the result: success (with sample data) or failure (with error).
+4. Let the user request additional API calls to further test the credential.
+
+# Account routing
+
+IMPORTANT: Always pass `account="{alias}"` when calling any tool. \
+This routes the API call to the correct credential. Never use the email \
+or any other identifier — always use the alias exactly as shown.
+
+# Rules
+
+- Start with read-only operations (list, get) before write operations.
+- Always confirm with the user before performing write operations.
+- If a call fails, report the exact error — this helps diagnose credential issues.
+- Be concise. No emojis.
+"""
+            break
+
+    runner.intro_message = (
+        f"Testing {provider}/{alias}{detail} — "
+        f"{len(tools)} tools loaded. "
+        "I'll suggest a read-only API call to verify the credential works."
+    )
+
+
+def _configure_local_node(
+    runner: AgentLoader,
+    provider: str,
+    alias: str,
+    identity: dict,
+    tools: list[str],
+    status: str,
+) -> None:
+    identity_parts = [f"{k}: {v}" for k, v in identity.items() if v]
+    detail = f" ({', '.join(identity_parts)})" if identity_parts else ""
+    status_note = " [key not yet validated]" if status == "unknown" else ""
+
+    for node in runner.graph.nodes:
+        if node.id == "tester":
+            node.tools = sorted(set(tools))
+            node.system_prompt = f"""\
+You are a credential tester for the local API key: {provider}/{alias}{detail}{status_note}
+
+# Instructions
+
+1. Suggest a simple test call to verify the credential works \
+(e.g. search for "test", list items, get profile info).
+2. Execute the call when the user agrees.
+3. Report the result: success (with sample data) or failure (with error).
+4. Let the user request additional API calls to further test the credential.
+
+# Rules
+
+- Do NOT pass an `account` parameter — this credential is injected \
+directly into the session environment and tools read it automatically.
+- Start with read-only operations before write operations.
+- Always confirm with the user before performing write operations.
+- If a call fails, report the exact error — this helps diagnose credential issues.
+- Be concise. No emojis.
+"""
+            break
+
+    runner.intro_message = (
+        f"Testing {provider}/{alias}{detail} — "
+        f"{len(tools)} tools loaded. "
+        "I'll suggest a test API call to verify the credential works."
+    )
+
+
+# ---------------------------------------------------------------------------
+# Module-level graph variables (read by AgentRunner.load)
+# ---------------------------------------------------------------------------
+
+nodes = [
+    NodeSpec(
+        id="tester",
+        name="Credential Tester",
+        description=("Interactive credential testing — lets the user pick an account and verify it via API calls."),
+        node_type="event_loop",
+        client_facing=True,
+        max_node_visits=0,
+        input_keys=[],
+        output_keys=["test_result"],
+        nullable_output_keys=["test_result"],
+        tools=["get_account_info"],
+        system_prompt="""\
+You are a credential tester. Your job is to help the user verify that their \
+connected accounts and API keys can make real API calls.
+
+# Startup
+
+1. Call ``get_account_info`` to list the user's connected accounts.
+2. Present the list and ask the user which account to test.
+3. Once they pick one, note the account's **alias** (e.g. "Timothy", "work-slack").
+4. Suggest a simple read-only API call to verify the credential works \
+(e.g. list messages, list channels, list contacts).
+5. Execute the call when the user agrees.
+6. Report the result: success (with sample data) or failure (with error).
+7. Let the user request additional API calls to further test the credential.
+
+# Account routing (Aden accounts only)
+
+IMPORTANT: For Aden-synced accounts, always pass the account's **alias** as the \
+``account`` parameter when calling any tool. For local API key accounts, do NOT \
+pass an account parameter — they are pre-injected into the session.
+
+# Rules
+
+- Start with read-only operations (list, get) before write operations.
+- Always confirm with the user before performing write operations.
+- If a call fails, report the exact error — this helps diagnose credential issues.
+- Be concise. No emojis.
+""",
+    ),
+]
+
+edges = []
+
+entry_node = "tester"
+entry_points = {"start": "tester"}
+pause_nodes = []
+terminal_nodes = ["tester"]  # Tester node can terminate
+
+conversation_mode = "continuous"
+identity_prompt = "You are a credential tester that verifies connected accounts and API keys can make real API calls."
+loop_config = {
+    "max_iterations": 50,
+    "max_tool_calls_per_turn": 30,
+}
+
+# ---------------------------------------------------------------------------
+# Programmatic agent class (used by __main__.py CLI)
+# ---------------------------------------------------------------------------
+
+
+class CredentialTesterAgent:
+    """Interactive agent that tests a specific credential via API calls.
+
+    Usage:
+        agent = CredentialTesterAgent()
+        accounts = agent.list_accounts()
+        agent.select_account(accounts[0])
+        await agent.start()
+        await agent.stop()
+    """
+
+    def __init__(self, config=None):
+        self.config = config or default_config
+        self._selected_account: dict | None = None
+        self._agent_runtime: AgentHost | None = None
+        self._tool_registry: ToolRegistry | None = None
+        self._storage_path: Path | None = None
+
+    def list_accounts(self) -> list[dict]:
+        """List all testable accounts (Aden + local named + env-var fallbacks)."""
+        return list_connected_accounts()
+
+    def select_account(self, account: dict) -> None:
+        """Select an account to test.
+
+        Args:
+            account: Account dict from list_accounts() with
+                     provider, alias, identity, source keys.
+        """
+        self._selected_account = account
+
+    @property
+    def selected_provider(self) -> str:
+        if self._selected_account is None:
+            raise RuntimeError("No account selected. Call select_account() first.")
+        return self._selected_account["provider"]
+
+    @property
+    def selected_alias(self) -> str:
+        if self._selected_account is None:
+            raise RuntimeError("No account selected. Call select_account() first.")
+        return self._selected_account.get("alias", "unknown")
+
+    def _build_graph(self) -> GraphSpec:
+        provider = self.selected_provider
+        alias = self.selected_alias
+        source = self._selected_account.get("source", "aden")
+        identity = self._selected_account.get("identity", {})
+        tools = get_tools_for_provider(provider)
+
+        if source == "local":
+            _activate_local_account(provider, alias)
+        elif source == "aden":
+            tools.append("get_account_info")
+
+        tester_node = build_tester_node(
+            provider=provider,
+            alias=alias,
+            tools=tools,
+            identity=identity,
+            source=source,
+        )
+
+        return GraphSpec(
+            id="credential-tester-graph",
+            goal_id=goal.id,
+            version="1.0.0",
+            entry_node="tester",
+            entry_points={"start": "tester"},
+            terminal_nodes=["tester"],  # Tester node can terminate
+            pause_nodes=[],
+            nodes=[tester_node],
+            edges=[],
+            default_model=self.config.model,
+            max_tokens=self.config.max_tokens,
+            loop_config={
+                "max_iterations": 50,
+                "max_tool_calls_per_turn": 30,
+                "max_context_tokens": get_max_context_tokens(),
+            },
+            conversation_mode="continuous",
+            identity_prompt=(
+                f"You are testing the {provider}/{alias} credential. "
+                "Help the user verify it works by making real API calls."
+            ),
+        )
+
+    def _setup(self) -> None:
+        if self._selected_account is None:
+            raise RuntimeError("No account selected. Call select_account() first.")
+
+        self._storage_path = Path.home() / ".hive" / "agents" / "credential_tester"
+        self._storage_path.mkdir(parents=True, exist_ok=True)
+
+        self._tool_registry = ToolRegistry()
+
+        mcp_config_path = Path(__file__).parent / "mcp_servers.json"
+        if mcp_config_path.exists():
+            self._tool_registry.load_mcp_config(mcp_config_path)
+
+        try:
+            agent_dir = Path(__file__).parent
+            registry = MCPRegistry()
+            registry.initialize()
+            if (agent_dir / "mcp_registry.json").is_file():
+                self._tool_registry.set_mcp_registry_agent_path(agent_dir)
+            registry_configs, selection_max_tools = registry.load_agent_selection(agent_dir)
+            if registry_configs:
+                self._tool_registry.load_registry_servers(
+                    registry_configs,
+                    preserve_existing_tools=True,
+                    log_collisions=True,
+                    max_tools=selection_max_tools,
+                )
+        except Exception:
+            logger.warning("MCP registry config failed to load", exc_info=True)
+
+        extra_kwargs = getattr(self.config, "extra_kwargs", {}) or {}
+        llm = LiteLLMProvider(
+            model=self.config.model,
+            api_key=self.config.api_key,
+            api_base=self.config.api_base,
+            **extra_kwargs,
+        )
+
+        tool_executor = self._tool_registry.get_executor()
+        tools = list(self._tool_registry.get_tools().values())
+
+        graph = self._build_graph()
+
+        self._agent_runtime = AgentHost(
+            graph=graph,
+            goal=goal,
+            storage_path=self._storage_path,
+            entry_points=[
+                EntryPointSpec(
+                    id="start",
+                    name="Test Credential",
+                    entry_node="tester",
+                    trigger_type="manual",
+                    isolation_level="isolated",
+                ),
+            ],
+            llm=llm,
+            tools=tools,
+            tool_executor=tool_executor,
+            checkpoint_config=CheckpointConfig(enabled=False),
+            graph_id="credential_tester",
+        )
+
+    async def start(self) -> None:
+        """Set up and start the agent runtime."""
+        if self._agent_runtime is None:
+            self._setup()
+        if not self._agent_runtime.is_running:
+            await self._agent_runtime.start()
+
+    async def stop(self) -> None:
+        """Stop the agent runtime."""
+        if self._agent_runtime and self._agent_runtime.is_running:
+            await self._agent_runtime.stop()
+        self._agent_runtime = None
+
+    async def run(self) -> ExecutionResult:
+        """Run the agent (convenience for single execution)."""
+        await self.start()
+        try:
+            result = await self._agent_runtime.trigger_and_wait(
+                entry_point_id="start",
+                input_data={},
+            )
+            return result or ExecutionResult(success=False, error="Execution timeout")
+        finally:
+            await self.stop()
@@ -0,0 +1,19 @@
+"""Runtime configuration for Credential Tester agent."""
+
+from dataclasses import dataclass
+
+from framework.config import RuntimeConfig
+
+
+@dataclass
+class AgentMetadata:
+    name: str = "Credential Tester"
+    version: str = "1.0.0"
+    description: str = (
+        "Test connected accounts by making real API calls. "
+        "Pick an account, verify credentials work, and explore available tools."
+    )
+
+
+metadata = AgentMetadata()
+default_config = RuntimeConfig(temperature=0.3)
@@ -0,0 +1,9 @@
+{
+  "hive_tools": {
+    "transport": "stdio",
+    "command": "uv",
+    "args": ["run", "python", "mcp_server.py", "--stdio"],
+    "cwd": "../../../../tools",
+    "description": "hive_tools MCP server with provider-specific tools"
+  }
+}
@@ -0,0 +1,85 @@
+"""Node definitions for Credential Tester agent."""
+
+from framework.orchestrator import NodeSpec
+
+
+def build_tester_node(
+    provider: str,
+    alias: str,
+    tools: list[str],
+    identity: dict[str, str],
+    source: str = "aden",
+) -> NodeSpec:
+    """Build the tester node dynamically for the selected account.
+
+    Args:
+        provider: Provider / credential name (e.g. "google", "brave_search").
+        alias: User-set alias (e.g. "Timothy", "work").
+        tools: Tool names available for this provider.
+        identity: Identity dict (email, workspace, etc.) for context.
+        source: "aden" or "local" — controls routing instructions in the prompt.
+    """
+    detail_parts = [f"{k}: {v}" for k, v in identity.items() if v]
+    detail = f" ({', '.join(detail_parts)})" if detail_parts else ""
+
+    if source == "aden":
+        routing_section = f"""\
+# Account routing
+
+IMPORTANT: Always pass `account="{alias}"` when calling any tool. \
+This routes the API call to the correct credential. Never use the email \
+or any other identifier — always use the alias exactly as shown.
+"""
+    else:
+        routing_section = """\
+# Credential routing
+
+This is a local API key credential — do NOT pass an `account` parameter. \
+The key is pre-injected into the session environment and tools read it automatically.
+"""
+
+    account_label = "account" if source == "aden" else "local API key"
+
+    return NodeSpec(
+        id="tester",
+        name="Credential Tester",
+        description=(
+            f"Interactive testing node for {provider}/{alias}. "
+            f"Has access to all {provider} tools to verify the credential works."
+        ),
+        node_type="event_loop",
+        client_facing=True,
+        max_node_visits=0,
+        input_keys=[],
+        output_keys=["test_result"],
+        nullable_output_keys=["test_result"],
+        tools=tools,
+        system_prompt=f"""\
+You are a credential tester for the {account_label}: {provider}/{alias}{detail}
+
+Your job is to help the user verify that this credential works by making \
+real API calls using the available tools.
+
+{routing_section}
+# Instructions
+
+1. Start by greeting the user and confirming which account you're testing.
+2. Suggest a simple, safe, read-only API call to verify the credential works \
+(e.g. list messages, list channels, list contacts, search for "test").
+3. Execute the call when the user agrees.
+4. Report the result clearly: success (with sample data) or failure (with error).
+5. Let the user request additional API calls to further test the credential.
+
+# Available tools
+
+You have access to {len(tools)} tools for {provider}:
+{chr(10).join(f"- {t}" for t in tools)}
+
+# Rules
+
+- Start with read-only operations (list, get) before write operations (create, update, delete).
+- Always confirm with the user before performing write operations.
+- If a call fails, report the exact error — this helps diagnose credential issues.
+- Be concise. No emojis.
+""",
+    )
@@ -0,0 +1,267 @@
+"""Agent discovery — scan known directories and return categorised AgentEntry lists."""
+
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass, field
+from pathlib import Path
+
+
+@dataclass
+class WorkerEntry:
+    """A single worker within a colony."""
+
+    name: str
+    config_path: Path
+    description: str = ""
+    tool_count: int = 0
+    task: str = ""
+    spawned_at: str = ""
+    queen_name: str = ""
+    colony_name: str = ""
+
+    def to_dict(self) -> dict:
+        return {
+            "name": self.name,
+            "config_path": str(self.config_path),
+            "description": self.description,
+            "tool_count": self.tool_count,
+            "task": self.task,
+            "spawned_at": self.spawned_at,
+            "queen_name": self.queen_name,
+            "colony_name": self.colony_name,
+        }
+
+
+@dataclass
+class AgentEntry:
+    """Lightweight agent metadata for the picker / API discover endpoint."""
+
+    path: Path
+    name: str
+    description: str
+    category: str
+    session_count: int = 0
+    run_count: int = 0
+    node_count: int = 0
+    tool_count: int = 0
+    tags: list[str] = field(default_factory=list)
+    last_active: str | None = None
+    workers: list[WorkerEntry] = field(default_factory=list)
+
+
+def _get_last_active(agent_path: Path) -> str | None:
+    """Return the most recent updated_at timestamp across all sessions.
+
+    Checks both worker sessions (``~/.hive/agents/{name}/sessions/``) and
+    queen sessions (``~/.hive/agents/queens/default/sessions/``) whose
+    ``meta.json`` references the same *agent_path*.
+    """
+    from datetime import datetime
+
+    agent_name = agent_path.name
+    latest: str | None = None
+
+    # 1. Worker sessions
+    sessions_dir = Path.home() / ".hive" / "agents" / agent_name / "sessions"
+    if sessions_dir.exists():
+        for session_dir in sessions_dir.iterdir():
+            if not session_dir.is_dir() or not session_dir.name.startswith("session_"):
+                continue
+            state_file = session_dir / "state.json"
+            if not state_file.exists():
+                continue
+            try:
+                data = json.loads(state_file.read_text(encoding="utf-8"))
+                ts = data.get("timestamps", {}).get("updated_at")
+                if ts and (latest is None or ts > latest):
+                    latest = ts
+            except Exception:
+                continue
+
+    # 2. Queen sessions (scan all queen identity directories)
+    from framework.config import QUEENS_DIR
+
+    if QUEENS_DIR.exists():
+        resolved = agent_path.resolve()
+        for queen_dir in QUEENS_DIR.iterdir():
+            if not queen_dir.is_dir():
+                continue
+            sessions_dir = queen_dir / "sessions"
+            if not sessions_dir.exists():
+                continue
+            for d in sessions_dir.iterdir():
+                if not d.is_dir():
+                    continue
+                meta_file = d / "meta.json"
+                if not meta_file.exists():
+                    continue
+                try:
+                    meta = json.loads(meta_file.read_text(encoding="utf-8"))
+                    stored = meta.get("agent_path")
+                    if not stored or Path(stored).resolve() != resolved:
+                        continue
+                    ts = datetime.fromtimestamp(d.stat().st_mtime).isoformat()
+                    if latest is None or ts > latest:
+                        latest = ts
+                except Exception:
+                    continue
+
+    return latest
+
+
+def _count_sessions(agent_name: str) -> int:
+    """Count session directories under ~/.hive/agents/{agent_name}/sessions/."""
+    sessions_dir = Path.home() / ".hive" / "agents" / agent_name / "sessions"
+    if not sessions_dir.exists():
+        return 0
+    return sum(1 for d in sessions_dir.iterdir() if d.is_dir() and d.name.startswith("session_"))
+
+
+def _count_runs(agent_name: str) -> int:
+    """Count unique run_ids across all sessions for an agent."""
+    sessions_dir = Path.home() / ".hive" / "agents" / agent_name / "sessions"
+    if not sessions_dir.exists():
+        return 0
+    run_ids: set[str] = set()
+    for session_dir in sessions_dir.iterdir():
+        if not session_dir.is_dir() or not session_dir.name.startswith("session_"):
+            continue
+        # runs.jsonl lives inside workspace subdirectories
+        for runs_file in session_dir.rglob("runs.jsonl"):
+            try:
+                for line in runs_file.read_text(encoding="utf-8").splitlines():
+                    line = line.strip()
+                    if not line:
+                        continue
+                    record = json.loads(line)
+                    rid = record.get("run_id")
+                    if rid:
+                        run_ids.add(rid)
+            except Exception:
+                continue
+    return len(run_ids)
+
+
+_EXCLUDED_JSON_STEMS = {"agent", "flowchart", "triggers", "configuration", "metadata"}
+
+
+def _is_colony_dir(path: Path) -> bool:
+    """Check if a directory is a colony with worker config files."""
+    if not path.is_dir():
+        return False
+    return any(f.suffix == ".json" and f.stem not in _EXCLUDED_JSON_STEMS for f in path.iterdir() if f.is_file())
+
+
+def _find_worker_configs(colony_dir: Path) -> list[Path]:
+    """Find all worker config JSON files in a colony directory."""
+    return sorted(
+        p for p in colony_dir.iterdir() if p.is_file() and p.suffix == ".json" and p.stem not in _EXCLUDED_JSON_STEMS
+    )
+
+
+def _extract_agent_stats(agent_path: Path) -> tuple[int, int, list[str]]:
+    """Extract worker count, tool count, and tags from a colony directory."""
+    tags: list[str] = []
+
+    worker_configs = _find_worker_configs(agent_path)
+    if worker_configs:
+        all_tools: set[str] = set()
+        for wc_path in worker_configs:
+            try:
+                data = json.loads(wc_path.read_text(encoding="utf-8"))
+                if isinstance(data, dict):
+                    tools = data.get("tools", [])
+                    if isinstance(tools, list):
+                        all_tools.update(tools)
+            except Exception:
+                pass
+        return len(worker_configs), len(all_tools), tags
+
+    return 0, 0, tags
+
+
+def discover_agents() -> dict[str, list[AgentEntry]]:
+    """Discover agents from all known sources grouped by category."""
+    from framework.config import COLONIES_DIR
+
+    groups: dict[str, list[AgentEntry]] = {}
+    sources = [
+        ("Your Agents", COLONIES_DIR),
+    ]
+
+    # Track seen agent directory names to avoid duplicates when the same
+    # agent exists in both colonies/ and exports/ (colonies takes priority).
+    _seen_agent_names: set[str] = set()
+
+    for category, base_dir in sources:
+        if not base_dir.exists():
+            continue
+        entries: list[AgentEntry] = []
+        for path in sorted(base_dir.iterdir(), key=lambda p: p.name):
+            if not _is_colony_dir(path):
+                continue
+            if path.name in _seen_agent_names:
+                continue
+            _seen_agent_names.add(path.name)
+
+            config_fallback_name = path.name.replace("_", " ").title()
+            name = config_fallback_name
+            desc = ""
+
+            # Read colony metadata for queen provenance
+            colony_queen_name = ""
+            metadata_path = path / "metadata.json"
+            if metadata_path.exists():
+                try:
+                    mdata = json.loads(metadata_path.read_text(encoding="utf-8"))
+                    colony_queen_name = mdata.get("queen_name", "")
+                except Exception:
+                    pass
+
+            worker_entries: list[WorkerEntry] = []
+            worker_configs = _find_worker_configs(path)
+            for wc_path in worker_configs:
+                try:
+                    data = json.loads(wc_path.read_text(encoding="utf-8"))
+                    if isinstance(data, dict):
+                        w = WorkerEntry(
+                            name=data.get("name", wc_path.stem),
+                            config_path=wc_path,
+                            description=data.get("description", ""),
+                            tool_count=len(data.get("tools", [])),
+                            task=data.get("goal", {}).get("description", ""),
+                            spawned_at=data.get("spawned_at", ""),
+                            queen_name=colony_queen_name,
+                            colony_name=path.name,
+                        )
+                        worker_entries.append(w)
+                        if not desc:
+                            desc = data.get("description", "")
+                except Exception:
+                    pass
+
+            node_count = len(worker_entries)
+            tool_count = max((w.tool_count for w in worker_entries), default=0)
+
+            entries.append(
+                AgentEntry(
+                    path=path,
+                    name=name,
+                    description=desc,
+                    category=category,
+                    session_count=_count_sessions(path.name),
+                    run_count=_count_runs(path.name),
+                    node_count=node_count,
+                    tool_count=tool_count,
+                    tags=[],
+                    last_active=_get_last_active(path),
+                    workers=worker_entries,
+                )
+            )
+        if entries:
+            existing = groups.get(category, [])
+            existing.extend(entries)
+            groups[category] = existing
+
+    return groups
@@ -0,0 +1,15 @@
+"""Queen -- the agent builder for the Hive framework."""
+
+from .agent import queen_goal, queen_loop_config
+from .config import AgentMetadata, RuntimeConfig, default_config, metadata
+
+__version__ = "1.0.0"
+
+__all__ = [
+    "queen_goal",
+    "queen_loop_config",
+    "RuntimeConfig",
+    "AgentMetadata",
+    "default_config",
+    "metadata",
+]
@@ -0,0 +1,26 @@
+"""Queen agent definition.
+
+The queen is a single AgentLoop — no orchestrator dependency.
+Loaded by queen_orchestrator.create_queen().
+"""
+
+from framework.schemas.goal import Goal
+
+from .nodes import queen_node
+
+queen_goal = Goal(
+    id="queen-manager",
+    name="Queen Manager",
+    description=("Manage the worker agent lifecycle and serve as the user's primary interactive interface."),
+    success_criteria=[],
+    constraints=[],
+)
+
+# Loop config -- used by queen_orchestrator to build LoopConfig
+queen_loop_config = {
+    "max_iterations": 999_999,
+    "max_tool_calls_per_turn": 30,
+    "max_context_tokens": 180_000,
+}
+
+__all__ = ["queen_goal", "queen_loop_config", "queen_node"]
--- a/Show More
+++ b/Show More